├── .gitignore ├── 01.Introduction ├── 1.intro-2.pdf ├── 1.intro-4.pdf ├── 1.intro.pdf ├── cvxbook.png ├── main.tex ├── mathmlbook.png └── probcover-2nd.png ├── 02.LinearAlgebra ├── 2.LA-2.pdf ├── 2.LA-4.pdf ├── 2.LA.pdf ├── L2_affine.png ├── L2_affine_linear.png ├── L2_basischange.png ├── L2_coordinate.png ├── L2_image_kernel.png ├── L2_rank_nullity.png ├── L2_vector_ex.png └── main.tex ├── 03.Geometry ├── 3.AG-2.pdf ├── 3.AG-4.pdf ├── 3.AG.pdf ├── L3_gramschmidt.png ├── L3_ocomp.png ├── L3_projection_1D.png ├── L3_projection_affine.png ├── L3_projection_ex.png └── main.tex ├── 04.MatrixDecomposition ├── 4.MD-2.pdf ├── 4.MD-4.pdf ├── 4.MD.pdf ├── L4_SVD_matrix.png ├── L4_UTM_LTM.png ├── L4_cofactor_ex.png ├── L4_eigendecomposition.png ├── L4_ev_ex1.png ├── L4_ev_ex2.png ├── L4_ev_ex3.png ├── L4_ev_ex4.png ├── L4_ev_ex5.png ├── L4_matrix_approx.png ├── L4_matrix_tree.png └── main.tex ├── 05.VectorCaculus ├── 5.VC-2.pdf ├── 5.VC-4.pdf ├── 5.VC.pdf ├── L5_computation_graph.png ├── L5_grad_matrix_1.png ├── L5_grad_matrix_2.png ├── L5_grad_matrix_3.png ├── L5_useful.png └── main.tex ├── 06.Probability ├── 6.PD-2.pdf ├── 6.PD-4.pdf ├── 6.PD.pdf ├── L6_CDF_ex1.png ├── L6_CDF_ex2.png ├── L6_RV_ex.png ├── L6_binomial_ex.png ├── L6_condind_ex.png ├── L6_cov_ex.png ├── L6_cov_notind.png ├── L6_exp_pdf.png ├── L6_gaussian_formula.png ├── L6_geo_ex.png ├── L6_joint_ex.png ├── L6_marginal_conditional.png ├── L6_needle.png ├── L6_pdf_delta.png ├── L6_pdf_ex.png ├── L6_pdf_uniform_ex.png ├── L6_pmf_ex.png ├── L6_total_ex.png ├── L6_tworolls.png ├── L6_uniform_ex.png └── main.tex ├── 07.Optimization ├── 7.OPT-2.pdf ├── 7.OPT-4.pdf ├── 7.OPT.pdf ├── L7_convex_conjugate.png ├── L7_convex_fn.png ├── L7_convex_set_ex1.png ├── L7_convex_set_ex2.png ├── L7_first_condition.png ├── L7_gradient_ex.png ├── L7_halfspace.png ├── L7_separating.png ├── L7_supporting.png └── main.tex ├── 08.Model_Data ├── 8.MMD-2.pdf ├── 8.MMD-4.pdf ├── 8.MMD.pdf ├── L10_latent.png ├── L8_all_gmodels.png ├── L8_coinflip.png ├── L8_cross_validation.png ├── L8_dsep.png ├── L8_fittings.png ├── L8_gmodel_ex1.png ├── L8_gmodel_ex2.png ├── L8_lung_cancer.png ├── L8_model_class.png ├── L8_model_function.png ├── L8_model_pmodel.png ├── L8_nested_cross_validation.png └── main.tex ├── 09.LinearRegression ├── 9.LR-2.pdf ├── 9.LR-4.pdf ├── 9.LR.pdf ├── L9_LR_gmodel.png ├── L9_bayesian_regression.png ├── L9_overfit_linear.png ├── L9_poly4fit.png ├── L9_posterior_predictive_ex.png ├── L9_regression_ex.png ├── L9_training_test.png └── main.tex ├── 10.PCA ├── 10.PCA-2.pdf ├── 10.PCA-4.pdf ├── 10.PCA.pdf ├── L10_PCA_onepicture.png ├── L10_dr_ex.png ├── L10_latent.png ├── L10_mnist.png ├── L10_pca_algorithm.png ├── L10_pca_picture.png ├── L10_variance_diff.png └── main.tex ├── 11.DensityEstimation ├── 11.GMM-2.pdf ├── 11.GMM-4.pdf ├── 11.GMM.pdf ├── L11_Gaussian_fail.png ├── L11_em_ex.png ├── L11_gm_ex.png ├── L11_gmm_gm.png └── main.tex ├── 12.SVM ├── 12.SVM-2.pdf ├── 12.SVM-4.pdf ├── 12.SVM.pdf ├── L12_disthyper.png ├── L12_halfspace.png ├── L12_hingeloss.png ├── L12_kernel_ex.png ├── L12_soft_hard_svm.png ├── L12_softsvm_geo.png ├── dist_hyperplane.pptx └── main.tex ├── compile.sh ├── kaist_ee.png ├── mydefault.tex ├── myhead.tex ├── mymacro.tex ├── mymath.tex └── print.sh /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | *.log 3 | *.toc 4 | *.snm 5 | *.out 6 | *.nav 7 | *.aux 8 | *.vrb 9 | 01.Introduction/.DS_Store 10 | -------------------------------------------------------------------------------- 
/01.Introduction/1.intro-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/01.Introduction/1.intro-2.pdf -------------------------------------------------------------------------------- /01.Introduction/1.intro-4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/01.Introduction/1.intro-4.pdf -------------------------------------------------------------------------------- /01.Introduction/1.intro.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/01.Introduction/1.intro.pdf -------------------------------------------------------------------------------- /01.Introduction/cvxbook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/01.Introduction/cvxbook.png -------------------------------------------------------------------------------- /01.Introduction/main.tex: -------------------------------------------------------------------------------- 1 | %\pdfminorversion=4 2 | \documentclass[handout,fleqn,aspectratio=169]{beamer} 3 | 4 | \input{../myhead} 5 | 6 | 7 | 8 | \title[]{Lecture 1: Introduction} 9 | \author{Yi, Yung (이융)} 10 | \institute{Mathematics for Machine Learning\\ 11 | \url{https://yung-web.github.io/home/courses/mathml.html} 12 | \\KAIST EE} 13 | \date{\today} 14 | 15 | \input{../mymath} 16 | \input{../mymacro} 17 | 18 | \begin{document} 19 | 20 | \input{../mydefault} 21 | 22 | % START START START START START START START START START START START START START 23 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 24 | \begin{frame}{Textbook} 25 | 26 | \begin{center} 27 | \begin{tabular}{ccc} 28 | \includegraphics[width=2.0cm]{mathmlbook.png} & 29 | \includegraphics[width=2.1cm]{cvxbook.png} & 30 | \includegraphics[width=2.4cm]{probcover-2nd.png} 31 | \end{tabular} 32 | \end{center} 33 | 34 | %\small 35 | \vspace{-0.4cm} 36 | \plitemsep 0.02in 37 | 38 | \bci 39 | \item Mathematics for Machine Learning\footnote{The entire textbook can be downloaded at \url{https://mml-book.github.io/}}, Cambridge University Press, Marc Peter Deisenroth, A. Aldo Faisal, and Cheng Soon Ong 40 | \item Other books 41 | \bci 42 | \item Convex Optimization, Cambridge University Press, by Stephen Boyd and Lieven Vandenberghe 43 | \item Introduction to Probability, 2nd edition, Athena Scientific, by Dimitri P. Bertsekas and John N. 
Tsitsiklis 44 | \eci 45 | \eci 46 | 47 | \end{frame} 48 | 49 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 50 | \begin{frame}{Organization} 51 | 52 | \plitemsep 0.03in 53 | 54 | \bci 55 | \item Part I: Math 56 | \bce 57 | \item Linear Algebra 58 | \item Analytic Geometry 59 | \item Matrix Decomposition 60 | \item Vector Calculus 61 | \item Probability and Distributions 62 | \item Optimization 63 | \ece 64 | 65 | \medskip 66 | \item Part II: 4 Basic Machine Learning Problems 67 | \bce 68 | \item When Models Meet Data 69 | 70 | \item Dimensionality Reduction with Principal Component Analysis 71 | 72 | \item Density Estimation with Gaussian Mixture Models 73 | 74 | \item Classification with Support Vector Machines 75 | \ece 76 | 77 | \eci 78 | \end{frame} 79 | 80 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 81 | \begin{frame}{Suggestions on Course Schedules} 82 | 83 | Total 16 weeks 84 | \vspace{-0.2cm} 85 | \plitemsep 0.01in 86 | 87 | \bci 88 | \item Part I: Math 89 | \bce 90 | \item Linear Algebra \hfill (2 weeks) 91 | \item Analytic Geometry \hfill (1 week) 92 | \item Matrix Decomposition \hfill(1 week) 93 | \item Vector Calculus \hfill(1 week) 94 | \item Probability and Distributions \hfill(2 weeks) 95 | \item Optimization \hfill(2 weeks) 96 | \ece 97 | 98 | \item Part II: 4 Basic Machine Learning Problems 99 | \bce 100 | \item When Models Meet Data \hfill(1 week) 101 | 102 | \item Dimensionality Reduction with Principal Component Analysis \hfill(1 week) 103 | 104 | \item Density Estimation with Gaussian Mixture Models \hfill(1 week) 105 | 106 | \item Classification with Support Vector Machines \hfill(1 week) 107 | \ece 108 | 109 | \item Total 13 weeks + Midterm (1 week) + Final (1 week) + Extra (1 week) 110 | \eci 111 | \end{frame} 112 | 113 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 114 | \begin{frame}{Target Audience} 115 | 116 | \plitemsep 0.1in 117 | 118 | \bci 119 | 120 | \item Undergraduate 121 | \bci 122 | \item They may have a partial background in the math (e.g., only vector calculus + linear algebra). Depending on the students' background, the amount of time for math can be adjusted. 123 | 124 | \item Some mathematical parts may need to be presented with some degree of rigor, e.g., with proofs. 125 | \eci 126 | 127 | \item Graduate 128 | \bci 129 | \item Graduate students have typically already taken the basic math courses on linear algebra, vector calculus, probability, and optimization, but they often have almost no background in machine learning. 130 | \item The math parts can simply be reviewed with minimal proofs, and additional ML problems can be added to the course, so that they can have more exposure to the ML part. 131 | \eci 132 | \eci 133 | \end{frame} 134 | 135 | % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 136 | % \begin{frame}{} 137 | % \vspace{2cm} 138 | % \LARGE How to use the downloaded latex source files 139 | 140 | % \end{frame} 141 | 142 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 143 | \begin{frame}{File Organization} 144 | 145 | \plitemsep 0.1in 146 | 147 | \bci 148 | 149 | \item In each chapter, there is a {\tt main.tex} which you can compile. 150 | 151 | \item Common files for all chapters 152 | \bci 153 | \item {\tt myhead.tex}: common headers, e.g., including necessary packages 154 | \item {\tt mydefault.tex}: default values of many latex environments 155 | \item {\tt mymacro.tex}: macros related to linear algebra, e.g., matrix, transpose, inverse, etc 156 | \item {\tt mymath.tex}: other misc.
math macros 157 | \item {\tt compile.sh}: shell script which compiles and generate the pdfs of all chapters 158 | \item {\tt print.sh}: shell script which generates the pdfs of 2/1, 4/1 printed formats 159 | \eci 160 | 161 | \item Just type "./compile.sh" if you want to get all the pdfs\footnote{Please make compile.sh and print.sh executable, if not, by typing {\tt chmod u+x compile.sh}}. 162 | \eci 163 | \end{frame} 164 | 165 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 166 | \begin{frame}[fragile]{Slide vs. Handout} 167 | 168 | \plitemsep 0.1in 169 | 170 | \bci 171 | 172 | \item Handout 173 | 174 | \begin{verbatim} 175 | \documentclass[handout,fleqn,aspectratio=169]{beamer} 176 | \end{verbatim} 177 | 178 | \item Slide 179 | \begin{verbatim} 180 | \documentclass[fleqn,aspectratio=169]{beamer} 181 | \end{verbatim} 182 | 183 | \item Difference between Handout and Slide? If you want to use the functionality of ``beamer overlay" to add animations to the slides, you need to compile without handout option. Please visit the following url if you are interested. 184 | \medskip 185 | \url{https://youtu.be/kkM_VPSM8kA} 186 | 187 | % \item Using shell scripts: run {\tt make_slide.sh} or {\tt make_handout.sh} 188 | 189 | % \item Using mode.tex file 190 | % \begin{verbatim} 191 | % \documentclass[fleqn,aspectratio=169]{beamer} 192 | % \end{verbatim} 193 | 194 | 195 | \eci 196 | \end{frame} 197 | 198 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 199 | \begin{frame}[fragile]{Letter vs. A4} 200 | 201 | \plitemsep 0.3in 202 | 203 | \bci 204 | 205 | \item A4 206 | In the {\tt myhead.tex} file: 207 | 208 | \medskip 209 | {\scriptsize 210 | \begin{verbatim} 211 | \usepackage{pgfpages} 212 | \pgfpagesuselayout{resize to}[a4paper,landscape,border shrink=5mm] 213 | \end{verbatim} 214 | } 215 | 216 | \item Letter 217 | In the {\tt myhead.tex} file: 218 | 219 | \medskip 220 | {\scriptsize 221 | \begin{verbatim} 222 | \usepackage{pgfpages} 223 | \pgfpagesuselayout{resize to}[letterpaper,landscape,border shrink=5mm] 224 | \end{verbatim} 225 | } 226 | 227 | \eci 228 | \end{frame} 229 | 230 | 231 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 232 | \begin{frame}{Basic Notations} 233 | 234 | \plitemsep 0.1in 235 | 236 | \bci 237 | \item Scalars: $a,b,c,\alpha,\beta,\gamma$ 238 | 239 | \item Vectors: $\vec{x},\vec{y},\vec{z}$ 240 | 241 | \item Matrices: $\mat{X},\mat{Y},\mat{Z}$ 242 | 243 | \item Sets: $\set{A}, \set{B}, \set{C}$ 244 | 245 | \item (Ordered) tuple: $B=(\bm{b}_1, \bm{b}_2, \bm{b}_3)$ 246 | 247 | \item Matrix of column vectors: $\mat{B} = [\vec{b}_1, \vec{b}_2, \vec{b}_3]$ or 248 | $\mB = \rowvec{\vb_1 & \vb_2 & \vb_3}$ 249 | 250 | \item Set of vectors: $\set{B} = \sets{\vec{b}_1, \vec{b}_2, \vec{b}_3}$ 251 | 252 | \item $\real,$ $\complex,$ $\integer,$ $\natu,$ $\real^n$, etc 253 | 254 | \item Probability: We use both $p(\cdot)$, $\prob{\cdot}$. 255 | \eci 256 | \end{frame} 257 | 258 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 259 | \begin{frame}{} 260 | \vspace{2cm} 261 | \LARGE Enjoy! 262 | 263 | \bigskip 264 | \large When you modify the latex files for your convenience, if you have any question on macros or pdf generation, feel free to send an email to \url{yiyung@gmail.com}. 
265 | \end{frame} 266 | 267 | % \begin{frame}{Review Questions} 268 | % % \tableofcontents 269 | % %\plitemsep 0.1in 270 | % \bce[1)] 271 | % \item 272 | 273 | % \ece 274 | % \end{frame} 275 | 276 | 277 | \end{document} 278 | -------------------------------------------------------------------------------- /01.Introduction/mathmlbook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/01.Introduction/mathmlbook.png -------------------------------------------------------------------------------- /01.Introduction/probcover-2nd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/01.Introduction/probcover-2nd.png -------------------------------------------------------------------------------- /02.LinearAlgebra/2.LA-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/02.LinearAlgebra/2.LA-2.pdf -------------------------------------------------------------------------------- /02.LinearAlgebra/2.LA-4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/02.LinearAlgebra/2.LA-4.pdf -------------------------------------------------------------------------------- /02.LinearAlgebra/2.LA.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/02.LinearAlgebra/2.LA.pdf -------------------------------------------------------------------------------- /02.LinearAlgebra/L2_affine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/02.LinearAlgebra/L2_affine.png -------------------------------------------------------------------------------- /02.LinearAlgebra/L2_affine_linear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/02.LinearAlgebra/L2_affine_linear.png -------------------------------------------------------------------------------- /02.LinearAlgebra/L2_basischange.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/02.LinearAlgebra/L2_basischange.png -------------------------------------------------------------------------------- /02.LinearAlgebra/L2_coordinate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/02.LinearAlgebra/L2_coordinate.png -------------------------------------------------------------------------------- /02.LinearAlgebra/L2_image_kernel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/02.LinearAlgebra/L2_image_kernel.png -------------------------------------------------------------------------------- /02.LinearAlgebra/L2_rank_nullity.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/02.LinearAlgebra/L2_rank_nullity.png -------------------------------------------------------------------------------- /02.LinearAlgebra/L2_vector_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/02.LinearAlgebra/L2_vector_ex.png -------------------------------------------------------------------------------- /03.Geometry/3.AG-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/03.Geometry/3.AG-2.pdf -------------------------------------------------------------------------------- /03.Geometry/3.AG-4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/03.Geometry/3.AG-4.pdf -------------------------------------------------------------------------------- /03.Geometry/3.AG.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/03.Geometry/3.AG.pdf -------------------------------------------------------------------------------- /03.Geometry/L3_gramschmidt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/03.Geometry/L3_gramschmidt.png -------------------------------------------------------------------------------- /03.Geometry/L3_ocomp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/03.Geometry/L3_ocomp.png -------------------------------------------------------------------------------- /03.Geometry/L3_projection_1D.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/03.Geometry/L3_projection_1D.png -------------------------------------------------------------------------------- /03.Geometry/L3_projection_affine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/03.Geometry/L3_projection_affine.png -------------------------------------------------------------------------------- /03.Geometry/L3_projection_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/03.Geometry/L3_projection_ex.png -------------------------------------------------------------------------------- /03.Geometry/main.tex: -------------------------------------------------------------------------------- 1 | %\pdfminorversion=4 2 | \documentclass[handout,fleqn,aspectratio=169]{beamer} 3 | 4 | \input{../myhead} 5 | 6 | 7 | \title[]{Lecture 3: Analytic Geometry} 8 | \author{Yi, Yung (이융)} 9 | \institute{Mathematics for Machine Learning\\ \url{https://yung-web.github.io/home/courses/mathml.html} 10 | \\KAIST EE} 11 | \date{\today} 12 | 13 | 
\input{../mymath} 14 | \input{../mymacro} 15 | 16 | \begin{document} 17 | 18 | \input{../mydefault} 19 | 20 | 21 | % START START START START START START START START START START START START START 22 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 23 | \begin{frame}{Roadmap} 24 | 25 | \plitemsep 0.1in 26 | 27 | \bce[(1)] 28 | \item Norms 29 | 30 | \item Inner Products 31 | 32 | \item Lengths and Distances 33 | 34 | \item Angles and Orthogonality 35 | 36 | \item Orthonormal Basis 37 | 38 | \item Orthogonal Complement 39 | 40 | \item Inner Product of Functions 41 | 42 | \item Orthogonal Projections 43 | 44 | \item Rotations 45 | 46 | \ece 47 | \end{frame} 48 | 49 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 50 | \section{L3(1)} 51 | \begin{frame}{Roadmap} 52 | 53 | \plitemsep 0.1in 54 | 55 | \bce[(1)] 56 | \item \redf{Norms} 57 | 58 | \item \grayf{Inner Products 59 | 60 | \item Lengths and Distances 61 | 62 | \item Angles and Orthogonality 63 | 64 | \item Orthonormal Basis 65 | 66 | \item Orthogonal Complement 67 | 68 | \item Inner Product of Functions 69 | 70 | \item Orthogonal Projections 71 | 72 | \item Rotations} 73 | 74 | \ece 75 | \end{frame} 76 | 77 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 78 | \begin{frame}{Norm} 79 | 80 | \plitemsep 0.1in 81 | 82 | \bci 83 | \item A notion of the length of vectors 84 | 85 | \item \defi A norm on a vector space $V$ is a function $\norm{\cdot}: V \mapsto \real,$ such that for all $\lambda \in \real$ and $\vec{x}, \vec{y} \in V$ the following hold: 86 | 87 | \bci 88 | \item \bluef{Absolutely homogeneous}: $\norm{\lambda \vec{x}} = |\lambda| \norm{\vec{x}}$ 89 | \item \bluef{Triangle inequality}: $\norm{\vec{x} + \vec{y}} \le \norm{\vec{x}} + \norm{\vec{y}} $ 90 | \item \bluef{Positive definite}: $\norm{\vec{x}} \ge 0$ and $\norm{\vec{x}} = 0 \Longleftrightarrow \vec{x} = \vec{0}$ 91 | \eci 92 | \eci 93 | \end{frame} 94 | 95 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 96 | \begin{frame}{Example for $V = \real^n$} 97 | 98 | \plitemsep 0.1in 99 | 100 | \bci 101 | \item \bluef{Manhattan Norm} (also called $\ell_1$ norm). For $\vec{x}= [x_1, \cdots, x_n] \in \real^n,$ 102 | $$ 103 | \norm{\vec{x}}_1 \eqdef \sum_{i=1}^n |x_i| 104 | $$ 105 | \item \bluef{Euclidean Norm} (also called $\ell_2$ norm). For $\vec{x} \in \real^n,$ 106 | $$ 107 | \norm{\vec{x}}_2 \eqdef \sqrt{\sum_{i=1}^n x_i^2} = \sqrt{\trans{\vec{x}} \vec{x}} 108 | $$ 109 | 110 | \eci 111 | \end{frame} 112 | 113 | 114 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 115 | \begin{frame}{Roadmap} 116 | 117 | \plitemsep 0.1in 118 | 119 | \bce[(1)] 120 | \item \grayf{Norms} 121 | 122 | \item \redf{Inner Products} 123 | 124 | \item \grayf{Lengths and Distances 125 | 126 | \item Angles and Orthogonality 127 | 128 | \item Orthonormal Basis 129 | 130 | \item Orthogonal Complement 131 | 132 | \item Inner Product of Functions 133 | 134 | \item Orthogonal Projections 135 | 136 | \item Rotations} 137 | 138 | \ece 139 | \end{frame} 140 | 141 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 142 | \section{L3(2)} 143 | \begin{frame}{Motivation} 144 | 145 | \plitemsep 0.1in 146 | 147 | \bci 148 | \item Need to talk about the length of a vector and the angle or distance between two vectors, where vectors are defined in abstract vector spaces 149 | 150 | \item To this end, we define the notion of \bluef{inner product} in an abstract manner. 151 | 152 | \item Dot product: A kind of inner product in vector space $\real^n$.
$\trans{\vec{x}} \vec{y} = \sum_{i=1}^n x_i y_i$ 153 | 154 | 155 | \bigskip 156 | \item \question How can we generalize this and do a similar thing in some other vector spaces? 157 | \eci 158 | \end{frame} 159 | 160 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 161 | \begin{frame}{Formal Definition} 162 | 163 | \plitemsep 0.1in 164 | 165 | \bci 166 | \item An inner product is a mapping $\inner{\cdot}{\cdot}: V \times V \mapsto \real$ that satisfies the following conditions for all vectors $\vec{u},\vec{v},\vec{w} \in V$ and all scalars $\lambda \in \real$: 167 | 168 | \medskip 169 | \bce 170 | \item $\inner{\vec{u}+ \vec{v}}{\vec{w}} = \inner{\vec{u}}{\vec{w}} + \inner{\vec{v}}{\vec{w}}$ 171 | \item $\inner{\lambda \vec{v}}{\vec{w}} = \lambda \inner{\vec{v}}{\vec{w}}$ 172 | \item $\inner{\vec{v}}{\vec{w}} = \inner{\vec{w}}{\vec{v}}$ 173 | \item $\inner{\vec{v}}{\vec{v}} \ge 0$ with equality iff $\vec{v}=\vec{0}$ 174 | \ece 175 | \medskip 176 | 177 | \item The pair $(V,\inner{\cdot}{\cdot})$ is called an \bluef{inner product space}. 178 | 179 | \eci 180 | \end{frame} 181 | 182 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 183 | \begin{frame}{Example} 184 | 185 | \plitemsep 0.3in 186 | 187 | \bci 188 | 189 | \item \exam $V=\real^n$ and the dot product $\inner{\vec{x}}{\vec{y}} \eqdef \trans{\vec{x}}\vec{y}$ 190 | 191 | \item \exam $V=\real^2$ and $\inner{\vec{x}}{\vec{y}} \eqdef x_1y_1 - (x_1y_2 + x_2y_1) + 2x_2y_2$ 192 | 193 | \item \exam $V=\{\text{continuous functions in $\real$ over $[a,b]$} \},$ $\inner{u}{v} \eqdef \int_a^b u(x)v(x) dx$ 194 | \eci 195 | \end{frame} 196 | 197 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 198 | \begin{frame}{Symmetric, Positive Definite Matrix} 199 | 200 | \plitemsep 0.1in 201 | 202 | \bci 203 | \item A square matrix $\mat{A} \in \real^{n \times n}$ that satisfies the following is called \bluef{symmetric, positive definite} (or just positive definite): 204 | $$ 205 | \forall \vec{x} \in V \setminus \{\vec{0} \}: \trans{\vec{x}} \mat{A} \vec{x} > 0. 206 | $$ 207 | If only $\ge$ in the above holds, then $\mat{A}$ is called \bluef{symmetric, positive semidefinite.} 208 | 209 | \bigskip 210 | \item $\mat{A}_1 = \begin{nmat} 211 | 9 & 6 \cr 212 | 6 & 5 213 | \end{nmat} 214 | $ is positive definite. 215 | 216 | \item $\mat{A}_2 = \begin{nmat} 217 | 9 & 6 \cr 218 | 6 & 3 219 | \end{nmat} 220 | $ is not positive definite. 221 | 222 | 223 | 224 | \eci 225 | \end{frame} 226 | 227 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 228 | \begin{frame}{Inner Product and Positive Definite Matrix (1)} 229 | 230 | \plitemsep 0.2in 231 | 232 | \bci 233 | \item Consider an $n$-dimensional vector space $V$ with an inner product $\inner{\cdot}{\cdot}$ and an ordered basis $B=(\vec{b}_1, \ldots, \vec{b}_n)$ of $V.$ 234 | 235 | \item Any $\vec{x},\vec{y} \in V$ can be represented as: $\vec{x}=\sum_{i=1}^n \psi_i \vec{b}_i$ and $\vec{y}=\sum_{j=1}^n \lambda_j \vec{b}_j$ for some $\psi_i$ and $\lambda_j,$ $i,j=1, \ldots, n.$ 236 | \aleq{ 237 | \inner{\vec{x}}{\vec{y}} = \inner{\sum_{i=1}^n \psi_i\vec{b}_i}{\sum_{j=1}^n \lambda_j \vec{b}_j} = 238 | \sum_{i=1}^n \sum_{j=1}^n \psi_i \inner{\vec{b}_i}{\vec{b}_j} \lambda_j = \trans{\hat{\vec{x}}} \mat{A} \hat{\vec{y}}, 239 | } 240 | where $\mat{A}_{ij} = \inner{\vec{b}_i}{\vec{b}_j}$ and $\hat{\vec{x}}$ and $\hat{\vec{y}}$ are the coordinates w.r.t.
$B.$ 241 | 242 | \eci 243 | \end{frame} 244 | 245 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 246 | \begin{frame}{Inner Product and Positive Definite Matrix (2)} 247 | 248 | \plitemsep 0.2in 249 | 250 | \bci 251 | 252 | \item Then, if $\forall \vec{x} \in V \setminus \{\vec{0} \}: \trans{\vec{x}} \mat{A} \vec{x} > 0$ (i.e., $\mat{A}$ is symmetric, positive definite), $\bluef{\trans{\hat{\vec{x}}} \mat{A} \hat{\vec{y}}}$ legitimately defines an inner product (w.r.t. $B$) 253 | 254 | \item Properties 255 | \bci 256 | \item The kernel of $\mat{A}$ is only $\{\vec{0} \}$, because $\trans{\vec{x}} \mat{A} \vec{x} > 0$ for all $\vec{x} \neq \vec{0} \implies$ $\mat{A} \vec{x} \neq \vec{0}$ if $\vec{x} \neq \vec{0}.$ 257 | \item The diagonal elements $a_{ii}$ of $\mat{A}$ are all positive, because $a_{ii} = \trans{\vec{e}_i} \mat{A} \vec{e}_i >0.$ 258 | \eci 259 | \eci 260 | \end{frame} 261 | 262 | 263 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 264 | \section{L3(3)} 265 | \begin{frame}{Roadmap} 266 | 267 | \plitemsep 0.1in 268 | 269 | \bce[(1)] 270 | \item \grayf{Norms 271 | 272 | \item Inner Products} 273 | 274 | \item \redf{Lengths and Distances 275 | 276 | \item Angles and Orthogonality} 277 | 278 | \item \grayf{Orthonormal Basis 279 | 280 | \item Orthogonal Complement 281 | 282 | \item Inner Product of Functions 283 | 284 | \item Orthogonal Projections 285 | 286 | \item Rotations} 287 | 288 | \ece 289 | \end{frame} 290 | 291 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 292 | 293 | \begin{frame}{Length} 294 | 295 | \plitemsep 0.2in 296 | 297 | \bci 298 | 299 | \item An inner product naturally induces a norm by defining: 300 | $$ 301 | \norm{\vec{x}} \eqdef \sqrt{\inner{\vec{x}}{\vec{x}}} 302 | $$ 303 | 304 | \item Not every norm is induced by an inner product 305 | 306 | \item \redf{Cauchy-Schwarz inequality.} For the norm induced by the inner product, 307 | $$ 308 | |\inner{\vec{x}}{\vec{y}}| \le \norm{\vec{x}} \ \norm{\vec{y}} 309 | $$ 310 | 311 | \eci 312 | \end{frame} 313 | 314 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 315 | \begin{frame}{Distance} 316 | 317 | \plitemsep 0.1in 318 | 319 | \bci 320 | 321 | \item Now, we can introduce a notion of distance using a norm as: 322 | 323 | \medskip 324 | \redf{Distance}. $d(\vec{x},\vec{y}) \eqdef \norm{\vec{x} - \vec{y}} = \sqrt{\inner{\vec{x}-\vec{y}}{\vec{x}-\vec{y}}} $ 325 | 326 | \item If the dot product is used as an inner product in $\real^n,$ it is the \bluef{Euclidean distance.} 327 | 328 | \item \redf{Note.} The distance between two vectors does \bluef{NOT} necessarily require the notion of a norm. A norm is just sufficient. 329 | 330 | \item Generally, if the following is satisfied, it is a suitable notion of distance, called a \bluef{metric}. 331 | \bci 332 | \item \bluef{\em Positive definite}. $d(\vec{x},\vec{y}) \ge 0$ for all $\vec{x},\vec{y}$ and $d(\vec{x},\vec{y}) = 0 \Longleftrightarrow \vec{x}=\vec{y}$ 333 | \item \bluef{\em Symmetric}. $d(\vec{x},\vec{y}) = d(\vec{y},\vec{x})$ 334 | \item \bluef{\em Triangle inequality}.
$d(\vec{x},\vec{z}) \le d(\vec{x},\vec{y}) + d(\vec{y},\vec{z})$ 335 | \eci 336 | \eci 337 | \end{frame} 338 | 339 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 340 | \section{L3(4)} 341 | \begin{frame}{Angle, Orthogonal, and Orthonormal} 342 | 343 | \plitemsep 0.1in 344 | 345 | \bci 346 | 347 | \item Using C-S inequality, $$-1 \le \frac{\inner{\vec{x}}{\vec{y}}}{\norm{\vec{x}} \ \norm{\vec{y}}} \le 1$$ 348 | 349 | \item Then, there exists a unique $\omega \in [0,\pi]$ with $$\cos \omega = \frac{\inner{\vec{x}}{\vec{y}}}{\norm{\vec{x}} \ \norm{\vec{y}}}$$ 350 | 351 | \item We define $\omega$ as the \bluef{angle} between $\vec{x}$ and $\vec{y}.$ 352 | 353 | \item \defi If $\inner{\vec{x}}{\vec{y}} = 0,$ in other words their angle is $\pi/2,$ we say that they are \bluef{orthogonal}, denoted by $\vec{x} \perp \vec{y}.$ Additionally, if $\norm{x} = \norm{y} =1,$ they are \bluef{orthonormal}. 354 | \eci 355 | \end{frame} 356 | 357 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 358 | \begin{frame}{Example} 359 | 360 | \plitemsep 0.15in 361 | 362 | \bci 363 | 364 | \item Orthogonality is defined by a given inner product. Thus, different inner products may lead to different results about orthogonality. 365 | 366 | \item \exam Consider two vectors $\vec{x}=\colvec{1 \\1 }$ and $\vec{y}=\colvec{-1 \\ 1 }$ 367 | 368 | \item Using the dot product as the inner product, they are orthogonal. 369 | 370 | \item However, using $\inner{\vec{x}}{\vec{y}} = \trans{\vec{x}} 371 | \begin{nmat} 372 | 2 & 0 \cr 373 | 0 & 1 374 | \end{nmat} \vec{y}$, they are not orthogonal. 375 | \aleq{ 376 | \cos \omega = \frac{\inner{\vec{x}}{\vec{y}}}{\norm{\vec{x}} \ \norm{\vec{y}}} = -\frac{1}{3} \implies \omega \approx 1.91 \text{ rad } \approx 109.5\text{\textdegree} 377 | } 378 | \eci 379 | 380 | 381 | \end{frame} 382 | 383 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 384 | \begin{frame}{Orthogonal Matrix} 385 | 386 | \plitemsep 0.05in 387 | 388 | \bci 389 | 390 | \item \defi A square matrix $\mat{A} \in \real^{n \times n}$ is an \bluef{orthogonal matrix}, iff its columns (or rows) are \bluef{orthonormal} so that 391 | $$ 392 | \mat{A} \trans{\mat{A}} = I = \trans{\mat{A}}\mat{A}, \text{ implying } \inv{\mat{A}} = \trans{\mat{A}}. 393 | $$ 394 | \vspace{-0.3cm} 395 | \bci 396 | \item We can use \bluef{$\inv{\mat{A}} = \trans{\mat{A}}$} for the definition of orthogonal matrices. 397 | \item Fact 1. $\mA,\mB$: orthogonal $\implies$ $\mA\mB$: orthogonal 398 | \item Fact 2. 
$\mA$: orthogonal $\implies$ $\det(\mA) = \pm 1$ 399 | \eci 400 | 401 | 402 | \item The linear mapping $\Phi$ by orthogonal matrices preserves \bluef{length} and \bluef{angle} (for the dot product) 403 | \aleq{ 404 | \norm{\Phi(\vec{x})}^2 = \norm{\mat{A}\vec{x}}^2 = \trans{(\mat{A}\vec{x})} (\mat{A} \vec{x}) = \trans{\vec{x}} \trans{\mat{A}} \mat{A} \vec{x} = \trans{\vec{x}} \vec{x} = \norm{\vec{x}}^2 405 | } 406 | \vspace{-0.7cm} 407 | \aleq{ 408 | \cos \omega = \frac{\trans{(\mat{A}\vec{x})} (\mat{A}\vec{y})}{\norm{\mat{A}\vec{x}} \ \norm{\mat{A}\vec{y}}} = 409 | \frac 410 | { 411 | \trans{\vec{x}} \trans{\mat{A}} \mat{A} \vec{y} 412 | } 413 | { 414 | \sqrt{\trans{\vec{x}} \trans{\mat{A}} \mat{A} \vec{x} \trans{\vec{y}} \trans{\mat{A}} \mat{A} \vec{y} 415 | } 416 | } 417 | = \frac{\trans{\vec{x}} \vec{y}}{\norm{\vec{x}} \ \norm{\vec{y}}} 418 | } 419 | 420 | \eci 421 | \end{frame} 422 | 423 | 424 | 425 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 426 | \section{L3(5)} 427 | \begin{frame}{Roadmap} 428 | 429 | \plitemsep 0.1in 430 | 431 | \bce[(1)] 432 | \item \grayf{Norms 433 | 434 | \item Inner Products 435 | 436 | \item Lengths and Distances 437 | 438 | \item Angles and Orthogonality} 439 | 440 | \item \redf{Orthonormal Basis 441 | 442 | \item Orthogonal Complement 443 | 444 | \item Inner Product of Functions} 445 | 446 | \item \grayf{Orthogonal Projections 447 | 448 | \item Rotations} 449 | 450 | \ece 451 | \end{frame} 452 | 453 | 454 | 455 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 456 | \begin{frame}{Orthonormal Basis} 457 | 458 | \plitemsep 0.1in 459 | 460 | \bci 461 | 462 | \item A basis that is orthonormal, i.e., the basis vectors are all orthogonal to each other and their lengths are 1. 463 | 464 | \item The standard basis in $\real^n,$ $\{\vec{e}_1, \ldots, \vec{e}_n \},$ is orthonormal. 465 | 466 | 467 | \item \question How to obtain an orthonormal basis? 468 | 469 | \bigskip 470 | \mycolorbox{ 471 | \item[1.] Use Gaussian elimination to find a basis for a vector space spanned by a set of vectors. 472 | \bci 473 | \item Given a set $\{\vec{b}_1, \ldots, \vec{b}_n \}$ of non-orthogonal and unnormalized basis vectors, apply Gaussian elimination to the augmented matrix $(\mat{B}\trans{\mat{B}}|\mat{B})$ 474 | \eci 475 | 476 | \item[2.] Constructive way: Gram-Schmidt process (we will cover this later) 477 | } 478 | \eci 479 | \end{frame} 480 | 481 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 482 | \section{L3(6)} 483 | \begin{frame}{Orthogonal Complement (1)} 484 | 485 | \plitemsep 0.1in 486 | 487 | \bci 488 | 489 | \item Consider a $D$-dimensional vector space $V$ and an $M$-dimensional subspace $U \subset V.$ The \bluef{orthogonal complement} $\ocomp{U}$ is a $(D-M)$-dimensional subspace of $V$ and contains all vectors in $V$ that are orthogonal to every vector in $U.$ 490 | 491 | \item $U \cap \ocomp{U} = \{\vec{0}\}$ 492 | 493 | \item Any vector $\vec{x} \in V$ can be uniquely decomposed into: 494 | \aleq{ 495 | \vec{x} = \sum_{m=1}^M \lambda_m \vec{b}_m + \sum_{j=1}^{D-M} \psi_j \ocomp{\vec{b}}_j, \quad \lambda_m, \psi_j \in \real, 496 | } 497 | where $(\vec{b}_1, \ldots, \vec{b}_M)$ and $(\ocomp{\vec{b}}_1, \ldots, \ocomp{\vec{b}}_{D-M} )$ are the \bluef{bases} of $U$ and $\ocomp{U},$ respectively.
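% Added illustrative example (a minimal sketch using the standard basis of $\real^3$; not in the original slides):
\item \exam In $V = \real^3$ with the dot product, take $U = \spn{\vec{e}_1, \vec{e}_2}$ (so $D=3$, $M=2$); then $\ocomp{U} = \spn{\vec{e}_3}$, and $\vec{x} = \colvec{2\\3\\5}$ is uniquely decomposed as $\vec{x} = (2\vec{e}_1 + 3\vec{e}_2) + 5\vec{e}_3,$ with $2\vec{e}_1 + 3\vec{e}_2 \in U$ and $5\vec{e}_3 \in \ocomp{U}.$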
498 | \eci 499 | \end{frame} 500 | 501 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 502 | \begin{frame}{Orthogonal Complement (2)} 503 | 504 | \plitemsep 0.1in 505 | 506 | \vspace{-0.3cm} 507 | \begin{center} 508 | \mypic{0.35}{L3_ocomp.png} 509 | \end{center} 510 | \vspace{-0.5cm} 511 | \bci 512 | \item The vector $\vw$ with $\norm{\vw}=1,$ which is orthogonal to $U$, is the basis of $\ocomp{U}.$ 513 | \item Such $\vw$ is called \bluef{normal vector} to $U.$ 514 | 515 | \item For a linear mapping represented by a matrix $\mat{A} \in \real^{m \times n},$ the solution space of $\mat{A} \vec{x} =0$ is $\ocomp{\text{row}(\mat{A})},$ where $\text{row}(\mat{A})$ is the row space of $\mat{A}$ (i.e., span of row vectors). 516 | 517 | In other words, $\ocomp{\text{row}(\mat{A})} = \ker(\mat{A})$ 518 | \eci 519 | 520 | 521 | \end{frame} 522 | 523 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 524 | \section{L3(7)} 525 | \begin{frame}{Inner Product of Functions} 526 | 527 | \plitemsep 0.15in 528 | 529 | \bci 530 | 531 | \item \redf{Remind:} $V=\{\text{continuous functions in $\real$ over $[a,b]$} \},$ the following is a proper inner product. 532 | \bluef{$$\inner{u}{v} \eqdef \int_a^b u(x)v(x) dx$$} 533 | 534 | \item \exam Choose $u(x) = \sin(x)$ and $v(x)= \cos(x),$ where we select $a=-\pi$ and $b=\pi.$ Then, since $f(x) = u(x)v(x)$ is odd (i.e., $f(-x) = -f(x)$), 535 | $$ 536 | \int_{-\pi}^\pi u(x) v(x) dx =0. 537 | $$ 538 | 539 | \item Thus, $u$ and $v$ are orthogonal. 540 | 541 | \item Similarly, $\{1, \cos(x), \cos(2x), \cos(3x), \ldots, \}$ is orthogonal over $[-\pi,\pi].$ 542 | \eci 543 | 544 | 545 | \end{frame} 546 | 547 | 548 | 549 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 550 | \section{L3(8)} 551 | \begin{frame}{Roadmap} 552 | 553 | \plitemsep 0.1in 554 | 555 | \bce[(1)] 556 | \item \grayf{Norms 557 | 558 | \item Inner Products 559 | 560 | \item Lengths and Distances 561 | 562 | \item Angles and Orthogonality 563 | 564 | \item Orthonormal Basis 565 | 566 | \item Orthogonal Complement 567 | 568 | \item Inner Product of Functions} 569 | 570 | \item \redf{Orthogonal Projections} 571 | 572 | \item \grayf{Rotations} 573 | 574 | \ece 575 | \end{frame} 576 | 577 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 578 | \begin{frame}{Projection: Motivation} 579 | 580 | \plitemsep 0.05in 581 | 582 | \bci 583 | 584 | \item Big data: high dimensional 585 | 586 | \item However, most information is contained in a few dimensions 587 | 588 | \item \bluef{Projection}: A process of reducing the dimensions (hopefully) without loss of much information\footnote{In \lecturemark{L10}, we will formally study this with the topic of PCA (Principal Component Analysis).} 589 | 590 | \item \exam Projection of 2D dataset onto 1D subspace 591 | 592 | \centering 593 | \mypic{0.4}{L3_projection_ex.png} 594 | \eci 595 | 596 | 597 | \end{frame} 598 | 599 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 600 | \begin{frame}{Projection onto Lines (1D Subspaces)} 601 | 602 | \plitemsep 0.1in 603 | 604 | \bci 605 | \item Consider a 1D subspace $U \subset \real^n$ spanned by the basis $\vec{b}.$ 606 | 607 | \item For $\vx \in \realn,$ what is its projection \bluef{$\pi_U(\vec{x})$} onto $U$ (assume the dot product)? 
608 | \myvartwocols{0.3}{0.7}{0.29} 609 | { 610 | \small 611 | \aleq{ 612 | &\inner{\vec{x} - \pi_U(\vec{x})}{\vec{b}} = 0 \xleftrightarrow{\pi_U(\vec{x}) = \lambda \vec{b}} \inner{\vec{x} - \lambda \vec{b}}{\vec{b}}=0\cr 613 | & \implies \lambda = \frac{\inner{\vec{b}}{\vec{x}}}{\norm{\vec{b}}^2} = \frac{\trans{\vec{b}}\vec{x}}{\norm{\vec{b}}^2}, \ \text{and} \ \pi_U(\vec{x}) = \lambda \vec{b} = \bluef{\frac{\trans{\vec{b}}\vec{x}}{\norm{\vec{b}}^2} \vec{b}} 614 | } 615 | } 616 | { 617 | \vspace{-0.2cm} 618 | \mypic{0.8}{L3_projection_1D.png} 619 | } 620 | \vspace{-0.5cm} 621 | \item Projection matrix \redf{$\mat{P}_\pi \in \realnn$} in $\pi_U(\vec{x}) = \mat{P}_\pi \vec{x}$ 622 | \aleq{ 623 | \pi_U(\vec{x}) = \lambda \vec{b} = \vec{b} \lambda = \frac{\vec{b}\trans{\vec{b}}}{\norm{\vec{b}}^2} \vec{x}, \quad \mat{P}_\pi = \bluef{\frac{\vec{b}\trans{\vec{b}}}{\norm{\vec{b}}^2}} 624 | } 625 | \eci 626 | 627 | \end{frame} 628 | 629 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 630 | \begin{frame}{Inner Product and Projection} 631 | 632 | \plitemsep 0.1in 633 | 634 | \bci 635 | \item We project $\vx$ onto $\vb$, and let $\pi_{\vb}(\vx)$ be the projected vector. 636 | 637 | 638 | \item \question Understanding the inner project $\inner{\vx}{\vb}$ from the projection perspective? 639 | \mycolorbox{ 640 | $$ 641 | \inner{\vx}{\vb} = \norm{\pi_{\vb}(\vx)} \times \norm{\vb} 642 | $$ 643 | } 644 | \mytwocols{0.4} 645 | { 646 | \item In other words, the inner product of $\vx$ and $\vb$ is the product of (\bluef{length of the projection of $\vx$ onto $\vb$}) $\times$ (\bluef{length of $\vb$}) 647 | } 648 | { 649 | \vspace{-0.2cm} 650 | \mypic{0.6}{L3_projection_1D.png} 651 | } 652 | 653 | \eci 654 | 655 | \end{frame} 656 | 657 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 658 | \begin{frame}{Example} 659 | 660 | \plitemsep 0.1in 661 | 662 | \bci 663 | \item $\vec{b} = \colvec{1 \\ 2 \\ 2}$ 664 | \aleq{ 665 | \mat{P}_\pi = \frac{\vec{b}\trans{\vec{b}}}{\norm{\vec{b}}^2} = \frac{1}{9}\colvec{1\\2\\2}\rowvec{1 & 2 & 2} = \frac{1}{9} 666 | \begin{nmat} 667 | 1&2&2 \cr 668 | 2&4&4 \cr 669 | 2&4&4 670 | \end{nmat} 671 | } 672 | For $\vec{x} = \colvec{1\\1\\1},$ 673 | \aleq{ 674 | \pi_U(\vec{x}) = \mat{P}_\pi \vec{x} = \frac{1}{9} 675 | \begin{nmat} 676 | 1&2&2 \cr 677 | 2&4&4 \cr 678 | 2&4&4 679 | \end{nmat} \colvec{1\\1\\1} = \frac{1}{9} \colvec{5\\10\\10} \in \spn{\colvec{1\\2\\2}} 680 | } 681 | \eci 682 | 683 | \end{frame} 684 | 685 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 686 | \begin{frame}{Projection onto General Subspaces} 687 | 688 | \plitemsep 0.1in 689 | 690 | 691 | %\item Compare the results: 692 | 693 | \mytwocols{0.4} 694 | { 695 | \bci 696 | \item $\realn \rightarrow$ 1-Dim 697 | \item A basis vector $\vec{b}$ in 1D subspace 698 | \eci 699 | \centering 700 | $$ 701 | \pi_U(\vec{x}) = \bluef{\frac{\vec{b}\trans{\vec{b}}\vec{x}}{\trans{\vec{b}}\vec{b}}}, \ \lambda = \frac{\trans{\vec{b}}\vec{x}}{\trans{\vec{b}}\vec{b}} 702 | $$ 703 | $$ 704 | \mat{P}_\pi = \redf{\frac{\vec{b}\trans{\vec{b}}}{\trans{\vec{b}}\vec{b} }} 705 | $$ 706 | } 707 | { 708 | \bci 709 | \item $\realn \rightarrow$ $m$-Dim, $(m < n)$ 710 | \item A basis matrix $B=\rowvec{\vec{b}_1, \cdots, \vec{b}_m} \in \real^{n \times m}$ 711 | \eci 712 | $$ 713 | \pi_U(\vec{x}) = \bluef{\mB\inv{(\trans{\mB}\mB)}\trans{\mB} \vec{x}}, \ 714 | \vlam = \inv{(\trans{\mB}\mB)}\trans{\mB} \vec{x} 715 | $$ 716 | $$ 717 | \mat{P}_\pi = \redf{\mB\inv{(\trans{\mB}\mB)}\trans{\mB} } 718 | $$ 719 | } 720 | 
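% Added derivation sketch (source comment only; it follows the same orthogonality argument as the 1-D case above):
% requiring $\vec{x} - \mB\vlam \perp U$, i.e., $\trans{\mB}(\vec{x} - \mB\vlam) = \vec{0}$, gives the normal equation
% $\trans{\mB}\mB\vlam = \trans{\mB}\vec{x}$, hence $\vlam = \inv{(\trans{\mB}\mB)}\trans{\mB}\vec{x}$ and
% $\pi_U(\vec{x}) = \mB\vlam$, assuming $\mB$ has full column rank so that $\trans{\mB}\mB$ is invertible.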
\vspace{-0.4cm} 721 | \bci 722 | \item $\lambda \in \real^{1}$ and $\vlam \in \realm$ are the coordinates in the projected spaces, respectively. 723 | \item $\inv{(\trans{\mB}\mB)}\trans{\mB}$ is called the \bluef{pseudo-inverse}. 724 | \item The derivation is analogous to the case of 1-D lines (see p. 71). 725 | \eci 726 | \end{frame} 727 | 728 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 729 | \begin{frame}{Example: Projection onto 2D Subspace} 730 | 731 | \plitemsep 0.1in 732 | \small 733 | \bci 734 | \item $U = \spn{\colvec{1\\1\\1}, \colvec{0\\1\\2}} \subset \real^3$ and $\vec{x} = \colvec{6\\0\\0}$. Check that $\{ \trans{\rowvec{1&1&1}}, \trans{\rowvec{0&1&2}}\}$ is a basis. 735 | \item Let $\mat{B} = \begin{nmat} 736 | 1&0\cr 737 | 1&1\cr 738 | 1&2 739 | \end{nmat}.$ Then, $\trans{\mat{B}}\mat{B} = 740 | \begin{nmat} 741 | 1&1&1\cr 742 | 0&1&2 743 | \end{nmat} 744 | \begin{nmat} 745 | 1&0\cr 746 | 1&1\cr 747 | 1&2 748 | \end{nmat} 749 | = 750 | \begin{nmat} 751 | 3&3\cr 752 | 3&5 753 | \end{nmat} 754 | $ 755 | \item One can see that $\mat{P}_\pi = \mB\inv{(\trans{\mB}\mB)}\trans{\mB} = \dfrac{1}{6} 756 | \begin{nmat} 757 | 5&2&-1\cr 758 | 2&2&2\cr 759 | -1&2&5 760 | \end{nmat} 761 | $, and $\pi_U(\vec{x}) = \dfrac{1}{6} 762 | \begin{nmat} 763 | 5&2&-1\cr 764 | 2&2&2\cr 765 | -1&2&5 766 | \end{nmat} \colvec{6\\0\\0} = \colvec{5\\2\\-1}$ 767 | 768 | \eci 769 | 770 | \end{frame} 771 | 772 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 773 | \begin{frame}{Gram-Schmidt Orthogonalization Method (G-S method)} 774 | 775 | \plitemsep 0.05in 776 | 777 | \bci 778 | \item Constructively transform any basis $(\vb_1, \ldots, \vb_n)$ of an $n$-dimensional vector space $V$ into an orthogonal/orthonormal basis $(\vu_1, \ldots, \vu_n)$ of $V$ 779 | 780 | \item Iteratively construct it as follows: 781 | \mycolorbox{ 782 | \vspace{-0.2cm} 783 | \aleq{ 784 | \vu_1 &\eqdef \vb_1 \cr 785 | \vu_k &\eqdef \vb_k - \pi_{\text{span}[\vu_1, \ldots, \vu_{k-1}]}(\vb_k), \ k=2, \ldots, n \qquad \qquad (*) 786 | } 787 | } 788 | %\item In $(*)$ 789 | % \mytwocols{0.3} 790 | % { 791 | % \bci 792 | % \item $\pi_{\text{span}[\vu_1, \ldots, \vu_{k-1}]}(\vb_k)$: projection of $\vb_k$ onto the subspace spanned by $[\vu_1, \ldots, \vu_{k-1}]$ 793 | % \item Then, $\vu_k$ becomes orthogonal to $\text{span}[\vu_1, \ldots, \vu_{k-1}]$ 794 | % \eci 795 | % } 796 | % { 797 | % } 798 | \eci 799 | \vspace{-0.3cm} 800 | \mypic{0.9}{L3_gramschmidt.png} 801 | 802 | \end{frame} 803 | 804 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 805 | \begin{frame}{Example: G-S method} 806 | 807 | \plitemsep 0.15in 808 | 809 | \bci 810 | \item A basis $(\vb_1, \vb_2)$ of $\real^2,$ $\vb_1 = \colvec{2 \\ 0}$ and $\vb_2 = \colvec{1 \\1}$ 811 | 812 | \item $\vu_1 = \vb_1 = \colvec{2 \\ 0}$ and 813 | \aleq{ 814 | \vu_2 = \vb_2 - \pi_{\text{span}[\vu_1]}(\vb_2) = \vb_2 - \frac{\vu_1\trans{\vu_1}}{\norm{\vu_1}^2} \vb_2 815 | = \colvec{1\\1} - \begin{nmat} 816 | 1 & 0 \cr 817 | 0 & 0 818 | \end{nmat} 819 | \colvec{1 \\1} = \colvec{0 \\1} 820 | } 821 | 822 | \item $\vu_1$ and $\vu_2$ are orthogonal. If we want them to be orthonormal, then just normalization would do the job.
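% Added worked step (a small sketch continuing the example above):
\item For instance, normalizing gives $\frac{\vu_1}{\norm{\vu_1}} = \colvec{1 \\ 0}$ and $\frac{\vu_2}{\norm{\vu_2}} = \colvec{0 \\ 1},$ an orthonormal basis of $\real^2.$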
823 | \eci 824 | \end{frame} 825 | 826 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 827 | \begin{frame}{Projection onto Affine Subspaces} 828 | 829 | \begin{center} 830 | \mypic{0.7}{L3_projection_affine.png} 831 | \end{center} 832 | 833 | \plitemsep 0.05in 834 | \vspace{-0.5cm} 835 | \bci 836 | \item Affine space: $L = \vec{x}_0 + U$ 837 | \item Affine subspaces are not vector spaces 838 | \item Idea: (i) move $\vec{x}$ to a point in $U$, (ii) do the projection, (iii) move back to $L$ 839 | \bluef{$$\pi_L(\vec{x}) = \vec{x}_0 + \pi_{U}(\vec{x} - \vec{x}_0)$$} 840 | \eci 841 | 842 | \end{frame} 843 | 844 | 845 | 846 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 847 | \section{L3(9)} 848 | \begin{frame}{Roadmap} 849 | 850 | \plitemsep 0.1in 851 | 852 | \bce[(1)] 853 | \item \grayf{Norms 854 | 855 | \item Inner Products 856 | 857 | \item Lengths and Distances 858 | 859 | \item Angles and Orthogonality 860 | 861 | \item Orthonormal Basis 862 | 863 | \item Orthogonal Complement 864 | 865 | \item Inner Product of Functions 866 | 867 | \item Orthogonal Projections} 868 | 869 | \item \redf{Rotations} 870 | 871 | \ece 872 | \end{frame} 873 | 874 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 875 | \begin{frame}{Rotation} 876 | 877 | \plitemsep 0.07in 878 | 879 | \bci 880 | \item Length and angle preservation: two properties of linear mappings with \bluef{orthogonal matrices}. Let's look at some of their special cases. 881 | 882 | \item A linear mapping that rotates the given coordinate system by an angle $\theta.$ 883 | 884 | \item Basis change 885 | \item $\vec{e}_1 = \colvec{1 \\ 0} \rightarrow \colvec{\cos\theta \\ \sin\theta}$ and $\vec{e}_2 = \colvec{0 \\ 1} \rightarrow \colvec{-\sin\theta \\ \cos\theta}$ 886 | 887 | \item Rotation matrix $\vec{R}(\theta) = \begin{nmat} 888 | \cos\theta & -\sin\theta \cr 889 | \sin\theta & \cos\theta 890 | \end{nmat}$ 891 | 892 | \item Properties 893 | \bci 894 | \item Preserves distance: $\norm{\vec{x} - \vec{y}} = \norm{\mat{R}_\theta(\vec{x}) - \mat{R}_\theta(\vec{y})}$ 895 | \item Preserves angle 896 | \eci 897 | \eci 898 | 899 | \end{frame} 900 | 901 | 902 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 903 | \begin{frame}{} 904 | \vspace{2cm} 905 | \LARGE Questions? 
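% Added example for the rotation slides above (source comment only; a small sketch):
% for $\theta = \pi/2$, $\mat{R}(\pi/2) = \begin{nmat} 0 & -1 \cr 1 & 0 \end{nmat}$ maps
% $\vec{e}_1 \mapsto \vec{e}_2$ and $\vec{e}_2 \mapsto -\vec{e}_1$, and, being orthogonal,
% it preserves lengths, angles, and hence distances.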
906 | 907 | 908 | \end{frame} 909 | 910 | \begin{frame}{Review Questions} 911 | % \tableofcontents 912 | %\plitemsep 0.1in 913 | \bce[1)] 914 | \item 915 | 916 | \ece 917 | \end{frame} 918 | 919 | 920 | \end{document} 921 | -------------------------------------------------------------------------------- /04.MatrixDecomposition/4.MD-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/04.MatrixDecomposition/4.MD-2.pdf -------------------------------------------------------------------------------- /04.MatrixDecomposition/4.MD-4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/04.MatrixDecomposition/4.MD-4.pdf -------------------------------------------------------------------------------- /04.MatrixDecomposition/4.MD.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/04.MatrixDecomposition/4.MD.pdf -------------------------------------------------------------------------------- /04.MatrixDecomposition/L4_SVD_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/04.MatrixDecomposition/L4_SVD_matrix.png -------------------------------------------------------------------------------- /04.MatrixDecomposition/L4_UTM_LTM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/04.MatrixDecomposition/L4_UTM_LTM.png -------------------------------------------------------------------------------- /04.MatrixDecomposition/L4_cofactor_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/04.MatrixDecomposition/L4_cofactor_ex.png -------------------------------------------------------------------------------- /04.MatrixDecomposition/L4_eigendecomposition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/04.MatrixDecomposition/L4_eigendecomposition.png -------------------------------------------------------------------------------- /04.MatrixDecomposition/L4_ev_ex1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/04.MatrixDecomposition/L4_ev_ex1.png -------------------------------------------------------------------------------- /04.MatrixDecomposition/L4_ev_ex2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/04.MatrixDecomposition/L4_ev_ex2.png -------------------------------------------------------------------------------- /04.MatrixDecomposition/L4_ev_ex3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/04.MatrixDecomposition/L4_ev_ex3.png 
-------------------------------------------------------------------------------- /04.MatrixDecomposition/L4_ev_ex4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/04.MatrixDecomposition/L4_ev_ex4.png -------------------------------------------------------------------------------- /04.MatrixDecomposition/L4_ev_ex5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/04.MatrixDecomposition/L4_ev_ex5.png -------------------------------------------------------------------------------- /04.MatrixDecomposition/L4_matrix_approx.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/04.MatrixDecomposition/L4_matrix_approx.png -------------------------------------------------------------------------------- /04.MatrixDecomposition/L4_matrix_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/04.MatrixDecomposition/L4_matrix_tree.png -------------------------------------------------------------------------------- /05.VectorCaculus/5.VC-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/05.VectorCaculus/5.VC-2.pdf -------------------------------------------------------------------------------- /05.VectorCaculus/5.VC-4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/05.VectorCaculus/5.VC-4.pdf -------------------------------------------------------------------------------- /05.VectorCaculus/5.VC.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/05.VectorCaculus/5.VC.pdf -------------------------------------------------------------------------------- /05.VectorCaculus/L5_computation_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/05.VectorCaculus/L5_computation_graph.png -------------------------------------------------------------------------------- /05.VectorCaculus/L5_grad_matrix_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/05.VectorCaculus/L5_grad_matrix_1.png -------------------------------------------------------------------------------- /05.VectorCaculus/L5_grad_matrix_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/05.VectorCaculus/L5_grad_matrix_2.png -------------------------------------------------------------------------------- /05.VectorCaculus/L5_grad_matrix_3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/05.VectorCaculus/L5_grad_matrix_3.png -------------------------------------------------------------------------------- /05.VectorCaculus/L5_useful.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/05.VectorCaculus/L5_useful.png -------------------------------------------------------------------------------- /05.VectorCaculus/main.tex: -------------------------------------------------------------------------------- 1 | %\pdfminorversion=4 2 | \documentclass[handout,fleqn,aspectratio=169]{beamer} 3 | 4 | \input{../myhead} 5 | 6 | 7 | 8 | \title[]{Lecture 5: Vector Calculus} 9 | \author{Yi, Yung (이융)} 10 | \institute{Mathematics for Machine Learning\\ \url{https://yung-web.github.io/home/courses/mathml.html} 11 | \\KAIST EE} 12 | \date{\today} 13 | 14 | \input{../mymath} 15 | \input{../mymacro} 16 | 17 | \begin{document} 18 | 19 | \input{../mydefault} 20 | 21 | % START START START START START START START START START START START START START 22 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 23 | \begin{frame}{Roadmap} 24 | 25 | \plitemsep 0.1in 26 | 27 | \bce[(1)] 28 | \item Differentiation of Univariate Functions 29 | 30 | \item Partial Differentiation and Gradients 31 | 32 | \item Gradients of Vector-Valued Functions 33 | 34 | \item Gradients of Matrices 35 | 36 | \item Useful Identities for Computing Gradients 37 | 38 | \item Backpropagation and Automatic Differentiation 39 | 40 | \item Higher-Order Derivatives 41 | 42 | \item Linearization and Multivariate Taylor Series 43 | 44 | \ece 45 | \end{frame} 46 | 47 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 48 | \begin{frame}{Summary} 49 | 50 | \plitemsep 0.1in 51 | 52 | \bci 53 | \item Machine learning is about solving an optimization problem whose variables are the parameters of a given model. 54 | 55 | \item Solving optimization problems require gradient information. 
56 | 57 | \item Central to this chapter is the concept of the function, which we often write 58 | 59 | \aleq{ 60 | f : \real^{D} \mapsto \real\cr 61 | \vec{x} \mapsto f(\vec{x}) 62 | } 63 | 64 | \eci 65 | \end{frame} 66 | 67 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 68 | \section{L5(1)} 69 | \begin{frame}{Roadmap} 70 | 71 | \plitemsep 0.1in 72 | 73 | \bce[(1)] 74 | \item \redf{Differentiation of Univariate Functions} 75 | 76 | \item \grayf{Partial Differentiation and Gradients 77 | 78 | \item Gradients of Vector-Valued Functions 79 | 80 | \item Gradients of Matrices 81 | 82 | \item Useful Identities for Computing Gradients 83 | 84 | \item Backpropagation and Automatic Differentiation 85 | 86 | \item Higher-Order Derivatives 87 | 88 | \item Linearization and Multivariate Taylor Series} 89 | 90 | \ece 91 | \end{frame} 92 | 93 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 94 | \begin{frame}{Difference Quotient and Derivative} 95 | 96 | \plitemsep 0.3in 97 | 98 | \bci 99 | \item \redf{Difference Quotient.} The average slope of $f$ between $x$ and $x+\partial x$ 100 | 101 | \aleq{ 102 | \pd{y}{x} \eqdef \frac{f(x+\partial x) - f(x)}{\partial x} 103 | } 104 | 105 | \item \redf{Derivative.} Pointing in the direction of steepest ascent of $f.$ 106 | 107 | \aleq{ 108 | \d{f}{x} \eqdef \lim_{h \rightarrow 0} \frac{f(x+h)-f(x)}{h} 109 | } 110 | 111 | \item Unless confusion arises, we often use $f' = \d{f}{x}.$ 112 | \eci 113 | \end{frame} 114 | 115 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 116 | \begin{frame}{Taylor Series} 117 | 118 | \plitemsep 0.1in 119 | 120 | \bci 121 | 122 | \item Representation of a function as an infinite sum of terms, using derivatives of evaluated at $x_0.$ 123 | 124 | \item \redf{Taylor polynomial.} The Taylor polynomial of degree $n$ of $f : \real \mapsto \real$ at $x_0$ is: 125 | \aleq{ 126 | T_n(x) \eqdef \sum_{k=0}^n \frac{f^{(k)}(x_0)}{k!} (x-x_0)^k, \ \text{where $f^{(k)}(x_0)$ is the $k$th derivative of $f$ at $x_0.$} 127 | } 128 | 129 | \item \redf{Taylor Series.} For a smooth function $f\in \set{C}^{\infty},$ the Taylor series of $f$ at $x_0$ is: 130 | \aleq{ 131 | T_\infty(x) \eqdef \sum_{k=0}^\infty \frac{f^{(k)}(x_0)}{k!} (x-x_0)^k. 132 | } 133 | 134 | \item If $f(x) = T_\infty(x),$ $f$ is called \bluef{analytic}. 
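% NOTE (editor's illustration, not part of the lecture source): a small numerical companion to the Taylor polynomial T_n defined above. The sketch sums the first n+1 terms for f(x) = exp(x) at x0 = 0, where every derivative f^(k)(0) equals 1, and shows the error shrinking as n grows; plain Python only, and the evaluation point x = 1.0 is an arbitrary choice.

import math

def taylor_poly(derivs_at_x0, x0, x):
    # T_n(x) = sum_{k=0}^{n} f^(k)(x0) / k! * (x - x0)^k
    return sum(d / math.factorial(k) * (x - x0) ** k for k, d in enumerate(derivs_at_x0))

x = 1.0
for n in (1, 2, 4, 8):
    approx = taylor_poly([1.0] * (n + 1), 0.0, x)   # exp: all derivatives at x0 = 0 are 1
    print(n, approx, math.exp(x) - approx)          # approximation error decreases with n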
135 | \eci 136 | \end{frame} 137 | 138 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 139 | \begin{frame}{Differentiation Rules} 140 | 141 | \plitemsep 0.25in 142 | 143 | \bci 144 | 145 | \item \bluef{Product rule.} $(f(x)g(x))' = f'(x)g(x) + f(x)g'(x)$ 146 | 147 | \item \bluef{Quotient rule.} $\left(\dfrac{f(x)}{g(x)}\right)' = \dfrac{f'(x)g(x) - f(x)g'(x)}{(g(x))^2} $ 148 | 149 | \item \bluef{Sum rule.} $(f(x)+g(x))' = f'(x) + g'(x)$ 150 | 151 | \item \bluef{Chain rule.} $(g(f(x)))' = g'(f(x))f'(x)$ 152 | 153 | \eci 154 | \end{frame} 155 | 156 | 157 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 158 | \section{L5(2)} 159 | \begin{frame}{Roadmap} 160 | 161 | \plitemsep 0.1in 162 | 163 | \bce[(1)] 164 | \item \grayf{Differentiation of Univariate Functions} 165 | 166 | \item \redf{Partial Differentiation and Gradients } 167 | 168 | \item \grayf{Gradients of Vector-Valued Functions 169 | 170 | \item Gradients of Matrices 171 | 172 | \item Useful Identities for Computing Gradients 173 | 174 | \item Backpropagation and Automatic Differentiation 175 | 176 | \item Higher-Order Derivatives 177 | 178 | \item Linearization and Multivariate Taylor Series} 179 | 180 | \ece 181 | \end{frame} 182 | 183 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 184 | \begin{frame}{Gradient} 185 | 186 | \plitemsep 0.1in 187 | 188 | \bci 189 | 190 | \item Now, \bluef{$f: \realn \mapsto \real.$} 191 | 192 | \item Gradient of $f$ w.r.t. $\vec{x}$ $\grad_{\vec{x}} f$: Varying one variable at a time and keeping the others constant. 193 | 194 | \bigskip 195 | 196 | \mytwocols{0.5} 197 | { 198 | \redf{Partial Derivative.} 199 | For $f : \realn \mapsto \real,$ 200 | 201 | \aleq{ 202 | \pd{f}{x_1} &= \lim_{h \rightarrow 0} \frac{f(x_1+h,x_2, \ldots, x_n) - f(\vec{x})}{h}\cr 203 | & \vdots \cr 204 | \pd{f}{x_n} &= \lim_{h \rightarrow 0} \frac{f(x_1,x_2, \ldots, x_n+h) - f(\vec{x})}{h} 205 | } 206 | } 207 | { 208 | \redf{Gradient.} Get the partial derivatives and collect them in the row vector. 
209 | 210 | \aleq{ 211 | \grad_{\vec{x}} f = \d{f}{\vec{x}} = 212 | \rowvec{\pd{f(\vec{x})}{x_1} & \cdots & \pd{f(\vec{x})}{x_n}} \in \real^{1 \times n} 213 | } 214 | } 215 | \eci 216 | \end{frame} 217 | 218 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 219 | \begin{frame}{Example} 220 | 221 | \plitemsep 0.2in 222 | 223 | \bci 224 | \item \exam $f(x,y) = (x+2y^3)^2$ 225 | \aleq{ 226 | \pd{f(x,y)}{x} &= 2(x+2y^3) \pd{x+2y^3}{x} = 2(x+2y^3)\cr 227 | \pd{f(x,y)}{y} &= 2(x+2y^3) \pd{x+2y^3}{y} = 12(x+2y^3)y^2 228 | } 229 | 230 | \item \exam $f(x_1, x_2) = x_1^2 x_2 + x_1 x_2^3$ 231 | \aleq{ 232 | \grad_{(x_1,x_2)}f = \d{f}{x} = \rowvec{\pd{f(x_1,x_2)}{x_1} &\pd{f(x_1,x_2)}{x_2}} = \rowvec{2x_1x_2 + x_2^3 & 233 | x_1^2 + 3x_1x_2^2} 234 | } 235 | \eci 236 | \end{frame} 237 | 238 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 239 | \begin{frame}{Rules for Partial Differentiation} 240 | 241 | \plitemsep 0.2in 242 | 243 | \bci 244 | \item \bluef{Product rule} $$\pd{}{\vec{x}}\big(f(\vec{x})g(\vec{x})\big) = \pd{f}{\vec{x}} g(\vec{x}) + f(\vec{x})\pd{g}{\vec{x}}$$ 245 | 246 | \item \bluef{Sum rule} $$\pd{}{\vec{x}} \big(f(\vec{x})+ g(\vec{x})\big) = \pd{f}{\vec{x}} + \pd{g}{\vec{x}}$$ 247 | 248 | \item \bluef{Chain rule} $$\pd{}{\vec{x}} g\big(f(\vec{x})\big) = \pd{g}{f}\pd{f}{\vec{x}}$$ 249 | 250 | \eci 251 | \end{frame} 252 | 253 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 254 | \begin{frame}{More about Chain Rule} 255 | 256 | \plitemsep 0.05in 257 | 258 | \bci 259 | \item $f: \real^2 \mapsto \real$ of two variables $x_1$ and $x_2.$ $x_1(t)$ and $x_2(t)$ are functions of $t.$ 260 | \aleq{ 261 | \d{f}{t} = \rowvec{\pd{f}{x_1} & \pd{f}{x_2}} \colvec{\pd{x_1(t)}{t} \\ \pd{x_2(t)}{t}} 262 | = \pd{f}{x_1}\pd{x_1}{t} + \pd{f}{x_2}\pd{x_2}{t} 263 | } 264 | \item \exam $f(x_1, x_2) = x_1^2 + 2 x_2,$ where $x_1(t) = \sin(t),\ x_2(t)=\cos(t)$ 265 | \aleq{ 266 | \d{f}{t} = \pd{f}{x_1}\pd{x_1}{t} + \pd{f}{x_2}\pd{x_2}{t} = 2\sin(t)\cos(t) - 2\sin{t} = 2\sin(t)(\cos(t)-1) 267 | } 268 | 269 | \item $f: \real^2 \mapsto \real$ of two variables $x_1$ and $x_2.$ $x_1(s,t)$ and $x_2(s,t)$ are functions of $s,t.$ 270 | 271 | \myvartwocols{0.2}{0.37}{0.6} 272 | { 273 | \small 274 | \vspace{-0.2cm} 275 | \aleq{ 276 | \pd{f}{s} &= \pd{f}{x_1}\pd{x_1}{s} + \pd{f}{x_2}\pd{x_2}{s}\cr 277 | \pd{f}{t} &= \pd{f}{x_1}\pd{x_1}{t} + \pd{f}{x_2}\pd{x_2}{t} 278 | } 279 | } 280 | { 281 | \aleq{ 282 | \d{f}{(s,t)} = \pd{f}{\vec{x}}\pd{\vec{x}}{(s,t)} = \rowvec{\pd{f}{x_1} & \pd{f}{x_2}} 283 | \begin{nmat} 284 | \pd{x_1}{s} & \pd{x_1}{t} \cr 285 | \pd{x_2}{s} & \pd{x_2}{t} 286 | \end{nmat} 287 | } 288 | } 289 | \eci 290 | \end{frame} 291 | 292 | 293 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 294 | \section{L5(3)} 295 | \begin{frame}{Roadmap} 296 | 297 | \plitemsep 0.1in 298 | 299 | \bce[(1)] 300 | \item \grayf{Differentiation of Univariate Functions} 301 | 302 | \item \grayf{Partial Differentiation and Gradients } 303 | 304 | \item \redf{Gradients of Vector-Valued Functions} 305 | 306 | \item \grayf{Gradients of Matrices 307 | 308 | \item Useful Identities for Computing Gradients 309 | 310 | \item Backpropagation and Automatic Differentiation 311 | 312 | \item Higher-Order Derivatives 313 | 314 | \item Linearization and Multivariate Taylor Series} 315 | 316 | \ece 317 | \end{frame} 318 | 319 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 320 | \begin{frame}{$\vec{f}: \realn \mapsto \realm$} 321 | 322 | \plitemsep 0.1in 323 | 324 | \bci 325 | \item For a function $\vec{f}: 
\realn \mapsto \realm$ and vector $\vec{x}= \trans{\rowvec{x_1 & \ldots & x_n}} \in \realn,$ the vector-valued function is: 326 | $$ 327 | \vec{f}(\vec{x}) = \colvec{f_1(\vec{x}) \\ \vdots \\ f_m(\vec{x})} 328 | $$ 329 | \item Partial derivative w.r.t. $x_i$ is a column vector: $\displaystyle \pd{\vec{f}}{x_i} = 330 | \colvec{\pd{f_1}{x_i} \\ \vdots \\ \pd{f_m}{x_i}}$ 331 | 332 | \item Gradient (or Jacobian): $\displaystyle \d{\vec{f}(\vec{x})}{\vec{x}} = \rowvec{\pd{\vec{f}(\vec{x})}{x_1} & \cdots & \pd{\vec{f}(\vec{x})}{x_n} }$ 333 | \eci 334 | \end{frame} 335 | 336 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 337 | \begin{frame}{Jacobian} 338 | 339 | \aleq{ 340 | \mJ &= \grad_{\vec{x}} \vec{f} = \d{\vec{f}(\vec{x})}{\vec{x}} = 341 | \rowvec{\pd{\vec{f}(\vec{x})}{x_1} & \cdots & \pd{\vec{f}(\vec{x})}{x_n} }\cr 342 | &= \begin{nmat} 343 | \pd{f_1(\vec{x})}{x_1} & \cdots & \pd{f_1(\vec{x})}{x_n} \cr 344 | \vdots & & \vdots \cr 345 | \pd{f_m(\vec{x})}{x_1} & \cdots & \pd{f_m(\vec{x})}{x_n} 346 | \end{nmat} 347 | } 348 | 349 | \bci 350 | \item For a \bluef{$\realn \mapsto \realm$} function, its Jacobian is a \bluef{$m \times n$} matrix. 351 | \eci 352 | % \plitemsep 0.1in 353 | % \bci 354 | % \item For a function $\vec{f}: \realn \mapsto \realm$ and vector $\vec{x}= \trans{\rowvec{x_1 & \ldots & x_n}} \in \realn,$ the vector-valued function is: 355 | % $$ 356 | % \vec{f}(\vec{x}) = \colvec{f_1(\vec{x}) \\ \vdots \\ f_m(\vec{x})} 357 | % $$ 358 | % \item Partial derivative w.r.t. $x_i$ is a column vector: $\displaystyle \pd{\vec{f}}{x_i} = 359 | % \colvec{\pd{f_1}{x_i} \\ \vdots \\ \pd{f_m}{x_i}}$ 360 | 361 | % \item Gradient (or Jacobian): $\displaystyle \d{\vec{f}(\vec{x})}{\vec{x}} = \rowvec{\pd{\vec{f}(\vec{x})}{x_1} & \cdots & \pd{\vec{f}(\vec{x})}{x_n} }$ 362 | % \eci 363 | \end{frame} 364 | 365 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 366 | \begin{frame}{Example: Gradient of Vector-Valued Function} 367 | 368 | \bci 369 | \item $\vf(\vx) = \mA \vx,$ $\vf: \realn \mapsto \realm,$ $\mA \in \realmn,$ $\vx \in \realn$ 370 | 371 | \item Partial derivatives: 372 | $ 373 | \displaystyle 374 | f_i(\vx) = \sum_{j=1}^n A_{ij} x_j \implies \pd{f_i}{x_j} = A_{ij} 375 | $ 376 | 377 | \item Graident 378 | \aleq{ 379 | \d{\vf}{\vx} = \begin{nmat} 380 | \pd{f_1}{x_1} & \cdots & \pd{f_1}{x_n} \cr 381 | \vdots & & \vdots \cr 382 | \pd{f_m}{x_1} & \cdots & \pd{f_m}{x_n} 383 | \end{nmat} = 384 | \begin{nmat} 385 | A_{11} & \cdots & A_{1n} \cr 386 | \vdots & & \vdots \cr 387 | A_{m1} & \cdots & A_{mn} 388 | \end{nmat} = \mA 389 | } 390 | 391 | \eci 392 | 393 | \end{frame} 394 | 395 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 396 | \begin{frame}{Example: Chain Rule} 397 | 398 | \bci 399 | \item $h: \real \mapsto \real,$ $h(t) = (f\circ g)(t)$ with 400 | \aleq{ 401 | f: \real^2 \mapsto \real, \ f(\vx) = \exp(x_1x_2^2), \quad g: \real \mapsto \real^2, \ 402 | \vx = \colvec{x_1 \\ x_2} = g(t) = \colvec{t\cos(t) \\ t\sin(t)} 403 | } 404 | 405 | \item \bluef{(Note)} $\pd{f}{\vx} \in \real^{1 \times 2}$ and $\pd{g}{t} \in \real^{2 \times 1}$ 406 | 407 | \item Using the chain rule, 408 | \aleq{ 409 | \d{h}{t} = \pd{f}{\vx} \pd{\vx}{t} &= \rowvec{\pd{f}{x_1} & \pd{f}{x_2}}\colvec{\pd{x_1}{t} \\ \pd{x_2}{t}}\cr 410 | &= \rowvec{\exp(x_1x_2^2)x_2^2 & 2\exp(x_1x_2^2)x_1x_2} \colvec{\cos(t)-t\sin(t) \\ \sin(t)+t\cos(t)} 411 | } 412 | \eci 413 | \end{frame} 414 | 415 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 416 | \begin{frame}{Example: Least-Square Loss (1)} 417 | 
418 | \plitemsep 0.1in 419 | 420 | \bci 421 | \item A linear model: $\vy = \mat{\Phi} \vth$ 422 | \item $\vth \in \real^D$: parameter vector 423 | \item $\mat{\Phi} \in \real^{N \times D}$: input features 424 | \item $\vy \in \real^N$: observations 425 | 426 | \item Goal: Find a good parameter vector that provides the best-fit, formulated by minimizing the following loss $L: \real^D \mapsto \real$ over the parameter vector $\vth$. 427 | \mycolorbox{ 428 | \vspace{-0.2cm} 429 | $$ 430 | L(\ve) \eqdef \norm{\ve}^2, \quad \text{where} \ \ve(\vth) = \vy - \mat{\Phi} \vth 431 | $$ 432 | } 433 | \eci 434 | \end{frame} 435 | 436 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 437 | \begin{frame}{Example: Least-Square Loss (2)} 438 | 439 | \plitemsep 0.2in 440 | 441 | \bci 442 | 443 | \item $\displaystyle \pd{L}{\vth} = \greenf{\pd{L}{\ve}} \orangef{\pd{\ve}{\vth}}$ 444 | \item \redf{Note.} $\displaystyle \pd{L}{\vth} \in \real^{1 \times D},$ $\displaystyle \greenf{\pd{L}{\ve}} \in \real^{1 \times N},$ $\displaystyle \orangef{\pd{\ve}{\vth}} \in \real^{N \times D}$ 445 | 446 | \item Using that $\norm{\ve}^2 = \trans{\ve}\ve$, $\displaystyle \greenf{\pd{L}{\ve}} = 2 \trans{\ve} \in \real ^{1 \times N}$ and $\displaystyle \orangef{\pd{\ve}{\vth}} = - \mat{\Phi} \in \real^{N \times D}$ 447 | \aleq{ 448 | \text{Finally, we get:} \quad \pd{L}{\vth} = \greenf{2\trans{\ve}}\orangef{(-\mat{\Phi})} = -\underbrace{2(\trans{\vy} - \trans{\vth}\trans{\mat{\Phi}})}_{1 \times N} \underbrace{\mat{\Phi}}_{N \times D} 449 | } 450 | \eci 451 | \end{frame} 452 | 453 | 454 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 455 | \section{L5(4)} 456 | \begin{frame}{Roadmap} 457 | 458 | \plitemsep 0.1in 459 | 460 | \bce[(1)] 461 | \item \grayf{Differentiation of Univariate Functions} 462 | 463 | \item \grayf{Partial Differentiation and Gradients } 464 | 465 | \item \grayf{Gradients of Vector-Valued Functions} 466 | 467 | \item \redf{Gradients of Matrices 468 | 469 | \item Useful Identities for Computing Gradients} 470 | 471 | \item \grayf{Backpropagation and Automatic Differentiation 472 | 473 | \item Higher-Order Derivatives 474 | 475 | \item Linearization and Multivariate Taylor Series} 476 | 477 | \ece 478 | \end{frame} 479 | 480 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 481 | \begin{frame}{Gradients of matrices} 482 | 483 | \plitemsep 0.1in 484 | 485 | \bci 486 | \item Gradient of matrix $\mA \in \real^{m \times n}$ w.r.t. 
matrix $\mB \in \real^{p \times q}$ 487 | 488 | \item Jacobian: A four-dimensional tensor\footnote{A multidimensional array} $\mJ = \d{\mA}{\mB} \in \real^{(m \times n) \times (p \times q)}$ 489 | 490 | \eci 491 | 492 | \myvartwocols{0.5}{0.15}{0.83} 493 | { 494 | \includegraphics[width=0.9\columnwidth]{L5_grad_matrix_1.png} 495 | } 496 | { 497 | \includegraphics[width=0.47\columnwidth]{L5_grad_matrix_2.png} 498 | \includegraphics[width=0.47\columnwidth]{L5_grad_matrix_3.png} 499 | } 500 | 501 | 502 | 503 | 504 | \end{frame} 505 | 506 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 507 | \begin{frame}{Example: Gradient of Vectors for Matrices (1)} 508 | 509 | \bci 510 | \item $\vf(\vx) = \mA \vx,$ $\vf \in \realm$, $\mA \in \realmn,$ $\vx \in \realn.$ What is \bluef{$\d{\vf}{\mA}$?} 511 | 512 | \item Dimension: If we consider $\vf: \realmn \mapsto \realm,$ $\d{\vf}{\mA} \in \real^{m\times (m \times n)}$ 513 | 514 | 515 | \item Partial derivatives: 516 | $ 517 | \pd{f_i}{\mA} \in \real^{1\times (m \times n)}, \quad \d{\vf}{\mA} = \colvec{ \pd{f_1}{\mA} \\ \vdots \\ \pd{f_m}{\mA}} 518 | $ 519 | \mytwocols{0.4} 520 | { 521 | \small 522 | \aleq{ 523 | f_i &= \sum_{j=1}^n A_{ij} x_j, \ i=1, \ldots, m \implies \pd{f_i}{A_{iq}} = x_q,\cr 524 | \pd{f_i}{A_{i\cdot}} &= \trans{\vx} \in \real^{1\times 1\times n} \ \text{(for $i$th row vector)}\cr 525 | \pd{f_{i}}{A_{{k\neq i}\cdot}} & = \trans{\vec{0}} \in \real^{1\times 1\times n} \ \text{(for $k$th row vector, $k\neq i$)} 526 | } 527 | } 528 | { 529 | \small 530 | \aleq{ 531 | \pd{f_i}{\mA} = \colvec{\trans{\vec{0}} \\ \vdots \\ \trans{\vec{0}} \\ \trans{\vx} \\ \trans{\vec{0}} \\ \vdots \\ \trans{\vec{0}}} \in \real^{1 \times (m \times n)} 532 | } 533 | } 534 | 535 | \eci 536 | 537 | \end{frame} 538 | 539 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 540 | \begin{frame}{Example: Gradient of Matrices for Matrices (2)} 541 | 542 | \bci 543 | \item $\mR \in \realmn$ and $\vf: \realmn \mapsto \realnn$ with $\vf(\mR) = \mK \eqdef \trans{\mR}\mR \in \realnn.$ What is \bluef{$\d{\mK}{\mR} \in \real^{(n\times n) \times (m\times n)}$?} 544 | 545 | \item $\d{K_{pq}}{\mR} \in \real^{1 \times m \times n}.$ Let $\vr_i$ be the $i$th column of $\mR.$ Then 546 | \( 547 | K_{pq} = \trans{\vr_p} \vr_q = \sum_{k=1}^m R_{kp} R_{kq}. 
548 | \) 549 | 550 | \item Partial derivative $\pd{K_{pq}}{R_{ij}}$ 551 | \aleq{ 552 | \pd{K_{pq}}{R_{ij}} = \sum_{k=1}^m \pd{}{R_{ij}} R_{kp} R_{kq} = \partial_{pqij}, \ 553 | \partial_{pqij} = 554 | \begin{cases} 555 | R_{iq} & \text{if} \ j=p, p\neq q \cr 556 | R_{ip} & \text{if} \ j=q, p\neq q \cr 557 | 2R_{iq} & \text{if} \ j=p, p=q \cr 558 | 0 & \text{otherwise} 559 | \end{cases} 560 | } 561 | \eci 562 | 563 | \end{frame} 564 | 565 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 566 | \section{L5(5)} 567 | \begin{frame}{Useful Identities} 568 | 569 | \vspace{-0.6cm} 570 | \raggedleft 571 | \includegraphics[width=0.7\columnwidth]{L5_useful.png} 572 | \end{frame} 573 | 574 | 575 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 576 | \section{L5(6)} 577 | \begin{frame}{Roadmap} 578 | 579 | \plitemsep 0.1in 580 | 581 | \bce[(1)] 582 | \item \grayf{Differentiation of Univariate Functions} 583 | 584 | \item \grayf{Partial Differentiation and Gradients } 585 | 586 | \item \grayf{Gradients of Vector-Valued Functions} 587 | 588 | \item \gray{Gradients of Matrices} 589 | 590 | \item \grayf{Useful Identities for Computing Gradients} 591 | 592 | \item \redf{Backpropagation and Automatic Differentiation} 593 | 594 | \item \grayf{Higher-Order Derivatives 595 | 596 | \item Linearization and Multivariate Taylor Series} 597 | 598 | \ece 599 | \end{frame} 600 | 601 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 602 | \begin{frame}{Motivation: Neural Networks with Many Layers (1)} 603 | 604 | \plitemsep 0.01in 605 | 606 | \bci 607 | \item In a neural network with many layers, the function $\vy$ is a many-level function compositions 608 | $$ 609 | \vy = (f_K \circ f_{K-1} \circ \cdots \circ f_1)(\vx), 610 | $$ 611 | where, for example, 612 | \bci 613 | \item $\vx$: images as inputs, $\vy$: class labels (e.g., cat or dog) as outputs 614 | \item each $f_i$ has its own parameters 615 | \eci 616 | 617 | \item In neural networks, with the model parameters $\vth = \{\mA_0, \vb_0, \ldots, \mA_{K-1}, \vb_{K-1} \}$ 618 | 619 | \smallskip 620 | \mysmalltwocols{0.4} 621 | { 622 | \small 623 | \vspace{-0.4cm} 624 | \aleq{ 625 | \begin{cases} 626 | \vf_0 &\eqdef \vx \cr 627 | \vf_1 &\eqdef \sigma_1(\mA_{0}\vf_{0} + \vb_{0})\cr 628 | \vdots& \cr 629 | \vf_K &\eqdef \sigma_K(\mA_{K-1}\vf_{K-1} + \vb_{K-1}) 630 | \end{cases} 631 | } 632 | $\circ$ $\sigma_i$ is called the \bluef{activation function} at $i$-th layer 633 | } 634 | { 635 | \hspace{-0.7cm} $\circ$ Minimizing the loss function over $\vth$: 636 | \aleq{ 637 | \min_{\vth} L(\vth), 638 | } 639 | where 640 | $ 641 | L(\vth) = \norm{\vy - \vf_K(\vth,\vx)}^2 642 | $ 643 | } 644 | 645 | \eci 646 | \end{frame} 647 | 648 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 649 | \begin{frame}{Motivation: Neural Networks with Many Layers (2)} 650 | 651 | \plitemsep 0.01in 652 | 653 | \bci 654 | 655 | \item In neural networks, with the model parameters $\vth = \{\mA_0, \vb_0, \ldots, \mA_{K-1}, \vb_{K-1} \}$ 656 | 657 | \smallskip 658 | \mysmalltwocols{0.4} 659 | { 660 | \small 661 | \vspace{-0.4cm} 662 | \aleq{ 663 | \begin{cases} 664 | \vf_0 &\eqdef \vx \cr 665 | \vf_1 &\eqdef \sigma_1(\mA_{0}\vf_{0} + \vb_{0})\cr 666 | \vdots& \cr 667 | \vf_K &\eqdef \sigma_K(\mA_{K-1}\vf_{K-1} + \vb_{K-1}) 668 | \end{cases} 669 | } 670 | $\circ$ $\sigma_i$ is called the activation function at $i$-th layer 671 | } 672 | { 673 | \hspace{-0.7cm} $\circ$ Minimizing the loss function over $\vth$: 674 | \aleq{ 675 | \min_{\vth} L(\vth), 676 | } 677 | where 
678 | $ 679 | L(\vth) = \norm{\vy - \vf_K(\vth,\vx)}^2 680 | $ 681 | } 682 | 683 | \medskip 684 | \item \question \bluef{\large How can we efficiently compute $\displaystyle \d{L}{\vth}$ in computers?} 685 | 686 | \eci 687 | \end{frame} 688 | 689 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 690 | \begin{frame}{Backpropagatin: Example (1)} 691 | 692 | \plitemsep 0.1in 693 | 694 | \bci 695 | 696 | \item $f(x) = \sqrt{x^2 + \exp(x^2)} + \cos\left (x^2 + \exp(x^2)\right)$ 697 | 698 | 699 | 700 | \item Computation graph: Connect via ``elementary'' operations 701 | 702 | \smallskip 703 | \mypic{0.7}{L5_computation_graph.png} 704 | \aleq{ 705 | \bluef{a} = x^2, \ \bluef{b}=\exp(a), \ \bluef{c}=a+b, \ \bluef{d}=\sqrt{c}, \ \bluef{e}=\cos(c), \ \bluef{f} = d+e 706 | } 707 | 708 | \item Automatic Differentiation 709 | \bci 710 | \item A set of techniques to \bluef{numerically} (not symbolically) evaluate the gradient of a function by working with \bluef{intermediate variables} and applying the \bluef{chain rule}. 711 | \eci 712 | \eci 713 | \end{frame} 714 | 715 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 716 | \begin{frame}{Backpropagation: Example (2)} 717 | 718 | \plitemsep 0.1in 719 | 720 | \bci 721 | 722 | \item 723 | % $f(x) = \sqrt{x^2 + \exp(x^2)} + \cos\left (x^2 + \exp(x^2)\right)$ 724 | $ 725 | \bluef{a} = x^2, \ \bluef{b}=\exp(a), \ \bluef{c}=a+b, \ \bluef{d}=\sqrt{c}, \ \bluef{e}=\cos(c), \ \bluef{f} = d+e 726 | $ 727 | \item Derivatives of the intermediate variables with their inputs 728 | \aleq{ 729 | \bluef{\pd{a}{x}} = 2x, \ \bluef{\pd{b}{a}}=\exp(a), \ \bluef{\pd{c}{a}}=1 = \bluef{\pd{c}{b}}, \ \bluef{\pd{d}{c}}=\frac{1}{2\sqrt{c}}, \ \bluef{\pd{e}{c}}=-\sin(c), \ \bluef{\pd{f}{d}} = 1 = \bluef{\pd{f}{e}} 730 | } 731 | \item Compute $\displaystyle \pd{f}{x}$ by working backward from the output 732 | \mytwocols{0.3} 733 | { 734 | \small 735 | \vspace{-0.3cm} 736 | \aleq{ 737 | \orangef{\pd{f}{c}} &= \bluef{\pd{f}{d}\pd{d}{c}} + \bluef{\pd{f}{e}\pd{e}{c}}, \ \redf{\pd{f}{b}} =\bluef{\pd{f}{c}\pd{c}{b}} \cr 738 | \greenf{\pd{f}{a}} &= \redf{\pd{f}{b}}\bluef{\pd{b}{a}} + \orangef{\pd{f}{c}}\bluef{\pd{c}{a}}, \ \mybox{$\displaystyle \pd{f}{x}$} =\greenf{\pd{f}{a}}\bluef{\pd{a}{x}} 739 | } 740 | } 741 | { 742 | \small 743 | \vspace{-0.3cm} 744 | \aleq{ 745 | \orangef{\pd{f}{c}} &= 1\cdot \frac{1}{2\sqrt{c}} + 1\cdot (-\sin(c))\cr 746 | \redf{\pd{f}{b}} &= \orangef{\pd{f}{c}} \cdot 1, \quad \greenf{\pd{f}{a}} = \redf{\pd{f}{b}} \exp(a) + \orangef{\pd{f}{c}}\cdot 1 \cr 747 | \mybox{$\displaystyle \pd{f}{x}$} &=\greenf{\pd{f}{a}} \cdot 2x 748 | } 749 | } 750 | 751 | \eci 752 | \end{frame} 753 | 754 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 755 | \begin{frame}{Backpropagation} 756 | 757 | \plitemsep 0.1in 758 | 759 | \bci 760 | 761 | \item Implementation of gradients can be very expensive, unless we are careful. 762 | 763 | \item Using the idea of automatic differentiation, the whole gradient computation is decomposed into a set of gradients of elementary functions and application of the chain rule. 764 | 765 | \item Why \bluef{backward}? 766 | 767 | \bci 768 | \item In neural networks, the input dimensionality is often much higher than the dimensionality of labels. 769 | \item In this case, the backward computation (than the forward computation) is much cheaper. 770 | \eci 771 | 772 | \item Works if the target is expressed as a computation graph whose elementary functions are differentiable. If not, some care needs to be taken. 
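% NOTE (editor's illustration, not part of the lecture source): the backward pass on the computation graph above can be checked numerically. This sketch mirrors the slide's intermediate variables a, b, c, d, e and the listed local derivatives, then compares the backpropagated df/dx with a central-difference estimate; plain Python only, and the evaluation point x0 is an arbitrary choice.

import math

def f_value(x):
    return math.sqrt(x * x + math.exp(x * x)) + math.cos(x * x + math.exp(x * x))

def f_grad_backprop(x):
    # forward pass: intermediate variables of the computation graph
    a = x * x
    b = math.exp(a)
    c = a + b
    d = math.sqrt(c)
    e = math.cos(c)
    # backward pass: chain rule applied from the output f = d + e back to x
    df_dd, df_de = 1.0, 1.0
    df_dc = df_dd * (1.0 / (2.0 * math.sqrt(c))) + df_de * (-math.sin(c))
    df_db = df_dc * 1.0
    df_da = df_db * math.exp(a) + df_dc * 1.0
    return df_da * 2.0 * x                      # df/dx

x0, h = 0.7, 1e-6
print(f_grad_backprop(x0))
print((f_value(x0 + h) - f_value(x0 - h)) / (2 * h))   # finite-difference check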
773 | \eci 774 | \end{frame} 775 | 776 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 777 | \section{L5(7)} 778 | \begin{frame}{Roadmap} 779 | 780 | \plitemsep 0.1in 781 | 782 | \bce[(1)] 783 | \item \grayf{Differentiation of Univariate Functions} 784 | 785 | \item \grayf{Partial Differentiation and Gradients } 786 | 787 | \item \grayf{Gradients of Vector-Valued Functions} 788 | 789 | \item \gray{Gradients of Matrices} 790 | 791 | \item \grayf{Useful Identities for Computing Gradients} 792 | 793 | \item \grayf{Backpropagation and Automatic Differentiation} 794 | 795 | \item \redf{Higher-Order Derivatives 796 | 797 | \item Linearization and Multivariate Taylor Series} 798 | 799 | \ece 800 | \end{frame} 801 | 802 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 803 | \begin{frame}{Higher-Order Derivatives} 804 | 805 | \plitemsep 0.05in 806 | 807 | \bci 808 | \item Some optimization algorithms (e.g., Newton's method) require second-order derivatives, if they exist. 809 | \item (Truncated) Taylor series is often used as an approximation of a function. 810 | 811 | \item For $f: \realn \mapsto \real$ of variable $\vx \in \realn$, $ 812 | \grad_{\vec{x}} f = \d{f}{\vec{x}} = 813 | \rowvec{\pd{f(\vec{x})}{x_1} & \cdots & \pd{f(\vec{x})}{x_n}} \in \real^{1 \times n} 814 | $ 815 | \bci 816 | \item If $f$ is twice-differentiable, the order doesn't matter. 817 | \aleq{ 818 | \hess_{\vec{x}} f = \begin{nmat} 819 | \pdd{f}{x_1}& \pdda{f}{x_1}{x_2}& \cdots & \pdda{f}{x_1}{x_n}\cr 820 | \vdots & & & \vdots\cr 821 | \pdda{f}{x_1}{x_n} & \pdda{f}{x_2}{x_n} & \cdots & \pdda{f}{x_{n}}{x_n} 822 | \end{nmat} 823 | } 824 | % Gradient $\grad f: \realn \mapsto $ 825 | \eci 826 | 827 | \item For $f: \realn \mapsto \realm$, $\grad_{\vec{x}} f \in \realmn$ 828 | \bci 829 | \item Thus, $\hess_{\vec{x}} f \in \real^{m \times n \times n}$ (a tensor) 830 | \eci 831 | 832 | \eci 833 | \end{frame} 834 | 835 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 836 | \section{L5(7)} 837 | \begin{frame}{Function Approximation: Linearization and More} 838 | 839 | \plitemsep 0.1in 840 | 841 | \bci 842 | \item First-order approximation of $f(\vx)$ (i.e., linearization by taking the first two terms of Taylor Series) 843 | $$ 844 | f(\vx) \approx f(\vx_0) + (\grad_{\vx} f)(\vx_0)(\vx-\vx_0) 845 | $$ 846 | 847 | \item Multivariate Talyer Series for $f: \real^D \mapsto \real$ at $\vx_0$ 848 | $$ 849 | f(\vx) = \sum_{k=0}^\infty \frac{D^k_{\vx} f(\vx_0)}{k!} \vec{\delta}^k, 850 | $$ 851 | where $D^k_{\vx} f(\vx_0)$ is the $k$th derivative of $f$ w.r.t. $\vx$, evaluated at $\vx_0,$ and $\vec{\delta} \eqdef \vx - \vx_0.$ 852 | \bci 853 | \item Partial sum up to, say $n$, can be an approximation of $f(\vx).$ 854 | \item $D^k_{\vx} f(\vx_0)$ and $\vec{\delta}^k$ are $k$th order tensors, i.e., $k$-dimensional array. 855 | 856 | \item $\vec{\delta}^k$ is a $k$-fold outer product $\otimes$. For example, $\vec{\delta}^2 = \vec{\delta} \otimes \vec{\delta} = \vec{\delta}\trans{\vec{\delta}}.$ $\vec{\delta}^3 = \vec{\delta} \otimes \vec{\delta} \otimes \vec{\delta}.$ 857 | \eci 858 | 859 | \eci 860 | 861 | 862 | 863 | \end{frame} 864 | 865 | 866 | 867 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 868 | \begin{frame}{} 869 | \vspace{2cm} 870 | \LARGE Questions? 
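% NOTE (editor's illustration, not part of the lecture source): to make the higher-order derivatives above concrete, this sketch builds a numerical Hessian by central differences and compares it with the closed form for f(x1, x2) = x1^2 x2 + x1 x2^3, the function used in the earlier gradient example. It assumes NumPy; the evaluation point and step size are arbitrary. Since f is twice continuously differentiable, the matrix comes out symmetric.

import numpy as np

def hessian(f, x, h=1e-4):
    # Numerical Hessian of f: R^n -> R (an n x n matrix) by central differences.
    x = np.asarray(x, dtype=float)
    n = x.size
    H = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            ei = np.zeros(n); ei[i] = h
            ej = np.zeros(n); ej[j] = h
            H[i, j] = (f(x + ei + ej) - f(x + ei - ej)
                       - f(x - ei + ej) + f(x - ei - ej)) / (4 * h * h)
    return H

f = lambda v: v[0] ** 2 * v[1] + v[0] * v[1] ** 3
x0 = np.array([1.0, 2.0])
print(hessian(f, x0))
# closed form: [[2*x2, 2*x1 + 3*x2^2], [2*x1 + 3*x2^2, 6*x1*x2]]
print(np.array([[2 * x0[1], 2 * x0[0] + 3 * x0[1] ** 2],
                [2 * x0[0] + 3 * x0[1] ** 2, 6 * x0[0] * x0[1]]]))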
871 | 872 | 873 | \end{frame} 874 | 875 | \begin{frame}{Review Questions} 876 | % \tableofcontents 877 | %\plitemsep 0.1in 878 | \bce[1)] 879 | \item 880 | 881 | \ece 882 | \end{frame} 883 | 884 | 885 | \end{document} 886 | -------------------------------------------------------------------------------- /06.Probability/6.PD-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/6.PD-2.pdf -------------------------------------------------------------------------------- /06.Probability/6.PD-4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/6.PD-4.pdf -------------------------------------------------------------------------------- /06.Probability/6.PD.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/6.PD.pdf -------------------------------------------------------------------------------- /06.Probability/L6_CDF_ex1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_CDF_ex1.png -------------------------------------------------------------------------------- /06.Probability/L6_CDF_ex2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_CDF_ex2.png -------------------------------------------------------------------------------- /06.Probability/L6_RV_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_RV_ex.png -------------------------------------------------------------------------------- /06.Probability/L6_binomial_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_binomial_ex.png -------------------------------------------------------------------------------- /06.Probability/L6_condind_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_condind_ex.png -------------------------------------------------------------------------------- /06.Probability/L6_cov_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_cov_ex.png -------------------------------------------------------------------------------- /06.Probability/L6_cov_notind.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_cov_notind.png -------------------------------------------------------------------------------- /06.Probability/L6_exp_pdf.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_exp_pdf.png -------------------------------------------------------------------------------- /06.Probability/L6_gaussian_formula.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_gaussian_formula.png -------------------------------------------------------------------------------- /06.Probability/L6_geo_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_geo_ex.png -------------------------------------------------------------------------------- /06.Probability/L6_joint_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_joint_ex.png -------------------------------------------------------------------------------- /06.Probability/L6_marginal_conditional.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_marginal_conditional.png -------------------------------------------------------------------------------- /06.Probability/L6_needle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_needle.png -------------------------------------------------------------------------------- /06.Probability/L6_pdf_delta.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_pdf_delta.png -------------------------------------------------------------------------------- /06.Probability/L6_pdf_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_pdf_ex.png -------------------------------------------------------------------------------- /06.Probability/L6_pdf_uniform_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_pdf_uniform_ex.png -------------------------------------------------------------------------------- /06.Probability/L6_pmf_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_pmf_ex.png -------------------------------------------------------------------------------- /06.Probability/L6_total_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_total_ex.png -------------------------------------------------------------------------------- /06.Probability/L6_tworolls.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_tworolls.png -------------------------------------------------------------------------------- /06.Probability/L6_uniform_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_uniform_ex.png -------------------------------------------------------------------------------- /07.Optimization/7.OPT-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/07.Optimization/7.OPT-2.pdf -------------------------------------------------------------------------------- /07.Optimization/7.OPT-4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/07.Optimization/7.OPT-4.pdf -------------------------------------------------------------------------------- /07.Optimization/7.OPT.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/07.Optimization/7.OPT.pdf -------------------------------------------------------------------------------- /07.Optimization/L7_convex_conjugate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/07.Optimization/L7_convex_conjugate.png -------------------------------------------------------------------------------- /07.Optimization/L7_convex_fn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/07.Optimization/L7_convex_fn.png -------------------------------------------------------------------------------- /07.Optimization/L7_convex_set_ex1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/07.Optimization/L7_convex_set_ex1.png -------------------------------------------------------------------------------- /07.Optimization/L7_convex_set_ex2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/07.Optimization/L7_convex_set_ex2.png -------------------------------------------------------------------------------- /07.Optimization/L7_first_condition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/07.Optimization/L7_first_condition.png -------------------------------------------------------------------------------- /07.Optimization/L7_gradient_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/07.Optimization/L7_gradient_ex.png -------------------------------------------------------------------------------- /07.Optimization/L7_halfspace.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/07.Optimization/L7_halfspace.png -------------------------------------------------------------------------------- /07.Optimization/L7_separating.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/07.Optimization/L7_separating.png -------------------------------------------------------------------------------- /07.Optimization/L7_supporting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/07.Optimization/L7_supporting.png -------------------------------------------------------------------------------- /08.Model_Data/8.MMD-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/8.MMD-2.pdf -------------------------------------------------------------------------------- /08.Model_Data/8.MMD-4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/8.MMD-4.pdf -------------------------------------------------------------------------------- /08.Model_Data/8.MMD.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/8.MMD.pdf -------------------------------------------------------------------------------- /08.Model_Data/L10_latent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/L10_latent.png -------------------------------------------------------------------------------- /08.Model_Data/L8_all_gmodels.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/L8_all_gmodels.png -------------------------------------------------------------------------------- /08.Model_Data/L8_coinflip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/L8_coinflip.png -------------------------------------------------------------------------------- /08.Model_Data/L8_cross_validation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/L8_cross_validation.png -------------------------------------------------------------------------------- /08.Model_Data/L8_dsep.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/L8_dsep.png -------------------------------------------------------------------------------- /08.Model_Data/L8_fittings.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/L8_fittings.png -------------------------------------------------------------------------------- /08.Model_Data/L8_gmodel_ex1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/L8_gmodel_ex1.png -------------------------------------------------------------------------------- /08.Model_Data/L8_gmodel_ex2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/L8_gmodel_ex2.png -------------------------------------------------------------------------------- /08.Model_Data/L8_lung_cancer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/L8_lung_cancer.png -------------------------------------------------------------------------------- /08.Model_Data/L8_model_class.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/L8_model_class.png -------------------------------------------------------------------------------- /08.Model_Data/L8_model_function.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/L8_model_function.png -------------------------------------------------------------------------------- /08.Model_Data/L8_model_pmodel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/L8_model_pmodel.png -------------------------------------------------------------------------------- /08.Model_Data/L8_nested_cross_validation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/L8_nested_cross_validation.png -------------------------------------------------------------------------------- /08.Model_Data/main.tex: -------------------------------------------------------------------------------- 1 | %\pdfminorversion=4 2 | \documentclass[handout,fleqn,aspectratio=169]{beamer} 3 | 4 | \input{../myhead} 5 | 6 | \title[]{Lecture 8: When Models Meet Data} 7 | \author{Yi, Yung (이융)} 8 | \institute{Mathematics for Machine Learning\\ \url{https://yung-web.github.io/home/courses/mathml.html} 9 | \\KAIST EE} 10 | \date{\today} 11 | 12 | 13 | \input{../mymath} 14 | \input{../mymacro} 15 | 16 | 17 | %\addtobeamertemplate{footline}{\rule{0.94\paperwidth}{1pt}}{} 18 | 19 | \begin{document} 20 | 21 | \input{../mydefault} 22 | 23 | 24 | 25 | % START START START START START START START START START START START START START 26 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 27 | \begin{frame}{Roadmap} 28 | 29 | \plitemsep 0.1in 30 | 31 | \bce[(1)] 32 | 33 | \item Data, Models, and Learning 34 | \item Models as Functions: Empirical Risk Minimization 35 | \item Models as Probabilistic Models: Parameter Estimation (ML and MAP) 36 | \item Probabilistic Modeling 
and Inference 37 | \item Directed Graphical Models 38 | \item Model Selection 39 | 40 | \ece 41 | \end{frame} 42 | 43 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 44 | \section{L8(1)} 45 | \begin{frame}{Roadmap} 46 | 47 | \plitemsep 0.1in 48 | 49 | \bce[(1)] 50 | 51 | \item \redf{Data, Models, and Learning} 52 | \item \grayf{Models as Functions: Empirical Risk Minimization 53 | \item Models as Probabilistic Models: Parameter Estimation (ML and MAP) 54 | \item Probabilistic Modeling and Inference 55 | \item Directed Graphical Models 56 | \item Model Selection 57 | } 58 | 59 | \ece 60 | \end{frame} 61 | 62 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 63 | \begin{frame}{Data, Models, and Learning} 64 | 65 | \plitemsep 0.15in 66 | 67 | \bci 68 | 69 | \item Three major components of a machine learning system 70 | 71 | \bce 72 | \item Data: $\{(\vx_1, y_1), \ldots, (\vx_n,y_n), \ldots, (\vx_N,y_N) \}$ 73 | \item Models: deterministic functions or probabilistic models 74 | \item Learning: Training, and prediction/inference 75 | \ece 76 | \item Good machine learning models: Perform well for unseen (untrained) data 77 | 78 | \item Machine learning algorithm: training and prediction 79 | \eci 80 | \end{frame} 81 | 82 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 83 | \begin{frame}{Data as Vectors} 84 | 85 | \plitemsep 0.1in 86 | 87 | \bci 88 | 89 | \item Tabular format or not, numerical or not, good feature extraction etc. 90 | 91 | \item Assume that data is given as $D$-dimensional vector $\vx_n$ of real numbers, each called \bluef{features}, \bluef{attributes}, or \bluef{covariates}. 92 | 93 | \item Dataset: consisting of data points or examples $\{ \vx_1,$ $\vx_2,$ \ldots, $\vx_N \}$ 94 | \item In supervised learning, $\{(\vx_1, y_1), \ldots, (\vx_n,y_n), \ldots, (\vx_N,y_N) \},$ where $y_n$ is the \bluef{label} (or target, response variable, or annotation). 95 | \item Better representation of data as vectors 96 | \bci 97 | \item finding lower-dimensional approximations of the original feature vector (e.g., PCA via SVD or EVD) 98 | \item using nonlinear higher-dimensional combinations of the original feature vector (e.g., feature map and kernel) 99 | \eci 100 | \eci 101 | \end{frame} 102 | 103 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 104 | \begin{frame}{Models: Functions vs. 
Probabilistic Models} 105 | 106 | \myvartwocols{0.7}{0.65}{0.31} 107 | { 108 | \plitemsep 0.07in 109 | 110 | \bci 111 | 112 | \item Now, the business of constructing a predictor 113 | 114 | \item Models as \bluef{functions} 115 | \bci 116 | \item $f: \realD \mapsto \real.$ 117 | \item \exam $f(\vx) = \trans{\vth}\vx + \theta_0,$ Unknown parameter: $\vth,\theta_0$ 118 | 119 | \eci 120 | 121 | \item Models as \bluef{probabilistic models} 122 | 123 | \bci 124 | \item model our uncertainty due to the \bluef{observation process} and our uncertainty in the \bluef{parameters of our model} 125 | 126 | \item predictors should be able to express some sort of uncertainty via probabilistic models 127 | \item Parameters: parameters of a chosen probabilistic model (e.g., mean and variance of Gaussian) 128 | \eci 129 | 130 | \eci 131 | 132 | } 133 | { 134 | \vspace{-0.3cm} 135 | \mypic{0.99}{L8_model_function.png} 136 | \mypic{0.99}{L8_model_pmodel.png} 137 | 138 | } 139 | 140 | \end{frame} 141 | 142 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 143 | \begin{frame}{Learning Algorithms} 144 | 145 | \plitemsep 0.07in 146 | 147 | \bci 148 | 149 | \item[] Three algorithmic phases 150 | 151 | \item[(1)] Prediction or inference: via function or probabilitic models 152 | 153 | \item[(2)] Training or parameters estimation 154 | 155 | \bci 156 | \item fixed parameter assumption (non-probabilistic) or Bayesisan approach (probabilistic) 157 | \item non-probabilistic: e.g., empirical risk minimization 158 | \item probabilistic: e.g., ML (Maximum Likelihood), MAP (Maximum A Posteriori) 159 | \item cross-validation: simulation of performing for unseen data 160 | \item regularization/prior: balancing models between training and unseen data 161 | 162 | \eci 163 | \item[(3)] Hyperparameter tuning or model selection 164 | \eci 165 | \end{frame} 166 | 167 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 168 | \section{L8(2)} 169 | \begin{frame}{Roadmap} 170 | 171 | \plitemsep 0.1in 172 | 173 | \bce[(1)] 174 | 175 | \item \grayf{Data, Models, and Learning} 176 | \item \redf{Models as Functions: Empirical Risk Minimization} 177 | \item \grayf{Models as Probabilistic Models: Parameter Estimation (ML and MAP) 178 | \item Probabilistic Modeling and Inference 179 | \item Directed Graphical Models 180 | \item Model Selection 181 | } 182 | 183 | \ece 184 | \end{frame} 185 | 186 | 187 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 188 | \begin{frame}{Empirical Risk Minimization} 189 | 190 | \plitemsep 0.07in 191 | 192 | \bci 193 | 194 | \item Predictor as a function 195 | 196 | \item Given $\{(\vx_1, y_1), \ldots, (\vx_n,y_n), \ldots, (\vx_N,y_N) \},$ estimate a predictor $f(\cdot, \vth): \realD \mapsto \real$ 197 | 198 | \item Find a good parameter $\vth^*,$ such that $f(\vx_n,\vth^*) = \hat{y}_n \approx y_n,$ for all $n=1,\ldots, N$ 199 | 200 | 201 | \bigskip 202 | \item \exam Affine function: By adding the unit feature $x^{(0)}=1$ and $\theta_0$, i.e., $\vx_n = \trans{[1, x_n^{(1)}, \ldots, x_n^{(D)}]},$ $\vth = \trans{[\theta_0, \theta_1, \ldots, \theta_D]}$ 203 | \aleq 204 | { 205 | f(\vx_n,\vth) = \trans{\vth} \vx_n = \theta_0 + \sum_{d=1}^D \theta_d x_n^{(d)} 206 | } 207 | 208 | \item \exam Neural network: Complex non-linear function 209 | \eci 210 | \end{frame} 211 | 212 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 213 | \begin{frame}{Loss Function} 214 | 215 | \plitemsep 0.07in 216 | 217 | \bci 218 | 219 | \item Training set: $\{(\vx_1, y_1), \ldots, (\vx_n,y_n), \ldots, (\vx_N,y_N) 
\},$ an example matrix\footnote{In other chapters, we often use $D \times N$ example matrix by defining it as $\mat{X} \eqdef [\vx_1, \ldots, \vx_N].$ \lecturemark{L10(4)}} 220 | $\mat{X} \eqdef \trans{[\vx_1, \ldots, \vx_N]} \in \real^{N \times D},$ a label vector $\vy \eqdef 221 | \trans{[y_1, \ldots, y_N]},$ 222 | 223 | \item Average loss, empirical risk 224 | $$ 225 | \bm{R}_{\text{emp}}(f,\mat{X},\vy) = \frac{1}{N} \sum_{n=1}^N \ell(y_n,\hat{y}_n) 226 | $$ 227 | 228 | %\bigskip 229 | \item Goal: Minimizing empirical risk 230 | 231 | \item \exam The squared loss function $\ell(y_n,\hat{y}_n) = (y_n - \hat{y}_n)^2$ leads to: 232 | $$ 233 | \min_{\vth \in \realD} \frac{1}{N} \norm{\vy - \mat{X}\vth}^2 234 | $$ 235 | 236 | \item \question Ultimgate goal: Minimizing expected risk (for unseen data) $\bm{R}_{\text{true}} = \expecti{\vx,y}{\ell(y,f(\vx))}$? 237 | 238 | \eci 239 | \end{frame} 240 | 241 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 242 | \begin{frame}{Overfitting and Regularization} 243 | 244 | \plitemsep 0.07in 245 | 246 | \bci 247 | 248 | \item The predictor fits too closely to the training data and does not generalize well to new data 249 | 250 | \item Need to somehow bias the search for the minimizer of empirical risk by introducing a \bluef{penalty term} 251 | 252 | \item \bluef{Regularization}: compromise between accurate solution of empirical risk minimization and the size or complexity of the solution. 253 | 254 | \item \exam Regularized Least Squares 255 | $$ 256 | \min_{\vth \in \realD} \frac{1}{N} \norm{\vy - \mat{X}\vth}^2 + \lambda \norm{\vth}^2 257 | $$ 258 | \bci 259 | \item $\norm{\vth}^2$: regularizer, $\lambda$: regularization parameter 260 | \eci 261 | 262 | \eci 263 | \end{frame} 264 | 265 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 266 | \begin{frame}{Cross-Validation for Generalization Performance} 267 | 268 | \bigskip 269 | 270 | 271 | \mypic{0.8}{L8_cross_validation.png} 272 | 273 | \end{frame} 274 | 275 | 276 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 277 | \section{L8(3)} 278 | \begin{frame}{Roadmap} 279 | 280 | \plitemsep 0.1in 281 | 282 | \bce[(1)] 283 | 284 | \item \grayf{Data, Models, and Learning} 285 | \item \grayf{Models as Functions: Empirical Risk Minimization} 286 | \item \redf{Models as Probabilistic Models: Parameter Estimation (ML and MAP)} 287 | \item \grayf{Probabilistic Modeling and Inference 288 | \item Directed Graphical Models 289 | \item Model Selection 290 | } 291 | 292 | \ece 293 | \end{frame} 294 | 295 | 296 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 297 | \begin{frame}{MLE (Maximum Likelihood Estimation): Concept} 298 | 299 | \plitemsep 0.07in 300 | 301 | \bci 302 | 303 | \item Idea: define a function of the parameters called \bluef{likelihood function}. 304 | 305 | \item Negative log-likelihood for data $\vx$ and a family of probability densities $\cprob{\vx \mid \vth}$ parameterized by $\vth$: 306 | $$ 307 | \cL_{\vx}(\vth) = \cL(\vth) \eqdef - \log \cprob{\vx \mid \vth} 308 | $$ 309 | \bci 310 | \item $\cL(\vth)$: how likely a particular setting of $\vth$ is for the observations $\vx$. 
311 | \eci 312 | 313 | \bigskip 314 | \item \redf{MLE}: Find $\vth$ such that $\cL(\vth)$ is \bluef{minimized} (i.e., likelihood is \bluef{maximized}) 315 | \eci 316 | \end{frame} 317 | 318 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 319 | \begin{frame}{MLE: Supervised Learning} 320 | 321 | \plitemsep 0.05in 322 | 323 | \bci 324 | 325 | \item The set of iid examples $(\vx_1, y_1), \ldots, (\vx_N,y_N)$ 326 | 327 | \item $\set{X} = \{\vx_1, \ldots, \vx_N \}$ and $\set{Y} = \{y_1, \ldots, y_N \}$ 328 | 329 | \item Negative log-likelihood 330 | $$ 331 | \cL(\vth) = - \log \cprob{\set{Y} \mid \set{X}, \vth} = -\sum_{n=1}^N \log \cprob{y_n \mid \vx_n, \vth} 332 | $$ 333 | 334 | \item \exam Assume independent Gaussian noise $\set{N}(0,\sigma^2)$ and linear model $y_n = \trans{\vx}_n \vth$ for prediction. Then, $Y_n| (\vx_n,\vth) \sim \set{N}(\trans{\vx}_n\vth, \sigma^2).$ 335 | {\small 336 | \aleq{ 337 | \cL(\vth) &= - \sum_{n=1}^N \log \frac{1}{\sqrt{2\pi\sigma^2}} \exp\left(-\frac{(y_n-\trans{\vx}_n\vth )^2}{2\sigma^2} \right)= \frac{1}{2\sigma^2} \sum_{n=1}^N (y_n-\trans{\vx}_n\vth )^2 - \sum_{n=1}^N \log \frac{1}{\sqrt{2\pi\sigma^2}} 338 | }} 339 | 340 | \eci 341 | \end{frame} 342 | 343 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 344 | \begin{frame}{MAP (Maximum A Posteriori)} 345 | 346 | \plitemsep 0.1in 347 | 348 | \bci 349 | 350 | \item What if we have some \bluef{prior knowledge} about $\vth$? Then, how should we change our knowledge about $\vth$ after observing data $\vx$? 351 | 352 | \item Compute the posterior distribution (using Bayes' Theorem) and find $\vth$ that maximizes the distribution: 353 | $$ 354 | \max_{\vth} \cprob{\vth \mid \vx} = \max_{\vth} \frac{\cprob{\vx \mid \vth}\cprob{\vth}}{\cprob{\vx}} 355 | \Longleftrightarrow \min_{\vth}\Big ( -\log \cprob{\vth \mid \vx} \Big ) 356 | $$ 357 | \bci 358 | \item In finding the optimal $\vth,$ $\cprob{\vx}$ can be ignored 359 | \eci 360 | \item ML and MAP: Bridging the non-probabilistic and probabilistic worlds as it explicitly acknowledges the need for a prior distribution, yet producing a \bluef{point estimate} (a single parameter value). 361 | 362 | %\item We later see the full parameter distributions 363 | \eci 364 | \end{frame} 365 | 366 | 367 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 368 | \begin{frame}{Model Fitting} 369 | 370 | \plitemsep 0.1in 371 | 372 | \bci 373 | 374 | \item Model class $M_{\vth}$ vs. Right model $M^*$ 375 | \mypic{0.3}{L8_model_class.png} 376 | 377 | \item Overfitting vs. Underfitting vs.
Good fitting 378 | \mypic{0.7}{L8_fittings.png} 379 | \eci 380 | \end{frame} 381 | 382 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 383 | \section{L8(4)} 384 | \begin{frame}{Roadmap} 385 | 386 | \plitemsep 0.1in 387 | 388 | \bce[(1)] 389 | 390 | \item \grayf{Data, Models, and Learning} 391 | \item \grayf{Models as Functions: Empirical Risk Minimization} 392 | \item \grayf{Models as Probabilistic Models: Parameter Estimation (ML and MAP)} 393 | \item \redf{Probabilistic Modeling and Inference} 394 | \item \grayf{Directed Graphical Models 395 | \item Model Selection 396 | } 397 | 398 | \ece 399 | \end{frame} 400 | 401 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 402 | \begin{frame}{Modeling Generative Process and Probabilistic Models} 403 | 404 | \plitemsep 0.1in 405 | 406 | \bci 407 | 408 | \item Many machine learning tasks: prediction of future events and decision making 409 | 410 | \item Often build (probabilistic) models that describe the \bluef{generative process} that generates the observed data 411 | 412 | \item In probabilistic modeling, the joint distribution $\cprob{\vx,\vth}$ of the observed variables 413 | $\vx$ and the hidden parameters $\vth$ encapsulate the key information 414 | \bci 415 | \item Given: \orangef{prior} $\cprob{\vth}$ and \orangef{likelihood} $\cprob{\vx | \vth}$ 416 | \item \greenf{Joint dist.} from prior and likelihood: $\cprob{\vx,\vth} = \cprob{\vx | \vth} \cprob{\vth}$ 417 | \item We get: \redf{marginal likelihood} $\cprob{\vx} = \int \cprob{\vx,\vth} \text{d}\vth$ and \redf{posterior} $\cprob{\vth|\vx} = \frac{\cprob{\vx,\vth}}{\cprob{\vx}}$ 418 | \eci 419 | 420 | % \item Essentially, if we know the \bluef{joint distribution}, we know all about its probabilistic model 421 | \eci 422 | \end{frame} 423 | 424 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 425 | \begin{frame}{Fully Bayesian vs. ML/MAP} 426 | Given the data set $\set{X},$ we want to predict $A,$ i.e., \redblk{$\cprob{A \mid \set{X}}$} 427 | %\vspace{-1.2cm} 428 | \plitemsep 0.05in 429 | \bci 430 | \item \redf{ML}: Easy (high), Exact (low) 431 | $$ 432 | \cprob{A \mid \set{X}} \approx \cprob{A \mid \vth}, \quad \vth = \arg\max \cprob{\set{X} \mid \vth} 433 | $$ 434 | 435 | \item \redf{MAP}: Easy (mid), Exact (mid) 436 | $$ 437 | \cprob{A \mid \set{X}} \approx \cprob{A \mid \vth}, \quad \vth = \arg\max \cprob{\vth \mid \set{X}} 438 | $$ 439 | 440 | \item \redf{Fully Bayesian}: Easy (low), Exact (high) 441 | 442 | \medskip 443 | - predictive inference, use of posterior predictive distribution, bayesian prediction 444 | 445 | - remove dependence on the model parameters $\vth$ 446 | 447 | $$ 448 | \cprob{A \mid \set{X}} = \int \cprob{A \mid \vth} \cprob{\vth \mid \set{X}}\text{d}\vth 449 | $$ 450 | 451 | - Only possible by getting the full posterior distribution $\cprob{\vth \mid \set{X}}$ 452 | \eci 453 | \end{frame} 454 | 455 | 456 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 457 | \begin{frame}{(Fully) Bayesian Inference: Hardness} 458 | 459 | \plitemsep 0.2in 460 | 461 | \bci 462 | 463 | % \item \bluef{Sinle} Earlier, two ways of estimating the parameter $\vth$: ML and MAP. Essentially, it is solving an optimization problem to get a single best value $\vth^*.$ $\implies$ Prediction through $\cprob{\vx \mid \vth^*}.$ 464 | 465 | % \item Rather than just a likelihood, having the \bluef{full posterior distribution} can be useful. 
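% NOTE (editor's illustration, not part of the lecture source): a tiny numerical contrast of the three levels above (ML, MAP, fully Bayesian) on a Bernoulli coin-flip model with a grid over the parameter. It assumes NumPy; the Beta(2,2) prior and the 7-heads-out-of-10 data are arbitrary toy choices.

import numpy as np

heads, flips = 7, 10
theta = np.linspace(1e-3, 1 - 1e-3, 999)            # grid over theta = P(heads)
dtheta = theta[1] - theta[0]

prior = 6 * theta * (1 - theta)                      # Beta(2,2) prior density
lik = theta ** heads * (1 - theta) ** (flips - heads)

post = prior * lik
post /= (post * dtheta).sum()                        # normalized posterior p(theta | X) on the grid

theta_ml = theta[np.argmax(lik)]                     # ML point estimate (~0.700)
theta_map = theta[np.argmax(post)]                   # MAP point estimate (~0.667)
p_heads_bayes = (theta * post * dtheta).sum()        # fully Bayesian: integrate theta out (~0.643)
print(theta_ml, theta_map, p_heads_bayes)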
466 | 467 | \item For a data set $\set{X},$ a parameter prior $\cprob{\vth},$ and a likelihood function, the posterior is: 468 | $$ 469 | \cprob{\vth \mid \set{X}} = \frac{\cprob{\set{X} \mid \vth} \cprob{\vth}}{\cprob{\set{X}}}, \quad 470 | \cprob{\set{X}} = \int \cprob{\set{X} \mid \vth} \cprob{\vth} \; \text{d}\vth 471 | $$ 472 | 473 | %\item \question \bluef{Examples of prediction using the posterior distribution?} 474 | 475 | \item Implementation hardness 476 | \bci 477 | \item Bayesian inference requires solving integrals, which is often challenging. In particular, if a conjugate prior is not chosen, the integration is not analytically tractable. 478 | 479 | \item Approximation techniques: MCMC (Markov Chain Monte Carlo), Laplace approximation, variational inference, expectation propagation 480 | \eci 481 | \eci 482 | \end{frame} 483 | 484 | 485 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 486 | \begin{frame}{Latent-Variable Models (1)} 487 | 488 | \plitemsep 0.1in 489 | 490 | \bci 491 | 492 | \item Including latent variables in the model $\rightarrow$ contributes to the interpretability of the model 493 | 494 | \item The general discussion here will be applied to the following examples later 495 | \bci 496 | \item PCA for dimensionality reduction \hfill \lecturemark{L10(7)} 497 | \item Gaussian mixture models for density estimation \hfill \lecturemark{L11(3)} 498 | \eci 499 | 500 | 501 | \item In latent-variable models (LVMs)\footnote{In our note, we express the dependence on the model parameters $\vth$ using subscript notations, e.g., $\cprobi{\vth}{\vx | \vz}$ rather than $\cprob{\vx| \vz, \vth}$ to highlight the role of $\vz.$ }, 502 | \bci 503 | \item Given: \orangef{prior} $\cprob{\vz}$ and \orangef{likelihood} $\cprobi{\vth}{\vx | \vz}$ 504 | \item \greenf{Joint dist.} from prior and likelihood: $\cprobi{\vth}{\vx,\vz} = \cprobi{\vth}{\vx | \vz} \cprob{\vz}$ 505 | \item Our interest: \redf{marginal likelihood} $\cprobi{\vth}{\vx}$ 506 | and \redf{posterior} $\cprobi{\vth}{\vz|\vx}$ 507 | \eci 508 | 509 | 510 | % \item Offers data generation process through parameters: $\cprob{\vx \mid \vth, \vz}$, $\cprob{\vz}$ 511 | 512 | % \item Marginalization over the latent variables, which allows parameter estimation by ML and MAP (using the prior $\cprob{\vth}$) 513 | % $$ 514 | % \cprob{\vx \mid \vth} = \int \cprob{\vx \mid \vth, \vz} \cprob{\vz}\; \text{d}\vz 515 | % $$ 516 | 517 | \eci 518 | \end{frame} 519 | 520 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 521 | \begin{frame}{LVM (2)} 522 | 523 | \plitemsep 0.1in 524 | 525 | \bci 526 | 527 | \item Assuming we know $\vth$, to generate a data sample from the model (i) sample $\vz$ from $\cprob{\vz}$ and (ii) sample $\vx$ from $\cprobi{\vth}{\vx|\vz}$ 528 | 529 | \item \redf{Inference.} Computing the \bluef{posterior distribution} $\cprobi{\vth}{\vz | \vx}$: 530 | $$ 531 | \cprobi{\vth}{\vz | \vx} = \frac{\cprobi{\vth}{\vx,\vz}}{\cprobi{\vth}{\vx}} = 532 | \frac{\cprobi{\vth}{\vx,\vz}}{\int \cprobi{\vth}{\vx,\vz} \text{d}\vz} 533 | $$ 534 | 535 | \item This requires solving the sub-problem of computing the \bluef{marginal likelihood} of the observation: 536 | $$\displaystyle \cprobi{\vth}{\vx} = \int \cprobi{\vth}{\vx,\vz} \text{d}\vz$$ 537 | 538 | \eci 539 | \end{frame} 540 | 541 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 542 | \begin{frame}{LVM (3): Why the posterior distribution $\cprobi{\vth}{\vz | \vx}$?} 543 | 544 | \plitemsep 0.2in 545 | 546 | \bci 547 | 548 | \item \bluef{Explanation of the
observation.} Allows us to figure out which latent configurations could have plausibly generated the observation data samples. 549 | 550 | \item \bluef{Learning of model parameters $\vth$.} Training LVMs to estimate $\vth$ (e.g., ML) requires $\cprobi{\vth}{\vz | \vx}$ in its inner loops 551 | 552 | \item[] 553 | \mycolorbox{ 554 | \vspace{-0.3cm} 555 | $$ 556 | \text{marginal likelihood $\cprobi{\vth}{\vx}$} \implies \text{posterior distribution $\cprobi{\vth}{\vz | \vx}$} \implies \text{$\vth_{\ml}$} 557 | $$ 558 | } 559 | \eci 560 | \end{frame} 561 | 562 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 563 | \begin{frame}{LVM (4): How is $\cprobi{\vth}{\vz | \vx}$? used for $\vth_{\ml}$?} 564 | 565 | \plitemsep 0.2in 566 | 567 | \bci 568 | 569 | \item In ML, we need the gradient of the marginal log-likelihood. For a data sample $\vx,$ 570 | \aleq{ 571 | \grad_{\vth} \log p_{\vth}(\vx) &= \frac{\grad_{\vth} p_{\vth}(\vx)}{p_{\vth}(\vx)} 572 | = \frac{\int \grad_{\vth} p_{\vth}(\vx,\vz)\text{d}\vz}{p_{\vth}(\vx)} = 573 | \frac{\int p_{\vth}(\vx,\vz) \grad_{\vth} \log p_{\vth}(\vx,\vz)\text{d}\vz}{p_{\vth}(\vx)} \cr 574 | &= \int \orangef{p_{\vth}(\vz|\vx)} \grad_{\vth} \log p_{\vth}(\vx,\vz)\text{d}\vz 575 | } 576 | 577 | \item $\cprobi{\vth}{\vz | \vx}$ performs \bluef{credit assignment} over latent configurations 578 | \eci 579 | \end{frame} 580 | 581 | % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 582 | % \begin{frame}{Generative Modeling with Latent Variables} 583 | 584 | % \plitemsep 0.1in 585 | 586 | % \myvartwocols{0.7}{0.29}{0.65} 587 | % { 588 | % \bci 589 | % \item Generative process 590 | % \bci 591 | % \item $\vz \sim p(\vz)$ 592 | % \item $\vx \sim p(\vx | \vz)$ 593 | % \eci 594 | % \eci 595 | 596 | % \bigskip 597 | % \aleq 598 | % { 599 | % p(\vx) &= \int p(\vx,\vz)\text{d}\vz \cr 600 | % &=\int p(\vx|\vz)p(\vz)\text{d}\vz 601 | % } 602 | 603 | % } 604 | % { 605 | % \vspace{-0.5cm} 606 | % \mypic{0.4}{L10_latent.png} 607 | % \vspace{-0.6cm} 608 | % \raggedleft 609 | % {\tiny Source: \url{https://dlvu.github.io/slides/dlvu.lecture06.pdf}} 610 | % } 611 | 612 | % \end{frame} 613 | 614 | % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 615 | % \begin{frame}{Latent-Variable Models (2)} 616 | 617 | % \plitemsep 0.1in 618 | 619 | % \bci 620 | 621 | % \item We can compute a posterior on the latent variables, but marginalizing over both $\vz$ and $\vth$ is hard: 622 | % $$ 623 | % \cprob{\vz \mid \set{X}} = \frac{\cprob{\set{X}\mid \vz} \cprob{\vz}}{\cprob{X}}, \quad 624 | % \cprob{\set{X} \mid \vz} = \int \cprob{\set{X} \mid \vz, \vth} \cprob{\vth} \text{d}\vth 625 | % $$ 626 | 627 | 628 | % \item Instead, it is easier to compute the latent-variable posterior, but conditioned on the model parameters, i.e., 629 | % $$ 630 | % \cprob{\vz \mid \set{X},\vth} = \frac{\cprob{\set{X}\mid \vz,\vth} \cprob{\vz}}{\cprob{\set{X} \mid \vth}} 631 | % $$ 632 | 633 | % \item \question How do we use the posteriors $\cprob{\vz \mid \set{X}}$ or $\cprob{\vz \mid \set{X},\vth}$ in practice? Any examples? 
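The credit-assignment identity above can be checked numerically. Below is a small sketch (my own, not from the slides) for a two-component 1-D Gaussian mixture with latent component z: the posterior-weighted gradient of the log joint is compared against a finite-difference gradient of the log marginal likelihood. The mixture weights, means, and noise level are arbitrary example values.

```python
# Numerical sanity check of
#   d/d(mu_k) log p(x) = E_{p(z|x)} [ d/d(mu_k) log p(x, z) ]
# for a tiny 1-D mixture of two Gaussians with latent component z in {0, 1}.
import math

pi = [0.3, 0.7]          # mixture weights (assumed known here)
sigma = 1.0

def normal_pdf(x, mu, sigma):
    return math.exp(-(x - mu) ** 2 / (2 * sigma ** 2)) / math.sqrt(2 * math.pi * sigma ** 2)

def log_marginal(x, mu):
    return math.log(sum(pi[k] * normal_pdf(x, mu[k], sigma) for k in range(2)))

def posterior_weighted_grad(x, mu):
    # responsibilities p(z = k | x)
    joint = [pi[k] * normal_pdf(x, mu[k], sigma) for k in range(2)]
    total = sum(joint)
    r = [j / total for j in joint]
    # d/d(mu_k) log p(x, z = k) = (x - mu_k) / sigma^2, weighted by r_k
    return [r[k] * (x - mu[k]) / sigma ** 2 for k in range(2)]

x, mu, eps = 0.5, [-1.0, 2.0], 1e-6
fd_grad = [(log_marginal(x, [mu[0] + eps, mu[1]]) - log_marginal(x, [mu[0] - eps, mu[1]])) / (2 * eps),
           (log_marginal(x, [mu[0], mu[1] + eps]) - log_marginal(x, [mu[0], mu[1] - eps])) / (2 * eps)]
print("posterior-weighted:", posterior_weighted_grad(x, mu))
print("finite difference :", fd_grad)   # the two should agree to roughly 1e-6
```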
634 | % \eci 635 | % \end{frame} 636 | 637 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 638 | \section{L8(5)} 639 | \begin{frame}{Roadmap} 640 | 641 | \plitemsep 0.1in 642 | 643 | \bce[(1)] 644 | 645 | \item \grayf{Data, Models, and Learning} 646 | \item \grayf{Models as Functions: Empirical Risk Minimization} 647 | \item \grayf{Models as Probabilistic Models: Parameter Estimation (ML and MAP)} 648 | \item \grayf{Probabilistic Modeling and Inference} 649 | \item \redf{Directed Graphical Models } 650 | \item \grayf{Model Selection} 651 | \ece 652 | \end{frame} 653 | 654 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 655 | \begin{frame}{Graphical Models} 656 | 657 | \plitemsep 0.1in 658 | 659 | \bci 660 | 661 | \item Joint distribution of a probabilistic model: key quantity of interest, but quite complicated without structural properties 662 | 663 | \item However, there exist relations of \bluef{independence}, \bluef{conditional independence} among random variables. 664 | 665 | \item (Probabilistic) graphical models: Roughly speaking, a graph of random variables. 666 | 667 | \bci 668 | \item Simple ways to visualize the structure of the model 669 | \item Insights into the structural properties, e.g., conditional independence 670 | \item Computations for inference and learning can be expressed in terms of graphical manipulations 671 | \eci 672 | \eci 673 | \end{frame} 674 | 675 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 676 | \begin{frame}{Graph Semantics} 677 | 678 | \mytwocols{0.4} 679 | { 680 | \aleq{ 681 | \cprob{a,b,c} = \cprob{c| a,b} \cprob{b | a}\cprob{a} 682 | } 683 | \mypic{0.4}{L8_gmodel_ex1.png} 684 | } 685 | { 686 | %\vspace{-0.3cm} 687 | \aleq{ 688 | &\cprob{x_1, x_2, x_3, x_4, x_5} = \cr 689 | &\cprob{x_1}\cprob{x_5}\cprob{x_2 | x_5}\cprob{x_3 | x_1, x_2} \cprob{x_4 | x_2}} 690 | \mypic{0.6}{L8_gmodel_ex2.png} 691 | } 692 | \vspace{-0.3cm} 693 | \plitemsep 0.03in 694 | \bci 695 | \item Nodes: random variables 696 | \item Directed edge for direct dependence: $b$ directly depends on $a$: $a \rightarrow b$ 697 | \item Graph layout: factorization of the joint distribution 698 | $$ 699 | \cprob{x_1, \ldots, x_K} = \prod_{k=1}^K \cprob{x_k \mid \mathbf{Pa}_k}, \quad\text{$\mathbf{Pa}_k$ are the parent nodes of $x_k.$} 700 | $$ 701 | \eci 702 | \end{frame} 703 | 704 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 705 | \begin{frame}{Example: $N$ coin-flip experiments} 706 | 707 | \mypic{0.75}{L8_coinflip.png} 708 | \plitemsep 0.07in 709 | \bci 710 | \item Shaded nodes: observables, $\mu$: probability of head, a (latent) random variable 711 | \item Joint distribution 712 | $$ 713 | \cprob{x_1, \ldots, x_N \mid \mu} = \prod_{n=1}^N \cprob{x_n \mid \mu} 714 | $$ 715 | \eci 716 | \end{frame} 717 | 718 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 719 | \begin{frame}{Conditional Independence and $d$-Separation} 720 | 721 | \plitemsep 0.07in 722 | \bci 723 | \item \question How can we see conditional independence in the directed graphical models? For example, $\set{A} \indep \set{B} \mid \set{C}$? 
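As a complement to the factorization semantics above, and as a preview of the d-separation rules that follow, here is a sketch (my own example, not from the slides) that assembles the joint distribution of the five-node example from its factors p(x1)p(x5)p(x2|x5)p(x3|x1,x2)p(x4|x2) with arbitrary binary CPTs, and then checks numerically that x1 and x5 are independent marginally but become dependent once the collider x3 is observed.

```python
# Build a joint distribution exactly as the factorization over parents prescribes,
# then probe one (conditional) independence implied by the graph structure.
from itertools import product

# Arbitrary (hypothetical) CPTs; each value is P(var = 1 | parents).
p_x1 = 0.3
p_x5 = 0.6
p_x2_given_x5 = {0: 0.2, 1: 0.7}
p_x3_given_x1x2 = {(0, 0): 0.1, (0, 1): 0.5, (1, 0): 0.4, (1, 1): 0.9}
p_x4_given_x2 = {0: 0.3, 1: 0.8}

def bern(p, v):                 # P(var = v) for a binary variable with P(var = 1) = p
    return p if v == 1 else 1 - p

joint = {}
for x1, x2, x3, x4, x5 in product([0, 1], repeat=5):
    joint[(x1, x2, x3, x4, x5)] = (bern(p_x1, x1) * bern(p_x5, x5) *
                                   bern(p_x2_given_x5[x5], x2) *
                                   bern(p_x3_given_x1x2[(x1, x2)], x3) *
                                   bern(p_x4_given_x2[x2], x4))

def prob(**fixed):              # marginal probability of a partial assignment
    names = ("x1", "x2", "x3", "x4", "x5")
    return sum(p for assign, p in joint.items()
               if all(assign[names.index(k)] == v for k, v in fixed.items()))

for v5 in (0, 1):
    marg = prob(x1=1, x5=v5) / prob(x5=v5)
    cond = prob(x1=1, x5=v5, x3=1) / prob(x5=v5, x3=1)
    print(f"P(x1=1 | x5={v5}) = {marg:.3f}   P(x1=1 | x5={v5}, x3=1) = {cond:.3f}")
# The first column is constant in x5 (the trail x1 -> x3 <- x2 <- x5 is blocked at the
# head-to-head node x3), while the second column varies with x5: conditioning on the
# collider x3 unblocks the trail, exactly as the d-separation rules below state.
```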
724 | \item \bluef{$d$-separation} 725 | \bci 726 | \item All possible trails\footnote{paths that ignore the direction of the arrows} from any node $\set{A}$ to any node in $\set{B}$ 727 | \item Any such path is blocked if it includes any node such that either of the following is true: 728 | \bci 729 | \item The arrows on the path meet either head to tail or tail to tail at the node, and the node is in \set{C} 730 | \item The arrows meet head to head at the node, and neither the node nor any of its descendants is in \set{C} 731 | \eci 732 | \item If all the paths are blocked, then $\set{A}$ is $d$-separated from $\set{B}$ by $\set{C}.$ 733 | \item If $d$-separated, $\set{A} \indep \set{B} \mid \set{C}$ 734 | \eci 735 | \eci 736 | 737 | % \myvartwocols{0.4}{0.7}{0.26} 738 | % { 739 | % \small 740 | 741 | % } 742 | % { 743 | % \mypic{0.7}{L8_dsep.png} 744 | % } 745 | 746 | \end{frame} 747 | 748 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 749 | \begin{frame}{Example} 750 | 751 | \mypic{0.25}{L8_dsep.png} 752 | % \vspace{-0.8cm} 753 | % \raggedleft{\scriptsize Source: \url{http://www.causality.inf.ethz.ch/data/LUCAS.html}} 754 | \plitemsep 0.1in 755 | \bci 756 | \item $b \indep d \mid a,c$ 757 | \item $a \indep c \mid b$ 758 | \item $b \not\indep d \mid c$ 759 | \item $a \not\indep c \mid b,e$ 760 | \eci 761 | \end{frame} 762 | 763 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 764 | \begin{frame}{Example in Healthcare} 765 | 766 | \mypic{0.6}{L8_lung_cancer.png} 767 | \vspace{-0.8cm} 768 | \raggedleft{\scriptsize Source: \url{http://www.causality.inf.ethz.ch/data/LUCAS.html}} 769 | 770 | \end{frame} 771 | 772 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 773 | \begin{frame}{Three Types of Graphical Models} 774 | 775 | \mypic{0.7}{L8_all_gmodels.png} 776 | 777 | \plitemsep 0.15in 778 | \bci 779 | \item \bluef{Directed graphical models (or Bayesian Networks)} 780 | 781 | \item Undirected graphical models (Markov Random Fields) 782 | 783 | \item Factor graphs 784 | \eci 785 | 786 | \end{frame} 787 | 788 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 789 | \begin{frame}{Roadmap} 790 | 791 | \plitemsep 0.1in 792 | 793 | \bce[(6)] 794 | 795 | \item \grayf{Data, Models, and Learning} 796 | \item \grayf{Models as Functions: Empirical Risk Minimization} 797 | \item \grayf{Models as Probabilistic Models: Parameter Estimation (ML and MAP)} 798 | \item \grayf{Probabilistic Modeling and Inference} 799 | \item \grayf{Directed Graphical Models } 800 | \item \redf{Model Selection} 801 | \ece 802 | \end{frame} 803 | 804 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 805 | \begin{frame}{Nested Cross-Validation} 806 | 807 | 808 | \mypic{0.65}{L8_nested_cross_validation.png} 809 | 810 | \plitemsep 0.1in 811 | 812 | \bci 813 | 814 | \item Model selection 815 | \bci 816 | \item Tradeoff between model complexity and data fit 817 | 818 | \item \bluef{Occam's razor.} Find the simplest model that explains the data resonably well. 819 | \eci 820 | 821 | 822 | \item Test set: estimate the generalization performance 823 | 824 | \item Validation set: choose the best model 825 | \eci 826 | \end{frame} 827 | 828 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 829 | \begin{frame}{Bayesian Model Selection} 830 | 831 | \plitemsep 0.1in 832 | 833 | \bci 834 | 835 | \item A set of models $\bm{M} = \{M_1, \ldots, M_k \},$ where each $M_k$ has $\vth_k$ parameters. 
A prior $\cprob{M}$ on each model $M \in \bm{M}.$ 836 | $$ 837 | M_k \sim \cprob{M}, \quad \vth_k \sim \cprob{\vth \mid M_k}, \quad \set{D} \sim \cprob{\set{D} \mid \vth_k} 838 | $$ 839 | \item Posterior distribution $\cprob{M_k \mid \set{D}} \propto \cprob{M_k} \cprob{\set{D} \mid M_k},$ where we have the following \bluef{model evidence} or \bluef{marginal likelihood}: 840 | $$ 841 | \cprob{\set{D} \mid M_k} = \int \cprob{\set{D} \mid \vth_k} \cprob{\vth_k \mid M_k} \text{d}\vth_k \quad \text{(***)} 842 | $$ 843 | \item MAP for the model: $M^* = \arg \max_{M_k} \cprob{M_k \mid \set{D}}$ 844 | \item With the uniform model prior (i.e., $\cprob{M_k} = 1/k$), the MAP estimate equals to maximization of model evidence. 845 | \eci 846 | \end{frame} 847 | 848 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 849 | \begin{frame}{Bayes Factors for Model Comparison} 850 | 851 | \plitemsep 0.1in 852 | 853 | \bci 854 | 855 | \item Compare two probabilistic models $M_1$ and $M_2$: 856 | $$ 857 | \text{(Posterior odds)} = \frac{\cprob{M_1 \mid \set{D}}}{\cprob{M_2 \mid \set{D}}} = \frac 858 | { 859 | \frac{\cprob{\set{D} \mid M_1}\cprob{M_1}}{\cprob{\set{D}}} 860 | } 861 | { 862 | \frac{\cprob{\set{D} \mid M_2}\cprob{M_2}}{\cprob{\set{D}}} 863 | } 864 | = \underbrace{\frac{\cprob{M_1}}{\cprob{M_2}}}_{\text{Prior odds}} 865 | \underbrace{\frac{\cprob{\set{D} \mid M_1}}{\cprob{\set{D} \mid M_2}}}_{\text{Bayes factor}} 866 | $$ 867 | \item $\cprob{\set{D} \mid M_k}$: How well the data is predicted by the model $M_k$ 868 | \item With the uniform model prior, the prior odds $= 1$ 869 | 870 | \item Computation of Bayes factor requires the complex integration (***) in the previous slide. In this case, we rely on some approximations such as MCMC (Markov Chain Monte Carlo). 871 | \eci 872 | 873 | \end{frame} 874 | 875 | 876 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 877 | \begin{frame}{Summary} 878 | 879 | \plitemsep 0.1in 880 | 881 | \bci 882 | 883 | \item 884 | \eci 885 | \end{frame} 886 | 887 | 888 | 889 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 890 | \begin{frame}{} 891 | \vspace{2cm} 892 | \LARGE Questions? 
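A hedged sketch of Bayesian model selection (my own example, not from the slides): two "models" of a coin differ only in their prior on the head probability, and because a Beta prior makes the model evidence (***) analytic, the Bayes factor can be computed directly. The prior parameters and the data counts are arbitrary illustrative values.

```python
# Model evidence and Bayes factor for two coin models that differ only in their prior.
from math import lgamma, exp

def log_beta(a, b):
    return lgamma(a) + lgamma(b) - lgamma(a + b)

def log_evidence(heads, flips, a, b):
    # log p(D | M) for one observed sequence with `heads` heads out of `flips`,
    # integrating the Bernoulli likelihood against a Beta(a, b) prior on theta.
    return log_beta(heads + a, flips - heads + b) - log_beta(a, b)

heads, flips = 9, 10
m1 = log_evidence(heads, flips, a=50.0, b=50.0)   # M1: prior concentrated near a fair coin
m2 = log_evidence(heads, flips, a=1.0, b=1.0)     # M2: uniform prior over theta
print("log p(D|M1) =", round(m1, 3), "  log p(D|M2) =", round(m2, 3))
print("Bayes factor p(D|M2)/p(D|M1) =", round(exp(m2 - m1), 2))
# With a uniform model prior the posterior odds equal this Bayes factor,
# so 9 heads out of 10 favor the model that allows a biased coin (M2) here.
```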
893 | 894 | 895 | \end{frame} 896 | 897 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 898 | \begin{frame}{Review Questions} 899 | % \tableofcontents 900 | %\plitemsep 0.1in 901 | \bce[1)] 902 | \item 903 | 904 | \ece 905 | \end{frame} 906 | 907 | 908 | \end{document} 909 | -------------------------------------------------------------------------------- /09.LinearRegression/9.LR-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/09.LinearRegression/9.LR-2.pdf -------------------------------------------------------------------------------- /09.LinearRegression/9.LR-4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/09.LinearRegression/9.LR-4.pdf -------------------------------------------------------------------------------- /09.LinearRegression/9.LR.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/09.LinearRegression/9.LR.pdf -------------------------------------------------------------------------------- /09.LinearRegression/L9_LR_gmodel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/09.LinearRegression/L9_LR_gmodel.png -------------------------------------------------------------------------------- /09.LinearRegression/L9_bayesian_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/09.LinearRegression/L9_bayesian_regression.png -------------------------------------------------------------------------------- /09.LinearRegression/L9_overfit_linear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/09.LinearRegression/L9_overfit_linear.png -------------------------------------------------------------------------------- /09.LinearRegression/L9_poly4fit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/09.LinearRegression/L9_poly4fit.png -------------------------------------------------------------------------------- /09.LinearRegression/L9_posterior_predictive_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/09.LinearRegression/L9_posterior_predictive_ex.png -------------------------------------------------------------------------------- /09.LinearRegression/L9_regression_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/09.LinearRegression/L9_regression_ex.png -------------------------------------------------------------------------------- /09.LinearRegression/L9_training_test.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/09.LinearRegression/L9_training_test.png -------------------------------------------------------------------------------- /09.LinearRegression/main.tex: -------------------------------------------------------------------------------- 1 | %\pdfminorversion=4 2 | \documentclass[handout,fleqn,aspectratio=169]{beamer} 3 | 4 | \input{../myhead} 5 | 6 | \title[]{Lecture 9: Linear Regression} 7 | \author{Yi, Yung (이융)} 8 | \institute{Mathematics for Machine Learning\\ \url{https://yung-web.github.io/home/courses/mathml.html} 9 | \\KAIST EE} 10 | \date{\today} 11 | 12 | 13 | \input{../mymath} 14 | \input{../mymacro} 15 | 16 | 17 | %\addtobeamertemplate{footline}{\rule{0.94\paperwidth}{1pt}}{} 18 | 19 | \begin{document} 20 | 21 | \input{../mydefault} 22 | 23 | 24 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 25 | \begin{frame}{Warm-Up} 26 | 27 | {\Large Please watch this tutorial video by Luis Serrano on PCA.} 28 | 29 | \bigskip 30 | 31 | \bigskip 32 | 33 | \url{https://www.youtube.com/watch?v=wYPUhge9w5c} 34 | 35 | \end{frame} 36 | 37 | % START START START START START START START START START START START START START 38 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 39 | \begin{frame}{Roadmap} 40 | 41 | \plitemsep 0.1in 42 | 43 | \bce[(1)] 44 | 45 | \item Problem Formulation 46 | \item Parameter Estimation: ML 47 | \item Parameter Estimation: MAP 48 | \item Bayesian Linear Regression 49 | \item Maximum Likelihood as Orthogonal Projection 50 | 51 | \ece 52 | \end{frame} 53 | 54 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 55 | \section{L9(1)} 56 | \begin{frame}{Roadmap} 57 | 58 | \plitemsep 0.1in 59 | 60 | \bce[(1)] 61 | 62 | \item \redf{Problem Formulation} 63 | \item \grayf{Parameter Estimation: ML 64 | \item Parameter Estimation: MAP 65 | \item Bayesian Linear Regression 66 | \item Maximum Likelihood as Orthogonal Projection } 67 | 68 | \ece 69 | \end{frame} 70 | 71 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 72 | \begin{frame}{Regression Problem} 73 | 74 | \mypic{0.75}{L9_regression_ex.png} 75 | 76 | \plitemsep 0.1in 77 | 78 | \bci 79 | 80 | \item For some input values $x_n,$ we observe noisy function values $y_n = f(x_n) + \epsilon$ 81 | 82 | \item Goal: infer the function $f$ that generalizes well to function values at new inputs 83 | 84 | \item Applications: time-series analysis, control and robotics, image recognition, etc. 
85 | \eci 86 | \end{frame} 87 | 88 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 89 | \begin{frame}{Formulation} 90 | 91 | \plitemsep 0.07in 92 | 93 | \bci 94 | 95 | 96 | \item[] 97 | {\small 98 | Notation for simplification (this is the notation the textbook uses) 99 | \aleq{ 100 | \redf{p(y|\vx)} = p_{Y|\vX}(y | \vx), \quad Y \sim \set{N}(\mu,\sigma^2) \xrightarrow{\text{simplifies}} \set{N}(y \mid f(\vx), \sigma^2) 101 | } 102 | } 103 | \item Assume: \bluef{linear} regression, \bluef{Gaussian} noise 104 | 105 | \item $y = f(\vx) + \epsilon,$ where $\epsilon \sim \set{N}(0,\sigma^2)$ 106 | 107 | 108 | \item Likelihood: for $\vx \in \real^D$ and $y \in \real,$ $p(y \mid \vx) = \set{N}(y \mid f(\vx), \sigma^2)$ 109 | 110 | 111 | \item Linear regression with the parameter $\vth \in \realD,$ i.e., $f(\vx) = \trans{\vx}\vth$ 112 | $$ 113 | p(y \mid \vx) = \set{N}(y \mid \trans{\vx}\vth, \sigma^2) \Longleftrightarrow y = \trans{\vx}\vth + \epsilon, \quad \epsilon \sim \set{N}(0,\sigma^2) 114 | $$ 115 | 116 | \mycolorbox 117 | { 118 | \centering 119 | Likelihood with Gaussian noise: $p(y \mid \vx) = \set{N}(y \mid \trans{\vx}\vth, \sigma^2)$ 120 | } 121 | 122 | \eci 123 | \end{frame} 124 | 125 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 126 | \begin{frame}{Parameter Estimation} 127 | 128 | \plitemsep 0.2in 129 | 130 | \bci 131 | 132 | \item Training set $\set{D} = \{(\vx_1, y_1), \ldots, (\vx_N,y_N) \}$\hspace{3cm} 133 | \myinlinepic{2.5cm}{L9_LR_gmodel.png} 134 | 135 | \item Assuming iid $N$ data samples, the likelihood is factorized into: 136 | $$ 137 | p(\set{Y} \mid \set{X},\vth) = \prod_{n=1}^N p(y_n \mid \vx_n, \vth) = \prod_{n=1}^N 138 | \set{N}(y_n \mid \trans{\vx}_n \vth, \sigma^2), 139 | $$ 140 | where $\set{X} = \{\vx_1,\ldots,\vx_N \}$ and $\set{Y} = \{y_1,\ldots,y_N \}$ 141 | \item Estimation methods: ML and MAP 142 | \eci 143 | \end{frame} 144 | 145 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 146 | \section{L9(2)} 147 | \begin{frame}{Roadmap} 148 | 149 | \plitemsep 0.1in 150 | 151 | \bce[(1)] 152 | 153 | \item \grayf{Problem Formulation} 154 | \item \redf{Parameter Estimation: ML} 155 | \item \grayf{Parameter Estimation: MAP 156 | \item Bayesian Linear Regression 157 | \item Maximum Likelihood as Orthogonal Projection } 158 | 159 | \ece 160 | \end{frame} 161 | 162 | 163 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 164 | \begin{frame}{MLE (Maximum Likelihood Estimation) (1)} 165 | 166 | \plitemsep 0.1in 167 | 168 | \bci 169 | 170 | \item $\vth_\ml = \arg \max_{\vth} p(\cY \mid \cX, \vth) = \arg \min_{\vth} \Big( -\log p(\cY \mid \cX, \vth) \Big)$ 171 | \item For Gaussian noise with $\mX = \trans{[\vx_1, \ldots, \vx_n]}$ and $\vy = \trans{[y_1, \ldots, y_n]},$ 172 | \aleq{ 173 | -\log p(\cY \mid \cX, \vth) &= -\log \prod_{n=1}^N p(y_n \mid \vx_n, \vth) = -\sum_{n=1}^N \log p(y_n \mid \vx_n, \vth) \cr 174 | & = \frac{1}{2\sigma^2} \sum_{n=1}^N (y_n - \trans{\vx}_n \vth)^2 + \ \text{const} = \frac{1}{2\sigma^2} \norm{\vy - \mX \vth}^2 + \ \text{const} 175 | } 176 | 177 | \mycolorbox 178 | { 179 | Negative-log likelihood for $f(\vx) = \trans{\vx}\vth + \set{N}(0,\sigma^2)$: 180 | \vspace{-0.1cm} 181 | $$ 182 | -\log p(\cY \mid \cX, \vth) = \frac{1}{2\sigma^2} \norm{\vy - \mX \vth}^2 + \ \text{const} 183 | $$ 184 | } 185 | \eci 186 | \end{frame} 187 | 188 | 189 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 190 | \begin{frame}{MLE (Maximum Likelihood Estimation) (2)} 191 | 192 | \plitemsep 0.2in 193 | 194 | \bci 195 | 196 | \item For Gaussian noise
with $\mX = \trans{[\vx_1, \ldots, \vx_n]}$ and $\vy = \trans{[y_1, \ldots, y_n]},$ 197 | \aleq{ 198 | \vth_\ml = \arg \min_{\vth} \frac{1}{2\sigma^2} \norm{\vy - \mX \vth}^2, \quad L(\vth) = \frac{1}{2\sigma^2} \norm{\vy - \mX \vth}^2 199 | } 200 | 201 | \item In case of Gaussian noise, $\vth_{\ml}= \vth$ that minimizes the empirical risk with the squared loss function 202 | \bci 203 | \item Models as functions $=$ Model as probabilistic models 204 | \eci 205 | 206 | \eci 207 | \end{frame} 208 | 209 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 210 | \begin{frame}{MLE (Maximum Likelihood Estimation) (3)} 211 | 212 | \plitemsep 0.2in 213 | 214 | \bci 215 | 216 | \item We find $\vth$ such that $\d{L}{\vth}=0$ 217 | \aleq{ 218 | &\d{L}{\vth} = \frac{1}{2\sigma^2} \left (-2 \trans{(\vy - \mX \vth)} \mX\right) = \frac{1}{\sigma^2} \left (-\trans{\vy}\vX +\trans{\vth}\trans{\mX}\mX \right ) = 0\cr 219 | &\Longleftrightarrow \trans{\vth}_\ml\trans{\mX}\mX = \trans{\vy}\vX \cr 220 | & \Longleftrightarrow 221 | \trans{\vth}_\ml = \trans{\vy}\vX \inv{(\trans{\mX}\mX)} \quad \text{($\trans{\mX}\mX$ is positive definite if $\rk{\mX}=D$)} \cr 222 | & \Longleftrightarrow \vth_\ml = \inv{(\trans{\mX}\mX)} \trans{\vX} \vy 223 | } 224 | 225 | \eci 226 | \end{frame} 227 | 228 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 229 | \begin{frame}{MLE with Features} 230 | 231 | \plitemsep 0.07in 232 | 233 | \bci 234 | 235 | \item Linear regression: Linear in terms of \bluef{the parameters} 236 | \bci 237 | \item $\trans{\phi(\vx)} \vth$ is also fine, where $\phi(\vx)$ can be non-linear (we will cover this later) 238 | \item $\phi(\vx)$ are the features 239 | \eci 240 | 241 | \item Linear regression with the parameter $\vth \in \real^K,$ $\phi(\vx): \realD \mapsto \real^K$: 242 | $$ 243 | p(y \mid \vx) = \set{N}(y \mid \trans{\phi(\vx)} \vth, \sigma^2) \Longleftrightarrow y = \trans{\phi(\vx)} \vth + \epsilon = \sum_{k=0}^{K-1} \theta_k \phi_k(\vx) + \epsilon 244 | $$ 245 | 246 | \item \exam \bluef{Polynomial regression.} For $x\in \real$ and $\vth \in \real^K$, we lift the original 1-D input into $K$-D feature space with monomials $x^k$: 247 | \aleq{ 248 | \phi(x) = \colvec{\phi_0(x) \\ \vdots \\ \phi_{K-1}(x)} = \colvec{1 \\ \vdots \\ x^{K-1}} \in \real^K 249 | \quad \implies \quad f(x) = \sum_{k=0}^{K-1} \theta_k x^k 250 | } 251 | 252 | \eci 253 | \end{frame} 254 | 255 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 256 | \begin{frame}{Feature Matrix and MLE} 257 | 258 | \plitemsep 0.15in 259 | 260 | \bci 261 | 262 | \item Now, for the entire training set $\{\vx_1, \ldots, \vx_N \}$, 263 | \aleq{ 264 | \bm{\Phi} \eqdef \colvec{\trans{\phi}(\vx_1)\\ \vdots \\\trans{\phi}(\vx_N)} 265 | = \begin{nmat} 266 | \phi_0(\vx_1) & \cdots & \phi_{K-1}(\vx_1) \cr 267 | \vdots & \cdots & \vdots \cr 268 | \phi_0(\vx_N) & \cdots & \phi_{K-1}(\vx_N) 269 | \end{nmat} 270 | \in \real^{N \times K}, \ \mPhi_{ij} = \phi_j(\vx_i), \ \phi_j: \realD \mapsto \real 271 | } 272 | \item Negative log-likelihood: Similarly to the case of $\vy = \mX \vth,$ 273 | \mycolorbox 274 | { 275 | \bci 276 | \item $p(\set{Y}| \set{X},\vth) = \set{N}(\vy \mid \mPhi\vth, \sigma^2\mI)$ 277 | \item Negative-log likelihood for $f(\vx) = \trans{\phi}(\vx)\vth + \set{N}(0,\sigma^2)$: 278 | \vspace{-0.1cm} 279 | $$ 280 | -\log p(\cY \mid \cX, \vth) = \dfrac{1}{2\sigma^2} \norm{\vy - \bm{\Phi}\vth}^2 + \text{const} 281 | $$ 282 | \eci 283 | } 284 | 285 | 286 | % $$ 287 | % -\log p(\set{Y} \mid \set{X},\vth) = \dfrac{1}{2\sigma^2} 
\norm{\vy - \bm{\Phi}\vth}^2 + \text{const} 288 | % $$ 289 | 290 | \item MLE: 291 | $ 292 | \vth_\ml = \inv{(\trans{\bm{\Phi}}\bm{\Phi})} \trans{\bm{\Phi}} \vy 293 | $ 294 | \eci 295 | \end{frame} 296 | 297 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 298 | \begin{frame}{Polynomial Fit} 299 | 300 | \plitemsep 0.07in 301 | 302 | \bci 303 | 304 | \item $N=10$ data, where $x_n \sim \set{U}[-5,5]$ and $y_n = -\sin(x_n/5) + \cos(x_n) + \epsilon,$ $\epsilon \sim \set{N}(0,0.2^2)$ 305 | 306 | \item Fit with poloynomial with degree 4 using ML 307 | \eci 308 | 309 | \mypic{0.8}{L9_poly4fit.png} 310 | 311 | \end{frame} 312 | 313 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 314 | \begin{frame}{Overfitting in Linear Regression} 315 | 316 | \myvartwocols{0.55}{0.7}{0.27} 317 | { 318 | \vspace{-0.4cm} 319 | \mypic{0.95}{L9_overfit_linear.png} 320 | } 321 | { 322 | %\vspace{-0.4cm} 323 | \mypic{0.99}{L9_training_test.png} 324 | } 325 | 326 | 327 | \plitemsep 0.04in 328 | 329 | \bci 330 | 331 | \item Higher polynomial degree is better (training error always decreases) 332 | 333 | \item Test error increases after some polynomial degree 334 | \eci 335 | \end{frame} 336 | 337 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 338 | \section{L9(3)} 339 | \begin{frame}{Roadmap} 340 | 341 | \plitemsep 0.1in 342 | 343 | \bce[(1)] 344 | 345 | \item \grayf{Problem Formulation} 346 | \item \grayf{Parameter Estimation: ML} 347 | \item \redf{Parameter Estimation: MAP} 348 | \item \grayf{Bayesian Linear Regression 349 | \item Maximum Likelihood as Orthogonal Projection } 350 | 351 | \ece 352 | \end{frame} 353 | 354 | 355 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 356 | \begin{frame}{MAPE (Maximum A Posteriori Estimation)} 357 | 358 | \plitemsep 0.15in 359 | 360 | \bci 361 | 362 | \item MLE: prone to overfitting, where the magnitude of the parameters becomes large. 
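The overfitting behaviour just described can be reproduced in a few lines. The sketch below (assuming numpy is available; the data-generating function follows the polynomial-fit example above) computes theta_ml for several polynomial degrees via least squares, equal to (Phi^T Phi)^{-1} Phi^T y when Phi has full column rank, and reports training versus test RMSE.

```python
# Polynomial regression by maximum likelihood: training error keeps dropping with
# the degree, while test error eventually grows (overfitting).
import numpy as np

rng = np.random.default_rng(0)
f = lambda x: -np.sin(x / 5.0) + np.cos(x)

def make_data(n):
    x = rng.uniform(-5, 5, size=n)
    return x, f(x) + rng.normal(0.0, 0.2, size=n)

def features(x, degree):                 # Phi_{nk} = x_n^k, k = 0..degree
    return np.vander(x, degree + 1, increasing=True)

x_tr, y_tr = make_data(10)
x_te, y_te = make_data(200)

for degree in (1, 4, 9):
    Phi = features(x_tr, degree)
    # least-squares solution; equals (Phi^T Phi)^{-1} Phi^T y for full column rank
    theta_ml = np.linalg.lstsq(Phi, y_tr, rcond=None)[0]
    rmse = lambda x, y: np.sqrt(np.mean((features(x, degree) @ theta_ml - y) ** 2))
    print(f"degree {degree}: train RMSE {rmse(x_tr, y_tr):.3f}, test RMSE {rmse(x_te, y_te):.3f}")
# Typical outcome: the degree-9 fit nearly interpolates the 10 training points
# (train RMSE close to zero) but its test RMSE is far worse than the degree-4 fit.
```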
363 | \item a prior distribution $p(\vth)$ helps: what $\vth$ is plausible 364 | \item MAPE and Bayes' theorem 365 | \aleq{ 366 | p(\vth \mid \set{X},\set{Y}) = \frac{p(\set{Y} \mid \set{X}, \vth) p(\vth)}{p(\set{Y} \mid \set{X})} 367 | \implies 368 | \vth_\map \in \arg\min_{\vth} \Big(-\log p(\set{Y} \mid \set{X},\vth) - \log p(\vth)\Big) 369 | } 370 | \item Gradient 371 | \aleq{ 372 | - \d{\log p(\vth | \set{X},\set{Y})}{\vth} = - \d{\log p(\set{Y}|\set{X},\vth)}{\vth} - \d{\log p(\vth)}{\vth} 373 | } 374 | 375 | \eci 376 | 377 | \end{frame} 378 | 379 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 380 | \begin{frame}{MAPE for Gausssian Prior (1)} 381 | 382 | \plitemsep 0.07in 383 | 384 | \bci 385 | 386 | \item \exam A (conjugate) Gaussian prior $p(\vth) \sim \set{N}(\vec{0}, b^2 \mI)$ 387 | \bci 388 | \item For Gaussian likelihood, Gaussian prior $\implies$ Gaussian posterior \hfill \lecturemark{L6(6)} 389 | \eci 390 | 391 | \item Negative log-posterior 392 | 393 | \medskip 394 | \mycolorbox 395 | { 396 | Negative-log posterior for $f(\vx) = \trans{\phi}(\vx)\vth + \set{N}(0,\sigma^2)$ and $p(\vth) \sim \set{N}(\vec{0}, b^2 \mI)$: 397 | \vspace{-0.1cm} 398 | $$ 399 | -\log p(\vth | \set{X},\set{Y}) = \frac{1}{2\sigma^2} \trans{(\vy - \mPhi\vth)} (\vy - \mPhi\vth) + \frac{1}{2b^2}\trans{\vth}\vth + \text{const} 400 | $$ 401 | } 402 | \item Gradient 403 | \aleq{ 404 | -\d{\log p(\vth | \set{X},\set{Y})}{\vth} &= \frac{1}{\sigma^2} 405 | (\trans{\vth}\trans{\mPhi}\mPhi - \trans{\vy}\mPhi) + \frac{1}{b^2}\trans{\vth} 406 | } 407 | 408 | \eci 409 | 410 | \end{frame} 411 | 412 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 413 | \begin{frame}{MAPE for Gausssian Prior (2)} 414 | 415 | \plitemsep 0.1in 416 | 417 | \bci 418 | 419 | \item MAP vs. ML 420 | $$ 421 | \vth_\map = \inv{ 422 | \underbrace{\Big(\trans{\mPhi} \mPhi + \bluef{\frac{\sigma^2}{b^2}\mI} \Big)}_{(*)} 423 | } \trans{\mPhi} \vy, \quad \vth_\ml = \inv{(\trans{\bm{\Phi}}\bm{\Phi})} \trans{\bm{\Phi}} \vy 424 | $$ 425 | 426 | \item The term $\bluef{\dfrac{\sigma^2}{b^2}\mI}$ 427 | \bci 428 | \item Ensures that $(*)$ is symmetric, strictly positive definite 429 | \item Role of regularizer 430 | \eci 431 | \eci 432 | 433 | \end{frame} 434 | 435 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 436 | \begin{frame}{Aside: MAPE for General Gausssian Prior (3)} 437 | 438 | \plitemsep 0.07in 439 | 440 | \bci 441 | 442 | \item \exam A (conjugate) Gaussian prior $p(\vth) \sim \bluef{\set{N}(\vm_0, \mS_0)}$ 443 | 444 | \item Negative log-posterior 445 | 446 | \medskip 447 | 448 | \begin{tcolorbox}[width=14cm,colback=red!5!white,colframe=red!75!black] 449 | Negative-log posterior for $f(\vx) = \trans{\phi}(\vx)\vth + \set{N}(0,\sigma^2)$ and $p(\vth) \sim \set{N}(\vm_0, \mS_0)$: 450 | \vspace{-0.1cm} 451 | \aleq{ 452 | -\log p(\vth | \set{X},\set{Y}) &= \frac{1}{2\sigma^2} \trans{(\vy - \mPhi\vth)} (\vy - \mPhi\vth) + \bluef{\frac{1}{2}\trans{(\vth - \vm_0)}\inv{\mS}_0(\vth-\vm_0)} + \text{const} 453 | } 454 | \end{tcolorbox} 455 | 456 | \item We will use this later for computing the parameter posterior distribution in Bayesian linear regression. 457 | \eci 458 | 459 | \end{frame} 460 | 461 | 462 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 463 | \begin{frame}{Regularization: MAPE vs. 
Explicit Regularizer} 464 | 465 | 466 | \plitemsep 0.1in 467 | 468 | 469 | \bci 470 | 471 | \item \bluef{Explicit regularizer} in regularized least squares (RLS) 472 | $$ 473 | \norm{\vy - \mPhi\vth}^2 + \lambda \norm{\vth}^2 474 | $$ 475 | 476 | \item \bluef{MAPE wth Gaussian prior} $p(\vth) \sim \set{N}(\vec{0},b^2 \mI)$ 477 | \bci 478 | \item Negative log-Gaussian prior 479 | $$ 480 | -\log p(\vth) = \frac{1}{2b^2}\trans{\vth}\vth + \text{const} 481 | $$ 482 | \item $\lambda = 1/2b^2$ is the regularization term 483 | \eci 484 | 485 | \item Not surprising that we have 486 | $$ 487 | \vth_{\text{RLS}} = \inv{ 488 | \Big(\trans{\mPhi} \mPhi + \bluef{\lambda \mI} \Big) 489 | } \trans{\mPhi} \vy 490 | $$ 491 | \eci 492 | 493 | \end{frame} 494 | 495 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 496 | \section{L9(4)} 497 | \begin{frame}{Roadmap} 498 | 499 | \plitemsep 0.1in 500 | 501 | \bce[(1)] 502 | 503 | \item \grayf{Problem Formulation} 504 | \item \grayf{Parameter Estimation: ML} 505 | \item \grayf{Parameter Estimation: MAP} 506 | \item \redf{Bayesian Linear Regression} 507 | \item \grayf{Maximum Likelihood as Orthogonal Projection } 508 | 509 | \ece 510 | \end{frame} 511 | 512 | 513 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 514 | \begin{frame}{Bayesian Linear Regression} 515 | 516 | 517 | \plitemsep 0.05in 518 | 519 | 520 | \bci 521 | 522 | \item Earlier, ML and MAP. Now, \bluef{fully Bayesian} \hfill \lecturemark{L8(4)} 523 | 524 | \item Model 525 | \mysmalltwocols{0.25} 526 | { 527 | \vspace{-0.2cm} 528 | \aleq{ 529 | \text{prior} \quad & p(\vth) \sim \set{N}(\vm_0, \mS_0) \cr 530 | \text{likelihood} \quad& p(y | \vx, \vth) \sim \set{N}\big(y \mid \trans{\phi}(\vx)\vth,\sigma^2 \big)\cr 531 | \text{joint} \quad & p(y,\vth | \vx) = p(y \mid \vx,\vth) p(\vth) 532 | } 533 | } 534 | { 535 | \vspace{-0.2cm} 536 | \mypic{0.4}{L9_bayesian_regression.png} 537 | } 538 | \item Goal: For an input $\vx_*,$ we want to compute the following \bluef{posterior predictive distribution}\footnote{\lecturemark{Chapter 9.3.4} For ease of understanding, I've slightly changed the organization of these lecture slides from that of the textbook.} of $y_*$: 539 | \vspace{-0.3cm} 540 | $$ 541 | \displaystyle 542 | p(y_* | x_*, \set{X},\set{Y}) = \int \overbrace{p(y_* | \vx_*, \vth)}^{\text{likelihood}} 543 | \overbrace{p(\vth | \set{X},\set{Y})}^{(*)}\text{d}\vth 544 | $$ 545 | \vspace{-0.3cm} 546 | \bci 547 | \item $(*)$: parameter posterior distribution that needs to be computed 548 | \eci 549 | \eci 550 | 551 | \end{frame} 552 | 553 | % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 554 | % \begin{frame}{Prior Predictions} 555 | 556 | 557 | % \plitemsep 0.07in 558 | 559 | % \bci 560 | 561 | % \item Fully Bayesian: Predictions by taking the parameter distribution and average over all plausible parameter setting. For an input $\vx_*$, 562 | % $$ 563 | % p(y_* | \vx_*) = \int p(y,\vth | \vx) \text{d}\vth 564 | % = \int p(y_* | \vx_*, \vth) p(\vth) \text{d}\vth = \bexpecti{\vth}{p(y_*|\vx_*, \vth)} 565 | % $$ 566 | 567 | % \item This prediction based on \bluef{prior distribution} requires only input, not depending on the training data. 568 | 569 | % \item Later, we will discuss \bluef{posterior prediction} which uses the modified predictive distribution based on the training data. 
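Below is a sketch (numpy assumed; sigma, b, and the random feature matrix are hypothetical choices) of the closed-form MAP estimate above, theta_map = (Phi^T Phi + (sigma^2/b^2) I)^{-1} Phi^T y, showing how the prior width b controls shrinkage toward zero relative to the ML estimate.

```python
# MAP vs ML for a Gaussian prior N(0, b^2 I): the extra diagonal term acts as a regularizer.
import numpy as np

rng = np.random.default_rng(0)
Phi = rng.normal(size=(20, 5))                  # some feature matrix (20 samples, 5 features)
y = rng.normal(size=20)
sigma = 0.2

theta_ml = np.linalg.lstsq(Phi, y, rcond=None)[0]
print("||theta_ml ||           =", round(float(np.linalg.norm(theta_ml)), 3))
for b in (10.0, 1.0, 0.05):                     # broad prior ~ ML; tight prior -> strong shrinkage
    A = Phi.T @ Phi + (sigma ** 2 / b ** 2) * np.eye(Phi.shape[1])
    theta_map = np.linalg.solve(A, Phi.T @ y)
    print(f"||theta_map|| (b={b:5.2f}) =", round(float(np.linalg.norm(theta_map)), 3))
# The norm of theta_map shrinks as b decreases (stronger prior) and approaches
# ||theta_ml|| as b grows, which is exactly the regularizing role described above.
```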
570 | 571 | % \item \bluef{Prior predictive distribution} 572 | % $$ 573 | % \redf{p(y_* | x_*) = \set{N}\Big(\trans{\phi}(\vx_*)\vm_0, \trans{\phi}(\vx_*)\mS_0 \phi(\vx_*)+ 574 | % \sigma^2 \Big)} 575 | % $$ 576 | 577 | % \eci 578 | 579 | % \end{frame} 580 | 581 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 582 | \begin{frame}{Parameter Posterior Distribution (1)} 583 | 584 | \plitemsep 0.07in 585 | 586 | \bci 587 | 588 | \item \bluef{Parameter posterior distribution}\hfill \lecturemark{Chapter 9.3.3} 589 | \mycolorbox{ 590 | $$ 591 | \redf{p(\vth \mid \set{X},\set{Y}) = \set{N}(\vth \mid \vm_N,\mS_N)}, \quad \text{where} 592 | $$ 593 | $$ 594 | \redf{ 595 | \mS_N = \inv{\big(\inv{\mS}_0 + \sigma^2 \trans{\mPhi}\mPhi \big)}, \quad \vm_N = \mS_N 596 | \big(\inv{\mS}_0\vm_0 + \sigma^{-2}\trans{\mPhi}\vy \big)} 597 | $$ 598 | } 599 | \eci 600 | (Proof Sketch) 601 | \small 602 | \bci 603 | 604 | \item From the negative-log posterior for general Gaussian prior, 605 | \aleq{ 606 | -\log p(\vth | \set{X},\set{Y}) &= \frac{1}{2\sigma^2} \trans{(\vy - \mPhi\vth)} (\vy - \mPhi\vth) + \bluef{\frac{1}{2}\trans{(\vth - \vm_0)}\inv{\mS}_0(\vth-\vm_0)} + \text{const} 607 | } 608 | 609 | \eci 610 | \end{frame} 611 | 612 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 613 | \begin{frame}{Parameter Posterior Distribution (2)} 614 | 615 | \plitemsep 0.07in 616 | \bci 617 | \small 618 | \item[] 619 | \aleq{ 620 | &= \frac{1}{2} \Big( 621 | \sigma^{-2}\trans{\vy}\vy - \orangef{2\sigma^{-2}\trans{\vy}\mPhi\vth} + \cyanf{\trans{\vth}\sigma^{-2} 622 | \trans{\mPhi}\mPhi\vth} + \cyanf{\trans{\vth}\inv{\mS}_0\vth} -\orangef{2\trans{\vm}_0\inv{\mS}_0\vth} 623 | +\trans{\vm}_0\inv{\mS}_0\vm_0 624 | \Big ) \cr 625 | &=\frac{1}{2} \Big( 626 | \cyanf{\trans{\vth}(\sigma^{-2}\trans{\mPhi}\mPhi + \inv{\mS}_0)\vth} 627 | -\orangef{2\trans{(\sigma^{-2}\trans{\mPhi}\vy + \inv{\mS}_0\vm_0)}\vth} 628 | \Big) + \text{const} 629 | } 630 | \item \cyanf{cyan color}: quadratic term, \orangef{orange color}: linear term 631 | 632 | \item $p(\vth|\cX,\cY) \propto \exp(\text{ quadratic in $\vth$ })$ $\implies$ Gaussian distribution 633 | \item Assume that $p(\vth|\cX,\cY) = \set{N}(\vth|\vm_N, \mS_N),$ and find $\vm_N$ and $\mS_N.$ 634 | \aleq{ 635 | - \log\set{N}(\vth|\vm_N,\mS_N) &= \frac{1}{2}\trans{(\vth-\vm_N)}\inv{\mS}_N(\vth-\vm_N) + \text{const} \cr 636 | &= \frac{1}{2}\Big( 637 | \cyanf{\trans{\vth}\inv{\mS}_N\vth} - \orangef{2\trans{\vm_N}\inv{\mS}_N\vth} + \trans{\vm}_N\inv{\mS}_N\vm_N 638 | \Big) + \text{const} 639 | } 640 | \item Thus, 641 | $ 642 | \cyanf{\inv{\mS}_N = \sigma^{-2}\trans{\mPhi}\mPhi + \inv{\mS}_0} \quad \text{and} \quad 643 | \orangef{ 644 | \trans{\vm}_N\inv{\mS}_N = \trans{(\sigma^{-2}\trans{\mPhi}\vy + \inv{\mS}_0\vm_0)} 645 | } 646 | $ 647 | \eci 648 | 649 | \end{frame} 650 | 651 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 652 | \begin{frame}{Posterior Predictions (1)} 653 | 654 | \plitemsep 0.07in 655 | 656 | \bci 657 | 658 | \item \bluef{Posterior predictive distribution} \hfill \lecturemark{L6(5)} 659 | \red{ 660 | \aleq{ 661 | p(y_* | x_*, \set{X},\set{Y}) &= \int p(y_* | \vx_*, \vth)p(\vth | \set{X},\set{Y})\text{d}\vth \cr 662 | &= \int \set{N}\Big(y_* | \trans{\phi}(\vx_*)\vth, \sigma^2\Big) \set{N}\Big(\vth | \vm_N, \mS_N\Big)\text{d}\vth \cr 663 | &= \set{N}\Big(y_* | \trans{\phi}(\vx_*)\vm_N, \trans{\phi}(\vx_*)\mS_N \phi(\vx_*)+ 664 | \sigma^2 \Big) 665 | }} 666 | 667 | \item The mean $\trans{\phi}(\vx_*)\vm_N$ coincides with the MAP estimate 668 | \eci 669 
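A sketch of Bayesian linear regression following the formulas above (numpy assumed; the prior parameters, noise level, and test input are my own choices). It uses the precision form S_N^{-1} = sigma^{-2} Phi^T Phi + S_0^{-1} from the proof sketch, and evaluates the posterior predictive N(phi(x*)^T m_N, phi(x*)^T S_N phi(x*) + sigma^2).

```python
# Parameter posterior N(m_N, S_N) and posterior predictive for Bayesian linear regression.
import numpy as np

def blr_posterior(Phi, y, m0, S0, sigma):
    S0_inv = np.linalg.inv(S0)
    SN = np.linalg.inv(S0_inv + (Phi.T @ Phi) / sigma ** 2)
    mN = SN @ (S0_inv @ m0 + (Phi.T @ y) / sigma ** 2)
    return mN, SN

def blr_predict(phi_star, mN, SN, sigma):
    return phi_star @ mN, phi_star @ SN @ phi_star + sigma ** 2

# Tiny example: degree-2 polynomial features, zero-mean isotropic Gaussian prior.
rng = np.random.default_rng(1)
x = rng.uniform(-3, 3, size=15)
y = 0.5 * x ** 2 - x + rng.normal(0, 0.3, size=15)
Phi = np.vander(x, 3, increasing=True)          # columns [1, x, x^2]
mN, SN = blr_posterior(Phi, y, m0=np.zeros(3), S0=4.0 * np.eye(3), sigma=0.3)

phi_star = np.array([1.0, 2.0, 4.0])            # features of x* = 2
mean, var = blr_predict(phi_star, mN, SN, 0.3)
print("posterior mean weights:", np.round(mN, 3))     # m_N is also the MAP estimate here
print(f"predictive at x*=2: mean {mean:.3f}, std {np.sqrt(var):.3f}")
```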
| \end{frame} 670 | 671 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 672 | \begin{frame}{Posterior Predictions (2)} 673 | 674 | \mypic{0.95}{L9_posterior_predictive_ex.png} 675 | 676 | \bci 677 | \item BLR: Bayesian Linear Regression 678 | \eci 679 | \end{frame} 680 | 681 | 682 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 683 | \begin{frame}{Computing Marginal Likelihood} 684 | 685 | \plitemsep 0.07in 686 | 687 | \bci 688 | 689 | \item Likelihood: $p(\cY | \cX, \vth),$ \bluef{Marginal likelihood}: $p(\cY | \cX) = \int p(\cY | \cX,\vth)p(\vth)\text{d}\vth$ 690 | 691 | \item Recall that the marginal likelihood is important for model selection via Bayes factor: 692 | $$ 693 | \text{(Posterior odds)} = \frac{\cprob{M_1 \mid \set{D}}}{\cprob{M_2 \mid \set{D}}} = \frac 694 | { 695 | \frac{\cprob{\set{D} \mid M_1}\cprob{M_1}}{\cprob{\set{D}}} 696 | } 697 | { 698 | \frac{\cprob{\set{D} \mid M_2}\cprob{M_2}}{\cprob{\set{D}}} 699 | } 700 | = \underbrace{\frac{\cprob{M_1}}{\cprob{M_2}}}_{\text{Prior odds}} 701 | \underbrace{\frac{\cprob{\set{D} \mid M_1}}{\cprob{\set{D} \mid M_2}}}_{\bluef{\text{Bayes factor}}} 702 | $$ 703 | 704 | \item[] 705 | \aleq{ 706 | p(\cY | \cX) &= \int p(\cY | \cX,\vth)p(\vth)\text{d}\vth = \int \set{N}(\vy | \mPhi\vth, \sigma^2\mI) 707 | \set{N}(\vth | \vm_0,\mS_0)\, \text{d}\vth \cr 708 | & = \set{N}(\vy \mid \mPhi\vm_0, \mPhi\mS_0\trans{\mPhi} + \sigma^2\mI) 709 | } 710 | \eci 711 | \end{frame} 712 | 713 | 714 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 715 | \section{L9(5)} 716 | \begin{frame}{Roadmap} 717 | 718 | \plitemsep 0.1in 719 | 720 | \bce[(1)] 721 | 722 | \item \grayf{Problem Formulation} 723 | \item \grayf{Parameter Estimation: ML} 724 | \item \grayf{Parameter Estimation: MAP} 725 | \item \grayf{Bayesian Linear Regression} 726 | \item \redf{Maximum Likelihood as Orthogonal Projection} 727 | 728 | \ece 729 | \end{frame} 730 | 731 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 732 | \begin{frame}{ML as Orthogonal Projection} 733 | 734 | \plitemsep 0.07in 735 | 736 | \bci 737 | 738 | \item For $f(\vx) = \trans{\vx}\vth + \set{N}(0,\sigma^2),$ $\vth_\ml = \inv{(\trans{\mX}\mX)} \trans{\vX} \vy = \dfrac{\trans{\vX}\vy}{\trans{\mX}\mX} \in \real$ 739 | $$ 740 | \mX\vth_\ml = \dfrac{\vX\trans{\vX}}{\trans{\mX}\mX}\vy 741 | $$ 742 | \vspace{-0.3cm} 743 | \bci 744 | \item Orthogonal projection of $\vy$ onto the one-dimensional subspace spanned by $\mX$ 745 | \eci 746 | 747 | 748 | \item For $f(\vx) = \trans{\phi}(\vx)\vth + \set{N}(0,\sigma^2),$ $\vth_\ml = \inv{(\trans{\mPhi}\mPhi)} \trans{\mPhi} \vy = \dfrac{\trans{\mPhi}\vy}{\trans{\mPhi}\mPhi} \in \real$ 749 | $$ 750 | \mPhi\vth_\ml = \dfrac{\mPhi\trans{\mPhi}}{\trans{\mPhi}\mPhi}\vy 751 | $$ 752 | \vspace{-0.3cm} 753 | \bci 754 | \item Orthogonal projection of $\vy$ onto the $K$-dimensional subspace spanned by columns of $\mPhi$ 755 | \eci 756 | \eci 757 | \end{frame} 758 | 759 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 760 | \begin{frame}{Summary and Other Issues (1)} 761 | 762 | \plitemsep 0.07in 763 | 764 | \bci 765 | 766 | \item Linear regression for Gaussian likelihood and conjugate Gaussian priors. 
Nice analytical results and closed forms 767 | 768 | \item Other forms of likelihoods for other applications (e.g., classification) 769 | 770 | \item GLM (generalized linear model): $y = \sigma \circ f$ ($\sigma$: activation function) 771 | \bci 772 | \item No longer linear in $\vth$ 773 | \item Logistic regression: $\sigma(f) = \dfrac{1}{1+\exp(-f)} \in [0,1]$ (interpreted as the probability of becoming 1) 774 | \item Building blocks of (deep) feedforward neural nets 775 | 776 | \item $\vy = \sigma(\mA \vx + \vb)$. $\mA$: weight matrix, $\vb$: bias vector 777 | \item $K$-layer deep neural nets: $\vx_{k+1} = f_k(\vx_k),$ $f_k(\vx_k) = \sigma_k(\mA_k\vx_k + \vb_k)$ 778 | \eci 779 | 780 | \eci 781 | \end{frame} 782 | 783 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 784 | \begin{frame}{Summary and Other Issues (2)} 785 | 786 | \plitemsep 0.1in 787 | 788 | \bci 789 | 790 | \item Gaussian process 791 | \bci 792 | \item A distribution over parameters $\rightarrow$ a distribution over functions 793 | \item Gaussian process: distribution over functions without detouring via parameters 794 | \item Closely related to BLR and support vector regression, also interpreted as Bayesian neural network with a single hidden layer and the infinite number of units 795 | \eci 796 | 797 | \item Gaussian likelihood, but non-Gaussian prior 798 | \bci 799 | \item When $N \ll D$ (small training data) 800 | \item Prior that enforces sparsity, e.g., Laplace prior 801 | \item A linear regression with the Laplace prior $=$ linear regression with LASSO (L1 regularization) 802 | \eci 803 | \eci 804 | \end{frame} 805 | 806 | 807 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 808 | \begin{frame}{} 809 | \vspace{2cm} 810 | \LARGE Questions? 811 | 812 | 813 | \end{frame} 814 | 815 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 816 | \begin{frame}{Review Questions} 817 | % \tableofcontents 818 | %\plitemsep 0.1in 819 | \bce[1)] 820 | \item 821 | 822 | \ece 823 | \end{frame} 824 | 825 | 826 | \end{document} 827 | -------------------------------------------------------------------------------- /10.PCA/10.PCA-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/10.PCA/10.PCA-2.pdf -------------------------------------------------------------------------------- /10.PCA/10.PCA-4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/10.PCA/10.PCA-4.pdf -------------------------------------------------------------------------------- /10.PCA/10.PCA.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/10.PCA/10.PCA.pdf -------------------------------------------------------------------------------- /10.PCA/L10_PCA_onepicture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/10.PCA/L10_PCA_onepicture.png -------------------------------------------------------------------------------- /10.PCA/L10_dr_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/10.PCA/L10_dr_ex.png 
-------------------------------------------------------------------------------- /10.PCA/L10_latent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/10.PCA/L10_latent.png -------------------------------------------------------------------------------- /10.PCA/L10_mnist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/10.PCA/L10_mnist.png -------------------------------------------------------------------------------- /10.PCA/L10_pca_algorithm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/10.PCA/L10_pca_algorithm.png -------------------------------------------------------------------------------- /10.PCA/L10_pca_picture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/10.PCA/L10_pca_picture.png -------------------------------------------------------------------------------- /10.PCA/L10_variance_diff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/10.PCA/L10_variance_diff.png -------------------------------------------------------------------------------- /11.DensityEstimation/11.GMM-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/11.DensityEstimation/11.GMM-2.pdf -------------------------------------------------------------------------------- /11.DensityEstimation/11.GMM-4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/11.DensityEstimation/11.GMM-4.pdf -------------------------------------------------------------------------------- /11.DensityEstimation/11.GMM.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/11.DensityEstimation/11.GMM.pdf -------------------------------------------------------------------------------- /11.DensityEstimation/L11_Gaussian_fail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/11.DensityEstimation/L11_Gaussian_fail.png -------------------------------------------------------------------------------- /11.DensityEstimation/L11_em_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/11.DensityEstimation/L11_em_ex.png -------------------------------------------------------------------------------- /11.DensityEstimation/L11_gm_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/11.DensityEstimation/L11_gm_ex.png 
-------------------------------------------------------------------------------- /11.DensityEstimation/L11_gmm_gm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/11.DensityEstimation/L11_gmm_gm.png -------------------------------------------------------------------------------- /11.DensityEstimation/main.tex: -------------------------------------------------------------------------------- 1 | %\pdfminorversion=4 2 | \documentclass[handout,fleqn,aspectratio=169]{beamer} 3 | 4 | \input{../myhead} 5 | 6 | \title[]{Lecture 11: Density Estimation \\with Gaussian Mixture Models} 7 | \author{Yi, Yung (이융)} 8 | \institute{Mathematics for Machine Learning\\ \url{https://yung-web.github.io/home/courses/mathml.html} 9 | \\KAIST EE} 10 | \date{\today} 11 | 12 | 13 | \input{../mymath} 14 | \input{../mymacro} 15 | 16 | 17 | %\addtobeamertemplate{footline}{\rule{0.94\paperwidth}{1pt}}{} 18 | 19 | \begin{document} 20 | 21 | \input{../mydefault} 22 | 23 | 24 | 25 | % START START START START START START START START START START START START START 26 | 27 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 28 | \begin{frame}{Warm-Up} 29 | 30 | {\Large Please watch this tutorial video by Luis Serrano on Gaussian Mixture Model.} 31 | 32 | \bigskip 33 | 34 | \bigskip 35 | 36 | \url{https://www.youtube.com/watch?v=q71Niz856KE} 37 | 38 | \end{frame} 39 | 40 | 41 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 42 | \begin{frame}{Roadmap} 43 | 44 | \plitemsep 0.1in 45 | 46 | \bce[(1)] 47 | 48 | \item Gaussian Mixture Model 49 | \item Parameter Learning: MLE 50 | \item Latent-Variable Perspective for Probabilistic Modeling 51 | \item EM Algorithm 52 | \ece 53 | \end{frame} 54 | 55 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 56 | \section{L11(1)} 57 | \begin{frame}{Roadmap} 58 | 59 | \plitemsep 0.1in 60 | 61 | \bce[(1)] 62 | 63 | \item \redf{Gaussian Mixture Model} 64 | \item \grayf{Parameter Learning: MLE 65 | \item Latent-Variable Perspective for Probabilistic Modeling 66 | \item EM Algorithm} 67 | \ece 68 | \end{frame} 69 | 70 | 71 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 72 | \begin{frame}{Density Estimation} 73 | 74 | \plitemsep 0.1in 75 | 76 | \bci 77 | 78 | \item Represent data compactly using a density from a parametric family, e.g., Gaussian or Beta distribution 79 | 80 | \item Parameters of those families can be found by MLE and MAPE 81 | 82 | \item However, there are many cases when simple distributions (e.g., just Gaussian) fail to approximate data. 83 | 84 | \mypic{0.4}{L11_Gaussian_fail.png} 85 | 86 | \eci 87 | \end{frame} 88 | 89 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 90 | \begin{frame}{Mixture Models} 91 | 92 | \plitemsep 0.1in 93 | 94 | \bci 95 | 96 | \item More expressive family of distribution 97 | 98 | \item Idea: Let's mix! 
A \bluef{convex combination} of $K$ ``base'' distributions 99 | \mycolorbox{ 100 | \vspace{-0.2cm} 101 | \aleq{ 102 | p(\vx) = \sum_{k=1}^K \pi_k p_k(\vx), \quad 0 \le \pi_k \le 1, \quad \sum_{k=1}^K \pi_k = 1} 103 | } 104 | \item Multi-modal distributions: Can be used to describe datasets with multiple clusters 105 | 106 | \item Our focus: Gaussian mixture models 107 | 108 | \item Want to finding the parameters using MLE, but \bluef{cannot have the closed form} solution (even with the mixture of Gaussians) $\rightarrow$ some iterative methods needed 109 | \eci 110 | \end{frame} 111 | 112 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 113 | \begin{frame}{Gaussian Mixture Model} 114 | 115 | \mycolorbox{ 116 | \vspace{-0.2cm} 117 | \aleq{ 118 | p(\vx | \vth) = \sum_{k=1}^K \set{N}(\vx | \vmu_k, \msig_k), \quad 0 \le \pi_k \le 1, \quad \sum_{k=1}^K \pi_k = 1, 119 | } 120 | where the parameters $\vth \eqdef \{\vmu_k, \msig_k, \pi_k: k= 1, \ldots, K \}$ 121 | } 122 | 123 | 124 | \vspace{-0.3cm} 125 | \plitemsep 0.01in 126 | 127 | \bci 128 | \item \exam $p(x|\vth) = \bluef{0.5\set{N}(x|-2,1/2)} + \orangef{0.2\set{N}(x|1,2)} + \greenf{0.3\set{N}(x|4,1)}$ 129 | 130 | \mypic{0.4}{L11_gm_ex.png} 131 | \eci 132 | \end{frame} 133 | 134 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 135 | \section{L11(2)} 136 | \begin{frame}{Roadmap} 137 | 138 | \plitemsep 0.1in 139 | 140 | \bce[(1)] 141 | 142 | \item \grayf{Gaussian Mixture Model} 143 | \item \redf{Parameter Learning: MLE} 144 | \item \grayf{Latent-Variable Perspective for Probabilistic Modeling 145 | \item EM Algorithm} 146 | \ece 147 | \end{frame} 148 | 149 | 150 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 151 | \begin{frame}{Parameter Learning: Maximum Likelihood} 152 | 153 | \plitemsep 0.1in 154 | 155 | \bci 156 | \item Given a iid dataset $\set{X}= \{\vx_1, \ldots, \vx_n \},$ the log-likelihood is: 157 | \aleq{ 158 | \cL(\vth) = \log p(\set{X} | \vth) = \sum_{n=1}^N \log p(\vx_n|\vth) = \sum_{n=1}^N \log \sum_{k=1}^K \pi_k 159 | \cN(\vx_n | \vmu_k,\msig_k) 160 | } 161 | 162 | \item $\vth_{\ml} = \arg \min_{\vth} (-\cL(\vth))$ 163 | \item Necessary condition for $\vth_\ml$: $\dfrac{d\cL}{d\vth}\Big|_{\vth_\ml} = 0$ 164 | 165 | \item However, the closed-form solution of $\vth_\ml$ does not exist, so we rely on an iterative algorithm (also called EM algorithm). 166 | 167 | \item We show the algorithm first, and then discuss how we get the algorithm. 168 | \eci 169 | \end{frame} 170 | 171 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 172 | \begin{frame}{Responsibilities} 173 | 174 | \plitemsep 0.1in 175 | 176 | \bci 177 | \item \defi \bluef{Responsibilities.} Given $n$-th data point $\vx_n$ and the parameters $(\vmu_k, \msig_k, \pi_k: k=1, \ldots, K)$, 178 | $$ 179 | r_{nk}= \frac{\pi_k \cN(\vx_n | \vmu_k, \msig_k)}{ \sum_{j}\pi_j \cN(\vx_n | \vmu_j, \msig_j) } 180 | $$ 181 | 182 | \item How much is each component $k$ responsible, if the data $\vx_n$ is sampled from the current mixture model? 183 | 184 | \item $\vec{r}_n = (r_{nk}: k=1, \ldots, K)$ is a probability distribution, so $\sum_{k=1}^K r_{nk} =1$ 185 | 186 | \item Soft assignment of $\vx_n$ to the $K$ mixture components 187 | \eci 188 | \end{frame} 189 | 190 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 191 | \begin{frame}{EM Algorithm: MLE in Gaussian Mixture Models} 192 | 193 | \small 194 | \myblock{EM for MLE in Gaussian Mixture Models} 195 | { 196 | \bce[\red \bf S1.] 
197 | \item Initialize $\vmu_k, \msig_k, \pi_k$ 198 | 199 | \item \bluef{\bf E-step:} Evaluate responsibilities $r_{nk}$ for every data point $\vx_n$ using the current $\vmu_k, \msig_k, \pi_k$: 200 | $$ 201 | \greenf{r_{nk}}= \frac{\pi_k \cN(\vx_n | \vmu_k, \msig_k)}{ \sum_{j}\pi_j \cN(\vx_n | \vmu_j, \msig_j) }, \quad \greenf{N_k} = \sum_{n=1}^N \greenf{r_{nk}} 202 | $$ 203 | 204 | \item \bluef{\bf M-step:} Re-estimate parameters $\vmu_k, \msig_k, \pi_k$ using the current 205 | responsibilities $r_{nk}$: 206 | \aleq{ 207 | \orangef{\vmu_k} = \frac{1}{\greenf{N_k}} \sum_{n=1}^N \greenf{r_{nk}} \vx_n, \ \orangef{\msig_k} = \frac{1}{\greenf{N_k}} 208 | \sum_{n=1}^N \greenf{r_{nk}} (\vx_n - \vmu_k)\trans{(\vx_n - \vmu_k)}, \ \orangef{\pi_k} = \frac{\greenf{N_k}}{N}, 209 | } 210 | and go to \redf{\bf S2.} 211 | \ece 212 | } 213 | \vspace{-0.3cm} 214 | - The update equations in the \bluef{\bf M-step} may look mysterious at this point; they will be derived later. 215 | 216 | % \vspace{-0.5cm} 217 | % \plitemsep 0.1in 218 | % \bci 219 | % \item 220 | % \eci 221 | \end{frame} 222 | 223 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 224 | \begin{frame}{Example: EM Algorithm} 225 | 226 | \mypic{0.7}{L11_em_ex.png} 227 | \end{frame} 228 | 229 | 230 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 231 | \begin{frame}{M-Step: Towards the Zero Gradient} 232 | 233 | \plitemsep 0.07in 234 | 235 | \bci 236 | \item Given $\cX$ and $r_{nk}$ from the E-step, the new updates of $\vmu_k$, $\msig_k$, $\pi_k$ should be made such that the following conditions are satisfied: 237 | \aleq{ 238 | \pd{\cL}{\vmu_k} &= \trans{\vec{0}} \Longleftrightarrow \sum_{n=1}^N \pd{\log p(\vx_n | \vth)}{\vmu_k} = \trans{\vec{0}} \cr 239 | \pd{\cL}{\msig_k} &= \vec{0} \Longleftrightarrow \sum_{n=1}^N \pd{\log p(\vx_n | \vth)}{\msig_k} = \vec{0} \cr\pd{\cL}{\pi_k} &= 0 \Longleftrightarrow \sum_{n=1}^N \pd{\log p(\vx_n | \vth)}{\pi_k} = 0 240 | } 241 | 242 | \item Nice thing: the new updates of $\vmu_k$, $\msig_k$, $\pi_k$ are all expressed in terms of the responsibilities $[r_{nk}]$ 243 | 244 | \item Let's take a look at them one by one!
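% Sketch of the first case (kept as a comment), to make the zero-gradient step above concrete.
% It only uses the chain rule and the definition of $r_{nk}$; the $\msig_k$ and $\pi_k$ cases
% follow the same pattern ($\pi_k$ additionally needs a Lagrange multiplier for $\sum_k \pi_k = 1$).
% \aleq{
% \pd{\log p(\vx_n|\vth)}{\vmu_k}
% = \frac{\pi_k \cN(\vx_n|\vmu_k,\msig_k)}{\sum_j \pi_j \cN(\vx_n|\vmu_j,\msig_j)}
%   \trans{(\vx_n - \vmu_k)}\inv{\msig_k}
% = r_{nk} \trans{(\vx_n - \vmu_k)}\inv{\msig_k},
% }
% so $\sum_n \pd{\log p(\vx_n|\vth)}{\vmu_k} = \trans{\vec{0}}$ gives
% $\sum_n r_{nk}(\vx_n - \vmu_k) = \vec{0}$, i.e., $\vmu_k = \frac{1}{N_k}\sum_n r_{nk}\vx_n$.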
245 | \eci 246 | \end{frame} 247 | 248 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 249 | \begin{frame}{M-Step: Update of $\vmu_k$} 250 | \mycolorbox{ 251 | $$ 252 | \vmu_k^{\text{new}} = \frac{\sum_{n=1}^N r_{nk} \vx_n}{\sum_{n=1}^N r_{nk}}, k=1,\ldots, K 253 | $$ 254 | } 255 | 256 | \plitemsep 0.07in 257 | \bci 258 | \item 259 | \eci 260 | \end{frame} 261 | 262 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 263 | \begin{frame}{M-Step: Update of $\msig_k$} 264 | \mycolorbox{ 265 | $$ 266 | \msig_k^{\text{new}} = \frac{1}{N_k} 267 | \sum_{n=1}^N r_{nk} (\vx_n - \vmu_k)\trans{(\vx_n - \vmu_k)}, k=1,\ldots, K 268 | $$ 269 | } 270 | 271 | \plitemsep 0.07in 272 | \bci 273 | \item 274 | \eci 275 | \end{frame} 276 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 277 | \begin{frame}{M-Step: Update of $\pi_k$} 278 | \mycolorbox{ 279 | $$ 280 | \pi_k^{\text{new}} = \frac{\sum_{n=1}^N r_{nk}}{N}, k=1,\ldots, K 281 | $$ 282 | } 283 | 284 | \plitemsep 0.07in 285 | \bci 286 | \item 287 | \eci 288 | \end{frame} 289 | 290 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 291 | \section{L11(3)} 292 | \begin{frame}{Roadmap} 293 | 294 | \plitemsep 0.1in 295 | 296 | \bce[(1)] 297 | 298 | \item \grayf{Gaussian Mixture Model} 299 | \item \grayf{Parameter Learning: MLE} 300 | \item \redf{Latent-Variable Perspective for Probabilistic Modeling} 301 | \item \grayf{EM Algorithm} 302 | \ece 303 | \end{frame} 304 | 305 | 306 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 307 | \begin{frame}{Latent-Variable Perspective} 308 | 309 | \plitemsep 0.07in 310 | \bci 311 | \item Justify some ad hoc decisions made earlier 312 | \item Allow for a concrete interpretation of the responsibilities as \bluef{posterior distributions} 313 | \item Iterative algorithm for updating the model parameters can be derived in a principled manner 314 | \eci 315 | \vspace{-0.9cm} 316 | \mypic{0.3}{L11_gmm_gm.png} 317 | 318 | \end{frame} 319 | 320 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 321 | \begin{frame}{Generative Process} 322 | 323 | \plitemsep 0.07in 324 | \bci 325 | \item \redf{Latent variable $\vz$}: \bluef{One-hot encoding} random vector $\vz = \trans{[z_1, \ldots, z_K]}$ consisting of $K-1$ many 0s and exactly one 1. 326 | 327 | \item An indicator rv $z_k=1$ represents whether \bluef{$k$-th component is used to generate the data sample} $\vx$ or not. 
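% Because $\vz$ is one-hot, the prior and the likelihood can also be written compactly as
% products (a standard rewriting, not shown on the slide; kept as a comment):
% \aleq{
% p(\vz) = \prod_{k=1}^K \pi_k^{z_k}, \qquad p(\vx|\vz) = \prod_{k=1}^K \cN(\vx|\vmu_k,\msig_k)^{z_k}
% }
% For example, with $K=3$ and $\vz = \trans{[0,1,0]}$, these reduce to $p(\vz)=\pi_2$ and
% $p(\vx|\vz) = \cN(\vx|\vmu_2,\msig_2)$, which matches the sampling procedure below.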
328 | 329 | \item $p(\vx | z_k=1) = \cN(\vx| \vmu_k,\msig_k)$ 330 | \item Prior for $\vz$ with $\pi_k = p(z_k =1)$ 331 | $$ 332 | p(\vz) = \vpi = \trans{[\pi_1, \ldots, \pi_K]}, \quad \sum_{k=1}^K \pi_k = 1 333 | $$ 334 | 335 | \item Sampling procedure 336 | \bce 337 | \item Sample which component to use $z^{(i)} \sim p(\vz)$ 338 | \item Sample data according to $i$-th Gaussian $\vx^{(i)} \sim p(\vx | z^{(i)})$ 339 | \ece 340 | \eci 341 | \end{frame} 342 | 343 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 344 | \begin{frame}{Joint Distribution, Likelihood, and Posterior (1)} 345 | 346 | \plitemsep 0.1in 347 | \bci 348 | \item Joint distribution 349 | $$ 350 | p(\vx,\vz) = \colvec{p(\vx, z_1=1) \\ \vdots \\ p(\vx, z_K=1)} = 351 | \colvec{p(\vx| z_1=1)p(z_1=1) \\ \vdots \\ p(\vx|z_K=1)p(z_K=1)}= 352 | \colvec{\pi_1\cN(\vx|\vmu_1,\msig_1) \\ \vdots \\ \pi_K\cN(\vx|\vmu_K,\msig_K)} 353 | $$ 354 | 355 | \item Likelihood for an arbitrary single data $\vx$: By summing out all latent variables\footnote{In probabilistic PCA, $\vz$ was continuous, so we integrated them out.}, 356 | \aleq{ 357 | p(\vx | \vth) &= \sum_{\vz} p(\vx|\vth,\vz)p(\vz|\vth) = \sum_{k=1}^K p(\vx|\vth,z_k=1)p(z_k=1|\vth) 358 | = \sum_{k=1}^K \pi_k \cN(\vx|\vmu_k, \msig_k) 359 | } 360 | \item For all the data samples $\cX,$ the log-likelihood is: 361 | \aleq{ 362 | \log p(\set{X} | \vth) &= \sum_{n=1}^N \log p(\vx_n|\vth) = \sum_{n=1}^N \log \sum_{k=1}^K \pi_k 363 | \cN(\vx_n | \vmu_k,\msig_k) \hspace{2cm} \lecturemark{\text{Compare: Page 7}} 364 | } 365 | \eci 366 | \end{frame} 367 | 368 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 369 | \begin{frame}{Joint Distribution, Likelihood, and Posterior (2)} 370 | 371 | \plitemsep 0.1in 372 | \bci 373 | 374 | \item Posterior for the $k$-th $z_k$, given an arbitrary single data $\vx$: 375 | \aleq{ 376 | p(z_k=1 | \vx) = \frac{p(z_k=1)p(\vx|z_k=1)}{\sum_{j=1}^K p(z_j=1)p(\vx|z_j=1)} 377 | = \frac{\pi_k 378 | \cN(\vx | \vmu_k,\msig_k)}{\sum_{j=1}^K\pi_j 379 | \cN(\vx | \vmu_j,\msig_j)} 380 | } 381 | \item Now, for all data samples $\set{X},$ each data $\vx_n$ has $\vz_n= \trans{[z_{n1}, \ldots, z_{nK}]},$ but with the same prior $\vpi.$ 382 | \aleq{ 383 | p(z_{nk}=1 | \vx_n) = \frac{p(z_{nk}=1)p(\vx_n|z_{nk}=1)}{\sum_{j=1}^K p(z_{nj}=1)p(\vx_n|z_{nj}=1)} 384 | = \frac{\pi_k 385 | \cN(\vx_n | \vmu_k,\msig_k)}{\sum_{j=1}^K\pi_j 386 | \cN(\vx_n | \vmu_j,\msig_j)} = r_{nk} 387 | } 388 | 389 | \item Responsibilities are mathematically interpreted as \bluef{posterior distributions.} 390 | \eci 391 | \end{frame} 392 | 393 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 394 | \section{L11(4)} 395 | \begin{frame}{Roadmap} 396 | 397 | \plitemsep 0.1in 398 | 399 | \bce[(1)] 400 | 401 | \item \grayf{Gaussian Mixture Model} 402 | \item \grayf{Parameter Learning: MLE} 403 | \item \grayf{Latent-Variable Perspective for Probabilistic Modeling} 404 | \item \redf{EM Algorithm} 405 | \ece 406 | \end{frame} 407 | 408 | 409 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 410 | \begin{frame}{Revisiting EM Algorithm for MLE} 411 | 412 | \mytwocols{0.5} 413 | { 414 | \small 415 | \bce[\red \bf S1.] 
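% For the GMM specifically, $\vz$ is discrete, so the expectation in the E-step is a sum over
% the one-hot configurations of each $\vz_n$; with the responsibilities computed from the
% current $\vth^{(t)}$ it works out to (a standard result, kept as a comment):
% \aleq{
% Q(\vth|\vth^{(t)}) = \sum_{n=1}^N \sum_{k=1}^K r_{nk}\Big(\log \pi_k + \log \cN(\vx_n|\vmu_k,\msig_k)\Big),
% }
% and maximizing $Q$ over $\vmu_k, \msig_k, \pi_k$ (with $\sum_k \pi_k = 1$) recovers exactly
% the M-step updates in the left column.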
416 | \item Initialize $\vmu_k, \msig_k, \pi_k$ 417 | 418 | \item \bluef{\bf E-step:} 419 | $$ 420 | r_{nk}= \frac{\pi_k \cN(\vx_n | \vmu_k, \msig_k)}{ \sum_{j}\pi_j \cN(\vx_n | \vmu_j, \msig_j) } 421 | $$ 422 | 423 | \item \bluef{\bf M-step:} Update $\vmu_k, \msig_k, \pi_k$ using $r_{nk}$ 424 | and go to \redf{\bf S2.} 425 | \ece 426 | } 427 | { 428 | \small 429 | \bci 430 | \item \bluef{\bf E-step.} \orangef{Expectation} over $\vz | \vx, \vth^{(t)}$: 431 | Given the current $\vth^{(t)} = (\vmu_k, \msig_k, \pi_k),$ calculate the expected (complete-data) log-likelihood 432 | \aleq{ 433 | Q(\vth|\vth^{(t)}) &= \expecti{\vz|\vx,\vth^{(t)}}{\log p(\vx,\vz | \vth)} \cr 434 | & = \int \log p(\vx,\vz | \vth) p(\vz|\vx,\vth^{(t)})\text{d}\vz 435 | } 436 | 437 | \item \bluef{\bf M-step.} \orangef{Maximization} of $Q(\vth|\vth^{(t)})$ computed in the E-step, which yields the new model parameters. 438 | 439 | \eci 440 | } 441 | 442 | \bci 443 | \item Only a local optimum is guaranteed, because the original optimization problem is not convex in general. \hfill \lecturemark{L7(4)} 444 | \eci 445 | 446 | \end{frame} 447 | 448 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 449 | \begin{frame}{Other Issues} 450 | 451 | \plitemsep 0.1in 452 | \bci 453 | \item Model selection for finding a good $K$, e.g., using nested cross-validation 454 | 455 | \item Application: Clustering 456 | \bci 457 | \item K-means: Treat the means in GMM as cluster centers and ignore the covariances. 458 | \item K-means: hard assignment, GMM: soft assignment 459 | \eci 460 | 461 | \item EM algorithm: Highly generic in the sense that it can be used for parameter learning in general latent-variable models 462 | 463 | \item Standard criticisms of MLE, such as overfitting, still apply. A fully Bayesian approach that puts priors on the parameters is also possible, but it is not covered in these notes. 464 | 465 | \item Other density estimation methods 466 | \bci 467 | \item Histogram-based method: non-parametric method 468 | \item Kernel-density estimation: non-parametric method 469 | \eci 470 | \eci 471 | \end{frame} 472 | 473 | 474 | 475 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 476 | \begin{frame}{} 477 | \vspace{2cm} 478 | \LARGE Questions?
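% Side note on the K-means bullet in "Other Issues" above (a standard limiting argument, kept
% as a comment): if all covariances are fixed to $\msig_k = \epsilon \mI$ and $\epsilon \to 0$, then
% \aleq{
% r_{nk} = \frac{\pi_k \exp\big(-\norm{\vx_n-\vmu_k}^2/(2\epsilon)\big)}
% {\sum_j \pi_j \exp\big(-\norm{\vx_n-\vmu_j}^2/(2\epsilon)\big)} \to
% \begin{cases} 1 & \text{if } k = \arg\min_j \norm{\vx_n-\vmu_j}^2 \\ 0 & \text{otherwise,} \end{cases}
% }
% i.e., the soft assignments become the hard assignments of K-means.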
479 | 480 | 481 | \end{frame} 482 | 483 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 484 | \begin{frame}{Review Questions} 485 | % \tableofcontents 486 | %\plitemsep 0.1in 487 | \bce[1)] 488 | \item 489 | 490 | \ece 491 | \end{frame} 492 | 493 | 494 | \end{document} 495 | -------------------------------------------------------------------------------- /12.SVM/12.SVM-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/12.SVM/12.SVM-2.pdf -------------------------------------------------------------------------------- /12.SVM/12.SVM-4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/12.SVM/12.SVM-4.pdf -------------------------------------------------------------------------------- /12.SVM/12.SVM.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/12.SVM/12.SVM.pdf -------------------------------------------------------------------------------- /12.SVM/L12_disthyper.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/12.SVM/L12_disthyper.png -------------------------------------------------------------------------------- /12.SVM/L12_halfspace.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/12.SVM/L12_halfspace.png -------------------------------------------------------------------------------- /12.SVM/L12_hingeloss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/12.SVM/L12_hingeloss.png -------------------------------------------------------------------------------- /12.SVM/L12_kernel_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/12.SVM/L12_kernel_ex.png -------------------------------------------------------------------------------- /12.SVM/L12_soft_hard_svm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/12.SVM/L12_soft_hard_svm.png -------------------------------------------------------------------------------- /12.SVM/L12_softsvm_geo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/12.SVM/L12_softsvm_geo.png -------------------------------------------------------------------------------- /12.SVM/dist_hyperplane.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/12.SVM/dist_hyperplane.pptx -------------------------------------------------------------------------------- /12.SVM/main.tex: -------------------------------------------------------------------------------- 1 | %\pdfminorversion=4 2 | 
\documentclass[handout,fleqn,aspectratio=169]{beamer} 3 | 4 | \input{../myhead} 5 | 6 | \title[]{Lecture 12: Classification with Support Vector Machines} 7 | \author{Yi, Yung (이융)} 8 | \institute{Mathematics for Machine Learning\\ \url{https://yung-web.github.io/home/courses/mathml.html} 9 | \\KAIST EE} 10 | \date{\today} 11 | 12 | 13 | \input{../mymath} 14 | \input{../mymacro} 15 | 16 | 17 | %\addtobeamertemplate{footline}{\rule{0.94\paperwidth}{1pt}}{} 18 | 19 | \begin{document} 20 | 21 | \input{../mydefault} 22 | 23 | 24 | 25 | % START START START START START START START START START START START START START 26 | 27 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 28 | \begin{frame}{Warm-Up} 29 | 30 | {\Large Please watch this tutorial video by Luis Serrano on Support Vector Machine.} 31 | 32 | \bigskip 33 | 34 | \bigskip 35 | 36 | \url{https://youtu.be/Lpr__X8zuE8} 37 | 38 | \end{frame} 39 | 40 | 41 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 42 | \begin{frame}{Roadmap} 43 | 44 | \plitemsep 0.1in 45 | 46 | \bce[(1)] 47 | 48 | \item Story and Separating Hyperplanes 49 | \item Primal SVM: Hard SVM 50 | \item Primal SVM: Soft SVM 51 | \item Dual SVM 52 | \item Kernels 53 | \item Numerical Solution 54 | 55 | \ece 56 | \end{frame} 57 | 58 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 59 | \section{L12(1)} 60 | \begin{frame}{Roadmap} 61 | 62 | \plitemsep 0.1in 63 | 64 | \bce[(1)] 65 | 66 | \item \redf{Story and Separating Hyperplanes} 67 | \item \grayf{Primal SVM: Hard SVM 68 | \item Primal SVM: Soft SVM 69 | \item Dual SVM 70 | \item Kernels 71 | \item Numerical Solution} 72 | 73 | \ece 74 | \end{frame} 75 | 76 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 77 | \begin{frame}{Storyline} 78 | 79 | \plitemsep 0.1in 80 | 81 | \bci 82 | 83 | \item (Binary) classification vs. regression 84 | 85 | \item A classification predictor $f:\realD \mapsto \{+1, -1 \},$ where $D$ is the dimension of features. 86 | \item Supervised learning, as in regression, with a given dataset $\{(\vx_1,y_1), \ldots, (\vx_N,y_N) \},$ where our task is to learn the model parameters that produce the smallest classification error. 87 | 88 | \item SVM 89 | \bci 90 | \item Geometric way of thinking about supervised learning 91 | \item Relying on empirical risk minimization 92 | \item Binary classification = Drawing a separating hyperplane 93 | \item Various interpretations: geometric view, loss function view, and the view from convex hulls of data points 94 | \eci 95 | \eci 96 | \end{frame} 97 | 98 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 99 | \begin{frame}{Hard SVM vs. 
Soft SVM} 100 | 101 | \mypic{0.55}{L12_soft_hard_svm.png} 102 | 103 | \plitemsep 0.1in 104 | 105 | \bci 106 | 107 | \item Hard SVM: Data are linearly separable, and thus no classification errors are allowed 108 | 109 | \item Soft SVM: Data are not linearly separable, and thus some classification errors are allowed 110 | \eci 111 | \end{frame} 112 | 113 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 114 | \begin{frame}{Separating Hyperplane} 115 | 116 | \plitemsep 0.07in 117 | 118 | \bci 119 | 120 | \item \bluef{Hyperplane} in $\realD$ is a set: 121 | $\{x \mid \trans{a}x=b\}$ where $a\in\realD, a\neq 0, b\in\real$ \hfill \lecturemark{L7(3)} 122 | 123 | In other words, $\{ x \mid \trans{a}(x-x_0) =0\},$ where $x_0$ is any point in 124 | the hyperplane, i.e., $\trans{a} x_0 = b.$ 125 | 126 | \mysmalltwocols{0.2} 127 | { 128 | \item Divides $\realD$ into two {\blue halfspaces}: 129 | $\{x|\trans{a}x\leq b\}$ and $\{x|\trans{a}x>b\}$ 130 | } 131 | { 132 | \vspace{-0.3cm} 133 | \mypic{0.7}{L12_halfspace.png} 134 | } 135 | \vspace{-0.2cm} 136 | \item In our problem, we consider the hyperplane $\trans{\vw}\vx + b=0,$ where $\vw$ and $b$ are the parameters of the model. 137 | 138 | \item Classification logic 139 | \aleq{ 140 | \begin{cases} 141 | \trans{\vw}\vx_n + b \geq 0 & \ \text{when} \ y_n = +1\cr 142 | \trans{\vw}\vx_n + b < 0 & \ \text{when} \ y_n = -1 143 | \end{cases} 144 | \implies \redf{y_n \big(\trans{\vw}\vx_n +b \big) \geq 0} 145 | } 146 | 147 | % \bci 148 | % \item $\trans{\vw}\vx_n + b \geq 0$ when $y_n = +1$ 149 | % \item $\trans{\vw}\vx_n + b < 0$ when $y_n = -1$ 150 | % \eci 151 | \eci 152 | \end{frame} 153 | 154 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 155 | \begin{frame}{Distance between Two Hyperplanes} 156 | 157 | \plitemsep 0.07in 158 | 159 | \bci 160 | 161 | \item Consider two hyperplanes $\trans{\vw}\vx - b =0$ and $\trans{\vw}\vx - b= r$, where we assume $r >0.$ 162 | 163 | \item \question What is the distance\footnote{The shortest distance between the two hyperplanes.} between the two hyperplanes? Answer: \bluef{$\dfrac{r}{\norm{w}}$} 164 | \eci 165 | 166 | \vspace{-0.7cm} 167 | \mypic{0.5}{L12_disthyper.png} 168 | 169 | % \mysmalltwocols{0.4} 170 | % { 171 | % } 172 | % { 173 | 174 | % } 175 | 176 | 177 | \end{frame} 178 | 179 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 180 | \section{L12(2)} 181 | \begin{frame}{Roadmap} 182 | 183 | \plitemsep 0.1in 184 | 185 | \bce[(1)] 186 | 187 | \item \grayf{Story and Separating Hyperplanes} 188 | \item \redf{Primal SVM: Hard SVM} 189 | \item \grayf{Primal SVM: Soft SVM 190 | \item Dual SVM 191 | \item Kernels 192 | \item Numerical Solution} 193 | 194 | \ece 195 | \end{frame} 196 | 197 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 198 | \begin{frame}{Hard Support Vector Machine} 199 | 200 | \plitemsep 0.07in 201 | 202 | \bci 203 | 204 | \item Assume that the data points are linearly separable. 205 | 206 | \item Goal: Find the hyperplane that maximizes the margin between the positive and the negative samples 207 | 208 | \item Given the training dataset $\{(\vx_1,y_1), \ldots, (\vx_N,y_N) \}$ 209 | and a hyperplane $\trans{\vw}\vx + b =0,$ what is the constraint that all data points are $\frac{r}{\norm{w}}$-away from the hyperplane? 
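% One way to verify the $\frac{r}{\norm{\vw}}$ answer on the "Distance between Two Hyperplanes"
% frame above (kept as a comment): take $\vx_a$ with $\trans{\vw}\vx_a - b = 0$ and move along
% the normal direction, $\vx = \vx_a + \lambda \vw$. Requiring $\trans{\vw}\vx - b = r$ gives
% \aleq{
% \trans{\vw}(\vx_a + \lambda\vw) - b = \lambda \norm{\vw}^2 = r
% \ \Longrightarrow \ \lambda = \frac{r}{\norm{\vw}^2},
% }
% so the (shortest) distance is $\norm{\lambda \vw} = \frac{r}{\norm{\vw}}$.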
210 | $$ 211 | y_n \big(\trans{\vw}\vx_n +b \big) \geq \frac{r}{\norm{\vw}} 212 | $$ 213 | 214 | \item Note that $r$ and $\norm{w}$ are scaled together, so if we fix $\norm{w}=1$, then 215 | $$ 216 | y_n \big(\trans{\vw}\vx_n +b \big) \geq r 217 | $$ 218 | 219 | \eci 220 | \end{frame} 221 | 222 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 223 | \begin{frame}{Hard SVM: Formulation 1} 224 | 225 | \plitemsep 0.07in 226 | 227 | \bci 228 | 229 | \item Maximize the margin, such that all the training data points are well-classified into their classes ($+$ or $-$) 230 | \mycolorbox 231 | { 232 | \vspace{-0.3cm} 233 | \aleq{ 234 | \max_{\vw, b, r} \quad &r \cr 235 | \text{subject to} \quad & y_n \big(\trans{\vw}\vx_n +b \big) \geq r, \ \text{for all} \ n=1,\ldots, N, \quad \norm{\vw}=1, \quad r>0 236 | } 237 | } 238 | 239 | \eci 240 | \end{frame} 241 | 242 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 243 | \begin{frame}{Formulation 2 (1)} 244 | 245 | \mycolorbox 246 | { 247 | \aleq{ 248 | \max_{\vw, b, r} \quad &r \cr 249 | \text{subject to} \quad & y_n \big(\trans{\vw}\vx_n +b \big) \geq r, \ \text{for all} \ n=1,\ldots, N, \quad \norm{\vw}=1, \quad r>0 250 | } 251 | } 252 | \plitemsep 0.07in 253 | \bci 254 | 255 | \item Since $\norm{\vw}=1,$ reformulate $\vw$ by $\vw'$ as: 256 | $y_n \Big(\dfrac{\trans{\vw'}}{\norm{\vw'}}\vx_n +b \Big) \geq r$ 257 | \item Change the objective from $r$ to $r^2.$ 258 | \item Define $\vw''$ and $b''$ by rescaling the constraint: 259 | \aleq{ 260 | y_n \Big(\frac{\trans{\vw'}}{\norm{\vw'}}\vx_n +b \Big) \geq r \Longleftrightarrow 261 | y_n \Big(\trans{\vw''}\vx_n +b'' \Big) \geq 1, \quad 262 | \vw'' = \frac{\vw'}{\norm{\vw'}r} \ \text{and} \ b'' = \frac{b}{r} 263 | } 264 | \eci 265 | \end{frame} 266 | 267 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 268 | \begin{frame}{Formulation 2 (2)} 269 | 270 | \plitemsep 0.07in 271 | \bci 272 | 273 | \item Note that $\norm{\vw''} = \frac{1}{r}$ 274 | \item Thus, we have the following reformulated problem: 275 | \mycolorbox 276 | { 277 | \vspace{-0.3cm} 278 | \aleq{ 279 | \max_{\vw'', b''} \quad &\frac{1}{\norm{\vw''}^2} \cr 280 | \text{subject to} \quad & y_n \big(\trans{\vw''}\vx_n +b'' \big) \geq 1, \ \text{for all} \ n=1,\ldots, N, 281 | } 282 | } 283 | = 284 | 285 | \mycolorbox 286 | { 287 | \vspace{-0.3cm} 288 | \aleq{ 289 | \min_{\vw, b} \quad &\frac{1}{2} \norm{\vw}^2 \cr 290 | \text{subject to} \quad & y_n \big(\trans{\vw}\vx_n +b \big) \geq 1, \ \text{for all} \ n=1,\ldots, N, 291 | } 292 | } 293 | 294 | 295 | \eci 296 | \end{frame} 297 | 298 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 299 | \begin{frame}{Understanding Formulation 2 Intuitively} 300 | 301 | \plitemsep 0.07in 302 | \bci 303 | 304 | \item Given the training dataset $\{(\vx_1,y_1), \ldots, (\vx_N,y_N) \}$ 305 | and a hyperplane $\trans{\vw}\vx + b =0,$ what is the constraint that all data points are $\frac{r}{\norm{w}}$-away from the hyperplane? 306 | $$ 307 | y_n \big(\trans{\vw}\vx_n +b \big) \geq \frac{r}{\norm{\vw}} 308 | $$ 309 | 310 | \item \redf{Formulation 1.} Note that $r$ and $\norm{w}$ are scaled together, so if we fix $\norm{w}=1$, then 311 | $$ 312 | y_n \big(\trans{\vw}\vx_n +b \big) \geq r. 313 | $$ 314 | And, \bluef{maximize $r.$} 315 | 316 | \item \redf{Formulation 2.} If we fix $r=1,$ then 317 | $$ 318 | y_n \big(\trans{\vw}\vx_n +b \big) \geq 1. 
319 | $$ 320 | And, minimize $\norm{\vw}$ 321 | \eci 322 | \end{frame} 323 | 324 | 325 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 326 | \section{L12(3)} 327 | \begin{frame}{Roadmap} 328 | 329 | \plitemsep 0.1in 330 | 331 | \bce[(1)] 332 | 333 | \item \grayf{Story and Separating Hyperplanes} 334 | \item \grayf{Primal SVM: Hard SVM} 335 | \item \redf{Primal SVM: Soft SVM} 336 | \item \grayf{Dual SVM 337 | \item Kernels 338 | \item Numerical Solution} 339 | 340 | \ece 341 | \end{frame} 342 | 343 | 344 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 345 | \begin{frame}{Soft SVM: Geometric View} 346 | 347 | \plitemsep 0.07in 348 | \bci 349 | 350 | \item Now we allow some classification errors, because it's not linearly separable. 351 | 352 | \item Introduce a slack variable that quantifies how much errors will be allowed in my optimization problem 353 | \mytwocols{0.6} 354 | { 355 | \small 356 | \item $\vxi = (\xi_n: n=1, \ldots, N)$ 357 | \item $\xi_n$: slack for the $n$-th sample $(\vx_n,y_n)$ 358 | \begin{tcolorbox}[colback=red!5!white,colframe=red!75!black] 359 | \vspace{-0.3cm} 360 | \aleq{ 361 | \min_{\vw, b} \quad &\frac{1}{2} \norm{\vw}^2 +C\sum_{n=1}^N \xi_n \cr 362 | \text{subject to} \quad & y_n \big(\trans{\vw}\vx_n +b \big) \geq 1 - \xi_n,\cr 363 | & \xi_n \geq 0, \qquad \text{for all} \ n 364 | } 365 | \end{tcolorbox} 366 | 367 | \item $C$: Trade-off between width and slack 368 | } 369 | { 370 | %\vspace{-0.4cm} 371 | \mypic{0.75}{L12_softsvm_geo.png} 372 | } 373 | 374 | \eci 375 | \end{frame} 376 | 377 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 378 | \begin{frame}{Soft SVM: Loss Function View (1)} 379 | 380 | \plitemsep 0.07in 381 | \bci 382 | 383 | \item From the perspective of empirical risk minimizaiton 384 | 385 | \item Loss function design 386 | \bci 387 | \item \bluef{zero-one loss} $\mathbf{1}(f(x_n) \neq y_n)$: \# of mismatches between the prediction and the label $\implies$ combinatorial optimization (typically NP-hard) 388 | 389 | \item \bluef{hinge loss} 390 | $$ 391 | \ell(t) = \max(0,1-t), \ \text{where} \ t = y f(\vx) = y(\trans{\vw}\vx + b) 392 | $$ 393 | 394 | \mysmalltwocols{0.4} 395 | { 396 | \bci 397 | \item If $\vx$ is really at the correct side, $t \geq 1$ $\rightarrow$ $\ell(t) =0$ 398 | \item If $\vx$ is at the correct side, but too close to the boundary, $0 < t < 1$ \\$\rightarrow$ $0< \ell(t) =1-t <1$ 399 | \item If $\vx$ is at the wrong side, $ t < 0$ \\$\rightarrow$ $1 < \ell(t) =1-t$ 400 | \eci 401 | } 402 | { 403 | \mypic{0.8}{L12_hingeloss.png} 404 | } 405 | 406 | \eci 407 | 408 | \eci 409 | \end{frame} 410 | 411 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 412 | \begin{frame}{Soft SVM: Loss Function View (2)} 413 | 414 | \mycolorbox{ 415 | \vspace{-0.3cm} 416 | \aleq{ 417 | \min_{\vw, b} \ \text{(regularizer + loss)} = \min_{\vw, b} \quad \frac{1}{2} \norm{\vw}^2 +C\sum_{n=1}^N \max \{0,1- y(\trans{\vw}\vx + b) \} 418 | } 419 | } 420 | \plitemsep 0.1in 421 | \bci 422 | 423 | \item $\frac{1}{2}\norm{\vw}^2$: L2-regularizer (margin maximization = regularization) 424 | 425 | \item $C$: regularization parameter, which moves from the regularization term to the loss term 426 | \item Why this loss function view = geometric view? 
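% Tiny numerical illustration of the three hinge-loss cases on the previous frame (kept as a
% comment): with $t = y(\trans{\vw}\vx + b)$,
% \aleq{
% t = 1.5 \Rightarrow \ell(t) = 0, \qquad
% t = 0.4 \Rightarrow \ell(t) = 0.6, \qquad
% t = -0.2 \Rightarrow \ell(t) = 1.2,
% }
% i.e., no penalty beyond the margin, a small penalty inside the margin, and a penalty larger
% than 1 on the wrong side.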
427 | \aleq{ 428 | \min_t \max(0,1-t) \Longleftrightarrow \min_{\xi,t} \xi, \ \text{subject to} \ \xi \geq 0, \ \xi \geq 1-t 429 | } 430 | 431 | \eci 432 | \end{frame} 433 | 434 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 435 | \section{L12(4)} 436 | \begin{frame}{Roadmap} 437 | 438 | \plitemsep 0.1in 439 | 440 | \bce[(1)] 441 | 442 | \item \grayf{Story and Separating Hyperplanes} 443 | \item \grayf{Primal SVM: Hard SVM} 444 | \item \grayf{Primal SVM: Soft SVM} 445 | \item \red{Dual SVM} 446 | \item \grayf{Kernels 447 | \item Numerical Solution} 448 | 449 | \ece 450 | \end{frame} 451 | 452 | 453 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 454 | \begin{frame}{Dual SVM: Idea} 455 | 456 | \begin{tcolorbox}[colback=red!5!white,colframe=red!75!black] 457 | \vspace{-0.3cm} 458 | \aleq{ 459 | \min_{\vw, b} \quad &\frac{1}{2} \norm{\vw}^2 +C\sum_{n=1}^N \xi_n \cr 460 | \text{subject to} \quad & y_n \big(\trans{\vw}\vx_n +b \big) \geq 1 - \xi_n, \ \xi_n \geq 0, \quad \text{for all} \ n 461 | } 462 | \end{tcolorbox} 463 | 464 | \vspace{-0.3cm} 465 | \plitemsep 0.05in 466 | \bci 467 | 468 | \item The above primal problem is a convex optimization problem. 469 | 470 | \item Let's apply Lagrange multipliers, find another formulation, and see what other nice properties are shown \hfill \lecturemark{L7(2), L7(4)} 471 | 472 | \item Convert the problem into "$\leq$" constraints, so as to apply \redf{min-min-max} rule 473 | \mycolorbox{ 474 | \vspace{-0.3cm} 475 | \aleq{ 476 | \min_{\vw, b} \ \frac{1}{2} \norm{\vw}^2 +C\sum_{n=1}^N \xi_n, \ 477 | \text{s.t.} \ -y_n \big(\trans{\vw}\vx_n +b \big) \leq -1 + \xi_n, \ -\xi_n \leq 0, \quad \text{for all} \ n 478 | } 479 | } 480 | 481 | % \item Lagrangian 482 | % \aleq{ 483 | % \cL(\vw, b, \vxi, \valpha, \vgamma) = \frac{1}{2} \norm{\vw}^2 +C\sum_{n=1}^N \xi_n 484 | % - \sum_{n=1}^N \alpha_n\Big[y_n \big(\trans{\vw}\vx_n +b \big) -1 + \xi_n \Big] - \sum_{n=1}^N \gamma_n \xi_n 485 | % } 486 | 487 | \eci 488 | 489 | 490 | \end{frame} 491 | 492 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 493 | \begin{frame}{Applying Lagrange Multipliers (1)} 494 | 495 | \mycolorbox{ 496 | \vspace{-0.3cm} 497 | \aleq{ 498 | \min_{\vw, b} \ \frac{1}{2} \norm{\vw}^2 +C\sum_{n=1}^N \xi_n, \ 499 | \text{s.t.} \ -y_n \big(\trans{\vw}\vx_n +b \big) \leq -1 + \xi_n, \ -\xi_n \leq 0, \quad \text{for all} \ n 500 | } 501 | } 502 | \vspace{-0.5cm} 503 | \plitemsep 0.05in 504 | \bci 505 | 506 | \item Lagrangian with multipliers $\alpha_n \geq 0$ and $\gamma_n \geq 0$ 507 | \aleq{ 508 | \cL(\vw, b, \vxi, \valpha, \vgamma) = \frac{1}{2} \norm{\vw}^2 +C\sum_{n=1}^N \xi_n 509 | - \sum_{n=1}^N \alpha_n\Big[y_n \big(\trans{\vw}\vx_n +b \big) -1 + \xi_n \Big] - \sum_{n=1}^N \gamma_n \xi_n 510 | } 511 | 512 | \item Dual function: $\cD(\valpha,\vgamma) = \inf_{\vw, b, \vxi} \cL(\vw, b, \vxi, \valpha, \vgamma)$ for which the followings should be met: 513 | \small 514 | \aleq{ 515 | \text{\blue (D1)} \ \pd{\cL}{\vw} = \trans{\vw} - \sum_{n=1}^N \alpha_n y_n \trans{\vx}_n = 0, \ \text{\blue (D2)} \ \pd{\cL}{b} = \sum_{n=1}^N \alpha_n y_n =0 , \ \text{(\blue D3)} \ \pd{\cL}{\xi_n} = C - \alpha_n - \gamma_n = 0 516 | } 517 | \eci 518 | 519 | 520 | \end{frame} 521 | 522 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 523 | \begin{frame}{Applying Lagrange Multipliers (2)} 524 | 525 | \plitemsep 0.07in 526 | \bci 527 | 528 | \item Dual function $\cD(\valpha,\vgamma) = \inf_{\vw, b, \vxi} \cL(\vw, b, \vxi, \valpha, \vgamma)$ with \bluef{(D1)} is given by: 529 | 
\aleq{ 530 | \cD(\valpha,\vgamma) &= \frac{1}{2} \sum_{i=1}^N \sum_{j=1}^N y_i y_j \alpha_i \alpha_j 531 | \inner{\vx_i}{\vx_j} - \sum_{i=1}^N y_i \alpha_i \inner{\sum_{j=1}^N y_j \alpha_j \vx_j}{\vx_i} -b \redf{\sum_{i=1}^N y_i \alpha_i} \cr 532 | & + \sum_{i=1}^N \alpha_i + \sum_{i=1}^N \magenf{(C-\alpha_i -\gamma_i)}\xi_i 533 | } 534 | 535 | \item Combining the first two terms, and using \redf{(D2)} and \magenf{(D3)}, the above is simplified into: 536 | \aleq{ 537 | \cD(\valpha,\vgamma) = -\frac{1}{2} \sum_{i=1}^N \sum_{j=1}^N y_i y_j \alpha_i \alpha_j 538 | \inner{\vx_i}{\vx_j} + \sum_{i=1}^N \alpha_i 539 | } 540 | 541 | \item $\alpha_i, \gamma_i \geq 0$ and $C-\alpha_i-\gamma_i =0$ $\implies$ $ 0 \le \alpha_i \le C$ 542 | \eci 543 | 544 | 545 | \end{frame} 546 | 547 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 548 | \begin{frame}{Dual SVM} 549 | 550 | \plitemsep 0.07in 551 | \bci 552 | 553 | \item (Lagrangian) Dual Problem: \redf{maximize $\cD(\valpha,\vgamma)$}, i.e., minimize $-\cD(\valpha,\vgamma)$: 554 | \mycolorbox 555 | { 556 | \vspace{-0.3cm} 557 | \aleq{ 558 | \min_{\valpha} \quad & \frac{1}{2} \sum_{i=1}^N \sum_{j=1}^N y_i y_j \alpha_i \alpha_j 559 | \inner{\vx_i}{\vx_j} - \sum_{i=1}^N \alpha_i \cr 560 | \text{subject to} \quad& \sum_{i=1}^N y_i \alpha_i =0, \quad 0 \le \alpha_i \le C, \ \forall i=1, \ldots, N 561 | } 562 | \vspace{-0.2cm} 563 | } 564 | \item Primal SVM: the number of parameters scales as \bluef{the number of features ($D$)} 565 | 566 | \item Dual SVM 567 | \bci 568 | \item the number of parameters scales as \bluef{the number of training data ($N$)} 569 | \item only depends on the inner products of individual training data points $\inner{\vx_i}{\vx_j}$ $\rightarrow$ allows the application of \redf{kernels} 570 | \eci 571 | 572 | \eci 573 | \end{frame} 574 | 575 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 576 | \section{L12(5)} 577 | \begin{frame}{Roadmap} 578 | 579 | \plitemsep 0.1in 580 | 581 | \bce[(1)] 582 | 583 | \item \grayf{Story and Separating Hyperplanes} 584 | \item \grayf{Primal SVM: Hard SVM} 585 | \item \grayf{Primal SVM: Soft SVM} 586 | \item \grayf{Dual SVM} 587 | \item \redf{Kernels 588 | \item Numerical Solution} 589 | 590 | \ece 591 | \end{frame} 592 | 593 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 594 | \begin{frame}{Kernel} 595 | 596 | \mytwocols{0.7} 597 | { 598 | \bigskip 599 | 600 | \plitemsep 0.1in 601 | \bci 602 | 603 | \item Modularity: Using the feature transformation $\vphi(\vx),$ dual SVMs can be modularized 604 | $$ 605 | \inner{\vx_i}{\vx_j} \implies \inner{\vphi(\vx_i)}{\vphi(\vx_j)} 606 | $$ 607 | 608 | \item Similarity function $k: \cX \times \cX \mapsto \real$, $k(\vx_i,\vx_j) = \inner{\vphi(\vx_i)}{\vphi(\vx_j)}$ 609 | 610 | \item Kernel matrix, Gram matrix: must be symmetric and positive semidefinite 611 | 612 | \item Examples: polynomial kernel, Gaussian radial basis function, rational quadratic kernel 613 | \eci 614 | } 615 | { 616 | \mypic{0.9}{L12_kernel_ex.png} 617 | } 618 | 619 | \end{frame} 620 | 621 | 622 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 623 | \begin{frame}{Numerical Solution} 624 | 625 | \plitemsep 0.07in 626 | \bci 627 | 628 | \item 629 | 630 | \eci 631 | \end{frame} 632 | 633 | 634 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 635 | \begin{frame}{} 636 | \vspace{2cm} 637 | \LARGE Questions?
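% Small worked example for the "Kernel" frame above (standard, kept as a comment): in 2-D, the
% degree-2 polynomial kernel $k(\vx,\vy) = \inner{\vx}{\vy}^2$ corresponds to the explicit
% feature map $\vphi(\vx) = \trans{[x_1^2, \ \sqrt{2}\,x_1 x_2, \ x_2^2]}$, since
% \aleq{
% \inner{\vphi(\vx)}{\vphi(\vy)} = x_1^2 y_1^2 + 2 x_1 x_2 y_1 y_2 + x_2^2 y_2^2
% = (x_1 y_1 + x_2 y_2)^2 = \inner{\vx}{\vy}^2,
% }
% and the Gaussian RBF kernel is $k(\vx,\vy) = \exp\big(-\norm{\vx-\vy}^2/(2\sigma^2)\big)$.
% Regarding the "Numerical Solution" frame above: both the primal and the dual soft SVM are
% convex quadratic programs, so in practice they are handed to a standard QP (or a specialized
% SVM) solver.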
638 | 639 | 640 | \end{frame} 641 | 642 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 643 | \begin{frame}{Review Questions} 644 | % \tableofcontents 645 | %\plitemsep 0.1in 646 | \bce[1)] 647 | \item 648 | 649 | \ece 650 | \end{frame} 651 | 652 | 653 | \end{document} 654 | -------------------------------------------------------------------------------- /compile.sh: -------------------------------------------------------------------------------- 1 | (cd 01.Introduction; pdflatex -jobname=1.intro main.tex) 2 | (cd 02.LinearAlgebra; pdflatex -jobname=2.LA main.tex) 3 | (cd 03.Geometry; pdflatex -jobname=3.AG main.tex) 4 | (cd 04.MatrixDecomposition; pdflatex -jobname=4.MD main.tex) 5 | (cd 05.VectorCaculus; pdflatex -jobname=5.VC main.tex) 6 | (cd 06.Probability; pdflatex -jobname=6.PD main.tex) 7 | (cd 07.Optimization; pdflatex -jobname=7.OPT main.tex) 8 | (cd 08.Model_Data; pdflatex -jobname=8.MMD main.tex) 9 | (cd 09.LinearRegression; pdflatex -jobname=9.LR main.tex) 10 | (cd 10.PCA; pdflatex -jobname=10.PCA main.tex) 11 | (cd 11.DensityEstimation; pdflatex -jobname=11.GMM main.tex) 12 | (cd 12.SVM; pdflatex -jobname=12.SVM main.tex) 13 | -------------------------------------------------------------------------------- /kaist_ee.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/kaist_ee.png -------------------------------------------------------------------------------- /mydefault.tex: -------------------------------------------------------------------------------- 1 | %itemshape 2 | \setbeamertemplate{itemize item}{\scriptsize\raise1.25pt\hbox{\donotcoloroutermaths$\bullet$}} 3 | \setbeamertemplate{itemize subitem}{\tiny\raise1.5pt\hbox{\donotcoloroutermaths$\circ$}} 4 | \setbeamertemplate{itemize subsubitem}{\tiny\raise1.5pt\hbox{\donotcoloroutermaths$\blacktriangleright$}} 5 | %default value for spacing 6 | \plitemsep 0.1in 7 | \pltopsep 0.03in 8 | \setlength{\parskip}{0.15in} 9 | %\setlength{\parindent}{-0.5in} 10 | \setlength{\abovedisplayskip}{0.07in} 11 | \setlength{\belowdisplayskip}{0.07in} 12 | \setlength{\mathindent}{0cm} 13 | \setbeamertemplate{frametitle continuation}{[\insertcontinuationcount]} 14 | 15 | \setlength{\leftmargini}{0.5cm} 16 | \setlength{\leftmarginii}{0.5cm} 17 | 18 | \setlength{\fboxrule}{0.05pt} 19 | \setlength{\fboxsep}{5pt} 20 | 21 | 22 | %%%%%%% This should be placed at the end of this file 23 | \logo{\pgfputat{\pgfxy(0.11, 7.4)}{\pgfbox[right,base]{\tikz{\filldraw[fill=dianablue, draw=none] (0 cm, 0 cm) rectangle (50 cm, 1 cm);}\mbox{\hspace{-8 cm}\includegraphics[height=0.7 cm]{../kaist_ee.png} 24 | }}}} 25 | 26 | \begin{frame} 27 | \titlepage 28 | \end{frame} 29 | 30 | \logo{\pgfputat{\pgfxy(0.11, 7.4)}{\pgfbox[right,base]{\tikz{\filldraw[fill=dianablue, draw=none] (0 cm, 0 cm) rectangle (50 cm, 1 cm);}\mbox{\hspace{-8 cm}\includegraphics[height=0.7 cm]{../kaist_ee.png} 31 | }}}} 32 | 33 | % rule color - gray 34 | \makeatletter 35 | \let\old@rule\@rule 36 | \def\@rule[#1]#2#3{\textcolor{gray}{\old@rule[#1]{#2}{#3}}} 37 | \makeatother 38 | 39 | 40 | -------------------------------------------------------------------------------- /myhead.tex: -------------------------------------------------------------------------------- 1 | % when making printed slides 2 | \usepackage{pgfpages} 3 | \pgfpagesuselayout{resize to}[a4paper,landscape,border shrink=5mm] 4 | 5 | \usepackage[english]{babel} 6 | \usepackage{tikz} 7 | 
\usepackage{courier} 8 | \usepackage{array} 9 | \usepackage{bold-extra} 10 | %\usepackage{minted} 11 | \usepackage[thicklines]{cancel} 12 | \usepackage{fancyvrb} 13 | \usepackage{kotex} 14 | \usepackage{paralist} 15 | \usepackage{collectbox} 16 | \usepackage{bm} 17 | 18 | \usepackage{mathrsfs} 19 | \usepackage[reqno,disallowspaces]{mathtools} % imports amsmath 20 | \usepackage{amsfonts} %for Y&Y BSR AMS fonts 21 | \usepackage{amssymb} 22 | \usepackage{amscd} 23 | %\usepackage{tikz,lipsum,lmodern} 24 | \usepackage[most]{tcolorbox} 25 | \usepackage{verbatim} 26 | \mode 27 | { 28 | \usetheme{default} 29 | \usecolortheme{default} 30 | \usefonttheme{default} 31 | \setbeamertemplate{navigation symbols}{} 32 | \setbeamertemplate{caption}[numbered] 33 | \setbeamertemplate{footline}[frame number] % or "page number" 34 | \setbeamercolor{frametitle}{fg=yellow} 35 | \setbeamercolor{footline}{fg=black} 36 | } 37 | 38 | \setbeamercolor{block body alerted}{bg=alerted text.fg!10} 39 | \setbeamercolor{block title alerted}{bg=alerted text.fg!20} 40 | \setbeamercolor{block body}{bg=structure!10} 41 | \setbeamercolor{block title}{bg=structure!20} 42 | \setbeamercolor{block body example}{bg=green!10} 43 | \setbeamercolor{block title example}{bg=green!20} 44 | \setbeamertemplate{blocks}[rounded][shadow] 45 | 46 | \xdefinecolor{dianablue}{rgb}{0.18,0.24,0.31} 47 | \xdefinecolor{darkblue}{rgb}{0.1,0.1,0.7} 48 | \xdefinecolor{darkgreen}{rgb}{0,0.5,0} 49 | \xdefinecolor{darkgrey}{rgb}{0.35,0.35,0.35} 50 | \xdefinecolor{darkorange}{rgb}{0.8,0.5,0} 51 | \xdefinecolor{darkred}{rgb}{0.7,0,0} 52 | \definecolor{darkgreen}{rgb}{0,0.6,0} 53 | \definecolor{mauve}{rgb}{0.58,0,0.82} 54 | 55 | \usetikzlibrary{shapes.callouts} 56 | 57 | \makeatletter 58 | \setbeamertemplate{footline} 59 | { 60 | \leavevmode% 61 | \hbox{% 62 | \begin{beamercolorbox}[wd=.333333\paperwidth,ht=2.25ex,dp=1ex,center]{author in head/foot}% 63 | \usebeamerfont{author in head/foot}\insertsection 64 | \end{beamercolorbox}% 65 | \begin{beamercolorbox}[wd=.333333\paperwidth,ht=2.25ex,dp=1ex,center]{title in head/foot}% 66 | \usebeamerfont{title in head/foot}\insertsubsection 67 | \end{beamercolorbox}% 68 | \begin{beamercolorbox}[wd=.333333\paperwidth,ht=2.25ex,dp=1ex,right]{date in head/foot}% 69 | \usebeamerfont{date in head/foot} 70 | \insertshortdate{}\hspace*{2em} 71 | \insertframenumber{} / \inserttotalframenumber\hspace*{2ex} 72 | \end{beamercolorbox}}% 73 | 74 | \vskip0pt% 75 | } 76 | \makeatother -------------------------------------------------------------------------------- /mymacro.tex: -------------------------------------------------------------------------------- 1 | %%%%%%%%%% linear algebra macros %%%%%%%%%%%%%%%%%%%%%%% 2 | 3 | %--------linsys 4 | % Use as \begin{linsys}{3} 5 | % x &+ &3y &+ &a &= &7 \\ 6 | % x &- &3y &+ &a &= &7 7 | % \end{linsys} 8 | % Remark: TeXbook pp. 167-170 says to put a medmuskip around a +; and that's 9 | % 4/18-ths of an em. Why does 2/18-ths of an em work? I don't know, but 10 | % comparing to a regular displayed equation suggests it is right. 11 | % (darseneau says LaTeX puts in half an \arraycolsep.) 12 | \newenvironment{linsys}[2][m]{% 13 | \setlength{\arraycolsep}{.1111em} % p. 170 TeXbook; a medmuskip 14 | \begin{array}[#1]{@{}*{#2}{rc}r@{}} 15 | }{% 16 | \end{array}} 17 | 18 | \newsavebox\boxofmathplus 19 | \sbox{\boxofmathplus}{$+$} 20 | \newcommand{\spaceforemptycolumn}{\makebox[\wd\boxofmathplus]{\ }} 21 | 22 | %--------grstep 23 | % For denoting a Gauss' reduction step. 
24 | % Use as: \grstep{\rho_1+\rho_3} or \grstep[2\rho_5 \\ 3\rho_6]{\rho_1+\rho_3} 25 | % \newcommand{\grstep}[2][\relax]{% 26 | % \ensuremath{\mathrel{ 27 | % \mathop{\longrightarrow}\limits^{#2\mathstrut}_{ 28 | % \begin{subarray}{l} #1 \end{subarray}}}}} 29 | 30 | % Advantage of length formulation is that between adjacent 31 | % \grstep's you can add \hspace{-\grsteplength} to make it look not too wide 32 | \newlength{\grsteplength} 33 | \setlength{\grsteplength}{1.5ex plus .1ex minus .1ex} 34 | 35 | \newcommand{\grstep}[2][\relax]{% 36 | \ensuremath{\mathrel{ 37 | \hspace{\grsteplength}\mathop{\longrightarrow}\limits^{#2\mathstrut}_{ 38 | \begin{subarray}{l} #1 \end{subarray}}\hspace{\grsteplength}}}} 39 | % If two or more \grsteps are in a row then they need to be tightened 40 | \newcommand{\repeatedgrstep}[2][\relax]{\hspace{-\grsteplength}\grstep[#1]{#2}} 41 | 42 | % row swap operation: \rho_1\swap\rho_2 43 | \newcommand{\swap}{\leftrightarrow} 44 | 45 | %-------------amatrix 46 | % Augmented matrix. Usage (note the argument does not count the aug col): 47 | % \begin{amatrix}{2} 48 | % 1 2 3 \\ 4 5 6 49 | % \end{amatrix} 50 | \newenvironment{amatrix}[1]{% 51 | \left(\begin{array}{@{}*{#1}{c}|c@{}} 52 | }{% 53 | \end{array}\right) 54 | } 55 | 56 | 57 | 58 | %-------------pmat 59 | % For matrices with arguments. 60 | % Usage: \begin{pmat}{c|c|c} 1 &2 &3 \end{pmat} 61 | \newenvironment{pmat}[1]{ 62 | \left(\begin{array}{@{}#1@{}} 63 | }{\end{array}\right) 64 | } 65 | 66 | 67 | 68 | %-------------misc matrices 69 | % \newenvironment{mat}{\left(\begin{array}}{\end{array}\right)} 70 | \newenvironment{detmat}{\left|\begin{array}}{\end{array}\right|} 71 | \newcommand{\deter}[1]{ \mathchoice{\left|#1\right|}{|#1|}{|#1|}{|#1|} } 72 | \newcommand{\generalmatrix}[3]{ %arg1: low-case letter, arg2: rows, arg3: cols 73 | \left( 74 | \begin{array}{cccc} 75 | #1_{1,1} _{1,2} &\ldots _{1,#2} \\ 76 | #1_{2,1} _{2,2} &\ldots _{2,#2} \\ 77 | &\vdots \\ 78 | #1_{#3,1} _{#3,2} &\ldots _{#3,#2} 79 | \end{array} 80 | \right) } 81 | 82 | \newcommand{\generaldet}[3]{ %arg1: low-case letter, arg2: rows, arg3: cols 83 | \left| 84 | \begin{array}{cccc} 85 | #1_{11} _{12} &\ldots _{1 #2} \\ 86 | #1_{21} _{22} &\ldots _{2 #2} \\ 87 | &\vdots \\ 88 | #1_{#3 1} _{#3 2} &\ldots _{#3 #2} 89 | \end{array} 90 | \right| } 91 | 92 | % With mathtools we can have column entries right flushed 93 | % There is an optional argument \begin{mat}[r]{3} .. \end{mat} for 94 | % right-flushed columns. Perhaps the rule is that numbers are better 95 | % right-flushed but if there are any letters it is better centered? 96 | \newenvironment{nmat}[1][c]{\begin{pmatrix*} % disable optional arg [#1] 97 | }{\end{pmatrix*}} 98 | % If mat starts with &\vdots get an error; why? No apparent macro fix, according to texexchange 99 | \newenvironment{vmat}[1][c]{\begin{vmatrix*} % disable optional arg [#1] 100 | }{\end{vmatrix*}} 101 | \newenvironment{amat}[2][c]{% 102 | % disable optional arg \left(\begin{array}{@{}*{#2}{#1}|#1@{}} 103 | \left(\begin{array}{@{}*{#2}{c}|#1@{}} 104 | }{% 105 | \end{array}\right) 106 | } 107 | % \newcommand\vdotswithin[1]{% Taken from mathtools.dtx because my TL is not 2011 108 | % {\mathmakebox[\widthof{\ensuremath{{}#1{}}}][c]{{\vdots}}}} 109 | 110 | 111 | %------------colvec and rowvec 112 | % Column vector and row vector. Usage: 113 | % \colvec{1 \\ 2 \\ 3 \\ 4} and \rowvec{1 &2 &3} 114 | % Colvec takes an optional argument \colvec[r]{x_1 \\ 0}. 
Perhaps 115 | % digits look better right aligned, but if there are any letters it 116 | % needs to be centered? 117 | \newcommand{\colvec}[2][c]{\begin{nmat}[#1] #2 \end{nmat}} 118 | \newcommand{\smallcolvec}[1]{\left(\begin{smallmatrix} #1 \end{smallmatrix}\right)} 119 | % For row vectors, cannot do \newcommand{\rowvec}[1]{\begin{mat} #1 \end{mat}} 120 | % since the delimiters come out too large. 121 | \newcommand{\rowvec}[1]{\setlength{\arraycolsep}{3pt}\left(\begin{matrix} #1 \end{matrix}\right)} 122 | 123 | 124 | 125 | %-------------making aligned columns 126 | % Usage: \begin{aligncolondecimal}{2} 1.2 \\ .33 \end{aligncolondecimal} 127 | % (negative argument centers decimal pt in column). Also Usage: 128 | % \begin{aligncolondecimal}[0em]{2} 1.2 \\ .33 \end{aligncolondecimal} 129 | % to make the left and right LaTeX-array padding disappear. 130 | \RequirePackage{array}\RequirePackage{dcolumn} 131 | \newenvironment{aligncolondecimal}[2][.1111em]{% 132 | \setlength{\arraycolsep}{#1} 133 | \newcolumntype{.}{D{.}{.}{#2}}\begin{array}{.}}{% 134 | \end{array}} 135 | 136 | % Matrix and vector, with numbers centered on decimal point 137 | % Usage: \begin{dmat}{D{.}{.}{1}D{.}{.}{3}} 0 &.123 \\ .2 &.456 \end{dmat} 138 | % (in the D{.}{.}{number} that is the number of decimal places) 139 | \newlength{\dmatcolsep}\setlength{\dmatcolsep}{5pt} 140 | \newenvironment{dmat}[2][\dmatcolsep]{% 141 | \setlength{\arraycolsep}{#1} 142 | \left(\begin{array}{@{}#2@{}} 143 | }{% 144 | \end{array}\right)} 145 | % Usage: \dcolvec[2]{1.23 \\ 4.56} where the optional argument is the number 146 | % of decimal places. 147 | \newcommand{\dcolvec}[2][-1]{\left(\begin{array}{@{}D{.}{.}{#1}@{}} #2 \end{array}\right)} 148 | 149 | %\newcommand{\trans}[1]{ {{#1}^{\mathsf{T}}} } 150 | \newcommand{\trans}[1]{ {#1}^{\mathsf{T}} } 151 | \newcommand{\inv}[1]{ {#1}^{-1} } 152 | \newcommand{\spn}[1]{\ensuremath{\text{span}[#1]} } 153 | \newcommand{\rk}[1]{\ensuremath{\text{rk}(#1)} } 154 | \newcommand{\dimm}[1]{\ensuremath{\text{dim}(#1)} } 155 | \newcommand{\img}[1]{\ensuremath{\text{Im}(#1)} } 156 | %\newcommand{\norm}[1]{\ensuremath{\left || #1 \right ||} } 157 | \newcommand{\norm}[1]{\ensuremath{\left \lVert #1 \right \rVert} } 158 | % orthogonal complement 159 | \newcommand{\ocomp}[1]{\ensuremath{#1^{\bot}} } 160 | \newcommand{\inner}[2]{\ensuremath{\left\langle #1, #2 \right\rangle} } 161 | \DeclareMathOperator{\tr}{tr} 162 | 163 | 164 | % \NewDocumentCommand{\grad}{e{_^}}{% 165 | % \mathop{}\!% \mathop for good spacing before \nabla 166 | % \nabla 167 | % \IfValueT{#1}{_{\!#1}}% tuck in the subscript 168 | % \IfValueT{#2}{^{#2}}% possible superscript 169 | % } 170 | % \begin{equation*} 171 | % \begin{nmat}[r] 172 | % 1 &2 &13 \\ 173 | % 4 &5 &6 174 | % \end{nmat} 175 | % \end{equation*} 176 | 177 | % \begin{equation*} 178 | % \begin{amat}{2} 179 | % 1 &2 &3 \\ 180 | % 4 &5 &6 181 | % \end{amat} 182 | % \end{equation*} 183 | 184 | % \begin{equation*} 185 | % \begin{pmat}{c|c|c} 186 | % 1 &2 &3 \\ 187 | % 4 &5 &6 188 | % \end{pmat} 189 | % \end{equation*} 190 | 191 | % \begin{equation*} 192 | % \begin{vmat} 193 | % a &c \\ 194 | % b &d 195 | % \end{vmat} 196 | % =ad-bc 197 | % \end{equation*} 198 | 199 | % \begin{equation*} 200 | % \vec{v}=\colvec{-1 \\ -0.5 \\ 0} 201 | % \end{equation*} 202 | 203 | % \begin{equation*} 204 | % \vec{v}=\rowvec{-1 & -0.5 & 0} 205 | % \end{equation*} 206 | 207 | -------------------------------------------------------------------------------- /mymath.tex: 
-------------------------------------------------------------------------------- 1 | %%%%%%%%%%%% real, integer notation 2 | \newcommand{\real}{{\mathbb R}} 3 | \newcommand{\realn}{{\mathbb R}^{n}} 4 | \newcommand{\realm}{{\mathbb R}^{m}} 5 | \newcommand{\realD}{{\mathbb R}^{D}} 6 | \newcommand{\realM}{{\mathbb R}^{M}} 7 | \newcommand{\realN}{{\mathbb R}^{N}} 8 | \newcommand{\realnn}{{\mathbb R}^{n \times n}} 9 | \newcommand{\realmm}{{\mathbb R}^{m \times m}} 10 | \newcommand{\realmn}{{\mathbb R}^{m \times n}} 11 | \newcommand{\realnm}{{\mathbb R}^{n \times m}} 12 | \newcommand{\realDM}{{\mathbb R}^{D \times M}} 13 | \newcommand{\realMD}{{\mathbb R}^{M \times D}} 14 | \newcommand{\complex}{{\mathbb C}} 15 | \newcommand{\integer}{{\mathbb Z}} 16 | \newcommand{\natu}{{\mathbb N}} 17 | 18 | 19 | %%% set, vector, matrix 20 | \newcommand{\set}[1]{\ensuremath{\mathcal #1}} 21 | \newcommand{\sets}[1]{\ensuremath{\{#1 \}}} 22 | \renewcommand{\vec}[1]{\bm{#1}} 23 | \newcommand{\mat}[1]{\bm{#1}} 24 | 25 | %%%% vector 26 | \def\vx{\vec{x}} 27 | \def\vy{\vec{y}} 28 | \def\vz{\vec{z}} 29 | \def\vf{\vec{f}} 30 | \def\ve{\vec{e}} 31 | \def\vr{\vec{r}} 32 | \def\vb{\vec{b}} 33 | \def\vc{\vec{c}} 34 | \def\vd{\vec{d}} 35 | \def\vm{\vec{m}} 36 | \def\vu{\vec{u}} 37 | \def\vv{\vec{v}} 38 | \def\vw{\vec{w}} 39 | \def\vX{\vec{X}} 40 | \def\vY{\vec{Y}} 41 | \def\vZ{\vec{Z}} 42 | \def\vth{\vec{\theta}} 43 | \def\vmu{\vec{\mu}} 44 | \def\vnu{\vec{\nu}} 45 | \def\vlam{\vec{\lambda}} 46 | \def\vep{\vec{\epsilon}} 47 | \def\vpi{\vec{\pi}} 48 | \def\vphi{\vec{\phi}} 49 | \def\vxi{\vec{\xi}} 50 | \def\valpha{\vec{\alpha}} 51 | \def\vgamma{\vec{\gamma}} 52 | 53 | %%%% Well-used matrices 54 | \def\mA{\mat{A}} 55 | \def\mB{\mat{B}} 56 | \def\mC{\mat{C}} 57 | \def\mD{\mat{D}} 58 | \def\mI{\mat{I}} 59 | \def\mJ{\mat{J}} 60 | \def\mK{\mat{K}} 61 | \def\mE{\mat{E}} 62 | \def\mP{\mat{P}} 63 | \def\mQ{\mat{Q}} 64 | \def\mU{\mat{U}} 65 | \def\mV{\mat{V}} 66 | \def\mR{\mat{R}} 67 | \def\mS{\mat{S}} 68 | \def\mX{\mat{X}} 69 | \def\msig{\mat{\Sigma}} 70 | \def\mPhi{\mat{\Phi}} 71 | 72 | 73 | \usepackage{amsmath} 74 | %%%%% vector caculus useful macro 75 | % ...\d, which typesets a derivative. ex: \d{y}{x}, instead of \frac{dx}{dy}. 76 | \renewcommand{\d}[2]{\frac{\text{d} #1}{\text{d} #2}} 77 | 78 | 79 | % ...similar for double-derivatives. ex: \dd{y}{x}. 80 | \newcommand{\dd}[2]{\frac{\text{d}^2 #1}{\text{d} #2^2}} 81 | 82 | % ...similar for partial derivatives. ex: \pd{y}{x}. 83 | \newcommand{\pd}[2]{\frac{\partial #1}{\partial #2}} 84 | 85 | 86 | % ...similar for partial double derivatives. ex: \pdd{y}{x}. 
87 | \newcommand{\pdd}[2]{\frac{\partial^2 #1}{\partial #2^2}} 88 | % pdd with argument 89 | \newcommand{\pdda}[3]{\frac{\partial^2 #1}{\partial #2 \partial #3}} 90 | 91 | \usepackage{xparse} 92 | 93 | %%%% caligraphic fonts 94 | \def\cL{\ensuremath{{\cal L}}} 95 | \def\cN{\ensuremath{{\cal N}}} 96 | \def\cD{\ensuremath{{\cal D}}} 97 | \def\cC{\ensuremath{{\cal C}}} 98 | \def\cX{\ensuremath{{\cal X}}} 99 | \def\cY{\ensuremath{{\cal Y}}} 100 | 101 | %%% big parenthesis 102 | \def\Bl{\Bigl} 103 | \def\Br{\Bigr} 104 | \def\lf{\left} 105 | \def\ri{\right} 106 | 107 | 108 | %%% floor notations 109 | \newcommand{\lfl}{{\lfloor}} 110 | \newcommand{\rfl}{{\rfloor}} 111 | \newcommand{\floor}[1]{{\lfloor #1 \rfloor}} 112 | 113 | %%% gradient 114 | \newcommand{\grad}[1]{\nabla #1} 115 | \newcommand{\hess}[1]{\text{H} #1} 116 | 117 | %%% definition 118 | %\newcommand{\eqdef}{\ensuremath{\triangleq}} 119 | \newcommand{\eqdef}{\ensuremath{:=}} 120 | %%% imply 121 | \newcommand{\imp}{\Longrightarrow} 122 | 123 | 124 | 125 | \newcommand{\separator}{ 126 | % \begin{center} 127 | \par\noindent\rule{\columnwidth}{0.3mm} 128 | % \end{center} 129 | } 130 | 131 | \newcommand{\mynote}[1]{{\it \color{red} [#1]}} 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | %%% equation alignment 140 | \newcommand{\aleq}[1]{\begin{align*}#1\end{align*}} 141 | 142 | %%%%%%%%%%%%%%%% colored emphasized font, blanked words 143 | 144 | \newcommand{\empr}[1]{{\color{red}\emph{#1}}} 145 | \newcommand{\empb}[1]{{\color{blue}\emph{#1}}} 146 | \newcommand{\redf}[1]{{\color{red} #1}} 147 | \newcommand{\bluef}[1]{{\color{blue} #1}} 148 | \newcommand{\grayf}[1]{{\color{gray} #1}} 149 | \newcommand{\magenf}[1]{{\color{magenta} #1}} 150 | \newcommand{\greenf}[1]{{\color{green} #1}} 151 | \newcommand{\cyanf}[1]{{\color{cyan} #1}} 152 | \newcommand{\orangef}[1]{{\color{orange} #1}} 153 | 154 | \newcommand{\blk}[1]{\underline{\mbox{\hspace{#1}}}} 155 | 156 | 157 | \newcommand{\redblk}[1]{\framebox{\color{red} #1}} 158 | \newcommand{\redblank}[2]{\framebox{\onslide<#1->{\color{red} #2}}} 159 | \newcommand{\blueblk}[1]{\framebox{\color{blue} #1}} 160 | \newcommand{\blueblank}[2]{\framebox{\onslide<#1->{\color{blue} #2}}} 161 | 162 | 163 | 164 | \makeatletter 165 | \newcommand{\mybox}{% 166 | \collectbox{% 167 | \setlength{\fboxsep}{1pt}% 168 | \fbox{\BOXCONTENT}% 169 | }% 170 | } 171 | \makeatother 172 | 173 | \makeatletter 174 | \newcommand{\lecturemark}{% 175 | \collectbox{% 176 | \setlength{\fboxsep}{1pt}% 177 | \fcolorbox{red}{yellow}{\BOXCONTENT}% 178 | }% 179 | } 180 | \makeatother 181 | 182 | \newcommand{\mycolorbox}[1]{ 183 | \begin{tcolorbox}[colback=red!5!white,colframe=red!75!black] 184 | #1 185 | \end{tcolorbox} 186 | } 187 | %%%% figure inclusion 188 | \newcommand{\mypic}[2]{ 189 | \begin{center} 190 | \includegraphics[width=#1\textwidth]{#2} 191 | \end{center} 192 | } 193 | 194 | \newcommand{\myinlinepic}[2]{ 195 | \makebox[0cm][r]{\raisebox{-4ex}{\includegraphics[height=#1]{#2}}} 196 | } 197 | 198 | 199 | 200 | 201 | %%%% itemized and enumerated list 202 | \newcommand{\bci}{\begin{compactitem}} 203 | \newcommand{\eci}{\end{compactitem}} 204 | \newcommand{\bce}{\begin{compactenum}} 205 | \newcommand{\ece}{\end{compactenum}} 206 | 207 | 208 | %%%% making 0.5/0.5 two columns 209 | %%%% how to use: first number: length of separation bar 210 | % \mytwocols{0.6} 211 | % { 212 | % contents in the left column 213 | % } 214 | % { 215 | % contents in the right column 216 | % } 217 | %%%% 218 | 219 | \newcommand{\mytwocols}[3]{ 220 | 
\begin{columns}[T] \column{.499\textwidth} #2 \column{.001\textwidth} \rule{.3mm}{{#1}\textheight} \column{.499\textwidth} #3 \end{columns}} 221 | 222 | \newcommand{\mythreecols}[4]{ 223 | \begin{columns}[T] \column{.31\textwidth} #2 \column{.001\textwidth} \rule{.3mm}{{#1}\textheight} \column{.31\textwidth} #3 \column{.001\textwidth} \rule{.3mm}{{#1}\textheight} \column{.31\textwidth} #4 \end{columns}} 224 | 225 | \newcommand{\mysmalltwocols}[3]{ 226 | \begin{columns}[T] \column{.4\textwidth} #2 \column{.001\textwidth} \rule{.3mm}{{#1}\textheight} \column{.4\textwidth} #3 \end{columns}} 227 | 228 | %%%% making two columns with customized ratios 229 | %%%% how to use: 230 | %first parameter: length of separation bar 231 | %second parameter: ratio of left column 232 | %third parameter: ratio of right column 233 | % \mytwocols{0.6}{0.7}{0.29} 234 | % { 235 | % contents in the left column 236 | % } 237 | % { 238 | % contents in the right column 239 | % } 240 | %%%% 241 | \newcommand{\myvartwocols}[5]{ 242 | \begin{columns}[T] \column{#2\textwidth} {#4} \column{.01\textwidth} \rule{.3mm}{{#1}\textheight} \column{#3\textwidth} {#5} \end{columns}} 243 | 244 | %%% making my block in beamer 245 | %%% first parameter: title of block 246 | %%% second parameter: contents of block 247 | \newcommand{\myblock}[2]{ 248 | \begin{block}{#1} {#2} \end{block}} 249 | 250 | %%% independence notation 251 | \newcommand{\indep}{\perp \!\!\! \perp} 252 | 253 | %%%% probability with different shapes (parenthesis or bracket) and different sizes 254 | %%% `i' enables us to insert the subscript to the probability 255 | \newcommand{\bprob}[1]{\mathbb{P}\Bl[ #1 \Br]} 256 | \newcommand{\prob}[1]{\mathbb{P}[ #1 ]} 257 | \newcommand{\cbprob}[1]{\mathbb{P}\Bl( #1 \Br)} 258 | \newcommand{\cprob}[1]{\mathbb{P}( #1 )} 259 | \newcommand{\probi}[2]{\mathbb{P}_{#1}[ #2 ]} 260 | \newcommand{\bprobi}[2]{\mathbb{P}_{#1}\Bl[ #2 \Br]} 261 | \newcommand{\cprobi}[2]{\mathbb{P}_{#1}( #2 )} 262 | \newcommand{\cbprobi}[2]{\mathbb{P}_{#1}\Bl( #2 \Br)} 263 | 264 | %%%% expectation with different shapes (parenthesis or bracket) and different sizes 265 | %%% `i' enables us to insert the subscript to the expectation 266 | \newcommand{\expect}[1]{\mathbb{E}[ #1 ]} 267 | \newcommand{\cexpect}[1]{\mathbb{E}( #1 )} 268 | \newcommand{\bexpect}[1]{\mathbb{E}\Bl[ #1 \Br]} 269 | \newcommand{\cbexpect}[1]{\mathbb{E}\Bl( #1 \Br)} 270 | \newcommand{\bbexpect}[1]{\mathbb{E}\lf[ #1 \ri]} 271 | \newcommand{\expecti}[2]{\mathbb{E}_{#1}[ #2 ]} 272 | \newcommand{\bexpecti}[2]{\mathbb{E}_{#1}\Bl[ #2 \Br]} 273 | \newcommand{\bbexpecti}[2]{\mathbb{E}_{#1}\lf[ #2 \ri]} 274 | 275 | %%%% variance 276 | \newcommand{\var}[1]{\text{var}[ #1 ]} 277 | \newcommand{\bvar}[1]{\text{var}\Bl[ #1 \Br]} 278 | \newcommand{\cvar}[1]{\text{var}( #1 )} 279 | \newcommand{\cbvar}[1]{\text{var}\Bl( #1 \Br)} 280 | 281 | %%%% covariance 282 | \newcommand{\cov}[1]{\text{cov}( #1 )} 283 | \newcommand{\bcov}[1]{\text{cov}\Bl( #1 \Br)} 284 | 285 | %%% Popular pmf, pdf notation to avoid long typing 286 | \newcommand{\px}{\ensuremath{p_X(x)}} 287 | \newcommand{\py}{\ensuremath{p_Y(y)}} 288 | \newcommand{\pz}{\ensuremath{p_Z(z)}} 289 | \newcommand{\pxA}{\ensuremath{p_{X|A}(x)}} 290 | \newcommand{\pyA}{\ensuremath{p_{Y|A}(y)}} 291 | \newcommand{\pzA}{\ensuremath{p_{Z|A}(z)}} 292 | \newcommand{\pxy}{\ensuremath{p_{X,Y}(x,y)}} 293 | \newcommand{\pxcy}{\ensuremath{p_{X|Y}(x|y)}} 294 | \newcommand{\pycx}{\ensuremath{p_{Y|X}(y|x)}} 295 | 296 | \newcommand{\fx}{\ensuremath{f_X(x)}} 297 | 
\newcommand{\Fx}{\ensuremath{F_X(x)}} 298 | \newcommand{\fy}{\ensuremath{f_Y(y)}} 299 | \newcommand{\Fy}{\ensuremath{F_Y(y)}} 300 | \newcommand{\fz}{\ensuremath{f_Z(z)}} 301 | \newcommand{\Fz}{\ensuremath{F_Z(z)}} 302 | \newcommand{\fxA}{\ensuremath{f_{X|A}(x)}} 303 | \newcommand{\fyA}{\ensuremath{f_{Y|A}(y)}} 304 | \newcommand{\fzA}{\ensuremath{f_{Z|A}(z)}} 305 | \newcommand{\fxy}{\ensuremath{f_{X,Y}(x,y)}} 306 | \newcommand{\Fxy}{\ensuremath{F_{X,Y}(x,y)}} 307 | \newcommand{\fxcy}{\ensuremath{f_{X|Y}(x|y)}} 308 | \newcommand{\fycx}{\ensuremath{f_{Y|X}(y|x)}} 309 | 310 | \newcommand{\fth}{\ensuremath{f_\Theta(\theta)}} 311 | \newcommand{\fxcth}{\ensuremath{f_{X|\Theta}(x|\theta)}} 312 | \newcommand{\fthcx}{\ensuremath{f_{\Theta|X}(\theta|x)}} 313 | 314 | \newcommand{\pkcth}{\ensuremath{p_{X|\Theta}(k|\theta)}} 315 | \newcommand{\fthck}{\ensuremath{f_{\Theta|X}(\theta|k)}} 316 | 317 | 318 | %%%% indicator 319 | \newcommand{\indi}[1]{\mathbf{1}_{ #1 }} 320 | 321 | %%%% exponential rv. 322 | \newcommand{\elambdax}{\ensuremath{e^{-\lambda x}}} 323 | 324 | %%%% normal rv. 325 | \newcommand{\stdnormal}{\ensuremath{\frac{1}{\sqrt{2\pi}} e^{-x^2/2}}} 326 | \newcommand{\gennormal}{\ensuremath{\frac{1}{\sigma\sqrt{2\pi}} e^{-(x-\mu)^2/2}}} 327 | 328 | %%%%%% estimator, estimate 329 | \newcommand{\hth}{\ensuremath{\hat{\theta}}} 330 | \newcommand{\hTH}{\ensuremath{\hat{\Theta}}} 331 | \newcommand{\MAP}{\ensuremath{\text{MAP}}} 332 | \newcommand{\LMS}{\ensuremath{\text{LMS}}} 333 | \newcommand{\LLMS}{\ensuremath{\text{L}}} 334 | \newcommand{\ML}{\ensuremath{\text{ML}}} 335 | 336 | %%%% colored text 337 | \newcommand{\red}[1]{\color{red}#1} 338 | \newcommand{\cyan}[1]{\color{cyan}#1} 339 | \newcommand{\magenta}[1]{\color{magenta}#1} 340 | \newcommand{\blue}[1]{\color{blue}#1} 341 | \newcommand{\green}[1]{\color{green}#1} 342 | \newcommand{\white}[1]{\color{white}#1} 343 | \newcommand{\gray}[1]{\color{gray}#1} 344 | 345 | %%% definition 346 | \newcommand{\defi}{{\color{red} Definition.} } 347 | \newcommand{\exam}{{\color{red} Example.} } 348 | \newcommand{\question}{{\color{red} Question.} } 349 | \newcommand{\thm}{{\color{red} Theorem.} } 350 | \newcommand{\background}{{\color{red} Background.} } 351 | \newcommand{\msg}{{\color{red} Message.} } 352 | 353 | 354 | \def\ml{\text{ML}} 355 | \def\map{\text{MAP}} 356 | 357 | %%%%%%%%%%%%%%%%%%%%%%% old macros that you can ignore %%%%%%%%%%%%%%%%%%%%%%%% 358 | 359 | % \def\un{\underline} 360 | % \def\ov{\overline} 361 | 362 | 363 | % \newcommand{\beq}{\begin{eqnarray*}} 364 | % \newcommand{\eeq}{\end{eqnarray*}} 365 | % \newcommand{\beqn}{\begin{eqnarray}} 366 | % \newcommand{\eeqn}{\end{eqnarray}} 367 | % \newcommand{\bemn}{\begin{multiline}} 368 | % \newcommand{\eemn}{\end{multiline}} 369 | % \newcommand{\beal}{\begin{align}} 370 | % \newcommand{\eeal}{\end{align}} 371 | % \newcommand{\beas}{\begin{align*}} 372 | % \newcommand{\eeas}{\end{align*}} 373 | 374 | 375 | 376 | % \newcommand{\bd}{\begin{displaymath}} 377 | % \newcommand{\ed}{\end{displaymath}} 378 | % \newcommand{\bee}{\begin{equation}} 379 | % \newcommand{\eee}{\end{equation}} 380 | 381 | 382 | % \newcommand{\vs}{\vspace{0.2in}} 383 | % \newcommand{\hs}{\hspace{0.5in}} 384 | % \newcommand{\el}{\end{flushleft}} 385 | % \newcommand{\bl}{\begin{flushleft}} 386 | % \newcommand{\bc}{\begin{center}} 387 | % \newcommand{\ec}{\end{center}} 388 | % \newcommand{\remove}[1]{} 389 | 390 | % \newtheorem{theorem}{Theorem} 391 | % \newtheorem{corollary}{Corollary} 392 | % \newtheorem{prop}{Proposition} 393 | % 
394 | % \newtheorem{defi}{Definition}
395 | % \newtheorem{assum}{Assumption}
396 | % \newtheorem{example}{Example}
397 | % \newtheorem{property}{Property}
398 | % \newtheorem{remark}{Remark}
399 | 
400 | % \newcommand{\separator}{
401 | % \begin{center}
402 | % \rule{\columnwidth}{0.3mm}
403 | % \end{center}
404 | % }
405 | 
406 | % \newenvironment{separation}
407 | % { \vspace{-0.3cm}
408 | % \separator
409 | % \vspace{-0.25cm}
410 | % }
411 | % {
412 | % \vspace{-0.5cm}
413 | % \separator
414 | % \vspace{-0.15cm}
415 | % }
416 | 
417 | % \def\A{\mathcal A}
418 | % \def\oA{\overline{\mathcal A}}
419 | % \def\S{\mathcal S}
420 | % \def\D{\mathcal D}
421 | % \def\eff{{\rm Eff}}
422 | % \def\bD{\bm{D}}
423 | % \def\cU{{\cal U}}
424 | % \def\bbs{{\mathbb{s}}}
425 | % \def\bbS{{\mathbb{S} }}
426 | % \def\cM{{\cal M}}
427 | % \def\bV{{\bm{V}}}
428 | % \def\cH{{\cal H}}
429 | % \def\ch{{\cal h}}
430 | % \def\cR{{\cal R}}
431 | % \def\cV{{\cal V}}
432 | % \def\cA{{\cal A}}
433 | % \def\cX{{\cal X}}
434 | % \def\cN{{\cal N}}
435 | % \def\cJ{{\cal J}}
436 | % \def\cK{{\cal K}}
437 | % \def\cL{{\cal L}}
438 | % \def\cI{{\cal I}}
439 | % \def\cY{{\cal Y}}
440 | % \def\cZ{{\cal Z}}
441 | % \def\cC{{\cal C}}
442 | % \def\cR{{\cal R}}
443 | % \def\id{{\rm Id}}
444 | % \def\st{{\rm st}}
445 | % \def\cF{{\cal F}}
446 | % \def\bz{{\bm z}}
447 | % \def\cG{{\cal G}}
448 | % \def\N{\mathbb{N}}
449 | % \def\bbh{\mathbb{h}}
450 | % \def\bbH{\mathbb{H}}
451 | % \def\bbi{\mathbb{i}}
452 | % \def\bbI{\mathbb{I}}
453 | % \def\R{\mathbb{R}}
454 | % \def\bbR{\mathbb{R}}
455 | % \def\bbr{\mathbb{r}}
456 | % \def\cB{{\cal B}}
457 | % \def\cP{{\cal P}}
458 | % \def\cS{{\cal S}}
459 | % \def\bW{{\bm W}}
460 | % \def\bc{{\bm c}}
461 | 
462 | % %\def\and{\quad\mbox{and}\quad}
463 | % \def\ind{{\bf 1}}
464 | 
465 | 
466 | % \def\bmg{{\bm{\gamma}}}
467 | % \def\bmr{{\bm{\rho}}}
468 | % \def\bmq{{\bm{q}}}
469 | % \def\bmt{{\bm{\tau}}}
470 | % \def\bmn{{\bm{n}}}
471 | % \def\bmcapn{{\bm{N}}}
472 | % \def\bmrho{{\bm{\rho}}}
473 | 
474 | % \def\igam{\underline{\gamma}(\lambda)}
475 | % \def\sgam{\overline{\gamma}(\lambda)}
476 | % \def\ovt{\overline{\theta}}
477 | % \def\ovT{\overline{\Theta}}
478 | % \def\PP{{\mathrm P}}
479 | % \def\EE{{\mathrm E}}
480 | % \def\iskip{{\vskip -0.4cm}}
481 | % \def\siskip{{\vskip -0.2cm}}
482 | 
483 | % \def\bp{\noindent{\it Proof.}\ }
484 | % \def\ep{\hfill $\Box$}
485 | 
486 | 
487 | 
--------------------------------------------------------------------------------
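For quick reference, here is a brief usage sketch (not part of mymath.tex) showing how the density, prior/posterior, and estimator shorthands defined above are meant to be combined on a slide. The beamer frame below is hypothetical and assumes mymath.tex is loaded by a lecture's main.tex; the displayed equation is just the standard Bayes rule for a continuous parameter, and \gennormal denotes the N(mu, sigma^2) density.

% Hypothetical slide fragment (sketch only, assuming mymath.tex is loaded):
\begin{frame}{Bayes rule for a continuous parameter}
  \thm Given a prior \fth{} and a likelihood \fxcth{}, the posterior is
  \[
    \fthcx \;=\; \frac{\fxcth\,\fth}{\int f_{X|\Theta}(x|\theta')\, f_\Theta(\theta')\, d\theta'},
  \]
  and the \ML{} and \MAP{} estimates maximize \fxcth{} and \fthcx{}, respectively.
\end{frame}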
/print.sh:
--------------------------------------------------------------------------------
1 | (cd 01.Introduction;
2 | pdfjam --nup 1x2 1.intro.pdf --outfile 1.intro-2.pdf;
3 | pdfjam --nup 2x2 1.intro.pdf --outfile 1.intro-4.pdf --landscape)
4 | 
5 | (cd 02.LinearAlgebra;
6 | pdfjam --nup 1x2 2.LA.pdf --outfile 2.LA-2.pdf;
7 | pdfjam --nup 2x2 2.LA.pdf --outfile 2.LA-4.pdf --landscape)
8 | 
9 | (cd 03.Geometry;
10 | pdfjam --nup 1x2 3.AG.pdf --outfile 3.AG-2.pdf;
11 | pdfjam --nup 2x2 3.AG.pdf --outfile 3.AG-4.pdf --landscape)
12 | 
13 | (cd 04.MatrixDecomposition;
14 | pdfjam --nup 1x2 4.MD.pdf --outfile 4.MD-2.pdf;
15 | pdfjam --nup 2x2 4.MD.pdf --outfile 4.MD-4.pdf --landscape)
16 | 
17 | (cd 05.VectorCaculus;
18 | pdfjam --nup 1x2 5.VC.pdf --outfile 5.VC-2.pdf;
19 | pdfjam --nup 2x2 5.VC.pdf --outfile 5.VC-4.pdf --landscape)
20 | 
21 | (cd 06.Probability;
22 | pdfjam --nup 1x2 6.PD.pdf --outfile 6.PD-2.pdf;
23 | pdfjam --nup 2x2 6.PD.pdf --outfile 6.PD-4.pdf --landscape)
24 | 
25 | (cd 07.Optimization;
26 | pdfjam --nup 1x2 7.OPT.pdf --outfile 7.OPT-2.pdf;
27 | pdfjam --nup 2x2 7.OPT.pdf --outfile 7.OPT-4.pdf --landscape)
28 | 
29 | (cd 08.Model_Data;
30 | pdfjam --nup 1x2 8.MMD.pdf --outfile 8.MMD-2.pdf;
31 | pdfjam --nup 2x2 8.MMD.pdf --outfile 8.MMD-4.pdf --landscape)
32 | 
33 | (cd 09.LinearRegression;
34 | pdfjam --nup 1x2 9.LR.pdf --outfile 9.LR-2.pdf;
35 | pdfjam --nup 2x2 9.LR.pdf --outfile 9.LR-4.pdf --landscape)
36 | 
37 | (cd 10.PCA;
38 | pdfjam --nup 1x2 10.PCA.pdf --outfile 10.PCA-2.pdf;
39 | pdfjam --nup 2x2 10.PCA.pdf --outfile 10.PCA-4.pdf --landscape)
40 | 
41 | (cd 11.DensityEstimation;
42 | pdfjam --nup 1x2 11.GMM.pdf --outfile 11.GMM-2.pdf;
43 | pdfjam --nup 2x2 11.GMM.pdf --outfile 11.GMM-4.pdf --landscape)
44 | 
45 | (cd 12.SVM;
46 | pdfjam --nup 1x2 12.SVM.pdf --outfile 12.SVM-2.pdf;
47 | pdfjam --nup 2x2 12.SVM.pdf --outfile 12.SVM-4.pdf --landscape)
48 | 
49 | 
--------------------------------------------------------------------------------
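print.sh above simply lays out each lecture deck as printable handouts with pdfjam: a 2-up portrait version (--nup 1x2) and a 4-up landscape version (--nup 2x2) are written next to the original PDF in each lecture directory. It is normally run from the repository root (e.g. "sh print.sh") after the slide PDFs have been compiled. The loop below is only a condensed sketch of the same procedure, assuming the directory/file pairs that appear in the script; it is not part of the repository.

#!/bin/sh
# Hypothetical loop-based equivalent of print.sh (sketch only): for each
# lecture deck <dir>/<name>.pdf, produce <name>-2.pdf (2-up) and
# <name>-4.pdf (4-up, landscape) inside the same directory.
for pair in 01.Introduction/1.intro 02.LinearAlgebra/2.LA 03.Geometry/3.AG \
            04.MatrixDecomposition/4.MD 05.VectorCaculus/5.VC 06.Probability/6.PD \
            07.Optimization/7.OPT 08.Model_Data/8.MMD 09.LinearRegression/9.LR \
            10.PCA/10.PCA 11.DensityEstimation/11.GMM 12.SVM/12.SVM; do
  dir=${pair%/*}; name=${pair#*/}
  (cd "$dir" &&
   pdfjam --nup 1x2 "$name.pdf" --outfile "$name-2.pdf" &&
   pdfjam --nup 2x2 "$name.pdf" --outfile "$name-4.pdf" --landscape)
done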