├── Spring2021
    ├── figure
    │   ├── Lecture03
    │   │   └── statsdata.txt
    │   ├── Lecture01
    │   │   ├── data2.dat
    │   │   ├── data3.dat
    │   │   ├── data4.dat
    │   │   └── data.dat
    │   ├── Lecture04
    │   │   ├── graph.png
    │   │   ├── knn10.png
    │   │   ├── knn50.png
    │   │   ├── knn400.png
    │   │   └── image_flip.png
    │   ├── Lecture08
    │   │   ├── fig-1.pdf
    │   │   └── fig-2.pdf
    │   └── Lecture06
    │   │   └── discretization.png
    ├── pdf
    │   ├── 04-02-2021.pdf
    │   ├── 04-09-2021.pdf
    │   ├── 04-16-2021.pdf
    │   ├── 04-23-2021.pdf
    │   ├── 04-30-2021.pdf
    │   ├── 05-07-2021.pdf
    │   ├── 05-14-2021.pdf
    │   ├── 05-21-2021.pdf
    │   ├── 05-28-2021.pdf
    │   └── 06-04-2021.pdf
    ├── bibliography.bib
    ├── template_final.tex
    ├── macros_final.tex
    ├── 04-30-2021.tex
    ├── 04-02-2021.tex
    ├── 04-16-2021.tex
    ├── 06-04-2021.tex
    ├── 05-07-2021.tex
    ├── 05-21-2021.tex
    ├── 04-23-2021.tex
    └── 05-14-2021.tex
├── Templates
    ├── yoursunetID2.tex
    ├── yoursunetID.tex
    ├── template.tex
    ├── macros.tex
    └── master.tex
└── .gitignore


/Spring2021/figure/Lecture03/statsdata.txt:
--------------------------------------------------------------------------------
1 | 1,2
2 | 2,3.5
3 | 2.5, 5
4 | 3.25,6
5 | 4,4
6 | 5,3.25
7 | 6,3


--------------------------------------------------------------------------------
/Spring2021/pdf/04-02-2021.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tengyuma/stats205_notes/HEAD/Spring2021/pdf/04-02-2021.pdf


--------------------------------------------------------------------------------
/Spring2021/pdf/04-09-2021.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tengyuma/stats205_notes/HEAD/Spring2021/pdf/04-09-2021.pdf


--------------------------------------------------------------------------------
/Spring2021/pdf/04-16-2021.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tengyuma/stats205_notes/HEAD/Spring2021/pdf/04-16-2021.pdf


--------------------------------------------------------------------------------
/Spring2021/pdf/04-23-2021.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tengyuma/stats205_notes/HEAD/Spring2021/pdf/04-23-2021.pdf


--------------------------------------------------------------------------------
/Spring2021/pdf/04-30-2021.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tengyuma/stats205_notes/HEAD/Spring2021/pdf/04-30-2021.pdf


--------------------------------------------------------------------------------
/Spring2021/pdf/05-07-2021.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tengyuma/stats205_notes/HEAD/Spring2021/pdf/05-07-2021.pdf


--------------------------------------------------------------------------------
/Spring2021/pdf/05-14-2021.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tengyuma/stats205_notes/HEAD/Spring2021/pdf/05-14-2021.pdf


--------------------------------------------------------------------------------
/Spring2021/pdf/05-21-2021.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tengyuma/stats205_notes/HEAD/Spring2021/pdf/05-21-2021.pdf


--------------------------------------------------------------------------------
/Spring2021/pdf/05-28-2021.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tengyuma/stats205_notes/HEAD/Spring2021/pdf/05-28-2021.pdf


--------------------------------------------------------------------------------
/Spring2021/pdf/06-04-2021.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tengyuma/stats205_notes/HEAD/Spring2021/pdf/06-04-2021.pdf


--------------------------------------------------------------------------------
/Spring2021/figure/Lecture01/data2.dat:
--------------------------------------------------------------------------------
1 | X $Y_1$
2 | 1 1.5
3 | 2 2
4 | 3 3
5 | 4 2
6 | 2.3 2.125
7 | 1.7 1.85
8 | 2.4 1.9
9 | 1.8 2.175


--------------------------------------------------------------------------------
/Spring2021/figure/Lecture04/graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tengyuma/stats205_notes/HEAD/Spring2021/figure/Lecture04/graph.png


--------------------------------------------------------------------------------
/Spring2021/figure/Lecture04/knn10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tengyuma/stats205_notes/HEAD/Spring2021/figure/Lecture04/knn10.png


--------------------------------------------------------------------------------
/Spring2021/figure/Lecture04/knn50.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tengyuma/stats205_notes/HEAD/Spring2021/figure/Lecture04/knn50.png


--------------------------------------------------------------------------------
/Spring2021/figure/Lecture08/fig-1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tengyuma/stats205_notes/HEAD/Spring2021/figure/Lecture08/fig-1.pdf


--------------------------------------------------------------------------------
/Spring2021/figure/Lecture08/fig-2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tengyuma/stats205_notes/HEAD/Spring2021/figure/Lecture08/fig-2.pdf


--------------------------------------------------------------------------------
/Spring2021/figure/Lecture04/knn400.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tengyuma/stats205_notes/HEAD/Spring2021/figure/Lecture04/knn400.png


--------------------------------------------------------------------------------
/Spring2021/figure/Lecture01/data3.dat:
--------------------------------------------------------------------------------
 1 | X $Y_1$
 2 | -2 0
 3 | -1 0
 4 | -1 0.5
 5 | 0 0.5 
 6 | 1 0.5
 7 | 1 0 
 8 | 2 0 
 9 | 3 0 
10 | -3 0


--------------------------------------------------------------------------------
/Spring2021/figure/Lecture01/data4.dat:
--------------------------------------------------------------------------------
 1 | Y $X_1$
 2 | 1 1
 3 | 1.2 2
 4 | 0.8 3
 5 | 0.9 4
 6 | 1.1 5
 7 | 0.9 6
 8 | 1.3 7
 9 | 0.75 8
10 | 1 9


--------------------------------------------------------------------------------
/Spring2021/figure/Lecture04/image_flip.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tengyuma/stats205_notes/HEAD/Spring2021/figure/Lecture04/image_flip.png


--------------------------------------------------------------------------------
/Spring2021/figure/Lecture06/discretization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tengyuma/stats205_notes/HEAD/Spring2021/figure/Lecture06/discretization.png


--------------------------------------------------------------------------------
/Spring2021/figure/Lecture01/data.dat:
--------------------------------------------------------------------------------
1 | Y $X_1$ $X_2$ $X_3$
2 | -.1 -5 0.475 0.475 
3 | -1.3 -4 -.95 -.5
4 | 0.6  -1.4 0.24  0.24
5 | 1.35  0. 1 0.5
6 | 0.70  1.4 0.27 0.27
7 | -0.5 3 -.75 0.27
8 | -0.1  5 0.1 0.27


--------------------------------------------------------------------------------
/Templates/yoursunetID2.tex:
--------------------------------------------------------------------------------
 1 | %\newcommand{\Exp}{\mathbb{E}}
 2 | 
 3 | % reset section counter
 4 | \setcounter{section}{0}
 5 | 
 6 | \metadata{2}{Mary and Alex}{Jan 3rd, 2021}
 7 | 
 8 | \sec{Review and Overview}
 9 | 
10 | \begin{enumerate}
11 | 	\item If appropriate, one paragraph to briefly review the connection to previous lectures.
12 | 	\item An overview paragraph that summarizes the main idea of the lecture at a high-level. 
13 | \end{enumerate}  
14 | \sec{Macros for frequently used notations}
15 | Please try to reuse the macros defined below to ensure consistency.
16 | \begin{itemize}
17 | 	\item $\Exp$, 
18 | 	\al{
19 | 		\E_{x\sim P}, \Exp_{x\sim P} 
20 | 	}
21 | 	\item $\Pr[X=1\vert Y=2]$
22 | 	\item 
23 | 	\al{
24 | 		\argmin_{x: x\ge 1}
25 | 	}
26 | 	\item 
27 | 	$\theta$, $\theta^\star$, $\thetaerm$, 
28 | 	\item 
29 | 	$\cX,\cY, \cH, \cF$
30 | 	\item $x\sp{1}, y\sp{k}$
31 | 	\item 
32 | 	$x\in \R^3, \bbZ$
33 | 	\item $\err(\theta)$
34 | 	\item $O(\cdot)$, $\tilO(\cdot)$
35 | 	\item $\iid$
36 | 	\item $\norm{x}, \Norm{x^{2^3}}$, $\norm{x}_{2}$
37 | 	\item 
38 | \end{itemize}
39 | \begin{theorem}
40 | 	..
41 | \end{theorem}
42 | \begin{lemma}
43 | 	...
44 | \end{lemma}
45 | 
46 | 
47 | 	
48 | \lipsum
49 | %\subsection{}


--------------------------------------------------------------------------------
/Templates/yoursunetID.tex:
--------------------------------------------------------------------------------
 1 | %\newcommand{\Exp}{\mathbb{E}}
 2 | 
 3 | % reset section counter
 4 | \setcounter{section}{0}
 5 | 
 6 | %\metadata{lecture ID}{Your names}{date}
 7 | \metadata{1}{Alice and Bob}{Jan 1st, 2021}
 8 | 
 9 | \sec{Review and Overview}
10 | 
11 | \begin{enumerate}
12 | 	\item If appropriate, one paragraph to briefly review the connection to previous lectures.
13 | 	\item An overview paragraph that summarizes the main idea of the lecture at a high-level. 
14 | \end{enumerate}  
15 | \sec{Macros for frequently used notations}
16 | Please try to reuse the macros defined below to ensure consistency. {\color{blue}We encourage you to use macros frequently which could save a lot of time typing the equations and also help address notation inconsistency. }
17 | \begin{itemize}
18 | 	\item $\Exp$, 
19 | 	\al{
20 | 		\E_{x\sim P}, \Exp_{x\sim P} 
21 | 	}
22 | 	\item $\Pr[X=1\vert Y=2]$
23 | 	\item 
24 | 	\al{
25 | 		\argmin_{x: x\ge 1}
26 | 	}
27 | 	\item 
28 | 	$\theta$, $\theta^\star$, $\thetaerm$, 
29 | 	\item 
30 | 	$\cX,\cY, \cH, \cF$
31 | 	\item $x\sp{1}, y\sp{k}$
32 | 	\item 
33 | 	$x\in \R^3, \bbZ$
34 | 	\item $\err(\theta)$
35 | 	\item $O(\cdot)$, $\tilO(\cdot)$
36 | 	\item $\iid$
37 | 	\item $\norm{x}, \Norm{x^{2^3}}$, $\norm{x}_{2}$
38 | 	\item $x^\top$ 
39 | \end{itemize}
40 | \begin{theorem}
41 | 	..
42 | \end{theorem}
43 | \begin{lemma}
44 | 	...
45 | \end{lemma}
46 | 
47 | 
48 | 	
49 | \lipsum
50 | %\subsection{}


--------------------------------------------------------------------------------
/Spring2021/bibliography.bib:
--------------------------------------------------------------------------------
 1 | @misc{ enwiki:987099180,
 2 | 	author = "{Wikipedia contributors}",
 3 | 	title = "Mercer's theorem --- {Wikipedia}{,} The Free Encyclopedia",
 4 | 	year = "2020",
 5 | 	url = "https://en.wikipedia.org/w/index.php?title=Mercer%27s_theorem&oldid=987099180",
 6 | 	note = "[Online; accessed 29-April-2021]"
 7 | }
 8 | 
 9 | @misc{wei2020regularization,
10 | 	title={Regularization Matters: Generalization and Optimization of Neural Nets v.s. their Induced Kernel}, 
11 | 	author={Colin Wei and Jason D. Lee and Qiang Liu and Tengyu Ma},
12 | 	year={2020},
13 | 	eprint={1810.05369},
14 | 	archivePrefix={arXiv},
15 | 	primaryClass={stat.ML}
16 | }
17 | 
18 | @inproceedings{deng2009imagenet,
19 | 	title={Imagenet: A large-scale hierarchical image database},
20 | 	author={Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Fei-Fei, Li},
21 | 	booktitle={2009 IEEE conference on computer vision and pattern recognition},
22 | 	pages={248--255},
23 | 	year={2009},
24 | 	organization={Ieee}
25 | }
26 | 
27 | @book{johnsonbaugh_foundations_2010,
28 | 	address = {Mineola, N.Y},
29 | 	edition = {Dover ed},
30 | 	series = {Dover books on mathematics},
31 | 	title = {Foundations of mathematical analysis},
32 | 	isbn = {9780486477664},
33 | 	publisher = {Dover Publications},
34 | 	author = {Johnsonbaugh, Richard and Pfaffenberger, W. E.},
35 | 	year = {2010},
36 | 	note = {OCLC: ocn463454165},
37 | 	keywords = {Mathematical analysis, Foundations},
38 | }
39 | 
40 | @book{axler_linear_2014,
41 | 	address = {New York},
42 | 	title = {Linear algebra done right},
43 | 	isbn = {9783319110790},
44 | 	publisher = {Springer},
45 | 	author = {Axler, Sheldon},
46 | 	year = {2014},
47 | }
48 | 
49 | @misc{owen_lecture_2018,
50 | 	title = {Lecture 6: {Bayesian} estimation},
51 | 	author = {Owen, Art},
52 | 	month = oct,
53 | 	year = {2018},
54 | 	note = {Unpublished lecture notes from STATS 200. },
55 | }


--------------------------------------------------------------------------------
/Templates/template.tex:
--------------------------------------------------------------------------------
 1 | 	\documentclass[11pt]{book}
 2 | 	
 3 | 	\usepackage{amsfonts,amsthm, bm,amsmath, bbm,amssymb,mathtools}
 4 | 	\usepackage{fullpage}
 5 | 	
 6 | 	
 7 | 	\newtheorem{theorem}{Theorem}[chapter]
 8 | 	\newtheorem{lemma}[theorem]{Lemma}
 9 | 	
10 | 	\theoremstyle{definition}
11 | 	\newtheorem{definition}[theorem]{Definition}
12 | 	\newtheorem{example}[theorem]{Example}
13 | 	\newtheorem{xca}[theorem]{Exercise}
14 | 	\newtheorem{corollary}[theorem]{Corollary}  % added for Lecture 5
15 | 	\newtheorem{proposition}{Proposition}[section]  % added for Lecture 6
16 | 	
17 | 	\theoremstyle{remark}
18 | 	\newtheorem{remark}[theorem]{Remark}
19 | 	
20 | 	\numberwithin{section}{chapter}
21 | 	\numberwithin{equation}{chapter}
22 | 	
23 | 	\makeindex
24 | 	
25 | 	\def\lectureformat{1}
26 | 	\input{macros}
27 | 	\begin{document}
28 | 	
29 | 	\frontmatter
30 | 	
31 | 	\mainmatter
32 | 	\let\sec\section
33 | 	\let\subsec\subsection
34 | 	
35 | 	\newcommand{\secwarning}[1]{
36 | 		{	
37 | 			\color{red}
38 | 			$\backslash$section and $\backslash$subsection are disallowed, please use 	$\backslash$sec and $\backslash$subsec instead
39 | 		}
40 | 	}
41 | 	\let\section\secwarning
42 | 	\let\subsection\secwarning
43 | 	
44 | 	
45 | 	\newcommand{\draftnotice}{\vbox to 0.25in{\noindent
46 | 			\raisebox{0.6in}[0in][0in]{\makebox[\textwidth][r]{\it
47 | 					DRAFT --- a final version will be posted shortly}}}
48 | 		\vspace{-.25in}\vspace{-\baselineskip}
49 | 	}
50 | 	
51 | 	%\section{}
52 | 	\input{yoursunetID}
53 | 	
54 | 	\input{yoursunetID2}
55 | 	
56 | 	%    Include main chapters here.
57 | 	%\include{}
58 | 	\appendix
59 | 	%    Include appendix "chapters" here.
60 | 	
61 | 	
62 | 	\backmatter
63 | 	%    Bibliography styles amsplain or harvard are also acceptable.
64 | 	\bibliographystyle{amsalpha}
65 | 	\bibliography{}
66 | 	%    See note above about multiple indexes.
67 | %	\printindex
68 | 	
69 | 	\end{document}
70 | 	
71 | 	%-----------------------------------------------------------------------
72 | 	% End of amsbook-template.tex
73 | 	%-----------------------------------------------------------------------
74 | 


--------------------------------------------------------------------------------
/Spring2021/template_final.tex:
--------------------------------------------------------------------------------
 1 | 	\documentclass[11pt]{book}
 2 | 	
 3 | 	\usepackage{amsfonts,amsthm, bm,amsmath, bbm,amssymb,mathtools}
 4 | 	\usepackage{fullpage}
 5 | 	\usepackage{tikz, pgfplots} % added for Lecture 2	
 6 | 	\usepackage{float}  % added for Lecture 8
 7 | 	\usepackage[ruled,vlined,linesnumbered]{algorithm2e}  % added for Lecture 15
 8 | 	\usepackage{booktabs}  % added for Lecture 15
 9 | 	
10 | 	\newtheorem{theorem}{Theorem}[chapter]
11 | 	\newtheorem{lemma}[theorem]{Lemma}
12 | 	
13 | 	\theoremstyle{definition}
14 | 	\newtheorem{definition}[theorem]{Definition}
15 | 	\newtheorem{example}[theorem]{Example}
16 | 	\newtheorem{xca}[theorem]{Exercise}
17 | 	\newtheorem{corollary}[theorem]{Corollary}  % added for Lecture 5
18 | 	\newtheorem{proposition}{Proposition}[section]  % added for Lecture 6
19 | 		
20 | 	\theoremstyle{remark}
21 | 	\newtheorem{remark}[theorem]{Remark}
22 | 	
23 | 	\numberwithin{section}{chapter}
24 | 	\numberwithin{equation}{chapter}
25 | 	
26 | 	\makeindex
27 | 	
28 | 	\def\lectureformat{1}
29 | 	\input{macros_final}
30 | 	\begin{document}
31 | 	
32 | 	\frontmatter
33 | 	
34 | 	\mainmatter
35 | 	\let\sec\section
36 | 	\let\subsec\subsection
37 | 	
38 | 	\newcommand{\secwarning}[1]{
39 | 		{	
40 | 			\color{red}
41 | 			$\backslash$section and $\backslash$subsection are disallowed, please use 	$\backslash$sec and $\backslash$subsec instead
42 | 		}
43 | 	}
44 | 	\let\section\secwarning
45 | 	\let\subsection\secwarning
46 | 	
47 | 	
48 | 	\newcommand{\draftnotice}{\vbox to 0.25in{\noindent
49 | 			\raisebox{0.6in}[0in][0in]{\makebox[\textwidth][r]{\it
50 | 					DRAFT --- a final version will be posted shortly}}}
51 | 		\vspace{-.25in}\vspace{-\baselineskip}
52 | 	}
53 | 	
54 | 	%\section{}
55 | 	\input{04-02-2021.tex}
56 | 
57 | 	
58 | 	%    Include main chapters here.
59 | 	%\include{}
60 | 	\appendix
61 | 	%    Include appendix "chapters" here.
62 | 	
63 | 	
64 | 	%\backmatter
65 | 	%    Bibliography styles amsplain or harvard are also acceptable.
66 | 	\bibliographystyle{amsalpha}
67 | 	%\bibliography{bibliography}
68 | 	%    See note above about multiple indexes.
69 | %	\printindex
70 | 	
71 | 	\end{document}
72 | 	
73 | 	%-----------------------------------------------------------------------
74 | 	% End of amsbook-template.tex
75 | 	%-----------------------------------------------------------------------
76 | 


--------------------------------------------------------------------------------
/Templates/macros.tex:
--------------------------------------------------------------------------------
  1 | \usepackage{color}
  2 | \usepackage{lipsum}
  3 | 
  4 | 
  5 | 
  6 | \ifnum\lectureformat=1
  7 | \newcommand{\metadata}[3]
  8 | {
  9 | 	\newpage
 10 | 	
 11 | 	\def\lectureID{#1}
 12 | 	
 13 | 	\setcounter{chapter}{\lectureID}
 14 | 
 15 | 	\draftnotice
 16 | 	
 17 | 	\begin{center}
 18 | 		\bf\large STATS205: Introduction to Nonparametric Statistics
 19 | 	\end{center}
 20 | 	
 21 | 	\noindent
 22 | 	Lecturer: Tengyu Ma   %%% FILL IN LECTURER (if not RS)
 23 | 	\hfill
 24 | 	Lecture \# \lectureID              %%% FILL IN LECTURE NUMBER HERE
 25 | 	\\
 26 | 	Scribe: #2                  %%% FILL IN YOUR NAME HERE
 27 | 	\hfill
 28 | 	#3           %%% FILL IN LECTURE DATE HERE
 29 | 	
 30 | 	\noindent
 31 | 	\rule{\textwidth}{1pt}
 32 | 	
 33 | 	\medskip
 34 | }
 35 | \else 
 36 | \newcommand{\metadata}[3]{}
 37 | \fi
 38 | 
 39 | \DeclareMathOperator*{\Exp}{\mathbb{E}}
 40 | \DeclareMathOperator*{\argmin}{\textup{argmin}}
 41 | \DeclareMathOperator*{\argmax}{\textup{argmax}}
 42 | \newcommand{\E}{\mathbb{E}}
 43 | 
 44 | \newcommand{\err}{\ell_{\textup{0-1}}}
 45 | \newcommand{\thetaerm}{\theta_{\textup{ERM}}}
 46 | \newcommand{\hatL}{\widehat{L}}
 47 | \newcommand{\tilO}{\widetilde{O}}
 48 | \newcommand{\iid}{\overset{\textup{iid}}{\sim}}
 49 | 
 50 | \newcommand{\norm}[1]{\|#1\|}
 51 | \newcommand{\Norm}[1]{\left\|#1\right\|}
 52 | 
 53 | 
 54 | \newcommand{\al}[1]{
 55 | 	\begin{align}
 56 | 	#1
 57 | 	\end{align}
 58 | }
 59 | 
 60 | 
 61 | \renewcommand{\sp}[1]{^{(#1)}}
 62 | 
 63 | \newcommand{\cA}{\mathcal A}
 64 | \newcommand{\cB}{\mathcal B}
 65 | \newcommand{\cC}{\mathcal C}
 66 | \newcommand{\cD}{\mathcal D}
 67 | \newcommand{\cE}{\mathcal E}
 68 | \newcommand{\cF}{\mathcal F}
 69 | \newcommand{\cG}{\mathcal G}
 70 | \newcommand{\cH}{\mathcal H}
 71 | \newcommand{\cI}{\mathcal I}
 72 | \newcommand{\cJ}{\mathcal J}
 73 | \newcommand{\cK}{\mathcal K}
 74 | \newcommand{\cL}{\mathcal L}
 75 | \newcommand{\cM}{\mathcal M}
 76 | \newcommand{\cN}{\mathcal N}
 77 | \newcommand{\cO}{\mathcal O}
 78 | \newcommand{\cP}{\mathcal P}
 79 | \newcommand{\cQ}{\mathcal Q}
 80 | \newcommand{\cR}{\mathcal R}
 81 | \newcommand{\cS}{\mathcal S}
 82 | \newcommand{\cT}{\mathcal T}
 83 | \newcommand{\cU}{\mathcal U}
 84 | \newcommand{\cV}{\mathcal V}
 85 | \newcommand{\cW}{\mathcal W}
 86 | \newcommand{\cX}{\mathcal X}
 87 | \newcommand{\cY}{\mathcal Y}
 88 | \newcommand{\cZ}{\mathcal Z}
 89 | 
 90 | \newcommand{\bbB}{\mathbb B}
 91 | \newcommand{\bbS}{\mathbb S}
 92 | \newcommand{\bbR}{\mathbb R}
 93 | \newcommand{\bbZ}{\mathbb Z}
 94 | \newcommand{\bbI}{\mathbb I}
 95 | \newcommand{\bbQ}{\mathbb Q}
 96 | \newcommand{\bbP}{\mathbb P}
 97 | \newcommand{\bbE}{\mathbb E}
 98 | \newcommand{\bbN}{\mathbb N}
 99 | 
100 | \newcommand{\R}{\bbR}


--------------------------------------------------------------------------------
/Templates/master.tex:
--------------------------------------------------------------------------------
  1 | %% filename: amsbook-template.tex
  2 | %% version: 1.1
  3 | %% date: 2014/07/24
  4 | %%
  5 | %% American Mathematical Society
  6 | %% Technical Support
  7 | %% Publications Technical Group
  8 | %% 201 Charles Street
  9 | %% Providence, RI 02904
 10 | %% USA
 11 | %% tel: (401) 455-4080
 12 | %%      (800) 321-4267 (USA and Canada only)
 13 | %% fax: (401) 331-3842
 14 | %% email: tech-support@ams.org
 15 | %% 
 16 | %% Copyright 2006, 2008-2010, 2014 American Mathematical Society.
 17 | %% 
 18 | %% This work may be distributed and/or modified under the
 19 | %% conditions of the LaTeX Project Public License, either version 1.3c
 20 | %% of this license or (at your option) any later version.
 21 | %% The latest version of this license is in
 22 | %%   http://www.latex-project.org/lppl.txt
 23 | %% and version 1.3c or later is part of all distributions of LaTeX
 24 | %% version 2005/12/01 or later.
 25 | %% 
 26 | %% This work has the LPPL maintenance status `maintained'.
 27 | %% 
 28 | %% The Current Maintainer of this work is the American Mathematical
 29 | %% Society.
 30 | %%
 31 | %% ====================================================================
 32 | 
 33 | %    AMS-LaTeX v.2 driver file template for use with amsbook
 34 | %
 35 | %    Remove any commented or uncommented macros you do not use.
 36 | 
 37 | \documentclass{book}
 38 | \usepackage{amsfonts,bm, amsthm, amsmath}
 39 | 
 40 | 
 41 | \newtheorem{theorem}{Theorem}[chapter]
 42 | \newtheorem{lemma}[theorem]{Lemma}
 43 | 
 44 | \theoremstyle{definition}
 45 | \newtheorem{definition}[theorem]{Definition}
 46 | \newtheorem{example}[theorem]{Example}
 47 | \newtheorem{xca}[theorem]{Exercise}
 48 | 
 49 | \theoremstyle{remark}
 50 | \newtheorem{remark}[theorem]{Remark}
 51 | 
 52 | \numberwithin{section}{chapter}
 53 | \numberwithin{equation}{chapter}
 54 | 
 55 | %    For a single index; for multiple indexes, see the manual
 56 | %    "Instructions for preparation of papers and monographs:
 57 | %    AMS-LaTeX" (instr-l.pdf in the AMS-LaTeX distribution).
 58 | \makeindex
 59 | \def\lectureformat{0}
 60 | \input{macros}
 61 | \begin{document}
 62 | 
 63 | \frontmatter
 64 | 
 65 | \title{Lecture Notes for Introduction to Nonparametric Statistics (STATS205)}
 66 | 
 67 | %    Remove any unused author tags.
 68 | 
 69 | %    author one information
 70 | \author{Instructor: Tengyu Ma}
 71 | %\address{}
 72 | %\curraddr{}
 73 | %\email{}
 74 | \thanks{}
 75 | 
 76 | %    author two information
 77 | %\author{}
 78 | %\address{}
 79 | %\curraddr{}
 80 | %\email{}
 81 | %\thanks{}
 82 | 
 83 | %\subjclass[2010]{Primary }
 84 | 
 85 | %\keywords{}
 86 | 
 87 | %\date{}
 88 | 
 89 | %\begin{abstract}
 90 | %\end{abstract}
 91 | 
 92 | \maketitle
 93 | 
 94 | %    Dedication.  If the dedication is longer than a line or two,
 95 | %    remove the centering instructions and the line break.
 96 | %\cleardoublepage
 97 | %\thispagestyle{empty}
 98 | %\vspace*{13.5pc}
 99 | %\begin{center}
100 | %  Dedication text (use \\[2pt] for line break if necessary)
101 | %\end{center}
102 | %\cleardoublepage
103 | 
104 | %    Change page number to 6 if a dedication is present.
105 | \setcounter{page}{4}
106 | 
107 | \tableofcontents
108 | 
109 | %    Include unnumbered chapters (preface, acknowledgments, etc.) here.
110 | %\include{}
111 | \mainmatter
112 | \let\sec\section
113 | \let\subsec\subsection
114 | 
115 | \chapter{Introduction to nonparametric regression}
116 | %\section{}
117 | \input{yoursunetID}
118 | \input{yoursunetID2}
119 | 
120 | %    Include main chapters here.
121 | %\include{}
122 | ,,
123 | \appendix
124 | %    Include appendix "chapters" here.
125 | 
126 | 
127 | \backmatter
128 | %    Bibliography styles amsplain or harvard are also acceptable.
129 | \bibliographystyle{amsalpha}
130 | \bibliography{}
131 | %    See note above about multiple indexes.
132 | %\printindex
133 | 
134 | \end{document}
135 | 
136 | %-----------------------------------------------------------------------
137 | % End of amsbook-template.tex
138 | %-----------------------------------------------------------------------
139 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | ## Core latex/pdflatex auxiliary files:
  2 | *.aux
  3 | *.lof
  4 | *.log
  5 | *.lot
  6 | *.fls
  7 | *.out
  8 | *.toc
  9 | *.fmt
 10 | *.fot
 11 | *.cb
 12 | *.cb2
 13 | .*.lb
 14 | 
 15 | ## Intermediate documents:
 16 | *.dvi
 17 | *.xdv
 18 | *-converted-to.*
 19 | # these rules might exclude image files for figures etc.
 20 | # *.ps
 21 | # *.eps
 22 | # *.pdf
 23 | 
 24 | ## Generated if empty string is given at "Please type another file name for output:"
 25 | .pdf
 26 | 
 27 | ## Bibliography auxiliary files (bibtex/biblatex/biber):
 28 | *.bbl
 29 | *.bcf
 30 | *.blg
 31 | *-blx.aux
 32 | *-blx.bib
 33 | *.run.xml
 34 | 
 35 | ## Build tool auxiliary files:
 36 | *.fdb_latexmk
 37 | *.synctex
 38 | *.synctex(busy)
 39 | *.synctex.gz
 40 | *.synctex.gz(busy)
 41 | *.pdfsync
 42 | 
 43 | ## Build tool directories for auxiliary files
 44 | # latexrun
 45 | latex.out/
 46 | 
 47 | ## Auxiliary and intermediate files from other packages:
 48 | # algorithms
 49 | *.alg
 50 | *.loa
 51 | 
 52 | # achemso
 53 | acs-*.bib
 54 | 
 55 | # amsthm
 56 | *.thm
 57 | 
 58 | # beamer
 59 | *.nav
 60 | *.pre
 61 | *.snm
 62 | *.vrb
 63 | 
 64 | # changes
 65 | *.soc
 66 | 
 67 | # comment
 68 | *.cut
 69 | 
 70 | # cprotect
 71 | *.cpt
 72 | 
 73 | # elsarticle (documentclass of Elsevier journals)
 74 | *.spl
 75 | 
 76 | # endnotes
 77 | *.ent
 78 | 
 79 | # fixme
 80 | *.lox
 81 | 
 82 | # feynmf/feynmp
 83 | *.mf
 84 | *.mp
 85 | *.t[1-9]
 86 | *.t[1-9][0-9]
 87 | *.tfm
 88 | 
 89 | #(r)(e)ledmac/(r)(e)ledpar
 90 | *.end
 91 | *.?end
 92 | *.[1-9]
 93 | *.[1-9][0-9]
 94 | *.[1-9][0-9][0-9]
 95 | *.[1-9]R
 96 | *.[1-9][0-9]R
 97 | *.[1-9][0-9][0-9]R
 98 | *.eledsec[1-9]
 99 | *.eledsec[1-9]R
100 | *.eledsec[1-9][0-9]
101 | *.eledsec[1-9][0-9]R
102 | *.eledsec[1-9][0-9][0-9]
103 | *.eledsec[1-9][0-9][0-9]R
104 | 
105 | # glossaries
106 | *.acn
107 | *.acr
108 | *.glg
109 | *.glo
110 | *.gls
111 | *.glsdefs
112 | *.lzo
113 | *.lzs
114 | 
115 | # uncomment this for glossaries-extra (will ignore makeindex's style files!)
116 | # *.ist
117 | 
118 | # gnuplottex
119 | *-gnuplottex-*
120 | 
121 | # gregoriotex
122 | *.gaux
123 | *.gtex
124 | 
125 | # htlatex
126 | *.4ct
127 | *.4tc
128 | *.idv
129 | *.lg
130 | *.trc
131 | *.xref
132 | 
133 | # hyperref
134 | *.brf
135 | 
136 | # knitr
137 | *-concordance.tex
138 | # TODO Comment the next line if you want to keep your tikz graphics files
139 | *.tikz
140 | *-tikzDictionary
141 | 
142 | # listings
143 | *.lol
144 | 
145 | # luatexja-ruby
146 | *.ltjruby
147 | 
148 | # makeidx
149 | *.idx
150 | *.ilg
151 | *.ind
152 | 
153 | # minitoc
154 | *.maf
155 | *.mlf
156 | *.mlt
157 | *.mtc[0-9]*
158 | *.slf[0-9]*
159 | *.slt[0-9]*
160 | *.stc[0-9]*
161 | 
162 | # minted
163 | _minted*
164 | *.pyg
165 | 
166 | # morewrites
167 | *.mw
168 | 
169 | # nomencl
170 | *.nlg
171 | *.nlo
172 | *.nls
173 | 
174 | # pax
175 | *.pax
176 | 
177 | # pdfpcnotes
178 | *.pdfpc
179 | 
180 | # sagetex
181 | *.sagetex.sage
182 | *.sagetex.py
183 | *.sagetex.scmd
184 | 
185 | # scrwfile
186 | *.wrt
187 | 
188 | # sympy
189 | *.sout
190 | *.sympy
191 | sympy-plots-for-*.tex/
192 | 
193 | # pdfcomment
194 | *.upa
195 | *.upb
196 | 
197 | # pythontex
198 | *.pytxcode
199 | pythontex-files-*/
200 | 
201 | # tcolorbox
202 | *.listing
203 | 
204 | # thmtools
205 | *.loe
206 | 
207 | # TikZ & PGF
208 | *.dpth
209 | *.md5
210 | *.auxlock
211 | 
212 | # todonotes
213 | *.tdo
214 | 
215 | # vhistory
216 | *.hst
217 | *.ver
218 | 
219 | # easy-todo
220 | *.lod
221 | 
222 | # xcolor
223 | *.xcp
224 | 
225 | # xmpincl
226 | *.xmpi
227 | 
228 | # xindy
229 | *.xdy
230 | 
231 | # xypic precompiled matrices and outlines
232 | *.xyc
233 | *.xyd
234 | 
235 | # endfloat
236 | *.ttt
237 | *.fff
238 | 
239 | # Latexian
240 | TSWLatexianTemp*
241 | 
242 | ## Editors:
243 | # WinEdt
244 | *.bak
245 | *.sav
246 | 
247 | # Texpad
248 | .texpadtmp
249 | 
250 | # LyX
251 | *.lyx~
252 | 
253 | # Kile
254 | *.backup
255 | 
256 | # gummi
257 | .*.swp
258 | 
259 | # KBibTeX
260 | *~[0-9]*
261 | 
262 | # TeXnicCenter
263 | *.tps
264 | 
265 | # auto folder when using emacs and auctex
266 | ./auto/*
267 | *.el
268 | 
269 | # expex forward references with \gathertags
270 | *-tags.tex
271 | 
272 | # standalone packages
273 | *.sta
274 | 
275 | # Makeindex log files
276 | *.lpz
277 | 


--------------------------------------------------------------------------------
/Spring2021/macros_final.tex:
--------------------------------------------------------------------------------
  1 | \usepackage{color}
  2 | \usepackage{lipsum}
  3 | 
  4 | \ifnum\lectureformat=1
  5 | \newcommand{\metadata}[3]
  6 | {
  7 | 	\newpage
  8 | 	
  9 | 	\def\lectureID{#1}
 10 | 	
 11 | 	\setcounter{chapter}{\lectureID}
 12 | 
 13 | %	\draftnotice
 14 | 	
 15 | 	\begin{center}
 16 | 		\bf\large STATS205: Introduction to Nonparametric Statistics
 17 | 	\end{center}
 18 | 	
 19 | 	\noindent
 20 | 	Lecturer: Tengyu Ma   %%% FILL IN LECTURER (if not RS)
 21 | 	\hfill
 22 | 	Lecture \# \lectureID              %%% FILL IN LECTURE NUMBER HERE
 23 | 	\\
 24 | 	Scribe: #2                  %%% FILL IN YOUR NAME HERE
 25 | 	\hfill
 26 | 	#3           %%% FILL IN LECTURE DATE HERE
 27 | 	
 28 | 	\noindent
 29 | 	\rule{\textwidth}{1pt}
 30 | 	
 31 | 	\medskip
 32 | }
 33 | \else 
 34 | \newcommand{\metadata}[3]{}
 35 | \fi
 36 | 
 37 | \newcommand*\circled[1]{\tikz[baseline=(char.base)]{
 38 | 	\node[shape=circle,draw,inner sep=2pt] (char) {#1};}}
 39 | 
 40 | \DeclareMathOperator*{\Exp}{\mathbb{E}}
 41 | \DeclareMathOperator*{\argmin}{\textup{argmin}}
 42 | \DeclareMathOperator*{\argmax}{\textup{argmax}}
 43 | 
 44 | \newcommand{\Cov}{\operatorname{Cov}}
 45 | \newcommand{\KL}{\operatorname{KL}}
 46 | \newcommand{\margin}{\text{margin}}
 47 | \newcommand{\poly}{\operatorname{poly}}
 48 | \newcommand{\sd}{\operatorname{sd}}
 49 | \newcommand{\sgn}{\text{sgn}}
 50 | \newcommand{\tr}{\operatorname{tr}}
 51 | \newcommand{\Var}{\operatorname{Var}}
 52 | 
 53 | \newcommand{\err}{\ell_{\textup{0-1}}}
 54 | \newcommand{\Err}{L_{\textup{0-1}}}
 55 | \newcommand{\thetaerm}{\theta_{\textup{ERM}}}
 56 | \newcommand{\hatL}{\widehat{L}}
 57 | \newcommand{\tilO}{\widetilde{O}}
 58 | \newcommand{\iid}{\overset{\textup{iid}}{\sim}}
 59 | \newcommand\defeq{\stackrel{\mathclap{\tiny \mbox{$\Delta$}}}{=}}
 60 | 
 61 | \newcommand{\gammamin}{\gamma_{\mathrm{min}}}
 62 | \newcommand{\phirelu}{\phi_{\textup{relu}}}
 63 | \newcommand{\supunitball}{\sup_{\overline{u}:\norm{\overline{u}}_2 \le 1}}
 64 | \newcommand{\ubar}{\overline{u}}
 65 | \newcommand{\thetazero}{\theta^{0}}
 66 | \newcommand{\popL}{L(\beta)}
 67 | \newcommand{\empL}{\hatL(\beta)}
 68 | \newcommand{\popLt}{L(\beta^t)}
 69 | \newcommand{\empLt}{\hatL(\beta^t)}
 70 | \newcommand{\yhat}[0]{\hat{y}}
 71 | 
 72 | \newcommand{\norm}[1]{\|#1\|}
 73 | \newcommand{\Norm}[1]{\left\|#1\right\|}
 74 | \renewcommand{\l}{\left}
 75 | \renewcommand{\r}{\right}
 76 | \newcommand{\rbr}[1]{\left(#1\right)}
 77 | \newcommand{\sbr}[1]{\left[#1\right]}
 78 | \newcommand{\cbr}[1]{\left\{#1\right\}}
 79 | \newcommand{\abs}[1]{\left\lvert#1\right\rvert}
 80 | \newcommand{\inprod}[1]{\left\langle#1\right\rangle}
 81 | 
 82 | \newcommand{\al}[1]{
 83 | 	\begin{align}
 84 | 	#1
 85 | 	\end{align}
 86 | }
 87 | 
 88 | \newcommand{\als}[1]{
 89 | 	\begin{align*}
 90 | 		#1
 91 | 	\end{align*}
 92 | }
 93 | 
 94 | \renewcommand{\sp}[1]{^{(#1)}}
 95 | 
 96 | \newcommand{\cA}{\mathcal A}
 97 | \newcommand{\cB}{\mathcal B}
 98 | \newcommand{\cC}{\mathcal C}
 99 | \newcommand{\cD}{\mathcal D}
100 | \newcommand{\cE}{\mathcal E}
101 | \newcommand{\cF}{\mathcal F}
102 | \newcommand{\cG}{\mathcal G}
103 | \newcommand{\cH}{\mathcal H}
104 | \newcommand{\cI}{\mathcal I}
105 | \newcommand{\cJ}{\mathcal J}
106 | \newcommand{\cK}{\mathcal K}
107 | \newcommand{\cL}{\mathcal L}
108 | \newcommand{\cM}{\mathcal M}
109 | \newcommand{\cN}{\mathcal N}
110 | \newcommand{\cO}{\mathcal O}
111 | \newcommand{\cP}{\mathcal P}
112 | \newcommand{\cQ}{\mathcal Q}
113 | \newcommand{\cR}{\mathcal R}
114 | \newcommand{\cS}{\mathcal S}
115 | \newcommand{\cT}{\mathcal T}
116 | \newcommand{\cU}{\mathcal U}
117 | \newcommand{\cV}{\mathcal V}
118 | \newcommand{\cW}{\mathcal W}
119 | \newcommand{\cX}{\mathcal X}
120 | \newcommand{\cY}{\mathcal Y}
121 | \newcommand{\cZ}{\mathcal Z}
122 | 
123 | \newcommand{\bbB}{\mathbb B}
124 | \newcommand{\bbS}{\mathbb S}
125 | \newcommand{\bbR}{\mathbb R}
126 | \newcommand{\bbZ}{\mathbb Z}
127 | \newcommand{\bbI}{\mathbb I}
128 | \newcommand{\bbQ}{\mathbb Q}
129 | \newcommand{\bbP}{\mathbb P}
130 | \newcommand{\bbE}{\mathbb E}
131 | \newcommand{\bbN}{\mathbb N}
132 | 
133 | \newcommand{\E}{\mathbb{E}}
134 | \newcommand{\N}{\mathbb{N}}
135 | \newcommand{\R}{\bbR}
136 | \newcommand{\Z}{\mathbb{Z}}
137 | 
138 | 
139 | % for course staff to edit or comment
140 | \def\shownotes{1}  %set 1 to show author notes
141 | \ifnum\shownotes=1
142 | \newcommand{\authnote}[2]{[#1: #2]}
143 | \else
144 | \newcommand{\authnote}[2]{}
145 | \fi
146 | \newcommand{\tnote}[1]{{\color{blue}\authnote{TM}{#1}}}
147 | 
148 | % for long term comments 
149 | \def\shownotes{0}  %set 1 to show author notes
150 | \ifnum\shownotes=1
151 | \newcommand{\authnotelong}[2]{[#1: #2]}
152 | \else
153 | \newcommand{\authnotelong}[2]{}
154 | \fi
155 | \newcommand{\tnotelong}[1]{{\color{blue}\authnotelong{TM}{#1}}}


--------------------------------------------------------------------------------
/Spring2021/04-30-2021.tex:
--------------------------------------------------------------------------------
  1 | %\newcommand{\Exp}{\mathbb{E}}
  2 | 
  3 | % reset section counter
  4 | \setcounter{section}{0}
  5 | 
  6 | 
  7 | %\metadata{lecture ID}{wer names}{date}
  8 | \metadata{5}{Cam Burton and Matt Johnson}{April 30th, 2021}
  9 | 
 10 | \sec{Review and overview}
 11 | 
 12 | 
 13 | In the previous lecture, we finished our discussion on natural cubic splines with different interpretations. In particular, we discussed interpreting splines as linear smoothers and how to use kernel estimation to estimate splines. We then began our discussion on nonparametric methods in higher dimensions, exploring the $k$-nearest neighbor algorithm and the kernel method. 
 14 | 
 15 | In this lecture we will finish our discussion on the kernel method and begin talking about neural networks. We will explore their connection to the kernel method and their practical implementation. 
 16 | \sec{More about kernel methods}
 17 | \subsec{Recap}
 18 | Let us quickly review the kernel method from the last lecture. The basic principle of the kernel method is that given a set of data points
 19 | $$
 20 | \left\{(x\sp{1}, y\sp{1}), \cdots,  (x\sp{n}, y\sp{n})\right\}, \quad x\sp{i} \in \mathbb{R}^d, y\sp{i} \in \mathbb{R},
 21 | $$
 22 | we look for a suitable feature map $\phi$ such that
 23 | $$
 24 | \phi: x \mapsto \phi(x) \in \R^m.
 25 | $$
 26 | 
 27 | Our interpretation of this feature map is that it is transforming the feature pairs in our dataset. If we run a linear regression or a logistic regression on the transformed dataset $(\phi(x^{(i)}), y^{(i)})$, then the algorithm only depends on the inner product (i.e. we don't need to know $\phi(x)$ or $\phi(z)$ explicitly). We only need to compute $\langle \phi(x) , \phi(z) \rangle$. This is called the kernel function
 28 | \begin{equation}
 29 | K(x,z):=	\langle \phi(x) , \phi(z) \rangle.
 30 | \end{equation}
 31 | If we can compute the kernel function directly, then we don't need to pay the computational overhead of computing the $\phi$ function/map explicitly. When the number of features is large, computing the feature map explicitly can be quite costly. 
 32 | 
 33 | \subsec{Another approach to kernel methods}
 34 | An alternate way of understanding the kernel method is to view each feature as a function of $x$, that is
 35 | $$
 36 | \phi(x)_k : x \rightarrow \mathbb{R},
 37 | $$
 38 | where
 39 | $$
 40 | \phi(x) = \begin{bmatrix}
 41 | 	\phi(x)_1 \\
 42 | 	\vdots \\
 43 | 	\phi(x)_m
 44 | \end{bmatrix}.
 45 | $$
 46 | An example could be the second degree polynomial kernel such that $\phi(x)_{(ij)} = x_i x_j$. We can also view the linear prediction function of our features as a linear combination of these functions 
 47 | $$
 48 | \theta^T \phi(x) = \sum_{i=1}^{m} \theta_i \phi(x)_i \in \mbox{span}\{ \phi(x)_1, ..., \phi(x)_m \}.
 49 | $$
 50 | The kernel method can be thought of as looking for a function in a linear span of functions. 
 51 | 
 52 | \subsec{Connection to splines}
 53 | A cubic spline is a function in the span of a family of basis of cubic splines, that is our model $r(x)$ satisfies
 54 | $$
 55 |  r(x) \in \mbox{span}\{ h_1(x), ..., h_{n+4}(x) \}.
 56 |  $$
 57 | Equivalently, we can write
 58 | $$
 59 | r(x) = \sum_{i=1}^{n+4} \beta_i h_i(x) = \beta^T \phi(x),
 60 | $$
 61 | where $\phi(x)_i = h_i(x), i =1, \cdots, n+4$, and thus
 62 | $$
 63 | \phi: x \mapsto  \phi(x) = \begin{bmatrix} h_1(x) \\ \vdots \\ h_{n+4}(x) \end{bmatrix}
 64 | $$
 65 | is a feature map. Consequently, in our connection between kernels and splines, we can write out the kernel function for cubic splines as 
 66 | $$K(x,z) = \langle \phi(x) , \phi(z) \rangle = \sum_{i=1}^{n+4}  h_i(x) h_i(z).$$
 67 | Empirically, our main design choice centers around our choosing a basis $h_i(x)$ such that $K(x,z)$ is efficiently computable. Previous bases we have used for splines have been good mathematically but are not necessarily the best choice when thinking about computability.
 68 | 
 69 | The kernel method with feature map $\phi$ is equivalent to a cubic spline $\hat{r}$ with no regularization, or a ridgeless kernel regression since
 70 | $$ 
 71 | \argmin_{\beta} \sum_{i=1}^{n} (y^{(i)} - \beta^T \phi(x^{(i)}) )^2 \Leftrightarrow \argmin_{\hat{r}} \sum_{i=1}^{n} (y^{(i)} - \hat{r}(x^{(i)}))^2. $$
 72 | However, this is underspecified because the number of parameters $(n + 4) > $ number of data points $n$. Therefore, we look for the minimum norm solution or a regularization. An example of a regularized solution is the kernel ridge regression
 73 | $$ \min_{\beta} \frac{1}{2} \sum_{i=1}^{n} (y^{(i)} - \beta^T \phi(x^{(i)}) )^2 + \frac{\lambda}{2} \norm{\beta}_2^2, $$
 74 | whose minimizer takes a simple form (cf. Homework 3),
 75 | $$
 76 | \hat \beta = \Phi^\top (\Phi \Phi^\top + \lambda I)^{-1} y,
 77 | $$
 78 | where
 79 | $$
 80 | \Phi = \begin{bmatrix}
 81 | 	\phi(x^{(1)})^\top \\
 82 | 	\vdots  \\
 83 | 	\phi(x^{(n)})^\top
 84 | \end{bmatrix} \in \mathbb{R}^{n \times m}.
 85 | $$
 86 | To connect with splines, we see in in our previous lecture that in natural cubic splines we're using a similar but different regularizer $\beta^\top \Omega \beta$.
 87 | 
 88 | \subsec{Connection to nearest neighbor methods}
 89 | Another application of the kernel method is to perform a nearest neighbor strategy in feature space. We run nearest neighbor on $ \{ (\phi(x^{(1)}), y^{(1)}), ..., (\phi(x^{(n)}), y^{(n)})\}$, and the $\ell_2$ squared distance metric is then
 90 | \begin{align*} 
 91 | d(x,z) &= \norm{\phi(x) - \phi(z)}_2^2 \\
 92 | &= \langle \phi(x)^T - \phi(z)^T, \phi(x) - \phi(z) \rangle \\
 93 | &= \phi(x)^T \phi(x) - 2 \phi(x)^T \phi(z) + \phi(z)^T \phi(z) \\
 94 | &= K(x,x) - 2K(x,z) + K(z,z)
 95 | \end{align*}
 96 | We see again that we don't need to compute the features $\phi$ explicitly.
 97 | \sec{Neural networks}
 98 | 
 99 | \subsec{A glimpse into deep learning theory}
100 | While Neural Networks are not typically studied in most classic statistics classes, in recent years they have revolutionized the field of machine learning and thus have become an increasingly interesting topic in the field of statistics.  The result we will show in this lecture is that a one-dimensional neural network is a fully non-parametric method which is somewhat similar to what we have already discussed in cubic splines. We will primarily discuss the following two things.
101 | \begin{enumerate}
102 | \item We will use a neural net to represent features and then find those features, this allows for more dynamic and better features than those computed in the kernel method.
103 | \item We will also show that for one dimensional two layer wide neural network, it is equivalent to a linear spline.
104 | \end{enumerate}
105 | \subsec{Fully-connected two layer neural networks}
106 | A neural network can be thought of as a method to learn the features $\phi$ in a non-parametric model. If we think about neural networks in the specific case where the input is only 1-dimensional, and we have two layers, then we can see that it is really a type of linear spline. To begin, we introduce some basic notations.
107 | \begin{definition}[Transformation of fully-connected neural network in each layer]
108 | 	We denote by the input of $i$-th layer of a neural network by $h_{i-1} \in \R^d$ and its output by $h_i \in \R^m$. The weighted matrix parameters are denoted by $W \in \R^{m \times d}$. Let $\sigma$ be the activation function $\mathbb{R} \to \mathbb{R}$. Examples of activation functions include
109 | 	\begin{align*}
110 | 		\text{ReLU}(x) &:= \max\{x, 0\}, \\
111 | 		\text{Sigmoid}(x) &:= \frac{1}{1+e^{-x}}, \\
112 | 		 \text{Softplus}(x) &:= \text{log} \left( \frac{1}{1+e^x} \right). \\
113 | 	\end{align*}
114 | Then the output vector can be written as $h_i = \sigma(Wh_{i-1})$, where $\sigma$ here is understood to be applied elementwise.
115 | \end{definition}
116 | 
117 | For a fully-connected two layer neural networks, we thus can write
118 | \begin{align*}
119 | 	\hat y = a^\top \sigma(Wx),
120 | \end{align*}
121 | where $x := h_0$ is the input of the network and $a \in \mathbb{R}^m$. If we view $\sigma(Wx)$ as a feature map $\phi(x)$ which depends on $W$ (hence we might more accurately write this feature map as $\phi_W (x)$), then this is similar to a kernel method. If we fix $W$, then this is exactly a the kernel method with $K(x,z) = \langle \phi_W(x), \phi_W(z) \rangle $. In neural networks, the difference is that we train both $a$ and $W$. If we don't train $W$, then we essentially have a kernel method. 
122 | \subsec{Deep neural networks}
123 | With the above notations, we can formally define a fully-connected deep neural network with $r$ layers, parameters $W_1, \cdots, W_r, a$ and input vector $ x:=h_0$ as
124 | \begin{align*} 
125 | \text{First layer:}\hspace{0.3cm}  &h_1 = \sigma(W_1,x) \\
126 | \text{Second layer:}\hspace{0.3cm}  &h_2 = \sigma(W_2, h_1) \\
127 | & \hspace{0.4cm} \vdots \\
128 |  \text{Output:}\hspace{0.3cm} &\hat{y} = a^T h_r
129 | \end{align*}
130 | Often times, $h_r$ is called ``the features''. $h_r = \sigma(W_r \sigma(W_{r-1} \dots )) \rightarrow \phi_{W_1 \dots W_r}(x)$ where $\phi_{W_1 \dots W_r}(x)$ is referred to as the feature extractor or the feature map. The key difference is that $\phi_{W_1 \dots W_r}(x)$ is learned. More broadly, any sequence of parameterized computations is called a neural network. For example, the residual neural network is
131 | \begin{align*} 
132 | 	\text{First layer:}\hspace{0.3cm}  &h_1 = \sigma(W_1,x) \\
133 | 	\text{Second layer:}\hspace{0.3cm}  &h_2 = h_1 + \sigma(W_2, h_1) \\
134 | 	& \hspace{0.4cm} \vdots \\
135 | 	r\text{-th layer:}\hspace{0.3cm}  &h_r = h_{r-1} + \sigma(W_r, h_{r-1}) \\
136 | 	\text{Output:}\hspace{0.3cm} &\hat{y} = a^T h_r
137 | \end{align*}
138 | \subsec{Equivalence to linear splines}
139 | We will work with infinitely wide neural networks to make the connection to linear splines. We don't really need an infinite width, but we do need a really large width. First, we introduce some notations.
140 | We call an input $x \in \R$ and its associated output $y \in \R$. We will call our model $h_{\theta}(x) = \sum_{i=1}^{m} a_i [w_ix + b_i ]_{+} + c$ where $a_i \in \R$, $w_i \in \R$, $x \in \R$, $b_i \in \R$, $c \in \R$. Our activation function in this neural network is simply the $\text{ReLU}(x) = \max\{t, 0\}$ function which we denoted by $[ \dots ]_{+}$. \\
141 | And as an aside
142 | \begin{align*}
143 | & a^T \sigma(Wx) = \sum_{i=1}^{m} a_i (\sigma(Wx))_i  = \sum_{i=1}^{m} a_i \sigma(Wx)_i), \\
144 | & W = \begin{bmatrix} w_1^T \\ \vdots \\ w_m^T \end{bmatrix} \hspace{0.2cm} \rightarrow \hspace{0.2cm}
145 | Wx = \begin{bmatrix} w_1^Tx \\ \vdots \\ w_m^Tx \end{bmatrix}, \\
146 | & (Wx)_i = w_i^Tx, \\
147 | & a^T \sigma(Wx) = \sum_{i=1}^{m} a_i \sigma(w_i^Tx). \\
148 | \end{align*}
149 | We will denote our parameters by $\theta = (m, a, w, b, c)$ where $m \in \bbN$, $a \in \R^m$, $w \in \R^m$, $b \in \R^m$ and $c \in \R$. 
150 | 
151 | Our regularizer will be the $l_2$ norm of the weights parameter:
152 | \al { C(\theta) = \frac{1}{2} \left( \norm{a}_2^2 + \norm{w}_2^2 \right) = \frac{1}{2} \sum_{i=1}^{m}(a_i^2 + w_i^2)}
153 | And we now define our regularized training objective: 
154 | \al{ \inf_{\theta} [L(h_{\theta}) + \lambda C(\theta)] } 
155 | Where $L(h_{\theta})$ can be any loss function that is continuous in $\theta$, and $C(\theta)$ is our regularizer.  An example of a loss function is
156 | \al {L(\theta) = \frac{1}{n} \sum_{i=1}^{n} (y\sp{i} - h_{\theta}(x\sp{i}))^2. }
157 | 
158 | \subsec{Simplification: $m$ goes to infinity}
159 | With some abuse of notation, we will work with the following neural network
160 | 
161 | \begin{align*}
162 | &h_{\theta}(x) = \sum_{i=1}^{\infty} a_i [w_i + b_i]_{+}, \\
163 | & a = (a_i, ..., a_k, ...) \in \R^\infty, w = (w_i, ..., w_k, ...) \in \R^\infty, \\
164 | & b = (b_i, ... ) \in \R^\infty, \\
165 | & C(\theta) = \frac{1}{2} \sum_{i=1}^{\infty} a_i^2 + \sum_{i=1}^{\infty} w_i^2.
166 | \end{align*}
167 | 
168 | \begin{theorem}
169 | Define the nonparametric complexity measure
170 | \begin{align}
171 | 	\bar{R}(f) = \max \left\{ \int_{-\infty}^{+\infty} |f^{''}(x)|dx, |f^{'}(-\infty) + f^{'}(+\infty)|\right\}. \label{eq:complexity-measure}
172 | \end{align}
173 | The first term is related to the continuity of the function, and the second term pertains to the slope. Recall that for cubic splines, the penalization was $\int r^{''}(x)^2 dx$. For a nonparametric penalized regression, our goal is to find the minimizer to
174 | \al {\inf_{f} L(f) + \lambda \bar{R}(f). }
175 | For a parameterized neural network, we are trying to find the minimizer to
176 | \al {\inf_{\theta} L(h_{\theta}) + \lambda C(\theta). }
177 | We claim that these methods are doing the same thing. Specifically we claim that
178 | $$\inf_{f} L(f) + \lambda \bar{R}(f) = \inf_{\theta} L(h_{\theta}) + \lambda C(\theta) , $$
179 | and
180 | $$f^{*} = h_{\theta}^{*},$$
181 | where $f^{*}$ and $\theta^{*}$ are the minimizers of the above two problems respectively. 
182 | 
183 | In other words, on the one hand we have a non-parametric approach, i.e. a penalized regression with a complexity measure $\bar{R}(f)$. On the other hand we have a parameterized regression which comes from neural networks. We claim that these are doing the exact same thing.
184 | \end{theorem}
185 | How do we interpret this? What does minimizing $L(f) + \lambda \bar{R}(f)$ really do? First, let's consider
186 | $$\text{minimize  }\bar{R}(f) \hspace{0.2cm} \text{s.t} \hspace{0.2cm} L(f) = \sum_{i=1}^{n}(y\sp{i}-f(x\sp{i}))^2 = 0.$$
187 | As this corresponds to the case where $\lambda \rightarrow 0$. The above is minimized when $f$ is a linear spline that fits the data exactly and $L(f) = 0$. 
188 |  
189 | From equation \eqref{eq:complexity-measure}, we see that $\bar{R}(f)$ consists of two terms. We know that $f^{''}(x) = \infty$ at the data points (since this is where the slope instantaneously changes from one linear line to another), and $f^{''}(x) = 0$ otherwise.  Hence, we can model $f^{''}(x)$ by the dirac delta function $\{\delta(t) | t = \text{datapoint} \} $. For dirac delta functions, we know that $\int_{-\infty}^{+\infty} \delta(t) dt = 1$, hence $\int |f^{''}(x)|dx$ in equation \eqref{eq:complexity-measure} is actually quite small. We won't go into the formality of proving this, but take it as true that minimizing $\bar{R}$ gives we a linear spline since the penalization from the second order derivatives is actually quite small. 
190 | 
191 | To represent a linear spline with $n$ knots, we only need $n+1$ pieces. We can therefore represent a linear spline with a neural net of at most $n+1$ terms. Analogously, in penalized regression we started with all possible solutions $r(x)$, but after we realized that the solution has a structure like a cubic spline, we then reduced our infinitely large solution space to an $n+4$ dimensional space ($n+4$ neurons/width of the neural net); this simplification makes the optimization of the problem a lot easier. 
192 | 
193 | \subsec{Outline of the proof}
194 | The proof follows mainly from the following two steps.
195 | \begin{itemize}
196 | \item Step 1: Show $ \exists$ $ \bar{R}(f)$ such that $\min L(f) + \lambda \bar{R}(f) = \min L(\theta) + \lambda C(\theta)$;
197 | \item Step 2: Derive the formula for $\bar{R}(f)$.
198 | \end{itemize}
199 | In this lecture we show the first step. We begin by looking for a representation of $f(x)$ by a neural network with minimum complexity. 
200 | \al {\bar{R}(f) \overset{\Delta}{=} \min C(\theta) \mbox{     s.t.   } f(x) = h_\theta(x)}
201 | Why do we know that such a neural network exists? For any piecewise linear function with a finite number of pieces, we know that there exists a $h_\theta(x)$ that represents $f(x)$, since neural networks are piecewise linear for a finite number of neurons. A uniformly continuous function $f(x)$ can be approximated by a 2-layer neural network with finite width, and it can be exactly represented by a two-layer neural network with infinite width. This is done by taking finer and finer approximations of our function, and then taking the limit as the number of approximations (width of our neural network) $\rightarrow \infty$.
202 | 
203 | Hence, we wish to prove that
204 | $$ \min L(f) + \lambda \bar{R}(f) = \min L(\theta) + \lambda C(\theta),$$
205 | where $\bar{R}(f) = \min C(\theta) \mbox{   s.t.   } f = h_{\theta}$. Let $\theta^*$ be the minimizer of the $\min L(\theta) + \lambda C(\theta)$, thus
206 | $$
207 | L(h_{\theta^*}) + \lambda C(\theta^*) = \min L(\theta) + \lambda C(\theta).
208 | $$
209 | Take $f = h_{\theta^*}$, then we have $L(f) = L(h_{\theta^*})$ and $\bar{R}(f) = \min C(\theta) = C(\theta^*); \text{where   } h_\theta = f$. Combining these statements implies that $L(f) + \lambda \bar{R}(f) \leq L(h_{\theta^*}) + \lambda C(\theta^*) = \min L(\theta) + \lambda C(\theta) $, which suggests
210 | \begin{align}
211 | 	\min L(f) + \lambda \bar{R}(f) \leq \min L(\theta) + \lambda C(\theta).
212 | \end{align}
213 | On the other direction, let $f^*$ be the minimizer of $\min L(f) + \lambda \bar{R}(f)$. By the argument above, we can construct $\theta$ such that $ \min L(f) + \lambda \bar{R}(f) \geq L(h_\theta) + \lambda C(\theta) \geq \min L(\theta) + \lambda C(\theta)$. Take $\theta$ to be the minimizer of $\min C(\theta) = \bar{R}(f^*) \mbox{   s.t.   } f^* = h_\theta$. This is the minimum complexity network that can represent $f$. This means that $C(\theta) = \bar{R}(f)$ and 
214 | \begin{align}
215 | \min L(f) + \lambda \bar{R}(f) & = L(f^*) + \lambda \bar{R}(f^*) \geq L(f^*) + \lambda C(\theta) \nonumber \\
216 | & = L(h_\theta) + \lambda C(\theta) \nonumber \\
217 | &\geq \min L(h_\theta) + \lambda C(\theta) \nonumber \\
218 | & = \min L(\theta) + \lambda C(\theta).
219 | \end{align}
220 | Taken collectively, we conclude that $\min L(f) + \lambda \bar{R}(f) = \min L(\theta) + \lambda C(\theta)$. The second step is left for the next lecture.
221 | 


--------------------------------------------------------------------------------
/Spring2021/04-02-2021.tex:
--------------------------------------------------------------------------------
  1 | % reset section counter
  2 | \setcounter{section}{0}
  3 | 
  4 | %\metadata{lecture ID}{Your names}{date}
  5 | \metadata{1}{Lan Jiang and Sameer Sundrani}{April 2nd, 2021}
  6 | 
  7 | \sec{Overview}
  8 | 
  9 | In this lecture, we begin our exploration of nonparametric statistics. We first describe the underlying motivation for the field of nonparametric statistics and its general principles. Then, we look at our first examples of such statistics with the nonparametric regression problem, wherein we focus on three different approaches: the regressogram, local averaging, and Nadaraya-Watson kernel estimator.
 10 | 
 11 | 
 12 | \sec{Overview of nonparametric statistics}
 13 | 
 14 | The overarching idea of nonparametric statistics is that it does not leverage standard parameterization. While there are not many precise definitions for the field of nonparametric statistics, there are a few core tenets to know.
 15 | \begin{itemize}
 16 | 	\item Make as few assumptions as possible. For example, do not assume that data extends from linear or quadratic model. 
 17 | 	\item No fixed set of parameters exists. For example, in nonparametric statistics we will often see across infinite dimensional models, infinite parameters, or circumstances where the dimension $\rightarrow \infty$ as the number of data point $n \rightarrow \infty$.
 18 | \end{itemize}
 19 | Such principles are widely applicable to many areas of statistics and machine learning, such as in nonparametric testing, supervised learning, and unsupervised learning. 
 20 | 
 21 | However, often and particularly in this class, our data is low dimensional\footnote{In this course, low dimensions generally refers to the case when data dimension $d = 1, 2, 3$}, with exceptions like neural networks and some kernel methods. This is important since high dimensional data without many strong parametric assumptions will fundamentally and statistically need many samples (i.e. the data will need exponential dimensions) to estimate anything (density, CDF, etc.), suffering from the ``curse of dimensionality''. The lack of high dimensional data without many strong parametric assumptions results in estimate errors at the zero-th or first order. 
 22 | 
 23 | \sec{The nonparametric regression problem}
 24 | \subsec{Setup}
 25 | Our first example in nonparametric statistics will be nonparametric regression. In such a problem, we have $n$ pairs of observations
 26 | \begin{equation} 
 27 | (x_1, Y_1), ..., (x_n, Y_n),
 28 | \end{equation}
 29 |  where each $x_i, Y_i \in \R$ and $x_i$ refers to an input (or covariate) and $Y_i$ refers to an output (or label) or response variable. Furthermore, each $Y_i$ can be written as
 30 |   \begin{equation} 
 31 |   	Y_i = r(x_i) + \xi_i,
 32 |    \end{equation} 
 33 | where $r(.)$ is some function and $\xi$ is some noise on the observed output. Here, $r(x_i)$ is defined by
 34 |  \begin{equation} r(x_i) := \Exp[Y_i | x_i]
 35 |  \end{equation} 
 36 | and $\xi_i = Y_i - r(x_i)$, with $\Exp[\xi_i] = 0$. 
 37 | 
 38 | With these observations now defined, we can view the regression problem under two frameworks: deterministic inputs or random inputs, and we will explore both possibilities in the subsequent sections.
 39 | 
 40 | \subsec{Deterministic design and mean squared error}
 41 | In this view, we treat $x_1, ..., x_n$ as fixed, deterministic inputs with $Y_1, ... Y_n$ being random variables. Our goal then is to estimate or recover $r(x_1), ..., r(x_n)$ as accurately as possible. Pictorially, we can see this in Figure \ref{fig:regression example}, where the red circular open dots are the "noisy" set of observations and the blue $r(x)$ is the function where each labeled $r(x_i)$ (in black circular open dots) is what we aim to recover. 
 42 | 
 43 | Our estimator will therefore be denoted as $\hat{r}: \hat{r}(x_1), ..., \hat{r}(x_n)$. We will evaluate $\hat{r}$ utilizing mean squared error or MSE as  \begin{equation} \text{MSE}(\hat{r}) = \frac{1}{n} \sum_{i = 1}^{n} (\hat{r}(x_i) - r(x_i))^2.\end{equation} 
 44 | Because there is randomness from each $Y_i$, we can rewrite this as an expectation utilizing the same form we just described as
 45 | \begin{equation} \text{MSE} = \Exp[\text{MSE}(\hat{r})] = \Exp \left[\frac{1}{n} \sum_{i = 1}^{n} (\hat{r}(x_i) - r(x_i))^2\right].
 46 | \end{equation}
 47 | 
 48 | \begin{figure}[htbp!]
 49 | 	\begin{center}
 50 | 		\begin{tikzpicture}
 51 | 			\begin{axis}[
 52 | 				axis lines=middle,
 53 | 				xtick=\empty, ytick=\empty,
 54 | 				xlabel=$x$,ylabel=$y$,
 55 | 				]
 56 | 				\addplot[
 57 | 				domain = -10:10,
 58 | 				samples = 250,
 59 | 				smooth,
 60 | 				thick,
 61 | 				blue,
 62 | 				] {exp(-x/10)*( cos(deg(x)) + sin(deg(x))/10 )};
 63 | 				\addplot [only marks, mark = o, thick, red] table [y=Y, x=$X_1$]{figure/Lecture01/data.dat};
 64 | 				\addplot [only marks, mark = o, thick, black] table [y=$X_2$, x=$X_1$]{figure/Lecture01/data.dat};
 65 | 				\addlegendentry{$r(x)$}
 66 | 				\addlegendentry{$(x_i, Y_i)$'s}
 67 | 				\addlegendentry{$r(x_i)$'s}
 68 | 			\end{axis}
 69 | 		\end{tikzpicture}
 70 | 		\caption{Graphical representation of regression problem}
 71 | 		\label{fig:regression example}
 72 | 	\end{center}
 73 | \end{figure}
 74 | 
 75 | 
 76 | \subsec{The alternative view point: random design}
 77 | Alternatively, we could have viewed our input as a series of independent and identically distributed  random variables $X_1, ..., X_n \iid P$ (note the upper-case $X_i$ now) where our $Y_i = r(X_i) + \xi_i$. Our interpretation of such an perspective while modeling the problem remains very similar, though, and our estimator is still some $\hat{r}$ evaluated using MSE as $\Exp_{X\sim P} [(\hat{r}(X) - r(X))^2]$. For the rest of this note, though, we will maintain within the deterministic design paradigm described above. 
 78 | 
 79 | \subsec{Motivation for nonparametric regression}
 80 | 
 81 | With our current understanding of the regression problem at hand, one may claim we can solve such examples parametrically by assuming that each $Y_i$ is a linear combination of the input (as in linear regression) or some polynomial combination of the input (as in polynomial regression). However, consider the regression where $r(x)$ is neither linear nor polynomial as in Figure \ref{fig:nonparametric regression example} where $r(x)$ cannot be fitted with any polynomial fit perfectly (assume here that after some $x_0$, $r(x \geq x_0)$ remains fixed at some constant value).
 82 | 
 83 | To see why polynomial regression would fail, suppose we fit $r(x)$ with $f(x)$ a $k$-degree polynomial. Suppose $x_1, \cdots, x_{k+1} \geq x_0$ and then $f(x_1) =  ... = f(x_{k+1}) = c$ for distinct values of $x_1, ... , x_{k+1}$. Since a $k$-degree polynomial is uniquely determined to its values at $k+1$ different points, the only possible solution would be $f(x) = c$, the constant function. Therefore it is not possible to fit all the points on both the right and left hand side of this curve above. We now see that we cannot simply make an assumption of the behavior of our $r(x)$ in this case, and must turn to new ways of achieving our goal. We turn now to some methodologies of nonparametric regression. 
 84 | 
 85 | \begin{figure}[htbp!]
 86 | 	\begin{center}
 87 | 		\begin{tikzpicture}
 88 | 			\begin{axis}[
 89 | 				axis lines=middle,
 90 | 				xtick=\empty, ytick=\empty,
 91 | 				xlabel=$x$,ylabel=$y$,
 92 | 				ymin=-1,ymax=1,
 93 | 				]
 94 | 				\addplot [smooth, thick, red] table [y=$X_3$, x=$X_1$]{figure/Lecture01/data.dat};
 95 | 				\addlegendentry{$r(x)$}
 96 | 			\end{axis}
 97 | 		\end{tikzpicture}
 98 | 		\caption{nonparametric regression problem on which polynomial regression fail.}
 99 | 		\label{fig:nonparametric regression example}
100 | 	\end{center}
101 | \end{figure}
102 | 
103 | \sec{nonparametric regression methods}
104 | 
105 | \subsec{Regressogram}
106 | Our first methodology to approach our modeling problem is known as the regressogram approach. Our algorithm is quite rudimentary: divide our domain of $x$ into some number of bins (assume that our bins are of equal size) as shown in Figure \ref{fig:regressogram}. For each $(x_i, Y_i)$ that falls within a given bin $B_j$, our estimator is defined as
107 |  \begin{equation}\hat{r} = \frac{1}{|B_j|}\sum_{i \in B_j} Y_i,\end{equation}
108 |  which is, in other words, the average of all $Y_i$ where $x_i \in B_j$. Because each point will fall into some $B_j$ (and for all bins where there is no observation, we won't have a defined prediction), we recover a piece-wise constant function for that $B_j$. We then see that each point within a particular $B_j$ will recover the same $\hat{r}$. While this approach is simple, it is not often used in practice as choosing the binning method and size is quite tricky. Additionally, some bins may not capture many observations while others may not capture enough of a set of observations if there are too many variable regions within a bin. 
109 | 
110 | \begin{figure}
111 | 	\begin{center}
112 | 		\begin{tikzpicture}
113 | 			\begin{axis}[
114 | 				axis lines=middle,
115 | 				xtick=\empty, ytick=\empty,
116 | 				xlabel=$x$,ylabel=$y$,
117 | 				ymin=-1,ymax=1,
118 | 				]
119 | 				\addplot [smooth, thick, red] table [y=$X_3$, x=$X_1$]{figure/Lecture01/data.dat};
120 | 				\addplot[thick, samples=50, dashed,black] coordinates {(2,-1)(2,3)};
121 | 				\addplot[thick, samples=50, dashed,black] coordinates {(-2,-1)(-2,3)};
122 | 				\addplot[thick, samples=50, dashed,black] coordinates {(-1,-1)(-1,3)};
123 | 				\addplot[thick, samples=50, dashed,black] coordinates {(1,-1)(1,3)};
124 | 				\addplot[thick, samples=50, dashed,black] coordinates {(3,-1)(3,3)};
125 | 				\addplot[thick, samples=50, dashed,black] coordinates {(-3,-1)(-3,3)};
126 | 				\addplot[thick, samples=50, smooth,purple, domain= -3:-2] coordinates {(-3,-0.2)(-2,-0.2)};
127 | 				\addplot[thick, samples=50, smooth,purple, domain= -2:-1] coordinates {(-2,0.3)(-1,0.3)};
128 | 				\addplot[thick, samples=50, smooth,purple, domain= -1:0] coordinates {(-1,0.43)(0,0.43)};
129 | 				\addplot[thick, samples=50, smooth,purple, domain= 0:1] coordinates {(0,0.4)(1,0.4)};
130 | 				\addplot[thick, samples=50, smooth,purple, domain= 1:2] coordinates {(1,0.3)(2,0.3)};
131 | 				\addplot[thick, samples=50, smooth,purple, domain= 2:3] coordinates {(2,0.25)(3,0.25)};
132 | 				\addlegendentry{$r(x)$}
133 | 			\end{axis}
134 | 		\end{tikzpicture}
135 | 		\caption{Regressogram binning with piecewise constant functions}
136 | 		\label{fig:regressogram}
137 | 	\end{center}
138 | \end{figure}
139 | 
140 | 
141 | \subsec{Local averaging}
142 | As the original regressogram can easily fail to capture or over-capture a set of observations, we move to a modified version of binning known as local averaging. Here, we instead define bins dynamically i.e. for each observation $(x_i, Y_i)$, we define a bin of some size $h$ in each direction around that value. Each bin is therefore defined in terms of a particular observation such that $B_{x_i} = \{j: |x_j -x_i| \leq h\}$ where our estimator is now \begin{equation} \hat{r}(x_i) = \frac{1}{|B_{x_i}|}\sum_{i \in B_{x_i}} Y_i,\end{equation} 
143 | which is, similar to before, an average, but now with the nuance that each bin is defined with respect to each observation. One such bin with locally averaged $\hat{r}(x_i)$ can be seen in Figure \ref{fig:local averaging}. Note that we are making the assumption that within each $B_{x_i}$, $\hat{r}(x_i)$ is a constant denoted by $a$. Following this assumption, we can then derive $\hat{r}(x_i)$ as minimizing the MSE inside each local bin, namely
144 | \begin{equation}
145 | 	\hat{r}(x_i) = \argmin_{a} \frac{1}{|B_{x_i}|}\sum_{i \in B_{x_i}} (Y_i - a)^2 = \frac{1}{|B_{x_i}|}\sum_{i \in B_{x_i}} Y_i.
146 | \end{equation}
147 | Setting the first derivative of this function to zero and solving for $a$, we see that the minimum $a$ would be the average value within a bin, as we estimate within this method.
148 | 
149 | As with the regressogram, we make some assumptions in local averaging that may not be sufficient for our regression. Namely, we are assuming that we can safely ignore all points outside the boundary of each $B_{x_i}$, even if such points are borderline to the bin determined by some $h$. Such a problem motivates our next methodology: soft-weight averaging. 
150 | 
151 | \begin{figure}
152 | 	\begin{center}
153 | 		\begin{tikzpicture}
154 | 			\begin{axis}[
155 | 				axis lines=middle,
156 | 				xtick=\empty, ytick=\empty,
157 | 				xlabel=$x$,ylabel=$y$
158 | 				]
159 | 				\addplot [only marks, mark = x, thick, red] table [y=$Y_1$, x=X]{figure/Lecture01/data2.dat};
160 | 				\addplot[thick, samples=50, dashed,black] coordinates {(2.5,-1)(2.5,3)};
161 | 				\addplot[thick, samples=50, dashed,black] coordinates {(1.5,-1)(1.5,3)};
162 | 				\addplot[thick, samples=50, smooth, blue] coordinates {(2,-1)(2,1.95)};
163 | 				\addplot[thick, samples=50, smooth, red] coordinates {(1.5,2)(2.5,2)};
164 | 				\addlegendentry{$r(x_i)$'s}
165 | 			\end{axis}
166 | 		\end{tikzpicture}
167 | 		\caption{Local averaging with one $x_i$, estimating $\hat{r}(x_i)$ as a red line}
168 | 		\label{fig:local averaging}
169 | 	\end{center}
170 | \end{figure}
171 | 
172 | 
173 | 
174 | \subsec{Nadaraya-Watson kernel estimator (soft-weight averaging)}
175 | As we saw in the previous section on local averaging, we make a potentially detrimental assumption that all points outside the boundary of a given bin should not be considered for our estimator $\hat{r}(x)$. To alleviate this problem, we introduce the concept of soft-weight averaging, where we introduce a weighting for each observation that can be distance dependent. More specifically, we define our new constant estimate $a$ for each observation $(x_i, Y_i)$ over all $n$ observations as follows
176 |  \begin{equation}\argmin_{a \in \R} \sum_{i = 1}^n w_i (Y_i - a)^2 =\frac{\sum_{i = 1}^n w_iY_i}{\sum_{i = 1}^n w_i},\end{equation} 
177 | where the minimizer can be found by taking derivative w.r.t. $a$. Notice that if $w_i = \bm{1}\cbr{|x_i - x| \leq h}$, we recover the same local averaging we have previously described. In general, we desire $w_i$ to be smaller as $| x_i - x |$ increases and for any  $| x_i - x | > | x_j - x |$ it follows that $w_i < w_j$. 
178 | 
179 | We now need to define a weighting scheme that satisfies our weighting desires, and we will do so using a kernel estimator. Namely, because we desire to have a weighting dependent on the distance a particular observation $x_i$ is from $x$, we will define 
180 | \begin{equation}
181 | w_i = f(x_i -x) = K\rbr{\frac{x_i -x}{h}},
182 | \end{equation}
183 | where $K(\cdot)$ is a kernel function and $h$ is the width of our bins. 
184 | \subsec{Kernel functions}
185 | Before we describe further our possible choices of kernel $K$, we must define more specifically what properties $K$ follows. Formally, we make the following definition.
186 | \begin{definition}[Kernel function] $K: \R \to \R$ is called a kernel function if it is non-negative and satisfies the following properties.
187 | 	\begin{enumerate}
188 | 		\item $\int_{\R} K(t) dt = 1$. $K$ is normalized and scales to 1. 
189 | 		\item $\int_{\R} tK(t)dt = 0$. There must be some kind of symmetry within $K$. 
190 | 		\item $\sigma_t^2 := \int_{\R} t^2 K(t) dt > 0$.
191 | 	\end{enumerate}
192 | \end{definition}
193 | Now, we will discuss four variants of kernels. We start by the boxcar kernel
194 | \begin{equation}K(t) = \frac{1}{2}\bm{1}\cbr{|t| \leq 1}.\end{equation} 
195 | The boxcar is named for its box-like shape and can be seen in red in Figure \ref{fig:kernels} overlayed with the Gaussian kernel as well. One then sees that this corresponds exactly to local averaging for some $h$.  Next we have the Gaussian kernel
196 |  \begin{equation}K(t) = \frac{1}{\sqrt{2\pi}}\exp\rbr{\frac{-t^2}{2}},\end{equation}
197 | Unlike the boxcar, here we have some non-zero weight for some $x_i$ when $|x_i - x| > h$, which tapers off as that difference increases. Other choices of kernels include the Epanechnikov kernel 
198 | \begin{equation}
199 | 	K(t) = \frac{3}{4}(1 - t^2)\bm{1} \cbr{|t| \leq 1}
200 | \end{equation}
201 | and the tricube kernel
202 | \begin{equation}
203 | 	K(t) = \frac{70}{81}(1 - |t|^3)\bm{1} \cbr{|t| \leq 1}.
204 | \end{equation}
205 | In practice, however, the explicit choice of kernel is not that important empirically between the these (excluding boxcar). 
206 | 
207 | \begin{figure}
208 | 	\begin{center}
209 | 		\begin{tikzpicture}
210 | 			\begin{axis}[
211 | 				axis lines=middle,
212 | 				xtick={-1, 0, 1}, ytick=\empty,
213 | 				xlabel=$x$,ylabel=$y$, 
214 | 				ymin=0, ymax = 1
215 | 				]
216 | 				\addplot [thick, red] table [y=$Y_1$, x=X]{figure/Lecture01/data3.dat};
217 | 				\addplot[
218 | 				domain = -5:5,
219 | 				samples = 250,
220 | 				smooth,
221 | 				thick,
222 | 				blue,
223 | 				] {(1/(sqrt(2*pi))*exp(-(x^2)/2)};
224 | 				\addlegendentry{Boxcar}
225 | 				\addlegendentry{Gaussian}
226 | 			\end{axis}
227 | 		\end{tikzpicture}
228 | 		\caption{Boxcar and Gaussian Kernels}
229 | 		\label{fig:kernels}
230 | 	\end{center}
231 | \end{figure}
232 | 
233 | \subsec{Choosing the bandwith}
234 | Finally, we approach the discussion of choosing our bin width $h$. Larger or smaller values of $h$ can dramatically change our estimates $\hat{r}(x_i)$, so understanding how to do so is critical for our regression. To see why this is true, consider any choice of kernel. Here we see that if we increase $h$, we are simply increasing the bin widths (imagining a larger width in, say, Figure \ref{fig:local averaging}) and thereby averaging more observations for any particular $x_i$. In other words, a large $h$ allows for greater weightings $w_i$ for farther observations. 
235 | 
236 | In practice, we see that this choice of $h$ is highly dependent on the data. For example, if observations are truly close to one another and have a similar true value of $r(x)$, we will achieve better results using a large $h$. To see this, consider the example set of observations in Figure \ref{fig:choosing h large}, where we see small fluctuations across observations but on average not very many differences and assume that the true value is somewhere along the average of these observations. Here, an $h = 0$ would yield separate constants for each observation and no ``denoising'' for each observation. On the other hand, choosing $h = \infty$ here would yield the same  constant prediction $\hat{r}(x_i)$ for each $x_i$, which we can see as follows: \begin{equation}\frac{1}{n}\sum_{i =1}^n Y_i = \frac{1}{n}\sum_{i =1}^n r(x_i) + \xi_i = \frac{1}{n}\sum_{i =1}^n c + \xi_i = c + \frac{1}{n}\sum_{i =1}^n \xi_i \approx c \pm \frac{1}{\sqrt{n}} \end{equation}
237 | where we draw the simplification of the last term via the central limit theorem. \footnote{Assuming $\Var(\xi_i) =1$, the central limit theorem implies $\frac{1}{\sqrt n} \sum_{i =1}^n \xi_i \xrightarrow{\mathrm d} \sf{N}(0,1)$.}
238 | 
239 | If instead we had chosen a moderate $h$, our estimate for each $x_i$ may have not included all observations and then would be represented using a similar derivation as follows: \begin{equation}\hat{r}(x_i) = \frac{1}{|B_{x_i}|}\sum_{j \in B_{x_i}} Y_j = \frac{1}{|B_{x_i}|}\sum_{j \in B_{x_i}} r(x_j) + \xi_j =\frac{1}{|B_{x_i}|}\sum_{j \in B_{x_i}} c + \xi_i = c + \frac{1}{|B_{x_i}|}\sum_{j \in B_{x_i}} \xi_i \approx c \pm \frac{1}{\sqrt{|B_{x_i}|}}\end{equation} where we can see here that we could be obtaining a noisier value for our estimate compared to a larger $h$. 
240 | 
241 | Finally, considering another extreme case, where the true $r(x)$ fluctuates a lot such as the $r(x)$ in Figure \ref{fig:regression example}, utilizing a large $h$ would yield a poorer result than simply assuming there is no noise $\xi_i$ and simply choosing $h = 0$. In such a case, although we may not be correct in our assumption, we may still obtain a reasonable estimate $\hat{r}(x_i) = Y_i$ (when $h = 0$). 
242 | \begin{figure}
243 | 	\begin{center}
244 | 		\begin{tikzpicture}
245 | 			\begin{axis}[
246 | 				axis lines = middle,
247 | 				xtick=\empty, ytick=\empty,
248 | 				xlabel=$x$,ylabel=$y$, 
249 | 				ymin = 0, ymax = 3,
250 | 				]
251 | 				\addplot [only marks, mark = o, thick, red] table [y=Y, x=$X_1$]{figure/Lecture01/data4.dat};
252 | 				\addplot[thick, samples=50, dashed,black] coordinates {(9,-1)(9,3)};
253 | 				\addplot[thick, samples=50, dashed,black] coordinates {(2,-1)(2,3)};
254 | 				\addplot[thick, samples=50, smooth,red] coordinates {(2,1)(9,1)};
255 | 			\end{axis}
256 | 		\end{tikzpicture}
257 | 		\caption{Example where choosing a large $h$ would yield a better estimate}
258 | 		\label{fig:choosing h large}
259 | 	\end{center}
260 | \end{figure}


--------------------------------------------------------------------------------
/Spring2021/04-16-2021.tex:
--------------------------------------------------------------------------------
  1 | %\newcommand{\Exp}{\mathbb{E}}
  2 | 
  3 | % reset section counter
  4 | \setcounter{section}{0}
  5 | 
  6 | %\metadata{lecture ID}{Your names}{date}
  7 | \metadata{3}{Luis Alcaraz}{April 16, 2021}
  8 | 
  9 | \sec{Review and overview}
 10 | 
 11 | 	In the previous lecture, we covered bias-variance trade-off, local linear regression and had a brief introduction to linear smoothers. 
 12 | 	
 13 | 	In this lecture we will continue exploring classical non-parametric methods. First, by exploring local polynomial regression as an extension of local linear regression. Then, different methods of optimizing regression are evaluated, including cross validation where one selects various hyperparameters and dropout methods where the data set is split into validation and training sets. Finally, the rest of the lecture covered splines as a new framework for an non-parametric algorithm. 
 14 | 	
 15 | \sec{Local polynomial regression}
 16 | As we saw in the previous lecture, local linear regression as in Fig.~\ref{fig:local_linear_regression example} extends local averaging, where local averaging fits a locally constant function while local linear regression fits a linear one, fixing the design bias locally. To fix the tension within local linear regression, we now apply local polynomial regression, which just like local linear regression extends local averaging, local polynomial extends local linear regression.
 17 | 
 18 | Essentially, in local polynomial regression, we are fitting polynomial functions locally. We first start by fixing $x$.
 19 | 
 20 | \begin{figure}[htbp!]
 21 |         \begin{center}
 22 |         \begin{tikzpicture}
 23 |             \begin{axis}[
 24 |             axis lines=middle,
 25 |             xtick=\empty, ytick=\empty,
 26 |             xlabel=$x$,ylabel=$y$,
 27 |             ]
 28 | 	            \addplot[
 29 | 		            thick,
 30 | 		            blue,
 31 | 	                ] table [y index=1, x index=0, col sep=comma]{figure/Lecture03/statsdata.txt};
 32 |                 \addplot [only marks, mark = o, thick, red] table [y index=1, x index=0, col sep=comma]{figure/Lecture03/statsdata.txt};
 33 |                
 34 |                 
 35 |                 \addlegendentry{$r(x)$}
 36 |                 \addlegendentry{$(x_i, Y_i)$'s}
 37 | 	        \end{axis}
 38 |         \end{tikzpicture}
 39 |         \caption{Graphical representation of local linear regression (LLR).}
 40 |         \label{fig:local_linear_regression example}
 41 |         \end{center}
 42 |         \end{figure}
 43 | 
 44 | 
 45 | 
 46 | 
 47 | 
 48 | Once we have a fixed $x$, we now want to approximate the function $r(\cdot)$ in the neighborhood of $x$ by defining the function $P_x(u;a)$ with $a = (a_0, \cdots, a_p)$ where\footnote{$p!$ is a convenient choice if we want to take $k$-th order derivative of $P_x(u,a)$ at $u=x$, i.e. $\left.\frac{\mathrm d^k}{\mathrm d u^k}P_x(u,a)\right|_{u=x} = a_k$ for all $k=0,\cdots, p$.} 
 49 | \begin{align}
 50 | P_x(u;a) = a_0 + a_1(u - x) + \frac{a_2}{2}(u - x)^2 + ... + \frac{a_p}{p!}(u - x)^p  
 51 | \end{align}
 52 |  With this new approximate $P_x$, we now want to fit this degree-$p$ polynomial to the data around point $x$. Much like the logic with local linear regression, we are no longer assuming $r(x)$ is neither constant or locally linear, but rather we assume $r(x)$ is a polynomial function locally. Therefore, we minimize over $a$, giving us the minimized equation: 
 53 | 
 54 | \begin{align}
 55 |     \hat{a} = (\hat{a}_0, ..., \hat{a}_p) &= \argmin_{(a_0,..., a_p) \in \R^{p+1}} \sum_{j=1}^{n}w_j(Y_j - P_x(u_j;a))^2 \nonumber \\
 56 |     &= \argmin_{(a_0,..., a_p) \in \R^{p+1}} \sum_{j=1}^{n}w_j\left(Y_j - (a_0 + a_1(u - x) + ... + \frac{a_p}{p!}(u - x)^p)\right)^2,
 57 |   \end{align}
 58 | where $w_j = K \left(\frac{x-x_j}{h}\right)$ for some kernel function $K$. Notice that we can rewrite this equation into
 59 | \begin{align*}
 60 | \argmin_{a \in \R^{p+1}} \sum_{j=1}^{n}w_j(Y_j - a^\top z_j)^2, 
 61 | \end{align*}
 62 | where $a$ and $z_j$ are
 63 | \begin{align*}
 64 | a = \begin{bmatrix} a_0 \\ \vdots \\a_p \end{bmatrix}, \qquad
 65 | z_j = \begin{bmatrix} 1 \\ x_j-x \\ \vdots \\\frac{1}{p!}(x_j-x)^p \end{bmatrix}.
 66 | \end{align*}
 67 | As we can see, this function closely resembles local linear regression with the exception of $P_x$ representing degree-$p$ polynomials. Therefore, we can classify it as weighted linear regression with vector $a$ as parameter and $z_j$'s as the input, giving us a very similar final estimator function to local linear regression
 68 | \[
 69 | \hat{r}(x) = P_x(x;\hat{a}) = \hat{a}_0.
 70 | \]
 71 | It is noted when using local polynomial regression, the convention is to use up to degree 3 polynomials as higher-degree polynomials are not much helper in complex data sets. Also note, local polynomial regression is also a linear smoother.
 72 | \sec{Cross validation}
 73 | There are different forms of cross validation that can be conducted on an algorithm. The main reason to the use of cross validation is reducing the chance of over-fitting through altering various hyperparameters. This cross-validation technique is widely used in machine learning, specifically, neural nets, when trying to create models that best fit specific datasets. Nevertheless, in our case, we will be looking at cross validation for local polynomial regression which concerns selecting the optimal bandwidth $h$, degree polynomial $p$, and which method (like splines, regressogram, etc.) is used. The reason we care about cross validation is in order to optimize our model, and therefore our results. Recall we want to evaluate and minimize 
 74 | \begin{align*}
 75 | \mathrm{MSE} &= \E_{Y_i} \left[\mathrm{MSE}(\hat r)\right] \nonumber \\
 76 | & =  \E \left[\frac{1}{n}\sum_{i=1}^{n}(\hat{r}(x_i) - r(x_i))^2\right] ,
 77 | \end{align*}
 78 | where
 79 |  \[
 80 | \mathrm{MSE}(\hat r) = \frac{1}{n}\sum_{i=1}^{n}(\hat{r}(x_i) - r(x_i))^2.
 81 | \]
 82 | As we have seen before, this issue cannot simply be solved with minimizing the training error
 83 | \[
 84 | \frac{1}{n} \sum_{i = 1}^{n} (Y_i - \hat{r}(x_i))^2,
 85 | \]
 86 | as setting $h=0$ gives zero training error, as $\hat{r}(x_i) = Y_i$
 87 | \subsec{Holdout dataset}
 88 | One method of cross validation that is widely used in machine learning is holdout set. However, this is only useful when you have a large dataset, something rarely available in non-parametric statistics. Nevertheless, holdout is a technique where a dataset is split into two sets where you take a random permutation of $1,\cdots, n$ as $i_1,...,i_n$, such that you use $(X_{i_1}, Y_{i_1}),..., (X_{i_m}, Y_{i_m})$ for training and $(X_{i_{m+1}}, Y_{i_{m+1}}),..., (X_{i_n}, Y_{i_n})$ as a validation set.
 89 | \subsec{Leave-one-out estimate}
 90 | Another technique for cross validation is leave-one-out estimate where you have an estimator for the risk
 91 | \[
 92 | \hat{R}(h) = \frac{1}{n} \sum_{i = 1}^{n}(Y_i - \hat{r}_{-i}(x_i))^2,
 93 | \]
 94 | where $\hat{r}_{-i}(\cdot)$ is the estimator applied to the dataset excluding $x_i$. So basically you remove $x_i$ from the dataset, and then apply the estimator, and finally use estimator on $x_i$ to see if it can produce $Y_i$ with relative small error. To implement this, recall a general linear smoother can be written as
 95 | \[
 96 | \hat{r}(x) = \sum_{j = 1}^{n} l_j(x) Y_j, \quad \sum_{j=1}^n l_j(x) = 1.
 97 | \]
 98 | Therefore, for the leave-one-out estimator, we can obtain that
 99 | \begin{align*}
100 |     \hat{r}_{-1}(x) = \sum_{j \not = i} \left( \frac{ l_j(x) }{\sum_{j \not = i} l_j(x)} \cdot Y_j \right).
101 |   \end{align*}
102 | For the kernel estimator, $\hat{r}_{-1}(x)$ defined in the equation above is indeed the estimator applied on $(X_{1}, Y_{1}),..., (X_{n}, Y_{n})$ excluding $(X_{i}, Y_{i})$. Here $\hat{R}$ is almost an unbiased estimator for the predictive risk. Now follows the question on how to computer $\hat{R}$ efficiently. We will do this in the form of a theorem.
103 | \begin{theorem}
104 | 	If $\hat{r}$ is a linear smoother \[
105 | 	\hat{r}(x) = \sum_{j =1}^{n} l_j(x) Y_j,
106 | 	\]
107 | 	then
108 | 	\begin{align}
109 | 	\hat{R}(h) = \frac{1}{n}\sum_{i=1}^{n} \frac{(Y_i - \hat{r}(x_i))^2}{1 - L_{ii}},
110 | 	\end{align}
111 | 	where $L_{ii} = l_i(x_i)$. 
112 | \end{theorem}
113 | \begin{proof} Consider the estimator \[
114 | \hat{r}_{-1}(x_i) = \frac{\sum_{j \not = i} l_j(x_i) Y_j}{\sum_{j \not = i} l_j(x_i)},
115 | \] 
116 | Recall the sum of the weights for data point $x$ is always $1$, therefore we can rewrite the estimator as
117 | \begin{align*}
118 |     \hat{r}_{-1}(x_i) &= \frac{\sum_{j = 1}^{n} l_j(x_i) Y_j - l_i(x_i)}{\sum_{j \not = i} l_j} \\
119 |     &= \frac{\hat{r}(x_i) - l_i(x_i)Y_i}{1 - L_{ii}}. \\
120 |   \end{align*}
121 |   Therefore,
122 |   \begin{align*}
123 |     \hat{R}(h) &= \frac{1}{n} \sum_{i = 1}^{n}(Y_i - \hat{r}_{-i}(x_i))^2\\
124 |     &= \frac{1}{n} \sum_{i = 1}^{n}\left(Y_i - \frac{\hat{r}(x_i) - l_i(x_i)Y_i}{1 - L_{ii}}\right)^2\\
125 |     &= \frac{1}{n} \sum_{i = 1}^{n}\left(\frac{Y_i - \hat{r}(x_i)}{1 - L_{ii}}\right)^2,
126 |   \end{align*}
127 |   as desired. 
128 | \end{proof}
129 | 
130 | \sec{Splines}
131 | \subsec{Penalized Regression}
132 | To motivate the use of splines, first let us recall the MSE for local linear regression where we seek to minimize $\int r''(x)^2dx$. In the case where we explicitly leverage the smoothest function that fits the data is known as penalized regression. We can rewrite the function as
133 | \[
134 | \argmin_{\hat{r}} \sum_{i =1}^{n} (Y_i - \hat{r}(x_i))^2 + \lambda J(\hat{r}) \triangleq L_\lambda(\hat{r}).
135 | \]
136 | where $J(\hat{r}) = \int r''(x)^2dx$. Let us consider one of the extreme cases when
137 | \begin{figure}[htbp!]
138 |         \begin{center}
139 |         \begin{tikzpicture}
140 |             \begin{axis}[
141 |             legend pos=south east,
142 |             axis lines=middle,
143 |             xtick=\empty, ytick=\empty,
144 |             xlabel=$x$,ylabel=$y$,
145 |             ymin=-5
146 |             ]
147 | 	            \addplot[
148 | 		            thick,
149 | 		            blue,
150 | 	                ] table [y index=1, x index=0, col sep=comma]{figure/Lecture03/statsdata.txt};
151 |                 \addplot [only marks, mark = o, thick, red] table [y index=1, x index=0, col sep=comma]{figure/Lecture03/statsdata.txt};
152 |                 \addplot[
153 |                 domain = 0:7,
154 | 		            thick,
155 | 		            red,
156 | 	                ] {-(cos(10*pi * deg(x-4)) + .5 * (x-4)^2) + 6};
157 |                
158 |                 
159 |                 \addlegendentry{$r(x)$}
160 |                 \addlegendentry{$(x_i, Y_i)$'s}
161 |                 \addlegendentry{$\lambda = \infty$}
162 | 	        \end{axis}
163 |         \end{tikzpicture}
164 |         \caption{Example of extreme cases when $\lambda = \infty$.}
165 |         \label{fig:regression example}
166 |         \end{center}
167 |         \end{figure}
168 | $\lambda = \infty$. In this case, $J(\hat{r})$ can only be zero since $r''(x) = 0$ and $\hat{r}$ can only be linear. Here, what spline is doing is that you do not have to be so strict with being linear, in addition to controlling second order derivatives. We can now move onto defining splines.
169 | \subsec{Splines}
170 | Splines themselves are family of functions $f$, where we have the set points $\xi_1 < \xi_2 < ... , \xi_k$ (also known as knots) contained in some interval $[a,b]$. Generally, $M$-th order splines are piecewise $(M-1)$-degree polynomial with continuous $(M-2)$-th order derivatives at the knots. More specifically, a cubic spline ($4$-th order spline) $q$ is a continuous function such that
171 | \begin{itemize}
172 |     \item $q$ is a cubic polynomial on $(a, \xi_1]$,$ [\xi_1, \xi_2]$, ..., $[\xi_k-1, \xi_k]$, $[\xi_k, b)$, where you have fixed cubic polynomial between $\xi_i$ and $\xi_{i+1}$. 
173 |     \item $q$ has continuous first and second derivatives at the knots.
174 | \end{itemize}
175 | 
176 | However, there is another type of spline, known as a natural spline. This spline is one that extrapolates linearly beyond the boundary knots. After defining a few of these notions, we can intuitively see piecewise polynomials has relative smoothness properties and will allow us to arrive to a solution for the penalized objective. We will now prove a theorem that demonstrates this. 
177 | \subsec{Background: subspaces}
178 |         A subspace is a set of functions $f$ which form a subspace $\mathcal{F}$  if $\forall f,g \in \mathcal{F}$, $\lambda_1f_1 + \lambda_2g \in \mathcal{F}$ for all $\lambda_1, \lambda_2 \in \R$.
179 |         Furthermore, a subspace of functions has dimension of at most $k$ if $\exists f_1, ...,f_k \in \mathcal{F}$ such that $\forall f \in \mathcal{F}$, $f$ can be represented as \[
180 |         f = \sum_{i = 1}^{k}\lambda_if_i
181 |         \]
182 |         for some $\lambda_1, \cdots, \lambda_k \in \R$.  Note $f_i$'s are often refereed to as basis.
183 | \begin{theorem} The function that minimizes $L_{\lambda}(\hat{r})$ is a natural cubic spline with knots at the data points. The minimizer has to be a cubic spline. 
184 | \end{theorem}
185 | Therefore, the minimizer of the penalized objective has an estimator that is called a smoothing spline. Note that the search space in this case is dramatically reduced, because the number of natural cubic splines than the number of all functions. Furthermore, to have the smoothest function, you do not need any higher-order derivatives beyond the third order. To prove this let us consider the following lemma.
186 | \begin{lemma} All cubic splines with knots $\xi_1,...,\xi_k$ form a $(k+4)$-dimensional subspace of functions. Specifically, there exists some $h_1. ..., h_{k+4}$ such that every cubic spline $f$ can be represented as 
187 | 	\begin{align*}
188 | 		f = \sum_{i=1}^{k+4}\lambda_i h_i,
189 | 	\end{align*}
190 | where the $\lambda_i$'s serve as parameters. 
191 | \end{lemma}
192 | 
193 | \begin{proof}
194 | 	Let us consider the case where we have $f,g$ being cubic splices with fixed knots $\xi_1,...,\xi_k$. Therefore, $f+g$ is also a cubic splice. Now we will show why the space of cubic splines with fixed knots also form a $(k+4)$-dimensional subspace. Notice that a cubic spline can be represented as \[
195 | 	q(x) = a_{3,i}x^3 + a_{2,i}x^2 + a_{1,i}x + a_{0,i}
196 | 	\]
197 | 	For all $x \in [\xi_i, \xi_{i+1}]$ where $\forall i = 0, ..., k$. The convention for knot assignment is: $\xi_0 = a$, $\xi_{k+1} = b$, in addition to the convention of $a,b$ equating to ($-\infty,\infty$). However, note that $a_{t,i}$'s have to satisfy constraints pertaining to the $(M-2)$-th order derivation requirements explored earlier. To prove the lemma, consider the following functions
198 | 	\begin{align*}
199 | 	h_1(x) & = 1, \\
200 | 	h_2(x) & = x, \\
201 | 	h_3(x) & = x^2, \\
202 | 	h_4(x) &= x^3, \\
203 | 	h_{i+4}(x) &= (x-\xi_i)_+^3, \quad i =1, \cdots, k.
204 | 	\end{align*}
205 | 	where $t_+$ is the ReLU function utilized throughout machine learning $\max\{t,0\}$. We prove that they are the desired basis by induction.  When there is no knots, a degree 3 polynomial can be represented by combinations of $h_0, h_1, h_2, h_3$. 
206 | 	
207 | 	Suppose the inductive hypothesis holds true for $k-1$ knots. We can take the spline $\widetilde{q}(x)$ which spans over $\xi_1,...,\xi_{k-1}$ where we define $\widetilde{q}(x) = q(x)$ for all $x \in [\xi_{k-1},\xi_k]$. Suppose \[
208 | 	q(x) =  a_{3,k-1}x^3 + a_{2,k-1}x^2 + a_{1,k-1}x + a_{0,k-1}
209 | 	\]
210 | 	on $[\xi_{k-1}, \xi_k]$. For $\widetilde{q}(x)$ on $[\xi_k,b)$: \[
211 | 	\widetilde{q}(x) =  a_{3,k-1}x^3 + a_{2,k-1}x^2 + a_{1,k-1}x + a_{0,k-1}
212 | 	\] Since we also know that $\widetilde{q}$ is a cubic spline with $k-1$ knots, by the inductive hypothesis \[
213 | 	\widetilde{q}(x) = \sum_{i=1}^{k+3}\lambda_ih_i(x).
214 | 	\]
215 | 	Therefore, we can deduce that for all $x \leq \xi_k$ $q(x) - \widetilde{q}(x)$ is zero, while for [$\xi_k,b$) is a degree 3 polynomial. 
216 | 	\begin{figure}[!hb]
217 | 		\begin{center}
218 | 			\begin{tikzpicture}
219 | 				\begin{axis}[
220 | 					legend pos=north east,
221 | 					axis lines=middle,
222 | 					xtick={16,12}, ytick=\empty,
223 | 					xmin=-2, xmax=20,
224 | 					ymin=-2, ymax=40,
225 | 					xlabel=$\xi$,ylabel=$y$,
226 | 					]
227 | 					\addplot[
228 | 					domain = 0:16,
229 | 					thick,
230 | 					red,
231 | 					] {cos(.5 * pi * deg(x)) + .5*x + 2};
232 | 					\addplot[
233 | 					domain = 0:20,
234 | 					thick,
235 | 					blue,
236 | 					] {cos(.5 * pi * deg(x)) + .5*x + 2};
237 | 					\addplot[
238 | 					domain = 16:20,
239 | 					thick,
240 | 					orange,
241 | 					] {cos(.5 * pi * deg(x)) + .5*x - 9};
242 | 					\addplot[
243 | 					domain = 0:16,
244 | 					thick,
245 | 					orange,
246 | 					] {0};
247 | 					\addplot[
248 | 					domain = 16:20,
249 | 					thick,
250 | 					red,
251 | 					] {x - 5};
252 | 					
253 | 					\addplot[thick, samples=50, dashed,black] coordinates {(12,0)(12,20)};
254 | 					\addplot[thick, samples=50, dashed,black] coordinates {(16,0)(16,20)};
255 | 					\addplot[thick, samples=50, dashed,black] coordinates {(20,0)(20,20)};
256 | 					
257 | 					
258 | 					\addlegendentry{$\widetilde{q}(\cdot)$}
259 | 					\addlegendentry{$q(\cdot)$}
260 | 					\addlegendentry{$q(\cdot) - \widetilde{q}(\cdot)$}
261 | 					
262 | 				\end{axis}
263 | 			\end{tikzpicture}
264 | 			\caption{$q(\cdot) - \widetilde{q}(\cdot)$, $12= \xi_{k-1}$, $16 = \xi_k$}
265 | 			\label{fig:regression example}
266 | 		\end{center}
267 | 	\end{figure}
268 | 	\\Furthermore, recall that we know that $q(x)$ and $\widetilde{q}(x)$ have continuous derivatives. Notice that
269 | 	\[
270 | 	q(\xi_k) - \widetilde{q}(\xi_k) = 0 = b_0,
271 | 	\] \[
272 | 	q'(\xi_k) - \widetilde{q}'(\xi_k) = 0 = b_1, 
273 | 	\] \[
274 | 	q''(\xi_k) - \widetilde{q}''(\xi_k) = 0 = 2b_2.
275 | 	\]
276 | 	And given we can rewrite $q(x) - \widetilde{q}(x) = b_3(x-\xi_k)^3 + b_2(x-\xi_k)^2 + b_1(x-\xi_k) + b_0$. Therefore, $q(x) - \widetilde{q}(x) = b_3(x - \xi_k)^3$ for all $x \geq \xi_k$. Rewriting it shows $q(x) - \widetilde{q}(x) = b_3(x - \xi_k)^3$ and hence
277 | 	\begin{align*}
278 | 		q(x) &= \widetilde{q}(x) + b_3(x - \xi_k)_+^3 = \sum_{i=1}^{k+4}\lambda_ih_i(x).
279 | 	\end{align*}
280 | 	where $\lambda_{k+4} = b_3$
281 | \end{proof} 
282 | 	Given this lemma, we will now show that degree $3$ spline $g$ is the minimizer of $L_\lambda(\hat{r})$. To do this, we will construct a natural spline that matches $g$ on $x_1, ..., x_n$, namely $\widetilde{g}$, where we define $\widetilde{g}(x) = g(x)$ for all $x \in \left\{x_1, \cdots, x_n\right\}$. This implies \[
283 | 	\sum_{i =1}^{k}(Y_i - g(x_i))^2 = \sum_{i =1}^{k}(Y_i - \widetilde{g}(x_i))^2  
284 | 	\]
285 | 	Next we will show that $\int \widetilde{g}''(x)^2dx \leq \int g''(x)^2dx$ which will then indicate that $L_\lambda(\widetilde{g}) \leq L_\lambda(g)$. Notice that in these two inequalities, equality is attained at $g = \widetilde{g}$.
286 | 	Let us consider $h$ where $h = g - \widetilde{g} $. Given this, we only need to deduce $h(x) = 0$. Note that 
287 | 	\begin{align*}
288 | 		\int \widetilde{g}''(x)^2dx &= \int (\widetilde{g}''(x) + h''(x))^2dx \\
289 | 		&= \int (\widetilde{g}''(x))^2dx + 2\int \widetilde{g}''(x) h''(x)dx + \int (h''(x))^2dx \\
290 | 		&\geq \int (\widetilde{g}''(x))^2dx + 2\int \widetilde{g}''(x) h''(x)dx
291 | 	\end{align*}
292 | 	Here we want to show that $2\int \widetilde{g}''(x) h''(x)dx = 0$. By integration by parts, we can show: 
293 | 	\begin{align*}
294 | 		\int_{a}^{b} \widetilde{g}''(x) h''(x)dx &= \left.\widetilde{g}(x)''h'(x) \right|_{a}^b - \int_{a}^{b} h'(x)\widetilde{g}'''(x)dx.
295 | 	\end{align*}
296 | 	Recall that in natural splice, $\widetilde{g}'' = 0$, therefore $\widetilde{g}''(a) = 0$ and $\widetilde{g}''(b) = 0$, leaving us with
297 | 	\begin{align*}
298 | 		\int_{a}^{b} \widetilde{g}''(x) h''(x)dx &= - \int_{a}^{b} h'(x)\widetilde{g}'''(x)dx.
299 | 	\end{align*}
300 | 	Here, we can identify $\widetilde{g}''(x)$ as constant $c_i$ on $[\xi_i, \xi_{i+1}]$ given that $\widetilde{g}$ is a degree-$3$ polynomial on the interval. This allows us to further expand 
301 | 	\begin{align*}
302 | 		\int_{a}^{b} \widetilde{g}''(x) h''(x)dx &= - \int_{a}^{b} h'(x)\widetilde{g}'''(x)dx \\
303 | 		&= - \int_{a}^{x_1} h'(x)\widetilde{g}'''(x)dx  - \sum_{i = 1}^{n-1}\int_{x_i}^{x_{i+1}} h'(x)\widetilde{g}'''(x)dx - \int_{x_n}^{b} h'(x)\widetilde{g}'''(x)dx \\ 
304 | 		&= -c_0 \int_{a}^{x_i} h'(x)dx  - \sum_{i = 1}^{n-1} c_i\int_{x_i}^{x_{i+1}} h'(x)dx - c_{n} \int_{x_n}^{b} h'(x)dx.
305 | 	\end{align*}
306 | 	Here, $c_0$ and $c_n$ are zeros because $\widetilde{g}$ is linear near the boundary given that it is a natural spline, leaving us with
307 | 	\begin{align*}
308 | 		\int_{a}^{b} \widetilde{g}''(x) h''(x)dx &=  - \sum_{i = 1}^{n-1} c_i\int_{x_i}^{x_{i+1}} (h'(x))dx \\ 
309 | 		&=   - \sum_{i = 1}^{n-1} c_i (h(x_{i+1}) - h(x_i))dx  \\
310 | 		& = 0,
311 | 	\end{align*}
312 | 	where the last line follows since $g = \widetilde{g}$ on knots, $h(x_{i+1})$ and $h(x_{i})$ are zero. 
313 | 
314 | 
315 | %\subsection{}


--------------------------------------------------------------------------------
/Spring2021/06-04-2021.tex:
--------------------------------------------------------------------------------
  1 | %\newcommand{\Exp}{\mathbb{E}}
  2 | 
  3 | % reset section counter
  4 | \setcounter{section}{0}
  5 | 
  6 | %\metadata{lecture ID}{Your names}{date}
  7 | \metadata{10}{Benjamin Anderson}{June 4th, 2021}
  8 | 
  9 | \sec{Review and overview}
 10 | Last lecture, we discussed the Gaussian process, which is a Bayesian approach to supervised, nonparametric problems. It can be thought of as a generalization of the mixture of Gaussians model, with an infinite number of Gaussian distributions. Today, we will discuss the Dirichlet process. As with the Gaussian process, our discussion of the Dirichlet process will begin with simpler, parametric mixture models, which we will then build on to understand the more complex Dirichlet process.
 11 | 
 12 | Unlike the Gaussian process, the Dirichlet process is used in an unsupervised setting, to model a distribution over some variable $X$, rather than modeling a conditional distribution of $Y \mid X$. We'll first review parametric mixture models, which are one way to model a probability distribution. Then, we'll discuss how to extend these models to a Bayesian setting, by establishing a prior over the parameters (this will require a tangent to define the Dirichlet distribution). Then, we'll discuss topic modeling, a popular type of unsupervised machine learning model, as an entry point into the Dirichlet process. Finally, we will define the Dirichlet process itself, which can be thought of as a topic model with infinite topics.
 13 | 
 14 | \sec{Parametric mixture models \& extension to Bayesian setting}
 15 | 
 16 | \subsec{Review: Gaussian mixture model}
 17 | The ``mixture of $k$ Gaussians'' is a probability distribution with the following generative ``story'' for how a sample $X_i$ is generated:
 18 | \begin{enumerate}
 19 | \item First, one of $k$ ``sources'', $z_i$, is chosen from a discrete (or categorical) distribution $\pi$ (which can be thought of as simply a non-negative, $k$-dimensional vector whose components sum to 1).
 20 | \item Then, $X_i$ is sampled from a Gaussian distribution whose mean and covariance are conditional on the choice of $z_i$, i.e. $X_i \mid z_i \sim \cN(\mu_{z_i}, \Sigma_{z_i})$
 21 | \end{enumerate}
 22 | 
 23 | The result is called a mixture of Gaussians because it is as if you combined $k$ Gaussian distributions (each with some weight $\pi_k$) into one distribution. You can read more about this model in the previous set of lecture notes.
 24 | 
 25 | \subsec{Dirichlet distribution}
 26 | In order to extend the mixture of Gaussians to a Bayesian setting, we have to establish priors over the parameters. There are many ways to get priors over $\mu_{z_i}$ and $\Sigma_{z_i}$, and we won't go into detail on that (a unit normal distribution and chi-squared distribution respectively is one example). However, the choice of parameters for $\pi$ is unique, as it is constrained: we must have that $\sum_{i = 1}^k = 1$, so not just any choice of $\pi_1, \ldots, \pi_k$ is a valid probability distribution. Our distribution over $\pi$ should only have probability mass on valid choices of $\pi$. The Dirichlet distribution (denoted \textit{Dir} for short in mathematical equations), is a natural choice.
 27 | 
 28 | In the Bayesian setting, we are interested in studying distributions over the \textit{parameters} of another distribution (which are themselves random variables). A simpler example of this idea is the Beta distribution, which is a probability distribution over the parameter $p$ for a binomial random variable (i.e. a coin flip). The Beta distribution represents the probability distribution over the ``true'' probability of the coin coming up heads. A Dirichlet distribution is simply a generalization of the Beta distribution to an experiment with more than two outcomes, e.g. a dice roll. So, the Dirichlet distribution could be used to model the distribution over the parameters $\pi_1, \pi_2, \pi_3, \pi_4, \pi_5, \pi_6$ representing the probability each side of the die has of being rolled.
 29 | 
 30 | The Dirichlet distribution is parametrized by $\alpha_1, \ldots, \alpha_k$, which, as with the Beta distribution, can be interpreted as ``pseudocounts'', i.e. a larger $\alpha_i$ will result in a distribution where larger values of $\pi_i$ have more density; and moreover, if their relative magnitudes are all held fixed, larger parameters denote more ``confidence'', resulting in a less uniform distribution. (As a simple example, if you've rolled each number on a die one time, you'd guess that all sides equally likely, but with low confidence. If you've rolled each number on a die 1000 times, you'd guess they're all equally likely, with high confidence.)
 31 | 
 32 | The Dirichlet distribution has a few important properties and related intuitions, some of which will be important for our later discussion of the Dirichlet process:
 33 | 
 34 | \begin{enumerate}
 35 | \item The PDF of the Dirichlet distribution over $K$-dimensional $\vec{\pi}$ is:
 36 | 
 37 | $$p(\vec{\pi}) = \frac{\prod\limits_{i = 1}^K \Gamma(\alpha_i)}{\Gamma\left(\sum\limits_{i = 1}^K \alpha_i\right)} \prod\limits_{i = 1}^K \pi_i^{\alpha_i - 1}$$
 38 | 
 39 | ...where $\Gamma$ is the Gamma function (too complicated to explain here).
 40 | 
 41 | \item $\Exp[\pi_i] = \frac{\alpha_i}{\sum\limits_{j = 1}^K \alpha_j}$, which means that the relative magnitudes of the $\alpha$'s determine the expected relative magnitudes of the components of $\pi$. (The pseudocounts interpretation helps here: a larger $\alpha_i$ is a larger pseudocount, as if you've already observed that event more, so that should make the parameter for the probability of that event larger.)
 42 | 
 43 | \item $\sum_{i = 1}^K \alpha_i$ controls how ``sharp'' the distribution is. Again, by the pseudocounts logic, having ``seen'' (or hallucinated) more data will make us more confident in what we think the distribution is, so this makes sense intuitively.
 44 | 
 45 | \item \textbf{Relationship to Gamma distribution:} If $\eta_k \iid Gamma(\alpha_k, 1)$, for $i \in \{1, \ldots, K\}$, and $\pi_i = \frac{\eta_i}{\sum_j \pi_j}$, then $(\pi_1, \ldots, \pi_K) \sim Dir(\alpha_1, \ldots, \alpha_K)$.
 46 | 
 47 | \newpage
 48 | \item \textbf{Merging rule:} If $(\pi_1, \ldots, \pi_K) \sim Dir(\alpha_1, \ldots, \alpha_K)$, then we can ``merge'' $\pi$'s by summing them. Doing so will create a new Dirichlet distribution with fewer components, parametrized by new $\alpha$'s obtained by summing the $\alpha_j$'s corresponding to the $\pi_j$'s that were combined. For example:
 49 | 
 50 | $$(\pi_1 + \pi_2, \pi_3 + \pi_4, \ldots) \sim Dir(\alpha_1 + \alpha_2, \alpha_3 + \alpha_4, \ldots)$$
 51 | 
 52 | \item \textbf{Expanding rule:} Reverse merging rule; you can also obtain a new Dirichlet distribution from an existing one by ``splitting'' components; for example:
 53 | 
 54 | $$(\pi_1\theta, \pi_1(1-\theta), \pi_2, \ldots, \pi_K) \sim Dir(\alpha_1b,\alpha_1(1-b),\alpha_2, \ldots, \alpha_K)$$
 55 | 
 56 | ...where $\theta \sim Beta(\alpha_1b, \alpha_1(1-b))$  for $0< b <1$. 
 57 | 
 58 | \item \textbf{Renormalizing property:} Given $(\pi_1, \ldots, \pi_K) \sim Dir(\alpha_1, \ldots, \alpha_K)$, then if we discard one $\pi_i$ and its associated $\alpha_i$, and renormalize, we get another Dirichlet distribution with parameters $\alpha_1, \ldots, \alpha_{i - 1}, \alpha_{i + 1}, \ldots, \alpha_K$.
 59 | \end{enumerate}
 60 | 
 61 | \subsec{Bayesian Gaussian mixture model}
 62 | Now, if we wanted to extend the mixture of Gaussians to the Bayesian setting, we have the tools to do so. The only change from the frequentist version of the generative story is that the parameters themselves ($\mu$'s, $\Sigma$'s, and $\pi$) are drawn from a prior distribution; \textit{then} the latent variables $z_i$ are drawn from $z \mid \pi \sim Categorical(\pi_1, \ldots, \pi_k)$; and finally $X_i \mid \mu, \Sigma, z_i \sim \cN(\mu_{z_i}, \Sigma_{z_i})$.
 63 | 
 64 | \subsec{Dirichlet topic model}
 65 | Topic modeling is a common unsupervised technique in natural language processing, which models a distribution over documents (collections of words) by grouping them into clusters (topics). This is a mixture model just like the mixture of Gaussians: each topic is a ``source'', and then the conditional on the document being associated with that source (topic), there's a set of probabilities associated with each word appearing in the document.
 66 | 
 67 | More formally, we set up with a vocabulary $\cV$ with $W$ words. Each document is represented as a vector in $\mathbb{R}^W$, with the $i$-th component equal to the number of times word $i$ appears in it. (This is called a ``bag of words'' representation.) For simplicity, we assume each document is length $n$. Then, we aim to model the distribution over these document vectors with a mixture model. Unlike the Gaussian mixture, these document vectors can only take on non-negative integer values in each entry, so rather than a Gaussian conditioned on the source, we model a document as a multinomial distribution conditioned on its topic. (A multinomial distribution is just $n$ trials of a categorical distribution.) So, we have parameters $\pi$ for the choice of topic, and then $\theta_{z_i}$ for the multinomial distribution over words for each topic.
 68 | 
 69 | Then, the generative story is:
 70 | \begin{enumerate}
 71 | \item First, select a topic, $z_i \sim Categorical(\pi)$, where as before, the parameter $\pi$ is a vector whose components sum to 1.
 72 | \item Generate $n$ words with $Multinomial(n, \theta_{z_i})$, this produces the document $X$.
 73 | \end{enumerate}
 74 | 
 75 | In order to make this Bayesian, which gets us to the Dirichlet Topic Model, we only need a prior over the parameters, $\pi$ and $\theta_{z_i}$ for $i \in \{1, \ldots, W\}$, since the number of words $n$ is fixed. Both can be Dirichlet priors: one for $\pi$ with the number of parameters equal to the number of topics; and the other for each $\theta_{z_i}$ with the number of parameters equal to the number of words in the vocabulary. Then, to generate a document, we first sample $\pi$ and all $\theta$'s, then follow the generative process above.
 76 | 
 77 | \sec{Dirichlet process}
 78 | 
 79 | \subsec{Overview}
 80 | One way to think about the Dirichlet process is as a topic model whose number of topics is not fixed, but rather can grow with the number of data points grows. Rather than a fixed number of topics in advance, we allow choosing the number of topics to be ``part of the model'', in a sense. To do this, we need a prior that can generate probability vectors of any dimension, not just a fixed $K$, in other words, a distribution over $\bigcup_{K=1}^\infty \Delta_K$. We can think of the Dirichlet process as doing exactly this. Let's take some abstractions from the parametric model to generalize to the Dirichlet process setting.
 81 | 
 82 | In the parametric mixture models we've been discussing, you sample some parameters $\theta_k^*$ for each source (or topic) from some distribution $H$; sample $\pi$ from a Dirichlet distribution; sample the latent $z$ from $Categorical(\pi)$, and finally sample $X$ from some distribution parametrized by $\theta_{z}$. The important thing to take away here is that, for a given sample $X_i$, once you've fixed $\pi$ and all the $\theta_k^*$'s, then your choice of $z_i$ completely determines $\theta_{z_i}$, i.e. the set of parameters you'll use to select $X_i$. 
 83 | 
 84 | Let's say we are modeling $n$ examples, $X_1, \ldots, X_n$. For each, we can think about its corresponding $\theta_{z_i}$ itself as a random variable, drawn from a distribution $G$. $G$ is fixed given a choice of $\pi$ and all $\theta_{k}^*$'s; which means that the prior for $G$ is determined by the choice of $\alpha$ and $H$. A realization of $G$ is a choice of $\theta_i$ (i.e. the $\theta$ used to sample $X_i$), which is one of the $k$ possible choices $\theta_1^*, \ldots, \theta_{K}^*$. $G$ is basically a discrete distribution with point masses on all the locations defined by $\theta_1^*, \ldots, \theta_{K}^*$, with the caveat that the magnitude of $K$ is not fixed. The goal is to construct a prior over $G$, which in turn gives a distribution over $\theta_i$, which in turn parametrizes a distribution over $X_i$.
 85 | 
 86 | There are two approaches for designing a prior for $G$. One is to directly construct it. (We'll do that later.) The other is to model the joint distribution over $\theta_1, \ldots, \theta_n$ (i.e. the choices of parameters for each of the $n$ examples), which then implicitly defines $G$. We will start with this approach. This will require a few theoretical building blocks, which will occupy the next few sections.
 87 | 
 88 | \subsec{Exchangeability \& de Finett's theorem}
 89 | Exchangeability is a fundamental concept in Bayesian statistics. Given a sequence of random variables $X_1, \ldots, X_n$, we say they are \textit{exchangeable} if their joint distribution $p$ is permutation-invariant. That is, if $p(X_1 = k_1, \ldots, X_n = k_n) = c$, then if we scramble up all the $k$'s, the joint probability would still be $c$, no matter what order the $k$'s are in. Furthermore, we say a sequence of random variables is \textit{infinitely exchangeable} if any length-$n$ prefix of the sequence is exchangeable for all $n \geq 1$.
 90 | 
 91 | \begin{theorem}
 92 | De Finett's Theorem: If $\theta_1, \ldots, \theta_n$ are infinitely exchangeable, then there exists a random variable $G$ such that $p(\theta_1, \ldots, \theta_n) = \int\limits p(G) \prod\limits_{i = 1}^n p(\theta_i \mid G) dG$.
 93 | \end{theorem}
 94 | 
 95 | In other words, there exists some $G$ such that joint distribution over all $n$ $\theta$'s ``factors'' and is equivalent to the distribution obtained by first sampling $G$ from $p(G)$, then sampling $\theta_i$ from the distribution defined by $G$. The implication is that we don't have to define $G$ directly; we can instead describe $\theta_1, \ldots, \theta_n$ (the ``effect'' of $G$) and this is sufficient (since by this theorem, $G$ is guaranteed to exist, and we can do inference tasks using just the $\theta_i$'s).
 96 | 
 97 | \subsec{The Chinese restaurant process}
 98 | To define the joint distribution over $\theta_1, \ldots, \theta_n$, we first have to explain something called the Chinese restaurant process, which provides intuition for this distribution. Imagine a restaurant with infinitely many tables, and $n$ customers. Customers enter one at a time, and sit at a table according to the following rules:
 99 | 
100 | \begin{enumerate}
101 | \item Customer 1 sits at table 1 with probability 1.
102 | \item For $i > 1$, customer $i$ sits at (occupied) table $k$ with probability $\frac{n_k}{\alpha + i - 1}$ (where $n_k$ is the number of previous customers at that table), or else sits down and starts a new table with probability $\frac{\alpha}{\alpha + i - 1}$. 
103 | \end{enumerate}
104 | 
105 | Because all the $n_k$'s add up to $i - 1$ (the number of previous customers) we can quite easily confirm this setup makes sense (i.e. the probabilities of the customer's choices sum to 1). What does this thought process have to do with Dirichlet processes though? Well, let the latent variable $z_i$ be the table number of the $i$-th customer. Then, if each ``table'' is assigned some $\theta_k^* \sim H$, then this gives us a way of picking $\theta_i$'s, by simply letting $\theta_i$ be the value assigned to the table where the $i$-th customer sits. This is also known as the Blackwell-MacQueen urn scheme.
106 | 
107 | This provides a joint distribution over $\theta_1, \ldots, \theta_n$. Moreover, it is exchangeable (possible to verify, but we won't do it here). Intuitively, it will result in some outcomes that ``could be'' IID draws from some discrete distribution $G$ (informally speaking). Formally, applying de Finetti's theorem, because exchangeability holds, we know that there exists a $G$ such that $\theta_1, \ldots, \theta_n$ chosen according to this scheme are equivalent to first sampling $G \sim DP(\alpha, H)$, then sampling each $\theta_i \mid G \iid G$. We don't know what $G$ is, just that it exists; and we can do all the interesting probabilistic inference without it (using just the $\theta_i$'s).
108 | 
109 | \subsec{Explicitly constructing $G$ (informal)}
110 | We don't have to specify $G$ indirectly in this way—it can also be directly defined and constructed. First, a slightly-incorrect, informal treatment. We \textit{basically} want an infinite-dimensional Dirichlet distribution, $\lim_{k \to \infty} Dir(\alpha/k, \ldots, \alpha/k)$. Then, we would just select some $\theta_k^* \sim H$ for each of these components, and have an infinite mixture. $G$ could then be defined as a set of ``point masses'' with some density on each of the $\theta_k^*$'s, i.e.:
111 | 
112 | $$G = \sum_{k = 1}^\infty \pi_k \delta_{\theta_k^*}$$
113 | 
114 | ...where $\delta$ denotes the direct measure. 
115 | 
116 | This is all slightly imprecise and incorrect, but it gets at the basic idea. To formalize it, we use a variate of the \textbf{merging rule} (10.2.2). Rather than summing pairs of $\pi$'s, as discussed there, imagine \textit{partitioning} $\pi$'s into groups. By the same rule, we get a new Dirichlet distribution with a component for each partition, whose $\alpha$ parameters for each partition are the sum of the $\alpha_k$'s in that partition. The Dirichlet process is a bit different, because we will partition an infinite space, not a finite list of $\pi$'s, but this is exactly the idea.
117 | 
118 | Recall that $G$ is a distribution over the space of $\Theta$, the set of all possible $\theta_k^*$. (A discrete distribution, with point masses on certain possible values $\theta_k^*$.) Consider the partition of $\Theta$ into $A_1, \ldots, A_m$. Then, $G(A_i)$ is basically the total mass of $G$ that's contained in the $A_i$ segment of the partition; i.e. $G(A_i) = \Pr[\theta \in A_i]$. This is deterministic for fixed $G$ (but of course random otherwise, since $G$ is a random variable itself). The claim is that $G(A_1), \ldots, G(A_m) \sim Dir(\alpha H(A_1), \ldots, \alpha H(A_m))$, i.e. that a partition of $\Theta$ into some finite number of segments results in a Dirichlet distribution.
119 | 
120 | We have $G = \sum_{k = 1}^\infty \pi_k \delta_{\theta_k^*}$, but we can write $G(A_i)$ as the sum over only the mass in partition $A_i$, i.e $\sum_{k = 1}^\infty \pi_k \mathbf{1}\{\theta_k^* \in A_i\}$. Or, letting $I_j$ be the set of $j$ such that $\theta_k^* \in A_j$, we can also write $G(A_j) = \sum_{k \in I_j} \pi_k$. We can then write:
121 | 
122 | $$(G(A_1), \ldots, G(A_m)) = \left(	\sum\limits_{k \in I_1} \pi_k, \ldots, \sum\limits_{k \in I_m} \pi_k 	\right) \sim$$
123 | $$Dir(\sum\limits_{k \in I_1} \alpha_k, \ldots, \sum\limits_{k \in I_m} \alpha_k)$$
124 | 
125 | This is close to showing the claim, but it's still not quite right because $I$ is not fixed, it is a random variable. But, we can intuitively explain why the $I$'s aren't that important. Remember, our goal is to get something like an infinite-dimensional Dirichlet distribution, with parameters $\alpha / K$. Because of the idea of this uniform prior, we can say that $\sum_{\alpha_k \in I_j}\alpha_k = \sum_{k \in I_j} \alpha / K = |I_j| \alpha / K $, i.e. $\alpha$ multiplied by the fraction the probability mass in partition defined by $I_j$, which is just $\alpha H(A_j)$, which is what we wanted to show.
126 | 
127 | \subsec{Explicitly constructing $G$ (formal)}
128 | The formal definition of a Dirichlet process: A unique distribution over distributions on $\Theta$ such that for any partition $A_1, \ldots, A_m$ of $\Theta$, we have that when $G \sim DP(\alpha, H)$, then $G(A_1), \ldots, G(A_m) \sim Dir(\alpha H(A_1), \ldots, \alpha H(A_m))$. We can explicitly construct such a distribution with the ``stick-breaking construction.'' 
129 | \begin{enumerate}
130 | \item Sample $\theta_k^* \iid H$ for $k = 1, 2, \ldots, \infty$.
131 | \item Choose $\beta_k \iid Beta(1, \alpha)$ for $k = 1, 2, \ldots, \infty$.
132 | \item Set $\pi_k = \beta_k \prod\limits_{i = 1}^{k - 1} (1 - \beta_i)$.
133 | \item Then, $G = \sum_{k = 1}^\infty \pi_k \delta_{\theta_k^*}$.
134 | \end{enumerate}
135 | 
136 | It's called the "stick-breaking construction" because the intuition is that you begin with a stick of length 1, and then at the $k$-th step, break off the fraction $\beta_k$ of what's left, and choose that value for $\pi_k$. So first, the stick is length 1, you break off $\beta_1$, and so $\pi_1 = \beta_1$. Then, there's $(1 - \beta_1)$ left; you break off $\beta_2$ of that, so then $\pi_2 = (1 - \beta_1)\beta_2$. So on and so forth. This gives a formal construction of $G$ for the Dirichlet process.
137 | 
138 | Then, all that remains is to do inference, which is typically done with Markov-Chain Monte Carlo (e.g. Gibbs Sampling). This is tractable, since the conditional distributions of the $\theta_i$'s have nice properties.
139 | 
140 | \sec{Summary}
141 | Since this is the last class, let's look back at what we've learned.
142 | \begin{enumerate}
143 | \item Non-parametric regression, including the kernel estimator, local polynomial/linear regression, splines, and using cross-validation to select a model and tune hyperparameters.
144 | \item The kernel method, and its connection to splines and wide two-layer neural networks.
145 | \item Neural networks, transfer learning, and few-shot learning.
146 | \item Density estimation, for CDF and PDF.
147 | \item Bayesian nonparametric models (Gaussian and Dirichlet processes).
148 | \end{enumerate}


--------------------------------------------------------------------------------
/Spring2021/05-07-2021.tex:
--------------------------------------------------------------------------------
  1 | %\newcommand{\Exp}{\mathbb{E}}
  2 | 
  3 | % reset section counter
  4 | \setcounter{section}{0}
  5 | 
  6 | %\metadata{lecture ID}{Your names}{date}
  7 | \metadata{6}{Louie Kam and Shaan Patel}{May 7th, 2021}
  8 | 
  9 | \sec{Review and overview}
 10 | In the previous lecture, we began our discussion on neural nets and claimed that neural nets are equivalent to nonparametric penalized regression for one dimensional inputs; specifically, we showed that there exists a complexity measure $\bar{R}(f)$ for nonparametric penalized regression such that the nonparametrized penalized regression minimization problem is the same as the parametrized neural network one, i.e. $\min_{f}L(f) + \lambda \bar{R}(f) = \min_{\theta} L(h_\theta) + \lambda C(\theta)$.
 11 | 
 12 | In this lecture, we will discuss the next step of the proof by deriving an explicit formula for $\bar{R}(f)$. Following that, we will take a look at algorithms that will solve this minimization problem and discuss the intuition behind them. In particular, we will discuss gradient descent methods and feature training.
 13 | \sec{Finishing up proof}
 14 | 
 15 | \subsec{Recap}
 16 | Recall from the last lecture, we introduced the following quantities for an infinitely-wide two-layer neural net,
 17 | \begin{align*}
 18 | a & = (a_1, a_2, a_3, \cdots) \in \bbR^\infty, \\
 19 | b & = (b_1, b_2, b_3, \cdots) \in \bbR^\infty, \\
 20 | w & = (w_1, w_2, w_3, \cdots) \in \bbR^\infty, \\
 21 | h_\theta(x) & = \sum_{i=1}^\infty a_i [w_ix+b_i]_+, \\
 22 | C(\theta) &= \frac{1}{2} \bigg(\sum_{i=1}^\infty a_i^2 + \sum_{i=1}^\infty w_i^2\bigg).
 23 | \end{align*}
 24 | Denote $\theta$ as the set of vectors $\{(a_i, b_i, w_i)\}_{i=1}^\infty$. In the previous lecture, we posted the following theorem.
 25 | 
 26 | \begin{theorem}\label{thm:eq}
 27 | Let $\bar{R}(f)$ be a complexity measure such that
 28 | \al{\label{eqn:measure}
 29 | \bar{R}(f) \overset{\Delta}{=} \max\bigg\{\int_{-\infty}^\infty |f''(x)|, |f'(-\infty)+f'(+\infty)|\bigg\}.
 30 | }
 31 |  Let $f^*$ to be the nonparametric penalized regression that minimizes
 32 | \[
 33 | \min_f L(f)+\lambda\bar{R}(f).
 34 | \]
 35 |  Let $h_{\theta^*}$ be the parametric neural net that minimizes
 36 | \[
 37 | \min_\theta L(h_\theta)+\lambda C(\theta).
 38 | \]
 39 | Then $f^* = h_{\theta^*}$ and 
 40 | \al{\label{eqn:min_eq}
 41 | \min_f L(f)+\lambda\bar{R}(f) = \min_\theta L(h_\theta)+\lambda C(\theta).
 42 | }
 43 | \end{theorem}
 44 | 
 45 |  We previously showed that there exists
 46 | \al{\label{eqn:rf}
 47 | \bar{R}(f) \overset{\Delta}{=} \min C(\theta) \quad \text{ s.t. } \quad f(x) = h_\theta(x),
 48 | }
 49 | and Eq.~\eqref{eqn:min_eq} holds for this definition of $\bar{R}(f)$. It remains to show the definition of complexity measure in Eq.~\eqref{eqn:min_eq} holds for \eqref{eqn:rf}. Let us take a detour and prove a related lemma.
 50 | 
 51 | \subsec{Preparation}
 52 | 
 53 | \begin{lemma}\label{lemma:a=w}
 54 | The minimizer $\theta^*$ of Eq.~\eqref{eqn:rf} satisfies $|a_i| = |w_i|$, for all $i = 1, 2, \cdots$. 
 55 | \end{lemma}
 56 | 
 57 | Recall that the $w_i$'s are the weights of the first layer and the $a_i$'s are the weights of the second layer. Therefore, this lemma implies that the weights are balanced between the two levels in order to minimize the complexity.
 58 | 
 59 | \begin{proof}
 60 | 	We can write $a_i[w_ix + b_i]_+ = \frac{a_i}{\gamma}[\gamma w_ix + \gamma b_i]_+$ if $\gamma > 0$. This is allowed because $[\gamma t]_+ = \gamma[t]_+$.
 61 | 	Now suppose that $(a_i,w_i,b_i)$ is optimal for each $i$. Then the complexity should not decrease after scaling by $\gamma$ as we have already found the minimum.
 62 | 	\begin{equation}
 63 | 		\frac{1}{2}(a_i^2 + w_i^2) \leq \frac{1}{2}(\frac{a_i^2}{\gamma^2} + \gamma^2 w_i^2).
 64 | 	\end{equation}
 65 | 	Now we minimize with respect to $\gamma$.
 66 | 	\begin{equation}
 67 | 		\min_{\gamma} \frac{1}{2}(\frac{a_i^2}{\gamma^2} + \gamma^2 w_i^2) = \frac{1}{2}(a_i^2 + w_i^2).
 68 | 	\end{equation}
 69 | 	Let $g(\gamma) = \frac{1}{2}(\frac{a_i^2}{\gamma^2} + \gamma^2 w_i^2)$. Therefore $\min g(\gamma) = g(1)$.
 70 | 	\begin{align*}
 71 | 		& g'(1) = 0 = \frac{-a_i^2}{\gamma^3} + \gamma w_i^2 = -a_i^2 + w_i^2 \\
 72 | 		& \Rightarrow a_i^2 = w_i^2 \\
 73 | 		& \Rightarrow |a_i| = |w_i|.
 74 | 	\end{align*}
 75 | \end{proof} 
 76 |  We now proceed to finish our proof of Theorem \ref{thm:eq}. We can redefine our neural net $h_\theta(x)$: 
 77 | \[
 78 | \begin{split}
 79 | h_\theta(x) &= \sum_{i=1}^\infty a_i [w_ix+b_i]_+ = \sum_{i=1}^\infty a_i |a_i| \bigg[\frac{w_i}{|a_i|} x + \frac{b_i}{|a_i|}\bigg]_+ = \sum_{i=1}^\infty \alpha_i [\widetilde w_i x + \beta_i]_+,
 80 | \end{split}
 81 | \]
 82 | where $\alpha_i = a_i |a_i|$, $\widetilde w_i = \frac{w_i}{|a_i|}$, and $\beta_i = \frac{b_i}{|a_i|}$. Since $\theta$ satisfies Eq.~\eqref{eqn:rf}, by using the results of Lemma \ref{lemma:a=w}, we know that $\alpha_i \in \{-a_i^2, a_i^2\}$ and $\widetilde w_i \in \{-1, 1\}$. Furthermore, we can redefine $C(\theta)$:
 83 | \[
 84 | \begin{split}
 85 | C(\theta) &= \frac{1}{2} \bigg(\sum_{i=1}^\infty a_i^2 + \sum_{i=1}^\infty w_i^2\bigg) = \frac{1}{2} \bigg(\sum_{i=1}^\infty a_i^2 + \sum_{i=1}^\infty a_i^2\bigg) = \sum_{i=1}^\infty a_i^2 = \sum_{i=1}^\infty |\alpha_i|.
 86 | \end{split}
 87 | \]
 88 | Define a new neural net by the set of parameters $\widetilde \theta = \{(\alpha_i, \beta_i, \widetilde w_i)\}_{i=1}^\infty$. Then the objective function in Eq.~\eqref{eqn:rf} becomes
 89 | \al{\label{eqn:rf_1}
 90 | R(f) \overset{\Delta}{=} \min \norm{\alpha}_1 \quad \text{ s.t. } \quad f(x) = h_{\widetilde \theta}(x).
 91 | }
 92 | 
 93 | \subsec{Discretization to rewrite $h_{\widetilde\theta}(x)$}
 94 | The neural net $h_{\widetilde \theta}(x)$ can be grouped by whether $\widetilde w_i = 1$ and $\widetilde w_i = -1$:
 95 | \[
 96 | h_{\widetilde\theta}(x) = \sum_{i: \widetilde w_i = 1} \alpha_i [x+\beta_i]_+ + \sum_{i: \widetilde w_i = -1} \alpha_i [-x+\beta_i]_+
 97 | \]
 98 | Let $Z=\{z_1, \cdots, z_N\}$ be a discretization of $\bbR$. Then $[x+\beta_i]_+$ can be approximated by $[x+z_j]_+$, where the bin around $z_j$ is the ``bucket'' that $\beta_i$ falls into. Since $[x+z_j]_+$ is constant over all $i$ where $\beta_i$ falls into $z_j$, we can isolate $\alpha$ such that
 99 | \[
100 | \sum_{i: \beta_i \in z_j} \alpha_i [x+\beta_i]_+ \approx [x+z_j]_+ \sum_{i: \beta_i \in z_j} \alpha_i.
101 | \]
102 | Similar logic holds for $[-x+z_j]_+$. See Fig.~\ref{fig:disc} for an example of discretization. 
103 | \begin{figure}[h]
104 | \centering
105 | \includegraphics[width=0.8\textwidth]{figure/Lecture06/discretization.png}
106 | \caption{Example of discretization across $\bbR$} \label{fig:disc}
107 | \end{figure}
108 | 
109 | Define $u^+(z)$ and $u^-(z)$ as
110 | \[u^+(z)=\bigg(\sum_{i:\widetilde w_i = 1, \beta_i = z} \alpha_i\bigg), u^-(z)=\bigg(\sum_{i:\widetilde w_i = -1, \beta_i = z} \alpha_i\bigg).
111 | \]
112 | Then
113 | \[
114 | \sum_{i: \widetilde w = 1} \alpha_i [x+\beta_i]_+ = \sum_{z \in Z} [x+z]_+ \bigg(\sum_{i:\widetilde w_i = 1, \beta_i = z} \alpha_i\bigg) = \sum_{z \in Z} [x+z]_+ u^+(z),
115 | \]
116 | and thus we have 
117 | \[
118 | h_{\widetilde\theta}(x) = \sum_{z \in Z} [x+z]_+ u^+(z) + \sum_{z \in Z} [-x+z]_+ u^-(z).
119 | \]
120 | Taking the limit $N\to\infty$ makes the discretization fine-grained and results in the integral,
121 | \[
122 | h_{\widetilde\theta}(x) = \int_{-\infty}^\infty [x+z]_+ u^+(z) dz + \int_{-\infty}^\infty [-x+z]_+ u^-(z) dz.
123 | \]
124 | We can view $h_{\widetilde\theta}(x)$ as a linear combination of features $[x+z]_+$ and $[-x+z]_+$ for $z \in \bbR$.
125 | 
126 | \subsec{Reformulation of the objective}
127 |  We know that
128 | \[
129 | \sum_{i=1}^\infty |\alpha_i| \ge \int_{-\infty}^{\infty} |u^+(z)| dz + \int_{-\infty}^{\infty} |u^-(z)| dz.
130 | \]
131 | by the triangle equality; for every bucket $z$, $\sum_{i: \beta_i \in z} |\alpha_i| \ge |u^+(z)| = |\sum_{i: \beta_i \in z} \alpha_i|$ and a similar expression holds for $u^-(z)$. Equality occurs at the minimum, as the optimal $\theta$ minimizes complexity regardless of how we rewrite the expression. Thus, we can update our objective in Eq.~\eqref{eqn:rf_1}:
132 | \al{\label{eqn:rf_2}
133 | \min \int_{-\infty}^{\infty} |u^+(z)| dz + \int_{-\infty}^{\infty} |u^-(z)| dz \quad \text{ s.t. } \quad f(x)=h_{\widetilde\theta}(x).
134 | }
135 | We can write the first derivative of $[x+z]_+$ to be
136 | \[
137 | \frac{d}{dx} [x+z]_+ = I(x+z \ge 0),
138 | \]
139 | and the second derivative of $[x+z]_+$ to be
140 | \[
141 | \frac{d}{dx} I(x+z \ge 0) = \delta_{-x}(z) = 
142 | \begin{cases}
143 | \infty & \text{if } z=-x \\
144 | 0 & \text{otherwise}
145 | \end{cases}.
146 | \]
147 | Likewise, the first derivative of $[-x+z]_+$ is
148 | \[
149 | \frac{d}{dx} [-x+z]_+ = -I(-x+z \ge 0),
150 | \]
151 | and the second derivative of $[-x+z]_+$ is
152 | \[
153 | \frac{d}{dx} [-I(-x+z \ge 0)] = \delta_{x}(z) = 
154 | \begin{cases}
155 | \infty & \text{if } x=z \\
156 | 0 & \text{otherwise}
157 | \end{cases}.
158 | \]
159 | The derivative of $f(x)$ is
160 | \[
161 | f'(x) = h_{\widetilde\theta}'(x) = \int_{-\infty}^{\infty} I(x+z \ge 0) u^+(z) dz + \int_{-\infty}^{\infty} -I(-x+z \ge 0) u^-(z) dz,
162 | \]
163 | and the derivative of $f'(x)$ is
164 | \[
165 | f''(x) = \int_{-\infty}^\infty \delta_{-x}(z) u^+(z) dz + \int_{-\infty}^\infty \delta_{x}(z) u^-(z) dz = u^+(-x) + u^-(x).
166 | \]
167 | Note that the last equality holds because $\delta_{-x}(z)$ and $\delta_{x}(z)$ can be treated as ``degenerate'' probability distributions (with total probability 1 occurring only at $-x$ and $x$, respectively). Our choice of $x$ was arbitrary, so this holds for all $x$. Thus, the objective from Eq.~\eqref{eqn:rf_2} becomes
168 | \al{\label{eqn:rf_3}
169 | \min \int_{-\infty}^{\infty} |u^+(z)| dz + \int_{-\infty}^{\infty} |u^-(z)| dz \quad \text{ s.t. } \quad \forall x, f''(x) = u^+(-x) + u^-(x).
170 | }
171 | 
172 | \subsec{Simplification of the constraints}
173 | We can further remove redundancies by parameterizing $u^+(-x)$ and $u^-(x)$ in terms of a function $q$ and using the constraint $f''(x) = u^+(-x) + u^-(x)$:
174 | \[
175 | \begin{split}
176 | u^+(-x) &= \frac{1}{2}(f''(x) - q(x)), \\
177 | u^-(x) &= \frac{1}{2}(f''(x) + q(x)).
178 | \end{split}
179 | \]
180 | Then the objective function in Eq.~\eqref{eqn:rf_3} can be written as
181 | \[
182 | \begin{split}
183 | \int_{-\infty}^{\infty} |u^+(z)| dz + \int_{-\infty}^{\infty} |u^-(z)| dz &= \int_{-\infty}^{\infty} |u^+(-z)| dz + \int_{-\infty}^{\infty} |u^-(z)| dz \\
184 | &= \int_{-\infty}^{\infty} \Big|\frac{1}{2}(f''(z) - q(z))\Big| dz + \int_{-\infty}^{\infty} \Big|\frac{1}{2}(f''(z) + q(z))\Big| dz \\
185 | &= \frac{1}{2} \int_{-\infty}^\infty \Big(|f''(z)-q(z)| + |f''(z)+q(z)|\Big) dz.
186 | \end{split}
187 | \]
188 | Since $|f''(z)-q(z)| + |f''(z)+q(z)|$ is $2|f''(z)|$ if $|f''(z)| \ge q(z)$ and $2|q(z)|$ if $|f''(z)| < q(z)$, we have a simple expression for the objective function:
189 | \[
190 | \int_{-\infty}^{\infty} |u^+(z)| dz + \int_{-\infty}^{\infty} |u^-(z)| dz = \int_{-\infty}^\infty \max\{|f''(z)|, |q(z)|\} dz.
191 | \]
192 | We can find a constraint on $q$ using
193 | \[
194 | f'(x) = \int_{-\infty}^{\infty} I(x+z \ge 0) u^+(z) dz + \int_{-\infty}^{\infty} -I(-x+z \ge 0) u^-(z) dz,
195 | \]
196 | specifically the values of $f'(-\infty)$ and $f'(\infty)$:
197 | \[
198 | \begin{split}
199 | f'(-\infty) &= \int_{-\infty}^{\infty} -u^-(z) dz = \int_{-\infty}^{\infty} -\frac{1}{2}(f''(z) + q(z)) dz, \\
200 | f'(\infty) &= \int_{-\infty}^{\infty} u^+(z) dz = \int_{-\infty}^{\infty} u^+(-z) dz = \int_{-\infty}^{\infty} \frac{1}{2}(f''(z) - q(z)).
201 | \end{split}
202 | \]
203 | Thus, the sum
204 | \al{
205 | f'(-\infty) + f'(\infty) = - \int_{-\infty}^\infty q(z) dz
206 | }
207 | gives a constraint for $q$, and we update the objective in Equation \ref{eqn:rf_3} in terms of $q$:
208 | \al{\label{eqn:rf_4}
209 | \min \int_{-\infty}^{\infty} \max\{|f''(z)|, |q(z)|\} dz \text{ s.t. } f'(-\infty) + f'(\infty) = - \int_{-\infty}^\infty q(z) dz.
210 | }
211 | Previous formulations of the objective were taken with respect to many (infinite) variables, but we have found an equivalent objective with respect to $q$ only. Consider the following discrete objective
212 | \[
213 | \min \sum_{i=1}^k \max\{a_i, |x_i|\} \quad \text{ s.t. } \quad \sum_{i=1}^k x_i = B.
214 | \]
215 | The minimum value of the objective function above is $\max\{\sum_{i=1}^k a_i, |B|\}$. Connecting this idea to our objective in Equation \ref{eqn:rf_4}, the minimum value of the objective function is 
216 | \[
217 | \max\bigg\{\int_{-\infty}^\infty |f''(x)|, |f'(-\infty) + f'(\infty)|\bigg\},
218 | \]
219 | which is $\bar{R}(f)$ for the initial objective in Eq.~\eqref{eqn:rf} so we are done.
220 | 
221 | \sec{Optimization}
222 | 
223 | \subsec{Basic Premise}
224 | 
225 | In a general sense, our neural network function is of the form
226 | $$
227 | h_\theta(x) = a^T \phi_w(x).
228 | $$
229 | where $\phi_w(x)$ has a lot of layers in it. We established the conventional viewpoint that the earlier layers of $\phi_w(x)$ are producing features and the last layer is producing some linear classification of all of them. The typical objective function for regression is then denoted by
230 | \begin{align}
231 | L(\theta) = \min \frac{1}{2n}\sum_{i=1}^n (y^{(i)} - h_\theta(x^{(i)}))^2 + \frac{\lambda}{2} \left\| \theta \right\|_2^2. \label{eqn:loss}
232 | \end{align}
233 | We want to find some algorithms that will help us solve this optimization problem.
234 | 
235 | \subsec{Gradient Descent}
236 | Let us start from some initialization $\theta_0$. This initialization is often random but the exact initialization, or rather the scale, matters. After the initialization is created with an initial set of parameters we take the gradient and do a recursion.
237 | $$
238 | \theta_{t+1} = \theta_t - \eta \nabla L(\theta_t).
239 | $$
240 | 
241 | Why does this work? In essence, we are finding the steepest descent at a point $\theta_t$ and moving in that direction. If we look at a Taylor expansion of $L(\theta)$ at the point $\theta_t$, we get
242 | $$
243 | L(\theta) = L(\theta_t) + \langle \nabla L(\theta_t), \theta - \theta_t \rangle + \text{higher order terms}.
244 | $$
245 | Notice that the second term is linear in $\theta$. If we ignore the higher order terms and minimize over a Euclidean ball around $\theta_t$ (the ball is required to maintain the accuracy of the Taylor expansion), we get
246 | $$
247 | \argmin L(\theta_t) +  \langle \nabla L(\theta_t), \theta - \theta_t \rangle, \qquad s.t. \qquad \| \theta - \theta_t \|_2 \leq \varepsilon
248 | $$
249 | $L(\theta_t)$ is a constant in this case, so this simplifies to
250 | $$
251 | \min \langle \nabla L(\theta_t), \theta - \theta_t \rangle, \qquad s.t. \| \theta - \theta_t \|_2 \leq \varepsilon
252 | $$
253 | This is equivalent to finding two vectors $v$ and $x$ with minimum correlation such that $x$ has norm less than $\varepsilon$. As a result, the optimal solution is  $x = -cv$, where $c$ is a scalar constant greater than 0. This assures that our two vectors have a minimum correlation as they are in opposite directions, and $c$ allows the vector to be within our previously established ball. Therefore the optimal solution for $\theta - \theta_t$ is 
254 | $$
255 | \theta - \theta_t = -c\cdot \nabla L(\theta_t)
256 | $$
257 | Therefore, we can see that the steepest direction is optimal, and thus the gradient descent method will reach a minimum.
258 | 
259 | \subsec{Stochastic gradient descent}
260 | For many machine learning problems, computing the gradient is a computationally expensive task. Consider the gradient for the loss function in Eq.~\eqref{eqn:loss}:
261 | \[
262 | \nabla L(\theta_t) = \frac{1}{2n} \sum_{i=1}^n \nabla_\theta (y^{(i)} - h_{\theta_t}(x^{(i)})) + \lambda \theta_t.
263 | \]
264 | Calculating the summation gradient of $\nabla_\theta (y^{(i)} - h_{\theta_t}(x^{(i)}))$ over the entire dataset is expensive for complex neural nets (with many parameters) and/or large sample sizes. Stochastic gradient descent relies on using a small subset of the samples to estimate the gradient, which is effective, especially during the initial stages of training, because gradients of the individual data points will often point in somewhat similar directions. We can derive the individual loss function:
265 | \[
266 | L(\theta) = \frac{1}{2} \sum_{i=1}^n \ell_i(\theta) \implies \ell_i(\theta) = \frac{1}{2} (y^{(i)} - h_\theta(x^{(i)}))^2 + \frac{1}{2} \norm{\theta}_2^2.
267 | \]
268 | The SGD algorithm can be described as follows:
269 | \begin{enumerate}
270 | \item Sample a subset $S = \{i_1, \cdots, i_B\} \subseteq \{1, \cdots, n\}$.
271 | 
272 | \item Find the gradient estimate
273 | \[
274 | g_S(\theta) = \frac{1}{B} \sum_{k=1}^B \nabla \ell_{i_k}(\theta).
275 | \]
276 | Note that $g_S(\theta)$ is unbiased because
277 | \[
278 | \Exp_S[g_S(\theta)] = \frac{1}{B} \sum_{k=1}^\infty \Exp[\nabla \ell_{i_k}(\theta)] = = \frac{1}{B} \sum_{k=1}^\infty \nabla \ell(\theta) = \nabla \ell(\theta).
279 | \]
280 | 
281 | \item Sample $S$ and find $\theta_{t+1} = \theta_t - \eta g_S(\theta_t)$ for $t = 0$ to $t = T$, where $T$ is the number of iterations.
282 | \end{enumerate}
283 | 
284 | \subsec{Computing the gradient}
285 | The gradient of a single data point is
286 | \[
287 | \nabla_\theta(y^{(i)} - h_\theta(x^{(i)}))^2 = -(y^{(i)} - h_\theta(x^{(i)})) \nabla h_\theta(x^{(i)}).
288 | \]
289 | Hence, it suffices to find an evaluable expression for $\nabla h_\theta(x^{(i)})$. Recall that $h_\theta(x^{(i)}) = a^\top \sigma(wx)$. Then the partial derivatives are shown below:
290 | \[
291 | \begin{split}
292 | \frac{\partial}{\partial a} h_\theta(x^{(i)}) &= \sigma(wx) ,\\
293 | \frac{\partial}{\partial w} h_\theta(x^{(i)}) &= (a \odot \sigma'(wx)) x^\top.
294 | \end{split}
295 | \]
296 | Note that $\odot$ is the element-wise product.
297 | 
298 | We can also present an informal statement about the time for computation. Suppose $\ell(\theta_1, \cdots, \theta_p): \bbR^p \to \bbR$ can be evaluated by a differentiable circuit (or sequence of operations) of size $N$. Then the gradient $\nabla \ell(\theta)$ can be computed in time $O(N+p)$ using a circuit of size $O(N+d)$. This means that the time to compute the gradient is similar to the time to compute the function value. The only requirement is that the operations of the circuit are differentiable.
299 | 
300 | \sec{Learning Features}
301 | Neural nets learn better features than those designed in the kernel method. Suppose we have a simple two-layer neural net $h_\theta(x)=a^\top \sigma(wx)$ with objective
302 | \al{\label{eqn:normnn}
303 | \min \norm{a}_2 + \norm{w}_2^2 \text{ s.t. } y^{(i)} = a^\top \sigma(wx^{(i)}).
304 | }
305 | For a $d$-dimensional network, $w \in \bbR^{m \times d}$. We assume $m$ is sufficiently large. For the kernel method with feature $\sigma(wx)$ for random $w$, the objective is
306 | \[
307 | \min \norm{a}_2^2 \text{ s.t. } y^{(i)} = a^\top \sigma(wx^{(i)}).
308 | \]
309 | The objective for the neural net is equivalent to
310 | \[
311 | \min \norm{a}_1 \text{ s.t. } y^{(i)} = a^\top \sigma(wx),
312 | \]
313 | where $w$ is random. With the $L1$ norm, the neural net prefers sparse solutions, similar to the lasso regression. Thus, unlike the kernel method, the neural net actively selects features \cite{wei2020regularization}. 
314 | 
315 | \sec{Transfer Learning}
316 | Transfer learning aims to ``transfer'' features trained on a large dataset to a small, yet different, dataset. Consider the big dataset $(x^{(1)}, y^{(1)}), \cdots, (x^{(n)}, y^{(n)}) \iid P_{\text{transfer}}$ and the small target dataset $(\tilde x^{(1)}, \tilde y^{(1)}), \cdots, (\tilde x^{(m)}, \tilde y^{(m)}) \iid P_{\text{target}}$ where $n \gg m$. Our objective is to model $P_{\text{target}}$. A simple approach to transfer learning can be outlined as follows:
317 | \begin{enumerate}
318 | \item Train a (deep) neural net $h_\theta(x)=a^\top \phi_w(x)$ on $(x^{(1)}, y^{(1)}), \cdots, (x^{(n)}, y^{(n)})$. Often we can find and download a model previously trained on our big dataset (especially for famous datasets such as ImageNet). This neural net gives us values for $\hat{a}$ and $\hat{w}$.
319 | \item Train a linear model $g_b(x) = b^\top \phi_{\widetilde w}(x)$ on $(\tilde x^{(1)}, \tilde y^{(1)}), \cdots, (\tilde x^{(m)}, \tilde y^{(m)})$, discarding $\hat{a}$ and fixing $\hat{w}$ from $h_\theta$. Thus, our objective function is
320 | \[
321 | \min_b \frac{1}{2m} \sum_{i=1}^m \big(g_b(\tilde x^{(i)}) - \tilde y^{(i)}\big)^2 + \frac{\lambda}{2} \norm{b}_1^2.
322 | \]
323 | \end{enumerate}
324 | We also present an improved method that fine-tunes $W$:
325 | \begin{enumerate}
326 | \item Train a (deep) neural net $h_\theta(x)=a^\top \phi_w(x)$ on $(x^{(1)}, y^{(1)}), \cdots, (x^{(n)}, y^{(n)})$.
327 | 
328 | \item Train a linear model $g_{b_, w}(x) = b^\top \phi_{\widetilde w}(x)$ on $(\tilde x^{(1)}, \tilde y^{(1)}), \cdots, (\tilde x^{(m)}, \tilde y^{(m)})$, still discarding $\hat{a}$ but not fixing $\hat{w}$ from $h_\theta$. Thus, our objective function is
329 | \[
330 | \min_{b, w} \frac{1}{2m} \sum_{i=1}^m \big(g_{b, w}(\tilde x^{(i)}) - \tilde y^{(i)}\big)^2.
331 | \]
332 | \end{enumerate}
333 | The improved method can be implemented using SGD with initialization $w=\hat{w}$. We desire to keep $w$ close to its initialization (tactics like early stop can be used). This is useful for tasks where both datasets share similar goals but have slightly different contexts.


--------------------------------------------------------------------------------
/Spring2021/05-21-2021.tex:
--------------------------------------------------------------------------------
  1 | %\newcommand{\Exp}{\mathbb{E}}
  2 | % reset section counter
  3 | \setcounter{section}{0}
  4 | 
  5 | %\metadata{lecture ID}{Your names}{date}
  6 | \metadata{8}{Nicholas K. Branigan and Andrew Kirjner}{May 21, 2021}
  7 | 
  8 | \sec{Review and overview}
  9 | 
 10 | In the last lecture, we began our treatment of nonparametric density estimation with a discussion of the histogram algorithm. This intuitive approach involves constructing a histogram from our observations and normalizing it so that it constitutes a valid density function. In particular, we discuss about bias-variance trade-off:: as the bandwidth increases, the variance decreases while variance increases, and as we consider more examples, variance decreases but the bias, given its dependence on the expectation of the algorithm's predictions, does not change with the number of examples. 
 11 | 
 12 | In this lecture, we move to kernel density estimation, a more sophisticated technique for this problem. Then, we briefly touch parametric and nonparametric mixture models and begin our discussion of Bayesian nonparametric methods. 
 13 | 
 14 | \sec{Kernel density estimation}
 15 | 
 16 | \subsec{Introduction}
 17 | We define the kernel density estimator 
 18 | \al{
 19 | \hat{f}(x) = \frac{1}{nh} \sum_{i=1}^n K\left(\frac{x - x_i}{h}\right), \label{eq:2}
 20 | }
 21 | where $h$ is the bandwidth and $K$ is the kernel function. Recall from Lecture 1 that we have defined the kernel function to be any smooth, non-negative function $K$ such that
 22 | \als{
 23 | \int_{\R} K(x) dx = 1, \quad \int_{\R} xK(x)dx = 0, \quad \text{and} \quad \int_{\R} x^2 K(x)dx > 0.  \label{8.2}
 24 | }
 25 | Two kernel functions we have seen are the boxcar and Gaussian kernels. For the former, we now show that kernel density estimation is very similar to the histogram approach. Recall that the boxcar kernel $K(x) = \frac{1}{2} \mathbf{1}\{|x| \leq 1\}$. Thus, using the boxcar, our kernel density estimator is 
 26 | \al{
 27 | \hat{f}(x) = \frac{1}{nh} \sum_{i=1}^n \frac{1}{2} \mathbf{1}\left\{\left|\frac{x-x_i}{h}\right| \leq 1\right\}.
 28 | }
 29 | Define $B_x = \{i : |x_i - x| \leq h \}$ and let $|B_x|$ be the cardinality of this set, i.e., the number of points in $B_x$. Then we can write, 
 30 | \als{
 31 | \hat{f}(x) &= \frac{1}{nh} \sum_{i \in B_x} \frac{1}{2} \mathbf{1}\left\{\left|\frac{x-x_i}{h}\right| \leq 1\right\} \\
 32 | &= \frac{1}{nh} \sum_{i \in B_x} \frac{1}{2}\\
 33 | &= \frac{|B_x|}{2nh}. \label{eq:6}
 34 | }
 35 | To see the similarity with the histogram algorithm, recall that for the histogram,
 36 | \al{
 37 | \hat{f}(x) = \frac{\hat{p}_j}{h} = \frac{|B_j|}{nh},
 38 | }
 39 | for $x \in B_j$. Moreover, note that for the histogram, $h$ corresponds to the bin width, whereas for our boxcar density estimator, $h$ is half of the bin width. The characteristic difference between the approaches is that for kernel density estimators, our bins are not fixed but moving with and centered at $x$. 
 40 | 
 41 | Of course, we require that our kernel density estimator constitutes a valid density. There are two approaches for verifying that \eqref{eq:2} coheres with the definition of the probability density function. The first is to check directly that \eqref{eq:2} integrates to 1:
 42 | \al{\int_\R \hat{f}(x)dx &= \int_\R \frac{1}{nh} \sum_{i=1}^n K\left(\frac{x-x_i}{h} \right)dx \nonumber \\
 43 | &= \frac{1}{nh} \sum_{i=1}^n \int_\R K\left(\frac{x-x_i}{h} \right)dx \nonumber \\
 44 | &= \frac{1}{nh} \sum_{i=1}^n \int_\R K\left(\frac{x}{h} \right)dx \label{eq:1}\\
 45 | &= \frac{1}{nh} \sum_{i=1}^n h\int_\R K\left(z \right)dz \label{8.9}\\
 46 | &= \frac{1}{nh} \sum_{i=1}^n h \nonumber \\
 47 | &= 1. \nonumber
 48 | }
 49 | To reach \eqref{eq:1}, we used that shifting a function being integrated over the continuum by a constant has no effect on the value of the integral. In \eqref{8.9} we made a change of variables.\footnote{Let $\mathcal{R}[a, b]$ denote the set of functions that are Riemann integrable on $[a, b]$. Then, let $f \in \mathcal{R}[a, b]$ and let $g$ be a strictly increasing function from $[c, d]$ onto $[a, b]$ such that $g$ is differentiable on $[c, d]$ and $g^\prime \in \mathcal{R}[c, d]$. Then $(f \circ g) \cdot g^\prime \in \mathcal{R}[c, d]$ and $\int_a^b f(x)dx = \int_c^d f(g(t))g^\prime (t)dt$ \cite{johnsonbaugh_foundations_2010}.} 
 50 | 
 51 | \begin{figure}[h!]
 52 | \centering
 53 | \includegraphics[width=.8\linewidth]{figure/Lecture08/fig-1.pdf}    
 54 | \caption{Example of Gaussian mixture model. In the figure above, $W = \frac{1}{5}\sum_{i=1}^5 W_i$, where $W_i \sim \cN(x_i, h^2)$. Per Theorem 8.1, the Gaussian kernel density estimator assumes that the density $\hat{f}$ is an equally weighted mixture of Gaussians centered on the observations $\{x_i\}_{i=1}^n$ with variance $h^2$.}
 55 | \end{figure}
 56 | 
 57 | The second approach for sanity checking our definition in \eqref{eq:2} is to view $K(x)$ as a probability density function. Examining \eqref{8.2} confirms that this is a valid move. Then, we find that $\hat{f}$ is identical to a density function as desired. The following theorem formalizes this.  
 58 | 
 59 | \begin{theorem}
 60 | Let $\xi \sim K(x)$, $Z \sim \mathrm{Unif}\{x_1, \dots, x_n\}$. Further, define $W = Z + \xi h$. Then, the density function of $W$ is $\hat{f}$.
 61 | \end{theorem}
 62 | \begin{proof}
 63 | Let $W_i = x_i + \xi h $ for each $i \in \{1, \dots, n\}$. Finding the density of $W_i$ is straightforward. To do this, we first find the distribution function of $W_i$:
 64 | \als{
 65 | F_{W_i}(x) &= P\{W_i \leq x\}\\
 66 | &= P\{x_i + \xi h \leq x\}\\
 67 | &= P\left\{\xi \leq \frac{x - x_i}{h}\right\}\\
 68 | &= F_\xi \left(\frac{x - x_i}{h}\right).
 69 | }
 70 | Then, we differentiate to find the density:
 71 | \als{
 72 | f_{W_i}(x) &= \frac{d}{dx} F_{W_i}(x)\\
 73 | &= \frac{d}{dx} F_\xi \left(\frac{x - x_i}{h}\right)\\
 74 | &= f_\xi \left( \frac{x - x_i}{h} \right)\frac{1}{h}\\
 75 | &= \frac{1}{h}K\left( \frac{x - x_i}{h}\right).
 76 | }
 77 | Now, since $W = W_i$ with probability $\frac{1}{n}$, we can easily find the density of $W$. We again appeal to distribution functions to show this.
 78 | \al{
 79 | F_W(x) &= P\{W \leq x\} \nonumber \\
 80 | &= \tfrac{1}{n}P\{W_1 \leq x\} + \dots + \tfrac{1}{n}P\{W_n \leq x\} \nonumber \\
 81 | &= \tfrac{1}{n}F_{W_1}(x) + \dots + \tfrac{1}{n}F_{W_n}(x). \label{eq:3}
 82 | }
 83 | Differentiating \eqref{eq:3} gives that 
 84 | \als{
 85 | f_W(x) &= \frac{1}{n}\sum_{i=1}^n f_{W_i}(x)\\
 86 | &= \frac{1}{nh}\sum_{i=1}^n K\left( \frac{x - x_i}{h}\right)\\
 87 | &= \hat{f}(x).
 88 | }
 89 | (If $W$ were not a mixture of random variables but a sum of them, computing its density would be far more complicated.)
 90 | \end{proof}
 91 | 
 92 | \subsec{Integrated risk}
 93 | 
 94 | Now, we turn to the risk associated with the Gaussian kernel density estimator.
 95 | \begin{theorem}
 96 | For the Gaussian kernel density estimator the risk is 
 97 | \al{
 98 | R\left(f, \hat{f}\right) = \underbrace{\frac{1}{4}\sigma_k^2 h^4 \int  f^{\prime \prime} (x)^2 dx}_{\text{bias}} + \underbrace{\vphantom{\int}\frac{\beta_k^2}{nh}}_{\sigma^2} + \cO \left(h^6\right) + \cO \left(n^{-1} \right), \label{eq:4}
 99 | }
100 | where $\sigma_k^2 = \int x^2 K(x)dx$ and $\beta_k^2 = \int K(x)^2 dx$. The first term in \eqref{eq:4} is the bias and the second is the variance.
101 | \end{theorem}
102 | Recall that for the histogram density estimator, 
103 | \al{
104 | R\left(\hat{f}, f\right) &= \underbrace{\frac{h^2}{12}\int  f^\prime(x)^2 dx}_{\text{bias}} + \underbrace{\vphantom{\int}\frac{1}{nh}}_{\sigma^2} + \cO(h^2) + \cO(n^{-1}). \label{eq:5}
105 | }
106 | Comparing \eqref{eq:4} and \eqref{eq:5}, we see that for the Gaussian kernel density estimator, we want $f^{\prime \prime}(x)$ to be small rather than $f^{\prime}(x)$. More importantly, for values of $h < 1$, the bias of the Gaussian kernel density estimator will be lower than for the histogram estimator. We encounter the usual bias-variance tradeoff here: increasing $h$ results in more smoothing which boosts bias and depresses variance whereas decreasing $h$ results in less smoothing which depresses bias and boosts variance. 
107 | 
108 | By minimizing the risk with respect to $h$, we find the optimal bandwidth
109 | \al{
110 | h^* &= \left( \frac{\beta_k^2}{\sigma_k^2 A(f) n}\right)^{1/5}, \quad \text{where} \quad A(f) = \int f^{\prime \prime}(x)^2dx. \label{8.28}
111 | }
112 | As usual, the optimal bandwidth is inversely dependent on $n$. Plugging $h^{*}$ into \eqref{eq:4}, we find that as a function of $n$, $R(f, \hat{f}) \propto \cO(n^{-4/5})$. We observe that this is an improvement over the histogram estimator where as a function of $n$, $R(f, \hat{f}) \propto \cO(n^{-2/3})$. 
113 | 
114 | Now, we show that for the boxcar kernel density estimator, the bias is as claimed in \eqref{eq:4}. From \eqref{eq:6}, $\hat{f}(x) = \frac{|B_x|}{2nh}$, so 
115 | \al{
116 | \Exp [\hat{f}(x)] &= \frac{1}{2nh} \Exp[|B_x|] \nonumber \\
117 | &= \frac{n}{2nh}\int_{x-h}^{x+h} f(u)du \nonumber \\
118 | &= \frac{1}{2h}\int_{x-h}^{x+h}f(u)du \nonumber \\
119 | &= \frac{1}{2h} \int_{x-h}^{x+h} \left( f(x) + (u-x)f^\prime (x) + \tfrac{1}{2}(u - x)^2f^{\prime \prime}(x) \right) du + \text{higher order terms} \label{eq:7}\\
120 | &= \frac{1}{2h} \left(2hf(x) + f^\prime(x) \int_{x-h}^{x+h}(u-x)du + f^{\prime \prime}(x)\int_{x-h}^{x+h}\tfrac{1}{2}(u-x)^2 du \right) + \text{higher order terms} \nonumber  \\
121 | &= f(x) + \cO(h^2)f^{\prime \prime}(x). \nonumber 
122 | }
123 | In \eqref{eq:7}, we have carried out a degree 2 Taylor expansion for $f$ at $x$.\footnote{A real-valued function $f$ is said to be of class $C^n$ on $(a, b)$ if $f^{(n)}(x)$ exists and is continuous for all $x \in (a, b)$. Define $P_n(x) = f(c) + f^{(1)}(c)(x-c) + \dots + \frac{f^{(n)}(c)}{n!}(x-c)^n$. Let $f \in C^{n+1}$ on $(a, b)$, and let $c$ and $d$ be any points in $(a,b)$. Then Taylor's Theorem says that there exists a point $t$ between $c$ and $d$ such that $f(d) = P_n(d) + \frac{f^{n+1}(t)}{(n+1)!}(d-c)^{n+1}$ \cite{johnsonbaugh_foundations_2010}.} Thus, we find that the bias at $x$ is 
124 | \als{
125 | \left( f(x) - \Exp\left[\hat{f}(x)\right] \right)^2 = \cO(h^2)^2 f^{\prime \prime}(x)^2 = \cO(h^4)f^{\prime \prime}(x)^2.
126 | }
127 | So, the total bias is $\cO(h^4) \int f^{\prime \prime}(x)^2$, which agrees with \eqref{eq:4} as desired. 
128 | 
129 | \subsec{Choosing $h$ empirically}
130 |  
131 | There are two approaches for selecting the optimal bandwidth parameter $h^*$ for kernel density estimators: normal references and cross-validation. 
132 | 
133 | Unfortunately, we cannot use \eqref{8.28} to directly pick $h^*$. Though we will know $\beta_k^2, \sigma_k^2,$ and $n$, we will not know $A(f)$, which depends on the object of our estimation. Normal references assumes for the purpose of finding $h^*$ that $f \sim \cN(\mu, \tau^2)$. Then, when $K$ is a Gaussian kernel, $h^* = 1.06 \tau n^{-1/5}$. In practice, $\tau$ is commonly chosen to be $\text{min} \{\hat{\tau}, \tfrac{Q}{1.34}\}$, where $\hat{\tau}$ is the sample standard deviation and $Q$ is the interquartile range.\footnote{For some data, the interquartile range is the data's 75th percentile minus its 25th percentile.} Note that we only make this normality assumption to choose $h$. If we believed that $f$ were actually normally distributed, we would be better suited with a parametric density estimation approach.  
134 | 
135 | Next, we examine how to use cross-validation to choose $h$. Cross-validation is somewhat trickier in the unsupervised setting, since we do not have labels to evaluate our performance against on held-out data. To address this challenge, we rewrite our integrated risk
136 | \al{
137 | R(f, \hat{f}) &= \int \left( f(x) - \hat{f}(x)\right)^2 \nonumber \\
138 | &= \int f(x)^2dx - 2 \int f(x)\hat{f}(x)dx + \int \hat{f}(x)dx. \label{8.37}
139 | }
140 | In \eqref{8.37}, the first term is constant with respect to $\hat{f}$, so we are not concerned about it when choosing $\hat{f}$. The second term can be rewritten $-2\Exp_{X \sim f}[\hat{f}(x)]$, and with held-out data $x_1^\prime, \dots,  x_m^\prime \sim f$, we can compute a Monte Carlo estimator of the expectation:  
141 | \als{
142 | \Exp_{X \sim f}\left[\hat{f}(x)\right] \approx \frac{1}{m}\sum_{i=1}^m \hat{f}(x_i^\prime).
143 | } 
144 | If we have insufficient data for a hold-out set, we can use cross-validation. Under leave-one-out and Monte Carlo we have
145 | \als{
146 | \Exp_{X \sim f}\left[\hat{f}(x)\right] \approx \frac{1}{n}\sum_{i=1}^n \hat{f}_{-i}(x_i),
147 | }
148 | where $\hat{f}_{-i}$ denotes the estimator obtained using $\{x_1, \dots, x_{i-1}, x_{i+1}, \dots, x_{n}\}$. Finally, the third term can be directly computed. Thus, the leave-one-out cross validation score is defined
149 | \als{
150 | \hat{J}\left(\hat{f}\right) = \int \hat{f}(x)^2dx - \frac{2}{n}\sum_{i=1}^n \hat{f}_{-i}(x_i).
151 | }
152 | 
153 | We would like an efficient way to find the leave-one-out loss. A naive approach to computing $\hat{J}$ could be quite expensive since it would require that we fit $\hat{f}$ $n$ times. Fortunately, we can do better. 
154 | 
155 | \begin{theorem}
156 | We can compute the leave-one-out cross validation score for a kernel density estimator $\hat{f}$ as
157 | \al{
158 | \hat{J}\left(\hat{f}\right) = \frac{1}{hn^2} \sum_{i=1}^n \sum_{j=1}^n K^* \left( \frac{x_i - x_j}{h}\right) + \frac{2}{nh}K(0) + \cO \left(n^{-2}\right),
159 | }
160 | where $K^*(x) = \int K(x-y)K(y)dy - 2K(x)$.
161 | \end{theorem}
162 | 
163 | \sec{Mixture models}
164 | 
165 | \subsec{Introduction}
166 | 
167 | From Theorem 8.1, we see that kernel density estimators recover the density of a random variable that is a mixture of $n$ distributions of the same form as the kernel centered on the data points. In this section, we present a more parametric approach by examining mixture models where the number of distributions in the mixture is $k < n$. We assume that our density
168 | \al{
169 | f(x) = \frac{1}{k} \sum_{i=1}^k f_i(x) \label{8.42}
170 | }
171 | for some distributions $\{f_i \mid i \in \{1, \dots, k\} \}$. There is an equivalent generative specification of this model:
172 | \begin{enumerate}
173 |     \item Draw $i$ from some distribution over $\{1, \dots, k\}$. A simple choice that we have used in \eqref{8.42} and that we will use going forward is $i \sim \text{unif}\{1, \dots, k\}$. 
174 |     \item Draw $x \sim f_i$.
175 | \end{enumerate}
176 | 
177 | \subsec{Gaussian mixtures and model fitting}
178 | 
179 | A popular mixture model is the Gaussian mixture. Under this model, $f_i(x) = \cN(\mu_i, \Sigma_i)$ for $\mu_i \in \R^d$, $\Sigma_i \in \R^{d \times d}$.  
180 | \al{
181 | f(x ; \mu_1 \dots, \mu_k, \Sigma_1, \dots, \Sigma_k) = \frac{1}{k}\sum_{i=1}^k \frac{1}{(2\pi)^{d/2}|\Sigma_i|^{1/2}}e^{-\tfrac{1}{2}(x - \mu_i)^\top \Sigma_i^{-1}(x - \mu_i)}.
182 | }
183 | This Gaussian mixture model is a fully parametric approach since $k$ is fixed. We can make it less parametric by letting $k$ grow with $n$ in some way. There are three algorithms commonly used for fitting mixtures: maximum likelihood estimation (MLE), the Expectation Maximization algorithm (EM), and the method of moments. 
184 | 
185 | Let $\theta = (\mu_1 \dots, \mu_k, \Sigma_1, \dots, \Sigma_k, z_1, \dots, z_n)$, where the $z_i$'s are in $\{1, \dots, k\}$ and denote the Gaussians to which the observations are assigned. MLE amounts to solving the optimization problem 
186 | \al{
187 | \max_{\theta} \frac{1}{n}\sum_{j = 1}^n \log f(x_j; \mu_{z_j}, \Sigma_{z_j}).
188 | }
189 | This is often impossible to do analytically, so numerical methods are frequently required. EM is beyond the scope of the class, but it can be applied to fit mixtures under the MLE approach or even the more general Bayesian framework. The method of moments involves relating model parameters to the moments of random variables. Recall that for random variables $x_i$, $i \in \{1, \dots, d\}$, the first moments are 
190 | \als{
191 | \Exp[x_i] \text{ for each $i$}, 
192 | }
193 | the second moments are
194 | \als{
195 | \Exp[x_ix_j] \text{ for each $i$, $j$},
196 | }
197 | and the third moments are 
198 | \als{
199 | \Exp[x_ix_jx_k] \text{ for each $i$, $j$, $k$}. 
200 | }
201 | We can estimate these using empirical moments. For example, for observations $x^{(1)}, \dots, x^{(n)}$ in $\R^d$, the empirical first moment for the $i$'th dimension of $x$ is 
202 | \als{
203 | \frac{1}{n}\sum_{j=1}^n x_i^{(j)} \approx \Exp[x_i].
204 | } 
205 | If the moments are functions of the model parameters, we can exploit this to fit our model. For example, in the Gaussian mixture case, $\Exp[x_i] = \frac{1}{k}\sum_{j=1}^k (\mu_j)_i$. In general, suppose 
206 | \als{
207 | \Exp[x_i] &= q_i(\mu, \Sigma)\\
208 | \Exp[x_i x_j] &= q_{ij}(\mu, \Sigma)\\
209 | & \hspace{2.35mm} \vdots \nonumber
210 | }
211 | Then we can construct loss functions to minimize with respect to $\theta$:
212 | \begin{gather}
213 | \left( \frac{1}{n}\sum_{j=1}^n x_i^{(j)} - q_i(\mu, \Sigma)\right)^2\\
214 | \left( \frac{1}{n}\sum_{k=1}^n x_i^{(k)}x_j^{(k)} - q_{ij}(\mu, \Sigma)\right)^2\\
215 | \vdots \nonumber
216 | \end{gather}
217 | 
218 | \sec{Bayesian nonparametric statistics}
219 | 
220 | \subsec{Review of the Bayesian approach}
221 | 
222 | Under the Bayesian take on statistics, we treat our model parameter $\theta$ as a random variable, and we express our beliefs regarding $\theta$ prior to our statistical analysis through a distribution over $\theta$ called a prior. In the unsupervised setting the Bayesian approach assumes the following hierarchical model:
223 | \begin{enumerate}
224 |     \item Draw $\theta \sim p(\theta)$.
225 |     \item Draw data $x^{(1)}, \dots, x^{(n)} \overset{iid}\sim p(x \mid \theta)$.\footnote{Note that in this section, we drop density function subscripts for notational elegance. For example, to be as clear as possible, we would write $f_\theta(\theta)$ rather than $p(\theta)$ and $f_{x \mid \theta}(x \mid \theta)$ not $p(x \mid \theta)$. $p(\theta)$ and $p(x \mid \theta)$ are not the same function $p$. ``Think of them as living things that look inside their own parentheses before deciding what function to be'' \cite{owen_lecture_2018}.}
226 | \end{enumerate}
227 | Our goal is to infer the posterior distribution $p\left(\theta \mid x^{(1)}, \dots, x^{(n)}\right)$. To accomplish this, we use Bayes' rule:
228 | \als{
229 | p\left(\theta \mid x^{(1)}, \dots, x^{(n)}\right) &= \frac{p\left(\theta, x^{(1)}, \dots, x^{(n)}\right)}{p\left(x^{(1)}, \dots, x^{(n)}\right)}\\
230 | &= \frac{p\left(x^{(1)}, \dots, x^{(n)} \mid \theta\right)p\left(\theta\right)}{p\left(x^{(1)}, \dots, x^{(n)}\right)}\\
231 | &= \frac{\prod_{i=1}^n p\left(x^{(i)} \mid \theta\right)p\left(\theta\right)}{\int \prod_{i=1}^n p\left(x^{(i)} \mid \theta\right)p\left(\theta\right) d\theta}.
232 | }
233 | 
234 | Now, in the supervised setting, suppose we have a dataset $S = \left\{\left(x^{(1)}, y^{(1)}\right), \dots, \left(x^{(n)}, y^{(n)}\right)\right\}$, where the $x^{(i)}$'s are fixed. Then, our generative story takes the following form: 
235 | \begin{enumerate}
236 |     \item Draw $\theta \sim p(\theta)$.
237 |     \item Draw a label $y^{(i)} \sim p\left(y^{(i)} \mid x^{(i)}, \theta\right)$ for each $i \in \{1, \dots, n\}$.
238 | \end{enumerate}
239 | Given a test example $x^*$, we want to find $p(y^* \mid x^*, S)$, where $y^*$ denotes the (unknown) label associated with $x^*$. We can do this if we can first infer the posterior $p(\theta \mid S)$. Why? Observe that
240 | \als{
241 | p(y^* \mid x^*, S) &= \int p(y^* \mid x^*, \theta, S)p(\theta \mid x^*, S)d\theta \\
242 | &= \int p(y^* \mid \theta, x^*)p(\theta \mid S) d\theta, \label{8.57}
243 | }
244 | where we've used that $y^*$ is independent of $y^{(1)}, \dots, y^{(n)}$ conditional on $\theta$. As in the supervised setting, we can use Bayes' rule to find an expression for the posterior that we can work with:
245 | \als{
246 | p(\theta \mid S) &= \frac{p(\theta, S)}{p(S)}\\
247 | &= \frac{p(S \mid \theta)p(\theta)}{\int p(S \mid \theta)p(\theta)d\theta}\\
248 | &= \frac{p\left(y^{(1)}, \dots, y^{(n)} \mid \theta\right)p(\theta)}{\int p\left(y^{(1)}, \dots, y^{(n)} \mid \theta\right)p(\theta)d\theta}\\
249 | &= \frac{\prod_{i=1}^n p\left(y^{(i)} \mid \theta, x^{(i)}\right)p(\theta)}{\int \prod_{i=1}^n p\left(y^{(i)} \mid \theta, x^{(i)}\right)p(\theta)d\theta}.
250 | }
251 | 
252 | \subsec{Bayesian linear regression}
253 | Let $x^{(i)} \in \R^d$, $y^{(i)} \in \R$, $\theta \in \R^d$ with $\theta \sim \cN(0, \tau^2 I_d)$. We can simplify the density of $\theta$ 
254 | \al{
255 | p(\theta) = \frac{1}{(2 \pi \tau^2)^{d/2}}e^{- ||\theta||_2^2/(2\tau^2)}.
256 | }
257 | We assume that $y^{(i)} = {x^{(i)}}^\top \theta + \epsilon^{(i)}$ where $\epsilon^{(i)} \sim \cN(0, \sigma^2)$. Our generative model then is 
258 | \begin{enumerate}
259 |     \item Draw $\theta \sim \cN(0, \tau^2 I_d)$.
260 |     \item Draw $y^{(i)} \sim \cN({x^{(i)}}^\top \theta, \sigma^2)$ for each $i \in \{1, \dots, n\}$.
261 | \end{enumerate}
262 | 
263 | \begin{theorem}
264 | Define the design matrix 
265 | \als{ X = 
266 | \begin{bmatrix}
267 | {x^{(1)}}^\top \\
268 | \vdots \\
269 | {x^{(n)}}^\top 
270 | \end{bmatrix}
271 | \in \R^{n \times d}, \quad \vec{y} =
272 | \begin{bmatrix}
273 | {y^{(1)}} \\
274 | \vdots \\
275 | {y^{(n)}} 
276 | \end{bmatrix}
277 | \in \R^n, \quad \text{and} \quad A = \frac{1}{\sigma^2}X^\top X + \frac{1}{\tau^2}I_d.
278 | }
279 | Then $\theta \mid S \sim \cN(\frac{1}{\sigma^2}A^{-1}X^\top \vec{y}, A^{-1})$, and $y^* \mid x^*, S \sim \cN (\frac{1}{\sigma^2} {x^*}^\top A^{-1}X^\top \vec{y}, {x^*}^\top A^{-1} x^* + \sigma^2)$.
280 | \end{theorem}
281 | Let's sanity check Theorem 8.4. We can rewrite $A$ as
282 | \al{
283 | {\underbrace{\frac{1}{\sigma^2} \sum_{i=1}^n x^{(i)}{x^{(i)}}^\top}_{\text{influence of data}}} \hspace{5mm} + {\underbrace{ \vphantom{\frac{1}{\sigma^2} \sum_{i=1}^n x^{(i)}{x^{(i)}}^\top}\frac{1}{\tau^2}I_d.}_{\text{influence of prior}}}
284 | \label{8.64}}
285 | 
286 | \begin{figure}[h!]
287 | \caption{Densities for $\cN(0, \tau^2)$}
288 | \centering \includegraphics[width=\linewidth]{figure/Lecture08/fig-2.pdf}
289 | \end{figure}
290 | 
291 | First, as $n \rightarrow \infty$, the first term in \eqref{8.64} dominates the second term. As we would hope, as the size of our dataset grows the influence of the prior on the posterior of $\theta$ diminishes and, at the limit, vanishes. For this reason, Bayesian methods are less useful under a large data regime. Second, as $\tau \rightarrow \infty$, our Gaussian prior becomes increasingly flat and uninformative; see Figure 8.2. Accordingly, in \eqref{8.64}, $\tau$ is inversely related to the influence of the prior on the posterior. Third, the variance of our posterior predictive distribution $y^* \mid x^*, S$ is at least $\sigma^2$. 
292 | 
293 | \begin{proof}
294 | First, we show that $A$ is positive definite. For non-zero $z \in \R^d$, 
295 | \als{
296 | z^\top \frac{1}{\sigma^2}X^\top X z &= \frac{1}{\sigma^2} (Xz)^\top Xz\\
297 | &= \frac{1}{\sigma^2} \langle Xz, Xz, \rangle. \label{8.66}
298 | }
299 | Since $X$ is full-rank (our predictors cannot be a linear combination of each other), $X$'s null space is trivial and $Xz \neq 0$. Because $\sigma^2$ is positive and the norm is positive for all non-zero vectors, \eqref{8.66} is positive and $A$ is positive definite. Then, $A^{-1}$ is also positive definite since its eigenvalues are the reciprocals of $A$'s eigenvalues.\footnote{For any invertible matrix $M$, $M^{-1}$'s eigenvalues are the eigenvalues of $M$ inverted. A matrix is positive definite if and only if all of its eigenvalues are positive \cite{axler_linear_2014}.} Thus, 
300 | \als{
301 | {x^*}^\top A^{-1}x^{*} + \sigma^2 \geq \sigma^2.
302 | }
303 | \end{proof}
304 | As expected, our uncertainty regarding our predictions is bounded below by $\sigma^2$, which is the uncertainty intrinsic to the problem. Moreover, as our dataset grows in observations, we approach this lower bound and achieve it at the infinite limit. As $n \rightarrow \infty$, ${x^*}^\top A^{-1}x^* \rightarrow 0$, since as $n \rightarrow \infty$, $A \rightarrow \infty$.  


--------------------------------------------------------------------------------
/Spring2021/04-23-2021.tex:
--------------------------------------------------------------------------------
  1 | %\newcommand{\Exp}{\mathbb{E}}
  2 | 
  3 | % reset section counter
  4 | \setcounter{section}{0}
  5 | 
  6 | %\metadata{lecture ID}{Your names}{date}
  7 | \metadata{4}{Jack Collison}{April 23, 2021}
  8 | \sec{Review and overview}
  9 | 
 10 | In the previous lecture, we delved into local polynomial regressions, cross validation, hyperparameter tuning, and an introduction to splines. In particular we talked about cubic splines and natural cubic splines. 
 11 | 
 12 | In this lecture, we will continue our discussion on natural cubic splines in more depth. We will also talk about nearest-neighbor methods, challenges in high-dimensional nonparametrics, and the kernel method.
 13 | \sec{Splines}
 14 | 
 15 | Let's quickly review splines from our previous lecture. The basic principle of a spline is to minimize a regularized objective function over function $\hat{r}$ to some data $\{(x_i, Y_i) : 1 \leq i \leq n\}$
 16 | 
 17 | \begin{equation}
 18 |     L_\lambda(\hat{r}) \triangleq \argmin_{\hat{r}} \sum_{i=1}^n \left( Y_i - \hat{r}(x_i) \right)^2 + \lambda \int \hat{r}''(x)^2dx ,
 19 | \end{equation}
 20 | where the regularization term $\lambda \int \hat{r}''(x)^2dx$ encourages $\hat{r}$ to be as smooth as possible. Here, ``smooth'' means the second order derivative is as small as possible (i.e. we want a very small regularization penalty). 
 21 | 
 22 | \subsec{A brief review}
 23 | Let's recall a few important theorems and lemmas. From the last lecture, we know that the minimizer $\hat{r}$ of this objective function is a natural cubic spline.
 24 | 
 25 | \begin{theorem}
 26 |     The minimizer $L_\lambda(\hat{r})$ is a natural cubic spline with $n$ knots at data points $\{x_i : 1 \leq i \leq n\}$. That is, $\hat{r}$ must be a cubic spline.
 27 | \end{theorem}
 28 | A natural cubic spline is a piecewise polynomial function that linearly extrapolates near $\pm \infty$. Let's also recall an important lemma from last time.
 29 | 
 30 | \begin{lemma}
 31 |     A cubic spline with knots $\xi_1, ..., \xi_n$ forms a $(n + 4)$-dimensional subspace of functions. That is, there exist some $\{h_j : 1 \leq j \leq n+4\}$ such that the cubic spline $\hat{r}$ can be written as
 32 |     \begin{equation*}
 33 |         \hat{r} = \sum_{j=1}^{n+4} \beta_j h_j(x),
 34 |     \end{equation*}
 35 |     where $\beta_j \in \mathbb{R}$ for $j=1,\cdots, n+4$.
 36 | \end{lemma}
 37 | Now, we have a strong structural form in the cubic spline and only have to search over a finite dimensional subspace in the functional form specified above. We can use this functional form of $\hat{r}$ in our penalized regression from above.
 38 | 
 39 | \begin{equation*}
 40 |     L_\lambda(\hat{r}) = L_\lambda(\beta) = \sum_{i=1}^n \left( Y_i - \sum_{j=1}^{n+4} \beta_j h_j(x_i) \right)^2 + \lambda \int \left( \sum_{j=1}^{n+4} \beta_j h_j''(x_i)\right)^2dx.
 41 | \end{equation*}
 42 | Although this looks like a complex objective function, we have a finite number of parameters $\{\beta_j : 1 \leq j \leq n + 4\}$ to optimize over. Further, $L_\lambda(\beta)$ is a convex quadratic function in $\beta$. We can see this by expanding out the squared terms. As $\beta$ is not a function of $x$, the $\beta_j$'s in the regularization term will be unaffected by the derivatives. This makes it a much more feasible problem and allows us to write it in matrix form.
 43 | 
 44 | \subsec{Matrix notation for splines}
 45 | Although we have a convex quadratic functional form, the problem remains notationally burdensome. Let's continue by translating our optimization problem into matrix notation. First, let's define a few matricies that will be useful later.
 46 | 
 47 | \begin{equation}
 48 |     F = \begin{bmatrix} 
 49 |              h_1(x_1) & ... & h_{n+4}(x_1) \\ 
 50 |              & ... & \\
 51 |              h_{1}(x_n) & ... & h_{n+4}(x_n)
 52 |         \end{bmatrix} \in \mathbb{R}^{n \times (n+4)}, \qquad
 53 |     \beta = \begin{bmatrix} \beta_1 \\ ... \\ \beta_{n+4} \end{bmatrix} \in \mathbb{R}^{n+4}, \qquad 
 54 |     Y = \begin{bmatrix} Y_1 \\ ... \\ Y_n \end{bmatrix} \in \mathbb{R}^{n}.
 55 | \end{equation}
 56 | Applying matrix multiplication in lieu of our summations before, we find
 57 | \begin{equation*}
 58 |     Y - F \beta = \begin{bmatrix} Y_1 \\ ... \\ Y_n \end{bmatrix} - \begin{bmatrix} \beta_1h_1(x_1) + ... + \beta_{n+4}h_{n+4}(x_1) \\ ... \\ \beta_1h_1(x_n) + ... + \beta_{n+4}h_{n+4}(x_n) \end{bmatrix},
 59 | \end{equation*}
 60 | and therefore
 61 | \begin{equation*}
 62 |     \sum_{i=1}^n \left(Y_i - \sum_{j=1}^{n+4} \beta_jh_j(X_i)\right)^2 = ||Y - F\beta||_2^2.
 63 | \end{equation*}
 64 | Then we look at the regularization term, since
 65 | \begin{align*}
 66 |     \int \left( \sum_{j=1}^{n+4} \beta_jh_j''(x) \right)^2dx & = \int \left( \sum_{j=1}^{n+4} \sum_{k=1}^{n+4} \beta_j\beta_k h_j''(x) h_k''(x) \right)dx \\
 67 |     &= \sum_{j=1}^{n+4}\sum_{k=1}^{n+4} \beta_j\beta_k \left( \int h_j''(x)h_k''(x)dx \right),
 68 | \end{align*}
 69 | by defining the term $\Omega_{jk}$ as
 70 | \begin{equation*}
 71 |     \Omega_{jk} \triangleq \int h_j''(x)h_k''(x)dx,
 72 | \end{equation*}
 73 | we have $\Omega = [\Omega_{jk}] \in \mathbb{R}^{(n+4)\times(n+4)}$ and
 74 | \begin{equation*}
 75 |     \sum_{j=1}^{n+4}\sum_{k=1}^{n+4} \beta_j\beta_k \left( \int h_j''(x)h_k''(x)dx \right) = \sum_{j=1}^{n+4}\sum_{k=1}^{n+4} \beta_j\beta_k\Omega_{jk} = \beta^\top \Omega \beta.
 76 | \end{equation*}
 77 | Now that we have translated each part of the objective function, we can finally write it as
 78 | \begin{equation*}
 79 |     L_\lambda(\beta) \triangleq ||Y - F\beta||_2^2 + \lambda \beta^\top \Omega \beta.
 80 | \end{equation*}
 81 | This is a remarkably familiar functional form which reminds us of a simple linear regression and ridge regression. The regularization is weighted by matrix $\Omega$. In the case that we have $\Omega = I$, then we have
 82 | 
 83 | $$\beta^\top \Omega \beta = \beta^\top \beta = ||\beta||_2^2,$$
 84 | which is exactly a ridge penalty.
 85 | \subsec{Minimizing the regularized objective}
 86 | Given its similarities to linear regression, we can solve this objective function analytically in a similar fashion to that. That is, we will compute the gradient
 87 | \begin{equation*}
 88 |     \nabla L_\lambda(\beta) = -2F^\top (Y - F\beta) + 2\lambda \Omega\beta,
 89 | \end{equation*}
 90 | and set it equal to zero. Solving the above linear equation, we find
 91 | \begin{equation*}
 92 |     \hat\beta = (F^\top F + \lambda\Omega)^{-1} F^\top Y.
 93 | \end{equation*}
 94 | And therefore the minimizing natural cubic spline is then
 95 | \begin{equation*}
 96 |     \hat{r}(x) = \sum_{j=1}^{n+4} \hat\beta_jh_j(x). 
 97 | \end{equation*}
 98 | \subsec{Choosing the basis}
 99 | Now that we have analytically solved for $\hat{r}$, we might wonder how to choose $h_1, ..., h_{n+4}$. In the last lecture, we saw an example of a basis given by $h_{i+4} = (x - \xi_i)_+^3$ where $\xi_i$ is a knot in our spline. When choosing a basis, we should always remember that we must derive $\Omega$ which requires integration. Therefore, our choice of basis should be relatively easy to integrate. Of course, this can be done numerically but it's a much simpler problem when we are able to integrate and know properties of the basis (i.e. to speed up computation of $F^\top F$ in our estimate of $\beta$). 
100 | 
101 | The textbook refers to $B$-splines as a good basis for computational reasons and because they have nice properties. We will not delve into this here.
102 | \sec{Interpretation of splines}
103 | 
104 | \subsec{Splines as linear smoothers}
105 | 
106 | First, let's recall that a linear smoother is a family of functions that takes an average of the response variable (i.e. our $Y$ variable). Splines fall within this family of linear smoothers. We will show why this is true. Let's begin by recalling our definition of $\hat\beta$ and $\hat{r}$.
107 | \begin{equation*}
108 |     \hat\beta = (F^\top F + \lambda\Omega)^{-1} F^\top Y, \qquad \hat{r}(x) = \sum_{j=1}^{n+4} \hat\beta_jh_j(x).
109 | \end{equation*}
110 | By taking $h(x) = \begin{bmatrix} h_1(x) & \cdots & h_{n+1}(x) \end{bmatrix}^\top \in \mathbb{R}^{n+4}$, we can write
111 | \begin{align*}
112 | 	\hat{r}(x) & = h(x)^\top  (F^\top F + \lambda\Omega)^{-1} F^\top Y := LY,
113 | \end{align*}
114 | where $L = h(x)^\top  (F^\top F + \lambda\Omega)^{-1} F^\top$, meaning our spline is indeed a linear smoother, as required. It is important to show that this falls into the family of linear smoothers because it means that we can apply our methods of cross validation (which were defined for linear smoothers) to splines as well.
115 | \subsec{Splines approximated by kernel estimation}
116 | 
117 | In general, splines and local linear regression will perform approximately equivalently. It will be hard to find consistent cases where one outperforms the other; it simply depends on the noise in the data. Let's quickly recall the kernel estimator:
118 | $$
119 | \hat{r} = \frac{\sum_{j=1}^n w_jY_j}{\sum_{j=1}^n w_j}, \qquad w_j = K\left( \frac{x_j - x}{h} \right).
120 | $$
121 | Note that $K$ is our kernel function (e.g. Gaussian, boxcar, etc.). Splines approximately correspond to a similar functional form.
122 | $$
123 | \hat{r} \approx \frac{\sum_{j=1}^n w_jY_j}{\sum_{j=1}^n w_j}, \qquad w_j = \frac{n^{\frac{1}{4}}}{\lambda^{\frac{1}{4}}f(x)^{\frac{3}{4}}} K\left( \frac{x_j - x}{ \left( \frac{\lambda}{nf(x_j)} \right)^{\frac{1}{4}}} \right).
124 | $$
125 | This is clearly much more complicated that our standard kernel estimator for a number of reasons. First, we can notice that we have a dependency on the density $f(x)$ of $x$. Next, our kernel $K$ is typically not the Gaussian kernel (although it is usually something relatively reasonable). Finally, we have a bandwidth that depends on $f(x)$. 
126 | 
127 | Therefore, splines can be considered a special form of local averaging. They have the advantage in that they tend to better fit the global structure of the data due to the ``global'' penalization term that encourages smoothness.
128 | \subsec{Advanced spline methods}
129 | There exist more advanced spline methods, although we won't dive too deeply into them here. For example, there is a method that has knots that aren't necessarily on data points. In our penalized regression above, we mandated that the knots fell onto a data point in order to properly penalize the objective function. Other methods do not have this requirement. Other methods also allow us to fix knots $q_1, ..., q_m$ in advance while still others put knots in the optimization problem (i.e. the function also optimizes over the placement of knots and chooses their locations).
130 | \subsec{Confidence bands}
131 | Confidence intervals (or bands) are always important in statistics. We want to know how close we are to the ground truth functional form of the data and we want to know where our estimates might be more different from the ground truth (i.e. we want a more interpretable estimate).
132 | \begin{figure}[htbp!]
133 | 	\centering
134 |     \includegraphics[scale = .4]{figure/Lecture04/graph.png}
135 |     \caption{A example of spline along with its confidence band.}
136 | \end{figure}
137 | Our confidence band will be of the typical form
138 | $$
139 | [\hat{r} - \hat{s}(x), \hat{r} + \hat{s}].
140 | $$
141 | In the case of splines, we have an estimate of $\hat{r}(x)$ and wish to find the standard deviation of $\hat{r}(x)$ defined as $\hat{s}(x)$. That is, we want to find a confidence band around our estimates that contains the true function $r(x)$. In practice, this is difficult since we do not know the difference between $\mathbb{E}[\hat{r}(x)]$ and $r(x)$. Statisticians thus find the confidence band for $\mathbb{E}[\hat{r}(x)]$. For this purpose, we find $\hat{s}(x) \approx \text{SD}(\hat{r}(x))$. 
142 | 
143 | We can do this for the general family of linear smoothers with the general form
144 | $$
145 | \hat{r}(x) = \sum_{i=1}^n l_i(x)Y_i.
146 | $$
147 | From here, we can find the variance of $\hat{r}(x)$.
148 | \begin{align*}
149 |     \text{Var}(\hat{r}(x)) &= \text{Var}\left( \sum_{i=1}^n l_i(x)Y_i \right) \sum_{i=1}^n l_i(x)^2 \text{Var}(Y_i) = \sum_{i=1}^n l_i(x)^2 \sigma^2 = \sigma^2 ||l(x)||_2^2.
150 | \end{align*}
151 | 
152 | \noindent Notice that we assume $\text{Var}(Y_i) = \sigma^2$ for each $\{(Y_i) : 1 \leq i \leq n\}$ and define $l(x) = \begin{bmatrix} l_1(x) \\ .... \\ l_n(x) \end{bmatrix}$. Thus, our confidence band becomes
153 | $$
154 | [\hat{r} - c \cdot \sigma \cdot ||l(x)||_2, \hat{r} + c \cdot \sigma \cdot ||l(x)||_2].
155 | $$
156 | 
157 | \noindent Notice the addition of the variable $c$ which is chosen based on the level of confidence desired. A large value of $c$ will have higher confidence whereas a small value of $c$ will have lower confidence.
158 | 
159 | \sec{Nonparametrics in high dimension}
160 | In order to motivate this discussion of high dimensional statistics, we will begin by examining our toolbox of nonparametric techniques in the context of higher dimensional data. Let's consider the case when we have $X \in \mathbb{R}^d$ where $d > 1$. 
161 | 
162 | There are some immediate challenges to local averaging, including the construction of neighborhoods. A natural first thought would be to construct a ``sphere'' $B_x = \{x' : ||x' - x||_2 \leq h\}$ around some fixed point $x$. This is a problem in high dimensions because the distance between points often is not informative. They are often far away from one another in high dimensions.
163 | 
164 | \subsec{Examples}
165 | 
166 | \begin{example}
167 | Suppose we generate $x^{(1)}, ..., x^{(n)} \iid \text{Unif}(s^{d-1})$ where the superscripts denote the index of the data points and $s^{d-1}$ is a $d$-dimensional unit sphere defined as $\{x : ||x||_2 = 1, x \in \mathbb{R}^d\}$. 
168 | \end{example}
169 | In this case, with $p \geq 1 - n^2\text{exp}(-\Omega(d))$, we have
170 | $$
171 | \forall i, j \in [n], ||x^{(i)} - x^{(j)}||_2 \approx \sqrt{2} \Rightarrow \langle x^{(i)}, x^{(j)} \rangle \approx 0.
172 | $$
173 | This is problematic because the distance between points isn't that big since each $x$ is orthogonal to all other values of $x$. The distance isn't informative in this case. That is, if we take $h \gg \sqrt{2}$, then we have all data points in our neighborhood. However, if we take $h \ll \sqrt{2}$, then we have no points in our neighborhood.
174 | 
175 | \begin{example}
176 | Let's consider the example of comparing images as in Fig.~\ref{fig:fig2}.
177 | \begin{figure}
178 | 	\centering
179 |     \includegraphics[scale = .4]{figure/Lecture04/image_flip.png}
180 | 	\caption{Example of image comparison.} \label{fig:fig2}
181 | \end{figure}
182 | \end{example}
183 | The distances between these two images will appear as very large just comparing the pixels, even if they are closely related.
184 | 
185 | \subsec{$k$-Nearest neighbors algorithm}
186 | 
187 | \noindent One of the most prominent algorithms for high dimensional data is the $k$-nearest neighbors method. Before diving into this algorithm, we must first note that it doesn't always solve a problem, but will potentially make it easier to handle. The algorithm is defined as follows.
188 | \begin{enumerate}
189 |     \item $\forall x$, let $B_x = \{i : x^{(i)} \text{ is among the } k \text{ closest neighbors of } x\}$;
190 |     \item $\hat{r}(x) = \frac{1}{k} \sum_{i \in B_x} Y_i$.
191 | \end{enumerate}
192 | A benefit of this algorithm is that each bin $B_x$ will always contain $k$ points, so we don't need to worry about bandwidth. Of course, this doesn't always alleviate the problem that the neighbors might not be meaningful (i.e. they could be neighbors just by chance). The plots in Fig.~\ref{fig:fig3} explore a few different values for $k$ on a toy dataset ($k \in \{10, 50, 400\}$).
193 | 
194 | \begin{figure}
195 | 	\centering
196 | 	\includegraphics[scale = .18]{figure/Lecture04/knn10.png}
197 | 	\includegraphics[scale = .18]{figure/Lecture04/knn50.png}
198 | 	\includegraphics[scale = .18]{figure/Lecture04/knn400.png}
199 | 	\caption{Examples of $k$-NN with $k=10$ (Left), $k=50$ (Middle) and $k=400$ (Right).} \label{fig:fig3}
200 | \end{figure}
201 | 
202 | However, there is the fundamental limitation in the curse of dimensionality. That is, nonparametric methods require a sample size exponential in dimension $d$. A bit more formally, if we only assume Lipschitzness or smoothness conditions (e.g. $||r(x) - r(x')|| \leq ||x - x'||$ or bounds on second order derivatives) then any estimator of $\hat{r}$ will have errors on the order of $n^{-\frac{1}{\Omega(d)}}$. That is, we have
203 | $$
204 | \epsilon = n^{-\frac{1}{\Omega(d)}} \Rightarrow n \geq \left( \frac{1}{\epsilon} \right)^{\Omega(d)}.
205 | $$
206 | 
207 | \sec{Kernel method}
208 | 
209 | \noindent We will now explore the kernel method (not to be confused with our previous kernel estimators). In order to begin, let's introduce some notation. Our training dataset is defined as
210 | $$
211 | \left(x^{(1)}, y^{(1)}\right), ..., \left(x^{(n)}, y^{(n)}\right), \qquad x^{(i)} \in \mathbb{R}^d, y^{(i)} \in \mathbb{R}.
212 | $$
213 | The general idea of the kernel method is to generate a feature map defined as 
214 | $$
215 | \phi : x \in \mathbb{R}^d \rightarrow \phi(x) \in \mathbb{R}^m,
216 | $$
217 | where $x \in \mathbb{R}^d$ are our input and $\phi(x) \in \mathbb{R}^m$ are our features. Note that $m$ can be very big or even infinite. More specifically, we want to transform our standard database into a different feature-space.
218 | $$
219 | \left\{ \left( x^{(i)}, y^{(i)} \right) \right\}_{i=1}^n \rightarrow \left\{ \left( \phi \left( x^{(i)} \right), y^{(i)} \right) \right\}_{i=1}^n.
220 | $$
221 | After converting this to a higher dimension, we can run a standard parameterized method on the transformed dataset (e.g. a linear regression).
222 | 
223 | \subsec{Motivating examples}
224 | 
225 | \begin{example}
226 | 	Let's consider nearest neighbors with $\ell_2$ distance in the feature space. That is, we have $d(x, z) = ||\phi(x) - \phi(z)||_2^2$. If we design $\phi(\cdot)$ in the right way, our distance metric $d(\cdot, \cdot)$ will be more informative than the typical $||x - z||_2^2$. For example, applying $\phi$ could make transformed values $\phi(x)$ and $\phi(z)$ closer than their untransformed counterparts.
227 | \end{example} 
228 | \begin{example}
229 | 	We can apply the same logic to traditional linear models. That is, we can apply $\phi(x)$ and fit a linear model to the transformed data to extract more signal. Suppose we have $x, y \in \mathbb{R}$ with linear model $y = \theta_0 + \theta_1 x$. Now, let's say $\phi(x) = (1, x, x^2, x^3) \in \mathbb{R}^4$. We can then fit a linear regression on top of this transformed data
230 | 	$$
231 | 	y = \theta^\top \phi(x) = \begin{bmatrix} \theta_0 \\ \theta_1 \\ \theta_2 \\ \theta_3 \end{bmatrix}\phi(x) = \theta_0 + \theta_1x + \theta_2x^3 + \theta_3x^3.
232 | 	$$
233 | 	This is a more flexible polynomial model that can be expanded even beyond the third degree to represent any polynomial. This allows us to rely less on the assumptions of linear regression by transforming our input into a higher dimension.
234 | \end{example}
235 | \subsec{Kernel regression}
236 | In a linear regression on transformed feature space, we find the estimator as follows
237 | $$
238 | \hat{y} = \phi(x)^\top \theta, \qquad \phi(x) \in \mathbb{R}^m.
239 | $$
240 | 
241 | \noindent Let's focus on the case when $m > n$. Our least squares objective remains the same.
242 | $$
243 | \hat{L}(\theta) \triangleq \argmin_{\theta \in \mathbb{R}^m} \frac{1}{2n} \sum_{i=1}^n \left( y^{(i)} - \phi \left (x^{(i)} \right)^\top \theta \right)^2.
244 | $$
245 | 
246 | \noindent Converting to matrix notation, let's define a few variables.
247 | $$
248 | \Phi = \begin{bmatrix} \phi(x^{(1)})^\top \\ ... \\ \phi(x^{(n)})^\top \end{bmatrix} \in \mathbb{R}^{n \times n}, \qquad \bm{y} = \begin{bmatrix} y^{(1)} \\ ... \\ y^{(n)} \end{bmatrix} \in \mathbb{R}^n.
249 | $$
250 | 
251 | \noindent Then, our objective function becomes the following
252 | $$
253 | \hat{L}(\theta) = \frac{1}{2n}||\bm{y} - \Phi\theta||_2^2.
254 | $$
255 | 
256 | \noindent Noticing that this is convex in $\theta$, we can compute the gradient and set it equal to zero just as in a typical optimization problem
257 | \begin{align*}
258 | &\nabla \hat{L}(\theta) = \frac{1}{n} \sum_{i=1}^n \left( y^{(i)} - \phi(x^{(i)})^\top\theta \right)\phi(x^{(i)}) = 0, \\
259 | & \Leftrightarrow \sum_{i=1}^n \phi(x^{(i)})\phi(x^{(i)})^\top \theta = \sum_{i=1}^n y^{(i)}\phi(x^{(i)}), \\
260 | & \Leftrightarrow \Phi^\top \Phi \theta = \Phi^\top \bm{y}.
261 | \end{align*}
262 | It is important to notice that when $m > n$, $\Phi^\top \Phi$ is not invertible as is required to solve the minimization problem. This is because $\Phi^\top \Phi \in \mathbb{R}^{m \times m}$ with rank $n$ (i.e. the rank is smaller than the dimension). This means we will have a family of solutions rather than a unique one. We claim that the family of solutions is given by
263 | $$
264 | \theta = \Phi^\top (\Phi \Phi^\top )^{-1}\bm{y} + \beta,
265 | $$
266 | where $\beta \perp \phi(x^{(1)}), ..., \phi(x^{(n)})$ (i.e. $\beta$ is in the null space or $\beta \Phi = 0$). Note that this only feasible if $\Phi \Phi^\top$ is invertible. We can verify that this indeed is a family of solutions.
267 | \begin{align*}
268 |     \Phi^\top \Phi \theta &= (\Phi^\top \Phi)(\Phi^\top (\Phi \Phi^\top )^{-1}\bm{y} + \beta) \\
269 |     &= \Phi^\top \Phi \Phi^\top (\Phi\Phi^\top)^{-1} \\
270 |     &= \Phi^\top \bm{y}.
271 | \end{align*}
272 | Note that the $\beta$ term disappears because it is orthogonal to $\Phi$. We can also conduct a sanity check by confirming that there is zero training error.
273 | \begin{align*}
274 |     \bm{y} - \Phi\theta &= \bm{y} - \Phi(\Phi^\top (\Phi \Phi^\top )^{-1}\bm{y} + \beta) \\
275 |     &= \bm{y} - \Phi\Phi^\top (\Phi\Phi^\top)^{-1}\bm{y} \\
276 |     &= \bm{y} - \bm{y} \\
277 |     &= \bm{0}.
278 | \end{align*}
279 | Again, note that the $\beta$ term disappears because it is orthogonal to $\Phi$. Typically, we will take the minimum of the family of solutions as ``the'' solution to the problem. This is because it is seen as the simplest model and can likely generalize the best. The minimum in this case is given by:
280 | \begin{align}
281 | \hat{\theta} = \Phi^\top (\Phi \Phi^\top )^{-1}\bm{y}.
282 | \end{align}
283 | The issue remains that, when $m$ is large, it is computationally inefficient. Further, when $m$ is infinite, this is an impossible problem. In fact, computation is on the order of $O(m \cdot n^2)$. \newline
284 | \subsec{Kernel efficiencies}
285 | The trick of the kernel is to remove the explicit dependency on $m$. We know that $\hat{\theta}$ is difficult to compute; instead, we can compute $\hat{\theta}^\top \phi(x)$.
286 | $$
287 | \hat{\theta}^\top \phi(x) = \bm{y}^\top (\Phi\Phi^\top)^{-1}\Phi\phi(x).
288 | $$
289 |  We can re-write this in a more typical kernel fashion by defining $K$.
290 | $$
291 | K \triangleq \Phi\Phi^\top = \left[ \phi(x^{(i)})^\top \phi(x^{(j)}) \right]_{\forall i, j \in [n]}.
292 | $$
293 | Using this definition, we can plug into the expression for $\hat{\theta}\phi(x)$.
294 | $$
295 | \hat{\theta}^\top \phi(x) = \bm{y}^\top K^{-1} \begin{bmatrix} \phi(x^{(1)})^\top \phi(x) \\ ... \\ \phi(x^{(n)})^\top \phi(x) \end{bmatrix}.
296 | $$
297 | Thus, we see that this kernel trick only requires (i) $\phi(x^{(i)})\phi(x^{(j)})$ and (ii) $\phi(x^{(i)})^\top \phi(X)$ for all $i, j \in \{1, ..., n\}$. In the case that we can easily compute $\phi(x)^\top \phi(z)$ for some $x, z$, then we don't have any dependency on $m$. That is, we can construct feature maps such that $\phi(x)^\top \phi(z)$ can be computed more quickly than $O(m)$ time. 
298 | 
299 | Now we will briefly discuss the time it takes to compute an estimate in this fashion. Let's say it takes us $T$ units of time to compute $\phi(x)^\top \phi(z)$. Then, it takes us roughly
300 | \begin{enumerate}
301 |     \item $n^2T$ units of time to compute the matrix $K$;
302 |     \item $n^3$ units of time to compute the inverse $K^{-1}$;
303 |     \item $nT$ units of time to compute the vector $\begin{bmatrix} \phi(x^{(1)})^\top \phi(x) \\ ... \\ \phi(x^{(n)})^\top \phi(x) \end{bmatrix}$;
304 |     \item $n^2$ units of time to compute the matrix-vector product.
305 | \end{enumerate}
306 | All in all, it takes $n^2T + n^3 + nT + n^2$ units of time to compute $\hat{\theta}^\top \phi(x)$. Overloading our notation for $K$, we can define the kernel as the inner product
307 | $$
308 | K(x, z) \triangleq \phi(x)^\top \phi(z).
309 | $$
310 | We wish to construct a constant $\phi$ such that $K(\cdot, \cdot)$ is easy to compute. There are lots of ways to do this. In fact, we can even ignore $\phi$ and instead directly work with our kernel $K$ instead (as long as we know there exists some $\phi$ where $K = \phi(x)^\top \phi(z)$).
311 | 
312 | \subsec{Examples}
313 | 
314 | \begin{example}
315 | Consider the case when we have $x = (x_1, ..., x_d) \in \mathbb{R}^d$. Let's construct a feature map as follows.
316 | $$
317 | \phi(x) = \begin{bmatrix} 1 \\ x_1 \\ ... \\ x_d \\ x_1x_1 \\ x_1x_2 \\ ... \\ x_dx_d \end{bmatrix} \in \mathbb{R}^{1 + d + d^2}, \qquad \phi(x)^\top \phi(z) = \begin{bmatrix} 1 \\ x_1 \\ ... \\ x_d \\ x_1x_1 \\ x_1x_2 \\ ... \\ x_dx_d \end{bmatrix}^\top \begin{bmatrix} 1 \\ z_1 \\ ... \\ z_d \\ z_1z_1 \\ z_1z_2 \\ ... \\ z_dz_d \end{bmatrix}.
318 | $$
319 | Re-writing the inner product, we find the following.
320 | \begin{align*}
321 |     \phi(x)\phi(z) &= 1 + \sum_{i=1}^d x_iz_i + \sum_{i, j=1}^d x_ix_jz_iz_j \\
322 |     &= 1 + x^\top z + \sum_{i=1}^d x_iz_i \sum_{j=1}^d x_jz_j \\
323 |     &= 1 + x^\top z + (x^\top z)^2.
324 | \end{align*}
325 | Since it takes $O(d)$ time to compute $x^\top z$, it takes $O(d)$ time to compute $\phi(x)\phi(z)$. There is no reliance on $m$ here, so our kernel trick worked. 
326 | \end{example}
327 | \begin{example}
328 | 	Again, let's consider $x = (x_1, ..., x_d) \in \mathbb{R}^d$. Let's consider a degree-3 construction of $\phi$ this time.
329 | 	$$
330 | 	\phi(x) = \begin{bmatrix} 1 \\ x_1 \\ ... \\ x_d \\ x_1x_1 \\ x_1x_2 \\ ... \\ x_dx_d \\ x_1x_1x_1 \\ x_1x_1x_2 \\ ... \\ x_dx_dx_d \end{bmatrix} \in \mathbb{R}^{1 + d + d^2 + d^3}, \qquad \phi(x)^\top \phi(z) = \begin{bmatrix} 1 \\ x_1 \\ ... \\ x_d \\ x_1x_1 \\ x_1x_2 \\ ... \\ x_dx_d \\ x_1x_1x_1 \\ x_1x_1x_2 \\ ... \\ x_dx_dx_d \end{bmatrix}^\top \begin{bmatrix} 1 \\ z_1 \\ ... \\ z_d \\ z_1z_1 \\ z_1z_2 \\ ... \\ z_dz_d \\ z_1z_1z_1 \\ z_1z_1z_2 \\ ... \\ z_dz_dz_d \end{bmatrix}.
331 | 	$$
332 | 	Similar to the argument above, we find $\phi(x)^\top \phi(z) = 1 + x^\top z + (x^\top z)^2 + (x^\top z)^3$, meaning we have $O(d)$ time again. 
333 | \end{example}
334 | \begin{example}
335 | 	A Gaussian kernel also works here. That is, we have
336 | 	$$
337 | 	K(x, z) = \text{exp}\left(-\frac{||x - z||_2}{2}\right).
338 | 	$$
339 | 	We know that there exists some $\phi$ such that $K(x, z) = \phi(x)^\top \phi(z)$.
340 | \end{example}
341 | 
342 | There are many, many more examples of valid kernels. Here are just a few listed without many details.
343 | \begin{enumerate}
344 |     \item $K(x, z) = (x^\top z)^2$;
345 |     \item $K(x, z) = (x^\top z)^k$;
346 |     \item $K(x, z) = (1 + x^\top z)^k$;
347 |     \item $K(x, z) = \text{exp}\left(-\frac{||x - z||_2}{2\sigma^2}\right)$;
348 |     \item $K(x, z) = \text{random features kernel}$;
349 |     \item $K(x, z) = \text{infinite dimension features}$.
350 | \end{enumerate}
351 | 
352 | \subsec{Existence of $\phi$}
353 | 
354 | For a kernel function to be valid there must exist some $\phi$ such that $K(x, z) = \phi(x)^\top\phi(z)$. Let's show how we know that $\phi$ exists.
355 | \begin{theorem}
356 |     If $K(x, z) = \phi(x)^\top \phi(z)$ then for any $x^{(1)}, ..., x^{(n)}$ we have $[K(x^{(i)}, x^{(j)})]_{i, j \in [n]} \succeq 0$. That is, the matrix $K$ must be semi-positive definite.
357 | \end{theorem}
358 | \begin{proof} We know that $K \succeq 0$ if and only if $v^\top K v \succeq 0$ for all $v$. Let's show that this holds true for some arbitrary $v$.
359 | \begin{align*}
360 |     v^\intercal K v &= \sum_{i, j=1}^n v_i K_{ij} v_j \\
361 |     &= \sum_{i, j=1}^n v_i \langle \phi(x^{(i)}), \phi(x^{(j)}) \rangle v_j \\
362 |     &= \sum_{i, j=1}^n v_i \left( \sum_{k=1}^n \phi(x^{(i)}_k \phi(x^{(j)})_k \right) v_j \\
363 |     &= \sum_{k=1}^m \left( \sum_{i, j=1}^n v_i \phi(x^{(i)})_k v_j \phi(x^{(j)})_k \right) \\
364 |     &= \sum_{k=1}^m \left( \sum_{i=1}^n v_i \phi(x^{(i)})_k \right) \left( \sum_{j=1}^n v_j \phi(x^{(j)})_k \right) \\
365 |     &= \sum_{k=1}^m \left( \sum_{i=1}^n v_i \phi(x^{(i)})_k \right)^2 \geq 0
366 | \end{align*}
367 | \end{proof}
368 | Therefore, $K \succeq 0$ for any $x^{(1)}, \cdots, x^{(n)}$ is a necessary condition for $\phi$ to exist. This is in fact also sufficient. If you're interested, you can find more about it here \cite{enwiki:987099180} known as Mercer's theorem.


--------------------------------------------------------------------------------
/Spring2021/05-14-2021.tex:
--------------------------------------------------------------------------------
  1 | %\newcommand{\Exp}{\mathbb{E}}
  2 | 
  3 | 
  4 | 
  5 | % reset section counter
  6 | \setcounter{section}{0}
  7 | 
  8 | %\metadata{lecture ID}{Your names}{date}
  9 | \metadata{7}{Abigail VanderPloeg}{May 14th, 2021}
 10 | 
 11 | \sec{Review and overview}
 12 | In the previous few lectures, the class have covered neural network methods. While topics such as local linear regression and splines are considered classical methods in nonparametric studies (having been developed 30 years ago), most recently covered lecture topics of kernel methods and neural networks are part of modern nonparametric methods.
 13 | 
 14 | This lecture will conclude the class's coverage of neural networks for use on the smaller datasets often seen in nonparametric statistics. Next, the lecture begins discussion of unsupervised learning, which does not have an output or response variable for the input data. Instead, we wish to learn about the underlying distribution of the input data, which can be achieved with CDF and density estimation methods. 
 15 | 
 16 | 
 17 | \sec{Neural networks: few-shot learning}
 18 | 
 19 | \subsec{Transfer learning}
 20 | Here we continue the lectures' coverage of transfer learning, which is one way in which we can utilize neural networks on small datasets. 
 21 | 
 22 | Transfer learning involves first training a model on a big data set (as can be done by other researchers such as those at Google, who have released pre-trained models), then fine-tuning the model on the smaller data set. One way in which to fine-tune the model is to remove only the last layer of the model and replace it with a new, randomly initialized last layer that will be fine-tuned. Another approach is to fine-tune the entire network by again replacing the last layer of the model with a new, randomly initialized last layer, and then to fine-tune the parameters of the entire model with a small number of passes over the data.
 23 | 
 24 | \subsec{Few-shot learning}
 25 | Few-shot learning is utilized in an even more extreme case than transfer learning, when the dataset is even smaller. The learning setting involves training data of, say, $N$ examples and $l$ classes, where both $N$ and $l$ are very big (ImageNet \cite{deng2009imagenet}, a standard example, has $N = 1.2$M and $l = 10^3$).
 26 |  
 27 | At test time, however, we are only given a small number of examples $(\tilde{x}^{(1)}, \tilde{y}^{(1)}), ..., (\tilde{x}^{(nk )}, \tilde{y}^{(nk)})$ drawn independently and identically distributed from a distribution $P_{\text{test}}$. There are $k$ new labels or classes and $n$ images per each new class. In this scenario, $n$ is very small (such as $n = 5$), and $k$ could be bigger. Such a setting is called a ``$k$-way $n$-shot setting." With this limited data for each new class, the goal is to classify the examples from $P_{\text{test}}$ with one of the $k$ labels. In few-shot learning settings, the feature dimension is typically large (on the order of $10^3$), which makes it difficult to fine-tune a model to the small test dataset, as was done in transfer learning, because overfitting becomes likely.
 28 | 
 29 | \subsec{Nearest neighbor algorithms using features}
 30 | A simple but competitive algorithm in few-shot learning settings is nearest neighbor methods using features, with steps as follows:
 31 | \begin{enumerate}
 32 | 	\item Pretrain neural networks on the large pretraining dataset, which results in an output of $a^\top \phi_W(x)$.
 33 | 	\begin{enumerate}
 34 | 		\item In this case, we enforce $\phi_W(x)$ to have a norm of 1 during training. (This is helpful in order to not have dramatically different norms for different examples, and is likely applied by research teams such as Google who release pretrained neural networks.) \\ \\
 35 | 		This can be done in one of two ways by changing the parameterization
 36 | 		\begin{align}
 37 | 			\phi_W(x) &= \text{normalize}(NN_W(x)) = \frac{NN_W(x)}{\norm{NN_W(x)}_2}. \nonumber
 38 | 		\end{align}
 39 | 		for $NN_W(x)$ as the standard feed-forward NN. This normalization is a sequence of elementary operations, which can be done efficiently (such as computing $\lVert NN_W(x) \rVert_2$) and allows for efficient gradient calculations with auto-differentiation in backpropagation. Performing this operation then implies that $\norm{\phi_W(x)}_2 = 1$.
 40 | 	\end{enumerate}
 41 | 	\item At test time, we utilize an one-nearest neighbor algorithm. (Here, we predict based on the single nearest neighbor rather than a combination of the $k$ nearest as used in $k$-nearest neighbors.) Generally, given an example $x$, we wish to predict the output label $y$. The steps are as follows:
 42 | 	\begin{enumerate}
 43 | 		\item Compute $\phi_w(x)$.
 44 | 		\item Find nearest neighbor in
 45 | 		$\{\phi_w(\tilde{x}^{(1)}), ..., \phi_w(\tilde{x}^{(nk)})\}$. \\ \\
 46 | 		The ``nearness" is quantified according to $\ell_2$ distance or cosine distance. $\ell_2$ distance calculates $d(a,b) = \norm{a - b}_2$. Squaring this calculation results in
 47 | 		\al{
 48 | 			\lVert a - b \rVert_2^2 = \norm{a}_2^2 + \lVert b \rVert_2^2 - 2\braket{a,b}.
 49 | 			%$ = 2 - 2\braket{a,b} $ \text{(Outputs $\phi_w(\tilde{x}^{(i)})$ have been enforced to have unit norms)}
 50 | 		}
 51 | 		which, given the unit norms enforced on outputs $\phi_w(\tilde{x}^{(i)})$, can be simplified to
 52 | 		\al{
 53 | 			\lVert a - b \rVert_2^2 = 2 - 2\braket{a,b}.
 54 | 		}
 55 | 		which is a constant shift from the cosine distance $2\braket{a,b}$, the cosine angle between two vectors $a,b$. \\
 56 | 		\\ Let's suppose that the nearest neighbor is $\phi_w(\tilde{x}^{(j)})$. \\
 57 | 		\item Assign the output label of $\tilde{y}^{(j)}$, or the label of the ``nearest neighbor," to the example $x$.
 58 | 	\end{enumerate}
 59 | \end{enumerate}
 60 | 
 61 | \sec{Unsupervised learning: estimating the CDF}
 62 | Now we return to classical methods, with one-dimensional problems that can be described by CDFs and PDFs (rather than the high-dimensional feature sets of machine learning).
 63 | 
 64 | \subsec{Setup of CDF estimation}
 65 | Let $F$ be the CDF of some distribution over $\mathbb{R}$. Additionally, let us observe $n$ examples from this distribution
 66 | \al{ \nonumber
 67 | 	X_1, ..., X_n \iid F.
 68 | }
 69 | Our goal is to estimate the underlying function $F(x)$.
 70 | 
 71 | From earlier lectures, we can recall that a property of the CDF is that if $X \sim F$, then $\boxed{F(x) =\Pr  \left[X \leq x\right]} \in \left[0, 1\right]$ (the probability that we observe an output less than or equal to $x$). $F(x)$ is monotonically increasing, and an example of CDF is shown in Fig.~\ref{fig:cdf}.
 72 | 
 73 | \begin{figure}[htbp!]
 74 | 	\centering
 75 | 	\includegraphics[scale = 0.6]{figure/Lecture07/CDF.png}
 76 | 	\caption{Example of CDF,  the sigmoid function $f(x) = \frac{1}{1+e^{-x}}$.} \label{fig:cdf}
 77 | \end{figure}
 78 | 
 79 | \subsec{Empirical estimators}
 80 | Given the empirical examples $X_1, ..., X_n \iid F$, we can estimate the underlying CDF, $F(x)$, by evaluating how often $X_i \leq x$ for a given $x$ and $1 \leq i \leq n$ (how often our examples are less than or equal to the input value). Thus, we can consider the empirical estimator $\hat{F}_n(x)$ defined as
 81 | \al{
 82 | 	\hat{F}_n(x) = \frac{1}{n} \sum_{i=1}^n \mathbbm{1}(X_i \leq x). %\leq 1
 83 | }
 84 | We can make the following observations on the function $\hat{F}_n(\cdot)$:
 85 | \begin{itemize}
 86 | 	\item$\hat{F}_n(\cdot)$ as a function is called an \textbf{empirical distribution function}. 
 87 | 	\item $\hat{F}_n(\cdot)$ is a step function which only takes values in $\left[0, \frac{1}{n}, ..., 1 \right]$. This is because $\hat{F}_n(\cdot)$ multiplies $\frac{1}{n}$ by a sum of $n$ integers with values $\in \{0,1\}$, which then implies that $0 \leq \hat{F}_n(x) \leq 1$. 
 88 | 	\item$\hat{F}_n(\cdot)$ is a CDF itself, and  is in fact the CDF of the uniform distribution over $\{X_1, ..., X_n\}$.
 89 | \end{itemize}
 90 | Using the previous example of $F(x)$ (the sigmoid function), we can illustrate what form the estimator will take for an example with $n = 4$ data points in Fig.~\ref{fig:emp-cdf}.
 91 | 
 92 | 
 93 | \begin{figure}[htbp!]
 94 | 	\centering
 95 | 	\includegraphics[scale = 0.6]{figure/Lecture07/CDF_est1.jpg}
 96 | 	\caption{Example of empirical CDF.} \label{fig:emp-cdf}
 97 | \end{figure}
 98 | 
 99 | Although this class will not overview the in-depth theory, it is possible to show for a given $x$ that as the number of examples $n \to \infty$, this estimator $\hat{F}_n(x)$  converges to the underlying distribution function $F(x)$. 
100 | 
101 | In the extreme lower and/or upper range of inputs $x$, the density of data points is close to 0 (since $F(x)$ is flat), and the estimator does not change to transition to the next``increased step" often given the relative lack of examples in these regions. In the opposite scenario in a region where $F(x)$ increases sharply, there are more examples in this region, and the CDF will increase (transition to the next step) more quickly. 
102 | 
103 | The following section involves analysis of simple theorems related to the estimator $\hat{F}_n(x)$.
104 | \begin{theorem}
105 | 	For any fixed value of $x$, the expectation of the empirical estimator, $\mathbbm{E}\left[\hat{F}_n(x)\right]$, satisfies 
106 | 	\al{\mathbbm{E}\left[\hat{F}_n(x)\right] = F(x),} with randomness over the choice of $X_1, ..., X_n$. This means that $\hat{F}_n(x)$ is an unbiased estimator of $F(x)$.
107 | \end{theorem}
108 | 
109 | \noindent \textbf{Proof}
110 | This result can be seen by evaluating the $\mathbbm{E}\left[\hat{F}_n(x)\right]$, since
111 | \al{
112 | 	\mathbbm{E}\left[\hat{F}_n(x)\right] &= \mathbbm{E}\left[\frac{1}{n} \sum_{i=1}^n \mathbbm{1}(X_i \leq x) \right]  \nonumber \\
113 | 	&= \frac{1}{n} \sum_{i=1}^n \mathbbm{E} \left[\mathbbm{1}(X_i \leq x) \right] \nonumber \\ 
114 | 	&= \frac{1}{n} \sum_{i=1}^n \Pr (X_i \leq x) \nonumber \\ 
115 | 	&= F(x).
116 | }
117 | We can also evaluate the variance of $\hat{F}_n(x)$ as
118 | \al{
119 | 	\Var\left[\hat{F}_n(x)\right] = \frac{F(x)(1-F(x))}{n}.
120 | }
121 | and observe that the numerator is bounded in the range $\left[0,1 \right]$ while the denominator becomes larger (approaching infinity) with more and more examples. This means that, with more examples, the variance becomes smaller, and the (unbiased) estimate becomes more accurate. Therefore, we see that 
122 | \al{
123 | 	\Var\left[\hat{F}_n(x)\right] \overset{p}{\to} F(x).
124 | }
125 | or that our estimator converges in probability to the true underlying function (with more and more examples).
126 | \begin{theorem}[Glivenko-Cantelli]
127 | 	\al{
128 | 		\underset{x}{\sup} \, |\hat{F}_n(x) - F(x)| \:{\xrightarrow{a.s.}} \: 0.
129 | 	}
130 | 	This means that the supremum of $|\hat{F}_n(x) - F(x)|$ almost surely converges to 0.
131 | \end{theorem}
132 | Expressed in words, the Glivenko-Cantelli theorem ensures that the estimator converges to the true underlying distribution over the entire function.
133 | \begin{remark}
134 | While smoothing may produce an estimator that looks more similar to a true CDF, the step-wise estimator described is optimal to ensure the convergence of the estimator to the underlying CDF, and smoothing is therefore not necessary. However, this estimator has a zero derivative at most places and no derivative in others, which makes it inapplicable when trying to estimate the density of the data (which is the derivative of the CDF). 
135 | \end{remark}
136 | 
137 | \subsec{Estimating functionals of the CDF}
138 | Consider $T(F)$, which is a function of the CDF $F$. For examples, $T(F)$ could be any of the following functions of $F$ that represent a property of the CDF:
139 | \begin{itemize}
140 | 	\setlength\itemsep{0em}
141 | 	\item Mean of  the distribution $F$.
142 | 	\item Variance of $F$.
143 | 	\item Skewness of $F$ (measuring the asymmetry of the CDF about the mean).
144 | 	\item Quantile of $F$.
145 | \end{itemize}
146 | A \textbf{plug-in estimator} uses $T(\hat{F}_n)$ as the estimator (directly plugging in the estimator for $F$ into the functional). Under certain conditions (satisfied with the functionals listed above), $T(\hat{F}_n) \rightarrow T(F)$. Note that more ``abnormal" functions of F (such as the derivative) do not satisfy these conditions.
147 | 
148 | \sec{Unsupervised learning: density estimation}
149 | As before, we assume that we have data points
150 | \begin{align} \nonumber
151 | 	X_1, ..., X_n \overset{iid}{\sim} F.
152 | \end{align}
153 | with $F$ as the CDF of the underlying data distribution.
154 | 
155 | The PDF, or density, is $f = F'$. In other words, it is the derivative of the CDF. In density estimation, the goal is to estimate the underlying density function $f$ from the data $X_1, ..., X_n$.
156 | 
157 | In order to do so, we utilize technical ideas similar to regression problems. In this case, instead of predicting over an output $y$, we aim to predict $f(x)$. However, we do not directly observe $f(x)$ in any of the data points. This problem is separate from the problem of empirical CDF estimates, since we cannot simply take $\hat{f} = \hat{F}_n(x)'$ for an empirical CDF estimator $\hat{F}_n(x)$ because this CDF estimator is a step function which has a derivative of 0 at most inputs and infinity at the location of the data points.
158 | 
159 | \subsec{Measuring performance of density estimators}
160 | There are several different ways in which to measure and evaluate the performance of density estimators. The most common way in which we can measure the performance of a density estimator $\hat{f}$ in accurately estimating the true density $f$ is by calculating the \textbf{integrated mean square error}: 
161 | \al{
162 | 	R(\hat{f},f) &= \int \left(\hat{f}(x) - f(x)\right)^2 dx.  \nonumber
163 | }
164 | For a one-dimensional problem, this can be seen as a natural extension the mean squared error. 
165 | 
166 | Another way to calculate the risk in order to measure performance is to calculate the \textbf{$\ell_1$ integrated risk}, also known as the \textbf{total variation (TV) distance} between the two distributions $f$ and $\hat{f}$:
167 | \al{
168 | 	R_{\ell_1}(\hat{f},f) &= \int \left|\hat{f}(x) - f(x)\right| dx.
169 | } 
170 | Throughout the rest of the lecture, we will utilize the mean squared error as a metric to evaluate density estimator performance.
171 | \begin{remark} The mean squared error is not very useful in high dimensions. (If $f = \hat{f}$, then the mean squared error will evaluate to 0, but this error generally does not scale well in higher dimensions.)
172 | \end{remark}
173 | \subsec{Mean squared error in high-dimensional spaces}
174 | 
175 | Consider the $d$-dimension problem as follows. We assume that $f$ is a spherical Gaussian $\sim \mathcal{N}(0, I)$. It follows that 
176 | \al{
177 | 	f(x) = \frac{1}{\left(\sqrt{2\pi}\right)^d} \cdot \exp\left(-\frac{1}{2} \lVert x \rVert_2^2\right).
178 | }
179 | Some key observations we can make about this density function are:
180 | \begin{itemize}
181 | 	\item $f$ is a density and therefore \al{
182 | 		f(x) \geq 0.
183 | 	}
184 | 	\item We can evaluate the point with the largest density as
185 | 	\al{
186 | 		\underset{x}{\sup}\,f(x) = f(0) \leq \frac{1}{\left(\sqrt{2\pi}\right)^d}. \;\;\;\; \text{(an inverse exponential)}
187 | 	}
188 | 	This means that, in high dimensional-spaces, we are aiming to predict very small values, which becomes an issue that is exacerbated in the integrated mean squared error calculation.
189 | \end{itemize}
190 | Now, consider some $\hat{f}$ that approximates $f$ reasonably well. Because we have shown the output of $f(x)$ to be less than the inverse exponential $\frac{1}{\left(\sqrt{2\pi}\right)^d}$, we can reasonably expect that $\hat{f} \leq \frac{1}{\left(\sqrt{2\pi}\right)^d}$ for most $x$.
191 | 
192 | We can evaluate the integrated mean squared error between the described $f$ and $\hat{f}$ as follows:
193 | \al{
194 | 	R(\hat{f},f) &= \int \left(\hat{f}(x) - f(x)\right)^2 dx \nonumber \\ \nonumber
195 | 	&\leq \int \left| \hat{f}(x) - f(x) \right| \cdot \left(\lvert \hat{f}(x) \rvert + \lvert f(x) \rvert \right)dx \\ \nonumber
196 | 	&\lesssim \frac{2}{\left(\sqrt{2\pi}\right)^d} \int \left| \hat{f}(x) - f(x) \right|dx \\  \nonumber
197 | 	&\lesssim \frac{2}{\left(\sqrt{2\pi}\right)^d} \int \left(\hat{f}(x) + f(x)\right)dx \qquad \text{($f$ and $\hat{f}(x)$ are positive)} \\ \nonumber
198 | 	&\leq \frac{4}{\left(\sqrt{2\pi}\right)^d}. \nonumber
199 | }
200 | In conclusion, if we have an estimator $\hat{f}$ such that $\hat{f}(x) \leq \frac{1}{\left(\sqrt{2\pi}\right)^d} \; \forall x$, then 
201 | \al{
202 | 	R(\hat{f},f) \leq \frac{4}{\left(\sqrt{2\pi}\right)^d}.
203 | }
204 | and thus $f$ and $\hat{f}$ need not be close by any means for the error to be proportionally very small as an inverse exponential. Note that the TV distance can also not be very meaningful in high dimensions. Generally, the distance between two distributions in high-dimensional space is non-trivial. There are, however, alternatives used that offer slightly better method of measuring the performance of density estimators in high dimensions. One such alternative is the KL divergence metric. However, the KL divergence can still result in a large error for very similar distributions. Take, for example, two distributions $P_1 = \mathcal{N}(0, I)$ and  $P_1 = \mathcal{N}(\mu, I)$, where $\mu$ is a small vector. Then, the KL divergence can become very large. Wassertein distance is another alternative method that incorporates the geometry into the calculation, and performs better for examples such as distributions which are two point masses that are very close to one another, such as pictured below: 
205 | 
206 | \subsec{Mean squared error and other errors in low-dimensional spaces} 
207 | Suppose that $d=1$ and the situation is thus low-dimensional In this case, use of the mean-squared error is ok (as well as other distance metrics discussed). Going forward in lecture, we will primarily focus on discussing one-dimensional scenarios.\\
208 | 
209 | \subsec{Bias-variance tradeoff} 
210 | Just as we evaluated the bias-variance tradeoff in regression problems, we can calculate the bias-variance tradeoff with expectation of the integrated mean square error risk over the randomness of $X_1, ..., X_n$ as
211 | \al{
212 | 	\Exp\left[\int \left(f(x) - \hat{f}(x)\right)^2dx\right] &=  \int\left(\Exp\left[ (f(x) - \hat{f}(x))^2\right]\right)dx.  \nonumber
213 | }
214 | Because, for a random variable $Z$, $\Exp\left[Z^2\right] = \left(\Exp\left[Z\right]\right)^2 + \Var(Z)$, then we can decompose the bias-variance tradeoff as:
215 | \al{
216 | 	\Exp\left[\left(f(x) - \hat{f}(x)\right)^2\right] &= \left(\Exp\left[f(x) - \hat{f}(x)\right]\right)^2 + \Var\left[f(x) - \hat{f}(x)\right] \\ \nonumber
217 | 	&= \left(f(x) - \Exp\left[\hat{f}(x)\right]\right)^2 + \Var\left[-\hat{f}(x)\right] \quad  \quad \text{ ($f(x)$ is a constant)} \\ \nonumber
218 | 	&= \left(\Exp\left[\hat{f}(x)\right] - f(x)\right)^2 + \Var\left[\hat{f}(x)\right]. \nonumber 
219 | }
220 | Thus, we can continue to evaluate $\Exp\left[\int \left(f(x) - \hat{f}(x)\right)^2dx\right]$ as 
221 | \al{
222 | 	\quad\Exp\left[\int \left(f(x) - \hat{f}(x)\right)^2dx\right] = \int \colorboxed{blue}{\left(\Exp\left[\hat{f}(x)\right] - f(x)\right)^2}dx + \int\colorboxed{red}{\Var\left[\hat{f}(x)\right]}dx.
223 | } 
224 | where the term in the blue box is the \textcolor{blue}{bias} term and the term in the red box is the \textcolor{red}{variance} (although sometimes each term including the integral is regarded as the bias and variance respectively). This clear distinction between bias and variance is a property of the integrated mean squared error loss.
225 | 
226 | \subsec{Histograms}
227 | The first algorithm that we will discuss is the histogram algorithm, which is an analog of regressograms. Recalling from previous lectures, we remember that the process of solving a regressogram problem involves 
228 | \begin{enumerate}
229 | 	\item binning the input domain
230 | 	\item fitting constant density functions across each bin
231 | \end{enumerate}
232 | These two steps are also used in the histogram algorithm. 
233 | 
234 | If we assume $X \in \left[0,1\right]$, then we can create bins $B_1, ..., B_m$ within the input range, and we generate a constant function across each bin, $z_1, ..., z_m$. To set up the notation we will utilize, we first define 
235 | \begin{center}
236 | 	length of each bin $\triangleq h = \frac{1}{m}$.
237 | \end{center}
238 | Furthermore, let $Y_i$ equal the number of observations (data points) in each bin $B_i$. Then, we define $\hat{p}$ as
239 | \begin{center}
240 | 	$\hat{p}_i = \frac{Y_i}{n} = $ the fraction of data points in bin $B_i$.
241 | \end{center} 
242 | The value of $z_i \propto \hat{p}_i$, but we need to normalize our $\hat{p}_i$ in order to achieve a proper density function.
243 | 
244 | In order to form a proper density from $z_1, ..., z_n$, we require that 
245 | \al{
246 | 	\int \hat{f}(x)dx &= \sum\limits_{i=1}^m \int_{B_i}\hat{f}(x)dx
247 | 	=  \sum\limits_{i=1}^m h \cdot z_i = 1.
248 | }
249 | Suppose that $z_i = c \cdot \hat{p}_i$. Then, using the property that $\sum\limits_{i=1}^m\hat{p}_i = 1$, we see that
250 | \al{
251 | 	\int \hat{f}(x)dx &= h \cdot c \sum\limits_{i=1}^m\hat{p}_i = 1 \;\; \implies \;\; c = \frac{1}{h \cdot \sum\limits_{i=1}^m\hat{p}_i} = \frac{1}{h} \;\; \implies \;\; z_i = \frac{\hat{p}_i}{h},
252 | }
253 | which tells us that each $z_i$ is computed as the fraction of the points in the bin $B_i$ normalized by the size of the bin. 
254 | More succinctly, we can write
255 | \al{
256 | 	\hat{f}(x)dx &= \sum\limits_{j=1}^n z_j \mathbbm{1}(x \in B_j) = \sum\limits_{j=1}^n \frac{\hat{p}_j}{h} \mathbbm{1}(x \in B_j).
257 | }
258 | 
259 | 
260 | 
261 | \subsec{Bias-variance of histogram}
262 | In the case of the histogram algorithm, we can explicitly compute the bias and variance. First, for use in our calculation of the bias and variance for $x \in B_j$, we can evaluate the expectation of the estimator on $x$ as
263 | \al{
264 | 	\Exp\left[\hat{f}(x)\right] &=  \Exp\left[\frac{\hat{p}_j}{h}\right] \nonumber \\ \nonumber
265 | 	& = \frac{1}{h} \cdot \Exp \left[p_j\right] \\ \nonumber
266 | 	&= \frac{1}{h}\Pr\left[X \in B_j \right] %\quad \quad \quad \text{($\Exp \left[p_j\right]$ is the expected fraction of points in $B_j$)} 
267 | 	\\ \nonumber
268 | 	&= \frac{1}{h} \cdot \int_{B_j}f(u)du \\ \nonumber
269 | 	&= \frac{p_j}{h}.
270 | }
271 | with $p_j \triangleq \Pr\left[X \in B_j \right]$ defined as the probability of a random sample being in bin $B_j$.
272 | \subsubsection{Bias}
273 | Thus, we can evaluate the bias as
274 | \al{
275 | 	\text{Bias} &= \left( f(x) - \Exp\left[\hat{f}(x)\right] \right)^2 = \left( f(x) - \frac{p_j}{h} \right)^2. 
276 | }
277 | When $h$ is infinitesimally small, each bin becomes a very very small window. Knowing that $\int_{B_j}f(u)du$ can thus be approximated as $h \cdot f(x)$ for any $x \in B_j$ allows us to evaluate
278 | \al{
279 | 	\frac{p_j}{h} &= \frac{1}{h} \int_{B_j}f(u)du \approx \frac{1}{h} \cdot h \cdot f(x) = f(x),
280 | }
281 | and thus the bias goes to $0$ as $h \to 0$.
282 | \subsubsection{Variance}
283 | When evaluating the variance of the estimator for $x \in B_j$,
284 | \al{
285 | 	\Var\left(\hat{f}(x)\right) = \frac{1}{h^2}\Var\left(\hat{p}_j\right) \\ \nonumber
286 | }
287 | We can note that, for the number of points that fall into a given bin $B_j$ as $n\hat{p}_j$,
288 | 
289 | \al{
290 | 	n\hat{p}_j &= Y_j \sim \text{Binomial}(n,p_j) \\ \nonumber
291 | 	&= \sum\limits_{j=1}^n \mathbbm{1}(X_i \in B_j),
292 | }
293 | where $\mathbbm{1}(X_i \in B_j)$ follows the Bernoulli distribution $B(p_j)$. \\ \\
294 | Thus, we can calculate the variance of $n\hat{p}_j$ as
295 | \al{
296 | 	\Var(n\hat{p}_j)
297 | 	&= \sum\limits_{i}^n \Var(\mathbbm{1}(X_i \in B_j)) = n \cdot p_j(1-p_j),
298 | }
299 | and we can therefore evaluate $\Var(\hat{p}_j)$ as
300 | \al{
301 | 	\Var(\hat{p}_j) = \frac{1}{n^2}\Var(n\hat{p}_j) = \frac{p_j(1-p_j)}{n},
302 | }
303 | allowing us to evaluate $\Var(\hat{f}(x))$ as
304 | \al{
305 | 	\Var(\hat{f}(x)) = \frac{1}{h^2}\Var(\hat{p}_j) = \frac{1}{h^2 \cdot n}p_j(1-p_j).
306 | }
307 | By analyzing the above result, we see that when $h \to 0$, then $\Var(\hat{f}(x)) \to \infty$, and when $n \to \infty$, then $\Var(\hat{f}(x)) \to 0$. (This is consistent with the results we saw for regression problems).
308 | 
309 | \begin{theorem} 
310 | 	Suppose $f'$ is absolutely continuous and that $\int f'(u)^2 du < \infty$. Then, for the histogram estimator $\hat{f}$, 
311 | 	\al{
312 | 		R(\hat{f}, f) = \colorboxed{blue}{\frac{h^2}{12} \int (f'(u))^2du} + \colorboxed{red}{\frac{1}{nh}} + \mathcal{O}(h^2) + \mathcal{O}(\frac{1}{n}),
313 | 	}
314 | 	where the term in the blue box is the \textcolor{blue}{bias} term and the term in the red box is the \textcolor{red}{variance}. (The bias depends upon the Lipschizes of $f$, meaning that $f$ must be smooth.)
315 | \end{theorem}
316 | 
317 | \subsec{Finding the optimal $h^*$}
318 | The best value for $h$ is the minimizer of $R(\hat{f}, f)$ over $h$, ignoring higher order terms. Using the results from Theorem 7.5, we see that
319 | \al{
320 | 	h^* &= \underset{h}{\argmin} \left[\frac{h^2}{12} \int (f'(u))^2du + \frac{1}{nh} \right] \\ \nonumber
321 | 	&= \frac{1}{n^{\sfrac{1}{3}}} \cdot \left(\frac{6}{\int (f'(u))^2du}\right) ^{\sfrac{1}{3}},
322 | }
323 | which, most importantly, informs us that the rate that $h \propto \frac{1}{n^{\sfrac{1}{3}}}$. 
324 | Plugging in this choice of $h^*$, we get 
325 | \al{
326 | 	R(\hat{f}, f) \sim \frac{c}{n^{\sfrac{2}{3}}}.
327 | }
328 | which shows that the convergence rate of the error as $n \to \infty$.
329 | 
330 | 
331 | \subsec{Proof sketch of Theorem 7.5}
332 | When working through linear regression problems in previous lectures, we never derived these theorems. However, we can prove Theorem 7.5 as shown in this proof sketch.  We have shown that $\hat{f}(x) = \frac{\hat{p}_j}{h}$ if $x \in B_j$. 
333 | 
334 | We also saw that we can evaluate the expectation of the estimator $\hat{f}(x)$ at a point  $x \in B_j$ as 
335 | \al{
336 | 	\Exp\left[\hat{f}(x)\right] = \frac{p_j}{h} = \frac{1}{h} \cdot \int_{B_j} f(u)du.
337 | }
338 | Before, we had roughly approximated $f(u) \approx f(x)$. However, we can more explicitly obtain an expression for $f(u)$ using a first order Taylor expansion:
339 | \al{
340 | 	f(u) = f(x) + (u-x)f'(x) + \mathcal{O}(h^2),
341 | }
342 | since $\lvert u-x \rvert \leq h$ we have that the higher-order terms scale as $\mathcal{O}(h^2)$.
343 | 
344 | Therefore, we can further simplify our calculation of $\Exp\left[\hat{f}(x)\right] = \frac{1}{h} \int_{B_j} f(u)du$ by evaluating
345 | \al{
346 | 	\int_{B_j} f(u)du &= \int_{B_j} \left(f(x) + (u-x)f'(x) + \mathcal{O}(h^2)\right)dn \nonumber \\ \nonumber
347 | 	&= h \cdot f(x) + f'(x)\int(u-x)du + \mathcal{O}(h^2) \cdot h \\ \nonumber
348 | 	&= h \cdot f(x) + f'(x) \cdot \mathcal{O}(h^2) + \mathcal{O}(h^3),
349 | } 
350 | given that $\lvert u-x \rvert \leq h$ and the size of $B_j$ is similarly bounded as $\leq h$.  Thus, we can evaluate the expectation of the estimator as
351 | \al{
352 | 	\Exp\left[\hat{f}(x)\right] = \frac{1}{h} \cdot \int_{B_j} f(u)du = f(x) + f'(x)\cdot \mathcal{O}(h) + \mathcal{O}(h^2). %\\ \nonumber
353 | }
354 | 
355 | \subsubsection{Bias}
356 | Given our previous calculation of $\Exp\left[\hat{f}(x)\right]$, we can evaluate the bias as
357 | \al{
358 | 	\left(f(x) - \Exp\left[\hat{f}(x)\right]\right)^2 &= \left(f'(x)\cdot \mathcal{O}(h) + \mathcal{O}(h^2)\right)^2 \\ \nonumber
359 | 	&= h^2\left(f'(x) + \mathcal{O}(h)\right)^2 \\ \nonumber
360 | 	&= \mathcal{O}(h^2)f'(x)^2 +  \mathcal{O}(h^3).\\ \nonumber
361 | }
362 | And thus, the integrated bias can be calculated as
363 | \al{
364 | 	\int \left(f(x) - \Exp\left[\hat{f}(x)\right]\right)^2dx = \mathcal{O}(h^2) \cdot \int f'(x)^2 dx +  \mathcal{O}(h^3). \\ \nonumber
365 | }
366 | 
367 | \subsubsection{Variance}
368 | Given our previous calculation of $\Exp\left[\hat{f}(x)\right]$, we can evaluate the integrated vairance as
369 | \al{
370 | 	\int \Var(\hat{f}(x))dx &= \sum\limits_{j=1}^m \int_{B_j}  \Var(\hat{f}(x))dx \\ \nonumber
371 | 	&= \sum\limits_{j=1}^m  \frac{p_j(1-p_j)}{h^2 \cdot n} \cdot h \\ \nonumber
372 | 	&\leq \sum\limits_{j=1}^m  \frac{p_j}{h^2 \cdot n} \cdot h \\ \nonumber
373 | 	&= \frac{1}{nh}\sum\limits_{j=1}^m  p_j \\ \nonumber
374 | 	&= \frac{1}{nh}.
375 | }
376 | 
377 | Notice that the variance does not depend on $f$.
378 | 
379 | 
380 | 
381 | 
382 | 
383 | 
384 | 
385 | 
386 | 
387 | 
388 | 
389 | 
390 | 
391 | 
392 | \newpage
393 | \begin{comment}
394 | \sec{Macros for frequently used notations}
395 | Please try to reuse the macros defined below to ensure consistency.
396 | \begin{itemize}
397 | \item $\Exp$, 
398 | \al{
399 | \E_{x\sim P}, \Exp_{x\sim P} 
400 | }
401 | \item $\Pr[X=1\vert Y=2]$
402 | \item 
403 | \al{
404 | \argmin_{x: x\ge 1}
405 | }
406 | \item 
407 | $\theta$, $\theta^\star$, $\thetaerm$, 
408 | \item 
409 | $\cX,\cY, \cH, \cF$
410 | \item $x\sp{1}, y\sp{k}$
411 | \item 
412 | $x\in \R^3, \bbZ$
413 | \item $\err(5\theta)$
414 | \item $O(\cdot)$, $\tilO(\cdot)$
415 | \item $\iid$
416 | \item $\norm{x}, \Norm{x^{2^3}}$, $\norm{x}_{2}$
417 | \item 
418 | \end{itemize}
419 | \begin{theorem}
420 | ..
421 | \end{theorem}
422 | \begin{lemma}
423 | ...
424 | \end{lemma}
425 | \end{comment}
426 | %\sec{}


--------------------------------------------------------------------------------