├── README.md
├── ml_cheat_sheet.pdf
├── ml_cheat_sheet.synctex.gz
├── ml_cheat_sheet.out
├── ml_cheat_sheet.toc
├── ml_cheat_sheet.aux
├── ml_cheat_sheet.log
└── ml_cheat_sheet.tex

/README.md:
--------------------------------------------------------------------------------
# ml_cheat_sheet
Machine Learning Cheat Sheet
--------------------------------------------------------------------------------
/ml_cheat_sheet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eternalmothra/ml_cheat_sheet/HEAD/ml_cheat_sheet.pdf
--------------------------------------------------------------------------------
/ml_cheat_sheet.synctex.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eternalmothra/ml_cheat_sheet/HEAD/ml_cheat_sheet.synctex.gz
--------------------------------------------------------------------------------
16 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\base\article.cls" 17 | Document Class: article 2014/09/29 v1.4h Standard LaTeX document class 18 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\base\size11.clo" 19 | File: size11.clo 2014/09/29 v1.4h Standard LaTeX file (size option) 20 | ) 21 | \c@part=\count79 22 | \c@section=\count80 23 | \c@subsection=\count81 24 | \c@subsubsection=\count82 25 | \c@paragraph=\count83 26 | \c@subparagraph=\count84 27 | \c@figure=\count85 28 | \c@table=\count86 29 | \abovecaptionskip=\skip41 30 | \belowcaptionskip=\skip42 31 | \bibindent=\dimen102 32 | ) 33 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\mathtools\mathtools.sty" 34 | Package: mathtools 2014/07/16 v1.15 mathematical typesetting tools 35 | 36 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\graphics\keyval.sty" 37 | Package: keyval 2014/10/28 v1.15 key=value parser (DPC) 38 | \KV@toks@=\toks14 39 | ) 40 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\tools\calc.sty" 41 | Package: calc 2014/10/28 v4.3 Infix arithmetic (KKT,FJ) 42 | \calc@Acount=\count87 43 | \calc@Bcount=\count88 44 | \calc@Adimen=\dimen103 45 | \calc@Bdimen=\dimen104 46 | \calc@Askip=\skip43 47 | \calc@Bskip=\skip44 48 | LaTeX Info: Redefining \setlength on input line 75. 49 | LaTeX Info: Redefining \addtolength on input line 76. 50 | \calc@Ccount=\count89 51 | \calc@Cskip=\skip45 52 | ) 53 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\mathtools\mhsetup.sty" 54 | Package: mhsetup 2010/01/21 v1.2a programming setup (MH) 55 | ) 56 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\amsmath\amsmath.sty" 57 | Package: amsmath 2013/01/14 v2.14 AMS math features 58 | \@mathmargin=\skip46 59 | 60 | For additional information on amsmath, use the `?' option. 61 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\amsmath\amstext.sty" 62 | Package: amstext 2000/06/29 v2.01 63 | 64 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\amsmath\amsgen.sty" 65 | File: amsgen.sty 1999/11/30 v2.0 66 | \@emptytoks=\toks15 67 | \ex@=\dimen105 68 | )) 69 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\amsmath\amsbsy.sty" 70 | Package: amsbsy 1999/11/29 v1.2d 71 | \pmbraise@=\dimen106 72 | ) 73 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\amsmath\amsopn.sty" 74 | Package: amsopn 1999/12/14 v2.01 operator names 75 | ) 76 | \inf@bad=\count90 77 | LaTeX Info: Redefining \frac on input line 210. 78 | \uproot@=\count91 79 | \leftroot@=\count92 80 | LaTeX Info: Redefining \overline on input line 306. 81 | \classnum@=\count93 82 | \DOTSCASE@=\count94 83 | LaTeX Info: Redefining \ldots on input line 378. 84 | LaTeX Info: Redefining \dots on input line 381. 85 | LaTeX Info: Redefining \cdots on input line 466. 86 | \Mathstrutbox@=\box26 87 | \strutbox@=\box27 88 | \big@size=\dimen107 89 | LaTeX Font Info: Redeclaring font encoding OML on input line 566. 90 | LaTeX Font Info: Redeclaring font encoding OMS on input line 567. 91 | \macc@depth=\count95 92 | \c@MaxMatrixCols=\count96 93 | \dotsspace@=\muskip10 94 | \c@parentequation=\count97 95 | \dspbrk@lvl=\count98 96 | \tag@help=\toks16 97 | \row@=\count99 98 | \column@=\count100 99 | \maxfields@=\count101 100 | \andhelp@=\toks17 101 | \eqnshift@=\dimen108 102 | \alignsep@=\dimen109 103 | \tagshift@=\dimen110 104 | \tagwidth@=\dimen111 105 | \totwidth@=\dimen112 106 | \lineht@=\dimen113 107 | \@envbody=\toks18 108 | \multlinegap=\skip47 109 | \multlinetaggap=\skip48 110 | \mathdisplay@stack=\toks19 111 | LaTeX Info: Redefining \[ on input line 2665. 112 | LaTeX Info: Redefining \] on input line 2666. 
113 | ) 114 | LaTeX Info: Thecontrolsequence`\['isalreadyrobust on input line 129. 115 | LaTeX Info: Thecontrolsequence`\]'isalreadyrobust on input line 129. 116 | \g_MT_multlinerow_int=\count102 117 | \l_MT_multwidth_dim=\dimen114 118 | \origjot=\skip49 119 | \l_MT_shortvdotswithinadjustabove_dim=\dimen115 120 | \l_MT_shortvdotswithinadjustbelow_dim=\dimen116 121 | \l_MT_above_intertext_sep=\dimen117 122 | \l_MT_below_intertext_sep=\dimen118 123 | \l_MT_above_shortintertext_sep=\dimen119 124 | \l_MT_below_shortintertext_sep=\dimen120 125 | ) 126 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\amsfonts\amssymb.sty" 127 | Package: amssymb 2013/01/14 v3.01 AMS font symbols 128 | 129 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\amsfonts\amsfonts.sty" 130 | Package: amsfonts 2013/01/14 v3.01 Basic AMSFonts support 131 | \symAMSa=\mathgroup4 132 | \symAMSb=\mathgroup5 133 | LaTeX Font Info: Overwriting math alphabet `\mathfrak' in version `bold' 134 | (Font) U/euf/m/n --> U/euf/b/n on input line 106. 135 | )) 136 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\fancyhdr\fancyhdr.sty" 137 | \fancy@headwidth=\skip50 138 | \f@ncyO@elh=\skip51 139 | \f@ncyO@erh=\skip52 140 | \f@ncyO@olh=\skip53 141 | \f@ncyO@orh=\skip54 142 | \f@ncyO@elf=\skip55 143 | \f@ncyO@erf=\skip56 144 | \f@ncyO@olf=\skip57 145 | \f@ncyO@orf=\skip58 146 | ) 147 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\listings\listings.sty" 148 | \lst@mode=\count103 149 | \lst@gtempboxa=\box28 150 | \lst@token=\toks20 151 | \lst@length=\count104 152 | \lst@currlwidth=\dimen121 153 | \lst@column=\count105 154 | \lst@pos=\count106 155 | \lst@lostspace=\dimen122 156 | \lst@width=\dimen123 157 | \lst@newlines=\count107 158 | \lst@lineno=\count108 159 | \lst@maxwidth=\dimen124 160 | 161 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\listings\lstmisc.sty" 162 | File: lstmisc.sty 2014/09/06 1.5e (Carsten Heinz) 163 | \c@lstnumber=\count109 164 | \lst@skipnumbers=\count110 165 | \lst@framebox=\box29 166 | ) 167 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\listings\listings.cfg" 168 | File: listings.cfg 2014/09/06 1.5e listings configuration 169 | )) 170 | Package: listings 2014/09/06 1.5e (Carsten Heinz) 171 | 172 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\algorithmicx\algpseudocode.sty" 173 | Package: algpseudocode 174 | 175 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\base\ifthen.sty" 176 | Package: ifthen 2014/09/29 v1.1c Standard LaTeX ifthen package (DPC) 177 | ) 178 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\algorithmicx\algorithmicx.sty" 179 | Package: algorithmicx 2005/04/27 v1.2 Algorithmicx 180 | 181 | Document Style algorithmicx 1.2 - a greatly improved `algorithmic' style 182 | \c@ALG@line=\count111 183 | \c@ALG@rem=\count112 184 | \c@ALG@nested=\count113 185 | \ALG@tlm=\skip59 186 | \ALG@thistlm=\skip60 187 | \c@ALG@Lnr=\count114 188 | \c@ALG@blocknr=\count115 189 | \c@ALG@storecount=\count116 190 | \c@ALG@tmpcounter=\count117 191 | \ALG@tmplength=\skip61 192 | ) 193 | Document Style - pseudocode environments for use with the `algorithmicx' style 194 | ) ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\graphics\color.sty" 195 | Package: color 2014/10/28 v1.1a Standard LaTeX Color (DPC) 196 | 197 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\00miktex\color.cfg" 198 | File: color.cfg 2007/01/18 v1.5 color configuration of teTeX/TeXLive 199 | ) 200 | Package color Info: Driver file: pdftex.def on input line 137. 
201 | 202 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\pdftex-def\pdftex.def" 203 | File: pdftex.def 2011/05/27 v0.06d Graphics/color for pdfTeX 204 | 205 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\generic\oberdiek\infwarerr.sty" 206 | Package: infwarerr 2010/04/08 v1.3 Providing info/warning/error messages (HO) 207 | ) 208 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\generic\oberdiek\ltxcmds.sty" 209 | Package: ltxcmds 2011/11/09 v1.22 LaTeX kernel commands for general use (HO) 210 | ) 211 | \Gread@gobject=\count118 212 | )) 213 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\hyperref\hyperref.sty" 214 | Package: hyperref 2012/11/06 v6.83m Hypertext links for LaTeX 215 | 216 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\generic\oberdiek\hobsub-hyperref.sty" 217 | Package: hobsub-hyperref 2012/04/25 v1.12 Bundle oberdiek, subset hyperref (HO) 218 | 219 | 220 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\generic\oberdiek\hobsub-generic.sty" 221 | Package: hobsub-generic 2012/04/25 v1.12 Bundle oberdiek, subset generic (HO) 222 | Package: hobsub 2012/04/25 v1.12 Construct package bundles (HO) 223 | Package hobsub Info: Skipping package `infwarerr' (already loaded). 224 | Package hobsub Info: Skipping package `ltxcmds' (already loaded). 225 | Package: ifluatex 2010/03/01 v1.3 Provides the ifluatex switch (HO) 226 | Package ifluatex Info: LuaTeX not detected. 227 | Package: ifvtex 2010/03/01 v1.5 Detect VTeX and its facilities (HO) 228 | Package ifvtex Info: VTeX not detected. 229 | Package: intcalc 2007/09/27 v1.1 Expandable calculations with integers (HO) 230 | Package: ifpdf 2011/01/30 v2.3 Provides the ifpdf switch (HO) 231 | Package ifpdf Info: pdfTeX in PDF mode is detected. 232 | Package: etexcmds 2011/02/16 v1.5 Avoid name clashes with e-TeX commands (HO) 233 | Package etexcmds Info: Could not find \expanded. 234 | (etexcmds) That can mean that you are not using pdfTeX 1.50 or 235 | (etexcmds) that some package has redefined \expanded. 236 | (etexcmds) In the latter case, load this package earlier. 237 | Package: kvsetkeys 2012/04/25 v1.16 Key value parser (HO) 238 | Package: kvdefinekeys 2011/04/07 v1.3 Define keys (HO) 239 | Package: pdftexcmds 2011/11/29 v0.20 Utility functions of pdfTeX for LuaTeX (HO 240 | ) 241 | Package pdftexcmds Info: LuaTeX not detected. 242 | Package pdftexcmds Info: \pdf@primitive is available. 243 | Package pdftexcmds Info: \pdf@ifprimitive is available. 244 | Package pdftexcmds Info: \pdfdraftmode found. 245 | Package: pdfescape 2011/11/25 v1.13 Implements pdfTeX's escape features (HO) 246 | Package: bigintcalc 2012/04/08 v1.3 Expandable calculations on big integers (HO 247 | ) 248 | Package: bitset 2011/01/30 v1.1 Handle bit-vector datatype (HO) 249 | Package: uniquecounter 2011/01/30 v1.2 Provide unlimited unique counter (HO) 250 | ) 251 | Package hobsub Info: Skipping package `hobsub' (already loaded). 252 | Package: letltxmacro 2010/09/02 v1.4 Let assignment for LaTeX macros (HO) 253 | Package: hopatch 2011/06/24 v1.1 Wrapper for package hooks (HO) 254 | Package: xcolor-patch 2011/01/30 xcolor patch 255 | Package: atveryend 2011/06/30 v1.8 Hooks at the very end of document (HO) 256 | Package atveryend Info: \enddocument detected (standard20110627). 
257 | Package: atbegshi 2011/10/05 v1.16 At begin shipout hook (HO) 258 | Package: refcount 2011/10/16 v3.4 Data extraction from label references (HO) 259 | Package: hycolor 2011/01/30 v1.7 Color options for hyperref/bookmark (HO) 260 | ) 261 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\generic\ifxetex\ifxetex.sty" 262 | Package: ifxetex 2010/09/12 v0.6 Provides ifxetex conditional 263 | ) 264 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\oberdiek\auxhook.sty" 265 | Package: auxhook 2011/03/04 v1.3 Hooks for auxiliary files (HO) 266 | ) 267 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\oberdiek\kvoptions.sty" 268 | Package: kvoptions 2011/06/30 v3.11 Key value format for package options (HO) 269 | ) 270 | \@linkdim=\dimen125 271 | \Hy@linkcounter=\count119 272 | \Hy@pagecounter=\count120 273 | 274 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\hyperref\pd1enc.def" 275 | File: pd1enc.def 2012/11/06 v6.83m Hyperref: PDFDocEncoding definition (HO) 276 | ) 277 | \Hy@SavedSpaceFactor=\count121 278 | 279 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\00miktex\hyperref.cfg" 280 | File: hyperref.cfg 2002/06/06 v1.2 hyperref configuration of TeXLive 281 | ) 282 | Package hyperref Info: Hyper figures OFF on input line 4443. 283 | Package hyperref Info: Link nesting OFF on input line 4448. 284 | Package hyperref Info: Hyper index ON on input line 4451. 285 | Package hyperref Info: Plain pages OFF on input line 4458. 286 | Package hyperref Info: Backreferencing OFF on input line 4463. 287 | Package hyperref Info: Implicit mode ON; LaTeX internals redefined. 288 | Package hyperref Info: Bookmarks ON on input line 4688. 289 | \c@Hy@tempcnt=\count122 290 | 291 | (C:\Users\Beck\AppData\Roaming\MiKTeX\2.9\tex\latex\url\url.sty 292 | \Urlmuskip=\muskip11 293 | Package: url 2013/09/16 ver 3.4 Verb mode for urls, etc. 294 | ) 295 | LaTeX Info: Redefining \url on input line 5041. 296 | \XeTeXLinkMargin=\dimen126 297 | \Fld@menulength=\count123 298 | \Field@Width=\dimen127 299 | \Fld@charsize=\dimen128 300 | Package hyperref Info: Hyper figures OFF on input line 6295. 301 | Package hyperref Info: Link nesting OFF on input line 6300. 302 | Package hyperref Info: Hyper index ON on input line 6303. 303 | Package hyperref Info: backreferencing OFF on input line 6310. 304 | Package hyperref Info: Link coloring OFF on input line 6315. 305 | Package hyperref Info: Link coloring with OCG OFF on input line 6320. 306 | Package hyperref Info: PDF/A mode OFF on input line 6325. 307 | LaTeX Info: Redefining \ref on input line 6365. 308 | LaTeX Info: Redefining \pageref on input line 6369. 309 | \Hy@abspage=\count124 310 | \c@Item=\count125 311 | \c@Hfootnote=\count126 312 | ) 313 | 314 | Package hyperref Message: Driver (autodetected): hpdftex. 315 | 316 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\hyperref\hpdftex.def" 317 | File: hpdftex.def 2012/11/06 v6.83m Hyperref driver for pdfTeX 318 | \Fld@listcount=\count127 319 | \c@bookmark@seq@number=\count128 320 | 321 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\oberdiek\rerunfilecheck.sty" 322 | Package: rerunfilecheck 2011/04/15 v1.7 Rerun checks for auxiliary files (HO) 323 | Package uniquecounter Info: New unique counter `rerunfilecheck' on input line 2 324 | 82. 325 | ) 326 | \Hy@SectionHShift=\skip62 327 | ) 328 | (C:\Users\Beck\Documents\GitHub\ml_cheat_sheet\ml_cheat_sheet.aux) 329 | LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 22. 330 | LaTeX Font Info: ... okay on input line 22. 
331 | LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 22. 332 | LaTeX Font Info: ... okay on input line 22. 333 | LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 22. 334 | LaTeX Font Info: ... okay on input line 22. 335 | LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 22. 336 | LaTeX Font Info: ... okay on input line 22. 337 | LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 22. 338 | LaTeX Font Info: ... okay on input line 22. 339 | LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 22. 340 | LaTeX Font Info: ... okay on input line 22. 341 | LaTeX Font Info: Checking defaults for PD1/pdf/m/n on input line 22. 342 | LaTeX Font Info: ... okay on input line 22. 343 | 344 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\graphics\graphicx.sty" 345 | Package: graphicx 2014/10/28 v1.0g Enhanced LaTeX Graphics (DPC,SPQR) 346 | 347 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\graphics\graphics.sty" 348 | Package: graphics 2014/10/28 v1.0p Standard LaTeX Graphics (DPC,SPQR) 349 | 350 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\graphics\trig.sty" 351 | Package: trig 1999/03/16 v1.09 sin cos tan (DPC) 352 | ) 353 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\00miktex\graphics.cfg" 354 | File: graphics.cfg 2007/01/18 v1.5 graphics configuration of teTeX/TeXLive 355 | ) 356 | Package graphics Info: Driver file: pdftex.def on input line 91. 357 | ) 358 | \Gin@req@height=\dimen129 359 | \Gin@req@width=\dimen130 360 | ) 361 | \c@lstlisting=\count129 362 | 363 | (C:\Users\Beck\AppData\Roaming\MiKTeX\2.9\tex\context\base\supp-pdf.mkii 364 | [Loading MPS to PDF converter (version 2006.09.02).] 365 | \scratchcounter=\count130 366 | \scratchdimen=\dimen131 367 | \scratchbox=\box30 368 | \nofMPsegments=\count131 369 | \nofMParguments=\count132 370 | \everyMPshowfont=\toks21 371 | \MPscratchCnt=\count133 372 | \MPscratchDim=\dimen132 373 | \MPnumerator=\count134 374 | \makeMPintoPDFobject=\count135 375 | \everyMPtoPDFconversion=\toks22 376 | ) 377 | \AtBeginShipoutBox=\box31 378 | Package hyperref Info: Link coloring OFF on input line 22. 379 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\hyperref\nameref.sty" 380 | Package: nameref 2012/10/27 v2.43 Cross-referencing by name of section 381 | 382 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\generic\oberdiek\gettitlestring.sty" 383 | Package: gettitlestring 2010/12/03 v1.4 Cleanup title references (HO) 384 | ) 385 | \c@section@level=\count136 386 | ) 387 | LaTeX Info: Redefining \ref on input line 22. 388 | LaTeX Info: Redefining \pageref on input line 22. 389 | LaTeX Info: Redefining \nameref on input line 22. 390 | 391 | (C:\Users\Beck\Documents\GitHub\ml_cheat_sheet\ml_cheat_sheet.out) 392 | (C:\Users\Beck\Documents\GitHub\ml_cheat_sheet\ml_cheat_sheet.out) 393 | \@outlinefile=\write3 394 | 395 | (C:\Users\Beck\Documents\GitHub\ml_cheat_sheet\ml_cheat_sheet.toc 396 | LaTeX Font Info: Try loading font information for U+msa on input line 4. 397 | 398 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\amsfonts\umsa.fd" 399 | File: umsa.fd 2013/01/14 v3.01 AMS symbols A 400 | ) 401 | LaTeX Font Info: Try loading font information for U+msb on input line 4. 402 | 403 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\amsfonts\umsb.fd" 404 | File: umsb.fd 2013/01/14 v3.01 AMS symbols B 405 | )) 406 | \tf@toc=\write4 407 | LaTeX Font Info: Try loading font information for OMS+cmr on input line 33. 
408 | 409 | ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\base\omscmr.fd" 410 | File: omscmr.fd 2014/09/29 v2.5h Standard LaTeX font definitions 411 | ) 412 | LaTeX Font Info: Font shape `OMS/cmr/m/n' in size <10.95> not available 413 | (Font) Font shape `OMS/cmsy/m/n' tried instead on input line 33. 414 | [1 415 | 416 | {C:/Users/Beck/AppData/Local/MiKTeX/2.9/pdftex/config/pdftex.map}] [2] [3] 417 | 418 | LaTeX Font Warning: Font shape `OT1/cmr/bx/sc' undefined 419 | (Font) using `OT1/cmr/bx/n' instead on input line 140. 420 | 421 | [4] [5] 422 | Underfull \hbox (badness 10000) in paragraph at lines 201--208 423 | 424 | [] 425 | 426 | [6] 427 | Underfull \hbox (badness 10000) in paragraph at lines 214--226 428 | 429 | [] 430 | 431 | [7] [8] 432 | Underfull \hbox (badness 10000) in paragraph at lines 292--293 433 | 434 | [] 435 | 436 | 437 | Underfull \hbox (badness 10000) in paragraph at lines 311--312 438 | 439 | [] 440 | 441 | [9] 442 | Underfull \hbox (badness 10000) in paragraph at lines 329--331 443 | 444 | [] 445 | 446 | 447 | Underfull \hbox (badness 10000) in paragraph at lines 340--341 448 | 449 | [] 450 | 451 | [10] 452 | Package atveryend Info: Empty hook `BeforeClearDocument' on input line 369. 453 | [11] 454 | Package atveryend Info: Empty hook `AfterLastShipout' on input line 369. 455 | (C:\Users\Beck\Documents\GitHub\ml_cheat_sheet\ml_cheat_sheet.aux) 456 | Package atveryend Info: Executing hook `AtVeryEndDocument' on input line 369. 457 | Package atveryend Info: Executing hook `AtEndAfterFileList' on input line 369. 458 | Package rerunfilecheck Info: File `ml_cheat_sheet.out' has not changed. 459 | (rerunfilecheck) Checksum: 92B379F5C192AF6500990B141251C9A4;2375. 460 | 461 | 462 | LaTeX Font Warning: Some font shapes were not available, defaults substituted. 463 | 464 | Package atveryend Info: Empty hook `AtVeryVeryEnd' on input line 369. 465 | ) 466 | Here is how much of TeX's memory you used: 467 | 8324 strings out of 493921 468 | 121536 string characters out of 3144880 469 | 212088 words of memory out of 3000000 470 | 11467 multiletter control sequences out of 15000+200000 471 | 13560 words of font info for 50 fonts, out of 3000000 for 9000 472 | 841 hyphenation exceptions out of 8191 473 | 33i,12n,43p,638b,457s stack positions out of 5000i,500n,10000p,200000b,50000s 474 | 475 | 490 | Output written on ml_cheat_sheet.pdf (11 pages, 254777 bytes). 491 | PDF statistics: 492 | 317 PDF objects out of 1000 (max. 8388607) 493 | 55 named destinations out of 1000 (max. 500000) 494 | 289 words of extra memory for PDF output out of 10000 (max. 
/ml_cheat_sheet.tex:
--------------------------------------------------------------------------------
% You should title the file with a .tex extension (hw1.tex, for example)
\documentclass[11pt]{article}

\usepackage{mathtools}
\usepackage{amssymb}
\usepackage{fancyhdr}
\usepackage{listings}
\usepackage{algpseudocode}
\usepackage{color} %May be necessary if you want to color links
\usepackage{hyperref}


\oddsidemargin0cm
\topmargin-2cm    %I recommend adding these three lines to increase the
\textwidth16.5cm  %amount of usable space on the page (and save trees)
\textheight23.5cm


\begin{document}
\tableofcontents

\medskip % Skip a "medium" amount of space
         % (latex determines what medium is)
         % Also try: \bigskip, \littleskip


\section{Some definitions}
\begin{itemize}
\item overfitting: Given $H$, $h$ overfits if $\exists h' \in H$ such that $h'$ has smaller error over all the instances even though $h$ has a smaller error over the training examples.
\end{itemize}

\section{Lazy vs Eager}
\begin{itemize}
\item k-NN, locally weighted regression, and case-based reasoning are lazy
\item \textsc{BACKPROP} and RBF are eager (why?); ID3 is eager
\item Lazy algorithms may use the query instance $x_q$ when deciding how to generalize (the target can be represented as a collection of local functions). Eager methods have already committed to what they think is the global function.
\end{itemize}

\section{Decision Trees}

\subsection{ID3 Algorithm}
\begin{itemize}
\item Constructs trees top-down. Greedy algorithm. Hypothesis space of ID3: the set of decision trees. Complete space, maintains only a single hypothesis. Uses all training examples at each step (reduced sensitivity to individual errors).
\begin{itemize}
\item A $\leftarrow$ best attribute
\item assign A as decision attribute for Node
\item for each value of A, create a descendant of Node
\item sort training examples to leaves
\item if examples perfectly classified, stop
\item else iterate over leaves
\end{itemize}
\item $Entropy(S) = \sum_{i=1}^{c} -p_i \, lg(p_i)$ ($p_i$ is the proportion of $S$ belonging to class $i$; the log base can also vary--what would cause us to do that?)
\item $Gain(S,A) = Entropy(S) - \sum_{v \in values(A)} \frac{|S_v|}{|S|} Entropy(S_v)$ (a numeric sketch of these two formulas follows this list)
\begin{itemize}
\item $S_v$: subset of $S$ for which attribute $A$ has value $v$
\end{itemize}
\end{itemize}
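To make the $Entropy$ and $Gain$ formulas above concrete, here is a minimal Python sketch; the function names and the toy weather data are my own, not from any particular library. ID3 would split on whichever attribute scores the highest gain.

\begin{lstlisting}[language=Python]
from collections import Counter
from math import log2

def entropy(labels):
    """Entropy(S) = sum_i -p_i * lg(p_i) over the classes present in S."""
    n = len(labels)
    return -sum((c / n) * log2(c / n) for c in Counter(labels).values())

def information_gain(examples, labels, attribute):
    """Gain(S, A) = Entropy(S) - sum_v |S_v|/|S| * Entropy(S_v)."""
    n = len(labels)
    gain = entropy(labels)
    for v in set(x[attribute] for x in examples):
        S_v = [y for x, y in zip(examples, labels) if x[attribute] == v]
        gain -= (len(S_v) / n) * entropy(S_v)
    return gain

# Toy data: ID3 picks the attribute with the highest information gain.
X = [{"outlook": "sunny", "windy": True},
     {"outlook": "sunny", "windy": False},
     {"outlook": "rain",  "windy": True},
     {"outlook": "rain",  "windy": False}]
y = ["no", "no", "no", "yes"]
print(information_gain(X, y, "outlook"), information_gain(X, y, "windy"))
\end{lstlisting}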

\subsection{Inductive Bias of ID3}
\begin{itemize}
\item prefers shorter trees
\item prefers trees that place high information gain attributes close to the root
\end{itemize}

\subsection{Pruning}
\begin{itemize}
\item Reduced-error pruning (?)
\item Rule post-pruning (?)
\begin{itemize}
\item grow the tree
\item convert the tree into an equivalent set of rules
\item prune (generalize) each rule by removing preconditions whose removal improves its estimated accuracy
\item sort the pruned rules by estimated accuracy. Consider them in this sequence when classifying subsequent instances.
\end{itemize}
\end{itemize}

\subsection{Adapting Decision Trees to Regression(?)}
\begin{itemize}
\item splitting criterion: variance
\item leaves: average, or a local linear fit
\end{itemize}

\section{Regression and Classification}
\begin{itemize}
\item Least-squared error: the objective consists of adjusting the parameters of a model function to best fit a data set. A simple data set consists of $n$ points (data pairs) $(x_i,y_i)$, $i = 1, ..., n$, where $x_i$ is an independent variable and $y_i$ is a dependent variable whose value is found by observation. The model function has the form $f(x,\boldsymbol \beta)$, where the $m$ adjustable parameters are held in the vector $\boldsymbol \beta$. The goal is to find the parameter values for the model which ``best'' fit the data. The least squares method finds its optimum when the sum $S$ of squared residuals

$S=\sum_{i=1}^{n}{r_i}^2$
is a minimum. A residual is defined as the difference between the actual value of the dependent variable and the value predicted by the model:

$r_i=y_i-f(x_i,\boldsymbol \beta)$
An example of a model is that of the straight line in two dimensions. Denoting the intercept as $\beta_0$ and the slope as $\beta_1$, the model function is given by $f(x,\boldsymbol \beta)=\beta_0+\beta_1 x$. (A small numerical sketch follows this list.)
\end{itemize}
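A minimal numerical sketch of the straight-line least-squares fit described above, using NumPy's linear least-squares solver; the toy data and variable names are my own.

\begin{lstlisting}[language=Python]
# Fit f(x, beta) = beta_0 + beta_1 * x by minimizing S = sum_i r_i^2.
import numpy as np

x = np.array([0.0, 1.0, 2.0, 3.0, 4.0])
y = np.array([1.1, 1.9, 3.2, 3.9, 5.1])

A = np.column_stack([np.ones_like(x), x])     # design matrix: rows [1, x_i]
beta, *_ = np.linalg.lstsq(A, y, rcond=None)  # minimizes ||y - A beta||^2

r = y - A @ beta                              # residuals r_i = y_i - f(x_i, beta)
S = np.sum(r ** 2)                            # the sum of squared residuals
print("beta_0, beta_1 =", beta, " S =", S)
\end{lstlisting}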

\section{Neural Networks}
\subsection{Perceptrons}
$$o(x_1...x_n) =
\begin{cases}
1, & \text{if } w_0+w_1x_1+...+w_nx_n>0,\\
0, & \text{otherwise}.
\end{cases}
$$
where $w_0,...,w_n$ are real-valued weights. Note that $-w_0$ is a threshold that the weighted combination of the other inputs must surpass for the perceptron to output 1. Alternatively: $o(\vec{x}) = sgn(\vec{w}\vec{x})$. $H =\{\vec{w} | \vec{w} \in \mathbb{R}^{n+1} \}$.

\subsection{Perceptron Training Rule vs Delta Rule}
\begin{itemize}
\item Perceptron training rule: begin with random weights, apply the perceptron to each training example, and update the weights whenever it misclassifies. Iterates through the training examples repeatedly until it classifies all of them correctly.
\begin{itemize}
\item $w_i \leftarrow w_i + \Delta w_i$
\item $\Delta w_i=\eta (t-o) x_i$, $t$: target output for current training example. $o$: output generated for current training example. $\eta$: learning rate.
\end{itemize}
\item To converge, the perceptron training rule needs the data to be linearly separable (the decision rule for the hyperplane is $\vec{w}\vec{x} >0$) and $\eta$ to be sufficiently small.
\item Delta rule uses \textit{gradient descent}.
\begin{itemize}
\item (?) task of training a linear unit (1st stage of a perceptron without the threshold): $o(\vec{x}) = \vec{w}\vec{x}$
\item training error: $E(\vec{w})=\frac{1}{2} \sum_{d \in D} (t_d - o_d)^2$, where $D$: training examples, $t_d$: target output for training example $d$, and $o_d$: output of the linear unit for training example $d$.
\item Gradient descent finds the global minimum of $E$ by initializing the weights, then repeatedly modifying them until it hits the global min. Modification: alter the weights in the direction that gives steepest descent. $\nabla E(\vec{w})= [\frac{\partial E}{\partial w_0},...,\frac{\partial E}{\partial w_n}]$
\item Training rule for gradient descent: $w_i \leftarrow w_i + \Delta w_i$\\
$\Delta \vec{w} = -\eta \nabla E(\vec{w})$
\item Training rule can also be written in its component form: $w_i \leftarrow w_i+\Delta w_i$\\ $\Delta w_i = -\eta \frac{\partial E}{\partial w_i}$
\item Efficient way of finding the gradient: $\frac{\partial E}{\partial w_i} = \sum_{d \in D} (t_d-o_d)(-x_{id})$, where $x_{id}$ (?) represents the single input component $x_i$ for training example $d$.
\item Rewrite: $\Delta w_i = \eta \sum_{d\in D} (t_d - o_d) (x_{id})$ (true gradient descent)
\item Problems: slow; possibly multiple local minima in the error surface (?-I thought the error function was smooth, and would always find the global minimum. Example why not? For a single linear unit, $E$ is in fact convex with one global minimum; multiple local minima only appear for multi-layer networks.)
\item (?) Stochastic gradient descent: $\Delta w_i = \eta (t-o)x_i$ (known as the delta rule). Error rule: $E_d(\vec{w}) = \frac{1}{2} (t_d - o_d)^2$ (?-relationship to the other gradient descent? Why don't we need to separate it by $x_{id}$ anymore? Is this a vector?)
\item Stochastic versus true gradient descent (compared in the sketch after this list):
\begin{itemize}
\item true: error summed over all examples before updating the weights. stochastic: weights updated upon examining each training example
\item summing over multiple examples requires more computation per weight update step. But since the true gradient is used, a larger step size is possible
\item Stochastic can sometimes avoid local minima because it uses $\nabla E_d(\vec{w})$ rather than $\nabla E(\vec{w})$
\end{itemize}
\end{itemize}
\item The cost function for a neural network is non-convex, so it may have multiple minima. Which minimum you find with gradient descent depends on the initialization.
\end{itemize}
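A minimal sketch contrasting the true-gradient (batch) update $\Delta w_i = \eta \sum_{d} (t_d - o_d) x_{id}$ with the stochastic delta rule $\Delta w_i = \eta (t-o) x_i$ for a linear unit; the toy data, step size, and iteration counts are made up for illustration.

\begin{lstlisting}[language=Python]
# Train a linear unit o(x) = w . x two ways, per the updates above.
import numpy as np

rng = np.random.default_rng(0)
X = np.column_stack([np.ones(50), rng.normal(size=50)])   # x_0 = 1 acts as the bias input
t = 2.0 + 3.0 * X[:, 1] + rng.normal(scale=0.1, size=50)  # noisy linear target
eta = 0.01

# True (batch) gradient descent: sum the error over all of D, then update once.
w = np.zeros(2)
for _ in range(500):
    o = X @ w
    w += eta * X.T @ (t - o)                  # Delta w_i = eta * sum_d (t_d - o_d) x_id

# Stochastic gradient descent (delta rule): update after every single example.
w_sgd = np.zeros(2)
for _ in range(50):
    for x_d, t_d in zip(X, t):
        w_sgd += eta * (t_d - x_d @ w_sgd) * x_d   # Delta w_i = eta (t - o) x_i

print("batch:", w, " stochastic:", w_sgd)     # both approach [2, 3]
\end{lstlisting}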

\subsection{Threshold Unit}
Unit for multilayer networks. Want a network that can represent highly nonlinear functions. Need a unit whose output is a nonlinear, but also differentiable, function of its inputs. $o = \sigma (\vec{w} \vec{x})$ where $\sigma(y) = \frac{1}{1+e^{-y}}$
\subsection{\textsc{BACKPROP}}
$$E(\vec{w}) = \frac12 \sum_{d\in D} \sum_{k \in outputs}{(t_{kd} - o_{kd})^2}$$ where outputs: set of output units in the network, $t_{kd}$ target, $o_{kd}$ output associated with the $k^{th}$ output unit and training example $d$. (?)\\
{\color{red}Algorithm BACKPROP}
\begin{itemize}
\item until termination condition is met:
\item for i = 1 to m (m is the number of training examples)
\begin{itemize}
\item set $a^{(1)} = x^{(i)}$ ($i^{th}$ training example)
\item Perform forward propagation by computing $a^{(l)}$ for $l = 2,...,L$ ($L$ is the total number of layers): $a^{(l)}= \sigma (w^{(l-1)}a^{(l-1)}) $ = output of the $l^{th}$ layer.
\item Using $y^{(i)}$ compute $\delta^{(L)} = a^{(L)} - y^{(i)}$ ($y^{(i)}$ is the target for the $i^{th}$ training example)
\item Then calculate (??) $\delta^{(L-1)}$ down to $\delta^{(2)}$ ($\delta^{(l)}$ is the ``error'' of layer $l$): $$\delta^{(l)} = (w^{(l)})^T\delta^{(l+1)} .* \sigma'(w^{(l-1)} a^{(l-1)})$$
\item update $w^{(l)} = w^{(l)} + \Delta w^{(l)}$ ($w^{(l)}$ represents the weights of layer $l$) where $$\Delta w^{(l)} = \eta \delta^{(l)}.*x^{(l)}$$
\end{itemize}
\end{itemize}

\subsection{Momentum}
$$\Delta w^{(l)}_n= \eta \delta^{(l)}.*x^{(l)} +\alpha \Delta w^{(l)}_{n-1} $$\\ where $n$ is the iteration ($\alpha$ is the momentum: a fraction of the previous weight update is carried over)
\begin{itemize}
\item $E_d(\vec{w}) = \frac12 \sum_{k\in outputs}{(t_k - o_k)^2}$ error on training example $d$
\item How to derive the \textsc{BACKPROP} rule??
\item \textsc{BACKPROP} for multi-layer networks may converge only to a local minimum (because the error surface for multi-layer networks may contain many different minima).
\item Alternative Error Functions?
\item Alternative Error Minimization Procedures
\end{itemize}

\subparagraph{Recurrent Networks}
What do I need to know about recurrent networks?

\subsection{Radial Basis Functions}
\begin{itemize}
\item $\hat{f}(x) = w_0+ \sum_{u=1}^k{w_u Kern_u(d(x_u, x))}$
\item The equation can be thought of as a 2-layer network. The first layer computes the $Kern_u$, the second layer computes a linear combination of these first-layer values.
\item The kernel is defined such that $d(x_u, x) \uparrow \implies Kern_u \downarrow$
\item RBF gives a global approximation to the target function, represented by a linear combination of many local kernel functions (a smooth linear combination).
\item Faster to train than \textsc{BACKPROP} because the input and output layers are trained separately.
\item RBF is eager: it represents the global function as a linear combination of multiple local kernel functions. The local approximations an RBF network creates are not specifically targeted to the query.
\item A type of ANN constructed from spatially localized kernel functions. Sort of the `link' between k-NN and ANN?
\end{itemize}


\section{Instance Based Learning}
\subsection{k-NN}
\begin{itemize}
\item discrete: $$\hat{f} (x_q) =argmax_{v \in V} \sum_{i=1}^k{\delta(v, f(x_i))}$$ where $\delta(a,b) =1$ if $a=b$ and $0$ otherwise.
\item continuous (for a new value, $x_q$): $$\hat{f} (x_q) = \frac{\sum_{i=1}^{k}{f(x_i)}}{k}$$
\item distance-weighted: $w_i = \frac{1}{d(x_q, x_i)^2}$. If $x_q = x_i$, assign $\hat{f} (x_q) = f(x_i)$ (if more than one such $x_i$, take a majority vote). See the sketch after this list.
\item real-valued distance-weighted: $$\hat{f} (x_q) = \frac{\sum_{i=1}^{k}{w_i f(x_i)}}{\sum_{i=1}^{k}{w_i}}$$
\item Inductive bias of k-NN: assumption that nearby points are most similar
\item k-NN is sensitive to having many irrelevant attributes (`curse of dimensionality'). Can deal with it by `stretching the axes', i.e.\ adding a weight to each attribute; attributes can even be dropped entirely by setting their weight to 0.
\end{itemize}
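A minimal sketch of the distance-weighted form above; the function name and toy data are my own. Note the special case when the query coincides with a stored instance.

\begin{lstlisting}[language=Python]
# Distance-weighted k-NN regression: f_hat(x_q) = sum w_i f(x_i) / sum w_i,
# with w_i = 1 / d(x_q, x_i)^2 over the k nearest stored instances.
import numpy as np

def knn_predict(X, y, x_q, k=3):
    d = np.linalg.norm(X - x_q, axis=1)
    nearest = np.argsort(d)[:k]
    if np.any(d[nearest] == 0.0):            # x_q equals a stored instance
        return y[nearest][d[nearest] == 0.0].mean()
    w = 1.0 / d[nearest] ** 2
    return np.sum(w * y[nearest]) / np.sum(w)

X = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [2.0, 2.0]])
y = np.array([0.0, 1.0, 1.0, 4.0])
print(knn_predict(X, y, np.array([0.9, 0.1])))
\end{lstlisting}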

\subsection{Locally Weighted Linear Regression}
\begin{itemize}
\item $f$ approximated near $x_q$ using $\hat{f}(\vec{x}) = \vec{w} \cdot \vec{x}$ (is this appropriate notation?)
\item Error function using a kernel: $E(x_q) = \frac12 \sum_{x\in K}{(f(x) - \hat{f}(x))^2 Kern(d(x_q, x))}$ where $K$ is the set of the $k$ closest $x$ to $x_q$.
\end{itemize}

\section{Support Vector Machines}
Maximal margin hyperplanes: if the data are linearly separable, then $\exists (\vec{w}, b)$ such that $\vec{w}^T\vec{x_i} + b \geq 1$ $\forall \vec{x_i} \in P$ and $\vec{w}^T\vec{x_i} + b \leq -1$ $\forall \vec{x_i} \in N$ ($N$, $P$ are the two classes). Want to minimize $\vec{w}^T\vec{w}$ subject to the constraints of linear separability.\\ Or, maximize the margin $\frac2{|w|}$ while $y_i(\vec{w}^T\vec{x_i}+b) \geq 1$ $\forall i$. Note $y_i \in \{+1, -1\}$. Equivalently, minimize $\frac12 |w|^2$. This is a quadratic programming problem.\\
Dual form: $W(\alpha) = \sum_{i} \alpha_i - \frac12 \sum_{i,j} \alpha_i \alpha_j y_i y_j x_i^T x_j$. $w = \sum_i \alpha_i x_i y_i$. $\alpha_i$ mostly 0 $\implies$ only a few of the $x_i$'s (the support vectors) matter.
\subsection{Kernel Induced Feature Spaces}
Map to a higher dimensional \textit{feature space} and construct a separating hyperplane there. $X \rightarrow H$ is $\vec{x} \rightarrow \phi(\vec{x}).$\\
Decision function is $f(\vec{x}) = sgn (\phi(\vec{x})^T w^*+b^*)$ ($^*$ means optimal weight and bias)\\
Kernel function: $K(\vec{x}, \vec{z}) =\phi(\vec{x})^T\phi(\vec{z})$. If $K$ exists, we don't even need to know what $\phi$ is.\\
Mercer's condition: \\
What if the data are not linearly separable? (slack variables?)\\
Lagrangian?\\
Mercer's Theorem? \\

\subsection{Relationship between SVMs and Boosting}
$H_{trial} (x) = sgn\left(\frac{\sum_i{\alpha_i h_i(x)}}{\sum_i \alpha_i}\right)$. As we use more and more weak learners, the error stays the same, but the confidence goes up. This equates to having a big margin (big margins tend to avoid overfitting).


\section{Boosting}
The original boosting problem asks whether a set of weak learners can be combined to produce a learner with arbitrarily high accuracy. A weak learner is a learner whose performance (at classification or regression) is only slightly better than random guessing.\\
AdaBoost: trains multiple weak classifiers on the training data, then combines them into a single boosted classifier: a weighted sum of the weak classifiers, with weights dependent on each weak classifier's accuracy.\\
$N$ training examples: $x_i$, $y_i \in \{-1, +1\}$. Each example $i$ has an observation weight $w_i$ (how important example $i$ is for our current learning task).\\
Weighted error of a classifier $G$ on the sample: $err_S = \sum_{i=1}^N{w_i I(y_i \neq G(x_i))}$ \\
Normalizing the weights: $err = \frac{\sum_{i=1}^N{w_i I(y_i \neq G(x_i))}}{\sum_{i=1}^N w_i}$\\
In this way, our error metric is more sensitive to misclassified examples that have a greater importance weight. The denominator is only for normalization (we want an answer between 0 and 1).
Boosting: the weights are sequentially updated. Algorithm:\\
\begin{itemize}
\item initialize $w_i = \frac1N$
\item for $m =1$ to $M$:
\begin{itemize}
\item fit $G_m(x)$ using the weights $w_i$
\item compute $$err_m = \frac{\sum_{i=1}^N{w_i I(y_i \neq G_m(x_i))}}{\sum_{i=1}^N w_i}$$
\item $\alpha_m = log\left(\frac{1-err_m}{err_m}\right)$
\item $w_i \leftarrow w_i \cdot exp(\alpha_m I(y_i \neq G_m(x_i)))$ for $i = 1 \dots N$
\end{itemize}
\item $G(x) = sign \left[ \sum_{m=1}^M \alpha_m G_m(x)\right]$ In this way, classifiers that have poor accuracy (high error rate, low $\alpha_m$) are penalized in the final sum.
\end{itemize}

Question: where are these $G_m$'s coming from? Are they pre-set or are they created by the algorithm? (See the sketch below, which fits them on the reweighted data.)
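A minimal sketch of the loop above, using depth-one ``decision stumps'' as the weak learners $G_m$ (one common choice; the stump-fitting helper and the toy data are my own). The $\alpha_m$ and weight updates follow the formulas above.

\begin{lstlisting}[language=Python]
import numpy as np

def fit_stump(X, y, w):
    """Pick the (feature, threshold, sign) stump with the lowest weighted error."""
    best = None
    for j in range(X.shape[1]):
        for thr in np.unique(X[:, j]):
            for sign in (1, -1):
                pred = sign * np.where(X[:, j] <= thr, 1, -1)
                err = np.sum(w * (pred != y)) / np.sum(w)
                if best is None or err < best[0]:
                    best = (err, j, thr, sign)
    return best

def adaboost(X, y, M=5):
    N = len(y)
    w = np.full(N, 1.0 / N)
    models = []
    for _ in range(M):
        err, j, thr, sign = fit_stump(X, y, w)
        err = np.clip(err, 1e-12, 1 - 1e-12)       # guard the log
        alpha = np.log((1 - err) / err)            # alpha_m
        pred = sign * np.where(X[:, j] <= thr, 1, -1)
        w = w * np.exp(alpha * (pred != y))        # up-weight the mistakes
        models.append((alpha, j, thr, sign))
    return models

def boosted_predict(models, X):
    total = sum(a * s * np.where(X[:, j] <= t, 1, -1) for a, j, t, s in models)
    return np.sign(total)                          # G(x) = sign(sum_m alpha_m G_m(x))

X = np.array([[0.0], [1.0], [2.0], [3.0], [4.0], [5.0]])
y = np.array([1, 1, -1, -1, 1, 1])
models = adaboost(X, y, M=5)
print(boosted_predict(models, X))                  # recovers y on this toy set
\end{lstlisting}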

\section{Computational Learning Theory}
\subsection{Definitions}
\begin{itemize}
\item $H$--hypothesis space. $c \in H$--true hypothesis. $h \in H$--candidate hypothesis. $S$--training set (a sample of labeled instances).
\item Consistent learner: outputs a hypothesis such that $h(x) = c(x)$ $\forall x \in S$
\item Version space: $VS(S) = \{ h \in H : h \text{ consistent with } S \}$ (i.e., the hypotheses consistent with the training examples)
\item training error: fraction of training examples misclassified by $h$.
\item true error: fraction of examples that would be misclassified on a sample drawn from $D$ (the distribution over inputs). $error_D(h) = Pr_{x \sim D} [c(x) \neq h(x)]$
\item $C$ is PAC-learnable by learner $L$ using $H$ $\iff$ $L$ will output $h \in H$ (with probability $1-\delta$) such that $error_D(h) \leq \varepsilon$ in time and samples polynomial in $1/\varepsilon$, $1/ \delta$, and $|H|$.
\item $\varepsilon$-exhausted version space: $VS(S)$ is $\varepsilon$-exhausted iff $\forall h \in VS(S)$, $error_D(h) \leq \varepsilon$.
\end{itemize}

\subsection{Haussler Theorem}
Bounds the true error.\\
Let $error_D(h_i) > \varepsilon$ for $i = 1 \dots k$ (some $h_i$'s in $H$). How much data do we need to ``knock out'' all these hypotheses?\\
$Pr_{x \sim D} [h_i(x) = c(x)] \leq 1- \varepsilon$ (probability that $h_i$ matches the true concept on one example is low)\\
$Pr(h_i \text{ consistent with $c$ on $m$ examples}) \leq (1- \varepsilon)^m$ (examples drawn independently).\\
$Pr(\exists h_i \text{ consistent with $c$ on $m$ examples}) \leq k \cdot (1- \varepsilon)^m \leq |H| \cdot (1-\varepsilon)^m$ (union bound)\\
$-\varepsilon \geq ln(1- \varepsilon) \implies (1- \varepsilon)^m \leq exp(-\varepsilon m)$\\
Upper bound that the VS is not $\varepsilon$-exhausted after $m$ samples: $|H|\cdot exp(-\varepsilon m)$.\\
Want: $|H| \cdot exp(-\varepsilon m) \leq \delta$ (solve for $m$):\\
$m \geq \frac{1}{\varepsilon} \left( ln|H| + ln \frac{1}{\delta} \right)$ (a numeric example appears at the end of this section)

\subsection{Infinite Hypothesis Spaces}
\begin{itemize}
\item Examples: linear separators, ANNs, decision trees (continuous inputs)
\item $m \geq \frac{1}{\varepsilon} \left( 8\, VC(H)\, lg\frac{13}{\varepsilon} + 4 \, lg \frac{2}{\delta} \right)$
\item shatter: a set of instances $S$ is shattered by $H$ if for every possible dichotomy of $S$ there exists an $h \in H$ consistent with that dichotomy.
\item $VC(H)$ is the size of the largest finite subset of the instance space that can be shattered by $H$.
\item $C$ is PAC-learnable iff its VC dimension is finite.
\end{itemize}
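Plugging numbers into the Haussler bound $m \geq \frac{1}{\varepsilon}\left(ln|H| + ln\frac{1}{\delta}\right)$ above; the example hypothesis class (conjunctions over 10 boolean attributes, $|H| = 3^{10}$) is just an illustration I chose.

\begin{lstlisting}[language=Python]
# Sample-complexity bound m >= (1/eps) * (ln|H| + ln(1/delta)).
from math import ceil, log

def haussler_m(H_size, eps, delta):
    return ceil((log(H_size) + log(1.0 / delta)) / eps)

# Conjunctions over 10 boolean attributes: each attribute appears
# positively, negatively, or not at all, so |H| = 3^10.
print(haussler_m(3 ** 10, eps=0.1, delta=0.05))   # -> 140 examples suffice
\end{lstlisting}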


\section{Bayesian Learning}
\subsection{Equations and Definitions}
\begin{itemize}
\item $P(h)$: probability that a hypothesis $h$ holds
\item $P(D)$: probability that the training data $D$ will be observed
\item Bayes' Rule: $$P(h|D) = \frac{P(D|h)P(h)}{P(D)}$$
\item Find the most probable $h \in H$ given $D$: $$h_{map} = argmax_{h \in H} P(h|D) = argmax_{h \in H} P(D|h)P(h)$$
\item if every $h\in H$ is a priori equally probable: $$h_{ml} = argmax_{h \in H} P(D|h)$$
\end{itemize}

\subparagraph{BRUTE FORCE MAP learning algorithm}
Output $h_{map}$.

Let's assume:
\begin{itemize}
\item $D$ is noise-free
\item Target function $c \in H$
\item all $h$ (a priori) are equally likely
\end{itemize}
Then $P(h) = \frac1{|H|}$\\
$$P(D|h) =
\begin{cases}
1, & \text{if } d_i =h(x_i) \ \forall d_i \in D,\\
0, & \text{otherwise}.
\end{cases}
$$
$$P(D) = \frac{|VS_{H,D}|}{|H|}$$ $VS_{H,D}$ is the set of hypotheses in $H$ that are consistent with $D$ (and $|VS_{H,D}|$ its size). A consistent learner outputs an $h$ with zero error over the training examples.\\
Therefore $$P(h|D) = \begin{cases}
\frac1{|VS_{H,D}|}, & \text{if $h$ consistent with $D$}\\
0, & \text{otherwise}.
\end{cases}
$$
Every consistent hypothesis is a MAP hypothesis (with these assumptions)!

\subsection{ML and Least-Squared Error}
Under certain assumptions, any learner that minimizes the squared error between the outputs of hypothesis $h$ and the training data will output an ML hypothesis. Why? If the training values are the true target values corrupted by i.i.d.\ zero-mean Gaussian noise, then maximizing the log-likelihood $\sum_d \ln p(d|h)$ reduces, after dropping terms that don't depend on $h$, to minimizing $\sum_d (t_d - h(x_d))^2$. So the ML hypothesis is the one that minimizes the sum of squared errors over the training data.

\subsection{Bayes Optimal Classifier}
$$P(v_j |D) = \sum_{h_i \in H}{P(v_j|h_i) P(h_i|D)}$$ (probability that the correct classification is $v_j$)\\
$$v_{map} = argmax_{v_j \in V} P(v_j|D)$$

\subsection{Bayesian Belief Networks}
\subparagraph{Naive Bayes} Classify given attributes: $v_{map} = argmax_{v_j \in V} P(v_j|a_1,...,a_n)$. Rewrite using Bayes' rule and use the naive assumption that all $a_i$ are conditionally independent given $v_j$: $v_{NB} = argmax_{v_j \in V} P(v_j) \prod_{i}{P(a_i|v_j)}$ (a counting sketch appears at the end of this section).\\
Whenever the naive assumption is satisfied, $v_{NB}$ is the same as the MAP classification.
\subparagraph{EM Algorithm}
\begin{itemize}
\item arbitrary initial hypothesis
\item repeatedly calculate the expected values of the hidden variables
\item recalculate the ML hypothesis
\end{itemize}
This will converge to a locally ML hypothesis, along with estimated values for the hidden variables (why?)
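A minimal counting sketch of the $v_{NB}$ rule above; the function names, the add-one smoothing, and the toy data are my own choices.

\begin{lstlisting}[language=Python]
# Naive Bayes: v_NB = argmax_v P(v) * prod_i P(a_i | v), with probabilities
# estimated by counting (plus add-one smoothing so unseen values do not zero
# the product). Work in log space to avoid underflow.
from collections import Counter, defaultdict
import math

def train_nb(examples, labels):
    class_counts = Counter(labels)
    value_counts = defaultdict(Counter)      # (attribute, class) -> value counts
    for x, v in zip(examples, labels):
        for a, val in x.items():
            value_counts[(a, v)][val] += 1
    return class_counts, value_counts

def predict_nb(x, class_counts, value_counts):
    n = sum(class_counts.values())
    best_v, best_score = None, -math.inf
    for v, cv in class_counts.items():
        score = math.log(cv / n)             # log P(v)
        for a, val in x.items():
            c = value_counts[(a, v)]
            score += math.log((c[val] + 1) / (cv + len(c) + 1))  # smoothed log P(a_i|v)
        if score > best_score:
            best_v, best_score = v, score
    return best_v

X = [{"outlook": "sunny", "windy": "no"},  {"outlook": "sunny", "windy": "yes"},
     {"outlook": "rain",  "windy": "no"},  {"outlook": "rain",  "windy": "yes"}]
y = ["play", "stay", "play", "stay"]
cc, vc = train_nb(X, y)
print(predict_nb({"outlook": "rain", "windy": "yes"}, cc, vc))   # -> "stay"
\end{lstlisting}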

\section{Evaluating Hypotheses}

\section{Randomized Optimization}
\subsection{MIMIC}
Directly model the distribution.\\
Algorithm:\\
\begin{itemize}
\item generate samples from $P^{\theta_t}(x)$
\item set $\theta_{t+1}$ to the $n^{th}$ percentile of the sampled fitnesses
\item retain only those samples such that $f(x) \geq \theta_{t+1}$
\item estimate $P^{\theta_{t+1}}(x)$
\item repeat!
\end{itemize}

\subsection{Simulated Annealing}
Algorithm:\\
\begin{itemize}
\item for a finite number of iterations:
\item sample a new point $x_t$ in the neighborhood $N(x)$
\item jump to the new sample with probability $P(x, x_t, T)$
\item decrease $T$
\end{itemize}
$$P(x, x_t, T) =
\begin{cases}
1, & \text{if } f(x_t) \geq f(x),\\
exp(\frac{f(x_t)-f(x)}{T}), & \text{otherwise}.
\end{cases}
$$

\subparagraph{Genetic Algorithms}
WHAT IS?? Roughly: keep a population of candidate solutions; repeatedly select the fitter ones, recombine (cross over) pairs, and mutate, so the population improves over generations.

\section{Information Theory}
\subparagraph{Definitions}
We'll use shorthand: just write $x$ instead of $X = x$ for all the possible values that a random variable $X$ could take on. (Am I using the terms correctly?)
\begin{itemize}
\item Mutual Information: $I(X,Y) = H(X) - H(X|Y)$
\item Entropy: $H(A) = - \sum_{s \in A} P(s) lg(P(s))$
\item Joint entropy: $H(X, Y) = - \sum_{x \in X} \sum_{y \in Y} P(x,y) lg(P(x,y))$
\item Conditional Entropy: $H(Y|X) = -\sum_{x \in X} \sum_{y \in Y} P(x,y) lg(P(y|x))$
\item If $X$ is independent of $Y$: $H(Y|X) = H(Y)$ and $H(Y,X) = H(Y) +H(X)$
\item Kullback-Leibler divergence: $KL(p\,||\,q) = \sum_{x \in X} p(x)\, lg(\frac{p(x)}{q(x)})$ for two different distributions $p$, $q$ (no leading minus sign; equivalently $KL(p\,||\,q) = -\sum_{x \in X} p(x)\, lg(\frac{q(x)}{p(x)})$).
\end{itemize}

\end{document}
--------------------------------------------------------------------------------