├── .gitignore
├── CONTRIBUTORS.tex
├── Makefile
├── README.md
├── course-revision.tex
├── example-mnist
    ├── :w
    ├── body.0.1.our-first-learning-algorithm.tex
    ├── burp.png
    ├── cow.png
    ├── example-beaver.py
    ├── example-sand.py
    ├── example.py
    ├── mnist-trn-00.png
    ├── mnist-trn-01.png
    ├── mnist-trn-02.png
    ├── mnist-trn-03.png
    ├── mnist-trn-04.png
    ├── mnist-trn-05.png
    ├── mnist-trn-06.png
    ├── mnist-trn-07.png
    ├── mnist-trn-08.png
    ├── mnist-trn-09.png
    ├── mnist-trn-10.png
    ├── mnist-trn-11.png
    ├── mnist-trn-12.png
    ├── mnist-trn-13.png
    ├── mnist-trn-14.png
    ├── mnist-trn-15.png
    ├── mnist-trn-16.png
    ├── mnist-trn-17.png
    ├── mnist-trn-18.png
    ├── mnist-trn-19.png
    ├── mnist-trn-20.png
    ├── mnist-trn-21.png
    ├── mnist-trn-22.png
    ├── mnist-trn-23.png
    ├── mnist-trn-24.png
    ├── new-test-scat.png
    ├── new-test.png
    ├── new-train-scat.png
    ├── new-train.png
    ├── plotter.py
    ├── rec-01.tex
    ├── scat.png
    ├── test-features.png
    ├── test-plain.png
    ├── test-scat.png
    ├── test-weights-Hinge.png
    ├── test-weights-HingeReg.png
    ├── test-weights-Percep.png
    ├── test-weights.png
    ├── test.png
    ├── tmp.cpp
    ├── tmp.py
    ├── train-features-hinge-narrow-crop.png
    ├── train-features-hinge-narrow.png
    ├── train-features-hinge-wide-crop.png
    ├── train-features-hinge-wide.png
    ├── train-features-whinge.png
    ├── train-features.png
    ├── train-plain-cropped.png
    ├── train-plain.png
    ├── train-scat.png
    ├── train-weights-Hinge.png
    ├── train-weights-HingeReg.png
    ├── train-weights-Percep.png
    ├── train-weights.png
    ├── train.png
    └── yo.png
├── figures
    ├── MaxPool.png
    ├── backslash.png
    ├── beach.png
    ├── bias-trick.png
    ├── black-hole.png
    ├── butterfly.png
    ├── cake-1.png
    ├── cake-2.png
    ├── cake-3.png
    ├── cake-4.png
    ├── cake-5-col.png
    ├── cake-5.png
    ├── cake-6-col.png
    ├── cake-6.png
    ├── chernoff.png
    ├── conditionally-dependent.png
    ├── conv_elements.png
    ├── conv_operation.png
    ├── deep-labeled.png
    ├── deep.png
    ├── depshear.png
    ├── dimple.png
    ├── face-vase.png
    ├── feature-space-dependence.png
    ├── feature-space-depshear.png
    ├── feature-space-phenomena.png
    ├── feature-space-testtrain.png
    ├── gd-answers.png
    ├── gd-problem.png
    ├── gradients-curvy.png
    ├── hinge-beach.png
    ├── linear.png
    ├── margin.png
    ├── ml-dataflow.png
    ├── necker.png
    ├── priors
    │   ├── graphs.py
    │   ├── yo-1-1.png
    │   ├── yo-1-2.png
    │   ├── yo-1-inf.png
    │   ├── yo-2-1.png
    │   ├── yo-2-2.png
    │   ├── yo-2-inf.png
    │   ├── yo-inf-1.png
    │   ├── yo-inf-2.png
    │   └── yo-inf-inf.png
    ├── quad-reg.png
    ├── quadratic-features.png
    ├── rbf-kernel.png
    ├── regress-springs.png
    ├── regression-beach.png
    ├── satellite-2.png
    ├── satellite.png
    ├── seven-days.png
    ├── shallow.png
    ├── sim-trefoil-a-prob.png
    ├── sim-trefoil-a-sol.png
    ├── sim-trefoil-a-v.png
    ├── sim-trefoil-a.png
    ├── sim-trefoil-b-prob.png
    ├── sim-trefoil-b-sol.png
    ├── sim-trefoil-b-v.png
    ├── sim-trefoil-b.png
    ├── slash.png
    ├── smiley-transformed.png
    ├── smiley.png
    └── supervised.png
├── figures_desc
    ├── MaxPool.tex
    ├── README.md
    ├── conv_elements.tex
    └── conv_operation.tex
├── mlentary.pdf
├── mlentary.tex
├── sam.sty
├── tex-source-sandbox
    ├── body.1.-1.microcosm.tex
    ├── body.1.0.linear-approximation.tex
    ├── body.1.1.priors.tex
    ├── body.1.2.bovinity-and-walls.tex
    ├── body.3.0.bayesian-models.tex
    ├── body.3.1.examples-of-bayesian-models.tex
    ├── body.3.2.inference-algorithms-for-bayesian-models.tex
    ├── body.3.3.combining-with-deep-learning.tex
    ├── body.U.0.bayesian-models.tex
    ├── body.U.1.classical-inference.tex
    ├── body.U.2.inference-via-neural-nets.tex
    ├── body.U.3.deep-generative-architectures.tex
    ├── body.w.0.data-dependent-features.tex
    ├── body.w.2.priors-and-generalization.tex
    ├── body.w.3.model-selection.tex
    ├── body.z.2.bovinity.tex
    ├── body.z.3.priors.tex
    ├── body.z.3.see-around-walls.tex
    ├── sammy.sty
    ├── u4cl.tex
    ├── u4ha.tex
    ├── u4hb.tex
    ├── u4pa.tex
    ├── u4pb.tex
    └── unit4.tex
└── tex-source
    ├── body.0.0.what-is-learning.tex
    ├── body.0.1.our-first-learning-algorithm.tex
    ├── body.0.2.how-well-did-we-do.tex
    ├── body.0.3.how-can-we-do-better.tex
    ├── body.1.0.likelihoods.tex
    ├── body.1.1.gradients.tex
    ├── body.1.1.iterative-optimization.tex
    ├── body.1.2.priors.tex
    ├── body.1.3.model-selection.tex
    ├── body.2.0.featurization.tex
    ├── body.2.1.quantify-uncertainty.tex
    ├── body.2.2.iterative-optimization.tex
    ├── body.2.3.data-dependent-features.tex
    ├── body.3.0.shallow-learning.tex
    ├── body.3.1.deep-architecture.tex
    ├── body.3.2.convolution.tex
    ├── body.3.3.attention.tex
    ├── body.4.0.probabilistic-models.tex
    ├── body.4.1.expectation-maximization.tex
    ├── body.4.2.metropolis-hastings.tex
    ├── body.4.3.deep-generators.tex
    ├── body.5.0.reinforcement.tex
    ├── body.5.1.mdps.tex
    ├── body.5.2.q-learning.tex
    ├── body.5.3.beyond.tex
    ├── body.F.0.linear-algebra.tex
    ├── body.F.1.probability.tex
    ├── body.F.2.derivatives.tex
    └── body.F.3.programming.tex


/.gitignore:
--------------------------------------------------------------------------------
  1 | ## Core latex/pdflatex auxiliary files:
  2 | *.aux
  3 | *.lof
  4 | *.log
  5 | *.lot
  6 | *.fls
  7 | *.out
  8 | *.toc
  9 | *.fmt
 10 | *.fot
 11 | *.cb
 12 | *.cb2
 13 | .*.lb
 14 | 
 15 | ## Intermediate documents:
 16 | *.dvi
 17 | *.xdv
 18 | *-converted-to.*
 19 | # these rules might exclude image files for figures etc.
 20 | # *.ps
 21 | # *.eps
 22 | # *.pdf
 23 | 
 24 | ## Generated if empty string is given at "Please type another file name for output:"
 25 | .pdf
 26 | 
 27 | ## Bibliography auxiliary files (bibtex/biblatex/biber):
 28 | *.bbl
 29 | *.bcf
 30 | *.blg
 31 | *-blx.aux
 32 | *-blx.bib
 33 | *.run.xml
 34 | 
 35 | ## Build tool auxiliary files:
 36 | *.fdb_latexmk
 37 | *.synctex
 38 | *.synctex(busy)
 39 | *.synctex.gz
 40 | *.synctex.gz(busy)
 41 | *.pdfsync
 42 | 
 43 | ## Build tool directories for auxiliary files
 44 | # latexrun
 45 | latex.out/
 46 | 
 47 | ## Auxiliary and intermediate files from other packages:
 48 | # algorithms
 49 | *.alg
 50 | *.loa
 51 | 
 52 | # achemso
 53 | acs-*.bib
 54 | 
 55 | # amsthm
 56 | *.thm
 57 | 
 58 | # beamer
 59 | *.nav
 60 | *.pre
 61 | *.snm
 62 | *.vrb
 63 | 
 64 | # changes
 65 | *.soc
 66 | 
 67 | # comment
 68 | *.cut
 69 | 
 70 | # cprotect
 71 | *.cpt
 72 | 
 73 | # elsarticle (documentclass of Elsevier journals)
 74 | *.spl
 75 | 
 76 | # endnotes
 77 | *.ent
 78 | 
 79 | # fixme
 80 | *.lox
 81 | 
 82 | # feynmf/feynmp
 83 | *.mf
 84 | *.mp
 85 | *.t[1-9]
 86 | *.t[1-9][0-9]
 87 | *.tfm
 88 | 
 89 | #(r)(e)ledmac/(r)(e)ledpar
 90 | *.end
 91 | *.?end
 92 | *.[1-9]
 93 | *.[1-9][0-9]
 94 | *.[1-9][0-9][0-9]
 95 | *.[1-9]R
 96 | *.[1-9][0-9]R
 97 | *.[1-9][0-9][0-9]R
 98 | *.eledsec[1-9]
 99 | *.eledsec[1-9]R
100 | *.eledsec[1-9][0-9]
101 | *.eledsec[1-9][0-9]R
102 | *.eledsec[1-9][0-9][0-9]
103 | *.eledsec[1-9][0-9][0-9]R
104 | 
105 | # glossaries
106 | *.acn
107 | *.acr
108 | *.glg
109 | *.glo
110 | *.gls
111 | *.glsdefs
112 | *.lzo
113 | *.lzs
114 | *.slg
115 | *.slo
116 | *.sls
117 | 
118 | # uncomment this for glossaries-extra (will ignore makeindex's style files!)
119 | # *.ist
120 | 
121 | # gnuplot
122 | *.gnuplot
123 | *.table
124 | 
125 | # gnuplottex
126 | *-gnuplottex-*
127 | 
128 | # gregoriotex
129 | *.gaux
130 | *.glog
131 | *.gtex
132 | 
133 | # htlatex
134 | *.4ct
135 | *.4tc
136 | *.idv
137 | *.lg
138 | *.trc
139 | *.xref
140 | 
141 | # hyperref
142 | *.brf
143 | 
144 | # knitr
145 | *-concordance.tex
146 | # TODO Uncomment the next line if you use knitr and want to ignore its generated tikz files
147 | # *.tikz
148 | *-tikzDictionary
149 | 
150 | # listings
151 | *.lol
152 | 
153 | # luatexja-ruby
154 | *.ltjruby
155 | 
156 | # makeidx
157 | *.idx
158 | *.ilg
159 | *.ind
160 | 
161 | # minitoc
162 | *.maf
163 | *.mlf
164 | *.mlt
165 | *.mtc[0-9]*
166 | *.slf[0-9]*
167 | *.slt[0-9]*
168 | *.stc[0-9]*
169 | 
170 | # minted
171 | _minted*
172 | *.pyg
173 | 
174 | # morewrites
175 | *.mw
176 | 
177 | # newpax
178 | *.newpax
179 | 
180 | # nomencl
181 | *.nlg
182 | *.nlo
183 | *.nls
184 | 
185 | # pax
186 | *.pax
187 | 
188 | # pdfpcnotes
189 | *.pdfpc
190 | 
191 | # sagetex
192 | *.sagetex.sage
193 | *.sagetex.py
194 | *.sagetex.scmd
195 | 
196 | # scrwfile
197 | *.wrt
198 | 
199 | # svg
200 | svg-inkscape/
201 | 
202 | # sympy
203 | *.sout
204 | *.sympy
205 | sympy-plots-for-*.tex/
206 | 
207 | # pdfcomment
208 | *.upa
209 | *.upb
210 | 
211 | # pythontex
212 | *.pytxcode
213 | pythontex-files-*/
214 | 
215 | # tcolorbox
216 | *.listing
217 | 
218 | # thmtools
219 | *.loe
220 | 
221 | # TikZ & PGF
222 | *.dpth
223 | *.md5
224 | *.auxlock
225 | 
226 | # titletoc
227 | *.ptc
228 | 
229 | # todonotes
230 | *.tdo
231 | 
232 | # vhistory
233 | *.hst
234 | *.ver
235 | 
236 | # easy-todo
237 | *.lod
238 | 
239 | # xcolor
240 | *.xcp
241 | 
242 | # xmpincl
243 | *.xmpi
244 | 
245 | # xindy
246 | *.xdy
247 | 
248 | # xypic precompiled matrices and outlines
249 | *.xyc
250 | *.xyd
251 | 
252 | # endfloat
253 | *.ttt
254 | *.fff
255 | 
256 | # Latexian
257 | TSWLatexianTemp*
258 | 
259 | ## Editors:
260 | # WinEdt
261 | *.bak
262 | *.sav
263 | 
264 | # Texpad
265 | .texpadtmp
266 | 
267 | # LyX
268 | *.lyx~
269 | 
270 | # Kile
271 | *.backup
272 | 
273 | # gummi
274 | .*.swp
275 | 
276 | # KBibTeX
277 | *~[0-9]*
278 | 
279 | # TeXnicCenter
280 | *.tps
281 | 
282 | # auto folder when using emacs and auctex
283 | ./auto/*
284 | *.el
285 | 
286 | # expex forward references with \gathertags
287 | *-tags.tex
288 | 
289 | # standalone packages
290 | *.sta
291 | 
292 | # Makeindex log files
293 | *.lpz
294 | 
295 | # xwatermark package
296 | *.xwm
297 | 
298 | # REVTeX puts footnotes in the bibliography by default, unless the nofootinbib
299 | # option is specified. Footnotes are the stored in a file with suffix Notes.bib.
300 | # Uncomment the next line to have this generated file ignored.
301 | #*Notes.bib
302 | 


--------------------------------------------------------------------------------
/CONTRIBUTORS.tex:
--------------------------------------------------------------------------------
1 | \texttt{karene},
2 | \texttt{laurent},
3 | \texttt{sam},
4 | 
5 | \attnsam{append your name to the list above by editing `CONTRIBUTORS.tex`}
6 | 
7 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | ml:
 2 | 	pdflatex mlentary.tex
 3 | 	evince mlentary.pdf
 4 | 
 5 | clean:
 6 | 	rm *.aux
 7 | 	rm *.log
 8 | 	rm *.out
 9 | 
10 | ml-full:
11 | 	pdflatex mlentary.tex
12 | 	pdflatex mlentary.tex
13 | 	evince mlentary.pdf
14 | 


--------------------------------------------------------------------------------
/example-mnist/body.0.1.our-first-learning-algorithm.tex:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/body.0.1.our-first-learning-algorithm.tex


--------------------------------------------------------------------------------
/example-mnist/burp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/burp.png


--------------------------------------------------------------------------------
/example-mnist/cow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/cow.png


--------------------------------------------------------------------------------
/example-mnist/mnist-trn-00.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/mnist-trn-00.png


--------------------------------------------------------------------------------
/example-mnist/mnist-trn-01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/mnist-trn-01.png


--------------------------------------------------------------------------------
/example-mnist/mnist-trn-02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/mnist-trn-02.png


--------------------------------------------------------------------------------
/example-mnist/mnist-trn-03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/mnist-trn-03.png


--------------------------------------------------------------------------------
/example-mnist/mnist-trn-04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/mnist-trn-04.png


--------------------------------------------------------------------------------
/example-mnist/mnist-trn-05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/mnist-trn-05.png


--------------------------------------------------------------------------------
/example-mnist/mnist-trn-06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/mnist-trn-06.png


--------------------------------------------------------------------------------
/example-mnist/mnist-trn-07.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/mnist-trn-07.png


--------------------------------------------------------------------------------
/example-mnist/mnist-trn-08.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/mnist-trn-08.png


--------------------------------------------------------------------------------
/example-mnist/mnist-trn-09.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/mnist-trn-09.png


--------------------------------------------------------------------------------
/example-mnist/mnist-trn-10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/mnist-trn-10.png


--------------------------------------------------------------------------------
/example-mnist/mnist-trn-11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/mnist-trn-11.png


--------------------------------------------------------------------------------
/example-mnist/mnist-trn-12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/mnist-trn-12.png


--------------------------------------------------------------------------------
/example-mnist/mnist-trn-13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/mnist-trn-13.png


--------------------------------------------------------------------------------
/example-mnist/mnist-trn-14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/mnist-trn-14.png


--------------------------------------------------------------------------------
/example-mnist/mnist-trn-15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/mnist-trn-15.png


--------------------------------------------------------------------------------
/example-mnist/mnist-trn-16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/mnist-trn-16.png


--------------------------------------------------------------------------------
/example-mnist/mnist-trn-17.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/mnist-trn-17.png


--------------------------------------------------------------------------------
/example-mnist/mnist-trn-18.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/mnist-trn-18.png


--------------------------------------------------------------------------------
/example-mnist/mnist-trn-19.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/mnist-trn-19.png


--------------------------------------------------------------------------------
/example-mnist/mnist-trn-20.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/mnist-trn-20.png


--------------------------------------------------------------------------------
/example-mnist/mnist-trn-21.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/mnist-trn-21.png


--------------------------------------------------------------------------------
/example-mnist/mnist-trn-22.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/mnist-trn-22.png


--------------------------------------------------------------------------------
/example-mnist/mnist-trn-23.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/mnist-trn-23.png


--------------------------------------------------------------------------------
/example-mnist/mnist-trn-24.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/mnist-trn-24.png


--------------------------------------------------------------------------------
/example-mnist/new-test-scat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/new-test-scat.png


--------------------------------------------------------------------------------
/example-mnist/new-test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/new-test.png


--------------------------------------------------------------------------------
/example-mnist/new-train-scat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/new-train-scat.png


--------------------------------------------------------------------------------
/example-mnist/new-train.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/new-train.png


--------------------------------------------------------------------------------
/example-mnist/plotter.py:
--------------------------------------------------------------------------------
  1 | ''' author: samtenka
  2 |     change: 2023-02-20
  3 |     create: 2022-05-16
  4 |     descrp:
  5 |     to use:
  6 | '''
  7 | 
  8 | #-------  imports  ------------------------------------------------------------
  9 | 
 10 | from matplotlib import pyplot as plt
 11 | from pyrecord import Record
 12 | import numpy as np
 13 | import tqdm
 14 | 
 15 | #-------  colors  -------------------------------------------------------------
 16 | 
 17 | BLUE    = np.array([ .05,  .55,  .85]) # hues (colorblindness-friendly pair)
 18 | ORANGE  = np.array([ .95,  .65,  .05]) #
 19 | 
 20 | WHITE   = np.array([1.  , 1.  , 1.  ]) # shades
 21 | SMOKE   = np.array([ .9 ,  .9 ,  .9 ])
 22 | SLATE   = np.array([ .5 ,  .5 ,  .5 ])
 23 | SHADE   = np.array([ .1 ,  .1 ,  .1 ])
 24 | BLACK   = np.array([ .0 ,  .0 ,  .0 ])
 25 | 
 26 | def overlay(background, foreground, foreground_opacity=1.0):
 27 |     background += foreground_opacity * (foreground - background)
 28 | 
 29 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 30 | #~~~  Plot Class Basics  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 31 | 
 32 | #-------  example usage  ------------------------------------------------------
 33 | 
 34 | ''' We'll define a `Plot` class to support uses such as in the example below.
 35 | 
 36 |     for #PROGRAMMERS: I prefer real code to code written in a comment, since
 37 |     code-in-a-comment can become outdated without us noticing.  There are tools
 38 |     to extract unit tests from comments, but I don't use them.
 39 | '''
 40 | 
 41 | def small_example_to_illustrate_comment():
 42 |     (blank(data_height=480, data_width=480, margin=6)
 43 |         .add_gridlines().add_ticks().add_axes()
 44 |         .box_at(0.6, 0.8, color=BLUE)
 45 |         .scatter_points([(.314,.271), (.159,.828)], color=ORANGE)
 46 |         .save_to('hello.png')
 47 |     )
 48 | 
 49 | #-------  datatype   ----------------------------------------------------------
 50 | 
 51 | ''' A `Plot` instance keeps an image bitmap `pixels` (array of pixel
 52 |     intensities) and a coordinate system for that bitmap represented as
 53 |     functions `cell_from` and `coor_from` that convert between pixel
 54 |     (row,column) indices and abstract (y-coordinate,x-coordinate) pairs.  It
 55 |     also has a tuple `HWM` for height, width, and margin, measured in pixels.
 56 | 
 57 |     for #PROGRAMMERS: We write our class as "plain old data" manipulated by
 58 |     "functions we add externally", albeit sugared with "method call syntax".
 59 |     This style is not idiomatic for Python, but I prefer it because it allows
 60 |     us to arrange code as we would for C or Haskell.
 61 | '''
 62 | 
 63 | Plot = Record.create_type('Plot', 'pixels', 'cell_from', 'coor_from', 'HWM')
 64 | 
 65 | def bind_to_class(c, name):
 66 |     ''' Returns a decorator that binds the function in question as a method of
 67 |         the class `c` with methodname `name`.  For instance, if object `moo` is
 68 |         an instance of the class `Plot`, and if `c=Plot` and `name='save_to'`,
 69 |         then this decorator allows us to call functions (defined at top level
 70 |         scope) via `moo.save_to(...)`.
 71 |     '''
 72 |     def decorator(f):
 73 |         def wrapped(*args, **kwargs):
 74 |             return f(*args, **kwargs)
 75 |         setattr(c, name, wrapped)
 76 |         return wrapped
 77 |     return decorator
 78 | 
 79 | #-------  create and consume  -------------------------------------------------
 80 | 
 81 | def blank(data_height, data_width, margin, color=WHITE,
 82 |           ymin=0., ymax=1., xmin=0., xmax=1.           ):
 83 |     H,W,M = data_height, data_width, margin
 84 |     pixels = np.ones((H+2*M,
 85 |                       W+2*M,3)) * color
 86 |     cell_from = lambda y,x : (int(M+(H-1) * (1.-(y-ymin)/(ymax-ymin))),
 87 |                               int(M+(W-1) * (   (x-xmin)/(xmax-xmin))) )
 88 |     coor_from = lambda r,c : (ymin + (ymax-ymin) * (1.-(r-margin)/float(H-1)),
 89 |                               xmin + (xmax-xmin) * (   (c-margin)/float(W-1)) )
 90 |     return Plot(pixels, cell_from, coor_from, (H, W, M))
 91 | 
 92 | @bind_to_class(Plot, 'save_to')
 93 | def save_to(plot, file_name):
 94 |     plt.imsave(file_name, plot.pixels)
 95 |     return plot
 96 | 
 97 | #-------  more examples to illustrate all features  ---------------------------
 98 | 
 99 | def examples()
100 |     pp = (blank(data_height=480, data_width=480, margin=6)
101 |              .add_gridlines()
102 |              .add_ticks()
103 |              .add_axes()
104 |              .shade_heatmap(lambda weights: weights.dot(np.array([1.,-2.])))
105 |              .cross_at(0.4, 0.9)
106 |              .box_at(0.6, 0.8, color=ORANGE)
107 |              .box_at(0.9, 0.2, color=ORANGE)
108 |              .shade_hypothesis(lambda feats: feats.dot(np.array([10.,-20.])))
109 |              .shade_hypothesis(lambda feats: 5. + feats.dot(np.array([-10.,0.])))
110 |              .scatter_points([(.55,.55), (.65,.85), (.45,.55)], color=ORANGE)
111 |              .scatter_points([(.15,.55), (.35,.85), (.95,.55)], color=[BLUE,ORANGE,BLUE])
112 |              .save_to('yo.png')
113 |          )
114 | 
115 |     qq = (blank(480, 480, 6, ymin=-1.,ymax=+1.,xmin=-1.,xmax=+1.)
116 |              .add_gridlines(np.arange(-.8,1.,.2), np.arange(-.8,1.,.2), opacity=1.)
117 |              .shade_heatmap(lambda weights: (
118 |                  np.mean(weights**2,axis=2)
119 |                  +0.5+0.5*np.sin(weights.dot(np.array([10.,-20.])))
120 |                  +.25* (weights[:,:,0]-weights[:,:,1]**2)**2
121 |                  ))
122 |              .box_at(0.6, 0.8, color=ORANGE)
123 |              .box_at(0.9, 0.2, color=ORANGE)
124 |              .add_gridlines(np.arange(-.8,1.,.2), np.arange(-.8,1.,.2), opacity=.2)
125 |              .cross_at()
126 |              .save_to('cow.png')
127 |          )
128 | 
129 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
130 | #~~~  Plot Class: Draw Various Graphic Elements  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
131 | 
132 | #-------  axes and friends  ---------------------------------------------------
133 | 
134 | INNER_DECADES = np.arange(.1, 1., .1)
135 | 
136 | @bind_to_class(Plot, 'add_gridlines')
137 | def add_gridlines(plot, yticks=INNER_DECADES, xticks=INNER_DECADES,
138 |                   color=SMOKE, opacity=1.                          ):
139 |     _,_,M = plot.HWM
140 |     for y in yticks:
141 |         r,_ = plot.cell_from(y,0)
142 |         overlay(plot.pixels[r,M:-M], color, opacity)
143 |     for x in xticks:
144 |         _,c = plot.cell_from(0,x)
145 |         overlay(plot.pixels[M:-M,c], color, opacity)
146 |     return plot
147 | 
148 | @bind_to_class(Plot, 'add_ticks')
149 | def add_ticks(plot, yticks=INNER_DECADES, xticks=INNER_DECADES, yy=0., xx=0.,
150 |               color=SLATE                                                    ):
151 |     H,W,M = plot.HWM
152 |     rr,cc = plot.cell_from(yy,xx)
153 |     mask = np.linspace(1., 0., 3*M+1)[:,np.newaxis]
154 |     for y in yticks:
155 |         r,_ = plot.cell_from(y,0)
156 |         overlay(plot.pixels[r, cc-M:cc+2*M+1], color, mask      )
157 |     for x in xticks:
158 |         _,c = plot.cell_from(0,x)
159 |         overlay(plot.pixels[rr-2*M:rr+M+1, c], color, mask[::-1])
160 |     return plot
161 | 
162 | @bind_to_class(Plot, 'add_axes')
163 | def add_axes(plot, yy=0., xx=0., color=SLATE):
164 |     H,W,M = plot.HWM
165 |     rr,cc = plot.cell_from(yy,xx)
166 |     plot.pixels[rr   , M:M+W] = color
167 |     plot.pixels[M:M+H, cc   ] = color
168 |     return plot
169 | 
170 | #-------  special markers  ----------------------------------------------------
171 | 
172 | @bind_to_class(Plot, 'cross_at')
173 | def cross_at(plot, yy=0., xx=0., size=5, color=BLACK, opacity=1.):
174 |     r,c = plot.cell_from(yy,xx)
175 |     S = size
176 |     overlay(plot.pixels[r-S:r+S+1, c         ], color, opacity)
177 |     overlay(plot.pixels[r         , c-S:c+S+1], color, opacity)
178 |     return plot
179 | 
180 | @bind_to_class(Plot, 'box_at')
181 | def box_at(plot, yy=0., xx=0., size=5, color=BLACK, opacity=1.):
182 |     r,c = plot.cell_from(yy,xx)
183 |     S = size
184 |     overlay(plot.pixels[r-S       , c-S:c+S+1], color, .9 * opacity)
185 |     overlay(plot.pixels[    r+S   , c-S:c+S+1], color, .9 * opacity)
186 |     overlay(plot.pixels[r-S:r+S+1 , c-S      ], color, .9 * opacity)
187 |     overlay(plot.pixels[r-S:r+S+1 ,     c+S  ], color, .9 * opacity)
188 |     overlay(plot.pixels[r-S:r+S+1 , c-S:c+S+1], color, .3 * opacity)
189 |     return plot
190 | 
191 | #-------  display data  -------------------------------------------------------
192 | 
193 | @bind_to_class(Plot, 'scatter_points')
194 | def scatter_points(plot, coors, color=BLACK, opacity=1., rad=3.):
195 |     _,_,M = plot.HWM
196 |     color = color if type(color)==type([]) else [color for _ in coors]
197 |     for col,(y,x) in zip(color, coors):
198 |         r,c = plot.cell_from(y,x)
199 |         mask = np.mgrid[-M:M+1 , -M:M+1]
200 |         mask = rad**2 / (rad**2 + mask[0]**2 + mask[1]**2)
201 |         mask = np.minimum(1., opacity * mask**2)[:,:,np.newaxis]
202 |         overlay(plot.pixels[r-M:r+M+1, c-M:c+M+1], col, mask)
203 |     return plot
204 | 
205 | @bind_to_class(Plot, 'shade_hypothesis')
206 | def shade_hypothesis(plot, decfunc, color_pos=ORANGE, color_neg=BLUE,
207 |                      opacity=.10, sharpness=1.4):
208 |     # TODO: separate out (y,x)->(color,opacity) logic??
209 |     H,W,M = plot.HWM
210 |     for dy in [-.3,+.3]:
211 |         for dx in [-.3,+.3]:
212 |             cells = np.mgrid[M+dy:M+H+dy, M+dx:M+W+dx]
213 |             coors = plot.coor_from(cells[0], cells[1])
214 |             coors = np.moveaxis(coors, 0, -1)
215 |             dec = decfunc(coors)
216 |             mask = opacity * np.exp(-sharpness**2 * dec**2) [:,:,np.newaxis]
217 |             ## TODO: vectorize the following line
218 |             colors = np.array([[color_pos if dd<0 else color_neg for dd in d]
219 |                                for d in dec                                  ])
220 |             overlay(plot.pixels[M:-M,M:-M], colors, mask)
221 |     return plot
222 | 
223 | @bind_to_class(Plot, 'shade_heatmap')
224 | def shade_heatmap(plot, intensity, color=SHADE):
225 |     H,W,M = plot.HWM
226 |     cells = np.mgrid[M:M+H, M:M+W]
227 |     coors = plot.coor_from(cells[0], cells[1])
228 |     coors = np.moveaxis(coors, 0, -1)
229 |     mask  = np.maximum(0.,np.minimum(1., intensity(coors) ))
230 |     overlay(plot.pixels[M:-M,M:-M], color, mask[:,:,np.newaxis])
231 |     return plot
232 | 
233 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
234 | #~~~  Use the Plot Class for ML Plots  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
235 | 
236 | #-------  _  ------------------------------------------------------------------
237 | 
238 | def plot_feature_space(feature_vectors, labels, file_name,
239 |                        ymin=0., ymax=1., xmin=0., xmax=1.,
240 |                        weights=[], dec_func_maker=None,
241 |                        opacity=1.0,                       ):
242 |     plot = blank(480, 480, 6, ymin, ymax, xmin, xmax)
243 |     plot.add_gridlines(np.linspace(ymin,ymax,11)[1:-1],
244 |                        np.linspace(xmin,xmax,11)[1:-1] )
245 |     plot.add_ticks().add_axes()
246 | 
247 |     for w in weights:
248 |         decfunc = decfunc_maker(w)
249 |         plot.shade_hypothesis(decfunc)
250 |     plot.scatter_points(feature_vectors, opacity=opacity,
251 |                         color=[ORANGE if l==+1 else BLUE for l in labels])
252 | 
253 |     plot.save_to(file_name)
254 | 
255 | #-------  _  ------------------------------------------------------------------
256 | 
257 | #def plot_weight_space(error_by_param, file_name,
258 | #                      ymin=-10., ymax=+10., xmin=-10., xmax=+10.,
259 | #                      weight_color_pairs=[],                     ):
260 | #
261 | #    plot = blank(480, 480, 6, ymin, ymax, xmin, xmax)
262 | #    plot.add_gridlines(np.linspace(ymin,ymax,11)[1:-1],
263 | #                       np.linspace(xmin,xmax,11)[1:-1], opacity=1.)
264 | #
265 | #    # TODO: fill in this essence!  using kd tree voronoi??
266 | #    #plot.shade_heatmap(lambda ws:[]
267 | #
268 | #    for (y,x), col in weight_color_pairs:
269 | #        plot.box_at(y, x, color=col)
270 | #
271 | #    plot.add_gridlines(np.linspace(ymin,ymax,11)[1:-1],
272 | #                       np.linspace(xmin,xmax,11)[1:-1], opacity=1.)
273 | #    plot.save_to(file_name)
274 | 
275 | 
276 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
277 | #~~~  Render Digits  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
278 | 
279 | # TODO: decide where to put this
280 | #render_digit = lambda x : SMOKE * (1.0-np.repeat(x[:,:,np.newaxis], 3, axis=2))
281 | 
282 | #for i,idx in enumerate(train_idxs[:NB_DIGITS_RENDERED]):
283 | #    plt.imsave('mnist-trn-{:02d}.png'.format(i), render_digit(all_x[idx]))
284 | #    if not PRINT_TRAIN_FEATURES: continue
285 | #    print('train example {:02d} has darkness {:.2f} and height {:.2f}'.format(
286 | #        i, darkness(all_x[idx]), width(all_x[idx])
287 | #        ))
288 | 
289 | 
290 | 


--------------------------------------------------------------------------------
/example-mnist/rec-01.tex:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/rec-01.tex


--------------------------------------------------------------------------------
/example-mnist/scat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/scat.png


--------------------------------------------------------------------------------
/example-mnist/test-features.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/test-features.png


--------------------------------------------------------------------------------
/example-mnist/test-plain.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/test-plain.png


--------------------------------------------------------------------------------
/example-mnist/test-scat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/test-scat.png


--------------------------------------------------------------------------------
/example-mnist/test-weights-Hinge.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/test-weights-Hinge.png


--------------------------------------------------------------------------------
/example-mnist/test-weights-HingeReg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/test-weights-HingeReg.png


--------------------------------------------------------------------------------
/example-mnist/test-weights-Percep.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/test-weights-Percep.png


--------------------------------------------------------------------------------
/example-mnist/test-weights.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/test-weights.png


--------------------------------------------------------------------------------
/example-mnist/test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/test.png


--------------------------------------------------------------------------------
/example-mnist/tmp.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 |     // moooo   shshja
 3 | //
 4 | /*moo*/
 5 | 
 6 | moo // <- isnidafhiwehfjwe 
 7 | 
 8 | /* TODO HELP COW COW MOO TODO TODO WARNING  */ 
 9 | /* FILL IN FILLIN FUCK DAMN SHIT CUNT TODO HELP COW COW MOO TODO TODO WARNING  */ 
10 | /* */
11 | 


--------------------------------------------------------------------------------
/example-mnist/tmp.py:
--------------------------------------------------------------------------------
 1 | for i in range(686):
 2 |   if i*i<100: print(i)
 3 | 
 4 | #for each natural number i in the range 0 to 686 not
 5 | #including 686, flurp the number i.  To "flurp" a
 6 | #number n is to check whether n times n is (strictly)
 7 | #less than 100 and, if so, to display n onto the
 8 | #screen (no ink jet printers involved). 
 9 | 
10 | #...
11 | #email = str(accounting_data)
12 | #...
13 | #send_email_to('tom@blah', email)
14 | #...
15 | #...
16 | 


--------------------------------------------------------------------------------
/example-mnist/train-features-hinge-narrow-crop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/train-features-hinge-narrow-crop.png


--------------------------------------------------------------------------------
/example-mnist/train-features-hinge-narrow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/train-features-hinge-narrow.png


--------------------------------------------------------------------------------
/example-mnist/train-features-hinge-wide-crop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/train-features-hinge-wide-crop.png


--------------------------------------------------------------------------------
/example-mnist/train-features-hinge-wide.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/train-features-hinge-wide.png


--------------------------------------------------------------------------------
/example-mnist/train-features-whinge.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/train-features-whinge.png


--------------------------------------------------------------------------------
/example-mnist/train-features.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/train-features.png


--------------------------------------------------------------------------------
/example-mnist/train-plain-cropped.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/train-plain-cropped.png


--------------------------------------------------------------------------------
/example-mnist/train-plain.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/train-plain.png


--------------------------------------------------------------------------------
/example-mnist/train-scat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/train-scat.png


--------------------------------------------------------------------------------
/example-mnist/train-weights-Hinge.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/train-weights-Hinge.png


--------------------------------------------------------------------------------
/example-mnist/train-weights-HingeReg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/train-weights-HingeReg.png


--------------------------------------------------------------------------------
/example-mnist/train-weights-Percep.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/train-weights-Percep.png


--------------------------------------------------------------------------------
/example-mnist/train-weights.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/train-weights.png


--------------------------------------------------------------------------------
/example-mnist/train.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/train.png


--------------------------------------------------------------------------------
/example-mnist/yo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/example-mnist/yo.png


--------------------------------------------------------------------------------
/figures/MaxPool.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/MaxPool.png


--------------------------------------------------------------------------------
/figures/backslash.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/backslash.png


--------------------------------------------------------------------------------
/figures/beach.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/beach.png


--------------------------------------------------------------------------------
/figures/bias-trick.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/bias-trick.png


--------------------------------------------------------------------------------
/figures/black-hole.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/black-hole.png


--------------------------------------------------------------------------------
/figures/butterfly.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/butterfly.png


--------------------------------------------------------------------------------
/figures/cake-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/cake-1.png


--------------------------------------------------------------------------------
/figures/cake-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/cake-2.png


--------------------------------------------------------------------------------
/figures/cake-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/cake-3.png


--------------------------------------------------------------------------------
/figures/cake-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/cake-4.png


--------------------------------------------------------------------------------
/figures/cake-5-col.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/cake-5-col.png


--------------------------------------------------------------------------------
/figures/cake-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/cake-5.png


--------------------------------------------------------------------------------
/figures/cake-6-col.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/cake-6-col.png


--------------------------------------------------------------------------------
/figures/cake-6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/cake-6.png


--------------------------------------------------------------------------------
/figures/chernoff.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/chernoff.png


--------------------------------------------------------------------------------
/figures/conditionally-dependent.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/conditionally-dependent.png


--------------------------------------------------------------------------------
/figures/conv_elements.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/conv_elements.png


--------------------------------------------------------------------------------
/figures/conv_operation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/conv_operation.png


--------------------------------------------------------------------------------
/figures/deep-labeled.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/deep-labeled.png


--------------------------------------------------------------------------------
/figures/deep.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/deep.png


--------------------------------------------------------------------------------
/figures/depshear.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/depshear.png


--------------------------------------------------------------------------------
/figures/dimple.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/dimple.png


--------------------------------------------------------------------------------
/figures/face-vase.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/face-vase.png


--------------------------------------------------------------------------------
/figures/feature-space-dependence.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/feature-space-dependence.png


--------------------------------------------------------------------------------
/figures/feature-space-depshear.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/feature-space-depshear.png


--------------------------------------------------------------------------------
/figures/feature-space-phenomena.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/feature-space-phenomena.png


--------------------------------------------------------------------------------
/figures/feature-space-testtrain.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/feature-space-testtrain.png


--------------------------------------------------------------------------------
/figures/gd-answers.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/gd-answers.png


--------------------------------------------------------------------------------
/figures/gd-problem.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/gd-problem.png


--------------------------------------------------------------------------------
/figures/gradients-curvy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/gradients-curvy.png


--------------------------------------------------------------------------------
/figures/hinge-beach.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/hinge-beach.png


--------------------------------------------------------------------------------
/figures/linear.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/linear.png


--------------------------------------------------------------------------------
/figures/margin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/margin.png


--------------------------------------------------------------------------------
/figures/ml-dataflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/ml-dataflow.png


--------------------------------------------------------------------------------
/figures/necker.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/necker.png


--------------------------------------------------------------------------------
/figures/priors/graphs.py:
--------------------------------------------------------------------------------
  1 | ''' author: sam tenka
  2 |     change: 2022-07-28
  3 |     create: 2022-07-28
  4 |     descrp: Generate plots for the digit-classification example in the prologue
  5 |             pages of our 6.86x notes.  
  6 |     depend: keras
  7 |     jargon: we'll consistently use these abbreviations when naming variables:
  8 |                 dec_func    --- decision function
  9 |                 idx(s)      --- index/indices within list of all examples 
 10 |                 model       --- name of model ('linear', 'affine', etc)
 11 |                 nb_         --- number of (whatever follows the underscore)
 12 |                 side        --- sidelength of image, measured in pixels
 13 |                 x           --- photo of handwritten digit
 14 |                 y           --- digit-valued label
 15 |                 y_sign      --- {-1,+1}-valued label
 16 |                 z           --- feature vector
 17 |                 vert        --- to do with a graph's vertical axis
 18 |                 hori        --- to do with a graph's horizontal axis
 19 |     thanks: featurization idea inspired by abu-mostafa's book
 20 |     to use: Run `python3 example.py`.  On first run, expect a downloading
 21 |             progress bar to display and finish within 30 to 60 seconds; this
 22 |             is for downloading the MNIST dataset we'll use.
 23 | '''
 24 | 
 25 | #===============================================================================
 26 | #==  0. PREAMBLE  ==============================================================
 27 | #===============================================================================
 28 | 
 29 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 30 | #~~~~~~~~  0.0. universal constants  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 31 | 
 32 | #--------------  0.0.0. import modules  ----------------------------------------
 33 | 
 34 | from matplotlib import pyplot as plt                                            
 35 | import numpy as np                                                              
 36 | import tqdm
 37 | 
 38 | #--------------  0.2.1. colors  ------------------------------------------------
 39 | 
 40 | WHITE        = np.array([1.0 ,1.0 ,1.0 ])
 41 | SMOKE        = np.array([ .9 , .9 , .9 ])
 42 | SLATE        = np.array([ .5 , .5 , .5 ])
 43 | SHADE        = np.array([ .1 , .1 , .1 ])
 44 | BLACK        = np.array([ .0 , .0 , .0 ])
 45 | 
 46 | RED          = np.array([1.0 , .0 , .0 ]) #####
 47 | ORANGE       = np.array([ .75,0.25, .0 ]) #    
 48 | BROWN        = np.array([ .5 ,0.5 , .0 ]) ###    # i.e., dark YELLOW
 49 | OLIVE        = np.array([ .25,0.75, .0 ]) #    
 50 | GREEN        = np.array([ .0 ,1.0 , .0 ]) #####
 51 | AGAVE        = np.array([ .0 , .75, .25]) #    
 52 | CYAN         = np.array([ .0 , .5 , .5 ]) ###  
 53 | JUNIPER      = np.array([ .0 , .25, .75]) #    
 54 | BLUE         = np.array([ .0 , .0 ,1.0 ]) ##### 
 55 | INDIGO       = np.array([ .25, .0 , .75]) #    
 56 | MAGENTA      = np.array([ .5 , .0 , .5 ]) ###  
 57 | AMARANTH     = np.array([ .75, .0 , .25]) #    
 58 | 
 59 | RAINBOW = [
 60 |     RED     ,
 61 |     ORANGE  ,
 62 |     BROWN   ,
 63 |     OLIVE   ,
 64 |     GREEN   ,
 65 |     AGAVE   ,
 66 |     CYAN    ,
 67 |     JUNIPER ,
 68 |     BLUE    ,
 69 |     INDIGO  ,
 70 |     MAGENTA ,
 71 |     AMARANTH,
 72 | ]
 73 | 
 74 | 
 75 | def overlay_color(background, foreground, foreground_opacity=1.0):
 76 |     background += foreground_opacity * (foreground - background)
 77 | 
 78 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 79 | #~~~~~~~~  0.1. global parameters  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 80 | 
 81 | #--------------  0.1.1. learning parameters  -----------------------------------
 82 | 
 83 | np.random.seed(0)
 84 | 
 85 | def sample(dim,p):
 86 |     return {
 87 |         1: lambda d: np.random.laplace(size=d),
 88 |         2: lambda d: np.random.normal(size=d),
 89 |     'inf': lambda d: np.random.uniform(size=d, low=-1.0, high=+1.0),
 90 |     }[p](dim) 
 91 | 
 92 | def norm(v,p):
 93 |     return { 
 94 |         1: np.mean(np.abs(v)), 
 95 |         2: np.sqrt(np.mean(np.square(v))), 
 96 |     'inf': np.max(np.abs(v)),
 97 |     }[p]
 98 | 
 99 | def sample_scaled(dim,p,q,scale=1.0):
100 |     s = sample(dim, p)
101 |     y = sample(dim, q)
102 |     return scale * s * norm(y,q)/norm(s,p) 
103 | 
104 | dim = 30
105 | halfheight=200
106 | vert = 15 
107 | hori = 6
108 | for p in [1, 2, 'inf']:
109 |     for q in [1, 2, 'inf']:
110 |         scatter = np.ones(( halfheight*2, dim*hori, 4), dtype=np.float32) 
111 |         scatter[:,:,3]  = 0
112 |         for i in [0,+2]:
113 |             scatter[halfheight-int(vert*i),:, 3]  = 1
114 |             scatter[halfheight-int(vert*i),:,:3]  = 0
115 |         samples = np.array([sample_scaled(dim, p, q) for _ in range(10001)]) 
116 | 
117 |         stddev = np.sqrt(np.mean(np.square(np.ndarray.flatten(samples)))) 
118 |         samples /= np.sqrt(3)*stddev
119 | 
120 |         stddev = np.std(samples, axis=1)
121 |         samples = samples[np.argsort(stddev)]
122 |         samples = np.array([samples[0],samples[5000],samples[10000]])
123 |         samples = np.abs(samples)
124 |         samples = np.array([sorted(s) for s in samples])
125 | 
126 |         for color, s in tqdm.tqdm(zip([CYAN, BLUE, MAGENTA], samples)):
127 |             for i,(a,b) in enumerate(zip(s, s[1:])):
128 |                 x,y =  hori*(i+0.5), halfheight- (a)*2*vert  
129 |                 X,Y =  hori*(i+1.5), halfheight- (b)*2*vert  
130 |                 l = np.sqrt((X-x)**2+(Y-y)**2)
131 |                 for t in np.linspace(0.0,1.0,int(10+1.5*l)):
132 |                     for dx in np.linspace(-1.2,+1.2,9):
133 |                         for dy in np.linspace(-1.2,+1.2,9):
134 |                             opacity = 1.0/(1.0 + 10*(dx*dx + dy*dy))
135 |                             xx = int(x+(X-x)*t+dx)
136 |                             yy = int(y+(Y-y)*t+dy)
137 |                             dd = (2.0*max(0.0, 0.50-abs(abs(a)+(abs(b)-abs(a))*t)))**0.5 
138 |                             if 0<=yy<halfheight*2 and 0<=xx<dim*hori:
139 |                                 cc = color
140 |                                 scatter[yy][xx][:3] = (1.0-opacity)*scatter[yy][xx][:3] + opacity*(cc + (0.1+0.9*dd)*(WHITE-cc))
141 |                                 scatter[yy][xx][ 3] = 1.0 - (1.0-opacity) * (1.0 - scatter[yy][xx][ 3])
142 |         plt.imsave('yo-{}-{}.png'.format(p,q), scatter)
143 | 
144 | ##--------------  3.0.0. render some training digits  ---------------------------
145 | #
146 | #def new_plot(data_h=PLT_SIDE, data_w=PLT_SIDE, margin=MARG,
147 | #             nb_vert_axis_ticks=10, nb_hori_axis_ticks=10): 
148 | #    # white canvas 
149 | #    scatter = np.ones((data_h+2*margin,
150 | #                       data_w +2*margin,3), dtype=np.float32) 
151 | #
152 | #    # grid lines
153 | #    for a in range(nb_vert_axis_ticks): 
154 | #        s = int(data_h * float(a)/nb_vert_axis_ticks)
155 | #        scatter[margin+(data_h-1-s),margin:margin+data_w] = SMOKE
156 | #    for a in range(nb_hori_axis_ticks): 
157 | #        s = int(data_w * float(a)/nb_hori_axis_ticks)
158 | #        scatter[margin:margin+data_h,margin+s]            = SMOKE
159 | #    
160 | #    # tick marks
161 | #    for a in range(nb_vert_axis_ticks): 
162 | #        s = int(data_h * float(a)/nb_vert_axis_ticks)
163 | #        for i in range(nb_vert_axis_ticks)[::-1]:
164 | #            color = SLATE + 0.04*i*WHITE
165 | #            scatter[margin+(data_h-1-s)               ,  :margin+2+i] = color
166 | #    for a in range(nb_hori_axis_ticks): 
167 | #        s = int(data_w * float(a)/nb_hori_axis_ticks)
168 | #        for i in range(nb_hori_axis_ticks)[::-1]:
169 | #            color = SLATE + 0.04*i*WHITE
170 | #            scatter[margin+data_h-2-i:2*margin+data_h , margin+s    ] = color
171 | #   
172 | #    # axes
173 | #    scatter[margin+data_h-1      , margin:margin+data_w] = SLATE
174 | #    scatter[margin:margin+data_h , margin              ] = SLATE
175 | #
176 | #    return scatter
177 | #
178 | ##--------------  3.0.2. define feature space scatter plot  --------------------
179 | #
180 | #def plot_samples():
181 | #    # initialize
182 | #    scatter = new_plot(data_h, data_w, margin,
183 | #                       nb_vert_axis_ticks, nb_hori_axis_ticks)
184 | #
185 | #    # save
186 | #    plt.imsave(file_name, scatter) 
187 | #
188 | #
189 | 


--------------------------------------------------------------------------------
/figures/priors/yo-1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/priors/yo-1-1.png


--------------------------------------------------------------------------------
/figures/priors/yo-1-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/priors/yo-1-2.png


--------------------------------------------------------------------------------
/figures/priors/yo-1-inf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/priors/yo-1-inf.png


--------------------------------------------------------------------------------
/figures/priors/yo-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/priors/yo-2-1.png


--------------------------------------------------------------------------------
/figures/priors/yo-2-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/priors/yo-2-2.png


--------------------------------------------------------------------------------
/figures/priors/yo-2-inf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/priors/yo-2-inf.png


--------------------------------------------------------------------------------
/figures/priors/yo-inf-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/priors/yo-inf-1.png


--------------------------------------------------------------------------------
/figures/priors/yo-inf-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/priors/yo-inf-2.png


--------------------------------------------------------------------------------
/figures/priors/yo-inf-inf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/priors/yo-inf-inf.png


--------------------------------------------------------------------------------
/figures/quad-reg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/quad-reg.png


--------------------------------------------------------------------------------
/figures/quadratic-features.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/quadratic-features.png


--------------------------------------------------------------------------------
/figures/rbf-kernel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/rbf-kernel.png


--------------------------------------------------------------------------------
/figures/regress-springs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/regress-springs.png


--------------------------------------------------------------------------------
/figures/regression-beach.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/regression-beach.png


--------------------------------------------------------------------------------
/figures/satellite-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/satellite-2.png


--------------------------------------------------------------------------------
/figures/satellite.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/satellite.png


--------------------------------------------------------------------------------
/figures/seven-days.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/seven-days.png


--------------------------------------------------------------------------------
/figures/shallow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/shallow.png


--------------------------------------------------------------------------------
/figures/sim-trefoil-a-prob.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/sim-trefoil-a-prob.png


--------------------------------------------------------------------------------
/figures/sim-trefoil-a-sol.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/sim-trefoil-a-sol.png


--------------------------------------------------------------------------------
/figures/sim-trefoil-a-v.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/sim-trefoil-a-v.png


--------------------------------------------------------------------------------
/figures/sim-trefoil-a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/sim-trefoil-a.png


--------------------------------------------------------------------------------
/figures/sim-trefoil-b-prob.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/sim-trefoil-b-prob.png


--------------------------------------------------------------------------------
/figures/sim-trefoil-b-sol.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/sim-trefoil-b-sol.png


--------------------------------------------------------------------------------
/figures/sim-trefoil-b-v.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/sim-trefoil-b-v.png


--------------------------------------------------------------------------------
/figures/sim-trefoil-b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/sim-trefoil-b.png


--------------------------------------------------------------------------------
/figures/slash.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/slash.png


--------------------------------------------------------------------------------
/figures/smiley-transformed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/smiley-transformed.png


--------------------------------------------------------------------------------
/figures/smiley.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/smiley.png


--------------------------------------------------------------------------------
/figures/supervised.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/figures/supervised.png


--------------------------------------------------------------------------------
/figures_desc/MaxPool.tex:
--------------------------------------------------------------------------------
 1 | \begin{document}
 2 | MaxPool is a size reduction function. It divides an image into smaller submatrices
 3 | (in this particular case into $2 \times 2$ matrices). Then it selects an element with the maximum 
 4 | value from each submatrix
 5 | and put them into the corresponding cells of the output image. When 2D MaxPool is applied to 
 6 | the colored image, it processes each channel separately. Usually, MaxPool is used right after 
 7 | the convolutional layer, which extracts the features from the image. In these terms, 
 8 | MaxPool gets rid of less important information. It allows us to reduce computations and 
 9 | the number of weights in neural networks.
10 | \end{document}


--------------------------------------------------------------------------------
/figures_desc/README.md:
--------------------------------------------------------------------------------
1 | # Figures description
2 | 
3 | This folder is for the descriptions of the figures in [figures](../figures).  
4 | All description files should have the same name as the figures but in `.tex` format. So every
5 | `*.png` file in [figures](../figures) should come along with `*.tex` file here.


--------------------------------------------------------------------------------
/figures_desc/conv_elements.tex:
--------------------------------------------------------------------------------
 1 | \begin{document}
 2 | This figure demonstrates how the output image of the convolution operation is formed. 
 3 | First, the function selects particular elements defined by a window of the same size as the kernel.
 4 | Then the algorithm applies the weighted sum to this window and kernel;
 5 | and stores the result according to the window's position. 
 6 | The algorithm starts at the "top left" corner of the image, 
 7 | i. e. an element of the window at $(1, 1)$ is an element of the image at $(1, 1)$; 
 8 | and the output element is also at $(1, 1)$.
 9 | To compute the next element of the output image, the algorithm shifts the window by one.
10 | For instance, if we want to compute an output element $(2, 1)$ we shift the window by one row and 
11 | the window starts from 
12 | $(2, 1)$ element of the input image.
13 | Considering an input image of size $(W, H)$ and the kernel of size $(K, K)$, 
14 | the overall computational complexity of 2D convolution is $O(W \times H \times K^2)$.
15 | \end{document}


--------------------------------------------------------------------------------
/figures_desc/conv_operation.tex:
--------------------------------------------------------------------------------
1 | \begin{document}
2 | The picture exposes the computation of a single element of 2D convolution. 
3 | This is, in fact, a sum of elementwise multiplied matrices, except that one of the matrices is 
4 | flipped so that the first element becomes the last. For instance, if we perform convolution on two 
5 | $(N \times N)$ matrices, the element $(0, 0)$ will be multiplied by $(N, N)$ and vise versa. 
6 | If one of the matrices is symmetrical in both axes, there is no need to flip either matrix.
7 | \end{document}


--------------------------------------------------------------------------------
/mlentary.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/mlentary.pdf


--------------------------------------------------------------------------------
/mlentary.tex:
--------------------------------------------------------------------------------
  1 | % author:   sam tenka
  2 | % change:   2023-05-25
  3 | % create:   2022-05-11
  4 | 
  5 | \documentclass[11pt, justified]{tufte-book}
  6 | \geometry{
  7 |   left           = 0.90in, % left margin
  8 |   textwidth      = 4.95in, % main text block
  9 |   marginparsep   = 0.15in, % gutter between main text block and margin notes
 10 |   marginparwidth = 2.30in, % width of margin notes
 11 |                  % 0.20in  % width from margin to edge
 12 | }
 13 | \input{sam.sty}
 14 | \usepackage{tikz-cd}
 15 | 
 16 | \begin{document}\samtitle{basics of machine learning}
 17 | 
 18 | \sampassage{contributors}
 19 |   \input{CONTRIBUTORS}
 20 | 
 21 | \sampassage{hallo!}
 22 |   These optional and still-evolving notes for 6.86x give a big picture view of
 23 |   our conceptual landscape.  They might help you study, but you can do all your
 24 |   assignments without these notes.  These notes treat many but not all the
 25 |   topics of our course; they also treat a few topics not emphasized in class,
 26 |   to fill gaps for curious learners.
 27 | 
 28 |   I'm happy to answer questions about the notes on the forum.  If you want to
 29 |   help improve these notes, ask me; I'll be happy to list you as a contributor!
 30 | 
 31 |   We compiled these notes using the following color palette; let us know if
 32 |   the colors are hard to distinguish.
 33 | 
 34 |   {\Huge\vspace{-1.5cm}\[\substack{%
 35 |     {\sky\blacksquare}
 36 |     {\blu\blacksquare}
 37 |     {\sea\blacksquare}\\
 38 |     {\pch\blacksquare}
 39 |     {\rng\blacksquare}
 40 |     {\bro\blacksquare}}
 41 |     \]}
 42 | 
 43 |     \marginnote[-6cm]{%
 44 |     \textsc{Clickable Table of Contents}\vspace{0.05cm}
 45 |     \begin{description}
 46 |         \item[\hyperlink{A}{A. prologue}]                                               \phdot  \hfill\pageref{part:A}
 47 |         \begin{description}
 48 |           \item[\hyperlink{A0}{{\color{gray}0} what is learning?}]
 49 |           \item[\hyperlink{A1}{{\color{gray}1} a tiny example: handwritten digits}]
 50 |           \item[\hyperlink{A2}{{\color{gray}2} how well did we do?}]
 51 |           \item[\hyperlink{A3}{{\color{gray}3} how can we do better?}]
 52 |         \end{description}
 53 |     \item[\hyperlink{part:Z}{Z. a complete, simple classifier}]                         \phdot  \hfill\pageref{part:Z}
 54 |         \begin{description}
 55 |           \item[\hyperlink{Z0}{{\color{gray}0} goodness of fit}]
 56 |           \item[\hyperlink{Z1}{{\color{gray}1} smarter optimization}]
 57 |           \item[\hyperlink{Z2}{{\color{gray}2} two examples: cows and walls}]
 58 |           \item[\hyperlink{Z3}{{\color{gray}3} intrinsic plausibility}]
 59 |         \end{description}
 60 |     \item[\hyperlink{part:B}{B. squeezing more out of linear models}]             \phdot  \hfill\pageref{part:B}
 61 |         \begin{description}
 62 |           \item[\hyperlink{B0}{{\color{gray}0} featurization}]
 63 |           \item[\hyperlink{B1}{{\color{gray}1} quantifying uncertainty}]
 64 |           \item[\hyperlink{B2}{{\color{gray}2} iterative optimization}]
 65 |           \item[\hyperlink{B3}{{\color{gray}3} data-dependent features}]
 66 |         \end{description}
 67 |     \item[\hyperlink{part:C}{C. bend those lines to capture rich patterns}]             \phdot  \hfill\pageref{part:C}
 68 |         \begin{description}
 69 |           \item[\hyperlink{C0}{{\color{gray}0} featurization}]
 70 |           \item[\hyperlink{C1}{{\color{gray}1} learned featurizations}]
 71 |           \item[\hyperlink{C2}{{\color{gray}2} locality and symmetry in architecture}]
 72 |           \item[\hyperlink{C3}{{\color{gray}3} dependencies in architecture}]
 73 |         \end{description}
 74 |     \item[\hyperlink{part:D}{D. thicken those lines to quantify uncertainty}]           \phdot  \hfill\pageref{part:D}
 75 |         \begin{description}
 76 |           \item[\hyperlink{D0}{{\color{gray}0} bayesian models}]
 77 |           \item[\hyperlink{D1}{{\color{gray}1} examples of bayesian models}]
 78 |           \item[\hyperlink{D2}{{\color{gray}2} inference algorithms for bayesian models}]
 79 |           \item[\hyperlink{D3}{{\color{gray}3} combining with deep learning}]
 80 |         \end{description}
 81 |     \item[\hyperlink{part:E}{E. beyond learning-from-examples}]                         \phdot  \hfill\pageref{part:E}
 82 |         \begin{description}
 83 |           \item[\hyperlink{E0}{{\color{gray}0} reinforcement}]
 84 |           \item[\hyperlink{E1}{{\color{gray}1} state}]
 85 |           \item[\hyperlink{E2}{{\color{gray}2} deep q learning}]
 86 |           \item[\hyperlink{E3}{{\color{gray}3} learning from instructions}]
 87 |         \end{description}
 88 |     %
 89 |     \item[] \vspace{0.05cm} \hrule \vspace{-0.05cm}
 90 |     %
 91 |     \item[\hyperlink{part:F}{F. some background}]                                       \phdot  \hfill\pageref{part:F}
 92 |         \begin{description}
 93 |           \item[\hyperlink{F0}{{\color{gray}0} probability primer}]
 94 |           \item[\hyperlink{F1}{{\color{gray}1} linear algebra primer}]
 95 |           \item[\hyperlink{F2}{{\color{gray}2} derivatives primer}]
 96 |           \item[\hyperlink{F3}{{\color{gray}3} programming and numpy and pytorch primer}]
 97 |         \end{description}
 98 |     %\item[\hyperlink{part:G}{G. rambles}]                                               \phdot  \hfill\pageref{part:G}
 99 |     %    \begin{description}
100 |     %      \item[\hyperlink{G0}{{\color{gray}0} }]
101 |     %      \item[\hyperlink{G1}{{\color{gray}1} }]
102 |     %      \item[\hyperlink{G2}{{\color{gray}2} }]
103 |     %      \item[\hyperlink{G3}{{\color{gray}3} }]
104 |     %    \end{description}
105 |     \end{description}
106 |   }
107 | 
108 |   \sampassage{navigating these notes}
109 |     We divide these notes into bite-size passages; each begins with a gray
110 |     heading (e.g.\ `navigating these notes' to the left).  All passages are
111 |     optional in the sense that you'll be able to complete all assignments
112 |     without ever studying these notes.  Yet, aside from a few bonus passages
113 |     marked as \textsf{\blu VERY OPTIONAL}, most passages offer complementary
114 |     discussion of our course topics that may help you understand, appreciate,
115 |     and internalize the lectures.  After all, only with two lines of sight can
116 |     we see depth.
117 | 
118 |     These notes highlight probability theory and model architecture more than
119 |     the lectures; clustering and low-rank approximation, less.  We can use
120 |     different projections to tear and squash a globe into a flat map of earth.
121 |     Different views reveal different aspects of geography.  In order to ``keep
122 |     australia from being split into two parts'' and to ``display the south
123 |     pole'', I used a different map projection than the lectures do.  So we
124 |     traverse topics in the notes in a different order than we do in lecture.
125 |     Still, we cover essentially the same overall ground.
126 | 
127 |   \newcommand{\SSS}[2]{\newpage\hypertarget{#1}{}\sampart{#2}\phantomsection\label{part:#1}}
128 |   \newcommand{\sss}[3]{\hypertarget{#1}{}\samsection{#3}\input{tex-source/#2}}
129 | 
130 |   \newpage
131 |   \SSS{0}{prologue}
132 |     \sss{0a}{body.0.0.what-is-learning}{what is learning?}
133 |     \sss{0b}{body.0.1.our-first-learning-algorithm}{a tiny example: classifying handwritten digits}
134 |     \sss{0c}{body.0.2.how-well-did-we-do}{how well did we do?  analyzing our error}
135 |     \sss{0d}{body.0.3.how-can-we-do-better}{how can we do better?  (survey of rest of notes)}
136 | 
137 |   \newpage
138 |   %a complete, simple classifier (unit 1)}
139 |   \SSS{1}{Learning from Examples}
140 |     \sss{1a}{body.1.0.likelihoods}{goodness of fit}
141 |     \sss{1b}{body.1.1.gradients}{smarter optimization}
142 |     \sss{1c}{body.1.2.priors}{intrinsic plausibility}
143 |     \sss{1d}{body.1.3.model-selection}{model selection}
144 | 
145 |   \newpage
146 |   %squeezing more out of linear models (unit 2)}
147 |   \SSS{2}{Engineering Features (and Friends) by Hand}
148 |     \sss{2a}{body.2.0.featurization}{featurization}
149 |     \sss{2b}{body.2.1.quantify-uncertainty}{richer outputs: quantifying uncertainty}
150 |     \sss{2c}{body.2.2.iterative-optimization}{iterative optimization}
151 |     \sss{2d}{body.2.3.data-dependent-features}{data dependent features}
152 | 
153 |   \newpage
154 |   %bend those lines to capture rich patterns (unit 3)}
155 |   \SSS{3}{Learning Features from Data}
156 |     \sss{3a}{body.3.0.shallow-learning}{shallow learning}
157 |     \sss{3b}{body.3.1.deep-architecture}{deep learning: ideas in architecture}
158 |     \sss{3c}{body.3.2.convolution}{symmetry and locality: convolution}
159 |     \sss{3d}{body.3.3.attention}{symmetry and locality: attention}
160 | 
161 |   \newpage
162 |   %thicken those lines to quantify uncertainty (unit 4)}
163 |   \SSS{4}{Modeling Structured Uncertainties}
164 |     \sss{4a}{body.4.0.probabilistic-models}{probabilistic models}
165 |     \sss{4b}{body.4.1.expectation-maximization}{inference by variation: EM}
166 |     \sss{4c}{body.4.2.metropolis-hastings}{inference by sampling: MH}
167 |     \sss{4d}{body.4.3.deep-generators}{deep generative models}
168 | 
169 |   \newpage
170 |   %beyond learning-from-examples (unit 5)}
171 |   \SSS{5}{Learning while Acting (and from Rewards)}
172 |     \sss{5a}{body.5.0.reinforcement}{rewards, actions, states. exploration}
173 |     \sss{5b}{body.5.1.mdps}{qualitative solutions to mdp, dynamic programming, bootstrap}
174 |         %state (dependence on prev xs and on actions ) ; RL ; partial observations}
175 |     \sss{5c}{body.5.2.q-learning}{reduce to supervision using q-learning}
176 |     \sss{5d}{body.5.3.beyond}{non-technical discussion of: curiosity, language, instruction, world-models}
177 | 
178 | % =============================================================================
179 | % ==  _  ======================================================================
180 | % =============================================================================
181 | 
182 |   \newpage
183 |   \SSS{F}{Prereq Helpers}
184 |     \sss{Fa}{body.F.0.linear-algebra}{high dimensions and linear algebra}
185 |       %-- linear "spaces" and linear "maps"
186 |       %-- dimension, trace, determinant, visually
187 |       %-- vectors, covectors, higher tensors
188 |       %-- dot product, angles, and svd, visually
189 |     \sss{Fb}{body.F.1.probability}{uncertainty and probability}
190 |       %-- fundamentals; frequentism vs bayesianism
191 |       %-- 2 sharp tools: independence and averaging
192 |       %-- log loss and friends
193 |       %-- common families of distributions
194 |     \sss{Fc}{body.F.2.derivatives}{optimization and derivatives}
195 |       %-- derivatives are best linear appro.s: chain rule and linearity
196 |       %-- product rule, visually
197 |       %-- optimization : critical points, lagrange, higher derivatives
198 |       %-- write your own automatic differentiator: a short exercise
199 |     \sss{Fd}{body.F.3.programming}{python, numpy, pytorch}
200 | 
201 | \end{document}
202 | 
203 | 


--------------------------------------------------------------------------------
/sam.sty:
--------------------------------------------------------------------------------
  1 | % author:   samtenka
  2 | % change:   2023-05-25
  3 | % create:   2022-05-11
  4 | 
  5 | \newcommand{\recommended} {\textsf{\large\blu$\leftarrow$   RECOMMENDED PASSAGE}}
  6 | \newcommand{\veryoptional}{VERY OPTIONAL}
  7 | 
  8 | \newcommand{\picturedw}[2]{\includegraphics[width=#1]{#2}}
  9 | \newcommand{\picturew}[2]{\includegraphics[width=#1]{figures/#2}}
 10 | \newcommand{\pictureh}[2]{\includegraphics[height=#1]{figures/#2}}
 11 | 
 12 | %==============================================================================
 13 | %====  0.  DOCUMENT SETTINGS  ================================================
 14 | %==============================================================================
 15 | 
 16 | %~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 17 | %~~~~~~~~~~~~~  0.0. About this Exposition  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 18 | 
 19 | %---------------------  0.0.1. math packages  ---------------------------------
 20 | \newcommand\hmmax{0} % to allow for more fonts
 21 | \newcommand\bmmax{0} % to allow for more fonts
 22 | \usepackage{amsmath, amssymb, amsthm, mathtools}
 23 | \usepackage{bm}
 24 | \usepackage{euler}
 25 | 
 26 | \usepackage{array}   % for \newcolumntype macro
 27 | \newcolumntype{L}{>{$}l<{$}} % math-mode version of "l" column type
 28 | \newcolumntype{C}{>{$}c<{$}} % math-mode version of "c" column type
 29 | \newcolumntype{R}{>{$}r<{$}} % math-mode version of "r" column type
 30 | 
 31 | \newcolumntype{S}{ >{\centering\arraybackslash} m{3cm} } % vertically and horizontally centered
 32 | 
 33 | %---------------------  0.0.2. graphics packages  -----------------------------
 34 | \usepackage{graphicx, xcolor}
 35 | \usepackage{float, capt-of}
 36 | \usepackage{soul}
 37 | 
 38 | %---------------------  0.0.3. packages for fancy text  -----------------------
 39 | \usepackage{enumitem}\setlist{nosep}
 40 | \usepackage{listings}
 41 | \usepackage{xstring}
 42 | \usepackage{fontawesome5}
 43 | 
 44 | %---------------------  0.043. colors  ----------------------------------------
 45 | 
 46 | % NOTE: we want to cater to colorblind readers
 47 | 
 48 | % (LIGHT, MEDIUM, DARK) x (BLUE, ORANGE)
 49 | \definecolor{msky}{rgb}{0.62, 0.82, 0.94} \newcommand{\sky}{\color{msky}}
 50 | \definecolor{mpch}{rgb}{0.98, 0.86, 0.62} \newcommand{\pch}{\color{mpch}}
 51 | 
 52 | \definecolor{mblu}{rgb}{0.05, 0.55, 0.85} \newcommand{\blu}{\color{mblu}}
 53 | \definecolor{mrng}{rgb}{0.95, 0.65, 0.05} \newcommand{\rng}{\color{mrng}}
 54 | 
 55 | \definecolor{msea}{rgb}{0.02, 0.22, 0.34} \newcommand{\sea}{\color{msea}}
 56 | \definecolor{mbro}{rgb}{0.38, 0.26, 0.02} \newcommand{\bro}{\color{mbro}}
 57 | 
 58 | % SHADES:
 59 | \definecolor{mgre}{rgb}{0.55, 0.55, 0.55} \newcommand{\gre}{\color{mgre}}
 60 | \definecolor{mdgre}{rgb}{0.35, 0.35, 0.35} \newcommand{\dgre}{\color{mdgre}}
 61 | 
 62 | % UNFRIENDLY:
 63 | \definecolor{mred}{rgb}{1.00, 0.00, 0.00} \newcommand{\red}{\color{mred}}
 64 | 
 65 | %~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 66 | %~~~~~~~~~~~~~  0.1. Headers and References  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 67 | 
 68 | %---------------------  0.1.0. intra-document references  ---------------------
 69 | \newcommand{\offour}[1]{
 70 |     {\tiny \raisebox{0.04cm}{\scalebox{0.9}{$\substack{
 71 |         \IfSubStr{#1}{0}{{\blacksquare}}{\square}
 72 |         \IfSubStr{#1}{1}{{\blacksquare}}{\square} \\ 
 73 |         \IfSubStr{#1}{2}{{\blacksquare}}{\square}
 74 |         \IfSubStr{#1}{3}{{\blacksquare}}{\square}
 75 |     }$}}}%
 76 | }
 77 | 
 78 | \newcommand{\offourline}[1]{
 79 |     {\tiny \raisebox{0.04cm}{\scalebox{0.9}{$\substack{
 80 |         \IfSubStr{#1}{0}{{\blacksquare}}{\square}
 81 |         \IfSubStr{#1}{1}{{\blacksquare}}{\square}
 82 |         \IfSubStr{#1}{2}{{\blacksquare}}{\square}
 83 |         \IfSubStr{#1}{3}{{\blacksquare}}{\square}
 84 |     }$}}}%
 85 | }
 86 | \newcommand{\notesam}[1]{{\blu \textsf{#1}}}
 87 | \newcommand{\attn}[1]{{\blu \textsf{#1}}}
 88 | \newcommand{\attnsam}[1]{{\red \textsf{#1}}}
 89 | %\newcommand{\attnsam}[1]{}%{\red \textsf{#1}}}
 90 | 
 91 | \newcommand{\blarr}{\hspace{-0.15cm}${\blu \leftarrow}\,$}
 92 | \newcommand{\bcirc}{${\blu ^\circ}$}
 93 | \newcommand{\bovinenote}[1]{\bcirc\marginnote{\blarr #1}}
 94 | 
 95 | %---------------------  0.1.1. table of contents helpers  ---------------------
 96 | \newcommand{\phdot}{\phantom{.}}
 97 | 
 98 | %---------------------  0.1.2. section headers  -------------------------------
 99 | \newcommand{\samtitle} [1]{
100 |   \par\noindent{\Huge \sf \blu #1}
101 |   \vspace{0.4cm}
102 | }
103 | 
104 | \newcommand{\samquote} [2]{
105 |     \marginnote[-0.4cm]{\begin{flushright}
106 |     %\scriptsize
107 |         \gre {\it #1} \\ --- #2
108 |     \end{flushright}}
109 | }
110 | 
111 | \newcommand{\sampart} [1]{
112 |   \vspace{0.5cm}
113 |   \par\noindent{\LARGE \sf \blu #1}
114 |   \vspace{0.1cm}\par
115 | }
116 | 
117 | \newcommand{\samsection}[1]{
118 |   \vspace{0.3cm}
119 |   \par\noindent{\Large \sf \blu #1}
120 |   \vspace{0.1cm}\par
121 | }
122 | 
123 | \newcommand{\sampassage}[1]{
124 |    \vspace{0.1cm}
125 |    \par\noindent{\hspace{-2cm}\normalsize \sc \gre #1} ---
126 | }
127 | 
128 | \newcommand{\blurb}[1]{
129 |     [[[#1]]]
130 | }
131 | 
132 | %---------------------  0.1.3. clear the bibliography's header  ---------------
133 | \usepackage{etoolbox}
134 | \patchcmd{\thebibliography}{\section*{\refname}}{}{}{}
135 | 
136 | %~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
137 | %~~~~~~~~~~~~~  0.2. Math Symbols and Blocks  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
138 | 
139 | %---------------------  0.2.0. general math operators  ------------------------
140 | \newcommand{\scirc}{\mathrel{\mathsmaller{\mathsmaller{\mathsmaller{\circ}}}}}
141 | \newcommand{\cmop}[2]{{(#1\!\to\!#2)}}
142 | \newcommand{\pr}{^\prime}
143 | \newcommand{\prpr}{^{\prime\prime}}
144 | 
145 | \newcommand{\wrap}[1]{\left(#1\right)}
146 | 
147 | %---------------------  0.2.1. probability symbols  ---------------------------
148 | \newcommand{\KL}{\text{KL}}
149 | \newcommand{\EN}{\text{H}}
150 | \newcommand{\note}[1]{{\blu \textsf{#1}}}
151 | 
152 | %---------------------  0.2.2. losses averaged in various ways  ---------------
153 | \newcommand{\Ein}  {\text{trn}_{\sS}}
154 | \newcommand{\Einb} {\text{trn}_{\check\sS}}
155 | \newcommand{\Einc} {\text{trn}_{\sS\sqcup \check\sS}}
156 | \newcommand{\Egap} {\text{gap}_{\sS}}
157 | \newcommand{\Eout} {\text{tst}}
158 | 
159 | %---------------------  0.2.3. double-struck and caligraphic upper letters  ---
160 | \newcommand{\Aa}{\mathbb{A}}\newcommand{\aA}{\mathcal{A}}
161 | \newcommand{\Bb}{\mathbb{B}}\newcommand{\bB}{\mathcal{B}}
162 | \newcommand{\Cc}{\mathbb{C}}\newcommand{\cC}{\mathcal{C}}
163 | \newcommand{\Dd}{\mathbb{D}}\newcommand{\dD}{\mathcal{D}}
164 | \newcommand{\Ee}{\mathbb{E}}\newcommand{\eE}{\mathcal{E}}
165 | \newcommand{\Ff}{\mathbb{F}}\newcommand{\fF}{\mathcal{F}}
166 | \newcommand{\Gg}{\mathbb{G}}\newcommand{\gG}{\mathcal{G}}
167 | \newcommand{\Hh}{\mathbb{H}}\newcommand{\hH}{\mathcal{H}}
168 | \newcommand{\Ii}{\mathbb{I}}\newcommand{\iI}{\mathcal{I}}
169 | \newcommand{\Jj}{\mathbb{J}}\newcommand{\jJ}{\mathcal{J}}
170 | \newcommand{\Kk}{\mathbb{K}}\newcommand{\kK}{\mathcal{K}}
171 | \newcommand{\Ll}{\mathbb{L}}\newcommand{\lL}{\mathcal{L}}
172 | \newcommand{\Mm}{\mathbb{M}}\newcommand{\mM}{\mathcal{M}}
173 | \newcommand{\Nn}{\mathbb{N}}\newcommand{\nN}{\mathcal{N}}
174 | \newcommand{\Oo}{\mathbb{O}}\newcommand{\oO}{\mathcal{O}}
175 | \newcommand{\Pp}{\mathbb{P}}\newcommand{\pP}{\mathcal{P}}
176 | \newcommand{\Qq}{\mathbb{Q}}\newcommand{\qQ}{\mathcal{Q}}
177 | \newcommand{\Rr}{\mathbb{R}}\newcommand{\rR}{\mathcal{R}}
178 | \newcommand{\Ss}{\mathbb{S}}\newcommand{\sS}{\mathcal{S}}
179 | \newcommand{\Tt}{\mathbb{T}}\newcommand{\tT}{\mathcal{T}}
180 | \newcommand{\Uu}{\mathbb{U}}\newcommand{\uU}{\mathcal{U}}
181 | \newcommand{\Vv}{\mathbb{V}}\newcommand{\vV}{\mathcal{V}}
182 | \newcommand{\Ww}{\mathbb{W}}\newcommand{\wW}{\mathcal{W}}
183 | \newcommand{\Xx}{\mathbb{X}}\newcommand{\xX}{\mathcal{X}}
184 | \newcommand{\Yy}{\mathbb{Y}}\newcommand{\yY}{\mathcal{Y}}
185 | \newcommand{\Zz}{\mathbb{Z}}\newcommand{\zZ}{\mathcal{Z}}
186 | 
187 | %---------------------  0.2.4. sans serif and frak lower letters  -------------
188 | \newcommand{\sfa}{\mathsf{a}}\newcommand{\fra}{\mathfrak{a}}
189 | \newcommand{\sfb}{\mathsf{b}}\newcommand{\frb}{\mathfrak{b}}
190 | \newcommand{\sfc}{\mathsf{c}}\newcommand{\frc}{\mathfrak{c}}
191 | \newcommand{\sfd}{\mathsf{d}}\newcommand{\frd}{\mathfrak{d}}
192 | \newcommand{\sfe}{\mathsf{e}}\newcommand{\fre}{\mathfrak{e}}
193 | \newcommand{\sff}{\mathsf{f}}\newcommand{\frf}{\mathfrak{f}}
194 | \newcommand{\sfg}{\mathsf{g}}\newcommand{\frg}{\mathfrak{g}}
195 | \newcommand{\sfh}{\mathsf{h}}\newcommand{\frh}{\mathfrak{h}}
196 | \newcommand{\sfi}{\mathsf{i}}\newcommand{\fri}{\mathfrak{i}}
197 | \newcommand{\sfj}{\mathsf{j}}\newcommand{\frj}{\mathfrak{j}}
198 | \newcommand{\sfk}{\mathsf{k}}\newcommand{\frk}{\mathfrak{k}}
199 | \newcommand{\sfl}{\mathsf{l}}\newcommand{\frl}{\mathfrak{l}}
200 | \newcommand{\sfm}{\mathsf{m}}\newcommand{\frm}{\mathfrak{m}}
201 | \newcommand{\sfn}{\mathsf{n}}\newcommand{\frn}{\mathfrak{n}}
202 | \newcommand{\sfo}{\mathsf{o}}\newcommand{\fro}{\mathfrak{o}}
203 | \newcommand{\sfp}{\mathsf{p}}\newcommand{\frp}{\mathfrak{p}}
204 | \newcommand{\sfq}{\mathsf{q}}\newcommand{\frq}{\mathfrak{q}}
205 | \newcommand{\sfr}{\mathsf{r}}\newcommand{\frr}{\mathfrak{r}}
206 | \newcommand{\sfs}{\mathsf{s}}\newcommand{\frs}{\mathfrak{s}}
207 | \newcommand{\sft}{\mathsf{t}}\newcommand{\frt}{\mathfrak{t}}
208 | \newcommand{\sfu}{\mathsf{u}}\newcommand{\fru}{\mathfrak{u}}
209 | \newcommand{\sfv}{\mathsf{v}}\newcommand{\frv}{\mathfrak{v}}
210 | \newcommand{\sfw}{\mathsf{w}}\newcommand{\frw}{\mathfrak{w}}
211 | \newcommand{\sfx}{\mathsf{x}}\newcommand{\frx}{\mathfrak{x}}
212 | \newcommand{\sfy}{\mathsf{y}}\newcommand{\fry}{\mathfrak{y}}
213 | \newcommand{\sfz}{\mathsf{z}}\newcommand{\frz}{\mathfrak{z}}
214 | 
215 | %---------------------  0.2.5. math environments  -----------------------------
216 | \newtheorem*{qst}{Question}
217 | \newtheorem*{thm}{Theorem}
218 | \newtheorem*{lem}{Lemma}
219 | % ...
220 | \theoremstyle{definition}
221 | \newtheorem*{dfn}{Definition}
222 | 
223 | \newcommand{\exercise}[1]{%
224 |   \par\noindent%
225 |   \attn{Food For Thought:} #1%
226 | }
227 | \newcommand{\noparexercise}[1]{%
228 |   \attn{Food For Thought:} #1%
229 | }
230 | \newcommand{\objectives}[1]{%
231 |   \marginnote[-0.2cm]{%
232 |     By the end of this section, you'll be able to
233 |     \begin{itemize}#1\end{itemize}
234 |   }
235 | }
236 | 
237 | 
238 | 


--------------------------------------------------------------------------------
/tex-source-sandbox/body.1.1.priors.tex:
--------------------------------------------------------------------------------
  1 | \objectives{%
  2 |   \item {explain how regularization, in its incarnation as
  3 |          margin-maximization, counters data terms to improve generalization}
  4 |   \item {write a regularized ML program (namely, an SVM),
  5 |                  to classify high-dimensional data}
  6 | }
  7 | 
  8 | \sampassage{how good is a hypothesis?  plausibility}
  9 |   Now to define intrinsic plausiblity, also known as a \textbf{regularizer}
 10 |   %term.
 11 |   We find a hypothesis more plausible when its ``total amount of
 12 |   dependence'' on the features is small.
 13 |   %
 14 |   So we'll focus for now on capturing this intution:
 15 |   \emph{a hypothesis that depends a lot on many features is less
 16 |   plausible}.\bovinenote{%
 17 |     There are many other aspects we might design a regularizer
 18 |     to capture, e.g.\ a domain's symmetry.
 19 |     The regularizer is in practice a key point where we inject domain
 20 |     knowledge.
 21 |   }
 22 |   %
 23 |   We may conveniently quantify this as
 24 |   proportional to a sum of squared weights (jargon: \textbf{L2}):\bovinenote{%
 25 |     \noparexercise{%
 26 |       When $(a,b)$ represent weights for brightness-width digits features, how
 27 |       do hypotheses with small $a^2 + b^2$ visually differ from ones with
 28 |       small $6.86 a^2+b^2$ (a perfectly fine variant of our
 29 |       `implausibility')?
 30 |     }
 31 |   }
 32 |   $
 33 |     \text{implausibility of $h=(a,b, \cdots)$}
 34 |     =
 35 |     \lambda (a^2 + b^2 + \cdots)
 36 |   $.  In code:
 37 |   \begin{lstlisting}[language=Python, basicstyle=\footnotesize\ttfamily]
 38 |     LAMBDA = 1.
 39 |     def implausibility(a,b):
 40 |       return LAMBDA * np.sum(np.square([a,b]))
 41 |   \end{lstlisting}
 42 |   Intuitively, the constant $\lambda$=\texttt{LAMBDA} tells us how much we care
 43 |   about plausibility relative to goodness-of-fit-to-data.
 44 | 
 45 |   Here's what the formula means.
 46 |   Each of three friends has a theory\bovinenote{%
 47 |   \textbf{AJ}
 48 |     insists a bird with a wings shorter than 1ft can't fly far, so it's
 49 |     \emph{sure} to sing; Conversely, birds with longer wings never sing.
 50 |   \textbf{Pat}
 51 |     checks if the bird grows red feathers, eats shrimp, lives near ice, wakes
 52 |     in the night, and has a bill.  If and only if an even number of these $5$
 53 |     qualities are true, the bird probably sings.
 54 |   \textbf{Sandy}
 55 |     says shorter wings and nocturnality both make a bird somewhat more likely
 56 |     to sing.
 57 |     }
 58 |   about which birds sing.
 59 |   %
 60 |   Which theory do we prefer?  Well, \textbf{AJ} seems too confident.  Wingspan
 61 |   may matter but probably not so decisively.  \textbf{Pat} avoids
 62 |   black-and-white claims, but Pat's predictions depend substantively on many
 63 |   features: flipping any one quality flips their prediction.  This seems
 64 |   implausible.  By contrast, \textbf{Sandy}'s hypothesis doesn't depend too
 65 |   strongly on too many features.  To me, a bird non-expert, Sandy's seems most
 66 |   plausible.
 67 | 
 68 |   Now we can define the overall undesirability of a hypothesis:\bovinenote{%
 69 |     We'll use SVM loss but feel free to plug in other losses to get
 70 |     different learning behaviors!
 71 |   }
 72 |   \begin{lstlisting}[language=Python, basicstyle=\footnotesize\ttfamily]
 73 |     def objective_function(examples,a,b):
 74 |       data_term = np.sum([svm_loss(x,y,a,b) for x,y in examples])
 75 |       regularizer = implausibility(a, b)
 76 |       return data_term + regularizer
 77 |   \end{lstlisting}
 78 | 
 79 | \sampassage{margins}
 80 |   To build intuition
 81 |   %about which hypotheses are most desirable according to that metric,
 82 |   let's suppose $\lambda$ is a tiny positive number.  Then
 83 |   minimizing the objective function is the same as minimizing the data term,
 84 |   the total SVM loss: our notion of implausibility breaks ties.
 85 | 
 86 |   \begin{marginfigure}[0cm]
 87 |     \centering
 88 |     \picturew{0.99\textwidth}{margin}
 89 |     \caption{%
 90 |       \textbf{Balancing goodness-of-fit against intrinsic plausibility leads
 91 |       to hypotheses with large margins.}
 92 |       %\textbf{IGNORE the rightmost {\rng orange point} until we say otherwise!}
 93 |         A hypothesis's \textbf{margin} is its distance to the closest correctly
 94 |         classified training point(s).  Short stems depict these distances for
 95 |         two hypotheses (\textbf{black}, {\gre\textbf{gray}}).
 96 |         %
 97 |         If not for the rightmost {\rng orange point}, we'd prefer \textbf{black} over
 98 |         {\gre\textbf{gray}} since it has larger margins.  With large $\lambda$ (i.e., strong
 99 |         regularization), we might prefer black over gray even with that
100 |         rightmost {\rng orange point} included, since expanding the margin
101 |         is worth the single misclassification.
102 |       %For convenience we set the origin to the intersection of the two
103 |       %hypotheses.  That way we can still say that every hypothesis's decision
104 |       %boundary goes through
105 |       %the origin.
106 |     }
107 |   \end{marginfigure}
108 | 
109 |   Now, how does it break ties?  Momentarily ignore the Figure's rightmost {\rng
110 |   orange point} and consider the black hypothesis; its predictions depend only
111 |   on an input's first (vertical) coordinate, so it comes from weights of the
112 |   form $(a,b) = (a,0)$.
113 |   %
114 |   The $(a,0)$ pairs differ in SVM loss.  If
115 |   $a\approx 0$, each point has leeway close to $0$
116 |   and thus SVM loss close to $1$; conversely, if $a$ is huge, each
117 |   point has leeway very positive and thus SVM loss equal to
118 |   the imposed floor: $0$.  So SVM loss is $0$ as long as
119 |   $a$ is so big that each leeway to exceed $1$.
120 | 
121 |   Imagine sliding a point through the plane.  Its leeway is $0$ at the
122 |   black line and changes by $a$ for every unit we slide vertically.
123 |   %
124 |   So the farther the point is from the black line, the less $a$
125 |   must be before leeway exceeds $1$ --- and the happier is
126 |   the regularizer, which wants $a$ small.
127 |   % TODO BEACH, WATER, SLOPE story
128 |   % TODO Interpreting leeway as a measure of confidence.
129 |   So \emph{minimizing SVM loss with an L2 regularizer favors decision
130 |   boundaries far from even the closest correctly classified points!}  The black
131 |   line's margins exceed the gray's, so we favor black.
132 | 
133 |   For large $\lambda$, then this margin-maximization tendency can be so
134 |   strong that it overrides the data term.  Thus, even when we bring back
135 |   the rightmost {\rng orange point} we ignored, we might prefer the black
136 |   hypothesis to the gray one.
137 | 
138 | 
139 |   \newpage
140 | \sampassage{optimization}
141 | 
142 |   \begin{marginfigure}[-2cm]
143 |     \centering
144 |     \picturedw{0.99\textwidth}{example-mnist/train-weights-HingeReg}
145 |     \caption{%
146 |         \attnsam{REPLACE}
147 |       With $\lambda=0.02$ the objective visibly prefers weights near $0$.
148 |       We develop an algorithm to take steps in this plane
149 |       toward the minimum, `rolling down' the hill so to speak.
150 |     }
151 |   \end{marginfigure}
152 | 
153 | 
154 |   \exercise{%
155 |     We've discussed the L2 regularizer.  Also common is the L1 regularizer:
156 |     $
157 |       \text{implausibility of $h=(a,b, \cdots)$}
158 |       =
159 |       \lambda (|a| + |b| + \cdots)
160 |     $.
161 |     Hypotheses optimized with strong L1 regularization will tend to have
162 |     zero dependence on many features.  Explain to yourself and then to a friend
163 |     what the previous sentence means, why it is true, and how we might exploit
164 |     it in practice.
165 |   }
166 | 
167 | \newpage
168 | \sampassage{occam's razor}\marginnote{\veryoptional}
169 |   Did you feel not-quite-convinced by the AJ-Pat-Sandy example above?
170 |   We said: ``\emph{Pat's predictions depend substantively on many
171 |   features: flipping any one quality flips their prediction.  This seems
172 |   implausible.}'' --- does this really feel implausible, and if so, why?
173 | 


--------------------------------------------------------------------------------
/tex-source-sandbox/body.1.2.bovinity-and-walls.tex:
--------------------------------------------------------------------------------
 1 | \objectives{%
 2 |   \item \attnsam{FILLIN}
 3 |   \item \attnsam{FILLIN}
 4 | }
 5 | 
 6 | % solving other classification tasks by the same method
 7 | \sampassage{worked example: bovinity of reddit posts}
 8 |   Digits are nice.  Let's solve a couple other tasks by the same method.  This
 9 |   illustrates which aspects are design decisions and which aren't.  We'll start
10 |   with a text-classification task.
11 | 
12 |   %\textbf{reddit posts about cows vs dogs}
13 |   We gather the text of $\sim 2000$ reddit posts, half from \texttt{r/cow} and
14 |   half from \texttt{r/dog}.  \emph{Can we predict from text alone which of the
15 |   two subreddits a post came from?}
16 | 
17 |   Intuitively, words like
18 |   \texttt{cow},
19 |   \texttt{hoof},
20 |   \texttt{moo},
21 |     or
22 |   \texttt{dog},
23 |   \texttt{paw},
24 |   \texttt{bark}
25 |   are tell-tale signs.  So for our featurization, let's have a feature for
26 |   each $3$-letter and each $4$-letter word.  The feature for the word
27 |   \texttt{hoof} simply maps a text input $x$ in $\xX$ to a real number that's
28 |   $1$ if the word appears in the post and $0$ otherwise.  Likewise with the
29 |   other features.
30 | 
31 | \sampassage{worked example: seeing around walls}
32 |   %\textbf{seeing around walls}
33 |   Let's collect 200 photos of a certain MIT hallway corner.  In half, there's
34 |   some large obstacle (e.g.\ a person) right around the corner.  In the other
35 |   half, there's no obstacle.  \emph{Can we distinguish these cases from pixels
36 |   alone?}
37 | 
38 |   Intuitively, if this prediction is possible it'd be based on subtle shading
39 |   arising from multiply reflected light.  So we'll probably want to
40 |   invent features to do with brightness.
41 | 
42 | % TODO : integrate approximation, optimization, generalization
43 | % into these two examples
44 | %\sampassage{improving approximation, optimization, generalization}
45 | 
46 | 


--------------------------------------------------------------------------------
/tex-source-sandbox/body.3.0.bayesian-models.tex:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/tex-source-sandbox/body.3.0.bayesian-models.tex


--------------------------------------------------------------------------------
/tex-source-sandbox/body.3.1.examples-of-bayesian-models.tex:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/tex-source-sandbox/body.3.1.examples-of-bayesian-models.tex


--------------------------------------------------------------------------------
/tex-source-sandbox/body.3.2.inference-algorithms-for-bayesian-models.tex:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/tex-source-sandbox/body.3.2.inference-algorithms-for-bayesian-models.tex


--------------------------------------------------------------------------------
/tex-source-sandbox/body.3.3.combining-with-deep-learning.tex:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/tex-source-sandbox/body.3.3.combining-with-deep-learning.tex


--------------------------------------------------------------------------------
/tex-source-sandbox/body.U.0.bayesian-models.tex:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/tex-source-sandbox/body.U.0.bayesian-models.tex


--------------------------------------------------------------------------------
/tex-source-sandbox/body.U.1.classical-inference.tex:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/tex-source-sandbox/body.U.1.classical-inference.tex


--------------------------------------------------------------------------------
/tex-source-sandbox/body.U.2.inference-via-neural-nets.tex:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/tex-source-sandbox/body.U.2.inference-via-neural-nets.tex


--------------------------------------------------------------------------------
/tex-source-sandbox/body.U.3.deep-generative-architectures.tex:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/tex-source-sandbox/body.U.3.deep-generative-architectures.tex


--------------------------------------------------------------------------------
/tex-source-sandbox/body.w.0.data-dependent-features.tex:
--------------------------------------------------------------------------------
  1 | \objectives{%
  2 |   \item \attnsam{FILLIN}
  3 |   \item \attnsam{FILLIN}
  4 | }
  5 | 
  6 | %--
  7 | %--
  8 | %--
  9 | %--
 10 | 
 11 | \sampassage{landmarks and kernels}
 12 |   Inspired by that `black hole' image, we might want to center the black hole
 13 |   around a representative training point $x_\star$.
 14 |   This gives a potentially-very-useful feature that says for any input $x$
 15 |   how similar is to $x_\star$, where we've somehow defined
 16 |   $\text{similarity}$ using domain knowledge.
 17 |   In fact, why not do this for several training
 18 |   points to get several features?  For example, if we use three representative
 19 |   `landmarks' $x_\circ, x_\square, x_\star$ then we get a featurization
 20 |   $$
 21 |     x \mapsto (\text{similarity}(x, x_\circ),
 22 |                \text{similarity}(x, x_\square),
 23 |                \text{similarity}(x, x_\star))
 24 |     \in \Rr^3
 25 |   $$
 26 |   Taking this to an extreme, we can use all $N$ training points as landmarks:
 27 |   $$
 28 |     x \mapsto (\text{similarity}(x, x_0),
 29 |                \text{similarity}(x, x_1),
 30 |                 \cdots
 31 |                \text{similarity}(x, x_{N-1}))
 32 |     \in \Rr^N
 33 |   $$
 34 |   Though we chose our features cleverly, at the end of a day, we'll use them
 35 |   to make predictions the same way as before: we'll have a bunch of weights
 36 |   $w_i$, one for each feature and we'll classify a fresh $x$ according to the
 37 |   sign of
 38 |   $$
 39 |     \sum_i w_i \cdot (\text{$i$th feature of $x$})
 40 |     =
 41 |     \sum_i w_i \cdot \text{similarity}(x, x_i)
 42 |   $$
 43 |   That's our hypothesis class $\hH$.
 44 | 
 45 |   With $N$ features for $N$ training points, $\hH$ will (usually) be very
 46 |   expressive.
 47 |   On one hand, expressivity means $\hH$ contains many hypotheses that well-fit
 48 |   the training data but do horribly at testing time.
 49 |   On the other hand, if we've done a good job choosing \emph{especially
 50 |   informative} features, then $\hH$ will contain a hypothesis that does well on
 51 |   both training and testing data.
 52 |   Thus, regularization is crucial!\bovinenote{%
 53 |     We pay some generalization cost in return for reducing approximation cost.
 54 |   }
 55 | 
 56 | \newpage
 57 | \sampassage{superpositions and kernels}%\marginnote{\veryoptional}
 58 |   In this passage we'll discuss how, once we've featurized our $x$s by
 59 |   similarities, we'll select a hypothesis from $\hH$ based on
 60 |   training data.  As usual we can do ordinary gradient descent, the kind we're
 61 |   now used to.  But we'll here explore a different method, a special gradient descent.  The method
 62 |   is important because it offers a fast way to solve a seemingly different
 63 |   problem:
 64 |   $$
 65 |     \substack{\text{\small ordinary (slow) gradient descent}\\%
 66 |               \text{\small on data featurized as we please,}\\%
 67 |               \text{\small say by $x\mapsto \varphi(x)$}}
 68 |     \quad\text{\emph{is equivalent to}}\quad
 69 |     \substack{\text{\small special (fast) gradient descent}\\%
 70 |               \text{\small on data featurized according to}\\%
 71 |               \text{\small $\text{similarity}(x,x\pr)=\varphi(x)\cdot\varphi(x\pr)$}}
 72 |   $$
 73 | 
 74 |   Here's an analogy for the speedup.\bovinenote{%
 75 |     Instead of
 76 |     using pointers to implicitly
 77 |     arrange an array of high-memory-footprint objects
 78 |     into an ordering
 79 |     that helps compute a rank for a fresh $x$,
 80 |     we'll
 81 |     use numbers to implicitly
 82 |     arrange a training set of high-dimensional featurevectors
 83 |     into a formal linear combination
 84 |     that helps compute a label for a fresh $x$.
 85 |   }
 86 |   Imagine we are quick-sorting some array $[x_0, \cdots,
 87 |   x_{N-1}]$ of large objects.  It's expensive to keep swapping such large
 88 |   objects.  So instead, we cleverly create an array of pointers to the original
 89 |   objects, then sort those pointers, and only as a final step arrange the
 90 |   objects based on the sorted pointers.  That way we do $N$ large-object-swaps
 91 |   instead of $N\log N$.
 92 |   %
 93 |   Better yet, if the point of sorting was to allow us to quickly binary search
 94 |   to count how many array elements $x_k$ are less than any given $x$, then we
 95 |   can avoid large-object-swaps \emph{completely} (!) by binary searching
 96 |   through the array of pointers.
 97 | 
 98 |   Now for the two methods, ordinary and special.  Well, as
 99 |   we've seen already we can subtract gradients of loss with respect to $w$ ---
100 |   let's write this out as a reminder.
101 |   First, use
102 |   $d_i = w \cdot (\text{features of $x_i$})$
103 |   as shorthand for the decision function value at the $i$th training input.
104 |   Then, to reduce the loss $\ell_k = \ell(y_k, d_k)$ suffered at the $k$th training example,
105 |   we use the derivative $g_k = \partial \ell(y_k, d_k) / \partial d_k$:
106 |   \begin{table}\centering
107 |     \vspace{-0.2cm}
108 |   \begin{tabular}{cc}
109 |     \text{\gre ordinary, ${\rng w}$-based update}       &       \text{\gre special, ${\blu d}$-based update}\\
110 |     $\begin{aligned}w^{\text{new}}
111 |         &= w^{} - \eta \,
112 |     \frac
113 |     {\partial \, \ell_k}
114 |     {\partial \, {\rng w}^{}}\\
115 |         &= w^{} - \eta \,g_k (\text{features of $x_k$})
116 |     \end{aligned}$
117 |     &
118 |     $\begin{aligned}w^{\text{new}}
119 |         &= w^{} - \eta \,
120 |     \frac
121 |     {\partial \, \ell_k}
122 |     {\partial \, {\blu d}^{}}\\
123 |         &= w^{} - \eta \,
124 |       g_k (\text{$k$th one-hot vector})
125 |     \end{aligned}$
126 |   \end{tabular}
127 |     \vspace{0.2cm}
128 |   \end{table}
129 | 
130 |   Note that $w$ has as many entries as there are features and $d$ has as many
131 |   entries as there are training examples; so th special update only makes sense
132 |   because we've cleverly chosen a collection of features that is indexed by
133 |   training points!
134 |   %
135 |   Intuitively $d = X \cdot w$,\bovinenote{%
136 |     Here, $X$ is the $N\times N$ matrix whose $k$th row is the featurization
137 |     of the $k$th training input.  So $X_{ki} = \text{similarity}(x_k, x_i)$.
138 |   } so $w$ and $d$ are proportional and the
139 |   ordinary and special updates are just stretched versions of each other.
140 |   %
141 |   In multiple dimensions, different directions get stretched
142 |   different amounts; it's because of this that the two updates are
143 |   inequivalent and lead to different predictions at the end of the day.
144 | 
145 |   \emph{Why the heck would we want to do this?}
146 |   One answer is that we can transform expensive ordinary updates into a
147 |   mathematically-equivalent-but-computationally-cheap computations.  And those
148 |   computations are special updates.
149 | 
150 |   More precisely, say we have some featurization $\varphi:\xX\to \Rr^s$, for
151 |   instance the kind that you already knew about before we discussed
152 |   `similarity' and want to use ordinary updates to find $s$ many weights.  If
153 |   the number $s$ of features is huge, then each update will take a lot of time,
154 |   since it'll involve multiplying each of $s$ many features by a coefficient.
155 |   That's what we mean by `expensive'.
156 |   %
157 |   Each ordinary update adds some linear combination of training inputs to the
158 |   weights, so (if we initialize weights to zero) we can after any number of
159 |   steps write $\text{weightvector} = \sum_i \alpha_i \varphi(x_i)$.  But
160 | 
161 |   we define similarity as a dot product:
162 |   $$
163 |     \text{similarity}(x,x\pr) = \varphi(x) \cdot \varphi(x\pr)
164 |   $$ 
165 | 
166 | 
167 | into equivalent, cheap special updates
168 |   for a similarity-based featurization.
169 |   %One answer is that we can transform an ordinary update for inputs featurized
170 |   %by some $\varphi:\xX\to \Rr^s$
171 |   %into an \emph{equivalent} special update for inputs featurized according
172 |   %to similarity.
173 |   In this case, we define similarity as a dot product:
174 |   $$
175 |     \text{similarity}(x,x\pr) = \varphi(x) \cdot \varphi(x\pr)
176 |   $$ 
177 | 
178 |   One answer is that the special ${\blu d}$-based update for our similarity
179 |   features is equivalent to an ordinary update for different features.
180 | 
181 |   Intuitively, this says 
182 | 
183 | 
184 |   %
185 |   %
186 |   %%But because our features are now indexed by training points, there's
187 |   %%\emph{another}, inequivalent method!  The idea is to represent a hypothesis
188 |   %%not by the weight values $w_0, \cdots, w_{\text{number of features}-1)$
189 |   %%but instead by the decision function values $d_0, \cdots, d_{\text{number of training points}-1)$
190 |   %%on the training set: $d_i = w \cdot (\text{features of $x_i$})$.  Since
191 |   %%$\text{number of training points} = \text{number of features}$ this is probably
192 |   %%an okay parameterization.
193 |   %%Then we can subtract gradients with respect to $d$:
194 |   %%$$
195 |   %%  d^{\text{new}}
196 |   %%  = d^{\text{old}} - \eta \,
197 |   %%  \frac
198 |   %%  {\partial \, \ell(y_k, d_k^{\text{old}})}
199 |   %%  {\partial \, d^{\text{old}}}
200 |   %%$$
201 |   %%This just means 
202 | 
203 |   %For example, to reduce perceptron loss we'd make an updates
204 |   %\begin{align*}
205 |   %  w_i^{\text{new}}
206 |   %  &= w_i^{\text{old}} + y_k \times (\text{$i$th feature of $x_k$}) \times (\text{$1$ if $w^{\text{old}}$ misclassifies $(y_k, x_k)$ else $0$})
207 |   %  \\&= w_i^{\text{old}} + y_k \times \text{similarity}(x_k, x_i)\times (\text{$1$ if $w^{\text{old}}$ misclassifies $(y_k, x_k)$ else $0$})
208 |   %\end{align*}
209 |   %when $w^{\text{old}}$ misclassifies training example $(y_k, x_k)$.
210 | 
211 |   We have flexibility in designing our function
212 |   $\text{similarity}:\xX\times\xX\to\Rr$.  But for the function to be worthy
213 |   of the name, it should at least satisfy these two rules:\bovinenote{%
214 |     These generalize to stronger, subtler conditions that we'll discuss in the
215 |     next passage.
216 |   }
217 |   $x$ is as similar to $x\pr$ as $x\pr$ is to $x$
218 |   ($\text{similarity}(x,x\pr) = \text{similarity}(x\pr,x)$)
219 |   and $x$ is similar to itself
220 |   ($\text{similarity}(x,x) \geq 0$).
221 |   %
222 |   \attnsam{What happens if $x$ isn't similar to itself?  Perceptron goes
223 |   the wrong way!!}
224 | 
225 |   Let's look more into this.
226 |   Though we invented these features cleverly, we may use them the same
227 |   way as before.  For example, we can choose our weights $w$ using the
228 |   perceptron algorithm, if we wish.  This says that if we misclassify a training
229 |   example $(y_k, x_k)$ then we update
230 |   $$
231 |     w_i^{\text{new}}
232 |     = w_i^{\text{old}} + y_k \times (\text{$i$th feature of $x_k$})
233 |     = w_i^{\text{old}} + y_k \times \text{similarity}(x_k, x_i)
234 |   $$
235 | 
236 |   %
237 | 
238 |   There is a beautiful alternative view on kernels.
239 | 
240 |   Now, let's say a \textbf{superposition} is a formal combination like
241 |   $2 x_\circ - 0.1 x_\square + 0 x_\star$.  Here the addition and scalings
242 |   are just book-keeping devices.  Even if the $x$s are pretzels or french text
243 |   -- things we can't \emph{actually} add --- we can still write that formal
244 |   combination as a book-keeping device.  And when we \emph{do} have some
245 |   ordinary way of adding the $x$s --- maybe they are vectors --- we still
246 |   don't want to use that way of adding in this context.
247 | 
248 |   The point of a superposition
249 | 
250 | \sampassage{quantiles and decision trees}\marginnote{veryoptional}
251 |   There are many other good ideas for choosing featurizations based on data.
252 |   Here's one: \emph{rescale a feature based on the distributions of its values
253 |   in the training data}.
254 | 
255 |   From quantiles to binning.
256 | 
257 | 
258 | 
259 |   We won't discuss them in lecture, but \textbf{decision trees} can be very
260 |   practical: at their best they offer fast learning, fast prediction,
261 |   interpretable models, and robust generalization.  Trees are discrete so we
262 |   can't use plain gradient descent; instead, we train decision trees by
263 |   greedily growing branches from a stump.  We typically make predictions by
264 |   averaging over ensembles --- ``forests'' --- of several decision trees each
265 |   trained on the training data using different random seeds.
266 | 
267 | \sampassage{linear dimension-reduction}
268 |   There are many other good ideas for choosing featurizations based on data.
269 |   Here's one: \emph{if some raw features are (on the training data) highly
270 |   correlated}, collapse them into a single feature.  Beyond saving computation
271 |   time, this can improve generalization by reducing the number of parameters to
272 |   learn.  We lose information in the collapse --- the small deviations of those
273 |   raw features from their average\bovinenote{%
274 |      or more precisely, from a properly scaled average
275 |   } --- so to warrant this collapse we'd want justification from domain knowledge
276 |   that those small deviations are mostly irrelevant noise.
277 | 
278 |   More generally, we might want to 
279 | 
280 | %\sampassage{matrix factorization and pca}\marginnote{veryoptional}
281 |   One way of understanding such linear dimension-reduction is matrix
282 |   factorization.  I mean that we want to approximate our $N\times D$ matrix $X$
283 |   of raw features as $X \approx F C$, a product of an $N\times R$ matrix $F$ of
284 |   processed features with an $R\times D$ matrix $C$ that defines each processed
285 |   feature as a combination of the raw features.
286 | 
287 |   There's \textbf{principal component analysis}.
288 | 
289 |   As a fun application, we can fix a corrupted row (i.e., vector of raw
290 |   features for some data point) of $X$ by replacing it with the corresponding
291 |   row of $F C$.  We expect this to help when the character of the corruption
292 |   fits our notion of ``$\approx$''.  For example, if the corruption is small
293 |   in an L2 sense then PCA is appropriate.
294 |   \attnsam{collaborative filtering}
295 | 


--------------------------------------------------------------------------------
/tex-source-sandbox/body.w.2.priors-and-generalization.tex:
--------------------------------------------------------------------------------
  1 |       \samquote{
  2 |         A child's education should begin at least 100 years before [they are]
  3 |         born.
  4 |       }{oliver wendell holmes jr}
  5 | 
  6 | 
  7 | %~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  8 | %~~~~~~~~~~~~~  2.8. on overfitting  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  9 | 
 10 |       \sampassage{on overfitting}
 11 |         In the Bayesian framework, we optimistically assume that our model is
 12 |         ``correct'' and that the ``true'' posterior over parameters is
 13 |         $$
 14 |             p(w|y;x) = p(y|w;x) p(w) / Z_x
 15 |         $$
 16 |         a normalized product of likelihood and prior.
 17 |         %
 18 |         Therefore, our optimal guess for a new prediction is:
 19 |         $$
 20 |             p(y_\star;x_\star, x)
 21 |             = \sum_w p(y_\star|w;x_\star) p(y|w;x) p(w) / Z_x
 22 |         $$
 23 | 
 24 |         For computational tractability, we typically approximate the posterior
 25 |         over hypotheses by a point mass at the posterior's mode
 26 |         $\text{argmax}_{w^\prime} p(w^\prime|y;x)$:
 27 |         $$
 28 |             p(w|y,x) \approx \delta(w - w^\star(y,x))
 29 |             \quad\quad
 30 |             w^\star(y,x)=\text{argmax}_{w^\prime} p(w^\prime|y;x)
 31 |         $$
 32 |         Then
 33 |         $$
 34 |             p(y_\star;x_\star, x)
 35 |             \approx p(y_\star|w^\star(y,x);x_\star)
 36 |         $$
 37 | 
 38 |         What do we lose in this approximation?
 39 | 
 40 |         \attnsam{$\Ee$ and $\max$ do not commute}
 41 | 
 42 |         \attnsam{PICTURE: ``bowtie'' vs ``pipe''}
 43 | 
 44 |         \attnsam{BIC for relevant variables}
 45 | 
 46 | 
 47 |         %\attnsam{point estimates vs bayesian decision theory}
 48 | 
 49 |         %\attnsam{interpolation does not imply overfitting}
 50 | 
 51 | 
 52 | %~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 53 | %~~~~~~~~~~~~~  2.9. log priors and bayes  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 54 | 
 55 |       \sampassage{log priors and bayes}
 56 |         \attnsam{fill in computation and bases}
 57 |         \attnsam{visual illustration of how choice of L2 dot product matters}
 58 |         \attnsam{$\ell^p$ regularization; sparsity}
 59 |         \attnsam{eye regularization example!}
 60 |         \attnsam{TODO: rank as a prior (for multioutput models)}
 61 | 
 62 |         %The so-called $\ell^p$ priors are popular.  For $1\leq p<\infty$,
 63 |         %these are defined by:
 64 |         For $1\leq p\leq \infty$ and $1\leq q\leq \infty$ we can consider this prior:\bovinenote{%
 65 |           To define the cases $p=\infty$ or $q=\infty$ we take limits.
 66 |           To define the case $p=\infty=q$ we take limits while maintaining $p=q$.
 67 |         }
 68 |         $$
 69 |             p(w) \propto \exp\left(-\left(\sum_i |\lambda w_i|^p\right)^{q/p}\right)
 70 |         $$
 71 |         For $q=p$ this decomposes as a sum and thus has each coordinate
 72 |         independent.  Whereas $q$ limits outliers, $p$ controls the shape of
 73 |         level curves.  Small $q$ makes large vectors more probable and
 74 |         small $p$ makes it more probable that the entries within a vector will
 75 |         be of very different sizes.
 76 |         \newcommand{\priors}[1]{\includegraphics[width=2.9cm]{priors/yo-#1}}
 77 |         \newcommand{\smarsh}[1]{\vspace{0.125cm}\smash{\parbox[c]{3cm}{#1}}\vspace{0.525cm}}
 78 |         \begin{table}
 79 |           \centering
 80 |           \begin{tabular}{m{1.1cm}SSS}%
 81 |                                 & $q=1$                 & $q=2$                 & $q=\infty$              \\%
 82 |             \smarsh{     $p=1$} &\smarsh{\priors{1-1}}  &\smarsh{\priors{1-2}}  &\smarsh{\priors{1-inf}}  \\%
 83 |             \smarsh{     $p=2$} &\smarsh{\priors{2-1}}  &\smarsh{\priors{2-2}}  &\smarsh{\priors{2-inf}}  \\%
 84 |             \smarsh{$p=\infty$} &\smarsh{\priors{inf-1}}&\smarsh{\priors{inf-2}}&\smarsh{\priors{inf-inf}}\\%
 85 |           \end{tabular}
 86 |         \end{table}
 87 | 
 88 |         %\begin{table}
 89 |         %    \centering
 90 |         %    \begin{tabular}{ccccccc}
 91 |         %        p         &       q         &  sample A          &  sample B          &   shape       & indep.?       & outliers    \\\hline% sample A            sample B
 92 |         %        $1     $  &       $1$       & $( 0.5,-0.1, 0.7)$ & $(14.4,-0.9,-0.9)$ & octahedron    & $\checkmark$  & many        \\      %$( 0.6, 0.1, 0.9)$  $( 1.6, 0.1, 0.1)$
 93 |         %        $1     $  &       $2$       & $( 0.6,-0.1, 0.9)$ & $( 4.8,-0.3,-0.3)$ & octahedron    &               & few         \\      %$( 0.6, 0.1, 0.9)$  $( 1.6, 0.1, 0.1)$
 94 |         %        $1     $  &       $\infty$  & $( 0.7,-0.1, 1.1)$ & $( 1.6,-0.1,-0.1)$ & octahedron    &               & none        \\      %$( 0.6, 0.1, 0.9)$  $( 1.6, 0.1, 0.1)$
 95 |         %        $2     $  &       $1$       & $( 0.3,-0.2, 0.8)$ & $(13.5,-7.2,-2.7)$ & sphere        &               & many        \\      %$( 0.4, 0.2, 1.0)$  $( 1.5, 0.8, 0.3)$
 96 |         %        $2     $  &       $2$       & $( 0.4,-0.2, 1.0)$ & $( 4.5,-2.4,-0.9)$ & sphere        & $\checkmark$  & few         \\      %$( 0.4, 0.2, 1.0)$  $( 1.5, 0.8, 0.3)$
 97 |         %        $2     $  &       $\infty$  & $( 0.5,-0.2, 1.2)$ & $( 1.5,-0.8,-0.3)$ & sphere        &               & none        \\      %$( 0.4, 0.2, 1.0)$  $( 1.5, 0.8, 0.3)$
 98 |         %        $\infty$  &       $1$       & $( 0.4,-0.2, 0.7)$ & $( 9.0,-8.1,-7.2)$ & cube          &               & many        \\      %$( 0.5, 0.3, 0.9)$  $( 1.0, 0.9, 0.8)$
 99 |         %        $\infty$  &       $2$       & $( 0.5,-0.3, 0.9)$ & $( 3.0,-2.7,-2.4)$ & cube          &               & few         \\      %$( 0.5, 0.3, 0.9)$  $( 1.0, 0.9, 0.8)$
100 |         %        $\infty$  &       $\infty$  & $( 0.6,-0.4, 1.1)$ & $( 1.0,-0.9,-0.8)$ & cube          & $\checkmark$  & none        \\      %$( 0.5, 0.3, 0.9)$  $( 1.0, 0.9, 0.8)$
101 |         %    \end{tabular}
102 |         %\end{table}
103 |         %\begin{description}
104 |         %    \item[$(p,q)=(1     ,1)$ --- indep laplacian        ]
105 |         %    \item[$(p,q)=(1     ,2)$ ---                        ]
106 |         %    \item[$(p,q)=(2     ,1)$ ---                        ]
107 |         %    \item[$(p,q)=(2     ,2)$ --- indep\&radial gaussian ]
108 |         %    \item[$(p,q)=(\infty,1)$ ---                        ]
109 |         %    \item[$(p,q)=(\infty,2)$ ---planckian               ]
110 |         %\end{description}
111 | 
112 | %~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
113 | %~~~~~~~~~~~~~  2.10. hierarchy, mixtures, transfer  ~~~~~~~~~~~~~~~~~~~~~~~~~~~
114 | 
115 |       \sampassage{hierarchy, mixtures, transfer}
116 |         \attnsam{k-fold cross validation}
117 |         %\attnsam{dimension-based generalization bound}
118 |         \attnsam{bayesian information criterion}
119 | 
120 | %~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
121 | %~~~~~~~~~~~~~  2.11. estimating generalization  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
122 | 
123 |       \sampassage{estimating generalization}
124 |         \attnsam{k-fold cross validation}
125 |         \attnsam{dimension-based generalization bound}
126 |         \attnsam{bayesian information criterion}
127 | 
128 | 
129 | 


--------------------------------------------------------------------------------
/tex-source-sandbox/body.w.3.model-selection.tex:
--------------------------------------------------------------------------------
  1 | \samquote{
  2 |   All human beings have three lives: public, private, and secret.
  3 | }{gabriel garc\`ia marquez}
  4 | 
  5 | %-- featurization
  6 | %-- generalization bounds and BC
  7 | %-- hyperparameter search
  8 | %-- double descent
  9 | 
 10 | \sampassage{taking stock so far}
 11 |   By \textbf{model selection} we mean the selection of all those design
 12 |   parameters --- featurization and other `architecture', optimization method.
 13 | 
 14 |   The story of model selection has to do with approximation, optimization,
 15 |   and generalization.
 16 | 
 17 | \sampassage{grid/random search}
 18 | 
 19 | \sampassage{selecting prior strength}
 20 | 
 21 | \sampassage{overfitting on a validation set}
 22 | 
 23 | 
 24 | 
 25 | 
 26 | 
 27 | \newpage
 28 |     \samsection{4. generalization bounds}
 29 |       \samquote{
 30 |         A foreign philosopher rides a train in Scotland.  Looking out the window,
 31 |         they see a black sheep; they exclaim: ``wow!  in Scotland at least one side of one sheep is black!''
 32 |       }{unknown}
 33 | 
 34 | %~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 35 | %~~~~~~~~~~~~~  2.16. dot products and generalization  ~~~~~~~~~~~~~~~~~~~~~~~~
 36 | 
 37 |       \sampassage{dot products and generalization}
 38 |       %\sampassage{perceptron bound}
 39 | 
 40 | %~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 41 | %~~~~~~~~~~~~~  2.17. dimension bound  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 42 | 
 43 |       %\sampassage{hypothesis-class-based bounds}
 44 |       \sampassage{hypothesis-geometry bounds}
 45 |         Suppose we are doing binary linear classification with $N$ training
 46 |         samples of dimension $d< N$.  Then with probability at least $1-\eta$
 47 |         the gen gap is at most:
 48 |         %\bovinenote{%
 49 |         %    %\blarr We write $\log_{\!+}(z)$ for $\log(\max(1,z))$.
 50 |         %    %\blarr we write $a^\cdot, a^{\cdot\cdot}$ for $(a+1), (a+2)$ to
 51 |         %    %avoid overemphasizing annoying constants.
 52 |         %}
 53 |         $$
 54 |           \sqrt{\frac{d\log(6N/d) + \log(4/\eta)}{N}}
 55 |         $$
 56 |         For example, with $d=16$ features and tolerance $\eta=1/1000$, we
 57 |         can achieve a gen.\ gap of less than $5\%$ once we have more than
 58 |         $N\approx 64000$ samples.  This is pretty lousy.  It's a worst case
 59 |         bound in the sense that it doesn't make any assumptions about how
 60 |         orderly or gnarly the data is.
 61 | 
 62 |         If we normalize so that $\|x_i\|\leq R$ and we insist on classifiers
 63 |         with margin at least $0<m\leq R$, then we may replace $d$ by
 64 |         $\lceil 1+(R/m)^2 \rceil$ if we wish, so long as we count each
 65 |         margin-violator as a training error, even if it is correctly
 66 |         classified.
 67 | 
 68 |         \attnsam{CHECK ABOVE!}
 69 | 
 70 |         Thus, if $R=1$ and $1 \leq Nm$ then with chance at least $1-\eta$:
 71 |         \begin{align*}
 72 |             \text{testing error} \leq &~\frac{\text{(number of margin violators)}}{N}
 73 |           +\\
 74 |             &~\sqrt{\frac{(2/m^2) \log(6N m^2) + \log(4/\eta)}{N}}
 75 |         \end{align*}
 76 | 
 77 |         \attnsam{dimension/margin}
 78 | 
 79 | %~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 80 | %~~~~~~~~~~~~~  2.18. margin bound  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 81 | 
 82 |       \sampassage{optimization-based bounds}
 83 |         Another way to estimate testing error is through \textbf{leave-one-out
 84 |         cross validation} (LOOCV).  This requires sacrifice of a single
 85 |         training point in the sense that we need $N+1$ data points to do LOOCV
 86 |         for an algorithm that learns from $N$ training points.
 87 |         %
 88 |        %
 89 |         The idea is that after training on the $N$, the
 90 |         testing-accuracy-of-hypotheses-learned-from-a-random-training-sample is
 91 |         unbiasedly estimated by the learned hypothesis's accuracy on the
 92 |         remaining data point.  This is a very coarse, high-variance estimate.
 93 |         To address the variance, we can average over all $N+1$
 94 |         choices\bovinenote{%
 95 |           In principle, LOOCV requires training our model $N+1$ many times; we'll
 96 |           soon see ways around this for the models we've talked about.
 97 |         }
 98 |         of which data point to remove from the training set.  When the
 99 |         different estimates are sufficiently uncorrelated, this drastically
100 |         reduces the variance of our estimate.
101 | 
102 |         Our key to establishing sufficient un-correlation lies in
103 |         \emph{algorithmic stability}: the hypothesis shouldn't change too much
104 |         as a function of small changes to the training set; thus, most of the
105 |         variance in each LOOCV estimate is due to the testing points, which by
106 |         assumption are independent.
107 | 
108 |         If all $x$s, train or test, have length at most $R$, then
109 |         we have that with chance at least $1-\eta$:
110 |         $$
111 |             \Ee_{\sS}
112 |             \text{testing error}
113 |             \leq
114 |             \Ee_{\sS}
115 |             \text{LOOCV error}
116 |             +
117 |             \sqrt{\frac{1+6R^2/\lambda}{2N\eta}}
118 |         $$
119 | 
120 |         %together with the
121 |         %boundedness and lipschitzness of the ramp function, the LOOCV estimate
122 |         %probably does not severely underestimate the true testing error:
123 |         %$$
124 |         %    \sqrt{\frac{1+6R^2/\lambda}{2N\delta}}
125 |         %    +
126 |         %    \sum_{i} \text{ramp}\left(\frac{\nu^i}{\lambda} \|x^i\|^2 - y^i w\cdot x^i\right)
127 |         %$$
128 | 
129 | 
130 |         %% support/sensitiviy bounds
131 |         %Leave-one-out cross-validation gives an unbiased (but not quite
132 |         %1/root n) estimate of generalization gap for N one less than true.
133 |         %%
134 |         %We can proceed further when we're optimizing
135 |         %$$
136 |         %    \lL(w) = \frac{\lambda}{2} \|w\|^2 + \sum_i \ell(-y^i w\cdot x^i)
137 |         %$$
138 |         %where $\ell:\Rr\to\Rr$ is a convex function such as $\text{hinge}$ or
139 |         %$\text{softplus}$.  Suppose that $\ell(d)\geq 0$ and $\ell(d)\to 0$ as
140 |         %its $d\to-\infty$ and $\ell(d)-d \to C$ as $d\to+\infty$ (so slope $1$).
141 | 
142 |         %The idea is that the optimal $w$ for one less training point shouldn't
143 |         %be that different from the optimal $w$.  If at a training point $i$ we
144 |         %have $d\ell(-y^i w\cdot x^i)/dw$ has norm $\nu^i\|x^i\|$ (largest
145 |         %subgradient--- aka the ``supportiveness of training point i''), then
146 |         %taking out that point shifts $w$ by a distance at most $\nu^i\|x^i\|/\lambda$.
147 |         %So an upper bound for the LOOCV unbiased estimate of the testing
148 |         %error (for N one less) is:
149 |         %$$
150 |         %    \sum_{i} \text{ramp}\left(\frac{\nu^i}{\lambda} \|x^i\|^2 - y^i w\cdot x^i\right)
151 |         %$$
152 |         %where $\text{ramp}(z) = \min(1,\max(0,1+z))$.
153 | 
154 | %~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
155 | %~~~~~~~~~~~~~  2.19. bayes and testing  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
156 | 
157 |       \sampassage{bayes and testing}
158 |         % PAC Bayes?  testing Sets?  or maybe testing sets should be part of model selection
159 | 
160 | 
161 | 


--------------------------------------------------------------------------------
/tex-source-sandbox/body.z.2.bovinity.tex:
--------------------------------------------------------------------------------
 1 | \objectives{%
 2 |   \item \attnsam{FILLIN}
 3 |   \item \attnsam{FILLIN}
 4 | }
 5 | 
 6 | % solving other classification tasks by the same method
 7 | \sampassage{worked example: bovinity of reddit posts}
 8 |   Digits are nice.  Let's solve a couple other tasks by the same method.  This
 9 |   illustrates which aspects are design decisions and which aren't.  We'll start
10 |   with a text-classification task.
11 | 
12 |   %\textbf{reddit posts about cows vs dogs}
13 |   We gather the text of $\sim 2000$ reddit posts, half from \texttt{r/cow} and
14 |   half from \texttt{r/dog}.  \emph{Can we predict from text alone which of the
15 |   two subreddits a post came from?}
16 | 
17 |   Intuitively, words like
18 |   \texttt{cow},
19 |   \texttt{hoof},
20 |   \texttt{moo},
21 |     or
22 |   \texttt{dog},
23 |   \texttt{paw},
24 |   \texttt{bark}
25 |   are tell-tale signs.  So for our featurization, let's have a feature for
26 |   each $3$-letter and each $4$-letter word.  The feature for the word
27 |   \texttt{hoof} simply maps a text input $x$ in $\xX$ to a real number that's
28 |   $1$ if the word appears in the post and $0$ otherwise.  Likewise with the
29 |   other features.
30 | 
31 | 
32 | 


--------------------------------------------------------------------------------
/tex-source-sandbox/body.z.3.priors.tex:
--------------------------------------------------------------------------------
  1 | \objectives{%
  2 |   \item {explain how regularization, in its incarnation as
  3 |          margin-maximization, counters data terms to improve generalization}
  4 |   \item {write a regularized ML program (namely, an SVM),
  5 |                  to classify high-dimensional data}
  6 | }
  7 | 
  8 | \sampassage{how good is a hypothesis?  plausibility}
  9 |   Now to define intrinsic plausiblity, also known as a \textbf{regularizer}.
 10 |   %term.
 11 |   We find a hypothesis more plausible when its ``total amount of
 12 |   dependence'' on the features is small.
 13 |   %
 14 |   So we'll focus for now on capturing this intution:
 15 |   \emph{a hypothesis that depends a lot on many features is less
 16 |   plausible}.\bovinenote{%
 17 |     There are many other aspects we might design a regularizer
 18 |     to capture, e.g.\ a domain's symmetry.
 19 |     The regularizer is in practice a key point where we inject domain
 20 |     knowledge.
 21 |   }
 22 |   %
 23 |   We may conveniently quantify this as
 24 |   proportional to a sum of squared weights (jargon: \textbf{L2}):\bovinenote{%
 25 |     \noparexercise{%
 26 |       When $(a,b)$ represent weights for brightness-width digits features, how
 27 |       do hypotheses with small $a^2 + b^2$ visually differ from ones with
 28 |       small $6.86 a^2+b^2$ (a perfectly fine variant of our
 29 |       `implausibility')?
 30 |     }
 31 |   }
 32 |   $
 33 |     \text{implausibility of $h=(a,b, \cdots)$}
 34 |     =
 35 |     \lambda (a^2 + b^2 + \cdots)
 36 |   $.  In code:
 37 |   \begin{lstlisting}[language=Python, basicstyle=\footnotesize\ttfamily]
 38 |     LAMBDA = 1.
 39 |     def implausibility(a,b):
 40 |       return LAMBDA * np.sum(np.square([a,b]))
 41 |   \end{lstlisting}
 42 |   Intuitively, the constant $\lambda$=\texttt{LAMBDA} tells us how much we care
 43 |   about plausibility relative to goodness-of-fit-to-data.
 44 | 
 45 |   Here's what the formula means.
 46 |   Each of three friends has a theory\bovinenote{%
 47 |   \textbf{AJ}
 48 |     insists a bird with a wings shorter than 1ft can't fly far, so it's
 49 |     \emph{sure} to sing; Conversely, birds with longer wings never sing.
 50 |   \textbf{Pat}
 51 |     checks if the bird grows red feathers, eats shrimp, lives near ice, wakes
 52 |     in the night, and has a bill.  If and only if an even number of these $5$
 53 |     qualities are true, the bird probably sings.
 54 |   \textbf{Sandy}
 55 |     says shorter wings and nocturnality both make a bird somewhat more likely
 56 |     to sing.
 57 |     }
 58 |   about which birds sing.
 59 |   %
 60 |   Which theory do we prefer?  Well, \textbf{AJ} seems too confident.  Wingspan
 61 |   may matter but probably not so decisively.  \textbf{Pat} avoids
 62 |   black-and-white claims, but Pat's predictions depend substantively on many
 63 |   features: flipping any one quality flips their prediction.  This seems
 64 |   implausible.  By contrast, \textbf{Sandy}'s hypothesis doesn't depend too
 65 |   strongly on too many features.  To me, a bird non-expert, Sandy's seems most
 66 |   plausible.
 67 | 
 68 |   Now we can define the overall undesirability of a hypothesis:\bovinenote{%
 69 |     We'll use SVM loss but feel free to plug in other losses to get
 70 |     different learning behaviors!
 71 |   }
 72 |   \begin{lstlisting}[language=Python, basicstyle=\footnotesize\ttfamily]
 73 |     def objective_function(examples,a,b):
 74 |       data_term = np.sum([svm_loss(x,y,a,b) for x,y in examples])
 75 |       regularizer = implausibility(a, b)
 76 |       return data_term + regularizer
 77 |   \end{lstlisting}
 78 | 
 79 | \sampassage{margins}
 80 |   To build intuition
 81 |   %about which hypotheses are most desirable according to that metric,
 82 |   let's suppose $\lambda$ is a tiny positive number.  Then
 83 |   minimizing the objective function is the same as minimizing the data term,
 84 |   the total SVM loss: our notion of implausibility breaks ties.
 85 | 
 86 |   \begin{marginfigure}[0cm]
 87 |     \centering
 88 |     \picturew{0.99\textwidth}{margin}
 89 |     \caption{%
 90 |       \textbf{Balancing goodness-of-fit against intrinsic plausibility leads
 91 |       to hypotheses with large margins.}
 92 |       %\textbf{IGNORE the rightmost {\rng orange point} until we say otherwise!}
 93 |         A hypothesis's \textbf{margin} is its distance to the closest correctly
 94 |         classified training point(s).  Short stems depict these distances for
 95 |         two hypotheses (\textbf{black}, {\gre\textbf{gray}}).
 96 |         %
 97 |         If not for the rightmost {\rng orange point}, we'd prefer \textbf{black} over
 98 |         {\gre\textbf{gray}} since it has larger margins.  With large $\lambda$ (i.e., strong
 99 |         regularization), we might prefer black over gray even with that
100 |         rightmost {\rng orange point} included, since expanding the margin
101 |         is worth the single misclassification.
102 |       %For convenience we set the origin to the intersection of the two
103 |       %hypotheses.  That way we can still say that every hypothesis's decision
104 |       %boundary goes through
105 |       %the origin.
106 |     }
107 |   \end{marginfigure}
108 | 
109 |   Now, how does it break ties?  Momentarily ignore the Figure's rightmost {\rng
110 |   orange point} and consider the black hypothesis; its predictions depend only
111 |   on an input's first (vertical) coordinate, so it comes from weights of the
112 |   form $(a,b) = (a,0)$.
113 |   %
114 |   The $(a,0)$ pairs differ in SVM loss.  If
115 |   $a\approx 0$, each point has leeway close to $0$
116 |   and thus SVM loss close to $1$; conversely, if $a$ is huge, each
117 |   point has leeway very positive and thus SVM loss equal to
118 |   the imposed floor: $0$.  So SVM loss is $0$ as long as
119 |   $a$ is so big that each leeway to exceed $1$.
120 | 
121 |   Imagine sliding a point through the plane.  Its leeway is $0$ at the
122 |   black line and changes by $a$ for every unit we slide vertically.
123 |   %
124 |   So the farther the point is from the black line, the less $a$
125 |   must be before leeway exceeds $1$ --- and the happier is
126 |   the regularizer, which wants $a$ small.
127 |   % TODO BEACH, WATER, SLOPE story
128 |   % TODO Interpreting leeway as a measure of confidence.
129 |   So \emph{minimizing SVM loss with an L2 regularizer favors decision
130 |   boundaries far from even the closest correctly classified points!}  The black
131 |   line's margins exceed the gray's, so we favor black.
132 | 
133 |   For large $\lambda$, then this margin-maximization tendency can be so
134 |   strong that it overrides the data term.  Thus, even when we bring back
135 |   the rightmost {\rng orange point} we ignored, we might prefer the black
136 |   hypothesis to the gray one.
137 | 
138 | \newpage
139 |   Now for some really good intuition-building brain-food!
140 |   \exercise{%
141 |     Identify which point on the {\gre gray curve} to the right corresponds to
142 |     $\lambda=0$.  How about $\lambda=\infty$?
143 |   }
144 |   \exercise{%
145 |     We have two weight coefficients (corresponding to the horizontal and
146 |     vertical axes of the Figure).  Based on the {\blu fit-to-data} term, which
147 |     coefficient is the loss more sensitive to?
148 |     %(That is, if we are at an optimum for $\lambda=0$, then )
149 |   }
150 |   \exercise{%
151 |     Observe that the weight-vs-$\lambda$ {\gre trajectory} is curved: it
152 |     doesn't interpolate linearly between its $\lambda=0$ and $\lambda=\infty$
153 |     values.  Which weight (horizontal or vertical) gets suppressed `first'
154 |     as we increase $\lambda$ from $0$?
155 |   }
156 |   \exercise{%
157 |     By thinking about points at which {\blu blue} and {\rng orange} contours
158 |     are mutually tangent, sketch the weight-vs-$\lambda$ trajectory described
159 |     in the Figure.  That is: check that the Figure is right!
160 |   }
161 |   \begin{marginfigure}[-2cm]
162 |     \centering
163 |     \picturew{0.99\textwidth}{quad-reg.png}%
164 |     \caption{%
165 |       \textbf{Regularization suppresses
166 |       different features \emph{by different amounts}.}
167 |       %
168 |       We show a contour plot of loss terms over 2D weight space: an {\rng L2
169 |       regularizer} and a {\blu fit-to-data} term.  As we vary $\lambda$ from
170 |       $0$ ({\rng L2} doesn't matter) toward $\infty$ ({\blu data } doesn't
171 |       matter), the optimal weight changes.  We show this weight-vs-$\lambda$
172 |       trajectory in {\gre gray}.
173 |       %
174 |       \textbf{Warning}:
175 |       For the perceptron and hinge notions of fit-to-data, the latter term
176 |       won't look so smooth.  Still, the moral about regularization applies.
177 |       (And future models
178 |       we'll discuss (logistic models, least-squares regression, etc) \emph{are}
179 |       smooth.)
180 |       \attnsam{TODO: expand on caption}
181 |     }
182 |   \end{marginfigure}
183 | 
184 | 
185 | 
186 | 
187 |   \vfill
188 | \sampassage{optimization}
189 |   Now that we've defined our objective function,
190 |   %(repeated below for easy
191 |   %reference),
192 |   we want to find a hypothesis $h=(a,b)$ that minimizes it.
193 |   %
194 |   We've already discussed how to nudge the weight vector to reduce the badness-of-fit for a datapoint.
195 |   How do we nudge it to reduce the implausibility?
196 |   Well, we reduce the $\lambda$ term simply by moving $a,b$ closer to $0$!
197 |   That is, we combine an update of the form
198 |   $$
199 |     w^{\text{new}}
200 |     =w^{\text{old}} - \lambda w^{\text{old}}
201 |   $$
202 |   with the data update.
203 | 
204 |   %\bovinenote{%
205 |   (
206 |     To get this to match our objective exactly, we should actually write $2\lambda/N$ instead
207 |     of $\lambda$.  The $2$ comes from the second power in L2's definition; the $1/N$,
208 |     more importantly, comes from the fact that we have $N$ data terms but just
209 |     $1$ plausiblity term.  So if we work row-by-row (datapoint-by-datapoint),
210 |     we ought to divvy up the plausibility term into $N$ many terms, each of strength $\lambda/N$.
211 |     %
212 |     At this point, we can just abstract this reasoning away by defining a new
213 |     constant --- say $L$ --- that secretly is $2\lambda/N$.
214 |     Later, it'll be good to know where $L$ comes from.
215 |     )
216 |   %}
217 | 
218 |   %
219 |   We end up with
220 |   %
221 |   %Well, we can reduce a row's $\lambda/N$ term by moving $a,b$ closer to $0$.
222 |   \begin{lstlisting}[language=Python, basicstyle=\footnotesize\ttfamily]
223 |     ETA = 0.01
224 |     ab = initialize()
225 |     for t in range(10000):
226 |       xfeatures, y = fetch_datapoint_from(training_examples)
227 |       ab = ab + ETA * ( - L * ab
228 |                         + y * xfeats * (0 if max(0., y*ab.dot(xfeatures))==0 else 1) )
229 |   \end{lstlisting}
230 |   This is the \textbf{pegasos algorithm} we'll see in the project.
231 |   Soon we'll formalize and generalize this algorithm using calculus.
232 |   \begin{marginfigure}[-8cm]
233 |     \centering
234 |     \picturedw{0.99\textwidth}{example-mnist/train-weights-HingeReg}
235 |     \caption{%
236 |         \attnsam{REPLACE}
237 |       With $\lambda=0.02$ the objective visibly prefers weights near $0$.
238 |       We develop an algorithm to take steps in this plane
239 |       toward the minimum, `rolling down' the hill so to speak.
240 |     }
241 |   \end{marginfigure}
242 | 
243 | 
244 | 
245 |   \exercise{%
246 |     We've discussed the L2 regularizer.  Also common is the L1 regularizer:
247 |     $
248 |       \text{implausibility of $h=(a,b, \cdots)$}
249 |       =
250 |       \lambda (|a| + |b| + \cdots)
251 |     $.
252 |     Hypotheses optimized with strong L1 regularization will tend to have
253 |     zero dependence on many features.  Explain to yourself and then to a friend
254 |     what the previous sentence means, why it is true, and how we might exploit
255 |     it in practice.
256 |   }
257 | 
258 | 
259 | 
260 | \newpage
261 |   \vfill
262 | \sampassage{occam's razor}\marginnote{\veryoptional}
263 |   Did you feel not-quite-convinced by the AJ-Pat-Sandy example above?
264 |   We said: ``\emph{Pat's predictions depend substantively on many
265 |   features: flipping any one quality flips their prediction.  This seems
266 |   implausible.}'' --- does this really feel implausible, and if so, why?
267 | 


--------------------------------------------------------------------------------
/tex-source-sandbox/body.z.3.see-around-walls.tex:
--------------------------------------------------------------------------------
 1 | \objectives{%
 2 |   \item \attnsam{FILLIN}
 3 |   \item \attnsam{FILLIN}
 4 | }
 5 | 
 6 | \sampassage{worked example: seeing around walls}
 7 |   %\textbf{seeing around walls}
 8 |   Let's collect 200 photos of a certain MIT hallway corner.  In half, there's
 9 |   some large obstacle (e.g.\ a person) right around the corner.  In the other
10 |   half, there's no obstacle.  \emph{Can we distinguish these cases from pixels
11 |   alone?}
12 | 
13 |   Intuitively, if this prediction is possible it'd be based on subtle shading
14 |   arising from multiply reflected light.  So we'll probably want to
15 |   invent features to do with brightness.
16 | 
17 | % TODO : integrate approximation, optimization, generalization
18 | % into these two examples
19 | %\sampassage{improving approximation, optimization, generalization}
20 | 
21 | 


--------------------------------------------------------------------------------
/tex-source-sandbox/sammy.sty:
--------------------------------------------------------------------------------
  1 | % author:   samtenka
  2 | % change:   2023-03-28
  3 | % create:   2022-05-11
  4 | 
  5 | \newcommand{\picturedw}[2]{\includegraphics[width=#1]{#2}}
  6 | \newcommand{\picturew}[2]{\includegraphics[width=#1]{figures/#2}}
  7 | \newcommand{\pictureh}[2]{\includegraphics[height=#1]{figures/#2}}
  8 | 
  9 | %==============================================================================
 10 | %====  0.  DOCUMENT SETTINGS  ================================================
 11 | %==============================================================================
 12 | 
 13 | %~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 14 | %~~~~~~~~~~~~~  0.0. About this Exposition  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 15 | 
 16 | %---------------------  0.0.1. math packages  ---------------------------------
 17 | \newcommand\hmmax{0} % to allow for more fonts
 18 | \newcommand\bmmax{0} % to allow for more fonts
 19 | \usepackage{amsmath, amssymb, amsthm, mathtools}
 20 | \usepackage{bm}
 21 | \usepackage{euler}
 22 | 
 23 | \usepackage{array}   % for \newcolumntype macro
 24 | \newcolumntype{L}{>{$}l<{$}} % math-mode version of "l" column type
 25 | \newcolumntype{C}{>{$}c<{$}} % math-mode version of "c" column type
 26 | \newcolumntype{R}{>{$}r<{$}} % math-mode version of "r" column type
 27 | 
 28 | \newcolumntype{S}{ >{\centering\arraybackslash} m{3cm} } % vertically and horizontally centered
 29 | 
 30 | %---------------------  0.0.2. graphics packages  -----------------------------
 31 | \usepackage{graphicx, xcolor}
 32 | \usepackage{float, capt-of}
 33 | \usepackage{soul}
 34 | 
 35 | %---------------------  0.0.3. packages for fancy text  -----------------------
 36 | \usepackage{enumitem}\setlist{nosep}
 37 | \usepackage{listings}
 38 | \usepackage{xstring}
 39 | \usepackage{fontawesome5}
 40 | 
 41 | %---------------------  0.043. colors  ----------------------------------------
 42 | 
 43 | % NOTE: we want to cater to colorblind readers
 44 | 
 45 | % (LIGHT, MEDIUM, DARK) x (BLUE, ORANGE)
 46 | \definecolor{msky}{rgb}{0.62, 0.82, 0.94} \newcommand{\sky}{\color{msky}}
 47 | \definecolor{mpch}{rgb}{0.98, 0.86, 0.62} \newcommand{\pch}{\color{mpch}}
 48 | 
 49 | \definecolor{mblu}{rgb}{0.05, 0.55, 0.85} \newcommand{\blu}{\color{mblu}}
 50 | \definecolor{mrng}{rgb}{0.95, 0.65, 0.05} \newcommand{\rng}{\color{mrng}}
 51 | 
 52 | \definecolor{msea}{rgb}{0.02, 0.22, 0.34} \newcommand{\sea}{\color{msea}}
 53 | \definecolor{mbro}{rgb}{0.38, 0.26, 0.02} \newcommand{\bro}{\color{mbro}}
 54 | 
 55 | % SHADES:
 56 | \definecolor{mgre}{rgb}{0.55, 0.55, 0.55} \newcommand{\gre}{\color{mgre}}
 57 | \definecolor{mdgre}{rgb}{0.35, 0.35, 0.35} \newcommand{\dgre}{\color{mdgre}}
 58 | 
 59 | % UNFRIENDLY:
 60 | \definecolor{mred}{rgb}{1.00, 0.00, 0.00} \newcommand{\red}{\color{mred}}
 61 | 
 62 | %~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 63 | %~~~~~~~~~~~~~  0.1. Headers and References  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 64 | 
 65 | %---------------------  0.1.0. intra-document references  ---------------------
 66 | \newcommand{\offour}[1]{
 67 |     {\tiny \raisebox{0.04cm}{\scalebox{0.9}{$\substack{
 68 |         \IfSubStr{#1}{0}{{\blacksquare}}{\square}
 69 |         \IfSubStr{#1}{1}{{\blacksquare}}{\square} \\ 
 70 |         \IfSubStr{#1}{2}{{\blacksquare}}{\square}
 71 |         \IfSubStr{#1}{3}{{\blacksquare}}{\square}
 72 |     }$}}}%
 73 | }
 74 | 
 75 | \newcommand{\offourline}[1]{
 76 |     {\tiny \raisebox{0.04cm}{\scalebox{0.9}{$\substack{
 77 |         \IfSubStr{#1}{0}{{\blacksquare}}{\square}
 78 |         \IfSubStr{#1}{1}{{\blacksquare}}{\square}
 79 |         \IfSubStr{#1}{2}{{\blacksquare}}{\square}
 80 |         \IfSubStr{#1}{3}{{\blacksquare}}{\square}
 81 |     }$}}}%
 82 | }
 83 | \newcommand{\notesam}[1]{{\blu \textsf{#1}}}
 84 | \newcommand{\attn}[1]{{\blu \textsf{#1}}}
 85 | \newcommand{\attnsam}[1]{{\red \textsf{#1}}}
 86 | %\newcommand{\attnsam}[1]{}%{\red \textsf{#1}}}
 87 | 
 88 | \newcommand{\blarr}{\hspace{-0.15cm}${\blu \leftarrow}\,$}
 89 | \newcommand{\bcirc}{${\blu ^\circ}$}
 90 | \newcommand{\bovinenote}[1]{\bcirc\marginnote{\blarr #1}}
 91 | 
 92 | %---------------------  0.1.1. table of contents helpers  ---------------------
 93 | \newcommand{\phdot}{\phantom{.}}
 94 | 
 95 | %---------------------  0.1.2. section headers  -------------------------------
 96 | \newcommand{\samtitle} [1]{
 97 |   \par\noindent{\Huge \sf \blu #1}
 98 |   \vspace{0.4cm}
 99 | }
100 | 
101 | \newcommand{\samquote} [2]{
102 |     \marginnote[-0.4cm]{\begin{flushright}
103 |     %\scriptsize
104 |         \gre {\it #1} \\ --- #2
105 |     \end{flushright}}
106 | }
107 | 
108 | \newcommand{\sampart} [1]{
109 |   \vspace{0.5cm}
110 |   \par\noindent{\LARGE \sf \blu #1}
111 |   \vspace{0.1cm}\par
112 | }
113 | 
114 | \newcommand{\samsection}[1]{
115 |   \vspace{0.3cm}
116 |   \par\noindent{\Large \sf \blu #1}
117 |   \vspace{0.1cm}\par
118 | }
119 | 
120 | \newcommand{\sampassage}[1]{
121 |    \vspace{0.1cm}
122 |    \par\noindent{\hspace{-2cm}\normalsize \sc \gre #1} ---
123 | }
124 | 
125 | %---------------------  0.1.3. clear the bibliography's header  ---------------
126 | \usepackage{etoolbox}
127 | \patchcmd{\thebibliography}{\section*{\refname}}{}{}{}
128 | 
129 | %~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
130 | %~~~~~~~~~~~~~  0.2. Math Symbols and Blocks  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
131 | 
132 | %---------------------  0.2.0. general math operators  ------------------------
133 | \newcommand{\scirc}{\mathrel{\mathsmaller{\mathsmaller{\mathsmaller{\circ}}}}}
134 | \newcommand{\cmop}[2]{{(#1\!\to\!#2)}}
135 | \newcommand{\pr}{^\prime}
136 | \newcommand{\prpr}{^{\prime\prime}}
137 | 
138 | \newcommand{\wrap}[1]{\left(#1\right)}
139 | 
140 | %---------------------  0.2.1. probability symbols  ---------------------------
141 | \newcommand{\KL}{\text{KL}}
142 | \newcommand{\EN}{\text{H}}
143 | \newcommand{\note}[1]{{\blu \textsf{#1}}}
144 | 
145 | %---------------------  0.2.2. losses averaged in various ways  ---------------
146 | \newcommand{\Ein}  {\text{trn}_{\sS}}
147 | \newcommand{\Einb} {\text{trn}_{\check\sS}}
148 | \newcommand{\Einc} {\text{trn}_{\sS\sqcup \check\sS}}
149 | \newcommand{\Egap} {\text{gap}_{\sS}}
150 | \newcommand{\Eout} {\text{tst}}
151 | 
152 | %---------------------  0.2.3. double-struck and caligraphic upper letters  ---
153 | \newcommand{\Aa}{\mathbb{A}}\newcommand{\aA}{\mathcal{A}}
154 | \newcommand{\Bb}{\mathbb{B}}\newcommand{\bB}{\mathcal{B}}
155 | \newcommand{\Cc}{\mathbb{C}}\newcommand{\cC}{\mathcal{C}}
156 | \newcommand{\Dd}{\mathbb{D}}\newcommand{\dD}{\mathcal{D}}
157 | \newcommand{\Ee}{\mathbb{E}}\newcommand{\eE}{\mathcal{E}}
158 | \newcommand{\Ff}{\mathbb{F}}\newcommand{\fF}{\mathcal{F}}
159 | \newcommand{\Gg}{\mathbb{G}}\newcommand{\gG}{\mathcal{G}}
160 | \newcommand{\Hh}{\mathbb{H}}\newcommand{\hH}{\mathcal{H}}
161 | \newcommand{\Ii}{\mathbb{I}}\newcommand{\iI}{\mathcal{I}}
162 | \newcommand{\Jj}{\mathbb{J}}\newcommand{\jJ}{\mathcal{J}}
163 | \newcommand{\Kk}{\mathbb{K}}\newcommand{\kK}{\mathcal{K}}
164 | \newcommand{\Ll}{\mathbb{L}}\newcommand{\lL}{\mathcal{L}}
165 | \newcommand{\Mm}{\mathbb{M}}\newcommand{\mM}{\mathcal{M}}
166 | \newcommand{\Nn}{\mathbb{N}}\newcommand{\nN}{\mathcal{N}}
167 | \newcommand{\Oo}{\mathbb{O}}\newcommand{\oO}{\mathcal{O}}
168 | \newcommand{\Pp}{\mathbb{P}}\newcommand{\pP}{\mathcal{P}}
169 | \newcommand{\Qq}{\mathbb{Q}}\newcommand{\qQ}{\mathcal{Q}}
170 | \newcommand{\Rr}{\mathbb{R}}\newcommand{\rR}{\mathcal{R}}
171 | \newcommand{\Ss}{\mathbb{S}}\newcommand{\sS}{\mathcal{S}}
172 | \newcommand{\Tt}{\mathbb{T}}\newcommand{\tT}{\mathcal{T}}
173 | \newcommand{\Uu}{\mathbb{U}}\newcommand{\uU}{\mathcal{U}}
174 | \newcommand{\Vv}{\mathbb{V}}\newcommand{\vV}{\mathcal{V}}
175 | \newcommand{\Ww}{\mathbb{W}}\newcommand{\wW}{\mathcal{W}}
176 | \newcommand{\Xx}{\mathbb{X}}\newcommand{\xX}{\mathcal{X}}
177 | \newcommand{\Yy}{\mathbb{Y}}\newcommand{\yY}{\mathcal{Y}}
178 | \newcommand{\Zz}{\mathbb{Z}}\newcommand{\zZ}{\mathcal{Z}}
179 | 
180 | %---------------------  0.2.4. sans serif and frak lower letters  -------------
181 | \newcommand{\sfa}{\mathsf{a}}\newcommand{\fra}{\mathfrak{a}}
182 | \newcommand{\sfb}{\mathsf{b}}\newcommand{\frb}{\mathfrak{b}}
183 | \newcommand{\sfc}{\mathsf{c}}\newcommand{\frc}{\mathfrak{c}}
184 | \newcommand{\sfd}{\mathsf{d}}\newcommand{\frd}{\mathfrak{d}}
185 | \newcommand{\sfe}{\mathsf{e}}\newcommand{\fre}{\mathfrak{e}}
186 | \newcommand{\sff}{\mathsf{f}}\newcommand{\frf}{\mathfrak{f}}
187 | \newcommand{\sfg}{\mathsf{g}}\newcommand{\frg}{\mathfrak{g}}
188 | \newcommand{\sfh}{\mathsf{h}}\newcommand{\frh}{\mathfrak{h}}
189 | \newcommand{\sfi}{\mathsf{i}}\newcommand{\fri}{\mathfrak{i}}
190 | \newcommand{\sfj}{\mathsf{j}}\newcommand{\frj}{\mathfrak{j}}
191 | \newcommand{\sfk}{\mathsf{k}}\newcommand{\frk}{\mathfrak{k}}
192 | \newcommand{\sfl}{\mathsf{l}}\newcommand{\frl}{\mathfrak{l}}
193 | \newcommand{\sfm}{\mathsf{m}}\newcommand{\frm}{\mathfrak{m}}
194 | \newcommand{\sfn}{\mathsf{n}}\newcommand{\frn}{\mathfrak{n}}
195 | \newcommand{\sfo}{\mathsf{o}}\newcommand{\fro}{\mathfrak{o}}
196 | \newcommand{\sfp}{\mathsf{p}}\newcommand{\frp}{\mathfrak{p}}
197 | \newcommand{\sfq}{\mathsf{q}}\newcommand{\frq}{\mathfrak{q}}
198 | \newcommand{\sfr}{\mathsf{r}}\newcommand{\frr}{\mathfrak{r}}
199 | \newcommand{\sfs}{\mathsf{s}}\newcommand{\frs}{\mathfrak{s}}
200 | \newcommand{\sft}{\mathsf{t}}\newcommand{\frt}{\mathfrak{t}}
201 | \newcommand{\sfu}{\mathsf{u}}\newcommand{\fru}{\mathfrak{u}}
202 | \newcommand{\sfv}{\mathsf{v}}\newcommand{\frv}{\mathfrak{v}}
203 | \newcommand{\sfw}{\mathsf{w}}\newcommand{\frw}{\mathfrak{w}}
204 | \newcommand{\sfx}{\mathsf{x}}\newcommand{\frx}{\mathfrak{x}}
205 | \newcommand{\sfy}{\mathsf{y}}\newcommand{\fry}{\mathfrak{y}}
206 | \newcommand{\sfz}{\mathsf{z}}\newcommand{\frz}{\mathfrak{z}}
207 | 
208 | %---------------------  0.2.5. math environments  -----------------------------
209 | \newtheorem*{qst}{Question}
210 | \newtheorem*{thm}{Theorem}
211 | \newtheorem*{lem}{Lemma}
212 | % ...
213 | \theoremstyle{definition}
214 | \newtheorem*{dfn}{Definition}
215 | 
216 | \newcommand{\exercise}[1]{%
217 |   \par\noindent%
218 |   \attn{Food For Thought:} #1%
219 | }
220 | \newcommand{\noparexercise}[1]{%
221 |   \attn{Food For Thought:} #1%
222 | }
223 | \newcommand{\objectives}[1]{%
224 |   \marginnote[-0.2cm]{%
225 |     By the end of this section, you'll be able to
226 |     \begin{itemize}#1\end{itemize}
227 |   }
228 | }
229 | 
230 | 
231 | 


--------------------------------------------------------------------------------
/tex-source-sandbox/u4cl.tex:
--------------------------------------------------------------------------------
1 | \subsection*{Coding Lecture 4: Expectation-Maximization for Sequences}
2 | 
3 | \subsubsection*{HMM architecture.  Remark on RNN}
4 | \subsubsection*{E-step: dynamic programming}
5 | \subsubsection*{M-step with regularization}
6 | \subsubsection*{End-to-end reading and mimicry of cookbook text}
7 | \subsubsection*{Investigation of State Meanings}
8 | 


--------------------------------------------------------------------------------
/tex-source-sandbox/u4ha.tex:
--------------------------------------------------------------------------------
 1 | \subsection*{Homework 4a: Gaussian Mixtures for Clustering}
 2 | 
 3 | % =============================================================================
 4 | % ==  _  ======================================================================
 5 | % =============================================================================
 6 | 
 7 | \subsubsection*{forward model}
 8 | %-------  _  ------------------------------------------------------------------
 9 | \blurb{visual invitation to clustering as inference problem}
10 | %-------  _  ------------------------------------------------------------------
11 | \blurb{reminder about multivariate gaussian shapes.  what does covariance mean?}
12 | %-------  _  ------------------------------------------------------------------
13 | \blurb{mathematics and sampling from a 3-cluster GMM to make color / stick figure data}
14 | %-------  _  ------------------------------------------------------------------
15 | \blurb{example of conditional 1-cluster GMM to connect to unit 3.  color data}
16 | %-------  _  ------------------------------------------------------------------
17 | \blurb{example of conditional 2-cluster GMM to connect to unit 3.  color data}
18 | 
19 | % =============================================================================
20 | % ==  _  ======================================================================
21 | % =============================================================================
22 | 
23 | \subsubsection*{E step and M step}
24 | %-------  _  ------------------------------------------------------------------
25 | \blurb{type signatures of E step and of M step}
26 | %-------  _  ------------------------------------------------------------------
27 | \blurb{qualitative behavior of E step and of M step}
28 | %-------  _  ------------------------------------------------------------------
29 | \blurb{formula for M step}
30 | %-------  _  ------------------------------------------------------------------
31 | \blurb{formula for E step}
32 | %-------  _  ------------------------------------------------------------------
33 | \blurb{example dynamics of EM on flower data}
34 | 
35 | % =============================================================================
36 | % ==  _  ======================================================================
37 | % =============================================================================
38 | 
39 | \subsubsection*{small-var regularization from prior; conjugate meta-priors}
40 | %-------  _  ------------------------------------------------------------------
41 | \blurb{visualizing the loss landscape}
42 | %-------  _  ------------------------------------------------------------------
43 | \blurb{black holes due to density vs probability mass notions}
44 | %-------  _  ------------------------------------------------------------------
45 | \blurb{regularization term, motivated probabilistically}
46 | %-------  _  ------------------------------------------------------------------
47 | \blurb{notion of conjugate priors and, secretly, of exponential family}
48 | %-------  _  ------------------------------------------------------------------
49 | \blurb{other examples of deriving natural regularizer term from conjugate prior}
50 | 
51 | % =============================================================================
52 | % ==  _  ======================================================================
53 | % =============================================================================
54 | 
55 | \subsubsection*{k-means as limit}
56 | %-------  _  ------------------------------------------------------------------
57 | \blurb{GMM with variances small and spherical instead of free parameters}
58 | %-------  _  ------------------------------------------------------------------
59 | \blurb{show that limit is equivalent to a given algo that we call k-means}
60 | % equate with failure to marginalize
61 | %-------  _  ------------------------------------------------------------------
62 | \blurb{example of k-means fitting}
63 | %-------  _  ------------------------------------------------------------------
64 | \blurb{sensitivity to intialization; practicalities}
65 | %-------  _  ------------------------------------------------------------------
66 | \blurb{kernelization and implicit or approximate averages}
67 | 
68 | % =============================================================================
69 | % ==  _  ======================================================================
70 | % =============================================================================
71 | 
72 | \subsubsection*{convergence of EM near minimum}
73 | %-------  _  ------------------------------------------------------------------
74 | \blurb{ELBO decomposition, visually}
75 | %-------  _  ------------------------------------------------------------------
76 | \blurb{ELBO decomposition, mathematically}
77 | %-------  _  ------------------------------------------------------------------
78 | \blurb{how M step changes ELBO}
79 | %-------  _  ------------------------------------------------------------------
80 | \blurb{how E step changes ELBO}
81 | %-------  _  ------------------------------------------------------------------
82 | \blurb{ping-pong picture in space of distributions}
83 | 


--------------------------------------------------------------------------------
/tex-source-sandbox/u4hb.tex:
--------------------------------------------------------------------------------
  1 | \subsection*{Homework 4b: Bayes Nets.  NNs as Amortized Inference}
  2 | 
  3 | % =============================================================================
  4 | % ==  _  ======================================================================
  5 | % =============================================================================
  6 | 
  7 | \subsubsection*{Marginalization over Latents}
  8 | %-------  _  ------------------------------------------------------------------
  9 | \blurb{}
 10 | 
 11 | %-------  _  ------------------------------------------------------------------
 12 | \blurb{}
 13 | 
 14 | %-------  _  ------------------------------------------------------------------
 15 | \blurb{}
 16 | 
 17 | %-------  _  ------------------------------------------------------------------
 18 | \blurb{}
 19 | 
 20 | %-------  _  ------------------------------------------------------------------
 21 | \blurb{}
 22 | 
 23 | % =============================================================================
 24 | % ==  _  ======================================================================
 25 | % =============================================================================
 26 | 
 27 | \subsubsection*{Weights as Latents in Linear Classification (bowtie vs pipe)}
 28 | 
 29 | %-------  _  ------------------------------------------------------------------
 30 | \blurb{}
 31 | 
 32 | %-------  _  ------------------------------------------------------------------
 33 | \blurb{}
 34 | 
 35 | %-------  _  ------------------------------------------------------------------
 36 | \blurb{}
 37 | 
 38 | %-------  _  ------------------------------------------------------------------
 39 | \blurb{}
 40 | 
 41 | %-------  _  ------------------------------------------------------------------
 42 | \blurb{}
 43 | 
 44 | % =============================================================================
 45 | % ==  _  ======================================================================
 46 | % =============================================================================
 47 | 
 48 | \subsubsection*{BIC and structural penalty}
 49 | %-------  _  ------------------------------------------------------------------
 50 | \blurb{}
 51 | 
 52 | %-------  _  ------------------------------------------------------------------
 53 | \blurb{}
 54 | 
 55 | %-------  _  ------------------------------------------------------------------
 56 | \blurb{}
 57 | 
 58 | %-------  _  ------------------------------------------------------------------
 59 | \blurb{}
 60 | 
 61 | %-------  _  ------------------------------------------------------------------
 62 | \blurb{}
 63 | 
 64 | 
 65 | % =============================================================================
 66 | % ==  _  ======================================================================
 67 | % =============================================================================
 68 | 
 69 | \subsubsection*{Causal modeling}
 70 | %-------  _  ------------------------------------------------------------------
 71 | \blurb{}
 72 | 
 73 | %-------  _  ------------------------------------------------------------------
 74 | \blurb{}
 75 | 
 76 | %-------  _  ------------------------------------------------------------------
 77 | \blurb{}
 78 | 
 79 | %-------  _  ------------------------------------------------------------------
 80 | \blurb{}
 81 | 
 82 | %-------  _  ------------------------------------------------------------------
 83 | \blurb{}
 84 | 
 85 | % =============================================================================
 86 | % ==  _  ======================================================================
 87 | % =============================================================================
 88 | 
 89 | \subsubsection*{Very Basic Occlusion Inference Object Net}
 90 | %-------  _  ------------------------------------------------------------------
 91 | \blurb{}
 92 | 
 93 | %-------  _  ------------------------------------------------------------------
 94 | \blurb{}
 95 | 
 96 | %-------  _  ------------------------------------------------------------------
 97 | \blurb{}
 98 | 
 99 | %-------  _  ------------------------------------------------------------------
100 | \blurb{}
101 | 
102 | %-------  _  ------------------------------------------------------------------
103 | \blurb{}
104 | 
105 | 


--------------------------------------------------------------------------------
/tex-source-sandbox/u4pa.tex:
--------------------------------------------------------------------------------
1 | \subsection*{Project 4a: Predicting Political Polls}
2 | 
3 | \subsubsection*{Meeting the Polling Data}
4 | \subsubsection*{Forward Model}
5 | \subsubsection*{Inference through Sampling}
6 | \subsubsection*{Wrestling with Training Difficulties (burnin, etc)}
7 | \subsubsection*{Interpreting Latents}
8 | 


--------------------------------------------------------------------------------
/tex-source-sandbox/u4pb.tex:
--------------------------------------------------------------------------------
1 | \subsection*{Project 4b: A Deep Image Generator (VAE)}
2 | 
3 | \subsubsection*{Meeting the Face Data.  How to assess success?} % maybe met face data in previous unit?
4 | \subsubsection*{Forward Model including reparam trick}
5 | \subsubsection*{Backward model and intuitive interpretation}
6 | \subsubsection*{Training and Testing}
7 | \subsubsection*{Interpreting Latents.  Sources of Prejudice and Capriciousness}
8 | 


--------------------------------------------------------------------------------
/tex-source/body.0.0.what-is-learning.tex:
--------------------------------------------------------------------------------
  1 | \objectives{%
  2 |       \item recognize whether a learning task fits the paradigm of
  3 |             \emph{learning from examples}
  4 |             and whether it's \emph{supervised} or \emph{unsupervised}.
  5 |       \item identify within a completed learning-from-examples project:
  6 |             the \emph{training inputs(outputs)},
  7 |             \emph{testing inputs(outputs)},
  8 |             \emph{hypothesis class},
  9 |             \emph{learned hypothesis};
 10 |             and describe which parts depend on
 11 |             which.
 12 | }
 13 | 
 14 | 
 15 | \sampassage{kinds of learning}
 16 |   How do we communicate patterns of desired behavior?  We can teach:
 17 |   \begin{description}
 18 |     \item[\textbf{by instruction}:  ]  ``to tell whether a mushroom is poisonous, first look at its gills...''
 19 |     \item[\textbf{by example}:      ]  ``here are six poisonous fungi; here, six safe ones.  see a pattern?''
 20 |     \item[\textbf{by reinforcement}:]  ``eat foraged mushrooms for a month; learn from getting sick.''
 21 |   \end{description}
 22 |   %
 23 |   Machine learning is the art of programming computers to learn from such
 24 |   sources.  We'll focus on the most important case: \textbf{learning from
 25 |   examples}.\bovinenote{%
 26 |     \noparexercise{What's something you've learned by instruction?  By example?
 27 |     By reinforcement?}
 28 |     %
 29 |     In Unit 5 we'll see that learning by example unlocks the
 30 |     other modes of learning.
 31 |   }
 32 | 
 33 | \sampassage{from examples to predictions}
 34 |   For us, a pattern of desired behavior is a function that for each given
 35 |   situation/prompt returns a favorable action/answer.
 36 |   %
 37 |   We seek a program that, from a list of examples of prompts and matching
 38 |   answers, determines an underlying pattern.  Our program is a success if this
 39 |   pattern accurately predicts answers for new, unseen prompts.
 40 |   %
 41 |   We often define our program as a search, over some class $\hH$ of candidate
 42 |   patterns (jargon: \textbf{hypotheses}), to maximize some notion of
 43 |   ``intrinsic-plausibility plus goodness-of-fit-to-the-examples''.
 44 | 
 45 |   \begin{figure}[h]
 46 |     \vspace{-0.5cm}
 47 |     \par\noindent\includegraphics[width=\textwidth]{figures/ml-dataflow}
 48 |     \caption{%
 49 |       \textbf{Predicting mushrooms' poisons.}
 50 |       %
 51 |       Our learning program selects from a class of hypotheses ({\gre gray blob}) a plausible
 52 |       hypothesis that well fits (\textbf{\blu blue dots} are close to
 53 |       \textbf{black dots}) a given list of poison-labeled mushrooms ({\blu blue
 54 |       blob}).  Evaluating the selected hypothesis on new mushrooms, we predict
 55 |       the corresponding poison levels ({\rng orange numbers}).
 56 |       %
 57 |       \par The arrows show dataflow: how the hypothesis class and the
 58 |       mushroom+poisonlevel examples determine one hypothesis, which, together
 59 |       with new mushrooms, determines predicted poison levels.
 60 |       %
 61 |       Selecting a
 62 |       hypothesis is called \textbf{learning}; predicting unseen poison levels,
 63 |       \textbf{inference}.  The examples we learn from are
 64 |       \textbf{training data}; the new mushrooms and their true poison levels
 65 |       are \textbf{testing data}.
 66 |     }
 67 |     \vspace{-1.0cm}
 68 |   \end{figure}
 69 | 
 70 |   For example, say we want to predict poison levels (answers) of mushrooms
 71 |   (prompts).  %Comparing our `\textbf{training data}' to our
 72 |   Among our hypotheses,\bovinenote{%
 73 |       We choose four hypotheses: respectively, that
 74 |       a mushroom's poison level is
 75 |       close to:
 76 |       \par\emph{--- its ambient soil's percent water by weight};
 77 |       \par\emph{--- its gills' odor level, in kilo-Scoville units};
 78 |       \par\emph{--- its zipcode (divided by 100000)};
 79 |       \par\emph{--- the fraction of visible light its cap reflects}.
 80 |   }
 81 |   the GillOdor hypothesis fits the examples well: it guesses poison
 82 |   levels
 83 |   %(\textbf{\blu blue dots})
 84 |   close to the truth.
 85 |   %(\textbf{black dots}).
 86 |   So the program selects GillOdor.
 87 |  %(jargon: \textbf{learns})
 88 |   %The selection process is called \textbf{learning}.
 89 | 
 90 |   `\emph{Wait!}', you say,
 91 |   `\emph{doesn't Zipcode fit the example data more closely than GillOdor?'}.
 92 |   Yes.  But a poison-zipcode proportionality is implausible: we'd need
 93 |   more evidence before believing Zipcode.  We can easily make many oddball
 94 |   hypotheses; by chance some may fit our data well, but they probably
 95 |   won't predict well!
 96 |   %
 97 |   Thus
 98 |   ``intrinsic plausibility'' and ``goodness-of-fit-to-data''
 99 |   \emph{both}\bovinenote{%
100 |     We choose those two notions (and our $\hH$) based on \textbf{domain
101 |     knowledge}.  This design process is an art; we'll study some rules of
102 |     thumb.
103 |   } play a role in learning.
104 | 
105 |   %We must specify both notions based on our domain knowledge
106 |   %--- this is an art but we'll learn its rules of thumb.
107 |   %
108 |   %We've now met ML's key elements.
109 |   %Those are ML's elements.
110 |   In practice we'll think of each hypothesis as mapping mushrooms to
111 |   \emph{distributions} over poison levels; then its
112 |   ``goodness-of-fit-to-data'' is simply the chance it allots to the
113 |   data.\bovinenote{That's why we'll need \textbf{probability}.}
114 |   %the details are more complex.
115 |   We'll also use huge $\hH$s: we'll \emph{combine} mushroom features
116 |   (wetness, odor, and shine) to make more hypotheses such as
117 |   %Maybe
118 |   $
119 |     (1.0 \cdot \text{GillOdor} - 0.2\cdot \text{CapShine})
120 |   $.\bovinenote{That's why we'll need \textbf{linear algebra}.}
121 |   %predicts poison well!
122 |   %
123 |   Since we can't compute ``goodness-of-fit'' for so many hypotheses,
124 |   we'll guess a hypothesis
125 |   %start at a tentative hypothesis and
126 |   then repeatedly
127 |   nudge it up the ``goodness-of-fit'' \emph{slope}.\bovinenote{That's why we'll need \textbf{derivatives}.}
128 |   %
129 |   %
130 | 
131 |   %We must specify both notions based on our domain knowledge
132 |   %--- this is an art but we'll learn its rules of thumb.
133 |   %%
134 |   %We'll often do this in terms of \emph{probabilities}:\bovinenote{%
135 |   %  Still, probability isn't the only criterion: if overestimating poison
136 |   %  levels is safer than underestimating them, then we'd want to hedge toward
137 |   %  overestimating.
138 |   %  This will become especially unavoidable when in Unit 5 we learn
139 |   %  from reinforcement.
140 |   %}
141 |   %we supply a distribution over all hypotheses (a \textbf{prior}) and, we think
142 |   %of each hypothesis as mapping mushrooms to \emph{distributions} over poison
143 |   %levels.
144 | 
145 |   %\bovinenote{%
146 |   %    \textbf{Bayesian Information Criterion}
147 |   %}
148 | 
149 | \sampassage{supervised learning}
150 |   We'll soon allow uncertainty by letting patterns map prompts to
151 |   \emph{distributions} over answers.
152 |   %\bovinenote{%
153 |   %  Then learning programs have this type:
154 |   %  $$
155 |   %    \lL : (\xX\times \yY)^N \to (\xX\to \text{DistributionsOn}(\yY))
156 |   %  $$
157 |   %}
158 |   %
159 |   Even if there is only one prompt --- say, ``\emph{produce a
160 |   beautiful melody}'' --- we may seek to learn the complicated
161 |   distribution over answers, e.g.\ to generate a diversity of apt
162 |   answers.  Such \textbf{unsupervised learning} concerns output
163 |   structure.
164 |   %
165 |   By contrast, \textbf{supervised learning} (our main subject), concerns
166 |   the input-output relation; it's interesting when there are many possible prompts.
167 |   %there are many possible prompts.
168 |   %
169 |   Both involve learning from examples; the distinction is no more firm
170 |   than that between sandwiches and hotdogs, but the words are good to
171 |   know.
172 | 
173 |   %To save ink, say that $\xX$ is the set of possible prompts; $\yY$, of
174 |   %possible answers.
175 |   %%
176 |   %With the example above, $\xX$ contains all
177 |   %conceivable mushrooms and $\yY$ contains all conceivable poison
178 |   %levels (perhaps all the non-negative real numbers).
179 |   %%
180 |   %If we like, we can now summarize the data flow in symbols.  A pattern is a
181 |   %function of type $\xX\to\yY$.  And we can model the examples from which our
182 |   %program learns as a list of type $(\xX\times \yY)^N$.  Then a program that
183 |   %learns from examples has type:
184 |   %$$
185 |   %  \lL : (\xX\times \yY)^N \to (\xX\to \yY)
186 |   %$$
187 | 
188 | %\sampassage{learning as...}
189 | %  TODO
190 | %    %
191 | %\attnsam{
192 | %    machine learning is like science.
193 | %    %
194 | %    machine learning is like automatic programming.
195 | %    %
196 | %    machine learning is like curve-fitting.
197 | %    %%
198 | %    three classic threads of AI
199 | %    }
200 | 


--------------------------------------------------------------------------------
/tex-source/body.0.2.how-well-did-we-do.tex:
--------------------------------------------------------------------------------
  1 | \objectives{%
  2 |       \item compute and conceptually distinguish training and testing misclassification errors
  3 |       \item explain how the problem of achieving low testing error
  4 |             decomposes into the three problems of achieving low
  5 |             \emph{generalization},
  6 |             \emph{optimization}, and
  7 |             \emph{approximation}
  8 |             errors.
  9 | }
 10 | 
 11 | \sampassage{error analysis}
 12 |   Intuitively, our testing error of $17\%$ comes from three sources:
 13 |   \textbf{(a)} the failure of our training set to be representative of our testing set;
 14 |   \textbf{(b)} the failure of our program to exactly minimize training error over $\hH$; and
 15 |   \textbf{(c)} the failure of our hypothesis set $\hH$ to contain ``the true'' pattern.
 16 | 
 17 |   These are respectively errors of
 18 |   \textbf{generalization},
 19 |   \textbf{optimization},
 20 |   \textbf{approximation}.
 21 | 
 22 |   We can see generalization error when we plot testing data in the
 23 |   brightness-width plane.  The hypotheses $h=(20, 83)$ that we selected based
 24 |   on the training in the brightness-width plane misclassifies many testing
 25 |   points.  we see many misclassified points.  Whereas $h$ misclassifies only
 26 |   $10\%$ of the training data, it misclassifies $17\%$ of the testing data.
 27 |   This illustrates generalization error.
 28 | 
 29 |   In our plot of the $(a,b)$ plane,
 30 |   the {\blu blue square} is the hypothesis $h$ (in $\hH$) that best fits
 31 |   the training data.  The {\rng orange square} is the hypothesis (in
 32 |   $\hH$) that best fits the testing data.  But even the latter seems
 33 |   suboptimal, since $\hH$ only includes lines through the origin while it
 34 |   seems we want a line --- or curve --- that hits higher up on the
 35 |   brightness axis.  This illustrates approximation error.\bovinenote{%
 36 |     To define \emph{approximation error}, we need to specify whether the `truth'
 37 |     we want to approximate is the training or the testing
 38 |     data.  Either way we get a useful concept.  In this paragraph we're talking
 39 |     about approximating \emph{testing} data; but in our notes overall we'll
 40 |     focus on the concept of error in approximating \emph{training} data.
 41 |   }
 42 | 
 43 |   Optimization error is best seen by plotting training rather than testing
 44 |   data.  It measures the failure of our selected hypothesis $h$ to minimize
 45 |   training error --- i.e., the failure of the {\blu blue square} to lie in a
 46 |   least shaded point in the $(a,b)$ plane, when we shade according to training
 47 |   error.
 48 | 
 49 |   \begin{figure}[h]
 50 |   %\begin{marginfigure}%[h]
 51 |       \centering
 52 |       \includegraphics[width=0.49\textwidth]{example-mnist/test-features.png}%
 53 |       \includegraphics[width=0.49\textwidth]{example-mnist/test-weights.png}%
 54 |       %
 55 |       \caption{
 56 |           \textbf{Testing error visualized two ways.}.
 57 |         --- %  \par\noindent
 58 |         \textbf{Left: in feature space.}
 59 |         The hypotheses $h=(20, 83)$ that we selected based on the training set
 60 |         classifies testing data in the brightness-width plane; glowing colors
 61 |         distinguish a hypothesis' $\blu{1}$ and $\rng{3}$ sides.
 62 |         Axes range $[0, 1.0]$.
 63 |         %
 64 |         %
 65 |         --- %  \par\noindent
 66 |         \textbf{Right: in weight space.}
 67 |         %
 68 |         Each point in the $(a,b)$ plane
 69 |         represents a hypothesis; darker regions misclassify a greater
 70 |         fraction of testing data.  Axes range $[-99,+99]$.
 71 |         %
 72 |       }
 73 |       \label{fig:test-features-weights}
 74 |   \end{figure}
 75 |   %\end{marginfigure}
 76 | 
 77 |   Here, we got optimization error $\approx 0\%$ (albeit by
 78 |   \emph{unscalable brute-force}).  Because optimization error is zero in
 79 |   our case, the approximation error and training error are the same:
 80 |   $\approx10\%$.  The approximation error is so high because our straight
 81 |   lines are \emph{too simple}: brightness and width lose useful
 82 |   information and the ``true'' boundary between digits --- even training ---
 83 |   may be curved.
 84 |   %
 85 |   Finally, our testing error $\approx 17\%$ exceeds our training error.
 86 |   We thus suffer a generalization error of $\approx 7\%$: we \emph{didn't
 87 |   perfectly extrapolate} from training to testing situations.
 88 |   %
 89 |   In 6.86x we'll address all three italicized issues.
 90 | 
 91 |   \exercise{why is generalization error usually positive?}
 92 | 
 93 | 
 94 | 
 95 | \sampassage{formalism}\marginnote{\veryoptional}
 96 |   Here's how we can describe learning and our error decomposition in
 97 |   symbols.
 98 |   %
 99 |   Draw training examples $\sS : (\xX\times \yY)^N$
100 |   from nature's distribution $\dD$ on $\xX\times \yY$.  A hypothesis
101 |   $f:\xX\to \yY$ has \textbf{training error}
102 |   $
103 |      \Ein(f) = \Pp_{(x,y)\sim \blu{\sS}}[f(x)\neq y]
104 |   $, an average over examples; and \textbf{testing error}
105 |   $
106 |      \Eout(f) = \Pp_{(x,y)\sim \rng{\dD}}[f(x)\neq y]
107 |   $, an average over nature.  A \emph{learning program} is a function
108 |   $
109 |       \lL : (\xX\times \yY)^N \to (\xX\to \yY)
110 |   $; we want to design $\lL$ so that it maps typical $\sS$s to $f$s with
111 |   low $\Eout(f)$.
112 |   %\marginnote{%
113 |   %  %  TODO: mention extereme class-imbalance and bayesian *decision* theory
114 |   %}
115 | 
116 |   So
117 |   we often define
118 |   $\lL$ to roughly
119 |   minimize $\Ein$ over a
120 |   set $\hH \subseteq (\xX\to \yY)$ of candidate patterns.  Then $\Eout$
121 |   decomposes
122 |   into the failures
123 |   of
124 |   $\Ein$ to estimate $\Eout$ (generalization),
125 |   of
126 |   $\lL$ to minimize $\Ein$ (optimization), and
127 |   of
128 |   $\hH$ to contain
129 |   nature's
130 |   truth (approximation):
131 |   \newcommand{\minf}[1]{{\inf}_{\hH}}
132 |   \begin{align*}
133 |       \Eout(\lL(\sS))
134 |       =~&\Eout(\lL(\sS))      &-\,\,\,&      \Ein(\lL(\sS)) &~\}~~& \text{\textbf{generalization} error} \\
135 |       +~&\Ein(\lL(\sS))       &-\,\,\,& \minf{\hH}(\Ein(f)) &~\}~~& \text{\textbf{optimization} error} \\
136 |       +~&\minf{\hH}(\Ein(f))  &       &                     &~\}~~& \text{\textbf{approximation} error}
137 |   \end{align*}
138 |   These terms are in tension.  For example, as $\hH$ grows, the
139 |   approx.\ error may decrease while the gen.\ error may
140 |   increase --- this is the ``\textbf{bias-variance} tradeoff''.
141 | 


--------------------------------------------------------------------------------
/tex-source/body.0.3.how-can-we-do-better.tex:
--------------------------------------------------------------------------------
 1 | 
 2 | %~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 3 | %~~~~~~~~~~~~~  1.9. workflow  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 4 | 
 5 | %      \sampassage{workflow}
 6 | %      %\sampassage{workflow: framing}
 7 | %        We first \emph{frame}: what data will help us solve what problem?  To
 8 | %        do this, we \emph{factor} our complex prediction problem into simple
 9 | %        classification or regression problems; randomly \emph{split} the
10 | %        resulting example pairs into training, dev(elopment), and testing sets;
11 | %        and \emph{visualize} the training data to weigh our intuitions.
12 | %
13 | %      %\sampassage{workflow: modeling}
14 | %        Next, we \emph{model}: we present the data to the computer so that
15 | %        true patterns are more easily found.
16 | %        %
17 | %        Here we inject our \emph{domain knowledge} --- our human experience and
18 | %        intuition about which factors are likely to help with prediction.
19 | %        %
20 | %        Modeling includes \emph{featurizing} our inputs and choosing
21 | %        appropriate \emph{priors} and \emph{symmetries}.
22 | %
23 | %      %\sampassage{workflow: training}
24 | %        During \emph{training}, the computer searches among candidate patterns
25 | %        for one that explains the examples relatively well.
26 | %        We used brute force above; we'll soon learn faster algorithms
27 | %        such as \emph{gradient descent} on the training set for parameter
28 | %        selection and \emph{random grid search} on the dev set for
29 | %        hyperparameter selection.
30 | %
31 | %      %\sampassage{workflow: harvesting}
32 | %        Finally, we may \emph{harvest}: we derive insights from the pattern
33 | %        itself\bovinenote{%
34 | %            which factors ended up being most important?
35 | %        }
36 | %        and we predict outputs for to fresh inputs.
37 | %        %
38 | %        Qualifying both applications is the pattern's quality.  To assess this,
39 | %        we measure its accuracy on our held-out testing data.
40 | 
41 | 
42 | 


--------------------------------------------------------------------------------
/tex-source/body.1.1.gradients.tex:
--------------------------------------------------------------------------------
 1 | \objectives{%
 2 |   \item quickly minimize perceptron or hinge loss via `gradient' updates
 3 |   \item explain why those update formulas common linear models are
 4 |         intuitively sensible
 5 | }
 6 | 
 7 | \sampassage{which hypothesis is best?}%  optimization by gradient descent}
 8 |   Now that we've quantified badness-of-fit-to-data, we want to find a
 9 |   hypothesis $h=(a,b)$ that minimizes it.\bovinenote{%
10 |     Soon we'll also include intrinsic implausibility!  We'll see throughout
11 |     this course that it's important to minimize implausibility plus
12 |     badness-of-fit, not just badness-of-fit; otherwise, optimization might
13 |     select a very implausible hypothesis that happens to fit the the training
14 |     data.  Think of the Greek constellations: isn't it miraculous how
15 |     constellations --- the bears, the queen, etc --- so perfectly fit the
16 |     positions of the stars?
17 |   }
18 |   We
19 |   \emph{could} try brute force, like so:
20 |   \begin{lstlisting}[language=Python, basicstyle=\footnotesize\ttfamily]
21 |     def best_hypothesis():
22 |       # returns a pair (loss value, hypothesis)
23 |       return min(perceptron_loss((training_data, a, b), (a,b))
24 |                  for a in np.arange(-50,+50,.25)
25 |                  for b in np.arange(-50,+50,.25)              )
26 |   \end{lstlisting}
27 |   But this is slow!  Here we're searching a 2D grid at resolution $\approx
28 |   400$, so we call the loss $400^2$ times.  That exponent counts the parameters
29 |   we're finding (here, $2$: $a$ and $b$); if we had $10$ features and $10$
30 |   weights, we'd make $400^{10}$ calls.  Yikes!
31 | 
32 |   Let's instead use more of the information available to direct our search.
33 |   Suppose at some point in our search the best $h$ we've found so far is
34 |   $(a, b)$.  The loss function is a sum (or average) over $N$
35 |   training points $(x_i, y_i)$:\bovinenote{%
36 |     Here, $\text{br}(x)$ and $\text{wi}(x)$ stand for the features of $x$,
37 |     say the brightness and width.
38 |     %
39 |     Also, we'll use take $y$'s values to be $\pm 1$ (rather than cow vs dog
40 |     or {\blu $1$} vs {\rng $3$}), for notational convenience.
41 |   }
42 |   \begin{align*}
43 |       +\max(1,1-{y_{0}}  (a \cdot {\text{br}(x_{0})} + b \cdot{\text{wi}(x_{0})})+\cdots
44 |     \\+\max(1,1-{y_{42}} (a \cdot {\text{br}(x_{42})} + b \cdot{\text{wi}(x_{42})})+\cdots
45 |   \end{align*}
46 | 
47 |   Let's try to decrease this sum by reducing one row  at a time.
48 |   %
49 |   If $\ell>0$, then
50 |   %\bovinenote{%
51 |   %  except in the negligible case where $\ell=0$ exactly
52 |   %}
53 |   any small change in $(a,b)$ won't change $\max(1,1-\ell)$.
54 |   %
55 |   But if $\ell\leq 0$, then we can decrease $\max(1,1-\ell)$ by increasing
56 |   $\ell$, i.e., by increasing (say):
57 |   $$
58 |     \underbrace{+1}_{y_{42}}  (a \cdot \underbrace{0.9}_{\text{br}(x_{42})} + b \cdot\underbrace{0.1}_{\text{wi}(x_{42})})
59 |   $$
60 |   We can increase $\ell$ by increasing $a$ or $b$; but increasing $a$ gives us
61 |   more bang for our buck ($0.9>0.1$), so we'd probably nudge $a$ more than $b$,
62 |   say, by adding a multiple of $(+0.9, +0.1)$ to $(a, b)$.  Conversely, if
63 |   ${y_i=-1}$ then we'd add a multiple of $(-0.9, -0.1)$ to $(a, b)$.
64 |   %
65 |   Therefore, to reduce the $i$th row, we want to move $a, b$ like this:
66 |     \emph{Unless the max term is $0$,
67 |       add a multiple of $y_i ({\text{br}(x_{i})}, {\text{wi}(x_{i}))}$ to $(a,b)$.}
68 | 
69 |   Now, what if improving the $i$th row messes up other rows?  Because of this
70 |   danger we'll take small steps: we'll scale those aforementioned multiples
71 |   by some small $\eta$.  That way, even if the rows all pull
72 |   $(a,b)$ in different directions, the dance will buzz close to some average
73 |   $(a,b)$ that minimizes the average row.  So let's initialize
74 |   $h=(a,b)$ arbitrarily and take a bunch of small steps!
75 |   \begin{lstlisting}[language=Python, basicstyle=\footnotesize\ttfamily]
76 |     ETA = 0.01
77 |     h = initialize()
78 |     for t in range(10000):
79 |       xfeatures, y = fetch_datapoint_from(training_examples)
80 |       leeway = y*h.dot(xfeatures)
81 |       h = h + ETA * ( y * xfeatures * (0 if leeway>0. else 1) ) # update
82 |   \end{lstlisting}
83 |   \exercise{%
84 |     Convince a friend that, for $\eta=\text{ETA}=1$, this is the
85 |     \textbf{perceptron algorithm} from lecture.
86 |   }
87 |   Choosing smaller $\eta$ means that it takes more steps to get near an optimal
88 |   $h$ but that once we get near we will stay nearby instead of jumping away.
89 |   One can aim for the best of both worlds by letting $\eta$ decay with $t$.
90 |   %Soon we'll formalize and generalize this algorithm using calculus.
91 |   \exercise{%
92 |     We could have used hinge loss instead of perceptron loss.  Mimicking the
93 |     reasoning above, derive a corresponding line of code \texttt{h = h + ...}.
94 |   }
95 | 
96 |   \newpage
97 | \sampassage{pictures of optimization}
98 | 
99 | 


--------------------------------------------------------------------------------
/tex-source/body.1.1.iterative-optimization.tex:
--------------------------------------------------------------------------------
  1 | \objectives{%
  2 |   \item implement gradient descent for any given loss function and (usually)
  3 |        thereby automatically and efficiently find nearly-optimal linear
  4 |        hypotheses from data
  5 |   \item explain why the gradient-update formulas for common linear models are
  6 |         sensible, not just formally but also intuitively
  7 | }
  8 | 
  9 | %\samquote{
 10 | %  Hey Jude, don't make it bad \\
 11 | %  Take a sad song and make it better \\
 12 | %  Remember to let her under your skin \\
 13 | %  Then you'll begin to make it \\
 14 | %  Better, better, better, better, better, better, ...
 15 | %}{paul mccartney, john lennon}
 16 | 
 17 |         %-- gradients
 18 |         %-- writing out the code : a key exercise ; batches
 19 |         %-- setting initialization and learning rate; local minima
 20 |         %-- visualizing noise and curvature
 21 | 
 22 | \sampassage{(stochastic) gradient descent}
 23 |   We seek a hypothesis that is best (among a class $\hH$) according to some
 24 |   notion of how well each hypothesis models given data:
 25 |   \begin{lstlisting}[language=Python, basicstyle=\footnotesize\ttfamily]
 26 |     def badness(h,y,x):
 27 |         # return e.g. whether h misclassifies y,x OR h's surprise at seeing y,x OR etc
 28 |     def badness_on_dataset(h, examples):
 29 |         return np.mean([badness(h,y,x) for y,x in examples])
 30 |   \end{lstlisting}
 31 |         %#return np.mean([for y,x in examples])
 32 | 
 33 |   %For example, out notion of goodness might map $h$ to its
 34 |   %training accuracy $1-\Ein$.  Or, when $h$ has a probabilistic interpretation,
 35 |   %our notion of goodness might map $h$ to the probability it predicts for the
 36 |   %training outputs $y_i$.\bovinenote{%
 37 |   %  In either case, we view our notion-of-good, computed on the training data,
 38 |   %  as an estimate of the notion-of-good we most care about: testing
 39 |   %  performance.  So $1-\Ein$ estimates $1-\Eout$ and $p(y_i|x_i;h)$ for $y_i,
 40 |   %  x_i$ a training example estimates $p(y|x;h)$ for $y,x$ fresh data.
 41 |   %}
 42 |   %
 43 |   Earlier we found a nearly best candidate by brute-force search over all
 44 |   hypotheses.  But this doesn't scale to most interesting cases wherein $\hH$
 45 |   is intractably large.
 46 |   %
 47 |   So: \emph{what's a faster algorithm to find a nearly best candidate?}
 48 | 
 49 |   A common idea is to start arbitrarily with some $h_0\in \hH$ and
 50 |   repeatedly improve to get $h_1, h_2, \cdots$.  We eventually stop, say at $h_{10000}$.
 51 |   The key question is:\bovinenote{%
 52 |     Also important are the questions of where to start and when to stop.
 53 |     But have patience!  We'll discuss these later.
 54 |   }
 55 |   \emph{how do we compute an improved hypothesis $h_{t+1}$ from our current
 56 |   hypothesis $h_t$}?
 57 | 
 58 |   We \emph{could} just keep randomly nudging $h_t$ until we hit on an
 59 |   improvement; then we define $h_{t+1}$ as that improvement.  Though this
 60 |   sometimes works surprisingly well,\bovinenote{%
 61 |     If you're curious, search `metropolis hastings' and
 62 |     `probabilistic programming'.
 63 |   } we can often save time by exploiting more available information.
 64 |   Specifically, we can inspect $h_t$'s inadequacies to inform our proposal
 65 |   $h_{t+1}$.
 66 |   %
 67 |   Intuitively, if $h_t$ misclassifies a particular $(x_i, y_i) \in \sS$, then
 68 |   we'd like $h_{t+1}$ to be like $h_t$ but nudged toward
 69 |   accurately classifying $(x_i, y_i)$.\bovinenote{%
 70 |     In doing better on the $i$th datapoint, we might mess up how we do
 71 |     on the other datapoints!  We'll consider this in due time.
 72 |   }
 73 | 
 74 |   How do we compute ``{a nudge toward accurately classifying $(x, y)$}''?  That
 75 |   is, how do measure how slightly changing a parameter affects some result?
 76 |   Answer: derivatives!  To make $h$ less bad on an example $(y, x)$, we'll
 77 |   nudge $h$ in tiny bit along $-g = -d \texttt{badness}(h,y,x) /
 78 |   dh$. Say, $h$ becomes $h-0.01g$.\bovinenote{%
 79 |     E.g.\ if each $h$ is a vector and we've chosen
 80 |     $\texttt{badness}(h,y,x) = -y h\cdot x$ as our notion of badness, then $-d
 81 |     \texttt{badness}(h,y,x) / dh = +yx$, so we'll nudge $h$ in the
 82 |     direction of $+yx$.
 83 |     \exercise{Is this update familiar?}
 84 |   }
 85 |   Once we write
 86 |   \begin{lstlisting}[language=Python, basicstyle=\footnotesize\ttfamily]
 87 |     def gradient_badness(h,y,x):
 88 |         # returns the derivative of badness(h,y,x) with respect to h
 89 |     def gradient_badness_on_dataset(h, examples):
 90 |         return np.mean([gradient_badness(h,y,x) for y,x in examples])
 91 |   \end{lstlisting}
 92 |   we can repeatedly nudge via \textbf{gradient descent (GD)}, the engine of ML:\bovinenote{%
 93 |     \noparexercise{Can GD directly minimize misclassification rate?}
 94 |   }
 95 |   \begin{lstlisting}[language=Python, basicstyle=\footnotesize\ttfamily]
 96 |     h = initialize()
 97 |     for t in range(10000):
 98 |       h = h - 0.01 * gradient_badness_on_dataset(h, examples)
 99 |   \end{lstlisting}
100 |   Since the derivative of total badness depends on all the training data,
101 |   looping $10000$ times is expensive.  So in practice we estimate the needed
102 |   derivative based on some \emph{subset} (jargon: \textbf{batch}) of the
103 |   training data --- a different subset each pass through the loop --- in what's
104 |   called \textbf{stochastic gradient descent (SGD)}:
105 |   \begin{lstlisting}[language=Python, basicstyle=\footnotesize\ttfamily]
106 |     h = initialize()
107 |     for t in range(10000):
108 |       batch = select_subset_of(examples)
109 |       h = h - 0.01 * gradient_badness(h, batch)
110 |   \end{lstlisting}
111 |   \begin{marginfigure}
112 |       \attnsam{cartoon of GD}\\
113 |       \vspace{4cm}\\
114 |       \attnsam{cartoon of GD}
115 |   \end{marginfigure}
116 | 
117 |   (S)GD requires informative derivatives.  Misclassification rate has
118 |   uninformative derivatives: any tiny change in $h$ won't change the predicted
119 |   labels.  But when we use probabilistic models, small changes in $h$ can lead
120 |   to small changes in the predicted \emph{distribution} over labels.
121 |   %
122 |   To speak poetically: the softness of probabilistic models paves a smooth ramp
123 |   over the intractably black-and-white cliffs of `right' or `wrong'.
124 |   %
125 |   We now apply SGD to maximizing probabilities.
126 | 
127 | \sampassage{maximum likelihood estimation}
128 |   When we can compute each hypothesis $h$'s asserted probability
129 |   that the training $y$s match the training $x$s, it seems
130 |   reasonable to seek an $h$ for which this probability is maximal.  This
131 |   method is \textbf{maximum likelihood estimation (MLE)}.
132 |   %
133 |   It's convenient for the overall goodness to be a sum (or average) over each
134 |   training example.  But independent chances multiply rather than add:
135 |   rolling snake-eyes has chance $1\!/\!6 \cdot 1\!/\!6$, not $1\!/\!6 + 1\!/\!6$.  So
136 |   we prefer to think about maximizing log-probabilities instead of maximizing
137 |   probabilities --- it's the same in the end.\bovinenote{%
138 |     Throughout this course we make a crucial assumption that our training
139 |     examples are independent from each other.
140 |   }
141 |   By historical
142 |   convention we like to minimize badness rather than maximize goodness, so
143 |   we'll use SGD to \emph{minimize negative-log-probabilities}.
144 |   \begin{lstlisting}[language=Python, basicstyle=\footnotesize\ttfamily]
145 |     def badness(h,y,x):
146 |         return -np.log( probability_model(y,x,h) )
147 |   \end{lstlisting}
148 | 
149 |   Let's see this in action for the linear logistic model we developed for soft
150 |   binary classification.  A hypothesis $\vec w$ predicts that a (featurized)
151 |   input $\vec x$ has label $y=+1$ or $y=-1$ with chance $\sigma(+ \vec w \cdot \vec x)$
152 |   or $\sigma(- \vec w \cdot \vec x)$:
153 |   $$
154 |     p_{\sfy|\sfx,\sfw}(y|\vec x,\vec w) = \sigma(y \vec w \cdot \vec x)
155 |     \quad\quad
156 |     \text{where}
157 |     \quad\quad
158 |     \sigma(\frd) = 1/(1-\exp(-\frd))
159 |   $$
160 |   So MLE with our logistic model means finding $\vec w$ that \emph{minimizes}
161 |   $$
162 |     -\log\wrap{\text{prob of all $y_i$s given all $\vec x_i$s and $\vec w$}}
163 |     =
164 |     \sum_i -\log(\sigma(y_i \vec w\cdot \vec x_i))
165 |   $$
166 |   The key computation is the derivative of those badness terms:\bovinenote{%
167 |     Remember that $\sigma\pr(z) = \sigma(z)\sigma(-z)$.
168 |     %
169 |     To reduce clutter we'll temporarily write $y \vec w\cdot \vec x$ as $ywx$.
170 |   }
171 |   $$
172 |     \frac{\partial (-\log(\sigma(y w x)))}{\partial w}
173 |     =
174 |     \frac{-\sigma(y w x)\sigma(-y w x) y x}{\sigma(y w x)}
175 |     =
176 |     - \sigma(-y w x) y x
177 |   $$
178 | 
179 |   \exercise{If you're like me, you might've zoned out by now.  But this stuff
180 |   is important, especially for deep learning!  So please graph the
181 |   above expressions to convince yourself that our formula for derivative
182 |   makes sense visually.}
183 | 
184 |   \vspace{\baselineskip}
185 | 
186 |   To summarize, we've found the loss gradient for the logistic model:
187 |   \begin{lstlisting}[language=Python, basicstyle=\footnotesize\ttfamily]
188 |     sigma = lambda z : 1./(1+np.exp(-z))
189 |     def badness(w,y,x):             return -np.log( sigma(y*w.dot(x)) )
190 |     def gradient_badness(w,y,x):    return -sigma(-y*w.dot(x)) * y*x
191 |   \end{lstlisting}
192 |   As before, we define overall badness on a dataset as an average badness over
193 |   examples; and for simplicity, let's intialize gradient descent at $h_0=0$:
194 |   \begin{lstlisting}[language=Python, basicstyle=\footnotesize\ttfamily]
195 |     def gradient_badness_on_dataset(h, examples):
196 |       return np.mean([gradient_badness(h,y,x) for y,x in examples])
197 |     def initialize():
198 |         return np.zeros(NUMBER_OF_DIMENSIONS, dtype=np.float32)
199 |   \end{lstlisting}
200 |   Then we can finally write gradient descent:
201 |   \begin{lstlisting}[language=Python, basicstyle=\footnotesize\ttfamily]
202 |     h = initialize()
203 |     for t in range(10000):
204 |       h = h - 0.01 * gradient_badness_on_data(h, examples)
205 |   \end{lstlisting}
206 | 
207 |   %\attnsam{mention convexity and convergence?}
208 | 
209 |   \begin{marginfigure}
210 |      \attnsam{show trajectory in weight space over time -- see how certainty
211 |   degree of freedom is no longer redundant? (``markov'')}\\
212 |       \vspace{4cm}\\
213 |     \attnsam{show training and testing loss and acc over time}
214 |   \end{marginfigure}
215 | 
216 | 
217 | 
218 |   \newpage
219 | \sampassage{initialization, learning rate, local minima}\marginnote{\veryoptional}
220 | 
221 | \sampassage{pictures of training: noise and curvature}\marginnote{\veryoptional}
222 |   \par\attnsam{}
223 |   \par\attnsam{}
224 |   \par\attnsam{test vs train curves: overfitting}
225 |   \par\attnsam{random featurization: double descent}
226 | 
227 | \sampassage{practical implementation: vectorization}
228 | 
229 | 
230 | %    \samsection{5. ideas in optimization}
231 | %      \samquote{
232 | %        premature optimization is the root of all evil
233 | %      }{donald knuth}
234 | %
235 | %        \attn{learning rate as metric; robustness to 2 noise structures}
236 | %        \attn{nesterov momentum}
237 | %        \attn{decaying step size; termination conditions}
238 | %        \attn{batch normalization}
239 | %
240 | %
241 | %
242 | %%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
243 | %%~~~~~~~~~~~~~  2.20. local minima  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
244 | %
245 | %      \sampassage{local minima}
246 | %        % convexity, initialization
247 | %
248 | %%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
249 | %%~~~~~~~~~~~~~  2.21. implicit regularization  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
250 | %
251 | %      \sampassage{implicit regularization}
252 | %
253 | %%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
254 | %%~~~~~~~~~~~~~  2.22. learning rate schedule  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
255 | %
256 | %      \sampassage{learning rate schedule}
257 | %
258 | %%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
259 | %%~~~~~~~~~~~~~  2.23. learning rates as dot products  ~~~~~~~~~~~~~~~~~~~~~~~~~
260 | %
261 | %      \sampassage{learning rates as dot products} % connects to whitening / pre-conditioning; ties into next section on kernels
262 | %
263 | %
264 | 


--------------------------------------------------------------------------------
/tex-source/body.1.2.priors.tex:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/tex-source/body.1.2.priors.tex


--------------------------------------------------------------------------------
/tex-source/body.1.3.model-selection.tex:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/tex-source/body.1.3.model-selection.tex


--------------------------------------------------------------------------------
/tex-source/body.3.2.convolution.tex:
--------------------------------------------------------------------------------
 1 | \sampassage{CNN layer} Idea of a convolutional layer: turn ``image" to ``image". Here ``image" is an abstract quantity, for example a ${ \text{(height)} \times \text{(width)} \times \text{(dim)} }$ ${ = 30 \times 30 \times 3 }$ array representing an image with ${ 30 \times 30 }$ RGB pixels. 
 2 | 
 3 | A ${ H \times W \times D }$ input array turns into another array ${ H' \times W' \times D' }$ where roughly ${ H' \simeq H }$ and ${ W' \simeq W }.$ The transformation must follow ``locality" and ``symmetry". By locality one means: output pixels are effected only from local data in the input. By symmetry one means: effect of input neighbourhoods on output pixels is invariant to translating (both neighbourhoods the same way). 
 4 | 
 5 | \sampassage{Math of a CNN layer}
 6 | A ${ H \times W \times D }$ input array ${ X }$ transforms to a ${ \tilde{H} \times \tilde{W} \times \tilde{D} }$ array ${ Y },$ and entries of ${ Y }$ are given by dot product of learned weight matrix ${ A }$ with appropriate submatrices of ${ X }.$  More precisely, $${ Y[\tilde{h}, \tilde{w}, \tilde{d}] = \sum _{\substack{0 \leq \Delta h < K \\ 0 \leq \Delta w < K \\ o \leq d < D}} A[\Delta h, \Delta w, d, \tilde{d}] \cdot X[\tilde{h} + \Delta h, \tilde{w} + \Delta w, d]  .}$$ 
 7 | 
 8 | Here ${ K }$ is called kernel size, and controls the size of input neighbourhoods used to compute the output. 
 9 | 
10 | Note symmetry holds since ${ A }$ does not depend on ${ (\tilde{h}, \tilde{w}) },$ and locality holds since we are using only local values ${ X[\tilde{h} + \Delta h, \tilde{w} + \Delta w, d] }$ to compute ${ Y[\tilde{h}, \tilde{w}, \tilde{d}] }.$
11 | 
12 | \sampassage{CNN Applications} 
13 | \begin{itemize}
14 | \item Image processing with typical CNN architecture: Passing an image through multiple CNN layers each having its own learned weight matrix, for image to image translation. 
15 | \item In above setup, a small neighbourhood in the output array contains high level information about a larger neighbourhood in the original input. Now one can  ``flatten"/``vectorise" output array and perform classification by adding for eg dense and softmax layers.
16 | \item Image generation: Given noise, prompts, etc. one can unflatten and pass through CNN layers to learn to generate images. 
17 | \end{itemize} 
18 | 
19 | ${ \textbf{Practicalities} }$: 
20 | \begin{itemize}
21 | 
22 | \item Adding pooling layers (to shrink images). These layers are not learned. 
23 | 
24 | \item Padding: Slight clipping due to boundary effects changing index range (padding with zeroes, etc.) 
25 | 
26 | \item Skip connections 
27 | 
28 | \item Stride: Instead of input neighbourhoods skipping by 1 they are more spaced 
29 | 
30 | 
31 |     
32 | \end{itemize}     
33 |     
34 |     \samsection{2. multiple layers}
35 |         We can continue alternating learned linearities with fixed
36 |         nonlinearities:
37 |         \begin{align*}
38 |           \hat p(y\!=\!+1\,|\,x) \,=\,
39 |           (\sigma_{1\times 1} \,\circ\,
40 |           A_{1\times (h\prpr +1)} \,\circ\,
41 |             &f_{(h\prpr+1) \times h\prpr} \,\circ\,
42 |           B_{h\prpr \times (h\pr+1)} \,\circ\, \\
43 |             &f_{(h\pr+1)\times h\pr} \,\circ\,
44 |           C_{h\pr\times (h+1)} \,\circ\, \\
45 |             &f_{(h+1)\times h} \,\circ\,
46 |           D_{h\times d})(x)
47 |         \end{align*}
48 | 
49 |       \sampassage{feature hierarchies}
50 |       \sampassage{bottlenecking}
51 |       \sampassage{highways}
52 |       %\sampassage{differentiation} % addressed in 0.
53 | 
54 |     \samsection{3. architecture and wishful thinking}
55 |       \sampassage{representation learning} % leads to lstms etc
56 | 
57 |     \samsection{4. architecture and symmetry} % and other priors?
58 |       \samquote{
59 |         About to speak at [conference].  Spilled Coke on left leg of jeans, so
60 |         poured some water on right leg so looks like the denim fade.
61 |       }{tony hsieh}
62 | 
63 | 
64 | 


--------------------------------------------------------------------------------
/tex-source/body.3.3.attention.tex:
--------------------------------------------------------------------------------
 1 | \samsection{5. stochastic gradient descent}
 2 |       \samquote{
 3 |         The key to success is failure.
 4 |       }{michael j.\ jordan}
 5 | 
 6 |     \samsection{6. loss landscape shape}
 7 |       \samquote{
 8 |         The virtue of maps, they show what can be done with limited space, they
 9 |         foresee that everything can happen therein.
10 |       }{jos\'e saramago}
11 | 


--------------------------------------------------------------------------------
/tex-source/body.4.1.expectation-maximization.tex:
--------------------------------------------------------------------------------
  1 | %\subsection*{Lecture 4b: Inference via Variation: Expectation Maximization}
  2 | 
  3 | % =============================================================================
  4 | % ==  _  ======================================================================
  5 | % =============================================================================
  6 | 
  7 | \sampassage{EM overview}
  8 | 
  9 | %-------  _  ------------------------------------------------------------------
 10 | \blurb{Challenge of summing}
 11 | 
 12 | % what goes wrong in 3State Traffic example if don't marginalize
 13 | 
 14 | The idea of expectation maximization is to do coordinate ascent there are two
 15 | things we don't know there's theater which is our private or we want to find a
 16 | single best value for and there is Z of the latent random variable whose
 17 | multiplicity of possible values will you wish to account for we have to treat
 18 | theater and see slightly asymmetrically because what we really want to find its
 19 | data that maximizes a certain some over zs
 20 | 
 21 | %-------  _  ------------------------------------------------------------------
 22 | \blurb{EM qualitatively: maintain many replicas with different zs}
 23 | 
 24 | % misleadingly simple cartoon of EM for 3State Traffic example
 25 | 
 26 | %-------  _  ------------------------------------------------------------------
 27 | \blurb{E and M steps: formulas}
 28 | 
 29 | mention gradient descent as option in M
 30 | mention how to incorporate priors
 31 | 
 32 | %-------  _  ------------------------------------------------------------------
 33 | \blurb{cartoon of EM for GMM}
 34 | 
 35 | %-------  _  ------------------------------------------------------------------
 36 | \blurb{cartoon of EM for HMM}
 37 | 
 38 | % =============================================================================
 39 | % ==  _  ======================================================================
 40 | % =============================================================================
 41 | 
 42 | \sampassage{EM: GMM example (more detail in pset)}
 43 | 
 44 | %-------  _  ------------------------------------------------------------------
 45 | \blurb{type signatures of E step and of M step}
 46 | %-------  _  ------------------------------------------------------------------
 47 | \blurb{qualitative behavior of E step and of M step}
 48 | %-------  _  ------------------------------------------------------------------
 49 | \blurb{formula for M step}
 50 | %-------  _  ------------------------------------------------------------------
 51 | \blurb{formula for E step}
 52 | %-------  _  ------------------------------------------------------------------
 53 | \blurb{example dynamics of EM on flower data}
 54 | 
 55 | % =============================================================================
 56 | % ==  _  ======================================================================
 57 | % =============================================================================
 58 | 
 59 | \sampassage{EM: HMM example (more detail in coding example)}
 60 | 
 61 | %-------  _  ------------------------------------------------------------------
 62 | \blurb{HMM forward model.  applications.}
 63 | %-------  _  ------------------------------------------------------------------
 64 | \blurb{M step}
 65 | %-------  _  ------------------------------------------------------------------
 66 | \blurb{E step: (inefficient) formula}
 67 | %-------  _  ------------------------------------------------------------------
 68 | \blurb{E step: efficient way to compute via dynamic programming}
 69 | %-------  _  ------------------------------------------------------------------
 70 | \blurb{Example runthrough on toy data.  applications.}
 71 | 
 72 | % =============================================================================
 73 | % ==  _  ======================================================================
 74 | % =============================================================================
 75 | 
 76 | \sampassage{Learned E steps and Neural Networks}
 77 | %-------  _  ------------------------------------------------------------------
 78 | \blurb{why are E steps hard?}
 79 | %-------  _  ------------------------------------------------------------------
 80 | \blurb{how we might throw deep learning at the E step}
 81 | %-------  _  ------------------------------------------------------------------
 82 | \blurb{first taste of VAEs: architecture for image generation}
 83 | %-------  _  ------------------------------------------------------------------
 84 | \blurb{a remark on diffusion models; distribution-level losses and GANs}
 85 | %-------  _  ------------------------------------------------------------------
 86 | \blurb{a periodic table of ways to encode distributions as neural networks}
 87 | 
 88 | % =============================================================================
 89 | % ==  _  ======================================================================
 90 | % =============================================================================
 91 | 
 92 | \sampassage{(Bonus) under the hood: ELBO bound, pingpong KL geometry}
 93 | 
 94 | %-------  _  ------------------------------------------------------------------
 95 | \blurb{useful math: interplay of logarithms and expectations}
 96 | %-------  _  ------------------------------------------------------------------
 97 | \blurb{ELBO bound and E,M steps}
 98 | %-------  _  ------------------------------------------------------------------
 99 | \blurb{KL divergence, over vs undershoot support, compression, and surprise}
100 | %-------  _  ------------------------------------------------------------------
101 | \blurb{exponential vs mixture families.  M and E projections.}
102 | %-------  _  ------------------------------------------------------------------
103 | \blurb{ping-pong picture}
104 | 
105 | 
106 | 


--------------------------------------------------------------------------------
/tex-source/body.4.2.metropolis-hastings.tex:
--------------------------------------------------------------------------------
 1 | %\subsection*{Lecture 4c: Inference via Sampling} % MH
 2 | 
 3 | \sampassage{Challenge.  MH Algorithm.  Diffusion.}
 4 |   Okay, so we have a probabilistic model and some data.  How do we do
 5 |   inference?  We want an approximate posterior over part of the model
 6 |   conditioned on the data.
 7 | \sampassage{Visualizing MH.  Proposals Matter}
 8 | \sampassage{MH: 3State Traffic example}
 9 | \sampassage{MH: HMM and GMM example}
10 | \sampassage{On Deep Learning and Noise}
11 | 


--------------------------------------------------------------------------------
/tex-source/body.4.3.deep-generators.tex:
--------------------------------------------------------------------------------
1 | %\subsection*{Lecture 4d: Variational Autoencoders (or Encoders)} % so connects back to Unit 3
2 | 
3 | \sampassage{Architecture} % comparison to matrix factorization, etc
4 | \sampassage{VAEs and ELBO}
5 | \sampassage{Interpreting Update Intuitively}
6 | \sampassage{Output side Noise Model (e.g. square loss)}
7 | \sampassage{Conditional VAEs}
8 | 


--------------------------------------------------------------------------------
/tex-source/body.5.0.reinforcement.tex:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/tex-source/body.5.0.reinforcement.tex


--------------------------------------------------------------------------------
/tex-source/body.5.1.mdps.tex:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/tex-source/body.5.1.mdps.tex


--------------------------------------------------------------------------------
/tex-source/body.5.2.q-learning.tex:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/urgent-learner/mlentary/7b74c8b19368c3a636d7c335c853bc5b3e2d5fcf/tex-source/body.5.2.q-learning.tex


--------------------------------------------------------------------------------
/tex-source/body.5.3.beyond.tex:
--------------------------------------------------------------------------------
1 | 
2 | / Shoul we think about Etics in AI in this course?
3 | 


--------------------------------------------------------------------------------
/tex-source/body.F.0.linear-algebra.tex:
--------------------------------------------------------------------------------
  1 | 
  2 |     \samsection{linear algebra and approximation}
  3 |       \samquote{
  4 |         Stand firm in your refusal to remain conscious during algebra.
  5 |         In real life, I assure you, there is no such thing as algebra.
  6 |       }{fran lebowitz}
  7 | 
  8 |         Linear algebra is the part of geometry that focuses on when a point is
  9 |         the origin, when a `line' is a straight, and when two straight lines
 10 |         are parallel.
 11 |         %
 12 |         Linear algebra thus helps us deal with the preceding pictures
 13 |         mathematically and automatically.  The concept of `straight lines'
 14 |         gives a simple, flexible model for extrapolation from known points to
 15 |         unknown points.  That is intuitively why linear algebra will be crucial
 16 |         at every stage of 6.86x.
 17 | 
 18 |       \sampassage{visualizing high dimensional spaces}
 19 |       \sampassage{column vectors and row vectors} % vectors and co-vectors
 20 |         The elements of linear algebra are \textbf{column vectors} and
 21 |         \textbf{row vectors}.\bcirc \marginnote{%
 22 |           \blarr Though we represent the two similarly in a computer's memory, they
 23 |           have different geometric meanings.  We save much anguish by
 24 |           remembering the difference.
 25 |         }
 26 |         We have a set $V$ of ``column vectors''.  We're allowed to find $V$'s
 27 |         zero vector and to add or scale vectors in $V$ to get other vectors in
 28 |         $V$.  $V$ is the primary object we hold in our mind; perhaps, if we're
 29 |         doing image classification, then each column vector represents a
 30 |         photograph.  We use the word ``space'' or ``vector space'' when talking
 31 |         about $V$ to emphasize that we'd like to exploit visual intuitions when
 32 |         analyzing $V$.  In short: we imagine each column vector as a point in
 33 |         space, or, if we like, as an arrow from the zero vector to that point.
 34 | 
 35 |         Now, associated to $V$ is the set of ``row vectors''.  Under the hood, a
 36 |         row vector is a linear function from $V$ to the real numbers $\Rr$.  We
 37 |         imagine each row vector not as an arrow but as a ``linear'' heat map or
 38 |         altitude map that assigns to each point in $V$ a numeric ``intensity''.
 39 |         We can visualize a row vector the same way makers of geographic maps
 40 |         do: using contours for where in $V$ the row vector attains values
 41 |         $\cdots,-2,-1,0,+1,+2,\cdots$.
 42 |         These will be a bunch of uniformly spaced parallel ``planes''.  The
 43 |         spacing and orientation of the planes depends on and determines the row
 44 |         vector.  In short, we imagine each row vector as a collection of
 45 |         parallel planes in space.
 46 | 
 47 |         Informally: a column vector is a noun or thing whereas a row vector is
 48 |         a adjective or property.  The degree to which a property holds on a
 49 |         thing (or a description is true of a thing) is gotten by evaluating the
 50 |         row vector on the column vector --- remember, a row vector is a
 51 |         function, so we can do this evaluation.  Geometrically, if a row vector
 52 |         is a bunch of planes and a column vector is an arrow, the two evaluate
 53 |         to a number: the number of planes that the arrow pierces.  Intuitively,
 54 |         an example of a column vector might be ``this particular photo of my
 55 |         pet cow''; an example of a row vector might be ``redness of the left
 56 |         half (of the input photo)''.  If we evaluate this row vector on this
 57 |         column vector, then we get a number indicating how intensely true it is
 58 |         that the left half of that particular photo is red.\marginnote{%
 59 |             We can draw an analogy with syntax vs semantics.  This pair of
 60 |             concepts pops up in linguistics, philosophy, circuit engineering,
 61 |             quantum physics, and more, but all we need to know is that:
 62 |             semantics is about things while syntax is about descriptions of
 63 |             things.  The two concepts relate in that, given a set of things, we
 64 |             can ask for the set of all descriptions that hold true for all
 65 |             those things simultaneously.  And if we have a set of descriptions,
 66 |             we can ask for the set of all things that satisfy all those
 67 |             descriptions simultaneously.  These two concepts stand in formal
 68 |             opposition in the sense that: if we have a set of things and make
 69 |             it bigger, then the set of descriptions that apply becomes smaller.
 70 |             And vice versa.  Then a column vector is an object of semantics.
 71 |             And a row vector is an object of syntax.
 72 |         }
 73 | 
 74 |       \sampassage{inner products}
 75 |         Now, here is the key point: the engine behind generalization in machine
 76 |         learning (at least, the machine learning we'll do in Units 1 and 2; and
 77 |         less visibly but still truly in more than half of each of Units 3,4,5)
 78 |         is the ability to translate between things and properties.  If ``my pet
 79 |         cow'' is a thing, then ``similar to my pet cow'' is a property.  The whole
 80 |         project of machine learning is to define and implement this word
 81 |         ``similar to''.  When we define and implement well, our programs can
 82 |         generalize successfully from training examples to new, previously
 83 |         unseen situations, since they will be able to see which of the training
 84 |         examples the new situations are similar to.  Since ``similar to''
 85 |         transforms things to properties, the linear algebra math behind
 86 |         ``similar to'' is a function from column vectors to row vectors.  This
 87 |         brings us to...
 88 | 
 89 |         ... inner products, aka kernels.  An inner product is just a fancy word
 90 |         for a (linear) function from column vectors to row vectors.  We
 91 |         actually demand that this linear function has two nice properties:
 92 |         FIRST, it should have an inverse.  That is, it ought to be a two way
 93 |         bridge between column vectors and row vectors, allowing us to translate
 94 |         things to descriptions and vice versa.  SECOND, it should be symmetric.
 95 |         This means that if we have two things, say ``my pet cow'' and ``my pet
 96 |         raccoon'', then the degree to which ``my pet raccoon'' has the property
 97 |         ``is similar to my pet cow'' ought to match the degree to which ``my pet
 98 |         cow'' has the property ``is similar to my pet raccoon''.  Any invertible,
 99 |         symmetric, linear function from column vectors to row vectors is called
100 |         an inner product.  Kernel is a synonym here.\marginnote{%
101 |           Beware: the same word, ``kernel'', has different meanings depending
102 |           on context.
103 |         }
104 | 
105 |         There are generally infinitely many inner products.  Which one we
106 |         choose changes the generalization properties of our machine learning
107 |         program.  Practically, if we are doing machine learning in a concrete
108 |         situation, then we want to choose an inner product that reflects our
109 |         human intuition and experience and domain knowledge about the right
110 |         notion of ``similarity'' in that situation.
111 | 
112 |         Any inner product $P$ from column vectors to row vectors induces notions
113 |         of length and angle.  We just define a column vector v's length by
114 |         $\sqrt{P(v)(v)}$.  Call that quantity $\|v\|$.  And we define the angle
115 |         $\alpha(v,w)$ between two non-zero column vectors v,w by
116 |         $P(v)(w)=\|v\|\cdot\|w\|\cdot\cos\alpha(v,w)$.  We make these
117 |         definitions so as to match the Pythagorean theorem from plane geometry.\marginnote{%
118 |           You can google up proofs of the Pythagorean theorem (many are quick and
119 |           beautiful) if you wish to dig deeper.
120 |         }
121 |         So once we choose which inner product we'll use (out of the infinitely
122 |         many choices), we get the concepts of euclidean geometry for free.
123 |         Immediately following from that definition of angle, we get that if two
124 |         column vectors have vanishing inner product (i.e., if $P(v)(w)=0$), then
125 |         those vectors are at right angles (i.e. $\alpha(v)(w)=\pi/2$).
126 | 
127 |         Now, sometimes (but most of the time not), we are blessed in
128 |         that we know more about our situation than just the space V of things.
129 |         Specifically, V might come with a canonical basis.  This just means
130 |         that V comes marked with "the right" axes with respect to which we
131 |         ought to analyze vectors in V.  In this fortunate case, there is also a
132 |         canonical inner product.  It's called dot product.  Again, I want to
133 |         emphasize that a plain vector space doesn't come with a dot product.
134 |         We need a basis for that.
135 | 
136 |         The dot product is defined as follows.  Say there are $D$ axes and that
137 |         the "unit" vectors along each axis (aka the basis vector) are named
138 |         $(v_i:0\leq i<D)$.  Then we define $P(v_j)(v_i) = (1 \text{ if } i=j
139 |         \text{ else } 0)$ --- the $1$ expresses that each "unit" basis vector
140 |         ought to have length $1$.  The $0$ expresses that different basis
141 |         vectors ought to be at right angles to each other.  This determines $P$
142 |         on all inputs, by linearity: $P(\sum c\pr_j v_j)(\sum c_i v_i)=\sum_i
143 |         c\pr_i c_i$ where $c\pr_j,c_i$ are numbers.  In short: given a basis,
144 |         there is a unique inner product such that those basis elements all have
145 |         length $1$ and are at right angles to each other.  We call that inner
146 |         product the dot product.
147 | 
148 |         \attnsam{FILL IN LINEAR DECISION BOUNDARY! (remark on featurization and
149 |         argmax nonlinearities)}
150 | 
151 |         We may \textbf{evaluate} a row vector on a column vector.  \attnsam{FILL
152 |         IN} A \textbf{dot product} is a way of translating between row and
153 |         column vectors.  \attnsam{FILL IN: DISCUSS GENERALIZATION; (DISCUSS ANGLE, TOO)}
154 | 
155 |       \sampassage{linear maps}
156 | 
157 |       \sampassage{singular value decomposition}
158 | 
159 | 
160 | 


--------------------------------------------------------------------------------
/tex-source/body.F.1.probability.tex:
--------------------------------------------------------------------------------
 1 | 
 2 |     \samsection{probability and generalization}%, independence, concentration}
 3 |       \samquote{
 4 |         Can I just say Chris for one moment that I have a new theory about the
 5 |         brontosaurus. ...  This theory goes as follows and begins now. All
 6 |         brontosauruses are thin at one end, much, much thicker in the middle
 7 |         and then thin again at the far end. That is my theory, it is mine, and
 8 |         belongs to me and I own it, and what it is too.
 9 |       }{john cleese}
10 | 
11 |       \sampassage{belief and bayes}
12 |       \sampassage{the key abstraction: averages}
13 |       \sampassage{the key approximation: independence}
14 |       \sampassage{uniform concentration}
15 | 
16 | 


--------------------------------------------------------------------------------
/tex-source/body.F.2.derivatives.tex:
--------------------------------------------------------------------------------
  1 | 
  2 |     \samsection{calculus and optimization}%gradients
  3 |       \samquote{
  4 |         The self is not something ready-made, but something in continuous
  5 |         formation through choice of action.
  6 |       }{john dewey}
  7 | 
  8 |       Throughout this section, $X$ and $Y$ will refer to two normed real
  9 |       vector spaces of finite dimension.
 10 | 
 11 |       \sampassage{asymptotic notation}
 12 |         When analyzing algorithms or data, we often wish to consider extremes
 13 |         of the very small or the very large.  Such thought experiments isolate
 14 |         how behaviors of interest depend on the variables we take to extreme
 15 |         values.
 16 | 
 17 |         We say \textbf{$f:X\to Y$ is negligible compared to $g:X\to Y$
 18 |         for sufficiently small\bovinenote{%
 19 |           The notion that ``\emph{$f$ is negligible compared to $g$ for
 20 |           sufficiently small inputs}'' is the most important of a $2\times 2$
 21 |           grid of variants:
 22 |           we may change
 23 |           $$\text{sufficiently small} \rightsquigarrow \text{sufficiently large}$$
 24 |           by replacing
 25 |           $$\text{``$0<\|x\|<\delta$''} \rightsquigarrow \text{``$\delta<\|x\|$''}$$
 26 |           and/or we may change
 27 |           \begin{align*}
 28 |               &\text{is negligible compared to}\\
 29 |               \rightsquigarrow~
 30 |               &\text{never overwhelms}
 31 |           \end{align*}
 32 |           by replacing
 33 |           \begin{align*}
 34 |               &\text{``For any positive number $\epsilon$''}\\
 35 |               \rightsquigarrow~
 36 |               &\text{``There exists a positive number $\epsilon$''}
 37 |           \end{align*}
 38 |           %
 39 |           The class of $f$s that never ovewhelm $g$ is called $O(g)$ --- pronounced
 40 |           \textbf{big-Oh}.  Clearly, $o(g) \subsetneq O(g)$.
 41 |           Confusingly, folks use the same notation $o(g), O(g)$
 42 |           when considering small inputs and large inputs;
 43 |           which sense we mean should be clear from context.
 44 |         } inputs}
 45 |         when the ratio $\|f\|/\|g\|$ is tiny for small inputs --- that is, when:
 46 |         \begin{align*}
 47 |             &\hspace{0cm}\text{For any positive number $\epsilon$}\\
 48 |             &\hspace{1cm}\text{there exists a positive number $\delta$ so that,}\\
 49 |             &\hspace{2cm}\text{whenever $0<\|x\|<\delta$,}\\
 50 |             &\hspace{3cm}\text{we also have $\|f(x)\| < \epsilon \|g(x)\|$.}
 51 |         \end{align*}
 52 |         The class of $f$s that are negligible compared to $g$ we denote by
 53 |         $$
 54 |             o(g)
 55 |         $$
 56 |         or, when abusing notation, by $o(g(x))$ even though $x$
 57 |         isn't defined.  This is \textbf{little-oh} notation.
 58 | 
 59 |         For example, if $p,q$ are positive real numbers
 60 |         then $|x|^p$ is negligible compared
 61 |         to $|x|^q$ if and only if $p<q$:
 62 |         $$
 63 |             (x \mapsto |x|^3)     \in o(x \mapsto |x|^2)
 64 |             \quad\quad
 65 |             (x \mapsto |x|^3) \not\in o(x \mapsto |x|^4)
 66 |         $$
 67 | 
 68 |         \par\noindent
 69 |         \attn{Exercise:} {Is $\sin(x) \in o(1)$?  How about $o(x)$?}
 70 | 
 71 |         \par\noindent
 72 |         \attn{Exercise:} {Is $\max(0,x) \in o(1)$?  How about $o(x)$?}
 73 | 
 74 |         \par\noindent
 75 |         \attn{Exercise:} {Is $\log |x| \in o(1/x)$?  (Ignore $x=0$.)}
 76 | 
 77 |         \par\noindent
 78 |         \attn{Exercise:} {Is $\exp(-1/|x|) \in o(x)$?  (Ignore $x=0$.)}
 79 | 
 80 |       \sampassage{derivatives}
 81 |         If $f:X\to Y$ is a (potentially nonlinear) function between two
 82 |         finite-dimensional real vector spaces, we may wish to approximate $f$
 83 |         by a linear function (plus a constant).  It is often unreasonable to
 84 |         ask that the approximation is good for all inputs; instead, we ask that
 85 |         the approximation is good near some specific input $x:X$:
 86 |         $$
 87 |             f(x+h) \approx f(x) + (Df_x)(h)
 88 |         $$
 89 |         Here, $(Df_x):X\to Y$ is a linear map that translates changes $h$ in
 90 |         $f$'s input to changes $(Df_x)(h)$ in $f$'s output.  We want the
 91 |         approximation to be good for small $h$ in the sense that the error
 92 |         vanishes faster than linearly as $h$ shrinks:
 93 |         $$
 94 |             \|f(x) + (Df_x)(h) - f(x+h)\| \in o(\|h\|)
 95 |         $$
 96 |         Intuitively, $Df_x$ exists when $f$ varies smoothly.
 97 | 
 98 |       \sampassage{integrals}
 99 |       \sampassage{}
100 | 
101 | 
102 | 


--------------------------------------------------------------------------------