├── .gitattributes ├── .gitignore ├── Computer Vision ├── 01.ImageFormation.md ├── 02.ImageProcessing.md ├── 03.BinaryImageProcessing.md ├── 04.EdgeAndBoundaryDetection.md ├── 05.CornerDetectionAndMatching.md ├── 06.AdvancedFeatures.md ├── 07.ImageAlignmentAndTransform.md ├── 08.CameraCalibrationAndPhotogrammetry.md ├── 09.MultiView.md ├── 10.VisualRecognition.md ├── 11.ImageSegmentation.md ├── 12.ObjectDetection.md ├── 13.OpticalFlow.md └── 14.FaceDetection.md ├── Convex Optimization ├── 00.NumericalLinearAlgebra.md ├── 00.Preliminaries.md ├── 01.Introduction.md ├── 02.ConvexSets.md ├── 03.ConvexFunctions.md ├── 04.ConvexOptimizationProblems.md ├── 05.Duality.md ├── 06.ApproximationAndFitting.md ├── 07.StatisticalEstimation.md ├── 08.GeometricProblems.md ├── 09.UnconstrainedMinimization.md ├── 10.EqualityConstrainedMinimization.md └── 11.InteriorPointMethods.md ├── Data Mining ├── 01.Fundamentals.md ├── 02.FindingSimilarItems_.md ├── 03.Clustering.md ├── 04.DimensionalityReduction.md ├── 05.LinkAnalysis.md ├── 06.CommunityDetection.md ├── 07.LinkPrediction.md ├── 08.FrequentItemsets.md ├── 09.DataStreamMining.md └── 10.DifferentialPrivacy.md ├── Database System Concepts ├── 01.Introduction.md ├── 02.RelationalModel.md ├── 03.SQLBasics.md ├── 04.SQLAdvanced.md ├── 05.DatabaseDesignTheory.md ├── 06.DataStorageStructures.md ├── 07.IndexAndHashing.md ├── 08.QueryProcessing.md ├── 09.QueryOptimization.md ├── 10.TransactionProcessing.md └── 11.Recovery.md ├── Digital Signal and Image Processing ├── AI2614-DSIP.md ├── DIP-2DFT.md ├── DIP-Fundamentals.md ├── DIP-HistogramProcessing.md ├── DIP-IntensityTransformAndSpatialFiltering.md ├── DIP-Restoration.md ├── DIP-SpatialFiltering.md ├── DSP-DFT.md ├── DSP-FFT.md ├── DSP-FunctionalInterpretationOfSignals.md ├── DSP-STFT.md └── DSP-SamplingAndInterpolation.md ├── Foundations of Programming Languages ├── 01.BasicSetTheory.md ├── 02.OperationalSemantics.md ├── 03.PrinciplesOfInduction.md ├── 04.InductiveDefinitions.md ├── 05.DenotationalSemantics.md ├── 06.AxiomaticSemantics.md ├── 07.CompletenessOfHoareRules.md ├── 08.DomainTheory.md └── 09.LanguagesWithHigherTypes.md ├── LICENSE ├── Machine Learning ├── Classification.tex ├── Clustering.tex ├── DimensionReduction.tex ├── Introduction.tex ├── LinearDiscriminantAnalysis.tex ├── MachineLearning.pdf ├── MachineLearning.tex ├── Regression.tex └── SupportVectorMachine.tex ├── Mathematical Logic ├── DeductiveCalculus.tex ├── FirstOrderLogic.tex ├── InformalNotionsOfAlgorithms.tex ├── MathematicalLogic.pdf ├── MathematicalLogic.tex ├── SententialLogic.tex ├── SetTheory.tex └── wrapup.tex ├── Operating System ├── 01.ARM汇编.md ├── 02.链接.md ├── 03.操作系统概述.md ├── 04.进程.md ├── 05.文件.md ├── 06.内存地址翻译.md ├── 07.系统初始化.md ├── 08.虚拟内存管理.md ├── 09.物理内存管理.md ├── 10.进程管理.md ├── 11.处理器调度.md ├── 12.多线程.md ├── 13.同步原语.md ├── 14.同步原语的实现.md ├── 15.文件系统.md ├── 16.文件系统实现.md ├── 17.设备管理与驱动.md └── 18.存储与文件系统.md ├── Provable Security ├── 00.CheatSheet.md ├── 01.Introduction.md ├── 02.ModernCryptography.md ├── 03.PublicKeyEncryption.md ├── 04.PreliminaryOfProvableSecurity.md ├── 05.StatisticalDistance.md ├── 06.ComputationalIndistinguishability.md ├── 07.OnewayFunctions.md ├── 08.PublicKeyCryptographyMaths.md ├── 09.ElGammalEncryption.md ├── 10.IdentityBasedEncryption.md ├── 11.AttributeBasedEncryption.md ├── 12.HierarchicalDeterministicWallet.md └── 13.PseudorandomFunctions.md ├── README.md ├── Reinforcement Learning ├── 01.Introduction.md ├── 02.MultiArmBandit.md ├── 03.MarkovDecisionProcess.md ├── 04.ModelFreeControl.md ├── 
05.ProgrammingAndLearning.md ├── 06.ApproximationMethods.md ├── 07.DeepRL.md ├── 08.DeepPolicyNetworks.md ├── 09.ModelBasedDRL.md ├── 10.ImitationLearning.md └── 11.OfflineRL.md ├── Speech Recognition ├── code_demo │ └── viterbi_demo.py ├── 大词表连续语音识别.md ├── 概率论与贝叶斯决策理论.md ├── 深度神经网络模型.md ├── 熵.md ├── 特征提取.md ├── 统计语言模型.md ├── 说话人识别与说话人日志.md └── 隐马尔科夫模型.md ├── Statistical Learning And Inference ├── 01.Introduction.md ├── 02.SupervisedLearning.md ├── 03.LinearRegression.md ├── 04.LinearClassifiers.md ├── 05.KernelMethod.md ├── 06.SVM.md ├── 07.BasisExpansion.md ├── 08.ModelSelection.md ├── 09.ModelInference.md ├── 10.ZeroShotLearning.md ├── 11.FewShotLearning.md ├── 12.DomainAdaptation.md ├── 13.DomainGeneralization.md ├── 14.WeaklySupervisedLearning.md ├── 15.UnsupervisedLearning.md └── figs │ ├── function-estimation-model.png │ └── ultimate-em-algorithm-figure.png └── Stochastic Processes ├── BrownianMotion.tex ├── ContinousTimeMarkovChain.tex ├── Diffusion.tex ├── DiscreteMarkovChain.tex ├── MarkovRandomFields.tex ├── Martingale.tex ├── NotesTo2613.pdf ├── NotesTo2613.tex └── PoisonProcess.tex /.gitattributes: -------------------------------------------------------------------------------- 1 | *.pdf filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Core latex/pdflatex auxiliary files: 2 | *.aux 3 | *.lof 4 | *.log 5 | *.lot 6 | *.fls 7 | *.out 8 | *.toc 9 | *.fmt 10 | *.fot 11 | *.cb 12 | *.cb2 13 | .*.lb 14 | 15 | ## Intermediate documents: 16 | *.dvi 17 | *.xdv 18 | *-converted-to.* 19 | # these rules might exclude image files for figures etc. 20 | # *.ps 21 | # *.eps 22 | # *.pdf 23 | 24 | ## Generated if empty string is given at "Please type another file name for output:" 25 | .pdf 26 | 27 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 28 | *.bbl 29 | *.bcf 30 | *.blg 31 | *-blx.aux 32 | *-blx.bib 33 | *.run.xml 34 | 35 | ## Build tool auxiliary files: 36 | *.fdb_latexmk 37 | *.synctex 38 | *.synctex(busy) 39 | *.synctex.gz 40 | *.synctex.gz(busy) 41 | *.pdfsync 42 | 43 | ## Build tool directories for auxiliary files 44 | # latexrun 45 | latex.out/ 46 | 47 | ## Auxiliary and intermediate files from other packages: 48 | # algorithms 49 | *.alg 50 | *.loa 51 | 52 | # achemso 53 | acs-*.bib 54 | 55 | # amsthm 56 | *.thm 57 | 58 | # beamer 59 | *.nav 60 | *.pre 61 | *.snm 62 | *.vrb 63 | 64 | # changes 65 | *.soc 66 | 67 | # comment 68 | *.cut 69 | 70 | # cprotect 71 | *.cpt 72 | 73 | # elsarticle (documentclass of Elsevier journals) 74 | *.spl 75 | 76 | # endnotes 77 | *.ent 78 | 79 | # fixme 80 | *.lox 81 | 82 | # feynmf/feynmp 83 | *.mf 84 | *.mp 85 | *.t[1-9] 86 | *.t[1-9][0-9] 87 | *.tfm 88 | 89 | #(r)(e)ledmac/(r)(e)ledpar 90 | *.end 91 | *.?end 92 | *.[1-9] 93 | *.[1-9][0-9] 94 | *.[1-9][0-9][0-9] 95 | *.[1-9]R 96 | *.[1-9][0-9]R 97 | *.[1-9][0-9][0-9]R 98 | *.eledsec[1-9] 99 | *.eledsec[1-9]R 100 | *.eledsec[1-9][0-9] 101 | *.eledsec[1-9][0-9]R 102 | *.eledsec[1-9][0-9][0-9] 103 | *.eledsec[1-9][0-9][0-9]R 104 | 105 | # glossaries 106 | *.acn 107 | *.acr 108 | *.glg 109 | *.glo 110 | *.gls 111 | *.glsdefs 112 | *.lzo 113 | *.lzs 114 | 115 | # uncomment this for glossaries-extra (will ignore makeindex's style files!) 
116 | # *.ist 117 | 118 | # gnuplottex 119 | *-gnuplottex-* 120 | 121 | # gregoriotex 122 | *.gaux 123 | *.gtex 124 | 125 | # htlatex 126 | *.4ct 127 | *.4tc 128 | *.idv 129 | *.lg 130 | *.trc 131 | *.xref 132 | 133 | # hyperref 134 | *.brf 135 | 136 | # knitr 137 | *-concordance.tex 138 | # TODO Comment the next line if you want to keep your tikz graphics files 139 | *.tikz 140 | *-tikzDictionary 141 | 142 | # listings 143 | *.lol 144 | 145 | # luatexja-ruby 146 | *.ltjruby 147 | 148 | # makeidx 149 | *.idx 150 | *.ilg 151 | *.ind 152 | 153 | # minitoc 154 | *.maf 155 | *.mlf 156 | *.mlt 157 | *.mtc[0-9]* 158 | *.slf[0-9]* 159 | *.slt[0-9]* 160 | *.stc[0-9]* 161 | 162 | # minted 163 | _minted* 164 | *.pyg 165 | 166 | # morewrites 167 | *.mw 168 | 169 | # nomencl 170 | *.nlg 171 | *.nlo 172 | *.nls 173 | 174 | # pax 175 | *.pax 176 | 177 | # pdfpcnotes 178 | *.pdfpc 179 | 180 | # sagetex 181 | *.sagetex.sage 182 | *.sagetex.py 183 | *.sagetex.scmd 184 | 185 | # scrwfile 186 | *.wrt 187 | 188 | # sympy 189 | *.sout 190 | *.sympy 191 | sympy-plots-for-*.tex/ 192 | 193 | # pdfcomment 194 | *.upa 195 | *.upb 196 | 197 | # pythontex 198 | *.pytxcode 199 | pythontex-files-*/ 200 | 201 | # tcolorbox 202 | *.listing 203 | 204 | # thmtools 205 | *.loe 206 | 207 | # TikZ & PGF 208 | *.dpth 209 | *.md5 210 | *.auxlock 211 | 212 | # todonotes 213 | *.tdo 214 | 215 | # vhistory 216 | *.hst 217 | *.ver 218 | 219 | # easy-todo 220 | *.lod 221 | 222 | # xcolor 223 | *.xcp 224 | 225 | # xmpincl 226 | *.xmpi 227 | 228 | # xindy 229 | *.xdy 230 | 231 | # xypic precompiled matrices and outlines 232 | *.xyc 233 | *.xyd 234 | 235 | # endfloat 236 | *.ttt 237 | *.fff 238 | 239 | # Latexian 240 | TSWLatexianTemp* 241 | 242 | ## Editors: 243 | # WinEdt 244 | *.bak 245 | *.sav 246 | 247 | # Texpad 248 | .texpadtmp 249 | 250 | # LyX 251 | *.lyx~ 252 | 253 | # Kile 254 | *.backup 255 | 256 | # gummi 257 | .*.swp 258 | 259 | # KBibTeX 260 | *~[0-9]* 261 | 262 | # TeXnicCenter 263 | *.tps 264 | 265 | # auto folder when using emacs and auctex 266 | ./auto/* 267 | *.el 268 | 269 | # expex forward references with \gathertags 270 | *-tags.tex 271 | 272 | # standalone packages 273 | *.sta 274 | 275 | # Makeindex log files 276 | *.lpz 277 | -------------------------------------------------------------------------------- /Computer Vision/02.ImageProcessing.md: -------------------------------------------------------------------------------- 1 | # Image Processing 2 | 3 | --- 4 | 5 | > Part 0: Preliminary 6 | 7 | --- 8 | 9 | ## Functional Interpretation of an Image 10 | 11 | - $f(x,y)$ is the image intensity at $(x,y)$. 12 | - Image processing is a transoformation $t$: $t(f(x,y))$. 13 | 14 | ## Linear Shift-Invariant Systems 15 | 16 | - Ideal lens is an LSI system 17 | - Defocuesd Image $g$ is a processed version of Focused Image $f$. 
- Linearity: Variation in brightness
- Shift: Movement of scenes

## Unit Impulse Function

$$\delta(x)=\begin{cases}
1/2\varepsilon \quad & |x| \le \varepsilon\\
0 \quad & |x| > \varepsilon
\end{cases} \quad (\varepsilon\to 0)$$

- $\int \delta(x) \mathrm{d}x = 1$
- $\delta(x)*h(x) = h(x)$

---

> Part 1: Spatial Processing

---

## Pixel-level processing

- darken $f-128$
- lighten $f + 128$
- invert $255-f$
- low contrast $f/2$
- high contrast $f * 2$
- gray $0.3f_R+0.6f_G+0.1f_B$

## Convolution

### 1D Convolution

$$f(t) * h(t) = \int f(\tau)h(t-\tau)\mathrm{d}\tau$$

- *Convolution implies LSI and LSI implies convolution.*

### 2D Convolution

$$ g(x,y) = \iint f(u,v)h(x-u,y-v)\mathrm{d}u\mathrm{d}v $$

$$ g[i,j] = \sum_{m=1}^M\sum_{n=1}^N f[m,n]h[i-m, j-n] $$

### Border Problem and Padding

- Ignore border
- Pad with constants
- Pad with reflection

> "Our Einstein, Ai-chan."

## Spatial Filtering

- Impulse filtering
- Mean filtering
  - Orig - MeanFilter = Sharpened
- Box filtering
  - Does not look natural
  - Has blocky artifacts
- Fuzzy filter (Gaussian filtering)

### Gaussian Kernel

$$ n_{\sigma}[i,j] = \frac{1}{2\pi\sigma^2}e^{-\frac{i^2+j^2}{2\sigma^2}} $$

- Rule of thumb: $KernelSize=2\pi\sigma$
- A larger kernel (or $\sigma$) results in more blurring

#### Separability of Gaussian Filter

$$g[i,j] = \frac{1}{2\pi\sigma^2}\sum_{m=-K/2}^{K/2}\sum_{n=-K/2}^{K/2}e^{-\frac{m^2+n^2}{2\sigma^2}}f[i-m, j-n] $$

$$ g[i,j] = \frac{1}{2\pi\sigma^2}\sum_m e^{-\frac{m^2}{2\sigma^2}}\cdot\sum_ne^{-\frac{n^2}{2\sigma^2}}f[i-m,j-n] $$

- A 2D Gaussian filter can be equivalently replaced by two 1D Gaussian filters
- Lower time complexity
  - From $K^2$ multiplications and $K^2-1$ additions to $2K$ multiplications and $2(K-1)$ additions

### Denoise with Smoothing

- Problems with Gaussian smoothing
  - Sensitive to outliers
  - Smooths out edges

#### Median Filtering

- Sort the $K^2$ values in the window centered at the pixel
- Assign the median to the pixel
- Can handle salt-and-pepper noise
- Non-linear (involves sorting)
  - Cannot be implemented by convolution
- Drawbacks
  - Not effective when the noise is not simply salt and pepper
  - A large kernel also blurs edges

#### Bilateral Filter

##### Gaussian Filtering Revisited

- If we apply the same Gaussian kernel everywhere, it will blur the edges
  - Because the filter has no knowledge of foregrounds and backgrounds
- Solution
  - Add weights
  - Bias the Gaussian kernel s.t. pixels that are not similar in intensity to the center pixel get smaller weights

##### Bilateral Filtering

$$ g[i,j] =\frac{1}{W_{sb}} \sum_m\sum_n f[m,n]n_{\sigma_s}[i-m,j-n]n_{\sigma_{b}}(|f[m,n]-f[i,j]|) $$

- $n_{\sigma_s}$ is the regular spatial Gaussian filter
- $n_{\sigma_b}$ is the brightness Gaussian, used for adjusting weights
  - similar pixels have higher weights
  - non-similar pixels have lower weights
- If $\sigma_b \to \infty$, the bilateral filter reduces to an ordinary Gaussian filter.
- Non-linear operation, cannot use convolution

> "The freckles are all gone, and the facial features are all preserved. Very nice."
> "The skin smoothing is a bit overdone."

## Template Matching

> "So classic that nobody uses it anymore."

*How do we locate a target template $t$ in a given image $f$?*

### Formulation

Minimize

$$ E[i,j] = \sum_m\sum_n\left(f[m,n]-t[m-i,n-j] \right)^2 $$

which (expanding the square; the template energy term is constant) is equivalent to maximizing the cross term

$$ \sum_m\sum_n f[m,n]t[m-i,n-j] $$

### Cross-Correlation

$$ t \otimes f = R_{tf}[i,j] = \sum_m\sum_nf[m,n]t[m-i,n-j] $$

#### Convolution and Correlation

Correlation

$$ t \otimes f = \sum_m\sum_n f[m,n] t[m-i,n-j] $$

Convolution

$$ t * f = \sum_m\sum_nf[m,n]t[i-m,j-n] $$

#### Normalization

- Problem with Cross-Correlation
  - Unnormalized input can give erroneous results

$$ N_{tf}[i,j] = \frac{t \otimes f}{\|f\|_2\cdot\|t\|_2} $$

#### Drawbacks

- Border problem
- Sensitive to object pose, scale and rotation
- Not good for general object detection
  - Can only match templates
- Can be computationally expensive

---

> Part 2: Spectral Processing

---

## 2D Fourier Transform

> Any periodic function can be rewritten as a **weighted sum** of **infinite sinusoids** of different frequencies.

`LazyError: Refer to AI2614 DSIP`

--------------------------------------------------------------------------------
/Computer Vision/06.AdvancedFeatures.md:
--------------------------------------------------------------------------------

# 2D Recognition with SIFT

- Corners are not scale invariant, so we need better features.

## Recap: What is a Good Feature

- Has rich image content within a local window
- Has a well-defined representation for matching/comparing with other points
- Has a well-defined position
- Should be invariant to rotation and scaling
- Should be relatively invariant to lighting changes

## From Blob Detection to SIFT

### Overview

- Use **blob-like** features for 2D recognition
- Need to
  - Locate blobs
  - Determine their size
  - Determine their orientation
  - Formulate a description or signature that is independent of size and orientation

#### Observation

- If we compute the second-order derivative of an image with a Gaussian kernel of size $\sigma$ (i.e. a Laplacian kernel of size $\sigma$), and if the size of a blob is similar to the size of the kernel, there will be a local extremum in the result $\frac{\partial^2 n_\sigma}{\partial x^2} \ast f$
- The size of the kernel can then represent the size of the blob

#### Characteristic Scale

- Define the characteristic scale to be the $\sigma$ at which the second derivative of a blob attains its local extremum

##### Scale Normalization

- We want to find the characteristic scale of the blob by convolving it with Laplacian operators at several scales and looking for the maximum response
- However, the Laplacian response **decays** as the scale increases
  - This is caused by the normalizing factor of Gaussian kernels $1/\sigma\sqrt{2\pi}$, which gets smaller as $\sigma$ increases.

Therefore we want to normalize the response.
- To do this, we **normalize the Laplacian response** by multiplying it with $\sigma^2$
  - Because the $1/\sigma$ in the Gaussian becomes $1/\sigma^2$ in the Laplacian

#### 1D Blob Detection

Given a 1D signal $f(x)$

1. Compute $\frac{\partial^2 n_\sigma}{\partial x^2} \ast f$ at different scales of $\sigma$
2. $(x^\ast,\sigma^\ast) = \arg\max\left| \frac{\partial^2n_\sigma}{\partial x^2}\ast f(x) \right|$

### 2D Blob Detection

- Normalized Laplacians of Gaussian (NLoGs) are used as the 2D equivalent for blob detection.

#### Scale-Space

As we increase $\sigma$, the resolution becomes lower. Define the scale space as the space created by the filtering results for different $\sigma$

$$ S(x,y,\sigma) = n(x,y,\sigma) \ast f(x,y) $$

#### Creating Scale Space

- Ideally, the scale space is continuous
- But computers can only handle discrete spaces
- So we define an initial $\sigma_0$ and multiply it by powers of a constant multiplier $s$

$$ \sigma_k = \sigma_0s^k $$

- Given a blob area, there will be a $\sigma^\ast$ such that the response of this blob is maximized
  - Then we will be able to know the characteristic scale and the size of the blob
- For a large flat area, no extremum w.r.t. $\sigma$ can be found

#### 2D Blob Detection Procedure

Given an image $I(x,y)$

1. Convolve the image with many NLoGs of different scales
2. Find $\arg\max_{x,y,\sigma}|\sigma^2\nabla^2S(x,y,\sigma)|$
3. $(x,y)$ gives the location and $\sigma$ gives the size

## Scale Invariant Feature Transform (SIFT)

- An efficient implementation of the blob detector
- Uses the Difference of Gaussians (DoG) as an approximation of the NLoG

$$ DoG = (n_{s\sigma} - n_{\sigma}) \approx (s-1)\sigma^2\nabla^2n_{\sigma} $$

### Basic Procedure

1. Compute differences of Gaussians
2. Find extrema in every 3x3x3 neighbourhood
3. Select interest point candidates (may contain weak extrema and bad contrast)
4.
Remove weak extrema

### Descriptors for SIFT Features

#### Principal Orientation

- Use the histogram of gradient directions
- For each detected blob
  - Divide the blob into grids
  - Group the grids into cells
  - Compute histograms of gradients
  - Choose the most prominent gradient direction

#### Matching Features

- Use the characteristic size to match sizes
- Use the principal orientation to match orientations

#### SIFT Descriptor

- Use histograms of gradient directions as the descriptor
- Should be normalized using the principal gradient orientation
- A common implementation uses a 4x4 grid of cells, each cell with a histogram of 8 bins
  - So the result is 128-dimensional

## Other Descriptors

### Histogram of Oriented Gradients (HoG)

- Used for pedestrian detection

### Shape Contexts

Can be used as a descriptor to match shapes

- Sample some points along each edge
- For each point, construct a log-polar coordinate system centered at it
- Divide the coordinate system into bins
- Count the number of boundary points in each bin

--------------------------------------------------------------------------------
/Computer Vision/10.VisualRecognition.md:
--------------------------------------------------------------------------------

# Visual Recognition (Pre-DeepLearning)

> "Oh, so you were all born after 2000?"
> "My respects, my respects."

## Overview

### Image Representation

#### Descriptors

- SIFT
- HOG
- CodeWords

### Feature Learning

#### Discriminative and Generative Models

- Discriminative models model the posterior ratio
  - Find a 'decision boundary'
- Generative models model the likelihood ratio

## Appearance Recognition by PCA

### Offline Stage

Given $M$ learning images $\{I_1^{(q)},\dots,I_M^{(q)}\}$ for each of the $Q$ training objects, $q \in \{ 1,\dots,Q \}$

1. Normalize all images to remove brightness variations $I' = I / \|I\|$
2. Convert image $I'$ to a feature vector $f'$
3. Compute the mean feature vector $c$
4. Remove the mean $f = f' - c$
5. Compute the correlation matrix $R$
6. Compute the PCA $E=[e_1,\dots,e_K]$ and project the learning images into the eigenspace $p = Ef$

### Online Stage

Given an input image $I$

1. Normalize the image
2. Convert it to a feature vector
3. For each object $q$ in the database
   1. Remove the mean $f^{(q)} = f' - c^{(q)}$
   2. Project into the eigenspace
   3. Find the object $q$ that minimizes the distance $d^{(q)}$

## Bag of 'Features'

### Bag of Features for Detection

1. Take a bunch of images
2. Extract features
3. Build up a dictionary of features
4. Given a new image, extract features and build a histogram for each feature
5. Find the closest visual 'word' in the dictionary

### Outline

1. Extract visual features
2. Learn the 'visual vocabulary'
3. Quantize features using the visual vocabulary
4.
Represent images by frequencies of visual words

### Feature Detection and Representation

#### Detection

- Regular grids
- Interest point detectors
- Other methods
  - Random sampling
  - Segmentation-based patches

#### Representation

SIFT descriptors of patches can be a good choice

### Constructing the Visual Dictionary

- Idea
  - Common categories should have clusters of similar features
  - So we find the clusters and check which clusters tend to support which categories
  - K-Means or EM can be applied to do this

The visual dictionary (codebook) is then used to quantize features

- A vector quantizer takes a feature vector as input and maps it to the index of the nearest codevector in a codebook

## TF-IDF Weighting

- Use **inverted files** to speed up the computation of TF-IDF
  - An inverted file is a mapping from a feature to all documents that contain the feature

--------------------------------------------------------------------------------
/Computer Vision/11.ImageSegmentation.md:
--------------------------------------------------------------------------------

# Image Segmentation

## Principles of Perceptual Organization and Gestalt

- Similarity
- Parallelism
- Symmetry
- Common Fate
- Proximity
- Figure-Ground
- Continuity
- Closure

### Intuitions

- The whole is greater than the sum of its parts
- Relationships among parts can yield new properties

## Overview of Segmentation

### Goal

- Group pixels into meaningful or perceptually similar regions
- Separate images into coherent objects

### Workflow

- Bottom-up
  - Group tokens with similar features
- Top-down
  - Group tokens that likely belong to the same object

## Bottom-Up Segmentation via Clustering

- For very basic toy images, we can easily perform segmentation by grouping pixels according to their intensities
- For more complicated images, if we still group pixels by intensity, then we will need clusters of intensities

The best cluster centers are those that minimize the SSD between all points and their nearest cluster center

### K-Means

- Algorithm
  - Omitted
- Properties
  - Will always converge to some solution
  - Which can be a local minimum

#### Choosing Features

- Intensity
- RGB
- Position

#### Pros and Cons

- Pros
  - Simple
  - Converges to a local minimum
- Cons
  - Choice of $k$
  - Sensitive to outliers
  - Can only handle convex structures
  - Assumes means can be computed

### Mean Shift

> A versatile technique for cluster-based segmentation

#### Kernel Density Estimation

$$ \hat{f}_h(x) = \frac{1}{nh}\sum_{i=1}^n K\left( \frac{x-x_i}{h} \right) $$

where $K$ is a kernel function, for example the Gaussian kernel

$$ K\left( \frac{x - x_i}{h} \right) = \frac{1}{\sqrt{2\pi}}e^{-\frac{(x-x_i)^2}{2h^2}} $$

#### Overview of Mean Shift

1. Compute the mean shift vector $m(x)$
2.
Translate the kernel window by $m(x)$

#### Attraction Basins and Clusters

- Attraction Basin
  - The region for which all trajectories lead to the same mode
- Cluster
  - All data points in the attraction basin of a mode

#### Procedure

1. Choose a kernel and a bandwidth
2. For each point
   1. Center a window on the point
   2. Compute the mean of the data in the search window
   3. Center the search window at the new mean location
   4. Repeat 2, 3 until convergence
3. Assign points that lead to nearby modes to the same cluster

#### Implementation Considerations

- Speedup
  - Use bins instead of individual points
    - Individual points are represented by a bin whose center is the center of mass of this group of points
  - Use k-d trees or approximate NN for fast neighbour search
  - Update all windows in each iteration for faster convergence
- Other tricks
  - Use KNN to determine window sizes adaptively

#### Pros and Cons

- Pros
  - Good general-purpose segmentation
  - Flexible in the number and shape of regions
  - Robust to outliers
- Cons
  - Choice of kernel size
  - Not suitable for high-dimensional data

## Segmentation as Graph Partition

- Segmentation can be implemented via Graph Cut
- MinCut is not suitable because it favours isolated pixels

### Normalized Cut (NCut)

$$ NCut(A, \bar{A}) = \frac{cut(A,\bar{A})}{assoc(A,V)} + \frac{cut(A,\bar{A})}{assoc(\bar{A},V)} $$

where

$$ assoc(A,V) = \sum_{u\in A,t\in V} w(u,t) $$

- NP-hard

#### Approximation Formulation

Assume we partition the graph into 2 parts. We use a vector to represent which part each node has been assigned to. Let $1$ denote $v \in A$ and $-1$ denote $v \notin A$.

$$ NCut(A,\bar{A}) = \frac{\sum_{x_i>0,x_j<0}-w_{ij}x_ix_j}{\sum_{x_i>0}d_i} +\frac{\sum_{x_i<0,x_j>0}-w_{ij}x_ix_j}{\sum_{x_i<0}d_i} $$

Let $W$ be the affinity matrix encoding edge weights, and let $D$ denote the diagonal matrix with $d_i$ on its main diagonal.
Then

$$ NCut(x) = \min_{y}\frac{y^\top(D-W)y}{y^\top Dy} $$

where

$$ y = \frac{1}{2}[(1+x) - b(1-x)] \quad b= \frac{\sum_{x_i>0}d_i}{\sum_{x_i<0}d_i}$$

subject to the constraint

$$ y^\top D\mathbf{1} = 0 $$

Let $\mathcal{L} = I - D^{-1/2}WD^{-1/2}$. We perform EVD on $\mathcal{L}$ and choose the eigenvector with the second smallest eigenvalue as the solution (to avoid the trivial solution)

--------------------------------------------------------------------------------
/Computer Vision/12.ObjectDetection.md:
--------------------------------------------------------------------------------

# Object Detection

## Pedestrian Detection with HoG

HoG: Histogram of Oriented Gradients

- Local object appearance and shape can often be characterized rather well by the distribution of local intensity gradients or edge directions

### Acquiring HoGs

- Divide the image into small spatial regions (cells)
  - Cells can be either rectangular or radial
- Each cell accumulates a weighted local 1-D histogram of gradient directions over the pixels of the cell

### Normalization

- Contrast-normalize the local responses for better invariance to illumination and shadowing

### Dalal & Triggs Detector

- Construct an image pyramid to get the whole image at multiple resolutions
- Score every window of the feature pyramid
- Apply non-maximum suppression

## Deformable Part Models (DPM)

- Use part-based models and pictorial structures
- DPM = Dalal & Triggs Detector + part-based modeling
- Add parts at relative locations and scales

### Structure of DPM

- Root filter
  - The root part of an object is modeled with a HoG template
- Part filters
  - Part filters operate on a picture at 2x resolution for more fine-grained gradient information
- Deformation costs
  - Parts are allowed to move slightly around their expected locations
  - Cost for the deformation between parts
  - The cost is modeled by a quadratic function

The final score is given by the filter scores and spring costs

$$ Score(I, p_0) = \max \sum_{i=0}^n m_i(I, p_i) - \sum_{i=1}^n d_i(p_0, p_i) $$

### Training

- Parameters to learn
  - Biases (per component)
  - Deformation costs (per part)
  - Filter weights

#### Latent SVM

- Cannot directly train the DPM with an SVM
  - Because the parts do NOT have annotations
  - They are latent

--------------------------------------------------------------------------------
/Computer Vision/14.FaceDetection.md:
--------------------------------------------------------------------------------

# Face Detection (Pre-Deep-Learning)

## Face Detection in Computers

- Use a sliding window to search for face models
- For each window
  - Extract features
  - Match the face model (classifier)

## Haar Features

- A set of correlation responses to rectangular Haar filters
- A Haar filter is a filter consisting of regions of 1's and -1's (denote 1's by 'white' regions, and -1's by 'black' regions)

$$ V_A = I \otimes H_A = \sum I[\text{white regions}] - \sum I[\text{black regions}] $$

- Haar features capture **directional patterns**

### Computational Cost

Assume the filter is $N\times M$; then it would take $N\times M$ additions per pixel per filter per scale.
- Expensive

### Integral Image

- The integral image helps compute Haar features efficiently
- An integral image is a table that holds the sum of all pixel values to the left and top of a given pixel, inclusive
- The integral image enables fast summations over arbitrary rectangles
  - It reduces the summation over an arbitrary rectangular region to 3 additions

--------------------------------------------------------------------------------
/Convex Optimization/00.NumericalLinearAlgebra.md:
--------------------------------------------------------------------------------

# Numerical Linear Algebra

## Numerical Complexity

- Floating point operations (flops)
  - Addition, subtraction, multiplication, division, square root, etc.
- Only consider dominant terms
- Not very accurate

### Complexity of Multiplications

- $x^Ty$, where $x, y \in \mathbb{R}^n$: $O(n)$ flops ($2n-1$ exactly).
- $Ax$, where $A \in \mathbb{R}^{m\times n}, x \in \mathbb{R}^n$: $O(mn)$ flops ($m(2n-1)$ exactly).
- $AB$, where $A \in \mathbb{R}^{m \times n}, B \in \mathbb{R}^{n \times p}$: $O(mnp)$.

## Solving Linear Equations with Factored Matrices

### Linear Equations that are Easy to Solve

- **Diagonal matrix.** $Ax = b \Leftrightarrow x = A^{-1}b$, i.e., $x_i = b_i / a_{ii}$.
  - $O(n)$ flops.
- **Lower triangular matrix.** Forward substitution: $x_i = \frac{1}{a_{ii}}\left(b_i - \sum_{j=1}^{i-1}a_{ij}x_j\right)$.
  - $O(n^2)$ flops.

### LU Factorization

#### LU Factorization and Gaussian Elimination

Every nonsingular matrix $A \in \mathbb{R}^{n \times n}$ can be factored as

$$ A = PLU $$

where $P$ is a permutation matrix, $L$ is unit lower triangular, and $U$ is upper triangular and nonsingular.

The complexity of LU factorization is about $(2/3)n^3$ flops, if no structure of $A$ is exploited.

--------------------------------------------------------------------------------
/Convex Optimization/01.Introduction.md:
--------------------------------------------------------------------------------

# Introduction

> -- "Whether I know you or not makes a real difference when I'm grading your exams."
>
> -- "I don't mean to threaten anyone."

## Overview of the Course

> "Given this class's past performance, scoring 30 points would be somewhat difficult."

1. Introduction.
2. Convex sets.
   - Line segment, convex set, cone.
   - Operations that preserve convexity.
3. Convex functions.
   - Definition, first-order & second-order conditions.
   - Epigraph, sublevel set.
   - Operations that preserve convexity.
   - Conjugate function.
4. Convex optimization problems.
   - Linear programming.
5. Duality.
   - Lagrangian.
   - Strong and weak duality, constraint qualifications.
   - KKT conditions.
6. Approximation (approximation / fitting / estimation / geometric problems).
7. Unconstrained optimization.
   - Descent methods.
   - Gradient descent.
   - Steepest descent method.
   - Newton's method.
8. Equality constrained optimization.
   - Equality constrained Newton's method.
   - Infeasible start Newton's method.
9. Inequality constrained optimization.
   - Barrier method.

## Mathematical Optimization

A **mathematical optimization problem** (a.k.a.
**optimization problem**) has the form

$$ \begin{align*}
\min \quad & f_0(x) \\
\mathrm{s.t.} \quad & f_i(x) \le b_i \quad i=1,\dots,m
\end{align*} $$

where

- $x$ is the **optimization variable**,
- $f_0$ is the **objective function**,
- $f_i$'s are the **constraint functions**.

## Linear Regression and Least-Squares

### Linear Regression

- Dataset $\mathcal{D} = \{ (x_i, y_i) \}_{i=1}^n$ where $x_i \in \mathbb{R}$ and $y_i \in \mathbb{R}$.
- Objective (intuitively): Find a line $f_{\theta}(x) = ax + b$ that "best" fits the dataset.
- Let $h_i = f_{\theta}(x_i)$
- The term "best" is measured by a **loss function**
  - e.g. Mean Square Error $L = \sum_i (h_i - y_i)^2$

$$ \min_{a,b} \sum_{i=1}^{n} (h_i - y_i)^2 = \sum_{i=1}^{n} (ax_i + b - y_i)^2 $$

#### Solving Linear Regression

1. Closed-form solution
   - $ \partial L / \partial a = 0 $
   - $ \partial L / \partial b = 0 $
2. Descent methods (e.g. Gradient Descent)
   - $a_{t+1} = a_t - \eta \frac{\partial L}{\partial a}$
   - $b_{t+1} = b_t - \eta \frac{\partial L}{\partial b}$
3. Least Squares
   - *"When you see an equals sign, replace it with a minus sign and wrap a square around the whole thing."*

#### Least-Squares

$$ \begin{bmatrix}
y_1\\ \vdots \\ y_n
\end{bmatrix} = \begin{bmatrix}
x_1 & 1 \\
\vdots & \vdots \\
x_n & 1
\end{bmatrix} \begin{bmatrix} a \\ b \end{bmatrix} $$

$$ \Rightarrow Y = X \beta $$

## Linear Programming

In a **linear programming** problem, the objective and all constraint functions are *linear*.

$$ \begin{align*}
\min \quad & c^Tx \\
\mathrm{s.t.} \quad &a_i^Tx \le b_i
\end{align*} $$

### Chebyshev Approximation

> Whac-A-Mole

## Convex Optimization

> Why study convex optimization? Because it's easy.
> Why not study non-convex optimization? Because it's hard.
> Then how is non-convex optimization done? Haphazardly.

A **convex optimization** problem is one of the form

$$ \begin{align*}
\min \quad & f_0(x) \\
\mathrm{s.t.} \quad & f_i(x) \le b_i \quad i=1,\dots,m
\end{align*} $$

where the functions $f_0, \dots, f_m$ are *convex functions*.

- A local optimum of a convex optimization problem is also a global optimum.

--------------------------------------------------------------------------------
/Convex Optimization/06.ApproximationAndFitting.md:
--------------------------------------------------------------------------------

# Approximation and Fitting

## Norm Approximation

### Norms

A function $f: \mathbb{R}^n \mapsto \mathbb{R}$ is called a **norm** if

- $f$ is non-negative: $f(x) \ge 0$ for all $x \in \mathbb{R}^n$
- $f$ is definite: $f(x) = 0$ iff $x=0$
- $f$ is homogeneous: $f(tx) = |t|f(x)$ for all $x \in \mathbb{R}^n$ and $t \in \mathbb{R}$
- $f$ satisfies the triangle inequality: $f(x+y) \le f(x) + f(y)$ for all $x, y \in \mathbb{R}^n $

#### Lp-Norm

$$ \| v \|_p = \left( \sum_{i=1}^n |v_i|^p \right)^{1/p} $$

##### L2-Norm (Euclidean Distance)

$$ \| v \|_2 = \left( \sum_{i=1}^n v_i^2 \right)^{1/2} $$

##### L1-Norm (Manhattan Distance)

$$ \| v \|_1 = \sum_{i=1}^n |v_i| $$

##### L-$\infty$ Norm

$$ \|v\|_{\infty} = \max_{i} |v_i| $$

##### L0-Norm

$$ \|v\|_0 = \#\{\text{non-zero elements}\} $$

- **Note.** The $L_0$-norm is not a norm. It is not homogeneous.
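A quick numerical sanity check of the norms above; a minimal NumPy sketch (the vector `v` is a made-up example):

```python
import numpy as np

v = np.array([3.0, -4.0, 0.0])

print(np.sum(np.abs(v)))      # L1 norm: 7.0
print(np.sqrt(np.sum(v**2)))  # L2 norm: 5.0
print(np.max(np.abs(v)))      # L-infinity norm: 4.0
print(np.count_nonzero(v))    # "L0 norm": 2

# Why the L0 "norm" is not homogeneous: scaling v by t != 0
# should scale a norm by |t|, but the nonzero count never changes.
assert np.count_nonzero(10 * v) == np.count_nonzero(v)
```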
### Norm Approximation

$$ \min \| Ax - b \| $$

- $A \in \mathbb{R}^{m \times n}$ with $m \ge n$
- $b \in \mathbb{R}^m$
- $\|\cdot\|$ is a norm on $\mathbb{R}^m$

#### Interpretations of Norm Approximation

##### Estimation Interpretation

Consider a linear measurement model

$$ y = Ax + v $$

where $v$ is some measurement error that is unknown, but presumed to be small and to follow a certain distribution (typically a Gaussian distribution)

##### Geometric Interpretation

Consider a subspace $\mathcal{A} = \mathcal{R}(A) \subseteq \mathbb{R}^m$ and a point $b \in \mathbb{R}^m$. A **projection** of $b$ onto $\mathcal{A}$ in the norm $\|\cdot\|$ is any point in $\mathcal{A}$ that is closest to $b$.

Parametrizing the points in $\mathcal{A}$ by $u = Ax$, solving the norm approximation problem is equivalent to computing a projection of $b$ onto $\mathcal{A}$.

##### Design Interpretation

- $x_1,\dots,x_n$ are design variables.
- $y = Ax$ gives a vector of $m$ results.
- $b$ is a vector of targets or desired results.
- The goal is to choose a vector of design variables that achieves the desired results as closely as possible.

#### Examples

##### Least-Squares Approximation

$$ \min \| Ax - b \|^2 $$

- $f(x) = x^TA^TAx - 2b^TAx +b^Tb$
- $\nabla f(x) = 2A^TAx - 2A^Tb = 0$
- $ x = (A^TA)^{-1} A^Tb $

##### Chebyshev / Minimax Approximation

$$ \min \| Ax - b \|_{\infty} $$

The Chebyshev approximation problem can be cast as an LP.

$$ \begin{align*}
\min \quad & t\\
\mathrm{s.t.} \quad & -t\bm{1} \preceq Ax - b \preceq t\bm{1}
\end{align*} $$

##### Sum of Absolute Residuals Approximation

$$ \min \| Ax - b \|_1 $$

Like the Chebyshev approximation, the $L_1$-norm approximation can be cast as an LP.

$$ \begin{align*}
\min \quad & \bm{1}^T t\\
\mathrm{s.t.} \quad & -t \preceq Ax - b \preceq t
\end{align*} $$

## Penalty Function Approximation

**Penalty function approximation** is a generalization of the $L_p$-norm approximation, where the objective depends only on the amplitude distribution of the residuals.

$$ \begin{align*}
\min \quad & \phi(r_1) + \cdots + \phi(r_m) \\
\mathrm{s.t.} \quad & r = Ax - b
\end{align*} $$

where $\phi: \mathbb{R} \mapsto \mathbb{R}$ is called the **penalty function**. We assume $\phi$ is convex.

#### Common Penalty Functions

- $\phi(u) = |u|^p$ with $p \ge 1$. Equivalent to norm approximation.
- **Deadzone-Linear.**
  $$ \phi(u) = \begin{cases}
  0 \quad & |u| \le a \\
  |u| - a \quad & |u| > a
  \end{cases} $$ The deadzone-linear penalty assesses no penalty for residuals smaller than $a$.
- **Log Barrier.** Log barrier with limit $a > 0$
  $$ \phi(u) = \begin{cases}
  -a^2\log(1-(u/a)^2) & \quad |u| < a \\
  \infty & \quad |u| \ge a
  \end{cases} $$
- **Huber.** Huber penalty with parameter $M > 0$
  $$ \phi(u) = \begin{cases}
  u^2 & \quad |u| \le M \\
  M(2|u| - M) & \quad |u| > M
  \end{cases} $$ The Huber penalty is more robust to outliers.

## Over-parametrization

Consider the case where $m < n$

### Least-Norm Problems

$$ \begin{align*}
\min &\quad \| x \| \\
\mathrm{s.t.} &\quad Ax = b
\end{align*} $$

The least-norm problem is only interesting when $m < n$.
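For the $L_2$ norm, the least-norm problem has a closed-form solution: if $A$ has full row rank, the minimum-norm $x$ satisfying $Ax=b$ is $x = A^T(AA^T)^{-1}b$, which coincides with the pseudo-inverse solution. A minimal NumPy sketch (the random $A$ and $b$ are made up):

```python
import numpy as np

rng = np.random.default_rng(0)
m, n = 3, 8                    # m < n: underdetermined system
A = rng.normal(size=(m, n))    # assumed full row rank (true almost surely here)
b = rng.normal(size=m)

# Minimum L2-norm solution of Ax = b
x = A.T @ np.linalg.solve(A @ A.T, b)

assert np.allclose(A @ x, b)                  # feasibility
assert np.allclose(x, np.linalg.pinv(A) @ b)  # equals the pseudo-inverse solution
```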
## Regularized Approximation

The goal is to find a vector $x$ that is small and that also makes $Ax - b$ small. This is naturally described as a convex optimization problem with two objectives

$$ \min (\mathrm{w.r.t.}\ \mathbb{R}_{+}^{2}) \quad (\|Ax-b\|, \|x\|) $$

### Regularization

**Regularization** is a common scalarization method used to solve the bi-criterion problem.

One form of regularization is to minimize the weighted sum

$$ \min \quad \| Ax - b \| + \gamma \| x\| $$

where $\gamma > 0$ is a parameter.

#### Tikhonov Regularization

Another common method is to minimize the weighted sum of squared norms

$$ \min \quad \| Ax - b \|^2 + \delta \| x \|^2 $$

This problem has an analytical solution

$$ x = (A^TA + \delta I)^{-1}A^Tb $$

Since $(A^TA + \delta I) \succ 0$ for any $\delta > 0$, Tikhonov regularization requires no rank (or dimension) assumptions on $A$.

--------------------------------------------------------------------------------
/Convex Optimization/07.StatisticalEstimation.md:
--------------------------------------------------------------------------------

# Statistical Estimation

## Parametric Distribution Estimation

### Maximum Likelihood Estimation

Consider a family of probability distributions on $\mathbb{R}^m$, indexed by a vector $x \in \mathbb{R}^n$, i.e., each $x$ corresponds to a distribution $p_x(\cdot)$.

The function $p_x(y)$ is called the **likelihood function**.

However, it is more convenient to work with the **log-likelihood**,

$$ l(x) = \log p_x(y) $$

To estimate the value of the parameter $x$, based on observing one sample $y$ from the distribution, a widely used method is **maximum-likelihood estimation (MLE)**.

$$ \hat{x}_{ml} = \arg\max_x p_x(y) = \arg\max_x l(x) $$

#### Linear Measurements with IID Noise

Consider a linear measurement model

$$ y_i = a_i^T x + v_i $$

where $x \in \mathbb{R}^n$ is the vector of parameters to be estimated, $y_i \in \mathbb{R}$ are the measured quantities, and $v_i$ are iid measurement noises, with density $p$ on $\mathbb{R}$.

The likelihood function is

$$ p_x(y) = \prod_{i=1}^m p(y_i - a_i^Tx) $$

And the log-likelihood is

$$ l(x) = \log p_x(y) = \sum_{i=1}^m \log p(y_i - a_i^Tx) $$

So the goal of MLE is

$$ \max_x \log p_x(y) $$

If the density $p$ is log-concave, then this optimization problem is convex.

##### Gaussian Noise

If $v_i$ follows a Gaussian distribution with zero mean and variance $\sigma^2$,

$$ p(v) = \frac{1}{\sqrt{2\pi\sigma^2}}\exp\left( -\frac{1}{2\sigma^2}v^2 \right) $$

The log-likelihood function is

$$ l(x) = -(m/2)\log(2\pi\sigma^2) - \frac{1}{2\sigma^2}\| Ax-y \|_2^2 $$

Therefore maximizing $l(x)$ is equivalent to minimizing

$$ \min_x \|Ax - y\|_2^2, $$

which is the solution of a least-squares problem.

##### Laplacian Noise

If $v_i$ follows a Laplacian distribution

$$ p(z) = \frac{1}{2a}\exp\left( -\frac{1}{a}|z| \right), \quad a > 0$$

maximizing $l(x)$ is equivalent to minimizing

$$ \min_x \| Ax - y \|_1, $$

which is the solution of an $L_1$-norm approximation problem.
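To see the Gaussian/Laplacian correspondence numerically, here is a sketch assuming `cvxpy` is available (all data is synthetic): the $L_1$ fit is the MLE under Laplacian noise, while plain least squares is the MLE under Gaussian noise.

```python
import cvxpy as cp
import numpy as np

rng = np.random.default_rng(0)
m, n = 200, 3
A = rng.normal(size=(m, n))
x_true = np.array([1.0, -2.0, 0.5])
y = A @ x_true + rng.laplace(scale=0.3, size=m)  # Laplacian measurement noise

# MLE under Gaussian noise <=> least squares
x_ls, *_ = np.linalg.lstsq(A, y, rcond=None)

# MLE under Laplacian noise <=> L1-norm approximation (an LP)
x = cp.Variable(n)
cp.Problem(cp.Minimize(cp.norm(A @ x - y, 1))).solve()

print("least squares:", x_ls)
print("L1 / Laplacian MLE:", x.value)
```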
##### Uniform Noise

If $v_i \sim \mathcal{U}(-a, a)$, an ML estimate is any $x$ satisfying

$$ \| Ax - y \|_{\infty} \le a $$

#### Logistic Regression

Consider a random variable $y \in \{ 0, 1 \}$ with

$$ \mathrm{Pr}[y = 1] = p, \quad \mathrm{Pr}[y=0] = (1-p) $$

The **logistic model** has the form

$$ p = \frac{\exp(a^Tx + b)}{1 + \exp(a^Tx + b)}, $$

and we use $p$ to model the probability that the input $x$ has output $y = 1$.

We can re-order the observations so that for a first group $x_1,\dots,x_q$, the outcome is $y=1$, and for $x_{q+1},\dots,x_m$, the outcome is $y=0$.

$$ p(a, b) = \prod_{i=1}^q p_i \cdot \prod_{i=q+1}^m (1 - p_i). $$

The above equation can be rewritten as

$$ p(a, b) = \prod_{i=1}^m p_i^{y_i} (1-p_i)^{1-y_i} $$

The log-likelihood function is then

$$ l(a, b) = \sum_{i=1}^m y_i \log p_i + (1-y_i) \log(1-p_i) $$

!!!note ""
    The log-likelihood function for logistic regression is concave. So the MLE problem can be converted into a convex optimization problem.

### Maximum a Posteriori Probability Estimation

**Maximum a posteriori (MAP)** estimation can be viewed as a Bayesian version of maximum likelihood estimation.

!!!note Recap. The Bayes Theorem
    $$ p(y|x) = \frac{p(x, y)}{p(x)} = \frac{p(x|y)p(y)}{p(x)} $$

We assume the parameter $x$ and the observation $y$ are random variables with a joint probability $p(x,y)$. This is in contrast to the statistical estimation setup, where $x$ is a *parameter*, not a *random variable*.

In the MAP setup, we have $p(y|x)$, and we want to maximize $p(x|y)$. By the Bayes Theorem,

$$ \log p(x|y) = \log p(y|x) + \log p(x) - \log p(y) $$

Maximizing $\log p(x|y)$ is therefore equivalent to

$$ \max_x \log p(x| y) \sim \max_x \log p(y|x) + \log p(x) $$

Ignoring the philosophical differences between MAP and MLE, the only difference between MAP and MLE is the additional prior term $\log p(x)$.

## Non-parametric Distribution Estimation

Consider a random variable $X$ taking values from a finite set $\{ a_1,\dots,a_n \}$. The distribution of $X$ is characterized by $p \in \mathbb{R}^n$.

Note that each $p \in \mathbb{R}^n$ with $p_i \ge 0, \sum_i p_i = 1$ defines a probability distribution for $X$.

We aim to estimate the distribution of $p$, i.e., the *distribution of a distribution*.

--------------------------------------------------------------------------------
/Convex Optimization/08.GeometricProblems.md:
--------------------------------------------------------------------------------

# Geometric Problems

## Projection on a Set

### Projecting a Point on a Set

**Distance.** The **distance** of a point $x_0 \in \mathbb{R}^n$ to a closed set $C \subseteq \mathbb{R}^n$ in the norm $\| \cdot \|$ is defined as

$$ \mathrm{dist}(x_0, C) = \inf\{ \|x_0 - y\| \mid y \in C \} $$

**Projection.** We refer to any point $z \in C$ which is closest to $x_0$ ($\|z-x_0\| = \mathrm{dist}(x_0, C)$) as a **projection** of $x_0$ on $C$.

We use the notation $P_C: \mathbb{R}^n \mapsto \mathbb{R}^n$ to denote any function for which $P_C(x_0)$ is a projection of $x_0$ on $C$.
$$ P_C(x_0) \in C, \quad \| x_0 - P_C(x_0) \| = \mathrm{dist}(x_0, C) $$

$P_C$ is referred to as the **projection on $C$**.

#### Examples

### Projecting a Point on a Convex Set

We represent a convex set $C$ by a set of linear equalities and convex inequalities.

$$ C = \{ x \in \mathbb{R}^n \mid Ax = b, f_i(x) \le 0 \} $$

The projection can be found by solving

$$ \begin{align*}
\min &\quad \| x - x_0 \|\\
\mathrm{s.t.} &\quad f_i(x) \le 0 \\
&\quad Ax = b
\end{align*} $$

- The problem is feasible iff $C$ is nonempty.
- When it is feasible, its optimal value is $p^* = \mathrm{dist}(x_0, C)$, and any optimal point is a projection of $x_0$ on $C$.

#### Projection on a Polyhedron

The projection of $x_0$ on a polyhedron described by $Ax \preceq b$ can be computed by solving the QP

$$ \begin{align*}
\min &\quad \| x - x_0 \|_2^2\\
\mathrm{s.t.} &\quad Ax \preceq b
\end{align*} $$

Some special cases have simple analytical solutions.

**Projection on a Hyperplane.** $C = \{ x \mid a^Tx = b \}$.

$$ P_C(x_0) = x_0 + (b - a^Tx_0) a /\|a\|^2 $$

**Projection on a Halfspace.** $C = \{ x \mid a^Tx \le b \}$.

$$ P_C(x_0) = \begin{cases}
x_0 + (b-a^Tx_0)a/\|a\|^2&\quad a^Tx_0 > b\\
x_0 &\quad a^Tx_0 \le b
\end{cases} $$

(A small numerical check of these two formulas appears after the following definitions.)

### Separating a Point and a Convex Set

## Distance between Sets

**Distance between Two Sets.** The distance between two sets $C, D$ in a norm $\|\cdot\|$ is defined as

$$ \mathrm{dist}(C, D) = \inf \{ \| x - y \| \mid x \in C, y \in D \} $$

- The two sets $C$ and $D$ do not intersect if $\mathrm{dist}(C, D) > 0$.
- They intersect if $\mathrm{dist}(C,D) = 0$ and the infimum is attained.
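As promised above, a minimal NumPy check of the hyperplane and halfspace projection formulas ($a$, $b$, $x_0$ are made-up random data):

```python
import numpy as np

rng = np.random.default_rng(0)
a = rng.normal(size=5)
b = 1.7
x0 = rng.normal(size=5)

# Projection on the hyperplane {x | a^T x = b}
p_hyp = x0 + (b - a @ x0) * a / np.sum(a**2)
assert np.isclose(a @ p_hyp, b)

# Projection on the halfspace {x | a^T x <= b}
p_half = x0 if a @ x0 <= b else x0 + (b - a @ x0) * a / np.sum(a**2)
assert a @ p_half <= b + 1e-12
```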
The distance between $C$ and $D$ can be expressed in terms of the distance between a point and a set

$$ \mathrm{dist}(C, D) = \mathrm{dist}(0, D - C) $$

### Computing the Distance between Convex Sets

Assume $C$ and $D$ are described by two sets of convex inequalities

$$ C = \{ x \mid f_i(x) \le 0 \}, \quad D = \{ x \mid g_i(x) \le 0 \} $$

We find the distance by solving

$$\begin{align*}
\min &\quad \| x - y \|\\
\mathrm{s.t.} &\quad f_i(x) \le 0 \\
&\quad g_i(y) \le 0
\end{align*}$$

or equivalently

$$\begin{align*}
\min &\quad \| w \|\\
\mathrm{s.t.} &\quad f_i(x) \le 0 \\
&\quad g_i(y) \le 0\\
&\quad x - y = w
\end{align*}$$

The Lagrangian is

$$ \mathcal{L}(x, y, w, \lambda, z, \mu) = \| w \| + \sum_i \lambda_i f_i(x) + \sum_i \mu_i g_i(y) + z^T(x-y-w) $$

and the dual function is

$$ \begin{align*}
g(\lambda, z, \mu) &= \inf_{x, y, w} \{ \| w \| + \sum_i \lambda_i f_i(x) + \sum_i \mu_i g_i(y) + z^T(x-y-w) \} \\
&= \inf_x \{ \sum_i \lambda_i f_i(x) + z^Tx \} + \inf_y \{ \sum_i \mu_i g_i(y) - z^T y \} + \inf_w\{ \|w\| - z^Tw \}\\
&=\begin{cases}
\inf_x \{ \sum_i \lambda_i f_i(x) + z^Tx \} + \inf_y \{ \sum_i \mu_i g_i(y) - z^T y \}, &\quad \|z\|_* \le 1\\
-\infty, &\quad \|z\|_* > 1
\end{cases}
\end{align*} $$

Therefore the dual problem is formulated as

$$ \begin{align*}
\max_{\lambda, \mu, z} &\quad \inf_x \{ \sum_i \lambda_i f_i(x) + z^Tx \} + \inf_y \{ \sum_i \mu_i g_i(y) - z^T y \}\\
&\quad \mathrm{s.t.}\ \|z\|_* \le 1, \lambda \ge 0, \mu \ge 0
\end{align*} $$

## Centering

### Chebyshev Center

**Depth.** Let $C \subseteq \mathbb{R}^n$ be bounded with nonempty interior, and $x \in C$. The **depth** of a point $x \in C$ is defined as

$$ \mathrm{depth}(x, C) = \mathrm{dist}(x, \mathbb{R}^n \backslash C) $$

**Chebyshev Center.** The **Chebyshev center** of the set $C$ is defined as any point of maximum depth in $C$.

$$ x_{cheb}(C) = \argmax \mathrm{depth}(x, C) = \argmax\mathrm{dist}(x, \mathbb{R}^n \backslash C) $$

- The Chebyshev center is a point inside $C$ that is farthest from the exterior of $C$.
- It is also the center of the largest ball that lies inside $C$.

#### Chebyshev Center of a Convex Set

Consider a convex set defined by convex inequalities

$$ C = \{ x \mid f_i(x) \le 0, i =1,\dots,m \} $$

We can find the Chebyshev center by solving

$$\begin{align*}
\max &\quad R\\
\mathrm{s.t.} &\quad g_i(x, R) \le 0
\end{align*}$$

where $g_i(x, R) = \sup_{\|u\|\le 1} f_i(x + Ru)$

--------------------------------------------------------------------------------
/Data Mining/03.Clustering.md:
--------------------------------------------------------------------------------

# Clustering

!!! cite Curse of Dimensionality
    In high dimensions, almost all points have basically the same distance.
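A small simulation of this effect (sample sizes are arbitrary): draw random points in the unit cube and watch the relative spread of distances from a reference point shrink as the dimension grows.

```python
import numpy as np

rng = np.random.default_rng(0)
for d in (2, 10, 100, 1000):
    X = rng.random((1000, d))                     # uniform points in the unit cube
    dists = np.linalg.norm(X[1:] - X[0], axis=1)  # distances to the first point
    spread = (dists.max() - dists.min()) / dists.min()
    print(f"d={d:4d}  (max - min) / min distance: {spread:.2f}")
```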
## Recap of Previously Mentioned Clustering Methods

- Hierarchical
  - Agglomerative methods
  - Divisive methods
- Point Assignment
  - K-means
  - EM-based methods

## Hierarchical Clustering

> Key Operation: Repeatedly combine the two nearest clusters

- For Euclidean distances
  - Represent a cluster by its centroid
  - Measure distances of clusters by distances of centroids

### Clustroids

- For non-Euclidean cases
  - Choose the 'clustroid' to be the point that is closest to the other points
  - By 'closest' we mean
    - Smallest maximum/average/sum distance
    - e.g. $\min_{c\in C} \sum_{x\in C}d(x,c)^2$
- Note that a clustroid is selected from the dataset, while a centroid may not exist in the dataset

### Distance between Clusters

#### Intercluster Distance

Minimum distance between any pair of points from the two clusters

#### Cohesion

Pick a notion of cohesion of clusters

- radius: maximum distance from the clustroid

Merge the clusters whose union is the most cohesive, measured e.g. by

- the diameter of the merged cluster

### Stopping Criterion

1. Stop when we have $k$ clusters
2. Stop when the next merge creates a bad cluster with low cohesion
3. Stop when the average diameter takes a sudden jump

### Implementation Considerations

- The naive approach is $O(N^3)$
- An implementation with a priority queue reduces the time to $O(N^2\log{N})$
  - Still expensive
- Hence the BFR algorithm

## The BFR Algorithm

> A variant of K-means designed to handle very large datasets
>
> Assumes clusters are normally distributed around a centroid in Euclidean space

- Points are read from disk
  - Each time we load as many points as possible into memory
  - Most points from previous memory loads are summarized by simple statistics
- Initialize with $k$ points
  - Take random points
  - Take a small sample and cluster
  - Take a point randomly, and choose the remaining $k-1$ points such that each is as far from the previous points as possible

### Three Classes of Points

- Discard Set
  - Points close enough to existing centroids to be summarized
- Compression Set
  - Points that are close enough to each other, but not close to any existing centroid
- Retained Set
  - Isolated points waiting to be assigned to a compression set

### Summarizing Sets of Points

#### Discard Set

- Number of points $N$
- Vector $SUM$
  - The $i$th component is the sum of the coordinates of the points in the $i$th dimension
- Vector $SUMSQ$
  - The $i$th component is the sum of squares of the coordinates in the $i$th dimension
- A total of $2d+1$ values
- The mean can be computed as $SUM_i/N$
- The variance can be computed as $SUMSQ_i/N - (SUM_i/N)^2$

#### Updating Sets

1. Find points that are sufficiently close to a cluster centroid, and add those points to the cluster and the DS
2. Cluster the remaining points together with the old RS
3. Adjust the statistics of the updated DS sets
4. Consider merging CS sets
5.
If this is the final iteration, merge all CS and RS points into the nearest cluster

### Implementation Details

#### Deciding 'Close Enough'

Two ways

- High likelihood
- Mahalanobis distance less than a threshold

$$ d(x,c) = \sqrt{\sum_{i=1}^d\left( \frac{x_i-c_i}{\sigma_i} \right)^2} $$

- $c_i$: centroid

#### Merging Compressed Sets

- Compute the variance of the combined sub-cluster
- Merge CS sets if the combined variance is less than a threshold

--------------------------------------------------------------------------------
/Data Mining/04.DimensionalityReduction.md:
--------------------------------------------------------------------------------

# Dimensionality Reduction

## Singular Value Decomposition

Gives dense $U$, $V$ and sparse $\Sigma$

### Motivating Example

Consider a point cloud in 3D space, represented by a matrix where each row is a point

$$ \begin{bmatrix}
1 & 2 & 1\\
-2 & -3 & 1\\
3 & 5 & 0
\end{bmatrix} $$

Notice that the rank is $2$, so $A,B,C$ are linearly dependent, and we can change the basis to $[1,2,1]$ and $[-2,-3,1]$. In this way $A'=[1,0]$, $B'=[0,1]$ and $C'=[1,-1]$

### Formulation of SVD

$$ A_{m\times n} = U_{m \times r}\Sigma_{r\times r}V^T_{n\times r} = \sum_i \sigma_i u_i \circ v_i^T $$

- $A$: Input matrix of $m \times n$
  - e.g. $m$ documents, $n$ terms
- $U$: Left singular vectors, $m\times r$
  - e.g. $m$ documents, $r$ concepts
  - $U^TU=I$
- $\Sigma$: Diagonal matrix whose diagonal elements are the singular values
  - e.g. strength of each 'concept'
  - $r$: rank of $A$
- $V$: Right singular vectors
  - e.g. $n$ terms, $r$ concepts
  - $V^TV=I$
- Usually $r \ll n$

### SVD as Dimension Reduction

- Similar to PCA
  - Sort the singular values in descending order
  - Set the smallest singular values to zero
- SVD minimizes the reconstruction error in terms of the Frobenius norm
- Usually keeps 80%-90% of the energy

### Computing SVD

- $A^TA=V\Sigma^2V^T$
- $AA^T=U\Sigma^2U^T$
- We can reduce solving for the SVD to solving for an EVD

#### Iterative Numerical Method for Eigenvalues

> The Power Iteration

- Let $M = AA^T$

1. Start with any guess $x_0$
2. $x_{k+1} = \frac{Mx_k}{\|Mx_k\|_F}$
3. Repeat until $x_{k}$ changes little
4. $x_k$ converges to an eigenvector
5. $M' = M-\lambda xx^T$
6. Compute the next vector using $M'$

- Complexity: $O(nm^2)$ or $O(n^2m)$

### SVD and PCA

- The same if the matrix is zero-centered

## CUR Decomposition

Express $A$ as a product of matrices $C,U,R$ s.t. $\|A - CUR\|_F$ is small. Gives sparse $C$ and $R$ and dense $U$.

- $C$ is sampled randomly from the columns of $A$
  - Samples can be repeated
- $R$ is sampled randomly from the rows of $A$
- $U$ is the pseudo-inverse of the intersection of $C$ and $R$

### Sampling Rows and Columns

- The sampling probability is based on the values
  - e.g.
64 | 65 | ### SVD and PCA 66 | 67 | - The same if the matrix is zero-centered 68 | 69 | ## CUR Decomposition 70 | 71 | Express $A$ as a product of matrices $C,U,R$ s.t. $\|A - CUR\|_F$ is small. Gives sparse $C$ and $R$ and dense $U$ 72 | 73 | - $C$ is sampled randomly from the columns of $A$ 74 | - Samples can be repeated 75 | - $R$ is sampled randomly from the rows of $A$ 76 | - $U$ is the pseudo-inverse of the intersection of $C$ and $R$ 77 | 78 | ### Sampling Rows and Columns 79 | 80 | - Probability is based on values 81 | - e.g. for columns 82 | - $P(x) = \sum_i A(i,x)^2 / \sum_i\sum_j A(i,j)^2$ 83 | 84 | ### Computing $U$ 85 | 86 | - Let $W$ be the matrix formed by the intersection of $C$ and $R$ 87 | - Let $XZY^T$ be the SVD of $W$ 88 | - Then $U = YZ^+X^T$, where $Z^+$ takes the reciprocals of the non-zero singular values: $Z^+_{ii} = 1/Z_{ii}$ 89 | 90 | ## Factor Analysis 91 | -------------------------------------------------------------------------------- /Data Mining/07.LinkPrediction.md: -------------------------------------------------------------------------------- 1 | # Link Prediction 2 | 3 | ## Feature Learning in Graphs 4 | 5 | ### Network Embedding 6 | 7 | The task is to map each node in a network to a point in a low-dimensional space 8 | 9 | - Similarity in the embedding indicates similarity in structure 10 | 11 | #### Difficulties 12 | 13 | - Complex topological structure 14 | - No fixed node ordering 15 | - Dynamic and have multimodal features 16 | 17 | #### Node Embedding 18 | 19 | - The goal is to construct an encoding module for node embedding 20 | - So that similarity in the embedded space approximates similarity in the original graph 21 | 22 | $$ Sim(u,v) \approx \mathbf{z}_u^T\mathbf{z}_v $$ 23 | 24 | - To do this, we need 25 | - A node similarity measurement $Sim$ 26 | - And of course, an encoder 27 | 28 | #### Shallow Embedding 29 | 30 | $$ Encoder(v) = \mathbf{Z}\mathbf{v} $$ 31 | 32 | - An embedding-lookup 33 | - Widely applied in many embedding models 34 | - $\mathbf{Z}$ is the embedding matrix to be learned 35 | - Each column is the embedding of a certain node 36 | - $\mathbf{v}$ is a one-hot vector 37 | 38 | ## DeepWalk 39 | 40 | Let $\mathbf{z}_u^T\mathbf{z}_v$ approximate the probability that nodes $u$ and $v$ co-occur in a random walk on the network 41 | 42 | - Expressivity 43 | - Flexible stochastic definition that incorporates both local and higher-order neighbourhood information 44 | - Efficiency 45 | - No need to consider all pairs of nodes; only a sequence of random walks is needed 46 | 47 | ### Neighbourhood Definition 48 | 49 | Let $N_R(u)$ be the neighbourhood of node $u$ obtained by some strategy $R$ 50 | 51 | ### Objective 52 | 53 | $$ \max_z \sum_{u\in V}\log P(N_R(u)|z_u) $$ 54 | 55 | - $N_R(u)$ is the neighbourhood of $u$ 56 | 57 | Assume the conditional likelihood factorizes over the set of neighbours, i.e. nodes are independent 58 | 59 | $$ \log P(N_R(u)|z_u) = \sum_{v\in N_R(u)} \log P(z_v|z_u) $$ 60 | 61 | We estimate $P(z_v|z_u)$ by a Softmax 62 | 63 | $$ P(z_v|z_u) = \frac{\exp(z_v\cdot z_u)}{\sum_{n\in V}\exp(z_n\cdot z_u)} $$ 64 | 65 | Therefore we minimize the negative log likelihood 66 | 67 | $$ L = \sum_{u\in V}\sum_{v\in N_R(u)} -\log\left( \frac{\exp(\mathbf{z}_v^T\mathbf{z}_u)}{\sum_{n\in V}\exp(\mathbf{z}_n^T\mathbf{z}_u)} \right) $$ 68 | 69 | Unfortunately the computational overhead is large, because the Softmax function iterates over the entire vertex set and computes exponentials 70 | 71 | #### Negative Sampling Approximation 72 | 73 | Simplify the problem into a binary classification. 74 | 75 | - Maximize the probability that a node co-occurs with its neighbouring nodes 76 | - Minimize the probability that the node co-occurs with some randomly chosen nodes 77 | 78 | Approximate the log softmax by log sigmoids.
79 | 80 | $$ \log\left( \frac{\exp(\mathbf{z}_v^T\mathbf{z}_u)}{\sum_{n\in V}\exp(\mathbf{z}_n^T\mathbf{z}_u)} \right) \approx \log(\sigma(\mathbf{z}_u^T\mathbf{z}_v)) - \sum_{i=1}^k \log(\sigma(\mathbf{z}_u^T\mathbf{z}_{n_i})) $$ 81 | 82 | - where $n_i$ is randomly sampled from the entire vertex set 83 | - Negative nodes are usually sampled with probability proportional to their degree 84 | - A higher $k$ gives a more robust estimate 85 | - Empirical values of $k$ are between 5 and 20 86 | 87 | ### Algorithm 88 | 89 | 1. Run short fixed-length random walks starting from each node using strategy $R$ (typically uniform at random) 90 | 2. For each node $u$ collect $N_R(u)$, the multiset of nodes visited on random walks starting from $u$ 91 | - $N_R(u)$ can have repeated elements because nodes can be visited multiple times on random walks 92 | 3. Optimize the objective with gradient descent 93 | 94 | ## Node2Vec 95 | 96 | Uses flexible, biased random walks that can trade off between local and global views of the network 97 | 98 | ### Interpolation of BFS and DFS 99 | 100 | - BFS characterizes the micro view of the neighbourhood 101 | - DFS characterizes the macro view of the neighbourhood 102 | 103 | Node2Vec implements a biased fixed-length random walk 104 | 105 | - Return parameter $p$ 106 | - Return back to the previous node 107 | - In-out parameter $q$ 108 | - Moving outwards (DFS) or inwards (BFS) 109 | - Intuitively, $q$ is the ratio of BFS-like vs DFS-like behaviour 110 | 111 | Consider the unnormalized probabilities 112 | 113 | - With probability $1/p$, it returns to the previous node 114 | - With probability $1/q$, it moves further away from the previous node 115 | - With probability $1$, it moves to a node at the same distance from the previous node 116 | 117 | A BFS-like walk has low $p$, and a DFS-like walk has low $q$ 118 | 119 | ### Algorithm 120 | 121 | 1. Compute the random-walk probabilities 122 | 2. Simulate $r$ random walks of length $l$ starting from each node $u$ 123 | 3. Optimize the objective function 124 | 125 | - Linear time complexity 126 | - All 3 steps are individually parallelizable
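As a small illustration of the unnormalized weights in step 1 (illustrative Python, not from the lecture; `adj` is a toy adjacency map):

```python
def step_weights(adj, prev, curr, p, q):
    """Unnormalized node2vec weights for moving on from `curr`, having come from `prev`."""
    weights = {}
    for nxt in adj[curr]:
        if nxt == prev:
            weights[nxt] = 1.0 / p   # return to the previous node
        elif nxt in adj[prev]:
            weights[nxt] = 1.0       # same distance from the previous node
        else:
            weights[nxt] = 1.0 / q   # moving further away (DFS-like)
    return weights

adj = {0: {1, 2}, 1: {0, 2, 3}, 2: {0, 1}, 3: {1}}
print(step_weights(adj, prev=0, curr=1, p=2.0, q=0.5))
# {0: 0.5, 2: 1.0, 3: 2.0} -- normalize these to get transition probabilities
```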
127 | 128 | ## Embedding in Downstream Tasks 129 | 130 | - Usage of embeddings 131 | - Clustering or community detection 132 | - Node classification 133 | - Link prediction 134 | - Predict edge $(i,j)$ based on $f(z_i, z_j)$ 135 | - $f$ can be various operations such as 136 | - Concatenation 137 | - Hadamard (element-wise) product 138 | - Summation or average 139 | - Distance 140 | 141 | ## Graph Embedding 142 | 143 | Embedding an entire graph 144 | 145 | ### By Node Embedding 146 | 147 | - Run a standard node embedding algorithm 148 | - Sum or average the node embeddings 149 | 150 | ### Metanodes 151 | 152 | - Introduce 'virtual nodes' to represent subgraphs of the graph and run a standard node embedding algorithm 153 | -------------------------------------------------------------------------------- /Data Mining/10.DifferentialPrivacy.md: -------------------------------------------------------------------------------- 1 | # Privacy Preserving Data Mining 2 | 3 | ## Definition and Properties 4 | 5 | > Participation of a person does not change the outcome too much 6 | 7 | - Randomness 8 | - Closeness 9 | 10 | ### Terminology 11 | 12 | - Two parties are involved 13 | - A curator 14 | - A data analyst 15 | - The data analyst can be an adversary 16 | 17 | #### Curator 18 | 19 | - A trustworthy **curator** holds data of individuals in a database $D$ 20 | - Each row corresponds to an individual 21 | - Each column corresponds to an attribute 22 | - The goal is to protect every individual row while permitting statistical analysis of $D$ 23 | - Can have two types depending on the permission to interact 24 | - Non-interactive 25 | - The curator releases summary statistics, or a "sanitized database", once and for all 26 | - Interactive 27 | - Permits queries adaptively to the database 28 | 29 | #### Privacy 30 | 31 | - The data analyst knows no more about an individual after the analysis is completed than before the analysis began 32 | - The adversary's **prior view** and **posterior view** about an individual should not be too different 33 | 34 | ##### Plausible Deniability 35 | 36 | - Privacy comes from **plausible deniability** 37 | 38 | ### Randomized Algorithm 39 | 40 | #### Probability Simplex 41 | 42 | - Given a discrete set $B$, the **probability simplex** over $B$, denoted by $\Delta(B)$, is defined to be 43 | 44 | $$ \Delta(B) = \left\{ x\in\mathbb{R}^{|B|}\mid \forall i\; x_i\ge 0, \sum_{i=1}^{|B|} x_i = 1 \right\} $$ 45 | 46 | A randomized algorithm $M$ with domain $A$ and discrete range $B$ is associated with a mapping $M: A\to\Delta(B)$; on input $a$, it outputs $b \in B$ with probability $(M(a))_b$ 47 | 48 | #### Differentially Private Algorithms 49 | 50 | ##### $\epsilon$-Differential Privacy 51 | 52 | An algorithm $A$ is called an $\epsilon$-differentially private randomized algorithm if for all $D$, $D'$ that differ in one person's value 53 | 54 | $$ \sup_t \left| \log\frac{p(A(D)=t)}{p(A(D')=t)} \right| \le \epsilon $$ 55 | 56 | i.e. The max-divergence of the two distributions is at most $\epsilon$ 57 | 58 | ##### $(\epsilon,\delta)$-Differential Privacy 59 | 60 | A randomized algorithm $M$ with domain $X$ is $(\epsilon,\delta)$-differentially private if for all $O\subseteq Range(M)$ and for all $x,y \in X$ s.t. $\|x - y\|_1 \le 1$, $$ P(M(x)\in O) \le e^{\epsilon}\,P(M(y)\in O) + \delta $$ 61 | 62 | #### Properties 63 | 64 | ##### Post-Processing 65 | 66 | Let $M$ be an $(\epsilon,\delta)$-differentially private algorithm 67 | , and let $f$ be an arbitrary randomized mapping. Then $f\circ M$ is also $(\epsilon,\delta)$-differentially private
68 | 69 | ##### Composition 70 | 71 | The composition of $k$ $(\epsilon_i,\delta_i)$-differentially private mechanisms is $(\sum_i\epsilon_i,\sum_i\delta_i)$-differentially private 72 | 73 | ## Basic Mechanisms 74 | 75 | ### Randomized Response 76 | 77 | > Flip a coin. On heads, answer truthfully; on tails, flip again and answer 'yes' on heads and 'no' on tails
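A minimal sketch of this mechanism (illustrative Python). Each true answer is reported with probability $3/4$ and flipped with probability $1/4$, so the ratio of output probabilities between any two inputs is at most $3$, and the mechanism is $\ln 3$-differentially private:

```python
import random

def randomized_response(truth: bool) -> bool:
    """Flip a coin: heads -> answer truthfully; tails -> flip again, answer at random."""
    if random.random() < 0.5:
        return truth
    return random.random() < 0.5

# P(report True | truth True) = 3/4, P(report True | truth False) = 1/4,
# so sup |log ratio| = ln(3): plausible deniability with quantified privacy.
```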
-------------------------------------------------------------------------------- /Database System Concepts/01.Introduction.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | ## Database Systems 4 | 5 | - **Database**: Organized collection of inter-related data that models some aspect of the real world 6 | - **DBMS**: Database Management System. Includes a database and a set of programs to access the data 7 | - **DBA**: Database Administrator 8 | 9 | ## Database System Overview 10 | 11 | ### Outline 12 | 13 | - Data models 14 | - Database languages 15 | - Database design 16 | - Database engine 17 | 18 | ### Data Model 19 | 20 | A **data model** is a collection of concepts/tools for describing data, data relationships, data semantics and data constraints. 21 | 22 | Different data models 23 | 24 | - Relational model 25 | - Data are represented in the form of tables 26 | - Entity-Relationship model 27 | - Semi-Structured model 28 | - Object-Based data model 29 | 30 | #### Levels of Abstraction 31 | 32 | - Physical Level 33 | - Describes how a record is stored 34 | - Logical Level 35 | - Describes what data are stored in the database, and the relationships among the data 36 | - View Level 37 | - Application programs that hide details of the data 38 | 39 | ### Database Languages 40 | 41 | - Data Definition Language DDL 42 | - Specifies notation for defining the database schema 43 | - Data Manipulation Language DML 44 | - Expresses database queries and updates 45 | - Procedural DML 46 | - Specifies what data is needed and how to get the data 47 | - Declarative DML 48 | - Specifies only what data is needed 49 | 50 | DDL and DML usually form parts of a single database language 51 | 52 | #### Structured Query Language SQL 53 | 54 | ```sql 55 | select name 56 | from instructor 57 | where dept_name = 'Comp. Sci.' 58 | ``` 59 | 60 | - Non-procedural 61 | - De facto standard 62 | 63 | ### Database Design 64 | 65 | #### Logical Design 66 | 67 | - To find a good collection of relational schemas 68 | - What attributes should be recorded in the database 69 | - What relation schemas we should use, and how the attributes should be distributed 70 | 71 | ### Database Engine 72 | 73 | - Storage Manager 74 | - Stores, retrieves and updates data in the database 75 | - Query Processor 76 | - The execution engine. Compiles and optimizes queries 77 | - Transaction Manager 78 | - Logging, concurrency control and failure recovery 79 | 80 | #### Storage Manager 81 | 82 | Responsible for interaction with the file system 83 | 84 | - Authorization and integrity manager 85 | - File manager 86 | - Buffer manager 87 | 88 | Implements several data structures as part of the physical system implementation 89 | 90 | - Data files 91 | - The database itself 92 | - Data dictionary 93 | - Metadata 94 | - Indices 95 | - For fast access to items 96 | 97 | ### Query Processor 98 | 99 | - DDL Interpreter 100 | - Interprets DDL statements and records the definitions in the data dictionary 101 | - DML Compiler 102 | - Translates DML statements in a query language into an evaluation plan consisting of low-level instructions 103 | - Evaluation Engine 104 | - Executes low-level instructions generated by the DML compiler 105 | 106 | ### Transaction Manager 107 | 108 | A **transaction** is a collection of operations that performs a single logical function in a database application 109 | 110 | - e.g. Purchasing a train ticket 111 | 112 | #### Properties 113 | 114 | - Atomicity 115 | - Consistency 116 | - Isolation 117 | - Durability 118 | -------------------------------------------------------------------------------- /Digital Signal and Image Processing/AI2614-DSIP.md: -------------------------------------------------------------------------------- 1 | # AI2614 Digital Signal and Image Processing 2 | 3 | - Digital Signal Processing 4 | - [A Functional Interpretation of Signals](./DSP-FunctionalInterpretationOfSignals.md) 5 | - [Sampling and Interpolation](./DSP-SamplingAndInterpolation.md) 6 | - [Discrete Fourier Transform](./DSP-DFT.md) 7 | - [Fast Fourier Transform](./DSP-FFT.md) 8 | - [Short-Time Fourier Transform](./DSP-STFT.md) 9 | - Digital Image Processing 10 | - [Fundamentals of Digital Images](./DIP-Fundamentals.md) 11 | - [Intensity Transformation and Spatial Filtering](./DIP-IntensityTransformAndSpatialFiltering.md) 12 | - [Histogram Processing](./DIP-HistogramProcessing.md) 13 | - [Spatial Filtering](./DIP-SpatialFiltering.md) 14 | - [2D Fourier Transform](./DIP-2DFT.md) 15 | - [Image Restoration](./DIP-Restoration.md) 16 | -------------------------------------------------------------------------------- /Digital Signal and Image Processing/DIP-2DFT.md: -------------------------------------------------------------------------------- 1 | # 2D Fourier Transforms and Spectral Filtering 2 | 3 | ## Fourier Transforms and Spatial Frequencies in 2D 4 | 5 | ### Definition 6 | 7 | $$ F(u,v) = \int\int f(x,y)e^{-j2\pi (ux+vy)}\mathrm{d}x\mathrm{d}y $$ 8 | 9 | $$ f(x,y) = \int\int F(u,v)e^{j2\pi (ux+vy)}\mathrm{d}u\mathrm{d}v $$ 10 | 11 | - $u$ and $v$ are spatial frequencies. 12 | - In many cases, $f(x,y)$ can be separated into $f(x)f(y)$, and the calculation will be easier as the integral can be separated. 13 | - $F(u,v)$ is complex in general. 14 | - We decompose signals into the basis $\{\exp\left\{ -j2\pi (ux+vy) \right\}\}$ 15 | 16 | ### Important FT Pairs 17 | 18 | #### Rectangle Centered at Origin 19 | 20 | Consider a rectangle centered at the origin with length $X$ and width $Y$. 21 | 22 | $$ 23 | f(x,y) = \begin{cases} 24 | 1 & \quad |x| \le X/2, |y| \le Y/2\\ 25 | 0 & \quad o.w. 26 | \end{cases} 27 | $$ 28 | 29 | Plugging into the definition 30 | $$ F(u,v) = XY\left[\frac{\sin(\pi u X)}{\pi u X}\right]\left[\frac{\sin(\pi v Y)}{\pi v Y}\right] $$
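This pair can be checked numerically by integrating the definition directly (illustrative Python; the side lengths `X`, `Y` and the sampling grid are arbitrary choices, and `np.sinc(t)` is the normalized $\sin(\pi t)/(\pi t)$):

```python
import numpy as np

X, Y = 2.0, 1.0                        # assumed rectangle side lengths
x = np.linspace(-2, 2, 2001)
y = np.linspace(-2, 2, 2001)
f = (np.abs(x)[:, None] <= X / 2) & (np.abs(y)[None, :] <= Y / 2)

def ft2(u, v):
    # direct numerical evaluation of the 2D FT definition
    kern = np.exp(-2j * np.pi * (u * x[:, None] + v * y[None, :]))
    return np.trapz(np.trapz(f * kern, y, axis=1), x)

def analytic(u, v):
    return X * Y * np.sinc(u * X) * np.sinc(v * Y)

for (u, v) in [(0.0, 0.0), (0.3, 0.7), (1.2, -0.5)]:
    print(ft2(u, v).real, analytic(u, v))   # the two columns should agree closely
```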
31 | 32 | #### Gaussian Centered at Origin 33 | 34 | The Fourier transform of a (univariate) Gaussian is still a Gaussian. 35 | 36 | We start from the FT of the 1d Gaussian 37 | $$ f(t)=\exp\{ -\pi t^2 \} $$ 38 | 39 | By the definition of the Fourier transform 40 | $$ F(f) = \int f(t)\exp\{-j2\pi ft\}\mathrm{d}t $$ 41 | 42 | $$ 43 | \begin{align*} 44 | F(f) &= \int \exp\{ -\pi ((t+jf)^2 + f^2) \} \mathrm{d}t\\ 45 | &= \exp(-\pi f^2) \int \exp\{ -\pi(t+jf)^2 \}\mathrm{d}t 46 | \end{align*} 47 | $$ 48 | 49 | The latter integral equals $1$ (the integral of a normalized Gaussian density). Therefore 50 | $$ F(f) = \exp\{-\pi f^2\} $$ 51 | 52 | For 2d the case is similar. 53 | 54 | #### Circular Disk Centered at Origin 55 | 56 | Suppose the disk has unit height and radius $a$. 57 | 58 | $$ 59 | f(r)=\begin{cases} 60 | 1 & \quad |r| < a\\ 61 | 0 & \quad |r| \ge a 62 | \end{cases} 63 | $$ 64 | 65 | Consider $x=r\cos\theta$, $y=r\sin\theta$, $u=\rho\cos\varphi$, $v=\rho\sin\varphi$. 66 | $$ F(\rho, \varphi) = \int\int f(r,\theta)\exp(-j2\pi \rho r(\cos\theta\cos\varphi + \sin\theta\sin\varphi))\mathrm{d}r\mathrm{d}\theta $$ 67 | 68 | Therefore 69 | $$ F(\rho,\varphi) = \int_{0}^ar\mathrm{d}r\int_0^{2\pi}\exp(-j2\pi \rho r \cos(\theta-\varphi))\mathrm{d}\theta $$ 70 | 71 | The integral does not have an analytical solution. Define the Bessel function 72 | $$ J_0(x) = \frac{1}{2\pi}\int_0^{2\pi}\exp(-jx\cos(\theta-\varphi))\mathrm{d}\theta $$ 73 | 74 | Therefore 75 | $$ F(\rho,\varphi) = \int_0^a 2\pi J_0(2\pi\rho r)r\mathrm{d}r = aJ_1(2\pi a \rho)/\rho $$ 76 | 77 | - Bessel functions $J_1(\cdot)$ are usually considered similar to (decaying) sinusoids. And $J_1(x)/x$ is called the **Jinc Function** 78 | 79 | #### Delta Function (Impulse) 80 | 81 | $$ f(x,y) = \delta(x,y) = \delta(x)\delta(y) $$ 82 | 83 | $$F(u,v) = 1$$ 84 | 85 | #### Symmetric Delta Function Pairs 86 | 87 | Correspond to sinusoids. 88 | 89 | ### 2D Fourier Transform on Images 90 | 91 | Usually the 2D FT of a natural image has a roughly Gaussian-shaped magnitude distribution. In many cases, structured patterns (symmetric peaks, etc.) correspond to artificial structures in the image. 92 | 93 | ### Magnitude vs Phase 94 | 95 | - Magnitude: informative. Amplitude of sinusoids. 96 | - Phase: less intuitive. May have similar patterns to the magnitude spectrum, but looks messy due to random noise. 97 | 98 | $$ \angle\left( \sum ae^{-j2\pi t} + n \right) $$ 99 | 100 | - For regions with high SNR, the magnitude and phase spectra have similar patterns 101 | - the magnitude of an image usually decreases with the frequency 102 | - For regions with low SNR, random noise dominates the phase. 103 | - the phase of an image is generally uniformly distributed 104 | 105 | However, phases are important because they contain more information about the original image. 106 | 107 | ## Convolution Theorem and Frequency Filtering 108 | 109 | **Filtering** in the frequency domain consists of modifying the Fourier transform of an image and then taking the inverse Fourier transform of the modified spectrum. 110 | 111 | ### 2D Convolution Theorem 112 | 113 | For an LTI system 114 | $$ \mathcal{F}\{f(x,y)*h(x,y)\} = F(u,v)H(u,v) $$ 115 | 116 | **Linear** spatial filtering operations can be carried out by multiplications in the Fourier domain. 117 | 118 | - Nonlinear filtering such as median filtering cannot be performed in the frequency domain. 119 | 120 | #### Basic Steps 121 | 122 | 1. Compute the FT 123 | 2. Multiply the FTs 124 | 3. Take the IFT
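A quick numerical check of these steps against circular convolution in the spatial domain (illustrative Python; the $3\times 3$ averaging kernel and image size are arbitrary choices):

```python
import numpy as np

rng = np.random.default_rng(0)
f = rng.normal(size=(32, 32))          # a random "image"
h = np.zeros((32, 32))
h[:3, :3] = 1 / 9                      # 3x3 averaging kernel, zero-padded to image size

# frequency domain: multiply the transforms, then invert
g_freq = np.fft.ifft2(np.fft.fft2(f) * np.fft.fft2(h)).real

# spatial domain: the equivalent circular convolution
g_spat = np.zeros_like(f)
for k in range(3):
    for l in range(3):
        g_spat += h[k, l] * np.roll(np.roll(f, k, axis=0), l, axis=1)

print(np.allclose(g_freq, g_spat))     # True
```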
125 | 126 | #### Technical Details 127 | 128 | - How to multiply if the sizes of the image and the kernel differ? 129 | - Padding. But in the frequency domain or in the spatial domain? 130 | - The spatial domain. 131 | - This increases the resolution in the frequency domain. 132 | - Some filters may be easier to specify in one domain than in another, due to sampling and quantization. 133 | 134 | ## 2D Sampling 135 | 136 | ### 2D Sampling Function 137 | 138 | $$ \sum_{n=-\infty}^{+\infty}\sum_{m=-\infty}^{+\infty} \delta(x-nX)\delta(y-mY) $$ 139 | 140 | $$ F(u,v) = \frac{1}{XY}\sum_{k=-\infty}^{+\infty}\sum_{l=-\infty}^{+\infty}\delta\left(u-\frac{k}{X}\right)\delta\left(v-\frac{l}{Y}\right) $$ 141 | 142 | ### Aliasing in 2D 143 | 144 | Aliasing occurs if the signal contains frequencies above half the sampling rate. 145 | 146 | #### Handling Aliasing 147 | 148 | - Increase the sample rate: buy a new display or a new camera. (`NoMoneyException`) 149 | - Preprocess: Downsampling or filtering (remove high frequencies) 150 | -------------------------------------------------------------------------------- /Digital Signal and Image Processing/DIP-Fundamentals.md: -------------------------------------------------------------------------------- 1 | # Fundamentals of Digital Images 2 | 3 | - Natural image: The projection of a 3D object onto a 2D image plane. 4 | 5 | ## Image Processing Pipeline 6 | 7 | 1. acquisition 8 | 2. enhancement (preprocessing) 9 | 3. restoration 10 | 4. morphological processing 11 | 5. other task-specific procedures 12 | 13 | ## Human Visual System 14 | 15 | > Sensor — Network — Compute 16 | 17 | - contrast 18 | - exposure 19 | - illusion 20 | 21 | ## Acquisition and Representation of Digital Images 22 | 23 | ### Acquisition: Sampling and Quantization 24 | 25 | #### Sampling 26 | 27 | - Take discrete samples $f[x,y]$ of a continuous image $f(x,y)$. 28 | - Each element of the array is a **pixel**. 29 | - loss of information when downsampling 30 | 31 | #### Quantization 32 | 33 | - use a finite number of bits to represent real values. 34 | - loss of information when rounding. 35 | 36 | #### Aliasing 37 | 38 | Not to be confused with aliasing in signal processing. 39 | 40 | - Aliasing occurs at edges of graphics 41 | - Anti-Aliasing: bi-linear interpolation or bi-cubic interpolation. 42 | 43 | #### Color Components 44 | 45 | A colored image consists of 3 channels 46 | 47 | - Red $R[x,y]$ 48 | - Green $G[x,y]$ 49 | - Blue $B[x,y]$ 50 | 51 | We cannot process each channel separately and combine the results: the distributions of the channels are different, which would cause undesired results. 52 | 53 | For monochromatic images, the 3 channels can be considered to have the same distribution and can therefore be processed together. 54 | -------------------------------------------------------------------------------- /Digital Signal and Image Processing/DIP-HistogramProcessing.md: -------------------------------------------------------------------------------- 1 | # Histogram Processing 2 | 3 | ## Image Histogram 4 | 5 | Represents the relative frequency of occurrence of the various gray levels of the image. 6 | 7 | ## Histogram Equalization 8 | 9 | ### Improving Contrast 10 | 11 | Improving the contrast of an image amounts to spreading its histogram into a wider range. 12 | 13 | Spreading out the frequencies in an image (or equalizing the image) is a simple way to improve dark or washed-out images. 14 | 15 | In other words, we need to implement a mapping such that any arbitrary input pdf (histogram) can be converted to a uniform distribution. 16 | 17 | ### Continuous Case 18 | 19 | Let $p_r(r)$ be the input histogram (pdf), and let $p_s(s)$ be our desired equalized output. 20 | 21 | Assume $s=T(r)$; the relationship between the input and output pdf is 22 | $$ p_s(s) \left| \frac{\mathrm{d}T(r)}{\mathrm{d}r} \right| = p_r(r) $$ 23 | 24 | We want $p_s(s)$ to be a constant (uniform distribution). Suppose the range of the input is from $0$ to $L-1$; then we can set $p_s(s) = 1/(L-1)$ for all $s$ and integrate over $r$ 25 | $$ T(r) = (L-1)\int_{0}^{r}p_r(w)\mathrm{d}w $$ 26 | 27 | - The slope of $T(r)$ gives the amplification. The higher the value in the original pdf, the larger the amplification. 28 | 29 | ### Discrete Case 30 | 31 | The pixels of digital images are usually quantized. 32 | $$ p_r(r_k) = \frac{n_k}{MN} $$ 33 | 34 | If we rewrite the integral as a summation 35 | $$ s_k = T(r_k) = (L-1)\sum_{j=0}^kp_r(r_j) = \frac{L-1}{MN}\sum_{j=0}^kn_j $$ 36 | 37 | This can give quite a good result, but not an optimal one. Performing equalization is non-trivial in the discrete case. 38 | 39 | Instead of smoothing the original pdf into a uniform distribution, the summation only adjusts the distances between the bins. 40 | 41 | - Bins with high frequencies are "separated" 42 | - Bins with low frequencies are "compressed" 43 | 44 | > "Washed out, over-exposed." -- Yuye Ling
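A minimal sketch of the discrete mapping $s_k$ above (illustrative Python, not from the course; an 8-bit synthetic image is assumed):

```python
import numpy as np

def equalize(img, L=256):
    """Discrete histogram equalization: s_k = (L-1)/(MN) * sum_{j<=k} n_j."""
    hist = np.bincount(img.ravel(), minlength=L)   # n_j for each gray level
    cdf = np.cumsum(hist) / img.size               # cumulative sum / (M*N)
    mapping = np.round((L - 1) * cdf).astype(img.dtype)
    return mapping[img]                            # apply the lookup table

img = np.clip(np.random.default_rng(0).normal(80, 20, (64, 64)), 0, 255).astype(np.uint8)
print(img.min(), img.max())                        # narrow input range
print(equalize(img).min(), equalize(img).max())    # spread toward [0, 255]
```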
45 | 46 | ## Histogram Specification 47 | 48 | In addition to the uniform distribution, we can also specify other distributions (e.g. Gaussian) and force the processed image to have the specified histogram distribution. 49 | 50 | ## CLAHE: Contrast-Limited Adaptive Histogram Equalization 51 | 52 | > Cost-Limited Adaptive Healthy Eating 53 | 54 | In the discrete case, histogram equalization may fail to produce a desired output. 55 | 56 | - Vanilla HE will assign the best contrast to the dominant gray-level regions. 57 | - Because the slope of $T(r)$ gives the scale of amplification. 58 | 59 | ### Contrast Limitation 60 | 61 | - We want to limit the amplification given by $T(r)$: this is equivalent to clipping the height of the histogram. 62 | - But the clipped histogram (pdf) no longer sums to 1. 63 | - So we compensate for the clipped values by increasing all values in the pdf. 64 | - The clipped parts now have gentler slopes. 65 | - Originally low-frequency parts now have steeper slopes. 66 | - Contrast limiting is simply histogram specification. 67 | 68 | ### Adaptive 69 | 70 | - Perform histogram equalization in a small neighborhood. 71 | - Use interpolation to stitch the neighborhoods up. 72 | -------------------------------------------------------------------------------- /Digital Signal and Image Processing/DIP-IntensityTransformAndSpatialFiltering.md: -------------------------------------------------------------------------------- 1 | # Intensity Transformation and Spatial Filtering 2 | 3 | ## Basics 4 | 5 | What is the spatial domain? 6 | 7 | - the plane containing the pixels of an image. 8 | - neighborhood. 9 | 10 | - Intensity Transformation 11 | - $g[x,y] = T(f[x,y])$ 12 | - Spatial Filtering 13 | - mask 14 | - kernel 15 | - template 16 | - window 17 | 18 | ## Intensity Transformation 19 | 20 | ### Thresholding 21 | 22 | - Useful for segmentation, to isolate an object of interest from a background.
23 | 24 | ### Kinds of grey level transformation 25 | 26 | - Linear 27 | - Negative 28 | - Identity 29 | - Logarithmic 30 | - Log 31 | - Exponential 32 | - Power (gamma) 33 | - $N$-th power 34 | - $N$-th root 35 | 36 | ### Logarithmic transformation 37 | 38 | $$ s = c\log(1+r) $$ 39 | 40 | Maps a narrow range of low input grey level values into a wider range of output values. 41 | 42 | ### Gamma transformation 43 | 44 | $$ s = cr^{\gamma} $$ 45 | 46 | Maps a narrow range of dark/bright input values into a wider range of output values. 47 | -------------------------------------------------------------------------------- /Digital Signal and Image Processing/DIP-Restoration.md: -------------------------------------------------------------------------------- 1 | # Fundamentals of Image Restoration 2 | 3 | ## Image Degradation 4 | 5 | ### Degradations 6 | 7 | - Optical Blur 8 | - Motion Blur 9 | - Spatial Quantization 10 | - Additive Intensity Noise 11 | 12 | ### Formulation 13 | 14 | #### Overview 15 | 16 | We can assume that the blur pattern of a point source is the same everywhere in the image 17 | 18 | Therefore imaging can be modeled as an LSI system. 19 | 20 | - the convolution kernel is called the **point spread function** (PSF). 21 | 22 | #### Maths: Inverse Problem 23 | 24 | An observed image can be modeled as 25 | $$ g(x,y) = h(x,y) * f(x,y) + n(x,y) $$ 26 | 27 | where $h$ is the PSF of the imaging system, and $n$ is the additive noise. 28 | 29 | Observing $g$, we want to solve for $f$, but this problem is non-trivial. 30 | 31 | #### Challenge: Loss of Information 32 | 33 | Take a Gaussian filter for example: the filter discards much of the information in the high frequencies. 34 | 35 | Knowing the output and solving for the input is underdetermined. 36 | 37 | So we need many assumptions and priors. 38 | 39 | ## Image Restoration 40 | 41 | ### Inverse Filtering 42 | 43 | #### 1D Vector Explanation 44 | 45 | Consider $g=h*f$; if we ignore the boundaries, it can be re-written into matrix form 46 | $$ g = Af $$ 47 | 48 | where $A$ can be constructed from $h$. 49 | 50 | So the problem is solved by inverting $A$. 51 | $$ f=A^{-1}g $$ 52 | 53 | - Matrix inversion is difficult. 54 | - If the boundary is not ignored, then $A$ is not a square matrix, and we have to use the pseudo-inverse. 55 | 56 | #### Fourier Perspective: Inverse Filtering 57 | 58 | $$ G = HF + N $$ 59 | 60 | If we ignore the noise $N$, 61 | $$ \hat{F} = G/H $$ 62 | 63 | - works perfectly fine if there is no noise 64 | - fails immediately if noise exists 65 | 66 | #### Noise Amplification 67 | 68 | If noise exists 69 | $$ \hat{F} = F + N/H $$ 70 | 71 | As we divide $N$ by $H$, the noise gets amplified and dominates the high frequencies, because $1/H$ has larger values at high frequencies. 72 | 73 | ### Wiener Filtering 74 | 75 | Still applies the inverse kernel, but avoids divide-by-zero by adding a damping factor. 76 | $$ \hat{F} =\frac{|H|^2}{|H|^2+1/SNR}\cdot\frac{G}{H} = \frac{H^*}{|H|^2+K}\cdot G $$ 77 | 78 | where the first term is an amplitude-dependent damping factor. 79 | 80 | - If $K=0$, Wiener filtering reduces to standard inverse filtering. 81 | - Choosing $K$ 82 | - Manual parameter tuning 83 | - Prior: Choose some part of the image that is assumed to be uniformly distributed, and estimate the SNR in that region. 84 | - BM3D: State-of-the-art 85 | - Intuitively, Wiener filtering drops frequencies that are considered dominated by noise.
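A minimal frequency-domain sketch of the formula above (illustrative Python, not from the course; `K` stands in for $1/SNR$ and must be tuned as discussed):

```python
import numpy as np

def wiener_deconv(g, h, K=0.01):
    """Wiener filtering: F_hat = conj(H) / (|H|^2 + K) * G."""
    G = np.fft.fft2(g)
    H = np.fft.fft2(h, s=g.shape)            # zero-pad the PSF to the image size
    F_hat = np.conj(H) / (np.abs(H) ** 2 + K) * G
    return np.fft.ifft2(F_hat).real          # back to the spatial domain
```

With `K=0` this degenerates to inverse filtering and amplifies noise wherever `H` is small, which is exactly the failure mode described above.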
86 | 87 | #### Maths 88 | 89 | The Wiener filter is designed to minimize the mean squared error 90 | $$ E = \iint |f(x,y)-\hat{f}(x,y)|^2\mathrm{d}x\mathrm{d}y $$ 91 | 92 | - Assumes the noise is additive Gaussian noise 93 | 94 | By Parseval's Theorem 95 | $$ E = \iint |F(u,v) - \hat{F}(u,v)|^2\mathrm{d}u\mathrm{d}v $$ 96 | where $\hat{F}=WG=WHF+WN$ 97 | 98 | Therefore 99 | $$F-\hat{F}=(1-WH)F-WN$$ 100 | 101 | $$E=\iint |(1-WH)F-WN|^2\mathrm{d}u\mathrm{d}v$$ 102 | 103 | If we assume the two terms are not correlated, 104 | $$ E=\iint |(1-WH)F|^2+|WN|^2\mathrm{d}u\mathrm{d}v $$ 105 | 106 | To minimize the error, we take the derivative with respect to $W$ and set it to zero. 107 | $$ -(1-W^*H^*)H|F|^2+W^*|N|^2=0 \Longrightarrow W = \frac{H^*}{|H|^2+|N|^2/|F|^2} $$ so the damping constant $K$ above plays the role of $|N|^2/|F|^2 = 1/SNR$. 108 | 109 | #### Example: Motion Blur 110 | 111 | Motion blur can be modeled as a shift-and-add process 112 | $$ g = \frac{1}{T}\int_{-T/2}^{T/2}f(x-x_0(t),y)\mathrm{d}t $$ 113 | 114 | To apply Wiener filtering, we need to derive $H$. By the definition and properties of the Fourier transform 115 | $$ G = \frac{1}{T} \int_{-T/2}^{T/2}Fe^{-j2\pi{}ux_0(t)}\mathrm{d}t $$ 116 | 117 | Therefore 118 | $$ H = \frac{1}{T}\int_{-T/2}^{T/2}e^{-j2\pi{}ux_0(t)}\mathrm{d}t$$ 119 | 120 | Suppose $x_0(t)=st$; then 121 | $$ H = \textrm{sinc}(\pi{}ud) $$ where $d=sT$ is the total displacement during the exposure. 122 | 123 | So $H$ still has zeros, and we should use Wiener filtering. 124 | 125 | ### MAP Formulation 126 | 127 | `NotImplementedError: ran out of lecture time` 128 | -------------------------------------------------------------------------------- /Digital Signal and Image Processing/DIP-SpatialFiltering.md: -------------------------------------------------------------------------------- 1 | # Fundamentals of Spatial Filtering 2 | 3 | ## Neighborhood Operations 4 | 5 | Neighborhood operations operate on a larger neighborhood of pixels, instead of operating on single points. 6 | 7 | ## Filtering vs Convolution 8 | 9 | Filtering, as implemented in MATLAB, is correlation: 10 | $$ g[m,n] = \sum_{k}\sum_{l}h[k,l]f[m+k,n+l] $$ 11 | 12 | Convolution in MATLAB 13 | $$ g[m,n] = \sum_{k}\sum_{l}h[k,l]f[m-k,n-l] $$ 14 | 15 | ## Image Smoothing 16 | 17 | - Averaging all pixels in a neighborhood. 18 | - Useful in removing noise 19 | - Linear 20 | 21 | ### Simple Averaging Filter 22 | 23 | ### Weighted Averaging Filter 24 | 25 | ### Gaussian Filter 26 | 27 | ## Nonlinear Filtering 28 | 29 | ### Minimum Filtering 30 | 31 | ### Maximum Filtering 32 | 33 | ### Median Filtering 34 | 35 | - Useful in cancelling salt-and-pepper noise. 36 | 37 | ## Edge Effect 38 | 39 | How to pad. 40 | 41 | ### Zero Padding 42 | 43 | May cause edge effects. 44 | 45 | ### Replicate 46 | 47 | Pad by replicating edge pixels 48 | 49 | ### Wrap Around Edges 50 | 51 | Assume the image is "periodic", and pad with the periodic extension. 52 | -------------------------------------------------------------------------------- /Digital Signal and Image Processing/DSP-FunctionalInterpretationOfSignals.md: -------------------------------------------------------------------------------- 1 | # A Functional Interpretation of Signals 2 | 3 | ## Prerequisites 4 | 5 | ### Vector Spaces 6 | 7 | A vector space $V$ is a set of elements (for us, functions) equipped with 8 | 9 | - Vector addition: $\forall u,v \in V \quad u + v \in V$ 10 | - Scalar multiplication: $\forall \alpha \in \mathbb{R}, \forall v \in V, \quad \alpha v \in V$ 11 | 12 | ### Subspace 13 | 14 | $S$ is a subspace of $V$ if 15 | 16 | - $S$ is a subset of $V$ 17 | - $S$ is a vector space for the addition and scalar multiplication 18 | 19 | ### Spanned Subspace 20 | 21 | Consider $v_1,\dots,v_N$ in a vector space $V$.
Define 22 | $$ S = \{ \alpha_1v_1 + \cdots + \alpha_Nv_N: \alpha_1,\dots,\alpha_N \in \mathbb{R} \} $$ 23 | $S$ is the subspace spanned by $v_1,\dots,v_N$, denoted by 24 | $$ S = \mathrm{span}(v_1,\dots,v_N) $$ 25 | 26 | ### Euclidean Space 27 | 28 | A vector space $V$ is a Euclidean space if an **inner product** is defined on $V$. 29 | 30 | - For example, the 2D Euclidean space $\mathbb{R}^2$ has the inner product defined by $\langle x,y \rangle = x_1y_1 + x_2y_2$ 31 | 32 | The formal definition of an inner product is: A function $\langle \cdot, \cdot \rangle$ is called an inner product if 33 | 34 | - $\forall u,v \in V, \langle u,v \rangle \in \mathbb{R}$ (or $\mathbb{C}$) 35 | - $\forall u,v,w\in V, \langle u+v,w \rangle = \langle u,w \rangle + \langle v,w \rangle$ 36 | - $\forall \alpha \in \mathbb{R}, \forall u,v \in V, \langle \alpha u, v \rangle = \alpha \langle u, v \rangle$ 37 | - $\forall u,v \in V, \langle u,v \rangle = \langle v,u \rangle$ 38 | - $\forall u \in V, \langle u,u \rangle \ge 0$ 39 | - $\langle u, u \rangle = 0 \Longrightarrow u = 0$ 40 | 41 | ### Normed Space 42 | 43 | Based on the inner product, we can define 44 | 45 | - Norm: $\|x\| = \sqrt{\langle x,x \rangle}$ 46 | - $\|x\| \ge 0$ 47 | - Distance: $d(x,y) = \|x-y\|$ 48 | - Orthogonality: $x \perp y \Leftrightarrow \langle x,y \rangle = 0$ 49 | - Orthonormal bases. 50 | - Projection. 51 | 52 | #### Projection 53 | 54 | Projection of $x$ onto $y$ 55 | 56 | $$z = \frac{\langle x, y \rangle}{\|y\|^2}y$$ 57 | 58 | ### Kindergarten Equation 59 | 60 | Take $\mathbb{R}^2$ for example. Consider the orthonormal basis 61 | 62 | $$ 63 | \begin{cases} 64 | e_1 &= (1,0)\\ 65 | e_2 &= (0,1) 66 | \end{cases} 67 | $$ 68 | 69 | Then the projections of $x=(x_1,x_2)$ can be obtained by inner products 70 | $$\langle x, e_1 \rangle = x_1 \quad \langle x, e_2 \rangle = x_2$$ 71 | 72 | We introduce the **Kindergarten Equation** 73 | $$ x = x_1e_1 + x_2e_2 = \langle x,e_1 \rangle e_1 + \langle x,e_2 \rangle e_2 $$ 74 | 75 | ### Hilbert Space 76 | 77 | The previous discussions can be generalized to arbitrarily large finite and even some infinite dimensions. 78 | 79 | The inner product of $\mathbb{R}^{\mathbb{R}}$ (for $T$-periodic signals) is defined by 80 | $$ \langle x,y \rangle = \frac{1}{T}\int_0^T x(t)y^*(t)\mathrm{d}t $$ 81 | 82 | This form is very similar to the formalism of the Fourier transform. 83 | 84 | ### Fourier Series: A Functional Perspective 85 | 86 | Suppose a $T$-periodic signal $x(t)$; its Fourier series is given by 87 | $$ x(t) = \sum_k X[k]\cdot\exp(j2\pi k\frac{t}{T}) $$ 88 | 89 | - We use a set of orthonormal basis functions $\exp(j2\pi kt /T)$ to approximate the original signal. 90 | 91 | The coefficients $X[k]$ 92 | $$ X[k] = \frac{1}{T}\int_0^T x(t)\cdot\exp(-j2\pi k \frac{t}{T})\mathrm{d}t $$ 93 | 94 | are exactly the projections of $x(t)$ onto the orthonormal basis $e_k = \exp(j2\pi kt/T)$ 95 | 96 | Therefore, the Fourier series gives an approximation to the original periodic function, in the subspace spanned by $e_k$, in the mean-square sense. 97 |
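The "coefficients as projections" view is easy to verify numerically (illustrative Python; a square wave is an arbitrary test signal, whose odd harmonics should come out with magnitude $2/(k\pi)$):

```python
import numpy as np

T = 1.0
t = np.linspace(0, T, 4096, endpoint=False)
x = np.sign(np.sin(2 * np.pi * t / T))      # T-periodic square wave

def coeff(k):
    # X[k] = (1/T) * integral of x(t) exp(-j 2 pi k t / T): a projection onto e_k
    return np.mean(x * np.exp(-2j * np.pi * k * t / T))

print([round(abs(coeff(k)), 4) for k in (1, 2, 3)])
# ~[0.6366, 0.0, 0.2122], i.e. 2/pi, 0, 2/(3 pi)
```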
-------------------------------------------------------------------------------- /Digital Signal and Image Processing/DSP-SamplingAndInterpolation.md: -------------------------------------------------------------------------------- 1 | # Sampling and Interpolation 2 | 3 | ## Sampling 4 | 5 | ### Definition 6 | 7 | As has been discussed in Signals and Systems, for an arbitrary signal, we can use a Dirac comb to sample it and convert it to a discrete-time signal. 8 | $$ s(t) = \sum_{n=-\infty}^{+\infty} \delta(t-nT) $$ 9 | 10 | $$x[n] = x(t)s(t) = x(nT)$$ 11 | 12 | ### Invertibility 13 | 14 | **Nyquist Sampling Theorem**: For a **band-limited** continuous-time signal, if we sample it at a rate higher than 2$\times$ the bandwidth, then we can perfectly reconstruct the original continuous-time signal. 15 | 16 | - However, most signals in reality are not band-limited. 17 | - In general, sampling is **not** invertible. 18 | 19 | ## Interpolation 20 | 21 | ### Sinc Interpolation 22 | 23 | We can reconstruct a band-limited signal by superimposing a scaled sinc function at each sampling point. 24 | $$ \mathrm{sinc}(t) = \frac{\sin(t)}{t} $$ 25 | 26 | #### Why sinc? 27 | 28 | For a band-limited signal $x(t)$ with spectrum $X(j\Omega)$, 29 | 30 | $$ 31 | X(j\Omega) = \begin{cases} 32 | something, &\quad |\Omega| \le \Omega_0\\ 33 | 0, &\quad |\Omega| > \Omega_0 34 | \end{cases} 35 | $$ 36 | 37 | Therefore the spectrum can be re-written as 38 | $$ X(j\Omega) = X(j\Omega)\cdot\mathrm{rect}\left( \frac{\Omega}{\Omega_0} \right) = X_p(j\Omega)\cdot\mathrm{rect}\left( \frac{\Omega}{\Omega_0} \right) $$ 39 | where $X_p(j\Omega)$ is constructed by periodically extending $X(j\Omega)$ 40 | 41 | Since $X_p(j\Omega)$ is periodic, we can expand it by its Fourier series 42 | $$X_p(j\Omega) = \sum_{n=-\infty}^{+\infty} x[n]\exp(j2\pi n \Omega/\Omega_0)$$ and multiplying by $\mathrm{rect}\left( \frac{\Omega}{\Omega_0} \right)$ recovers $X(j\Omega)$ 43 | 44 | The inverse Fourier transform of the product is then given by 45 | $$x(t) = \left( \sum_{n=-\infty}^{+\infty}x[n]\delta(t-nT_0)\right) * \Omega_0\mathrm{sinc}(\Omega_0t)$$ 46 | 47 | ### Aliasing 48 | 49 | What happens to the sinc function when aliasing occurs? 50 | 51 | - With increased signal frequency, the sinc functions can no longer span the signal space 52 | - Therefore aliasing occurs 53 | 54 | #### Anti-Aliasing Filters 55 | 56 | ### Interpolating with Different Bases 57 | 58 | #### Piecewise Constant 59 | 60 | $$ V = \{ \text{piecewise constant functions on intervals } [kT, (k+1)T] \} $$ 61 | 62 | #### Linear 63 | 64 | $$ V = \{ \text{piecewise linear functions on intervals } [kT, (k+1)T] \} $$ 65 | 66 | ### Brief Remarks on Sampling 67 | 68 | - Theoretically, it is impossible to reconstruct the original continuous-time signal from its finite samples 69 | - However, based on 70 | - Our knowledge about the signal 71 | - Our tolerance on the reconstruction accuracy 72 | - We could design a proper basis and kernel to represent and reconstruct the signal 73 | 74 | ## Discrete-Time Processing of Continuous-Time Signals 75 | 76 | ### Basic Workflow 77 | 78 | Theoretically, 79 | 80 | - Sample the CT signal 81 | - Do DT processing 82 | - Interpolate back to a CT signal 83 | 84 | But this process is **seriously non-trivial** 85 | 86 | C/D conversion: $x[n] = x_c(nT)$ 87 | The DTFT of $x[n]$ is 88 | $$X(e^{j\omega}) = \frac{1}{T}\sum_{k=-\infty}^{+\infty}X_c\left[ j\left( \frac{\omega}{T} - \frac{2\pi k}{T} \right) \right]$$ 89 | 90 | D/C conversion: $y_r(t) = \sum_{n=-\infty}^{+\infty}y[n]\frac{\sin[\pi(t-nT)/T]}{\pi(t-nT)/T}$ 91 | The DTFT of $y[n]$ is related to the FT of $y_r$ by 92 | $$Y_r(j\Omega) = H_r(j\Omega)Y(e^{j\Omega T})$$ 93 | 94 | If the DT system is LTI, we have 95 | $$ Y(e^{j\Omega T}) = H(e^{j\Omega T})X(e^{j\Omega T}) $$ 96 | 97 | If $X_c$ is bandlimited, there is no aliasing, and 98 | $$Y_r(j\Omega) = H(e^{j\Omega T})X_c(j\Omega)$$ 99 | 100 | - $x_c(t)$ must be bandlimited 101 | - C/D must satisfy the Nyquist sampling theorem 102 | - The DT system must be LTI
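A small numerical illustration of the sinc-interpolation formula from the section above (illustrative Python; the finite sum over a window of samples only approximates the ideal reconstruction, and `np.sinc` is the normalized sinc):

```python
import numpy as np

T = 0.1                                    # sampling period, fs = 10 Hz
n = np.arange(-50, 51)
x_n = np.cos(2 * np.pi * 3 * n * T)        # 3 Hz tone, below the 5 Hz Nyquist limit

def reconstruct(t):
    # superpose a scaled sinc at every sample: x(t) ~ sum_n x[n] sinc((t - nT)/T)
    return np.sum(x_n * np.sinc((t - n * T) / T))

for t in (0.012, 0.047):
    print(reconstruct(t), np.cos(2 * np.pi * 3 * t))   # approximately equal
```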
103 | 104 | ### Low-Pass Filtering before Sampling 105 | 106 | By applying a low-pass filter in the spectral domain, we will be able to sample signals that are originally impossible to sample, but at a cost in precision. 107 | 108 | ## Quantization 109 | 110 | - Input/output quantization 111 | - Filter coefficient quantization 112 | - Product roundoff 113 | - (Potential) overflow in sums 114 | 115 | Notice that quantization makes the system nonlinear. To make our analysis tractable, denote the quantized value as 116 | $$\hat{x} = \mathcal{Q}(x) = x + \epsilon$$ 117 | where $\epsilon$ is the quantization error. 118 | 119 | Define $\Delta = \frac{2X_m}{2^{B+1}} = \frac{X_m}{2^B}$ where $2X_m$ is the full-scale range of the quantizer and $B+1$ is the number of quantizer bits. 120 | 121 | Therefore the quantizer can be seen as the addition of the original signal $x[n]$ and a noise signal $e[n]$ 122 | 123 | - $e$ is a uniformly distributed random variable between $-\Delta/2$ and $\Delta/2$ 124 | - $e$ is a stationary stochastic process; its statistics do not change over time 125 | - $e$ is a white sequence; $e[n]$ is uncorrelated with $e[m]$ for all $n \neq m$ 126 | - $e$ is uncorrelated with $x$ 127 | 128 | ### Major Types of Quantization 129 | 130 | - Truncation 131 | - Rounding 132 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 YBRua 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Machine Learning/Introduction.tex: -------------------------------------------------------------------------------- 1 | \chapter{Introduction} 2 | \emph{``Is your CS better than the CS department's? Then is your EE better than the EE department's?''} 3 | \newpage 4 | 5 | \section{Basics} 6 | The basic assumption of machine learning is that \textbf{data samples are i.i.d.}. 7 | 8 | The goal of training a model is to \textbf{minimize the generalization error of the model}. Since we only have a limited amount of data, what we can actually do is to minimize the empirical error. 9 | 10 | However, we do not always want the empirical error to be as small as possible, due to the risk of overfitting. 11 | 12 | \section{Overfitting and Underfitting} 13 | \paragraph{Overfitting.} High variance. The model performs well on training sets but performs poorly on new unseen samples.
Using a high-order model to fit a low-order data distribution usually leads to overfitting. 14 | \paragraph{Underfitting.} High bias. The model has not fully captured the underlying structure of the data. Remedies: conduct more training, or switch to a more complex model. 15 | 16 | \section{Methods for Splitting Data} 17 | To train a model, we first need to divide the data into a training set and a test set. The training set and the test set should be disjoint. 18 | 19 | \subsection{Hold-Out} 20 | Divide dataset $\mathcal{D}$ into training set $\mathcal{S}$ and test set $\mathcal{T}$ s.t. 21 | \[ \mathcal{S} \cup \mathcal{T} = \mathcal{D} \quad \mathcal{S} \cap \mathcal{T} = \emptyset \] 22 | Typical proportions of $\mathcal{S}$ and $\mathcal{T}$ are 70\% and 30\%. 23 | 24 | \subsection{Cross-Validation} 25 | Divide $\mathcal{D}$ into $k$ disjoint sets of similar size. 26 | \[ \mathcal{D} = \mathcal{D}_1 \cup \mathcal{D}_2 \cup \dots \mathcal{D}_k \quad \text{s.t.} \quad \mathcal{D}_i \cap \mathcal{D}_j = \emptyset \] 27 | Each time use $k-1$ sets for training and the remaining set for testing. 28 | A typical value of $k$ is $10$. 29 | 30 | \subsection{Leave-One-Out} 31 | A special case of cross-validation, where each set $\mathcal{D}_i$ contains only one sample. 32 | 33 | \subsection{Bootstrapping} 34 | Suppose $\mathcal{D}$ has $m$ samples. Randomly pick a sample from $\mathcal{D}$, copy it into some $\mathcal{D}'$ and put it back into $\mathcal{D}$. Repeat the process $m$ times. 35 | \[ \lim_{m\to\infty}(1-\frac{1}{m})^m = \frac{1}{e} \approx 0.368 \] 36 | About $36.8\%$ of the samples in $\mathcal{D}$ will not be in $\mathcal{D}'$. So we can use $\mathcal{D}'$ for training and $\mathcal{D}\backslash\mathcal{D}'$ for testing. 37 | 38 | \section{Performance Evaluation} 39 | \subsection{Measure} 40 | \paragraph{Regression} The common performance measure for a regression model is the \textbf{Mean Squared Error}. 41 | \[ E = \frac{1}{m}\sum_{i=1}^m(f(x^{(i)}) - y^{(i)})^2 \] 42 | \paragraph{Classification} A common measure for a classification model is the \textbf{Error Rate} 43 | \[ E = \frac{1}{m}\sum_{i=1}^m\mathbb{I}[f(x^{(i)}) \neq y^{(i)}] \] 44 | 45 | \subsection{TPR and FPR} 46 | \begin{definition}[Sensitivity/TPR] 47 | \[ TPR = \frac{TP}{TP + FN} \] 48 | \end{definition} 49 | \begin{definition}[FPR] 50 | \[ FPR = \frac{FP}{TN + FP} \] 51 | \end{definition} 52 | 53 | \subsection{Receiver Operating Characteristic} 54 | Many classification models output a real value and compare it to a certain threshold. 55 | 56 | The \textbf{ROC Curve} uses $FPR$ as its $x$-axis, and $TPR$ as its $y$-axis. It can be plotted by setting different thresholds for dividing positive and negative samples. 57 | 58 | The \textbf{Area Under Curve, AUC} is used to evaluate different models. Usually models with a larger AUC are considered to have better performance. 59 | 60 | \subsection{Precision and Recall} 61 | \begin{definition}[Precision] 62 | \[ P = \frac{TP}{TP + FP} \] 63 | \end{definition} 64 | \begin{definition}[Recall] 65 | \[ R = \frac{TP}{TP + FN} \] 66 | \end{definition} 67 | Similar to the ROC Curve, we can also plot the \textbf{P-R Curve}. And the \textbf{Break-Even Point, BEP}, defined as the value where $P = R$, is used to evaluate different models. 68 | 69 | Another more common measure is the $F1$ rate 70 | \begin{definition}[$F1$ Rate] 71 | \[ F1 = \frac{2 \times P \times R}{P + R} = \frac{2 \times TP}{\#Samples + TP - TN} \] 72 | \end{definition} 73 | \begin{remark} 74 | The $F1$ rate is defined by the harmonic mean of Precision and Recall. 75 | \end{remark} 76 | 77 | \begin{definition}[$F_{\beta}$ Rate] 78 | \[ F_{\beta} = \frac{(1+\beta^2)\times P \times R}{(\beta^2 \times P)+R} \] 79 | \end{definition} 80 | \begin{remark} 81 | $F_{\beta}$ is the weighted harmonic mean. When $\beta > 1$, recall has a higher weight. When $0 < \beta < 1$, precision has a higher weight. 82 | \end{remark}
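As a quick numerical sketch of these measures (illustrative Python with made-up confusion-matrix counts, not part of the derivation):
\begin{lstlisting}[language=python]
def f_beta(tp, fp, fn, beta=1.0):
    # F_beta from confusion-matrix counts; beta = 1 gives the F1 rate
    p = tp / (tp + fp)   # precision
    r = tp / (tp + fn)   # recall
    b2 = beta ** 2
    return (1 + b2) * p * r / (b2 * p + r)

print(f_beta(tp=40, fp=10, fn=20))           # F1 = 2PR/(P+R) ~ 0.727
print(f_beta(tp=40, fp=10, fn=20, beta=2.0)) # weights recall more heavily
\end{lstlisting}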
83 | 84 | \section{Error Analysis} 85 | \paragraph{Bias.} The \textbf{bias} is the difference between the model prediction and the ground truth. This is usually because the model is not well-trained, or because the model is not complex enough to fit the data distribution. 86 | \paragraph{Variance.} The \textbf{variance} is the variance of the outputs of the same model trained on different datasets. This is usually because the model is too complex and mistakenly fits the noise or specific features in the dataset. 87 | \paragraph{Noise.} Noise. 88 | 89 | High variance $\to$ Overfitting. 90 | 91 | High bias $\to$ Underfitting. 92 | 93 | \subsection{Bias-Variance Decomposition} 94 | Let $\bar{f}(x) = \mathbb{E}_{\mathcal{D}}[f(x;\mathcal{D})]$ and 95 | \[ bias(x) = \bar{f}(x) - y \] 96 | \[ var(x) = \mathbb{E}_{\mathcal{D}}[(f(x;\mathcal{D}) - \bar{f}(x))^2] \] 97 | The generalization error of a model $f$ trained on $\mathcal{D}$ can be represented by 98 | \[ E(f;\mathcal{D}) = bias^2(x) + var(x) + \varepsilon^2 \] -------------------------------------------------------------------------------- /Machine Learning/LinearDiscriminantAnalysis.tex: -------------------------------------------------------------------------------- 1 | \chapter{Linear Discriminant Analysis} 2 | \newpage 3 | 4 | \section{Binary LDA} 5 | Linear Discriminant Analysis aims at projecting data samples onto a hyperplane, such that samples of the same class get as close to each other as possible, while samples of different classes get as far from each other as possible. 6 | 7 | \subsection{Within- and Between- class Distances} 8 | Let $\mu_1$ and $\mu_2$ be the mean vectors of the two classes. 9 | \[ \mu_1 = \frac{1}{N_1}\sum_{i\in C_1}x^{(i)} \qquad \mu_2 = \frac{1}{N_2}\sum_{i\in C_2}x^{(i)} \] 10 | 11 | \subsubsection{Between-class Distance} 12 | Let $w$ be the linear mapping, 13 | \[ d_{between} = |w^T\mu_1 - w^T\mu_2| \] 14 | We want to maximize $d_{between}$, which is equivalent to maximizing 15 | \[ \|d_{between}\|^2 = w^T(\mu_1-\mu_2)(\mu_1-\mu_2)^Tw\] 16 | 17 | \subsubsection{Within-class Distance} 18 | We also want to minimize 19 | \[ d_{within}^{(i)} = \sum_{j\in C_i}|w^Tx^{(j)} -w^T\mu_i| \] 20 | for all classes $i$, which is equivalent to minimizing 21 | \[ \|d_{within}^{(i)}\|^2 = \sum_{j\in C_i}w^T(x^{(j)}-\mu_i)(x^{(j)}-\mu_i)^Tw = w^T\Sigma_iw \] 22 | where $\Sigma_i$ is the covariance matrix of the $i$-th class. 23 | 24 | Therefore define 25 | \[ d_{within}^2 = \sum_i (d_{within}^{(i)})^2 \] 26 | 27 | \subsection{Optimization Formulation} 28 | Notice that we have two objective functions; the key is to combine the two objectives into a single objective function.
29 | 30 | This can be done by simply setting the objective function to be 31 | \[ \max J = \frac{d_{between}^2}{d_{within}^2} = \frac{w^T(\mu_1-\mu_2)(\mu_1-\mu_2)^Tw}{w^T(\Sigma_1 + \Sigma_2)w} \] 32 | 33 | We define the \textbf{within-class scatter matrix} and \textbf{between-class scatter matrix} by 34 | \[ S_w = \Sigma_1 + \Sigma_2 \] 35 | \[ S_b = (\mu_1 - \mu_2)(\mu_1-\mu_2)^T \] 36 | 37 | Therefore 38 | \[ J = \frac{w^TS_bw}{w^TS_ww} \] 39 | this is also known as the \textbf{generalized Rayleigh quotient}. 40 | 41 | \subsection{Solution} 42 | \[ J = \frac{w^TS_bw}{w^TS_ww} \] 43 | Notice that both the numerator and denominator are quadratic, so the norm of $w$ does not matter. 44 | 45 | \subsubsection{By Lagrangian} 46 | Since the norm of $w$ does not matter, the problem is equivalent to 47 | \begin{align*} 48 | \min &\quad -w^TS_bw\\ 49 | \text{s.t.} &\quad w^TS_ww=1 50 | \end{align*} 51 | By Lagrange multipliers 52 | \[ S_bw = \lambda S_ww \] 53 | 54 | Notice that $(\mu_1-\mu_2)^Tw$ is a scalar, and therefore $S_bw$ is parallel to $\mu_1-\mu_2$. WLOG let 55 | \[ S_bw = \lambda(\mu_1-\mu_2) \] 56 | 57 | And therefore 58 | \[ w = S_w^{-1}(\mu_1-\mu_2) \] 59 | 60 | \begin{remark} 61 | For numerical stability we usually compute the inverse by SVD. 62 | \end{remark} 63 | 64 | \subsubsection{By Gradient} 65 | We can directly take the gradient w.r.t. $w$. 66 | \[ \nabla J = \frac{2(w^TS_ww)S_bw-2(w^TS_bw)S_ww}{(w^TS_ww)^2} \] 67 | which yields 68 | \[ S_bw=JS_ww \Longrightarrow S_w^{-1}S_bw = Jw \] 69 | and it becomes an eigenvalue decomposition problem. 70 | 71 | To solve this, we perform an eigenvalue decomposition on $S_w^{-1}S_b$, and choose the eigenvector with the largest eigenvalue. 72 | 73 | Or alternatively, we can take more than one eigenvector. 74 | 75 | 76 | \section{Multi-class LDA} 77 | We introduce the \textbf{total scatter matrix} $S_T$ 78 | \[ S_T = \sum_{i=1}^N(x^{(i)}-\mu)(x^{(i)}-\mu)^T \] 79 | where $\mu$ is the mean vector of all samples. 80 | 81 | \begin{align*} 82 | S_T &= \sum_{j=1}^{K}\sum_{i\in C_j}(x^{(i)}-\mu_j+\mu_j-\mu)(x^{(i)}-\mu_j+\mu_j-\mu)^T\\ 83 | &= \sum_j\sum_{i\in C_j}(x^{(i)}-\mu_j)(x^{(i)}-\mu_j)^T\\ 84 | &\quad + \sum_j\sum_{i\in C_j}(\mu_j-\mu)(\mu_j-\mu)^T + 2 \sum_j\sum_{i\in C_j}(x^{(i)}-\mu_j)(\mu_j-\mu)^T\\ 85 | &= S_w + S_b 86 | \end{align*} 87 | Notice that the last (cross) term is zero. 88 | 89 | The solution is the same as in the binary case. 90 | 91 | However, notice that 92 | \[ S_b = \sum_{j=1}^k N_j(\mu_j-\mu)(\mu_j-\mu)^T \] 93 | has at most $k-1$ positive eigenvalues, and therefore we can at most reduce the dimension to $k-1$, but not $k$.
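As a quick numerical sketch of the binary solution $w = S_w^{-1}(\mu_1-\mu_2)$ (illustrative Python with synthetic Gaussian classes, not part of the derivation):
\begin{lstlisting}[language=python]
import numpy as np

rng = np.random.default_rng(0)
X1 = rng.normal([0, 0], 1.0, (100, 2))   # class 1 samples
X2 = rng.normal([3, 2], 1.0, (100, 2))   # class 2 samples

mu1, mu2 = X1.mean(axis=0), X2.mean(axis=0)
# unnormalized scatter matrices: sum of (x - mu)(x - mu)^T
Sw = np.cov(X1.T, bias=True) * len(X1) + np.cov(X2.T, bias=True) * len(X2)
w = np.linalg.solve(Sw, mu1 - mu2)       # w = Sw^{-1} (mu_1 - mu_2)
print(X1 @ w > X2 @ w)                   # projections of the two classes separate
\end{lstlisting}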
94 | -------------------------------------------------------------------------------- /Machine Learning/MachineLearning.pdf: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:8b1e848ac3ea3c534cfa6d8c3920cbf26588ffa9364df2ef4fab85a66b3fc60d 3 | size 192699 4 | -------------------------------------------------------------------------------- /Machine Learning/MachineLearning.tex: -------------------------------------------------------------------------------- 1 | \documentclass[oneside]{book} 2 | \usepackage{xeCJK} 3 | \usepackage{amsmath} 4 | \usepackage{mathtools} 5 | \usepackage{listings} % code listings via lstlisting 6 | \usepackage{booktabs} 7 | \usepackage{ulem} 8 | \usepackage{enumerate} 9 | \usepackage{amsfonts} 10 | \usepackage{amssymb} 11 | \usepackage{amsthm} 12 | \usepackage{setspace} % spacing environment for line spacing 13 | \usepackage[ruled, vlined]{algorithm2e} % algorithms and pseudocode 14 | \usepackage{bm} % bold symbols in math 15 | \usepackage{pifont} % circled numbers, 172-211, via \ding 16 | \usepackage{graphicx} 17 | \usepackage{float} 18 | \usepackage[dvipsnames]{xcolor} 19 | %\usepackage{indentfirst} 20 | \usepackage{ulem} % \sout{} for strikethrough 21 | \normalem % use the default normalem 22 | \usepackage{lmodern} 23 | \usepackage{subcaption} 24 | \usepackage[colorlinks, linkcolor=blue]{hyperref} 25 | \usepackage{cleveref} 26 | \usepackage[a4paper]{geometry} 27 | \usepackage{titlesec} 28 | 29 | \theoremstyle{definition} 30 | \newtheorem{definition}{Definition}[section] 31 | \newtheorem{theorem}{Theorem}[section] 32 | \newtheorem*{optTheorem}{Theorem} 33 | \newtheorem{proposition}{Proposition}[section] 34 | \newtheorem{lemma}{Lemma}[section] 35 | \newtheorem{corollary}{Corollary}[section] 36 | \theoremstyle{remark} 37 | \newtheorem*{remark}{Remark} 38 | \newtheorem*{sketchproof}{Sketch of Proof} 39 | 40 | \newcommand\sgn{\mathrm{sgn}} 41 | 42 | % \titleformat{\part}{\centering\Huge}{Part \thepart}{1em}{} 43 | % \titleformat{\chapter}{\centering\Huge}{Chapter \thechapter}{1em}{} 44 | 45 | \title{\textsc{Machine Learning}\\T.H.E.
Note} 46 | \author{\textsc{YBiuR}} 47 | \date{Super Vegetable Me} 48 | 49 | 50 | \begin{document} 51 | \begin{spacing}{1.2} 52 | \setlength{\parskip}{1em} 53 | \setlength{\parindent}{0em} 54 | 55 | \frontmatter 56 | \maketitle 57 | \chapter*{Preface} 58 | \paragraph{}\verb|Loss: NaN| 59 | \paragraph{}\verb|Acc: 0.00| 60 | \mainmatter 61 | \include{Introduction} 62 | \include{Regression} 63 | \include{Classification} 64 | \include{SupportVectorMachine} 65 | \include{LinearDiscriminantAnalysis} 66 | \include{Clustering} 67 | \include{DimensionReduction} 68 | 69 | \end{spacing} 70 | \end{document} -------------------------------------------------------------------------------- /Mathematical Logic/MathematicalLogic.pdf: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:7992aebfbc3a759373c4a8d030db34818e18172b542ddf7a0d130c2c7b3e9335 3 | size 295403 4 | -------------------------------------------------------------------------------- /Mathematical Logic/MathematicalLogic.tex: -------------------------------------------------------------------------------- 1 | \documentclass[oneside]{book} 2 | \usepackage[UTF8]{ctex} 3 | \usepackage{amsmath} 4 | \usepackage{mathtools} 5 | \usepackage{listings} % code listings via lstlisting 6 | \usepackage{booktabs} 7 | \usepackage{ulem} 8 | \usepackage{enumerate} 9 | \usepackage{amsfonts} 10 | \usepackage{amssymb} 11 | \usepackage{amsthm} 12 | \usepackage{proof} 13 | \usepackage{setspace} % spacing environment for line spacing 14 | \usepackage[ruled, vlined]{algorithm2e} % algorithms and pseudocode 15 | \usepackage{bm} % bold symbols in math 16 | \usepackage{pifont} % circled numbers, 172-211, via \ding 17 | \usepackage{graphicx} 18 | \usepackage{float} 19 | \usepackage[dvipsnames]{xcolor} 20 | %\usepackage{indentfirst} 21 | \usepackage{ulem} % \sout{} for strikethrough 22 | \normalem % use the default normalem 23 | \usepackage{lmodern} 24 | \usepackage{subcaption} 25 | \usepackage[colorlinks, linkcolor=blue]{hyperref} 26 | \usepackage{cleveref} 27 | \usepackage[a4paper]{geometry} 28 | \usepackage{titlesec} 29 | \usepackage{graphicx} 30 | \usepackage{stmaryrd} 31 | 32 | \theoremstyle{definition} 33 | \newtheorem{definition}{Definition}[section] 34 | \newtheorem{theorem}{Theorem}[section] 35 | \newtheorem*{optTheorem}{Theorem} 36 | \newtheorem{proposition}{Proposition}[section] 37 | \newtheorem{lemma}{Lemma}[section] 38 | \newtheorem{corollary}{Corollary}[section] 39 | \newtheorem{axiom}{Axiom}[section] 40 | \theoremstyle{remark} 41 | \newtheorem*{remark}{Remark} 42 | \newtheorem*{sketchproof}{Sketch of Proof} 43 | \renewcommand{\proofname}{Proof} 44 | 45 | \newcommand{\range}{\textrm{rng}} 46 | \newcommand{\domain}{\textrm{dom}} 47 | 48 | \newcommand{\Dashv}{\rotatebox[origin=c]{180}{\ensuremath\vDash}} 49 | 50 | \newcommand{\questeq}{\stackrel{?}{=}} 51 | 52 | \newcommand{\semanticalImply}[2]{#1\vDash{}#2} 53 | \newcommand{\tautology}[1]{\vDash{}#1} 54 | 55 | \newcommand{\sat}[3]{\vDash_{\mathfrak{#1}}#2[#3]} 56 | \newcommand{\sentSat}[2]{\vDash_{\mathfrak{#1}}#2} 57 | \newcommand{\unsat}[3]{\nvDash_{\mathfrak{#1}}#2[#3]} 58 | \newcommand{\sentunsat}[2]{\nvDash_{\mathfrak{#1}}#2} 59 | \newcommand{\assignSat}[3]{\vDash_{\mathfrak{#1}}#2\llbracket #3 \rrbracket} 60 | 61 | \newcommand{\naturalSet}{\mathbb{N}} 62 | \newcommand{\realSet}{\mathbb{R}} 63 | \newcommand{\naturalStruct}{\mathfrak{N}} 64 | \newcommand{\realStruct}{\mathfrak{R}} 65 | 66 | % \newcommand{\iff}{\Leftrightarrow} 67 | 68 | \newcommand{\frakA}{\mathfrak{A}} 69 | \newcommand{\frakB}{\mathfrak{B}} 70 | 71 | 72 |
\title{Mathematical Logic} 73 | \author{\textsc{YBiuR}} 74 | \date{A long long time ago in a far far away SJTU} 75 | 76 | 77 | \begin{document} 78 | \setlength{\parskip}{1em} 79 | \setlength{\parindent}{0em} 80 | 81 | \frontmatter 82 | \maketitle 83 | \chapter*{Preface} 84 | \emph{“道可道,非常道。”} 85 | 86 | \mainmatter 87 | \tableofcontents 88 | 89 | \include{SetTheory.tex} 90 | \include{InformalNotionsOfAlgorithms.tex} 91 | \include{SententialLogic.tex} 92 | \include{FirstOrderLogic.tex} 93 | \include{DeductiveCalculus.tex} 94 | \include{wrapup.tex} 95 | 96 | \end{document} -------------------------------------------------------------------------------- /Mathematical Logic/wrapup.tex: -------------------------------------------------------------------------------- 1 | \chapter{Wrapping Up} 2 | 3 | \section{Programs and Proofs} 4 | 5 | \begin{itemize} 6 | \item Law of Excluded Middle $\forall Q\,(Q\vee\neg Q)$ 7 | \item $Programs \equiv Proofs$ 8 | \end{itemize} 9 | 10 | Not all proofs have a corresponding program. 11 | 12 | Let $isProgram(P)$ and $halts(P)$ be two predicates. 13 | \[ \varphi \triangleq \forall P\, \big(isProgram(P) \to halts(P) \vee \neg halts(P)\big) \] 14 | $\varphi$ is valid, i.e., $\vDash\varphi$. By the completeness of first-order logic, $\vdash\varphi$. So there must exist a proof (deduction) $\Phi=\{\varphi_1,\dots,\varphi_n\}$ s.t. $\varphi_n = \varphi$. If every proof had a corresponding program, then this program would be able to solve the Halting problem. 15 | 16 | Not all programs have a corresponding proof. 17 | 18 | \begin{lstlisting}[language=c] 19 | while(1) {} 20 | \end{lstlisting} 21 | 22 | This program has a dead loop, which implies ``false''. If this program had a proof, then we would be able to prove ``false''. 23 | -------------------------------------------------------------------------------- /Operating System/03.操作系统概述.md: -------------------------------------------------------------------------------- 1 | # 操作系统 2 | 3 | > “程序是写死在那边的,也不能和他交流。” 4 | > “操作系统劝你别访问这个。” 5 | 6 | 操作系统的主要功能 7 | 8 | - 保护硬件不被应用程序滥用 9 | - 为应用提供简单、统一的接口来操作硬件 10 | 11 | ## 三个基本抽象 12 | 13 | > “抽象这个东西本身非常抽象” 14 | 15 | - 进程 process 16 | - 虚拟内存 virtual memory 17 | - 文件 file 18 | 19 | ```text 20 | | <-------------Process--------------> | 21 | | <----Virtual Memory----> | 22 | | <--File--> | 23 | | Processor | Main Memory | IO Devices | 24 | ``` 25 | 26 | ### 进程 27 | 28 | 程序执行需要依赖处理器,但是处理器资源有限,而且每个程序并不是时刻都需要使用处理器。大部分操作系统让各个程序“轮流”使用处理器。 29 | 30 | 程序运行时,操作系统会提供一种假象 31 | 32 | - 操作系统中**只有**这个程序在运行 33 | - 一个程序看起来独占了所有处理器、主存、IO设备 34 | - 无间断地依次执行指令 35 | - 独占了所有内存 36 | - 独占了所有IO 37 | - 当一个程序在操作系统上运行了一段时间后,会被系统打断执行,把处理器让给其他程序 38 | - 当一个程序需要等待外部资源(IO、磁盘等),操作系统也会暂时把该程序调走,等到该程序等待的资源准备完毕后再切换回来 39 | 40 | #### 进程概述 41 | 42 | 进程 43 | : 一个正在运行的程序。操作系统提供的抽象之一。 44 | 45 | - 一个程序可以对应多个进程 46 | - 一个程序开好几个 47 | - 一个程序使用多线程 48 | - 多个程序也可以对应一个进程 49 | - 比如GPU Code fusion 50 | 51 | 进程由两个部分组成 52 | 53 | - 用户部分 54 | - 操作系统部分 55 | - (狭义上)又称为内核 kernel 56 | - 由所有进程共享 57 | 58 | #### 进程切换 59 | 60 | 进程之间进行切换的步骤称为**上下文切换**。 61 | 62 | - 操作系统为每个进程维护了一个称为 **进程控制块(Process Control Block, PCB)** 的区域,用于记录每个进程的基本信息 63 | - 上下文切换时,操作系统将当前进程的各种状态保存到PCB,并读取下一个进程的PCB,将上下文恢复到处理器中 64 | 65 | #### 进程树 66 | 67 | 在 Linux 系统中,新的进程都是由现有的进程创建出来的,从而构成了进程之间的父子关系,所有进程最终都来自一个父进程 68 | 69 | ### 虚拟内存 70 | 71 | #### 虚拟内存的动机 72 | 73 | - 现代操作系统中,常常有数十上百个应用程序想要同时运行 74 | - 如果允许每个应用程序访问所有内存资源,则需要在上下文切换时将整个内存保存到磁盘,时间开销巨大 75 | - 如果每个应用程序独立使用物理内存的一部分 76 | - 无法保证不同应用程序使用物理内存之间的隔离性 77 | - 无法保证应用程序运行时使用的内存地址是连续且统一 78 | 79 | #### 虚拟内存机制概述 80 | 81 | - 每个进程都独占了虚拟内存 82 | - 
应用程序通过虚拟内存来间接访问物理内存 83 | - 虚拟地址空间 84 | - 程序的代码与数据、堆、共享库、运行时栈 85 | - 内核 86 | - 通过处理器中专门的硬件和操作系统协同完成地址翻译 87 | - 现代处理器通常使用专门的内存管理单元 MMU 来进行地址翻译 88 | - MMU 的地址翻译规则和策略由操作系统决定 89 | 90 | #### 进程的虚拟地址空间 91 | 92 | 每个进程都拥有各自的私有地址空间,这是一段从0开始的连续的地址空间。除去内核代码、内核数据、内核栈外,该地址空间的布局从上到下依次为 93 | 94 | - 内核代码及数据、内核栈 95 | - 用户态应用程序通常无法访问这部分内存 96 | - 只有通过中断或者系统调用机制进入到内核的代码后,才能由操作系统来操作这部分内存 97 | - 用户栈 98 | - 保存临时数据(局部变量等) 99 | - 可伸缩,扩展方向**自顶向下**(从高地址向低地址扩展) 100 | - 代码库 101 | - 共享的代码库 102 | - 只读 103 | - 用户堆 104 | - 运行时动态分配的地址 105 | - 扩展方向**自底向上**,与栈相反 106 | - 数据与代码段 107 | - 原本保存在需要执行的二进制文件中 108 | - 在进程执行之前,操作系统会将它们载入到虚拟地址空间 109 | - 数据段主要保存全局变量的值 110 | - 代码段保存进程执行的指令 111 | 112 | ### 文件 113 | 114 | 文件是一个存储设备中的、有名字的字节序列,用于给不同的存储设备提供统一的操作接口 115 | 116 | - 每个打开的文件都有一个独立的编号 File Descriptor, FD 117 | - 字节序列的内容称为**文件数据** 118 | - 操作系统同时保存长度、修改时间等其他信息,称为**文件元数据** 119 | - 每个文件拥有自己的**文件名**,被组织在**目录**中 120 | - 由**文件系统**管理 121 | 122 | #### IO的文件抽象 123 | 124 | > Linux “一切皆是文件” 的设计哲学 125 | 126 | 除了存储设备中的字节序列之外,其他设备(IO等)也可以被抽象为文件,从而可以使用操作系统提供的文件操作接口统一操作 127 | 128 | - 每个IO设备都被抽象为一个文件 129 | - Unix IO 130 | - 提供有限的系统调用来读写“文件” 131 | - 所有输入输出都通过这些系统调用完成 132 | 133 | ## 用户态与内核态 134 | 135 | ### 处理器特权级 136 | 137 | 现代处理器提供**特权级**机制,限制处理器可以执行哪些指令,通常由处理器中特殊的状态位表示 138 | 139 | AArch64提供4个特权级 140 | 141 | - `EL0` 142 | - aka 用户态。最低的特权级。一般的应用程序通常运行在这一级别 143 | - 不能执行特权指令 144 | - 不能直接访问地址空间中的内核部分 145 | - 只能通过**系统调用**来间接(且可控)地达成上述目的 146 | - 系统调用是应用程序实现这些目的的唯一合法方式 147 | - “你除个零能进kernel,触发Segmentation Fault也能进kernel” 148 | - `EL1` 149 | - aka 内核态。操作系统通常运行在这一级别 150 | - 可以执行任何指令 151 | - 可以访问任何内存地址 152 | - `EL2` 153 | - 在虚拟化场景下,虚拟机监控器(Virtual Machine Monitor, aka. Hypervisor)通常运行在这一级别 154 | - `EL3` 155 | - 和体系结构的TrustZone特性相关。用于普通世界和安全世界的切换,安全世界可以不受限地访问所有计算资源,而普通世界不能访问安全世界的资源。 156 | 157 | ## 异常 Exception 158 | 159 | ### 用户态与内核态的控制流转换 160 | 161 | - 跳转指令 `b` 和过程调用/返回指令 `bl` `ret` 都只能在同一种模式内跳转 162 | - 需要新的指令 `svc` `eret` 163 | 164 | ### 系统调用 165 | 166 | - 系统调用参数不超过8个,放置于寄存器 `x0` 至 `x7` 167 | - 寄存器 `x8` 用于存放系统调用编号 168 | - `svc` 指令将直接根据 `x8` 跳转至系统调用 169 | - 返回值存放于 `x0` 170 | 171 | ### 事件 Event 与异常 Exception 172 | 173 | ```mermaid 174 | sequenceDiagram 175 | 用户态 ->> 内核态: 异常 176 | activate 内核态 177 | Note right of 内核态: 异常处理函数 178 | 内核态 -->> 用户态: 返回指令(可选) 179 | deactivate 内核态 180 | ``` 181 | 182 | - 事件与当前指令可能有关,也可能无关 183 | - 事件可以理解为触发异常的行为 184 | - `svc` `eret`(有关) 185 | - 缺页、溢出(有关) 186 | - 定时器结束、IO请求完成(无关) 187 | - 异常处理结束后,将通过以下某种方式之一返回控制权 188 | - 回到异常发生时正在执行的指令 189 | - 回到异常发生时的下一条指令 190 | - 结束当前进程 191 | 192 | ### 异常向量表 193 | 194 | - 操作系统内核预先在一张表中准备好不同类型异常的处理函数 195 | - 基地址存储在 `VBAR_EL1` 寄存器中 196 | - 发生异常时自动跳转到对应位置 197 | - 一共16项,其中4条最为常用 198 | - 各个处理函数根据各自处理的异常类型进一步调用其他函数 199 | - 处理器将异常类型存储在指定寄存器 `ESR_EL1` 中 200 | 201 | ### 异常类型 202 | 203 | #### 同步异常 204 | 205 | - 由于执行指令而发生的事件引起的异常 206 | - 同步:与当前进程有关 207 | - 陷阱 Trap 208 | - 有意触发的异常 209 | - 例如断点、系统调用 210 | - 异常处理函数返回到下一条指令 211 | - 故障 Fault 212 | - 非有意触发,但是可能可以恢复(但是不一定能成功恢复) 213 | - 例如缺页异常、保护错误 214 | - 控制权将返回到当前指令,或终止当前进程 215 | - 终止 Abort 216 | - 非有意触发、不可恢复 217 | - 当前进程将被终止 218 | 219 | #### 异步异常:中断 Interrupt 220 | 221 | - 由处理器外部的事件导致 222 | - 外部设备设置处理器的中断引脚 223 | - 处理函数返回到被中断的下一条指令 224 | - 例子 225 | - IO中断 226 | - `CTRL + C` Keyboard interrupt 227 | - 网络收到数据包 228 | - 磁盘从某个扇区读取完成 229 | - 硬复位 230 | - 重启按钮 231 | - 软复位 232 | - `CTRL + ALT + DEL` 233 | - 流程 234 | - IO设备通过设置处理器引脚来发起中断 235 | - IO设备向总线发送一个信号,表示发起中断的是哪个设备 236 | - 处理器将打断当前执行流,跳转到中断处理函数 237 | 238 | ### 信号 239 | 240 | - 操作系统向应用程序发送的信息 241 | - 例如 `Segmentation Fault` 发生时,向应用程序发送 `SIGSEGV` 信号 242 | 243 | ### 内存映射IO 244 | 245 | - IO设备抽象为内存地址,对IO设备的操作抽象为读写内存的操作 246 | - 
每个设备可以被映射到多个IO端口 247 | -------------------------------------------------------------------------------- /Operating System/04.进程.md: -------------------------------------------------------------------------------- 1 | # 进程 2 | 3 | ## 概述 4 | 5 | ### 进程如何产生 6 | 7 | - 在 `shell` 中输入可执行文件的名称,或在图形化界面双击应用程序 8 | - 应用程序也可以自行创建新进程 9 | - 在新进程中可以运行其他应用程序或者运行和自己一样的程序 10 | 11 | ### 控制流 12 | 13 | - 物理控制流 14 | - 逐条读取并执行程序指令 15 | - 并发 Concurrency:多个进程同时运行 16 | - 如何支持并发 17 | - 多任务交错 Interleaving 18 | - 虚拟地址 19 | - 如何实现交错执行 20 | - 上下文切换 Context Switch 21 | 22 | ## 上下文 23 | 24 | - 内核为每个进程维护一个**上下文** 25 | - 包含了被中断进程恢复执行所需要的所有状态 26 | - 程序代码和数据 27 | - 存储在内存 28 | - `PC`、寄存器、状态寄存器 29 | - 用户栈、内核栈 30 | - 属于上下文范畴 31 | - 不需要额外保存操作(虚拟内存抽象保证了栈不需要额外保存和恢复) 32 | - 环境变量 33 | - 内核数据结构 34 | - 进程表、页表、文件表 35 | - 只需要记住在哪能找到,不需要整个保存 36 | - 看着很多,但是实际要保存的主要是一些寄存器,因此不会花很久 37 | 38 | ### 上下文切换 Context Switch 39 | 40 | #### 时间片调度 Time Slicing 41 | 42 | - 将时间段划分为多个时间片 43 | - 每个进程只能在分配给自己的时间片中运行 44 | - 时间到了会被操作系统赶走 45 | - 异常控制流的高层表现形式 46 | 47 | #### 上下文切换的时机 48 | 49 | 通常会在两种情况下执行上下文切换 50 | 51 | - 用户执行系统调用并进入内核 52 | - 会导致进程阻塞的系统调用,例如 `read` / `sleep` 53 | - 即使系统调用没有执行,内核也可以决定进行上下文切换 54 | - 中断导致的上下文切换 55 | - 时钟中断:时间片到了 56 | 57 | #### 时间片切换频率 58 | 59 | - 频繁地时间片切换可以提高用户体验,但是会导致切换带来的总时间增加 60 | - 因为看上去每个进程都可以立刻得到响应 61 | - 例如 62 | - 一台服务器可能需要较低频率地切换 63 | - 而一台PC可能需要较快频率地切换 64 | 65 | ### 调度器 Scheduler 66 | 67 | - Policy 68 | - 决定是否要在进程执行过程中,抢占当前的进程 69 | - 例如是否有系统调用、时间片是否到了 70 | - 选择恢复一个先前被抢占地进程 71 | - Mechanism 72 | - 抢占 Preempt 当前执行的进程 73 | - 需要保存当前进程的上下文 74 | - 恢复 Restart 被调度到的进程 75 | - 恢复该进程的上下文 76 | - 把控制权交给该进程 77 | 78 | ## 并发 Concurrent 79 | 80 | ### 逻辑控制流 81 | 82 | - 每个进程都有自己的逻辑控制流 83 | - 不会影响其他进程的状态 84 | 85 | ### 并发 86 | 87 | - 两个进程同时运行,则称为并发 88 | - 否则为顺序执行(串行) 89 | - 并发进程的控制流实际上是**交错执行**的 90 | - 但是可以把并发进程视为相互间**并行执行** 91 | 92 | ### 进程的三种状态 93 | 94 | > 你不要调度到一个进程上来,它说它不想执行,然后它又走了 95 | 96 | #### 状态 97 | 98 | - 运行态 99 | - 该进程正在CPU中执行,或者正在等待被执行 100 | - 该进程最终一定会被调度 101 | - 暂停态/阻塞态 102 | - 进程执行被挂起,未来不会被调度 103 | - 比如在等IO、等网络 104 | - 终止态 105 | - 进程永远终止运行 106 | - “也许你这个应用的尸体中有一些信息,其他人需要” 107 | 108 | #### 状态切换 109 | 110 | 状态切换通过信号实现 111 | 112 | - 运行态的进程可以暂停 113 | - 收到信号,例如 `SIGSTOP` 114 | - 阻塞态的进程可以运行 115 | - 收到信号,例如 `SIGCONT` 116 | - 进程终止方式 117 | - 收到信号的默认处理方式是终止进程 118 | - 例如从 `main()` 返回,调用 `exit()` 119 | 120 | ## 创建和终止进程 121 | 122 | ### 进程ID 123 | 124 | - 进程 ID 125 | - 每个进程都有一个唯一的、正数的PID 126 | - `Getpid()` 127 | - 返回调用进程的PID 128 | - `Getppid()` 129 | - 返回调用进程的**父进程**的PID 130 | - 父进程是创建了调用进程的进程 131 | - `getpid` 和 `getppid` 返回 `pid_t` 类型的值,在 `linux` 中为 `int` 132 | - `Kill(pid)` 133 | - 经操作系统,向对应ID的进程发送 `SIGKILL` 信号 134 | 135 | ### Exit函数 136 | 137 | - `void exit(int status)` 138 | - 没有返回值 139 | - 终止进程时可以附带一个 `status` 表示被终止进程的终止状态~~遗言~~ 140 | - 父进程可以获取这个状态(如果父进程需要的话) 141 | 142 | ### Fork函数 143 | 144 | - `pid_t fork(void)` 145 | - 无参数 146 | - 返回值:子进程为 `0`,父进程为子进程 PID,出错为 `-1` 147 | - 创建的新进程和父进程**几乎相同** 148 | - 相同 149 | - 子进程获得一份**相同但独立**的父进程用户态虚拟地址空间的复制 150 | - 获得了父进程所有打开的文件标识符的复制 151 | - 子进程可以读写任何在调用 `fork` 前父进程打开的文件 152 | - 不同 153 | - pid 154 | - 返回值 155 | 156 | #### Fork的调用与返回 157 | 158 | - 调用一次 159 | - 在父进程中调用一次 160 | - 返回两次 161 | - 在父进程中,返回子进程的PID 162 | - 在子进程中,返回 `0` 163 | - 返回值提供了唯一明确区分父进程和子进程的方法 164 | 165 | #### 父进程与子进程的并发执行 166 | 167 | - 父进程和子进程同时运行 168 | - 在内核调度下,由内核以任意可能的顺序交错执行 169 | - 因此并不能确定父进程和子进程谁先结束 170 | 171 | #### 重复但独立的地址空间 172 | 173 | ```c 174 | int main() 175 | { 176 | pid_t pid; 177 | int x = 1; 178 | pid = Fork(); 179 | if (pid == 0) 180 | { 181 | // child process 182 | printf("child: x=%d\n", ++x); 183 | exit(0); 184 | } 185 
| // parent 186 | printf("parent: x=%d\n", --x); 187 | exit(0); 188 | } 189 | ``` 190 | 191 | - 上述代码会打印两次,一次是 `x=0`,一次是 `x=2` 192 | - 打印顺序不能确定,取决于内核调度器 193 | - 父进程和子进程里的 `x` 不是同一个 `x` 194 | 195 | ## 回收进程 196 | 197 | ### 僵尸进程 198 | 199 | - 进程以终止态存在,此时进程并未被从系统中移除,而是等待父进程回收 200 | - 如果父进程在自己终止前没有回收僵尸子进程,内核会安排 `init` 进程回收这些子进程 201 | - 父进程回收时,内核会将子进程 `exit` 时的状态传递给父进程 202 | - 然后内核将移除子进程,此时子进程才真正被回收 203 | - 终止状态下、尚未被内核回收的进程称为**僵尸进程** 204 | - 僵尸进程仍然会占用系统的内存资源 205 | - 对于长时间运行的程序,应该总是及时回收僵尸子进程 206 | - 防止僵尸进程占用过多系统资源 207 | 208 | #### `init` 进程 209 | 210 | - `PID` 为 `1` 211 | - 在系统初始化时由内核创建 212 | 213 | ### `waitpid` 214 | 215 | - `pid_t waitpid(pid_t pid, int* status, int options);` 216 | - 成功时返回子进程 PID 217 | - 出错时返回 `-1` 218 | - 没有子进程 `ECHILD` 219 | - 被打断 `EINTR` 220 | - 此时子进程已经被回收 221 | - `pid_t pid` 222 | - `>0`:等待某个特定的子进程 223 | - `-1`:等待所有子进程 224 | - `int options` 225 | - `options=0` 则挂起调用进程,等待集合中任意子进程终止 226 | - 如果等待集合中有子进程在函数调用前已经终止,则立刻返回 227 | - 返回值是导致函数返回的终止子进程的 `pid` 228 | - 该终止子进程将被内核回收 229 | - `options` 还支持一些参数 230 | - `WNOHANG` 如果等待集合中没有终止子进程,则不挂起而立刻返回 `0` 231 | - `WUNTRACED` 挂起调用进程,等待集合中任意子进程终止或暂停 232 | - `int status` 用于存放被回收子进程的 `exit` 状态 233 | -------------------------------------------------------------------------------- /Operating System/05.文件.md: -------------------------------------------------------------------------------- 1 | # 文件 2 | 3 | ## Unix IO 4 | 5 | ### Unix 文件 6 | 7 | - Unix 文件是一串字节序列 8 | - IO设备也被抽象为文件 9 | - `Unix` 提供基于文件的底层应用接口,即 Unix IO 10 | - 所有输入、输出都通过读写文件完成 11 | 12 | ### 文件类型 13 | 14 | - 普通文件 regular file 15 | - 文本文件 16 | - `Linux` 的文本文件中包含一系列文本行,每一行都是字符串,末尾换行符由 `\n` 表示 17 | - 二进制文件 18 | - 除文本文件以外的所有文件 19 | - 从内核角度看,文本文件和二进制文件没有区别 20 | - 目录 directory 21 | - 由一组链接组成 22 | - 每个链接将一个文件名映射到一个文件或目录 23 | - 每个目录至少有两个链接 24 | - `.` 当前目录 25 | - `..` 父目录 26 | - 套接字 Socket 27 | - 用于跨网络进程交互 28 | - 其他文件 29 | - 命名管道 named pipes 30 | - 符号链接 symbolic links 31 | - 字符/块设备 (character/block devices) 32 | 33 | #### 目录层级 34 | 35 | - `Linux` 内核采用单个层次化目录来组织文件 36 | - `/` 代表根目录 37 | - 每个文件都是根目录 `/` 的直接或间接后代 38 | 39 | ### 文件读写 40 | 41 | #### 打开文件 42 | 43 | 在应用准备访问一个 IO 设备或读取一个文件时 44 | 45 | - 内核打开相关文件,返回一个非负整数,作为**文件标识符** file descriptor `fd` 46 | - `fd` 代表该文件,用于后续对文件的操作 47 | - 打开文件时,保证获得可用标识符池中最小的文件标识符 48 | - 特殊标识符 49 | - `STDIN_FILENO: 0` 50 | - `STDOUT_FILENO: 1` 51 | - `STDERR_FILENO: 2` 52 | - 内核跟踪记录每个进程的所有打开文件的信息 53 | - 对每个打开的文件,记录文件内偏移(游标) `k` 54 | - 应用可以通过 `seek` 函数显式改变游标位置 55 | - 应用只需要记录内核返回的文件标识符 56 | - 后续应用对文件的所有操作都通过文件标识符实现 57 | 58 | #### 关闭文件 59 | 60 | - 由内核负责关闭文件 61 | - 释放在文件打开时创建的数据结构 62 | - 把文件标识符返回到可用标识符池 63 | 64 | #### 读写文件 65 | 66 | ##### 读 67 | 68 | - 从文件中复制 $m$ 字节到内存 69 | - 若文件剩余长度小于 $m$,则触发 `end-of-file` 70 | - `EOF` 可以被应用程序检测,但是文件尾实际上并不存在 `EOF` 71 | - `ssize_t read(int fd, void* buf, size_t count)` 72 | - `ssize_t write(int fd, const void* buf, size_t count)` 73 | - 成功:返回读/写的字节数 74 | - 遇到文件尾:`0` 75 | - 失败:`-1` 76 | 77 | #### IO 重定向 78 | 79 | `dup2` 复制该进程的文件描述符表中的条目 80 | 81 | ```cpp 82 | int dup2(int oldfd, int newfd); 83 | ``` 84 | 85 | - 成功时返回文件描述符,失败时返回 `-1` 86 | 87 | ## 加载和运行进程 88 | 89 | ### 加载和运行 `execve` 90 | 91 | - `int execve(const char* filename, const char* argv[], const char* envp[]);` 92 | - 只调用一次,不会返回 93 | - 仅在运行报错的时候返回调用程序 94 | 95 | ## 内存映射 96 | 97 | - 将文件映射到内存 98 | - 通过读写内存来读写文件(无需使用 `read` `write`) 99 | - `void *mmap(...)` 100 | 101 | ### 共享映射与私有映射 102 | 103 | - 共享映射 104 | - 会立刻写入磁盘 105 | - 对共享映射了该文件的其他进程可见 106 | - 私有映射 107 | - 不会立刻写入磁盘 108 | - 对其他进程不可见 109 | - 对 `Fork()` 后的子进程(私有映射) 110 | - 子进程将继承父进程的所有内存映射 111 | - 子进程可以观察到父进程在 `Fork()` 前的所有修改 112 | - `Fork()` 
后的修改只对自身进程内可见 113 | - 可以实现父子进程的交流(但是不能两个人一起写否则就无了) 114 | 115 | ## SHELL 116 | 117 | ### Unix Shell 118 | 119 | - 一个交互型应用程序,用于代表用户运行其他程序 120 | - 依次迭代执行**读取输入** `read` 和**解析命令** `eval`,直到进程终止 121 | 122 | ### 解析命令 123 | 124 | - 使用空格分割命令行参数 125 | - 建立 `argv` 数组 126 | - 要求每个元素指向一个参数 127 | - 传递给 `execve` 函数 128 | - 技巧:把命令行输入中的空格 `\s` 先替换成 `\0` 129 | - 加在命令行最后的 `&` 用于表示是否在后台执行,`&` 不计入 `argv` 130 | -------------------------------------------------------------------------------- /Operating System/07.系统初始化.md: -------------------------------------------------------------------------------- 1 | # 系统初始化 2 | 3 | ## 启动流程 4 | 5 | ```mermaid 6 | graph LR 7 | 按下开关-->内核第一行代码 8 | 内核第一行代码-->用户态应用代码 9 | 用户态应用代码-->Shell代码 10 | Shell代码-->Shell等待输入 11 | ``` 12 | 13 | ## 主要任务 14 | 15 | - 配置页表并开启虚拟内存 16 | - 配置页表前使用物理地址,配置页表后使用虚拟地址 17 | - 配置异常向量表并打开中断 18 | 19 | ## 总线 20 | 21 | ### 物理地址空间与物理内存空间 22 | 23 | - 物理地址空间通常指系统总线的地址空间 24 | - 对64位系统,这个空s间大小为 $2^{64}$ 25 | - 系统总线连接内存,也连接其他设备,每个设备占一块空间 26 | - 通常内存确实占用了绝大部分空间 27 | 28 | ### 总线事务 29 | 30 | - 总线上的设备可以向总线发起请求,称为**事务 Transaction** 31 | - 例如 `(1742, READ 102)` 该事务对总线上所有设备可见 32 | 33 | ## BootLoader 34 | 35 | > “启动计算机,你需要运行代码;运行代码,你需要启动计算机” 36 | 37 | - 以树莓派为例 38 | - 上电后固定从 `0x0` 地址运行 `firmware`,也称为 `bootloader` 39 | - `0x0` 不是物理内存的地址,而是连在总线上的某个存储介质的地址 40 | - 由这段代码处理 CPU、SDRAM 41 | - 然后加载内核、文件系统到内存 42 | - 不同厂商的启动实现可能不同 43 | 44 | ### 入口函数 45 | 46 | - CPU从预定义的物理内存地址读取第一行代码,由硬件厂商决定 47 | - 树莓派 48 | - 32位为 `0x8000` 49 | - 64位为 `0x80000` 50 | - x86 51 | - `0x7C00` 52 | 53 | ## BIOS 54 | 55 | ### BIOS 概述 56 | 57 | > `Basic Input Output System` 58 | 59 | 1. 上电后,开始执行BIOS ROM中的代码 60 | - 自检测 61 | - 找到第一个可以启动的设备 62 | - 将可启动设备的第一个块加载到内存 63 | - 跳转到 `bootloader` 执行 64 | 2. 开始执行 `bootloader` 65 | 3. 内核代码开始执行 66 | 67 | #### 什么是 BIOS 68 | 69 | - 通常保存在主板的只读内存(ROM)中 70 | - 仅仅是存储,没有执行能力 71 | - 主要是为了PC的兼容性和扩展性 72 | - PC必须兼容不同的配件 73 | - 在没有插不同CPU或不同内存的需求的设备上,通常不需要BIOS 74 | - 在许多嵌入式设备中没有BIOS 75 | 76 | #### 执行 BIOS 77 | 78 | - 由 CPU 负责执行 BIOS 79 | - `x86` CPU在reset后,`PC` 固定指向 `0xFFFF0` 80 | - 该地址即为 BIOS 的物理地址 81 | 82 | ### 上电自检 Power-On Self-Test POST 83 | 84 | - 检查计算机的硬件设备能否满足运行的基本条件 85 | - 如果硬件出现问题,则BIOS会发出不同含义的蜂鸣声 86 | 87 | ### 主引导记录 Master Boot Record MBR 88 | 89 | - 在硬盘的前512字节 90 | - 由三部分组成 91 | - 主引导程序 `bootloader` 92 | - 硬盘分区表 DPT 93 | - 硬盘有效标志 `0x55AA` 94 | 95 | ### 在一台电脑上安装多个操作系统 96 | 97 | - Windows 会在 MBR 安装 NTLDR 98 | - Linux 会在 MBR 安装 GRUB,可以选择 Windows 或 Linux 99 | - 所以要先装 Windows 再装 Linux 100 | - MacOS:世界上为什么还有别的操作系统呢 101 | -------------------------------------------------------------------------------- /Operating System/08.虚拟内存管理.md: -------------------------------------------------------------------------------- 1 | # 虚拟内存管理 2 | 3 | ## 虚拟内存段和 VMA 4 | 5 | ### 段 6 | 7 | - OS采用**段**管理虚拟地址,每个段包含了起始结束地址范围、读写权限等信息 8 | - 段内连续,段间非连续 9 | - 合法虚拟地址段:代码、数据、堆、栈 10 | - 非法虚拟地址段:未映射的代码段 11 | - 一旦访问则触发 `Segmentation Fault` 12 | - 记录应用程序中已分配的虚拟地址区域 13 | - 代码段:读、执行 14 | - 数据段:读 15 | - 堆:读、写 16 | - 栈:读、写 17 | 18 | ### VMA 19 | 20 | Linux 中采用 VMA 结构体记录应用程序已经分配的虚拟内存区域 21 | 22 | #### 添加 VMA 23 | 24 | - 途径1:OS在创建应用程序时分配 25 | - 数据 26 | - 代码 27 | - 栈(初始为空) 28 | - 途径2:应用程序主动向OS发出请求 29 | - `brk()`:扩大、缩小堆区域 30 | - OS也可以在应用程序创建时分配初始的堆的VMA 31 | - `mmap()`:申请空的虚拟内存区域,或申请文件内存映射的虚拟内存区域 32 | - 用户态的 `malloc()` 也会改变VMA 33 | - 通常是调用 `brk` 34 | - 但是部分实现也会调用 `mmap` 35 | 36 | #### `mmap` 37 | 38 | - 通常用于把一个文件(或文件的一部分)映射到内存 39 | 40 | ```c 41 | void *mmap( 42 | void *addr, //手动指定虚拟内存起始地址 43 | size_t length, // 手动指定虚拟内存区域长度 44 | int prot, 45 | int flags, 46 | int fd, 47 | off_t offset) 48 | ``` 49 | 50 | - 
也可以不映射任何文件,仅仅新建虚拟内存区域 51 | - `fd=-1` 52 | - 并非POSIX标准,但是主流OS通常都会支持这一功能 53 | - OS 不允许两块 `mmap` 的内存区域重叠 54 | - 执行 `mmap` 后,VMA 中新增一个 `mmap` 开出来的段 55 | 56 | ```text 57 | 58 | 59 | ┌─────────────┐ ┌─────────────┐ 60 | │ │ │ │ 61 | ├─────────────┤ ├─────────────┤ 62 | │ Stack │ │ Stack │ 63 | ├─────────────┤ ├─────────────┤ 64 | │ │ │ │ 65 | │ │ ├─────────────┤ 66 | │ ├──────►│ mmapped │ 67 | │ │ ├─────────────┤ 68 | │ │ │ │ 69 | ├─────────────┤ ├─────────────┤ 70 | │ Heap │ │ Heap │ 71 | ├─────────────┤ ├─────────────┤ 72 | └─────────────┘ └─────────────┘ 73 | ``` 74 | 75 | #### VMA 与页表 76 | 77 | - OS通过VMA记录应用程序能够访问的虚拟地址 78 | - 未映射的区域没有对应的VMA结构 79 | - OS通过页表控制应用程序能访问的虚拟地址 80 | - 未分配的虚拟地址没有页表 81 | 82 | #### 操作系统填写页表 83 | 84 | - 立即映射 85 | - 每个虚拟页都对应一个物理页 86 | - 延迟映射 87 | - 有些虚拟页不对应任何物理页 88 | - 对应的数据在磁盘上 89 | - 没有对应的数据(初始化为0) 90 | 91 | !!!question VMA是否冗余 92 | 如果只有立即映射一种模式,那么页表和VMA的功能有重叠,VMA会显得多余,但是仍然可以帮助OS快速判断一个虚拟地址是否合法。 93 | 在增加了延迟映射的情况下,有些已分配的虚拟地址也没有页表项,因此需要VMA。 94 | 95 | ## 延迟映射-按需调页 Demand Paging 96 | 97 | ### 物理内存作为虚拟地址空间的Cache 98 | 99 | #### 为什么 100 | 101 | - 有时应用程序所需的物理内存总和超出了机器上实际搭载的物理内存总量 102 | - 对应[换页机制](#换页机制) 103 | - 有时应用程序预先申请了很大的内存,但是很多虚拟页最终都没有用到 104 | - 对应[按需调页](#延迟映射-按需调页-demand-paging) 105 | 106 | ### 流程 107 | 108 | 当应用程序申请分配内存时,操作系统可以将新分配的虚拟页标记成 *已分配,但未映射在物理内存中* 的状态,而不必立刻为这个虚拟页分配对应的物理页 109 | 110 | 1. 操作系统为应用程序创建VMA,但是并不把整个应用程序加载到内存 111 | - “先画个饼” 112 | 2. CPU访存时发现页表项没有对应的物理页映射 113 | 3. 触发 `Page Fault` 114 | 4. OS处理 `Page Fault` 115 | 1. 检查VMA 116 | 2. 根据VMA分配并初始化内存,此时才真正分配物理页 117 | 3. 更新页表 118 | 5. 回到用户态,重新运行访存指令 119 | 120 | !!!note 121 | 只在触发了 `Page Fault` 时(即应用程序真的需要访问对应的物理页时)才分配物理内存,节省了物理内存使用。 122 | 123 | 这会导致初次访存时总会触发 `Page Fault`。操作系统利用程序访存的**空间局部性**特点,可以进行预先映射(prefetch),即把缺页异常的虚拟页附近的虚拟页也一并进行映射,从而减少未来可能发生缺页异常的次数。 124 | 125 | ## 换页机制 126 | 127 | ### 基本思想 128 | 129 | - 用磁盘作为物理内存的补充,且对上层应用透明 130 | - 应用对虚拟内存的使用不受物理内存大小限制 131 | - “骗应用程序说我有很大很大很大的物理内存” 132 | 133 | ### 基本实现 134 | 135 | - 磁盘上划分专门的Swap分区或Swap文件 136 | - 在处理缺页异常时,触发物理内存页的换入换出 137 | - Swap-out 138 | - 将数据写入磁盘Swap分区(或写回backup file),在该进程的页表中抹去对应虚拟地址到该物理页的映射,并记录虚拟地址和磁盘块的对应关系 139 | - 此后该物理页可以被操作系统回收并提供给其他应用程序使用 140 | - 原进程的虚拟页此时将处在已经分配、但是未映射在物理内存中的状态 141 | - Swap-in 142 | - 将数据从磁盘写回内存,重新建立映射 143 | 144 | ### 页命中与缺页异常 145 | 146 | #### 页命中 147 | 148 | - CPU将虚拟地址发送给MMU 149 | - MMU查TLB,如果未命中则从页表中取出页表项 150 | - MMU将物理地址发送给Cache或内存 151 | - Cache或内存将内容返回给CPU 152 | 153 | #### Page Fault 154 | 155 | - CPU将虚拟地址发送给MMU 156 | - MMU查TLB,如果未命中则从页表中取出页表项 157 | - 页表项有效位为`0`,触发`Page Fault` 158 | - Handler找到一个victim page 159 | - 如果该页被修改过,则需要写回磁盘 160 | - Handler将新的页载入内存,并修改页表 161 | - Handler返回触发`Page Fault`的指令,重新运行 162 | 163 | !!!note SWAP 164 | 如果物理内存耗尽,那么操作系统会疯狂读写SWAP进行换页,在使用机械硬盘时会显著拖慢系统性能,在使用固态硬盘时虽然系统性能不会下降得太严重,但是频繁读写也会损耗固态硬盘寿命 165 | 166 | !!!info 内存管理API 167 | `madvise` 函数将用户态的一些信息发送给内核以便于优化,**建议**操作系统提前分配物理页,减少缺页异常造成的性能开销 168 | `mprotect` 函数改变内存权限 169 | 170 | ## 其他内存管理机制 171 | 172 | ### 共享内存 173 | 174 | - 将两个进程的虚拟内存映射到同一段物理地址 175 | - 节约内存,例如使用共享库时 176 | - 可以实现进程间通信 177 | - 基于共享内存,又衍生出写时拷贝、内存去重等功能 178 | 179 | ### 写时拷贝 Copy-on-Write 180 | 181 | - 常见于 fork 182 | - 父子进程共享一段只读物理内存 183 | - 如果某个进程要写物理内存,则在写之前OS会复制一页并修改权限为可写 184 | - 在尝试写共享物理内存页时,同样会触发缺页异常,但是处理函数知道这是被标记为写时拷贝的页,因此会做出对应处理 185 | - 常用于性能优化 186 | 187 | ### 内存去重 188 | 189 | - 操作系统中有专门扫描内存的进程 190 | - 用于合并具有相同内容的物理页 191 | - 由操作系统发起,对用户态透明 192 | - 但是可能会对程序访存时延造成影响 193 | - 应用程序尝试写一个被去重的内存页时,又会触发缺页异常,并触发写时拷贝 194 | 195 | ### 内存压缩 196 | 197 | - 将多个内存页的数据压缩成单个内存页,存放在专门的内存区域 198 | - 称为 ZSWAP 199 | - 例如全零的页或稀疏的页 200 | 201 | ### 大页 202 | 203 | - 将二级页表中一个条目指向的三级页表(512页)全部合并为一个页 204 | - 共 
$512\times 4\mathsf{K}=2\mathsf{M}$ 205 | - 降低对页表的操作,同时降低TLB占用(一个大页在TLB中只占一项) 206 | - 代价是可能会造成内部碎片和浪费,而且增加了内存管理的复杂度 207 | - 类似地也可以把L1页表项之后的页表全部合并,指向一个1G的内存页 208 | - 可能用于云端服务器开虚拟机时 209 | 210 | ## 虚拟地址的优势 211 | 212 | - 把目前不太充足的物理内存“变大” 213 | - 简化应用程序开发和编译器设计时的地址管理 214 | - 对应用程序而言,它看到的永远是完整的、连续的虚拟地址空间 215 | - 较为便捷地实现权限管理、内存共享 216 | - 结合VMA让程序启动更加快速 217 | -------------------------------------------------------------------------------- /Operating System/12.多线程.md: -------------------------------------------------------------------------------- 1 | # 多线程 2 | 3 | ## 线程概念的引入 4 | 5 | - 早期进程数量通常超过CPU核数 6 | - 通过简单的分配和进程调度,即可让每个核得到一个进程 7 | - 一个进程同时只能被调度到其中一个核心上运行 8 | - 虽然可以用 `fork` 创建多个进程来实现多核运行,但是 `fork` 产生的两个进程隔离过强,共享数据协调困难 9 | 10 | ### 轻量级进程 11 | 12 | > 能否让单一进程跨核执行? 13 | 14 | - 共享页表的进程 15 | - 两个共享页表的线程的 TCB 中,页表地址相同 16 | 17 | ### 线程 18 | 19 | **线程**:进程内部可以有多个独立的执行流,它们共享一份进程的地址空间,但是各自拥有独立的运行时状态(上下文和栈)。 20 | 21 | #### 包含的内容 22 | 23 | - 独立的上下文 24 | - 实现跨核支持 25 | - 独立的用户栈 26 | - 常规栈操作不互相影响 27 | - 函数调用返回等 28 | - 共享代码和数据部分 29 | - 共享的堆 30 | - 可以共享动态分配的内存 31 | - 堆的管理比较复杂,且共享堆的影响不大;而入栈出栈是底层汇编触发的,不好同步,独立的栈可以简化设计 32 | 33 | ```text 34 | ┌────────────────────────────┐ 35 | │ Kernel Code and Data │ 36 | ├────────────────────────────┤ 37 | ├────────────────────────────┤ 38 | │ Kernel Stack │ 39 | ├────────────────────────────┤ 40 | │ │ 41 | ├────────────────────────────┤ 42 | │ Thread Stack 1 │ 43 | ├────────────────────────────┤ 44 | ├────────────────────────────┤ 45 | │ Thread Stack 2 │ 46 | ├────────────────────────────┤ 47 | ├────────────────────────────┤ 48 | │ Thread Stack 3 │ 49 | ├────────────────────────────┤ 50 | │ │ 51 | ├────────────────────────────┤ 52 | │ Shared Lib │ 53 | ├────────────────────────────┤ 54 | │ │ 55 | │ │ 56 | ├────────────────────────────┤ 57 | │ Heap │ 58 | ├────────────────────────────┤ 59 | │ Code │ 60 | ├────────────────────────────┤ 61 | │ Data │ 62 | └────────────────────────────┘ 63 | ``` 64 | 65 | #### 进程与线程 66 | 67 | - 相似 68 | - 都可以与其他进程/线程并发执行(可能在不同核心上) 69 | - 都可以进行上下文切换 70 | - 在引入线程之后,调度单位由进程变为线程 71 | - 线程拥有独立的上下文 72 | - 不同 73 | - 线程比进程开销更低 74 | - 同一进程的不同线程共享代码和数据 75 | 76 | ## 操作系统提供的线程相关接口 77 | 78 | - 常用库 POSIX Threads `pthreads` 79 | - 包含创建、回收、退出线程的各种接口 80 | - 一个线程执行系统调用可能会影响该进程的所有线程 81 | - 一个线程直接调用 `exit` 会导致所有线程都退出 82 | - 可以使用 `pthread_join` 等待特定线程退出并回收 83 | - 可以使用 `pthread_detach` 让线程退出时自动回收相关资源 84 | - 可以使用 `pthread_exit` 只退出当前线程,防止主线程结束后调用 `exit` 造成其他线程被一并终止 85 | 86 | ## 线程管理与实现 87 | 88 | ### 多线程进程的 `fork` 89 | 90 | #### 会发生什么 91 | 92 | - 假设 `thread_1` 调用 `fork` 93 | - 会创建一个新的进程 94 | - 新进程的内存空间里会有原进程的所有线程的栈 95 | - 但是只有 `thread_1` 的 `SP` 指向了自己的栈 96 | - 其他线程的栈没有人用 97 | - `fork` 之后的进程只有1个线程在工作 98 | - 这种实现相对比较简单,但是显然会造成内存资源的浪费 99 | - 然而如果 `fork` 时拷贝了进程的所有线程,则可能出现迷惑行为 100 | - 比如一个线程 `fork` 时,另一个线程恰好在写文件 101 | - 那么 `fork` 后如果新产生一个正在写文件的线程,该文件就会被写两次 102 | - 因此事实上建议程序员不要 `fork` 一个正在多线程执行的进程 103 | 104 | ### 线程控制块 TCB 105 | 106 | - 原有的PCB中维护一个链表,记录该进程的所有线程 107 | - 由于线程拥有独立的上下文,因此进程控制块不再需要维护指向上下文的引用,但是相应地,进程现在需要维护进程包含的所有线程 108 | - 这可以通过链表 `thread_list` 实现 109 | 110 | ```cpp 111 | struct process { 112 | struct vmspace *vmspace; 113 | struct list_head thread_list; 114 | }; 115 | ``` 116 | 117 | - 线程控制块 Thread Control Block 包含以下内容 118 | - 线程上下文。与之前的进程上下文对应。保存了线程在暂停后能恢复执行所需要保存的最小状态集合 119 | - 所属进程信息。进程与线程互相引用,便于管理 120 | - 进程间通信相关。 121 | 122 | ```cpp 123 | struct thread { 124 | struct thread_ctx thread_ctx; 125 | struct process *process; 126 | struct ipc_connection *active_conn; // beyond the scope 127 | struct server_ipc_config *server_ipc_config; // beyond the scope 128 | }; 129 | 
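// 补充注释(非源码,仅为说明):thread_ctx 保存线程私有的上下文(通用寄存器、PC、SP 等),
// 上下文切换时由内核保存与恢复;process 指回所属进程,与 PCB 中的 thread_list 互相引用。
// 以上字段名沿用本节课程示例,并非某个真实内核的完整定义。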
``` 130 | 131 | - 除此之外,操作系统还会为每个线程配备独立的内核栈,以保证各个线程进入内核后互不干扰 132 | -------------------------------------------------------------------------------- /Operating System/15.文件系统.md: -------------------------------------------------------------------------------- 1 | # 文件系统 2 | 3 | ## 文件 4 | 5 | - 文件是对数据的一种抽象 6 | 7 | 文件 8 | : 有名字且持久化的一段数据 9 | 10 | ```text 11 | ┌─────────┐ ┌─────────┐ ┌─────────┐ 12 | │ │ │ │ │ │ 13 | User │ App 1 │ │ App 2 │ │ App 3 │ 14 | │ │ │ │ │ │ 15 | └─────────┘ └─────────┘ └─────────┘ 16 | 17 | ┌─────────────────────────────────┐ 18 | │ File Sys │ 19 | └─────────────────────────────────┘ 20 | Kernel 21 | ┌─────────────────────────────────┐ 22 | │ Driver │ 23 | └─────────────────────────────────┘ 24 | 25 | ┌─────────────────────────────────┐ 26 | │ │ 27 | HW │ Memory and Disk │ 28 | │ │ 29 | └─────────────────────────────────┘ 30 | ``` 31 | 32 | ## INODE 33 | 34 | | Layer | Purpose | Orientation | 35 | | :-----------------: | :------------------------: | :---------------------: | 36 | | Symbolic Link | Symbolic links | Human | 37 | | Abosolute Path Name | Provide root for naming | Human | 38 | | Path Name | Naming hierarchies | Human | 39 | | File Name | Human-oriented names | Human-Machine interface | 40 | | Inode Number | Machine-oriented names | Machine | 41 | | File | Organize blocks into files | Machine | 42 | | Block | Identify disk blocks | Machine | 43 | 44 | ### 磁盘块层 Block Layer 45 | 46 | $$磁盘块号 \to 磁盘上的数据$$ 47 | 48 | - 整个磁盘抽象成大数组 49 | - 磁盘划分成很多块 50 | - 常见的块大小是 `4KB` 51 | - 将磁盘块编号映射到磁盘块上的数据 52 | 53 | #### 超级块 Super Block 54 | 55 | - 每个文件系统在逻辑上有一个超级块 56 | - 包含 57 | - 磁盘块大小 58 | - 磁盘块数量 59 | - 空闲磁盘块的bitmap的开始位置 60 | - 该bitmap记录了磁盘中空闲块的位置,但是并不直接存放在超级块内 61 | - 超级块在逻辑上只有一个,但是物理上可以有多个备份 62 | - 加载文件系统时,内核会读取超级块的信息 63 | 64 | ### 文件层 File Layer 65 | 66 | $$ iNode \to 磁盘块号 $$ 67 | 68 | - 一个文件常常需要多个(甚至很多很多个)磁盘块才能存下,一个我们认知中的文件往往对应一大堆磁盘块,因此需要高效地记录并组织一个文件对应的存储块。 69 | - 基于 inode 的文件系统使用 `inode` 记录每个文件所对应的所有磁盘块号 70 | - 通过 `inode` 就可以访问一个文件的所有数据 71 | 72 | ```cpp 73 | struct inode { 74 | int block_numbers[N]; 75 | int size; 76 | } 77 | ``` 78 | 79 | - `block_number` 记录了所有磁盘块号 80 | - `size` 记录了大小 81 | 82 | #### 多级磁盘块索引 83 | 84 | - 如果只使用一级索引,那么 `inode` 预设的 `block_numbers` 数组可能不够,或者可能为了支持大文件而在存储小文件时产生大量浪费,因此使用多级磁盘块索引 85 | - 类似多级页表 86 | - `inode` 中保存了直接指针、一级间接指针和二级间接指针 87 | - 每个直接指针指向一个块 88 | - 每个间接指针存储了若干个上一级磁盘块的地址 89 | - 如果有必要,还可以启用三级、四级间接指针 90 | 91 | #### 元数据 92 | 93 | `inode` 除了记录存储索引外,还记录该文件相关的其他元数据,例如文件模式、链接数、拥有者、用户组、文件大小、访问时间、修改时间等 94 | 95 | ### INODE号层 96 | 97 | $$iNodeNumber \to iNode$$ 98 | 99 | - inode表 100 | - 磁盘的固定位置存储了一个inode表,该表中记录了inode号到inode在表中的相对位置的映射。可以通过inode表找到特定的inode。 101 | - 空闲inode bitmap 102 | - 记录了哪些inode被使用、哪些inode是空闲的 103 | 104 | ### 文件名层 105 | 106 | $$FileNameString \to iNodeNumber$$ 107 | 108 | - inode号对于计算机已经足够,但是对人类而言非常不友好 109 | - 需要一个字符串形式的、对人类友好的文件名 110 | - 而且直接使用inode号造成了inode号与文件存储位置的强耦合 111 | - 文件系统不能在不更改inode号的前提下更改文件inode的存储位置 112 | - 也不能用一个新的inode号指代已经存在的inode 113 | 114 | #### 字符串文件名到`inode`号的映射 115 | 116 | - 需要一个映射表记录文件名到`inode`号的映射 117 | - 该映射表保存在 `目录` 中 118 | - `目录` 本身也是一个文件,同样用 `inode` 组织 119 | - `目录` 大小和文件大小没有任何关系 120 | - 它的大小只和文件数量以及字符串文件名长度有关 121 | - 文件名也不是文件的元数据 122 | 123 | > “这就是目录为什么叫目录,因为它就是一个目录” 124 | 125 | #### 查目录 126 | 127 | - 字符串匹配 128 | - 返回对应的文件的 `inode` 号 129 | - 如果找不到对应文件,则报错 130 | 131 | ### 路径层 132 | 133 | - 提供结构化的路径名 134 | - 例如 `projects/paper` 135 | 136 | ### 绝对路径层 137 | 138 | #### HOME目录 139 | 140 | - 每个用户都有自己的HOME目录 141 | - 也是该用户登陆后的默认目录 142 | - 不同用户只能访问自己的HOME目录,不能跨用户共享文件 143 | 144 | #### 根目录 145 | 146 | - 通常规定根目录的 
`inode` 编号为 `1` 147 | - `/` 148 | 149 | ### 符号链接层 150 | 151 | 如何在一个磁盘上建立指向另一个磁盘的Link 152 | 153 | - 不同磁盘的文件系统不同,因此 `inode` 命名空间不同 154 | - 因此需要增加一种新的 inode 类型:软连接(符号链接) 155 | 156 | ## 硬链接与符号链接 157 | 158 | ### 硬链接 159 | 160 | - 文件名并非元数据,因此一个文件可以对应多个文件名 161 | - 通过 `ln file link` 命令实现 162 | - 文件系统将先找到 `file` 对应的 `inode`,然后在目标路径的父目录下新增一个指向该 `inode` 的目录项 163 | - 为了支持这个功能,`inode` 需要增加 Reference Counter `refcnt` 164 | - 用于记录有多少个文件名指向了当前的 `inode` 165 | - 只有 `refcnt == 0` 时才可以物理上移除该 `inode` 对应的文件 166 | - 但是通过任何一个文件名对文件的修改都将直接影响到这个文件 167 | - Link不允许形成环,否则可能在磁盘中出现根目录无法访问,但是也不能删除的空间 168 | - 除了 `./` 和 `../` 169 | - 或者在删除目录时必须保证当前目录为空 170 | - 创建硬链接时,文件系统中不会创建新的文件,但是目录中会增加硬链接的条目 171 | 172 | ### 符号链接 173 | 174 | - 会真实创建一个文件 175 | - 使用 `ln -s file slink` 创建 176 | - 软连接是一个文件,里面保存了一个表示文件路径的字符串 177 | 178 | ## 文件元数据 179 | 180 | - 拥有者 Owner、所在组 ID 181 | - 拥有改 `inode` 的用户ID和用户组ID 182 | - 权限类型 183 | - 拥有者、所在组、其他 184 | - 读、写、执行 185 | - 时间戳 186 | - 最后一次访问 (READ) 187 | - 最后一次修改 (WRITE) 188 | - 最后一次 `inode` 更新 (LINK) 189 | 190 | ## 小结 191 | 192 | - 文件名并不是文件的一部分 193 | - 文件名不是文件的数据,也不是文件的元数据 194 | - 文件名并不保存在文件中 195 | - 文件名不在 `inode` 中 196 | - 文件名是目录的数据,是文件系统的元数据 197 | - 一个 `inode` 可以有多个文件名 198 | - 硬链接 199 | - 每个硬链接的地位都是相同的 200 | - 文件名就是硬连接 201 | - 目录所占磁盘空间通常很小 202 | - 目录仅仅负责记录文件名到 `inode` 的映射 203 | -------------------------------------------------------------------------------- /Operating System/17.设备管理与驱动.md: -------------------------------------------------------------------------------- 1 | # 设备管理与驱动 2 | 3 | ## 标准IO协议 4 | 5 | ### 设备规范 6 | 7 | 设备需要提供一些对外接口, 8 | 9 | - 状态寄存器 10 | - 地址寄存器 11 | - 数据寄存器 12 | - 命令寄存器 13 | 14 | 除此之外,设备还会有一些不暴露在外的内部设备(微控制器、内部存储或其他硬件芯片) 15 | 16 | ### 轮询IO流程 17 | 18 | - 从CPU的视角 19 | - 等待直到设备空闲 20 | - 轮询状态寄存器,确定设备是否空闲 21 | - 将数据和地址写入设备的对应寄存器 22 | - 将命令写入命令寄存器 23 | - 等待设备完成工作 24 | - 轮询状态寄存器,确定设备是否完成任务 25 | - 从设备的视角 26 | - 轮询命令寄存器 27 | - 如果有新命令,则修改状态寄存器,开始工作 28 | - 完成命令后恢复到等待状态 29 | 30 | ### 基于中断的IO 31 | 32 | - OS向设备发送一个请求,随后让发起IO的进程睡眠(该进程将处于 `WAITING` 状态) 33 | - 此时CPU可以切换到其他进程 34 | - 设备完成IO后,触发IO中断 35 | - CPU跳转到中断处理程序 36 | - 内核态的中断处理程序负责响应设备中断 37 | - 例如读取设备数据或返回的状态码,进而唤醒等待IO的进程 38 | - 这种中断只适用于低速设备(例如键盘),用于在等待设备时提高CPU利用率 39 | 40 | #### 中断造成的活锁 41 | 42 | - 如果IO设备速度较快,例如工作在极佳网络环境下的网卡,导致IO设备频繁向CPU发送中断时,操作系统可能进入活锁 43 | - CPU忙于响应设备的中断,无法调度用户进程或处理中断发来的数据 44 | - “看上去它很忙,但是实际上什么都没干。我不知道同学们平时会不会有这种状态” 45 | 46 | #### 中断合并 Interrupt Coalescing 47 | 48 | - 基于“一个中断到来后,很可能短时间内会有另一个中断”的直觉 49 | - 因此设备发送中断前会先等待一段时间,如果等待期间有新的中断,则合并中断 50 | - 代价是过长的等待时间可能导致高时延 51 | 52 | ### 中断与轮询结合的IO 53 | 54 | - 默认使用中断 55 | - 网络中断发生后,使用轮询处理后续到达的网络包 56 | - 网络包通常是短时间内来一大堆 57 | - 如果没有更多中断,或轮询中断超过时间限制,则回到中断模式 58 | - 在Linux网络驱动中称为NAPI(New API) 59 | 60 | ## 直接内存访问 Direct Memory Access DMA 61 | 62 | ### 传统的IO流程 63 | 64 | - CPU从设备读取数据,存入CPU的寄存器 65 | - CPU将数据从寄存器转入内存 66 | - 低效 67 | - be like 68 | - 用勺子从一个游泳池往另一个游泳池舀水.gif 69 | 70 | ### DMA 71 | 72 | - CPU向磁盘控制器发送“读”请求 73 | - 给定要读的地址和目标内存地址 74 | - 一次读取一块数据 75 | - 磁盘控制器直接将数据写入内存 76 | - No more CPU involvement 77 | 78 | #### 优势 79 | 80 | - 减轻CPU负担 81 | - 减少传输次数 82 | - 从两次传输减少到一次 83 | - 更好的支持“长消息” 84 | - 分摊总线协议开销 85 | 86 | #### 缺陷 87 | 88 | - 不适合单次小批量读取数据 89 | 90 | ## 设备交互 91 | 92 | ### PIO 93 | 94 | - 在x86上,使用 `IN` `OUT` 指令 95 | - `IN` 是读入CPU,`OUT` 是从CPU写出 96 | - 需要以内核态特权模式执行 97 | 98 | ### 内存映射 IO 99 | 100 | - 将设备寄存器映射到内存地址空间 101 | - 内存地址被IO设备位置信息重载 102 | - 使用访存的 `LOAD` `STORE` 指令操作设备 103 | - 可以以非特权模式(用户态)执行 104 | - 配置页表时,映射到设备的内存地址需要设置为 `NON_CACHABLE` 105 | - 否则CPU可能会直接读cache,但设备寄存器的数据已经被更新了 106 | 107 | !!!info MMIO地址应该使用 `volatile` 关键字 108 | 否则可能会被编译器优化掉。 109 | 110 | ## 硬件总线 111 | 112 | ### 
硬件总线的特点 113 | 114 | - 物理上是一组电线 115 | - 有地址总线、数据总线、控制总线等 116 | - 将IO设备连接在一起 117 | - 广播特性 118 | - 每个模块都能接收到总线上的数据 119 | - 通过地址总线指定预期的接收方 120 | - 仲裁协议 121 | - 决定哪些设备可以在什么时候使用总线 122 | 123 | ### 总线事务 Transaction 124 | 125 | 1. 源模块获取总线的使用权 126 | - 总线的使用具有排他性 127 | 2. 源模块将目标模块的地址写到总线 128 | 3. 源模块发送READY信号,广播提醒其他模块 129 | 4. 目标模块在拷贝完数据后,发送ACK信号 130 | - 如果采用同步模式,则无需READY和ACK,只要在每个时钟周期检查即可 131 | 5. 源模块释放总线 132 | 133 | ### 同步与异步 134 | 135 | - 同步数据传输 136 | - 源和目标根据共享的时钟进行操作 137 | - 例如DDR内存 138 | - 异步数据传输 139 | - 源和目标借助显式信号进行协作 140 | 141 | ## IO子系统 142 | 143 | ### IO子系统的意义 144 | 145 | IO子系统主要处理兼容性、容错性等问题 146 | 147 | - 数以千计的设备类型 148 | - 需要标准化的接口 149 | - 设备的不可靠性 150 | - 传输介质可能失效或发生错误 151 | - 设备的不可预测性和慢速 152 | - 在不清楚设备具体速度和性能表现的前提下管理设备 153 | - 不同设备的处理性能差异极大,IO子系统需要能够处理这类差异 154 | - 对高速设备,需要最小化使用开销 155 | - 对慢速设备,需要避免浪费CPU时间在盲目等待 156 | 157 | ### IO子系统的目标 158 | 159 | - 对各种设备提供统一的接口 160 | - 提供IO硬件的抽象层,管理硬件资源,隐藏硬件细节 161 | 162 | ### 三类设备接口 163 | 164 | Linux提供三类常见的设备接口:字符设备、块设备、网络设备。所有设备在上层被文件系统接口统一,可以使用 `open` `read` `write` 等文件系统接口操作。 165 | 166 | #### 字符设备 167 | 168 | “流式设备”,数据流输入CPU 169 | 170 | - 例如鼠标、键盘、串口、部分USB 171 | - 串行访问,每次一个字符 172 | - 通常使用文件系统接口进行交互 173 | 174 | #### 块设备 175 | 176 | 具有空间概念,可以往前或往后读。硬盘、光盘等。具有Buffer-Cache概念:写Buffer、读Cache 177 | 178 | - 例如磁盘、磁带、DVD 179 | - 统一的、以块为粒度的访问接口 180 | - 提供原始的IO接口,也支持以文件形式访问 181 | - 允许以内存映射的方式访问文件内容 182 | 183 | #### 网络设备 184 | 185 | - 例如网卡、无线网络、蓝牙 186 | - 不同于块设备、字符设备,网络设备有自己的接口 187 | - 提供特殊的网络接口,支持各类网络协议 188 | - 负责网络包收发 189 | 190 | ### 设备驱动 191 | 192 | - 提供特定的代码,和硬件设备直接交互 193 | - 要求提供标准的文件系统接口 194 | - 内核的IO子系统可以和内核中的不同驱动交互 195 | - 可以借助 `ioctl()` 系统调用进行设备的相关配置或者解析自定义的功能 196 | - Linux的设备驱动通常分为两部分 197 | - 上半部:迅速处理,此时中断关闭,没有嵌套 198 | - 下半部:延后处理,此时中断打开,可能发生嵌套 199 | 200 | ## IOMMU 201 | 202 | - IOMMU为IO设备做地址翻译,避免设备直接使用物理地址访问内存 203 | - 设备使用的地址由IOMMU翻译为实际的物理地址 204 | - 广泛应用于虚拟机场景中 205 | - 允许虚拟机独占某个设备 206 | - 提高安全性,并提供额外的抽象 -------------------------------------------------------------------------------- /Provable Security/01.Introduction.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | > Modern cryptography involves the study of mathematical techniques for securing digital information, systems and distributed computations against adversarial attacks. 4 | 5 | ## Classical Cryptography vs. Modern Cryptography 6 | 7 | ### Examples of Classical Cryptography 8 | 9 | #### Caesar's Cipher 10 | 11 | - Shift the letters of an alphabet forward/backward 12 | 13 | #### Simple Substitution Cipher 14 | 15 | - Each plaintext letter is mapped to and substituted by another ciphertext letter 16 | 17 | ## An Example of Modern Cryptography: Private Key Encryption 18 | 19 | > Also known as *symmetric-key* setting. In this setting, both parties hold the same key that is used for encryption and decryption 20 | 21 | ### Syntax of Private Key Encryption 22 | 23 | A **private-key encryption** scheme is defined by specifying a **message space** $\mathrm{M}$ along with 3 algorithms: a procedure for generating keys ($\mathrm{KeyGen}$), a procedure for encrypting ($\mathrm{Enc}$) and a procedure for decrypting ($\mathrm{Dec}$). 24 | 25 | - $\mathrm{KeyGen}(\lambda) \to k$. The **key-generation** algorithm is a *probabilistic* algorithm that outputs a key $k$ chosen according to some random distribution. 26 | - $\mathrm{Enc}(k, m) \to c$. The **encryption** algorithm takes a key $k$ and a message $m$ and outputs a ciphertext $c$. 27 | - $\mathrm{Dec}(k, c) \to m$. 
The **decryption** algorithm takes a key $k$ and a ciphertext $c$ and outputs a plaintext $m$. 28 | 29 | ### Correctness 30 | 31 | A private key encryption scheme must satisfy the following **correctness** requirement: for any $k$ output by $\mathrm{KeyGen}$ and every message $m \in \mathcal{M}$ it holds that 32 | 33 | $$ \mathrm{Dec}(k, \mathrm{Enc}(k, m)) = m. $$ 34 | 35 | ### Security and Kerckhoff's Principle 36 | 37 | The intuition of "security" is that "the attacker cannot break the scheme". 38 | 39 | - Keeping the secret key is a necessary/basic requirement for security. 40 | - But what about other components? The decryption algorithm? The encryption algorithm? The entire scheme? 41 | 42 | #### Kerckhoff's Principle 43 | 44 | **Kerckhoff's Principle.** The cipher method must not be required to be secret, and it must be able to fall into the hands of the enemy without inconvenience. 45 | 46 | - A cryptographic scheme should be designed to be secure even if an attacker knows all the details of the scheme, so long as the attacker does not know the key being used. 47 | - Security should not rely on the scheme being secret; instead it should rely solely on the secrecy of the key. 48 | 49 | ##### Arguments in Favor of Kerckhoff's Principle 50 | 51 | 1. It is significantly easier for the parties to maintain secrecy of a short key than to keep secret the (more complicated) algorithm they are using. 52 | - It is simply unrealistic to assume that the encryption algorithm will remain secret. 53 | 2. In case the honest parties' secret information is ever exposed, it is much easier to change a key than to change the entire scheme. 54 | 3. For large-scale deployment, it is significantly easier for users to all rely on the same encryption algorithm than for everyone to use their custom ones. 55 | 56 | ##### No "Security by Obscurity" 57 | 58 | - Cryptography design should be made completely public. 59 | - Published designs undergo public review and are likely to be stronger. 60 | - Confidence in the security of a scheme is much higher if it has been extensively studied. 61 | 62 | ## Principles of Modern Cryptography 63 | -------------------------------------------------------------------------------- /Provable Security/02.ModernCryptography.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YBRua/CourseNotes/9a3a7d322ab13b53fa619abb1a7bfd79938c7944/Provable Security/02.ModernCryptography.md -------------------------------------------------------------------------------- /Provable Security/03.PublicKeyEncryption.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YBRua/CourseNotes/9a3a7d322ab13b53fa619abb1a7bfd79938c7944/Provable Security/03.PublicKeyEncryption.md -------------------------------------------------------------------------------- /Provable Security/06.ComputationalIndistinguishability.md: -------------------------------------------------------------------------------- 1 | # Computational Indistinguishability and Pseudorandom Generators 2 | 3 | > A cipher must be practically, if not mathematically, indecipherable. 4 | 5 | ## Computational Security 6 | 7 | A cryptographic scheme $\Pi$ is $(t,\epsilon)$-secure if every adversary of running time at most $t$ succeeds in breaking $\Pi$ with probability at most $\epsilon$. 8 | 9 | - **Perfect security.** $t = \infty, \epsilon = 0$. 
10 | - **Statistical security.** $t = \infty, \epsilon = \mathrm{negl}(n) = n^{-\omega(1)}$. 11 | - **Computational security.** $t = n^{\omega(1)}, \epsilon = \mathrm{negl}(n) = n^{-\omega(1)}$. 12 | 13 | ### Semantic Security 14 | 15 | A private-key encryption scheme $(\mathrm{Gen}, \mathrm{Enc}, \mathrm{Dec})$ is semantically secure in the presence of an eavesdropper if for every PPT algorithm $A$ there exists a PPT algorithm $A'$ such that for every PPT algorithm $\mathrm{Samp}$ and polynomial-time computable functions $f$ and $h$ 16 | 17 | ## Pseudorandom Generators 18 | 19 | Let $g: \{0, 1\}^n \mapsto \{0, 1\}^l$ ($l > n$) be a deterministic polynomial-time algorithm. We say that $g$ is a **pseudorandom generator (PRG)** if for all PPT distinguishers $D$, there exists a negligible function $\mathrm{negl}()$ such that 20 | 21 | $$ |\mathrm{Pr}[D(g(U_n)) = 1] - \mathrm{Pr}[D(U_l) = 1]| \le \mathrm{negl}(n). $$ 22 | 23 | ### Secure PRGs 24 | 25 | $g: \{ 0, 1 \}^n \mapsto \{0,1\}^l (l > n)$ is a $(t,\epsilon)$-secure PRG if every probabilistic distinguisher of running time $t$ satisfies 26 | 27 | $$ | \mathrm{Pr}[D(g(U_n)) = 1] - \mathrm{Pr}[D(U_l) = 1] | \le \epsilon. $$ 28 | 29 | Note that PRGs are not secure against adversaries who have unlimited computing power. 30 | 31 | #### Replacement Lemma 32 | 33 | #### Switching Lemma 34 | 35 | #### Sequential Composition of PRGs 36 | 37 | Let 38 | 39 | $$\begin{align*} 40 | g: \{ 0,1 \}^n &\mapsto \{ 0, 1 \}^{n+s}\\ 41 | s_i &\mapsto (s_{i+1}, r_{i+1}) \text{ where } s_i,s_{i+1} \in \{0,1\}^n, r_{i+1} \in \{0,1\}^s 42 | \end{align*} $$ 43 | 44 | be a $(t,\epsilon)$-secure PRG, and for any $q(n) \in \mathbb{N}$ define $g^q: \{0,1\}^n \mapsto \{0,1\}^{n+qs}$ 45 | 46 | $$\begin{align*} 47 | g^q: \{ 0,1 \}^n &\mapsto \{ 0, 1 \}^{n+qs}\\ 48 | s_0 &\mapsto (s_q,r_q,\dots,r_1) 49 | \end{align*} $$ 50 | 51 | where for $0 \le i \le q-1$ we iteratively compute $(s_{i+1},r_{i+1}) \coloneqq g(s_i)$. Then we have that $g^q$ is a $(t-q\mathrm{poly}(n), q\epsilon)$-secure PRG, where $\textrm{poly}(n)$ is the running time for computing $g$. 52 | 53 | #### PRG-based Encryption 54 | -------------------------------------------------------------------------------- /Provable Security/07.OnewayFunctions.md: -------------------------------------------------------------------------------- 1 | # One-way Functions 2 | 3 | ## One-way Functions and Permutations 4 | 5 | ### One-way Function 6 | 7 | > [KL] Section 7.1.1. 8 | 9 | A function $f: \{0,1\}^n \mapsto \{0,1\}^l$ is a **one-way function** if $f$ is 10 | 11 | - **Easy-to-Compute.** $f$ can be computed by some algorithm in $\mathrm{poly}()$. 12 | - **Hard-to-Invert.** For every PPT $A$, there exists a negligible function $\mathrm{negl}()$ such that 13 | $$ \mathrm{Pr}_{x \leftarrow U_n, x' \leftarrow A(1^n, f(x))} [f(x') = f(x)] \le \mathrm{negl}(n). $$ 14 | 15 | We say $f$ is a $(t, \epsilon)$-one-way function if no probabilistic $A$ of time $t = t(n)$ can invert the function with probability more than $\epsilon = \epsilon(n)$. 16 | 17 | #### Existence of One-way Function 18 | 19 | - Up till now people are only *assuming* that one-way functions exist. 20 | - The existence of one-way functions implies $P \neq NP$. 
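  - (Why this holds: a candidate preimage $x'$ can be verified in polynomial time by checking $f(x') = f(x)$, so inverting $f$ is an NP search problem; if $P = NP$ held, that search problem would be solvable in polynomial time and no function could be hard to invert.)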
21 | 22 | #### Candidates of One-way Functions 23 | 24 | ## Hardcore Predicate 25 | 26 | ## Implication of Pseudorandom Generator 27 | 28 | If a permutation $f: \{0,1\}^n \mapsto \{0,1\}^n$ has a hardcore predicate $h_c: \{0,1\}^n \mapsto \{0,1\}$ (which implies that $f$ is a one-way permutation), then the function 29 | 30 | $$ g(x) = (f(x), h_c(x)) $$ 31 | 32 | is a pseudorandom generator. 33 | 34 | ### Goldreich-Levin Theorem 35 | 36 | ## PRG from One-way Functions 37 | 38 | ### Regular Functions 39 | 40 | $f$ is a **regular function** if the preimage size $\alpha = |f^{-1}(y)|$ is fixed (independent of $y$). 41 | 42 | - It is a **known regular function** if its **regularity** $\alpha$ is polynomial-time computable from the security parameter $n$. 43 | 44 | ### Recap: Pseudorandom Generators 45 | 46 | ### BM-Y Generator 47 | 48 | **Theorem.** Any one-way permutation $f: \{0,1\}^n \mapsto \{ 0,1 \}^n$ implies a PRG $G: \{0,1\}^n \mapsto \{0,1\}^m$ for any $m = \mathrm{poly}(n)$. 49 | 50 | ### Unpredictable Pseudoentropy (UP) 51 | 52 | A random variable $X$ has $m$ bits of **unpredictable pseudoentropy** if a PPT algorithm $\mathcal{A}$ can only win the game with probability less than $2^{-m}$. 53 | 54 | ### PRGs from Known Regular OWFs by Three Extractions 55 | -------------------------------------------------------------------------------- /Provable Security/09.ElGammalEncryption.md: -------------------------------------------------------------------------------- 1 | # El Gammal Encryption: A Public Key Encryption Scheme 2 | 3 | ## Construction of El Gammal 4 | 5 | ### Key Generation 6 | 7 | $\mathrm{KeyGen}(1^\lambda) \to (pk, sk)$. 8 | 9 | - On input $1^\lambda$, the algorithm runs $\mathrm{GenGroup}(1^\lambda)$ to obtain $(G, q, g)$. 10 | - Then it chooses a random $x \in Z_q$ and computes $h = g^x$. This essentially samples a uniform $h$ from the group. 11 | 12 | The public and private keys are given by 13 | 14 | $$ pk = (G, q, g, h), \quad sk = (G, q, g, x) $$ 15 | 16 | ### Encryption 17 | 18 | $\mathrm{Enc}(pk,m) \to c$. 19 | 20 | - On input a public key $pk = (G, q, g, h)$ and a message $m \in G$ (message space $M = G$) 21 | - The algorithm chooses a uniform $y \in Z_q$ and outputs ciphertext $(g^y, m \cdot h^y)$. 22 | 23 | ### Decryption 24 | 25 | $\mathrm{Dec}(sk, c) \to m$. 26 | 27 | - On input $sk = (G, q, g, x)$ and $c = (c_1, c_2) \in G^2$, 28 | - Output $m = c_2 \cdot (c_1^x)^{-1}$. 29 | 30 | ## Proof of Correctness 31 | 32 | ## Proof of Security 33 | 34 | > Typically the security is proved by contradiction: if the scheme is insecure, then some of the hardness assumptions must fail. 35 | 36 | **Theorem.** If the DDH problem is hard relative to $\mathrm{GenGroup}$, then the El Gamal Encryption Scheme is CPA-secure. 37 | 38 | *Sketch of Proof.* Suppose there is a PPT adversary $A$ such that $\mathrm{Adv}_{\Pi, A}^{CPA} > \epsilon$ for some non-negligible $\epsilon$. We show that we can construct a PPT algorithm $B$ to solve the DDH Problem relative to $\mathrm{GenGroup}$ with non-negligible probability. 39 | 40 | ### Setup Stage 41 | 42 | - $\mathbf{B}$ receives a DDH problem instance, $(G, q, g, A, B, T)$, where $A = g^\alpha, B = g^\beta$ for some $\alpha, \beta \in Z_q^*$ and $T \in G$. $\mathbf{B}$'s target is to determine whether $T = g^{\alpha\beta}$ (or $T$ is just a random group element). 43 | - $\mathbf{B}$ sets $h=A$ and gives $pk = (G, q, g, h)$ to $\mathbf{A}$. 44 | - Here $\mathbf{B}$ constructs a public key $(G, q, g, A)$ using $A$ from the DDH problem instance. 
45 | - This public key is valid, but $\mathbf{B}$ does not have the corresponding private key pair. 46 | 47 | **Note.** We can safely assume that the adversary cannot tell whether $T$ is a valid ciphertext ($T=g^{\alpha\beta}$) or not. If the adversary were able to distinguish $T_0 = g^{\alpha\beta}$ and $T_1 = g^c$, then we would also be able to solve DDH problem (the adversary solves it by distinguishing $T_0, T_1$). 48 | 49 | ### Challenge Stage 50 | 51 | - $\mathbf{A}$ submits $m_0^*, m_1^* \in G$ s.t. $m_0^* \neq m_1^*$. 52 | - $\mathbf{B}$ randomly chooses $b = \{0,1\}$ and set $c_1 = B, c_2 = m_b^* \cdot T$. 53 | - $B = g^\beta$. $\mathbf{B}$ does not know $\beta$, but it can pretend it does, and send $B$ as $c_1$. 54 | - Note that if $T = g^{\alpha\beta}$, then the returned ciphertext is also a valid El Gammal ciphertext. 55 | - $\mathbf{B}$ returns $c^* = (c_1, c_2)$ to $\mathbf{A}$. 56 | 57 | ### Output Stage 58 | 59 | - $\mathbf{A}$ sends $b' \in \{0, 1\}$ to $\mathbf{B}$. 60 | - $\mathbf{B}$ sets $z' = 1$ if $b' = b$, otherwise $z' = 0$. 61 | - $\mathbf{B}$ outputs $z'$ as its solution to the DDH Problem. 62 | 63 | ### Analysis 64 | 65 | ### Remarks 66 | 67 | - Basic workflow of a proof 68 | 1. Assume such an adversary exists. 69 | 2. Use the adversary's ability to solve a hard problem. 70 | 3. Provide analysis on the probability of solving the problem. 71 | - From the adversary's perspective, it sees exactly what it expects to see under an El Gammal Scheme, *provided $T = g^{\alpha\beta}$* holds. 72 | - The construction of the proof should leverage both the ability of the adversary and the information provided by the hard problem. 73 | - For the proof itself, it must ensure 74 | - The game procedure follows the definition of security (CPA in this case). 75 | - The adversary should see exactly what it is supposed to see in the adversary game (valid public keys and ciphertexts in this case). Or it should at least be unable to distinguish valid and invalid responses. 76 | -------------------------------------------------------------------------------- /Provable Security/11.AttributeBasedEncryption.md: -------------------------------------------------------------------------------- 1 | # Attribute Based Encryption (ABE) 2 | 3 | ## Introduction 4 | 5 | - IBE. In IBE, the secret key $sk_{ID}$ is based on the user's $ID$. 6 | - One-to-one. Each $sk_{ID}$ corresponds to one single user, identified by its $ID$. 7 | - Fuzzy IBE. Allows non-exact matches (e.g., fingerprint). 8 | - Ciphertext-Policy Attribute-Based Encryption (CP-ABE). Provides an access policy for the ciphertext. 9 | - One-to-more. Allows multiple users to decrypt the message. 10 | - E.g., "Dept. of CS AND (Phd OR Alumni)". 11 | - Broadcast Encryption. Broadcast a ciphertext to a group of people, identified by the IDs. 12 | - Also one-to-more. 13 | - Different from ABE, BE requires knowing *all* IDs of the receivers in advance. 14 | - ABE only requires attributes instead of unique IDs. 15 | 16 | ### Collusion Resistance 17 | 18 | - Assume two users, `{EE, PhD, Bill}` and `{CS, UnG, Carl}`. 19 | - Neither of the two satisfies the access policy `{CS, PhD OR Alumni}`. 20 | - But two of them combined will satisfy the policy. 21 | - ABE schemes should be resilient to such collusion. 22 | 23 | ### Ciphertext-Policy ABE (CP-ABE) 24 | 25 | The encrypted data is associated with an access policy. Users' keys are generated based on the attributes of the user. 
A user can decrypt the message iff its attributes satisfies the access policy of the ciphertext. 26 | 27 | CP-ABE does not need to know or specify *identities* of potential receivers. The attributes are associated with the receivers. 28 | 29 | - A decryption key is described by an **attribute set**. 30 | - A ciphertext is associated with an **access policy**. 31 | 32 | ### Key-Policy ABE (KP-ABE) 33 | 34 | The data is encrypted by its attributes. Users' keys are generated based on their access policy. A user can decrypt the message iff the attribute of the data satisfies the access policy of the user. 35 | 36 | - The attributes are associated with the data. 37 | 38 | E.g., consider a TV broadcasting service. Assume a video stream is associated with `{Basketball, 2024, March}`. The encrypted file is broadcasted and users who have subscribed to `Basketball` can decrypt it. 39 | 40 | ## Definition of CP-ABE 41 | 42 | ### Syntax 43 | 44 | #### Setup 45 | 46 | $$ \mathrm{Setup}(\lambda) \to (mpk, msk). $$ 47 | 48 | #### Encrypt 49 | 50 | $$ \mathrm{Enc}(mpk, A, m) \to C_A. $$ 51 | 52 | - $A$: an access policy. 53 | - $C_A$: ciphertext. 54 | 55 | #### KeyGen 56 | 57 | $$ \mathrm{KeyGen}(mpk, msk, S) \to sk_S. $$ 58 | 59 | - $S$: an attribute set. 60 | - $sk_S$: a user key for the attribute set. 61 | 62 | #### Decrypt 63 | 64 | $$ \mathrm{Dec}(mpk, sk_S, C_A) \to m/\bot. $$ 65 | 66 | ### Correctness 67 | 68 | ### Security 69 | 70 | ## Practical Considerations 71 | 72 | To be practical, a CP-ABE system needs to have the following properties, 73 | 74 | 1. **Traitor tracing.** Traceability on malicious users who leak their decryption keys. 75 | 2. **Revocation.** Preventing revoked users from decrypting newly encrypted data. 76 | 3. **Large universe.** Any string can be used as an attribute, and attributes do not need to be pre-specified during setup. 77 | 78 | ### Traitor Tracing 79 | 80 | - **Type 1 Traitors.** Directly leaks the key $sk_S$ for a set $S$. 81 | - **Type 2 Traitors.** Leaks a black-box. Decryption keys and even the algorithm are hidden inside the box. 82 | 83 | #### Black-box Traitors 84 | 85 | ##### Key-like Decryption Blackbox 86 | 87 | The blackbox $D$ is described by an attribute set $S_D$ and can decrypt ciphertexts with access policies satisfied by $S_D$. (`S_D = { CS, PhD }`). 88 | 89 | ##### Policy-specific Decryption Blackbox 90 | 91 | The blackbox $D$ is associated with an access policy $AP_D$, and can be decrypted with policy $AP_D$. (`AP = CS AND (PhD OR Alumni)`). 92 | 93 | Weaker decryption ability, but reflects traitor's possible attempt to bypass detection. 94 | 95 | If a CP-ABE scheme is policy-specific blackbox traceable, then it is also key-like blackbox traceable. 96 | 97 | ### Revocation 98 | 99 | A key should be revoked when 100 | 101 | - Private keys get compromised. 102 | - User leaves the system. 103 | - User is removed from the system for some reason. 104 | 105 | We aim to prevent the revoked user from decrypting new ciphertexts, generated after revocation. 106 | 107 | #### Direct Revocation 108 | 109 | Encryption requires an additional "revocation list", 110 | 111 | $$ \mathrm{Enc}(PP, M, R, AP). $$ 112 | 113 | - $R$: a revocation list. 
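Intuitively, the encryptor embeds the current list $R$ into each newly generated ciphertext, so decryption succeeds only when the user's attributes satisfy $AP$ *and* the user is not in $R$; existing keys need not be updated, at the cost of encryptors having to know and refresh the revocation list.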
114 | 115 | #### Indirect Revocation 116 | -------------------------------------------------------------------------------- /Provable Security/12.HierarchicalDeterministicWallet.md: -------------------------------------------------------------------------------- 1 | # Hierarchical Deterministic Wallet 2 | 3 | ## Digital Signature in Cryptography & Cryptographic Wallets 4 | 5 | ## Hierarchical Deterministic Wallet (HDW) 6 | 7 | ## Stealth Address 8 | 9 | ## Formalized HDW 10 | -------------------------------------------------------------------------------- /Provable Security/13.PseudorandomFunctions.md: -------------------------------------------------------------------------------- 1 | # Pseudorandom Functions and its Construction from PRGs 2 | 3 | ## Pseudorandom Functions (PRF) 4 | 5 | Consider a function $F$ with two inputs, $F: \{0,1\}^{\kappa(n)} \times \{0,1\}^n \mapsto \{0,1\}^{l(n)}$. $F$ can be seen as a family of functions, indexed by $k \in \kappa(n)$. 6 | 7 | $$ F = \{ F_k : \{0,1\}^n \mapsto \{0,1\}^l \mid k \in \{ 0,1 \}^\kappa \}. $$ 8 | 9 | A function $F$ is a **pseudorandom function** if for all probabilistic polynomial-time distinguisher $\mathcal{D}$, there exists a negligible function that 10 | 11 | ### The GGM Tree Construction of PRF 12 | 13 | **Goldreich-Goldwasser-Micali Construction.** Let $G: \{ 0,1 \}^n \mapsto \{0,1\}^{2n}$ be a *length-doubling* pseudorandom generator. Define $G_0: \{0,1\}^n \mapsto \{0,1\}^n$, $G_1: \{0,1\}^n \mapsto \{0,1\}^n$. 14 | 15 | #### Proof of Construction 16 | 17 | **Lemma (Parallel Repetition of PRGs on Independent Seeds).** For any parameter $k = \mathrm{poly}(n)$, define $G^k: \{0,1\}^{kn} \mapsto \{0,1\}^{km}$ as 18 | 19 | $$ G^k(x_1,\dots,x_k) \coloneqq (G(x_1), G(x_2), \dots, G(x_k)). $$ 20 | 21 | *Proof (Sketch).* 22 | 23 | Consider 24 | 25 | $$ \begin{matrix} 26 | G(x_1) & G(x_2) & \cdots & G(x_{k-1}) & G(x_k) \\ 27 | G(x_1) & G(x_2) & \cdots & G(x_{k-1}) & U_k \\ 28 | \vdots & \vdots & \ddots & \vdots & \vdots \\ 29 | G(x_1) & U_2 & \cdots & U_{k-1} & U_k\\ 30 | U_1 & U_2 & \cdots & U_{k-1} & U_k 31 | \end{matrix} $$ 32 | 33 | Every two adjacent rows above are computationally indistinguishable. 34 | 35 | #### Levin's Trick (Domain Extension) 36 | 37 | Extend the domain of a PRF from $l$ bits to $n$ bits, by using a hash function $H_1 \in \mathcal{H}$ to compress an $n$-bit input into an $l$-bit one. 38 | 39 | Can be proved by computing the chance of collision for $H_1$. 
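As a concrete illustration of the GGM tree evaluation above, the following sketch instantiates the length-doubling PRG with SHA-256 (purely a stand-in for illustration; SHA-256 is not a proven PRG, and the key size and input length are toy parameters):

```python
import hashlib

def G(s: bytes) -> bytes:
    # Stand-in length-doubling "PRG": 32-byte seed -> 64 bytes.
    return hashlib.sha256(s + b'\x00').digest() + hashlib.sha256(s + b'\x01').digest()

def ggm_prf(k: bytes, x: str) -> bytes:
    # Walk the GGM tree: at each level take G_0(s) or G_1(s)
    # according to the current bit of the input x.
    s = k
    for bit in x:  # x is a bit-string such as '0110'
        out = G(s)
        s = out[:32] if bit == '0' else out[32:]
    return s

# Toy usage: a PRF on 4-bit inputs keyed by a 32-byte seed.
print(ggm_prf(b'\x00' * 32, '0110').hex())
```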
40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Archive of Knowledge and Memories 2 | 3 | 大概会用来放各种课程的电子笔记。 4 | 5 | 随缘更新,希望不鸽。 6 | 7 | 不嫌弃的话可以随意取用。 8 | -------------------------------------------------------------------------------- /Reinforcement Learning/01.Introduction.md: -------------------------------------------------------------------------------- 1 | # 导论 2 | 3 | > 序列决策问题 4 | 5 | ## 基础概念 6 | 7 | > 通过从交互中学习来实现目标的计算方法 8 | 9 | - 智能体(Agent) 10 | - 感知状态 11 | - 采取行动:影响状态或达到目标 12 | - 达到目标:最大化累计奖励 13 | - 交互过程 14 | - Agent不同,交互出的数据也不同 15 | 16 | > “要在第1分钟到第90分钟都换下C罗。” 17 | > “结果发现C罗根本就不work。” 18 | 19 | `Placeholder` 20 | 21 | > “你第一次越线超车” 22 | > “你发现” 23 | > “欸,还挺爽啊。” 24 | 25 | ### 强化学习系统要素 26 | 27 | #### 历史(History) 28 | 29 | - **历史**是观察、行动和奖励的序列 30 | 31 | $$ H_t = [O_1, R_1, A_1, \dots, O_{t-1}, R_{t-1}, A_{t-1}, O_t, R_t] $$ 32 | 33 | - 即到 $t$ 未知所有可观测的变量 34 | 35 | #### 状态(State) 36 | 37 | - **状态**是用于确定接下来会发生的事情的信息,是历史的函数 38 | 39 | $$ S_t = f(H_t) $$ 40 | 41 | #### 策略(Policy) 42 | 43 | - **策略**是智能体在特定时间的行为方式,是从状态到行动的映射 44 | - **确定性策略**:$a = \pi(s)$ 45 | - **随机策略**:$\pi(a|s) = \mathbb{P}(A_t=a|S_t=s)$ 46 | 47 | #### 奖励(Reward) 48 | 49 | - **奖励** $R(s, a)$ 是定义强化学习目标的**标量** 50 | 51 | #### 价值函数(Value Function) 52 | 53 | - 标量,用于定义对于长期来说什么是“好”的 54 | - 是对于未来累计奖励的预测 55 | - 给定Policy,才有对应的价值 56 | 57 | ##### 例子 58 | 59 | $$\begin{aligned} 60 | Q_{\pi}(s,a) &= \mathbb{E}_{\pi}[R_{t+1} + \gamma R_{t+2} + \cdots | S_t=s, A_t=a]\\ 61 | &=\mathbb{E}_{\pi}[R_{t+1} + \gamma Q_{\pi}(s',a')|S_t=s, A_t=a] 62 | \end{aligned}$$ 63 | 64 | #### 模型(Model) 65 | 66 | - **环境的模型**用于模拟环境的行为 67 | - 预测下一个状态:$\mathcal{P}_{ss'}^{a} = \mathbb{P}[S_{t+1}=s'|S_t=s, A_t=a]$ 68 | - 预测下一个奖励:$\mathcal{R}_{s}^{a}=\mathbb{E}[R_{t+1}|S_t=s, A_t=a]$ 69 | 70 | ### 方法 71 | 72 | - 基于价值 73 | - 没有策略 74 | - 有价值函数 75 | - 基于策略 76 | - 没有价值函数 77 | - 直接优化策略 78 | - Actor-Critic 79 | - 有策略 80 | - 有价值函数 81 | -------------------------------------------------------------------------------- /Reinforcement Learning/02.MultiArmBandit.md: -------------------------------------------------------------------------------- 1 | # 多 臂 老 虎 机 2 | 3 | ## 序列决策任务的基本问题 4 | 5 | “基于目前策略获取已知最优收益,或者尝试不同的决策” 6 | 7 | ## 多臂老虎机 8 | 9 | ### 形式化表述 10 | 11 | - 动作 $\mathcal{A} = \{ a_1, a_2, \dots, a_K \} $ 12 | - 收益分布 $\mathcal{R}(r_t|a_i) = \mathbb{P}(r_t|a_i)$ 13 | - 目标是最大化累计时间的收益 $\max\sum_{t=1}^T r_t$ 14 | 15 | ### 算法框架 16 | 17 | ```python 18 | # init 19 | for i in range(K): 20 | Q[a[i]] = c[i] 21 | N[a[i]] = 0 # counter 22 | 23 | # main loop 24 | for t in range(T): 25 | action = select_action(pi) # choose action by policy pi 26 | r[t] = bandit(action) # get reward 27 | N[a[i]] = N[a[i]] + 1 # update counter 28 | update(Q[a[i]]) # update reward estimation 29 | ``` 30 | 31 | - $Q(a_i)$ 是执行动作 $a_i$ 能带来的收益的估计值 32 | - $N_t(a_i) = \sum_{j=1}^t \mathbb{I}[A_j=a_i]$ 是 $t$ 时刻前选择动作 $a_i$ 的次数 33 | - 要解决的问题 34 | - 估计期望收益 35 | - 选择策略 $\pi$ 36 | 37 | ### 期望收益估计 38 | 39 | - 动作的价值的真实值是选择这个动作的期望收益 $q_*(a)=\mathbb{E}[r|A_t=a]$ 40 | - 该期望可以使用均值估计 41 | - 令 $Q_n(a_i)$ 表示动作$a_i$在被选择了 $n$ 次之后的估计收益,$R_j(a_i)$ 表示第 $j$ 次选择动作 $a_i$ 时获得的收益。 42 | 43 | $$ Q_n(a_i) = \frac{\sum_{j=1}^{n-1}R_{j}(a_i)}{n-1} $$ 44 | 45 | ### 贪心策略 46 | 47 | #### 贪心法 48 | 49 | 在每一步估计收益,并选择当前最优(带来收益最大)的动作。 50 | 51 | $$ Q_t(a) = \frac{\sum_{i=1}^{t-1}R_i \mathbb{I}[A_i=a]}{\sum_{i=1}^{t-1}\mathbb{I}[A_i=a]} $$ 52 | 53 | $$ a^* = \argmax_a Q_t(a) $$ 54 | 55 | #### $\epsilon$-贪心策略 56 | 57 | 
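以 $\epsilon$ 的概率随机探索,以 $1-\epsilon$ 的概率利用当前估计收益最大的动作,从而在贪心法的基础上保留一定的探索。下面是一个最小化的示意实现(假设 `Q` 是按动作下标存放估计收益的列表,对应上文伪代码中的 `Q[a[i]]`):

```python
import random

def select_action(Q, epsilon=0.1):
    if random.random() < epsilon:
        # 探索:均匀随机选择一个动作
        return random.randrange(len(Q))
    # 利用:选择当前估计收益最大的动作
    return max(range(len(Q)), key=lambda i: Q[i])
```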
#### 衰减贪心策略 58 | 59 | ### 乐观初始化 60 | -------------------------------------------------------------------------------- /Reinforcement Learning/03.MarkovDecisionProcess.md: -------------------------------------------------------------------------------- 1 | # 马尔可夫决策过程 2 | 3 | ## 随机过程 4 | 5 | > 随机过程是一个或多个事件、随即系统或者随机现象随时间发生演变的过程。 6 | 7 | ### 马尔可夫过程 8 | 9 | > 当前状态是未来的充分统计量 10 | 11 | $$ \mathbb{P}[S_{t+1}|S_1,\dots,S_t] = \mathbb{P}[S_{t+1}|S_t] $$ 12 | 13 | ## 马尔可夫决策过程 14 | 15 | 提供了一套为在结果部分随机、部分在决策者控制下的决策过程建模的数学框架 16 | 17 | $$\mathbb{P}[S_{t+1}|S_t,A_t]$$ 18 | 19 | ### 五元组 20 | 21 | 马尔可夫决策过程可以由一个五元组表示 22 | 23 | $$ (S,A,\{P_{sa}\},\gamma,R) $$ 24 | 25 | - $S$ 是状态的集合 26 | - $A$ 是动作的集合 27 | - $P_{sa}$ 是状态转移概率:对每个状态 $s$ 和动作 $a$,$P_{sa}$ 是下一个状态在 $S$ 中的概率分布 28 | - $\gamma \in [0, 1]$ 是对未来奖励的折扣因子 29 | - $R:S\times A\mapsto \mathbb{R}$ 是奖励函数 30 | - 有时只和状态有关 31 | 32 | ### MDP的动态 33 | 34 | - 从状态 $s_0$ 开始 35 | - 智能体选择某个动作 $a_0$ 36 | - 得到奖励 $R(s_0,a_0)$ 37 | - 随机转移到下个状态 $s_1 \sim P_{s_0a_0}$ 38 | - 不断执行上述过程,直到出现终止状态(也可以无限执行) 39 | - 总回报为 $\sum_t \gamma^t R(s_t,a_t)$ 40 | 41 | ## 占用度量 Occupancy Measure 42 | 43 | $$ \rho^{\pi}(s,a) = \mathbb{E}_{a\sim\pi(s),s'\sim p(s,a)}\left[ \sum_{t=0}^T\gamma^tp(s_t=s,a_t=a) \right] $$ 44 | 45 | ### 归一化的占用度量 46 | 47 | $$ \rho^{\pi}(s,a)' = (1-r)\rho^{\pi}(s,a) $$ 48 | 49 | - 归一化后占用度量的和为1 50 | - 表示(动作,状态)被访问到的概率 51 | 52 | 定理1 53 | : 和同一个动态环境交互的两个策略 $\pi_1$ 和 $\pi_2$ 得到的占用度量 $\rho^{\pi_1}$ 和 $\rho^{\pi_2}$ 满足 54 | 55 | $$ \rho^{\pi_1} = \rho^{\pi_2} \Leftrightarrow \pi_1 = \pi_2 $$ 56 | 57 | 定理2 58 | : 给定一个占用度量 $\rho$,可生成该占用度量的唯一策略是 59 | 60 | $$ \pi_p = \frac{\rho(s,a)}{\sum_{a'}\rho(s,a')} $$ 61 | 62 | ## 基于动态规划的强化学习 63 | 64 | - 选择能够最大化累计奖励期望的动作 65 | $$ \mathbb{E}[R(s_0) + \gamma R(s_1) + \gamma^2R(s_2)+ \cdots] $$ 66 | - $\gamma\in[0,1]$ 是未来奖励的折扣因子,使智能体更重视即时奖励 67 | - 给定一个特定策略 $\pi(s):S \mapsto A$ 68 | - 在状态 $s$ 采取策略 $a=\pi(s)$ 69 | - 给策略 $\pi$ 定义价值函数 70 | $$ V^{\pi}(s) = \mathbb{E}[R(s_0) + \gamma R(s_1) + \gamma^2R(s_2)+ \cdots \mid s_0=s, \pi] $$ 71 | 72 | ### 贝尔曼等式 Bellman Equation 73 | 74 | $$\begin{aligned} 75 | V^{\pi}(s) &= \mathbb{E}[R(s_0) + \gamma R(s_1) + \gamma^2R(s_2)+ \cdots \mid s_0=s, \pi]\\ 76 | &= \mathbb{E}[R(s_0) \mid s_0=s, \pi] + \gamma\mathbb{E}[R(s_1)+\gamma R(s_2) + \cdots \mid s_0=s, \pi]\\ 77 | &= R(s) + \gamma\sum_{s'\in S}P_{s\pi(s)}(s')V(s') 78 | \end{aligned}$$ 79 | 80 | ### 最优价值函数 81 | 82 | 对状态 $s$ 来说的最优价值函数是所有策略可获得的最大可能折扣奖励的和 83 | 84 | $$ V^*(s) = \max_{\pi}V^{\pi}(s) $$ 85 | 86 | #### 最优价值函数的Bellman等式 87 | 88 | $$ V^*(s) = R(s) + \max_{a\in A}\gamma\sum_{s' \in S}P_{sa}(s')V^*(s') $$ 89 | 90 | #### 最优策略 91 | 92 | $$ \pi^*(s) = \argmax_{a \in A}\sum_{s' \in S}P_{sa}(s')V^*(s') $$ 93 | 94 | - $ V^*(s) = V^{\pi^*}(s) \ge V^{\pi}(s) $ 95 | 96 | ### 价值迭代 Value Iteration 97 | 98 | ```python 99 | V = [0 for s in S] 100 | while not converged(): 101 | for s in S: 102 | V[s] = R[s] + maximize(gamma * sum([P[s][a](s_) * V[s_] for s_ in S])) 103 | ``` 104 | 105 | - 在计算中没有明确的策略 106 | 107 | #### 同步迭代 108 | 109 | - 存储两份值函数的拷贝 110 | $$ V_{new}(s) \leftarrow \max_{a\in A}\left( R(s) + \gamma\sum_{s'\in S}P_{sa}(s')V_{old}(s') \right) $$ 111 | 112 | $$ V_{old}(s) \leftarrow V_{new}(s) $$ 113 | 114 | #### 异步迭代 115 | 116 | $$ V_(s) \leftarrow \max_{a\in A}\left( R(s) + \gamma\sum_{s'\in S}P_{sa}(s')V(s') \right) $$ 117 | 118 | ### 策略迭代 Policy Iteration 119 | 120 | - 对于有限动作空间和状态空间的MDP 121 | - $|S|<\infty, |A|<\infty$ 122 | 123 | ```python 124 | pi = random_init() 125 | while not converged(): 126 | V = compute_v_pi() # computationally expensive 127 | for s in 
S: 128 | pi[s] = argmax(sum([P[s][a][s_] * V[s_] for s_ in S])) 129 | ``` 130 | 131 | ## 基于模型的强化学习 132 | 133 | - 在实际问题中,状态转移和奖励函数一般不是明确给出的 134 | - 但是我们可以观测到一些 episodes 135 | 136 | ### 学习MDP模型 137 | 138 | - 学习状态转移概率 139 | $$ P_{sa}(s') = \frac{\#在状态s下采取策略a转移到s'}{\#在状态s下采取策略a} $$ 140 | - 学习奖励函数 141 | $$ R(s) = \textrm{mean}_i(R(s)^{(i)}) $$ 142 | 143 | ```python 144 | pi = random_init() 145 | while not converged(): 146 | data = execute_and_collect_data(pi) 147 | P_sa, R = update(data) 148 | V = value_iteration() 149 | pi = greedy_update(V) 150 | ``` 151 | -------------------------------------------------------------------------------- /Reinforcement Learning/04.ModelFreeControl.md: -------------------------------------------------------------------------------- 1 | # 无模型控制方法 2 | 3 | > 估计 $Q(s,a)$ 4 | 5 | ## 模型无关的强化学习 Model-Free RL 6 | 7 | > 在没有明确给出状态转移和奖励函数的现实问题,模型无关的强化学习直接从经验中学习值和策略,而无需构建MDP 8 | 9 | - 关键步骤 10 | 1. 估计值函数 11 | 2. 优化策略 12 | 13 | ## 值函数估计 14 | 15 | ### 蒙特卡洛方法 Monte-Carlo Method 16 | 17 | - 无偏估计、高方差 18 | - 总折扣奖励 19 | $$ G_t =R_{t+1} + \gamma R_{t+2} + \cdots + \gamma^{T-1}R_T $$ 20 | - 值函数估计值 21 | $$\begin{aligned} 22 | V^{\pi}(s) &= \mathbb{E}[R(s_0)+\gamma R(s_1) + \cdots\mid s_0=s, \pi]\\ 23 | &= \mathbb{E}[G_t\mid s_t=s,\pi]\\ 24 | &\approx \frac{1}{N}\sum_{i=1}^N G_t^{(i)} 25 | \end{aligned}$$ 26 | - 其中 $G_t^{(i)}$ 是一次观测到状态 $s_t=s$ 起始的总折扣奖励 27 | 28 | #### 实现 29 | 30 | - 给定一段 episode,其中的每个时刻 $t$ 的状态 $s$ 都被访问 31 | 1. 计数器增量 $N(s) \leftarrow N(s)+1$ 32 | 2. 累计奖励增量 $S(s) \leftarrow S(s) + G_t$ 33 | 3. 估计价值函数 $V(s) \leftarrow S(s)/N(s)$ 34 | 4. 更新策略 $\pi$ 35 | 36 | ### 重要性采样 Importance Sampling 37 | 38 | - 估计一个不同分布的期望 39 | $$\begin{aligned} 40 | \mathbb{E}_{x\sim p}[f(x)] &= \int_x p(x)f(x)\mathrm{d}x\\ 41 | &= \int_x q(x)\frac{p(x)}{q(x)}f(x)\mathrm{d}x\\ 42 | &= \mathbb{E}_{x\sim q}\left[ \frac{p(x)}{q(x)}f(x) \right] 43 | \end{aligned}$$ 44 | - 使用从一个分布中采到的样本估计另一个分布上的期望 45 | - 将每个实例的权重重新分配为 $\beta(x)=p(x)/q(x)$ 46 | - 但是 $p(x)$ 和 $q(x)$ 都是概率,因此 $\beta$ 有时候会很大,有时候会很小 47 | - 导致权重小的数据点被权重大的数据点支配 48 | 49 | #### 重要性采样的值函数估计 50 | 51 | - 从独立同分布的原则出发,在完成一次策略更新后,之前所有依据旧策略得到的数据都不再能运用到后续训练中 52 | - 因此可以根据两个策略之间的重要性比率对累计奖励进行加权 53 | - 使用策略 $\mu$ 产生的累计奖励评估策略 $\pi$ 54 | 55 | $$ G_t^{\pi/\mu} = \frac{\pi(a_t|s_t)}{\mu(a_t|s_t)}\frac{pi(a_{t+1}|s_{t+1})}{\mu(a_{t+1}|s_{t+1})}\cdot\cdots G_t $$ 56 | 57 | - 方差可能非常大 58 | 59 | #### 时序差分 60 | 61 | - 根据重要性采样对时序差分目标 $r + \gamma V(s')$ 加权 62 | 63 | $$ V(s_t)\leftarrow V(s_t) + \alpha\left( \frac{\pi(a_t|s_t)}{\mu(a_t|s_t)}\left( r_{t+1} + \gamma V(s_{t+1}) - V(s_t) \right) \right) $$ 64 | 65 | - 相比蒙特卡洛方法,方差更小 66 | 67 | ### 时序差分学习 68 | 69 | - 模型无关学习 70 | - 低方差、有偏估计 71 | - 能够通过 bootstrap 从不完整的片段中学习 72 | 73 | $$ V(S_t) \leftarrow V(S_t) + \alpha(R_{t+1} + \gamma V(S_{t+1})- V(S_t)) $$ 74 | 75 | #### 多步时序差分学习 76 | 77 | - $n$ 步累计奖励 $G_t^{(n)} = R_{t+1} + \gamma R_{t+2} + \cdots + \gamma^{n-1}R_{t+n} + \gamma^n V(S_{t+n})$ 78 | 79 | ## 在线策略 On-Policy 与 离线策略 Off-Policy 80 | 81 | - 在线策略 82 | - 得到策略后可以直接开始使用 83 | - 学习时使用当前策略采集到的数据 84 | - 不需要采样 85 | - 使用的数据也全是符合自己采用的策略的 86 | - 但是需要耗费大量资源 87 | - 因为数据全是自己采样的 88 | - 例如 $\epsilon$-Greedy 89 | - 离线策略 90 | - 使用一个行为策略收集数据 91 | - 使用另一个目标策略估计动作价值函数 92 | - 例如 Q-Learning 93 | 94 | ### 为什么使用离线策略学习 95 | 96 | - 平衡探索和利用 97 | - 重用旧策略产生的经验 98 | - 探索策略时学习最优策略 99 | 100 | ## SARSA 101 | 102 | ### 策略评估 103 | 104 | - 对当前策略执行的每个 $(S,A,R,S',A')$ 五元组 105 | 106 | $$ Q(s,a) \leftarrow Q(s,a) + \alpha (r + \gamma Q(s',a') - Q(s,a)) $$ 107 | 108 | ### 策略改进 109 | 110 | - $\epsilon$-Greedy 策略改进 111 | 112 | ## Q-Learning 113 | 114 | - 
根据行为策略选择动作 $a_t \sim \mu(a|s_t)$ 115 | - 根据目标策略选择后续动作 $a_{t+1}' \sim \pi(a|s_t)$ 116 | 117 | $$ Q(s,a) \leftarrow Q(s_t,a_t) + \alpha (r_t + \gamma Q(s_{t+1},a_{t+1}') - Q(s_t,a_t)) $$ 118 | 119 | - 允许行为策略和目标策略都进行改进 120 | - 目标策略 $\pi$ 是关于 $Q(s,a)$ 的贪心策略 121 | - 行为策略 $\mu$ 是关于 $Q(s,a)$ 的 $\epsilon$-贪心策略 122 | 123 | Q-Learning 的目标函数可以简化为 124 | 125 | $$ r_{t+1} + \gamma Q(s_{t+1},a_{t+1}') = r_{t+1} + \gamma\max_{a_{t+1}'}Q(s_{t+1},a_{t+1}') $$ 126 | 127 | ## 多步时序差分 128 | 129 | > MC观测了完整序列,获得对值函数的无偏估计 130 | > TD观测了单步序列,获得对值函数的有偏估计 131 | > 多步时序差分是两者的折中 132 | 133 | ### $n$ 步累计奖励 134 | 135 | $$ G_t^{(n)} = R_{t+1} + \gamma R_{t+2} + \cdots + \gamma^{n-1} R_{t+n} + \gamma^n V(S_{t+n}) $$ 136 | 137 | ### $n$ 步时序差分学习 138 | 139 | $$ V(S_t) = V(S_t) + \alpha (G_t^{(n)} - V(S_t)) $$ 140 | -------------------------------------------------------------------------------- /Reinforcement Learning/05.ProgrammingAndLearning.md: -------------------------------------------------------------------------------- 1 | # 规划与学习 2 | 3 | ## 回顾:策略评估与策略提升 4 | 5 | ### 值函数估计 6 | 7 | $$ V^{\pi}(s) = \mathbb{E}_{a\sim\pi(s)}[Q^{\pi}(s,a)] $$ 8 | 9 | $$ Q^{\pi}(s,a) = R(s,a) + \gamma \sum_{s'} P_{s\pi(s)}(s')V^{\pi}(s') $$ 10 | 11 | ### 策略提升 12 | 13 | 对于两个策略 $\pi$ 和 $\pi'$,如果满足如下条件 14 | 15 | $$ \forall s \quad Q^{\pi}(s, \pi'(s)) \ge V^{\pi}(s) $$ 16 | 17 | 则 $\pi'$ 是 $\pi$ 的策略提升 18 | 19 | 特别地,如果 20 | 21 | 1. 在某个状态下,两个策略输出不同,且 $Q^{\pi}(s,\pi'(s)) > Q^{\pi}(s,\pi(s))=V^{\pi}(s)$ 22 | 2. 在其他所有状态下,两个策略输出相同 23 | 24 | 则 $\pi'$ 是 $\pi$ 地一种策略提升 25 | 26 | #### 策略提升定理 27 | 28 | 如果 $\pi'$ 是 $\pi$ 的策略提升,则对任何状态 29 | 30 | $$ V^{\pi'}(s) \ge V^{\pi}(s) $$ 31 | 32 | 因此有一个策略 $\pi$后,可以迭代地更新策略,使新策略的价值更高 33 | 34 | 1. 评估策略价值 $V^{\pi}(s)$ 35 | 2. 寻找新策略 $\pi'$ 使 $\pi'$ 满足策略提升 36 | 37 | ## 规划与学习入门 38 | 39 | ### 模型 40 | 41 | 给定一个状态和动作,模型能够预测下一个状态和奖励的分布,即 $\mathbb{P}[s',r|s,a]$ 42 | 43 | #### 分类 44 | 45 | - 白盒模型(分布模型) 46 | - 描述了轨迹的所有可能性及其概率 47 | - 黑盒模型(样本模型) 48 | - 根据概率进行采样,只产生一条可能的轨迹 49 | 50 | #### 作用 51 | 52 | 得到模拟的模拟数据(simulated experience) 53 | 54 | ### 规划 55 | 56 | 输入一个模型,输出一个策略的搜索过程 57 | 58 | #### 分类 59 | 60 | - 状态空间的规划 61 | - 在状态空间搜索最佳策略 62 | - 规划空间的规划 63 | - 在规划空间搜索最佳策略 64 | - 此时,一个规划就是一个动作集合以及动作顺序的约束 65 | - 此时的状态是一个规划,目标状态是能完成任务的规划 66 | 67 | #### 通用框架 68 | 69 | - 通过模型得到模拟数据 70 | - 利用模拟数据更新值函数,从而改进策略 71 | 72 | ### Q-Planning 73 | 74 | #### 与学习的异同 75 | 76 | - 不同点 77 | - 规划使用模型产生的模拟经验 78 | - 学习使用环境产生的真实经验 79 | - 相同点 80 | - 通过回溯更新值函数估计 81 | - 学习的方法也可以用在模拟经验上 82 | 83 | #### 框架 84 | 85 | - 重复以下步骤 86 | 1. 随机选择状态 $s$ 和动作 $a$ 87 | 2. 将 $s$ 和 $a$ 输入采样模型,获得奖励 $r$ 和 $s'$ 88 | 3. 根据 $(s,a,r,s')$ 进行 Q学习 89 | 90 | 环境模型是否准确会影响模型的实际性能,因此通常需要应用在环境非常好学习的应用场景中 91 | 92 | ## Dyna 算法 93 | 94 | > ウルトラマンダイナ 95 | 96 | Dyna算法使用经验更新值函数和策略,同时更新模型 97 | 98 | ### 框架 99 | 100 | - 和环境交互产生真实经验 101 | - 使用真实经验直接强化学习更新 102 | - 同时使用真实经验学习模型 103 | - 使用模型产生的模拟经验规划更新策略 104 | 105 | ### 算法 106 | 107 | - 初始化值函数 $Q(s,a)$ 和模型 $Model(s,a)$ 108 | - 重复以下步骤 109 | 1. $s$ 更新为当前非终止状态 110 | 2. $a = greedy(s, Q)$ 111 | 3. 执行动作 $a$,获得奖励 $r$ 和 新状态 $s'$ 112 | 4. 使用 $Q$-Learning 更新值函数 $Q$ 113 | 5. 使用 $r, s'$ 更新模型 114 | 6. 重复以下步骤 $n$ 次 115 | 1. 随机采样之前观测到的状态 $s$(每轮新循环重新采样,不是在上一步基础上采样) 116 | 2. 随机采样在 $s$ 状态下做过的状态 $a$ 117 | 3. 根据模型 $Model(s,a)$ 获得 $r$, $s'$ 118 | 4. 
使用 $Q$-Planning 规划更新值函数 119 | 120 | ### 小结 121 | 122 | - 综合了规划、决策和学习 123 | - 在环境模型比较准确的情况下,可以较多地依赖规划($n$ 选大一点) 124 | - 否则可以选择较小的 $n$,在训练过程中学习到较准确的环境模型 125 | 126 | ### 应对环境变化 127 | 128 | #### 模型可能不准确 129 | 130 | - 环境是随机的,但是我们只观测到了有限的样本 131 | - 模型使用了泛化性不好的环境估计 132 | - 环境改变了,但是模型没有发现环境改变 133 | 134 | #### Dyna-Q+ 135 | 136 | - 应对环境不准确或环境在变化的情况 137 | - 将奖励从 $r$ 更改为 $r+\mathcal{K}\sqrt{\tau}$ 138 | - $r$: 原本的奖励 139 | - $\mathcal{K}$: 权重参数,通常较小 140 | - $\tau$: 某个状态多久没有到达过了 141 | - 鼓励模型探索一些长时间没到达的状态 142 | 143 | !!!note Reward Shaping 144 | 对 Reward 进行一些修改以达到一些特定的目的(例如让模型尽可能探索未知状态等)。但是这种修改需要满足一些特定条件,使最终学到的策略和本来的策略一致。 145 | 146 | - 例如 $1/n_s$ 可以鼓励智能体在行动时尽可能覆盖到所有状态 147 | 148 | ## :robot:采样方法 149 | 150 | > 通过模拟场景让智能体更快地学到一些真实环境里不容易见到的经验 151 | 152 | ### 优先级采样 153 | 154 | 定义优先级队列,其中优先级 $P$ 定义为 155 | 156 | $$ P \leftarrow |R + \gamma\max_a Q(s',a) - Q(s,a)| $$ 157 | 158 | #### 使用优先级采样的Dyna-Q 159 | 160 | - 初始化值函数 $Q(s,a)$ 、模型 $Model(s,a)$和优先级队列 $Pq$ 161 | - 重复以下步骤 162 | 1. $s$ 更新为当前非终止状态 163 | 2. $a = \epsilon greedy(s, Q)$ 164 | 3. 执行动作 $a$,获得奖励 $r$ 和 新状态 $s'$ 165 | 4. 更新优先级 $P$ 166 | 5. 如果 $P > \theta$,则将 $(s,a)$ 以优先级 $P$ 插入 $Pq$ 167 | 6. 重复以下步骤 $n$ 次 168 | 1. $(s,a)$ 为优先级队列 $Pq$ 首个元素 169 | 2. 根据模型 $Model(s,a)$ 获得 $r$, $s'$ 170 | 3. 更新 $Q$ 171 | 4. 对于所有能够到达 $s$ 的 $\bar{s}$, $\bar{a}$ 172 | 1. 令 $\bar{r}$ 为模型对于 $\bar{s},\bar{a},s$ 预测的奖励 173 | 2. $P = |\bar{r} + \gamma\max_a Q(s,a) - Q(\bar{s},\bar{a})|$ 174 | 3. 若 $P > \theta$,则将 $(\bar{s}, \bar{a})$ 以优先级 $P$ 插入 175 | 176 | ### 期望更新和采样更新 177 | 178 | - 期望更新 179 | - $Q(s,a) = \sum_{s',r} \mathbb{P}[s',r|s,a](r + \gamma\max_{a'}Q(s',a'))$ 180 | - 需要分布模型 181 | - 需要更大计算量 182 | - 没有偏差,更准确 183 | - 采样更新 184 | - $Q(s,a) =Q(s,a) + \alpha [r + \gamma\max_{a'}Q(s',a') - Q(s,a)] $ 185 | - 只需要采样模型 186 | - 计算量需求更低 187 | - 受采样误差影响,准确率相对较低 188 | 189 | ## 决策时规划 190 | 191 | ### 实时动态规划 RTDP 192 | 193 | - 只考虑决策时会涉及到的状态 194 | - 不考虑不会访问到的状态 195 | 196 | ### 决策时 197 | 198 | - 背景规划 199 | - 为了更新很多状态值供后续动作选择 200 | - 例如动态规划、Dyna-Q 201 | - 决策时规划 202 | - 只着眼于当前状态选择的动作 203 | - 在不需要快速反应的应用中很有效 204 | -------------------------------------------------------------------------------- /Reinforcement Learning/07.DeepRL.md: -------------------------------------------------------------------------------- 1 | # 深度强化学习 2 | 3 | ## 深度Q网络 DQN 4 | 5 | - 直观想法 6 | - 使用神经网络拟合 $Q_\theta(s,a)$ 7 | - 不work 8 | - 连续采样的 $s_t,a_t,s_{t+1},r_t$ 不满足独立同分布 9 | - $Q_\theta(s,a)$ 频繁更新 10 | - 解决方法 11 | - 经验回放 12 | - 双网络结构 13 | - 评估网络 Evaluation Network 14 | - 目标网络 Target Network 15 | 16 | ### 经验回放 17 | 18 | - 存储训练过程中的每一步 $e_t = (s_t,a_t,s_{t+1},r_t)$ 19 | - 从训练数据中采样,采样时服从均匀分布 20 | 21 | #### 优先经验回放 22 | 23 | 以 Q 函数的值与目标值的差异来衡量学习的价值 24 | 25 | $$ p_t = |r_t + \gamma\max_{a'}Q_\theta(s_{t+1},a')-Q_\theta(s_t,a_t)| $$ 26 | 27 | - 为了保证每个样本都有机会被采样,存储时使用 $p_t + \varepsilon$ 28 | 29 | 选择样本的概率 30 | 31 | $$ P(t) = \frac{p_t^\alpha}{\sum_kp_k^\alpha} $$ 32 | 33 | - 重要性采样 34 | - $\omega_t = \frac{(N\times P(t))^{-\beta}}{\max_i \omega_i}$ 35 | 36 | #### 目标网络 37 | 38 | - 使用较旧的参数 $\theta_-$,每隔 $C$ 步和训练网络的参数同步一次 39 | - 让目标网络的更新慢一些,防止新数据的偏差导致训练不稳定 40 | 41 | $$L_i(\theta_i) = \mathbb{E}[\frac{1}{2}\omega_t(r_t + \gamma\max_{a'}Q_{\theta_-}(s_{t+1}, a') - Q_{\theta_i}(s_t,a_t))^2]$$ 42 | 43 | - 其中 $r_t + \gamma\max_{a'}Q_{\theta_-}(s_{t+1}, a')$ 为目标价值函数 44 | 45 | ## Double DQN 46 | 47 | ### DQN的过估计 48 | 49 | - $y_t = r_t + \gamma\max_{a'}Q_{\theta_-}(s_{t+1}, a')$ 50 | - $\max$ 操作使 $Q$ 函数值越来越大甚至高于真实值 51 | 52 | #### 过估计的原因 53 | 54 | - 对随机变量 $X_1,X_2$,有 $\mathbb{E}[\max(X_1,X_2)] \ge \max(\mathbb{E}[X_1], \mathbb{E}[X_2]) $ 55 | 56 | 于是 
57 | 58 | $$ \begin{aligned} 59 | \max Q_{\theta'} &= Q_{\theta'}(s_{t+1}, \arg\max_{a'}Q_{\theta'}(s_{t+1},a'))\\ 60 | &= \mathbb{E}[R|s_{t+1}, \arg\max_{a'}Q_{\theta'},\theta']\\ 61 | &\ge \max(\mathbb{E}[R|s_{t+1},a_1,\theta'], \mathbb{E}[R|s_{t+1},a_2,\theta'],\dots) 62 | \end{aligned} $$ 63 | 64 | - 直觉上 65 | - 选择策略和评估策略来自同一个网络,因此该网络会更倾向于选择自己错误地过估计的动作 66 | - 随着候选行动数量的增加而越发严重 67 | - 在Q-Table方法中也存在,但是在引入DQN之后情况更加严重 68 | 69 | ### Double DQN架构 70 | 71 | - 使用不同的网络来估计价值和选择决策 72 | 73 | $$ y_t = r_t + \gamma Q_{\theta'}(s_{t+1}, \arg\max_{a'}Q_\theta(s_{t+1}, a')) $$ 74 | 75 | - 注意到DQN中本身就有两套网络,因此可以直接使用DQN中的评估网络来选择策略 76 | 77 | ## Dueling DQN 78 | 79 | - 假设动作值函数服从某个分布 $Q(s,a)\sim\mathcal{N}(\mu,\sigma)$ 80 | - $V(s)=\mathbb{E}[Q(s,a)] = \mu$ 81 | - $Q(s,a) = \mu + \varepsilon(s,a)$ 82 | - $\varepsilon(s,a)$ 是偏移量 83 | 84 | ### 优势函数 85 | 86 | 定义优势函数 Advantage Function 87 | 88 | $$ A^{\pi}(s,a) \triangleq \varepsilon(s,a) = Q(s,a) - V(s) $$ 89 | 90 | ### 网络架构 91 | 92 | ```mermaid 93 | graph LR 94 | DNN-->Hidden1 95 | DNN-->Hidden2 96 | Hidden1-->Vs 97 | Hidden2-->Asa 98 | Vs-->merge((+)) 99 | Asa-->merge 100 | merge-->Qsa 101 | ``` 102 | 103 | #### 优势函数聚合形式 104 | 105 | $$ Q(s,a) = V(s) + (A(s,a) - \max_{a'}A(s,a')) $$ 106 | 107 | - 强制最优动作的优势函数输出为0 108 | - 解决神经网络对 $A$ 和 $V$ 建模不稳定的问题 109 | 110 | 或 111 | 112 | $$ Q(s,a) = V(s) + (A(s,a) - \frac{1}{|A|}\sum_{a'}A(s,a')) $$ 113 | -------------------------------------------------------------------------------- /Reinforcement Learning/08.DeepPolicyNetworks.md: -------------------------------------------------------------------------------- 1 | # 深度策略评估网络 2 | 3 | ## 基于神经网络的策略梯度 4 | 5 | ### 策略网络的梯度 6 | 7 | $$ \nabla J(\theta) = \mathbb{E}_{\pi_\theta}\left[ \frac{\partial\log\pi_\theta(a|s)}{\partial\theta}Q_{\pi_\theta}(s,a) \right] $$ 8 | 9 | 在使用Softmax输出层时 10 | 11 | $$ \frac{\partial\log\pi_\theta}{\partial\theta} = \frac{\partial f_\theta(s,a)}{\partial\theta} - \mathbb{E}_{a'\sim\pi_\theta(a'|s)}\left[ \frac{\partial f_\theta(s,a')}{\partial\theta} \right] $$ 12 | 13 | 故 14 | 15 | $$ \nabla J(\theta) = \mathbb{E}_{\pi_\theta}\left[\left( \frac{\partial f_\theta(s,a)}{\partial\theta} - \mathbb{E}_{a'\sim\pi_\theta(a'|s)}\left[ \frac{\partial f_\theta(s,a')}{\partial\theta} \right] \right) Q_{\pi_\theta}(s,a) \right] $$ 16 | 17 | #### 策略梯度与Q学习的对比 18 | 19 | - Q学习算法学习一个以$\theta$为参数的Q函数 $Q_\theta$ 20 | - 优化目标为最小化TD误差 21 | - 策略梯度学习一个以$\theta$为参数的策略 $\pi_\theta$ 22 | - 优化目标直接为策略的价值 23 | - 比Q学习更加直接 24 | 25 | ## Asynchronous Advantageous Actor-Critic A3C 26 | 27 | $$ \nabla_{\theta'} \log\pi(a_t|s_t;\theta')A(s_t,a_t;\theta_v) $$ 28 | 29 | $$ A(s_t,a_t;\theta_v) = \sum_{i=0}^{k-1}\gamma^ir_{t+i} + \gamma^kV(s_{t+k};\theta_v) - V(s_t;\theta_v) $$ 30 | 31 | ### A3C 架构 32 | 33 | - Asynchronous 表现在该算法采用分布式计算 34 | - Worker独立地计算各自的梯度并push到Server 35 | - 然后从Server端pull最新的参数继续更新 36 | - 可以在大规模计算集群上展开训练 37 | 38 | > “交换机非常重要” 39 | > 万 兆 交 换 机 40 | 41 | ```mermaid 42 | flowchart TB 43 | subgraph Worker1 44 | direction BT 45 | Input1-->Network1 46 | Network1-->Policy1 47 | Network1-->Vs1 48 | end 49 | Worker1-->GlobalNetwork 50 | GlobalNetwork-->Worker1 51 | Worker1-->Env1 52 | Env1-->Worker1 53 | subgraph Worker2 54 | direction BT 55 | Input2-->Network2 56 | Network2-->Policy2 57 | Network2-->Vs2 58 | end 59 | Worker2-->GlobalNetwork 60 | GlobalNetwork-->Worker2 61 | Worker2-->Env2 62 | Env2-->Worker2 63 | subgraph Worker3 64 | direction BT 65 | Input3-->Network3 66 | Network3-->Policy3 67 | Network3-->Vs3 68 | end 69 | Worker3-->GlobalNetwork 70 | GlobalNetwork-->Worker3 
71 | Worker3-->Env3 72 | Env3-->Worker3 73 | subgraph GlobalNetwork 74 | direction BT 75 | Input-->Network 76 | Network-->Policy 77 | Network-->Vs 78 | end 79 | ``` 80 | 81 | ## 确定性策略梯度 82 | 83 | - 随机策略 84 | - 输出的策略是概率,根据概率采样 85 | - 确定性策略 86 | - 对于离散动作,$\arg\max_{a}Q_\theta(s,a)$(不可微) 87 | - 对于连续动作,$a=\pi(s;\theta)$ 88 | 89 | ### 确定性策略的Actor-Critic 90 | 91 | - Critic没有变化,因为Critic只需要关心Actor得到的$Q(s,a)$ 92 | 93 | $$ Q^w(s,a) \simeq Q^\pi(s,a) $$ 94 | 95 | $$ L(w) = \mathbb{E}_{s\sim p^\pi,a\sim\pi_\theta}[(Q^w(s,a)-Q^\pi(s,a))^2] $$ 96 | 97 | #### 确定性策略梯度定理 98 | 99 | $$ J(\pi_\theta) = \mathbb{E}_{s\sim p^{\pi}}[Q^w(s,a)] $$ 100 | 101 | $$ \nabla J = \mathbb{E}_{s\sim p^\pi}[\nabla_\theta \pi_\theta(s)\nabla_a Q^w(s,a)|_{a=\pi_\theta(s)}] $$ 102 | 103 | ### 深度确定性策略梯度 DDPG 104 | 105 | - 对于确定性策略的梯度 106 | 107 | $$ \nabla_\theta J(\pi_\theta) = \mathbb{E}_{s\sim p^\theta}[\nabla_\theta \pi_\theta(s)\nabla_a Q^w(s,a)|_{a=\pi_\theta(s)}] $$ 108 | 109 | - 实际应用中,如果使用神经网络拟合,在面对有挑战性的问题时是不稳定的 110 | - DDPG给出了一系列对应的解决方案 111 | - 经验重放 112 | - 目标网络 113 | - 批标准化 Batch Normalization 114 | - 添加连续噪声 115 | -------------------------------------------------------------------------------- /Reinforcement Learning/09.ModelBasedDRL.md: -------------------------------------------------------------------------------- 1 | # Model-Based Deep Reinforcement Learning 2 | 3 | ## Q-Planning and Dyna 4 | 5 | ### Q-Planning 6 | 7 | 1. 从经验数据学习一个(环境)模型 $p(s', r|s,a)$ 8 | 2. 从模型中采样以训练模型 9 | 10 | Q-Planning同样产生一个四元组 $(s,a,r,s')$,但是其中 $s,a$ 是和真实环境交互的真实(历史)数据、$r,s'$ 是学习到的模型生成的虚假数据 11 | 12 | ```js 13 | s = sample_state() 14 | a = sample_action(s) 15 | 16 | r, s_ = model(s, a) 17 | 18 | /* One-step Q-function update */ 19 | ``` 20 | 21 | ### Dyna-Q 22 | 23 | ## Shooting Methods 24 | 25 | 1. “凭空产生”一组长度为 $T$ 的行动 $[a_0,\dots,a_T]$ ($a_0$是初始状态) 26 | 2. 用该行动序列和环境交互,得到一组轨迹 $[s_0,a_0,\hat{r}_0,\hat{s}_1,a_1,\hat{r}_0,\dots,\hat{s}_T,a_T,\hat{r}_T]$ 27 | 3. 
选择预期价值最高的动作序列 $\pi(s) = \arg\max_a \hat{Q}(s,a)$ 28 | 29 | ### Random Shooting 30 | 31 | - 纯随机采样动作序列 32 | - 优势 33 | - 实现简单 34 | - 计算开销小 35 | - 问题 36 | - 高方差 37 | - 可能采不到高回报的动作序列 38 | 39 | ### Cross Entropy Method CEM 40 | 41 | - 根据已知的会产生较高回报的动作序列 42 | - 在新采样时使用接近该高回报序列的分布 43 | 44 | ### Probabilistic Ensembles with Trajectory Sampling PETS 45 | 46 | $$ loss_P(\theta) = -\sum_{n=1}^N \log\tilde{f}_{\theta}(s_{n+1}|s_n,a_n) $$ 47 | 48 | $$ \tilde{f} = P(s_{t+1}|s_t,a_t) = \mathcal{N}(\mu_\theta(s_t,a_t), \Sigma_\theta(s_t,a_t)) $$ 49 | 50 | - 假设 $P(s'|s,a)$ 是一个高斯过程,且使用神经网络拟合该过程 51 | - 集成多个拟合的模型 52 | 53 | ## Branched Rollout Method 54 | 55 | ### Branched Rollout 56 | 57 | - 从历史数据中,根据状态的分布采样一个历史起点 58 | - 从历史起点跑 $k$ 步模拟 59 | - Dyna算法可以看作 $k=1$ 的 Rollout 60 | 61 | ### Model-Based Policy Optimization MBPO 62 | 63 | ## BMPO and AMPO 64 | 65 | ### Bidirectional Model BMPO 66 | 67 | - 除了向后推演外,双向模型同时向前推演(TENET.jpg) 68 | - 在同样长度的情况下,Compouding Error更小 69 | - 但是需要额外的后向模型 70 | - 后向模型 $q_\theta'(s|s',a)$ 71 | - 后向策略 $\tilde{\pi}_\phi(a|s')$ 72 | 73 | ### AMPO 74 | 75 | #### Distribution Mismatch in MBRL 76 | 77 | - 环境的真实数据和模型模拟的数据之间的分布存在误差 78 | - 这是 Compounding Model Error 的来源 79 | 80 | ##### Alleviations 81 | 82 | - 学习过程中 83 | - 设计不同的损失函数和模型架构 84 | - 让rollout更拟真 85 | - 使用模型时 86 | - 设计比较合理的rollout情景 87 | - 在误差造成问题之前停止rollout 88 | - 但是问题仍然存在 89 | -------------------------------------------------------------------------------- /Reinforcement Learning/10.ImitationLearning.md: -------------------------------------------------------------------------------- 1 | # .模仿学习 2 | 3 | ## 绪论 4 | 5 | - 通常,提供专家数据比定义合理的汇报函数更容易一些 6 | 7 | ### What is Imitation Learning 8 | 9 | - Train models when the reward funtion is not defined 10 | - Learning from Demostration LfD 11 | - Try to imitate from the expert demonstrations 12 | 13 | #### Problem Setting 14 | 15 | - Can obtain pre-collected trajectories ($(s,a)$-pairs) from experts 16 | - Can (or sometimes cannot) interact with the environment 17 | - Cannot access direct reward signals 18 | 19 | ### Basics 20 | 21 | #### Trajectory Distribution 22 | 23 | $P(\tau|\pi)$: distribution of trajectories induced by a policy 24 | 25 | 1. Sample $s_0$ from initial distribution over states $\rho_0$ 26 | 2. Sample action $a_t$ from $\pi$ 27 | 3. 
Sample next state $s_{t+1}$ from the environment 28 | 29 | #### Occupancy Measure 30 | 31 | Distribution of state-action pairs induced by a policy 32 | 33 | $$ \rho_\pi(s,a) = \rho_\pi(s)\pi(a|s) $$ 34 | 35 | where 36 | 37 | $$ \rho_\pi(s) = (1-\gamma)\sum_{t=0}^\infty \gamma^tP(s_t=s|\pi) $$ 38 | 39 | ## Methods 40 | 41 | ### General Imitation Learning 42 | 43 | #### Objective 44 | 45 | $$ \pi^\ast = \arg\min_\pi \mathbb{E}_{s\sim \rho_\pi^s}[l(\pi(\cdot|s), \pi_E(\cdot|s))] $$ 46 | 47 | ### Behaviour Cloning 48 | 49 | $$ \pi^\ast = \arg\min_\pi \mathbb{E}_{s\sim \rho_{\pi_E}^s}[l(\pi_E(\cdot|s), \pi(\cdot|s))] $$ 50 | 51 | - Essentially an MLE on each single step 52 | 53 | #### Limitations 54 | 55 | - Distributional shift 56 | - When $\pi_\theta$ takes a wrong action and starts to diverge from the expert, the induced state distribution differs from that of the expert 57 | - The agent cannot recover from errors, so the problem compounds 58 | - No long-term planning 59 | 60 | #### Advantages 61 | 62 | - Simple 63 | - Efficient 64 | 65 | ##### Can be Used For 66 | 67 | - When 1-step deviations are not very bad 68 | - When learning reactive behaviours 69 | - When expert trajectories cover the entire space 70 | 71 | #### Dataset Aggregation DAgger 72 | 73 | - The agent can interact with the environment using its learned policy 74 | - Experts then label the generated trajectory to provide a partial dataset 75 | - 'Partial' means that if the agent deviates from the expert trajectory, the remaining expert data cannot be used, since the agent is now in a different state 76 | 77 | ### Inverse Reinforcement Learning 78 | 79 | Learn a reward function $r^\ast$ from expert datasets. 80 | 81 | Once $r^\ast$ is learned, a policy can be trained with the reward function 82 | 83 | $$ \pi^\ast = \arg\max_\pi \mathbb{E}_{(s,a)\sim\rho_\pi}[r^\ast(s,a)] $$ 84 | 85 | #### Learning Reward Function 86 | 87 | The principle is that the expert should be optimal, i.e., the expert trajectory should achieve the highest value under the learned reward function 88 | 89 | $$ r^\ast = \arg\max_r \mathbb{E}\left[ \sum_{t=0}^\infty \gamma^tr(s,a)|\pi^\ast \right] - \mathbb{E}\left[ \sum_{t=0}^\infty \gamma^tr(s,a)|\pi \right] $$ 90 | 91 | - Want the reward of the expert to be high, and the reward of others to be low 92 | 93 | The training process is usually implemented as a nested loop 94 | 95 | - Outer loop: find $r$ 96 | - Inner loop: train policy $\pi$ with $r$ 97 | - Check if $V(\pi^\ast) - V(\pi)$ is minimized 98 | 99 | However, this formulation is ambiguous: the solution $r$ may not be unique.
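For reference, the nested loop described above could be organized as in the following minimal sketch (every helper name here is a hypothetical placeholder, not part of a specific IRL algorithm):

```python
def irl(expert_trajs, n_outer=50):
    r = init_reward()            # hypothetical: initialize reward parameters
    for _ in range(n_outer):
        pi = solve_rl(r)         # inner loop: train a policy against r
        # outer loop: widen the value gap between expert and learner
        gap = value(expert_trajs, r) - value(rollout(pi), r)
        r = update_reward(r, gap)
    return r
```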
100 | 101 | #### Max-Entropy RL Formulation 102 | 103 | - Learns a more diverse strategy 104 | - More chances to explore the environment 105 | - More robust 106 | - Resolves the ambiguity in IRL 107 | 108 | ### Generative Adversarial Imitation Learning 109 | 110 | > Idea: the expert and policy occupancy measures can play the roles of the real and generated distributions in a GAN 111 | > It has also been proved that the dual of IRL in the MaxEnt setting is equivalent to GAIL 112 | 113 | $$ \min_\pi\max_D \mathbb{E}_{\pi_E}[\log D(s,a)] + \mathbb{E}_\pi[\log(1-D(s,a))] - \lambda H(\pi) $$ 114 | 115 | - Uses a discriminator $D$ to determine whether a state-action pair comes from the expert or from the agent 116 | -------------------------------------------------------------------------------- /Reinforcement Learning/11.OfflineRL.md: -------------------------------------------------------------------------------- 1 | # Offline Reinforcement Learning 2 | 3 | ## Overview 4 | 5 | - Training an RL agent in a real environment can sometimes be costly or risky 6 | - **Offline RL** is a method of training the agent only from a pre-collected offline dataset 7 | - The agent only has access to the offline dataset and cannot interact with the environment during training 8 | 9 | ### Extrapolation Error 10 | 11 | $$ Q(s,a) \leftarrow (1-\alpha)Q(s,a) + \alpha(r + \gamma\max_{a'}Q(s',a')) $$ 12 | 13 | - Extrapolation error is introduced by the mismatch between the dataset and the true state-action visitation of the current policy 14 | - The Q-function may encounter an unforeseen $(s',a')$ 15 | 16 | ## Methods 17 | 18 | ### Batch-Constrained Q-Learning BCQ 19 | 20 | Even when agents use the same training data, their behaviour may vary greatly. Whenever the offline policy deviates from the behaviour policy that collected the data, the same input data may lead to different training results 21 | 22 | #### Constraining Batches 23 | 24 | $$ Q(s,a) = (1-\alpha)Q(s,a) + \alpha(r + \gamma\max_{a'\in\mathcal{B}}Q(s',a')) $$ 25 | 26 | - Constrain $a'$ to be some action in the current batch of input data 27 | 28 | #### VAE-Based Action Generation 29 | 30 | - Use a VAE to generate actions that would occur in the real environment with high probability 31 | 32 | $$ \pi(s) = \arg\max_{a_i+\xi_\phi(s, a_i, \Phi)} Q_\theta(s,a_i+\xi_\phi(s, a_i, \Phi)) $$ 33 | 34 | - $a_i$ are candidate actions sampled from the VAE, and $\xi_\phi(s, a_i, \Phi)$ is a learned perturbation bounded by $\Phi$ 35 | 36 | ### Conservative Q-Learning 37 | 38 | - Learn a conservative, lower-bound Q function to avoid over-estimation 39 | - Add penalty terms to a standard Bellman error objective 40 | 41 | ### Advantage-Weighted Regression AWR 42 | 43 | Policy optimization objective 44 | 45 | $$ J(\pi) = \mathbb{E}_{\tau\sim p_\pi} \left[ \sum_{t=0}^\infty \gamma^tr_t \right] = \mathbb{E}_s\mathbb{E}_a[r(s,a)] $$ 46 | 47 | #### Reward-Weighted Regression 48 | 49 | $$ \pi = \arg\max_\pi \mathbb{E}_s\mathbb{E}_a \left[ \log\pi(a|s)\exp\left( \frac{1}{\beta}R(s,a) \right) \right] $$ 50 | 51 | - Can be regarded as a maximum-likelihood estimation that fits a new policy to samples collected under the current policy, where the likelihood is weighted by the exponential of the return 52 | - In some sense, behaviour cloning that only clones 'good' data 53 | -------------------------------------------------------------------------------- /Speech Recognition/code_demo/viterbi_demo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Viterbi Demo 4 | 5 | @author: YBR10 6 | """ 7 | import numpy as np 8 | 9 | 10 | def viterbi(init, 
transition, emission, state, observation): 11 | """T.H.E. Viterbi Algorithm! 12 | Given an HMM and an observation sequence, 13 | Viterbi Algorithm returns the most likely hidden state 14 | that produces the observation. 15 | 16 | Parameters 17 | ---------- 18 | init : 1d-array 19 | Initial distribution of the HMM. 20 | transition : 2d-array 21 | Transition matrix of the Markov Chain 22 | emission : 2d-array 23 | Emission matrix of state i to produce observation j 24 | state : 1d-array 25 | States of the HMM. 26 | observation : 1d-array 27 | observation sequence 28 | 29 | Returns 30 | ------- 31 | hidden : 1d-array 32 | The max-a-posteriori estimator of hidden state sequence. 33 | 34 | """ 35 | hidden = np.zeros_like(observation) 36 | # likelihood that current observation i is emitted by state j 37 | phi = np.zeros((len(state), len(observation))) 38 | # previous maximizing estimator 39 | prev = np.zeros((len(state), len(observation))) 40 | 41 | phi[:, 0] = np.log(init) + np.log(emission[:, observation[0]]) 42 | prev[:, 1] = -1 43 | 44 | for ob in range(1, len(observation)): 45 | score = np.log(transition).T + phi[:, ob-1] 46 | candidate = np.max(score, axis=1) 47 | 48 | prev[:, ob] = np.argmax(score, axis=1) 49 | phi[:, ob] = np.log(emission[:, observation[ob]]) + candidate 50 | 51 | ptr = np.argmax(phi[:, len(observation)-1]) 52 | idx = len(observation) - 1 53 | while idx >= 0: 54 | hidden[idx] = ptr 55 | ptr = int(prev[ptr, idx]) 56 | idx -= 1 57 | 58 | return hidden 59 | 60 | 61 | A = np.array([ 62 | [0.2, 0.3, 0.5], 63 | [0.3, 0.6, 0.1], 64 | [0.1, 0.8, 0.1] 65 | ]) 66 | 67 | B = np.array([ 68 | [0.2, 0.6, 0.2], 69 | [0.6, 0.1, 0.3], 70 | [0.1, 0.2, 0.7] 71 | ]) 72 | 73 | pi = np.array([1/3, 1/3, 1/3]) 74 | 75 | state = np.array([0, 1, 2]) 76 | observation = np.array([0, 0, 1, 1, 1, 2, 2, 0]) 77 | 78 | ans = viterbi(pi, A, B, state, observation) 79 | 80 | print(ans) 81 | -------------------------------------------------------------------------------- /Speech Recognition/大词表连续语音识别.md: -------------------------------------------------------------------------------- 1 | # 大词表连续语音识别 LVCSR 2 | 3 | [TOC] 4 | 5 | **大词表**:大于10k词 6 | **连续语音**:自然且连续 7 | 8 | ## 声学单元 9 | 10 | ### 用HMM建模什么 11 | 12 | - 词?规模不紧凑、内部语音变化难以建模、训练集中未出现的词难以处理 13 | - 子词? 14 | - 音素 Phones:最小可辨识的发音声学单元。 15 | - 音节 Syllables:得到好的参数估计很困难 16 | 17 | ### 音素变化与协同发音 18 | 19 | #### 协同发音 20 | 21 | 某一个音素在一个特定上下文中的具体发音,受到上下文音素的影响。 22 | 23 | - 可以把单音子模型替换成上下文相关的多音子模型。 24 | 25 | #### 边界信息 26 | 27 | - 词内 Within Word 28 | - 跨词 Cross Word 29 | 30 | ### 决策树聚类 31 | 32 | - 状态层高斯参数的绑定 33 | - 每个节点的数据使用单个高斯描述 34 | - 每次分裂的增益用似然度增长定义 35 | - 对每个节点,选择似然度增长最大的问题,若增长高于预定阈值,则继续分裂当前节点 36 | 37 | ## 解码 38 | 39 | ### 令牌传递算法 Token Passing Algorithm 40 | 41 | `raise NotImplementedError` 42 | 43 | ## Viterbi Training and Forced Alignment 44 | 45 | ### Viterbi Training 46 | 47 | 如果能预先获得每一帧对应的HMM状态,则不需要使用Baum-Welch算法,而可以直接训练GMM参数。这被称为Viterbi Training。 48 | 49 | 但是为了获得每一帧的状态的标注,需要用一个预先训练好的声学模型对数据进行强制对齐。 50 | 51 | ### 强制对齐 52 | 53 | 是一种特殊的解码过程 54 | 55 | - 用输入语音对应的标注文本直接构建解码图,图中仅包含标注文本对应的状态。每个状态上有指向自身以及下个状态的跳转。 56 | - 根据语音帧和已有的声学模型,选取解码图中一条最优路径将各帧匹配到解码图上。 57 | 58 | ## N-Best and Lattices 59 | 60 | ### N-Best 61 | 62 | 每个候选是一个词序列。 63 | 64 | ### Word Lattice 词图 65 | 66 | - 节点:对应的时间信息 67 | - 边:词的标识以及节点之间对应某个词的声学模型或语言模型分数 68 | 69 | ### 在解码过程中生成多候选 70 | 71 | **Word-Pair Assumption**: 72 | 73 | 1. 每个状态保留 $N$ 个token 74 | 2. 每个token都按照正常的令牌传递分别传播 75 | 3. 
找到所有相同历史词token的集合 76 | -------------------------------------------------------------------------------- /Speech Recognition/概率论与贝叶斯决策理论.md: -------------------------------------------------------------------------------- 1 | # 目录 2 | 3 | [TOC] 4 | 5 | ## 联合概率与多元随机变量 6 | 7 | ### 联合概率 8 | 9 | - **联合概率**是所有随机变量 $x_i \in \mathcal{X}_i, i = 1,2,\dots,d$ 同时发生的概率。 10 | - 常将联合概率合并为多元随机变量 $\mathbf{x} \in \mathcal{X}$。 11 | $$\mathbf{x} = \begin{bmatrix} x_1\\x_2\\\vdots\\x_d \end{bmatrix}$$ 12 | 13 | ### 多元随机变量的统计值 14 | 15 | #### 均值 16 | 17 | $$\bm{\mu} = \mathbb{E}[\mathbf{x}] = \int_{\mathbf{x}}\mathbf{x}\cdot p(\mathbf{x})\mathrm{d}\mathbf{x}$$ 18 | 19 | #### 协方差 20 | 21 | $$\Sigma = \mathbb{E}[(\mathbf{x} - \bm{\mu})(\mathbf{x}-\bm{\mu})^T]=\mathbb{E}[\mathbf{x}\mathbf{x}^T]-\bm{\mu}\bm{\mu}^T = (\sigma_{ij})$$ 22 | 23 | - $\sigma_{xy} = \mathbb{E}[(x-\mu_x)(y-\mu_y)]$ 24 | - $\sigma_{xx} = \sigma_x^2$ 25 | 26 | #### 相关系数 27 | 28 | $$\rho = \frac{\sigma_{xy}}{\sigma_x\sigma_y}$$ 29 | 30 | - $-1 \le \rho \le 1$ 31 | - $\rho = 0$ 表示两个变量不相关 32 | - 独立 $\Rightarrow$ 不相关 33 | - 对高斯分布: 独立 $\Leftrightarrow$ 不相关 34 | 35 | ### 多元高斯分布 36 | 37 | $d$ 维多元高斯分布的形式为 38 | $$p(\mathbf{x}) = \frac{1}{(2\pi)^{\frac{d}{2}}|\Sigma|^{\frac{1}{2}}}\exp\left\{ -\frac{1}{2}(\mathbf{x}-\bm{\mu})^T\Sigma^{-1}(\mathbf{x}-\bm{\mu}) \right\}$$ 39 | 40 | 若 $\Sigma$ 为对角阵,则 41 | $$ p(\mathbf{x}) = \prod_{i=1}^d \frac{1}{\sqrt{2\pi}\sigma_i}\exp\left\{-\frac{(x-\mu_i)^2}{2\sigma_i^2}\right\} $$ 42 | 43 | - 多元高斯分布的边缘概率分布仍是高斯分布 44 | - 任何一个子集的联合边缘分布仍是高斯分布 45 | - 条件分布 $p(x_i|x_j)$ 仍是高斯分布 46 | - $\mathbf{x}$ 的线性变换 $A\mathbf{x}+\mathbf{b}$ 仍是高斯分布,其均值为 $A\bm{\mu} + \mathbf{b}$,方差为 $A \Sigma A^T$ 47 | 48 | #### 多元高斯分布的参数估计 49 | 50 | ##### 充分统计量 51 | 52 | - $\Gamma_0 = \sum_{i=1}^N1 = N$ 53 | - $\Gamma_1 = \sum_{i=1}^N\mathbf{x_n}$ 54 | - $\Gamma_2 = \sum_{i=1}^N\mathbf{x_n}\mathbf{x_n}^T$ 55 | 56 | ##### 最大似然估计值 57 | 58 | - $\hat{\bm{\mu}} = \Gamma_1/\Gamma_0$ 59 | - $\hat{\Sigma} = \Gamma_2/\Gamma_0 - \hat{\bm{\mu}}\hat{\bm{\mu}}^T$ 60 | 61 | ## 贝叶斯决策理论 62 | 63 | 摸了。 64 | 65 | ## 高斯混合模型 Gaussian Mixture Model 66 | 67 | 将 $M$ 个高斯分布加权求和合并为一个更复杂的模型 68 | $$p(\mathbf{x}) = \sum_{i=1}^M p(m)p(\mathbf{x}|m) = \sum_{i=1}^Mc_m\mathcal{N}(\mathbf{x}|\bm{\mu}_m,\Sigma_m)$$ 69 | 70 | - $c_m = p(m)$ 71 | - $\mathcal{N}(\mathbf{x}|\bm{\mu}_m,\Sigma_m) = p(\mathbf{x}|m)$ 72 | - $c_m$、$\bm{\mu}_m$ 和 $\Sigma_m$ 为待估计的参数 73 | - 令 $p(z_m = 1) = c_m$,其中$z_m$为one-hot编码,则$z_m$用于指示“哪个高斯分量用于生成模型”。$z_m$不能被直接观测到,因此称为**隐变量**。 74 | 75 | ### 隐变量的估计 76 | 77 | #### 硬分配 78 | 79 | 对每个样本,假设已知(或者强制分配)该样本所属的高斯分量,则可以使用常规的最大似然估计等参数估计方法估计出高斯混合模型的各项参数。 80 | 81 | #### 软分配 82 | 83 | 解决了实际应用中隐变量事实上不能被观测到的问题,但是会导致对数似然函数过于复杂,难以优化。 84 | 85 | ### 期望最大化算法 Expectation-Maximization 86 | 87 | #### Jensen不等式 88 | 89 | 假设$f$是一个凸函数,$X$是一个随机变量,则 90 | $$f(\mathbb{E}[X]) \le \mathbb{E}[f(X)]$$ 91 | 若$f$是凹函数则不等式反向 92 | 93 | #### EM算法的一般形式 94 | 95 | - 我们希望建模 $p(x,z;\theta)$,但$z$为隐变量,只能观测到$x$ 96 | - 假设 $z_n$ 服从分布 $\gamma_n(m) = p(z_n=m|x_n;\hat{\theta})$ 97 | $$ 98 | \begin{align*} 99 | \mathcal{L}(\theta) &= \sum_{n=1}^N\log p(x_n;\theta)\\ 100 | &= \sum_{n=1}^N\log\sum_{m=1}^M p(x_n, z_m; \theta)\\ 101 | &= \sum_{n=1}^N\log\sum_{m=1}^M\gamma_n(m)\frac{p(x_n,m;\theta)}{\gamma_n(m)} \quad \text{(先乘后除$\gamma_n(m)$)}\\ 102 | &= \sum_{n=1}^N\log\mathbb{E}\left[\frac{p(x_n,m;\theta)}{\gamma_n(m)}\right] \quad \text{(期望的定义)}\\ 103 | &\ge \sum_{n=1}^N\mathbb{E}\left[\log\left(\frac{p(x_n,m)}{\gamma_n(m)}\right)\right] \quad \text{(Jensen不等式)}\\ 104 | &= 
\sum_{n=1}^N\sum_{m=1}^M\gamma_n(m)\log\frac{p(x_n,m;\theta)}{\gamma_n(m)}\\ 105 | &= \sum_{n=1}^N\mathrm{H}\left(\gamma_n(m)\right) + \mathcal{Q}(\theta;\hat{\theta}) 106 | \end{align*} 107 | $$ 108 | 109 | - $\mathrm{H}(\cdot)$是熵 110 | - 定义$\mathcal{Q}(\theta;\hat{\theta})=\sum_{n=1}^N\sum_{m=1}^Mp(m|x_n;\theta)\log p(x_n,m;\theta)$ 111 | - 取$\gamma_n(m) = p(z_n=m|x_n;\hat{\theta})$时,Jensen不等式可以取等 112 | - $\sum_{n=1}^N\mathrm{H}\left(\gamma_n(m)\right) + \mathcal{Q}(\theta;\hat{\theta})$给出了$\mathcal{L}(\theta)$的一个下界,EM算法迭代地最大化这一下界,进而最大化$\mathcal{L}(\theta)$。 113 | 114 | Jensen不等式在$\hat{\theta}$处取等,因此 115 | $$\mathcal{Q}(\theta;\hat{\theta}) \ge \mathcal{Q}(\hat{\theta};\hat{\theta})$$ 116 | 117 | $$\sum_{n=1}^N\mathrm{H}\left(\gamma_n(m)\right) + \mathcal{Q}(\theta;\hat{\theta}) \ge \sum_{n=1}^N\mathrm{H}\left(\gamma_n(m)\right) + \mathcal{Q}(\hat{\theta};\hat{\theta})$$ 118 | 119 | $$\mathcal{L}(\theta) \ge \mathcal{L}(\hat{\theta})$$ 120 | 121 | 因此EM算法的一般形式为 122 | 123 | - *E-Step*: $\gamma_n(m) \leftarrow p(m|x_n;\theta)$ 124 | - *M-Step*: $\theta = \arg\max_{\theta} \sum_{n=1}^N\sum_{m=1}^M\gamma_n(m)\log\frac{p(x_n,m;\theta)}{\gamma_n(m)}$ 125 | 126 | #### 使用EM计算GMM参数 127 | 128 | - 初始化参数 129 | - 重复直到收敛: 130 | 1. *E-Step*:对每个样本 $x_n$,更新 $\gamma_n(m) = p(m|x_n)$ 131 | 2. *M-Step*:对每个高斯分量 $m$,更新 $\mathcal{N}_m$的参数 132 | 133 | ##### E-Step 134 | 135 | 如果假设$m$服从多项式分布,则相关更新公式如下: 136 | $$\gamma_n(m) = p(z_n=m|x_n) = \frac{p(x_n|z_n=m)p(z_n=m)}{\sum_kp(x_n|z_n=k)p(z_n=k)}$$ 137 | 在*E-Step*,上式右侧所有的值都是已知且可计算的。 138 | 139 | ##### M-Step 140 | 141 | 目标函数是 142 | $$\begin{align*} 143 | Q &= Const\\ 144 | &+ \sum_{n=1}^N\sum_{m=1}^M\gamma_n(m)\log c_m\\ 145 | &- \frac{1}{2} \sum_{n=1}^N\sum_{m=1}^M\gamma_n(m)\left( \log|\Sigma_m|+(x_n-\mu_m)^T\Sigma_{m}^{-1}(x_n-\mu_m) \right) 146 | \end{align*}$$ 147 | 148 | 令 149 | $$\Gamma^{(m)}_0 = \sum_{n=1}^N\gamma_n(m) \quad \Gamma^{(m)}_1 = \sum_{n=1}^N\gamma_n(m)\mathbf{x_n} \quad \Gamma_2^{(m)} = \sum_{n=1}^N \gamma_n(m)\mathbf{x}_n\mathbf{x}_n^T$$ 150 | 151 | 则 152 | $$\bm{\mu}_m = \frac{\Gamma^{(m)}_1}{\Gamma^{(m)}_0}$$ 153 | 154 | $$\Sigma_m = \frac{\Gamma^{(m)}_2}{\Gamma^{(m)}_0}-\bm{\mu}_m\bm{\mu}^T_m$$ 155 | 156 | $$c_m = \frac{\Gamma^{(m)}_0}{\sum_m\Gamma^{(m)}_0}$$ 157 | -------------------------------------------------------------------------------- /Speech Recognition/深度神经网络模型.md: -------------------------------------------------------------------------------- 1 | # 语音识别的深度神经网络模型 2 | 3 | [TOC] 4 | 5 | ## 深度神经网络基础 6 | 7 | ### 数据预处理 8 | 9 | #### 倒谱均值归一化 10 | 11 | 1. 减去每句话的特征平均值 12 | 2. 估计逐句均值 13 | $$ \mu_i = \frac{1}{T}\sum_{t=1}^T o_i^t $$ 14 | 3. 对于每一维度 $i$ 减去所有帧中的平均值 15 | $$ \hat{o}_i^t = o_i^t - \mu_i $$ 16 | 17 | #### 全局特征标准化 18 | 19 | 将MFCC和FBANK归一化为零均值、单位方差 20 | 21 | ### 初始化 22 | 23 | 1. 受限玻尔兹曼机预训练 24 | 2. 鉴别性预训练 25 | 3. 
从高斯分布 $\mathcal{N}(w,0.05)$ 或均匀分布 $\mathcal{U}(-0.05,0.05)$ 中采样初始化。 26 | 27 | ### 权重衰减 Weight Decay 28 | 29 | ### 随机失活 Dropout 30 | 31 | - 更新参数时忽略某些节点 32 | - 释放一些节点的依赖关系 33 | - 测试时输出要乘以 dropout rate $p$ 34 | 35 | ### 批次大小 Batch Size 36 | 37 | - Batch Gradient Descent 38 | - Stochastic Gradient Descent 39 | - Mini-batch Gradient Descent 40 | 41 | 对SGD和Minibatch,样本随机初始化很重要,否则参数可能会沿着相似的方向移动太长时间。 42 | 43 | #### 实现 44 | 45 | - 对小训练集,可以直接加载到内存中,然后随机化训练集数组下标 46 | - 对打训练集,每次读入一部分,然后在内部进行随机采样。 47 | - 如果数据来自不同来源,随机化语句列表文件也有帮助。 48 | 49 | ### 动量 Momentum 50 | 51 | - 负梯度可以认为是在参数空间中移动粒子的力 52 | - 假设加速过程中粒子为单位质量,且力衰减系数为 $\eta$,粒子动量衰减系数为 $\gamma$ 53 | $$ v_t = \gamma v_{t-1} - \eta \nabla J(W) $$ 54 | 55 | $$ W = W + v_t $$ 56 | 57 | - 通常 $\gamma$ 取 $0.9$ 58 | - 可以减少常规随机梯度下降中的震荡问题 59 | 60 | ### 学习率 Learning Rate 61 | 62 | - 减半 63 | - 在交叉验证误差增加时减半学习率 64 | - 指数衰减 65 | - 在经过指定步长后衰减学习率 66 | 67 | ### 网络架构 68 | 69 | - 在语音识别任务中 70 | - 5-7层、每层1000-3000个神经元的深度神经网络效果非常好。 71 | - 宽而深的模型往往容易找到更优的配置 72 | 73 | ## 语音识别的深度神经网络应用 74 | 75 | - 深度神经网络不能直接用于语音信号建模 76 | - 语音信号是变长的 77 | - DNN需要固定长度的输入 78 | - DNN-HMM系统 79 | - HMM建模语音信号的动态变化 80 | - DNN建模观测概率 81 | 82 | ### DNN-HMM 83 | 84 | #### 组成部分 85 | 86 | - 输入:上下文窗长大小的特征向量(多帧输入) 87 | - 输出:senones (tri-phone state) 的后验概率 88 | - DNN:所有状态共用一个DNN(与GMMHMM不同) 89 | $$ p(o_t|q_t, \theta) = \frac{p(q_t=s|o_t)p(o_t)}{p(s)} \quad \text{其中$p(q_t=s|o_t)$ 是神经网络输出} $$ 90 | 91 | #### 流程 92 | 93 | 1. 训练一个GMM-HMM 94 | - 为了获得每一帧的状态标注 95 | 2. 用GMM-HMM解码生成强制对齐的标签 96 | 3. 用得到的标签数据构建数据集训练DNN 97 | 4. 获得DNN-HMM 98 | 99 | #### 损失函数 100 | 101 | 通常使用交叉熵损失函数。 102 | 103 | #### 缩减尺寸与稀疏性 Size Reduction and Sparsity 104 | 105 | - 语音识别的DNN训练完成后,很多权重值都是 $0$ 106 | - 在一些应用中,即使只保留 $20\%$ 或 $15\%$ 的非零参数,模型的性能仍然不会有明显下降。 107 | 108 | ## 其他神经网络及其应用 109 | 110 | ### 卷积神经网络 CNN 111 | 112 | easy. 
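Still, a minimal NumPy sketch of the core operation may be handy. A single 1-D convolution channel (correlation-style, as used in deep learning) over a sequence of feature frames; purely illustrative:

```python
import numpy as np


def conv1d(x, w):
    """'Valid' 1-D convolution: slide the filter w over the sequence x."""
    n = len(x) - len(w) + 1
    return np.array([np.dot(x[i:i + len(w)], w) for i in range(n)])


frames = np.sin(np.linspace(0, 6.28, 20))  # toy 1-D feature sequence
kernel = np.array([0.25, 0.5, 0.25])       # local smoothing filter
print(conv1d(frames, kernel))
```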
113 | 114 | #### 语音识别中的CNN应用 115 | 116 | 当输入具有以下两个性质时,CNN能发挥较好作用 117 | 118 | - 局部相关性 119 | - 平移不变性 120 | 121 | ### 循环神经网络 RNN 122 | 123 | #### 前向计算 124 | 125 | 每个时刻 $t$ 的输入由两部分组成 126 | 127 | - 输入序列在当前时刻的取值 $x_{t-1}$ 128 | - 上一时刻的网络输出 $o_{t-1}$ 或隐层状态 $h_{t-1}$ 129 | 130 | 每个时刻网络的隐层状态 $h_t$ 为 131 | $$ h_t = \sigma_h(Ux_{t-1} + Vh_{t-1} + b_h) $$ 132 | 133 | 每个时刻的输出 $o_t$ 为 134 | $$ o_t = \sigma_o(Wh_t + b_o) $$ 135 | 136 | #### 参数更新 137 | 138 | $$ \mathcal{L}(r, o) = \sum_{t=1}^T \mathcal{L}_t(r_t, o_t) $$ 139 | 其中 $t$ 是时间戳,$r$ 是标签,$o$ 是模型输出 140 | 141 | ##### 沿时间的反向传播 BPTT 142 | 143 | $$ \frac{\partial \mathcal{L}}{\partial V} = \sum_t\frac{\partial \mathcal{L}_t}{\partial V} $$ 144 | 145 | $$ \frac{\partial \mathcal{L}_t}{\partial V} = \frac{\partial h_t}{\partial V}\frac{\partial o_t}{\partial h_t}\frac{\partial \mathcal{L}_t}{\partial o_t} $$ 146 | 147 | #### RNN的问题 148 | 149 | > 比全连接DNN更容易遇到梯度消失 150 | 151 | - 梯度消失 152 | - 梯度爆炸 153 | 154 | ### LSTM 155 | 156 | 缓解RNN中存在的梯度消失问题。 157 | 158 | #### Cell 159 | 160 | 每个LSTM 的 Cell 由遗忘门、输入门、输出门三部分组成 161 | 162 | 输入由前一个cell的隐状态 $h_{t-1}$、当前cell的输入 $x_t$ 和前一个 cell 的状态 $C_{t-1}$ 组成。 163 | 164 | - $C_{t-1}$在传递过程中几乎只经过线性变换。 165 | 166 | ##### 遗忘门 167 | 168 | 决定要丢弃多少历史信息 169 | $$ f_t = \sigma(W_{xf}x_t + W_{hf}h_{t-1} + b_f) $$ 170 | 171 | ##### 输入门 172 | 173 | 决定前一个隐状态和当前输入中保留多少到当前cell的隐状态 $h_t$ 174 | $$ I_t = \sigma(W_{xi}x_t + W_{hi}h_{t-1} + b_i) $$ 175 | 176 | $$ \hat{C}_i = \tanh(W_{xg}x_t + W_{hg}h_{t-1} + b_g) $$ 177 | 178 | ##### 输出门 179 | 180 | 决定当前cell多少信息被添加到cell的状态 $C_t$ 181 | $$ $$ 182 | 183 | $$ h_t = o_t * \tanh(C_t) $$ 184 | 185 | $$ C_t = f_t *C_{t-1} + i* \hat{C}_t $$ 186 | 187 | 其中 $*$ 为按元素相乘 188 | 189 | - $C_t$ 定义为当前网络的长期记忆 190 | - $\hat{C}_t$ 定义为当前网络的短期记忆 191 | 192 | ### 注意力机制 193 | 194 | $$ Attention(Query, Key, Value) = \sum_{i=1}^N Similarity(Query, Key_i) \times Value_i $$ 195 | 196 | 其中Similarity可以定义为余弦距离、向量内积等。 197 | $$ Attention(Q,K,V) = Softmax\left( \frac{QK^T}{\sqrt{d_k}} \right)V $$ 198 | 199 | 其中 200 | 201 | - $X$ 为输入,长度为 $N$ 202 | - $C$ 为上下文序列,长度为 $M$ 203 | - $Q=XW_Q \in \mathbb{R}^{N\times d_k}$ 204 | - $K=CW_K \in \mathbb{R}^{M\times d_k}$ 205 | - $V=CW_V \in \mathbb{R}^{M\times d_v}$ 206 | 207 | ### 可解释性 208 | 209 | t-SNE 210 | -------------------------------------------------------------------------------- /Speech Recognition/熵.md: -------------------------------------------------------------------------------- 1 | # 熵相关 2 | 3 | ## 信息 4 | 5 | $$I(x) = -\log_2p(x)$$ 6 | 7 | ## 熵 8 | 9 | $$H = \mathbb{E}[-\log_2p(x)] = -\sum_{x}p(x)\cdot\log_2p(x)$$ 10 | 11 | - 是随机变量的信息 $I(x)$ 的期望 12 | 13 | ## 一些分布的熵 14 | 15 | ### 伯努利分布 16 | 17 | $$H(B(1,p)) = -(1-\mu)\ln(1-\mu) - \mu\ln(\mu) $$ 18 | 19 | ### 高斯分布 20 | 21 | $$H(\mathcal{N}(\mu, \sigma^2)) = \frac{1}{2}\ln(2\pi{}e\sigma^2) $$ 22 | 23 | ## 条件熵 24 | 25 | **条件熵**是分布 $p(y|x)$ 的熵的期望 26 | $$H[y|x] = -\sum_{x'} p(x)H(y|x=x') = -\sum_{x'}p(x)\sum_{y}p(y|x)\ln p(y|x)$$ 27 | $$H[y|x] = -\iint p(y,x) \ln p(y|x) \mathrm{d}y\mathrm{d}x$$ 28 | **联合熵**是条件熵与边缘熵之和 29 | $$H[x,y] = H[y|x] + H[x]$$ 30 | 31 | ## 互信息 32 | 33 | **互信息**是边缘熵和条件熵的差 34 | $$I[x,y] = H[x] - H[x|y] = H[y] - H[y|x]$$ 35 | 36 | - 互信息是对称的 37 | 38 | ## KL距离 39 | 40 | **KL距离**可用于描述两个分布之间的差异 41 | $$KL(p\|q) = -\int p(x) \ln q(x)\mathrm{d}x - \left(-\int p(x) \ln p(x) \mathrm{d}x\right)$$ 42 | 43 | $$KL(p\|q) = -\int p(x) \ln\left\{\frac{p(x)}{q(x)}\right\}\mathrm{d}x$$ 44 | 45 | - $KL(p\|q) \ge 0$ 46 | - $KL(p\|q) \neq KL(q\|p)$ 47 | - $I[x,y] = KL(p(x,y)\|p(x)p(y))$ 48 | 49 | ## 交叉熵 50 | 51 | $$H_c(P,Q) = 
-\sum_{x}P(x)\log_2Q(x)$$ 52 | 53 | - $H_c(P,Q) \neq H_c(Q,P)$ 54 | -------------------------------------------------------------------------------- /Speech Recognition/特征提取.md: -------------------------------------------------------------------------------- 1 | # 特征提取 2 | 3 | [TOC] 4 | 5 | ## 线性预测系数 LPC 6 | 7 | 给定信号 $X$,通过 $n$ 个之前时刻样本的 $n$ 阶加权线性插值来预测$t$时刻的样本 8 | $$\hat{x}_t = \sum_{i=1}^na_ix_{t-i}$$ 9 | 10 | 或 11 | $$\hat{x} = Ma$$ 12 | 13 | $$ M = \begin{bmatrix} 14 | x_0 & x_{-1} & x_{-2} & \cdots & x_{-n+1}\\ 15 | x_1 & x_0 & x_{-1} & \cdots & x_{-n+2}\\ 16 | \vdots & \vdots & \vdots & \ddots & \vdots\\ 17 | x_{T-1} & x_{T-2} & x_{T-3} & \dots & x_{-n+T} 18 | \end{bmatrix} $$ 19 | 20 | $$a = \begin{bmatrix} 21 | a_1 \\ a_2 \\ \vdots \\ a_N 22 | \end{bmatrix}$$ 23 | 24 | 其中 $a_i$ 被称为线性预测系数;$M$ 被称为托普利兹矩阵。 25 | 26 | 通过最小化均方误差 27 | $$\mathcal{L} = \frac{1}{T}\sum_{t=1}^T(\hat{x}_t - x_t)^2$$ 28 | 29 | 可以确定系数$a$。 30 | $$a = (MM^T)^{-1}M^Tx$$ 31 | 32 | 但是一般使用Levinson-Dublin算法可以更快速地计算。 33 | 34 | - LPC阶数越高,对谱幅值的刻画越精确,但相应的受噪声影响越大 35 | - 谱包络的峰值可以用来确定共振峰的位置 36 | 37 | ## Filter Bank / FBank系数 38 | 39 | - 使用一系列带通滤波器(通常是三角窗滤波器) 40 | - 每一个带通滤波器输出一个FBank系数,它等于此带通滤波器内信号的加权和 41 | 42 | ## 梅尔域 (Mel Scale) 43 | 44 | 梅尔域是一个基音感知域。它通过听音者可以区分两种纯音频率的差距来作为标度。 45 | $$\mathrm{Mel}(f) = 2595\log_{10}(1+\frac{f}{100})$$ 46 | 47 | - 变换公式是拟合而非推导得到的 48 | - 在梅尔域中,低频部分具有更高的分辨率 49 | 50 | ### 梅尔域的FBank系数 51 | 52 | $$m_i = \sum_{k=f_i}^{F_i}s(k)T_i(k)$$ 53 | 其中$m_i$是第$i$个系数,$f_i$和$F_i$是三角滤波器的开始和结束频率,$s_i$是频谱的能量或幅度,$T_i$是三角滤波器的值 54 | 55 | ### 梅尔频率倒谱系数 MFCC 56 | 57 | 1. 取对数得到N个对数域的FBank系数 58 | 2. 使用离散余弦变换 (DCT) 计算倒谱系数 (Cesptral系数) 59 | 60 | - 倒谱系数Cesptral是频谱Spectral前四个字母倒过来 61 | - 倒谱系数某种意义上可以理解为类似“反变换回时域”的操作,其横轴意义是“时间”而非“频率” 62 | - 随着神经网络的普及,MFCC已经不太常用 63 | 64 | ## 动态特征 65 | 66 | ### 简单差分 67 | 68 | $$\Delta_n = \frac{c_{n+\delta} - c_{n-\delta}}{2\delta}$$ 69 | 70 | ### 更鲁棒的差分 71 | 72 | $$\Delta_n = \frac{\sum_{i=1}^\delta i(c_{n+i} - c_{n-i})}{2\sum_{i=1}^\delta i^2}$$ 73 | -------------------------------------------------------------------------------- /Speech Recognition/统计语言模型.md: -------------------------------------------------------------------------------- 1 | # 统计语言模型 2 | 3 | [TOC] 4 | 5 | ## 概述 6 | 7 | ### 回顾:统计语音识别的数学框架 8 | 9 | $$\hat{W} = \arg\max_W \mathbb{P}[W|O] = \arg\max_W \mathbb{P}[O|W]\mathbb{P}[W]$$ 10 | 其中$W=[w_1,\dots,w_N]$通常情况下表示一个**词序列**。 11 | 12 | ### 词表 13 | 14 | - 语言模型建模需要定义一个词表。 15 | - 搜索空间随词表大小呈指数增长。 16 | - 在词表中无法找到的词称为集外词(Out-Of-Vocabulary) 17 | 18 | ### 基于规则的语法网络 19 | 20 | - 一个语法网络是一个**加权有限状态机**。 21 | - 语法间包括词之间的回环、替代、重复等。 22 | - 状态机的边上赋有权重,对于经常观测到的路径权重较高。 23 | - 难以手工制定大量语法规则 24 | - 自然语言通常情况下不合语法 25 | 26 | ### 统计语言模型概述 27 | 28 | - 完全数据驱动 29 | - 适合自然语音 30 | $$ \mathbb{P}[W] = \mathbb{P}[w_1,w_2,\dots,w_N] = \prod_{k=1}^{K+1} \mathbb{P}[w_k|w_1,\dots,w_{k-1}] $$ 31 | 32 | ## N-gram语言模型 33 | 34 | ### N-gram语言模型概述 35 | 36 | N-gram模型近似地表示在一个较短历史情况下的条件概率(序列最长为N个) 37 | $$ \mathbb{P}[w_k|w_1,\dots,w_{k-1}] = \mathbb{P}[w_k|w_{k-1},\dots,w_{k-n+1}] $$ 38 | 给定的一个当前词的概率仅依赖于这个词之前的$N-1$个前继词 39 | 40 | - Unigram: $\mathbb{P}[W] = \mathbb{P}[w_k]$ (不依赖前继词) 41 | - Bigram: $\mathbb{P}[W] = \mathbb{P}[w_k|w_{k-1}]$ (仅依赖于前一个词) 42 | - Trigram: $\mathbb{P}[W] = \mathbb{P}[w_k|w_{k-1},w_{k-2}]$ 43 | - Quadrigram: $\mathbb{P}[W] = \mathbb{P}[w_k|w_{k-1},w_{k-2},w_{k-3}]$ 44 | 45 | ### 文本归一化 46 | 47 | 转换训练文本使之成为纯语言,移除和语言不相关的部分。一般包括 48 | 49 | - 移除标点 50 | - 日期、货币、数字归一化(统一格式) 51 | - 缩写词的归一化(统一格式) 52 | 53 | ### 最大似然参数估计 54 | 55 | - 数据:词序列 $W_1^N = [w_1,\dots,w_N]$ 56 | - 模型:$\mathbb{P}[w_k|W^{k-1}_{k-n+1}], \quad w \in 
\mathcal{V}$,其中$ \mathcal{V} $是词表。 57 | 58 | 进行参数估计时,最大化对数似然函数 59 | $$ 60 | \begin{align*} 61 | &\hat{\mathbb{P}}[v|y] = \arg\max_{\mathbb{P}[v|y]} \sum_{v\in\mathcal{V}}\sum_{y\in\mathcal{Y}}C(y,v)\log\mathbb{P}[v|y]\\ 62 | &\text{s.t.} \quad \sum_{v\in\mathcal{V}}\mathbb{P}[v|y]=1 63 | \end{align*} 64 | $$ 65 | 这是6一个等式约束的优化问题,可以用拉格朗日乘子法求解。 66 | $$ \hat{\mathbb{P}}[v|y] = \frac{C(y,v)}{C(y)} $$ 67 | 其中 68 | $$ C(y) = \sum_{v\in\mathcal{V}}C(y,v) $$ 69 | 70 | ### 模型评估 71 | 72 | - 理想:语音识别的结果与词错误率之差 73 | - 实际:在预留的文本数据集上预测 74 | 75 | ### 评价指标 76 | 77 | #### 交叉熵 Cross-Entropy 78 | 79 | $$ $$ 80 | 81 | #### 困惑度 Perplexity 82 | 83 | $$ \mathrm{PPL} = 2^{-\mathcal{L}(\theta)} = \left( \prod_{k=1}^K\frac{1}{\mathbb{P}[w_k|W^{k-1}_{k-n+1}]} \right)^{\frac{1}{k}} $$ 84 | 85 | ### 数据稀疏与零概率 86 | 87 | 训练集上未见过的词与词之间的关系会导致无穷大的Perplexity,训练集上低频次的关系也可能导致不可靠的估计。 88 | 89 | #### 折扣法 Discounting 90 | 91 | 解决零概率的一种方法。将一些概率值“重新分配”到未见过的N-gram上。 92 | 93 | #### Backing-off and Interpolation 94 | 95 | 解决零概率的一种方法,递归地回退到较低阶的N-gram模型上,直到得到一个较为鲁棒的概率估计。 96 | -------------------------------------------------------------------------------- /Speech Recognition/说话人识别与说话人日志.md: -------------------------------------------------------------------------------- 1 | # 说话人识别与说话人日志 2 | 3 | [TOC] 4 | 5 | ## 基本概念 6 | 7 | ### 环节 8 | 9 | 1. 注册 10 | 2. 验证 11 | 12 | ### 任务 13 | 14 | - 说话人识别:一对多分类任务。确定未知说话人是已知说话人中的哪一个 15 | - 说话人验证:一对一决策人物。判断说话人是否与某个声音相匹配。 16 | 17 | ### 文本相关/文本无关 18 | 19 | - 文本相关:系统知道说话人要说的文本。 20 | - 固定词语/提示词语 21 | - 文本无关:系统不知道说话人要说的文本 22 | 23 | # 基于GMM的说话人确认 24 | 25 | ## 特征提取 26 | 27 | > 摸了 28 | 29 | ## 流程 30 | 31 | ### 通用背景模型 UBM 32 | 33 | UBM是通过大量说话人数据训练的GMM模型 34 | 35 | ### 目标说话人注册 36 | 37 | - 通过特定说话人数据将UBM自适应得到特定说话人的GMM模型 38 | - 一般使用最大后验概率训练 39 | - 只更新UBM的均值参数 40 | $$ \mu_i = \alpha_i E_i(x) + (1-\alpha_i)\mu_i^{UBM} $$ 41 | 42 | ## 基于I-Vector的说话人确认系统 43 | 44 | ### 联合因子分析 45 | 46 | - 联合因子分析 JFA 对语音中的说话人信息和信道信息分别建模 47 | - GMM中的各个均指向量拼接为一个超向量$M$,$M$可以分解为 48 | 49 | $$ M = m +Vy + Ux + Dx $$ 50 | - $m$ 为与说话人无关的超向量 51 | - $V$ 是一个低秩矩阵(说话人本征空间) 52 | - $y$ 是说话人因子 53 | - $U$ 是信道本征空间 54 | - $x$ 是信道因子 55 | - $Dx$ 是其他信息 56 | 57 | ### I-Vector 58 | 59 | 实际应用中 $U$ 和 $V$ 并不能很好的分开。 60 | $$ M = m + Tw $$ 61 | - $T$是说话人空间和信道空间合并而成的总变量空间 62 | - $w$遵循标准正态分布 63 | 64 | # 基于深度学习的说话人确认 65 | 66 | ## Deep Feature / D-Vector 67 | 68 | - 训练阶段 69 | - 对每一帧音频进行分类 70 | - 对每一帧,与其前后若干帧拼接后作为输入 71 | - 通过对说话人的分类任务来优化神经网络 72 | - 测试阶段 73 | - 对每个说话人音频,得到其所有帧在最后一层的输出,平均所有输出后得到说话人的潜入特征 74 | 75 | ## 段级别的说话人嵌入特征 76 | 77 | - 通过神经网络提取帧级别特征 78 | - 通过统计池化得到段级别特征 79 | -------------------------------------------------------------------------------- /Statistical Learning And Inference/01.Introduction.md: -------------------------------------------------------------------------------- 1 | # 01. Introduction 2 | 3 | ## Big Data 4 | 5 | **3V Interpretation.** Big data are **high-volume**, **high-velocity** and **high-variety** information assests. 6 | 7 | ## Function Estimation Model 8 | 9 | ![function-estimation-model](figs/function-estimation-model.png) 10 | 11 | - **Generator ($G, F(x)$).** Generates observations $x$, which are independently sampled from a certain distribution $F(x)$. 12 | - **Supervisor ($S, F(y|x)$).** Labels each input $x$ with an output value $y$ according to some fixed distribution $F(y|x)$. 13 | - **Learning Machine ($LM, f(x; \alpha)$).** "Learns" from $(x, y)$ pairs from $G$ and $S$, by choosing a function that *best approximates* $S$ from a parametrized function class $f(x; \alpha) = \hat{y}$. 
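The three components above can be mimicked in a few lines. A toy sketch (the sine-curve supervisor and the polynomial function class are assumptions for illustration):

```python
import numpy as np

rng = np.random.default_rng(0)
x = rng.uniform(-1, 1, 200)                     # generator G: x ~ F(x)
y = np.sin(3 * x) + 0.1 * rng.normal(size=200)  # supervisor S: y ~ F(y|x)

# learning machine LM: least-squares fit over a degree-5 polynomial class
A = np.vander(x, N=6)                           # features of f(x; alpha)
alpha, *_ = np.linalg.lstsq(A, y, rcond=None)
print("empirical risk:", np.mean((y - A @ alpha) ** 2))
```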
14 | 15 | ### Risk Minimization 16 | 17 | - **Loss Functional.** The loss functional $(L, Q)$ is the error of a given function on a given example. 18 | - $L(x, y, f_{\alpha}) \rightarrow L(y, f(x, \alpha))$. 19 | - $Q(z, \alpha) \rightarrow L(z_y, f(z_x, \alpha))$, where $z = (x,y)$ (if we want to merge $x$ and $y$) 20 | - **Risk Functional.** The risk functional $R$ is the expected loss of a given function over examples drawn from $F(x, y)$. 21 | - **Generalization Error.** The generalization error of a given function is usually defined as $R(\alpha) = \int Q(z, \alpha) \mathrm{d}F(z)$. (Integral over all possible pairs $(x,y)$) 22 | 23 | #### Three Main Learning Problems 24 | 25 | > All aim to minimize certain kinds of risks 26 | 27 | - **Pattern Recognition.** 28 | - $y \in \{ 0, 1 \}$ and $L(y, f(x, \alpha)) = \mathbb{I}[y \neq f(x,\alpha)]$. 29 | - **Regression Estimation.** 30 | - $y \in \mathbb{R}$ and $L(y, f(x, \alpha)) = (y - f(x, \alpha))^2$. 31 | - **Density Estimation.** 32 | - $L(p(x, \alpha)) = -\log p(x, \alpha)$. 33 | 34 | #### General Formulation of Learning 35 | 36 | - Given $k$ i.i.d. samples $z_1, \dots, z_k$, where $z_i = (x_i, y_i)$ is drawn from a distribution $F(z)$. 37 | - We wish to find a function $\alpha^*$ such that the risk is minimized 38 | - $\arg\min_{\alpha} R(\alpha)$ 39 | 40 | ##### Empirical Risk Minimization 41 | 42 | Calculating the integral is usually not practical. 43 | 44 | $$ R(\alpha) = \int Q(z, \alpha) \mathrm{d}F(z) \Rightarrow R_{emp}(\alpha) = \frac{1}{k} \sum_{i = 1}^k Q(z_i, \alpha) $$ 45 | 46 | - **Empirical Risk Minimizer (ERM).** $\alpha_k = \arg\min_{\alpha} R_{emp}(\alpha)$. 47 | - ERM approximates $Q(z, \alpha^*)$ with $Q(z, \alpha_k)$ 48 | - ERM approximates $\alpha^*$ with $\alpha_k$ 49 | - Many loss functions / methods are realizations of ERM. 50 | - **Least-squares.** $\min_\alpha \sum_i L(y_i, f(x_i, \alpha)) = \min_\alpha \sum_i(y_i - f(x_i, \alpha))^2$ 51 | - **Maximum-likelihood.** $\min_\alpha \sum_i L(p(x_i, \alpha)) = \min_\alpha - \sum_i \log(p(x_i, \alpha))$ 52 | 53 | #### Error Bounds 54 | 55 | The **error bound** is the maximum possible error of a model on an unseen test set. 56 | 57 | $$ R_T(h) \le R_S(h) + \frac{1}{2} d_{H\Delta{}H}(S,T) + \lambda \le R_S(h) + \frac{1}{2} d_H(S, T) + \lambda $$ 58 | 59 | - Basically "test error = training error + distance between training and testing set" 60 | 61 | ## Classification 62 | 63 | > Classification tasks will be the focus of this course. 64 | 65 | **Classifiers.** The task of a **classifier** is to use the feature vector provided by a feature extractor to assign the object to a category. 66 | 67 | - Essentially the classifier divides the feature space into regions corresponding to different categories. 68 | - The degree of difficulty of the classification problem depends on *within-category feature variability* relative to *cross-category feature variability*. 69 | -------------------------------------------------------------------------------- /Statistical Learning And Inference/13.DomainGeneralization.md: -------------------------------------------------------------------------------- 1 | # Domain Generalization 2 | 3 | ## Domain Adaptation vs. Domain Generalization 4 | 5 | - **Domain adaptation.** The unlabeled target domain is *seen* in the training stage. 6 | - **Domain generalization.** The unlabeled target domain is *unseen* in the training stage. 7 | 8 | The goal of domain generalization is to learn *domain-invariant* features for the classifier.
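One way to make "domain-invariant" concrete is to measure a distribution distance between per-domain features, as several methods below do (e.g. the MMD loss). A minimal sketch with a linear kernel, for which the squared MMD reduces to the distance between feature means (illustrative only):

```python
import numpy as np


def mmd2_linear(Xs, Xt):
    """Squared MMD with a linear kernel: ||mean(Xs) - mean(Xt)||^2."""
    return float(np.sum((Xs.mean(axis=0) - Xt.mean(axis=0)) ** 2))


rng = np.random.default_rng(0)
f_a = rng.normal(0.0, 1.0, (100, 8))  # features from latent domain A
f_b = rng.normal(0.5, 1.0, (100, 8))  # features from latent domain B
print(mmd2_linear(f_a, f_b))          # invariant features drive this toward 0
```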
9 | 10 | The training set might contain images from multiple **latent domains**. 11 | 12 | ## Methods 13 | 14 | ### Latent Domain Labels are Known 15 | 16 | #### Domain-Invariant Component Analysis (DICA) 17 | 18 | $$ \max_{B} \frac{\frac{1}{n}\mathrm{Tr}\left( B^TL\left( L +n\epsilon I_n \right)^{-1} K^2 B \right)}{\mathrm{Tr}(B^TKQKB+BKB)} $$ 19 | 20 | where the numerator preserves the functional relationship $\mathbb{E}[X|Y]$, and the denominator minimizes the differences across domains $\mathbb{E}[X]$. 21 | 22 | After obtaining $B$, use $\tilde{K} = KBB^TK$ 23 | 24 | #### Domain Generalization with Adversarial Feature Learning 25 | 26 | Assume we have images from $K$ latent domains and their labels $(X_k, Y_k)$. 27 | 28 | Train an auto encoder with encoder $Q$ and decoder $P$ on the images to extract latent domain features. 29 | 30 | $$ H_k = Q(X_k), \quad \hat{X}_k = P(Q(X_k)) $$ 31 | 32 | The model is trained with three loss functions 33 | 34 | $$ \min_{Q,P} \max_{D} L_{rec} + L_{MMD} + L_{GAN} $$ 35 | 36 | - Reconstruction loss. $L_{rec} = \|\hat{X}_k - X_k\|_F^2$ 37 | - MMD loss. $L_{MMD} = \sum_{k_1 \neq k_2} \| H_{k_1} - H_{k_2} \|_F^2$ 38 | - GAN loss. $\mathbb{E}_{h \sim p(h)}[\log(D(h))] + \mathbb{E}_{x\sim p(x)}[\log(1 - D(Q(x)))]$ 39 | - where $h$ is a random vector drawn from a prior distribution. 40 | - the discriminator $D$ forces the extracted features and $h$ to have similar distributions. 41 | 42 | ### Latent Domain Labels are Unknown 43 | 44 | When the latent domain labels are unknown, we try to *discover* the latent domains in the training data. 45 | 46 | #### Discovering latent domains for multisource domain adaptation 47 | 48 | If there are $C$ categories and $K$ domains, group training samples into $CK$ local clusters. 49 | 50 | Cluster the means of the local cluster to find domains, in which each domain contains only a local cluster from each category. 51 | 52 | #### Reshaping visual datasets for domain adaptation 53 | 54 | Denote by $z_{mk}$ the probability that the $m$-th sample belongs to the $k$-th latent domain. 55 | 56 | $$\begin{align*} 57 | \max_{z_{mk}} &\quad \sum_{k\neq k'}\left\| \frac{1}{M_k}\sum_m \phi(x_m)z_{mk} - \frac{1}{M_{k'}} \sum_m \phi(x_m)z_{mk'}\right\|^2\\ 58 | \mathrm{s.t.} &\quad \sum_{k=1}^K z_{mk} = 1\\ 59 | &\quad \frac{1}{M_k} \sum_{m=1}^M z_{mk}y_{mc} = \frac{1}{M}\sum_{m=1}^M y_{mc} 60 | \end{align*}$$ 61 | -------------------------------------------------------------------------------- /Statistical Learning And Inference/15.UnsupervisedLearning.md: -------------------------------------------------------------------------------- 1 | # Unsupervised Learning 2 | 3 | ## Introduction 4 | 5 | - Learning without a teacher. 6 | - The system aims to exploit data structure from a set of $N$ observations $\{ x_1,\dots,x_N \}$ where $x_i \in \mathbb{R}^p$. 7 | 8 | ## Clustering 9 | 10 | Divides the samples into multiple regions (typically convex regions). 11 | 12 | Group or segment a collection of objects into subsets or **clusters**, such that those within each cluster are more closely related to each other than objects in other clusters. 13 | 14 | ### Cluster Analysis 15 | 16 | - Central to all of the goals of cluster analysis is the notion of the degree of similarity (or dissimilarity) between the individual objects. 17 | 18 | #### Proximity Matrices 19 | 20 | - Dissimilarities can be computed by averaging over the collection of such judgements. 21 | - Dissimilarity represented by a $N \times N$ matrix $D$. 
24 | 
25 | #### Dissimilarities based on Variables
26 | 
27 | - The dissimilarity between two observations is accumulated over their $p$ variables:
28 | 
29 | $$ D(x_i, x_j) = \sum_{k=1}^p d(x_{ik}, x_{jk}) $$
30 | 
31 | ##### Quantitative Variables
32 | 
33 | $$ d(x_{ik}, x_{jk}) = (x_{ik} - x_{jk})^2 $$
34 | 
35 | ##### Categorical Variables
36 | 
37 | For categorical variables, assume the variable has $M$ possible values. The pairwise dissimilarities between values can be arranged in a symmetric $M \times M$ matrix with
38 | 
39 | $$ L_{ij} = L_{ji}, \quad L_{ii} = 0, \quad L_{ij} \ge 0 $$
40 | 
41 | The most common choice is
42 | 
43 | $$ L_{ij} = 1, \quad \forall i \neq j $$
44 | 
45 | ##### Observation Dissimilarities
46 | 
47 | The dissimilarity between two samples $D(x_i, x_j)$ can be computed as a weighted sum over the dimensions
48 | 
49 | $$ D(x_i, x_j) = \sum_{k=1}^p w_k d(x_{ik}, x_{jk}), \quad \sum_{k=1}^p w_k = 1 $$
50 | 
51 | The average dissimilarity over all pairs of samples is given by
52 | 
53 | $$ \bar{D} = \frac{1}{N^2} \sum_{i=1}^N\sum_{j=1}^N D(x_i, x_j) = \sum_{k=1}^p w_k \bar{d}_k $$
54 | 
55 | $$ \bar{d}_k = \frac{1}{N^2} \sum_{i=1}^N \sum_{j=1}^N d(x_{ik}, x_{jk}) $$
56 | 
57 | #### Combinatorial Algorithms
58 | 
59 | Assign each sample $x_i$ to a cluster $C(i) \in \{1,\dots, K\}$.
60 | 
61 | The **within-cluster scatter** is defined as
62 | 
63 | $$ W(C) = \frac{1}{2} \sum_{k=1}^K \sum_{C(i) = k, C(j) = k} d(x_i, x_j) $$
64 | 
65 | Note that we should minimize $W(C)$, since samples in the same cluster should be close to each other.
66 | 
67 | The **total point scatter** is given by
68 | 
69 | $$ \begin{align*}
70 | T &= \frac{1}{2} \sum_{i=1}^N\sum_{j=1}^N d(x_i, x_j) \\
71 | &= \frac{1}{2}\sum_{k=1}^K \sum_{C(i) = k} \left(\sum_{C(j) = k} d(x_i, x_j) + \sum_{C(j)\neq k}d(x_i, x_j)\right)\\
72 | &= W(C) + B(C)
73 | \end{align*} $$
74 | 
75 | where $B(C)$ is the **between-cluster scatter**
76 | 
77 | $$ B(C) = \frac{1}{2} \sum_{k=1}^K \sum_{C(i) = k} \sum_{C(j)\neq k}d(x_i, x_j) $$
78 | 
79 | Note that the total point scatter $T$ is a constant, so minimizing $W(C)$ is equivalent to maximizing $B(C)$.
80 | 
81 | ### K-Means
82 | 
83 | With squared Euclidean distance, the within-cluster scatter $W(C)$ can be written as
84 | 
85 | $$\begin{align*}
86 | W(C) &= \frac{1}{2} \sum_{k=1}^K \sum_{C(i) = k} \sum_{C(j)=k} d(x_i, x_j) \\
87 | &= \frac{1}{2} \sum_{k=1}^K \sum_{C(i) = k} \sum_{C(j)=k} \|x_i - x_j\|^2 \\
88 | &= \sum_{k=1}^K N_k \sum_{C(i) = k} \| x_i - \bar{x}_k \|^2
89 | \end{align*}$$
90 | 
91 | where $\bar{x}_k$ is the center of cluster $k$ and $N_k$ is the number of samples assigned to it.
92 | 
93 | #### Algorithm
94 | 
95 | - **Expectation Step.** Estimate the hidden (assignment) variable,
96 |   - $ C(i) = \arg\min_k \| x_i - \bar{x}_k \|^2 $
97 | - **Maximization Step.** Maximum-likelihood estimation of the centroids,
98 |   - $ \bar{x}_k = \frac{1}{N_k} \sum_{C(i) = k} x_i $
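A minimal NumPy sketch of these two alternating steps (initializing the centroids from random data points is an assumption of this sketch; real implementations typically use k-means++):

```python
import numpy as np

def kmeans(X, K, n_iters=100, seed=0):
    """Alternate the E-step (assign) and M-step (re-center) until stable."""
    rng = np.random.default_rng(seed)
    centers = X[rng.choice(len(X), size=K, replace=False)]
    for _ in range(n_iters):
        # E-step: C(i) = argmin_k ||x_i - xbar_k||^2
        d2 = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(axis=-1)  # (N, K)
        C = d2.argmin(axis=1)
        # M-step: xbar_k = mean of the samples assigned to cluster k
        new_centers = np.stack([
            X[C == k].mean(axis=0) if np.any(C == k) else centers[k]
            for k in range(K)
        ])
        if np.allclose(new_centers, centers):
            break
        centers = new_centers
    return centers, C

X = np.random.default_rng(1).normal(size=(200, 2))
centers, C = kmeans(X, K=3)
```

Each iteration can only decrease the within-cluster scatter $W(C)$, so the procedure converges, though only to a local (not necessarily global) minimum.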
99 | 
100 | ### Vector Quantization (VQ)
101 | 
102 | An extension of the K-Means algorithm that reduces the $N$ points to $K$ centroids (a codebook), updated online.
103 | 
104 | 1. Pick a sample point $x_i$ at random.
105 | 2. Find the centroid $m_k$ for which $d(x_i, m_k)$ is minimized.
106 | 3. Move $m_k$ towards $x_i$ by a small fraction of the distance.
107 | 4. Repeat until convergence.
108 | 
109 | ### Self-Organizing Map (SOM)
110 | 
111 | A constrained version of the K-Means algorithm with prototypes placed on a topological map.
112 | 
113 | - Traditional K-Means updates each centroid independently.
114 | - SOM updates neighboring centroids on the map together.
115 | 
116 | 1. Initialize $K$ prototypes $m_j$ placed on a given map.
117 | 2. Find the prototype $m_j$ closest to a sample $x_i$.
118 | 3. Move $m_j$ and its map neighbors $m_k$ toward $x_i$,
119 |    - $ m_k \leftarrow m_k + \alpha (x_i - m_k) $
120 | 
121 | - SOM is less sensitive to initialization.
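A sketch of a single online SOM update with a Gaussian neighborhood on the map (the 1-D grid layout, learning rate, and bandwidth are illustrative assumptions):

```python
import numpy as np

def som_step(prototypes, grid, x, lr=0.05, sigma=1.0):
    """Move the winning prototype and its map neighbors toward sample x.

    prototypes: (K, d) codebook vectors; grid: (K,) coordinates on the map.
    """
    j = np.argmin(((prototypes - x) ** 2).sum(axis=1))       # winning prototype
    g = np.exp(-((grid - grid[j]) ** 2) / (2 * sigma ** 2))  # neighborhood weights
    return prototypes + lr * g[:, None] * (x - prototypes)

K, d = 10, 2
prototypes = np.random.default_rng(0).normal(size=(K, d))
grid = np.arange(K, dtype=float)  # prototypes laid out on a 1-D map
x = np.array([0.5, -0.2])
prototypes = som_step(prototypes, grid, x)
```

Annealing `lr` and `sigma` over time lets the map settle; in the limit `sigma -> 0` only the winner moves, and the update reduces to the online K-Means / VQ rule above.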
122 | 
123 | ## Dimensionality Reduction
124 | 
125 | Identifies low-dimensional manifolds that preserve the relations or associations of the original data points.
126 | 
--------------------------------------------------------------------------------
/Statistical Learning And Inference/figs/function-estimation-model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YBRua/CourseNotes/9a3a7d322ab13b53fa619abb1a7bfd79938c7944/Statistical Learning And Inference/figs/function-estimation-model.png
--------------------------------------------------------------------------------
/Statistical Learning And Inference/figs/ultimate-em-algorithm-figure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YBRua/CourseNotes/9a3a7d322ab13b53fa619abb1a7bfd79938c7944/Statistical Learning And Inference/figs/ultimate-em-algorithm-figure.png
--------------------------------------------------------------------------------
/Stochastic Processes/NotesTo2613.pdf:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:10a7eb9155926a96fa252b48e657e1edd944298ed8afd3fa9480316f39869475
3 | size 457836
4 | 
--------------------------------------------------------------------------------
/Stochastic Processes/NotesTo2613.tex:
--------------------------------------------------------------------------------
1 | \documentclass[oneside]{book}
2 | \usepackage{xeCJK}
3 | \usepackage{amsmath}
4 | \usepackage{mathtools}
5 | \usepackage{listings} % lstlisting environments for code
6 | \usepackage{booktabs}
7 | \usepackage{ulem}
8 | \usepackage{enumerate}
9 | \usepackage{amsfonts}
10 | \usepackage{amssymb}
11 | \usepackage{amsthm}
12 | \usepackage{setspace} % spacing environment for line spacing
13 | \usepackage[ruled, vlined]{algorithm2e} % algorithms and pseudocode
14 | \usepackage{bm} % bold symbols in math mode
15 | \usepackage{pifont} % circled numbers (172-211), \ding
16 | \usepackage{graphicx}
17 | \usepackage{float}
18 | \usepackage[dvipsnames]{xcolor}
19 | %\usepackage{indentfirst}
20 | \usepackage{ulem} % \sout{} for strikethrough
21 | \normalem % keep the default \emph behaviour
22 | \usepackage{lmodern}
23 | \usepackage{subcaption}
24 | \usepackage[colorlinks, linkcolor=blue]{hyperref}
25 | \usepackage{cleveref}
26 | \usepackage[a4paper]{geometry}
27 | \usepackage{titlesec}
28 | 
29 | \theoremstyle{definition}
30 | \newtheorem{definition}{Definition}[section]
31 | \newtheorem{theorem}{Theorem}[section]
32 | \newtheorem*{optTheorem}{Theorem}
33 | \newtheorem{proposition}{Proposition}[section]
34 | \newtheorem{lemma}{Lemma}[section]
35 | \newtheorem{corollary}{Corollary}[section]
36 | \theoremstyle{remark}
37 | \newtheorem*{remark}{Remark}
38 | \newtheorem*{sketchproof}{Sketch of Proof}
39 | 
40 | \newcommand{\questeq}{\stackrel{?}{=}}
41 | 
42 | 
43 | \title{Notes to AI2613 Stochastic Processes}
44 | \author{\textsc{YBiuR}}
45 | \date{A long long time ago in a far far away SJTU}
46 | 
47 | 
48 | \begin{document}
49 | \setlength{\parskip}{1em}
50 | \setlength{\parindent}{0em}
51 | 
52 | \frontmatter
53 | \maketitle
54 | \chapter*{Preface}
55 | \paragraph{}Learning Convex Optimization is non-convex.
56 | \paragraph{}Yet learning Stochastic Processes is indeed stochastic.
57 | \tableofcontents
58 | \mainmatter
59 | \begin{spacing}{1.2}
60 | \include{DiscreteMarkovChain}
61 | \include{MarkovRandomFields}
62 | \include{PoisonProcess}
63 | \include{ContinousTimeMarkovChain}
64 | \include{Martingale}
65 | \include{BrownianMotion}
66 | \include{Diffusion}
67 | 
68 | \begin{thebibliography}{9}
69 | \bibitem{eosp} Durrett, Richard. \textit{Essentials of Stochastic Processes}. New York: Springer, 1999.
70 | \bibitem{ITPM} Ross, Sheldon M. \textit{Introduction to Probability Models}. Academic Press, 2014.
71 | \bibitem{Chang} J. Chang, \textit{Lecture Notes to Stochastic Processes}, 2007.
72 | \end{thebibliography}
73 | 
74 | \end{spacing}
75 | \end{document}
--------------------------------------------------------------------------------