├── .DS_Store
├── .gitignore
├── Adversarial_Attack_and_Training
│   └── intriguing_properties_of_NN.tex
├── Artificial_General_Intelligence
│   ├── ER_AML.tex
│   └── continual_learning.tex
├── Basic_Machine_Learning
│   ├── fundamental_algo.tex
│   ├── introduction.tex
│   └── notation.tex
├── LICENSE
├── Neural_Networks
│   └── kowledge_distillation.tex
├── README.md
├── imgs
│   ├── .DS_Store
│   ├── continual_learning
│   │   ├── cl_1.png
│   │   ├── cl_2.png
│   │   └── cl_3.png
│   ├── fundamental_algo
│   │   ├── algo_1.png
│   │   ├── algo_2.png
│   │   ├── algo_3.png
│   │   ├── algo_4.png
│   │   └── algo_5.png
│   ├── introduction
│   │   └── intro_1.png
│   └── notation
│       ├── notation_1.png
│       ├── notation_2.png
│       ├── notation_3.png
│       └── notation_4.png
├── machine_learning.pdf
├── machine_learning.tex
└── references.bib

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# LaTeX auxiliary files
*.aux
*.lof
*.log
*.lot
*.fls
*.out
*.toc
*.fmt
*.fot
*.cb
*.cb2
*.lb
*.bbl
*.blg
*.brf
*.idx
*.ilg
*.ind
*.loa
*.glo
*.gls
*.ist
*.acn
*.acr
*.alg
*.glg
*.glsdefs
*.xdy

# LaTeX intermediate files
*.dvi
*.xdv
*.fdb_latexmk
*.synctex.gz
*.synctex(busy)
*.synctex.gz(busy)
*.pdfsync

# LaTeX backup files
*~

# Other files to ignore
*.ps
*.eps
*.cls
*.sty
--------------------------------------------------------------------------------
/Adversarial_Attack_and_Training/intriguing_properties_of_NN.tex:
--------------------------------------------------------------------------------
\chapter{Intriguing Properties of Neural Networks}

Deep neural networks, known for their exceptional performance in speech and visual recognition tasks, exhibit two notable characteristics \citep{szegedy2013intriguing}. First, \textit{the semantic information in their higher layers is embedded not in individual units but in the collective space they form}. This insight shifts the focus from analyzing single neurons to considering entire groups of units in order to understand network processing. Second, \textit{these networks display a surprising sensitivity to minute, yet precisely tailored alterations (perturbations)}. Such small changes can lead to incorrect outcomes. This vulnerability is not due to random noise: the same modifications can deceive different networks, trained on different subsets of the dataset, into misclassifying the same input.

\section{Introduction}

Deep neural networks are powerful learning models that achieve excellent performance on visual and speech recognition problems because they can express arbitrary computation consisting of a modest number of massively parallel nonlinear steps. As the resulting computation is automatically discovered by backpropagation via supervised learning, it can be difficult to interpret and can have counter-intuitive properties.

The \textbf{first} property concerns the semantic meaning of individual units. Contrary to prior belief, it seems that it is the entire space of activations, rather than the individual units, that contains the bulk of the semantic information. The \textbf{second} property concerns the stability of neural networks with respect to small perturbations of their inputs. By applying an \textit{imperceptible} non-random perturbation to a test image, it is possible to arbitrarily change the network's prediction. These perturbations are found by optimizing the input to maximize the prediction error.
The perturbed examples are often called ``adversarial examples''.

\section{Framework}
\textbf{Notation.} $x\in \mathbb{R}^{m}$ denotes an input image, and $\phi(x)$ denotes the activation values of some layer. \citet{szegedy2013intriguing} first examine properties of the image of $\phi(x)$, and then search for its blind spots.
--------------------------------------------------------------------------------
/Artificial_General_Intelligence/ER_AML.tex:
--------------------------------------------------------------------------------
\chapter{New Insights on Reducing Abrupt Representation Change in Online Continual Learning}

Experience Replay (ER), where a small subset of past data is stored and replayed alongside new data, has emerged as a simple and effective learning strategy. The authors focus on the change in the representations of observed data that arises when previously unobserved classes appear in the incoming data stream.

\section{Introduction}
--------------------------------------------------------------------------------
/Artificial_General_Intelligence/continual_learning.tex:
--------------------------------------------------------------------------------
\chapter{Continual Learning: An Overview}
\section{Introduction}
\textbf{Continual Learning} is motivated by the fact that humans and other organisms have the ability to adapt, accumulate, and exploit knowledge. A common setting for continual learning is to learn a sequence of contents one by one and behave as if they were observed simultaneously \citep{wang2023comprehensive}. Each task learned throughout the lifetime can be a new skill, new examples of an old skill, a different environment, etc. (Fig.\ref{fig:cl_1}, a). This attribute of continual learning is why it is also referred to as \textbf{incremental learning} or \textbf{lifelong learning}.
Unlike the conventional pipeline, where joint training is applied, continual learning is characterized by learning from dynamic data distributions. A major challenge is known as \textbf{catastrophic forgetting}, where \textit{adaptation to a new distribution generally results in a largely reduced ability to capture the old ones}. This dilemma is a facet of the trade-off between \textbf{learning plasticity} and \textbf{memory stability}: an excess of the former interferes with the latter, and vice versa. A good continual learning algorithm should also obtain strong \textbf{generalizability} to accommodate distribution differences within and between tasks (Fig.\ref{fig:cl_1}, b). As a naive baseline, retraining on all old training samples (if allowed) makes it easy to address the above challenges, but creates huge computational and storage overheads (as well as potential privacy issues). In fact, continual learning is primarily intended to ensure \textbf{resource efficiency} of model updates, preferably close to learning only the new training samples.

\begin{figure}[H]
\centering
\includegraphics[width=0.7\linewidth]{imgs/continual_learning/cl_1.png}
\caption{A conceptual framework of continual learning. \textbf{a}, Continual learning requires adapting to incremental tasks with dynamic data distributions. \textbf{b}, A desirable solution should ensure a proper balance between stability (red arrow) and plasticity (green arrow), as well as an adequate generalizability to intra-task (blue arrow) and inter-task (orange arrow) distribution differences.
\textbf{c}, Representative strategies have targeted various aspects of machine learning.}
\label{fig:cl_1}
\end{figure}

Numerous efforts have been devoted to addressing the above challenges, which can be conceptually separated into five groups (Fig.\ref{fig:cl_1}, c): \textit{regularization-based approaches}; \textit{replay-based approaches}; \textit{optimization-based approaches}; \textit{representation-based approaches}; and \textit{architecture-based approaches}. These methods are \textit{closely connected}, e.g., regularization and replay ultimately act to rectify the gradient directions, and \textit{highly synergistic}, e.g., the efficacy of replay can be facilitated by distilling knowledge from the old model.

\section{Setup}
In this section, we first present a basic formulation of continual learning. Then we introduce typical scenarios and evaluation metrics.

\subsection{Basic Formulation}
A continual learning model parameterized by \(\theta\) needs to learn the corresponding task(s) with no or limited access to old training samples and perform well on their test sets. Formally, an incoming batch of training samples belonging to a task \(t\) can be represented as \(\mathcal{D}_{t, b}=\left\{\mathcal{X}_{t, b}, \mathcal{Y}_{t, b}\right\}\), where \(\mathcal{X}_{t, b}\) is the input data, \(\mathcal{Y}_{t, b}\) is the data label, \(t \in \mathcal{T}=\{1, \cdots, k\}\) is the task identity, and \(b \in \mathcal{B}_{t}\) is the batch index (\(\mathcal{T}\) and \(\mathcal{B}_{t}\) denote their spaces, respectively). Here we define a ``task'' by its training samples \(\mathcal{D}_{t}\) following the distribution \(\mathbb{D}_{t}:=p\left(\mathcal{X}_{t}, \mathcal{Y}_{t}\right)\) (\(\mathcal{D}_{t}\) denotes the entire training set obtained by omitting the batch index; likewise for \(\mathcal{X}_{t}\) and \(\mathcal{Y}_{t}\)), and assume that there is no difference in distribution between training and testing.
Under realistic constraints, the data label \(\mathcal{Y}_{t}\) and the task identity \(t\) might not always be available. In continual learning, the training samples of each task can arrive incrementally in batches (i.e., \(\left\{\left\{\mathcal{D}_{t, b}\right\}_{b \in \mathcal{B}_{t}}\right\}_{t \in \mathcal{T}}\)) or simultaneously (i.e., \(\left\{\mathcal{D}_{t}\right\}_{t \in \mathcal{T}}\)).

\begin{table}[H]
\centering
\renewcommand{\arraystretch}{1.75}
\resizebox{\textwidth}{!}{
\begin{tabular}{c|c|c}
\hline
\textbf{Scenario} & \textbf{Training} & \textbf{Testing} \\
\hline IIL & \(\left\{\left\{\mathcal{D}_{t, b}, t\right\}_{b \in \mathcal{B}_{t}}\right\}_{t=j}\) & \(\left\{p\left(\mathcal{X}_{t}\right)\right\}_{t=j}\); \(t\) is not required \\
\hline DIL & \(\left\{\mathcal{D}_{t}, t\right\}_{t \in \mathcal{T}}\); \(p\left(\mathcal{X}_{i}\right) \neq p\left(\mathcal{X}_{j}\right)\) and \(\mathcal{Y}_{i}=\mathcal{Y}_{j}\) for \(i \neq j\) & \(\left\{p\left(\mathcal{X}_{t}\right)\right\}_{t \in \mathcal{T}}\); \(t\) is not required \\
\hline TIL & \(\left\{\mathcal{D}_{t}, t\right\}_{t \in \mathcal{T}}\); \(p\left(\mathcal{X}_{i}\right) \neq p\left(\mathcal{X}_{j}\right)\) and \(\mathcal{Y}_{i} \cap \mathcal{Y}_{j}=\emptyset\) for \(i \neq j\) & \(\left\{p\left(\mathcal{X}_{t}\right)\right\}_{t \in \mathcal{T}}\); \(t\) is available \\
\hline CIL & \(\left\{\mathcal{D}_{t}, t\right\}_{t \in \mathcal{T}}\); \(p\left(\mathcal{X}_{i}\right) \neq p\left(\mathcal{X}_{j}\right)\) and \(\mathcal{Y}_{i} \cap \mathcal{Y}_{j}=\emptyset\) for \(i \neq j\) & \(\left\{p\left(\mathcal{X}_{t}\right)\right\}_{t \in \mathcal{T}}\); \(t\) is unavailable \\
\hline TFCL & \(\left\{\left\{\mathcal{D}_{t, b}\right\}_{b \in \mathcal{B}_{t}}\right\}_{t \in \mathcal{T}}\); \(p\left(\mathcal{X}_{i}\right) \neq p\left(\mathcal{X}_{j}\right)\) and \(\mathcal{Y}_{i} \cap \mathcal{Y}_{j}=\emptyset\) for \(i \neq j\) & \(\left\{p\left(\mathcal{X}_{t}\right)\right\}_{t \in \mathcal{T}}\); \(t\) is optionally available \\
\hline OCL & \(\left\{\left\{\mathcal{D}_{t, b}\right\}_{b \in \mathcal{B}_{t}}\right\}_{t \in \mathcal{T}}, |b|=1\); \(p\left(\mathcal{X}_{i}\right) \neq p\left(\mathcal{X}_{j}\right)\) and \(\mathcal{Y}_{i} \cap \mathcal{Y}_{j}=\emptyset\) for \(i \neq j\) & \(\left\{p\left(\mathcal{X}_{t}\right)\right\}_{t \in \mathcal{T}}\); \(t\) is optionally available \\
\hline BBCL & \(\left\{\mathcal{D}_{t}, t\right\}_{t \in \mathcal{T}}\); \(p\left(\mathcal{X}_{i}\right) \neq p\left(\mathcal{X}_{j}\right)\), \(\mathcal{Y}_{i} \neq \mathcal{Y}_{j}\) and \(\mathcal{Y}_{i} \cap \mathcal{Y}_{j} \neq \emptyset\) for \(i \neq j\) & \(\left\{p\left(\mathcal{X}_{t}\right)\right\}_{t \in \mathcal{T}}\); \(t\) is unavailable \\
\hline CPT & \(\left\{\mathcal{D}_{t}^{pt}, t\right\}_{t \in \mathcal{T}^{pt}}\), followed by a downstream task \(j\) & \(\left\{p\left(\mathcal{X}_{t}\right)\right\}_{t=j}\); \(t\) is not required \\
\hline
\end{tabular}}
\caption{A formal comparison of typical continual learning scenarios. \(\mathcal{D}_{t, b}\): the training samples of task \(t\) and batch \(b\). \(|b|\): the size of batch \(b\). \(\mathcal{B}_{t}\): the space of incremental batches belonging to task \(t\). \(\mathcal{D}_{t}\): the training set of task \(t\) (further specified as \(\mathcal{D}_{t}^{pt}\) for pre-training). \(\mathcal{T}\): the space of all incremental tasks (further specified as \(\mathcal{T}^{pt}\) for pre-training). \(\mathcal{X}_{t}\): the input data in \(\mathcal{D}_{t}\). \(p\left(\mathcal{X}_{t}\right)\): the distribution of \(\mathcal{X}_{t}\). \(
\mathcal{Y}_{t}\): the data label of \(\mathcal{X}_{t}\).}
\label{table:4.1}
\end{table}

\subsection{Typical Scenarios}

Details of some typical continual learning scenarios (refer to Table \ref{table:4.1} for a formal comparison):
\begin{itemize}
\item \textit{Instance-Incremental Learning} (IIL): All training samples belong to the same task and arrive in batches.
\item \textit{Domain-Incremental Learning} (DIL): Tasks have the same data label space but different input distributions. Task identities are not required.
\item \textbf{\textit{Task-Incremental Learning}} (TIL): Tasks have disjoint data label spaces. Task identities are provided in both training and testing.
\item \textbf{\textit{Class-Incremental Learning}} (CIL): Tasks have disjoint data label spaces. Task identities are only provided in training.
\item \textit{Task-Free Continual Learning} (TFCL): Tasks have disjoint data label spaces. Task identities are not provided in either training or testing.
\item \textit{Online Continual Learning} (OCL): Tasks have disjoint data label spaces. Training samples for each task arrive as a one-pass data stream.
\item \textit{Blurred Boundary Continual Learning} (BBCL): Task boundaries are blurred, characterized by distinct but overlapping data label spaces.
\item \textit{Continual Pre-training} (CPT): Pre-training data arrives in sequence. The goal is to improve the performance of learning downstream tasks.
\end{itemize}

Many of the above-mentioned scenarios are messy; hence, we will focus on the two most popular scenarios: Task-Incremental Learning and Class-Incremental Learning.

\subsection{Evaluation Metrics}
\textbf{Overall performance} is typically evaluated by \textit{average accuracy} (AA) and \textit{average incremental accuracy} (AIA).
Let \(a_{k, j} \in[0,1]\) denote the classification accuracy evaluated on the test set of the \(j\)-th task after incremental learning of the \(k\)-th task \((j \leq k)\). The output space to compute \(a_{k, j}\) consists of the classes in either \(\mathcal{Y}_{j}\) or \(\cup_{i=1}^{k} \mathcal{Y}_{i}\), corresponding to the use of multi-head evaluation (e.g., TIL) or single-head evaluation (e.g., CIL). The two metrics at the \(k\)-th task are then defined as
\begin{equation*}
\mathrm{AA}_{k}=\frac{1}{k} \sum_{j=1}^{k} a_{k, j}
\end{equation*}
AA represents the overall performance at the current moment.

\begin{equation*}
\mathrm{AIA}_{k}=\frac{1}{k} \sum_{i=1}^{k} \mathrm{AA}_{i}
\end{equation*}
AIA reflects the historical variation.

\textbf{Memory stability} can be evaluated by the \textit{forgetting measure} (FM) and \textit{backward transfer} (BWT). As for the former, the forgetting of a task is calculated by the difference between its maximum performance obtained in the past and its current performance:
\begin{equation*}
f_{j, k}=\max _{i \in\{1, \ldots, k-1\}}\left(a_{i, j}-a_{k, j}\right), \quad \forall j<k
\end{equation*}
--------------------------------------------------------------------------------
/Basic_Machine_Learning/notation.tex:
--------------------------------------------------------------------------------
\[ \mathcal{S}^{\prime} \leftarrow\left\{x^{2} \mid x \in \mathcal{S}, x>3\right\} \]
This notation is used to define a derived set creation operator. It means that we create a new set \( \mathcal{S}^{\prime} \) by including the square of each element \( x \) from the set \( \mathcal{S} \), under the condition that \( x \) is greater than 3. In other words, \( \mathcal{S}^{\prime} \) comprises the squares of all elements in \( \mathcal{S} \) that are greater than 3.

Additionally, the cardinality operator \( |\mathcal{S}| \) denotes the number of elements in the set \( \mathcal{S} \). For example, if \( \mathcal{S} = \{1, 2, 4, 5\} \), then \( \mathcal{S}^{\prime} = \{16, 25\} \), as only 4 and 5 from \( \mathcal{S} \) satisfy the condition \( x > 3 \). The \textbf{cardinality} \( |\mathcal{S}| \) in this case would be 4.
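The derived-set and cardinality operators map directly onto set comprehensions; a minimal sketch in Python (the variable names are illustrative):

```python
# Derived set: S' = {x^2 | x in S, x > 3}
S = {1, 2, 4, 5}
S_prime = {x ** 2 for x in S if x > 3}

print(S_prime)  # {16, 25}
print(len(S))   # cardinality |S| = 4
```

Only 4 and 5 pass the filter `x > 3`, so their squares form the derived set, while `len` plays the role of the cardinality operator.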
\subsection{Operations on Vectors}

\textbf{Vector Addition and Subtraction:}
The sum and difference of two vectors \( \mathbf{x} \) and \( \mathbf{z} \) are defined component-wise as:
\[ \mathbf{x} + \mathbf{z} = \left[x^{(1)} + z^{(1)}, \ldots, x^{(m)} + z^{(m)}\right] \]
\[ \mathbf{x} - \mathbf{z} = \left[x^{(1)} - z^{(1)}, \ldots, x^{(m)} - z^{(m)}\right] \]
\emph{Example:} For \( \mathbf{x} = [1, 2] \) and \( \mathbf{z} = [3, 4] \),
\[ \mathbf{x} + \mathbf{z} = [1+3, 2+4] = [4, 6] \]

\textbf{Scalar Multiplication:}
A vector multiplied by a scalar \( c \) results in a scaled vector:
\[ \mathbf{x} c = \left[c x^{(1)}, \ldots, c x^{(m)}\right] \]
\emph{Example:} For \( \mathbf{x} = [1, 2] \) and \( c = 3 \),
\[ \mathbf{x} c = [3 \times 1, 3 \times 2] = [3, 6] \]

\textbf{Dot Product:}
The dot product of two vectors \( \mathbf{w} \) and \( \mathbf{x} \) is a scalar:
\[ \mathbf{w} \mathbf{x} = \sum_{i=1}^{m} w^{(i)} x^{(i)} \]
\emph{Example:} For \( \mathbf{w} = [1, 2] \) and \( \mathbf{x} = [3, 4] \),
\[ \mathbf{w} \mathbf{x} = 1 \times 3 + 2 \times 4 = 3 + 8 = 11 \]

\textbf{Matrix-Vector Multiplication:}
Multiplying a matrix \( \mathbf{W} \) by a vector \( \mathbf{x} \) yields another vector.
For example:
$$
\begin{aligned}
\mathbf{W} \mathbf{x} & =\left[\begin{array}{lll}
w^{(1,1)} & w^{(1,2)} & w^{(1,3)} \\
w^{(2,1)} & w^{(2,2)} & w^{(2,3)}
\end{array}\right]\left[\begin{array}{l}
x^{(1)} \\
x^{(2)} \\
x^{(3)}
\end{array}\right] \\
& \stackrel{\text { def }}{=}\left[\begin{array}{l}
w^{(1,1)} x^{(1)}+w^{(1,2)} x^{(2)}+w^{(1,3)} x^{(3)} \\
w^{(2,1)} x^{(1)}+w^{(2,2)} x^{(2)}+w^{(2,3)} x^{(3)}
\end{array}\right] \\
& =\left[\begin{array}{l}
\mathbf{w}^{(1)} \mathbf{x} \\
\mathbf{w}^{(2)} \mathbf{x}
\end{array}\right]
\end{aligned}
$$

\emph{Example:} For
\[ \mathbf{W} = \left[\begin{array}{ll}
1 & 2 \\
3 & 4
\end{array}\right] \text{ and } \mathbf{x} = \left[\begin{array}{l}
5 \\
6
\end{array}\right], \]
\[ \mathbf{W} \mathbf{x} = \left[\begin{array}{l}
1 \times 5 + 2 \times 6 \\
3 \times 5 + 4 \times 6
\end{array}\right] = \left[\begin{array}{l}
17 \\
39
\end{array}\right] \]

\textbf{Transpose and Multiplication:}
For the transpose of a vector \( \mathbf{x} \), denoted \( \mathbf{x}^{\top} \), and a matrix \( \mathbf{W} \), the multiplication \( \mathbf{x}^{\top} \mathbf{W} \) is given by:
$$
\begin{aligned}
\mathbf{x}^{\top} \mathbf{W} & =\left[\begin{array}{ll}
x^{(1)} & x^{(2)}
\end{array}\right]\left[\begin{array}{lll}
w^{(1,1)} & w^{(1,2)} & w^{(1,3)} \\
w^{(2,1)} & w^{(2,2)} & w^{(2,3)}
\end{array}\right] \\
& \stackrel{\text { def }}{=}\left[w^{(1,1)} x^{(1)}+w^{(2,1)} x^{(2)}, w^{(1,2)} x^{(1)}+w^{(2,2)} x^{(2)}, w^{(1,3)} x^{(1)}+w^{(2,3)} x^{(2)}\right]
\end{aligned}
$$

\emph{Example:} For
\[ \mathbf{x} = \left[\begin{array}{l}
7 \\
8
\end{array}\right] \text{ and } \mathbf{W} = \left[\begin{array}{lll}
1 & 2 & 3 \\
4 & 5 & 6
\end{array}\right], \]
\[ \mathbf{x}^{\top} \mathbf{W} = \left[\begin{array}{lll}
7 \times 1 + 8 \times 4, 7 \times 2 + 8 \times 5, 7 \times 3 + 8 \times 6
\end{array}\right] = \left[\begin{array}{lll}
39, 54, 69
\end{array}\right] \]

\subsection{Functions}

\textbf{Definition of a Function}\\
A function is a relation that associates each element \( x \) of a set \( \mathcal{X} \), known as the domain, to a single element \( y \) of another set \( \mathcal{Y} \), known as the codomain. This relation is denoted as \( y = f(x) \), where \( f \) is the name of the function, \( x \) is the input or argument, and \( y \) is the output. The input variable is also referred to as the variable of the function.

\emph{Example:} Consider the function \( f(x) = x^2 \) defined on the domain \( \mathcal{X} = \mathbb{R} \). For \( x = 2 \), the output is \( f(2) = 2^2 = 4 \).

\textbf{Local and Global Minima}\\
The function \( f(x) \) has a local minimum at \( x = c \) if \( f(x) \geq f(c) \) for every \( x \) in an open interval around \( c \). An open interval, such as \( (0,1) \), includes all numbers between its endpoints but not the endpoints themselves. The smallest value among all local minima is known as the global minimum.

\emph{Example:} In the function \( f(x) = (x-1)^2 \), the local (and global) minimum occurs at \( x = 1 \) since \( f(x) \geq f(1) = 0 \) for all \( x \).

\begin{figure}[H]
\centering
\includegraphics[width=0.7\linewidth]{imgs/notation/notation_3.png}
\caption{A local and a global minimum of a function.}
\label{fig:notation_3}
\end{figure}

\textbf{Vector Functions}\\
A vector function, denoted \( \mathbf{y} = \mathbf{f}(x) \), is a function that returns a vector \( \mathbf{y} \). Its argument can be either a vector or a scalar.
\emph{Example:} For the vector function \( \mathbf{f}(x) = [x, x^2] \), with \( x = 2 \), the output is \( \mathbf{f}(2) = [2, 2^2] = [2, 4] \).

\subsection{Max and Arg Max}

Given a set of values \( \mathcal{A} = \{a_{1}, a_{2}, \ldots, a_{n}\} \), the operator \( \max_{a \in \mathcal{A}} f(a) \) returns the highest value of \( f(a) \) over all elements in the set \( \mathcal{A} \). Conversely, the operator \( \arg \max_{a \in \mathcal{A}} f(a) \) identifies the specific element \( a \) in the set \( \mathcal{A} \) that maximizes the function \( f(a) \).

In cases where the set is implicit or infinite, we can use the notation \( \max_{a} f(a) \) or \( \arg \max_{a} f(a) \), respectively. Similarly, the operators \( \min \) and \( \arg \min \) work in a comparable way, determining the lowest value of a function and the element that minimizes it, respectively.

\subsection{Assignment Operator}
The expression \(a \leftarrow f(x)\) means that the variable \(a\) gets the new value: the result of \(f(x)\). We say that the variable \(a\) gets assigned a new value. Similarly, \(\mathbf{a} \leftarrow\left[a_{1}, a_{2}\right]\) means that the vector variable \(\mathbf{a}\) gets the two-dimensional vector \(\left[a_{1}, a_{2}\right]\).

\subsection{Derivative and Gradient}
A \textbf{derivative} \(f^{\prime}\) of a function \(f\) is a function or a value that describes how fast \(f\) grows (or decreases). If the derivative \(f^{\prime}\) is a function, then the function \(f\) can grow at a different pace in different regions of its domain.

We can use the \textbf{chain rule} when we encounter a hard-to-differentiate function. For instance, if \(F(x)=f(g(x))\), where \(f\) and \(g\) are some functions, then \(F^{\prime}(x)= f^{\prime}(g(x)) g^{\prime}(x)\).

The \textbf{gradient} is the generalization of the derivative for functions that take several inputs (or one input in the form of a vector or some other complex structure).
A gradient of a function is a vector of \textbf{partial derivatives}. For example, let \(f\left(\left[x^{(1)}, x^{(2)}\right]\right)=a x^{(1)}+b x^{(2)}+c\); then the partial derivative of the function \(f\) \textit{with respect to} \(x^{(1)}\), denoted \(\frac{\partial f}{\partial x^{(1)}}\), is given by
$$
\frac{\partial f}{\partial x^{(1)}}=a+0+0=a
$$
where \(a\) is the derivative of the function \(a x^{(1)}\); the two zeroes are respectively the derivatives of \(b x^{(2)}\) and \(c\), because \(x^{(2)}\) is considered constant when we compute the derivative with respect to \(x^{(1)}\), and the derivative of any constant is zero. Similarly, the partial derivative of \(f\) with respect to \(x^{(2)}\), \(\frac{\partial f}{\partial x^{(2)}}\), is given by
$$
\frac{\partial f}{\partial x^{(2)}}=0+b+0=b
$$

The gradient of the function \(f\), denoted \(\nabla f\), is given by the vector \(\left[\frac{\partial f}{\partial x^{(1)}}, \frac{\partial f}{\partial x^{(2)}}\right]\).

\section{Random Variable}

A \textbf{random variable}, usually written as an italic letter, like \(X\), is a variable whose possible values are numerical outcomes of a random phenomenon. There are two types of random variables: \textbf{discrete} and \textbf{continuous}. A \textbf{discrete random variable} takes on only a countable number of distinct values such as \textit{red}, \textit{yellow}, \textit{blue} or \(1, 2, 3, \ldots\)

\begin{figure}[H]
\centering
\includegraphics[width=0.7\linewidth]{imgs/notation/notation_4.png}
\caption{A probability mass function and a probability density function.}
\label{fig:notation_4}
\end{figure}
The \textbf{probability distribution} of a discrete random variable is described by a list of probabilities associated with each of its possible values.
This list of probabilities is called a \textbf{probability mass function} (pmf) (Fig.\ref{fig:notation_4}, \textbf{a}).

A \textbf{continuous random variable} takes an infinite number of possible values in some interval. The probability distribution of a continuous random variable (a continuous probability distribution) is described by a \textbf{probability density function} (pdf) (Fig.\ref{fig:notation_4}, \textbf{b}).

Let a discrete random variable \(X\) have \(k\) possible values \(\left\{x_{i}\right\}_{i=1}^{k}\). The \textbf{expectation} of \(X\), denoted \(\mathbb{E}[X]\), is given by

\begin{equation}
\begin{aligned}
\mathbb{E}[X] & \stackrel{\text { def }}{=} \sum_{i=1}^{k}\left[x_{i} \cdot \operatorname{Pr}\left(X=x_{i}\right)\right] \\ & =x_{1} \cdot \operatorname{Pr}\left(X=x_{1}\right)+x_{2} \cdot \operatorname{Pr}\left(X=x_{2}\right)+\cdots+x_{k} \cdot \operatorname{Pr}\left(X=x_{k}\right)
\end{aligned}
\label{notation:1}
\end{equation}

where \(\operatorname{Pr}\left(X=x_{i}\right)\) is the probability that \(X\) has the value \(x_{i}\) according to the pmf.
The expectation of a random variable is also called the \textbf{mean}, \textbf{average} or \textbf{expected value}, and is frequently denoted with the letter \(\mu\).

The \textbf{standard deviation} is defined as
$$
\sigma \stackrel{\text { def }}{=} \sqrt{\mathbb{E}\left[(X-\mu)^{2}\right]}
$$
\textbf{Variance}, denoted \(\sigma^{2}\) or \(\operatorname{var}(X)\), is defined as
$$
\sigma^{2}=\mathbb{E}\left[(X-\mu)^{2}\right]
$$
For a discrete random variable, the standard deviation is given by:
$$
\sigma=\sqrt{\operatorname{Pr}\left(X=x_{1}\right)\left(x_{1}-\mu\right)^{2}+\operatorname{Pr}\left(X=x_{2}\right)\left(x_{2}-\mu\right)^{2}+\cdots+\operatorname{Pr}\left(X=x_{k}\right)\left(x_{k}-\mu\right)^{2}}
$$

The expectation of a continuous random variable \(X\) is given by
\begin{equation}
\mathbb{E}[X] \stackrel{\text { def }}{=} \int_{\mathbb{R}} x f_{X}(x) d x
\label{notation:2}
\end{equation}

where \(f_{X}\) is the pdf of the variable \(X\) and \(\int_{\mathbb{R}}\) is the integral of the function \(x f_{X}\) over \(\mathbb{R}\).
\begin{tcolorbox}[enhanced jigsaw, breakable, pad at break*=1mm, colback=gray!20!white, colframe=black!85!black, title=\textbf{Real-Life Examples of Probability Concepts}]

\textbf{Standard Deviation of a Discrete Random Variable:}
Consider a dice game where you roll a six-sided die. Each face represents a different prize amount in dollars: \{1, 2, 3, 4, 5, 6\}. The probability of each outcome is \( \frac{1}{6} \) for a fair die.
\textit{Mean Calculation:}
The mean (\( \mu \)) or expected value of your winnings per roll is:
\[ \mu = \frac{1+2+3+4+5+6}{6} = 3.5 \, \text{dollars} \]

\textit{Standard Deviation Calculation:}
The standard deviation \( \sigma \) is given by:
\[ \sigma = \sqrt{\sum_{i=1}^{6} \left(\frac{1}{6}\right) \times (i - 3.5)^2} \]

\textbf{Expectation of a Continuous Random Variable:}
Imagine the waiting time for a bus, which follows a continuous uniform distribution between 0 and 1 hour.

\textit{Expectation Calculation:}
The expectation of the waiting time, \( \mathbb{E}[X] \), is calculated as:
$$
\mathbb{E}[X]=\int_{0}^{1} x \, d x=\left[\frac{1}{2} x^{2}\right]_{0}^{1}=\frac{1}{2}\left(1^{2}\right)-\frac{1}{2}\left(0^{2}\right)=\frac{1}{2}
$$
The result of this integral is \( \frac{1}{2} \) hour, indicating the average waiting time.
\end{tcolorbox}
The property of the pdf that the area under its curve is 1 mathematically means that \(\int_{\mathbb{R}} f_{X}(x) d x=1\). Most of the time we don't know \(f_{X}\), but we can observe some values of \(X\). In machine learning, we call these values \textbf{examples}, and the collection of these examples is called a \textbf{sample} or a \textbf{dataset}.

\section{Unbiased Estimator}
Because \(f_{X}\) is usually unknown, but we have a sample \(S_{X}=\left\{x_{i}\right\}_{i=1}^{N}\), we often content ourselves not with the true values of statistics of the probability distribution, such as the expectation, but with their \textbf{unbiased estimators}.
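The die example above gives a concrete way to see unbiasedness numerically: average the sample mean over many independent samples and it should land on the true mean \( \mu = 3.5 \). A minimal simulation sketch (the sample size and sample count are arbitrary choices):

```python
import random

random.seed(0)

# True distribution: the fair six-sided die from the box above; true mean mu = 3.5.
def draw_sample(n):
    return [random.randint(1, 6) for _ in range(n)]

# The sample mean (1/N) * sum(x_i) is an unbiased estimator of E[X]:
# averaging it over many independent samples should approach mu.
sample_means = [sum(s) / len(s) for s in (draw_sample(10) for _ in range(20000))]
avg_of_means = sum(sample_means) / len(sample_means)

print(avg_of_means)  # close to 3.5
```

Each individual sample mean fluctuates around 3.5, but their average converges to it, which is exactly the property the definition below formalizes.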
We say that \(\hat{\theta}\left(S_{X}\right)\) is an unbiased estimator of some statistic \(\theta\) calculated using a sample \(S_{X}\) drawn from an unknown probability distribution if \(\hat{\theta}\left(S_{X}\right)\) has the following property:
$$
\mathbb{E}\left[\hat{\theta}\left(S_{X}\right)\right]=\theta
$$
where \(\hat{\theta}\) is a \textbf{sample statistic}, obtained using a sample \(S_{X}\), and not the real statistic \(\theta\) that can be obtained only by knowing \(X\); the expectation is taken over all possible samples drawn from \(X\). Intuitively, this means that if you could have an unlimited number of such samples as \(S_{X}\), and you computed some unbiased estimator, such as \(\hat{\mu}\), using each sample, then the average of all these \(\hat{\mu}\) would equal the real statistic \(\mu\) that you would get computed on \(X\).

It can be shown that an unbiased estimator of an unknown \(\mathbb{E}[X]\) (Eq.\ref{notation:1} or Eq.\ref{notation:2}) is given by \(\frac{1}{N} \sum_{i=1}^{N} x_{i}\) (called in statistics the \textbf{sample mean}).

\section{Bayes' Rule}
The conditional probability \(\operatorname{Pr}(X=x \mid Y=y)\) is the probability of the random variable \(X\) having a specific value \(x\) given that another random variable \(Y\) has a specific value \(y\). \textbf{Bayes' Rule} (also known as \textbf{Bayes' Theorem}) stipulates that:
$$
\operatorname{Pr}(X=x \mid Y=y)=\frac{\operatorname{Pr}(Y=y \mid X=x) \operatorname{Pr}(X=x)}{\operatorname{Pr}(Y=y)}
$$

\section{Parameter Estimation}
Bayes' Rule comes in handy when we have a model of \(X\)'s distribution, and this model \(f_{\theta}\) is a function that has some parameters in the form of a vector \(\theta\).
An example of such a function is the Gaussian function, which has two parameters, $\mu$ and $\sigma$, and is defined as: 281 | \begin{equation} 282 | f_{\boldsymbol{\theta}}(x)=\frac{1}{\sqrt{2 \pi \sigma^{2}}} e^{-\frac{(x-\mu)^{2}}{2 \sigma^{2}}} 283 | \label{notation:3} 284 | \end{equation} 285 | where \(\boldsymbol{\theta} \stackrel{\text { def }}{=}[\mu, \sigma]\) and \(\pi\) is the constant \((3.14159 \ldots)\). 286 | 287 | This function has all the properties of a pdf. In fact, it is the pdf of one of the probability distributions most frequently used in practice, called the \textbf{Gaussian distribution} or \textbf{normal distribution} and denoted \(\mathcal{N}\left(\mu, \sigma^{2}\right)\). Therefore, we can use it as a model of the unknown distribution of \(X\), and we can update the values of the parameters in the vector \(\theta\) from the data using Bayes' Rule. 288 | 289 | \begin{equation} 290 | \operatorname{Pr}(\theta=\hat{\theta} \mid X=x) \leftarrow \frac{\operatorname{Pr}(X=x \mid \theta=\hat{\theta}) \operatorname{Pr}(\theta=\hat{\theta})}{\operatorname{Pr}(X=x)}=\frac{\operatorname{Pr}(X=x \mid \theta=\hat{\theta}) \operatorname{Pr}(\theta=\hat{\theta})}{\sum_{\tilde{\theta}} \operatorname{Pr}(X=x \mid \theta=\tilde{\theta}) \operatorname{Pr}(\theta=\tilde{\theta})} 291 | \label{eq:bayes_prediction} 292 | \end{equation} 293 | 294 | where \(\operatorname{Pr}(X=x \mid \theta=\hat{\theta}) \stackrel{\text { def }}{=} f_{\hat{\theta}}(x)\). If we have a sample \(\mathcal{S}\) of \(X\) and the set of possible values for \(\theta\) is finite, we can easily estimate \(\operatorname{Pr}(\theta=\hat{\theta})\) by applying Bayes' Rule iteratively, one example \(x \in \mathcal{S}\) at a time. The initial values \(\operatorname{Pr}(\theta=\hat{\theta})\) can be guessed such that \(\sum_{\hat{\theta}} \operatorname{Pr}(\theta=\hat{\theta})=1\). This guess of the probabilities for different \(\hat{\theta}\) is called the \textbf{prior}. 
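When the candidate set for \(\theta\) is finite, this iterative estimation is easy to carry out numerically. Below is a minimal sketch (assuming NumPy; the candidate grid, the known \(\sigma = 1\), and the simulated data from \(\mathcal{N}(2, 1)\) are all hypothetical choices) that applies Bayes' Rule one example at a time, using the previous posterior as the prior for the next update, a common simplification of the update scheme:

```python
import numpy as np

rng = np.random.default_rng(1)

# Hypothetical finite candidate set for theta = mu (sigma assumed known).
thetas = np.array([0.0, 1.0, 2.0, 3.0, 4.0])
posterior = np.full(len(thetas), 1 / len(thetas))  # uniform prior

def likelihood(x, mu, sigma=1.0):
    """Gaussian pdf f_theta(x) with parameters mu and sigma."""
    return np.exp(-((x - mu) ** 2) / (2 * sigma**2)) / np.sqrt(2 * np.pi * sigma**2)

# Simulated sample from the "true" distribution N(mu=2, sigma=1).
sample = rng.normal(loc=2.0, scale=1.0, size=200)

for x in sample:
    # Bayes' Rule over the finite set: posterior ∝ likelihood × prior.
    unnorm = likelihood(x, thetas) * posterior
    posterior = unnorm / unnorm.sum()

# The candidate maximizing the posterior is the MAP estimate.
map_theta = thetas[np.argmax(posterior)]
print(map_theta)
```

After 200 examples the posterior concentrates almost entirely on the candidate nearest the true mean, so `map_theta` comes out as 2.0 for this simulated sample; with a continuous parameter space one would instead maximize the (log-)posterior with a numerical optimizer such as gradient descent.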
295 | 296 | First, we compute \(\operatorname{Pr}\left(\theta=\hat{\theta} \mid X=x_{1}\right)\) for all possible values \(\hat{\theta}\). Then, before updating \(\operatorname{Pr}(\theta=\hat{\theta} \mid X=x)\) once again, this time for \(x=x_{2} \in \mathcal{S}\) using Eq.\ref{eq:bayes_prediction}, we replace the prior \(\operatorname{Pr}(\theta=\hat{\theta})\) in Eq.\ref{eq:bayes_prediction} by the new estimate \(\operatorname{Pr}(\theta=\hat{\theta}) \leftarrow \frac{1}{N} \sum_{x \in \mathcal{S}} \operatorname{Pr}(\theta=\hat{\theta} \mid X=x)\). 297 | 298 | The optimal parameters \(\theta^{*}\) given the sample \(\mathcal{S}\) are obtained using the principle of \textbf{maximum a posteriori} (or MAP): 299 | \begin{equation} 300 | \theta^{*}=\underset{\hat{\theta}}{\arg \max } \prod_{i=1}^{N} \operatorname{Pr}\left(\theta=\hat{\theta} \mid X=x_{i}\right) 301 | \label{maximum a posteriori} 302 | \end{equation} 303 | If the set of possible values for $\theta$ isn't finite, then we need to optimize Eq. \ref{maximum a posteriori} directly using a numerical optimization routine, such as gradient descent. Usually, we optimize the natural logarithm of the right-hand side expression in Eq. \ref{maximum a posteriori} because the logarithm of a product becomes a sum of logarithms, and it's easier for the machine to work with a sum than with a product. 304 | 305 | \section{Parameters vs. Hyperparameters} 306 | 307 | A \textit{hyperparameter} is a property of a learning algorithm, usually (but not always) having a numerical value. That value influences the way the algorithm works. Hyperparameters aren't learned by the algorithm itself from data; they have to be set by the data analyst before running the algorithm. \textit{Parameters} are variables that define the model learned by the learning algorithm. Parameters are directly modified by the learning algorithm based on the training data. 
The goal of learning is to find such values of the parameters that make the model optimal in a certain sense. 308 | 309 | \section{Classification vs. Regression} 310 | \textbf{Classification} is the problem of automatically assigning a \textbf{label} to an \textbf{unlabeled example}; spam detection is a classic example. The classification problem is solved by a \textbf{classification learning algorithm} that takes a collection of \textbf{labeled examples} as inputs and produces a \textbf{model} that can take an unlabeled example as input and either directly output a label or output a number that the analyst can use to deduce the label. An example of such a number is a probability. 311 | 312 | In a classification problem, a label is a member of a finite set of \textbf{classes}. If the size of the set of classes is two, the problem is \textbf{binary} (or \textbf{binomial}) \textbf{classification}. \textbf{Multiclass classification} (also called \textbf{multinomial}) is a classification problem with three or more classes. 313 | 314 | \textbf{Regression} is the problem of predicting a real-valued label (often called a \textbf{target}) given an unlabeled example. The regression problem is solved by a \textbf{regression learning algorithm} that takes a collection of labeled examples as inputs and produces a model that can take an unlabeled example as input and output a target. 315 | 316 | \section{Model-Based vs. Instance-Based Learning} 317 | Most supervised learning algorithms are model-based. \textit{Model-based learning algorithms} use the training data to create a \textbf{model} with \textbf{parameters} learned from the training data. \textit{Instance-based learning algorithms} use the whole dataset as the model. One instance-based algorithm frequently used in practice is \textbf{k-Nearest Neighbors} (kNN). 318 | 319 | \section{Shallow vs. Deep Learning} 320 | A \textbf{shallow learning} algorithm learns the parameters of the model directly from the features of the training examples. 
A notable exception is the family of \textbf{neural network} learning algorithms, specifically those that build networks with more than one \textbf{layer} between input and output. Such neural networks are called \textbf{deep neural networks}. In deep neural network learning (or, simply, \textbf{deep learning}), contrary to shallow learning, most model parameters are learned not directly from the features of the training examples, but from the outputs of the preceding layers. 321 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Jue Guo 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Neural_Networks/kowledge_distillation.tex: -------------------------------------------------------------------------------- 1 | \chapter{Distilling the Knowledge in a Neural Network} 2 | This chapter is a direct and indirect reference to \cite{hinton2015distilling}. 3 | 4 | A simple way to improve the performance of almost any machine learning algorithm is to train many different models on the same data and then average their predictions. Unfortunately, making predictions using a whole ensemble of models is often too computationally expensive to allow deployment. \textit{It is shown that we can distill the knowledge of an ensemble of models into a single model.} The authors also introduce a new type of ensemble composed of one or more full models and many specialist models that learn to distinguish fine-grained classes the full models confuse. Unlike a mixture of experts, these specialist models can be trained rapidly and in parallel. 5 | 6 | \section{Introduction} 7 | 8 | In large-scale machine learning, we typically use very similar models for the training stage and the deployment stage despite their very different requirements. The authors draw an analogy to insects, which have a larval form optimized for extracting energy and nutrients from the environment and an adult form optimized for traveling and reproduction; it suggests that we should likewise be willing to train very cumbersome models if that makes it easier to extract structure from the data. The cumbersome model could be an ensemble of separately trained models or a single very large model trained with a very strong regularizer such as dropout. 
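The central mechanism of the paper is to soften the cumbersome model's output distribution by raising the temperature of its softmax, so that a small student model can be trained on these soft targets, which encode how the teacher generalizes. A minimal sketch of the idea (the 4-class logits and the temperature below are hypothetical values, and NumPy is assumed):

```python
import numpy as np

def softmax(z, T=1.0):
    """Temperature-scaled softmax; larger T gives a softer distribution."""
    z = np.asarray(z, dtype=float) / T
    z = z - z.max()              # subtract max for numerical stability
    e = np.exp(z)
    return e / e.sum()

# Hypothetical teacher logits for a 4-class problem.
teacher_logits = np.array([6.0, 2.0, 1.0, -1.0])

hard = softmax(teacher_logits, T=1.0)  # nearly one-hot
soft = softmax(teacher_logits, T=4.0)  # exposes similarity structure

def distillation_loss(student_logits, teacher_logits, T=4.0):
    """Cross-entropy between softened teacher and student distributions."""
    p = softmax(teacher_logits, T)
    q = softmax(student_logits, T)
    return -np.sum(p * np.log(q))

print(hard.round(3))
print(soft.round(3))
```

At T=1 nearly all the probability mass sits on the top class, while at T=4 the relative magnitudes of the remaining logits become visible; training the student to minimize this loss (typically mixed with the usual hard-label loss) is how the knowledge transfer is carried out.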
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A Comprehensive Note on Machine Learning 2 | 3 | ## Introduction 4 | 5 | This document serves as an educational resource primarily designed for the courses I teach, including: 6 | 7 | - **CSE 474/574 Introduction to Machine Learning** 8 | - **CSE 455/555 Pattern Recognition** 9 | - **CSE 676 Deep Learning** 10 | 11 | Additionally, it functions as a personal reference in the field of machine learning. 12 | 13 | ## Purpose and Use 14 | 15 | This compilation aims to provide an extensive overview and guide for students and practitioners of machine learning, incorporating a blend of **direct** references and **adaptations** from established texts and sources. It is intended for: 16 | 17 | - **Educational Aid**: As a supplementary resource for teaching and learning. 18 | - **Reference Material**: For personal and academic use. 19 | 20 | This document is **not authorized** for commercial use, redistribution, or sale without explicit consent. To read the PDF version of the document: [PDF](https://github.com/COD1995/A-Comprehensive-Note-on-Machine-Learning/blob/main/machine_learning.pdf) 21 | 22 | ## Community Contributions 23 | I will update this note regularly. To motivate my students to read it, I offer **3 bonus points** for significant and meaningful contributions to each chapter of the note. I welcome contributions and feedback from the community! If you have suggestions, corrections, or additional material you think would enhance this resource, please feel free to contribute. Here's how you can do that: 24 | 25 | - **Fork the Repository**: Create your own fork of the project. 26 | - **Make Changes**: Add your contributions or modifications. 27 | - **Submit a Pull Request**: Open a pull request to the original repository with a clear list of what you've done. 
28 | - **Review & Merge**: I will review your changes and merge them into the main document as appropriate. 29 | 30 | Details on how to create a pull request: [Creating a pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request) 31 | 32 | When contributing, please adhere to the following guidelines: 33 | 34 | - Ensure that any added content is accurate and relevant to machine learning. 35 | - Respect intellectual property and cite sources appropriately. 36 | - Maintain a respectful and constructive tone in discussions and pull requests. 37 | 38 | ## Installation of LaTeX 39 | 40 | LaTeX is a high-quality typesetting system; it includes features designed for the production of technical and scientific documentation. My preferred setup for writing LaTeX documents is Visual Studio Code (VSCode) with the LaTeX Workshop extension, offering a user-friendly and efficient LaTeX writing experience. Below is a guide to help you set up this environment. 41 | 42 | ### 1. Install LaTeX Distribution 43 | To begin with, you need a LaTeX distribution installed on your computer. 44 | 45 | - **Windows**: Use MiKTeX or TeX Live. MiKTeX is more user-friendly for beginners, while TeX Live is more comprehensive. Download from [MiKTeX](https://miktex.org/) or [TeX Live](https://www.tug.org/texlive/). 46 | - **macOS**: Install MacTeX, which is a macOS version of TeX Live with additional tools. Download from [MacTeX](http://www.tug.org/mactex/). 47 | - **Linux**: Most Linux distributions include TeX Live in their package repositories. Install it using your package manager (for example, `sudo apt-get install texlive-full` in Ubuntu). 48 | 49 | ### 2. Install Visual Studio Code (VSCode) 50 | VSCode is a free, open-source editor with a wide array of features. 51 | 52 | - Download and install VSCode from the [official website](https://code.visualstudio.com/). 53 | 54 | ### 3. 
Install LaTeX Workshop Extension in VSCode 55 | LaTeX Workshop enhances VSCode with LaTeX typesetting capabilities. 56 | 57 | - Open VSCode. 58 | - Navigate to Extensions (`Ctrl+Shift+X` / `Cmd+Shift+X`). 59 | - Search for "LaTeX Workshop" and install the extension. 60 | 61 | ### 4. Configure LaTeX Workshop 62 | Customize LaTeX Workshop settings for your needs. 63 | 64 | - Go to `File > Preferences > Settings` (`Code > Preferences > Settings` on macOS). 65 | - Search for "LaTeX Workshop" settings. 66 | - Configure according to your preferences, like setting up a default compiler or enabling auto-build on save. 67 | 68 | ### 5. Start Writing LaTeX 69 | Now, you are ready to write LaTeX documents. 70 | 71 | - Create a new file with a `.tex` extension in VSCode. 72 | - Write your LaTeX content. 73 | - Use the build feature in LaTeX Workshop to compile your document into a PDF. 74 | 75 | ### 6. Additional Tools and Tips 76 | - **Git Integration**: VSCode's integrated support for Git is beneficial for version controlling your LaTeX documents. 77 | - **Live Preview**: LaTeX Workshop supports live preview of your document. 78 | - **Custom Snippets**: Create custom snippets for frequently used LaTeX commands to improve efficiency. 79 | 80 | This setup with VSCode and LaTeX Workshop provides a powerful, modern environment for writing and managing LaTeX documents, blending LaTeX's typesetting capabilities with the features of a contemporary code editor. 81 | 82 | 83 | ## Primary Sources 84 | 85 | The majority of the material referenced in this document comes from the following key sources: 86 | 87 | 1. Zhang, Aston, et al. "Dive into Deep Learning." Cambridge University Press, 2023. 88 | 2. Bishop, C. M., & Nasrabadi, N. M. "Pattern Recognition and Machine Learning" (Vol. 4, No. 4, p. 738). New York: Springer, 2006. 89 | 3. Hart, P. E., Stork, D. G., & Duda, R. O. "Pattern Classification." Hoboken: Wiley, 2000. 90 | 4. Burkov, A. 
"The Hundred-Page Machine Learning Book" (Vol. 1, p. 32). Quebec City, QC, Canada: Andriy Burkov, 2019. 91 | 5. Burkov, A. "Machine Learning Engineering" (Vol. 1). Montreal, QC, Canada: True Positive Incorporated, 2020. 92 | 93 | 94 | ## Additional References 95 | 96 | All other referenced materials and sources are cited in the bibliography section of this document. 97 | 98 | ## Contact Information 99 | 100 | For inquiries, permissions, or further information, please reach out to me at ( jueguo@buffalo.edu ). 101 | 102 | ## Disclaimer 103 | 104 | This document is provided "as is," and the author makes no representations or warranties, express or implied, regarding its completeness, accuracy, or reliability. 105 | -------------------------------------------------------------------------------- /imgs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COD1995/A-Comprehensive-Note-on-Machine-Learning/068268fc12c612b2922acc181608104e570cf8f5/imgs/.DS_Store -------------------------------------------------------------------------------- /imgs/continual_learning/cl_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COD1995/A-Comprehensive-Note-on-Machine-Learning/068268fc12c612b2922acc181608104e570cf8f5/imgs/continual_learning/cl_1.png -------------------------------------------------------------------------------- /imgs/continual_learning/cl_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COD1995/A-Comprehensive-Note-on-Machine-Learning/068268fc12c612b2922acc181608104e570cf8f5/imgs/continual_learning/cl_2.png -------------------------------------------------------------------------------- /imgs/continual_learning/cl_3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/COD1995/A-Comprehensive-Note-on-Machine-Learning/068268fc12c612b2922acc181608104e570cf8f5/imgs/continual_learning/cl_3.png -------------------------------------------------------------------------------- /imgs/fundamental_algo/algo_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COD1995/A-Comprehensive-Note-on-Machine-Learning/068268fc12c612b2922acc181608104e570cf8f5/imgs/fundamental_algo/algo_1.png -------------------------------------------------------------------------------- /imgs/fundamental_algo/algo_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COD1995/A-Comprehensive-Note-on-Machine-Learning/068268fc12c612b2922acc181608104e570cf8f5/imgs/fundamental_algo/algo_2.png -------------------------------------------------------------------------------- /imgs/fundamental_algo/algo_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COD1995/A-Comprehensive-Note-on-Machine-Learning/068268fc12c612b2922acc181608104e570cf8f5/imgs/fundamental_algo/algo_3.png -------------------------------------------------------------------------------- /imgs/fundamental_algo/algo_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COD1995/A-Comprehensive-Note-on-Machine-Learning/068268fc12c612b2922acc181608104e570cf8f5/imgs/fundamental_algo/algo_4.png -------------------------------------------------------------------------------- /imgs/fundamental_algo/algo_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COD1995/A-Comprehensive-Note-on-Machine-Learning/068268fc12c612b2922acc181608104e570cf8f5/imgs/fundamental_algo/algo_5.png 
-------------------------------------------------------------------------------- /imgs/introduction/intro_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COD1995/A-Comprehensive-Note-on-Machine-Learning/068268fc12c612b2922acc181608104e570cf8f5/imgs/introduction/intro_1.png -------------------------------------------------------------------------------- /imgs/notation/notation_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COD1995/A-Comprehensive-Note-on-Machine-Learning/068268fc12c612b2922acc181608104e570cf8f5/imgs/notation/notation_1.png -------------------------------------------------------------------------------- /imgs/notation/notation_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COD1995/A-Comprehensive-Note-on-Machine-Learning/068268fc12c612b2922acc181608104e570cf8f5/imgs/notation/notation_2.png -------------------------------------------------------------------------------- /imgs/notation/notation_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COD1995/A-Comprehensive-Note-on-Machine-Learning/068268fc12c612b2922acc181608104e570cf8f5/imgs/notation/notation_3.png -------------------------------------------------------------------------------- /imgs/notation/notation_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COD1995/A-Comprehensive-Note-on-Machine-Learning/068268fc12c612b2922acc181608104e570cf8f5/imgs/notation/notation_4.png -------------------------------------------------------------------------------- /machine_learning.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/COD1995/A-Comprehensive-Note-on-Machine-Learning/068268fc12c612b2922acc181608104e570cf8f5/machine_learning.pdf -------------------------------------------------------------------------------- /machine_learning.tex: -------------------------------------------------------------------------------- 1 | \documentclass[12pt]{book} 2 | \usepackage{mathpazo} 3 | \usepackage{geometry} 4 | \usepackage{titlesec} 5 | \usepackage{amsmath} 6 | \usepackage{amsfonts} 7 | \usepackage{graphicx,subcaption} 8 | \usepackage{float} 9 | \usepackage{tikz,lipsum,lmodern} 10 | \usepackage[most]{tcolorbox} 11 | \usepackage[colorlinks=true,linkcolor=blue, citecolor=blue]{hyperref} 12 | \usepackage[authoryear,round]{natbib} 13 | 14 | % Adjust spacing for chapters 15 | \titlespacing*{\chapter}{0pt}{-50pt}{20pt} 16 | 17 | % Adjust spacing for sections 18 | \titlespacing*{\section}{0pt}{20pt}{10pt} 19 | 20 | 21 | % Remove paragraph indentation and add space between paragraphs 22 | \setlength{\parindent}{0pt} 23 | \setlength{\parskip}{\baselineskip} 24 | 25 | \geometry{left=1.25in, right=1in, top=1in, bottom=1in, footskip=0.5in} 26 | 27 | \titleformat{\chapter}[display] 28 | {\normalfont\Large\bfseries}{\chaptertitlename\ \thechapter}{20pt}{\Large} 29 | \titleformat{\section} 30 | {\normalfont\large\bfseries}{\thesection}{1em}{} 31 | 32 | \title{A Comprehensive Note on Machine Learning\footnote{This content is intended only for \textbf{educational} and \textbf{personal} use; \textbf{do not share} without my approval.}} 33 | \author{Jue Guo} 34 | \date{\today} 35 | 36 | \begin{document} 37 | \frontmatter 38 | \maketitle 39 | \tableofcontents 40 | \chapter*{Preface} 41 | 42 | The primary purpose of this document is educational, specifically for the courses I teach (\textbf{CSE 474/574 Introduction to Machine Learning, CSE 455/555 Pattern Recognition, and CSE 676 Deep Learning}), as well as for my personal reference. 
A substantial portion of the material herein is directly referenced or adapted from established texts and sources, and is not claimed as original content. This document is intended as a supplementary teaching and learning resource and is not authorized for commercial use, redistribution, or sale without my explicit consent. 43 | 44 | The majority of the material referenced comes from the following sources: 45 | 46 | { 47 | \renewcommand{\labelenumi}{[\theenumi]} 48 | \setcounter{enumi}{0} % Start enumeration from 1 49 | 50 | \begin{enumerate} 51 | \item Zhang, Aston, et al. Dive into deep learning. Cambridge University Press, 2023. 52 | \item Bishop, C. M., \& Nasrabadi, N. M. (2006). Pattern recognition and machine learning (Vol. 4, No. 4, p. 738). New York: Springer. 53 | \item Hart, P. E., Stork, D. G., \& Duda, R. O. (2000). Pattern classification. Hoboken: Wiley. 54 | \item Burkov, A. (2019). The hundred-page machine learning book (Vol. 1, p. 32). Quebec City, QC, Canada: Andriy Burkov. 55 | \item Burkov, A. (2020). Machine learning engineering (Vol. 1). Montreal, QC, Canada: True Positive Incorporated. \footnote{This is a birthday present from my lab member and good friend Peiyao Xiao} 56 | \end{enumerate} 57 | } 58 | 59 | All other referenced materials and sources are cited in the bibliography section of this document. This compilation is intended to provide a comprehensive overview and guide for students and practitioners of machine learning, drawing upon a wide range of foundational and contemporary sources in the field. 
60 | 61 | \mainmatter 62 | \part{Basic Machine Learning} 63 | \include{Basic_Machine_Learning/introduction} 64 | \include{Basic_Machine_Learning/notation} 65 | \include{Basic_Machine_Learning/fundamental_algo} 66 | 67 | \part{Advanced Machine Learning} 68 | \part{Neural Networks} 69 | \include{Neural_Networks/kowledge_distillation} 70 | \part{Convolutional Neural Networks} 71 | 72 | \part{Adversarial Attacks and Training} 73 | \include{Adversarial_Attack_and_Training/intriguing_properties_of_NN} 74 | 75 | \part{Recurrent Neural Networks} 76 | 77 | \part{Transformers} 78 | 79 | \part{Artificial General Intelligence} 80 | \include{Artificial_General_Intelligence/continual_learning} 81 | \include{Artificial_General_Intelligence/ER_AML} 82 | 83 | \backmatter 84 | \bibliographystyle{plainnat} 85 | \bibliography{references} 86 | 87 | \end{document} 88 | -------------------------------------------------------------------------------- /references.bib: -------------------------------------------------------------------------------- 1 | @article{wang2023comprehensive, 2 | title = {A comprehensive survey of continual learning: Theory, method and application}, 3 | author = {Wang, Liyuan and Zhang, Xingxing and Su, Hang and Zhu, Jun}, 4 | journal = {arXiv preprint arXiv:2302.00487}, 5 | year = {2023} 6 | } 7 | @article{szegedy2013intriguing, 8 | title = {Intriguing properties of neural networks}, 9 | author = {Szegedy, Christian and Zaremba, Wojciech and Sutskever, Ilya and Bruna, Joan and Erhan, Dumitru and Goodfellow, Ian and Fergus, Rob}, 10 | journal = {arXiv preprint arXiv:1312.6199}, 11 | year = {2013} 12 | } 13 | 14 | @misc{hinton2015distilling, 15 | title = {Distilling the Knowledge in a Neural Network}, 16 | author = {Geoffrey Hinton and Oriol Vinyals and Jeff Dean}, 17 | year = {2015}, 18 | eprint = {1503.02531}, 19 | archiveprefix = {arXiv}, 20 | primaryclass = {stat.ML} 21 | } --------------------------------------------------------------------------------