├── .gitignore ├── sample.pdf ├── chapter1.pdf ├── chapter12.pdf ├── chapter2.pdf ├── chapter3.pdf ├── chapter4.pdf ├── chapter5.pdf ├── chapter6.pdf ├── chapter7.pdf ├── chapter8.pdf ├── Images ├── nn.jpeg ├── xor.png ├── knobs.png ├── svm-slack.png ├── grad_desc1.png ├── grad_desc2.png ├── bias-variance.jpg ├── convex-cost-function.png └── nonconvex-cost-function.png ├── background.pdf ├── README.md ├── chapter8.tex ├── background.tex ├── chapter4.tex ├── chapter6.tex ├── chapter5.tex ├── chapter1.tex ├── chapter7.tex ├── chapter12.tex ├── sample.tex ├── chapter2.tex └── chapter3.tex /.gitignore: -------------------------------------------------------------------------------- 1 | Code 2 | equations.tex 3 | -------------------------------------------------------------------------------- /sample.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/PyMLSlides/HEAD/sample.pdf -------------------------------------------------------------------------------- /chapter1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/PyMLSlides/HEAD/chapter1.pdf -------------------------------------------------------------------------------- /chapter12.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/PyMLSlides/HEAD/chapter12.pdf -------------------------------------------------------------------------------- /chapter2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/PyMLSlides/HEAD/chapter2.pdf -------------------------------------------------------------------------------- /chapter3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/PyMLSlides/HEAD/chapter3.pdf -------------------------------------------------------------------------------- /chapter4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/PyMLSlides/HEAD/chapter4.pdf -------------------------------------------------------------------------------- /chapter5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/PyMLSlides/HEAD/chapter5.pdf -------------------------------------------------------------------------------- /chapter6.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/PyMLSlides/HEAD/chapter6.pdf -------------------------------------------------------------------------------- /chapter7.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/PyMLSlides/HEAD/chapter7.pdf -------------------------------------------------------------------------------- /chapter8.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/PyMLSlides/HEAD/chapter8.pdf -------------------------------------------------------------------------------- /Images/nn.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/PyMLSlides/HEAD/Images/nn.jpeg -------------------------------------------------------------------------------- /Images/xor.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/PyMLSlides/HEAD/Images/xor.png -------------------------------------------------------------------------------- /background.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/PyMLSlides/HEAD/background.pdf -------------------------------------------------------------------------------- /Images/knobs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/PyMLSlides/HEAD/Images/knobs.png -------------------------------------------------------------------------------- /Images/svm-slack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/PyMLSlides/HEAD/Images/svm-slack.png -------------------------------------------------------------------------------- /Images/grad_desc1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/PyMLSlides/HEAD/Images/grad_desc1.png -------------------------------------------------------------------------------- /Images/grad_desc2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/PyMLSlides/HEAD/Images/grad_desc2.png -------------------------------------------------------------------------------- /Images/bias-variance.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/PyMLSlides/HEAD/Images/bias-variance.jpg -------------------------------------------------------------------------------- /Images/convex-cost-function.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/PyMLSlides/HEAD/Images/convex-cost-function.png -------------------------------------------------------------------------------- /Images/nonconvex-cost-function.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/PyMLSlides/HEAD/Images/nonconvex-cost-function.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Machine learning slides 2 | 3 | I created these slides for my machine learning course that I teach at [Loyola University Chicago](http://www.luc.edu/cs/). These slides are *mostly* based on Sebastian Raschka's [Python Machine Learning](https://www.amazon.com/Python-Machine-Learning-Sebastian-Raschka/dp/1783555130/ref=sr_1_1?ie=UTF8&qid=1496671816&sr=8-1&keywords=python+machine+learning) book. Please see Sebastian's github [repository](https://github.com/rasbt/python-machine-learning-book) for additional course materials such as the full set of equations used in the text and the code samples. 4 | 5 | **NOTE:** Be sure to download the [Code directory](https://github.com/rasbt/python-machine-learning-book/tree/master/code) into the same place where you cloned this repository. Some slides will not compile without it! 6 | 7 | ## Acknowledgements 8 | 9 | I am grateful to Sebastian for providing the latex source for the equations used in the book. 
10 | -------------------------------------------------------------------------------- /chapter8.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{latexsym} 3 | \usepackage{graphicx} 4 | \usetheme{Warsaw} 5 | 6 | \title{Chapter 5} 7 | \subtitle{Working with Text} 8 | 9 | \begin{document} 10 | \maketitle 11 | 12 | \begin{frame} 13 | \frametitle{} 14 | \begin{itemize} 15 | \item Natural Language Processing (NLP) 16 | \item Sentiment analysis (aka opinion mining) 17 | \item Document polarity (e.g. positive vs. negative) 18 | \item IMDB dataset 19 | \begin{itemize} 20 | \item 50,000 movie reviews labeled as positive/negative 21 | \item Positive: more than six stars on IMDB 22 | \item Negative: fewer than five stars on IMDB 23 | \end{itemize} 24 | \item Predict automatically whether the reviewer liked the movie 25 | \end{itemize} 26 | \end{frame} 27 | 28 | \begin{frame} 29 | \frametitle{Bag-of-words models} 30 | \begin{itemize} 31 | \item Idea: represent text as numerical feature vectors 32 | \item Create a vocabulary (alphabet) of unique tokens (e.g. words) 33 | \item Assign an integer index to each token 34 | \item Construct a sparse feature vector 35 | \item \href{https://github.com/rasbt/python-machine-learning-book/blob/master/code/ch08/ch08.ipynb}{CountVectorizer example} 36 | \end{itemize} 37 | \end{frame} 38 | 39 | \begin{frame} 40 | \frametitle{N-grams} 41 | \begin{itemize} 42 | \item Unigrams and bigrams 43 | \item The sun is shining 44 | \item Unigrams: the, sun, is, shining 45 | \item Bigrams: the sun, sun is, is shining 46 | \item CountVectorizer can extract any n-grams 47 | \item Tf-idf sometimes works better than row counts 48 | \end{itemize} 49 | \end{frame} 50 | 51 | \begin{frame} 52 | \frametitle{NLP bag-of-tricks} 53 | \begin{itemize} 54 | \item Data cleaning to remove noisy tokens (e.g. HTML tags) 55 | \item Stemming (e.g. running - run) 56 | \item Lemmatization (e.g. went - to go) 57 | \item Stop-word removal 58 | \item Open-source libraries, e.g. NLTK and OpenNLP 59 | \item Details in text (should be useful for projects) 60 | \item \href{https://github.com/rasbt/python-machine-learning-book/blob/master/code/ch08/ch08.ipynb}{90\% accurate logistic regression example} 61 | \item Out-of-core learning possible in scikit-learn 62 | \end{itemize} 63 | \end{frame} 64 | 65 | \begin{frame} 66 | \frametitle{} 67 | \begin{itemize} 68 | \item 69 | \end{itemize} 70 | \end{frame} 71 | 72 | \end{document} 73 | -------------------------------------------------------------------------------- /background.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{latexsym} 3 | \usepackage{graphicx} 4 | \usetheme{Warsaw} 5 | 6 | \title{ML Background} 7 | \subtitle{Maximum likelihood etc.} 8 | 9 | \begin{document} 10 | \maketitle 11 | 12 | \begin{frame} 13 | \frametitle{Coin Tossing} 14 | \begin{itemize} 15 | \item Given a coin, find out $P(heads)$ 16 | \item I.e. the probability that if you flip it, it lands as `heads' \pause 17 | \item Flip it a few times: $H$ $H$ $T$ 18 | \item $P(heads)=2/3$, no need for Comp 379 19 | \item Hmm... is this rigorous? 20 | \end{itemize} 21 | \end{frame} 22 | 23 | \begin{frame} 24 | \frametitle{Bernoulli distribution} 25 | \begin{itemize} 26 | \item Single binary random variable $x\in\{0,1\}$ 27 | \item E.g. 
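% Illustrative sketch (not from the original slide deck): the Bernoulli model and the
% maximum-likelihood estimate derived later in this file can be checked numerically
% with a few lines of Python (NumPy assumed; the names and numbers are placeholders).
%
%   import numpy as np
%
%   rng = np.random.default_rng(0)
%   mu_true = 0.7                                  # unknown P(heads) to recover
%   flips = rng.binomial(1, mu_true, size=1000)    # 1 = heads, 0 = tails
%
%   mu_ml = flips.sum() / len(flips)               # MLE: H / (H + T)
%
%   def log_likelihood(mu, x):                     # Bernoulli log-likelihood
%       return np.sum(x * np.log(mu) + (1 - x) * np.log(1 - mu))
%
%   # the MLE scores at least as high as any other candidate, e.g. mu = 0.5
%   print(mu_ml, log_likelihood(mu_ml, flips) >= log_likelihood(0.5, flips))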
$x=1$ represents `heads' and $x=0$ represents `tails' 28 | \item Probability of $x=1$ denoted by the parameter $\mu$ 29 | \item So, $p(x=1|\mu) = \mu$ and $p(x=0|\mu) = 1 - \mu$ 30 | \item The probability distribution over $x$ can be written 31 | \end{itemize} 32 | \centering 33 | $Bern(x|\mu) = \mu^x(1-\mu)^{1-x}$ 34 | \end{frame} 35 | 36 | \begin{frame} 37 | \frametitle{Coin tossing model} 38 | \begin{itemize} 39 | \item Assume coin flips are independent and identically distributed 40 | \item All are separate samples from the Bernoulli distribution (i.i.d.) 41 | \item Given data $\mathcal{D} = \{x_1,\ldots,x_N\}$ 42 | \item Where heads: $x_i=1$ and tails: $x_i=0$ 43 | \item The \textbf{likelihood} of the data is: \[p(\mathcal{D}|\mu) = \prod_{n=1}^{N} p(x_n|\mu) = \prod_{n=1}^{N} \mu^{x_n} (1-\mu)^{1-x_n} \] 44 | \end{itemize} 45 | \end{frame} 46 | 47 | \begin{frame} 48 | \frametitle{Maximum Likelihood Estimation} 49 | \begin{itemize} 50 | \item Given $\mathcal{D}$ with $H$ heads and $T$ tails 51 | \item What should $\mu$ be? 52 | \item Maximum Likelihood Estimation (MLE) 53 | \item Choose $\mu$ which maximizes the likelihood of the data 54 | \[ \mu_{ML} = \arg \max_{\mu} p(\mathcal{D}|\mu) \] 55 | \item Since $\ln(\cdot)$ is monotonically increasing: 56 | \[ \mu_{ML} = \arg \max_{\mu} \ln p(\mathcal{D}|\mu) \] 57 | \end{itemize} 58 | \tiny 59 | \textbf{NOTE:} A monotonically increasing function is one that increases as $x$ does for all real $x$ 60 | \end{frame} 61 | 62 | \begin{frame} 63 | \frametitle{Maximum Likelihood Estimation} 64 | \begin{itemize} 65 | \item Likelihood 66 | \[ p(\mathcal{D}|\mu) = \prod_{n=1}^{N} \mu^{x_n} (1-\mu)^{1-x_n} \] 67 | \item Log-likelihood 68 | \[ \ln p(\mathcal{D}|\mu) = \sum_{n=1}^{N} x_n \ln \mu + (1-x_n) \ln (1-\mu) \] 69 | \item Take the derivative and set to 0 \pause 70 | \[ \frac{d}{d \mu } \ln p(\mathcal{D}|\mu) = \sum_{n=1}^{N} x_n \frac{1}{\mu} - (1-x_n) \frac{1}{1-\mu} = \frac{1}{\mu} H - \frac{1}{1-\mu} T \] 71 | \[ \mu = \frac{H}{T + H} \] 72 | \end{itemize} 73 | \end{frame} 74 | 75 | \begin{frame} 76 | \textbf{Acknowledgements:} Slides based on the latex source provided by Oliver Schulte and Greg Mori (Simon Fraser University) 77 | \end{frame} 78 | 79 | \end{document} 80 | -------------------------------------------------------------------------------- /chapter4.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{latexsym} 3 | \usepackage{graphicx} 4 | \usetheme{Warsaw} 5 | 6 | \title{Chapter 4} 7 | \subtitle{Data Preprocessing: Practical Issues} 8 | 9 | \begin{document} 10 | \maketitle 11 | 12 | \begin{frame} 13 | \frametitle{Splitting data into train and test} 14 | \begin{itemize} 15 | \item Download wine dataset 16 | \begin{itemize} 17 | \item Three classes which map to different types of grapes in Italy 18 | \end{itemize} 19 | \item Cannot train and test on the same data 20 | \item So allocate some portion for testing and use the rest for training 21 | \begin{itemize} 22 | \item 70-30 or 80-20 split 23 | \end{itemize} 24 | \item Splitting three ways is a better idea to allocate some dev data 25 | \item N-fold cross-validation 26 | \item Scikit-learn helper methods (e.g. 
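% Illustrative sketch (not part of the original slides): the 70-30 split described
% above with scikit-learn; the UCI URL for the Wine data is an assumption here.
%
%   import pandas as pd
%   from sklearn.model_selection import train_test_split
%
%   df = pd.read_csv('https://archive.ics.uci.edu/ml/'
%                    'machine-learning-databases/wine/wine.data', header=None)
%   X, y = df.iloc[:, 1:].values, df.iloc[:, 0].values   # column 0 = class label
%
%   # 70-30 split; stratify=y keeps the class proportions similar in both parts
%   X_train, X_test, y_train, y_test = train_test_split(
%       X, y, test_size=0.3, stratify=y, random_state=0)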
train\_test\_split()) 27 | \item \href{https://github.com/rasbt/python-machine-learning-book/blob/master/code/ch04/ch04.ipynb}{Chapter 4 iPython notebook} 28 | \end{itemize} 29 | \end{frame} 30 | 31 | \begin{frame} 32 | \frametitle{Wine dataset} 33 | \includegraphics[width=\textwidth]{Code/ch04/images/04_10.png} 34 | \end{frame} 35 | 36 | \begin{frame} 37 | \frametitle{Feature scaling} 38 | \begin{itemize} 39 | \item Imagine we have two features 40 | \begin{itemize} 41 | \item $1 < x_1 < 10$ 42 | \item $1 < x_2 < 100000$ 43 | \end{itemize} 44 | \item Algorithm will likely focus on optimizing $w_2$ 45 | \item As this will produce the largest changes in perceptron error 46 | \item KNN based on Euclidean distance will be dominated by $x_2$ 47 | \item Two common approaches 48 | \begin{itemize} 49 | \item Normalization 50 | \item Standardization 51 | \end{itemize} 52 | \end{itemize} 53 | \end{frame} 54 | 55 | \begin{frame} 56 | \frametitle{Normalization} 57 | \textit{Normalization} refers to the rescaling of the features to a range of [0, 1]. To normalize the data, we apply the min-max scaling to each feature column, where the new value $x_{norm}^{(i)}$ of a sample $x^{(i)}$ is calculated as follows: 58 | \[ 59 | x_{norm}^{(i)} = \frac{x^{(i)} - \mathbf{x}_{min}}{\mathbf{x}_{max} - \mathbf{x}_{min}} 60 | \] 61 | Here, $x^{(i)}$ is a particular sample, $x_{min}$ is the smallest value in a feature column, and $x_{max}$ the largest value, respectively. 62 | \end{frame} 63 | 64 | \begin{frame} 65 | \frametitle{Standardization} 66 | \begin{itemize} 67 | \item Normalization gives us values in a bounded interval 68 | \item Standardization can be more practical: 69 | \item Many ML algorithms initialize the weights to zero 70 | \item Standardization centers the columns at $mean=0$ and $std=1$ 71 | \item So feature columns take the form of a normal distribution 72 | \item This makes it easier to learn the weights 73 | \item Standardization encodes useful info about outliers 74 | \item Vs. normalization which scales the data to a fixed range 75 | \end{itemize} 76 | \end{frame} 77 | 78 | \begin{frame} 79 | \frametitle{} 80 | The procedure of standardization can be expressed by the following equation: 81 | \[ 82 | x_{std}^{(i)} = \frac{x^{(i)} - \mu_{x}}{\sigma_{x}} 83 | \] 84 | Here, $\mu_{x}$ is the sample mean of a particular feature column and $\sigma_{x}$ the corresponding standard deviation, respectively.
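% Illustrative sketch (not part of the original slides): both rescaling schemes are
% available in scikit-learn; X_train / X_test stand for the arrays from the split above.
%
%   from sklearn.preprocessing import MinMaxScaler, StandardScaler
%
%   mms = MinMaxScaler()                        # min-max "normalization" to [0, 1]
%   X_train_norm = mms.fit_transform(X_train)
%   X_test_norm = mms.transform(X_test)         # reuse the training-set min/max
%
%   stdsc = StandardScaler()                    # standardization: mean 0, std 1
%   X_train_std = stdsc.fit_transform(X_train)
%   X_test_std = stdsc.transform(X_test)        # reuse the training-set mean/std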
85 | \begin{itemize} 86 | \item \href{https://github.com/rasbt/python-machine-learning-book/blob/master/code/ch04/ch04.ipynb}{Example of using normalization and standardization} 87 | \end{itemize} 88 | \end{frame} 89 | 90 | \begin{frame} 91 | \frametitle{L1 regularization} 92 | Recall L2 regularization -- one approach to reduce model complexity 93 | \[ 94 | L2: \lVert \mathbf{w} \rVert^{2}_{2} = \sum_{j=1}^{m} w^{2}_{j} 95 | \] 96 | An alternative approach is \textit{L1 regularization}: 97 | \[ 98 | L1: \lVert \mathbf{w} \rVert_{1} = \sum_{j=1}^{m} |w_j| 99 | \] 100 | \end{frame} 101 | 102 | \begin{frame} 103 | \frametitle{L1 regularization} 104 | \begin{itemize} 105 | \item L1 yields sparse solutions 106 | \item Most feature weights will be zero 107 | \item Useful for high-dimensional datasets with irrelevant features 108 | \item It can be viewed as a technique for feature selection 109 | \item Some intuition as to why this is the case will follow 110 | \end{itemize} 111 | \end{frame} 112 | 113 | \begin{frame} 114 | \frametitle{L2 regularization} 115 | \includegraphics[width=\textwidth]{Code/ch04/images/04_12.png} 116 | \end{frame} 117 | 118 | \begin{frame} 119 | \frametitle{L1 regularization} 120 | \includegraphics[width=\textwidth]{Code/ch04/images/04_13.png} 121 | \end{frame} 122 | 123 | \begin{frame} 124 | \frametitle{Sparcity} 125 | \begin{itemize} 126 | \item Regularization penalty and cost pull in opposite directions 127 | \item Regularization wants the weight to be at (0, 0) 128 | \item I.e. regularization prefers a simpler model 129 | \item And decreases the dependence of the model on the training data 130 | \item \href{https://github.com/rasbt/python-machine-learning-book/tree/master/code/ch04}{L1 in scikit-learn} 131 | \end{itemize} 132 | \end{frame} 133 | 134 | \begin{frame} 135 | \frametitle{} 136 | \begin{itemize} 137 | \item 138 | \end{itemize} 139 | \end{frame} 140 | 141 | \end{document} 142 | -------------------------------------------------------------------------------- /chapter6.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{latexsym} 3 | \usepackage{graphicx} 4 | \usetheme{Warsaw} 5 | 6 | \title{Chapter 6} 7 | \subtitle{Model Evaluation and Hyperparameter Tuning} 8 | 9 | \begin{document} 10 | \maketitle 11 | 12 | \begin{frame} 13 | \frametitle{How do we know the model is working?} 14 | \begin{itemize} 15 | \item Model evaluation 16 | \item How do we obtain an unbiased estimate of model's performance? 17 | \item Key concept: estimate model performance on \textbf{unseen} data 18 | \item Hyperparameters vs. model parameters (e.g. 
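% Illustrative sketch (not part of the original slides): the hyperparameter vs.
% model-parameter distinction, together with the sparsity effect of the L1 penalty
% from chapter4.tex; assumes the standardized Wine arrays from chapter 4 exist
% under these names.
%
%   from sklearn.linear_model import LogisticRegression
%
%   # penalty and C (inverse regularization strength) are hyperparameters:
%   # we choose them before training, they are not learned from the data
%   lr = LogisticRegression(penalty='l1', C=0.1, solver='liblinear')
%   lr.fit(X_train_std, y_train)
%
%   # the weights are model parameters, learned by fit(); with an L1 penalty
%   # many of them end up exactly zero (a sparse solution)
%   print(lr.coef_)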
weights) 19 | \item Model fine-tuning 20 | \item Performance metrics 21 | \end{itemize} 22 | \end{frame} 23 | 24 | \begin{frame} 25 | \frametitle{The holdout method} 26 | \begin{itemize} 27 | \item Split data into training and test datasets 28 | \item However, typically we cannot test immediately after training 29 | \item Need to tune the model to further improve the performance 30 | \item Select optimial values of hyperparameters 31 | \item This step is known as \textit{model selection} 32 | \item A better approach: training set + validation set + test set 33 | \item Validation set is used for model selection 34 | \end{itemize} 35 | \end{frame} 36 | 37 | \begin{frame} 38 | \frametitle{The holdout method} 39 | \includegraphics[width=\textwidth]{Code/ch06/images/06_02.png} 40 | \end{frame} 41 | 42 | \begin{frame} 43 | \frametitle{K-fold cross-validation} 44 | \begin{itemize} 45 | \item Disadvantage of the holdout method: sensitive to partitioning 46 | \item Randomly split the training dataset into $k$ folds 47 | \item Of these, $k-1$ folds are used for training and one for testing 48 | \item Repeat this procedure $k$ times and average across $k$ folds 49 | \item Each sample will be part of train and test sets 50 | \item Lower-variance estimate of the model performance (than holdout) 51 | \end{itemize} 52 | \end{frame} 53 | 54 | \begin{frame} 55 | \frametitle{K-fold cross-validation} 56 | \includegraphics[width=\textwidth]{Code/ch06/images/06_03.png} 57 | \end{frame} 58 | 59 | \begin{frame} 60 | \frametitle{How do we pick the number of folds?} 61 | \begin{itemize} 62 | \item The standard value is $k=10$ 63 | \item For small datasets, increase the number of folds 64 | \item Which will increase the amount of training data 65 | \item For larger datasets, we can decrease the number of folds 66 | \item E.g. $k=5$ is a reasonable choice 67 | \end{itemize} 68 | \end{frame} 69 | 70 | \begin{frame} 71 | \frametitle{Variations on the theme of k-fold cross-validation} 72 | \begin{itemize} 73 | \item \textbf{Leave-one-out cross-validation} 74 | \begin{itemize} 75 | \item Set the number of folds equal to the number of training samples 76 | \item Only a single training sample used for testing during each iteration 77 | \item Recommended approach for very small datasets 78 | \end{itemize} 79 | \item \textbf{Stratified k-fold cross-validation} 80 | \begin{itemize} 81 | \item Class proportions preserved in each fold 82 | \item I.e. each fold is representative of the training set 83 | \item Better performance estimates for imbalanced data 84 | \end{itemize} 85 | \end{itemize} 86 | \end{frame} 87 | 88 | \begin{frame} 89 | \frametitle{Grid search} 90 | \begin{itemize} 91 | \item Many ML algorithms offer a number of hyperparameters 92 | \item \href{https://www.csie.ntu.edu.tw/~cjlin/libsvm/}{Link to libsvm command-line arguments} 93 | \item Find the optimal combination of hyperparameter values 94 | \item Brute-force exhaustive search of hyperparameter space 95 | \item Obviously, this can be computationally very expensive 96 | \end{itemize} 97 | \end{frame} 98 | 99 | \begin{frame} 100 | \frametitle{Performance evaluation metrics} 101 | \begin{itemize} 102 | \item We've been using accuracy 103 | \item Accuracy can be misleading for imbalanced datasets 104 | \item Need ways to compute the performance for a specific class 105 | \item Confusion matrix helps visualize different types of errors a classifier can make by reporting the counts of these errors 106 | \item I.e. 
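% Illustrative sketch (not part of the original slides): stratified k-fold scoring
% and a brute-force grid search with scikit-learn; the SVC and its parameter grid
% are arbitrary examples.
%
%   from sklearn.model_selection import cross_val_score, GridSearchCV
%   from sklearn.svm import SVC
%
%   # with a classifier and an integer cv, cross_val_score uses stratified k-fold
%   scores = cross_val_score(SVC(), X_train_std, y_train, cv=10)
%   print(scores.mean(), scores.std())
%
%   # exhaustive search over a small hyperparameter grid
%   gs = GridSearchCV(SVC(),
%                     param_grid={'C': [0.1, 1.0, 10.0],
%                                 'gamma': [0.01, 0.1, 1.0]},
%                     cv=10)
%   gs.fit(X_train_std, y_train)
%   print(gs.best_params_, gs.best_score_)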
true positive (TP), true negative (TN), false positive (FP), false negagive (FN) predictions 107 | \end{itemize} 108 | \end{frame} 109 | 110 | \begin{frame} 111 | \frametitle{Confusion matrix} 112 | \includegraphics[scale=0.5]{Code/ch06/images/06_08.png} 113 | \end{frame} 114 | 115 | \begin{frame} 116 | \frametitle{Deducing performance metrics from a confusion matrix} 117 | The error can be understood as the sum of all false predictions divided by the number of total predictions, and the accuracy is calculated as the sum of correct predictions divided by the total number of predictions, respectively: 118 | \[ 119 | Error = \frac{FP + FN}{FP + FN + TP + TN} 120 | \] 121 | The prediction accuracy can then be calculated directly from the error: 122 | \[ 123 | Accuracy = \frac{TP + TN}{FP + FN + TP + TN} = 1 - Error 124 | \] 125 | \end{frame} 126 | 127 | \begin{frame} 128 | \frametitle{Precision, Recall, F1} 129 | \textbf{Precision:} 130 | \[ 131 | P = \frac{TP}{TP + FP} 132 | \] 133 | \textbf{Recall:} 134 | \[ 135 | R = TPR = \frac{TP}{P} = \frac{TP}{FN + TP} 136 | \] 137 | \textbf{F1 score:} 138 | \[ 139 | \text{F1} = 2 \times \frac{P \times R}{P + R} 140 | \] 141 | \end{frame} 142 | 143 | \begin{frame} 144 | \frametitle{} 145 | \begin{itemize} 146 | \item 147 | \end{itemize} 148 | \end{frame} 149 | 150 | \end{document} 151 | -------------------------------------------------------------------------------- /chapter5.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{latexsym} 3 | \usepackage{graphicx} 4 | \usetheme{Warsaw} 5 | 6 | \title{Chapter 5} 7 | \subtitle{Dimensionality Reduction} 8 | 9 | \begin{document} 10 | \maketitle 11 | 12 | \begin{frame} 13 | \frametitle{Principal Component Analysis (PCA)} 14 | \begin{columns}[c] 15 | \column{0.5\textwidth} 16 | \begin{itemize} 17 | \item Find the directions of maximum variance 18 | \item Project data onto the lower-dimensional space 19 | \item Original features: $x_1$ and $x_2$ 20 | \item Principal components: \textbf{PC1} and \textbf{PC2} 21 | \end{itemize} 22 | \column{0.5\textwidth} 23 | \includegraphics[width=\textwidth]{Code/ch05/images/05_01.png} 24 | \end{columns} 25 | \end{frame} 26 | 27 | \begin{frame} 28 | \frametitle{Mapping to a low-dimensional space} 29 | When we use PCA for dimensionality reduction, we construct a $d \times k$ transformation matrix $\mathbf{W}$. 30 | We then map a sample vector $\mathbf{x}$ onto a new $k$-dimensional feature subspace ($k << d$) 31 | \[ 32 | \mathbf{x} = [ x_1, x_2, \dots, x_j], \mathbf{x} \in \mathbb{R}^d 33 | \] 34 | \[ 35 | \downarrow \mathbf{x W}, \quad \mathbf{W} \in \mathbb{R}^{d \times k} 36 | \] 37 | \[ 38 | \mathbf{z} = [z_1, z_2, \dots, z_k], \quad \mathbf{z} \in \mathbb{R}^k 39 | \] 40 | \end{frame} 41 | 42 | \begin{frame} 43 | \frametitle{Principal components} 44 | \begin{itemize} 45 | \item Transforming $d$-dimensional data to $k$ dimensions 46 | \item First principal component will have the largest variance 47 | \item Second principal component will have next largest variance 48 | \item And so on... 49 | \item PCA sensitive to data scaling, so need to standardize features 50 | \end{itemize} 51 | \end{frame} 52 | 53 | \begin{frame} 54 | \frametitle{Algorithm} 55 | \begin{enumerate} 56 | \item Standardize the $d$-dimensional dataset. 57 | \item Construct the covariance matrix. 58 | \item Decompose the covariance matrix into its eigenvectors and eigenvalues. 
59 | \item Select $k$ eigenvectors that correspond to the $k$ largest eigenvalues, where $k$ is the dimensionality of the new feature subspace $(k \le d)$. 60 | \item Construct a projection matrix $\mathbf{W}$ from the "top" $k$ eigenvectors. 61 | \item Transform the $d$-dimensional input dataset $\mathbf{X}$ using the projection matrix $\mathbf{W}$ to obtain the new $k$-dimensional feature subspace. 62 | \end{enumerate} 63 | \end{frame} 64 | 65 | \begin{frame} 66 | \frametitle{Variance-covariance matrix} 67 | \begin{itemize} 68 | \item Symmetric $d \times d$ -dimensional matrix ($d$ - number of dimensions) 69 | \item Pairwise covariances between the different features 70 | \item Covariance between two features $\mathbf{x}_j$ and $\mathbf{x}_k$: 71 | \[ 72 | \sigma_{jk} = \frac{1}{n} \sum_{i=1}^{n} \big( x_{j}^{(i)} - \mu_j \big) \big( x_{k}^{(i)} - \mu_k \big) 73 | \] 74 | Where $\mu_j$ and $\mu_k$ are the sample means of feature $j$ and $k$ 75 | \end{itemize} 76 | 77 | \end{frame} 78 | 79 | \begin{frame} 80 | \frametitle{What is covariance?} 81 | \begin{itemize} 82 | \item Measure of how much two random variables change together 83 | \item Positive covariance 84 | \begin{itemize} 85 | \item Features increase together 86 | \item Features decrease together 87 | \item E.g. As a balloon is blown up it gets larger in all dimensions 88 | \end{itemize} 89 | \item Negative covariance 90 | \begin{itemize} 91 | \item Features vary in opposite directions 92 | \item Large values of one variable correspond to small values of the other 93 | \item E.g. if a sealed balloon is squashed in one dimension then it will expand in the other two 94 | \end{itemize} 95 | \item The magnitude of the covariance is not easy to interpret 96 | \item The normalized version of covariance (\textit{correlation coefficient}) indicates the strength of the linear relation. 97 | \end{itemize} 98 | \end{frame} 99 | 100 | \begin{frame} 101 | \frametitle{Variance-covariance matrix} 102 | \begin{itemize} 103 | \item For three features, covariance matrix will look like this: 104 | \[ 105 | \Sigma = \begin{bmatrix} 106 | \sigma_{1}^2 & \sigma_{12} & \sigma_{13} \\ 107 | \sigma_{21} & \sigma_{2}^{2} & \sigma_{23} \\ 108 | \sigma_{31} & \sigma_{32} & \sigma_{3}^{2} 109 | \end{bmatrix} 110 | \] 111 | \item The eigenvectors of $\Sigma$ represent the principle components 112 | \item The corresponding eigenvalues represent their magnitude 113 | \begin{itemize} 114 | \item Principle components: the directions of maximum variance 115 | \end{itemize} 116 | \item E.g. 
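% Illustrative sketch (not part of the original slides): the covariance and
% eigen-decomposition steps above in NumPy; X_train_std stands for the
% standardized Wine features.
%
%   import numpy as np
%
%   cov_mat = np.cov(X_train_std.T)                # d x d covariance matrix
%   eig_vals, eig_vecs = np.linalg.eigh(cov_mat)   # eigh: for symmetric matrices
%
%   # sort the eigenpairs from largest to smallest eigenvalue
%   order = np.argsort(eig_vals)[::-1]
%   eig_vals, eig_vecs = eig_vals[order], eig_vecs[:, order]
%
%   print(eig_vals / eig_vals.sum())               # variance explained ratios
%
%   W = eig_vecs[:, :2]                            # d x 2 projection matrix
%   X_train_pca = X_train_std.dot(W)               # X' = XW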
Wine dataset (13 dimensions) 117 | \begin{itemize} 118 | \item $13x13$ covariance matrix 119 | \item 13 eigenvectors 120 | \item 13 eigenvalues 121 | \end{itemize} 122 | \end{itemize} 123 | \end{frame} 124 | 125 | \begin{frame} 126 | \frametitle{Eigenpairs} 127 | \begin{itemize} 128 | \item An Eigenvector $\mathbf{v}$ satisfies the condition: 129 | \[ 130 | \Sigma \mathbf{v} = \lambda \mathbf{v} 131 | \] 132 | Where $\lambda$ is the eigenvalue (scalar) 133 | \item NumPy has a function to compute eigenpairs 134 | \item We want to reduce the dimensionality 135 | \item So, we select a subset of $k$ most informative eigenvectors 136 | \end{itemize} 137 | \end{frame} 138 | 139 | \begin{frame} 140 | \frametitle{Variance explained ratio} 141 | \begin{itemize} 142 | \item Variance explained ratio of an eigenvalue $\lambda_j$: 143 | \[ 144 | \frac{\lambda_j}{\sum_{j=1}^{d} \lambda_j} 145 | \] 146 | \item First two principal components explain about 60 percent of the variance in the data 147 | \end{itemize} 148 | \includegraphics[scale=0.55]{Code/ch05/images/05_02.png} 149 | \end{frame} 150 | 151 | \begin{frame} 152 | \frametitle{Feature transformation} 153 | \begin{itemize} 154 | \item We decomposed the covariance matrix into eigenpairs 155 | \item Now need to project to new space defined by principle component axes 156 | \item Construct a $13 \times 2$ projection matrix from top two eigenvectors 157 | \item Transform a sample $\mathbf{x}$ onto the PCA subspace obtaining $\mathbf{x}'$ 158 | \item Which is a two-dimensional vector consisting of two new features: 159 | \[ 160 | \mathbf{x}' = \mathbf{xW} 161 | \] 162 | \item Transform entire Wine dataset ($124 \times 13$) 163 | \[ 164 | \mathbf{X}' = \mathbf{XW} 165 | \] 166 | \end{itemize} 167 | \end{frame} 168 | 169 | \begin{frame} 170 | \frametitle{Visualize \textit{Wine} dataset in two dimensions} 171 | \includegraphics[scale=0.55]{Code/ch05/images/05_03.png} 172 | \end{frame} 173 | 174 | \begin{frame} 175 | \frametitle{Visualize \textit{Wine} dataset in two dimensions} 176 | \begin{columns}[c] 177 | \column{0.5\textwidth} 178 | \includegraphics[scale=0.35]{Code/ch05/images/05_03.png} 179 | \column{0.5\textwidth} 180 | \begin{itemize} 181 | \item Can now visualize a 13-dimensional dataset 182 | \item Data more spread along first principal component, which explained 40 percent of the variance 183 | \item A linear classifier should be able to do a good job separating the classes 184 | \item Keep in mind that PCA is an \textit{unsupervised} algorithm 185 | \end{itemize} 186 | \end{columns} 187 | \end{frame} 188 | 189 | \end{document} 190 | -------------------------------------------------------------------------------- /chapter1.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{latexsym} 3 | \usepackage{graphicx} 4 | \usetheme{Warsaw} 5 | 6 | \title{Chapter 1} 7 | \subtitle{ML Introduction} 8 | 9 | \begin{document} 10 | \maketitle 11 | 12 | \begin{frame} 13 | \frametitle{Math/stats background} 14 | \begin{itemize} 15 | \item Partial derivatives, e.g.: \\ 16 | \[ 17 | \frac{\partial (x^3 + y^2 + 1)}{\partial x} 18 | \] 19 | \item Matrix and vector operations \\ 20 | \[ 21 | \begin{bmatrix} 22 | 1 & 2 & 3\\ 23 | 4 & 5 & 6 24 | \end{bmatrix} \times \begin{bmatrix} 25 | 7 \\ 26 | 8 \\ 27 | 9 28 | \end{bmatrix} = \begin{bmatrix} 29 | 1 \times 7 + 2 \times 8 + 3 \times 9 \\ 30 | 4 \times 7 + 5 \times 8 + 6 \times 9 31 | \end{bmatrix} = \begin{bmatrix} 32 | 50 \\ 33 | 122 34 | 
\end{bmatrix} 35 | \] 36 | \item Basic probability and statistics 37 | \begin{itemize} 38 | \item Conditional probability 39 | \item Normal distribution 40 | \end{itemize} 41 | \end{itemize} 42 | \end{frame} 43 | 44 | \begin{frame} 45 | \frametitle{Programming background} 46 | \begin{itemize} 47 | \item Python 48 | \item NumPy 49 | \item Matplotlib 50 | \end{itemize} 51 | \end{frame} 52 | 53 | \begin{frame} 54 | \frametitle{What is ML? Ask Wikipedia} 55 | \begin{quote} 56 | Machine learning is a subfield of computer science (more particularly soft computing) that evolved from the study of pattern recognition and computational learning theory in artificial intelligence. In 1959, Arthur Samuel defined machine learning as a "Field of study that gives computers the ability to learn without being explicitly programmed". Machine learning explores the study and construction of algorithms that can learn from and make predictions on data. 57 | \end{quote} 58 | \href{http://machinelearningmastery.com/what-is-machine-learning/}{More definitions here} 59 | \end{frame} 60 | 61 | \begin{frame} 62 | \frametitle{Applications of ML} 63 | \begin{itemize} 64 | \item Image recognition 65 | \item Spam classification 66 | \item Web search engines 67 | \item Voice recognition 68 | \item \href{https://www.quora.com/What-are-some-real-world-examples-of-applications-of-machine-learning-in-the-field}{Link to Quora} 69 | \end{itemize} 70 | \end{frame} 71 | 72 | \begin{frame} 73 | \frametitle{Three types of ML} 74 | \begin{itemize} 75 | \item Supervised learning 76 | \item Unsupervised learning 77 | \item Reinforcement learning 78 | \end{itemize} 79 | \end{frame} 80 | 81 | \begin{frame} 82 | \frametitle{Yann LeCun explains supervised learning} 83 | \scriptsize 84 | A pattern recognition system is like a black box with a camera at one end, a green light and a red light on top, and a whole bunch of knobs on the front. The learning algorithm tries to adjust the knobs so that when, say, a dog is in front of the camera, the red light turns on, and when a car is put in front of the camera, the green light turns on. You show a dog to the machine. If the red light is bright, don't do anything. If it’s dim, tweak the knobs so that the light gets brighter. If the green light turns on, tweak the knobs so that it gets dimmer. Then show a car, and tweak the knobs so that the red light get dimmer and the green light gets brighter. If you show many examples of the cars and dogs, and you keep adjusting the knobs just a little bit each time, eventually the machine will get the right answer every time. 85 | \center 86 | \includegraphics[scale=0.3]{Images/knobs.png} 87 | \end{frame} 88 | 89 | \begin{frame} 90 | \frametitle{Scaling up} 91 | The interesting thing is that it may also correctly classify cars and dogs it has never seen before. The trick is to figure out in which direction to tweak each knob and by how much without actually fiddling with them. This involves computing a “gradient,” which for each knob indicates how the light changes when the knob is tweaked. 92 | \\~\\ 93 | Now, imagine a box with 500 million knobs, 1,000 light bulbs, and 10 million images to train it with. That’s what a typical Deep Learning system is. 
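% Illustrative sketch (not part of the original slides): a toy, one-knob version of
% the story above in plain NumPy -- a squared-error "light" and a gradient that says
% which way to turn the knob; the data and learning rate are made up.
%
%   import numpy as np
%
%   x = np.array([1.0, 2.0, 3.0])       # inputs
%   y = np.array([2.0, 4.0, 6.0])       # targets (true relation: y = 2x)
%   w = 0.0                             # the single knob
%   eta = 0.05                          # how far to turn it each time
%
%   for _ in range(100):
%       error = y - w * x               # how far off the "light" is
%       grad = -2.0 * np.dot(error, x)  # d(sum of squared errors)/dw
%       w -= eta * grad                 # turn the knob against the gradient
%
%   print(w)                            # approaches 2.0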
94 | \\~\\ 95 | Source: \href{http://spectrum.ieee.org/automaton/robotics/artificial-intelligence/facebook-ai-director-yann-lecun-on-deep-learning}{IEEE Spectrum Interview} 96 | \end{frame} 97 | 98 | \begin{frame} 99 | \frametitle{Supervised learning} 100 | \includegraphics[width=\textwidth]{Code/ch01/images/01_02.png} 101 | \begin{itemize} 102 | \item Predicting the future with supervised learning 103 | \item Classification vs. Regression 104 | \end{itemize} 105 | \end{frame} 106 | 107 | \begin{frame} 108 | \frametitle{Classification} 109 | \begin{columns}[c] 110 | \column{0.5\textwidth} 111 | \begin{itemize} 112 | \item Predict categorical class labels based on past observations 113 | \item Class labels are discrete unordered values 114 | \item Email spam classification example (binary) 115 | \item Handwritten digit classification example (multi-class) 116 | \end{itemize} 117 | \column{0.5\textwidth} 118 | \includegraphics[width=\textwidth]{Code/ch01/images/01_03.png} 119 | \end{columns} 120 | \end{frame} 121 | 122 | \begin{frame} 123 | \frametitle{Regression} 124 | \begin{itemize} 125 | \item Also a kind of supervised learning 126 | \item Prediction of continuous outcomes 127 | \item Predicting semester grades scores for students 128 | \end{itemize} 129 | \center 130 | \includegraphics[scale=0.4]{Code/ch01/images/01_04.png} 131 | \end{frame} 132 | 133 | \begin{frame} 134 | \frametitle{Unsupervised learning} 135 | \begin{itemize} 136 | \item Dealing with \textit{unlabeled} data 137 | \item Cluster analysis 138 | \item Objects within a cluster share a degree of similarity 139 | \end{itemize} 140 | \center 141 | \includegraphics[scale=0.4]{Code/ch01/images/01_06.png} 142 | \end{frame} 143 | 144 | \begin{frame} 145 | \frametitle{Unsupervised learing example} 146 | \begin{itemize} 147 | \item Latent Dirichlet Allocation (LDA) 148 | \item \href{http://www.princeton.edu/~achaney/tmve/wiki100k/browse/topic-presence.html}{Link to Wikipedia topics} 149 | \item \href {https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation}{Wikipedia LDA entry} 150 | \item \href{http://blog.echen.me/2011/06/27/topic-modeling-the-sarah-palin-emails/}{Sara Palin topics} 151 | \end{itemize} 152 | \end{frame} 153 | 154 | \begin{frame} 155 | \frametitle{Iris dataset} 156 | \includegraphics[scale=0.1]{Code/ch01/images/01_08.png} 157 | \end{frame} 158 | 159 | \begin{frame} 160 | \frametitle{Basic terminology} 161 | \begin{itemize} 162 | \item Measurements of 150 iris flowers (150 samples / 4 features) 163 | \item From 3 different species (Setosa, Versicolor, Virginica) 164 | \item Rows are samples and columns are features 165 | \item $150 \times 4$ matrix $\mathbf{X} \in \mathbb{R}^{150 \times 4}:$ 166 | \end{itemize} 167 | 168 | \[ 169 | \begin{bmatrix} 170 | x_{1}^{(1)} & x_{2}^{(1)} & x_{3}^{(1)} & \dots & x_{4}^{(1)} \\ 171 | x_{1}^{(2)} & x_{2}^{(2)} & x_{3}^{(2)} & \dots & x_{4}^{(2)} \\ 172 | \vdots & \vdots & \vdots & \ddots & \vdots \\ 173 | x_{1}^{(150)} & x_{2}^{(150)} & x_{3}^{(150)} & \dots & x_{4}^{(150)} 174 | \end{bmatrix} 175 | \] 176 | \end{frame} 177 | 178 | \begin{frame} 179 | \frametitle{A roadmap for building ML systems} 180 | \includegraphics[width=\textwidth]{Code/ch01/images/01_09.png} 181 | \end{frame} 182 | 183 | \begin{frame} 184 | \frametitle{Model selection} 185 | \begin{itemize} 186 | \item No Free Lunch Theorems 187 | \item Each classification algorithm makes assumptions 188 | \item Often we empirically determine what works best 189 | \item But how do we know what works best? 
190 | \begin{itemize} 191 | \item Classification accuracy 192 | \item Train+Dev+Test split 193 | \item Hyperparameter optimization (knobs of the model) 194 | \end{itemize} 195 | \end{itemize} 196 | \end{frame} 197 | 198 | \begin{frame} 199 | \frametitle{Python for ML} 200 | \begin{itemize} 201 | \item Libraries for scientific computing such as NumPy and SciPy 202 | \item Performance of interpreted languages is inferior 203 | \item But NumPy and SciPy build upon lower level C and Fortran subroutines 204 | \item Scikit-learn library 205 | \item See page 13 for installation instructions (or just google) 206 | \item \href{http://www-ekp.physik.uni-karlsruhe.de/~giffels/GridKa-School-Lectures/NumPy.slides.html}{\beamerbutton{NumPy Slides}} 207 | \end{itemize} 208 | \end{frame} 209 | 210 | \end{document} 211 | -------------------------------------------------------------------------------- /chapter7.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{latexsym} 3 | \usepackage{graphicx} 4 | \usepackage{listings} 5 | 6 | \usetheme{Warsaw} 7 | 8 | \title{Chapter 7} 9 | \subtitle{Ensemble Classifiers} 10 | 11 | \begin{document} 12 | \maketitle 13 | 14 | \begin{frame} 15 | \frametitle{Learning with ensembles} 16 | \begin{itemize} 17 | \item Our goal is to combined multiple classifiers 18 | \item Mixture of experts, e.g. 10 experts 19 | \item Predictions more accurate and robust 20 | \item Provide an intuition why this might work 21 | \item Simplest approach: majority voting 22 | \end{itemize} 23 | \end{frame} 24 | 25 | \begin{frame} 26 | \frametitle{Majority voting} 27 | \begin{itemize} 28 | \item Majority voting refers to binary setting 29 | \item Can easily generalize to multi-class: plurality voting 30 | \item Select class label that receives the most votes (mode) 31 | \end{itemize} 32 | \vspace{0.2in} 33 | \center 34 | \includegraphics[scale=0.4]{Code/ch07/images/07_01.png} 35 | \end{frame} 36 | 37 | \begin{frame} 38 | \frametitle{Combining predictions: options} 39 | \begin{itemize} 40 | \item Train $m$ classifiers $C_1,\dots,C_m$ 41 | \item Build ensemble using different classification algorithms (e.g. SVM, logistic regression, etc.) 42 | \item Use the same algorithm but fit different subsets of the training set (e.g. random forest) 43 | \end{itemize} 44 | \end{frame} 45 | 46 | \begin{frame} 47 | \frametitle{General approach} 48 | \center 49 | \includegraphics[scale=0.35]{Code/ch07/images/07_02.png} 50 | \end{frame} 51 | 52 | \begin{frame} 53 | \frametitle{Combining predictions via majority voting} 54 | We have predictions of individual classifiers $C_j$ and need to select the final class label $\hat{y}$ 55 | \[ 56 | \hat{y} = mode \{ C_1 (\mathbf{x}), C_2 (\mathbf{x}), \dots, C_m (\mathbf{x}) \} 57 | \] 58 | For example, in a binary classification task where $class_1 = -1$ and $class_2 = +1$, we can write the majority vote prediction as follows: 59 | \[ 60 | C(\mathbf{x}) = sign \Bigg[ \sum_{j}^{m} C_j (\mathbf{x}) \Bigg] = \begin{cases} 61 | 1 & \text{ if } \sum_j C_j (\mathbf{x}) \ge 0 \\ 62 | -1 & \text{ otherwise } 63 | \end{cases} 64 | \] 65 | \end{frame} 66 | 67 | \begin{frame} 68 | \frametitle{Intuition why ensembles can work better} 69 | Assume that all $n$ base classifiers have the same error rate $\epsilon$. 
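% Illustrative sketch (not part of the original slides): the ensemble error developed
% below can be evaluated directly in Python (standard library, Python 3.8+); with the
% n = 11, eps = 0.25 example used on the next slide this prints roughly 0.034.
%
%   from math import comb
%
%   def ensemble_error(n, eps):
%       # probability that more than half of the n base classifiers are wrong
%       k_start = n // 2 + 1
%       return sum(comb(n, k) * eps**k * (1 - eps)**(n - k)
%                  for k in range(k_start, n + 1))
%
%   print(ensemble_error(11, 0.25))     # ~0.0343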
We can express the probability of an error of an ensemble as a probability mass function of a binomial distribution: 70 | \[ 71 | P(y \ge k) = \sum_{k}^{n} \binom{n}{k} \epsilon^k (1 - \epsilon)^{n-k} = \epsilon_{\text{ensemble}} 72 | \] 73 | Here, $\binom{n}{k}$ is the binomial coefficient \textit{n choose k}. In other words, we compute the probability that the prediction of the ensemble is wrong. 74 | \end{frame} 75 | 76 | \begin{frame} 77 | \frametitle{Example} 78 | Imagine we have 11 base classifiers ($n=11$) with an error rate of 0.25 ($\epsilon = 0.25$): 79 | \[ 80 | P(y \ge k) = \sum_{k=6}^{11} \binom{11}{k} 0.25^k (1 - 0.25)^{11-k} = 0.034 81 | \] 82 | So the error rate of the ensemble of $n=11$ classifiers is much lower than the error rate of the individual classifiers. 83 | \end{frame} 84 | 85 | \begin{frame} 86 | \frametitle{Same reasoning applied to a wider range of error rates} 87 | \center 88 | \includegraphics[scale=0.6]{Code/ch07/images/07_03.png} 89 | \end{frame} 90 | 91 | \begin{frame} 92 | \frametitle{Voting classifier in scikit-learn} 93 | \begin{itemize} 94 | \item Simply instantiate several classifiers 95 | \item Make a list 96 | \item Pass to sklearn.ensemble.VotingClassifier(...) 97 | \item \href{http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html}{\beamergotobutton{API link}} 98 | \end{itemize} 99 | \end{frame} 100 | 101 | \begin{frame}[fragile] 102 | \small 103 | \begin{verbatim} 104 | clf1 = LogisticRegression(random_state=1) 105 | clf2 = RandomForestClassifier(random_state=1) 106 | clf3 = GaussianNB() 107 | estimators = [('lr', clf1), ('rf', clf2), ('gnb', clf3)] 108 | ens_clf = VotingClassifier(estimators) 109 | ens_clf = ens_clf.fit(X, y) 110 | \end{verbatim} 111 | \end{frame} 112 | 113 | \begin{frame} 114 | \frametitle{Bootstrap aggregation (bagging)} 115 | \begin{itemize} 116 | \item We used the entire training set for the majority vote classifier 117 | \item Here we draw \textbf{bootstrap samples} 118 | \item In statistics, \textbf{bootstrapping} is any test or metric that relies on \textbf{random sampling with replacement}. 119 | \item Hypothesis testing: bootstrapping is often used as an alternative to statistical inference based on the assumption of a parametric model when that assumption is in doubt 120 | \item The basic idea of bootstrapping is that inference about a population from sample data can be modelled by resampling the sample data with replacement and performing inference about a sample from the resampled data.
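% Illustrative sketch (not part of the original slides): drawing a bootstrap sample
% (random sampling with replacement) is one line of NumPy; X_train / y_train are
% placeholders for a training set.
%
%   import numpy as np
%
%   rng = np.random.default_rng(1)
%   n = len(X_train)
%   idx = rng.choice(n, size=n, replace=True)     # some rows repeat, some are left out
%   X_boot, y_boot = X_train[idx], y_train[idx]   # train one ensemble member on this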
121 | \end{itemize} 122 | \end{frame} 123 | 124 | \begin{frame} 125 | \frametitle{Bagging} 126 | \center 127 | \includegraphics[scale=0.08]{Code/ch07/images/07_06.png} 128 | \end{frame} 129 | 130 | \begin{frame} 131 | \frametitle{Bootstrapping example} 132 | \center 133 | \includegraphics[scale=0.24]{Code/ch07/images/07_07.png} 134 | \begin{itemize} 135 | \item Seven training examples 136 | \item Sample randomly with replacement 137 | \item Use each bootstrap sample to train a classifier $C_j$ 138 | \item $C_j$ is typically a decision tree 139 | \item \textbf{Random Forests}: also use random feature subsets 140 | \end{itemize} 141 | 142 | \end{frame} 143 | 144 | \begin{frame} 145 | \frametitle{Bagging in scikit-learn} 146 | \begin{itemize} 147 | \item Instantiate a decision tree classifier 148 | \item Make a bagging classifier with decision trees 149 | \item Check that the accuracy is higher for the bagging classifier 150 | \item \href{https://github.com/rasbt/python-machine-learning-book/blob/master/code/ch07/ch07.ipynb}{\beamergotobutton{PML github}} 151 | \end{itemize} 152 | \end{frame} 153 | 154 | \begin{frame} 155 | \frametitle{Boosting} 156 | \begin{itemize} 157 | \item Basic idea: start with weak learners that have only a slight performance advantage over random guessing (e.g. a decision tree stump) and try to boost their performance by focusing on training samples that are hard to classify 158 | \item Very simple base classifiers learn from misclassified training examples 159 | \item The original boosting algorithm was formulated by Robert Schapire in 1990 160 | \item It was later refined into \textbf{AdaBoost} 161 | \item \textbf{AdaBoost} (short for Adaptive Boosting) is the most common implementation of boosting 162 | \end{itemize} 163 | \end{frame} 164 | 165 | \begin{frame} 166 | \frametitle{Original boosting algorithm} 167 | \begin{enumerate} 168 | \item Draw a random subset of training samples $d_1$ without replacement from the training set $D$ to train a weak learner $C_1$ 169 | \item Draw a second random training subset $d_2$ without replacement from the training set and add 50 percent of the samples that were previously misclassified to train a weak learner $C_2$ 170 | \item Find the training samples $d_3$ in the training set $D$ on which $C_1$ and $C_2$ disagree to train a third weak learner $C_3$ 171 | \item Combine the weak learners $C_1, C_2$, and $C_3$ via majority voting 172 | \end{enumerate} 173 | \end{frame} 174 | 175 | \begin{frame} 176 | \frametitle{AdaBoost} 177 | \begin{itemize} 178 | \item In contrast, AdaBoost uses the complete training set to train the weak learners 179 | \item Training samples are reweighted in each iteration to build a strong classifier 180 | \item End goal is to build a strong classifier that learns from the mistakes of the previous weak learners in the ensemble 181 | \end{itemize} 182 | \end{frame} 183 | 184 | \begin{frame} 185 | \frametitle{AdaBoost algorithm} 186 | \begin{enumerate} 187 | \item Set weight vector $\mathbf{w}$ to uniform weights where $\sum_i w_i = 1$. 188 | \item For $j$ in $m$ boosting rounds, do the following: 189 | \begin{enumerate} 190 | \item Train a weighted weak learner: $C_j = train(\mathbf{X, y, w})$. 191 | \item Predict class labels: $\hat{y} = predict(C_j, \mathbf{X})$. 192 | \item Compute the weighted error rate: $\epsilon = \mathbf{w} \cdot (\mathbf{\hat{y}} \neq \mathbf{y})$. 193 | \item Compute the coefficient $\alpha_j$: $\alpha_j=0.5 \log \frac{1 - \epsilon}{\epsilon}$.
194 | \item Update the weights: $\mathbf{w} := \mathbf{w} \times \exp \big( -\alpha_j \times \mathbf{\hat{y}} \times \mathbf{y} \big)$. 195 | \item Normalize weights to sum to 1: $\mathbf{w}:= \mathbf{w} / \sum_i w_i$. 196 | \end{enumerate} 197 | \item Compute the final prediction: $\mathbf{\hat{y}} = \big( \sum^{m}_{j=1} \big( \mathbf{\alpha}_j \times predict(C_j, \mathbf{X}) \big) > 0 \big)$. 198 | \end{enumerate} 199 | \scriptsize 200 | \\~\\ 201 | Notes: For clarity, we will denote element-wise multiplication by the cross symbol $(\times)$ and the dot product between two vectors by a dot symbol $(\cdot)$, respectively. Note that the expression $(\mathbf{\hat{y}} \neq \mathbf{y})$ in the weighted error-rate step refers to a vector of 1s and 0s, where a 1 is assigned if the prediction is incorrect and 0 is assigned otherwise. 202 | \end{frame} 203 | 204 | \begin{frame} 205 | \frametitle{} 206 | \begin{itemize} 207 | \item 208 | \end{itemize} 209 | \end{frame} 210 | 211 | \end{document} 212 | -------------------------------------------------------------------------------- /chapter12.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{latexsym} 3 | \usepackage{graphicx} 4 | \usetheme{Warsaw} 5 | 6 | \title{Chapter 12} 7 | \subtitle{Artificial Neural Networks} 8 | 9 | \begin{document} 10 | \maketitle 11 | 12 | \begin{frame} 13 | \frametitle{Deep learning} 14 | \begin{itemize} 15 | \item Big in ML 16 | \item Set of algorithms to train neural networks 17 | \item Python libraries available 18 | \item Outline 19 | \begin{itemize} 20 | \item Forward propagation in ANNs 21 | \item Backpropagation to learn the parameters 22 | \item Debugging ANNs 23 | \item Alternative architectures (CNN, RNN) 24 | \end{itemize} 25 | \end{itemize} 26 | \end{frame} 27 | 28 | \begin{frame} 29 | \frametitle{Single neuron review} 30 | \includegraphics[width=\textwidth]{Code/ch12/images/12_01.png} 31 | \end{frame} 32 | 33 | \begin{frame} 34 | \frametitle{Adaline review} 35 | \begin{itemize} 36 | \item Perceptron 37 | \begin{itemize} 38 | \item Update all weights, then recompute $\hat{y}$ 39 | \item Weight update done after seeing each sample 40 | \[ 41 | \Delta w_j = \eta \bigg( y^{(i)} - \hat{y}^{(i)} \bigg)x_{j}^{(i)} 42 | \] 43 | \end{itemize} 44 | \item Adaline 45 | \begin{itemize} 46 | \item Weight update done after entire training set has been seen 47 | \item In every epoch, update all weights as follows: 48 | \[ 49 | \mathbf{w} := \mathbf{w} + \Delta \mathbf{w}, \quad \text{where } \Delta \mathbf{w} = - \eta \nabla J (\mathbf{w}) 50 | \] 51 | \item I.e.
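% Illustrative sketch (not part of the original slides): one batch-gradient-descent
% epoch for Adaline, matching w := w + Delta w with Delta w = -eta * grad J(w) above;
% X, y, w and eta are placeholders (NumPy assumed).
%
%   import numpy as np
%
%   def adaline_epoch(X, y, w, eta=0.01):
%       a = X.dot(w)                  # linear activation: phi(z) = z
%       errors = y - a
%       grad = -X.T.dot(errors)       # gradient of the sum-of-squares cost
%       return w - eta * grad         # i.e. w + eta * X.T.dot(errors)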
compute the gradient based on all samples in the training set (this is known as batch gradient descent) 52 | \item SGD (stochastic gradient descent) updates the weights after each individual training sample 53 | \item Mini-batch: middle ground between SGD and batch GD 54 | \end{itemize} 55 | \end{itemize} 56 | \end{frame} 57 | 58 | \begin{frame} 59 | \frametitle{Weight update details} 60 | Partial derivative for each weight $w_j$ in the weight vector $\mathbf{w}$: 61 | \[ 62 | \frac{\partial}{\partial w_j} J(\mathbf{w}) = - \sum_i \big( y^{(i)} - a^{(i)} \big) x_{j}^{(i)} 63 | \] 64 | Here $y^{(i)}$ is the target class label of a particular sample $x^{(i)}$, and $a^{(i)}$ is the \textit{activation} of the neuron, which is a linear function in the case of Adaline. Remember that we defined the \textit{activation function} $\phi(\cdot)$ as follows: 65 | \[ 66 | \phi(z) = z = a 67 | \] 68 | Here, the net input $z$ is a linear combination of the weights that are connecting the 69 | input to the output layer: 70 | \[ 71 | z = \sum_j w_j x_j = \mathbf{w}^T \mathbf{x} 72 | \] 73 | \end{frame} 74 | 75 | \begin{frame} 76 | \frametitle{Multi-layer feedforward neural network} 77 | \center 78 | \includegraphics[scale=0.6]{Code/ch12/images/12_02.png} 79 | \end{frame} 80 | 81 | \begin{frame} 82 | \frametitle{Notation} 83 | \begin{itemize} 84 | \item We denote the $i$th activation unit in the $l$th layer as $a_{i}^{(l)}$ 85 | \item The activation units $a_{0}^{(1)}$ and $a_{0}^{(2)}$ are the \textit{bias units}, which we set equal to 1 86 | \item The activation of the units in the input layer: 87 | \[ 88 | \mathbf{a}^{(1)} = 89 | \begin{bmatrix} 90 | a_{0}^{(1)} \\ 91 | a_{1}^{(1)} \\ 92 | \vdots \\ 93 | a_{m}^{(1)} 94 | \end{bmatrix} 95 | = 96 | \begin{bmatrix} 97 | 1 \\ 98 | x_{1}^{(i)} \\ 99 | \vdots \\ 100 | x_{m}^{(i)} 101 | \end{bmatrix} 102 | \] 103 | \item The connection from the $k$th unit in layer $l$ to the $j$th unit in layer 104 | $l+1$ is written as $w^{(l)}_{j, k}$ 105 | \end{itemize} 106 | \end{frame} 107 | 108 | \begin{frame} 109 | \frametitle{Notation summary} 110 | \includegraphics[scale=0.4]{Code/ch12/images/12_03.png} 111 | \end{frame} 112 | 113 | \begin{frame} 114 | \frametitle{MLP learning procedure} 115 | \begin{enumerate} 116 | \item Starting at the input layer, forward propagate $\mathbf{x^{(i)}}$ 117 | \item Calculate the error that we will want to minimize 118 | \item Find its derivative with respect to each weight 119 | \item Update the weights 120 | \end{enumerate} 121 | \end{frame} 122 | 123 | \begin{frame} 124 | \frametitle{Forward propagation} 125 | \begin{enumerate} 126 | \item Assume the input has $m$ dimensions 127 | \item Compute the net input $z_{1}^{(2)}$ for unit 1 in the hidden layer: 128 | \[ 129 | z_{1}^{(2)} = a_{0}^{(1)} w_{1,0}^{(1)} + a_{1}^{(1)} w_{1, 1}^{(1)} + \dots + a_{m}^{(1)} w_{1, m}^{(1)} 130 | \] 131 | \item Compute the activation for unit 1 in the hidden layer: 132 | \[ 133 | a_{1}^{(2)} = \phi \big( z_{1}^{(2)} \big) 134 | \] 135 | \item Here $\phi(\cdot)$ is the activation function 136 | \item Logistic sigmoid is often used: 137 | \[ 138 | \phi(z) = \frac{1}{1 + e^{-z}}.
139 | \] 140 | \end{enumerate} 141 | \end{frame} 142 | 143 | \begin{frame} 144 | \frametitle{Sigmoid function} 145 | \center 146 | \includegraphics[scale=0.6]{Code/ch12/images/12_04.png} 147 | \end{frame} 148 | 149 | \begin{frame} 150 | \frametitle{Vectorized notation} 151 | \begin{itemize} 152 | \item Write activation in a matrix form 153 | \item Readability + more efficient code 154 | \item Net inputs for the hidden layer: 155 | \[ 156 | \mathbf{z}^{(2)} = \mathbf{W}^{(1)} \mathbf{a}^{(1)} 157 | \] 158 | \item Dimensions (ignoring bias units for simplicity) 159 | \[ 160 | [h \times 1] = [h \times m] [m \times 1] 161 | \] 162 | \item Activations for the hidden layer: 163 | \[ 164 | \mathbf{a}^{(2)} = \phi \big( \mathbf{z}^{(2)} \big) 165 | \] 166 | \end{itemize} 167 | \end{frame} 168 | 169 | \begin{frame} 170 | \frametitle{Matrix notation} 171 | \begin{itemize} 172 | \item Generalize computation to all $n$ samples in the training set 173 | \[ 174 | \mathbf{Z}^{(2)} = \mathbf{W}^{(1)} \big[ \mathbf{A}^{(1)} \big]^T 175 | \] 176 | \item Matrix dimensions 177 | \[ 178 | [h \times n] = [h \times m] [n \times m]^T 179 | \] 180 | \item Activation matrix 181 | \[ 182 | \mathbf{A}^{(2)} = \phi \big( \mathbf{Z}^{(2)} \big) 183 | \] 184 | \item Now activation of the output layer 185 | \[ 186 | \mathbf{Z}^{(3)} = \mathbf{W}^{(2)} \mathbf{A}^{(2)} 187 | \] 188 | \item Matrix dimensions 189 | \[ 190 | [t \times n] = [t \times h] [h \times n] 191 | \] 192 | \item Output of the network 193 | \[ 194 | \mathbf{A}^{(3)} = \phi \big( \mathbf{Z}^{(3)} \big), \; \mathbf{A}^{(3)} \in \mathbb{R}^{t \times n}. 195 | \] 196 | \end{itemize} 197 | \end{frame} 198 | 199 | \begin{frame} 200 | \frametitle{Cost function} 201 | The logistic Cost function is the same we used for logistic regression: 202 | \[ 203 | J(\mathbf{w}) = -\sum_{i=1}^{n} y^{(i)} \log \big( a^{(i)} \big) + \big( 1 - y^{(i)} \big) \log \big( 1 - a^{(i)}\big) 204 | \] 205 | Here, $a^{(i)}$ is the sigmoid activation of the $i$th unit $a^{(i)} = \phi \big( z^{(i)} \big)$. 206 | Regularization: 207 | \[ 208 | L2 = \lambda \lVert \mathbf{w} \rVert^{2}_{2} = \lambda \sum_{j=1}^{m} w_{j}^{2} 209 | \] 210 | \[ 211 | J(\mathbf{w}) = - \Bigg[ \sum_{i=1}^{n} y^{(i)} \log \big( a^{(i)} \big) + \big(1 - y^{(i)} \big) \log \big(1- a^{(i)} \big) \Bigg] + \frac{\lambda}{2} \lVert \mathbf{w} \rVert^{2}_{2} 212 | \] 213 | 214 | \end{frame} 215 | 216 | \begin{frame} 217 | \frametitle{Cost function for all units in output layer} 218 | The activation of the third layer and the target class could be: 219 | \[ 220 | a^{(3)} = 221 | \begin{bmatrix} 222 | 0.1 \\ 223 | 0.9 \\ 224 | \vdots \\ 225 | 0.3 226 | \end{bmatrix} 227 | ,\; \mathbf{y} = 228 | \begin{bmatrix} 229 | 0 \\ 230 | 1 \\ 231 | \vdots \\ 232 | 0 233 | \end{bmatrix} 234 | \] 235 | So, we need to generalize the logistic cost function to all activation units $j$ in our network. 
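% Illustrative sketch (not part of the original slides): the vectorized forward pass
% from the previous slides as a NumPy sketch; bias units are ignored, as in the
% dimension analysis above (W1 is h x m, W2 is t x h, A1 is the n x m input matrix).
%
%   import numpy as np
%
%   def sigmoid(z):
%       return 1.0 / (1.0 + np.exp(-z))
%
%   def forward(A1, W1, W2):
%       Z2 = W1.dot(A1.T)             # [h x n] = [h x m][m x n]
%       A2 = sigmoid(Z2)
%       Z3 = W2.dot(A2)               # [t x n] = [t x h][h x n]
%       A3 = sigmoid(Z3)              # network output, one column per sample
%       return A3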
The cost function (without the regularization term) becomes: 236 | \[ 237 | J(\mathbf{w}) = - \sum_{i=1}^{n} \sum_{j=1}^{t} y_{j}^{(i)} \log(a_{j}^{(i)}) + (1 - y_{j}^{(i)}) \log(1 - a^{(i)}_{j}) 238 | \] 239 | Superscript $i$ is the index of a particular sample in training set 240 | \end{frame} 241 | 242 | \begin{frame} 243 | \frametitle{Cost function for the entire network} 244 | Sum all the weights in the entire network in the regularization term: 245 | \[ 246 | J(\mathbf{w}) = - \Bigg[ \sum_{i=1}^{n} \sum_{j=1}^{m} y_{j}^{(i)} \log \bigg( \phi \Big( z_{j}^{(i)} \Big) \bigg) + \Big(1 - y_{j}^{(i)} \Big) \log \bigg(1 - \phi \Big( z_{j}^{(i)} \Big) \bigg) \Bigg] + 247 | \] 248 | \[ 249 | + \frac{\lambda}{2} \sum_{l=1}^{L-1} \sum_{i=1}^{u_l} \sum_{j=1}^{u_{l+1}} \Big(w_{j, i}^{(l)}\Big)^2 250 | \] 251 | The following expression represents the L2-penalty term: 252 | \[ 253 | \frac{\lambda}{2} \sum_{l=1}^{L-1} \sum_{i=1}^{u_l} \sum_{j=1}^{u_{l+1}} \Big(w_{j, i}^{(l)}\Big)^2 254 | \] 255 | \end{frame} 256 | 257 | \begin{frame} 258 | \frametitle{Minimizing the cost function} 259 | We want to minimize the cost function $J(\mathbf{w})$, so we calculate the partial derivative with respect to each weight for every layer in the network: 260 | \[ 261 | \frac{\partial J(\mathbf{W})}{\partial w_{j, i}^{(l)}} 262 | \] 263 | \end{frame} 264 | 265 | \begin{frame} 266 | \frametitle{} 267 | \begin{itemize} 268 | \item 269 | \end{itemize} 270 | \end{frame} 271 | 272 | \begin{frame} 273 | \frametitle{} 274 | \begin{itemize} 275 | \item 276 | \end{itemize} 277 | \end{frame} 278 | 279 | \begin{frame} 280 | \frametitle{} 281 | \begin{itemize} 282 | \item 283 | \end{itemize} 284 | \end{frame} 285 | 286 | \begin{frame} 287 | \frametitle{} 288 | \begin{itemize} 289 | \item 290 | \end{itemize} 291 | \end{frame} 292 | 293 | \begin{frame} 294 | \frametitle{} 295 | \begin{itemize} 296 | \item 297 | \end{itemize} 298 | \end{frame} 299 | 300 | \begin{frame} 301 | \frametitle{} 302 | \begin{itemize} 303 | \item 304 | \end{itemize} 305 | \end{frame} 306 | 307 | \begin{frame} 308 | \frametitle{} 309 | \begin{itemize} 310 | \item 311 | \end{itemize} 312 | \end{frame} 313 | 314 | \end{document} 315 | -------------------------------------------------------------------------------- /sample.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{latexsym} 3 | \usepackage{graphicx} 4 | \usetheme{Warsaw} 5 | 6 | \title[Active Learning for Phenotyping Tasks\hspace{2em}\insertframenumber/\inserttotalframenumber] 7 | {Active Learning for Phenotyping Tasks} 8 | \author{Dmitriy Dligach, Timothy A. Miller, and \textbf{Guergana Savova}} 9 | \institute{Boston Children's Hosptial and Harvard Medical School} 10 | \date{\today} 11 | 12 | \begin{document} 13 | 14 | % remove word ``Figure'' from graphics caption 15 | \setbeamertemplate{caption}{\insertcaption} 16 | 17 | %\maketitle 18 | \begin{frame}[t] 19 | \titlepage 20 | \end{frame} 21 | 22 | \begin{frame} 23 | \frametitle{Introduction} 24 | \begin{itemize} 25 | \item Phenotyping 26 | \begin{itemize} 27 | \item What's a phenotype? 
28 | \item i2b2 and eMERGE 29 | \item Link EHRs to biobanks for genetic analysis 30 | \item Supervised learning for phenotyping 31 | \end{itemize} 32 | \item Manual annotation needed 33 | \begin{itemize} 34 | \item Standard approach: passive learning 35 | \item Alternative: active learning 36 | \end{itemize} 37 | \end{itemize} 38 | \end{frame} 39 | 40 | \begin{frame} 41 | \frametitle{Active Learning} 42 | \begin{itemize} 43 | \item Approach for selecting data for annotation 44 | \item Data selection delegated to classifier 45 | \item Pool-based scenario 46 | \begin{itemize} 47 | \item Lots of unlabeled data 48 | \item Can afford to annotate only a small amount 49 | \end{itemize} 50 | \item Little work in clinical domain 51 | \end{itemize} 52 | \end{frame} 53 | 54 | \begin{frame} 55 | \frametitle{Intuition} 56 | Suppose there's a little bit of labeled data 57 | \newline 58 | \begin{itemize} 59 | \item Classify example $\vec{x}$ 60 | \begin{itemize} 61 | \item $p(c_1 | \vec{x}) = 0.95$ and $p(c_2 | \vec{x}) = 0.05$ 62 | \item $p(c_1 | \vec{x}) = 0.55$ and $p(c_2 | \vec{x}) = 0.45$ 63 | \newline 64 | \end{itemize} 65 | \item Margin Sampling 66 | \begin{itemize} 67 | \item $Prediction Margin = |P(c_1 | \vec{x}) - P(c_2 | \vec{x})|$ 68 | \item Annotate examples with smallest margin first 69 | \end{itemize} 70 | \end{itemize} 71 | \end{frame} 72 | 73 | \begin{frame} 74 | \frametitle{How does active learning work?} 75 | \begin{itemize} 76 | \item Seed classifier 77 | \begin{itemize} 78 | \item Annotate a small amount of data 79 | \item Train a classifier 80 | \end{itemize} 81 | \item Iterative process 82 | \begin{itemize} 83 | \item Apply the classifier to the pool of unlabeled data 84 | \item Select an example and add it to the training set 85 | \item Retrain the classifier 86 | \item Check if we are done 87 | \end{itemize} 88 | \item The learner quickly converges on the decision boundary 89 | \end{itemize} 90 | \end{frame} 91 | 92 | \begin{frame} 93 | \frametitle{Data Representation} 94 | \begin{itemize} 95 | \item Unit of classification 96 | \begin{itemize} 97 | \item Single patient 98 | \end{itemize} 99 | \item Patient representation 100 | \begin{itemize} 101 | \item Set of CUIs extracted with cTAKES 102 | \item Abstract from lexical variability of medical terminology 103 | \item Filter out non-clinical vocabulary 104 | \end{itemize} 105 | \item Phenotype-specific dictionaries 106 | \item Patient vector $\vec{x}$ 107 | \begin{itemize} 108 | \item Element $x_n$ is frequency of $CUI_n$ 109 | \end{itemize} 110 | \end{itemize} 111 | \end{frame} 112 | 113 | \begin{frame} 114 | \frametitle{Naive Bayes} 115 | \begin{itemize} 116 | \item Need to evaluate $p(c_i|\vec{x})$ 117 | \item Multinomial Naive Bayes 118 | \begin{itemize} 119 | \item Probabilistic classifier 120 | \item Supports multi-class classification 121 | \item Training and classification speed 122 | \end{itemize} 123 | \item Uncertainty sampling: 124 | \begin{equation} 125 | prediction\: margin = |p(c_1|\vec{x}) - p(c_2|\vec{x})| 126 | \end{equation} 127 | \end{itemize} 128 | \end{frame} 129 | 130 | \begin{frame} 131 | \frametitle{Counts}\ 132 | Compute posterior probability as follows: 133 | \begin{equation} 134 | p(c_i|\vec{x}) = \frac{1}{Z}p(c_i)\prod_{n=1}^Np(CUI_n|c_i)^{x_n} 135 | \end{equation} 136 | \fontsize{6.5pt}{7.2}\selectfont 137 | \\ 138 | $p(c_i)$ - prior probability of class $c_i$ \\ 139 | $N$ is the number of CUIs in the phenotype-specific dictionary \\ 140 | $CUI_n$ is the $n_{th}$ CUI in that dictionary \\ 141 | $x_n$ 
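% A short scikit-learn sketch of the margin-based selection step described on the
% Margin Sampling and Naive Bayes slides (illustrative only; the arrays X_seed,
% y_seed, and X_pool are assumed to be defined):
%   import numpy as np
%   from sklearn.naive_bayes import MultinomialNB
%   clf = MultinomialNB().fit(X_seed, y_seed)      # seed classifier
%   probs = clf.predict_proba(X_pool)              # class probabilities for the pool
%   top2 = np.sort(probs, axis=1)[:, -2:]          # two most probable classes per example
%   margin = top2[:, 1] - top2[:, 0]               # prediction margin
%   next_example = np.argmin(margin)               # annotate the smallest margin first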
is the frequency of $CUI_n$ in $\vec{x}$ \\ 142 | $Z$ (evidence) is the scaling factor \\ 143 | Determine $p(c_i)$ and $p(CUI_n|c_i)$ via maximum likelihood estimation 144 | \end{frame} 145 | 146 | \begin{frame} 147 | \frametitle{Dataset creation} 148 | \begin{itemize} 149 | \item Created within the i2b2 initiative 150 | \item ICD-9 codes used to form initial cohort 151 | \item About 600 patients selected randomly 152 | \item Labeled by domain experts 153 | \end{itemize} 154 | \end{frame} 155 | 156 | \begin{frame} 157 | \frametitle{Dataset stats} 158 | \resizebox{\linewidth}{!}{ 159 | \begin{tabular}{|l|r|r|r|} 160 | \hline 161 | Phenotype & Total Instances & Number of Classes & Proportion of Predominant Class \\ 162 | \hline 163 | Ulcerative Colitis & 600 & 2 & 0.630 \\ 164 | Crohn's Disease & 600 & 2 & 0.665 \\ 165 | Multiple Sclerosis & 595 & 5 & 0.395 \\ 166 | Type II Diabetes & 600 & 3 & 0.583 \\ 167 | \hline \end{tabular}} 168 | \end{frame} 169 | 170 | \begin{frame} 171 | \frametitle{Evaluation} 172 | \begin{itemize} 173 | \item Learning curve generation 174 | \begin{itemize} 175 | \item Done in the style of 10-fold cross validation 176 | \end{itemize} 177 | \item Within each fold: 178 | \begin{itemize} 179 | \item Training data 180 | \item Pool of ``unlabeled'' examples 181 | \item Held-out test set 182 | \end{itemize} 183 | \item Various seed sizes 184 | \begin{itemize} 185 | \item Affect of seed size and performance 186 | \item Only showing the plots for seed size = 30 187 | \item See the paper for other sizes 188 | \end{itemize} 189 | \item Gold labels in the pool hidden from classifier 190 | \end{itemize} 191 | \end{frame} 192 | 193 | \begin{frame} 194 | \frametitle{Learning Curves} 195 | \begin{center} 196 | \begin{figure} 197 | \includegraphics[width=0.30\textwidth]{figures/uc30.png} 198 | \includegraphics[width=0.30\textwidth]{figures/cd30.png} \\ 199 | \includegraphics[width=0.30\textwidth]{figures/ms30.png} 200 | \includegraphics[width=0.30\textwidth]{figures/t2d30.png} \\ 201 | \caption{Ulcerative Colitis, Crohn's Disease, Multiple Sclerosis, Type II Diabetes} 202 | \end{figure} 203 | \end{center} 204 | \end{frame} 205 | 206 | \begin{frame} 207 | \frametitle{Close-up} 208 | \begin{center} 209 | \begin{figure} 210 | \includegraphics[width=0.7\textwidth]{figures/uc30.png} 211 | \caption{Ulcerative Colitis} 212 | \end{figure} 213 | \end{center} 214 | \end{frame} 215 | 216 | \begin{frame} 217 | \frametitle{Sample plot} 218 | \begin{columns}[c] 219 | \column{.5\textwidth} 220 | \begin{itemize} 221 | \item {\scriptsize Active Learning above passive} 222 | \item {\scriptsize Only need $1/3$ of the data} 223 | \item {\scriptsize Best performance higher} 224 | \end{itemize} 225 | \column{.5\textwidth} 226 | \begin{center} 227 | \begin{figure} 228 | \includegraphics[width=1.0\textwidth]{figures/uc30.png} 229 | \caption{Ulcerative Colitis} 230 | \end{figure} 231 | \end{center} 232 | \end{columns} 233 | \end{frame} 234 | 235 | \begin{frame} 236 | \frametitle{Difference between areas under the curve (Active - Passive)} 237 | \resizebox{\linewidth}{!}{ 238 | \begin{tabular}{|l|r|r|r|r|} 239 | \hline 240 | Seed Size & Ulcerative Colitis & Crohn's Disease & Multiple Sclerosis & Type II Diabetes \\ 241 | \hline 242 | 10 & 6.90 & 4.17 & 10.50 & 11.05 \\ 243 | 30 & 6.64 & 2.21 & 15.43 & 7.49 \\ 244 | 50 & 8.63 & 1.75 & 8.61 & 8.90 \\ 245 | \hline 246 | \end{tabular}} 247 | \end{frame} 248 | 249 | \begin{frame} 250 | \frametitle{Conclusion} 251 | \begin{itemize} 252 | \item Annotation effort 
reduced by 2/3 253 | \item Active learning sometimes reaches better accuracy 254 | \item Need to know when to stop 255 | \item What happens if the base classifier is swapped? 256 | \end{itemize} 257 | \end{frame} 258 | 259 | \begin{frame} 260 | Questions? 261 | \end{frame} 262 | 263 | \end{document} 264 | -------------------------------------------------------------------------------- /chapter2.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{latexsym} 3 | \usepackage{graphicx} 4 | \usetheme{Warsaw} 5 | 6 | \title{Chapter 2} 7 | \subtitle{Training Machine Learning Algorithms for Classification} 8 | 9 | \begin{document} 10 | 11 | \maketitle 12 | 13 | \begin{frame} 14 | \frametitle{Biology} 15 | \begin{center} 16 | \begin{figure} 17 | \includegraphics[width=\textwidth]{Code/ch02/images/02_01.png} 18 | \end{figure} 19 | \end{center} 20 | \end{frame} 21 | 22 | \begin{frame} 23 | \frametitle{Logic Gate} 24 | \begin{center} 25 | \begin{figure} 26 | \includegraphics[width=\textwidth]{Code/ch02/images/02_01.png} 27 | \end{figure} 28 | \end{center} 29 | \begin{itemize} 30 | \item Simple logic gate with binary outputs 31 | \item Signals arrive at dendrites 32 | \item Integrated into cell body 33 | \item If signal exceeds threshold, generate output, and pass to axon 34 | \end{itemize} 35 | \end{frame} 36 | 37 | \begin{frame} 38 | \frametitle{Rosenblatt Perceptron} 39 | \begin{itemize} 40 | \item Binary classification task 41 | \item Positive class (1) vs. negative class (-1) 42 | \item Define activation function $\phi(z)$ 43 | \item Takes as input a dot product of input and weights 44 | \item Net input: $z = w_1 x_1 + \dots + w_m x_m$ 45 | \end{itemize} 46 | 47 | \[ 48 | \mathbf{w} = \begin{bmatrix} 49 | w^{(1)} \\ 50 | w^{(2)} \\ 51 | \vdots \\ 52 | w^{(m)} 53 | \end{bmatrix}, 54 | \mathbf{x} = \begin{bmatrix} 55 | x^{(1)} \\ 56 | x^{(2)} \\ 57 | \vdots \\ 58 | x^{(m)} 59 | \end{bmatrix} 60 | \] 61 | 62 | \end{frame} 63 | 64 | \begin{frame} 65 | \frametitle{Heaviside step function} 66 | \begin{itemize} 67 | \item $\phi(z)$ known as activation 68 | \item if activation above some threshold, predict class 1 69 | \item predict class -1 otherwise 70 | \end{itemize} 71 | Heaviside Step Function 72 | 73 | \[ \phi(z) = \begin{cases} 74 | 1 & \text{ if } z \ge \theta \\ 75 | -1 & \text{ otherwise }. 76 | \end{cases} 77 | \] 78 | \end{frame} 79 | 80 | \begin{frame} 81 | \frametitle{Step function simplified} 82 | Bring the threshold $\theta$ to the left side of the equation and define a weight-zero as $w_0 = -\theta$ and $x_0=1$, so that we write $\mathbf{z}$ in a more compact form 83 | 84 | \[ 85 | z = w_0 x_0 + w_1 x_1 + \dots + w_m x_m = \mathbf{w^T x} 86 | \] 87 | 88 | and 89 | 90 | \[ \phi(z) = \begin{cases} 91 | 1 & \text{ if } z \ge 0 \\ 92 | -1 & \text{ otherwise }. 93 | \end{cases} 94 | \] 95 | \end{frame} 96 | 97 | \begin{frame} 98 | \frametitle{Basic Linear Algebra} 99 | Vector dot product 100 | \[ 101 | z = \mathbf{w^T x} = \sum_{j=0}^{m} \mathbf{w_j} \mathbf{x_j} 102 | \] 103 | 104 | \[ 105 | \big[1 \quad 2 \quad 3 \big] \times \begin{bmatrix} 106 | 4 \\ 107 | 5 \\ 108 | 6 109 | \end{bmatrix} = 1 \times 4 + 2 \times 5 + 3 \times 6 = 32. 
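% The same dot product as a quick NumPy check (illustrative, not from the book's Code directory):
%   import numpy as np
%   w = np.array([1, 2, 3])
%   x = np.array([4, 5, 6])
%   z = np.dot(w, x)   # 32, the net input z = w^T x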
110 | \]
111 | \end{frame}
112 | 
113 | \begin{frame}
114 | \frametitle{Input squashed into a binary output}
115 | \includegraphics[width=\textwidth]{Code/ch02/images/02_02.png}
116 | \end{frame}
117 | 
118 | \begin{frame}
119 | \frametitle{Rosenblatt perceptron algorithm}
120 | \begin{enumerate}
121 | \item Initialize the weights to 0 or small random numbers.
122 | \item For each training sample $\mathbf{x}^{(i)}$, perform the following steps:
123 | \begin{enumerate}
124 | \item Compute the output value $\hat{y}$.
125 | \item Update the weights.
126 | \end{enumerate}
127 | \end{enumerate}
128 | \end{frame}
129 | 
130 | \begin{frame}
131 | \frametitle{Weight update}
132 | Weight update rule:
133 | \[
134 | w_j := w_j + \Delta w_j
135 | \]
136 | Perceptron learning rule:
137 | \[
138 | \Delta w_j = \eta \bigg( y^{(i)} - \hat{y}^{(i)} \bigg)x_{j}^{(i)}
139 | \]
140 | Where $\eta$ is the learning rate (a constant between 0.0 and 1.0), $y^{(i)}$ is the true class label of the $i$th training sample, and $\hat{y}^{(i)}$ is the predicted class label.
141 | \end{frame}
142 | 
143 | \begin{frame}
144 | \frametitle{Update rule examples}
145 | Correct prediction, weights unchanged:
146 | \[
147 | \Delta w_j = \eta \bigg( -1 - (-1) \bigg)x_{j}^{(i)} = 0
148 | \]
149 | 
150 | \[
151 | \Delta w_j = \eta \bigg( 1 - 1 \bigg)x_{j}^{(i)} = 0
152 | \]
153 | Wrong prediction, weights pushed towards the positive or negative class:
154 | \[
155 | \Delta w_j = \eta \bigg( 1 - (-1) \bigg)x_{j}^{(i)} = \eta(2)x_{j}^{(i)}
156 | \]
157 | 
158 | \[
159 | \Delta w_j = \eta \bigg( -1 - 1 \bigg)x_{j}^{(i)} = \eta(-2)x_{j}^{(i)}
160 | \]
161 | \end{frame}
162 | 
163 | \begin{frame}
164 | \frametitle{Linear separability}
165 | \includegraphics[width=\textwidth]{Code/ch02/images/02_03.png}
166 | \end{frame}
167 | 
168 | \begin{frame}
169 | \frametitle{Convergence}
170 | Convergence is guaranteed if:
171 | \begin{itemize}
172 | \item The two classes are linearly separable
173 | \item Learning rate is sufficiently small
174 | \end{itemize}
175 | If the classes cannot be separated:
176 | \begin{itemize}
177 | \item Set a maximum number of passes over the training dataset (epochs)
178 | \item Set a threshold for the number of tolerated misclassifications
179 | \item Otherwise, the algorithm will never stop updating the weights (never converge)
180 | \end{itemize}
181 | \end{frame}
182 | 
183 | \begin{frame}
184 | \frametitle{Linear separability}
185 | \includegraphics[width=\textwidth]{Code/ch02/images/02_04.png}
186 | \end{frame}
187 | 
188 | \begin{frame}
189 | \frametitle{Perceptron implementation}
190 | \href{https://github.com/rasbt/python-machine-learning-book/blob/master/code/ch02/ch02.ipynb}{\beamergotobutton{iPython notebook on github}}
191 | \end{frame}
192 | 
193 | \begin{frame}
194 | \frametitle{ADAPtive LInear NEuron (Adaline)}
195 | \begin{itemize}
196 | \item Weights updated based on a linear activation function
197 | \item Remember that perceptron used a unit step function
198 | \item $\phi(z)$ is simply the identity function of the net input
199 | 
200 | \[
201 | \phi \big( \mathbf{w}^T \mathbf{x} \big) = \mathbf{w}^T \mathbf{x}
202 | \]
203 | 
204 | \item A quantizer is then used to predict the class label
205 | \end{itemize}
206 | \end{frame}
207 | 
208 | \begin{frame}
209 | \frametitle{Adaline: notice the difference with perceptron}
210 | \includegraphics[width=\textwidth]{Code/ch02/images/02_09.png}
211 | \end{frame}
212 | 
213 | \begin{frame}
214 | \frametitle{Cost functions}
215 | \begin{itemize}
216 | \item ML algorithms often define an \emph{objective} function
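% A compact NumPy sketch of the perceptron learning rule from the earlier slides
% (one epoch over the training set; X, y, and eta are assumed, with x_0 = 1 folded into X):
%   import numpy as np
%   def perceptron_epoch(w, X, y, eta=0.1):
%       for xi, target in zip(X, y):
%           y_hat = np.where(xi @ w >= 0.0, 1, -1)    # Heaviside step activation
%           w = w + eta * (target - y_hat) * xi       # Delta w_j = eta (y - y_hat) x_j
%       return w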
217 | \item This function is optimized during learning 218 | \item It is often a \emph{cost} function we want to minimize 219 | \item Adaline uses a cost function $J(\cdot)$ 220 | \item Learns weights as the sum of squared errors (SSE) 221 | \[ 222 | J(\mathbf{w}) = \frac{1}{2} \sum_i \bigg(y^{(i)} - \phi \big(z^{(i)} \big) \bigg)^2 223 | \] 224 | \end{itemize} 225 | \end{frame} 226 | 227 | \begin{frame} 228 | \frametitle{Advantages of Adaline cost function} 229 | \begin{itemize} 230 | \item The linear activation function is differentiable 231 | \item Unlike the unit step function 232 | \item It is convex 233 | \item Can use \emph{gradient descent} to learn the weights 234 | \end{itemize} 235 | \end{frame} 236 | 237 | \begin{frame} 238 | \frametitle{What is the gradient? Ask Wikipedia:} 239 | \begin{itemize} 240 | \item The gradient is a multi-variable generalization of the derivative. While a derivative can be defined on functions of a single variable, for functions of several variables, the gradient takes its place. 241 | \item Like the derivative, the gradient represents the slope of the tangent of the graph of the function. More precisely, the gradient points in the direction of the greatest rate of increase of the function, and its magnitude is the slope of the graph in that direction. 242 | \end{itemize} 243 | \end{frame} 244 | 245 | \begin{frame} 246 | \frametitle{Gradient Descent} 247 | \includegraphics[width=\textwidth]{Code/ch02/images/02_10.png} 248 | \end{frame} 249 | 250 | \begin{frame} 251 | \frametitle{Gradient descent: an intuition} 252 | \begin{itemize} 253 | \item Suppose you are at the top of a mountain, and you have to reach a lake which is at the lowest point of the mountain (a.k.a valley). A twist is that you are blindfolded and you have zero visibility to see where you are headed. So, what approach will you take to reach the lake? 254 | \item The best way is to check the ground near you and observe where the land tends to descend. This will give an idea in what direction you should take your first step. If you follow the descending path, it is very likely you would reach the lake. 255 | \end{itemize} 256 | \vspace{0.2in} 257 | \tiny 258 | https://www.analyticsvidhya.com/blog/2017/03/introduction-to-gradient-descent-algorithm-along-its-variants/ 259 | \end{frame} 260 | 261 | \begin{frame} 262 | \frametitle{Gradient Descent: an intuition} 263 | \center 264 | \includegraphics[scale=0.40]{Images/grad_desc1.png} 265 | \tiny 266 | \vspace{0.2in} 267 | https://www.analyticsvidhya.com/blog/2017/03/introduction-to-gradient-descent-algorithm-along-its-variants/ 268 | \end{frame} 269 | 270 | \begin{frame} 271 | \frametitle{Gradient Descent: an intuition} 272 | \center 273 | \includegraphics[scale=0.8]{Images/grad_desc2.png} 274 | \vspace{0.2in} 275 | \tiny 276 | https://www.analyticsvidhya.com/blog/2017/03/introduction-to-gradient-descent-algorithm-along-its-variants/ 277 | \end{frame} 278 | 279 | \begin{frame} 280 | \frametitle{Gradient Descent} 281 | \begin{itemize} 282 | \item Weights updated by taking small steps 283 | \item Step size determined by learning rate 284 | \item Take a step away from the gradient $\nabla J(\mathbf{w})$ of the cost function 285 | \[ 286 | \mathbf{w} := \mathbf{w} + \Delta \mathbf{w}. 
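% A minimal NumPy sketch of one batch gradient descent step for Adaline
% (illustrative; X, y, w, and eta are assumed, with x_0 = 1 folded into X):
%   output = X @ w                # phi(z) is the identity, so phi(z) = w^T x
%   errors = y - output
%   w = w + eta * X.T @ errors    # Delta w = eta * sum_i (y_i - phi(z_i)) x_i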
287 | \]
288 | \item The weight change is defined as follows:
289 | \[
290 | \Delta \mathbf{w} = - \eta \nabla J(\mathbf{w})
291 | \]
292 | \end{itemize}
293 | \end{frame}
294 | 
295 | \begin{frame}
296 | \frametitle{Gradient computation}
297 | To compute the gradient of the cost function, we need to compute the partial derivative of the cost function with respect to each weight $w_j$:
298 | 
299 | \[
300 | \frac{\partial J}{\partial w_j} = - \sum_i \bigg( y^{(i)} - \phi \big(z^{(i)} \big) \bigg) x_{j}^{(i)},
301 | \]
302 | 
303 | Update of weight $w_j$:
304 | 
305 | \[
306 | \Delta w_j = - \eta \frac{\partial J}{\partial w_j} = \eta \sum_i \bigg( y^{(i)} - \phi \big(z^{(i)} \big) \bigg) x_{j}^{(i)}
307 | \]
308 | 
309 | We update all weights simultaneously, so the Adaline learning rule becomes
310 | 
311 | \[
312 | \mathbf{w} := \mathbf{w} + \Delta \mathbf{w}.
313 | \]
314 | \end{frame}
315 | 
316 | \begin{frame}
317 | \frametitle{Partial derivatives}
318 | \begin{equation*}
319 | \begin{split}
320 | & \frac{\partial J}{\partial w_j} = \frac{\partial}{\partial w_j} \frac{1}{2} \sum_i \bigg( y^{(i)} - \phi \big( z^{(i)} \big) \bigg)^2 \\
321 | & = \frac{1}{2} \frac{\partial}{\partial w_j} \sum_i \bigg( y^{(i)} - \phi \big( z^{(i)} \big) \bigg)^2 \\
322 | & = \frac{1}{2} \sum_i 2 \big( y^{(i)} - \phi(z^{(i)})\big) \frac{\partial}{\partial w_j} \Big( y^{(i)} - \phi({z^{(i)}}) \Big) \\
323 | & = \sum_i \big( y^{(i)} - \phi (z^{(i)}) \big) \frac{\partial}{\partial w_j} \Big( y^{(i)} - \sum_k \big(w_{k} x^{(i)}_{k} \big) \Big) \\
324 | & = \sum_i \bigg( y^{(i)} - \phi \big( z^{(i)} \big) \bigg) \bigg( - x_{j}^{(i)} \bigg) \\
325 | & = - \sum_i \bigg( y^{(i)} - \phi \big( z^{(i)} \big) \bigg) x_{j}^{(i)} \\
326 | \end{split}
327 | \end{equation*}
328 | \end{frame}
329 | 
330 | \begin{frame}
331 | \frametitle{Adaline learning rule vs. Perceptron rule}
332 | \begin{itemize}
333 | \item Looks (almost) identical. What is the difference?
334 | \item $\phi(z^{(i)})$ with $z^{(i)} = \mathbf{w}^T \mathbf{x}^{(i)}$ is a real number
335 | \item And not an integer class label as in Perceptron
336 | \item The weight update is done based on \emph{all} samples in the training set
337 | \item Perceptron updates weights incrementally after each sample
338 | \item This approach is known as ``batch'' gradient descent
339 | \end{itemize}
340 | \end{frame}
341 | 
342 | \begin{frame}
343 | \frametitle{Adaline implementation}
344 | \href{https://github.com/rasbt/python-machine-learning-book/blob/master/code/ch02/ch02.ipynb}{\beamergotobutton{iPython notebook on github}}
345 | \end{frame}
346 | 
347 | \begin{frame}
348 | \frametitle{Lessons learned}
349 | \includegraphics[width=\textwidth]{Code/ch02/images/02_12.png}
350 | \begin{itemize}
351 | \item Learning rate too high: error becomes larger (overshoots global min)
352 | \item Learning rate too low: takes many epochs to converge
353 | \item Feature normalization
354 | \end{itemize}
355 | \end{frame}
356 | 
357 | \begin{frame}
358 | \frametitle{Stochastic gradient descent (SGD)}
359 | \begin{itemize}
360 | \item Large dataset with millions of data points (``big data'')
361 | \item Batch gradient descent costly
362 | \item Need to compute the error for the entire dataset ...
363 | \item ... to take one step towards the global minimum!
364 | \[
365 | \Delta \mathbf{w} = \eta \sum_i \bigg( y^{(i)} - \phi \big( z^{(i)}\big) \bigg) \mathbf{x}^{(i)}.
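% For comparison, a stochastic gradient descent sketch updates after every single sample
% (illustrative; X, y, w, and eta are assumed; random ordering matters, as noted below):
%   import numpy as np
%   for i in np.random.permutation(len(y)):
%       error = y[i] - X[i] @ w        # phi(z) is the identity activation
%       w = w + eta * error * X[i]     # incremental weight update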
366 | \] 367 | \item SGD updates the weights incrementally for each training sample 368 | \[ 369 | \Delta \mathbf{w} = \eta \bigg( y^{(i)} - \phi \big( z^{(i)}\big) \bigg) \mathbf{x}^{(i)}. 370 | \] 371 | \end{itemize} 372 | \end{frame} 373 | 374 | \begin{frame} 375 | \frametitle{SGD details} 376 | \begin{itemize} 377 | \item Approximation of gradient descent 378 | \item Reaches convergence faster because of frequent weight updates 379 | \item Important to present data in random order 380 | \item Learning rate often gradually decreased (adaptive learning rate) 381 | \item Can be used for online learning 382 | \item Middle ground between SGD and batch GD is known as \emph{mini-batch learning} 383 | \begin{itemize} 384 | \item E.g. 50 examples at a time 385 | \item Can use vector/matrix operations rather than loops as in SGD 386 | \item Vectorized operations highly efficient 387 | \end{itemize} 388 | \end{itemize} 389 | \end{frame} 390 | 391 | \begin{frame} 392 | \frametitle{} 393 | \begin{itemize} 394 | \item 395 | \end{itemize} 396 | \end{frame} 397 | 398 | \end{document} 399 | -------------------------------------------------------------------------------- /chapter3.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{latexsym} 3 | \usepackage{graphicx} 4 | \usetheme{Warsaw} 5 | 6 | \title{Chapter 3} 7 | \subtitle{A Tour of Machine Learning Classifiers Using Scikit-learn} 8 | 9 | \begin{document} 10 | 11 | \maketitle 12 | 13 | \begin{frame} 14 | \frametitle{Choosing a classification algorithm} 15 | \begin{itemize} 16 | \item No classifier works best across all scenarios (``no free lunch'' theorem) 17 | \item Always need to consider the specifics of the problem 18 | \item Solving a problem within supervised ML framework: 19 | \begin{enumerate} 20 | \item Select features 21 | \item Choose performance metrics 22 | \item Choose classifier and optimization algorithm 23 | \item Evaluate performance of the model 24 | \item Tune the classifier 25 | \end{enumerate} 26 | \end{itemize} 27 | \end{frame} 28 | 29 | \begin{frame} 30 | \frametitle{Perceptron implementation} \href{https://github.com/rasbt/python-machine-learning-book/blob/master/code/ch03/ch03.ipynb}{\beamergotobutton{iPython notebook on github}} 31 | \end{frame} 32 | 33 | \begin{frame} 34 | \frametitle{Modeling class probabilities} 35 | \begin{itemize} 36 | \item What happens if the classes are not linearly separable? 37 | \item Weights never stop updating as long as there is at least one misclassified example in each epoch 38 | \item Logistic regression is a better option 39 | \item Note that despite the name this is a classification model 40 | \end{itemize} 41 | \end{frame} 42 | 43 | \begin{frame} 44 | \frametitle{Logistic regression model} 45 | \begin{itemize} 46 | \item This is a ``go to'' model for classification 47 | \item Designed for binary classification but can be extended to multiclass 48 | \item Odds ratio 49 | \[ 50 | \frac{p}{(1-p)} 51 | \] 52 | Where $p$ is the probability of the positive class (class label $y=1$). E.g. the probability that a patient has a certain disease. 
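% A tiny NumPy illustration of the odds ratio and the logit for a probability p
% (the numbers are only an example):
%   import numpy as np
%   p = 0.8
%   odds = p / (1 - p)       # 4.0
%   logit = np.log(odds)     # ~1.386; applying the sigmoid 1/(1+exp(-logit)) recovers p = 0.8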
53 | \item Logit function
54 | \[
55 | logit(p) = \log \frac{p}{1-p}
56 | \]
57 | \end{itemize}
58 | \end{frame}
59 | 
60 | \begin{frame}
61 | \frametitle{Modeling logit function}
62 | \begin{itemize}
63 | \item We model the logit function as a linear combination of features (dot product of feature values and weights)
64 | \[
65 | logit ( p (y=1 | \mathbf{x})) = w_0 x_0 + w_1 x_1 + \cdots + w_m x_m = \sum^{m}_{i=0} w_i x_i = \mathbf{w}^T \mathbf{x}.
66 | \]
67 | Where $p(y=1 | \mathbf{x})$ is the conditional probability that a particular sample belongs to class 1 given its features $\mathbf{x}$
68 | \item This is equivalent to expressing $p$ as
69 | \[
70 | p(y = 1 | \mathbf{x}) = \frac{1}{1 + e^{-\mathbf{w}^T \mathbf{x}}}
71 | \]
72 | \end{itemize}
73 | \end{frame}
74 | 
75 | \begin{frame}
76 | \frametitle{Logistic Sigmoid}
77 | \begin{itemize}
78 | \item Logistic function (aka sigmoid function)
79 | \[
80 | \phi(z) = \frac{1}{1+e^{-z}}.
81 | \]
82 | \item S-shaped curve
83 | \end{itemize}
84 | \center
85 | \includegraphics[scale=0.5]{Code/ch03/images/03_02.png}
86 | \end{frame}
87 | 
88 | \begin{frame}
89 | \frametitle{Relationship with Adaline}
90 | \begin{itemize}
91 | \item In Adaline, we used the identity function as the activation function
92 | \item In logistic regression, we instead use the sigmoid function
93 | \end{itemize}
94 | \includegraphics[width=\textwidth]{Code/ch03/images/03_03.png}
95 | \end{frame}
96 | 
97 | \begin{frame}
98 | \frametitle{Probability distribution over classes}
99 | \begin{itemize}
100 | \item Output of the sigmoid often interpreted as probability
101 | \item E.g. $P(y=1 | \mathbf{x};\mathbf{w}) = 0.8$
102 | \item Probability can be converted to a binary outcome (quantizer)
103 | \[ \hat{y}= \begin{cases}
104 | 1 & \text{ if } \phi(z) \ge 0.5 \\
105 | 0 & \text{ otherwise }.
106 | \end{cases}
107 | \]
108 | \item Which is equivalent to the following
109 | \[ \hat{y}= \begin{cases}
110 | 1 & \text{ if } z \ge 0.0 \\
111 | 0 & \text{ otherwise }
112 | \end{cases}
113 | \]
114 | \item For many applications (e.g.
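% A one-line NumPy version of the quantizer above (probas is an assumed array of sigmoid outputs):
%   y_hat = np.where(probas >= 0.5, 1, 0)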
weather forecasting), we want the probability
115 | \end{itemize}
116 | \end{frame}
117 | 
118 | \begin{frame}
119 | \frametitle{Learning the weights}
120 | \begin{itemize}
121 | \item Previously we minimized the sum-squared-error cost function
122 | \[
123 | J(\mathbf{w}) = \frac{1}{2} \sum_i \bigg( \phi \big( z^{(i)} \big) - y^{(i)} \bigg)^2
124 | \]
125 | \item Now we need to derive the cost function for logistic regression
126 | \item Define the likelihood $L$
127 | \end{itemize}
128 | \[
129 | L(\mathbf{w}) = P(\mathbf{y} | \mathbf{x}; \mathbf{w}) = \prod_{i=1}^{n} P \big( y^{(i)} | x^{(i)}; \mathbf{w} \big)
130 | \]
131 | \[
132 | L(\mathbf{w}) = \prod_{i=1}^{n} \bigg( \phi \big(z^{(i)} \big) \bigg) ^ {y^{(i)}} \bigg( 1 - \phi \big( z^{(i)} \big) \bigg)^{1-y^{(i)}}
133 | \]
134 | \end{frame}
135 | 
136 | \begin{frame}
137 | \frametitle{Log-likelihood function}
138 | \begin{itemize}
139 | \item Maximize the likelihood function
140 | \[
141 | L(\mathbf{w}) = P(\mathbf{y} | \mathbf{x}; \mathbf{w})
142 | \]
143 | \[
144 | L(\mathbf{w}) = \prod_{i=1}^{n} P \big( y^{(i)} | x^{(i)}; \mathbf{w} \big) = \prod_{i=1}^{n} \bigg( \phi \big(z^{(i)} \big) \bigg) ^ {y^{(i)}} \bigg( 1 - \phi \big( z^{(i)} \big) \bigg)^{1-y^{(i)}}
145 | \]
146 | \item In practice, it is easier to deal with the natural log of this equation
147 | \[
148 | l(\mathbf{w}) = \log L(\mathbf{w})
149 | \]
150 | \[
151 | l(\mathbf{w}) = \sum_{i=1}^{n} \Bigg[ y^{(i)} \log \bigg(\phi \big( z^{(i)} \big) \bigg) + \bigg(1 - y^{(i)} \bigg) \log \bigg( 1 - \phi \big( z^{(i)} \big) \bigg) \Bigg]
152 | \]
153 | \item Easier to take derivative + fewer numerical underflow issues
154 | \end{itemize}
155 | \end{frame}
156 | 
157 | \begin{frame}
158 | \frametitle{Cost function}
159 | \begin{itemize}
160 | \item Rewrite the negative log-likelihood as a cost function
161 | \[
162 | J(\mathbf{w}) = \sum_{i=1}^{n} \Bigg[- y^{(i)} \log \bigg(\phi \big( z^{(i)} \big) \bigg) - \bigg(1 - y^{(i)} \bigg) \log \bigg( 1 - \phi \big( z^{(i)} \big) \bigg) \Bigg]
163 | \]
164 | \item Can now be minimized using gradient descent
165 | \end{itemize} \href{https://github.com/rasbt/python-machine-learning-book/blob/master/code/ch03/ch03.ipynb}{\beamergotobutton{iPython notebook on github}}
166 | \end{frame}
167 | 
168 | \begin{frame}
169 | \frametitle{Weight update derivation}
170 | Calculate the partial derivative of the log-likelihood function with respect to the $j$th weight:
171 | \[
172 | \frac{\partial}{\partial w_j} l(\mathbf{w}) = \Bigg( y \frac{1}{\phi(z)} - (1-y) \frac{1}{1-\phi(z)} \Bigg) \frac{\partial}{\partial w_j} \phi(z)
173 | \]
174 | Partial derivative of the sigmoid function:
175 | \[
176 | \frac{\partial}{\partial z} \phi(z) = \frac{\partial}{\partial z} \frac{1}{1 + e^{-z}} = \frac{1}{\big( 1 + e^{-z}\big)^2} e^{-z} = \frac{1}{1 + e^{-z}} \bigg( 1 - \frac{1}{1 + e^{-z}} \bigg)
177 | \]
178 | \[
179 | = \phi(z)(1-\phi(z)).
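% A NumPy sketch of the logistic cost and its gradient as derived on these slides
% (illustrative; X, y, w, and eta are assumed, with labels y in {0, 1}):
%   import numpy as np
%   def sigmoid(z):
%       return 1.0 / (1.0 + np.exp(-z))
%   phi = sigmoid(X @ w)
%   cost = -np.sum(y * np.log(phi) + (1 - y) * np.log(1 - phi))
%   grad = -X.T @ (y - phi)     # dJ/dw_j = -sum_i (y_i - phi(z_i)) x_ij
%   w = w - eta * grad          # gradient descent step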
180 | \]
181 | \end{frame}
182 | 
183 | \begin{frame}
184 | \frametitle{Weight update derivation}
185 | Resubstitute $\frac{\partial}{\partial z} \phi(z) = \phi(z)(1-\phi(z))$ to obtain:
186 | \begin{equation*}
187 | \begin{split}
188 | & \Bigg( y \frac{1}{\phi(z)} - (1-y) \frac{1}{1-\phi(z)} \Bigg) \frac{\partial}{\partial w_j} \phi(z) \\
189 | & = \Bigg( y \frac{1}{\phi(z)} - (1-y) \frac{1}{1-\phi(z)} \Bigg) \phi(z) \big(1 - \phi(z)\big) \frac{\partial}{\partial w_j} z \\
190 | & = \bigg( y \big( 1 - \phi(z) \big) - (1-y) \phi(z) \bigg) x_j \\
191 | & = \big( y - \phi(z) \big) x_j
192 | \end{split}
193 | \end{equation*}
194 | \end{frame}
195 | 
196 | \begin{frame}
197 | \frametitle{Overfitting}
198 | \begin{itemize}
199 | \item Sometimes a model performs well on training data but does not generalize well to unseen data (test data)
200 | \item This is overfitting
201 | \item If a model suffers from overfitting, the model has a high variance
202 | \item This is often caused by a model that's too complex
203 | \item Underfitting can also occur (high bias)
204 | \item Underfitting is caused by a model that is not complex enough
205 | \item Both suffer from low performance on unseen data
206 | \end{itemize}
207 | \end{frame}
208 | 
209 | \begin{frame}
210 | \frametitle{Bias-variance tradeoff}
211 | \includegraphics[scale=0.4]{Images/bias-variance.jpg}
212 | \end{frame}
213 | 
214 | \begin{frame}
215 | \frametitle{Regularization}
216 | \includegraphics[scale=0.3]{Code/ch03/images/03_06.png}
217 | \begin{itemize}
218 | \item Regularization is a way to tune the complexity of the model
219 | \item Regularization helps to filter out noise from training data
220 | \item As a result, regularization prevents overfitting
221 | \end{itemize}
222 | \end{frame}
223 | 
224 | \begin{frame}
225 | \frametitle{Tackling overfitting via regularization}
226 | 
227 | The most common form of regularization is the so-called L2 regularization (sometimes also called L2 shrinkage or weight decay):
228 | \[
229 | \frac{\lambda}{2} \lVert \mathbf{w} \rVert^2 = \frac{\lambda}{2} \sum_{j=1}^m w_{j}^{2}
230 | \]
231 | Where $\lambda$ is the regularization parameter.
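% Continuing the NumPy sketch above, the L2 penalty and its contribution to the gradient
% would look roughly like this (lam, X, y, w are assumed; the bias weight w_0 is
% conventionally left unregularized):
%   penalty = (lam / 2.0) * np.sum(w[1:] ** 2)
%   cost = cost + penalty
%   grad[1:] = grad[1:] + lam * w[1:]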
232 | To apply regularization, we add the regularization term to the cost function, which shrinks the weights:
233 | \[
234 | J(\mathbf{w}) = \sum_{i=1}^{n} \bigg[ - y^{(i)} \log \big( \phi(z^{(i)}) \big) - \big( 1 - y^{(i)} \big) \log \big( 1 - \phi(z^{(i)}) \big) \bigg] + \frac{\lambda}{2} \lVert \mathbf{w}\rVert^2
235 | \]
236 | \end{frame}
237 | 
238 | \begin{frame}
239 | \frametitle{Regularization parameter}
240 | \begin{itemize}
241 | \item We control how well we fit the training data via the regularization parameter $\lambda$
242 | \item By increasing $\lambda$, we increase the strength of regularization
243 | \item Sometimes (e.g. in scikit-learn), SVM terminology is used
244 | \[
245 | C = \frac{1}{\lambda}
246 | \]
247 | \item I.e., we rewrite the regularized cost function of logistic regression:
248 | \end{itemize}
249 | \[
250 | C \Bigg[ \sum_{i=1}^{n} \Big( -y^{(i)} \log \big( \phi(z^{(i)}) \big) - \big( 1 - y^{(i)} \big) \log \big( 1 - \phi(z^{(i)}) \big) \Big) \Bigg] + \frac{1}{2} \lVert \mathbf{w} \rVert^2
251 | \]
252 | \end{frame}
253 | 
254 | \begin{frame}
255 | \frametitle{Regularization illustrated}
256 | \begin{itemize}
257 | \item Decreasing the value of $C$ means increasing the regularization strength
258 | \item Can be visualized by plotting the L2 regularization path for two weights
259 | \item Display weights across multiple $C$ values
260 | \item As you see, weights shrink to zero as $C$ is decreased
261 | \item \href{https://github.com/rasbt/python-machine-learning-book/blob/master/code/ch03/ch03.ipynb}{\beamergotobutton{iPython notebook on github}}
262 | \end{itemize}
263 | \end{frame}
264 | 
265 | \begin{frame}
266 | \frametitle{Support Vector Machines}
267 | \begin{itemize}
268 | \item In SVMs, the optimization objective is to maximize the \textbf{margin}
269 | \item The margin is defined as the distance between the separating hyperplane and the training samples that are closest to this hyperplane (\textbf{support vectors})
270 | \item Intuitively, the larger the margin, the lower the generalization error
271 | \item Models with a small margin are prone to overfitting
272 | \end{itemize}
273 | \end{frame}
274 | 
275 | \begin{frame}
276 | \frametitle{Maximum margin classification}
277 | \includegraphics[width=\textwidth]{Code/ch03/images/03_07.png}
278 | \end{frame}
279 | 
280 | \begin{frame}
281 | \frametitle{Mathematical intuition}
282 | Define \textit{positive} and \textit{negative} hyperplanes that are parallel to the decision boundary; they can be expressed as follows:
283 | \[
284 | w_0 + \mathbf{w}^T \mathbf{x}_{pos} = 1
285 | \]
286 | \[
287 | w_0 + \mathbf{w}^T \mathbf{x}_{neg} = -1
288 | \]
289 | Distance between these two planes (prove it!), i.e. the margin:
290 | \[
291 | \frac{2}{\lVert \mathbf{w} \rVert}
292 | \]
293 | Where the length of the vector $\mathbf{w}$ is defined as follows:
294 | \[
295 | \lVert \mathbf{w} \rVert = \sqrt{\sum_{j=1}^{m} w_{j}^{2}}
296 | \]
297 | \end{frame}
298 | 
299 | \begin{frame}
300 | \frametitle{Constrained optimization problem}
301 | Minimize:
302 | \[
303 | \frac{1}{2} \lVert \mathbf{w} \rVert^2
304 | \]
305 | Subject to the constraints that the samples are classified correctly:
306 | \[
307 | w_0 + \mathbf{w}^T \mathbf{x}^{(i)} \ge 1 \text{ if } y^{(i)} = 1
308 | \]
309 | \[
310 | w_0 + \mathbf{w}^T \mathbf{x}^{(i)} \le -1 \text{ if } y^{(i)} = -1
311 | \]
312 | These equations say that all negative and positive samples should fall respectively on one side of the negative and positive hyperplanes.
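% A small NumPy check of the compact margin constraint y_i (w_0 + w^T x_i) >= 1
% (illustrative; X, w, w0 are assumed to describe a candidate hyperplane and y holds labels in {-1, +1}):
%   import numpy as np
%   margins = y * (w0 + X @ w)
%   all_satisfied = np.all(margins >= 1.0)   # True only if every sample lies outside the margin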
This can be written more compactly:
313 | \[
314 | y^{(i)} \big( w_0 + \mathbf{w}^T \mathbf{x}^{(i)} \big) \ge 1 \quad \forall_i
315 | \]
316 | \end{frame}
317 | 
318 | \begin{frame}
319 | \frametitle{SVM Solution}
320 | Classifier
321 | \[
322 | f(\mathbf{x}) = sgn(\mathbf{w}^T \mathbf{x} + w_0)
323 | \]
324 | Weights
325 | \[
326 | \mathbf{w} = \sum_{i=1}^N \alpha_i y_i \mathbf{x}_i
327 | \]
328 | \end{frame}
329 | 
330 | \begin{frame}
331 | \frametitle{Slack variables / soft margin SVM}
332 | \includegraphics[width=\textwidth]{Images/svm-slack.png}
333 | \\
334 | \tiny{Source: http://www.saedsayad.com/support\_vector\_machine.htm}
335 | \end{frame}
336 | 
337 | 
338 | \begin{frame}
339 | \frametitle{Extending SVM to non-linearly separable cases}
340 | \begin{itemize}
341 | \item Need to relax the linear constraints
342 | \item To ensure convergence in the presence of misclassifications
343 | \item Introduce slack variables $\xi$
344 | \end{itemize}
345 | 
346 | \[
347 | \mathbf{w}^T \mathbf{x}^{(i)} \ge 1 - \xi^{(i)} \text{ if } y^{(i)} = 1
348 | \]
349 | \[
350 | \mathbf{w}^T \mathbf{x}^{(i)} \le -1 + \xi^{(i)} \text{ if } y^{(i)} = -1
351 | \]
352 | New objective to be minimized:
353 | \[
354 | \frac{1}{2} \lVert \mathbf{w} \rVert^2 + C \Big(\sum_i \xi^{(i)} \Big)
355 | \]
356 | \end{frame}
357 | 
358 | \begin{frame}
359 | \frametitle{Regularization in SVMs}
360 | \[
361 | \frac{1}{2} \lVert \mathbf{w} \rVert^2 + C \Big(\sum_i \xi^{(i)} \Big)
362 | \]
363 | \begin{itemize}
364 | \item Large values of $C$ - large error penalties
365 | \item Small values of $C$ - less strict about misclassifications
366 | \item Parameter $C$ controls the width of the margin
367 | \item I.e. $C$ is a way to do regularization in SVMs
368 | \end{itemize}
369 | \end{frame}
370 | 
371 | \begin{frame}
372 | \frametitle{Regularization in SVMs}
373 | \includegraphics[width=\textwidth]{Code/ch03/images/03_08.png}
374 | \end{frame}
375 | 
376 | \begin{frame}
377 | \frametitle{Exclusive OR (XOR) linear separability}
378 | \includegraphics[width=\textwidth]{Images/xor.png}
379 | \\
380 | \tiny{Source: http://www.saedsayad.com/artificial\_neural\_network\_bkp.htm}
381 | \end{frame}
382 | 
383 | \begin{frame}
384 | \frametitle{Generated XOR data}
385 | \begin{itemize}
386 | \item \href{https://github.com/rasbt/python-machine-learning-book/blob/master/code/ch03/ch03.ipynb}{\beamergotobutton{iPython notebook on github}}
387 | \item Kernel methods create non-linear combinations of the original features
388 | \item Project onto a higher dimensional space where they are separable
389 | \item Mapping function $\phi(\cdot)$
390 | \end{itemize}
391 | \[
392 | \phi(x_1, x_2) = (z_1, z_2, z_3) = (x_1, x_2, x_{1}^{2} + x_{2}^{2})
393 | \]
394 | \end{frame}
395 | 
396 | \begin{frame}
397 | \frametitle{Turning non-separable classes into separable ones}
398 | \includegraphics[width=\textwidth]{Code/ch03/images/03_11.png}
399 | \end{frame}
400 | 
401 | \begin{frame}
402 | \frametitle{Kernel trick}
403 | General blueprint:
404 | \begin{itemize}
405 | \item Transform training data into a higher dimensional space via a mapping function $\phi(\cdot)$
406 | \item Train a linear SVM to classify the data in the new feature space
407 | \item Use the same mapping function $\phi(\cdot)$ to transform new (unseen) data
408 | \item Classify unseen data using the linear SVM model
409 | \end{itemize}
410 | \end{frame}
411 | 
412 | \begin{frame}
413 | \frametitle{Problem with explicit mapping}
414 | \begin{itemize}
415 | \item The construction of the new features is
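% The explicit mapping from the previous slides, as a quick NumPy sketch
% (illustrative; X is an assumed [n x 2] array of (x_1, x_2) pairs):
%   import numpy as np
%   def phi(X):
%       return np.column_stack([X[:, 0], X[:, 1], X[:, 0]**2 + X[:, 1]**2])
%   Z = phi(X)   # 3-D features on which a linear SVM can be trained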
computationally expensive
416 | \item Fortunately, we have the \textit{kernel trick}
417 | \item The decision boundary relies on dot products in the input space
418 | \item Need to replace the dot product
419 | \[
420 | \mathbf{x}^{(i) \; T} \mathbf{x}^{(j)} \text{ by } \phi \big( \mathbf{x}^{(i)} \big)^T \phi \big( \mathbf{x}^{(j)} \big)
421 | \]
422 | \item No need to calculate this dot product explicitly
423 | \item Instead, we define a kernel function:
424 | \[
425 | k \big( \mathbf{x}^{(i)}, \mathbf{x}^{(j)} \big) = \phi \big( \mathbf{x}^{(i)} \big)^T \phi \big( \mathbf{x}^{(j)} \big)
426 | \]
427 | \end{itemize}
428 | \end{frame}
429 | 
430 | \begin{frame}
431 | \frametitle{Kernel Trick: Example}
432 | $\mathbf{x} = (x_1, x_2), $
433 | $\mathbf{z} = (z_1, z_2), $
434 | $K(\mathbf{x}, \mathbf{z}) = \langle \mathbf{x} \cdot \mathbf{z} \rangle^2$
435 | \begin{equation*} \begin{split}
436 | K(\mathbf{x}, \mathbf{z}) & = (x_1z_1 + x_2z_2)^2 = x_1^2z_1^2 + 2x_1z_1x_2z_2 + x_2^2z_2^2 \\
437 | & = \langle (x_1^2, \sqrt{2}x_1x_2, x_2^2) \cdot (z_1^2, \sqrt{2}z_1z_2, z_2^2) \rangle \\
438 | & = \langle \phi(\mathbf{x}) \cdot \phi(\mathbf{z}) \rangle
439 | \end{split} \end{equation*}
440 | % source: http://www.cogsys.wiai.uni-bamberg.de/teaching/ss06/hs_svm/slides/SVM_and_Kernels.pdf
441 | \end{frame}
442 | 
443 | \begin{frame}
444 | \frametitle{RBF Kernel}
445 | One of the most widely used kernels is the \textit{Radial Basis Function kernel} (RBF kernel) or Gaussian kernel:
446 | \[
447 | k \big( \mathbf{x}^{(i)}, \mathbf{x}^{(j)} \big) = \exp \Bigg( - \frac{ \lVert \mathbf{x}^{(i)} - \mathbf{x}^{(j)} \rVert^2 }{2 \sigma^2} \Bigg)
448 | \]
449 | This is often simplified to:
450 | \[
451 | k \big( \mathbf{x}^{(i)}, \mathbf{x}^{(j)} \big) = \exp \bigg( -\gamma\ \lVert \mathbf{x}^{(i)} - \mathbf{x}^{(j)} \rVert^2 \bigg)
452 | \]
453 | Here, $\gamma = \frac{1}{2 \sigma^2}$ is a free parameter that is to be optimized.
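% The simplified RBF kernel above, written out in NumPy, plus the corresponding
% scikit-learn estimator (the gamma value is only an example):
%   import numpy as np
%   def rbf_kernel(x_i, x_j, gamma=0.1):
%       return np.exp(-gamma * np.sum((x_i - x_j) ** 2))
%   from sklearn.svm import SVC
%   svm = SVC(kernel='rbf', gamma=0.1, C=1.0)   # then svm.fit(X_train, y_train)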
454 | \end{frame} 455 | 456 | \begin{frame} 457 | \frametitle{} 458 | \begin{itemize} 459 | \item The term \textit{kernel} can be interpreted as a \textit{similarity function} between a pair of samples 460 | \item The minus sign inverts the distance measure into a similarity score (from 0/dissimilar to 1/very similar) 461 | \item Use RBF kernel to separate XOR data 462 | \item Vary the $\gamma$ parameter 463 | \item \href{https://github.com/rasbt/python-machine-learning-book/blob/master/code/ch03/ch03.ipynb}{\beamergotobutton{iPython notebook on github}} 464 | \end{itemize} 465 | \end{frame} 466 | 467 | \begin{frame} 468 | \frametitle{K-nearest neighbors} 469 | \begin{itemize} 470 | \item KNN is an example of a non-parametric model 471 | \item Parametric models learn parameters from training data 472 | \item Once training done, the training set not required 473 | \item KNN is an instance-based learner 474 | \end{itemize} 475 | \end{frame} 476 | 477 | \begin{frame} 478 | \frametitle{Basic KNN algorithm} 479 | \begin{itemize} 480 | \item Choose $k$ and a distance metric 481 | \item Find $k$ nearest neighbors of the sample to be classified 482 | \item Assign the class label by majority vote 483 | \end{itemize} 484 | \end{frame} 485 | 486 | \begin{frame} 487 | \includegraphics[height=\textheight, width=\textwidth]{Code/ch03/images/03_20.png} 488 | \end{frame} 489 | 490 | \begin{frame} 491 | \frametitle{KNN advantages} 492 | \begin{itemize} 493 | \item Classifier immediately adapts as we receive new training examples 494 | \item But computational complexity grows linearly with the number of samples 495 | \item Need efficient data structures such as KD-trees 496 | \end{itemize} 497 | Distance metrics: 498 | \[ 499 | d \big(\mathbf{x}^{(i)}, \mathbf{x}^{(j)}\big) = \sqrt[p]{\sum_k \big| x_{k}^{(i)} - x_{k}^{(j)} \big|^p } 500 | \] 501 | Euclidean distance if we set the parameter $p=2$ \\ 502 | Manhattan distance if we set the parameter $p=1$ 503 | \end{frame} 504 | 505 | \end{document} 506 | --------------------------------------------------------------------------------