├── .gitignore
├── MLT
    ├── images
    │   ├── SVM.png
    │   ├── hinge_loss.png
    │   ├── RandomForest.png
    │   ├── lasso_contour.png
    │   ├── ridge_contour.png
    │   ├── softmax-model.png
    │   └── Softmax-regression.png
    ├── data
    │   └── weather_play.csv
    └── Week_9.ipynb
├── MLP
    ├── images
    │   ├── cluster.png
    │   ├── AverageLinkage.png
    │   ├── SingleLinkage.png
    │   ├── CompleteLinkage.png
    │   └── week_4_sns_pairplot.png
    ├── Week_2.ipynb
    └── Week_11.ipynb
├── DL
    ├── Week_1
    │   ├── images
    │   │   ├── Perceptron.png
    │   │   ├── McCullochPitts.png
    │   │   ├── ArtificialNeuron.png
    │   │   ├── BiologicalNeuron.png
    │   │   └── DifferentVariationsofMcCullochPitts.png
    │   └── Week_1.md
    └── Week_3
    │   ├── images
    │       ├── ErrorEquation.png
    │       ├── FeedForwardNN.png
    │       ├── example_2_3_3.png
    │       ├── example_3_3.png
    │       └── table_week_3_3.png
    │   ├── Lecture_3_1.md
    │   ├── Lecture_3_2.md
    │   └── Lecture_3_3.md
├── .gitattributes
├── README.md
└── MLP-using-GPU
    └── 2-DataPreprocessing.ipynb


/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | # pixi environments
3 | .pixi
4 | *.egg-info
5 | 


--------------------------------------------------------------------------------
/MLT/images/SVM.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/MLT/images/SVM.png


--------------------------------------------------------------------------------
/MLP/images/cluster.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/MLP/images/cluster.png


--------------------------------------------------------------------------------
/MLT/images/hinge_loss.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/MLT/images/hinge_loss.png


--------------------------------------------------------------------------------
/MLP/images/AverageLinkage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/MLP/images/AverageLinkage.png


--------------------------------------------------------------------------------
/MLP/images/SingleLinkage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/MLP/images/SingleLinkage.png


--------------------------------------------------------------------------------
/MLT/images/RandomForest.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/MLT/images/RandomForest.png


--------------------------------------------------------------------------------
/MLT/images/lasso_contour.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/MLT/images/lasso_contour.png


--------------------------------------------------------------------------------
/MLT/images/ridge_contour.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/MLT/images/ridge_contour.png


--------------------------------------------------------------------------------
/MLT/images/softmax-model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/MLT/images/softmax-model.png


--------------------------------------------------------------------------------
/DL/Week_1/images/Perceptron.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/DL/Week_1/images/Perceptron.png


--------------------------------------------------------------------------------
/MLP/images/CompleteLinkage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/MLP/images/CompleteLinkage.png


--------------------------------------------------------------------------------
/DL/Week_3/images/ErrorEquation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/DL/Week_3/images/ErrorEquation.png


--------------------------------------------------------------------------------
/DL/Week_3/images/FeedForwardNN.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/DL/Week_3/images/FeedForwardNN.png


--------------------------------------------------------------------------------
/DL/Week_3/images/example_2_3_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/DL/Week_3/images/example_2_3_3.png


--------------------------------------------------------------------------------
/DL/Week_3/images/example_3_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/DL/Week_3/images/example_3_3.png


--------------------------------------------------------------------------------
/MLP/images/week_4_sns_pairplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/MLP/images/week_4_sns_pairplot.png


--------------------------------------------------------------------------------
/MLT/images/Softmax-regression.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/MLT/images/Softmax-regression.png


--------------------------------------------------------------------------------
/DL/Week_1/images/McCullochPitts.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/DL/Week_1/images/McCullochPitts.png


--------------------------------------------------------------------------------
/DL/Week_3/images/table_week_3_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/DL/Week_3/images/table_week_3_3.png


--------------------------------------------------------------------------------
/DL/Week_1/images/ArtificialNeuron.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/DL/Week_1/images/ArtificialNeuron.png


--------------------------------------------------------------------------------
/DL/Week_1/images/BiologicalNeuron.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/DL/Week_1/images/BiologicalNeuron.png


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # SCM syntax highlighting & preventing 3-way merges
2 | pixi.lock merge=binary linguist-language=YAML linguist-generated=true
3 | 


--------------------------------------------------------------------------------
/DL/Week_1/images/DifferentVariationsofMcCullochPitts.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/DL/Week_1/images/DifferentVariationsofMcCullochPitts.png


--------------------------------------------------------------------------------
/MLT/data/weather_play.csv:
--------------------------------------------------------------------------------
 1 | Outlook,Temperature,Humidity,Wind,Play
 2 | Sunny,Hot,High,Weak,No
 3 | Sunny,Hot,High,Strong,No
 4 | Overcast,Hot,High,Weak,Yes
 5 | Rain,Mild,High,Weak,Yes
 6 | Rain,Cool,Normal,Weak,Yes
 7 | Rain,Cool,Normal,Strong,No
 8 | Overcast,Cool,Normal,Strong,Yes
 9 | Sunny,Mild,High,Weak,No
10 | Sunny,Cool,Normal,Weak,Yes
11 | Rain,Mild,Normal,Weak,Yes
12 | Sunny,Mild,Normal,Strong,Yes
13 | Overcast,Mild,High,Strong,Yes
14 | Overcast,Hot,Normal,Weak,Yes
15 | Rain,Mild,High,Strong,No
16 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # IITM-Notes
 2 | 
 3 | Lecture notes and colab/jupyter notebooks
 4 | 
 5 | Colab/jupyter notebooks were written manually, so some of the code might be modified (don't worry, most of it will stay true to the original code shown in the colab lectures).
 6 | 
 7 | If you find any errors or mistakes in the notebooks, please notify me through issues.
 8 | 
 9 | > NOTE: If you want to run MLT / MLP notebooks on GPU using cuML and CuPy, then I would recommend installing the RAPIDS library through conda; otherwise you can install the current dependencies using pixi / conda / pip.
10 | 


--------------------------------------------------------------------------------
/DL/Week_3/Lecture_3_1.md:
--------------------------------------------------------------------------------
 1 | # [Lecture 3.1: Feed Forward Neural Network](https://www.youtube.com/watch?v=HHv6Ndo9VBU)
 2 | 
 3 | ![image](images/FeedForwardNN.png)
 4 | 
 5 | - The input to the network is an $n$-dimensional vector
 6 | - The network contains **$L - 1$** hidden layers (2 in this case) having **$n$** neurons each.
 7 | - Finally, there is one output layer containing **$k$** neurons (say corresponding to **$k$** classes).
 8 | - Each neuron in the hidden layer and the output layer can be split into two parts: pre-activation ($a_i$) and activation($h_i$). Both $a_i$ and $h_i$ are vectors.
 9 | - The input layer is called 0-th layer and the output layer is called the (**$L$**)-th layer.
10 | - $\large W_i \in \mathbb{R}^{n \times n}$ and $\large b_i \in \mathbb{R}^n$ are the weight and the bias between the layers $i-1$ and $i$ ( $\large 0 \lt i \lt L$ ).
11 | - $\large W_L \in \mathbb{R}^{k \times n}$ and $\large b_L \in \mathbb{R}^k$ are the weight and the bias between the last hidden layer and output layer (L = 3 in this case)
12 | - The pre-activation at the layer $i$ is given by
13 | $$
14 | \large
15 | a_i(x) = b_i + W_i h_{i-1}(x)
16 | $$
17 | - The activation at the layer $i$ is given by
18 | $$
19 | \large
20 | h_i(x) = g(a_i(x))
21 | $$
22 | 
23 | where $g$ is called the activation function (for example logistic, tanh, linear etc)
24 | 
25 | - The activation at the output layer is given by
26 | $$
27 | \large
28 | f(x) = h_L(x) = O(a_L(x))
29 | $$
30 | 
31 | where $O$ is the output activation function (for example softmax, linear etc).
32 | 
33 | > To simplify notation we will refer to $a_i(x)$ as $a_i$ and $h_i(x)$ as $h_i$.
34 | 
35 | So in the above diagram,
36 | 
37 | - **Data**: $\large \{x_i, y_i\}_{i=1}^N$
38 | - **Model**:
39 | 
40 | $$
41 | \large
42 | \hat{y}_i = \hat{f}(x_i) = O(W_3 g(W_2 g(W_1 x_i + b_1) + b_2) + b_3)
43 | $$
44 | 
45 | - **Parameters**:
46 | 
47 | $$
48 | \large
49 | \theta = W_1,.... ,W_L, b_1, b_2, ... , b_L \quad (L=3) 
50 | $$
51 | 
52 | - **Algorithm**: Gradient descent with back-propagation
53 | - **Objective/Loss/Error Function**: Say,
54 | 
55 | $$
56 | \min \frac{1}{N} \sum_{i=1}^N \sum_{j=1}^k (\hat{y}_{ij} - y_{ij})^2 \\
57 | \text{In general, } \min \enspace \mathscr{L}(\theta)
58 | $$
59 | 
60 | where $\mathscr{L}(\theta)$ is some function of parameters.
61 | 


--------------------------------------------------------------------------------
/DL/Week_3/Lecture_3_2.md:
--------------------------------------------------------------------------------
 1 | # [Lecture 3.2 Learning parameters](https://www.youtube.com/watch?v=0Me1ywSlJE8)
 2 | 
 3 | ![image](images/FeedForwardNN.png)
 4 | 
 5 | ___
 6 | 
 7 | ## Algorithm: `gradient_descent()`
 8 | 
 9 | ___
10 | $$
11 | \begin{align}
12 | &t \leftarrow 0; \\
13 | &max\_iterations \leftarrow 1000; \\
14 | &Initialize \enspace w_0, b_0; \\
15 | & \mathbf{while} \enspace t \text{++}  \lt max\_iterations \enspace \mathbf{do} \\
16 |     & \quad w_{t+1} \leftarrow w_t -\eta \nabla w_t \\
17 |     & \quad b_{t+1} \leftarrow b_t -\eta \nabla b_t \\
18 | & \mathbf{end}
19 | \end{align}
20 | $$
21 | 
22 | We can concisely write it as:
23 | 
24 | ___
25 | 
26 | ## Algorithm: `gradient_descent()`
27 | 
28 | ___
29 | $$
30 | \begin{align}
31 | & t \leftarrow 0; \\
32 | & max\_iterations \leftarrow 1000; \\
33 | & Initialize \enspace \theta_0 = [w_0, b_0]; \\
34 | & \mathbf{while} \enspace t \text{++}  \lt max\_iterations \enspace \mathbf{do} \\
35 |     & \quad\theta_{t+1} \leftarrow \theta_t -\eta \nabla \theta_t \\
36 | & \mathbf{end}
37 | \end{align}
38 | $$
39 | where $\Large \nabla \theta_t = [\frac{\partial \mathscr{L}(\theta)}{\partial w_t}, \frac{\partial \mathscr{L}(\theta)}{\partial b_t}]^T$
40 | 
41 | - Now, in this feedforward neural network, instead of $\theta = [w, b]$ we have $\theta = [W_1, W_2,...,W_L, b_1, b_2,...,b_L]$
42 | - We can still use the same algorithm for learning the parameters of our model.
43 | 
44 | ___
45 | 
46 | ## Algorithm: `gradient_descent()`
47 | 
48 | ___
49 | $$
50 | \begin{align}
51 | & t \leftarrow 0; \\
52 | & max\_iterations \leftarrow 1000; \\
53 | & Initialize \enspace \color{red}{\theta_0 = [W_1^0,...,W_L^0, b_1^0,...,b_L^0];} \\
54 | & \mathbf{while} \enspace t \text{++}  \lt max\_iterations \enspace \mathbf{do} \\
55 |     & \quad\theta_{t+1} \leftarrow \theta_t -\eta \nabla \theta_t \\
56 | & \mathbf{end}
57 | \end{align}
58 | $$
59 | where $\color{red}{\Large \nabla \theta_t = [\frac{\partial \mathscr{L}(\theta)}{\partial W_{1,t}},...,\frac{\partial \mathscr{L}(\theta)}{\partial W_{L,t}},\frac{\partial \mathscr{L}(\theta)}{\partial b_{1,t}},...,\frac{\partial \mathscr{L}(\theta)}{\partial b_{L,t}}]^T}$
60 | 
61 | - Thus $\nabla \theta$ is composed of:
62 |   - $\nabla W_1, \nabla W_2,..., \nabla W_{L-1} \in \mathbb{R}^{n \times n}, \nabla W_L \in \mathbb{R}^{k \times n}$
63 |   - $\nabla b_1, \nabla b_2,..., \nabla b_{L-1} \in \mathbb{R}^n, \nabla b_L \in \mathbb{R}^k$
64 | 


--------------------------------------------------------------------------------
/DL/Week_3/Lecture_3_3.md:
--------------------------------------------------------------------------------
 1 | # [Lecture 3.3: Output functions and loss functions](https://www.youtube.com/watch?v=1hefEWZHvJg)
 2 | 
 3 | - The choice of loss function depends on the problem at hand
 4 | - Consider movie example again but this time we are interested in predicting ratings
 5 | 
 6 | ![image](images/example_3_3.png)
 7 | 
 8 | - Here $y_i \in \mathbb{R}^3$
 9 | - The loss function should capture how much $\hat{y}_i$ deviates from $y_i$
10 | - If $y_i \in \mathbb{R}^3$ then the squared error loss can capture this deviation
11 | 
12 | $$
13 | \mathscr{L}(\theta) = \frac{1}{N} \sum_{i=1}^N \sum_{j=1}^k (\hat{y}_{ij} - y_{ij})^2
14 | $$
15 | 
16 | - A related question would be: what should the output function 'O' be if $y_j \in \mathbb{R}$?
17 | - More specifically, can it be the logistic function?
18 | - No, because it restricts $\hat{y}_j$ to values between 0 and 1. But we want $y_j \in \mathbb{R}$
19 | - So, in such cases it makes sense to have 'O' as linear function
20 | 
21 | $$
22 | \begin{align}
23 |     \hat{f}(x) &= h_L = O(a_L) \\
24 |     &= W_O a_L + b_O
25 | \end{align}
26 | $$
27 | 
28 | - $\hat{y}_i = \hat{f}(x_i)$ is no longer bounded between 0 and 1
29 | 
30 | ![image](images/example_2_3_3.png)
31 | 
32 | - Now let us consider another problem for which a different loss function would be appropriate.
33 | - Suppose we want to classify an image into 1 of the $k$ classes
34 | - Here again we could use the squared error loss to capture the deviation
35 | - Notice that $y$ is a probability distribution
36 | - Therefore we should also ensure that $\hat{y}$ is a probability distribution
37 | - We use **softmax** function to get the expected output in a probability distribution
38 |   - $a_L = W_L h_{L-1} + b_L$
39 |   - $\large \hat{y}_j = O(a_L)_j = \frac{e^{a_{L,j}}}{\sum_{i=1}^k e^{a_{L,i}}}$
40 | - $O(a_L)_j$ is the $j^{th}$ element of $\hat{y}$ and $a_{L,j}$ is the $j^{th}$ element of the vector $a_L$.
41 | - **Cross Entropy**:
42 | $$\mathscr{L}(\theta) = -\sum_{c=1}^k y_c \log \hat{y}_c$$
43 | 
44 | Notice that
45 | 
46 | - $y_c = 1$ if $c = l$ ( the true class label) and 0 otherwise.
47 | 
48 | $$
49 | \therefore \enspace \mathscr{L}(\theta) = - \log \hat{y}_l
50 | $$
51 | 
52 | - So for classification problem (where we have to choose 1 of K classes), we use the following objective function
53 |   - $\text{minimize} \enspace \mathscr{L}(\theta) = - \log \hat{y}_l$
54 |   - $\hat{y}_l$ is a function of $\theta$ and is the predicted probability that $x$ belongs to the $l^{th}$ class.
55 |   - $\log \hat{y}_l$ is called the **log-likelihood** of the data.
56 | 
57 | ![image](images/table_week_3_3.png)
58 | 


--------------------------------------------------------------------------------
/DL/Week_1/Week_1.md:
--------------------------------------------------------------------------------
 1 | # Week 1
 2 | 
 3 | ## [Lecture 1.6: Motivation from Biological Neuron](https://www.youtube.com/watch?v=KjMvUwq7PdQ)
 4 | 
 5 | - The most fundamental unit of a deep neural network is called an **artificial** *neuron*.
 6 | - The inspiration comes from biology (more specifically from the brain)
 7 | - **biological neurons = neuron cells = neural processing units**
 8 | 
 9 | ![ArtificialNeuron](images/ArtificialNeuron.png)
10 | 
11 | ### Biological Neuron
12 | 
13 | ![BiologicalNeuron](images/BiologicalNeuron.png)
14 | 
15 | - **dendrite**: receives signals from other neurons
16 | - **synapse**: point of connection to other neurons
17 | - **soma**: process the information
18 | - **axon**: transmits the output of the neuron
19 | 
20 | - Our sensory organs interact with the outside world and they relay information to the neurons. The neurons (may) get activated and produce a response.
21 | - Of course, in reality, it's not just a single neuron that does all this; there is a massively parallel interconnected network of neurons.
22 | - The sensory organs relay information to the lowest layer of neurons.
23 | - An average human brain has around $10^{11}$ (100 billion) neurons
24 | - This massively parallel network also ensures that there is division of work
25 | - Each neuron may perform a certain role or respond to a certain stimulus.
26 | 
27 | ## [Lecture 1.7: McCulloch Pitts Neuron and Thresholding Logic](https://www.youtube.com/watch?v=-bxOadOFNYc)
28 | 
29 | ![image](images/McCullochPitts.png)
30 | 
31 | - McCulloch and Pitts proposed a highly simplified computational model of the neuron
32 | - $g$ aggregates the inputs and the function $f$ takes a decision based on this aggregation
33 | - The inputs can be excitatory or inhibitory (if an inhibitory input is "ON", then no matter what the other inputs are, the output will always be zero).
34 |   - $y = 0$ if any $x_i$ is inhibitory, else
35 | $$
36 | \large
37 | g(x_1, x_2,..., x_n) = g(x) = \sum_{i=1}^n x_i \\
38 | y = f(g(x)) = \begin{cases} 1 & \text{if } g(x) \ge \theta \\ 0 & \text{if } g(x) \lt \theta \end{cases}
39 | $$
40 | - $\theta$ is called thresholding parameter
41 | 
42 | ![image](images/DifferentVariationsofMcCullochPitts.png)
43 | 
44 | Here in NOT function $x_1$ is an inhibitory input.
45 | 
46 | ## [Lecture 1.8: Perceptrons](https://www.youtube.com/watch?v=Ydd9TMyoG6k)
47 | 
48 | ![image](images/Perceptron.png)
49 | 
50 | - Frank Rosenblatt, an American psychologist, proposed the classic perceptron model in 1958.
51 | - A more general computational model than McCulloch-Pitts neurons
52 | - **Main differences**: Introduction of numerical weights for inputs and a mechanism for learning these weights
53 | - Inputs are no longer limited to boolean values
54 | - Refined and carefully analyzed by Minsky and Papert (1969) - their model is referred to as the **perceptron** model here.
55 | 
56 | $$
57 | \begin{align}
58 |   y &= 1 \quad if  \quad \sum_{i=1}^n w_i*x_i \ge \theta \\
59 |   &= 0 \quad if  \quad \sum_{i=1}^n w_i*x_i \lt \theta
60 | \end{align}
61 | $$
62 | Simplifying it by taking $w_0 = -\theta$ and $x_0 = 1$, we get
63 | $$
64 | \begin{align}
65 |   y &= 1 \quad if  \quad \sum_{i=0}^n w_i*x_i \ge 0 \\
66 |   &= 0 \quad if  \quad \sum_{i=0}^n w_i*x_i \lt 0
67 | \end{align}
68 | $$
69 | 
70 | - From the equations it should be clear that even a perceptron separates the input space into two halves
71 | - All the inputs which produce a 1 lie on one side and all inputs which produce a 0 lie on the other side
72 | - In other words, a single perceptron can only be used to implement linearly separable functions
73 | - Then what is the difference between McCulloch-Pitts model and Perceptron?
74 |   - The weights (including the threshold) can be learned and the inputs can be real valued.
75 | 


--------------------------------------------------------------------------------
/MLT/Week_9.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## Lecture 9.4: Implementing DT from scratch"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "### Decision Trees\n",
 15 |     "\n",
 16 |     "Decision trees are a popular **supervised machine learning algorithm** that can be used for both **classification** and **regression** tasks.\n",
 17 |     "\n",
 18 |     "The tree itself is a model in decision trees and we would like to estimate an **optimal tree structure** from the given training data.\n",
 19 |     "\n",
 20 |     "- The internal nodes contain conditions on features. Depending on the outcome of the comparison, we take an appropriate path in the tree. The process is repeated until we reach a leaf node.\n",
 21 |     "- In the case of classification, leaf nodes contain a label; in the case of regression, the prediction is obtained by taking the sample mean of the labels of the training examples present in that leaf node.\n",
 22 |     "\n",
 23 |     "In this colab, we will implement decision tree for classification with ID3 algorithm"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "markdown",
 28 |    "metadata": {},
 29 |    "source": [
 30 |     "#### Importing Libraries"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": 1,
 36 |    "metadata": {},
 37 |    "outputs": [],
 38 |    "source": [
 39 |     "import numpy as np\n",
 40 |     "import pandas as pd"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": 2,
 46 |    "metadata": {},
 47 |    "outputs": [
 48 |     {
 49 |      "data": {
 50 |       "text/plain": [
 51 |        "2.220446049250313e-16"
 52 |       ]
 53 |      },
 54 |      "execution_count": 2,
 55 |      "metadata": {},
 56 |      "output_type": "execute_result"
 57 |     }
 58 |    ],
 59 |    "source": [
 60 |     "eps = np.finfo(float).eps\n",
 61 |     "eps"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "markdown",
 66 |    "metadata": {},
 67 |    "source": [
 68 |     "Here `eps` is the machine epsilon: the gap between `1.0` and the next representable float. At times we get `log(0)` or `0` in the denominator; to avoid that we are going to add this tiny value."
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "markdown",
 73 |    "metadata": {},
 74 |    "source": [
 75 |     "#### Classification Demo\n",
 76 |     "In this case we'll use a small synthetic dataset for classification."
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": 3,
 82 |    "metadata": {},
 83 |    "outputs": [
 84 |     {
 85 |      "data": {
 86 |       "text/html": [
 87 |        "<div>\n",
 88 |        "<style scoped>\n",
 89 |        "    .dataframe tbody tr th:only-of-type {\n",
 90 |        "        vertical-align: middle;\n",
 91 |        "    }\n",
 92 |        "\n",
 93 |        "    .dataframe tbody tr th {\n",
 94 |        "        vertical-align: top;\n",
 95 |        "    }\n",
 96 |        "\n",
 97 |        "    .dataframe thead th {\n",
 98 |        "        text-align: right;\n",
 99 |        "    }\n",
100 |        "</style>\n",
101 |        "<table border=\"1\" class=\"dataframe\">\n",
102 |        "  <thead>\n",
103 |        "    <tr style=\"text-align: right;\">\n",
104 |        "      <th></th>\n",
105 |        "      <th>Outlook</th>\n",
106 |        "      <th>Temperature</th>\n",
107 |        "      <th>Humidity</th>\n",
108 |        "      <th>Wind</th>\n",
109 |        "      <th>Play</th>\n",
110 |        "    </tr>\n",
111 |        "  </thead>\n",
112 |        "  <tbody>\n",
113 |        "    <tr>\n",
114 |        "      <th>0</th>\n",
115 |        "      <td>Sunny</td>\n",
116 |        "      <td>Hot</td>\n",
117 |        "      <td>High</td>\n",
118 |        "      <td>Weak</td>\n",
119 |        "      <td>No</td>\n",
120 |        "    </tr>\n",
121 |        "    <tr>\n",
122 |        "      <th>1</th>\n",
123 |        "      <td>Sunny</td>\n",
124 |        "      <td>Hot</td>\n",
125 |        "      <td>High</td>\n",
126 |        "      <td>Strong</td>\n",
127 |        "      <td>No</td>\n",
128 |        "    </tr>\n",
129 |        "    <tr>\n",
130 |        "      <th>2</th>\n",
131 |        "      <td>Overcast</td>\n",
132 |        "      <td>Hot</td>\n",
133 |        "      <td>High</td>\n",
134 |        "      <td>Weak</td>\n",
135 |        "      <td>Yes</td>\n",
136 |        "    </tr>\n",
137 |        "    <tr>\n",
138 |        "      <th>3</th>\n",
139 |        "      <td>Rain</td>\n",
140 |        "      <td>Mild</td>\n",
141 |        "      <td>High</td>\n",
142 |        "      <td>Weak</td>\n",
143 |        "      <td>Yes</td>\n",
144 |        "    </tr>\n",
145 |        "    <tr>\n",
146 |        "      <th>4</th>\n",
147 |        "      <td>Rain</td>\n",
148 |        "      <td>Cool</td>\n",
149 |        "      <td>Normal</td>\n",
150 |        "      <td>Weak</td>\n",
151 |        "      <td>Yes</td>\n",
152 |        "    </tr>\n",
153 |        "  </tbody>\n",
154 |        "</table>\n",
155 |        "</div>"
156 |       ],
157 |       "text/plain": [
158 |        "    Outlook Temperature Humidity    Wind Play\n",
159 |        "0     Sunny         Hot     High    Weak   No\n",
160 |        "1     Sunny         Hot     High  Strong   No\n",
161 |        "2  Overcast         Hot     High    Weak  Yes\n",
162 |        "3      Rain        Mild     High    Weak  Yes\n",
163 |        "4      Rain        Cool   Normal    Weak  Yes"
164 |       ]
165 |      },
166 |      "execution_count": 3,
167 |      "metadata": {},
168 |      "output_type": "execute_result"
169 |     }
170 |    ],
171 |    "source": [
172 |     "df = pd.read_csv('data/weather_play.csv') # This is the data shown in the slides\n",
173 |     "df.head()"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "code",
178 |    "execution_count": 4,
179 |    "metadata": {},
180 |    "outputs": [
181 |     {
182 |      "data": {
183 |       "text/plain": [
184 |        "(14, 5)"
185 |       ]
186 |      },
187 |      "execution_count": 4,
188 |      "metadata": {},
189 |      "output_type": "execute_result"
190 |     }
191 |    ],
192 |    "source": [
193 |     "df.shape"
194 |    ]
195 |   },
196 |   {
197 |    "cell_type": "code",
198 |    "execution_count": 5,
199 |    "metadata": {},
200 |    "outputs": [
201 |     {
202 |      "data": {
203 |       "text/plain": [
204 |        "array([['Sunny', 'Hot', 'High', 'Weak', 'No'],\n",
205 |        "       ['Sunny', 'Hot', 'High', 'Strong', 'No'],\n",
206 |        "       ['Overcast', 'Hot', 'High', 'Weak', 'Yes'],\n",
207 |        "       ['Rain', 'Mild', 'High', 'Weak', 'Yes'],\n",
208 |        "       ['Rain', 'Cool', 'Normal', 'Weak', 'Yes'],\n",
209 |        "       ['Rain', 'Cool', 'Normal', 'Strong', 'No'],\n",
210 |        "       ['Overcast', 'Cool', 'Normal', 'Strong', 'Yes'],\n",
211 |        "       ['Sunny', 'Mild', 'High', 'Weak', 'No'],\n",
212 |        "       ['Sunny', 'Cool', 'Normal', 'Weak', 'Yes'],\n",
213 |        "       ['Rain', 'Mild', 'Normal', 'Weak', 'Yes'],\n",
214 |        "       ['Sunny', 'Mild', 'Normal', 'Strong', 'Yes'],\n",
215 |        "       ['Overcast', 'Mild', 'High', 'Strong', 'Yes'],\n",
216 |        "       ['Overcast', 'Hot', 'Normal', 'Weak', 'Yes'],\n",
217 |        "       ['Rain', 'Mild', 'High', 'Strong', 'No']], dtype=object)"
218 |       ]
219 |      },
220 |      "execution_count": 5,
221 |      "metadata": {},
222 |      "output_type": "execute_result"
223 |     }
224 |    ],
225 |    "source": [
226 |     "df.values"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "code",
231 |    "execution_count": 6,
232 |    "metadata": {},
233 |    "outputs": [
234 |     {
235 |      "data": {
236 |       "text/plain": [
237 |        "Index(['Outlook', 'Temperature', 'Humidity', 'Wind', 'Play'], dtype='object')"
238 |       ]
239 |      },
240 |      "execution_count": 6,
241 |      "metadata": {},
242 |      "output_type": "execute_result"
243 |     }
244 |    ],
245 |    "source": [
246 |     "df.keys()"
247 |    ]
248 |   },
249 |   {
250 |    "cell_type": "code",
251 |    "execution_count": 7,
252 |    "metadata": {},
253 |    "outputs": [
254 |     {
255 |      "data": {
256 |       "text/plain": [
257 |        "'Play'"
258 |       ]
259 |      },
260 |      "execution_count": 7,
261 |      "metadata": {},
262 |      "output_type": "execute_result"
263 |     }
264 |    ],
265 |    "source": [
266 |     "target = df.keys()[-1] # Name of the target column\n",
267 |     "target"
268 |    ]
269 |   },
270 |   {
271 |    "cell_type": "code",
272 |    "execution_count": 8,
273 |    "metadata": {},
274 |    "outputs": [
275 |     {
276 |      "data": {
277 |       "text/plain": [
278 |        "Index(['Outlook', 'Temperature', 'Humidity', 'Wind'], dtype='object')"
279 |       ]
280 |      },
281 |      "execution_count": 8,
282 |      "metadata": {},
283 |      "output_type": "execute_result"
284 |     }
285 |    ],
286 |    "source": [
287 |     "df.keys()[:-1]"
288 |    ]
289 |   },
290 |   {
291 |    "cell_type": "markdown",
292 |    "metadata": {},
293 |    "source": [
294 |     "Let's check the total number of labels"
295 |    ]
296 |   },
297 |   {
298 |    "cell_type": "code",
299 |    "execution_count": 9,
300 |    "metadata": {},
301 |    "outputs": [
302 |     {
303 |      "data": {
304 |       "text/plain": [
305 |        "array(['No', 'Yes'], dtype=object)"
306 |       ]
307 |      },
308 |      "execution_count": 9,
309 |      "metadata": {},
310 |      "output_type": "execute_result"
311 |     }
312 |    ],
313 |    "source": [
314 |     "df[target].unique()"
315 |    ]
316 |   },
317 |   {
318 |    "cell_type": "markdown",
319 |    "metadata": {},
320 |    "source": [
321 |     "There are two labels `Yes` and `No`"
322 |    ]
323 |   },
324 |   {
325 |    "cell_type": "code",
326 |    "execution_count": 10,
327 |    "metadata": {},
328 |    "outputs": [
329 |     {
330 |      "name": "stdout",
331 |      "output_type": "stream",
332 |      "text": [
333 |       "No\n",
334 |       "Yes\n"
335 |      ]
336 |     }
337 |    ],
338 |    "source": [
339 |     "print(df[target].unique()[0])\n",
340 |     "print(df[target].unique()[1])"
341 |    ]
342 |   },
343 |   {
344 |    "cell_type": "code",
345 |    "execution_count": 12,
346 |    "metadata": {},
347 |    "outputs": [
348 |     {
349 |      "name": "stdout",
350 |      "output_type": "stream",
351 |      "text": [
352 |       "5\n",
353 |       "9\n"
354 |      ]
355 |     }
356 |    ],
357 |    "source": [
358 |     "print(df[target].value_counts()[df[target].unique()[0]])\n",
359 |     "print(df[target].value_counts()[df[target].unique()[1]])"
360 |    ]
361 |   },
362 |   {
363 |    "cell_type": "markdown",
364 |    "metadata": {},
365 |    "source": [
366 |     "9 out of 14 examples have `Yes` and 5 out of 14 examples have label `No`"
367 |    ]
368 |   },
369 |   {
370 |    "cell_type": "markdown",
371 |    "metadata": {},
372 |    "source": [
373 |     "#### Calculating entropy of the whole dataset"
374 |    ]
375 |   },
376 |   {
377 |    "cell_type": "code",
378 |    "execution_count": 13,
379 |    "metadata": {},
380 |    "outputs": [
381 |     {
382 |      "data": {
383 |       "text/plain": [
384 |        "0.9402859586706311"
385 |       ]
386 |      },
387 |      "execution_count": 13,
388 |      "metadata": {},
389 |      "output_type": "execute_result"
390 |     }
391 |    ],
392 |    "source": [
393 |     "def find_entropy_whole(df):\n",
394 |     "    # Last column in dataframe is the target variable.\n",
395 |     "    target = df.keys()[-1]\n",
396 |     "\n",
397 |     "    # Initialization\n",
398 |     "    overall_entropy = 0\n",
399 |     "\n",
400 |     "    # possible values of the target\n",
401 |     "    values_in_target = df[target].unique()\n",
402 |     "\n",
403 |     "    for value in values_in_target:\n",
404 |     "        p = df[target].value_counts()[value]/len(df[target])\n",
405 |     "        overall_entropy += -p*np.log2(p)\n",
406 |     "    \n",
407 |     "    return overall_entropy\n",
408 |     "\n",
409 |     "find_entropy_whole(df)"
410 |    ]
411 |   },
412 |   {
413 |    "cell_type": "markdown",
414 |    "metadata": {},
415 |    "source": [
416 |     "#### Calculating entropy of an attribute"
417 |    ]
418 |   },
419 |   {
420 |    "cell_type": "code",
421 |    "execution_count": 14,
422 |    "metadata": {},
423 |    "outputs": [],
424 |    "source": [
425 |     "def find_entropy_of_attribute(df, attribute):\n",
426 |     "\n",
427 |     "    # last column in dataframe is the target variable\n",
428 |     "    target = df.keys()[-1]\n",
429 |     "    \n",
430 |     "    values_in_target = df[target].unique()\n",
431 |     "\n",
432 |     "    # Distinct values taken by this attribute\n",
433 |     "    #   (like 'hot', 'cold' in temperature)\n",
434 |     "    values_in_attribute = df[attribute].unique()\n",
435 |     "\n",
436 |     "    # Initialize attribute's entropy\n",
437 |     "    entropy_attribute = 0\n",
438 |     "\n",
439 |     "\n",
440 |     "    for value_in_attribute in values_in_attribute:\n",
441 |     "        overall_entropy = 0\n",
442 |     "        for value_in_target in values_in_target:\n",
443 |     "            num = len(df[attribute][df[attribute] == value_in_attribute][df[target] == value_in_target])\n",
444 |     "            den = len(df[attribute][df[attribute] == value_in_attribute])\n",
445 |     "            p = num/(den + eps)\n",
446 |     "            overall_entropy += -p*np.log2(p+eps)\n",
447 |     "        p2 = den/len(df)\n",
448 |     "        entropy_attribute += -p2*overall_entropy\n",
449 |     "    return abs(entropy_attribute)"
450 |    ]
451 |   },
452 |   {
453 |    "cell_type": "markdown",
454 |    "metadata": {},
455 |    "source": [
456 |     "Let's compute entropy for different attributes"
457 |    ]
458 |   },
459 |   {
460 |    "cell_type": "code",
461 |    "execution_count": 15,
462 |    "metadata": {},
463 |    "outputs": [
464 |     {
465 |      "name": "stdout",
466 |      "output_type": "stream",
467 |      "text": [
468 |       "Entropy of attribute 'Outlook' is : 0.6935361388961914\n",
469 |       "Entropy of attribute 'Temperature' is : 0.9110633930116756\n",
470 |       "Entropy of attribute 'Humidity' is : 0.7884504573082889\n",
471 |       "Entropy of attribute 'Wind' is : 0.892158928262361\n"
472 |      ]
473 |     }
474 |    ],
475 |    "source": [
476 |     "for i_attribute in df.keys()[:-1]:\n",
477 |     "    print(f'Entropy of attribute \\'{i_attribute}\\' is :',\n",
478 |     "            find_entropy_of_attribute(df, i_attribute))"
479 |    ]
480 |   },
481 |   {
482 |    "cell_type": "markdown",
483 |    "metadata": {},
484 |    "source": [
485 |     "#### Finding the best attribute"
486 |    ]
487 |   },
488 |   {
489 |    "cell_type": "code",
490 |    "execution_count": 16,
491 |    "metadata": {},
492 |    "outputs": [
493 |     {
494 |      "data": {
495 |       "text/plain": [
496 |        "'Outlook'"
497 |       ]
498 |      },
499 |      "execution_count": 16,
500 |      "metadata": {},
501 |      "output_type": "execute_result"
502 |     }
503 |    ],
504 |    "source": [
505 |     "def find_best_attribute_to_divide(df):\n",
506 |     "    # Information gain\n",
507 |     "    IG = []\n",
508 |     "\n",
509 |     "    # All column names\n",
510 |     "    all_attribute_names = df.keys()[:-1]\n",
511 |     "\n",
512 |     "    for attribute in all_attribute_names:\n",
513 |     "        # Compute information gain for every attribute\n",
514 |     "        IG.append(find_entropy_whole(df) - find_entropy_of_attribute(df, attribute))\n",
515 |     "\n",
516 |     "    # Get the index of attribute with best information gain\n",
517 |     "    index_of_attribute_with_max_IG = np.argmax(IG)\n",
518 |     "    best_attribute = all_attribute_names[index_of_attribute_with_max_IG]\n",
519 |     "\n",
520 |     "    return best_attribute\n",
521 |     "\n",
522 |     "find_best_attribute_to_divide(df) "
523 |    ]
524 |   },
525 |   {
526 |    "cell_type": "markdown",
527 |    "metadata": {},
528 |    "source": [
529 |     "#### Building Decision Tree"
530 |    ]
531 |   },
532 |   {
533 |    "cell_type": "code",
534 |    "execution_count": 19,
535 |    "metadata": {},
536 |    "outputs": [],
537 |    "source": [
538 |     "def buildTree(df, tree=None):\n",
539 |     "\n",
540 |     "    # last column in dataframe\n",
541 |     "    target = df.keys()[-1]\n",
542 |     "\n",
543 |     "    # Here we build our decision tree\n",
544 |     "\n",
545 |     "    # Get attribute with maximum information gain\n",
546 |     "    node = find_best_attribute_to_divide(df)\n",
547 |     "\n",
548 |     "    # Get distinct value of that attribute\n",
549 |     "    attValue = np.unique(df[node])\n",
550 |     "\n",
551 |     "    # Create an empty nested dictionary to hold the tree\n",
552 |     "    if tree is None:\n",
553 |     "        tree = {}\n",
554 |     "        tree[node] = {}\n",
555 |     "    \n",
556 |     "    # We loop to construct the tree by calling this function recursively.\n",
557 |     "    # For each subset we check its purity and stop recursing once it is pure\n",
558 |     "    for value in attValue:\n",
559 |     "\n",
560 |     "        subtable = df[df[node] == value].reset_index(drop=True)\n",
561 |     "        clValue, counts = np.unique(subtable['Play'], return_counts=True)\n",
562 |     "\n",
563 |     "        if len(counts) == 1: # Checking purity of the subset\n",
564 |     "            tree[node][value] = clValue[0]\n",
565 |     "        else:\n",
566 |     "            tree[node][value] = buildTree(subtable) # Calling the function recursively\n",
567 |     "    \n",
568 |     "    return tree"
569 |    ]
570 |   },
571 |   {
572 |    "cell_type": "code",
573 |    "execution_count": 20,
574 |    "metadata": {},
575 |    "outputs": [
576 |     {
577 |      "data": {
578 |       "text/plain": [
579 |        "{'Outlook': {'Overcast': 'Yes',\n",
580 |        "  'Rain': {'Wind': {'Strong': 'No', 'Weak': 'Yes'}},\n",
581 |        "  'Sunny': {'Humidity': {'High': 'No', 'Normal': 'Yes'}}}}"
582 |       ]
583 |      },
584 |      "execution_count": 20,
585 |      "metadata": {},
586 |      "output_type": "execute_result"
587 |     }
588 |    ],
589 |    "source": [
590 |     "buildTree(df)"
591 |    ]
592 |   },
593 |   {
594 |    "cell_type": "markdown",
595 |    "metadata": {},
596 |    "source": [
597 |     "ID3 in its pure form performs no backtracking in its search. Once it selects an attribute to test at a particular level in the tree, it never backtracks to reconsider this choice. Therefore, it is susceptible to the usual risks of hill-climbing search without backtracking: converging to locally optimal solutions that are not globally optimal"
598 |    ]
599 |   },
600 |   {
601 |    "cell_type": "code",
602 |    "execution_count": null,
603 |    "metadata": {},
604 |    "outputs": [],
605 |    "source": []
606 |   }
607 |  ],
608 |  "metadata": {
609 |   "kernelspec": {
610 |    "display_name": "Python 3.9.12 ('base')",
611 |    "language": "python",
612 |    "name": "python3"
613 |   },
614 |   "language_info": {
615 |    "codemirror_mode": {
616 |     "name": "ipython",
617 |     "version": 3
618 |    },
619 |    "file_extension": ".py",
620 |    "mimetype": "text/x-python",
621 |    "name": "python",
622 |    "nbconvert_exporter": "python",
623 |    "pygments_lexer": "ipython3",
624 |    "version": "3.9.12"
625 |   },
626 |   "orig_nbformat": 4,
627 |   "vscode": {
628 |    "interpreter": {
629 |     "hash": "9244b6adea22edad6e19cdea93c196ea7ddff3c1d91dfb077ea542e13d85dd05"
630 |    }
631 |   }
632 |  },
633 |  "nbformat": 4,
634 |  "nbformat_minor": 2
635 | }
636 | 


--------------------------------------------------------------------------------
/MLP/Week_2.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# **Data Preprocessing Techniques**\n",
  8 |     "Data preprocessing involves several transformations that are applied to the raw data and make it more amenable for learning. It is carried out before using it for model training or prediction.\n",
  9 |     "\n",
 10 |     "There are many pre-processing techniques for\n",
 11 |     "- Data cleaning\n",
 12 |     "  - Data imputation\n",
 13 |     "  - Feature scaling\n",
 14 |     "- Feature transformation\n",
 15 |     "  - Polynomial features\n",
 16 |     "  - Discretization\n",
 17 |     "  - Handling categorical features\n",
 18 |     "  - Custom Transformers\n",
 19 |     "  - Composite Transformers\n",
 20 |     "    - Apply transformation of diverse features\n",
 21 |     "    - TransformedTargetRegressor\n",
 22 |     "- Feature Selection\n",
 23 |     "  - Filter based feature selection\n",
 24 |     "  - Wrapper based feature selection\n",
 25 |     "- Feature Extraction\n",
 26 |     "  - PCA\n",
 27 |     "\n",
 28 |     "The transformations are applied in a specific order and the order can be specified via ``Pipeline``. We need to apply different transformations based on the feature type. ``FeatureUnion`` helps us perform that task and combine outputs from multiple transformations into a single transformed feature matrix. We will also study how to visualize this pipeline."
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "markdown",
 33 |    "metadata": {},
 34 |    "source": [
 35 |     "## Importing basic libraries"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": 1,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "import numpy as np\n",
 45 |     "import matplotlib.pyplot as plt\n",
 46 |     "import pandas as pd\n",
 47 |     "import seaborn as sns\n",
 48 |     "\n",
 49 |     "sns.set_theme(style=\"whitegrid\")"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "markdown",
 54 |    "metadata": {},
 55 |    "source": [
 56 |     "## **1. Feature Extraction**"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "markdown",
 61 |    "metadata": {},
 62 |    "source": [
 63 |     "### DictVectorizer\n",
 64 |     "\n",
 65 |     "Often, the data is present as a **list of dictionary objects**. ML algorithms expect the data to be in **matrix form** of shape $(n,m)$ where $n$ is the number of samples and $m$ is the number of features.\n",
 66 |     "\n",
 67 |     "``DictVectorizer`` **converts** a *list of dictionary objects to feature matrix*.\n",
 68 |     "\n",
 69 |     "Let's create a sample data for demo purpose containing ``age`` and ``height`` of children\n",
 70 |     "> Each record/sample is a dictionary with two keys, ``age`` and ``height``, and their corresponding values."
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": 2,
 76 |    "metadata": {},
 77 |    "outputs": [],
 78 |    "source": [
 79 |     "data = [{'age' : 4, 'height' : 96.0},\n",
 80 |     "        {'age' : 1, 'height' : 73.9},\n",
 81 |     "        {'age' : 3, 'height' : 88.9},\n",
 82 |     "        {'age' : 2, 'height' : 81.6}]"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "markdown",
 87 |    "metadata": {},
 88 |    "source": [
 89 |     "> There are 4 data samples with 2 features each\n",
 90 |     "\n",
 91 |     "Let's make use of ``DictVectorizer`` to convert the list of dictionary objects to the feature matrix"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": 3,
 97 |    "metadata": {},
 98 |    "outputs": [
 99 |     {
100 |      "data": {
101 |       "text/plain": [
102 |        "array([[ 4. , 96. ],\n",
103 |        "       [ 1. , 73.9],\n",
104 |        "       [ 3. , 88.9],\n",
105 |        "       [ 2. , 81.6]])"
106 |       ]
107 |      },
108 |      "execution_count": 3,
109 |      "metadata": {},
110 |      "output_type": "execute_result"
111 |     }
112 |    ],
113 |    "source": [
114 |     "from sklearn.feature_extraction import DictVectorizer\n",
115 |     "dv = DictVectorizer(sparse = False)\n",
116 |     "data_transformed = dv.fit_transform(data)\n",
117 |     "data_transformed"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "code",
122 |    "execution_count": 4,
123 |    "metadata": {},
124 |    "outputs": [
125 |     {
126 |      "data": {
127 |       "text/plain": [
128 |        "(4, 2)"
129 |       ]
130 |      },
131 |      "execution_count": 4,
132 |      "metadata": {},
133 |      "output_type": "execute_result"
134 |     }
135 |    ],
136 |    "source": [
137 |     "data_transformed.shape"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "markdown",
142 |    "metadata": {},
143 |    "source": [
144 |     "The transformed data is in feature matrix form: 4 examples with 2 features each, i.e. shape $(4,2)$"
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "markdown",
149 |    "metadata": {},
150 |    "source": [
151 |     "## **2. Data Imputation**\n",
152 |     "- Many machine learning algorithms need full feature matrix and they may not work in the presence of missing data\n",
153 |     "- Data imputation identifies **missing values** in each feature of the dataset and **replaces** them with an **appropriate value** based on a **fixed strategy** such as:\n",
154 |     "  - **mean** or **median** or **mode** of that feature.\n",
155 |     "  - **use specified constant** value\n",
156 |     "\n",
157 |     "Sklearn library provides ``sklearn.impute.SimpleImputer`` class for this purpose"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": 5,
163 |    "metadata": {},
164 |    "outputs": [],
165 |    "source": [
166 |     "from sklearn.impute import SimpleImputer"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "markdown",
171 |    "metadata": {},
172 |    "source": [
173 |     "Some of its important parameters:\n",
174 |     "- *missing_values*: Could be ``int, float, str, np.nan`` or ``None``. By default it is ``np.nan``.\n",
175 |     "- *strategy*: default is 'mean'. One of the following strategies can be used.   \n",
176 |     "  - ``mean``- missing values are replaced using the **mean** along each column\n",
177 |     "  - ``median`` - missing values are replaced using the **median** along each column\n",
178 |     "  - ``most_frequent`` - missing values are replaced using the **most frequent** along each column\n",
179 |     "  - ``constant`` - missing values are replaced with values specified in ``fill_value`` argument.\n",
180 |     "- ``add_indicator`` is a boolean parameter that, when set to ``True``, appends **missing value indicator** columns to the transformed output; the fitted indicator is available in the ``indicator_`` member variable.\n",
181 |     "\n",
182 |     "**Note**:\n",
183 |     "- ``mean`` and ``median`` strategies can only be used with numeric data.\n",
184 |     "- ``most_frequent`` and ``constant`` strategies can be used with strings or numeric data."
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "markdown",
189 |    "metadata": {},
190 |    "source": [
191 |     "### Data imputation on real world dataset\n",
192 |     "Let's perform data imputation on real world dataset. We will be using [heart-disease from uci machine learning repository](https://archive.ics.uci.edu/ml/datasets/Heart+Disease) for this purpose. We will load this dataset from csv file."
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "code",
197 |    "execution_count": 6,
198 |    "metadata": {},
199 |    "outputs": [],
200 |    "source": [
201 |     "cols = ['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal','num']\n",
202 |     "heart_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data',header=None,names=cols)"
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "markdown",
207 |    "metadata": {},
208 |    "source": [
209 |     "**STEP 1.**: Check if dataset contains missing values.\n",
210 |     "- This can be checked via the dataset description or by checking the number of ``nan`` (``np.nan``) values in the dataframe. However, such a check can be performed only for numerical features.\n",
211 |     "- For non-numerical features, we can list their unique values and check if there are values like ``?``.\n"
212 |    ]
213 |   },
214 |   {
215 |    "cell_type": "code",
216 |    "execution_count": 7,
217 |    "metadata": {},
218 |    "outputs": [
219 |     {
220 |      "name": "stdout",
221 |      "output_type": "stream",
222 |      "text": [
223 |       "<class 'pandas.core.frame.DataFrame'>\n",
224 |       "RangeIndex: 303 entries, 0 to 302\n",
225 |       "Data columns (total 14 columns):\n",
226 |       " #   Column    Non-Null Count  Dtype  \n",
227 |       "---  ------    --------------  -----  \n",
228 |       " 0   age       303 non-null    float64\n",
229 |       " 1   sex       303 non-null    float64\n",
230 |       " 2   cp        303 non-null    float64\n",
231 |       " 3   trestbps  303 non-null    float64\n",
232 |       " 4   chol      303 non-null    float64\n",
233 |       " 5   fbs       303 non-null    float64\n",
234 |       " 6   restecg   303 non-null    float64\n",
235 |       " 7   thalach   303 non-null    float64\n",
236 |       " 8   exang     303 non-null    float64\n",
237 |       " 9   oldpeak   303 non-null    float64\n",
238 |       " 10  slope     303 non-null    float64\n",
239 |       " 11  ca        303 non-null    object \n",
240 |       " 12  thal      303 non-null    object \n",
241 |       " 13  num       303 non-null    int64  \n",
242 |       "dtypes: float64(11), int64(1), object(2)\n",
243 |       "memory usage: 33.3+ KB\n"
244 |      ]
245 |     }
246 |    ],
247 |    "source": [
248 |     "heart_data.info()"
249 |    ]
250 |   },
251 |   {
252 |    "cell_type": "markdown",
253 |    "metadata": {},
254 |    "source": [
255 |     "Let's check if there are missing values in numerical columns - here we have checked it for all columns in the dataframe."
256 |    ]
257 |   },
258 |   {
259 |    "cell_type": "code",
260 |    "execution_count": 8,
261 |    "metadata": {},
262 |    "outputs": [
263 |     {
264 |      "data": {
265 |       "text/plain": [
266 |        "age         0\n",
267 |        "sex         0\n",
268 |        "cp          0\n",
269 |        "trestbps    0\n",
270 |        "chol        0\n",
271 |        "fbs         0\n",
272 |        "restecg     0\n",
273 |        "thalach     0\n",
274 |        "exang       0\n",
275 |        "oldpeak     0\n",
276 |        "slope       0\n",
277 |        "ca          0\n",
278 |        "thal        0\n",
279 |        "num         0\n",
280 |        "dtype: int64"
281 |       ]
282 |      },
283 |      "execution_count": 8,
284 |      "metadata": {},
285 |      "output_type": "execute_result"
286 |     }
287 |    ],
288 |    "source": [
289 |     "(heart_data.isnull().sum())"
290 |    ]
291 |   },
292 |   {
293 |    "cell_type": "markdown",
294 |    "metadata": {},
295 |    "source": [
296 |     "There are two non-numerical features: ``ca`` and ``thal``.\n",
297 |     "- List their unique values."
298 |    ]
299 |   },
300 |   {
301 |    "cell_type": "code",
302 |    "execution_count": 9,
303 |    "metadata": {},
304 |    "outputs": [
305 |     {
306 |      "name": "stdout",
307 |      "output_type": "stream",
308 |      "text": [
309 |       "Unique values in ca: ['0.0' '3.0' '2.0' '1.0' '?']\n",
310 |       "Unique values in thal: ['6.0' '3.0' '7.0' '?']\n"
311 |      ]
312 |     }
313 |    ],
314 |    "source": [
315 |     "print('Unique values in ca:', heart_data.ca.unique())\n",
316 |     "print('Unique values in thal:', heart_data.thal.unique())"
317 |    ]
318 |   },
319 |   {
320 |    "cell_type": "markdown",
321 |    "metadata": {},
322 |    "source": [
323 |     "Both of them contain ``?``, which denotes a missing value. Let's count the number of missing values."
324 |    ]
325 |   },
326 |   {
327 |    "cell_type": "code",
328 |    "execution_count": 10,
329 |    "metadata": {},
330 |    "outputs": [
331 |     {
332 |      "name": "stdout",
333 |      "output_type": "stream",
334 |      "text": [
335 |       "# missing values in ca: 4\n",
336 |       "# missing values in thal: 2\n"
337 |      ]
338 |     }
339 |    ],
340 |    "source": [
341 |     "print('# missing values in ca:', heart_data.loc[heart_data.ca == '?','ca'].count())\n",
342 |     "print('# missing values in thal:', heart_data.loc[heart_data.thal ==\"?\",'thal'].count())"
343 |    ]
344 |   },
345 |   {
346 |    "cell_type": "markdown",
347 |    "metadata": {},
348 |    "source": [
349 |     "**STEP 2**: Replace '?' with ``nan``."
350 |    ]
351 |   },
352 |   {
353 |    "cell_type": "code",
354 |    "execution_count": 11,
355 |    "metadata": {},
356 |    "outputs": [],
357 |    "source": [
358 |     "heart_data.replace('?',np.nan, inplace=True)"
359 |    ]
360 |   },
361 |   {
362 |    "cell_type": "markdown",
363 |    "metadata": {},
364 |    "source": [
365 |     "**STEP 3**: Fill the missing values with ``sklearn`` missing value imputation utilities.\n",
366 |     "> Here we use ``SimpleImputer`` with ``mean`` strategy.\n",
367 |     "\n",
368 |     "We will try two variations- \n",
369 |     "- ``add_indicator = False``: Default choice that only imputes missing values."
370 |    ]
371 |   },
372 |   {
373 |    "cell_type": "code",
374 |    "execution_count": 12,
375 |    "metadata": {},
376 |    "outputs": [
377 |     {
378 |      "name": "stdout",
379 |      "output_type": "stream",
380 |      "text": [
381 |       "(303, 14)\n"
382 |      ]
383 |     }
384 |    ],
385 |    "source": [
386 |     "imputer = SimpleImputer(missing_values=np.nan, strategy='mean')\n",
387 |     "imputer = imputer.fit(heart_data)\n",
388 |     "heart_data_imputed = imputer.transform(heart_data)\n",
389 |     "print(heart_data_imputed.shape)"
390 |    ]
391 |   },
392 |   {
393 |    "cell_type": "markdown",
394 |    "metadata": {},
395 |    "source": [
396 |     "- ``add_indicator = True``: Adds an additional column for each column containing missing values. In this case it adds two columns, one for ``ca`` and the other for ``thal``."
397 |    ]
398 |   },
399 |   {
400 |    "cell_type": "code",
401 |    "execution_count": 14,
402 |    "metadata": {},
403 |    "outputs": [
404 |     {
405 |      "name": "stdout",
406 |      "output_type": "stream",
407 |      "text": [
408 |       "(303, 16)\n"
409 |      ]
410 |     }
411 |    ],
412 |    "source": [
413 |     "imputer = SimpleImputer(missing_values= np.nan, strategy='mean', add_indicator=True)\n",
414 |     "imputer = imputer.fit(heart_data)\n",
415 |     "heart_data_imputed_with_indicator = imputer.transform(heart_data)\n",
416 |     "print(heart_data_imputed_with_indicator.shape)"
417 |    ]
418 |   },
419 |   {
420 |    "cell_type": "markdown",
421 |    "metadata": {},
422 |    "source": [
423 |     "## **3. Feature Scaling**\n",
424 |     "\n",
425 |     "Feature scaling **transforms feature values** such that **all the features are on the same scale**.\n",
426 |     "Using a feature matrix with all the features on the same scale is important for the following reasons:\n",
427 |     "- **Enables faster convergence** in iterative optimization algorithms like gradient descent and its variants.\n",
428 |     "- The performance of ML algorithms such as SVM, K-NN and K-means, which compute Euclidean distance among input samples, gets impacted if the features are not scaled.\n",
429 |     "\n",
430 |     "Tree based ML algorithms are not affected by feature-scaling. In other words, feature scaling is not required for tree based ML algorithms\n",
431 |     "\n",
432 |     "Feature scaling can be performed with the following methods:\n",
433 |     "- Standardization\n",
434 |     "- Normalization\n",
435 |     "- MaxAbsScaler.\n",
436 |     "\n",
437 |     "Let's demonstrate feature scaling on real world dataset. For this purpose, we will be using [abalone dataset](https://archive.ics.uci.edu/ml/datasets/abalone). We will use different scaling utilities in ``sklearn`` library."
438 |    ]
439 |   },
440 |   {
441 |    "cell_type": "code",
442 |    "execution_count": 17,
443 |    "metadata": {},
444 |    "outputs": [],
445 |    "source": [
446 |     "cols = ['Sex','Length','Diameter','Height','Whole weight','Shucked weight','Viscera weight','Shell weight','Rings']\n",
447 |     "abalone_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data',header=None,names=cols)"
448 |    ]
449 |   },
450 |   {
451 |    "cell_type": "markdown",
452 |    "metadata": {},
453 |    "source": [
454 |     "**STEP 1**: Examine the dataset"
455 |    ]
456 |   },
457 |   {
458 |    "cell_type": "code",
459 |    "execution_count": 18,
460 |    "metadata": {},
461 |    "outputs": [
462 |     {
463 |      "name": "stdout",
464 |      "output_type": "stream",
465 |      "text": [
466 |       "<class 'pandas.core.frame.DataFrame'>\n",
467 |       "RangeIndex: 4177 entries, 0 to 4176\n",
468 |       "Data columns (total 9 columns):\n",
469 |       " #   Column          Non-Null Count  Dtype  \n",
470 |       "---  ------          --------------  -----  \n",
471 |       " 0   Sex             4177 non-null   object \n",
472 |       " 1   Length          4177 non-null   float64\n",
473 |       " 2   Diameter        4177 non-null   float64\n",
474 |       " 3   Height          4177 non-null   float64\n",
475 |       " 4   Whole weight    4177 non-null   float64\n",
476 |       " 5   Shucked weight  4177 non-null   float64\n",
477 |       " 6   Viscera weight  4177 non-null   float64\n",
478 |       " 7   Shell weight    4177 non-null   float64\n",
479 |       " 8   Rings           4177 non-null   int64  \n",
480 |       "dtypes: float64(7), int64(1), object(1)\n",
481 |       "memory usage: 293.8+ KB\n"
482 |      ]
483 |     }
484 |    ],
485 |    "source": [
486 |     "abalone_data.info()"
487 |    ]
488 |   },
489 |   {
490 |    "cell_type": "markdown",
491 |    "metadata": {},
492 |    "source": [
493 |     "**STEP 1a**: [Optional]: convert non-numerical attributes into numerical ones\n",
494 |     "> In this dataset only ``Sex`` is the non-numeric column"
495 |    ]
496 |   },
497 |   {
498 |    "cell_type": "code",
499 |    "execution_count": 19,
500 |    "metadata": {},
501 |    "outputs": [
502 |     {
503 |      "data": {
504 |       "text/plain": [
505 |        "array(['M', 'F', 'I'], dtype=object)"
506 |       ]
507 |      },
508 |      "execution_count": 19,
509 |      "metadata": {},
510 |      "output_type": "execute_result"
511 |     }
512 |    ],
513 |    "source": [
514 |     "abalone_data.Sex.unique()"
515 |    ]
516 |   },
517 |   {
518 |    "cell_type": "code",
519 |    "execution_count": 20,
520 |    "metadata": {},
521 |    "outputs": [
522 |     {
523 |      "name": "stdout",
524 |      "output_type": "stream",
525 |      "text": [
526 |       "<class 'pandas.core.frame.DataFrame'>\n",
527 |       "RangeIndex: 4177 entries, 0 to 4176\n",
528 |       "Data columns (total 9 columns):\n",
529 |       " #   Column          Non-Null Count  Dtype  \n",
530 |       "---  ------          --------------  -----  \n",
531 |       " 0   Sex             4177 non-null   int64  \n",
532 |       " 1   Length          4177 non-null   float64\n",
533 |       " 2   Diameter        4177 non-null   float64\n",
534 |       " 3   Height          4177 non-null   float64\n",
535 |       " 4   Whole weight    4177 non-null   float64\n",
536 |       " 5   Shucked weight  4177 non-null   float64\n",
537 |       " 6   Viscera weight  4177 non-null   float64\n",
538 |       " 7   Shell weight    4177 non-null   float64\n",
539 |       " 8   Rings           4177 non-null   int64  \n",
540 |       "dtypes: float64(7), int64(2)\n",
541 |       "memory usage: 293.8 KB\n"
542 |      ]
543 |     }
544 |    ],
545 |    "source": [
546 |     "#Assign numeric values to sex.\n",
547 |     "abalone_data = abalone_data.replace({'Sex': {'M':1,'F':2,'I':3}})\n",
548 |     "abalone_data.info()"
549 |    ]
550 |   },
551 |   {
552 |    "cell_type": "markdown",
553 |    "metadata": {},
554 |    "source": [
555 |     "**STEP 2**: Separate labels from features."
556 |    ]
557 |   },
558 |   {
559 |    "cell_type": "code",
560 |    "execution_count": 21,
561 |    "metadata": {},
562 |    "outputs": [
563 |     {
564 |      "name": "stdout",
565 |      "output_type": "stream",
566 |      "text": [
567 |       "The dataframe object after deleting the column\n",
568 |       "<class 'pandas.core.frame.DataFrame'>\n",
569 |       "RangeIndex: 4177 entries, 0 to 4176\n",
570 |       "Data columns (total 8 columns):\n",
571 |       " #   Column          Non-Null Count  Dtype  \n",
572 |       "---  ------          --------------  -----  \n",
573 |       " 0   Sex             4177 non-null   int64  \n",
574 |       " 1   Length          4177 non-null   float64\n",
575 |       " 2   Diameter        4177 non-null   float64\n",
576 |       " 3   Height          4177 non-null   float64\n",
577 |       " 4   Whole weight    4177 non-null   float64\n",
578 |       " 5   Shucked weight  4177 non-null   float64\n",
579 |       " 6   Viscera weight  4177 non-null   float64\n",
580 |       " 7   Shell weight    4177 non-null   float64\n",
581 |       "dtypes: float64(7), int64(1)\n",
582 |       "memory usage: 261.2 KB\n"
583 |      ]
584 |     }
585 |    ],
586 |    "source": [
587 |     "y = abalone_data.pop('Rings')\n",
588 |     "print('The dataframe object after deleting the column')\n",
589 |     "abalone_data.info()"
590 |    ]
591 |   },
592 |   {
593 |    "cell_type": "markdown",
594 |    "metadata": {},
595 |    "source": [
596 |     "**STEP 3**: Examining the feature scales\n",
597 |     "\n",
598 |     "#### Statistical method\n",
599 |     "Check the scales of different features with ``describe()`` method of dataframe."
600 |    ]
601 |   },
602 |   {
603 |    "cell_type": "code",
604 |    "execution_count": 23,
605 |    "metadata": {},
606 |    "outputs": [
607 |     {
608 |      "data": {
609 |       "text/html": [
610 |        "<div>\n",
611 |        "<style scoped>\n",
612 |        "    .dataframe tbody tr th:only-of-type {\n",
613 |        "        vertical-align: middle;\n",
614 |        "    }\n",
615 |        "\n",
616 |        "    .dataframe tbody tr th {\n",
617 |        "        vertical-align: top;\n",
618 |        "    }\n",
619 |        "\n",
620 |        "    .dataframe thead th {\n",
621 |        "        text-align: right;\n",
622 |        "    }\n",
623 |        "</style>\n",
624 |        "<table border=\"1\" class=\"dataframe\">\n",
625 |        "  <thead>\n",
626 |        "    <tr style=\"text-align: right;\">\n",
627 |        "      <th></th>\n",
628 |        "      <th>count</th>\n",
629 |        "      <th>mean</th>\n",
630 |        "      <th>std</th>\n",
631 |        "      <th>min</th>\n",
632 |        "      <th>25%</th>\n",
633 |        "      <th>50%</th>\n",
634 |        "      <th>75%</th>\n",
635 |        "      <th>max</th>\n",
636 |        "    </tr>\n",
637 |        "  </thead>\n",
638 |        "  <tbody>\n",
639 |        "    <tr>\n",
640 |        "      <th>Sex</th>\n",
641 |        "      <td>4177.0</td>\n",
642 |        "      <td>1.955470</td>\n",
643 |        "      <td>0.827815</td>\n",
644 |        "      <td>1.0000</td>\n",
645 |        "      <td>1.0000</td>\n",
646 |        "      <td>2.0000</td>\n",
647 |        "      <td>3.000</td>\n",
648 |        "      <td>3.0000</td>\n",
649 |        "    </tr>\n",
650 |        "    <tr>\n",
651 |        "      <th>Length</th>\n",
652 |        "      <td>4177.0</td>\n",
653 |        "      <td>0.523992</td>\n",
654 |        "      <td>0.120093</td>\n",
655 |        "      <td>0.0750</td>\n",
656 |        "      <td>0.4500</td>\n",
657 |        "      <td>0.5450</td>\n",
658 |        "      <td>0.615</td>\n",
659 |        "      <td>0.8150</td>\n",
660 |        "    </tr>\n",
661 |        "    <tr>\n",
662 |        "      <th>Diameter</th>\n",
663 |        "      <td>4177.0</td>\n",
664 |        "      <td>0.407881</td>\n",
665 |        "      <td>0.099240</td>\n",
666 |        "      <td>0.0550</td>\n",
667 |        "      <td>0.3500</td>\n",
668 |        "      <td>0.4250</td>\n",
669 |        "      <td>0.480</td>\n",
670 |        "      <td>0.6500</td>\n",
671 |        "    </tr>\n",
672 |        "    <tr>\n",
673 |        "      <th>Height</th>\n",
674 |        "      <td>4177.0</td>\n",
675 |        "      <td>0.139516</td>\n",
676 |        "      <td>0.041827</td>\n",
677 |        "      <td>0.0000</td>\n",
678 |        "      <td>0.1150</td>\n",
679 |        "      <td>0.1400</td>\n",
680 |        "      <td>0.165</td>\n",
681 |        "      <td>1.1300</td>\n",
682 |        "    </tr>\n",
683 |        "    <tr>\n",
684 |        "      <th>Whole weight</th>\n",
685 |        "      <td>4177.0</td>\n",
686 |        "      <td>0.828742</td>\n",
687 |        "      <td>0.490389</td>\n",
688 |        "      <td>0.0020</td>\n",
689 |        "      <td>0.4415</td>\n",
690 |        "      <td>0.7995</td>\n",
691 |        "      <td>1.153</td>\n",
692 |        "      <td>2.8255</td>\n",
693 |        "    </tr>\n",
694 |        "    <tr>\n",
695 |        "      <th>Shucked weight</th>\n",
696 |        "      <td>4177.0</td>\n",
697 |        "      <td>0.359367</td>\n",
698 |        "      <td>0.221963</td>\n",
699 |        "      <td>0.0010</td>\n",
700 |        "      <td>0.1860</td>\n",
701 |        "      <td>0.3360</td>\n",
702 |        "      <td>0.502</td>\n",
703 |        "      <td>1.4880</td>\n",
704 |        "    </tr>\n",
705 |        "    <tr>\n",
706 |        "      <th>Viscera weight</th>\n",
707 |        "      <td>4177.0</td>\n",
708 |        "      <td>0.180594</td>\n",
709 |        "      <td>0.109614</td>\n",
710 |        "      <td>0.0005</td>\n",
711 |        "      <td>0.0935</td>\n",
712 |        "      <td>0.1710</td>\n",
713 |        "      <td>0.253</td>\n",
714 |        "      <td>0.7600</td>\n",
715 |        "    </tr>\n",
716 |        "    <tr>\n",
717 |        "      <th>Shell weight</th>\n",
718 |        "      <td>4177.0</td>\n",
719 |        "      <td>0.238831</td>\n",
720 |        "      <td>0.139203</td>\n",
721 |        "      <td>0.0015</td>\n",
722 |        "      <td>0.1300</td>\n",
723 |        "      <td>0.2340</td>\n",
724 |        "      <td>0.329</td>\n",
725 |        "      <td>1.0050</td>\n",
726 |        "    </tr>\n",
727 |        "  </tbody>\n",
728 |        "</table>\n",
729 |        "</div>"
730 |       ],
731 |       "text/plain": [
732 |        "                 count      mean       std     min     25%     50%    75%  \\\n",
733 |        "Sex             4177.0  1.955470  0.827815  1.0000  1.0000  2.0000  3.000   \n",
734 |        "Length          4177.0  0.523992  0.120093  0.0750  0.4500  0.5450  0.615   \n",
735 |        "Diameter        4177.0  0.407881  0.099240  0.0550  0.3500  0.4250  0.480   \n",
736 |        "Height          4177.0  0.139516  0.041827  0.0000  0.1150  0.1400  0.165   \n",
737 |        "Whole weight    4177.0  0.828742  0.490389  0.0020  0.4415  0.7995  1.153   \n",
738 |        "Shucked weight  4177.0  0.359367  0.221963  0.0010  0.1860  0.3360  0.502   \n",
739 |        "Viscera weight  4177.0  0.180594  0.109614  0.0005  0.0935  0.1710  0.253   \n",
740 |        "Shell weight    4177.0  0.238831  0.139203  0.0015  0.1300  0.2340  0.329   \n",
741 |        "\n",
742 |        "                   max  \n",
743 |        "Sex             3.0000  \n",
744 |        "Length          0.8150  \n",
745 |        "Diameter        0.6500  \n",
746 |        "Height          1.1300  \n",
747 |        "Whole weight    2.8255  \n",
748 |        "Shucked weight  1.4880  \n",
749 |        "Viscera weight  0.7600  \n",
750 |        "Shell weight    1.0050  "
751 |       ]
752 |      },
753 |      "execution_count": 23,
754 |      "metadata": {},
755 |      "output_type": "execute_result"
756 |     }
757 |    ],
758 |    "source": [
759 |     "abalone_data.describe().T"
760 |    ]
761 |   },
762 |   {
763 |    "cell_type": "code",
764 |    "execution_count": null,
765 |    "metadata": {},
766 |    "outputs": [],
767 |    "source": []
768 |   }
769 |  ],
770 |  "metadata": {
771 |   "kernelspec": {
772 |    "display_name": "ML",
773 |    "language": "python",
774 |    "name": "python3"
775 |   },
776 |   "language_info": {
777 |    "codemirror_mode": {
778 |     "name": "ipython",
779 |     "version": 3
780 |    },
781 |    "file_extension": ".py",
782 |    "mimetype": "text/x-python",
783 |    "name": "python",
784 |    "nbconvert_exporter": "python",
785 |    "pygments_lexer": "ipython3",
786 |    "version": "3.13.5"
787 |   },
788 |   "orig_nbformat": 4
789 |  },
790 |  "nbformat": 4,
791 |  "nbformat_minor": 2
792 | }
793 | 


--------------------------------------------------------------------------------
/MLP-using-GPU/2-DataPreprocessing.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# **Data Preprocessing Techniques**\n",
  8 |     "Data preprocessing involves several transformations that are applied to the raw data and make it more amenable for learning. It is carried out before using it for model training or prediction.\n",
  9 |     "\n",
 10 |     "There are many pre-processing techniques for\n",
 11 |     "- Data cleaning\n",
 12 |     "  - Data imputation\n",
 13 |     "  - Feature scaling\n",
 14 |     "- Feature transformation\n",
 15 |     "  - Polynomial features\n",
 16 |     "  - Discretization\n",
 17 |     "  - Handling categorical features\n",
 18 |     "  - Custom Transformers\n",
 19 |     "  - Composite Transformers\n",
 20 |     "    - Apply transformation of diverse features\n",
 21 |     "    - TransformedTargetRegressor\n",
 22 |     "- Feature Selection\n",
 23 |     "  - Filter based feature selection\n",
 24 |     "  - Wrapper based feature selection\n",
 25 |     "- Feature Extraction\n",
 26 |     "  - PCA\n",
 27 |     "\n",
 28 |     "The transformations are applied in a specific order and the order can be specified via ``Pipeline``. We need to apply different transformations based on the feature type. ``FeatureUnion`` helps us perform that task and combine outputs from multiple transformations into a single transformed feature matrix. We will also study how to visualize this pipeline."
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "markdown",
 33 |    "metadata": {},
 34 |    "source": [
 35 |     "## Importing basic libraries"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": 1,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "import numpy as np\n",
 45 |     "import matplotlib.pyplot as plt\n",
 46 |     "import pandas as pd\n",
 47 |     "import seaborn as sns\n",
 48 |     "\n",
 49 |     "sns.set_theme(style=\"whitegrid\")"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "markdown",
 54 |    "metadata": {},
 55 |    "source": [
 56 |     "## **1. Feature Extraction**"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "markdown",
 61 |    "metadata": {},
 62 |    "source": [
 63 |     "### DictVectorizer\n",
 64 |     "\n",
 65 |     "Often the data is available as a **list of dictionary objects**. ML algorithms expect the data to be in **matrix form** of shape $(n,m)$ where $n$ is the number of samples and $m$ is the number of features.\n",
 66 |     "\n",
 67 |     "``DictVectorizer`` **converts** a *list of dictionary objects to feature matrix*.\n",
 68 |     "\n",
 69 |     "Let's create some sample data for demo purposes containing the ``age`` and ``height`` of children\n",
 70 |     "> Each record/sample is a dictionary with two keys, ``age`` and ``height``, and their corresponding values."
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": 2,
 76 |    "metadata": {},
 77 |    "outputs": [],
 78 |    "source": [
 79 |     "data = [{'age' : 4, 'height' : 96.0},\n",
 80 |     "        {'age' : 1, 'height' : 73.9},\n",
 81 |     "        {'age' : 3, 'height' : 88.9},\n",
 82 |     "        {'age' : 2, 'height' : 81.6}]"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "markdown",
 87 |    "metadata": {},
 88 |    "source": [
 89 |     "> There are 4 data samples with 2 features each\n",
 90 |     "\n",
 91 |     "Let's make use of ``DictVectorizer`` to convert the list of dictionary objects to the feature matrix"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": 3,
 97 |    "metadata": {},
 98 |    "outputs": [
 99 |     {
100 |      "data": {
101 |       "text/plain": [
102 |        "array([[ 4. , 96. ],\n",
103 |        "       [ 1. , 73.9],\n",
104 |        "       [ 3. , 88.9],\n",
105 |        "       [ 2. , 81.6]])"
106 |       ]
107 |      },
108 |      "execution_count": 3,
109 |      "metadata": {},
110 |      "output_type": "execute_result"
111 |     }
112 |    ],
113 |    "source": [
114 |     "from sklearn.feature_extraction import DictVectorizer\n",
115 |     "dv = DictVectorizer(sparse = False)\n",
116 |     "data_transformed = dv.fit_transform(data)\n",
117 |     "data_transformed"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "code",
122 |    "execution_count": 4,
123 |    "metadata": {},
124 |    "outputs": [
125 |     {
126 |      "data": {
127 |       "text/plain": [
128 |        "(4, 2)"
129 |       ]
130 |      },
131 |      "execution_count": 4,
132 |      "metadata": {},
133 |      "output_type": "execute_result"
134 |     }
135 |    ],
136 |    "source": [
137 |     "data_transformed.shape"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "markdown",
142 |    "metadata": {},
143 |    "source": [
144 |     "The transformed data is in feature matrix form: 4 examples with 2 features each, i.e. shape $(4,2)$."
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "markdown",
149 |    "metadata": {},
150 |    "source": [
151 |     "## **2. Data Imputation**\n",
152 |     "- Many machine learning algorithms need full feature matrix and they may not work in the presence of missing data\n",
153 |     "- Data imputation identifies **missing values** in each feature of the dataset and **replaces** them with an **appropriate value** based on a **fixed strategy** such as:\n",
154 |     "  - **mean** or **median** or **mode** of that feature.\n",
155 |     "  - **use specified constant** value\n",
156 |     "\n",
157 |     "Sklearn library provides ``sklearn.impute.SimpleImputer`` class for this purpose"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": 5,
163 |    "metadata": {},
164 |    "outputs": [],
165 |    "source": [
166 |     "from sklearn.impute import SimpleImputer"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "markdown",
171 |    "metadata": {},
172 |    "source": [
173 |     "Some of its important parameters:\n",
174 |     "- *missing_values*: Could be ``int, float, str, np.nan`` or ``None``. By default it is ``np.nan``.\n",
175 |     "- *strategy*: default is 'mean'. One of the following strategies can be used.   \n",
176 |     "  - ``mean``- missing values are replaced using the **mean** along each column\n",
177 |     "  - ``median`` - missing values are replaced using the **median** along each column\n",
178 |     "  - ``most_frequent`` - missing values are replaced using the **most frequent** along each column\n",
179 |     "  - ``constant`` - missing values are replaced with values specified in ``fill_value`` argument.\n",
180 |     "- ``add_indicator`` is a boolean parameter that, when set to ``True``, returns **missing value indicators** alongside the imputed data and stores the fitted indicator in the ``indicator_`` member variable.\n",
181 |     "\n",
182 |     "**Note**:\n",
183 |     "- ``mean`` and ``median`` strategies can only be used with numeric data.\n",
184 |     "- ``most_frequent`` and ``constant`` strategies can be used with strings or numeric data."
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "markdown",
189 |    "metadata": {},
190 |    "source": [
191 |     "### Data imputation on real world dataset\n",
192 |     "Let's perform data imputation on real world dataset. We will be using [heart-disease from uci machine learning repository](https://archive.ics.uci.edu/ml/datasets/Heart+Disease) for this purpose. We will load this dataset from csv file."
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "code",
197 |    "execution_count": 6,
198 |    "metadata": {},
199 |    "outputs": [],
200 |    "source": [
201 |     "cols = ['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal','num']\n",
202 |     "heart_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data',header=None,names=cols)"
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "markdown",
207 |    "metadata": {},
208 |    "source": [
209 |     "**STEP 1.**: Check if dataset contains missing values.\n",
210 |     "- This can be checked via the dataset description or by checking the number of ``NaN`` (``np.nan``) or ``None`` values in the dataframe. However, such a check can be performed only for numerical features.\n",
211 |     "- For non-numerical features, we can list their unique values and check if there are values like ``?``.\n"
212 |    ]
213 |   },
214 |   {
215 |    "cell_type": "code",
216 |    "execution_count": 7,
217 |    "metadata": {},
218 |    "outputs": [
219 |     {
220 |      "name": "stdout",
221 |      "output_type": "stream",
222 |      "text": [
223 |       "<class 'pandas.core.frame.DataFrame'>\n",
224 |       "RangeIndex: 303 entries, 0 to 302\n",
225 |       "Data columns (total 14 columns):\n",
226 |       " #   Column    Non-Null Count  Dtype  \n",
227 |       "---  ------    --------------  -----  \n",
228 |       " 0   age       303 non-null    float64\n",
229 |       " 1   sex       303 non-null    float64\n",
230 |       " 2   cp        303 non-null    float64\n",
231 |       " 3   trestbps  303 non-null    float64\n",
232 |       " 4   chol      303 non-null    float64\n",
233 |       " 5   fbs       303 non-null    float64\n",
234 |       " 6   restecg   303 non-null    float64\n",
235 |       " 7   thalach   303 non-null    float64\n",
236 |       " 8   exang     303 non-null    float64\n",
237 |       " 9   oldpeak   303 non-null    float64\n",
238 |       " 10  slope     303 non-null    float64\n",
239 |       " 11  ca        303 non-null    object \n",
240 |       " 12  thal      303 non-null    object \n",
241 |       " 13  num       303 non-null    int64  \n",
242 |       "dtypes: float64(11), int64(1), object(2)\n",
243 |       "memory usage: 33.3+ KB\n"
244 |      ]
245 |     }
246 |    ],
247 |    "source": [
248 |     "heart_data.info()"
249 |    ]
250 |   },
251 |   {
252 |    "cell_type": "markdown",
253 |    "metadata": {},
254 |    "source": [
255 |     "Let's check if there are missing values in numerical columns - here we have checked it for all columns in the dataframe."
256 |    ]
257 |   },
258 |   {
259 |    "cell_type": "code",
260 |    "execution_count": 8,
261 |    "metadata": {},
262 |    "outputs": [
263 |     {
264 |      "data": {
265 |       "text/plain": [
266 |        "age         0\n",
267 |        "sex         0\n",
268 |        "cp          0\n",
269 |        "trestbps    0\n",
270 |        "chol        0\n",
271 |        "fbs         0\n",
272 |        "restecg     0\n",
273 |        "thalach     0\n",
274 |        "exang       0\n",
275 |        "oldpeak     0\n",
276 |        "slope       0\n",
277 |        "ca          0\n",
278 |        "thal        0\n",
279 |        "num         0\n",
280 |        "dtype: int64"
281 |       ]
282 |      },
283 |      "execution_count": 8,
284 |      "metadata": {},
285 |      "output_type": "execute_result"
286 |     }
287 |    ],
288 |    "source": [
289 |     "(heart_data.isnull().sum())"
290 |    ]
291 |   },
292 |   {
293 |    "cell_type": "markdown",
294 |    "metadata": {},
295 |    "source": [
296 |     "There are two non-numerical features: ``ca`` and ``thal``.\n",
297 |     "- List their unique values."
298 |    ]
299 |   },
300 |   {
301 |    "cell_type": "code",
302 |    "execution_count": 9,
303 |    "metadata": {},
304 |    "outputs": [
305 |     {
306 |      "name": "stdout",
307 |      "output_type": "stream",
308 |      "text": [
309 |       "Unique values in ca: ['0.0' '3.0' '2.0' '1.0' '?']\n",
310 |       "Unique values in thal: ['6.0' '3.0' '7.0' '?']\n"
311 |      ]
312 |     }
313 |    ],
314 |    "source": [
315 |     "print('Unique values in ca:', heart_data.ca.unique())\n",
316 |     "print('Unique values in thal:', heart_data.thal.unique())"
317 |    ]
318 |   },
319 |   {
320 |    "cell_type": "markdown",
321 |    "metadata": {},
322 |    "source": [
323 |     "Both of them contain ``?``, which denotes a missing value. Let's count the number of missing values."
324 |    ]
325 |   },
326 |   {
327 |    "cell_type": "code",
328 |    "execution_count": 10,
329 |    "metadata": {},
330 |    "outputs": [
331 |     {
332 |      "name": "stdout",
333 |      "output_type": "stream",
334 |      "text": [
335 |       "# missing values in ca: 4\n",
336 |       "# missing values in thal: 2\n"
337 |      ]
338 |     }
339 |    ],
340 |    "source": [
341 |     "print('# missing values in ca:', heart_data.loc[heart_data.ca == '?','ca'].count())\n",
342 |     "print('# missing values in thal:', heart_data.loc[heart_data.thal ==\"?\",'thal'].count())"
343 |    ]
344 |   },
345 |   {
346 |    "cell_type": "markdown",
347 |    "metadata": {},
348 |    "source": [
349 |     "**STEP 2**: Replace '?' with ``nan``."
350 |    ]
351 |   },
352 |   {
353 |    "cell_type": "code",
354 |    "execution_count": 11,
355 |    "metadata": {},
356 |    "outputs": [],
357 |    "source": [
358 |     "heart_data.replace('?',np.nan, inplace=True)"
359 |    ]
360 |   },
361 |   {
362 |    "cell_type": "markdown",
363 |    "metadata": {},
364 |    "source": [
365 |     "**STEP 3**: Fill the missing values with ``sklearn`` missing value imputation utilities.\n",
366 |     "> Here we use ``SimpleImputer`` with ``mean`` strategy.\n",
367 |     "\n",
368 |     "We will try two variations- \n",
369 |     "- ``add_indicator = False``: Default choice that only imputes missing values."
370 |    ]
371 |   },
372 |   {
373 |    "cell_type": "code",
374 |    "execution_count": 12,
375 |    "metadata": {},
376 |    "outputs": [
377 |     {
378 |      "name": "stdout",
379 |      "output_type": "stream",
380 |      "text": [
381 |       "(303, 14)\n"
382 |      ]
383 |     }
384 |    ],
385 |    "source": [
386 |     "imputer = SimpleImputer(missing_values=np.nan, strategy='mean')\n",
387 |     "imputer = imputer.fit(heart_data)\n",
388 |     "heart_data_imputed = imputer.transform(heart_data)\n",
389 |     "print(heart_data_imputed.shape)"
390 |    ]
391 |   },
392 |   {
393 |    "cell_type": "markdown",
394 |    "metadata": {},
395 |    "source": [
396 |     "- ``add_indicator = True``: Adds an additional column for each column containing missing values. In this case it adds two columns, one for ``ca`` and the other for ``thal``."
397 |    ]
398 |   },
399 |   {
400 |    "cell_type": "code",
401 |    "execution_count": 14,
402 |    "metadata": {},
403 |    "outputs": [
404 |     {
405 |      "name": "stdout",
406 |      "output_type": "stream",
407 |      "text": [
408 |       "(303, 16)\n"
409 |      ]
410 |     }
411 |    ],
412 |    "source": [
413 |     "imputer = SimpleImputer(missing_values= np.nan, strategy='mean', add_indicator=True)\n",
414 |     "imputer = imputer.fit(heart_data)\n",
415 |     "heart_data_imputed_with_indicator = imputer.transform(heart_data)\n",
416 |     "print(heart_data_imputed_with_indicator.shape)"
417 |    ]
418 |   },
419 |   {
420 |    "cell_type": "markdown",
421 |    "metadata": {},
422 |    "source": [
423 |     "## **3. Feature Scaling**\n",
424 |     "\n",
425 |     "Feature scaling **transforms feature values** such that **all the features are on the same scale**.\n",
426 |     "It is useful to have all the features of the feature matrix on the same scale because:\n",
427 |     "- It **enables faster convergence** in iterative optimization algorithms like gradient descent and its variants.\n",
428 |     "- The performance of ML algorithms such as SVM, K-NN and K-means, which compute Euclidean distance among input samples, is affected if the features are not scaled.\n",
429 |     "\n",
430 |     "Tree based ML algorithms are not affected by feature-scaling. In other words, feature scaling is not required for tree based ML algorithms\n",
431 |     "\n",
432 |     "Feature scaling can be performed with the following methods:\n",
433 |     "- Standardization\n",
434 |     "- Normalization\n",
435 |     "- MaxAbsScaler.\n",
436 |     "\n",
437 |     "Let's demonstrate feature scaling on real world dataset. For this purpose, we will be using [abalone dataset](https://archive.ics.uci.edu/ml/datasets/abalone). We will use different scaling utilities in ``sklearn`` library."
438 |    ]
439 |   },
440 |   {
441 |    "cell_type": "code",
442 |    "execution_count": 17,
443 |    "metadata": {},
444 |    "outputs": [],
445 |    "source": [
446 |     "cols = ['Sex','Length','Diameter','Height','Whole weight','Shucked weight','Viscera weight','Shell weight','Rings']\n",
447 |     "abalone_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data',header=None,names=cols)"
448 |    ]
449 |   },
450 |   {
451 |    "cell_type": "markdown",
452 |    "metadata": {},
453 |    "source": [
454 |     "**STEP 1**: Examine the dataset"
455 |    ]
456 |   },
457 |   {
458 |    "cell_type": "code",
459 |    "execution_count": 18,
460 |    "metadata": {},
461 |    "outputs": [
462 |     {
463 |      "name": "stdout",
464 |      "output_type": "stream",
465 |      "text": [
466 |       "<class 'pandas.core.frame.DataFrame'>\n",
467 |       "RangeIndex: 4177 entries, 0 to 4176\n",
468 |       "Data columns (total 9 columns):\n",
469 |       " #   Column          Non-Null Count  Dtype  \n",
470 |       "---  ------          --------------  -----  \n",
471 |       " 0   Sex             4177 non-null   object \n",
472 |       " 1   Length          4177 non-null   float64\n",
473 |       " 2   Diameter        4177 non-null   float64\n",
474 |       " 3   Height          4177 non-null   float64\n",
475 |       " 4   Whole weight    4177 non-null   float64\n",
476 |       " 5   Shucked weight  4177 non-null   float64\n",
477 |       " 6   Viscera weight  4177 non-null   float64\n",
478 |       " 7   Shell weight    4177 non-null   float64\n",
479 |       " 8   Rings           4177 non-null   int64  \n",
480 |       "dtypes: float64(7), int64(1), object(1)\n",
481 |       "memory usage: 293.8+ KB\n"
482 |      ]
483 |     }
484 |    ],
485 |    "source": [
486 |     "abalone_data.info()"
487 |    ]
488 |   },
489 |   {
490 |    "cell_type": "markdown",
491 |    "metadata": {},
492 |    "source": [
493 |     "**STEP 1a**: [Optional]: convert non-numerical attributes into numerical ones\n",
494 |     "> In this dataset only ``Sex`` is the non-numeric column"
495 |    ]
496 |   },
497 |   {
498 |    "cell_type": "code",
499 |    "execution_count": 19,
500 |    "metadata": {},
501 |    "outputs": [
502 |     {
503 |      "data": {
504 |       "text/plain": [
505 |        "array(['M', 'F', 'I'], dtype=object)"
506 |       ]
507 |      },
508 |      "execution_count": 19,
509 |      "metadata": {},
510 |      "output_type": "execute_result"
511 |     }
512 |    ],
513 |    "source": [
514 |     "abalone_data.Sex.unique()"
515 |    ]
516 |   },
517 |   {
518 |    "cell_type": "code",
519 |    "execution_count": 20,
520 |    "metadata": {},
521 |    "outputs": [
522 |     {
523 |      "name": "stdout",
524 |      "output_type": "stream",
525 |      "text": [
526 |       "<class 'pandas.core.frame.DataFrame'>\n",
527 |       "RangeIndex: 4177 entries, 0 to 4176\n",
528 |       "Data columns (total 9 columns):\n",
529 |       " #   Column          Non-Null Count  Dtype  \n",
530 |       "---  ------          --------------  -----  \n",
531 |       " 0   Sex             4177 non-null   int64  \n",
532 |       " 1   Length          4177 non-null   float64\n",
533 |       " 2   Diameter        4177 non-null   float64\n",
534 |       " 3   Height          4177 non-null   float64\n",
535 |       " 4   Whole weight    4177 non-null   float64\n",
536 |       " 5   Shucked weight  4177 non-null   float64\n",
537 |       " 6   Viscera weight  4177 non-null   float64\n",
538 |       " 7   Shell weight    4177 non-null   float64\n",
539 |       " 8   Rings           4177 non-null   int64  \n",
540 |       "dtypes: float64(7), int64(2)\n",
541 |       "memory usage: 293.8 KB\n"
542 |      ]
543 |     }
544 |    ],
545 |    "source": [
546 |     "#Assign numeric values to sex.\n",
547 |     "abalone_data = abalone_data.replace({'Sex': {'M':1,'F':2,'I':3}})\n",
548 |     "abalone_data.info()"
549 |    ]
550 |   },
551 |   {
552 |    "cell_type": "markdown",
553 |    "metadata": {},
554 |    "source": [
555 |     "**STEP 2**: Separate labels from features."
556 |    ]
557 |   },
558 |   {
559 |    "cell_type": "code",
560 |    "execution_count": 21,
561 |    "metadata": {},
562 |    "outputs": [
563 |     {
564 |      "name": "stdout",
565 |      "output_type": "stream",
566 |      "text": [
567 |       "The dataframe object after deleting the column\n",
568 |       "<class 'pandas.core.frame.DataFrame'>\n",
569 |       "RangeIndex: 4177 entries, 0 to 4176\n",
570 |       "Data columns (total 8 columns):\n",
571 |       " #   Column          Non-Null Count  Dtype  \n",
572 |       "---  ------          --------------  -----  \n",
573 |       " 0   Sex             4177 non-null   int64  \n",
574 |       " 1   Length          4177 non-null   float64\n",
575 |       " 2   Diameter        4177 non-null   float64\n",
576 |       " 3   Height          4177 non-null   float64\n",
577 |       " 4   Whole weight    4177 non-null   float64\n",
578 |       " 5   Shucked weight  4177 non-null   float64\n",
579 |       " 6   Viscera weight  4177 non-null   float64\n",
580 |       " 7   Shell weight    4177 non-null   float64\n",
581 |       "dtypes: float64(7), int64(1)\n",
582 |       "memory usage: 261.2 KB\n"
583 |      ]
584 |     }
585 |    ],
586 |    "source": [
587 |     "y = abalone_data.pop('Rings')\n",
588 |     "print('The dataframe object after deleting the column')\n",
589 |     "abalone_data.info()"
590 |    ]
591 |   },
592 |   {
593 |    "cell_type": "markdown",
594 |    "metadata": {},
595 |    "source": [
596 |     "**STEP 3**: Examining the feature scales\n",
597 |     "\n",
598 |     "#### Statistical method\n",
599 |     "Check the scales of different features with ``describe()`` method of dataframe."
600 |    ]
601 |   },
602 |   {
603 |    "cell_type": "code",
604 |    "execution_count": 23,
605 |    "metadata": {},
606 |    "outputs": [
607 |     {
608 |      "data": {
609 |       "text/html": [
610 |        "<div>\n",
611 |        "<style scoped>\n",
612 |        "    .dataframe tbody tr th:only-of-type {\n",
613 |        "        vertical-align: middle;\n",
614 |        "    }\n",
615 |        "\n",
616 |        "    .dataframe tbody tr th {\n",
617 |        "        vertical-align: top;\n",
618 |        "    }\n",
619 |        "\n",
620 |        "    .dataframe thead th {\n",
621 |        "        text-align: right;\n",
622 |        "    }\n",
623 |        "</style>\n",
624 |        "<table border=\"1\" class=\"dataframe\">\n",
625 |        "  <thead>\n",
626 |        "    <tr style=\"text-align: right;\">\n",
627 |        "      <th></th>\n",
628 |        "      <th>count</th>\n",
629 |        "      <th>mean</th>\n",
630 |        "      <th>std</th>\n",
631 |        "      <th>min</th>\n",
632 |        "      <th>25%</th>\n",
633 |        "      <th>50%</th>\n",
634 |        "      <th>75%</th>\n",
635 |        "      <th>max</th>\n",
636 |        "    </tr>\n",
637 |        "  </thead>\n",
638 |        "  <tbody>\n",
639 |        "    <tr>\n",
640 |        "      <th>Sex</th>\n",
641 |        "      <td>4177.0</td>\n",
642 |        "      <td>1.955470</td>\n",
643 |        "      <td>0.827815</td>\n",
644 |        "      <td>1.0000</td>\n",
645 |        "      <td>1.0000</td>\n",
646 |        "      <td>2.0000</td>\n",
647 |        "      <td>3.000</td>\n",
648 |        "      <td>3.0000</td>\n",
649 |        "    </tr>\n",
650 |        "    <tr>\n",
651 |        "      <th>Length</th>\n",
652 |        "      <td>4177.0</td>\n",
653 |        "      <td>0.523992</td>\n",
654 |        "      <td>0.120093</td>\n",
655 |        "      <td>0.0750</td>\n",
656 |        "      <td>0.4500</td>\n",
657 |        "      <td>0.5450</td>\n",
658 |        "      <td>0.615</td>\n",
659 |        "      <td>0.8150</td>\n",
660 |        "    </tr>\n",
661 |        "    <tr>\n",
662 |        "      <th>Diameter</th>\n",
663 |        "      <td>4177.0</td>\n",
664 |        "      <td>0.407881</td>\n",
665 |        "      <td>0.099240</td>\n",
666 |        "      <td>0.0550</td>\n",
667 |        "      <td>0.3500</td>\n",
668 |        "      <td>0.4250</td>\n",
669 |        "      <td>0.480</td>\n",
670 |        "      <td>0.6500</td>\n",
671 |        "    </tr>\n",
672 |        "    <tr>\n",
673 |        "      <th>Height</th>\n",
674 |        "      <td>4177.0</td>\n",
675 |        "      <td>0.139516</td>\n",
676 |        "      <td>0.041827</td>\n",
677 |        "      <td>0.0000</td>\n",
678 |        "      <td>0.1150</td>\n",
679 |        "      <td>0.1400</td>\n",
680 |        "      <td>0.165</td>\n",
681 |        "      <td>1.1300</td>\n",
682 |        "    </tr>\n",
683 |        "    <tr>\n",
684 |        "      <th>Whole weight</th>\n",
685 |        "      <td>4177.0</td>\n",
686 |        "      <td>0.828742</td>\n",
687 |        "      <td>0.490389</td>\n",
688 |        "      <td>0.0020</td>\n",
689 |        "      <td>0.4415</td>\n",
690 |        "      <td>0.7995</td>\n",
691 |        "      <td>1.153</td>\n",
692 |        "      <td>2.8255</td>\n",
693 |        "    </tr>\n",
694 |        "    <tr>\n",
695 |        "      <th>Shucked weight</th>\n",
696 |        "      <td>4177.0</td>\n",
697 |        "      <td>0.359367</td>\n",
698 |        "      <td>0.221963</td>\n",
699 |        "      <td>0.0010</td>\n",
700 |        "      <td>0.1860</td>\n",
701 |        "      <td>0.3360</td>\n",
702 |        "      <td>0.502</td>\n",
703 |        "      <td>1.4880</td>\n",
704 |        "    </tr>\n",
705 |        "    <tr>\n",
706 |        "      <th>Viscera weight</th>\n",
707 |        "      <td>4177.0</td>\n",
708 |        "      <td>0.180594</td>\n",
709 |        "      <td>0.109614</td>\n",
710 |        "      <td>0.0005</td>\n",
711 |        "      <td>0.0935</td>\n",
712 |        "      <td>0.1710</td>\n",
713 |        "      <td>0.253</td>\n",
714 |        "      <td>0.7600</td>\n",
715 |        "    </tr>\n",
716 |        "    <tr>\n",
717 |        "      <th>Shell weight</th>\n",
718 |        "      <td>4177.0</td>\n",
719 |        "      <td>0.238831</td>\n",
720 |        "      <td>0.139203</td>\n",
721 |        "      <td>0.0015</td>\n",
722 |        "      <td>0.1300</td>\n",
723 |        "      <td>0.2340</td>\n",
724 |        "      <td>0.329</td>\n",
725 |        "      <td>1.0050</td>\n",
726 |        "    </tr>\n",
727 |        "  </tbody>\n",
728 |        "</table>\n",
729 |        "</div>"
730 |       ],
731 |       "text/plain": [
732 |        "                 count      mean       std     min     25%     50%    75%  \\\n",
733 |        "Sex             4177.0  1.955470  0.827815  1.0000  1.0000  2.0000  3.000   \n",
734 |        "Length          4177.0  0.523992  0.120093  0.0750  0.4500  0.5450  0.615   \n",
735 |        "Diameter        4177.0  0.407881  0.099240  0.0550  0.3500  0.4250  0.480   \n",
736 |        "Height          4177.0  0.139516  0.041827  0.0000  0.1150  0.1400  0.165   \n",
737 |        "Whole weight    4177.0  0.828742  0.490389  0.0020  0.4415  0.7995  1.153   \n",
738 |        "Shucked weight  4177.0  0.359367  0.221963  0.0010  0.1860  0.3360  0.502   \n",
739 |        "Viscera weight  4177.0  0.180594  0.109614  0.0005  0.0935  0.1710  0.253   \n",
740 |        "Shell weight    4177.0  0.238831  0.139203  0.0015  0.1300  0.2340  0.329   \n",
741 |        "\n",
742 |        "                   max  \n",
743 |        "Sex             3.0000  \n",
744 |        "Length          0.8150  \n",
745 |        "Diameter        0.6500  \n",
746 |        "Height          1.1300  \n",
747 |        "Whole weight    2.8255  \n",
748 |        "Shucked weight  1.4880  \n",
749 |        "Viscera weight  0.7600  \n",
750 |        "Shell weight    1.0050  "
751 |       ]
752 |      },
753 |      "execution_count": 23,
754 |      "metadata": {},
755 |      "output_type": "execute_result"
756 |     }
757 |    ],
758 |    "source": [
759 |     "abalone_data.describe().T"
760 |    ]
761 |   },
762 |   {
763 |    "cell_type": "code",
764 |    "execution_count": null,
765 |    "metadata": {},
766 |    "outputs": [],
767 |    "source": []
768 |   }
769 |  ],
770 |  "metadata": {
771 |   "kernelspec": {
772 |    "display_name": "ML",
773 |    "language": "python",
774 |    "name": "python3"
775 |   },
776 |   "language_info": {
777 |    "codemirror_mode": {
778 |     "name": "ipython",
779 |     "version": 3
780 |    },
781 |    "file_extension": ".py",
782 |    "mimetype": "text/x-python",
783 |    "name": "python",
784 |    "nbconvert_exporter": "python",
785 |    "pygments_lexer": "ipython3",
786 |    "version": "3.13.5"
787 |   },
788 |   "orig_nbformat": 4
789 |  },
790 |  "nbformat": 4,
791 |  "nbformat_minor": 2
792 | }
793 | 


--------------------------------------------------------------------------------
/MLP/Week_11.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "nbformat": 4,
  3 |   "nbformat_minor": 0,
  4 |   "metadata": {
  5 |     "colab": {
  6 |       "name": "Week_11.ipynb",
  7 |       "provenance": [],
  8 |       "collapsed_sections": []
  9 |     },
 10 |     "kernelspec": {
 11 |       "name": "python3",
 12 |       "display_name": "Python 3"
 13 |     },
 14 |     "language_info": {
 15 |       "name": "python"
 16 |     }
 17 |   },
 18 |   "cells": [
 19 |     {
 20 |       "cell_type": "markdown",
 21 |       "source": [
 22 |         "## [Lecture 11.1: K-means clustering on digit dataset](https://www.youtube.com/watch?v=-tPSKI9nUf0)"
 23 |       ],
 24 |       "metadata": {
 25 |         "id": "1VQs-b78IKar"
 26 |       }
 27 |     },
 28 |     {
 29 |       "cell_type": "markdown",
 30 |       "source": [
 31 |         "In this notebook, we will implement `K-Means` algorithm with `sklearn`"
 32 |       ],
 33 |       "metadata": {
 34 |         "id": "LaG01b-nIaqm"
 35 |       }
 36 |     },
 37 |     {
 38 |       "cell_type": "code",
 39 |       "execution_count": 1,
 40 |       "metadata": {
 41 |         "id": "PLkFDybvDsNc"
 42 |       },
 43 |       "outputs": [],
 44 |       "source": [
 45 |         "import matplotlib.pyplot as plt\n",
 46 |         "import numpy as np\n",
 47 |         "import pandas as pd\n",
 48 |         "\n",
 49 |         "# KMeans clustering\n",
 50 |         "from sklearn.cluster import KMeans\n",
 51 |         "\n",
 52 |         "# Loading the digit dataset\n",
 53 |         "from sklearn.datasets import load_digits\n",
 54 |         "\n",
 55 |         "# Selecting k through silhouette score\n",
 56 |         "from sklearn.metrics import silhouette_score\n",
 57 |         "\n",
 58 |         "# Normalization through MinMaxScaler\n",
 59 |         "from sklearn.preprocessing import MinMaxScaler\n",
 60 |         "\n",
 61 |         "from sklearn.pipeline import Pipeline"
 62 |       ]
 63 |     },
 64 |     {
 65 |       "cell_type": "markdown",
 66 |       "source": [
 67 |         "### Clustering of digits\n",
 68 |         "We will use the digits dataset for clustering, which is loaded through the `load_digits` API\n",
 69 |         "- It loads 8x8 digit images, with approximately 180 samples per class\n",
 70 |         "- Across the 10 classes, it has a total of 1797 images\n",
 71 |         "- Each pixel has value between 0 and 16"
 72 |       ],
 73 |       "metadata": {
 74 |         "id": "lGPHpCn2JXt_"
 75 |       }
 76 |     },
 77 |     {
 78 |       "cell_type": "code",
 79 |       "source": [
 80 |         "digits = load_digits()"
 81 |       ],
 82 |       "metadata": {
 83 |         "id": "7SnsK4ehJGEu"
 84 |       },
 85 |       "execution_count": 2,
 86 |       "outputs": []
 87 |     },
 88 |     {
 89 |       "cell_type": "markdown",
 90 |       "source": [
 91 |         "Let's quickly check `KMeans` class as implemented in `sklearn.cluster` module"
 92 |       ],
 93 |       "metadata": {
 94 |         "id": "pAW_Y6ReJ72R"
 95 |       }
 96 |     },
 97 |     {
 98 |       "cell_type": "code",
 99 |       "source": [
100 |         "?KMeans"
101 |       ],
102 |       "metadata": {
103 |         "id": "xdPQSYfuJ4zI"
104 |       },
105 |       "execution_count": 3,
106 |       "outputs": []
107 |     },
108 |     {
109 |       "cell_type": "markdown",
110 |       "source": [
111 |         "Some of the important parameters are as follows:\n",
112 |         "- `init`\n",
113 |         "- `n_init`\n",
114 |         "- `max_iter`\n",
115 |         "- `random_state`\n",
116 |         "\n",
117 |         "Since the KMeans algorithm is susceptible to local minima, we run the `KMeans` fit multiple times with different centroid initializations and select the run with the lowest sum of squared error.\n",
118 |         "\n",
119 |         "The total number of times we would like to run the KMeans algorithm is specified through the `n_init` parameter.\n",
120 |         "\n",
121 |         "`max_iter` specifies the maximum number of iterations allowed for a single run of the algorithm."
122 |       ],
123 |       "metadata": {
124 |         "id": "Q1jnSaOvKHl7"
125 |       }
126 |     },
127 |     {
128 |       "cell_type": "markdown",
129 |       "source": [
130 |         "Let's define parameters of KMeans clustering algorithm in a dictionary object."
131 |       ],
132 |       "metadata": {
133 |         "id": "M_ViEA2VK3ug"
134 |       }
135 |     },
136 |     {
137 |       "cell_type": "code",
138 |       "source": [
139 |         "kmeans_kwargs = {\n",
140 |         "    'init': 'random',\n",
141 |         "    'n_init': 50,\n",
142 |         "    'max_iter': 500,\n",
143 |         "    'random_state': 0\n",
144 |         "}"
145 |       ],
146 |       "metadata": {
147 |         "id": "-TY1TIbcKEWZ"
148 |       },
149 |       "execution_count": 4,
150 |       "outputs": []
151 |     },
152 |     {
153 |       "cell_type": "markdown",
154 |       "source": [
155 |         "Let's define a pipeline with two stages:\n",
156 |         "- Preprocessing for feature scaling with `MinMaxScaler`.\n",
157 |         "- Clustering with `KMeans` clustering algorithm"
158 |       ],
159 |       "metadata": {
160 |         "id": "iOV6zonNoADt"
161 |       }
162 |     },
163 |     {
164 |       "cell_type": "code",
165 |       "source": [
166 |         "pipeline = Pipeline([('Preprocess', MinMaxScaler()),\n",
167 |         "                     ('Clustering', KMeans(n_clusters=10, **kmeans_kwargs))])\n",
168 |         "pipeline.fit(digits.data)"
169 |       ],
170 |       "metadata": {
171 |         "id": "W8Qx_q-WLLUm",
172 |         "colab": {
173 |           "base_uri": "https://localhost:8080/"
174 |         },
175 |         "outputId": "e02c0439-4700-4cf7-ae26-5d78dd383a19"
176 |       },
177 |       "execution_count": 5,
178 |       "outputs": [
179 |         {
180 |           "output_type": "execute_result",
181 |           "data": {
182 |             "text/plain": [
183 |               "Pipeline(steps=[('Preprocess', MinMaxScaler()),\n",
184 |               "                ('Clustering',\n",
185 |               "                 KMeans(init='random', max_iter=500, n_clusters=10, n_init=50,\n",
186 |               "                        random_state=0))])"
187 |             ]
188 |           },
189 |           "metadata": {},
190 |           "execution_count": 5
191 |         }
192 |       ]
193 |     },
194 |     {
195 |       "cell_type": "markdown",
196 |       "source": [
197 |         "The cluster centroids can be accessed via `cluster_centers_` member variable of `KMeans` class."
198 |       ],
199 |       "metadata": {
200 |         "id": "mlYS_0dVoxrT"
201 |       }
202 |     },
203 |     {
204 |       "cell_type": "code",
205 |       "source": [
206 |         "cluster_centers = pipeline[-1].cluster_centers_"
207 |       ],
208 |       "metadata": {
209 |         "id": "ugJKlEesoggn"
210 |       },
211 |       "execution_count": 6,
212 |       "outputs": []
213 |     },
214 |     {
215 |       "cell_type": "markdown",
216 |       "source": [
217 |         "Let's display cluster centroids:"
218 |       ],
219 |       "metadata": {
220 |         "id": "eW0OpJTPpAu0"
221 |       }
222 |     },
223 |     {
224 |       "cell_type": "code",
225 |       "source": [
226 |         "# displaying centroids\n",
227 |         "fig, ax = plt.subplots(5, 2, figsize=(4, 4))\n",
228 |         "for i, j in zip(ax.flat, cluster_centers.reshape(10, 8, 8)):\n",
229 |         "    i.imshow(j)"
230 |       ],
231 |       "metadata": {
232 |         "colab": {
233 |           "base_uri": "https://localhost:8080/",
234 |           "height": 267
235 |         },
236 |         "id": "5NMKg5AWo_S1",
237 |         "outputId": "009b925a-5004-4347-ccc7-77bbd6a47e33"
238 |       },
239 |       "execution_count": 7,
240 |       "outputs": [
241 |         {
242 |           "output_type": "display_data",
243 |           "data": {
244 |             "text/plain": [
245 |               "<Figure size 288x288 with 10 Axes>"
246 |             ],
247 |             "image/png": "iVBORw0KGgoAAAANSUhEUgAAALsAAAD6CAYAAAD5lDajAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAZMklEQVR4nO2da3RU13XH/1ujxwi9kCzEQyiywCgYuya2KdgYrxjbGFI7wU28HHCSrjZtlUdpHk2yTNLlldp1kpXWabvycBvFIU3iYJI4kY1jzNOJiUnAEo1TbAxYCGQkBEgggZDQY0a7HzTUOrNHM/dKuqO5c/bvi9hb+957NHvP4Zx79tmHmBmKYgMZk90ARUkWGuyKNWiwK9agwa5Ygwa7Yg0a7Io1OAp2IlpFRIeJqJGI1nvdKGVySHc/U6L37EQUAHAEwAoALQDqAaxl5oOjXZNNORxEnnmfrCxD7pudKa5bUNAudIMxmtd4broh57RdEjY8NGQ+Dz0Y4H4arc2249bPsXyMqE93aGrU7wEUzuwWukwaErqO9iJDzupI7GMA6EZnBzNPi9VmGXGSxQAambkJAIhoE4DVAEYN9iDysITuMB80o9yQD/1zmbjuxeWPC93psPyD7nnqs4Y89yuvCZuhbvND3ce7RmuuMowrP8fyMWWa4dR91yJx3Z3/+FuhK828KHTf/+7dhjzziVeFzVBvr9Dt5KebY7UXcDaMKQdwYoTcEtEp6UXa+9lJz+4IIqoBUAMAQUyZqNsqKYTffeykZ28FUDFCnh3RGTBzLTMvYuZFWciZqPYpySOhn/3uYyc9ez2AeURUheE/fg2ABxJeReZspXNZhSE/uGizuGTxno8J3dLKY0K39j27zQb+6Br5/INyIqTExb2fo3ycMdWcVJ5cFRKX3F0ox94H+iqELvc9p81HvTBDPv/NprjNiyZhsDNziIjWAdgGIABgAzO/7uopSspjg58djdmZeQuALR63RZlk0t3PuoKqWMOEvY0RRC1WTT3QachPrr9HXDJDvlJH6cPyHewLLQtMmzNnx9BAZdxE+Zhyc83fk1wR3Hz+BqEryewRut7+bNPmYqewcYv27Io1aLAr1qDBrliDBrtiDd5NUKMIHzxiyPkdMjHtjUcrhe5bJb8Xus3P32TIJd1vjbN1yoQwOGiIgbNZwqQmhj+lFfCTF1Yacrj9zXE1DdCeXbEIDXbFGjTYFWvQYFesIWkTVEF/v1BNe1k258c33ix0H713pyHv2nqLsKE9MrtO8RYOmVmO4Xy5JP6OzHyhe6ZH6qY1XIh777GgPbtiDRrsijVosCvW4GjMTkTHAXQDCAMIMbPcNq74nnT3s5sJ6nJm7hjrgzKi0j+peKqwKX1WVm3Y079E6O5/aKshtyyXm38r9rhtoRJhzH6OTvF9Z7XYqoxHO+YLXQZkKvD56gJDLvqjDFW3k1YdxijW4DTYGcB2ItofKacgIKIaImogooZByNeKii+I62e/+9jpMGYZM7cSURmAHUR0iJmNLf7MXAugFgAKqUTPrvEncf3sdx873XDdGvl5hojqMFwqbXf8q0woz6z7d3jdTNmYi7IA1cCVfUK3dIqZAfe9cIznRZViw/jXJNKe8fqZ+8ze/nT3FcKmo0guIJVmya2XPbPMQUdxvqwbGe4677RpABwMY4goj4gKLv8bwF0AZHFFxdfY4GcnPft0AHU0XBAnE8BGZt4a/xLFh6S9n50USWoCsDAJbVEmERv8rK8eFWtIXtYjmxlwuXMuCJMDSzYK3bFBOXlZsWedIVdvlnVjONusO0Jh/V5POFG1Hoc6zdouwU1zxSVlX5Q1OGdly5ow2ReiXvYEAmNooIlGgGINGuyKNWiwK9agwa5YQ8LT8sZ0U6J2AM0ASgGMOVMyAW7vXTnaKWqKe0b4GPCJnz0J9v+/OVGDVznRXt5bcYdf/KzD
GMUaNNgVa/A62Gt9em/FHb7ws6djdkVJJRz17ES0iogOE1EjEa33ulHK5JDufk7YsxNRAMARACswfMR3PYC1zBzzTHsAyKYcDsJMtqfcoCFzhdxxUR2UyfgHukqFLtg2YN5rMPHOjD70YID7KaGhpbj1cywfi3tmyWLUmXNllbDeQWmXfcK044EBYROLbnR2jPbq0Uki2GIAjZEUUBDRJgCrAcT8EIgoUIBiLKE7DH3GVeaucv6mTAjaOv95oavaLLe8LvinZkMOnTotbKLZx7sS2liOKz8HkSd8HE3m9FlCV/yDXqF79ZTcoVb5OTMBMHSsWdjEYic/Paqhk2FMOYATI+SWiG40FjtqlZJquPWz75iwFN/IbvQaAMV+3HmuJGaEjxGErNWT6jjp2VsBjDxcfnZEZ8DMtZGVrgezkDNBzVOSSEI/X/YxMy/yo4+d9Oz1AOYRURWG//g1AB6IY98Kkrv725aXGHLd3CfEhb+4KMd4f3nzy0L36+vNEtU5LyQesysJcetnSdRmjiN/L8/I2lPxmNAtf+4LQsfdp1w92glO9qCGiGgdgG0AAgA2MPPrcS6pn6jGKcljDH72HU7rxmwBsMWhbagwoySxoZJyuPGzH9HcGMUavNlwzQCHzUWjrIvm4tXtOz8jLss9li10n3rgWaEbyta1oVQkUFJsyJ+4Z5uwWf6KXDepfEKecRo+e27iGhZBe3bFGjTYFWvQYFesQYNdsQbvKoJFZVOWbT1myEXH5AJS85/JDMypAZk4FOxwlgGnJJfwXDOV5obc3wibb7XfKXTdt84RuoLD5uvr8MEj8oEu92Joz65Ygwa7Yg0a7Io1aLAr1pC0ktWhNjOLLTtf5kOXv0tu2drc8S6hy2rrMu89zrYpE8PFStOnC7NlufGHb/ul0L21VG693N52tSFn/csNwiZz135X7dOeXbEGDXbFGjTYFWtwNGYnouMAugGEAYS0oGh6ku5+djNBXc7MYy5LTFlm+m7H0unC5qaSBqF7rv56oasuvWTIgQvycNlwhzxnSXHEmP1c9Lp5NtIg5ArnxpNLhK7tWbl9L/895guNU38jX0NcVV8oGxHnHGAdxijW4DTYGcB2ItofKacgIKIaImogogYtpeFb4vrZ7z52OoxZxsytRFQGYAcRHWJm40x7Zq5FpOJqIZVotVR/EtfPfvex0w3XrZGfZ4ioDsNVv3bHv8okMKPMkBd+8n+FzWen/VrorrxVjr2bFpml/J6vlwczL/jXfEOmFrnlTzEZr5/Dh44a8n0HPyxsPlgh52VPhOWYvXpquyFPy+0RNgNZ7tZEEw5jiCiPiAou/xvAXQBec/UUJeWxwc9OvhrTAdTRcAGcTAAbmXmrp61SJoO097OTIklNAOQ4QUkrbPCzvnpUrCFpWY88xTyM4HRfgbCpysoXupX5sgJbfaY5odk1rVrY9M0xM+mG2pP2p9rLkFkrqODjMov1hR9cK3R/+NLjQvezi0WG/NhXZNnJ4s5XXDVPe3bFGjTYFWvQYFesQYNdsQZPzkElonYAzQBKAYw5UzIBbu9dOdopaop7RvgY8ImfPT30l4gavMqJ9vLeijv84mcdxijWoMGuWIPXwV7r03sr7vCFnz0dsytKKuGoZyeiVUR0mIgaiWi9141SJod093PCnp2IAgCOAFiB4SO+6wGsZeaYZ9oDQDblcBB5CZ4sz0XqL5dVwq4o7Ba6nmOmHV/qi/8sAH3owQD362FMo+DWzzF9HOXTUGmMqm/T5VvEKRlhoWsZMHNjLp0NCpusLrk18MJge8dorx6dZEctBtAYSQEFEW0CsBpAzA+BiAIFKMYSuiPuTSlHnpB89NOyxNmHV74kdHs/dJ0hDx2UB1BFJyXt411x26O483MQecLH0T5t/4D05yNf+IHQ/WmO3I32DyfuNuQ3fni1sJnxbJPQbW37TrNQRnAyjCkHcGKE3BLRjcZiB/dUUg+3fvYdE5b3GtmNXgOg2I87z5XE
jPAxgpBDlFTHSc/eCqBihDw7ojNg5trISteDWZBDFCXlSejnyz5m5kV+9LGTnr0ewDwiqsLwH78GgMykfxvxRYhF//LrhO4X9/+70N37m78Tuvl95oGwmWWy5PHQBXNiS5d0/SwBbv0sCJSaldnmfkSeg/Ti+QVC95/dsjrc/TPqDflAUF7H/e7O1nKyBzVEROsAbAMQALCBmeX2obepj/M7JUUZg599h9O6MVsAbHFoGyqkksSGSsrhxs9+RP9vV6whabuQA1PNRYKiL74lbH54dqnQle3IErq2lTMNecoZuShRtDNqvNin60meE7WodPjpdwqTC3vkImHT++VG+wMrzSq+xW8OCpuhbnmveGjPrliDBrtiDRrsijVosCvW4N0ENWqycmqNuShQf9V3xCULv7lO6AaulVmZ195iJn69/tJVwqbgGbPEMQ/J6lTKBBMyj4IJ9EnfHb9HVoL75Pvk284hNvvh81fKFxU5YfliIh7asyvWoMGuWIMGu2INGuyKNXg3QY3a7lf0gZOGHCD5PaObuoTukQUvCN3KKWZi5S3bPy8fPxiVEacbyz2HC82V0Au3yu2S316yUehWTZH7H+47eqchZ/XE8J9Ln2rPrliDBrtiDRrsijU4GrMT0XEA3QDCAEJaUDQ9SXc/u5mgLmdmR6WDiQgZQbPOx6mXzY3qnyuRZRb+uvp3QremoFPo7jv6PkO+8qenhI27tTVlBI79HA31XDLkgj1yu92uq+X2up93yBpDZx6bY8hXvCiPZHW7Jq7DGMUanAY7A9hORPsj5RQERFRDRA1E1DCgpTT8Slw/j/SxH8ulOB3GLGPmViIqA7CDiA4xs3GmPTPXIlJxtSjjCn2p7U/i+nmkjwupxHc+drrhujXy8wwR1WG46tfuOPYYGjC3Uc154rghv/rb68V1de+XzVn93m8IXevjZpZj4Zt7R2274hy3fo4mdLLNkEsOzxQ2c4NnhO5Xz6wWuiu3/Y8hD/UlrueZiITDGCLKI6KCy/8GcBcAOVtQfI0NfnbSs08HUEfD+emZADYy81ZPW6VMBmnvZydFkpoALExCW5RJxAY/66tHxRq8y3qMqo8eajWzHrPPmvUaAYDvl/Uff3XxGqEr/qN5rS4gpQYUCBhy202y+On5sKz+W7GzR+g4PPHbKLVnV6xBg12xBg12xRo02BVr8OQcVCJqB9AMoBTAmDLoHOD23pWjnaKmuGeEjwGf+NnTQ3+JqMGrnGgv7624wy9+1mGMYg0a7Io1eB3stT69t+IOX/jZ0zG7oqQSjnp2IlpFRIeJqJGI1nvdKGVySHc/J+zZiSgA4AiAFRg+4rsewFpmjnmmPQBkUw4HITfRJmxMblDoplRdErqpgV5DfutkmbAJnDNt+rgHA6wHK42GWz878XH4Cvn7ylmnhe5cWJ6p1HPCzKGh7l5hE4tudHaM9urRSSLYYgCNkRRQENEmAKsBxPwQiChQgGIsoTscNW4kGdXzhe7GH8mjON9XZO5iWffwp4RNyVOmzd5+WUZPMXDl5yDyEvq46+6bhe67j/yH0D3VtUToGj5/oyFn7tof91mX2clPN4/2OyfDmHIAJ0bILRHdaCx21Col1XDrZ98xYSm+kd3oNQCK/bjzXEnMCB8jCJmqm+o46dlbAVSMkGdHdAbMXBtZ6XowCzKPWUl5Evr5so+ZeZEffeykZ68HMI+IqjD8x68B8EAce/FFACDOWMpYeLUwOf2I3IbxaNkBodvbZ35H+0rkvJOinqcz04S49bMgMM2cFz785Q3CZkpGSOiCGfJA35u/8Yoh/2HVLGETOiUnu/Fwsgc1RETrAGwDEACwgZnlrPFt6l21QEkJxuBn3+G0bswWAPJIs9i2oUIqGVejlMnBjZ/9iObGKNbg3YbrKALVcw35xJelzbeu+bnQ/exikdDdmGNOC/JPys25HHUmJ0PTIrzmwrvNyrurpuwQNgu//gWhm7n7vNCtecq8dvdN8p197jPu
xuzasyvWoMGuWIMGu2INGuyKNSRtgtr1rlJDfuiap4TN8cFSoVuae0zoZmeaq3dTTg0Im+gJqs5Pvef8HLMi2NZeuco66yV51i2/3ih0/3bITDLrvSUgbOY+46592rMr1qDBrliDBrtiDRrsijUkbYJaePSiIX9p81phE86XK6Ffvk3OQs5lm2f3ZHbL/Hmdjyafvqgzxfb1zBU21NoudEOD8gXD4KAZmkNl498joT27Yg0a7Io1aLAr1uBozE5ExwF0Y/hEl5AWFE1P0t3Pbiaoy5l5zGWJM46bk8qrfjJD2FyoLpAX3iZVRwfNOjGBMzJFVG7+UhwyZj/P3Gtuq7x/TYOw2VeyQOgCRbJuzAfmvWrIdT+9dSxNMtBhjGINToOdAWwnov2RcgoCIqohogYiatBSGr4lrp/97mOnw5hlzNxKRGUAdhDRIWY2zrRn5lpEKq4WUom+5vYncf3sdx873XDdGvl5hojqMFz1a3f8q0zCUeeeUvdFYZMzXZ6DOi/7lNAd6KswZC7wX8GeVGS8fs5/+agh97PMVAw/3id0BdlS96Gp+wz5lZducNqMUUk4jCGiPCIquPxvAHcBeG3cT1ZSChv87KRnnw6gLlJ0KBPARmbe6mmrlMkg7f3spEhSE4CFSWiLMonY4Gd99ahYQ9KyHhF16AH3y1dXwVM9Qve73nlC13xJbt9TJp9wx1lD/vhDnxY2T3/1MaHrZVmJ875vmvVlZu2X9dndvg7Snl2xBg12xRo02BVr0GBXrMGTc1CJqB1AM4BSAGPOlEyA23tXjnaKmuKeET4GfOJnTw/9JaIGr3Kivby34g6/+FmHMYo1aLAr1uB1sNf69N6KO3zhZ0/H7IqSSjjq2YloFREdJqJGIlrvdaOUySHd/ZywZyeiAIAjAFZg+IjvegBrmTnmmfYAkE05HERe9H0MuX+m3HAxs6RT6AaH5AaAs2cLzed1yOR/Dpubf/vQgwHu1+NQR8Gtn2P5GFOChhgul7FVniN9nEPy/NvzQ7mGfLZNnq0VOCdzqbrR2THaq0cniWCLATRGUkBBRJsArAYQ80MgokABirGEzPraGUHzgzj2ievFtQ998GdC1zIgj5l88kcrDLnie/K4znCXWXFgH++K1VzlbVz5OYg84WOaf40hn/+q7IQerZblDOdkyuoQW3rMQ6F//NW7hU3Rk3uFbic/3SyUEZwMY8oBnBght0R0o7HYwT2V1MOtn33HhKX4Rnaj1wAo9uPOcyUxI3yMIPy379dJz94KYOQO59kRnQEz10ZWuh7MgjxeREl5Evr5so+ZeZEffeykZ68HMI+IqjD8x68B8EAce/FFAIDQovmG/PJfyST+5Q1/K3RZATl5qbqnyZDDP5eTF3TJcaASF7d+Fpy7znxx8MrCnwib/+qSI6Oa3/2F0AVazTleVaOcjLrFyR7UEBGtA7ANQADABmaWM8K3qR93q5SkMwY/+w6ndWO2ANji0DZUSPINipL6uPGzH9HcGMUakrbh+kKVuUiwr/8KYZP7jBx7d8qir5h9u/kq9Vj+leNqmzJGohYKe2aZ8vmhS+KSb/9wtdBVP39O6DI6Thpy9GZuQDdcK8qoaLAr1qDBrliDBrtiDUmboBY19hry91rfLWzu/dyLQvfu/DeE7tW+SkN+c+p8YaPf4uSTGZX31TskFwQ/+hFZK/Xb71ghdNUbzGxXPjP+/dwaE4o1aLAr1qDBrliDBrtiDUmboAb+cNiQz3/tT4TNkzfOFbr/zrtd6N670jxv5+y1ucJm2stuW6i4JmpL58zdZqbp0vmfFZeUV8qV0MpqeW5W2zIzO7K8qVDYhDvlFr94aM+uWIMGu2INGuyKNTgasxPRcQDdAMIAQlpQND1Jdz+7maAuZ+YxL2NxKGTIOWdlmYXK57qFjvoGha7rdnOzb++ssbZKicHY/fxaoyEu+Np0YXL6Trktb+DP5eMuzYxK4M0Yf8kfHcYo1uA02BnAdiLaHymnICCi
GiJqIKIGLaXhW+L62e8+djqMWcbMrURUBmAHER1iZuNMe2auRaTiaiGVaLVUfxLXz373sdMN162Rn2eIqA7DVb92x7/KJKPIXBR442Oy7kj+oQKhy+mSn+m9Rb835L2XrovxwKgakTIBT4livH7OyDdrP55aGWN8vkqWOPlgxQGh+8UWczGRL8k5nlsSDmOIKI+ICi7/G8BdAF4b95OVlMIGPzvp2acDqItU4c0EsJGZZVKy4nfS3s9OiiQ1AViYhLYok4gNftZXj4o1JC3rceiiWasvpzVb2Gxa9w2huyZbZjTe9tq9hlyxUy5G+e5VQTowwzwDoOrDbwqT71c9J3TXP/sZobv6l2Y9z1Bvr7Bxi/bsijVosCvWoMGuWIMGu2INnpyDSkTtAJoBlAIYf8GP2Li9d+Vop6gp7hnhY8Anfvb00F8iavAqJ9rLeyvu8IufdRijWIMGu2INXgd7rU/vrbjDF372dMyuKKmEDmMUa/Ak2IloFREdJqJGIlrvwf2PE9EBInqViBom+v6KM/zm5wkfxhBRAMARACsAtGD4XNS1zHxwAp9xHMCi8VQ7UMaHH/3sRc++GEAjMzcx8wCATQDkEWmK3/Gdn70I9nIAJ0bILRHdRJKw2oHiOb7zc9Ly2SeYhNUOlLRgQv3sRc/eCqBihDw7opswRu6CB3B5F7ySXHznZy+CvR7APCKqIqJsAGsAbJ6om9uwC94n+M7PEz6MYeYQEa0DsA1AAMAGZn59Ah+R9rvg/YAf/awrqIo16AqqYg0a7Io1aLAr1qDBrliDBrtiDRrsijVosCvWoMGuWMP/AfTeXG+hMjkVAAAAAElFTkSuQmCC\n"
248 |           },
249 |           "metadata": {
250 |             "needs_background": "light"
251 |           }
252 |         }
253 |       ]
254 |     },
255 |     {
256 |       "cell_type": "markdown",
257 |       "source": [
258 |         "In this case, the number of clusters was known. Hence, we set `k=10` and obtained the clusters.\n",
259 |         "\n",
260 |         "To decide the optimal number of clusters with the elbow and silhouette methods, we will pretend that we do not know the number of clusters in the data and try to discover the optimal number through these two methods, one by one:\n",
261 |         "\n",
262 |         "### Elbow method\n",
263 |         "Here, we keep track of the sum of squared errors (SSE) in a list for each value of k."
264 |       ],
265 |       "metadata": {
266 |         "id": "c8Cv2aixqrq5"
267 |       }
268 |     },
269 |     {
270 |       "cell_type": "code",
271 |       "source": [
272 |         "# Identifying the correct number of clusters\n",
273 |         "sse_digit = []\n",
274 |         "scaled_digits = MinMaxScaler().fit_transform(digits.data)\n",
275 |         "for k in range(1, 12):\n",
276 |         "    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)\n",
277 |         "    kmeans.fit(scaled_digits)\n",
278 |         "    sse_digit.append(kmeans.inertia_)"
279 |       ],
280 |       "metadata": {
281 |         "id": "x1YuEMnwpW11"
282 |       },
283 |       "execution_count": 8,
284 |       "outputs": []
285 |     },
286 |     {
287 |       "cell_type": "markdown",
288 |       "source": [
289 |         "Note that SSE for a given clustering output is obtained through `inertia_` member variable."
290 |       ],
291 |       "metadata": {
292 |         "id": "_NFrkwkbvogl"
293 |       }
294 |     },
295 |     {
296 |       "cell_type": "code",
297 |       "source": [
298 |         "plt.plot(range(1, 12), sse_digit)\n",
299 |         "plt.xticks(range(1, 12))\n",
300 |         "plt.xlabel('Number of clusters')\n",
301 |         "plt.ylabel('SSE')\n",
302 |         "plt.show()"
303 |       ],
304 |       "metadata": {
305 |         "colab": {
306 |           "base_uri": "https://localhost:8080/",
307 |           "height": 279
308 |         },
309 |         "id": "qo31vTUTvhDL",
310 |         "outputId": "6acb93ac-8160-42ab-dc8b-1546cc2fcd91"
311 |       },
312 |       "execution_count": 9,
313 |       "outputs": [
314 |         {
315 |           "output_type": "display_data",
316 |           "data": {
317 |             "text/plain": [
318 |               "<Figure size 432x288 with 1 Axes>"
319 |             ],
320 |             "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYsAAAEGCAYAAACUzrmNAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3deXwV1fnH8c+ThAAJSxIJCCSQIIssAkJAVFAURVwqaquitS5V6SLu9qfW/upWW21dfmotLYpbXdC6FGoVRYqKGyTsYUe2JGzRQFjCFvL8/riDRgpcltx7s3zfr9d9Ze6ZM3OesTRP5pyZc8zdERER2Z+4WAcgIiLVn5KFiIiEpWQhIiJhKVmIiEhYShYiIhJWQqwDiIRmzZp5VlZWrMMQEalRpk2b9rW7p+9tX61MFllZWeTl5cU6DBGRGsXMVuxrn7qhREQkLCULEREJS8lCRETCUrIQEZGwlCxERCQsJQsREQlLyUJERMJSsqhk645dPPjeAgpKymIdiohItaJkUcn6sh38/Yvl3PXPfLTOh4jIdyKaLMzsZjOba2b5ZvaqmTUws+fNbJmZzQw+PYO6ZmZPmNkSM5ttZr0qnecKM1scfK6IVLytUhpy+5lH88miYv45syhSzYiI1DgRSxZm1hq4Achx925APDAs2P0rd+8ZfGYGZWcCHYLPcGBkcJ404G7gOKAvcLeZpUYq7suOa0uvNinc9695fLN5e6SaERGpUSLdDZUANDSzBCAJWLWfukOBFz3kSyDFzFoCZwAT3L3E3dcDE4AhkQo4Ls548Ifd2by9nPvfmRepZkREapSIJQt3LwIeBlYCq4FSd/8g2P1A0NX0mJnVD8paAwWVTlEYlO2r/HvMbLiZ5ZlZXnFx8WHF3rFFY345sD3/nLmKSQvXHda5RERqg0h2Q6USulvIBloByWZ2GXAncDTQB0gDbq+K9tx9lLvnuHtOevpeZ9g9KL885SjaN2/Eb97OZ8v28iqIUESk5opkN9RpwDJ3L3b3ncBbwAnuvjroatoOPEdoHAKgCMisdHxGULav8oiqnxDPQz88hlWlW3nkg0WRbk5EpFqLZLJYCfQzsyQzM2AQMD8YhyAoOw/ID+qPAy4PnorqR6jbajXwPjDYzFKDu5XBQVnE9W6bxmXHteW5z5cxY+X6aDQpIlItRXLMYgrwBjAdmBO0NQp42czmBGXNgN8Fh7wLLAWWAE8DvwzOUwLcD+QGn/uCsqj4nyGdaNG4AXe+NYcd5RXRalZEpFqx2vjyWU5OjlflSnkT5q3l2hfzuG1wR0ac2qHKzisiUp2Y2TR3z9nbPr3BfQBO79KCs7u35In/LOGr4s2xDkdEJOqULA7QPT/oSsN68dz51hwqKmrf3ZiIyP4oWRyg9Mb1ueuszkxdVsKY3ILwB4iI1CJKFgfhwpwMTjjqCP7w7nzWbtwW63BERKJGyeIgmBm/P/8Yduyq4Ldj88MfICJSSyhZHKSsZsncfHpH3p+7lvH5q2MdjohIVChZHIJr+mfTtVUTfjt2LqVbd8Y6HBGRiFOyOAQJ8XE8eEF3vt68nQffWxDrcEREIk7J4hAdk9GUawa049WpK/ly6TexDkdEJKKULA7Dzad1pE1aEr9+aw7bdu6KdTgiIhGjZHEYGibG8/vzj2Hp11v483+WxDocEZGIUbI4TP07NONHvTP468dfMX/1xliHIyISEUoWVeCuszrTtGE97nhzNrs0FYiI1EJKFlUgNTmRu8/tyqzCUp7/fHmswxERqXJKFlXkB91bcurRzXn4/YUUlJTFOhwRkSoV0WRhZjeb2VwzyzezV82sgZllm9kUM1tiZq+ZWWJQt37wfUmwP6vSee4Myhea2RmRjPlQmRn3n9eNOIO7/plPbVwnRETqroglCzNrDdwA5Lh7NyAeGAY8BDzm7u2B9cDVwSFXA+uD8seCephZ
l+C4rsAQ4C9mFh+puA9H65SG/M+Qo/lkUTFjZ66KdTgiIlUm0t1QCUBDM0sAkoDVwKmEllsFeIHQOtwAQ4PvBPsHBet0DwXGuPt2d19GaNnVvhGO+5Bd1q8tx7ZJ4d5/zeWbzdtjHY6ISJWI5BrcRcDDwEpCSaIUmAZscPfyoFoh0DrYbg0UBMeWB/WPqFy+l2O+ZWbDzSzPzPKKi4ur/oIOUHyc8dAPu7N5ezm/+/f8mMUhIlKVItkNlUroriAbaAUkE+pGigh3H+XuOe6ek56eHqlmDkjHFo35xcD2vD2jiI8XxS5xiYhUlUh2Q50GLHP3YnffCbwFnAikBN1SABlAUbBdBGQCBPubAt9ULt/LMdXWdaccxVHpyfz6rTls2V4e/gARkWosksliJdDPzJKCsYdBwDxgEvCjoM4VwNhge1zwnWD/fzz0SNE4YFjwtFQ20AGYGsG4q0T9hHge+mF3ijZs5dEJi2IdjojIYYnkmMUUQgPV04E5QVujgNuBW8xsCaExidHBIaOBI4LyW4A7gvPMBV4nlGjGA9e5e42YtS8nK43L+rXhuc+WMbNgQ6zDERE5ZFYb3wfIycnxvLy8WIcBwMZtOxn86CekJNXjX9f3p1683oMUkerJzKa5e87e9uk3V4Q1aVCP+8/rxoI1mxj1ydJYhyMickiULKLg9C4tOPuYljw+cTFLizfHOhwRkYOmZBEld5/bhQYJcdz51hwqNDOtiNQwShZR0rxxA+46uzNTlpXwWl5B+ANERKoRJYsouignk+PbHcHv353P2o3bYh2OiMgBU7KIIjPj9xccw47yCu4eOzfW4YiIHDAliyjLbpbMTad1ZPzcNYzPXxPrcEREDoiSRQxcMyCbLi2b8Nux+ZRu3RnrcEREwlKyiIF68XE89MPufL15Ow+NXxDrcEREwlKyiJFjMppydf9sXpmykilLv4l1OCIi+6VkEUM3n96RzLSG3PnWHLbtrBHTXYlIHaVkEUNJiQn8/vxjWPr1Fp6atCTW4YiI7JOSRYwN6JDOD3tlMPKjr1iwZmOswxER2Ssli2rgN2d3pmnDetz6+iy27lB3lIhUP0oW1UBqciJ//FF35q3eyI1jZrBLc0eJSDUTyTW4O5nZzEqfjWZ2k5ndY2ZFlcrPqnTMnWa2xMwWmtkZlcqHBGVLzOyOSMUcS4M6t+Duc7rwwby1PPDv+bEOR0TkexLCVzk07r4Q6AlgZvGE1s1+G7gKeMzdH65c38y6AMOArkAr4EMz6xjsfgo4HSgEcs1snLvPi1TssXLlidmsLNnKs58tIzOtIVedmB3rkEREgAgmiz0MAr5y9xWh5bj3aigwxt23A8uC5VX7BvuWuPtSADMbE9StdckC4K6zO1O0oYz73plH65SGDO56ZKxDEhGJ2pjFMODVSt9HmNlsM3vWzFKDstZA5bm7C4OyfZXXSvFxxv9dfCzdM1K4YcwMZmntbhGpBiKeLMwsETgX+EdQNBI4ilAX1WrgkSpqZ7iZ5ZlZXnFxcVWcMmYaJsbzzOU5pDeuz9Uv5FJQUhbrkESkjovGncWZwHR3Xwvg7mvdfZe7VwBP811XUxGQWem4jKBsX+Xf4+6j3D3H3XPS09MjcBnRld64Ps9d2Zcd5RVc9XwupWWacFBEYicayeISKnVBmVnLSvvOB/KD7XHAMDOrb2bZQAdgKpALdDCz7OAuZVhQt9Zr37wRoy7PYeU3ZfzspTy2l+sdDBGJjYgmCzNLJvQU01uViv9oZnPMbDZwCnAzgLvPBV4nNHA9HrguuAMpB0YA7wPzgdeDunVCv3ZH8KcLu/Pl0hLueHMO7noHQ0SiL6JPQ7n7FuCIPcp+sp/6DwAP7KX8XeDdKg+whhjaszUFJWU8/MEiMtOSuOX0juEPEhGpQtF6dFYO03WntGdlSRlPTFxMRmpDLsrJDH+QiEgVUbKoIcyMB84/htWl2/j1W3No1bQh/Ts0i3VYIlJHaG6oGqRefBxP/bgX
7Zs34hcvTWPhmk2xDklE6gglixqmSYN6PHtlHxomxnPVc1NZu3FbrEMSkTpAyaIGapXSkGev7MOGrTu5+oVctmwvj3VIIlLLKVnUUN1aN+WpS3sxb9VGrn91BuW7KmIdkojUYkoWNdgpRzfnvqHd+M+Cddzzr7l6B0NEIkZPQ9Vwl/VrS8H6Mv728VLapiVz7UntYh2SiNRCSha1wO1nHE1hyVYeeHc+rVMbctYxLcMfJCJyEJQsaoG4OOORi3qwZuM2bnptJi2a1Kd327RYhyUitYjGLGqJBvXiefryHFo1bcC1L05j+ddbYh2SiNQiSha1SFpyIs9d1Rd356rnc1m/ZUesQxKRWkLJopbJbpbM05fnULRhK9e+mMe2nZrWXEQOn5JFLZSTlcajF/Ugb8V6bvvHLCoq9EitiBweDXDXUud0b0Xh+q08+N4CMtOSuH3I0bEOSURqMCWLWuxnJ7VjZUkZIz/6iszUJC49rk2sQxKRGipi3VBm1snMZlb6bDSzm8wszcwmmNni4GdqUN/M7AkzW2Jms82sV6VzXRHUX2xmV0Qq5trGzLjv3K4M7JTO/47NZ9LCdbEOSURqqIglC3df6O493b0n0BsoA94G7gAmunsHYGLwHeBMQutudwCGAyMBzCwNuBs4DugL3L07wUh4CfFx/PnSXnRq0ZgRL09n7qrSWIckIjVQtAa4BwFfufsKYCjwQlD+AnBesD0UeNFDvgRSzKwlcAYwwd1L3H09MAEYEqW4a4VG9RN49so+NGlYj58+n8vq0q2xDklEaphoJYthwKvBdgt3Xx1srwFaBNutgYJKxxQGZfsq/x4zG25meWaWV1xcXJWx1wpHNm3As1f2Ycv2XVz1XC6btu2MdUgiUoNEPFmYWSJwLvCPPfd5aJrUKnmu091HuXuOu+ekp6dXxSlrnc4tmzDysl4sWbeZX748nZ2a1lxEDlA07izOBKa7+9rg+9qge4ng5+5R1yIgs9JxGUHZvsrlEAzokM4D53dj8uKv+d9/5mtacxE5INFIFpfwXRcUwDhg9xNNVwBjK5VfHjwV1Q8oDbqr3gcGm1lqMLA9OCiTQ3RxnzaMOKU9Y3IL+MtHX8U6HBGpASL6noWZJQOnAz+rVPwg8LqZXQ2sAC4Kyt8FzgKWEHpy6ioAdy8xs/uB3KDefe5eEsm464JbB3ekYH0Zf3p/IRmpDRna87+GgUREvhXRZOHuW4Aj9ij7htDTUXvWdeC6fZznWeDZSMRYV5kZf/xRd1aXbuNX/5iNmXFuj1axDktEqinNDVWH1U+IZ9RPetM9oyk3vDqD3/xzjiYeFJG9UrKo41KSEnl1eD9+dnI7XvpyJT/66+es/KYs1mGJSDWjZCHUi4/jzjM788zlORSUbOXsJyczPn9NrMMSkWpEyUK+dVqXFrxzfX/aNUvm5y9N4/535rGjXO9iiIiShewhMy2J139+PFeekMXoT5dx8agvKNqg6UFE6jolC/kv9RPiuefcrjx1aS8Wr93M2U9MZtICzVgrUpcpWcg+nd29Jf+6vj8tmzbkqudz+eP4BZRrihCROknJQvYru1kyb//yBC7p24a/fPQVlz4zhbUbt8U6LBGJMiULCatBvXj+cMExPHZxD+YUlnL2E5P5bMnXsQ5LRKJov8nCzJrsZ5/W6Kxjzj82g3EjTiQ1KZHLRk/h8Q8Xs6tCExGK1AXh7iw+2r1hZhP32PfPKo9Gqr0OLRozdsSJnN+zNY99uIgrn5vK15u3xzosEYmwcMnCKm2n7Wef1CFJiQk8clEPHrzgGKYuK+HsJyYzdZnmdhSpzcIlC9/H9t6+Sx1iZgzr24a3f3kiSYkJXPL0l4z86Csq1C0lUiuFm3W2uZndQuguYvc2wXctRyd0adWEcSNO5I435/DQ+AXkLi/hkQt7kJqcGOvQRKQKhbuzeBpoDDSqtL37+zORDU1qisYN6vHnS4/lvqFdmby4mHOe/JQZK9fHOiwRqUIWyWU1
zSyFUFLpRqjb6qfAGcC1QHFQ7dfu/m5Q/07gamAXcIO7vx+UDwEeB+KBZ9z9wf21m5OT43l5eVV/QRLWrIINXPfKdNZu3Mavz+rMlSdkYabhLZGawMymuXvO3vaFe3T2WjPrEGybmT1rZqVmNtvMjj2Ath8Hxrv70UAPYH5Q/pi79ww+uxNFF2AY0BUYAvzFzOLNLB54itBa3l2AS4K6Ug31yEzh39cP4OSOzbn3X/P45cvT2bhtZ6zDEpHDFK4b6kZgebB9CaFf+O2AW4An9negmTUFTgJGA7j7DnffsJ9DhgJj3H27uy8jtLxq3+CzxN2XuvsOYExQV6qppkn1ePry3tx1Vmc+mLeWHzz5KflFpbEOS0QOQ7hkUe7uu/8sPAd40d2/cfcPgeQwx2YT6mp6zsxmmNkzwZrcACOCu5NnzSw1KGsNFFQ6vjAo21f595jZcDPLM7O84uLiPXdLlJkZ157UjteG92P7zgouGPk5L09ZQSS7PUUkcsIliwoza2lmDQitm/1hpX0NwxybAPQCRrr7scAW4A5gJHAU0BNYDTxyKIHvyd1HuXuOu+ekp+tBreoiJyuNf9/Qn37tjuCut/O56bWZbNleHuuwROQghUsWvwXyCHVFjXP3uQBmdjKwNMyxhUChu08Jvr8B9HL3te6+y90rCD1h1TfYXwRkVjo+IyjbV7nUEEc0qs/zV/bhtsEd+desVZz7509ZuGZTrMMSkYMQLlmsBY4HOrv7tWZ2uZmNBX4MDN/fge6+Bigws05B0SBgnpm1rFTtfCA/2B4HDDOz+maWDXQApgK5QAczyzazREKD4OMO/BKlOoiLM0ac2oGXrjmO0q3lDH3qU96YVhjrsETkAIVLFn8DNrv7ejM7CXgQeJFQEnn8AM5/PfCymc0m1O30e+CPZjYnKDsFuBkguGt5HZgHjAeuC+5AyoERwPuEnqZ6ffcdjtQ8JxzVjHdv7E/PzBRu+8csbvvHLDarW0qk2tvvexZmNsvdewTbTwHF7n5P8H2mu/eMSpQHSe9ZVH/luyp4YuJi/jxpCZlpSTx2cU96tUkNf6CIRMwhv2cBxJvZ7ilBBgH/qbQv3FQhIvuUEB/HLYM7MWb48ZTvci786xc8/uFircQnUk2FSxavAh8H4xRbgckAZtYe0IPzctj6Zqfx3k0D+EH3ljz24SIuHvUlBSVlsQ5LRPYQdroPM+sHtAQ+cPctQVlHoJG7T498iAdP3VA109iZRfzm7XwcuPfcrlzQq7WmChGJov11Q4XtSnL3L/dStqgqAhOpbGjP1vRqk8otr8/k1n/MYtLCdTxw3jE0TaoX69BE6jytwS3VSmZaEmOGH8+vzujE+Pw1nPn4J3zx1TexDkukzlOykGonPs647pT2vPmLE6hfL55Ln/mSh8YvYEe5Br9FYkXJQqqtHpkpvHN9f4b1yWTkR19xwcjP+Kp4c6zDEqmTlCykWkuun8AfLujOXy/rTdH6rZz9xGRNSCgSA0oWUiMM6XYk4286iT5Zadz1dj7XvjiNbzZvj3VYInWGkoXUGC2aNOCFq/ryv+d04ZNFxQx5fDIfLVwX67BE6gQlC6lR4uKMq/tnM3bEiaQm1ePK53K5Z9xctu3cFevQRGo1JQupkTq3bMK4Ef258oQsnv98OUP//BkL1myMdVgitZaShdRYDerFc8+5XXnhp30pKdvBuU9+xjOTl1JRocFvkaqmZCE13skd0xl/4wBO6pjO7/49nyuem8rajdtiHZZIraJkIbXCEY3q8/TlvXng/G7kLi9hyP99wvj8NbEOS6TWiGiyMLMUM3vDzBaY2XwzO97M0sxsgpktDn6mBnXNzJ4wsyVmNtvMelU6zxVB/cVmdkUkY5aay8z48XFteef6AbRObcjPX5rGHW/O1prfIlUg0ncWjwPj3f1ooAehle7uACa6ewdgYvAd4ExCS6l2ILRk60gAM0sD7gaOI7Re9927E4zI3rRv3oi3fnEivxh4FK/l
FXDOk58yq2BDrMMSqdEilizMrClwEjAawN13uPsGYCjwQlDtBeC8YHso8KKHfAmkBOt1nwFMcPcSd18PTACGRCpuqR0SE+K4fcjRvHJNP7bv3MUPR37OU5OWsEuD3yKHJJJ3FtlAMfCcmc0ws2fMLBlo4e6rgzprgBbBdmugoNLxhUHZvspFwjr+qCN478aTGNLtSP70/kIuGfUlheu1uJLIwYpkskgAegEj3f1YYAvfdTkB4KEJfqrkTz0zG25meWaWV1xcXBWnlFqiaVI9nrzkWB69qAfzVm/kzMcnM3ZmkeaXEjkIkUwWhUChu08Jvr9BKHmsDbqXCH7unq+hCMisdHxGULav8u9x91HunuPuOenp6VV6IVLzmRkX9MrgvRsH0LFFY24cM5PLn53K4rWbYh2aSI0QsWTh7muAAjPrFBQNAuYB44DdTzRdAYwNtscBlwdPRfUDSoPuqveBwWaWGgxsDw7KRA5aZloSrw3vx90/6MKsgg0MeXwy9/5rLqVbd8Y6NJFqLeyyqofpeuBlM0sElgJXEUpQr5vZ1cAK4KKg7rvAWcASoCyoi7uXmNn9QG5Q7z53L4lw3FKLJcTHcdWJ2ZzboxWPTFjE858vZ+zMVdw2uBMX98kkPk7rfovsyWpjv21OTo7n5eXFOgypIeauKuXecfOYuryErq2acM+5XemTlRbrsESizsymuXvO3vbpDW6p87q2asprP+vHk5ccS8mWHVz41y+44dUZrC7dGuvQRKoNJQsRQgPgP+jRiom3nswNp7Zn/Nw1nPrwxzw5cbGmPxdByULke5ISE7hlcCcm3nIyAzul88iERZz26MeMz1+jR22lTlOyENmLzLQkRl7Wm1euOY7kxAR+/tI0Lhs9hUV61FbqKCULkf04oX0z/n1Df+49tyv5RaEX+u4ZN5fSMj1qK3WLkoVIGAnxcVxxQhaTbhvIJX0zefGL5Qx8eBIvT1mhuaakzlCyEDlAacmJ/O68Y3jn+gF0aNGYu97O5wdPfsrUZXrtR2o/JQuRg9SlVRNeG96PP196LBvKdnDR375gxCvTWbVBj9pK7aVkIXIIzIxzurdi4q0DuXFQBybMW8upj3zE4x/qUVupnZQsRA5Dw8R4bj69IxNvPZlTj27OYx8uYtAjH/PenNV61FZqFSULkSqQkZrEX37cm1euPY7GDRL4xcvTufTpKSxYszHWoYlUCSULkSp0wlHNeOf6/tw/tCvz12zkrMcn89ux+Wwo2xHr0EQOi5KFSBVLiI/jJ8dnMenWgfz4uLa89OUKBj78EX//YjnluypiHZ7IIVGyEImQ1ORE7j+vG/++YQBHH9mY/x07l3Oe/JRJC9ZpPENqHCULkQjr3LIJr17bj7/8uBebtpVz1fO5DH7sE17LXaknp6TG0HoWIlG0o7yCd2av4unJy5i/eiPNGiVy+fFZXNavLWnJibEOT+q4mK1nYWbLzWyOmc00s7yg7B4zKwrKZprZWZXq32lmS8xsoZmdUal8SFC2xMzuiGTMIpGUmBDHBb0yePeG/rx8zXF0a92URycs4oQHJ/Kbf85h2ddbYh2iyF5FellVgFPc/es9yh5z94crF5hZF2AY0BVoBXxoZh2D3U8BpwOFQK6ZjXP3eRGOWyRizIwT2zfjxPbNWLR2E89MXsrruYW8PGUlp3VuwfCT2pHTNhUzLfEq1UM0ksWBGgqMcfftwDIzWwL0DfYtcfelAGY2JqirZCG1QscWjfnjj3pw2xmd+PsXK/j7lyuYMG8tPTJTuHZANkO6HklCvIYXJbYi/S/QgQ/MbJqZDa9UPsLMZpvZs2aWGpS1Bgoq1SkMyvZV/j1mNtzM8swsr7i4uGqvQiQKmjduwK2DO/H5Hady/9CulJbtYMQrMxj48EeM/nQZm7eXxzpEqcMinSz6u3sv4EzgOjM7CRgJHAX0BFYDj1RFQ+4+yt1z3D0nPT29Kk4pEhNJiQn85PgsJt46kL/9pDctmzbg/nfm
cfwfJvKH9+azpnRbrEOUOiii3VDuXhT8XGdmbwN93f2T3fvN7GngneBrEZBZ6fCMoIz9lIvUWvFxxhldj+SMrkcyY+V6npm8jKc/Wcroycs4t0crrhnQji6tmsQ6TKkjIpYszCwZiHP3TcH2YOA+M2vp7quDaucD+cH2OOAVM3uU0AB3B2AqYEAHM8smlCSGAZdGKm6R6ujYNqk89eNUCkrKePazZbyWW8BbM4o4sf0RXDOgHQM7pmswXCIqkncWLYC3g3/ACcAr7j7ezP5uZj0JjWcsB34G4O5zzex1QgPX5cB17r4LwMxGAO8D8cCz7j43gnGLVFuZaUnc/YOu3DSoI69MXcnzny/jqudy6dC8EdcMyGZoz9Y0qBcf6zClFtJLeSI12O6X/EZ9spQFazbRrFF9rji+LZf1a0uqXvKTg7S/l/KULERqAXfnsyXf8PTkpXy8qJgG9eK4sHcmV/fPJqtZcqzDkxpif8miOr1nISKHyMzo36EZ/Ts0Y+Ga0Et+r+UW8NKUFZzeuQXX6iU/OUy6sxCppdZt2saLn6/gpSkr2FC2k56ZKVyjl/xkP9QNJVKHle0o541phTz76TKWf1NG65SGXHlCFhf3zaRJg3qxDk+qESULEWFXhTNx/lpGf7qMKctKSE6M56I+mfz0xGwy05JiHZ5UA0oWIvI9cwpLGf3pUt6ZvZoKd87oeiTXDMimVxuNa9RlShYisldrSrfxwhfLeWXKSkq37qRHZgrX9M/mzG4a16iLlCxEZL/KdpTz5rRCRlca17jihLZc3KcNTRtqXKOuULIQkQNSUeFMXLCO0Z8u5cul341rXHVCNm2O0LhGbadkISIHLb+olNGfLuNfs1ZR4c7gLqFxjd56X6PWUrIQkUO2pnQbL36xnJc1rlHrKVmIyGEr21HOm9OLePbTZSz7egutmjbgyhOzNK5RiyhZiEiVqahw/rNgHaM/XcYXS78hOTGeC3NC72toXKNmU7IQkYjILyrl2U+XMU7jGrWCkoWIRNTajaFxjZe+DMY1Mppy9YB2nNntSOppXKPG2F+yiOj/ima23MzmmNlMM8sLytLMbIKZLQ5+pgblZmZPmNkSM5ttZr0qneeKoP5iM7sikjGLyMFr0aQBvzrjaL6481TuP5v99rcAAA9ySURBVK8bm7aVc8OrMzj5j5P428dfsaFsR6xDlMMU0TsLM1sO5Lj715XK/giUuPuDZnYHkOrut5vZWcD1wFnAccDj7n6cmaUBeUAOodX1pgG93X39vtrVnYVIbFVUOJMWruOZyaFxjcSEOM7sdiTD+rShX7s0dVFVU9VtPYuhwMBg+wXgI+D2oPxFD2WvL80sxcxaBnUnuHsJgJlNAIYAr0Y3bBE5UHFxxqDOLRjUuQXzV29kzNSVvD2jiLEzV5F1RBIX92nDj3pnkN64fqxDlQMU6c5EBz4ws2lmNjwoa+Huq4PtNYTW6gZoDRRUOrYwKNtX+feY2XAzyzOzvOLi4qq8BhE5DJ1bNuHeod2YetdpPHpRD5o3acBD4xdw/B8m8vO/T2PSwnXsqqh9Y6e1TaTvLPq7e5GZNQcmmNmCyjvd3c2sSv6VuPsoYBSEuqGq4pwiUnUa1Ivngl4ZXNArg6+KN/NabgFvTitk/Nw1tE5pyIU5GVyYk0nrlIaxDlX2IqJ3Fu5eFPxcB7wN9AXWBt1LBD/XBdWLgMxKh2cEZfsqF5Ea6qj0Rvz6rM58cecg/vLjXrRLT+bxiYvp/9B/uPK5qYzPX8POXRWxDlMqidgAt5klA3HuvinYngDcBwwCvqk0wJ3m7v9jZmcDI/hugPsJd+8bDHBPA3Y/HTWd0AB3yb7a1gC3SM1TUFLGP/IKeD2vkDUbt9GsUX1+1DuDi/tkkt0sOdbh1Qkxec/CzNoRupuAUHfXK+7+gJkdAbwOtAFWABe5e4mFHo/4M6HB6zLgKnff/bjtT4FfB+d6wN2f21/bShYiNVf5rgo+WVzMq1ML+M+C
0HhGv3ZpXNK3DWd0PZIG9eJjHWKtpZfyRKRGWrtxG29MK+S13AJWlpTRtGE9zj+2NZf0bUOnIxvHOrxaR8lCRGq0igrni6XfMCa3gPfz17BjVwU9M1O4pG8m53RvRXL9WLwFUPsoWYhIrVGyZQdvTS9kTG4BS9ZtJjkxnnN7tmJYnzZ0z2iqF/4Og5KFiNQ67s70let5dWoB78xexbadFXRu2YRhfTI5r2drmiZp2vSDpWQhIrXaxm07GTdzFa/lFjCnqJT6CXGcdUxLzj+2NTlZqSQlqpvqQChZiEidkV9UypjclYydsYpN28uJjzO6tWpCTlYafbJS6d02TdOM7IOShYjUOVt37GLKsm/IW76eqctLmFWwge3loRf9spslk9M2lT5ZaeRkpZLdLFljHShZiIiwvXwX+UUbyVteQu7y9eStKGFD2U4AmjVKJKdtKHH0yUqjS6smdXIdDiULEZE9VFQ4S7/eTO7y9eQuLyFv+XpWlpQB0LBePMe2Sfm26+rYNqk0qgOP5ypZiIgcgLUbt5EXJI/c5SXMX72RCoc4gy6tmpDTNo0+QQJp3qRBrMOtckoWIiKHYNO2ncxYueHbrqsZBevZtjM07tH2iKQgeaSSk5XGUek1f9yjui1+JCJSIzRuUI+TOqZzUsd0AHbuqmDuqt3jHiV8tHAdb04vBCA1qd633VY5WWl0admkVs1jpTsLEZFD5O4s+3rLt11XeSvWs+zrLQDUizeOPrIJPTKb0j0jhR4ZKbRv3oj4uOp796FuKBGRKCnetJ1pK0qYWVDK7MINzCksZdP2cgCSEuPp1ropPTKa0iMzlEAyUhtWm+4rJQsRkRgJPXW1hdmFG5hdWMrMgg3MW72RHcE7H2nJiXTP2H33EfoZq5cGNWYhIhIjcXFG++aNaN+8ERf0ygBgR3kFi9ZuYmbBhm+TyCeLFrN7KfLWKQ3pHtx9dM9oyjGtm9K4QWznulKyEBGJssSEOLq1bkq31k2BtgBs2V7O3FUbmV24IUgipbyXvwYAs9BStN0zmtIjI4UemSl0btmY+gnRG0CPeLIws3ggDyhy93PM7HngZKA0qHKlu88MVsp7nNCyqmVB+fTgHFcAvwnq/87dX4h03CIi0ZRcP4G+2Wn0zU77tqxky45v7zxmFWzgk0Vf89b0IiD6A+jRuLO4EZgPNKlU9it3f2OPemcCHYLPccBI4LhgDe67gRzAgWlmNs7d10c8chGRGEpLTmRgp+YM7NQcCD19tbp0G7MKNjCrMDSAPnbGKl76ciUQGkAf1LkFT15ybJXHEtFkYWYZwNnAA8AtYaoPBV700Ij7l2aWYmYtgYHABHcvCc45gdA63a9GLHARkWrIzGiV0pBWKQ0585iWwH8PoCclRqZrKtJ3Fv8H/A+w52K5D5jZb4GJwB3uvh1oDRRUqlMYlO2r/HvMbDgwHKBNmzZVFb+ISLW2twH0iLQTqROb2TnAOneftseuO4GjgT5AGnB7VbTn7qPcPcfdc9LT06vilCIiEojkHLwnAuea2XJgDHCqmb3k7qs9ZDvwHNA3qF8EZFY6PiMo21e5iIhEScSShbvf6e4Z7p4FDAP+4+6XBeMQBE8/nQfkB4eMAy63kH5AqbuvBt4HBptZqpmlAoODMhERiZJYvGfxspmlAwbMBH4elL9L6LHZJYQenb0KwN1LzOx+IDeod9/uwW4REYkOTfchIiLA/qf7qHvrBoqIyEFTshARkbCULEREJKxaOWZhZsXAisM4RTPg6yoKpya0G8u261q7sWxb11w32j6cdtu6+15fVKuVyeJwmVnevgZ5amO7sWy7rrUby7Z1zXWj7Ui1q24oEREJS8lCRETCUrLYu1F1rN1Ytl3X2o1l27rmutF2RNrVmIWIiISlOwsREQlLyUJERMJSsqjEzJ41s3Vmlh++dpW2m2lmk8xsnpnNNbMbo9RuAzObamazgnbvjUa7ldqPN7MZZvZOlNtdbmZzzGymmUVtErFg
9cc3zGyBmc03s+Oj1G6n4Fp3fzaa2U1Ravvm4N9Wvpm9amYNotFu0PaNQbtzI3m9e/u9YWZpZjbBzBYHP1Oj2PaFwTVXmFmVPUKrZPF9zxNasjXayoFb3b0L0A+4zsy6RKHd7cCp7t4D6AkMCaaHj5bd67PHwinu3jPKz8E/Dox396OBHkTp2t19YXCtPYHehGZ1fjvS7ZpZa+AGIMfduwHxhJYriDgz6wZcS2i9nB7AOWbWPkLNPc9//964A5jo7h0IVgSNYtv5wAXAJ1XZkJJFJe7+CRD16c+DBaGmB9ubCP0S+a+lYyPQrrv75uBrveATlSceKq3P/kw02os1M2sKnASMBnD3He6+IQahDAK+cvfDmeHgYCQADc0sAUgCVkWp3c7AFHcvc/dy4GNCv0Cr3D5+bwwFXgi2XyC0dk9U2nb3+e6+sKrbUrKoZswsCzgWmBKl9uLNbCawDpjg7lFpl+/WZ6+IUnuVOfCBmU0L1m6PhmygGHgu6Hp7xsySo9R2ZcOAV6PRkLsXAQ8DK4HVhBY0+yAabRP663qAmR1hZkmE1srJDHNMVWoRLN4GsAZoEcW2I0LJohoxs0bAm8BN7r4xGm26+66geyID6BvcvkfUftZnj5b+7t4LOJNQl99JUWgzAegFjHT3Y4EtRK5rYq/MLBE4F/hHlNpLJfQXdjbQCkg2s8ui0ba7zwceAj4AxhNaaG1XNNreSyxOlO7YI0nJopows3qEEsXL7v5WtNsPukQmEZ0xm72uzx6FdoFv/+LF3dcR6rvvu/8jqkQhUFjpzu0NQskjms4Eprv72ii1dxqwzN2L3X0n8BZwQpTaxt1Hu3tvdz8JWA8silbbwNpKS0i3JHTnXqMpWVQDwXrko4H57v5oFNtNN7OUYLshcDqwINLt7mt99ki3C2BmyWbWePc2oTXdI/70m7uvAQrMrFNQNAiYF+l293AJUeqCCqwE+plZUvBvfBBRfKDBzJoHP9sQGq94JVptA+OAK4LtK4CxUWw7Mtxdn+BD6P9Iq4GdhP4SvDpK7fYndJs6m9Dt8kzgrCi02x2YEbSbD/w2Bv/NBwLvRLG9dsCs4DMXuCuKbfcE8oL/3v8EUqPYdjLwDdA0yv/73kvoD5B84O9A/Si2PZlQQp4FDIpgO//1ewM4gtBTUIuBD4G0KLZ9frC9HVgLvF8VbWm6DxERCUvdUCIiEpaShYiIhKVkISIiYSlZiIhIWEoWIiISlpKF1Ehm5mb2SKXvt5nZPVV07ufN7EdVca4w7VwYzD47KZJxmVmWmV168BGKfEfJQmqq7cAFZtYs1oFUFkyYd6CuBq5191MiFU8gCzioZHGQ1yF1gJKF1FTlhNYavnnPHXv+BW5mm4OfA83sYzMba2ZLzexBM/txsKbHHDM7qtJpTjOzPDNbFMxltXvSxT+ZWa6ZzTazn1U672QzG8de3so2s0uC8+eb2UNB2W8JvYw52sz+tJdjbg+OmWVmD+5l//LdidLMcszso2D75ErrVswI3lZ/kNCkejOD9SUO6DqCt93/HcSQb2YXH8j/MFI76a8HqcmeAmab2R8P4pgehKavLgGWAs+4e18LLTh1PbB7kZwsQnNGHQVMCtZCuJzQzKl9zKw+8JmZ7Z5FtRfQzd2XVW7MzFoRmtCuN6H5iT4ws/Pc/T4zOxW4zd3z9jjmTEIT8B3n7mVmlnYQ13cbcJ27fxZMTLmN0ISFt7n77qQ3/ECuw8x+CKxy97OD45oeRBxSy+jOQmosD83M+yKhBXYOVK6H1g/ZDnxFaFZSgDmEEsRur7t7hbsvJpRUjiY0j9TlwZTuUwhN6dAhqD91z0QR6AN85KHJ9MqBlwmta7E/pwHPuXtZcJ0Hs8bKZ8CjZnYDkBK0uacDvY45wOlm9pCZDXD30oOIQ2oZJQup6f6PUN9/5bUhygn+bZtZHJBYad/2StsVlb5X8P077T3nwXHAgOs9WHXO3bP9u/UZthzWVRy8b68R+HapUnd/
ELgGaEjojuHovRx7QNfh7osI3WnMAX4XdJ1JHaVkITVa8Ff364QSxm7LCXX7QGj9hnqHcOoLzSwuGMdoBywE3gd+EUwnj5l1tPALGE0FTjazZmYWT2jm14/DHDMBuMpCi/awj26o5Xx3jT/cXWhmR7n7HHd/CMgldEe0CWhc6dgDuo6gC63M3V8C/kT0p1SXakRjFlIbPAKMqPT9aWCsmc0itPDNofzVv5LQL/omwM/dfZuZPUOoq2p6MOV2MWGWy3T31WZ2B6G1Qgz4t7vvd7pqdx9vZj2BPDPbAbwL/HqPavcSGhy/H/ioUvlNZnYKoTulucB7wfau4L/H84TWAj+Q6zgG+JOZVRCa1fQX+4tbajfNOisiImGpG0pERMJSshARkbCULEREJCwlCxERCUvJQkREwlKyEBGRsJQsREQkrP8HeGt5W2Mot3wAAAAASUVORK5CYII=\n"
321 |           },
322 |           "metadata": {
323 |             "needs_background": "light"
324 |           }
325 |         }
326 |       ]
327 |     },
328 |     {
329 |       "cell_type": "markdown",
330 |       "source": [
331 |         "There is a slight elbow at `k=9` which could point to the fact that a few digits may have been merged in one cluster."
332 |       ],
333 |       "metadata": {
334 |         "id": "d0Wj1fYCwEhF"
335 |       }
336 |     },
337 |     {
338 |       "cell_type": "markdown",
339 |       "source": [
340 |         "### Silhouette"
341 |       ],
342 |       "metadata": {
343 |         "id": "ASM9vuyCzKIb"
344 |       }
345 |     },
346 |     {
347 |       "cell_type": "code",
348 |       "source": [
349 |         "sil_coef_digits = []\n",
350 |         "for k in range(2, 15):\n",
351 |         "    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)\n",
352 |         "    kmeans.fit(scaled_digits)\n",
353 |         "    score = silhouette_score(digits.data, kmeans.labels_)\n",
354 |         "    sil_coef_digits.append(score) "
355 |       ],
356 |       "metadata": {
357 |         "id": "B_OE4DBJv9e7"
358 |       },
359 |       "execution_count": 10,
360 |       "outputs": []
361 |     },
362 |     {
363 |       "cell_type": "code",
364 |       "source": [
365 |         "plt.plot(range(2, 15), sil_coef_digits)\n",
366 |         "plt.xticks(range(2, 15))\n",
367 |         "plt.xlabel('Number of clusters')\n",
368 |         "plt.ylabel('silhouette score')\n",
369 |         "plt.grid(True)\n",
370 |         "plt.show()"
371 |       ],
372 |       "metadata": {
373 |         "colab": {
374 |           "base_uri": "https://localhost:8080/",
375 |           "height": 279
376 |         },
377 |         "id": "D5g9yY5Jzp1I",
378 |         "outputId": "92092d9b-05d7-48bd-8f57-cd401accea07"
379 |       },
380 |       "execution_count": 11,
381 |       "outputs": [
382 |         {
383 |           "output_type": "display_data",
384 |           "data": {
385 |             "text/plain": [
386 |               "<Figure size 432x288 with 1 Axes>"
387 |             ],
388 |             "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAEGCAYAAAB/+QKOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3deXxU5fX48c8hEBIIS9jCHnYBAYGwuKLRqrhBrai4UHEpasVqq7b4taVqN5dq1Ur7s3XBDaOirRRpcQu2VZEQ9rBoWBKIbIawhJCQZM7vj3tDxzhJJmTuzGRy3q/XvDJ3ee45N5A5c59773NFVTHGGGOqaxbpBIwxxkQnKxDGGGMCsgJhjDEmICsQxhhjArICYYwxJqDmkU4gVDp16qR9+vQ57vaHDx+mdevWoUsohmOEK06sxAhXHNuX6IsRrjgNiZGdnf21qnYOuFBVY+KVlpamDZGZmdmg9k0pRrjixEqMcMWxfYm+GOGK05AYwHKt4XPVupiMMcYEZAXCGGNMQFYgjDHGBORpgRCRiSKySURyRWRWgOUTRGSFiFSIyJRqyx4WkXXu60ov8zTGGPNtnhUIEYkD5gAXAEOBq0RkaLXV8oHpwLxqbS8CRgMjgfHA3SLS1qtcjTHGfJuXRxDjgFxV3aKqR4EMYLL/Cqq6TVXXAL5qbYcC/1bVClU9DKwBJnqYqzHGmGpEPRrN1e0ymqiqN7nT04DxqjozwLpzgYWqOt+dPg/4JXAu0ApYBsxR1ceqtZsBzABISUlJy8jIOO58i4uLSUpKOu72TSlGuOLESoxwxbF9ib4Y4YrTkBjp6enZqjom4MKarn9t6AuYAjzrNz0NeLqGdecCU6rNuw9YBbwPvArcWVs8uw8ifDHCFSdWYoQjzuY9h/RXr7ynlZU+T+Ooxs6/S6z82zc0BhG6D6IA6OU33dOdFxRV/Y2qjlTVcwEBvghxfsbEhIL9R5j6l6U8u/Yo172wjL2HyiKdkokRXhaILGCgiPQVkXhgKrAgmIYiEiciHd33I4ARwHueZWpMI3WgpJzpzy/jSHkllw5owbKt+7jgyf/wSe7XkU7NxADPCoSqVgAzgcXABuANVc0RkQdFZBKAiIwVkR3A5cAzIpLjNm8B/EdE1gN/Aa51t2eMcZVVVPKDl5eTV1jCX6aNYfKAeN6ZeRrtW7Xg2uc+5/H3NlFRWf36D2OC5+lgfaq6CFhUbd5sv/dZOF1P1duV4lzJZIwJwOdT7npjNcu27uPJqSM5pX9HlmyHwV3bsmDmacx+J4enPspl6dZ9PDV1FF3bJUQ6ZdMI2Z3UxjRCD/9rIwvX7GTWBYOZPLLHN5a1im/O7y8/iccuP4l1BQe48Kn/kLlpT4QyNV7w+ZScrw7w/H+3MuOl5Ty71pvzTjEz3LcxTcWLn27jmX9vYdrJqdw8oV+N612W1pOTerVn5rwVXP9CFjef2Y+7zzuBFnH2vbCxqfQpG3YeZOmWQj7fuo9lW/dx4Eg5AL07tGJIG2/iWoEwphH517pd3P+PHM4dmsL9k05ERGpdf0CXJP5+22k8uHA9z3y8hayt+3jqqlH0TG4VpozN8fAvCEu3FLJs6z4OljqnYVM7tmLiiV0Z368D4/t1pEf7RJYsWeJJHlYgjGkksvOKuCNjJSf1bM9TU0cR16z24lAloUUcv710OKf068i9b6/loqf+y6NTRnDeiV09ztgEq9KnrP/KryBs28chtyD06diKC4d34+R+HRnfrwPd2iWGLS8rEMY0Alv2FnPTi1l0a5fAc9eNITE+rt7buOSk7gzv0Y6Zr61gxsvZXH9aH2ZdMJiWzeu/LdMwFZU+1h87QthH1tZ9HCpzCkK/Tq25eIRbEPp2jOgFBlYgjIlyXxeXMf2FLESEudePo2NSy+PeVp9OrXnr1lP53aKNvPDJNpZvK+Lpq0eR2tH7R282ZRWV
PtZ9dZDP3SOErG1FFFcVhM6tuWRkd7cgdCClbfRccWYFwpgoVnK0ghvnZrHnUCmv/eBk+nRq+Ad5y+Zx3D/pRE7p35F73lzNxU/9l99dNpyLR3QPQcYGnCGMvthdzJJNe3h3eSkzM98/VhAGdElisl9B6BJFBaE6KxDGRKmKSh+3z1vJ2oIDPDNtDKN6J4d0++ef2JWh3dpy+2srmTlvJZ9tLuQXFw8loYV1OR2P0vJKPttcyEcb9/DRxj0U7D8CQPck4dJRvZ2Tyn070rnN8R8BhpsVCGOikKoye0EOH27cw6++O4xzh6Z4EqdXh1a8ecsp/H7xJp759xay84qYc81o+nf2fpTTWPDV/iPHCsKnm7+mtNxHq/g4ThvQidvPHkD64C5sWLGUs84aFulUj4sVCGOi0J+WbGbe5/nccmZ/pp2c6mmsFnHNuPfCIZzcryM/eWMVl/zxv/zm0mFcOupbgxw0eZU+ZWV+ER9u3EPmxj1s3HUIcO5FmDq2N2cP7sL4fh2+ceJ/Q6SSDQErEMZEmb+t3MGjizcxeWR3fnr+CWGLmz64C4vuOIM7XlvFj19fzae5hTww+URaxTftj4n9JUf5+Iu9fLRxDx9/sZf9JeU0byaM6ZPMfRcOIX1wF/p3bl3nPSmNUdP+lzcmynya+zU/nb+GU/p15JEpI2gW5L0OodKtXSLzfjCeJz/8kqczc1m1fT9zrhnNoJTQ36rr8yl7i8vYUVTCjqIjFOw/4vwsOkJRUSmL962ha9tEurVLIKVdAt3aJdC1XQJtWjb39MO46gTzhxt3k7lxD9l5RfgUOraO55zBKZw9uAtnDOpE24QWnuUQLaxAGBMlNu46yM0vZ9O3U2v+37S0iN2f0DyuGXeddwLj+3bkztdXMenp//LApBO5Ykyven0wV1T62HmglIL9zoe+UwRKjhWCnftLOVpttNnkVi3okZzIoTLlvZzdFB4++q3ttoqPo2u7BLq2dQpGt2Pv3WLSNoGOrePrVVxLyyv5dPPXfLRxD5kb9x47wTysR1tmpjvnEk7q2T7sBTvSrEAYEwV2HjjC9OezaNUyjrnXj6NdYuS/nZ4+sBOL7jidH7++ip+9tZZPNxfym0uHH1teVlHJV/tL2VFUQkG1I4CC/UfYdbCUSt83H2ncpU1LeiQnMrxHOy4Y1o0eyYn0bJ9Ij+REerRPpHVL5yNpyZIlnHXWWZRVVLLnYBk7D5Sy62Apuw4cYdeBMnYdPMLOA6Us3VzI7kNl34rTIk7o0uZ/Rx3/KyaJdG3Xkq7tEtlb4uPlpXlkVjvBfLrfCeZouichEqxAGBNhB0vLmf58FsVlFbx5yyl0bx++oRTq0qVNAi/dMJ4/Zebyhw++IDuviJa+Mu755INvPbmumThdVD2SExnft4Pz4Z+cSI/2reiR7Hy7r+8ltC2bx9GrQyt6dah57KhKn1JY7BSRnQdK2X3Q/+cR1hUc4IMNuyktD/RsjHW1nmBu6qxAGBNBRyt83PJyNpv3FjP3+nEM6dY20il9S1wz4fZzBjKubwceWbyJI8VlnN23y7Fv/VWFoGvbBJpHYKTYuGZCl7YJdGmbwEm9Aq+jqhw4Uu53JFJKzoZNTL/glJg9wRwKViCMiRBV5WdvreHTzYU8fsVJnD6wU6RTqtX4fh1569ZT3e6fEZFOp15EhPat4mnfKv5YEV5SsoUBXex+j9rYwPDGRMjv39vE31YWcPd5g/jeaLvnwEQfKxDGRMArS/OYk7mZq8b15rb0AZFOx5iAPC0QIjJRRDaJSK6IzAqwfIKIrBCRChGZUm3ZIyKSIyIbROQpsU5CEyM+WL+b2e+s4+zBXfjV5Lof+mNMpHhWIEQkDpgDXAAMBa4SkaHVVssHpgPzqrU9FTgNGAEMA8YCZ3qVqzHhsmr7fm5/bSXDerTj6atHReSkrjHB8vIk9TggV1W3AIhIBjAZWF+1gqpuc5dVv/5MgQQgHhCgBbDbw1yN8Vxe4WFunJtF
pzbxPHfd2CY/hIWJfqKqda91PBt2uowmqupN7vQ0YLyqzgyw7lxgoarO95v3e+AmnALxtKreF6DdDGAGQEpKSlpGRsZx51tcXExSkrdXNMRKjHDFiZUYALuKivnD2mYUlys/H59It6TQHznYv330xQhXnIbESE9Pz1bVMQEXqqonL2AK8Kzf9DScD/pA684FpvhNDwDeBZLc12fAGbXFS0tL04bIzMxsUPumFCNccWIlRklZhZ7920U66L5FunxboWdx7N8++mKEK05DYgDLtYbPVS87QAsA/9tWerrzgnEpsFRVi1W1GPgncEqI8zPGc6rK3W+uZssBH09OHUlaaodIp2RM0LwsEFnAQBHpKyLxwFRgQZBt84EzRaS5iLTAOUHdmIdVN03U++t38+7anVw2sAUTh3WLdDrG1ItnBUJVK4CZwGKcD/c3VDVHRB4UkUkAIjJWRHYAlwPPiEiO23w+sBlYC6wGVqvqP7zK1RgvlJZX8qt31zOwSxIT+0Z+8D1j6svTyyhUdRGwqNq82X7vs3C6nqq3qwRu9jI3Y7z2l39vYfu+I8y7aTxHd6yLdDrG1JtdhG2MB3YUlfCnJblcOLwrpw6I7jGWjKmJFQhjPPDbRc4ps/suqn5vqDGNhxUIY0Lsk9yvWbR2Fz88awA9oujZDsbUlxUIY0KovNLHLxfk0KtDIjMm9It0OsY0iBUIY0LoxU+3kbunmF9cNLTeT08zJtpYgTAmRPYeKuPJD77kzEGdOXdoSqTTMabBrEAYEyIP/2sjpRWV/PKSoTaEt4kJViCMCYEV+UXMz97BDaf3pV9ne4yliQ1WIIxpIJ9PuX9BDl3atOT2swdGOh1jQsYKhDEN9Mby7azZcYD/u3AISS3tGQ8mdliBMKYBDpSU88jiTYztk8zkkd0jnY4xIWUFwpgGePz9TewvOcr9k+zZ0ib2WIEw5jht2HmQl5fmcc34VE7s3i7S6RgTclYgjDkOqsovF+TQLrEFd503KNLpGOMJKxDGHId/rNnJsq37uPv8E2jfKj7S6RjjCSsQxtTT4bIKfvvuBob1aMvUsb0jnY4xnrFr8oyppzmZuew6WMqca0YR18xOTJvYZUcQxtTD1q8P89f/bOF7o3qQltoh0ukY4ykrEMbUw4P/yKFl8zhmXTA40qkY4zlPC4SITBSRTSKSKyKzAiyfICIrRKRCRKb4zU8XkVV+r1IR+a6XuRpTlw837CZz017uOGcgXdomRDodYzzn2TkIEYkD5gDnAjuALBFZoKrr/VbLB6YDd/u3VdVMYKS7nQ5ALvCeV7kaU5fS8koeXLie/p1bc92pfSKdjjFh4eVJ6nFArqpuARCRDGAycKxAqOo2d5mvlu1MAf6pqiXepWpM7Z7771byCkt4+cZxxDe3nlnTNIiqerNhp8tooqre5E5PA8ar6swA684FFqrq/ADLPgIeV9WFAZbNAGYApKSkpGVkZBx3vsXFxSQleTtMc6zECFecaIlReMTHvf89wvBOcdw+6vi6lqJlXxpLnFiJEa44DYmRnp6erapjAi5UVU9eON/8n/WbngY8XcO6c4EpAeZ3A/YCLeqKl5aWpg2RmZnZoPZNKUa44kRLjNtezdZB9y3S/MLDnsZpKPu3j74Y4YrTkBjAcq3hc9XLY+UCoJffdE93Xn1cAfxNVctDlpUx9fDZ5kIWrtnJLWf2p1eHVpFOx5iw8rJAZAEDRaSviMQDU4EF9dzGVcBrIc/MmCBUVPq4f0EOPZMTufWs/pFOx5iw86xAqGoFMBNYDGwA3lDVHBF5UEQmAYjIWBHZAVwOPCMiOVXtRaQPzhHIx17laExtXl6ax6bdh/j5RUNJaBEX6XSMCTtPh9pQ1UXAomrzZvu9z8LpegrUdhvQw8v8jKnJ18VlPP7+F5wxsBPnn5gS6XSMiQi7Xs+YAB791yaOHK3kl5fYg4BM0xV0gRARO0NnmoTV2/fzRvZ2rj+tDwO6eH8ZpDHRqs4CISKn
ish6YKM7fZKI/MnzzIyJAJ9Pmb0gh05JLfnROQMjnY4xERXMEcQfgPOBQgBVXQ1M8DIpYyJl/oodrN6+n1kTB9MmoUWk0zEmooLqYlLV7dVmVXqQizERdeBIOQ//cyNpqclcOsqujzAmmKuYtovIqYCKSAvgDpzLVo2JKU988AX7So7y4qRxNLMHARkT1BHELcBtOJecFuCMsnqbl0kZE26bdh3ipc/yuGpcb4b1aBfpdIyJCrUeQbhDdj+pqteEKR9jwk5VuX9BDkktm3PPeSdEOh1jokatRxCqWgmkukNlGBOTFq3dxWdbCrn7vEEkt7b/6sZUCeYcxBbgExFZAByumqmqj3uWlTFhUlah/Obd9Qzp1parx6dGOh1jokowBWKz+2oGtPE2HWPCa+HWcr46UM4TU0cRZyemjfmGOguEqj4AICJJ7nSx10kZEw6b9xbzz63lfHdkd8b17RDpdIyJOsHcST1MRFYCOUCOiGSLyInep2aMd44creS2V1eQEAf3Xjgk0ukYE5WCucz1L8BPVDVVVVOBu4C/epuWMd5RVe77+1o27T7EzSNaktL2+B4jakysC6ZAtFbVzKoJVV0CtPYsI2M8lpG1nbdXFPCjswcyvLOnI94b06gFUyC2iMgvRKSP+/o5zpVNxjQ66woO8MsFOZwxsJMNxmdMHYIpEDcAnYG3gbeATu48YxqVAyXl3PpqNh1bx/PElSPtqiVj6hDMVUxFwI/CkIsxnvH5lLveXM3O/aW8fvMpdExqGemUjIl6wVzF9L6ItPebThaRxcFsXEQmisgmEckVkVkBlk8QkRUiUiEiU6ot6y0i74nIBhFZ7z6j2pjj8pf/bOGDDbu576IhpKUmRzodYxqFYLqYOqnq/qoJ94iiS12N3HGc5gAXAEOBq0RkaLXV8oHpwLwAm3gJeFRVhwDjgD1B5GrMtyzdUsijizdx0fBuTD+1T6TTMabRCKZA+ESkd9WEiKQCGkS7cUCuqm5R1aNABjDZfwVV3aaqawCf/3y3kDRX1ffd9YpVtSSImMZ8w56Dpdz+2kpSO7TiocuG2/OljamHYK7xuw/4r4h8DAhwBjAjiHY9AP8HDe0AxgeZ1yBgv4i8DfQFPgBmuYMHGhOUikoft7+2kkOl5bxy43h7Qpwx9SSqdR8MiEgn4GR3cqmqfh1EmynARFW9yZ2eBoxX1ZkB1p0LLFTV+X5tnwNG4XRDvQ4sUtXnqrWbgVusUlJS0jIyMurcl5oUFxeTlOTtA+pjJUa44jQ0xpubjvLu1nJ+MDye03oELg72+4rOOLESI1xxGhIjPT09W1XHBFyoqrW+gNNwbpYDuBZ4HEgNot0pwGK/6XuBe2tYdy4wxW/6ZOBjv+lpwJza4qWlpWlDZGZmNqh9U4oRrjgNifFezi5N/dlCnfXWGs9i1Ee0/76iLU6sxAhXnIbEAJZrDZ+rwZyD+DNQIiInAT/BGdn1pSDaZQEDRaSv+zyJqcCCINpVtW0vIp3d6bOB9UG2NU1cfmEJd72ximE92vLLS6pfF2GMCVYwBaLCrTKTcb7FzyGIYb9VtQKYCSzGeYb1G6qaIyIPisgkABEZKyI7gMuBZ0Qkx21bCdwNfCgia3HOfdj4T6ZOpeWV/HBeNgB/viaNhBZxEc7ImMYrmJPUh0TkXpzupQki0gwI6myfqi4CFlWbN9vvfRbQs4a27wMjgoljTJUH/rGedQUHefb7Y+jVoVWk0zGmUQvmCOJKoAy4UVV34XygP+ppVsYch7eyd/DasnxuPas/3xmaEul0jGn0ghlqYxfOiemq6XyCOwdhTNhs3HWQ+/6+lvF9O3DXuYMinY4xMSGYIwhjotqh0nJufWUFbRJa8MerR9E8zv5bGxMKNhi+adRUlVlvrSV/XwnzbhpPlzb28B9jQiWor1oikigiJ3idjDH19cIn23h37U7uOf8ExvfrGOl0jIkpwYzmegmwCviXOz1SRIK9n8EYz2TnFfHbRRv4zpAU
bp7QL9LpGBNzgjmCuB9n4L39AKq6Cmd8JGMiprC4jJnzVtCtfQKPXXGSDcJnjAeCKRDlqnqg2rxgRnM1xhOVPuXO11dRePgof74mjXaJNgifMV4I5iR1johcDcSJyECcp8t96m1axtTsqQ+/5D9ffs3vvjecYT3aRTodY2JWMEcQtwMn4twsNw84ANzhZVLG1GTJpj089dGXXDa6J1PH9op0OsbEtGCOIC5S1ftwngsBgIhcDrzpWVbGBFCw/wg/fn0VJ6S04dffHWbnHYzxWDBHEPcGOc8Yzxyt8HHbqysor1T+dM1oEuNtED5jvFbjEYSIXABcCPQQkaf8FrUFKrxOzBh/v120gVXb9/Ona0bTr7P3D3kxxtTexfQVsByYBGT7zT8E/NjLpIzx94/VXzH3023ccFpfLhzeLdLpGNNk1FggVHU1sFpEUlT1Rf9lInIH8KTXyRmTu6eYWW+tYXTv9sy6YHCk0zGmSQnmHMTUAPOmhzgPY76lrEL54avZtGwRx5xrRhPf3AbhMyacajsHcRVwNdC32tAabYB9XidmmjZVZW5OGV/uqeSlG8bRrV1ipFMypsmp7RzEp8BOoBPwmN/8Q8AaL5My5rVl2/lsZyU//s4gzhjYue4GxpiQq+0cRB6QB5wiIqnAQFX9QEQSgUScQmFMyOUXlvDrd9dzYsdm3H72gEinY0yTFcxorj8A5gPPuLN6An8PZuMiMlFENolIrojMCrB8goisEJEKEZlSbVmliKxyXzZ6bBPh8yk/fWs1zUS4YVhLmjWzm+GMiZRgzvrdBpwGHARQ1S+BLnU1EpE4YA5wATAUuEpEhlZbLR/nhPe8AJs4oqoj3dekIPI0MeCVz/NYumUfP79oCB0T7aS0MZEUzF9gmaoerZoQkeYEN5rrOCBXVbe47TOAyf4rqOo2VV0D+OqRs4lR+YUlPPTPjZwxsBNX2jhLxkScqNb+WS8ij+A8C+L7OAP3/RBY747PVFu7KcBEVb3JnZ4GjFfVmQHWnQssVNX5fvMqcB5UVAE8pKrf6tYSkRnADICUlJS0jIyMWvelNsXFxSQleXuHbqzE8CKOT5VHskrZdsDHb05PpGNiM/t9RWGMcMWJlRjhitOQGOnp6dmqOibgQlWt9YVzlPEDnMH55rvvJYh2U4Bn/aanAU/XsO5cYEq1eT3cn/2AbUD/2uKlpaVpQ2RmZjaofVOK4UWcFz/dqqk/W6ivfZ7nWYxAGuvvK1IxwhUnVmKEK05DYgDLtYbP1TpHc1VVH/BX91UfBYB/P0FPd15QVLXA/blFRJYAo4DN9czBNALWtWRMdKqzQIjIVgKcc1DVuh4CnAUMFJG+OIVhKs6Nd3USkWSgRFXLRKQTzknyR4JpaxoX/6uWHrpshA3hbUwUCeZ5EP59UwnA5UCHuhqpaoWIzAQWA3HA86qaIyIP4hzSLBCRscDfgGTgEhF5QFVPBIYAz4iID6eL6yFVXV+vPTONQtVVSw99bzg92tvd0sZEk2C6mAqrzXpCRLKB2UG0XQQsqjZvtt/7LJyup+rtPgWG17V907hZ15Ix0S2YLqbRfpPNcI4ogjnyMKZG1rVkTPQL5oPefxymCpwriq7wJBvTZFjXkjHRL5gupvRwJGKaDutaMqZxCGYspnYi8riILHdfj4lIu3AkZ2KPdS0Z03gEM9TG8zgjt17hvg4CL3iZlIld/mMtWdeSMdEtmHMQ/VX1Mr/pB0RklVcJmdhlXUvGNC7BHEEcEZHTqyZE5DTgiHcpmVhkXUvGND7BHEHcArzknncQnMeNTvcyKRN77KolYxqfYK5iWg2cJCJt3emDnmdlYkp+YQm/W2RdS8Y0NsHcKNcSuAzoAzSv6hpQ1Qc9zczEhKqupbhm1rVkTGMTTBfTO8ABIBso8zYdE2usa8mYxiuYAtFTVSd6nomJOVVdSxMGdbauJWMaoWCuYvpURGzgPFMv3+ha+t5w61oyphGq8QhCRNbiPAeiOXC9iGzB
6WISQFV1RHhSNI2Rf9dSd+taMqZRqq2L6eKwZWFiinUtGRMbaisQh8KWhYkZ1rVkTOyorUBk43QxBfoLV6CuR46aJsi6loyJHTUWCFXtG85ETONnXUvGxJbaTlIPVtWN1Z4od4yqrvAuLdPYWNeSMbGntstcf+L+fCzA6/fBbFxEJorIJhHJFZFZAZZPEJEVIlIhIlMCLG8rIjtE5Olg4pnI8R/G27qWjIkNtXUxzXB/HtcT5UQkDpgDnAvsALJEZIGqrvdbLR9n4L+7a9jMr4B/H098Ez7WtWRMbArmiXKXi0gb9/3PReRtERkVxLbHAbmqukVVjwIZwGT/FVR1m6quAXwB4qYBKcB7QcQyEVLVtdTcupaMiTmiqrWvILJGVUe4z4T4NfAoMFtVx9fRbgowUVVvcqenAeNVdWaAdecCC1V1vjvdDPgIuBb4DjCmhnYzgBkAKSkpaRkZGXXsbs2Ki4tJSko67vZNKYZ/nA/yynllw1GuHxbPmT1beBLDS+H+fTX2GOGKEysxwhWnITHS09OzVXVMwIWqWusLWOn+/B1wtf+8OtpNAZ71m54GPF3DunOBKX7TM4Gfuu+n19TO/5WWlqYNkZmZ2aD2TSlGVZy8rw/r4J//U6c997n6fD5PYngtnL+vWIgRrjixEiNccRoSA1iuNXyuBjNYX4GIPINzLuFhd/jvYMZwKgD8O6R7uvOCcQpwhoj8EEgC4kWkWFW/daLbRIZPrWvJmFgXTIG4ApgI/F5V94tIN+CeINplAQNFpC9OYZgKXB1MUqp6TdV7EZmO08VkxSGKfJRfwdIt+3j4MrshzphYVeeRgKqWqOrbqvqlO71TVes8cayqFThdRYuBDcAbqpojIg+KyCQAERkrIjuAy4FnRCSnITtjwiO/sIQ3vjjKhEGduWKMXbVkTKwK5gjiuKnqImBRtXmz/d5n4XQ91baNuTjnKEwUKK/0cff81cQJ1rVkTIwL5lyCMYB7Sev8NSzbuo9rh8Rb15IxMc7TIwgTO1SVBxeu528rC7jn/BM4UXZEOiVjjMfsCMIE5Y8f5TL3023cdHpffnhW/0inY4wJAysQpk4vf7aNx9//gstG9+T/Lhxi5x2MaSKsQJhaLVj9FbMX5PCdISk8fNlwmjWz4uWt2GoAABG+SURBVGBMU2EFwtRoyaY9/OT1VYzr04Gnrx5F8zj772JMU2J/8Sag7Lx93PJKNid0bcNfrxtDQou4SKdkjAkzKxDmWzbuOsj1L2TRrV0iL94wjrYJoR2EzxjTOFiBMN+QX1jC959bRqv45rx84zg6JbWMdErGmAix+yDMMXsOlTLt+c85WunjzZtPoWdyq0inZIyJIDuCMAAcOFLOdc9nsfdQGS9MH8vAlDaRTskYE2FWIAxHjlZy04tZ5O45xDPT0hjVOznSKRljooB1MTVx5ZU+bpu3guV5RTx91WjOGNg50ikZY6KEHUE0YVWD7320cQ+//u4wLhrRLdIpGWOiiBWIJqr64HvXjE+NdErGmChjBaKJetodfO9GG3zPGFMDKxBN0MtL83jMHXzvPht8zxhTAysQTcyC1V8x+511NvieMaZOViCakKrB98ba4HvGmCB4+gkhIhNFZJOI5IrIrADLJ4jIChGpEJEpfvNT3fmrRCRHRG7xMs+mIDuviFtfWcEJXdvwrA2+Z4wJgmf3QYhIHDAHOBfYAWSJyAJVXe+3Wj4wHbi7WvOdwCmqWiYiScA6t+1XXuUbyzbtOsQNc7Po2i7BBt8zxgTNyxvlxgG5qroFQEQygMnAsQKhqtvcZT7/hqp61G+yJdYVdty27yth2nOfk9CiGS/dYIPvGWOCJ6rqzYadLqOJqnqTOz0NGK+qMwOsOxdYqKrz/eb1At4FBgD3qOqcAO1mADMAUlJS0jIyMo473+LiYpKSko67fTTGOFCm/ObzIxwuV/5vXCI92oSuzsbi76uxx7F9ib4Y4YrTkBjp6enZqjom4EJV9eQFTAGe
9ZueBjxdw7pzgSk1LOsOLANSaouXlpamDZGZmdmg9tEWY3/JUZ34xL91yC/+qSvy9nkWx0uxEiNccWxfoi9GuOI0JAawXGv4XPWy66YA6OU33dOdVy/qnHdYB5wRorxinv/ge//vWht8zxhzfLwsEFnAQBHpKyLxwFRgQTANRaSniCS675OB04FNnmUaQyp8ykx38L0/XDmSCYNs8D1jzPHxrECoagUwE1gMbADeUNUcEXlQRCYBiMhYEdkBXA48IyI5bvMhwOcishr4GPi9qq71KtdYUVZRyfPrjvKhO/jexSO6RzolY0wj5ulw36q6CFhUbd5sv/dZOF1P1du9D4zwMrdYsPdQGdl5RazILyI7r4i1Ow5wtNJng+8ZY0LCngfRSFT6lC92H3IKQl4R2flF5BWWABAf14zhPdtx/Wl9aH24wAbfM8aEhBWIKHWotJxV2/eTneccHazK38+hsgoAOiW1ZExqMteOT2V0ajLDerSlZXPnzuglS3bb4HvGmJCwAhEFVJXt+46Qnb+P7Lwilm8rYtPuQ6iCCJyQ0obJo7qTlppMWu8O9OqQaEXAGOM5KxARUFZRybqCg6zIK2J53j6y8/bzdXEZAEktmzOqd3smDutKWmoyI3u1p40NjWGMiQArEGGyruAAGRuP8scNnx47mQyQ2rEVEwZ2YnRqMmmpyQxKaUOcDcFtjIkCViDC4J1VBdzz5hp8Ph8n9Ybpp/VhdO9kRqe2p0ubhEinZ4wxAVmB8JCq8uSHX/LEB18yrm8Hvt+3lIvPOzXSaRljTFBslFSPlJZXcufrq3jigy+5bHRPXrlxPEnx1nVkjGk87AjCA4XFZcx4OZvsvCLuOf8EfnhWf7vqyBjT6FiBCLHcPYe4fm4Wew6WMefq0Vw0olukUzLGmONiBSKEPsn9mlteyaZl8zgyZpxso6gaYxo1KxAh8tqyfH7x93X075zEc9PH0DO5VaRTMsaYBrEC0UA+n/LwvzbyzL+3cOagzjx99Si7sc0YExOsQDRAydEK7sxYxXvrd/P9U1KZffFQmsfZhWHGmNhgBeI47T5Yyo0vZrH+q4Pcf8lQpp/WN9IpGWNMSFmBOA45Xx3gxrnLOVRazrPXjeHswSmRTskYY0LOCkQ9fbB+Nz/KWEm7xBa8ecupDO3eNtIpGWOMJ6xABElVef6Tbfz63fUM696O564bQ5e2No6SMSZ2WYEIQkWlj/v/kcMrS/M5/8QU/nDlSFrF26/OGBPbPL3kRkQmisgmEckVkVkBlk8QkRUiUiEiU/zmjxSRz0QkR0TWiMiVXuZZm4Ol5Vw/N4tXluZz85n9+PM1aVYcjDFNgmefdCISB8wBzgV2AFkiskBV1/utlg9MB+6u1rwE+L6qfiki3YFsEVmsqvu9yjeQ7ftKuGFuFlu/PszDlw3nyrG9wxneGGMiysuvwuOAXFXdAiAiGcBk4FiBUNVt7jKff0NV/cLv/VcisgfoDIStQKzIL2LGS8s5WuHjpRvGceqATuEKbYwxUUFU1ZsNO11GE1X1Jnd6GjBeVWcGWHcusFBV5wdYNg54EThRVX3Vls0AZgCkpKSkZWRkHHe+xcXFJCUlAfD5zgr+uraMDgnCnaMT6J4Ump44/xheCUeMcMWJlRjhimP7En0xwhWnITHS09OzVXVMwIWq6skLmAI86zc9DXi6hnXnAlMCzO8GbAJOriteWlqaNkRmZqb6fD7944dfaOrPFuqUP3+ihcVlDdpmoBheC0eMcMWJlRjhimP7En0xwhWnITGA5VrD56qXXUwFQC+/6Z7uvKCISFvgXeA+VV0a4ty+pdyn3PXmat5eUcClo3rw0GXDadk8zuuwxhgTtbwsEFnAQBHpi1MYpgJXB9NQROKBvwEvaYBup1ArOnyUR7NK+aKogJ+cO4jbzx5gD/gxxjR5nl3mqqoVwExgMbABeENVc0TkQRGZBCAiY0VkB3A58IyI5LjNrwAmANNFZJX7GulFnjuKSrj0
T5+w5YCPp64axY/OGWjFwRhj8PhGOVVdBCyqNm+23/ssnK6n6u1eAV7xMrcqHVu3pH/nJK4d4GPSSd3DEdIYYxqFJj82dWJ8HM9NH8uAZDvfYIwx/pp8gTDGGBOYFQhjjDEBWYEwxhgTkBUIY4wxAVmBMMYYE5AVCGOMMQFZgTDGGBOQFQhjjDEBeTbcd7iJyF4grwGb6AR8HaJ0Yj1GuOLESoxwxbF9ib4Y4YrTkBipqto50IKYKRANJSLLtaYx0S1GROLESoxwxbF9ib4Y4YrjVQzrYjLGGBOQFQhjjDEBWYH4n79YjKiLEysxwhXH9iX6YoQrjicx7ByEMcaYgOwIwhhjTEBWIIwxxgTUpAuEiPQSkUwRWS8iOSJyh0dxEkRkmYisduM84EUcN1aciKwUkYUebX+biKx1HwO73IsYbpz2IjJfRDaKyAYROSXE2z/B73G2q0TkoIjcGcoYbpwfu//m60TkNRFJCHUMN84dboycUO2HiDwvIntEZJ3fvA4i8r6IfOn+TPYozuXuvvhEpMGXb9YQ41H3/9caEfmbiLT3IMav3O2vEpH3RKTBj60MFMdv2V0ioiLSqaFxAFDVJvsCugGj3fdtgC+AoR7EESDJfd8C+Bw42aN9+gkwD1jo0fa3AZ3C8G/zInCT+z4eaO9hrDhgF84NQ6Hcbg9gK5DoTr8BTPcg/2HAOqAVzmOEPwAGhGC7E4DRwDq/eY8As9z3s4CHPYozBDgBWAKM8SjGeUBz9/3DDd2XGmK09Xv/I+D/ebEv7vxewGKcG4ZD8jfapI8gVHWnqq5w3x8CNuD8UYc6jqpqsTvZwn2F/OoAEekJXAQ8G+pth5OItMP5I3gOQFWPqup+D0OeA2xW1YbciV+T5kCiiDTH+QD/yoMYQ4DPVbVEVSuAj4HvNXSjqvpvYF+12ZNxijfuz+96EUdVN6jqpoZuu44Y77m/L4ClQE8PYhz0m2xNCP7ua/h3AfgD8NNQxKjSpAuEPxHpA4zC+XbvxfbjRGQVsAd4X1W9iPMEzn8QnwfbrqLAeyKSLSIzPIrRF9gLvOB2lz0rIq09igUwFXgt1BtV1QLg90A+sBM4oKrvhToOztHDGSLSUURaARfifJv0Qoqq7nTf7wJSPIoTbjcA//RiwyLyGxHZDlwDzPYoxmSgQFVXh3K7ViAAEUkC3gLurFbxQ0ZVK1V1JM63lHEiMiyU2xeRi4E9qpodyu0GcLqqjgYuAG4TkQkexGiOcwj9Z1UdBRzG6c4IORGJByYBb3qw7WScb9x9ge5AaxG5NtRxVHUDThfJe8C/gFVAZajjBIireHAkHG4ich9QAbzqxfZV9T5V7eVuf2aot+9+Kfg/PCg+Tb5AiEgLnOLwqqq+7XU8t6skE5gY4k2fBkwSkW1ABnC2iLwS4hhV34pR1T3A34BxoY4B7AB2+B1lzccpGF64AFihqrs92PZ3gK2quldVy4G3gVM9iIOqPqeqaao6ASjCOZ/mhd0i0g3A/bnHozhhISLTgYuBa9yC56VXgcs82G5/nC8hq92//57AChHp2tANN+kCISKC08+9QVUf9zBO56orJEQkETgX2BjKGKp6r6r2VNU+OF0mH6lqSL+tikhrEWlT9R7nJN+3rqRoKFXdBWwXkRPcWecA60Mdx3UVHnQvufKBk0Wklft/7Ryc81whJyJd3J+9cc4/zPMiDrAAuM59fx3wjkdxPCciE3G6ZCepaolHMQb6TU4mxH/3AKq6VlW7qGof9+9/B87FN7tCsfEm+wJOxzlEXoNzWL4KuNCDOCOAlW6cdcBsj/frLDy4ignoB6x2XznAfR7uw0hgufs7+zuQ7EGM1kAh0M7D/XgA50NhHfAy0NKjOP/BKaKrgXNCtM3XcM6dlLsfOjcCHYEPgS9xrpbq4FGcS933ZcBuYLEHMXKB7X5/+w26wqiGGG+5//ZrgH8APbz4fVVbvo0QXcVkQ20YY4wJqEl3MRljjKmZFQhjjDEBWYEw
xhgTkBUIY4wxAVmBMMYYE5AVCNNouKNUPuY3fbeI3B+ibc8VkSmh2FYdcS53R6fN9DIvEekjIlfXP0Nj/scKhGlMyoDvhWwo4xBxB+IL1o3AD1Q13at8XH2AehWIeu6HaQKsQJjGpALn2bs/rr6g+jdtESl2f54lIh+LyDsiskVEHhKRa8R5PsdaEenvt5nviMhyEfnCHduqapDFR0Ukyx3X/2a/7f5HRBYQ4C5vEbnK3f46EXnYnTcb5+bM50Tk0QBtfua2WS0iDwVYvq2qOIrIGBFZ4r4/U/73XIuV7t3uD+EM4LdKnGdSBLUf7t3y77o5rBORK4P5hzGxyb4xmMZmDrBGRB6pR5uTcIbE3gdsAZ5V1XHiPCDqdqDqATt9cMaW6g9kisgA4Ps4o7COFZGWwCciUjUi62hgmKpu9Q8mzkNhHgbScMZFek9EvquqD4rI2cDdqrq8WpsLcIZiGK+qJSLSoR77dzdwm6p+4g48WYozuOHdqlpV6GYEsx8ichnwlape5LZrV488TIyxIwjTqKgz2u5LOA9fCVaWOs/+KAM244x6CrAWpyhUeUNVfar6JU4hGYwz3tT3xRmq/XOcoSaqxtdZVr04uMYCS9QZpK9qlNC6Rr39DvCCumMCqWqg8f5r8gnwuIj8COfBShUB1gl2P9YC54rIwyJyhqoeqEceJsZYgTCN0RM4ffn+z4iowP3/LCLNcJ5CV6XM773Pb9rHN4+iq487ozhPA7xdVUe6r776v2c6HG7QXtTfsX0Ejj26VFUfAm4CEnGODAYHaBvUfqjqFzhHFGuBX7vdYqaJsgJhGh332/UbOEWiyjacLh1wnu/Q4jg2fbmINHPPS/QDNuE8wvFWd1h4RGSQ1P3womXAmSLSSUTicEaM/biONu8D14sztj81dDFt43/7eGzYaBHpr86Ing8DWThHPodwHqNbJaj9cLvHSlT1FeBRvBtm3TQCdg7CNFaP8c2Hr/wVeEdEVuM8NOd4vt3n43y4twVuUdVSEXkWpxtqhTtk917qeMymqu4UkVk4z/0Q4F1VrXVYbFX9l4iMBJaLyFFgEc5DYPw9gHOC+1c4z2qucqeIpOMcEeXgPBnNB1S6v4+5wJNB7sdw4FER8eGMFnprbXmb2GajuRpjjAnIupiMMcYEZAXCGGNMQFYgjDHGBGQFwhhjTEBWIIwxxgRkBcIYY0xAViCMMcYE9P8BePc/ZJG+fksAAAAASUVORK5CYII=\n"
389 |           },
390 |           "metadata": {
391 |             "needs_background": "light"
392 |           }
393 |         }
394 |       ]
395 |     },
396 |     {
397 |       "cell_type": "code",
398 |       "source": [
399 |         "# get the value of K for which silhouette score is highest\n",
400 |         "print(np.argmax(sil_coef_digits)+2)"
401 |       ],
402 |       "metadata": {
403 |         "colab": {
404 |           "base_uri": "https://localhost:8080/"
405 |         },
406 |         "id": "_L92U019z8XA",
407 |         "outputId": "8c462436-9fb4-48ef-8f9d-36e808de4aba"
408 |       },
409 |       "execution_count": 12,
410 |       "outputs": [
411 |         {
412 |           "output_type": "stream",
413 |           "name": "stdout",
414 |           "text": [
415 |             "9\n"
416 |           ]
417 |         }
418 |       ]
419 |     },
420 |     {
421 |       "cell_type": "markdown",
422 |       "source": [
423 |         "The silhouette score is highest at `k=9`, which could mean that some digits, such as 1 and 7, look strikingly similar and end up in the same cluster."
424 |       ],
425 |       "metadata": {
426 |         "id": "ZX7SlAg80NnK"
427 |       }
428 |     },
429 |     {
430 |       "cell_type": "markdown",
431 |       "source": [
432 |         "## [Lecture 11.2: Hierarchical Agglomerative Clustering (HAC)](https://www.youtube.com/watch?v=GG6yYMO91FQ)\n",
433 |         "Clustering is concerned with grouping objects that have *similar attributes* or *characteristics*. The objects in the same cluster are closer to one another than to the objects from other clusters.\n",
434 |         "\n",
435 |         "<img src=\"images/cluster.png\"/>\n",
436 |         "\n",
437 |         "In the image above, the points with the same color belong to the same cluster and share similar properties (feature values are represented on the axes). For instance, if the x-axis represents weight and the y-axis represents height, then the red cluster represents people with low BMI. Similar interpretations can be drawn for the remaining clusters.\n",
438 |         "\n",
439 |         "Here we will discuss another clustering algorithm: the **hierarchical agglomerative clustering (HAC)** algorithm.\n",
440 |         "- Hierarchical agglomerative clustering starts by considering each datum as its own cluster and then repeatedly combines the closest clusters to form larger clusters. This is the bottom-up approach.\n",
441 |         "- There is an alternative, top-down approach, where the entire data set starts as one large cluster, which is divided into smaller clusters in each step.\n",
442 |         "\n",
443 |         "The merging and splitting decisions are influenced by certain conditions that will be discussed shortly."
444 |       ],
445 |       "metadata": {
446 |         "id": "exX3J-wB5C0Q"
447 |       }
448 |     },
449 |     {
450 |       "cell_type": "markdown",
451 |       "source": [
452 |         "### Metric\n",
453 |         "Certain metrics are used for calculating the similarity between clusters. Note that a metric is a generalization of the concept of distance. Metrics satisfy certain properties, such as:\n",
454 |         "1. non-negative\n",
455 |         "2. symmetric\n",
456 |         "3. follows triangle inequality\n",
457 |         "\n",
458 |         "Some popular metric functions are:\n",
459 |         "1. **Euclidean**:\n",
460 |         "$$d(\\mathbf{x}^{(i)}, \\mathbf{x}^{(j)}) = \\sqrt{\\sum_{l=1}^m (\\mathbf{x}_l^{(i)} - \\mathbf{x}_l^{(j)})^2}$$\n",
461 |         "2. **Manhattan**:\n",
462 |         "$$d(\\mathbf{x}^{(i)}, \\mathbf{x}^{(j)}) =  \\sum_{l=1}^m |(\\mathbf{x}_l^{(i)} - \\mathbf{x}_l^{(j)})|$$\n",
463 |         "3. **Cosine distance**:\n",
464 |         "$$d(\\mathbf{x}^{(i)}, \\mathbf{x}^{(j)}) = 1 - \\frac{\\mathbf{x}^{(i)} \\cdot \\mathbf{x}^{(j)}}{||\\mathbf{x}^{(i)}|| \\times ||\\mathbf{x}^{(j)}||} = 1 - \\cos (\\theta)$$"
465 |       ],
466 |       "metadata": {
467 |         "id": "Q1GdOSNyHBU4"
468 |       }
469 |     },
470 |     {
471 |       "cell_type": "markdown",
472 |       "source": [
473 |         "### Linkage\n",
474 |         "Linkage is a strategy for aggregating clusters\n",
475 |         "\n",
476 |         "There are four linkages we will study\n",
477 |         "- Single linkage\n",
478 |         "- Average linkage\n",
479 |         "- Complete linkage\n",
480 |         "- Ward's linkage\n",
481 |         "\n",
482 |         "The single linkage criterion merges clusters based on the shortest distance over all possible pairs of points, one from each cluster. That is,\n",
483 |         "$$d(\\{\\mathbf{x}_{r_1}^{(i)}\\}_{i=1}^{|r_1|}, \\{\\mathbf{x}_{r_2}^{(j)}\\}_{j=1}^{|r_2|}) = \\min_{i,j} d(\\mathbf{x}_{r_1}^{(i)}, \\mathbf{x}_{r_2}^{(j)}) $$\n",
484 |         "\n",
485 |         "<img src=\"images/SingleLinkage.png\" />\n",
486 |         "\n",
487 |         "The complete linkage criterion merges clusters so as to minimize the maximum distance between them (in other words, the distance between their furthest elements).\n",
488 |         "\n",
489 |         "$$d(\\{\\mathbf{x}_{r_1}^{(i)}\\}_{i=1}^{|r_1|}, \\{\\mathbf{x}_{r_2}^{(j)}\\}_{j=1}^{|r_2|}) = \\max_{i,j} d(\\mathbf{x}_{r_1}^{(i)}, \\mathbf{x}_{r_2}^{(j)})$$\n",
490 |         "\n",
491 |         "<img src=\"images/CompleteLinkage.png\" />\n",
492 |         "\n",
493 |         "The average linkage criterion uses the average distance over all possible pairs of points between the two groups for merging clusters.\n",
494 |         "\n",
495 |         "$$d(\\{\\mathbf{x}_{r_1}^{(i)}\\}_{i=1}^{|r_1|}, \\{\\mathbf{x}_{r_2}^{(j)}\\}_{j=1}^{|r_2|}) = \\frac{1}{|r_1||r_2|} \\sum_{i=1}^{|r_1|}\\sum_{j=1}^{|r_2|} d(\\mathbf{x}_{r_1}^{(i)}, \\mathbf{x}_{r_2}^{(j)}) $$\n",
496 |         "\n",
497 |         "<img src=\"images/AverageLinkage.png\" />\n",
498 |         "\n",
499 |         "Ward's linkage merges the two clusters whose union causes the smallest increase in the total within-cluster sum of squared distances. With $\\boldsymbol{\\mu}_{r}$ denoting the centroid of cluster $r$, that increase is\n",
500 |         "$$d(\\{\\mathbf{x}_{r_1}^{(i)}\\}_{i=1}^{|r_1|}, \\{\\mathbf{x}_{r_2}^{(j)}\\}_{j=1}^{|r_2|}) = \\frac{|r_1||r_2|}{|r_1| + |r_2|} ||\\boldsymbol{\\mu}_{r_1} - \\boldsymbol{\\mu}_{r_2}||^2 $$\n"
501 |       ],
502 |       "metadata": {
503 |         "id": "8Ax_oFePQG2W"
504 |       }
505 |     },
506 |     {
507 |       "cell_type": "markdown",
508 |       "source": [
509 |         "### Hierarchical Agglomerative Clustering\n",
510 |         "\n",
511 |         "Algorithm:\n",
512 |         "1. Calculate the distance matrix between all pairs of objects (initially, each object is its own cluster)\n",
513 |         "2. Repeat until all the objects are clustered into one:\n",
514 |         "    - Detect the two closest groups (clusters) and merge them\n"
515 |       ],
516 |       "metadata": {
517 |         "id": "zOW9btr21KEK"
518 |       }
519 |     },
520 |     {
521 |       "cell_type": "markdown",
522 |       "source": [
523 |         "### Dendrograms \n",
524 |         "A dendrogram is a graphical representation of the agglomerative process that shows how clusters are merged at each level. Let's take a toy dataset as an example to understand this.\n",
525 |         "\n",
526 |         "Example:\n",
527 |         "\n",
528 |         "| $x_1$ | $x_2$ |\n",
529 |         "| --- | --- |\n",
530 |         "| 8 | 3 |\n",
531 |         "| 5 | 3 |\n",
532 |         "| 6 | 4 |\n",
533 |         "| 1 | 6 |\n",
534 |         "| 2 | 8 |"
535 |       ],
536 |       "metadata": {
537 |         "id": "jz_DOow_1m5H"
538 |       }
539 |     },
540 |     {
541 |       "cell_type": "code",
542 |       "source": [
543 |         "import numpy as np\n",
544 |         "import matplotlib.pyplot as plt\n",
545 |         "from sklearn.preprocessing import normalize"
546 |       ],
547 |       "metadata": {
548 |         "id": "s5TgF9aM0MOd"
549 |       },
550 |       "execution_count": 13,
551 |       "outputs": []
552 |     },
553 |     {
554 |       "cell_type": "code",
555 |       "source": [
556 |         "X = np.array([(8, 3), (5, 3), (6, 4), (1, 6), (2, 8)])\n",
557 |         "scaled_X = normalize(X)\n",
558 |         "plt.scatter(X[:, 0], X[:, 1])\n",
559 |         "plt.show()"
560 |       ],
561 |       "metadata": {
562 |         "colab": {
563 |           "base_uri": "https://localhost:8080/",
564 |           "height": 265
565 |         },
566 |         "id": "E1PwpJlz23_N",
567 |         "outputId": "c8255f3d-aa78-4462-dcee-33e4312433ab"
568 |       },
569 |       "execution_count": 14,
570 |       "outputs": [
571 |         {
572 |           "output_type": "display_data",
573 |           "data": {
574 |             "text/plain": [
575 |               "<Figure size 432x288 with 1 Axes>"
576 |             ],
577 |             "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWoAAAD4CAYAAADFAawfAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAANrUlEQVR4nO3db2zcB33H8fd3djouGWC0WqhxpqWPLE0tqyurYxRVWkPxOqoSVXtQpO4BbMoTxFomGZFH1fZg1WQ0sUdIUTrGRFcEwc2D/ak7iSKGxMqcuOCuxQ+AttQp5KrJlFa3YcJ3D3wuTeo/5+TO9z3n/ZKiJudfzh9VzjuX3/3OF5mJJKmuX+v3AEnS1gy1JBVnqCWpOEMtScUZakkqbrgXd3rttdfm4cOHe3HXkrQnnTlz5pXMHN3oYz0J9eHDh5mfn+/FXUvSnhQRL2z2MU99SFJxhlqSijPUklScoZak4gy1JBXX0VUfEfFJ4M+ABBaBj2bm//ZyWGWnF5aZmVvi3EqLgyMNpqfGOTox1u9ZkvaobR9RR8QY8OfAZGbeAAwB9/Z6WFWnF5Y5PrvI8kqLBJZXWhyfXeT0wnK/p0naozo99TEMNCJiGNgPnOvdpNpm5pZorV646LbW6gVm5pb6tEjSXrdtqDNzGfgM8CLwMvDTzHzi0uMi4lhEzEfEfLPZ7P7SIs6ttHZ0uyRdqU5OfbwL+DBwPXAQOBAR9116XGaeyMzJzJwcHd3wVZB7wsGRxo5ul6Qr1cmpjw8AP8zMZmauArPA+3o7q67pqXEa+4Yuuq2xb4jpqfE+LZK013Vy1ceLwHsjYj/QAo4AV+038li/usOrPiTtlm1DnZlPRcQp4CzwC2ABONHrYZUdnRgzzJJ2TUfXUWfmg8CDPd4iSdqAr0yUpOIMtSQVZ6glqThDLUnFGWpJKs5QS1JxhlqSijPUklScoZak4gy1JBVnqCWpOEMtScUZakkqzlBLUnGGWpKKM9SSVJyhlqTiDLUkFWeoJak4Qy1JxRlqSSrOUEtScYZakooz1JJUnKGWpOIMtSQVZ6glqThDLUnFGWpJKs5QS1JxhlqSits21BExHhFPv+nHqxHxwG6MkyTB8HYHZOYScBNARAwBy8BjPd4lSWrb6amPI8D3M/OFXoyRJL3VTkN9L/DoRh+IiGMRMR8R881m88qXSZKAHYQ6Iq4B7ga+stHHM/NEZk5m5uTo6Gi39knSVW8nj6jvBM5m5k96NUaS9FY7CfVH2OS0hySpdzoKdUQcAO4AZns7R5J0qW0vzwPIzNeB3+zxFknSBnxloiQVZ6glqThDLUnFGWpJKs5QS1JxhlqSijPUklScoZak4gy1JBVnqCWpOEMtScUZakkqzlBLUnGGWpKKM9SSVJyhlqTiDLUkFWeoJak4Qy1JxRlqSSrOUEtScYZakooz1JJUnKGWpOIMtSQVZ6glqThDLUnFGWpJKs5QS1JxhlqSihvu5KCIGAFOAjcACXwsM7/VzSGnF5aZmVvi3EqLgyMNpqfGOTox1s1PIUkDqaNQA38HPJ6ZfxwR1wD7uzni9MIyx2cXaa1eAGB5pcXx2UUAYy3pqrftqY+IeCdwG/AwQGb+PDNXujliZm7pjUiva61eYGZuqZufRpIGUifnqK8HmsDnI2IhIk5GxIFLD4qIYxExHxHzzWZzRyPOrbR2dLskXU06CfUwcDPwucycAF4HPn3pQZl5IjMnM3NydHR0RyMOjjR2dLskXU06CfVLwEuZ+VT716dYC3fXTE+N09g3dNFtjX1DTE+Nd/PTSNJA2jbUmflj4EcRsV7NI8Cz3RxxdGKMh+65kbGRBgGMjTR46J4bfSJRkuj8qo9PAI+0r/j4AfDRbg85OjFmmCVpAx2FOjOfBiZ7vEWStAFfmShJxRlqSSrOUEtScYZakooz1JJUnKGWpOIMtSQVZ6gl
qThDLUnFGWpJKs5QS1JxhlqSijPUklScoZak4gy1JBVnqCWpOEMtScUZakkqzlBLUnGGWpKKM9SSVJyhlqTiDLUkFWeoJak4Qy1JxRlqSSrOUEtScYZakooz1JJUnKGWpOIMtSQVN9zJQRHxPPAz4ALwi8yc7OUoSdKvdBTqtj/IzFd6tkSStCFPfUhScZ2GOoEnIuJMRBzb6ICIOBYR8xEx32w2u7dQkq5ynYb6/Zl5M3An8PGIuO3SAzLzRGZOZubk6OhoV0dK0tWso1Bn5nL7v+eBx4BbejlKkvQr24Y6Ig5ExNvXfw58EHim18MkSWs6uerj3cBjEbF+/D9l5uM9XSVJesO2oc7MHwC/uwtbJEkb8PI8SSrOUEtScYZakooz1JJUnKGWpOIMtSQVZ6glqThDLUnFGWpJKs5QS1JxhlqSijPUklScoZak4gy1JBVnqCWpOEMtScUZakkqzlBLUnGGWpKKM9SSVJyhlqTiDLUkFWeoJak4Qy1JxRlqSSrOUEtScYZakooz1JJUnKGWpOIMtSQVN9zpgRExBMwDy5l5V+8mSeqF0wvLzMwtcW6lxcGRBtNT4xydGOv3LHWg41AD9wPPAe/o0RZJPXJ6YZnjs4u0Vi8AsLzS4vjsIoCxHgAdnfqIiEPAh4CTvZ0jqRdm5pbeiPS61uoFZuaW+rRIO9HpOerPAp8CfrnZARFxLCLmI2K+2Wx2ZZyk7ji30trR7apl21BHxF3A+cw8s9VxmXkiMyczc3J0dLRrAyVduYMjjR3drlo6eUR9K3B3RDwPfAm4PSK+2NNVkrpqemqcxr6hi25r7Btiemq8T4u0E9uGOjOPZ+ahzDwM3At8LTPv6/kySV1zdGKMh+65kbGRBgGMjTR46J4bfSJxQOzkqg9JA+zoxJhhHlA7CnVmfh34ek+WSJI25CsTJak4Qy1JxRlqSSrOUEtScYZakooz1JJUnKGWpOIMtSQVZ6glqThDLUnFGWpJKs5QS1JxhlqSijPUklScoZak4gy1JBVnqCWpOEMtScUZakkqzlBLUnGGWpKKM9SSVJyhlqTiDLUkFWeoJak4Qy1JxRlqSSrOUEtScYZakooz1JJUnKGWpOKGtzsgIt4GfAP49fbxpzLzwV4PkwbB6YVlZuaWOLfS4uBIg+mpcY5OjPV7lnZZr78Otg018H/A7Zn5WkTsA74ZEf+Wmf/ZtRXSADq9sMzx2UVaqxcAWF5pcXx2EcBYX0V24+tg21Mfuea19i/3tX9kVz67NMBm5pbe+MO5rrV6gZm5pT4tUj/sxtdBR+eoI2IoIp4GzgP/nplPbXDMsYiYj4j5ZrPZtYFSVedWWju6XXvTbnwddBTqzLyQmTcBh4BbIuKGDY45kZmTmTk5OjratYFSVQdHGju6XXvTbnwd7Oiqj8xcAZ4E/rBrC6QBNT01TmPf0EW3NfYNMT013qdF6ofd+Dro5KqPUWA1M1ciogHcAfxN1xZIA2r9iSKv+ri67cbXQWRu/bxgRLwH+AIwxNoj8C9n5l9t9XsmJydzfn6+ayMlaa+LiDOZObnRx7Z9RJ2Z3wUmur5KktQRX5koScUZakkqzlBLUnGGWpKK2/aqj8u604gm8MJl/vZrgVe6OKeXBmkrDNbeQdoKg7V3kLbCYO29kq2/nZkbvlqwJ6G+EhExv9klKtUM0lYYrL2DtBUGa+8gbYXB2turrZ76kKTiDLUkFVcx1Cf6PWAHBmkrDNbeQdoKg7V3kLbCYO3tydZy56glSRer+IhakvQmhlqSiisT6oj4+4g4HxHP9HvLdiLityLiyYh4NiL+OyLu7/emzUTE2yLi2xHxnfbWv+z3pk6031VoISL+ud9bthIRz0fEYkQ8HRHlv2VkRIxExKmI+F5EPBcRv9/vTRuJiPH2/9P1H69GxAP93rWViPhk+8/YMxHxaPuNwbtz31XOUUfEbcBrwD9m5lveQaaSiLgOuC4zz0bE24EzwNHMfLbP094iIgI4
8OY3Jwbur/7mxBHxF8Ak8I7MvKvfezYTEc8Dk5k5EC/IiIgvAP+RmScj4hpgf/sNQcqKiCFgGfi9zLzcF9L1VESMsfZn63cysxURXwb+NTP/oRv3X+YRdWZ+A/iffu/oRGa+nJln2z//GfAcUPK7xQ/imxNHxCHgQ8DJfm/ZSyLincBtwMMAmfnz6pFuOwJ8v2qk32QYaETEMLAfONetOy4T6kEVEYdZ+37db3nD3yo6eXPiYj4LfAr4Zb+HdCCBJyLiTEQc6/eYbVwPNIHPt08rnYyIA/0e1YF7gUf7PWIrmbkMfAZ4EXgZ+GlmPtGt+zfUVyAifgP4KvBAZr7a7z2b6eTNiauIiLuA85l5pt9bOvT+zLwZuBP4ePsUXlXDwM3A5zJzAngd+HR/J22tfXrmbuAr/d6ylYh4F/Bh1v4yPAgciIj7unX/hvoytc/3fhV4JDNn+72nEwPy5sS3Ane3z/1+Cbg9Ir7Y30mbaz+SIjPPA48Bt/R30ZZeAl5607+oTrEW7sruBM5m5k/6PWQbHwB+mJnNzFwFZoH3devODfVlaD9B9zDwXGb+bb/3bCUiRiNipP3z9Tcn/l5/V20uM49n5qHMPMzaP3m/lplde2TSTRFxoP1kMu1TCB8Eyl61lJk/Bn4UEetvj30EKPcE+CU+QvHTHm0vAu+NiP3tPhxh7bmrrigT6oh4FPgWMB4RL0XEn/Z70xZuBf6EtUd765cP/VG/R23iOuDJiPgu8F+snaMufcnbAHk38M2I+A7wbeBfMvPxPm/azieAR9pfDzcBf93nPZtq/+V3B2uPTktr/yvlFHAWWGStrV17OXmZy/MkSRsr84hakrQxQy1JxRlqSSrOUEtScYZakooz1JJUnKGWpOL+H8N3uVec8v9QAAAAAElFTkSuQmCC\n"
578 |           },
579 |           "metadata": {
580 |             "needs_background": "light"
581 |           }
582 |         }
583 |       ]
584 |     },
585 |     {
586 |       "cell_type": "markdown",
587 |       "source": [
588 |         "Let's plot the dendrogram with the `scipy.cluster.hierarchy` module, using Ward's linkage on the normalized data `scaled_X`."
589 |       ],
590 |       "metadata": {
591 |         "id": "Y5REenHR3Shb"
592 |       }
593 |     },
594 |     {
595 |       "cell_type": "code",
596 |       "source": [
597 |         "import scipy.cluster.hierarchy as shc\n",
598 |         "plt.figure(figsize=(8, 8))\n",
599 |         "plt.title('Dendrogram')\n",
600 |         "dend = shc.dendrogram(shc.linkage(scaled_X, method=\"ward\"))"
601 |       ],
602 |       "metadata": {
603 |         "colab": {
604 |           "base_uri": "https://localhost:8080/",
605 |           "height": 502
606 |         },
607 |         "id": "Ogbv_h9o3NAe",
608 |         "outputId": "99b4e1a5-487f-48c7-fd45-08d8946a2b03"
609 |       },
610 |       "execution_count": 17,
611 |       "outputs": [
612 |         {
613 |           "output_type": "display_data",
614 |           "data": {
615 |             "text/plain": [
616 |               "<Figure size 576x576 with 1 Axes>"
617 |             ],
618 |             "image/png": "iVBORw0KGgoAAAANSUhEUgAAAeMAAAHlCAYAAADGLpQlAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAXIElEQVR4nO3df7Dld13f8dfbrOAPCDRkRU2yBDEEV6NAt6i1DrRqCWklziiaCFYjGKumv3Qc8Vek0dZqHZ3qBDEtJNQfIFDUbVlkqqLWH0EW+WUCG7cRyYZEQoLhdyD23T/OiXNdNrkn7Nl9Z88+HjN3cr/f7+d+zzt3Nnne7/ecPbe6OwDAnE+aHgAATnZiDADDxBgAhokxAAwTYwAYJsYAMEyMYYNU1bdU1R9MzwHcP2IMx1hVvaOqPlxV76+qv66qP6qqf1lV/vsDkogxHC9f3d0PTfKoJP8pyfcleeHxHKCqdjyQzwcnMzGG46i77+zuvUm+Ick3V9UXVNWDq+qnquqdVfVXVfWCqvrUJKmqp1TVoar6nqp6d1XdUlWX3HO+qnpEVe2tqvdV1Z8keczWx6uqrqrvqqo/T/Lny33fVlUHq+qO5dd+9pb1/7SqDlTVnVX1/Kr6vap6zvLYt1TVH1bVz1TV7UmeV1WPqarfqarbq+o9VfXLVfXwLed7R1V9b1W9pao+WFUvrKpHVtWrl3cKfquq/t4x/JbDCUGMYUB3/0mSQ0m+PIsr5ccmeXySz01yRpLLtyz/zCQPW+5/dpIrtwTsyiQfSfJZSb51+XG4r0nyxUl2V9U/SfLjSb5++TV/meSlSVJVpyd5RZLvT/KIJAeS/MPDzvXFSW5M8sgk/yFJLc/32Uk+L8lZSZ532Nd8bZKvWv47fnWSVyf5gSQ7s/h/0L++128UnCTEGOa8K8lpSS5N8u+6+47ufn+S/5jkoi3rPpbkiu7+WHfvS/KBJOdW1SlZhO7y7v5gd/9Zkhcf4XF+fHnuDyd5ZpIXdfefdvddWYT3S6vq7CQXJLmuu1/Z3Xcn+dkktx4+c3f/XHff3d0f7u6D3f2/u/uu7r4tyU8nefJhX/Nz3f1X3X1zkv+T5HXd/cbu/kiSX0vyhE/gewcbxXM+MOeMLP4b/LQkb6iqe/ZXklO2rLt9Gcd7fCjJQ7K4styR5KYtx/7yCI+z9fhnJ/nTeza6+wPLW85nLI/dtOVYV9Wh+zhXquqRSf5LFlf4D83iB/z3HvY1f7Xl8w8fYfshR5gZTiqujGFAVf2DLAL461kE6fO7++HLj4d19yqBui3J3VncGr7HriOs2/qr2d6VxYvI7pnj07O4JX1zkluSnLnlWG3dPsK5ksVVfCc5r7tPTfKsLH6YAO4HMYbjqKpOrap/nsXztL/U3W9O8l+T/ExVfcZyzRlV9dTtztXdf5PklVm8kOrTqmp3km/e5stekuSSqnp8VT04i5i+rrvfkeRVSc6rqq9ZvlL6u7J4vvq+PDSL2+Z3VtUZSb53u7mBjyfGcHz8z6p6fxa3eX8wi+dW73lV9PclOZjk2qp6X5LfSnLuiue9LIvbvLcmuSbJ1fe1uLt/K8kPJ/kfWVwJPybL56e7+z1JnpHkJ5PcnmR3kv1J7rqPU/77JE9McmcWMX/linMDW1T34XedAJLlm5IcSvLM7n7t9DywyVwZA3+rqp5aVQ9f3sL+gSye/712eCzYeGIMbPWlSf5vkvdk8XeCv2b5V6KAY8htagAY5soYAIaJMQAMG3sHrtNPP73PPvvsqYcHgOPuDW94w3u6e+fh+8difPbZZ2f//v1TDw8Ax11VHekta92mBoBpYgwAw8QYAIaJMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw8QYAIaJMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw3ZMD8Cx8Suve2d+4003T48BD2gXPv6M
fOMX75oeA1wZb6rfeNPNuf6W902PAQ9Y19/yPj+w8oDhyniD7f6sU/Or3/6l02PAA9I3/MIfT48Af8uVMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw8QYAIaJMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw8QYAIaJMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw8QYAIaJMQAME2MAGLZtjKvqRVX17qr6s3s5/syqektVvbWq/qiqvmj9YwLA5lrlyviaJOffx/G/SPLk7j4vyY8muWoNcwHASWPHdgu6+/er6uz7OP5HWzavTXLm0Y8FACePdT9n/Owkr17zOQFgo217ZbyqqvrHWcT4H93HmkuTXJoku3btWtdDA8AJbS1XxlX1hUn+W5ILu/v2e1vX3Vd1957u3rNz5851PDQAnPCOOsZVtSvJK5N8U3ffcPQjAcDJZdvb1FX1kiRPSXJ6VR1K8iNJPjlJuvsFSS5P8ogkz6+qJLm7u/ccq4EBYNOs8mrqi7c5/pwkz1nbRABwkvEOXAAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADNs2xlX1oqp6d1X92b0cr6r62ao6WFVvqaonrn9MANhcq1wZX5Pk/Ps4/rQk5yw/Lk3y80c/FgCcPLaNcXf/fpI77mPJhUn+ey9cm+ThVfVZ6xoQADbdOp4zPiPJTVu2Dy33AQArOK4v4KqqS6tqf1Xtv+22247nQwPAA9Y6YnxzkrO2bJ+53Pdxuvuq7t7T3Xt27ty5hocGgBPfOmK8N8m/WL6q+kuS3Nndt6zhvABwUtix3YKqekmSpyQ5vaoOJfmRJJ+cJN39giT7klyQ5GCSDyW55FgNCwCbaNsYd/fF2xzvJN+1tokA4CTjHbgAYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABi2Uoyr6vyqOlBVB6vquUc4vquqXltVb6yqt1TVBesfFQA207YxrqpTklyZ5GlJdie5uKp2H7bsh5K8rLufkOSiJM9f96AAsKlWuTJ+UpKD3X1jd380yUuTXHjYmk5y6vLzhyV51/pGBIDNtkqMz0hy05btQ8t9Wz0vybOq6lCSfUn+1ZFOVFWXVtX+qtp/2223fQLjAsDmWdcLuC5Ock13n5nkgiS/WFUfd+7uvqq793T3np07d67poQHgxLZKjG9OctaW7TOX+7Z6dpKXJUl3/3GST0ly+joGBIBNt0qMX5/knKp6dFU9KIsXaO09bM07k3xFklTV52URY/ehAWAF28a4u+9OclmS1yR5Wxavmr6uqq6oqqcvl31Pkm+rqjcneUmSb+nuPlZDA8Am2bHKou7el8ULs7buu3zL59cn+bL1jgYAJwfvwAUAw8QYAIaJMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw8QYAIaJMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw8QY
AIaJMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw8QYAIaJMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw8QYAIaJMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw8QYAIaJMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw8QYAIaJMQAME2MAGCbGADBMjAFg2Eoxrqrzq+pAVR2squfey5qvr6rrq+q6qvqV9Y4JAJtrx3YLquqUJFcm+aokh5K8vqr2dvf1W9ack+T7k3xZd7+3qj7jWA0MAJtmlSvjJyU52N03dvdHk7w0yYWHrfm2JFd293uTpLvfvd4xAWBzrRLjM5LctGX70HLfVo9N8tiq+sOquraqzl/XgACw6ba9TX0/znNOkqckOTPJ71fVed3911sXVdWlSS5Nkl27dq3poQHgxLbKlfHNSc7asn3mct9Wh5Ls7e6PdfdfJLkhizj/Hd19VXfv6e49O3fu/ERnBoCNskqMX5/knKp6dFU9KMlFSfYetubXs7gqTlWdnsVt6xvXOCcAbKxtY9zddye5LMlrkrwtycu6+7qquqKqnr5c9pokt1fV9Ulem+R7u/v2YzU0AGySlZ4z7u59SfYdtu/yLZ93ku9efgAA94N34AKAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYNhKMa6q86vqQFUdrKrn3se6r62qrqo96xsRADbbtjGuqlOSXJnkaUl2J7m4qnYfYd1Dk/ybJK9b95AAsMlWuTJ+UpKD3X1jd380yUuTXHiEdT+a5CeSfGSN8wHAxlslxmckuWnL9qHlvr9VVU9MclZ3v2qNswHASeGoX8BVVZ+U5KeTfM8Kay+tqv1Vtf+222472ocGgI2wSoxvTnLWlu0zl/vu8dAkX5Dkd6vqHUm+JMneI72Iq7uv6u493b1n586dn/jUALBBVonx65OcU1WPrqoHJbkoyd57Dnb3nd19enef3d1nJ7k2ydO7e/8xmRgANsy2Me7uu5NcluQ1Sd6W5GXdfV1VXVFVTz/WAwLAptuxyqLu3pdk32H7Lr+XtU85+rEA4OThHbgAYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACG7ZgeADh+Xn7Dy7Pvxn3TYzwgHLjjyUmSS37zquFJHhgu+JwL8ozHPmN6jJOWGMNJZN+N+3LgjgM597Rzp0cZ94Qn/N70CA8YB+44kCRiPEiM4SRz7mnn5urzr54egweQS37zkukRTnqeMwaAYWIMAMPEGACGiTEADBNjABgmxgAwbKUYV9X5VXWgqg5W1XOPcPy7q+r6qnpLVf12VT1q/aMCwGbaNsZVdUqSK5M8LcnuJBdX1e7Dlr0xyZ7u/sIkr0jyk+seFAA21SpXxk9KcrC7b+zujyZ5aZILty7o7td294eWm9cmOXO9YwLA5lolxmckuWnL9qHlvnvz7CSvPpqhAOBksta3w6yqZyXZk+TJ93L80iSXJsmuXbvW+dAAcMJa5cr45iRnbdk+c7nv76iq
r0zyg0me3t13HelE3X1Vd+/p7j07d+78ROYFgI2zSoxfn+Scqnp0VT0oyUVJ9m5dUFVPSPILWYT43esfEwA217Yx7u67k1yW5DVJ3pbkZd19XVVdUVVPXy77z0kekuTlVfWmqtp7L6cDAA6z0nPG3b0vyb7D9l2+5fOvXPNcAHDS8A5cADBMjAFgmBgDwDAxBoBhYgwAw8QYAIaJMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw8QYAIaJMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw8QYAIaJMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw8QYAIaJMQAME2MAGLZjeoATxv6rk7e+YnqK1d164eKfV//Y7Bz313lfl+y5ZHoKgONKjFf11lckt741+czzpidZya/u+o3pEe6/W9+6+KcYAycZMb4/PvO85JJXTU+xua7+Z9MTAIzwnDEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGedMPgDV7+Q0vz74b902PsbK33/H2JMklv3livfvdBZ9zQZ7x2GdMj7EWrowB1mzfjfty4I4D02Os7HGnPS6PO+1x02PcLwfuOHBC/cCzHVfGAMfAuaedm6vPv3p6jI11ol3Fb8eVMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw8QYAIaJMQAMWynGVXV+VR2oqoNV9dwjHH9wVf3q8vjrqursdQ8KAJtq2xhX1SlJrkzytCS7k1xcVbsPW/bsJO/t7s9N8jNJfmLdgwLAplrlyvhJSQ52943d/dEkL01y4WFrLkzy4uXnr0jyFVVV6xsTADbXKjE+I8lNW7YPLfcdcU13353kziSPWMeAALDpjutvbaqqS5Ncutz8QFWdOL9j7B7f6oL/mPM9PuauyTXTI5wUfJ+PvRPwe/yoI+1cJcY3Jzlry/aZy31HWnOoqnYkeViS2w8/UXdfleSqVaYFgJPFKrepX5/knKp6dFU9KMlFSfYetmZvkm9efv51SX6nu3t9YwLA5tr2yri7766qy5K8JskpSV7U3ddV1RVJ9nf33iQvTPKLVXUwyR1ZBBsAWEG5gAWAWd6BCwCGiTEADBNjABgmxiuoql+qqluq6n1VdUNVPWd6pk1VVedU1Ueq6pemZ9k0VXVaVf1aVX2wqv6yqr5xeqZNU1WXVdX+qrqrqq6ZnmcTLX8XwguXf4bfX1VvqqqnTc91tI7rm36cwH48ybO7+66qelyS362qN3b3G6YH20BXZvHX6Vi/K5N8NMkjkzw+yauq6s3dfd3sWBvlXUl+LMlTk3zq8CybakcW7/j45CTvTHJBkpdV1Xnd/Y7JwY6GK+MVdPd13X3XPZvLj8cMjrSRquqiJH+d5LenZ9k0VfXpSb42yQ939we6+w+yeH+Ab5qdbLN09yu7+9dzhDc9Yj26+4Pd/bzufkd3/7/u/l9J/iLJ35+e7WiI8Yqq6vlV9aEkb09yS5J9wyNtlKo6NckVSb57epYN9dgkd3f3DVv2vTnJ5w/NA2tRVY/M4s/3CX2HR4xX1N3fmeShSb48ySuT3HXfX8H99KNJXtjdh6YH2VAPSfK+w/bdmcWfaTghVdUnJ/nlJC/u7rdPz3M0xPh+6O6/Wd7eOzPJd0zPsymq6vFJvjKL34XNsfGBJKcetu/UJO8fmAWOWlV9UpJfzOJ1EJcNj3PUvIDrE7MjnjNep6ckOTvJO5e/BvshSU6pqt3d/cTBuTbJDUl2VNU53f3ny31flBP81h4np1r8j+KFWbwY8YLu/tjwSEfNlfE2quozquqiqnpIVZ1SVU9NcnG8yGidrsrih5vHLz9ekORVWbwilTXo7g9m8fTKFVX16VX1ZUkuzOLKgjWpqh1V9SlZvI//KVX1KcvfZMd6/XySz0vy1d394elh1kGMt9dZ3JI+lOS9SX4qyb9d/oIM1qC7P9Tdt97zkcUt1Y90923Ts22Y78zir9u8O8lLknyH
v9a0dj+U5MNJnpvkWcvPf2h0og1TVY9K8u1Z/OB+a1V9YPnxzOHRjopfFAEAw1wZA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8Cw/w+ur6eq07qr5gAAAABJRU5ErkJggg==\n"
619 |           },
620 |           "metadata": {
621 |             "needs_background": "light"
622 |           }
623 |         }
624 |       ]
625 |     },
626 |     {
627 |       "cell_type": "markdown",
628 |       "source": [
629 |         "HAC is implemented in the `sklearn.cluster` module as the `AgglomerativeClustering` class."
630 |       ],
631 |       "metadata": {
632 |         "id": "hl87PtAx36qb"
633 |       }
634 |     }
635 |   ]
636 | }


--------------------------------------------------------------------------------