├── .gitignore ├── MLT ├── images │ ├── SVM.png │ ├── hinge_loss.png │ ├── RandomForest.png │ ├── lasso_contour.png │ ├── ridge_contour.png │ ├── softmax-model.png │ └── Softmax-regression.png ├── data │ └── weather_play.csv └── Week_9.ipynb ├── MLP ├── images │ ├── cluster.png │ ├── AverageLinkage.png │ ├── SingleLinkage.png │ ├── CompleteLinkage.png │ └── week_4_sns_pairplot.png ├── Week_2.ipynb └── Week_11.ipynb ├── DL ├── Week_1 │ ├── images │ │ ├── Perceptron.png │ │ ├── McCullochPitts.png │ │ ├── ArtificialNeuron.png │ │ ├── BiologicalNeuron.png │ │ └── DifferentVariationsofMcCullochPitts.png │ └── Week_1.md └── Week_3 │ ├── images │ ├── ErrorEquation.png │ ├── FeedForwardNN.png │ ├── example_2_3_3.png │ ├── example_3_3.png │ └── table_week_3_3.png │ ├── Lecture_3_1.md │ ├── Lecture_3_2.md │ └── Lecture_3_3.md ├── .gitattributes ├── README.md └── MLP-using-GPU └── 2-DataPreprocessing.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # pixi environments 3 | .pixi 4 | *.egg-info 5 | -------------------------------------------------------------------------------- /MLT/images/SVM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/MLT/images/SVM.png -------------------------------------------------------------------------------- /MLP/images/cluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/MLP/images/cluster.png -------------------------------------------------------------------------------- /MLT/images/hinge_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/MLT/images/hinge_loss.png -------------------------------------------------------------------------------- /MLP/images/AverageLinkage.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/MLP/images/AverageLinkage.png -------------------------------------------------------------------------------- /MLP/images/SingleLinkage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/MLP/images/SingleLinkage.png -------------------------------------------------------------------------------- /MLT/images/RandomForest.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/MLT/images/RandomForest.png -------------------------------------------------------------------------------- /MLT/images/lasso_contour.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/MLT/images/lasso_contour.png -------------------------------------------------------------------------------- /MLT/images/ridge_contour.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/MLT/images/ridge_contour.png -------------------------------------------------------------------------------- /MLT/images/softmax-model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/MLT/images/softmax-model.png -------------------------------------------------------------------------------- /DL/Week_1/images/Perceptron.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/DL/Week_1/images/Perceptron.png -------------------------------------------------------------------------------- /MLP/images/CompleteLinkage.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/MLP/images/CompleteLinkage.png -------------------------------------------------------------------------------- /DL/Week_3/images/ErrorEquation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/DL/Week_3/images/ErrorEquation.png -------------------------------------------------------------------------------- /DL/Week_3/images/FeedForwardNN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/DL/Week_3/images/FeedForwardNN.png -------------------------------------------------------------------------------- /DL/Week_3/images/example_2_3_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/DL/Week_3/images/example_2_3_3.png -------------------------------------------------------------------------------- /DL/Week_3/images/example_3_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/DL/Week_3/images/example_3_3.png -------------------------------------------------------------------------------- /MLP/images/week_4_sns_pairplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/MLP/images/week_4_sns_pairplot.png -------------------------------------------------------------------------------- /MLT/images/Softmax-regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/MLT/images/Softmax-regression.png 
-------------------------------------------------------------------------------- /DL/Week_1/images/McCullochPitts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/DL/Week_1/images/McCullochPitts.png -------------------------------------------------------------------------------- /DL/Week_3/images/table_week_3_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/DL/Week_3/images/table_week_3_3.png -------------------------------------------------------------------------------- /DL/Week_1/images/ArtificialNeuron.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/DL/Week_1/images/ArtificialNeuron.png -------------------------------------------------------------------------------- /DL/Week_1/images/BiologicalNeuron.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/DL/Week_1/images/BiologicalNeuron.png -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # SCM syntax highlighting & preventing 3-way merges 2 | pixi.lock merge=binary linguist-language=YAML linguist-generated=true 3 | -------------------------------------------------------------------------------- /DL/Week_1/images/DifferentVariationsofMcCullochPitts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mourya96/IITM-Notes/HEAD/DL/Week_1/images/DifferentVariationsofMcCullochPitts.png -------------------------------------------------------------------------------- /MLT/data/weather_play.csv: 
-------------------------------------------------------------------------------- 1 | Outlook,Temperature,Humidity,Wind,Play 2 | Sunny,Hot,High,Weak,No 3 | Sunny,Hot,High,Strong,No 4 | Overcast,Hot,High,Weak,Yes 5 | Rain,Mild,High,Weak,Yes 6 | Rain,Cool,Normal,Weak,Yes 7 | Rain,Cool,Normal,Strong,No 8 | Overcast,Cool,Normal,Strong,Yes 9 | Sunny,Mild,High,Weak,No 10 | Sunny,Cool,Normal,Weak,Yes 11 | Rain,Mild,Normal,Weak,Yes 12 | Sunny,Mild,Normal,Strong,Yes 13 | Overcast,Mild,High,Strong,Yes 14 | Overcast,Hot,Normal,Weak,Yes 15 | Rain,Mild,High,Strong,No 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # IITM-Notes 2 | 3 | Lecture notes and colab/jupyter notebooks 4 | 5 | Colab/jupyter notebooks were written manually, so some of the code might be modified (don't worry, most of it will stay true to the original code shown in the colab lectures). 6 | 7 | If you find any errors or mistakes in the notebooks, please notify me through issues. 8 | 9 | > NOTE: If you want to run MLT / MLP notebooks on GPU using cuML and cuPy, then I would recommend installing the rapids library through conda; otherwise you can install the current dependencies using pixi / conda / pip. 10 | -------------------------------------------------------------------------------- /DL/Week_3/Lecture_3_1.md: -------------------------------------------------------------------------------- 1 | # [Lecture 3.1: Feed Forward Neural Network](https://www.youtube.com/watch?v=HHv6Ndo9VBU) 2 | 3 | ![image](images/FeedForwardNN.png) 4 | 5 | - The input to the network is an $n$-dimensional vector 6 | - The network contains **$L - 1$** hidden layers (2 in this case) having **$n$** neurons each. 7 | - Finally, there is one output layer containing **$k$** neurons (say corresponding to **$k$** classes).
8 | - Each neuron in the hidden layer and the output layer can be split into two parts: pre-activation ($a_i$) and activation ($h_i$). Both $a_i$ and $h_i$ are vectors. 9 | - The input layer is called 0-th layer and the output layer is called the (**$L$**)-th layer. 10 | - $\large W_i \isin \mathbb{R}^{n \times n}$ and $\large b_i \isin \mathbb{R}^n$ are the weight and the bias between the layers $i-1$ and $i$ ( $\large 0 \lt i \lt L$ ). 11 | - $\large W_L \isin \mathbb{R}^{k \times n}$ and $\large b_L \isin \mathbb{R}^k$ are the weight and the bias between the last hidden layer and output layer (L = 3 in this case) 12 | - The pre-activation at the layer $i$ is given by 13 | $$ 14 | \large 15 | a_i(x) = b_i + W_i h_{i-1}(x) 16 | $$ 17 | - The activation at the layer $i$ is given by 18 | $$ 19 | \large 20 | h_i(x) = g(a_i(x)) 21 | $$ 22 | 23 | where $g$ is called the activation function (for example logistic, tanh, linear etc) 24 | 25 | - The activation at the output layer is given by 26 | $$ 27 | \large 28 | f(x) = h_L(x) = O(a_L(x)) 29 | $$ 30 | 31 | where $O$ is the output activation function (for example softmax, linear etc). 32 | 33 | > To simplify notation we will refer to $a_i(x)$ as $a_i$ and $h_i(x)$ as $h_i$. 34 | 35 | So in the above diagram, 36 | 37 | - **Data**: $\large \{x_i, y_i\}_{i=1}^N$ 38 | - **Model**: 39 | 40 | $$ 41 | \large 42 | \hat{y_i} = \hat{f}(x_i) = O(W_3g(W_2g(W_1x_i + b_1)+b_2)+b_3) 43 | $$ 44 | 45 | - **Parameters**: 46 | 47 | $$ 48 | \large 49 | \theta = W_1,.... ,W_L, b_1, b_2, ... , b_L \quad (L=3) 50 | $$ 51 | 52 | - **Algorithm**: Gradient descent with back-propagation 53 | - **Objective/Loss/Error Function**: Say, 54 | 55 | $$ 56 | min \frac{1}{N} \sum_{i=1}^N \sum_{j=1}^k (\hat{y}_{ij} - y_{ij})^2 \\ 57 | In \enspace general,\enspace min \enspace \mathscr{L}(\theta) 58 | $$ 59 | 60 | where $\mathscr{L}(\theta)$ is some function of parameters.
61 | -------------------------------------------------------------------------------- /DL/Week_3/Lecture_3_2.md: -------------------------------------------------------------------------------- 1 | # [Lecture 3.2 Learning parameters](https://www.youtube.com/watch?v=0Me1ywSlJE8) 2 | 3 | ![image](images/FeedForwardNN.png) 4 | 5 | ___ 6 | 7 | ## Algorithm: `gradient_descent()` 8 | 9 | ___ 10 | $$ 11 | \begin{align} 12 | &t \leftarrow 0; \\ 13 | &max\_iterations \leftarrow 1000; \\ 14 | &Initialize \enspace w_0, b_0; \\ 15 | & \mathbf{while} \enspace t \text{++} \lt max\_iterations \enspace \mathbf{do} \\ 16 | & \quad w_{t+1} \leftarrow w_t -\eta \nabla w_t \\ 17 | & \quad b_{t+1} \leftarrow b_t -\eta \nabla b_t \\ 18 | & \mathbf{end} 19 | \end{align} 20 | $$ 21 | 22 | We can concisely write it as: 23 | 24 | ___ 25 | 26 | ## Algorithm: `gradient_descent()` 27 | 28 | ___ 29 | $$ 30 | \begin{align} 31 | & t \leftarrow 0; \\ 32 | & max\_iterations \leftarrow 1000; \\ 33 | & Initialize \enspace \theta_0 = [w_0, b_0]; \\ 34 | & \mathbf{while} \enspace t \text{++} \lt max\_iterations \enspace \mathbf{do} \\ 35 | & \quad\theta_{t+1} \leftarrow \theta_t -\eta \nabla \theta_t \\ 36 | & \mathbf{end} 37 | \end{align} 38 | $$ 39 | where $\Large \nabla \theta_t = [\frac{\partial \mathscr{L}(\theta)}{\partial w_t}, \frac{\partial \mathscr{L}(\theta)}{\partial b_t}]^T$ 40 | 41 | - Now, in this feedforward neural network, instead of $\theta = [w, b]$ we have $\theta = [W_1, W_2,...,W_L, b_1, b_2,...,b_L]$ 42 | - We can still use the same algorithm for learning the parameters of our model. 
43 | 44 | ___ 45 | 46 | ## Algorithm: `gradient_descent()` 47 | 48 | ___ 49 | $$ 50 | \begin{align} 51 | & t \leftarrow 0; \\ 52 | & max\_iterations \leftarrow 1000; \\ 53 | & Initialize \enspace \color{red}{\theta_0 = [W_1^0,...,W_L^0, b_1^0,...,b_L^0];} \\ 54 | & \mathbf{while} \enspace t \text{++} \lt max\_iterations \enspace \mathbf{do} \\ 55 | & \quad\theta_{t+1} \leftarrow \theta_t -\eta \nabla \theta_t \\ 56 | & \mathbf{end} 57 | \end{align} 58 | $$ 59 | where $\color{red}{\Large \nabla \theta_t = [\frac{\partial \mathscr{L}(\theta)}{\partial W_{1,t}},...,\frac{\partial \mathscr{L}(\theta)}{\partial W_{L,t}},\frac{\partial \mathscr{L}(\theta)}{\partial b_{1,t}},...,\frac{\partial \mathscr{L}(\theta)}{\partial b_{L,t}}]^T}$ 60 | 61 | - Thus $\nabla \theta$ is composed of: 62 | - $\nabla W_1, \nabla W_2,..., \nabla W_{L-1} \in \mathbb{R}^{n \times n}, \nabla W_L \in \mathbb{R}^{k \times n}$ 63 | - $\nabla b_1, \nabla b_2,..., \nabla b_{L-1} \in \mathbb{R}^n, \nabla b_L \in \mathbb{R}^k$ 64 | -------------------------------------------------------------------------------- /DL/Week_3/Lecture_3_3.md: -------------------------------------------------------------------------------- 1 | # [Lecture 3.3: Output functions and loss functions](https://www.youtube.com/watch?v=1hefEWZHvJg) 2 | 3 | - The choice of loss function depends on the problem at hand 4 | - Consider movie example again but this time we are interested in predicting ratings 5 | 6 | ![image](images/example_3_3.png) 7 | 8 | - Here $y_i \in \mathbb{R}^3$ 9 | - The loss function should capture how much $\hat{y}_j$ deviates from $y_j$ 10 | - If $y_j \in \mathbb{R}^3$ then the squared error loss can capture this deviation 11 | 12 | $$ 13 | \mathscr{L}(\theta) = \frac{1}{N} \sum_{i=1}^N \sum_{j=1}^k (\hat{y}_{ij} - y_{ij})^2 14 | $$ 15 | 16 | - A related question would be: what should the output function 'O' be if $y_j \in \mathbb{R}$ 17 | - More specifically, can it be the logistic function?
18 | - No, because it restricts $\hat{y}_j$ to a value between 0 and 1. But we want $y_j \in \mathbb{R}$ 19 | - So, in such cases it makes sense to have 'O' as a linear function 20 | 21 | $$ 22 | \begin{align} 23 | \hat{f}(x) &= h_L = O(a_L) \\ 24 | &= W_O a_L + b_O 25 | \end{align} 26 | $$ 27 | 28 | - $\hat{y}_j = \hat{f}(x_j)$ is no longer bounded between 0 and 1 29 | 30 | ![image](images/example_2_3_3.png) 31 | 32 | - Now let us consider another problem for which a different loss function would be appropriate. 33 | - Suppose we want to classify an image into 1 of the $k$ classes 34 | - Here again we could use the squared error loss to capture the deviation 35 | - Notice that $y$ is a probability distribution 36 | - Therefore we should also ensure that $\hat{y}$ is a probability distribution 37 | - We use **softmax** function to get the expected output in a probability distribution 38 | - $a_L = W_L h_{L-1} + b_L$ 39 | - $\large \hat{y}_j = O(a_L)_j = \frac{e^{a_{L,j}}}{\sum_{i=1}^k e^{a_{L,i}}}$ 40 | - $O(a_L)_j$ is the $j^{th}$ element of $\hat{y}$ and $a_{L,j}$ is the $j^{th}$ element of the vector $a_L$. 41 | - **Cross Entropy**: 42 | $$\mathscr{L}(\theta) = -\sum_{c=1}^k y_c \log \hat{y}_c$$ 43 | 44 | Notice that 45 | 46 | - $y_c = 1$ if $c = l$ (the true class label) and 0 otherwise. 47 | 48 | $$ 49 | \therefore \enspace \mathscr{L}(\theta) = - \log \hat{y}_l 50 | $$ 51 | 52 | - So for classification problem (where we have to choose 1 of K classes), we use the following objective function 53 | - $\text{minimize} \enspace \mathscr{L}(\theta) = - \log \hat{y}_l$ 54 | - $\hat{y}_l$ is a function of $\theta$ and is the probability that $x$ belongs to class $l$. 55 | - $\log \hat{y}_l$ is called **log-likelihood** of the data.
56 | 57 | ![image](images/table_week_3_3.png) 58 | -------------------------------------------------------------------------------- /DL/Week_1/Week_1.md: -------------------------------------------------------------------------------- 1 | # Week 1 2 | 3 | ## [Lecture 1.6: Motivation from Biological Neuron](https://www.youtube.com/watch?v=KjMvUwq7PdQ) 4 | 5 | - The most fundamental unit of a deep neural network is called an **artificial** *neuron*. 6 | - The inspiration comes from biology (more specifically from the brain) 7 | - **biological neurons = neuron cells = neural processing units** 8 | 9 | ![ArtificialNeuron](images/ArtificialNeuron.png) 10 | 11 | ### Biological Neuron 12 | 13 | ![BiologicalNeuron](images/BiologicalNeuron.png) 14 | 15 | - **dendrite**: receives signals from other neurons 16 | - **synapse**: point of connection to other neurons 17 | - **soma**: processes the information 18 | - **axon**: transmits the output of the neuron 19 | 20 | - Our sensory organs interact with the outside world and they relay information to the neurons. The neurons (may) get activated and produce a response 21 | - Of course, in reality, it's not just a single neuron that does all this but there is a massively parallel interconnected network of neurons. 22 | - The sensory organs relay information to the lowest layer of neurons. 23 | - An average human brain has around $10^{11}$ (100 billion) neurons 24 | - This massively parallel network also ensures that there is division of work 25 | - Each neuron may perform a certain role or respond to a certain stimulus.
26 | 27 | ## [Lecture 1.7: McCulloch Pitts Neuron and Thresholding Logic](https://www.youtube.com/watch?v=-bxOadOFNYc) 28 | 29 | ![image](images/McCullochPitts.png) 30 | 31 | - McCulloch and Pitts proposed a highly simplified computational model of the neuron 32 | - $g$ aggregates the inputs and the function $f$ takes a decision based on this aggregation 33 | - The inputs can be excitatory or inhibitory (if an inhibitory input is "ON", the output will always be zero, no matter what the other inputs are.) 34 | - $y = 0$ if any $x_i$ is inhibitory, else 35 | $$ 36 | \large 37 | g(x_1, x_2,..., x_n) = g(x) = \sum_{i=1}^n x_i \\ 38 | y = f(g(x)) = 1 \quad \text{if} \quad g(x) \ge \theta \\ y = f(g(x)) = 0 \quad \text{if} \quad g(x) \lt \theta 39 | $$ 40 | - $\theta$ is called thresholding parameter 41 | 42 | ![image](images/DifferentVariationsofMcCullochPitts.png) 43 | 44 | Here in NOT function $x_1$ is an inhibitory input. 45 | 46 | ## [Lecture 1.8: Perceptrons](https://www.youtube.com/watch?v=Ydd9TMyoG6k) 47 | 48 | ![image](images/Perceptron.png) 49 | 50 | - Frank Rosenblatt, an American psychologist, proposed the classic perceptron model in 1958. 51 | - A more general computational model than McCulloch-Pitts neurons 52 | - **Main differences**: Introduction of numerical weights for inputs and a mechanism for learning these weights 53 | - Inputs are no longer limited to boolean values 54 | - Refined and carefully analyzed by Minsky and Papert (1969) - their model is referred to as the **perceptron** model here.
55 | 56 | $$ 57 | \begin{align} 58 | y &= 1 \quad if \quad \sum_{i=1}^n w_i*x_i \ge \theta \\ 59 | &= 0 \quad if \quad \sum_{i=1}^n w_i*x_i \lt \theta 60 | \end{align} 61 | $$ 62 | simplifying it and taking $w_0$ as $-\theta$ (with $x_0 = 1$) we get 63 | $$ 64 | \begin{align} 65 | y &= 1 \quad if \quad \sum_{i=0}^n w_i*x_i \ge 0 \\ 66 | &= 0 \quad if \quad \sum_{i=0}^n w_i*x_i \lt 0 67 | \end{align} 68 | $$ 69 | 70 | - From the equations it should be clear that even a perceptron separates the input space into two halves 71 | - All the inputs which produce a 1 lie on one side and all inputs which produce a 0 lie on the other side 72 | - In other words, a single perceptron can only be used to implement linearly separable functions 73 | - Then what is the difference between McCulloch-Pitts model and Perceptron? 74 | - The weights (including the threshold) can be learned and the inputs can be real valued. 75 | -------------------------------------------------------------------------------- /MLT/Week_9.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Lecture 9.4: Implementing DT from scratch" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Decision Trees\n", 15 | "\n", 16 | "Decision Trees are a popular **supervised machine learning algorithm** that can be used for both **classification** and **regression** tasks\n", 17 | "\n", 18 | "The tree itself is a model in decision trees and we would like to estimate an **optimal tree structure** from the given training data.\n", 19 | "\n", 20 | "- The internal nodes contain conditions on features. Depending on the outcome of the comparison, we take an appropriate path in the tree.
The process is repeated until we reach a leaf node.\n", 21 | "- In the case of classification, leaf nodes contain a label and in case of regression, the prediction is obtained by taking sample mean of labels of the subset of training examples present in that leaf node.\n", 22 | "\n", 23 | "In this colab, we will implement decision tree for classification with ID3 algorithm" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "#### Importing Libraries" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 1, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "import numpy as np\n", 40 | "import pandas as pd" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 2, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "data": { 50 | "text/plain": [ 51 | "2.220446049250313e-16" 52 | ] 53 | }, 54 | "execution_count": 2, 55 | "metadata": {}, 56 | "output_type": "execute_result" 57 | } 58 | ], 59 | "source": [ 60 | "eps = np.finfo(float).eps\n", 61 | "eps" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "Here `eps` is the machine epsilon (the gap between 1.0 and the next representable float). At times we get `log(0)` or `0` in the denominator; to avoid that, we add `eps`." 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "#### Classification Demo\n", 76 | "In this case we'll use a synthetic dataset for classification." 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 3, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "data": { 86 | "text/html": [ 87 | "
\n", 88 | "\n", 101 | "\n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | "
OutlookTemperatureHumidityWindPlay
0SunnyHotHighWeakNo
1SunnyHotHighStrongNo
2OvercastHotHighWeakYes
3RainMildHighWeakYes
4RainCoolNormalWeakYes
\n", 155 | "
" 156 | ], 157 | "text/plain": [ 158 | " Outlook Temperature Humidity Wind Play\n", 159 | "0 Sunny Hot High Weak No\n", 160 | "1 Sunny Hot High Strong No\n", 161 | "2 Overcast Hot High Weak Yes\n", 162 | "3 Rain Mild High Weak Yes\n", 163 | "4 Rain Cool Normal Weak Yes" 164 | ] 165 | }, 166 | "execution_count": 3, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "df = pd.read_csv('data/weather_play.csv') # This is the data shown in the slides\n", 173 | "df.head()" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 4, 179 | "metadata": {}, 180 | "outputs": [ 181 | { 182 | "data": { 183 | "text/plain": [ 184 | "(14, 5)" 185 | ] 186 | }, 187 | "execution_count": 4, 188 | "metadata": {}, 189 | "output_type": "execute_result" 190 | } 191 | ], 192 | "source": [ 193 | "df.shape" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 5, 199 | "metadata": {}, 200 | "outputs": [ 201 | { 202 | "data": { 203 | "text/plain": [ 204 | "array([['Sunny', 'Hot', 'High', 'Weak', 'No'],\n", 205 | " ['Sunny', 'Hot', 'High', 'Strong', 'No'],\n", 206 | " ['Overcast', 'Hot', 'High', 'Weak', 'Yes'],\n", 207 | " ['Rain', 'Mild', 'High', 'Weak', 'Yes'],\n", 208 | " ['Rain', 'Cool', 'Normal', 'Weak', 'Yes'],\n", 209 | " ['Rain', 'Cool', 'Normal', 'Strong', 'No'],\n", 210 | " ['Overcast', 'Cool', 'Normal', 'Strong', 'Yes'],\n", 211 | " ['Sunny', 'Mild', 'High', 'Weak', 'No'],\n", 212 | " ['Sunny', 'Cool', 'Normal', 'Weak', 'Yes'],\n", 213 | " ['Rain', 'Mild', 'Normal', 'Weak', 'Yes'],\n", 214 | " ['Sunny', 'Mild', 'Normal', 'Strong', 'Yes'],\n", 215 | " ['Overcast', 'Mild', 'High', 'Strong', 'Yes'],\n", 216 | " ['Overcast', 'Hot', 'Normal', 'Weak', 'Yes'],\n", 217 | " ['Rain', 'Mild', 'High', 'Strong', 'No']], dtype=object)" 218 | ] 219 | }, 220 | "execution_count": 5, 221 | "metadata": {}, 222 | "output_type": "execute_result" 223 | } 224 | ], 225 | "source": [ 226 | "df.values" 227 | ] 228 | 
}, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 6, 232 | "metadata": {}, 233 | "outputs": [ 234 | { 235 | "data": { 236 | "text/plain": [ 237 | "Index(['Outlook', 'Temperature', 'Humidity', 'Wind', 'Play'], dtype='object')" 238 | ] 239 | }, 240 | "execution_count": 6, 241 | "metadata": {}, 242 | "output_type": "execute_result" 243 | } 244 | ], 245 | "source": [ 246 | "df.keys()" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 7, 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "data": { 256 | "text/plain": [ 257 | "'Play'" 258 | ] 259 | }, 260 | "execution_count": 7, 261 | "metadata": {}, 262 | "output_type": "execute_result" 263 | } 264 | ], 265 | "source": [ 266 | "target = df.keys()[-1] # Name of the target column\n", 267 | "target" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 8, 273 | "metadata": {}, 274 | "outputs": [ 275 | { 276 | "data": { 277 | "text/plain": [ 278 | "Index(['Outlook', 'Temperature', 'Humidity', 'Wind'], dtype='object')" 279 | ] 280 | }, 281 | "execution_count": 8, 282 | "metadata": {}, 283 | "output_type": "execute_result" 284 | } 285 | ], 286 | "source": [ 287 | "df.keys()[:-1]" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "Let's check the total number of labels" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 9, 300 | "metadata": {}, 301 | "outputs": [ 302 | { 303 | "data": { 304 | "text/plain": [ 305 | "array(['No', 'Yes'], dtype=object)" 306 | ] 307 | }, 308 | "execution_count": 9, 309 | "metadata": {}, 310 | "output_type": "execute_result" 311 | } 312 | ], 313 | "source": [ 314 | "df[target].unique()" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "There are two labels `Yes` and `No`" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 10, 327 | "metadata": {}, 328 | "outputs": [ 329 | { 330 
| "name": "stdout", 331 | "output_type": "stream", 332 | "text": [ 333 | "No\n", 334 | "Yes\n" 335 | ] 336 | } 337 | ], 338 | "source": [ 339 | "print(df[target].unique()[0])\n", 340 | "print(df[target].unique()[1])" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 12, 346 | "metadata": {}, 347 | "outputs": [ 348 | { 349 | "name": "stdout", 350 | "output_type": "stream", 351 | "text": [ 352 | "5\n", 353 | "9\n" 354 | ] 355 | } 356 | ], 357 | "source": [ 358 | "print(df[target].value_counts()[df[target].unique()[0]])\n", 359 | "print(df[target].value_counts()[df[target].unique()[1]])" 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": {}, 365 | "source": [ 366 | "9 out of 14 examples have `Yes` and 5 out of 14 examples have label `No`" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": {}, 372 | "source": [ 373 | "#### Calculating entropy of the whole dataset" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": 13, 379 | "metadata": {}, 380 | "outputs": [ 381 | { 382 | "data": { 383 | "text/plain": [ 384 | "0.9402859586706311" 385 | ] 386 | }, 387 | "execution_count": 13, 388 | "metadata": {}, 389 | "output_type": "execute_result" 390 | } 391 | ], 392 | "source": [ 393 | "def find_entropy_whole(df):\n", 394 | " # Last column in dataframe is the target variable.\n", 395 | " target = df.keys()[-1]\n", 396 | "\n", 397 | " # Initialization\n", 398 | " overall_entropy = 0\n", 399 | "\n", 400 | " # possible values of the target\n", 401 | " values_in_target = df[target].unique()\n", 402 | "\n", 403 | " for value in values_in_target:\n", 404 | " p = df[target].value_counts()[value]/len(df[target])\n", 405 | " overall_entropy += -p*np.log2(p)\n", 406 | " \n", 407 | " return overall_entropy\n", 408 | "\n", 409 | "find_entropy_whole(df)" 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "metadata": {}, 415 | "source": [ 416 | "#### Calculating entropy of an attribute" 
417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": 14, 422 | "metadata": {}, 423 | "outputs": [], 424 | "source": [ 425 | "def find_entropy_of_attribute(df, attribute):\n", 426 | "\n", 427 | " # last column in dataframe is the target variable\n", 428 | " target = df.keys()[-1]\n", 429 | " \n", 430 | " values_in_target = df[target].unique()\n", 431 | "\n", 432 | " # This gives different features in that attribute (\n", 433 | " # like 'hot', 'cold' in temperature)\n", 434 | " values_in_attribute = df[attribute].unique()\n", 435 | "\n", 436 | " # Initialize attribute's entropy\n", 437 | " entropy_attribute = 0\n", 438 | "\n", 439 | "\n", 440 | " for value_in_attribute in values_in_attribute:\n", 441 | " overall_entropy = 0\n", 442 | " for value_in_target in values_in_target:\n", 443 | " num = len(df[attribute][df[attribute] == value_in_attribute][df[target] == value_in_target])\n", 444 | " den = len(df[attribute][df[attribute] == value_in_attribute])\n", 445 | " p = num/(den + eps)\n", 446 | " overall_entropy += -p*np.log2(p+eps)\n", 447 | " p2 = den/len(df)\n", 448 | " entropy_attribute += -p2*overall_entropy\n", 449 | " return abs(entropy_attribute)" 450 | ] 451 | }, 452 | { 453 | "cell_type": "markdown", 454 | "metadata": {}, 455 | "source": [ 456 | "Let's compute entropy for different attributes" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": 15, 462 | "metadata": {}, 463 | "outputs": [ 464 | { 465 | "name": "stdout", 466 | "output_type": "stream", 467 | "text": [ 468 | "Entropy of attribute 'Outlook' is : 0.6935361388961914\n", 469 | "Entropy of attribute 'Temperature' is : 0.9110633930116756\n", 470 | "Entropy of attribute 'Humidity' is : 0.7884504573082889\n", 471 | "Entropy of attribute 'Wind' is : 0.892158928262361\n" 472 | ] 473 | } 474 | ], 475 | "source": [ 476 | "for i_attribute in df.keys()[:-1]:\n", 477 | " print(f'Entropy of attribute \\'{i_attribute}\\' is :',\n", 478 | " 
find_entropy_of_attribute(df, i_attribute))" 479 | ] 480 | }, 481 | { 482 | "cell_type": "markdown", 483 | "metadata": {}, 484 | "source": [ 485 | "#### Finding the best attribute" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": 16, 491 | "metadata": {}, 492 | "outputs": [ 493 | { 494 | "data": { 495 | "text/plain": [ 496 | "'Outlook'" 497 | ] 498 | }, 499 | "execution_count": 16, 500 | "metadata": {}, 501 | "output_type": "execute_result" 502 | } 503 | ], 504 | "source": [ 505 | "def find_best_attribute_to_divide(df):\n", 506 | " # Information gain\n", 507 | " IG = []\n", 508 | "\n", 509 | " # All column names\n", 510 | " all_attribute_names = df.keys()[:-1]\n", 511 | "\n", 512 | " for attribute in all_attribute_names:\n", 513 | " # Compute information gain for every attribute\n", 514 | " IG.append(find_entropy_whole(df) - find_entropy_of_attribute(df, attribute))\n", 515 | "\n", 516 | " # Get the index of attribute with best information gain\n", 517 | " index_of_attribute_with_max_IG = np.argmax(IG)\n", 518 | " best_attribute = all_attribute_names[index_of_attribute_with_max_IG]\n", 519 | "\n", 520 | " return best_attribute\n", 521 | "\n", 522 | "find_best_attribute_to_divide(df) " 523 | ] 524 | }, 525 | { 526 | "cell_type": "markdown", 527 | "metadata": {}, 528 | "source": [ 529 | "#### Building Decision Tree" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 19, 535 | "metadata": {}, 536 | "outputs": [], 537 | "source": [ 538 | "def buildTree(df, tree=None):\n", 539 | "\n", 540 | " # last column in dataframe\n", 541 | " target = df.keys()[-1]\n", 542 | "\n", 543 | " # Here we build our decision tree\n", 544 | "\n", 545 | " # Get attribute with maximum information gain\n", 546 | " node = find_best_attribute_to_divide(df)\n", 547 | "\n", 548 | " # Get distinct value of that attribute\n", 549 | " attValue = np.unique(df[node])\n", 550 | "\n", 551 | " # Create an array dictionary to create tree\n", 552 | " if 
tree is None:\n", 553 | " tree = {}\n", 554 | " tree[node] = {}\n", 555 | " \n", 556 | " # We make a loop to contruct a tree by calling this function recursively\n", 557 | " # In this we check if the subset is pure and stops if it is pure\n", 558 | " for value in attValue:\n", 559 | "\n", 560 | " subtable = df[df[node] == value].reset_index(drop=True)\n", 561 | " clValue, counts = np.unique(subtable['Play'], return_counts=True)\n", 562 | "\n", 563 | " if len(counts) == 1: # Checking purity of the subset\n", 564 | " tree[node][value] = clValue[0]\n", 565 | " else:\n", 566 | " tree[node][value] = buildTree(subtable) # Calling the function recursively\n", 567 | " \n", 568 | " return tree" 569 | ] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "execution_count": 20, 574 | "metadata": {}, 575 | "outputs": [ 576 | { 577 | "data": { 578 | "text/plain": [ 579 | "{'Outlook': {'Overcast': 'Yes',\n", 580 | " 'Rain': {'Wind': {'Strong': 'No', 'Weak': 'Yes'}},\n", 581 | " 'Sunny': {'Humidity': {'High': 'No', 'Normal': 'Yes'}}}}" 582 | ] 583 | }, 584 | "execution_count": 20, 585 | "metadata": {}, 586 | "output_type": "execute_result" 587 | } 588 | ], 589 | "source": [ 590 | "buildTree(df)" 591 | ] 592 | }, 593 | { 594 | "cell_type": "markdown", 595 | "metadata": {}, 596 | "source": [ 597 | "ID3 in its pure form performs no backtracking in its search. Once it selects an attribute to test at a particular level in the tree, it never backtracks to reconsider this choice. 
Therefore, it is susceptible to the usual risks of hill-climbing search without backtracking: converging to locally optimal solutions that are not globally optimal" 598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "execution_count": null, 603 | "metadata": {}, 604 | "outputs": [], 605 | "source": [] 606 | } 607 | ], 608 | "metadata": { 609 | "kernelspec": { 610 | "display_name": "Python 3.9.12 ('base')", 611 | "language": "python", 612 | "name": "python3" 613 | }, 614 | "language_info": { 615 | "codemirror_mode": { 616 | "name": "ipython", 617 | "version": 3 618 | }, 619 | "file_extension": ".py", 620 | "mimetype": "text/x-python", 621 | "name": "python", 622 | "nbconvert_exporter": "python", 623 | "pygments_lexer": "ipython3", 624 | "version": "3.9.12" 625 | }, 626 | "orig_nbformat": 4, 627 | "vscode": { 628 | "interpreter": { 629 | "hash": "9244b6adea22edad6e19cdea93c196ea7ddff3c1d91dfb077ea542e13d85dd05" 630 | } 631 | } 632 | }, 633 | "nbformat": 4, 634 | "nbformat_minor": 2 635 | } 636 | -------------------------------------------------------------------------------- /MLP/Week_2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# **Data Preprocessing Techniques**\n", 8 | "Data preprocessing involves several transformations that are applied to the raw data and make it more amenable for learning. 
It is carried out before using it for model training or prediction.\n", 9 | "\n", 10 | "There are many pre-processing techniques for\n", 11 | "- Data cleaning\n", 12 | " - Data imputation\n", 13 | " - Feature scaling\n", 14 | "- Feature transformation\n", 15 | " - Polynomial features\n", 16 | " - Discretization\n", 17 | " - Handling categorical features\n", 18 | " - Custom Transformers\n", 19 | " - Composite Transformers\n", 20 | " - Apply transformation of diverse features\n", 21 | " - TransformedTargetRegressor\n", 22 | "- Feature Selection\n", 23 | " - Filter based feature selection\n", 24 | " - Wrapper based feature selection\n", 25 | "- Feature Extraction\n", 26 | " - PCA\n", 27 | "\n", 28 | "The transformations are applied in a specific order and the order can be specified via ``Pipeline``. We need to apply different transformations based on the feature type. ``FeatureUnion`` helps us perform that task and combine outputs from multiple transformations into a single transformed feature matrix. We will also study how to visualize this pipeline." 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## Importing basic libraries" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 1, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "import numpy as np\n", 45 | "import matplotlib.pyplot as plt\n", 46 | "import pandas as pd\n", 47 | "import seaborn as sns\n", 48 | "\n", 49 | "sns.set_theme(style=\"whitegrid\")" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## **1. Feature Extraction**" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "### DictVectorizer\n", 64 | "\n", 65 | "Many times the data is present as a **list of dictionary objects**. 
ML algorithms expect the data to be in **matrix form** of shape $(n,m)$ where $n$ is the number of samples and $m$ is the number of features.\n", 66 | "\n", 67 | "``DictVectorizer`` **converts** a *list of dictionary objects to feature matrix*.\n", 68 | "\n", 69 | "Let's create some sample data for demo purposes containing ``age`` and ``height`` of children\n", 70 | "> Each record/sample is a dictionary with two keys ``age`` and ``height``, and their corresponding values." 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 2, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "data = [{'age' : 4, 'height' : 96.0},\n", 80 | " {'age' : 1, 'height' : 73.9},\n", 81 | " {'age' : 3, 'height' : 88.9},\n", 82 | " {'age' : 2, 'height' : 81.6}]" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "> There are 4 data samples with 2 features each\n", 90 | "\n", 91 | "Let's make use of ``DictVectorizer`` to convert the list of dictionary objects to the feature matrix" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 3, 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "data": { 101 | "text/plain": [ 102 | "array([[ 4. , 96. ],\n", 103 | " [ 1. , 73.9],\n", 104 | " [ 3. , 88.9],\n", 105 | " [ 2. 
, 81.6]])" 106 | ] 107 | }, 108 | "execution_count": 3, 109 | "metadata": {}, 110 | "output_type": "execute_result" 111 | } 112 | ], 113 | "source": [ 114 | "from sklearn.feature_extraction import DictVectorizer\n", 115 | "dv = DictVectorizer(sparse = False)\n", 116 | "data_transformed = dv.fit_transform(data)\n", 117 | "data_transformed" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 4, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "data": { 127 | "text/plain": [ 128 | "(4, 2)" 129 | ] 130 | }, 131 | "execution_count": 4, 132 | "metadata": {}, 133 | "output_type": "execute_result" 134 | } 135 | ], 136 | "source": [ 137 | "data_transformed.shape" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "The transformed data is in the feature matrix form: 4 examples with 2 features each, i.e. shape $(4,2)$" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "## **2. Data Imputation**\n", 152 | "- Many machine learning algorithms need a full feature matrix and they may not work in the presence of missing data\n", 153 | "- Data imputation identifies **missing values** in each feature of the dataset and **replaces** them with an **appropriate value** based on a **fixed strategy** such as:\n", 154 | " - **mean** or **median** or **mode** of that feature.\n", 155 | " - **use specified constant** value\n", 156 | "\n", 157 | "Sklearn library provides ``sklearn.impute.SimpleImputer`` class for this purpose" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 5, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "from sklearn.impute import SimpleImputer" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "Some of its important parameters:\n", 174 | "- *missing_values*: Could be ``int, float, str, np.nan`` or ``None``. 
By default it is ``np.nan``.\n", 175 | "- *strategy*: default is 'mean'. One of the following strategies can be used. \n", 176 | " - ``mean``- missing values are replaced using the **mean** along each column\n", 177 | " - ``median`` - missing values are replaced using the **median** along each column\n", 178 | " - ``most_frequent`` - missing values are replaced using the **most frequent** along each column\n", 179 | " - ``constant`` - missing values are replaced with values specified in ``fill_value`` argument.\n", 180 | "- ``add_indicator`` is a boolean parameter that when set to ``True`` appends **missing value indicator** columns to the transformed output; the fitted indicator is available in the ``indicator_`` member variable.\n", 181 | "\n", 182 | "**Note**:\n", 183 | "- ``mean`` and ``median`` strategies can only be used with numeric data.\n", 184 | "- ``most_frequent`` and ``constant`` strategies can be used with strings or numeric data." 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "### Data imputation on real world dataset\n", 192 | "Let's perform data imputation on real world dataset. We will be using [heart-disease from uci machine learning repository](https://archive.ics.uci.edu/ml/datasets/Heart+Disease) for this purpose. We will load this dataset from csv file." 
193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 6, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "cols = ['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal','num']\n", 202 | "heart_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data',header=None,names=cols)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "**STEP 1.**: Check if dataset contains missing values.\n", 210 | "- This can be checked via dataset description or by checking the number of ``nan`` (null) values in the dataframe. 
However, such a check can be performed only for numerical features.\n", 211 | "- For non-numerical features, we can list their unique values and check if there are values like ``?``.\n" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 7, 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "name": "stdout", 221 | "output_type": "stream", 222 | "text": [ 223 | "\n", 224 | "RangeIndex: 303 entries, 0 to 302\n", 225 | "Data columns (total 14 columns):\n", 226 | " # Column Non-Null Count Dtype \n", 227 | "--- ------ -------------- ----- \n", 228 | " 0 age 303 non-null float64\n", 229 | " 1 sex 303 non-null float64\n", 230 | " 2 cp 303 non-null float64\n", 231 | " 3 trestbps 303 non-null float64\n", 232 | " 4 chol 303 non-null float64\n", 233 | " 5 fbs 303 non-null float64\n", 234 | " 6 restecg 303 non-null float64\n", 235 | " 7 thalach 303 non-null float64\n", 236 | " 8 exang 303 non-null float64\n", 237 | " 9 oldpeak 303 non-null float64\n", 238 | " 10 slope 303 non-null float64\n", 239 | " 11 ca 303 non-null object \n", 240 | " 12 thal 303 non-null object \n", 241 | " 13 num 303 non-null int64 \n", 242 | "dtypes: float64(11), int64(1), object(2)\n", 243 | "memory usage: 33.3+ KB\n" 244 | ] 245 | } 246 | ], 247 | "source": [ 248 | "heart_data.info()" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "Let's check if there are missing values in numerical columns - here we have checked it for all columns in the dataframe." 
256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 8, 261 | "metadata": {}, 262 | "outputs": [ 263 | { 264 | "data": { 265 | "text/plain": [ 266 | "age 0\n", 267 | "sex 0\n", 268 | "cp 0\n", 269 | "trestbps 0\n", 270 | "chol 0\n", 271 | "fbs 0\n", 272 | "restecg 0\n", 273 | "thalach 0\n", 274 | "exang 0\n", 275 | "oldpeak 0\n", 276 | "slope 0\n", 277 | "ca 0\n", 278 | "thal 0\n", 279 | "num 0\n", 280 | "dtype: int64" 281 | ] 282 | }, 283 | "execution_count": 8, 284 | "metadata": {}, 285 | "output_type": "execute_result" 286 | } 287 | ], 288 | "source": [ 289 | "(heart_data.isnull().sum())" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "There are two non-numerical features: ``ca`` and ``thal``.\n", 297 | "- List their unique values." 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 9, 303 | "metadata": {}, 304 | "outputs": [ 305 | { 306 | "name": "stdout", 307 | "output_type": "stream", 308 | "text": [ 309 | "Unique values in ca: ['0.0' '3.0' '2.0' '1.0' '?']\n", 310 | "Unique values in thal: ['6.0' '3.0' '7.0' '?']\n" 311 | ] 312 | } 313 | ], 314 | "source": [ 315 | "print('Unique values in ca:', heart_data.ca.unique())\n", 316 | "print('Unique values in thal:', heart_data.thal.unique())" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": {}, 322 | "source": [ 323 | "Both of them contain ``?`` which denotes a missing value. Let's count the number of missing values." 
324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 10, 329 | "metadata": {}, 330 | "outputs": [ 331 | { 332 | "name": "stdout", 333 | "output_type": "stream", 334 | "text": [ 335 | "# missing values in ca: 4\n", 336 | "# missing values in thal: 2\n" 337 | ] 338 | } 339 | ], 340 | "source": [ 341 | "print('# missing values in ca:', heart_data.loc[heart_data.ca == '?','ca'].count())\n", 342 | "print('# missing values in thal:', heart_data.loc[heart_data.thal ==\"?\",'thal'].count())" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": {}, 348 | "source": [ 349 | "**STEP 2**: Replace '?' with ``nan``." 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 11, 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "heart_data.replace('?',np.nan, inplace=True)" 359 | ] 360 | }, 361 | { 362 | "cell_type": "markdown", 363 | "metadata": {}, 364 | "source": [ 365 | "**STEP 3**: Fill the missing values with ``sklearn`` missing value imputation utilities.\n", 366 | "> Here we use ``SimpleImputer`` with ``mean`` strategy.\n", 367 | "\n", 368 | "We will try two variations- \n", 369 | "- ``add_indicator = False``: Default choice that only imputes missing values." 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 12, 375 | "metadata": {}, 376 | "outputs": [ 377 | { 378 | "name": "stdout", 379 | "output_type": "stream", 380 | "text": [ 381 | "(303, 14)\n" 382 | ] 383 | } 384 | ], 385 | "source": [ 386 | "imputer = SimpleImputer(missing_values=np.nan, strategy='mean')\n", 387 | "imputer = imputer.fit(heart_data)\n", 388 | "heart_data_imputed = imputer.transform(heart_data)\n", 389 | "print(heart_data_imputed.shape)" 390 | ] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "metadata": {}, 395 | "source": [ 396 | "- ``add_indicator = True``: Adds additional column for each column containing missing values. 
In this case it adds two columns, one for ``ca`` and the other for ``thal``." 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 14, 402 | "metadata": {}, 403 | "outputs": [ 404 | { 405 | "name": "stdout", 406 | "output_type": "stream", 407 | "text": [ 408 | "(303, 16)\n" 409 | ] 410 | } 411 | ], 412 | "source": [ 413 | "imputer = SimpleImputer(missing_values= np.nan, strategy='mean', add_indicator=True)\n", 414 | "imputer = imputer.fit(heart_data)\n", 415 | "heart_data_imputed_with_indicator = imputer.transform(heart_data)\n", 416 | "print(heart_data_imputed_with_indicator.shape)" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "## **3. Feature Scaling**\n", 424 | "\n", 425 | "Feature scaling **transforms feature values** such that **all the features are on the same scale**.\n", 426 | "Using a feature matrix with all the features on the same scale matters for the following reasons:\n", 427 | "- **Enables faster convergence** in iterative optimization algorithms like gradient descent and its variants.\n", 428 | "- The performance of ML algorithms such as SVM, K-NN and K-means etc, that compute euclidean distance among input samples gets impacted if the features are not scaled.\n", 429 | "\n", 430 | "Tree based ML algorithms are not affected by feature-scaling. In other words, feature scaling is not required for tree based ML algorithms\n", 431 | "\n", 432 | "Feature scaling can be performed with the following methods:\n", 433 | "- Standardization\n", 434 | "- Normalization\n", 435 | "- MaxAbsScaler.\n", 436 | "\n", 437 | "Let's demonstrate feature scaling on real world dataset. For this purpose, we will be using [abalone dataset](https://archive.ics.uci.edu/ml/datasets/abalone). We will use different scaling utilities in ``sklearn`` library." 
438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 17, 443 | "metadata": {}, 444 | "outputs": [], 445 | "source": [ 446 | "cols = ['Sex','Length','Diameter','Height','Whole weight','Shucked weight','Viscera weight','Shell weight','Rings']\n", 447 | "abalone_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data',header=None,names=cols)" 448 | ] 449 | }, 450 | { 451 | "cell_type": "markdown", 452 | "metadata": {}, 453 | "source": [ 454 | "**STEP 1**: Examine the dataset" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": 18, 460 | "metadata": {}, 461 | "outputs": [ 462 | { 463 | "name": "stdout", 464 | "output_type": "stream", 465 | "text": [ 466 | "\n", 467 | "RangeIndex: 4177 entries, 0 to 4176\n", 468 | "Data columns (total 9 columns):\n", 469 | " # Column Non-Null Count Dtype \n", 470 | "--- ------ -------------- ----- \n", 471 | " 0 Sex 4177 non-null object \n", 472 | " 1 Length 4177 non-null float64\n", 473 | " 2 Diameter 4177 non-null float64\n", 474 | " 3 Height 4177 non-null float64\n", 475 | " 4 Whole weight 4177 non-null float64\n", 476 | " 5 Shucked weight 4177 non-null float64\n", 477 | " 6 Viscera weight 4177 non-null float64\n", 478 | " 7 Shell weight 4177 non-null float64\n", 479 | " 8 Rings 4177 non-null int64 \n", 480 | "dtypes: float64(7), int64(1), object(1)\n", 481 | "memory usage: 293.8+ KB\n" 482 | ] 483 | } 484 | ], 485 | "source": [ 486 | "abalone_data.info()" 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": {}, 492 | "source": [ 493 | "**STEP 1a**: [Optional]: convert non-numerical attributes into numerical ones\n", 494 | "> In this dataset only ``Sex`` is the non-numeric column" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": 19, 500 | "metadata": {}, 501 | "outputs": [ 502 | { 503 | "data": { 504 | "text/plain": [ 505 | "array(['M', 'F', 'I'], dtype=object)" 506 | ] 507 | }, 508 | 
"execution_count": 19, 509 | "metadata": {}, 510 | "output_type": "execute_result" 511 | } 512 | ], 513 | "source": [ 514 | "abalone_data.Sex.unique()" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": 20, 520 | "metadata": {}, 521 | "outputs": [ 522 | { 523 | "name": "stdout", 524 | "output_type": "stream", 525 | "text": [ 526 | "\n", 527 | "RangeIndex: 4177 entries, 0 to 4176\n", 528 | "Data columns (total 9 columns):\n", 529 | " # Column Non-Null Count Dtype \n", 530 | "--- ------ -------------- ----- \n", 531 | " 0 Sex 4177 non-null int64 \n", 532 | " 1 Length 4177 non-null float64\n", 533 | " 2 Diameter 4177 non-null float64\n", 534 | " 3 Height 4177 non-null float64\n", 535 | " 4 Whole weight 4177 non-null float64\n", 536 | " 5 Shucked weight 4177 non-null float64\n", 537 | " 6 Viscera weight 4177 non-null float64\n", 538 | " 7 Shell weight 4177 non-null float64\n", 539 | " 8 Rings 4177 non-null int64 \n", 540 | "dtypes: float64(7), int64(2)\n", 541 | "memory usage: 293.8 KB\n" 542 | ] 543 | } 544 | ], 545 | "source": [ 546 | "#Assign numeric values to sex.\n", 547 | "abalone_data = abalone_data.replace({'Sex': {'M':1,'F':2,'I':3}})\n", 548 | "abalone_data.info()" 549 | ] 550 | }, 551 | { 552 | "cell_type": "markdown", 553 | "metadata": {}, 554 | "source": [ 555 | "**STEP 2**: Separate labels from features." 
556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": 21, 561 | "metadata": {}, 562 | "outputs": [ 563 | { 564 | "name": "stdout", 565 | "output_type": "stream", 566 | "text": [ 567 | "The dataframe object after deleting the column\n", 568 | "\n", 569 | "RangeIndex: 4177 entries, 0 to 4176\n", 570 | "Data columns (total 8 columns):\n", 571 | " # Column Non-Null Count Dtype \n", 572 | "--- ------ -------------- ----- \n", 573 | " 0 Sex 4177 non-null int64 \n", 574 | " 1 Length 4177 non-null float64\n", 575 | " 2 Diameter 4177 non-null float64\n", 576 | " 3 Height 4177 non-null float64\n", 577 | " 4 Whole weight 4177 non-null float64\n", 578 | " 5 Shucked weight 4177 non-null float64\n", 579 | " 6 Viscera weight 4177 non-null float64\n", 580 | " 7 Shell weight 4177 non-null float64\n", 581 | "dtypes: float64(7), int64(1)\n", 582 | "memory usage: 261.2 KB\n" 583 | ] 584 | } 585 | ], 586 | "source": [ 587 | "y = abalone_data.pop('Rings')\n", 588 | "print('The dataframe object after deleting the column')\n", 589 | "abalone_data.info()" 590 | ] 591 | }, 592 | { 593 | "cell_type": "markdown", 594 | "metadata": {}, 595 | "source": [ 596 | "**STEP 3**: Examining the feature scales\n", 597 | "\n", 598 | "#### Statistical method\n", 599 | "Check the scales of different features with ``describe()`` method of dataframe." 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": 23, 605 | "metadata": {}, 606 | "outputs": [ 607 | { 608 | "data": { 609 | "text/html": [ 610 | "
\n", 611 | "\n", 624 | "\n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | "
countmeanstdmin25%50%75%max
Sex4177.01.9554700.8278151.00001.00002.00003.0003.0000
Length4177.00.5239920.1200930.07500.45000.54500.6150.8150
Diameter4177.00.4078810.0992400.05500.35000.42500.4800.6500
Height4177.00.1395160.0418270.00000.11500.14000.1651.1300
Whole weight4177.00.8287420.4903890.00200.44150.79951.1532.8255
Shucked weight4177.00.3593670.2219630.00100.18600.33600.5021.4880
Viscera weight4177.00.1805940.1096140.00050.09350.17100.2530.7600
Shell weight4177.00.2388310.1392030.00150.13000.23400.3291.0050
\n", 729 | "
" 730 | ], 731 | "text/plain": [ 732 | " count mean std min 25% 50% 75% \\\n", 733 | "Sex 4177.0 1.955470 0.827815 1.0000 1.0000 2.0000 3.000 \n", 734 | "Length 4177.0 0.523992 0.120093 0.0750 0.4500 0.5450 0.615 \n", 735 | "Diameter 4177.0 0.407881 0.099240 0.0550 0.3500 0.4250 0.480 \n", 736 | "Height 4177.0 0.139516 0.041827 0.0000 0.1150 0.1400 0.165 \n", 737 | "Whole weight 4177.0 0.828742 0.490389 0.0020 0.4415 0.7995 1.153 \n", 738 | "Shucked weight 4177.0 0.359367 0.221963 0.0010 0.1860 0.3360 0.502 \n", 739 | "Viscera weight 4177.0 0.180594 0.109614 0.0005 0.0935 0.1710 0.253 \n", 740 | "Shell weight 4177.0 0.238831 0.139203 0.0015 0.1300 0.2340 0.329 \n", 741 | "\n", 742 | " max \n", 743 | "Sex 3.0000 \n", 744 | "Length 0.8150 \n", 745 | "Diameter 0.6500 \n", 746 | "Height 1.1300 \n", 747 | "Whole weight 2.8255 \n", 748 | "Shucked weight 1.4880 \n", 749 | "Viscera weight 0.7600 \n", 750 | "Shell weight 1.0050 " 751 | ] 752 | }, 753 | "execution_count": 23, 754 | "metadata": {}, 755 | "output_type": "execute_result" 756 | } 757 | ], 758 | "source": [ 759 | "abalone_data.describe().T" 760 | ] 761 | }, 762 | { 763 | "cell_type": "code", 764 | "execution_count": null, 765 | "metadata": {}, 766 | "outputs": [], 767 | "source": [] 768 | } 769 | ], 770 | "metadata": { 771 | "kernelspec": { 772 | "display_name": "ML", 773 | "language": "python", 774 | "name": "python3" 775 | }, 776 | "language_info": { 777 | "codemirror_mode": { 778 | "name": "ipython", 779 | "version": 3 780 | }, 781 | "file_extension": ".py", 782 | "mimetype": "text/x-python", 783 | "name": "python", 784 | "nbconvert_exporter": "python", 785 | "pygments_lexer": "ipython3", 786 | "version": "3.13.5" 787 | }, 788 | "orig_nbformat": 4 789 | }, 790 | "nbformat": 4, 791 | "nbformat_minor": 2 792 | } 793 | -------------------------------------------------------------------------------- /MLP-using-GPU/2-DataPreprocessing.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# **Data Preprocessing Techniques**\n", 8 | "Data preprocessing involves several transformations that are applied to the raw data and make it more amenable for learning. It is carried out before using it for model training or prediction.\n", 9 | "\n", 10 | "There are many pre-processing techniques for\n", 11 | "- Data cleaning\n", 12 | " - Data imputation\n", 13 | " - Feature scaling\n", 14 | "- Feature transformation\n", 15 | " - Polynomial features\n", 16 | " - Discretization\n", 17 | " - Handling categorical features\n", 18 | " - Custom Transformers\n", 19 | " - Composite Transformers\n", 20 | " - Apply transformation of diverse features\n", 21 | " - TransformedTargetRegressor\n", 22 | "- Feature Selection\n", 23 | " - Filter based feature selection\n", 24 | " - Wrapper based feature selection\n", 25 | "- Feature Extraction\n", 26 | " - PCA\n", 27 | "\n", 28 | "The transformations are applied in a specific order and the order can be specified via ``Pipeline``. We need to apply different transformations based on the feature type. ``FeatureUnion`` helps us perform that task and combine outputs from multiple transformations into a single transformed feature matrix. We will also study how to visualize this pipeline." 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## Importing basic libraries" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 1, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "import numpy as np\n", 45 | "import matplotlib.pyplot as plt\n", 46 | "import pandas as pd\n", 47 | "import seaborn as sns\n", 48 | "\n", 49 | "sns.set_theme(style=\"whitegrid\")" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## **1. 
Feature Extraction**" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "### DictVectorizer\n", 64 | "\n", 65 | "Many times the data is present as a **list of dictionary objects**. ML algorithms expect the data to be in **matrix form** of shape $(n,m)$ where $n$ is the number of samples and $m$ is the number of features.\n", 66 | "\n", 67 | "``DictVectorizer`` **converts** a *list of dictionary objects to feature matrix*.\n", 68 | "\n", 69 | "Let's create some sample data for demo purposes containing ``age`` and ``height`` of children\n", 70 | "> Each record/sample is a dictionary with two keys ``age`` and ``height``, and their corresponding values." 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 2, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "data = [{'age' : 4, 'height' : 96.0},\n", 80 | " {'age' : 1, 'height' : 73.9},\n", 81 | " {'age' : 3, 'height' : 88.9},\n", 82 | " {'age' : 2, 'height' : 81.6}]" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "> There are 4 data samples with 2 features each\n", 90 | "\n", 91 | "Let's make use of ``DictVectorizer`` to convert the list of dictionary objects to the feature matrix" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 3, 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "data": { 101 | "text/plain": [ 102 | "array([[ 4. , 96. ],\n", 103 | " [ 1. , 73.9],\n", 104 | " [ 3. , 88.9],\n", 105 | " [ 2. 
, 81.6]])" 106 | ] 107 | }, 108 | "execution_count": 3, 109 | "metadata": {}, 110 | "output_type": "execute_result" 111 | } 112 | ], 113 | "source": [ 114 | "from sklearn.feature_extraction import DictVectorizer\n", 115 | "dv = DictVectorizer(sparse = False)\n", 116 | "data_transformed = dv.fit_transform(data)\n", 117 | "data_transformed" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 4, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "data": { 127 | "text/plain": [ 128 | "(4, 2)" 129 | ] 130 | }, 131 | "execution_count": 4, 132 | "metadata": {}, 133 | "output_type": "execute_result" 134 | } 135 | ], 136 | "source": [ 137 | "data_transformed.shape" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "The transformed data is in the feature matrix form: 4 examples with 2 features each, i.e. shape $(4,2)$" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "## **2. Data Imputation**\n", 152 | "- Many machine learning algorithms need a full feature matrix and they may not work in the presence of missing data\n", 153 | "- Data imputation identifies **missing values** in each feature of the dataset and **replaces** them with an **appropriate value** based on a **fixed strategy** such as:\n", 154 | " - **mean** or **median** or **mode** of that feature.\n", 155 | " - **use specified constant** value\n", 156 | "\n", 157 | "Sklearn library provides ``sklearn.impute.SimpleImputer`` class for this purpose" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 5, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "from sklearn.impute import SimpleImputer" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "Some of its important parameters:\n", 174 | "- *missing_values*: Could be ``int, float, str, np.nan`` or ``None``. 
By default it is ``np.nan``.\n", 175 | "- *strategy*: default is 'mean'. One of the following strategies can be used. \n", 176 | " - ``mean``- missing values are replaced using the **mean** along each column\n", 177 | " - ``median`` - missing values are replaced using the **median** along each column\n", 178 | " - ``most_frequent`` - missing values are replaced using the **most frequent** along each column\n", 179 | " - ``constant`` - missing values are replaced with values specified in ``fill_value`` argument.\n", 180 | "- ``add_indicator`` is a boolean parameter that, when set to ``True``, appends **missing value indicator** columns to the transformed output; the fitted indicator is available in the ``indicator_`` member variable.\n", 181 | "\n", 182 | "**Note**:\n", 183 | "- ``mean`` and ``median`` strategies can only be used with numeric data.\n", 184 | "- ``most_frequent`` and ``constant`` strategies can be used with strings or numeric data." 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "### Data imputation on real world dataset\n", 192 | "Let's perform data imputation on real world dataset. We will be using [heart-disease from uci machine learning repository](https://archive.ics.uci.edu/ml/datasets/Heart+Disease) for this purpose. We will load this dataset from csv file." 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 6, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "cols = ['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal','num']\n", 202 | "heart_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data',header=None,names=cols)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "**STEP 1.**: Check if dataset contains missing values.\n", 210 | "- This can be checked via the dataset description or by checking the number of ``NaN`` or ``None`` values in the dataframe. 
However, such a check can be performed only for numerical features.\n", 211 | "- For non-numerical features, we can list their unique values and check if there are values like ``?``.\n" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 7, 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "name": "stdout", 221 | "output_type": "stream", 222 | "text": [ 223 | "\n", 224 | "RangeIndex: 303 entries, 0 to 302\n", 225 | "Data columns (total 14 columns):\n", 226 | " # Column Non-Null Count Dtype \n", 227 | "--- ------ -------------- ----- \n", 228 | " 0 age 303 non-null float64\n", 229 | " 1 sex 303 non-null float64\n", 230 | " 2 cp 303 non-null float64\n", 231 | " 3 trestbps 303 non-null float64\n", 232 | " 4 chol 303 non-null float64\n", 233 | " 5 fbs 303 non-null float64\n", 234 | " 6 restecg 303 non-null float64\n", 235 | " 7 thalach 303 non-null float64\n", 236 | " 8 exang 303 non-null float64\n", 237 | " 9 oldpeak 303 non-null float64\n", 238 | " 10 slope 303 non-null float64\n", 239 | " 11 ca 303 non-null object \n", 240 | " 12 thal 303 non-null object \n", 241 | " 13 num 303 non-null int64 \n", 242 | "dtypes: float64(11), int64(1), object(2)\n", 243 | "memory usage: 33.3+ KB\n" 244 | ] 245 | } 246 | ], 247 | "source": [ 248 | "heart_data.info()" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "Let's check if there are missing values in numerical columns - here we have checked it for all columns in the dataframe." 
 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 8, 261 | "metadata": {}, 262 | "outputs": [ 263 | { 264 | "data": { 265 | "text/plain": [ 266 | "age 0\n", 267 | "sex 0\n", 268 | "cp 0\n", 269 | "trestbps 0\n", 270 | "chol 0\n", 271 | "fbs 0\n", 272 | "restecg 0\n", 273 | "thalach 0\n", 274 | "exang 0\n", 275 | "oldpeak 0\n", 276 | "slope 0\n", 277 | "ca 0\n", 278 | "thal 0\n", 279 | "num 0\n", 280 | "dtype: int64" 281 | ] 282 | }, 283 | "execution_count": 8, 284 | "metadata": {}, 285 | "output_type": "execute_result" 286 | } 287 | ], 288 | "source": [ 289 | "(heart_data.isnull().sum())" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "There are two non-numerical features: ``ca`` and ``thal``.\n", 297 | "- List their unique values." 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 9, 303 | "metadata": {}, 304 | "outputs": [ 305 | { 306 | "name": "stdout", 307 | "output_type": "stream", 308 | "text": [ 309 | "Unique values in ca: ['0.0' '3.0' '2.0' '1.0' '?']\n", 310 | "Unique values in thal: ['6.0' '3.0' '7.0' '?']\n" 311 | ] 312 | } 313 | ], 314 | "source": [ 315 | "print('Unique values in ca:', heart_data.ca.unique())\n", 316 | "print('Unique values in thal:', heart_data.thal.unique())" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": {}, 322 | "source": [ 323 | "Both of them contain ``?``, which denotes a missing value. Let's count the number of missing values." 
324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 10, 329 | "metadata": {}, 330 | "outputs": [ 331 | { 332 | "name": "stdout", 333 | "output_type": "stream", 334 | "text": [ 335 | "# missing values in ca: 4\n", 336 | "# missing values in thal: 2\n" 337 | ] 338 | } 339 | ], 340 | "source": [ 341 | "print('# missing values in ca:', heart_data.loc[heart_data.ca == '?','ca'].count())\n", 342 | "print('# missing values in thal:', heart_data.loc[heart_data.thal ==\"?\",'thal'].count())" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": {}, 348 | "source": [ 349 | "**STEP 2**: Replace '?' with ``nan``." 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 11, 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "heart_data.replace('?',np.nan, inplace=True)" 359 | ] 360 | }, 361 | { 362 | "cell_type": "markdown", 363 | "metadata": {}, 364 | "source": [ 365 | "**STEP 3**: Fill the missing values with ``sklearn`` missing value imputation utilities.\n", 366 | "> Here we use ``SimpleImputer`` with ``mean`` strategy.\n", 367 | "\n", 368 | "We will try two variations- \n", 369 | "- ``add_indicator = False``: Default choice that only imputes missing values." 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 12, 375 | "metadata": {}, 376 | "outputs": [ 377 | { 378 | "name": "stdout", 379 | "output_type": "stream", 380 | "text": [ 381 | "(303, 14)\n" 382 | ] 383 | } 384 | ], 385 | "source": [ 386 | "imputer = SimpleImputer(missing_values=np.nan, strategy='mean')\n", 387 | "imputer = imputer.fit(heart_data)\n", 388 | "heart_data_imputed = imputer.transform(heart_data)\n", 389 | "print(heart_data_imputed.shape)" 390 | ] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "metadata": {}, 395 | "source": [ 396 | "- ``add_indicator = True``: Adds additional column for each column containing missing values. 
In this case it adds two columns, one for ``ca`` and the other for ``thal``." 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 14, 402 | "metadata": {}, 403 | "outputs": [ 404 | { 405 | "name": "stdout", 406 | "output_type": "stream", 407 | "text": [ 408 | "(303, 16)\n" 409 | ] 410 | } 411 | ], 412 | "source": [ 413 | "imputer = SimpleImputer(missing_values= np.nan, strategy='mean', add_indicator=True)\n", 414 | "imputer = imputer.fit(heart_data)\n", 415 | "heart_data_imputed_with_indicator = imputer.transform(heart_data)\n", 416 | "print(heart_data_imputed_with_indicator.shape)" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "## **3. Feature Scaling**\n", 424 | "\n", 425 | "Feature scaling **transforms feature values** such that **all the features are on the same scale**.\n", 426 | "Using a feature matrix with all the features on the same scale matters for the following reasons:\n", 427 | "- **Enables faster convergence** in iterative optimization algorithms like gradient descent and its variants.\n", 428 | "- The performance of ML algorithms such as SVM, K-NN and K-means etc, that compute euclidean distance among input samples gets impacted if the features are not scaled.\n", 429 | "\n", 430 | "Tree based ML algorithms are not affected by feature-scaling. In other words, feature scaling is not required for tree based ML algorithms\n", 431 | "\n", 432 | "Feature scaling can be performed with the following methods:\n", 433 | "- Standardization\n", 434 | "- Normalization\n", 435 | "- MaxAbsScaler.\n", 436 | "\n", 437 | "Let's demonstrate feature scaling on a real-world dataset. For this purpose, we will be using [abalone dataset](https://archive.ics.uci.edu/ml/datasets/abalone). We will use different scaling utilities in ``sklearn`` library." 
438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 17, 443 | "metadata": {}, 444 | "outputs": [], 445 | "source": [ 446 | "cols = ['Sex','Length','Diameter','Height','Whole weight','Shucked weight','Viscera weight','Shell weight','Rings']\n", 447 | "abalone_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data',header=None,names=cols)" 448 | ] 449 | }, 450 | { 451 | "cell_type": "markdown", 452 | "metadata": {}, 453 | "source": [ 454 | "**STEP 1**: Examine the dataset" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": 18, 460 | "metadata": {}, 461 | "outputs": [ 462 | { 463 | "name": "stdout", 464 | "output_type": "stream", 465 | "text": [ 466 | "\n", 467 | "RangeIndex: 4177 entries, 0 to 4176\n", 468 | "Data columns (total 9 columns):\n", 469 | " # Column Non-Null Count Dtype \n", 470 | "--- ------ -------------- ----- \n", 471 | " 0 Sex 4177 non-null object \n", 472 | " 1 Length 4177 non-null float64\n", 473 | " 2 Diameter 4177 non-null float64\n", 474 | " 3 Height 4177 non-null float64\n", 475 | " 4 Whole weight 4177 non-null float64\n", 476 | " 5 Shucked weight 4177 non-null float64\n", 477 | " 6 Viscera weight 4177 non-null float64\n", 478 | " 7 Shell weight 4177 non-null float64\n", 479 | " 8 Rings 4177 non-null int64 \n", 480 | "dtypes: float64(7), int64(1), object(1)\n", 481 | "memory usage: 293.8+ KB\n" 482 | ] 483 | } 484 | ], 485 | "source": [ 486 | "abalone_data.info()" 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": {}, 492 | "source": [ 493 | "**STEP 1a**: [Optional]: convert non-numerical attributes into numerical ones\n", 494 | "> In this dataset only ``Sex`` is the non-numeric column" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": 19, 500 | "metadata": {}, 501 | "outputs": [ 502 | { 503 | "data": { 504 | "text/plain": [ 505 | "array(['M', 'F', 'I'], dtype=object)" 506 | ] 507 | }, 508 | 
"execution_count": 19, 509 | "metadata": {}, 510 | "output_type": "execute_result" 511 | } 512 | ], 513 | "source": [ 514 | "abalone_data.Sex.unique()" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": 20, 520 | "metadata": {}, 521 | "outputs": [ 522 | { 523 | "name": "stdout", 524 | "output_type": "stream", 525 | "text": [ 526 | "\n", 527 | "RangeIndex: 4177 entries, 0 to 4176\n", 528 | "Data columns (total 9 columns):\n", 529 | " # Column Non-Null Count Dtype \n", 530 | "--- ------ -------------- ----- \n", 531 | " 0 Sex 4177 non-null int64 \n", 532 | " 1 Length 4177 non-null float64\n", 533 | " 2 Diameter 4177 non-null float64\n", 534 | " 3 Height 4177 non-null float64\n", 535 | " 4 Whole weight 4177 non-null float64\n", 536 | " 5 Shucked weight 4177 non-null float64\n", 537 | " 6 Viscera weight 4177 non-null float64\n", 538 | " 7 Shell weight 4177 non-null float64\n", 539 | " 8 Rings 4177 non-null int64 \n", 540 | "dtypes: float64(7), int64(2)\n", 541 | "memory usage: 293.8 KB\n" 542 | ] 543 | } 544 | ], 545 | "source": [ 546 | "#Assign numeric values to sex.\n", 547 | "abalone_data = abalone_data.replace({'Sex': {'M':1,'F':2,'I':3}})\n", 548 | "abalone_data.info()" 549 | ] 550 | }, 551 | { 552 | "cell_type": "markdown", 553 | "metadata": {}, 554 | "source": [ 555 | "**STEP 2**: Separate labels from features." 
 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": 21, 561 | "metadata": {}, 562 | "outputs": [ 563 | { 564 | "name": "stdout", 565 | "output_type": "stream", 566 | "text": [ 567 | "The dataframe object after deleting the column\n", 568 | "\n", 569 | "RangeIndex: 4177 entries, 0 to 4176\n", 570 | "Data columns (total 8 columns):\n", 571 | " # Column Non-Null Count Dtype \n", 572 | "--- ------ -------------- ----- \n", 573 | " 0 Sex 4177 non-null int64 \n", 574 | " 1 Length 4177 non-null float64\n", 575 | " 2 Diameter 4177 non-null float64\n", 576 | " 3 Height 4177 non-null float64\n", 577 | " 4 Whole weight 4177 non-null float64\n", 578 | " 5 Shucked weight 4177 non-null float64\n", 579 | " 6 Viscera weight 4177 non-null float64\n", 580 | " 7 Shell weight 4177 non-null float64\n", 581 | "dtypes: float64(7), int64(1)\n", 582 | "memory usage: 261.2 KB\n" 583 | ] 584 | } 585 | ], 586 | "source": [ 587 | "y = abalone_data.pop('Rings')\n", 588 | "print('The dataframe object after deleting the column')\n", 589 | "abalone_data.info()" 590 | ] 591 | }, 592 | { 593 | "cell_type": "markdown", 594 | "metadata": {}, 595 | "source": [ 596 | "**STEP 3**: Examining the feature scales\n", 597 | "\n", 598 | "#### Statistical method\n", 599 | "Check the scales of different features with ``describe()`` method of dataframe." 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": 23, 605 | "metadata": {}, 606 | "outputs": [ 607 | { 608 | "data": { 609 | "text/html": [ 610 | "
\n", 611 | "\n", 624 | "\n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | "
countmeanstdmin25%50%75%max
Sex4177.01.9554700.8278151.00001.00002.00003.0003.0000
Length4177.00.5239920.1200930.07500.45000.54500.6150.8150
Diameter4177.00.4078810.0992400.05500.35000.42500.4800.6500
Height4177.00.1395160.0418270.00000.11500.14000.1651.1300
Whole weight4177.00.8287420.4903890.00200.44150.79951.1532.8255
Shucked weight4177.00.3593670.2219630.00100.18600.33600.5021.4880
Viscera weight4177.00.1805940.1096140.00050.09350.17100.2530.7600
Shell weight4177.00.2388310.1392030.00150.13000.23400.3291.0050
\n", 729 | "
" 730 | ], 731 | "text/plain": [ 732 | " count mean std min 25% 50% 75% \\\n", 733 | "Sex 4177.0 1.955470 0.827815 1.0000 1.0000 2.0000 3.000 \n", 734 | "Length 4177.0 0.523992 0.120093 0.0750 0.4500 0.5450 0.615 \n", 735 | "Diameter 4177.0 0.407881 0.099240 0.0550 0.3500 0.4250 0.480 \n", 736 | "Height 4177.0 0.139516 0.041827 0.0000 0.1150 0.1400 0.165 \n", 737 | "Whole weight 4177.0 0.828742 0.490389 0.0020 0.4415 0.7995 1.153 \n", 738 | "Shucked weight 4177.0 0.359367 0.221963 0.0010 0.1860 0.3360 0.502 \n", 739 | "Viscera weight 4177.0 0.180594 0.109614 0.0005 0.0935 0.1710 0.253 \n", 740 | "Shell weight 4177.0 0.238831 0.139203 0.0015 0.1300 0.2340 0.329 \n", 741 | "\n", 742 | " max \n", 743 | "Sex 3.0000 \n", 744 | "Length 0.8150 \n", 745 | "Diameter 0.6500 \n", 746 | "Height 1.1300 \n", 747 | "Whole weight 2.8255 \n", 748 | "Shucked weight 1.4880 \n", 749 | "Viscera weight 0.7600 \n", 750 | "Shell weight 1.0050 " 751 | ] 752 | }, 753 | "execution_count": 23, 754 | "metadata": {}, 755 | "output_type": "execute_result" 756 | } 757 | ], 758 | "source": [ 759 | "abalone_data.describe().T" 760 | ] 761 | }, 762 | { 763 | "cell_type": "code", 764 | "execution_count": null, 765 | "metadata": {}, 766 | "outputs": [], 767 | "source": [] 768 | } 769 | ], 770 | "metadata": { 771 | "kernelspec": { 772 | "display_name": "ML", 773 | "language": "python", 774 | "name": "python3" 775 | }, 776 | "language_info": { 777 | "codemirror_mode": { 778 | "name": "ipython", 779 | "version": 3 780 | }, 781 | "file_extension": ".py", 782 | "mimetype": "text/x-python", 783 | "name": "python", 784 | "nbconvert_exporter": "python", 785 | "pygments_lexer": "ipython3", 786 | "version": "3.13.5" 787 | }, 788 | "orig_nbformat": 4 789 | }, 790 | "nbformat": 4, 791 | "nbformat_minor": 2 792 | } 793 | -------------------------------------------------------------------------------- /MLP/Week_11.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | 
"nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Week_11.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "source": [ 22 | "## [Lecture 11.1: K-means clustering on digit dataset](https://www.youtube.com/watch?v=-tPSKI9nUf0)" 23 | ], 24 | "metadata": { 25 | "id": "1VQs-b78IKar" 26 | } 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "source": [ 31 | "In this notebook, we will implement `K-Means` algorithm with `sklearn`" 32 | ], 33 | "metadata": { 34 | "id": "LaG01b-nIaqm" 35 | } 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 1, 40 | "metadata": { 41 | "id": "PLkFDybvDsNc" 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "import matplotlib.pyplot as plt\n", 46 | "import numpy as np\n", 47 | "import pandas as pd\n", 48 | "\n", 49 | "# KMeans clustering\n", 50 | "from sklearn.cluster import KMeans\n", 51 | "\n", 52 | "# Loading the digit dataset\n", 53 | "from sklearn.datasets import load_digits\n", 54 | "\n", 55 | "# Selecting k through silhouette score\n", 56 | "from sklearn.metrics import silhouette_score\n", 57 | "\n", 58 | "# Normalization through MinMaxScaler\n", 59 | "from sklearn.preprocessing import MinMaxScaler\n", 60 | "\n", 61 | "from sklearn.pipeline import Pipeline" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "source": [ 67 | "### Clustering of digits\n", 68 | "We will use the digits dataset for clustering, which is loaded through the `load_digits` API\n", 69 | "- It loads 8x8 digit images, with approximately 180 samples per class\n", 70 | "- Across 10 classes, it has a total of 1797 images\n", 71 | "- Each pixel has a value between 0 and 16" 72 | ], 73 | "metadata": { 74 | "id": "lGPHpCn2JXt_" 75 | } 76 | }, 77 | { 78 | "cell_type": "code", 79 | "source": [ 80 | "digits = 
load_digits()" 81 | ], 82 | "metadata": { 83 | "id": "7SnsK4ehJGEu" 84 | }, 85 | "execution_count": 2, 86 | "outputs": [] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "source": [ 91 | "Let's quickly check `KMeans` class as implemented in `sklearn.cluster` module" 92 | ], 93 | "metadata": { 94 | "id": "pAW_Y6ReJ72R" 95 | } 96 | }, 97 | { 98 | "cell_type": "code", 99 | "source": [ 100 | "?KMeans" 101 | ], 102 | "metadata": { 103 | "id": "xdPQSYfuJ4zI" 104 | }, 105 | "execution_count": 3, 106 | "outputs": [] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "source": [ 111 | "Some of the important parameters are as follows:\n", 112 | "- `init`\n", 113 | "- `n_init`\n", 114 | "- `max_iter`\n", 115 | "- `random_state`\n", 116 | "\n", 117 | "Since the KMeans algorithm is susceptible to local minima, we perform multiple `KMeans` fits and select the one with the lowest value of sum of squared error\n", 118 | "\n", 119 | "The total number of times we would like to run the KMeans algorithm is specified through the `n_init` parameter.\n", 120 | "\n", 121 | "`max_iter` specifies the maximum number of iterations of the algorithm for a single run." 122 | ], 123 | "metadata": { 124 | "id": "Q1jnSaOvKHl7" 125 | } 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "source": [ 130 | "Let's define parameters of KMeans clustering algorithm in a dictionary object." 
131 | ], 132 | "metadata": { 133 | "id": "M_ViEA2VK3ug" 134 | } 135 | }, 136 | { 137 | "cell_type": "code", 138 | "source": [ 139 | "kmeans_kwargs = {\n", 140 | " 'init': 'random',\n", 141 | " 'n_init': 50,\n", 142 | " 'max_iter': 500,\n", 143 | " 'random_state': 0\n", 144 | "}" 145 | ], 146 | "metadata": { 147 | "id": "-TY1TIbcKEWZ" 148 | }, 149 | "execution_count": 4, 150 | "outputs": [] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "source": [ 155 | "Let's define a pipeline with two stages:\n", 156 | "- Preprocessing for feature scaling with `MinMaxScaler`.\n", 157 | "- Clustering with `KMeans` clustering algorithm" 158 | ], 159 | "metadata": { 160 | "id": "iOV6zonNoADt" 161 | } 162 | }, 163 | { 164 | "cell_type": "code", 165 | "source": [ 166 | "pipeline = Pipeline([('Preprocess', MinMaxScaler()),\n", 167 | " ('Clustering', KMeans(n_clusters=10, **kmeans_kwargs))])\n", 168 | "pipeline.fit(digits.data)" 169 | ], 170 | "metadata": { 171 | "id": "W8Qx_q-WLLUm", 172 | "colab": { 173 | "base_uri": "https://localhost:8080/" 174 | }, 175 | "outputId": "e02c0439-4700-4cf7-ae26-5d78dd383a19" 176 | }, 177 | "execution_count": 5, 178 | "outputs": [ 179 | { 180 | "output_type": "execute_result", 181 | "data": { 182 | "text/plain": [ 183 | "Pipeline(steps=[('Preprocess', MinMaxScaler()),\n", 184 | " ('Clustering',\n", 185 | " KMeans(init='random', max_iter=500, n_clusters=10, n_init=50,\n", 186 | " random_state=0))])" 187 | ] 188 | }, 189 | "metadata": {}, 190 | "execution_count": 5 191 | } 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "source": [ 197 | "The cluster centroids can be accessed via `cluster_centers_` member variable of `KMeans` class." 
198 | ], 199 | "metadata": { 200 | "id": "mlYS_0dVoxrT" 201 | } 202 | }, 203 | { 204 | "cell_type": "code", 205 | "source": [ 206 | "cluster_centers = pipeline[-1].cluster_centers_" 207 | ], 208 | "metadata": { 209 | "id": "ugJKlEesoggn" 210 | }, 211 | "execution_count": 6, 212 | "outputs": [] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "source": [ 217 | "Let's display cluster centroids:" 218 | ], 219 | "metadata": { 220 | "id": "eW0OpJTPpAu0" 221 | } 222 | }, 223 | { 224 | "cell_type": "code", 225 | "source": [ 226 | "# displaying centroids\n", 227 | "fig, ax = plt.subplots(5, 2, figsize=(4, 4))\n", 228 | "for i, j in zip(ax.flat, cluster_centers.reshape(10, 8, 8)):\n", 229 | " i.imshow(j)" 230 | ], 231 | "metadata": { 232 | "colab": { 233 | "base_uri": "https://localhost:8080/", 234 | "height": 267 235 | }, 236 | "id": "5NMKg5AWo_S1", 237 | "outputId": "009b925a-5004-4347-ccc7-77bbd6a47e33" 238 | }, 239 | "execution_count": 7, 240 | "outputs": [ 241 | { 242 | "output_type": "display_data", 243 | "data": { 244 | "text/plain": [ 245 | "
" 246 | ], 247 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAALsAAAD6CAYAAAD5lDajAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAZMklEQVR4nO2da3RU13XH/1ujxwi9kCzEQyiywCgYuya2KdgYrxjbGFI7wU28HHCSrjZtlUdpHk2yTNLlldp1kpXWabvycBvFIU3iYJI4kY1jzNOJiUnAEo1TbAxYCGQkBEgggZDQY0a7HzTUOrNHM/dKuqO5c/bvi9hb+957NHvP4Zx79tmHmBmKYgMZk90ARUkWGuyKNWiwK9agwa5Ygwa7Yg0a7Io1OAp2IlpFRIeJqJGI1nvdKGVySHc/U6L37EQUAHAEwAoALQDqAaxl5oOjXZNNORxEnnmfrCxD7pudKa5bUNAudIMxmtd4broh57RdEjY8NGQ+Dz0Y4H4arc2249bPsXyMqE93aGrU7wEUzuwWukwaErqO9iJDzupI7GMA6EZnBzNPi9VmGXGSxQAambkJAIhoE4DVAEYN9iDysITuMB80o9yQD/1zmbjuxeWPC93psPyD7nnqs4Y89yuvCZuhbvND3ce7RmuuMowrP8fyMWWa4dR91yJx3Z3/+FuhK828KHTf/+7dhjzziVeFzVBvr9Dt5KebY7UXcDaMKQdwYoTcEtEp6UXa+9lJz+4IIqoBUAMAQUyZqNsqKYTffeykZ28FUDFCnh3RGTBzLTMvYuZFWciZqPYpySOhn/3uYyc9ez2AeURUheE/fg2ABxJeReZspXNZhSE/uGizuGTxno8J3dLKY0K39j27zQb+6Br5/INyIqTExb2fo3ycMdWcVJ5cFRKX3F0ox94H+iqELvc9p81HvTBDPv/NprjNiyZhsDNziIjWAdgGIABgAzO/7uopSspjg58djdmZeQuALR63RZlk0t3PuoKqWMOEvY0RRC1WTT3QachPrr9HXDJDvlJH6cPyHewLLQtMmzNnx9BAZdxE+Zhyc83fk1wR3Hz+BqEryewRut7+bNPmYqewcYv27Io1aLAr1qDBrliDBrtiDd5NUKMIHzxiyPkdMjHtjUcrhe5bJb8Xus3P32TIJd1vjbN1yoQwOGiIgbNZwqQmhj+lFfCTF1Yacrj9zXE1DdCeXbEIDXbFGjTYFWvQYFesIWkTVEF/v1BNe1k258c33ix0H713pyHv2nqLsKE9MrtO8RYOmVmO4Xy5JP6OzHyhe6ZH6qY1XIh777GgPbtiDRrsijVosCvW4GjMTkTHAXQDCAMIMbPcNq74nnT3s5sJ6nJm7hjrgzKi0j+peKqwKX1WVm3Y079E6O5/aKshtyyXm38r9rhtoRJhzH6OTvF9Z7XYqoxHO+YLXQZkKvD56gJDLvqjDFW3k1YdxijW4DTYGcB2ItofKacgIKIaImogooZByNeKii+I62e/+9jpMGYZM7cSURmAHUR0iJmNLf7MXAugFgAKqUTPrvEncf3sdx873XDdGvl5hojqMFwqbXf8q0woz6z7d3jdTNmYi7IA1cCVfUK3dIqZAfe9cIznRZViw/jXJNKe8fqZ+8ze/nT3FcKmo0guIJVmya2XPbPMQUdxvqwbGe4677RpABwMY4goj4gKLv8bwF0AZHFFxdfY4GcnPft0AHU0XBAnE8BGZt4a/xLFh6S9n50USWoCsDAJbVEmERv8rK8eFWtIXtYjmxlwuXMuCJMDSzYK3bFBOXlZsWedIVdvlnVjONusO0Jh/V5POFG1Hoc6zdouwU1zxSVlX5Q1OGdly5ow2ReiXvYEAmNooIlGgGINGuyKNWiwK9agwa5YQ8LT8sZ0U6J2AM0ASgGMOVMyAW7vXTnaKWqKe0b4GPCJnz0J9v+/OVGDVznRXt5bcYdf/KzDG
MUaNNgVa/A62Gt9em/FHb7ws6djdkVJJRz17ES0iogOE1EjEa33ulHK5JDufk7YsxNRAMARACswfMR3PYC1zBzzTHsAyKYcDsJMtqfcoCFzhdxxUR2UyfgHukqFLtg2YN5rMPHOjD70YID7KaGhpbj1cywfi3tmyWLUmXNllbDeQWmXfcK044EBYROLbnR2jPbq0Uki2GIAjZEUUBDRJgCrAcT8EIgoUIBiLKE7DH3GVeaucv6mTAjaOv95oavaLLe8LvinZkMOnTotbKLZx7sS2liOKz8HkSd8HE3m9FlCV/yDXqF79ZTcoVb5OTMBMHSsWdjEYic/Paqhk2FMOYATI+SWiG40FjtqlZJquPWz75iwFN/IbvQaAMV+3HmuJGaEjxGErNWT6jjp2VsBjDxcfnZEZ8DMtZGVrgezkDNBzVOSSEI/X/YxMy/yo4+d9Oz1AOYRURWG//g1AB6IY98Kkrv725aXGHLd3CfEhb+4KMd4f3nzy0L36+vNEtU5LyQesysJcetnSdRmjiN/L8/I2lPxmNAtf+4LQsfdp1w92glO9qCGiGgdgG0AAgA2MPPrcS6pn6jGKcljDH72HU7rxmwBsMWhbagwoySxoZJyuPGzH9HcGMUavNlwzQCHzUWjrIvm4tXtOz8jLss9li10n3rgWaEbyta1oVQkUFJsyJ+4Z5uwWf6KXDepfEKecRo+e27iGhZBe3bFGjTYFWvQYFesQYNdsQbvKoJFZVOWbT1myEXH5AJS85/JDMypAZk4FOxwlgGnJJfwXDOV5obc3wibb7XfKXTdt84RuoLD5uvr8MEj8oEu92Joz65Ygwa7Yg0a7Io1aLAr1pC0ktWhNjOLLTtf5kOXv0tu2drc8S6hy2rrMu89zrYpE8PFStOnC7NlufGHb/ul0L21VG693N52tSFn/csNwiZz135X7dOeXbEGDXbFGjTYFWtwNGYnouMAugGEAYS0oGh6ku5+djNBXc7MYy5LTFlm+m7H0unC5qaSBqF7rv56oasuvWTIgQvycNlwhzxnSXHEmP1c9Lp5NtIg5ArnxpNLhK7tWbl9L/895guNU38jX0NcVV8oGxHnHGAdxijW4DTYGcB2ItofKacgIKIaImogogYtpeFb4vrZ7z52OoxZxsytRFQGYAcRHWJm40x7Zq5FpOJqIZVotVR/EtfPfvex0w3XrZGfZ4ioDsNVv3bHv8okMKPMkBd+8n+FzWen/VrorrxVjr2bFpml/J6vlwczL/jXfEOmFrnlTzEZr5/Dh44a8n0HPyxsPlgh52VPhOWYvXpquyFPy+0RNgNZ7tZEEw5jiCiPiAou/xvAXQBec/UUJeWxwc9OvhrTAdTRcAGcTAAbmXmrp61SJoO097OTIklNAOQ4QUkrbPCzvnpUrCFpWY88xTyM4HRfgbCpysoXupX5sgJbfaY5odk1rVrY9M0xM+mG2pP2p9rLkFkrqODjMov1hR9cK3R/+NLjQvezi0WG/NhXZNnJ4s5XXDVPe3bFGjTYFWvQYFesQYNdsQZPzkElonYAzQBKAYw5UzIBbu9dOdopaop7RvgY8ImfPT30l4gavMqJ9vLeijv84mcdxijWoMGuWIPXwV7r03sr7vCFnz0dsytKKuGoZyeiVUR0mIgaiWi9141SJod093PCnp2IAgCOAFiB4SO+6wGsZeaYZ9oDQDblcBB5CZ4sz0XqL5dVwq4o7Ba6nmOmHV/qi/8sAH3owQD362FMo+DWzzF9HOXTUGmMqm/T5VvEKRlhoWsZMHNjLp0NCpusLrk18MJge8dorx6dZEctBtAYSQEFEW0CsBpAzA+BiAIFKMYSuiPuTSlHnpB89NOyxNmHV74kdHs/dJ0hDx2UB1BFJyXt411x26O483MQecLH0T5t/4D05yNf+IHQ/WmO3I32DyfuNuQ3fni1sJnxbJPQbW37TrNQRnAyjCkHcGKE3BLRjcZiB/dUUg+3fvYdE5b3GtmNXgOg2I87z5XEj
PAxgpBDlFTHSc/eCqBihDw7ojNg5trISteDWZBDFCXlSejnyz5m5kV+9LGTnr0ewDwiqsLwH78GgMykfxvxRYhF//LrhO4X9/+70N37m78Tuvl95oGwmWWy5PHQBXNiS5d0/SwBbv0sCJSaldnmfkSeg/Ti+QVC95/dsjrc/TPqDflAUF7H/e7O1nKyBzVEROsAbAMQALCBmeX2obepj/M7JUUZg599h9O6MVsAbHFoGyqkksSGSsrhxs9+RP9vV6whabuQA1PNRYKiL74lbH54dqnQle3IErq2lTMNecoZuShRtDNqvNin60meE7WodPjpdwqTC3vkImHT++VG+wMrzSq+xW8OCpuhbnmveGjPrliDBrtiDRrsijVosCvW4N0ENWqycmqNuShQf9V3xCULv7lO6AaulVmZ195iJn69/tJVwqbgGbPEMQ/J6lTKBBMyj4IJ9EnfHb9HVoL75Pvk284hNvvh81fKFxU5YfliIh7asyvWoMGuWIMGu2INGuyKNXg3QY3a7lf0gZOGHCD5PaObuoTukQUvCN3KKWZi5S3bPy8fPxiVEacbyz2HC82V0Au3yu2S316yUehWTZH7H+47eqchZ/XE8J9Ln2rPrliDBrtiDRrsijU4GrMT0XEA3QDCAEJaUDQ9SXc/u5mgLmdmR6WDiQgZQbPOx6mXzY3qnyuRZRb+uvp3QremoFPo7jv6PkO+8qenhI27tTVlBI79HA31XDLkgj1yu92uq+X2up93yBpDZx6bY8hXvCiPZHW7Jq7DGMUanAY7A9hORPsj5RQERFRDRA1E1DCgpTT8Slw/j/SxH8ulOB3GLGPmViIqA7CDiA4xs3GmPTPXIlJxtSjjCn2p7U/i+nmkjwupxHc+drrhujXy8wwR1WG46tfuOPYYGjC3Uc154rghv/rb68V1de+XzVn93m8IXevjZpZj4Zt7R2274hy3fo4mdLLNkEsOzxQ2c4NnhO5Xz6wWuiu3/Y8hD/UlrueZiITDGCLKI6KCy/8GcBcAOVtQfI0NfnbSs08HUEfD+emZADYy81ZPW6VMBmnvZydFkpoALExCW5RJxAY/66tHxRq8y3qMqo8eajWzHrPPmvUaAYDvl/Uff3XxGqEr/qN5rS4gpQYUCBhy202y+On5sKz+W7GzR+g4PPHbKLVnV6xBg12xBg12xRo02BVr8OQcVCJqB9AMoBTAmDLoHOD23pWjnaKmuGeEjwGf+NnTQ3+JqMGrnGgv7624wy9+1mGMYg0a7Io1eB3stT69t+IOX/jZ0zG7oqQSjnp2IlpFRIeJqJGI1nvdKGVySHc/J+zZiSgA4AiAFRg+4rsewFpmjnmmPQBkUw4HITfRJmxMblDoplRdErqpgV5DfutkmbAJnDNt+rgHA6wHK42GWz878XH4Cvn7ylmnhe5cWJ6p1HPCzKGh7l5hE4tudHaM9urRSSLYYgCNkRRQENEmAKsBxPwQiChQgGIsoTscNW4kGdXzhe7GH8mjON9XZO5iWffwp4RNyVOmzd5+WUZPMXDl5yDyEvq46+6bhe67j/yH0D3VtUToGj5/oyFn7tof91mX2clPN4/2OyfDmHIAJ0bILRHdaCx21Col1XDrZ98xYSm+kd3oNQCK/bjzXEnMCB8jCJmqm+o46dlbAVSMkGdHdAbMXBtZ6XowCzKPWUl5Evr5so+ZeZEffeykZ68HMI+IqjD8x68B8EAce/FFACDOWMpYeLUwOf2I3IbxaNkBodvbZ35H+0rkvJOinqcz04S49bMgMM2cFz785Q3CZkpGSOiCGfJA35u/8Yoh/2HVLGETOiUnu/Fwsgc1RETrAGwDEACwgZnlrPFt6l21QEkJxuBn3+G0bswWAPJIs9i2oUIqGVejlMnBjZ/9iObGKNbg3YbrKALVcw35xJelzbeu+bnQ/exikdDdmGNOC/JPys25HHUmJ0PTIrzmwrvNyrurpuwQNgu//gWhm7n7vNCtecq8dvdN8p197jPux
uzasyvWoMGuWIMGu2INGuyKNSRtgtr1rlJDfuiap4TN8cFSoVuae0zoZmeaq3dTTg0Im+gJqs5Pvef8HLMi2NZeuco66yV51i2/3ih0/3bITDLrvSUgbOY+46592rMr1qDBrliDBrtiDRrsijUkbYJaePSiIX9p81phE86XK6Ffvk3OQs5lm2f3ZHbL/Hmdjyafvqgzxfb1zBU21NoudEOD8gXD4KAZmkNl498joT27Yg0a7Io1aLAr1uBozE5ExwF0Y/hEl5AWFE1P0t3Pbiaoy5l5zGWJM46bk8qrfjJD2FyoLpAX3iZVRwfNOjGBMzJFVG7+UhwyZj/P3Gtuq7x/TYOw2VeyQOgCRbJuzAfmvWrIdT+9dSxNMtBhjGINToOdAWwnov2RcgoCIqohogYiatBSGr4lrp/97mOnw5hlzNxKRGUAdhDRIWY2zrRn5lpEKq4WUom+5vYncf3sdx873XDdGvl5hojqMFz1a3f8q0zCUeeeUvdFYZMzXZ6DOi/7lNAd6KswZC7wX8GeVGS8fs5/+agh97PMVAw/3id0BdlS96Gp+wz5lZducNqMUUk4jCGiPCIquPxvAHcBeG3cT1ZSChv87KRnnw6gLlJ0KBPARmbe6mmrlMkg7f3spEhSE4CFSWiLMonY4Gd99ahYQ9KyHhF16AH3y1dXwVM9Qve73nlC13xJbt9TJp9wx1lD/vhDnxY2T3/1MaHrZVmJ875vmvVlZu2X9dndvg7Snl2xBg12xRo02BVr0GBXrMGTc1CJqB1AM4BSAGPOlEyA23tXjnaKmuKeET4GfOJnTw/9JaIGr3Kivby34g6/+FmHMYo1aLAr1uB1sNf69N6KO3zhZ0/H7IqSSjjq2YloFREdJqJGIlrvdaOUySHd/ZywZyeiAIAjAFZg+IjvegBrmTnmmfYAkE05HERe9H0MuX+m3HAxs6RT6AaH5AaAs2cLzed1yOR/Dpubf/vQgwHu1+NQR8Gtn2P5GFOChhgul7FVniN9nEPy/NvzQ7mGfLZNnq0VOCdzqbrR2THaq0cniWCLATRGUkBBRJsArAYQ80MgokABirGEzPraGUHzgzj2ievFtQ998GdC1zIgj5l88kcrDLnie/K4znCXWXFgH++K1VzlbVz5OYg84WOaf40hn/+q7IQerZblDOdkyuoQW3rMQ6F//NW7hU3Rk3uFbic/3SyUEZwMY8oBnBght0R0o7HYwT2V1MOtn33HhKX4Rnaj1wAo9uPOcyUxI3yMIPy379dJz94KYOQO59kRnQEz10ZWuh7MgjxeREl5Evr5so+ZeZEffeykZ68HMI+IqjD8x68B8EAce/FFAIDQovmG/PJfyST+5Q1/K3RZATl5qbqnyZDDP5eTF3TJcaASF7d+Fpy7znxx8MrCnwib/+qSI6Oa3/2F0AVazTleVaOcjLrFyR7UEBGtA7ANQADABmaWM8K3qR93q5SkMwY/+w6ndWO2ANji0DZUSPINipL6uPGzH9HcGMUakrbh+kKVuUiwr/8KYZP7jBx7d8qir5h9u/kq9Vj+leNqmzJGohYKe2aZ8vmhS+KSb/9wtdBVP39O6DI6Thpy9GZuQDdcK8qoaLAr1qDBrliDBrtiDUmboBY19hry91rfLWzu/dyLQvfu/DeE7tW+SkN+c+p8YaPf4uSTGZX31TskFwQ/+hFZK/Xb71ghdNUbzGxXPjP+/dwaE4o1aLAr1qDBrliDBrtiDUmboAb+cNiQz3/tT4TNkzfOFbr/zrtd6N670jxv5+y1ucJm2stuW6i4JmpL58zdZqbp0vmfFZeUV8qV0MpqeW5W2zIzO7K8qVDYhDvlFr94aM+uWIMGu2INGuyKNTgasxPRcQDdAMIAQlpQND1Jdz+7maAuZ+YxL2NxKGTIOWdlmYXK57qFjvoGha7rdnOzb++ssbZKicHY/fxaoyEu+Np0YXL6Trktb+DP5eMuzYxK4M0Yf8kfHcYo1uA02BnAdiLaHymnICCiG
iJqIKIGLaXhW+L62e8+djqMWcbMrURUBmAHER1iZuNMe2auRaTiaiGVaLVUfxLXz373sdMN162Rn2eIqA7DVb92x7/KJKPIXBR442Oy7kj+oQKhy+mSn+m9Rb835L2XrovxwKgakTIBT4livH7OyDdrP55aGWN8vkqWOPlgxQGh+8UWczGRL8k5nlsSDmOIKI+ICi7/G8BdAF4b95OVlMIGPzvp2acDqItU4c0EsJGZZVKy4nfS3s9OiiQ1AViYhLYok4gNftZXj4o1JC3rceiiWasvpzVb2Gxa9w2huyZbZjTe9tq9hlyxUy5G+e5VQTowwzwDoOrDbwqT71c9J3TXP/sZobv6l2Y9z1Bvr7Bxi/bsijVosCvWoMGuWIMGu2INnpyDSkTtAJoBlAIYf8GP2Li9d+Vop6gp7hnhY8Anfvb00F8iavAqJ9rLeyvu8IufdRijWIMGu2INXgd7rU/vrbjDF372dMyuKKmEDmMUa/Ak2IloFREdJqJGIlrvwf2PE9EBInqViBom+v6KM/zm5wkfxhBRAMARACsAtGD4XNS1zHxwAp9xHMCi8VQ7UMaHH/3sRc++GEAjMzcx8wCATQDkEWmK3/Gdn70I9nIAJ0bILRHdRJKw2oHiOb7zc9Ly2SeYhNUOlLRgQv3sRc/eCqBihDw7opswRu6CB3B5F7ySXHznZy+CvR7APCKqIqJsAGsAbJ6om9uwC94n+M7PEz6MYeYQEa0DsA1AAMAGZn59Ah+R9rvg/YAf/awrqIo16AqqYg0a7Io1aLAr1qDBrliDBrtiDRrsijVosCvWoMGuWMP/AfTeXG+hMjkVAAAAAElFTkSuQmCC\n" 248 | }, 249 | "metadata": { 250 | "needs_background": "light" 251 | } 252 | } 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "source": [ 258 | "In this case, the number of clusters were known. Hence we set `k=10` and got the clusters\n", 259 | "\n", 260 | "For deciding the optimal number of clusters through elbow and silhoutte, we will pretend that we do not know the number of clusters in the data and we will try to discover the optimal number through these two methods one by one:\n", 261 | "\n", 262 | "### Elbow method\n", 263 | "Here we keep track of sum-of-squared error (SSE) in a list for each value of k." 
264 | ], 265 | "metadata": { 266 | "id": "c8Cv2aixqrq5" 267 | } 268 | }, 269 | { 270 | "cell_type": "code", 271 | "source": [ 272 | "# Identifying the correct number of clusters\n", 273 | "sse_digit = []\n", 274 | "scaled_digits = MinMaxScaler().fit_transform(digits.data)\n", 275 | "for k in range(1, 12):\n", 276 | " kmeans = KMeans(n_clusters=k, **kmeans_kwargs)\n", 277 | " kmeans.fit(scaled_digits)\n", 278 | " sse_digit.append(kmeans.inertia_)" 279 | ], 280 | "metadata": { 281 | "id": "x1YuEMnwpW11" 282 | }, 283 | "execution_count": 8, 284 | "outputs": [] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "source": [ 289 | "Note that the SSE for a given clustering is exposed by the fitted `KMeans` object through its `inertia_` attribute." 290 | ], 291 | "metadata": { 292 | "id": "_NFrkwkbvogl" 293 | } 294 | }, 295 | { 296 | "cell_type": "code", 297 | "source": [ 298 | "plt.plot(range(1, 12), sse_digit)\n", 299 | "plt.xticks(range(1, 12))\n", 300 | "plt.xlabel('Number of clusters')\n", 301 | "plt.ylabel('SSE')\n", 302 | "plt.show()" 303 | ], 304 | "metadata": { 305 | "colab": { 306 | "base_uri": "https://localhost:8080/", 307 | "height": 279 308 | }, 309 | "id": "qo31vTUTvhDL", 310 | "outputId": "6acb93ac-8160-42ab-dc8b-1546cc2fcd91" 311 | }, 312 | "execution_count": 9, 313 | "outputs": [ 314 | { 315 | "output_type": "display_data", 316 | "data": { 317 | "text/plain": [ 318 | "
" 319 | ], 320 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYsAAAEGCAYAAACUzrmNAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3deXwV1fnH8c+ThAAJSxIJCCSQIIssAkJAVFAURVwqaquitS5V6SLu9qfW/upWW21dfmotLYpbXdC6FGoVRYqKGyTsYUe2JGzRQFjCFvL8/riDRgpcltx7s3zfr9d9Ze6ZM3OesTRP5pyZc8zdERER2Z+4WAcgIiLVn5KFiIiEpWQhIiJhKVmIiEhYShYiIhJWQqwDiIRmzZp5VlZWrMMQEalRpk2b9rW7p+9tX61MFllZWeTl5cU6DBGRGsXMVuxrn7qhREQkLCULEREJS8lCRETCUrIQEZGwlCxERCQsJQsREQlLyUJERMJSsqhk645dPPjeAgpKymIdiohItaJkUcn6sh38/Yvl3PXPfLTOh4jIdyKaLMzsZjOba2b5ZvaqmTUws+fNbJmZzQw+PYO6ZmZPmNkSM5ttZr0qnecKM1scfK6IVLytUhpy+5lH88miYv45syhSzYiI1DgRSxZm1hq4Achx925APDAs2P0rd+8ZfGYGZWcCHYLPcGBkcJ404G7gOKAvcLeZpUYq7suOa0uvNinc9695fLN5e6SaERGpUSLdDZUANDSzBCAJWLWfukOBFz3kSyDFzFoCZwAT3L3E3dcDE4AhkQo4Ls548Ifd2by9nPvfmRepZkREapSIJQt3LwIeBlYCq4FSd/8g2P1A0NX0mJnVD8paAwWVTlEYlO2r/HvMbLiZ5ZlZXnFx8WHF3rFFY345sD3/nLmKSQvXHda5RERqg0h2Q6USulvIBloByWZ2GXAncDTQB0gDbq+K9tx9lLvnuHtOevpeZ9g9KL885SjaN2/Eb97OZ8v28iqIUESk5opkN9RpwDJ3L3b3ncBbwAnuvjroatoOPEdoHAKgCMisdHxGULav8oiqnxDPQz88hlWlW3nkg0WRbk5EpFqLZLJYCfQzsyQzM2AQMD8YhyAoOw/ID+qPAy4PnorqR6jbajXwPjDYzFKDu5XBQVnE9W6bxmXHteW5z5cxY+X6aDQpIlItRXLMYgrwBjAdmBO0NQp42czmBGXNgN8Fh7wLLAWWAE8DvwzOUwLcD+QGn/uCsqj4nyGdaNG4AXe+NYcd5RXRalZEpFqx2vjyWU5OjlflSnkT5q3l2hfzuG1wR0ac2qHKzisiUp2Y2TR3z9nbPr3BfQBO79KCs7u35In/LOGr4s2xDkdEJOqULA7QPT/oSsN68dz51hwqKmrf3ZiIyP4oWRyg9Mb1ueuszkxdVsKY3ILwB4iI1CJKFgfhwpwMTjjqCP7w7nzWbtwW63BERKJGyeIgmBm/P/8Yduyq4Ldj88MfICJSSyhZHKSsZsncfHpH3p+7lvH5q2MdjohIVChZHIJr+mfTtVUTfjt2LqVbd8Y6HBGRiFOyOAQJ8XE8eEF3vt68nQffWxDrcEREIk7J4hAdk9GUawa049WpK/ly6TexDkdEJKKULA7Dzad1pE1aEr9+aw7bdu6KdTgiIhGjZHEYGibG8/vzj2Hp11v483+WxDocEZGIUbI4TP07NONHvTP468dfMX/1xliHIyISEUoWVeCuszrTtGE97nhzNrs0FYiI1EJKFlUgNTmRu8/tyqzCUp7/fHmswxERqXJKFlXkB91bcurRzXn4/YUUlJTFOhwRkSoV0WRhZjeb2VwzyzezV82sgZllm9kUM1tiZq+ZWWJQt37wfUmwP6vSee4Myhea2RmRjPlQmRn3n9eNOIO7/plPbVwnRETqroglCzNrDdwA5Lh7NyAeGAY8BDzm7u2B9cDVwSFXA+uD8seCephZl
+C4rsAQ4C9mFh+puA9H65SG/M+Qo/lkUTFjZ66KdTgiIlUm0t1QCUBDM0sAkoDVwKmEllsFeIHQOtwAQ4PvBPsHBet0DwXGuPt2d19GaNnVvhGO+5Bd1q8tx7ZJ4d5/zeWbzdtjHY6ISJWI5BrcRcDDwEpCSaIUmAZscPfyoFoh0DrYbg0UBMeWB/WPqFy+l2O+ZWbDzSzPzPKKi4ur/oIOUHyc8dAPu7N5ezm/+/f8mMUhIlKVItkNlUroriAbaAUkE+pGigh3H+XuOe6ek56eHqlmDkjHFo35xcD2vD2jiI8XxS5xiYhUlUh2Q50GLHP3YnffCbwFnAikBN1SABlAUbBdBGQCBPubAt9ULt/LMdXWdaccxVHpyfz6rTls2V4e/gARkWosksliJdDPzJKCsYdBwDxgEvCjoM4VwNhge1zwnWD/fzz0SNE4YFjwtFQ20AGYGsG4q0T9hHge+mF3ijZs5dEJi2IdjojIYYnkmMUUQgPV04E5QVujgNuBW8xsCaExidHBIaOBI4LyW4A7gvPMBV4nlGjGA9e5e42YtS8nK43L+rXhuc+WMbNgQ6zDERE5ZFYb3wfIycnxvLy8WIcBwMZtOxn86CekJNXjX9f3p1683oMUkerJzKa5e87e9uk3V4Q1aVCP+8/rxoI1mxj1ydJYhyMickiULKLg9C4tOPuYljw+cTFLizfHOhwRkYOmZBEld5/bhQYJcdz51hwqNDOtiNQwShZR0rxxA+46uzNTlpXwWl5B+ANERKoRJYsouignk+PbHcHv353P2o3bYh2OiMgBU7KIIjPj9xccw47yCu4eOzfW4YiIHDAliyjLbpbMTad1ZPzcNYzPXxPrcEREDoiSRQxcMyCbLi2b8Nux+ZRu3RnrcEREwlKyiIF68XE89MPufL15Ow+NXxDrcEREwlKyiJFjMppydf9sXpmykilLv4l1OCIi+6VkEUM3n96RzLSG3PnWHLbtrBHTXYlIHaVkEUNJiQn8/vxjWPr1Fp6atCTW4YiI7JOSRYwN6JDOD3tlMPKjr1iwZmOswxER2Ssli2rgN2d3pmnDetz6+iy27lB3lIhUP0oW1UBqciJ//FF35q3eyI1jZrBLc0eJSDUTyTW4O5nZzEqfjWZ2k5ndY2ZFlcrPqnTMnWa2xMwWmtkZlcqHBGVLzOyOSMUcS4M6t+Duc7rwwby1PPDv+bEOR0TkexLCVzk07r4Q6AlgZvGE1s1+G7gKeMzdH65c38y6AMOArkAr4EMz6xjsfgo4HSgEcs1snLvPi1TssXLlidmsLNnKs58tIzOtIVedmB3rkEREgAgmiz0MAr5y9xWh5bj3aigwxt23A8uC5VX7BvuWuPtSADMbE9StdckC4K6zO1O0oYz73plH65SGDO56ZKxDEhGJ2pjFMODVSt9HmNlsM3vWzFKDstZA5bm7C4OyfZXXSvFxxv9dfCzdM1K4YcwMZmntbhGpBiKeLMwsETgX+EdQNBI4ilAX1WrgkSpqZ7iZ5ZlZXnFxcVWcMmYaJsbzzOU5pDeuz9Uv5FJQUhbrkESkjovGncWZwHR3Xwvg7mvdfZe7VwBP811XUxGQWem4jKBsX+Xf4+6j3D3H3XPS09MjcBnRld64Ps9d2Zcd5RVc9XwupWWacFBEYicayeISKnVBmVnLSvvOB/KD7XHAMDOrb2bZQAdgKpALdDCz7OAuZVhQt9Zr37wRoy7PYeU3ZfzspTy2l+sdDBGJjYgmCzNLJvQU01uViv9oZnPMbDZwCnAzgLvPBV4nNHA9HrguuAMpB0YA7wPzgdeDunVCv3ZH8KcLu/Pl0hLueHMO7noHQ0SiL6JPQ7n7FuCIPcp+sp/6DwAP7KX8XeDdKg+whhjaszUFJWU8/MEiMtOSuOX0juEPEhGpQtF6dFYO03WntGdlSRlPTFxMRmpDLsrJDH+QiEgVUbKoIcyMB84/htWl2/j1W3No1bQh/Ts0i3VYIlJHaG6oGqRefBxP/bgX7
Zs34hcvTWPhmk2xDklE6gglixqmSYN6PHtlHxomxnPVc1NZu3FbrEMSkTpAyaIGapXSkGev7MOGrTu5+oVctmwvj3VIIlLLKVnUUN1aN+WpS3sxb9VGrn91BuW7KmIdkojUYkoWNdgpRzfnvqHd+M+Cddzzr7l6B0NEIkZPQ9Vwl/VrS8H6Mv728VLapiVz7UntYh2SiNRCSha1wO1nHE1hyVYeeHc+rVMbctYxLcMfJCJyEJQsaoG4OOORi3qwZuM2bnptJi2a1Kd327RYhyUitYjGLGqJBvXiefryHFo1bcC1L05j+ddbYh2SiNQiSha1SFpyIs9d1Rd356rnc1m/ZUesQxKRWkLJopbJbpbM05fnULRhK9e+mMe2nZrWXEQOn5JFLZSTlcajF/Ugb8V6bvvHLCoq9EitiBweDXDXUud0b0Xh+q08+N4CMtOSuH3I0bEOSURqMCWLWuxnJ7VjZUkZIz/6iszUJC49rk2sQxKRGipi3VBm1snMZlb6bDSzm8wszcwmmNni4GdqUN/M7AkzW2Jms82sV6VzXRHUX2xmV0Qq5trGzLjv3K4M7JTO/47NZ9LCdbEOSURqqIglC3df6O493b0n0BsoA94G7gAmunsHYGLwHeBMQutudwCGAyMBzCwNuBs4DugL3L07wUh4CfFx/PnSXnRq0ZgRL09n7qrSWIckIjVQtAa4BwFfufsKYCjwQlD+AnBesD0UeNFDvgRSzKwlcAYwwd1L3H09MAEYEqW4a4VG9RN49so+NGlYj58+n8vq0q2xDklEaphoJYthwKvBdgt3Xx1srwFaBNutgYJKxxQGZfsq/x4zG25meWaWV1xcXJWx1wpHNm3As1f2Ycv2XVz1XC6btu2MdUgiUoNEPFmYWSJwLvCPPfd5aJrUKnmu091HuXuOu+ekp6dXxSlrnc4tmzDysl4sWbeZX748nZ2a1lxEDlA07izOBKa7+9rg+9qge4ng5+5R1yIgs9JxGUHZvsrlEAzokM4D53dj8uKv+d9/5mtacxE5INFIFpfwXRcUwDhg9xNNVwBjK5VfHjwV1Q8oDbqr3gcGm1lqMLA9OCiTQ3RxnzaMOKU9Y3IL+MtHX8U6HBGpASL6noWZJQOnAz+rVPwg8LqZXQ2sAC4Kyt8FzgKWEHpy6ioAdy8xs/uB3KDefe5eEsm464JbB3ekYH0Zf3p/IRmpDRna87+GgUREvhXRZOHuW4Aj9ij7htDTUXvWdeC6fZznWeDZSMRYV5kZf/xRd1aXbuNX/5iNmXFuj1axDktEqinNDVWH1U+IZ9RPetM9oyk3vDqD3/xzjiYeFJG9UrKo41KSEnl1eD9+dnI7XvpyJT/66+es/KYs1mGJSDWjZCHUi4/jzjM788zlORSUbOXsJyczPn9NrMMSkWpEyUK+dVqXFrxzfX/aNUvm5y9N4/535rGjXO9iiIiShewhMy2J139+PFeekMXoT5dx8agvKNqg6UFE6jolC/kv9RPiuefcrjx1aS8Wr93M2U9MZtICzVgrUpcpWcg+nd29Jf+6vj8tmzbkqudz+eP4BZRrihCROknJQvYru1kyb//yBC7p24a/fPQVlz4zhbUbt8U6LBGJMiULCatBvXj+cMExPHZxD+YUlnL2E5P5bMnXsQ5LRKJov8nCzJrsZ5/W6Kxjzj82g3EjTiQ1KZHLRk/h8Q8Xs6tCExGK1AXh7iw+2r1hZhP32PfPKo9Gqr0OLRozdsSJnN+zNY99uIgrn5vK15u3xzosEYmwcMnCKm2n7Wef1CFJiQk8clEPHrzgGKYuK+HsJyYzdZnmdhSpzcIlC9/H9t6+Sx1iZgzr24a3f3kiSYkJXPL0l4z86Csq1C0lUiuFm3W2uZndQuguYvc2wXctRyd0adWEcSNO5I435/DQ+AXkLi/hkQt7kJqcGOvQRKQKhbuzeBpoDDSqtL37+zORDU1qisYN6vHnS4/lvqFdmby4mHOe/JQZK9fHOiwRqUIWyWU1z
SyFUFLpRqjb6qfAGcC1QHFQ7dfu/m5Q/07gamAXcIO7vx+UDwEeB+KBZ9z9wf21m5OT43l5eVV/QRLWrIINXPfKdNZu3Mavz+rMlSdkYabhLZGawMymuXvO3vaFe3T2WjPrEGybmT1rZqVmNtvMjj2Ath8Hxrv70UAPYH5Q/pi79ww+uxNFF2AY0BUYAvzFzOLNLB54itBa3l2AS4K6Ug31yEzh39cP4OSOzbn3X/P45cvT2bhtZ6zDEpHDFK4b6kZgebB9CaFf+O2AW4An9negmTUFTgJGA7j7DnffsJ9DhgJj3H27uy8jtLxq3+CzxN2XuvsOYExQV6qppkn1ePry3tx1Vmc+mLeWHzz5KflFpbEOS0QOQ7hkUe7uu/8sPAd40d2/cfcPgeQwx2YT6mp6zsxmmNkzwZrcACOCu5NnzSw1KGsNFFQ6vjAo21f595jZcDPLM7O84uLiPXdLlJkZ157UjteG92P7zgouGPk5L09ZQSS7PUUkcsIliwoza2lmDQitm/1hpX0NwxybAPQCRrr7scAW4A5gJHAU0BNYDTxyKIHvyd1HuXuOu+ekp+tBreoiJyuNf9/Qn37tjuCut/O56bWZbNleHuuwROQghUsWvwXyCHVFjXP3uQBmdjKwNMyxhUChu08Jvr8B9HL3te6+y90rCD1h1TfYXwRkVjo+IyjbV7nUEEc0qs/zV/bhtsEd+desVZz7509ZuGZTrMMSkYMQLlmsBY4HOrv7tWZ2uZmNBX4MDN/fge6+Bigws05B0SBgnpm1rFTtfCA/2B4HDDOz+maWDXQApgK5QAczyzazREKD4OMO/BKlOoiLM0ac2oGXrjmO0q3lDH3qU96YVhjrsETkAIVLFn8DNrv7ejM7CXgQeJFQEnn8AM5/PfCymc0m1O30e+CPZjYnKDsFuBkguGt5HZgHjAeuC+5AyoERwPuEnqZ6ffcdjtQ8JxzVjHdv7E/PzBRu+8csbvvHLDarW0qk2tvvexZmNsvdewTbTwHF7n5P8H2mu/eMSpQHSe9ZVH/luyp4YuJi/jxpCZlpSTx2cU96tUkNf6CIRMwhv2cBxJvZ7ilBBgH/qbQv3FQhIvuUEB/HLYM7MWb48ZTvci786xc8/uFircQnUk2FSxavAh8H4xRbgckAZtYe0IPzctj6Zqfx3k0D+EH3ljz24SIuHvUlBSVlsQ5LRPYQdroPM+sHtAQ+cPctQVlHoJG7T498iAdP3VA109iZRfzm7XwcuPfcrlzQq7WmChGJov11Q4XtSnL3L/dStqgqAhOpbGjP1vRqk8otr8/k1n/MYtLCdTxw3jE0TaoX69BE6jytwS3VSmZaEmOGH8+vzujE+Pw1nPn4J3zx1TexDkukzlOykGonPs647pT2vPmLE6hfL55Ln/mSh8YvYEe5Br9FYkXJQqqtHpkpvHN9f4b1yWTkR19xwcjP+Kp4c6zDEqmTlCykWkuun8AfLujOXy/rTdH6rZz9xGRNSCgSA0oWUiMM6XYk4286iT5Zadz1dj7XvjiNbzZvj3VYInWGkoXUGC2aNOCFq/ryv+d04ZNFxQx5fDIfLVwX67BE6gQlC6lR4uKMq/tnM3bEiaQm1ePK53K5Z9xctu3cFevQRGo1JQupkTq3bMK4Ef258oQsnv98OUP//BkL1myMdVgitZaShdRYDerFc8+5XXnhp30pKdvBuU9+xjOTl1JRocFvkaqmZCE13skd0xl/4wBO6pjO7/49nyuem8rajdtiHZZIraJkIbXCEY3q8/TlvXng/G7kLi9hyP99wvj8NbEOS6TWiGiyMLMUM3vDzBaY2XwzO97M0sxsgpktDn6mBnXNzJ4wsyVmNtvMelU6zxVB/cVmdkUkY5aay8z48XFteef6AbRObcjPX5rGHW/O1prfIlUg0ncWjwPj3f1ooAehle7uACa6ewdgYvAd4ExCS6l2ILRk60gAM0sD7gaOI7Re9927E4zI3rRv3oi3fnEivxh4FK/lF
XDOk58yq2BDrMMSqdEilizMrClwEjAawN13uPsGYCjwQlDtBeC8YHso8KKHfAmkBOt1nwFMcPcSd18PTACGRCpuqR0SE+K4fcjRvHJNP7bv3MUPR37OU5OWsEuD3yKHJJJ3FtlAMfCcmc0ws2fMLBlo4e6rgzprgBbBdmugoNLxhUHZvspFwjr+qCN478aTGNLtSP70/kIuGfUlheu1uJLIwYpkskgAegEj3f1YYAvfdTkB4KEJfqrkTz0zG25meWaWV1xcXBWnlFqiaVI9nrzkWB69qAfzVm/kzMcnM3ZmkeaXEjkIkUwWhUChu08Jvr9BKHmsDbqXCH7unq+hCMisdHxGULav8u9x91HunuPuOenp6VV6IVLzmRkX9MrgvRsH0LFFY24cM5PLn53K4rWbYh2aSI0QsWTh7muAAjPrFBQNAuYB44DdTzRdAYwNtscBlwdPRfUDSoPuqveBwWaWGgxsDw7KRA5aZloSrw3vx90/6MKsgg0MeXwy9/5rLqVbd8Y6NJFqLeyyqofpeuBlM0sElgJXEUpQr5vZ1cAK4KKg7rvAWcASoCyoi7uXmNn9QG5Q7z53L4lw3FKLJcTHcdWJ2ZzboxWPTFjE858vZ+zMVdw2uBMX98kkPk7rfovsyWpjv21OTo7n5eXFOgypIeauKuXecfOYuryErq2acM+5XemTlRbrsESizsymuXvO3vbpDW6p87q2asprP+vHk5ccS8mWHVz41y+44dUZrC7dGuvQRKoNJQsRQgPgP+jRiom3nswNp7Zn/Nw1nPrwxzw5cbGmPxdByULke5ISE7hlcCcm3nIyAzul88iERZz26MeMz1+jR22lTlOyENmLzLQkRl7Wm1euOY7kxAR+/tI0Lhs9hUV61FbqKCULkf04oX0z/n1Df+49tyv5RaEX+u4ZN5fSMj1qK3WLkoVIGAnxcVxxQhaTbhvIJX0zefGL5Qx8eBIvT1mhuaakzlCyEDlAacmJ/O68Y3jn+gF0aNGYu97O5wdPfsrUZXrtR2o/JQuRg9SlVRNeG96PP196LBvKdnDR375gxCvTWbVBj9pK7aVkIXIIzIxzurdi4q0DuXFQBybMW8upj3zE4x/qUVupnZQsRA5Dw8R4bj69IxNvPZlTj27OYx8uYtAjH/PenNV61FZqFSULkSqQkZrEX37cm1euPY7GDRL4xcvTufTpKSxYszHWoYlUCSULkSp0wlHNeOf6/tw/tCvz12zkrMcn89ux+Wwo2xHr0EQOi5KFSBVLiI/jJ8dnMenWgfz4uLa89OUKBj78EX//YjnluypiHZ7IIVGyEImQ1ORE7j+vG/++YQBHH9mY/x07l3Oe/JRJC9ZpPENqHCULkQjr3LIJr17bj7/8uBebtpVz1fO5DH7sE17LXaknp6TG0HoWIlG0o7yCd2av4unJy5i/eiPNGiVy+fFZXNavLWnJibEOT+q4mK1nYWbLzWyOmc00s7yg7B4zKwrKZprZWZXq32lmS8xsoZmdUal8SFC2xMzuiGTMIpGUmBDHBb0yePeG/rx8zXF0a92URycs4oQHJ/Kbf85h2ddbYh2iyF5FellVgFPc/es9yh5z94crF5hZF2AY0BVoBXxoZh2D3U8BpwOFQK6ZjXP3eRGOWyRizIwT2zfjxPbNWLR2E89MXsrruYW8PGUlp3VuwfCT2pHTNhUzLfEq1UM0ksWBGgqMcfftwDIzWwL0DfYtcfelAGY2JqirZCG1QscWjfnjj3pw2xmd+PsXK/j7lyuYMG8tPTJTuHZANkO6HklCvIYXJbYi/S/QgQ/MbJqZDa9UPsLMZpvZs2aWGpS1Bgoq1SkMyvZV/j1mNtzM8swsr7i4uGqvQiQKmjduwK2DO/H5Hady/9CulJbtYMQrMxj48EeM/nQZm7eXxzpEqcMinSz6u3sv4EzgOjM7CRgJHAX0BFYDj1RFQ+4+yt1z3D0nPT29Kk4pEhNJiQn85PgsJt46kL/9pDctmzbg/nfmc
fwfJvKH9+azpnRbrEOUOiii3VDuXhT8XGdmbwN93f2T3fvN7GngneBrEZBZ6fCMoIz9lIvUWvFxxhldj+SMrkcyY+V6npm8jKc/Wcroycs4t0crrhnQji6tmsQ6TKkjIpYszCwZiHP3TcH2YOA+M2vp7quDaucD+cH2OOAVM3uU0AB3B2AqYEAHM8smlCSGAZdGKm6R6ujYNqk89eNUCkrKePazZbyWW8BbM4o4sf0RXDOgHQM7pmswXCIqkncWLYC3g3/ACcAr7j7ezP5uZj0JjWcsB34G4O5zzex1QgPX5cB17r4LwMxGAO8D8cCz7j43gnGLVFuZaUnc/YOu3DSoI69MXcnzny/jqudy6dC8EdcMyGZoz9Y0qBcf6zClFtJLeSI12O6X/EZ9spQFazbRrFF9rji+LZf1a0uqXvKTg7S/l/KULERqAXfnsyXf8PTkpXy8qJgG9eK4sHcmV/fPJqtZcqzDkxpif8miOr1nISKHyMzo36EZ/Ts0Y+Ga0Et+r+UW8NKUFZzeuQXX6iU/OUy6sxCppdZt2saLn6/gpSkr2FC2k56ZKVyjl/xkP9QNJVKHle0o541phTz76TKWf1NG65SGXHlCFhf3zaRJg3qxDk+qESULEWFXhTNx/lpGf7qMKctKSE6M56I+mfz0xGwy05JiHZ5UA0oWIvI9cwpLGf3pUt6ZvZoKd87oeiTXDMimVxuNa9RlShYisldrSrfxwhfLeWXKSkq37qRHZgrX9M/mzG4a16iLlCxEZL/KdpTz5rRCRlca17jihLZc3KcNTRtqXKOuULIQkQNSUeFMXLCO0Z8u5cul341rXHVCNm2O0LhGbadkISIHLb+olNGfLuNfs1ZR4c7gLqFxjd56X6PWUrIQkUO2pnQbL36xnJc1rlHrKVmIyGEr21HOm9OLePbTZSz7egutmjbgyhOzNK5RiyhZiEiVqahw/rNgHaM/XcYXS78hOTGeC3NC72toXKNmU7IQkYjILyrl2U+XMU7jGrWCkoWIRNTajaFxjZe+DMY1Mppy9YB2nNntSOppXKPG2F+yiOj/ima23MzmmNlMM8sLytLMbIKZLQ5+pgblZmZPmNkSM5ttZr0qneeKoP5iM7sikjGLyMFr0aQBvzrjaL6481TuP5v99rcAAA9ySURBVK8bm7aVc8OrMzj5j5P428dfsaFsR6xDlMMU0TsLM1sO5Lj715XK/giUuPuDZnYHkOrut5vZWcD1wFnAccDj7n6cmaUBeUAOodX1pgG93X39vtrVnYVIbFVUOJMWruOZyaFxjcSEOM7sdiTD+rShX7s0dVFVU9VtPYuhwMBg+wXgI+D2oPxFD2WvL80sxcxaBnUnuHsJgJlNAIYAr0Y3bBE5UHFxxqDOLRjUuQXzV29kzNSVvD2jiLEzV5F1RBIX92nDj3pnkN64fqxDlQMU6c5EBz4ws2lmNjwoa+Huq4PtNYTW6gZoDRRUOrYwKNtX+feY2XAzyzOzvOLi4qq8BhE5DJ1bNuHeod2YetdpPHpRD5o3acBD4xdw/B8m8vO/T2PSwnXsqqh9Y6e1TaTvLPq7e5GZNQcmmNmCyjvd3c2sSv6VuPsoYBSEuqGq4pwiUnUa1Ivngl4ZXNArg6+KN/NabgFvTitk/Nw1tE5pyIU5GVyYk0nrlIaxDlX2IqJ3Fu5eFPxcB7wN9AXWBt1LBD/XBdWLgMxKh2cEZfsqF5Ea6qj0Rvz6rM58cecg/vLjXrRLT+bxiYvp/9B/uPK5qYzPX8POXRWxDlMqidgAt5klA3HuvinYngDcBwwCvqk0wJ3m7v9jZmcDI/hugPsJd+8bDHBPA3Y/HTWd0AB3yb7a1gC3SM1TUFLGP/IKeD2vkDUbt9GsUX1+1DuDi/tkkt0sOdbh1Qkxec/CzNoRupuAUHfXK+7+gJkdAbwOtAFWABe5e4mFHo/4M6HB6zLgKnff/bjtT4FfB+d6wN2f21/bShYiNVf5rgo+WVzMq1ML+M+C0
HhGv3ZpXNK3DWd0PZIG9eJjHWKtpZfyRKRGWrtxG29MK+S13AJWlpTRtGE9zj+2NZf0bUOnIxvHOrxaR8lCRGq0igrni6XfMCa3gPfz17BjVwU9M1O4pG8m53RvRXL9WLwFUPsoWYhIrVGyZQdvTS9kTG4BS9ZtJjkxnnN7tmJYnzZ0z2iqF/4Og5KFiNQ67s70let5dWoB78xexbadFXRu2YRhfTI5r2drmiZp2vSDpWQhIrXaxm07GTdzFa/lFjCnqJT6CXGcdUxLzj+2NTlZqSQlqpvqQChZiEidkV9UypjclYydsYpN28uJjzO6tWpCTlYafbJS6d02TdOM7IOShYjUOVt37GLKsm/IW76eqctLmFWwge3loRf9spslk9M2lT5ZaeRkpZLdLFljHShZiIiwvXwX+UUbyVteQu7y9eStKGFD2U4AmjVKJKdtKHH0yUqjS6smdXIdDiULEZE9VFQ4S7/eTO7y9eQuLyFv+XpWlpQB0LBePMe2Sfm26+rYNqk0qgOP5ypZiIgcgLUbt5EXJI/c5SXMX72RCoc4gy6tmpDTNo0+QQJp3qRBrMOtckoWIiKHYNO2ncxYueHbrqsZBevZtjM07tH2iKQgeaSSk5XGUek1f9yjui1+JCJSIzRuUI+TOqZzUsd0AHbuqmDuqt3jHiV8tHAdb04vBCA1qd633VY5WWl0admkVs1jpTsLEZFD5O4s+3rLt11XeSvWs+zrLQDUizeOPrIJPTKb0j0jhR4ZKbRv3oj4uOp796FuKBGRKCnetJ1pK0qYWVDK7MINzCksZdP2cgCSEuPp1ropPTKa0iMzlEAyUhtWm+4rJQsRkRgJPXW1hdmFG5hdWMrMgg3MW72RHcE7H2nJiXTP2H33EfoZq5cGNWYhIhIjcXFG++aNaN+8ERf0ygBgR3kFi9ZuYmbBhm+TyCeLFrN7KfLWKQ3pHtx9dM9oyjGtm9K4QWznulKyEBGJssSEOLq1bkq31k2BtgBs2V7O3FUbmV24IUgipbyXvwYAs9BStN0zmtIjI4UemSl0btmY+gnRG0CPeLIws3ggDyhy93PM7HngZKA0qHKlu88MVsp7nNCyqmVB+fTgHFcAvwnq/87dX4h03CIi0ZRcP4G+2Wn0zU77tqxky45v7zxmFWzgk0Vf89b0IiD6A+jRuLO4EZgPNKlU9it3f2OPemcCHYLPccBI4LhgDe67gRzAgWlmNs7d10c8chGRGEpLTmRgp+YM7NQcCD19tbp0G7MKNjCrMDSAPnbGKl76ciUQGkAf1LkFT15ybJXHEtFkYWYZwNnAA8AtYaoPBV700Ij7l2aWYmYtgYHABHcvCc45gdA63a9GLHARkWrIzGiV0pBWKQ0585iWwH8PoCclRqZrKtJ3Fv8H/A+w52K5D5jZb4GJwB3uvh1oDRRUqlMYlO2r/HvMbDgwHKBNmzZVFb+ISLW2twH0iLQTqROb2TnAOneftseuO4GjgT5AGnB7VbTn7qPcPcfdc9LT06vilCIiEojkHLwnAuea2XJgDHCqmb3k7qs9ZDvwHNA3qF8EZFY6PiMo21e5iIhEScSShbvf6e4Z7p4FDAP+4+6XBeMQBE8/nQfkB4eMAy63kH5AqbuvBt4HBptZqpmlAoODMhERiZJYvGfxspmlAwbMBH4elL9L6LHZJYQenb0KwN1LzOx+IDeod9/uwW4REYkOTfchIiLA/qf7qHvrBoqIyEFTshARkbCULEREJKxaOWZhZsXAisM4RTPg6yoKpya0G8u261q7sWxb11w32j6cdtu6+15fVKuVyeJwmVnevgZ5amO7sWy7rrUby7Z1zXWj7Ui1q24oEREJS8lCRETCUrLYu1F1rN1Ytl3X2o1l27rmutF2RNrVmIWIiISlOwsREQlLyUJERMJSsqjEzJ41s3Vmlh++dpW2m2lmk8xsnpnNNbMbo9RuAzObamazgnbvjUa7ldqPN7MZZvZOlNtdbmZzzGymmUVtErFg9
cc3zGyBmc03s+Oj1G6n4Fp3fzaa2U1Ravvm4N9Wvpm9amYNotFu0PaNQbtzI3m9e/u9YWZpZjbBzBYHP1Oj2PaFwTVXmFmVPUKrZPF9zxNasjXayoFb3b0L0A+4zsy6RKHd7cCp7t4D6AkMCaaHj5bd67PHwinu3jPKz8E/Dox396OBHkTp2t19YXCtPYHehGZ1fjvS7ZpZa+AGIMfduwHxhJYriDgz6wZcS2i9nB7AOWbWPkLNPc9//964A5jo7h0IVgSNYtv5wAXAJ1XZkJJFJe7+CRD16c+DBaGmB9ubCP0S+a+lYyPQrrv75uBrveATlSceKq3P/kw02os1M2sKnASMBnD3He6+IQahDAK+cvfDmeHgYCQADc0sAUgCVkWp3c7AFHcvc/dy4GNCv0Cr3D5+bwwFXgi2XyC0dk9U2nb3+e6+sKrbUrKoZswsCzgWmBKl9uLNbCawDpjg7lFpl+/WZ6+IUnuVOfCBmU0L1m6PhmygGHgu6Hp7xsySo9R2ZcOAV6PRkLsXAQ8DK4HVhBY0+yAabRP663qAmR1hZkmE1srJDHNMVWoRLN4GsAZoEcW2I0LJohoxs0bAm8BN7r4xGm26+66geyID6BvcvkfUftZnj5b+7t4LOJNQl99JUWgzAegFjHT3Y4EtRK5rYq/MLBE4F/hHlNpLJfQXdjbQCkg2s8ui0ba7zwceAj4AxhNaaG1XNNreSyxOlO7YI0nJopows3qEEsXL7v5WtNsPukQmEZ0xm72uzx6FdoFv/+LF3dcR6rvvu/8jqkQhUFjpzu0NQskjms4Eprv72ii1dxqwzN2L3X0n8BZwQpTaxt1Hu3tvdz8JWA8silbbwNpKS0i3JHTnXqMpWVQDwXrko4H57v5oFNtNN7OUYLshcDqwINLt7mt99ki3C2BmyWbWePc2oTXdI/70m7uvAQrMrFNQNAiYF+l293AJUeqCCqwE+plZUvBvfBBRfKDBzJoHP9sQGq94JVptA+OAK4LtK4CxUWw7Mtxdn+BD6P9Iq4GdhP4SvDpK7fYndJs6m9Dt8kzgrCi02x2YEbSbD/w2Bv/NBwLvRLG9dsCs4DMXuCuKbfcE8oL/3v8EUqPYdjLwDdA0yv/73kvoD5B84O9A/Si2PZlQQp4FDIpgO//1ewM4gtBTUIuBD4G0KLZ9frC9HVgLvF8VbWm6DxERCUvdUCIiEpaShYiIhKVkISIiYSlZiIhIWEoWIiISlpKF1Ehm5mb2SKXvt5nZPVV07ufN7EdVca4w7VwYzD47KZJxmVmWmV168BGKfEfJQmqq7cAFZtYs1oFUFkyYd6CuBq5191MiFU8gCzioZHGQ1yF1gJKF1FTlhNYavnnPHXv+BW5mm4OfA83sYzMba2ZLzexBM/txsKbHHDM7qtJpTjOzPDNbFMxltXvSxT+ZWa6ZzTazn1U672QzG8de3so2s0uC8+eb2UNB2W8JvYw52sz+tJdjbg+OmWVmD+5l//LdidLMcszso2D75ErrVswI3lZ/kNCkejOD9SUO6DqCt93/HcSQb2YXH8j/MFI76a8HqcmeAmab2R8P4pgehKavLgGWAs+4e18LLTh1PbB7kZwsQnNGHQVMCtZCuJzQzKl9zKw+8JmZ7Z5FtRfQzd2XVW7MzFoRmtCuN6H5iT4ws/Pc/T4zOxW4zd3z9jjmTEIT8B3n7mVmlnYQ13cbcJ27fxZMTLmN0ISFt7n77qQ3/ECuw8x+CKxy97OD45oeRBxSy+jOQmosD83M+yKhBXYOVK6H1g/ZDnxFaFZSgDmEEsRur7t7hbsvJpRUjiY0j9TlwZTuUwhN6dAhqD91z0QR6AN85KHJ9MqBlwmta7E/pwHPuXtZcJ0Hs8bKZ8CjZnYDkBK0uacDvY45wOlm9pCZDXD30oOIQ2oZJQup6f6PUN9/5bUhygn+bZtZHJBYad/2StsVlb5X8P077T3nwXHAgOs9WHXO3bP9u/UZthzWVRy8b68R+HapUnd/E
LgGaEjojuHovRx7QNfh7osI3WnMAX4XdJ1JHaVkITVa8Ff364QSxm7LCXX7QGj9hnqHcOoLzSwuGMdoBywE3gd+EUwnj5l1tPALGE0FTjazZmYWT2jm14/DHDMBuMpCi/awj26o5Xx3jT/cXWhmR7n7HHd/CMgldEe0CWhc6dgDuo6gC63M3V8C/kT0p1SXakRjFlIbPAKMqPT9aWCsmc0itPDNofzVv5LQL/omwM/dfZuZPUOoq2p6MOV2MWGWy3T31WZ2B6G1Qgz4t7vvd7pqdx9vZj2BPDPbAbwL/HqPavcSGhy/H/ioUvlNZnYKoTulucB7wfau4L/H84TWAj+Q6zgG+JOZVRCa1fQX+4tbajfNOisiImGpG0pERMJSshARkbCULEREJCwlCxERCUvJQkREwlKyEBGRsJQsREQkrP8HeGt5W2Mot3wAAAAASUVORK5CYII=\n" 321 | }, 322 | "metadata": { 323 | "needs_background": "light" 324 | } 325 | } 326 | ] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "source": [ 331 | "There is a slight elbow at `k=9` which could point to the fact that a few digits may have been merged in one cluster." 332 | ], 333 | "metadata": { 334 | "id": "d0Wj1fYCwEhF" 335 | } 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "source": [ 340 | "### Silhoutte" 341 | ], 342 | "metadata": { 343 | "id": "ASM9vuyCzKIb" 344 | } 345 | }, 346 | { 347 | "cell_type": "code", 348 | "source": [ 349 | "sil_coef_digits = []\n", 350 | "for k in range(2, 15):\n", 351 | " kmeans = KMeans(n_clusters=k, **kmeans_kwargs)\n", 352 | " kmeans.fit(scaled_digits)\n", 353 | " score = silhouette_score(digits.data, kmeans.labels_)\n", 354 | " sil_coef_digits.append(score) " 355 | ], 356 | "metadata": { 357 | "id": "B_OE4DBJv9e7" 358 | }, 359 | "execution_count": 10, 360 | "outputs": [] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "source": [ 365 | "plt.plot(range(2, 15), sil_coef_digits)\n", 366 | "plt.xticks(range(2, 15))\n", 367 | "plt.xlabel('Number of clusters')\n", 368 | "plt.ylabel('silhouette score')\n", 369 | "plt.grid(True)\n", 370 | "plt.show()" 371 | ], 372 | "metadata": { 373 | "colab": { 374 | "base_uri": "https://localhost:8080/", 375 | "height": 279 376 | }, 377 | "id": "D5g9yY5Jzp1I", 378 | "outputId": "92092d9b-05d7-48bd-8f57-cd401accea07" 379 | }, 380 | "execution_count": 11, 381 | "outputs": [ 382 | { 383 | "output_type": "display_data", 384 | 
"data": { 385 | "text/plain": [ 386 | "
" 387 | ], 388 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAEGCAYAAAB/+QKOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3deXxU5fX48c8hEBIIS9jCHnYBAYGwuKLRqrhBrai4UHEpasVqq7b4taVqN5dq1Ur7s3XBDaOirRRpcQu2VZEQ9rBoWBKIbIawhJCQZM7vj3tDxzhJJmTuzGRy3q/XvDJ3ee45N5A5c59773NFVTHGGGOqaxbpBIwxxkQnKxDGGGMCsgJhjDEmICsQxhhjArICYYwxJqDmkU4gVDp16qR9+vQ57vaHDx+mdevWoUsohmOEK06sxAhXHNuX6IsRrjgNiZGdnf21qnYOuFBVY+KVlpamDZGZmdmg9k0pRrjixEqMcMWxfYm+GOGK05AYwHKt4XPVupiMMcYEZAXCGGNMQFYgjDHGBORpgRCRiSKySURyRWRWgOUTRGSFiFSIyJRqyx4WkXXu60ov8zTGGPNtnhUIEYkD5gAXAEOBq0RkaLXV8oHpwLxqbS8CRgMjgfHA3SLS1qtcjTHGfJuXRxDjgFxV3aKqR4EMYLL/Cqq6TVXXAL5qbYcC/1bVClU9DKwBJnqYqzHGmGpEPRrN1e0ymqiqN7nT04DxqjozwLpzgYWqOt+dPg/4JXAu0ApYBsxR1ceqtZsBzABISUlJy8jIOO58i4uLSUpKOu72TSlGuOLESoxwxbF9ib4Y4YrTkBjp6enZqjom4MKarn9t6AuYAjzrNz0NeLqGdecCU6rNuw9YBbwPvArcWVs8uw8ifDHCFSdWYoQjzuY9h/RXr7ynlZU+T+Ooxs6/S6z82zc0BhG6D6IA6OU33dOdFxRV/Y2qjlTVcwEBvghxfsbEhIL9R5j6l6U8u/Yo172wjL2HyiKdkokRXhaILGCgiPQVkXhgKrAgmIYiEiciHd33I4ARwHueZWpMI3WgpJzpzy/jSHkllw5owbKt+7jgyf/wSe7XkU7NxADPCoSqVgAzgcXABuANVc0RkQdFZBKAiIwVkR3A5cAzIpLjNm8B/EdE1gN/Aa51t2eMcZVVVPKDl5eTV1jCX6aNYfKAeN6ZeRrtW7Xg2uc+5/H3NlFRWf36D2OC5+lgfaq6CFhUbd5sv/dZOF1P1duV4lzJZIwJwOdT7npjNcu27uPJqSM5pX9HlmyHwV3bsmDmacx+J4enPspl6dZ9PDV1FF3bJUQ6ZdMI2Z3UxjRCD/9rIwvX7GTWBYOZPLLHN5a1im/O7y8/iccuP4l1BQe48Kn/kLlpT4QyNV7w+ZScrw7w/H+3MuOl5Ty71pvzTjEz3LcxTcWLn27jmX9vYdrJqdw8oV+N612W1pOTerVn5rwVXP9CFjef2Y+7zzuBFnH2vbCxqfQpG3YeZOmWQj7fuo9lW/dx4Eg5AL07tGJIG2/iWoEwphH517pd3P+PHM4dmsL9k05ERGpdf0CXJP5+22k8uHA9z3y8hayt+3jqqlH0TG4VpozN8fAvCEu3FLJs6z4OljqnYVM7tmLiiV0Z368D4/t1pEf7RJYsWeJJHlYgjGkksvOKuCNjJSf1bM9TU0cR16z24lAloUUcv710OKf068i9b6/loqf+y6NTRnDeiV09ztgEq9KnrP/KryBs28chtyD06diKC4d34+R+HRnfrwPd2iWGLS8rEMY0Alv2FnPTi1l0a5fAc9eNITE+rt7buOSk7gzv0Y6Zr61gxsvZXH9aH2ZdMJiWzeu/LdMwFZU+1h87QthH1tZ9HCpzCkK/Tq25eIRbEPp2jOgFBlYgjIlyXxeXMf2FLESEudePo2NSy+PeVp9OrXnr1lP53aKNvPDJNpZvK+Lpq0eR2tH7R282ZRWVP
tZ9dZDP3SOErG1FFFcVhM6tuWRkd7cgdCClbfRccWYFwpgoVnK0ghvnZrHnUCmv/eBk+nRq+Ad5y+Zx3D/pRE7p35F73lzNxU/9l99dNpyLR3QPQcYGnCGMvthdzJJNe3h3eSkzM98/VhAGdElisl9B6BJFBaE6KxDGRKmKSh+3z1vJ2oIDPDNtDKN6J4d0++ef2JWh3dpy+2srmTlvJZ9tLuQXFw8loYV1OR2P0vJKPttcyEcb9/DRxj0U7D8CQPck4dJRvZ2Tyn070rnN8R8BhpsVCGOikKoye0EOH27cw6++O4xzh6Z4EqdXh1a8ecsp/H7xJp759xay84qYc81o+nf2fpTTWPDV/iPHCsKnm7+mtNxHq/g4ThvQidvPHkD64C5sWLGUs84aFulUj4sVCGOi0J+WbGbe5/nccmZ/pp2c6mmsFnHNuPfCIZzcryM/eWMVl/zxv/zm0mFcOupbgxw0eZU+ZWV+ER9u3EPmxj1s3HUIcO5FmDq2N2cP7sL4fh2+ceJ/Q6SSDQErEMZEmb+t3MGjizcxeWR3fnr+CWGLmz64C4vuOIM7XlvFj19fzae5hTww+URaxTftj4n9JUf5+Iu9fLRxDx9/sZf9JeU0byaM6ZPMfRcOIX1wF/p3bl3nPSmNUdP+lzcmynya+zU/nb+GU/p15JEpI2gW5L0OodKtXSLzfjCeJz/8kqczc1m1fT9zrhnNoJTQ36rr8yl7i8vYUVTCjqIjFOw/4vwsOkJRUSmL962ha9tEurVLIKVdAt3aJdC1XQJtWjb39MO46gTzhxt3k7lxD9l5RfgUOraO55zBKZw9uAtnDOpE24QWnuUQLaxAGBMlNu46yM0vZ9O3U2v+37S0iN2f0DyuGXeddwLj+3bkztdXMenp//LApBO5Ykyven0wV1T62HmglIL9zoe+UwRKjhWCnftLOVpttNnkVi3okZzIoTLlvZzdFB4++q3ttoqPo2u7BLq2dQpGt2Pv3WLSNoGOrePrVVxLyyv5dPPXfLRxD5kb9x47wTysR1tmpjvnEk7q2T7sBTvSrEAYEwV2HjjC9OezaNUyjrnXj6NdYuS/nZ4+sBOL7jidH7++ip+9tZZPNxfym0uHH1teVlHJV/tL2VFUQkG1I4CC/UfYdbCUSt83H2ncpU1LeiQnMrxHOy4Y1o0eyYn0bJ9Ij+REerRPpHVL5yNpyZIlnHXWWZRVVLLnYBk7D5Sy62Apuw4cYdeBMnYdPMLOA6Us3VzI7kNl34rTIk7o0uZ/Rx3/KyaJdG3Xkq7tEtlb4uPlpXlkVjvBfLrfCeZouichEqxAGBNhB0vLmf58FsVlFbx5yyl0bx++oRTq0qVNAi/dMJ4/Zebyhw++IDuviJa+Mu755INvPbmumThdVD2SExnft4Pz4Z+cSI/2reiR7Hy7r+8ltC2bx9GrQyt6dah57KhKn1JY7BSRnQdK2X3Q/+cR1hUc4IMNuyktD/RsjHW1nmBu6qxAGBNBRyt83PJyNpv3FjP3+nEM6dY20il9S1wz4fZzBjKubwceWbyJI8VlnN23y7Fv/VWFoGvbBJpHYKTYuGZCl7YJdGmbwEm9Aq+jqhw4Uu53JFJKzoZNTL/glJg9wRwKViCMiRBV5WdvreHTzYU8fsVJnD6wU6RTqtX4fh1569ZT3e6fEZFOp15EhPat4mnfKv5YEV5SsoUBXex+j9rYwPDGRMjv39vE31YWcPd5g/jeaLvnwEQfKxDGRMArS/OYk7mZq8b15rb0AZFOx5iAPC0QIjJRRDaJSK6IzAqwfIKIrBCRChGZUm3ZIyKSIyIbROQpsU5CEyM+WL+b2e+s4+zBXfjV5Lof+mNMpHhWIEQkDpgDXAAMBa4SkaHVVssHpgPzqrU9FTgNGAEMA8YCZ3qVqzHhsmr7fm5/bSXDerTj6atHReSkrjHB8vIk9TggV1W3AIhIBjAZWF+1gqpuc5dVv/5MgQQgHhCgBbDbw1yN8Vxe4WFunJtFp
zbxPHfd2CY/hIWJfqKqda91PBt2uowmqupN7vQ0YLyqzgyw7lxgoarO95v3e+AmnALxtKreF6DdDGAGQEpKSlpGRsZx51tcXExSkrdXNMRKjHDFiZUYALuKivnD2mYUlys/H59It6TQHznYv330xQhXnIbESE9Pz1bVMQEXqqonL2AK8Kzf9DScD/pA684FpvhNDwDeBZLc12fAGbXFS0tL04bIzMxsUPumFCNccWIlRklZhZ7920U66L5FunxboWdx7N8++mKEK05DYgDLtYbPVS87QAsA/9tWerrzgnEpsFRVi1W1GPgncEqI8zPGc6rK3W+uZssBH09OHUlaaodIp2RM0LwsEFnAQBHpKyLxwFRgQZBt84EzRaS5iLTAOUHdmIdVN03U++t38+7anVw2sAUTh3WLdDrG1ItnBUJVK4CZwGKcD/c3VDVHRB4UkUkAIjJWRHYAlwPPiEiO23w+sBlYC6wGVqvqP7zK1RgvlJZX8qt31zOwSxIT+0Z+8D1j6svTyyhUdRGwqNq82X7vs3C6nqq3qwRu9jI3Y7z2l39vYfu+I8y7aTxHd6yLdDrG1JtdhG2MB3YUlfCnJblcOLwrpw6I7jGWjKmJFQhjPPDbRc4ps/suqn5vqDGNhxUIY0Lsk9yvWbR2Fz88awA9oujZDsbUlxUIY0KovNLHLxfk0KtDIjMm9It0OsY0iBUIY0LoxU+3kbunmF9cNLTeT08zJtpYgTAmRPYeKuPJD77kzEGdOXdoSqTTMabBrEAYEyIP/2sjpRWV/PKSoTaEt4kJViCMCYEV+UXMz97BDaf3pV9ne4yliQ1WIIxpIJ9PuX9BDl3atOT2swdGOh1jQsYKhDEN9Mby7azZcYD/u3AISS3tGQ8mdliBMKYBDpSU88jiTYztk8zkkd0jnY4xIWUFwpgGePz9TewvOcr9k+zZ0ib2WIEw5jht2HmQl5fmcc34VE7s3i7S6RgTclYgjDkOqsovF+TQLrEFd503KNLpGOMJKxDGHId/rNnJsq37uPv8E2jfKj7S6RjjCSsQxtTT4bIKfvvuBob1aMvUsb0jnY4xnrFr8oyppzmZuew6WMqca0YR18xOTJvYZUcQxtTD1q8P89f/bOF7o3qQltoh0ukY4ykrEMbUw4P/yKFl8zhmXTA40qkY4zlPC4SITBSRTSKSKyKzAiyfICIrRKRCRKb4zU8XkVV+r1IR+a6XuRpTlw837CZz017uOGcgXdomRDodYzzn2TkIEYkD5gDnAjuALBFZoKrr/VbLB6YDd/u3VdVMYKS7nQ5ALvCeV7kaU5fS8koeXLie/p1bc92pfSKdjjFh4eVJ6nFArqpuARCRDGAycKxAqOo2d5mvlu1MAf6pqiXepWpM7Z7771byCkt4+cZxxDe3nlnTNIiqerNhp8tooqre5E5PA8ar6swA684FFqrq/ADLPgIeV9WFAZbNAGYApKSkpGVkZBx3vsXFxSQleTtMc6zECFecaIlReMTHvf89wvBOcdw+6vi6lqJlXxpLnFiJEa44DYmRnp6erapjAi5UVU9eON/8n/WbngY8XcO6c4EpAeZ3A/YCLeqKl5aWpg2RmZnZoPZNKUa44kRLjNtezdZB9y3S/MLDnsZpKPu3j74Y4YrTkBjAcq3hc9XLY+UCoJffdE93Xn1cAfxNVctDlpUx9fDZ5kIWrtnJLWf2p1eHVpFOx5iw8rJAZAEDRaSviMQDU4EF9dzGVcBrIc/MmCBUVPq4f0EOPZMTufWs/pFOx5iw86xAqGoFMBNYDGwA3lDVHBF5UEQmAYjIWBHZAVwOPCMiOVXtRaQPzhHIx17laExtXl6ax6bdh/j5RUNJaBEX6XSMCTtPh9pQ1UXAomrzZvu9z8LpegrUdhvQw8v8jKnJ18VlPP7+F5wxsBPnn5gS6XSMiQi7Xs+YAB791yaOHK3kl5fYg4BM0xV0gRARO0NnmoTV2/fzRvZ2rj+tDwO6eH8ZpDHRqs4CISKni
sh6YKM7fZKI/MnzzIyJAJ9Pmb0gh05JLfnROQMjnY4xERXMEcQfgPOBQgBVXQ1M8DIpYyJl/oodrN6+n1kTB9MmoUWk0zEmooLqYlLV7dVmVXqQizERdeBIOQ//cyNpqclcOsqujzAmmKuYtovIqYCKSAvgDpzLVo2JKU988AX7So7y4qRxNLMHARkT1BHELcBtOJecFuCMsnqbl0kZE26bdh3ipc/yuGpcb4b1aBfpdIyJCrUeQbhDdj+pqteEKR9jwk5VuX9BDkktm3PPeSdEOh1jokatRxCqWgmkukNlGBOTFq3dxWdbCrn7vEEkt7b/6sZUCeYcxBbgExFZAByumqmqj3uWlTFhUlah/Obd9Qzp1parx6dGOh1jokowBWKz+2oGtPE2HWPCa+HWcr46UM4TU0cRZyemjfmGOguEqj4AICJJ7nSx10kZEw6b9xbzz63lfHdkd8b17RDpdIyJOsHcST1MRFYCOUCOiGSLyInep2aMd44creS2V1eQEAf3Xjgk0ukYE5WCucz1L8BPVDVVVVOBu4C/epuWMd5RVe77+1o27T7EzSNaktL2+B4jakysC6ZAtFbVzKoJVV0CtPYsI2M8lpG1nbdXFPCjswcyvLOnI94b06gFUyC2iMgvRKSP+/o5zpVNxjQ66woO8MsFOZwxsJMNxmdMHYIpEDcAnYG3gbeATu48YxqVAyXl3PpqNh1bx/PElSPtqiVj6hDMVUxFwI/CkIsxnvH5lLveXM3O/aW8fvMpdExqGemUjIl6wVzF9L6ItPebThaRxcFsXEQmisgmEckVkVkBlk8QkRUiUiEiU6ot6y0i74nIBhFZ7z6j2pjj8pf/bOGDDbu576IhpKUmRzodYxqFYLqYOqnq/qoJ94iiS12N3HGc5gAXAEOBq0RkaLXV8oHpwLwAm3gJeFRVhwDjgD1B5GrMtyzdUsijizdx0fBuTD+1T6TTMabRCKZA+ESkd9WEiKQCGkS7cUCuqm5R1aNABjDZfwVV3aaqawCf/3y3kDRX1ffd9YpVtSSImMZ8w56Dpdz+2kpSO7TiocuG2/OljamHYK7xuw/4r4h8DAhwBjAjiHY9AP8HDe0AxgeZ1yBgv4i8DfQFPgBmuYMHGhOUikoft7+2kkOl5bxy43h7Qpwx9SSqdR8MiEgn4GR3cqmqfh1EmynARFW9yZ2eBoxX1ZkB1p0LLFTV+X5tnwNG4XRDvQ4sUtXnqrWbgVusUlJS0jIyMurcl5oUFxeTlOTtA+pjJUa44jQ0xpubjvLu1nJ+MDye03oELg72+4rOOLESI1xxGhIjPT09W1XHBFyoqrW+gNNwbpYDuBZ4HEgNot0pwGK/6XuBe2tYdy4wxW/6ZOBjv+lpwJza4qWlpWlDZGZmNqh9U4oRrjgNifFezi5N/dlCnfXWGs9i1Ee0/76iLU6sxAhXnIbEAJZrDZ+rwZyD+DNQIiInAT/BGdn1pSDaZQEDRaSv+zyJqcCCINpVtW0vIp3d6bOB9UG2NU1cfmEJd72ximE92vLLS6pfF2GMCVYwBaLCrTKTcb7FzyGIYb9VtQKYCSzGeYb1G6qaIyIPisgkABEZKyI7gMuBZ0Qkx21bCdwNfCgia3HOfdj4T6ZOpeWV/HBeNgB/viaNhBZxEc7ImMYrmJPUh0TkXpzupQki0gwI6myfqi4CFlWbN9vvfRbQs4a27wMjgoljTJUH/rGedQUHefb7Y+jVoVWk0zGmUQvmCOJKoAy4UVV34XygP+ppVsYch7eyd/DasnxuPas/3xmaEul0jGn0ghlqYxfOiemq6XyCOwdhTNhs3HWQ+/6+lvF9O3DXuYMinY4xMSGYIwhjotqh0nJufWUFbRJa8MerR9E8zv5bGxMKNhi+adRUlVlvrSV/XwnzbhpPlzb28B9jQiWor1oikigiJ3idjDH19cIn23h37U7uOf8ExvfrGOl0jIkpwYzmegmwCviXOz1SRIK9n8EYz2TnFfHbRRv4zpAUb
p7QL9LpGBNzgjmCuB9n4L39AKq6Cmd8JGMiprC4jJnzVtCtfQKPXXGSDcJnjAeCKRDlqnqg2rxgRnM1xhOVPuXO11dRePgof74mjXaJNgifMV4I5iR1johcDcSJyECcp8t96m1axtTsqQ+/5D9ffs3vvjecYT3aRTodY2JWMEcQtwMn4twsNw84ANzhZVLG1GTJpj089dGXXDa6J1PH9op0OsbEtGCOIC5S1ftwngsBgIhcDrzpWVbGBFCw/wg/fn0VJ6S04dffHWbnHYzxWDBHEPcGOc8Yzxyt8HHbqysor1T+dM1oEuNtED5jvFbjEYSIXABcCPQQkaf8FrUFKrxOzBh/v120gVXb9/Ona0bTr7P3D3kxxtTexfQVsByYBGT7zT8E/NjLpIzx94/VXzH3023ccFpfLhzeLdLpGNNk1FggVHU1sFpEUlT1Rf9lInIH8KTXyRmTu6eYWW+tYXTv9sy6YHCk0zGmSQnmHMTUAPOmhzgPY76lrEL54avZtGwRx5xrRhPf3AbhMyacajsHcRVwNdC32tAabYB9XidmmjZVZW5OGV/uqeSlG8bRrV1ipFMypsmp7RzEp8BOoBPwmN/8Q8AaL5My5rVl2/lsZyU//s4gzhjYue4GxpiQq+0cRB6QB5wiIqnAQFX9QEQSgUScQmFMyOUXlvDrd9dzYsdm3H72gEinY0yTFcxorj8A5gPPuLN6An8PZuMiMlFENolIrojMCrB8goisEJEKEZlSbVmliKxyXzZ6bBPh8yk/fWs1zUS4YVhLmjWzm+GMiZRgzvrdBpwGHARQ1S+BLnU1EpE4YA5wATAUuEpEhlZbLR/nhPe8AJs4oqoj3dekIPI0MeCVz/NYumUfP79oCB0T7aS0MZEUzF9gmaoerZoQkeYEN5rrOCBXVbe47TOAyf4rqOo2VV0D+OqRs4lR+YUlPPTPjZwxsBNX2jhLxkScqNb+WS8ij+A8C+L7OAP3/RBY747PVFu7KcBEVb3JnZ4GjFfVmQHWnQssVNX5fvMqcB5UVAE8pKrf6tYSkRnADICUlJS0jIyMWvelNsXFxSQleXuHbqzE8CKOT5VHskrZdsDHb05PpGNiM/t9RWGMcMWJlRjhitOQGOnp6dmqOibgQlWt9YVzlPEDnMH55rvvJYh2U4Bn/aanAU/XsO5cYEq1eT3cn/2AbUD/2uKlpaVpQ2RmZjaofVOK4UWcFz/dqqk/W6ivfZ7nWYxAGuvvK1IxwhUnVmKEK05DYgDLtYbP1TpHc1VVH/BX91UfBYB/P0FPd15QVLXA/blFRJYAo4DN9czBNALWtWRMdKqzQIjIVgKcc1DVuh4CnAUMFJG+OIVhKs6Nd3USkWSgRFXLRKQTzknyR4JpaxoX/6uWHrpshA3hbUwUCeZ5EP59UwnA5UCHuhqpaoWIzAQWA3HA86qaIyIP4hzSLBCRscDfgGTgEhF5QFVPBIYAz4iID6eL6yFVXV+vPTONQtVVSw99bzg92tvd0sZEk2C6mAqrzXpCRLKB2UG0XQQsqjZvtt/7LJyup+rtPgWG17V907hZ15Ix0S2YLqbRfpPNcI4ogjnyMKZG1rVkTPQL5oPefxymCpwriq7wJBvTZFjXkjHRL5gupvRwJGKaDutaMqZxCGYspnYi8riILHdfj4lIu3AkZ2KPdS0Z03gEM9TG8zgjt17hvg4CL3iZlIld/mMtWdeSMdEtmHMQ/VX1Mr/pB0RklVcJmdhlXUvGNC7BHEEcEZHTqyZE5DTgiHcpmVhkXUvGND7BHEHcArzknncQnMeNTvcyKRN77KolYxqfYK5iWg2cJCJt3emDnmdlYkp+YQm/W2RdS8Y0NsHcKNcSuAzoAzSv6hpQ1Qc9zczEhKqupbhm1rVkTGMTTBfTO8ABIBso8zYdE2usa8mYxiuYAtFTVSd6nomJOVVdSxMGdbauJWMaoWCuYvpURGzgPFMv3+ha+t5w61oyphGq8QhCRNbiPAeiOXC9iGzB6
WISQFV1RHhSNI2Rf9dSd+taMqZRqq2L6eKwZWFiinUtGRMbaisQh8KWhYkZ1rVkTOyorUBk43QxBfoLV6CuR46aJsi6loyJHTUWCFXtG85ETONnXUvGxJbaTlIPVtWN1Z4od4yqrvAuLdPYWNeSMbGntstcf+L+fCzA6/fBbFxEJorIJhHJFZFZAZZPEJEVIlIhIlMCLG8rIjtE5Olg4pnI8R/G27qWjIkNtXUxzXB/HtcT5UQkDpgDnAvsALJEZIGqrvdbLR9n4L+7a9jMr4B/H098Ez7WtWRMbArmiXKXi0gb9/3PReRtERkVxLbHAbmqukVVjwIZwGT/FVR1m6quAXwB4qYBKcB7QcQyEVLVtdTcupaMiTmiqrWvILJGVUe4z4T4NfAoMFtVx9fRbgowUVVvcqenAeNVdWaAdecCC1V1vjvdDPgIuBb4DjCmhnYzgBkAKSkpaRkZGXXsbs2Ki4tJSko67vZNKYZ/nA/yynllw1GuHxbPmT1beBLDS+H+fTX2GOGKEysxwhWnITHS09OzVXVMwIWqWusLWOn+/B1wtf+8OtpNAZ71m54GPF3DunOBKX7TM4Gfuu+n19TO/5WWlqYNkZmZ2aD2TSlGVZy8rw/r4J//U6c997n6fD5PYngtnL+vWIgRrjixEiNccRoSA1iuNXyuBjNYX4GIPINzLuFhd/jvYMZwKgD8O6R7uvOCcQpwhoj8EEgC4kWkWFW/daLbRIZPrWvJmFgXTIG4ApgI/F5V94tIN+CeINplAQNFpC9OYZgKXB1MUqp6TdV7EZmO08VkxSGKfJRfwdIt+3j4MrshzphYVeeRgKqWqOrbqvqlO71TVes8cayqFThdRYuBDcAbqpojIg+KyCQAERkrIjuAy4FnRCSnITtjwiO/sIQ3vjjKhEGduWKMXbVkTKwK5gjiuKnqImBRtXmz/d5n4XQ91baNuTjnKEwUKK/0cff81cQJ1rVkTIwL5lyCMYB7Sev8NSzbuo9rh8Rb15IxMc7TIwgTO1SVBxeu528rC7jn/BM4UXZEOiVjjMfsCMIE5Y8f5TL3023cdHpffnhW/0inY4wJAysQpk4vf7aNx9//gstG9+T/Lhxi5x2MaSKsQJhaLVj9FbMX5PCdISk8fNlwmjWz4uWt2GoAABG+SURBVGBMU2EFwtRoyaY9/OT1VYzr04Gnrx5F8zj772JMU2J/8Sag7Lx93PJKNid0bcNfrxtDQou4SKdkjAkzKxDmWzbuOsj1L2TRrV0iL94wjrYJoR2EzxjTOFiBMN+QX1jC959bRqv45rx84zg6JbWMdErGmAix+yDMMXsOlTLt+c85WunjzZtPoWdyq0inZIyJIDuCMAAcOFLOdc9nsfdQGS9MH8vAlDaRTskYE2FWIAxHjlZy04tZ5O45xDPT0hjVOznSKRljooB1MTVx5ZU+bpu3guV5RTx91WjOGNg50ikZY6KEHUE0YVWD7320cQ+//u4wLhrRLdIpGWOiiBWIJqr64HvXjE+NdErGmChjBaKJetodfO9GG3zPGFMDKxBN0MtL83jMHXzvPht8zxhTAysQTcyC1V8x+511NvieMaZOViCakKrB98ba4HvGmCB4+gkhIhNFZJOI5IrIrADLJ4jIChGpEJEpfvNT3fmrRCRHRG7xMs+mIDuviFtfWcEJXdvwrA2+Z4wJgmf3QYhIHDAHOBfYAWSJyAJVXe+3Wj4wHbi7WvOdwCmqWiYiScA6t+1XXuUbyzbtOsQNc7Po2i7BBt8zxgTNyxvlxgG5qroFQEQygMnAsQKhqtvcZT7/hqp61G+yJdYVdty27yth2nOfk9CiGS/dYIPvGWOCJ6rqzYadLqOJqnqTOz0NGK+qMwOsOxdYqKrz/eb1At4FBgD3qOqcAO1mADMAUlJS0jIyMo473+LiYpKSko67fTTGOFCm/ObzIxwuV/5vXCI92oSuzsbi76uxx7F9ib4Y4YrTkBjp6enZqjom4EJV9eQFTAGe9
ZueBjxdw7pzgSk1LOsOLANSaouXlpamDZGZmdmg9tEWY3/JUZ34xL91yC/+qSvy9nkWx0uxEiNccWxfoi9GuOI0JAawXGv4XPWy66YA6OU33dOdVy/qnHdYB5wRorxinv/ge//vWht8zxhzfLwsEFnAQBHpKyLxwFRgQTANRaSniCS675OB04FNnmUaQyp8ykx38L0/XDmSCYNs8D1jzPHxrECoagUwE1gMbADeUNUcEXlQRCYBiMhYEdkBXA48IyI5bvMhwOcishr4GPi9qq71KtdYUVZRyfPrjvKhO/jexSO6RzolY0wj5ulw36q6CFhUbd5sv/dZOF1P1du9D4zwMrdYsPdQGdl5RazILyI7r4i1Ow5wtNJng+8ZY0LCngfRSFT6lC92H3IKQl4R2flF5BWWABAf14zhPdtx/Wl9aH24wAbfM8aEhBWIKHWotJxV2/eTneccHazK38+hsgoAOiW1ZExqMteOT2V0ajLDerSlZXPnzuglS3bb4HvGmJCwAhEFVJXt+46Qnb+P7Lwilm8rYtPuQ6iCCJyQ0obJo7qTlppMWu8O9OqQaEXAGOM5KxARUFZRybqCg6zIK2J53j6y8/bzdXEZAEktmzOqd3smDutKWmoyI3u1p40NjWGMiQArEGGyruAAGRuP8scNnx47mQyQ2rEVEwZ2YnRqMmmpyQxKaUOcDcFtjIkCViDC4J1VBdzz5hp8Ph8n9Ybpp/VhdO9kRqe2p0ubhEinZ4wxAVmB8JCq8uSHX/LEB18yrm8Hvt+3lIvPOzXSaRljTFBslFSPlJZXcufrq3jigy+5bHRPXrlxPEnx1nVkjGk87AjCA4XFZcx4OZvsvCLuOf8EfnhWf7vqyBjT6FiBCLHcPYe4fm4Wew6WMefq0Vw0olukUzLGmONiBSKEPsn9mlteyaZl8zgyZpxso6gaYxo1KxAh8tqyfH7x93X075zEc9PH0DO5VaRTMsaYBrEC0UA+n/LwvzbyzL+3cOagzjx99Si7sc0YExOsQDRAydEK7sxYxXvrd/P9U1KZffFQmsfZhWHGmNhgBeI47T5Yyo0vZrH+q4Pcf8lQpp/WN9IpGWNMSFmBOA45Xx3gxrnLOVRazrPXjeHswSmRTskYY0LOCkQ9fbB+Nz/KWEm7xBa8ecupDO3eNtIpGWOMJ6xABElVef6Tbfz63fUM696O564bQ5e2No6SMSZ2WYEIQkWlj/v/kcMrS/M5/8QU/nDlSFrF26/OGBPbPL3kRkQmisgmEckVkVkBlk8QkRUiUiEiU/zmjxSRz0QkR0TWiMiVXuZZm4Ol5Vw/N4tXluZz85n9+PM1aVYcjDFNgmefdCISB8wBzgV2AFkiskBV1/utlg9MB+6u1rwE+L6qfiki3YFsEVmsqvu9yjeQ7ftKuGFuFlu/PszDlw3nyrG9wxneGGMiysuvwuOAXFXdAiAiGcBk4FiBUNVt7jKff0NV/cLv/VcisgfoDIStQKzIL2LGS8s5WuHjpRvGceqATuEKbYwxUUFU1ZsNO11GE1X1Jnd6GjBeVWcGWHcusFBV5wdYNg54EThRVX3Vls0AZgCkpKSkZWRkHHe+xcXFJCUlAfD5zgr+uraMDgnCnaMT6J4Ump44/xheCUeMcMWJlRjhimP7En0xwhWnITHS09OzVXVMwIWq6skLmAI86zc9DXi6hnXnAlMCzO8GbAJOriteWlqaNkRmZqb6fD7944dfaOrPFuqUP3+ihcVlDdpmoBheC0eMcMWJlRjhimP7En0xwhWnITGA5VrD56qXXUwFQC+/6Z7uvKCISFvgXeA+VV0a4ty+pdyn3PXmat5eUcClo3rw0GXDadk8zuuwxhgTtbwsEFnAQBHpi1MYpgJXB9NQROKBvwEvaYBup1ArOnyUR7NK+aKogJ+cO4jbzx5gD/gxxjR5nl3mqqoVwExgMbABeENVc0TkQRGZBCAiY0VkB3A58IyI5LjNrwAmANNFZJX7GulFnjuKSrj0T
5+w5YCPp64axY/OGWjFwRhj8PhGOVVdBCyqNm+23/ssnK6n6u1eAV7xMrcqHVu3pH/nJK4d4GPSSd3DEdIYYxqFJj82dWJ8HM9NH8uAZDvfYIwx/pp8gTDGGBOYFQhjjDEBWYEwxhgTkBUIY4wxAVmBMMYYE5AVCGOMMQFZgTDGGBOQFQhjjDEBeTbcd7iJyF4grwGb6AR8HaJ0Yj1GuOLESoxwxbF9ib4Y4YrTkBipqto50IKYKRANJSLLtaYx0S1GROLESoxwxbF9ib4Y4YrjVQzrYjLGGBOQFQhjjDEBWYH4n79YjKiLEysxwhXH9iX6YoQrjicx7ByEMcaYgOwIwhhjTEBWIIwxxgTUpAuEiPQSkUwRWS8iOSJyh0dxEkRkmYisduM84EUcN1aciKwUkYUebX+biKx1HwO73IsYbpz2IjJfRDaKyAYROSXE2z/B73G2q0TkoIjcGcoYbpwfu//m60TkNRFJCHUMN84dboycUO2HiDwvIntEZJ3fvA4i8r6IfOn+TPYozuXuvvhEpMGXb9YQ41H3/9caEfmbiLT3IMav3O2vEpH3RKTBj60MFMdv2V0ioiLSqaFxAFDVJvsCugGj3fdtgC+AoR7EESDJfd8C+Bw42aN9+gkwD1jo0fa3AZ3C8G/zInCT+z4eaO9hrDhgF84NQ6Hcbg9gK5DoTr8BTPcg/2HAOqAVzmOEPwAGhGC7E4DRwDq/eY8As9z3s4CHPYozBDgBWAKM8SjGeUBz9/3DDd2XGmK09Xv/I+D/ebEv7vxewGKcG4ZD8jfapI8gVHWnqq5w3x8CNuD8UYc6jqpqsTvZwn2F/OoAEekJXAQ8G+pth5OItMP5I3gOQFWPqup+D0OeA2xW1YbciV+T5kCiiDTH+QD/yoMYQ4DPVbVEVSuAj4HvNXSjqvpvYF+12ZNxijfuz+96EUdVN6jqpoZuu44Y77m/L4ClQE8PYhz0m2xNCP7ua/h3AfgD8NNQxKjSpAuEPxHpA4zC+XbvxfbjRGQVsAd4X1W9iPMEzn8QnwfbrqLAeyKSLSIzPIrRF9gLvOB2lz0rIq09igUwFXgt1BtV1QLg90A+sBM4oKrvhToOztHDGSLSUURaARfifJv0Qoqq7nTf7wJSPIoTbjcA//RiwyLyGxHZDlwDzPYoxmSgQFVXh3K7ViAAEUkC3gLurFbxQ0ZVK1V1JM63lHEiMiyU2xeRi4E9qpodyu0GcLqqjgYuAG4TkQkexGiOcwj9Z1UdBRzG6c4IORGJByYBb3qw7WScb9x9ge5AaxG5NtRxVHUDThfJe8C/gFVAZajjBIireHAkHG4ich9QAbzqxfZV9T5V7eVuf2aot+9+Kfg/PCg+Tb5AiEgLnOLwqqq+7XU8t6skE5gY4k2fBkwSkW1ABnC2iLwS4hhV34pR1T3A34BxoY4B7AB2+B1lzccpGF64AFihqrs92PZ3gK2quldVy4G3gVM9iIOqPqeqaao6ASjCOZ/mhd0i0g3A/bnHozhhISLTgYuBa9yC56VXgcs82G5/nC8hq92//57AChHp2tANN+kCISKC08+9QVUf9zBO56orJEQkETgX2BjKGKp6r6r2VNU+OF0mH6lqSL+tikhrEWlT9R7nJN+3rqRoKFXdBWwXkRPcWecA60Mdx3UVHnQvufKBk0Wklft/7Ryc81whJyJd3J+9cc4/zPMiDrAAuM59fx3wjkdxPCciE3G6ZCepaolHMQb6TU4mxH/3AKq6VlW7qGof9+9/B87FN7tCsfEm+wJOxzlEXoNzWL4KuNCDOCOAlW6cdcBsj/frLDy4ignoB6x2XznAfR7uw0hgufs7+zuQ7EGM1kAh0M7D/XgA50NhHfAy0NKjOP/BKaKrgXNCtM3XcM6dlLsfOjcCHYEPgS9xrpbq4FGcS933ZcBuYLEHMXKB7X5/+w26wqiGGG+5//ZrgH8APbz4fVVbvo0QXcVkQ20YY4wJqEl3MRljjKmZFQhjjDEBWYEwx
hgTkBUIY4wxAVmBMMYYE5AVCNNouKNUPuY3fbeI3B+ibc8VkSmh2FYdcS53R6fN9DIvEekjIlfXP0Nj/scKhGlMyoDvhWwo4xBxB+IL1o3AD1Q13at8XH2AehWIeu6HaQKsQJjGpALn2bs/rr6g+jdtESl2f54lIh+LyDsiskVEHhKRa8R5PsdaEenvt5nviMhyEfnCHduqapDFR0Ukyx3X/2a/7f5HRBYQ4C5vEbnK3f46EXnYnTcb5+bM50Tk0QBtfua2WS0iDwVYvq2qOIrIGBFZ4r4/U/73XIuV7t3uD+EM4LdKnGdSBLUf7t3y77o5rBORK4P5hzGxyb4xmMZmDrBGRB6pR5uTcIbE3gdsAZ5V1XHiPCDqdqDqATt9cMaW6g9kisgA4Ps4o7COFZGWwCciUjUi62hgmKpu9Q8mzkNhHgbScMZFek9EvquqD4rI2cDdqrq8WpsLcIZiGK+qJSLSoR77dzdwm6p+4g48WYozuOHdqlpV6GYEsx8ichnwlape5LZrV488TIyxIwjTqKgz2u5LOA9fCVaWOs/+KAM244x6CrAWpyhUeUNVfar6JU4hGYwz3tT3xRmq/XOcoSaqxtdZVr04uMYCS9QZpK9qlNC6Rr39DvCCumMCqWqg8f5r8gnwuIj8COfBShUB1gl2P9YC54rIwyJyhqoeqEceJsZYgTCN0RM4ffn+z4iowP3/LCLNcJ5CV6XM773Pb9rHN4+iq487ozhPA7xdVUe6r776v2c6HG7QXtTfsX0Ejj26VFUfAm4CEnGODAYHaBvUfqjqFzhHFGuBX7vdYqaJsgJhGh332/UbOEWiyjacLh1wnu/Q4jg2fbmINHPPS/QDNuE8wvFWd1h4RGSQ1P3womXAmSLSSUTicEaM/biONu8D14sztj81dDFt43/7eGzYaBHpr86Ing8DWThHPodwHqNbJaj9cLvHSlT1FeBRvBtm3TQCdg7CNFaP8c2Hr/wVeEdEVuM8NOd4vt3n43y4twVuUdVSEXkWpxtqhTtk917qeMymqu4UkVk4z/0Q4F1VrXVYbFX9l4iMBJaLyFFgEc5DYPw9gHOC+1c4z2qucqeIpOMcEeXgPBnNB1S6v4+5wJNB7sdw4FER8eGMFnprbXmb2GajuRpjjAnIupiMMcYEZAXCGGNMQFYgjDHGBGQFwhhjTEBWIIwxxgRkBcIYY0xAViCMMcYE9P8BePc/ZJG+fksAAAAASUVORK5CYII=\n" 389 | }, 390 | "metadata": { 391 | "needs_background": "light" 392 | } 393 | } 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "source": [ 399 | "# get the value of K for which silhouette score is highest\n", 400 | "print(np.argmax(sil_coef_digits)+2)" 401 | ], 402 | "metadata": { 403 | "colab": { 404 | "base_uri": "https://localhost:8080/" 405 | }, 406 | "id": "_L92U019z8XA", 407 | "outputId": "8c462436-9fb4-48ef-8f9d-36e808de4aba" 408 | }, 409 | "execution_count": 12, 410 | "outputs": [ 411 | { 412 | "output_type": "stream", 413 | "name": "stdout", 414 | "text": [ 415 | "9\n" 416 | ] 417 | } 418 | ] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "source": [ 423 | "This graph points the fact that n=9 this could mean some 
digits like 1 and 7 bear a striking similarity" 424 | ], 425 | "metadata": { 426 | "id": "ZX7SlAg80NnK" 427 | } 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "source": [ 432 | "## [Lecture 11.2: Hierarchical Agglomerative Clustering (HAC)](https://www.youtube.com/watch?v=GG6yYMO91FQ)\n", 433 | "Clustering is concerned with grouping objects with *similar attributes* or *characteristics*. The objects in the same cluster are closer to one another than to the objects from the other clusters.\n", 434 | "\n", 435 | "\n", 436 | "\n", 437 | "In the image above, the clusters with the same color share similar properties (feature values are represented on the axes). For instance, if the x-axis represents weight and the y-axis represents height, then the red cluster represents people with low BMI. Similar interpretations can be drawn for the remaining clusters.\n", 438 | "\n", 439 | "Here we will discuss another clustering algorithm, the **hierarchical agglomerative clustering (HAC)** algorithm.\n", 440 | "- Hierarchical clustering starts by considering each datum as its own cluster and then combines the closest clusters to form larger clusters. This is the bottom-up approach.\n", 441 | "- There is an alternate approach, the top-down approach, where the entire data is one large single cluster, which is divided into smaller clusters in each step.\n", 442 | "\n", 443 | "The merging and splitting decisions are influenced by certain conditions that will be discussed shortly." 444 | ], 445 | "metadata": { 446 | "id": "exX3J-wB5C0Q" 447 | } 448 | }, 449 | { 450 | "cell_type": "markdown", 451 | "source": [ 452 | "### Metric\n", 453 | "Certain metrics are used for calculating similarity between clusters. Note that a metric is a generalization of the concept of distance. Metrics satisfy certain properties, such as:\n", 454 | "1. non-negativity\n", 455 | "2. symmetry\n", 456 | "3. the triangle inequality\n", 457 | "\n", 458 | "Some of the popular metric functions are:\n", 459 | "1. 
**Euclidean**:\n", 460 | "$$d(\\mathbf{x}^{(i)}, \\mathbf{x}^{(j)}) = \\sqrt{\\sum_{l=1}^m (\\mathbf{x}_l^{(i)} - \\mathbf{x}_l^{(j)})^2}$$\n", 461 | "2. **Manhattan**:\n", 462 | "$$d(\\mathbf{x}^{(i)}, \\mathbf{x}^{(j)}) = \\sum_{l=1}^m |(\\mathbf{x}_l^{(i)} - \\mathbf{x}_l^{(j)})|$$\n", 463 | "3. **Cosine distance**:\n", 464 | "$$d(\\mathbf{x}^{(i)}, \\mathbf{x}^{(j)}) = 1 - \\frac{\\mathbf{x}^{(i)} \\cdot \\mathbf{x}^{(j)}}{||\\mathbf{x}^{(i)}|| \\times ||\\mathbf{x}^{(j)}||} = 1 - \\cos(\\theta)$$" 465 | ], 466 | "metadata": { 467 | "id": "Q1GdOSNyHBU4" 468 | } 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "source": [ 473 | "### Linkage\n", 474 | "Linkage is a strategy for aggregating clusters.\n", 475 | "\n", 476 | "There are four linkages we will study:\n", 477 | "- Single linkage\n", 478 | "- Average linkage\n", 479 | "- Complete linkage\n", 480 | "- Ward's linkage\n", 481 | "\n", 482 | "The single linkage criterion merges clusters based on the shortest distance over all possible pairs. 
That is\n", 483 | "$$d(\\{\\mathbf{x}_{r_1}^{(i)}\\}_{i=1}^{|r_1|}, \\{\\mathbf{x}_{r_2}^{(j)}\\}_{j=1}^{|r_2|}) = \\min_{i,j} d(\\mathbf{x}_{r_1}^{(i)}, \\mathbf{x}_{r_2}^{(j)}) $$\n", 484 | "\n", 485 | "\n", 486 | "\n", 487 | "The complete linkage merges clusters to minimize the maximum distance between the clusters (in other words, the distance between the furthest elements).\n", 488 | "\n", 489 | "$$d(\\{\\mathbf{x}_{r_1}^{(i)}\\}_{i=1}^{|r_1|}, \\{\\mathbf{x}_{r_2}^{(j)}\\}_{j=1}^{|r_2|}) = \\max_{i,j} d(\\mathbf{x}_{r_1}^{(i)}, \\mathbf{x}_{r_2}^{(j)})$$\n", 490 | "\n", 491 | "\n", 492 | "\n", 493 | "The average linkage criterion uses the average distance over all possible pairs between the groups for merging clusters.\n", 494 | "\n", 495 | "$$d(\\{\\mathbf{x}_{r_1}^{(i)}\\}_{i=1}^{|r_1|}, \\{\\mathbf{x}_{r_2}^{(j)}\\}_{j=1}^{|r_2|}) = \\frac{1}{|r_1||r_2|} \\sum_{i=1}^{|r_1|}\\sum_{j=1}^{|r_2|} d(\\mathbf{x}_{r_1}^{(i)}, \\mathbf{x}_{r_2}^{(j)}) $$\n", 496 | "\n", 497 | "\n", 498 | "\n", 499 | "Ward's linkage merges clusters so as to minimize the sum of squared distances within the resulting clusters:\n", 500 | "$$d(\\{\\mathbf{x}_{r_1}^{(i)}\\}_{i=1}^{|r_1|}, \\{\\mathbf{x}_{r_2}^{(j)}\\}_{j=1}^{|r_2|}) = \\sum_{i=1}^{|r_1|}\\sum_{j=1}^{|r_2|} ||\\mathbf{x}_{r_1}^{(i)} - \\mathbf{x}_{r_2}^{(j)}||^2 $$\n" 501 | ], 502 | "metadata": { 503 | "id": "8Ax_oFePQG2W" 504 | } 505 | }, 506 | { 507 | "cell_type": "markdown", 508 | "source": [ 509 | "### Hierarchical Agglomerative Clustering\n", 510 | "\n", 511 | "Algorithm:\n", 512 | "1. Calculate the distance matrix between all pairs of clusters (initially, each object is its own cluster)\n", 513 | "2. 
Repeat until all the objects are clustered into one:\n", 514 | " - Detect the two closest groups (clusters) and merge them\n" 515 | ], 516 | "metadata": { 517 | "id": "zOW9btr21KEK" 518 | } 519 | }, 520 | { 521 | "cell_type": "markdown", 522 | "source": [ 523 | "### Dendrograms \n", 524 | "Dendrograms are graphical representations of the agglomerative process that show how aggregation happens at each level. Let's take the example of a toy dataset to understand this.\n", 525 | "\n", 526 | "Example:\n", 527 | "\n", 528 | "| $x_1$ | $x_2$ |\n", 529 | "| --- | --- |\n", 530 | "| 8 | 3 |\n", 531 | "| 5 | 3 |\n", 532 | "| 6 | 4 |\n", 533 | "| 1 | 6 |\n", 534 | "| 2 | 8 |" 535 | ], 536 | "metadata": { 537 | "id": "jz_DOow_1m5H" 538 | } 539 | }, 540 | { 541 | "cell_type": "code", 542 | "source": [ 543 | "import numpy as np\n", 544 | "import matplotlib.pyplot as plt\n", 545 | "from sklearn.preprocessing import normalize" 546 | ], 547 | "metadata": { 548 | "id": "s5TgF9aM0MOd" 549 | }, 550 | "execution_count": 13, 551 | "outputs": [] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "source": [ 556 | "X = np.array([(8, 3), (5, 3), (6, 4), (1, 6), (2, 8)])\n", 557 | "scaled_X = normalize(X)\n", 558 | "plt.scatter(X[:, 0], X[:, 1])\n", 559 | "plt.show()" 560 | ], 561 | "metadata": { 562 | "colab": { 563 | "base_uri": "https://localhost:8080/", 564 | "height": 265 565 | }, 566 | "id": "E1PwpJlz23_N", 567 | "outputId": "c8255f3d-aa78-4462-dcee-33e4312433ab" 568 | }, 569 | "execution_count": 14, 570 | "outputs": [ 571 | { 572 | "output_type": "display_data", 573 | "data": { 574 | "text/plain": [ 575 | "
" 576 | ], 577 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWoAAAD4CAYAAADFAawfAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAANrUlEQVR4nO3db2zcB33H8fd3djouGWC0WqhxpqWPLE0tqyurYxRVWkPxOqoSVXtQpO4BbMoTxFomGZFH1fZg1WQ0sUdIUTrGRFcEwc2D/ak7iSKGxMqcuOCuxQ+AttQp5KrJlFa3YcJ3D3wuTeo/5+TO9z3n/ZKiJudfzh9VzjuX3/3OF5mJJKmuX+v3AEnS1gy1JBVnqCWpOEMtScUZakkqbrgXd3rttdfm4cOHe3HXkrQnnTlz5pXMHN3oYz0J9eHDh5mfn+/FXUvSnhQRL2z2MU99SFJxhlqSijPUklScoZak4gy1JBXX0VUfEfFJ4M+ABBaBj2bm//ZyWGWnF5aZmVvi3EqLgyMNpqfGOTox1u9ZkvaobR9RR8QY8OfAZGbeAAwB9/Z6WFWnF5Y5PrvI8kqLBJZXWhyfXeT0wnK/p0naozo99TEMNCJiGNgPnOvdpNpm5pZorV646LbW6gVm5pb6tEjSXrdtqDNzGfgM8CLwMvDTzHzi0uMi4lhEzEfEfLPZ7P7SIs6ttHZ0uyRdqU5OfbwL+DBwPXAQOBAR9116XGaeyMzJzJwcHd3wVZB7wsGRxo5ul6Qr1cmpjw8AP8zMZmauArPA+3o7q67pqXEa+4Yuuq2xb4jpqfE+LZK013Vy1ceLwHsjYj/QAo4AV+038li/usOrPiTtlm1DnZlPRcQp4CzwC2ABONHrYZUdnRgzzJJ2TUfXUWfmg8CDPd4iSdqAr0yUpOIMtSQVZ6glqThDLUnFGWpJKs5QS1JxhlqSijPUklScoZak4gy1JBVnqCWpOEMtScUZakkqzlBLUnGGWpKKM9SSVJyhlqTiDLUkFWeoJak4Qy1JxRlqSSrOUEtScYZakooz1JJUnKGWpOIMtSQVZ6glqThDLUnFGWpJKs5QS1JxhlqSits21BExHhFPv+nHqxHxwG6MkyTB8HYHZOYScBNARAwBy8BjPd4lSWrb6amPI8D3M/OFXoyRJL3VTkN9L/DoRh+IiGMRMR8R881m88qXSZKAHYQ6Iq4B7ga+stHHM/NEZk5m5uTo6Gi39knSVW8nj6jvBM5m5k96NUaS9FY7CfVH2OS0hySpdzoKdUQcAO4AZns7R5J0qW0vzwPIzNeB3+zxFknSBnxloiQVZ6glqThDLUnFGWpJKs5QS1JxhlqSijPUklScoZak4gy1JBVnqCWpOEMtScUZakkqzlBLUnGGWpKKM9SSVJyhlqTiDLUkFWeoJak4Qy1JxRlqSSrOUEtScYZakooz1JJUnKGWpOIMtSQVZ6glqThDLUnFGWpJKs5QS1JxhlqSihvu5KCIGAFOAjcACXwsM7/VzSGnF5aZmVvi3EqLgyMNpqfGOTox1s1PIUkDqaNQA38HPJ6ZfxwR1wD7uzni9MIyx2cXaa1eAGB5pcXx2UUAYy3pqrftqY+IeCdwG/AwQGb+PDNXujliZm7pjUiva61eYGZuqZufRpIGUifnqK8HmsDnI2IhIk5GxIFLD4qIYxExHxHzzWZzRyPOrbR2dLskXU06CfUwcDPwucycAF4HPn3pQZl5IjMnM3NydHR0RyMOjjR2dLskXU06CfVLwEuZ+VT716dYC3fXTE+N09g3dNFtjX1DTE+Nd/PTSNJA2jbUmflj4EcRsV7NI8Cz3RxxdGKMh+65kbGRBgGMjTR46J4bfSJRkuj8qo9PAI+0r/j4AfDRbg85OjFmmCVpAx2FOjOfBiZ7vEWStAFfmShJxRlqSSrOUEtScYZakooz1JJUnKGWpOIMtSQVZ6glq
ThDLUnFGWpJKs5QS1JxhlqSijPUklScoZak4gy1JBVnqCWpOEMtScUZakkqzlBLUnGGWpKKM9SSVJyhlqTiDLUkFWeoJak4Qy1JxRlqSSrOUEtScYZakooz1JJUnKGWpOIMtSQVN9zJQRHxPPAz4ALwi8yc7OUoSdKvdBTqtj/IzFd6tkSStCFPfUhScZ2GOoEnIuJMRBzb6ICIOBYR8xEx32w2u7dQkq5ynYb6/Zl5M3An8PGIuO3SAzLzRGZOZubk6OhoV0dK0tWso1Bn5nL7v+eBx4BbejlKkvQr24Y6Ig5ExNvXfw58EHim18MkSWs6uerj3cBjEbF+/D9l5uM9XSVJesO2oc7MHwC/uwtbJEkb8PI8SSrOUEtScYZakooz1JJUnKGWpOIMtSQVZ6glqThDLUnFGWpJKs5QS1JxhlqSijPUklScoZak4gy1JBVnqCWpOEMtScUZakkqzlBLUnGGWpKKM9SSVJyhlqTiDLUkFWeoJak4Qy1JxRlqSSrOUEtScYZakooz1JJUnKGWpOIMtSQVN9zpgRExBMwDy5l5V+8mSeqF0wvLzMwtcW6lxcGRBtNT4xydGOv3LHWg41AD9wPPAe/o0RZJPXJ6YZnjs4u0Vi8AsLzS4vjsIoCxHgAdnfqIiEPAh4CTvZ0jqRdm5pbeiPS61uoFZuaW+rRIO9HpOerPAp8CfrnZARFxLCLmI2K+2Wx2ZZyk7ji30trR7apl21BHxF3A+cw8s9VxmXkiMyczc3J0dLRrAyVduYMjjR3drlo6eUR9K3B3RDwPfAm4PSK+2NNVkrpqemqcxr6hi25r7Btiemq8T4u0E9uGOjOPZ+ahzDwM3At8LTPv6/kySV1zdGKMh+65kbGRBgGMjTR46J4bfSJxQOzkqg9JA+zoxJhhHlA7CnVmfh34ek+WSJI25CsTJak4Qy1JxRlqSSrOUEtScYZakooz1JJUnKGWpOIMtSQVZ6glqThDLUnFGWpJKs5QS1JxhlqSijPUklScoZak4gy1JBVnqCWpOEMtScUZakkqzlBLUnGGWpKKM9SSVJyhlqTiDLUkFWeoJak4Qy1JxRlqSSrOUEtScYZakooz1JJUnKGWpOKGtzsgIt4GfAP49fbxpzLzwV4PkwbB6YVlZuaWOLfS4uBIg+mpcY5OjPV7lnZZr78Otg018H/A7Zn5WkTsA74ZEf+Wmf/ZtRXSADq9sMzx2UVaqxcAWF5pcXx2EcBYX0V24+tg21Mfuea19i/3tX9kVz67NMBm5pbe+MO5rrV6gZm5pT4tUj/sxtdBR+eoI2IoIp4GzgP/nplPbXDMsYiYj4j5ZrPZtYFSVedWWju6XXvTbnwddBTqzLyQmTcBh4BbIuKGDY45kZmTmTk5OjratYFSVQdHGju6XXvTbnwd7Oiqj8xcAZ4E/rBrC6QBNT01TmPf0EW3NfYNMT013qdF6ofd+Dro5KqPUWA1M1ciogHcAfxN1xZIA2r9iSKv+ri67cbXQWRu/bxgRLwH+AIwxNoj8C9n5l9t9XsmJydzfn6+ayMlaa+LiDOZObnRx7Z9RJ2Z3wUmur5KktQRX5koScUZakkqzlBLUnGGWpKK2/aqj8u604gm8MJl/vZrgVe6OKeXBmkrDNbeQdoKg7V3kLbCYO29kq2/nZkbvlqwJ6G+EhExv9klKtUM0lYYrL2DtBUGa+8gbYXB2turrZ76kKTiDLUkFVcx1Cf6PWAHBmkrDNbeQdoKg7V3kLbCYO3tydZy56glSRer+IhakvQmhlqSiisT6oj4+4g4HxHP9HvLdiLityLiyYh4NiL+OyLu7/emzUTE2yLi2xHxnfbWv+z3pk6031VoISL+ud9bthIRz0fEYkQ8HRHlv2VkRIxExKmI+F5EPBcRv9/vTRuJiPH2/9P1H69GxAP93rWViPhk+8/YMxHxaPuNwbtz31XOUUfEbcBrwD9m5lveQaaSiLgOuC4zz0bE24EzwNHMfLbP094iIgI48
OY3Jwbur/7mxBHxF8Ak8I7MvKvfezYTEc8Dk5k5EC/IiIgvAP+RmScj4hpgf/sNQcqKiCFgGfi9zLzcF9L1VESMsfZn63cysxURXwb+NTP/oRv3X+YRdWZ+A/iffu/oRGa+nJln2z//GfAcUPK7xQ/imxNHxCHgQ8DJfm/ZSyLincBtwMMAmfnz6pFuOwJ8v2qk32QYaETEMLAfONetOy4T6kEVEYdZ+37db3nD3yo6eXPiYj4LfAr4Zb+HdCCBJyLiTEQc6/eYbVwPNIHPt08rnYyIA/0e1YF7gUf7PWIrmbkMfAZ4EXgZ+GlmPtGt+zfUVyAifgP4KvBAZr7a7z2b6eTNiauIiLuA85l5pt9bOvT+zLwZuBP4ePsUXlXDwM3A5zJzAngd+HR/J22tfXrmbuAr/d6ylYh4F/Bh1v4yPAgciIj7unX/hvoytc/3fhV4JDNn+72nEwPy5sS3Ane3z/1+Cbg9Ir7Y30mbaz+SIjPPA48Bt/R30ZZeAl5607+oTrEW7sruBM5m5k/6PWQbHwB+mJnNzFwFZoH3devODfVlaD9B9zDwXGb+bb/3bCUiRiNipP3z9Tcn/l5/V20uM49n5qHMPMzaP3m/lplde2TSTRFxoP1kMu1TCB8Eyl61lJk/Bn4UEetvj30EKPcE+CU+QvHTHm0vAu+NiP3tPhxh7bmrrigT6oh4FPgWMB4RL0XEn/Z70xZuBf6EtUd765cP/VG/R23iOuDJiPgu8F+snaMufcnbAHk38M2I+A7wbeBfMvPxPm/azieAR9pfDzcBf93nPZtq/+V3B2uPTktr/yvlFHAWWGStrV17OXmZy/MkSRsr84hakrQxQy1JxRlqSSrOUEtScYZakooz1JJUnKGWpOL+H8N3uVec8v9QAAAAAElFTkSuQmCC\n" 578 | }, 579 | "metadata": { 580 | "needs_background": "light" 581 | } 582 | } 583 | ] 584 | }, 585 | { 586 | "cell_type": "markdown", 587 | "source": [ 588 | "Let's plot the dendrogram with `scipy.cluster.hierarchy` library" 589 | ], 590 | "metadata": { 591 | "id": "Y5REenHR3Shb" 592 | } 593 | }, 594 | { 595 | "cell_type": "code", 596 | "source": [ 597 | "import scipy.cluster.hierarchy as shc\n", 598 | "plt.figure(figsize=(8, 8))\n", 599 | "plt.title('Dendrogram')\n", 600 | "dend = shc.dendrogram(shc.linkage(scaled_X, method=\"ward\"))" 601 | ], 602 | "metadata": { 603 | "colab": { 604 | "base_uri": "https://localhost:8080/", 605 | "height": 502 606 | }, 607 | "id": "Ogbv_h9o3NAe", 608 | "outputId": "99b4e1a5-487f-48c7-fd45-08d8946a2b03" 609 | }, 610 | "execution_count": 17, 611 | "outputs": [ 612 | { 613 | "output_type": "display_data", 614 | "data": { 615 | "text/plain": [ 616 | "
" 617 | ], 618 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAeMAAAHlCAYAAADGLpQlAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAXIElEQVR4nO3df7Dld13f8dfbrOAPCDRkRU2yBDEEV6NAt6i1DrRqCWklziiaCFYjGKumv3Qc8Vek0dZqHZ3qBDEtJNQfIFDUbVlkqqLWH0EW+WUCG7cRyYZEQoLhdyD23T/OiXNdNrkn7Nl9Z88+HjN3cr/f7+d+zzt3Nnne7/ecPbe6OwDAnE+aHgAATnZiDADDxBgAhokxAAwTYwAYJsYAMEyMYYNU1bdU1R9MzwHcP2IMx1hVvaOqPlxV76+qv66qP6qqf1lV/vsDkogxHC9f3d0PTfKoJP8pyfcleeHxHKCqdjyQzwcnMzGG46i77+zuvUm+Ick3V9UXVNWDq+qnquqdVfVXVfWCqvrUJKmqp1TVoar6nqp6d1XdUlWX3HO+qnpEVe2tqvdV1Z8keczWx6uqrqrvqqo/T/Lny33fVlUHq+qO5dd+9pb1/7SqDlTVnVX1/Kr6vap6zvLYt1TVH1bVz1TV7UmeV1WPqarfqarbq+o9VfXLVfXwLed7R1V9b1W9pao+WFUvrKpHVtWrl3cKfquq/t4x/JbDCUGMYUB3/0mSQ0m+PIsr5ccmeXySz01yRpLLtyz/zCQPW+5/dpIrtwTsyiQfSfJZSb51+XG4r0nyxUl2V9U/SfLjSb5++TV/meSlSVJVpyd5RZLvT/KIJAeS/MPDzvXFSW5M8sgk/yFJLc/32Uk+L8lZSZ532Nd8bZKvWv47fnWSVyf5gSQ7s/h/0L++128UnCTEGOa8K8lpSS5N8u+6+47ufn+S/5jkoi3rPpbkiu7+WHfvS/KBJOdW1SlZhO7y7v5gd/9Zkhcf4XF+fHnuDyd5ZpIXdfefdvddWYT3S6vq7CQXJLmuu1/Z3Xcn+dkktx4+c3f/XHff3d0f7u6D3f2/u/uu7r4tyU8nefJhX/Nz3f1X3X1zkv+T5HXd/cbu/kiSX0vyhE/gewcbxXM+MOeMLP4b/LQkb6iqe/ZXklO2rLt9Gcd7fCjJQ7K4styR5KYtx/7yCI+z9fhnJ/nTeza6+wPLW85nLI/dtOVYV9Wh+zhXquqRSf5LFlf4D83iB/z3HvY1f7Xl8w8fYfshR5gZTiqujGFAVf2DLAL461kE6fO7++HLj4d19yqBui3J3VncGr7HriOs2/qr2d6VxYvI7pnj07O4JX1zkluSnLnlWG3dPsK5ksVVfCc5r7tPTfKsLH6YAO4HMYbjqKpOrap/nsXztL/U3W9O8l+T/ExVfcZyzRlV9dTtztXdf5PklVm8kOrTqmp3km/e5stekuSSqnp8VT04i5i+rrvfkeRVSc6rqq9ZvlL6u7J4vvq+PDSL2+Z3VtUZSb53u7mBjyfGcHz8z6p6fxa3eX8wi+dW73lV9PclOZjk2qp6X5LfSnLuiue9LIvbvLcmuSbJ1fe1uLt/K8kPJ/kfWVwJPybL56e7+z1JnpHkJ5PcnmR3kv1J7rqPU/77JE9McmcWMX/linMDW1T34XedAJLlm5IcSvLM7n7t9DywyVwZA3+rqp5aVQ9f3sL+gSye/712eCzYeGIMbPWlSf5vkvdk8XeCv2b5V6KAY8htagAY5soYAIaJMQAMG3sHrtNPP73PPvvsqYcHgOPuDW94w3u6e+fh+8difPbZZ2f//v1TDw8Ax11VHekta92mBoBpYgwAw8QYAIaJMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw8QYAIaJMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw3ZMD8Cx8Suve2d+4003T48BD2gXPv6Mf
OMX75oeA1wZb6rfeNPNuf6W902PAQ9Y19/yPj+w8oDhyniD7f6sU/Or3/6l02PAA9I3/MIfT48Af8uVMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw8QYAIaJMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw8QYAIaJMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw8QYAIaJMQAME2MAGLZtjKvqRVX17qr6s3s5/syqektVvbWq/qiqvmj9YwLA5lrlyviaJOffx/G/SPLk7j4vyY8muWoNcwHASWPHdgu6+/er6uz7OP5HWzavTXLm0Y8FACePdT9n/Owkr17zOQFgo217ZbyqqvrHWcT4H93HmkuTXJoku3btWtdDA8AJbS1XxlX1hUn+W5ILu/v2e1vX3Vd1957u3rNz5851PDQAnPCOOsZVtSvJK5N8U3ffcPQjAcDJZdvb1FX1kiRPSXJ6VR1K8iNJPjlJuvsFSS5P8ogkz6+qJLm7u/ccq4EBYNOs8mrqi7c5/pwkz1nbRABwkvEOXAAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADNs2xlX1oqp6d1X92b0cr6r62ao6WFVvqaonrn9MANhcq1wZX5Pk/Ps4/rQk5yw/Lk3y80c/FgCcPLaNcXf/fpI77mPJhUn+ey9cm+ThVfVZ6xoQADbdOp4zPiPJTVu2Dy33AQArOK4v4KqqS6tqf1Xtv+22247nQwPAA9Y6YnxzkrO2bJ+53Pdxuvuq7t7T3Xt27ty5hocGgBPfOmK8N8m/WL6q+kuS3Nndt6zhvABwUtix3YKqekmSpyQ5vaoOJfmRJJ+cJN39giT7klyQ5GCSDyW55FgNCwCbaNsYd/fF2xzvJN+1tokA4CTjHbgAYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABi2Uoyr6vyqOlBVB6vquUc4vquqXltVb6yqt1TVBesfFQA207YxrqpTklyZ5GlJdie5uKp2H7bsh5K8rLufkOSiJM9f96AAsKlWuTJ+UpKD3X1jd380yUuTXHjYmk5y6vLzhyV51/pGBIDNtkqMz0hy05btQ8t9Wz0vybOq6lCSfUn+1ZFOVFWXVtX+qtp/2223fQLjAsDmWdcLuC5Ock13n5nkgiS/WFUfd+7uvqq793T3np07d67poQHgxLZKjG9OctaW7TOX+7Z6dpKXJUl3/3GST0ly+joGBIBNt0qMX5/knKp6dFU9KIsXaO09bM07k3xFklTV52URY/ehAWAF28a4u+9OclmS1yR5Wxavmr6uqq6oqqcvl31Pkm+rqjcneUmSb+nuPlZDA8Am2bHKou7el8ULs7buu3zL59cn+bL1jgYAJwfvwAUAw8QYAIaJMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw8QYAIaJMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw8QYA
IaJMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw8QYAIaJMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw8QYAIaJMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw8QYAIaJMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw8QYAIaJMQAME2MAGCbGADBMjAFg2Eoxrqrzq+pAVR2squfey5qvr6rrq+q6qvqV9Y4JAJtrx3YLquqUJFcm+aokh5K8vqr2dvf1W9ack+T7k3xZd7+3qj7jWA0MAJtmlSvjJyU52N03dvdHk7w0yYWHrfm2JFd293uTpLvfvd4xAWBzrRLjM5LctGX70HLfVo9N8tiq+sOquraqzl/XgACw6ba9TX0/znNOkqckOTPJ71fVed3911sXVdWlSS5Nkl27dq3poQHgxLbKlfHNSc7asn3mct9Wh5Ls7e6PdfdfJLkhizj/Hd19VXfv6e49O3fu/ERnBoCNskqMX5/knKp6dFU9KMlFSfYetubXs7gqTlWdnsVt6xvXOCcAbKxtY9zddye5LMlrkrwtycu6+7qquqKqnr5c9pokt1fV9Ulem+R7u/v2YzU0AGySlZ4z7u59SfYdtu/yLZ93ku9efgAA94N34AKAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYNhKMa6q86vqQFUdrKrn3se6r62qrqo96xsRADbbtjGuqlOSXJnkaUl2J7m4qnYfYd1Dk/ybJK9b95AAsMlWuTJ+UpKD3X1jd380yUuTXHiEdT+a5CeSfGSN8wHAxlslxmckuWnL9qHlvr9VVU9MclZ3v2qNswHASeGoX8BVVZ+U5KeTfM8Kay+tqv1Vtf+222472ocGgI2wSoxvTnLWlu0zl/vu8dAkX5Dkd6vqHUm+JMneI72Iq7uv6u493b1n586dn/jUALBBVonx65OcU1WPrqoHJbkoyd57Dnb3nd19enef3d1nJ7k2ydO7e/8xmRgANsy2Me7uu5NcluQ1Sd6W5GXdfV1VXVFVTz/WAwLAptuxyqLu3pdk32H7Lr+XtU85+rEA4OThHbgAYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACG7ZgeADh+Xn7Dy7Pvxn3TYzwgHLjjyUmSS37zquFJHhgu+JwL8ozHPmN6jJOWGMNJZN+N+3LgjgM597Rzp0cZ94Qn/N70CA8YB+44kCRiPEiM4SRz7mnn5urzr54egweQS37zkukRTnqeMwaAYWIMAMPEGACGiTEADBNjABgmxgAwbKUYV9X5VXWgqg5W1XOPcPy7q+r6qnpLVf12VT1q/aMCwGbaNsZVdUqSK5M8LcnuJBdX1e7Dlr0xyZ7u/sIkr0jyk+seFAA21SpXxk9KcrC7b+zujyZ5aZILty7o7td294eWm9cmOXO9YwLA5lolxmckuWnL9qHlvnvz7CSvPpqhAOBksta3w6yqZyXZk+TJ93L80iSXJsmuXbvW+dAAcMJa5cr45iRnbdk+c7nv76iqr
0zyg0me3t13HelE3X1Vd+/p7j07d+78ROYFgI2zSoxfn+Scqnp0VT0oyUVJ9m5dUFVPSPILWYT43esfEwA217Yx7u67k1yW5DVJ3pbkZd19XVVdUVVPXy77z0kekuTlVfWmqtp7L6cDAA6z0nPG3b0vyb7D9l2+5fOvXPNcAHDS8A5cADBMjAFgmBgDwDAxBoBhYgwAw8QYAIaJMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw8QYAIaJMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw8QYAIaJMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw8QYAIaJMQAME2MAGLZjeoATxv6rk7e+YnqK1d164eKfV//Y7Bz313lfl+y5ZHoKgONKjFf11lckt741+czzpidZya/u+o3pEe6/W9+6+KcYAycZMb4/PvO85JJXTU+xua7+Z9MTAIzwnDEADBNjABgmxgAwTIwBYJgYA8AwMQaAYWIMAMPEGACGedMPgDV7+Q0vz74b902PsbK33/H2JMklv3livfvdBZ9zQZ7x2GdMj7EWrowB1mzfjfty4I4D02Os7HGnPS6PO+1x02PcLwfuOHBC/cCzHVfGAMfAuaedm6vPv3p6jI11ol3Fb8eVMQAME2MAGCbGADBMjAFgmBgDwDAxBoBhYgwAw8QYAIaJMQAMWynGVXV+VR2oqoNV9dwjHH9wVf3q8vjrqursdQ8KAJtq2xhX1SlJrkzytCS7k1xcVbsPW/bsJO/t7s9N8jNJfmLdgwLAplrlyvhJSQ52943d/dEkL01y4WFrLkzy4uXnr0jyFVVV6xsTADbXKjE+I8lNW7YPLfcdcU13353kziSPWMeAALDpjutvbaqqS5Ncutz8QFWdOL9j7B7f6oL/mPM9PuauyTXTI5wUfJ+PvRPwe/yoI+1cJcY3Jzlry/aZy31HWnOoqnYkeViS2w8/UXdfleSqVaYFgJPFKrepX5/knKp6dFU9KMlFSfYetmZvkm9efv51SX6nu3t9YwLA5tr2yri7766qy5K8JskpSV7U3ddV1RVJ9nf33iQvTPKLVXUwyR1ZBBsAWEG5gAWAWd6BCwCGiTEADBNjABgmxiuoql+qqluq6n1VdUNVPWd6pk1VVedU1Ueq6pemZ9k0VXVaVf1aVX2wqv6yqr5xeqZNU1WXVdX+qrqrqq6ZnmcTLX8XwguXf4bfX1VvqqqnTc91tI7rm36cwH48ybO7+66qelyS362qN3b3G6YH20BXZvHX6Vi/K5N8NMkjkzw+yauq6s3dfd3sWBvlXUl+LMlTk3zq8CybakcW7/j45CTvTHJBkpdV1Xnd/Y7JwY6GK+MVdPd13X3XPZvLj8cMjrSRquqiJH+d5LenZ9k0VfXpSb42yQ939we6+w+yeH+Ab5qdbLN09yu7+9dzhDc9Yj26+4Pd/bzufkd3/7/u/l9J/iLJ35+e7WiI8Yqq6vlV9aEkb09yS5J9wyNtlKo6NckVSb57epYN9dgkd3f3DVv2vTnJ5w/NA2tRVY/M4s/3CX2HR4xX1N3fmeShSb48ySuT3HXfX8H99KNJXtjdh6YH2VAPSfK+w/bdmcWfaTghVdUnJ/nlJC/u7rdPz3M0xPh+6O6/Wd7eOzPJd0zPsymq6vFJvjKL34XNsfGBJKcetu/UJO8fmAWOWlV9UpJfzOJ1EJcNj3PUvIDrE7MjnjNep6ckOTvJO5e/BvshSU6pqt3d/cTBuTbJDUl2VNU53f3ny31flBP81h4np1r8j+KFWbwY8YLu/tjwSEfNlfE2quozquqiqnpIVZ1SVU9NcnG8yGidrsrih5vHLz9ekORVWbwilTXo7g9m8fTKFVX16VX1ZUkuzOLKgjWpqh1V9SlZvI//KVX1KcvfZMd6/XySz0vy1d394elh1kGMt9dZ3JI+lOS9SX4qyb9d/oIM1qC7P9Tdt97zkcUt1Y90923Ts22Y78zir9u8O8lLknyHv
9a0dj+U5MNJnpvkWcvPf2h0og1TVY9K8u1Z/OB+a1V9YPnxzOHRjopfFAEAw1wZA8AwMQaAYWIMAMPEGACGiTEADBNjABgmxgAwTIwBYJgYA8Cw/w+ur6eq07qr5gAAAABJRU5ErkJggg==\n" 619 | }, 620 | "metadata": { 621 | "needs_background": "light" 622 | } 623 | } 624 | ] 625 | }, 626 | { 627 | "cell_type": "markdown", 628 | "source": [ 629 | "HAC is implemented in `sklearn.cluster` module as `AgglomerativeClustering` class" 630 | ], 631 | "metadata": { 632 | "id": "hl87PtAx36qb" 633 | } 634 | } 635 | ] 636 | } --------------------------------------------------------------------------------