├── src ├── __init__.py └── metricas.py ├── .gitignore ├── img ├── 06_points.png ├── 01_head_iris.png ├── power_of_math.gif ├── 11_petal_kmeans.png ├── 03_distance_formula.png ├── 02_elbow_named_points.png ├── 04_elbow_green_a0a18.png ├── 07_distancia_calc_a1.png ├── 08_distancia_calc_a1.png ├── 05_elbow_distance_a0a1a18.png ├── 09_descriptive_stats_cluster.PNG └── 10_descriptive_stats_species.PNG ├── latex.tex ├── README.md └── kmeans.ipynb /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.ipynb_checkpoints 2 | *__pycache__/ -------------------------------------------------------------------------------- /img/06_points.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jtemporal/kmeans_e_cotovelo/HEAD/img/06_points.png -------------------------------------------------------------------------------- /img/01_head_iris.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jtemporal/kmeans_e_cotovelo/HEAD/img/01_head_iris.png -------------------------------------------------------------------------------- /img/power_of_math.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jtemporal/kmeans_e_cotovelo/HEAD/img/power_of_math.gif -------------------------------------------------------------------------------- /img/11_petal_kmeans.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jtemporal/kmeans_e_cotovelo/HEAD/img/11_petal_kmeans.png -------------------------------------------------------------------------------- /img/03_distance_formula.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jtemporal/kmeans_e_cotovelo/HEAD/img/03_distance_formula.png -------------------------------------------------------------------------------- /img/02_elbow_named_points.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jtemporal/kmeans_e_cotovelo/HEAD/img/02_elbow_named_points.png -------------------------------------------------------------------------------- /img/04_elbow_green_a0a18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jtemporal/kmeans_e_cotovelo/HEAD/img/04_elbow_green_a0a18.png -------------------------------------------------------------------------------- /img/07_distancia_calc_a1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jtemporal/kmeans_e_cotovelo/HEAD/img/07_distancia_calc_a1.png -------------------------------------------------------------------------------- /img/08_distancia_calc_a1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jtemporal/kmeans_e_cotovelo/HEAD/img/08_distancia_calc_a1.png -------------------------------------------------------------------------------- /img/05_elbow_distance_a0a1a18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jtemporal/kmeans_e_cotovelo/HEAD/img/05_elbow_distance_a0a1a18.png -------------------------------------------------------------------------------- /img/09_descriptive_stats_cluster.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jtemporal/kmeans_e_cotovelo/HEAD/img/09_descriptive_stats_cluster.PNG 
-------------------------------------------------------------------------------- /img/10_descriptive_stats_species.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jtemporal/kmeans_e_cotovelo/HEAD/img/10_descriptive_stats_species.PNG -------------------------------------------------------------------------------- /latex.tex: -------------------------------------------------------------------------------- 1 | % dado para calculo da distancia 2 | \begin{center} 3 | \text{pontos que definem a reta}\\ 4 | a_0 =(x_0,\ y_0)=(2,\ 152.34)\\ 5 | a_{18} =(x_1,\ y_1)=(20,\ 14.73)\\ 6 | \ \\ 7 | 8 | \text{nosso ponto de interesse}\\ 9 | a_1 =(x,\ y)=(3,\ 78.85)\\ 10 | \end{center} 11 | 12 | % data for the distance calc 13 | \begin{center} 14 | \text{points that define the line}\\ 15 | a_0 =(x_0,\ y_0)=(2,\ 152.34)\\ 16 | a_{18} =(x_1,\ y_1)=(20,\ 14.73)\\ 17 | \ \\ 18 | 19 | \text{our point of interest}\\ 20 | a_1 =(x,\ y)=(3,\ 78.85)\\ 21 | \end{center} 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Como definir o número de clusters para o seu KMeans 2 | 3 | Esse repositório contém todo o código de base para o blog post [_"Como definir o número de clusters para o seu KMeans"_](https://medium.com/pizzadedados/kmeans-e-metodo-do-cotovelo-94ded9fdf3a9). 
# NOTE(review): in this dump the physical line that holds src/metricas.py also
# carries the end of README.md (its "Organização deste repositório" section).
# That section is kept here, in English, so that no content is lost:
#
#   kmeans_e_cotovelo/
#   ├── html/                      -> the notebooks exported as HTML pages
#   ├── img/                       -> the images that appear in the blog post
#   ├── graficos_e_metricas.ipynb  -> notebook with the charts shown in the post
#   ├── kmeans.ipynb               -> notebook with every step to run KMeans
#   │                                 and the elbow method
#   └── src/                       -> the code of the functions written for the post
#
# ---------------------------------------------------------------------------
# /src/metricas.py
# ---------------------------------------------------------------------------


def optimal_number_of_clusters(wcss, min_clusters=2):
    """
    Pick the number of clusters at the "elbow" of a WCSS curve.

    Each value in `wcss` is treated as the point
    ``(min_clusters + position, value)``. A straight line is drawn between
    the first and the last point, and the point with the largest
    perpendicular distance to that line is taken as the elbow.

    Parameters
    ----------
    wcss : list
        within-cluster sums of squares, one per consecutive number of
        clusters, e.g. the list returned by `calculate_wcss()`
    min_clusters : int, optional
        number of clusters that produced ``wcss[0]`` (default 2, which
        matches the default of `calculate_wcss()`)

    Returns
    -------
    int : number of clusters at the elbow; on ties the smallest one

    Raises
    ------
    ValueError
        if `wcss` is empty
    """
    from math import sqrt

    n_points = len(wcss)
    if n_points == 0:
        raise ValueError("wcss must contain at least one value")
    if n_points == 1:
        # A single point defines no line (the denominator below would be 0);
        # the only candidate is the first cluster count.
        return min_clusters

    # End points of the reference line. The last x is derived from the input
    # length instead of being hard-coded to 20, so curves of any length are
    # measured against the line that really joins their first and last point.
    x1, y1 = min_clusters, wcss[0]
    x2, y2 = min_clusters + n_points - 1, wcss[n_points - 1]

    # Loop-invariant parts of the point-to-line distance formula.
    delta_x = x2 - x1
    delta_y = y2 - y1
    cross_term = x2 * y1 - y2 * x1
    denominator = sqrt(delta_y ** 2 + delta_x ** 2)

    distances = [
        abs(delta_y * x0 - delta_x * y0 + cross_term) / denominator
        for x0, y0 in enumerate(wcss, start=min_clusters)
    ]
    return distances.index(max(distances)) + min_clusters


def calculate_wcss(data, min_clusters=2, max_clusters=20, random_state=None):
    """
    Compute the within-cluster sum of squares (WCSS) for a range of
    cluster counts.

    One KMeans model is fitted for every number of clusters from
    `min_clusters` to `max_clusters` (both included) and its `inertia_`
    is collected. With the defaults this yields 19 values, for 2 to 20
    clusters.

    Parameters
    ----------
    data : DataFrame
        data set passed to the `.fit()` of KMeans
    min_clusters : int, optional
        smallest number of clusters to try (default 2)
    max_clusters : int, optional
        largest number of clusters to try, inclusive (default 20)
    random_state : int or None, optional
        forwarded to KMeans; pass an int to get reproducible values
        (default None)

    Returns
    -------
    wcss : list with one within-cluster sum of squares per cluster count

    Raises
    ------
    ValueError
        if `max_clusters` is smaller than `min_clusters`
    """
    # Imported lazily so that the module can be imported (and
    # `optimal_number_of_clusters` used) without scikit-learn installed.
    from sklearn.cluster import KMeans

    if max_clusters < min_clusters:
        raise ValueError("max_clusters must be >= min_clusters")

    return [
        KMeans(n_clusters=n, random_state=random_state).fit(data).inertia_
        for n in range(min_clusters, max_clusters + 1)
    ]
55 | -------------------------------------------------------------------------------- /kmeans.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Clusters pra todo mundo\n", 8 | "\n", 9 | "## Funções\n", 10 | "\n", 11 | "Importando as nossas funções que vimos no post." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from src.metricas import calculate_wcss, optimal_number_of_clusters" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/plain": [ 31 | "\u001b[0;31mSignature:\u001b[0m \u001b[0mcalculate_wcss\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 32 | "\u001b[0;31mDocstring:\u001b[0m\n", 33 | "Calcula a soma dos quadrados intra-clusters para 19\n", 34 | "quantidades de clusters, iniciando com o mínimo de 2 clusters\n", 35 | "\n", 36 | "Parametros\n", 37 | "----------\n", 38 | "data : DataFrame\n", 39 | " conjunto de dados para fazer o `.fit()` do KMeans\n", 40 | "\n", 41 | "\n", 42 | "Returns\n", 43 | "-------\n", 44 | "wcss : lista contendo os valores de soma de quadrados intra-cluster\n", 45 | "\u001b[0;31mFile:\u001b[0m ~/work/kmeans/src/metricas.py\n", 46 | "\u001b[0;31mType:\u001b[0m function\n" 47 | ] 48 | }, 49 | "metadata": {}, 50 | "output_type": "display_data" 51 | } 52 | ], 53 | "source": [ 54 | "?calculate_wcss" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/plain": [ 65 | "\u001b[0;31mSignature:\u001b[0m 
\u001b[0moptimal_number_of_clusters\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwcss\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 66 | "\u001b[0;31mDocstring:\u001b[0m\n", 67 | "Calcula a maior distância entre os pontos que marcam as \n", 68 | "somas dos quadrados intra-clusters para 19 calculadas \n", 69 | "com `calculate_wcss()`\n", 70 | "\n", 71 | "Parametros\n", 72 | "----------\n", 73 | "wcss : lista\n", 74 | " lista contendo os valores de soma de quadrados intra-cluster\n", 75 | "\n", 76 | "\n", 77 | "Returns\n", 78 | "-------\n", 79 | "int : número de clusters \n", 80 | "\u001b[0;31mFile:\u001b[0m ~/work/kmeans/src/metricas.py\n", 81 | "\u001b[0;31mType:\u001b[0m function\n" 82 | ] 83 | }, 84 | "metadata": {}, 85 | "output_type": "display_data" 86 | } 87 | ], 88 | "source": [ 89 | "?optimal_number_of_clusters" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "---\n", 97 | "\n", 98 | "## Calculando KMeans para Iris" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 4, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "import matplotlib.pyplot as plt\n", 108 | "import seaborn as sns\n", 109 | "\n", 110 | "from sklearn.cluster import KMeans\n", 111 | "from sklearn.preprocessing import LabelEncoder\n", 112 | "\n", 113 | "from src.metricas import calculate_wcss, optimal_number_of_clusters\n", 114 | "\n", 115 | "\n", 116 | "# preparando nossos dados\n", 117 | "iris = sns.load_dataset('iris')\n", 118 | "df = iris.drop('species', axis=1)\n", 119 | "\n", 120 | "# calculando a soma dos quadrados para as 19 quantidade de clusters\n", 121 | "sum_of_squares = calculate_wcss(df)\n", 122 | "\n", 123 | "# calculando a quantidade ótima de clusters\n", 124 | "n = optimal_number_of_clusters(sum_of_squares)\n", 125 | "\n", 126 | "# inicializando o kmeans para nossa quantidade ótima de clusters\n", 127 | "kmeans = KMeans(n_clusters=n)\n", 128 | "\n", 129 | 
"# predizendo nossos clusters\n", 130 | "iris['clusters'] = kmeans.fit_predict(df)\n", 131 | "\n", 132 | "# transformando as especies de iris em numeros para colorir o gráfico\n", 133 | "iris['species_encoded'] = LabelEncoder().fit_transform(iris['species'])" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "---\n", 141 | "\n", 142 | "## Gráfico de clusters olhando para comprimento e largura de pétalas" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 5, 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "data": { 152 | "image/png": "\n", 153 | "text/plain": [ 154 | "
" 155 | ] 156 | }, 157 | "metadata": { 158 | "needs_background": "light" 159 | }, 160 | "output_type": "display_data" 161 | } 162 | ], 163 | "source": [ 164 | "plt.figure(figsize=(15,5))\n", 165 | "\n", 166 | "plt.subplot(1, 2, 1)\n", 167 | "plt.title('Antes')\n", 168 | "plt.xlabel('comprimento da pétala')\n", 169 | "plt.ylabel('largura da pétala')\n", 170 | "plt.scatter(iris['petal_length'], iris['petal_width'], c=iris['species_encoded'])\n", 171 | "\n", 172 | "\n", 173 | "plt.subplot(1, 2, 2)\n", 174 | "plt.title('Depois')\n", 175 | "plt.xlabel('comprimento da pétala')\n", 176 | "plt.scatter(iris['petal_length'], iris['petal_width'], c=iris['clusters'])\n", 177 | "\n", 178 | "plt.show()" 179 | ] 180 | } 181 | ], 182 | "metadata": { 183 | "kernelspec": { 184 | "display_name": "Python 3", 185 | "language": "python", 186 | "name": "python3" 187 | }, 188 | "language_info": { 189 | "codemirror_mode": { 190 | "name": "ipython", 191 | "version": 3 192 | }, 193 | "file_extension": ".py", 194 | "mimetype": "text/x-python", 195 | "name": "python", 196 | "nbconvert_exporter": "python", 197 | "pygments_lexer": "ipython3", 198 | "version": "3.6.7" 199 | } 200 | }, 201 | "nbformat": 4, 202 | "nbformat_minor": 2 203 | } 204 | --------------------------------------------------------------------------------