├── .gitignore ├── K-means.ipynb ├── LICENSE ├── README.md ├── color-reduction.gif ├── image.jpg ├── images ├── k-means.gif ├── k1.jpg ├── k10.jpg ├── k128.jpg ├── k16.jpg ├── k2.jpg ├── k256.jpg ├── k3.jpg ├── k32.jpg ├── k4.jpg ├── k5.jpg ├── k6.jpg ├── k64.jpg ├── k7.jpg ├── k8.jpg └── k9.jpg ├── k-means.py └── make_gif.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope 
project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 tugot17 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # K-Means-Algorithm-From-Scratch 2 | The K-Means algorithm, written from scratch using the Python programming language. The main Jupyter notebook shows how to write k-means from scratch and shows an example application - reducing the number of colors. 
3 | 4 | 5 | drawing 6 | 7 | ## Getting Started 8 | 9 | The main file is [K-means.ipynb](K-means.ipynb) 10 | 11 | The code itself, without comments, can be found in the [k-means.py](k-means.py) file 12 | 13 | ## Image 14 | 15 | Image used as example for color reduction was downloaded from [here](https://www.nationalgeographic.com/photography/photo-of-the-day/2012/1/elephants-queen-elizabeth-park-sartore/) 16 | 17 | drawing 18 | 19 | 20 | ### Prerequisites 21 | ``` 22 | -numpy 23 | -sklearn 24 | -matplotlib 25 | -cv2 26 | ``` 27 | 28 | 29 | ## Authors 30 | * [tugot17](https://github.com/tugot17) 31 | 32 | ## License 33 | 34 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details 35 | 36 | 37 | -------------------------------------------------------------------------------- /color-reduction.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tugot17/K-Means-Algorithm-From-Scratch/0cb9a2ec8a764a0b51a121178dea23d84f98b7cf/color-reduction.gif -------------------------------------------------------------------------------- /image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tugot17/K-Means-Algorithm-From-Scratch/0cb9a2ec8a764a0b51a121178dea23d84f98b7cf/image.jpg -------------------------------------------------------------------------------- /images/k-means.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tugot17/K-Means-Algorithm-From-Scratch/0cb9a2ec8a764a0b51a121178dea23d84f98b7cf/images/k-means.gif -------------------------------------------------------------------------------- /images/k1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tugot17/K-Means-Algorithm-From-Scratch/0cb9a2ec8a764a0b51a121178dea23d84f98b7cf/images/k1.jpg 
-------------------------------------------------------------------------------- /images/k10.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tugot17/K-Means-Algorithm-From-Scratch/0cb9a2ec8a764a0b51a121178dea23d84f98b7cf/images/k10.jpg -------------------------------------------------------------------------------- /images/k128.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tugot17/K-Means-Algorithm-From-Scratch/0cb9a2ec8a764a0b51a121178dea23d84f98b7cf/images/k128.jpg -------------------------------------------------------------------------------- /images/k16.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tugot17/K-Means-Algorithm-From-Scratch/0cb9a2ec8a764a0b51a121178dea23d84f98b7cf/images/k16.jpg -------------------------------------------------------------------------------- /images/k2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tugot17/K-Means-Algorithm-From-Scratch/0cb9a2ec8a764a0b51a121178dea23d84f98b7cf/images/k2.jpg -------------------------------------------------------------------------------- /images/k256.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tugot17/K-Means-Algorithm-From-Scratch/0cb9a2ec8a764a0b51a121178dea23d84f98b7cf/images/k256.jpg -------------------------------------------------------------------------------- /images/k3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tugot17/K-Means-Algorithm-From-Scratch/0cb9a2ec8a764a0b51a121178dea23d84f98b7cf/images/k3.jpg -------------------------------------------------------------------------------- /images/k32.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tugot17/K-Means-Algorithm-From-Scratch/0cb9a2ec8a764a0b51a121178dea23d84f98b7cf/images/k32.jpg -------------------------------------------------------------------------------- /images/k4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tugot17/K-Means-Algorithm-From-Scratch/0cb9a2ec8a764a0b51a121178dea23d84f98b7cf/images/k4.jpg -------------------------------------------------------------------------------- /images/k5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tugot17/K-Means-Algorithm-From-Scratch/0cb9a2ec8a764a0b51a121178dea23d84f98b7cf/images/k5.jpg -------------------------------------------------------------------------------- /images/k6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tugot17/K-Means-Algorithm-From-Scratch/0cb9a2ec8a764a0b51a121178dea23d84f98b7cf/images/k6.jpg -------------------------------------------------------------------------------- /images/k64.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tugot17/K-Means-Algorithm-From-Scratch/0cb9a2ec8a764a0b51a121178dea23d84f98b7cf/images/k64.jpg -------------------------------------------------------------------------------- /images/k7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tugot17/K-Means-Algorithm-From-Scratch/0cb9a2ec8a764a0b51a121178dea23d84f98b7cf/images/k7.jpg -------------------------------------------------------------------------------- /images/k8.jpg: -------------------------------------------------------------------------------- 
# Repository-dump residue that shares these physical lines with the code:
#   /images/k8.jpg: https://raw.githubusercontent.com/tugot17/K-Means-Algorithm-From-Scratch/0cb9a2ec8a764a0b51a121178dea23d84f98b7cf/images/k8.jpg
#   /images/k9.jpg: https://raw.githubusercontent.com/tugot17/K-Means-Algorithm-From-Scratch/0cb9a2ec8a764a0b51a121178dea23d84f98b7cf/images/k9.jpg

# ---------------------------------------------------------------------------
# /k-means.py
# ---------------------------------------------------------------------------
import os
import random

import numpy as np

random.seed(7)
np.random.seed(7)


def get_initial_centroids(X, k):
    """
    Pick k distinct data points of X to be used as initial k-means centroids.

    The points are drawn (with the `random` module) from the set of *unique* rows of X,
    so the result never contains duplicates and the function cannot loop forever when
    the dataset holds many repeated points.

    Args:
        X (numpy.ndarray): dataset points array, size N:D
        k (int): number of centroids

    Returns:
        (numpy.ndarray): array of k unique initial centroids, size K:D, same dtype as X

    Raises:
        ValueError: if k is smaller than 1 or larger than the number of unique points in X
    """
    unique_points = np.unique(np.asarray(X), axis=0)
    number_of_unique_points = unique_points.shape[0]

    if not 1 <= k <= number_of_unique_points:
        raise ValueError(
            f"k must be between 1 and the number of unique points "
            f"({number_of_unique_points}), got {k}"
        )

    chosen_ids = random.sample(range(number_of_unique_points), k)

    return unique_points[chosen_ids]


def get_euclidean_distance(A_matrix, B_matrix):
    """
    Compute the euclidean distance between every point of A and every point of B.

    E.g. C[2, 15] is the distance between point 2 of A (A[2]) and point 15 of B (B[15]).

    Args:
        A_matrix (numpy.ndarray): Matrix size N1:D
        B_matrix (numpy.ndarray): Matrix size N2:D

    Returns:
        numpy.ndarray: float64 matrix size N1:N2
    """
    # Work in float64 so that narrow integer inputs (e.g. uint8 pixels) cannot overflow.
    A = np.asarray(A_matrix, dtype=np.float64)
    B = np.asarray(B_matrix, dtype=np.float64)

    A_square = np.sum(A * A, axis=1)[:, np.newaxis]
    B_square = np.sum(B * B, axis=1)[np.newaxis, :]

    # |a - b|^2 = |a|^2 + |b|^2 - 2 a.b, accumulated in place to keep a single N1:N2 buffer.
    squared_distance = A @ B.T
    squared_distance *= -2.0
    squared_distance += A_square
    squared_distance += B_square

    # Rounding can leave tiny negative values for (almost) identical points;
    # clamp them so that sqrt never returns NaN.
    np.maximum(squared_distance, 0.0, out=squared_distance)

    return np.sqrt(squared_distance, out=squared_distance)


def _group_points_by_cluster(X, centroids, distance_mesuring_method):
    """Split the rows of X into one array per centroid, each point going to its closest centroid."""
    number_of_clusters = np.asarray(centroids).shape[0]

    distance_matrix = distance_mesuring_method(X, centroids)
    closest_cluster_ids = np.argmin(distance_matrix, axis=1)

    # A stable sort keeps the points of every cluster in their original order.
    order = np.argsort(closest_cluster_ids, kind="stable")
    cluster_sizes = np.bincount(closest_cluster_ids, minlength=number_of_clusters)
    boundaries = np.cumsum(cluster_sizes)[:-1]

    return np.split(X[order], boundaries)


def get_clusters(X, centroids, distance_mesuring_method):
    """
    Assign each of the N points of X to its closest centroid.

    Args:
        X (numpy.ndarray): array of sample points, size N:D
        centroids (numpy.ndarray): array of centroids, size K:D
        distance_mesuring_method (function): function taking 2 matrices A (N1:D) and B (N2:D) and returning
        the distance between all points from matrix A and all points from matrix B, size N1:N2

    Returns:
        dict {cluster_number: list_of_points_in_cluster}; every cluster number 0..K-1 is present,
        clusters without points map to an empty list
    """
    groups = _group_points_by_cluster(np.asarray(X), centroids, distance_mesuring_method)

    return {cluster_id: list(points) for cluster_id, points in enumerate(groups)}


def has_centroids_covered(previous_centroids, new_centroids, distance_mesuring_method, movement_threshold_delta):
    """
    Check whether the centroids have converged, i.e. none of them moved more than movement_threshold_delta.

    Args:
        previous_centroids (numpy.ndarray): array of k old centroids, size K:D
        new_centroids (numpy.ndarray): array of k new centroids, size K:D
        distance_mesuring_method (function): function taking 2 matrices A (N1:D) and B (N2:D) and returning distance
        movement_threshold_delta (float): threshold value, if every centroid moved at most this far
        we assume that the algorithm converged

    Returns:
        bool: True if the centroids converged, False if not
    """
    # Unchanged centroids are a fixed point of the algorithm. Checking this exactly keeps a
    # threshold of 0 working even if the distance function leaves a rounding residue.
    if np.array_equal(previous_centroids, new_centroids):
        return True

    distances_between_old_and_new_centroids = distance_mesuring_method(previous_centroids, new_centroids)
    # Entry [i, i] is how far centroid i moved.
    centroid_movements = np.diagonal(distances_between_old_and_new_centroids)

    return bool(np.max(centroid_movements) <= movement_threshold_delta)


def perform_k_means_algorithm(X, k, distance_mesuring_method, movement_threshold_delta=0, max_iterations=None):
    """
    Perform the k-means algorithm on a given dataset and return the k centroids found.

    Args:
        X (numpy.ndarray): dataset points array, size N:D
        k (int): number of centroids
        distance_mesuring_method (function): function taking 2 matrices A (N1:D) and B (N2:D) and returning
        the distance between all points from matrix A and all points from matrix B, size N1:N2
        movement_threshold_delta (float): threshold value, if centroids move less we assume that
        the algorithm converged
        max_iterations (int or None): optional upper bound on the number of update steps; None (default)
        iterates until convergence. At least one step is always performed.

    Returns:
        (numpy.ndarray): array of k centroids, size K:D, same dtype as X

    Raises:
        ValueError: if X holds fewer than k unique points (see get_initial_centroids)
    """
    X = np.asarray(X)
    new_centroids = get_initial_centroids(X=X, k=k)

    iteration = 0
    while True:
        previous_centroids = new_centroids
        groups = _group_points_by_cluster(X, previous_centroids, distance_mesuring_method)

        # Start from the old centroids so a cluster that lost all its points keeps its
        # position instead of turning into NaN.
        new_centroids = previous_centroids.copy()
        for cluster_id, points in enumerate(groups):
            if len(points) > 0:
                # Accumulate in float64 (no integer overflow), then cast back to the data dtype.
                cluster_mean = np.mean(points, axis=0, dtype=np.float64)
                new_centroids[cluster_id] = cluster_mean.astype(new_centroids.dtype)

        iteration += 1

        if has_centroids_covered(previous_centroids, new_centroids, distance_mesuring_method,
                                 movement_threshold_delta):
            break
        if max_iterations is not None and iteration >= max_iterations:
            break

    return new_centroids


def get_reduced_colors_image(image, number_of_colors, movement_threshold_delta=4):
    """
    Return the given image with a reduced number of colors.

    Args:
        image (numpy.ndarray): original opencv image, size H:W:D (color) or H:W (grayscale)
        number_of_colors (integer): number of colors in the reduced image; values larger than the
        number of distinct colors in the image are clamped to that number
        movement_threshold_delta (float): convergence threshold passed to the k-means algorithm

    Returns:
        (numpy.ndarray): uint8 image with reduced number of colors, same shape as the input

    Raises:
        ValueError: if the image is neither 2- nor 3-dimensional, or number_of_colors is smaller than 1
    """
    image = np.asarray(image)
    if image.ndim not in (2, 3):
        raise ValueError(f"expected an H:W or H:W:D image, got an array with {image.ndim} dimensions")

    channels = image.shape[2] if image.ndim == 3 else 1
    X = image.reshape(-1, channels).astype(np.int32)

    if X.shape[0] == 0:
        # Nothing to cluster in an empty image.
        return image.astype(np.uint8)

    # An image cannot be reduced to more colors than it contains.
    number_of_unique_colors = np.unique(X, axis=0).shape[0]
    k = min(number_of_colors, number_of_unique_colors)

    centroids = perform_k_means_algorithm(X, k=k, distance_mesuring_method=get_euclidean_distance,
                                          movement_threshold_delta=movement_threshold_delta)
    distance_matrix = get_euclidean_distance(X, centroids)
    closest_cluster_ids = np.argmin(distance_matrix, axis=1)

    # Replace every pixel with the centroid (color) of its cluster.
    X_reconstructed = centroids[closest_cluster_ids].astype(np.uint8)

    return X_reconstructed.reshape(image.shape)


def main():
    """Write color-reduced versions of image.jpg to images/k<k>.jpg for a range of k values."""
    # Imported here so the clustering functions stay importable without OpenCV installed.
    import cv2

    k_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16, 32, 64, 128, 256]

    img = cv2.imread("image.jpg")
    if img is None:
        # cv2.imread signals a missing or unreadable file by returning None.
        raise FileNotFoundError("image.jpg could not be read")

    # cv2.imwrite fails silently when the target directory does not exist.
    os.makedirs("images", exist_ok=True)

    for k in k_values:
        reduced_colors_image = get_reduced_colors_image(img, k)

        cv2.imwrite(f"images/k{k}.jpg", reduced_colors_image)


if __name__ == '__main__':
    main()


# ---------------------------------------------------------------------------
# /make_gif.py
# ---------------------------------------------------------------------------
PATH_TO_GIF = 'color-reduction.gif'
# Must match the directory k-means.py writes to ("images/", lower case).
PATH_TO_IMAGES = "images/"


def make_gif(path_to_images=PATH_TO_IMAGES, path_to_gif=PATH_TO_GIF, duration=1.5):
    """
    Assemble all .jpg files of a directory, in natural filename order, into an animated GIF.

    Args:
        path_to_images (str): directory holding the frames (k1.jpg, k2.jpg, ...)
        path_to_gif (str): path of the GIF file to write
        duration (float): frame duration passed to imageio.mimsave
    """
    # Imported here so this module can be imported without the GIF-only dependencies.
    import imageio
    from natsort import natsort

    # Only the .jpg frames belong in the animation; other files in the directory
    # (e.g. images/k-means.gif) are skipped.
    filenames = [
        f for f in os.listdir(path_to_images)
        if os.path.isfile(os.path.join(path_to_images, f)) and f.lower().endswith(".jpg")
    ]

    filenames = natsort.natsorted(filenames, reverse=False)
    print(filenames)

    images = [imageio.imread(os.path.join(path_to_images, filename)) for filename in filenames]
    imageio.mimsave(path_to_gif, images, duration=duration)


if __name__ == '__main__':
    make_gif()