├── .gitignore ├── LICENSE ├── README.md ├── output-screenshots ├── 2-clusters.jpg ├── 3-clusters.jpg ├── 4-clusters.jpg └── 6-clusters.jpg └── src └── KMeans.java /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files # 14 | *.jar 15 | *.war 16 | *.ear 17 | *.zip 18 | *.tar.gz 19 | *.rar 20 | 21 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 22 | hs_err_pid* 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Adnan Ansari 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # k-means Clustering 2 | A Java program to cluster a dataset in CSV format using k-means clustering. 3 | 4 | # Input 5 | The user must provide the following inputs: 6 | * Name of the CSV dataset, without the `.csv` extension, which the program appends itself (make sure that the header line is removed) 7 | * The zero-based column indices of the X and Y attributes in the dataset 8 | * The number of clusters 9 | * The maximum number of iterations 10 | 11 | # Output 12 | An array of ArrayLists which can be passed to other graph libraries (not included) to visualize the output. 13 | 14 | The following is the output for 2, 3, 4 and 6 clusters for the [FIFA 17 dataset](https://www.kaggle.com/artimous/complete-fifa-2017-player-dataset-global) visualized using [Plotly](https://plot.ly/). The player's FIFA rating is plotted against his age. 15 | 16 | ![2 clusters](https://github.com/psyclone20/k-means-clustering/blob/master/output-screenshots/2-clusters.jpg "2 clusters") 17 | ![3 clusters](https://github.com/psyclone20/k-means-clustering/blob/master/output-screenshots/3-clusters.jpg "3 clusters") 18 | ![4 clusters](https://github.com/psyclone20/k-means-clustering/blob/master/output-screenshots/4-clusters.jpg "4 clusters") 19 | ![6 clusters](https://github.com/psyclone20/k-means-clustering/blob/master/output-screenshots/6-clusters.jpg "6 clusters") 20 | -------------------------------------------------------------------------------- /output-screenshots/2-clusters.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/psyclone20/k-means-clustering/e7e0f6e203357e5ae4c69368591ab39737c0b546/output-screenshots/2-clusters.jpg -------------------------------------------------------------------------------- /output-screenshots/3-clusters.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/psyclone20/k-means-clustering/e7e0f6e203357e5ae4c69368591ab39737c0b546/output-screenshots/3-clusters.jpg -------------------------------------------------------------------------------- /output-screenshots/4-clusters.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/psyclone20/k-means-clustering/e7e0f6e203357e5ae4c69368591ab39737c0b546/output-screenshots/4-clusters.jpg -------------------------------------------------------------------------------- /output-screenshots/6-clusters.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/psyclone20/k-means-clustering/e7e0f6e203357e5ae4c69368591ab39737c0b546/output-screenshots/6-clusters.jpg -------------------------------------------------------------------------------- /src/KMeans.java: -------------------------------------------------------------------------------- 1 | import java.io.BufferedReader; 2 | import java.io.FileReader; 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.Scanner; 6 | 7 | public class KMeans { 8 | public static void main(String args[]) throws IOException { 9 | Scanner sc = new Scanner(System.in); 10 | String filePath = ""; 11 | System.out.print("Enter the name of the CSV file: "); 12 | String fileName = sc.nextLine(); 13 | 14 | // Open the file just to count the number of records 15 | int records = getRecords(filePath, fileName); 16 | 17 | System.out.print("Enter the index of the X-attribute: "); 18 | int xAttribute = sc.nextInt(); 19 | System.out.print("Enter the index of the Y-attribute: "); 20 | int yAttribute = sc.nextInt(); 21 | 22 | // Open file again to read the records 23 | double[][] points = new double[records][2]; 24 | readRecords(filePath, fileName, points, xAttribute, yAttribute); 25 | 26 | // Sort the points based on X-coordinate values 27 | sortPointsByX(points); 28 | 29 | // 
Input the number of iterations 30 | System.out.print("Enter the maximum number of iterations: "); 31 | int maxIterations = sc.nextInt(); 32 | 33 | // Input number of clusters 34 | System.out.print("Enter the number of clusters to form: "); 35 | int clusters = sc.nextInt(); 36 | 37 | // Calculate initial means 38 | double[][] means = new double[clusters][2]; 39 | for(int i=0; i[] oldClusters = new ArrayList[clusters]; 46 | ArrayList[] newClusters = new ArrayList[clusters]; 47 | 48 | for(int i=0; i(); 50 | newClusters[i] = new ArrayList(); 51 | } 52 | 53 | // Make the initial clusters 54 | formClusters(oldClusters, means, points); 55 | int iterations = 0; 56 | 57 | // Showtime 58 | while(true) { 59 | updateMeans(oldClusters, means, points); 60 | formClusters(newClusters, means, points); 61 | 62 | iterations++; 63 | 64 | if(iterations > maxIterations || checkEquality(oldClusters, newClusters)) 65 | break; 66 | else 67 | resetClusters(oldClusters, newClusters); 68 | } 69 | 70 | // Display the output 71 | System.out.println("\nThe final clusters are:"); 72 | displayOutput(oldClusters, points); 73 | System.out.println("\nIterations taken = " + iterations); 74 | 75 | sc.close(); 76 | } 77 | 78 | /** Counts the lines of the file at filePath + fileName + ".csv" and returns that count. Every line returned by readLine() is counted, blank lines included. NOTE(review): the reader is closed only on the normal path; if readLine() throws, br.close() is never reached. */ static int getRecords(String filePath, String fileName) throws IOException { 79 | int records = 0; 80 | BufferedReader br = new BufferedReader(new FileReader(filePath + fileName + ".csv")); 81 | while (br.readLine() != null) 82 | records++; 83 | 84 | br.close(); 85 | return records; 86 | } 87 | 88 | /** Reads the file at filePath + fileName + ".csv" line by line and fills points in file order: field xAttribute of the comma-split line is parsed into points[i][0] and field yAttribute into points[i][1]. The attribute values are used directly as array indices into the split result. points must have at least one row per line in the file. NOTE(review): each line is split twice, and the reader is not closed if reading or parsing throws. */ static void readRecords(String filePath, String fileName, double[][] points, int xAttribute, int yAttribute) throws IOException { 89 | BufferedReader br = new BufferedReader(new FileReader(filePath + fileName + ".csv")); 90 | String line; 91 | int i = 0; 92 | while ((line = br.readLine()) != null) { 93 | points[i][0] = Double.parseDouble(line.split(",")[xAttribute]); 94 | points[i++][1] = Double.parseDouble(line.split(",")[yAttribute]); 95 | } 96 | 97 | br.close(); 98 | } 99 | 100 | 
static void sortPointsByX(double[][] points) { 101 | double[] temp; 102 | 103 | // Bubble Sort 104 | for(int i=0; i points[j][0]) { 107 | temp = points[j-1]; 108 | points[j-1] = points[j]; 109 | points[j] = temp; 110 | } 111 | } 112 | 113 | static void updateMeans(ArrayList[] clusterList, double[][] means, double[][] points) { 114 | double totalX = 0; 115 | double totalY = 0; 116 | for(int i=0; i[] clusterList, double[][] means, double[][] points) { 129 | double distance[] = new double[means.length]; 130 | double minDistance = 999999999; 131 | int minIndex = 0; 132 | 133 | for(int i=0; i[] oldClusters, ArrayList[] newClusters) { 147 | for(int i=0; i[] oldClusters, ArrayList[] newClusters) { 162 | for(int i=0; i[] clusterList, double[][] points) { 174 | for(int i=0; i