├── heapq.py ├── .gitignore ├── LICENSE ├── cluster_output.txt ├── iris.dat ├── README.md └── hclust.py /heapq.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZwEin27/Hierarchical-Clustering/HEAD/heapq.py -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 ZwEin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the 
Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /cluster_output.txt: -------------------------------------------------------------------------------- 1 | 0.819168173599 2 | 0.862857142857 3 | [100, 102, 103, 104, 105, 107, 108, 109, 110, 111, 112, 115, 116, 117, 118, 120, 122, 124, 125, 128, 129, 130, 131, 132, 134, 135, 136, 137, 139, 140, 141, 143, 144, 145, 147, 148] 4 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49] 5 | [50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 101, 106, 113, 114, 119, 121, 123, 126, 127, 133, 138, 142, 146, 149] 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 77 110 147 29 | 30 | 31 | [50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 101, 103, 104, 
106, 108, 110, 111, 112, 113, 114, 115, 116, 119, 120, 121, 123, 124, 126, 127, 128, 132, 133, 134, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149] 32 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49] 33 | [100, 102, 105, 107, 109, 117, 118, 122, 125, 129, 130, 131, 135] 34 | [Finished in 0.4s] 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | [ 53 | [100, 102, 105, 107, 109, 110, 115, 117, 118, 122, 125, 129, 130, 131, 135, 136, 141, 145, 147, 148] 54 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49] 55 | [50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 101, 103, 104, 106, 108, 111, 112, 113, 114, 116, 119, 120, 121, 123, 124, 126, 127, 128, 132, 133, 134, 137, 138, 139, 140, 142, 143, 144, 146, 149] 56 | 57 | 58 | ] -------------------------------------------------------------------------------- /iris.dat: -------------------------------------------------------------------------------- 1 | 5.1,3.5,1.4,0.2,Iris-setosa 2 | 4.9,3.0,1.4,0.2,Iris-setosa 3 | 4.7,3.2,1.3,0.2,Iris-setosa 4 | 4.6,3.1,1.5,0.2,Iris-setosa 5 | 5.0,3.6,1.4,0.2,Iris-setosa 6 | 5.4,3.9,1.7,0.4,Iris-setosa 7 | 4.6,3.4,1.4,0.3,Iris-setosa 8 | 5.0,3.4,1.5,0.2,Iris-setosa 9 | 4.4,2.9,1.4,0.2,Iris-setosa 10 | 4.9,3.1,1.5,0.1,Iris-setosa 11 | 5.4,3.7,1.5,0.2,Iris-setosa 12 | 4.8,3.4,1.6,0.2,Iris-setosa 13 | 4.8,3.0,1.4,0.1,Iris-setosa 14 | 4.3,3.0,1.1,0.1,Iris-setosa 15 | 5.8,4.0,1.2,0.2,Iris-setosa 16 | 5.7,4.4,1.5,0.4,Iris-setosa 17 | 5.4,3.9,1.3,0.4,Iris-setosa 18 | 5.1,3.5,1.4,0.3,Iris-setosa 19 | 
5.7,3.8,1.7,0.3,Iris-setosa 20 | 5.1,3.8,1.5,0.3,Iris-setosa 21 | 5.4,3.4,1.7,0.2,Iris-setosa 22 | 5.1,3.7,1.5,0.4,Iris-setosa 23 | 4.6,3.6,1.0,0.2,Iris-setosa 24 | 5.1,3.3,1.7,0.5,Iris-setosa 25 | 4.8,3.4,1.9,0.2,Iris-setosa 26 | 5.0,3.0,1.6,0.2,Iris-setosa 27 | 5.0,3.4,1.6,0.4,Iris-setosa 28 | 5.2,3.5,1.5,0.2,Iris-setosa 29 | 5.2,3.4,1.4,0.2,Iris-setosa 30 | 4.7,3.2,1.6,0.2,Iris-setosa 31 | 4.8,3.1,1.6,0.2,Iris-setosa 32 | 5.4,3.4,1.5,0.4,Iris-setosa 33 | 5.2,4.1,1.5,0.1,Iris-setosa 34 | 5.5,4.2,1.4,0.2,Iris-setosa 35 | 4.9,3.1,1.5,0.1,Iris-setosa 36 | 5.0,3.2,1.2,0.2,Iris-setosa 37 | 5.5,3.5,1.3,0.2,Iris-setosa 38 | 4.9,3.1,1.5,0.1,Iris-setosa 39 | 4.4,3.0,1.3,0.2,Iris-setosa 40 | 5.1,3.4,1.5,0.2,Iris-setosa 41 | 5.0,3.5,1.3,0.3,Iris-setosa 42 | 4.5,2.3,1.3,0.3,Iris-setosa 43 | 4.4,3.2,1.3,0.2,Iris-setosa 44 | 5.0,3.5,1.6,0.6,Iris-setosa 45 | 5.1,3.8,1.9,0.4,Iris-setosa 46 | 4.8,3.0,1.4,0.3,Iris-setosa 47 | 5.1,3.8,1.6,0.2,Iris-setosa 48 | 4.6,3.2,1.4,0.2,Iris-setosa 49 | 5.3,3.7,1.5,0.2,Iris-setosa 50 | 5.0,3.3,1.4,0.2,Iris-setosa 51 | 7.0,3.2,4.7,1.4,Iris-versicolor 52 | 6.4,3.2,4.5,1.5,Iris-versicolor 53 | 6.9,3.1,4.9,1.5,Iris-versicolor 54 | 5.5,2.3,4.0,1.3,Iris-versicolor 55 | 6.5,2.8,4.6,1.5,Iris-versicolor 56 | 5.7,2.8,4.5,1.3,Iris-versicolor 57 | 6.3,3.3,4.7,1.6,Iris-versicolor 58 | 4.9,2.4,3.3,1.0,Iris-versicolor 59 | 6.6,2.9,4.6,1.3,Iris-versicolor 60 | 5.2,2.7,3.9,1.4,Iris-versicolor 61 | 5.0,2.0,3.5,1.0,Iris-versicolor 62 | 5.9,3.0,4.2,1.5,Iris-versicolor 63 | 6.0,2.2,4.0,1.0,Iris-versicolor 64 | 6.1,2.9,4.7,1.4,Iris-versicolor 65 | 5.6,2.9,3.6,1.3,Iris-versicolor 66 | 6.7,3.1,4.4,1.4,Iris-versicolor 67 | 5.6,3.0,4.5,1.5,Iris-versicolor 68 | 5.8,2.7,4.1,1.0,Iris-versicolor 69 | 6.2,2.2,4.5,1.5,Iris-versicolor 70 | 5.6,2.5,3.9,1.1,Iris-versicolor 71 | 5.9,3.2,4.8,1.8,Iris-versicolor 72 | 6.1,2.8,4.0,1.3,Iris-versicolor 73 | 6.3,2.5,4.9,1.5,Iris-versicolor 74 | 6.1,2.8,4.7,1.2,Iris-versicolor 75 | 6.4,2.9,4.3,1.3,Iris-versicolor 76 | 
6.6,3.0,4.4,1.4,Iris-versicolor 77 | 6.8,2.8,4.8,1.4,Iris-versicolor 78 | 6.7,3.0,5.0,1.7,Iris-versicolor 79 | 6.0,2.9,4.5,1.5,Iris-versicolor 80 | 5.7,2.6,3.5,1.0,Iris-versicolor 81 | 5.5,2.4,3.8,1.1,Iris-versicolor 82 | 5.5,2.4,3.7,1.0,Iris-versicolor 83 | 5.8,2.7,3.9,1.2,Iris-versicolor 84 | 6.0,2.7,5.1,1.6,Iris-versicolor 85 | 5.4,3.0,4.5,1.5,Iris-versicolor 86 | 6.0,3.4,4.5,1.6,Iris-versicolor 87 | 6.7,3.1,4.7,1.5,Iris-versicolor 88 | 6.3,2.3,4.4,1.3,Iris-versicolor 89 | 5.6,3.0,4.1,1.3,Iris-versicolor 90 | 5.5,2.5,4.0,1.3,Iris-versicolor 91 | 5.5,2.6,4.4,1.2,Iris-versicolor 92 | 6.1,3.0,4.6,1.4,Iris-versicolor 93 | 5.8,2.6,4.0,1.2,Iris-versicolor 94 | 5.0,2.3,3.3,1.0,Iris-versicolor 95 | 5.6,2.7,4.2,1.3,Iris-versicolor 96 | 5.7,3.0,4.2,1.2,Iris-versicolor 97 | 5.7,2.9,4.2,1.3,Iris-versicolor 98 | 6.2,2.9,4.3,1.3,Iris-versicolor 99 | 5.1,2.5,3.0,1.1,Iris-versicolor 100 | 5.7,2.8,4.1,1.3,Iris-versicolor 101 | 6.3,3.3,6.0,2.5,Iris-virginica 102 | 5.8,2.7,5.1,1.9,Iris-virginica 103 | 7.1,3.0,5.9,2.1,Iris-virginica 104 | 6.3,2.9,5.6,1.8,Iris-virginica 105 | 6.5,3.0,5.8,2.2,Iris-virginica 106 | 7.6,3.0,6.6,2.1,Iris-virginica 107 | 4.9,2.5,4.5,1.7,Iris-virginica 108 | 7.3,2.9,6.3,1.8,Iris-virginica 109 | 6.7,2.5,5.8,1.8,Iris-virginica 110 | 7.2,3.6,6.1,2.5,Iris-virginica 111 | 6.5,3.2,5.1,2.0,Iris-virginica 112 | 6.4,2.7,5.3,1.9,Iris-virginica 113 | 6.8,3.0,5.5,2.1,Iris-virginica 114 | 5.7,2.5,5.0,2.0,Iris-virginica 115 | 5.8,2.8,5.1,2.4,Iris-virginica 116 | 6.4,3.2,5.3,2.3,Iris-virginica 117 | 6.5,3.0,5.5,1.8,Iris-virginica 118 | 7.7,3.8,6.7,2.2,Iris-virginica 119 | 7.7,2.6,6.9,2.3,Iris-virginica 120 | 6.0,2.2,5.0,1.5,Iris-virginica 121 | 6.9,3.2,5.7,2.3,Iris-virginica 122 | 5.6,2.8,4.9,2.0,Iris-virginica 123 | 7.7,2.8,6.7,2.0,Iris-virginica 124 | 6.3,2.7,4.9,1.8,Iris-virginica 125 | 6.7,3.3,5.7,2.1,Iris-virginica 126 | 7.2,3.2,6.0,1.8,Iris-virginica 127 | 6.2,2.8,4.8,1.8,Iris-virginica 128 | 6.1,3.0,4.9,1.8,Iris-virginica 129 | 6.4,2.8,5.6,2.1,Iris-virginica 130 | 
7.2,3.0,5.8,1.6,Iris-virginica 131 | 7.4,2.8,6.1,1.9,Iris-virginica 132 | 7.9,3.8,6.4,2.0,Iris-virginica 133 | 6.4,2.8,5.6,2.2,Iris-virginica 134 | 6.3,2.8,5.1,1.5,Iris-virginica 135 | 6.1,2.6,5.6,1.4,Iris-virginica 136 | 7.7,3.0,6.1,2.3,Iris-virginica 137 | 6.3,3.4,5.6,2.4,Iris-virginica 138 | 6.4,3.1,5.5,1.8,Iris-virginica 139 | 6.0,3.0,4.8,1.8,Iris-virginica 140 | 6.9,3.1,5.4,2.1,Iris-virginica 141 | 6.7,3.1,5.6,2.4,Iris-virginica 142 | 6.9,3.1,5.1,2.3,Iris-virginica 143 | 5.8,2.7,5.1,1.9,Iris-virginica 144 | 6.8,3.2,5.9,2.3,Iris-virginica 145 | 6.7,3.3,5.7,2.5,Iris-virginica 146 | 6.7,3.0,5.2,2.3,Iris-virginica 147 | 6.3,2.5,5.0,1.9,Iris-virginica 148 | 6.5,3.0,5.2,2.0,Iris-virginica 149 | 6.2,3.4,5.4,2.3,Iris-virginica 150 | 5.9,3.0,5.1,1.8,Iris-virginica -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hierarchical-Clustering 2 | Hierarchical Clustering Python Implementation 3 | 4 | a hierarchical agglomerative clustering algorithm implementation. The algorithm starts by placing each data point in a cluster by itself and then repeatedly merges two clusters until some stopping condition is met. 5 | 6 | ## Clustering process 7 | 8 | Algorithm should stop the clustering process when all data points are placed in a single cluster. That is, the algorithm will perform n – 1 merging if there are n data points. 9 | 10 | Algorithm should also keep track of the merging process: for each merging step, remember which clusters are merged to produce which new cluster. Such information is similar to a dendrogram (e.g., shown below), except that dendrogram also remembers the distance of two clusters when they are merged. 11 | With the above information, this algorithm can then allow users to specify a desired number of clusters k, and returns k clusters based on the clustering result. 
For example, to produce two clusters, the algorithm can simply return the last two clusters that were merged during the process. 12 | 13 | ## Distance function 14 | 15 | Assuming that the input to the algorithm consists of a set of n- dimensional data points in the Euclidean space. The distance between two clusters is measured by the Euclidean distance of their centroids. 16 | 17 | ## Priority queue 18 | 19 | The algorithm should use a priority queue to store the distance between each pair of clusters and find the two clusters with minimum distance efficiently. You are provided with an implementation of a generic priority queue and required to use it in your algorithm. You are not supposed to modify the provided implementation. 20 | Recall that during the merging process, distance entries in the queue for the clusters that are merged need to be removed from the queue. However, it may be hard to keep track of entries in the queue that involves a particular cluster, since the queue is undergoing frequent changes. To address this, your algorithm should implement a lazy removal strategy: entries for merged clusters are not removed until they show up in the root of the queue. When such entries are found at the root during the ExactMin operation, they are simply ignored, and ExactMin continues until a pair of clusters that do not contain merged clusters is found at the root of queue, or the queue is exhausted. Note that in this strategy, you do not need to implement a remove function that removes an arbitrary node from the heap. 21 | 22 | ## Evaluation 23 | 24 | For the evaluation purpose, the data set input to this algorithm also contains cluster label for each data point. These cluster labels form a gold standard for clustering the data set to produce a specific number of clusters. Note that here we can not use the label in the clustering process. Instead, use them in evaluating the performance of your algorithm. 
The performance is measured by precision and recall of correct pairs. A pair of two data points x and y is correct if they belong to the same cluster according to the cluster label. Recall that precision and recall are discussed in LSH lectures. Precision is the percentage of pairs discovered by the algorithm that are correct, while recall is the percentage of correct pairs that are discovered by the algorithm. 25 | As an example, consider five data points: 1, 2, 3, 4, 5. Suppose there are 2 clusters: {1, 2, 3} and {4, 5} according to the algorithm, while different 2 clusters: {1, 2} and {3, 4, 5} according to the gold standard. In this case, the algorithm discovers four pairs: (1, 2), (1, 3), (2, 3), and (4, 5), while the gold standard has the pairs: (1, 2), (3, 4), (3, 5), and (4, 5). In this case, precision is 2/4 since only (1, 2) and (4, 5) discovered by the algorithm are correct, among the 4 discovered. Recall is also 2/4, since only 2 correct pairs were discovered among the total 4 in the gold standard. 26 | 27 | ## Input data format 28 | 29 | The data set contains a list of data points, one point per line. For each data point, it gives the value of the point at each dimension, followed by the cluster label of the point. A sample data point is provided for you to test your algorithm. The data contains 150 iris plants (https://archive.ics.uci.edu/ml/datasets/Iris). For each plant, it lists its sepal and petal length and width in centimeter, and also its type (e.g., setosa, see below). 30 | 31 | 5.1,3.5,1.4,0.2,Iris-setosa 32 | 4.9,3.0,1.4,0.2,Iris-setosa 33 | 4.7,3.2,1.3,0.2,Iris-setosa 34 | ... 35 | 36 | Assuming that input data sets that will be used to test your algorithm will have similar format, that is, one data point per line, which has value of point in each of n dimensions (where n >= 1), followed by a class label. 
37 | 38 | ## Input and output format 39 | 40 | This algorithm should take two arguments: a text file name for the input data, and a value k for the number of desired clusters. For example, 41 | 42 | $python hclust.py iris.dat 3 43 | 44 | Where hclust.py is your hierarchical clustering algorithm, iris.dat is the input data file, and 3 is the k value. 45 | It should output 3 clusters, with each cluster contains a set of data points. Data point are numbered by their positions they appear in the data file: the first data point is numbered 0, second 1, and so on. The data points in the clusters are output in the ascending order of their numbers. 46 | 47 | For example, here is an example output. 48 | 49 | Cluster 1: [3, 10, 13, ...] 50 | Cluster 2: [8, 52, 87, 88, ...] 51 | Cluster 3: [100, 105, ...] 52 | 53 | This algorithm also output the accuracy of the discovery. For example, 54 | 55 | Precision = .8, recall = .5 56 | 57 | ## Priority queue module 58 | 59 | Please find heapq.py provided to you. It implements a priority queue with operations for building the heap, adding items, and extracting smallest item from the heap. See the documentation in the script and also the link (https://docs.python.org/2/library/heapq.html) for more details. 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /hclust.py: -------------------------------------------------------------------------------- 1 | """ 2 | Executing code: 3 | Python hclust.py iris.dat 3 4 | 5 | """ 6 | 7 | """ 8 | Change log: 9 | 10 | - Nov 8, 2015 11 | 1. Change the logic to calculation centroid 12 | 2. 
"""
Change log (continued):
    2. Add judgement for some invalid input cases

"""

import sys
import math
import os
import heapq
import itertools


class Hierarchical_Clustering:
    """Agglomerative (bottom-up) hierarchical clustering.

    Every data point starts in its own cluster; the two clusters whose
    centroids are closest (Euclidean distance) are merged repeatedly
    until only ``k`` clusters remain.  Pairwise distances live in a
    min-heap; entries that mention an already-merged cluster are removed
    lazily, i.e. skipped when they surface at the heap root.
    """

    def __init__(self, ipt_data, ipt_k):
        self.input_file_name = ipt_data  # path of the input data file
        self.k = ipt_k                   # desired number of clusters
        self.dataset = None              # list of {"id", "data", "class"} dicts
        self.dataset_size = 0
        self.dimension = 0               # number of features per data point
        self.heap = []                   # entries: (dist, [dist, [elems_a, elems_b]])
        self.clusters = []               # dict after load: str(sorted ids) -> cluster
        self.gold_standard = {}          # class label -> list of data point ids

    def initialize(self):
        """Load the data set and validate the input parameters.

        Raises SystemExit (via quit()) on any invalid input.
        """
        # check file exist and if it's a file or dir
        if not os.path.isfile(self.input_file_name):
            self.quit("Input file doesn't exist or it's not a file")

        self.dataset, self.clusters, self.gold_standard = self.load_data(self.input_file_name)
        self.dataset_size = len(self.dataset)

        if self.dataset_size == 0:
            self.quit("Input file doesn't include any data")

        if self.k == 0:
            self.quit("k = 0, no cluster will be generated")

        if self.k > self.dataset_size:
            self.quit("k is larger than the number of existing clusters")

        self.dimension = len(self.dataset[0]["data"])

        if self.dimension == 0:
            self.quit("dimension for dataset cannot be zero")

    # ------------------------------------------------------------------
    # Hierarchical Clustering Functions
    # ------------------------------------------------------------------

    def euclidean_distance(self, data_point_one, data_point_two):
        """Return the Euclidean distance between two feature vectors.

        Features may be numbers or numeric strings; the two points are
        assumed to have the same dimension.
        """
        result = 0.0
        for i in range(len(data_point_one)):
            diff = float(data_point_one[i]) - float(data_point_two[i])
            result += diff * diff
        return math.sqrt(result)

    def compute_pairwise_distance(self, dataset):
        """Build the initial distance list over all unordered point pairs.

        Each entry is (dist, [dist, [[i], [j]]]); the inner duplicate dist
        keeps ties ordered deterministically when tuples are compared.
        """
        result = []
        for i, j in itertools.combinations(range(len(dataset)), 2):
            dist = self.euclidean_distance(dataset[i]["data"], dataset[j]["data"])
            result.append((dist, [dist, [[i], [j]]]))
        return result

    def build_priority_queue(self, distance_list):
        """Heapify the distance list in place and remember it on self."""
        heapq.heapify(distance_list)
        self.heap = distance_list
        return self.heap

    def compute_centroid_two_clusters(self, current_clusters, data_points_index):
        """Average the centroids of the clusters named by data_points_index.

        NOTE(review): currently unused — compute_centroid() recomputes the
        centroid from the raw data points instead (see Nov 8 change log).
        """
        size = len(data_points_index)
        centroid = [0.0] * self.dimension
        for index in data_points_index:
            dim_data = current_clusters[str(index)]["centroid"]
            for i in range(self.dimension):
                centroid[i] += float(dim_data[i])
        return [value / size for value in centroid]

    def compute_centroid(self, dataset, data_points_index):
        """Return the centroid (mean feature vector) of the given points."""
        size = len(data_points_index)
        centroid = [0.0] * self.dimension
        for idx in data_points_index:
            dim_data = dataset[idx]["data"]
            for i in range(self.dimension):
                centroid[i] += float(dim_data[i])
        return [value / size for value in centroid]

    def hierarchical_clustering(self):
        """Main merge loop; returns the dict of the final k clusters.

        Bug fixes vs. the original:
        - self.compute_pairwise_distance / self.build_priority_queue were
          called through the module-level global ``hc``, which breaks any
          use outside the script's own __main__ block.
        - ``current_clusters.sort()`` was called on a dict (AttributeError).
        - the loop now stops cleanly if the heap is exhausted.
        """
        dataset = self.dataset
        current_clusters = self.clusters
        old_clusters = []  # element lists of clusters that no longer exist
        heap = self.compute_pairwise_distance(dataset)
        heap = self.build_priority_queue(heap)

        while len(current_clusters) > self.k:
            if not heap:
                break  # queue exhausted before reaching k clusters
            dist, min_item = heapq.heappop(heap)
            pair_data = min_item[1]

            # lazy removal: skip entries that mention a merged cluster
            if not self.valid_heap_node(min_item, old_clusters):
                continue

            new_cluster = {}
            new_cluster_elements = sum(pair_data, [])  # fresh list; inputs untouched
            new_cluster_centroid = self.compute_centroid(dataset, new_cluster_elements)
            new_cluster_elements.sort()
            new_cluster["centroid"] = new_cluster_centroid
            new_cluster["elements"] = new_cluster_elements
            for pair_item in pair_data:
                old_clusters.append(pair_item)
                del current_clusters[str(pair_item)]
            self.add_heap_entry(heap, new_cluster, current_clusters)
            current_clusters[str(new_cluster_elements)] = new_cluster
        return current_clusters

    def valid_heap_node(self, heap_node, old_clusters):
        """True when neither cluster in the heap entry has been merged away."""
        pair_data = heap_node[1]
        for old_cluster in old_clusters:
            if old_cluster in pair_data:
                return False
        return True

    def add_heap_entry(self, heap, new_cluster, current_clusters):
        """Push the distance from new_cluster to every surviving cluster."""
        for ex_cluster in current_clusters.values():
            dist = self.euclidean_distance(ex_cluster["centroid"], new_cluster["centroid"])
            heapq.heappush(heap, (dist, [dist, [new_cluster["elements"], ex_cluster["elements"]]]))

    def evaluate(self, current_clusters):
        """Return (precision, recall) of same-cluster pairs vs. gold labels.

        Bug fix: when the gold standard contains no pairs (tp_fn == 0) the
        original assigned ``precision = 0.0`` a second time and left
        ``recall`` unbound, raising NameError on return.
        """
        current_pairs = []
        for cluster in current_clusters.values():
            current_pairs.extend(itertools.combinations(cluster["elements"], 2))
        tp_fp = len(current_pairs)  # pairs discovered by the algorithm

        gold_pairs = []
        for members in self.gold_standard.values():
            gold_pairs.extend(itertools.combinations(members, 2))
        tp_fn = len(gold_pairs)  # pairs in the gold standard

        # set membership: O(1) per lookup instead of O(n) list scans
        gold_pair_set = set(gold_pairs)
        tp = 0.0
        for pair in current_pairs:
            if pair in gold_pair_set:
                tp += 1

        precision = tp / tp_fp if tp_fp else 0.0
        recall = tp / tp_fn if tp_fn else 0.0
        return precision, recall

    # ------------------------------------------------------------------
    # Helper Functions
    # ------------------------------------------------------------------

    def load_data(self, input_file_name):
        """Read one data point per CSV line: features..., class label.

        Returns (dataset, clusters, gold_standard) where every point
        starts as its own singleton cluster keyed by str([id]).
        Fix: mode 'rU' was removed in Python 3.11; the file is now closed
        via a with-block instead of being leaked.
        """
        dataset = []
        clusters = {}
        gold_standard = {}
        with open(input_file_name) as input_file:
            point_id = 0
            for line in input_file:
                line = line.rstrip('\r\n')
                if not line:
                    continue  # tolerate blank lines (e.g. trailing newline)
                row = line.split(",")
                iris_class = row[-1]

                dataset.append({"id": point_id, "data": row[:-1], "class": iris_class})

                clusters_key = str([point_id])
                clusters[clusters_key] = {"centroid": row[:-1], "elements": [point_id]}

                gold_standard.setdefault(iris_class, []).append(point_id)

                point_id += 1
        return dataset, clusters, gold_standard

    def quit(self, err_desc):
        """Abort the program with a human-readable reason."""
        raise SystemExit('\n' + "PROGRAM EXIT: " + err_desc + ', please check your input' + '\n')

    def loaded_dataset(self):
        """
        use for test only

        """
        return self.dataset

    def display(self, current_clusters, precision, recall):
        """Print precision, recall, and each cluster's sorted element ids."""
        # single-argument print() is valid in both Python 2 and 3
        print(precision)
        print(recall)
        for cluster in current_clusters.values():
            cluster["elements"].sort()
            print(cluster["elements"])
# ----------------------------------------------------------------------
# Main Method
#
# inputs:
#     - ipt_data: a text file name for the input data
#     - ipt_k:    a value k for the number of desired clusters
#
# outputs:
#     - the k clusters, each a sorted list of data point indices
#     - precision and recall against the gold-standard class labels
# ----------------------------------------------------------------------
if __name__ == '__main__':
    # Fix: validate the argument count up front instead of crashing with
    # a raw IndexError/ValueError on malformed invocations.
    if len(sys.argv) != 3:
        raise SystemExit("usage: python hclust.py <input_data_file> <k>")

    ipt_data = sys.argv[1]        # input data, e.g. iris.dat
    ipt_k = int(sys.argv[2])      # number of clusters, e.g. 3

    hc = Hierarchical_Clustering(ipt_data, ipt_k)
    hc.initialize()
    current_clusters = hc.hierarchical_clustering()
    precision, recall = hc.evaluate(current_clusters)
    hc.display(current_clusters, precision, recall)