├── heapq.py ├── .gitignore ├── LICENSE ├── cluster_output.txt ├── iris.dat ├── README.md └── hclust.py /heapq.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZwEin27/Hierarchical-Clustering/HEAD/heapq.py -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 ZwEin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the 
Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /cluster_output.txt: -------------------------------------------------------------------------------- 1 | 0.819168173599 2 | 0.862857142857 3 | [100, 102, 103, 104, 105, 107, 108, 109, 110, 111, 112, 115, 116, 117, 118, 120, 122, 124, 125, 128, 129, 130, 131, 132, 134, 135, 136, 137, 139, 140, 141, 143, 144, 145, 147, 148] 4 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49] 5 | [50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 101, 106, 113, 114, 119, 121, 123, 126, 127, 133, 138, 142, 146, 149] 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 77 110 147 29 | 30 | 31 | [50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 101, 103, 104, 
106, 108, 110, 111, 112, 113, 114, 115, 116, 119, 120, 121, 123, 124, 126, 127, 128, 132, 133, 134, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149] 32 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49] 33 | [100, 102, 105, 107, 109, 117, 118, 122, 125, 129, 130, 131, 135] 34 | [Finished in 0.4s] 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | [ 53 | [100, 102, 105, 107, 109, 110, 115, 117, 118, 122, 125, 129, 130, 131, 135, 136, 141, 145, 147, 148] 54 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49] 55 | [50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 101, 103, 104, 106, 108, 111, 112, 113, 114, 116, 119, 120, 121, 123, 124, 126, 127, 128, 132, 133, 134, 137, 138, 139, 140, 142, 143, 144, 146, 149] 56 | 57 | 58 | ] -------------------------------------------------------------------------------- /iris.dat: -------------------------------------------------------------------------------- 1 | 5.1,3.5,1.4,0.2,Iris-setosa 2 | 4.9,3.0,1.4,0.2,Iris-setosa 3 | 4.7,3.2,1.3,0.2,Iris-setosa 4 | 4.6,3.1,1.5,0.2,Iris-setosa 5 | 5.0,3.6,1.4,0.2,Iris-setosa 6 | 5.4,3.9,1.7,0.4,Iris-setosa 7 | 4.6,3.4,1.4,0.3,Iris-setosa 8 | 5.0,3.4,1.5,0.2,Iris-setosa 9 | 4.4,2.9,1.4,0.2,Iris-setosa 10 | 4.9,3.1,1.5,0.1,Iris-setosa 11 | 5.4,3.7,1.5,0.2,Iris-setosa 12 | 4.8,3.4,1.6,0.2,Iris-setosa 13 | 4.8,3.0,1.4,0.1,Iris-setosa 14 | 4.3,3.0,1.1,0.1,Iris-setosa 15 | 5.8,4.0,1.2,0.2,Iris-setosa 16 | 5.7,4.4,1.5,0.4,Iris-setosa 17 | 5.4,3.9,1.3,0.4,Iris-setosa 18 | 5.1,3.5,1.4,0.3,Iris-setosa 19 | 
5.7,3.8,1.7,0.3,Iris-setosa 20 | 5.1,3.8,1.5,0.3,Iris-setosa 21 | 5.4,3.4,1.7,0.2,Iris-setosa 22 | 5.1,3.7,1.5,0.4,Iris-setosa 23 | 4.6,3.6,1.0,0.2,Iris-setosa 24 | 5.1,3.3,1.7,0.5,Iris-setosa 25 | 4.8,3.4,1.9,0.2,Iris-setosa 26 | 5.0,3.0,1.6,0.2,Iris-setosa 27 | 5.0,3.4,1.6,0.4,Iris-setosa 28 | 5.2,3.5,1.5,0.2,Iris-setosa 29 | 5.2,3.4,1.4,0.2,Iris-setosa 30 | 4.7,3.2,1.6,0.2,Iris-setosa 31 | 4.8,3.1,1.6,0.2,Iris-setosa 32 | 5.4,3.4,1.5,0.4,Iris-setosa 33 | 5.2,4.1,1.5,0.1,Iris-setosa 34 | 5.5,4.2,1.4,0.2,Iris-setosa 35 | 4.9,3.1,1.5,0.1,Iris-setosa 36 | 5.0,3.2,1.2,0.2,Iris-setosa 37 | 5.5,3.5,1.3,0.2,Iris-setosa 38 | 4.9,3.1,1.5,0.1,Iris-setosa 39 | 4.4,3.0,1.3,0.2,Iris-setosa 40 | 5.1,3.4,1.5,0.2,Iris-setosa 41 | 5.0,3.5,1.3,0.3,Iris-setosa 42 | 4.5,2.3,1.3,0.3,Iris-setosa 43 | 4.4,3.2,1.3,0.2,Iris-setosa 44 | 5.0,3.5,1.6,0.6,Iris-setosa 45 | 5.1,3.8,1.9,0.4,Iris-setosa 46 | 4.8,3.0,1.4,0.3,Iris-setosa 47 | 5.1,3.8,1.6,0.2,Iris-setosa 48 | 4.6,3.2,1.4,0.2,Iris-setosa 49 | 5.3,3.7,1.5,0.2,Iris-setosa 50 | 5.0,3.3,1.4,0.2,Iris-setosa 51 | 7.0,3.2,4.7,1.4,Iris-versicolor 52 | 6.4,3.2,4.5,1.5,Iris-versicolor 53 | 6.9,3.1,4.9,1.5,Iris-versicolor 54 | 5.5,2.3,4.0,1.3,Iris-versicolor 55 | 6.5,2.8,4.6,1.5,Iris-versicolor 56 | 5.7,2.8,4.5,1.3,Iris-versicolor 57 | 6.3,3.3,4.7,1.6,Iris-versicolor 58 | 4.9,2.4,3.3,1.0,Iris-versicolor 59 | 6.6,2.9,4.6,1.3,Iris-versicolor 60 | 5.2,2.7,3.9,1.4,Iris-versicolor 61 | 5.0,2.0,3.5,1.0,Iris-versicolor 62 | 5.9,3.0,4.2,1.5,Iris-versicolor 63 | 6.0,2.2,4.0,1.0,Iris-versicolor 64 | 6.1,2.9,4.7,1.4,Iris-versicolor 65 | 5.6,2.9,3.6,1.3,Iris-versicolor 66 | 6.7,3.1,4.4,1.4,Iris-versicolor 67 | 5.6,3.0,4.5,1.5,Iris-versicolor 68 | 5.8,2.7,4.1,1.0,Iris-versicolor 69 | 6.2,2.2,4.5,1.5,Iris-versicolor 70 | 5.6,2.5,3.9,1.1,Iris-versicolor 71 | 5.9,3.2,4.8,1.8,Iris-versicolor 72 | 6.1,2.8,4.0,1.3,Iris-versicolor 73 | 6.3,2.5,4.9,1.5,Iris-versicolor 74 | 6.1,2.8,4.7,1.2,Iris-versicolor 75 | 6.4,2.9,4.3,1.3,Iris-versicolor 76 | 
6.6,3.0,4.4,1.4,Iris-versicolor 77 | 6.8,2.8,4.8,1.4,Iris-versicolor 78 | 6.7,3.0,5.0,1.7,Iris-versicolor 79 | 6.0,2.9,4.5,1.5,Iris-versicolor 80 | 5.7,2.6,3.5,1.0,Iris-versicolor 81 | 5.5,2.4,3.8,1.1,Iris-versicolor 82 | 5.5,2.4,3.7,1.0,Iris-versicolor 83 | 5.8,2.7,3.9,1.2,Iris-versicolor 84 | 6.0,2.7,5.1,1.6,Iris-versicolor 85 | 5.4,3.0,4.5,1.5,Iris-versicolor 86 | 6.0,3.4,4.5,1.6,Iris-versicolor 87 | 6.7,3.1,4.7,1.5,Iris-versicolor 88 | 6.3,2.3,4.4,1.3,Iris-versicolor 89 | 5.6,3.0,4.1,1.3,Iris-versicolor 90 | 5.5,2.5,4.0,1.3,Iris-versicolor 91 | 5.5,2.6,4.4,1.2,Iris-versicolor 92 | 6.1,3.0,4.6,1.4,Iris-versicolor 93 | 5.8,2.6,4.0,1.2,Iris-versicolor 94 | 5.0,2.3,3.3,1.0,Iris-versicolor 95 | 5.6,2.7,4.2,1.3,Iris-versicolor 96 | 5.7,3.0,4.2,1.2,Iris-versicolor 97 | 5.7,2.9,4.2,1.3,Iris-versicolor 98 | 6.2,2.9,4.3,1.3,Iris-versicolor 99 | 5.1,2.5,3.0,1.1,Iris-versicolor 100 | 5.7,2.8,4.1,1.3,Iris-versicolor 101 | 6.3,3.3,6.0,2.5,Iris-virginica 102 | 5.8,2.7,5.1,1.9,Iris-virginica 103 | 7.1,3.0,5.9,2.1,Iris-virginica 104 | 6.3,2.9,5.6,1.8,Iris-virginica 105 | 6.5,3.0,5.8,2.2,Iris-virginica 106 | 7.6,3.0,6.6,2.1,Iris-virginica 107 | 4.9,2.5,4.5,1.7,Iris-virginica 108 | 7.3,2.9,6.3,1.8,Iris-virginica 109 | 6.7,2.5,5.8,1.8,Iris-virginica 110 | 7.2,3.6,6.1,2.5,Iris-virginica 111 | 6.5,3.2,5.1,2.0,Iris-virginica 112 | 6.4,2.7,5.3,1.9,Iris-virginica 113 | 6.8,3.0,5.5,2.1,Iris-virginica 114 | 5.7,2.5,5.0,2.0,Iris-virginica 115 | 5.8,2.8,5.1,2.4,Iris-virginica 116 | 6.4,3.2,5.3,2.3,Iris-virginica 117 | 6.5,3.0,5.5,1.8,Iris-virginica 118 | 7.7,3.8,6.7,2.2,Iris-virginica 119 | 7.7,2.6,6.9,2.3,Iris-virginica 120 | 6.0,2.2,5.0,1.5,Iris-virginica 121 | 6.9,3.2,5.7,2.3,Iris-virginica 122 | 5.6,2.8,4.9,2.0,Iris-virginica 123 | 7.7,2.8,6.7,2.0,Iris-virginica 124 | 6.3,2.7,4.9,1.8,Iris-virginica 125 | 6.7,3.3,5.7,2.1,Iris-virginica 126 | 7.2,3.2,6.0,1.8,Iris-virginica 127 | 6.2,2.8,4.8,1.8,Iris-virginica 128 | 6.1,3.0,4.9,1.8,Iris-virginica 129 | 6.4,2.8,5.6,2.1,Iris-virginica 130 | 
7.2,3.0,5.8,1.6,Iris-virginica 131 | 7.4,2.8,6.1,1.9,Iris-virginica 132 | 7.9,3.8,6.4,2.0,Iris-virginica 133 | 6.4,2.8,5.6,2.2,Iris-virginica 134 | 6.3,2.8,5.1,1.5,Iris-virginica 135 | 6.1,2.6,5.6,1.4,Iris-virginica 136 | 7.7,3.0,6.1,2.3,Iris-virginica 137 | 6.3,3.4,5.6,2.4,Iris-virginica 138 | 6.4,3.1,5.5,1.8,Iris-virginica 139 | 6.0,3.0,4.8,1.8,Iris-virginica 140 | 6.9,3.1,5.4,2.1,Iris-virginica 141 | 6.7,3.1,5.6,2.4,Iris-virginica 142 | 6.9,3.1,5.1,2.3,Iris-virginica 143 | 5.8,2.7,5.1,1.9,Iris-virginica 144 | 6.8,3.2,5.9,2.3,Iris-virginica 145 | 6.7,3.3,5.7,2.5,Iris-virginica 146 | 6.7,3.0,5.2,2.3,Iris-virginica 147 | 6.3,2.5,5.0,1.9,Iris-virginica 148 | 6.5,3.0,5.2,2.0,Iris-virginica 149 | 6.2,3.4,5.4,2.3,Iris-virginica 150 | 5.9,3.0,5.1,1.8,Iris-virginica -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hierarchical-Clustering 2 | Hierarchical Clustering Python Implementation 3 | 4 | a hierarchical agglomerative clustering algorithm implementation. The algorithm starts by placing each data point in a cluster by itself and then repeatedly merges two clusters until some stopping condition is met. 5 | 6 | ## Clustering process 7 | 8 | Algorithm should stop the clustering process when all data points are placed in a single cluster. That is, the algorithm will perform n – 1 merging if there are n data points. 9 | 10 | Algorithm should also keep track of the merging process: for each merging step, remember which clusters are merged to produce which new cluster. Such information is similar to a dendrogram (e.g., shown below), except that dendrogram also remembers the distance of two clusters when they are merged. 11 | With the above information, this algorithm can then allow users to specify a desired number of clusters k, and returns k clusters based on the clustering result. 
For example, to produce two clusters, the algorithm can simply return the last two clusters that were merged during the process. 12 | 13 | ## Distance function 14 | 15 | Assuming that the input to the algorithm consists of a set of n- dimensional data points in the Euclidean space. The distance between two clusters is measured by the Euclidean distance of their centroids. 16 | 17 | ## Priority queue 18 | 19 | The algorithm should use a priority queue to store the distance between each pair of clusters and find the two clusters with minimum distance efficiently. You are provided with an implementation of a generic priority queue and required to use it in your algorithm. You are not supposed to modify the provided implementation. 20 | Recall that during the merging process, distance entries in the queue for the clusters that are merged need to be removed from the queue. However, it may be hard to keep track of entries in the queue that involves a particular cluster, since the queue is undergoing frequent changes. To address this, your algorithm should implement a lazy removal strategy: entries for merged clusters are not removed until they show up in the root of the queue. When such entries are found at the root during the ExactMin operation, they are simply ignored, and ExactMin continues until a pair of clusters that do not contain merged clusters is found at the root of queue, or the queue is exhausted. Note that in this strategy, you do not need to implement a remove function that removes an arbitrary node from the heap. 21 | 22 | ## Evaluation 23 | 24 | For the evaluation purpose, the data set input to this algorithm also contains cluster label for each data point. These cluster labels form a gold standard for clustering the data set to produce a specific number of clusters. Note that here we can not use the label in the clustering process. Instead, use them in evaluating the performance of your algorithm. 
The performance is measured by precision and recall of correct pairs. A pair of two data points x and y is correct if they belong to the same cluster according to the cluster label. Recall that precision and recall are discussed in LSH lectures. Precision is the percentage of pairs discovered by the algorithm that are correct, while recall is the percentage of correct pairs that are discovered by the algorithm. 25 | As an example, consider five data points: 1, 2, 3, 4, 5. Suppose there are 2 clusters: {1, 2, 3} and {4, 5} according to the algorithm, while different 2 clusters: {1, 2} and {3, 4, 5} according to the gold standard. In this case, the algorithm discovers four pairs: (1, 2), (1, 3), (2, 3), and (4, 5), while the gold standard has the pairs: (1, 2), (3, 4), (3, 5), and (4, 5). In this case, precision is 2/4 since only (1, 2) and (4, 5) discovered by the algorithm are correct, among the 4 discovered. Recall is also 2/4, since only 2 correct pairs were discovered among the total 4 in the gold standard. 26 | 27 | ## Input data format 28 | 29 | The data set contains a list of data points, one point per line. For each data point, it gives the value of the point at each dimension, followed by the cluster label of the point. A sample data point is provided for you to test your algorithm. The data contains 150 iris plants (https://archive.ics.uci.edu/ml/datasets/Iris). For each plant, it lists its sepal and petal length and width in centimeter, and also its type (e.g., setosa, see below). 30 | 31 | 5.1,3.5,1.4,0.2,Iris-setosa 32 | 4.9,3.0,1.4,0.2,Iris-setosa 33 | 4.7,3.2,1.3,0.2,Iris-setosa 34 | ... 35 | 36 | Assuming that input data sets that will be used to test your algorithm will have similar format, that is, one data point per line, which has value of point in each of n dimensions (where n >= 1), followed by a class label. 
37 | 38 | ## Input and output format 39 | 40 | This algorithm should take two arguments: a text file name for the input data, and a value k for the number of desired clusters. For example, 41 | 42 | $python hclust.py iris.dat 3 43 | 44 | Where hclust.py is your hierarchical clustering algorithm, iris.dat is the input data file, and 3 is the k value. 45 | It should output 3 clusters, with each cluster contains a set of data points. Data point are numbered by their positions they appear in the data file: the first data point is numbered 0, second 1, and so on. The data points in the clusters are output in the ascending order of their numbers. 46 | 47 | For example, here is an example output. 48 | 49 | Cluster 1: [3, 10, 13, ...] 50 | Cluster 2: [8, 52, 87, 88, ...] 51 | Cluster 3: [100, 105, ...] 52 | 53 | This algorithm also output the accuracy of the discovery. For example, 54 | 55 | Precision = .8, recall = .5 56 | 57 | ## Priority queue module 58 | 59 | Please find heapq.py provided to you. It implements a priority queue with operations for building the heap, adding items, and extracting smallest item from the heap. See the documentation in the script and also the link (https://docs.python.org/2/library/heapq.html) for more details. 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /hclust.py: -------------------------------------------------------------------------------- 1 | """ 2 | Executing code: 3 | Python hclust.py iris.dat 3 4 | 5 | """ 6 | 7 | """ 8 | Change log: 9 | 10 | - Nov 8, 2015 11 | 1. Change the logic to calculation centroid 12 | 2. 
"""
Change log (continued):
    2. Add judgement for some invalid input cases

"""

import sys
import math
import os
import heapq
import itertools


class Hierarchical_Clustering:
    """Agglomerative (bottom-up) hierarchical clustering.

    Every data point starts in its own cluster; the two clusters whose
    centroids are closest (Euclidean distance) are merged repeatedly
    until only ``k`` clusters remain.  Pairwise distances live in a
    min-heap; entries that mention an already-merged cluster are removed
    lazily, i.e. skipped when they surface at the heap root.
    """

    def __init__(self, ipt_data, ipt_k):
        self.input_file_name = ipt_data  # path of the input data file
        self.k = ipt_k                   # desired number of clusters
        self.dataset = None              # list of {"id", "data", "class"} dicts
        self.dataset_size = 0
        self.dimension = 0               # number of features per data point
        self.heap = []                   # entries: (dist, [dist, [elems_a, elems_b]])
        self.clusters = []               # dict after load: str(sorted ids) -> cluster
        self.gold_standard = {}          # class label -> list of data point ids

    def initialize(self):
        """Load the data set and validate the input parameters.

        Raises SystemExit (via quit()) on any invalid input.
        """
        # check file exist and if it's a file or dir
        if not os.path.isfile(self.input_file_name):
            self.quit("Input file doesn't exist or it's not a file")

        self.dataset, self.clusters, self.gold_standard = self.load_data(self.input_file_name)
        self.dataset_size = len(self.dataset)

        if self.dataset_size == 0:
            self.quit("Input file doesn't include any data")

        if self.k == 0:
            self.quit("k = 0, no cluster will be generated")

        if self.k > self.dataset_size:
            self.quit("k is larger than the number of existing clusters")

        self.dimension = len(self.dataset[0]["data"])

        if self.dimension == 0:
            self.quit("dimension for dataset cannot be zero")

    # ------------------------------------------------------------------
    # Hierarchical Clustering Functions
    # ------------------------------------------------------------------

    def euclidean_distance(self, data_point_one, data_point_two):
        """Return the Euclidean distance between two feature vectors.

        Features may be numbers or numeric strings; the two points are
        assumed to have the same dimension.
        """
        result = 0.0
        for i in range(len(data_point_one)):
            diff = float(data_point_one[i]) - float(data_point_two[i])
            result += diff * diff
        return math.sqrt(result)

    def compute_pairwise_distance(self, dataset):
        """Build the initial distance list over all unordered point pairs.

        Each entry is (dist, [dist, [[i], [j]]]); the inner duplicate dist
        keeps ties ordered deterministically when tuples are compared.
        """
        result = []
        for i, j in itertools.combinations(range(len(dataset)), 2):
            dist = self.euclidean_distance(dataset[i]["data"], dataset[j]["data"])
            result.append((dist, [dist, [[i], [j]]]))
        return result

    def build_priority_queue(self, distance_list):
        """Heapify the distance list in place and remember it on self."""
        heapq.heapify(distance_list)
        self.heap = distance_list
        return self.heap

    def compute_centroid_two_clusters(self, current_clusters, data_points_index):
        """Average the centroids of the clusters named by data_points_index.

        NOTE(review): currently unused — compute_centroid() recomputes the
        centroid from the raw data points instead (see Nov 8 change log).
        """
        size = len(data_points_index)
        centroid = [0.0] * self.dimension
        for index in data_points_index:
            dim_data = current_clusters[str(index)]["centroid"]
            for i in range(self.dimension):
                centroid[i] += float(dim_data[i])
        return [value / size for value in centroid]

    def compute_centroid(self, dataset, data_points_index):
        """Return the centroid (mean feature vector) of the given points."""
        size = len(data_points_index)
        centroid = [0.0] * self.dimension
        for idx in data_points_index:
            dim_data = dataset[idx]["data"]
            for i in range(self.dimension):
                centroid[i] += float(dim_data[i])
        return [value / size for value in centroid]

    def hierarchical_clustering(self):
        """Main merge loop; returns the dict of the final k clusters.

        Bug fixes vs. the original:
        - self.compute_pairwise_distance / self.build_priority_queue were
          called through the module-level global ``hc``, which breaks any
          use outside the script's own __main__ block.
        - ``current_clusters.sort()`` was called on a dict (AttributeError).
        - the loop now stops cleanly if the heap is exhausted.
        """
        dataset = self.dataset
        current_clusters = self.clusters
        old_clusters = []  # element lists of clusters that no longer exist
        heap = self.compute_pairwise_distance(dataset)
        heap = self.build_priority_queue(heap)

        while len(current_clusters) > self.k:
            if not heap:
                break  # queue exhausted before reaching k clusters
            dist, min_item = heapq.heappop(heap)
            pair_data = min_item[1]

            # lazy removal: skip entries that mention a merged cluster
            if not self.valid_heap_node(min_item, old_clusters):
                continue

            new_cluster = {}
            new_cluster_elements = sum(pair_data, [])  # fresh list; inputs untouched
            new_cluster_centroid = self.compute_centroid(dataset, new_cluster_elements)
            new_cluster_elements.sort()
            new_cluster["centroid"] = new_cluster_centroid
            new_cluster["elements"] = new_cluster_elements
            for pair_item in pair_data:
                old_clusters.append(pair_item)
                del current_clusters[str(pair_item)]
            self.add_heap_entry(heap, new_cluster, current_clusters)
            current_clusters[str(new_cluster_elements)] = new_cluster
        return current_clusters

    def valid_heap_node(self, heap_node, old_clusters):
        """True when neither cluster in the heap entry has been merged away."""
        pair_data = heap_node[1]
        for old_cluster in old_clusters:
            if old_cluster in pair_data:
                return False
        return True

    def add_heap_entry(self, heap, new_cluster, current_clusters):
        """Push the distance from new_cluster to every surviving cluster."""
        for ex_cluster in current_clusters.values():
            dist = self.euclidean_distance(ex_cluster["centroid"], new_cluster["centroid"])
            heapq.heappush(heap, (dist, [dist, [new_cluster["elements"], ex_cluster["elements"]]]))

    def evaluate(self, current_clusters):
        """Return (precision, recall) of same-cluster pairs vs. gold labels.

        Bug fix: when the gold standard contains no pairs (tp_fn == 0) the
        original assigned ``precision = 0.0`` a second time and left
        ``recall`` unbound, raising NameError on return.
        """
        current_pairs = []
        for cluster in current_clusters.values():
            current_pairs.extend(itertools.combinations(cluster["elements"], 2))
        tp_fp = len(current_pairs)  # pairs discovered by the algorithm

        gold_pairs = []
        for members in self.gold_standard.values():
            gold_pairs.extend(itertools.combinations(members, 2))
        tp_fn = len(gold_pairs)  # pairs in the gold standard

        # set membership: O(1) per lookup instead of O(n) list scans
        gold_pair_set = set(gold_pairs)
        tp = 0.0
        for pair in current_pairs:
            if pair in gold_pair_set:
                tp += 1

        precision = tp / tp_fp if tp_fp else 0.0
        recall = tp / tp_fn if tp_fn else 0.0
        return precision, recall

    # ------------------------------------------------------------------
    # Helper Functions
    # ------------------------------------------------------------------

    def load_data(self, input_file_name):
        """Read one data point per CSV line: features..., class label.

        Returns (dataset, clusters, gold_standard) where every point
        starts as its own singleton cluster keyed by str([id]).
        Fix: mode 'rU' was removed in Python 3.11; the file is now closed
        via a with-block instead of being leaked.
        """
        dataset = []
        clusters = {}
        gold_standard = {}
        with open(input_file_name) as input_file:
            point_id = 0
            for line in input_file:
                line = line.rstrip('\r\n')
                if not line:
                    continue  # tolerate blank lines (e.g. trailing newline)
                row = line.split(",")
                iris_class = row[-1]

                dataset.append({"id": point_id, "data": row[:-1], "class": iris_class})

                clusters_key = str([point_id])
                clusters[clusters_key] = {"centroid": row[:-1], "elements": [point_id]}

                gold_standard.setdefault(iris_class, []).append(point_id)

                point_id += 1
        return dataset, clusters, gold_standard

    def quit(self, err_desc):
        """Abort the program with a human-readable reason."""
        raise SystemExit('\n' + "PROGRAM EXIT: " + err_desc + ', please check your input' + '\n')

    def loaded_dataset(self):
        """
        use for test only

        """
        return self.dataset

    def display(self, current_clusters, precision, recall):
        """Print precision, recall, and each cluster's sorted element ids."""
        # single-argument print() is valid in both Python 2 and 3
        print(precision)
        print(recall)
        for cluster in current_clusters.values():
            cluster["elements"].sort()
            print(cluster["elements"])
# ----------------------------------------------------------------------
# Main Method
#
# inputs:
#     - ipt_data: a text file name for the input data
#     - ipt_k:    a value k for the number of desired clusters
#
# outputs:
#     - the k clusters, each a sorted list of data point indices
#     - precision and recall against the gold-standard class labels
# ----------------------------------------------------------------------
if __name__ == '__main__':
    # Fix: validate the argument count up front instead of crashing with
    # a raw IndexError/ValueError on malformed invocations.
    if len(sys.argv) != 3:
        raise SystemExit("usage: python hclust.py <input_data_file> <k>")

    ipt_data = sys.argv[1]        # input data, e.g. iris.dat
    ipt_k = int(sys.argv[2])      # number of clusters, e.g. 3

    hc = Hierarchical_Clustering(ipt_data, ipt_k)
    hc.initialize()
    current_clusters = hc.hierarchical_clustering()
    precision, recall = hc.evaluate(current_clusters)
    hc.display(current_clusters, precision, recall)