├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── README.rst ├── __init__.py ├── mutual_neighborhood_graph.h ├── quickshift_pp.pyx └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | build/* 2 | quickshift_pp.cpp 3 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution; 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to <https://cla.developers.google.com/> to see 12 | your current agreements on file or to sign a new one. 13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Code reviews 19 | 20 | All submissions, including submissions by project members, require review. We 21 | use GitHub pull requests for this purpose. Consult 22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 23 | information on using pull requests. 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 
9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. 
For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Quickshift++ 2 | ============ 3 | This is not an officially supported Google product 4 | 5 | Density-based clustering algorithm based on mode-seeking. 6 | 7 | 8 | Usage 9 | ====== 10 | 11 | **Initialization**: 12 | 13 | .. code-block:: python 14 | 15 | QuickshiftPP(k, beta) 16 | 17 | k: number of neighbors in k-NN 18 | 19 | beta: fluctuation parameter which ranges between 0 and 1. 20 | 21 | **Finding Clusters**: 22 | 23 | .. code-block:: python 24 | 25 | fit(X) 26 | 27 | X is the data matrix, where each row is a datapoint in Euclidean space. 28 | 29 | fit performs the clustering. The final result can be found in QuickshiftPP.memberships. 30 | 31 | **Example** (mixture of two Gaussians): 32 | 33 | .. 
code-block:: python 34 | 35 | from QuickshiftPP import * 36 | import numpy as np 37 | 38 | X = [np.random.normal(0, 1, 2) for i in range(100)] + [np.random.normal(5, 1, 2) for i in range(100)] 39 | y = [0] * 100 + [1] * 100 40 | 41 | # Declare a Quickshift++ model with tuning hyperparameters. 42 | model = QuickshiftPP(k=20, beta=.5) 43 | 44 | # Compute the clustering. 45 | model.fit(X) 46 | y_hat = model.memberships 47 | 48 | from sklearn.metrics.cluster import adjusted_rand_score, adjusted_mutual_info_score 49 | print("Adj. Rand Index Score: %f." % adjusted_rand_score(y_hat, y)) 50 | print("Adj. Mutual Info Score: %f." % adjusted_mutual_info_score(y_hat, y)) 51 | 52 | 53 | Install 54 | ======= 55 | 56 | This package uses distutils, which is the default way of installing 57 | python modules. 58 | 59 | To install for all users on Unix/Linux:: 60 | 61 | python setup.py build; sudo python setup.py install 62 | 63 | 64 | 65 | Dependencies 66 | ============ 67 | 68 | python 2.7, numpy, Cython, scikit-learn 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
// --------------------------------------------------------------------------------
// /mutual_neighborhood_graph.h:
// --------------------------------------------------------------------------------
/*
Copyright 2018 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef MUTUAL_NEIGHBORHOOD_GRAPH_H_
#define MUTUAL_NEIGHBORHOOD_GRAPH_H_

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <map>
#include <set>
#include <stack>
#include <utility>
#include <vector>
// Kept for backward compatibility with translation units that include this
// header and rely on unqualified standard-library names.
using namespace std;


struct Node {
    /*
    Node of the mutual k-NN graph, stored as a union-find element that also
    tracks its direct children so a whole component can be enumerated.
    */
    int index;             // index of the data point this node represents
    int rank;              // union-by-rank rank
    Node * parent;         // NULL when this node is the root of its component
    set<Node *> children;  // nodes whose parent pointer is this node

    Node(int idx) : index(idx), rank(0), parent(NULL) {}
};


struct Graph {
    /*
    Union-find forest that is built one node at a time.

    The graph owns every Node it creates (released in the destructor), so it
    is deliberately non-copyable. Every index passed to add_edge,
    get_connected_component, component_seen, GET_ROOT or GET_CHILDREN must
    have been registered with add_node first.
    */
    vector<Node *> nodes;           // all nodes, in insertion order (owned)
    map<int, Node *> M;             // data point index -> node
    set<Node *> intersecting_sets;  // roots of components already reported

    Graph() {}

    ~Graph() {
        for (size_t i = 0; i < nodes.size(); ++i) {
            delete nodes[i];
        }
    }

    /* Returns the root of node's component, compressing the path and keeping
       the children sets in sync with the updated parent pointers. */
    Node * get_root(Node * node) {
        if (node->parent == NULL) {
            return node;
        }
        node->parent->children.erase(node);
        node->parent = get_root(node->parent);
        node->parent->children.insert(node);
        return node->parent;
    }

    /* Registers a new isolated node for data point idx. */
    void add_node(int idx) {
        Node * created = new Node(idx);
        nodes.push_back(created);
        M[idx] = created;
    }

    /* Merges the components of n1 and n2 (union by rank). If the root that
       gets absorbed was already marked as seen, the mark moves to the new
       root so the merged component still counts as seen. */
    void add_edge(int n1, int n2) {
        Node * r1 = get_root(M[n1]);
        Node * r2 = get_root(M[n2]);
        if (r1 == r2) {
            return;
        }
        if (r1->rank > r2->rank) {
            r2->parent = r1;
            r1->children.insert(r2);
            if (intersecting_sets.count(r2)) {
                intersecting_sets.erase(r2);
                intersecting_sets.insert(r1);
            }
        } else {
            r1->parent = r2;
            r2->children.insert(r1);
            if (intersecting_sets.count(r1)) {
                intersecting_sets.erase(r1);
                intersecting_sets.insert(r2);
            }
            if (r1->rank == r2->rank) {
                r2->rank++;
            }
        }
    }

    /* Returns the data point indices of every node in n's component
       (iterative depth-first walk over the children sets). */
    vector<int> get_connected_component(int n) {
        vector<int> members;
        stack<Node *> pending;
        pending.push(get_root(M[n]));
        while (!pending.empty()) {
            Node * top = pending.top();
            pending.pop();
            members.push_back(top->index);
            for (set<Node *>::iterator it = top->children.begin();
                 it != top->children.end();
                 ++it) {
                pending.push(*it);
            }
        }
        return members;
    }

    /* Returns true if n's component was reported before; otherwise marks it
       as reported and returns false. */
    bool component_seen(int n) {
        Node * r = get_root(M[n]);
        if (intersecting_sets.count(r)) {
            return true;
        }
        intersecting_sets.insert(r);
        return false;
    }

    /* Index of the root of idx's component. */
    int GET_ROOT(int idx) {
        return get_root(M[idx])->index;
    }

    /* Indices of the direct children of idx's own node (not of its root). */
    vector<int> GET_CHILDREN(int idx) {
        Node * r = M[idx];
        vector<int> to_ret;
        for (set<Node *>::iterator it = r->children.begin();
             it != r->children.end();
             ++it) {
            to_ret.push_back((*it)->index);
        }
        return to_ret;
    }

private:
    // Not implemented: copying would make two graphs delete the same nodes.
    Graph(const Graph &);
    Graph & operator=(const Graph &);
};


struct NodeBasic {
    int index;           // index of the data point
    int rank;            // union-by-rank rank
    NodeBasic * parent;  // NULL when this node is a root

    NodeBasic(int idx) : index(idx), rank(0), parent(NULL) {}
};


struct GraphBasic {
    /*
    Basic disjoint set data structure over the points 0..n-1.
    Owns its nodes (released in the destructor) and is non-copyable.
    */
    vector<NodeBasic *> M;  // M[i] is the node of point i

    GraphBasic(const int n) {
        if (n > 0) {
            M.reserve(n);
        }
        for (int i = 0; i < n; ++i) {
            M.push_back(new NodeBasic(i));
        }
    }

    ~GraphBasic() {
        for (size_t i = 0; i < M.size(); ++i) {
            delete M[i];
        }
    }

    /* Root of node's set with path compression; NULL for a NULL node. */
    NodeBasic * get_root(NodeBasic * node) {
        if (!node) return NULL;
        if (!node->parent) return node;
        node->parent = get_root(node->parent);
        return node->parent;
    }

    /* Merges the sets of n1 and n2 (union by rank). Indices outside
       [0, M.size()) are ignored instead of reading out of bounds. */
    void add_edge(const int n1, const int n2) {
        const int count = static_cast<int>(M.size());
        if (n1 < 0 || n2 < 0 || n1 >= count || n2 >= count) return;
        NodeBasic * r1 = get_root(M[n1]);
        NodeBasic * r2 = get_root(M[n2]);
        if (!r1 || !r2 || r1 == r2) return;
        if (r1->rank > r2->rank) {
            r2->parent = r1;
        } else {
            r1->parent = r2;
            if (r1->rank == r2->rank) {
                r2->rank++;
            }
        }
    }

private:
    // Not implemented: copying would make two graphs delete the same nodes.
    GraphBasic(const GraphBasic &);
    GraphBasic & operator=(const GraphBasic &);
};


inline void compute_mutual_knn(int n, int k,
                               int d,
                               double * radii,
                               int * neighbors,
                               double beta,
                               double epsilon,
                               int * result) {
    /* Given the k-NN radii and neighbors, builds the mutual k-NN graph /
       cluster tree level by level and returns the estimated modal sets.

       n         number of points
       k         neighbors per point
       d         data dimension (used to turn the density factors
                 (1 + epsilon) and (1 - beta) into radius factors)
       radii     n k-NN radii
       neighbors n * k neighbor indices, row-major
       beta      fluctuation parameter, expected in [0, 1]
       epsilon   pruning parameter, expected >= 0
       result    output, n entries: result[i] is the modal set of point i,
                 where each index corresponds to the respective index in the
                 radii array. Points without membership are assigned -1 */
    if (n <= 0) {
        return;
    }

    vector<pair<double, int> > knn_radii(n);
    vector<set<int> > knn_neighbors(n);
    for (int i = 0; i < n; ++i) {
        knn_radii[i].first = radii[i];
        knn_radii[i].second = i;
        for (int j = 0; j < k; ++j) {
            // size_t arithmetic: i * k can exceed INT_MAX for large inputs.
            knn_neighbors[i].insert(neighbors[static_cast<size_t>(i) * k + j]);
        }
    }
    sort(knn_radii.begin(), knn_radii.end());

    // Heap storage instead of variable-length arrays, which are not standard
    // C++ and overflow the stack for large n.
    vector<int> m_hat;               // chosen points
    vector<int> cluster_membership;  // modal set of the matching m_hat entry
    int n_chosen_clusters = 0;

    // Both factors are loop invariant.
    const double prune_scale = pow(1. + epsilon, 1. / d);
    const double mode_scale = pow(1. - beta, 1. / d);

    Graph G;
    int last_considered = 0;
    int last_pruned = 0;

    for (int i = 0; i < n; ++i) {
        const double level = knn_radii[i].first;

        // Add every point whose radius is below the pruned level and connect
        // it to the mutual neighbors that are already in the graph.
        while (last_pruned < n &&
               prune_scale * level > knn_radii[last_pruned].first) {
            const int point = knn_radii[last_pruned].second;
            G.add_node(point);
            const set<int> & adjacent = knn_neighbors[point];
            for (set<int>::const_iterator it = adjacent.begin();
                 it != adjacent.end();
                 ++it) {
                // G.M only holds valid point indices, so the first test also
                // protects the knn_neighbors lookup.
                if (G.M.count(*it) && knn_neighbors[*it].count(point)) {
                    G.add_edge(point, *it);
                }
            }
            last_pruned++;
        }

        // Only points already in the graph can be considered; the bound keeps
        // out-of-range beta/epsilon values from reading past the arrays.
        while (last_considered < last_pruned &&
               level * mode_scale > knn_radii[last_considered].first) {
            const int point = knn_radii[last_considered].second;
            if (!G.component_seen(point)) {
                vector<int> res = G.get_connected_component(point);
                for (size_t j = 0; j < res.size(); j++) {
                    if (radii[res[j]] <= level) {
                        cluster_membership.push_back(n_chosen_clusters);
                        m_hat.push_back(res[j]);
                    }
                }
                n_chosen_clusters++;
            }
            last_considered++;
        }
    }

    for (int i = 0; i < n; ++i) {
        result[i] = -1;
    }
    for (size_t i = 0; i < m_hat.size(); ++i) {
        result[m_hat[i]] = cluster_membership[i];
    }
}


/* Squared Euclidean distance (no square root is taken) between rows i and j
   of dataset, over the first d coordinates. */
inline double dist(int i, int j, int d, double ** dataset) {
    double sum = 0.;
    for (int m = 0; m < d; ++m) {
        sum += (dataset[i][m] - dataset[j][m]) * (dataset[i][m] - dataset[j][m]);
    }
    return sum;
}


inline void cluster_remaining(
        int n, int k, int d,
        double * dataset,
        double * radii,
        int * neighbors,
        int * initial_memberships,
        int * result) {
    /* Assigns every point to a final cluster.

       Points of the same modal set (initial_memberships[i] >= 0) are joined.
       Every other point is linked to the first of its k neighbors with a
       strictly smaller radius, or, when there is none, to the closest point
       of the whole dataset with a strictly smaller radius. A point with no
       such target stays the root of its own cluster.

       dataset   n * d coordinates, row-major
       radii     n k-NN radii
       neighbors n * k neighbor indices, row-major
       result    output, n entries: consecutive cluster labels starting at 0,
                 numbered in order of first appearance */
    if (n <= 0) {
        return;
    }

    // Final clusters.
    GraphBasic H(n);

    int n_modal_sets = 0;
    for (int i = 0; i < n; ++i) {
        if (initial_memberships[i] + 1 > n_modal_sets) {
            n_modal_sets = initial_memberships[i] + 1;
        }
    }
    vector<vector<int> > modal_sets(n_modal_sets);
    for (int i = 0; i < n; ++i) {
        if (initial_memberships[i] >= 0) {
            modal_sets[initial_memberships[i]].push_back(i);
        }
    }
    for (int c = 0; c < n_modal_sets; ++c) {
        const vector<int> & members = modal_sets[c];
        // "i + 1 < size()" rather than "i < size() - 1": the latter wraps
        // around for an empty modal set.
        for (size_t i = 0; i + 1 < members.size(); ++i) {
            H.add_edge(members[i], members[i + 1]);
        }
    }

    for (int i = 0; i < n; ++i) {
        if (initial_memberships[i] >= 0) {
            continue;
        }
        const size_t row = static_cast<size_t>(i);
        int target = -1;
        for (int j = 0; j < k; ++j) {
            const int candidate = neighbors[row * k + j];
            if (radii[candidate] < radii[i]) {
                target = candidate;
                break;
            }
        }

        if (target < 0) {
            // Fall back to a scan of the whole dataset, comparing squared
            // distances; any finite distance can win (no fixed sentinel).
            double best_distance = 0.;
            for (int j = 0; j < n; ++j) {
                if (radii[j] >= radii[i]) {
                    continue;
                }
                double dt = 0.0;
                for (int m = 0; m < d; ++m) {
                    const double diff = dataset[row * d + m] -
                                        dataset[static_cast<size_t>(j) * d + m];
                    dt += diff * diff;
                }
                if (target < 0 || dt < best_distance) {
                    best_distance = dt;
                    target = j;
                }
            }
        }

        // The point with the smallest radius has nothing to attach to.
        if (target >= 0) {
            H.add_edge(i, target);
        }
    }

    int n_clusters = 0;
    map<int, int> label_mapping;  // root index -> consecutive label
    for (int i = 0; i < n; ++i) {
        const int root = (H.get_root(H.M[i]))->index;
        map<int, int>::iterator known = label_mapping.find(root);
        if (known != label_mapping.end()) {
            result[i] = known->second;
        } else {
            label_mapping[root] = n_clusters;
            result[i] = n_clusters;
            n_clusters++;
        }
    }
}

#endif  // MUTUAL_NEIGHBORHOOD_GRAPH_H_
# --------------------------------------------------------------------------------
# /quickshift_pp.pyx:
# --------------------------------------------------------------------------------
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
cimport numpy as np
# KernelDensity is imported from the public package path; the private
# sklearn.neighbors.kde module no longer exists in recent scikit-learn.
from sklearn.neighbors import KDTree, BallTree, KernelDensity
import scipy
import math
import sys


cdef extern from "mutual_neighborhood_graph.h":
    void compute_mutual_knn(int n, int k, int d,
                            double * radii,
                            int * neighbors,
                            double beta,
                            double epsilon,
                            int * result)
    void cluster_remaining(int n, int k, int d,
                           double * dataset,
                           double * radii,
                           int * neighbors,
                           int * initial_memberships,
                           int * result)


cdef compute_mutual_knn_np(n, k, d,
                           np.ndarray[double, ndim=1, mode="c"] radii,
                           np.ndarray[np.int32_t, ndim=2, mode="c"] neighbors,
                           beta,
                           epsilon,
                           np.ndarray[np.int32_t, ndim=1, mode="c"] result):
    """Call the C++ compute_mutual_knn on the raw buffers of the arrays.

    The modal-set label of every point is written into ``result``.
    """
    compute_mutual_knn(n, k, d,
                       <double *> np.PyArray_DATA(radii),
                       <int *> np.PyArray_DATA(neighbors),
                       beta, epsilon,
                       <int *> np.PyArray_DATA(result))


cdef cluster_remaining_np(n, k, d,
                          np.ndarray[double, ndim=2, mode="c"] dataset,
                          np.ndarray[double, ndim=1, mode="c"] radii,
                          np.ndarray[np.int32_t, ndim=2, mode="c"] neighbors,
                          np.ndarray[np.int32_t, ndim=1, mode="c"] initial_memberships,
                          np.ndarray[np.int32_t, ndim=1, mode="c"] result):
    """Call the C++ cluster_remaining on the raw buffers of the arrays.

    The final cluster label of every point is written into ``result``.
    """
    cluster_remaining(n, k, d,
                      <double *> np.PyArray_DATA(dataset),
                      <double *> np.PyArray_DATA(radii),
                      <int *> np.PyArray_DATA(neighbors),
                      <int *> np.PyArray_DATA(initial_memberships),
                      <int *> np.PyArray_DATA(result))


class QuickshiftPP:
    """
    Parameters
    ----------

    k: The number of neighbors (i.e. the k in k-NN density)

    beta: Ranges from 0 to 1. We choose points that have kernel density of at
        least (1 - beta) * F where F is the mode of the empirical density of
        the cluster

    epsilon: For pruning. Sets how much deeper in the cluster tree to look
        in order to connect clusters together. Must be at least 0.

    ann: Nearest-neighbor index used to compute the k-NN radii and
        neighbors, either "kdtree" or "balltree".


    Attributes
    ----------

    memberships: set by fit. int32 array with one zero-based cluster label
        per row of the fitted data

    """

    def __init__(self, k, beta,
                 epsilon=0,
                 ann="kdtree"):
        self.k = k
        self.beta = beta
        self.epsilon = epsilon
        self.ann = ann

    def fit(self, X):
        """
        Determines the clusters in two steps.
        First step is to compute the k-NN radii and neighbors with a
        KD tree or a ball tree, depending on ``ann``.
        Second step is to build the mutual k-NN graph, estimate the modal
        sets and assign the remaining points to them.
        Updates the ``memberships`` attribute.

        Parameters
        ----------
        X: Data matrix. Each row should represent a datapoint in
            euclidean space

        Returns
        -------
        self

        Raises
        ------
        ValueError: if X is not two-dimensional or ``ann`` is not one of
            "kdtree" / "balltree".
        """
        # The C++ side reads raw row-major double buffers, so force a
        # C-contiguous float64 array (Fortran-ordered input would otherwise
        # be rejected by the mode="c" buffer arguments).
        X = np.ascontiguousarray(X, dtype=np.float64)
        if X.ndim != 2:
            raise ValueError("X must be a 2-dimensional data matrix, got %d "
                             "dimension(s)" % X.ndim)
        n, d = X.shape

        if self.ann == "kdtree":
            tree = KDTree(X, metric='euclidean')
        elif self.ann == "balltree":
            tree = BallTree(X, metric='euclidean')
        else:
            raise ValueError("ann must be 'kdtree' or 'balltree', got %r"
                             % (self.ann,))

        distances, indices = tree.query(X, k=self.k)
        # The column slice is strided; the C++ code needs contiguous memory.
        knn_radius = np.ascontiguousarray(distances[:, self.k - 1],
                                          dtype=np.float64)
        neighbors = np.ascontiguousarray(indices, dtype=np.int32)

        memberships = np.zeros(n, dtype=np.int32)
        result = np.zeros(n, dtype=np.int32)

        compute_mutual_knn_np(n, self.k, d,
                              knn_radius,
                              neighbors,
                              self.beta, self.epsilon,
                              memberships)
        cluster_remaining_np(n, self.k, d, X, knn_radius, neighbors,
                             memberships, result)

        self.memberships = result
        return self


# --------------------------------------------------------------------------------
# /setup.py:
# --------------------------------------------------------------------------------
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# distutils was removed from the standard library in Python 3.12; prefer
# setuptools and fall back to distutils on old installations without it.
try:
    from setuptools import setup, Extension
except ImportError:
    from distutils.core import setup, Extension
import numpy
from Cython.Distutils import build_ext


setup(
    name='QuickshiftPP',
    version='1.0',
    cmdclass={'build_ext': build_ext},
    ext_modules=[Extension("QuickshiftPP",
                           sources=["quickshift_pp.pyx"],
                           language="c++",
                           include_dirs=[numpy.get_include()])],
    author='Heinrich Jiang',
    author_email='heinrich.jiang@gmail.com'

)
# --------------------------------------------------------------------------------