├── .gitignore ├── LICENSE ├── README.md ├── examples └── demo.py ├── scc ├── __init__.py └── scc.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sub-Cluster Component Clustering Algorithm 2 | 3 | This is a scipy /numpy / python implementation of SCC. For relatively sparse graph inputs, it should scale 4 | relatively easily to datasets of millions of nodes. This implementation assumes similarities 5 | are given. 6 | 7 | There is an example use in [demo.py](examples/demo.py). This demo shows: 8 | 9 | ```Python 10 | upper = 1.0 11 | lower = 0.1 12 | num_rounds = 50 13 | X = np.random.randn(100,5) 14 | graph = graph_from_vectors(X, k=25, batch_size=5000) 15 | taus = np.geomspace(start=upper, stop=lower, num=num_rounds) 16 | 17 | scc = SCC(graph, num_rounds, taus) 18 | scc.fit() 19 | 20 | # How to inspect this? 21 | # this gives the things stored in the 3rd round of the alg. (0 based) 22 | scc.rounds[3].__dict__ 23 | 24 | # the cluster assignment of the 18th point of the dataset. (0 based) 25 | scc.rounds[3].cluster_assignments[18] 26 | 27 | # the id of the parent in the next round of node 2 (0 based) 28 | scc.rounds[3].parents[2] 29 | ``` 30 | 31 | 32 | Citation: 33 | 34 | ``` 35 | @article{scc2020arxiv, 36 | author = {Nicholas Monath and 37 | Avinava Dubey and 38 | Guru Guruganesh and 39 | Manzil Zaheer and 40 | Amr Ahmed and 41 | Andrew McCallum and 42 | G{\"{o}}khan Mergen and 43 | Marc Najork and 44 | Mert Terzihan and 45 | Bryon Tjanaka and 46 | Yuan Wang and 47 | Yuchen Wu}, 48 | title = {Scalable Bottom-Up Hierarchical Clustering}, 49 | journal = {arXiv preprint, 2010.11821}, 50 | year = {2020} 51 | } 52 | ``` 53 | 54 | -------------------------------------------------------------------------------- /examples/demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2021 The authors of SCC All rights reserved. 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | http://www.apache.org/licenses/LICENSE-2.0 7 | 8 | Unless required by applicable law or agreed to in writing, software 9 | distributed under the License is distributed on an "AS IS" BASIS, 10 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | See the License for the specific language governing permissions and 12 | limitations under the License. 13 | """ 14 | 15 | import os 16 | 17 | import numpy as np 18 | from absl import app 19 | from absl import logging 20 | from scipy.sparse import coo_matrix, csr_matrix 21 | from tqdm import tqdm 22 | from scc import SCC 23 | 24 | import numpy.random 25 | import time 26 | 27 | logging.set_verbosity(logging.INFO) 28 | 29 | def sim_fn(XA,XB): 30 | return XA @ XB.T 31 | 32 | def batched_knn(XA, XB, K, batch_size=1000, offset=0): 33 | K = np.minimum(K, XB.shape[0]) 34 | res_i = np.zeros((XA.shape[0], K), dtype=np.int32) 35 | res = np.zeros((XA.shape[0], K), dtype=np.int32) 36 | resd = np.zeros((XA.shape[0], K), dtype=np.float32) 37 | for i in tqdm([x for x in range(0, XA.shape[0], batch_size)]): 38 | istart = i 39 | iend = min(XA.shape[0], i + batch_size) 40 | r = np.zeros((iend-istart, XB.shape[0]), dtype=np.float32) 41 | for j in range(0, XB.shape[0], batch_size): 42 | jstart = j 43 | jend = min(XB.shape[0], j + batch_size) 44 | r[:, jstart:jend] = sim_fn(XA[istart:iend], XB[jstart:jend]) 45 | np.put(r, np.arange(iend - istart)*r.shape[1] + np.arange(istart, iend), np.inf) 46 | res[istart:iend, :] = np.argpartition(r, -K, axis=1)[:, -K:] 47 | resd[istart:iend, :] = r[np.arange(iend-istart)[:, None], res[istart:iend, :]] 48 | res_i[istart:iend, :] = np.repeat(np.expand_dims(np.arange(istart, iend), 1), K, axis=1) + offset 49 | 50 | row = res_i.flatten() 51 | col = res.flatten() 52 | d = resd.flatten() 53 | c = coo_matrix((d[row!=col], (row[row!=col], col[row!=col])), dtype=np.float32,shape=(XB.shape[0], XB.shape[0])) 54 | return c 55 | 56 | def make_symmetric(coo_mat): 57 | lil = coo_mat.tolil() 58 | rows, cols = lil.nonzero() 59 | lil[cols, rows] = lil[rows, cols].maximum(lil[cols, rows]) 60 | return lil.tocoo() 61 | 62 | def graph_from_vectors(vectors, k, batch_size, random_noise=0): 63 | graph = batched_knn(vectors, vectors, k,offset=0, batch_size=batch_size) 64 | graph.data += np.random.random(graph.data.shape) * random_noise 65 | graph = make_symmetric(graph) 66 | return graph 67 | 68 | 69 | def main(argv): 70 | upper = 1.0 71 | lower = 0.1 72 | num_rounds = 50 73 | X = np.random.randn(100,5) 74 | graph = graph_from_vectors(X, k=25, batch_size=5000) 75 | taus = np.geomspace(start=upper, stop=lower, num=num_rounds) 76 | scc = SCC(graph, num_rounds, taus) 77 | scc.fit() 78 | 79 | # How to inspect this? 80 | # this gives the things stored in the 3rd round of the alg. (0 based) 81 | print('Third round of the alg: ') 82 | print(scc.rounds[3].__dict__) 83 | 84 | # the cluster assignment of the 18th point of the dataset. (0 based) 85 | print('cluster assignment of the 18th point of the dataset') 86 | print('scc.rounds[3].cluster_assignments[18]') 87 | print(scc.rounds[3].cluster_assignments[18]) 88 | 89 | # the id of the parent in the next round of node 0 (0 based) 90 | print('the id of the parent in the next round of node 0 (0 based)') 91 | print('scc.rounds[3].parents[0]') 92 | print(scc.rounds[3].parents[0]) 93 | 94 | if __name__ == "__main__": 95 | app.run(main) 96 | -------------------------------------------------------------------------------- /scc/__init__.py: -------------------------------------------------------------------------------- 1 | from scc.scc import SCC 2 | -------------------------------------------------------------------------------- /scc/scc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2021 The authors of SCC All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | import time 18 | 19 | import numpy as np 20 | from absl import logging 21 | from scipy.sparse import coo_matrix, csr_matrix 22 | from scipy.sparse.csgraph import connected_components 23 | from tqdm import tqdm 24 | 25 | logging.set_verbosity(logging.INFO) 26 | 27 | class TreeLevel(object): 28 | 29 | def __init__(self, tau, dist_graph, node_counts, cluster_assignments=None, cc_connection='weak'): 30 | self.dist_graph = dist_graph 31 | self.node_counts = node_counts 32 | # parents[i] gives the id of the node in next round of node i 33 | self.parents = None 34 | self.num_uniq_parents = None 35 | self.nn_edges = None 36 | self.nn_edge_sims = None 37 | self.tau = tau 38 | self.cc_connection=cc_connection 39 | # cluster_assignments[j] gives cluster id of jth point in this round 40 | if cluster_assignments is None: 41 | self.cluster_assignments = np.arange(self.dist_graph.shape[0],dtype=np.int32) 42 | else: 43 | self.cluster_assignments = cluster_assignments 44 | 45 | def perform_round(self): 46 | row = self.dist_graph.row 47 | col = self.dist_graph.col 48 | if row.shape[0] > 1: 49 | oneNN_s = time.time() 50 | # normalize the data by counts 51 | data = self.dist_graph.data.copy() 52 | data /= (self.node_counts[row] * self.node_counts[col]) 53 | to_use_binary = np.logical_and(data >= self.tau, row != col) 54 | g = csr_matrix((data[to_use_binary], 55 | (row[to_use_binary], 56 | col[to_use_binary])), 57 | shape=self.dist_graph.shape, dtype=np.float32) 58 | 59 | nns = g.argmax(axis=1).A.squeeze(1) 60 | nns_sims = g.max(axis=1).A.squeeze(1) 61 | next_round_binary = coo_matrix( 62 | (np.ones(self.dist_graph.shape[0], dtype=np.float32)[nns_sims != 0], 63 | (np.arange(self.dist_graph.shape[0], dtype=np.int32)[nns_sims != 0], 64 | nns[nns_sims != 0])), 65 | shape=self.dist_graph.shape) 66 | next_round_binary.eliminate_zeros() 67 | oneNN_e = time.time() 68 | logging.debug('Nearest Neighbor: Done. nodes %s, edges %s, time %s', self.dist_graph.shape[0], row.shape[0], oneNN_e-oneNN_s) 69 | CC_s = time.time() 70 | num_uniq_parents, parent_map = connected_components(next_round_binary, 71 | directed=True, connection=self.cc_connection) 72 | CC_e = time.time() 73 | logging.debug('Connected Components: Done. nodes %s, edges %s, time %s', next_round_binary.shape[0], next_round_binary.nnz, 74 | CC_e - CC_s) 75 | self.parents = parent_map 76 | self.num_uniq_parents = num_uniq_parents 77 | else: 78 | self.num_uniq_parents = 1 79 | self.parents = np.zeros(self.dist_graph.col.shape[0], dtype=np.int32) 80 | 81 | 82 | def form_next_round(self, next_tau): 83 | if self.num_uniq_parents > 1: 84 | contract_s = time.time() 85 | next_row = self.parents[self.dist_graph.row] 86 | next_col = self.parents[self.dist_graph.col] 87 | i, j = next_row[next_row != next_col], next_col[next_row != next_col] 88 | next_round_dist_sum = coo_matrix((self.dist_graph.data[next_row != next_col], (i,j)), 89 | shape=(self.num_uniq_parents, self.num_uniq_parents), dtype=np.float32) 90 | next_round_dist_sum.sum_duplicates() 91 | 92 | 93 | next_counts_nodes = coo_matrix((self.node_counts, (self.parents, np.zeros_like(self.parents, dtype=np.float32))), 94 | shape=(self.num_uniq_parents, 1), dtype=np.float32) 95 | next_counts_nodes.sum_duplicates() 96 | next_counts_nodes = next_counts_nodes.todense().A[:, 0] 97 | next_cluster_assignments = self.parents[self.cluster_assignments] 98 | contract_t = time.time() 99 | logging.debug('Graph Contract: Done. nodes %s, edges %s, time %s', next_round_dist_sum.shape[0], 100 | next_round_dist_sum.nnz, 101 | contract_t - contract_s) 102 | return TreeLevel(next_tau, dist_graph=next_round_dist_sum, 103 | node_counts=next_counts_nodes, 104 | cluster_assignments=next_cluster_assignments, 105 | cc_connection=self.cc_connection) 106 | else: 107 | return None 108 | 109 | class SCC(object): 110 | def __init__(self, g, num_rounds, taus, cc_connection='weak'): 111 | self.g = g 112 | self.uniq_ids = np.unique(g.row) 113 | self.num_rounds = num_rounds 114 | self.taus = taus 115 | self.rounds = [] 116 | self.cc_connection = cc_connection 117 | 118 | def assignments_by_threshold(self, threshold): 119 | closest_v = np.Inf 120 | closest = 0 121 | for i,r in enumerate(self.rounds): 122 | t = self.taus[i] 123 | v = np.abs(t-threshold) 124 | if v < closest_v: 125 | closest = i 126 | closest_v = v 127 | return self.rounds[closest].cluster_assignments 128 | 129 | def assignments_by_num_clusters(self, k): 130 | closest_v = np.Inf 131 | closest = 0 132 | for i,r in enumerate(self.rounds): 133 | v = np.abs(r.num_uniq_parents-k) 134 | if v < closest_v: 135 | closest = i 136 | closest_v = v 137 | return self.rounds[closest].cluster_assignments 138 | 139 | def fit(self): 140 | st = time.time() 141 | self.rounds.append(TreeLevel(tau=self.taus[0], 142 | dist_graph=self.g.copy(), 143 | node_counts=np.ones(self.g.shape[0]), 144 | cc_connection=self.cc_connection)) 145 | for i in range(self.num_rounds): 146 | logging.debug('round %s', i) 147 | logging.debug('round %s starts with %s nodes', i, self.rounds[i].dist_graph.shape[0]) 148 | self.rounds[i].perform_round() 149 | if i != self.num_rounds-1: 150 | nr = self.rounds[i].form_next_round(self.taus[i+1]) 151 | if nr is not None and nr.dist_graph.nnz > 0 and nr.dist_graph.shape[0] > 1: 152 | logging.debug('round %s ends with %s nodes',i, nr.dist_graph.shape[0]) 153 | self.rounds.append(nr) 154 | else: 155 | break 156 | ent = time.time() 157 | logging.debug('SCC time done in %s', ent-st) 158 | 159 | def write_tsv(self, outfile, labels): 160 | logging.debug('writing tsv tree @ %s', outfile) 161 | with open(outfile, 'w') as fout: 162 | for round_i, round_obj in tqdm(enumerate(self.rounds)): 163 | 164 | for j in range(round_obj.parents.shape[0]): 165 | j_id = '%s_%s' % (round_i, j) 166 | 167 | if round_i != len(self.rounds) - 1: 168 | parent = round_obj.parents[j] 169 | par_j_id = '%s_%s' % (round_i+1, parent) 170 | else: 171 | par_j_id = 'root' 172 | 173 | lbl = 'None' if round_i > 0 else labels[j] 174 | fout.write('%s\t%s\t%s\n' % (j_id, par_j_id, lbl)) 175 | 176 | fout.write('root\tNone\tNone\n') 177 | logging.debug('writing fininshed!') 178 | 179 | 180 | def write_and_prune(self, outfile, labels): 181 | logging.debug('writing tsv tree @ %s', outfile) 182 | skip_map = dict() 183 | with open(outfile, 'w') as fout: 184 | for round_i, round_obj in tqdm(enumerate(self.rounds)): 185 | # check to see how many children have each parent 186 | # for each parent that has a single child. 187 | # set skip_map[parent] = skip_map[child] if child in skip_map else child 188 | import collections 189 | parent_counts = collections.defaultdict(int) 190 | for j in range(round_obj.parents.shape[0]): 191 | if round_i != len(self.rounds) - 1: 192 | parent = round_obj.parents[j] 193 | par_j_id = '%s_%s' % (round_i + 1, parent) 194 | else: 195 | par_j_id = 'root' 196 | parent_counts[par_j_id] += 1 197 | 198 | for j in range(round_obj.parents.shape[0]): 199 | j_id = '%s_%s' % (round_i, j) if round_i > 0 else '%s' % (j) 200 | 201 | if round_i != len(self.rounds) - 1: 202 | parent = round_obj.parents[j] 203 | par_j_id = '%s_%s' % (round_i + 1, parent) 204 | else: 205 | par_j_id = 'root' 206 | 207 | if parent_counts[par_j_id] == 1 and round_i > 0 and par_j_id != 'root': # always write data points 208 | skip_map[par_j_id] = skip_map[j_id] if j_id in skip_map else j_id 209 | else: 210 | lbl = 'None' if round_i > 0 else labels[j] 211 | if par_j_id in skip_map: 212 | par_j_id = skip_map[par_j_id] 213 | if j_id in skip_map: 214 | j_id = skip_map[j_id] 215 | 216 | fout.write('%s\t%s\t%s\n' % (j_id, par_j_id, lbl)) 217 | 218 | fout.write('root\tNone\tNone\n') 219 | logging.debug('writing fininshed!') 220 | 221 | 222 | class Affinity(SCC): 223 | def __init__(self, g, num_rounds): 224 | super(Affinity, self).__init__(g, num_rounds, taus=-np.Inf*np.ones(num_rounds)) 225 | 226 | class RecipNN(SCC): 227 | def __init__(self, g, num_rounds): 228 | super(RecipNN, self).__init__(g, num_rounds, taus=-np.Inf*np.ones(num_rounds), cc_connection='strong') 229 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | 3 | from setuptools import setup 4 | 5 | setup(name='scc', 6 | version='0.0.4', 7 | url='https://github.com/nmonath/scc', 8 | author='Nicholas Monath', 9 | author_email='nmonath@cs.umass.edu', 10 | packages=['scc'], 11 | package_dir={'scc': 'scc'} 12 | ) 13 | --------------------------------------------------------------------------------