├── LICENSE
├── README.md
├── build
│   └── lib
│       ├── hypercluster
│       │   ├── __init__.py
│       │   ├── additional_clusterers.py
│       │   ├── additional_metrics.py
│       │   ├── classes.py
│       │   ├── clustering.py
│       │   ├── constants.py
│       │   ├── evaluations.py
│       │   ├── metrics.py
│       │   ├── tests
│       │   │   ├── __init__.py
│       │   │   ├── test_clustering.py
│       │   │   └── test_visualize.py
│       │   ├── utilities.py
│       │   └── visualize.py
│       └── tests
│           ├── __init__.py
│           ├── test_clustering.py
│           ├── test_snakemake.py
│           └── test_visualize.py
├── dist
│   ├── hypercluster-0.0.1-py3-none-any.whl
│   ├── hypercluster-0.0.1.tar.gz
│   ├── hypercluster-0.1.0-py3-none-any.whl
│   ├── hypercluster-0.1.0.tar.gz
│   ├── hypercluster-0.1.1-py3-none-any.whl
│   ├── hypercluster-0.1.1.tar.gz
│   ├── hypercluster-0.1.10-py3-none-any.whl
│   ├── hypercluster-0.1.10.tar.gz
│   ├── hypercluster-0.1.12-py3-none-any.whl
│   ├── hypercluster-0.1.12.tar.gz
│   ├── hypercluster-0.1.13-py3-none-any.whl
│   ├── hypercluster-0.1.13.tar.gz
│   ├── hypercluster-0.1.2-py3-none-any.whl
│   ├── hypercluster-0.1.2.tar.gz
│   ├── hypercluster-0.1.3-py3-none-any.whl
│   ├── hypercluster-0.1.3.tar.gz
│   ├── hypercluster-0.1.5-py3-none-any.whl
│   ├── hypercluster-0.1.5.tar.gz
│   ├── hypercluster-0.1.6-py3-none-any.whl
│   ├── hypercluster-0.1.6.tar.gz
│   ├── hypercluster-0.1.7-py3-none-any.whl
│   ├── hypercluster-0.1.7.tar.gz
│   ├── hypercluster-0.1.8-py3-none-any.whl
│   ├── hypercluster-0.1.8.tar.gz
│   ├── hypercluster-0.1.9-py3-none-any.whl
│   └── hypercluster-0.1.9.tar.gz
├── docs
│   ├── Makefile
│   ├── conf.py
│   ├── hypercluster.rst
│   ├── index.rst
│   ├── make.bat
│   ├── requirements.txt
│   └── snakemake.rst
├── examples
│   ├── README.md
│   ├── local_TCGA_BRCA_RNAseq
│   │   ├── TCGA_2012_BRCA_data_expression_median_top500_most_variable.txt
│   │   ├── TCGA_BRCA_RNAseq_subtype_clustering.ipynb
│   │   ├── data_clinical_sample.txt
│   │   ├── figures
│   │   │   ├── .ipynb_checkpoints
│   │   │   │   ├── grid.scatter.LeidenCluster-silhouette_score-umaps-checkpoint.pdf
│   │   │   │   └── grid.scatter.louvain-umaps-checkpoint.pdf
│   │   │   ├── brca.rna.evaluations.pdf
│   │   │   ├── clustermap.nmf4-vs-psm50.pdf
│   │   │   ├── colorbar.LeidenCluster-silhouette_score.pdf
│   │   │   ├── colorbar.LouvainCluster-adjusted_rand_score.pdf
│   │   │   ├── colorbar.NMFCluster-silhouette_score.pdf
│   │   │   ├── colorbar.silhouette_score.pdf
│   │   │   ├── grid.scatter.LeidenCluster-silhouette_score-umaps.pdf
│   │   │   ├── grid.scatter.LouvainCluster-adjusted_rand_score-umaps.pdf
│   │   │   ├── grid.scatter.NMFCluster-silhouette_score-umaps.pdf
│   │   │   ├── heatmap.brca-rna.evaluations.PAM50_comp.pdf
│   │   │   ├── heatmap.brca-rna.evaluations.pdf
│   │   │   ├── heatmaps.graphs-clusterers.metrics.pdf
│   │   │   ├── scatter.calinski_harabasz_score.pdf
│   │   │   ├── scatter.davies_bouldin_score.pdf
│   │   │   ├── scatter.largest_cluster_size.pdf
│   │   │   ├── scatter.number_of_clusters.pdf
│   │   │   ├── scatter.pca.various_clusters.pdf
│   │   │   ├── scatter.silhouette_score.pdf
│   │   │   ├── scatter.smallest_cluster_size.pdf
│   │   │   └── scatter.smallest_largest_clusters_ratio.pdf
│   │   └── jupyter-lab-5918844.log
│   └── snakemake_scRNA_example
│       ├── cluster.json
│       ├── config.yml
│       ├── data_table_100genes.csv
│       ├── figures
│       │   ├── heatmaps.graphs-clusterers.metrics.pdf
│       │   ├── pca.best_labels.pdf
│       │   ├── pca.published_labels.pdf
│       │   ├── umap.best_labels.pdf
│       │   └── umap.published_labels.pdf
│       ├── gold_standard.csv
│       ├── params_to_test.yml
│       ├── scRNA-seq_example.ipynb
│       ├── sc_data.csv
│       ├── seurat_igor_meta.tsv
│       └── snakemake_submit.sh
├── hypercluster
│   ├── __init__.py
│   ├── additional_clusterers.py
│   ├── additional_metrics.py
│   ├── classes.py
│   ├── constants.py
│   ├── tests
│   │   ├── __init__.py
│   │   ├── test_clustering.py
│   │   └── test_visualize.py
│   ├── utilities.py
│   └── visualize.py
├── setup.py
└── snakemake
    ├── cluster.json
    ├── config.yml
    ├── hypercluster.smk
    ├── snakemake_submit.sh
    └── test_input.txt
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2019, Lili Blumenberg
2 | All rights reserved.
3 | 
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 |     * Redistributions of source code must retain the above copyright
7 |       notice, this list of conditions and the following disclaimer.
8 |     * Redistributions in binary form must reproduce the above copyright
9 |       notice, this list of conditions and the following disclaimer in the
10 |       documentation and/or other materials provided with the distribution.
11 |     * Neither the name of the nor the
12 |       names of its contributors may be used to endorse or promote products
13 |       derived from this software without specific prior written permission.
14 | 
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY
19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Hypercluster
2 | A package for clustering optimization with sklearn.
3 | 
4 | ### Requirements:
5 | pandas
6 | numpy
7 | scipy
8 | matplotlib
9 | seaborn
10 | scikit-learn
11 | hdbscan
12 | 
13 | Optional:
14 | snakemake
15 | 
16 | 
17 | ### Install
18 | With pip:
19 | ```
20 | pip install hypercluster
21 | ```
22 | 
23 | or with conda:
24 | ```
25 | conda install hypercluster
26 | # or
27 | conda install -c conda-forge -c bioconda hypercluster
28 | ```
29 | If you are having problems installing with conda, try changing your channel priority. A priority of conda-forge > bioconda > defaults is recommended.
30 | To check channel priority: `conda config --get channels`
31 | It should look like:
32 | ```
33 | --add channels 'defaults'   # lowest priority
34 | --add channels 'bioconda'
35 | --add channels 'conda-forge'   # highest priority
36 | ```
37 | 
38 | If it doesn't look like that, try:
39 | ```
40 | conda config --add channels bioconda
41 | conda config --add channels conda-forge
42 | ```
43 | 
44 | ### Docs
45 | https://hypercluster.readthedocs.io/en/latest/index.html
46 | 
47 | It will also be useful to check out sklearn's pages on [clustering](https://scikit-learn.org/stable/modules/clustering.html)
48 | and [evaluation metrics](https://scikit-learn.org/stable/modules/clustering.html#clustering-performance-evaluation).
49 | 
50 | ### Examples
51 | https://github.com/liliblu/hypercluster/tree/dev/examples
52 | 
53 | ### Quickstart with SnakeMake
54 | 
55 | Default `config.yml` and `hypercluster.smk` are in the `snakemake` directory above.
56 | Edit the `config.yml` file directly, or override individual settings with command-line arguments.
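For reference, a minimal sketch of the relevant `config.yml` entries after such an edit (hypothetical values; the default file shipped in the `snakemake` directory contains the full set of options):
```yaml
input_data_folder: /path/to/data   # folder containing the input table(s)
input_data_files:                  # file prefixes to cluster
  - test_data
read_csv_kwargs:                   # per-file kwargs for pandas.read_csv
  test_data:
    index_col: [0]
```
Individual values can also be overridden on the command line: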
57 | ```bash
58 | snakemake -s hypercluster.smk --configfile config.yml --config input_data_files=test_data input_data_folder=.
59 | ```
60 | 
61 | Example editing with python:
62 | ```python
63 | import os
64 | import yaml
65 | 
66 | with open('config.yml', 'r') as fh:
67 |     config = yaml.safe_load(fh)
68 | 
69 | input_data_prefix = 'test_data'
70 | config['input_data_folder'] = os.path.abspath('.')
71 | config['input_data_files'] = [input_data_prefix]
72 | config['read_csv_kwargs'] = {input_data_prefix: {'index_col': [0]}}
73 | 
74 | with open('config.yml', 'w') as fh:
75 |     yaml.dump(config, stream=fh)
76 | ```
77 | 
78 | Then call snakemake.
79 | ```bash
80 | snakemake -s hypercluster.smk
81 | ```
82 | 
83 | Or submit the snakemake scheduler as an sbatch job, e.g. with BigPurple Slurm:
84 | ```bash
85 | module add slurm
86 | sbatch snakemake_submit.sh
87 | ```
88 | Examples for `snakemake_submit.sh` and `cluster.json` are in the scRNA-seq example.
89 | 
90 | ### Quickstart with python
91 | ```python
92 | import pandas as pd
93 | from sklearn.datasets import make_blobs
94 | import hypercluster
95 | 
96 | data, labels = make_blobs()
97 | data = pd.DataFrame(data)
98 | labels = pd.Series(labels, index=data.index, name='labels')
99 | 
100 | # With a single clustering algorithm
101 | clusterer = hypercluster.AutoClusterer()
102 | clusterer.fit(data).evaluate(
103 |     methods=hypercluster.constants.need_ground_truth + hypercluster.constants.inherent_metrics,
104 |     gold_standard=labels
105 | )
106 | 
107 | clusterer.visualize_evaluations()
108 | 
109 | # With a range of algorithms
110 | clusterer = hypercluster.MultiAutoClusterer()
111 | clusterer.fit(data).evaluate(
112 |     methods=hypercluster.constants.need_ground_truth + hypercluster.constants.inherent_metrics,
113 |     gold_standard=labels
114 | )
115 | 
116 | clusterer.visualize_evaluations()
117 | ```
118 | 
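Once `evaluate` has run, the top-scoring labels can be extracted. A minimal sketch (assuming, per `hypercluster.utilities.pick_best_labels`, an evaluation table indexed by metric name):
```python
from hypercluster.utilities import pick_best_labels

# Reuse `clusterer` from the quickstart above: keep the labeling(s) that
# maximize the silhouette score.
best_labels = pick_best_labels(
    clusterer.evaluation_df,
    clusterer.labels_df,
    method='silhouette_score',
    min_or_max='max',
)
```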
--------------------------------------------------------------------------------
/build/lib/hypercluster/__init__.py:
--------------------------------------------------------------------------------
1 | import matplotlib
2 | import seaborn as sns
3 | import hypercluster
4 | from hypercluster import (
5 |     utilities, additional_clusterers, additional_metrics, classes, constants, visualize
6 | )
7 | from hypercluster.classes import AutoClusterer, MultiAutoClusterer
8 | __version__ = '0.1.13'
9 | __all__ = [
10 |     "AutoClusterer",
11 |     "MultiAutoClusterer"
12 | ]
13 | 
14 | matplotlib.rcParams["pdf.fonttype"] = 42
15 | matplotlib.rcParams["ps.fonttype"] = 42
16 | sns.set(font="arial", style="white", color_codes=True, font_scale=1.3)
17 | matplotlib.rcParams.update({"savefig.bbox": "tight"})
--------------------------------------------------------------------------------
/build/lib/hypercluster/additional_clusterers.py:
--------------------------------------------------------------------------------
1 | """
2 | Additional clustering classes can be added here, as long as they have a 'fit' method.
3 | 
4 | 
5 | Attributes:
6 |     HDBSCAN (clustering class): See `hdbscan`_
7 | 
8 | .. _hdbscan:
9 |     https://hdbscan.readthedocs.io/en/latest/basic_hdbscan.html#the-simple-case/
10 | """
11 | from typing import Optional, Iterable
12 | import logging
13 | import numpy as np
14 | import pandas as pd
15 | from scipy.spatial.distance import pdist, squareform
16 | from sklearn.decomposition import NMF
17 | from sklearn.neighbors import NearestNeighbors
18 | from hdbscan import HDBSCAN
19 | from .constants import pdist_adjacency_methods, valid_partition_types
20 | import igraph as ig
21 | import louvain
22 | import leidenalg
23 | 
24 | 
25 | class NMFCluster:
26 |     """Uses non-negative matrix factorization from sklearn to assign clusters to samples, based
27 |     on the maximum membership score of the sample per component.
28 | 
29 |     Args:
30 |         n_clusters: The number of clusters to find. Used as n_components when fitting.
31 |         **nmf_kwargs: Additional keyword arguments to pass to sklearn.decomposition.NMF.
32 |     """
33 |     def __init__(self, n_clusters: int = 8, **nmf_kwargs):
34 | 
35 |         nmf_kwargs['n_components'] = n_clusters
36 | 
37 |         self.NMF = NMF(**nmf_kwargs)
38 |         self.n_clusters = n_clusters
39 | 
40 |     def fit(self, data):
41 |         """If negative numbers are present, creates one data matrix with all negative numbers
42 |         zeroed, and a second data matrix with all positive numbers zeroed and the signs of all
43 |         negative numbers reversed. Concatenates both matrices, resulting in a data matrix twice
44 |         as wide as the original but containing only zeros and positive values, as NMF requires.
45 |         Uses the decomposed matrix W, which is n x k (with n = number of samples and k = number
46 |         of components), to assign cluster membership. Each sample is assigned to the cluster for
47 |         which it has the highest membership score. See `sklearn.decomposition.NMF`_
48 | 
49 |         Args:
50 |             data (DataFrame): Data to fit with samples as rows and features as columns.
51 | 
52 |         Returns:
53 |             self with labels\_ attribute.
54 | 
55 |         .. _sklearn.decomposition.NMF:
56 |             https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html
57 |         """
58 | 
59 |         if np.any(data < 0):
60 |             positive = data.copy()
61 |             positive[positive < 0] = 0
62 |             negative = data.copy()
63 |             negative[negative > 0] = 0
64 |             negative = -negative
65 |             data = pd.concat([positive, negative], axis=1, join='outer')
66 | 
67 |         self.labels_ = pd.DataFrame(self.NMF.fit_transform(data)).idxmax(axis=1).values
68 |         return self
69 | 
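# Worked illustration of the sign-splitting described above (hypothetical
# values, added for clarity): a matrix with negatives becomes a non-negative
# matrix of twice the width, e.g.
#
#     [[ 1, -2],    becomes    [[1, 0, 0, 2],
#      [-3,  4]]                [0, 4, 3, 0]]
#
# where the first two columns hold the positive part and the last two hold
# the magnitudes of the negative part.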
70 | 
71 | class LouvainCluster:
72 |     """Louvain clustering on a graph derived from an adjacency matrix.
73 | 
74 |     Args:
75 |         adjacency_method: Method to use to construct the adjacency matrix, which is used to construct the \
76 |             graph that will be clustered. Valid methods are any metric valid in \
77 |             scipy.spatial.distance.pdist, or MNN, for mutual nearest neighbors, and CNN, for common \
78 |             nearest neighbors. Both use sklearn.neighbors.NearestNeighbors at a given k to calculate \
79 |             NNs. MNN then uses whether points i and j are each other's NNs as edge weights. CNN uses \
80 |             the count of how many NNs i and j have in common as the edge weight.
81 |         k: If using CNN or MNN, k to use to construct the NearestNeighbors matrix.
82 |         resolution: If using 'RBConfigurationVertexPartition' or 'CPMVertexPartition', which \
83 |             resolution to use. If using other partitioners, this is ignored, but any other kwargs for \
84 |             those partitioners can be passed too.
85 |         adjacency_kwargs: Additional keyword arguments to pass to \
86 |             sklearn.neighbors.NearestNeighbors or scipy.spatial.distance.pdist to construct the \
87 |             adjacency matrix.
88 |         partition_type: Which partition to use for louvain clustering, see `louvain-igraph`_ for \
89 |             more info.
90 |         **louvain_kwargs: Additional kwargs to be passed to `find_partition`_
91 | 
92 |     .. _louvain-igraph:
93 |         https://louvain-igraph.readthedocs.io/en/latest/reference.html
94 |     .. _find_partition:
95 |         https://louvain-igraph.readthedocs.io/en/latest/reference.html#louvain.find_partition
96 |     """
97 |     def __init__(
98 |         self,
99 |         adjacency_method: str = 'MNN',
100 |         k: int = 20,
101 |         resolution: float = 0.8,
102 |         adjacency_kwargs: Optional[dict] = None,
103 |         partition_type: str = 'RBConfigurationVertexPartition',
104 |         **louvain_kwargs
105 |     ):
106 | 
107 |         if adjacency_method not in ['MNN', 'CNN'] + pdist_adjacency_methods:
108 |             raise ValueError(
109 |                 'Adjacency method %s invalid. Must be "MNN", "CNN" or a valid metric for '
110 |                 'scipy.spatial.distance.pdist.' % adjacency_method
111 |             )
112 |         if partition_type not in valid_partition_types:
113 |             raise ValueError(
114 |                 'Partition type %s not valid, must be in constants.valid_partition_types' %
115 |                 partition_type
116 |             )
117 |         self.adjacency_method = adjacency_method
118 |         self.k = int(k)
119 |         self.resolution = resolution
120 |         self.adjacency_kwargs = adjacency_kwargs
121 |         self.partition_type = partition_type
122 |         self.louvain_kwargs = louvain_kwargs
123 | 
124 |     def fit(
125 |         self,
126 |         data: pd.DataFrame,
127 |     ):
128 |         adjacency_method = self.adjacency_method
129 |         k = self.k
130 |         resolution = self.resolution
131 |         adjacency_kwargs = self.adjacency_kwargs
132 |         louvain_kwargs = self.louvain_kwargs
133 |         partition_type = self.partition_type
134 |         if k >= len(data):
135 |             logging.warning(
136 |                 'k was set to %s, but there are only %s samples; setting k to %s.'
137 |                 % (k, len(data), len(data) - 1)
138 |             )
139 |             k = len(data) - 1
140 |         if (adjacency_method == 'MNN') | (adjacency_method == 'CNN'):
141 |             if adjacency_kwargs is None:
142 |                 adjacency_kwargs = {}
143 |             adjacency_kwargs['n_neighbors'] = adjacency_kwargs.get('n_neighbors', k)
144 |             nns = NearestNeighbors(**adjacency_kwargs)
145 |             nns.fit(data)
146 |             adjacency_mat = nns.kneighbors_graph(data)
147 |             if adjacency_method == 'MNN':
148 |                 adjacency_mat = adjacency_mat.multiply(adjacency_mat.transpose())
149 |             if adjacency_method == 'CNN':
150 |                 adjacency_mat = adjacency_mat * adjacency_mat.transpose()
151 |         elif adjacency_method in pdist_adjacency_methods:
152 |             adjacency_mat = pdist(data, metric=adjacency_method, **(adjacency_kwargs or {}))
153 |         adjacency_mat = squareform(adjacency_mat) if adjacency_mat.ndim == 1 else adjacency_mat.toarray()
154 |         if louvain_kwargs is None:
155 |             louvain_kwargs = {}
156 |         g = ig.Graph.Weighted_Adjacency(adjacency_mat.tolist())
157 | 
158 |         if partition_type in ['RBConfigurationVertexPartition', 'CPMVertexPartition']:
159 |             louvain_kwargs['resolution_parameter'] = resolution
160 | 
161 |         labels = louvain.find_partition(g, getattr(louvain, partition_type), **louvain_kwargs)
162 |         labels = pd.Series({v: i for i in range(len(labels)) for v in labels[i]}).sort_index()
163 |         if labels.is_unique or (len(labels.unique()) == 1):
164 |             labels = pd.Series([-1 for i in range(len(labels))])
165 |         labels = labels.values
166 |         self.labels_ = labels
167 |         return self
168 | 
169 | 
170 | class LeidenCluster:
171 |     """Leiden clustering on a graph derived from an adjacency matrix. See `reference`_ for more info
172 | 
173 |     Args:
174 |         adjacency_method: Method to use to construct the adjacency matrix, which is used to construct the \
175 |             graph that will be clustered. Valid methods are any metric valid in \
176 |             scipy.spatial.distance.pdist, or SNN, for shared nearest neighbors, and CNN, for common \
177 |             nearest neighbors. Both use sklearn.neighbors.NearestNeighbors at a given k to calculate \
178 |             NNs. SNN then uses whether points i and j are each other's NNs as edge weights. CNN uses \
179 |             the count of how many NNs i and j have in common as the edge weight.
180 |         k: If using CNN or SNN, k to use to construct the NearestNeighbors matrix.
181 |         resolution: If using 'RBConfigurationVertexPartition' or 'CPMVertexPartition', which \
182 |             resolution to use. If using other partitioners, this is ignored, but any other kwargs for \
183 |             those partitioners can be passed too.
184 |         adjacency_kwargs: Additional keyword arguments to pass to \
185 |             sklearn.neighbors.NearestNeighbors or scipy.spatial.distance.pdist to construct the \
186 |             adjacency matrix.
187 |         partition_type: Which partition to use for leiden clustering, see `leidenalg`_ for \
188 |             more info.
189 |         **leiden_kwargs: Additional kwargs to be passed to `find_partition`_
190 |     .. _reference:
191 |         https://www.nature.com/articles/s41598-019-41695-z
192 |     .. _leidenalg:
193 |         https://leidenalg.readthedocs.io/en/latest/reference.html
194 |     .. _find_partition:
195 |         https://leidenalg.readthedocs.io/en/latest/reference.html#leidenalg.find_partition
196 |     """
197 |     def __init__(
198 |         self,
199 |         adjacency_method: str = 'SNN',
200 |         k: int = 20,
201 |         resolution: float = 0.8,
202 |         adjacency_kwargs: Optional[dict] = None,
203 |         partition_type: str = 'RBConfigurationVertexPartition',
204 |         **leiden_kwargs
205 |     ):
206 | 
207 |         self.adjacency_method = adjacency_method
208 |         self.k = int(k)
209 |         self.resolution = resolution
210 |         self.adjacency_kwargs = adjacency_kwargs
211 |         self.partition_type = partition_type
212 |         self.leiden_kwargs = leiden_kwargs
213 | 
214 |     def fit(
215 |         self,
216 |         data: pd.DataFrame,
217 |     ):
218 | 
219 |         adjacency_method = self.adjacency_method
220 |         k = self.k
221 |         resolution = self.resolution
222 |         adjacency_kwargs = self.adjacency_kwargs
223 |         leiden_kwargs = self.leiden_kwargs
224 |         partition_type = self.partition_type
225 |         if k >= len(data):
226 |             logging.warning(
227 |                 'k was set to %s, but there are only %s samples; setting k to %s.'
228 |                 % (k, len(data), len(data) - 1)
229 |             )
230 |             k = len(data) - 1
231 |         if (adjacency_method == 'SNN') | (adjacency_method == 'CNN'):
232 |             if adjacency_kwargs is None:
233 |                 adjacency_kwargs = {}
234 |             adjacency_kwargs['n_neighbors'] = adjacency_kwargs.get('n_neighbors', k)
235 |             nns = NearestNeighbors(**adjacency_kwargs)
236 |             nns.fit(data)
237 |             adjacency_mat = nns.kneighbors_graph(data)
238 |             if adjacency_method == 'SNN':
239 |                 adjacency_mat = adjacency_mat.multiply(adjacency_mat.transpose())
240 |             if adjacency_method == 'CNN':
241 |                 adjacency_mat = adjacency_mat * adjacency_mat.transpose()
242 |         elif adjacency_method in pdist_adjacency_methods:
243 |             adjacency_mat = pdist(data, metric=adjacency_method, **(adjacency_kwargs or {}))
244 |         adjacency_mat = squareform(adjacency_mat) if adjacency_mat.ndim == 1 else adjacency_mat.toarray()
245 |         if leiden_kwargs is None:
246 |             leiden_kwargs = {}
247 |         g = ig.Graph.Weighted_Adjacency(adjacency_mat.tolist())
248 | 
249 |         if partition_type in ['RBConfigurationVertexPartition', 'CPMVertexPartition']:
250 |             leiden_kwargs['resolution_parameter'] = resolution
251 | 
252 |         labels = leidenalg.find_partition(g, getattr(leidenalg, partition_type), **leiden_kwargs)
253 |         labels = pd.Series({v: i for i in range(len(labels)) for v in labels[i]}).sort_index()
254 |         if labels.is_unique or (len(labels.unique()) == 1):
255 |             labels = pd.Series([-1 for i in range(len(labels))])
256 |         labels = labels.values
257 |         self.labels_ = labels
258 |         return self
259 | 
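# A minimal usage sketch for the two graph clusterers above (illustrative
# only; assumes the igraph, louvain and leidenalg dependencies imported at
# the top of this module are installed):
#
# >>> import pandas as pd
# >>> from sklearn.datasets import make_blobs
# >>> data = pd.DataFrame(make_blobs(n_samples=100, random_state=0)[0])
# >>> LouvainCluster(adjacency_method='MNN', k=10).fit(data).labels_
# >>> LeidenCluster(adjacency_method='SNN', k=10).fit(data).labels_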
--------------------------------------------------------------------------------
/build/lib/hypercluster/additional_metrics.py:
--------------------------------------------------------------------------------
1 | from typing import Iterable, Optional
2 | from collections import Counter
3 | from pandas import DataFrame
4 | from scipy.cluster.hierarchy import linkage, cophenet
5 | from scipy.spatial.distance import pdist
6 | 
7 | __doc__ = (
8 |     "More functions for evaluating clustering results. Additional metric evaluations can "
9 |     "be added here, as long as the second argument is the labels to evaluate"
10 | )
11 | 
12 | 
13 | def number_clustered(_, labels: Iterable) -> float:
14 |     """Returns the number of clustered samples.
15 | 
16 |     Args:
17 |         _: Dummy, pass anything or None.
18 |         labels (Iterable): Vector of sample labels.
19 | 
20 |     Returns (int):
21 |         The number of clustered labels.
22 | 
23 |     """
24 |     return (labels != -1).sum()
25 | 
26 | 
27 | def smallest_largest_clusters_ratio(_, labels: Iterable) -> float:
28 |     """Number in the smallest cluster over the number in the largest cluster.
29 | 
30 |     Args:
31 |         _: Dummy, pass anything or None.
32 |         labels (Iterable): Vector of sample labels.
33 | 
34 |     Returns (float):
35 |         Ratio of number of members in smallest over largest cluster.
36 | 
37 |     """
38 |     counts = Counter(labels)
39 |     counts.pop(-1, None)
40 |     return min(counts.values()) / max(counts.values())
41 | 
42 | 
43 | def smallest_cluster_ratio(_, labels: Iterable) -> float:
44 |     """Number in the smallest cluster over the total samples.
45 | 
46 |     Args:
47 |         _: Dummy, pass anything or None.
48 |         labels (Iterable): Vector of sample labels.
49 | 
50 |     Returns (float):
51 |         Ratio of number of members in smallest over all samples.
52 | 
53 |     """
54 |     counts = Counter(labels)
55 |     counts.pop(-1, None)
56 |     return min(counts.values()) / len(labels)
57 | 
58 | 
59 | def number_of_clusters(_, labels: Iterable) -> float:
60 |     """Number of total clusters.
61 | 
62 |     Args:
63 |         _: Dummy, pass anything or None
64 |         labels (Iterable): Vector of sample labels.
65 | 
66 |     Returns (int):
67 |         Number of clusters.
68 | 
69 |     """
70 |     return len(Counter(labels))
71 | 
72 | 
73 | def smallest_cluster_size(_, labels: Iterable) -> float:
74 |     """Number in smallest cluster
75 | 
76 |     Args:
77 |         _: Dummy, pass anything or None
78 |         labels (Iterable): Vector of sample labels.
79 | 
80 |     Returns (int):
81 |         Number of samples in smallest cluster.
82 | 
83 |     """
84 |     return min(Counter(labels).values())
85 | 
86 | 
87 | def largest_cluster_size(_, labels: Iterable) -> float:
88 |     """Number in largest cluster
89 | 
90 |     Args:
91 |         _: Dummy, pass anything or None
92 |         labels (Iterable): Vector of sample labels.
93 | 
94 |     Returns (int):
95 |         Number of samples in largest cluster.
96 | 
97 |     """
98 |     return max(Counter(labels).values())
99 | 
--------------------------------------------------------------------------------
/build/lib/hypercluster/clustering.py:
--------------------------------------------------------------------------------
1 | from sklearn.cluster import *
2 | from sklearn.metrics import *
3 | from .metrics import *
4 | from hdbscan import HDBSCAN
5 | from pandas import DataFrame
6 | import pandas as pd
7 | import numpy as np
8 | import logging
9 | from typing import Optional, Iterable, Dict, Union
10 | from itertools import product
11 | from .constants import *
12 | 
13 | 
14 | def calculate_row_weights(
15 |     row: Iterable, param_weights: dict, vars_to_optimize: dict
16 | ) -> float:
17 |     """
18 |     Used to select random rows of parameter combinations using individual parameter weights.
19 | 
20 |     Args:
21 |         row: Series of parameters, with parameter names as index.
22 |         param_weights: Dictionary of str: dictionaries. Ex format - {'parameter_name':{'param_option_1':0.5, 'param_option_2':0.5}}.
23 |         vars_to_optimize: Dictionary with possibilities for different parameters. Ex format - {'parameter_name':[1, 2, 3, 4, 5]}.
24 | 
25 |     Returns:
26 |         Float representing the probability of seeing that combination of parameters,
27 |         given their individual weights.
28 | 
29 |     """
30 |     weights = []
31 |     for var_lab, val in row.to_dict().items():
32 |         weights.append(
33 |             param_weights.get(var_lab, {}).get(
34 |                 val, (1 / len(vars_to_optimize[var_lab]))
35 |             )
36 |         )
37 |     # TODO if probs are given to some options and not others, split the remaining probability,
38 |     # don't just give equal prob.
39 |     return np.prod(weights)
40 | 
41 | 
42 | def cluster(clusterer_name: str, data: DataFrame, params: dict = {}):
43 |     """
44 |     Runs a given clusterer with a given set of parameters.
45 | 
46 |     Args:
47 |         clusterer_name: String name of clusterer, for options see hypercluster.constants.variables_to_optimize.
48 |         data: Dataframe with elements to cluster as index and examples as columns.
49 |         params: Dictionary of parameter names and values to feed into clusterer. Default {}.
50 | 
51 |     Returns:
52 |         Instance of the clusterer fit with the data provided.
53 |     """
54 |     clusterer = eval(clusterer_name)(**params)
55 |     return clusterer.fit(data)
56 | 
57 | 
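# Sketch of calling `cluster` directly (the clusterer classes, e.g. KMeans,
# come from the star-imports at the top of this module; `df` is any
# samples-by-features DataFrame):
#
# >>> cluster('KMeans', df, {'n_clusters': 3}).labels_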
58 | class AutoClusterer:
59 |     """
60 |     Main hypercluster object.
61 |     Args:
62 |         clusterer_name: String name of clusterer, for options see
63 |             hypercluster.constants.variables_to_optimize.
64 |         params_to_optimize: Dictionary with possibilities for different parameters. Ex format - {
65 |             'parameter_name':[1, 2, 3, 4, 5]}. If None, will optimize default selection, given in
66 |             hypercluster.constants.variables_to_optimize. Default None.
67 |         random_search: Whether to search a random selection of possible parameters or all
68 |             possibilities. Default True.
69 |         random_search_fraction: If random_search is True, what fraction of the possible
70 |             parameters to search. Default 0.5.
71 |         param_weights: Dictionary of str: dictionaries. Ex format - {'parameter_name':{
72 |             'param_option_1':0.5, 'param_option_2':0.5}}.
73 |         clus_kwargs: Additional kwargs to pass into given clusterer, but not to be optimized.
74 |             Default None.
75 |     """
76 | 
77 |     def __init__(
78 |         self,
79 |         clusterer_name: Optional[str] = "HDBSCAN",
80 |         params_to_optimize: Optional[dict] = None,
81 |         random_search: bool = True,
82 |         random_search_fraction: float = 0.5,
83 |         param_weights: dict = {},
84 |         clus_kwargs: Optional[dict] = None,
85 |     ):
86 |         self.clusterer_name = clusterer_name
87 |         self.params_to_optimize = params_to_optimize
88 |         self.random_search = random_search
89 |         self.random_search_fraction = random_search_fraction
90 |         self.param_weights = param_weights
91 |         self.clus_kwargs = clus_kwargs
92 | 
93 |         if self.params_to_optimize is None:
94 |             self.params_to_optimize = variables_to_optimize[clusterer_name]
95 |         if self.clus_kwargs is None:
96 |             self.clus_kwargs = {}
97 | 
98 |         self.labels_ = None
99 |         self.static_kwargs = None
100 |         self.total_possible_conditions = None
101 |         self.param_sets = None
102 |         self.generate_param_sets()
103 |         self.labels_ = None
104 | 
105 |     def generate_param_sets(self):
106 |         """
107 |         Uses info from init to make a Dataframe of all parameter sets that will be tried.
108 |         Returns:
109 |             self
110 |         """
111 |         conditions = 1
112 |         vars_to_optimize = {}
113 |         static_kwargs = {}
114 |         for parameter_name, possible_values in self.params_to_optimize.items():
115 |             if len(possible_values) == 1:
116 |                 static_kwargs[parameter_name] = possible_values[0]
117 |             elif len(possible_values) > 1:
118 |                 vars_to_optimize[parameter_name] = possible_values
119 |                 conditions *= len(possible_values)
120 |             else:
121 |                 logging.error(
122 |                     "Parameter %s was given no possibilities. Will continue with default parameter."
123 |                     % parameter_name
124 |                 )
125 | 
126 |         self.static_kwargs = static_kwargs
127 |         self.total_possible_conditions = conditions
128 | 
129 |         all_combos = list(product(*vars_to_optimize.values()))
130 |         parameters = pd.DataFrame(
131 |             all_combos,
132 |             columns=list(vars_to_optimize.keys()),
133 |         )
134 | 
135 |         if self.random_search and len(parameters) > 1:
136 |             will_search = int(conditions * self.random_search_fraction)
137 | 
138 |             # calculates probability of getting a particular set of parameters, given the probs of
139 |             # all the individual params. If a prob isn't set, give uniform probability to each
140 |             # parameter.
141 |             if self.param_weights:
142 |                 weights = parameters.apply(
143 |                     lambda param_set: calculate_row_weights(
144 |                         param_set, self.param_weights, vars_to_optimize
145 |                     ),
146 |                     axis=1,
147 |                 )
148 |             else:
149 |                 weights = None
150 |             parameters = parameters.sample(will_search, weights=weights)
151 | 
152 |         for col in static_kwargs.keys():
153 |             parameters[col] = static_kwargs[col]
154 | 
155 |         logging.info(
156 |             "For clusterer %s, testing %s out of %s possible conditions"
157 |             % (self.clusterer_name, len(parameters), conditions)
158 |         )
159 | 
160 |         self.param_sets = parameters
161 |         return self
162 | 
163 |     def fit(self, data: DataFrame):
164 |         """
165 |         Fits clusterer to data with each parameter set.
166 |         Args:
167 |             data: Dataframe with elements to cluster as index and examples as columns.
168 | 
169 |         Returns:
170 |             self with self.labels_ assigned
171 |         """
172 | 
173 |         if self.param_sets.shape == (0, 0):
174 |             self.labels_ = pd.DataFrame(
175 |                 cluster(self.clusterer_name, data).labels_,
176 |                 columns=["default_parameters"],
177 |                 index=data.index,
178 |             )
179 |             return self
180 |         label_results = pd.DataFrame(columns=self.param_sets.columns.union(data.index))
181 |         for i, row in self.param_sets.iterrows():
182 |             single_params = row.to_dict()
183 |             labels = cluster(self.clusterer_name, data, single_params).labels_
184 | 
185 |             label_row = dict(zip(data.index, labels))
186 |             label_row.update(single_params)
187 |             label_results = pd.concat([label_results, pd.DataFrame([label_row])], ignore_index=True)
188 |             logging.info(
189 |                 "%s - %s of conditions done" % (i, (i / self.total_possible_conditions))
190 |             )
191 |         if len(self.param_sets.columns) > 0:
192 |             label_results = label_results.set_index(
193 |                 list(self.param_sets.columns)
194 |             ).transpose()
195 | 
196 |         self.labels_ = label_results
197 |         return self
198 | 
199 | 
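# Illustrative walk-through with a hypothetical parameter grid (`df` is any
# samples-by-features DataFrame):
#
# >>> ac = AutoClusterer('KMeans', params_to_optimize={'n_clusters': [2, 3, 4]}, random_search=False)
# >>> ac.param_sets               # one row per condition that will be tried
# >>> labels = ac.fit(df).labels_  # rows are samples, columns are conditions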
200 | def evaluate_results(
201 |     labels: Iterable,
202 |     method: str = "silhouette_score",
203 |     data: Optional[DataFrame] = None,
204 |     gold_standard: Optional[Iterable] = None,
205 |     metric_kwargs: Optional[dict] = None,
206 | ) -> float:
207 |     """
208 |     Uses a given metric to evaluate clustering results.
209 |     Args:
210 |         labels: Series of labels.
211 |         method: Str name of evaluation to use. For options see hypercluster.constants.inherent_metrics and need_ground_truth. Default is silhouette.
212 |         data: If using an inherent metric, must provide Dataframe of original data used to
213 |             cluster. For options see hypercluster.constants.inherent_metrics.
214 |         gold_standard: If using a metric that compares to ground truth, must provide a set of
215 |             gold standard labels. For options see hypercluster.constants.need_ground_truth.
216 |         metric_kwargs: Additional kwargs to use in evaluation.
217 | 
218 |     Returns:
219 |         Float value of the evaluation metric, or np.nan if there are fewer than two clusters.
220 |     """
221 | 
222 |     if metric_kwargs is None:
223 |         metric_kwargs = {}
224 | 
225 |     if method in need_ground_truth:
226 |         if gold_standard is None:
227 |             raise ValueError(
228 |                 "Chosen evaluation metric %s requires gold standard set." % method
229 |             )
230 |         clustered = (gold_standard != -1) & (labels != -1)
231 |         compare_to = gold_standard[clustered]
232 | 
233 |     elif method in inherent_metrics:
234 |         if data is None:
235 |             raise ValueError(
236 |                 "Chosen evaluation metric %s requires data input." % method
237 |             )
238 |         clustered = labels != -1
239 |         compare_to = data.loc[clustered]
240 |     else:
241 |         raise ValueError("Evaluation metric %s not valid" % method)
242 | 
243 |     if len(labels[clustered].value_counts()) < 2:
244 |         logging.error(
245 |             "Condition %s does not have at least two clusters, skipping" % labels.name
246 |         )
247 |         return np.nan
248 | 
249 |     return eval(method)(compare_to, labels[clustered], **metric_kwargs)
250 | 
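# Sketch: score one column of an AutoClusterer's labels against the original
# data (`labels` and `df` as in the AutoClusterer example above):
#
# >>> evaluate_results(labels[labels.columns[0]], method='silhouette_score', data=df)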
251 | 
252 | def optimize_clustering(
253 |     data,
254 |     algorithm_names: Union[Iterable, str] = variables_to_optimize.keys(),
255 |     algorithm_parameters: Optional[Dict[str, dict]] = None,
256 |     random_search: bool = True,
257 |     random_search_fraction: float = 0.5,
258 |     algorithm_param_weights: Optional[dict] = None,
259 |     algorithm_clus_kwargs: Optional[dict] = None,
260 |     evaluation_methods: Optional[list] = None,
261 |     gold_standard: Optional[Iterable] = None,
262 |     metric_kwargs: Optional[dict] = None,
263 | ) -> tuple:
264 |     """
265 |     Runs through many clusterers and parameters to get best clustering labels.
266 |     Args:
267 |         data: Dataframe with elements to cluster as index and examples as columns.
268 |         algorithm_names: Which clusterers to try. Default is in variables_to_optimize. Can also
269 |             put 'slow', 'fast' or 'fastest' for a subset of clusterers. See hypercluster.constants.categories.
270 |         algorithm_parameters: Dictionary of str:dict, with parameters to optimize for each clusterer. Ex. structure: {'clusterer1':{'param1':['opt1', 'opt2', 'opt3']}}.
271 |         random_search: Whether to search a random selection of possible parameters or all possibilities. Default True.
272 |         random_search_fraction: If random_search is True, what fraction of the possible parameters to search, applied to all clusterers. Default 0.5.
273 |         algorithm_param_weights: Dictionary of str: dictionaries. Ex format - {'clusterer_name': {'parameter_name':{'param_option_1':0.5, 'param_option_2':0.5}}}.
274 |         algorithm_clus_kwargs: Dictionary of additional kwargs per clusterer.
275 |         evaluation_methods: List of str names of evaluation metrics to use. For options see
276 |             hypercluster.constants.inherent_metrics and need_ground_truth. Default is inherent_metrics.
277 |         gold_standard: If using an evaluation that needs ground truth, must provide ground truth labels. For options see hypercluster.constants.need_ground_truth.
278 |         metric_kwargs: Additional evaluation metric kwargs.
279 | 
280 |     Returns:
281 |         DataFrame of evaluations, DataFrame of all labels, and dictionary of labels DataFrames per clusterer.
282 |     """
283 | 
284 |     if algorithm_param_weights is None:
285 |         algorithm_param_weights = {}
286 |     if algorithm_clus_kwargs is None:
287 |         algorithm_clus_kwargs = {}
288 |     if algorithm_parameters is None:
289 |         algorithm_parameters = {}
290 |     if metric_kwargs is None:
291 |         metric_kwargs = {}
292 |     if evaluation_methods is None:
293 |         evaluation_methods = inherent_metrics
294 | 
295 |     if algorithm_names in list(categories.keys()):
296 |         algorithm_names = categories[algorithm_names]
297 | 
298 |     clustering_labels = {}
299 |     clustering_labels_df = pd.DataFrame()
300 |     for clusterer_name in algorithm_names:
301 |         label_df = (
302 |             AutoClusterer(
303 |                 clusterer_name=clusterer_name,
304 |                 params_to_optimize=algorithm_parameters.get(clusterer_name, None),
305 |                 random_search=random_search,
306 |                 random_search_fraction=random_search_fraction,
307 |                 param_weights=algorithm_param_weights.get(clusterer_name, None),
308 |                 clus_kwargs=algorithm_clus_kwargs.get(clusterer_name, None),
309 |             )
310 |             .fit(data)
311 |             .labels_
312 |         )
313 |         label_df.index = pd.MultiIndex.from_tuples(label_df.index)
314 |         clustering_labels[clusterer_name] = label_df
315 | 
316 |         # Put all parameter labels into 1 for a big df
317 |         label_df = label_df.transpose()
318 |         cols_for_labels = label_df.index.to_frame()
319 | 
320 |         inds = cols_for_labels.apply(
321 |             lambda row: param_delim.join(
322 |                 [clusterer_name]
323 |                 + ["%s%s%s" % (k, val_delim, v) for k, v in row.to_dict().items()]
324 |             ),
325 |             axis=1,
326 |         )
327 | 
328 |         label_df.index = inds
329 |         label_df = label_df.transpose()
330 |         clustering_labels_df = pd.concat(
331 |             [clustering_labels_df, label_df], join="outer", axis=1
332 |         )
333 | 
334 |     evaluation_results_df = pd.DataFrame({"methods": evaluation_methods})
335 |     for col in clustering_labels_df.columns:
336 |         evaluation_results_df[col] = evaluation_results_df.apply(
337 |             lambda row: evaluate_results(
338 |                 clustering_labels_df[col],
339 |                 method=row["methods"],
340 |                 data=data,
341 |                 gold_standard=gold_standard,
342 |                 metric_kwargs=metric_kwargs.get(row["methods"], None),
343 |             ),
344 |             axis=1,
345 |         )
346 | 
347 |     return evaluation_results_df, clustering_labels_df, clustering_labels
348 | 
--------------------------------------------------------------------------------
/build/lib/hypercluster/constants.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | __doc__ = """
5 | Attributes:
6 |     param_delim: delimiter between hyperparameters for snakemake file labels and labels DataFrame \
7 | columns.
8 |     val_delim: delimiter between hyperparameter label and value for snakemake file labels and \
9 | labels DataFrame columns.
10 |     categories: Convenient groups of clusterers to use. If all samples need to be clustered, \
11 | 'partitioners' is a good choice. If there are millions of samples, 'fastest' might be a good \
12 | choice.
13 |     variables_to_optimize: Some default hyperparameters to optimize and value ranges for a \
14 | selection of commonly used clustering algorithms from sklearn. Used as defaults for \
15 | clustering.AutoClusterer and clustering.optimize_clustering.
16 |     need_ground_truth: list of sklearn metrics that need ground truth labeling. 
\ 17 | "adjusted_rand_score", "adjusted_mutual_info_score", "homogeneity_score", \ 18 | "completeness_score", "fowlkes_mallows_score", "mutual_info_score", "v_measure_score" 19 | inherent_metrics: list of sklearn metrics that need original data for calculation. \ 20 | "silhouette_score", "calinski_harabasz_score", "davies_bouldin_score", \ 21 | "smallest_largest_clusters_ratio", "number_of_clusters", "smallest_cluster_size", \ 22 | "largest_cluster_size" 23 | min_or_max: establishing whether each sklearn metric is better when minimized or maximized for \ 24 | clustering.pick_best_labels. 25 | """ 26 | param_delim = ";" 27 | val_delim = "-" 28 | 29 | slow = ["AffinityPropagation", "MeanShift"] 30 | fast = ["KMeans", "OPTICS", "HDBSCAN"] 31 | fastest = ["MiniBatchKMeans"] 32 | partitioners = ["AffinityPropagation", "MeanShift", "KMeans", "MiniBatchKMeans"] 33 | clusterers = ["OPTICS", "HDBSCAN"] 34 | categories = { 35 | "slow": slow, 36 | "fast": fast, 37 | "fastest": fastest, 38 | "partitioning": partitioners, 39 | "clustering": clusterers, 40 | } 41 | 42 | min_cluster_size = [i for i in range(2, 17, 2)] 43 | n_clusters = [i for i in range(2, 41)] 44 | damping = [i / 100 for i in range(55, 95, 5)] 45 | resolutions = [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6] 46 | knn = [20, 30, 60] 47 | 48 | 49 | variables_to_optimize = { 50 | "HDBSCAN": dict(min_cluster_size=min_cluster_size), 51 | "KMeans": dict(n_clusters=n_clusters), 52 | "MiniBatchKMeans": dict(n_clusters=n_clusters), 53 | "AffinityPropagation": dict(damping=damping), 54 | "MeanShift": dict(cluster_all=[False]), 55 | "OPTICS": dict(min_samples=min_cluster_size), 56 | "NMFCluster": dict(n_clusters=n_clusters), 57 | "LouvainCluster": dict(resolution=resolutions, k=knn), 58 | "LeidenCluster": dict(resolution=resolutions, k=knn), 59 | } 60 | 61 | 62 | need_ground_truth = [ 63 | "adjusted_rand_score", 64 | "adjusted_mutual_info_score", 65 | "homogeneity_score", 66 | "completeness_score", 67 | "fowlkes_mallows_score", 68 | "mutual_info_score", 69 | "v_measure_score", 70 | ] 71 | 72 | inherent_metrics = [ 73 | "silhouette_score", 74 | "calinski_harabasz_score", 75 | "davies_bouldin_score", 76 | "smallest_largest_clusters_ratio", 77 | "number_of_clusters", 78 | "smallest_cluster_size", 79 | "largest_cluster_size" 80 | ] 81 | 82 | min_or_max = { 83 | "adjusted_rand_score": 'max', 84 | "adjusted_mutual_info_score": 'max', 85 | "homogeneity_score": 'max', 86 | "completeness_score": 'max', 87 | "fowlkes_mallows_score": 'max', 88 | "silhouette_score": 'max', 89 | "calinski_harabasz_score": 'max', 90 | "davies_bouldin_score": 'min', 91 | "mutual_info_score": 'max', 92 | "v_measure_score": 'max', 93 | } 94 | 95 | pdist_adjacency_methods = [ 96 | 'braycurtis', 97 | 'canberra', 98 | 'chebyshev', 99 | 'cityblock', 100 | 'correlation', 101 | 'cosine', 102 | 'dice', 103 | 'euclidean', 104 | 'hamming', 105 | 'jaccard', 106 | 'jensenshannon', 107 | 'kulsinski', 108 | 'mahalanobis', 109 | 'matching', 110 | 'minkowski', 111 | 'rogerstanimoto', 112 | 'russellrao', 113 | 'seuclidean', 114 | 'sokalmichener', 115 | 'sokalsneath', 116 | 'sqeuclidean', 117 | 'yule' 118 | ] 119 | 120 | 121 | valid_partition_types = [ 122 | 'RBConfigurationVertexPartition', 123 | 'ModularityVertexPartition', 124 | 'RBERVertexPartition', 125 | 'CPMVertexPartition', 126 | 'SignificanceVertexPartition', 127 | 'SurpriseVertexPartition' 128 | ] -------------------------------------------------------------------------------- /build/lib/hypercluster/evaluations.py: 
-------------------------------------------------------------------------------- 1 | #TODO add label count evals here 2 | 3 | #TODO fn that grabs best params from col/yml in smk output, and feeds into clusterer. 4 | -------------------------------------------------------------------------------- /build/lib/hypercluster/metrics.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable 2 | from collections import Counter 3 | 4 | __doc__ = "More functions for evaluating clustering results." 5 | 6 | 7 | def number_clustered(_, labels: Iterable) -> float: 8 | return len(labels) 9 | 10 | 11 | def smallest_largest_clusters_ratio(_, labels: Iterable) -> float: 12 | counts = Counter(labels) 13 | counts.pop(-1, None) 14 | smallest = min(counts.values()) 15 | largest = max(counts.values()) 16 | return smallest / largest 17 | 18 | 19 | def smallest_cluster_ratio(_, labels: Iterable) -> float: 20 | counts = Counter(labels) 21 | counts.pop(-1, None) 22 | return min(counts.values()) / len(labels) 23 | -------------------------------------------------------------------------------- /build/lib/hypercluster/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/build/lib/hypercluster/tests/__init__.py -------------------------------------------------------------------------------- /build/lib/hypercluster/tests/test_clustering.py: -------------------------------------------------------------------------------- 1 | from hypercluster import utilities 2 | from hypercluster.constants import * 3 | import hypercluster 4 | import pandas as pd 5 | import numpy as np 6 | 7 | 8 | test_data = pd.DataFrame( 9 | np.array( 10 | [[1, 2], [-1.8, 4], [1, -0.5], 11 | [10, 2], [-10, 4], [10, 0], 12 | [np.nan, 5], [3.2, np.nan], [0, 14], 13 | [-16.4, 3.67], [13.22, -3], [3.3, np.nan], 14 | [42, np.nan], [-8, 2], [1.2, 12], 15 | [np.nan, 2.1], [0.25, np.nan], [0.1, 1.11], 16 | [-44, 0], [-0.22, -0.11], [2.34, 6.7], 17 | [-10, np.nan], [-2.3, -2.5], [np.nan, 0], 18 | [np.nan, 22], [8.6, -7.5], [0, 14], 19 | [-6.4, 23.67], [-3.22, 3], [np.nan, np.nan], 20 | [-20, 2.01], [0.25, -.25], [0.455, 0.233], 21 | [np.nan, -0.89], [19, np.nan], [np.nan, np.nan], 22 | [-29, 3.6], [-13, -3], [3.3, np.nan], 23 | [-4, np.nan], [-0.2, -0.1], [0.34, 0.7]] 24 | ) 25 | ) 26 | 27 | 28 | test_data['ind1'] = 'a' 29 | test_data['ind2'] = range(len(test_data)) 30 | test_data = test_data.set_index(['ind1', 'ind2']) 31 | test_data = test_data.fillna(test_data.median()) 32 | 33 | test_ground_truth = pd.Series( 34 | np.random.randint(0, 2, size=(len(test_data), )), 35 | index=test_data.index 36 | ) 37 | 38 | 39 | def test_cluster_one(): 40 | # Test all clusterers are working with default params 41 | for clus_name in variables_to_optimize.keys(): 42 | utilities.cluster(clus_name, test_data) 43 | 44 | # Test with putting extra params in there 45 | for clus_name in variables_to_optimize.keys(): 46 | vars = variables_to_optimize[clus_name] 47 | key = list(vars.keys())[0] 48 | params = {key: vars[key][0]} 49 | # grabbing a variable and making sure var passing works 50 | utilities.cluster(clus_name, test_data, params) 51 | 52 | 53 | def test_autoclusterer(): 54 | for clus_name in variables_to_optimize.keys(): 55 | hypercluster.AutoClusterer(clus_name).fit(test_data) 56 | for clus_name in variables_to_optimize.keys(): 57 | hypercluster.AutoClusterer(clus_name, 
random_search=False).fit(test_data) 58 | 59 | 60 | def test_param_weights(): 61 | for clus_name in variables_to_optimize.keys(): 62 | weights = { 63 | param: {value: (1/len(values)) for value in values} for param, values in 64 | variables_to_optimize[ 65 | clus_name 66 | ].items() 67 | } 68 | hypercluster.AutoClusterer(clus_name, param_weights=weights).fit( 69 | test_data 70 | ) 71 | for clus_name in variables_to_optimize.keys(): 72 | hypercluster.AutoClusterer(clus_name, random_search=False).fit(test_data) 73 | 74 | 75 | def test_passing_kwargs_for_a_clusterer(): 76 | clus_name = 'KMeans' 77 | 78 | hypercluster.AutoClusterer(clus_name, clus_kwargs={'max_iter': 50}).fit( 79 | test_data 80 | ) 81 | 82 | 83 | def test_evaluate_results(): 84 | labs = hypercluster.AutoClusterer('KMeans').fit(test_data).labels_ 85 | for metric in inherent_metrics + need_ground_truth: 86 | utilities.evaluate_one( 87 | labs[labs.columns[0]], metric, data=test_data, gold_standard=test_ground_truth 88 | ) 89 | 90 | 91 | def test_multiauto(): 92 | hypercluster.MultiAutoClusterer().fit(test_data).evaluate() 93 | -------------------------------------------------------------------------------- /build/lib/hypercluster/tests/test_visualize.py: -------------------------------------------------------------------------------- 1 | from hypercluster import visualize 2 | import hypercluster 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | test_data = pd.DataFrame( 8 | np.array( 9 | [[1, 2], [-1.8, 4], [1, -0.5], 10 | [10, 2], [-10, 4], [10, 0], 11 | [np.nan, 5], [3.2, np.nan], [0, 14], 12 | [-16.4, 3.67], [13.22, -3], [3.3, np.nan], 13 | [42, np.nan], [-8, 2], [1.2, 12], 14 | [np.nan, 2.1], [0.25, np.nan], [0.1, 1.11], 15 | [-44, 0], [-0.22, -0.11], [2.34, 6.7], 16 | [-10, np.nan], [-2.3, -2.5], [np.nan, 0], 17 | [np.nan, 22], [8.6, -7.5], [0, 14], 18 | [-6.4, 23.67], [-3.22, 3], [np.nan, np.nan], 19 | [-20, 2.01], [0.25, -.25], [0.455, 0.233], 20 | [np.nan, -0.89], [19, np.nan], [np.nan, np.nan], 21 | [-29, 3.6], [-13, -3], [3.3, np.nan], 22 | [-4, np.nan], [-0.2, -0.1], [0.34, 0.7]] 23 | ) 24 | ) 25 | 26 | 27 | test_data['ind1'] = 'a' 28 | test_data['ind2'] = range(len(test_data)) 29 | test_data = test_data.set_index(['ind1', 'ind2']) 30 | test_data = test_data.fillna(test_data.median()) 31 | 32 | test_ground_truth = pd.Series( 33 | np.random.randint(0, 2, size=(len(test_data), )), 34 | index=test_data.index 35 | ) 36 | 37 | 38 | def test_vis_eval(): 39 | clusterer = hypercluster.MultiAutoClusterer().fit(test_data).evaluate() 40 | visualize.visualize_evaluations(clusterer.evaluation_df) 41 | clusterer.visualize_evaluations( 42 | # savefig=True 43 | ) 44 | visualize.visualize_for_picking_labels( 45 | clusterer.evaluation_df, savefig_prefix='test_visualize_for_picking' 46 | ) 47 | 48 | clusterer = hypercluster.AutoClusterer().fit(test_data).evaluate() 49 | visualize.visualize_evaluations(clusterer.evaluation_df) 50 | clusterer.visualize_evaluations() 51 | 52 | 53 | def test_vis_sample(): 54 | clusterer = hypercluster.MultiAutoClusterer().fit(test_data).evaluate() 55 | visualize.visualize_sample_label_consistency(clusterer.labels_df) 56 | clusterer.visualize_sample_label_consistency() 57 | 58 | clusterer = hypercluster.AutoClusterer().fit(test_data).evaluate() 59 | visualize.visualize_sample_label_consistency(clusterer.labels_df) 60 | clusterer.visualize_sample_label_consistency() 61 | 62 | 63 | def test_vis_labels(): 64 | clusterer = hypercluster.MultiAutoClusterer().fit(test_data).evaluate() 65 | 
visualize.visualize_label_agreement(clusterer.labels_df) 66 | clusterer.visualize_label_agreement( 67 | savefig=True, 68 | ) 69 | 70 | clusterer = hypercluster.AutoClusterer().fit(test_data).evaluate() 71 | visualize.visualize_label_agreement(clusterer.labels_df) 72 | clusterer.visualize_label_agreement() -------------------------------------------------------------------------------- /build/lib/hypercluster/utilities.py: -------------------------------------------------------------------------------- 1 | from sklearn.cluster import * 2 | from sklearn.metrics import * 3 | from .additional_clusterers import * 4 | from .additional_metrics import * 5 | from pandas import DataFrame 6 | import pandas as pd 7 | import numpy as np 8 | import logging 9 | from typing import Optional, Iterable, Dict 10 | from .constants import * 11 | from hypercluster.constants import param_delim, val_delim 12 | 13 | 14 | def calculate_row_weights( 15 | row: Iterable, param_weights: dict, vars_to_optimize: dict 16 | ) -> float: 17 | """Used to select random rows of parameter combinations using individual parameter weights. 18 | 19 | Args: 20 | row (Iterable): Series of parameters, with parameter names as index. 21 | param_weights (dict): Dictionary of str: dictionaries. Ex format - {'parameter_name':{ \ 22 | 'param_option_1':0.5, 'param_option_2':0.5}}. 23 | vars_to_optimize (Iterable): Dictionary with possibilities for different parameters. Ex \ 24 | format - {'parameter_name':[1, 2, 3, 4, 5]}. 25 | 26 | Returns (float): 27 | Float representing the probability of seeing that combination of parameters, given their \ 28 | individual weights. 29 | 30 | """ 31 | param_weights.update({ 32 | param: { 33 | val: param_weights.get(param, {}).get( 34 | val, (1-sum(param_weights.get(param, {}).values()))/len([ 35 | notweighted for notweighted in vars_to_optimize.get(param, {}) 36 | if notweighted not in param_weights.get(param, {}).keys() 37 | ]) 38 | ) for val in vals 39 | } for param, vals in vars_to_optimize.items() 40 | }) 41 | 42 | return np.prod([param_weights[param][val] for param, val in row.to_dict().items()]) 43 | 44 | 45 | def cluster(clusterer_name: str, data: DataFrame, params: dict = {}): 46 | """Runs a given clusterer with a given set of parameters. 47 | 48 | Args: 49 | clusterer_name (str): String name of clusterer. 50 | data (DataFrame): Dataframe with elements to cluster as index and examples as columns. 51 | params (dict): Dictionary of parameter names and values to feed into clusterer. Default {} 52 | 53 | Returns: 54 | Instance of the clusterer fit with the data provided. 55 | """ 56 | clusterer = eval(clusterer_name)(**params) 57 | return clusterer.fit(data) 58 | 59 | 60 | def evaluate_one( 61 | labels: Iterable, 62 | method: str = "silhouette_score", 63 | data: Optional[DataFrame] = None, 64 | gold_standard: Optional[Iterable] = None, 65 | metric_kwargs: Optional[dict] = None, 66 | ) -> dict: 67 | """Uses a given metric to evaluate clustering results. 68 | 69 | Args: 70 | labels (Iterable): Series of labels. 71 | method (str): Str of name of evaluation to use. Default is silhouette. 72 | data (DataFrame): If using an inherent metric, must provide DataFrame with which to \ 73 | calculate the metric. 74 | gold_standard (Iterable): If using a metric that compares to ground truth, must provide a \ 75 | set of gold standard labels. 76 | metric_kwargs (dict): Additional kwargs to use in evaluation. 
77 | 78 | Returns (float): 79 | Metric value 80 | """ 81 | if isinstance(labels, pd.Series) is False: 82 | labels = pd.Series(labels) 83 | if len(labels[labels != -1].unique()) < 2: 84 | return np.nan 85 | 86 | if metric_kwargs is None: 87 | metric_kwargs = {} 88 | 89 | if method in need_ground_truth: 90 | if gold_standard is None: 91 | raise ValueError( 92 | "Chosen evaluation metric %s requires gold standard set." % method 93 | ) 94 | clustered = (gold_standard != -1) & (labels != -1) 95 | compare_to = gold_standard[clustered] 96 | 97 | elif method in inherent_metrics: 98 | if data is None: 99 | raise ValueError( 100 | "Chosen evaluation metric %s requires data input." % method 101 | ) 102 | clustered = labels != -1 103 | compare_to = data.loc[clustered] 104 | else: 105 | compare_to = None 106 | clustered = labels.index 107 | 108 | return eval(method)(compare_to, labels[clustered], **metric_kwargs) 109 | 110 | 111 | def generate_flattened_df(df_dict: Dict[str, DataFrame]) -> DataFrame: 112 | """Takes dictionary of results from many clusterers and makes 1 DataFrame. Opposite of \ 113 | convert_to_multiind. 114 | 115 | Args: 116 | df_dict (Dict[str, DataFrame]): Dictionary of dataframes to flatten. Can be .labels_ or \ 117 | .evaluations_ from MultiAutoClusterer. 118 | 119 | Returns: 120 | Flattened DataFrame with all data. 121 | """ 122 | merged_df = pd.DataFrame() 123 | for clus_name, df in df_dict.items(): 124 | df = df.transpose() 125 | cols_for_labels = df.index.to_frame() 126 | inds = cols_for_labels.apply( 127 | lambda row: param_delim.join( 128 | [clus_name] + ["%s%s%s" % (k, val_delim, v) for k, v in row.to_dict().items()] 129 | ), 130 | axis=1, 131 | ) 132 | df.index = inds 133 | df = df.transpose() 134 | 135 | merged_df = pd.concat( 136 | [merged_df, df], join="outer", axis=1 137 | ) 138 | return merged_df 139 | 140 | 141 | def convert_to_multiind(key: str, df: DataFrame) -> DataFrame: 142 | """Takes columns from a single clusterer from Clusterer.labels_df or .evaluation_df and 143 | converts to a multiindexed rather than collapsed into string. Equivalent to grabbing 144 | Clusterer.labels[clusterer] or .evaluations[clusterer]. Opposite of generate_flattened_df. 145 | 146 | Args: 147 | key (str): Name of clusterer, must match beginning of columns to convert. 148 | df (DataFrame): Dataframe to grab chunk from. 149 | 150 | Returns: 151 | Subset DataFrame with multiindex. 152 | 153 | """ 154 | clus_cols = [col for col in df.columns if col.split(param_delim, 1)[0] == key] 155 | temp = df[clus_cols].transpose() 156 | temp.index = pd.MultiIndex.from_frame( 157 | pd.DataFrame([{ 158 | s.split(val_delim, 1)[0]: s.split(val_delim, 1)[1] for s in i.split(param_delim)[1:] 159 | } for i in temp.index]).astype(float, errors='ignore') 160 | ) 161 | return temp.sort_index().transpose() 162 | 163 | 164 | def pick_best_labels( 165 | evaluation_results_df: DataFrame, 166 | clustering_labels_df: DataFrame, 167 | method: Optional[str] = None, 168 | min_or_max: Optional[str] = None 169 | ) -> Iterable: 170 | """From evaluations and a metric to minimize or maximize, return all labels with top pick. 171 | 172 | Args: 173 | evaluation_results_df (DataFrame): Evaluations DataFrame from optimize_clustering. 174 | clustering_labels_df (DataFrame): Labels DataFrame from optimize_clustering. 175 | method (str): Method with which to choose the best labels. 176 | min_or_max (str): Whether to minimize or maximize the metric. Must be 'min' or 'max'. 177 | Returns (DataFrame): 178 | DataFrame of all top labels. 
179 | """ 180 | if method is None: 181 | method = "silhouette_score" 182 | if min_or_max is None: 183 | min_or_max = 'max' 184 | 185 | best_labels = evaluation_results_df.loc[method, :] 186 | if min_or_max == 'min': 187 | best_labels = best_labels.index[best_labels == best_labels.min()] 188 | return clustering_labels_df[best_labels] 189 | elif min_or_max == 'max': 190 | best_labels = best_labels.index[best_labels == best_labels.max()] 191 | return clustering_labels_df[best_labels] 192 | logging.error('min_or_max must be either min or max, %s invalid choice' % min_or_max) 193 | 194 | 195 | -------------------------------------------------------------------------------- /build/lib/hypercluster/visualize.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | import logging 3 | from collections import Counter 4 | from itertools import cycle 5 | import numpy as np 6 | import matplotlib 7 | import matplotlib.pyplot as plt 8 | import seaborn as sns 9 | from pandas import DataFrame 10 | from scipy.cluster import hierarchy 11 | from scipy.spatial.distance import pdist 12 | from hypercluster.constants import param_delim 13 | from hypercluster.utilities import convert_to_multiind, evaluate_one 14 | 15 | matplotlib.rcParams["pdf.fonttype"] = 42 16 | matplotlib.rcParams["ps.fonttype"] = 42 17 | sns.set(font="arial", style="white", color_codes=True, font_scale=1.3) 18 | matplotlib.rcParams.update({"savefig.bbox": "tight"}) 19 | cmap = sns.cubehelix_palette( 20 | start=0, 21 | rot=0.4, 22 | gamma=1.0, 23 | hue=0.82, 24 | light=1, 25 | dark=0, 26 | reverse=False, 27 | as_cmap=True 28 | ) 29 | cmap.set_over('black') 30 | cmap.set_under('white') 31 | cmap.set_bad("#DAE0E6") 32 | 33 | 34 | def zscore(df): 35 | """Row zscores a DataFrame, ignores np.nan 36 | 37 | Args: 38 | df (DataFrame): DataFrame to z-score 39 | 40 | Returns (DataFrame): 41 | Row-zscored DataFrame. 42 | """ 43 | return df.subtract(df.mean(axis=1), axis=0).divide(df.std(axis=1), axis=0) 44 | 45 | 46 | def compute_order( 47 | df, 48 | dist_method: str = "euclidean", 49 | cluster_method: str = "average" 50 | ): 51 | """Gives hierarchical clustering order for the rows of a DataFrame 52 | 53 | Args: 54 | df (DataFrame): DataFrame with rows to order. 55 | dist_method (str): Distance method to pass to scipy.cluster.hierarchy.linkage. 56 | cluster_method (str): Clustering method to pass to scipy.spatial.distance.pdist. 57 | 58 | Returns (pandas.Index): 59 | Ordered row index. 60 | 61 | """ 62 | dist_mat = pdist(df, metric=dist_method) 63 | link_mat = hierarchy.linkage(dist_mat, method=cluster_method) 64 | 65 | return df.index[hierarchy.leaves_list(hierarchy.optimal_leaf_ordering(link_mat, dist_mat))] 66 | 67 | 68 | def visualize_evaluations( 69 | evaluations_df: DataFrame, 70 | savefig: bool = False, 71 | output_prefix: str = "evaluations", 72 | **heatmap_kws 73 | ) -> List[matplotlib.axes.Axes]: 74 | """Makes a z-scored visualization of all evaluations. 75 | 76 | Args: 77 | evaluations_df (DataFrame): Evaluations dataframe from clustering.optimize_clustering 78 | output_prefix (str): If saving a figure, file prefix to use. 79 | savefig (bool): Whether to save a pdf 80 | **heatmap_kws: Additional keyword arguments to pass to seaborn.heatmap. 81 | 82 | Returns (List[matplotlib.axes.Axes]): 83 | List of all matplotlib axes. 
84 | 85 | """ 86 | clusterers = sorted( 87 | list(set([i.split(param_delim, 1)[0] for i in evaluations_df.columns])) 88 | ) 89 | width_ratios = [ 90 | dict( 91 | Counter( 92 | [i.split(param_delim, 1)[0] for i in evaluations_df.columns] 93 | ) 94 | )[clus] 95 | for clus in clusterers 96 | ] 97 | 98 | evaluations_df = zscore(evaluations_df) 99 | width = 0.18 * (len(evaluations_df.columns) + 2 + (0.01 * (len(clusterers) - 1))) 100 | height = 0.22 * (len(evaluations_df)) 101 | 102 | fig, axs = plt.subplots( 103 | figsize=(width, height), 104 | nrows=1, 105 | ncols=(len(clusterers) + 1), 106 | gridspec_kw=dict( 107 | width_ratios=width_ratios + [2], 108 | wspace=0.01, 109 | left=0, 110 | right=1, 111 | top=1, 112 | bottom=0, 113 | ), 114 | ) 115 | vmin = np.nanquantile(evaluations_df, 0.1) 116 | vmax = np.nanquantile(evaluations_df, 0.9) 117 | 118 | heatmap_kws['cmap'] = heatmap_kws.get('cmap', cmap) 119 | heatmap_kws['vmin'] = heatmap_kws.get('vmin', vmin) 120 | heatmap_kws['vmax'] = heatmap_kws.get('vmax', vmax) 121 | 122 | for i, clus in enumerate(clusterers): 123 | temp = convert_to_multiind(clus, evaluations_df) 124 | 125 | ax = axs[i] 126 | sns.heatmap( 127 | temp, 128 | ax=ax, 129 | yticklabels=temp.index, 130 | xticklabels=["-".join([str(i) for i in col]) for col in temp.columns], 131 | cbar_ax=axs[-1], 132 | cbar_kws=dict(label="z-score"), 133 | **heatmap_kws 134 | ) 135 | ax.set_ylabel("") 136 | ax.set_title(clus) 137 | ax.set_yticklabels([]) 138 | 139 | axs[0].set_ylabel("evaluation method") 140 | axs[0].set_yticklabels(temp.index, rotation=0) 141 | if savefig: 142 | plt.savefig("%s.pdf" % output_prefix) 143 | return axs 144 | 145 | 146 | def visualize_pairwise( 147 | df: DataFrame, 148 | savefig: bool = False, 149 | output_prefix: Optional[str] = None, 150 | method: Optional[str] = None, 151 | **heatmap_kws 152 | ) -> List[matplotlib.axes.Axes]: 153 | """Visualize symmetrical square DataFrames. 154 | 155 | Args: 156 | df (DataFrame): DataFrame to visualize. 157 | savefig (bool): Whether to save a pdf. 158 | output_prefix (str): If saving a pdf, file prefix to use. 159 | method (str): Label for cbar, if relevant. 160 | **heatmap_kws: Additional keywords to pass to `seaborn.heatmap`_ 161 | 162 | Returns (List[matplotlib.axes.Axes]): 163 | List of matplotlib axes for figure. 164 | 165 | .. 
_seaborn.heatmap: 166 | https://seaborn.pydata.org/generated/seaborn.heatmap.html 167 | """ 168 | heatmap_kws = {**heatmap_kws} 169 | 170 | vmin = np.nanquantile(df, 0.1) 171 | vmax = np.nanquantile(df, 0.9) 172 | 173 | heatmap_kws['cmap'] = heatmap_kws.get('cmap', cmap) 174 | heatmap_kws['vmin'] = heatmap_kws.get('vmin', vmin) 175 | heatmap_kws['vmax'] = heatmap_kws.get('vmax', vmax) 176 | cbar_kws = heatmap_kws.get('cbar_kws', {}) 177 | cbar_kws['label'] = cbar_kws.get('label', method) 178 | heatmap_kws['cbar_kws'] = cbar_kws 179 | 180 | cbar_ratio = 2 181 | wspace = 0.01 182 | height = 0.18 * len(df) 183 | width = 0.18 * (len(df.columns)+cbar_ratio+wspace) 184 | fig, axs = plt.subplots( 185 | figsize=(width, height), 186 | nrows=1, 187 | ncols=2, 188 | gridspec_kw=dict( 189 | width_ratios=[len(df.columns), cbar_ratio], 190 | wspace=wspace, 191 | left=0, 192 | right=1, 193 | top=1, 194 | bottom=0, 195 | ) 196 | ) 197 | try: 198 | order = compute_order(df.fillna(df.median())) 199 | except ValueError: 200 | order = df.index 201 | df = df.loc[order, order] 202 | sns.heatmap( 203 | df, 204 | xticklabels=order, 205 | yticklabels=order, 206 | ax=axs[0], 207 | cbar_ax=axs[1], 208 | **heatmap_kws 209 | ) 210 | if savefig: 211 | if output_prefix is None: 212 | output_prefix = "heatmap.pairwise" 213 | plt.savefig('%s.pdf' % output_prefix) 214 | 215 | return axs 216 | 217 | 218 | def visualize_label_agreement( 219 | labels: DataFrame, 220 | method: Optional[str] = None, 221 | savefig: bool = False, 222 | output_prefix: Optional[str] = None, 223 | **heatmap_kws 224 | ) -> List[matplotlib.axes.Axes]: 225 | """Visualize similarity between clustering results given an evaluation metric. 226 | 227 | Args: 228 | labels (DataFrame): Labels DataFrame, e.g. from optimize_clustering or \ 229 | AutoClusterer.labels_ 230 | method (str): Method with which to compare labels. Must be a metric like the ones in \ 231 | constants.need_ground_truth, which takes two sets of labels. 232 | savefig (bool): Whether to save a pdf. 233 | output_prefix (str): If saving a pdf, file prefix to use. 234 | **heatmap_kws: Additional keywords to pass to `seaborn.heatmap`_ 235 | 236 | Returns (List[matplotlib.axes.Axes]): 237 | List of matplotlib axes 238 | 239 | .. _seaborn.heatmap: 240 | https://seaborn.pydata.org/generated/seaborn.heatmap.html 241 | """ 242 | if savefig and output_prefix is None: 243 | output_prefix = 'heatmap.labels.pairwise' 244 | if method is None: 245 | method = 'adjusted_rand_score' 246 | 247 | labels = labels.corr( 248 | lambda x, y: evaluate_one(x, method=method, gold_standard=y) 249 | ) 250 | return visualize_pairwise(labels, savefig, output_prefix, method=method, **heatmap_kws) 251 | 252 | 253 | def visualize_sample_label_consistency( 254 | labels: DataFrame, 255 | savefig: bool = False, 256 | output_prefix: Optional[str] = None, 257 | **heatmap_kws 258 | ) -> List[matplotlib.axes.Axes]: 259 | """Visualize how often two samples are labeled in the same group across conditions. Interpret 260 | with care--if you use more conditions for some type of clusterers, e.g. more n_clusters for 261 | KMeans, those cluster more similarly across conditions than between clusterers. This means 262 | that more agreement in labeling could be due to the choice of clusterers rather than true 263 | similarity between samples. 264 | 265 | Args: 266 | labels (DataFrame): Labels DataFrame, e.g. from optimize_clustering or \ 267 | AutoClusterer.labels_ 268 | savefig (bool): Whether to save a pdf. 
269 |         output_prefix (str): If saving a pdf, file prefix to use. 
270 |         **heatmap_kws: Additional keywords to pass to `seaborn.heatmap`_ 
271 | 
272 |     Returns (List[matplotlib.axes.Axes]): 
273 |         List of matplotlib axes 
274 | 
275 |     .. _seaborn.heatmap: 
276 |         https://seaborn.pydata.org/generated/seaborn.heatmap.html 
277 | 
278 |     """ 
279 |     if savefig and output_prefix is None: 
280 |         output_prefix = "heatmap.sample.pairwise" 
281 |     labels = labels.transpose().corr(lambda x, y: sum( 
282 |         np.equal(x[((x != -1) | (y != -1))], y[((x != -1) | (y != -1))]) 
283 |     )) 
284 |     return visualize_pairwise(labels, savefig, output_prefix, method='# same label', **heatmap_kws) 
285 | 
286 | 
287 | def visualize_for_picking_labels( 
288 |     evaluation_df: DataFrame, 
289 |     method: Optional[str] = None, 
290 |     savefig_prefix: Optional[str] = None 
291 | ): 
292 |     """Generates graphs similar to a `scree graph`_ for PCA for each parameter and each clusterer. 
293 | 
294 |     Args: 
295 |         evaluation_df (DataFrame): DataFrame of evaluations to visualize. Clusterer.evaluation_df. 
296 |         method (str): Which metric to visualize. 
297 |         savefig_prefix (str): If not None, save a figure with the given prefix. 
298 | 
299 |     Returns: 
300 |         matplotlib axes. 
301 |     .. _scree graph: 
302 |         https://en.wikipedia.org/wiki/Scree_plot 
303 |     """ 
304 |     if method is None: 
305 |         method = "silhouette_score" 
306 |     cluss_temp = list(set([i.split(param_delim, 1)[0] for i in evaluation_df.columns])) 
307 |     # get figure dimensions 
308 |     ncols = 0 
309 |     cluss = [] 
310 |     for ploti, clus in enumerate(cluss_temp): 
311 |         scores = convert_to_multiind( 
312 |             clus, evaluation_df.loc[[method], :] 
313 |         ).transpose().dropna(how='any') 
314 |         if len(scores) == 0: 
315 |             logging.error( 
316 |                 'Score %s is missing for clusterer %s, skipping visualization' % (method, clus) 
317 |             ) 
318 |             continue 
319 |         indep = scores.index.to_frame().reset_index(drop=True) 
320 |         try: 
321 |             indep.astype(float)  # probe: raises if any parameter is non-numeric 
322 |         except (ValueError, AssertionError): 
323 |             logging.error('Cannot convert %s data to floats, skipping visualization' % clus) 
324 |             continue 
325 |         cluss.append(clus) 
326 |         if scores.index.nlevels > ncols: 
327 |             ncols = scores.index.nlevels 
328 |     if not cluss: 
329 |         logging.error('No valid clusterers, cannot visualize.') 
330 |         return None 
331 |     cluss.sort() 
332 | 
333 |     ybuff = np.abs(np.nanquantile(evaluation_df.loc[method], 0.05)) 
334 |     ylim = (evaluation_df.loc[method].min() - ybuff, evaluation_df.loc[method].max() + ybuff) 
335 |     colors = cycle(sns.color_palette('twilight', n_colors=len(cluss) * ncols)) 
336 |     fig = plt.figure(figsize=(5 * ncols, 5 * len(cluss))) 
337 |     gs = plt.GridSpec(nrows=len(cluss), ncols=ncols, wspace=0.25, hspace=0.25) 
338 |     for ploti, clus in enumerate(cluss): 
339 |         scores = convert_to_multiind( 
340 |             clus, evaluation_df.loc[[method], :] 
341 |         ).transpose().dropna(how='any') 
342 |         indep = scores.index.to_frame().reset_index(drop=True) 
343 | 
344 |         for whcol, col in enumerate(indep.columns): 
345 |             if whcol == 0: 
346 |                 saveax = plt.subplot(gs[ploti, whcol]) 
347 |                 ax = saveax 
348 |                 ax.set_ylim(ylim) 
349 |                 ax.set_ylabel(clus) 
350 |             else: 
351 |                 ax = plt.subplot(gs[ploti, whcol], sharey=saveax) 
352 |             color = next(colors) 
353 | 
354 |             # plot eval results 
355 |             sns.regplot( 
356 |                 x=indep[col], 
357 |                 y=scores[method].values, 
358 |                 color=color, 
359 |                 ax=ax, 
360 |                 lowess=True,  # scores are continuous and unbounded, so a lowess trend is fit rather than a logistic curve 
361 |             ) 
362 | 
363 |     axs = fig.get_axes() 
364 |     axs[0].set_title('%s results per parameter' % method, ha='left') 
365 |     if savefig_prefix: 
366 |         plt.savefig('%s.pdf' % savefig_prefix) 
367 |     return axs 
-------------------------------------------------------------------------------- 
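A quick usage sketch for the module above (illustrative only; it assumes `data` is a
samples-by-features DataFrame, and mirrors the calls exercised in the tests below):

    import hypercluster
    from hypercluster import visualize

    clusterer = hypercluster.MultiAutoClusterer().fit(data).evaluate()
    visualize.visualize_evaluations(clusterer.evaluation_df)         # z-scored heatmap of every metric
    visualize.visualize_label_agreement(clusterer.labels_df)         # pairwise similarity of labeling results
    visualize.visualize_for_picking_labels(clusterer.evaluation_df)  # scree-like plots per hyperparameter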
/build/lib/tests/__init__.py: 
-------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/build/lib/tests/__init__.py 
-------------------------------------------------------------------------------- 
/build/lib/tests/test_clustering.py: 
-------------------------------------------------------------------------------- 
1 | from hypercluster import utilities 
2 | from hypercluster.constants import * 
3 | import hypercluster 
4 | import pandas as pd 
5 | import numpy as np 
6 | 
7 | 
8 | test_data = pd.DataFrame( 
9 |     np.array( 
10 |         [[1, 2], [-1.8, 4], [1, -0.5], 
11 |          [10, 2], [-10, 4], [10, 0], 
12 |          [np.nan, 5], [3.2, np.nan], [0, 14], 
13 |          [-16.4, 3.67], [13.22, -3], [3.3, np.nan], 
14 |          [42, np.nan], [-8, 2], [1.2, 12], 
15 |          [np.nan, 2.1], [0.25, np.nan], [0.1, 1.11], 
16 |          [-44, 0], [-0.22, -0.11], [2.34, 6.7], 
17 |          [-10, np.nan], [-2.3, -2.5], [np.nan, 0], 
18 |          [np.nan, 22], [8.6, -7.5], [0, 14], 
19 |          [-6.4, 23.67], [-3.22, 3], [np.nan, np.nan], 
20 |          [-20, 2.01], [0.25, -.25], [0.455, 0.233], 
21 |          [np.nan, -0.89], [19, np.nan], [np.nan, np.nan], 
22 |          [-29, 3.6], [-13, -3], [3.3, np.nan], 
23 |          [-4, np.nan], [-0.2, -0.1], [0.34, 0.7]] 
24 |     ) 
25 | ) 
26 | 
27 | 
28 | test_data['ind1'] = 'a' 
29 | test_data['ind2'] = range(len(test_data)) 
30 | test_data = test_data.set_index(['ind1', 'ind2']) 
31 | test_data = test_data.fillna(test_data.median()) 
32 | 
33 | test_ground_truth = pd.Series( 
34 |     np.random.randint(0, 2, size=(len(test_data), )), 
35 |     index=test_data.index 
36 | ) 
37 | 
38 | 
39 | def test_cluster_one(): 
40 |     # Test all clusterers are working with default params 
41 |     for clus_name in variables_to_optimize.keys(): 
42 |         utilities.cluster(clus_name, test_data) 
43 | 
44 |     # Test with putting extra params in there 
45 |     for clus_name in variables_to_optimize.keys(): 
46 |         opts = variables_to_optimize[clus_name] 
47 |         key = list(opts.keys())[0] 
48 |         params = {key: opts[key][0]} 
49 |         # grabbing a variable and making sure var passing works 
50 |         utilities.cluster(clus_name, test_data, params) 
51 | 
52 | 
53 | def test_autoclusterer(): 
54 |     for 
clus_name in variables_to_optimize.keys(): 55 | hypercluster.AutoClusterer(clus_name).fit(test_data) 56 | for clus_name in variables_to_optimize.keys(): 57 | hypercluster.AutoClusterer(clus_name, random_search=False).fit(test_data) 58 | 59 | 60 | def test_param_weights(): 61 | for clus_name in variables_to_optimize.keys(): 62 | weights = { 63 | param: {value: (1/len(values)) for value in values} for param, values in 64 | variables_to_optimize[ 65 | clus_name 66 | ].items() 67 | } 68 | hypercluster.AutoClusterer(clus_name, param_weights=weights).fit( 69 | test_data 70 | ) 71 | for clus_name in variables_to_optimize.keys(): 72 | hypercluster.AutoClusterer(clus_name, random_search=False).fit(test_data) 73 | 74 | 75 | def test_passing_kwargs_for_a_clusterer(): 76 | clus_name = 'KMeans' 77 | 78 | hypercluster.AutoClusterer(clus_name, clus_kwargs={'max_iter': 50}).fit( 79 | test_data 80 | ) 81 | 82 | 83 | def test_evaluate_results(): 84 | labs = hypercluster.AutoClusterer('KMeans').fit(test_data).labels_ 85 | for metric in inherent_metrics + need_ground_truth: 86 | utilities.evaluate_one( 87 | labs[labs.columns[0]], metric, data=test_data, gold_standard=test_ground_truth 88 | ) 89 | 90 | 91 | def test_multiauto(): 92 | hypercluster.MultiAutoClusterer().fit(test_data).evaluate() 93 | -------------------------------------------------------------------------------- /build/lib/tests/test_snakemake.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | def test_run_snakemake_all(): 4 | # subprocess.run( 5 | # ['touch', 'test_input.txt'] 6 | # ) 7 | subprocess.run( 8 | ['snakemake', '-s', 'hypercluster.smk', '--config', 'input_data_files=test_input'] 9 | ) -------------------------------------------------------------------------------- /build/lib/tests/test_visualize.py: -------------------------------------------------------------------------------- 1 | from hypercluster import visualize 2 | import hypercluster 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | test_data = pd.DataFrame( 8 | np.array( 9 | [[1, 2], [-1.8, 4], [1, -0.5], 10 | [10, 2], [-10, 4], [10, 0], 11 | [np.nan, 5], [3.2, np.nan], [0, 14], 12 | [-16.4, 3.67], [13.22, -3], [3.3, np.nan], 13 | [42, np.nan], [-8, 2], [1.2, 12], 14 | [np.nan, 2.1], [0.25, np.nan], [0.1, 1.11], 15 | [-44, 0], [-0.22, -0.11], [2.34, 6.7], 16 | [-10, np.nan], [-2.3, -2.5], [np.nan, 0], 17 | [np.nan, 22], [8.6, -7.5], [0, 14], 18 | [-6.4, 23.67], [-3.22, 3], [np.nan, np.nan], 19 | [-20, 2.01], [0.25, -.25], [0.455, 0.233], 20 | [np.nan, -0.89], [19, np.nan], [np.nan, np.nan], 21 | [-29, 3.6], [-13, -3], [3.3, np.nan], 22 | [-4, np.nan], [-0.2, -0.1], [0.34, 0.7]] 23 | ) 24 | ) 25 | 26 | 27 | test_data['ind1'] = 'a' 28 | test_data['ind2'] = range(len(test_data)) 29 | test_data = test_data.set_index(['ind1', 'ind2']) 30 | test_data = test_data.fillna(test_data.median()) 31 | 32 | test_ground_truth = pd.Series( 33 | np.random.randint(0, 2, size=(len(test_data), )), 34 | index=test_data.index 35 | ) 36 | 37 | 38 | def test_vis_eval(): 39 | clusterer = hypercluster.MultiAutoClusterer().fit(test_data).evaluate() 40 | visualize.visualize_evaluations(clusterer.evaluation_df) 41 | clusterer.visualize_evaluations() 42 | # 43 | clusterer = hypercluster.AutoClusterer().fit(test_data).evaluate() 44 | visualize.visualize_evaluations(clusterer.evaluation_df) 45 | clusterer.visualize_evaluations() 46 | 47 | 48 | def test_vis_sample(): 49 | clusterer = 
hypercluster.MultiAutoClusterer().fit(test_data).evaluate() 50 | visualize.visualize_sample_label_consistency(clusterer.labels_df) 51 | clusterer.visualize_sample_label_consistency() 52 | 53 | clusterer = hypercluster.AutoClusterer().fit(test_data).evaluate() 54 | visualize.visualize_sample_label_consistency(clusterer.labels_df) 55 | clusterer.visualize_sample_label_consistency() 56 | 57 | 58 | def test_vis_labels(): 59 | clusterer = hypercluster.MultiAutoClusterer().fit(test_data).evaluate() 60 | visualize.visualize_label_agreement(clusterer.labels_df) 61 | clusterer.visualize_label_agreement( 62 | # savefig=True, 63 | # output_prefix='test_input/test_label_agreement_multi' 64 | ) 65 | 66 | clusterer = hypercluster.AutoClusterer().fit(test_data).evaluate() 67 | visualize.visualize_label_agreement(clusterer.labels_df) 68 | clusterer.visualize_label_agreement( 69 | # savefig=True, 70 | # output_prefix='test_input/test_label_agreement_auto' 71 | ) 72 | -------------------------------------------------------------------------------- /dist/hypercluster-0.0.1-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.0.1-py3-none-any.whl -------------------------------------------------------------------------------- /dist/hypercluster-0.0.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.0.1.tar.gz -------------------------------------------------------------------------------- /dist/hypercluster-0.1.0-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.0-py3-none-any.whl -------------------------------------------------------------------------------- /dist/hypercluster-0.1.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.0.tar.gz -------------------------------------------------------------------------------- /dist/hypercluster-0.1.1-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.1-py3-none-any.whl -------------------------------------------------------------------------------- /dist/hypercluster-0.1.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.1.tar.gz -------------------------------------------------------------------------------- /dist/hypercluster-0.1.10-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.10-py3-none-any.whl -------------------------------------------------------------------------------- /dist/hypercluster-0.1.10.tar.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.10.tar.gz -------------------------------------------------------------------------------- /dist/hypercluster-0.1.12-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.12-py3-none-any.whl -------------------------------------------------------------------------------- /dist/hypercluster-0.1.12.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.12.tar.gz -------------------------------------------------------------------------------- /dist/hypercluster-0.1.13-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.13-py3-none-any.whl -------------------------------------------------------------------------------- /dist/hypercluster-0.1.13.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.13.tar.gz -------------------------------------------------------------------------------- /dist/hypercluster-0.1.2-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.2-py3-none-any.whl -------------------------------------------------------------------------------- /dist/hypercluster-0.1.2.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.2.tar.gz -------------------------------------------------------------------------------- /dist/hypercluster-0.1.3-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.3-py3-none-any.whl -------------------------------------------------------------------------------- /dist/hypercluster-0.1.3.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.3.tar.gz -------------------------------------------------------------------------------- /dist/hypercluster-0.1.5-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.5-py3-none-any.whl -------------------------------------------------------------------------------- /dist/hypercluster-0.1.5.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.5.tar.gz -------------------------------------------------------------------------------- 
/dist/hypercluster-0.1.6-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.6-py3-none-any.whl -------------------------------------------------------------------------------- /dist/hypercluster-0.1.6.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.6.tar.gz -------------------------------------------------------------------------------- /dist/hypercluster-0.1.7-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.7-py3-none-any.whl -------------------------------------------------------------------------------- /dist/hypercluster-0.1.7.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.7.tar.gz -------------------------------------------------------------------------------- /dist/hypercluster-0.1.8-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.8-py3-none-any.whl -------------------------------------------------------------------------------- /dist/hypercluster-0.1.8.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.8.tar.gz -------------------------------------------------------------------------------- /dist/hypercluster-0.1.9-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.9-py3-none-any.whl -------------------------------------------------------------------------------- /dist/hypercluster-0.1.9.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.9.tar.gz -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = . 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | import sys 15 | sys.path.insert(0, '../.') 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'Hypercluster' 21 | copyright = '2019, Ruggleslab' 22 | author = 'Ruggleslab' 23 | 24 | # The short X.Y version 25 | version = '0.0.2' 26 | 27 | # The full version, including alpha/beta/rc tags 28 | release = '0.0.2' 29 | 30 | 31 | # -- General configuration --------------------------------------------------- 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 35 | # ones. 36 | extensions = [ 37 | 'sphinx.ext.autodoc', 38 | 'sphinx.ext.viewcode', 39 | 'sphinx.ext.todo', 40 | 'sphinx.ext.napoleon' 41 | ] 42 | 43 | # Add any paths that contain templates here, relative to this directory. 44 | templates_path = ['_templates'] 45 | 46 | # The language for content autogenerated by Sphinx. Refer to documentation 47 | # for a list of supported languages. 48 | # 49 | # This is also used if you do content translation via gettext catalogs. 50 | # Usually you set "language" from the command line for these cases. 51 | language = 'en' 52 | 53 | # List of patterns, relative to source directory, that match files and 54 | # directories to ignore when looking for source files. 55 | # This pattern also affects html_static_path and html_extra_path. 56 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 57 | 58 | 59 | # -- Options for HTML output ------------------------------------------------- 60 | 61 | # The theme to use for HTML and HTML Help pages. See the documentation for 62 | # a list of builtin themes. 63 | # 64 | html_theme = 'sphinx_rtd_theme' 65 | html_theme_options = { 66 | 'navigation_depth': 4 67 | } 68 | 69 | # Add any paths that contain custom static files (such as style sheets) here, 70 | # relative to this directory. They are copied after the builtin static files, 71 | # so a file named "default.css" will overwrite the builtin "default.css". 72 | html_static_path = ['_static'] 73 | 74 | 75 | # -- Extension configuration ------------------------------------------------- 76 | 77 | # -- Options for todo extension ---------------------------------------------- 78 | 79 | # If true, `todo` and `todoList` produce output, else they produce nothing. 80 | todo_include_todos = False 81 | -------------------------------------------------------------------------------- /docs/hypercluster.rst: -------------------------------------------------------------------------------- 1 | hypercluster package 2 | ==================== 3 | 4 | .. 
automodule:: hypercluster 
5 |     :members: 
6 |     :undoc-members: 
7 |     :show-inheritance: 
8 | 
9 | hypercluster.classes module 
10 | --------------------------- 
11 | 
12 | .. automodule:: hypercluster.classes 
13 |     :members: 
14 |     :undoc-members: 
15 |     :show-inheritance: 
16 | 
17 | hypercluster.utilities module 
18 | ----------------------------- 
19 | 
20 | .. automodule:: hypercluster.utilities 
21 |     :members: 
22 |     :undoc-members: 
23 |     :show-inheritance: 
24 | 
25 | hypercluster.visualize module 
26 | ----------------------------- 
27 | 
28 | .. automodule:: hypercluster.visualize 
29 |     :members: 
30 |     :undoc-members: 
31 |     :show-inheritance: 
32 | 
33 | hypercluster.constants module 
34 | ----------------------------- 
35 | 
36 | .. automodule:: hypercluster.constants 
37 |     :members: 
38 |     :undoc-members: 
39 |     :show-inheritance: 
40 | 
41 | hypercluster.additional\_clusterers module 
42 | ------------------------------------------ 
43 | 
44 | .. automodule:: hypercluster.additional_clusterers 
45 |     :members: 
46 |     :undoc-members: 
47 |     :show-inheritance: 
48 | 
49 | hypercluster.additional\_metrics module 
50 | --------------------------------------- 
51 | 
52 | .. automodule:: hypercluster.additional_metrics 
53 |     :members: 
54 |     :undoc-members: 
55 |     :show-inheritance: 
56 | 
-------------------------------------------------------------------------------- 
/docs/index.rst: 
-------------------------------------------------------------------------------- 
1 | .. Hypercluster documentation master file, created by 
2 |    sphinx-quickstart on Mon Dec 30 16:45:25 2019. 
3 |    You can adapt this file completely to your liking, but it should at least 
4 |    contain the root `toctree` directive. 
5 | 
6 | Documentation for hypercluster 
7 | ============================== 
8 | 
9 | .. toctree:: 
10 |    :hidden: 
11 | 
12 |    self 
13 | 
14 | .. toctree:: 
15 |    :maxdepth: 4 
16 |    :caption: Contents: 
17 | 
18 |    hypercluster 
19 |    snakemake 
20 | 
21 | 
22 | Indices and tables 
23 | ================== 
24 | 
25 | * :ref:`genindex` 
26 | * :ref:`modindex` 
27 | * :ref:`search` 
28 | 
29 | 
30 | Installation and logistics 
31 | ========================== 
32 | 
33 | ************ 
34 | Installation 
35 | ************ 
36 | 
37 | Available via pip:: 
38 | 
39 |    pip install hypercluster 
40 | 
41 | Or bioconda:: 
42 | 
43 |    conda install hypercluster 
44 |    # or 
45 |    conda install -c conda-forge -c bioconda hypercluster 
46 | 
47 | If you are having problems installing with conda, try changing your channel priority. Priority of 
48 | conda-forge > bioconda > defaults is recommended. 
49 | 
50 | To check channel priority: :code:`conda config --get channels` 
51 | 
52 | 
53 | It should look like:: 
54 | 
55 |    --add channels 'defaults'   # lowest priority 
56 |    --add channels 'bioconda' 
57 |    --add channels 'conda-forge'   # highest priority 
58 | 
59 | If it doesn't look like that, try:: 
60 | 
61 |    conda config --add channels bioconda 
62 |    conda config --add channels conda-forge 
63 | 
64 | ********************************************* 
65 | Quick reference for clustering and evaluation 
66 | ********************************************* 
67 | 
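These same names are exposed programmatically, which is handy when assembling
``evaluate`` calls or config files (a quick sketch; it assumes the ``constants``
module of your installed version matches the tables below):

.. code-block:: python

    import hypercluster

    # clusterer name -> hyperparameters that are optimized by default
    print(hypercluster.constants.variables_to_optimize.keys())
    # evaluation metrics, split by whether they need a gold standard
    print(hypercluster.constants.inherent_metrics)
    print(hypercluster.constants.need_ground_truth)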
68 | .. list-table:: Clustering algorithms 
69 |    :widths: 50 50 
70 |    :header-rows: 1 
71 | 
72 |    * - Clusterer 
73 |      - Type 
74 |    * - KMeans/MiniBatch KMeans 
75 |      - Partitioner 
76 |    * - Affinity Propagation 
77 |      - Partitioner 
78 |    * - Mean Shift 
79 |      - Partitioner 
80 |    * - DBSCAN 
81 |      - Clusterer 
82 |    * - OPTICS 
83 |      - Clusterer 
84 |    * - Birch 
85 |      - Partitioner 
86 |    * - HDBSCAN 
87 |      - Clusterer 
88 |    * - NMF 
89 |      - Partitioner 
90 |    * - LouvainCluster 
91 |      - Partitioner 
92 |    * - LeidenCluster 
93 |      - Partitioner 
94 | 
95 | 
96 | .. list-table:: Evaluations 
97 |    :widths: 50 50 
98 |    :header-rows: 1 
99 | 
100 |    * - Metric 
101 |      - Type 
102 |    * - adjusted_rand_score 
103 |      - Needs ground truth 
104 |    * - adjusted_mutual_info_score 
105 |      - Needs ground truth 
106 |    * - homogeneity_score 
107 |      - Needs ground truth 
108 |    * - completeness_score 
109 |      - Needs ground truth 
110 |    * - fowlkes_mallows_score 
111 |      - Needs ground truth 
112 |    * - mutual_info_score 
113 |      - Needs ground truth 
114 |    * - v_measure_score 
115 |      - Needs ground truth 
116 |    * - silhouette_score 
117 |      - Inherent metric 
118 |    * - calinski_harabasz_score 
119 |      - Inherent metric 
120 |    * - davies_bouldin_score 
121 |      - Inherent metric 
122 |    * - smallest_largest_clusters_ratio 
123 |      - Inherent metric 
124 |    * - number_of_clusters 
125 |      - Inherent metric 
126 |    * - smallest_cluster_size 
127 |      - Inherent metric 
128 |    * - largest_cluster_size 
129 |      - Inherent metric 
130 | 
131 | 
132 | *********************** 
133 | Quickstart and examples 
134 | *********************** 
135 | 
136 | With snakemake: 
137 | --------------- 
138 | .. code-block:: 
139 | 
140 |     snakemake -s hypercluster.smk --configfile config.yml --config input_data_files=test_data input_data_folder=. 
141 | 
142 | With python: 
143 | ------------ 
144 | .. code-block:: python 
145 | 
146 |     import pandas as pd 
147 |     from sklearn.datasets import make_blobs 
148 |     import hypercluster 
149 | 
150 |     data, labels = make_blobs() 
151 |     data = pd.DataFrame(data) 
152 |     labels = pd.Series(labels, index=data.index, name='labels') 
153 | 
154 |     # With a single clustering algorithm 
155 |     clusterer = hypercluster.AutoClusterer() 
156 |     clusterer.fit(data).evaluate( 
157 |         methods = hypercluster.constants.need_ground_truth+hypercluster.constants.inherent_metrics, 
158 |         gold_standard = labels 
159 |     ) 
160 | 
161 |     clusterer.visualize_evaluations() 
162 | 
163 |     # With a range of algorithms 
164 | 
165 |     clusterer = hypercluster.MultiAutoClusterer() 
166 |     clusterer.fit(data).evaluate( 
167 |         methods = hypercluster.constants.need_ground_truth+hypercluster.constants.inherent_metrics, 
168 |         gold_standard = labels 
169 |     ) 
170 | 
171 |     clusterer.visualize_evaluations() 
172 | 
173 | Example workflows for both python and snakemake are 
174 | `here <https://github.com/liliblu/hypercluster/tree/dev/examples>`_ 
175 | 
176 | Source code is available `here <https://github.com/liliblu/hypercluster>`_ 
177 | 
-------------------------------------------------------------------------------- 
/docs/make.bat: 
-------------------------------------------------------------------------------- 
1 | @ECHO OFF 
2 | 
3 | pushd %~dp0 
4 | 
5 | REM Command file for Sphinx documentation 
6 | 
7 | if "%SPHINXBUILD%" == "" ( 
8 | 	set SPHINXBUILD=sphinx-build 
9 | ) 
10 | set SOURCEDIR=. 
11 | set BUILDDIR=_build 
12 | 
13 | if "%1" == "" goto help 
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL 
16 | if errorlevel 9009 ( 
17 | 	echo. 
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point 
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you 
21 | 	echo.may add the Sphinx directory to PATH. 
22 | 	echo. 
23 | 	echo.If you don't have Sphinx installed, grab it from 
24 | 	echo.http://sphinx-doc.org/ 
25 | 	exit /b 1 
26 | ) 
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 
29 | goto end 
30 | 
31 | :help 
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 
33 | 
34 | :end 
35 | popd 
36 | 
-------------------------------------------------------------------------------- 
/docs/requirements.txt: 
-------------------------------------------------------------------------------- 
1 | pandas==0.24.2 
2 | numpy==1.22.0 
3 | scipy==1.2.1 
4 | scikit-learn==0.22.0 
5 | sphinx==2.0.0 
6 | sphinx-argparse==0.2.5 
7 | sphinx-autodoc-typehints==1.7.0 
8 | sphinx_rtd_theme==0.4.3 
9 | hdbscan==0.8.24 
10 | snakemake==5.8.2 
11 | matplotlib==3.1.2 
12 | seaborn==0.9.0 
13 | python-igraph==0.7.1.post6 
14 | louvain==0.6.1 
15 | leidenalg==0.7.0 
-------------------------------------------------------------------------------- 
/docs/snakemake.rst: 
-------------------------------------------------------------------------------- 
1 | hypercluster SnakeMake pipeline 
2 | =============================== 
3 | 
4 | Line-by-line explanation of config.yml 
5 | -------------------------------------- 
6 | 
7 | .. list-table:: Explanation for config.yml 
8 |    :widths: 33 33 33 
9 |    :header-rows: 1 
10 | 
11 |    * - config.yml parameter 
12 |      - Explanation 
13 |      - Example from `scRNA-seq workflow <https://github.com/liliblu/hypercluster/tree/dev/examples/snakemake_scRNA_example>`_ 
14 |    * - ``input_data_folder`` 
15 |      - Path to folder in which input data can be found. No / at the end. 
16 |      - ``/input_data`` 
17 |    * - ``input_data_files`` 
18 |      - | List of prefixes of data files. Exclude the file extension; .csv, .tsv and .txt 
19 |        | are allowed. 
20 |      - ``['input_data1', 'input_data2']`` 
21 |    * - ``gold_standards`` 
22 |      - | File name of the gold standard file, per input file. Must have the same pandas.read_csv kwargs 
23 |        | as the corresponding input file. Must be in input_data_folder. 
24 |      - ``{'input_data': 'gold_standard_file.txt'}`` 
25 |    * - ``read_csv_kwargs`` 
26 |      - | Per input data file, keyword args to put into `pandas.read_csv <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html>`_. 
27 |        | **If specifying multiindex, also put the same in output_kwargs['labels']** 
28 |      - ``{'test_input': {'index_col':[0]}}`` 
29 |    * - ``output_folder`` 
30 |      - Path to folder in which results will be written. No / at the end. 
31 |      - ``/hypercluster_results`` 
32 |    * - ``intermediates_folder`` 
33 |      - | Name of the folder within the output_folder to put intermediate results, 
34 |        | such as labels and evaluations per condition. No need to change this usually. 
35 |      - ``clustering_intermediates`` 
36 |    * - ``clustering_results`` 
37 |      - | Name of the folder within the output_folder to put final results. 
38 |        | No need to change this usually. 
39 |      - ``clustering`` 
40 |    * - ``clusterer_kwargs`` 
41 |      - | Additional static keyword arguments to pass to individual clusterers. 
42 |        | Not to optimize. 
43 |      - ``{'KMeans': {'random_state': 8}}`` 
44 |    * - ``generate_parameters_addtl_kwargs`` 
45 |      - Additional keyword arguments for the hypercluster.AutoClusterer class. 
46 |      - ``{'KMeans': {'random_search': true, 'param_weights': {'n_clusters': {5: 0.25, 6: 0.75}}}}`` 
47 |    * - ``evaluations`` 
48 |      - | Names of evaluation metrics to use. See 
49 |        | hypercluster.constants.inherent_metrics or 
50 |        | hypercluster.constants.need_ground_truth 
51 |      - ``['silhouette_score', 'number_clustered']`` 
52 |    * - ``eval_kwargs`` 
53 |      - Additional kwargs per evaluation metric function. 
54 |      - ``{'silhouette_score': {'random_state': 8}}`` 
55 |    * - ``screeplot_evals`` 
56 |      - Metrics for which to draw scree plots. Must be a subset of metrics used to evaluate. 
57 |      - ``['silhouette_score', 'smallest_largest_clusters_ratio']`` 
58 |    * - ``metric_to_choose_best`` 
59 |      - | If picking best labels, which metric to maximize to choose the labels. If not choosing 
60 |        | best labels, leave as empty string (''). 
61 |      - ``silhouette_score`` 
62 |    * - ``metric_to_compare_labels`` 
63 |      - | If comparing pairwise similarity of labeling results, which metric to use. To skip 
64 |        | this comparison, leave blank or as an empty string. 
65 |      - ``adjusted_rand_score`` 
66 |    * - ``compare_samples`` 
67 |      - | Whether to make a table and figure with counts of how often two samples are in the same 
68 |        | cluster. 
69 |      - ``true`` 
70 |    * - ``output_kwargs`` 
71 |      - | pandas.to_csv and pandas.read_csv kwargs per output type. Generally, you 
72 |        | don't need to change the evaluations kwargs, but the labels index_col has to 
73 |        | match the index_col in read_csv_kwargs. 
74 |      - ``{'evaluations': {'index_col':[0]}, 'labels': {'index_col':[0]}}`` 
75 |    * - ``heatmap_kwargs`` 
76 |      - Additional kwargs for `seaborn.heatmap <https://seaborn.pydata.org/generated/seaborn.heatmap.html>`_ for visualizations. 
77 |      - ``{'vmin':-2, 'vmax':2}`` 
78 |    * - ``optimization_parameters`` 
79 |      - Fun part! This is where you put which hyperparameters per algorithm to try. 
80 |      - ``{'KMeans': {'n_clusters': [5, 6, 7]}}`` 
81 | 
82 | **Note:** Formatting of lists and dictionaries can be in python syntax (like above) or yaml syntax, or a mixture, like below. 
83 | 
84 | config.yml example from `scRNA-seq workflow <https://github.com/liliblu/hypercluster/tree/dev/examples/snakemake_scRNA_example>`_ 
85 | ---------------------------------------------------------------------------------------------------------------------------------- 
86 | 
87 | .. code-block:: yaml 
88 | 
89 |     input_data_folder: '.' 
90 |     input_data_files: 
91 |     - sc_data 
92 |     gold_standards: 
93 |         sc_data: 'gold_standard.csv' 
94 |     read_csv_kwargs: 
95 |         sc_data: {'index_col':[0]} 
96 | 
97 |     output_folder: 'results' 
98 |     intermediates_folder: 'clustering_intermediates' 
99 |     clustering_results: 'clustering' 
100 | 
101 |     clusterer_kwargs: {} 
102 |     generate_parameters_addtl_kwargs: {} 
103 | 
104 |     evaluations: 
105 |     - silhouette_score 
106 |     - calinski_harabasz_score 
107 |     - davies_bouldin_score 
108 |     - number_clustered 
109 |     - smallest_largest_clusters_ratio 
110 |     - smallest_cluster_ratio 
111 |     eval_kwargs: {} 
112 | 
113 |     metric_to_choose_best: silhouette_score 
114 |     metric_to_compare_labels: adjusted_rand_score 
115 |     compare_samples: true 
116 | 
117 |     output_kwargs: 
118 |         evaluations: 
119 |             index_col: [0] 
120 |         labels: 
121 |             index_col: [0] 
122 |     heatmap_kwargs: {} 
123 | 
124 |     optimization_parameters: 
125 |         HDBSCAN: 
126 |             min_cluster_size: &id002 
127 |             - 2 
128 |             - 3 
129 |             - 4 
130 |             - 5 
131 |         KMeans: 
132 |             n_clusters: &id001 
133 |             - 5 
134 |             - 6 
135 |             - 7 
136 |         MiniBatchKMeans: 
137 |             n_clusters: *id001 
138 |         OPTICS: 
139 |             min_samples: *id002 
140 |         NMFCluster: 
141 |             n_clusters: *id001 
142 |         LouvainCluster: &id003 
143 |             resolution: 
144 |             - 0.2 
145 |             - 0.4 
146 |             - 0.6 
147 |             - 0.8 
148 |             - 1.0 
149 |             - 1.2 
150 |             - 1.4 
151 |             - 1.6 
152 |             k: 
153 |             - 10 
154 |             - 15 
155 |             - 20 
156 |             - 40 
157 |             - 80 
158 |             - 120 
159 |         LeidenCluster: *id003 
-------------------------------------------------------------------------------- 
/examples/README.md: 
-------------------------------------------------------------------------------- 
1 | # Example jupyter notebooks for hypercluster 
2 | 
3 | ##### Roster: 
4 | 1. Running hypercluster with snakemake on a distributed HPC cluster, using scRNA-seq data from [Tikhonova AN et al. Nature. 
(2019)](https://www.ncbi.nlm.nih.gov/pmc/articles/pmid/30971824/) 5 | 2. Running hypercluster locally on the [TCGA breast cancer RNA data](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3465532/) 6 | 7 | If you would like another vignette, please open an [issue](https://github.com/liliblu/hypercluster/issues) 8 | -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/.ipynb_checkpoints/grid.scatter.LeidenCluster-silhouette_score-umaps-checkpoint.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/.ipynb_checkpoints/grid.scatter.LeidenCluster-silhouette_score-umaps-checkpoint.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/.ipynb_checkpoints/grid.scatter.louvain-umaps-checkpoint.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/.ipynb_checkpoints/grid.scatter.louvain-umaps-checkpoint.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/brca.rna.evaluations.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/brca.rna.evaluations.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/clustermap.nmf4-vs-psm50.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/clustermap.nmf4-vs-psm50.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/colorbar.LeidenCluster-silhouette_score.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/colorbar.LeidenCluster-silhouette_score.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/colorbar.LouvainCluster-adjusted_rand_score.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/colorbar.LouvainCluster-adjusted_rand_score.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/colorbar.NMFCluster-silhouette_score.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/colorbar.NMFCluster-silhouette_score.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/colorbar.silhouette_score.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/colorbar.silhouette_score.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/grid.scatter.LeidenCluster-silhouette_score-umaps.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/grid.scatter.LeidenCluster-silhouette_score-umaps.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/grid.scatter.LouvainCluster-adjusted_rand_score-umaps.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/grid.scatter.LouvainCluster-adjusted_rand_score-umaps.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/grid.scatter.NMFCluster-silhouette_score-umaps.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/grid.scatter.NMFCluster-silhouette_score-umaps.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/heatmap.brca-rna.evaluations.PAM50_comp.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/heatmap.brca-rna.evaluations.PAM50_comp.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/heatmap.brca-rna.evaluations.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/heatmap.brca-rna.evaluations.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/heatmaps.graphs-clusterers.metrics.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/heatmaps.graphs-clusterers.metrics.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/scatter.calinski_harabasz_score.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/scatter.calinski_harabasz_score.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/scatter.davies_bouldin_score.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/scatter.davies_bouldin_score.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/scatter.largest_cluster_size.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/scatter.largest_cluster_size.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/scatter.number_of_clusters.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/scatter.number_of_clusters.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/scatter.pca.various_clusters.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/scatter.pca.various_clusters.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/scatter.silhouette_score.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/scatter.silhouette_score.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/scatter.smallest_cluster_size.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/scatter.smallest_cluster_size.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/scatter.smallest_largest_clusters_ratio.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/scatter.smallest_largest_clusters_ratio.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/jupyter-lab-5918844.log: -------------------------------------------------------------------------------- 1 | 2 | MacOS or linux terminal command to create your ssh tunnel: 3 | ssh -N -L 8693:fn-0002:8693 lmb529@bigpurple3.nyumc.org 4 | 5 | For more info and how to connect from windows, 6 | see research.computing.yale.edu/jupyter-nb 7 | Here is the MobaXterm info: 8 | 9 | Forwarded port:same as remote port 10 | Remote server: fn-0002 11 | Remote port: 8693 12 | SSH server: bigpurple3.nyumc.org 13 | SSH login: lmb529 14 | SSH port: 22 15 | 16 | Use a Browser on your local machine to go to: 17 | http://localhost:8693 (prefix w/ https:// if using password) 18 | 19 | /cm/local/apps/slurm/var/spool/job5918844/slurm_script: line 57: activate: No such file or directory 20 | [I 11:42:30.612 LabApp] JupyterLab extension 
loaded from /gpfs/data/ruggleslab/home/lmb529/conda/envs/hc_test/lib/python3.7/site-packages/jupyterlab 21 | [I 11:42:30.614 LabApp] JupyterLab application directory is /gpfs/data/ruggleslab/home/lmb529/conda/envs/hc_test/share/jupyter/lab 22 | [I 11:42:30.619 LabApp] Serving notebooks from local directory: /gpfs/data/ruggleslab 23 | [I 11:42:30.619 LabApp] The Jupyter Notebook is running at: 24 | [I 11:42:30.619 LabApp] http://fn-0002:8693/?token=fc13b11a9a39ac2da71f8cedf08311581dddfd01c51561fb 25 | [I 11:42:30.620 LabApp] or http://127.0.0.1:8693/?token=fc13b11a9a39ac2da71f8cedf08311581dddfd01c51561fb 26 | [I 11:42:30.620 LabApp] Use Control-C to stop this server and shut down all kernels (twice to skip confirmation). 27 | [C 11:42:30.662 LabApp] 28 | 29 | To access the notebook, open this file in a browser: 30 | file:///gpfs/home/lmb529/.local/share/jupyter/runtime/nbserver-187904-open.html 31 | Or copy and paste one of these URLs: 32 | http://fn-0002:8693/?token=fc13b11a9a39ac2da71f8cedf08311581dddfd01c51561fb 33 | or http://127.0.0.1:8693/?token=fc13b11a9a39ac2da71f8cedf08311581dddfd01c51561fb 34 | [I 11:43:24.452 LabApp] 302 GET / (192.168.0.103) 0.60ms 35 | [I 11:43:24.467 LabApp] 302 GET /lab? (192.168.0.103) 0.85ms 36 | [I 11:43:35.596 LabApp] 302 POST /login?next=%2Flab%3F (192.168.0.103) 1.19ms 37 | [W 11:43:36.846 LabApp] Could not determine jupyterlab build status without nodejs 38 | [I 11:43:38.497 LabApp] Kernel started: 5905720c-dcbc-407d-acd9-e3e535378860 39 | [I 11:43:38.533 LabApp] Kernel started: e06fce5a-5298-45f3-99f7-3c929235c5b6 40 | [I 11:45:38.369 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 41 | [I 11:59:39.671 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 42 | [I 12:01:39.875 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 43 | [I 12:03:40.086 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 44 | [I 12:05:40.254 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 45 | [I 12:17:41.100 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 46 | [I 12:19:06.673 LabApp] Starting buffering for 5905720c-dcbc-407d-acd9-e3e535378860:33d3baf4-9661-40db-ac0b-77ed4e3a86f8 47 | [I 12:19:06.675 LabApp] Starting buffering for e06fce5a-5298-45f3-99f7-3c929235c5b6:e9bab26b-244d-42fc-84f9-8dd2884de9d2 48 | [I 12:19:07.729 LabApp] Restoring connection for 5905720c-dcbc-407d-acd9-e3e535378860:33d3baf4-9661-40db-ac0b-77ed4e3a86f8 49 | [I 12:19:41.442 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 50 | [W 12:23:07.712 LabApp] WebSocket ping timeout after 119994 ms. 51 | [W 12:23:07.730 LabApp] WebSocket ping timeout after 119993 ms. 52 | [W 12:23:07.753 LabApp] WebSocket ping timeout after 119994 ms. 
53 | [I 12:23:12.731 LabApp] Starting buffering for 5905720c-dcbc-407d-acd9-e3e535378860:33d3baf4-9661-40db-ac0b-77ed4e3a86f8 54 | [I 12:23:12.754 LabApp] Starting buffering for e06fce5a-5298-45f3-99f7-3c929235c5b6:e9bab26b-244d-42fc-84f9-8dd2884de9d2 55 | [W 14:19:17.494 LabApp] Could not determine jupyterlab build status without nodejs 56 | [I 14:21:18.748 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 57 | [I 14:26:35.854 LabApp] Starting buffering for e06fce5a-5298-45f3-99f7-3c929235c5b6:aa6c73a1-652a-453d-904b-1eeed7fce497 58 | [I 14:26:35.856 LabApp] Starting buffering for 5905720c-dcbc-407d-acd9-e3e535378860:98f03fb6-73e6-4f56-aa66-a6b71770f0be 59 | [W 14:26:40.187 LabApp] Could not determine jupyterlab build status without nodejs 60 | [I 14:36:43.172 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 61 | [I 14:38:43.384 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 62 | [I 14:40:43.602 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 63 | [I 14:42:43.820 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 64 | [I 15:02:24.900 LabApp] Kernel restarted: e06fce5a-5298-45f3-99f7-3c929235c5b6 65 | [I 15:02:46.333 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 66 | [I 15:03:18.710 LabApp] Kernel restarted: e06fce5a-5298-45f3-99f7-3c929235c5b6 67 | [I 15:04:46.632 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 68 | [I 15:06:53.109 LabApp] Kernel restarted: e06fce5a-5298-45f3-99f7-3c929235c5b6 69 | [I 15:08:20.203 LabApp] Kernel restarted: e06fce5a-5298-45f3-99f7-3c929235c5b6 70 | [I 15:08:46.952 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 71 | [I 15:09:40.773 LabApp] Kernel restarted: e06fce5a-5298-45f3-99f7-3c929235c5b6 72 | [I 15:10:47.129 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 73 | [I 15:14:24.124 LabApp] Kernel restarted: e06fce5a-5298-45f3-99f7-3c929235c5b6 74 | [I 15:14:47.393 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 75 | [I 15:16:47.623 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 76 | [I 15:16:53.281 LabApp] Starting buffering for e06fce5a-5298-45f3-99f7-3c929235c5b6:5ce1fe1d-30f0-4421-9c82-e5ca9ff11afa 77 | [I 15:16:53.281 LabApp] Starting buffering for 5905720c-dcbc-407d-acd9-e3e535378860:995fe3b6-23fe-4bef-82b6-d3935b271115 78 | [I 15:16:55.674 LabApp] Restoring connection for 5905720c-dcbc-407d-acd9-e3e535378860:995fe3b6-23fe-4bef-82b6-d3935b271115 79 | [I 15:16:55.707 LabApp] Restoring connection for e06fce5a-5298-45f3-99f7-3c929235c5b6:5ce1fe1d-30f0-4421-9c82-e5ca9ff11afa 80 | [I 15:18:47.945 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 81 | [I 15:20:48.169 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 82 | [I 15:24:48.485 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 83 | [I 15:25:05.450 LabApp] Kernel restarted: e06fce5a-5298-45f3-99f7-3c929235c5b6 84 | [I 15:26:48.688 LabApp] Saving file at 
/home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 85 | [I 15:28:49.229 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 86 | [I 15:29:33.256 LabApp] Kernel restarted: e06fce5a-5298-45f3-99f7-3c929235c5b6 87 | [I 15:30:49.455 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 88 | [I 15:32:49.772 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 89 | [I 15:34:50.095 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 90 | [I 15:36:50.456 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 91 | [I 15:40:50.739 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 92 | [I 15:42:50.989 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 93 | [I 15:44:51.348 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 94 | [I 15:45:05.779 LabApp] Kernel restarted: e06fce5a-5298-45f3-99f7-3c929235c5b6 95 | [I 15:46:52.685 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 96 | [I 15:48:54.259 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 97 | [I 15:50:28.449 LabApp] Starting buffering for 5905720c-dcbc-407d-acd9-e3e535378860:995fe3b6-23fe-4bef-82b6-d3935b271115 98 | [I 15:50:28.467 LabApp] Starting buffering for e06fce5a-5298-45f3-99f7-3c929235c5b6:5ce1fe1d-30f0-4421-9c82-e5ca9ff11afa 99 | [I 15:50:29.737 LabApp] Restoring connection for e06fce5a-5298-45f3-99f7-3c929235c5b6:5ce1fe1d-30f0-4421-9c82-e5ca9ff11afa 100 | [I 15:50:29.763 LabApp] Restoring connection for 5905720c-dcbc-407d-acd9-e3e535378860:995fe3b6-23fe-4bef-82b6-d3935b271115 101 | [I 15:50:55.952 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 102 | [W 15:52:59.739 LabApp] WebSocket ping timeout after 119935 ms. 103 | [W 15:52:59.763 LabApp] WebSocket ping timeout after 119956 ms. 104 | [I 15:53:04.755 LabApp] Starting buffering for e06fce5a-5298-45f3-99f7-3c929235c5b6:5ce1fe1d-30f0-4421-9c82-e5ca9ff11afa 105 | [I 15:53:04.764 LabApp] Starting buffering for 5905720c-dcbc-407d-acd9-e3e535378860:995fe3b6-23fe-4bef-82b6-d3935b271115 106 | [W 16:24:46.888 LabApp] Could not determine jupyterlab build status without nodejs 107 | [I 16:26:48.786 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 108 | [I 16:28:49.624 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 109 | [I 16:30:51.525 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 110 | [W 17:23:18.493 LabApp] WebSocket ping timeout after 119943 ms. 111 | [W 17:23:18.571 LabApp] WebSocket ping timeout after 119982 ms. 
112 | [I 17:23:23.521 LabApp] Starting buffering for 5905720c-dcbc-407d-acd9-e3e535378860:c9dfa09f-3795-4e25-a45b-7a97fcdc6906 113 | [I 17:23:23.573 LabApp] Starting buffering for e06fce5a-5298-45f3-99f7-3c929235c5b6:2de51812-9e38-436b-a0e6-45ad9c32335a 114 | slurmstepd-fn-0002: error: *** JOB 5918844 ON fn-0002 CANCELLED AT 2020-01-07T21:42:49 DUE TO TIME LIMIT *** 115 | -------------------------------------------------------------------------------- /examples/snakemake_scRNA_example/cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "__default__": { 3 | "job-name": "snakemake", 4 | "time": "4-23:59:59", 5 | "mem": "2G", 6 | "partition": "fn_medium", 7 | "cpus-per-task": 1, 8 | "output": "logs/slurm/%j.out", 9 | "error": "logs/slurm/%j.err" 10 | }, 11 | "run_clusterer": { 12 | "job-name": "clusterer", 13 | "time": "4-23:59:59", 14 | "mem": "32G", 15 | "partition": "fn_medium", 16 | "cpus-per-task": 8, 17 | "output": "logs/slurm/cluster-%j.out", 18 | "error": "logs/slurm/cluster-%j.err" 19 | }, 20 | "run_evaluation": { 21 | "job-name": "evaluate", 22 | "time": "4-23:59:59", 23 | "mem": "2G", 24 | "partition": "fn_medium", 25 | "output": "logs/slurm/evaluate-%j.out", 26 | "error": "logs/slurm/evaluate-%j.err" 27 | }, 28 | "compare_labels": { 29 | "job-name": "compare_labels", 30 | "time": "4-23:59:59", 31 | "mem": "32G", 32 | "cpus-per-task": 2, 33 | "partition": "fn_medium", 34 | "output": "logs/slurm/compare_labels-%j.out", 35 | "error": "logs/slurm/compare_labels-%j.err" 36 | }, 37 | "compare_samples": { 38 | "job-name": "compare_samples", 39 | "time": "4-23:59:59", 40 | "mem": "48G", 41 | "cpus-per-task": 2, 42 | "partition": "fn_medium", 43 | "output": "logs/slurm/compare_samples-%j.out", 44 | "error": "logs/slurm/compare_samples-%j.err" 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /examples/snakemake_scRNA_example/config.yml: -------------------------------------------------------------------------------- 1 | clusterer_kwargs: {} 2 | clustering_results: clustering 3 | compare_samples: false 4 | eval_kwargs: {} 5 | evaluations: 6 | - adjusted_mutual_info_score 7 | - adjusted_rand_score 8 | - calinski_harabasz_score 9 | - completeness_score 10 | - davies_bouldin_score 11 | - fowlkes_mallows_score 12 | - homogeneity_score 13 | - largest_cluster_size 14 | - mutual_info_score 15 | - number_of_clusters 16 | - silhouette_score 17 | - smallest_cluster_size 18 | - smallest_largest_clusters_ratio 19 | - v_measure_score 20 | generate_parameters_addtl_kwargs: {} 21 | gold_standards: 22 | sc_data: gold_standard.csv 23 | sc_data_pca: gold_standard.csv 24 | heatmap_kwargs: {} 25 | input_data_files: 26 | - sc_data 27 | - sc_data_pca 28 | input_data_folder: /gpfs/data/ruggleslab/home/lmb529/hypercluster/examples/snakemake_scRNA_example 29 | intermediates_folder: clustering_intermediates 30 | metric_to_choose_best: adjusted_rand_score 31 | metric_to_compare_labels: adjusted_rand_score 32 | optimization_parameters: 33 | AffinityPropagation: 34 | damping: 35 | - 0.55 36 | - 0.6 37 | - 0.65 38 | - 0.7 39 | - 0.75 40 | - 0.8 41 | - 0.85 42 | - 0.9 43 | - 0.95 44 | LeidenCluster: 45 | adjacency_method: 46 | - MNN 47 | - CNN 48 | k: 49 | - 20 50 | - 30 51 | - 40 52 | - 80 53 | - 120 54 | resolution: 55 | - 0.3 56 | - 0.4 57 | - 0.5 58 | - 0.6 59 | - 0.7 60 | - 0.8 61 | - 0.9 62 | - 1.0 63 | - 1.2 64 | - 1.4 65 | LouvainCluster: 66 | adjacency_method: 67 | - MNN 68 | - CNN 69 | k: 70 | - 20 71 | - 30 72 | - 40
73 | - 80 74 | - 120 75 | resolution: 76 | - 0.3 77 | - 0.4 78 | - 0.5 79 | - 0.6 80 | - 0.7 81 | - 0.8 82 | - 0.9 83 | - 1.0 84 | - 1.2 85 | - 1.4 86 | NMFCluster: 87 | n_clusters: 88 | - 3 89 | - 4 90 | - 5 91 | - 6 92 | - 7 93 | - 8 94 | - 9 95 | - 10 96 | - 11 97 | - 12 98 | - 13 99 | - 14 100 | - 15 101 | - 16 102 | - 17 103 | - 18 104 | - 19 105 | - 20 106 | - 21 107 | - 22 108 | - 23 109 | - 24 110 | - 25 111 | - 26 112 | - 27 113 | - 28 114 | - 29 115 | - 30 116 | - 31 117 | - 32 118 | - 33 119 | - 34 120 | - 35 121 | - 36 122 | - 37 123 | - 38 124 | - 39 125 | output_folder: results_NN_test 126 | output_kwargs: 127 | evaluations: 128 | index_col: 129 | - 0 130 | labels: 131 | index_col: 132 | - 0 133 | read_csv_kwargs: 134 | sc_data: 135 | index_col: 136 | - 0 137 | sc_data_pca: 138 | index_col: 139 | - 0 140 | screeplot_evals: 141 | - adjusted_mutual_info_score 142 | - adjusted_rand_score 143 | - calinski_harabasz_score 144 | - completeness_score 145 | - davies_bouldin_score 146 | - fowlkes_mallows_score 147 | - homogeneity_score 148 | - largest_cluster_size 149 | - mutual_info_score 150 | - number_of_clusters 151 | - silhouette_score 152 | - smallest_cluster_size 153 | - smallest_largest_clusters_ratio 154 | - v_measure_score 155 | -------------------------------------------------------------------------------- /examples/snakemake_scRNA_example/figures/heatmaps.graphs-clusterers.metrics.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/snakemake_scRNA_example/figures/heatmaps.graphs-clusterers.metrics.pdf -------------------------------------------------------------------------------- /examples/snakemake_scRNA_example/figures/pca.best_labels.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/snakemake_scRNA_example/figures/pca.best_labels.pdf -------------------------------------------------------------------------------- /examples/snakemake_scRNA_example/figures/pca.published_labels.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/snakemake_scRNA_example/figures/pca.published_labels.pdf -------------------------------------------------------------------------------- /examples/snakemake_scRNA_example/figures/umap.best_labels.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/snakemake_scRNA_example/figures/umap.best_labels.pdf -------------------------------------------------------------------------------- /examples/snakemake_scRNA_example/figures/umap.published_labels.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/snakemake_scRNA_example/figures/umap.published_labels.pdf -------------------------------------------------------------------------------- /examples/snakemake_scRNA_example/params_to_test.yml: -------------------------------------------------------------------------------- 1 | AffinityPropagation;damping-0.55: 2 | clusterer: AffinityPropagation 3 | damping: 0.55 4 | 
AffinityPropagation;damping-0.6: 5 | clusterer: AffinityPropagation 6 | damping: 0.6 7 | AffinityPropagation;damping-0.65: 8 | clusterer: AffinityPropagation 9 | damping: 0.65 10 | AffinityPropagation;damping-0.7: 11 | clusterer: AffinityPropagation 12 | damping: 0.7 13 | AffinityPropagation;damping-0.75: 14 | clusterer: AffinityPropagation 15 | damping: 0.75 16 | AffinityPropagation;damping-0.8: 17 | clusterer: AffinityPropagation 18 | damping: 0.8 19 | AffinityPropagation;damping-0.85: 20 | clusterer: AffinityPropagation 21 | damping: 0.85 22 | AffinityPropagation;damping-0.9: 23 | clusterer: AffinityPropagation 24 | damping: 0.9 25 | AffinityPropagation;damping-0.95: 26 | clusterer: AffinityPropagation 27 | damping: 0.95 28 | HDBSCAN;min_cluster_size-10: 29 | clusterer: HDBSCAN 30 | min_cluster_size: 10 31 | HDBSCAN;min_cluster_size-11: 32 | clusterer: HDBSCAN 33 | min_cluster_size: 11 34 | HDBSCAN;min_cluster_size-12: 35 | clusterer: HDBSCAN 36 | min_cluster_size: 12 37 | HDBSCAN;min_cluster_size-13: 38 | clusterer: HDBSCAN 39 | min_cluster_size: 13 40 | HDBSCAN;min_cluster_size-14: 41 | clusterer: HDBSCAN 42 | min_cluster_size: 14 43 | HDBSCAN;min_cluster_size-15: 44 | clusterer: HDBSCAN 45 | min_cluster_size: 15 46 | HDBSCAN;min_cluster_size-16: 47 | clusterer: HDBSCAN 48 | min_cluster_size: 16 49 | HDBSCAN;min_cluster_size-2: 50 | clusterer: HDBSCAN 51 | min_cluster_size: 2 52 | HDBSCAN;min_cluster_size-3: 53 | clusterer: HDBSCAN 54 | min_cluster_size: 3 55 | HDBSCAN;min_cluster_size-4: 56 | clusterer: HDBSCAN 57 | min_cluster_size: 4 58 | HDBSCAN;min_cluster_size-5: 59 | clusterer: HDBSCAN 60 | min_cluster_size: 5 61 | HDBSCAN;min_cluster_size-6: 62 | clusterer: HDBSCAN 63 | min_cluster_size: 6 64 | HDBSCAN;min_cluster_size-7: 65 | clusterer: HDBSCAN 66 | min_cluster_size: 7 67 | HDBSCAN;min_cluster_size-8: 68 | clusterer: HDBSCAN 69 | min_cluster_size: 8 70 | HDBSCAN;min_cluster_size-9: 71 | clusterer: HDBSCAN 72 | min_cluster_size: 9 73 | KMeans;n_clusters-10: 74 | clusterer: KMeans 75 | n_clusters: 10 76 | KMeans;n_clusters-11: 77 | clusterer: KMeans 78 | n_clusters: 11 79 | KMeans;n_clusters-12: 80 | clusterer: KMeans 81 | n_clusters: 12 82 | KMeans;n_clusters-13: 83 | clusterer: KMeans 84 | n_clusters: 13 85 | KMeans;n_clusters-14: 86 | clusterer: KMeans 87 | n_clusters: 14 88 | KMeans;n_clusters-15: 89 | clusterer: KMeans 90 | n_clusters: 15 91 | KMeans;n_clusters-16: 92 | clusterer: KMeans 93 | n_clusters: 16 94 | KMeans;n_clusters-17: 95 | clusterer: KMeans 96 | n_clusters: 17 97 | KMeans;n_clusters-18: 98 | clusterer: KMeans 99 | n_clusters: 18 100 | KMeans;n_clusters-19: 101 | clusterer: KMeans 102 | n_clusters: 19 103 | KMeans;n_clusters-2: 104 | clusterer: KMeans 105 | n_clusters: 2 106 | KMeans;n_clusters-20: 107 | clusterer: KMeans 108 | n_clusters: 20 109 | KMeans;n_clusters-21: 110 | clusterer: KMeans 111 | n_clusters: 21 112 | KMeans;n_clusters-22: 113 | clusterer: KMeans 114 | n_clusters: 22 115 | KMeans;n_clusters-23: 116 | clusterer: KMeans 117 | n_clusters: 23 118 | KMeans;n_clusters-24: 119 | clusterer: KMeans 120 | n_clusters: 24 121 | KMeans;n_clusters-25: 122 | clusterer: KMeans 123 | n_clusters: 25 124 | KMeans;n_clusters-26: 125 | clusterer: KMeans 126 | n_clusters: 26 127 | KMeans;n_clusters-27: 128 | clusterer: KMeans 129 | n_clusters: 27 130 | KMeans;n_clusters-28: 131 | clusterer: KMeans 132 | n_clusters: 28 133 | KMeans;n_clusters-29: 134 | clusterer: KMeans 135 | n_clusters: 29 136 | KMeans;n_clusters-3: 137 | clusterer: KMeans 138 | n_clusters: 3 
139 | KMeans;n_clusters-30: 140 | clusterer: KMeans 141 | n_clusters: 30 142 | KMeans;n_clusters-31: 143 | clusterer: KMeans 144 | n_clusters: 31 145 | KMeans;n_clusters-32: 146 | clusterer: KMeans 147 | n_clusters: 32 148 | KMeans;n_clusters-33: 149 | clusterer: KMeans 150 | n_clusters: 33 151 | KMeans;n_clusters-34: 152 | clusterer: KMeans 153 | n_clusters: 34 154 | KMeans;n_clusters-35: 155 | clusterer: KMeans 156 | n_clusters: 35 157 | KMeans;n_clusters-36: 158 | clusterer: KMeans 159 | n_clusters: 36 160 | KMeans;n_clusters-37: 161 | clusterer: KMeans 162 | n_clusters: 37 163 | KMeans;n_clusters-38: 164 | clusterer: KMeans 165 | n_clusters: 38 166 | KMeans;n_clusters-39: 167 | clusterer: KMeans 168 | n_clusters: 39 169 | KMeans;n_clusters-4: 170 | clusterer: KMeans 171 | n_clusters: 4 172 | KMeans;n_clusters-40: 173 | clusterer: KMeans 174 | n_clusters: 40 175 | KMeans;n_clusters-41: 176 | clusterer: KMeans 177 | n_clusters: 41 178 | KMeans;n_clusters-42: 179 | clusterer: KMeans 180 | n_clusters: 42 181 | KMeans;n_clusters-43: 182 | clusterer: KMeans 183 | n_clusters: 43 184 | KMeans;n_clusters-44: 185 | clusterer: KMeans 186 | n_clusters: 44 187 | KMeans;n_clusters-45: 188 | clusterer: KMeans 189 | n_clusters: 45 190 | KMeans;n_clusters-46: 191 | clusterer: KMeans 192 | n_clusters: 46 193 | KMeans;n_clusters-47: 194 | clusterer: KMeans 195 | n_clusters: 47 196 | KMeans;n_clusters-48: 197 | clusterer: KMeans 198 | n_clusters: 48 199 | KMeans;n_clusters-49: 200 | clusterer: KMeans 201 | n_clusters: 49 202 | KMeans;n_clusters-5: 203 | clusterer: KMeans 204 | n_clusters: 5 205 | KMeans;n_clusters-50: 206 | clusterer: KMeans 207 | n_clusters: 50 208 | KMeans;n_clusters-51: 209 | clusterer: KMeans 210 | n_clusters: 51 211 | KMeans;n_clusters-52: 212 | clusterer: KMeans 213 | n_clusters: 52 214 | KMeans;n_clusters-53: 215 | clusterer: KMeans 216 | n_clusters: 53 217 | KMeans;n_clusters-54: 218 | clusterer: KMeans 219 | n_clusters: 54 220 | KMeans;n_clusters-55: 221 | clusterer: KMeans 222 | n_clusters: 55 223 | KMeans;n_clusters-56: 224 | clusterer: KMeans 225 | n_clusters: 56 226 | KMeans;n_clusters-57: 227 | clusterer: KMeans 228 | n_clusters: 57 229 | KMeans;n_clusters-58: 230 | clusterer: KMeans 231 | n_clusters: 58 232 | KMeans;n_clusters-59: 233 | clusterer: KMeans 234 | n_clusters: 59 235 | KMeans;n_clusters-6: 236 | clusterer: KMeans 237 | n_clusters: 6 238 | KMeans;n_clusters-7: 239 | clusterer: KMeans 240 | n_clusters: 7 241 | KMeans;n_clusters-8: 242 | clusterer: KMeans 243 | n_clusters: 8 244 | KMeans;n_clusters-9: 245 | clusterer: KMeans 246 | n_clusters: 9 247 | MiniBatchKMeans;n_clusters-10: 248 | clusterer: MiniBatchKMeans 249 | n_clusters: 10 250 | MiniBatchKMeans;n_clusters-11: 251 | clusterer: MiniBatchKMeans 252 | n_clusters: 11 253 | MiniBatchKMeans;n_clusters-12: 254 | clusterer: MiniBatchKMeans 255 | n_clusters: 12 256 | MiniBatchKMeans;n_clusters-13: 257 | clusterer: MiniBatchKMeans 258 | n_clusters: 13 259 | MiniBatchKMeans;n_clusters-14: 260 | clusterer: MiniBatchKMeans 261 | n_clusters: 14 262 | MiniBatchKMeans;n_clusters-15: 263 | clusterer: MiniBatchKMeans 264 | n_clusters: 15 265 | MiniBatchKMeans;n_clusters-16: 266 | clusterer: MiniBatchKMeans 267 | n_clusters: 16 268 | MiniBatchKMeans;n_clusters-17: 269 | clusterer: MiniBatchKMeans 270 | n_clusters: 17 271 | MiniBatchKMeans;n_clusters-18: 272 | clusterer: MiniBatchKMeans 273 | n_clusters: 18 274 | MiniBatchKMeans;n_clusters-19: 275 | clusterer: MiniBatchKMeans 276 | n_clusters: 19 277 | 
MiniBatchKMeans;n_clusters-2: 278 | clusterer: MiniBatchKMeans 279 | n_clusters: 2 280 | MiniBatchKMeans;n_clusters-20: 281 | clusterer: MiniBatchKMeans 282 | n_clusters: 20 283 | MiniBatchKMeans;n_clusters-21: 284 | clusterer: MiniBatchKMeans 285 | n_clusters: 21 286 | MiniBatchKMeans;n_clusters-22: 287 | clusterer: MiniBatchKMeans 288 | n_clusters: 22 289 | MiniBatchKMeans;n_clusters-23: 290 | clusterer: MiniBatchKMeans 291 | n_clusters: 23 292 | MiniBatchKMeans;n_clusters-24: 293 | clusterer: MiniBatchKMeans 294 | n_clusters: 24 295 | MiniBatchKMeans;n_clusters-25: 296 | clusterer: MiniBatchKMeans 297 | n_clusters: 25 298 | MiniBatchKMeans;n_clusters-26: 299 | clusterer: MiniBatchKMeans 300 | n_clusters: 26 301 | MiniBatchKMeans;n_clusters-27: 302 | clusterer: MiniBatchKMeans 303 | n_clusters: 27 304 | MiniBatchKMeans;n_clusters-28: 305 | clusterer: MiniBatchKMeans 306 | n_clusters: 28 307 | MiniBatchKMeans;n_clusters-29: 308 | clusterer: MiniBatchKMeans 309 | n_clusters: 29 310 | MiniBatchKMeans;n_clusters-3: 311 | clusterer: MiniBatchKMeans 312 | n_clusters: 3 313 | MiniBatchKMeans;n_clusters-30: 314 | clusterer: MiniBatchKMeans 315 | n_clusters: 30 316 | MiniBatchKMeans;n_clusters-31: 317 | clusterer: MiniBatchKMeans 318 | n_clusters: 31 319 | MiniBatchKMeans;n_clusters-32: 320 | clusterer: MiniBatchKMeans 321 | n_clusters: 32 322 | MiniBatchKMeans;n_clusters-33: 323 | clusterer: MiniBatchKMeans 324 | n_clusters: 33 325 | MiniBatchKMeans;n_clusters-34: 326 | clusterer: MiniBatchKMeans 327 | n_clusters: 34 328 | MiniBatchKMeans;n_clusters-35: 329 | clusterer: MiniBatchKMeans 330 | n_clusters: 35 331 | MiniBatchKMeans;n_clusters-36: 332 | clusterer: MiniBatchKMeans 333 | n_clusters: 36 334 | MiniBatchKMeans;n_clusters-37: 335 | clusterer: MiniBatchKMeans 336 | n_clusters: 37 337 | MiniBatchKMeans;n_clusters-38: 338 | clusterer: MiniBatchKMeans 339 | n_clusters: 38 340 | MiniBatchKMeans;n_clusters-39: 341 | clusterer: MiniBatchKMeans 342 | n_clusters: 39 343 | MiniBatchKMeans;n_clusters-4: 344 | clusterer: MiniBatchKMeans 345 | n_clusters: 4 346 | MiniBatchKMeans;n_clusters-40: 347 | clusterer: MiniBatchKMeans 348 | n_clusters: 40 349 | MiniBatchKMeans;n_clusters-41: 350 | clusterer: MiniBatchKMeans 351 | n_clusters: 41 352 | MiniBatchKMeans;n_clusters-42: 353 | clusterer: MiniBatchKMeans 354 | n_clusters: 42 355 | MiniBatchKMeans;n_clusters-43: 356 | clusterer: MiniBatchKMeans 357 | n_clusters: 43 358 | MiniBatchKMeans;n_clusters-44: 359 | clusterer: MiniBatchKMeans 360 | n_clusters: 44 361 | MiniBatchKMeans;n_clusters-45: 362 | clusterer: MiniBatchKMeans 363 | n_clusters: 45 364 | MiniBatchKMeans;n_clusters-46: 365 | clusterer: MiniBatchKMeans 366 | n_clusters: 46 367 | MiniBatchKMeans;n_clusters-47: 368 | clusterer: MiniBatchKMeans 369 | n_clusters: 47 370 | MiniBatchKMeans;n_clusters-48: 371 | clusterer: MiniBatchKMeans 372 | n_clusters: 48 373 | MiniBatchKMeans;n_clusters-49: 374 | clusterer: MiniBatchKMeans 375 | n_clusters: 49 376 | MiniBatchKMeans;n_clusters-5: 377 | clusterer: MiniBatchKMeans 378 | n_clusters: 5 379 | MiniBatchKMeans;n_clusters-50: 380 | clusterer: MiniBatchKMeans 381 | n_clusters: 50 382 | MiniBatchKMeans;n_clusters-51: 383 | clusterer: MiniBatchKMeans 384 | n_clusters: 51 385 | MiniBatchKMeans;n_clusters-52: 386 | clusterer: MiniBatchKMeans 387 | n_clusters: 52 388 | MiniBatchKMeans;n_clusters-53: 389 | clusterer: MiniBatchKMeans 390 | n_clusters: 53 391 | MiniBatchKMeans;n_clusters-54: 392 | clusterer: MiniBatchKMeans 393 | n_clusters: 54 394 | 
MiniBatchKMeans;n_clusters-55: 395 | clusterer: MiniBatchKMeans 396 | n_clusters: 55 397 | MiniBatchKMeans;n_clusters-56: 398 | clusterer: MiniBatchKMeans 399 | n_clusters: 56 400 | MiniBatchKMeans;n_clusters-57: 401 | clusterer: MiniBatchKMeans 402 | n_clusters: 57 403 | MiniBatchKMeans;n_clusters-58: 404 | clusterer: MiniBatchKMeans 405 | n_clusters: 58 406 | MiniBatchKMeans;n_clusters-59: 407 | clusterer: MiniBatchKMeans 408 | n_clusters: 59 409 | MiniBatchKMeans;n_clusters-6: 410 | clusterer: MiniBatchKMeans 411 | n_clusters: 6 412 | MiniBatchKMeans;n_clusters-7: 413 | clusterer: MiniBatchKMeans 414 | n_clusters: 7 415 | MiniBatchKMeans;n_clusters-8: 416 | clusterer: MiniBatchKMeans 417 | n_clusters: 8 418 | MiniBatchKMeans;n_clusters-9: 419 | clusterer: MiniBatchKMeans 420 | n_clusters: 9 421 | OPTICS;min_samples-10: 422 | clusterer: OPTICS 423 | min_samples: 10 424 | OPTICS;min_samples-11: 425 | clusterer: OPTICS 426 | min_samples: 11 427 | OPTICS;min_samples-12: 428 | clusterer: OPTICS 429 | min_samples: 12 430 | OPTICS;min_samples-13: 431 | clusterer: OPTICS 432 | min_samples: 13 433 | OPTICS;min_samples-14: 434 | clusterer: OPTICS 435 | min_samples: 14 436 | OPTICS;min_samples-15: 437 | clusterer: OPTICS 438 | min_samples: 15 439 | OPTICS;min_samples-16: 440 | clusterer: OPTICS 441 | min_samples: 16 442 | OPTICS;min_samples-2: 443 | clusterer: OPTICS 444 | min_samples: 2 445 | OPTICS;min_samples-3: 446 | clusterer: OPTICS 447 | min_samples: 3 448 | OPTICS;min_samples-4: 449 | clusterer: OPTICS 450 | min_samples: 4 451 | OPTICS;min_samples-5: 452 | clusterer: OPTICS 453 | min_samples: 5 454 | OPTICS;min_samples-6: 455 | clusterer: OPTICS 456 | min_samples: 6 457 | OPTICS;min_samples-7: 458 | clusterer: OPTICS 459 | min_samples: 7 460 | OPTICS;min_samples-8: 461 | clusterer: OPTICS 462 | min_samples: 8 463 | OPTICS;min_samples-9: 464 | clusterer: OPTICS 465 | min_samples: 9 466 | -------------------------------------------------------------------------------- /examples/snakemake_scRNA_example/snakemake_submit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH --partition cpu_long 3 | #SBATCH --mem 4G 4 | #SBATCH --time 27-23:59:59 5 | #SBATCH --job-name snakeautocluster 6 | #SBATCH --cpus-per-task=1 7 | #SBATCH -e logs/sbatchSnakefile_progress_err.log 8 | #SBATCH -o logs/sbatchSnakefile_progress_out.log 9 | 10 | 11 | module purge 12 | module add slurm 13 | source activate hc_test 14 | cd /gpfs/data/ruggleslab/home/lmb529/hypercluster/examples/snakemake_scRNA_example 15 | mkdir -p logs/slurm/ 16 | 17 | snakemake -j 999 -p --verbose \ 18 | -s ../../snakemake/hypercluster.smk \ 19 | --configfile config.yml \ 20 | --keep-going \ 21 | --cluster-config cluster.json \ 22 | --cluster "sbatch --mem={cluster.mem} -t {cluster.time} -o {cluster.output} -p {cluster.partition}" 23 | -------------------------------------------------------------------------------- /hypercluster/__init__.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | import seaborn as sns 3 | import hypercluster 4 | from hypercluster import ( 5 | utilities, additional_clusterers, additional_metrics, classes, constants, visualize 6 | ) 7 | from hypercluster.classes import AutoClusterer, MultiAutoClusterer 8 | __version__ = '0.1.13' 9 | __all__ = [ 10 | "AutoClusterer", 11 | "MultiAutoClusterer" 12 | ] 13 | 14 | matplotlib.rcParams["pdf.fonttype"] = 42 15 | matplotlib.rcParams["ps.fonttype"] = 42 16 | 
sns.set(font="arial", style="white", color_codes=True, font_scale=1.3) 17 | matplotlib.rcParams.update({"savefig.bbox": "tight"}) -------------------------------------------------------------------------------- /hypercluster/additional_clusterers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Additional clustering classes can be added here, as long as they have a 'fit' method. 3 | 4 | 5 | Attributes: 6 | HDBSCAN (clustering class): See `hdbscan`_ 7 | 8 | .. _hdbscan: 9 | https://hdbscan.readthedocs.io/en/latest/basic_hdbscan.html#the-simple-case/ 10 | """ 11 | from typing import Optional, Iterable 12 | import logging 13 | import numpy as np 14 | import pandas as pd 15 | from scipy.spatial.distance import pdist, squareform 16 | from sklearn.decomposition import NMF 17 | from sklearn.neighbors import NearestNeighbors 18 | from hdbscan import HDBSCAN 19 | from .constants import pdist_adjacency_methods, valid_partition_types 20 | import igraph as ig 21 | import louvain 22 | import leidenalg 23 | 24 | 25 | class NMFCluster: 26 | """Uses non-negative matrix factorization (NMF) from sklearn to assign clusters to samples, based on the 27 | maximum membership score of the sample per component. 28 | 29 | Args: 30 | n_clusters: The number of clusters to find. Used as n_components when fitting. 31 | **nmf_kwargs: Additional keyword arguments to pass to sklearn.decomposition.NMF. 32 | """ 33 | def __init__(self, n_clusters: int = 8, **nmf_kwargs): 34 | 35 | nmf_kwargs['n_components'] = n_clusters 36 | 37 | self.NMF = NMF(**nmf_kwargs) 38 | self.n_clusters = n_clusters 39 | 40 | def fit(self, data): 41 | """If negative numbers are present, creates one data matrix with all negative numbers 42 | zeroed and another data matrix with all positive numbers zeroed and the signs of all 43 | negative numbers reversed. Concatenates both matrices, resulting in a data matrix twice as 44 | large as the original but containing only zeros and positive values, hence appropriate for 45 | NMF. Uses the decomposed matrix W, which is n x k (with n = number of samples and k = number of 46 | components), to assign cluster membership. Each sample is assigned to the cluster for 47 | which it has the highest membership score. See `sklearn.decomposition.NMF`_ 48 | 49 | Args: 50 | data (DataFrame): Data to fit with samples as rows and features as columns. 51 | 52 | Returns: 53 | self with labels\_ attribute. 54 | 55 | .. _sklearn.decomposition.NMF: 56 | https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html 57 | """ 58 | 59 | if np.any(data<0): 60 | positive = data.copy() 61 | positive[positive < 0] = 0 62 | negative = data.copy() 63 | negative[negative > 0] = 0 64 | negative = -negative 65 | data = pd.concat([positive, negative], axis=1, join='outer') 66 | 67 | self.labels_ = pd.DataFrame(self.NMF.fit_transform(data)).idxmax(axis=1).values 68 | return self 69 | 70 | 71 | class LouvainCluster: 72 | """Louvain clustering on a graph derived from an adjacency matrix. 73 | 74 | Args: 75 | adjacency_method: Method to use to construct adjacency matrix, which is used to construct \ 76 | graph that will be clustered. Valid methods are any metric valid in \ 77 | scipy.spatial.distance.pdist, or MNN, for mutual nearest neighbors and CNN for common \ 78 | nearest neighbors. Both use sklearn.neighbors.NearestNeighbors at a given k to calculate \ 79 | NNs. MNN then uses whether points i and j are each other's NNs as edge weights. CNN uses \ 80 | the count of how many NNs i and j have in common as the edge weight.
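Any pdist metric instead builds a dense weighted graph directly from the pairwise sample distances.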
81 | k: If using CNN or MNN, k to use to construct the NearestNeighbors matrix. 82 | resolution: If using 'RBConfigurationVertexPartition' or 'CPMVertexPartition', which \ 83 | resolution to use. Other partitioners ignore this, but any other kwargs for \ 84 | those partitioners can be passed too. 85 | adjacency_kwargs: Additional keyword arguments to pass to \ 86 | sklearn.neighbors.NearestNeighbors or scipy.spatial.distance.pdist to construct the \ 87 | adjacency matrix. 88 | partition_type: Which partition to use for louvain clustering, see `louvain-igraph`_ for \ 89 | more info. 90 | **louvain_kwargs: Additional kwargs to be passed to `find_partition`_ 91 | 92 | .. _louvain-igraph: 93 | https://louvain-igraph.readthedocs.io/en/latest/reference.html 94 | .. _find_partition: 95 | https://louvain-igraph.readthedocs.io/en/latest/reference.html#louvain.find_partition 96 | """ 97 | def __init__( 98 | self, 99 | adjacency_method: str = 'MNN', 100 | k: int = 20, 101 | resolution: float = 0.8, 102 | adjacency_kwargs: Optional[dict] = None, 103 | partition_type: str = 'RBConfigurationVertexPartition', 104 | **louvain_kwargs 105 | ): 106 | 107 | if adjacency_method not in ['MNN', 'CNN'] + pdist_adjacency_methods: 108 | raise ValueError( 109 | 'Adjacency method %s invalid. Must be "MNN", "CNN" or a valid metric for ' 110 | 'scipy.spatial.distance.pdist.' % adjacency_method 111 | ) 112 | if partition_type not in valid_partition_types: 113 | raise ValueError( 114 | 'Partition type %s not valid, must be in constants.valid_partition_types' % 115 | partition_type 116 | ) 117 | self.adjacency_method = adjacency_method 118 | self.k = int(k) 119 | self.resolution = resolution 120 | self.adjacency_kwargs = adjacency_kwargs 121 | self.partition_type = partition_type 122 | self.louvain_kwargs = louvain_kwargs 123 | 124 | def fit( 125 | self, 126 | data: pd.DataFrame, 127 | ): 128 | adjacency_method = self.adjacency_method 129 | k = self.k 130 | resolution = self.resolution 131 | adjacency_kwargs = self.adjacency_kwargs 132 | louvain_kwargs = self.louvain_kwargs 133 | partition_type = self.partition_type 134 | if k >= len(data): 135 | logging.warning( 136 | 'k was set to %s, but there are only %s samples. 
Setting k to %s' 137 | % (k, len(data), len(data) - 1) 138 | ) 139 | k = len(data) - 1 140 | if adjacency_kwargs is None: 141 | adjacency_kwargs = {} 142 | if (adjacency_method == 'MNN') | (adjacency_method == 'CNN'): 143 | adjacency_kwargs['n_neighbors'] = adjacency_kwargs.get('n_neighbors', k) 144 | nns = NearestNeighbors(**adjacency_kwargs) 145 | nns.fit(data) 146 | adjacency_mat = nns.kneighbors_graph(data) 147 | if adjacency_method == 'MNN': 148 | adjacency_mat = adjacency_mat.multiply(adjacency_mat.transpose()) 149 | if adjacency_method == 'CNN': 150 | adjacency_mat = adjacency_mat*adjacency_mat.transpose() 151 | elif adjacency_method in pdist_adjacency_methods: 152 | adjacency_mat = squareform(pdist(data, metric=adjacency_method, **adjacency_kwargs)) 153 | 154 | if louvain_kwargs is None: 155 | louvain_kwargs = {} 156 | g = ig.Graph.Weighted_Adjacency(adjacency_mat.toarray().tolist() if hasattr(adjacency_mat, 'toarray') else adjacency_mat.tolist()) 157 | 158 | if partition_type in ['RBConfigurationVertexPartition', 'CPMVertexPartition']: 159 | louvain_kwargs['resolution_parameter'] = resolution 160 | 161 | labels = louvain.find_partition(g, getattr(louvain, partition_type), **louvain_kwargs) 162 | labels = pd.Series({v: i for i in range(len(labels)) for v in labels[i]}).sort_index() 163 | if labels.is_unique or (len(labels.unique()) == 1): 164 | labels = pd.Series([-1 for i in range(len(labels))]) 165 | labels = labels.values 166 | self.labels_ = labels 167 | return self 168 | 169 | 170 | class LeidenCluster: 171 | """Leiden clustering on a graph derived from an adjacency matrix. See `reference`_ for more info 172 | 173 | Args: 174 | adjacency_method: Method to use to construct adjacency matrix, which is used to construct \ 175 | graph that will be clustered. Valid methods are any metric valid in \ 176 | scipy.spatial.distance.pdist, or MNN, for mutual nearest neighbors and CNN for common \ 177 | nearest neighbors. Both use sklearn.neighbors.NearestNeighbors at a given k to calculate \ 178 | NNs. MNN then uses whether points i and j are each other's NNs as edge weights. CNN uses \ 179 | the count of how many NNs i and j have in common as the edge weight. 180 | k: If using CNN or MNN, k to use to construct the NearestNeighbors matrix. 181 | resolution: If using 'RBConfigurationVertexPartition' or 'CPMVertexPartition', which \ 182 | resolution to use. Other partitioners ignore this, but any other kwargs for \ 183 | those partitioners can be passed too. 184 | adjacency_kwargs: Additional keyword arguments to pass to \ 185 | sklearn.neighbors.NearestNeighbors or scipy.spatial.distance.pdist to construct the \ 186 | adjacency matrix. 187 | partition_type: Which partition to use for leiden clustering, see `leidenalg`_ for \ 188 | more info. 189 | **leiden_kwargs: Additional kwargs to be passed to `find_partition`_ 190 | .. _reference: 191 | https://www.nature.com/articles/s41598-019-41695-z 192 | .. _leidenalg: 193 | https://leidenalg.readthedocs.io/en/latest/reference.html 194 | .. 
_find_partition: 195 | https://leidenalg.readthedocs.io/en/latest/reference.html#leidenalg.find_partition 196 | """ 197 | def __init__( 198 | self, 199 | adjacency_method: str = 'MNN', 200 | k: int = 20, 201 | resolution: float = 0.8, 202 | adjacency_kwargs: Optional[dict] = None, 203 | partition_type: str = 'RBConfigurationVertexPartition', 204 | **leiden_kwargs 205 | ): 206 | 207 | self.adjacency_method = adjacency_method 208 | self.k = int(k) 209 | self.resolution = resolution 210 | self.adjacency_kwargs = adjacency_kwargs 211 | self.partition_type = partition_type 212 | self.leiden_kwargs = leiden_kwargs 213 | 214 | def fit( 215 | self, 216 | data: pd.DataFrame, 217 | ): 218 | 219 | adjacency_method = self.adjacency_method 220 | k = self.k 221 | resolution = self.resolution 222 | adjacency_kwargs = self.adjacency_kwargs 223 | leiden_kwargs = self.leiden_kwargs 224 | partition_type = self.partition_type 225 | if k >= len(data): 226 | logging.warning( 227 | 'k was set to %s, but there are only %s samples. Setting k to %s' 228 | % (k, len(data), len(data) - 1) 229 | ) 230 | k = len(data) - 1 231 | if adjacency_kwargs is None: 232 | adjacency_kwargs = {} 233 | if (adjacency_method == 'MNN') | (adjacency_method == 'CNN'): 234 | adjacency_kwargs['n_neighbors'] = adjacency_kwargs.get('n_neighbors', k) 235 | nns = NearestNeighbors(**adjacency_kwargs) 236 | nns.fit(data) 237 | adjacency_mat = nns.kneighbors_graph(data) 238 | if adjacency_method == 'MNN': 239 | adjacency_mat = adjacency_mat.multiply(adjacency_mat.transpose()) 240 | if adjacency_method == 'CNN': 241 | adjacency_mat = adjacency_mat * adjacency_mat.transpose() 242 | elif adjacency_method in pdist_adjacency_methods: 243 | adjacency_mat = squareform(pdist(data, metric=adjacency_method, **adjacency_kwargs)) 244 | 245 | if leiden_kwargs is None: 246 | leiden_kwargs = {} 247 | g = ig.Graph.Weighted_Adjacency(adjacency_mat.toarray().tolist() if hasattr(adjacency_mat, 'toarray') else adjacency_mat.tolist()) 248 | 249 | if partition_type in ['RBConfigurationVertexPartition', 'CPMVertexPartition']: 250 | leiden_kwargs['resolution_parameter'] = resolution 251 | 252 | labels = leidenalg.find_partition(g, getattr(leidenalg, partition_type), **leiden_kwargs) 253 | labels = pd.Series({v:i for i in range(len(labels)) for v in labels[i]}).sort_index() 254 | if labels.is_unique or (len(labels.unique()) == 1): 255 | labels = pd.Series([-1 for i in range(len(labels))]) 256 | labels = labels.values 257 | self.labels_ = labels 258 | return self 259 | -------------------------------------------------------------------------------- /hypercluster/additional_metrics.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, Optional 2 | from collections import Counter 3 | from pandas import DataFrame 4 | from scipy.cluster.hierarchy import linkage, cophenet 5 | from scipy.spatial.distance import pdist 6 | 7 | __doc__ = ( 8 | "More functions for evaluating clustering results. Additional metric evaluations can " 9 | "be added here, as long as the second argument is the labels to evaluate" 10 | ) 11 | 12 | 13 | def number_clustered(_, labels: Iterable) -> int: 14 | """Returns the number of clustered samples. 15 | 16 | Args: 17 | _: Dummy, pass anything or None. 18 | labels (Iterable): Vector of sample labels. 19 | 20 | Returns (int): 21 | The number of clustered samples. 22 | 23 | """ 24 | return (labels != -1).sum() 25 | 26 | 27 | def smallest_largest_clusters_ratio(_, labels: Iterable) -> float: 28 | """Number in the smallest cluster over the number in the largest cluster.
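Samples labeled -1 (i.e., unclustered) are excluded from both counts.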
29 | 30 | Args: 31 | _: Dummy, pass anything or None. 32 | labels (Iterable): Vector of sample labels. 33 | 34 | Returns (float): 35 | Ratio of number of members in smallest over largest cluster. 36 | 37 | """ 38 | counts = Counter(labels) 39 | counts.pop(-1, None) 40 | return min(counts.values()) / max(counts.values()) 41 | 42 | 43 | def smallest_cluster_ratio(_, labels: Iterable) -> float: 44 | """Number in the smallest cluster over the total samples. 45 | 46 | Args: 47 | _: Dummy, pass anything or None. 48 | labels (Iterable): Vector of sample labels. 49 | 50 | Returns (float): 51 | Ratio of number of members in smallest over all samples. 52 | 53 | """ 54 | counts = Counter(labels) 55 | counts.pop(-1, None) 56 | return min(counts.values()) / len(labels) 57 | 58 | 59 | def number_of_clusters(_, labels: Iterable) -> int: 60 | """Number of total clusters. 61 | 62 | Args: 63 | _: Dummy, pass anything or None 64 | labels (Iterable): Vector of sample labels. 65 | 66 | Returns (int): 67 | Number of clusters. 68 | 69 | """ 70 | return len(Counter(labels)) 71 | 72 | 73 | def smallest_cluster_size(_, labels: Iterable) -> int: 74 | """Number in smallest cluster 75 | 76 | Args: 77 | _: Dummy, pass anything or None 78 | labels (Iterable): Vector of sample labels. 79 | 80 | Returns (int): 81 | Number of samples in smallest cluster. 82 | 83 | """ 84 | return min(Counter(labels).values()) 85 | 86 | 87 | def largest_cluster_size(_, labels: Iterable) -> int: 88 | """Number in largest cluster 89 | 90 | Args: 91 | _: Dummy, pass anything or None 92 | labels (Iterable): Vector of sample labels. 93 | 94 | Returns (int): 95 | Number of samples in largest cluster. 96 | 97 | """ 98 | return max(Counter(labels).values()) 99 | -------------------------------------------------------------------------------- /hypercluster/constants.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | __doc__ = """ 5 | Attributes: 6 | param_delim: delimiter between hyperparameters for snakemake file labels and labels DataFrame \ 7 | columns. 8 | val_delim: delimiter between hyperparameter label and value for snakemake file labels and \ 9 | labels DataFrame columns. 10 | categories: Convenient groups of clusterers to use. If all samples need to be clustered, \ 11 | 'partitioning' is a good choice. If there are millions of samples, 'fastest' might be a good \ 12 | choice. 13 | variables_to_optimize: Some default hyperparameters to optimize and value ranges for a \ 14 | selection of commonly used clustering algorithms from sklearn. Used as defaults for \ 15 | clustering.AutoClusterer and clustering.optimize_clustering. 16 | need_ground_truth: list of sklearn metrics that need ground truth labeling. \ 17 | "adjusted_rand_score", "adjusted_mutual_info_score", "homogeneity_score", \ 18 | "completeness_score", "fowlkes_mallows_score", "mutual_info_score", "v_measure_score" 19 | inherent_metrics: list of sklearn metrics that need original data for calculation. \ 20 | "silhouette_score", "calinski_harabasz_score", "davies_bouldin_score", \ 21 | "smallest_largest_clusters_ratio", "number_of_clusters", "smallest_cluster_size", \ 22 | "largest_cluster_size" 23 | min_or_max: establishing whether each sklearn metric is better when minimized or maximized for \ 24 | clustering.pick_best_labels. 
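For example, min_or_max['davies_bouldin_score'] is 'min', since lower Davies-Bouldin scores indicate better clustering, while min_or_max['silhouette_score'] is 'max'.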
25 | """ 26 | param_delim = ";" 27 | val_delim = "-" 28 | 29 | slow = ["AffinityPropagation", "MeanShift"] 30 | fast = ["KMeans", "OPTICS", "HDBSCAN"] 31 | fastest = ["MiniBatchKMeans"] 32 | partitioners = ["AffinityPropagation", "MeanShift", "KMeans", "MiniBatchKMeans"] 33 | clusterers = ["OPTICS", "HDBSCAN"] 34 | categories = { 35 | "slow": slow, 36 | "fast": fast, 37 | "fastest": fastest, 38 | "partitioning": partitioners, 39 | "clustering": clusterers, 40 | } 41 | 42 | min_cluster_size = [i for i in range(2, 17, 2)] 43 | n_clusters = [i for i in range(2, 41)] 44 | damping = [i / 100 for i in range(55, 95, 5)] 45 | resolutions = [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6] 46 | knn = [20, 30, 60] 47 | 48 | 49 | variables_to_optimize = { 50 | "HDBSCAN": dict(min_cluster_size=min_cluster_size), 51 | "KMeans": dict(n_clusters=n_clusters), 52 | "MiniBatchKMeans": dict(n_clusters=n_clusters), 53 | "AffinityPropagation": dict(damping=damping), 54 | "MeanShift": dict(cluster_all=[False]), 55 | "OPTICS": dict(min_samples=min_cluster_size), 56 | "NMFCluster": dict(n_clusters=n_clusters), 57 | "LouvainCluster": dict(resolution=resolutions, k=knn), 58 | "LeidenCluster": dict(resolution=resolutions, k=knn), 59 | } 60 | 61 | 62 | need_ground_truth = [ 63 | "adjusted_rand_score", 64 | "adjusted_mutual_info_score", 65 | "homogeneity_score", 66 | "completeness_score", 67 | "fowlkes_mallows_score", 68 | "mutual_info_score", 69 | "v_measure_score", 70 | ] 71 | 72 | inherent_metrics = [ 73 | "silhouette_score", 74 | "calinski_harabasz_score", 75 | "davies_bouldin_score", 76 | "smallest_largest_clusters_ratio", 77 | "number_of_clusters", 78 | "smallest_cluster_size", 79 | "largest_cluster_size" 80 | ] 81 | 82 | min_or_max = { 83 | "adjusted_rand_score": 'max', 84 | "adjusted_mutual_info_score": 'max', 85 | "homogeneity_score": 'max', 86 | "completeness_score": 'max', 87 | "fowlkes_mallows_score": 'max', 88 | "silhouette_score": 'max', 89 | "calinski_harabasz_score": 'max', 90 | "davies_bouldin_score": 'min', 91 | "mutual_info_score": 'max', 92 | "v_measure_score": 'max', 93 | } 94 | 95 | pdist_adjacency_methods = [ 96 | 'braycurtis', 97 | 'canberra', 98 | 'chebyshev', 99 | 'cityblock', 100 | 'correlation', 101 | 'cosine', 102 | 'dice', 103 | 'euclidean', 104 | 'hamming', 105 | 'jaccard', 106 | 'jensenshannon', 107 | 'kulsinski', 108 | 'mahalanobis', 109 | 'matching', 110 | 'minkowski', 111 | 'rogerstanimoto', 112 | 'russellrao', 113 | 'seuclidean', 114 | 'sokalmichener', 115 | 'sokalsneath', 116 | 'sqeuclidean', 117 | 'yule' 118 | ] 119 | 120 | 121 | valid_partition_types = [ 122 | 'RBConfigurationVertexPartition', 123 | 'ModularityVertexPartition', 124 | 'RBERVertexPartition', 125 | 'CPMVertexPartition', 126 | 'SignificanceVertexPartition', 127 | 'SurpriseVertexPartition' 128 | ] -------------------------------------------------------------------------------- /hypercluster/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/hypercluster/tests/__init__.py -------------------------------------------------------------------------------- /hypercluster/tests/test_clustering.py: -------------------------------------------------------------------------------- 1 | from hypercluster import utilities 2 | from hypercluster.constants import * 3 | import hypercluster 4 | import pandas as pd 5 | import numpy as np 6 | 7 | 8 | test_data = pd.DataFrame( 9 | np.array( 10 | [[1, 2], 
[-1.8, 4], [1, -0.5], 11 | [10, 2], [-10, 4], [10, 0], 12 | [np.nan, 5], [3.2, np.nan], [0, 14], 13 | [-16.4, 3.67], [13.22, -3], [3.3, np.nan], 14 | [42, np.nan], [-8, 2], [1.2, 12], 15 | [np.nan, 2.1], [0.25, np.nan], [0.1, 1.11], 16 | [-44, 0], [-0.22, -0.11], [2.34, 6.7], 17 | [-10, np.nan], [-2.3, -2.5], [np.nan, 0], 18 | [np.nan, 22], [8.6, -7.5], [0, 14], 19 | [-6.4, 23.67], [-3.22, 3], [np.nan, np.nan], 20 | [-20, 2.01], [0.25, -.25], [0.455, 0.233], 21 | [np.nan, -0.89], [19, np.nan], [np.nan, np.nan], 22 | [-29, 3.6], [-13, -3], [3.3, np.nan], 23 | [-4, np.nan], [-0.2, -0.1], [0.34, 0.7]] 24 | ) 25 | ) 26 | 27 | 28 | test_data['ind1'] = 'a' 29 | test_data['ind2'] = range(len(test_data)) 30 | test_data = test_data.set_index(['ind1', 'ind2']) 31 | test_data = test_data.fillna(test_data.median()) 32 | 33 | test_ground_truth = pd.Series( 34 | np.random.randint(0, 2, size=(len(test_data), )), 35 | index=test_data.index 36 | ) 37 | 38 | 39 | def test_cluster_one(): 40 | # Test all clusterers are working with default params 41 | for clus_name in variables_to_optimize.keys(): 42 | utilities.cluster(clus_name, test_data) 43 | 44 | # Test with putting extra params in there 45 | for clus_name in variables_to_optimize.keys(): 46 | vars = variables_to_optimize[clus_name] 47 | key = list(vars.keys())[0] 48 | params = {key: vars[key][0]} 49 | # grabbing a variable and making sure var passing works 50 | utilities.cluster(clus_name, test_data, params) 51 | 52 | 53 | def test_autoclusterer(): 54 | for clus_name in variables_to_optimize.keys(): 55 | hypercluster.AutoClusterer(clus_name).fit(test_data) 56 | for clus_name in variables_to_optimize.keys(): 57 | hypercluster.AutoClusterer(clus_name, random_search=False).fit(test_data) 58 | 59 | 60 | def test_param_weights(): 61 | for clus_name in variables_to_optimize.keys(): 62 | weights = { 63 | param: {value: (1/len(values)) for value in values} for param, values in 64 | variables_to_optimize[ 65 | clus_name 66 | ].items() 67 | } 68 | hypercluster.AutoClusterer(clus_name, param_weights=weights).fit( 69 | test_data 70 | ) 71 | for clus_name in variables_to_optimize.keys(): 72 | hypercluster.AutoClusterer(clus_name, random_search=False).fit(test_data) 73 | 74 | 75 | def test_passing_kwargs_for_a_clusterer(): 76 | clus_name = 'KMeans' 77 | 78 | hypercluster.AutoClusterer(clus_name, clus_kwargs={'max_iter': 50}).fit( 79 | test_data 80 | ) 81 | 82 | 83 | def test_evaluate_results(): 84 | labs = hypercluster.AutoClusterer('KMeans').fit(test_data).labels_ 85 | for metric in inherent_metrics + need_ground_truth: 86 | utilities.evaluate_one( 87 | labs[labs.columns[0]], metric, data=test_data, gold_standard=test_ground_truth 88 | ) 89 | 90 | 91 | def test_multiauto(): 92 | hypercluster.MultiAutoClusterer().fit(test_data).evaluate() 93 | -------------------------------------------------------------------------------- /hypercluster/tests/test_visualize.py: -------------------------------------------------------------------------------- 1 | from hypercluster import visualize 2 | import hypercluster 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | test_data = pd.DataFrame( 8 | np.array( 9 | [[1, 2], [-1.8, 4], [1, -0.5], 10 | [10, 2], [-10, 4], [10, 0], 11 | [np.nan, 5], [3.2, np.nan], [0, 14], 12 | [-16.4, 3.67], [13.22, -3], [3.3, np.nan], 13 | [42, np.nan], [-8, 2], [1.2, 12], 14 | [np.nan, 2.1], [0.25, np.nan], [0.1, 1.11], 15 | [-44, 0], [-0.22, -0.11], [2.34, 6.7], 16 | [-10, np.nan], [-2.3, -2.5], [np.nan, 0], 17 | [np.nan, 22], [8.6, -7.5], [0, 
14], 18 | [-6.4, 23.67], [-3.22, 3], [np.nan, np.nan], 19 | [-20, 2.01], [0.25, -.25], [0.455, 0.233], 20 | [np.nan, -0.89], [19, np.nan], [np.nan, np.nan], 21 | [-29, 3.6], [-13, -3], [3.3, np.nan], 22 | [-4, np.nan], [-0.2, -0.1], [0.34, 0.7]] 23 | ) 24 | ) 25 | 26 | 27 | test_data['ind1'] = 'a' 28 | test_data['ind2'] = range(len(test_data)) 29 | test_data = test_data.set_index(['ind1', 'ind2']) 30 | test_data = test_data.fillna(test_data.median()) 31 | 32 | test_ground_truth = pd.Series( 33 | np.random.randint(0, 2, size=(len(test_data), )), 34 | index=test_data.index 35 | ) 36 | 37 | 38 | def test_vis_eval(): 39 | clusterer = hypercluster.MultiAutoClusterer().fit(test_data).evaluate() 40 | visualize.visualize_evaluations(clusterer.evaluation_df) 41 | clusterer.visualize_evaluations( 42 | # savefig=True 43 | ) 44 | visualize.visualize_for_picking_labels( 45 | clusterer.evaluation_df, savefig_prefix='test_visualize_for_picking' 46 | ) 47 | 48 | clusterer = hypercluster.AutoClusterer().fit(test_data).evaluate() 49 | visualize.visualize_evaluations(clusterer.evaluation_df) 50 | clusterer.visualize_evaluations() 51 | 52 | 53 | def test_vis_sample(): 54 | clusterer = hypercluster.MultiAutoClusterer().fit(test_data).evaluate() 55 | visualize.visualize_sample_label_consistency(clusterer.labels_df) 56 | clusterer.visualize_sample_label_consistency() 57 | 58 | clusterer = hypercluster.AutoClusterer().fit(test_data).evaluate() 59 | visualize.visualize_sample_label_consistency(clusterer.labels_df) 60 | clusterer.visualize_sample_label_consistency() 61 | 62 | 63 | def test_vis_labels(): 64 | clusterer = hypercluster.MultiAutoClusterer().fit(test_data).evaluate() 65 | visualize.visualize_label_agreement(clusterer.labels_df) 66 | clusterer.visualize_label_agreement( 67 | savefig=True, 68 | ) 69 | 70 | clusterer = hypercluster.AutoClusterer().fit(test_data).evaluate() 71 | visualize.visualize_label_agreement(clusterer.labels_df) 72 | clusterer.visualize_label_agreement() -------------------------------------------------------------------------------- /hypercluster/utilities.py: -------------------------------------------------------------------------------- 1 | from sklearn.cluster import * 2 | from sklearn.metrics import * 3 | from .additional_clusterers import * 4 | from .additional_metrics import * 5 | from pandas import DataFrame 6 | import pandas as pd 7 | import numpy as np 8 | import logging 9 | from typing import Optional, Iterable, Dict 10 | from .constants import * 11 | from hypercluster.constants import param_delim, val_delim 12 | 13 | 14 | def calculate_row_weights( 15 | row: Iterable, param_weights: dict, vars_to_optimize: dict 16 | ) -> float: 17 | """Used to select random rows of parameter combinations using individual parameter weights. 18 | 19 | Args: 20 | row (Iterable): Series of parameters, with parameter names as index. 21 | param_weights (dict): Dictionary of str: dictionaries. Ex format - {'parameter_name':{ \ 22 | 'param_option_1':0.5, 'param_option_2':0.5}}. 23 | vars_to_optimize (Iterable): Dictionary with possibilities for different parameters. Ex \ 24 | format - {'parameter_name':[1, 2, 3, 4, 5]}. 25 | 26 | Returns (float): 27 | Float representing the probability of seeing that combination of parameters, given their \ 28 | individual weights. 
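Example (illustrative values only): with vars_to_optimize = {'damping': [0.5, 0.6]} and param_weights = {'damping': {0.5: 0.75}}, the unweighted option 0.6 receives the remaining weight of 0.25, so a row with damping=0.5 returns 0.75.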
29 | 30 | """ 31 | param_weights.update({ 32 | param: { 33 | val: param_weights.get(param, {}).get( 34 | val, (1-sum(param_weights.get(param, {}).values()))/len([ 35 | notweighted for notweighted in vars_to_optimize.get(param, {}) 36 | if notweighted not in param_weights.get(param, {}).keys() 37 | ]) 38 | ) for val in vals 39 | } for param, vals in vars_to_optimize.items() 40 | }) 41 | 42 | return np.prod([param_weights[param][val] for param, val in row.to_dict().items()]) 43 | 44 | 45 | def cluster(clusterer_name: str, data: DataFrame, params: dict = {}): 46 | """Runs a given clusterer with a given set of parameters. 47 | 48 | Args: 49 | clusterer_name (str): String name of clusterer. 50 | data (DataFrame): DataFrame with samples to cluster as the index and features as columns. 51 | params (dict): Dictionary of parameter names and values to feed into clusterer. Default {} 52 | 53 | Returns: 54 | Instance of the clusterer fit with the data provided. 55 | """ 56 | clusterer = eval(clusterer_name)(**params) 57 | return clusterer.fit(data) 58 | 59 | 60 | def evaluate_one( 61 | labels: Iterable, 62 | method: str = "silhouette_score", 63 | data: Optional[DataFrame] = None, 64 | gold_standard: Optional[Iterable] = None, 65 | metric_kwargs: Optional[dict] = None, 66 | ) -> float: 67 | """Uses a given metric to evaluate clustering results. 68 | 69 | Args: 70 | labels (Iterable): Series of labels. 71 | method (str): Name of the evaluation metric to use. Default is silhouette_score. 72 | data (DataFrame): If using an inherent metric, must provide DataFrame with which to \ 73 | calculate the metric. 74 | gold_standard (Iterable): If using a metric that compares to ground truth, must provide a \ 75 | set of gold standard labels. 76 | metric_kwargs (dict): Additional kwargs to use in evaluation. 77 | 78 | Returns (float): 79 | Metric value. 80 | """ 81 | if not isinstance(labels, pd.Series): 82 | labels = pd.Series(labels) 83 | if len(labels[labels != -1].unique()) < 2: 84 | return np.nan 85 | 86 | if metric_kwargs is None: 87 | metric_kwargs = {} 88 | 89 | if method in need_ground_truth: 90 | if gold_standard is None: 91 | raise ValueError( 92 | "Chosen evaluation metric %s requires gold standard set." % method 93 | ) 94 | clustered = (gold_standard != -1) & (labels != -1) 95 | compare_to = gold_standard[clustered] 96 | 97 | elif method in inherent_metrics: 98 | if data is None: 99 | raise ValueError( 100 | "Chosen evaluation metric %s requires data input." % method 101 | ) 102 | clustered = labels != -1 103 | compare_to = data.loc[clustered] 104 | else: 105 | compare_to = None 106 | clustered = labels.index 107 | 108 | return eval(method)(compare_to, labels[clustered], **metric_kwargs) 109 | 110 | 111 | def generate_flattened_df(df_dict: Dict[str, DataFrame]) -> DataFrame: 112 | """Takes a dictionary of results from many clusterers and makes one DataFrame. Opposite of \ 113 | convert_to_multiind. 114 | 115 | Args: 116 | df_dict (Dict[str, DataFrame]): Dictionary of dataframes to flatten. Can be .labels_ or \ 117 | .evaluations_ from MultiAutoClusterer. 118 | 119 | Returns: 120 | Flattened DataFrame with all data. 
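Column names collapse each clusterer and its parameters into a single string with param_delim and val_delim, e.g. a KMeans run with n_clusters=3 becomes the column 'KMeans;n_clusters-3'.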
121 | """ 122 | merged_df = pd.DataFrame() 123 | for clus_name, df in df_dict.items(): 124 | df = df.transpose() 125 | cols_for_labels = df.index.to_frame() 126 | inds = cols_for_labels.apply( 127 | lambda row: param_delim.join( 128 | [clus_name] + ["%s%s%s" % (k, val_delim, v) for k, v in row.to_dict().items()] 129 | ), 130 | axis=1, 131 | ) 132 | df.index = inds 133 | df = df.transpose() 134 | 135 | merged_df = pd.concat( 136 | [merged_df, df], join="outer", axis=1 137 | ) 138 | return merged_df 139 | 140 | 141 | def convert_to_multiind(key: str, df: DataFrame) -> DataFrame: 142 | """Takes columns from a single clusterer from Clusterer.labels_df or .evaluation_df and 143 | converts to a multiindexed rather than collapsed into string. Equivalent to grabbing 144 | Clusterer.labels[clusterer] or .evaluations[clusterer]. Opposite of generate_flattened_df. 145 | 146 | Args: 147 | key (str): Name of clusterer, must match beginning of columns to convert. 148 | df (DataFrame): Dataframe to grab chunk from. 149 | 150 | Returns: 151 | Subset DataFrame with multiindex. 152 | 153 | """ 154 | clus_cols = [col for col in df.columns if col.split(param_delim, 1)[0] == key] 155 | temp = df[clus_cols].transpose() 156 | temp.index = pd.MultiIndex.from_frame( 157 | pd.DataFrame([{ 158 | s.split(val_delim, 1)[0]: s.split(val_delim, 1)[1] for s in i.split(param_delim)[1:] 159 | } for i in temp.index]).astype(float, errors='ignore') 160 | ) 161 | return temp.sort_index().transpose() 162 | 163 | 164 | def pick_best_labels( 165 | evaluation_results_df: DataFrame, 166 | clustering_labels_df: DataFrame, 167 | method: Optional[str] = None, 168 | min_or_max: Optional[str] = None 169 | ) -> Iterable: 170 | """From evaluations and a metric to minimize or maximize, return all labels with top pick. 171 | 172 | Args: 173 | evaluation_results_df (DataFrame): Evaluations DataFrame from optimize_clustering. 174 | clustering_labels_df (DataFrame): Labels DataFrame from optimize_clustering. 175 | method (str): Method with which to choose the best labels. 176 | min_or_max (str): Whether to minimize or maximize the metric. Must be 'min' or 'max'. 177 | Returns (DataFrame): 178 | DataFrame of all top labels. 
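Example (a minimal sketch; assumes evals and labels are the evaluation and label DataFrames from optimize_clustering): best = pick_best_labels(evals, labels, method='davies_bouldin_score', min_or_max='min')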
179 | """ 180 | if method is None: 181 | method = "silhouette_score" 182 | if min_or_max is None: 183 | min_or_max = 'max' 184 | 185 | best_labels = evaluation_results_df.loc[method, :] 186 | if min_or_max == 'min': 187 | best_labels = best_labels.index[best_labels == best_labels.min()] 188 | return clustering_labels_df[best_labels] 189 | elif min_or_max == 'max': 190 | best_labels = best_labels.index[best_labels == best_labels.max()] 191 | return clustering_labels_df[best_labels] 192 | logging.error("min_or_max must be either 'min' or 'max'; %s is an invalid choice" % min_or_max) 193 | 194 | 195 | -------------------------------------------------------------------------------- /hypercluster/visualize.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | import logging 3 | from collections import Counter 4 | from itertools import cycle 5 | import numpy as np 6 | import matplotlib 7 | import matplotlib.pyplot as plt 8 | import seaborn as sns 9 | from pandas import DataFrame 10 | from scipy.cluster import hierarchy 11 | from scipy.spatial.distance import pdist 12 | from hypercluster.constants import param_delim 13 | from hypercluster.utilities import convert_to_multiind, evaluate_one 14 | 15 | matplotlib.rcParams["pdf.fonttype"] = 42 16 | matplotlib.rcParams["ps.fonttype"] = 42 17 | sns.set(font="arial", style="white", color_codes=True, font_scale=1.3) 18 | matplotlib.rcParams.update({"savefig.bbox": "tight"}) 19 | cmap = sns.cubehelix_palette( 20 | start=0, 21 | rot=0.4, 22 | gamma=1.0, 23 | hue=0.82, 24 | light=1, 25 | dark=0, 26 | reverse=False, 27 | as_cmap=True 28 | ) 29 | cmap.set_over('black') 30 | cmap.set_under('white') 31 | cmap.set_bad("#DAE0E6") 32 | 33 | 34 | def zscore(df): 35 | """Row z-scores a DataFrame, ignoring np.nan. 36 | 37 | Args: 38 | df (DataFrame): DataFrame to z-score. 39 | 40 | Returns (DataFrame): 41 | Row-zscored DataFrame. 42 | """ 43 | return df.subtract(df.mean(axis=1), axis=0).divide(df.std(axis=1), axis=0) 44 | 45 | 46 | def compute_order( 47 | df, 48 | dist_method: str = "euclidean", 49 | cluster_method: str = "average" 50 | ): 51 | """Gives the hierarchical clustering order for the rows of a DataFrame. 52 | 53 | Args: 54 | df (DataFrame): DataFrame with rows to order. 55 | dist_method (str): Distance metric to pass to scipy.spatial.distance.pdist. 56 | cluster_method (str): Linkage method to pass to scipy.cluster.hierarchy.linkage. 57 | 58 | Returns (pandas.Index): 59 | Ordered row index. 60 | 61 | """ 62 | dist_mat = pdist(df, metric=dist_method) 63 | link_mat = hierarchy.linkage(dist_mat, method=cluster_method) 64 | 65 | return df.index[hierarchy.leaves_list(hierarchy.optimal_leaf_ordering(link_mat, dist_mat))] 66 | 67 | 68 | def visualize_evaluations( 69 | evaluations_df: DataFrame, 70 | savefig: bool = False, 71 | output_prefix: str = "evaluations", 72 | **heatmap_kws 73 | ) -> List[matplotlib.axes.Axes]: 74 | """Makes a z-scored visualization of all evaluations. 75 | 76 | Args: 77 | evaluations_df (DataFrame): Evaluations dataframe from clustering.optimize_clustering. 78 | output_prefix (str): If saving a figure, file prefix to use. 79 | savefig (bool): Whether to save a pdf. 80 | **heatmap_kws: Additional keyword arguments to pass to seaborn.heatmap. 81 | 82 | Returns (List[matplotlib.axes.Axes]): 83 | List of all matplotlib axes. 
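Example (a minimal sketch; assumes clusterer is a fitted and evaluated AutoClusterer): visualize_evaluations(clusterer.evaluation_df, savefig=True, output_prefix='my_evals')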
84 | 85 | """ 86 | clusterers = sorted( 87 | list(set([i.split(param_delim, 1)[0] for i in evaluations_df.columns])) 88 | ) 89 | width_ratios = [ 90 | dict( 91 | Counter( 92 | [i.split(param_delim, 1)[0] for i in evaluations_df.columns] 93 | ) 94 | )[clus] 95 | for clus in clusterers 96 | ] 97 | 98 | evaluations_df = zscore(evaluations_df) 99 | width = 0.18 * (len(evaluations_df.columns) + 2 + (0.01 * (len(clusterers) - 1))) 100 | height = 0.22 * (len(evaluations_df)) 101 | 102 | fig, axs = plt.subplots( 103 | figsize=(width, height), 104 | nrows=1, 105 | ncols=(len(clusterers) + 1), 106 | gridspec_kw=dict( 107 | width_ratios=width_ratios + [2], 108 | wspace=0.01, 109 | left=0, 110 | right=1, 111 | top=1, 112 | bottom=0, 113 | ), 114 | ) 115 | vmin = np.nanquantile(evaluations_df, 0.1) 116 | vmax = np.nanquantile(evaluations_df, 0.9) 117 | 118 | heatmap_kws['cmap'] = heatmap_kws.get('cmap', cmap) 119 | heatmap_kws['vmin'] = heatmap_kws.get('vmin', vmin) 120 | heatmap_kws['vmax'] = heatmap_kws.get('vmax', vmax) 121 | 122 | for i, clus in enumerate(clusterers): 123 | temp = convert_to_multiind(clus, evaluations_df) 124 | 125 | ax = axs[i] 126 | sns.heatmap( 127 | temp, 128 | ax=ax, 129 | yticklabels=temp.index, 130 | xticklabels=["-".join([str(i) for i in col]) for col in temp.columns], 131 | cbar_ax=axs[-1], 132 | cbar_kws=dict(label="z-score"), 133 | **heatmap_kws 134 | ) 135 | ax.set_ylabel("") 136 | ax.set_title(clus) 137 | ax.set_yticklabels([]) 138 | 139 | axs[0].set_ylabel("evaluation method") 140 | axs[0].set_yticklabels(temp.index, rotation=0) 141 | if savefig: 142 | plt.savefig("%s.pdf" % output_prefix) 143 | return axs 144 | 145 | 146 | def visualize_pairwise( 147 | df: DataFrame, 148 | savefig: bool = False, 149 | output_prefix: Optional[str] = None, 150 | method: Optional[str] = None, 151 | **heatmap_kws 152 | ) -> List[matplotlib.axes.Axes]: 153 | """Visualize symmetrical square DataFrames. 154 | 155 | Args: 156 | df (DataFrame): DataFrame to visualize. 157 | savefig (bool): Whether to save a pdf. 158 | output_prefix (str): If saving a pdf, file prefix to use. 159 | method (str): Label for cbar, if relevant. 160 | **heatmap_kws: Additional keywords to pass to `seaborn.heatmap`_ 161 | 162 | Returns (List[matplotlib.axes.Axes]): 163 | List of matplotlib axes for figure. 164 | 165 | .. 
_seaborn.heatmap: 166 | https://seaborn.pydata.org/generated/seaborn.heatmap.html 167 | """ 168 | heatmap_kws = {**heatmap_kws} 169 | 170 | vmin = np.nanquantile(df, 0.1) 171 | vmax = np.nanquantile(df, 0.9) 172 | 173 | heatmap_kws['cmap'] = heatmap_kws.get('cmap', cmap) 174 | heatmap_kws['vmin'] = heatmap_kws.get('vmin', vmin) 175 | heatmap_kws['vmax'] = heatmap_kws.get('vmax', vmax) 176 | cbar_kws = heatmap_kws.get('cbar_kws', {}) 177 | cbar_kws['label'] = cbar_kws.get('label', method) 178 | heatmap_kws['cbar_kws'] = cbar_kws 179 | 180 | cbar_ratio = 2 181 | wspace = 0.01 182 | height = 0.18 * len(df) 183 | width = 0.18 * (len(df.columns)+cbar_ratio+wspace) 184 | fig, axs = plt.subplots( 185 | figsize=(width, height), 186 | nrows=1, 187 | ncols=2, 188 | gridspec_kw=dict( 189 | width_ratios=[len(df.columns), cbar_ratio], 190 | wspace=wspace, 191 | left=0, 192 | right=1, 193 | top=1, 194 | bottom=0, 195 | ) 196 | ) 197 | try: 198 | order = compute_order(df.fillna(df.median())) 199 | except ValueError: 200 | order = df.index 201 | df = df.loc[order, order] 202 | sns.heatmap( 203 | df, 204 | xticklabels=order, 205 | yticklabels=order, 206 | ax=axs[0], 207 | cbar_ax=axs[1], 208 | **heatmap_kws 209 | ) 210 | if savefig: 211 | if output_prefix is None: 212 | output_prefix = "heatmap.pairwise" 213 | plt.savefig('%s.pdf' % output_prefix) 214 | 215 | return axs 216 | 217 | 218 | def visualize_label_agreement( 219 | labels: DataFrame, 220 | method: Optional[str] = None, 221 | savefig: bool = False, 222 | output_prefix: Optional[str] = None, 223 | **heatmap_kws 224 | ) -> List[matplotlib.axes.Axes]: 225 | """Visualize similarity between clustering results given an evaluation metric. 226 | 227 | Args: 228 | labels (DataFrame): Labels DataFrame, e.g. from optimize_clustering or \ 229 | AutoClusterer.labels_ 230 | method (str): Method with which to compare labels. Must be a metric like the ones in \ 231 | constants.need_ground_truth, which takes two sets of labels. 232 | savefig (bool): Whether to save a pdf. 233 | output_prefix (str): If saving a pdf, file prefix to use. 234 | **heatmap_kws: Additional keywords to pass to `seaborn.heatmap`_ 235 | 236 | Returns (List[matplotlib.axes.Axes]): 237 | List of matplotlib axes 238 | 239 | .. _seaborn.heatmap: 240 | https://seaborn.pydata.org/generated/seaborn.heatmap.html 241 | """ 242 | if savefig and output_prefix is None: 243 | output_prefix = 'heatmap.labels.pairwise' 244 | if method is None: 245 | method = 'adjusted_rand_score' 246 | 247 | labels = labels.astype(float).corr( 248 | lambda x, y: evaluate_one(x, method=method, gold_standard=y) 249 | ) 250 | return visualize_pairwise(labels, savefig, output_prefix, method=method, **heatmap_kws) 251 | 252 | 253 | def visualize_sample_label_consistency( 254 | labels: DataFrame, 255 | savefig: bool = False, 256 | output_prefix: Optional[str] = None, 257 | **heatmap_kws 258 | ) -> List[matplotlib.axes.Axes]: 259 | """Visualize how often two samples are labeled in the same group across conditions. Interpret 260 | with care: if some clusterers contribute more conditions than others (e.g. many n_clusters 261 | values for KMeans), their labelings will agree with each other more than with other clusterers'. 262 | High agreement can therefore reflect the mix of conditions tested rather than true 263 | similarity between samples. 264 | 265 | Args: 266 | labels (DataFrame): Labels DataFrame, e.g. from optimize_clustering or \ 267 | AutoClusterer.labels_ 268 | savefig (bool): Whether to save a pdf. 
269 | output_prefix (str): If saving a pdf, file prefix to use. 270 | **heatmap_kws: Additional keywords to pass to `seaborn.heatmap`_ 271 | 272 | Returns (List[matplotlib.axes.Axes]): 273 | List of matplotlib axes 274 | 275 | .. _seaborn.heatmap: 276 | https://seaborn.pydata.org/generated/seaborn.heatmap.html 277 | 278 | """ 279 | if savefig and output_prefix is None: 280 | output_prefix = "heatmap.sample.pairwise" 281 | #TODO change this to much faster matmult 282 | labels = labels.transpose().astype(float).corr(lambda x, y: sum( 283 | np.equal(x[((x != -1) | (y != -1))], y[((x != -1) | (y != -1))]) 284 | )) 285 | return visualize_pairwise(labels, savefig, output_prefix, method='# same label', **heatmap_kws) 286 | 287 | 288 | def visualize_for_picking_labels( 289 | evaluation_df: DataFrame, 290 | method: Optional[str] = None, 291 | savefig_prefix: Optional[str] = None 292 | ): 293 | """Generates graphs similar to a `scree graph`_ for PCA, for each parameter and each clusterer. 294 | 295 | Args: 296 | evaluation_df (DataFrame): DataFrame of evaluations to visualize. Clusterer.evaluation_df. 297 | method (str): Which metric to visualize. 298 | savefig_prefix (str): If not None, save a figure with the given prefix. 299 | 300 | Returns: 301 | matplotlib axes. 302 | .. _scree graph: 303 | https://en.wikipedia.org/wiki/Scree_plot 304 | """ 305 | if method is None: 306 | method = "silhouette_score" 307 | cluss_temp = list(set([i.split(param_delim, 1)[0] for i in evaluation_df.columns])) 308 | # get figure dimensions 309 | ncols = 0 310 | cluss = [] 311 | for ploti, clus in enumerate(cluss_temp): 312 | scores = convert_to_multiind( 313 | clus, evaluation_df.loc[[method], :] 314 | ).transpose().dropna(how='any') 315 | if len(scores) == 0: 316 | logging.error( 317 | 'Score %s is missing for clusterer %s, skipping visualization' % (method, clus) 318 | ) 319 | continue 320 | indep = scores.index.to_frame().reset_index(drop=True) 321 | try: 322 | indep.astype(float) 323 | except (ValueError, AssertionError): 324 | logging.error('Cannot convert %s data to floats, skipping visualization' % clus) 325 | continue 326 | cluss.append(clus) 327 | if scores.index.nlevels > ncols: 328 | ncols = scores.index.nlevels 329 | if not cluss: 330 | logging.error('No valid clusterers, cannot visualize. 
') 331 | return None 332 | cluss.sort() 333 | 334 | ybuff = np.abs(np.nanquantile(evaluation_df.loc[method], 0.05)) 335 | ylim = (evaluation_df.loc[method].min() - ybuff, evaluation_df.loc[method].max() + ybuff) 336 | colors = cycle(sns.color_palette('twilight', n_colors=len(cluss) * ncols)) 337 | fig = plt.figure(figsize=(5 * (ncols), 5 * len(cluss))) 338 | gs = plt.GridSpec(nrows=len(cluss), ncols=ncols, wspace=0.25, hspace=0.25) 339 | for ploti, clus in enumerate(cluss): 340 | scores = convert_to_multiind( 341 | clus, evaluation_df.loc[[method], :] 342 | ).transpose().dropna(how='any') 343 | indep = scores.index.to_frame().reset_index(drop=True) 344 | 345 | for whcol, col in enumerate(indep.columns): 346 | if whcol == 0: 347 | saveax = plt.subplot(gs[ploti, whcol]) 348 | ax = saveax 349 | ax.set_ylim(ylim) 350 | ax.set_ylabel(clus) 351 | else: 352 | ax = plt.subplot(gs[ploti, whcol], sharey=saveax) 353 | color = next(colors) 354 | 355 | # plot eval results; note that logistic=True assumes scores bounded in [0, 1] 356 | sns.regplot( 357 | indep[col], 358 | scores[method].values, 359 | color=color, 360 | ax=ax, 361 | logistic=True, 362 | ) 363 | 364 | axs = fig.get_axes() 365 | axs[0].set_title('%s results per parameter' % method, ha='left') 366 | if savefig_prefix: 367 | plt.savefig('%s.pdf' % savefig_prefix) 368 | return axs 369 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | files = ['snakemake/hypercluster.smk', 'snakemake/config.yml'] 6 | setuptools.setup( 7 | name="hypercluster", 8 | version="0.1.13", 9 | author="Lili Blumenberg, Ruggles Lab", 10 | author_email="lili.blumenberg@gmail.com", 11 | description="A package for automatic clustering hyperparameter optimization", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/liliblu/hypercluster", 15 | classifiers=[ 16 | "Programming Language :: Python :: 3.7", 17 | "License :: OSI Approved :: MIT License", 18 | "Operating System :: MacOS", 19 | "Operating System :: Unix", 20 | ], 21 | install_requires=[ 22 | "pandas >= 0.24.2", 23 | "numpy >= 1.16.4", 24 | "scipy >= 1.2.1", 25 | "matplotlib >= 3.1.0", 26 | "seaborn >= 0.9.0", 27 | "scikit-learn >= 0.22.0", 28 | "hdbscan >= 0.8.24", 29 | "snakemake >= 5.8.2", 30 | "python-igraph >=0.7.1", 31 | "leidenalg >=0.7.0", 32 | "louvain >=0.6.1" 33 | ], 34 | package_data={"hypercluster": files}, 35 | packages=setuptools.find_packages() 36 | ) 37 | -------------------------------------------------------------------------------- /snakemake/cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "__default__": { 3 | "job-name": "snakemake", 4 | "time": "4-23:59:59", 5 | "mem": "2G", 6 | "partition": "fn_medium", 7 | "cpus-per-task": 1, 8 | "output": "logs/slurm/%j.out", 9 | "error": "logs/slurm/%j.err" 10 | }, 11 | "run_clusterer": { 12 | "job-name": "snakemakerunclusterer", 13 | "time": "4-23:59:59", 14 | "mem": "32G", 15 | "partition": "fn_medium", 16 | "cpus-per-task": 4, 17 | "output": "logs/slurm/%j.out", 18 | "error": "logs/slurm/%j.err" 19 | }, 20 | "run_evaluation": { 21 | "job-name": "snakeevaluate", 22 | "time": "4-23:59:59", 23 | "mem": "4G", 24 | "partition": "fn_medium", 25 | "cpus-per-task": 1, 26 | "output": "logs/slurm/%j.out", 27 | "error": "logs/slurm/%j.err" 28 | } 29 | } 30 | 
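A note on the flattened column convention used throughout: `convert_to_multiind`, `pick_best_labels`, and the visualize functions above all expect column names of the form `<clusterer><param_delim><param><val_delim><value>`, where `param_delim` and `val_delim` are defined in `hypercluster/constants.py` (not shown in this section). The following is a minimal sketch of how those pieces fit together, using hand-made toy scores and labels; all values below are illustrative only, not taken from the repo.

```python
import pandas as pd

from hypercluster.constants import param_delim, val_delim
from hypercluster.utilities import convert_to_multiind, pick_best_labels

# Two toy KMeans conditions, flattened the same way the pipeline writes them:
# "<clusterer><param_delim>n_clusters<val_delim><value>"
cols = [
    param_delim.join(["KMeans", "n_clusters%s%s" % (val_delim, k)]) for k in (2, 3)
]

# Toy evaluations: one row per metric, one column per condition.
evals = pd.DataFrame([[0.41, 0.55]], index=["silhouette_score"], columns=cols)

# Toy labels: one row per sample, one column per condition.
labels = pd.DataFrame([[0, 0], [1, 1], [1, 2]], index=["a", "b", "c"], columns=cols)

# Keep the label column(s) with the maximal silhouette score.
best = pick_best_labels(evals, labels, method="silhouette_score", min_or_max="max")
print(best.columns.tolist())

# Re-expand the flattened column names into a parameter MultiIndex,
# the format the heatmap and scree-plot functions work with.
print(convert_to_multiind("KMeans", evals))
```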
-------------------------------------------------------------------------------- /snakemake/config.yml: -------------------------------------------------------------------------------- 1 | input_data_folder: '.' 2 | input_data_files: 'test_input' 3 | gold_standards: 4 | test_input: '' 5 | read_csv_kwargs: 6 | test_input: {'index_col':[0]} 7 | 8 | output_folder: 'results' 9 | intermediates_folder: 'clustering_intermediates' 10 | clustering_results: 'clustering' 11 | 12 | clusterer_kwargs: {} 13 | generate_parameters_addtl_kwargs: {} 14 | 15 | evaluations: 16 | # - adjusted_rand_score 17 | # - adjusted_mutual_info_score 18 | # - homogeneity_score 19 | # - completeness_score 20 | # - fowlkes_mallows_score 21 | # - mutual_info_score 22 | # - v_measure_score 23 | - silhouette_score 24 | - calinski_harabasz_score 25 | - davies_bouldin_score 26 | - number_clustered 27 | - smallest_largest_clusters_ratio 28 | - smallest_cluster_ratio 29 | eval_kwargs: {} 30 | screeplot_evals: 31 | # - adjusted_rand_score 32 | # - adjusted_mutual_info_score 33 | # - homogeneity_score 34 | # - completeness_score 35 | # - fowlkes_mallows_score 36 | # - mutual_info_score 37 | # - v_measure_score 38 | - silhouette_score 39 | - calinski_harabasz_score 40 | - davies_bouldin_score 41 | - number_clustered 42 | - smallest_largest_clusters_ratio 43 | - smallest_cluster_ratio 44 | 45 | metric_to_choose_best: silhouette_score 46 | metric_to_compare_labels: adjusted_rand_score 47 | compare_samples: true 48 | 49 | output_kwargs: 50 | evaluations: 51 | index_col: [0] 52 | labels: 53 | index_col: [0] 54 | heatmap_kwargs: {} 55 | 56 | optimization_parameters: 57 | AffinityPropagation: 58 | damping: 59 | - 0.55 60 | - 0.6 61 | - 0.65 62 | - 0.7 63 | - 0.75 64 | - 0.8 65 | - 0.85 66 | - 0.9 67 | - 0.95 68 | HDBSCAN: 69 | min_cluster_size: &id002 70 | - 2 71 | - 3 72 | - 4 73 | - 5 74 | - 6 75 | - 7 76 | - 8 77 | - 9 78 | - 10 79 | - 11 80 | - 12 81 | - 13 82 | - 14 83 | - 15 84 | - 16 85 | KMeans: 86 | n_clusters: &id001 87 | - 2 88 | - 3 89 | - 4 90 | - 5 91 | - 6 92 | - 7 93 | - 8 94 | - 9 95 | - 10 96 | - 11 97 | - 12 98 | - 13 99 | - 14 100 | - 15 101 | - 16 102 | - 17 103 | - 18 104 | - 19 105 | - 20 106 | - 21 107 | - 22 108 | - 23 109 | - 24 110 | - 25 111 | - 26 112 | - 27 113 | - 28 114 | - 29 115 | - 30 116 | - 31 117 | - 32 118 | - 33 119 | - 34 120 | - 35 121 | - 36 122 | - 37 123 | - 38 124 | - 39 125 | - 40 126 | MiniBatchKMeans: 127 | n_clusters: *id001 128 | OPTICS: 129 | min_samples: *id002 130 | NMFCluster: 131 | n_clusters: *id001 132 | LouvainCluster: &id003 133 | resolution: 134 | - 0.2 135 | - 0.4 136 | - 0.6 137 | - 0.8 138 | - 1.0 139 | - 1.2 140 | - 1.4 141 | - 1.6 142 | k: 143 | - 10 144 | - 15 145 | - 20 146 | - 40 147 | - 80 148 | - 120 149 | LeidenCluster: *id003 -------------------------------------------------------------------------------- /snakemake/hypercluster.smk: -------------------------------------------------------------------------------- 1 | import pandas as pd,numpy as np 2 | from hypercluster import utilities, visualize 3 | import hypercluster 4 | from hypercluster.constants import param_delim, val_delim 5 | import os, subprocess 6 | from shutil import copyfile 7 | import yaml 8 | 9 | subprocess.run(['mkdir', '-p', 'logs']) 10 | targets = ['labels', 'evaluations'] 11 | 12 | input_data_folder = config['input_data_folder'] 13 | input_files = config['input_data_files'] 14 | 15 | output_folder = config['output_folder'] 16 | subprocess.run(['mkdir', '-p', output_folder]) 17 | 18 | 
intermediates_folder = config['intermediates_folder'] 19 | clustering_results = config['clustering_results'] 20 | 21 | 22 | def generate_parameters(config): 23 | parameters = config['optimization_parameters'] 24 | all_params_to_test = [] 25 | for clusterer, params in parameters.items(): 26 | clus_kwargs = config['clusterer_kwargs'].get(clusterer, {}) 27 | kwargs = config['generate_parameters_addtl_kwargs'].get(clusterer, {}) 28 | df = hypercluster.AutoClusterer( 29 | clusterer_name=clusterer, 30 | params_to_optimize=params, 31 | clus_kwargs=clus_kwargs, 32 | **kwargs 33 | ).param_sets 34 | df['clusterer'] = clusterer 35 | all_params_to_test.extend(df.to_dict('records')) 36 | #TODO why is random search not working? getting key not found errors 37 | final_param_sets = {} 38 | for param_set in all_params_to_test: 39 | clusterer = param_set['clusterer'] 40 | lab = param_delim.join([clusterer]+[ 41 | '%s%s%s' % (k, val_delim, v) for k, v in param_set.items() if k != 'clusterer' 42 | ]) 43 | final_param_sets.update({lab:param_set}) 44 | config['param_sets'] = final_param_sets 45 | config['param_sets_labels'] = list(final_param_sets.keys()) 46 | 47 | with open('%s/params_to_test.yml' % output_folder, 'w') as fh: 48 | yaml.dump(final_param_sets, fh) 49 | 50 | 51 | def handle_ext(wildcards): 52 | base = wildcards.input_file 53 | files = [] 54 | for file_ext in [".csv", ".tsv", ".txt"]: 55 | file = '%s/%s%s' % (input_data_folder, base, file_ext) 56 | if os.path.exists(file): 57 | files.append(file) 58 | if len(files) == 1: 59 | return files[0] 60 | if len(files) > 1: 61 | raise ValueError( 62 | 'Multiple files with prefix %s/%s can be found, must be unique' % (input_data_folder, base) 63 | ) 64 | raise FileNotFoundError( 65 | 'No .txt, .csv or .tsv files with prefix %s/%s can be found' % (input_data_folder, base) 66 | ) 67 | 68 | 69 | def concat_dfs(df_list, kwargs): 70 | results = pd.DataFrame() 71 | for fil in df_list: 72 | temp = pd.read_csv(fil, **kwargs) 73 | results = pd.concat([results, temp], join='outer', axis=1) 74 | return results 75 | 76 | 77 | def get_target_files(config): 78 | target_files = expand( 79 | '%s/{input_file}/%s/{targets}.txt' % (output_folder, clustering_results), 80 | input_file=input_files, 81 | targets=targets 82 | ) + expand( 83 | 84 | "%s/{input_file}/%s/{labs}_{targets}.txt" % (output_folder, intermediates_folder), 85 | input_file=input_files, 86 | labs=config["param_sets_labels"], 87 | targets=targets 88 | ) + expand( 89 | '%s/{input_file}/%s/evaluations.pdf' % (output_folder, clustering_results), 90 | input_file=input_files 91 | ) 92 | 93 | if config['metric_to_choose_best']: 94 | target_files.extend( 95 | expand( 96 | "%s/{input_file}/%s/best_parameters.txt" % (output_folder, clustering_results), 97 | input_file=input_files 98 | ) 99 | ) 100 | if config['metric_to_compare_labels']: 101 | target_files.extend( 102 | expand( 103 | '%s/{input_file}/%s/%s_label_comparison.txt' % ( 104 | output_folder, clustering_results, config['metric_to_compare_labels'] 105 | ), 106 | input_file=input_files 107 | ) 108 | ) 109 | if config['compare_samples']: 110 | target_files.extend( 111 | expand( 112 | '%s/{input_file}/%s/sample_label_agreement.txt' % (output_folder, clustering_results), 113 | input_file=input_files 114 | ) 115 | ) 116 | if config['screeplot_evals']: 117 | target_files.extend( 118 | expand( 119 | '%s/{input_file}/%s/scree_plots.{eval}.pdf' % (output_folder, clustering_results), 120 | input_file=input_files, 121 | eval=config['screeplot_evals'] 122 | ) 123 | ) 124 | 
125 | return target_files 126 | 127 | 128 | generate_parameters(config) 129 | files_to_generate = get_target_files(config) 130 | 131 | rule all: 132 | input: 133 | files_to_generate 134 | 135 | 136 | rule run_clusterer: 137 | input: 138 | infile = handle_ext 139 | output: 140 | "%s/{input_file}/%s/{labs}_labels.txt" % (output_folder, intermediates_folder) 141 | params: 142 | kwargs = lambda wildcards: config["param_sets"][wildcards.labs], 143 | readkwargs = lambda wildcards: config['read_csv_kwargs'].get(wildcards.input_file, {}), 144 | cluskwargs = config['clusterer_kwargs'] 145 | run: 146 | df = pd.read_csv(input.infile, **params.readkwargs) 147 | kwargs = params.kwargs 148 | clusterer = kwargs.pop('clusterer') 149 | 150 | kwargs.update(params.cluskwargs.get(clusterer, {})) 151 | print(kwargs) 152 | cls = utilities.cluster(clusterer, df, kwargs) 153 | 154 | labs = pd.DataFrame(cls.labels_, index=df.index, columns=[wildcards.labs]) 155 | labs.to_csv(output[0], sep = params.readkwargs.get('sep', ',')) 156 | 157 | 158 | rule run_evaluation: 159 | input: 160 | "%s/{input_file}/%s/{labs}_labels.txt" % (output_folder, intermediates_folder) 161 | output: 162 | "%s/{input_file}/%s/{labs}_evaluations.txt" % (output_folder, intermediates_folder) 163 | params: 164 | gold_standards = lambda wildcards: config['gold_standards'].get(wildcards.input_file, ''), 165 | input_data = handle_ext, 166 | readkwargs = lambda wildcards: config['read_csv_kwargs'].get(wildcards.input_file, {}), 167 | evals = config["evaluations"], 168 | evalkwargs = config["eval_kwargs"] 169 | run: 170 | readkwargs = { 171 | 'index_col':params.readkwargs.get('index_col', 0), 172 | 'sep':params.readkwargs.get('sep', ',') 173 | } 174 | test_labels = pd.read_csv(input[0], **readkwargs) 175 | if params.gold_standards and os.path.exists('%s/%s' % (input_data_folder, params.gold_standards)): 176 | gold_standard = pd.read_csv( 177 | '%s/%s' % (input_data_folder, params.gold_standards), 178 | **readkwargs 179 | ) 180 | gold_standard = gold_standard[gold_standard.columns[0]] 181 | else: 182 | gold_standard = None 183 | 184 | data = pd.read_csv(params.input_data, **readkwargs) 185 | res = pd.DataFrame({'methods':params.evals}) 186 | 187 | res[wildcards.labs] = res.apply( 188 | lambda row: utilities.evaluate_one( 189 | test_labels[test_labels.columns[0]], 190 | method=row['methods'], 191 | data=data, 192 | gold_standard=gold_standard, 193 | metric_kwargs=params.evalkwargs.get(row['methods'], None) 194 | ), axis=1 195 | ) 196 | res = res.set_index('methods') 197 | res.to_csv(output[0], sep=readkwargs['sep']) 198 | 199 | 200 | rule collect_dfs: 201 | input: 202 | files = expand( 203 | '%s/{{input_file}}/%s/{params_label}_{{targets}}.txt' % ( 204 | output_folder, intermediates_folder 205 | ), params_label = config['param_sets_labels'] 206 | ) 207 | params: 208 | outputkwargs = lambda wildcards: config['output_kwargs'].get(wildcards.targets) 209 | output: 210 | '%s/{input_file}/%s/{targets}.txt' % (output_folder, clustering_results) 211 | run: 212 | kwargs = { 213 | 'index_col':params.outputkwargs.get('index_col', 0), 214 | 'sep':params.outputkwargs.get('sep', ',') 215 | } 216 | 217 | df = concat_dfs(input.files, kwargs) 218 | df.to_csv( 219 | output[0], sep = kwargs['sep'] # TODO see if this works for the rest 220 | ) 221 | 222 | 223 | rule visualize_evaluations: 224 | input: 225 | files = '%s/{input_file}/%s/evaluations.txt' % ( 226 | output_folder, clustering_results 227 | ) 228 | output: 229 | output_file = '%s/{input_file}/%s/evaluations.pdf' % ( 230 | output_folder, clustering_results 231 | 
) 232 | params: 233 | heatmap_kwargs = config['heatmap_kwargs'], 234 | readkwargs = lambda wildcards: config['read_csv_kwargs'].get(wildcards.input_file, {}) 235 | run: 236 | df = pd.read_csv(input.files, sep=params.readkwargs.get('sep', ','), index_col=0) 237 | 238 | visualize.visualize_evaluations( 239 | df, output_prefix=output.output_file.rsplit('.', 1)[0], savefig=True, 240 | **params.heatmap_kwargs 241 | ) 242 | 243 | 244 | rule pick_best_clusters: 245 | input: 246 | evals = '%s/{input_file}/%s/evaluations.txt' % (output_folder, clustering_results) 247 | output: 248 | "%s/{input_file}/%s/best_parameters.txt" % (output_folder, clustering_results), 249 | params: 250 | metric = config['metric_to_choose_best'], 251 | sep = lambda wcs: config['read_csv_kwargs'].get(wcs.input_file, {}).get('sep', ',') 252 | run: 253 | df = pd.read_csv(input.evals, sep=params.sep, index_col=0).transpose() 254 | labs = list(df[df[params.metric]==df[params.metric].max()].index) 255 | for lab in labs: 256 | copyfile( 257 | "%s/%s/%s/%s_labels.txt" % ( 258 | output_folder, 259 | wildcards.input_file, 260 | intermediates_folder, 261 | lab 262 | ), 263 | "%s/%s/%s/%s_labels.txt" % ( 264 | output_folder, 265 | wildcards.input_file, 266 | clustering_results, 267 | lab 268 | ) 269 | ) 270 | with open(output[0], 'a') as fh: 271 | fh.write('%s\n' % lab) 272 | 273 | visualize.visualize_for_picking_labels( 274 | df.transpose(), 275 | method=params.metric, 276 | savefig_prefix='%s/scree_plots.%s' % ( 277 | output[0].rsplit('/', 1)[0], params.metric 278 | ) 279 | ) 280 | 281 | rule compare_labels: 282 | input: 283 | labels = '%s/{input_file}/%s/labels.txt' % (output_folder, clustering_results) 284 | output: 285 | table = '%s/{input_file}/%s/%s_label_comparison.txt' % ( 286 | output_folder, clustering_results, config['metric_to_compare_labels'] 287 | ) 288 | params: 289 | metric = config['metric_to_compare_labels'], 290 | readkwargs = lambda wildcards: config['read_csv_kwargs'].get(wildcards.input_file, {}) 291 | run: 292 | kwargs = { 293 | 'index_col':params.readkwargs.get('index_col', 0), 294 | 'sep':params.readkwargs.get('sep', ',') 295 | } 296 | df = pd.read_csv(input.labels, **kwargs) 297 | df = df.corr(lambda x, y: utilities.evaluate_one( 298 | x, method=params.metric, gold_standard=y 299 | )) 300 | df.to_csv(output.table, sep=kwargs['sep']) 301 | 302 | visualize.visualize_pairwise( 303 | df, 304 | savefig=True, 305 | output_prefix=output.table.rsplit('.', 1)[0], 306 | method = params.metric, 307 | **config['heatmap_kwargs'] 308 | ) 309 | 310 | 311 | rule compare_samples: 312 | input: 313 | labels = '%s/{input_file}/%s/labels.txt' % (output_folder, clustering_results) 314 | output: 315 | table = '%s/{input_file}/%s/sample_label_agreement.txt' % (output_folder, 316 | clustering_results) 317 | params: 318 | readkwargs = lambda wildcards: config['read_csv_kwargs'].get(wildcards.input_file, {}) 319 | run: 320 | kwargs = { 321 | 'index_col':params.readkwargs.get('index_col', 0), 322 | 'sep':params.readkwargs.get('sep', ',') 323 | } 324 | df = pd.read_csv(input.labels, **kwargs).transpose() 325 | df = df.corr( 326 | lambda x, y: sum(np.equal(x[((x != -1) | (y != -1))], y[((x != -1) | (y != -1))])) 327 | ) 328 | 329 | df.to_csv(output.table, sep = kwargs['sep']) 330 | 331 | visualize.visualize_pairwise( 332 | df, 333 | savefig=True, 334 | output_prefix=output.table.rsplit('.', 1)[0], 335 | method = '# same label', 336 | **config['heatmap_kwargs'] 337 | ) 338 | 339 | 340 | rule draw_scree_plots: 341 | input: 342 | eval_df = 
'%s/{input_file}/%s/evaluations.txt' % (output_folder, clustering_results) 343 | output: 344 | pdfs = expand( 345 | '%s/{{input_file}}/%s/scree_plots.{eval}.pdf' % (output_folder, clustering_results), 346 | eval=config['screeplot_evals'] 347 | ) 348 | params: 349 | sep = lambda wcs: config['read_csv_kwargs'].get(wcs.input_file, {}).get('sep', ',') 350 | run: 351 | df = pd.read_csv(input.eval_df, sep=params.sep, index_col=0) 352 | for metric in config['screeplot_evals']: 353 | visualize.visualize_for_picking_labels( 354 | df, 355 | method=metric, 356 | savefig_prefix='%s/%s/%s/scree_plots.%s' % ( 357 | output_folder, wildcards.input_file, clustering_results, metric 358 | ) 359 | ) 360 | 361 | 362 | -------------------------------------------------------------------------------- /snakemake/snakemake_submit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH --partition cpu_long 3 | #SBATCH --mem 4G 4 | #SBATCH --time 27-23:59:59 5 | #SBATCH --job-name snakeautocluster 6 | #SBATCH --cpus-per-task=2 7 | #SBATCH -e logs/sbatchSnakefile_progress_err.log 8 | #SBATCH -o logs/sbatchSnakefile_progress_out.log 9 | 10 | 11 | module purge 12 | module add slurm 13 | source activate hypercluster 14 | cd /gpfs/home/lmb529/ruggleslabHome/hypercluster 15 | mkdir -p logs/slurm/ 16 | 17 | snakemake -j 999 -p --verbose \ 18 | -s hypercluster.smk \ 19 | --keep-going \ 20 | --cluster-config cluster.json \ 21 | --cluster "sbatch --mem={cluster.mem} -t {cluster.time} -o {cluster.output} -p {cluster.partition}" 22 | -------------------------------------------------------------------------------- /snakemake/test_input.txt: -------------------------------------------------------------------------------- 1 | ind2,0,1 2 | a,1.0,2.0 3 | b,-1.8,4.0 4 | c,1.0,-0.5 5 | d,10.0,2.0 6 | e,-10.0,4.0 7 | f,10.0,0.0 8 | g,0.1,5.0 9 | h,3.2,2.0 10 | i,0.0,14.0 11 | j,-16.4,3.67 12 | k,13.22,-3.0 13 | 11,3.3,2.0 14 | 12,42.0,2.0 15 | 13,-8.0,2.0 16 | 14,1.2,12.0 17 | 15,0.1,2.1 18 | 16,0.25,2.0 19 | 17,0.1,1.11 20 | 18,-44.0,0.0 21 | 19,-0.22,-0.11 22 | 20,2.34,6.7 23 | 21,-10.0,2.0 24 | 22,-2.3,-2.5 25 | 23,0.1,0.0 26 | 24,0.1,22.0 27 | 25,8.6,-7.5 28 | 26,0.0,14.0 29 | 27,-6.4,23.67 30 | 28,-3.22,3.0 31 | 29,0.1,2.0 32 | 30,-20.0,2.01 33 | 31,0.25,-0.25 34 | 32,0.455,0.233 35 | 33,0.1,-0.89 36 | 34,19.0,2.0 37 | 35,0.1,2.0 38 | 36,-29.0,3.6 39 | 37,-13.0,-3.0 40 | 38,3.3,2.0 41 | 39,-4.0,2.0 42 | 40,-0.2,-0.1 43 | 41,0.34,0.7 44 | --------------------------------------------------------------------------------
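For a quick sanity check without submitting the snakemake workflow, the same calls that rules `run_clusterer` and `run_evaluation` make can be run directly on the toy input above. A minimal sketch follows; the `KMeans`/`n_clusters=3` choice and the Series name are arbitrary, and the path assumes the repo root as the working directory.

```python
import pandas as pd

from hypercluster import utilities

# Toy input shipped with the workflow; the first column ('ind2') is the index.
df = pd.read_csv("snakemake/test_input.txt", index_col=0)

# What rule run_clusterer does for a single parameter set.
cls = utilities.cluster("KMeans", df, {"n_clusters": 3})
labels = pd.Series(cls.labels_, index=df.index, name="KMeans-n_clusters-3")

# What rule run_evaluation does for a single metric.
print(utilities.evaluate_one(labels, method="silhouette_score", data=df))
```

The full workflow does exactly this once per parameter set and per metric, then concatenates the per-condition outputs with `collect_dfs` and visualizes the merged tables.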