├── LICENSE
├── README.md
├── build
│   └── lib
│       ├── hypercluster
│       │   ├── __init__.py
│       │   ├── additional_clusterers.py
│       │   ├── additional_metrics.py
│       │   ├── classes.py
│       │   ├── clustering.py
│       │   ├── constants.py
│       │   ├── evaluations.py
│       │   ├── metrics.py
│       │   ├── tests
│       │   │   ├── __init__.py
│       │   │   ├── test_clustering.py
│       │   │   └── test_visualize.py
│       │   ├── utilities.py
│       │   └── visualize.py
│       └── tests
│           ├── __init__.py
│           ├── test_clustering.py
│           ├── test_snakemake.py
│           └── test_visualize.py
├── dist
│   ├── hypercluster-0.0.1-py3-none-any.whl
│   ├── hypercluster-0.0.1.tar.gz
│   ├── hypercluster-0.1.0-py3-none-any.whl
│   ├── hypercluster-0.1.0.tar.gz
│   ├── hypercluster-0.1.1-py3-none-any.whl
│   ├── hypercluster-0.1.1.tar.gz
│   ├── hypercluster-0.1.10-py3-none-any.whl
│   ├── hypercluster-0.1.10.tar.gz
│   ├── hypercluster-0.1.12-py3-none-any.whl
│   ├── hypercluster-0.1.12.tar.gz
│   ├── hypercluster-0.1.13-py3-none-any.whl
│   ├── hypercluster-0.1.13.tar.gz
│   ├── hypercluster-0.1.2-py3-none-any.whl
│   ├── hypercluster-0.1.2.tar.gz
│   ├── hypercluster-0.1.3-py3-none-any.whl
│   ├── hypercluster-0.1.3.tar.gz
│   ├── hypercluster-0.1.5-py3-none-any.whl
│   ├── hypercluster-0.1.5.tar.gz
│   ├── hypercluster-0.1.6-py3-none-any.whl
│   ├── hypercluster-0.1.6.tar.gz
│   ├── hypercluster-0.1.7-py3-none-any.whl
│   ├── hypercluster-0.1.7.tar.gz
│   ├── hypercluster-0.1.8-py3-none-any.whl
│   ├── hypercluster-0.1.8.tar.gz
│   ├── hypercluster-0.1.9-py3-none-any.whl
│   └── hypercluster-0.1.9.tar.gz
├── docs
│   ├── Makefile
│   ├── conf.py
│   ├── hypercluster.rst
│   ├── index.rst
│   ├── make.bat
│   ├── requirements.txt
│   └── snakemake.rst
├── examples
│   ├── README.md
│   ├── local_TCGA_BRCA_RNAseq
│   │   ├── TCGA_2012_BRCA_data_expression_median_top500_most_variable.txt
│   │   ├── TCGA_BRCA_RNAseq_subtype_clustering.ipynb
│   │   ├── data_clinical_sample.txt
│   │   ├── figures
│   │   │   ├── .ipynb_checkpoints
│   │   │   │   ├── grid.scatter.LeidenCluster-silhouette_score-umaps-checkpoint.pdf
│   │   │   │   └── grid.scatter.louvain-umaps-checkpoint.pdf
│   │   │   ├── brca.rna.evaluations.pdf
│   │   │   ├── clustermap.nmf4-vs-psm50.pdf
│   │   │   ├── colorbar.LeidenCluster-silhouette_score.pdf
│   │   │   ├── colorbar.LouvainCluster-adjusted_rand_score.pdf
│   │   │   ├── colorbar.NMFCluster-silhouette_score.pdf
│   │   │   ├── colorbar.silhouette_score.pdf
│   │   │   ├── grid.scatter.LeidenCluster-silhouette_score-umaps.pdf
│   │   │   ├── grid.scatter.LouvainCluster-adjusted_rand_score-umaps.pdf
│   │   │   ├── grid.scatter.NMFCluster-silhouette_score-umaps.pdf
│   │   │   ├── heatmap.brca-rna.evaluations.PAM50_comp.pdf
│   │   │   ├── heatmap.brca-rna.evaluations.pdf
│   │   │   ├── heatmaps.graphs-clusterers.metrics.pdf
│   │   │   ├── scatter.calinski_harabasz_score.pdf
│   │   │   ├── scatter.davies_bouldin_score.pdf
│   │   │   ├── scatter.largest_cluster_size.pdf
│   │   │   ├── scatter.number_of_clusters.pdf
│   │   │   ├── scatter.pca.various_clusters.pdf
│   │   │   ├── scatter.silhouette_score.pdf
│   │   │   ├── scatter.smallest_cluster_size.pdf
│   │   │   └── scatter.smallest_largest_clusters_ratio.pdf
│   │   └── jupyter-lab-5918844.log
│   └── snakemake_scRNA_example
│       ├── cluster.json
│       ├── config.yml
│       ├── data_table_100genes.csv
│       ├── figures
│       │   ├── heatmaps.graphs-clusterers.metrics.pdf
│       │   ├── pca.best_labels.pdf
│       │   ├── pca.published_labels.pdf
│       │   ├── umap.best_labels.pdf
│       │   └── umap.published_labels.pdf
│       ├── gold_standard.csv
│       ├── params_to_test.yml
│       ├── scRNA-seq_example.ipynb
│       ├── sc_data.csv
│       ├── seurat_igor_meta.tsv
│       └── snakemake_submit.sh
├── hypercluster
│   ├── __init__.py
│   ├── additional_clusterers.py
│   ├── additional_metrics.py
│   ├── classes.py
│   ├── constants.py
│   ├── tests
│   │   ├── __init__.py
│   │   ├── test_clustering.py
│   │   └── test_visualize.py
│   ├── utilities.py
│   └── visualize.py
├── setup.py
└── snakemake
    ├── cluster.json
    ├── config.yml
    ├── hypercluster.smk
    ├── snakemake_submit.sh
    └── test_input.txt
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2019, Lili Blumenberg
2 | All rights reserved.
3 | 
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 |     * Redistributions of source code must retain the above copyright
7 |       notice, this list of conditions and the following disclaimer.
8 |     * Redistributions in binary form must reproduce the above copyright
9 |       notice, this list of conditions and the following disclaimer in the
10 |       documentation and/or other materials provided with the distribution.
11 |     * Neither the name of the nor the
12 |       names of its contributors may be used to endorse or promote products
13 |       derived from this software without specific prior written permission.
14 | 
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY
19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Hypercluster
2 | A package for clustering optimization with sklearn.
3 | 
4 | ### Requirements:
5 | pandas
6 | numpy
7 | scipy
8 | matplotlib
9 | seaborn
10 | scikit-learn
11 | hdbscan
12 | 
13 | Optional:
14 | snakemake
15 | 
16 | 
17 | ### Install
18 | With pip:
19 | ```
20 | pip install hypercluster
21 | ```
22 | 
23 | or with conda:
24 | ```
25 | conda install hypercluster
26 | # or
27 | conda install -c conda-forge -c bioconda hypercluster
28 | ```
29 | If you are having problems installing with conda, try changing your channel priority. A priority of conda-forge > bioconda > defaults is recommended.
30 | To check channel priority: `conda config --get channels`
31 | It should look like:
32 | ```
33 | --add channels 'defaults'   # lowest priority
34 | --add channels 'bioconda'
35 | --add channels 'conda-forge'   # highest priority
36 | ```
37 | 
38 | If it doesn't look like that, try:
39 | ```
40 | conda config --add channels bioconda
41 | conda config --add channels conda-forge
42 | ```
43 | 
44 | ### Docs
45 | https://hypercluster.readthedocs.io/en/latest/index.html
46 | 
47 | It will also be useful to check out sklearn's pages on [clustering](https://scikit-learn.org/stable/modules/clustering.html)
48 | and [evaluation metrics](https://scikit-learn.org/stable/modules/clustering.html#clustering-performance-evaluation).
49 | 
50 | ### Examples
51 | https://github.com/liliblu/hypercluster/tree/dev/examples
52 | 
53 | ### Quickstart with SnakeMake
54 | 
55 | Default `config.yml` and `hypercluster.smk` are in the `snakemake` directory above.
56 | Edit the `config.yml` file directly, or override individual settings with command-line arguments.
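For reference, a minimal sketch of the relevant `config.yml` entries after such an edit (hypothetical values; the default file shipped in the `snakemake` directory contains the full set of options):
```yaml
input_data_folder: /path/to/data   # folder containing the input table(s)
input_data_files:                  # file prefixes to cluster
  - test_data
read_csv_kwargs:                   # per-file kwargs for pandas.read_csv
  test_data:
    index_col: [0]
```
Individual values can also be overridden on the command line: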
57 | ```bash
58 | snakemake -s hypercluster.smk --configfile config.yml --config input_data_files=test_data input_data_folder=.
59 | ```
60 | 
61 | Example editing with python:
62 | ```python
63 | import os
64 | import yaml
65 | 
66 | with open('config.yml', 'r') as fh:
67 |     config = yaml.safe_load(fh)
68 | 
69 | input_data_prefix = 'test_data'
70 | config['input_data_folder'] = os.path.abspath('.')
71 | config['input_data_files'] = [input_data_prefix]
72 | config['read_csv_kwargs'] = {input_data_prefix: {'index_col': [0]}}
73 | 
74 | with open('config.yml', 'w') as fh:
75 |     yaml.dump(config, stream=fh)
76 | ```
77 | 
78 | Then call snakemake.
79 | ```bash
80 | snakemake -s hypercluster.smk
81 | ```
82 | 
83 | Or submit the snakemake scheduler as an sbatch job, e.g. with BigPurple Slurm:
84 | ```bash
85 | module add slurm
86 | sbatch snakemake_submit.sh
87 | ```
88 | Examples for `snakemake_submit.sh` and `cluster.json` are in the scRNA-seq example.
89 | 
90 | ### Quickstart with python
91 | ```python
92 | import pandas as pd
93 | from sklearn.datasets import make_blobs
94 | import hypercluster
95 | 
96 | data, labels = make_blobs()
97 | data = pd.DataFrame(data)
98 | labels = pd.Series(labels, index=data.index, name='labels')
99 | 
100 | # With a single clustering algorithm
101 | clusterer = hypercluster.AutoClusterer()
102 | clusterer.fit(data).evaluate(
103 |     methods=hypercluster.constants.need_ground_truth + hypercluster.constants.inherent_metrics,
104 |     gold_standard=labels
105 | )
106 | 
107 | clusterer.visualize_evaluations()
108 | 
109 | # With a range of algorithms
110 | clusterer = hypercluster.MultiAutoClusterer()
111 | clusterer.fit(data).evaluate(
112 |     methods=hypercluster.constants.need_ground_truth + hypercluster.constants.inherent_metrics,
113 |     gold_standard=labels
114 | )
115 | 
116 | clusterer.visualize_evaluations()
117 | ```
118 | 
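Once `evaluate` has run, the top-scoring labels can be extracted. A minimal sketch (assuming, per `hypercluster.utilities.pick_best_labels`, an evaluation table indexed by metric name):
```python
from hypercluster.utilities import pick_best_labels

# Reuse `clusterer` from the quickstart above: keep the labeling(s) that
# maximize the silhouette score.
best_labels = pick_best_labels(
    clusterer.evaluation_df,
    clusterer.labels_df,
    method='silhouette_score',
    min_or_max='max',
)
```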
--------------------------------------------------------------------------------
/build/lib/hypercluster/__init__.py:
--------------------------------------------------------------------------------
1 | import matplotlib
2 | import seaborn as sns
3 | import hypercluster
4 | from hypercluster import (
5 |     utilities, additional_clusterers, additional_metrics, classes, constants, visualize
6 | )
7 | from hypercluster.classes import AutoClusterer, MultiAutoClusterer
8 | __version__ = '0.1.13'
9 | __all__ = [
10 |     "AutoClusterer",
11 |     "MultiAutoClusterer"
12 | ]
13 | 
14 | matplotlib.rcParams["pdf.fonttype"] = 42
15 | matplotlib.rcParams["ps.fonttype"] = 42
16 | sns.set(font="arial", style="white", color_codes=True, font_scale=1.3)
17 | matplotlib.rcParams.update({"savefig.bbox": "tight"})
--------------------------------------------------------------------------------
/build/lib/hypercluster/additional_clusterers.py:
--------------------------------------------------------------------------------
1 | """
2 | Additional clustering classes can be added here, as long as they have a 'fit' method.
3 | 
4 | 
5 | Attributes:
6 |     HDBSCAN (clustering class): See `hdbscan`_
7 | 
8 | .. _hdbscan:
9 |     https://hdbscan.readthedocs.io/en/latest/basic_hdbscan.html#the-simple-case/
10 | """
11 | from typing import Optional, Iterable
12 | import logging
13 | import numpy as np
14 | import pandas as pd
15 | from scipy.spatial.distance import pdist, squareform
16 | from sklearn.decomposition import NMF
17 | from sklearn.neighbors import NearestNeighbors
18 | from hdbscan import HDBSCAN
19 | from .constants import pdist_adjacency_methods, valid_partition_types
20 | import igraph as ig
21 | import louvain
22 | import leidenalg
23 | 
24 | 
25 | class NMFCluster:
26 |     """Uses non-negative matrix factorization from sklearn to assign clusters to samples, based
27 |     on the maximum membership score of the sample per component.
28 | 
29 |     Args:
30 |         n_clusters: The number of clusters to find. Used as n_components when fitting.
31 |         **nmf_kwargs: Additional keyword arguments to pass to sklearn.decomposition.NMF.
32 |     """
33 |     def __init__(self, n_clusters: int = 8, **nmf_kwargs):
34 | 
35 |         nmf_kwargs['n_components'] = n_clusters
36 | 
37 |         self.NMF = NMF(**nmf_kwargs)
38 |         self.n_clusters = n_clusters
39 | 
40 |     def fit(self, data):
41 |         """If negative numbers are present, creates one data matrix with all negative numbers
42 |         zeroed, and a second data matrix with all positive numbers zeroed and the signs of all
43 |         negative numbers reversed. Concatenates both matrices, resulting in a data matrix twice
44 |         as wide as the original but containing only zeros and positive values, as NMF requires.
45 |         Uses the decomposed matrix W, which is n x k (with n = number of samples and k = number
46 |         of components), to assign cluster membership. Each sample is assigned to the cluster for
47 |         which it has the highest membership score. See `sklearn.decomposition.NMF`_
48 | 
49 |         Args:
50 |             data (DataFrame): Data to fit with samples as rows and features as columns.
51 | 
52 |         Returns:
53 |             self with labels\_ attribute.
54 | 
55 |         .. _sklearn.decomposition.NMF:
56 |             https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html
57 |         """
58 | 
59 |         if np.any(data < 0):
60 |             positive = data.copy()
61 |             positive[positive < 0] = 0
62 |             negative = data.copy()
63 |             negative[negative > 0] = 0
64 |             negative = -negative
65 |             data = pd.concat([positive, negative], axis=1, join='outer')
66 | 
67 |         self.labels_ = pd.DataFrame(self.NMF.fit_transform(data)).idxmax(axis=1).values
68 |         return self
69 | 
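# Worked illustration of the sign-splitting described above (hypothetical
# values, added for clarity): a matrix with negatives becomes a non-negative
# matrix of twice the width, e.g.
#
#     [[ 1, -2],    becomes    [[1, 0, 0, 2],
#      [-3,  4]]                [0, 4, 3, 0]]
#
# where the first two columns hold the positive part and the last two hold
# the magnitudes of the negative part.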
70 | 
71 | class LouvainCluster:
72 |     """Louvain clustering on a graph derived from an adjacency matrix.
73 | 
74 |     Args:
75 |         adjacency_method: Method to use to construct the adjacency matrix, which is used to construct the \
76 |             graph that will be clustered. Valid methods are any metric valid in \
77 |             scipy.spatial.distance.pdist, or MNN, for mutual nearest neighbors, and CNN, for common \
78 |             nearest neighbors. Both use sklearn.neighbors.NearestNeighbors at a given k to calculate \
79 |             NNs. MNN then uses whether points i and j are each other's NNs as edge weights. CNN uses \
80 |             the count of how many NNs i and j have in common as the edge weight.
81 |         k: If using CNN or MNN, k to use to construct the NearestNeighbors matrix.
82 |         resolution: If using 'RBConfigurationVertexPartition' or 'CPMVertexPartition', which \
83 |             resolution to use. If using other partitioners, this is ignored, but any other kwargs for \
84 |             those partitioners can be passed too.
85 |         adjacency_kwargs: Additional keyword arguments to pass to \
86 |             sklearn.neighbors.NearestNeighbors or scipy.spatial.distance.pdist to construct the \
87 |             adjacency matrix.
88 |         partition_type: Which partition to use for louvain clustering, see `louvain-igraph`_ for \
89 |             more info.
90 |         **louvain_kwargs: Additional kwargs to be passed to `find_partition`_
91 | 
92 |     .. _louvain-igraph:
93 |         https://louvain-igraph.readthedocs.io/en/latest/reference.html
94 |     .. _find_partition:
95 |         https://louvain-igraph.readthedocs.io/en/latest/reference.html#louvain.find_partition
96 |     """
97 |     def __init__(
98 |         self,
99 |         adjacency_method: str = 'MNN',
100 |         k: int = 20,
101 |         resolution: float = 0.8,
102 |         adjacency_kwargs: Optional[dict] = None,
103 |         partition_type: str = 'RBConfigurationVertexPartition',
104 |         **louvain_kwargs
105 |     ):
106 | 
107 |         if adjacency_method not in ['MNN', 'CNN'] + pdist_adjacency_methods:
108 |             raise ValueError(
109 |                 'Adjacency method %s invalid. Must be "MNN", "CNN" or a valid metric for '
110 |                 'scipy.spatial.distance.pdist.' % adjacency_method
111 |             )
112 |         if partition_type not in valid_partition_types:
113 |             raise ValueError(
114 |                 'Partition type %s not valid, must be in constants.valid_partition_types' %
115 |                 partition_type
116 |             )
117 |         self.adjacency_method = adjacency_method
118 |         self.k = int(k)
119 |         self.resolution = resolution
120 |         self.adjacency_kwargs = adjacency_kwargs
121 |         self.partition_type = partition_type
122 |         self.louvain_kwargs = louvain_kwargs
123 | 
124 |     def fit(
125 |         self,
126 |         data: pd.DataFrame,
127 |     ):
128 |         adjacency_method = self.adjacency_method
129 |         k = self.k
130 |         resolution = self.resolution
131 |         adjacency_kwargs = self.adjacency_kwargs
132 |         louvain_kwargs = self.louvain_kwargs
133 |         partition_type = self.partition_type
134 |         if k >= len(data):
135 |             logging.warning(
136 |                 'k was set to %s, but there are only %s samples; setting k to %s.'
137 |                 % (k, len(data), len(data) - 1)
138 |             )
139 |             k = len(data) - 1
140 |         if (adjacency_method == 'MNN') | (adjacency_method == 'CNN'):
141 |             if adjacency_kwargs is None:
142 |                 adjacency_kwargs = {}
143 |             adjacency_kwargs['n_neighbors'] = adjacency_kwargs.get('n_neighbors', k)
144 |             nns = NearestNeighbors(**adjacency_kwargs)
145 |             nns.fit(data)
146 |             adjacency_mat = nns.kneighbors_graph(data)
147 |             if adjacency_method == 'MNN':
148 |                 adjacency_mat = adjacency_mat.multiply(adjacency_mat.transpose())
149 |             if adjacency_method == 'CNN':
150 |                 adjacency_mat = adjacency_mat * adjacency_mat.transpose()
151 |         elif adjacency_method in pdist_adjacency_methods:
152 |             adjacency_mat = pdist(data, metric=adjacency_method, **(adjacency_kwargs or {}))
153 |         adjacency_mat = squareform(adjacency_mat) if adjacency_mat.ndim == 1 else adjacency_mat.toarray()
154 |         if louvain_kwargs is None:
155 |             louvain_kwargs = {}
156 |         g = ig.Graph.Weighted_Adjacency(adjacency_mat.tolist())
157 | 
158 |         if partition_type in ['RBConfigurationVertexPartition', 'CPMVertexPartition']:
159 |             louvain_kwargs['resolution_parameter'] = resolution
160 | 
161 |         labels = louvain.find_partition(g, getattr(louvain, partition_type), **louvain_kwargs)
162 |         labels = pd.Series({v: i for i in range(len(labels)) for v in labels[i]}).sort_index()
163 |         if labels.is_unique or (len(labels.unique()) == 1):
164 |             labels = pd.Series([-1 for i in range(len(labels))])
165 |         labels = labels.values
166 |         self.labels_ = labels
167 |         return self
168 | 
169 | 
170 | class LeidenCluster:
171 |     """Leiden clustering on a graph derived from an adjacency matrix. See `reference`_ for more info
172 | 
173 |     Args:
174 |         adjacency_method: Method to use to construct the adjacency matrix, which is used to construct the \
175 |             graph that will be clustered. Valid methods are any metric valid in \
176 |             scipy.spatial.distance.pdist, or SNN, for shared nearest neighbors, and CNN, for common \
177 |             nearest neighbors. Both use sklearn.neighbors.NearestNeighbors at a given k to calculate \
178 |             NNs. SNN then uses whether points i and j are each other's NNs as edge weights. CNN uses \
179 |             the count of how many NNs i and j have in common as the edge weight.
180 |         k: If using CNN or SNN, k to use to construct the NearestNeighbors matrix.
181 |         resolution: If using 'RBConfigurationVertexPartition' or 'CPMVertexPartition', which \
182 |             resolution to use. If using other partitioners, this is ignored, but any other kwargs for \
183 |             those partitioners can be passed too.
184 |         adjacency_kwargs: Additional keyword arguments to pass to \
185 |             sklearn.neighbors.NearestNeighbors or scipy.spatial.distance.pdist to construct the \
186 |             adjacency matrix.
187 |         partition_type: Which partition to use for leiden clustering, see `leidenalg`_ for \
188 |             more info.
189 |         **leiden_kwargs: Additional kwargs to be passed to `find_partition`_
190 |     .. _reference:
191 |         https://www.nature.com/articles/s41598-019-41695-z
192 |     .. _leidenalg:
193 |         https://leidenalg.readthedocs.io/en/latest/reference.html
194 |     .. _find_partition:
195 |         https://leidenalg.readthedocs.io/en/latest/reference.html#leidenalg.find_partition
196 |     """
197 |     def __init__(
198 |         self,
199 |         adjacency_method: str = 'SNN',
200 |         k: int = 20,
201 |         resolution: float = 0.8,
202 |         adjacency_kwargs: Optional[dict] = None,
203 |         partition_type: str = 'RBConfigurationVertexPartition',
204 |         **leiden_kwargs
205 |     ):
206 | 
207 |         self.adjacency_method = adjacency_method
208 |         self.k = int(k)
209 |         self.resolution = resolution
210 |         self.adjacency_kwargs = adjacency_kwargs
211 |         self.partition_type = partition_type
212 |         self.leiden_kwargs = leiden_kwargs
213 | 
214 |     def fit(
215 |         self,
216 |         data: pd.DataFrame,
217 |     ):
218 | 
219 |         adjacency_method = self.adjacency_method
220 |         k = self.k
221 |         resolution = self.resolution
222 |         adjacency_kwargs = self.adjacency_kwargs
223 |         leiden_kwargs = self.leiden_kwargs
224 |         partition_type = self.partition_type
225 |         if k >= len(data):
226 |             logging.warning(
227 |                 'k was set to %s, but there are only %s samples; setting k to %s.'
228 |                 % (k, len(data), len(data) - 1)
229 |             )
230 |             k = len(data) - 1
231 |         if (adjacency_method == 'SNN') | (adjacency_method == 'CNN'):
232 |             if adjacency_kwargs is None:
233 |                 adjacency_kwargs = {}
234 |             adjacency_kwargs['n_neighbors'] = adjacency_kwargs.get('n_neighbors', k)
235 |             nns = NearestNeighbors(**adjacency_kwargs)
236 |             nns.fit(data)
237 |             adjacency_mat = nns.kneighbors_graph(data)
238 |             if adjacency_method == 'SNN':
239 |                 adjacency_mat = adjacency_mat.multiply(adjacency_mat.transpose())
240 |             if adjacency_method == 'CNN':
241 |                 adjacency_mat = adjacency_mat * adjacency_mat.transpose()
242 |         elif adjacency_method in pdist_adjacency_methods:
243 |             adjacency_mat = pdist(data, metric=adjacency_method, **(adjacency_kwargs or {}))
244 |         adjacency_mat = squareform(adjacency_mat) if adjacency_mat.ndim == 1 else adjacency_mat.toarray()
245 |         if leiden_kwargs is None:
246 |             leiden_kwargs = {}
247 |         g = ig.Graph.Weighted_Adjacency(adjacency_mat.tolist())
248 | 
249 |         if partition_type in ['RBConfigurationVertexPartition', 'CPMVertexPartition']:
250 |             leiden_kwargs['resolution_parameter'] = resolution
251 | 
252 |         labels = leidenalg.find_partition(g, getattr(leidenalg, partition_type), **leiden_kwargs)
253 |         labels = pd.Series({v: i for i in range(len(labels)) for v in labels[i]}).sort_index()
254 |         if labels.is_unique or (len(labels.unique()) == 1):
255 |             labels = pd.Series([-1 for i in range(len(labels))])
256 |         labels = labels.values
257 |         self.labels_ = labels
258 |         return self
259 | 
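# A minimal usage sketch for the two graph clusterers above (illustrative
# only; assumes the igraph, louvain and leidenalg dependencies imported at
# the top of this module are installed):
#
# >>> import pandas as pd
# >>> from sklearn.datasets import make_blobs
# >>> data = pd.DataFrame(make_blobs(n_samples=100, random_state=0)[0])
# >>> LouvainCluster(adjacency_method='MNN', k=10).fit(data).labels_
# >>> LeidenCluster(adjacency_method='SNN', k=10).fit(data).labels_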
--------------------------------------------------------------------------------
/build/lib/hypercluster/additional_metrics.py:
--------------------------------------------------------------------------------
1 | from typing import Iterable, Optional
2 | from collections import Counter
3 | from pandas import DataFrame
4 | from scipy.cluster.hierarchy import linkage, cophenet
5 | from scipy.spatial.distance import pdist
6 | 
7 | __doc__ = (
8 |     "More functions for evaluating clustering results. Additional metric evaluations can "
9 |     "be added here, as long as the second argument is the labels to evaluate"
10 | )
11 | 
12 | 
13 | def number_clustered(_, labels: Iterable) -> float:
14 |     """Returns the number of clustered samples.
15 | 
16 |     Args:
17 |         _: Dummy, pass anything or None.
18 |         labels (Iterable): Vector of sample labels.
19 | 
20 |     Returns (int):
21 |         The number of clustered labels.
22 | 
23 |     """
24 |     return (labels != -1).sum()
25 | 
26 | 
27 | def smallest_largest_clusters_ratio(_, labels: Iterable) -> float:
28 |     """Number in the smallest cluster over the number in the largest cluster.
29 | 
30 |     Args:
31 |         _: Dummy, pass anything or None.
32 |         labels (Iterable): Vector of sample labels.
33 | 
34 |     Returns (float):
35 |         Ratio of number of members in smallest over largest cluster.
36 | 
37 |     """
38 |     counts = Counter(labels)
39 |     counts.pop(-1, None)
40 |     return min(counts.values()) / max(counts.values())
41 | 
42 | 
43 | def smallest_cluster_ratio(_, labels: Iterable) -> float:
44 |     """Number in the smallest cluster over the total samples.
45 | 
46 |     Args:
47 |         _: Dummy, pass anything or None.
48 |         labels (Iterable): Vector of sample labels.
49 | 
50 |     Returns (float):
51 |         Ratio of number of members in smallest over all samples.
52 | 
53 |     """
54 |     counts = Counter(labels)
55 |     counts.pop(-1, None)
56 |     return min(counts.values()) / len(labels)
57 | 
58 | 
59 | def number_of_clusters(_, labels: Iterable) -> float:
60 |     """Number of total clusters.
61 | 
62 |     Args:
63 |         _: Dummy, pass anything or None
64 |         labels (Iterable): Vector of sample labels.
65 | 
66 |     Returns (int):
67 |         Number of clusters.
68 | 
69 |     """
70 |     return len(Counter(labels))
71 | 
72 | 
73 | def smallest_cluster_size(_, labels: Iterable) -> float:
74 |     """Number in smallest cluster
75 | 
76 |     Args:
77 |         _: Dummy, pass anything or None
78 |         labels (Iterable): Vector of sample labels.
79 | 
80 |     Returns (int):
81 |         Number of samples in smallest cluster.
82 | 
83 |     """
84 |     return min(Counter(labels).values())
85 | 
86 | 
87 | def largest_cluster_size(_, labels: Iterable) -> float:
88 |     """Number in largest cluster
89 | 
90 |     Args:
91 |         _: Dummy, pass anything or None
92 |         labels (Iterable): Vector of sample labels.
93 | 
94 |     Returns (int):
95 |         Number of samples in largest cluster.
96 | 
97 |     """
98 |     return max(Counter(labels).values())
99 | 
--------------------------------------------------------------------------------
/build/lib/hypercluster/clustering.py:
--------------------------------------------------------------------------------
1 | from sklearn.cluster import *
2 | from sklearn.metrics import *
3 | from .metrics import *
4 | from hdbscan import HDBSCAN
5 | from pandas import DataFrame
6 | import pandas as pd
7 | import numpy as np
8 | import logging
9 | from typing import Optional, Iterable, Dict, Union
10 | from itertools import product
11 | from .constants import *
12 | 
13 | 
14 | def calculate_row_weights(
15 |     row: Iterable, param_weights: dict, vars_to_optimize: dict
16 | ) -> float:
17 |     """
18 |     Used to select random rows of parameter combinations using individual parameter weights.
19 | 
20 |     Args:
21 |         row: Series of parameters, with parameter names as index.
22 |         param_weights: Dictionary of str: dictionaries. Ex format - {'parameter_name':{'param_option_1':0.5, 'param_option_2':0.5}}.
23 |         vars_to_optimize: Dictionary with possibilities for different parameters. Ex format - {'parameter_name':[1, 2, 3, 4, 5]}.
24 | 
25 |     Returns:
26 |         Float representing the probability of seeing that combination of parameters,
27 |         given their individual weights.
28 | 
29 |     """
30 |     weights = []
31 |     for var_lab, val in row.to_dict().items():
32 |         weights.append(
33 |             param_weights.get(var_lab, {}).get(
34 |                 val, (1 / len(vars_to_optimize[var_lab]))
35 |             )
36 |         )
37 |     # TODO if probs are given to some options and not others, split the remaining probability,
38 |     # don't just give equal prob.
39 |     return np.prod(weights)
40 | 
41 | 
42 | def cluster(clusterer_name: str, data: DataFrame, params: dict = {}):
43 |     """
44 |     Runs a given clusterer with a given set of parameters.
45 | 
46 |     Args:
47 |         clusterer_name: String name of clusterer, for options see hypercluster.constants.variables_to_optimize.
48 |         data: Dataframe with elements to cluster as index and examples as columns.
49 |         params: Dictionary of parameter names and values to feed into clusterer. Default {}.
50 | 
51 |     Returns:
52 |         Instance of the clusterer fit with the data provided.
53 |     """
54 |     clusterer = eval(clusterer_name)(**params)
55 |     return clusterer.fit(data)
56 | 
57 | 
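# Sketch of calling `cluster` directly (the clusterer classes, e.g. KMeans,
# come from the star-imports at the top of this module; `df` is any
# samples-by-features DataFrame):
#
# >>> cluster('KMeans', df, {'n_clusters': 3}).labels_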
58 | class AutoClusterer:
59 |     """
60 |     Main hypercluster object.
61 |     Args:
62 |         clusterer_name: String name of clusterer, for options see
63 |             hypercluster.constants.variables_to_optimize.
64 |         params_to_optimize: Dictionary with possibilities for different parameters. Ex format - {
65 |             'parameter_name':[1, 2, 3, 4, 5]}. If None, will optimize default selection, given in
66 |             hypercluster.constants.variables_to_optimize. Default None.
67 |         random_search: Whether to search a random selection of possible parameters or all
68 |             possibilities. Default True.
69 |         random_search_fraction: If random_search is True, what fraction of the possible
70 |             parameters to search. Default 0.5.
71 |         param_weights: Dictionary of str: dictionaries. Ex format - {'parameter_name':{
72 |             'param_option_1':0.5, 'param_option_2':0.5}}.
73 |         clus_kwargs: Additional kwargs to pass into given clusterer, but not to be optimized.
74 |             Default None.
75 |     """
76 | 
77 |     def __init__(
78 |         self,
79 |         clusterer_name: Optional[str] = "HDBSCAN",
80 |         params_to_optimize: Optional[dict] = None,
81 |         random_search: bool = True,
82 |         random_search_fraction: float = 0.5,
83 |         param_weights: dict = {},
84 |         clus_kwargs: Optional[dict] = None,
85 |     ):
86 |         self.clusterer_name = clusterer_name
87 |         self.params_to_optimize = params_to_optimize
88 |         self.random_search = random_search
89 |         self.random_search_fraction = random_search_fraction
90 |         self.param_weights = param_weights
91 |         self.clus_kwargs = clus_kwargs
92 | 
93 |         if self.params_to_optimize is None:
94 |             self.params_to_optimize = variables_to_optimize[clusterer_name]
95 |         if self.clus_kwargs is None:
96 |             self.clus_kwargs = {}
97 | 
98 |         self.labels_ = None
99 |         self.static_kwargs = None
100 |         self.total_possible_conditions = None
101 |         self.param_sets = None
102 |         self.generate_param_sets()
103 |         self.labels_ = None
104 | 
105 |     def generate_param_sets(self):
106 |         """
107 |         Uses info from init to make a Dataframe of all parameter sets that will be tried.
108 |         Returns:
109 |             self
110 |         """
111 |         conditions = 1
112 |         vars_to_optimize = {}
113 |         static_kwargs = {}
114 |         for parameter_name, possible_values in self.params_to_optimize.items():
115 |             if len(possible_values) == 1:
116 |                 static_kwargs[parameter_name] = possible_values[0]
117 |             elif len(possible_values) > 1:
118 |                 vars_to_optimize[parameter_name] = possible_values
119 |                 conditions *= len(possible_values)
120 |             else:
121 |                 logging.error(
122 |                     "Parameter %s was given no possibilities. Will continue with default parameter."
123 |                     % parameter_name
124 |                 )
125 | 
126 |         self.static_kwargs = static_kwargs
127 |         self.total_possible_conditions = conditions
128 | 
129 |         all_combos = list(product(*vars_to_optimize.values()))
130 |         parameters = pd.DataFrame(
131 |             all_combos,
132 |             columns=list(vars_to_optimize.keys()),
133 |         )
134 | 
135 |         if self.random_search and len(parameters) > 1:
136 |             will_search = int(conditions * self.random_search_fraction)
137 | 
138 |             # calculates probability of getting a particular set of parameters, given the probs of
139 |             # all the individual params. If a prob isn't set, give uniform probability to each
140 |             # parameter.
141 |             if self.param_weights:
142 |                 weights = parameters.apply(
143 |                     lambda param_set: calculate_row_weights(
144 |                         param_set, self.param_weights, vars_to_optimize
145 |                     ),
146 |                     axis=1,
147 |                 )
148 |             else:
149 |                 weights = None
150 |             parameters = parameters.sample(will_search, weights=weights)
151 | 
152 |         for col in static_kwargs.keys():
153 |             parameters[col] = static_kwargs[col]
154 | 
155 |         logging.info(
156 |             "For clusterer %s, testing %s out of %s possible conditions"
157 |             % (self.clusterer_name, len(parameters), conditions)
158 |         )
159 | 
160 |         self.param_sets = parameters
161 |         return self
162 | 
163 |     def fit(self, data: DataFrame):
164 |         """
165 |         Fits clusterer to data with each parameter set.
166 |         Args:
167 |             data: Dataframe with elements to cluster as index and examples as columns.
168 | 
169 |         Returns:
170 |             self with self.labels_ assigned
171 |         """
172 | 
173 |         if self.param_sets.shape == (0, 0):
174 |             self.labels_ = pd.DataFrame(
175 |                 cluster(self.clusterer_name, data).labels_,
176 |                 columns=["default_parameters"],
177 |                 index=data.index,
178 |             )
179 |             return self
180 |         label_results = pd.DataFrame(columns=self.param_sets.columns.union(data.index))
181 |         for i, row in self.param_sets.iterrows():
182 |             single_params = row.to_dict()
183 |             labels = cluster(self.clusterer_name, data, single_params).labels_
184 | 
185 |             label_row = dict(zip(data.index, labels))
186 |             label_row.update(single_params)
187 |             label_results = pd.concat([label_results, pd.DataFrame([label_row])], ignore_index=True)
188 |             logging.info(
189 |                 "%s - %s of conditions done" % (i, (i / self.total_possible_conditions))
190 |             )
191 |         if len(self.param_sets.columns) > 0:
192 |             label_results = label_results.set_index(
193 |                 list(self.param_sets.columns)
194 |             ).transpose()
195 | 
196 |         self.labels_ = label_results
197 |         return self
198 | 
199 | 
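# Illustrative walk-through with a hypothetical parameter grid (`df` is any
# samples-by-features DataFrame):
#
# >>> ac = AutoClusterer('KMeans', params_to_optimize={'n_clusters': [2, 3, 4]}, random_search=False)
# >>> ac.param_sets               # one row per condition that will be tried
# >>> labels = ac.fit(df).labels_  # rows are samples, columns are conditions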
200 | def evaluate_results(
201 |     labels: Iterable,
202 |     method: str = "silhouette_score",
203 |     data: Optional[DataFrame] = None,
204 |     gold_standard: Optional[Iterable] = None,
205 |     metric_kwargs: Optional[dict] = None,
206 | ) -> float:
207 |     """
208 |     Uses a given metric to evaluate clustering results.
209 |     Args:
210 |         labels: Series of labels.
211 |         method: Str name of evaluation to use. For options see hypercluster.constants.inherent_metrics and need_ground_truth. Default is silhouette.
212 |         data: If using an inherent metric, must provide Dataframe of original data used to
213 |             cluster. For options see hypercluster.constants.inherent_metrics.
214 |         gold_standard: If using a metric that compares to ground truth, must provide a set of
215 |             gold standard labels. For options see hypercluster.constants.need_ground_truth.
216 |         metric_kwargs: Additional kwargs to use in evaluation.
217 | 
218 |     Returns:
219 |         Float value of the evaluation metric, or np.nan if there are fewer than two clusters.
220 |     """
221 | 
222 |     if metric_kwargs is None:
223 |         metric_kwargs = {}
224 | 
225 |     if method in need_ground_truth:
226 |         if gold_standard is None:
227 |             raise ValueError(
228 |                 "Chosen evaluation metric %s requires gold standard set." % method
229 |             )
230 |         clustered = (gold_standard != -1) & (labels != -1)
231 |         compare_to = gold_standard[clustered]
232 | 
233 |     elif method in inherent_metrics:
234 |         if data is None:
235 |             raise ValueError(
236 |                 "Chosen evaluation metric %s requires data input." % method
237 |             )
238 |         clustered = labels != -1
239 |         compare_to = data.loc[clustered]
240 |     else:
241 |         raise ValueError("Evaluation metric %s not valid" % method)
242 | 
243 |     if len(labels[clustered].value_counts()) < 2:
244 |         logging.error(
245 |             "Condition %s does not have at least two clusters, skipping" % labels.name
246 |         )
247 |         return np.nan
248 | 
249 |     return eval(method)(compare_to, labels[clustered], **metric_kwargs)
250 | 
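# Sketch: score one column of an AutoClusterer's labels against the original
# data (`labels` and `df` as in the AutoClusterer example above):
#
# >>> evaluate_results(labels[labels.columns[0]], method='silhouette_score', data=df)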
251 | 
252 | def optimize_clustering(
253 |     data,
254 |     algorithm_names: Union[Iterable, str] = variables_to_optimize.keys(),
255 |     algorithm_parameters: Optional[Dict[str, dict]] = None,
256 |     random_search: bool = True,
257 |     random_search_fraction: float = 0.5,
258 |     algorithm_param_weights: Optional[dict] = None,
259 |     algorithm_clus_kwargs: Optional[dict] = None,
260 |     evaluation_methods: Optional[list] = None,
261 |     gold_standard: Optional[Iterable] = None,
262 |     metric_kwargs: Optional[dict] = None,
263 | ) -> tuple:
264 |     """
265 |     Runs through many clusterers and parameters to get best clustering labels.
266 |     Args:
267 |         data: Dataframe with elements to cluster as index and examples as columns.
268 |         algorithm_names: Which clusterers to try. Default is in variables_to_optimize. Can also
269 |             put 'slow', 'fast' or 'fastest' for a subset of clusterers. See hypercluster.constants.categories.
270 |         algorithm_parameters: Dictionary of str:dict, with parameters to optimize for each clusterer. Ex. structure: {'clusterer1':{'param1':['opt1', 'opt2', 'opt3']}}.
271 |         random_search: Whether to search a random selection of possible parameters or all possibilities. Default True.
272 |         random_search_fraction: If random_search is True, what fraction of the possible parameters to search, applied to all clusterers. Default 0.5.
273 |         algorithm_param_weights: Dictionary of str: dictionaries. Ex format - {'clusterer_name': {'parameter_name':{'param_option_1':0.5, 'param_option_2':0.5}}}.
274 |         algorithm_clus_kwargs: Dictionary of additional kwargs per clusterer.
275 |         evaluation_methods: List of str names of evaluation metrics to use. For options see
276 |             hypercluster.constants.inherent_metrics and need_ground_truth. Default is inherent_metrics.
277 |         gold_standard: If using an evaluation that needs ground truth, must provide ground truth labels. For options see hypercluster.constants.need_ground_truth.
278 |         metric_kwargs: Additional evaluation metric kwargs.
279 | 
280 |     Returns:
281 |         DataFrame of evaluations, DataFrame of all labels, and dictionary of labels DataFrames per clusterer.
282 |     """
283 | 
284 |     if algorithm_param_weights is None:
285 |         algorithm_param_weights = {}
286 |     if algorithm_clus_kwargs is None:
287 |         algorithm_clus_kwargs = {}
288 |     if algorithm_parameters is None:
289 |         algorithm_parameters = {}
290 |     if metric_kwargs is None:
291 |         metric_kwargs = {}
292 |     if evaluation_methods is None:
293 |         evaluation_methods = inherent_metrics
294 | 
295 |     if algorithm_names in list(categories.keys()):
296 |         algorithm_names = categories[algorithm_names]
297 | 
298 |     clustering_labels = {}
299 |     clustering_labels_df = pd.DataFrame()
300 |     for clusterer_name in algorithm_names:
301 |         label_df = (
302 |             AutoClusterer(
303 |                 clusterer_name=clusterer_name,
304 |                 params_to_optimize=algorithm_parameters.get(clusterer_name, None),
305 |                 random_search=random_search,
306 |                 random_search_fraction=random_search_fraction,
307 |                 param_weights=algorithm_param_weights.get(clusterer_name, None),
308 |                 clus_kwargs=algorithm_clus_kwargs.get(clusterer_name, None),
309 |             )
310 |             .fit(data)
311 |             .labels_
312 |         )
313 |         label_df.index = pd.MultiIndex.from_tuples(label_df.index)
314 |         clustering_labels[clusterer_name] = label_df
315 | 
316 |         # Put all parameter labels into 1 for a big df
317 |         label_df = label_df.transpose()
318 |         cols_for_labels = label_df.index.to_frame()
319 | 
320 |         inds = cols_for_labels.apply(
321 |             lambda row: param_delim.join(
322 |                 [clusterer_name]
323 |                 + ["%s%s%s" % (k, val_delim, v) for k, v in row.to_dict().items()]
324 |             ),
325 |             axis=1,
326 |         )
327 | 
328 |         label_df.index = inds
329 |         label_df = label_df.transpose()
330 |         clustering_labels_df = pd.concat(
331 |             [clustering_labels_df, label_df], join="outer", axis=1
332 |         )
333 | 
334 |     evaluation_results_df = pd.DataFrame({"methods": evaluation_methods})
335 |     for col in clustering_labels_df.columns:
336 |         evaluation_results_df[col] = evaluation_results_df.apply(
337 |             lambda row: evaluate_results(
338 |                 clustering_labels_df[col],
339 |                 method=row["methods"],
340 |                 data=data,
341 |                 gold_standard=gold_standard,
342 |                 metric_kwargs=metric_kwargs.get(row["methods"], None),
343 |             ),
344 |             axis=1,
345 |         )
346 | 
347 |     return evaluation_results_df, clustering_labels_df, clustering_labels
348 | 
--------------------------------------------------------------------------------
/build/lib/hypercluster/constants.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | __doc__ = """
5 | Attributes:
6 |     param_delim: delimiter between hyperparameters for snakemake file labels and labels DataFrame \
7 | columns.
8 |     val_delim: delimiter between hyperparameter label and value for snakemake file labels and \
9 | labels DataFrame columns.
10 |     categories: Convenient groups of clusterers to use. If all samples need to be clustered, \
11 | 'partitioners' is a good choice. If there are millions of samples, 'fastest' might be a good \
12 | choice.
13 |     variables_to_optimize: Some default hyperparameters to optimize and value ranges for a \
14 | selection of commonly used clustering algorithms from sklearn. Used as defaults for \
15 | clustering.AutoClusterer and clustering.optimize_clustering.
16 |     need_ground_truth: list of sklearn metrics that need ground truth labeling. 
\ 17 | "adjusted_rand_score", "adjusted_mutual_info_score", "homogeneity_score", \ 18 | "completeness_score", "fowlkes_mallows_score", "mutual_info_score", "v_measure_score" 19 | inherent_metrics: list of sklearn metrics that need original data for calculation. \ 20 | "silhouette_score", "calinski_harabasz_score", "davies_bouldin_score", \ 21 | "smallest_largest_clusters_ratio", "number_of_clusters", "smallest_cluster_size", \ 22 | "largest_cluster_size" 23 | min_or_max: establishing whether each sklearn metric is better when minimized or maximized for \ 24 | clustering.pick_best_labels. 25 | """ 26 | param_delim = ";" 27 | val_delim = "-" 28 | 29 | slow = ["AffinityPropagation", "MeanShift"] 30 | fast = ["KMeans", "OPTICS", "HDBSCAN"] 31 | fastest = ["MiniBatchKMeans"] 32 | partitioners = ["AffinityPropagation", "MeanShift", "KMeans", "MiniBatchKMeans"] 33 | clusterers = ["OPTICS", "HDBSCAN"] 34 | categories = { 35 | "slow": slow, 36 | "fast": fast, 37 | "fastest": fastest, 38 | "partitioning": partitioners, 39 | "clustering": clusterers, 40 | } 41 | 42 | min_cluster_size = [i for i in range(2, 17, 2)] 43 | n_clusters = [i for i in range(2, 41)] 44 | damping = [i / 100 for i in range(55, 95, 5)] 45 | resolutions = [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6] 46 | knn = [20, 30, 60] 47 | 48 | 49 | variables_to_optimize = { 50 | "HDBSCAN": dict(min_cluster_size=min_cluster_size), 51 | "KMeans": dict(n_clusters=n_clusters), 52 | "MiniBatchKMeans": dict(n_clusters=n_clusters), 53 | "AffinityPropagation": dict(damping=damping), 54 | "MeanShift": dict(cluster_all=[False]), 55 | "OPTICS": dict(min_samples=min_cluster_size), 56 | "NMFCluster": dict(n_clusters=n_clusters), 57 | "LouvainCluster": dict(resolution=resolutions, k=knn), 58 | "LeidenCluster": dict(resolution=resolutions, k=knn), 59 | } 60 | 61 | 62 | need_ground_truth = [ 63 | "adjusted_rand_score", 64 | "adjusted_mutual_info_score", 65 | "homogeneity_score", 66 | "completeness_score", 67 | "fowlkes_mallows_score", 68 | "mutual_info_score", 69 | "v_measure_score", 70 | ] 71 | 72 | inherent_metrics = [ 73 | "silhouette_score", 74 | "calinski_harabasz_score", 75 | "davies_bouldin_score", 76 | "smallest_largest_clusters_ratio", 77 | "number_of_clusters", 78 | "smallest_cluster_size", 79 | "largest_cluster_size" 80 | ] 81 | 82 | min_or_max = { 83 | "adjusted_rand_score": 'max', 84 | "adjusted_mutual_info_score": 'max', 85 | "homogeneity_score": 'max', 86 | "completeness_score": 'max', 87 | "fowlkes_mallows_score": 'max', 88 | "silhouette_score": 'max', 89 | "calinski_harabasz_score": 'max', 90 | "davies_bouldin_score": 'min', 91 | "mutual_info_score": 'max', 92 | "v_measure_score": 'max', 93 | } 94 | 95 | pdist_adjacency_methods = [ 96 | 'braycurtis', 97 | 'canberra', 98 | 'chebyshev', 99 | 'cityblock', 100 | 'correlation', 101 | 'cosine', 102 | 'dice', 103 | 'euclidean', 104 | 'hamming', 105 | 'jaccard', 106 | 'jensenshannon', 107 | 'kulsinski', 108 | 'mahalanobis', 109 | 'matching', 110 | 'minkowski', 111 | 'rogerstanimoto', 112 | 'russellrao', 113 | 'seuclidean', 114 | 'sokalmichener', 115 | 'sokalsneath', 116 | 'sqeuclidean', 117 | 'yule' 118 | ] 119 | 120 | 121 | valid_partition_types = [ 122 | 'RBConfigurationVertexPartition', 123 | 'ModularityVertexPartition', 124 | 'RBERVertexPartition', 125 | 'CPMVertexPartition', 126 | 'SignificanceVertexPartition', 127 | 'SurpriseVertexPartition' 128 | ] -------------------------------------------------------------------------------- /build/lib/hypercluster/evaluations.py: 
-------------------------------------------------------------------------------- 1 | #TODO add label count evals here 2 | 3 | #TODO fn that grabs best params from col/yml in smk output, and feeds into clusterer. 4 | -------------------------------------------------------------------------------- /build/lib/hypercluster/metrics.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable 2 | from collections import Counter 3 | 4 | __doc__ = "More functions for evaluating clustering results." 5 | 6 | 7 | def number_clustered(_, labels: Iterable) -> float: 8 | return len(labels) 9 | 10 | 11 | def smallest_largest_clusters_ratio(_, labels: Iterable) -> float: 12 | counts = Counter(labels) 13 | counts.pop(-1, None) 14 | smallest = min(counts.values()) 15 | largest = max(counts.values()) 16 | return smallest / largest 17 | 18 | 19 | def smallest_cluster_ratio(_, labels: Iterable) -> float: 20 | counts = Counter(labels) 21 | counts.pop(-1, None) 22 | return min(counts.values()) / len(labels) 23 | -------------------------------------------------------------------------------- /build/lib/hypercluster/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/build/lib/hypercluster/tests/__init__.py -------------------------------------------------------------------------------- /build/lib/hypercluster/tests/test_clustering.py: -------------------------------------------------------------------------------- 1 | from hypercluster import utilities 2 | from hypercluster.constants import * 3 | import hypercluster 4 | import pandas as pd 5 | import numpy as np 6 | 7 | 8 | test_data = pd.DataFrame( 9 | np.array( 10 | [[1, 2], [-1.8, 4], [1, -0.5], 11 | [10, 2], [-10, 4], [10, 0], 12 | [np.nan, 5], [3.2, np.nan], [0, 14], 13 | [-16.4, 3.67], [13.22, -3], [3.3, np.nan], 14 | [42, np.nan], [-8, 2], [1.2, 12], 15 | [np.nan, 2.1], [0.25, np.nan], [0.1, 1.11], 16 | [-44, 0], [-0.22, -0.11], [2.34, 6.7], 17 | [-10, np.nan], [-2.3, -2.5], [np.nan, 0], 18 | [np.nan, 22], [8.6, -7.5], [0, 14], 19 | [-6.4, 23.67], [-3.22, 3], [np.nan, np.nan], 20 | [-20, 2.01], [0.25, -.25], [0.455, 0.233], 21 | [np.nan, -0.89], [19, np.nan], [np.nan, np.nan], 22 | [-29, 3.6], [-13, -3], [3.3, np.nan], 23 | [-4, np.nan], [-0.2, -0.1], [0.34, 0.7]] 24 | ) 25 | ) 26 | 27 | 28 | test_data['ind1'] = 'a' 29 | test_data['ind2'] = range(len(test_data)) 30 | test_data = test_data.set_index(['ind1', 'ind2']) 31 | test_data = test_data.fillna(test_data.median()) 32 | 33 | test_ground_truth = pd.Series( 34 | np.random.randint(0, 2, size=(len(test_data), )), 35 | index=test_data.index 36 | ) 37 | 38 | 39 | def test_cluster_one(): 40 | # Test all clusterers are working with default params 41 | for clus_name in variables_to_optimize.keys(): 42 | utilities.cluster(clus_name, test_data) 43 | 44 | # Test with putting extra params in there 45 | for clus_name in variables_to_optimize.keys(): 46 | vars = variables_to_optimize[clus_name] 47 | key = list(vars.keys())[0] 48 | params = {key: vars[key][0]} 49 | # grabbing a variable and making sure var passing works 50 | utilities.cluster(clus_name, test_data, params) 51 | 52 | 53 | def test_autoclusterer(): 54 | for clus_name in variables_to_optimize.keys(): 55 | hypercluster.AutoClusterer(clus_name).fit(test_data) 56 | for clus_name in variables_to_optimize.keys(): 57 | hypercluster.AutoClusterer(clus_name, 
random_search=False).fit(test_data) 58 | 59 | 60 | def test_param_weights(): 61 | for clus_name in variables_to_optimize.keys(): 62 | weights = { 63 | param: {value: (1/len(values)) for value in values} for param, values in 64 | variables_to_optimize[ 65 | clus_name 66 | ].items() 67 | } 68 | hypercluster.AutoClusterer(clus_name, param_weights=weights).fit( 69 | test_data 70 | ) 71 | for clus_name in variables_to_optimize.keys(): 72 | hypercluster.AutoClusterer(clus_name, random_search=False).fit(test_data) 73 | 74 | 75 | def test_passing_kwargs_for_a_clusterer(): 76 | clus_name = 'KMeans' 77 | 78 | hypercluster.AutoClusterer(clus_name, clus_kwargs={'max_iter': 50}).fit( 79 | test_data 80 | ) 81 | 82 | 83 | def test_evaluate_results(): 84 | labs = hypercluster.AutoClusterer('KMeans').fit(test_data).labels_ 85 | for metric in inherent_metrics + need_ground_truth: 86 | utilities.evaluate_one( 87 | labs[labs.columns[0]], metric, data=test_data, gold_standard=test_ground_truth 88 | ) 89 | 90 | 91 | def test_multiauto(): 92 | hypercluster.MultiAutoClusterer().fit(test_data).evaluate() 93 | -------------------------------------------------------------------------------- /build/lib/hypercluster/tests/test_visualize.py: -------------------------------------------------------------------------------- 1 | from hypercluster import visualize 2 | import hypercluster 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | test_data = pd.DataFrame( 8 | np.array( 9 | [[1, 2], [-1.8, 4], [1, -0.5], 10 | [10, 2], [-10, 4], [10, 0], 11 | [np.nan, 5], [3.2, np.nan], [0, 14], 12 | [-16.4, 3.67], [13.22, -3], [3.3, np.nan], 13 | [42, np.nan], [-8, 2], [1.2, 12], 14 | [np.nan, 2.1], [0.25, np.nan], [0.1, 1.11], 15 | [-44, 0], [-0.22, -0.11], [2.34, 6.7], 16 | [-10, np.nan], [-2.3, -2.5], [np.nan, 0], 17 | [np.nan, 22], [8.6, -7.5], [0, 14], 18 | [-6.4, 23.67], [-3.22, 3], [np.nan, np.nan], 19 | [-20, 2.01], [0.25, -.25], [0.455, 0.233], 20 | [np.nan, -0.89], [19, np.nan], [np.nan, np.nan], 21 | [-29, 3.6], [-13, -3], [3.3, np.nan], 22 | [-4, np.nan], [-0.2, -0.1], [0.34, 0.7]] 23 | ) 24 | ) 25 | 26 | 27 | test_data['ind1'] = 'a' 28 | test_data['ind2'] = range(len(test_data)) 29 | test_data = test_data.set_index(['ind1', 'ind2']) 30 | test_data = test_data.fillna(test_data.median()) 31 | 32 | test_ground_truth = pd.Series( 33 | np.random.randint(0, 2, size=(len(test_data), )), 34 | index=test_data.index 35 | ) 36 | 37 | 38 | def test_vis_eval(): 39 | clusterer = hypercluster.MultiAutoClusterer().fit(test_data).evaluate() 40 | visualize.visualize_evaluations(clusterer.evaluation_df) 41 | clusterer.visualize_evaluations( 42 | # savefig=True 43 | ) 44 | visualize.visualize_for_picking_labels( 45 | clusterer.evaluation_df, savefig_prefix='test_visualize_for_picking' 46 | ) 47 | 48 | clusterer = hypercluster.AutoClusterer().fit(test_data).evaluate() 49 | visualize.visualize_evaluations(clusterer.evaluation_df) 50 | clusterer.visualize_evaluations() 51 | 52 | 53 | def test_vis_sample(): 54 | clusterer = hypercluster.MultiAutoClusterer().fit(test_data).evaluate() 55 | visualize.visualize_sample_label_consistency(clusterer.labels_df) 56 | clusterer.visualize_sample_label_consistency() 57 | 58 | clusterer = hypercluster.AutoClusterer().fit(test_data).evaluate() 59 | visualize.visualize_sample_label_consistency(clusterer.labels_df) 60 | clusterer.visualize_sample_label_consistency() 61 | 62 | 63 | def test_vis_labels(): 64 | clusterer = hypercluster.MultiAutoClusterer().fit(test_data).evaluate() 65 | 
visualize.visualize_label_agreement(clusterer.labels_df) 66 | clusterer.visualize_label_agreement( 67 | savefig=True, 68 | ) 69 | 70 | clusterer = hypercluster.AutoClusterer().fit(test_data).evaluate() 71 | visualize.visualize_label_agreement(clusterer.labels_df) 72 | clusterer.visualize_label_agreement() -------------------------------------------------------------------------------- /build/lib/hypercluster/utilities.py: -------------------------------------------------------------------------------- 1 | from sklearn.cluster import * 2 | from sklearn.metrics import * 3 | from .additional_clusterers import * 4 | from .additional_metrics import * 5 | from pandas import DataFrame 6 | import pandas as pd 7 | import numpy as np 8 | import logging 9 | from typing import Optional, Iterable, Dict 10 | from .constants import * 11 | from hypercluster.constants import param_delim, val_delim 12 | 13 | 14 | def calculate_row_weights( 15 | row: Iterable, param_weights: dict, vars_to_optimize: dict 16 | ) -> float: 17 | """Used to select random rows of parameter combinations using individual parameter weights. 18 | 19 | Args: 20 | row (Iterable): Series of parameters, with parameter names as index. 21 | param_weights (dict): Dictionary of str: dictionaries. Ex format - {'parameter_name':{ \ 22 | 'param_option_1':0.5, 'param_option_2':0.5}}. 23 | vars_to_optimize (Iterable): Dictionary with possibilities for different parameters. Ex \ 24 | format - {'parameter_name':[1, 2, 3, 4, 5]}. 25 | 26 | Returns (float): 27 | Float representing the probability of seeing that combination of parameters, given their \ 28 | individual weights. 29 | 30 | """ 31 | param_weights.update({ 32 | param: { 33 | val: param_weights.get(param, {}).get( 34 | val, (1-sum(param_weights.get(param, {}).values()))/len([ 35 | notweighted for notweighted in vars_to_optimize.get(param, {}) 36 | if notweighted not in param_weights.get(param, {}).keys() 37 | ]) 38 | ) for val in vals 39 | } for param, vals in vars_to_optimize.items() 40 | }) 41 | 42 | return np.prod([param_weights[param][val] for param, val in row.to_dict().items()]) 43 | 44 | 45 | def cluster(clusterer_name: str, data: DataFrame, params: dict = {}): 46 | """Runs a given clusterer with a given set of parameters. 47 | 48 | Args: 49 | clusterer_name (str): String name of clusterer. 50 | data (DataFrame): Dataframe with elements to cluster as index and examples as columns. 51 | params (dict): Dictionary of parameter names and values to feed into clusterer. Default {} 52 | 53 | Returns: 54 | Instance of the clusterer fit with the data provided. 55 | """ 56 | clusterer = eval(clusterer_name)(**params) 57 | return clusterer.fit(data) 58 | 59 | 60 | def evaluate_one( 61 | labels: Iterable, 62 | method: str = "silhouette_score", 63 | data: Optional[DataFrame] = None, 64 | gold_standard: Optional[Iterable] = None, 65 | metric_kwargs: Optional[dict] = None, 66 | ) -> dict: 67 | """Uses a given metric to evaluate clustering results. 68 | 69 | Args: 70 | labels (Iterable): Series of labels. 71 | method (str): Str of name of evaluation to use. Default is silhouette. 72 | data (DataFrame): If using an inherent metric, must provide DataFrame with which to \ 73 | calculate the metric. 74 | gold_standard (Iterable): If using a metric that compares to ground truth, must provide a \ 75 | set of gold standard labels. 76 | metric_kwargs (dict): Additional kwargs to use in evaluation. 
77 | 78 | Returns (float): 79 | Metric value 80 | """ 81 | if isinstance(labels, pd.Series) is False: 82 | labels = pd.Series(labels) 83 | if len(labels[labels != -1].unique()) < 2: 84 | return np.nan 85 | 86 | if metric_kwargs is None: 87 | metric_kwargs = {} 88 | 89 | if method in need_ground_truth: 90 | if gold_standard is None: 91 | raise ValueError( 92 | "Chosen evaluation metric %s requires gold standard set." % method 93 | ) 94 | clustered = (gold_standard != -1) & (labels != -1) 95 | compare_to = gold_standard[clustered] 96 | 97 | elif method in inherent_metrics: 98 | if data is None: 99 | raise ValueError( 100 | "Chosen evaluation metric %s requires data input." % method 101 | ) 102 | clustered = labels != -1 103 | compare_to = data.loc[clustered] 104 | else: 105 | compare_to = None 106 | clustered = labels.index 107 | 108 | return eval(method)(compare_to, labels[clustered], **metric_kwargs) 109 | 110 | 111 | def generate_flattened_df(df_dict: Dict[str, DataFrame]) -> DataFrame: 112 | """Takes dictionary of results from many clusterers and makes 1 DataFrame. Opposite of \ 113 | convert_to_multiind. 114 | 115 | Args: 116 | df_dict (Dict[str, DataFrame]): Dictionary of dataframes to flatten. Can be .labels_ or \ 117 | .evaluations_ from MultiAutoClusterer. 118 | 119 | Returns: 120 | Flattened DataFrame with all data. 121 | """ 122 | merged_df = pd.DataFrame() 123 | for clus_name, df in df_dict.items(): 124 | df = df.transpose() 125 | cols_for_labels = df.index.to_frame() 126 | inds = cols_for_labels.apply( 127 | lambda row: param_delim.join( 128 | [clus_name] + ["%s%s%s" % (k, val_delim, v) for k, v in row.to_dict().items()] 129 | ), 130 | axis=1, 131 | ) 132 | df.index = inds 133 | df = df.transpose() 134 | 135 | merged_df = pd.concat( 136 | [merged_df, df], join="outer", axis=1 137 | ) 138 | return merged_df 139 | 140 | 141 | def convert_to_multiind(key: str, df: DataFrame) -> DataFrame: 142 | """Takes columns from a single clusterer from Clusterer.labels_df or .evaluation_df and 143 | converts to a multiindexed rather than collapsed into string. Equivalent to grabbing 144 | Clusterer.labels[clusterer] or .evaluations[clusterer]. Opposite of generate_flattened_df. 145 | 146 | Args: 147 | key (str): Name of clusterer, must match beginning of columns to convert. 148 | df (DataFrame): Dataframe to grab chunk from. 149 | 150 | Returns: 151 | Subset DataFrame with multiindex. 152 | 153 | """ 154 | clus_cols = [col for col in df.columns if col.split(param_delim, 1)[0] == key] 155 | temp = df[clus_cols].transpose() 156 | temp.index = pd.MultiIndex.from_frame( 157 | pd.DataFrame([{ 158 | s.split(val_delim, 1)[0]: s.split(val_delim, 1)[1] for s in i.split(param_delim)[1:] 159 | } for i in temp.index]).astype(float, errors='ignore') 160 | ) 161 | return temp.sort_index().transpose() 162 | 163 | 164 | def pick_best_labels( 165 | evaluation_results_df: DataFrame, 166 | clustering_labels_df: DataFrame, 167 | method: Optional[str] = None, 168 | min_or_max: Optional[str] = None 169 | ) -> Iterable: 170 | """From evaluations and a metric to minimize or maximize, return all labels with top pick. 171 | 172 | Args: 173 | evaluation_results_df (DataFrame): Evaluations DataFrame from optimize_clustering. 174 | clustering_labels_df (DataFrame): Labels DataFrame from optimize_clustering. 175 | method (str): Method with which to choose the best labels. 176 | min_or_max (str): Whether to minimize or maximize the metric. Must be 'min' or 'max'. 177 | Returns (DataFrame): 178 | DataFrame of all top labels. 
179 | """ 180 | if method is None: 181 | method = "silhouette_score" 182 | if min_or_max is None: 183 | min_or_max = 'max' 184 | 185 | best_labels = evaluation_results_df.loc[method, :] 186 | if min_or_max == 'min': 187 | best_labels = best_labels.index[best_labels == best_labels.min()] 188 | return clustering_labels_df[best_labels] 189 | elif min_or_max == 'max': 190 | best_labels = best_labels.index[best_labels == best_labels.max()] 191 | return clustering_labels_df[best_labels] 192 | logging.error('min_or_max must be either min or max, %s invalid choice' % min_or_max) 193 | 194 | 195 | -------------------------------------------------------------------------------- /build/lib/hypercluster/visualize.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | import logging 3 | from collections import Counter 4 | from itertools import cycle 5 | import numpy as np 6 | import matplotlib 7 | import matplotlib.pyplot as plt 8 | import seaborn as sns 9 | from pandas import DataFrame 10 | from scipy.cluster import hierarchy 11 | from scipy.spatial.distance import pdist 12 | from hypercluster.constants import param_delim 13 | from hypercluster.utilities import convert_to_multiind, evaluate_one 14 | 15 | matplotlib.rcParams["pdf.fonttype"] = 42 16 | matplotlib.rcParams["ps.fonttype"] = 42 17 | sns.set(font="arial", style="white", color_codes=True, font_scale=1.3) 18 | matplotlib.rcParams.update({"savefig.bbox": "tight"}) 19 | cmap = sns.cubehelix_palette( 20 | start=0, 21 | rot=0.4, 22 | gamma=1.0, 23 | hue=0.82, 24 | light=1, 25 | dark=0, 26 | reverse=False, 27 | as_cmap=True 28 | ) 29 | cmap.set_over('black') 30 | cmap.set_under('white') 31 | cmap.set_bad("#DAE0E6") 32 | 33 | 34 | def zscore(df): 35 | """Row zscores a DataFrame, ignores np.nan 36 | 37 | Args: 38 | df (DataFrame): DataFrame to z-score 39 | 40 | Returns (DataFrame): 41 | Row-zscored DataFrame. 42 | """ 43 | return df.subtract(df.mean(axis=1), axis=0).divide(df.std(axis=1), axis=0) 44 | 45 | 46 | def compute_order( 47 | df, 48 | dist_method: str = "euclidean", 49 | cluster_method: str = "average" 50 | ): 51 | """Gives hierarchical clustering order for the rows of a DataFrame 52 | 53 | Args: 54 | df (DataFrame): DataFrame with rows to order. 55 | dist_method (str): Distance method to pass to scipy.cluster.hierarchy.linkage. 56 | cluster_method (str): Clustering method to pass to scipy.spatial.distance.pdist. 57 | 58 | Returns (pandas.Index): 59 | Ordered row index. 60 | 61 | """ 62 | dist_mat = pdist(df, metric=dist_method) 63 | link_mat = hierarchy.linkage(dist_mat, method=cluster_method) 64 | 65 | return df.index[hierarchy.leaves_list(hierarchy.optimal_leaf_ordering(link_mat, dist_mat))] 66 | 67 | 68 | def visualize_evaluations( 69 | evaluations_df: DataFrame, 70 | savefig: bool = False, 71 | output_prefix: str = "evaluations", 72 | **heatmap_kws 73 | ) -> List[matplotlib.axes.Axes]: 74 | """Makes a z-scored visualization of all evaluations. 75 | 76 | Args: 77 | evaluations_df (DataFrame): Evaluations dataframe from clustering.optimize_clustering 78 | output_prefix (str): If saving a figure, file prefix to use. 79 | savefig (bool): Whether to save a pdf 80 | **heatmap_kws: Additional keyword arguments to pass to seaborn.heatmap. 81 | 82 | Returns (List[matplotlib.axes.Axes]): 83 | List of all matplotlib axes. 
84 | 85 | """ 86 | clusterers = sorted( 87 | list(set([i.split(param_delim, 1)[0] for i in evaluations_df.columns])) 88 | ) 89 | width_ratios = [ 90 | dict( 91 | Counter( 92 | [i.split(param_delim, 1)[0] for i in evaluations_df.columns] 93 | ) 94 | )[clus] 95 | for clus in clusterers 96 | ] 97 | 98 | evaluations_df = zscore(evaluations_df) 99 | width = 0.18 * (len(evaluations_df.columns) + 2 + (0.01 * (len(clusterers) - 1))) 100 | height = 0.22 * (len(evaluations_df)) 101 | 102 | fig, axs = plt.subplots( 103 | figsize=(width, height), 104 | nrows=1, 105 | ncols=(len(clusterers) + 1), 106 | gridspec_kw=dict( 107 | width_ratios=width_ratios + [2], 108 | wspace=0.01, 109 | left=0, 110 | right=1, 111 | top=1, 112 | bottom=0, 113 | ), 114 | ) 115 | vmin = np.nanquantile(evaluations_df, 0.1) 116 | vmax = np.nanquantile(evaluations_df, 0.9) 117 | 118 | heatmap_kws['cmap'] = heatmap_kws.get('cmap', cmap) 119 | heatmap_kws['vmin'] = heatmap_kws.get('vmin', vmin) 120 | heatmap_kws['vmax'] = heatmap_kws.get('vmax', vmax) 121 | 122 | for i, clus in enumerate(clusterers): 123 | temp = convert_to_multiind(clus, evaluations_df) 124 | 125 | ax = axs[i] 126 | sns.heatmap( 127 | temp, 128 | ax=ax, 129 | yticklabels=temp.index, 130 | xticklabels=["-".join([str(i) for i in col]) for col in temp.columns], 131 | cbar_ax=axs[-1], 132 | cbar_kws=dict(label="z-score"), 133 | **heatmap_kws 134 | ) 135 | ax.set_ylabel("") 136 | ax.set_title(clus) 137 | ax.set_yticklabels([]) 138 | 139 | axs[0].set_ylabel("evaluation method") 140 | axs[0].set_yticklabels(temp.index, rotation=0) 141 | if savefig: 142 | plt.savefig("%s.pdf" % output_prefix) 143 | return axs 144 | 145 | 146 | def visualize_pairwise( 147 | df: DataFrame, 148 | savefig: bool = False, 149 | output_prefix: Optional[str] = None, 150 | method: Optional[str] = None, 151 | **heatmap_kws 152 | ) -> List[matplotlib.axes.Axes]: 153 | """Visualize symmetrical square DataFrames. 154 | 155 | Args: 156 | df (DataFrame): DataFrame to visualize. 157 | savefig (bool): Whether to save a pdf. 158 | output_prefix (str): If saving a pdf, file prefix to use. 159 | method (str): Label for cbar, if relevant. 160 | **heatmap_kws: Additional keywords to pass to `seaborn.heatmap`_ 161 | 162 | Returns (List[matplotlib.axes.Axes]): 163 | List of matplotlib axes for figure. 164 | 165 | .. 
_seaborn.heatmap: 166 | https://seaborn.pydata.org/generated/seaborn.heatmap.html 167 | """ 168 | heatmap_kws = {**heatmap_kws} 169 | 170 | vmin = np.nanquantile(df, 0.1) 171 | vmax = np.nanquantile(df, 0.9) 172 | 173 | heatmap_kws['cmap'] = heatmap_kws.get('cmap', cmap) 174 | heatmap_kws['vmin'] = heatmap_kws.get('vmin', vmin) 175 | heatmap_kws['vmax'] = heatmap_kws.get('vmax', vmax) 176 | cbar_kws = heatmap_kws.get('cbar_kws', {}) 177 | cbar_kws['label'] = cbar_kws.get('label', method) 178 | heatmap_kws['cbar_kws'] = cbar_kws 179 | 180 | cbar_ratio = 2 181 | wspace = 0.01 182 | height = 0.18 * len(df) 183 | width = 0.18 * (len(df.columns)+cbar_ratio+wspace) 184 | fig, axs = plt.subplots( 185 | figsize=(width, height), 186 | nrows=1, 187 | ncols=2, 188 | gridspec_kw=dict( 189 | width_ratios=[len(df.columns), cbar_ratio], 190 | wspace=wspace, 191 | left=0, 192 | right=1, 193 | top=1, 194 | bottom=0, 195 | ) 196 | ) 197 | try: 198 | order = compute_order(df.fillna(df.median())) 199 | except ValueError: 200 | order = df.index 201 | df = df.loc[order, order] 202 | sns.heatmap( 203 | df, 204 | xticklabels=order, 205 | yticklabels=order, 206 | ax=axs[0], 207 | cbar_ax=axs[1], 208 | **heatmap_kws 209 | ) 210 | if savefig: 211 | if output_prefix is None: 212 | output_prefix = "heatmap.pairwise" 213 | plt.savefig('%s.pdf' % output_prefix) 214 | 215 | return axs 216 | 217 | 218 | def visualize_label_agreement( 219 | labels: DataFrame, 220 | method: Optional[str] = None, 221 | savefig: bool = False, 222 | output_prefix: Optional[str] = None, 223 | **heatmap_kws 224 | ) -> List[matplotlib.axes.Axes]: 225 | """Visualize similarity between clustering results given an evaluation metric. 226 | 227 | Args: 228 | labels (DataFrame): Labels DataFrame, e.g. from optimize_clustering or \ 229 | AutoClusterer.labels_ 230 | method (str): Method with which to compare labels. Must be a metric like the ones in \ 231 | constants.need_ground_truth, which takes two sets of labels. 232 | savefig (bool): Whether to save a pdf. 233 | output_prefix (str): If saving a pdf, file prefix to use. 234 | **heatmap_kws: Additional keywords to pass to `seaborn.heatmap`_ 235 | 236 | Returns (List[matplotlib.axes.Axes]): 237 | List of matplotlib axes 238 | 239 | .. _seaborn.heatmap: 240 | https://seaborn.pydata.org/generated/seaborn.heatmap.html 241 | """ 242 | if savefig and output_prefix is None: 243 | output_prefix = 'heatmap.labels.pairwise' 244 | if method is None: 245 | method = 'adjusted_rand_score' 246 | 247 | labels = labels.corr( 248 | lambda x, y: evaluate_one(x, method=method, gold_standard=y) 249 | ) 250 | return visualize_pairwise(labels, savefig, output_prefix, method=method, **heatmap_kws) 251 | 252 | 253 | def visualize_sample_label_consistency( 254 | labels: DataFrame, 255 | savefig: bool = False, 256 | output_prefix: Optional[str] = None, 257 | **heatmap_kws 258 | ) -> List[matplotlib.axes.Axes]: 259 | """Visualize how often two samples are labeled in the same group across conditions. Interpret 260 | with care--if you use more conditions for some type of clusterers, e.g. more n_clusters for 261 | KMeans, those cluster more similarly across conditions than between clusterers. This means 262 | that more agreement in labeling could be due to the choice of clusterers rather than true 263 | similarity between samples. 264 | 265 | Args: 266 | labels (DataFrame): Labels DataFrame, e.g. from optimize_clustering or \ 267 | AutoClusterer.labels_ 268 | savefig (bool): Whether to save a pdf. 
269 |         output_prefix (str): If saving a pdf, file prefix to use. 
270 |         **heatmap_kws: Additional keywords to pass to `seaborn.heatmap`_ 
271 | 
272 |     Returns (List[matplotlib.axes.Axes]): 
273 |         List of matplotlib axes 
274 | 
275 |     .. _seaborn.heatmap: 
276 |         https://seaborn.pydata.org/generated/seaborn.heatmap.html 
277 | 
278 |     """ 
279 |     if savefig and output_prefix is None: 
280 |         output_prefix = "heatmap.sample.pairwise" 
281 |     labels = labels.transpose().corr(lambda x, y: sum( 
282 |         np.equal(x[((x != -1) | (y != -1))], y[((x != -1) | (y != -1))]) 
283 |     )) 
284 |     return visualize_pairwise(labels, savefig, output_prefix, method='# same label', **heatmap_kws) 
285 | 
286 | 
287 | def visualize_for_picking_labels( 
288 |     evaluation_df: DataFrame, 
289 |     method: Optional[str] = None, 
290 |     savefig_prefix: Optional[str] = None 
291 | ): 
292 |     """Generates graphs similar to a `scree graph`_ for PCA for each parameter and each clusterer. 
293 | 
294 |     Args: 
295 |         evaluation_df (DataFrame): DataFrame of evaluations to visualize. Clusterer.evaluation_df. 
296 |         method (str): Which metric to visualize. 
297 |         savefig_prefix (str): If not None, save a figure with the given prefix. 
298 | 
299 |     Returns: 
300 |         matplotlib axes. 
301 |     .. _scree graph: 
302 |         https://en.wikipedia.org/wiki/Scree_plot 
303 |     """ 
304 |     if method is None: 
305 |         method = "silhouette_score" 
306 |     cluss_temp = list(set([i.split(param_delim, 1)[0] for i in evaluation_df.columns])) 
307 |     # get figure dimensions 
308 |     ncols = 0 
309 |     cluss = [] 
310 |     for ploti, clus in enumerate(cluss_temp): 
311 |         scores = convert_to_multiind( 
312 |             clus, evaluation_df.loc[[method], :] 
313 |         ).transpose().dropna(how='any') 
314 |         if len(scores) == 0: 
315 |             logging.error( 
316 |                 'Score %s is missing for clusterer %s, skipping visualization' % (method, clus) 
317 |             ) 
318 |             continue 
319 |         indep = scores.index.to_frame().reset_index(drop=True) 
320 |         try: 
321 |             indep.astype(float)  # probe: raises if any parameter is non-numeric 
322 |         except (ValueError, AssertionError): 
323 |             logging.error('Cannot convert %s data to floats, skipping visualization' % clus) 
324 |             continue 
325 |         cluss.append(clus) 
326 |         if scores.index.nlevels > ncols: 
327 |             ncols = scores.index.nlevels 
328 |     if not cluss: 
329 |         logging.error('No valid clusterers, cannot visualize.') 
330 |         return None 
331 |     cluss.sort() 
332 | 
333 |     ybuff = np.abs(np.nanquantile(evaluation_df.loc[method], 0.05)) 
334 |     ylim = (evaluation_df.loc[method].min() - ybuff, evaluation_df.loc[method].max() + ybuff) 
335 |     colors = cycle(sns.color_palette('twilight', n_colors=len(cluss) * ncols)) 
336 |     fig = plt.figure(figsize=(5 * ncols, 5 * len(cluss))) 
337 |     gs = plt.GridSpec(nrows=len(cluss), ncols=ncols, wspace=0.25, hspace=0.25) 
338 |     for ploti, clus in enumerate(cluss): 
339 |         scores = convert_to_multiind( 
340 |             clus, evaluation_df.loc[[method], :] 
341 |         ).transpose().dropna(how='any') 
342 |         indep = scores.index.to_frame().reset_index(drop=True) 
343 | 
344 |         for whcol, col in enumerate(indep.columns): 
345 |             if whcol == 0: 
346 |                 saveax = plt.subplot(gs[ploti, whcol]) 
347 |                 ax = saveax 
348 |                 ax.set_ylim(ylim) 
349 |                 ax.set_ylabel(clus) 
350 |             else: 
351 |                 ax = plt.subplot(gs[ploti, whcol], sharey=saveax) 
352 |             color = next(colors) 
353 | 
354 |             # plot eval results 
355 |             sns.regplot( 
356 |                 x=indep[col], 
357 |                 y=scores[method].values, 
358 |                 color=color, 
359 |                 ax=ax, 
360 |                 lowess=True,  # scores are continuous and unbounded, so a lowess trend is fit rather than a logistic curve 
361 |             ) 
362 | 
363 |     axs = fig.get_axes() 
364 |     axs[0].set_title('%s results per parameter' % method, ha='left') 
365 |     if savefig_prefix: 
366 |         plt.savefig('%s.pdf' % savefig_prefix) 
367 |     return axs 
-------------------------------------------------------------------------------- 
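A quick usage sketch for the module above (illustrative only; it assumes `data` is a
samples-by-features DataFrame, and mirrors the calls exercised in the tests below):

    import hypercluster
    from hypercluster import visualize

    clusterer = hypercluster.MultiAutoClusterer().fit(data).evaluate()
    visualize.visualize_evaluations(clusterer.evaluation_df)         # z-scored heatmap of every metric
    visualize.visualize_label_agreement(clusterer.labels_df)         # pairwise similarity of labeling results
    visualize.visualize_for_picking_labels(clusterer.evaluation_df)  # scree-like plots per hyperparameter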
/build/lib/tests/__init__.py: 
-------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/build/lib/tests/__init__.py 
-------------------------------------------------------------------------------- 
/build/lib/tests/test_clustering.py: 
-------------------------------------------------------------------------------- 
1 | from hypercluster import utilities 
2 | from hypercluster.constants import * 
3 | import hypercluster 
4 | import pandas as pd 
5 | import numpy as np 
6 | 
7 | 
8 | test_data = pd.DataFrame( 
9 |     np.array( 
10 |         [[1, 2], [-1.8, 4], [1, -0.5], 
11 |          [10, 2], [-10, 4], [10, 0], 
12 |          [np.nan, 5], [3.2, np.nan], [0, 14], 
13 |          [-16.4, 3.67], [13.22, -3], [3.3, np.nan], 
14 |          [42, np.nan], [-8, 2], [1.2, 12], 
15 |          [np.nan, 2.1], [0.25, np.nan], [0.1, 1.11], 
16 |          [-44, 0], [-0.22, -0.11], [2.34, 6.7], 
17 |          [-10, np.nan], [-2.3, -2.5], [np.nan, 0], 
18 |          [np.nan, 22], [8.6, -7.5], [0, 14], 
19 |          [-6.4, 23.67], [-3.22, 3], [np.nan, np.nan], 
20 |          [-20, 2.01], [0.25, -.25], [0.455, 0.233], 
21 |          [np.nan, -0.89], [19, np.nan], [np.nan, np.nan], 
22 |          [-29, 3.6], [-13, -3], [3.3, np.nan], 
23 |          [-4, np.nan], [-0.2, -0.1], [0.34, 0.7]] 
24 |     ) 
25 | ) 
26 | 
27 | 
28 | test_data['ind1'] = 'a' 
29 | test_data['ind2'] = range(len(test_data)) 
30 | test_data = test_data.set_index(['ind1', 'ind2']) 
31 | test_data = test_data.fillna(test_data.median()) 
32 | 
33 | test_ground_truth = pd.Series( 
34 |     np.random.randint(0, 2, size=(len(test_data), )), 
35 |     index=test_data.index 
36 | ) 
37 | 
38 | 
39 | def test_cluster_one(): 
40 |     # Test all clusterers are working with default params 
41 |     for clus_name in variables_to_optimize.keys(): 
42 |         utilities.cluster(clus_name, test_data) 
43 | 
44 |     # Test with putting extra params in there 
45 |     for clus_name in variables_to_optimize.keys(): 
46 |         opts = variables_to_optimize[clus_name] 
47 |         key = list(opts.keys())[0] 
48 |         params = {key: opts[key][0]} 
49 |         # grabbing a variable and making sure var passing works 
50 |         utilities.cluster(clus_name, test_data, params) 
51 | 
52 | 
53 | def test_autoclusterer(): 
54 |     for 
clus_name in variables_to_optimize.keys(): 55 | hypercluster.AutoClusterer(clus_name).fit(test_data) 56 | for clus_name in variables_to_optimize.keys(): 57 | hypercluster.AutoClusterer(clus_name, random_search=False).fit(test_data) 58 | 59 | 60 | def test_param_weights(): 61 | for clus_name in variables_to_optimize.keys(): 62 | weights = { 63 | param: {value: (1/len(values)) for value in values} for param, values in 64 | variables_to_optimize[ 65 | clus_name 66 | ].items() 67 | } 68 | hypercluster.AutoClusterer(clus_name, param_weights=weights).fit( 69 | test_data 70 | ) 71 | for clus_name in variables_to_optimize.keys(): 72 | hypercluster.AutoClusterer(clus_name, random_search=False).fit(test_data) 73 | 74 | 75 | def test_passing_kwargs_for_a_clusterer(): 76 | clus_name = 'KMeans' 77 | 78 | hypercluster.AutoClusterer(clus_name, clus_kwargs={'max_iter': 50}).fit( 79 | test_data 80 | ) 81 | 82 | 83 | def test_evaluate_results(): 84 | labs = hypercluster.AutoClusterer('KMeans').fit(test_data).labels_ 85 | for metric in inherent_metrics + need_ground_truth: 86 | utilities.evaluate_one( 87 | labs[labs.columns[0]], metric, data=test_data, gold_standard=test_ground_truth 88 | ) 89 | 90 | 91 | def test_multiauto(): 92 | hypercluster.MultiAutoClusterer().fit(test_data).evaluate() 93 | -------------------------------------------------------------------------------- /build/lib/tests/test_snakemake.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | def test_run_snakemake_all(): 4 | # subprocess.run( 5 | # ['touch', 'test_input.txt'] 6 | # ) 7 | subprocess.run( 8 | ['snakemake', '-s', 'hypercluster.smk', '--config', 'input_data_files=test_input'] 9 | ) -------------------------------------------------------------------------------- /build/lib/tests/test_visualize.py: -------------------------------------------------------------------------------- 1 | from hypercluster import visualize 2 | import hypercluster 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | test_data = pd.DataFrame( 8 | np.array( 9 | [[1, 2], [-1.8, 4], [1, -0.5], 10 | [10, 2], [-10, 4], [10, 0], 11 | [np.nan, 5], [3.2, np.nan], [0, 14], 12 | [-16.4, 3.67], [13.22, -3], [3.3, np.nan], 13 | [42, np.nan], [-8, 2], [1.2, 12], 14 | [np.nan, 2.1], [0.25, np.nan], [0.1, 1.11], 15 | [-44, 0], [-0.22, -0.11], [2.34, 6.7], 16 | [-10, np.nan], [-2.3, -2.5], [np.nan, 0], 17 | [np.nan, 22], [8.6, -7.5], [0, 14], 18 | [-6.4, 23.67], [-3.22, 3], [np.nan, np.nan], 19 | [-20, 2.01], [0.25, -.25], [0.455, 0.233], 20 | [np.nan, -0.89], [19, np.nan], [np.nan, np.nan], 21 | [-29, 3.6], [-13, -3], [3.3, np.nan], 22 | [-4, np.nan], [-0.2, -0.1], [0.34, 0.7]] 23 | ) 24 | ) 25 | 26 | 27 | test_data['ind1'] = 'a' 28 | test_data['ind2'] = range(len(test_data)) 29 | test_data = test_data.set_index(['ind1', 'ind2']) 30 | test_data = test_data.fillna(test_data.median()) 31 | 32 | test_ground_truth = pd.Series( 33 | np.random.randint(0, 2, size=(len(test_data), )), 34 | index=test_data.index 35 | ) 36 | 37 | 38 | def test_vis_eval(): 39 | clusterer = hypercluster.MultiAutoClusterer().fit(test_data).evaluate() 40 | visualize.visualize_evaluations(clusterer.evaluation_df) 41 | clusterer.visualize_evaluations() 42 | # 43 | clusterer = hypercluster.AutoClusterer().fit(test_data).evaluate() 44 | visualize.visualize_evaluations(clusterer.evaluation_df) 45 | clusterer.visualize_evaluations() 46 | 47 | 48 | def test_vis_sample(): 49 | clusterer = 
hypercluster.MultiAutoClusterer().fit(test_data).evaluate() 50 | visualize.visualize_sample_label_consistency(clusterer.labels_df) 51 | clusterer.visualize_sample_label_consistency() 52 | 53 | clusterer = hypercluster.AutoClusterer().fit(test_data).evaluate() 54 | visualize.visualize_sample_label_consistency(clusterer.labels_df) 55 | clusterer.visualize_sample_label_consistency() 56 | 57 | 58 | def test_vis_labels(): 59 | clusterer = hypercluster.MultiAutoClusterer().fit(test_data).evaluate() 60 | visualize.visualize_label_agreement(clusterer.labels_df) 61 | clusterer.visualize_label_agreement( 62 | # savefig=True, 63 | # output_prefix='test_input/test_label_agreement_multi' 64 | ) 65 | 66 | clusterer = hypercluster.AutoClusterer().fit(test_data).evaluate() 67 | visualize.visualize_label_agreement(clusterer.labels_df) 68 | clusterer.visualize_label_agreement( 69 | # savefig=True, 70 | # output_prefix='test_input/test_label_agreement_auto' 71 | ) 72 | -------------------------------------------------------------------------------- /dist/hypercluster-0.0.1-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.0.1-py3-none-any.whl -------------------------------------------------------------------------------- /dist/hypercluster-0.0.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.0.1.tar.gz -------------------------------------------------------------------------------- /dist/hypercluster-0.1.0-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.0-py3-none-any.whl -------------------------------------------------------------------------------- /dist/hypercluster-0.1.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.0.tar.gz -------------------------------------------------------------------------------- /dist/hypercluster-0.1.1-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.1-py3-none-any.whl -------------------------------------------------------------------------------- /dist/hypercluster-0.1.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.1.tar.gz -------------------------------------------------------------------------------- /dist/hypercluster-0.1.10-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.10-py3-none-any.whl -------------------------------------------------------------------------------- /dist/hypercluster-0.1.10.tar.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.10.tar.gz -------------------------------------------------------------------------------- /dist/hypercluster-0.1.12-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.12-py3-none-any.whl -------------------------------------------------------------------------------- /dist/hypercluster-0.1.12.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.12.tar.gz -------------------------------------------------------------------------------- /dist/hypercluster-0.1.13-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.13-py3-none-any.whl -------------------------------------------------------------------------------- /dist/hypercluster-0.1.13.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.13.tar.gz -------------------------------------------------------------------------------- /dist/hypercluster-0.1.2-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.2-py3-none-any.whl -------------------------------------------------------------------------------- /dist/hypercluster-0.1.2.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.2.tar.gz -------------------------------------------------------------------------------- /dist/hypercluster-0.1.3-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.3-py3-none-any.whl -------------------------------------------------------------------------------- /dist/hypercluster-0.1.3.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.3.tar.gz -------------------------------------------------------------------------------- /dist/hypercluster-0.1.5-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.5-py3-none-any.whl -------------------------------------------------------------------------------- /dist/hypercluster-0.1.5.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.5.tar.gz -------------------------------------------------------------------------------- 
/dist/hypercluster-0.1.6-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.6-py3-none-any.whl -------------------------------------------------------------------------------- /dist/hypercluster-0.1.6.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.6.tar.gz -------------------------------------------------------------------------------- /dist/hypercluster-0.1.7-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.7-py3-none-any.whl -------------------------------------------------------------------------------- /dist/hypercluster-0.1.7.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.7.tar.gz -------------------------------------------------------------------------------- /dist/hypercluster-0.1.8-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.8-py3-none-any.whl -------------------------------------------------------------------------------- /dist/hypercluster-0.1.8.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.8.tar.gz -------------------------------------------------------------------------------- /dist/hypercluster-0.1.9-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.9-py3-none-any.whl -------------------------------------------------------------------------------- /dist/hypercluster-0.1.9.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/dist/hypercluster-0.1.9.tar.gz -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = . 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | import sys 15 | sys.path.insert(0, '../.') 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'Hypercluster' 21 | copyright = '2019, Ruggleslab' 22 | author = 'Ruggleslab' 23 | 24 | # The short X.Y version 25 | version = '0.0.2' 26 | 27 | # The full version, including alpha/beta/rc tags 28 | release = '0.0.2' 29 | 30 | 31 | # -- General configuration --------------------------------------------------- 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 35 | # ones. 36 | extensions = [ 37 | 'sphinx.ext.autodoc', 38 | 'sphinx.ext.viewcode', 39 | 'sphinx.ext.todo', 40 | 'sphinx.ext.napoleon' 41 | ] 42 | 43 | # Add any paths that contain templates here, relative to this directory. 44 | templates_path = ['_templates'] 45 | 46 | # The language for content autogenerated by Sphinx. Refer to documentation 47 | # for a list of supported languages. 48 | # 49 | # This is also used if you do content translation via gettext catalogs. 50 | # Usually you set "language" from the command line for these cases. 51 | language = 'en' 52 | 53 | # List of patterns, relative to source directory, that match files and 54 | # directories to ignore when looking for source files. 55 | # This pattern also affects html_static_path and html_extra_path. 56 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 57 | 58 | 59 | # -- Options for HTML output ------------------------------------------------- 60 | 61 | # The theme to use for HTML and HTML Help pages. See the documentation for 62 | # a list of builtin themes. 63 | # 64 | html_theme = 'sphinx_rtd_theme' 65 | html_theme_options = { 66 | 'navigation_depth': 4 67 | } 68 | 69 | # Add any paths that contain custom static files (such as style sheets) here, 70 | # relative to this directory. They are copied after the builtin static files, 71 | # so a file named "default.css" will overwrite the builtin "default.css". 72 | html_static_path = ['_static'] 73 | 74 | 75 | # -- Extension configuration ------------------------------------------------- 76 | 77 | # -- Options for todo extension ---------------------------------------------- 78 | 79 | # If true, `todo` and `todoList` produce output, else they produce nothing. 80 | todo_include_todos = False 81 | -------------------------------------------------------------------------------- /docs/hypercluster.rst: -------------------------------------------------------------------------------- 1 | hypercluster package 2 | ==================== 3 | 4 | .. 
automodule:: hypercluster 
5 |     :members: 
6 |     :undoc-members: 
7 |     :show-inheritance: 
8 | 
9 | hypercluster.classes module 
10 | --------------------------- 
11 | 
12 | .. automodule:: hypercluster.classes 
13 |     :members: 
14 |     :undoc-members: 
15 |     :show-inheritance: 
16 | 
17 | hypercluster.utilities module 
18 | ----------------------------- 
19 | 
20 | .. automodule:: hypercluster.utilities 
21 |     :members: 
22 |     :undoc-members: 
23 |     :show-inheritance: 
24 | 
25 | hypercluster.visualize module 
26 | ----------------------------- 
27 | 
28 | .. automodule:: hypercluster.visualize 
29 |     :members: 
30 |     :undoc-members: 
31 |     :show-inheritance: 
32 | 
33 | hypercluster.constants module 
34 | ----------------------------- 
35 | 
36 | .. automodule:: hypercluster.constants 
37 |     :members: 
38 |     :undoc-members: 
39 |     :show-inheritance: 
40 | 
41 | hypercluster.additional\_clusterers module 
42 | ------------------------------------------ 
43 | 
44 | .. automodule:: hypercluster.additional_clusterers 
45 |     :members: 
46 |     :undoc-members: 
47 |     :show-inheritance: 
48 | 
49 | hypercluster.additional\_metrics module 
50 | --------------------------------------- 
51 | 
52 | .. automodule:: hypercluster.additional_metrics 
53 |     :members: 
54 |     :undoc-members: 
55 |     :show-inheritance: 
56 | 
-------------------------------------------------------------------------------- 
/docs/index.rst: 
-------------------------------------------------------------------------------- 
1 | .. Hypercluster documentation master file, created by 
2 |    sphinx-quickstart on Mon Dec 30 16:45:25 2019. 
3 |    You can adapt this file completely to your liking, but it should at least 
4 |    contain the root `toctree` directive. 
5 | 
6 | Documentation for hypercluster 
7 | ============================== 
8 | 
9 | .. toctree:: 
10 |    :hidden: 
11 | 
12 |    self 
13 | 
14 | .. toctree:: 
15 |    :maxdepth: 4 
16 |    :caption: Contents: 
17 | 
18 |    hypercluster 
19 |    snakemake 
20 | 
21 | 
22 | Indices and tables 
23 | ================== 
24 | 
25 | * :ref:`genindex` 
26 | * :ref:`modindex` 
27 | * :ref:`search` 
28 | 
29 | 
30 | Installation and logistics 
31 | ========================== 
32 | 
33 | ************ 
34 | Installation 
35 | ************ 
36 | 
37 | Available via pip:: 
38 | 
39 |    pip install hypercluster 
40 | 
41 | Or bioconda:: 
42 | 
43 |    conda install hypercluster 
44 |    # or 
45 |    conda install -c conda-forge -c bioconda hypercluster 
46 | 
47 | If you are having problems installing with conda, try changing your channel priority. Priority of 
48 | conda-forge > bioconda > defaults is recommended. 
49 | 
50 | To check channel priority: :code:`conda config --get channels` 
51 | 
52 | 
53 | It should look like:: 
54 | 
55 |    --add channels 'defaults'   # lowest priority 
56 |    --add channels 'bioconda' 
57 |    --add channels 'conda-forge'   # highest priority 
58 | 
59 | If it doesn't look like that, try:: 
60 | 
61 |    conda config --add channels bioconda 
62 |    conda config --add channels conda-forge 
63 | 
64 | ********************************************* 
65 | Quick reference for clustering and evaluation 
66 | ********************************************* 
67 | 
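These same names are exposed programmatically, which is handy when assembling
``evaluate`` calls or config files (a quick sketch; it assumes the ``constants``
module of your installed version matches the tables below):

.. code-block:: python

    import hypercluster

    # clusterer name -> hyperparameters that are optimized by default
    print(hypercluster.constants.variables_to_optimize.keys())
    # evaluation metrics, split by whether they need a gold standard
    print(hypercluster.constants.inherent_metrics)
    print(hypercluster.constants.need_ground_truth)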
68 | .. list-table:: Clustering algorithms 
69 |    :widths: 50 50 
70 |    :header-rows: 1 
71 | 
72 |    * - Clusterer 
73 |      - Type 
74 |    * - KMeans/MiniBatch KMeans 
75 |      - Partitioner 
76 |    * - Affinity Propagation 
77 |      - Partitioner 
78 |    * - Mean Shift 
79 |      - Partitioner 
80 |    * - DBSCAN 
81 |      - Clusterer 
82 |    * - OPTICS 
83 |      - Clusterer 
84 |    * - Birch 
85 |      - Partitioner 
86 |    * - HDBSCAN 
87 |      - Clusterer 
88 |    * - NMF 
89 |      - Partitioner 
90 |    * - LouvainCluster 
91 |      - Partitioner 
92 |    * - LeidenCluster 
93 |      - Partitioner 
94 | 
95 | 
96 | .. list-table:: Evaluations 
97 |    :widths: 50 50 
98 |    :header-rows: 1 
99 | 
100 |    * - Metric 
101 |      - Type 
102 |    * - adjusted_rand_score 
103 |      - Needs ground truth 
104 |    * - adjusted_mutual_info_score 
105 |      - Needs ground truth 
106 |    * - homogeneity_score 
107 |      - Needs ground truth 
108 |    * - completeness_score 
109 |      - Needs ground truth 
110 |    * - fowlkes_mallows_score 
111 |      - Needs ground truth 
112 |    * - mutual_info_score 
113 |      - Needs ground truth 
114 |    * - v_measure_score 
115 |      - Needs ground truth 
116 |    * - silhouette_score 
117 |      - Inherent metric 
118 |    * - calinski_harabasz_score 
119 |      - Inherent metric 
120 |    * - davies_bouldin_score 
121 |      - Inherent metric 
122 |    * - smallest_largest_clusters_ratio 
123 |      - Inherent metric 
124 |    * - number_of_clusters 
125 |      - Inherent metric 
126 |    * - smallest_cluster_size 
127 |      - Inherent metric 
128 |    * - largest_cluster_size 
129 |      - Inherent metric 
130 | 
131 | 
132 | *********************** 
133 | Quickstart and examples 
134 | *********************** 
135 | 
136 | With snakemake: 
137 | --------------- 
138 | .. code-block:: 
139 | 
140 |     snakemake -s hypercluster.smk --configfile config.yml --config input_data_files=test_data input_data_folder=. 
141 | 
142 | With python: 
143 | ------------ 
144 | .. code-block:: python 
145 | 
146 |     import pandas as pd 
147 |     from sklearn.datasets import make_blobs 
148 |     import hypercluster 
149 | 
150 |     data, labels = make_blobs() 
151 |     data = pd.DataFrame(data) 
152 |     labels = pd.Series(labels, index=data.index, name='labels') 
153 | 
154 |     # With a single clustering algorithm 
155 |     clusterer = hypercluster.AutoClusterer() 
156 |     clusterer.fit(data).evaluate( 
157 |         methods = hypercluster.constants.need_ground_truth+hypercluster.constants.inherent_metrics, 
158 |         gold_standard = labels 
159 |     ) 
160 | 
161 |     clusterer.visualize_evaluations() 
162 | 
163 |     # With a range of algorithms 
164 | 
165 |     clusterer = hypercluster.MultiAutoClusterer() 
166 |     clusterer.fit(data).evaluate( 
167 |         methods = hypercluster.constants.need_ground_truth+hypercluster.constants.inherent_metrics, 
168 |         gold_standard = labels 
169 |     ) 
170 | 
171 |     clusterer.visualize_evaluations() 
172 | 
173 | Example workflows for both python and snakemake are 
174 | `here <https://github.com/liliblu/hypercluster/tree/dev/examples>`_ 
175 | 
176 | Source code is available `here <https://github.com/liliblu/hypercluster>`_ 
177 | 
-------------------------------------------------------------------------------- 
/docs/make.bat: 
-------------------------------------------------------------------------------- 
1 | @ECHO OFF 
2 | 
3 | pushd %~dp0 
4 | 
5 | REM Command file for Sphinx documentation 
6 | 
7 | if "%SPHINXBUILD%" == "" ( 
8 | 	set SPHINXBUILD=sphinx-build 
9 | ) 
10 | set SOURCEDIR=. 
11 | set BUILDDIR=_build 
12 | 
13 | if "%1" == "" goto help 
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL 
16 | if errorlevel 9009 ( 
17 | 	echo. 
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point 
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you 
21 | 	echo.may add the Sphinx directory to PATH. 
22 | 	echo. 
23 | 	echo.If you don't have Sphinx installed, grab it from 
24 | 	echo.http://sphinx-doc.org/ 
25 | 	exit /b 1 
26 | ) 
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 
29 | goto end 
30 | 
31 | :help 
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 
33 | 
34 | :end 
35 | popd 
36 | 
-------------------------------------------------------------------------------- 
/docs/requirements.txt: 
-------------------------------------------------------------------------------- 
1 | pandas==0.24.2 
2 | numpy==1.22.0 
3 | scipy==1.2.1 
4 | scikit-learn==0.22.0 
5 | sphinx==2.0.0 
6 | sphinx-argparse==0.2.5 
7 | sphinx-autodoc-typehints==1.7.0 
8 | sphinx_rtd_theme==0.4.3 
9 | hdbscan==0.8.24 
10 | snakemake==5.8.2 
11 | matplotlib==3.1.2 
12 | seaborn==0.9.0 
13 | python-igraph==0.7.1.post6 
14 | louvain==0.6.1 
15 | leidenalg==0.7.0 
-------------------------------------------------------------------------------- 
/docs/snakemake.rst: 
-------------------------------------------------------------------------------- 
1 | hypercluster SnakeMake pipeline 
2 | =============================== 
3 | 
4 | Line-by-line explanation of config.yml 
5 | -------------------------------------- 
6 | 
7 | .. list-table:: Explanation for config.yml 
8 |    :widths: 33 33 33 
9 |    :header-rows: 1 
10 | 
11 |    * - config.yml parameter 
12 |      - Explanation 
13 |      - Example from `scRNA-seq workflow <https://github.com/liliblu/hypercluster/tree/dev/examples/snakemake_scRNA_example>`_ 
14 |    * - ``input_data_folder`` 
15 |      - Path to folder in which input data can be found. No / at the end. 
16 |      - ``/input_data`` 
17 |    * - ``input_data_files`` 
18 |      - | List of prefixes of data files. Exclude the file extension; .csv, .tsv and .txt 
19 |        | are allowed. 
20 |      - ``['input_data1', 'input_data2']`` 
21 |    * - ``gold_standards`` 
22 |      - | File name of the gold standard file, per input file. Must have the same pandas.read_csv kwargs 
23 |        | as the corresponding input file. Must be in input_data_folder. 
24 |      - ``{'input_data': 'gold_standard_file.txt'}`` 
25 |    * - ``read_csv_kwargs`` 
26 |      - | Per input data file, keyword args to put into `pandas.read_csv <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html>`_. 
27 |        | **If specifying multiindex, also put the same in output_kwargs['labels']** 
28 |      - ``{'test_input': {'index_col':[0]}}`` 
29 |    * - ``output_folder`` 
30 |      - Path to folder in which results will be written. No / at the end. 
31 |      - ``/hypercluster_results`` 
32 |    * - ``intermediates_folder`` 
33 |      - | Name of the folder within the output_folder to put intermediate results, 
34 |        | such as labels and evaluations per condition. No need to change this usually. 
35 |      - ``clustering_intermediates`` 
36 |    * - ``clustering_results`` 
37 |      - | Name of the folder within the output_folder to put final results. 
38 |        | No need to change this usually. 
39 |      - ``clustering`` 
40 |    * - ``clusterer_kwargs`` 
41 |      - | Additional static keyword arguments to pass to individual clusterers. 
42 |        | Not to optimize. 
43 |      - ``{'KMeans': {'random_state': 8}}`` 
44 |    * - ``generate_parameters_addtl_kwargs`` 
45 |      - Additional keyword arguments for the hypercluster.AutoClusterer class. 
46 |      - ``{'KMeans': {'random_search': true, 'param_weights': {'n_clusters': {5: 0.25, 6: 0.75}}}}`` 
47 |    * - ``evaluations`` 
48 |      - | Names of evaluation metrics to use. See 
49 |        | hypercluster.constants.inherent_metrics or 
50 |        | hypercluster.constants.need_ground_truth 
51 |      - ``['silhouette_score', 'number_clustered']`` 
52 |    * - ``eval_kwargs`` 
53 |      - Additional kwargs per evaluation metric function. 
54 |      - ``{'silhouette_score': {'random_state': 8}}`` 
55 |    * - ``screeplot_evals`` 
56 |      - Metrics for which to draw scree plots. Must be a subset of metrics used to evaluate. 
57 |      - ``['silhouette_score', 'smallest_largest_clusters_ratio']`` 
58 |    * - ``metric_to_choose_best`` 
59 |      - | If picking best labels, which metric to maximize to choose the labels. If not choosing 
60 |        | best labels, leave as empty string (''). 
61 |      - ``silhouette_score`` 
62 |    * - ``metric_to_compare_labels`` 
63 |      - | If comparing pairwise similarity of labeling results, which metric to use. To skip 
64 |        | this comparison, leave blank or as an empty string. 
65 |      - ``adjusted_rand_score`` 
66 |    * - ``compare_samples`` 
67 |      - | Whether to make a table and figure with counts of how often two samples are in the same 
68 |        | cluster. 
69 |      - ``true`` 
70 |    * - ``output_kwargs`` 
71 |      - | pandas.to_csv and pandas.read_csv kwargs per output type. Generally, you 
72 |        | don't need to change the evaluations kwargs, but the labels index_col has to 
73 |        | match the index_col in read_csv_kwargs. 
74 |      - ``{'evaluations': {'index_col':[0]}, 'labels': {'index_col':[0]}}`` 
75 |    * - ``heatmap_kwargs`` 
76 |      - Additional kwargs for `seaborn.heatmap <https://seaborn.pydata.org/generated/seaborn.heatmap.html>`_ for visualizations. 
77 |      - ``{'vmin':-2, 'vmax':2}`` 
78 |    * - ``optimization_parameters`` 
79 |      - Fun part! This is where you put which hyperparameters per algorithm to try. 
80 |      - ``{'KMeans': {'n_clusters': [5, 6, 7]}}`` 
81 | 
82 | **Note:** Formatting of lists and dictionaries can be in python syntax (like above) or yaml syntax, or a mixture, like below. 
83 | 
84 | config.yml example from `scRNA-seq workflow <https://github.com/liliblu/hypercluster/tree/dev/examples/snakemake_scRNA_example>`_ 
85 | ---------------------------------------------------------------------------------------------------------------------------------- 
86 | 
87 | .. code-block:: yaml 
88 | 
89 |     input_data_folder: '.' 
90 |     input_data_files: 
91 |     - sc_data 
92 |     gold_standards: 
93 |         sc_data: 'gold_standard.csv' 
94 |     read_csv_kwargs: 
95 |         sc_data: {'index_col':[0]} 
96 | 
97 |     output_folder: 'results' 
98 |     intermediates_folder: 'clustering_intermediates' 
99 |     clustering_results: 'clustering' 
100 | 
101 |     clusterer_kwargs: {} 
102 |     generate_parameters_addtl_kwargs: {} 
103 | 
104 |     evaluations: 
105 |     - silhouette_score 
106 |     - calinski_harabasz_score 
107 |     - davies_bouldin_score 
108 |     - number_clustered 
109 |     - smallest_largest_clusters_ratio 
110 |     - smallest_cluster_ratio 
111 |     eval_kwargs: {} 
112 | 
113 |     metric_to_choose_best: silhouette_score 
114 |     metric_to_compare_labels: adjusted_rand_score 
115 |     compare_samples: true 
116 | 
117 |     output_kwargs: 
118 |         evaluations: 
119 |             index_col: [0] 
120 |         labels: 
121 |             index_col: [0] 
122 |     heatmap_kwargs: {} 
123 | 
124 |     optimization_parameters: 
125 |         HDBSCAN: 
126 |             min_cluster_size: &id002 
127 |             - 2 
128 |             - 3 
129 |             - 4 
130 |             - 5 
131 |         KMeans: 
132 |             n_clusters: &id001 
133 |             - 5 
134 |             - 6 
135 |             - 7 
136 |         MiniBatchKMeans: 
137 |             n_clusters: *id001 
138 |         OPTICS: 
139 |             min_samples: *id002 
140 |         NMFCluster: 
141 |             n_clusters: *id001 
142 |         LouvainCluster: &id003 
143 |             resolution: 
144 |             - 0.2 
145 |             - 0.4 
146 |             - 0.6 
147 |             - 0.8 
148 |             - 1.0 
149 |             - 1.2 
150 |             - 1.4 
151 |             - 1.6 
152 |             k: 
153 |             - 10 
154 |             - 15 
155 |             - 20 
156 |             - 40 
157 |             - 80 
158 |             - 120 
159 |         LeidenCluster: *id003 
-------------------------------------------------------------------------------- 
/examples/README.md: 
-------------------------------------------------------------------------------- 
1 | # Example jupyter notebooks for hypercluster 
2 | 
3 | ##### Roster: 
4 | 1. Running hypercluster with snakemake on a distributed HPC cluster, using scRNA-seq data from [Tikhonova AN et al. Nature. 
(2019)](https://www.ncbi.nlm.nih.gov/pmc/articles/pmid/30971824/) 5 | 2. Running hypercluster locally on the [TCGA breast cancer RNA data](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3465532/) 6 | 7 | If you would like another vignette, please open an [issue](https://github.com/liliblu/hypercluster/issues) 8 | -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/.ipynb_checkpoints/grid.scatter.LeidenCluster-silhouette_score-umaps-checkpoint.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/.ipynb_checkpoints/grid.scatter.LeidenCluster-silhouette_score-umaps-checkpoint.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/.ipynb_checkpoints/grid.scatter.louvain-umaps-checkpoint.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/.ipynb_checkpoints/grid.scatter.louvain-umaps-checkpoint.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/brca.rna.evaluations.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/brca.rna.evaluations.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/clustermap.nmf4-vs-psm50.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/clustermap.nmf4-vs-psm50.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/colorbar.LeidenCluster-silhouette_score.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/colorbar.LeidenCluster-silhouette_score.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/colorbar.LouvainCluster-adjusted_rand_score.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/colorbar.LouvainCluster-adjusted_rand_score.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/colorbar.NMFCluster-silhouette_score.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/colorbar.NMFCluster-silhouette_score.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/colorbar.silhouette_score.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/colorbar.silhouette_score.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/grid.scatter.LeidenCluster-silhouette_score-umaps.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/grid.scatter.LeidenCluster-silhouette_score-umaps.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/grid.scatter.LouvainCluster-adjusted_rand_score-umaps.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/grid.scatter.LouvainCluster-adjusted_rand_score-umaps.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/grid.scatter.NMFCluster-silhouette_score-umaps.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/grid.scatter.NMFCluster-silhouette_score-umaps.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/heatmap.brca-rna.evaluations.PAM50_comp.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/heatmap.brca-rna.evaluations.PAM50_comp.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/heatmap.brca-rna.evaluations.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/heatmap.brca-rna.evaluations.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/heatmaps.graphs-clusterers.metrics.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/heatmaps.graphs-clusterers.metrics.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/scatter.calinski_harabasz_score.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/scatter.calinski_harabasz_score.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/scatter.davies_bouldin_score.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/scatter.davies_bouldin_score.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/scatter.largest_cluster_size.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/scatter.largest_cluster_size.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/scatter.number_of_clusters.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/scatter.number_of_clusters.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/scatter.pca.various_clusters.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/scatter.pca.various_clusters.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/scatter.silhouette_score.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/scatter.silhouette_score.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/scatter.smallest_cluster_size.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/scatter.smallest_cluster_size.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/figures/scatter.smallest_largest_clusters_ratio.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/local_TCGA_BRCA_RNAseq/figures/scatter.smallest_largest_clusters_ratio.pdf -------------------------------------------------------------------------------- /examples/local_TCGA_BRCA_RNAseq/jupyter-lab-5918844.log: -------------------------------------------------------------------------------- 1 | 2 | MacOS or linux terminal command to create your ssh tunnel: 3 | ssh -N -L 8693:fn-0002:8693 lmb529@bigpurple3.nyumc.org 4 | 5 | For more info and how to connect from windows, 6 | see research.computing.yale.edu/jupyter-nb 7 | Here is the MobaXterm info: 8 | 9 | Forwarded port:same as remote port 10 | Remote server: fn-0002 11 | Remote port: 8693 12 | SSH server: bigpurple3.nyumc.org 13 | SSH login: lmb529 14 | SSH port: 22 15 | 16 | Use a Browser on your local machine to go to: 17 | http://localhost:8693 (prefix w/ https:// if using password) 18 | 19 | /cm/local/apps/slurm/var/spool/job5918844/slurm_script: line 57: activate: No such file or directory 20 | [I 11:42:30.612 LabApp] JupyterLab extension 
loaded from /gpfs/data/ruggleslab/home/lmb529/conda/envs/hc_test/lib/python3.7/site-packages/jupyterlab 21 | [I 11:42:30.614 LabApp] JupyterLab application directory is /gpfs/data/ruggleslab/home/lmb529/conda/envs/hc_test/share/jupyter/lab 22 | [I 11:42:30.619 LabApp] Serving notebooks from local directory: /gpfs/data/ruggleslab 23 | [I 11:42:30.619 LabApp] The Jupyter Notebook is running at: 24 | [I 11:42:30.619 LabApp] http://fn-0002:8693/?token=fc13b11a9a39ac2da71f8cedf08311581dddfd01c51561fb 25 | [I 11:42:30.620 LabApp] or http://127.0.0.1:8693/?token=fc13b11a9a39ac2da71f8cedf08311581dddfd01c51561fb 26 | [I 11:42:30.620 LabApp] Use Control-C to stop this server and shut down all kernels (twice to skip confirmation). 27 | [C 11:42:30.662 LabApp] 28 | 29 | To access the notebook, open this file in a browser: 30 | file:///gpfs/home/lmb529/.local/share/jupyter/runtime/nbserver-187904-open.html 31 | Or copy and paste one of these URLs: 32 | http://fn-0002:8693/?token=fc13b11a9a39ac2da71f8cedf08311581dddfd01c51561fb 33 | or http://127.0.0.1:8693/?token=fc13b11a9a39ac2da71f8cedf08311581dddfd01c51561fb 34 | [I 11:43:24.452 LabApp] 302 GET / (192.168.0.103) 0.60ms 35 | [I 11:43:24.467 LabApp] 302 GET /lab? (192.168.0.103) 0.85ms 36 | [I 11:43:35.596 LabApp] 302 POST /login?next=%2Flab%3F (192.168.0.103) 1.19ms 37 | [W 11:43:36.846 LabApp] Could not determine jupyterlab build status without nodejs 38 | [I 11:43:38.497 LabApp] Kernel started: 5905720c-dcbc-407d-acd9-e3e535378860 39 | [I 11:43:38.533 LabApp] Kernel started: e06fce5a-5298-45f3-99f7-3c929235c5b6 40 | [I 11:45:38.369 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 41 | [I 11:59:39.671 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 42 | [I 12:01:39.875 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 43 | [I 12:03:40.086 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 44 | [I 12:05:40.254 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 45 | [I 12:17:41.100 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 46 | [I 12:19:06.673 LabApp] Starting buffering for 5905720c-dcbc-407d-acd9-e3e535378860:33d3baf4-9661-40db-ac0b-77ed4e3a86f8 47 | [I 12:19:06.675 LabApp] Starting buffering for e06fce5a-5298-45f3-99f7-3c929235c5b6:e9bab26b-244d-42fc-84f9-8dd2884de9d2 48 | [I 12:19:07.729 LabApp] Restoring connection for 5905720c-dcbc-407d-acd9-e3e535378860:33d3baf4-9661-40db-ac0b-77ed4e3a86f8 49 | [I 12:19:41.442 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 50 | [W 12:23:07.712 LabApp] WebSocket ping timeout after 119994 ms. 51 | [W 12:23:07.730 LabApp] WebSocket ping timeout after 119993 ms. 52 | [W 12:23:07.753 LabApp] WebSocket ping timeout after 119994 ms. 
53 | [I 12:23:12.731 LabApp] Starting buffering for 5905720c-dcbc-407d-acd9-e3e535378860:33d3baf4-9661-40db-ac0b-77ed4e3a86f8 54 | [I 12:23:12.754 LabApp] Starting buffering for e06fce5a-5298-45f3-99f7-3c929235c5b6:e9bab26b-244d-42fc-84f9-8dd2884de9d2 55 | [W 14:19:17.494 LabApp] Could not determine jupyterlab build status without nodejs 56 | [I 14:21:18.748 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 57 | [I 14:26:35.854 LabApp] Starting buffering for e06fce5a-5298-45f3-99f7-3c929235c5b6:aa6c73a1-652a-453d-904b-1eeed7fce497 58 | [I 14:26:35.856 LabApp] Starting buffering for 5905720c-dcbc-407d-acd9-e3e535378860:98f03fb6-73e6-4f56-aa66-a6b71770f0be 59 | [W 14:26:40.187 LabApp] Could not determine jupyterlab build status without nodejs 60 | [I 14:36:43.172 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 61 | [I 14:38:43.384 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 62 | [I 14:40:43.602 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 63 | [I 14:42:43.820 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 64 | [I 15:02:24.900 LabApp] Kernel restarted: e06fce5a-5298-45f3-99f7-3c929235c5b6 65 | [I 15:02:46.333 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 66 | [I 15:03:18.710 LabApp] Kernel restarted: e06fce5a-5298-45f3-99f7-3c929235c5b6 67 | [I 15:04:46.632 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 68 | [I 15:06:53.109 LabApp] Kernel restarted: e06fce5a-5298-45f3-99f7-3c929235c5b6 69 | [I 15:08:20.203 LabApp] Kernel restarted: e06fce5a-5298-45f3-99f7-3c929235c5b6 70 | [I 15:08:46.952 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 71 | [I 15:09:40.773 LabApp] Kernel restarted: e06fce5a-5298-45f3-99f7-3c929235c5b6 72 | [I 15:10:47.129 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 73 | [I 15:14:24.124 LabApp] Kernel restarted: e06fce5a-5298-45f3-99f7-3c929235c5b6 74 | [I 15:14:47.393 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 75 | [I 15:16:47.623 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 76 | [I 15:16:53.281 LabApp] Starting buffering for e06fce5a-5298-45f3-99f7-3c929235c5b6:5ce1fe1d-30f0-4421-9c82-e5ca9ff11afa 77 | [I 15:16:53.281 LabApp] Starting buffering for 5905720c-dcbc-407d-acd9-e3e535378860:995fe3b6-23fe-4bef-82b6-d3935b271115 78 | [I 15:16:55.674 LabApp] Restoring connection for 5905720c-dcbc-407d-acd9-e3e535378860:995fe3b6-23fe-4bef-82b6-d3935b271115 79 | [I 15:16:55.707 LabApp] Restoring connection for e06fce5a-5298-45f3-99f7-3c929235c5b6:5ce1fe1d-30f0-4421-9c82-e5ca9ff11afa 80 | [I 15:18:47.945 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 81 | [I 15:20:48.169 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 82 | [I 15:24:48.485 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 83 | [I 15:25:05.450 LabApp] Kernel restarted: e06fce5a-5298-45f3-99f7-3c929235c5b6 84 | [I 15:26:48.688 LabApp] Saving file at 
/home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 85 | [I 15:28:49.229 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 86 | [I 15:29:33.256 LabApp] Kernel restarted: e06fce5a-5298-45f3-99f7-3c929235c5b6 87 | [I 15:30:49.455 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 88 | [I 15:32:49.772 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 89 | [I 15:34:50.095 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 90 | [I 15:36:50.456 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 91 | [I 15:40:50.739 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 92 | [I 15:42:50.989 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 93 | [I 15:44:51.348 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 94 | [I 15:45:05.779 LabApp] Kernel restarted: e06fce5a-5298-45f3-99f7-3c929235c5b6 95 | [I 15:46:52.685 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 96 | [I 15:48:54.259 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 97 | [I 15:50:28.449 LabApp] Starting buffering for 5905720c-dcbc-407d-acd9-e3e535378860:995fe3b6-23fe-4bef-82b6-d3935b271115 98 | [I 15:50:28.467 LabApp] Starting buffering for e06fce5a-5298-45f3-99f7-3c929235c5b6:5ce1fe1d-30f0-4421-9c82-e5ca9ff11afa 99 | [I 15:50:29.737 LabApp] Restoring connection for e06fce5a-5298-45f3-99f7-3c929235c5b6:5ce1fe1d-30f0-4421-9c82-e5ca9ff11afa 100 | [I 15:50:29.763 LabApp] Restoring connection for 5905720c-dcbc-407d-acd9-e3e535378860:995fe3b6-23fe-4bef-82b6-d3935b271115 101 | [I 15:50:55.952 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 102 | [W 15:52:59.739 LabApp] WebSocket ping timeout after 119935 ms. 103 | [W 15:52:59.763 LabApp] WebSocket ping timeout after 119956 ms. 104 | [I 15:53:04.755 LabApp] Starting buffering for e06fce5a-5298-45f3-99f7-3c929235c5b6:5ce1fe1d-30f0-4421-9c82-e5ca9ff11afa 105 | [I 15:53:04.764 LabApp] Starting buffering for 5905720c-dcbc-407d-acd9-e3e535378860:995fe3b6-23fe-4bef-82b6-d3935b271115 106 | [W 16:24:46.888 LabApp] Could not determine jupyterlab build status without nodejs 107 | [I 16:26:48.786 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 108 | [I 16:28:49.624 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 109 | [I 16:30:51.525 LabApp] Saving file at /home/lmb529/hypercluster/examples/local_no-snakemake/running_locally.ipynb 110 | [W 17:23:18.493 LabApp] WebSocket ping timeout after 119943 ms. 111 | [W 17:23:18.571 LabApp] WebSocket ping timeout after 119982 ms. 
112 | [I 17:23:23.521 LabApp] Starting buffering for 5905720c-dcbc-407d-acd9-e3e535378860:c9dfa09f-3795-4e25-a45b-7a97fcdc6906 113 | [I 17:23:23.573 LabApp] Starting buffering for e06fce5a-5298-45f3-99f7-3c929235c5b6:2de51812-9e38-436b-a0e6-45ad9c32335a 114 | slurmstepd-fn-0002: error: *** JOB 5918844 ON fn-0002 CANCELLED AT 2020-01-07T21:42:49 DUE TO TIME LIMIT *** 115 | -------------------------------------------------------------------------------- /examples/snakemake_scRNA_example/cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "__default__": { 3 | "job-name": "snakemake", 4 | "time": "4-23:59:59", 5 | "mem": "2G", 6 | "partition": "fn_medium", 7 | "cpus-per-task": 1, 8 | "output": "logs/slurm/%j.out", 9 | "error": "logs/slurm/%j.err" 10 | }, 11 | "run_clusterer": { 12 | "job-name": "clusterer", 13 | "time": "4-23:59:59", 14 | "mem": "32G", 15 | "partition": "fn_medium", 16 | "cpus-per-task": 8, 17 | "output": "logs/slurm/cluster-%j.out", 18 | "error": "logs/slurm/cluster-%j.err" 19 | }, 20 | "run_evaluation": { 21 | "job-name": "evaluate", 22 | "time": "4-23:59:59", 23 | "mem": "2G", 24 | "partition": "fn_medium", 25 | "output": "logs/slurm/evaluate-%j.out", 26 | "error": "logs/slurm/evaluate-%j.err" 27 | }, 28 | "compare_labels": { 29 | "job-name": "compare_labels", 30 | "time": "4-23:59:59", 31 | "mem": "32G", 32 | "cpus-per-task": 2, 33 | "partition": "fn_medium", 34 | "output": "logs/slurm/compare_labels-%j.out", 35 | "error": "logs/slurm/compare_labels-%j.err" 36 | }, 37 | "compare_samples": { 38 | "job-name": "compare_samples", 39 | "time": "4-23:59:59", 40 | "mem": "48G", 41 | "cpus-per-task": 2, 42 | "partition": "fn_medium", 43 | "output": "logs/slurm/compare_samples-%j.out", 44 | "error": "logs/slurm/compare_samples-%j.err" 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /examples/snakemake_scRNA_example/config.yml: -------------------------------------------------------------------------------- 1 | clusterer_kwargs: {} 2 | clustering_results: clustering 3 | compare_samples: false 4 | eval_kwargs: {} 5 | evaluations: 6 | - adjusted_mutual_info_score 7 | - adjusted_rand_score 8 | - calinski_harabasz_score 9 | - completeness_score 10 | - davies_bouldin_score 11 | - fowlkes_mallows_score 12 | - homogeneity_score 13 | - largest_cluster_size 14 | - mutual_info_score 15 | - number_of_clusters 16 | - silhouette_score 17 | - smallest_cluster_size 18 | - smallest_largest_clusters_ratio 19 | - v_measure_score 20 | generate_parameters_addtl_kwargs: {} 21 | gold_standards: 22 | sc_data: gold_standard.csv 23 | sc_data_pca: gold_standard.csv 24 | heatmap_kwargs: {} 25 | input_data_files: 26 | - sc_data 27 | - sc_data_pca 28 | input_data_folder: /gpfs/data/ruggleslab/home/lmb529/hypercluster/examples/snakemake_scRNA_example 29 | intermediates_folder: clustering_intermediates 30 | metric_to_choose_best: adjusted_rand_score 31 | metric_to_compare_labels: adjusted_rand_score 32 | optimization_parameters: 33 | AffinityPropagation: 34 | damping: 35 | - 0.55 36 | - 0.6 37 | - 0.65 38 | - 0.7 39 | - 0.75 40 | - 0.8 41 | - 0.85 42 | - 0.9 43 | - 0.95 44 | LeidenCluster: 45 | adjacency_method: 46 | - MNN 47 | - CNN 48 | k: 49 | - 20 50 | - 30 51 | - 40 52 | - 80 53 | - 120 54 | resolution: 55 | - 0.3 56 | - 0.4 57 | - 0.5 58 | - 0.6 59 | - 0.7 60 | - 0.8 61 | - 0.9 62 | - 1.0 63 | - 1.2 64 | - 1.4 65 | LouvainCluster: 66 | adjacency_method: 67 | - MNN 68 | - CNN 69 | k: 70 | - 20 71 | - 30 72 | - 40
73 | - 80 74 | - 120 75 | resolution: 76 | - 0.3 77 | - 0.4 78 | - 0.5 79 | - 0.6 80 | - 0.7 81 | - 0.8 82 | - 0.9 83 | - 1.0 84 | - 1.2 85 | - 1.4 86 | NMFCluster: 87 | n_clusters: 88 | - 3 89 | - 4 90 | - 5 91 | - 6 92 | - 7 93 | - 8 94 | - 9 95 | - 10 96 | - 11 97 | - 12 98 | - 13 99 | - 14 100 | - 15 101 | - 16 102 | - 17 103 | - 18 104 | - 19 105 | - 20 106 | - 21 107 | - 22 108 | - 23 109 | - 24 110 | - 25 111 | - 26 112 | - 27 113 | - 28 114 | - 29 115 | - 30 116 | - 31 117 | - 32 118 | - 33 119 | - 34 120 | - 35 121 | - 36 122 | - 37 123 | - 38 124 | - 39 125 | output_folder: results_NN_test 126 | output_kwargs: 127 | evaluations: 128 | index_col: 129 | - 0 130 | labels: 131 | index_col: 132 | - 0 133 | read_csv_kwargs: 134 | sc_data: 135 | index_col: 136 | - 0 137 | sc_data_pca: 138 | index_col: 139 | - 0 140 | screeplot_evals: 141 | - adjusted_mutual_info_score 142 | - adjusted_rand_score 143 | - calinski_harabasz_score 144 | - completeness_score 145 | - davies_bouldin_score 146 | - fowlkes_mallows_score 147 | - homogeneity_score 148 | - largest_cluster_size 149 | - mutual_info_score 150 | - number_of_clusters 151 | - silhouette_score 152 | - smallest_cluster_size 153 | - smallest_largest_clusters_ratio 154 | - v_measure_score 155 | -------------------------------------------------------------------------------- /examples/snakemake_scRNA_example/figures/heatmaps.graphs-clusterers.metrics.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/snakemake_scRNA_example/figures/heatmaps.graphs-clusterers.metrics.pdf -------------------------------------------------------------------------------- /examples/snakemake_scRNA_example/figures/pca.best_labels.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/snakemake_scRNA_example/figures/pca.best_labels.pdf -------------------------------------------------------------------------------- /examples/snakemake_scRNA_example/figures/pca.published_labels.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/snakemake_scRNA_example/figures/pca.published_labels.pdf -------------------------------------------------------------------------------- /examples/snakemake_scRNA_example/figures/umap.best_labels.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/snakemake_scRNA_example/figures/umap.best_labels.pdf -------------------------------------------------------------------------------- /examples/snakemake_scRNA_example/figures/umap.published_labels.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/examples/snakemake_scRNA_example/figures/umap.published_labels.pdf -------------------------------------------------------------------------------- /examples/snakemake_scRNA_example/params_to_test.yml: -------------------------------------------------------------------------------- 1 | AffinityPropagation;damping-0.55: 2 | clusterer: AffinityPropagation 3 | damping: 0.55 4 | 
AffinityPropagation;damping-0.6: 5 | clusterer: AffinityPropagation 6 | damping: 0.6 7 | AffinityPropagation;damping-0.65: 8 | clusterer: AffinityPropagation 9 | damping: 0.65 10 | AffinityPropagation;damping-0.7: 11 | clusterer: AffinityPropagation 12 | damping: 0.7 13 | AffinityPropagation;damping-0.75: 14 | clusterer: AffinityPropagation 15 | damping: 0.75 16 | AffinityPropagation;damping-0.8: 17 | clusterer: AffinityPropagation 18 | damping: 0.8 19 | AffinityPropagation;damping-0.85: 20 | clusterer: AffinityPropagation 21 | damping: 0.85 22 | AffinityPropagation;damping-0.9: 23 | clusterer: AffinityPropagation 24 | damping: 0.9 25 | AffinityPropagation;damping-0.95: 26 | clusterer: AffinityPropagation 27 | damping: 0.95 28 | HDBSCAN;min_cluster_size-10: 29 | clusterer: HDBSCAN 30 | min_cluster_size: 10 31 | HDBSCAN;min_cluster_size-11: 32 | clusterer: HDBSCAN 33 | min_cluster_size: 11 34 | HDBSCAN;min_cluster_size-12: 35 | clusterer: HDBSCAN 36 | min_cluster_size: 12 37 | HDBSCAN;min_cluster_size-13: 38 | clusterer: HDBSCAN 39 | min_cluster_size: 13 40 | HDBSCAN;min_cluster_size-14: 41 | clusterer: HDBSCAN 42 | min_cluster_size: 14 43 | HDBSCAN;min_cluster_size-15: 44 | clusterer: HDBSCAN 45 | min_cluster_size: 15 46 | HDBSCAN;min_cluster_size-16: 47 | clusterer: HDBSCAN 48 | min_cluster_size: 16 49 | HDBSCAN;min_cluster_size-2: 50 | clusterer: HDBSCAN 51 | min_cluster_size: 2 52 | HDBSCAN;min_cluster_size-3: 53 | clusterer: HDBSCAN 54 | min_cluster_size: 3 55 | HDBSCAN;min_cluster_size-4: 56 | clusterer: HDBSCAN 57 | min_cluster_size: 4 58 | HDBSCAN;min_cluster_size-5: 59 | clusterer: HDBSCAN 60 | min_cluster_size: 5 61 | HDBSCAN;min_cluster_size-6: 62 | clusterer: HDBSCAN 63 | min_cluster_size: 6 64 | HDBSCAN;min_cluster_size-7: 65 | clusterer: HDBSCAN 66 | min_cluster_size: 7 67 | HDBSCAN;min_cluster_size-8: 68 | clusterer: HDBSCAN 69 | min_cluster_size: 8 70 | HDBSCAN;min_cluster_size-9: 71 | clusterer: HDBSCAN 72 | min_cluster_size: 9 73 | KMeans;n_clusters-10: 74 | clusterer: KMeans 75 | n_clusters: 10 76 | KMeans;n_clusters-11: 77 | clusterer: KMeans 78 | n_clusters: 11 79 | KMeans;n_clusters-12: 80 | clusterer: KMeans 81 | n_clusters: 12 82 | KMeans;n_clusters-13: 83 | clusterer: KMeans 84 | n_clusters: 13 85 | KMeans;n_clusters-14: 86 | clusterer: KMeans 87 | n_clusters: 14 88 | KMeans;n_clusters-15: 89 | clusterer: KMeans 90 | n_clusters: 15 91 | KMeans;n_clusters-16: 92 | clusterer: KMeans 93 | n_clusters: 16 94 | KMeans;n_clusters-17: 95 | clusterer: KMeans 96 | n_clusters: 17 97 | KMeans;n_clusters-18: 98 | clusterer: KMeans 99 | n_clusters: 18 100 | KMeans;n_clusters-19: 101 | clusterer: KMeans 102 | n_clusters: 19 103 | KMeans;n_clusters-2: 104 | clusterer: KMeans 105 | n_clusters: 2 106 | KMeans;n_clusters-20: 107 | clusterer: KMeans 108 | n_clusters: 20 109 | KMeans;n_clusters-21: 110 | clusterer: KMeans 111 | n_clusters: 21 112 | KMeans;n_clusters-22: 113 | clusterer: KMeans 114 | n_clusters: 22 115 | KMeans;n_clusters-23: 116 | clusterer: KMeans 117 | n_clusters: 23 118 | KMeans;n_clusters-24: 119 | clusterer: KMeans 120 | n_clusters: 24 121 | KMeans;n_clusters-25: 122 | clusterer: KMeans 123 | n_clusters: 25 124 | KMeans;n_clusters-26: 125 | clusterer: KMeans 126 | n_clusters: 26 127 | KMeans;n_clusters-27: 128 | clusterer: KMeans 129 | n_clusters: 27 130 | KMeans;n_clusters-28: 131 | clusterer: KMeans 132 | n_clusters: 28 133 | KMeans;n_clusters-29: 134 | clusterer: KMeans 135 | n_clusters: 29 136 | KMeans;n_clusters-3: 137 | clusterer: KMeans 138 | n_clusters: 3 
139 | KMeans;n_clusters-30: 140 | clusterer: KMeans 141 | n_clusters: 30 142 | KMeans;n_clusters-31: 143 | clusterer: KMeans 144 | n_clusters: 31 145 | KMeans;n_clusters-32: 146 | clusterer: KMeans 147 | n_clusters: 32 148 | KMeans;n_clusters-33: 149 | clusterer: KMeans 150 | n_clusters: 33 151 | KMeans;n_clusters-34: 152 | clusterer: KMeans 153 | n_clusters: 34 154 | KMeans;n_clusters-35: 155 | clusterer: KMeans 156 | n_clusters: 35 157 | KMeans;n_clusters-36: 158 | clusterer: KMeans 159 | n_clusters: 36 160 | KMeans;n_clusters-37: 161 | clusterer: KMeans 162 | n_clusters: 37 163 | KMeans;n_clusters-38: 164 | clusterer: KMeans 165 | n_clusters: 38 166 | KMeans;n_clusters-39: 167 | clusterer: KMeans 168 | n_clusters: 39 169 | KMeans;n_clusters-4: 170 | clusterer: KMeans 171 | n_clusters: 4 172 | KMeans;n_clusters-40: 173 | clusterer: KMeans 174 | n_clusters: 40 175 | KMeans;n_clusters-41: 176 | clusterer: KMeans 177 | n_clusters: 41 178 | KMeans;n_clusters-42: 179 | clusterer: KMeans 180 | n_clusters: 42 181 | KMeans;n_clusters-43: 182 | clusterer: KMeans 183 | n_clusters: 43 184 | KMeans;n_clusters-44: 185 | clusterer: KMeans 186 | n_clusters: 44 187 | KMeans;n_clusters-45: 188 | clusterer: KMeans 189 | n_clusters: 45 190 | KMeans;n_clusters-46: 191 | clusterer: KMeans 192 | n_clusters: 46 193 | KMeans;n_clusters-47: 194 | clusterer: KMeans 195 | n_clusters: 47 196 | KMeans;n_clusters-48: 197 | clusterer: KMeans 198 | n_clusters: 48 199 | KMeans;n_clusters-49: 200 | clusterer: KMeans 201 | n_clusters: 49 202 | KMeans;n_clusters-5: 203 | clusterer: KMeans 204 | n_clusters: 5 205 | KMeans;n_clusters-50: 206 | clusterer: KMeans 207 | n_clusters: 50 208 | KMeans;n_clusters-51: 209 | clusterer: KMeans 210 | n_clusters: 51 211 | KMeans;n_clusters-52: 212 | clusterer: KMeans 213 | n_clusters: 52 214 | KMeans;n_clusters-53: 215 | clusterer: KMeans 216 | n_clusters: 53 217 | KMeans;n_clusters-54: 218 | clusterer: KMeans 219 | n_clusters: 54 220 | KMeans;n_clusters-55: 221 | clusterer: KMeans 222 | n_clusters: 55 223 | KMeans;n_clusters-56: 224 | clusterer: KMeans 225 | n_clusters: 56 226 | KMeans;n_clusters-57: 227 | clusterer: KMeans 228 | n_clusters: 57 229 | KMeans;n_clusters-58: 230 | clusterer: KMeans 231 | n_clusters: 58 232 | KMeans;n_clusters-59: 233 | clusterer: KMeans 234 | n_clusters: 59 235 | KMeans;n_clusters-6: 236 | clusterer: KMeans 237 | n_clusters: 6 238 | KMeans;n_clusters-7: 239 | clusterer: KMeans 240 | n_clusters: 7 241 | KMeans;n_clusters-8: 242 | clusterer: KMeans 243 | n_clusters: 8 244 | KMeans;n_clusters-9: 245 | clusterer: KMeans 246 | n_clusters: 9 247 | MiniBatchKMeans;n_clusters-10: 248 | clusterer: MiniBatchKMeans 249 | n_clusters: 10 250 | MiniBatchKMeans;n_clusters-11: 251 | clusterer: MiniBatchKMeans 252 | n_clusters: 11 253 | MiniBatchKMeans;n_clusters-12: 254 | clusterer: MiniBatchKMeans 255 | n_clusters: 12 256 | MiniBatchKMeans;n_clusters-13: 257 | clusterer: MiniBatchKMeans 258 | n_clusters: 13 259 | MiniBatchKMeans;n_clusters-14: 260 | clusterer: MiniBatchKMeans 261 | n_clusters: 14 262 | MiniBatchKMeans;n_clusters-15: 263 | clusterer: MiniBatchKMeans 264 | n_clusters: 15 265 | MiniBatchKMeans;n_clusters-16: 266 | clusterer: MiniBatchKMeans 267 | n_clusters: 16 268 | MiniBatchKMeans;n_clusters-17: 269 | clusterer: MiniBatchKMeans 270 | n_clusters: 17 271 | MiniBatchKMeans;n_clusters-18: 272 | clusterer: MiniBatchKMeans 273 | n_clusters: 18 274 | MiniBatchKMeans;n_clusters-19: 275 | clusterer: MiniBatchKMeans 276 | n_clusters: 19 277 | 
MiniBatchKMeans;n_clusters-2: 278 | clusterer: MiniBatchKMeans 279 | n_clusters: 2 280 | MiniBatchKMeans;n_clusters-20: 281 | clusterer: MiniBatchKMeans 282 | n_clusters: 20 283 | MiniBatchKMeans;n_clusters-21: 284 | clusterer: MiniBatchKMeans 285 | n_clusters: 21 286 | MiniBatchKMeans;n_clusters-22: 287 | clusterer: MiniBatchKMeans 288 | n_clusters: 22 289 | MiniBatchKMeans;n_clusters-23: 290 | clusterer: MiniBatchKMeans 291 | n_clusters: 23 292 | MiniBatchKMeans;n_clusters-24: 293 | clusterer: MiniBatchKMeans 294 | n_clusters: 24 295 | MiniBatchKMeans;n_clusters-25: 296 | clusterer: MiniBatchKMeans 297 | n_clusters: 25 298 | MiniBatchKMeans;n_clusters-26: 299 | clusterer: MiniBatchKMeans 300 | n_clusters: 26 301 | MiniBatchKMeans;n_clusters-27: 302 | clusterer: MiniBatchKMeans 303 | n_clusters: 27 304 | MiniBatchKMeans;n_clusters-28: 305 | clusterer: MiniBatchKMeans 306 | n_clusters: 28 307 | MiniBatchKMeans;n_clusters-29: 308 | clusterer: MiniBatchKMeans 309 | n_clusters: 29 310 | MiniBatchKMeans;n_clusters-3: 311 | clusterer: MiniBatchKMeans 312 | n_clusters: 3 313 | MiniBatchKMeans;n_clusters-30: 314 | clusterer: MiniBatchKMeans 315 | n_clusters: 30 316 | MiniBatchKMeans;n_clusters-31: 317 | clusterer: MiniBatchKMeans 318 | n_clusters: 31 319 | MiniBatchKMeans;n_clusters-32: 320 | clusterer: MiniBatchKMeans 321 | n_clusters: 32 322 | MiniBatchKMeans;n_clusters-33: 323 | clusterer: MiniBatchKMeans 324 | n_clusters: 33 325 | MiniBatchKMeans;n_clusters-34: 326 | clusterer: MiniBatchKMeans 327 | n_clusters: 34 328 | MiniBatchKMeans;n_clusters-35: 329 | clusterer: MiniBatchKMeans 330 | n_clusters: 35 331 | MiniBatchKMeans;n_clusters-36: 332 | clusterer: MiniBatchKMeans 333 | n_clusters: 36 334 | MiniBatchKMeans;n_clusters-37: 335 | clusterer: MiniBatchKMeans 336 | n_clusters: 37 337 | MiniBatchKMeans;n_clusters-38: 338 | clusterer: MiniBatchKMeans 339 | n_clusters: 38 340 | MiniBatchKMeans;n_clusters-39: 341 | clusterer: MiniBatchKMeans 342 | n_clusters: 39 343 | MiniBatchKMeans;n_clusters-4: 344 | clusterer: MiniBatchKMeans 345 | n_clusters: 4 346 | MiniBatchKMeans;n_clusters-40: 347 | clusterer: MiniBatchKMeans 348 | n_clusters: 40 349 | MiniBatchKMeans;n_clusters-41: 350 | clusterer: MiniBatchKMeans 351 | n_clusters: 41 352 | MiniBatchKMeans;n_clusters-42: 353 | clusterer: MiniBatchKMeans 354 | n_clusters: 42 355 | MiniBatchKMeans;n_clusters-43: 356 | clusterer: MiniBatchKMeans 357 | n_clusters: 43 358 | MiniBatchKMeans;n_clusters-44: 359 | clusterer: MiniBatchKMeans 360 | n_clusters: 44 361 | MiniBatchKMeans;n_clusters-45: 362 | clusterer: MiniBatchKMeans 363 | n_clusters: 45 364 | MiniBatchKMeans;n_clusters-46: 365 | clusterer: MiniBatchKMeans 366 | n_clusters: 46 367 | MiniBatchKMeans;n_clusters-47: 368 | clusterer: MiniBatchKMeans 369 | n_clusters: 47 370 | MiniBatchKMeans;n_clusters-48: 371 | clusterer: MiniBatchKMeans 372 | n_clusters: 48 373 | MiniBatchKMeans;n_clusters-49: 374 | clusterer: MiniBatchKMeans 375 | n_clusters: 49 376 | MiniBatchKMeans;n_clusters-5: 377 | clusterer: MiniBatchKMeans 378 | n_clusters: 5 379 | MiniBatchKMeans;n_clusters-50: 380 | clusterer: MiniBatchKMeans 381 | n_clusters: 50 382 | MiniBatchKMeans;n_clusters-51: 383 | clusterer: MiniBatchKMeans 384 | n_clusters: 51 385 | MiniBatchKMeans;n_clusters-52: 386 | clusterer: MiniBatchKMeans 387 | n_clusters: 52 388 | MiniBatchKMeans;n_clusters-53: 389 | clusterer: MiniBatchKMeans 390 | n_clusters: 53 391 | MiniBatchKMeans;n_clusters-54: 392 | clusterer: MiniBatchKMeans 393 | n_clusters: 54 394 | 
MiniBatchKMeans;n_clusters-55: 395 | clusterer: MiniBatchKMeans 396 | n_clusters: 55 397 | MiniBatchKMeans;n_clusters-56: 398 | clusterer: MiniBatchKMeans 399 | n_clusters: 56 400 | MiniBatchKMeans;n_clusters-57: 401 | clusterer: MiniBatchKMeans 402 | n_clusters: 57 403 | MiniBatchKMeans;n_clusters-58: 404 | clusterer: MiniBatchKMeans 405 | n_clusters: 58 406 | MiniBatchKMeans;n_clusters-59: 407 | clusterer: MiniBatchKMeans 408 | n_clusters: 59 409 | MiniBatchKMeans;n_clusters-6: 410 | clusterer: MiniBatchKMeans 411 | n_clusters: 6 412 | MiniBatchKMeans;n_clusters-7: 413 | clusterer: MiniBatchKMeans 414 | n_clusters: 7 415 | MiniBatchKMeans;n_clusters-8: 416 | clusterer: MiniBatchKMeans 417 | n_clusters: 8 418 | MiniBatchKMeans;n_clusters-9: 419 | clusterer: MiniBatchKMeans 420 | n_clusters: 9 421 | OPTICS;min_samples-10: 422 | clusterer: OPTICS 423 | min_samples: 10 424 | OPTICS;min_samples-11: 425 | clusterer: OPTICS 426 | min_samples: 11 427 | OPTICS;min_samples-12: 428 | clusterer: OPTICS 429 | min_samples: 12 430 | OPTICS;min_samples-13: 431 | clusterer: OPTICS 432 | min_samples: 13 433 | OPTICS;min_samples-14: 434 | clusterer: OPTICS 435 | min_samples: 14 436 | OPTICS;min_samples-15: 437 | clusterer: OPTICS 438 | min_samples: 15 439 | OPTICS;min_samples-16: 440 | clusterer: OPTICS 441 | min_samples: 16 442 | OPTICS;min_samples-2: 443 | clusterer: OPTICS 444 | min_samples: 2 445 | OPTICS;min_samples-3: 446 | clusterer: OPTICS 447 | min_samples: 3 448 | OPTICS;min_samples-4: 449 | clusterer: OPTICS 450 | min_samples: 4 451 | OPTICS;min_samples-5: 452 | clusterer: OPTICS 453 | min_samples: 5 454 | OPTICS;min_samples-6: 455 | clusterer: OPTICS 456 | min_samples: 6 457 | OPTICS;min_samples-7: 458 | clusterer: OPTICS 459 | min_samples: 7 460 | OPTICS;min_samples-8: 461 | clusterer: OPTICS 462 | min_samples: 8 463 | OPTICS;min_samples-9: 464 | clusterer: OPTICS 465 | min_samples: 9 466 | -------------------------------------------------------------------------------- /examples/snakemake_scRNA_example/snakemake_submit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH --partition cpu_long 3 | #SBATCH --mem 4G 4 | #SBATCH --time 27-23:59:59 5 | #SBATCH --job-name snakeautocluster 6 | #SBATCH --cpus-per-task=1 7 | #SBATCH -e logs/sbatchSnakefile_progress_err.log 8 | #SBATCH -o logs/sbatchSnakefile_progress_out.log 9 | 10 | 11 | module purge 12 | module add slurm 13 | source activate hc_test 14 | cd /gpfs/data/ruggleslab/home/lmb529/hypercluster/examples/snakemake_scRNA_example 15 | mkdir -p logs/slurm/ 16 | 17 | snakemake -j 999 -p --verbose \ 18 | -s ../../snakemake/hypercluster.smk \ 19 | --configfile config.yml \ 20 | --keep-going \ 21 | --cluster-config cluster.json \ 22 | --cluster "sbatch --mem={cluster.mem} -t {cluster.time} -o {cluster.output} -p {cluster.partition}" 23 | -------------------------------------------------------------------------------- /hypercluster/__init__.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | import seaborn as sns 3 | import hypercluster 4 | from hypercluster import ( 5 | utilities, additional_clusterers, additional_metrics, classes, constants, visualize 6 | ) 7 | from hypercluster.classes import AutoClusterer, MultiAutoClusterer 8 | __version__ = '0.1.13' 9 | __all__ = [ 10 | "AutoClusterer", 11 | "MultiAutoClusterer" 12 | ] 13 | 14 | matplotlib.rcParams["pdf.fonttype"] = 42 15 | matplotlib.rcParams["ps.fonttype"] = 42 16 | 
sns.set(font="arial", style="white", color_codes=True, font_scale=1.3) 17 | matplotlib.rcParams.update({"savefig.bbox": "tight"}) -------------------------------------------------------------------------------- /hypercluster/additional_clusterers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Additional clustering classes can be added here, as long as they have a 'fit' method. 3 | 4 | 5 | Attributes: 6 | HDBSCAN (clustering class): See `hdbscan`_ 7 | 8 | .. _hdbscan: 9 | https://hdbscan.readthedocs.io/en/latest/basic_hdbscan.html#the-simple-case/ 10 | """ 11 | from typing import Optional, Iterable 12 | import logging 13 | import numpy as np 14 | import pandas as pd 15 | from scipy.spatial.distance import pdist, squareform 16 | from sklearn.decomposition import NMF 17 | from sklearn.neighbors import NearestNeighbors 18 | from hdbscan import HDBSCAN 19 | from .constants import pdist_adjacency_methods, valid_partition_types 20 | import igraph as ig 21 | import louvain 22 | import leidenalg 23 | 24 | 25 | class NMFCluster: 26 | """Uses non-negative matrix factorization (NMF) from sklearn to assign clusters to samples, based on the 27 | maximum membership score of the sample per component. 28 | 29 | Args: 30 | n_clusters: The number of clusters to find. Used as n_components when fitting. 31 | **nmf_kwargs: Additional keyword arguments to pass to sklearn.decomposition.NMF. 32 | """ 33 | def __init__(self, n_clusters: int = 8, **nmf_kwargs): 34 | 35 | nmf_kwargs['n_components'] = n_clusters 36 | 37 | self.NMF = NMF(**nmf_kwargs) 38 | self.n_clusters = n_clusters 39 | 40 | def fit(self, data): 41 | """If negative numbers are present, creates one data matrix with all negative numbers 42 | zeroed and another data matrix with all positive numbers zeroed and the signs of all 43 | negative numbers reversed. Concatenates both matrices, resulting in a data matrix twice as 44 | large as the original but containing only zeros and positive values, hence appropriate for 45 | NMF. Uses the decomposed matrix W, which is n x k (with n = number of samples and k = number of 46 | components), to assign cluster membership. Each sample is assigned to the cluster for 47 | which it has the highest membership score. See `sklearn.decomposition.NMF`_ 48 | 49 | Args: 50 | data (DataFrame): Data to fit with samples as rows and features as columns. 51 | 52 | Returns: 53 | self with labels\_ attribute. 54 | 55 | .. _sklearn.decomposition.NMF: 56 | https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html 57 | """ 58 | 59 | if np.any(data<0): 60 | positive = data.copy() 61 | positive[positive < 0] = 0 62 | negative = data.copy() 63 | negative[negative > 0] = 0 64 | negative = -negative 65 | data = pd.concat([positive, negative], axis=1, join='outer') 66 | 67 | self.labels_ = pd.DataFrame(self.NMF.fit_transform(data)).idxmax(axis=1).values 68 | return self 69 | 70 | 71 | class LouvainCluster: 72 | """Louvain clustering on a graph derived from an adjacency matrix. 73 | 74 | Args: 75 | adjacency_method: Method to use to construct adjacency matrix, which is used to construct \ 76 | graph that will be clustered. Valid methods are any metric valid in \ 77 | scipy.spatial.distance.pdist, or MNN, for mutual nearest neighbors and CNN for common \ 78 | nearest neighbors. Both use sklearn.neighbors.NearestNeighbors at a given k to calculate \ 79 | NNs. MNN then uses whether points i and j are each other's NNs as edge weights. CNN uses \ 80 | the count of how many NNs i and j have in common as the edge weight.
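Any pdist metric instead builds a dense weighted graph directly from the pairwise sample distances.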
81 | k: If using CNN or MNN, k to use to construct the NearestNeighbors matrix. 82 | resolution: If using 'RBConfigurationVertexPartition' or 'CPMVertexPartition', which \ 83 | resolution to use. Other partitioners ignore this, but any other kwargs for \ 84 | those partitioners can be passed too. 85 | adjacency_kwargs: Additional keyword arguments to pass to \ 86 | sklearn.neighbors.NearestNeighbors or scipy.spatial.distance.pdist to construct the \ 87 | adjacency matrix. 88 | partition_type: Which partition to use for louvain clustering, see `louvain-igraph`_ for \ 89 | more info. 90 | **louvain_kwargs: Additional kwargs to be passed to `find_partition`_ 91 | 92 | .. _louvain-igraph: 93 | https://louvain-igraph.readthedocs.io/en/latest/reference.html 94 | .. _find_partition: 95 | https://louvain-igraph.readthedocs.io/en/latest/reference.html#louvain.find_partition 96 | """ 97 | def __init__( 98 | self, 99 | adjacency_method: str = 'MNN', 100 | k: int = 20, 101 | resolution: float = 0.8, 102 | adjacency_kwargs: Optional[dict] = None, 103 | partition_type: str = 'RBConfigurationVertexPartition', 104 | **louvain_kwargs 105 | ): 106 | 107 | if adjacency_method not in ['MNN', 'CNN'] + pdist_adjacency_methods: 108 | raise ValueError( 109 | 'Adjacency method %s invalid. Must be "MNN", "CNN" or a valid metric for ' 110 | 'scipy.spatial.distance.pdist.' % adjacency_method 111 | ) 112 | if partition_type not in valid_partition_types: 113 | raise ValueError( 114 | 'Partition type %s not valid, must be in constants.valid_partition_types' % 115 | partition_type 116 | ) 117 | self.adjacency_method = adjacency_method 118 | self.k = int(k) 119 | self.resolution = resolution 120 | self.adjacency_kwargs = adjacency_kwargs 121 | self.partition_type = partition_type 122 | self.louvain_kwargs = louvain_kwargs 123 | 124 | def fit( 125 | self, 126 | data: pd.DataFrame, 127 | ): 128 | adjacency_method = self.adjacency_method 129 | k = self.k 130 | resolution = self.resolution 131 | adjacency_kwargs = self.adjacency_kwargs 132 | louvain_kwargs = self.louvain_kwargs 133 | partition_type = self.partition_type 134 | if k >= len(data): 135 | logging.warning( 136 | 'k was set to %s, but there are only %s samples. 
Setting k to %s' 137 | % (k, len(data), len(data) - 1) 138 | ) 139 | k = len(data) - 1 140 | if adjacency_kwargs is None: 141 | adjacency_kwargs = {} 142 | if (adjacency_method == 'MNN') | (adjacency_method == 'CNN'): 143 | adjacency_kwargs['n_neighbors'] = adjacency_kwargs.get('n_neighbors', k) 144 | nns = NearestNeighbors(**adjacency_kwargs) 145 | nns.fit(data) 146 | adjacency_mat = nns.kneighbors_graph(data) 147 | if adjacency_method == 'MNN': 148 | adjacency_mat = adjacency_mat.multiply(adjacency_mat.transpose()) 149 | if adjacency_method == 'CNN': 150 | adjacency_mat = adjacency_mat*adjacency_mat.transpose() 151 | elif adjacency_method in pdist_adjacency_methods: 152 | adjacency_mat = squareform(pdist(data, metric=adjacency_method, **adjacency_kwargs)) 153 | 154 | if louvain_kwargs is None: 155 | louvain_kwargs = {} 156 | g = ig.Graph.Weighted_Adjacency(adjacency_mat.toarray().tolist() if hasattr(adjacency_mat, 'toarray') else adjacency_mat.tolist()) 157 | 158 | if partition_type in ['RBConfigurationVertexPartition', 'CPMVertexPartition']: 159 | louvain_kwargs['resolution_parameter'] = resolution 160 | 161 | labels = louvain.find_partition(g, getattr(louvain, partition_type), **louvain_kwargs) 162 | labels = pd.Series({v: i for i in range(len(labels)) for v in labels[i]}).sort_index() 163 | if labels.is_unique or (len(labels.unique()) == 1): 164 | labels = pd.Series([-1 for i in range(len(labels))]) 165 | labels = labels.values 166 | self.labels_ = labels 167 | return self 168 | 169 | 170 | class LeidenCluster: 171 | """Leiden clustering on a graph derived from an adjacency matrix. See `reference`_ for more info 172 | 173 | Args: 174 | adjacency_method: Method to use to construct adjacency matrix, which is used to construct \ 175 | graph that will be clustered. Valid methods are any metric valid in \ 176 | scipy.spatial.distance.pdist, or MNN, for mutual nearest neighbors and CNN for common \ 177 | nearest neighbors. Both use sklearn.neighbors.NearestNeighbors at a given k to calculate \ 178 | NNs. MNN then uses whether points i and j are each other's NNs as edge weights. CNN uses \ 179 | the count of how many NNs i and j have in common as the edge weight. 180 | k: If using CNN or MNN, k to use to construct the NearestNeighbors matrix. 181 | resolution: If using 'RBConfigurationVertexPartition' or 'CPMVertexPartition', which \ 182 | resolution to use. Other partitioners ignore this, but any other kwargs for \ 183 | those partitioners can be passed too. 184 | adjacency_kwargs: Additional keyword arguments to pass to \ 185 | sklearn.neighbors.NearestNeighbors or scipy.spatial.distance.pdist to construct the \ 186 | adjacency matrix. 187 | partition_type: Which partition to use for leiden clustering, see `leidenalg`_ for \ 188 | more info. 189 | **leiden_kwargs: Additional kwargs to be passed to `find_partition`_ 190 | .. _reference: 191 | https://www.nature.com/articles/s41598-019-41695-z 192 | .. _leidenalg: 193 | https://leidenalg.readthedocs.io/en/latest/reference.html 194 | .. 
_find_partition: 195 | https://leidenalg.readthedocs.io/en/latest/reference.html#leidenalg.find_partition 196 | """ 197 | def __init__( 198 | self, 199 | adjacency_method: str = 'MNN', 200 | k: int = 20, 201 | resolution: float = 0.8, 202 | adjacency_kwargs: Optional[dict] = None, 203 | partition_type: str = 'RBConfigurationVertexPartition', 204 | **leiden_kwargs 205 | ): 206 | 207 | self.adjacency_method = adjacency_method 208 | self.k = int(k) 209 | self.resolution = resolution 210 | self.adjacency_kwargs = adjacency_kwargs 211 | self.partition_type = partition_type 212 | self.leiden_kwargs = leiden_kwargs 213 | 214 | def fit( 215 | self, 216 | data: pd.DataFrame, 217 | ): 218 | 219 | adjacency_method = self.adjacency_method 220 | k = self.k 221 | resolution = self.resolution 222 | adjacency_kwargs = self.adjacency_kwargs 223 | leiden_kwargs = self.leiden_kwargs 224 | partition_type = self.partition_type 225 | if k >= len(data): 226 | logging.warning( 227 | 'k was set to %s, but there are only %s samples. Setting k to %s' 228 | % (k, len(data), len(data) - 1) 229 | ) 230 | k = len(data) - 1 231 | if adjacency_kwargs is None: 232 | adjacency_kwargs = {} 233 | if (adjacency_method == 'MNN') | (adjacency_method == 'CNN'): 234 | adjacency_kwargs['n_neighbors'] = adjacency_kwargs.get('n_neighbors', k) 235 | nns = NearestNeighbors(**adjacency_kwargs) 236 | nns.fit(data) 237 | adjacency_mat = nns.kneighbors_graph(data) 238 | if adjacency_method == 'MNN': 239 | adjacency_mat = adjacency_mat.multiply(adjacency_mat.transpose()) 240 | if adjacency_method == 'CNN': 241 | adjacency_mat = adjacency_mat * adjacency_mat.transpose() 242 | elif adjacency_method in pdist_adjacency_methods: 243 | adjacency_mat = squareform(pdist(data, metric=adjacency_method, **adjacency_kwargs)) 244 | 245 | if leiden_kwargs is None: 246 | leiden_kwargs = {} 247 | g = ig.Graph.Weighted_Adjacency(adjacency_mat.toarray().tolist() if hasattr(adjacency_mat, 'toarray') else adjacency_mat.tolist()) 248 | 249 | if partition_type in ['RBConfigurationVertexPartition', 'CPMVertexPartition']: 250 | leiden_kwargs['resolution_parameter'] = resolution 251 | 252 | labels = leidenalg.find_partition(g, getattr(leidenalg, partition_type), **leiden_kwargs) 253 | labels = pd.Series({v:i for i in range(len(labels)) for v in labels[i]}).sort_index() 254 | if labels.is_unique or (len(labels.unique()) == 1): 255 | labels = pd.Series([-1 for i in range(len(labels))]) 256 | labels = labels.values 257 | self.labels_ = labels 258 | return self 259 | -------------------------------------------------------------------------------- /hypercluster/additional_metrics.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, Optional 2 | from collections import Counter 3 | from pandas import DataFrame 4 | from scipy.cluster.hierarchy import linkage, cophenet 5 | from scipy.spatial.distance import pdist 6 | 7 | __doc__ = ( 8 | "More functions for evaluating clustering results. Additional metric evaluations can " 9 | "be added here, as long as the second argument is the labels to evaluate" 10 | ) 11 | 12 | 13 | def number_clustered(_, labels: Iterable) -> int: 14 | """Returns the number of clustered samples. 15 | 16 | Args: 17 | _: Dummy, pass anything or None. 18 | labels (Iterable): Vector of sample labels. 19 | 20 | Returns (int): 21 | The number of clustered samples. 22 | 23 | """ 24 | return (labels != -1).sum() 25 | 26 | 27 | def smallest_largest_clusters_ratio(_, labels: Iterable) -> float: 28 | """Number in the smallest cluster over the number in the largest cluster.
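Samples labeled -1 (i.e., unclustered) are excluded from both counts.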
29 | 30 | Args: 31 | _: Dummy, pass anything or None. 32 | labels (Iterable): Vector of sample labels. 33 | 34 | Returns (float): 35 | Ratio of number of members in smallest over largest cluster. 36 | 37 | """ 38 | counts = Counter(labels) 39 | counts.pop(-1, None) 40 | return min(counts.values()) / max(counts.values()) 41 | 42 | 43 | def smallest_cluster_ratio(_, labels: Iterable) -> float: 44 | """Number in the smallest cluster over the total samples. 45 | 46 | Args: 47 | _: Dummy, pass anything or None. 48 | labels (Iterable): Vector of sample labels. 49 | 50 | Returns (float): 51 | Ratio of number of members in smallest over all samples. 52 | 53 | """ 54 | counts = Counter(labels) 55 | counts.pop(-1, None) 56 | return min(counts.values()) / len(labels) 57 | 58 | 59 | def number_of_clusters(_, labels: Iterable) -> int: 60 | """Number of total clusters. 61 | 62 | Args: 63 | _: Dummy, pass anything or None 64 | labels (Iterable): Vector of sample labels. 65 | 66 | Returns (int): 67 | Number of clusters. 68 | 69 | """ 70 | return len(Counter(labels)) 71 | 72 | 73 | def smallest_cluster_size(_, labels: Iterable) -> int: 74 | """Number in smallest cluster 75 | 76 | Args: 77 | _: Dummy, pass anything or None 78 | labels (Iterable): Vector of sample labels. 79 | 80 | Returns (int): 81 | Number of samples in smallest cluster. 82 | 83 | """ 84 | return min(Counter(labels).values()) 85 | 86 | 87 | def largest_cluster_size(_, labels: Iterable) -> int: 88 | """Number in largest cluster 89 | 90 | Args: 91 | _: Dummy, pass anything or None 92 | labels (Iterable): Vector of sample labels. 93 | 94 | Returns (int): 95 | Number of samples in largest cluster. 96 | 97 | """ 98 | return max(Counter(labels).values()) 99 | -------------------------------------------------------------------------------- /hypercluster/constants.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | __doc__ = """ 5 | Attributes: 6 | param_delim: delimiter between hyperparameters for snakemake file labels and labels DataFrame \ 7 | columns. 8 | val_delim: delimiter between hyperparameter label and value for snakemake file labels and \ 9 | labels DataFrame columns. 10 | categories: Convenient groups of clusterers to use. If all samples need to be clustered, \ 11 | 'partitioning' is a good choice. If there are millions of samples, 'fastest' might be a good \ 12 | choice. 13 | variables_to_optimize: Some default hyperparameters to optimize and value ranges for a \ 14 | selection of commonly used clustering algorithms from sklearn. Used as defaults for \ 15 | clustering.AutoClusterer and clustering.optimize_clustering. 16 | need_ground_truth: list of sklearn metrics that need ground truth labeling. \ 17 | "adjusted_rand_score", "adjusted_mutual_info_score", "homogeneity_score", \ 18 | "completeness_score", "fowlkes_mallows_score", "mutual_info_score", "v_measure_score" 19 | inherent_metrics: list of sklearn metrics that need original data for calculation. \ 20 | "silhouette_score", "calinski_harabasz_score", "davies_bouldin_score", \ 21 | "smallest_largest_clusters_ratio", "number_of_clusters", "smallest_cluster_size", \ 22 | "largest_cluster_size" 23 | min_or_max: establishing whether each sklearn metric is better when minimized or maximized for \ 24 | clustering.pick_best_labels. 
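For example, min_or_max['davies_bouldin_score'] is 'min', since lower Davies-Bouldin scores indicate better clustering, while min_or_max['silhouette_score'] is 'max'.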
25 | """ 26 | param_delim = ";" 27 | val_delim = "-" 28 | 29 | slow = ["AffinityPropagation", "MeanShift"] 30 | fast = ["KMeans", "OPTICS", "HDBSCAN"] 31 | fastest = ["MiniBatchKMeans"] 32 | partitioners = ["AffinityPropagation", "MeanShift", "KMeans", "MiniBatchKMeans"] 33 | clusterers = ["OPTICS", "HDBSCAN"] 34 | categories = { 35 | "slow": slow, 36 | "fast": fast, 37 | "fastest": fastest, 38 | "partitioning": partitioners, 39 | "clustering": clusterers, 40 | } 41 | 42 | min_cluster_size = [i for i in range(2, 17, 2)] 43 | n_clusters = [i for i in range(2, 41)] 44 | damping = [i / 100 for i in range(55, 95, 5)] 45 | resolutions = [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6] 46 | knn = [20, 30, 60] 47 | 48 | 49 | variables_to_optimize = { 50 | "HDBSCAN": dict(min_cluster_size=min_cluster_size), 51 | "KMeans": dict(n_clusters=n_clusters), 52 | "MiniBatchKMeans": dict(n_clusters=n_clusters), 53 | "AffinityPropagation": dict(damping=damping), 54 | "MeanShift": dict(cluster_all=[False]), 55 | "OPTICS": dict(min_samples=min_cluster_size), 56 | "NMFCluster": dict(n_clusters=n_clusters), 57 | "LouvainCluster": dict(resolution=resolutions, k=knn), 58 | "LeidenCluster": dict(resolution=resolutions, k=knn), 59 | } 60 | 61 | 62 | need_ground_truth = [ 63 | "adjusted_rand_score", 64 | "adjusted_mutual_info_score", 65 | "homogeneity_score", 66 | "completeness_score", 67 | "fowlkes_mallows_score", 68 | "mutual_info_score", 69 | "v_measure_score", 70 | ] 71 | 72 | inherent_metrics = [ 73 | "silhouette_score", 74 | "calinski_harabasz_score", 75 | "davies_bouldin_score", 76 | "smallest_largest_clusters_ratio", 77 | "number_of_clusters", 78 | "smallest_cluster_size", 79 | "largest_cluster_size" 80 | ] 81 | 82 | min_or_max = { 83 | "adjusted_rand_score": 'max', 84 | "adjusted_mutual_info_score": 'max', 85 | "homogeneity_score": 'max', 86 | "completeness_score": 'max', 87 | "fowlkes_mallows_score": 'max', 88 | "silhouette_score": 'max', 89 | "calinski_harabasz_score": 'max', 90 | "davies_bouldin_score": 'min', 91 | "mutual_info_score": 'max', 92 | "v_measure_score": 'max', 93 | } 94 | 95 | pdist_adjacency_methods = [ 96 | 'braycurtis', 97 | 'canberra', 98 | 'chebyshev', 99 | 'cityblock', 100 | 'correlation', 101 | 'cosine', 102 | 'dice', 103 | 'euclidean', 104 | 'hamming', 105 | 'jaccard', 106 | 'jensenshannon', 107 | 'kulsinski', 108 | 'mahalanobis', 109 | 'matching', 110 | 'minkowski', 111 | 'rogerstanimoto', 112 | 'russellrao', 113 | 'seuclidean', 114 | 'sokalmichener', 115 | 'sokalsneath', 116 | 'sqeuclidean', 117 | 'yule' 118 | ] 119 | 120 | 121 | valid_partition_types = [ 122 | 'RBConfigurationVertexPartition', 123 | 'ModularityVertexPartition', 124 | 'RBERVertexPartition', 125 | 'CPMVertexPartition', 126 | 'SignificanceVertexPartition', 127 | 'SurpriseVertexPartition' 128 | ] -------------------------------------------------------------------------------- /hypercluster/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liliblu/hypercluster/8ea9a59b9755007ebe2612e1f3586a6b27e71c70/hypercluster/tests/__init__.py -------------------------------------------------------------------------------- /hypercluster/tests/test_clustering.py: -------------------------------------------------------------------------------- 1 | from hypercluster import utilities 2 | from hypercluster.constants import * 3 | import hypercluster 4 | import pandas as pd 5 | import numpy as np 6 | 7 | 8 | test_data = pd.DataFrame( 9 | np.array( 10 | [[1, 2], 
[-1.8, 4], [1, -0.5], 11 | [10, 2], [-10, 4], [10, 0], 12 | [np.nan, 5], [3.2, np.nan], [0, 14], 13 | [-16.4, 3.67], [13.22, -3], [3.3, np.nan], 14 | [42, np.nan], [-8, 2], [1.2, 12], 15 | [np.nan, 2.1], [0.25, np.nan], [0.1, 1.11], 16 | [-44, 0], [-0.22, -0.11], [2.34, 6.7], 17 | [-10, np.nan], [-2.3, -2.5], [np.nan, 0], 18 | [np.nan, 22], [8.6, -7.5], [0, 14], 19 | [-6.4, 23.67], [-3.22, 3], [np.nan, np.nan], 20 | [-20, 2.01], [0.25, -.25], [0.455, 0.233], 21 | [np.nan, -0.89], [19, np.nan], [np.nan, np.nan], 22 | [-29, 3.6], [-13, -3], [3.3, np.nan], 23 | [-4, np.nan], [-0.2, -0.1], [0.34, 0.7]] 24 | ) 25 | ) 26 | 27 | 28 | test_data['ind1'] = 'a' 29 | test_data['ind2'] = range(len(test_data)) 30 | test_data = test_data.set_index(['ind1', 'ind2']) 31 | test_data = test_data.fillna(test_data.median()) 32 | 33 | test_ground_truth = pd.Series( 34 | np.random.randint(0, 2, size=(len(test_data), )), 35 | index=test_data.index 36 | ) 37 | 38 | 39 | def test_cluster_one(): 40 | # Test all clusterers are working with default params 41 | for clus_name in variables_to_optimize.keys(): 42 | utilities.cluster(clus_name, test_data) 43 | 44 | # Test with putting extra params in there 45 | for clus_name in variables_to_optimize.keys(): 46 | vars = variables_to_optimize[clus_name] 47 | key = list(vars.keys())[0] 48 | params = {key: vars[key][0]} 49 | # grabbing a variable and making sure var passing works 50 | utilities.cluster(clus_name, test_data, params) 51 | 52 | 53 | def test_autoclusterer(): 54 | for clus_name in variables_to_optimize.keys(): 55 | hypercluster.AutoClusterer(clus_name).fit(test_data) 56 | for clus_name in variables_to_optimize.keys(): 57 | hypercluster.AutoClusterer(clus_name, random_search=False).fit(test_data) 58 | 59 | 60 | def test_param_weights(): 61 | for clus_name in variables_to_optimize.keys(): 62 | weights = { 63 | param: {value: (1/len(values)) for value in values} for param, values in 64 | variables_to_optimize[ 65 | clus_name 66 | ].items() 67 | } 68 | hypercluster.AutoClusterer(clus_name, param_weights=weights).fit( 69 | test_data 70 | ) 71 | for clus_name in variables_to_optimize.keys(): 72 | hypercluster.AutoClusterer(clus_name, random_search=False).fit(test_data) 73 | 74 | 75 | def test_passing_kwargs_for_a_clusterer(): 76 | clus_name = 'KMeans' 77 | 78 | hypercluster.AutoClusterer(clus_name, clus_kwargs={'max_iter': 50}).fit( 79 | test_data 80 | ) 81 | 82 | 83 | def test_evaluate_results(): 84 | labs = hypercluster.AutoClusterer('KMeans').fit(test_data).labels_ 85 | for metric in inherent_metrics + need_ground_truth: 86 | utilities.evaluate_one( 87 | labs[labs.columns[0]], metric, data=test_data, gold_standard=test_ground_truth 88 | ) 89 | 90 | 91 | def test_multiauto(): 92 | hypercluster.MultiAutoClusterer().fit(test_data).evaluate() 93 | -------------------------------------------------------------------------------- /hypercluster/tests/test_visualize.py: -------------------------------------------------------------------------------- 1 | from hypercluster import visualize 2 | import hypercluster 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | test_data = pd.DataFrame( 8 | np.array( 9 | [[1, 2], [-1.8, 4], [1, -0.5], 10 | [10, 2], [-10, 4], [10, 0], 11 | [np.nan, 5], [3.2, np.nan], [0, 14], 12 | [-16.4, 3.67], [13.22, -3], [3.3, np.nan], 13 | [42, np.nan], [-8, 2], [1.2, 12], 14 | [np.nan, 2.1], [0.25, np.nan], [0.1, 1.11], 15 | [-44, 0], [-0.22, -0.11], [2.34, 6.7], 16 | [-10, np.nan], [-2.3, -2.5], [np.nan, 0], 17 | [np.nan, 22], [8.6, -7.5], [0, 
14], 18 | [-6.4, 23.67], [-3.22, 3], [np.nan, np.nan], 19 | [-20, 2.01], [0.25, -.25], [0.455, 0.233], 20 | [np.nan, -0.89], [19, np.nan], [np.nan, np.nan], 21 | [-29, 3.6], [-13, -3], [3.3, np.nan], 22 | [-4, np.nan], [-0.2, -0.1], [0.34, 0.7]] 23 | ) 24 | ) 25 | 26 | 27 | test_data['ind1'] = 'a' 28 | test_data['ind2'] = range(len(test_data)) 29 | test_data = test_data.set_index(['ind1', 'ind2']) 30 | test_data = test_data.fillna(test_data.median()) 31 | 32 | test_ground_truth = pd.Series( 33 | np.random.randint(0, 2, size=(len(test_data), )), 34 | index=test_data.index 35 | ) 36 | 37 | 38 | def test_vis_eval(): 39 | clusterer = hypercluster.MultiAutoClusterer().fit(test_data).evaluate() 40 | visualize.visualize_evaluations(clusterer.evaluation_df) 41 | clusterer.visualize_evaluations( 42 | # savefig=True 43 | ) 44 | visualize.visualize_for_picking_labels( 45 | clusterer.evaluation_df, savefig_prefix='test_visualize_for_picking' 46 | ) 47 | 48 | clusterer = hypercluster.AutoClusterer().fit(test_data).evaluate() 49 | visualize.visualize_evaluations(clusterer.evaluation_df) 50 | clusterer.visualize_evaluations() 51 | 52 | 53 | def test_vis_sample(): 54 | clusterer = hypercluster.MultiAutoClusterer().fit(test_data).evaluate() 55 | visualize.visualize_sample_label_consistency(clusterer.labels_df) 56 | clusterer.visualize_sample_label_consistency() 57 | 58 | clusterer = hypercluster.AutoClusterer().fit(test_data).evaluate() 59 | visualize.visualize_sample_label_consistency(clusterer.labels_df) 60 | clusterer.visualize_sample_label_consistency() 61 | 62 | 63 | def test_vis_labels(): 64 | clusterer = hypercluster.MultiAutoClusterer().fit(test_data).evaluate() 65 | visualize.visualize_label_agreement(clusterer.labels_df) 66 | clusterer.visualize_label_agreement( 67 | savefig=True, 68 | ) 69 | 70 | clusterer = hypercluster.AutoClusterer().fit(test_data).evaluate() 71 | visualize.visualize_label_agreement(clusterer.labels_df) 72 | clusterer.visualize_label_agreement() -------------------------------------------------------------------------------- /hypercluster/utilities.py: -------------------------------------------------------------------------------- 1 | from sklearn.cluster import * 2 | from sklearn.metrics import * 3 | from .additional_clusterers import * 4 | from .additional_metrics import * 5 | from pandas import DataFrame 6 | import pandas as pd 7 | import numpy as np 8 | import logging 9 | from typing import Optional, Iterable, Dict 10 | from .constants import * 11 | from hypercluster.constants import param_delim, val_delim 12 | 13 | 14 | def calculate_row_weights( 15 | row: Iterable, param_weights: dict, vars_to_optimize: dict 16 | ) -> float: 17 | """Used to select random rows of parameter combinations using individual parameter weights. 18 | 19 | Args: 20 | row (Iterable): Series of parameters, with parameter names as index. 21 | param_weights (dict): Dictionary of str: dictionaries. Ex format - {'parameter_name':{ \ 22 | 'param_option_1':0.5, 'param_option_2':0.5}}. 23 | vars_to_optimize (Iterable): Dictionary with possibilities for different parameters. Ex \ 24 | format - {'parameter_name':[1, 2, 3, 4, 5]}. 25 | 26 | Returns (float): 27 | Float representing the probability of seeing that combination of parameters, given their \ 28 | individual weights. 
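Example (illustrative values only): with vars_to_optimize = {'damping': [0.5, 0.6]} and param_weights = {'damping': {0.5: 0.75}}, the unweighted option 0.6 receives the remaining weight of 0.25, so a row with damping=0.5 returns 0.75.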
29 | 30 | """ 31 | param_weights.update({ 32 | param: { 33 | val: param_weights.get(param, {}).get( 34 | val, (1-sum(param_weights.get(param, {}).values()))/len([ 35 | notweighted for notweighted in vars_to_optimize.get(param, {}) 36 | if notweighted not in param_weights.get(param, {}).keys() 37 | ]) 38 | ) for val in vals 39 | } for param, vals in vars_to_optimize.items() 40 | }) 41 | 42 | return np.prod([param_weights[param][val] for param, val in row.to_dict().items()]) 43 | 44 | 45 | def cluster(clusterer_name: str, data: DataFrame, params: dict = {}): 46 | """Runs a given clusterer with a given set of parameters. 47 | 48 | Args: 49 | clusterer_name (str): String name of clusterer. 50 | data (DataFrame): DataFrame with samples to cluster as the index and features as columns. 51 | params (dict): Dictionary of parameter names and values to feed into clusterer. Default {} 52 | 53 | Returns: 54 | Instance of the clusterer fit with the data provided. 55 | """ 56 | clusterer = eval(clusterer_name)(**params) 57 | return clusterer.fit(data) 58 | 59 | 60 | def evaluate_one( 61 | labels: Iterable, 62 | method: str = "silhouette_score", 63 | data: Optional[DataFrame] = None, 64 | gold_standard: Optional[Iterable] = None, 65 | metric_kwargs: Optional[dict] = None, 66 | ) -> float: 67 | """Uses a given metric to evaluate clustering results. 68 | 69 | Args: 70 | labels (Iterable): Series of labels. 71 | method (str): Name of the evaluation metric to use. Default is silhouette_score. 72 | data (DataFrame): If using an inherent metric, must provide DataFrame with which to \ 73 | calculate the metric. 74 | gold_standard (Iterable): If using a metric that compares to ground truth, must provide a \ 75 | set of gold standard labels. 76 | metric_kwargs (dict): Additional kwargs to use in evaluation. 77 | 78 | Returns (float): 79 | Metric value. 80 | """ 81 | if not isinstance(labels, pd.Series): 82 | labels = pd.Series(labels) 83 | if len(labels[labels != -1].unique()) < 2: 84 | return np.nan 85 | 86 | if metric_kwargs is None: 87 | metric_kwargs = {} 88 | 89 | if method in need_ground_truth: 90 | if gold_standard is None: 91 | raise ValueError( 92 | "Chosen evaluation metric %s requires gold standard set." % method 93 | ) 94 | clustered = (gold_standard != -1) & (labels != -1) 95 | compare_to = gold_standard[clustered] 96 | 97 | elif method in inherent_metrics: 98 | if data is None: 99 | raise ValueError( 100 | "Chosen evaluation metric %s requires data input." % method 101 | ) 102 | clustered = labels != -1 103 | compare_to = data.loc[clustered] 104 | else: 105 | compare_to = None 106 | clustered = labels.index 107 | 108 | return eval(method)(compare_to, labels[clustered], **metric_kwargs) 109 | 110 | 111 | def generate_flattened_df(df_dict: Dict[str, DataFrame]) -> DataFrame: 112 | """Takes a dictionary of results from many clusterers and makes one DataFrame. Opposite of \ 113 | convert_to_multiind. 114 | 115 | Args: 116 | df_dict (Dict[str, DataFrame]): Dictionary of dataframes to flatten. Can be .labels_ or \ 117 | .evaluations_ from MultiAutoClusterer. 118 | 119 | Returns: 120 | Flattened DataFrame with all data. 
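Column names collapse each clusterer and its parameters into a single string with param_delim and val_delim, e.g. a KMeans run with n_clusters=3 becomes the column 'KMeans;n_clusters-3'.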
121 | """ 122 | merged_df = pd.DataFrame() 123 | for clus_name, df in df_dict.items(): 124 | df = df.transpose() 125 | cols_for_labels = df.index.to_frame() 126 | inds = cols_for_labels.apply( 127 | lambda row: param_delim.join( 128 | [clus_name] + ["%s%s%s" % (k, val_delim, v) for k, v in row.to_dict().items()] 129 | ), 130 | axis=1, 131 | ) 132 | df.index = inds 133 | df = df.transpose() 134 | 135 | merged_df = pd.concat( 136 | [merged_df, df], join="outer", axis=1 137 | ) 138 | return merged_df 139 | 140 | 141 | def convert_to_multiind(key: str, df: DataFrame) -> DataFrame: 142 | """Takes columns from a single clusterer from Clusterer.labels_df or .evaluation_df and 143 | converts to a multiindexed rather than collapsed into string. Equivalent to grabbing 144 | Clusterer.labels[clusterer] or .evaluations[clusterer]. Opposite of generate_flattened_df. 145 | 146 | Args: 147 | key (str): Name of clusterer, must match beginning of columns to convert. 148 | df (DataFrame): Dataframe to grab chunk from. 149 | 150 | Returns: 151 | Subset DataFrame with multiindex. 152 | 153 | """ 154 | clus_cols = [col for col in df.columns if col.split(param_delim, 1)[0] == key] 155 | temp = df[clus_cols].transpose() 156 | temp.index = pd.MultiIndex.from_frame( 157 | pd.DataFrame([{ 158 | s.split(val_delim, 1)[0]: s.split(val_delim, 1)[1] for s in i.split(param_delim)[1:] 159 | } for i in temp.index]).astype(float, errors='ignore') 160 | ) 161 | return temp.sort_index().transpose() 162 | 163 | 164 | def pick_best_labels( 165 | evaluation_results_df: DataFrame, 166 | clustering_labels_df: DataFrame, 167 | method: Optional[str] = None, 168 | min_or_max: Optional[str] = None 169 | ) -> Iterable: 170 | """From evaluations and a metric to minimize or maximize, return all labels with top pick. 171 | 172 | Args: 173 | evaluation_results_df (DataFrame): Evaluations DataFrame from optimize_clustering. 174 | clustering_labels_df (DataFrame): Labels DataFrame from optimize_clustering. 175 | method (str): Method with which to choose the best labels. 176 | min_or_max (str): Whether to minimize or maximize the metric. Must be 'min' or 'max'. 177 | Returns (DataFrame): 178 | DataFrame of all top labels. 
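Example (a minimal sketch; assumes evals and labels are the evaluation and label DataFrames from optimize_clustering): best = pick_best_labels(evals, labels, method='davies_bouldin_score', min_or_max='min')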
179 | """ 180 | if method is None: 181 | method = "silhouette_score" 182 | if min_or_max is None: 183 | min_or_max = 'max' 184 | 185 | best_labels = evaluation_results_df.loc[method, :] 186 | if min_or_max == 'min': 187 | best_labels = best_labels.index[best_labels == best_labels.min()] 188 | return clustering_labels_df[best_labels] 189 | elif min_or_max == 'max': 190 | best_labels = best_labels.index[best_labels == best_labels.max()] 191 | return clustering_labels_df[best_labels] 192 | logging.error("min_or_max must be either 'min' or 'max'; %s is an invalid choice" % min_or_max) 193 | 194 | 195 | -------------------------------------------------------------------------------- /hypercluster/visualize.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | import logging 3 | from collections import Counter 4 | from itertools import cycle 5 | import numpy as np 6 | import matplotlib 7 | import matplotlib.pyplot as plt 8 | import seaborn as sns 9 | from pandas import DataFrame 10 | from scipy.cluster import hierarchy 11 | from scipy.spatial.distance import pdist 12 | from hypercluster.constants import param_delim 13 | from hypercluster.utilities import convert_to_multiind, evaluate_one 14 | 15 | matplotlib.rcParams["pdf.fonttype"] = 42 16 | matplotlib.rcParams["ps.fonttype"] = 42 17 | sns.set(font="arial", style="white", color_codes=True, font_scale=1.3) 18 | matplotlib.rcParams.update({"savefig.bbox": "tight"}) 19 | cmap = sns.cubehelix_palette( 20 | start=0, 21 | rot=0.4, 22 | gamma=1.0, 23 | hue=0.82, 24 | light=1, 25 | dark=0, 26 | reverse=False, 27 | as_cmap=True 28 | ) 29 | cmap.set_over('black') 30 | cmap.set_under('white') 31 | cmap.set_bad("#DAE0E6") 32 | 33 | 34 | def zscore(df): 35 | """Row z-scores a DataFrame, ignoring np.nan. 36 | 37 | Args: 38 | df (DataFrame): DataFrame to z-score. 39 | 40 | Returns (DataFrame): 41 | Row-zscored DataFrame. 42 | """ 43 | return df.subtract(df.mean(axis=1), axis=0).divide(df.std(axis=1), axis=0) 44 | 45 | 46 | def compute_order( 47 | df, 48 | dist_method: str = "euclidean", 49 | cluster_method: str = "average" 50 | ): 51 | """Gives the hierarchical clustering order for the rows of a DataFrame. 52 | 53 | Args: 54 | df (DataFrame): DataFrame with rows to order. 55 | dist_method (str): Distance metric to pass to scipy.spatial.distance.pdist. 56 | cluster_method (str): Linkage method to pass to scipy.cluster.hierarchy.linkage. 57 | 58 | Returns (pandas.Index): 59 | Ordered row index. 60 | 61 | """ 62 | dist_mat = pdist(df, metric=dist_method) 63 | link_mat = hierarchy.linkage(dist_mat, method=cluster_method) 64 | 65 | return df.index[hierarchy.leaves_list(hierarchy.optimal_leaf_ordering(link_mat, dist_mat))] 66 | 67 | 68 | def visualize_evaluations( 69 | evaluations_df: DataFrame, 70 | savefig: bool = False, 71 | output_prefix: str = "evaluations", 72 | **heatmap_kws 73 | ) -> List[matplotlib.axes.Axes]: 74 | """Makes a z-scored visualization of all evaluations. 75 | 76 | Args: 77 | evaluations_df (DataFrame): Evaluations dataframe from clustering.optimize_clustering. 78 | output_prefix (str): If saving a figure, file prefix to use. 79 | savefig (bool): Whether to save a pdf. 80 | **heatmap_kws: Additional keyword arguments to pass to seaborn.heatmap. 81 | 82 | Returns (List[matplotlib.axes.Axes]): 83 | List of all matplotlib axes. 
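Example (a minimal sketch; assumes clusterer is a fitted and evaluated AutoClusterer): visualize_evaluations(clusterer.evaluation_df, savefig=True, output_prefix='my_evals')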
84 | 85 | """ 86 | clusterers = sorted( 87 | list(set([i.split(param_delim, 1)[0] for i in evaluations_df.columns])) 88 | ) 89 | width_ratios = [ 90 | dict( 91 | Counter( 92 | [i.split(param_delim, 1)[0] for i in evaluations_df.columns] 93 | ) 94 | )[clus] 95 | for clus in clusterers 96 | ] 97 | 98 | evaluations_df = zscore(evaluations_df) 99 | width = 0.18 * (len(evaluations_df.columns) + 2 + (0.01 * (len(clusterers) - 1))) 100 | height = 0.22 * (len(evaluations_df)) 101 | 102 | fig, axs = plt.subplots( 103 | figsize=(width, height), 104 | nrows=1, 105 | ncols=(len(clusterers) + 1), 106 | gridspec_kw=dict( 107 | width_ratios=width_ratios + [2], 108 | wspace=0.01, 109 | left=0, 110 | right=1, 111 | top=1, 112 | bottom=0, 113 | ), 114 | ) 115 | vmin = np.nanquantile(evaluations_df, 0.1) 116 | vmax = np.nanquantile(evaluations_df, 0.9) 117 | 118 | heatmap_kws['cmap'] = heatmap_kws.get('cmap', cmap) 119 | heatmap_kws['vmin'] = heatmap_kws.get('vmin', vmin) 120 | heatmap_kws['vmax'] = heatmap_kws.get('vmax', vmax) 121 | 122 | for i, clus in enumerate(clusterers): 123 | temp = convert_to_multiind(clus, evaluations_df) 124 | 125 | ax = axs[i] 126 | sns.heatmap( 127 | temp, 128 | ax=ax, 129 | yticklabels=temp.index, 130 | xticklabels=["-".join([str(i) for i in col]) for col in temp.columns], 131 | cbar_ax=axs[-1], 132 | cbar_kws=dict(label="z-score"), 133 | **heatmap_kws 134 | ) 135 | ax.set_ylabel("") 136 | ax.set_title(clus) 137 | ax.set_yticklabels([]) 138 | 139 | axs[0].set_ylabel("evaluation method") 140 | axs[0].set_yticklabels(temp.index, rotation=0) 141 | if savefig: 142 | plt.savefig("%s.pdf" % output_prefix) 143 | return axs 144 | 145 | 146 | def visualize_pairwise( 147 | df: DataFrame, 148 | savefig: bool = False, 149 | output_prefix: Optional[str] = None, 150 | method: Optional[str] = None, 151 | **heatmap_kws 152 | ) -> List[matplotlib.axes.Axes]: 153 | """Visualize symmetrical square DataFrames. 154 | 155 | Args: 156 | df (DataFrame): DataFrame to visualize. 157 | savefig (bool): Whether to save a pdf. 158 | output_prefix (str): If saving a pdf, file prefix to use. 159 | method (str): Label for cbar, if relevant. 160 | **heatmap_kws: Additional keywords to pass to `seaborn.heatmap`_ 161 | 162 | Returns (List[matplotlib.axes.Axes]): 163 | List of matplotlib axes for figure. 164 | 165 | .. 
_seaborn.heatmap: 166 | https://seaborn.pydata.org/generated/seaborn.heatmap.html 167 | """ 168 | heatmap_kws = {**heatmap_kws} 169 | 170 | vmin = np.nanquantile(df, 0.1) 171 | vmax = np.nanquantile(df, 0.9) 172 | 173 | heatmap_kws['cmap'] = heatmap_kws.get('cmap', cmap) 174 | heatmap_kws['vmin'] = heatmap_kws.get('vmin', vmin) 175 | heatmap_kws['vmax'] = heatmap_kws.get('vmax', vmax) 176 | cbar_kws = heatmap_kws.get('cbar_kws', {}) 177 | cbar_kws['label'] = cbar_kws.get('label', method) 178 | heatmap_kws['cbar_kws'] = cbar_kws 179 | 180 | cbar_ratio = 2 181 | wspace = 0.01 182 | height = 0.18 * len(df) 183 | width = 0.18 * (len(df.columns)+cbar_ratio+wspace) 184 | fig, axs = plt.subplots( 185 | figsize=(width, height), 186 | nrows=1, 187 | ncols=2, 188 | gridspec_kw=dict( 189 | width_ratios=[len(df.columns), cbar_ratio], 190 | wspace=wspace, 191 | left=0, 192 | right=1, 193 | top=1, 194 | bottom=0, 195 | ) 196 | ) 197 | try: 198 | order = compute_order(df.fillna(df.median())) 199 | except ValueError: 200 | order = df.index 201 | df = df.loc[order, order] 202 | sns.heatmap( 203 | df, 204 | xticklabels=order, 205 | yticklabels=order, 206 | ax=axs[0], 207 | cbar_ax=axs[1], 208 | **heatmap_kws 209 | ) 210 | if savefig: 211 | if output_prefix is None: 212 | output_prefix = "heatmap.pairwise" 213 | plt.savefig('%s.pdf' % output_prefix) 214 | 215 | return axs 216 | 217 | 218 | def visualize_label_agreement( 219 | labels: DataFrame, 220 | method: Optional[str] = None, 221 | savefig: bool = False, 222 | output_prefix: Optional[str] = None, 223 | **heatmap_kws 224 | ) -> List[matplotlib.axes.Axes]: 225 | """Visualize similarity between clustering results given an evaluation metric. 226 | 227 | Args: 228 | labels (DataFrame): Labels DataFrame, e.g. from optimize_clustering or \ 229 | AutoClusterer.labels_ 230 | method (str): Method with which to compare labels. Must be a metric like the ones in \ 231 | constants.need_ground_truth, which takes two sets of labels. 232 | savefig (bool): Whether to save a pdf. 233 | output_prefix (str): If saving a pdf, file prefix to use. 234 | **heatmap_kws: Additional keywords to pass to `seaborn.heatmap`_ 235 | 236 | Returns (List[matplotlib.axes.Axes]): 237 | List of matplotlib axes 238 | 239 | .. _seaborn.heatmap: 240 | https://seaborn.pydata.org/generated/seaborn.heatmap.html 241 | """ 242 | if savefig and output_prefix is None: 243 | output_prefix = 'heatmap.labels.pairwise' 244 | if method is None: 245 | method = 'adjusted_rand_score' 246 | 247 | labels = labels.astype(float).corr( 248 | lambda x, y: evaluate_one(x, method=method, gold_standard=y) 249 | ) 250 | return visualize_pairwise(labels, savefig, output_prefix, method=method, **heatmap_kws) 251 | 252 | 253 | def visualize_sample_label_consistency( 254 | labels: DataFrame, 255 | savefig: bool = False, 256 | output_prefix: Optional[str] = None, 257 | **heatmap_kws 258 | ) -> List[matplotlib.axes.Axes]: 259 | """Visualize how often two samples are labeled in the same group across conditions. Interpret 260 | with care: if some clusterers contribute more conditions than others (e.g. many n_clusters 261 | values for KMeans), their labelings will agree with each other more than with other clusterers'. 262 | High agreement can therefore reflect the mix of conditions tested rather than true 263 | similarity between samples. 264 | 265 | Args: 266 | labels (DataFrame): Labels DataFrame, e.g. from optimize_clustering or \ 267 | AutoClusterer.labels_ 268 | savefig (bool): Whether to save a pdf. 
269 | output_prefix (str): If saving a pdf, file prefix to use. 270 | **heatmap_kws: Additional keywords to pass to `seaborn.heatmap`_ 271 | 272 | Returns (List[matplotlib.axes.Axes]): 273 | List of matplotlib axes 274 | 275 | .. _seaborn.heatmap: 276 | https://seaborn.pydata.org/generated/seaborn.heatmap.html 277 | 278 | """ 279 | if savefig and output_prefix is None: 280 | output_prefix = "heatmap.sample.pairwise" 281 | #TODO change this to much faster matmult 282 | labels = labels.transpose().astype(float).corr(lambda x, y: sum( 283 | np.equal(x[((x != -1) | (y != -1))], y[((x != -1) | (y != -1))]) 284 | )) 285 | return visualize_pairwise(labels, savefig, output_prefix, method='# same label', **heatmap_kws) 286 | 287 | 288 | def visualize_for_picking_labels( 289 | evaluation_df: DataFrame, 290 | method: Optional[str] = None, 291 | savefig_prefix: Optional[str] = None 292 | ): 293 | """Generates graphs similar to a `scree graph`_ for PCA, for each parameter and each clusterer. 294 | 295 | Args: 296 | evaluation_df (DataFrame): DataFrame of evaluations to visualize. Clusterer.evaluation_df. 297 | method (str): Which metric to visualize. 298 | savefig_prefix (str): If not None, save a figure with the given prefix. 299 | 300 | Returns: 301 | matplotlib axes. 302 | .. _scree graph: 303 | https://en.wikipedia.org/wiki/Scree_plot 304 | """ 305 | if method is None: 306 | method = "silhouette_score" 307 | cluss_temp = list(set([i.split(param_delim, 1)[0] for i in evaluation_df.columns])) 308 | # get figure dimensions 309 | ncols = 0 310 | cluss = [] 311 | for ploti, clus in enumerate(cluss_temp): 312 | scores = convert_to_multiind( 313 | clus, evaluation_df.loc[[method], :] 314 | ).transpose().dropna(how='any') 315 | if len(scores) == 0: 316 | logging.error( 317 | 'Score %s is missing for clusterer %s, skipping visualization' % (method, clus) 318 | ) 319 | continue 320 | indep = scores.index.to_frame().reset_index(drop=True) 321 | try: 322 | indep.astype(float) 323 | except (ValueError, AssertionError): 324 | logging.error('Cannot convert %s data to floats, skipping visualization' % clus) 325 | continue 326 | cluss.append(clus) 327 | if scores.index.nlevels > ncols: 328 | ncols = scores.index.nlevels 329 | if not cluss: 330 | logging.error('No valid clusterers, cannot visualize. 
') 331 | return None 332 | cluss.sort() 333 | 334 | ybuff = np.abs(np.nanquantile(evaluation_df.loc[method], 0.05)) 335 | ylim = (evaluation_df.loc[method].min() - ybuff, evaluation_df.loc[method].max() + ybuff) 336 | colors = cycle(sns.color_palette('twilight', n_colors=len(cluss) * ncols)) 337 | fig = plt.figure(figsize=(5 * (ncols), 5 * len(cluss))) 338 | gs = plt.GridSpec(nrows=len(cluss), ncols=ncols, wspace=0.25, hspace=0.25) 339 | for ploti, clus in enumerate(cluss): 340 | scores = convert_to_multiind( 341 | clus, evaluation_df.loc[[method], :] 342 | ).transpose().dropna(how='any') 343 | indep = scores.index.to_frame().reset_index(drop=True) 344 | 345 | for whcol, col in enumerate(indep.columns): 346 | if whcol == 0: 347 | saveax = plt.subplot(gs[ploti, whcol]) 348 | ax = saveax 349 | ax.set_ylim(ylim) 350 | ax.set_ylabel(clus) 351 | else: 352 | ax = plt.subplot(gs[ploti, whcol], sharey=saveax) 353 | color = next(colors) 354 | 355 | # plot eval results; note that logistic=True assumes scores bounded in [0, 1] 356 | sns.regplot( 357 | indep[col], 358 | scores[method].values, 359 | color=color, 360 | ax=ax, 361 | logistic=True, 362 | ) 363 | 364 | axs = fig.get_axes() 365 | axs[0].set_title('%s results per parameter' % method, ha='left') 366 | if savefig_prefix: 367 | plt.savefig('%s.pdf' % savefig_prefix) 368 | return axs 369 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | files = ['snakemake/hypercluster.smk', 'snakemake/config.yml'] 6 | setuptools.setup( 7 | name="hypercluster", 8 | version="0.1.13", 9 | author="Lili Blumenberg, Ruggles Lab", 10 | author_email="lili.blumenberg@gmail.com", 11 | description="A package for automatic clustering hyperparameter optimization", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/liliblu/hypercluster", 15 | classifiers=[ 16 | "Programming Language :: Python :: 3.7", 17 | "License :: OSI Approved :: MIT License", 18 | "Operating System :: MacOS", 19 | "Operating System :: Unix", 20 | ], 21 | install_requires=[ 22 | "pandas >= 0.24.2", 23 | "numpy >= 1.16.4", 24 | "scipy >= 1.2.1", 25 | "matplotlib >= 3.1.0", 26 | "seaborn >= 0.9.0", 27 | "scikit-learn >= 0.22.0", 28 | "hdbscan >= 0.8.24", 29 | "snakemake >= 5.8.2", 30 | "python-igraph >=0.7.1", 31 | "leidenalg >=0.7.0", 32 | "louvain >=0.6.1" 33 | ], 34 | package_data={"hypercluster": files}, 35 | packages=setuptools.find_packages() 36 | ) 37 | -------------------------------------------------------------------------------- /snakemake/cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "__default__": { 3 | "job-name": "snakemake", 4 | "time": "4-23:59:59", 5 | "mem": "2G", 6 | "partition": "fn_medium", 7 | "cpus-per-task": 1, 8 | "output": "logs/slurm/%j.out", 9 | "error": "logs/slurm/%j.err" 10 | }, 11 | "run_clusterer": { 12 | "job-name": "snakemakerunclusterer", 13 | "time": "4-23:59:59", 14 | "mem": "32G", 15 | "partition": "fn_medium", 16 | "cpus-per-task": 4, 17 | "output": "logs/slurm/%j.out", 18 | "error": "logs/slurm/%j.err" 19 | }, 20 | "run_evaluation": { 21 | "job-name": "snakeevaluate", 22 | "time": "4-23:59:59", 23 | "mem": "4G", 24 | "partition": "fn_medium", 25 | "cpus-per-task": 1, 26 | "output": "logs/slurm/%j.out", 27 | "error": "logs/slurm/%j.err" 28 | } 29 | } 30 | 
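A note on the flattened column convention used throughout: `convert_to_multiind`, `pick_best_labels`, and the visualize functions above all expect column names of the form `<clusterer><param_delim><param><val_delim><value>`, where `param_delim` and `val_delim` are defined in `hypercluster/constants.py` (not shown in this section). The following is a minimal sketch of how those pieces fit together, using hand-made toy scores and labels; all values below are illustrative only, not taken from the repo.

```python
import pandas as pd

from hypercluster.constants import param_delim, val_delim
from hypercluster.utilities import convert_to_multiind, pick_best_labels

# Two toy KMeans conditions, flattened the same way the pipeline writes them:
# "<clusterer><param_delim>n_clusters<val_delim><value>"
cols = [
    param_delim.join(["KMeans", "n_clusters%s%s" % (val_delim, k)]) for k in (2, 3)
]

# Toy evaluations: one row per metric, one column per condition.
evals = pd.DataFrame([[0.41, 0.55]], index=["silhouette_score"], columns=cols)

# Toy labels: one row per sample, one column per condition.
labels = pd.DataFrame([[0, 0], [1, 1], [1, 2]], index=["a", "b", "c"], columns=cols)

# Keep the label column(s) with the maximal silhouette score.
best = pick_best_labels(evals, labels, method="silhouette_score", min_or_max="max")
print(best.columns.tolist())

# Re-expand the flattened column names into a parameter MultiIndex,
# the format the heatmap and scree-plot functions work with.
print(convert_to_multiind("KMeans", evals))
```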
-------------------------------------------------------------------------------- /snakemake/config.yml: -------------------------------------------------------------------------------- 1 | input_data_folder: '.' 2 | input_data_files: 'test_input' 3 | gold_standards: 4 | test_input: '' 5 | read_csv_kwargs: 6 | test_input: {'index_col':[0]} 7 | 8 | output_folder: 'results' 9 | intermediates_folder: 'clustering_intermediates' 10 | clustering_results: 'clustering' 11 | 12 | clusterer_kwargs: {} 13 | generate_parameters_addtl_kwargs: {} 14 | 15 | evaluations: 16 | # - adjusted_rand_score 17 | # - adjusted_mutual_info_score 18 | # - homogeneity_score 19 | # - completeness_score 20 | # - fowlkes_mallows_score 21 | # - mutual_info_score 22 | # - v_measure_score 23 | - silhouette_score 24 | - calinski_harabasz_score 25 | - davies_bouldin_score 26 | - number_clustered 27 | - smallest_largest_clusters_ratio 28 | - smallest_cluster_ratio 29 | eval_kwargs: {} 30 | screeplot_evals: 31 | # - adjusted_rand_score 32 | # - adjusted_mutual_info_score 33 | # - homogeneity_score 34 | # - completeness_score 35 | # - fowlkes_mallows_score 36 | # - mutual_info_score 37 | # - v_measure_score 38 | - silhouette_score 39 | - calinski_harabasz_score 40 | - davies_bouldin_score 41 | - number_clustered 42 | - smallest_largest_clusters_ratio 43 | - smallest_cluster_ratio 44 | 45 | metric_to_choose_best: silhouette_score 46 | metric_to_compare_labels: adjusted_rand_score 47 | compare_samples: true 48 | 49 | output_kwargs: 50 | evaluations: 51 | index_col: [0] 52 | labels: 53 | index_col: [0] 54 | heatmap_kwargs: {} 55 | 56 | optimization_parameters: 57 | AffinityPropagation: 58 | damping: 59 | - 0.55 60 | - 0.6 61 | - 0.65 62 | - 0.7 63 | - 0.75 64 | - 0.8 65 | - 0.85 66 | - 0.9 67 | - 0.95 68 | HDBSCAN: 69 | min_cluster_size: &id002 70 | - 2 71 | - 3 72 | - 4 73 | - 5 74 | - 6 75 | - 7 76 | - 8 77 | - 9 78 | - 10 79 | - 11 80 | - 12 81 | - 13 82 | - 14 83 | - 15 84 | - 16 85 | KMeans: 86 | n_clusters: &id001 87 | - 2 88 | - 3 89 | - 4 90 | - 5 91 | - 6 92 | - 7 93 | - 8 94 | - 9 95 | - 10 96 | - 11 97 | - 12 98 | - 13 99 | - 14 100 | - 15 101 | - 16 102 | - 17 103 | - 18 104 | - 19 105 | - 20 106 | - 21 107 | - 22 108 | - 23 109 | - 24 110 | - 25 111 | - 26 112 | - 27 113 | - 28 114 | - 29 115 | - 30 116 | - 31 117 | - 32 118 | - 33 119 | - 34 120 | - 35 121 | - 36 122 | - 37 123 | - 38 124 | - 39 125 | - 40 126 | MiniBatchKMeans: 127 | n_clusters: *id001 128 | OPTICS: 129 | min_samples: *id002 130 | NMFCluster: 131 | n_clusters: *id001 132 | LouvainCluster: &id003 133 | resolution: 134 | - 0.2 135 | - 0.4 136 | - 0.6 137 | - 0.8 138 | - 1.0 139 | - 1.2 140 | - 1.4 141 | - 1.6 142 | k: 143 | - 10 144 | - 15 145 | - 20 146 | - 40 147 | - 80 148 | - 120 149 | LeidenCluster: *id003 -------------------------------------------------------------------------------- /snakemake/hypercluster.smk: -------------------------------------------------------------------------------- 1 | import pandas as pd,numpy as np 2 | from hypercluster import utilities, visualize 3 | import hypercluster 4 | from hypercluster.constants import param_delim, val_delim 5 | import os, subprocess 6 | from shutil import copyfile 7 | import yaml 8 | 9 | subprocess.run(['mkdir', '-p', 'logs']) 10 | targets = ['labels', 'evaluations'] 11 | 12 | input_data_folder = config['input_data_folder'] 13 | input_files = config['input_data_files'] 14 | 15 | output_folder = config['output_folder'] 16 | subprocess.run(['mkdir', '-p', output_folder]) 17 | 18 | 
intermediates_folder = config['intermediates_folder'] 19 | clustering_results = config['clustering_results'] 20 | 21 | 22 | def generate_parameters(config): 23 | parameters = config['optimization_parameters'] 24 | all_params_to_test = [] 25 | for clusterer, params in parameters.items(): 26 | clus_kwargs = config['clusterer_kwargs'].get(clusterer, {}) 27 | kwargs = config['generate_parameters_addtl_kwargs'].get(clusterer, {}) 28 | df = hypercluster.AutoClusterer( 29 | clusterer_name=clusterer, 30 | params_to_optimize=params, 31 | clus_kwargs=clus_kwargs, 32 | **kwargs 33 | ).param_sets 34 | df['clusterer'] = clusterer 35 | all_params_to_test.extend(df.to_dict('records')) 36 | #TODO why is random search not working? getting key not found errors 37 | final_param_sets = {} 38 | for param_set in all_params_to_test: 39 | clusterer = param_set['clusterer'] 40 | lab = param_delim.join([clusterer]+[ 41 | '%s%s%s' % (k, val_delim, v) for k, v in param_set.items() if k != 'clusterer' 42 | ]) 43 | final_param_sets.update({lab:param_set}) 44 | config['param_sets'] = final_param_sets 45 | config['param_sets_labels'] = list(final_param_sets.keys()) 46 | 47 | with open('%s/params_to_test.yml' % output_folder, 'w') as fh: 48 | yaml.dump(final_param_sets, fh) 49 | 50 | 51 | def handle_ext(wildcards): 52 | base = wildcards.input_file 53 | files = [] 54 | for file_ext in [".csv", ".tsv", ".txt"]: 55 | file = '%s/%s%s' % (input_data_folder, base, file_ext) 56 | if os.path.exists(file): 57 | files.append(file) 58 | if len(files) == 1: 59 | return files[0] 60 | if len(files) > 1: 61 | raise ValueError( 62 | 'Multiple files with prefix %s/%s can be found, must be unique' % (input_data_folder, base) 63 | ) 64 | raise FileNotFoundError( 65 | 'No .txt, .csv or .tsv files with prefix %s/%s can be found' % (input_data_folder, base) 66 | ) 67 | 68 | 69 | def concat_dfs(df_list, kwargs): 70 | results = pd.DataFrame() 71 | for fil in df_list: 72 | temp = pd.read_csv(fil, **kwargs) 73 | results = pd.concat([results, temp], join='outer', axis=1) 74 | return results 75 | 76 | 77 | def get_target_files(config): 78 | target_files = expand( 79 | '%s/{input_file}/%s/{targets}.txt' % (output_folder, clustering_results), 80 | input_file=input_files, 81 | targets=targets 82 | ) + expand( 83 | 84 | "%s/{input_file}/%s/{labs}_{targets}.txt" % (output_folder, intermediates_folder), 85 | input_file=input_files, 86 | labs=config["param_sets_labels"], 87 | targets=targets 88 | ) + expand( 89 | '%s/{input_file}/%s/evaluations.pdf' % (output_folder, clustering_results), 90 | input_file=input_files 91 | ) 92 | 93 | if config['metric_to_choose_best']: 94 | target_files.extend( 95 | expand( 96 | "%s/{input_file}/%s/best_parameters.txt" % (output_folder, clustering_results), 97 | input_file=input_files 98 | ) 99 | ) 100 | if config['metric_to_compare_labels']: 101 | target_files.extend( 102 | expand( 103 | '%s/{input_file}/%s/%s_label_comparison.txt' % ( 104 | output_folder, clustering_results, config['metric_to_compare_labels'] 105 | ), 106 | input_file=input_files 107 | ) 108 | ) 109 | if config['compare_samples']: 110 | target_files.extend( 111 | expand( 112 | '%s/{input_file}/%s/sample_label_agreement.txt' % (output_folder, clustering_results), 113 | input_file=input_files 114 | ) 115 | ) 116 | if config['screeplot_evals']: 117 | target_files.extend( 118 | expand( 119 | '%s/{input_file}/%s/scree_plots.{eval}.pdf' % (output_folder, clustering_results), 120 | input_file=input_files, 121 | eval=config['screeplot_evals'] 122 | ) 123 | ) 124 | 
125 | return target_files 126 | 127 | 128 | generate_parameters(config) 129 | files_to_generate = get_target_files(config) 130 | 131 | rule all: 132 | input: 133 | files_to_generate 134 | 135 | 136 | rule run_clusterer: 137 | input: 138 | infile = handle_ext 139 | output: 140 | "%s/{input_file}/%s/{labs}_labels.txt" % (output_folder, intermediates_folder) 141 | params: 142 | kwargs = lambda wildcards: config["param_sets"][wildcards.labs], 143 | readkwargs = lambda wildcards: config['read_csv_kwargs'].get(wildcards.input_file, {}), 144 | cluskwargs = config['clusterer_kwargs'] 145 | run: 146 | df = pd.read_csv(input.infile, **params.readkwargs) 147 | kwargs = params.kwargs 148 | clusterer = kwargs.pop('clusterer') 149 | 150 | kwargs.update(params.cluskwargs.get(clusterer, {})) 151 | print(kwargs) 152 | cls = utilities.cluster(clusterer, df, kwargs) 153 | 154 | labs = pd.DataFrame(cls.labels_, index=df.index, columns=[wildcards.labs]) 155 | labs.to_csv(output[0], sep = params.readkwargs.get('sep', ',')) 156 | 157 | 158 | rule run_evaluation: 159 | input: 160 | "%s/{input_file}/%s/{labs}_labels.txt" % (output_folder, intermediates_folder) 161 | output: 162 | "%s/{input_file}/%s/{labs}_evaluations.txt" % (output_folder, intermediates_folder) 163 | params: 164 | gold_standards = lambda wildcards: config['gold_standards'].get(wildcards.input_file, ''), 165 | input_data = handle_ext, 166 | readkwargs = lambda wildcards: config['read_csv_kwargs'].get(wildcards.input_file, {}), 167 | evals = config["evaluations"], 168 | evalkwargs = config["eval_kwargs"] 169 | run: 170 | readkwargs = { 171 | 'index_col':params.readkwargs.get('index_col', 0), 172 | 'sep':params.readkwargs.get('sep', ',') 173 | } 174 | test_labels = pd.read_csv(input[0], **readkwargs) 175 | if params.gold_standards and os.path.exists('%s/%s' % (input_data_folder, params.gold_standards)): 176 | gold_standard = pd.read_csv( 177 | '%s/%s' % (input_data_folder, params.gold_standards), 178 | **readkwargs 179 | ) 180 | gold_standard = gold_standard[gold_standard.columns[0]] 181 | else: 182 | gold_standard = None 183 | 184 | data = pd.read_csv(params.input_data, **readkwargs) 185 | res = pd.DataFrame({'methods':params.evals}) 186 | 187 | res[wildcards.labs] = res.apply( 188 | lambda row: utilities.evaluate_one( 189 | test_labels[test_labels.columns[0]], 190 | method=row['methods'], 191 | data=data, 192 | gold_standard=gold_standard, 193 | metric_kwargs=params.evalkwargs.get(row['methods'], None) 194 | ), axis=1 195 | ) 196 | res = res.set_index('methods') 197 | res.to_csv(output[0], sep=readkwargs['sep']) 198 | 199 | 200 | rule collect_dfs: 201 | input: 202 | files = expand( 203 | '%s/{{input_file}}/%s/{params_label}_{{targets}}.txt' % ( 204 | output_folder, intermediates_folder 205 | ), params_label = config['param_sets_labels'] 206 | ) 207 | params: 208 | outputkwargs = lambda wildcards: config['output_kwargs'].get(wildcards.targets) 209 | output: 210 | '%s/{input_file}/%s/{targets}.txt' % (output_folder, clustering_results) 211 | run: 212 | kwargs = { 213 | 'index_col':params.outputkwargs.get('index_col', 0), 214 | 'sep':params.outputkwargs.get('sep', ',') 215 | } 216 | 217 | df = concat_dfs(input.files, kwargs) 218 | df.to_csv( 219 | output[0], sep = kwargs['sep'] # TODO see if this works for the rest 220 | ) 221 | 222 | 223 | rule visualize_evaluations: 224 | input: 225 | files = '%s/{input_file}/%s/evaluations.txt' % ( 226 | output_folder, clustering_results 227 | ) 228 | output: 229 | output_file = '%s/{input_file}/%s/evaluations.pdf' % ( 230 | output_folder, clustering_results 231 | 
) 232 | params: 233 | heatmap_kwargs = config['heatmap_kwargs'], 234 | readkwargs = lambda wildcards: config['read_csv_kwargs'].get(wildcards.input_file, {}) 235 | run: 236 | df = pd.read_csv(input.files, sep=params.readkwargs.get('sep', ','), index_col=0) 237 | 238 | visualize.visualize_evaluations( 239 | df, output_prefix=output.output_file.rsplit('.', 1)[0], savefig=True, 240 | **params.heatmap_kwargs 241 | ) 242 | 243 | 244 | rule pick_best_clusters: 245 | input: 246 | evals = '%s/{input_file}/%s/evaluations.txt' % (output_folder, clustering_results) 247 | output: 248 | "%s/{input_file}/%s/best_parameters.txt" % (output_folder, clustering_results), 249 | params: 250 | metric = config['metric_to_choose_best'], 251 | sep = lambda wcs: config['read_csv_kwargs'].get(wcs.input_file, {}).get('sep', ',') 252 | run: 253 | df = pd.read_csv(input.evals, sep=params.sep, index_col=0).transpose() 254 | labs = list(df[df[params.metric]==df[params.metric].max()].index) 255 | for lab in labs: 256 | copyfile( 257 | "%s/%s/%s/%s_labels.txt" % ( 258 | output_folder, 259 | wildcards.input_file, 260 | intermediates_folder, 261 | lab 262 | ), 263 | "%s/%s/%s/%s_labels.txt" % ( 264 | output_folder, 265 | wildcards.input_file, 266 | clustering_results, 267 | lab 268 | ) 269 | ) 270 | with open(output[0], 'a') as fh: 271 | fh.write('%s\n' % lab) 272 | 273 | visualize.visualize_for_picking_labels( 274 | df.transpose(), 275 | method=params.metric, 276 | savefig_prefix='%s/scree_plots.%s' % ( 277 | output[0].rsplit('/', 1)[0], params.metric 278 | ) 279 | ) 280 | 281 | rule compare_labels: 282 | input: 283 | labels = '%s/{input_file}/%s/labels.txt' % (output_folder, clustering_results) 284 | output: 285 | table = '%s/{input_file}/%s/%s_label_comparison.txt' % ( 286 | output_folder, clustering_results, config['metric_to_compare_labels'] 287 | ) 288 | params: 289 | metric = config['metric_to_compare_labels'], 290 | readkwargs = lambda wildcards: config['read_csv_kwargs'].get(wildcards.input_file, {}) 291 | run: 292 | kwargs = { 293 | 'index_col':params.readkwargs.get('index_col', 0), 294 | 'sep':params.readkwargs.get('sep', ',') 295 | } 296 | df = pd.read_csv(input.labels, **kwargs) 297 | df = df.corr(lambda x, y: utilities.evaluate_one( 298 | x, method=params.metric, gold_standard=y 299 | )) 300 | df.to_csv(output.table, sep=kwargs['sep']) 301 | 302 | visualize.visualize_pairwise( 303 | df, 304 | savefig=True, 305 | output_prefix=output.table.rsplit('.', 1)[0], 306 | method = params.metric, 307 | **config['heatmap_kwargs'] 308 | ) 309 | 310 | 311 | rule compare_samples: 312 | input: 313 | labels = '%s/{input_file}/%s/labels.txt' % (output_folder, clustering_results) 314 | output: 315 | table = '%s/{input_file}/%s/sample_label_agreement.txt' % (output_folder, 316 | clustering_results) 317 | params: 318 | readkwargs = lambda wildcards: config['read_csv_kwargs'].get(wildcards.input_file, {}) 319 | run: 320 | kwargs = { 321 | 'index_col':params.readkwargs.get('index_col', 0), 322 | 'sep':params.readkwargs.get('sep', ',') 323 | } 324 | df = pd.read_csv(input.labels, **kwargs).transpose() 325 | df = df.corr( 326 | lambda x, y: sum(np.equal(x[((x != -1) | (y != -1))], y[((x != -1) | (y != -1))])) 327 | ) 328 | 329 | df.to_csv(output.table, sep = kwargs['sep']) 330 | 331 | visualize.visualize_pairwise( 332 | df, 333 | savefig=True, 334 | output_prefix=output.table.rsplit('.', 1)[0], 335 | method = '# same label', 336 | **config['heatmap_kwargs'] 337 | ) 338 | 339 | 340 | rule draw_scree_plots: 341 | input: 342 | eval_df = 
'%s/{input_file}/%s/evaluations.txt' % (output_folder, clustering_results) 343 | output: 344 | pdfs = expand( 345 | '%s/{{input_file}}/%s/scree_plots.{eval}.pdf' % (output_folder, clustering_results), 346 | eval=config['screeplot_evals'] 347 | ) 348 | params: 349 | sep = lambda wcs: config['read_csv_kwargs'].get(wcs.input_file, {}).get('sep', ',') 350 | run: 351 | df = pd.read_csv(input.eval_df, sep=params.sep, index_col=0) 352 | for metric in config['screeplot_evals']: 353 | visualize.visualize_for_picking_labels( 354 | df, 355 | method=metric, 356 | savefig_prefix='%s/%s/%s/scree_plots.%s' % ( 357 | output_folder, wildcards.input_file, clustering_results, metric 358 | ) 359 | ) 360 | 361 | 362 | -------------------------------------------------------------------------------- /snakemake/snakemake_submit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH --partition cpu_long 3 | #SBATCH --mem 4G 4 | #SBATCH --time 27-23:59:59 5 | #SBATCH --job-name snakeautocluster 6 | #SBATCH --cpus-per-task=2 7 | #SBATCH -e logs/sbatchSnakefile_progress_err.log 8 | #SBATCH -o logs/sbatchSnakefile_progress_out.log 9 | 10 | 11 | module purge 12 | module add slurm 13 | source activate hypercluster 14 | cd /gpfs/home/lmb529/ruggleslabHome/hypercluster 15 | mkdir -p logs/slurm/ 16 | 17 | snakemake -j 999 -p --verbose \ 18 | -s hypercluster.smk \ 19 | --keep-going \ 20 | --cluster-config cluster.json \ 21 | --cluster "sbatch --mem={cluster.mem} -t {cluster.time} -o {cluster.output} -p {cluster.partition}" 22 | -------------------------------------------------------------------------------- /snakemake/test_input.txt: -------------------------------------------------------------------------------- 1 | ind2,0,1 2 | a,1.0,2.0 3 | b,-1.8,4.0 4 | c,1.0,-0.5 5 | d,10.0,2.0 6 | e,-10.0,4.0 7 | f,10.0,0.0 8 | g,0.1,5.0 9 | h,3.2,2.0 10 | i,0.0,14.0 11 | j,-16.4,3.67 12 | k,13.22,-3.0 13 | 11,3.3,2.0 14 | 12,42.0,2.0 15 | 13,-8.0,2.0 16 | 14,1.2,12.0 17 | 15,0.1,2.1 18 | 16,0.25,2.0 19 | 17,0.1,1.11 20 | 18,-44.0,0.0 21 | 19,-0.22,-0.11 22 | 20,2.34,6.7 23 | 21,-10.0,2.0 24 | 22,-2.3,-2.5 25 | 23,0.1,0.0 26 | 24,0.1,22.0 27 | 25,8.6,-7.5 28 | 26,0.0,14.0 29 | 27,-6.4,23.67 30 | 28,-3.22,3.0 31 | 29,0.1,2.0 32 | 30,-20.0,2.01 33 | 31,0.25,-0.25 34 | 32,0.455,0.233 35 | 33,0.1,-0.89 36 | 34,19.0,2.0 37 | 35,0.1,2.0 38 | 36,-29.0,3.6 39 | 37,-13.0,-3.0 40 | 38,3.3,2.0 41 | 39,-4.0,2.0 42 | 40,-0.2,-0.1 43 | 41,0.34,0.7 44 | --------------------------------------------------------------------------------
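For a quick sanity check without submitting the snakemake workflow, the same calls that rules `run_clusterer` and `run_evaluation` make can be run directly on the toy input above. A minimal sketch follows; the `KMeans`/`n_clusters=3` choice and the Series name are arbitrary, and the path assumes the repo root as the working directory.

```python
import pandas as pd

from hypercluster import utilities

# Toy input shipped with the workflow; the first column ('ind2') is the index.
df = pd.read_csv("snakemake/test_input.txt", index_col=0)

# What rule run_clusterer does for a single parameter set.
cls = utilities.cluster("KMeans", df, {"n_clusters": 3})
labels = pd.Series(cls.labels_, index=df.index, name="KMeans-n_clusters-3")

# What rule run_evaluation does for a single metric.
print(utilities.evaluate_one(labels, method="silhouette_score", data=df))
```

The full workflow does exactly this once per parameter set and per metric, then concatenates the per-condition outputs with `collect_dfs` and visualizes the merged tables.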