├── .gitignore ├── README.rst ├── dynamicTreeCut ├── R_func.py ├── __init__.py ├── __main__.py ├── df_apply.py ├── dynamicTreeCut.py └── tests │ └── test_dynamicTreeCut.py ├── environment.yml ├── requirements.txt ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | .cache 2 | .eggs 3 | .idea 4 | *~ 5 | *.pyc 6 | build/* 7 | dynamicTreeCut.egg-info/* 8 | dist/* 9 | writing/* 10 | 11 | # Ignore CSVs unless explicitly added 12 | .csv 13 | 14 | # Jupyter/IPython Checkpoints 15 | .ipynb_checkpoints 16 | 17 | # Emacs temporary files 18 | *~ 19 | 20 | # Mac OS files 21 | .DS_Store 22 | 23 | # Byte-compiled / optimized / DLL files 24 | __pycache__/ 25 | *.py[cod] 26 | *$py.class 27 | 28 | # C extensions 29 | *.so 30 | 31 | # Distribution / packaging 32 | .Python 33 | build/ 34 | develop-eggs/ 35 | dist/ 36 | downloads/ 37 | eggs/ 38 | .eggs/ 39 | lib/ 40 | lib64/ 41 | parts/ 42 | sdist/ 43 | var/ 44 | wheels/ 45 | *.egg-info/ 46 | .installed.cfg 47 | *.egg 48 | 49 | # PyInstaller 50 | # Usually these files are written by a python script from a template 51 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 52 | *.manifest 53 | *.spec 54 | 55 | # Installer logs 56 | pip-log.txt 57 | pip-delete-this-directory.txt 58 | 59 | # Unit test / coverage reports 60 | htmlcov/ 61 | .tox/ 62 | .coverage 63 | .coverage.* 64 | .cache 65 | nosetests.xml 66 | coverage.xml 67 | *.cover 68 | .hypothesis/ 69 | 70 | # Translations 71 | *.mo 72 | *.pot 73 | 74 | # Django stuff: 75 | *.log 76 | local_settings.py 77 | 78 | # Flask stuff: 79 | instance/ 80 | .webassets-cache 81 | 82 | # Scrapy stuff: 83 | .scrapy 84 | 85 | # Sphinx documentation 86 | docs/_build/ 87 | 88 | # PyBuilder 89 | target/ 90 | 91 | # Jupyter Notebook 92 | .ipynb_checkpoints 93 | 94 | # pyenv 95 | .python-version 96 | 97 | # celery beat schedule file 98 | celerybeat-schedule 99 | 100 | # SageMath parsed files 101 | *.sage.py 102 | 103 | # Environments 104 | .env 105 | .venv 106 | env/ 107 | venv/ 108 | ENV/ 109 | 110 | # Spyder project settings 111 | .spyderproject 112 | .spyproject 113 | 114 | # Rope project settings 115 | .ropeproject 116 | 117 | # mkdocs documentation 118 | /site 119 | 120 | # mypy 121 | .mypy_cache/ -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | |Stars| |PyPIDownloads| |PyPI| |Build Status| |Coffee| 2 | 3 | .. |Stars| image:: https://img.shields.io/github/stars/kylessmith/dynamicTreeCut?logo=GitHub&color=yellow 4 | :target: https://github.com/kylessmith/dynamicTreeCut/stargazers 5 | .. |PyPIDownloads| image:: https://pepy.tech/badge/dynamicTreeCut 6 | :target: https://pepy.tech/project/dynamicTreeCut 7 | .. |PyPI| image:: https://img.shields.io/pypi/v/dynamicTreeCut.svg 8 | :target: https://pypi.org/project/dynamicTreeCut 9 | .. |Build Status| image:: https://travis-ci.org/kylessmith/dynamicTreeCut.svg?branch=master 10 | :target: https://travis-ci.org/kylessmith/dynamicTreeCut 11 | .. |Coffee| image:: https://img.shields.io/badge/-buy_me_a%C2%A0coffee-gray?logo=buy-me-a-coffee&color=ff69b4 12 | :target: https://www.buymeacoffee.com/kylessmith 13 | 14 | 15 | Python translation of the hybrid dynamicTreeCut method created by Peter Langfelder and Bin Zhang. 
16 |
17 | dynamicTreeCut was originally published in *Bioinformatics*:
18 | Langfelder P, Zhang B, Horvath S (2008) Defining clusters from a hierarchical cluster tree:
19 | the Dynamic Tree Cut package for R. Bioinformatics 2008 24(5):719-720
20 |
21 | The dynamicTreeCut R code is distributed under the GPL-3 License and the
22 | original sources should be cited.
23 |
24 |
25 | dynamicTreeCut contains methods for the detection of clusters in hierarchical clustering dendrograms.
26 | *NOTE: though the clusters match the R output, the cluster labels are shuffled*
27 |
28 | Installing
29 | ==========
30 |
31 | To install, it's best to create an environment after downloading and installing the
32 | `Anaconda Python Distribution <https://continuum.io/downloads>`__, then running::
33 |
34 |     conda env create --file environment.yml
35 |
36 | To install from PyPI, presuming you have its requirements (numpy and scipy) installed::
37 |
38 |     pip install dynamicTreeCut
39 |
40 |
41 | Usage
42 | =====
43 | ::
44 |
45 |     >>> from dynamicTreeCut import cutreeHybrid
46 |     >>> from scipy.spatial.distance import pdist
47 |     >>> import numpy as np
48 |     >>> from scipy.cluster.hierarchy import linkage
49 |     >>> d = np.transpose(np.arange(1,10001).reshape(100,100))
50 |     >>> distances = pdist(d, "euclidean")
51 |     >>> link = linkage(distances, "average")
52 |     >>> clusters = cutreeHybrid(link, distances)
53 |     ..cutHeight not given, setting it to 495.1 ===> 99% of the (truncated) height range in dendro.
54 |     ..done.
55 |     >>> clusters["labels"]
56 |     [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3
57 |      3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1
58 |      1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
59 |
60 |
61 | Compared to R::
62 |
63 |     > library(dynamicTreeCut)
64 |     > d = matrix(1:10000, 100)
65 |     > distances <- dist(d, method="euclidean")
66 |     > dendro <- hclust(distances, method="average")
67 |     > clusters <- cutreeDynamic(dendro, distM=as.matrix(distances))
68 |     ..cutHeight not given, setting it to 495 ===> 99% of the (truncated) height range in dendro.
69 |     ..done.
70 |     > clusters
71 |     [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3
72 |     [38] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1
73 |     [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
74 |
75 | Dependencies
76 | ============
77 |
78 | If you don't already have numpy and scipy installed, it is best to download
79 | `Anaconda`, a Python distribution that includes them:
80 | 81 | https://continuum.io/downloads 82 | 83 | Dependencies can be installed by:: 84 | 85 | pip install -r requirements.txt 86 | 87 | 88 | License 89 | ======= 90 | 91 | dynamicTreeCut is available under the GPL-3 License 92 | -------------------------------------------------------------------------------- /dynamicTreeCut/R_func.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from scipy.cluster.hierarchy import to_tree 4 | 5 | 6 | def sign(value): 7 | #python version of R's sign 8 | 9 | if value > 0: 10 | return(1) 11 | elif value < 0: 12 | return(-1) 13 | else: 14 | return(0) 15 | 16 | def paste(string, n, sep=""): 17 | #python version of R's paste 18 | 19 | results = [] 20 | for i in range(n): 21 | results.append(string + sep + str(i)) 22 | 23 | return(results) 24 | 25 | 26 | def get_heights(Z): 27 | #python verison of R's dendro$height 28 | #height = np.zeros(len(dendro["dcoord"])) 29 | 30 | #for i, d in enumerate(dendro["dcoord"]): 31 | #height[i] = d[1] 32 | 33 | clusternode = to_tree(Z, True) 34 | #height = np.array([c.dist for c in clusternode[1]]) 35 | height = np.array([c.dist for c in clusternode[1] if c.is_leaf() != True]) 36 | 37 | #height.sort() 38 | 39 | return(height) 40 | 41 | 42 | def get_merges(z): 43 | #python version of R's dendro$merge 44 | n = z.shape[0] 45 | merges = np.zeros((z.shape[0], 2), dtype=int) 46 | 47 | for i in range(z.shape[0]): 48 | for j in range(2): 49 | if z[i][j] <= n: 50 | merges[i][j] = -(z[i][j] + 1) 51 | else: 52 | cluster = z[i][j] - n 53 | merges[i][j] = cluster 54 | 55 | return(merges) 56 | 57 | 58 | def factor(vector): 59 | return(vector) 60 | 61 | 62 | def nlevels(vector): 63 | #python version of R's nlevels 64 | return(len(np.unique(vector))) 65 | 66 | 67 | def levels(vector): 68 | #python version of R's levels 69 | return(np.unique(vector)) 70 | 71 | 72 | def tapply(vector, index, function): #can add **args, **kwargs 73 | #python version of R's tapply 74 | 75 | factors = np.unique(index) 76 | 77 | #results = pd.Series(np.repeat(np.nan, len(factors))) 78 | results = np.repeat(np.nan, len(factors)) 79 | #results.index = factors 80 | 81 | for i, k in enumerate(factors): 82 | subset = vector[index == k] 83 | #results.iloc[i] = function(subset) 84 | results[i] = function(subset) 85 | 86 | return(results) 87 | 88 | 89 | def tapply_df(df, index, function, axis=0): #can add **args, **kwargs 90 | #python version of R's tapply 91 | 92 | factors = np.unique(index) 93 | 94 | if axis == 1: 95 | #results = pd.DataFrame(np.zeros((len(factors), df.shape[1]))) 96 | results = np.zeros((len(factors), df.shape[1])) 97 | else: 98 | #results = pd.DataFrame(np.zeros((df.shape[0], len(factors)))) 99 | results = np.zeros((df.shape[0], len(factors))) 100 | 101 | #results.index = factors 102 | 103 | if axis == 1: 104 | for j in range(df.shape[1]): 105 | for i, k in enumerate(factors): 106 | subset = df[index == k, j] 107 | #results.iloc[i, j] = function(subset) 108 | results[i, j] = function(subset) 109 | else: 110 | for i in range(df.shape[0]): 111 | for j, k in enumerate(factors): 112 | subset = df[i, index == k] 113 | #results.iloc[i, j] = function(subset) 114 | results[i, j] = function(subset) 115 | 116 | return(results) 117 | 118 | 119 | def table(vector): 120 | 121 | factors = np.unique(vector) 122 | results = pd.Series(np.zeros(len(factors), dtype=int)) 123 | results.index = factors 124 | 125 | for i, k in enumerate(factors): 126 | results.iloc[i] = np.sum(vector == k) 
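    # `results` is a Series of counts indexed by the unique values of
    # `vector`, mirroring the output of R's table()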
127 | 128 | return(results) 129 | 130 | -------------------------------------------------------------------------------- /dynamicTreeCut/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Python translation of the hybrid dynamicTreeCut 3 | Reference: 4 | Langfelder P, Zhang B, Horvath S (2007) Defining clusters from a hierarchical cluster tree: 5 | the Dynamic Tree Cut package for R. Bioinformatics 2008 24(5):719-720 6 | ''' 7 | 8 | from .dynamicTreeCut import cutreeHybrid 9 | 10 | __author__ = 'kylessmith' -------------------------------------------------------------------------------- /dynamicTreeCut/__main__.py: -------------------------------------------------------------------------------- 1 | from . import cutreeHybrid 2 | 3 | if __name__ == "__main__": 4 | cutreeHybrid() -------------------------------------------------------------------------------- /dynamicTreeCut/df_apply.py: -------------------------------------------------------------------------------- 1 | #import pandas as pd 2 | import numpy as np 3 | from functools import partial 4 | from multiprocessing import Pool 5 | 6 | 7 | #iterate through rows of DataFrame 8 | def gen_row(ndarray): 9 | 10 | for i in range(ndarray.shape[0]): 11 | yield ndarray[i,:] 12 | 13 | 14 | #iterate through columns of DataFrame 15 | def gen_col(ndarray): 16 | 17 | for i in range(ndarray.shape[1]): 18 | yield ndarray[:,i] 19 | 20 | #apply a function to each row or columns of a DataFrame 21 | def apply(func, df, axis=0, ncores=None, p=None, **kwargs): 22 | 23 | #check axis input is 0 or 1 24 | if axis not in (0,1): 25 | raise IndexError("axis must equal 0 or 1") 26 | 27 | #check if p is provided or needs to be created 28 | if p == None and ncores != None: 29 | p = Pool(ncores) 30 | 31 | #create function and pass kwargs 32 | g = partial(func, **kwargs) 33 | 34 | #if axis is 0 apply function to rows 35 | if axis == 0: 36 | #conduct multiprocessed version or not 37 | if p != None: 38 | iter_results = p.map(g, gen_row(df)) 39 | else: 40 | iter_results = map(g, gen_row(df)) 41 | #if axis is 1 apply function to columns 42 | elif axis == 1: 43 | #conduct multiprocessed version or not 44 | if p != None: 45 | iter_results = p.map(g, gen_col(df)) 46 | else: 47 | iter_results = map(g, gen_col(df)) 48 | 49 | #close Pool if it wasn't provided 50 | if ncores != None: 51 | p.close() 52 | p.join() 53 | 54 | #create DataFrame for output 55 | results = np.array(list(iter_results)) 56 | #if applied to comluns output transposed 57 | #results(to retain shape of df input) 58 | if axis == 1: 59 | results = np.transpose(results) 60 | 61 | return(results) 62 | -------------------------------------------------------------------------------- /dynamicTreeCut/dynamicTreeCut.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from scipy.stats import rankdata 4 | from scipy.special import binom #faster than comb 5 | import dynamicTreeCut.df_apply 6 | from functools import partial 7 | from dynamicTreeCut.R_func import * 8 | 9 | 10 | chunkSize = 100 11 | spaces = " " 12 | 13 | 14 | def dist_index(i, j, matrix, l, n): 15 | """ 16 | Function to index flat matrix as squareform matrix 17 | 18 | Parameters 19 | ---------- 20 | i : int 21 | First index 22 | j : int 23 | Second index 24 | matrix : numpy.ndarray 25 | Squareform matrix 26 | l : int 27 | n : int 28 | 29 | Returns 30 | ------- 31 | matrix_slice : float 32 | Indexed value 33 | 34 | """ 35 | 36 | # 
Check if indices are the same 37 | if i == j: 38 | return 0.0 39 | 40 | # Calculate index 41 | index = int(l - binom(n-min(i, j), 2) + (max(i, j) - min(i, j) - 1)) 42 | 43 | # Slice matrix 44 | matrix_slice = matrix[index] 45 | 46 | return matrix_slice 47 | 48 | 49 | def dist_multi_index(_array, matrix): 50 | """ 51 | Function to index flat matrix as squareform matrix 52 | """ 53 | 54 | #handle 2D array 55 | if len(matrix.shape) == 2: 56 | return(matrix[_array, :][:, _array]) 57 | 58 | l = len(matrix) 59 | n = 0.5*(np.sqrt((8*l)+1)+1) 60 | 61 | results = np.zeros((len(_array), len(_array))) 62 | for i in range(len(_array)): 63 | for j in range(i, len(_array)): 64 | score = dist_index(_array[i], _array[j], matrix, l, n) 65 | results[i,j] = score 66 | results[j,i] = score 67 | 68 | return results 69 | 70 | 71 | def get_rows(_array, matrix): 72 | """ 73 | Function to index rows of flat matrix as squareform matrix 74 | """ 75 | 76 | #handle 2D array 77 | if len(matrix.shape) == 2: 78 | return(matrix[_array,:]) 79 | 80 | l = len(matrix) 81 | n = int(0.5*(np.sqrt((8*l)+1)+1)) 82 | 83 | if _array.dtype != "bool": 84 | results = np.zeros((len(_array), n)) 85 | 86 | for row, i in enumerate(_array): 87 | for j in range(n): 88 | if i == j: 89 | results[row, j] = 0.0 90 | else: 91 | index = int(l - binom(n - min(i, j), 2) + (max(i, j) - min(i, j) - 1)) 92 | results[row,j] = matrix[index] 93 | 94 | return results 95 | 96 | else: 97 | results = np.zeros((np.sum(_array), n)) 98 | row = 0 99 | for i, b in enumerate(_array): 100 | if b == True: 101 | for j in range(n): 102 | if i == j: 103 | results[row, j] = 0.0 104 | else: 105 | index = int(l - binom(n - min(i, j), 2) + (max(i, j) - min(i, j) - 1)) 106 | results[row, j] = matrix[index] 107 | 108 | row += 1 109 | 110 | return results 111 | 112 | 113 | def CoreSize(BranchSize, minClusterSize): 114 | """ 115 | The following are supporting function for GetClusters. 116 | """ 117 | 118 | BaseCoreSize = minClusterSize / 2 + 1 119 | if BaseCoreSize < BranchSize: 120 | CoreSize = BaseCoreSize + np.sqrt(BranchSize - BaseCoreSize) 121 | else: 122 | CoreSize = BranchSize 123 | 124 | return int(CoreSize) 125 | 126 | 127 | # This assumes the diagonal of the distance matrix 128 | # is zero, BranchDist is a square matrix whose dimension is at least 2. 
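# CoreScatter estimates how tight a branch is: it picks an "effective core"
# of roughly minClusterSize/2 + 1 points (plus the same sqrt-growth term used
# in CoreSize above) having the smallest average distance to the rest of the
# branch, and returns the mean of the core points' average in-core distances.
# Note that cutreeHybrid below computes this quantity inline via CoreSize and
# dist_multi_index rather than calling this helper.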
129 | def CoreScatter(BranchDist, minClusterSize): 130 | 131 | nPoints = BranchDist.shape[0] 132 | PointAverageDistances = np.sum(BranchDist, axis=1) / (nPoints - 1) 133 | CoreSize = minClusterSize / 2 + 1 134 | 135 | if CoreSize < nPoints: 136 | EffCoreSize = CoreSize + np.sqrt(nPoints - CoreSize) 137 | order = np.argsort(PointAverageDistances) 138 | Core = order[np.arange(EffCoreSize)] 139 | else: 140 | Core = np.arange(nPoints) 141 | EffCoreSize = nPoints 142 | 143 | CoreAverageDistances = np.sum(BranchDist[Core, Core], axis=1) / (EffCoreSize - 1) 144 | 145 | return(np.mean(CoreAverageDistances)) 146 | 147 | 148 | def interpolate(data, index): 149 | 150 | i = np.round(index) 151 | n = len(data) 152 | if i < 0: return(data[0]) 153 | if i >= n: return(data[-1]) 154 | 155 | r = index - i 156 | 157 | return(data[i-1] * (1 - r) + data[i] * r) 158 | 159 | 160 | def cutreeHybrid(link, distM, 161 | cutHeight = None, minClusterSize = 20, deepSplit = 1, 162 | maxCoreScatter = None, minGap = None, 163 | maxAbsCoreScatter = None, minAbsGap = None, 164 | minSplitHeight = None, minAbsSplitHeight = None, 165 | externalBranchSplitFnc = None, minExternalSplit = None, 166 | externalSplitOptions = [], 167 | externalSplitFncNeedsDistance = None, 168 | assumeSimpleExternalSpecification = True, 169 | pamStage = True, pamRespectsDendro = True, 170 | useMedoids = False, 171 | maxPamDist = None, 172 | respectSmallClusters = True, 173 | verbose = 2, indent = 0): 174 | 175 | 176 | dendro_height = get_heights(link) 177 | dendro_merge = get_merges(link) 178 | 179 | if maxPamDist == None: 180 | maxPamDist = cutHeight 181 | 182 | nMerge = len(dendro_height) 183 | refQuantile = 0.05 184 | refMerge = np.round(nMerge * refQuantile) 185 | 186 | if refMerge < 1: refMerge = 1 187 | 188 | refHeight = dendro_height[int(refMerge) - 1] 189 | 190 | if cutHeight == None: 191 | cutHeight = 0.99 * (np.max(dendro_height) - refHeight) + refHeight 192 | print("..cutHeight not given, setting it to", cutHeight, 193 | " ===> 99% of the (truncated) height range in dendro.") 194 | else: 195 | if cutHeight > np.max(dendro_height): cutHeight = np.max(dendro_height) 196 | 197 | if maxPamDist == None: maxPamDist = cutHeight 198 | 199 | nMergeBelowCut = np.sum(dendro_height <= cutHeight) 200 | 201 | if nMergeBelowCut < minClusterSize: 202 | print("cutHeight set too low; no merges below the cut.") 203 | return(np.zeros(nMerge+1)) 204 | 205 | # fill in this section once understood better 206 | if externalBranchSplitFnc != None: 207 | raise NotImplementedError("externalBranchSplitFnc is not supported yet") 208 | nExternalSplits = len(externalBranchSplitFnc) 209 | if len(minExternalSplit) < 1: 210 | raise AttributeError("minExternalBranchSplit must be given.") 211 | if assumeSimpleExternalSpecification and nExternalSplits == 1: 212 | pass 213 | else: 214 | nExternalSplits = 0 215 | 216 | 217 | MxBranches = nMergeBelowCut 218 | branch_isBasic = np.repeat(True, MxBranches) 219 | branch_isTopBasic = np.repeat(True, MxBranches) 220 | branch_failSize = np.repeat(False, MxBranches) 221 | branch_rootHeight = np.repeat(np.nan, MxBranches) 222 | branch_size = np.repeat(2, MxBranches) 223 | branch_nMerge = np.repeat(1, MxBranches) 224 | branch_nSingletons = np.repeat(2, MxBranches) 225 | branch_nBasicClusters = np.repeat(0, MxBranches) 226 | branch_mergedInto = np.repeat(0, MxBranches) 227 | branch_attachHeight = np.repeat(np.nan, MxBranches) 228 | #branch_singletons = np.zeros(MxBranches) 229 | branch_singletons = [np.nan] * MxBranches 230 | 
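# The per-branch containers here (singletons, basic clusters, merging
# heights, singleton heights) start out as NaN placeholders and are grown
# lazily in chunkSize-sized blocks (see `extender` below) as merges are
# processed.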
#branch_basicClusters = pd.Series(np.zeros(MxBranches)) 231 | branch_basicClusters = [np.nan] * MxBranches 232 | #branch_mergingHeights = pd.Series(np.zeros(MxBranches)) 233 | branch_mergingHeights = [np.nan] * MxBranches 234 | #branch_singletonHeights = pd.Series(np.zeros(MxBranches)) 235 | branch_singletonHeights = [np.nan] * MxBranches 236 | 237 | 238 | nBranches = 0 239 | 240 | spyIndex = None 241 | if os.path.isfile(".dynamicTreeCutSpyFile"): 242 | spyIndex = pd.read_csv(".dynamicTreeCutSpyFile") 243 | print("Found 'spy file' with indices of objects to watch for.") 244 | spyIndex = spyIndex.iloc[:,1].values 245 | 246 | 247 | defMCS = np.array([0.64, 0.73, 0.82, 0.91, 0.95]) 248 | defMG = (1 - defMCS) * 3 / 4.0 249 | 250 | nSplitDefaults = len(defMCS) 251 | 252 | if type(deepSplit) == bool: deepSplit = int(deepSplit) * (nSplitDefaults - 2) 253 | deepSplit = deepSplit + 1 254 | 255 | if deepSplit < 1 or deepSplit > nSplitDefaults: 256 | raise IndexError("Parameter deepSplit (value", deepSplit, 257 | ") out of range: allowable range is 0 through", 258 | nSplitDefaults - 1) 259 | 260 | if maxCoreScatter == None: maxCoreScatter = interpolate(defMCS, deepSplit) 261 | if minGap == None: minGap = interpolate(defMG, deepSplit) 262 | 263 | if maxAbsCoreScatter == None: 264 | maxAbsCoreScatter = refHeight + maxCoreScatter * (cutHeight - refHeight) 265 | if minAbsGap == None: 266 | minAbsGap = minGap * (cutHeight - refHeight) 267 | 268 | if minSplitHeight == None: minSplitHeight = 0 269 | 270 | if minAbsSplitHeight == None: 271 | minAbsSplitHeight = refHeight + minSplitHeight * (cutHeight - refHeight) 272 | 273 | nPoints = nMerge + 1 274 | 275 | IndMergeToBranch = np.repeat(0, nMerge) 276 | 277 | onBranch = np.repeat(0, nPoints) 278 | 279 | RootBranch = 0 280 | 281 | mergeDiagnostics = dict(smI = np.repeat(np.nan, nMerge), smSize = np.repeat(np.nan, nMerge), 282 | smCrSc = np.repeat(np.nan, nMerge), smGap = np.repeat(np.nan, nMerge), 283 | lgI = np.repeat(np.nan, nMerge), lgSize = np.repeat(np.nan, nMerge), 284 | lgCrSc = np.repeat(np.nan, nMerge), lgGap = np.repeat(np.nan, nMerge), 285 | merged = np.repeat(np.nan, nMerge)) 286 | 287 | if nExternalSplits > 0: 288 | #externalMergeDiags = pd.DataFrame(np.repeat(np.nan, nMerge*nExternalSplits).reshape(nMerge, nExternalSplits)) 289 | #externalMergeDiags.columns = paste("externalBranchSplit", nExternalSplits, sep = ".") 290 | pass 291 | 292 | extender = np.zeros(chunkSize, dtype=int) 293 | 294 | for merge in range(nMerge): 295 | if dendro_height[merge] <= cutHeight: 296 | # are both merged objects singletons? 297 | if dendro_merge[merge, 0] < 0 and dendro_merge[merge, 1] < 0: 298 | nBranches = nBranches + 1 299 | branch_isBasic[nBranches - 1] = True 300 | branch_isTopBasic[nBranches - 1] = True 301 | branch_singletons[nBranches - 1] = np.append(-dendro_merge[merge,], extender) 302 | branch_basicClusters[nBranches - 1] = extender 303 | branch_mergingHeights[nBranches - 1] = np.append(np.repeat(dendro_height[merge], 2), extender) 304 | branch_singletonHeights[nBranches - 1] = np.append(np.repeat(dendro_height[merge], 2), extender) 305 | IndMergeToBranch[merge] = nBranches 306 | RootBranch = nBranches 307 | elif sign(dendro_merge[merge,0]) * sign(dendro_merge[merge,1]) < 0: 308 | clust = IndMergeToBranch[int(np.max(dendro_merge[merge,])) - 1] 309 | if clust == 0: raise ValueError("a previous merge has no associated cluster. 
Sorry!") 310 | gene = -np.min(dendro_merge[merge,]) 311 | ns = branch_nSingletons[clust - 1] + 1 312 | nm = branch_nMerge[clust - 1] + 1 313 | if branch_isBasic[clust - 1]: 314 | if ns > len(branch_singletons[clust - 1]): 315 | branch_singletons[clust - 1] = np.append(branch_singletons[clust - 1], extender) 316 | branch_singletonHeights[clust - 1] = np.append(branch_singletonHeights[clust - 1], extender) 317 | branch_singletons[clust - 1][ns - 1] = gene 318 | branch_singletonHeights[clust - 1][ns - 1] = dendro_height[merge] 319 | else: 320 | onBranch[int(gene) - 1] = clust 321 | 322 | if nm >= len(branch_mergingHeights[clust - 1]): 323 | branch_mergingHeights[clust - 1] = np.append(branch_mergingHeights[clust - 1], extender) 324 | branch_mergingHeights[clust - 1][nm - 1] = dendro_height[merge] 325 | branch_size[clust - 1] = branch_size[clust - 1] + 1 326 | branch_nMerge[clust - 1] = nm 327 | branch_nSingletons[clust - 1] = ns 328 | IndMergeToBranch[merge] = clust 329 | RootBranch = clust 330 | else: 331 | # attempt to merge two branches: 332 | clusts = IndMergeToBranch[dendro_merge[merge,] - 1] 333 | sizes = branch_size[clusts - 1] 334 | # Note: for 2 elements, rank and order are the same. 335 | rnk = rankdata(sizes, method = "ordinal") 336 | small = clusts[rnk[0] - 1] 337 | large = clusts[rnk[1] - 1] 338 | sizes = sizes[rnk - 1] 339 | branch1 = np.nan if np.any(np.isnan(branch_singletons[large - 1])) else branch_singletons[large - 1][np.arange(sizes[1])] 340 | branch2 = np.nan if np.any(np.isnan(branch_singletons[small - 1])) else branch_singletons[small - 1][np.arange(sizes[0])] 341 | spyMatch = False 342 | if spyIndex != None: 343 | n1 = len(set(branch1) & set(spyIndex)) 344 | if n1 / len(branch1) > 0.99 and n1 / len(spyIndex) > 0.99: 345 | print("Found spy match for branch 1 on merge", merge) 346 | spyMatch = True 347 | n2 = len(set(branch2) & set(spyIndex)) 348 | if n2 / len(branch1) > 0.99 and n2 / len(spyIndex) > 0.99: 349 | print("Found spy match for branch 2 on merge", merge) 350 | spyMatch = True 351 | 352 | if branch_isBasic[small - 1]: 353 | coresize = CoreSize(branch_nSingletons[small - 1], minClusterSize) 354 | Core = np.array(branch_singletons[small - 1][np.arange(int(coresize))], dtype=int) 355 | # SmAveDist = mean(apply(distM[Core, Core], 2, sum)/(coresize-1)) 356 | SmAveDist = np.mean(np.sum(dist_multi_index(Core - 1, distM), axis=1) / (coresize - 1)) 357 | else: 358 | SmAveDist = 0 359 | 360 | if branch_isBasic[large - 1]: 361 | coresize = CoreSize(branch_nSingletons[large - 1], minClusterSize) 362 | Core = np.array(branch_singletons[large - 1][np.arange(int(coresize))], dtype=int) 363 | LgAveDist = np.mean(np.sum(dist_multi_index(Core - 1, distM), axis=1) / (coresize -1 )) 364 | else: 365 | LgAveDist = 0 366 | 367 | for key in mergeDiagnostics: 368 | if key == "smI": 369 | mergeDiagnostics[key][merge] = small 370 | elif key == "smSize": 371 | mergeDiagnostics[key][merge] = branch_size[small - 1] 372 | elif key == "smCrSc": 373 | mergeDiagnostics[key][merge] = SmAveDist 374 | elif key == "smGap": 375 | mergeDiagnostics[key][merge] = dendro_height[merge] - SmAveDist 376 | elif key == "lgI": 377 | mergeDiagnostics[key][merge] = large 378 | elif key == "lgSize": 379 | mergeDiagnostics[key][merge] = branch_size[large - 1] 380 | elif key == "lgCrSc": 381 | mergeDiagnostics[key][merge] = LgAveDist 382 | elif key == "lgGap": 383 | mergeDiagnostics[key][merge] = dendro_height[merge] - LgAveDist 384 | elif key == "merged": 385 | mergeDiagnostics[key][merge] = np.nan 386 | 387 | 
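# Decide whether one branch should be absorbed into the other. Quantitatively,
# "too small" means size < minClusterSize, "too diffuse" means core scatter >
# maxAbsCoreScatter, and "too shallow" means the gap (merge height minus core
# scatter) is below minAbsGap, or the merge height itself is below
# minAbsSplitHeight.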
388 | # We first check each cluster separately for being too small, too diffuse, or too shallow: 389 | SmallerScores = [branch_isBasic[small - 1], 390 | branch_size[small - 1] < minClusterSize, 391 | SmAveDist > maxAbsCoreScatter, 392 | dendro_height[merge] - SmAveDist < minAbsGap, 393 | dendro_height[merge] < minAbsSplitHeight] 394 | 395 | if SmallerScores[0] * np.sum(SmallerScores[1:]) > 0: 396 | DoMerge = True 397 | SmallerFailSize = ~np.logical_or(SmallerScores[2], SmallerScores[3]) # Smaller fails only due to size 398 | else: 399 | LargerScores = [branch_isBasic[large - 1], 400 | branch_size[large - 1] < minClusterSize, 401 | LgAveDist > maxAbsCoreScatter, 402 | dendro_height[merge] - LgAveDist < minAbsGap, 403 | dendro_height[merge] < minAbsSplitHeight] 404 | if LargerScores[0] * np.sum(LargerScores[1:]) > 0: 405 | # Actually: the large one is the one to be merged 406 | DoMerge = True 407 | SmallerFailSize = ~np.logical_or(LargerScores[2], LargerScores[3]) # cluster fails only due to size 408 | x = small 409 | small = large 410 | large = x 411 | sizes = sizes[::-1] 412 | else: 413 | DoMerge = False # None of the two satisfies merging criteria 414 | 415 | if DoMerge: 416 | mergeDiagnostics["merged"][merge] = 1 417 | 418 | if ~DoMerge and nExternalSplits > 0 and branch_isBasic[small - 1] and branch_isBasic[large - 1]: 419 | if verbose > 4: print("Entering external split code on merge ", merge) 420 | branch1 = branch_singletons[large - 1][np.arange(sizes[1])] 421 | branch2 = branch_singletons[small - 1][np.arange(sizes[0])] 422 | 423 | if verbose > 4 or spyMatch: print(" ..branch lengths: ", sizes[0], ", ", sizes[1]) 424 | #if (any(is.na(branch1)) || any(branch1==0)) browser(); 425 | #if (any(is.na(branch2)) || any(branch2==0)) browser(); 426 | 427 | 428 | ##### fix after External Splits is understood better 429 | es = 0 430 | while es < nExternalSplits and ~DoMerge: 431 | es = es + 1 432 | args = externalSplitOptions[es - 1] 433 | args = [args, list(branch1 = branch1, branch2 = branch2)] 434 | #extSplit = do.call(externalBranchSplitFnc[es], args) 435 | if spyMatch: 436 | print(" .. external criterion ", es, ": ", extSplit) 437 | DoMerge = extSplit < minExternalSplit[es - 1] 438 | externalMergeDiags[merge, es - 1] = extSplit 439 | if DoMerge: 440 | mergeDiagnostics_merged[merge] = 2 441 | else: 442 | mergeDiagnostics_merged[merge] = 0 443 | 444 | if DoMerge: 445 | # merge the small into the large cluster and close it. 
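# Record why the small branch was closed (failSize marks a failure on size
# alone), which branch absorbed it, and the height at which it attached; its
# singletons (or basic clusters) are then folded into the large branch below.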
446 | branch_failSize[small - 1] = SmallerFailSize 447 | branch_mergedInto[small - 1] = large 448 | branch_attachHeight[small - 1] = dendro_height[merge] 449 | branch_isTopBasic[small - 1] = False 450 | nss = branch_nSingletons[small - 1] 451 | nsl = branch_nSingletons[large - 1] 452 | ns = nss + nsl 453 | 454 | if branch_isBasic[large - 1]: 455 | nExt = np.ceil( (ns - len(branch_singletons[large - 1])) / chunkSize ) 456 | 457 | if nExt > 0: 458 | if verbose > 5: 459 | print("Extending singletons for branch", large, "by", nExt, " extenders.") 460 | 461 | branch_singletons[large - 1] = np.append(branch_singletons[large - 1], np.repeat(extender, nExt)) 462 | branch_singletonHeights[large - 1] = np.append(branch_singletonHeights[large - 1], np.repeat(extender, nExt)) 463 | 464 | branch_singletons[large - 1][np.arange(nsl,ns)] = branch_singletons[small - 1][np.arange(nss)] 465 | branch_singletonHeights[large - 1][np.arange(nsl,ns)] = branch_singletonHeights[small - 1][np.arange(nss)] 466 | branch_nSingletons[large - 1] = ns 467 | 468 | else: 469 | if ~branch_isBasic[small - 1]: 470 | raise ValueError("merging two composite clusters. Sorry!") 471 | 472 | onBranch[ branch_singletons[small - 1][branch_singletons[small - 1] != 0] - 1 ] = large 473 | 474 | nm = branch_nMerge[large - 1] + 1 475 | 476 | if nm > len(branch_mergingHeights[large - 1]): 477 | branch_mergingHeights[large - 1] = np.append(branch_mergingHeights[large - 1], extender) 478 | 479 | branch_mergingHeights[large - 1][nm - 1] = dendro_height[merge] 480 | branch_nMerge[large - 1] = nm 481 | branch_size[large - 1] = branch_size[small - 1] + branch_size[large - 1] 482 | IndMergeToBranch[merge] = large 483 | RootBranch = large 484 | else: 485 | # start or continue a composite cluster. 486 | 487 | # If large is basic and small is not basic, switch them. 488 | if branch_isBasic[large - 1] and ~branch_isBasic[small - 1]: 489 | x = large 490 | large = small 491 | small = x 492 | sizes = sizes[::-1] 493 | 494 | # Note: if pamRespectsDendro, need to start a new composite cluster every time two branches merge, 495 | # otherwise will not have the necessary information. 496 | # Otherwise, if the large cluster is already composite, I can simply merge both clusters into 497 | # one of the non-composite clusters. 
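# Precedence note: `and` binds tighter than `or`, so the condition below
# reads `branch_isBasic[large - 1] or (pamStage and pamRespectsDendro)`.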
498 | 499 | if branch_isBasic[large - 1] or pamStage and pamRespectsDendro: 500 | nBranches = nBranches + 1 501 | branch_attachHeight[[large - 1, small - 1]] = dendro_height[merge] 502 | branch_mergedInto[[large - 1, small - 1]] = nBranches 503 | if branch_isBasic[small - 1]: 504 | addBasicClusters = small # add basic clusters 505 | else: 506 | addBasicClusters = branch_basicClusters[small - 1] 507 | if branch_isBasic[large - 1]: 508 | addBasicClusters = np.append(addBasicClusters, large) 509 | else: 510 | addBasicClusters = np.append(addBasicClusters, branch_basicClusters[large - 1]) 511 | # print(paste(" Starting a composite cluster with number", nBranches)); 512 | branch_isBasic[nBranches - 1] = False 513 | branch_isTopBasic[nBranches - 1] = False 514 | branch_basicClusters[nBranches - 1] = addBasicClusters 515 | branch_mergingHeights[nBranches - 1] = np.append(np.repeat(dendro_height[merge], 2), extender) 516 | branch_nMerge[nBranches - 1] = 2 517 | branch_size[nBranches - 1] = np.sum(sizes) 518 | branch_nBasicClusters[nBranches - 1] = len(addBasicClusters) 519 | IndMergeToBranch[merge] = nBranches 520 | RootBranch = nBranches 521 | else: 522 | # Add small branch to the large one 523 | addBasicClusters = small if branch_isBasic[small - 1] else branch_basicClusters[small - 1] 524 | nbl = branch_nBasicClusters[large - 1] 525 | #small might be an int 526 | try: 527 | nb = branch_nBasicClusters[large - 1] + len(addBasicClusters) 528 | except TypeError: 529 | nb = branch_nBasicClusters[large - 1] + 1 530 | 531 | if nb > len(branch_basicClusters[large - 1]): 532 | nExt = np.ceil( ( nb - len(branch_basicClusters[large - 1])) / chunkSize) 533 | branch_basicClusters[large - 1] = np.append(branch_basicClusters[large - 1], np.repeat(extender, nExt)) 534 | 535 | branch_basicClusters[large - 1][np.arange(nbl,nb)] = addBasicClusters 536 | branch_nBasicClusters[large - 1] = nb 537 | branch_size[large - 1] = branch_size[large - 1] + branch_size[small - 1] 538 | nm = branch_nMerge[large - 1] + 1 539 | 540 | if nm > len(branch_mergingHeights[large - 1]): 541 | branch_mergingHeights[large - 1] = np.append(branch_mergingHeights[large - 1], extender) 542 | 543 | branch_mergingHeights[large - 1][nm - 1] = dendro_height[merge] 544 | branch_nMerge[large - 1] = nm 545 | branch_attachHeight[small - 1] = dendro_height[merge] 546 | branch_mergedInto[small - 1] = large 547 | IndMergeToBranch[merge] = large 548 | RootBranch = large 549 | 550 | if verbose > 2: print("..Going through detected branches and marking clusters..") 551 | 552 | isCluster = np.repeat(False, nBranches) 553 | SmallLabels = np.repeat(0, nPoints) 554 | 555 | for clust in range(nBranches): 556 | 557 | if np.isnan(branch_attachHeight[clust]): branch_attachHeight[clust] = cutHeight 558 | if branch_isTopBasic[clust]: 559 | coresize = CoreSize(branch_nSingletons[clust], minClusterSize) 560 | Core = branch_singletons[clust][np.arange(coresize)] 561 | CoreScatter = np.mean(np.sum(dist_multi_index(Core - 1, distM), axis=1) / (coresize - 1)) 562 | isCluster[clust] = np.logical_and(np.logical_and(branch_isTopBasic[clust], 563 | branch_size[clust] >= minClusterSize), 564 | np.logical_and(CoreScatter < maxAbsCoreScatter, 565 | branch_attachHeight[clust] - CoreScatter > minAbsGap)) 566 | else: 567 | CoreScatter = 0 568 | 569 | if branch_failSize[clust]: SmallLabels[branch_singletons[clust][branch_singletons[clust] != 0] - 1] = clust + 1 570 | 571 | if not respectSmallClusters: SmallLabels = np.repeat(0, nPoints) 572 | 573 | if verbose > 2: print(spaces, 
"..Assigning Tree Cut stage labels..") 574 | 575 | Colors = np.repeat(0, nPoints) 576 | coreLabels = np.repeat(0, nPoints) 577 | clusterBranches = np.arange(nBranches)[isCluster] 578 | branchLabels = np.repeat(0, nBranches) 579 | color = 0 580 | 581 | for clust in clusterBranches: 582 | color = color + 1 583 | Colors[branch_singletons[clust][branch_singletons[clust] != 0] - 1] = color 584 | SmallLabels[branch_singletons[clust][branch_singletons[clust] != 0] - 1] = 0 585 | coresize = CoreSize(branch_nSingletons[clust], minClusterSize) 586 | Core = branch_singletons[clust][np.arange(coresize)] 587 | coreLabels[Core - 1] = color 588 | branchLabels[clust] = color 589 | 590 | Labeled = np.arange(nPoints)[Colors != 0] 591 | Unlabeled = np.arange(nPoints)[Colors == 0] 592 | nUnlabeled = len(Unlabeled) 593 | UnlabeledExist = nUnlabeled > 0 594 | 595 | if len(Labeled) > 0: 596 | LabelFac = factor(Colors[Labeled]) 597 | nProperLabels = nlevels(LabelFac) 598 | else: 599 | nProperLabels = 0 600 | 601 | 602 | if pamStage and UnlabeledExist and nProperLabels > 0: 603 | if verbose > 2: print(spaces, "..Assigning PAM stage labels..") 604 | nPAMed = 0 605 | # Assign some of the grey genes to the nearest module. Define nearest as the distance to the medoid, 606 | # that is the point in the cluster that has the lowest average distance to all other points in the 607 | # cluster. First get the medoids. 608 | if useMedoids: 609 | Medoids = np.repeat(0, nProperLabels) 610 | ClusterRadii = np.repeat(0.0, nProperLabels) 611 | for cluster in range(1, nProperLabels + 1): 612 | InCluster = np.arange(1,nPoints+1)[Colors == cluster] 613 | DistInCluster = dist_multi_index(InCluster - 1, distM) 614 | #DistInCluster = distM[InCluster, InCluster] 615 | DistSums = np.sum(DistInCluster, axis=1) 616 | Medoids[cluster - 1] = InCluster[np.argmin(DistSums)] 617 | ClusterRadii[cluster - 1] = np.max(DistInCluster[:, np.argmin(DistSums)]) 618 | # If small clusters are to be respected, assign those first based on medoid-medoid distances. 
619 | if respectSmallClusters: 620 | FSmallLabels = factor(SmallLabels) 621 | SmallLabLevs = levels(FSmallLabels) 622 | nSmallClusters = nlevels(FSmallLabels) - (SmallLabLevs[0] == 0) 623 | if nSmallClusters > 0 : 624 | for sclust in SmallLabLevs[SmallLabLevs != 0]: 625 | InCluster = np.arange(nPoints)[SmallLabels == sclust] 626 | if pamRespectsDendro: 627 | onBr = np.unique(onBranch[InCluster]) 628 | if len(onBr) > 1: 629 | raise ValueError("Internal error: objects in a small cluster are marked to belong", 630 | "\nto several large branches:") 631 | if onBr > 0: 632 | basicOnBranch = branch_basicClusters[onBr[0] - 1] 633 | labelsOnBranch = branchLabels[basicOnBranch - 1] 634 | else: 635 | labelsOnBranch = None 636 | else: 637 | labelsOnBranch = np.arange(1, nProperLabels + 1) 638 | # printFlush(paste("SmallCluster", sclust, "has", length(InCluster), "elements.")); 639 | DistInCluster = dist_multi_index(InCluster, distM) 640 | #DistInCluster = distM[InCluster, InCluster] 641 | if len(labelsOnBranch) > 0: 642 | if len(InCluster) > 1: 643 | DistSums = df_apply.apply(np.sum, DistInCluster, 1) 644 | smed = InCluster[np.argmin(DistSums)] 645 | DistToMeds = get_rows(Medoids[labelsOnBranch - 1][Medoids[labelsOnBranch - 1] != 0] - 1, distM)[:, smed] 646 | closest = np.argmin(DistToMeds) 647 | DistToClosest = DistToMeds[closest] 648 | closestLabel = labelsOnBranch[closest] 649 | if DistToClosest < ClusterRadii[closestLabel - 1] or DistToClosest < maxPamDist: 650 | Colors[InCluster] = closestLabel 651 | nPAMed = nPAMed + len(InCluster) 652 | else: Colors[InCluster] = -1 # This prevents individual points from being assigned later 653 | else: 654 | Colors[InCluster] = -1 655 | 656 | # Assign leftover unlabeled objects to clusters with nearest medoids 657 | Unlabeled = np.arange(nPoints)[Colors == 0] 658 | if len(Unlabeled > 0): 659 | for obj in Unlabeled: 660 | if pamRespectsDendro: 661 | onBr = onBranch[obj] 662 | if onBr > 0: 663 | basicOnBranch = branch_basicClusters[onBr - 1] 664 | labelsOnBranch = branchLabels[basicOnBranch - 1] 665 | else: 666 | labelsOnBranch = None 667 | else: 668 | labelsOnBranch = np.arange(nProperLabels) 669 | if labelsOnBranch != None: 670 | UnassdToMedoidDist = get_rows(Medoids[labelsOnBranch - 1] - 1, distM)[:,obj] 671 | #UnassdToMedoidDist = distM[Medoids[labelsOnBranch], obj] 672 | nearest= np.argmin(UnassdToMedoidDist) 673 | NearestCenterDist = UnassdToMedoidDist[nearest] 674 | nearestMed = labelsOnBranch[nearest] 675 | if NearestCenterDist < ClusterRadii[nearestMed - 1] or NearestCenterDist < maxPamDist: 676 | Colors[obj] = nearestMed 677 | nPAMed = nPAMed + 1 678 | 679 | UnlabeledExist = np.sum(Colors == 0) > 0 680 | else: # Instead of medoids, use average distances 681 | # This is the default method, so I will try to tune it for speed a bit. 682 | ClusterDiam = np.repeat(0, nProperLabels) 683 | for cluster in range(nProperLabels): 684 | InCluster = np.arange(nPoints)[Colors == cluster] 685 | nInCluster = len(InCluster) 686 | DistInCluster = dist_multi_index(InCluster, distM) 687 | #DistInCluster = distM[InCluster, InCluster] 688 | if nInCluster > 1: 689 | AveDistInClust = np.sum(DistInCluster, axis=1) / (nInCluster - 1) 690 | ClusterDiam[cluster] = np.max(AveDistInClust) 691 | else: 692 | ClusterDiam[cluster] = 0 693 | 694 | # If small clusters are respected, assign them first based on average cluster-cluster distances. 
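# ColorsX freezes the pre-PAM labels so that candidate clusters are always
# drawn from the original tree-cut assignment, while Colors itself is updated
# in place as small clusters and leftover points are (re)assigned.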
695 | ColorsX = Colors.copy() 696 | if respectSmallClusters: 697 | FSmallLabels = factor(SmallLabels) #### think about 698 | SmallLabLevs = levels(FSmallLabels) ##### think about 699 | nSmallClusters = nlevels(FSmallLabels) - (SmallLabLevs[0] == 0) 700 | if nSmallClusters > 0: 701 | if pamRespectsDendro: 702 | for sclust in SmallLabLevs[SmallLabLevs != 0]: 703 | InCluster = np.arange(nPoints)[SmallLabels == sclust] 704 | onBr = np.unique(onBranch[InCluster]) 705 | 706 | if len(onBr) > 1: 707 | raise ValueError("objects in a small cluster are marked to belong", 708 | "\nto several large branches:") 709 | if onBr > 0: 710 | basicOnBranch = branch_basicClusters[onBr[0] - 1] 711 | labelsOnBranch = branchLabels[basicOnBranch - 1] 712 | useObjects = np.in1d(ColorsX, np.unique(labelsOnBranch)) 713 | DistSClustClust = get_rows(InCluster, distM)[:,useObjects] 714 | #DistSClustClust = distM[InCluster, useObjects] 715 | MeanDist = np.mean(DistSClustClust, axis=0) 716 | useColorsFac = factor(ColorsX[useObjects]) ### think about 717 | MeanMeanDist = tapply(MeanDist, useColorsFac, np.mean) ## think about 718 | nearest = np.argmin(MeanMeanDist) 719 | NearestDist = MeanMeanDist[nearest] 720 | nearestLabel = levels(useColorsFac)[nearest] ## think about 721 | if NearestDist < ClusterDiam[nearestLabel - 1] or NearestDist < maxPamDist: 722 | Colors[InCluster] = nearestLabel 723 | nPAMed = nPAMed + len(InCluster) 724 | else: 725 | Colors[InCluster] = -1 # This prevents individual points from being assigned later 726 | 727 | else: 728 | labelsOnBranch = np.arange(nProperLabels) 729 | useObjects = np.arange(nPoints)[ColorsX != 0] 730 | for sclust in SmallLabLevs[SmallLabLevs != 0]: 731 | InCluster = np.arange(nPoints)[SmallLabels == sclust] 732 | DistSClustClust = get_rows(InCluster, distM)[:,useObjects] 733 | #DistSClustClust = distM[InCluster, useObjects] 734 | MeanDist = np.mean(DistSClustClust, axis=0) 735 | useColorsFac = factor(ColorsX[useObjects]) ### think about 736 | MeanMeanDist = tapply(MeanDist, useColorsFac, np.mean) ### think about 737 | nearest = np.argmin(MeanMeanDist) 738 | NearestDist = MeanMeanDist[nearest] 739 | nearestLabel = levels(useColorsFac)[nearest] ## think about 740 | if NearestDist < ClusterDiam[nearestLabel - 1] or NearestDist < maxPamDist: 741 | Colors[InCluster] = nearestLabel 742 | nPAMed = nPAMed + len(InCluster) 743 | else: 744 | Colors[InCluster] = -1 # This prevents individual points from being assigned later 745 | 746 | # Assign leftover unlabeled objects to clusters with nearest medoids 747 | Unlabeled = np.arange(nPoints)[Colors == 0] 748 | #ColorsX = Colors; 749 | if len(Unlabeled) > 0: 750 | if pamRespectsDendro: 751 | unlabOnBranch = Unlabeled[onBranch[Unlabeled] > 0] 752 | for obj in unlabOnBranch: 753 | onBr = onBranch[obj] 754 | basicOnBranch = branch_basicClusters[onBr - 1] 755 | labelsOnBranch = branchLabels[basicOnBranch - 1] 756 | useObjects = np.in1d(ColorsX, np.unique(labelsOnBranch)) 757 | useColorsFac = factor(ColorsX[useObjects]) ### think about 758 | #UnassdToClustDist = tapply(distM[useObjects, obj], useColorsFac, mean) ### think about 759 | UnassdToClustDist = tapply(get_rows(useObjects, distM)[:,obj], useColorsFac, np.mean) ### think about 760 | nearest = np.argmin(UnassdToClustDist) 761 | NearestClusterDist = UnassdToClustDist[nearest] 762 | nearestLabel = levels(useColorsFac)[nearest] ### think about 763 | if NearestClusterDist < ClusterDiam[nearestLabel - 1] or NearestClusterDist < maxPamDist: 764 | Colors[obj] = nearestLabel 765 | nPAMed = nPAMed + 
1 766 | 767 | else: 768 | useObjects = np.arange(nPoints)[ColorsX != 0] 769 | useColorsFac = factor(ColorsX[useObjects]) ## think about 770 | nUseColors = nlevels(useColorsFac) ### think about 771 | UnassdToClustDist = tapply_df(get_rows(useObjects, distM)[:,Unlabeled], useColorsFac, np.mean, 1) 772 | #UnassdToClustDist = df_apply.apply(distM[useObjects, Unlabeled], 1, tapply, useColorsFac, mean) ### think about 773 | # Fix dimensions for the case when there's only one cluster 774 | #dim(UnassdToClustDist) = np.append(nUseColors, len(Unlabeled)) ### think about 775 | nearest = df_apply.apply(np.argmin, UnassdToClustDist, 1) 776 | nearestDist = df_apply.apply(np.min, UnassdToClustDist, 1) 777 | nearestLabel = levels(useColorsFac)[nearest - 1] ### think about 778 | assign = np.logical_or(nearestDist < ClusterDiam[nearestLabel - 1], nearestDist < maxPamDist) 779 | Colors[Unlabeled[assign]] = nearestLabel[assign] 780 | nPAMed = nPAMed + np.sum(assign) 781 | 782 | if verbose > 2: print("....assigned", nPAMed, "objects to existing clusters.") 783 | 784 | 785 | # Relabel labels such that 1 corresponds to the largest cluster etc. 786 | Colors[Colors < 0] = 0 787 | UnlabeledExist = np.sum(Colors == 0) > 0 788 | NumLabs = Colors + 1 789 | Sizes = table(NumLabs) ### think about 790 | if UnlabeledExist: 791 | if len(Sizes) > 1: 792 | SizeRank = np.append(1, rankdata(-Sizes.iloc[1:len(Sizes)], method="ordinal")+1) 793 | else: 794 | SizeRank = np.array([1]) 795 | OrdNumLabs = SizeRank[NumLabs - 1] 796 | else: 797 | SizeRank = rankdata(-Sizes.iloc[np.arange(len(Sizes))], method="ordinal") 798 | OrdNumLabs = SizeRank[NumLabs - 2] 799 | ordCoreLabels = OrdNumLabs - UnlabeledExist 800 | ordCoreLabels[coreLabels == 0] = 0 801 | 802 | if verbose > 0: print( "..done.") 803 | 804 | results = dict(labels = OrdNumLabs-UnlabeledExist, 805 | cores = ordCoreLabels, 806 | smallLabels = SmallLabels, 807 | onBranch = onBranch, 808 | mergeDiagnostics = mergeDiagnostics if nExternalSplits==0 else pd.DataFrame({'x':mergeDiagnostics, 'y':externalMergeDiags}), 809 | mergeCriteria = dict(maxCoreScatter = maxCoreScatter, minGap = minGap, 810 | maxAbsCoreScatter = maxAbsCoreScatter, minAbsGap = minAbsGap, 811 | minExternalSplit = minExternalSplit), 812 | branches = dict(nBranches = nBranches, # Branches = Branches, 813 | IndMergeToBranch = IndMergeToBranch, 814 | RootBranch = RootBranch, isCluster = isCluster, 815 | nPoints = nMerge+1)) 816 | 817 | return(results) 818 | 819 | 820 | 821 | 822 | 823 | 824 | 825 | 826 | 827 | 828 | 829 | 830 | 831 | 832 | 833 | 834 | 835 | 836 | 837 | 838 | -------------------------------------------------------------------------------- /dynamicTreeCut/tests/test_dynamicTreeCut.py: -------------------------------------------------------------------------------- 1 | from scipy.cluster.hierarchy import linkage 2 | from scipy.spatial.distance import pdist 3 | import numpy as np 4 | 5 | 6 | def test_cuttreeHybrid(): 7 | from dynamicTreeCut import cutreeHybrid 8 | d = np.transpose(np.arange(1, 10001).reshape(100, 100)) 9 | distances = pdist(d, "euclidean") 10 | link = linkage(distances, "average") 11 | test = cutreeHybrid(link, distances) 12 | 13 | true = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 14 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 15 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 16 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 17 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 18 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 19 | 1, 1, 1, 1] 20 | 21 | assert 
(test['labels'] == true).all()
22 |
23 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
 1 | name: dynamicTreeCut-env
 2 | channels:
 3 |   - r
 4 |   - defaults
 5 | dependencies:
 6 |   - mkl
 7 |   - numpy
 8 |   - openssl
 9 |   - pandas
10 |   - pip
11 |   - python
12 |   - readline
13 |   - scipy
14 |   - setuptools
15 |   - sqlite
16 |   - tk
17 |   - wheel
18 |   - xz
19 |   - zlib
20 |
21 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | scipy
2 | numpy
3 | pandas
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.rst
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 |
 3 |
 4 | with open("README.rst", "r") as fh:
 5 |     long_description = fh.read()
 6 |
 7 | setup(
 8 |     name="dynamicTreeCut",
 9 |     version="0.1.1",
10 |     packages=["dynamicTreeCut"],
11 |     scripts=["dynamicTreeCut/df_apply.py", "dynamicTreeCut/R_func.py"],
12 |     author="Kyle S. Smith",
13 |     license="GPL-3 License",
14 |     description='Dynamic Tree Cut',
15 |     install_requires=['numpy', 'scipy'],
16 |     long_description=long_description,
17 |     url="https://github.com/kylessmith/dynamicTreeCut",
18 |     author_email="kyle.smith@stjude.org",
19 | )
--------------------------------------------------------------------------------