├── .gitignore ├── README.rst ├── dynamicTreeCut ├── R_func.py ├── __init__.py ├── __main__.py ├── df_apply.py ├── dynamicTreeCut.py └── tests │ └── test_dynamicTreeCut.py ├── environment.yml ├── requirements.txt ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | .cache 2 | .eggs 3 | .idea 4 | *~ 5 | *.pyc 6 | build/* 7 | dynamicTreeCut.egg-info/* 8 | dist/* 9 | writing/* 10 | 11 | # Ignore CSVs unless explicitly added 12 | .csv 13 | 14 | # Jupyter/IPython Checkpoints 15 | .ipynb_checkpoints 16 | 17 | # Emacs temporary files 18 | *~ 19 | 20 | # Mac OS files 21 | .DS_Store 22 | 23 | # Byte-compiled / optimized / DLL files 24 | __pycache__/ 25 | *.py[cod] 26 | *$py.class 27 | 28 | # C extensions 29 | *.so 30 | 31 | # Distribution / packaging 32 | .Python 33 | build/ 34 | develop-eggs/ 35 | dist/ 36 | downloads/ 37 | eggs/ 38 | .eggs/ 39 | lib/ 40 | lib64/ 41 | parts/ 42 | sdist/ 43 | var/ 44 | wheels/ 45 | *.egg-info/ 46 | .installed.cfg 47 | *.egg 48 | 49 | # PyInstaller 50 | # Usually these files are written by a python script from a template 51 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 52 | *.manifest 53 | *.spec 54 | 55 | # Installer logs 56 | pip-log.txt 57 | pip-delete-this-directory.txt 58 | 59 | # Unit test / coverage reports 60 | htmlcov/ 61 | .tox/ 62 | .coverage 63 | .coverage.* 64 | .cache 65 | nosetests.xml 66 | coverage.xml 67 | *.cover 68 | .hypothesis/ 69 | 70 | # Translations 71 | *.mo 72 | *.pot 73 | 74 | # Django stuff: 75 | *.log 76 | local_settings.py 77 | 78 | # Flask stuff: 79 | instance/ 80 | .webassets-cache 81 | 82 | # Scrapy stuff: 83 | .scrapy 84 | 85 | # Sphinx documentation 86 | docs/_build/ 87 | 88 | # PyBuilder 89 | target/ 90 | 91 | # Jupyter Notebook 92 | .ipynb_checkpoints 93 | 94 | # pyenv 95 | .python-version 96 | 97 | # celery beat schedule file 98 | celerybeat-schedule 99 | 100 | # SageMath parsed files 101 | *.sage.py 102 | 103 | # Environments 104 | .env 105 | .venv 106 | env/ 107 | venv/ 108 | ENV/ 109 | 110 | # Spyder project settings 111 | .spyderproject 112 | .spyproject 113 | 114 | # Rope project settings 115 | .ropeproject 116 | 117 | # mkdocs documentation 118 | /site 119 | 120 | # mypy 121 | .mypy_cache/ -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | |Stars| |PyPIDownloads| |PyPI| |Build Status| |Coffee| 2 | 3 | .. |Stars| image:: https://img.shields.io/github/stars/kylessmith/dynamicTreeCut?logo=GitHub&color=yellow 4 | :target: https://github.com/kylessmith/dynamicTreeCut/stargazers 5 | .. |PyPIDownloads| image:: https://pepy.tech/badge/dynamicTreeCut 6 | :target: https://pepy.tech/project/dynamicTreeCut 7 | .. |PyPI| image:: https://img.shields.io/pypi/v/dynamicTreeCut.svg 8 | :target: https://pypi.org/project/dynamicTreeCut 9 | .. |Build Status| image:: https://travis-ci.org/kylessmith/dynamicTreeCut.svg?branch=master 10 | :target: https://travis-ci.org/kylessmith/dynamicTreeCut 11 | .. |Coffee| image:: https://img.shields.io/badge/-buy_me_a%C2%A0coffee-gray?logo=buy-me-a-coffee&color=ff69b4 12 | :target: https://www.buymeacoffee.com/kylessmith 13 | 14 | 15 | Python translation of the hybrid dynamicTreeCut method created by Peter Langfelder and Bin Zhang. 
16 |
17 | dynamicTreeCut was originally published in *Bioinformatics*:
18 | Langfelder P, Zhang B, Horvath S (2008) Defining clusters from a hierarchical cluster tree:
19 | the Dynamic Tree Cut package for R. Bioinformatics 2008 24(5):719-720
20 |
21 | The dynamicTreeCut R code is distributed under the GPL-3 License and the
22 | original sources should be cited.
23 |
24 |
25 | dynamicTreeCut contains methods for the detection of clusters in hierarchical clustering dendrograms.
26 | *NOTE: though the clusters match the R output, the cluster labels are shuffled*
27 |
28 | Installing
29 | ==========
30 |
31 | To install, it's best to create an environment after downloading and installing the
32 | `Anaconda Python Distribution <https://continuum.io/downloads>`__, then running::
33 |
34 |     conda env create --file environment.yml
35 |
36 | To install from PyPI, presuming you have its requirements (numpy and scipy) installed::
37 |
38 |     pip install dynamicTreeCut
39 |
40 |
41 | Usage
42 | =====
43 | ::
44 |
45 |     >>> from dynamicTreeCut import cutreeHybrid
46 |     >>> from scipy.spatial.distance import pdist
47 |     >>> import numpy as np
48 |     >>> from scipy.cluster.hierarchy import linkage
49 |     >>> d = np.transpose(np.arange(1,10001).reshape(100,100))
50 |     >>> distances = pdist(d, "euclidean")
51 |     >>> link = linkage(distances, "average")
52 |     >>> clusters = cutreeHybrid(link, distances)
53 |     ..cutHeight not given, setting it to 495.1 ===> 99% of the (truncated) height range in dendro.
54 |     ..done.
55 |     >>> clusters["labels"]
56 |     [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3
57 |      3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1
58 |      1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
59 |
60 |
61 | Compared to R::
62 |
63 |     > library(dynamicTreeCut)
64 |     > d = matrix(1:10000, 100)
65 |     > distances <- dist(d, method="euclidean")
66 |     > dendro <- hclust(distances, method="average")
67 |     > clusters <- cutreeDynamic(dendro, distM=as.matrix(distances))
68 |     ..cutHeight not given, setting it to 495 ===> 99% of the (truncated) height range in dendro.
69 |     ..done.
70 |     > clusters
71 |     [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3
72 |     [38] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1
73 |     [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
74 |
75 | Dependencies
76 | ============
77 |
78 | If you don't already have numpy and scipy installed, it is best to download
79 | `Anaconda`, a Python distribution that includes them:
80 | 81 | https://continuum.io/downloads 82 | 83 | Dependencies can be installed by:: 84 | 85 | pip install -r requirements.txt 86 | 87 | 88 | License 89 | ======= 90 | 91 | dynamicTreeCut is available under the GPL-3 License 92 | -------------------------------------------------------------------------------- /dynamicTreeCut/R_func.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from scipy.cluster.hierarchy import to_tree 4 | 5 | 6 | def sign(value): 7 | #python version of R's sign 8 | 9 | if value > 0: 10 | return(1) 11 | elif value < 0: 12 | return(-1) 13 | else: 14 | return(0) 15 | 16 | def paste(string, n, sep=""): 17 | #python version of R's paste 18 | 19 | results = [] 20 | for i in range(n): 21 | results.append(string + sep + str(i)) 22 | 23 | return(results) 24 | 25 | 26 | def get_heights(Z): 27 | #python verison of R's dendro$height 28 | #height = np.zeros(len(dendro["dcoord"])) 29 | 30 | #for i, d in enumerate(dendro["dcoord"]): 31 | #height[i] = d[1] 32 | 33 | clusternode = to_tree(Z, True) 34 | #height = np.array([c.dist for c in clusternode[1]]) 35 | height = np.array([c.dist for c in clusternode[1] if c.is_leaf() != True]) 36 | 37 | #height.sort() 38 | 39 | return(height) 40 | 41 | 42 | def get_merges(z): 43 | #python version of R's dendro$merge 44 | n = z.shape[0] 45 | merges = np.zeros((z.shape[0], 2), dtype=int) 46 | 47 | for i in range(z.shape[0]): 48 | for j in range(2): 49 | if z[i][j] <= n: 50 | merges[i][j] = -(z[i][j] + 1) 51 | else: 52 | cluster = z[i][j] - n 53 | merges[i][j] = cluster 54 | 55 | return(merges) 56 | 57 | 58 | def factor(vector): 59 | return(vector) 60 | 61 | 62 | def nlevels(vector): 63 | #python version of R's nlevels 64 | return(len(np.unique(vector))) 65 | 66 | 67 | def levels(vector): 68 | #python version of R's levels 69 | return(np.unique(vector)) 70 | 71 | 72 | def tapply(vector, index, function): #can add **args, **kwargs 73 | #python version of R's tapply 74 | 75 | factors = np.unique(index) 76 | 77 | #results = pd.Series(np.repeat(np.nan, len(factors))) 78 | results = np.repeat(np.nan, len(factors)) 79 | #results.index = factors 80 | 81 | for i, k in enumerate(factors): 82 | subset = vector[index == k] 83 | #results.iloc[i] = function(subset) 84 | results[i] = function(subset) 85 | 86 | return(results) 87 | 88 | 89 | def tapply_df(df, index, function, axis=0): #can add **args, **kwargs 90 | #python version of R's tapply 91 | 92 | factors = np.unique(index) 93 | 94 | if axis == 1: 95 | #results = pd.DataFrame(np.zeros((len(factors), df.shape[1]))) 96 | results = np.zeros((len(factors), df.shape[1])) 97 | else: 98 | #results = pd.DataFrame(np.zeros((df.shape[0], len(factors)))) 99 | results = np.zeros((df.shape[0], len(factors))) 100 | 101 | #results.index = factors 102 | 103 | if axis == 1: 104 | for j in range(df.shape[1]): 105 | for i, k in enumerate(factors): 106 | subset = df[index == k, j] 107 | #results.iloc[i, j] = function(subset) 108 | results[i, j] = function(subset) 109 | else: 110 | for i in range(df.shape[0]): 111 | for j, k in enumerate(factors): 112 | subset = df[i, index == k] 113 | #results.iloc[i, j] = function(subset) 114 | results[i, j] = function(subset) 115 | 116 | return(results) 117 | 118 | 119 | def table(vector): 120 | 121 | factors = np.unique(vector) 122 | results = pd.Series(np.zeros(len(factors), dtype=int)) 123 | results.index = factors 124 | 125 | for i, k in enumerate(factors): 126 | results.iloc[i] = np.sum(vector == k) 
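    # `results` is a Series of counts indexed by the unique values of
    # `vector`, mirroring the output of R's table()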
127 | 128 | return(results) 129 | 130 | -------------------------------------------------------------------------------- /dynamicTreeCut/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Python translation of the hybrid dynamicTreeCut 3 | Reference: 4 | Langfelder P, Zhang B, Horvath S (2007) Defining clusters from a hierarchical cluster tree: 5 | the Dynamic Tree Cut package for R. Bioinformatics 2008 24(5):719-720 6 | ''' 7 | 8 | from .dynamicTreeCut import cutreeHybrid 9 | 10 | __author__ = 'kylessmith' -------------------------------------------------------------------------------- /dynamicTreeCut/__main__.py: -------------------------------------------------------------------------------- 1 | from . import cutreeHybrid 2 | 3 | if __name__ == "__main__": 4 | cutreeHybrid() -------------------------------------------------------------------------------- /dynamicTreeCut/df_apply.py: -------------------------------------------------------------------------------- 1 | #import pandas as pd 2 | import numpy as np 3 | from functools import partial 4 | from multiprocessing import Pool 5 | 6 | 7 | #iterate through rows of DataFrame 8 | def gen_row(ndarray): 9 | 10 | for i in range(ndarray.shape[0]): 11 | yield ndarray[i,:] 12 | 13 | 14 | #iterate through columns of DataFrame 15 | def gen_col(ndarray): 16 | 17 | for i in range(ndarray.shape[1]): 18 | yield ndarray[:,i] 19 | 20 | #apply a function to each row or columns of a DataFrame 21 | def apply(func, df, axis=0, ncores=None, p=None, **kwargs): 22 | 23 | #check axis input is 0 or 1 24 | if axis not in (0,1): 25 | raise IndexError("axis must equal 0 or 1") 26 | 27 | #check if p is provided or needs to be created 28 | if p == None and ncores != None: 29 | p = Pool(ncores) 30 | 31 | #create function and pass kwargs 32 | g = partial(func, **kwargs) 33 | 34 | #if axis is 0 apply function to rows 35 | if axis == 0: 36 | #conduct multiprocessed version or not 37 | if p != None: 38 | iter_results = p.map(g, gen_row(df)) 39 | else: 40 | iter_results = map(g, gen_row(df)) 41 | #if axis is 1 apply function to columns 42 | elif axis == 1: 43 | #conduct multiprocessed version or not 44 | if p != None: 45 | iter_results = p.map(g, gen_col(df)) 46 | else: 47 | iter_results = map(g, gen_col(df)) 48 | 49 | #close Pool if it wasn't provided 50 | if ncores != None: 51 | p.close() 52 | p.join() 53 | 54 | #create DataFrame for output 55 | results = np.array(list(iter_results)) 56 | #if applied to comluns output transposed 57 | #results(to retain shape of df input) 58 | if axis == 1: 59 | results = np.transpose(results) 60 | 61 | return(results) 62 | -------------------------------------------------------------------------------- /dynamicTreeCut/dynamicTreeCut.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from scipy.stats import rankdata 4 | from scipy.special import binom #faster than comb 5 | import dynamicTreeCut.df_apply 6 | from functools import partial 7 | from dynamicTreeCut.R_func import * 8 | 9 | 10 | chunkSize = 100 11 | spaces = " " 12 | 13 | 14 | def dist_index(i, j, matrix, l, n): 15 | """ 16 | Function to index flat matrix as squareform matrix 17 | 18 | Parameters 19 | ---------- 20 | i : int 21 | First index 22 | j : int 23 | Second index 24 | matrix : numpy.ndarray 25 | Squareform matrix 26 | l : int 27 | n : int 28 | 29 | Returns 30 | ------- 31 | matrix_slice : float 32 | Indexed value 33 | 34 | """ 35 | 36 | # 
Check if indices are the same 37 | if i == j: 38 | return 0.0 39 | 40 | # Calculate index 41 | index = int(l - binom(n-min(i, j), 2) + (max(i, j) - min(i, j) - 1)) 42 | 43 | # Slice matrix 44 | matrix_slice = matrix[index] 45 | 46 | return matrix_slice 47 | 48 | 49 | def dist_multi_index(_array, matrix): 50 | """ 51 | Function to index flat matrix as squareform matrix 52 | """ 53 | 54 | #handle 2D array 55 | if len(matrix.shape) == 2: 56 | return(matrix[_array, :][:, _array]) 57 | 58 | l = len(matrix) 59 | n = 0.5*(np.sqrt((8*l)+1)+1) 60 | 61 | results = np.zeros((len(_array), len(_array))) 62 | for i in range(len(_array)): 63 | for j in range(i, len(_array)): 64 | score = dist_index(_array[i], _array[j], matrix, l, n) 65 | results[i,j] = score 66 | results[j,i] = score 67 | 68 | return results 69 | 70 | 71 | def get_rows(_array, matrix): 72 | """ 73 | Function to index rows of flat matrix as squareform matrix 74 | """ 75 | 76 | #handle 2D array 77 | if len(matrix.shape) == 2: 78 | return(matrix[_array,:]) 79 | 80 | l = len(matrix) 81 | n = int(0.5*(np.sqrt((8*l)+1)+1)) 82 | 83 | if _array.dtype != "bool": 84 | results = np.zeros((len(_array), n)) 85 | 86 | for row, i in enumerate(_array): 87 | for j in range(n): 88 | if i == j: 89 | results[row, j] = 0.0 90 | else: 91 | index = int(l - binom(n - min(i, j), 2) + (max(i, j) - min(i, j) - 1)) 92 | results[row,j] = matrix[index] 93 | 94 | return results 95 | 96 | else: 97 | results = np.zeros((np.sum(_array), n)) 98 | row = 0 99 | for i, b in enumerate(_array): 100 | if b == True: 101 | for j in range(n): 102 | if i == j: 103 | results[row, j] = 0.0 104 | else: 105 | index = int(l - binom(n - min(i, j), 2) + (max(i, j) - min(i, j) - 1)) 106 | results[row, j] = matrix[index] 107 | 108 | row += 1 109 | 110 | return results 111 | 112 | 113 | def CoreSize(BranchSize, minClusterSize): 114 | """ 115 | The following are supporting function for GetClusters. 116 | """ 117 | 118 | BaseCoreSize = minClusterSize / 2 + 1 119 | if BaseCoreSize < BranchSize: 120 | CoreSize = BaseCoreSize + np.sqrt(BranchSize - BaseCoreSize) 121 | else: 122 | CoreSize = BranchSize 123 | 124 | return int(CoreSize) 125 | 126 | 127 | # This assumes the diagonal of the distance matrix 128 | # is zero, BranchDist is a square matrix whose dimension is at least 2. 
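# CoreScatter estimates how tight a branch is: it picks an "effective core"
# of roughly minClusterSize/2 + 1 points (plus the same sqrt-growth term used
# in CoreSize above) having the smallest average distance to the rest of the
# branch, and returns the mean of the core points' average in-core distances.
# Note that cutreeHybrid below computes this quantity inline via CoreSize and
# dist_multi_index rather than calling this helper.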
129 | def CoreScatter(BranchDist, minClusterSize): 130 | 131 | nPoints = BranchDist.shape[0] 132 | PointAverageDistances = np.sum(BranchDist, axis=1) / (nPoints - 1) 133 | CoreSize = minClusterSize / 2 + 1 134 | 135 | if CoreSize < nPoints: 136 | EffCoreSize = CoreSize + np.sqrt(nPoints - CoreSize) 137 | order = np.argsort(PointAverageDistances) 138 | Core = order[np.arange(EffCoreSize)] 139 | else: 140 | Core = np.arange(nPoints) 141 | EffCoreSize = nPoints 142 | 143 | CoreAverageDistances = np.sum(BranchDist[Core, Core], axis=1) / (EffCoreSize - 1) 144 | 145 | return(np.mean(CoreAverageDistances)) 146 | 147 | 148 | def interpolate(data, index): 149 | 150 | i = np.round(index) 151 | n = len(data) 152 | if i < 0: return(data[0]) 153 | if i >= n: return(data[-1]) 154 | 155 | r = index - i 156 | 157 | return(data[i-1] * (1 - r) + data[i] * r) 158 | 159 | 160 | def cutreeHybrid(link, distM, 161 | cutHeight = None, minClusterSize = 20, deepSplit = 1, 162 | maxCoreScatter = None, minGap = None, 163 | maxAbsCoreScatter = None, minAbsGap = None, 164 | minSplitHeight = None, minAbsSplitHeight = None, 165 | externalBranchSplitFnc = None, minExternalSplit = None, 166 | externalSplitOptions = [], 167 | externalSplitFncNeedsDistance = None, 168 | assumeSimpleExternalSpecification = True, 169 | pamStage = True, pamRespectsDendro = True, 170 | useMedoids = False, 171 | maxPamDist = None, 172 | respectSmallClusters = True, 173 | verbose = 2, indent = 0): 174 | 175 | 176 | dendro_height = get_heights(link) 177 | dendro_merge = get_merges(link) 178 | 179 | if maxPamDist == None: 180 | maxPamDist = cutHeight 181 | 182 | nMerge = len(dendro_height) 183 | refQuantile = 0.05 184 | refMerge = np.round(nMerge * refQuantile) 185 | 186 | if refMerge < 1: refMerge = 1 187 | 188 | refHeight = dendro_height[int(refMerge) - 1] 189 | 190 | if cutHeight == None: 191 | cutHeight = 0.99 * (np.max(dendro_height) - refHeight) + refHeight 192 | print("..cutHeight not given, setting it to", cutHeight, 193 | " ===> 99% of the (truncated) height range in dendro.") 194 | else: 195 | if cutHeight > np.max(dendro_height): cutHeight = np.max(dendro_height) 196 | 197 | if maxPamDist == None: maxPamDist = cutHeight 198 | 199 | nMergeBelowCut = np.sum(dendro_height <= cutHeight) 200 | 201 | if nMergeBelowCut < minClusterSize: 202 | print("cutHeight set too low; no merges below the cut.") 203 | return(np.zeros(nMerge+1)) 204 | 205 | # fill in this section once understood better 206 | if externalBranchSplitFnc != None: 207 | raise NotImplementedError("externalBranchSplitFnc is not supported yet") 208 | nExternalSplits = len(externalBranchSplitFnc) 209 | if len(minExternalSplit) < 1: 210 | raise AttributeError("minExternalBranchSplit must be given.") 211 | if assumeSimpleExternalSpecification and nExternalSplits == 1: 212 | pass 213 | else: 214 | nExternalSplits = 0 215 | 216 | 217 | MxBranches = nMergeBelowCut 218 | branch_isBasic = np.repeat(True, MxBranches) 219 | branch_isTopBasic = np.repeat(True, MxBranches) 220 | branch_failSize = np.repeat(False, MxBranches) 221 | branch_rootHeight = np.repeat(np.nan, MxBranches) 222 | branch_size = np.repeat(2, MxBranches) 223 | branch_nMerge = np.repeat(1, MxBranches) 224 | branch_nSingletons = np.repeat(2, MxBranches) 225 | branch_nBasicClusters = np.repeat(0, MxBranches) 226 | branch_mergedInto = np.repeat(0, MxBranches) 227 | branch_attachHeight = np.repeat(np.nan, MxBranches) 228 | #branch_singletons = np.zeros(MxBranches) 229 | branch_singletons = [np.nan] * MxBranches 230 | 
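# The per-branch containers here (singletons, basic clusters, merging
# heights, singleton heights) start out as NaN placeholders and are grown
# lazily in chunkSize-sized blocks (see `extender` below) as merges are
# processed.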
#branch_basicClusters = pd.Series(np.zeros(MxBranches)) 231 | branch_basicClusters = [np.nan] * MxBranches 232 | #branch_mergingHeights = pd.Series(np.zeros(MxBranches)) 233 | branch_mergingHeights = [np.nan] * MxBranches 234 | #branch_singletonHeights = pd.Series(np.zeros(MxBranches)) 235 | branch_singletonHeights = [np.nan] * MxBranches 236 | 237 | 238 | nBranches = 0 239 | 240 | spyIndex = None 241 | if os.path.isfile(".dynamicTreeCutSpyFile"): 242 | spyIndex = pd.read_csv(".dynamicTreeCutSpyFile") 243 | print("Found 'spy file' with indices of objects to watch for.") 244 | spyIndex = spyIndex.iloc[:,1].values 245 | 246 | 247 | defMCS = np.array([0.64, 0.73, 0.82, 0.91, 0.95]) 248 | defMG = (1 - defMCS) * 3 / 4.0 249 | 250 | nSplitDefaults = len(defMCS) 251 | 252 | if type(deepSplit) == bool: deepSplit = int(deepSplit) * (nSplitDefaults - 2) 253 | deepSplit = deepSplit + 1 254 | 255 | if deepSplit < 1 or deepSplit > nSplitDefaults: 256 | raise IndexError("Parameter deepSplit (value", deepSplit, 257 | ") out of range: allowable range is 0 through", 258 | nSplitDefaults - 1) 259 | 260 | if maxCoreScatter == None: maxCoreScatter = interpolate(defMCS, deepSplit) 261 | if minGap == None: minGap = interpolate(defMG, deepSplit) 262 | 263 | if maxAbsCoreScatter == None: 264 | maxAbsCoreScatter = refHeight + maxCoreScatter * (cutHeight - refHeight) 265 | if minAbsGap == None: 266 | minAbsGap = minGap * (cutHeight - refHeight) 267 | 268 | if minSplitHeight == None: minSplitHeight = 0 269 | 270 | if minAbsSplitHeight == None: 271 | minAbsSplitHeight = refHeight + minSplitHeight * (cutHeight - refHeight) 272 | 273 | nPoints = nMerge + 1 274 | 275 | IndMergeToBranch = np.repeat(0, nMerge) 276 | 277 | onBranch = np.repeat(0, nPoints) 278 | 279 | RootBranch = 0 280 | 281 | mergeDiagnostics = dict(smI = np.repeat(np.nan, nMerge), smSize = np.repeat(np.nan, nMerge), 282 | smCrSc = np.repeat(np.nan, nMerge), smGap = np.repeat(np.nan, nMerge), 283 | lgI = np.repeat(np.nan, nMerge), lgSize = np.repeat(np.nan, nMerge), 284 | lgCrSc = np.repeat(np.nan, nMerge), lgGap = np.repeat(np.nan, nMerge), 285 | merged = np.repeat(np.nan, nMerge)) 286 | 287 | if nExternalSplits > 0: 288 | #externalMergeDiags = pd.DataFrame(np.repeat(np.nan, nMerge*nExternalSplits).reshape(nMerge, nExternalSplits)) 289 | #externalMergeDiags.columns = paste("externalBranchSplit", nExternalSplits, sep = ".") 290 | pass 291 | 292 | extender = np.zeros(chunkSize, dtype=int) 293 | 294 | for merge in range(nMerge): 295 | if dendro_height[merge] <= cutHeight: 296 | # are both merged objects singletons? 297 | if dendro_merge[merge, 0] < 0 and dendro_merge[merge, 1] < 0: 298 | nBranches = nBranches + 1 299 | branch_isBasic[nBranches - 1] = True 300 | branch_isTopBasic[nBranches - 1] = True 301 | branch_singletons[nBranches - 1] = np.append(-dendro_merge[merge,], extender) 302 | branch_basicClusters[nBranches - 1] = extender 303 | branch_mergingHeights[nBranches - 1] = np.append(np.repeat(dendro_height[merge], 2), extender) 304 | branch_singletonHeights[nBranches - 1] = np.append(np.repeat(dendro_height[merge], 2), extender) 305 | IndMergeToBranch[merge] = nBranches 306 | RootBranch = nBranches 307 | elif sign(dendro_merge[merge,0]) * sign(dendro_merge[merge,1]) < 0: 308 | clust = IndMergeToBranch[int(np.max(dendro_merge[merge,])) - 1] 309 | if clust == 0: raise ValueError("a previous merge has no associated cluster. 
Sorry!") 310 | gene = -np.min(dendro_merge[merge,]) 311 | ns = branch_nSingletons[clust - 1] + 1 312 | nm = branch_nMerge[clust - 1] + 1 313 | if branch_isBasic[clust - 1]: 314 | if ns > len(branch_singletons[clust - 1]): 315 | branch_singletons[clust - 1] = np.append(branch_singletons[clust - 1], extender) 316 | branch_singletonHeights[clust - 1] = np.append(branch_singletonHeights[clust - 1], extender) 317 | branch_singletons[clust - 1][ns - 1] = gene 318 | branch_singletonHeights[clust - 1][ns - 1] = dendro_height[merge] 319 | else: 320 | onBranch[int(gene) - 1] = clust 321 | 322 | if nm >= len(branch_mergingHeights[clust - 1]): 323 | branch_mergingHeights[clust - 1] = np.append(branch_mergingHeights[clust - 1], extender) 324 | branch_mergingHeights[clust - 1][nm - 1] = dendro_height[merge] 325 | branch_size[clust - 1] = branch_size[clust - 1] + 1 326 | branch_nMerge[clust - 1] = nm 327 | branch_nSingletons[clust - 1] = ns 328 | IndMergeToBranch[merge] = clust 329 | RootBranch = clust 330 | else: 331 | # attempt to merge two branches: 332 | clusts = IndMergeToBranch[dendro_merge[merge,] - 1] 333 | sizes = branch_size[clusts - 1] 334 | # Note: for 2 elements, rank and order are the same. 335 | rnk = rankdata(sizes, method = "ordinal") 336 | small = clusts[rnk[0] - 1] 337 | large = clusts[rnk[1] - 1] 338 | sizes = sizes[rnk - 1] 339 | branch1 = np.nan if np.any(np.isnan(branch_singletons[large - 1])) else branch_singletons[large - 1][np.arange(sizes[1])] 340 | branch2 = np.nan if np.any(np.isnan(branch_singletons[small - 1])) else branch_singletons[small - 1][np.arange(sizes[0])] 341 | spyMatch = False 342 | if spyIndex != None: 343 | n1 = len(set(branch1) & set(spyIndex)) 344 | if n1 / len(branch1) > 0.99 and n1 / len(spyIndex) > 0.99: 345 | print("Found spy match for branch 1 on merge", merge) 346 | spyMatch = True 347 | n2 = len(set(branch2) & set(spyIndex)) 348 | if n2 / len(branch1) > 0.99 and n2 / len(spyIndex) > 0.99: 349 | print("Found spy match for branch 2 on merge", merge) 350 | spyMatch = True 351 | 352 | if branch_isBasic[small - 1]: 353 | coresize = CoreSize(branch_nSingletons[small - 1], minClusterSize) 354 | Core = np.array(branch_singletons[small - 1][np.arange(int(coresize))], dtype=int) 355 | # SmAveDist = mean(apply(distM[Core, Core], 2, sum)/(coresize-1)) 356 | SmAveDist = np.mean(np.sum(dist_multi_index(Core - 1, distM), axis=1) / (coresize - 1)) 357 | else: 358 | SmAveDist = 0 359 | 360 | if branch_isBasic[large - 1]: 361 | coresize = CoreSize(branch_nSingletons[large - 1], minClusterSize) 362 | Core = np.array(branch_singletons[large - 1][np.arange(int(coresize))], dtype=int) 363 | LgAveDist = np.mean(np.sum(dist_multi_index(Core - 1, distM), axis=1) / (coresize -1 )) 364 | else: 365 | LgAveDist = 0 366 | 367 | for key in mergeDiagnostics: 368 | if key == "smI": 369 | mergeDiagnostics[key][merge] = small 370 | elif key == "smSize": 371 | mergeDiagnostics[key][merge] = branch_size[small - 1] 372 | elif key == "smCrSc": 373 | mergeDiagnostics[key][merge] = SmAveDist 374 | elif key == "smGap": 375 | mergeDiagnostics[key][merge] = dendro_height[merge] - SmAveDist 376 | elif key == "lgI": 377 | mergeDiagnostics[key][merge] = large 378 | elif key == "lgSize": 379 | mergeDiagnostics[key][merge] = branch_size[large - 1] 380 | elif key == "lgCrSc": 381 | mergeDiagnostics[key][merge] = LgAveDist 382 | elif key == "lgGap": 383 | mergeDiagnostics[key][merge] = dendro_height[merge] - LgAveDist 384 | elif key == "merged": 385 | mergeDiagnostics[key][merge] = np.nan 386 | 387 | 
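# Decide whether one branch should be absorbed into the other. Quantitatively,
# "too small" means size < minClusterSize, "too diffuse" means core scatter >
# maxAbsCoreScatter, and "too shallow" means the gap (merge height minus core
# scatter) is below minAbsGap, or the merge height itself is below
# minAbsSplitHeight.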
388 | # We first check each cluster separately for being too small, too diffuse, or too shallow: 389 | SmallerScores = [branch_isBasic[small - 1], 390 | branch_size[small - 1] < minClusterSize, 391 | SmAveDist > maxAbsCoreScatter, 392 | dendro_height[merge] - SmAveDist < minAbsGap, 393 | dendro_height[merge] < minAbsSplitHeight] 394 | 395 | if SmallerScores[0] * np.sum(SmallerScores[1:]) > 0: 396 | DoMerge = True 397 | SmallerFailSize = ~np.logical_or(SmallerScores[2], SmallerScores[3]) # Smaller fails only due to size 398 | else: 399 | LargerScores = [branch_isBasic[large - 1], 400 | branch_size[large - 1] < minClusterSize, 401 | LgAveDist > maxAbsCoreScatter, 402 | dendro_height[merge] - LgAveDist < minAbsGap, 403 | dendro_height[merge] < minAbsSplitHeight] 404 | if LargerScores[0] * np.sum(LargerScores[1:]) > 0: 405 | # Actually: the large one is the one to be merged 406 | DoMerge = True 407 | SmallerFailSize = ~np.logical_or(LargerScores[2], LargerScores[3]) # cluster fails only due to size 408 | x = small 409 | small = large 410 | large = x 411 | sizes = sizes[::-1] 412 | else: 413 | DoMerge = False # None of the two satisfies merging criteria 414 | 415 | if DoMerge: 416 | mergeDiagnostics["merged"][merge] = 1 417 | 418 | if ~DoMerge and nExternalSplits > 0 and branch_isBasic[small - 1] and branch_isBasic[large - 1]: 419 | if verbose > 4: print("Entering external split code on merge ", merge) 420 | branch1 = branch_singletons[large - 1][np.arange(sizes[1])] 421 | branch2 = branch_singletons[small - 1][np.arange(sizes[0])] 422 | 423 | if verbose > 4 or spyMatch: print(" ..branch lengths: ", sizes[0], ", ", sizes[1]) 424 | #if (any(is.na(branch1)) || any(branch1==0)) browser(); 425 | #if (any(is.na(branch2)) || any(branch2==0)) browser(); 426 | 427 | 428 | ##### fix after External Splits is understood better 429 | es = 0 430 | while es < nExternalSplits and ~DoMerge: 431 | es = es + 1 432 | args = externalSplitOptions[es - 1] 433 | args = [args, list(branch1 = branch1, branch2 = branch2)] 434 | #extSplit = do.call(externalBranchSplitFnc[es], args) 435 | if spyMatch: 436 | print(" .. external criterion ", es, ": ", extSplit) 437 | DoMerge = extSplit < minExternalSplit[es - 1] 438 | externalMergeDiags[merge, es - 1] = extSplit 439 | if DoMerge: 440 | mergeDiagnostics_merged[merge] = 2 441 | else: 442 | mergeDiagnostics_merged[merge] = 0 443 | 444 | if DoMerge: 445 | # merge the small into the large cluster and close it. 
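# Record why the small branch was closed (failSize marks a failure on size
# alone), which branch absorbed it, and the height at which it attached; its
# singletons (or basic clusters) are then folded into the large branch below.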
446 | branch_failSize[small - 1] = SmallerFailSize 447 | branch_mergedInto[small - 1] = large 448 | branch_attachHeight[small - 1] = dendro_height[merge] 449 | branch_isTopBasic[small - 1] = False 450 | nss = branch_nSingletons[small - 1] 451 | nsl = branch_nSingletons[large - 1] 452 | ns = nss + nsl 453 | 454 | if branch_isBasic[large - 1]: 455 | nExt = np.ceil( (ns - len(branch_singletons[large - 1])) / chunkSize ) 456 | 457 | if nExt > 0: 458 | if verbose > 5: 459 | print("Extending singletons for branch", large, "by", nExt, " extenders.") 460 | 461 | branch_singletons[large - 1] = np.append(branch_singletons[large - 1], np.repeat(extender, nExt)) 462 | branch_singletonHeights[large - 1] = np.append(branch_singletonHeights[large - 1], np.repeat(extender, nExt)) 463 | 464 | branch_singletons[large - 1][np.arange(nsl,ns)] = branch_singletons[small - 1][np.arange(nss)] 465 | branch_singletonHeights[large - 1][np.arange(nsl,ns)] = branch_singletonHeights[small - 1][np.arange(nss)] 466 | branch_nSingletons[large - 1] = ns 467 | 468 | else: 469 | if ~branch_isBasic[small - 1]: 470 | raise ValueError("merging two composite clusters. Sorry!") 471 | 472 | onBranch[ branch_singletons[small - 1][branch_singletons[small - 1] != 0] - 1 ] = large 473 | 474 | nm = branch_nMerge[large - 1] + 1 475 | 476 | if nm > len(branch_mergingHeights[large - 1]): 477 | branch_mergingHeights[large - 1] = np.append(branch_mergingHeights[large - 1], extender) 478 | 479 | branch_mergingHeights[large - 1][nm - 1] = dendro_height[merge] 480 | branch_nMerge[large - 1] = nm 481 | branch_size[large - 1] = branch_size[small - 1] + branch_size[large - 1] 482 | IndMergeToBranch[merge] = large 483 | RootBranch = large 484 | else: 485 | # start or continue a composite cluster. 486 | 487 | # If large is basic and small is not basic, switch them. 488 | if branch_isBasic[large - 1] and ~branch_isBasic[small - 1]: 489 | x = large 490 | large = small 491 | small = x 492 | sizes = sizes[::-1] 493 | 494 | # Note: if pamRespectsDendro, need to start a new composite cluster every time two branches merge, 495 | # otherwise will not have the necessary information. 496 | # Otherwise, if the large cluster is already composite, I can simply merge both clusters into 497 | # one of the non-composite clusters. 
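# Precedence note: `and` binds tighter than `or`, so the condition below
# reads `branch_isBasic[large - 1] or (pamStage and pamRespectsDendro)`.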
498 | 499 | if branch_isBasic[large - 1] or pamStage and pamRespectsDendro: 500 | nBranches = nBranches + 1 501 | branch_attachHeight[[large - 1, small - 1]] = dendro_height[merge] 502 | branch_mergedInto[[large - 1, small - 1]] = nBranches 503 | if branch_isBasic[small - 1]: 504 | addBasicClusters = small # add basic clusters 505 | else: 506 | addBasicClusters = branch_basicClusters[small - 1] 507 | if branch_isBasic[large - 1]: 508 | addBasicClusters = np.append(addBasicClusters, large) 509 | else: 510 | addBasicClusters = np.append(addBasicClusters, branch_basicClusters[large - 1]) 511 | # print(paste(" Starting a composite cluster with number", nBranches)); 512 | branch_isBasic[nBranches - 1] = False 513 | branch_isTopBasic[nBranches - 1] = False 514 | branch_basicClusters[nBranches - 1] = addBasicClusters 515 | branch_mergingHeights[nBranches - 1] = np.append(np.repeat(dendro_height[merge], 2), extender) 516 | branch_nMerge[nBranches - 1] = 2 517 | branch_size[nBranches - 1] = np.sum(sizes) 518 | branch_nBasicClusters[nBranches - 1] = len(addBasicClusters) 519 | IndMergeToBranch[merge] = nBranches 520 | RootBranch = nBranches 521 | else: 522 | # Add small branch to the large one 523 | addBasicClusters = small if branch_isBasic[small - 1] else branch_basicClusters[small - 1] 524 | nbl = branch_nBasicClusters[large - 1] 525 | #small might be an int 526 | try: 527 | nb = branch_nBasicClusters[large - 1] + len(addBasicClusters) 528 | except TypeError: 529 | nb = branch_nBasicClusters[large - 1] + 1 530 | 531 | if nb > len(branch_basicClusters[large - 1]): 532 | nExt = np.ceil( ( nb - len(branch_basicClusters[large - 1])) / chunkSize) 533 | branch_basicClusters[large - 1] = np.append(branch_basicClusters[large - 1], np.repeat(extender, nExt)) 534 | 535 | branch_basicClusters[large - 1][np.arange(nbl,nb)] = addBasicClusters 536 | branch_nBasicClusters[large - 1] = nb 537 | branch_size[large - 1] = branch_size[large - 1] + branch_size[small - 1] 538 | nm = branch_nMerge[large - 1] + 1 539 | 540 | if nm > len(branch_mergingHeights[large - 1]): 541 | branch_mergingHeights[large - 1] = np.append(branch_mergingHeights[large - 1], extender) 542 | 543 | branch_mergingHeights[large - 1][nm - 1] = dendro_height[merge] 544 | branch_nMerge[large - 1] = nm 545 | branch_attachHeight[small - 1] = dendro_height[merge] 546 | branch_mergedInto[small - 1] = large 547 | IndMergeToBranch[merge] = large 548 | RootBranch = large 549 | 550 | if verbose > 2: print("..Going through detected branches and marking clusters..") 551 | 552 | isCluster = np.repeat(False, nBranches) 553 | SmallLabels = np.repeat(0, nPoints) 554 | 555 | for clust in range(nBranches): 556 | 557 | if np.isnan(branch_attachHeight[clust]): branch_attachHeight[clust] = cutHeight 558 | if branch_isTopBasic[clust]: 559 | coresize = CoreSize(branch_nSingletons[clust], minClusterSize) 560 | Core = branch_singletons[clust][np.arange(coresize)] 561 | CoreScatter = np.mean(np.sum(dist_multi_index(Core - 1, distM), axis=1) / (coresize - 1)) 562 | isCluster[clust] = np.logical_and(np.logical_and(branch_isTopBasic[clust], 563 | branch_size[clust] >= minClusterSize), 564 | np.logical_and(CoreScatter < maxAbsCoreScatter, 565 | branch_attachHeight[clust] - CoreScatter > minAbsGap)) 566 | else: 567 | CoreScatter = 0 568 | 569 | if branch_failSize[clust]: SmallLabels[branch_singletons[clust][branch_singletons[clust] != 0] - 1] = clust + 1 570 | 571 | if not respectSmallClusters: SmallLabels = np.repeat(0, nPoints) 572 | 573 | if verbose > 2: print(spaces, 
"..Assigning Tree Cut stage labels..") 574 | 575 | Colors = np.repeat(0, nPoints) 576 | coreLabels = np.repeat(0, nPoints) 577 | clusterBranches = np.arange(nBranches)[isCluster] 578 | branchLabels = np.repeat(0, nBranches) 579 | color = 0 580 | 581 | for clust in clusterBranches: 582 | color = color + 1 583 | Colors[branch_singletons[clust][branch_singletons[clust] != 0] - 1] = color 584 | SmallLabels[branch_singletons[clust][branch_singletons[clust] != 0] - 1] = 0 585 | coresize = CoreSize(branch_nSingletons[clust], minClusterSize) 586 | Core = branch_singletons[clust][np.arange(coresize)] 587 | coreLabels[Core - 1] = color 588 | branchLabels[clust] = color 589 | 590 | Labeled = np.arange(nPoints)[Colors != 0] 591 | Unlabeled = np.arange(nPoints)[Colors == 0] 592 | nUnlabeled = len(Unlabeled) 593 | UnlabeledExist = nUnlabeled > 0 594 | 595 | if len(Labeled) > 0: 596 | LabelFac = factor(Colors[Labeled]) 597 | nProperLabels = nlevels(LabelFac) 598 | else: 599 | nProperLabels = 0 600 | 601 | 602 | if pamStage and UnlabeledExist and nProperLabels > 0: 603 | if verbose > 2: print(spaces, "..Assigning PAM stage labels..") 604 | nPAMed = 0 605 | # Assign some of the grey genes to the nearest module. Define nearest as the distance to the medoid, 606 | # that is the point in the cluster that has the lowest average distance to all other points in the 607 | # cluster. First get the medoids. 608 | if useMedoids: 609 | Medoids = np.repeat(0, nProperLabels) 610 | ClusterRadii = np.repeat(0.0, nProperLabels) 611 | for cluster in range(1, nProperLabels + 1): 612 | InCluster = np.arange(1,nPoints+1)[Colors == cluster] 613 | DistInCluster = dist_multi_index(InCluster - 1, distM) 614 | #DistInCluster = distM[InCluster, InCluster] 615 | DistSums = np.sum(DistInCluster, axis=1) 616 | Medoids[cluster - 1] = InCluster[np.argmin(DistSums)] 617 | ClusterRadii[cluster - 1] = np.max(DistInCluster[:, np.argmin(DistSums)]) 618 | # If small clusters are to be respected, assign those first based on medoid-medoid distances. 
619 | if respectSmallClusters: 620 | FSmallLabels = factor(SmallLabels) 621 | SmallLabLevs = levels(FSmallLabels) 622 | nSmallClusters = nlevels(FSmallLabels) - (SmallLabLevs[0] == 0) 623 | if nSmallClusters > 0 : 624 | for sclust in SmallLabLevs[SmallLabLevs != 0]: 625 | InCluster = np.arange(nPoints)[SmallLabels == sclust] 626 | if pamRespectsDendro: 627 | onBr = np.unique(onBranch[InCluster]) 628 | if len(onBr) > 1: 629 | raise ValueError("Internal error: objects in a small cluster are marked to belong", 630 | "\nto several large branches:") 631 | if onBr > 0: 632 | basicOnBranch = branch_basicClusters[onBr[0] - 1] 633 | labelsOnBranch = branchLabels[basicOnBranch - 1] 634 | else: 635 | labelsOnBranch = None 636 | else: 637 | labelsOnBranch = np.arange(1, nProperLabels + 1) 638 | # printFlush(paste("SmallCluster", sclust, "has", length(InCluster), "elements.")); 639 | DistInCluster = dist_multi_index(InCluster, distM) 640 | #DistInCluster = distM[InCluster, InCluster] 641 | if len(labelsOnBranch) > 0: 642 | if len(InCluster) > 1: 643 | DistSums = df_apply.apply(np.sum, DistInCluster, 1) 644 | smed = InCluster[np.argmin(DistSums)] 645 | DistToMeds = get_rows(Medoids[labelsOnBranch - 1][Medoids[labelsOnBranch - 1] != 0] - 1, distM)[:, smed] 646 | closest = np.argmin(DistToMeds) 647 | DistToClosest = DistToMeds[closest] 648 | closestLabel = labelsOnBranch[closest] 649 | if DistToClosest < ClusterRadii[closestLabel - 1] or DistToClosest < maxPamDist: 650 | Colors[InCluster] = closestLabel 651 | nPAMed = nPAMed + len(InCluster) 652 | else: Colors[InCluster] = -1 # This prevents individual points from being assigned later 653 | else: 654 | Colors[InCluster] = -1 655 | 656 | # Assign leftover unlabeled objects to clusters with nearest medoids 657 | Unlabeled = np.arange(nPoints)[Colors == 0] 658 | if len(Unlabeled > 0): 659 | for obj in Unlabeled: 660 | if pamRespectsDendro: 661 | onBr = onBranch[obj] 662 | if onBr > 0: 663 | basicOnBranch = branch_basicClusters[onBr - 1] 664 | labelsOnBranch = branchLabels[basicOnBranch - 1] 665 | else: 666 | labelsOnBranch = None 667 | else: 668 | labelsOnBranch = np.arange(nProperLabels) 669 | if labelsOnBranch != None: 670 | UnassdToMedoidDist = get_rows(Medoids[labelsOnBranch - 1] - 1, distM)[:,obj] 671 | #UnassdToMedoidDist = distM[Medoids[labelsOnBranch], obj] 672 | nearest= np.argmin(UnassdToMedoidDist) 673 | NearestCenterDist = UnassdToMedoidDist[nearest] 674 | nearestMed = labelsOnBranch[nearest] 675 | if NearestCenterDist < ClusterRadii[nearestMed - 1] or NearestCenterDist < maxPamDist: 676 | Colors[obj] = nearestMed 677 | nPAMed = nPAMed + 1 678 | 679 | UnlabeledExist = np.sum(Colors == 0) > 0 680 | else: # Instead of medoids, use average distances 681 | # This is the default method, so I will try to tune it for speed a bit. 682 | ClusterDiam = np.repeat(0, nProperLabels) 683 | for cluster in range(nProperLabels): 684 | InCluster = np.arange(nPoints)[Colors == cluster] 685 | nInCluster = len(InCluster) 686 | DistInCluster = dist_multi_index(InCluster, distM) 687 | #DistInCluster = distM[InCluster, InCluster] 688 | if nInCluster > 1: 689 | AveDistInClust = np.sum(DistInCluster, axis=1) / (nInCluster - 1) 690 | ClusterDiam[cluster] = np.max(AveDistInClust) 691 | else: 692 | ClusterDiam[cluster] = 0 693 | 694 | # If small clusters are respected, assign them first based on average cluster-cluster distances. 
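# ColorsX freezes the pre-PAM labels so that candidate clusters are always
# drawn from the original tree-cut assignment, while Colors itself is updated
# in place as small clusters and leftover points are (re)assigned.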
695 | ColorsX = Colors.copy() 696 | if respectSmallClusters: 697 | FSmallLabels = factor(SmallLabels) #### think about 698 | SmallLabLevs = levels(FSmallLabels) ##### think about 699 | nSmallClusters = nlevels(FSmallLabels) - (SmallLabLevs[0] == 0) 700 | if nSmallClusters > 0: 701 | if pamRespectsDendro: 702 | for sclust in SmallLabLevs[SmallLabLevs != 0]: 703 | InCluster = np.arange(nPoints)[SmallLabels == sclust] 704 | onBr = np.unique(onBranch[InCluster]) 705 | 706 | if len(onBr) > 1: 707 | raise ValueError("objects in a small cluster are marked to belong", 708 | "\nto several large branches:") 709 | if onBr > 0: 710 | basicOnBranch = branch_basicClusters[onBr[0] - 1] 711 | labelsOnBranch = branchLabels[basicOnBranch - 1] 712 | useObjects = np.in1d(ColorsX, np.unique(labelsOnBranch)) 713 | DistSClustClust = get_rows(InCluster, distM)[:,useObjects] 714 | #DistSClustClust = distM[InCluster, useObjects] 715 | MeanDist = np.mean(DistSClustClust, axis=0) 716 | useColorsFac = factor(ColorsX[useObjects]) ### think about 717 | MeanMeanDist = tapply(MeanDist, useColorsFac, np.mean) ## think about 718 | nearest = np.argmin(MeanMeanDist) 719 | NearestDist = MeanMeanDist[nearest] 720 | nearestLabel = levels(useColorsFac)[nearest] ## think about 721 | if NearestDist < ClusterDiam[nearestLabel - 1] or NearestDist < maxPamDist: 722 | Colors[InCluster] = nearestLabel 723 | nPAMed = nPAMed + len(InCluster) 724 | else: 725 | Colors[InCluster] = -1 # This prevents individual points from being assigned later 726 | 727 | else: 728 | labelsOnBranch = np.arange(nProperLabels) 729 | useObjects = np.arange(nPoints)[ColorsX != 0] 730 | for sclust in SmallLabLevs[SmallLabLevs != 0]: 731 | InCluster = np.arange(nPoints)[SmallLabels == sclust] 732 | DistSClustClust = get_rows(InCluster, distM)[:,useObjects] 733 | #DistSClustClust = distM[InCluster, useObjects] 734 | MeanDist = np.mean(DistSClustClust, axis=0) 735 | useColorsFac = factor(ColorsX[useObjects]) ### think about 736 | MeanMeanDist = tapply(MeanDist, useColorsFac, np.mean) ### think about 737 | nearest = np.argmin(MeanMeanDist) 738 | NearestDist = MeanMeanDist[nearest] 739 | nearestLabel = levels(useColorsFac)[nearest] ## think about 740 | if NearestDist < ClusterDiam[nearestLabel - 1] or NearestDist < maxPamDist: 741 | Colors[InCluster] = nearestLabel 742 | nPAMed = nPAMed + len(InCluster) 743 | else: 744 | Colors[InCluster] = -1 # This prevents individual points from being assigned later 745 | 746 | # Assign leftover unlabeled objects to clusters with nearest medoids 747 | Unlabeled = np.arange(nPoints)[Colors == 0] 748 | #ColorsX = Colors; 749 | if len(Unlabeled) > 0: 750 | if pamRespectsDendro: 751 | unlabOnBranch = Unlabeled[onBranch[Unlabeled] > 0] 752 | for obj in unlabOnBranch: 753 | onBr = onBranch[obj] 754 | basicOnBranch = branch_basicClusters[onBr - 1] 755 | labelsOnBranch = branchLabels[basicOnBranch - 1] 756 | useObjects = np.in1d(ColorsX, np.unique(labelsOnBranch)) 757 | useColorsFac = factor(ColorsX[useObjects]) ### think about 758 | #UnassdToClustDist = tapply(distM[useObjects, obj], useColorsFac, mean) ### think about 759 | UnassdToClustDist = tapply(get_rows(useObjects, distM)[:,obj], useColorsFac, np.mean) ### think about 760 | nearest = np.argmin(UnassdToClustDist) 761 | NearestClusterDist = UnassdToClustDist[nearest] 762 | nearestLabel = levels(useColorsFac)[nearest] ### think about 763 | if NearestClusterDist < ClusterDiam[nearestLabel - 1] or NearestClusterDist < maxPamDist: 764 | Colors[obj] = nearestLabel 765 | nPAMed = nPAMed + 
1 766 | 767 | else: 768 | useObjects = np.arange(nPoints)[ColorsX != 0] 769 | useColorsFac = factor(ColorsX[useObjects]) ## think about 770 | nUseColors = nlevels(useColorsFac) ### think about 771 | UnassdToClustDist = tapply_df(get_rows(useObjects, distM)[:,Unlabeled], useColorsFac, np.mean, 1) 772 | #UnassdToClustDist = df_apply.apply(distM[useObjects, Unlabeled], 1, tapply, useColorsFac, mean) ### think about 773 | # Fix dimensions for the case when there's only one cluster 774 | #dim(UnassdToClustDist) = np.append(nUseColors, len(Unlabeled)) ### think about 775 | nearest = df_apply.apply(np.argmin, UnassdToClustDist, 1) 776 | nearestDist = df_apply.apply(np.min, UnassdToClustDist, 1) 777 | nearestLabel = levels(useColorsFac)[nearest - 1] ### think about 778 | assign = np.logical_or(nearestDist < ClusterDiam[nearestLabel - 1], nearestDist < maxPamDist) 779 | Colors[Unlabeled[assign]] = nearestLabel[assign] 780 | nPAMed = nPAMed + np.sum(assign) 781 | 782 | if verbose > 2: print("....assigned", nPAMed, "objects to existing clusters.") 783 | 784 | 785 | # Relabel labels such that 1 corresponds to the largest cluster etc. 786 | Colors[Colors < 0] = 0 787 | UnlabeledExist = np.sum(Colors == 0) > 0 788 | NumLabs = Colors + 1 789 | Sizes = table(NumLabs) ### think about 790 | if UnlabeledExist: 791 | if len(Sizes) > 1: 792 | SizeRank = np.append(1, rankdata(-Sizes.iloc[1:len(Sizes)], method="ordinal")+1) 793 | else: 794 | SizeRank = np.array([1]) 795 | OrdNumLabs = SizeRank[NumLabs - 1] 796 | else: 797 | SizeRank = rankdata(-Sizes.iloc[np.arange(len(Sizes))], method="ordinal") 798 | OrdNumLabs = SizeRank[NumLabs - 2] 799 | ordCoreLabels = OrdNumLabs - UnlabeledExist 800 | ordCoreLabels[coreLabels == 0] = 0 801 | 802 | if verbose > 0: print( "..done.") 803 | 804 | results = dict(labels = OrdNumLabs-UnlabeledExist, 805 | cores = ordCoreLabels, 806 | smallLabels = SmallLabels, 807 | onBranch = onBranch, 808 | mergeDiagnostics = mergeDiagnostics if nExternalSplits==0 else pd.DataFrame({'x':mergeDiagnostics, 'y':externalMergeDiags}), 809 | mergeCriteria = dict(maxCoreScatter = maxCoreScatter, minGap = minGap, 810 | maxAbsCoreScatter = maxAbsCoreScatter, minAbsGap = minAbsGap, 811 | minExternalSplit = minExternalSplit), 812 | branches = dict(nBranches = nBranches, # Branches = Branches, 813 | IndMergeToBranch = IndMergeToBranch, 814 | RootBranch = RootBranch, isCluster = isCluster, 815 | nPoints = nMerge+1)) 816 | 817 | return(results) 818 | 819 | 820 | 821 | 822 | 823 | 824 | 825 | 826 | 827 | 828 | 829 | 830 | 831 | 832 | 833 | 834 | 835 | 836 | 837 | 838 | -------------------------------------------------------------------------------- /dynamicTreeCut/tests/test_dynamicTreeCut.py: -------------------------------------------------------------------------------- 1 | from scipy.cluster.hierarchy import linkage 2 | from scipy.spatial.distance import pdist 3 | import numpy as np 4 | 5 | 6 | def test_cuttreeHybrid(): 7 | from dynamicTreeCut import cutreeHybrid 8 | d = np.transpose(np.arange(1, 10001).reshape(100, 100)) 9 | distances = pdist(d, "euclidean") 10 | link = linkage(distances, "average") 11 | test = cutreeHybrid(link, distances) 12 | 13 | true = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 14 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 15 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 16 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 17 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 18 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 19 | 1, 1, 1, 1] 20 | 21 | assert 
(test['labels'] == true).all()
22 |
23 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
 1 | name: dynamicTreeCut-env
 2 | channels:
 3 |   - r
 4 |   - defaults
 5 | dependencies:
 6 |   - mkl
 7 |   - numpy
 8 |   - openssl
 9 |   - pandas
10 |   - pip
11 |   - python
12 |   - readline
13 |   - scipy
14 |   - setuptools
15 |   - sqlite
16 |   - tk
17 |   - wheel
18 |   - xz
19 |   - zlib
20 |
21 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | scipy
2 | numpy
3 | pandas
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.rst
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 |
 3 |
 4 | with open("README.rst", "r") as fh:
 5 |     long_description = fh.read()
 6 |
 7 | setup(
 8 |     name="dynamicTreeCut",
 9 |     version="0.1.1",
10 |     packages=["dynamicTreeCut"],
11 |     scripts=["dynamicTreeCut/df_apply.py", "dynamicTreeCut/R_func.py"],
12 |     author="Kyle S. Smith",
13 |     license="GPL-3 License",
14 |     description='Dynamic Tree Cut',
15 |     install_requires=['numpy', 'scipy'],
16 |     long_description=long_description,
17 |     url="https://github.com/kylessmith/dynamicTreeCut",
18 |     author_email="kyle.smith@stjude.org",
19 | )
--------------------------------------------------------------------------------