├── .gitattributes
├── .gitfat
├── .gitignore
├── README.md
├── diffusionMaps.py
└── test_DiffusionMap.py


/.gitattributes:
--------------------------------------------------------------------------------
 1 | *.mat filter=fat -crlf
 2 | *.fig filter=fat -crlf
 3 | *.png filter=fat -crlf
 4 | *.gif filter=fat -crlf
 5 | *.jpg filter=fat -crlf
 6 | *.pdf filter=fat -crlf
 7 | *.eps filter=fat -crlf
 8 | *.mexa64 filter=fat -crlf
 9 | *.zip filter=fat -crlf
10 | *.tar filter=fat -crlf
11 | *.jar filter=fat -crlf
12 | *.old filter=fat -crlf
13 | *.backup filter=fat -crlf
14 | *.amt_tree filter=fat -crlf
15 | *.avi filter=fat -crlf
16 | *.mp4 filter=fat -crlf
17 | *.xoj filter=fat -crlf
18 | *.csv filter=fat -crlf
19 | *.dll filter=fat -crlf
20 | *.shelf filter=fat -crlf
21 | *.xls filter=fat -crlf
22 | *.xlsx filter=fat -crlf
23 | 


--------------------------------------------------------------------------------
/.gitfat:
--------------------------------------------------------------------------------
1 | [rsync]
2 | remote = your.remote-host.org:/share/fat-store
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | #java class
 2 | *.class
 3 | 
 4 | #matlab autosaves
 5 | *~
 6 | 
 7 | #svn remnants
 8 | .svn
 9 | 
10 | # some latex stuff
11 | *.aux
12 | *.lof
13 | *.log
14 | *.toc
15 | *.fls
16 | *.out
17 | #bib files
18 | *.bbl
19 | *.bcf
20 | *.blg
21 | *-blx.aux
22 | *-blx.bib
23 | *.run.xml
24 | 
25 | #python compiled files
26 | *.pyc
27 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # pyDiffusionMaps
 2 | Diffusion maps for Python.
 3 | 
 4 | See:
 5 | - [Diffusion maps for high-dimensional single-cell analysis of differentiation data.](http://bioinformatics.oxfordjournals.org/content/31/18/2989)
 6 | - [Geometric diffusions as a tool for harmonic analysis and structure definition of data: Diffusion maps](http://www.pnas.org/content/102/21/7426.long)
 7 | 
 8 | ## TODO:
 9 | - sparse matrix usage in density_normalize is inefficient
10 | 
11 | 


--------------------------------------------------------------------------------
/diffusionMaps.py:
--------------------------------------------------------------------------------
  1 | from sklearn.base import BaseEstimator
  2 | from sklearn.neighbors import NearestNeighbors
  3 | import numpy as np
  4 | from scipy.sparse import csr_matrix, issparse, diags
  5 | from scipy.sparse.linalg import eigsh
  6 | from scipy.linalg import eigh
  7 | import matplotlib.pyplot as plt
  8 | import scipy.spatial.distance
  9 | import logging
 10 | logging.basicConfig(level=logging.INFO)  #for DEVEL, just use full logging
 11 | 
 12 | 
 13 | """
 14 |     [1] Haghverdi, L., Buettner, F. & Theis, F. J.
 15 |         Diffusion maps for high-dimensional single-cell analysis of differentiation data.
 16 |         Bioinformatics 31, 2989–2998 (2015).
 17 |     [2] Laleh Haghverdi, Maren Büttner, F. Alexander Wolf, Florian Buettner, Fabian J. Theis
 18 |         Diffusion pseudotime robustly reconstructs lineage branching
 19 | """
 20 | 
 21 | def _check_Z_for_division(Z,eps):
 22 |     "Z might be zero sometimes, we add a small constant epsilon to it. make sure this doesnt change to much"
 23 | 
 24 |     # check that all nonzeros of z are significantly larger than eps
 25 |     ixNonZero = Z!=0
 26 |     assert np.all(np.abs(Z[ixNonZero]) > eps*10), 'values to small. might introduce some error since close to zero division'
 27 | 
 28 | 
 29 | def _density_normalize(kernelMat, symmetrize=False):
 30 |     """
 31 |     1. density normalization: Eq (4-5) of [1]  or Eq 3,4 in [2]
 32 |         W_xy = K(x,y)/Z(x)Z(y)
 33 |         thats the Coifman anisotropy thingy, trying to mitigate the effect of density
 34 |         (alpha=1 in Coifman)
 35 | 
 36 |     2. strange row normalization Eq(5,6) in [2]  or Eq(5,6) in [1]
 37 |         this is to get the "normalized graph laplacian" as in Coifman.
 38 | 
 39 |         essentially this makes it a transition matrix. This is asymmetric!
 40 | 
 41 |     3. optional: symmetrize the transition matrix again! (see [2] Suppl.Eq 7)
 42 | 
 43 |     be very careful here if K is a sparse matrix, which behaves differently from usual np.ndarray
 44 |     in terms of operators *, /
 45 | 
 46 |     :param symmetrize: if True, we return a symmetrized transition matrix
 47 |          otherwise the classic non-symmetric transition matrix
 48 |     """
 49 |     eps = 1e-100
 50 | 
 51 |     # the method only works on symmetric matrices (relies that Z is the same along rows and cols)
 52 |     atol_symmetric = 1e-10  #TODO loose tolerance
 53 |     if issparse(kernelMat):
 54 |         np.testing.assert_allclose((kernelMat-kernelMat.T).A, 0, atol=atol_symmetric)
 55 |     else:
 56 |         np.testing.assert_allclose(kernelMat-kernelMat.T, 0, atol=atol_symmetric)
 57 | 
 58 |     "calculate:  P_xy / Z(x)Z(y)"
 59 |     "rescale each column by Z and also each row by Z"
 60 |     "easily done by just multipling with a diagonal matrix from the left (scaling rows) and right (rescaling columsn)"
 61 |     # note that row and column sum are the same as the matrix is symmetric!!
 62 |     if issparse(kernelMat):
 63 |         Z = np.array(kernelMat.sum(0)).flatten()  # a bit ugly, Z is this strange type(matrix), which one cannot cast into a 1d array, hence the detour to np.array
 64 |         _check_Z_for_division(Z, eps)
 65 |         scalingMat = diags(1.0 / (Z + eps), offsets=0)  # multiplying by this (one the right) is equivalent to rescaling the columns
 66 |         P_tilde = scalingMat * kernelMat * scalingMat  # this is matrix multiply!
 67 |         # assert np.testing.assert_allclose(P_tilde.toarray(), P_tilde.T.toarray(), err_msg='Ptilde should be symmetric')
 68 | 
 69 |     else:
 70 |         Z = kernelMat.sum(0).flatten()  # make sure it doesnt have two dimensions, needed for the broadcasting below
 71 |         _check_Z_for_division(Z, eps)
 72 |         invZ = 1.0 / (Z + eps)  # careful about zero division.
 73 |         #TODO replace by matrix multiplicaition?!  ->  M@N
 74 |         P_tilde = kernelMat * invZ * invZ.reshape(-1,1)  # broadcasts along rows and columsn, sclaing them both
 75 |         # assert np.testing.assert_allclose(P_tilde, P_tilde.T, err_msg='Ptilde should be symmetric', atol=atol_symmetric)
 76 | 
 77 |     "THIS PTILDE HAS TO BE SYMMETRIC HERE!!"
 78 |     logging.warning("max discrepancy of Ptilde symmetry: %e" % np.max(np.abs(P_tilde - P_tilde.T)))
 79 | 
 80 |     # Eq (5,6) of [1]
 81 |     # once again, the same trick with diagonal matrix for resacling
 82 |     if issparse(kernelMat):
 83 |         # import pdb
 84 |         # pdb.set_trace()
 85 |         if symmetrize:   # Eq 7 of
 86 |             logging.warning("not clear how the symmetric version is implemented")
 87 |             rowsum = np.array(P_tilde.sum(1)).flatten()
 88 |             _check_Z_for_division(rowsum, eps)
 89 |             scalingMat_rows = diags(1.0 / (rowsum + eps), offsets=0)
 90 |             sqrt_scale_row = np.sqrt(scalingMat_rows)
 91 | 
 92 |             colsum = np.array(P_tilde.sum(0)).flatten()
 93 |             _check_Z_for_division(colsum, eps)
 94 |             scalingMat_cols = diags(1.0 / (colsum + eps), offsets=0)
 95 |             sqrt_scale_col = np.sqrt(scalingMat_cols)
 96 | 
 97 |             logging.warning("max discrepancy of row/colsum: %e" % np.max(np.abs(rowsum-colsum)))
 98 |             logging.warning("max discrepancy of sqrt: %e" % np.max(np.abs(sqrt_scale_row-sqrt_scale_col)))
 99 | 
100 |             P_tilde =  sqrt_scale_row * P_tilde * sqrt_scale_col
101 |         else:
102 |             Z_tilde = np.array(P_tilde.sum(1)).flatten()
103 |             _check_Z_for_division(Z_tilde, eps)
104 |             scalingMat = diags(1.0 / (Z_tilde + eps), offsets=0)
105 | 
106 |             P_tilde = scalingMat * P_tilde
107 |     else:
108 |         Z_tilde = P_tilde.sum(1).flatten() # make sure it doesnt have two dimensions, needed for the broadcasting below
109 |         _check_Z_for_division(Z_tilde, eps)
110 |         invZ_tilde = 1.0 / (Z_tilde + eps)
111 |         ixnonZero = Z_tilde != 0         #same fuzz about the zero
112 | 
113 |         # nasty: since zInv_tilde is a 1D vector it automatically broadcasts along rows (leading to col normalization)
114 |         # hence we have to make the broadcasting explicit, giving shape to invZ
115 |         if symmetrize:   # Eq 7 of
116 |             raise NotImplementedError("not clear how symmetric is implemented. ask maren")
117 |             logging.warning("not clear how the symmetric version is implemented")
118 |             sqrt_invZ_tilde = np.sqrt(invZ_tilde)
119 |             P_tilde[np.ix_(ixnonZero, ixnonZero)] = P_tilde[np.ix_(ixnonZero, ixnonZero)] * sqrt_invZ_tilde[ixnonZero].reshape(-1, 1) * sqrt_invZ_tilde[ixnonZero] # normalizes each row
120 |         else:
121 |             P_tilde[np.ix_(ixnonZero, ixnonZero)] = P_tilde[np.ix_(ixnonZero, ixnonZero)] * invZ_tilde[ixnonZero].reshape(-1,1)  #normalizes each row
122 | 
123 |     return P_tilde
124 | 
125 | 
def _calc_dpt(T):
    """
    Compute the accumulated transition matrix and the diffusion pseudotime
    (DPT) distances from the spectrum of a transition matrix.

    ``eigsh`` is asked for ``N - 1`` eigenpairs (it cannot return all ``N``).
    The pair with the largest eigenvalue, returned last, is treated as the
    stationary state and left out of both results. Every remaining pair is
    weighted with ``lambda / (1 - lambda)``.

    :param T: sparse transition matrix of shape (N, N)
    :return: tuple ``(M, dpt)``: ``M`` is the weighted sum of outer products of
        the kept eigenvectors, ``dpt`` the elementwise square root of the
        summed, weighted squared eigenvector differences (both N x N, dense)
    :raises AssertionError: if ``T`` is not sparse
    """
    n_vectors = T.shape[0]-1  # eigsh can only compute all but one eigenpair

    assert issparse(T), "T should be sparse"
    logging.info("Calculating full eigenvalue decomposition")
    lambdas, V = eigsh(T, k=n_vectors)

    # note: the eigenpair that eigsh did not return is missing from everything below
    prefactor = lambdas/(1-lambdas)

    # drop the last column: that eigenvector is the steady state
    V_kept = V[:, :-1]
    M = V_kept @ np.diag(prefactor[:-1]) @ V_kept.T

    logging.info("calculating dpt matrix")

    # one all-vs-all squared difference matrix per kept eigenvector,
    # weighted by the squared prefactor and summed up
    n_samples = V.shape[0]
    dpt2_matrix = np.zeros((n_samples, n_samples))
    for eigenvector, weight in zip(V_kept.T, prefactor):
        psi = eigenvector.reshape(-1, 1)  # column vector
        # broadcasting (N,1) against (1,N) yields the N x N difference matrix
        dpt2_matrix = dpt2_matrix + weight**2 * (psi - psi.T) ** 2

    import warnings
    warnings.warn('changed to return sqrt(dtp2)')
    return M, np.sqrt(dpt2_matrix)
156 | 
157 | 
class DiffusionMap(BaseEstimator):

    """
    Diffusion maps for dimension reduction along the lines of [1].

    The kernel matrix is approximated with a k-nearest-neighbour graph: only
    the k nearest neighbours of a sample get a nonzero kernel value, which
    keeps the kernel matrix sparse.
    """

    def __init__(self, sigma, embedding_dim, k=100):
        """
        :param sigma: diffusion kernel width; 0 selects a per-sample (local) width
            estimated from the nearest-neighbour distances
        :param embedding_dim: how many dimension to reduce to
        :param k: kNN parameter (kNN is used when calculating the kernel matrix).
            the larger the more accurate, but the more RAM needed
        """
        self.sigma = sigma
        self.embedding_dim = embedding_dim
        self.k = k
        self.local_sigma = None

        # NN is the most expensive calculation, cache it
        self._cached_nn_distances = None
        self._cached_nn_indices = None
        # what the cached result was computed for; a cache hit requires both to match
        self._cached_nn_data = None
        self._cached_nn_k = None

    def fit_transform(self, X, density_normalize=True, symmetrize=False):
        """
        Estimate the diffusion map embedding.

        :param X: data matrix (samples x features)
        :param density_normalize: boolean, whether to apply density normalization to the kernel matrix
        :param symmetrize: forwarded to ``_density_normalize``; only used if ``density_normalize`` is True
        :return: tuple ``(V, lambdas)``: eigenvectors (one column per embedding
            dimension) and their eigenvalues, both sorted by descending eigenvalue
        """

        # calculate the kernel matrix based on a nearest neighbour graph
        # kernelMat is called $P_xy$ in [1]
        k = min(self.k, X.shape[0])  # limit the number of neighbours

        logging.info("Calculating kernel matrix")
        kernelMat = self._get_kernel_matrix(X, k)

        # set the diagonal to 0: no diffusion onto itself
        kernelMat.setdiag(np.zeros(X.shape[0]))  # TODO not sure if this is to be done BEFORE or after normalization

        if density_normalize:
            logging.info("density normalization")
            kernelMat = _density_normalize(kernelMat, symmetrize=symmetrize)

        # also, store the kernel matrix (mostly debugging)
        self.kernelMat = kernelMat

        # calculate the eigenvectors of the matrix
        logging.info("Calculating eigenvalue decomposition")

        # TODO Warning: the kernel matrix is not symmetric after density normalization
        # (unless symmetrize=True), while eigsh is a solver for symmetric matrices
        lambdas, V = eigsh(kernelMat, k=self.embedding_dim)  # as many eigenpairs as the requested embedding dim

        # eigsh returns the eigenvalues in ascending order (smallest first), so resort
        ix = lambdas.argsort()[::-1]

        # TODO could think about getting rid of the first EV, which has only density info
        return V[:, ix], lambdas[ix]

    def _get_NN(self, dataMatrix, k):
        """
        Calculate the distances to the k nearest neighbours of every sample.

        The result is cached on the instance. The cache is only reused when
        the call is made with the very same data object and the same ``k``;
        any other call recomputes the neighbours and replaces the cache.
        In-place modifications of a previously passed data object are NOT
        detected.

        :param dataMatrix: matrix containing one sample per row
        :param k: number of nearest neighbours
        :return: tuple ``(distances, indices)``, see the output of ``NearestNeighbors.kneighbors``
        """
        cache_is_valid = (
            self._cached_nn_distances is not None
            and self._cached_nn_indices is not None
            # getattr: tolerate instances that were created without running __init__
            and getattr(self, '_cached_nn_data', None) is dataMatrix
            and getattr(self, '_cached_nn_k', None) == k
        )

        if not cache_is_valid:
            nbrs = NearestNeighbors(n_neighbors=k, algorithm='auto').fit(dataMatrix)
            distances, indices = nbrs.kneighbors(dataMatrix)

            # cache for later, together with what it was computed for
            self._cached_nn_distances = distances
            self._cached_nn_indices = indices
            self._cached_nn_data = dataMatrix
            self._cached_nn_k = k

        return self._cached_nn_distances, self._cached_nn_indices

    def _get_kernel_matrix(self, X, k):
        """
        Return the kernel matrix for the samples in X using a Gaussian kernel and a kNN approximation,
        called K(x,y) in [2].

        - all kernel values are zero, except within the nearest neighbours
        - the matrix is symmetrized afterwards (kNN is not necessarily symmetric)

        if self.sigma != 0 a single specified sigma is applied to all datapoints.
        if self.sigma == 0, sigma is estimated for each datapoint from its nearest-neighbour distances
        (and stored in ``self.local_sigma``).

        :param X: data matrix NxF, where N=number of samples, F=number of features
        :param k: number of nearest neighbours to consider in kNN
        :return: symmetric sparse matrix of NxN
        :raises AssertionError: if the symmetrized matrix is asymmetric by more than 1e-6
        """

        distances, indices = self._get_NN(X, k=k)

        if self.sigma != 0:
            logging.info("calculating kernel matrix with global sigma %f", self.sigma)
            diffDist = np.exp(-(0.5 / self.sigma**2) * distances**2)
        else:
            logging.info("calculating kernel matrix with local sigma")
            local_sigma_squared = np.median(distances**2, axis=1).reshape(-1, 1)  # .shape = (datapoints, 1)
            local_sigma_squared += 1e-15  # numerical stability, also dropout leads to 0 distance of different datapoints

            self.local_sigma = np.sqrt(local_sigma_squared)
            # more tricky: for each datapoint and each of its kNNs a different pair of sigmas applies
            # distances.shape = (datapoints, kNNs)
            diffDist = []
            for i in range(len(indices)):  # one row of the kernel matrix per datapoint

                prefactor_nom = 2 * self.local_sigma[i] * self.local_sigma[indices[i]]
                prefactor_denom = local_sigma_squared[i] + local_sigma_squared[indices[i]]
                prefactor = np.sqrt(prefactor_nom/prefactor_denom)
                exp_denom = 2 * prefactor_denom
                # reshape, otherwise autobroadcasting goes from (k,) -> (k,k)
                diffDist.append(prefactor * np.exp(-(distances[i].reshape(-1, 1) ** 2) / exp_denom))
            diffDist = np.array(diffDist)

        # build a sparse matrix out of the kernel values
        N = X.shape[0]
        # CSR row pointer: the first k entries of indices.flatten() belong to row 0, the next k to row 1, ...
        indptr = np.arange(0, (N + 1) * k, k)

        K = csr_matrix((diffDist.flatten(), indices.flatten(), indptr), shape=(N, N))

        # due to the kNN approximation, the matrix K is not necessarily symmetric
        # (if x is a kNN of y, y doesnt have to be a kNN of x)
        # make it symmetric again by filling in the missing entries

        shared_mask = (K != 0).multiply(K.T != 0)  # entries that are nonzero in both matrices. multiply is elementwise!
        K_sym = K + K.T - K.multiply(shared_mask)  # add them up, subtract the common part that was counted twice

        # symmetry check without leaving the sparse domain: a dense NxN copy
        # would need O(N^2) memory and defeat the purpose of the kNN approximation
        asymmetry = abs(K_sym - K_sym.T)
        max_asymmetry = asymmetry.max() if asymmetry.nnz else 0.0
        # written as `not (x <= tol)` so that a NaN entry fails the check as well
        if not max_asymmetry <= 1e-6:  # todo loose tolerance
            raise AssertionError(
                "symmetrized kernel matrix is not symmetric: max |K - K.T| = %e" % max_asymmetry)

        return K_sym
301 | 
302 | 
if __name__ == '__main__':

    # demo: embed a random subset of MNIST and plot the first two components.
    # fetch_mldata was removed from scikit-learn, so the dataset is fetched
    # from OpenML instead.
    from sklearn.datasets import fetch_openml
    mnist = fetch_openml('mnist_784', version=1, as_frame=False)
    # OpenML delivers the labels as strings; convert them so they can be used as colors
    X, y = mnist.data.astype(np.float32), mnist.target.astype(int)
    ix_perm = np.random.permutation(X.shape[0])  # shuffle the data
    X, y = X[ix_perm, :], y[ix_perm]

    # keep the demo small: 1000 random digits
    X, y = X[:1000, :], y[:1000]

    X /= 255  # scale pixel values to [0, 1]

    df = DiffusionMap(sigma=5, embedding_dim=10)
    V, lam = df.fit_transform(X, density_normalize=False)

    plt.scatter(V[:, 0], V[:, 1], c=y)
    plt.show()
321 | 


--------------------------------------------------------------------------------
/test_DiffusionMap.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | import numpy as np
  3 | from diffusionMaps import DiffusionMap, _density_normalize
  4 | from scipy.sparse import csr_matrix, issparse
  5 | import scipy.sparse
  6 | 
  7 | 
  8 | @pytest.fixture
  9 | def mixtureNormals(request):
 10 |     # teardown
 11 |     def fin():
 12 |         pass
 13 |     request.addfinalizer(fin)
 14 | 
 15 |     samples = 50
 16 |     dims = 3
 17 |     X1 = np.random.normal(0,1, size=(samples,dims))
 18 |     X2 = np.random.normal(3,1, size=(samples, dims))
 19 | 
 20 |     X = np.vstack([X1,X2])
 21 |     return X
 22 | 
 23 | def create_sym_matrix(n):
 24 |     q = np.random.rand(n,n)
 25 |     return np.dot(q,q.T)
 26 | 
 27 | 
 28 | def create_sparse_sym_matrix(n, density=0.1): # sparseness = .9 -> 90% entries zero
 29 |     q = np.random.rand(n,n)
 30 |     sym = np.dot(q,q.T)
 31 |     ix = np.random.binomial(1,density/2, size=(n,n))
 32 |     ix = ix+ix.T*(ix==0)  # make the selection of entrys also symmetric
 33 |     return csr_matrix(sym*ix)
 34 | 
 35 | 
 36 | def test_DiffusionMap_fit_transform_output_dimensions(mixtureNormals):
 37 |     X = mixtureNormals
 38 | 
 39 |     embDim = 2
 40 |     df = DiffusionMap(sigma=1, embedding_dim=embDim )
 41 |     X_embed, lam = df.fit_transform(X)
 42 | 
 43 |     assert X_embed.shape == (X.shape[0], embDim ), "returns wrong dimensionally"
 44 |     assert lam.shape[0] == X_embed.shape[1], "must return as many eigenvalues as embedded dimensions"
 45 | 
 46 | 
 47 | def test_DiffusionMap_nearestNeighbour_number_of_neighbours(mixtureNormals):
 48 |     X = mixtureNormals
 49 |     embDim = 2
 50 |     df = DiffusionMap(sigma=1, embedding_dim=embDim)
 51 | 
 52 |     kNN = 4
 53 |     distances, indices = df._get_NN(X,k=kNN)
 54 | 
 55 |     assert distances.shape == (X.shape[0], kNN)
 56 |     assert indices.shape == (X.shape[0], kNN)
 57 | 
 58 | 
 59 | def test_DiffusionMap_get_kernel_matrix_number_of_neighbours(mixtureNormals):
 60 |     """actually we would like to test for the exact number of neighvours
 61 |     but due tot the symmetrizing, it can exceed the kNN"""
 62 |     X = mixtureNormals
 63 |     embDim = 2
 64 |     df = DiffusionMap(sigma=1, embedding_dim=embDim)
 65 | 
 66 |     kNN = 4
 67 |     K = df._get_kernel_matrix(X,k=kNN)
 68 |     assert K.shape == (X.shape[0], X.shape[0])
 69 | 
 70 |     nonzero_elements_per_row = np.sum(K.toarray()!=0, 1)
 71 |     print(nonzero_elements_per_row)
 72 |     assert np.all(nonzero_elements_per_row >= kNN)  # the number of nonzero elements must be kNN or larger (due to the symmetrizing
 73 | 
 74 | 
 75 | def test_DiffusionMap_get_kernel_matrix_symmetry(mixtureNormals):
 76 |     "make sure the kernel matrix is symmetric"
 77 |     X = mixtureNormals
 78 |     df = DiffusionMap(sigma=1,embedding_dim=2)
 79 |     K = df._get_kernel_matrix(X,k=2)
 80 | 
 81 |     Q = (K-K.T).toarray()  # np.all doesnt work on sparse matrices
 82 |     assert np.all(Q==0), 'returned kernel matrix is not symmetric'
 83 | 
 84 | 
 85 | def test__get_kernel_matrix_sparse(mixtureNormals):
 86 |     df = DiffusionMap(sigma=1,embedding_dim=2)
 87 |     K = df._get_kernel_matrix(mixtureNormals,k=10)
 88 |     assert issparse(K)
 89 | 
 90 | 
 91 | def test__density_normalize__sparse(mixtureNormals):
 92 |     "must return sparse if we put in sparse"
 93 |     K = csr_matrix([[0,1],[1,1]])
 94 |     assert issparse(_density_normalize(K)), 'returned matrix is not sparse after normalization'
 95 | 
 96 | 
 97 | def test__density_normalize__rowsum(mixtureNormals):
 98 |     "enforce rows summing to on for the desniy normalization"
 99 |     K = create_sparse_sym_matrix(100, density=0.1)
100 |     K_norm = _density_normalize(K)
101 |     np.testing.assert_allclose(K_norm.toarray().sum(1), 1)
102 | 
103 | 
def test__density_normalize__not_sparse_rowsum(mixtureNormals):
    """Rows of the density-normalized dense matrix must sum to one."""
    kernel = create_sym_matrix(100)
    row_sums = _density_normalize(kernel).sum(1)
    np.testing.assert_allclose(row_sums, 1)
109 | 
110 | 
111 | # def test__density_normalize__not_sparse_symmetrize(mixtureNormals):
112 | #     "check the symmetrized version of the transtion matrix"
113 | #     K = create_sym_matrix(100)
114 | #     Tsym = _density_normalize(K, symmetrize=True)
115 | #     # check symmetry
116 | #     np.testing.assert_allclose(Tsym, Tsym.T, err_msg="Tsym is not symmetric")
117 | #     # check the rowsum =1
118 | #     np.testing.assert_allclose(Tsym.sum(1), 1)
119 | 
def test__density_normalize__sparse_symmetrize(mixtureNormals):
    """Check the symmetrized version of the transition matrix (sparse input).

    NOTE(review): the second assertion expects unit row sums from the
    symmetrized matrix as well -- confirm that the symmetrized normalization
    is actually meant to guarantee this.
    """
    kernel = create_sparse_sym_matrix(100, 0.1)
    dense_Tsym = _density_normalize(kernel, symmetrize=True)
    # symmetry
    np.testing.assert_allclose(dense_Tsym.toarray(), dense_Tsym.T.toarray(), err_msg="Tsym is not symmetric")
    # row sums equal to one
    row_sums = dense_Tsym.toarray().sum(1)
    np.testing.assert_allclose(row_sums, 1, err_msg="rowsum <> 1")
128 | 
def test__density_normalize__not_sparse(mixtureNormals):
    """Dense input must produce a dense ``np.ndarray``."""
    normalized = _density_normalize(create_sym_matrix(2))
    assert isinstance(normalized, np.ndarray), 'must return full matrix if we put in a full matrix'
133 | 
134 | 
135 | 
def test_density_normalize_same_result_sparse_nonsparse():
    """The sparse and the dense code path of _density_normalize must agree."""
    # several sparsity levels, as sometimes entire rows/cols become zero
    densities = [0.0001, 0.01, 0.5]
    for density in densities:
        sparse_kernel = create_sparse_sym_matrix(5, density=density)
        dense_kernel = sparse_kernel.toarray()

        from_sparse = _density_normalize(sparse_kernel).toarray()
        from_dense = _density_normalize(dense_kernel)

        np.testing.assert_allclose(from_sparse, from_dense)
146 | 
147 | 
def test_DiffusionMap_fit_transform_eigenvalue_ordering(mixtureNormals):
    """Eigenvalues must be returned largest first."""
    n_components = 2
    model = DiffusionMap(sigma=1, embedding_dim=n_components)

    _embedding, eigenvalues = model.fit_transform(mixtureNormals)
    largest, second = eigenvalues[0], eigenvalues[1]
    assert largest > second
156 | 


--------------------------------------------------------------------------------