├── .gitignore ├── .readthedocs.yaml ├── LICENSE ├── MANIFEST.in ├── MultiMAP ├── __init__.py └── matrix.py ├── README.md ├── docs ├── Makefile ├── MultiMAP_schematic.png ├── conf.py ├── index.rst ├── make.bat └── requirements.txt ├── examples └── tutorial.ipynb └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | MultiMAP/__pycache__ 3 | docs/_build 4 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-20.04 11 | tools: 12 | python: "3.9" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | 18 | # Install our python package before building the docs 19 | python: 20 | install: 21 | - requirements: docs/requirements.txt 22 | - method: pip 23 | path: . 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021 Mika Sarkin Jain 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | -------------------------------------------------------------------------------- /MultiMAP/__init__.py: -------------------------------------------------------------------------------- 1 | import scipy 2 | import numpy as np 3 | from MultiMAP.matrix import MultiMAP, tfidf 4 | #you don't need these if going for MultiMAP.matrix functions 5 | try: 6 | import anndata 7 | except ImportError: 8 | pass 9 | try: 10 | import scanpy as sc 11 | except ImportError: 12 | pass 13 | 14 | def TFIDF_LSI(adata, n_comps=50, binarize=True, random_state=0): 15 | ''' 16 | Computes LSI based on a TF-IDF transformation of the data. Putative dimensionality 17 | reduction for scATAC-seq data prior to MultiMAP. Adds an ``.obsm['X_lsi']`` field to 18 | the object it was ran on. 
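	A minimal usage sketch (illustrative only; ``atac`` stands in for a preprocessed
	peak-count ``AnnData`` and is not defined in this package)::

		import MultiMAP
		#atac is an assumed AnnData of (binarised) peak counts
		MultiMAP.TFIDF_LSI(atac, n_comps=50)
		#the LSI coordinates land in atac.obsm['X_lsi'], ready to be passed to Integration() via use_reps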
19 | 20 | Input 21 | ----- 22 | adata : ``AnnData`` 23 | The object to run TFIDF + LSI on. Will use ``.X`` as the input data. 24 | n_comps : ``int`` 25 | The number of components to generate. Default: 50 26 | binarize : ``bool`` 27 | Whether to binarize the data prior to the computation. Often done during scATAC-seq 28 | processing. Default: True 29 | random_state : ``int`` 30 | The seed to use for randon number generation. Default: 0 31 | ''' 32 | 33 | #this is just a very basic wrapper for the non-adata function 34 | if scipy.sparse.issparse(adata.X): 35 | adata.obsm['X_lsi'] = tfidf(adata.X.todense(), n_components=n_comps, binarize=binarize, random_state=random_state) 36 | else: 37 | adata.obsm['X_lsi'] = tfidf(adata.X, n_components=n_comps, binarize=binarize, random_state=random_state) 38 | 39 | def Wrapper(flagged, use_reps, embedding, seed, **kwargs): 40 | ''' 41 | A function that computes the paired PCAs between the datasets to integrate, calls MultiMAP 42 | proper, and returns a (parameters, connectivities, embedding) tuple. Embedding optional 43 | depending on ``embedding``. 44 | 45 | Input 46 | ----- 47 | flagged : list of ``AnnData`` 48 | Preprocessed objects to integrate. Need to have the single-dataset DRs computed at 49 | this stage. Need to have ``.obs[\'multimap_index\']`` defined, incrementing integers 50 | matching the object's index in the list. Both ``Integrate()`` and ``Batch()`` make 51 | these. 52 | 53 | All other arguments as described in ``MultiMAP.Integration()``. 54 | ''' 55 | #MultiMAP wants the shared PCAs delivered as a dictionary, with the subset indices 56 | #tupled up as a key. let's make that then 57 | joint = {} 58 | #process all dataset pairs 59 | for ind1 in np.arange(len(flagged)-1): 60 | for ind2 in np.arange(ind1+1, len(flagged)): 61 | subset = (ind1, ind2) 62 | #collapse into a single object and run a PCA 63 | adata = flagged[ind1].concatenate(flagged[ind2], join='inner') 64 | sc.tl.pca(adata) 65 | #preserve space by deleting the intermediate object and just keeping its PCA 66 | #and multimap index thing 67 | X_pca = adata.obsm['X_pca'].copy() 68 | multimap_index = adata.obs['multimap_index'].values 69 | del adata 70 | #store the results in joint, which involves some further acrobatics 71 | joint[subset] = [] 72 | #extract the coordinates for this particular element in the original list, using 73 | #the multimap_index .obs column we created before. handy! 74 | for i in subset: 75 | joint[subset].append(X_pca[multimap_index == i, :]) 76 | 77 | #with the joint prepped, we just need to extract the primary dimensionality reductions 78 | #and we're good to go here 79 | Xs = [] 80 | for adata, use_rep in zip(flagged, use_reps): 81 | Xs.append(adata.obsm[use_rep]) 82 | 83 | #set seed 84 | np.random.seed(seed) 85 | 86 | #and with that, we're now truly free to call the MultiMAP function 87 | #need to negate embedding and provide that as graph_only for the function to understand 88 | mmp = MultiMAP(Xs=Xs, joint=joint, graph_only=(not embedding), **kwargs) 89 | 90 | #and that's it. spit this out for the other wrappers to use however 91 | return mmp 92 | 93 | def Integration(adatas, use_reps, scale=True, embedding=True, seed=0, **kwargs): 94 | ''' 95 | Run MultiMAP to integrate a number of AnnData objects from various multi-omics experiments 96 | into a single joint dimensionally reduced space. Returns a joint object with the resulting 97 | embedding stored in ``.obsm[\'X_multimap\']`` (if instructed) and appropriate graphs in 98 | ``.obsp``. 
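	As a minimal usage sketch (illustrative only; ``rna`` and ``atac`` are assumed
	``AnnData`` objects sharing a common feature space, with ``.obsm['X_pca']`` and
	``.obsm['X_lsi']`` precomputed)::

		import scanpy as sc
		import MultiMAP
		#rna and atac are assumed preprocessed AnnData objects
		adata = MultiMAP.Integration([rna, atac], ['X_pca', 'X_lsi'])
		sc.pl.embedding(adata, 'X_multimap')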
The final object will be a concatenation of the individual ones provided on 99 | input, so in the interest of ease of exploration it is recommended to have non-scaled data 100 | in ``.X``. 101 | 102 | Input 103 | ----- 104 | adatas : list of ``AnnData`` 105 | The objects to integrate. The ``.var`` spaces will be intersected across subsets of 106 | the objects to compute shared PCAs, so make sure that you have ample features in 107 | common between the objects. ``.X`` data will be used for computation. 108 | use_reps : list of ``str`` 109 | The ``.obsm`` fields for each of the corresponding ``adatas`` to use as the 110 | dimensionality reduction to represent the full feature space of the object. Needs 111 | to be precomputed and present in the object at the time of calling the function. 112 | scale : ``bool``, optional (default: ``True``) 113 | Whether to scale the data to N(0,1) on a per-dataset basis prior to computing the 114 | cross-dataset PCAs. Improves integration. 115 | embedding : ``bool``, optional (default: ``True``) 116 | Whether to compute the MultiMAP embedding. If ``False``, will just return the graph, 117 | which can be used to compute a regular UMAP. This can produce a manifold quicker, 118 | but at the cost of accuracy. 119 | n_neighbors : ``int`` or ``None``, optional (default: ``None``) 120 | The number of neighbours for each node (data point) in the MultiGraph. If ``None``, 121 | defaults to 15 times the number of input datasets. 122 | n_components : ``int`` (default: 2) 123 | The number of dimensions of the MultiMAP embedding. 124 | seed : ``int`` (default: 0) 125 | RNG seed. 126 | strengths: ``list`` of ``float`` or ``None`` (default: ``None``) 127 | The relative contribution of each dataset to the layout of the embedding. The 128 | higher the strength the higher the weighting of its cross entropy in the layout loss. 129 | If provided, needs to be a list with one 0-1 value per dataset; if ``None``, defaults 130 | to 0.5 for each dataset. 131 | cardinality : ``float`` or ``None``, optional (default: ``None``) 132 | The target sum of the connectivities of each neighbourhood in the MultiGraph. If 133 | ``None``, defaults to ``log2(n_neighbors)``. 134 | 135 | The following parameter definitions are sourced from UMAP 0.5.1: 136 | 137 | n_epochs : int (optional, default None) 138 | The number of training epochs to be used in optimizing the 139 | low dimensional embedding. Larger values result in more accurate 140 | embeddings. If None is specified a value will be selected based on 141 | the size of the input dataset (200 for large datasets, 500 for small). 142 | init : string (optional, default 'spectral') 143 | How to initialize the low dimensional embedding. Options are: 144 | * 'spectral': use a spectral embedding of the fuzzy 1-skeleton 145 | * 'random': assign initial embedding positions at random. 146 | * A numpy array of initial embedding positions. 147 | min_dist : float (optional, default 0.1) 148 | The effective minimum distance between embedded points. Smaller values 149 | will result in a more clustered/clumped embedding where nearby points 150 | on the manifold are drawn closer together, while larger values will 151 | result on a more even dispersal of points. The value should be set 152 | relative to the ``spread`` value, which determines the scale at which 153 | embedded points will be spread out. 154 | spread : float (optional, default 1.0) 155 | The effective scale of embedded points. 
In combination with ``min_dist`` 156 | this determines how clustered/clumped the embedded points are. 157 | set_op_mix_ratio : float (optional, default 1.0) 158 | Interpolate between (fuzzy) union and intersection as the set operation 159 | used to combine local fuzzy simplicial sets to obtain a global fuzzy 160 | simplicial sets. Both fuzzy set operations use the product t-norm. 161 | The value of this parameter should be between 0.0 and 1.0; a value of 162 | 1.0 will use a pure fuzzy union, while 0.0 will use a pure fuzzy 163 | intersection. 164 | local_connectivity : int (optional, default 1) 165 | The local connectivity required -- i.e. the number of nearest 166 | neighbors that should be assumed to be connected at a local level. 167 | The higher this value the more connected the manifold becomes 168 | locally. In practice this should be not more than the local intrinsic 169 | dimension of the manifold. 170 | a : float (optional, default None) 171 | More specific parameters controlling the embedding. If None these 172 | values are set automatically as determined by ``min_dist`` and 173 | ``spread``. 174 | b : float (optional, default None) 175 | More specific parameters controlling the embedding. If None these 176 | values are set automatically as determined by ``min_dist`` and 177 | ``spread``. 178 | ''' 179 | 180 | #the main thing will be pulling out the various subsets of the adatas, sticking them 181 | #together, running joint PCAs, and then splitting up the joint PCAs into datasets of 182 | #origin. to do so, let's introduce a helper .obs column in copied versions of adatas 183 | flagged = [] 184 | for i, adata in enumerate(adatas): 185 | flagged.append(adata.copy()) 186 | #while we're at it, may as well potentially scale our data copy 187 | if scale: 188 | sc.pp.scale(flagged[-1]) 189 | flagged[-1].obs['multimap_index'] = i 190 | 191 | #call the wrapper. returns (params, connectivities, embedding), with embedding optional 192 | mmp = Wrapper(flagged=flagged, use_reps=use_reps, embedding=embedding, seed=seed, **kwargs) 193 | 194 | #make one happy collapsed object and shove the stuff in correct places 195 | #outer join to capture as much gene information as possible for annotation 196 | adata = anndata.concat(adatas, join='outer') 197 | if embedding: 198 | adata.obsm['X_multimap'] = mmp[2] 199 | #the graph is weighted, the higher the better, 1 best. sounds similar to connectivities 200 | #TODO: slot distances into .obsp['distances'] 201 | adata.obsp['connectivities'] = mmp[1] 202 | #set up .uns['neighbors'], setting method to umap as these are connectivities 203 | adata.uns['neighbors'] = {} 204 | adata.uns['neighbors']['params'] = mmp[0] 205 | adata.uns['neighbors']['params']['method'] = 'umap' 206 | adata.uns['neighbors']['distances_key'] = 'distances' 207 | adata.uns['neighbors']['connectivities_key'] = 'connectivities' 208 | return adata 209 | 210 | def Batch(adata, batch_key='batch', scale=True, embedding=True, seed=0, dimred_func=None, rep_name='X_pca', **kwargs): 211 | ''' 212 | Run MultiMAP to correct batch effect within a single AnnData object. Loses the flexibility 213 | of individualised dimensionality reduction choices, but doesn't require a list of separate 214 | objects for each batch/dataset to integrate. Runs PCA on a per-batch/dataset basis prior 215 | to performing an analysis analogous to ``Integration()``. Adds appropriate ``.obsp`` graphs 216 | and ``.obsm[\'X_multimap\']`` (if instructed) to the input. 
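	A minimal usage sketch (illustrative only; ``adata`` with a categorical
	``.obs['batch']`` column is assumed)::

		import scanpy as sc
		import MultiMAP
		#adata is an assumed AnnData with a 'batch' column in .obs
		MultiMAP.Batch(adata, batch_key='batch')
		sc.pl.embedding(adata, 'X_multimap', color='batch')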
217 | 218 | Input 219 | ----- 220 | adata : ``AnnData`` 221 | The object to process. ``.X`` data will be used in the computation. 222 | batch_key : ``str``, optional (default: "batch") 223 | The ``.obs`` column of the input object with the categorical variable defining the 224 | batch/dataset grouping to integrate on. 225 | scale : ``bool``, optional (default: ``True``) 226 | Whether to scale the data to N(0,1) on a per-dataset basis prior to computing the 227 | cross-dataset PCAs. Improves integration. 228 | embedding : ``bool``, optional (default: ``True``) 229 | Whether to compute the MultiMAP embedding. If ``False``, will just return the graph, 230 | which can be used to compute a regular UMAP. This can produce a manifold quicker, 231 | but at the cost of accuracy. 232 | dimred_func : function or ``None``, optional (default: ``None``) 233 | The function to use to compute dimensionality reduction on a per-dataset basis. Must 234 | accept an ``AnnData`` on input and modify it by inserting its dimensionality reduction 235 | into ``.obsm``. If ``None``, ``scanpy.tl.pca()`` will be used. 236 | rep_name : ``str``, optional (default: "X_pca") 237 | The ``.obsm`` field that the dimensionality reduction function stores its output under. 238 | 239 | All other arguments as described in ``Integration()``. 240 | ''' 241 | 242 | #as promised in the docstring, set dimred_func to scanpy PCA if not provided 243 | if dimred_func is None: 244 | dimred_func = sc.tl.pca 245 | 246 | #essentially what this function does is preps data to run through the other wrapper 247 | #so what needs to happen is the object needs to be partitioned up, have DR ran, 248 | #and passed as a list to the wrapper function 249 | flagged = [] 250 | flagged_ids = [] 251 | use_reps = [] 252 | for i,batch in enumerate(np.unique(adata.obs[batch_key])): 253 | #extract the single batch data 254 | flagged.append(adata[adata.obs[batch_key]==batch].copy()) 255 | #potentially scale 256 | if scale: 257 | sc.pp.scale(flagged[-1]) 258 | #and run DR 259 | dimred_func(flagged[-1]) 260 | #and stick on the index for multimap to pull stuff apart later 261 | flagged[-1].obs['multimap_index'] = i 262 | #and add an entry to the list of .obsm keys for the other function 263 | use_reps.append(rep_name) 264 | #and store the cell name ordering for later 265 | flagged_ids = flagged_ids + list(flagged[-1].obs_names) 266 | 267 | #call the wrapper. returns (params, connectivities, embedding), with embedding optional 268 | mmp = Wrapper(flagged=flagged, use_reps=use_reps, embedding=embedding, seed=seed, **kwargs) 269 | 270 | #this output has the cells ordered as a concatenation of the individual flagged objects 271 | #so need to figure out how to reorder the output to get the original cell order 272 | #doing the following operation sets the desired order to adata.obs_names 273 | #and checks the index for each in flagged_ids 274 | #so taking something in flagged_ids order and using sort_order on it will match obs_names 275 | sort_order = [flagged_ids.index(i) for i in list(adata.obs_names)] 276 | 277 | #stick stuff where it's supposed to go 278 | if embedding: 279 | adata.obsm['X_multimap'] = mmp[2][sort_order,:] 280 | #the graph is weighted, the higher the better, 1 best. 
sounds similar to connectivities 281 | #TODO: slot distances into .obsp['distances'] 282 | adata.obsp['connectivities'] = mmp[1][sort_order,:][:,sort_order] 283 | #set up .uns['neighbors'], setting method to umap as these are connectivities 284 | adata.uns['neighbors'] = {} 285 | adata.uns['neighbors']['params'] = mmp[0] 286 | adata.uns['neighbors']['params']['method'] = 'umap' 287 | adata.uns['neighbors']['distances_key'] = 'distances' 288 | adata.uns['neighbors']['connectivities_key'] = 'connectivities' -------------------------------------------------------------------------------- /MultiMAP/matrix.py: -------------------------------------------------------------------------------- 1 | # Partially based on codebase by Leland McInnes (https://github.com/lmcinnes/umap) 2 | 3 | from __future__ import print_function 4 | 5 | import numpy as np 6 | import numba 7 | import scipy 8 | from scipy.optimize import curve_fit 9 | from sklearn.neighbors import KDTree 10 | from sklearn.metrics import pairwise_distances 11 | 12 | import warnings 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | #INT32_MIN = np.iinfo(np.int32).min + 1 22 | #INT32_MAX = np.iinfo(np.int32).max - 1 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | from collections import deque, namedtuple 35 | from warnings import warn 36 | 37 | import numpy as np 38 | import numba 39 | 40 | #from umap.sparse import sparse_mul, sparse_diff, sparse_sum 41 | 42 | #from umap.utils import tau_rand_int, norm 43 | 44 | import scipy.sparse 45 | import locale 46 | 47 | locale.setlocale(locale.LC_NUMERIC, "C") 48 | 49 | 50 | EPS = 1e-8 51 | 52 | RandomProjectionTreeNode = namedtuple( 53 | "RandomProjectionTreeNode", 54 | ["indices", "is_leaf", "hyperplane", "offset", "left_child", "right_child"], 55 | ) 56 | 57 | FlatTree = namedtuple("FlatTree", ["hyperplanes", "offsets", "children", "indices"]) 58 | 59 | 60 | @numba.njit(fastmath=True) 61 | def angular_random_projection_split(data, indices, rng_state): 62 | 63 | dim = data.shape[1] 64 | 65 | 66 | left_index = tau_rand_int(rng_state) % indices.shape[0] 67 | right_index = tau_rand_int(rng_state) % indices.shape[0] 68 | right_index += left_index == right_index 69 | right_index = right_index % indices.shape[0] 70 | left = indices[left_index] 71 | right = indices[right_index] 72 | 73 | left_norm = norm(data[left]) 74 | right_norm = norm(data[right]) 75 | 76 | if abs(left_norm) < EPS: 77 | left_norm = 1.0 78 | 79 | if abs(right_norm) < EPS: 80 | right_norm = 1.0 81 | 82 | 83 | 84 | hyperplane_vector = np.empty(dim, dtype=np.float32) 85 | 86 | for d in range(dim): 87 | hyperplane_vector[d] = (data[left, d] / left_norm) - ( 88 | data[right, d] / right_norm 89 | ) 90 | 91 | hyperplane_norm = norm(hyperplane_vector) 92 | if abs(hyperplane_norm) < EPS: 93 | hyperplane_norm = 1.0 94 | 95 | for d in range(dim): 96 | hyperplane_vector[d] = hyperplane_vector[d] / hyperplane_norm 97 | 98 | 99 | 100 | 101 | n_left = 0 102 | n_right = 0 103 | side = np.empty(indices.shape[0], np.int8) 104 | for i in range(indices.shape[0]): 105 | margin = 0.0 106 | for d in range(dim): 107 | margin += hyperplane_vector[d] * data[indices[i], d] 108 | 109 | if abs(margin) < EPS: 110 | side[i] = tau_rand_int(rng_state) % 2 111 | if side[i] == 0: 112 | n_left += 1 113 | else: 114 | n_right += 1 115 | elif margin > 0: 116 | side[i] = 0 117 | n_left += 1 118 | else: 119 | side[i] = 1 120 | n_right += 1 121 | 122 | 123 | indices_left = np.empty(n_left, dtype=np.int64) 124 | indices_right = np.empty(n_right, dtype=np.int64) 125 | 126 
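    #(comment added for clarity, not in the original source) the loop below distributes
    #each index into indices_left/indices_right according to the side labels computed
    #from the hyperplane margins above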
| 127 | n_left = 0 128 | n_right = 0 129 | for i in range(side.shape[0]): 130 | if side[i] == 0: 131 | indices_left[n_left] = indices[i] 132 | n_left += 1 133 | else: 134 | indices_right[n_right] = indices[i] 135 | n_right += 1 136 | 137 | return indices_left, indices_right, hyperplane_vector, None 138 | 139 | 140 | @numba.njit(fastmath=True, nogil=True) 141 | def euclidean_random_projection_split(data, indices, rng_state): 142 | 143 | dim = data.shape[1] 144 | 145 | 146 | left_index = tau_rand_int(rng_state) % indices.shape[0] 147 | right_index = tau_rand_int(rng_state) % indices.shape[0] 148 | right_index += left_index == right_index 149 | right_index = right_index % indices.shape[0] 150 | left = indices[left_index] 151 | right = indices[right_index] 152 | 153 | 154 | 155 | hyperplane_offset = 0.0 156 | hyperplane_vector = np.empty(dim, dtype=np.float32) 157 | 158 | for d in range(dim): 159 | hyperplane_vector[d] = data[left, d] - data[right, d] 160 | hyperplane_offset -= ( 161 | hyperplane_vector[d] * (data[left, d] + data[right, d]) / 2.0 162 | ) 163 | 164 | 165 | 166 | 167 | n_left = 0 168 | n_right = 0 169 | side = np.empty(indices.shape[0], np.int8) 170 | for i in range(indices.shape[0]): 171 | margin = hyperplane_offset 172 | for d in range(dim): 173 | margin += hyperplane_vector[d] * data[indices[i], d] 174 | 175 | if abs(margin) < EPS: 176 | side[i] = tau_rand_int(rng_state) % 2 177 | if side[i] == 0: 178 | n_left += 1 179 | else: 180 | n_right += 1 181 | elif margin > 0: 182 | side[i] = 0 183 | n_left += 1 184 | else: 185 | side[i] = 1 186 | n_right += 1 187 | 188 | 189 | indices_left = np.empty(n_left, dtype=np.int64) 190 | indices_right = np.empty(n_right, dtype=np.int64) 191 | 192 | 193 | n_left = 0 194 | n_right = 0 195 | for i in range(side.shape[0]): 196 | if side[i] == 0: 197 | indices_left[n_left] = indices[i] 198 | n_left += 1 199 | else: 200 | indices_right[n_right] = indices[i] 201 | n_right += 1 202 | 203 | return indices_left, indices_right, hyperplane_vector, hyperplane_offset 204 | 205 | 206 | @numba.njit(fastmath=True) 207 | def sparse_angular_random_projection_split(inds, indptr, data, indices, rng_state): 208 | 209 | 210 | left_index = tau_rand_int(rng_state) % indices.shape[0] 211 | right_index = tau_rand_int(rng_state) % indices.shape[0] 212 | right_index += left_index == right_index 213 | right_index = right_index % indices.shape[0] 214 | left = indices[left_index] 215 | right = indices[right_index] 216 | 217 | left_inds = inds[indptr[left] : indptr[left + 1]] 218 | left_data = data[indptr[left] : indptr[left + 1]] 219 | right_inds = inds[indptr[right] : indptr[right + 1]] 220 | right_data = data[indptr[right] : indptr[right + 1]] 221 | 222 | left_norm = norm(left_data) 223 | right_norm = norm(right_data) 224 | 225 | if abs(left_norm) < EPS: 226 | left_norm = 1.0 227 | 228 | if abs(right_norm) < EPS: 229 | right_norm = 1.0 230 | 231 | 232 | 233 | normalized_left_data = left_data / left_norm 234 | normalized_right_data = right_data / right_norm 235 | hyperplane_inds, hyperplane_data = sparse_diff( 236 | left_inds, normalized_left_data, right_inds, normalized_right_data 237 | ) 238 | 239 | hyperplane_norm = norm(hyperplane_data) 240 | if abs(hyperplane_norm) < EPS: 241 | hyperplane_norm = 1.0 242 | for d in range(hyperplane_data.shape[0]): 243 | hyperplane_data[d] = hyperplane_data[d] / hyperplane_norm 244 | 245 | 246 | 247 | 248 | n_left = 0 249 | n_right = 0 250 | side = np.empty(indices.shape[0], np.int8) 251 | for i in range(indices.shape[0]): 252 | margin 
= 0.0 253 | 254 | i_inds = inds[indptr[indices[i]] : indptr[indices[i] + 1]] 255 | i_data = data[indptr[indices[i]] : indptr[indices[i] + 1]] 256 | 257 | mul_inds, mul_data = sparse_mul( 258 | hyperplane_inds, hyperplane_data, i_inds, i_data 259 | ) 260 | for d in range(mul_data.shape[0]): 261 | margin += mul_data[d] 262 | 263 | if abs(margin) < EPS: 264 | side[i] = tau_rand_int(rng_state) % 2 265 | if side[i] == 0: 266 | n_left += 1 267 | else: 268 | n_right += 1 269 | elif margin > 0: 270 | side[i] = 0 271 | n_left += 1 272 | else: 273 | side[i] = 1 274 | n_right += 1 275 | 276 | 277 | indices_left = np.empty(n_left, dtype=np.int64) 278 | indices_right = np.empty(n_right, dtype=np.int64) 279 | 280 | 281 | n_left = 0 282 | n_right = 0 283 | for i in range(side.shape[0]): 284 | if side[i] == 0: 285 | indices_left[n_left] = indices[i] 286 | n_left += 1 287 | else: 288 | indices_right[n_right] = indices[i] 289 | n_right += 1 290 | 291 | hyperplane = np.vstack((hyperplane_inds, hyperplane_data)) 292 | 293 | return indices_left, indices_right, hyperplane, None 294 | 295 | 296 | @numba.njit(fastmath=True) 297 | def sparse_euclidean_random_projection_split(inds, indptr, data, indices, rng_state): 298 | 299 | 300 | left_index = tau_rand_int(rng_state) % indices.shape[0] 301 | right_index = tau_rand_int(rng_state) % indices.shape[0] 302 | right_index += left_index == right_index 303 | right_index = right_index % indices.shape[0] 304 | left = indices[left_index] 305 | right = indices[right_index] 306 | 307 | left_inds = inds[indptr[left] : indptr[left + 1]] 308 | left_data = data[indptr[left] : indptr[left + 1]] 309 | right_inds = inds[indptr[right] : indptr[right + 1]] 310 | right_data = data[indptr[right] : indptr[right + 1]] 311 | 312 | 313 | 314 | hyperplane_offset = 0.0 315 | hyperplane_inds, hyperplane_data = sparse_diff( 316 | left_inds, left_data, right_inds, right_data 317 | ) 318 | offset_inds, offset_data = sparse_sum(left_inds, left_data, right_inds, right_data) 319 | offset_data = offset_data / 2.0 320 | offset_inds, offset_data = sparse_mul( 321 | hyperplane_inds, hyperplane_data, offset_inds, offset_data 322 | ) 323 | 324 | for d in range(offset_data.shape[0]): 325 | hyperplane_offset -= offset_data[d] 326 | 327 | 328 | 329 | 330 | n_left = 0 331 | n_right = 0 332 | side = np.empty(indices.shape[0], np.int8) 333 | for i in range(indices.shape[0]): 334 | margin = hyperplane_offset 335 | i_inds = inds[indptr[indices[i]] : indptr[indices[i] + 1]] 336 | i_data = data[indptr[indices[i]] : indptr[indices[i] + 1]] 337 | 338 | mul_inds, mul_data = sparse_mul( 339 | hyperplane_inds, hyperplane_data, i_inds, i_data 340 | ) 341 | for d in range(mul_data.shape[0]): 342 | margin += mul_data[d] 343 | 344 | if abs(margin) < EPS: 345 | side[i] = tau_rand_int(rng_state) % 2 346 | if side[i] == 0: 347 | n_left += 1 348 | else: 349 | n_right += 1 350 | elif margin > 0: 351 | side[i] = 0 352 | n_left += 1 353 | else: 354 | side[i] = 1 355 | n_right += 1 356 | 357 | 358 | indices_left = np.empty(n_left, dtype=np.int64) 359 | indices_right = np.empty(n_right, dtype=np.int64) 360 | 361 | 362 | n_left = 0 363 | n_right = 0 364 | for i in range(side.shape[0]): 365 | if side[i] == 0: 366 | indices_left[n_left] = indices[i] 367 | n_left += 1 368 | else: 369 | indices_right[n_right] = indices[i] 370 | n_right += 1 371 | 372 | hyperplane = np.vstack((hyperplane_inds, hyperplane_data)) 373 | 374 | return indices_left, indices_right, hyperplane, hyperplane_offset 375 | 376 | 377 | def make_euclidean_tree(data, 
indices, rng_state, leaf_size=30): 378 | if indices.shape[0] > leaf_size: 379 | left_indices, right_indices, hyperplane, offset = euclidean_random_projection_split( 380 | data, indices, rng_state 381 | ) 382 | 383 | left_node = make_euclidean_tree(data, left_indices, rng_state, leaf_size) 384 | right_node = make_euclidean_tree(data, right_indices, rng_state, leaf_size) 385 | 386 | node = RandomProjectionTreeNode( 387 | None, False, hyperplane, offset, left_node, right_node 388 | ) 389 | else: 390 | node = RandomProjectionTreeNode(indices, True, None, None, None, None) 391 | 392 | return node 393 | 394 | 395 | def make_angular_tree(data, indices, rng_state, leaf_size=30): 396 | if indices.shape[0] > leaf_size: 397 | left_indices, right_indices, hyperplane, offset = angular_random_projection_split( 398 | data, indices, rng_state 399 | ) 400 | 401 | left_node = make_angular_tree(data, left_indices, rng_state, leaf_size) 402 | right_node = make_angular_tree(data, right_indices, rng_state, leaf_size) 403 | 404 | node = RandomProjectionTreeNode( 405 | None, False, hyperplane, offset, left_node, right_node 406 | ) 407 | else: 408 | node = RandomProjectionTreeNode(indices, True, None, None, None, None) 409 | 410 | return node 411 | 412 | 413 | def make_sparse_euclidean_tree(inds, indptr, data, indices, rng_state, leaf_size=30): 414 | if indices.shape[0] > leaf_size: 415 | left_indices, right_indices, hyperplane, offset = sparse_euclidean_random_projection_split( 416 | inds, indptr, data, indices, rng_state 417 | ) 418 | 419 | left_node = make_sparse_euclidean_tree( 420 | inds, indptr, data, left_indices, rng_state, leaf_size 421 | ) 422 | right_node = make_sparse_euclidean_tree( 423 | inds, indptr, data, right_indices, rng_state, leaf_size 424 | ) 425 | 426 | node = RandomProjectionTreeNode( 427 | None, False, hyperplane, offset, left_node, right_node 428 | ) 429 | else: 430 | node = RandomProjectionTreeNode(indices, True, None, None, None, None) 431 | 432 | return node 433 | 434 | 435 | def make_sparse_angular_tree(inds, indptr, data, indices, rng_state, leaf_size=30): 436 | if indices.shape[0] > leaf_size: 437 | left_indices, right_indices, hyperplane, offset = sparse_angular_random_projection_split( 438 | inds, indptr, data, indices, rng_state 439 | ) 440 | 441 | left_node = make_sparse_angular_tree( 442 | inds, indptr, data, left_indices, rng_state, leaf_size 443 | ) 444 | right_node = make_sparse_angular_tree( 445 | inds, indptr, data, right_indices, rng_state, leaf_size 446 | ) 447 | 448 | node = RandomProjectionTreeNode( 449 | None, False, hyperplane, offset, left_node, right_node 450 | ) 451 | else: 452 | node = RandomProjectionTreeNode(indices, True, None, None, None, None) 453 | 454 | return node 455 | 456 | 457 | def make_tree(data, rng_state, leaf_size=30, angular=False): 458 | 459 | is_sparse = scipy.sparse.isspmatrix_csr(data) 460 | indices = np.arange(data.shape[0]) 461 | 462 | 463 | if is_sparse: 464 | inds = data.indices 465 | indptr = data.indptr 466 | spdata = data.data 467 | 468 | if angular: 469 | return make_sparse_angular_tree( 470 | inds, indptr, spdata, indices, rng_state, leaf_size 471 | ) 472 | else: 473 | return make_sparse_euclidean_tree( 474 | inds, indptr, spdata, indices, rng_state, leaf_size 475 | ) 476 | else: 477 | if angular: 478 | return make_angular_tree(data, indices, rng_state, leaf_size) 479 | else: 480 | return make_euclidean_tree(data, indices, rng_state, leaf_size) 481 | 482 | 483 | def num_nodes(tree): 484 | if tree.is_leaf: 485 | return 1 486 | else: 
487 | return 1 + num_nodes(tree.left_child) + num_nodes(tree.right_child) 488 | 489 | 490 | def num_leaves(tree): 491 | if tree.is_leaf: 492 | return 1 493 | else: 494 | return num_leaves(tree.left_child) + num_leaves(tree.right_child) 495 | 496 | 497 | def max_sparse_hyperplane_size(tree): 498 | if tree.is_leaf: 499 | return 0 500 | else: 501 | return max( 502 | tree.hyperplane.shape[1], 503 | max_sparse_hyperplane_size(tree.left_child), 504 | max_sparse_hyperplane_size(tree.right_child), 505 | ) 506 | 507 | 508 | def recursive_flatten( 509 | tree, hyperplanes, offsets, children, indices, node_num, leaf_num 510 | ): 511 | if tree.is_leaf: 512 | children[node_num, 0] = -leaf_num 513 | indices[leaf_num, : tree.indices.shape[0]] = tree.indices 514 | leaf_num += 1 515 | return node_num, leaf_num 516 | else: 517 | if len(tree.hyperplane.shape) > 1: 518 | 519 | hyperplanes[node_num][:, : tree.hyperplane.shape[1]] = tree.hyperplane 520 | else: 521 | hyperplanes[node_num] = tree.hyperplane 522 | offsets[node_num] = tree.offset 523 | children[node_num, 0] = node_num + 1 524 | old_node_num = node_num 525 | node_num, leaf_num = recursive_flatten( 526 | tree.left_child, 527 | hyperplanes, 528 | offsets, 529 | children, 530 | indices, 531 | node_num + 1, 532 | leaf_num, 533 | ) 534 | children[old_node_num, 1] = node_num + 1 535 | node_num, leaf_num = recursive_flatten( 536 | tree.right_child, 537 | hyperplanes, 538 | offsets, 539 | children, 540 | indices, 541 | node_num + 1, 542 | leaf_num, 543 | ) 544 | return node_num, leaf_num 545 | 546 | 547 | def flatten_tree(tree, leaf_size): 548 | n_nodes = num_nodes(tree) 549 | n_leaves = num_leaves(tree) 550 | 551 | if len(tree.hyperplane.shape) > 1: 552 | 553 | max_hyperplane_nnz = max_sparse_hyperplane_size(tree) 554 | hyperplanes = np.zeros( 555 | (n_nodes, tree.hyperplane.shape[0], max_hyperplane_nnz), dtype=np.float32 556 | ) 557 | else: 558 | hyperplanes = np.zeros((n_nodes, tree.hyperplane.shape[0]), dtype=np.float32) 559 | 560 | offsets = np.zeros(n_nodes, dtype=np.float32) 561 | children = -1 * np.ones((n_nodes, 2), dtype=np.int64) 562 | indices = -1 * np.ones((n_leaves, leaf_size), dtype=np.int64) 563 | recursive_flatten(tree, hyperplanes, offsets, children, indices, 0, 0) 564 | return FlatTree(hyperplanes, offsets, children, indices) 565 | 566 | 567 | @numba.njit() 568 | def select_side(hyperplane, offset, point, rng_state): 569 | margin = offset 570 | for d in range(point.shape[0]): 571 | margin += hyperplane[d] * point[d] 572 | 573 | if abs(margin) < EPS: 574 | side = tau_rand_int(rng_state) % 2 575 | if side == 0: 576 | return 0 577 | else: 578 | return 1 579 | elif margin > 0: 580 | return 0 581 | else: 582 | return 1 583 | 584 | 585 | @numba.njit() 586 | def search_flat_tree(point, hyperplanes, offsets, children, indices, rng_state): 587 | node = 0 588 | while children[node, 0] > 0: 589 | side = select_side(hyperplanes[node], offsets[node], point, rng_state) 590 | if side == 0: 591 | node = children[node, 0] 592 | else: 593 | node = children[node, 1] 594 | 595 | return indices[-children[node, 0]] 596 | 597 | 598 | def make_forest(data, n_neighbors, n_trees, rng_state, angular=False): 599 | 600 | result = [] 601 | leaf_size = max(10, n_neighbors) 602 | try: 603 | result = [ 604 | flatten_tree(make_tree(data, rng_state, leaf_size, angular), leaf_size) 605 | for i in range(n_trees) 606 | ] 607 | except (RuntimeError, RecursionError, SystemError): 608 | warn( 609 | "Random Projection forest initialisation failed due to recursion" 610 | "limit 
being reached. Something is a little strange with your " 611 | "data, and this may take longer than normal to compute." 612 | ) 613 | 614 | return result 615 | 616 | 617 | def rptree_leaf_array(rp_forest): 618 | 619 | if len(rp_forest) > 0: 620 | leaf_array = np.vstack([tree.indices for tree in rp_forest]) 621 | else: 622 | leaf_array = np.array([[-1]]) 623 | 624 | return leaf_array 625 | 626 | 627 | 628 | 629 | 630 | 631 | 632 | 633 | 634 | 635 | 636 | 637 | import numpy as np 638 | import numba 639 | 640 | _mock_identity = np.eye(2, dtype=np.float64) 641 | _mock_ones = np.ones(2, dtype=np.float64) 642 | 643 | 644 | @numba.njit(fastmath=True) 645 | def euclidean(x, y): 646 | 647 | result = 0.0 648 | for i in range(x.shape[0]): 649 | result += (x[i] - y[i]) ** 2 650 | return np.sqrt(result) 651 | 652 | 653 | @numba.njit() 654 | def standardised_euclidean(x, y, sigma=_mock_ones): 655 | 656 | result = 0.0 657 | for i in range(x.shape[0]): 658 | result += ((x[i] - y[i]) ** 2) / sigma[i] 659 | 660 | return np.sqrt(result) 661 | 662 | 663 | @numba.njit() 664 | def manhattan(x, y): 665 | 666 | result = 0.0 667 | for i in range(x.shape[0]): 668 | result += np.abs(x[i] - y[i]) 669 | 670 | return result 671 | 672 | 673 | @numba.njit() 674 | def chebyshev(x, y): 675 | 676 | result = 0.0 677 | for i in range(x.shape[0]): 678 | result = max(result, np.abs(x[i] - y[i])) 679 | 680 | return result 681 | 682 | 683 | @numba.njit() 684 | def minkowski(x, y, p=2): 685 | 686 | result = 0.0 687 | for i in range(x.shape[0]): 688 | result += (np.abs(x[i] - y[i])) ** p 689 | 690 | return result ** (1.0 / p) 691 | 692 | 693 | @numba.njit() 694 | def weighted_minkowski(x, y, w=_mock_ones, p=2): 695 | 696 | result = 0.0 697 | for i in range(x.shape[0]): 698 | result += (w[i] * np.abs(x[i] - y[i])) ** p 699 | 700 | return result ** (1.0 / p) 701 | 702 | 703 | @numba.njit() 704 | def mahalanobis(x, y, vinv=_mock_identity): 705 | result = 0.0 706 | 707 | diff = np.empty(x.shape[0], dtype=np.float64) 708 | 709 | for i in range(x.shape[0]): 710 | diff[i] = x[i] - y[i] 711 | 712 | for i in range(x.shape[0]): 713 | tmp = 0.0 714 | for j in range(x.shape[0]): 715 | tmp += vinv[i, j] * diff[j] 716 | result += tmp * diff[i] 717 | 718 | return np.sqrt(result) 719 | 720 | 721 | @numba.njit() 722 | def hamming(x, y): 723 | result = 0.0 724 | for i in range(x.shape[0]): 725 | if x[i] != y[i]: 726 | result += 1.0 727 | 728 | return float(result) / x.shape[0] 729 | 730 | 731 | @numba.njit() 732 | def canberra(x, y): 733 | result = 0.0 734 | for i in range(x.shape[0]): 735 | denominator = np.abs(x[i]) + np.abs(y[i]) 736 | if denominator > 0: 737 | result += np.abs(x[i] - y[i]) / denominator 738 | 739 | return result 740 | 741 | 742 | @numba.njit() 743 | def bray_curtis(x, y): 744 | numerator = 0.0 745 | denominator = 0.0 746 | for i in range(x.shape[0]): 747 | numerator += np.abs(x[i] - y[i]) 748 | denominator += np.abs(x[i] + y[i]) 749 | 750 | if denominator > 0.0: 751 | return float(numerator) / denominator 752 | else: 753 | return 0.0 754 | 755 | 756 | @numba.njit() 757 | def jaccard(x, y): 758 | num_non_zero = 0.0 759 | num_equal = 0.0 760 | for i in range(x.shape[0]): 761 | x_true = x[i] != 0 762 | y_true = y[i] != 0 763 | num_non_zero += x_true or y_true 764 | num_equal += x_true and y_true 765 | 766 | if num_non_zero == 0.0: 767 | return 0.0 768 | else: 769 | return float(num_non_zero - num_equal) / num_non_zero 770 | 771 | 772 | @numba.njit() 773 | def matching(x, y): 774 | num_not_equal = 0.0 775 | for i in 
range(x.shape[0]): 776 | x_true = x[i] != 0 777 | y_true = y[i] != 0 778 | num_not_equal += x_true != y_true 779 | 780 | return float(num_not_equal) / x.shape[0] 781 | 782 | 783 | @numba.njit() 784 | def dice(x, y): 785 | num_true_true = 0.0 786 | num_not_equal = 0.0 787 | for i in range(x.shape[0]): 788 | x_true = x[i] != 0 789 | y_true = y[i] != 0 790 | num_true_true += x_true and y_true 791 | num_not_equal += x_true != y_true 792 | 793 | if num_not_equal == 0.0: 794 | return 0.0 795 | else: 796 | return num_not_equal / (2.0 * num_true_true + num_not_equal) 797 | 798 | 799 | @numba.njit() 800 | def kulsinski(x, y): 801 | num_true_true = 0.0 802 | num_not_equal = 0.0 803 | for i in range(x.shape[0]): 804 | x_true = x[i] != 0 805 | y_true = y[i] != 0 806 | num_true_true += x_true and y_true 807 | num_not_equal += x_true != y_true 808 | 809 | if num_not_equal == 0: 810 | return 0.0 811 | else: 812 | return float(num_not_equal - num_true_true + x.shape[0]) / ( 813 | num_not_equal + x.shape[0] 814 | ) 815 | 816 | 817 | @numba.njit() 818 | def rogers_tanimoto(x, y): 819 | num_not_equal = 0.0 820 | for i in range(x.shape[0]): 821 | x_true = x[i] != 0 822 | y_true = y[i] != 0 823 | num_not_equal += x_true != y_true 824 | 825 | return (2.0 * num_not_equal) / (x.shape[0] + num_not_equal) 826 | 827 | 828 | @numba.njit() 829 | def russellrao(x, y): 830 | num_true_true = 0.0 831 | for i in range(x.shape[0]): 832 | x_true = x[i] != 0 833 | y_true = y[i] != 0 834 | num_true_true += x_true and y_true 835 | 836 | if num_true_true == np.sum(x != 0) and num_true_true == np.sum(y != 0): 837 | return 0.0 838 | else: 839 | return float(x.shape[0] - num_true_true) / (x.shape[0]) 840 | 841 | 842 | @numba.njit() 843 | def sokal_michener(x, y): 844 | num_not_equal = 0.0 845 | for i in range(x.shape[0]): 846 | x_true = x[i] != 0 847 | y_true = y[i] != 0 848 | num_not_equal += x_true != y_true 849 | 850 | return (2.0 * num_not_equal) / (x.shape[0] + num_not_equal) 851 | 852 | 853 | @numba.njit() 854 | def sokal_sneath(x, y): 855 | num_true_true = 0.0 856 | num_not_equal = 0.0 857 | for i in range(x.shape[0]): 858 | x_true = x[i] != 0 859 | y_true = y[i] != 0 860 | num_true_true += x_true and y_true 861 | num_not_equal += x_true != y_true 862 | 863 | if num_not_equal == 0.0: 864 | return 0.0 865 | else: 866 | return num_not_equal / (0.5 * num_true_true + num_not_equal) 867 | 868 | 869 | @numba.njit() 870 | def haversine(x, y): 871 | if x.shape[0] != 2: 872 | raise ValueError("haversine is only defined for 2 dimensional data") 873 | sin_lat = np.sin(0.5 * (x[0] - y[0])) 874 | sin_long = np.sin(0.5 * (x[1] - y[1])) 875 | result = np.sqrt(sin_lat ** 2 + np.cos(x[0]) * np.cos(y[0]) * sin_long ** 2) 876 | return 2.0 * np.arcsin(result) 877 | 878 | 879 | @numba.njit() 880 | def yule(x, y): 881 | num_true_true = 0.0 882 | num_true_false = 0.0 883 | num_false_true = 0.0 884 | for i in range(x.shape[0]): 885 | x_true = x[i] != 0 886 | y_true = y[i] != 0 887 | num_true_true += x_true and y_true 888 | num_true_false += x_true and (not y_true) 889 | num_false_true += (not x_true) and y_true 890 | 891 | num_false_false = x.shape[0] - num_true_true - num_true_false - num_false_true 892 | 893 | if num_true_false == 0.0 or num_false_true == 0.0: 894 | return 0.0 895 | else: 896 | return (2.0 * num_true_false * num_false_true) / ( 897 | num_true_true * num_false_false + num_true_false * num_false_true 898 | ) 899 | 900 | 901 | @numba.njit() 902 | def cosine(x, y): 903 | result = 0.0 904 | norm_x = 0.0 905 | norm_y = 0.0 906 | for i 
in range(x.shape[0]): 907 | result += x[i] * y[i] 908 | norm_x += x[i] ** 2 909 | norm_y += y[i] ** 2 910 | 911 | if norm_x == 0.0 and norm_y == 0.0: 912 | return 0.0 913 | elif norm_x == 0.0 or norm_y == 0.0: 914 | return 1.0 915 | else: 916 | return 1.0 - (result / np.sqrt(norm_x * norm_y)) 917 | 918 | 919 | @numba.njit() 920 | def correlation(x, y): 921 | mu_x = 0.0 922 | mu_y = 0.0 923 | norm_x = 0.0 924 | norm_y = 0.0 925 | dot_product = 0.0 926 | 927 | for i in range(x.shape[0]): 928 | mu_x += x[i] 929 | mu_y += y[i] 930 | 931 | mu_x /= x.shape[0] 932 | mu_y /= x.shape[0] 933 | 934 | for i in range(x.shape[0]): 935 | shifted_x = x[i] - mu_x 936 | shifted_y = y[i] - mu_y 937 | norm_x += shifted_x ** 2 938 | norm_y += shifted_y ** 2 939 | dot_product += shifted_x * shifted_y 940 | 941 | if norm_x == 0.0 and norm_y == 0.0: 942 | return 0.0 943 | elif dot_product == 0.0: 944 | return 1.0 945 | else: 946 | return 1.0 - (dot_product / np.sqrt(norm_x * norm_y)) 947 | 948 | 949 | named_distances = { 950 | 951 | "euclidean": euclidean, 952 | "l2": euclidean, 953 | "manhattan": manhattan, 954 | "taxicab": manhattan, 955 | "l1": manhattan, 956 | "chebyshev": chebyshev, 957 | "linfinity": chebyshev, 958 | "linfty": chebyshev, 959 | "linf": chebyshev, 960 | "minkowski": minkowski, 961 | 962 | "seuclidean": standardised_euclidean, 963 | "standardised_euclidean": standardised_euclidean, 964 | "wminkowski": weighted_minkowski, 965 | "weighted_minkowski": weighted_minkowski, 966 | "mahalanobis": mahalanobis, 967 | 968 | "canberra": canberra, 969 | "cosine": cosine, 970 | "correlation": correlation, 971 | "haversine": haversine, 972 | "braycurtis": bray_curtis, 973 | 974 | "hamming": hamming, 975 | "jaccard": jaccard, 976 | "dice": dice, 977 | "matching": matching, 978 | "kulsinski": kulsinski, 979 | "rogerstanimoto": rogers_tanimoto, 980 | "russellrao": russellrao, 981 | "sokalsneath": sokal_sneath, 982 | "sokalmichener": sokal_michener, 983 | "yule": yule, 984 | } 985 | 986 | 987 | 988 | 989 | 990 | 991 | 992 | 993 | 994 | 995 | 996 | 997 | 998 | 999 | import time 1000 | 1001 | import numpy as np 1002 | import numba 1003 | 1004 | 1005 | @numba.njit(parallel=True) 1006 | def fast_knn_indices(X, n_neighbors): 1007 | 1008 | knn_indices = np.empty( 1009 | (X.shape[0], n_neighbors), dtype=np.int32 1010 | ) 1011 | for row in numba.prange(X.shape[0]): 1012 | v = X[row].argsort(kind="quicksort") 1013 | v = v[:n_neighbors] 1014 | knn_indices[row] = v 1015 | return knn_indices 1016 | 1017 | 1018 | @numba.njit("i4(i8[:])") 1019 | def tau_rand_int(state): 1020 | 1021 | state[0] = ( 1022 | ((state[0] & 4294967294) << 12) & 0xFFFFFFFF 1023 | ) ^ ((((state[0] << 13) & 0xFFFFFFFF) ^ state[0]) >> 19) 1024 | state[1] = ( 1025 | ((state[1] & 4294967288) << 4) & 0xFFFFFFFF 1026 | ) ^ ((((state[1] << 2) & 0xFFFFFFFF) ^ state[1]) >> 25) 1027 | state[2] = ( 1028 | ((state[2] & 4294967280) << 17) & 0xFFFFFFFF 1029 | ) ^ ((((state[2] << 3) & 0xFFFFFFFF) ^ state[2]) >> 11) 1030 | 1031 | return state[0] ^ state[1] ^ state[2] 1032 | 1033 | 1034 | @numba.njit("f4(i8[:])") 1035 | def tau_rand(state): 1036 | 1037 | integer = tau_rand_int(state) 1038 | return abs(float(integer) / 0x7FFFFFFF) 1039 | 1040 | 1041 | @numba.njit() 1042 | def norm(vec): 1043 | 1044 | result = 0.0 1045 | for i in range(vec.shape[0]): 1046 | result += vec[i] ** 2 1047 | return np.sqrt(result) 1048 | 1049 | 1050 | @numba.njit() 1051 | def rejection_sample(n_samples, pool_size, rng_state): 1052 | 1053 | result = np.empty(n_samples, dtype=np.int64) 1054 | 
for i in range(n_samples): 1055 | reject_sample = True 1056 | while reject_sample: 1057 | j = tau_rand_int(rng_state) % pool_size 1058 | for k in range(i): 1059 | if j == result[k]: 1060 | break 1061 | else: 1062 | reject_sample = False 1063 | result[i] = j 1064 | return result 1065 | 1066 | 1067 | @numba.njit("f8[:, :, :](i8,i8)") 1068 | def make_heap(n_points, size): 1069 | 1070 | result = np.zeros( 1071 | (3, int(n_points), int(size)), dtype=np.float64 1072 | ) 1073 | result[0] = -1 1074 | result[1] = np.infty 1075 | result[2] = 0 1076 | 1077 | return result 1078 | 1079 | 1080 | @numba.njit("i8(f8[:,:,:],i8,f8,i8,i8)") 1081 | def heap_push(heap, row, weight, index, flag): 1082 | 1083 | row = int(row) 1084 | indices = heap[0, row] 1085 | weights = heap[1, row] 1086 | is_new = heap[2, row] 1087 | 1088 | if weight >= weights[0]: 1089 | return 0 1090 | 1091 | 1092 | for i in range(indices.shape[0]): 1093 | if index == indices[i]: 1094 | return 0 1095 | 1096 | 1097 | weights[0] = weight 1098 | indices[0] = index 1099 | is_new[0] = flag 1100 | 1101 | 1102 | i = 0 1103 | while True: 1104 | ic1 = 2 * i + 1 1105 | ic2 = ic1 + 1 1106 | 1107 | if ic1 >= heap.shape[2]: 1108 | break 1109 | elif ic2 >= heap.shape[2]: 1110 | if weights[ic1] > weight: 1111 | i_swap = ic1 1112 | else: 1113 | break 1114 | elif weights[ic1] >= weights[ic2]: 1115 | if weight < weights[ic1]: 1116 | i_swap = ic1 1117 | else: 1118 | break 1119 | else: 1120 | if weight < weights[ic2]: 1121 | i_swap = ic2 1122 | else: 1123 | break 1124 | 1125 | weights[i] = weights[i_swap] 1126 | indices[i] = indices[i_swap] 1127 | is_new[i] = is_new[i_swap] 1128 | 1129 | i = i_swap 1130 | 1131 | weights[i] = weight 1132 | indices[i] = index 1133 | is_new[i] = flag 1134 | 1135 | return 1 1136 | 1137 | 1138 | @numba.njit("i8(f8[:,:,:],i8,f8,i8,i8)") 1139 | def unchecked_heap_push(heap, row, weight, index, flag): 1140 | 1141 | indices = heap[0, row] 1142 | weights = heap[1, row] 1143 | is_new = heap[2, row] 1144 | 1145 | if weight >= weights[0]: 1146 | return 0 1147 | 1148 | 1149 | weights[0] = weight 1150 | indices[0] = index 1151 | is_new[0] = flag 1152 | 1153 | 1154 | i = 0 1155 | while True: 1156 | ic1 = 2 * i + 1 1157 | ic2 = ic1 + 1 1158 | 1159 | if ic1 >= heap.shape[2]: 1160 | break 1161 | elif ic2 >= heap.shape[2]: 1162 | if weights[ic1] > weight: 1163 | i_swap = ic1 1164 | else: 1165 | break 1166 | elif weights[ic1] >= weights[ic2]: 1167 | if weight < weights[ic1]: 1168 | i_swap = ic1 1169 | else: 1170 | break 1171 | else: 1172 | if weight < weights[ic2]: 1173 | i_swap = ic2 1174 | else: 1175 | break 1176 | 1177 | weights[i] = weights[i_swap] 1178 | indices[i] = indices[i_swap] 1179 | is_new[i] = is_new[i_swap] 1180 | 1181 | i = i_swap 1182 | 1183 | weights[i] = weight 1184 | indices[i] = index 1185 | is_new[i] = flag 1186 | 1187 | return 1 1188 | 1189 | 1190 | @numba.njit() 1191 | def siftdown(heap1, heap2, elt): 1192 | 1193 | while elt * 2 + 1 < heap1.shape[0]: 1194 | left_child = elt * 2 + 1 1195 | right_child = left_child + 1 1196 | swap = elt 1197 | 1198 | if heap1[swap] < heap1[left_child]: 1199 | swap = left_child 1200 | 1201 | if ( 1202 | right_child < heap1.shape[0] 1203 | and heap1[swap] < heap1[right_child] 1204 | ): 1205 | swap = right_child 1206 | 1207 | if swap == elt: 1208 | break 1209 | else: 1210 | heap1[elt], heap1[swap] = ( 1211 | heap1[swap], 1212 | heap1[elt], 1213 | ) 1214 | heap2[elt], heap2[swap] = ( 1215 | heap2[swap], 1216 | heap2[elt], 1217 | ) 1218 | elt = swap 1219 | 1220 | 1221 | @numba.njit() 1222 | def 
deheap_sort(heap): 1223 | 1224 | indices = heap[0] 1225 | weights = heap[1] 1226 | 1227 | for i in range(indices.shape[0]): 1228 | 1229 | ind_heap = indices[i] 1230 | dist_heap = weights[i] 1231 | 1232 | for j in range(ind_heap.shape[0] - 1): 1233 | ind_heap[0], ind_heap[ 1234 | ind_heap.shape[0] - j - 1 1235 | ] = ( 1236 | ind_heap[ind_heap.shape[0] - j - 1], 1237 | ind_heap[0], 1238 | ) 1239 | dist_heap[0], dist_heap[ 1240 | dist_heap.shape[0] - j - 1 1241 | ] = ( 1242 | dist_heap[dist_heap.shape[0] - j - 1], 1243 | dist_heap[0], 1244 | ) 1245 | 1246 | siftdown( 1247 | dist_heap[: dist_heap.shape[0] - j - 1], 1248 | ind_heap[: ind_heap.shape[0] - j - 1], 1249 | 0, 1250 | ) 1251 | 1252 | return indices.astype(np.int64), weights 1253 | 1254 | 1255 | @numba.njit("i8(f8[:, :, :],i8)") 1256 | def smallest_flagged(heap, row): 1257 | 1258 | ind = heap[0, row] 1259 | dist = heap[1, row] 1260 | flag = heap[2, row] 1261 | 1262 | min_dist = np.inf 1263 | result_index = -1 1264 | 1265 | for i in range(ind.shape[0]): 1266 | if flag[i] == 1 and dist[i] < min_dist: 1267 | min_dist = dist[i] 1268 | result_index = i 1269 | 1270 | if result_index >= 0: 1271 | flag[result_index] = 0.0 1272 | return int(ind[result_index]) 1273 | else: 1274 | return -1 1275 | 1276 | 1277 | @numba.njit(parallel=True) 1278 | def build_candidates( 1279 | current_graph, 1280 | n_vertices, 1281 | n_neighbors, 1282 | max_candidates, 1283 | rng_state, 1284 | ): 1285 | 1286 | candidate_neighbors = make_heap( 1287 | n_vertices, max_candidates 1288 | ) 1289 | for i in range(n_vertices): 1290 | for j in range(n_neighbors): 1291 | if current_graph[0, i, j] < 0: 1292 | continue 1293 | idx = current_graph[0, i, j] 1294 | isn = current_graph[2, i, j] 1295 | d = tau_rand(rng_state) 1296 | heap_push(candidate_neighbors, i, d, idx, isn) 1297 | heap_push(candidate_neighbors, idx, d, i, isn) 1298 | current_graph[2, i, j] = 0 1299 | 1300 | return candidate_neighbors 1301 | 1302 | 1303 | @numba.njit(parallel=True) 1304 | def new_build_candidates( 1305 | current_graph, 1306 | n_vertices, 1307 | n_neighbors, 1308 | max_candidates, 1309 | rng_state, 1310 | rho=0.5, 1311 | ): 1312 | 1313 | new_candidate_neighbors = make_heap( 1314 | n_vertices, max_candidates 1315 | ) 1316 | old_candidate_neighbors = make_heap( 1317 | n_vertices, max_candidates 1318 | ) 1319 | 1320 | for i in numba.prange(n_vertices): 1321 | for j in range(n_neighbors): 1322 | if current_graph[0, i, j] < 0: 1323 | continue 1324 | idx = current_graph[0, i, j] 1325 | isn = current_graph[2, i, j] 1326 | d = tau_rand(rng_state) 1327 | if tau_rand(rng_state) < rho: 1328 | c = 0 1329 | if isn: 1330 | c += heap_push( 1331 | new_candidate_neighbors, 1332 | i, 1333 | d, 1334 | idx, 1335 | isn, 1336 | ) 1337 | c += heap_push( 1338 | new_candidate_neighbors, 1339 | idx, 1340 | d, 1341 | i, 1342 | isn, 1343 | ) 1344 | else: 1345 | heap_push( 1346 | old_candidate_neighbors, 1347 | i, 1348 | d, 1349 | idx, 1350 | isn, 1351 | ) 1352 | heap_push( 1353 | old_candidate_neighbors, 1354 | idx, 1355 | d, 1356 | i, 1357 | isn, 1358 | ) 1359 | 1360 | if c > 0: 1361 | current_graph[2, i, j] = 0 1362 | 1363 | return new_candidate_neighbors, old_candidate_neighbors 1364 | 1365 | 1366 | @numba.njit(parallel=True) 1367 | def submatrix(dmat, indices_col, n_neighbors): 1368 | 1369 | n_samples_transform, n_samples_fit = dmat.shape 1370 | submat = np.zeros( 1371 | (n_samples_transform, n_neighbors), dtype=dmat.dtype 1372 | ) 1373 | for i in numba.prange(n_samples_transform): 1374 | for j in 
numba.prange(n_neighbors): 1375 | submat[i, j] = dmat[i, indices_col[i, j]] 1376 | return submat 1377 | 1378 | 1379 | 1380 | def ts(): 1381 | return time.ctime(time.time()) 1382 | 1383 | 1384 | 1385 | 1386 | 1387 | 1388 | 1389 | 1390 | 1391 | 1392 | 1393 | 1394 | 1395 | 1396 | 1397 | 1398 | import numpy as np 1399 | import numba 1400 | 1401 | 1402 | 1403 | 1404 | 1405 | 1406 | 1407 | 1408 | 1409 | 1410 | 1411 | 1412 | 1413 | #from umap.rp_tree import search_flat_tree 1414 | 1415 | 1416 | def make_nn_descent(dist, dist_args): 1417 | 1418 | 1419 | @numba.njit() 1420 | def nn_descent( 1421 | data, 1422 | n_neighbors, 1423 | rng_state, 1424 | max_candidates=50, 1425 | n_iters=10, 1426 | delta=0.001, 1427 | rho=0.5, 1428 | rp_tree_init=True, 1429 | leaf_array=None, 1430 | verbose=False, 1431 | ): 1432 | n_vertices = data.shape[0] 1433 | 1434 | current_graph = make_heap(data.shape[0], n_neighbors) 1435 | for i in range(data.shape[0]): 1436 | indices = rejection_sample(n_neighbors, data.shape[0], rng_state) 1437 | for j in range(indices.shape[0]): 1438 | d = dist(data[i], data[indices[j]], *dist_args) 1439 | heap_push(current_graph, i, d, indices[j], 1) 1440 | heap_push(current_graph, indices[j], d, i, 1) 1441 | 1442 | if rp_tree_init: 1443 | for n in range(leaf_array.shape[0]): 1444 | for i in range(leaf_array.shape[1]): 1445 | if leaf_array[n, i] < 0: 1446 | break 1447 | for j in range(i + 1, leaf_array.shape[1]): 1448 | if leaf_array[n, j] < 0: 1449 | break 1450 | d = dist( 1451 | data[leaf_array[n, i]], data[leaf_array[n, j]], *dist_args 1452 | ) 1453 | heap_push( 1454 | current_graph, leaf_array[n, i], d, leaf_array[n, j], 1 1455 | ) 1456 | heap_push( 1457 | current_graph, leaf_array[n, j], d, leaf_array[n, i], 1 1458 | ) 1459 | 1460 | for n in range(n_iters): 1461 | if verbose: 1462 | print("\t", n, " / ", n_iters) 1463 | 1464 | candidate_neighbors = build_candidates( 1465 | current_graph, n_vertices, n_neighbors, max_candidates, rng_state 1466 | ) 1467 | 1468 | c = 0 1469 | for i in range(n_vertices): 1470 | for j in range(max_candidates): 1471 | p = int(candidate_neighbors[0, i, j]) 1472 | if p < 0 or tau_rand(rng_state) < rho: 1473 | continue 1474 | for k in range(max_candidates): 1475 | q = int(candidate_neighbors[0, i, k]) 1476 | if ( 1477 | q < 0 1478 | or not candidate_neighbors[2, i, j] 1479 | and not candidate_neighbors[2, i, k] 1480 | ): 1481 | continue 1482 | 1483 | d = dist(data[p], data[q], *dist_args) 1484 | c += heap_push(current_graph, p, d, q, 1) 1485 | c += heap_push(current_graph, q, d, p, 1) 1486 | 1487 | if c <= delta * n_neighbors * data.shape[0]: 1488 | break 1489 | 1490 | return deheap_sort(current_graph) 1491 | 1492 | return nn_descent 1493 | 1494 | 1495 | def make_initialisations(dist, dist_args): 1496 | @numba.njit(parallel=True) 1497 | def init_from_random(n_neighbors, data, query_points, heap, rng_state): 1498 | for i in range(query_points.shape[0]): 1499 | indices = rejection_sample(n_neighbors, data.shape[0], rng_state) 1500 | for j in range(indices.shape[0]): 1501 | if indices[j] < 0: 1502 | continue 1503 | d = dist(data[indices[j]], query_points[i], *dist_args) 1504 | heap_push(heap, i, d, indices[j], 1) 1505 | return 1506 | 1507 | @numba.njit(parallel=True) 1508 | def init_from_tree(tree, data, query_points, heap, rng_state): 1509 | for i in range(query_points.shape[0]): 1510 | indices = search_flat_tree( 1511 | query_points[i], 1512 | tree.hyperplanes, 1513 | tree.offsets, 1514 | tree.children, 1515 | tree.indices, 1516 | rng_state, 1517 | ) 1518 | 1519 | 
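            #(comment added for clarity, not in the original source) the loop below pushes every
            #leaf index returned by the flat-tree search onto the query point's heap as an
            #initial candidate neighbour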
for j in range(indices.shape[0]): 1520 | if indices[j] < 0: 1521 | continue 1522 | d = dist(data[indices[j]], query_points[i], *dist_args) 1523 | heap_push(heap, i, d, indices[j], 1) 1524 | 1525 | return 1526 | 1527 | return init_from_random, init_from_tree 1528 | 1529 | 1530 | def initialise_search( 1531 | forest, data, query_points, n_neighbors, init_from_random, init_from_tree, rng_state 1532 | ): 1533 | results = make_heap(query_points.shape[0], n_neighbors) 1534 | init_from_random(n_neighbors, data, query_points, results, rng_state) 1535 | if forest is not None: 1536 | for tree in forest: 1537 | init_from_tree(tree, data, query_points, results, rng_state) 1538 | 1539 | return results 1540 | 1541 | 1542 | def make_initialized_nnd_search(dist, dist_args): 1543 | @numba.njit(parallel=True) 1544 | def initialized_nnd_search(data, indptr, indices, initialization, query_points): 1545 | 1546 | for i in numba.prange(query_points.shape[0]): 1547 | 1548 | tried = set(initialization[0, i]) 1549 | 1550 | while True: 1551 | 1552 | 1553 | vertex = smallest_flagged(initialization, i) 1554 | 1555 | if vertex == -1: 1556 | break 1557 | candidates = indices[indptr[vertex] : indptr[vertex + 1]] 1558 | for j in range(candidates.shape[0]): 1559 | if ( 1560 | candidates[j] == vertex 1561 | or candidates[j] == -1 1562 | or candidates[j] in tried 1563 | ): 1564 | continue 1565 | d = dist(data[candidates[j]], query_points[i], *dist_args) 1566 | unchecked_heap_push(initialization, i, d, candidates[j], 1) 1567 | tried.add(candidates[j]) 1568 | 1569 | return initialization 1570 | 1571 | return initialized_nnd_search 1572 | 1573 | 1574 | 1575 | 1576 | 1577 | 1578 | 1579 | 1580 | 1581 | 1582 | 1583 | 1584 | 1585 | 1586 | 1587 | 1588 | 1589 | import numpy as np 1590 | import numba 1591 | 1592 | 1593 | 1594 | 1595 | 1596 | 1597 | 1598 | 1599 | 1600 | 1601 | 1602 | 1603 | import locale 1604 | 1605 | locale.setlocale(locale.LC_NUMERIC, "C") 1606 | 1607 | 1608 | @numba.njit() 1609 | def arr_unique(arr): 1610 | aux = np.sort(arr) 1611 | flag = np.concatenate((np.ones(1, dtype=np.bool_), aux[1:] != aux[:-1])) 1612 | return aux[flag] 1613 | 1614 | 1615 | 1616 | @numba.njit() 1617 | def arr_union(ar1, ar2): 1618 | if ar1.shape[0] == 0: 1619 | return ar2 1620 | elif ar2.shape[0] == 0: 1621 | return ar1 1622 | else: 1623 | return arr_unique(np.concatenate((ar1, ar2))) 1624 | 1625 | 1626 | 1627 | 1628 | @numba.njit() 1629 | def arr_intersect(ar1, ar2): 1630 | aux = np.concatenate((ar1, ar2)) 1631 | aux.sort() 1632 | return aux[:-1][aux[1:] == aux[:-1]] 1633 | 1634 | 1635 | @numba.njit() 1636 | def sparse_sum(ind1, data1, ind2, data2): 1637 | result_ind = arr_union(ind1, ind2) 1638 | result_data = np.zeros(result_ind.shape[0], dtype=np.float32) 1639 | 1640 | i1 = 0 1641 | i2 = 0 1642 | nnz = 0 1643 | 1644 | 1645 | while i1 < ind1.shape[0] and i2 < ind2.shape[0]: 1646 | j1 = ind1[i1] 1647 | j2 = ind2[i2] 1648 | 1649 | if j1 == j2: 1650 | val = data1[i1] + data2[i2] 1651 | if val != 0: 1652 | result_ind[nnz] = j1 1653 | result_data[nnz] = val 1654 | nnz += 1 1655 | i1 += 1 1656 | i2 += 1 1657 | elif j1 < j2: 1658 | val = data1[i1] 1659 | if val != 0: 1660 | result_ind[nnz] = j1 1661 | result_data[nnz] = val 1662 | nnz += 1 1663 | i1 += 1 1664 | else: 1665 | val = data2[i2] 1666 | if val != 0: 1667 | result_ind[nnz] = j2 1668 | result_data[nnz] = val 1669 | nnz += 1 1670 | i2 += 1 1671 | 1672 | 1673 | while i1 < ind1.shape[0]: 1674 | val = data1[i1] 1675 | if val != 0: 1676 | result_ind[nnz] = i1 1677 | result_data[nnz] = val 
1678 | nnz += 1 1679 | i1 += 1 1680 | 1681 | while i2 < ind2.shape[0]: 1682 | val = data2[i2] 1683 | if val != 0: 1684 | result_ind[nnz] = i2 1685 | result_data[nnz] = val 1686 | nnz += 1 1687 | i2 += 1 1688 | 1689 | 1690 | result_ind = result_ind[:nnz] 1691 | result_data = result_data[:nnz] 1692 | 1693 | return result_ind, result_data 1694 | 1695 | 1696 | @numba.njit() 1697 | def sparse_diff(ind1, data1, ind2, data2): 1698 | return sparse_sum(ind1, data1, ind2, -data2) 1699 | 1700 | 1701 | @numba.njit() 1702 | def sparse_mul(ind1, data1, ind2, data2): 1703 | result_ind = arr_intersect(ind1, ind2) 1704 | result_data = np.zeros(result_ind.shape[0], dtype=np.float32) 1705 | 1706 | i1 = 0 1707 | i2 = 0 1708 | nnz = 0 1709 | 1710 | 1711 | while i1 < ind1.shape[0] and i2 < ind2.shape[0]: 1712 | j1 = ind1[i1] 1713 | j2 = ind2[i2] 1714 | 1715 | if j1 == j2: 1716 | val = data1[i1] * data2[i2] 1717 | if val != 0: 1718 | result_ind[nnz] = j1 1719 | result_data[nnz] = val 1720 | nnz += 1 1721 | i1 += 1 1722 | i2 += 1 1723 | elif j1 < j2: 1724 | i1 += 1 1725 | else: 1726 | i2 += 1 1727 | 1728 | 1729 | result_ind = result_ind[:nnz] 1730 | result_data = result_data[:nnz] 1731 | 1732 | return result_ind, result_data 1733 | 1734 | 1735 | def make_sparse_nn_descent(sparse_dist, dist_args): 1736 | 1737 | @numba.njit(parallel=True) 1738 | def nn_descent( 1739 | inds, 1740 | indptr, 1741 | data, 1742 | n_vertices, 1743 | n_neighbors, 1744 | rng_state, 1745 | max_candidates=50, 1746 | n_iters=10, 1747 | delta=0.001, 1748 | rho=0.5, 1749 | rp_tree_init=True, 1750 | leaf_array=None, 1751 | verbose=False, 1752 | ): 1753 | 1754 | current_graph = make_heap(n_vertices, n_neighbors) 1755 | for i in range(n_vertices): 1756 | indices = rejection_sample(n_neighbors, n_vertices, rng_state) 1757 | for j in range(indices.shape[0]): 1758 | 1759 | from_inds = inds[indptr[i] : indptr[i + 1]] 1760 | from_data = data[indptr[i] : indptr[i + 1]] 1761 | 1762 | to_inds = inds[indptr[indices[j]] : indptr[indices[j] + 1]] 1763 | to_data = data[indptr[indices[j]] : indptr[indices[j] + 1]] 1764 | 1765 | d = sparse_dist(from_inds, from_data, to_inds, to_data, *dist_args) 1766 | 1767 | heap_push(current_graph, i, d, indices[j], 1) 1768 | heap_push(current_graph, indices[j], d, i, 1) 1769 | 1770 | if rp_tree_init: 1771 | for n in range(leaf_array.shape[0]): 1772 | for i in range(leaf_array.shape[1]): 1773 | if leaf_array[n, i] < 0: 1774 | break 1775 | for j in range(i + 1, leaf_array.shape[1]): 1776 | if leaf_array[n, j] < 0: 1777 | break 1778 | 1779 | from_inds = inds[ 1780 | indptr[leaf_array[n, i]] : indptr[leaf_array[n, i] + 1] 1781 | ] 1782 | from_data = data[ 1783 | indptr[leaf_array[n, i]] : indptr[leaf_array[n, i] + 1] 1784 | ] 1785 | 1786 | to_inds = inds[ 1787 | indptr[leaf_array[n, j]] : indptr[leaf_array[n, j] + 1] 1788 | ] 1789 | to_data = data[ 1790 | indptr[leaf_array[n, j]] : indptr[leaf_array[n, j] + 1] 1791 | ] 1792 | 1793 | d = sparse_dist( 1794 | from_inds, from_data, to_inds, to_data, *dist_args 1795 | ) 1796 | 1797 | heap_push( 1798 | current_graph, leaf_array[n, i], d, leaf_array[n, j], 1 1799 | ) 1800 | heap_push( 1801 | current_graph, leaf_array[n, j], d, leaf_array[n, i], 1 1802 | ) 1803 | 1804 | for n in range(n_iters): 1805 | if verbose: 1806 | print("\t", n, " / ", n_iters) 1807 | 1808 | candidate_neighbors = build_candidates( 1809 | current_graph, n_vertices, n_neighbors, max_candidates, rng_state 1810 | ) 1811 | 1812 | c = 0 1813 | for i in range(n_vertices): 1814 | for j in range(max_candidates): 1815 | p = 
int(candidate_neighbors[0, i, j]) 1816 | if p < 0 or tau_rand(rng_state) < rho: 1817 | continue 1818 | for k in range(max_candidates): 1819 | q = int(candidate_neighbors[0, i, k]) 1820 | if ( 1821 | q < 0 1822 | or not candidate_neighbors[2, i, j] 1823 | and not candidate_neighbors[2, i, k] 1824 | ): 1825 | continue 1826 | 1827 | from_inds = inds[indptr[p] : indptr[p + 1]] 1828 | from_data = data[indptr[p] : indptr[p + 1]] 1829 | 1830 | to_inds = inds[indptr[q] : indptr[q + 1]] 1831 | to_data = data[indptr[q] : indptr[q + 1]] 1832 | 1833 | d = sparse_dist( 1834 | from_inds, from_data, to_inds, to_data, *dist_args 1835 | ) 1836 | 1837 | c += heap_push(current_graph, p, d, q, 1) 1838 | c += heap_push(current_graph, q, d, p, 1) 1839 | 1840 | if c <= delta * n_neighbors * n_vertices: 1841 | break 1842 | 1843 | return deheap_sort(current_graph) 1844 | 1845 | return nn_descent 1846 | 1847 | 1848 | @numba.njit() 1849 | def general_sset_intersection( 1850 | indptr1, 1851 | indices1, 1852 | data1, 1853 | indptr2, 1854 | indices2, 1855 | data2, 1856 | result_row, 1857 | result_col, 1858 | result_val, 1859 | mix_weight=0.5, 1860 | ): 1861 | 1862 | left_min = max(data1.min() / 2.0, 1.0e-8) 1863 | right_min = max(data2.min() / 2.0, 1.0e-8) 1864 | 1865 | for idx in range(result_row.shape[0]): 1866 | i = result_row[idx] 1867 | j = result_col[idx] 1868 | 1869 | left_val = left_min 1870 | for k in range(indptr1[i], indptr1[i + 1]): 1871 | if indices1[k] == j: 1872 | left_val = data1[k] 1873 | 1874 | right_val = right_min 1875 | for k in range(indptr2[i], indptr2[i + 1]): 1876 | if indices2[k] == j: 1877 | right_val = data2[k] 1878 | 1879 | if left_val > left_min or right_val > right_min: 1880 | if mix_weight < 0.5: 1881 | result_val[idx] = left_val * pow( 1882 | right_val, mix_weight / (1.0 - mix_weight) 1883 | ) 1884 | else: 1885 | result_val[idx] = ( 1886 | pow(left_val, (1.0 - mix_weight) / mix_weight) * right_val 1887 | ) 1888 | 1889 | return 1890 | 1891 | 1892 | @numba.njit() 1893 | def sparse_euclidean(ind1, data1, ind2, data2): 1894 | aux_inds, aux_data = sparse_diff(ind1, data1, ind2, data2) 1895 | result = 0.0 1896 | for i in range(aux_data.shape[0]): 1897 | result += aux_data[i] ** 2 1898 | return np.sqrt(result) 1899 | 1900 | 1901 | @numba.njit() 1902 | def sparse_manhattan(ind1, data1, ind2, data2): 1903 | aux_inds, aux_data = sparse_diff(ind1, data1, ind2, data2) 1904 | result = 0.0 1905 | for i in range(aux_data.shape[0]): 1906 | result += np.abs(aux_data[i]) 1907 | return result 1908 | 1909 | 1910 | @numba.njit() 1911 | def sparse_chebyshev(ind1, data1, ind2, data2): 1912 | aux_inds, aux_data = sparse_diff(ind1, data1, ind2, data2) 1913 | result = 0.0 1914 | for i in range(aux_data.shape[0]): 1915 | result = max(result, np.abs(aux_data[i])) 1916 | return result 1917 | 1918 | 1919 | @numba.njit() 1920 | def sparse_minkowski(ind1, data1, ind2, data2, p=2.0): 1921 | aux_inds, aux_data = sparse_diff(ind1, data1, ind2, data2) 1922 | result = 0.0 1923 | for i in range(aux_data.shape[0]): 1924 | result += np.abs(aux_data[i]) ** p 1925 | return result ** (1.0 / p) 1926 | 1927 | 1928 | @numba.njit() 1929 | def sparse_hamming(ind1, data1, ind2, data2, n_features): 1930 | num_not_equal = sparse_diff(ind1, data1, ind2, data2)[0].shape[0] 1931 | return float(num_not_equal) / n_features 1932 | 1933 | 1934 | @numba.njit() 1935 | def sparse_canberra(ind1, data1, ind2, data2): 1936 | abs_data1 = np.abs(data1) 1937 | abs_data2 = np.abs(data2) 1938 | denom_inds, denom_data = sparse_sum(ind1, abs_data1, ind2, 
abs_data2) 1939 | denom_data = 1.0 / denom_data 1940 | numer_inds, numer_data = sparse_diff(ind1, data1, ind2, data2) 1941 | numer_data = np.abs(numer_data) 1942 | 1943 | val_inds, val_data = sparse_mul(numer_inds, numer_data, denom_inds, denom_data) 1944 | 1945 | return np.sum(val_data) 1946 | 1947 | 1948 | @numba.njit() 1949 | def sparse_bray_curtis(ind1, data1, ind2, data2): 1950 | abs_data1 = np.abs(data1) 1951 | abs_data2 = np.abs(data2) 1952 | denom_inds, denom_data = sparse_sum(ind1, abs_data1, ind2, abs_data2) 1953 | 1954 | if denom_data.shape[0] == 0: 1955 | return 0.0 1956 | 1957 | denominator = np.sum(denom_data) 1958 | 1959 | numer_inds, numer_data = sparse_diff(ind1, data1, ind2, data2) 1960 | numer_data = np.abs(numer_data) 1961 | 1962 | numerator = np.sum(numer_data) 1963 | 1964 | return float(numerator) / denominator 1965 | 1966 | 1967 | @numba.njit() 1968 | def sparse_jaccard(ind1, data1, ind2, data2): 1969 | num_non_zero = arr_union(ind1, ind2).shape[0] 1970 | num_equal = arr_intersect(ind1, ind2).shape[0] 1971 | 1972 | if num_non_zero == 0: 1973 | return 0.0 1974 | else: 1975 | return float(num_non_zero - num_equal) / num_non_zero 1976 | 1977 | 1978 | @numba.njit() 1979 | def sparse_matching(ind1, data1, ind2, data2, n_features): 1980 | num_true_true = arr_intersect(ind1, ind2).shape[0] 1981 | num_non_zero = arr_union(ind1, ind2).shape[0] 1982 | num_not_equal = num_non_zero - num_true_true 1983 | 1984 | return float(num_not_equal) / n_features 1985 | 1986 | 1987 | @numba.njit() 1988 | def sparse_dice(ind1, data1, ind2, data2): 1989 | num_true_true = arr_intersect(ind1, ind2).shape[0] 1990 | num_non_zero = arr_union(ind1, ind2).shape[0] 1991 | num_not_equal = num_non_zero - num_true_true 1992 | 1993 | if num_not_equal == 0.0: 1994 | return 0.0 1995 | else: 1996 | return num_not_equal / (2.0 * num_true_true + num_not_equal) 1997 | 1998 | 1999 | @numba.njit() 2000 | def sparse_kulsinski(ind1, data1, ind2, data2, n_features): 2001 | num_true_true = arr_intersect(ind1, ind2).shape[0] 2002 | num_non_zero = arr_union(ind1, ind2).shape[0] 2003 | num_not_equal = num_non_zero - num_true_true 2004 | 2005 | if num_not_equal == 0: 2006 | return 0.0 2007 | else: 2008 | return float(num_not_equal - num_true_true + n_features) / ( 2009 | num_not_equal + n_features 2010 | ) 2011 | 2012 | 2013 | @numba.njit() 2014 | def sparse_rogers_tanimoto(ind1, data1, ind2, data2, n_features): 2015 | num_true_true = arr_intersect(ind1, ind2).shape[0] 2016 | num_non_zero = arr_union(ind1, ind2).shape[0] 2017 | num_not_equal = num_non_zero - num_true_true 2018 | 2019 | return (2.0 * num_not_equal) / (n_features + num_not_equal) 2020 | 2021 | 2022 | @numba.njit() 2023 | def sparse_russellrao(ind1, data1, ind2, data2, n_features): 2024 | if ind1.shape[0] == ind2.shape[0] and np.all(ind1 == ind2): 2025 | return 0.0 2026 | 2027 | num_true_true = arr_intersect(ind1, ind2).shape[0] 2028 | 2029 | if num_true_true == np.sum(data1 != 0) and num_true_true == np.sum(data2 != 0): 2030 | return 0.0 2031 | else: 2032 | return float(n_features - num_true_true) / (n_features) 2033 | 2034 | 2035 | @numba.njit() 2036 | def sparse_sokal_michener(ind1, data1, ind2, data2, n_features): 2037 | num_true_true = arr_intersect(ind1, ind2).shape[0] 2038 | num_non_zero = arr_union(ind1, ind2).shape[0] 2039 | num_not_equal = num_non_zero - num_true_true 2040 | 2041 | return (2.0 * num_not_equal) / (n_features + num_not_equal) 2042 | 2043 | 2044 | @numba.njit() 2045 | def sparse_sokal_sneath(ind1, data1, ind2, data2): 2046 | 
num_true_true = arr_intersect(ind1, ind2).shape[0] 2047 | num_non_zero = arr_union(ind1, ind2).shape[0] 2048 | num_not_equal = num_non_zero - num_true_true 2049 | 2050 | if num_not_equal == 0.0: 2051 | return 0.0 2052 | else: 2053 | return num_not_equal / (0.5 * num_true_true + num_not_equal) 2054 | 2055 | 2056 | @numba.njit() 2057 | def sparse_cosine(ind1, data1, ind2, data2): 2058 | aux_inds, aux_data = sparse_mul(ind1, data1, ind2, data2) 2059 | result = 0.0 2060 | norm1 = norm(data1) 2061 | norm2 = norm(data2) 2062 | 2063 | for i in range(aux_data.shape[0]): 2064 | result += aux_data[i] 2065 | 2066 | if norm1 == 0.0 and norm2 == 0.0: 2067 | return 0.0 2068 | elif norm1 == 0.0 or norm2 == 0.0: 2069 | return 1.0 2070 | else: 2071 | return 1.0 - (result / (norm1 * norm2)) 2072 | 2073 | 2074 | @numba.njit() 2075 | def sparse_correlation(ind1, data1, ind2, data2, n_features): 2076 | 2077 | mu_x = 0.0 2078 | mu_y = 0.0 2079 | dot_product = 0.0 2080 | 2081 | if ind1.shape[0] == 0 and ind2.shape[0] == 0: 2082 | return 0.0 2083 | elif ind1.shape[0] == 0 or ind2.shape[0] == 0: 2084 | return 1.0 2085 | 2086 | for i in range(data1.shape[0]): 2087 | mu_x += data1[i] 2088 | for i in range(data2.shape[0]): 2089 | mu_y += data2[i] 2090 | 2091 | mu_x /= n_features 2092 | mu_y /= n_features 2093 | 2094 | shifted_data1 = np.empty(data1.shape[0], dtype=np.float32) 2095 | shifted_data2 = np.empty(data2.shape[0], dtype=np.float32) 2096 | 2097 | for i in range(data1.shape[0]): 2098 | shifted_data1[i] = data1[i] - mu_x 2099 | for i in range(data2.shape[0]): 2100 | shifted_data2[i] = data2[i] - mu_y 2101 | 2102 | norm1 = np.sqrt( 2103 | (norm(shifted_data1) ** 2) + (n_features - ind1.shape[0]) * (mu_x ** 2) 2104 | ) 2105 | norm2 = np.sqrt( 2106 | (norm(shifted_data2) ** 2) + (n_features - ind2.shape[0]) * (mu_y ** 2) 2107 | ) 2108 | 2109 | dot_prod_inds, dot_prod_data = sparse_mul(ind1, shifted_data1, ind2, shifted_data2) 2110 | 2111 | common_indices = set(dot_prod_inds) 2112 | 2113 | for i in range(dot_prod_data.shape[0]): 2114 | dot_product += dot_prod_data[i] 2115 | 2116 | for i in range(ind1.shape[0]): 2117 | if ind1[i] not in common_indices: 2118 | dot_product -= shifted_data1[i] * (mu_y) 2119 | 2120 | for i in range(ind2.shape[0]): 2121 | if ind2[i] not in common_indices: 2122 | dot_product -= shifted_data2[i] * (mu_x) 2123 | 2124 | all_indices = arr_union(ind1, ind2) 2125 | dot_product += mu_x * mu_y * (n_features - all_indices.shape[0]) 2126 | 2127 | if norm1 == 0.0 and norm2 == 0.0: 2128 | return 0.0 2129 | elif dot_product == 0.0: 2130 | return 1.0 2131 | else: 2132 | return 1.0 - (dot_product / (norm1 * norm2)) 2133 | 2134 | 2135 | sparse_named_distances = { 2136 | 2137 | "euclidean": sparse_euclidean, 2138 | "manhattan": sparse_manhattan, 2139 | "l1": sparse_manhattan, 2140 | "taxicab": sparse_manhattan, 2141 | "chebyshev": sparse_chebyshev, 2142 | "linf": sparse_chebyshev, 2143 | "linfty": sparse_chebyshev, 2144 | "linfinity": sparse_chebyshev, 2145 | "minkowski": sparse_minkowski, 2146 | 2147 | "canberra": sparse_canberra, 2148 | 2149 | 2150 | "hamming": sparse_hamming, 2151 | "jaccard": sparse_jaccard, 2152 | "dice": sparse_dice, 2153 | "matching": sparse_matching, 2154 | "kulsinski": sparse_kulsinski, 2155 | "rogerstanimoto": sparse_rogers_tanimoto, 2156 | "russellrao": sparse_russellrao, 2157 | "sokalmichener": sparse_sokal_michener, 2158 | "sokalsneath": sparse_sokal_sneath, 2159 | "cosine": sparse_cosine, 2160 | "correlation": sparse_correlation, 2161 | } 2162 | 2163 | sparse_need_n_features 
= ( 2164 | "hamming", 2165 | "matching", 2166 | "kulsinski", 2167 | "rogerstanimoto", 2168 | "russellrao", 2169 | "sokalmichener", 2170 | "correlation", 2171 | ) 2172 | 2173 | 2174 | 2175 | 2176 | 2177 | 2178 | 2179 | 2180 | 2181 | 2182 | 2183 | 2184 | 2185 | 2186 | 2187 | import numpy as np 2188 | import numba 2189 | import scipy 2190 | from sklearn.metrics import pairwise_distances 2191 | from sklearn.utils import check_random_state 2192 | from sklearn.neighbors import KDTree 2193 | from scipy.spatial import cKDTree 2194 | 2195 | 2196 | 2197 | 2198 | 2199 | 2200 | 2201 | #INT32_MIN = np.iinfo(np.int32).min + 1 2202 | #INT32_MAX = np.iinfo(np.int32).max - 1 2203 | 2204 | SMOOTH_K_TOLERANCE = 1e-5 2205 | MIN_K_DIST_SCALE = 1e-3 2206 | NPY_INFINITY = np.inf 2207 | 2208 | def nearest_neighbors( 2209 | X, 2210 | n_neighbors, 2211 | metric, 2212 | metric_kwds, 2213 | angular, 2214 | random_state, 2215 | verbose=False, 2216 | ): 2217 | 2218 | if verbose: 2219 | print("Finding Nearest Neighbors") 2220 | 2221 | if metric == "precomputed": 2222 | 2223 | 2224 | knn_indices = fast_knn_indices(X, n_neighbors) 2225 | 2226 | 2227 | knn_dists = X[ 2228 | np.arange(X.shape[0])[:, None], knn_indices 2229 | ].copy() 2230 | 2231 | rp_forest = [] 2232 | else: 2233 | if callable(metric): 2234 | distance_func = metric 2235 | elif metric in named_distances: 2236 | distance_func = named_distances[metric] 2237 | else: 2238 | raise ValueError( 2239 | "Metric is neither callable, " 2240 | + "nor a recognised string" 2241 | ) 2242 | 2243 | if metric in ( 2244 | "cosine", 2245 | "correlation", 2246 | "dice", 2247 | "jaccard", 2248 | ): 2249 | angular = True 2250 | 2251 | rng_state = random_state.randint( 2252 | np.iinfo(np.int32).min + 1, np.iinfo(np.int32).max - 1, 3 2253 | ).astype(np.int64) 2254 | 2255 | if scipy.sparse.isspmatrix_csr(X): 2256 | if metric in sparse.sparse_named_distances: 2257 | distance_func = sparse.sparse_named_distances[ 2258 | metric 2259 | ] 2260 | if metric in sparse.sparse_need_n_features: 2261 | metric_kwds["n_features"] = X.shape[1] 2262 | else: 2263 | raise ValueError( 2264 | "Metric {} not supported for sparse " 2265 | + "data".format(metric) 2266 | ) 2267 | metric_nn_descent = sparse.make_sparse_nn_descent( 2268 | distance_func, tuple(metric_kwds.values()) 2269 | ) 2270 | 2271 | 2272 | n_trees = 5 + int( 2273 | round((X.shape[0]) ** 0.5 / 20.0) 2274 | ) 2275 | n_iters = max( 2276 | 5, int(round(np.log2(X.shape[0]))) 2277 | ) 2278 | if verbose: 2279 | print( 2280 | "Building RP forest with", 2281 | str(n_trees), 2282 | "trees", 2283 | ) 2284 | 2285 | rp_forest = make_forest( 2286 | X, n_neighbors, n_trees, rng_state, angular 2287 | ) 2288 | leaf_array = rptree_leaf_array(rp_forest) 2289 | 2290 | if verbose: 2291 | print( 2292 | "NN descent for", 2293 | str(n_iters), 2294 | "iterations", 2295 | ) 2296 | knn_indices, knn_dists = metric_nn_descent( 2297 | X.indices, 2298 | X.indptr, 2299 | X.data, 2300 | X.shape[0], 2301 | n_neighbors, 2302 | rng_state, 2303 | max_candidates=60, 2304 | rp_tree_init=True, 2305 | leaf_array=leaf_array, 2306 | n_iters=n_iters, 2307 | verbose=verbose, 2308 | ) 2309 | else: 2310 | metric_nn_descent = make_nn_descent( 2311 | distance_func, tuple(metric_kwds.values()) 2312 | ) 2313 | 2314 | n_trees = 5 + int( 2315 | round((X.shape[0]) ** 0.5 / 20.0) 2316 | ) 2317 | n_iters = max( 2318 | 5, int(round(np.log2(X.shape[0]))) 2319 | ) 2320 | 2321 | if verbose: 2322 | print( 2323 | "Building RP forest with", 2324 | str(n_trees), 2325 | "trees", 2326 | ) 2327 | 
rp_forest = make_forest( 2328 | X, n_neighbors, n_trees, rng_state, angular 2329 | ) 2330 | leaf_array = rptree_leaf_array(rp_forest) 2331 | if verbose: 2332 | print( 2333 | "NN descent for", 2334 | str(n_iters), 2335 | "iterations", 2336 | ) 2337 | knn_indices, knn_dists = metric_nn_descent( 2338 | X, 2339 | n_neighbors, 2340 | rng_state, 2341 | max_candidates=60, 2342 | rp_tree_init=True, 2343 | leaf_array=leaf_array, 2344 | n_iters=n_iters, 2345 | verbose=verbose, 2346 | ) 2347 | 2348 | if np.any(knn_indices < 0): 2349 | warn( 2350 | "Failed to correctly find n_neighbors for some samples." 2351 | "Results may be less than ideal. Try re-running with" 2352 | "different parameters." 2353 | ) 2354 | if verbose: 2355 | print("Finished Nearest Neighbor Search") 2356 | return knn_indices, knn_dists, rp_forest 2357 | 2358 | @numba.njit( 2359 | fastmath=True 2360 | ) 2361 | def smooth_knn_dist( 2362 | distances, 2363 | k, 2364 | n_iter=64, 2365 | local_connectivity=1.0, 2366 | bandwidth=1.0, 2367 | cardinality=None 2368 | ): 2369 | 2370 | if cardinality is None: 2371 | target = np.log2(k) * bandwidth 2372 | else: 2373 | target = cardinality 2374 | rho = np.zeros(distances.shape[0]) 2375 | result = np.zeros(distances.shape[0]) 2376 | 2377 | mean_distances = np.mean(distances) 2378 | 2379 | for i in range(distances.shape[0]): 2380 | lo = 0.0 2381 | hi = NPY_INFINITY 2382 | mid = 1.0 2383 | 2384 | 2385 | ith_distances = distances[i] 2386 | non_zero_dists = ith_distances[ith_distances > 0.0] 2387 | if non_zero_dists.shape[0] >= local_connectivity: 2388 | index = int(np.floor(local_connectivity)) 2389 | interpolation = local_connectivity - index 2390 | if index > 0: 2391 | rho[i] = non_zero_dists[index - 1] 2392 | if interpolation > SMOOTH_K_TOLERANCE: 2393 | rho[i] += interpolation * ( 2394 | non_zero_dists[index] 2395 | - non_zero_dists[index - 1] 2396 | ) 2397 | else: 2398 | rho[i] = interpolation * non_zero_dists[0] 2399 | elif non_zero_dists.shape[0] > 0: 2400 | rho[i] = np.max(non_zero_dists) 2401 | 2402 | for n in range(n_iter): 2403 | 2404 | psum = 0.0 2405 | for j in range(1, distances.shape[1]): 2406 | d = distances[i, j] - rho[i] 2407 | if d > 0: 2408 | psum += np.exp(-(d / mid)) 2409 | else: 2410 | psum += 1.0 2411 | 2412 | if np.fabs(psum - target) < SMOOTH_K_TOLERANCE: 2413 | break 2414 | 2415 | if psum > target: 2416 | hi = mid 2417 | mid = (lo + hi) / 2.0 2418 | else: 2419 | lo = mid 2420 | if hi == NPY_INFINITY: 2421 | mid *= 2 2422 | else: 2423 | mid = (lo + hi) / 2.0 2424 | 2425 | result[i] = mid 2426 | 2427 | 2428 | if rho[i] > 0.0: 2429 | mean_ith_distances = np.mean(ith_distances) 2430 | if ( 2431 | result[i] 2432 | < MIN_K_DIST_SCALE * mean_ith_distances 2433 | ): 2434 | result[i] = ( 2435 | MIN_K_DIST_SCALE * mean_ith_distances 2436 | ) 2437 | else: 2438 | if ( 2439 | result[i] 2440 | < MIN_K_DIST_SCALE * mean_distances 2441 | ): 2442 | result[i] = ( 2443 | MIN_K_DIST_SCALE * mean_distances 2444 | ) 2445 | 2446 | return result, rho 2447 | 2448 | @numba.njit(parallel=True, fastmath=True) 2449 | def compute_membership_strengths( 2450 | knn_indices, knn_dists, sigmas, rhos 2451 | ): 2452 | 2453 | n_samples = knn_indices.shape[0] 2454 | n_neighbors = knn_indices.shape[1] 2455 | 2456 | rows = np.zeros(knn_indices.size, dtype=np.int64) 2457 | cols = np.zeros(knn_indices.size, dtype=np.int64) 2458 | vals = np.zeros(knn_indices.size, dtype=np.float64) 2459 | 2460 | for i in range(n_samples): 2461 | for j in range(n_neighbors): 2462 | if knn_indices[i, j] == -1: 2463 | continue 2464 | 
if knn_indices[i, j] == i: 2465 | val = 0.0 2466 | elif knn_dists[i, j] - rhos[i] <= 0.0: 2467 | val = 1.0 2468 | else: 2469 | val = np.exp( 2470 | -( 2471 | (knn_dists[i, j] - rhos[i]) 2472 | / (sigmas[i]) 2473 | ) 2474 | ) 2475 | 2476 | rows[i * n_neighbors + j] = i 2477 | cols[i * n_neighbors + j] = knn_indices[i, j] 2478 | vals[i * n_neighbors + j] = val 2479 | 2480 | return rows, cols, vals 2481 | 2482 | def create_tree(data, metric): 2483 | if metric == 'euclidean': 2484 | ckd = cKDTree(data) 2485 | else: 2486 | ckd = KDTree(data, metric=metric) 2487 | return ckd 2488 | 2489 | def query_tree(data, ckd, k, metric): 2490 | if metric == 'euclidean': 2491 | ckdout = ckd.query(x=data, k=k, workers=-1) 2492 | else: 2493 | ckdout = ckd.query(data, k=k) 2494 | return ckdout 2495 | 2496 | def partitioned_nearest_neighbors(X, Y, k, metric='euclidean'): 2497 | tree = create_tree(Y, metric) 2498 | nns = query_tree(X, tree, k, metric) 2499 | knn_indices = nns[1] 2500 | knn_dists = nns[0] 2501 | return knn_indices, knn_dists 2502 | 2503 | 2504 | 2505 | 2506 | 2507 | 2508 | 2509 | 2510 | 2511 | 2512 | 2513 | 2514 | 2515 | import numpy as np 2516 | 2517 | import scipy.sparse 2518 | import scipy.sparse.csgraph 2519 | 2520 | from sklearn.manifold import SpectralEmbedding 2521 | from sklearn.metrics import pairwise_distances 2522 | from warnings import warn 2523 | 2524 | 2525 | def component_layout( 2526 | data, n_components, component_labels, dim, metric="euclidean", metric_kwds={} 2527 | ): 2528 | 2529 | component_centroids = np.empty((n_components, data.shape[1]), dtype=np.float64) 2530 | 2531 | for label in range(n_components): 2532 | component_centroids[label] = data[component_labels == label].mean(axis=0) 2533 | 2534 | distance_matrix = pairwise_distances( 2535 | component_centroids, metric=metric, **metric_kwds 2536 | ) 2537 | affinity_matrix = np.exp(-distance_matrix ** 2) 2538 | 2539 | component_embedding = SpectralEmbedding( 2540 | n_components=dim, affinity="precomputed" 2541 | ).fit_transform(affinity_matrix) 2542 | component_embedding /= component_embedding.max() 2543 | 2544 | return component_embedding 2545 | 2546 | 2547 | def multi_component_layout( 2548 | data, 2549 | graph, 2550 | n_components, 2551 | component_labels, 2552 | dim, 2553 | random_state, 2554 | metric="euclidean", 2555 | metric_kwds={}, 2556 | ): 2557 | 2558 | 2559 | result = np.empty((graph.shape[0], dim), dtype=np.float32) 2560 | 2561 | if n_components > 2 * dim: 2562 | meta_embedding = component_layout( 2563 | data, 2564 | n_components, 2565 | component_labels, 2566 | dim, 2567 | metric=metric, 2568 | metric_kwds=metric_kwds, 2569 | ) 2570 | else: 2571 | k = int(np.ceil(n_components / 2.0)) 2572 | base = np.hstack([np.eye(k), np.zeros((k, dim - k))]) 2573 | meta_embedding = np.vstack([base, -base])[:n_components] 2574 | 2575 | for label in range(n_components): 2576 | component_graph = graph.tocsr()[component_labels == label, :].tocsc() 2577 | component_graph = component_graph[:, component_labels == label].tocoo() 2578 | 2579 | distances = pairwise_distances([meta_embedding[label]], meta_embedding) 2580 | data_range = distances[distances > 0.0].min() / 2.0 2581 | 2582 | if component_graph.shape[0] < 2 * dim: 2583 | result[component_labels == label] = ( 2584 | random_state.uniform( 2585 | low=-data_range, 2586 | high=data_range, 2587 | size=(component_graph.shape[0], dim), 2588 | ) 2589 | + meta_embedding[label] 2590 | ) 2591 | continue 2592 | 2593 | diag_data = np.asarray(component_graph.sum(axis=0)) 2594 | 2595 | 2596 
| 2597 | 2598 | I = scipy.sparse.identity(component_graph.shape[0], dtype=np.float64) 2599 | D = scipy.sparse.spdiags( 2600 | 1.0 / np.sqrt(diag_data), 2601 | 0, 2602 | component_graph.shape[0], 2603 | component_graph.shape[0], 2604 | ) 2605 | L = I - D * component_graph * D 2606 | 2607 | k = dim + 1 2608 | num_lanczos_vectors = max(2 * k + 1, int(np.sqrt(component_graph.shape[0]))) 2609 | try: 2610 | eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh( 2611 | L, 2612 | k, 2613 | which="SM", 2614 | ncv=num_lanczos_vectors, 2615 | tol=1e-4, 2616 | v0=np.ones(L.shape[0]), 2617 | maxiter=graph.shape[0] * 5, 2618 | ) 2619 | order = np.argsort(eigenvalues)[1:k] 2620 | component_embedding = eigenvectors[:, order] 2621 | expansion = data_range / np.max(np.abs(component_embedding)) 2622 | component_embedding *= expansion 2623 | result[component_labels == label] = ( 2624 | component_embedding + meta_embedding[label] 2625 | ) 2626 | except scipy.sparse.linalg.ArpackError: 2627 | warn( 2628 | "WARNING: spectral initialisation failed! The eigenvector solver\n" 2629 | "failed. This is likely due to too small an eigengap. Consider\n" 2630 | "adding some noise or jitter to your data.\n\n" 2631 | "Falling back to random initialisation!" 2632 | ) 2633 | result[component_labels == label] = ( 2634 | random_state.uniform( 2635 | low=-data_range, 2636 | high=data_range, 2637 | size=(component_graph.shape[0], dim), 2638 | ) 2639 | + meta_embedding[label] 2640 | ) 2641 | 2642 | return result 2643 | 2644 | 2645 | def spectral_layout(data, graph, dim, random_state, metric="euclidean", metric_kwds={}): 2646 | 2647 | n_samples = graph.shape[0] 2648 | n_components, labels = scipy.sparse.csgraph.connected_components(graph) 2649 | 2650 | if n_components > 1: 2651 | warn( 2652 | "Embedding a total of {} separate connected components using meta-embedding (experimental)".format( 2653 | n_components 2654 | ) 2655 | ) 2656 | return multi_component_layout( 2657 | data, 2658 | graph, 2659 | n_components, 2660 | labels, 2661 | dim, 2662 | random_state, 2663 | metric=metric, 2664 | metric_kwds=metric_kwds, 2665 | ) 2666 | 2667 | diag_data = np.asarray(graph.sum(axis=0)) 2668 | 2669 | 2670 | 2671 | 2672 | I = scipy.sparse.identity(graph.shape[0], dtype=np.float64) 2673 | D = scipy.sparse.spdiags( 2674 | 1.0 / np.sqrt(diag_data), 0, graph.shape[0], graph.shape[0] 2675 | ) 2676 | L = I - D * graph * D 2677 | 2678 | k = dim + 1 2679 | num_lanczos_vectors = max(2 * k + 1, int(np.sqrt(graph.shape[0]))) 2680 | try: 2681 | if L.shape[0] < 2000000: 2682 | eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh( 2683 | L, 2684 | k, 2685 | which="SM", 2686 | ncv=num_lanczos_vectors, 2687 | tol=1e-4, 2688 | v0=np.ones(L.shape[0]), 2689 | maxiter=graph.shape[0] * 5, 2690 | ) 2691 | else: 2692 | eigenvalues, eigenvectors = scipy.sparse.linalg.lobpcg( 2693 | L, random_state.normal(size=(L.shape[0], k)), largest=False, tol=1e-8 2694 | ) 2695 | order = np.argsort(eigenvalues)[1:k] 2696 | return eigenvectors[:, order] 2697 | except scipy.sparse.linalg.ArpackError: 2698 | warn( 2699 | "WARNING: spectral initialisation failed! The eigenvector solver\n" 2700 | "failed. This is likely due to too small an eigengap. Consider\n" 2701 | "adding some noise or jitter to your data.\n\n" 2702 | "Falling back to random initialisation!" 
2703 | ) 2704 | return random_state.uniform(low=-10.0, high=10.0, size=(graph.shape[0], dim)) 2705 | 2706 | 2707 | 2708 | 2709 | 2710 | 2711 | 2712 | 2713 | 2714 | 2715 | 2716 | 2717 | import numpy as np 2718 | import numba 2719 | 2720 | 2721 | @numba.njit() 2722 | def clip(val): 2723 | 2724 | if val > 4.0: 2725 | return 4.0 2726 | elif val < -4.0: 2727 | return -4.0 2728 | else: 2729 | return val 2730 | 2731 | @numba.njit( 2732 | "f4(f4[::1],f4[::1])", 2733 | fastmath=True, 2734 | cache=True, 2735 | locals={ 2736 | "result": numba.types.float32, 2737 | "diff": numba.types.float32, 2738 | "dim": numba.types.intp, 2739 | }, 2740 | ) 2741 | def rdist(x, y): 2742 | 2743 | result = 0.0 2744 | dim = x.shape[0] 2745 | for i in range(dim): 2746 | diff = x[i] - y[i] 2747 | result += diff * diff 2748 | 2749 | return result 2750 | 2751 | 2752 | def _optimize_layout_euclidean_single_epoch( 2753 | head_embedding, 2754 | head, 2755 | tail, 2756 | n_vertices, 2757 | epochs_per_sample, 2758 | a, 2759 | b, 2760 | rng_state, 2761 | gamma, 2762 | dim, 2763 | move_other, 2764 | alpha, 2765 | epochs_per_negative_sample, 2766 | epoch_of_next_negative_sample, 2767 | epoch_of_next_sample, 2768 | n, 2769 | ): 2770 | for i in numba.prange(epochs_per_sample.shape[0]): 2771 | if epoch_of_next_sample[i] <= n: 2772 | j = head[i] 2773 | k = tail[i] 2774 | 2775 | current = head_embedding[j] 2776 | other = head_embedding[k] 2777 | 2778 | dist_squared = rdist(current, other) 2779 | 2780 | if dist_squared > 0.0: 2781 | grad_coeff = -2.0 * a * b * pow(dist_squared, b - 1.0) 2782 | grad_coeff /= a * pow(dist_squared, b) + 1.0 2783 | else: 2784 | grad_coeff = 0.0 2785 | 2786 | for d in range(dim): 2787 | grad_d = clip(grad_coeff * (current[d] - other[d])) 2788 | current[d] += grad_d * alpha 2789 | if move_other: 2790 | other[d] += -grad_d * alpha 2791 | 2792 | epoch_of_next_sample[i] += epochs_per_sample[i] 2793 | 2794 | n_neg_samples = int( 2795 | (n - epoch_of_next_negative_sample[i]) / epochs_per_negative_sample[i] 2796 | ) 2797 | 2798 | for p in range(n_neg_samples): 2799 | k = tau_rand_int(rng_state) % n_vertices 2800 | 2801 | other = head_embedding[k] 2802 | 2803 | dist_squared = rdist(current, other) 2804 | 2805 | if dist_squared > 0.0: 2806 | grad_coeff = 2.0 * gamma * b 2807 | grad_coeff /= (0.001 + dist_squared) * ( 2808 | a * pow(dist_squared, b) + 1 2809 | ) 2810 | elif j == k: 2811 | continue 2812 | else: 2813 | grad_coeff = 0.0 2814 | 2815 | for d in range(dim): 2816 | if grad_coeff > 0.0: 2817 | grad_d = clip(grad_coeff * (current[d] - other[d])) 2818 | else: 2819 | grad_d = 4.0 2820 | current[d] += grad_d * alpha 2821 | 2822 | epoch_of_next_negative_sample[i] += ( 2823 | n_neg_samples * epochs_per_negative_sample[i] 2824 | ) 2825 | 2826 | return head_embedding 2827 | 2828 | 2829 | 2830 | 2831 | 2832 | 2833 | 2834 | 2835 | 2836 | def fuzzy_simplicial_set( 2837 | Xs, 2838 | joint, 2839 | joint_idxs, 2840 | weights, 2841 | n_neighbors, 2842 | cardinality, 2843 | metrics, 2844 | metric_kwds, 2845 | joint_metrics, 2846 | angular, 2847 | set_op_mix_ratio, 2848 | local_connectivity, 2849 | n_epochs, 2850 | random_state, 2851 | verbose, 2852 | ): 2853 | 2854 | len_Xs = [len(i) for i in Xs] 2855 | 2856 | rows, cols, vals = np.array([]), np.array([]), np.array([]) 2857 | 2858 | for i in range(len(Xs)): 2859 | 2860 | X_n_neighbors = int(round(n_neighbors * len_Xs[i]/sum(len_Xs))) 2861 | if X_n_neighbors < 2: 2862 | weights[(i,i)] *= X_n_neighbors/2 2863 | X_n_neighbors = 2 2864 | 2865 | if Xs[i].shape[0] < 4096: 2866 | X 
= Xs[i] 2867 | if scipy.sparse.issparse(Xs[i]): 2868 | X = Xs[i].toarray() 2869 | dmat = pairwise_distances(Xs[i], metric=metrics[i], **metric_kwds[i]) 2870 | knn_indices, knn_dists, _ = nearest_neighbors( 2871 | dmat, 2872 | X_n_neighbors, 2873 | 'precomputed', 2874 | {}, 2875 | angular, 2876 | np.random.RandomState(random_state), 2877 | verbose=verbose, 2878 | ) 2879 | else: 2880 | knn_indices, knn_dists, _ = nearest_neighbors( 2881 | Xs[i], 2882 | X_n_neighbors, 2883 | metrics[i], 2884 | metric_kwds[i], 2885 | angular, 2886 | np.random.RandomState(random_state), 2887 | verbose=verbose, 2888 | ) 2889 | 2890 | sigmas, rhos = smooth_knn_dist( 2891 | knn_dists, 2892 | 0, 2893 | local_connectivity=local_connectivity, 2894 | cardinality=cardinality * X_n_neighbors/n_neighbors 2895 | ) 2896 | 2897 | X_rows, X_cols, X_vals = compute_membership_strengths( 2898 | knn_indices, knn_dists, sigmas, rhos 2899 | ) 2900 | 2901 | rows = np.concatenate([rows, X_rows + sum(len_Xs[:i])]) 2902 | cols = np.concatenate([cols, X_cols + sum(len_Xs[:i])]) 2903 | vals = np.concatenate([vals, X_vals]) 2904 | 2905 | for k in joint.keys(): 2906 | XY = joint[k] 2907 | idxs = joint_idxs[k] 2908 | metric = joint_metrics[k] 2909 | 2910 | XY_n_neighbors = int(round(n_neighbors * len_Xs[k[1]]/sum(len_Xs) * len(idxs[1])/len_Xs[k[1]])) 2911 | YX_n_neighbors = int(round(n_neighbors * len_Xs[k[0]]/sum(len_Xs) * len(idxs[0])/len_Xs[k[0]])) 2912 | 2913 | if XY_n_neighbors < 2: 2914 | weights[(k[0],k[1])] *= XY_n_neighbors/2 2915 | XY_n_neighbors = 2 2916 | if YX_n_neighbors < 2: 2917 | weights[(k[1],k[0])] *= YX_n_neighbors/2 2918 | YX_n_neighbors = 2 2919 | 2920 | 2921 | if metric == 'precomputed': 2922 | XY_knn_indices = np.argsort(XY, axis=1)[:,XY_n_neighbors] 2923 | XY_knn_dists = np.sort(XY, axis=1)[:,XY_n_neighbors] 2924 | 2925 | YX_knn_indices = np.argsort(XY.T, axis=1)[:,YX_n_neighbors] 2926 | YX_knn_dists = np.sort(XY.T, axis=1)[:,YX_n_neighbors] 2927 | 2928 | else: 2929 | XY_knn_indices, XY_knn_dists = partitioned_nearest_neighbors(XY[0], XY[1], 2930 | XY_n_neighbors, metric) 2931 | YX_knn_indices, YX_knn_dists = partitioned_nearest_neighbors(XY[1], XY[0], 2932 | YX_n_neighbors, metric) 2933 | 2934 | XY_sigmas, XY_rhos = smooth_knn_dist( 2935 | XY_knn_dists, 2936 | 0, 2937 | local_connectivity=local_connectivity, 2938 | cardinality=cardinality * XY_n_neighbors/n_neighbors 2939 | ) 2940 | YX_sigmas, YX_rhos = smooth_knn_dist( 2941 | YX_knn_dists, 2942 | 0, 2943 | local_connectivity=local_connectivity, 2944 | cardinality=cardinality * YX_n_neighbors/n_neighbors 2945 | ) 2946 | 2947 | XY_rows, XY_cols, XY_vals = compute_membership_strengths( 2948 | XY_knn_indices, XY_knn_dists, XY_sigmas, XY_rhos 2949 | ) 2950 | YX_rows, YX_cols, YX_vals = compute_membership_strengths( 2951 | YX_knn_indices, YX_knn_dists, YX_sigmas, YX_rhos 2952 | ) 2953 | 2954 | rows = np.concatenate([rows, idxs[0][XY_rows] + sum(len_Xs[:k[0]])]) 2955 | cols = np.concatenate([cols, idxs[1][XY_cols] + sum(len_Xs[:k[1]])]) 2956 | vals = np.concatenate([vals, XY_vals]) 2957 | 2958 | rows = np.concatenate([rows, idxs[1][YX_rows] + sum(len_Xs[:k[1]])]) 2959 | cols = np.concatenate([cols, idxs[0][YX_cols] + sum(len_Xs[:k[0]])]) 2960 | vals = np.concatenate([vals, YX_vals]) 2961 | 2962 | fs = scipy.sparse.coo_matrix( 2963 | (vals, (rows, cols)), shape=(sum(len_Xs), sum(len_Xs)) 2964 | ) 2965 | fs.eliminate_zeros() 2966 | 2967 | transpose = fs.transpose() 2968 | 2969 | prod_matrix = fs.multiply(transpose) 2970 | 2971 | fs = ( 2972 | set_op_mix_ratio 2973 | * (fs 
+ transpose - prod_matrix) 2974 | + (1.0 - set_op_mix_ratio) * prod_matrix 2975 | ) 2976 | 2977 | 2978 | fs.sum_duplicates() 2979 | fs.data[fs.data < (fs.data.max() / float(n_epochs))] = 0.0 2980 | fs.eliminate_zeros() 2981 | full_graph = fs 2982 | 2983 | graphs = [] 2984 | for i in range(len(Xs)): 2985 | graphs += [fs[sum(len_Xs[:i]):sum(len_Xs[:i+1]), 2986 | sum(len_Xs[:i]):sum(len_Xs[:i+1])].tocoo()] 2987 | joint_graphs = {} 2988 | for k in joint.keys(): 2989 | joint_graphs[k] = fs[sum(len_Xs[:k[0]]):sum(len_Xs[:k[0]+1]), 2990 | sum(len_Xs[:k[1]]):sum(len_Xs[:k[1]+1])].tocoo() 2991 | 2992 | return graphs, joint_graphs, full_graph, weights 2993 | 2994 | def init_layout(init, 2995 | Xs, 2996 | graphs, 2997 | n_components, 2998 | metrics, 2999 | metric_kwds, 3000 | random_state): 3001 | 3002 | len_Xs = [len(i) for i in Xs] 3003 | 3004 | if init == 'random': 3005 | embeddings = [] 3006 | for i in range(len(Xs)): 3007 | embeddings += [np.random.RandomState(random_state).uniform(low=-10.0, high=10.0, 3008 | size=(len_Xs[i], n_components), 3009 | ).astype(np.float32)] 3010 | elif init == 'spectral': 3011 | embeddings = [] 3012 | for i in range(len(Xs)): 3013 | try: 3014 | X_embedding = spectral_layout( 3015 | Xs[i], 3016 | graphs[i], 3017 | n_components, 3018 | np.random.RandomState(random_state), 3019 | metric=metrics[i], 3020 | metric_kwds=metric_kwds[i], 3021 | ) 3022 | expansion = 10.0 / np.abs(X_embedding).max() 3023 | X_embedding = (X_embedding * expansion).astype(np.float32) + \ 3024 | np.random.RandomState(random_state).normal(scale=0.0001, 3025 | size=[len_Xs[i], n_components] 3026 | ).astype(np.float32) 3027 | except: 3028 | X_embedding = np.random.RandomState(random_state).uniform(low=-10.0, high=10.0, 3029 | size=(len_Xs[i], n_components), 3030 | ).astype(np.float32) 3031 | embeddings += [X_embedding] 3032 | else: 3033 | if len(init.shape) == 2: 3034 | if (np.unique(init, axis=0).shape[0] < init.shape[0]): 3035 | tree = KDTree(init_data) 3036 | dist, ind = tree.query(init_data, k=2) 3037 | nndist = np.mean(dist[:,1]) 3038 | embedding = init + np.random.RandomState(random_state).normal( 3039 | scale=0.001 * nndist, 3040 | size=init.shape 3041 | ).astype(np.float32) 3042 | else: 3043 | embedding = init 3044 | embeddings = [] 3045 | for i in range(len(Xs)): 3046 | embeddings += [embedding[sum(len_Xs[:i]):sum(len_Xs[:i+1])]] 3047 | 3048 | for i in range(len(embeddings)): 3049 | embeddings[i] = (10.0 * (embeddings[i] - np.min(embeddings[i], 0)) 3050 | / (np.max(embeddings[i], 0) - np.min(embeddings[i], 0)) 3051 | ).astype(np.float32, order="C") 3052 | return embeddings 3053 | 3054 | 3055 | def optimize_layout( 3056 | embeddings, 3057 | graphs, 3058 | joint_graphs, 3059 | weights, 3060 | n_epochs, 3061 | a, 3062 | b, 3063 | random_state, 3064 | gamma=1.0, 3065 | initial_alpha=1.0, 3066 | negative_sample_rate=5.0, 3067 | parallel=False, 3068 | verbose=False, 3069 | ): 3070 | 3071 | 3072 | len_Xs = np.array([len(i) for i in embeddings]) 3073 | dim = embeddings[0].shape[1] 3074 | move_other = True 3075 | alpha = initial_alpha 3076 | 3077 | heads = [i.row for i in graphs] 3078 | tails = [i.col for i in graphs] 3079 | n_vertices = [i.shape[1] for i in graphs] 3080 | 3081 | epochs_per_sample = [make_epochs_per_sample(i.data, n_epochs) for i in graphs] 3082 | epochs_per_negative_sample = [i/negative_sample_rate for i in epochs_per_sample] 3083 | epoch_of_next_negative_sample = [i.copy() for i in epochs_per_negative_sample] 3084 | epoch_of_next_sample = [i.copy() for i in epochs_per_sample] 3085 
| 3086 | joint_heads = {k: np.concatenate([joint_graphs[k].row, 3087 | joint_graphs[k].col + len_Xs[k[0]]]) for k in joint_graphs.keys()} 3088 | joint_tails = {k: np.concatenate([joint_graphs[k].col + len_Xs[k[0]], 3089 | joint_graphs[k].row]) for k in joint_graphs.keys()} 3090 | joint_n_vertices = {k: len_Xs[k[0]] + len_Xs[k[1]] for k in joint_graphs.keys()} 3091 | joint_epochs_per_sample = {k: make_epochs_per_sample( 3092 | np.concatenate([joint_graphs[k].data, joint_graphs[k].data]), n_epochs) for k in joint_graphs.keys()} 3093 | joint_epochs_per_negative_sample = {k: joint_epochs_per_sample[k]/negative_sample_rate for k in joint_graphs.keys()} 3094 | joint_epoch_of_next_negative_sample = {k: np.copy(joint_epochs_per_negative_sample[k]) for k in joint_graphs.keys()} 3095 | joint_epoch_of_next_sample = {k: np.copy(joint_epochs_per_sample[k]) for k in joint_graphs.keys()} 3096 | 3097 | 3098 | 3099 | optimize_fn = numba.njit( 3100 | _optimize_layout_euclidean_single_epoch, fastmath=True, parallel=parallel 3101 | ) 3102 | 3103 | for n in range(n_epochs): 3104 | 3105 | for i in range(len(embeddings)): 3106 | 3107 | if weights[(i,i)] != 0: 3108 | new_embedding = optimize_fn( 3109 | np.copy(embeddings[i]), 3110 | heads[i], 3111 | tails[i], 3112 | n_vertices[i], 3113 | epochs_per_sample[i], 3114 | a, 3115 | b, 3116 | np.random.RandomState(random_state).randint(np.iinfo(np.int32).min + 1, np.iinfo(np.int32).max - 1, 3).astype(np.int64), 3117 | gamma, 3118 | dim, 3119 | move_other, 3120 | alpha, 3121 | epochs_per_negative_sample[i], 3122 | epoch_of_next_negative_sample[i], 3123 | epoch_of_next_sample[i], 3124 | n, 3125 | ) 3126 | embeddings[i] += (new_embedding - embeddings[i]) * weights[(i,i)] 3127 | 3128 | for k in joint_graphs.keys(): 3129 | 3130 | if weights[(k[0], k[1])] != 0 or weights[(k[1], k[0])] != 0: 3131 | new_embeddings = optimize_fn( 3132 | np.concatenate([embeddings[k[0]], embeddings[k[1]]]), 3133 | joint_heads[k], 3134 | joint_tails[k], 3135 | joint_n_vertices[k], 3136 | joint_epochs_per_sample[k], 3137 | a, 3138 | b, 3139 | np.random.RandomState(random_state).randint(np.iinfo(np.int32).min + 1, np.iinfo(np.int32).max - 1, 3).astype(np.int64), 3140 | gamma, 3141 | dim, 3142 | move_other, 3143 | alpha, 3144 | joint_epochs_per_negative_sample[k], 3145 | joint_epoch_of_next_negative_sample[k], 3146 | joint_epoch_of_next_sample[k], 3147 | n, 3148 | ) 3149 | 3150 | embeddings[k[0]] += (new_embeddings[:len(embeddings[k[0]])] - embeddings[k[0]]) * weights[(k[0], k[1])] 3151 | embeddings[k[1]] += (new_embeddings[len(embeddings[k[0]]):] - embeddings[k[1]]) * weights[(k[1], k[0])] 3152 | 3153 | alpha = initial_alpha * (1.0 - (float(n) / float(n_epochs))) 3154 | 3155 | if verbose and n % int(n_epochs / 10) == 0: 3156 | print("\tcompleted ", n, " / ", n_epochs, "epochs") 3157 | 3158 | return embeddings 3159 | 3160 | 3161 | def find_ab_params(spread, min_dist): 3162 | 3163 | 3164 | def curve(x, a, b): 3165 | return 1.0 / (1.0 + a * x ** (2 * b)) 3166 | 3167 | xv = np.linspace(0, spread * 3, 300) 3168 | yv = np.zeros(xv.shape) 3169 | yv[xv < min_dist] = 1.0 3170 | yv[xv >= min_dist] = np.exp(-(xv[xv >= min_dist] - min_dist) / spread) 3171 | params, covar = curve_fit(curve, xv, yv) 3172 | return params[0], params[1] 3173 | 3174 | 3175 | def make_epochs_per_sample(weights, n_epochs): 3176 | 3177 | result = -1.0 * np.ones(weights.shape[0], dtype=np.float64) 3178 | n_samples = n_epochs * (weights / weights.max()) 3179 | result[n_samples > 0] = float(n_epochs) / n_samples[n_samples > 0] 3180 | 
return result 3181 | 3182 | 3183 | def elaborate_relation_dict(dict, list_elems=True): 3184 | new = {} 3185 | for k in dict.keys(): 3186 | if len(k) == 2 and type(k[0]) != tuple and type(k[1]) != tuple: 3187 | new[k] = dict[k] 3188 | elif len(k) == 2: 3189 | k_0 = k[0] 3190 | k_1 = k[1] 3191 | if type(k[0]) != tuple: 3192 | k_0 = (k_0,) 3193 | if type(k[1]) != tuple: 3194 | k_1 = (k_1,) 3195 | for i in range(len(k_0)): 3196 | for j in range(len(k_1)): 3197 | if list_elems: 3198 | new[(k_0[i], k_1[j])] = [dict[k][0][i], dict[k][1][j]] 3199 | else: 3200 | new[(k_0[i], k_1[j])] = dict[k] 3201 | else: 3202 | for i in range(len(k)): 3203 | for j in range(i+1, len(k)): 3204 | if list_elems: 3205 | new[(k[i], k[j])] = [dict[k][i], dict[k][j]] 3206 | else: 3207 | new[(k[i], k[j])] = dict[k] 3208 | return new 3209 | 3210 | def find_weights(strengths, len_Xs, joint_idxs): 3211 | 3212 | if type(strengths) != dict: 3213 | strengths = np.clip(strengths, 0, 1) 3214 | weights = {} 3215 | for i in range(len(len_Xs)): 3216 | for j in range(len(len_Xs)): 3217 | if i == j: 3218 | weights[(i,j)] = strengths[i] 3219 | 3220 | else: 3221 | weights[(i,j)] = 1 - strengths[i] 3222 | else: 3223 | weights = elaborate_relation_dict(strengths, list_elems=False) 3224 | for i in range(len(len_Xs)): 3225 | for j in range(len(len_Xs)): 3226 | if (i,j) not in weights.keys(): 3227 | weights[(i,j)] = 1 3228 | 3229 | 3230 | 3231 | weight_sums = [] 3232 | for i in range(len(len_Xs)): 3233 | weight_sum = 0 3234 | for j in range(len(len_Xs)): 3235 | weight_sum += weights[(i,j)] * len_Xs[j] 3236 | weight_sums += [weight_sum] 3237 | for i in range(len(len_Xs)): 3238 | for j in range(len(len_Xs)): 3239 | weights[(i,j)] *= sum(len_Xs) / weight_sums[i] 3240 | 3241 | 3242 | for k in weights.keys(): 3243 | if k[0] != k[1]: 3244 | if k in joint_idxs.keys(): 3245 | weights[k] *= len(joint_idxs[k][1])/len_Xs[k[1]] 3246 | elif k[::-1] in joint_idxs.keys(): 3247 | weights[k] *= len(joint_idxs[k[::-1]][0])/len_Xs[k[1]] 3248 | else: 3249 | weights[k] = 0 3250 | 3251 | return weights 3252 | 3253 | def MultiGraph(**kwds): 3254 | return MultiMAP(**kwds, graph_only=True) 3255 | 3256 | def MultiMAP(Xs, 3257 | joint={}, 3258 | joint_idxs={}, 3259 | 3260 | metrics=None, 3261 | metric_kwds=None, 3262 | joint_metrics={}, 3263 | 3264 | n_neighbors=None, 3265 | cardinality=None, 3266 | angular=False, 3267 | set_op_mix_ratio=1.0, 3268 | local_connectivity=1.0, 3269 | 3270 | n_components=2, 3271 | spread=1.0, 3272 | min_dist=None, 3273 | init='spectral', 3274 | n_epochs=None, 3275 | a=None, 3276 | b=None, 3277 | strengths=None, 3278 | 3279 | random_state=0, 3280 | 3281 | verbose=False, 3282 | 3283 | graph_only=False, 3284 | ): 3285 | ''' 3286 | Run MultiMAP on a collection of dimensionality reduction matrices. Returns a ``(parameters, 3287 | neighbor_graph, embedding)`` tuple, with the embedding optionally skipped if ``graph_only=True``. 3288 | 3289 | Input 3290 | ----- 3291 | Xs : list of ``np.array`` 3292 | The dimensionality reductions of the datasets to integrate, observations as rows. 3293 | 3294 | >>> Xs = [DR_A, DR_B, DR_C] 3295 | joint : dict of ``np.array`` 3296 | The joint dimensionality reductions generated for all pair combinations of the input 3297 | datasets. 
The keys are to be two-integer tuples, specifying the indices of the two 3298 | datasets in ``Xs`` 3299 | 3300 | >>> joint = {(0,1):DR_AB, (0,2):DR_AC, (1,2):DR_BC} 3301 | graph_only : ``bool``, optional (default: ``False``) 3302 | If ``True``, skip producing the embedding and only return the neighbour graph. 3303 | 3304 | All other arguments as described in ``MultiMAP.Integration()``. 3305 | ''' 3306 | 3307 | #turn off warnings if we're not verbose 3308 | if not verbose: 3309 | warnings.simplefilter('ignore') 3310 | 3311 | for i in range(len(Xs)): 3312 | if not scipy.sparse.issparse(Xs[i]): 3313 | Xs[i] = np.array(Xs[i]) 3314 | len_Xs = [len(i) for i in Xs] 3315 | 3316 | if not joint: 3317 | joint = {tuple(range(len(Xs))): Xs} 3318 | 3319 | joint = elaborate_relation_dict(joint, list_elems=True) 3320 | joint_idxs = elaborate_relation_dict(joint_idxs, list_elems=True) 3321 | joint_metrics = elaborate_relation_dict(joint_metrics, list_elems=False) 3322 | for k in joint.keys(): 3323 | joint[k] = [i.toarray() if scipy.sparse.issparse(i) else np.array(i) for i in joint[k]] 3324 | if k not in joint_idxs.keys(): 3325 | if k[::-1] in joint_idxs.keys(): 3326 | joint_idxs[k] = joint_idxs[k[::-1]] 3327 | else: 3328 | joint_idxs[k] = [np.arange(len_Xs[k[0]]), np.arange(len_Xs[k[1]])] 3329 | if k not in joint_metrics.keys(): 3330 | if k[::-1] in joint_metrics.keys(): 3331 | joint_metrics[k] = joint_metrics[k[::-1]] 3332 | else: 3333 | joint_metrics[k] = 'euclidean' 3334 | 3335 | if metrics is None: 3336 | metrics = ['euclidean' for i in range(len(Xs))] 3337 | if metric_kwds is None: 3338 | metric_kwds = [{} for i in range(len(Xs))] 3339 | 3340 | 3341 | 3342 | 3343 | 3344 | 3345 | 3346 | if n_neighbors is None: 3347 | n_neighbors = 15 * len(Xs) 3348 | if cardinality is None: 3349 | cardinality = np.log2(n_neighbors) 3350 | if min_dist is None: 3351 | min_dist = 0.5 * 15/n_neighbors 3352 | 3353 | if scipy.sparse.issparse(init): 3354 | init = init.toarray() 3355 | else: 3356 | init = np.array(init) 3357 | if n_epochs is None: 3358 | if np.sum(len_Xs) <= 10000: 3359 | n_epochs = 500 3360 | else: 3361 | n_epochs = 200 3362 | if a is None or b is None: 3363 | a, b = find_ab_params(spread, min_dist) 3364 | 3365 | if strengths is None: 3366 | strengths = np.ones(len(Xs))*0.5 3367 | weights = find_weights(strengths, len_Xs, joint_idxs) 3368 | 3369 | if verbose: 3370 | print("Constructing fuzzy simplicial sets ...") 3371 | graphs, joint_graphs, full_graph, weights = fuzzy_simplicial_set( 3372 | Xs, 3373 | joint, 3374 | joint_idxs, 3375 | weights, 3376 | n_neighbors, 3377 | cardinality, 3378 | metrics, 3379 | metric_kwds, 3380 | joint_metrics, 3381 | angular, 3382 | set_op_mix_ratio, 3383 | local_connectivity, 3384 | n_epochs, 3385 | random_state, 3386 | verbose=False 3387 | ) 3388 | 3389 | #set up parameter output 3390 | params = {'n_neighbors': n_neighbors, 3391 | 'metric': metrics[0], 3392 | 'multimap': {'cardinality': cardinality, 3393 | 'set_op_mix_ratio': set_op_mix_ratio, 3394 | 'local_connectivity': local_connectivity, 3395 | 'n_components': n_components, 3396 | 'spread': spread, 3397 | 'min_dist': min_dist, 3398 | 'init': init, 3399 | 'n_epochs': n_epochs, 3400 | 'a': a, 3401 | 'b': b, 3402 | 'strengths': strengths, 3403 | 'random_state': random_state}} 3404 | 3405 | #return parameter and graph tuple 3406 | #TODO: add the distances graph to this once it exists 3407 | if graph_only: 3408 | return (params, full_graph) 3409 | 3410 | if verbose: 3411 | print("Initializing embedding ...") 3412 | embeddings = 
init_layout( 3413 | init, 3414 | Xs, 3415 | graphs, 3416 | n_components, 3417 | metrics, 3418 | metric_kwds, 3419 | random_state 3420 | ) 3421 | 3422 | if verbose: 3423 | print("Optimizing embedding ...") 3424 | embeddings = optimize_layout( 3425 | embeddings, 3426 | graphs, 3427 | joint_graphs, 3428 | weights, 3429 | n_epochs, 3430 | a, 3431 | b, 3432 | random_state, 3433 | gamma=1.0, 3434 | initial_alpha=1.0, 3435 | negative_sample_rate=5.0, 3436 | parallel=False, 3437 | verbose=verbose 3438 | ) 3439 | #undo warning reset 3440 | if not verbose: 3441 | warnings.resetwarnings() 3442 | 3443 | #return an embedding/graph/parameters tuple 3444 | #TODO: add the distances graph to this once it exists 3445 | return (params, full_graph, np.concatenate(embeddings)) 3446 | 3447 | import sklearn 3448 | 3449 | def tfidf(X, n_components, binarize=True, random_state=0): 3450 | from sklearn.feature_extraction.text import TfidfTransformer 3451 | 3452 | sc_count = np.copy(X) 3453 | if binarize: 3454 | sc_count = np.where(sc_count < 1, sc_count, 1) 3455 | 3456 | tfidf = TfidfTransformer(norm='l2', sublinear_tf=True) 3457 | normed_count = tfidf.fit_transform(sc_count) 3458 | 3459 | lsi = sklearn.decomposition.TruncatedSVD(n_components=n_components, random_state=random_state) 3460 | lsi_r = lsi.fit_transform(normed_count) 3461 | 3462 | X_lsi = lsi_r[:,1:] 3463 | return X_lsi -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MultiMAP 2 | **MultiMAP** is a method for integrating single cell multi-omics. MultiMAP can also be used for batch correction. More detail is available in our [paper](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02565-y). 3 | 4 |
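At a glance, integrating two AnnData objects looks roughly like the sketch below. This is a minimal, illustrative example only — the toy data and the argument names (`adatas`, `use_reps`) are placeholders taken from the package's function signatures, so please rely on the Usage and Documentation section further down, and the linked tutorial, for the authoritative interface.

```python
import anndata
import numpy as np
import MultiMAP

# toy stand-ins for an RNA and an ATAC dataset, each carrying its own
# dimensionality reduction in .obsm (PCA-like and LSI-like coordinates here)
rna = anndata.AnnData(np.random.rand(300, 200))
rna.obsm['X_pca'] = np.random.rand(300, 20)
atac = anndata.AnnData(np.random.rand(250, 200))
atac.obsm['X_lsi'] = np.random.rand(250, 20)

# integrate; the result is a single object holding all cells with a joint embedding
adata = MultiMAP.Integration(adatas=[rna, atac], use_reps=['X_pca', 'X_lsi'])
```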
<img src="docs/MultiMAP_schematic.png">
5 | 6 | 7 | ## Installation 8 | 9 | ```bash 10 | pip3 install git+https://github.com/Teichlab/MultiMAP.git 11 | ``` 12 | 13 | ## Usage and Documentation 14 | 15 | MultiMAP offers two functions accepting AnnData objects on input: 16 | - `MultiMAP.Integration()` expects a list of one AnnData per dataset, with the desired dimensionality reduction precomputed and stored in `.obsm`. This allows for refining the initial dimensionality reduction, e.g. if wishing to use `TFIDF_LSI` for ATAC data and PCA for RNA data. 17 | - `MultiMAP.Batch()` expects a single AnnData object with the dataset information stored in an `.obs` column. This allows for convenient integration with minimal preparation if all datasets can be treated with the same dimensionality reduction. 18 | 19 | There's also an AnnData-independent `MultiMAP.matrix.MultiMAP()` function which operates directly on dimensionality reduction matrices. This requires precomputing all pairwise dimensionality reductions prior to calling MultiMAP. 20 | 21 | A tutorial covering both RNA-ATAC integration and RNA-Seq batch correction use can be found [here](https://nbviewer.jupyter.org/github/Teichlab/MultiMAP/blob/master/examples/tutorial.ipynb). 22 | 23 | Documentation of the function parameters can be found on [ReadTheDocs](https://multimap.readthedocs.io/en/latest/). 24 | 25 | ## Citation 26 | 27 | If your work uses MultiMAP, please cite the [paper](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02565-y): 28 | 29 | @article{jain2021multimap, 30 | title={MultiMAP: dimensionality reduction and integration of multimodal data}, 31 | author={Jain, Mika Sarkin and Polanski, Krzysztof and Conde, Cecilia Dominguez and Chen, Xi and Park, Jongeun and Mamanova, Lira and Knights, Andrew and Botting, Rachel A and Stephenson, Emily and Haniffa, Muzlifah and others}, 32 | journal={Genome biology}, 33 | volume={22}, 34 | number={1}, 35 | pages={1--26}, 36 | year={2021}, 37 | publisher={BioMed Central} 38 | } 39 | 40 | ## Contact 41 | 42 | Mika Sarkin Jain - mikasarkinjain@gmail.com \ 43 | Mirjana Efremova - m.efremova@qmul.ac.uk \ 44 | Sarah Teichmann - st9@sanger.ac.uk 45 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/MultiMAP_schematic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Teichlab/MultiMAP/681e608c45cdb6b139dfb6700e40c7520bc6096d/docs/MultiMAP_schematic.png -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('..')) 16 | autodoc_mock_imports = ['anndata','scanpy','numpy','scipy','numba','scipy.optimize', 17 | 'sklearn.neighbors','sklearn.metrics','warnings','scipy.sparse', 18 | 'locale','sklearn.utils','annoy','faiss','scipy.sparse.csgraph', 19 | 'sklearn.metrics','sklearn.manifold','sklearn'] 20 | 21 | # -- Project information ----------------------------------------------------- 22 | 23 | project = 'MultiMAP' 24 | copyright = '2020-2021, Mika Sarkin Jain' 25 | author = 'Mika Sarkin Jain' 26 | 27 | # The full version, including alpha/beta/rc tags 28 | release = '0.0.1' 29 | 30 | 31 | # -- General configuration --------------------------------------------------- 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 35 | # ones. 36 | extensions = ['sphinx.ext.autodoc'] 37 | 38 | # Add any paths that contain templates here, relative to this directory. 39 | templates_path = ['_templates'] 40 | 41 | # List of patterns, relative to source directory, that match files and 42 | # directories to ignore when looking for source files. 43 | # This pattern also affects html_static_path and html_extra_path. 44 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 45 | 46 | # -- Options for HTML output ------------------------------------------------- 47 | 48 | # The theme to use for HTML and HTML Help pages. See the documentation for 49 | # a list of builtin themes. 50 | # 51 | html_theme = 'sphinx_rtd_theme' 52 | 53 | # Add any paths that contain custom static files (such as style sheets) here, 54 | # relative to this directory. They are copied after the builtin static files, 55 | # so a file named "default.css" will overwrite the builtin "default.css". 56 | html_static_path = ['_static'] -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. MultiMAP documentation master file, created by 2 | sphinx-quickstart on Wed Dec 2 10:49:54 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | MultiMAP 7 | ======== 8 | 9 | .. 
automodule:: MultiMAP 10 | :members: Integration, Batch, TFIDF_LSI 11 | 12 | .. automodule:: MultiMAP.matrix 13 | :members: MultiMAP 14 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx-rtd-theme 2 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='MultiMAP', 5 | version='0.0.1', 6 | description='MultiMAP', 7 | url='https://github.com/Teichlab/MultiMAP', 8 | packages=find_packages(exclude=['docs', 'examples']), 9 | install_requires=['numpy','scipy','numba','scikit-learn'], 10 | author='Mika Sarkin Jain', 11 | author_email='mikasarkinjain@gmail.com', 12 | license='MIT' 13 | ) 14 | --------------------------------------------------------------------------------
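As a closing reference for the flattened `MultiMAP/matrix.py` listing above: the matrix-level `MultiMAP()` entry point takes a list of per-dataset dimensionality reductions plus a dictionary of pairwise joint reductions keyed by dataset-index tuples, and returns a `(parameters, neighbour_graph, embedding)` tuple, as described in its docstring. The sketch below exercises that interface on made-up data; the array shapes and the random "joint" reduction are placeholders for illustration only, not a tested workflow.

```python
import numpy as np
from MultiMAP.matrix import MultiMAP, MultiGraph

rng = np.random.default_rng(0)

# per-dataset dimensionality reductions (toy data), observations as rows
DR_A = rng.normal(size=(300, 20))
DR_B = rng.normal(size=(200, 15))

# a joint reduction computed over both datasets together, split back per dataset;
# in real use this would come from e.g. a PCA of a merged, matched feature space
joint_AB = rng.normal(size=(500, 10))
joint = {(0, 1): [joint_AB[:300], joint_AB[300:]]}

# (parameters, neighbour graph, embedding) per the docstring; the embedding has
# one row per cell across both datasets and n_components=2 columns by default
params, graph, embedding = MultiMAP(Xs=[DR_A, DR_B], joint=joint)

# MultiGraph() is the graph_only=True shortcut, returning just (parameters, graph)
params, graph = MultiGraph(Xs=[DR_A, DR_B], joint=joint)
```

In this layout the per-dataset matrices in `Xs` drive the within-dataset neighbourhoods, while the paired matrices in `joint` supply the cross-dataset neighbourhoods that the fuzzy simplicial sets are stitched from — which is why every pair of datasets that should be connected needs an entry in `joint`.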