├── .gitignore ├── .readthedocs.yaml ├── LICENSE ├── MANIFEST.in ├── MultiMAP ├── __init__.py └── matrix.py ├── README.md ├── docs ├── Makefile ├── MultiMAP_schematic.png ├── conf.py ├── index.rst ├── make.bat └── requirements.txt ├── examples └── tutorial.ipynb └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | MultiMAP/__pycache__ 3 | docs/_build 4 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-20.04 11 | tools: 12 | python: "3.9" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | 18 | # Install our python package before building the docs 19 | python: 20 | install: 21 | - requirements: docs/requirements.txt 22 | - method: pip 23 | path: . 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021 Mika Sarkin Jain 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | -------------------------------------------------------------------------------- /MultiMAP/__init__.py: -------------------------------------------------------------------------------- 1 | import scipy 2 | import numpy as np 3 | from MultiMAP.matrix import MultiMAP, tfidf 4 | #you don't need these if going for MultiMAP.matrix functions 5 | try: 6 | import anndata 7 | except ImportError: 8 | pass 9 | try: 10 | import scanpy as sc 11 | except ImportError: 12 | pass 13 | 14 | def TFIDF_LSI(adata, n_comps=50, binarize=True, random_state=0): 15 | ''' 16 | Computes LSI based on a TF-IDF transformation of the data. Putative dimensionality 17 | reduction for scATAC-seq data prior to MultiMAP. Adds an ``.obsm['X_lsi']`` field to 18 | the object it was ran on. 
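	A minimal usage sketch (illustrative only; ``atac`` stands in for a preprocessed
	peak-count ``AnnData`` and is not defined in this package)::

		import MultiMAP
		#atac is an assumed AnnData of (binarised) peak counts
		MultiMAP.TFIDF_LSI(atac, n_comps=50)
		#the LSI coordinates land in atac.obsm['X_lsi'], ready to be passed to Integration() via use_reps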
19 | 20 | Input 21 | ----- 22 | adata : ``AnnData`` 23 | The object to run TFIDF + LSI on. Will use ``.X`` as the input data. 24 | n_comps : ``int`` 25 | The number of components to generate. Default: 50 26 | binarize : ``bool`` 27 | Whether to binarize the data prior to the computation. Often done during scATAC-seq 28 | processing. Default: True 29 | random_state : ``int`` 30 | The seed to use for randon number generation. Default: 0 31 | ''' 32 | 33 | #this is just a very basic wrapper for the non-adata function 34 | if scipy.sparse.issparse(adata.X): 35 | adata.obsm['X_lsi'] = tfidf(adata.X.todense(), n_components=n_comps, binarize=binarize, random_state=random_state) 36 | else: 37 | adata.obsm['X_lsi'] = tfidf(adata.X, n_components=n_comps, binarize=binarize, random_state=random_state) 38 | 39 | def Wrapper(flagged, use_reps, embedding, seed, **kwargs): 40 | ''' 41 | A function that computes the paired PCAs between the datasets to integrate, calls MultiMAP 42 | proper, and returns a (parameters, connectivities, embedding) tuple. Embedding optional 43 | depending on ``embedding``. 44 | 45 | Input 46 | ----- 47 | flagged : list of ``AnnData`` 48 | Preprocessed objects to integrate. Need to have the single-dataset DRs computed at 49 | this stage. Need to have ``.obs[\'multimap_index\']`` defined, incrementing integers 50 | matching the object's index in the list. Both ``Integrate()`` and ``Batch()`` make 51 | these. 52 | 53 | All other arguments as described in ``MultiMAP.Integration()``. 54 | ''' 55 | #MultiMAP wants the shared PCAs delivered as a dictionary, with the subset indices 56 | #tupled up as a key. let's make that then 57 | joint = {} 58 | #process all dataset pairs 59 | for ind1 in np.arange(len(flagged)-1): 60 | for ind2 in np.arange(ind1+1, len(flagged)): 61 | subset = (ind1, ind2) 62 | #collapse into a single object and run a PCA 63 | adata = flagged[ind1].concatenate(flagged[ind2], join='inner') 64 | sc.tl.pca(adata) 65 | #preserve space by deleting the intermediate object and just keeping its PCA 66 | #and multimap index thing 67 | X_pca = adata.obsm['X_pca'].copy() 68 | multimap_index = adata.obs['multimap_index'].values 69 | del adata 70 | #store the results in joint, which involves some further acrobatics 71 | joint[subset] = [] 72 | #extract the coordinates for this particular element in the original list, using 73 | #the multimap_index .obs column we created before. handy! 74 | for i in subset: 75 | joint[subset].append(X_pca[multimap_index == i, :]) 76 | 77 | #with the joint prepped, we just need to extract the primary dimensionality reductions 78 | #and we're good to go here 79 | Xs = [] 80 | for adata, use_rep in zip(flagged, use_reps): 81 | Xs.append(adata.obsm[use_rep]) 82 | 83 | #set seed 84 | np.random.seed(seed) 85 | 86 | #and with that, we're now truly free to call the MultiMAP function 87 | #need to negate embedding and provide that as graph_only for the function to understand 88 | mmp = MultiMAP(Xs=Xs, joint=joint, graph_only=(not embedding), **kwargs) 89 | 90 | #and that's it. spit this out for the other wrappers to use however 91 | return mmp 92 | 93 | def Integration(adatas, use_reps, scale=True, embedding=True, seed=0, **kwargs): 94 | ''' 95 | Run MultiMAP to integrate a number of AnnData objects from various multi-omics experiments 96 | into a single joint dimensionally reduced space. Returns a joint object with the resulting 97 | embedding stored in ``.obsm[\'X_multimap\']`` (if instructed) and appropriate graphs in 98 | ``.obsp``. 
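	As a minimal usage sketch (illustrative only; ``rna`` and ``atac`` are assumed
	``AnnData`` objects sharing a common feature space, with ``.obsm['X_pca']`` and
	``.obsm['X_lsi']`` precomputed)::

		import scanpy as sc
		import MultiMAP
		#rna and atac are assumed preprocessed AnnData objects
		adata = MultiMAP.Integration([rna, atac], ['X_pca', 'X_lsi'])
		sc.pl.embedding(adata, 'X_multimap')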
The final object will be a concatenation of the individual ones provided on 99 | input, so in the interest of ease of exploration it is recommended to have non-scaled data 100 | in ``.X``. 101 | 102 | Input 103 | ----- 104 | adatas : list of ``AnnData`` 105 | The objects to integrate. The ``.var`` spaces will be intersected across subsets of 106 | the objects to compute shared PCAs, so make sure that you have ample features in 107 | common between the objects. ``.X`` data will be used for computation. 108 | use_reps : list of ``str`` 109 | The ``.obsm`` fields for each of the corresponding ``adatas`` to use as the 110 | dimensionality reduction to represent the full feature space of the object. Needs 111 | to be precomputed and present in the object at the time of calling the function. 112 | scale : ``bool``, optional (default: ``True``) 113 | Whether to scale the data to N(0,1) on a per-dataset basis prior to computing the 114 | cross-dataset PCAs. Improves integration. 115 | embedding : ``bool``, optional (default: ``True``) 116 | Whether to compute the MultiMAP embedding. If ``False``, will just return the graph, 117 | which can be used to compute a regular UMAP. This can produce a manifold quicker, 118 | but at the cost of accuracy. 119 | n_neighbors : ``int`` or ``None``, optional (default: ``None``) 120 | The number of neighbours for each node (data point) in the MultiGraph. If ``None``, 121 | defaults to 15 times the number of input datasets. 122 | n_components : ``int`` (default: 2) 123 | The number of dimensions of the MultiMAP embedding. 124 | seed : ``int`` (default: 0) 125 | RNG seed. 126 | strengths: ``list`` of ``float`` or ``None`` (default: ``None``) 127 | The relative contribution of each dataset to the layout of the embedding. The 128 | higher the strength the higher the weighting of its cross entropy in the layout loss. 129 | If provided, needs to be a list with one 0-1 value per dataset; if ``None``, defaults 130 | to 0.5 for each dataset. 131 | cardinality : ``float`` or ``None``, optional (default: ``None``) 132 | The target sum of the connectivities of each neighbourhood in the MultiGraph. If 133 | ``None``, defaults to ``log2(n_neighbors)``. 134 | 135 | The following parameter definitions are sourced from UMAP 0.5.1: 136 | 137 | n_epochs : int (optional, default None) 138 | The number of training epochs to be used in optimizing the 139 | low dimensional embedding. Larger values result in more accurate 140 | embeddings. If None is specified a value will be selected based on 141 | the size of the input dataset (200 for large datasets, 500 for small). 142 | init : string (optional, default 'spectral') 143 | How to initialize the low dimensional embedding. Options are: 144 | * 'spectral': use a spectral embedding of the fuzzy 1-skeleton 145 | * 'random': assign initial embedding positions at random. 146 | * A numpy array of initial embedding positions. 147 | min_dist : float (optional, default 0.1) 148 | The effective minimum distance between embedded points. Smaller values 149 | will result in a more clustered/clumped embedding where nearby points 150 | on the manifold are drawn closer together, while larger values will 151 | result on a more even dispersal of points. The value should be set 152 | relative to the ``spread`` value, which determines the scale at which 153 | embedded points will be spread out. 154 | spread : float (optional, default 1.0) 155 | The effective scale of embedded points. 
In combination with ``min_dist`` 156 | this determines how clustered/clumped the embedded points are. 157 | set_op_mix_ratio : float (optional, default 1.0) 158 | Interpolate between (fuzzy) union and intersection as the set operation 159 | used to combine local fuzzy simplicial sets to obtain a global fuzzy 160 | simplicial sets. Both fuzzy set operations use the product t-norm. 161 | The value of this parameter should be between 0.0 and 1.0; a value of 162 | 1.0 will use a pure fuzzy union, while 0.0 will use a pure fuzzy 163 | intersection. 164 | local_connectivity : int (optional, default 1) 165 | The local connectivity required -- i.e. the number of nearest 166 | neighbors that should be assumed to be connected at a local level. 167 | The higher this value the more connected the manifold becomes 168 | locally. In practice this should be not more than the local intrinsic 169 | dimension of the manifold. 170 | a : float (optional, default None) 171 | More specific parameters controlling the embedding. If None these 172 | values are set automatically as determined by ``min_dist`` and 173 | ``spread``. 174 | b : float (optional, default None) 175 | More specific parameters controlling the embedding. If None these 176 | values are set automatically as determined by ``min_dist`` and 177 | ``spread``. 178 | ''' 179 | 180 | #the main thing will be pulling out the various subsets of the adatas, sticking them 181 | #together, running joint PCAs, and then splitting up the joint PCAs into datasets of 182 | #origin. to do so, let's introduce a helper .obs column in copied versions of adatas 183 | flagged = [] 184 | for i, adata in enumerate(adatas): 185 | flagged.append(adata.copy()) 186 | #while we're at it, may as well potentially scale our data copy 187 | if scale: 188 | sc.pp.scale(flagged[-1]) 189 | flagged[-1].obs['multimap_index'] = i 190 | 191 | #call the wrapper. returns (params, connectivities, embedding), with embedding optional 192 | mmp = Wrapper(flagged=flagged, use_reps=use_reps, embedding=embedding, seed=seed, **kwargs) 193 | 194 | #make one happy collapsed object and shove the stuff in correct places 195 | #outer join to capture as much gene information as possible for annotation 196 | adata = anndata.concat(adatas, join='outer') 197 | if embedding: 198 | adata.obsm['X_multimap'] = mmp[2] 199 | #the graph is weighted, the higher the better, 1 best. sounds similar to connectivities 200 | #TODO: slot distances into .obsp['distances'] 201 | adata.obsp['connectivities'] = mmp[1] 202 | #set up .uns['neighbors'], setting method to umap as these are connectivities 203 | adata.uns['neighbors'] = {} 204 | adata.uns['neighbors']['params'] = mmp[0] 205 | adata.uns['neighbors']['params']['method'] = 'umap' 206 | adata.uns['neighbors']['distances_key'] = 'distances' 207 | adata.uns['neighbors']['connectivities_key'] = 'connectivities' 208 | return adata 209 | 210 | def Batch(adata, batch_key='batch', scale=True, embedding=True, seed=0, dimred_func=None, rep_name='X_pca', **kwargs): 211 | ''' 212 | Run MultiMAP to correct batch effect within a single AnnData object. Loses the flexibility 213 | of individualised dimensionality reduction choices, but doesn't require a list of separate 214 | objects for each batch/dataset to integrate. Runs PCA on a per-batch/dataset basis prior 215 | to performing an analysis analogous to ``Integration()``. Adds appropriate ``.obsp`` graphs 216 | and ``.obsm[\'X_multimap\']`` (if instructed) to the input. 
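	A minimal usage sketch (illustrative only; ``adata`` with a categorical
	``.obs['batch']`` column is assumed)::

		import scanpy as sc
		import MultiMAP
		#adata is an assumed AnnData with a 'batch' column in .obs
		MultiMAP.Batch(adata, batch_key='batch')
		sc.pl.embedding(adata, 'X_multimap', color='batch')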
217 | 218 | Input 219 | ----- 220 | adata : ``AnnData`` 221 | The object to process. ``.X`` data will be used in the computation. 222 | batch_key : ``str``, optional (default: "batch") 223 | The ``.obs`` column of the input object with the categorical variable defining the 224 | batch/dataset grouping to integrate on. 225 | scale : ``bool``, optional (default: ``True``) 226 | Whether to scale the data to N(0,1) on a per-dataset basis prior to computing the 227 | cross-dataset PCAs. Improves integration. 228 | embedding : ``bool``, optional (default: ``True``) 229 | Whether to compute the MultiMAP embedding. If ``False``, will just return the graph, 230 | which can be used to compute a regular UMAP. This can produce a manifold quicker, 231 | but at the cost of accuracy. 232 | dimred_func : function or ``None``, optional (default: ``None``) 233 | The function to use to compute dimensionality reduction on a per-dataset basis. Must 234 | accept an ``AnnData`` on input and modify it by inserting its dimensionality reduction 235 | into ``.obsm``. If ``None``, ``scanpy.tl.pca()`` will be used. 236 | rep_name : ``str``, optional (default: "X_pca") 237 | The ``.obsm`` field that the dimensionality reduction function stores its output under. 238 | 239 | All other arguments as described in ``Integration()``. 240 | ''' 241 | 242 | #as promised in the docstring, set dimred_func to scanpy PCA if not provided 243 | if dimred_func is None: 244 | dimred_func = sc.tl.pca 245 | 246 | #essentially what this function does is preps data to run through the other wrapper 247 | #so what needs to happen is the object needs to be partitioned up, have DR ran, 248 | #and passed as a list to the wrapper function 249 | flagged = [] 250 | flagged_ids = [] 251 | use_reps = [] 252 | for i,batch in enumerate(np.unique(adata.obs[batch_key])): 253 | #extract the single batch data 254 | flagged.append(adata[adata.obs[batch_key]==batch].copy()) 255 | #potentially scale 256 | if scale: 257 | sc.pp.scale(flagged[-1]) 258 | #and run DR 259 | dimred_func(flagged[-1]) 260 | #and stick on the index for multimap to pull stuff apart later 261 | flagged[-1].obs['multimap_index'] = i 262 | #and add an entry to the list of .obsm keys for the other function 263 | use_reps.append(rep_name) 264 | #and store the cell name ordering for later 265 | flagged_ids = flagged_ids + list(flagged[-1].obs_names) 266 | 267 | #call the wrapper. returns (params, connectivities, embedding), with embedding optional 268 | mmp = Wrapper(flagged=flagged, use_reps=use_reps, embedding=embedding, seed=seed, **kwargs) 269 | 270 | #this output has the cells ordered as a concatenation of the individual flagged objects 271 | #so need to figure out how to reorder the output to get the original cell order 272 | #doing the following operation sets the desired order to adata.obs_names 273 | #and checks the index for each in flagged_ids 274 | #so taking something in flagged_ids order and using sort_order on it will match obs_names 275 | sort_order = [flagged_ids.index(i) for i in list(adata.obs_names)] 276 | 277 | #stick stuff where it's supposed to go 278 | if embedding: 279 | adata.obsm['X_multimap'] = mmp[2][sort_order,:] 280 | #the graph is weighted, the higher the better, 1 best. 
sounds similar to connectivities 281 | #TODO: slot distances into .obsp['distances'] 282 | adata.obsp['connectivities'] = mmp[1][sort_order,:][:,sort_order] 283 | #set up .uns['neighbors'], setting method to umap as these are connectivities 284 | adata.uns['neighbors'] = {} 285 | adata.uns['neighbors']['params'] = mmp[0] 286 | adata.uns['neighbors']['params']['method'] = 'umap' 287 | adata.uns['neighbors']['distances_key'] = 'distances' 288 | adata.uns['neighbors']['connectivities_key'] = 'connectivities' -------------------------------------------------------------------------------- /MultiMAP/matrix.py: -------------------------------------------------------------------------------- 1 | # Partially based on codebase by Leland McInnes (https://github.com/lmcinnes/umap) 2 | 3 | from __future__ import print_function 4 | 5 | import numpy as np 6 | import numba 7 | import scipy 8 | from scipy.optimize import curve_fit 9 | from sklearn.neighbors import KDTree 10 | from sklearn.metrics import pairwise_distances 11 | 12 | import warnings 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | #INT32_MIN = np.iinfo(np.int32).min + 1 22 | #INT32_MAX = np.iinfo(np.int32).max - 1 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | from collections import deque, namedtuple 35 | from warnings import warn 36 | 37 | import numpy as np 38 | import numba 39 | 40 | #from umap.sparse import sparse_mul, sparse_diff, sparse_sum 41 | 42 | #from umap.utils import tau_rand_int, norm 43 | 44 | import scipy.sparse 45 | import locale 46 | 47 | locale.setlocale(locale.LC_NUMERIC, "C") 48 | 49 | 50 | EPS = 1e-8 51 | 52 | RandomProjectionTreeNode = namedtuple( 53 | "RandomProjectionTreeNode", 54 | ["indices", "is_leaf", "hyperplane", "offset", "left_child", "right_child"], 55 | ) 56 | 57 | FlatTree = namedtuple("FlatTree", ["hyperplanes", "offsets", "children", "indices"]) 58 | 59 | 60 | @numba.njit(fastmath=True) 61 | def angular_random_projection_split(data, indices, rng_state): 62 | 63 | dim = data.shape[1] 64 | 65 | 66 | left_index = tau_rand_int(rng_state) % indices.shape[0] 67 | right_index = tau_rand_int(rng_state) % indices.shape[0] 68 | right_index += left_index == right_index 69 | right_index = right_index % indices.shape[0] 70 | left = indices[left_index] 71 | right = indices[right_index] 72 | 73 | left_norm = norm(data[left]) 74 | right_norm = norm(data[right]) 75 | 76 | if abs(left_norm) < EPS: 77 | left_norm = 1.0 78 | 79 | if abs(right_norm) < EPS: 80 | right_norm = 1.0 81 | 82 | 83 | 84 | hyperplane_vector = np.empty(dim, dtype=np.float32) 85 | 86 | for d in range(dim): 87 | hyperplane_vector[d] = (data[left, d] / left_norm) - ( 88 | data[right, d] / right_norm 89 | ) 90 | 91 | hyperplane_norm = norm(hyperplane_vector) 92 | if abs(hyperplane_norm) < EPS: 93 | hyperplane_norm = 1.0 94 | 95 | for d in range(dim): 96 | hyperplane_vector[d] = hyperplane_vector[d] / hyperplane_norm 97 | 98 | 99 | 100 | 101 | n_left = 0 102 | n_right = 0 103 | side = np.empty(indices.shape[0], np.int8) 104 | for i in range(indices.shape[0]): 105 | margin = 0.0 106 | for d in range(dim): 107 | margin += hyperplane_vector[d] * data[indices[i], d] 108 | 109 | if abs(margin) < EPS: 110 | side[i] = tau_rand_int(rng_state) % 2 111 | if side[i] == 0: 112 | n_left += 1 113 | else: 114 | n_right += 1 115 | elif margin > 0: 116 | side[i] = 0 117 | n_left += 1 118 | else: 119 | side[i] = 1 120 | n_right += 1 121 | 122 | 123 | indices_left = np.empty(n_left, dtype=np.int64) 124 | indices_right = np.empty(n_right, dtype=np.int64) 125 | 126 
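    #(comment added for clarity, not in the original source) the loop below distributes
    #each index into indices_left/indices_right according to the side labels computed
    #from the hyperplane margins above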
| 127 | n_left = 0 128 | n_right = 0 129 | for i in range(side.shape[0]): 130 | if side[i] == 0: 131 | indices_left[n_left] = indices[i] 132 | n_left += 1 133 | else: 134 | indices_right[n_right] = indices[i] 135 | n_right += 1 136 | 137 | return indices_left, indices_right, hyperplane_vector, None 138 | 139 | 140 | @numba.njit(fastmath=True, nogil=True) 141 | def euclidean_random_projection_split(data, indices, rng_state): 142 | 143 | dim = data.shape[1] 144 | 145 | 146 | left_index = tau_rand_int(rng_state) % indices.shape[0] 147 | right_index = tau_rand_int(rng_state) % indices.shape[0] 148 | right_index += left_index == right_index 149 | right_index = right_index % indices.shape[0] 150 | left = indices[left_index] 151 | right = indices[right_index] 152 | 153 | 154 | 155 | hyperplane_offset = 0.0 156 | hyperplane_vector = np.empty(dim, dtype=np.float32) 157 | 158 | for d in range(dim): 159 | hyperplane_vector[d] = data[left, d] - data[right, d] 160 | hyperplane_offset -= ( 161 | hyperplane_vector[d] * (data[left, d] + data[right, d]) / 2.0 162 | ) 163 | 164 | 165 | 166 | 167 | n_left = 0 168 | n_right = 0 169 | side = np.empty(indices.shape[0], np.int8) 170 | for i in range(indices.shape[0]): 171 | margin = hyperplane_offset 172 | for d in range(dim): 173 | margin += hyperplane_vector[d] * data[indices[i], d] 174 | 175 | if abs(margin) < EPS: 176 | side[i] = tau_rand_int(rng_state) % 2 177 | if side[i] == 0: 178 | n_left += 1 179 | else: 180 | n_right += 1 181 | elif margin > 0: 182 | side[i] = 0 183 | n_left += 1 184 | else: 185 | side[i] = 1 186 | n_right += 1 187 | 188 | 189 | indices_left = np.empty(n_left, dtype=np.int64) 190 | indices_right = np.empty(n_right, dtype=np.int64) 191 | 192 | 193 | n_left = 0 194 | n_right = 0 195 | for i in range(side.shape[0]): 196 | if side[i] == 0: 197 | indices_left[n_left] = indices[i] 198 | n_left += 1 199 | else: 200 | indices_right[n_right] = indices[i] 201 | n_right += 1 202 | 203 | return indices_left, indices_right, hyperplane_vector, hyperplane_offset 204 | 205 | 206 | @numba.njit(fastmath=True) 207 | def sparse_angular_random_projection_split(inds, indptr, data, indices, rng_state): 208 | 209 | 210 | left_index = tau_rand_int(rng_state) % indices.shape[0] 211 | right_index = tau_rand_int(rng_state) % indices.shape[0] 212 | right_index += left_index == right_index 213 | right_index = right_index % indices.shape[0] 214 | left = indices[left_index] 215 | right = indices[right_index] 216 | 217 | left_inds = inds[indptr[left] : indptr[left + 1]] 218 | left_data = data[indptr[left] : indptr[left + 1]] 219 | right_inds = inds[indptr[right] : indptr[right + 1]] 220 | right_data = data[indptr[right] : indptr[right + 1]] 221 | 222 | left_norm = norm(left_data) 223 | right_norm = norm(right_data) 224 | 225 | if abs(left_norm) < EPS: 226 | left_norm = 1.0 227 | 228 | if abs(right_norm) < EPS: 229 | right_norm = 1.0 230 | 231 | 232 | 233 | normalized_left_data = left_data / left_norm 234 | normalized_right_data = right_data / right_norm 235 | hyperplane_inds, hyperplane_data = sparse_diff( 236 | left_inds, normalized_left_data, right_inds, normalized_right_data 237 | ) 238 | 239 | hyperplane_norm = norm(hyperplane_data) 240 | if abs(hyperplane_norm) < EPS: 241 | hyperplane_norm = 1.0 242 | for d in range(hyperplane_data.shape[0]): 243 | hyperplane_data[d] = hyperplane_data[d] / hyperplane_norm 244 | 245 | 246 | 247 | 248 | n_left = 0 249 | n_right = 0 250 | side = np.empty(indices.shape[0], np.int8) 251 | for i in range(indices.shape[0]): 252 | margin 
= 0.0 253 | 254 | i_inds = inds[indptr[indices[i]] : indptr[indices[i] + 1]] 255 | i_data = data[indptr[indices[i]] : indptr[indices[i] + 1]] 256 | 257 | mul_inds, mul_data = sparse_mul( 258 | hyperplane_inds, hyperplane_data, i_inds, i_data 259 | ) 260 | for d in range(mul_data.shape[0]): 261 | margin += mul_data[d] 262 | 263 | if abs(margin) < EPS: 264 | side[i] = tau_rand_int(rng_state) % 2 265 | if side[i] == 0: 266 | n_left += 1 267 | else: 268 | n_right += 1 269 | elif margin > 0: 270 | side[i] = 0 271 | n_left += 1 272 | else: 273 | side[i] = 1 274 | n_right += 1 275 | 276 | 277 | indices_left = np.empty(n_left, dtype=np.int64) 278 | indices_right = np.empty(n_right, dtype=np.int64) 279 | 280 | 281 | n_left = 0 282 | n_right = 0 283 | for i in range(side.shape[0]): 284 | if side[i] == 0: 285 | indices_left[n_left] = indices[i] 286 | n_left += 1 287 | else: 288 | indices_right[n_right] = indices[i] 289 | n_right += 1 290 | 291 | hyperplane = np.vstack((hyperplane_inds, hyperplane_data)) 292 | 293 | return indices_left, indices_right, hyperplane, None 294 | 295 | 296 | @numba.njit(fastmath=True) 297 | def sparse_euclidean_random_projection_split(inds, indptr, data, indices, rng_state): 298 | 299 | 300 | left_index = tau_rand_int(rng_state) % indices.shape[0] 301 | right_index = tau_rand_int(rng_state) % indices.shape[0] 302 | right_index += left_index == right_index 303 | right_index = right_index % indices.shape[0] 304 | left = indices[left_index] 305 | right = indices[right_index] 306 | 307 | left_inds = inds[indptr[left] : indptr[left + 1]] 308 | left_data = data[indptr[left] : indptr[left + 1]] 309 | right_inds = inds[indptr[right] : indptr[right + 1]] 310 | right_data = data[indptr[right] : indptr[right + 1]] 311 | 312 | 313 | 314 | hyperplane_offset = 0.0 315 | hyperplane_inds, hyperplane_data = sparse_diff( 316 | left_inds, left_data, right_inds, right_data 317 | ) 318 | offset_inds, offset_data = sparse_sum(left_inds, left_data, right_inds, right_data) 319 | offset_data = offset_data / 2.0 320 | offset_inds, offset_data = sparse_mul( 321 | hyperplane_inds, hyperplane_data, offset_inds, offset_data 322 | ) 323 | 324 | for d in range(offset_data.shape[0]): 325 | hyperplane_offset -= offset_data[d] 326 | 327 | 328 | 329 | 330 | n_left = 0 331 | n_right = 0 332 | side = np.empty(indices.shape[0], np.int8) 333 | for i in range(indices.shape[0]): 334 | margin = hyperplane_offset 335 | i_inds = inds[indptr[indices[i]] : indptr[indices[i] + 1]] 336 | i_data = data[indptr[indices[i]] : indptr[indices[i] + 1]] 337 | 338 | mul_inds, mul_data = sparse_mul( 339 | hyperplane_inds, hyperplane_data, i_inds, i_data 340 | ) 341 | for d in range(mul_data.shape[0]): 342 | margin += mul_data[d] 343 | 344 | if abs(margin) < EPS: 345 | side[i] = tau_rand_int(rng_state) % 2 346 | if side[i] == 0: 347 | n_left += 1 348 | else: 349 | n_right += 1 350 | elif margin > 0: 351 | side[i] = 0 352 | n_left += 1 353 | else: 354 | side[i] = 1 355 | n_right += 1 356 | 357 | 358 | indices_left = np.empty(n_left, dtype=np.int64) 359 | indices_right = np.empty(n_right, dtype=np.int64) 360 | 361 | 362 | n_left = 0 363 | n_right = 0 364 | for i in range(side.shape[0]): 365 | if side[i] == 0: 366 | indices_left[n_left] = indices[i] 367 | n_left += 1 368 | else: 369 | indices_right[n_right] = indices[i] 370 | n_right += 1 371 | 372 | hyperplane = np.vstack((hyperplane_inds, hyperplane_data)) 373 | 374 | return indices_left, indices_right, hyperplane, hyperplane_offset 375 | 376 | 377 | def make_euclidean_tree(data, 
indices, rng_state, leaf_size=30): 378 | if indices.shape[0] > leaf_size: 379 | left_indices, right_indices, hyperplane, offset = euclidean_random_projection_split( 380 | data, indices, rng_state 381 | ) 382 | 383 | left_node = make_euclidean_tree(data, left_indices, rng_state, leaf_size) 384 | right_node = make_euclidean_tree(data, right_indices, rng_state, leaf_size) 385 | 386 | node = RandomProjectionTreeNode( 387 | None, False, hyperplane, offset, left_node, right_node 388 | ) 389 | else: 390 | node = RandomProjectionTreeNode(indices, True, None, None, None, None) 391 | 392 | return node 393 | 394 | 395 | def make_angular_tree(data, indices, rng_state, leaf_size=30): 396 | if indices.shape[0] > leaf_size: 397 | left_indices, right_indices, hyperplane, offset = angular_random_projection_split( 398 | data, indices, rng_state 399 | ) 400 | 401 | left_node = make_angular_tree(data, left_indices, rng_state, leaf_size) 402 | right_node = make_angular_tree(data, right_indices, rng_state, leaf_size) 403 | 404 | node = RandomProjectionTreeNode( 405 | None, False, hyperplane, offset, left_node, right_node 406 | ) 407 | else: 408 | node = RandomProjectionTreeNode(indices, True, None, None, None, None) 409 | 410 | return node 411 | 412 | 413 | def make_sparse_euclidean_tree(inds, indptr, data, indices, rng_state, leaf_size=30): 414 | if indices.shape[0] > leaf_size: 415 | left_indices, right_indices, hyperplane, offset = sparse_euclidean_random_projection_split( 416 | inds, indptr, data, indices, rng_state 417 | ) 418 | 419 | left_node = make_sparse_euclidean_tree( 420 | inds, indptr, data, left_indices, rng_state, leaf_size 421 | ) 422 | right_node = make_sparse_euclidean_tree( 423 | inds, indptr, data, right_indices, rng_state, leaf_size 424 | ) 425 | 426 | node = RandomProjectionTreeNode( 427 | None, False, hyperplane, offset, left_node, right_node 428 | ) 429 | else: 430 | node = RandomProjectionTreeNode(indices, True, None, None, None, None) 431 | 432 | return node 433 | 434 | 435 | def make_sparse_angular_tree(inds, indptr, data, indices, rng_state, leaf_size=30): 436 | if indices.shape[0] > leaf_size: 437 | left_indices, right_indices, hyperplane, offset = sparse_angular_random_projection_split( 438 | inds, indptr, data, indices, rng_state 439 | ) 440 | 441 | left_node = make_sparse_angular_tree( 442 | inds, indptr, data, left_indices, rng_state, leaf_size 443 | ) 444 | right_node = make_sparse_angular_tree( 445 | inds, indptr, data, right_indices, rng_state, leaf_size 446 | ) 447 | 448 | node = RandomProjectionTreeNode( 449 | None, False, hyperplane, offset, left_node, right_node 450 | ) 451 | else: 452 | node = RandomProjectionTreeNode(indices, True, None, None, None, None) 453 | 454 | return node 455 | 456 | 457 | def make_tree(data, rng_state, leaf_size=30, angular=False): 458 | 459 | is_sparse = scipy.sparse.isspmatrix_csr(data) 460 | indices = np.arange(data.shape[0]) 461 | 462 | 463 | if is_sparse: 464 | inds = data.indices 465 | indptr = data.indptr 466 | spdata = data.data 467 | 468 | if angular: 469 | return make_sparse_angular_tree( 470 | inds, indptr, spdata, indices, rng_state, leaf_size 471 | ) 472 | else: 473 | return make_sparse_euclidean_tree( 474 | inds, indptr, spdata, indices, rng_state, leaf_size 475 | ) 476 | else: 477 | if angular: 478 | return make_angular_tree(data, indices, rng_state, leaf_size) 479 | else: 480 | return make_euclidean_tree(data, indices, rng_state, leaf_size) 481 | 482 | 483 | def num_nodes(tree): 484 | if tree.is_leaf: 485 | return 1 486 | else: 
487 | return 1 + num_nodes(tree.left_child) + num_nodes(tree.right_child) 488 | 489 | 490 | def num_leaves(tree): 491 | if tree.is_leaf: 492 | return 1 493 | else: 494 | return num_leaves(tree.left_child) + num_leaves(tree.right_child) 495 | 496 | 497 | def max_sparse_hyperplane_size(tree): 498 | if tree.is_leaf: 499 | return 0 500 | else: 501 | return max( 502 | tree.hyperplane.shape[1], 503 | max_sparse_hyperplane_size(tree.left_child), 504 | max_sparse_hyperplane_size(tree.right_child), 505 | ) 506 | 507 | 508 | def recursive_flatten( 509 | tree, hyperplanes, offsets, children, indices, node_num, leaf_num 510 | ): 511 | if tree.is_leaf: 512 | children[node_num, 0] = -leaf_num 513 | indices[leaf_num, : tree.indices.shape[0]] = tree.indices 514 | leaf_num += 1 515 | return node_num, leaf_num 516 | else: 517 | if len(tree.hyperplane.shape) > 1: 518 | 519 | hyperplanes[node_num][:, : tree.hyperplane.shape[1]] = tree.hyperplane 520 | else: 521 | hyperplanes[node_num] = tree.hyperplane 522 | offsets[node_num] = tree.offset 523 | children[node_num, 0] = node_num + 1 524 | old_node_num = node_num 525 | node_num, leaf_num = recursive_flatten( 526 | tree.left_child, 527 | hyperplanes, 528 | offsets, 529 | children, 530 | indices, 531 | node_num + 1, 532 | leaf_num, 533 | ) 534 | children[old_node_num, 1] = node_num + 1 535 | node_num, leaf_num = recursive_flatten( 536 | tree.right_child, 537 | hyperplanes, 538 | offsets, 539 | children, 540 | indices, 541 | node_num + 1, 542 | leaf_num, 543 | ) 544 | return node_num, leaf_num 545 | 546 | 547 | def flatten_tree(tree, leaf_size): 548 | n_nodes = num_nodes(tree) 549 | n_leaves = num_leaves(tree) 550 | 551 | if len(tree.hyperplane.shape) > 1: 552 | 553 | max_hyperplane_nnz = max_sparse_hyperplane_size(tree) 554 | hyperplanes = np.zeros( 555 | (n_nodes, tree.hyperplane.shape[0], max_hyperplane_nnz), dtype=np.float32 556 | ) 557 | else: 558 | hyperplanes = np.zeros((n_nodes, tree.hyperplane.shape[0]), dtype=np.float32) 559 | 560 | offsets = np.zeros(n_nodes, dtype=np.float32) 561 | children = -1 * np.ones((n_nodes, 2), dtype=np.int64) 562 | indices = -1 * np.ones((n_leaves, leaf_size), dtype=np.int64) 563 | recursive_flatten(tree, hyperplanes, offsets, children, indices, 0, 0) 564 | return FlatTree(hyperplanes, offsets, children, indices) 565 | 566 | 567 | @numba.njit() 568 | def select_side(hyperplane, offset, point, rng_state): 569 | margin = offset 570 | for d in range(point.shape[0]): 571 | margin += hyperplane[d] * point[d] 572 | 573 | if abs(margin) < EPS: 574 | side = tau_rand_int(rng_state) % 2 575 | if side == 0: 576 | return 0 577 | else: 578 | return 1 579 | elif margin > 0: 580 | return 0 581 | else: 582 | return 1 583 | 584 | 585 | @numba.njit() 586 | def search_flat_tree(point, hyperplanes, offsets, children, indices, rng_state): 587 | node = 0 588 | while children[node, 0] > 0: 589 | side = select_side(hyperplanes[node], offsets[node], point, rng_state) 590 | if side == 0: 591 | node = children[node, 0] 592 | else: 593 | node = children[node, 1] 594 | 595 | return indices[-children[node, 0]] 596 | 597 | 598 | def make_forest(data, n_neighbors, n_trees, rng_state, angular=False): 599 | 600 | result = [] 601 | leaf_size = max(10, n_neighbors) 602 | try: 603 | result = [ 604 | flatten_tree(make_tree(data, rng_state, leaf_size, angular), leaf_size) 605 | for i in range(n_trees) 606 | ] 607 | except (RuntimeError, RecursionError, SystemError): 608 | warn( 609 | "Random Projection forest initialisation failed due to recursion" 610 | "limit 
being reached. Something is a little strange with your " 611 | "data, and this may take longer than normal to compute." 612 | ) 613 | 614 | return result 615 | 616 | 617 | def rptree_leaf_array(rp_forest): 618 | 619 | if len(rp_forest) > 0: 620 | leaf_array = np.vstack([tree.indices for tree in rp_forest]) 621 | else: 622 | leaf_array = np.array([[-1]]) 623 | 624 | return leaf_array 625 | 626 | 627 | 628 | 629 | 630 | 631 | 632 | 633 | 634 | 635 | 636 | 637 | import numpy as np 638 | import numba 639 | 640 | _mock_identity = np.eye(2, dtype=np.float64) 641 | _mock_ones = np.ones(2, dtype=np.float64) 642 | 643 | 644 | @numba.njit(fastmath=True) 645 | def euclidean(x, y): 646 | 647 | result = 0.0 648 | for i in range(x.shape[0]): 649 | result += (x[i] - y[i]) ** 2 650 | return np.sqrt(result) 651 | 652 | 653 | @numba.njit() 654 | def standardised_euclidean(x, y, sigma=_mock_ones): 655 | 656 | result = 0.0 657 | for i in range(x.shape[0]): 658 | result += ((x[i] - y[i]) ** 2) / sigma[i] 659 | 660 | return np.sqrt(result) 661 | 662 | 663 | @numba.njit() 664 | def manhattan(x, y): 665 | 666 | result = 0.0 667 | for i in range(x.shape[0]): 668 | result += np.abs(x[i] - y[i]) 669 | 670 | return result 671 | 672 | 673 | @numba.njit() 674 | def chebyshev(x, y): 675 | 676 | result = 0.0 677 | for i in range(x.shape[0]): 678 | result = max(result, np.abs(x[i] - y[i])) 679 | 680 | return result 681 | 682 | 683 | @numba.njit() 684 | def minkowski(x, y, p=2): 685 | 686 | result = 0.0 687 | for i in range(x.shape[0]): 688 | result += (np.abs(x[i] - y[i])) ** p 689 | 690 | return result ** (1.0 / p) 691 | 692 | 693 | @numba.njit() 694 | def weighted_minkowski(x, y, w=_mock_ones, p=2): 695 | 696 | result = 0.0 697 | for i in range(x.shape[0]): 698 | result += (w[i] * np.abs(x[i] - y[i])) ** p 699 | 700 | return result ** (1.0 / p) 701 | 702 | 703 | @numba.njit() 704 | def mahalanobis(x, y, vinv=_mock_identity): 705 | result = 0.0 706 | 707 | diff = np.empty(x.shape[0], dtype=np.float64) 708 | 709 | for i in range(x.shape[0]): 710 | diff[i] = x[i] - y[i] 711 | 712 | for i in range(x.shape[0]): 713 | tmp = 0.0 714 | for j in range(x.shape[0]): 715 | tmp += vinv[i, j] * diff[j] 716 | result += tmp * diff[i] 717 | 718 | return np.sqrt(result) 719 | 720 | 721 | @numba.njit() 722 | def hamming(x, y): 723 | result = 0.0 724 | for i in range(x.shape[0]): 725 | if x[i] != y[i]: 726 | result += 1.0 727 | 728 | return float(result) / x.shape[0] 729 | 730 | 731 | @numba.njit() 732 | def canberra(x, y): 733 | result = 0.0 734 | for i in range(x.shape[0]): 735 | denominator = np.abs(x[i]) + np.abs(y[i]) 736 | if denominator > 0: 737 | result += np.abs(x[i] - y[i]) / denominator 738 | 739 | return result 740 | 741 | 742 | @numba.njit() 743 | def bray_curtis(x, y): 744 | numerator = 0.0 745 | denominator = 0.0 746 | for i in range(x.shape[0]): 747 | numerator += np.abs(x[i] - y[i]) 748 | denominator += np.abs(x[i] + y[i]) 749 | 750 | if denominator > 0.0: 751 | return float(numerator) / denominator 752 | else: 753 | return 0.0 754 | 755 | 756 | @numba.njit() 757 | def jaccard(x, y): 758 | num_non_zero = 0.0 759 | num_equal = 0.0 760 | for i in range(x.shape[0]): 761 | x_true = x[i] != 0 762 | y_true = y[i] != 0 763 | num_non_zero += x_true or y_true 764 | num_equal += x_true and y_true 765 | 766 | if num_non_zero == 0.0: 767 | return 0.0 768 | else: 769 | return float(num_non_zero - num_equal) / num_non_zero 770 | 771 | 772 | @numba.njit() 773 | def matching(x, y): 774 | num_not_equal = 0.0 775 | for i in 
range(x.shape[0]): 776 | x_true = x[i] != 0 777 | y_true = y[i] != 0 778 | num_not_equal += x_true != y_true 779 | 780 | return float(num_not_equal) / x.shape[0] 781 | 782 | 783 | @numba.njit() 784 | def dice(x, y): 785 | num_true_true = 0.0 786 | num_not_equal = 0.0 787 | for i in range(x.shape[0]): 788 | x_true = x[i] != 0 789 | y_true = y[i] != 0 790 | num_true_true += x_true and y_true 791 | num_not_equal += x_true != y_true 792 | 793 | if num_not_equal == 0.0: 794 | return 0.0 795 | else: 796 | return num_not_equal / (2.0 * num_true_true + num_not_equal) 797 | 798 | 799 | @numba.njit() 800 | def kulsinski(x, y): 801 | num_true_true = 0.0 802 | num_not_equal = 0.0 803 | for i in range(x.shape[0]): 804 | x_true = x[i] != 0 805 | y_true = y[i] != 0 806 | num_true_true += x_true and y_true 807 | num_not_equal += x_true != y_true 808 | 809 | if num_not_equal == 0: 810 | return 0.0 811 | else: 812 | return float(num_not_equal - num_true_true + x.shape[0]) / ( 813 | num_not_equal + x.shape[0] 814 | ) 815 | 816 | 817 | @numba.njit() 818 | def rogers_tanimoto(x, y): 819 | num_not_equal = 0.0 820 | for i in range(x.shape[0]): 821 | x_true = x[i] != 0 822 | y_true = y[i] != 0 823 | num_not_equal += x_true != y_true 824 | 825 | return (2.0 * num_not_equal) / (x.shape[0] + num_not_equal) 826 | 827 | 828 | @numba.njit() 829 | def russellrao(x, y): 830 | num_true_true = 0.0 831 | for i in range(x.shape[0]): 832 | x_true = x[i] != 0 833 | y_true = y[i] != 0 834 | num_true_true += x_true and y_true 835 | 836 | if num_true_true == np.sum(x != 0) and num_true_true == np.sum(y != 0): 837 | return 0.0 838 | else: 839 | return float(x.shape[0] - num_true_true) / (x.shape[0]) 840 | 841 | 842 | @numba.njit() 843 | def sokal_michener(x, y): 844 | num_not_equal = 0.0 845 | for i in range(x.shape[0]): 846 | x_true = x[i] != 0 847 | y_true = y[i] != 0 848 | num_not_equal += x_true != y_true 849 | 850 | return (2.0 * num_not_equal) / (x.shape[0] + num_not_equal) 851 | 852 | 853 | @numba.njit() 854 | def sokal_sneath(x, y): 855 | num_true_true = 0.0 856 | num_not_equal = 0.0 857 | for i in range(x.shape[0]): 858 | x_true = x[i] != 0 859 | y_true = y[i] != 0 860 | num_true_true += x_true and y_true 861 | num_not_equal += x_true != y_true 862 | 863 | if num_not_equal == 0.0: 864 | return 0.0 865 | else: 866 | return num_not_equal / (0.5 * num_true_true + num_not_equal) 867 | 868 | 869 | @numba.njit() 870 | def haversine(x, y): 871 | if x.shape[0] != 2: 872 | raise ValueError("haversine is only defined for 2 dimensional data") 873 | sin_lat = np.sin(0.5 * (x[0] - y[0])) 874 | sin_long = np.sin(0.5 * (x[1] - y[1])) 875 | result = np.sqrt(sin_lat ** 2 + np.cos(x[0]) * np.cos(y[0]) * sin_long ** 2) 876 | return 2.0 * np.arcsin(result) 877 | 878 | 879 | @numba.njit() 880 | def yule(x, y): 881 | num_true_true = 0.0 882 | num_true_false = 0.0 883 | num_false_true = 0.0 884 | for i in range(x.shape[0]): 885 | x_true = x[i] != 0 886 | y_true = y[i] != 0 887 | num_true_true += x_true and y_true 888 | num_true_false += x_true and (not y_true) 889 | num_false_true += (not x_true) and y_true 890 | 891 | num_false_false = x.shape[0] - num_true_true - num_true_false - num_false_true 892 | 893 | if num_true_false == 0.0 or num_false_true == 0.0: 894 | return 0.0 895 | else: 896 | return (2.0 * num_true_false * num_false_true) / ( 897 | num_true_true * num_false_false + num_true_false * num_false_true 898 | ) 899 | 900 | 901 | @numba.njit() 902 | def cosine(x, y): 903 | result = 0.0 904 | norm_x = 0.0 905 | norm_y = 0.0 906 | for i 
in range(x.shape[0]): 907 | result += x[i] * y[i] 908 | norm_x += x[i] ** 2 909 | norm_y += y[i] ** 2 910 | 911 | if norm_x == 0.0 and norm_y == 0.0: 912 | return 0.0 913 | elif norm_x == 0.0 or norm_y == 0.0: 914 | return 1.0 915 | else: 916 | return 1.0 - (result / np.sqrt(norm_x * norm_y)) 917 | 918 | 919 | @numba.njit() 920 | def correlation(x, y): 921 | mu_x = 0.0 922 | mu_y = 0.0 923 | norm_x = 0.0 924 | norm_y = 0.0 925 | dot_product = 0.0 926 | 927 | for i in range(x.shape[0]): 928 | mu_x += x[i] 929 | mu_y += y[i] 930 | 931 | mu_x /= x.shape[0] 932 | mu_y /= x.shape[0] 933 | 934 | for i in range(x.shape[0]): 935 | shifted_x = x[i] - mu_x 936 | shifted_y = y[i] - mu_y 937 | norm_x += shifted_x ** 2 938 | norm_y += shifted_y ** 2 939 | dot_product += shifted_x * shifted_y 940 | 941 | if norm_x == 0.0 and norm_y == 0.0: 942 | return 0.0 943 | elif dot_product == 0.0: 944 | return 1.0 945 | else: 946 | return 1.0 - (dot_product / np.sqrt(norm_x * norm_y)) 947 | 948 | 949 | named_distances = { 950 | 951 | "euclidean": euclidean, 952 | "l2": euclidean, 953 | "manhattan": manhattan, 954 | "taxicab": manhattan, 955 | "l1": manhattan, 956 | "chebyshev": chebyshev, 957 | "linfinity": chebyshev, 958 | "linfty": chebyshev, 959 | "linf": chebyshev, 960 | "minkowski": minkowski, 961 | 962 | "seuclidean": standardised_euclidean, 963 | "standardised_euclidean": standardised_euclidean, 964 | "wminkowski": weighted_minkowski, 965 | "weighted_minkowski": weighted_minkowski, 966 | "mahalanobis": mahalanobis, 967 | 968 | "canberra": canberra, 969 | "cosine": cosine, 970 | "correlation": correlation, 971 | "haversine": haversine, 972 | "braycurtis": bray_curtis, 973 | 974 | "hamming": hamming, 975 | "jaccard": jaccard, 976 | "dice": dice, 977 | "matching": matching, 978 | "kulsinski": kulsinski, 979 | "rogerstanimoto": rogers_tanimoto, 980 | "russellrao": russellrao, 981 | "sokalsneath": sokal_sneath, 982 | "sokalmichener": sokal_michener, 983 | "yule": yule, 984 | } 985 | 986 | 987 | 988 | 989 | 990 | 991 | 992 | 993 | 994 | 995 | 996 | 997 | 998 | 999 | import time 1000 | 1001 | import numpy as np 1002 | import numba 1003 | 1004 | 1005 | @numba.njit(parallel=True) 1006 | def fast_knn_indices(X, n_neighbors): 1007 | 1008 | knn_indices = np.empty( 1009 | (X.shape[0], n_neighbors), dtype=np.int32 1010 | ) 1011 | for row in numba.prange(X.shape[0]): 1012 | v = X[row].argsort(kind="quicksort") 1013 | v = v[:n_neighbors] 1014 | knn_indices[row] = v 1015 | return knn_indices 1016 | 1017 | 1018 | @numba.njit("i4(i8[:])") 1019 | def tau_rand_int(state): 1020 | 1021 | state[0] = ( 1022 | ((state[0] & 4294967294) << 12) & 0xFFFFFFFF 1023 | ) ^ ((((state[0] << 13) & 0xFFFFFFFF) ^ state[0]) >> 19) 1024 | state[1] = ( 1025 | ((state[1] & 4294967288) << 4) & 0xFFFFFFFF 1026 | ) ^ ((((state[1] << 2) & 0xFFFFFFFF) ^ state[1]) >> 25) 1027 | state[2] = ( 1028 | ((state[2] & 4294967280) << 17) & 0xFFFFFFFF 1029 | ) ^ ((((state[2] << 3) & 0xFFFFFFFF) ^ state[2]) >> 11) 1030 | 1031 | return state[0] ^ state[1] ^ state[2] 1032 | 1033 | 1034 | @numba.njit("f4(i8[:])") 1035 | def tau_rand(state): 1036 | 1037 | integer = tau_rand_int(state) 1038 | return abs(float(integer) / 0x7FFFFFFF) 1039 | 1040 | 1041 | @numba.njit() 1042 | def norm(vec): 1043 | 1044 | result = 0.0 1045 | for i in range(vec.shape[0]): 1046 | result += vec[i] ** 2 1047 | return np.sqrt(result) 1048 | 1049 | 1050 | @numba.njit() 1051 | def rejection_sample(n_samples, pool_size, rng_state): 1052 | 1053 | result = np.empty(n_samples, dtype=np.int64) 1054 | 
for i in range(n_samples): 1055 | reject_sample = True 1056 | while reject_sample: 1057 | j = tau_rand_int(rng_state) % pool_size 1058 | for k in range(i): 1059 | if j == result[k]: 1060 | break 1061 | else: 1062 | reject_sample = False 1063 | result[i] = j 1064 | return result 1065 | 1066 | 1067 | @numba.njit("f8[:, :, :](i8,i8)") 1068 | def make_heap(n_points, size): 1069 | 1070 | result = np.zeros( 1071 | (3, int(n_points), int(size)), dtype=np.float64 1072 | ) 1073 | result[0] = -1 1074 | result[1] = np.infty 1075 | result[2] = 0 1076 | 1077 | return result 1078 | 1079 | 1080 | @numba.njit("i8(f8[:,:,:],i8,f8,i8,i8)") 1081 | def heap_push(heap, row, weight, index, flag): 1082 | 1083 | row = int(row) 1084 | indices = heap[0, row] 1085 | weights = heap[1, row] 1086 | is_new = heap[2, row] 1087 | 1088 | if weight >= weights[0]: 1089 | return 0 1090 | 1091 | 1092 | for i in range(indices.shape[0]): 1093 | if index == indices[i]: 1094 | return 0 1095 | 1096 | 1097 | weights[0] = weight 1098 | indices[0] = index 1099 | is_new[0] = flag 1100 | 1101 | 1102 | i = 0 1103 | while True: 1104 | ic1 = 2 * i + 1 1105 | ic2 = ic1 + 1 1106 | 1107 | if ic1 >= heap.shape[2]: 1108 | break 1109 | elif ic2 >= heap.shape[2]: 1110 | if weights[ic1] > weight: 1111 | i_swap = ic1 1112 | else: 1113 | break 1114 | elif weights[ic1] >= weights[ic2]: 1115 | if weight < weights[ic1]: 1116 | i_swap = ic1 1117 | else: 1118 | break 1119 | else: 1120 | if weight < weights[ic2]: 1121 | i_swap = ic2 1122 | else: 1123 | break 1124 | 1125 | weights[i] = weights[i_swap] 1126 | indices[i] = indices[i_swap] 1127 | is_new[i] = is_new[i_swap] 1128 | 1129 | i = i_swap 1130 | 1131 | weights[i] = weight 1132 | indices[i] = index 1133 | is_new[i] = flag 1134 | 1135 | return 1 1136 | 1137 | 1138 | @numba.njit("i8(f8[:,:,:],i8,f8,i8,i8)") 1139 | def unchecked_heap_push(heap, row, weight, index, flag): 1140 | 1141 | indices = heap[0, row] 1142 | weights = heap[1, row] 1143 | is_new = heap[2, row] 1144 | 1145 | if weight >= weights[0]: 1146 | return 0 1147 | 1148 | 1149 | weights[0] = weight 1150 | indices[0] = index 1151 | is_new[0] = flag 1152 | 1153 | 1154 | i = 0 1155 | while True: 1156 | ic1 = 2 * i + 1 1157 | ic2 = ic1 + 1 1158 | 1159 | if ic1 >= heap.shape[2]: 1160 | break 1161 | elif ic2 >= heap.shape[2]: 1162 | if weights[ic1] > weight: 1163 | i_swap = ic1 1164 | else: 1165 | break 1166 | elif weights[ic1] >= weights[ic2]: 1167 | if weight < weights[ic1]: 1168 | i_swap = ic1 1169 | else: 1170 | break 1171 | else: 1172 | if weight < weights[ic2]: 1173 | i_swap = ic2 1174 | else: 1175 | break 1176 | 1177 | weights[i] = weights[i_swap] 1178 | indices[i] = indices[i_swap] 1179 | is_new[i] = is_new[i_swap] 1180 | 1181 | i = i_swap 1182 | 1183 | weights[i] = weight 1184 | indices[i] = index 1185 | is_new[i] = flag 1186 | 1187 | return 1 1188 | 1189 | 1190 | @numba.njit() 1191 | def siftdown(heap1, heap2, elt): 1192 | 1193 | while elt * 2 + 1 < heap1.shape[0]: 1194 | left_child = elt * 2 + 1 1195 | right_child = left_child + 1 1196 | swap = elt 1197 | 1198 | if heap1[swap] < heap1[left_child]: 1199 | swap = left_child 1200 | 1201 | if ( 1202 | right_child < heap1.shape[0] 1203 | and heap1[swap] < heap1[right_child] 1204 | ): 1205 | swap = right_child 1206 | 1207 | if swap == elt: 1208 | break 1209 | else: 1210 | heap1[elt], heap1[swap] = ( 1211 | heap1[swap], 1212 | heap1[elt], 1213 | ) 1214 | heap2[elt], heap2[swap] = ( 1215 | heap2[swap], 1216 | heap2[elt], 1217 | ) 1218 | elt = swap 1219 | 1220 | 1221 | @numba.njit() 1222 | def 
deheap_sort(heap): 1223 | 1224 | indices = heap[0] 1225 | weights = heap[1] 1226 | 1227 | for i in range(indices.shape[0]): 1228 | 1229 | ind_heap = indices[i] 1230 | dist_heap = weights[i] 1231 | 1232 | for j in range(ind_heap.shape[0] - 1): 1233 | ind_heap[0], ind_heap[ 1234 | ind_heap.shape[0] - j - 1 1235 | ] = ( 1236 | ind_heap[ind_heap.shape[0] - j - 1], 1237 | ind_heap[0], 1238 | ) 1239 | dist_heap[0], dist_heap[ 1240 | dist_heap.shape[0] - j - 1 1241 | ] = ( 1242 | dist_heap[dist_heap.shape[0] - j - 1], 1243 | dist_heap[0], 1244 | ) 1245 | 1246 | siftdown( 1247 | dist_heap[: dist_heap.shape[0] - j - 1], 1248 | ind_heap[: ind_heap.shape[0] - j - 1], 1249 | 0, 1250 | ) 1251 | 1252 | return indices.astype(np.int64), weights 1253 | 1254 | 1255 | @numba.njit("i8(f8[:, :, :],i8)") 1256 | def smallest_flagged(heap, row): 1257 | 1258 | ind = heap[0, row] 1259 | dist = heap[1, row] 1260 | flag = heap[2, row] 1261 | 1262 | min_dist = np.inf 1263 | result_index = -1 1264 | 1265 | for i in range(ind.shape[0]): 1266 | if flag[i] == 1 and dist[i] < min_dist: 1267 | min_dist = dist[i] 1268 | result_index = i 1269 | 1270 | if result_index >= 0: 1271 | flag[result_index] = 0.0 1272 | return int(ind[result_index]) 1273 | else: 1274 | return -1 1275 | 1276 | 1277 | @numba.njit(parallel=True) 1278 | def build_candidates( 1279 | current_graph, 1280 | n_vertices, 1281 | n_neighbors, 1282 | max_candidates, 1283 | rng_state, 1284 | ): 1285 | 1286 | candidate_neighbors = make_heap( 1287 | n_vertices, max_candidates 1288 | ) 1289 | for i in range(n_vertices): 1290 | for j in range(n_neighbors): 1291 | if current_graph[0, i, j] < 0: 1292 | continue 1293 | idx = current_graph[0, i, j] 1294 | isn = current_graph[2, i, j] 1295 | d = tau_rand(rng_state) 1296 | heap_push(candidate_neighbors, i, d, idx, isn) 1297 | heap_push(candidate_neighbors, idx, d, i, isn) 1298 | current_graph[2, i, j] = 0 1299 | 1300 | return candidate_neighbors 1301 | 1302 | 1303 | @numba.njit(parallel=True) 1304 | def new_build_candidates( 1305 | current_graph, 1306 | n_vertices, 1307 | n_neighbors, 1308 | max_candidates, 1309 | rng_state, 1310 | rho=0.5, 1311 | ): 1312 | 1313 | new_candidate_neighbors = make_heap( 1314 | n_vertices, max_candidates 1315 | ) 1316 | old_candidate_neighbors = make_heap( 1317 | n_vertices, max_candidates 1318 | ) 1319 | 1320 | for i in numba.prange(n_vertices): 1321 | for j in range(n_neighbors): 1322 | if current_graph[0, i, j] < 0: 1323 | continue 1324 | idx = current_graph[0, i, j] 1325 | isn = current_graph[2, i, j] 1326 | d = tau_rand(rng_state) 1327 | if tau_rand(rng_state) < rho: 1328 | c = 0 1329 | if isn: 1330 | c += heap_push( 1331 | new_candidate_neighbors, 1332 | i, 1333 | d, 1334 | idx, 1335 | isn, 1336 | ) 1337 | c += heap_push( 1338 | new_candidate_neighbors, 1339 | idx, 1340 | d, 1341 | i, 1342 | isn, 1343 | ) 1344 | else: 1345 | heap_push( 1346 | old_candidate_neighbors, 1347 | i, 1348 | d, 1349 | idx, 1350 | isn, 1351 | ) 1352 | heap_push( 1353 | old_candidate_neighbors, 1354 | idx, 1355 | d, 1356 | i, 1357 | isn, 1358 | ) 1359 | 1360 | if c > 0: 1361 | current_graph[2, i, j] = 0 1362 | 1363 | return new_candidate_neighbors, old_candidate_neighbors 1364 | 1365 | 1366 | @numba.njit(parallel=True) 1367 | def submatrix(dmat, indices_col, n_neighbors): 1368 | 1369 | n_samples_transform, n_samples_fit = dmat.shape 1370 | submat = np.zeros( 1371 | (n_samples_transform, n_neighbors), dtype=dmat.dtype 1372 | ) 1373 | for i in numba.prange(n_samples_transform): 1374 | for j in 
numba.prange(n_neighbors): 1375 | submat[i, j] = dmat[i, indices_col[i, j]] 1376 | return submat 1377 | 1378 | 1379 | 1380 | def ts(): 1381 | return time.ctime(time.time()) 1382 | 1383 | 1384 | 1385 | 1386 | 1387 | 1388 | 1389 | 1390 | 1391 | 1392 | 1393 | 1394 | 1395 | 1396 | 1397 | 1398 | import numpy as np 1399 | import numba 1400 | 1401 | 1402 | 1403 | 1404 | 1405 | 1406 | 1407 | 1408 | 1409 | 1410 | 1411 | 1412 | 1413 | #from umap.rp_tree import search_flat_tree 1414 | 1415 | 1416 | def make_nn_descent(dist, dist_args): 1417 | 1418 | 1419 | @numba.njit() 1420 | def nn_descent( 1421 | data, 1422 | n_neighbors, 1423 | rng_state, 1424 | max_candidates=50, 1425 | n_iters=10, 1426 | delta=0.001, 1427 | rho=0.5, 1428 | rp_tree_init=True, 1429 | leaf_array=None, 1430 | verbose=False, 1431 | ): 1432 | n_vertices = data.shape[0] 1433 | 1434 | current_graph = make_heap(data.shape[0], n_neighbors) 1435 | for i in range(data.shape[0]): 1436 | indices = rejection_sample(n_neighbors, data.shape[0], rng_state) 1437 | for j in range(indices.shape[0]): 1438 | d = dist(data[i], data[indices[j]], *dist_args) 1439 | heap_push(current_graph, i, d, indices[j], 1) 1440 | heap_push(current_graph, indices[j], d, i, 1) 1441 | 1442 | if rp_tree_init: 1443 | for n in range(leaf_array.shape[0]): 1444 | for i in range(leaf_array.shape[1]): 1445 | if leaf_array[n, i] < 0: 1446 | break 1447 | for j in range(i + 1, leaf_array.shape[1]): 1448 | if leaf_array[n, j] < 0: 1449 | break 1450 | d = dist( 1451 | data[leaf_array[n, i]], data[leaf_array[n, j]], *dist_args 1452 | ) 1453 | heap_push( 1454 | current_graph, leaf_array[n, i], d, leaf_array[n, j], 1 1455 | ) 1456 | heap_push( 1457 | current_graph, leaf_array[n, j], d, leaf_array[n, i], 1 1458 | ) 1459 | 1460 | for n in range(n_iters): 1461 | if verbose: 1462 | print("\t", n, " / ", n_iters) 1463 | 1464 | candidate_neighbors = build_candidates( 1465 | current_graph, n_vertices, n_neighbors, max_candidates, rng_state 1466 | ) 1467 | 1468 | c = 0 1469 | for i in range(n_vertices): 1470 | for j in range(max_candidates): 1471 | p = int(candidate_neighbors[0, i, j]) 1472 | if p < 0 or tau_rand(rng_state) < rho: 1473 | continue 1474 | for k in range(max_candidates): 1475 | q = int(candidate_neighbors[0, i, k]) 1476 | if ( 1477 | q < 0 1478 | or not candidate_neighbors[2, i, j] 1479 | and not candidate_neighbors[2, i, k] 1480 | ): 1481 | continue 1482 | 1483 | d = dist(data[p], data[q], *dist_args) 1484 | c += heap_push(current_graph, p, d, q, 1) 1485 | c += heap_push(current_graph, q, d, p, 1) 1486 | 1487 | if c <= delta * n_neighbors * data.shape[0]: 1488 | break 1489 | 1490 | return deheap_sort(current_graph) 1491 | 1492 | return nn_descent 1493 | 1494 | 1495 | def make_initialisations(dist, dist_args): 1496 | @numba.njit(parallel=True) 1497 | def init_from_random(n_neighbors, data, query_points, heap, rng_state): 1498 | for i in range(query_points.shape[0]): 1499 | indices = rejection_sample(n_neighbors, data.shape[0], rng_state) 1500 | for j in range(indices.shape[0]): 1501 | if indices[j] < 0: 1502 | continue 1503 | d = dist(data[indices[j]], query_points[i], *dist_args) 1504 | heap_push(heap, i, d, indices[j], 1) 1505 | return 1506 | 1507 | @numba.njit(parallel=True) 1508 | def init_from_tree(tree, data, query_points, heap, rng_state): 1509 | for i in range(query_points.shape[0]): 1510 | indices = search_flat_tree( 1511 | query_points[i], 1512 | tree.hyperplanes, 1513 | tree.offsets, 1514 | tree.children, 1515 | tree.indices, 1516 | rng_state, 1517 | ) 1518 | 1519 | 
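            #(comment added for clarity, not in the original source) the loop below pushes every
            #leaf index returned by the flat-tree search onto the query point's heap as an
            #initial candidate neighbour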
for j in range(indices.shape[0]): 1520 | if indices[j] < 0: 1521 | continue 1522 | d = dist(data[indices[j]], query_points[i], *dist_args) 1523 | heap_push(heap, i, d, indices[j], 1) 1524 | 1525 | return 1526 | 1527 | return init_from_random, init_from_tree 1528 | 1529 | 1530 | def initialise_search( 1531 | forest, data, query_points, n_neighbors, init_from_random, init_from_tree, rng_state 1532 | ): 1533 | results = make_heap(query_points.shape[0], n_neighbors) 1534 | init_from_random(n_neighbors, data, query_points, results, rng_state) 1535 | if forest is not None: 1536 | for tree in forest: 1537 | init_from_tree(tree, data, query_points, results, rng_state) 1538 | 1539 | return results 1540 | 1541 | 1542 | def make_initialized_nnd_search(dist, dist_args): 1543 | @numba.njit(parallel=True) 1544 | def initialized_nnd_search(data, indptr, indices, initialization, query_points): 1545 | 1546 | for i in numba.prange(query_points.shape[0]): 1547 | 1548 | tried = set(initialization[0, i]) 1549 | 1550 | while True: 1551 | 1552 | 1553 | vertex = smallest_flagged(initialization, i) 1554 | 1555 | if vertex == -1: 1556 | break 1557 | candidates = indices[indptr[vertex] : indptr[vertex + 1]] 1558 | for j in range(candidates.shape[0]): 1559 | if ( 1560 | candidates[j] == vertex 1561 | or candidates[j] == -1 1562 | or candidates[j] in tried 1563 | ): 1564 | continue 1565 | d = dist(data[candidates[j]], query_points[i], *dist_args) 1566 | unchecked_heap_push(initialization, i, d, candidates[j], 1) 1567 | tried.add(candidates[j]) 1568 | 1569 | return initialization 1570 | 1571 | return initialized_nnd_search 1572 | 1573 | 1574 | 1575 | 1576 | 1577 | 1578 | 1579 | 1580 | 1581 | 1582 | 1583 | 1584 | 1585 | 1586 | 1587 | 1588 | 1589 | import numpy as np 1590 | import numba 1591 | 1592 | 1593 | 1594 | 1595 | 1596 | 1597 | 1598 | 1599 | 1600 | 1601 | 1602 | 1603 | import locale 1604 | 1605 | locale.setlocale(locale.LC_NUMERIC, "C") 1606 | 1607 | 1608 | @numba.njit() 1609 | def arr_unique(arr): 1610 | aux = np.sort(arr) 1611 | flag = np.concatenate((np.ones(1, dtype=np.bool_), aux[1:] != aux[:-1])) 1612 | return aux[flag] 1613 | 1614 | 1615 | 1616 | @numba.njit() 1617 | def arr_union(ar1, ar2): 1618 | if ar1.shape[0] == 0: 1619 | return ar2 1620 | elif ar2.shape[0] == 0: 1621 | return ar1 1622 | else: 1623 | return arr_unique(np.concatenate((ar1, ar2))) 1624 | 1625 | 1626 | 1627 | 1628 | @numba.njit() 1629 | def arr_intersect(ar1, ar2): 1630 | aux = np.concatenate((ar1, ar2)) 1631 | aux.sort() 1632 | return aux[:-1][aux[1:] == aux[:-1]] 1633 | 1634 | 1635 | @numba.njit() 1636 | def sparse_sum(ind1, data1, ind2, data2): 1637 | result_ind = arr_union(ind1, ind2) 1638 | result_data = np.zeros(result_ind.shape[0], dtype=np.float32) 1639 | 1640 | i1 = 0 1641 | i2 = 0 1642 | nnz = 0 1643 | 1644 | 1645 | while i1 < ind1.shape[0] and i2 < ind2.shape[0]: 1646 | j1 = ind1[i1] 1647 | j2 = ind2[i2] 1648 | 1649 | if j1 == j2: 1650 | val = data1[i1] + data2[i2] 1651 | if val != 0: 1652 | result_ind[nnz] = j1 1653 | result_data[nnz] = val 1654 | nnz += 1 1655 | i1 += 1 1656 | i2 += 1 1657 | elif j1 < j2: 1658 | val = data1[i1] 1659 | if val != 0: 1660 | result_ind[nnz] = j1 1661 | result_data[nnz] = val 1662 | nnz += 1 1663 | i1 += 1 1664 | else: 1665 | val = data2[i2] 1666 | if val != 0: 1667 | result_ind[nnz] = j2 1668 | result_data[nnz] = val 1669 | nnz += 1 1670 | i2 += 1 1671 | 1672 | 1673 | while i1 < ind1.shape[0]: 1674 | val = data1[i1] 1675 | if val != 0: 1676 | result_ind[nnz] = i1 1677 | result_data[nnz] = val 
1678 | nnz += 1 1679 | i1 += 1 1680 | 1681 | while i2 < ind2.shape[0]: 1682 | val = data2[i2] 1683 | if val != 0: 1684 | result_ind[nnz] = i2 1685 | result_data[nnz] = val 1686 | nnz += 1 1687 | i2 += 1 1688 | 1689 | 1690 | result_ind = result_ind[:nnz] 1691 | result_data = result_data[:nnz] 1692 | 1693 | return result_ind, result_data 1694 | 1695 | 1696 | @numba.njit() 1697 | def sparse_diff(ind1, data1, ind2, data2): 1698 | return sparse_sum(ind1, data1, ind2, -data2) 1699 | 1700 | 1701 | @numba.njit() 1702 | def sparse_mul(ind1, data1, ind2, data2): 1703 | result_ind = arr_intersect(ind1, ind2) 1704 | result_data = np.zeros(result_ind.shape[0], dtype=np.float32) 1705 | 1706 | i1 = 0 1707 | i2 = 0 1708 | nnz = 0 1709 | 1710 | 1711 | while i1 < ind1.shape[0] and i2 < ind2.shape[0]: 1712 | j1 = ind1[i1] 1713 | j2 = ind2[i2] 1714 | 1715 | if j1 == j2: 1716 | val = data1[i1] * data2[i2] 1717 | if val != 0: 1718 | result_ind[nnz] = j1 1719 | result_data[nnz] = val 1720 | nnz += 1 1721 | i1 += 1 1722 | i2 += 1 1723 | elif j1 < j2: 1724 | i1 += 1 1725 | else: 1726 | i2 += 1 1727 | 1728 | 1729 | result_ind = result_ind[:nnz] 1730 | result_data = result_data[:nnz] 1731 | 1732 | return result_ind, result_data 1733 | 1734 | 1735 | def make_sparse_nn_descent(sparse_dist, dist_args): 1736 | 1737 | @numba.njit(parallel=True) 1738 | def nn_descent( 1739 | inds, 1740 | indptr, 1741 | data, 1742 | n_vertices, 1743 | n_neighbors, 1744 | rng_state, 1745 | max_candidates=50, 1746 | n_iters=10, 1747 | delta=0.001, 1748 | rho=0.5, 1749 | rp_tree_init=True, 1750 | leaf_array=None, 1751 | verbose=False, 1752 | ): 1753 | 1754 | current_graph = make_heap(n_vertices, n_neighbors) 1755 | for i in range(n_vertices): 1756 | indices = rejection_sample(n_neighbors, n_vertices, rng_state) 1757 | for j in range(indices.shape[0]): 1758 | 1759 | from_inds = inds[indptr[i] : indptr[i + 1]] 1760 | from_data = data[indptr[i] : indptr[i + 1]] 1761 | 1762 | to_inds = inds[indptr[indices[j]] : indptr[indices[j] + 1]] 1763 | to_data = data[indptr[indices[j]] : indptr[indices[j] + 1]] 1764 | 1765 | d = sparse_dist(from_inds, from_data, to_inds, to_data, *dist_args) 1766 | 1767 | heap_push(current_graph, i, d, indices[j], 1) 1768 | heap_push(current_graph, indices[j], d, i, 1) 1769 | 1770 | if rp_tree_init: 1771 | for n in range(leaf_array.shape[0]): 1772 | for i in range(leaf_array.shape[1]): 1773 | if leaf_array[n, i] < 0: 1774 | break 1775 | for j in range(i + 1, leaf_array.shape[1]): 1776 | if leaf_array[n, j] < 0: 1777 | break 1778 | 1779 | from_inds = inds[ 1780 | indptr[leaf_array[n, i]] : indptr[leaf_array[n, i] + 1] 1781 | ] 1782 | from_data = data[ 1783 | indptr[leaf_array[n, i]] : indptr[leaf_array[n, i] + 1] 1784 | ] 1785 | 1786 | to_inds = inds[ 1787 | indptr[leaf_array[n, j]] : indptr[leaf_array[n, j] + 1] 1788 | ] 1789 | to_data = data[ 1790 | indptr[leaf_array[n, j]] : indptr[leaf_array[n, j] + 1] 1791 | ] 1792 | 1793 | d = sparse_dist( 1794 | from_inds, from_data, to_inds, to_data, *dist_args 1795 | ) 1796 | 1797 | heap_push( 1798 | current_graph, leaf_array[n, i], d, leaf_array[n, j], 1 1799 | ) 1800 | heap_push( 1801 | current_graph, leaf_array[n, j], d, leaf_array[n, i], 1 1802 | ) 1803 | 1804 | for n in range(n_iters): 1805 | if verbose: 1806 | print("\t", n, " / ", n_iters) 1807 | 1808 | candidate_neighbors = build_candidates( 1809 | current_graph, n_vertices, n_neighbors, max_candidates, rng_state 1810 | ) 1811 | 1812 | c = 0 1813 | for i in range(n_vertices): 1814 | for j in range(max_candidates): 1815 | p = 
int(candidate_neighbors[0, i, j]) 1816 | if p < 0 or tau_rand(rng_state) < rho: 1817 | continue 1818 | for k in range(max_candidates): 1819 | q = int(candidate_neighbors[0, i, k]) 1820 | if ( 1821 | q < 0 1822 | or not candidate_neighbors[2, i, j] 1823 | and not candidate_neighbors[2, i, k] 1824 | ): 1825 | continue 1826 | 1827 | from_inds = inds[indptr[p] : indptr[p + 1]] 1828 | from_data = data[indptr[p] : indptr[p + 1]] 1829 | 1830 | to_inds = inds[indptr[q] : indptr[q + 1]] 1831 | to_data = data[indptr[q] : indptr[q + 1]] 1832 | 1833 | d = sparse_dist( 1834 | from_inds, from_data, to_inds, to_data, *dist_args 1835 | ) 1836 | 1837 | c += heap_push(current_graph, p, d, q, 1) 1838 | c += heap_push(current_graph, q, d, p, 1) 1839 | 1840 | if c <= delta * n_neighbors * n_vertices: 1841 | break 1842 | 1843 | return deheap_sort(current_graph) 1844 | 1845 | return nn_descent 1846 | 1847 | 1848 | @numba.njit() 1849 | def general_sset_intersection( 1850 | indptr1, 1851 | indices1, 1852 | data1, 1853 | indptr2, 1854 | indices2, 1855 | data2, 1856 | result_row, 1857 | result_col, 1858 | result_val, 1859 | mix_weight=0.5, 1860 | ): 1861 | 1862 | left_min = max(data1.min() / 2.0, 1.0e-8) 1863 | right_min = max(data2.min() / 2.0, 1.0e-8) 1864 | 1865 | for idx in range(result_row.shape[0]): 1866 | i = result_row[idx] 1867 | j = result_col[idx] 1868 | 1869 | left_val = left_min 1870 | for k in range(indptr1[i], indptr1[i + 1]): 1871 | if indices1[k] == j: 1872 | left_val = data1[k] 1873 | 1874 | right_val = right_min 1875 | for k in range(indptr2[i], indptr2[i + 1]): 1876 | if indices2[k] == j: 1877 | right_val = data2[k] 1878 | 1879 | if left_val > left_min or right_val > right_min: 1880 | if mix_weight < 0.5: 1881 | result_val[idx] = left_val * pow( 1882 | right_val, mix_weight / (1.0 - mix_weight) 1883 | ) 1884 | else: 1885 | result_val[idx] = ( 1886 | pow(left_val, (1.0 - mix_weight) / mix_weight) * right_val 1887 | ) 1888 | 1889 | return 1890 | 1891 | 1892 | @numba.njit() 1893 | def sparse_euclidean(ind1, data1, ind2, data2): 1894 | aux_inds, aux_data = sparse_diff(ind1, data1, ind2, data2) 1895 | result = 0.0 1896 | for i in range(aux_data.shape[0]): 1897 | result += aux_data[i] ** 2 1898 | return np.sqrt(result) 1899 | 1900 | 1901 | @numba.njit() 1902 | def sparse_manhattan(ind1, data1, ind2, data2): 1903 | aux_inds, aux_data = sparse_diff(ind1, data1, ind2, data2) 1904 | result = 0.0 1905 | for i in range(aux_data.shape[0]): 1906 | result += np.abs(aux_data[i]) 1907 | return result 1908 | 1909 | 1910 | @numba.njit() 1911 | def sparse_chebyshev(ind1, data1, ind2, data2): 1912 | aux_inds, aux_data = sparse_diff(ind1, data1, ind2, data2) 1913 | result = 0.0 1914 | for i in range(aux_data.shape[0]): 1915 | result = max(result, np.abs(aux_data[i])) 1916 | return result 1917 | 1918 | 1919 | @numba.njit() 1920 | def sparse_minkowski(ind1, data1, ind2, data2, p=2.0): 1921 | aux_inds, aux_data = sparse_diff(ind1, data1, ind2, data2) 1922 | result = 0.0 1923 | for i in range(aux_data.shape[0]): 1924 | result += np.abs(aux_data[i]) ** p 1925 | return result ** (1.0 / p) 1926 | 1927 | 1928 | @numba.njit() 1929 | def sparse_hamming(ind1, data1, ind2, data2, n_features): 1930 | num_not_equal = sparse_diff(ind1, data1, ind2, data2)[0].shape[0] 1931 | return float(num_not_equal) / n_features 1932 | 1933 | 1934 | @numba.njit() 1935 | def sparse_canberra(ind1, data1, ind2, data2): 1936 | abs_data1 = np.abs(data1) 1937 | abs_data2 = np.abs(data2) 1938 | denom_inds, denom_data = sparse_sum(ind1, abs_data1, ind2, 
abs_data2) 1939 | denom_data = 1.0 / denom_data 1940 | numer_inds, numer_data = sparse_diff(ind1, data1, ind2, data2) 1941 | numer_data = np.abs(numer_data) 1942 | 1943 | val_inds, val_data = sparse_mul(numer_inds, numer_data, denom_inds, denom_data) 1944 | 1945 | return np.sum(val_data) 1946 | 1947 | 1948 | @numba.njit() 1949 | def sparse_bray_curtis(ind1, data1, ind2, data2): 1950 | abs_data1 = np.abs(data1) 1951 | abs_data2 = np.abs(data2) 1952 | denom_inds, denom_data = sparse_sum(ind1, abs_data1, ind2, abs_data2) 1953 | 1954 | if denom_data.shape[0] == 0: 1955 | return 0.0 1956 | 1957 | denominator = np.sum(denom_data) 1958 | 1959 | numer_inds, numer_data = sparse_diff(ind1, data1, ind2, data2) 1960 | numer_data = np.abs(numer_data) 1961 | 1962 | numerator = np.sum(numer_data) 1963 | 1964 | return float(numerator) / denominator 1965 | 1966 | 1967 | @numba.njit() 1968 | def sparse_jaccard(ind1, data1, ind2, data2): 1969 | num_non_zero = arr_union(ind1, ind2).shape[0] 1970 | num_equal = arr_intersect(ind1, ind2).shape[0] 1971 | 1972 | if num_non_zero == 0: 1973 | return 0.0 1974 | else: 1975 | return float(num_non_zero - num_equal) / num_non_zero 1976 | 1977 | 1978 | @numba.njit() 1979 | def sparse_matching(ind1, data1, ind2, data2, n_features): 1980 | num_true_true = arr_intersect(ind1, ind2).shape[0] 1981 | num_non_zero = arr_union(ind1, ind2).shape[0] 1982 | num_not_equal = num_non_zero - num_true_true 1983 | 1984 | return float(num_not_equal) / n_features 1985 | 1986 | 1987 | @numba.njit() 1988 | def sparse_dice(ind1, data1, ind2, data2): 1989 | num_true_true = arr_intersect(ind1, ind2).shape[0] 1990 | num_non_zero = arr_union(ind1, ind2).shape[0] 1991 | num_not_equal = num_non_zero - num_true_true 1992 | 1993 | if num_not_equal == 0.0: 1994 | return 0.0 1995 | else: 1996 | return num_not_equal / (2.0 * num_true_true + num_not_equal) 1997 | 1998 | 1999 | @numba.njit() 2000 | def sparse_kulsinski(ind1, data1, ind2, data2, n_features): 2001 | num_true_true = arr_intersect(ind1, ind2).shape[0] 2002 | num_non_zero = arr_union(ind1, ind2).shape[0] 2003 | num_not_equal = num_non_zero - num_true_true 2004 | 2005 | if num_not_equal == 0: 2006 | return 0.0 2007 | else: 2008 | return float(num_not_equal - num_true_true + n_features) / ( 2009 | num_not_equal + n_features 2010 | ) 2011 | 2012 | 2013 | @numba.njit() 2014 | def sparse_rogers_tanimoto(ind1, data1, ind2, data2, n_features): 2015 | num_true_true = arr_intersect(ind1, ind2).shape[0] 2016 | num_non_zero = arr_union(ind1, ind2).shape[0] 2017 | num_not_equal = num_non_zero - num_true_true 2018 | 2019 | return (2.0 * num_not_equal) / (n_features + num_not_equal) 2020 | 2021 | 2022 | @numba.njit() 2023 | def sparse_russellrao(ind1, data1, ind2, data2, n_features): 2024 | if ind1.shape[0] == ind2.shape[0] and np.all(ind1 == ind2): 2025 | return 0.0 2026 | 2027 | num_true_true = arr_intersect(ind1, ind2).shape[0] 2028 | 2029 | if num_true_true == np.sum(data1 != 0) and num_true_true == np.sum(data2 != 0): 2030 | return 0.0 2031 | else: 2032 | return float(n_features - num_true_true) / (n_features) 2033 | 2034 | 2035 | @numba.njit() 2036 | def sparse_sokal_michener(ind1, data1, ind2, data2, n_features): 2037 | num_true_true = arr_intersect(ind1, ind2).shape[0] 2038 | num_non_zero = arr_union(ind1, ind2).shape[0] 2039 | num_not_equal = num_non_zero - num_true_true 2040 | 2041 | return (2.0 * num_not_equal) / (n_features + num_not_equal) 2042 | 2043 | 2044 | @numba.njit() 2045 | def sparse_sokal_sneath(ind1, data1, ind2, data2): 2046 | 
num_true_true = arr_intersect(ind1, ind2).shape[0] 2047 | num_non_zero = arr_union(ind1, ind2).shape[0] 2048 | num_not_equal = num_non_zero - num_true_true 2049 | 2050 | if num_not_equal == 0.0: 2051 | return 0.0 2052 | else: 2053 | return num_not_equal / (0.5 * num_true_true + num_not_equal) 2054 | 2055 | 2056 | @numba.njit() 2057 | def sparse_cosine(ind1, data1, ind2, data2): 2058 | aux_inds, aux_data = sparse_mul(ind1, data1, ind2, data2) 2059 | result = 0.0 2060 | norm1 = norm(data1) 2061 | norm2 = norm(data2) 2062 | 2063 | for i in range(aux_data.shape[0]): 2064 | result += aux_data[i] 2065 | 2066 | if norm1 == 0.0 and norm2 == 0.0: 2067 | return 0.0 2068 | elif norm1 == 0.0 or norm2 == 0.0: 2069 | return 1.0 2070 | else: 2071 | return 1.0 - (result / (norm1 * norm2)) 2072 | 2073 | 2074 | @numba.njit() 2075 | def sparse_correlation(ind1, data1, ind2, data2, n_features): 2076 | 2077 | mu_x = 0.0 2078 | mu_y = 0.0 2079 | dot_product = 0.0 2080 | 2081 | if ind1.shape[0] == 0 and ind2.shape[0] == 0: 2082 | return 0.0 2083 | elif ind1.shape[0] == 0 or ind2.shape[0] == 0: 2084 | return 1.0 2085 | 2086 | for i in range(data1.shape[0]): 2087 | mu_x += data1[i] 2088 | for i in range(data2.shape[0]): 2089 | mu_y += data2[i] 2090 | 2091 | mu_x /= n_features 2092 | mu_y /= n_features 2093 | 2094 | shifted_data1 = np.empty(data1.shape[0], dtype=np.float32) 2095 | shifted_data2 = np.empty(data2.shape[0], dtype=np.float32) 2096 | 2097 | for i in range(data1.shape[0]): 2098 | shifted_data1[i] = data1[i] - mu_x 2099 | for i in range(data2.shape[0]): 2100 | shifted_data2[i] = data2[i] - mu_y 2101 | 2102 | norm1 = np.sqrt( 2103 | (norm(shifted_data1) ** 2) + (n_features - ind1.shape[0]) * (mu_x ** 2) 2104 | ) 2105 | norm2 = np.sqrt( 2106 | (norm(shifted_data2) ** 2) + (n_features - ind2.shape[0]) * (mu_y ** 2) 2107 | ) 2108 | 2109 | dot_prod_inds, dot_prod_data = sparse_mul(ind1, shifted_data1, ind2, shifted_data2) 2110 | 2111 | common_indices = set(dot_prod_inds) 2112 | 2113 | for i in range(dot_prod_data.shape[0]): 2114 | dot_product += dot_prod_data[i] 2115 | 2116 | for i in range(ind1.shape[0]): 2117 | if ind1[i] not in common_indices: 2118 | dot_product -= shifted_data1[i] * (mu_y) 2119 | 2120 | for i in range(ind2.shape[0]): 2121 | if ind2[i] not in common_indices: 2122 | dot_product -= shifted_data2[i] * (mu_x) 2123 | 2124 | all_indices = arr_union(ind1, ind2) 2125 | dot_product += mu_x * mu_y * (n_features - all_indices.shape[0]) 2126 | 2127 | if norm1 == 0.0 and norm2 == 0.0: 2128 | return 0.0 2129 | elif dot_product == 0.0: 2130 | return 1.0 2131 | else: 2132 | return 1.0 - (dot_product / (norm1 * norm2)) 2133 | 2134 | 2135 | sparse_named_distances = { 2136 | 2137 | "euclidean": sparse_euclidean, 2138 | "manhattan": sparse_manhattan, 2139 | "l1": sparse_manhattan, 2140 | "taxicab": sparse_manhattan, 2141 | "chebyshev": sparse_chebyshev, 2142 | "linf": sparse_chebyshev, 2143 | "linfty": sparse_chebyshev, 2144 | "linfinity": sparse_chebyshev, 2145 | "minkowski": sparse_minkowski, 2146 | 2147 | "canberra": sparse_canberra, 2148 | 2149 | 2150 | "hamming": sparse_hamming, 2151 | "jaccard": sparse_jaccard, 2152 | "dice": sparse_dice, 2153 | "matching": sparse_matching, 2154 | "kulsinski": sparse_kulsinski, 2155 | "rogerstanimoto": sparse_rogers_tanimoto, 2156 | "russellrao": sparse_russellrao, 2157 | "sokalmichener": sparse_sokal_michener, 2158 | "sokalsneath": sparse_sokal_sneath, 2159 | "cosine": sparse_cosine, 2160 | "correlation": sparse_correlation, 2161 | } 2162 | 2163 | sparse_need_n_features 
= ( 2164 | "hamming", 2165 | "matching", 2166 | "kulsinski", 2167 | "rogerstanimoto", 2168 | "russellrao", 2169 | "sokalmichener", 2170 | "correlation", 2171 | ) 2172 | 2173 | 2174 | 2175 | 2176 | 2177 | 2178 | 2179 | 2180 | 2181 | 2182 | 2183 | 2184 | 2185 | 2186 | 2187 | import numpy as np 2188 | import numba 2189 | import scipy 2190 | from sklearn.metrics import pairwise_distances 2191 | from sklearn.utils import check_random_state 2192 | from sklearn.neighbors import KDTree 2193 | from scipy.spatial import cKDTree 2194 | 2195 | 2196 | 2197 | 2198 | 2199 | 2200 | 2201 | #INT32_MIN = np.iinfo(np.int32).min + 1 2202 | #INT32_MAX = np.iinfo(np.int32).max - 1 2203 | 2204 | SMOOTH_K_TOLERANCE = 1e-5 2205 | MIN_K_DIST_SCALE = 1e-3 2206 | NPY_INFINITY = np.inf 2207 | 2208 | def nearest_neighbors( 2209 | X, 2210 | n_neighbors, 2211 | metric, 2212 | metric_kwds, 2213 | angular, 2214 | random_state, 2215 | verbose=False, 2216 | ): 2217 | 2218 | if verbose: 2219 | print("Finding Nearest Neighbors") 2220 | 2221 | if metric == "precomputed": 2222 | 2223 | 2224 | knn_indices = fast_knn_indices(X, n_neighbors) 2225 | 2226 | 2227 | knn_dists = X[ 2228 | np.arange(X.shape[0])[:, None], knn_indices 2229 | ].copy() 2230 | 2231 | rp_forest = [] 2232 | else: 2233 | if callable(metric): 2234 | distance_func = metric 2235 | elif metric in named_distances: 2236 | distance_func = named_distances[metric] 2237 | else: 2238 | raise ValueError( 2239 | "Metric is neither callable, " 2240 | + "nor a recognised string" 2241 | ) 2242 | 2243 | if metric in ( 2244 | "cosine", 2245 | "correlation", 2246 | "dice", 2247 | "jaccard", 2248 | ): 2249 | angular = True 2250 | 2251 | rng_state = random_state.randint( 2252 | np.iinfo(np.int32).min + 1, np.iinfo(np.int32).max - 1, 3 2253 | ).astype(np.int64) 2254 | 2255 | if scipy.sparse.isspmatrix_csr(X): 2256 | if metric in sparse.sparse_named_distances: 2257 | distance_func = sparse.sparse_named_distances[ 2258 | metric 2259 | ] 2260 | if metric in sparse.sparse_need_n_features: 2261 | metric_kwds["n_features"] = X.shape[1] 2262 | else: 2263 | raise ValueError( 2264 | "Metric {} not supported for sparse " 2265 | + "data".format(metric) 2266 | ) 2267 | metric_nn_descent = sparse.make_sparse_nn_descent( 2268 | distance_func, tuple(metric_kwds.values()) 2269 | ) 2270 | 2271 | 2272 | n_trees = 5 + int( 2273 | round((X.shape[0]) ** 0.5 / 20.0) 2274 | ) 2275 | n_iters = max( 2276 | 5, int(round(np.log2(X.shape[0]))) 2277 | ) 2278 | if verbose: 2279 | print( 2280 | "Building RP forest with", 2281 | str(n_trees), 2282 | "trees", 2283 | ) 2284 | 2285 | rp_forest = make_forest( 2286 | X, n_neighbors, n_trees, rng_state, angular 2287 | ) 2288 | leaf_array = rptree_leaf_array(rp_forest) 2289 | 2290 | if verbose: 2291 | print( 2292 | "NN descent for", 2293 | str(n_iters), 2294 | "iterations", 2295 | ) 2296 | knn_indices, knn_dists = metric_nn_descent( 2297 | X.indices, 2298 | X.indptr, 2299 | X.data, 2300 | X.shape[0], 2301 | n_neighbors, 2302 | rng_state, 2303 | max_candidates=60, 2304 | rp_tree_init=True, 2305 | leaf_array=leaf_array, 2306 | n_iters=n_iters, 2307 | verbose=verbose, 2308 | ) 2309 | else: 2310 | metric_nn_descent = make_nn_descent( 2311 | distance_func, tuple(metric_kwds.values()) 2312 | ) 2313 | 2314 | n_trees = 5 + int( 2315 | round((X.shape[0]) ** 0.5 / 20.0) 2316 | ) 2317 | n_iters = max( 2318 | 5, int(round(np.log2(X.shape[0]))) 2319 | ) 2320 | 2321 | if verbose: 2322 | print( 2323 | "Building RP forest with", 2324 | str(n_trees), 2325 | "trees", 2326 | ) 2327 | 
rp_forest = make_forest( 2328 | X, n_neighbors, n_trees, rng_state, angular 2329 | ) 2330 | leaf_array = rptree_leaf_array(rp_forest) 2331 | if verbose: 2332 | print( 2333 | "NN descent for", 2334 | str(n_iters), 2335 | "iterations", 2336 | ) 2337 | knn_indices, knn_dists = metric_nn_descent( 2338 | X, 2339 | n_neighbors, 2340 | rng_state, 2341 | max_candidates=60, 2342 | rp_tree_init=True, 2343 | leaf_array=leaf_array, 2344 | n_iters=n_iters, 2345 | verbose=verbose, 2346 | ) 2347 | 2348 | if np.any(knn_indices < 0): 2349 | warn( 2350 | "Failed to correctly find n_neighbors for some samples." 2351 | "Results may be less than ideal. Try re-running with" 2352 | "different parameters." 2353 | ) 2354 | if verbose: 2355 | print("Finished Nearest Neighbor Search") 2356 | return knn_indices, knn_dists, rp_forest 2357 | 2358 | @numba.njit( 2359 | fastmath=True 2360 | ) 2361 | def smooth_knn_dist( 2362 | distances, 2363 | k, 2364 | n_iter=64, 2365 | local_connectivity=1.0, 2366 | bandwidth=1.0, 2367 | cardinality=None 2368 | ): 2369 | 2370 | if cardinality is None: 2371 | target = np.log2(k) * bandwidth 2372 | else: 2373 | target = cardinality 2374 | rho = np.zeros(distances.shape[0]) 2375 | result = np.zeros(distances.shape[0]) 2376 | 2377 | mean_distances = np.mean(distances) 2378 | 2379 | for i in range(distances.shape[0]): 2380 | lo = 0.0 2381 | hi = NPY_INFINITY 2382 | mid = 1.0 2383 | 2384 | 2385 | ith_distances = distances[i] 2386 | non_zero_dists = ith_distances[ith_distances > 0.0] 2387 | if non_zero_dists.shape[0] >= local_connectivity: 2388 | index = int(np.floor(local_connectivity)) 2389 | interpolation = local_connectivity - index 2390 | if index > 0: 2391 | rho[i] = non_zero_dists[index - 1] 2392 | if interpolation > SMOOTH_K_TOLERANCE: 2393 | rho[i] += interpolation * ( 2394 | non_zero_dists[index] 2395 | - non_zero_dists[index - 1] 2396 | ) 2397 | else: 2398 | rho[i] = interpolation * non_zero_dists[0] 2399 | elif non_zero_dists.shape[0] > 0: 2400 | rho[i] = np.max(non_zero_dists) 2401 | 2402 | for n in range(n_iter): 2403 | 2404 | psum = 0.0 2405 | for j in range(1, distances.shape[1]): 2406 | d = distances[i, j] - rho[i] 2407 | if d > 0: 2408 | psum += np.exp(-(d / mid)) 2409 | else: 2410 | psum += 1.0 2411 | 2412 | if np.fabs(psum - target) < SMOOTH_K_TOLERANCE: 2413 | break 2414 | 2415 | if psum > target: 2416 | hi = mid 2417 | mid = (lo + hi) / 2.0 2418 | else: 2419 | lo = mid 2420 | if hi == NPY_INFINITY: 2421 | mid *= 2 2422 | else: 2423 | mid = (lo + hi) / 2.0 2424 | 2425 | result[i] = mid 2426 | 2427 | 2428 | if rho[i] > 0.0: 2429 | mean_ith_distances = np.mean(ith_distances) 2430 | if ( 2431 | result[i] 2432 | < MIN_K_DIST_SCALE * mean_ith_distances 2433 | ): 2434 | result[i] = ( 2435 | MIN_K_DIST_SCALE * mean_ith_distances 2436 | ) 2437 | else: 2438 | if ( 2439 | result[i] 2440 | < MIN_K_DIST_SCALE * mean_distances 2441 | ): 2442 | result[i] = ( 2443 | MIN_K_DIST_SCALE * mean_distances 2444 | ) 2445 | 2446 | return result, rho 2447 | 2448 | @numba.njit(parallel=True, fastmath=True) 2449 | def compute_membership_strengths( 2450 | knn_indices, knn_dists, sigmas, rhos 2451 | ): 2452 | 2453 | n_samples = knn_indices.shape[0] 2454 | n_neighbors = knn_indices.shape[1] 2455 | 2456 | rows = np.zeros(knn_indices.size, dtype=np.int64) 2457 | cols = np.zeros(knn_indices.size, dtype=np.int64) 2458 | vals = np.zeros(knn_indices.size, dtype=np.float64) 2459 | 2460 | for i in range(n_samples): 2461 | for j in range(n_neighbors): 2462 | if knn_indices[i, j] == -1: 2463 | continue 2464 | 
if knn_indices[i, j] == i: 2465 | val = 0.0 2466 | elif knn_dists[i, j] - rhos[i] <= 0.0: 2467 | val = 1.0 2468 | else: 2469 | val = np.exp( 2470 | -( 2471 | (knn_dists[i, j] - rhos[i]) 2472 | / (sigmas[i]) 2473 | ) 2474 | ) 2475 | 2476 | rows[i * n_neighbors + j] = i 2477 | cols[i * n_neighbors + j] = knn_indices[i, j] 2478 | vals[i * n_neighbors + j] = val 2479 | 2480 | return rows, cols, vals 2481 | 2482 | def create_tree(data, metric): 2483 | if metric == 'euclidean': 2484 | ckd = cKDTree(data) 2485 | else: 2486 | ckd = KDTree(data, metric=metric) 2487 | return ckd 2488 | 2489 | def query_tree(data, ckd, k, metric): 2490 | if metric == 'euclidean': 2491 | ckdout = ckd.query(x=data, k=k, workers=-1) 2492 | else: 2493 | ckdout = ckd.query(data, k=k) 2494 | return ckdout 2495 | 2496 | def partitioned_nearest_neighbors(X, Y, k, metric='euclidean'): 2497 | tree = create_tree(Y, metric) 2498 | nns = query_tree(X, tree, k, metric) 2499 | knn_indices = nns[1] 2500 | knn_dists = nns[0] 2501 | return knn_indices, knn_dists 2502 | 2503 | 2504 | 2505 | 2506 | 2507 | 2508 | 2509 | 2510 | 2511 | 2512 | 2513 | 2514 | 2515 | import numpy as np 2516 | 2517 | import scipy.sparse 2518 | import scipy.sparse.csgraph 2519 | 2520 | from sklearn.manifold import SpectralEmbedding 2521 | from sklearn.metrics import pairwise_distances 2522 | from warnings import warn 2523 | 2524 | 2525 | def component_layout( 2526 | data, n_components, component_labels, dim, metric="euclidean", metric_kwds={} 2527 | ): 2528 | 2529 | component_centroids = np.empty((n_components, data.shape[1]), dtype=np.float64) 2530 | 2531 | for label in range(n_components): 2532 | component_centroids[label] = data[component_labels == label].mean(axis=0) 2533 | 2534 | distance_matrix = pairwise_distances( 2535 | component_centroids, metric=metric, **metric_kwds 2536 | ) 2537 | affinity_matrix = np.exp(-distance_matrix ** 2) 2538 | 2539 | component_embedding = SpectralEmbedding( 2540 | n_components=dim, affinity="precomputed" 2541 | ).fit_transform(affinity_matrix) 2542 | component_embedding /= component_embedding.max() 2543 | 2544 | return component_embedding 2545 | 2546 | 2547 | def multi_component_layout( 2548 | data, 2549 | graph, 2550 | n_components, 2551 | component_labels, 2552 | dim, 2553 | random_state, 2554 | metric="euclidean", 2555 | metric_kwds={}, 2556 | ): 2557 | 2558 | 2559 | result = np.empty((graph.shape[0], dim), dtype=np.float32) 2560 | 2561 | if n_components > 2 * dim: 2562 | meta_embedding = component_layout( 2563 | data, 2564 | n_components, 2565 | component_labels, 2566 | dim, 2567 | metric=metric, 2568 | metric_kwds=metric_kwds, 2569 | ) 2570 | else: 2571 | k = int(np.ceil(n_components / 2.0)) 2572 | base = np.hstack([np.eye(k), np.zeros((k, dim - k))]) 2573 | meta_embedding = np.vstack([base, -base])[:n_components] 2574 | 2575 | for label in range(n_components): 2576 | component_graph = graph.tocsr()[component_labels == label, :].tocsc() 2577 | component_graph = component_graph[:, component_labels == label].tocoo() 2578 | 2579 | distances = pairwise_distances([meta_embedding[label]], meta_embedding) 2580 | data_range = distances[distances > 0.0].min() / 2.0 2581 | 2582 | if component_graph.shape[0] < 2 * dim: 2583 | result[component_labels == label] = ( 2584 | random_state.uniform( 2585 | low=-data_range, 2586 | high=data_range, 2587 | size=(component_graph.shape[0], dim), 2588 | ) 2589 | + meta_embedding[label] 2590 | ) 2591 | continue 2592 | 2593 | diag_data = np.asarray(component_graph.sum(axis=0)) 2594 | 2595 | 2596 
| 2597 | 2598 | I = scipy.sparse.identity(component_graph.shape[0], dtype=np.float64) 2599 | D = scipy.sparse.spdiags( 2600 | 1.0 / np.sqrt(diag_data), 2601 | 0, 2602 | component_graph.shape[0], 2603 | component_graph.shape[0], 2604 | ) 2605 | L = I - D * component_graph * D 2606 | 2607 | k = dim + 1 2608 | num_lanczos_vectors = max(2 * k + 1, int(np.sqrt(component_graph.shape[0]))) 2609 | try: 2610 | eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh( 2611 | L, 2612 | k, 2613 | which="SM", 2614 | ncv=num_lanczos_vectors, 2615 | tol=1e-4, 2616 | v0=np.ones(L.shape[0]), 2617 | maxiter=graph.shape[0] * 5, 2618 | ) 2619 | order = np.argsort(eigenvalues)[1:k] 2620 | component_embedding = eigenvectors[:, order] 2621 | expansion = data_range / np.max(np.abs(component_embedding)) 2622 | component_embedding *= expansion 2623 | result[component_labels == label] = ( 2624 | component_embedding + meta_embedding[label] 2625 | ) 2626 | except scipy.sparse.linalg.ArpackError: 2627 | warn( 2628 | "WARNING: spectral initialisation failed! The eigenvector solver\n" 2629 | "failed. This is likely due to too small an eigengap. Consider\n" 2630 | "adding some noise or jitter to your data.\n\n" 2631 | "Falling back to random initialisation!" 2632 | ) 2633 | result[component_labels == label] = ( 2634 | random_state.uniform( 2635 | low=-data_range, 2636 | high=data_range, 2637 | size=(component_graph.shape[0], dim), 2638 | ) 2639 | + meta_embedding[label] 2640 | ) 2641 | 2642 | return result 2643 | 2644 | 2645 | def spectral_layout(data, graph, dim, random_state, metric="euclidean", metric_kwds={}): 2646 | 2647 | n_samples = graph.shape[0] 2648 | n_components, labels = scipy.sparse.csgraph.connected_components(graph) 2649 | 2650 | if n_components > 1: 2651 | warn( 2652 | "Embedding a total of {} separate connected components using meta-embedding (experimental)".format( 2653 | n_components 2654 | ) 2655 | ) 2656 | return multi_component_layout( 2657 | data, 2658 | graph, 2659 | n_components, 2660 | labels, 2661 | dim, 2662 | random_state, 2663 | metric=metric, 2664 | metric_kwds=metric_kwds, 2665 | ) 2666 | 2667 | diag_data = np.asarray(graph.sum(axis=0)) 2668 | 2669 | 2670 | 2671 | 2672 | I = scipy.sparse.identity(graph.shape[0], dtype=np.float64) 2673 | D = scipy.sparse.spdiags( 2674 | 1.0 / np.sqrt(diag_data), 0, graph.shape[0], graph.shape[0] 2675 | ) 2676 | L = I - D * graph * D 2677 | 2678 | k = dim + 1 2679 | num_lanczos_vectors = max(2 * k + 1, int(np.sqrt(graph.shape[0]))) 2680 | try: 2681 | if L.shape[0] < 2000000: 2682 | eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh( 2683 | L, 2684 | k, 2685 | which="SM", 2686 | ncv=num_lanczos_vectors, 2687 | tol=1e-4, 2688 | v0=np.ones(L.shape[0]), 2689 | maxiter=graph.shape[0] * 5, 2690 | ) 2691 | else: 2692 | eigenvalues, eigenvectors = scipy.sparse.linalg.lobpcg( 2693 | L, random_state.normal(size=(L.shape[0], k)), largest=False, tol=1e-8 2694 | ) 2695 | order = np.argsort(eigenvalues)[1:k] 2696 | return eigenvectors[:, order] 2697 | except scipy.sparse.linalg.ArpackError: 2698 | warn( 2699 | "WARNING: spectral initialisation failed! The eigenvector solver\n" 2700 | "failed. This is likely due to too small an eigengap. Consider\n" 2701 | "adding some noise or jitter to your data.\n\n" 2702 | "Falling back to random initialisation!" 
2703 | ) 2704 | return random_state.uniform(low=-10.0, high=10.0, size=(graph.shape[0], dim)) 2705 | 2706 | 2707 | 2708 | 2709 | 2710 | 2711 | 2712 | 2713 | 2714 | 2715 | 2716 | 2717 | import numpy as np 2718 | import numba 2719 | 2720 | 2721 | @numba.njit() 2722 | def clip(val): 2723 | 2724 | if val > 4.0: 2725 | return 4.0 2726 | elif val < -4.0: 2727 | return -4.0 2728 | else: 2729 | return val 2730 | 2731 | @numba.njit( 2732 | "f4(f4[::1],f4[::1])", 2733 | fastmath=True, 2734 | cache=True, 2735 | locals={ 2736 | "result": numba.types.float32, 2737 | "diff": numba.types.float32, 2738 | "dim": numba.types.intp, 2739 | }, 2740 | ) 2741 | def rdist(x, y): 2742 | 2743 | result = 0.0 2744 | dim = x.shape[0] 2745 | for i in range(dim): 2746 | diff = x[i] - y[i] 2747 | result += diff * diff 2748 | 2749 | return result 2750 | 2751 | 2752 | def _optimize_layout_euclidean_single_epoch( 2753 | head_embedding, 2754 | head, 2755 | tail, 2756 | n_vertices, 2757 | epochs_per_sample, 2758 | a, 2759 | b, 2760 | rng_state, 2761 | gamma, 2762 | dim, 2763 | move_other, 2764 | alpha, 2765 | epochs_per_negative_sample, 2766 | epoch_of_next_negative_sample, 2767 | epoch_of_next_sample, 2768 | n, 2769 | ): 2770 | for i in numba.prange(epochs_per_sample.shape[0]): 2771 | if epoch_of_next_sample[i] <= n: 2772 | j = head[i] 2773 | k = tail[i] 2774 | 2775 | current = head_embedding[j] 2776 | other = head_embedding[k] 2777 | 2778 | dist_squared = rdist(current, other) 2779 | 2780 | if dist_squared > 0.0: 2781 | grad_coeff = -2.0 * a * b * pow(dist_squared, b - 1.0) 2782 | grad_coeff /= a * pow(dist_squared, b) + 1.0 2783 | else: 2784 | grad_coeff = 0.0 2785 | 2786 | for d in range(dim): 2787 | grad_d = clip(grad_coeff * (current[d] - other[d])) 2788 | current[d] += grad_d * alpha 2789 | if move_other: 2790 | other[d] += -grad_d * alpha 2791 | 2792 | epoch_of_next_sample[i] += epochs_per_sample[i] 2793 | 2794 | n_neg_samples = int( 2795 | (n - epoch_of_next_negative_sample[i]) / epochs_per_negative_sample[i] 2796 | ) 2797 | 2798 | for p in range(n_neg_samples): 2799 | k = tau_rand_int(rng_state) % n_vertices 2800 | 2801 | other = head_embedding[k] 2802 | 2803 | dist_squared = rdist(current, other) 2804 | 2805 | if dist_squared > 0.0: 2806 | grad_coeff = 2.0 * gamma * b 2807 | grad_coeff /= (0.001 + dist_squared) * ( 2808 | a * pow(dist_squared, b) + 1 2809 | ) 2810 | elif j == k: 2811 | continue 2812 | else: 2813 | grad_coeff = 0.0 2814 | 2815 | for d in range(dim): 2816 | if grad_coeff > 0.0: 2817 | grad_d = clip(grad_coeff * (current[d] - other[d])) 2818 | else: 2819 | grad_d = 4.0 2820 | current[d] += grad_d * alpha 2821 | 2822 | epoch_of_next_negative_sample[i] += ( 2823 | n_neg_samples * epochs_per_negative_sample[i] 2824 | ) 2825 | 2826 | return head_embedding 2827 | 2828 | 2829 | 2830 | 2831 | 2832 | 2833 | 2834 | 2835 | 2836 | def fuzzy_simplicial_set( 2837 | Xs, 2838 | joint, 2839 | joint_idxs, 2840 | weights, 2841 | n_neighbors, 2842 | cardinality, 2843 | metrics, 2844 | metric_kwds, 2845 | joint_metrics, 2846 | angular, 2847 | set_op_mix_ratio, 2848 | local_connectivity, 2849 | n_epochs, 2850 | random_state, 2851 | verbose, 2852 | ): 2853 | 2854 | len_Xs = [len(i) for i in Xs] 2855 | 2856 | rows, cols, vals = np.array([]), np.array([]), np.array([]) 2857 | 2858 | for i in range(len(Xs)): 2859 | 2860 | X_n_neighbors = int(round(n_neighbors * len_Xs[i]/sum(len_Xs))) 2861 | if X_n_neighbors < 2: 2862 | weights[(i,i)] *= X_n_neighbors/2 2863 | X_n_neighbors = 2 2864 | 2865 | if Xs[i].shape[0] < 4096: 2866 | X 
= Xs[i] 2867 | if scipy.sparse.issparse(Xs[i]): 2868 | X = Xs[i].toarray() 2869 | dmat = pairwise_distances(Xs[i], metric=metrics[i], **metric_kwds[i]) 2870 | knn_indices, knn_dists, _ = nearest_neighbors( 2871 | dmat, 2872 | X_n_neighbors, 2873 | 'precomputed', 2874 | {}, 2875 | angular, 2876 | np.random.RandomState(random_state), 2877 | verbose=verbose, 2878 | ) 2879 | else: 2880 | knn_indices, knn_dists, _ = nearest_neighbors( 2881 | Xs[i], 2882 | X_n_neighbors, 2883 | metrics[i], 2884 | metric_kwds[i], 2885 | angular, 2886 | np.random.RandomState(random_state), 2887 | verbose=verbose, 2888 | ) 2889 | 2890 | sigmas, rhos = smooth_knn_dist( 2891 | knn_dists, 2892 | 0, 2893 | local_connectivity=local_connectivity, 2894 | cardinality=cardinality * X_n_neighbors/n_neighbors 2895 | ) 2896 | 2897 | X_rows, X_cols, X_vals = compute_membership_strengths( 2898 | knn_indices, knn_dists, sigmas, rhos 2899 | ) 2900 | 2901 | rows = np.concatenate([rows, X_rows + sum(len_Xs[:i])]) 2902 | cols = np.concatenate([cols, X_cols + sum(len_Xs[:i])]) 2903 | vals = np.concatenate([vals, X_vals]) 2904 | 2905 | for k in joint.keys(): 2906 | XY = joint[k] 2907 | idxs = joint_idxs[k] 2908 | metric = joint_metrics[k] 2909 | 2910 | XY_n_neighbors = int(round(n_neighbors * len_Xs[k[1]]/sum(len_Xs) * len(idxs[1])/len_Xs[k[1]])) 2911 | YX_n_neighbors = int(round(n_neighbors * len_Xs[k[0]]/sum(len_Xs) * len(idxs[0])/len_Xs[k[0]])) 2912 | 2913 | if XY_n_neighbors < 2: 2914 | weights[(k[0],k[1])] *= XY_n_neighbors/2 2915 | XY_n_neighbors = 2 2916 | if YX_n_neighbors < 2: 2917 | weights[(k[1],k[0])] *= YX_n_neighbors/2 2918 | YX_n_neighbors = 2 2919 | 2920 | 2921 | if metric == 'precomputed': 2922 | XY_knn_indices = np.argsort(XY, axis=1)[:,XY_n_neighbors] 2923 | XY_knn_dists = np.sort(XY, axis=1)[:,XY_n_neighbors] 2924 | 2925 | YX_knn_indices = np.argsort(XY.T, axis=1)[:,YX_n_neighbors] 2926 | YX_knn_dists = np.sort(XY.T, axis=1)[:,YX_n_neighbors] 2927 | 2928 | else: 2929 | XY_knn_indices, XY_knn_dists = partitioned_nearest_neighbors(XY[0], XY[1], 2930 | XY_n_neighbors, metric) 2931 | YX_knn_indices, YX_knn_dists = partitioned_nearest_neighbors(XY[1], XY[0], 2932 | YX_n_neighbors, metric) 2933 | 2934 | XY_sigmas, XY_rhos = smooth_knn_dist( 2935 | XY_knn_dists, 2936 | 0, 2937 | local_connectivity=local_connectivity, 2938 | cardinality=cardinality * XY_n_neighbors/n_neighbors 2939 | ) 2940 | YX_sigmas, YX_rhos = smooth_knn_dist( 2941 | YX_knn_dists, 2942 | 0, 2943 | local_connectivity=local_connectivity, 2944 | cardinality=cardinality * YX_n_neighbors/n_neighbors 2945 | ) 2946 | 2947 | XY_rows, XY_cols, XY_vals = compute_membership_strengths( 2948 | XY_knn_indices, XY_knn_dists, XY_sigmas, XY_rhos 2949 | ) 2950 | YX_rows, YX_cols, YX_vals = compute_membership_strengths( 2951 | YX_knn_indices, YX_knn_dists, YX_sigmas, YX_rhos 2952 | ) 2953 | 2954 | rows = np.concatenate([rows, idxs[0][XY_rows] + sum(len_Xs[:k[0]])]) 2955 | cols = np.concatenate([cols, idxs[1][XY_cols] + sum(len_Xs[:k[1]])]) 2956 | vals = np.concatenate([vals, XY_vals]) 2957 | 2958 | rows = np.concatenate([rows, idxs[1][YX_rows] + sum(len_Xs[:k[1]])]) 2959 | cols = np.concatenate([cols, idxs[0][YX_cols] + sum(len_Xs[:k[0]])]) 2960 | vals = np.concatenate([vals, YX_vals]) 2961 | 2962 | fs = scipy.sparse.coo_matrix( 2963 | (vals, (rows, cols)), shape=(sum(len_Xs), sum(len_Xs)) 2964 | ) 2965 | fs.eliminate_zeros() 2966 | 2967 | transpose = fs.transpose() 2968 | 2969 | prod_matrix = fs.multiply(transpose) 2970 | 2971 | fs = ( 2972 | set_op_mix_ratio 2973 | * (fs 
+ transpose - prod_matrix) 2974 | + (1.0 - set_op_mix_ratio) * prod_matrix 2975 | ) 2976 | 2977 | 2978 | fs.sum_duplicates() 2979 | fs.data[fs.data < (fs.data.max() / float(n_epochs))] = 0.0 2980 | fs.eliminate_zeros() 2981 | full_graph = fs 2982 | 2983 | graphs = [] 2984 | for i in range(len(Xs)): 2985 | graphs += [fs[sum(len_Xs[:i]):sum(len_Xs[:i+1]), 2986 | sum(len_Xs[:i]):sum(len_Xs[:i+1])].tocoo()] 2987 | joint_graphs = {} 2988 | for k in joint.keys(): 2989 | joint_graphs[k] = fs[sum(len_Xs[:k[0]]):sum(len_Xs[:k[0]+1]), 2990 | sum(len_Xs[:k[1]]):sum(len_Xs[:k[1]+1])].tocoo() 2991 | 2992 | return graphs, joint_graphs, full_graph, weights 2993 | 2994 | def init_layout(init, 2995 | Xs, 2996 | graphs, 2997 | n_components, 2998 | metrics, 2999 | metric_kwds, 3000 | random_state): 3001 | 3002 | len_Xs = [len(i) for i in Xs] 3003 | 3004 | if init == 'random': 3005 | embeddings = [] 3006 | for i in range(len(Xs)): 3007 | embeddings += [np.random.RandomState(random_state).uniform(low=-10.0, high=10.0, 3008 | size=(len_Xs[i], n_components), 3009 | ).astype(np.float32)] 3010 | elif init == 'spectral': 3011 | embeddings = [] 3012 | for i in range(len(Xs)): 3013 | try: 3014 | X_embedding = spectral_layout( 3015 | Xs[i], 3016 | graphs[i], 3017 | n_components, 3018 | np.random.RandomState(random_state), 3019 | metric=metrics[i], 3020 | metric_kwds=metric_kwds[i], 3021 | ) 3022 | expansion = 10.0 / np.abs(X_embedding).max() 3023 | X_embedding = (X_embedding * expansion).astype(np.float32) + \ 3024 | np.random.RandomState(random_state).normal(scale=0.0001, 3025 | size=[len_Xs[i], n_components] 3026 | ).astype(np.float32) 3027 | except: 3028 | X_embedding = np.random.RandomState(random_state).uniform(low=-10.0, high=10.0, 3029 | size=(len_Xs[i], n_components), 3030 | ).astype(np.float32) 3031 | embeddings += [X_embedding] 3032 | else: 3033 | if len(init.shape) == 2: 3034 | if (np.unique(init, axis=0).shape[0] < init.shape[0]): 3035 | tree = KDTree(init_data) 3036 | dist, ind = tree.query(init_data, k=2) 3037 | nndist = np.mean(dist[:,1]) 3038 | embedding = init + np.random.RandomState(random_state).normal( 3039 | scale=0.001 * nndist, 3040 | size=init.shape 3041 | ).astype(np.float32) 3042 | else: 3043 | embedding = init 3044 | embeddings = [] 3045 | for i in range(len(Xs)): 3046 | embeddings += [embedding[sum(len_Xs[:i]):sum(len_Xs[:i+1])]] 3047 | 3048 | for i in range(len(embeddings)): 3049 | embeddings[i] = (10.0 * (embeddings[i] - np.min(embeddings[i], 0)) 3050 | / (np.max(embeddings[i], 0) - np.min(embeddings[i], 0)) 3051 | ).astype(np.float32, order="C") 3052 | return embeddings 3053 | 3054 | 3055 | def optimize_layout( 3056 | embeddings, 3057 | graphs, 3058 | joint_graphs, 3059 | weights, 3060 | n_epochs, 3061 | a, 3062 | b, 3063 | random_state, 3064 | gamma=1.0, 3065 | initial_alpha=1.0, 3066 | negative_sample_rate=5.0, 3067 | parallel=False, 3068 | verbose=False, 3069 | ): 3070 | 3071 | 3072 | len_Xs = np.array([len(i) for i in embeddings]) 3073 | dim = embeddings[0].shape[1] 3074 | move_other = True 3075 | alpha = initial_alpha 3076 | 3077 | heads = [i.row for i in graphs] 3078 | tails = [i.col for i in graphs] 3079 | n_vertices = [i.shape[1] for i in graphs] 3080 | 3081 | epochs_per_sample = [make_epochs_per_sample(i.data, n_epochs) for i in graphs] 3082 | epochs_per_negative_sample = [i/negative_sample_rate for i in epochs_per_sample] 3083 | epoch_of_next_negative_sample = [i.copy() for i in epochs_per_negative_sample] 3084 | epoch_of_next_sample = [i.copy() for i in epochs_per_sample] 3085 
| 3086 | joint_heads = {k: np.concatenate([joint_graphs[k].row, 3087 | joint_graphs[k].col + len_Xs[k[0]]]) for k in joint_graphs.keys()} 3088 | joint_tails = {k: np.concatenate([joint_graphs[k].col + len_Xs[k[0]], 3089 | joint_graphs[k].row]) for k in joint_graphs.keys()} 3090 | joint_n_vertices = {k: len_Xs[k[0]] + len_Xs[k[1]] for k in joint_graphs.keys()} 3091 | joint_epochs_per_sample = {k: make_epochs_per_sample( 3092 | np.concatenate([joint_graphs[k].data, joint_graphs[k].data]), n_epochs) for k in joint_graphs.keys()} 3093 | joint_epochs_per_negative_sample = {k: joint_epochs_per_sample[k]/negative_sample_rate for k in joint_graphs.keys()} 3094 | joint_epoch_of_next_negative_sample = {k: np.copy(joint_epochs_per_negative_sample[k]) for k in joint_graphs.keys()} 3095 | joint_epoch_of_next_sample = {k: np.copy(joint_epochs_per_sample[k]) for k in joint_graphs.keys()} 3096 | 3097 | 3098 | 3099 | optimize_fn = numba.njit( 3100 | _optimize_layout_euclidean_single_epoch, fastmath=True, parallel=parallel 3101 | ) 3102 | 3103 | for n in range(n_epochs): 3104 | 3105 | for i in range(len(embeddings)): 3106 | 3107 | if weights[(i,i)] != 0: 3108 | new_embedding = optimize_fn( 3109 | np.copy(embeddings[i]), 3110 | heads[i], 3111 | tails[i], 3112 | n_vertices[i], 3113 | epochs_per_sample[i], 3114 | a, 3115 | b, 3116 | np.random.RandomState(random_state).randint(np.iinfo(np.int32).min + 1, np.iinfo(np.int32).max - 1, 3).astype(np.int64), 3117 | gamma, 3118 | dim, 3119 | move_other, 3120 | alpha, 3121 | epochs_per_negative_sample[i], 3122 | epoch_of_next_negative_sample[i], 3123 | epoch_of_next_sample[i], 3124 | n, 3125 | ) 3126 | embeddings[i] += (new_embedding - embeddings[i]) * weights[(i,i)] 3127 | 3128 | for k in joint_graphs.keys(): 3129 | 3130 | if weights[(k[0], k[1])] != 0 or weights[(k[1], k[0])] != 0: 3131 | new_embeddings = optimize_fn( 3132 | np.concatenate([embeddings[k[0]], embeddings[k[1]]]), 3133 | joint_heads[k], 3134 | joint_tails[k], 3135 | joint_n_vertices[k], 3136 | joint_epochs_per_sample[k], 3137 | a, 3138 | b, 3139 | np.random.RandomState(random_state).randint(np.iinfo(np.int32).min + 1, np.iinfo(np.int32).max - 1, 3).astype(np.int64), 3140 | gamma, 3141 | dim, 3142 | move_other, 3143 | alpha, 3144 | joint_epochs_per_negative_sample[k], 3145 | joint_epoch_of_next_negative_sample[k], 3146 | joint_epoch_of_next_sample[k], 3147 | n, 3148 | ) 3149 | 3150 | embeddings[k[0]] += (new_embeddings[:len(embeddings[k[0]])] - embeddings[k[0]]) * weights[(k[0], k[1])] 3151 | embeddings[k[1]] += (new_embeddings[len(embeddings[k[0]]):] - embeddings[k[1]]) * weights[(k[1], k[0])] 3152 | 3153 | alpha = initial_alpha * (1.0 - (float(n) / float(n_epochs))) 3154 | 3155 | if verbose and n % int(n_epochs / 10) == 0: 3156 | print("\tcompleted ", n, " / ", n_epochs, "epochs") 3157 | 3158 | return embeddings 3159 | 3160 | 3161 | def find_ab_params(spread, min_dist): 3162 | 3163 | 3164 | def curve(x, a, b): 3165 | return 1.0 / (1.0 + a * x ** (2 * b)) 3166 | 3167 | xv = np.linspace(0, spread * 3, 300) 3168 | yv = np.zeros(xv.shape) 3169 | yv[xv < min_dist] = 1.0 3170 | yv[xv >= min_dist] = np.exp(-(xv[xv >= min_dist] - min_dist) / spread) 3171 | params, covar = curve_fit(curve, xv, yv) 3172 | return params[0], params[1] 3173 | 3174 | 3175 | def make_epochs_per_sample(weights, n_epochs): 3176 | 3177 | result = -1.0 * np.ones(weights.shape[0], dtype=np.float64) 3178 | n_samples = n_epochs * (weights / weights.max()) 3179 | result[n_samples > 0] = float(n_epochs) / n_samples[n_samples > 0] 3180 | 
return result 3181 | 3182 | 3183 | def elaborate_relation_dict(dict, list_elems=True): 3184 | new = {} 3185 | for k in dict.keys(): 3186 | if len(k) == 2 and type(k[0]) != tuple and type(k[1]) != tuple: 3187 | new[k] = dict[k] 3188 | elif len(k) == 2: 3189 | k_0 = k[0] 3190 | k_1 = k[1] 3191 | if type(k[0]) != tuple: 3192 | k_0 = (k_0,) 3193 | if type(k[1]) != tuple: 3194 | k_1 = (k_1,) 3195 | for i in range(len(k_0)): 3196 | for j in range(len(k_1)): 3197 | if list_elems: 3198 | new[(k_0[i], k_1[j])] = [dict[k][0][i], dict[k][1][j]] 3199 | else: 3200 | new[(k_0[i], k_1[j])] = dict[k] 3201 | else: 3202 | for i in range(len(k)): 3203 | for j in range(i+1, len(k)): 3204 | if list_elems: 3205 | new[(k[i], k[j])] = [dict[k][i], dict[k][j]] 3206 | else: 3207 | new[(k[i], k[j])] = dict[k] 3208 | return new 3209 | 3210 | def find_weights(strengths, len_Xs, joint_idxs): 3211 | 3212 | if type(strengths) != dict: 3213 | strengths = np.clip(strengths, 0, 1) 3214 | weights = {} 3215 | for i in range(len(len_Xs)): 3216 | for j in range(len(len_Xs)): 3217 | if i == j: 3218 | weights[(i,j)] = strengths[i] 3219 | 3220 | else: 3221 | weights[(i,j)] = 1 - strengths[i] 3222 | else: 3223 | weights = elaborate_relation_dict(strengths, list_elems=False) 3224 | for i in range(len(len_Xs)): 3225 | for j in range(len(len_Xs)): 3226 | if (i,j) not in weights.keys(): 3227 | weights[(i,j)] = 1 3228 | 3229 | 3230 | 3231 | weight_sums = [] 3232 | for i in range(len(len_Xs)): 3233 | weight_sum = 0 3234 | for j in range(len(len_Xs)): 3235 | weight_sum += weights[(i,j)] * len_Xs[j] 3236 | weight_sums += [weight_sum] 3237 | for i in range(len(len_Xs)): 3238 | for j in range(len(len_Xs)): 3239 | weights[(i,j)] *= sum(len_Xs) / weight_sums[i] 3240 | 3241 | 3242 | for k in weights.keys(): 3243 | if k[0] != k[1]: 3244 | if k in joint_idxs.keys(): 3245 | weights[k] *= len(joint_idxs[k][1])/len_Xs[k[1]] 3246 | elif k[::-1] in joint_idxs.keys(): 3247 | weights[k] *= len(joint_idxs[k[::-1]][0])/len_Xs[k[1]] 3248 | else: 3249 | weights[k] = 0 3250 | 3251 | return weights 3252 | 3253 | def MultiGraph(**kwds): 3254 | return MultiMAP(**kwds, graph_only=True) 3255 | 3256 | def MultiMAP(Xs, 3257 | joint={}, 3258 | joint_idxs={}, 3259 | 3260 | metrics=None, 3261 | metric_kwds=None, 3262 | joint_metrics={}, 3263 | 3264 | n_neighbors=None, 3265 | cardinality=None, 3266 | angular=False, 3267 | set_op_mix_ratio=1.0, 3268 | local_connectivity=1.0, 3269 | 3270 | n_components=2, 3271 | spread=1.0, 3272 | min_dist=None, 3273 | init='spectral', 3274 | n_epochs=None, 3275 | a=None, 3276 | b=None, 3277 | strengths=None, 3278 | 3279 | random_state=0, 3280 | 3281 | verbose=False, 3282 | 3283 | graph_only=False, 3284 | ): 3285 | ''' 3286 | Run MultiMAP on a collection of dimensionality reduction matrices. Returns a ``(parameters, 3287 | neighbor_graph, embedding)`` tuple, with the embedding optionally skipped if ``graph_only=True``. 3288 | 3289 | Input 3290 | ----- 3291 | Xs : list of ``np.array`` 3292 | The dimensionality reductions of the datasets to integrate, observations as rows. 3293 | 3294 | >>> Xs = [DR_A, DR_B, DR_C] 3295 | joint : dict of ``np.array`` 3296 | The joint dimensionality reductions generated for all pair combinations of the input 3297 | datasets. 
The keys are to be two-integer tuples, specifying the indices of the two 3298 | datasets in ``Xs`` 3299 | 3300 | >>> joint = {(0,1):DR_AB, (0,2):DR_AC, (1,2):DR_BC} 3301 | graph_only : ``bool``, optional (default: ``False``) 3302 | If ``True``, skip producing the embedding and only return the neighbour graph. 3303 | 3304 | All other arguments as described in ``MultiMAP.Integration()``. 3305 | ''' 3306 | 3307 | #turn off warnings if we're not verbose 3308 | if not verbose: 3309 | warnings.simplefilter('ignore') 3310 | 3311 | for i in range(len(Xs)): 3312 | if not scipy.sparse.issparse(Xs[i]): 3313 | Xs[i] = np.array(Xs[i]) 3314 | len_Xs = [len(i) for i in Xs] 3315 | 3316 | if not joint: 3317 | joint = {tuple(range(len(Xs))): Xs} 3318 | 3319 | joint = elaborate_relation_dict(joint, list_elems=True) 3320 | joint_idxs = elaborate_relation_dict(joint_idxs, list_elems=True) 3321 | joint_metrics = elaborate_relation_dict(joint_metrics, list_elems=False) 3322 | for k in joint.keys(): 3323 | joint[k] = [i.toarray() if scipy.sparse.issparse(i) else np.array(i) for i in joint[k]] 3324 | if k not in joint_idxs.keys(): 3325 | if k[::-1] in joint_idxs.keys(): 3326 | joint_idxs[k] = joint_idxs[k[::-1]] 3327 | else: 3328 | joint_idxs[k] = [np.arange(len_Xs[k[0]]), np.arange(len_Xs[k[1]])] 3329 | if k not in joint_metrics.keys(): 3330 | if k[::-1] in joint_metrics.keys(): 3331 | joint_metrics[k] = joint_metrics[k[::-1]] 3332 | else: 3333 | joint_metrics[k] = 'euclidean' 3334 | 3335 | if metrics is None: 3336 | metrics = ['euclidean' for i in range(len(Xs))] 3337 | if metric_kwds is None: 3338 | metric_kwds = [{} for i in range(len(Xs))] 3339 | 3340 | 3341 | 3342 | 3343 | 3344 | 3345 | 3346 | if n_neighbors is None: 3347 | n_neighbors = 15 * len(Xs) 3348 | if cardinality is None: 3349 | cardinality = np.log2(n_neighbors) 3350 | if min_dist is None: 3351 | min_dist = 0.5 * 15/n_neighbors 3352 | 3353 | if scipy.sparse.issparse(init): 3354 | init = init.toarray() 3355 | else: 3356 | init = np.array(init) 3357 | if n_epochs is None: 3358 | if np.sum(len_Xs) <= 10000: 3359 | n_epochs = 500 3360 | else: 3361 | n_epochs = 200 3362 | if a is None or b is None: 3363 | a, b = find_ab_params(spread, min_dist) 3364 | 3365 | if strengths is None: 3366 | strengths = np.ones(len(Xs))*0.5 3367 | weights = find_weights(strengths, len_Xs, joint_idxs) 3368 | 3369 | if verbose: 3370 | print("Constructing fuzzy simplicial sets ...") 3371 | graphs, joint_graphs, full_graph, weights = fuzzy_simplicial_set( 3372 | Xs, 3373 | joint, 3374 | joint_idxs, 3375 | weights, 3376 | n_neighbors, 3377 | cardinality, 3378 | metrics, 3379 | metric_kwds, 3380 | joint_metrics, 3381 | angular, 3382 | set_op_mix_ratio, 3383 | local_connectivity, 3384 | n_epochs, 3385 | random_state, 3386 | verbose=False 3387 | ) 3388 | 3389 | #set up parameter output 3390 | params = {'n_neighbors': n_neighbors, 3391 | 'metric': metrics[0], 3392 | 'multimap': {'cardinality': cardinality, 3393 | 'set_op_mix_ratio': set_op_mix_ratio, 3394 | 'local_connectivity': local_connectivity, 3395 | 'n_components': n_components, 3396 | 'spread': spread, 3397 | 'min_dist': min_dist, 3398 | 'init': init, 3399 | 'n_epochs': n_epochs, 3400 | 'a': a, 3401 | 'b': b, 3402 | 'strengths': strengths, 3403 | 'random_state': random_state}} 3404 | 3405 | #return parameter and graph tuple 3406 | #TODO: add the distances graph to this once it exists 3407 | if graph_only: 3408 | return (params, full_graph) 3409 | 3410 | if verbose: 3411 | print("Initializing embedding ...") 3412 | embeddings = 
init_layout( 3413 | init, 3414 | Xs, 3415 | graphs, 3416 | n_components, 3417 | metrics, 3418 | metric_kwds, 3419 | random_state 3420 | ) 3421 | 3422 | if verbose: 3423 | print("Optimizing embedding ...") 3424 | embeddings = optimize_layout( 3425 | embeddings, 3426 | graphs, 3427 | joint_graphs, 3428 | weights, 3429 | n_epochs, 3430 | a, 3431 | b, 3432 | random_state, 3433 | gamma=1.0, 3434 | initial_alpha=1.0, 3435 | negative_sample_rate=5.0, 3436 | parallel=False, 3437 | verbose=verbose 3438 | ) 3439 | #undo warning reset 3440 | if not verbose: 3441 | warnings.resetwarnings() 3442 | 3443 | #return an embedding/graph/parameters tuple 3444 | #TODO: add the distances graph to this once it exists 3445 | return (params, full_graph, np.concatenate(embeddings)) 3446 | 3447 | import sklearn 3448 | 3449 | def tfidf(X, n_components, binarize=True, random_state=0): 3450 | from sklearn.feature_extraction.text import TfidfTransformer 3451 | 3452 | sc_count = np.copy(X) 3453 | if binarize: 3454 | sc_count = np.where(sc_count < 1, sc_count, 1) 3455 | 3456 | tfidf = TfidfTransformer(norm='l2', sublinear_tf=True) 3457 | normed_count = tfidf.fit_transform(sc_count) 3458 | 3459 | lsi = sklearn.decomposition.TruncatedSVD(n_components=n_components, random_state=random_state) 3460 | lsi_r = lsi.fit_transform(normed_count) 3461 | 3462 | X_lsi = lsi_r[:,1:] 3463 | return X_lsi -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MultiMAP 2 | **MultiMAP** is a method for integrating single cell multi-omics. MultiMAP can also be used for batch correction. More detail is available in our [paper](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02565-y). 3 | 4 |
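At a glance, integrating two AnnData objects looks roughly like the sketch below. This is a minimal, illustrative example only — the toy data and the argument names (`adatas`, `use_reps`) are placeholders taken from the package's function signatures, so please rely on the Usage and Documentation section further down, and the linked tutorial, for the authoritative interface.

```python
import anndata
import numpy as np
import MultiMAP

# toy stand-ins for an RNA and an ATAC dataset, each carrying its own
# dimensionality reduction in .obsm (PCA-like and LSI-like coordinates here)
rna = anndata.AnnData(np.random.rand(300, 200))
rna.obsm['X_pca'] = np.random.rand(300, 20)
atac = anndata.AnnData(np.random.rand(250, 200))
atac.obsm['X_lsi'] = np.random.rand(250, 20)

# integrate; the result is a single object holding all cells with a joint embedding
adata = MultiMAP.Integration(adatas=[rna, atac], use_reps=['X_pca', 'X_lsi'])
```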
<img src="docs/MultiMAP_schematic.png">
5 | 6 | 7 | ## Installation 8 | 9 | ```bash 10 | pip3 install git+https://github.com/Teichlab/MultiMAP.git 11 | ``` 12 | 13 | ## Usage and Documentation 14 | 15 | MultiMAP offers two functions accepting AnnData objects on input: 16 | - `MultiMAP.Integration()` expects a list of one AnnData per dataset, with the desired dimensionality reduction precomputed and stored in `.obsm`. This allows for refining the initial dimensionality reduction, e.g. if wishing to use `TFIDF_LSI` for ATAC data and PCA for RNA data. 17 | - `MultiMAP.Batch()` expects a single AnnData object with the dataset information stored in an `.obs` column. This allows for convenient integration with minimal preparation if all datasets can be treated with the same dimensionality reduction. 18 | 19 | There's also an AnnData-independent `MultiMAP.matrix.MultiMAP()` function which operates directly on dimensionality reduction matrices. This requires precomputing all pairwise dimensionality reductions prior to calling MultiMAP. 20 | 21 | A tutorial covering both RNA-ATAC integration and RNA-Seq batch correction use can be found [here](https://nbviewer.jupyter.org/github/Teichlab/MultiMAP/blob/master/examples/tutorial.ipynb). 22 | 23 | Documentation of the function parameters can be found on [ReadTheDocs](https://multimap.readthedocs.io/en/latest/). 24 | 25 | ## Citation 26 | 27 | If your work uses MultiMAP, please cite the [paper](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02565-y): 28 | 29 | @article{jain2021multimap, 30 | title={MultiMAP: dimensionality reduction and integration of multimodal data}, 31 | author={Jain, Mika Sarkin and Polanski, Krzysztof and Conde, Cecilia Dominguez and Chen, Xi and Park, Jongeun and Mamanova, Lira and Knights, Andrew and Botting, Rachel A and Stephenson, Emily and Haniffa, Muzlifah and others}, 32 | journal={Genome biology}, 33 | volume={22}, 34 | number={1}, 35 | pages={1--26}, 36 | year={2021}, 37 | publisher={BioMed Central} 38 | } 39 | 40 | ## Contact 41 | 42 | Mika Sarkin Jain - mikasarkinjain@gmail.com \ 43 | Mirjana Efremova - m.efremova@qmul.ac.uk \ 44 | Sarah Teichmann - st9@sanger.ac.uk 45 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/MultiMAP_schematic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Teichlab/MultiMAP/681e608c45cdb6b139dfb6700e40c7520bc6096d/docs/MultiMAP_schematic.png -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('..')) 16 | autodoc_mock_imports = ['anndata','scanpy','numpy','scipy','numba','scipy.optimize', 17 | 'sklearn.neighbors','sklearn.metrics','warnings','scipy.sparse', 18 | 'locale','sklearn.utils','annoy','faiss','scipy.sparse.csgraph', 19 | 'sklearn.metrics','sklearn.manifold','sklearn'] 20 | 21 | # -- Project information ----------------------------------------------------- 22 | 23 | project = 'MultiMAP' 24 | copyright = '2020-2021, Mika Sarkin Jain' 25 | author = 'Mika Sarkin Jain' 26 | 27 | # The full version, including alpha/beta/rc tags 28 | release = '0.0.1' 29 | 30 | 31 | # -- General configuration --------------------------------------------------- 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 35 | # ones. 36 | extensions = ['sphinx.ext.autodoc'] 37 | 38 | # Add any paths that contain templates here, relative to this directory. 39 | templates_path = ['_templates'] 40 | 41 | # List of patterns, relative to source directory, that match files and 42 | # directories to ignore when looking for source files. 43 | # This pattern also affects html_static_path and html_extra_path. 44 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 45 | 46 | # -- Options for HTML output ------------------------------------------------- 47 | 48 | # The theme to use for HTML and HTML Help pages. See the documentation for 49 | # a list of builtin themes. 50 | # 51 | html_theme = 'sphinx_rtd_theme' 52 | 53 | # Add any paths that contain custom static files (such as style sheets) here, 54 | # relative to this directory. They are copied after the builtin static files, 55 | # so a file named "default.css" will overwrite the builtin "default.css". 56 | html_static_path = ['_static'] -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. MultiMAP documentation master file, created by 2 | sphinx-quickstart on Wed Dec 2 10:49:54 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | MultiMAP 7 | ======== 8 | 9 | .. 
automodule:: MultiMAP 10 | :members: Integration, Batch, TFIDF_LSI 11 | 12 | .. automodule:: MultiMAP.matrix 13 | :members: MultiMAP 14 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx-rtd-theme 2 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='MultiMAP', 5 | version='0.0.1', 6 | description='MultiMAP', 7 | url='https://github.com/Teichlab/MultiMAP', 8 | packages=find_packages(exclude=['docs', 'examples']), 9 | install_requires=['numpy','scipy','numba','scikit-learn'], 10 | author='Mika Sarkin Jain', 11 | author_email='mikasarkinjain@gmail.com', 12 | license='MIT' 13 | ) 14 | --------------------------------------------------------------------------------
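As a closing reference for the flattened `MultiMAP/matrix.py` listing above: the matrix-level `MultiMAP()` entry point takes a list of per-dataset dimensionality reductions plus a dictionary of pairwise joint reductions keyed by dataset-index tuples, and returns a `(parameters, neighbour_graph, embedding)` tuple, as described in its docstring. The sketch below exercises that interface on made-up data; the array shapes and the random "joint" reduction are placeholders for illustration only, not a tested workflow.

```python
import numpy as np
from MultiMAP.matrix import MultiMAP, MultiGraph

rng = np.random.default_rng(0)

# per-dataset dimensionality reductions (toy data), observations as rows
DR_A = rng.normal(size=(300, 20))
DR_B = rng.normal(size=(200, 15))

# a joint reduction computed over both datasets together, split back per dataset;
# in real use this would come from e.g. a PCA of a merged, matched feature space
joint_AB = rng.normal(size=(500, 10))
joint = {(0, 1): [joint_AB[:300], joint_AB[300:]]}

# (parameters, neighbour graph, embedding) per the docstring; the embedding has
# one row per cell across both datasets and n_components=2 columns by default
params, graph, embedding = MultiMAP(Xs=[DR_A, DR_B], joint=joint)

# MultiGraph() is the graph_only=True shortcut, returning just (parameters, graph)
params, graph = MultiGraph(Xs=[DR_A, DR_B], joint=joint)
```

In this layout the per-dataset matrices in `Xs` drive the within-dataset neighbourhoods, while the paired matrices in `joint` supply the cross-dataset neighbourhoods that the fuzzy simplicial sets are stitched from — which is why every pair of datasets that should be connected needs an entry in `joint`.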