├── .gitattributes
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.rst
├── harmony
│   ├── __init__.py
│   ├── harmony.py
│   └── utils.py
├── method
│   └── Method.md
├── pyproject.toml
├── setup.py
└── test
    ├── gen_cell_lines.R
    ├── gen_mantonbm.R
    ├── gen_pbmc.R
    ├── test.py
    └── test_gpu.py

/.gitattributes:
--------------------------------------------------------------------------------
1 | harmony/_version.py export-subst
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g.
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # Sublime workspace 132 | *.sublime-workspace 133 | .DS_Store 134 | 135 | #Custom folders 136 | results/ 137 | figures/ 138 | 139 | *.sublime-workspace 140 | *.sublime-project 141 | 142 | # Jupyter notebooks 143 | *.ipynb 144 | 145 | .idea/ 146 | 147 | *.h5ad 148 | 149 | version.py 150 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019-2020 The Broad Institute, Inc. and The General Hospital Corporation. 4 | Copyright (c) 2021-present Genentech, Inc. for code commits from 2021 onward. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | exclude .* 2 | exclude test/* 3 | exclude method/* 4 | exclude pyproject.toml 5 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Harmony-Pytorch 2 | --------------- 3 | 4 | |PyPI| |Conda| |Python| |License| 5 | 6 | .. |PyPI| image:: https://img.shields.io/pypi/v/harmony-pytorch.svg 7 | :target: https://pypi.org/project/harmony-pytorch 8 | 9 | .. 
|Conda| image:: https://img.shields.io/conda/v/bioconda/harmony-pytorch
10 |    :target: https://anaconda.org/bioconda/harmony-pytorch
11 |
12 | .. |Python| image:: https://img.shields.io/pypi/pyversions/harmony-pytorch.svg
13 |    :target: https://pypi.org/project/harmony-pytorch
14 |
15 | .. |License| image:: https://img.shields.io/github/license/lilab-bcb/harmony-pytorch
16 |    :target: https://github.com/lilab-bcb/harmony-pytorch/blob/master/LICENSE
17 |
18 | This is a Pytorch implementation of the Harmony algorithm for single-cell sequencing data integration. Please see `Ilya Korsunsky et al., 2019 `_ for details.
19 |
20 | Installation
21 | ^^^^^^^^^^^^^
22 |
23 | This package is published on PyPI::
24 |
25 |     pip install harmony-pytorch
26 |
27 |
28 | Usage
29 | ^^^^^^^^
30 |
31 | General Case
32 | ##############
33 |
34 | Given an embedding ``X`` as an N-by-d NumPy array (N for the number of cells, d for the number of embedding components) and cell attributes as a DataFrame ``df_metadata``, run Harmony for data integration as follows::
35 |
36 |     from harmony import harmonize
37 |     Z = harmonize(X, df_metadata, batch_key = 'Channel')
38 |
39 |
40 | where ``Channel`` is the attribute in ``df_metadata`` that identifies batches.
41 |
42 | Alternatively, if multiple attributes together define the batches, write::
43 |
44 |     Z = harmonize(X, df_metadata, batch_key = ['Lab', 'Date'])
45 |
46 | Input as MultimodalData Object
47 | ###############################
48 |
49 | Harmony-pytorch works readily with the count matrix data structure from the `PegasusIO `_ package. Let ``data`` be a MultimodalData object in Python::
50 |
51 |     from harmony import harmonize
52 |     Z = harmonize(data.obsm['X_pca'], data.obs, batch_key = 'Channel')
53 |     data.obsm['X_pca_harmony'] = Z
54 |
55 | This calculates the harmonized PCA matrix for the default UnimodalData of ``data``.
56 |
57 | Given a UnimodalData object ``unidata``, you can also use the code above to run the Harmony algorithm: simply substitute ``unidata`` for ``data``.
58 |
59 | Input as AnnData Object
60 | ##########################
61 |
62 | Harmony-pytorch also works with the annotated count matrix data structure from the `anndata `_ package. Let ``adata`` be an AnnData object in Python::
63 |
64 |     from harmony import harmonize
65 |     Z = harmonize(adata.obsm['X_pca'], adata.obs, batch_key = '<batch_key>')
66 |     adata.obsm['X_harmony'] = Z
67 |
68 | where ``<batch_key>`` should be replaced by the actual batch key attribute name in your data.
69 |
70 | For details about the ``AnnData`` data structure, please refer to its `documentation `_.
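
GPU Mode
##########

``harmonize`` also exposes a ``use_gpu`` option (see ``harmony/harmony.py`` below). A minimal sketch, assuming a CUDA- or MPS-capable PyTorch build and reusing ``adata`` from above; when no GPU is available, ``harmonize`` prints a message and falls back to CPU::

    from harmony import harmonize

    # Runs on CUDA or Apple Metal (MPS) if available; otherwise uses CPU.
    Z = harmonize(adata.obsm['X_pca'], adata.obs, batch_key = '<batch_key>', use_gpu = True)
    adata.obsm['X_harmony'] = Z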
71 |
--------------------------------------------------------------------------------
/harmony/__init__.py:
--------------------------------------------------------------------------------
1 | from .harmony import harmonize
2 |
3 | try:
4 |     from importlib.metadata import version, PackageNotFoundError
5 | except ImportError:  # < Python 3.8: Use backport module
6 |     from importlib_metadata import version, PackageNotFoundError
7 |
8 | try:
9 |     __version__ = version("harmony-pytorch")
10 |     del version
11 | except PackageNotFoundError:
12 |     pass
13 |
--------------------------------------------------------------------------------
/harmony/harmony.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | import numpy as np
4 | import pandas as pd
5 |
6 |
7 | from sklearn.cluster import KMeans
8 | from torch.nn.functional import normalize
9 | from typing import Union, List, Optional
10 | from .utils import one_hot_tensor, get_batch_codes
11 |
12 |
13 | def harmonize(
14 |     X: np.ndarray,
15 |     df_obs: pd.DataFrame,
16 |     batch_key: Union[str, List[str]],
17 |     n_clusters: Optional[int] = None,
18 |     max_iter_harmony: int = 10,
19 |     max_iter_clustering: int = 200,
20 |     tol_harmony: float = 1e-4,
21 |     tol_clustering: float = 1e-5,
22 |     ridge_lambda: float = 1.0,
23 |     sigma: float = 0.1,
24 |     block_proportion: float = 0.05,
25 |     init_centroids_method: str = "default",
26 |     theta: float = 2.0,
27 |     tau: int = 0,
28 |     random_state: int = 0,
29 |     use_gpu: bool = False,
30 |     n_jobs: int = -1,
31 |     verbose: bool = True,
32 | ) -> np.ndarray:
33 |     """
34 |     Integrate data using the Harmony algorithm.
35 |
36 |     Parameters
37 |     ----------
38 |
39 |     X: ``numpy.ndarray``
40 |         The input embedding with rows for cells (N) and columns for embedding coordinates (d).
41 |
42 |     df_obs: ``pandas.DataFrame``
43 |         The cell barcode attributes as a Pandas DataFrame.
44 |
45 |     batch_key: ``str`` or ``List[str]``
46 |         Cell attribute(s) from ``df_obs`` used to identify batches.
47 |
48 |     n_clusters: ``int``, optional, default: ``None``
49 |         Number of clusters used in the Harmony algorithm. If ``None``, use the minimum of 100 and N / 30.
50 |
51 |     max_iter_harmony: ``int``, optional, default: ``10``
52 |         Maximum number of Harmony iterations to run if not converged.
53 |
54 |     max_iter_clustering: ``int``, optional, default: ``200``
55 |         Within each Harmony iteration, maximum number of clustering-step iterations to run if not converged.
56 |
57 |     tol_harmony: ``float``, optional, default: ``1e-4``
58 |         Tolerance for determining convergence of Harmony, measured on objective function values.
59 |
60 |     tol_clustering: ``float``, optional, default: ``1e-5``
61 |         Tolerance for determining convergence of the clustering step within each Harmony iteration, measured on objective function values.
62 |
63 |     ridge_lambda: ``float``, optional, default: ``1.0``
64 |         Hyperparameter of the ridge regression in the correction step.
65 |
66 |     sigma: ``float``, optional, default: ``0.1``
67 |         Weight of the entropy term in the objective function.
68 |
69 |     block_proportion: ``float``, optional, default: ``0.05``
70 |         Proportion of cells updated in one block of the clustering step.
71 |
72 |     init_centroids_method: ``str``, optional, default: ``default``
73 |         K-Means method used for initializing centroids. Can be either 'default' or 'harmony-paper'.
74 |         If 'default', use the default settings of the ``sklearn.cluster.KMeans`` function.
75 |         If 'harmony-paper', use the same method as described in the Harmony paper, i.e. ``sklearn.cluster.KMeans(..., init='random', n_init=10, max_iter=25, ...)``.
76 |
77 |     theta: ``float``, optional, default: ``2.0``
78 |         Weight of the diversity penalty term in the objective function.
79 |
80 |     tau: ``int``, optional, default: ``0``
81 |         Discounting factor on ``theta``. By default, there is no discounting.
82 |
83 |     random_state: ``int``, optional, default: ``0``
84 |         Random seed for reproducing results.
85 |
86 |     use_gpu: ``bool``, optional, default: ``False``
87 |         If ``True``, use GPU if available. Otherwise, use CPU only.
88 |
89 |     n_jobs: ``int``, optional, default: ``-1``
90 |         How many CPU threads to use. By default, use all physical cores. If ``use_gpu`` is ``True``, this option only affects the KMeans step.
91 |
92 |     verbose: ``bool``, optional, default: ``True``
93 |         If ``True``, print verbose output.
94 |
95 |     Returns
96 |     -------
97 |     ``numpy.ndarray``
98 |         The integrated embedding by Harmony, of the same shape as the input embedding.
99 |
100 |     Examples
101 |     --------
102 |     >>> adata = anndata.read_h5ad("filename.h5ad")
103 |     >>> X_harmony = harmonize(adata.obsm['X_pca'], adata.obs, 'Channel')
104 |
105 |     >>> adata = anndata.read_h5ad("filename.h5ad")
106 |     >>> X_harmony = harmonize(adata.obsm['X_pca'], adata.obs, ['Channel', 'Lab'])
107 |     """
108 |
109 |     assert isinstance(X, np.ndarray)
110 |
111 |     if n_jobs < 0:
112 |         import psutil
113 |
114 |         n_jobs = psutil.cpu_count(logical=False)  # get physical cores
115 |         if n_jobs is None:
116 |             n_jobs = psutil.cpu_count(
117 |                 logical=True
118 |             )  # if undetermined, use logical cores instead
119 |     torch.set_num_threads(n_jobs)
120 |
121 |     device_type = "cpu"
122 |     if use_gpu:
123 |         if torch.cuda.is_available():
124 |             device_type = "cuda"
125 |             if verbose:
126 |                 print("Use GPU mode.")
127 |         elif torch.backends.mps.is_available():
128 |             device_type = "mps"
129 |             if verbose:
130 |                 print("Use Metal (MPS) mode.")
131 |         elif verbose:
132 |             print(
133 |                 "Neither CUDA nor MPS is available on your machine. Use CPU mode instead."
134 |             )
135 |
136 |     (stride_0, stride_1) = X.strides
137 |     if stride_0 < 0 or stride_1 < 0:
138 |         Z = torch.tensor(X.copy(), dtype=torch.float, device=device_type)
139 |     else:
140 |         Z = torch.tensor(X, dtype=torch.float, device=device_type)
141 |     Z_norm = normalize(Z, p=2, dim=1)
142 |     n_cells = Z.shape[0]
143 |
144 |     batch_codes = get_batch_codes(df_obs, batch_key)
145 |     n_batches = batch_codes.cat.categories.size
146 |     N_b = torch.tensor(
147 |         batch_codes.value_counts(sort=False).values,
148 |         dtype=torch.float,
149 |         device=device_type,
150 |     )
151 |     Pr_b = N_b.view(-1, 1) / n_cells
152 |
153 |     Phi = one_hot_tensor(batch_codes, device_type)
154 |
155 |     if n_clusters is None:
156 |         n_clusters = int(min(100, n_cells / 30))
157 |
158 |     theta = torch.tensor([theta], dtype=torch.float, device=device_type).expand(
159 |         n_batches
160 |     )
161 |
162 |     if tau > 0:
163 |         theta = theta * (1 - torch.exp(-((N_b / (n_clusters * tau)) ** 2)))  # discounting from the Harmony paper: 1 - exp(-(N_b / (K * tau))^2)
164 |
165 |     theta = theta.view(1, -1)
166 |
167 |     assert 0 < block_proportion <= 1, "block_proportion must be a fraction in range (0, 1]!"
168 |     block_size = int(n_cells * block_proportion)
169 |
170 |     assert init_centroids_method in ["default", "harmony-paper"], "init_centroids_method must be chosen from ['default', 'harmony-paper']!"
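    # The remainder of harmonize() alternates between two phases:
    #   1. clustering(): soft k-means on the L2-normalized embedding, with an
    #      entropy regularizer (sigma) and a batch-diversity penalty (theta),
    #      updating R one random block of cells at a time;
    #   2. correction(): per-cluster ridge regression that subtracts estimated
    #      batch effects from Z, using the closed-form inverse derived in
    #      method/Method.md.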
171 |
172 |     np.random.seed(random_state)
173 |
174 |     # Initialize centroids
175 |     R, E, O, objectives_harmony = initialize_centroids(
176 |         Z_norm,
177 |         n_clusters,
178 |         sigma,
179 |         Pr_b,
180 |         Phi,
181 |         theta,
182 |         init_centroids_method,
183 |         None,
184 |         device_type,
185 |         n_jobs,
186 |     )
187 |
188 |     if verbose:
189 |         print("\tInitialization is completed.")
190 |
191 |     rng = np.random.default_rng(random_state)  # seed the block-permutation RNG so results are reproducible
192 |     for i in range(max_iter_harmony):
193 |         clustering(
194 |             Z_norm,
195 |             Pr_b,
196 |             Phi,
197 |             R,
198 |             E,
199 |             O,
200 |             theta,
201 |             tol_clustering,
202 |             objectives_harmony,
203 |             max_iter_clustering,
204 |             sigma,
205 |             block_size,
206 |             rng,
207 |         )
208 |         Z_hat = correction(Z, R, Phi, O, ridge_lambda, device_type)
209 |         Z_norm = normalize(Z_hat, p=2, dim=1)
210 |
211 |         if verbose:
212 |             print(
213 |                 "\tCompleted {cur_iter} / {total_iter} iteration(s).".format(
214 |                     cur_iter=i + 1,
215 |                     total_iter=max_iter_harmony,
216 |                 )
217 |             )
218 |
219 |         if is_convergent_harmony(objectives_harmony, tol=tol_harmony):
220 |             if verbose:
221 |                 print(f"Reached convergence after {i + 1} iteration(s).")
222 |             break
223 |
224 |     return Z_hat.numpy() if device_type == "cpu" else Z_hat.cpu().numpy()
225 |
226 |
227 | def initialize_centroids(
228 |     Z_norm,
229 |     n_clusters,
230 |     sigma,
231 |     Pr_b,
232 |     Phi,
233 |     theta,
234 |     init_centroids_method,
235 |     random_state,
236 |     device_type,
237 |     n_jobs,
238 | ):
239 |     kmeans_params = {
240 |         "n_clusters": n_clusters,
241 |         "random_state": random_state,
242 |     }
243 |     if init_centroids_method == "harmony-paper":
244 |         kmeans_params["init"] = "random"
245 |         kmeans_params["n_init"] = 10
246 |         kmeans_params["max_iter"] = 25
247 |
248 |     kmeans = KMeans(**kmeans_params)
249 |
250 |     from threadpoolctl import threadpool_limits
251 |
252 |     with threadpool_limits(limits=n_jobs):
253 |         if device_type == "cpu":
254 |             kmeans.fit(Z_norm)
255 |         else:
256 |             kmeans.fit(Z_norm.cpu())
257 |
258 |     Y = torch.tensor(kmeans.cluster_centers_, dtype=torch.float, device=device_type)
259 |     Y_norm = normalize(Y, p=2, dim=1)
260 |
261 |     # Initialize R
262 |     R = torch.exp(-2 / sigma * (1 - torch.matmul(Z_norm, Y_norm.t())))
263 |     R = normalize(R, p=1, dim=1)
264 |
265 |     E = torch.matmul(Pr_b, torch.sum(R, dim=0, keepdim=True))
266 |     O = torch.matmul(Phi.t(), R)
267 |
268 |     objectives_harmony = []
269 |     compute_objective(
270 |         Y_norm, Z_norm, R, theta, sigma, O, E, objectives_harmony
271 |     )
272 |
273 |     return R, E, O, objectives_harmony
274 |
275 |
276 | def clustering(
277 |     Z_norm,
278 |     Pr_b,
279 |     Phi,
280 |     R,
281 |     E,
282 |     O,
283 |     theta,
284 |     tol,
285 |     objectives_harmony,
286 |     max_iter,
287 |     sigma,
288 |     block_size,
289 |     rng,
290 | ):
291 |     n_cells = Z_norm.shape[0]
292 |
293 |     objectives_clustering = []
294 |
295 |     for _ in range(max_iter):
296 |         # Compute Cluster Centroids
297 |         Y = torch.matmul(R.t(), Z_norm)
298 |         Y_norm = normalize(Y, p=2, dim=1)
299 |
300 |         idx_list = rng.permutation(n_cells)
301 |         pos = 0
302 |         while pos < len(idx_list):
303 |             idx_in = idx_list[pos : (pos + block_size)]
304 |             R_in = R[idx_in,]
305 |             Phi_in = Phi[idx_in,]
306 |
307 |             # Compute O and E on left out data.
308 | O -= torch.matmul(Phi_in.t(), R_in) 309 | E -= torch.matmul(Pr_b, torch.sum(R_in, dim=0, keepdim=True)) 310 | 311 | # Update and Normalize R 312 | R_in = torch.exp( 313 | -2 / sigma * (1 - torch.matmul(Z_norm[idx_in,], Y_norm.t())) 314 | ) 315 | omega = torch.matmul(Phi_in, torch.pow(torch.div(E + 1, O + 1), theta.t())) 316 | R_in = R_in * omega 317 | R_in = normalize(R_in, p=1, dim=1) 318 | R[idx_in,] = R_in 319 | 320 | # Compute O and E with full data. 321 | O += torch.matmul(Phi_in.t(), R_in) 322 | E += torch.matmul(Pr_b, torch.sum(R_in, dim=0, keepdim=True)) 323 | 324 | pos += block_size 325 | 326 | compute_objective( 327 | Y_norm, Z_norm, R, theta, sigma, O, E, objectives_clustering 328 | ) 329 | 330 | if is_convergent_clustering(objectives_clustering, tol): 331 | break 332 | 333 | objectives_harmony.append(objectives_clustering[-1]) 334 | 335 | 336 | def correction(X, R, Phi, O, ridge_lambda, device_type): 337 | n_cells = X.shape[0] 338 | n_clusters = R.shape[1] 339 | n_batches = Phi.shape[1] 340 | Phi_1 = torch.cat((torch.ones(n_cells, 1, device=device_type), Phi), dim=1) 341 | 342 | Z = X.clone() 343 | P = torch.eye(n_batches + 1, n_batches + 1, device=device_type) 344 | for k in range(n_clusters): 345 | O_k = O[:, k] 346 | N_k = torch.sum(O_k) 347 | 348 | factor = 1 / (O_k + ridge_lambda) 349 | c = N_k + torch.sum(-factor * O_k**2) 350 | c_inv = 1 / c 351 | 352 | P[0, 1:] = -factor * O_k 353 | 354 | P_t_B_inv = torch.diag( 355 | torch.cat( 356 | (torch.tensor([[c_inv]], device=device_type), factor.view(1, -1)), dim=1 357 | ).squeeze() 358 | ) 359 | P_t_B_inv[1:, 0] = P[0, 1:] * c_inv 360 | inv_mat = torch.matmul(P_t_B_inv, P) 361 | 362 | Phi_t_diag_R = Phi_1.t() * R[:, k].view(1, -1) 363 | W = torch.matmul(inv_mat, torch.matmul(Phi_t_diag_R, X)) 364 | W[0, :] = 0 365 | 366 | Z -= torch.matmul(Phi_t_diag_R.t(), W) 367 | 368 | return Z 369 | 370 | 371 | def compute_objective(Y_norm, Z_norm, R, theta, sigma, O, E, objective_arr): 372 | kmeans_error = torch.sum(R * 2 * (1 - torch.matmul(Z_norm, Y_norm.t()))) 373 | entropy_term = sigma * torch.sum( 374 | -torch.distributions.Categorical(probs=R).entropy() 375 | ) 376 | diversity_penalty = sigma * torch.sum( 377 | torch.matmul(theta, O * torch.log(torch.div(O + 1, E + 1))) 378 | ) 379 | objective = kmeans_error + entropy_term + diversity_penalty 380 | 381 | objective_arr.append(objective) 382 | 383 | 384 | def is_convergent_harmony(objectives_harmony, tol): 385 | if len(objectives_harmony) < 2: 386 | return False 387 | 388 | obj_old = objectives_harmony[-2] 389 | obj_new = objectives_harmony[-1] 390 | 391 | return (obj_old - obj_new) < tol * torch.abs(obj_old) 392 | 393 | 394 | def is_convergent_clustering(objectives_clustering, tol, window_size=3): 395 | if len(objectives_clustering) < window_size + 1: 396 | return False 397 | 398 | obj_old = 0 399 | obj_new = 0 400 | for i in range(window_size): 401 | obj_old += objectives_clustering[-2 - i] 402 | obj_new += objectives_clustering[-1 - i] 403 | 404 | return (obj_old - obj_new) < tol * torch.abs(obj_old) 405 | -------------------------------------------------------------------------------- /harmony/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_batch_codes(batch_mat, batch_key): 5 | if type(batch_key) is str: 6 | batch_vec = batch_mat[batch_key] 7 | 8 | elif len(batch_key) == 1: 9 | batch_key = batch_key[0] 10 | 11 | batch_vec = batch_mat[batch_key] 12 | 13 | else: 14 | df = 
batch_mat[batch_key].astype("str")
15 |         batch_vec = df.apply(lambda row: ",".join(row), axis=1)
16 |
17 |     return batch_vec.astype("category")
18 |
19 |
20 | def one_hot_tensor(X, device_type):
21 |     ids = torch.as_tensor(
22 |         X.cat.codes.values.copy(), dtype=torch.long, device=device_type
23 |     ).view(-1, 1)
24 |     n_row = X.size
25 |     n_col = X.cat.categories.size
26 |     Phi = torch.zeros(n_row, n_col, dtype=torch.float, device=device_type)
27 |     Phi.scatter_(dim=1, index=ids, value=1.0)
28 |
29 |     return Phi
30 |
--------------------------------------------------------------------------------
/method/Method.md:
--------------------------------------------------------------------------------
1 | # Harmony Algorithm
2 |
3 | Since most datasets store cell barcodes in rows, we adjusted the algorithm accordingly, so it differs slightly from the paper.
4 |
5 | In this document, we re-summarize the Harmony algorithm.
6 |
7 | ## Notations
8 |
9 | Given an embedding of $N$ cell barcodes in $d$ dimensions, coming from $B$ batches, Harmony first clusters them into $K$ clusters, then integrates the data.
10 |
11 | * $Z \in \mathbb{R}^{N \times d}$: Input embedding to be corrected by Harmony.
12 | * $\hat{Z} \in \mathbb{R}^{N \times d}$: Output embedding which is integrated.
13 | * $R \in [0, 1]^{N \times K}$: Soft cluster assignment matrix of cells (rows) to clusters (columns).
14 | * $\phi \in \{0, 1\}^{N \times B}$: One-hot assignment matrix of cells (rows) to batches (columns).
15 | * $Pr \in [0, 1]^B$: Frequency of batches.
16 | * $O \in \mathbb{R}_{\geq 0}^{B \times K}$: The observed co-occurrence matrix of cells in batches (rows) and clusters (columns).
17 | * $E \in \mathbb{R}_{\geq 0}^{B \times K}$: The expected co-occurrence matrix of cells in batches (rows) and clusters (columns), under the assumption of independence between cluster and batch assignment.
18 | * $Y \in \mathbb{R}^{K \times d}$: L2-normalized cluster centroid locations.
19 |
20 | ## Objective Function
21 |
22 | * K-Means error:
23 |
24 | ```math
25 | e_1 = \sum_{i, k} R_{ik}\ ||Z_i - Y_k||^2 \qquad \text{for} \quad \forall 1 \leq i \leq N \text{ and }\forall 1 \leq k \leq K
26 | ```
27 |
28 | Moreover, if both $Z_i$ and $Y_k$ are L2-normalized, their squared Euclidean distance reduces to a cosine distance:
29 |
30 | ```math
31 | \begin{align*}
32 | e_1 &= \sum_{i, k} R_{ik}(|Z_i|^2 + |Y_k|^2 - 2Z_i\cdot Y_k^T ) \\
33 | &= \sum_{i, k} R_{ik}(2 - 2Z_i \cdot Y_k^T) \qquad \qquad ( Z_i \text{ and } Y_k \text{ are L2-normalized}) \\
34 | &= \sum_{i, k} 2R_{ik}(1 - Z_{i} \cdot Y_{k}^T) \\
35 | &= \mathrm{sum}\big(2R * (1 - Z Y^T)\big)
36 | \end{align*}
37 | ```
38 |
39 | where $*$ is the element-wise product and $\mathrm{sum}(\cdot)$ adds up all entries of a matrix.
40 |
41 | * Cross-entropy Error:
42 |
43 | $$
44 | e_2 = \sigma \sum_{i, k} R_{ik}\log{R_{ik}} = \sigma\, \mathrm{sum}(R * \log{R})
45 | $$
46 |
47 | * Diversity Penalty:
48 |
49 | $$
50 | \begin{align*}
51 | e_3 &= \sigma \sum_{i, k} R_{ik} \sum_{b}\theta_b \phi_{ib}\log{\Big( \frac{O_{bk} + 1}{E_{bk} + 1} \Big)} \\
52 | &= \sigma \sum_{k} \theta \Big[ (\phi^T R) * \log{\Big( \frac{O + 1}{E + 1} \Big)} \Big] \\
53 | &= \sigma \sum_{k} \theta \Big[ O * \log{\Big( \frac{O + 1}{E + 1} \Big)} \Big]
54 | \end{align*}
55 | $$
56 |
57 | where $\theta = [\theta_1, ..., \theta_B]$ of shape $1 \times B$ are the discounting hyperparameters.
58 |
59 | Therefore, the objective function is
60 |
61 | $$
62 | E = e_1 + e_2 + e_3.
63 | $$
64 |
65 | ## Algorithm Structure
66 |
67 | ```python
68 | def harmonize(Z, phi):
69 |     Z_hat = Z
70 |     R, E, O = initialize_centroids(Z_hat)
71 |     while not converged:
72 |         R = clustering(Z_hat, phi)
73 |         Z_hat = correction(Z, R, phi)
74 |
75 |     return Z_hat
76 | ```
77 |
78 | ## Centroids Initialization
79 |
80 | 1. L2-normalize $\hat{Z}$ on rows.
81 |
82 | 2. $\hat{Y} = \text{kmeans}(\hat{Z}, K)$. Then L2-normalize $\hat{Y}$ on rows.
83 |
84 | 3. Initialize $R$:
85 |
86 | $$
87 | R = \exp{\Big(-\frac{2(1 - \hat{Z} \hat{Y}^T)}{\sigma}\Big)}
88 | $$
89 |
90 | Then L1-normalize $R$ on rows, so that each row sums up to 1.
91 |
92 | 4. Initialize $E$ and $O$:
93 |
94 | ```math
95 | \begin{align*}
96 | (E)_{bk} = Pr_b \cdot \sum_{i = 1}^N R_{ik} \qquad &\Rightarrow \qquad E = Pr^T \cdot [R_{\cdot 1}, \dots, R_{\cdot K}];\\
97 | (O)_{bk} = \sum_{i = 1}^N \phi_{ib}R_{ik} \qquad &\Rightarrow \qquad O = \phi^T R.
98 | \end{align*}
99 | ```
100 |
101 | 5. Compute objective value with $\hat{Y}$, $\hat{Z}$, $R$, $O$, and $E$.
102 |
103 | ## Clustering
104 |
105 | ### Block-wise Update
106 |
107 | 1. Compute $O$ and $E$ on left-out data:
108 |
109 | ```math
110 | E = E - Pr^T \cdot [R_{in, 1}, \dots, R_{in, K}], \qquad O = O - \phi_{in}^T R_{in}.
111 | ```
112 |
113 | where $R_{in, 1}, ..., R_{in, K}$ are the summations of $R_{ik}$ over cells in the current block regarding each cluster $k$.
114 |
115 | 2. Update and normalize $R$:
116 |
117 | ```math
118 | \begin{align*}
119 | R_{in} &= \exp{\Big( -\frac{2(1 - \hat{Z}_{in}\hat{Y}^T)}{\sigma} \Big)};\\
120 | \Omega &= \phi_{in} \Big( \frac{E+1}{O+1} \Big)^\Theta; \\
121 | R_{in} &= R_{in} \Omega; \\
122 | R_{in} &= \text{L1-Normalize}(R_{in}, \text{row}).
123 | \end{align*}
124 | ```
125 |
126 | where $\Theta = [\theta^T, \dots, \theta^T]$ of shape $B \times K$, and the power is taken element-wise.
127 |
128 | 3. Compute $O$ and $E$ with full data:
129 |
130 | $$
131 | E = E + Pr^T \cdot [R_{in, 1}, \dots, R_{in, K}], \qquad O = O + \phi_{in}^T R_{in}.
132 | $$
133 |
134 | 4. Update cluster centroids:
135 |
136 | $$
137 | \begin{align*}
138 | \hat{Y} &= R^T \hat{Z} \qquad \big( \hat{Y}_{kd} = \sum_{i = 1}^N R_{ik}\hat{Z}_{id} \big);\\
139 | \hat{Y} &= \text{L2-Normalize}(\hat{Y}, \text{row}).
140 | \end{align*}
141 | $$
142 |
143 | 5. Compute objective value with updated $\hat{Y}$, $\hat{Z}$, $R$, $O$, and $E$.
144 |
145 | ## Correction
146 |
147 | ### Original Method
148 |
149 | 1. Initialize $\hat{Z}$ by $Z$.
150 |
151 | 2. Let
152 |
153 | $$
154 | \phi^* = \begin{bmatrix}
155 | 1 & \phi_{11} & \cdots & \phi_{1B} \\
156 | \vdots & \vdots & \ddots & \vdots \\
157 | 1 & \phi_{N1} & \cdots & \phi_{NB}
158 | \end{bmatrix}
159 | $$
160 |
161 | 3. Cluster-wise correction:
162 |
163 | For each cluster $k$,
164 |
165 | ```math
166 | \begin{align*}
167 | R_k &= [R_{1k}, \dots, R_{Nk}];\\
168 | \Phi_{R,k}^* &= \phi^{*T} \otimes R_k;\\
169 | W_k &= (\Phi_{R,k}^* \phi^* + \lambda J)^{-1} \Phi_{R,k}^* Z;\\
170 | W_k[0, :] &= \mathbf{0};\\
171 | \hat{Z} &= \hat{Z} - \Phi_{R,k}^{*T} W_k.
172 | \end{align*}
173 | ```
174 |
175 | where $\otimes$ is row-wise multiplication of a matrix and a row vector, and
176 |
177 | $$
178 | J = \begin{bmatrix}
179 | 0 & 0 & 0 & \cdots & 0\\
180 | 0 & 1 & & & \\
181 | 0 & & 1 & & \\
182 | \vdots & & & \ddots & \\
183 | 0 & & & & 1
184 | \end{bmatrix}.
185 | $$
186 |
187 |
188 | ### Improvement
189 |
190 | We don't need to directly calculate the matrix inverse:
191 |
192 | ```math
193 | (\Phi_{R,k}^* \phi^* + \lambda J)^{-1}
194 | ```
195 |
196 | of shape $(B+1)\times(B+1)$, which can be time-consuming when the number of batches $B$ is high.
197 |
198 | Let
199 |
200 | ```math
201 | A_k = \phi^{*T}diag(R_k)\phi^* + \lambda J,
202 | ```
203 |
204 | then
205 |
206 | ```math
207 | W_k = A_k^{-1}\Phi_{R, k}^* Z.
208 | ```
209 |
210 | Since
211 |
212 | ```math
213 | \begin{align*}
214 | A_k &= \begin{bmatrix}
215 | 1 & \cdots & 1 \\
216 | \phi_{11} & \cdots & \phi_{N1} \\
217 | \vdots & \vdots & \vdots \\
218 | \phi_{1B} & \cdots & \phi_{NB}
219 | \end{bmatrix} \cdot \begin{bmatrix}
220 | R_{1k} & & \\
221 | & \ddots & \\
222 | & & R_{Nk}
223 | \end{bmatrix} \cdot \begin{bmatrix}
224 | 1 & \phi_{11} & \cdots & \phi_{1B} \\
225 | \vdots & \vdots & \ddots & \vdots \\
226 | 1 & \phi_{N1} & \cdots & \phi_{NB}
227 | \end{bmatrix} + \lambda J \\
228 | &= \begin{bmatrix}
229 | \sum_{i = 1}^N R_{ik} & \sum_{i = 1}^N \phi_{i1}R_{ik} & \cdots & \sum_{i = 1}^N \phi_{iB}R_{ik} \\
230 | \sum_{i = 1}^N \phi_{i1}R_{ik} & \sum_{i = 1}^N \phi_{i1}^2 R_{ik} & \cdots & \sum_{i = 1}^N \phi_{i1}\phi_{iB}R_{ik} \\
231 | \vdots & \vdots & \ddots & \vdots \\
232 | \sum_{i = 1}^N \phi_{iB}R_{ik} & \sum_{i = 1}^N \phi_{iB}\phi_{i1}R_{ik} & \cdots & \sum_{i = 1}^N \phi_{iB}^2R_{ik}
233 | \end{bmatrix} + \lambda J,
234 | \end{align*}
235 | ```
236 |
237 | it's easy to see that
238 |
239 | ```math
240 | \sum_{i = 1}^N \phi_{ib_1}\phi_{ib_2}R_{ik} = 0 \qquad \text{ for } \quad \forall b_1 \neq b_2
241 | ```
242 |
243 | and
244 |
245 | ```math
246 | \sum_{i = 1}^N \phi_{ib}^2 R_{ik} = \sum_{i = 1}^N \phi_{ib} R_{ik}.
247 | ```
248 |
249 | Let
250 |
251 | $$
252 | \begin{align*}
253 | N_k &= \sum_{i = 1}^N R_{ik},\\
254 | N_{bk} &= \sum_{i = 1}^N \phi_{ib}R_{ik} \qquad \Rightarrow \qquad (N_{bk})_{B \times K} = \phi^T R = O.
255 | \end{align*}
256 | $$
257 |
258 | Then we have
259 |
260 | ```math
261 | N_k = \sum_{b = 1}^B O_{bk}
262 | ```
263 |
264 | and
265 |
266 | ```math
267 | A_k = \begin{bmatrix}
268 | N_k & O_{1k} & \cdots & O_{Bk} \\
269 | O_{1k} & O_{1k} & & \\
270 | \vdots & & \ddots & \\
271 | O_{Bk} & & & O_{Bk}
272 | \end{bmatrix} + \lambda J = \begin{bmatrix}
273 | N_k & O_{1k} & \cdots & O_{Bk} \\
274 | O_{1k} & O_{1k} + \lambda & & \\
275 | \vdots & & \ddots & \\
276 | O_{Bk} & & & O_{Bk} + \lambda
277 | \end{bmatrix}.
278 | ```
279 |
280 | Let
281 |
282 | ```math
283 | P = \begin{bmatrix}
284 | 1 & -\frac{O_{1k}}{O_{1k} + \lambda} & \cdots & -\frac{O_{Bk}}{O_{Bk} + \lambda} \\
285 | & 1 & & \\
286 | & & \ddots & \\
287 | & & & 1
288 | \end{bmatrix}
289 | ```
290 |
291 | then
292 |
293 | ```math
294 | \mathcal{B}_k = PA_kP^T = \begin{bmatrix}
295 | c & & & \\
296 | & O_{1k}+\lambda & & \\
297 | & & \ddots & \\
298 | & & & O_{Bk}+\lambda
299 | \end{bmatrix},
300 | ```
301 |
302 | where
303 |
304 | ```math
305 | c = N_k - \sum_{b = 1}^B \frac{O_{bk}^2}{O_{bk}+\lambda}.
306 | ```
307 |
308 | $\mathcal{B}_k$ has inverse
309 |
310 | ```math
311 | \mathcal{B}^{-1}_k = \begin{bmatrix}
312 | c^{-1} & & & \\
313 | & \frac{1}{O_{1k}+\lambda} & & \\
314 | & & \ddots & \\
315 | & & & \frac{1}{O_{Bk}+\lambda}
316 | \end{bmatrix}.
317 | ``` 318 | 319 | Now since $P$, $A_k$ and $P^T$ are all square matrices of shape $(B+1)\times(B+1)$ and invertible, we have 320 | 321 | ```math 322 | \begin{align*} 323 | \mathcal{B}_k^{-1} &= (PA_kP^T)^{-1} \\ 324 | &= (P^T)^{-1} A_k^{-1} P^{-1}. 325 | \end{align*} 326 | ``` 327 | 328 | Thus 329 | 330 | ```math 331 | \begin{align*} 332 | A^{-1}_k &= P^T\mathcal{B}_k^{-1}P \\ 333 | &= \begin{bmatrix} 334 | c^{-1} & & & \\ 335 | -\frac{O_{1k}}{O_{1k}+\lambda}c^{-1} & \frac{1}{O_{1k}+\lambda} & & \\ 336 | \vdots & & \ddots & \\ 337 | -\frac{O_{Bk}}{O_{Bk}+\lambda}c^{-1} & & & \frac{1}{O_{Bk}+\lambda} 338 | \end{bmatrix} \cdot P 339 | \end{align*} 340 | ``` 341 | 342 | which is decomposited into a lower-triangular, a diagonal, and an upper-triangular matrix. 343 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=42", "wheel", "setuptools_scm[toml]>=3.4"] 3 | 4 | [tool.setuptools_scm] 5 | write_to = "harmony/version.py" 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from codecs import open 3 | from os import path 4 | 5 | here = path.abspath(path.dirname(__file__)) 6 | with open(path.join(here, "README.rst"), encoding="utf-8") as f: 7 | long_description = f.read() 8 | 9 | requires = [ 10 | "torch>=1.12", 11 | "numpy", 12 | "pandas", 13 | "psutil", 14 | "threadpoolctl", 15 | "scikit-learn>=0.23" 16 | ] 17 | 18 | setup( 19 | name="harmony-pytorch", 20 | use_scm_version=True, 21 | description="Pytorch implementation of Harmony algorithm on single-cell sequencing data integration", 22 | long_description=long_description, 23 | url="https://github.com/lilab-bcb/harmony-pytorch", 24 | author="Yiming Yang, Bo Li", 25 | author_email="yang.yihming@gmail.com, lijiganjun@gmail.com", 26 | classifiers=[ # https://pypi.python.org/pypi?%3Aaction=list_classifiers 27 | "Development Status :: 3 - Alpha", 28 | "Intended Audience :: Developers", 29 | "Intended Audience :: Science/Research", 30 | "License :: OSI Approved :: BSD License", 31 | "Natural Language :: English", 32 | "Operating System :: MacOS", 33 | "Operating System :: Microsoft :: Windows :: Windows 10", 34 | "Operating System :: POSIX :: Linux", 35 | "Programming Language :: Python :: 3", 36 | "Programming Language :: Python :: 3 :: Only", 37 | "Topic :: Software Development :: Build Tools", 38 | "Topic :: Scientific/Engineering :: Bio-Informatics", 39 | ], 40 | keywords="single-cell genomics data integration", 41 | packages=find_packages(), 42 | install_requires=requires, 43 | setup_requires=["setuptools_scm"], 44 | python_requires="~=3.8", 45 | ) 46 | -------------------------------------------------------------------------------- /test/gen_cell_lines.R: -------------------------------------------------------------------------------- 1 | library(harmony) 2 | 3 | metadata <- read.table("data/cell_lines/metadata.csv", header = TRUE, sep = ',') 4 | X <- read.table("data/cell_lines/pca.txt") 5 | 6 | start <- Sys.time() 7 | Z <- HarmonyMatrix(X, metadata, "dataset", do_pca = FALSE) 8 | end <- Sys.time() 9 | 10 | print(end - start) 11 | 12 | write.table(Z, file = "result/cell_lines_harmony_z.txt", quote = FALSE, row.names = FALSE, col.names = FALSE) 
-------------------------------------------------------------------------------- /test/gen_mantonbm.R: -------------------------------------------------------------------------------- 1 | library(harmony) 2 | 3 | metadata <- read.table("./data/MantonBM/metadata.csv", header = TRUE, sep = ',') 4 | X <- read.table("./data/MantonBM/pca.txt") 5 | 6 | start <- Sys.time() 7 | Z <- HarmonyMatrix(X, metadata, "Channel", do_pca = FALSE) 8 | end <- Sys.time() 9 | 10 | write.table(Z, file = "./result/MantonBM_harmony_z.txt", quote = FALSE, row.names = FALSE, col.names = FALSE) 11 | 12 | print(end - start) -------------------------------------------------------------------------------- /test/gen_pbmc.R: -------------------------------------------------------------------------------- 1 | library(harmony) 2 | 3 | metadata <- read.table("data/10x_pbmc/metadata.csv", header = TRUE, sep = ',') 4 | X <- read.table("data/10x_pbmc/pca.txt") 5 | 6 | start <- Sys.time() 7 | Z <- HarmonyMatrix(X, metadata, "Channel", do_pca = FALSE) 8 | end <- Sys.time() 9 | 10 | print(end - start) 11 | 12 | write.table(Z, file = "result/pbmc_harmony_z.txt", quote = FALSE, row.names = FALSE, col.names = FALSE) -------------------------------------------------------------------------------- /test/test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pegasus as pg 4 | import seaborn as sns 5 | import matplotlib.pyplot as plt 6 | 7 | import os, sys, time, re 8 | 9 | from harmony import harmonize 10 | from harmonypy import run_harmony 11 | from anndata import AnnData 12 | from scipy.stats import pearsonr 13 | from scipy.sparse import csr_matrix 14 | 15 | 16 | metric_dict = {"r": "Correlation", "L2": "L2 Error"} 17 | 18 | 19 | def check_metric(Z_torch, Z_py, Z_R, prefix, norm): 20 | assert Z_torch.shape == Z_py.shape and Z_py.shape == Z_R.shape 21 | 22 | metric_torch = [] 23 | for i in range(Z_torch.shape[1]): 24 | m = get_measure(Z_torch[:, i], Z_R[:, i], norm) 25 | metric_torch.append(m) 26 | 27 | print( 28 | "Mean {metric} by harmony-pytorch = {value:.4f}".format( 29 | metric=metric_dict[norm], value=np.mean(metric_torch) 30 | ) 31 | ) 32 | np.savetxt( 33 | "./result/{prefix}_{metric}_torch.txt".format(prefix=prefix, metric=norm), 34 | metric_torch, 35 | ) 36 | 37 | metric_py = [] 38 | for i in range(Z_py.shape[1]): 39 | m = get_measure(Z_py[:, i], Z_R[:, i], norm) 40 | metric_py.append(m) 41 | 42 | print( 43 | "Mean {metric} by harmonypy = {value:.4f}".format( 44 | metric=metric_dict[norm], value=np.mean(metric_py) 45 | ) 46 | ) 47 | np.savetxt( 48 | "./result/{prefix}_{metric}_py.txt".format(prefix=prefix, metric=norm), 49 | metric_py, 50 | ) 51 | 52 | 53 | def get_measure(x, base, norm): 54 | assert norm in ["r", "L2"] 55 | 56 | if norm == "r": 57 | corr, _ = pearsonr(x, base) 58 | return corr 59 | else: 60 | return np.linalg.norm(x - base) / np.linalg.norm(base) 61 | 62 | 63 | def plot_umap(adata, Z_torch, Z_py, Z_R, prefix, batch_key): 64 | if adata is not None: 65 | adata.obsm["X_torch"] = Z_torch 66 | adata.obsm["X_py"] = Z_py 67 | adata.obsm["X_harmony"] = Z_R 68 | 69 | pg.neighbors(adata, rep="torch") 70 | pg.umap(adata, rep="torch", out_basis="umap_torch") 71 | 72 | pg.neighbors(adata, rep="py") 73 | pg.umap(adata, rep="py", out_basis="umap_py") 74 | 75 | pg.neighbors(adata, rep="harmony") 76 | pg.umap(adata, rep="harmony", out_basis="umap_harmony") 77 | 78 | pg.write_output(adata, "./result/{}_result".format(prefix)) 79 | else: 
80 | print("Use precalculated AnnData result.") 81 | 82 | if os.system( 83 | "pegasus plot scatter --basis umap --attributes {attr} --alpha 0.5 ./result/{name}_result.h5ad ./plots/{name}.before.umap.pdf".format( 84 | name=prefix, attr=batch_key 85 | ) 86 | ): 87 | sys.exit(1) 88 | 89 | if os.system( 90 | "pegasus plot scatter --basis umap_torch --attributes {attr} --alpha 0.5 ./result/{name}_result.h5ad ./plots/{name}.torch.umap.pdf".format( 91 | name=prefix, attr=batch_key 92 | ) 93 | ): 94 | sys.exit(1) 95 | 96 | if os.system( 97 | "pegasus plot scatter --basis umap_py --attributes {attr} --alpha 0.5 ./result/{name}_result.h5ad ./plots/{name}.py.umap.pdf".format( 98 | name=prefix, attr=batch_key 99 | ) 100 | ): 101 | sys.exit(1) 102 | 103 | if os.system( 104 | "pegasus plot scatter --basis umap_harmony --attributes {attr} --alpha 0.5 ./result/{name}_result.h5ad ./plots/{name}.harmony.umap.pdf".format( 105 | name=prefix, attr=batch_key 106 | ) 107 | ): 108 | sys.exit(1) 109 | 110 | 111 | def test_cell_lines(): 112 | print("Testing on cell lines dataset...") 113 | 114 | z_files = [ 115 | f for f in os.listdir("./result") if re.match("cell_lines.*_z.(txt|npy)", f) 116 | ] 117 | if len(z_files) < 3 or not os.path.exists("./result/cell_lines_result.h5ad"): 118 | X = np.loadtxt("./data/cell_lines/pca.txt") 119 | df_metadata = pd.read_csv("./data/cell_lines/metadata.csv") 120 | source_loaded = True 121 | 122 | if os.path.exists("./result/cell_lines_torch_z.npy"): 123 | Z_torch = np.load("./result/cell_lines_torch_z.npy") 124 | print("Precalculated embedding by harmony-pytorch is loaded.") 125 | else: 126 | start_torch = time.time() 127 | Z_torch = harmonize(X, df_metadata, batch_key="dataset") 128 | end_torch = time.time() 129 | 130 | print( 131 | "Time spent for harmony-pytorch = {:.2f}s.".format(end_torch - start_torch) 132 | ) 133 | np.save("./result/cell_lines_torch_z.npy", Z_torch) 134 | 135 | if os.path.exists("./result/cell_lines_py_z.npy"): 136 | Z_py = np.load("./result/cell_lines_py_z.npy") 137 | print("Precalculated embedding by harmonypy is loaded.") 138 | else: 139 | start_py = time.time() 140 | ho = run_harmony(X, df_metadata, ["dataset"]) 141 | end_py = time.time() 142 | 143 | print("Time spent for harmonypy = {:.2f}s.".format(end_py - start_py)) 144 | print(ho.objective_harmony) 145 | 146 | Z_py = np.transpose(ho.Z_corr) 147 | np.save("./result/cell_lines_py_z.npy", Z_py) 148 | 149 | Z_R = np.loadtxt("./result/cell_lines_harmony_z.txt") 150 | 151 | check_metric(Z_torch, Z_py, Z_R, prefix="cell_lines", norm="r") 152 | check_metric(Z_torch, Z_py, Z_R, prefix="cell_lines", norm="L2") 153 | 154 | if os.path.exists("./result/cell_lines_result.h5ad"): 155 | adata = None 156 | else: 157 | n_obs = X.shape[0] 158 | adata = AnnData(X=csr_matrix((n_obs, 2)), obs=df_metadata) 159 | adata.obsm["X_pca"] = X 160 | 161 | pg.neighbors(adata, rep="pca") 162 | pg.umap(adata) 163 | 164 | umap_list = [f for f in os.listdir("./plots") if re.match("cell_lines.*.pdf", f)] 165 | if len(umap_list) < 4: 166 | plot_umap(adata, Z_torch, Z_py, Z_R, prefix="cell_lines", batch_key="dataset") 167 | 168 | if os.path.exists("./result/cell_lines_result.h5ad"): 169 | adata = pg.read_input("./result/cell_lines_result.h5ad", h5ad_mode="r") 170 | 171 | stat, pvalue, ac_rate = pg.calc_kBET(adata, attr="dataset", rep="harmony") 172 | print( 173 | "kBET for Harmony: statistic = {stat}, p-value = {pval}, ac rate = {ac_rate}".format( 174 | stat=stat, pval=pvalue, ac_rate=ac_rate 175 | ) 176 | ) 177 | 178 | stat, pvalue, 
ac_rate = pg.calc_kBET(adata, attr="dataset", rep="py") 179 | print( 180 | "kBET for harmonypy: statistic = {stat}, p-value = {pval}, ac rate = {ac_rate}".format( 181 | stat=stat, pval=pvalue, ac_rate=ac_rate 182 | ) 183 | ) 184 | 185 | stat, pvalue, ac_rate = pg.calc_kBET(adata, attr="dataset", rep="torch") 186 | print( 187 | "kBET for harmony-pytorch: statistic = {stat}, p-value = {pval}, ac rate = {ac_rate}".format( 188 | stat=stat, pval=pvalue, ac_rate=ac_rate 189 | ) 190 | ) 191 | 192 | 193 | def test_pbmc(): 194 | print("Testing on 10x pbmc dataset...") 195 | 196 | z_files = [f for f in os.listdir("./result") if re.match("pbmc.*_z.(txt|npy)", f)] 197 | if len(z_files) < 3 or not os.path.exists("./result/pbmc_result.h5ad"): 198 | adata = pg.read_input("./data/10x_pbmc/original_data.h5ad") 199 | 200 | if os.path.exists("./result/pbmc_torch_z.npy"): 201 | Z_torch = np.load("./result/pbmc_torch_z.npy") 202 | print("Precalculated embedding by harmony-pytorch is loaded.") 203 | else: 204 | start_torch = time.time() 205 | Z_torch = harmonize(adata.obsm["X_pca"], adata.obs, batch_key="Channel") 206 | end_torch = time.time() 207 | 208 | print( 209 | "Time spent for harmony-pytorch = {:.2f}s.".format(end_torch - start_torch) 210 | ) 211 | np.save("./result/pbmc_torch_z.npy", Z_torch) 212 | 213 | if os.path.exists("./result/pbmc_py_z.npy"): 214 | Z_py = np.load("./result/pbmc_py_z.npy") 215 | print("Precalculated embedding by harmonypy is loaded.") 216 | else: 217 | start_py = time.time() 218 | ho = run_harmony(adata.obsm["X_pca"], adata.obs, ["Channel"]) 219 | end_py = time.time() 220 | 221 | print(ho.objective_harmony) 222 | print("Time spent for harmonypy = {:.2f}s.".format(end_py - start_py)) 223 | 224 | Z_py = np.transpose(ho.Z_corr) 225 | np.save("./result/pbmc_py_z.npy", Z_py) 226 | 227 | Z_R = np.loadtxt("./result/pbmc_harmony_z.txt") 228 | 229 | check_metric(Z_torch, Z_py, Z_R, prefix="pbmc", norm="r") 230 | check_metric(Z_torch, Z_py, Z_R, prefix="pbmc", norm="L2") 231 | 232 | if os.path.exists("./result/pbmc_result.h5ad"): 233 | adata = None 234 | 235 | umap_list = [f for f in os.listdir("./plots") if re.match("pbmc.*.pdf", f)] 236 | if len(umap_list) < 4: 237 | plot_umap(adata, Z_torch, Z_py, Z_R, prefix="pbmc", batch_key="Channel") 238 | 239 | 240 | def test_mantonbm(): 241 | print("Testing on MantonBM dataset...") 242 | 243 | z_files = [ 244 | f for f in os.listdir("./result") if re.match("MantonBM.*_z.(txt|npy)", f) 245 | ] 246 | if len(z_files) < 3 or not os.path.exists("./result/MantonBM_result.h5ad"): 247 | adata = pg.read_input("./data/MantonBM/original_data.h5ad") 248 | adata.obs["Individual"] = pd.Categorical( 249 | adata.obs["Channel"].apply(lambda s: s.split("_")[0][-1]) 250 | ) 251 | 252 | if os.path.exists("./result/MantonBM_torch_z.npy"): 253 | Z_torch = np.load("./result/MantonBM_torch_z.npy") 254 | print("Precalculated embedding by harmony-pytorch is loaded.") 255 | else: 256 | start_torch = time.time() 257 | Z_torch = harmonize(adata.obsm["X_pca"], adata.obs, batch_key="Channel") 258 | end_torch = time.time() 259 | 260 | print( 261 | "Time spent for harmony-pytorch = {:.2f}s.".format(end_torch - start_torch) 262 | ) 263 | np.save("./result/MantonBM_torch_z.npy", Z_torch) 264 | 265 | if os.path.exists("./result/MantonBM_py_z.npy"): 266 | Z_py = np.load("./result/MantonBM_py_z.npy") 267 | print("Precalculated embedding by harmonypy is loaded.") 268 | else: 269 | start_py = time.time() 270 | ho = run_harmony(adata.obsm["X_pca"], adata.obs, ["Channel"]) 271 | end_py = 
time.time() 272 | 273 | print("Time spent for harmonypy = {:.2f}s.".format(end_py - start_py)) 274 | 275 | Z_py = np.transpose(ho.Z_corr) 276 | np.save("./result/MantonBM_py_z.npy", Z_py) 277 | 278 | Z_R = np.loadtxt("./result/MantonBM_harmony_z.txt") 279 | 280 | check_metric(Z_torch, Z_py, Z_R, prefix="MantonBM", norm="r") 281 | check_metric(Z_torch, Z_py, Z_R, prefix="MantonBM", norm="L2") 282 | 283 | if os.path.exists("./result/MantonBM_result.h5ad"): 284 | adata = None 285 | 286 | umap_list = [f for f in os.listdir("./plots") if re.match("MantonBM.*.pdf", f)] 287 | if len(umap_list) < 4: 288 | plot_umap(adata, Z_torch, Z_py, Z_R, prefix="MantonBM", batch_key="Individual") 289 | 290 | 291 | def gen_plot(norm): 292 | # Cell Lines 293 | metric_celllines_torch = np.loadtxt("./result/cell_lines_{}_torch.txt".format(norm)) 294 | metric_celllines_py = np.loadtxt("./result/cell_lines_{}_py.txt".format(norm)) 295 | 296 | df1 = pd.DataFrame( 297 | { 298 | "dataset": np.repeat( 299 | ["Cell Lines"], metric_celllines_torch.size + metric_celllines_py.size 300 | ), 301 | "package": np.concatenate( 302 | ( 303 | np.repeat(["Torch"], metric_celllines_torch.size), 304 | np.repeat(["Py"], metric_celllines_py.size), 305 | ), 306 | axis=0, 307 | ), 308 | "metric": np.concatenate( 309 | (metric_celllines_torch, metric_celllines_py), axis=0 310 | ), 311 | } 312 | ) 313 | 314 | # PBMC 315 | metric_pbmc_torch = np.loadtxt("./result/pbmc_{}_torch.txt".format(norm)) 316 | metric_pbmc_py = np.loadtxt("./result/pbmc_{}_py.txt".format(norm)) 317 | 318 | df2 = pd.DataFrame( 319 | { 320 | "dataset": np.repeat( 321 | ["10x PBMC"], metric_pbmc_torch.size + metric_pbmc_py.size 322 | ), 323 | "package": np.concatenate( 324 | ( 325 | np.repeat(["Torch"], metric_pbmc_torch.size), 326 | np.repeat(["Py"], metric_pbmc_py.size), 327 | ), 328 | axis=0, 329 | ), 330 | "metric": np.concatenate((metric_pbmc_torch, metric_pbmc_py), axis=0), 331 | } 332 | ) 333 | 334 | # MantonBM 335 | metric_mantonbm_torch = np.loadtxt("./result/MantonBM_{}_torch.txt".format(norm)) 336 | metric_mantonbm_py = np.loadtxt("./result/MantonBM_{}_py.txt".format(norm)) 337 | 338 | df3 = pd.DataFrame( 339 | { 340 | "dataset": np.repeat( 341 | ["Bone Marrow"], metric_mantonbm_torch.size + metric_mantonbm_py.size 342 | ), 343 | "package": np.concatenate( 344 | ( 345 | np.repeat(["Torch"], metric_mantonbm_torch.size), 346 | np.repeat(["Py"], metric_mantonbm_py.size), 347 | ), 348 | axis=0, 349 | ), 350 | "metric": np.concatenate( 351 | (metric_mantonbm_torch, metric_mantonbm_py), axis=0 352 | ), 353 | } 354 | ) 355 | 356 | df = pd.concat([df1, df2, df3]) 357 | 358 | # Plot 359 | ax = sns.violinplot( 360 | x="dataset", 361 | y="metric", 362 | hue="package", 363 | data=df, 364 | palette="muted", 365 | split=True, 366 | cut=0, 367 | ) 368 | ax.set_title( 369 | "{} between Harmonypy and Harmony-pytorch Integration".format(metric_dict[norm]) 370 | ) 371 | ax.set(xlabel="Dataset", ylabel="{} on PCs".format(metric_dict[norm])) 372 | if norm == "r": 373 | ax.set(ylim=(0.98, 1.001)) 374 | else: 375 | ax.set(ylim=(0, 0.1)) 376 | figure = ax.get_figure() 377 | legend_loc = "lower right" if norm == "r" else "upper right" 378 | figure.get_axes()[0].legend(title="Package", loc=legend_loc) 379 | figure.savefig("./plots/{}_stats.png".format(norm), dpi=400) 380 | plt.close() 381 | 382 | 383 | if __name__ == "__main__": 384 | dataset = sys.argv[1] 385 | 386 | assert dataset in ["cell_lines", "pbmc", "MantonBM", "plot"] 387 | 388 | if not os.path.exists("./result"): 389 | if 
os.system("mkdir ./result"): 390 | sys.exit(1) 391 | 392 | if not os.path.exists("./plots"): 393 | if os.system("mkdir ./plots"): 394 | sys.exit(1) 395 | 396 | if dataset == "cell_lines": 397 | test_cell_lines() 398 | elif dataset == "pbmc": 399 | test_pbmc() 400 | elif dataset == "MantonBM": 401 | test_mantonbm() 402 | else: 403 | gen_plot("r") 404 | gen_plot("L2") 405 | -------------------------------------------------------------------------------- /test/test_gpu.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pegasus as pg 3 | import pandas as pd 4 | 5 | import os, sys, time, re 6 | 7 | from harmony import harmonize 8 | from scipy.stats import pearsonr 9 | from scipy.sparse import csr_matrix 10 | from anndata import AnnData 11 | 12 | 13 | def check_metrics(Z, base, prefix): 14 | assert Z.shape == base.shape 15 | 16 | cors = [] 17 | errors = [] 18 | for i in range(Z.shape[1]): 19 | cor, _ = pearsonr(Z[:, i], base[:, i]) 20 | cors.append(cor) 21 | 22 | err = np.linalg.norm(Z[:, i] - base[:, i]) / np.linalg.norm(base[:, i]) 23 | errors.append(err) 24 | 25 | print( 26 | "For {name}, mean r = {cor:.4f}, mean L2 error = {err:.4f}.".format( 27 | name=prefix, cor=np.mean(cors), err=np.mean(errors) 28 | ) 29 | ) 30 | np.savetxt("./result/{}_r.txt".format(prefix), cors) 31 | np.savetxt("./result/{}_L2.txt".format(prefix), errors) 32 | 33 | 34 | def plot_umap(adata, Z_cpu, Z_gpu, Z_R, prefix, batch_key): 35 | if adata is not None: 36 | adata.obsm["X_cpu"] = Z_cpu 37 | adata.obsm["X_gpu"] = Z_gpu 38 | adata.obsm["X_harmony"] = Z_R 39 | 40 | pg.neighbors(adata, rep="cpu") 41 | pg.umap(adata, rep="cpu", out_basis="umap_cpu") 42 | 43 | pg.neighbors(adata, rep="gpu") 44 | pg.umap(adata, rep="gpu", out_basis="umap_gpu") 45 | 46 | pg.neighbors(adata, rep="harmony") 47 | pg.umap(adata, rep="harmony", out_basis="umap_harmony") 48 | 49 | pg.write_output(adata, "./result/{}_result".format(prefix)) 50 | else: 51 | print("Use precalculated AnnData result.") 52 | 53 | if os.system( 54 | "pegasus plot scatter --basis umap --attributes {attr} --alpha 0.5 ./result/{prefix}_result.h5ad ./plots/{prefix}.before.umap.pdf".format( 55 | attr=batch_key, prefix=prefix 56 | ) 57 | ): 58 | sys.exit(1) 59 | 60 | if os.system( 61 | "pegasus plot scatter --basis umap_cpu --attributes {attr} --alpha 0.5 ./result/{prefix}_result.h5ad ./plots/{prefix}.cpu.umap.pdf".format( 62 | attr=batch_key, prefix=prefix 63 | ) 64 | ): 65 | sys.exit(1) 66 | 67 | if os.system( 68 | "pegasus plot scatter --basis umap_gpu --attributes {attr} --alpha 0.5 ./result/{prefix}_result.h5ad ./plots/{prefix}.gpu.umap.pdf".format( 69 | attr=batch_key, prefix=prefix 70 | ) 71 | ): 72 | sys.exit(1) 73 | 74 | if os.system( 75 | "pegasus plot scatter --basis umap_harmony --attributes {attr} --alpha 0.5 ./result/{prefix}_result.h5ad ./plots/{prefix}.harmony.umap.pdf".format( 76 | attr=batch_key, prefix=prefix 77 | ) 78 | ): 79 | sys.exit(1) 80 | 81 | 82 | def test_cell_lines(): 83 | print("Testing on Cell Lines...") 84 | 85 | z_files = [ 86 | f for f in os.listdir("./result") if re.match("cell_lines.*_z.(txt|npy)", f) 87 | ] 88 | if len(z_files) < 3 or not os.path.exists("./result/cell_lines_result.h5ad"): 89 | X = np.loadtxt("./data/cell_lines/pca.txt") 90 | df_metadata = pd.read_csv("./data/cell_lines/metadata.csv") 91 | 92 | if os.path.exists("./result/cell_lines_cpu_z.npy"): 93 | Z_cpu = np.load("./result/cell_lines_cpu_z.npy") 94 | print("Precalculated CPU mode result is loaded.") 95 | 
else:
96 |             start_cpu = time.time()
97 |             Z_cpu = harmonize(X, df_metadata, "dataset")
98 |             end_cpu = time.time()
99 |
100 |             print("Time spent in CPU mode = {:.2f}s.".format(end_cpu - start_cpu))
101 |             np.save("./result/cell_lines_cpu_z.npy", Z_cpu)
102 |
103 |         if os.path.exists("./result/cell_lines_gpu_z.npy"):
104 |             Z_gpu = np.load("./result/cell_lines_gpu_z.npy")
105 |             print("Precalculated GPU mode result is loaded.")
106 |         else:
107 |             start_gpu = time.time()
108 |             Z_gpu = harmonize(X, df_metadata, "dataset", use_gpu=True)
109 |             end_gpu = time.time()
110 |
111 |             print("Time spent in GPU mode = {:.2f}s".format(end_gpu - start_gpu))
112 |             np.save("./result/cell_lines_gpu_z.npy", Z_gpu)
113 |
114 |         Z_R = np.loadtxt("./result/cell_lines_harmony_z.txt")
115 |
116 |     check_metrics(Z_cpu, Z_R, prefix="cell_lines_cpu")
117 |     check_metrics(Z_gpu, Z_R, prefix="cell_lines_gpu")
118 |
119 |     if os.path.exists("./result/cell_lines_result.h5ad"):
120 |         adata = None
121 |     else:
122 |         n_obs = X.shape[0]
123 |         adata = AnnData(X=csr_matrix((n_obs, 2)), obs=df_metadata)
124 |         adata.obsm["X_pca"] = X
125 |
126 |         pg.neighbors(adata, rep="pca")
127 |         pg.umap(adata)
128 |
129 |     umap_list = [f for f in os.listdir("./plots") if re.match("cell_lines.*.pdf", f)]
130 |     if len(umap_list) < 4:
131 |         plot_umap(adata, Z_cpu, Z_gpu, Z_R, prefix="cell_lines", batch_key="dataset")
132 |
133 |
134 | def test_pbmc():
135 |     print("Testing on 10x PBMC...")
136 |
137 |     z_files = [f for f in os.listdir("./result") if re.match("pbmc.*_z.(txt|npy)", f)]
138 |     if len(z_files) < 3:
139 |         adata = pg.read_input("./data/10x_pbmc/original_data.h5ad")
140 |
141 |         if os.path.exists("./result/pbmc_cpu_z.npy"):
142 |             Z_cpu = np.load("./result/pbmc_cpu_z.npy")
143 |             print("Precalculated CPU mode result is loaded.")
144 |         else:
145 |             start_cpu = time.time()
146 |             Z_cpu = harmonize(adata.obsm["X_pca"], adata.obs, "Channel")
147 |             end_cpu = time.time()
148 |
149 |             print("Time spent in CPU mode = {:.2f}s.".format(end_cpu - start_cpu))
150 |             np.save("./result/pbmc_cpu_z.npy", Z_cpu)
151 |
152 |         if os.path.exists("./result/pbmc_gpu_z.npy"):
153 |             Z_gpu = np.load("./result/pbmc_gpu_z.npy")
154 |             print("Precalculated GPU mode result is loaded.")
155 |         else:
156 |             start_gpu = time.time()
157 |             Z_gpu = harmonize(adata.obsm["X_pca"], adata.obs, "Channel", use_gpu=True)
158 |             end_gpu = time.time()
159 |
160 |             print("Time spent in GPU mode = {:.2f}s".format(end_gpu - start_gpu))
161 |             np.save("./result/pbmc_gpu_z.npy", Z_gpu)
162 |
163 |         Z_R = np.loadtxt("./result/pbmc_harmony_z.txt")
164 |
165 |     check_metrics(Z_cpu, Z_R, prefix="pbmc_cpu")
166 |     check_metrics(Z_gpu, Z_R, prefix="pbmc_gpu")
167 |
168 |     if os.path.exists("./result/pbmc_result.h5ad"):
169 |         adata = None
170 |
171 |     umap_list = [f for f in os.listdir("./plots") if re.match("pbmc.*.pdf", f)]
172 |     if len(umap_list) < 4:
173 |         plot_umap(adata, Z_cpu, Z_gpu, Z_R, prefix="pbmc", batch_key="Channel")
174 |
175 |
176 | def test_mantonbm():
177 |     print("Testing on MantonBM...")
178 |
179 |     z_files = [
180 |         f for f in os.listdir("./result") if re.match("MantonBM.*_z.(txt|npy)", f)
181 |     ]
182 |     if len(z_files) < 3:
183 |         adata = pg.read_input("./data/MantonBM/original_data.h5ad")
184 |         adata.obs["Individual"] = pd.Categorical(
185 |             adata.obs["Channel"].apply(lambda s: s.split("_")[0][-1])
186 |         )
187 |
188 |         if os.path.exists("./result/MantonBM_cpu_z.npy"):
189 |             Z_cpu = np.load("./result/MantonBM_cpu_z.npy")
190 |             print("Precalculated CPU mode result is loaded.")
191 |         else:
192 |             start_cpu =
time.time() 193 | Z_cpu = harmonize(adata.obsm["X_pca"], adata.obs, "Channel") 194 | end_cpu = time.time() 195 | 196 | print("Time spent in CPU mode = {:.2f}s.".format(end_cpu - start_cpu)) 197 | np.save("./result/MantonBM_cpu_z.npy", Z_cpu) 198 | 199 | if os.path.exists("./result/MantonBM_gpu_z.npy"): 200 | Z_gpu = np.load("./result/MantonBM_gpu_z.npy") 201 | print("Precalculated GPU mode result is loaded.") 202 | else: 203 | start_gpu = time.time() 204 | Z_gpu = harmonize(adata.obsm["X_pca"], adata.obs, "Channel", use_gpu=True) 205 | end_gpu = time.time() 206 | 207 | print("Time spent in GPU mode = {:.2f}s".format(end_gpu - start_gpu)) 208 | np.save("./result/MantonBM_gpu_z.npy", Z_gpu) 209 | 210 | Z_R = np.loadtxt("./result/MantonBM_harmony_z.txt") 211 | 212 | check_metrics(Z_cpu, Z_R, prefix="MantonBM_cpu") 213 | check_metrics(Z_gpu, Z_R, prefix="MantonBM_gpu") 214 | 215 | if os.path.exists("./result/MantonBM_result.h5ad"): 216 | adata = None 217 | 218 | umap_list = [f for f in os.listdir("./plots") if re.match("MantonBM.*.pdf", f)] 219 | if len(umap_list) < 4: 220 | plot_umap(adata, Z_cpu, Z_gpu, Z_R, prefix="MantonBM", batch_key="Individual") 221 | 222 | 223 | if __name__ == "__main__": 224 | dataset = sys.argv[1] 225 | 226 | assert dataset in ["cell_lines", "pbmc", "MantonBM"] 227 | if dataset == "cell_lines": 228 | test_cell_lines() 229 | elif dataset == "pbmc": 230 | test_pbmc() 231 | else: 232 | test_mantonbm() 233 | --------------------------------------------------------------------------------
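
These scripts benchmark harmony-pytorch against harmonypy and the R implementation on prepared datasets under `./data`. For a quick smoke test without those files, here is a minimal self-contained sketch (synthetic data; the sizes and the `batch` column name are made up for illustration):

```python
import numpy as np
import pandas as pd

from harmony import harmonize

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 20))  # synthetic N x d embedding
df = pd.DataFrame({"batch": rng.choice(["b1", "b2"], size=1000)})

Z_cpu = harmonize(X, df, "batch")
Z_gpu = harmonize(X, df, "batch", use_gpu=True)  # falls back to CPU without a GPU

# Per-dimension relative L2 error, mirroring check_metrics() above.
errors = np.linalg.norm(Z_gpu - Z_cpu, axis=0) / np.linalg.norm(Z_cpu, axis=0)
print("mean relative L2 error = {:.4f}".format(errors.mean()))
```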