├── .gitattributes
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.rst
├── harmony
│   ├── __init__.py
│   ├── harmony.py
│   └── utils.py
├── method
│   └── Method.md
├── pyproject.toml
├── setup.py
└── test
    ├── gen_cell_lines.R
    ├── gen_mantonbm.R
    ├── gen_pbmc.R
    ├── test.py
    └── test_gpu.py

/.gitattributes:
--------------------------------------------------------------------------------
1 | harmony/_version.py export-subst
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g.
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # Sublime workspace 132 | *.sublime-workspace 133 | .DS_Store 134 | 135 | #Custom folders 136 | results/ 137 | figures/ 138 | 139 | *.sublime-workspace 140 | *.sublime-project 141 | 142 | # Jupyter notebooks 143 | *.ipynb 144 | 145 | .idea/ 146 | 147 | *.h5ad 148 | 149 | version.py 150 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019-2020 The Broad Institute, Inc. and The General Hospital Corporation. 4 | Copyright (c) 2021-present Genentech, Inc. for code commits from 2021 onward. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | exclude .* 2 | exclude test/* 3 | exclude method/* 4 | exclude pyproject.toml 5 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Harmony-Pytorch 2 | --------------- 3 | 4 | |PyPI| |Conda| |Python| |License| 5 | 6 | .. |PyPI| image:: https://img.shields.io/pypi/v/harmony-pytorch.svg 7 | :target: https://pypi.org/project/harmony-pytorch 8 | 9 | .. 
|Conda| image:: https://img.shields.io/conda/v/bioconda/harmony-pytorch
10 |    :target: https://anaconda.org/bioconda/harmony-pytorch
11 |
12 | .. |Python| image:: https://img.shields.io/pypi/pyversions/harmony-pytorch.svg
13 |    :target: https://pypi.org/project/harmony-pytorch
14 |
15 | .. |License| image:: https://img.shields.io/github/license/lilab-bcb/harmony-pytorch
16 |    :target: https://github.com/lilab-bcb/harmony-pytorch/blob/master/LICENSE
17 |
18 | This is a Pytorch implementation of the Harmony algorithm for single-cell sequencing data integration. Please see `Ilya Korsunsky et al., 2019 `_ for details.
19 |
20 | Installation
21 | ^^^^^^^^^^^^^
22 |
23 | This package is published on PyPI::
24 |
25 |     pip install harmony-pytorch
26 |
27 |
28 | Usage
29 | ^^^^^^^^
30 |
31 | General Case
32 | ##############
33 |
34 | Given an embedding ``X`` as an N-by-d NumPy array (N for the number of cells, d for the number of embedding components) and cell attributes as a DataFrame ``df_metadata``, run Harmony for data integration as follows::
35 |
36 |     from harmony import harmonize
37 |     Z = harmonize(X, df_metadata, batch_key = 'Channel')
38 |
39 |
40 | where ``Channel`` is the attribute in ``df_metadata`` that identifies batches.
41 |
42 | Alternatively, if multiple attributes together define the batches, write::
43 |
44 |     Z = harmonize(X, df_metadata, batch_key = ['Lab', 'Date'])
45 |
46 | Input as MultimodalData Object
47 | ###############################
48 |
49 | Harmony-pytorch works readily with the count matrix data structure from the `PegasusIO `_ package. Let ``data`` be a MultimodalData object in Python::
50 |
51 |     from harmony import harmonize
52 |     Z = harmonize(data.obsm['X_pca'], data.obs, batch_key = 'Channel')
53 |     data.obsm['X_pca_harmony'] = Z
54 |
55 | This calculates the harmonized PCA matrix for the default UnimodalData of ``data``.
56 |
57 | Given a UnimodalData object ``unidata``, you can also use the code above to run the Harmony algorithm: simply substitute ``unidata`` for ``data``.
58 |
59 | Input as AnnData Object
60 | ##########################
61 |
62 | Harmony-pytorch also works with the annotated count matrix data structure from the `anndata `_ package. Let ``adata`` be an AnnData object in Python::
63 |
64 |     from harmony import harmonize
65 |     Z = harmonize(adata.obsm['X_pca'], adata.obs, batch_key = '<batch_key>')
66 |     adata.obsm['X_harmony'] = Z
67 |
68 | where ``<batch_key>`` should be replaced by the actual batch key attribute name in your data.
69 |
70 | For details about the ``AnnData`` data structure, please refer to its `documentation `_.
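
GPU Mode
##########

``harmonize`` also exposes a ``use_gpu`` option (see ``harmony/harmony.py`` below). A minimal sketch, assuming a CUDA- or MPS-capable PyTorch build and reusing ``adata`` from above; when no GPU is available, ``harmonize`` prints a message and falls back to CPU::

    from harmony import harmonize

    # Runs on CUDA or Apple Metal (MPS) if available; otherwise uses CPU.
    Z = harmonize(adata.obsm['X_pca'], adata.obs, batch_key = '<batch_key>', use_gpu = True)
    adata.obsm['X_harmony'] = Z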
71 |
--------------------------------------------------------------------------------
/harmony/__init__.py:
--------------------------------------------------------------------------------
1 | from .harmony import harmonize
2 |
3 | try:
4 |     from importlib.metadata import version, PackageNotFoundError
5 | except ImportError:  # < Python 3.8: Use backport module
6 |     from importlib_metadata import version, PackageNotFoundError
7 |
8 | try:
9 |     __version__ = version("harmony-pytorch")
10 |     del version
11 | except PackageNotFoundError:
12 |     pass
13 |
--------------------------------------------------------------------------------
/harmony/harmony.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | import numpy as np
4 | import pandas as pd
5 |
6 |
7 | from sklearn.cluster import KMeans
8 | from torch.nn.functional import normalize
9 | from typing import Union, List, Optional
10 | from .utils import one_hot_tensor, get_batch_codes
11 |
12 |
13 | def harmonize(
14 |     X: np.ndarray,
15 |     df_obs: pd.DataFrame,
16 |     batch_key: Union[str, List[str]],
17 |     n_clusters: Optional[int] = None,
18 |     max_iter_harmony: int = 10,
19 |     max_iter_clustering: int = 200,
20 |     tol_harmony: float = 1e-4,
21 |     tol_clustering: float = 1e-5,
22 |     ridge_lambda: float = 1.0,
23 |     sigma: float = 0.1,
24 |     block_proportion: float = 0.05,
25 |     init_centroids_method: str = "default",
26 |     theta: float = 2.0,
27 |     tau: int = 0,
28 |     random_state: int = 0,
29 |     use_gpu: bool = False,
30 |     n_jobs: int = -1,
31 |     verbose: bool = True,
32 | ) -> np.ndarray:
33 |     """
34 |     Integrate data using the Harmony algorithm.
35 |
36 |     Parameters
37 |     ----------
38 |
39 |     X: ``numpy.ndarray``
40 |         The input embedding with rows for cells (N) and columns for embedding coordinates (d).
41 |
42 |     df_obs: ``pandas.DataFrame``
43 |         The cell barcode attributes as a Pandas DataFrame.
44 |
45 |     batch_key: ``str`` or ``List[str]``
46 |         Cell attribute(s) from ``df_obs`` used to identify batches.
47 |
48 |     n_clusters: ``int``, optional, default: ``None``
49 |         Number of clusters used in the Harmony algorithm. If ``None``, use the minimum of 100 and N / 30.
50 |
51 |     max_iter_harmony: ``int``, optional, default: ``10``
52 |         Maximum number of Harmony iterations to run if not converged.
53 |
54 |     max_iter_clustering: ``int``, optional, default: ``200``
55 |         Within each Harmony iteration, maximum number of clustering-step iterations to run if not converged.
56 |
57 |     tol_harmony: ``float``, optional, default: ``1e-4``
58 |         Tolerance for determining convergence of Harmony, measured on objective function values.
59 |
60 |     tol_clustering: ``float``, optional, default: ``1e-5``
61 |         Tolerance for determining convergence of the clustering step within each Harmony iteration, measured on objective function values.
62 |
63 |     ridge_lambda: ``float``, optional, default: ``1.0``
64 |         Hyperparameter of the ridge regression in the correction step.
65 |
66 |     sigma: ``float``, optional, default: ``0.1``
67 |         Weight of the entropy term in the objective function.
68 |
69 |     block_proportion: ``float``, optional, default: ``0.05``
70 |         Proportion of cells updated in one block of the clustering step.
71 |
72 |     init_centroids_method: ``str``, optional, default: ``default``
73 |         K-Means method used for initializing centroids. Can be either 'default' or 'harmony-paper'.
74 |         If 'default', use the default settings of the ``sklearn.cluster.KMeans`` function.
75 |         If 'harmony-paper', use the same method as described in the Harmony paper, i.e. ``sklearn.cluster.KMeans(..., init='random', n_init=10, max_iter=25, ...)``.
76 |
77 |     theta: ``float``, optional, default: ``2.0``
78 |         Weight of the diversity penalty term in the objective function.
79 |
80 |     tau: ``int``, optional, default: ``0``
81 |         Discounting factor on ``theta``. By default, there is no discounting.
82 |
83 |     random_state: ``int``, optional, default: ``0``
84 |         Random seed for reproducing results.
85 |
86 |     use_gpu: ``bool``, optional, default: ``False``
87 |         If ``True``, use GPU if available. Otherwise, use CPU only.
88 |
89 |     n_jobs: ``int``, optional, default: ``-1``
90 |         How many CPU threads to use. By default, use all physical cores. If ``use_gpu`` is ``True``, this option only affects the KMeans step.
91 |
92 |     verbose: ``bool``, optional, default: ``True``
93 |         If ``True``, print verbose output.
94 |
95 |     Returns
96 |     -------
97 |     ``numpy.ndarray``
98 |         The integrated embedding by Harmony, of the same shape as the input embedding.
99 |
100 |     Examples
101 |     --------
102 |     >>> adata = anndata.read_h5ad("filename.h5ad")
103 |     >>> X_harmony = harmonize(adata.obsm['X_pca'], adata.obs, 'Channel')
104 |
105 |     >>> adata = anndata.read_h5ad("filename.h5ad")
106 |     >>> X_harmony = harmonize(adata.obsm['X_pca'], adata.obs, ['Channel', 'Lab'])
107 |     """
108 |
109 |     assert isinstance(X, np.ndarray)
110 |
111 |     if n_jobs < 0:
112 |         import psutil
113 |
114 |         n_jobs = psutil.cpu_count(logical=False)  # get physical cores
115 |         if n_jobs is None:
116 |             n_jobs = psutil.cpu_count(
117 |                 logical=True
118 |             )  # if undetermined, use logical cores instead
119 |     torch.set_num_threads(n_jobs)
120 |
121 |     device_type = "cpu"
122 |     if use_gpu:
123 |         if torch.cuda.is_available():
124 |             device_type = "cuda"
125 |             if verbose:
126 |                 print("Use GPU mode.")
127 |         elif torch.backends.mps.is_available():
128 |             device_type = "mps"
129 |             if verbose:
130 |                 print("Use Metal (MPS) mode.")
131 |         elif verbose:
132 |             print(
133 |                 "Neither CUDA nor MPS is available on your machine. Use CPU mode instead."
134 |             )
135 |
136 |     (stride_0, stride_1) = X.strides
137 |     if stride_0 < 0 or stride_1 < 0:
138 |         Z = torch.tensor(X.copy(), dtype=torch.float, device=device_type)
139 |     else:
140 |         Z = torch.tensor(X, dtype=torch.float, device=device_type)
141 |     Z_norm = normalize(Z, p=2, dim=1)
142 |     n_cells = Z.shape[0]
143 |
144 |     batch_codes = get_batch_codes(df_obs, batch_key)
145 |     n_batches = batch_codes.cat.categories.size
146 |     N_b = torch.tensor(
147 |         batch_codes.value_counts(sort=False).values,
148 |         dtype=torch.float,
149 |         device=device_type,
150 |     )
151 |     Pr_b = N_b.view(-1, 1) / n_cells
152 |
153 |     Phi = one_hot_tensor(batch_codes, device_type)
154 |
155 |     if n_clusters is None:
156 |         n_clusters = int(min(100, n_cells / 30))
157 |
158 |     theta = torch.tensor([theta], dtype=torch.float, device=device_type).expand(
159 |         n_batches
160 |     )
161 |
162 |     if tau > 0:
163 |         theta = theta * (1 - torch.exp(-((N_b / (n_clusters * tau)) ** 2)))  # discounting from the Harmony paper: 1 - exp(-(N_b / (K * tau))^2)
164 |
165 |     theta = theta.view(1, -1)
166 |
167 |     assert 0 < block_proportion <= 1, "block_proportion must be a fraction in range (0, 1]!"
168 |     block_size = int(n_cells * block_proportion)
169 |
170 |     assert init_centroids_method in ["default", "harmony-paper"], "init_centroids_method must be chosen from ['default', 'harmony-paper']!"
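    # The remainder of harmonize() alternates between two phases:
    #   1. clustering(): soft k-means on the L2-normalized embedding, with an
    #      entropy regularizer (sigma) and a batch-diversity penalty (theta),
    #      updating R one random block of cells at a time;
    #   2. correction(): per-cluster ridge regression that subtracts estimated
    #      batch effects from Z, using the closed-form inverse derived in
    #      method/Method.md.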
171 |
172 |     np.random.seed(random_state)
173 |
174 |     # Initialize centroids
175 |     R, E, O, objectives_harmony = initialize_centroids(
176 |         Z_norm,
177 |         n_clusters,
178 |         sigma,
179 |         Pr_b,
180 |         Phi,
181 |         theta,
182 |         init_centroids_method,
183 |         None,
184 |         device_type,
185 |         n_jobs,
186 |     )
187 |
188 |     if verbose:
189 |         print("\tInitialization is completed.")
190 |
191 |     rng = np.random.default_rng(random_state)  # seed the block-permutation RNG so results are reproducible
192 |     for i in range(max_iter_harmony):
193 |         clustering(
194 |             Z_norm,
195 |             Pr_b,
196 |             Phi,
197 |             R,
198 |             E,
199 |             O,
200 |             theta,
201 |             tol_clustering,
202 |             objectives_harmony,
203 |             max_iter_clustering,
204 |             sigma,
205 |             block_size,
206 |             rng,
207 |         )
208 |         Z_hat = correction(Z, R, Phi, O, ridge_lambda, device_type)
209 |         Z_norm = normalize(Z_hat, p=2, dim=1)
210 |
211 |         if verbose:
212 |             print(
213 |                 "\tCompleted {cur_iter} / {total_iter} iteration(s).".format(
214 |                     cur_iter=i + 1,
215 |                     total_iter=max_iter_harmony,
216 |                 )
217 |             )
218 |
219 |         if is_convergent_harmony(objectives_harmony, tol=tol_harmony):
220 |             if verbose:
221 |                 print(f"Reached convergence after {i + 1} iteration(s).")
222 |             break
223 |
224 |     return Z_hat.numpy() if device_type == "cpu" else Z_hat.cpu().numpy()
225 |
226 |
227 | def initialize_centroids(
228 |     Z_norm,
229 |     n_clusters,
230 |     sigma,
231 |     Pr_b,
232 |     Phi,
233 |     theta,
234 |     init_centroids_method,
235 |     random_state,
236 |     device_type,
237 |     n_jobs,
238 | ):
239 |     kmeans_params = {
240 |         "n_clusters": n_clusters,
241 |         "random_state": random_state,
242 |     }
243 |     if init_centroids_method == "harmony-paper":
244 |         kmeans_params["init"] = "random"
245 |         kmeans_params["n_init"] = 10
246 |         kmeans_params["max_iter"] = 25
247 |
248 |     kmeans = KMeans(**kmeans_params)
249 |
250 |     from threadpoolctl import threadpool_limits
251 |
252 |     with threadpool_limits(limits=n_jobs):
253 |         if device_type == "cpu":
254 |             kmeans.fit(Z_norm)
255 |         else:
256 |             kmeans.fit(Z_norm.cpu())
257 |
258 |     Y = torch.tensor(kmeans.cluster_centers_, dtype=torch.float, device=device_type)
259 |     Y_norm = normalize(Y, p=2, dim=1)
260 |
261 |     # Initialize R
262 |     R = torch.exp(-2 / sigma * (1 - torch.matmul(Z_norm, Y_norm.t())))
263 |     R = normalize(R, p=1, dim=1)
264 |
265 |     E = torch.matmul(Pr_b, torch.sum(R, dim=0, keepdim=True))
266 |     O = torch.matmul(Phi.t(), R)
267 |
268 |     objectives_harmony = []
269 |     compute_objective(
270 |         Y_norm, Z_norm, R, theta, sigma, O, E, objectives_harmony
271 |     )
272 |
273 |     return R, E, O, objectives_harmony
274 |
275 |
276 | def clustering(
277 |     Z_norm,
278 |     Pr_b,
279 |     Phi,
280 |     R,
281 |     E,
282 |     O,
283 |     theta,
284 |     tol,
285 |     objectives_harmony,
286 |     max_iter,
287 |     sigma,
288 |     block_size,
289 |     rng,
290 | ):
291 |     n_cells = Z_norm.shape[0]
292 |
293 |     objectives_clustering = []
294 |
295 |     for _ in range(max_iter):
296 |         # Compute Cluster Centroids
297 |         Y = torch.matmul(R.t(), Z_norm)
298 |         Y_norm = normalize(Y, p=2, dim=1)
299 |
300 |         idx_list = rng.permutation(n_cells)
301 |         pos = 0
302 |         while pos < len(idx_list):
303 |             idx_in = idx_list[pos : (pos + block_size)]
304 |             R_in = R[idx_in,]
305 |             Phi_in = Phi[idx_in,]
306 |
307 |             # Compute O and E on left out data.
308 | O -= torch.matmul(Phi_in.t(), R_in) 309 | E -= torch.matmul(Pr_b, torch.sum(R_in, dim=0, keepdim=True)) 310 | 311 | # Update and Normalize R 312 | R_in = torch.exp( 313 | -2 / sigma * (1 - torch.matmul(Z_norm[idx_in,], Y_norm.t())) 314 | ) 315 | omega = torch.matmul(Phi_in, torch.pow(torch.div(E + 1, O + 1), theta.t())) 316 | R_in = R_in * omega 317 | R_in = normalize(R_in, p=1, dim=1) 318 | R[idx_in,] = R_in 319 | 320 | # Compute O and E with full data. 321 | O += torch.matmul(Phi_in.t(), R_in) 322 | E += torch.matmul(Pr_b, torch.sum(R_in, dim=0, keepdim=True)) 323 | 324 | pos += block_size 325 | 326 | compute_objective( 327 | Y_norm, Z_norm, R, theta, sigma, O, E, objectives_clustering 328 | ) 329 | 330 | if is_convergent_clustering(objectives_clustering, tol): 331 | break 332 | 333 | objectives_harmony.append(objectives_clustering[-1]) 334 | 335 | 336 | def correction(X, R, Phi, O, ridge_lambda, device_type): 337 | n_cells = X.shape[0] 338 | n_clusters = R.shape[1] 339 | n_batches = Phi.shape[1] 340 | Phi_1 = torch.cat((torch.ones(n_cells, 1, device=device_type), Phi), dim=1) 341 | 342 | Z = X.clone() 343 | P = torch.eye(n_batches + 1, n_batches + 1, device=device_type) 344 | for k in range(n_clusters): 345 | O_k = O[:, k] 346 | N_k = torch.sum(O_k) 347 | 348 | factor = 1 / (O_k + ridge_lambda) 349 | c = N_k + torch.sum(-factor * O_k**2) 350 | c_inv = 1 / c 351 | 352 | P[0, 1:] = -factor * O_k 353 | 354 | P_t_B_inv = torch.diag( 355 | torch.cat( 356 | (torch.tensor([[c_inv]], device=device_type), factor.view(1, -1)), dim=1 357 | ).squeeze() 358 | ) 359 | P_t_B_inv[1:, 0] = P[0, 1:] * c_inv 360 | inv_mat = torch.matmul(P_t_B_inv, P) 361 | 362 | Phi_t_diag_R = Phi_1.t() * R[:, k].view(1, -1) 363 | W = torch.matmul(inv_mat, torch.matmul(Phi_t_diag_R, X)) 364 | W[0, :] = 0 365 | 366 | Z -= torch.matmul(Phi_t_diag_R.t(), W) 367 | 368 | return Z 369 | 370 | 371 | def compute_objective(Y_norm, Z_norm, R, theta, sigma, O, E, objective_arr): 372 | kmeans_error = torch.sum(R * 2 * (1 - torch.matmul(Z_norm, Y_norm.t()))) 373 | entropy_term = sigma * torch.sum( 374 | -torch.distributions.Categorical(probs=R).entropy() 375 | ) 376 | diversity_penalty = sigma * torch.sum( 377 | torch.matmul(theta, O * torch.log(torch.div(O + 1, E + 1))) 378 | ) 379 | objective = kmeans_error + entropy_term + diversity_penalty 380 | 381 | objective_arr.append(objective) 382 | 383 | 384 | def is_convergent_harmony(objectives_harmony, tol): 385 | if len(objectives_harmony) < 2: 386 | return False 387 | 388 | obj_old = objectives_harmony[-2] 389 | obj_new = objectives_harmony[-1] 390 | 391 | return (obj_old - obj_new) < tol * torch.abs(obj_old) 392 | 393 | 394 | def is_convergent_clustering(objectives_clustering, tol, window_size=3): 395 | if len(objectives_clustering) < window_size + 1: 396 | return False 397 | 398 | obj_old = 0 399 | obj_new = 0 400 | for i in range(window_size): 401 | obj_old += objectives_clustering[-2 - i] 402 | obj_new += objectives_clustering[-1 - i] 403 | 404 | return (obj_old - obj_new) < tol * torch.abs(obj_old) 405 | -------------------------------------------------------------------------------- /harmony/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_batch_codes(batch_mat, batch_key): 5 | if type(batch_key) is str: 6 | batch_vec = batch_mat[batch_key] 7 | 8 | elif len(batch_key) == 1: 9 | batch_key = batch_key[0] 10 | 11 | batch_vec = batch_mat[batch_key] 12 | 13 | else: 14 | df = 
batch_mat[batch_key].astype("str")
15 |         batch_vec = df.apply(lambda row: ",".join(row), axis=1)
16 |
17 |     return batch_vec.astype("category")
18 |
19 |
20 | def one_hot_tensor(X, device_type):
21 |     ids = torch.as_tensor(
22 |         X.cat.codes.values.copy(), dtype=torch.long, device=device_type
23 |     ).view(-1, 1)
24 |     n_row = X.size
25 |     n_col = X.cat.categories.size
26 |     Phi = torch.zeros(n_row, n_col, dtype=torch.float, device=device_type)
27 |     Phi.scatter_(dim=1, index=ids, value=1.0)
28 |
29 |     return Phi
30 |
--------------------------------------------------------------------------------
/method/Method.md:
--------------------------------------------------------------------------------
1 | # Harmony Algorithm
2 |
3 | Since most datasets store cell barcodes in rows, we adjusted the algorithm accordingly, so it differs slightly from the paper.
4 |
5 | In this document, we re-summarize the Harmony algorithm.
6 |
7 | ## Notations
8 |
9 | Given an embedding of $N$ cell barcodes in $d$ dimensions, coming from $B$ batches, Harmony first clusters them into $K$ clusters, then integrates the data.
10 |
11 | * $Z \in \mathbb{R}^{N \times d}$: Input embedding to be corrected by Harmony.
12 | * $\hat{Z} \in \mathbb{R}^{N \times d}$: Output embedding which is integrated.
13 | * $R \in [0, 1]^{N \times K}$: Soft cluster assignment matrix of cells (rows) to clusters (columns).
14 | * $\phi \in \{0, 1\}^{N \times B}$: One-hot assignment matrix of cells (rows) to batches (columns).
15 | * $Pr \in [0, 1]^B$: Frequency of batches.
16 | * $O \in \mathbb{R}_{\geq 0}^{B \times K}$: The observed co-occurrence matrix of cells in batches (rows) and clusters (columns).
17 | * $E \in \mathbb{R}_{\geq 0}^{B \times K}$: The expected co-occurrence matrix of cells in batches (rows) and clusters (columns), under the assumption of independence between cluster and batch assignment.
18 | * $Y \in \mathbb{R}^{K \times d}$: L2-normalized cluster centroid locations.
19 |
20 | ## Objective Function
21 |
22 | * K-Means error:
23 |
24 | ```math
25 | e_1 = \sum_{i, k} R_{ik}\ ||Z_i - Y_k||^2 \qquad \text{for} \quad \forall 1 \leq i \leq N \text{ and }\forall 1 \leq k \leq K
26 | ```
27 |
28 | Moreover, if both $Z_i$ and $Y_k$ are L2-normalized, their squared Euclidean distance reduces to a cosine distance:
29 |
30 | ```math
31 | \begin{align*}
32 | e_1 &= \sum_{i, k} R_{ik}(|Z_i|^2 + |Y_k|^2 - 2Z_i\cdot Y_k^T ) \\
33 | &= \sum_{i, k} R_{ik}(2 - 2Z_i \cdot Y_k^T) \qquad \qquad ( Z_i \text{ and } Y_k \text{ are L2-normalized}) \\
34 | &= \sum_{i, k} 2R_{ik}(1 - Z_{i} \cdot Y_{k}^T) \\
35 | &= \mathrm{sum}\big(2R * (1 - Z Y^T)\big)
36 | \end{align*}
37 | ```
38 |
39 | where $*$ is the element-wise product and $\mathrm{sum}(\cdot)$ adds up all entries of a matrix.
40 |
41 | * Cross-entropy Error:
42 |
43 | $$
44 | e_2 = \sigma \sum_{i, k} R_{ik}\log{R_{ik}} = \sigma\, \mathrm{sum}(R * \log{R})
45 | $$
46 |
47 | * Diversity Penalty:
48 |
49 | $$
50 | \begin{align*}
51 | e_3 &= \sigma \sum_{i, k} R_{ik} \sum_{b}\theta_b \phi_{ib}\log{\Big( \frac{O_{bk} + 1}{E_{bk} + 1} \Big)} \\
52 | &= \sigma \sum_{k} \theta \Big[ (\phi^T R) * \log{\Big( \frac{O + 1}{E + 1} \Big)} \Big] \\
53 | &= \sigma \sum_{k} \theta \Big[ O * \log{\Big( \frac{O + 1}{E + 1} \Big)} \Big]
54 | \end{align*}
55 | $$
56 |
57 | where $\theta = [\theta_1, ..., \theta_B]$ of shape $1 \times B$ are the discounting hyperparameters.
58 |
59 | Therefore, the objective function is
60 |
61 | $$
62 | E = e_1 + e_2 + e_3.
63 | $$
64 |
65 | ## Algorithm Structure
66 |
67 | ```python
68 | def harmonize(Z, phi):
69 |     Z_hat = Z
70 |     R, E, O = initialize_centroids(Z_hat)
71 |     while not converged:
72 |         R = clustering(Z_hat, phi)
73 |         Z_hat = correction(Z, R, phi)
74 |
75 |     return Z_hat
76 | ```
77 |
78 | ## Centroids Initialization
79 |
80 | 1. L2-normalize $\hat{Z}$ on rows.
81 |
82 | 2. $\hat{Y} = \text{kmeans}(\hat{Z}, K)$. Then L2-normalize $\hat{Y}$ on rows.
83 |
84 | 3. Initialize $R$:
85 |
86 | $$
87 | R = \exp{\Big(-\frac{2(1 - \hat{Z} \hat{Y}^T)}{\sigma}\Big)}
88 | $$
89 |
90 | Then L1-normalize $R$ on rows, so that each row sums up to 1.
91 |
92 | 4. Initialize $E$ and $O$:
93 |
94 | ```math
95 | \begin{align*}
96 | (E)_{bk} = Pr_b \cdot \sum_{i = 1}^N R_{ik} \qquad &\Rightarrow \qquad E = Pr^T \cdot [R_{\cdot 1}, \dots, R_{\cdot K}];\\
97 | (O)_{bk} = \sum_{i = 1}^N \phi_{ib}R_{ik} \qquad &\Rightarrow \qquad O = \phi^T R.
98 | \end{align*}
99 | ```
100 |
101 | 5. Compute objective value with $\hat{Y}$, $\hat{Z}$, $R$, $O$, and $E$.
102 |
103 | ## Clustering
104 |
105 | ### Block-wise Update
106 |
107 | 1. Compute $O$ and $E$ on left-out data:
108 |
109 | ```math
110 | E = E - Pr^T \cdot [R_{in, 1}, \dots, R_{in, K}], \qquad O = O - \phi_{in}^T R_{in}.
111 | ```
112 |
113 | where $R_{in, 1}, ..., R_{in, K}$ are the summations of $R_{ik}$ over cells in the current block regarding each cluster $k$.
114 |
115 | 2. Update and normalize $R$:
116 |
117 | ```math
118 | \begin{align*}
119 | R_{in} &= \exp{\Big( -\frac{2(1 - \hat{Z}_{in}\hat{Y}^T)}{\sigma} \Big)};\\
120 | \Omega &= \phi_{in} \Big( \frac{E+1}{O+1} \Big)^\Theta; \\
121 | R_{in} &= R_{in} \Omega; \\
122 | R_{in} &= \text{L1-Normalize}(R_{in}, \text{row}).
123 | \end{align*}
124 | ```
125 |
126 | where $\Theta = [\theta^T, \dots, \theta^T]$ of shape $B \times K$, and the power is taken element-wise.
127 |
128 | 3. Compute $O$ and $E$ with full data:
129 |
130 | $$
131 | E = E + Pr^T \cdot [R_{in, 1}, \dots, R_{in, K}], \qquad O = O + \phi_{in}^T R_{in}.
132 | $$
133 |
134 | 4. Update cluster centroids:
135 |
136 | $$
137 | \begin{align*}
138 | \hat{Y} &= R^T \hat{Z} \qquad \big( \hat{Y}_{kd} = \sum_{i = 1}^N R_{ik}\hat{Z}_{id} \big);\\
139 | \hat{Y} &= \text{L2-Normalize}(\hat{Y}, \text{row}).
140 | \end{align*}
141 | $$
142 |
143 | 5. Compute objective value with updated $\hat{Y}$, $\hat{Z}$, $R$, $O$, and $E$.
144 |
145 | ## Correction
146 |
147 | ### Original Method
148 |
149 | 1. Initialize $\hat{Z}$ by $Z$.
150 |
151 | 2. Let
152 |
153 | $$
154 | \phi^* = \begin{bmatrix}
155 | 1 & \phi_{11} & \cdots & \phi_{1B} \\
156 | \vdots & \vdots & \ddots & \vdots \\
157 | 1 & \phi_{N1} & \cdots & \phi_{NB}
158 | \end{bmatrix}
159 | $$
160 |
161 | 3. Cluster-wise correction:
162 |
163 | For each cluster $k$,
164 |
165 | ```math
166 | \begin{align*}
167 | R_k &= [R_{1k}, \dots, R_{Nk}];\\
168 | \Phi_{R,k}^* &= \phi^{*T} \otimes R_k;\\
169 | W_k &= (\Phi_{R,k}^* \phi^* + \lambda J)^{-1} \Phi_{R,k}^* Z;\\
170 | W_k[0, :] &= \mathbf{0};\\
171 | \hat{Z} &= \hat{Z} - \Phi_{R,k}^{*T} W_k.
172 | \end{align*}
173 | ```
174 |
175 | where $\otimes$ is row-wise multiplication of a matrix and a row vector, and
176 |
177 | $$
178 | J = \begin{bmatrix}
179 | 0 & 0 & 0 & \cdots & 0\\
180 | 0 & 1 & & & \\
181 | 0 & & 1 & & \\
182 | \vdots & & & \ddots & \\
183 | 0 & & & & 1
184 | \end{bmatrix}.
185 | $$
186 |
187 |
188 | ### Improvement
189 |
190 | We don't need to directly calculate the matrix inverse:
191 |
192 | ```math
193 | (\Phi_{R,k}^* \phi^* + \lambda J)^{-1}
194 | ```
195 |
196 | of shape $(B+1)\times(B+1)$, which can be time-consuming when the number of batches $B$ is high.
197 |
198 | Let
199 |
200 | ```math
201 | A_k = \phi^{*T}diag(R_k)\phi^* + \lambda J,
202 | ```
203 |
204 | then
205 |
206 | ```math
207 | W_k = A_k^{-1}\Phi_{R, k}^* Z.
208 | ```
209 |
210 | Since
211 |
212 | ```math
213 | \begin{align*}
214 | A_k &= \begin{bmatrix}
215 | 1 & \cdots & 1 \\
216 | \phi_{11} & \cdots & \phi_{N1} \\
217 | \vdots & \vdots & \vdots \\
218 | \phi_{1B} & \cdots & \phi_{NB}
219 | \end{bmatrix} \cdot \begin{bmatrix}
220 | R_{1k} & & \\
221 | & \ddots & \\
222 | & & R_{Nk}
223 | \end{bmatrix} \cdot \begin{bmatrix}
224 | 1 & \phi_{11} & \cdots & \phi_{1B} \\
225 | \vdots & \vdots & \ddots & \vdots \\
226 | 1 & \phi_{N1} & \cdots & \phi_{NB}
227 | \end{bmatrix} + \lambda J \\
228 | &= \begin{bmatrix}
229 | \sum_{i = 1}^N R_{ik} & \sum_{i = 1}^N \phi_{i1}R_{ik} & \cdots & \sum_{i = 1}^N \phi_{iB}R_{ik} \\
230 | \sum_{i = 1}^N \phi_{i1}R_{ik} & \sum_{i = 1}^N \phi_{i1}^2 R_{ik} & \cdots & \sum_{i = 1}^N \phi_{i1}\phi_{iB}R_{ik} \\
231 | \vdots & \vdots & \ddots & \vdots \\
232 | \sum_{i = 1}^N \phi_{iB}R_{ik} & \sum_{i = 1}^N \phi_{iB}\phi_{i1}R_{ik} & \cdots & \sum_{i = 1}^N \phi_{iB}^2R_{ik}
233 | \end{bmatrix} + \lambda J,
234 | \end{align*}
235 | ```
236 |
237 | it's easy to see that
238 |
239 | ```math
240 | \sum_{i = 1}^N \phi_{ib_1}\phi_{ib_2}R_{ik} = 0 \qquad \text{ for } \quad \forall b_1 \neq b_2
241 | ```
242 |
243 | and
244 |
245 | ```math
246 | \sum_{i = 1}^N \phi_{ib}^2 R_{ik} = \sum_{i = 1}^N \phi_{ib} R_{ik}.
247 | ```
248 |
249 | Let
250 |
251 | $$
252 | \begin{align*}
253 | N_k &= \sum_{i = 1}^N R_{ik},\\
254 | N_{bk} &= \sum_{i = 1}^N \phi_{ib}R_{ik} \qquad \Rightarrow \qquad (N_{bk})_{B \times K} = \phi^T R = O.
255 | \end{align*}
256 | $$
257 |
258 | Then we have
259 |
260 | ```math
261 | N_k = \sum_{b = 1}^B O_{bk}
262 | ```
263 |
264 | and
265 |
266 | ```math
267 | A_k = \begin{bmatrix}
268 | N_k & O_{1k} & \cdots & O_{Bk} \\
269 | O_{1k} & O_{1k} & & \\
270 | \vdots & & \ddots & \\
271 | O_{Bk} & & & O_{Bk}
272 | \end{bmatrix} + \lambda J = \begin{bmatrix}
273 | N_k & O_{1k} & \cdots & O_{Bk} \\
274 | O_{1k} & O_{1k} + \lambda & & \\
275 | \vdots & & \ddots & \\
276 | O_{Bk} & & & O_{Bk} + \lambda
277 | \end{bmatrix}.
278 | ```
279 |
280 | Let
281 |
282 | ```math
283 | P = \begin{bmatrix}
284 | 1 & -\frac{O_{1k}}{O_{1k} + \lambda} & \cdots & -\frac{O_{Bk}}{O_{Bk} + \lambda} \\
285 | & 1 & & \\
286 | & & \ddots & \\
287 | & & & 1
288 | \end{bmatrix}
289 | ```
290 |
291 | then
292 |
293 | ```math
294 | \mathcal{B}_k = PA_kP^T = \begin{bmatrix}
295 | c & & & \\
296 | & O_{1k}+\lambda & & \\
297 | & & \ddots & \\
298 | & & & O_{Bk}+\lambda
299 | \end{bmatrix},
300 | ```
301 |
302 | where
303 |
304 | ```math
305 | c = N_k - \sum_{b = 1}^B \frac{O_{bk}^2}{O_{bk}+\lambda}.
306 | ```
307 |
308 | $\mathcal{B}_k$ has inverse
309 |
310 | ```math
311 | \mathcal{B}^{-1}_k = \begin{bmatrix}
312 | c^{-1} & & & \\
313 | & \frac{1}{O_{1k}+\lambda} & & \\
314 | & & \ddots & \\
315 | & & & \frac{1}{O_{Bk}+\lambda}
316 | \end{bmatrix}.
317 | ``` 318 | 319 | Now since $P$, $A_k$ and $P^T$ are all square matrices of shape $(B+1)\times(B+1)$ and invertible, we have 320 | 321 | ```math 322 | \begin{align*} 323 | \mathcal{B}_k^{-1} &= (PA_kP^T)^{-1} \\ 324 | &= (P^T)^{-1} A_k^{-1} P^{-1}. 325 | \end{align*} 326 | ``` 327 | 328 | Thus 329 | 330 | ```math 331 | \begin{align*} 332 | A^{-1}_k &= P^T\mathcal{B}_k^{-1}P \\ 333 | &= \begin{bmatrix} 334 | c^{-1} & & & \\ 335 | -\frac{O_{1k}}{O_{1k}+\lambda}c^{-1} & \frac{1}{O_{1k}+\lambda} & & \\ 336 | \vdots & & \ddots & \\ 337 | -\frac{O_{Bk}}{O_{Bk}+\lambda}c^{-1} & & & \frac{1}{O_{Bk}+\lambda} 338 | \end{bmatrix} \cdot P 339 | \end{align*} 340 | ``` 341 | 342 | which is decomposited into a lower-triangular, a diagonal, and an upper-triangular matrix. 343 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=42", "wheel", "setuptools_scm[toml]>=3.4"] 3 | 4 | [tool.setuptools_scm] 5 | write_to = "harmony/version.py" 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from codecs import open 3 | from os import path 4 | 5 | here = path.abspath(path.dirname(__file__)) 6 | with open(path.join(here, "README.rst"), encoding="utf-8") as f: 7 | long_description = f.read() 8 | 9 | requires = [ 10 | "torch>=1.12", 11 | "numpy", 12 | "pandas", 13 | "psutil", 14 | "threadpoolctl", 15 | "scikit-learn>=0.23" 16 | ] 17 | 18 | setup( 19 | name="harmony-pytorch", 20 | use_scm_version=True, 21 | description="Pytorch implementation of Harmony algorithm on single-cell sequencing data integration", 22 | long_description=long_description, 23 | url="https://github.com/lilab-bcb/harmony-pytorch", 24 | author="Yiming Yang, Bo Li", 25 | author_email="yang.yihming@gmail.com, lijiganjun@gmail.com", 26 | classifiers=[ # https://pypi.python.org/pypi?%3Aaction=list_classifiers 27 | "Development Status :: 3 - Alpha", 28 | "Intended Audience :: Developers", 29 | "Intended Audience :: Science/Research", 30 | "License :: OSI Approved :: BSD License", 31 | "Natural Language :: English", 32 | "Operating System :: MacOS", 33 | "Operating System :: Microsoft :: Windows :: Windows 10", 34 | "Operating System :: POSIX :: Linux", 35 | "Programming Language :: Python :: 3", 36 | "Programming Language :: Python :: 3 :: Only", 37 | "Topic :: Software Development :: Build Tools", 38 | "Topic :: Scientific/Engineering :: Bio-Informatics", 39 | ], 40 | keywords="single-cell genomics data integration", 41 | packages=find_packages(), 42 | install_requires=requires, 43 | setup_requires=["setuptools_scm"], 44 | python_requires="~=3.8", 45 | ) 46 | -------------------------------------------------------------------------------- /test/gen_cell_lines.R: -------------------------------------------------------------------------------- 1 | library(harmony) 2 | 3 | metadata <- read.table("data/cell_lines/metadata.csv", header = TRUE, sep = ',') 4 | X <- read.table("data/cell_lines/pca.txt") 5 | 6 | start <- Sys.time() 7 | Z <- HarmonyMatrix(X, metadata, "dataset", do_pca = FALSE) 8 | end <- Sys.time() 9 | 10 | print(end - start) 11 | 12 | write.table(Z, file = "result/cell_lines_harmony_z.txt", quote = FALSE, row.names = FALSE, col.names = FALSE) 
-------------------------------------------------------------------------------- /test/gen_mantonbm.R: -------------------------------------------------------------------------------- 1 | library(harmony) 2 | 3 | metadata <- read.table("./data/MantonBM/metadata.csv", header = TRUE, sep = ',') 4 | X <- read.table("./data/MantonBM/pca.txt") 5 | 6 | start <- Sys.time() 7 | Z <- HarmonyMatrix(X, metadata, "Channel", do_pca = FALSE) 8 | end <- Sys.time() 9 | 10 | write.table(Z, file = "./result/MantonBM_harmony_z.txt", quote = FALSE, row.names = FALSE, col.names = FALSE) 11 | 12 | print(end - start) -------------------------------------------------------------------------------- /test/gen_pbmc.R: -------------------------------------------------------------------------------- 1 | library(harmony) 2 | 3 | metadata <- read.table("data/10x_pbmc/metadata.csv", header = TRUE, sep = ',') 4 | X <- read.table("data/10x_pbmc/pca.txt") 5 | 6 | start <- Sys.time() 7 | Z <- HarmonyMatrix(X, metadata, "Channel", do_pca = FALSE) 8 | end <- Sys.time() 9 | 10 | print(end - start) 11 | 12 | write.table(Z, file = "result/pbmc_harmony_z.txt", quote = FALSE, row.names = FALSE, col.names = FALSE) -------------------------------------------------------------------------------- /test/test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pegasus as pg 4 | import seaborn as sns 5 | import matplotlib.pyplot as plt 6 | 7 | import os, sys, time, re 8 | 9 | from harmony import harmonize 10 | from harmonypy import run_harmony 11 | from anndata import AnnData 12 | from scipy.stats import pearsonr 13 | from scipy.sparse import csr_matrix 14 | 15 | 16 | metric_dict = {"r": "Correlation", "L2": "L2 Error"} 17 | 18 | 19 | def check_metric(Z_torch, Z_py, Z_R, prefix, norm): 20 | assert Z_torch.shape == Z_py.shape and Z_py.shape == Z_R.shape 21 | 22 | metric_torch = [] 23 | for i in range(Z_torch.shape[1]): 24 | m = get_measure(Z_torch[:, i], Z_R[:, i], norm) 25 | metric_torch.append(m) 26 | 27 | print( 28 | "Mean {metric} by harmony-pytorch = {value:.4f}".format( 29 | metric=metric_dict[norm], value=np.mean(metric_torch) 30 | ) 31 | ) 32 | np.savetxt( 33 | "./result/{prefix}_{metric}_torch.txt".format(prefix=prefix, metric=norm), 34 | metric_torch, 35 | ) 36 | 37 | metric_py = [] 38 | for i in range(Z_py.shape[1]): 39 | m = get_measure(Z_py[:, i], Z_R[:, i], norm) 40 | metric_py.append(m) 41 | 42 | print( 43 | "Mean {metric} by harmonypy = {value:.4f}".format( 44 | metric=metric_dict[norm], value=np.mean(metric_py) 45 | ) 46 | ) 47 | np.savetxt( 48 | "./result/{prefix}_{metric}_py.txt".format(prefix=prefix, metric=norm), 49 | metric_py, 50 | ) 51 | 52 | 53 | def get_measure(x, base, norm): 54 | assert norm in ["r", "L2"] 55 | 56 | if norm == "r": 57 | corr, _ = pearsonr(x, base) 58 | return corr 59 | else: 60 | return np.linalg.norm(x - base) / np.linalg.norm(base) 61 | 62 | 63 | def plot_umap(adata, Z_torch, Z_py, Z_R, prefix, batch_key): 64 | if adata is not None: 65 | adata.obsm["X_torch"] = Z_torch 66 | adata.obsm["X_py"] = Z_py 67 | adata.obsm["X_harmony"] = Z_R 68 | 69 | pg.neighbors(adata, rep="torch") 70 | pg.umap(adata, rep="torch", out_basis="umap_torch") 71 | 72 | pg.neighbors(adata, rep="py") 73 | pg.umap(adata, rep="py", out_basis="umap_py") 74 | 75 | pg.neighbors(adata, rep="harmony") 76 | pg.umap(adata, rep="harmony", out_basis="umap_harmony") 77 | 78 | pg.write_output(adata, "./result/{}_result".format(prefix)) 79 | else: 
80 | print("Use precalculated AnnData result.") 81 | 82 | if os.system( 83 | "pegasus plot scatter --basis umap --attributes {attr} --alpha 0.5 ./result/{name}_result.h5ad ./plots/{name}.before.umap.pdf".format( 84 | name=prefix, attr=batch_key 85 | ) 86 | ): 87 | sys.exit(1) 88 | 89 | if os.system( 90 | "pegasus plot scatter --basis umap_torch --attributes {attr} --alpha 0.5 ./result/{name}_result.h5ad ./plots/{name}.torch.umap.pdf".format( 91 | name=prefix, attr=batch_key 92 | ) 93 | ): 94 | sys.exit(1) 95 | 96 | if os.system( 97 | "pegasus plot scatter --basis umap_py --attributes {attr} --alpha 0.5 ./result/{name}_result.h5ad ./plots/{name}.py.umap.pdf".format( 98 | name=prefix, attr=batch_key 99 | ) 100 | ): 101 | sys.exit(1) 102 | 103 | if os.system( 104 | "pegasus plot scatter --basis umap_harmony --attributes {attr} --alpha 0.5 ./result/{name}_result.h5ad ./plots/{name}.harmony.umap.pdf".format( 105 | name=prefix, attr=batch_key 106 | ) 107 | ): 108 | sys.exit(1) 109 | 110 | 111 | def test_cell_lines(): 112 | print("Testing on cell lines dataset...") 113 | 114 | z_files = [ 115 | f for f in os.listdir("./result") if re.match("cell_lines.*_z.(txt|npy)", f) 116 | ] 117 | if len(z_files) < 3 or not os.path.exists("./result/cell_lines_result.h5ad"): 118 | X = np.loadtxt("./data/cell_lines/pca.txt") 119 | df_metadata = pd.read_csv("./data/cell_lines/metadata.csv") 120 | source_loaded = True 121 | 122 | if os.path.exists("./result/cell_lines_torch_z.npy"): 123 | Z_torch = np.load("./result/cell_lines_torch_z.npy") 124 | print("Precalculated embedding by harmony-pytorch is loaded.") 125 | else: 126 | start_torch = time.time() 127 | Z_torch = harmonize(X, df_metadata, batch_key="dataset") 128 | end_torch = time.time() 129 | 130 | print( 131 | "Time spent for harmony-pytorch = {:.2f}s.".format(end_torch - start_torch) 132 | ) 133 | np.save("./result/cell_lines_torch_z.npy", Z_torch) 134 | 135 | if os.path.exists("./result/cell_lines_py_z.npy"): 136 | Z_py = np.load("./result/cell_lines_py_z.npy") 137 | print("Precalculated embedding by harmonypy is loaded.") 138 | else: 139 | start_py = time.time() 140 | ho = run_harmony(X, df_metadata, ["dataset"]) 141 | end_py = time.time() 142 | 143 | print("Time spent for harmonypy = {:.2f}s.".format(end_py - start_py)) 144 | print(ho.objective_harmony) 145 | 146 | Z_py = np.transpose(ho.Z_corr) 147 | np.save("./result/cell_lines_py_z.npy", Z_py) 148 | 149 | Z_R = np.loadtxt("./result/cell_lines_harmony_z.txt") 150 | 151 | check_metric(Z_torch, Z_py, Z_R, prefix="cell_lines", norm="r") 152 | check_metric(Z_torch, Z_py, Z_R, prefix="cell_lines", norm="L2") 153 | 154 | if os.path.exists("./result/cell_lines_result.h5ad"): 155 | adata = None 156 | else: 157 | n_obs = X.shape[0] 158 | adata = AnnData(X=csr_matrix((n_obs, 2)), obs=df_metadata) 159 | adata.obsm["X_pca"] = X 160 | 161 | pg.neighbors(adata, rep="pca") 162 | pg.umap(adata) 163 | 164 | umap_list = [f for f in os.listdir("./plots") if re.match("cell_lines.*.pdf", f)] 165 | if len(umap_list) < 4: 166 | plot_umap(adata, Z_torch, Z_py, Z_R, prefix="cell_lines", batch_key="dataset") 167 | 168 | if os.path.exists("./result/cell_lines_result.h5ad"): 169 | adata = pg.read_input("./result/cell_lines_result.h5ad", h5ad_mode="r") 170 | 171 | stat, pvalue, ac_rate = pg.calc_kBET(adata, attr="dataset", rep="harmony") 172 | print( 173 | "kBET for Harmony: statistic = {stat}, p-value = {pval}, ac rate = {ac_rate}".format( 174 | stat=stat, pval=pvalue, ac_rate=ac_rate 175 | ) 176 | ) 177 | 178 | stat, pvalue, 
ac_rate = pg.calc_kBET(adata, attr="dataset", rep="py") 179 | print( 180 | "kBET for harmonypy: statistic = {stat}, p-value = {pval}, ac rate = {ac_rate}".format( 181 | stat=stat, pval=pvalue, ac_rate=ac_rate 182 | ) 183 | ) 184 | 185 | stat, pvalue, ac_rate = pg.calc_kBET(adata, attr="dataset", rep="torch") 186 | print( 187 | "kBET for harmony-pytorch: statistic = {stat}, p-value = {pval}, ac rate = {ac_rate}".format( 188 | stat=stat, pval=pvalue, ac_rate=ac_rate 189 | ) 190 | ) 191 | 192 | 193 | def test_pbmc(): 194 | print("Testing on 10x pbmc dataset...") 195 | 196 | z_files = [f for f in os.listdir("./result") if re.match("pbmc.*_z.(txt|npy)", f)] 197 | if len(z_files) < 3 or not os.path.exists("./result/pbmc_result.h5ad"): 198 | adata = pg.read_input("./data/10x_pbmc/original_data.h5ad") 199 | 200 | if os.path.exists("./result/pbmc_torch_z.npy"): 201 | Z_torch = np.load("./result/pbmc_torch_z.npy") 202 | print("Precalculated embedding by harmony-pytorch is loaded.") 203 | else: 204 | start_torch = time.time() 205 | Z_torch = harmonize(adata.obsm["X_pca"], adata.obs, batch_key="Channel") 206 | end_torch = time.time() 207 | 208 | print( 209 | "Time spent for harmony-pytorch = {:.2f}s.".format(end_torch - start_torch) 210 | ) 211 | np.save("./result/pbmc_torch_z.npy", Z_torch) 212 | 213 | if os.path.exists("./result/pbmc_py_z.npy"): 214 | Z_py = np.load("./result/pbmc_py_z.npy") 215 | print("Precalculated embedding by harmonypy is loaded.") 216 | else: 217 | start_py = time.time() 218 | ho = run_harmony(adata.obsm["X_pca"], adata.obs, ["Channel"]) 219 | end_py = time.time() 220 | 221 | print(ho.objective_harmony) 222 | print("Time spent for harmonypy = {:.2f}s.".format(end_py - start_py)) 223 | 224 | Z_py = np.transpose(ho.Z_corr) 225 | np.save("./result/pbmc_py_z.npy", Z_py) 226 | 227 | Z_R = np.loadtxt("./result/pbmc_harmony_z.txt") 228 | 229 | check_metric(Z_torch, Z_py, Z_R, prefix="pbmc", norm="r") 230 | check_metric(Z_torch, Z_py, Z_R, prefix="pbmc", norm="L2") 231 | 232 | if os.path.exists("./result/pbmc_result.h5ad"): 233 | adata = None 234 | 235 | umap_list = [f for f in os.listdir("./plots") if re.match("pbmc.*.pdf", f)] 236 | if len(umap_list) < 4: 237 | plot_umap(adata, Z_torch, Z_py, Z_R, prefix="pbmc", batch_key="Channel") 238 | 239 | 240 | def test_mantonbm(): 241 | print("Testing on MantonBM dataset...") 242 | 243 | z_files = [ 244 | f for f in os.listdir("./result") if re.match("MantonBM.*_z.(txt|npy)", f) 245 | ] 246 | if len(z_files) < 3 or not os.path.exists("./result/MantonBM_result.h5ad"): 247 | adata = pg.read_input("./data/MantonBM/original_data.h5ad") 248 | adata.obs["Individual"] = pd.Categorical( 249 | adata.obs["Channel"].apply(lambda s: s.split("_")[0][-1]) 250 | ) 251 | 252 | if os.path.exists("./result/MantonBM_torch_z.npy"): 253 | Z_torch = np.load("./result/MantonBM_torch_z.npy") 254 | print("Precalculated embedding by harmony-pytorch is loaded.") 255 | else: 256 | start_torch = time.time() 257 | Z_torch = harmonize(adata.obsm["X_pca"], adata.obs, batch_key="Channel") 258 | end_torch = time.time() 259 | 260 | print( 261 | "Time spent for harmony-pytorch = {:.2f}s.".format(end_torch - start_torch) 262 | ) 263 | np.save("./result/MantonBM_torch_z.npy", Z_torch) 264 | 265 | if os.path.exists("./result/MantonBM_py_z.npy"): 266 | Z_py = np.load("./result/MantonBM_py_z.npy") 267 | print("Precalculated embedding by harmonypy is loaded.") 268 | else: 269 | start_py = time.time() 270 | ho = run_harmony(adata.obsm["X_pca"], adata.obs, ["Channel"]) 271 | end_py = 
time.time() 272 | 273 | print("Time spent for harmonypy = {:.2f}s.".format(end_py - start_py)) 274 | 275 | Z_py = np.transpose(ho.Z_corr) 276 | np.save("./result/MantonBM_py_z.npy", Z_py) 277 | 278 | Z_R = np.loadtxt("./result/MantonBM_harmony_z.txt") 279 | 280 | check_metric(Z_torch, Z_py, Z_R, prefix="MantonBM", norm="r") 281 | check_metric(Z_torch, Z_py, Z_R, prefix="MantonBM", norm="L2") 282 | 283 | if os.path.exists("./result/MantonBM_result.h5ad"): 284 | adata = None 285 | 286 | umap_list = [f for f in os.listdir("./plots") if re.match("MantonBM.*.pdf", f)] 287 | if len(umap_list) < 4: 288 | plot_umap(adata, Z_torch, Z_py, Z_R, prefix="MantonBM", batch_key="Individual") 289 | 290 | 291 | def gen_plot(norm): 292 | # Cell Lines 293 | metric_celllines_torch = np.loadtxt("./result/cell_lines_{}_torch.txt".format(norm)) 294 | metric_celllines_py = np.loadtxt("./result/cell_lines_{}_py.txt".format(norm)) 295 | 296 | df1 = pd.DataFrame( 297 | { 298 | "dataset": np.repeat( 299 | ["Cell Lines"], metric_celllines_torch.size + metric_celllines_py.size 300 | ), 301 | "package": np.concatenate( 302 | ( 303 | np.repeat(["Torch"], metric_celllines_torch.size), 304 | np.repeat(["Py"], metric_celllines_py.size), 305 | ), 306 | axis=0, 307 | ), 308 | "metric": np.concatenate( 309 | (metric_celllines_torch, metric_celllines_py), axis=0 310 | ), 311 | } 312 | ) 313 | 314 | # PBMC 315 | metric_pbmc_torch = np.loadtxt("./result/pbmc_{}_torch.txt".format(norm)) 316 | metric_pbmc_py = np.loadtxt("./result/pbmc_{}_py.txt".format(norm)) 317 | 318 | df2 = pd.DataFrame( 319 | { 320 | "dataset": np.repeat( 321 | ["10x PBMC"], metric_pbmc_torch.size + metric_pbmc_py.size 322 | ), 323 | "package": np.concatenate( 324 | ( 325 | np.repeat(["Torch"], metric_pbmc_torch.size), 326 | np.repeat(["Py"], metric_pbmc_py.size), 327 | ), 328 | axis=0, 329 | ), 330 | "metric": np.concatenate((metric_pbmc_torch, metric_pbmc_py), axis=0), 331 | } 332 | ) 333 | 334 | # MantonBM 335 | metric_mantonbm_torch = np.loadtxt("./result/MantonBM_{}_torch.txt".format(norm)) 336 | metric_mantonbm_py = np.loadtxt("./result/MantonBM_{}_py.txt".format(norm)) 337 | 338 | df3 = pd.DataFrame( 339 | { 340 | "dataset": np.repeat( 341 | ["Bone Marrow"], metric_mantonbm_torch.size + metric_mantonbm_py.size 342 | ), 343 | "package": np.concatenate( 344 | ( 345 | np.repeat(["Torch"], metric_mantonbm_torch.size), 346 | np.repeat(["Py"], metric_mantonbm_py.size), 347 | ), 348 | axis=0, 349 | ), 350 | "metric": np.concatenate( 351 | (metric_mantonbm_torch, metric_mantonbm_py), axis=0 352 | ), 353 | } 354 | ) 355 | 356 | df = pd.concat([df1, df2, df3]) 357 | 358 | # Plot 359 | ax = sns.violinplot( 360 | x="dataset", 361 | y="metric", 362 | hue="package", 363 | data=df, 364 | palette="muted", 365 | split=True, 366 | cut=0, 367 | ) 368 | ax.set_title( 369 | "{} between Harmonypy and Harmony-pytorch Integration".format(metric_dict[norm]) 370 | ) 371 | ax.set(xlabel="Dataset", ylabel="{} on PCs".format(metric_dict[norm])) 372 | if norm == "r": 373 | ax.set(ylim=(0.98, 1.001)) 374 | else: 375 | ax.set(ylim=(0, 0.1)) 376 | figure = ax.get_figure() 377 | legend_loc = "lower right" if norm == "r" else "upper right" 378 | figure.get_axes()[0].legend(title="Package", loc=legend_loc) 379 | figure.savefig("./plots/{}_stats.png".format(norm), dpi=400) 380 | plt.close() 381 | 382 | 383 | if __name__ == "__main__": 384 | dataset = sys.argv[1] 385 | 386 | assert dataset in ["cell_lines", "pbmc", "MantonBM", "plot"] 387 | 388 | if not os.path.exists("./result"): 389 | if 
os.system("mkdir ./result"): 390 | sys.exit(1) 391 | 392 | if not os.path.exists("./plots"): 393 | if os.system("mkdir ./plots"): 394 | sys.exit(1) 395 | 396 | if dataset == "cell_lines": 397 | test_cell_lines() 398 | elif dataset == "pbmc": 399 | test_pbmc() 400 | elif dataset == "MantonBM": 401 | test_mantonbm() 402 | else: 403 | gen_plot("r") 404 | gen_plot("L2") 405 | -------------------------------------------------------------------------------- /test/test_gpu.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pegasus as pg 3 | import pandas as pd 4 | 5 | import os, sys, time, re 6 | 7 | from harmony import harmonize 8 | from scipy.stats import pearsonr 9 | from scipy.sparse import csr_matrix 10 | from anndata import AnnData 11 | 12 | 13 | def check_metrics(Z, base, prefix): 14 | assert Z.shape == base.shape 15 | 16 | cors = [] 17 | errors = [] 18 | for i in range(Z.shape[1]): 19 | cor, _ = pearsonr(Z[:, i], base[:, i]) 20 | cors.append(cor) 21 | 22 | err = np.linalg.norm(Z[:, i] - base[:, i]) / np.linalg.norm(base[:, i]) 23 | errors.append(err) 24 | 25 | print( 26 | "For {name}, mean r = {cor:.4f}, mean L2 error = {err:.4f}.".format( 27 | name=prefix, cor=np.mean(cors), err=np.mean(errors) 28 | ) 29 | ) 30 | np.savetxt("./result/{}_r.txt".format(prefix), cors) 31 | np.savetxt("./result/{}_L2.txt".format(prefix), errors) 32 | 33 | 34 | def plot_umap(adata, Z_cpu, Z_gpu, Z_R, prefix, batch_key): 35 | if adata is not None: 36 | adata.obsm["X_cpu"] = Z_cpu 37 | adata.obsm["X_gpu"] = Z_gpu 38 | adata.obsm["X_harmony"] = Z_R 39 | 40 | pg.neighbors(adata, rep="cpu") 41 | pg.umap(adata, rep="cpu", out_basis="umap_cpu") 42 | 43 | pg.neighbors(adata, rep="gpu") 44 | pg.umap(adata, rep="gpu", out_basis="umap_gpu") 45 | 46 | pg.neighbors(adata, rep="harmony") 47 | pg.umap(adata, rep="harmony", out_basis="umap_harmony") 48 | 49 | pg.write_output(adata, "./result/{}_result".format(prefix)) 50 | else: 51 | print("Use precalculated AnnData result.") 52 | 53 | if os.system( 54 | "pegasus plot scatter --basis umap --attributes {attr} --alpha 0.5 ./result/{prefix}_result.h5ad ./plots/{prefix}.before.umap.pdf".format( 55 | attr=batch_key, prefix=prefix 56 | ) 57 | ): 58 | sys.exit(1) 59 | 60 | if os.system( 61 | "pegasus plot scatter --basis umap_cpu --attributes {attr} --alpha 0.5 ./result/{prefix}_result.h5ad ./plots/{prefix}.cpu.umap.pdf".format( 62 | attr=batch_key, prefix=prefix 63 | ) 64 | ): 65 | sys.exit(1) 66 | 67 | if os.system( 68 | "pegasus plot scatter --basis umap_gpu --attributes {attr} --alpha 0.5 ./result/{prefix}_result.h5ad ./plots/{prefix}.gpu.umap.pdf".format( 69 | attr=batch_key, prefix=prefix 70 | ) 71 | ): 72 | sys.exit(1) 73 | 74 | if os.system( 75 | "pegasus plot scatter --basis umap_harmony --attributes {attr} --alpha 0.5 ./result/{prefix}_result.h5ad ./plots/{prefix}.harmony.umap.pdf".format( 76 | attr=batch_key, prefix=prefix 77 | ) 78 | ): 79 | sys.exit(1) 80 | 81 | 82 | def test_cell_lines(): 83 | print("Testing on Cell Lines...") 84 | 85 | z_files = [ 86 | f for f in os.listdir("./result") if re.match("cell_lines.*_z.(txt|npy)", f) 87 | ] 88 | if len(z_files) < 3 or not os.path.exists("./result/cell_lines_result.h5ad"): 89 | X = np.loadtxt("./data/cell_lines/pca.txt") 90 | df_metadata = pd.read_csv("./data/cell_lines/metadata.csv") 91 | 92 | if os.path.exists("./result/cell_lines_cpu_z.npy"): 93 | Z_cpu = np.load("./result/cell_lines_cpu_z.npy") 94 | print("Precalculated CPU mode result is loaded.") 95 | 
else:
96 |             start_cpu = time.time()
97 |             Z_cpu = harmonize(X, df_metadata, "dataset")
98 |             end_cpu = time.time()
99 |
100 |             print("Time spent in CPU mode = {:.2f}s.".format(end_cpu - start_cpu))
101 |             np.save("./result/cell_lines_cpu_z.npy", Z_cpu)
102 |
103 |         if os.path.exists("./result/cell_lines_gpu_z.npy"):
104 |             Z_gpu = np.load("./result/cell_lines_gpu_z.npy")
105 |             print("Precalculated GPU mode result is loaded.")
106 |         else:
107 |             start_gpu = time.time()
108 |             Z_gpu = harmonize(X, df_metadata, "dataset", use_gpu=True)
109 |             end_gpu = time.time()
110 |
111 |             print("Time spent in GPU mode = {:.2f}s".format(end_gpu - start_gpu))
112 |             np.save("./result/cell_lines_gpu_z.npy", Z_gpu)
113 |
114 |         Z_R = np.loadtxt("./result/cell_lines_harmony_z.txt")
115 |
116 |     check_metrics(Z_cpu, Z_R, prefix="cell_lines_cpu")
117 |     check_metrics(Z_gpu, Z_R, prefix="cell_lines_gpu")
118 |
119 |     if os.path.exists("./result/cell_lines_result.h5ad"):
120 |         adata = None
121 |     else:
122 |         n_obs = X.shape[0]
123 |         adata = AnnData(X=csr_matrix((n_obs, 2)), obs=df_metadata)
124 |         adata.obsm["X_pca"] = X
125 |
126 |         pg.neighbors(adata, rep="pca")
127 |         pg.umap(adata)
128 |
129 |     umap_list = [f for f in os.listdir("./plots") if re.match("cell_lines.*.pdf", f)]
130 |     if len(umap_list) < 4:
131 |         plot_umap(adata, Z_cpu, Z_gpu, Z_R, prefix="cell_lines", batch_key="dataset")
132 |
133 |
134 | def test_pbmc():
135 |     print("Testing on 10x PBMC...")
136 |
137 |     z_files = [f for f in os.listdir("./result") if re.match("pbmc.*_z.(txt|npy)", f)]
138 |     if len(z_files) < 3:
139 |         adata = pg.read_input("./data/10x_pbmc/original_data.h5ad")
140 |
141 |         if os.path.exists("./result/pbmc_cpu_z.npy"):
142 |             Z_cpu = np.load("./result/pbmc_cpu_z.npy")
143 |             print("Precalculated CPU mode result is loaded.")
144 |         else:
145 |             start_cpu = time.time()
146 |             Z_cpu = harmonize(adata.obsm["X_pca"], adata.obs, "Channel")
147 |             end_cpu = time.time()
148 |
149 |             print("Time spent in CPU mode = {:.2f}s.".format(end_cpu - start_cpu))
150 |             np.save("./result/pbmc_cpu_z.npy", Z_cpu)
151 |
152 |         if os.path.exists("./result/pbmc_gpu_z.npy"):
153 |             Z_gpu = np.load("./result/pbmc_gpu_z.npy")
154 |             print("Precalculated GPU mode result is loaded.")
155 |         else:
156 |             start_gpu = time.time()
157 |             Z_gpu = harmonize(adata.obsm["X_pca"], adata.obs, "Channel", use_gpu=True)
158 |             end_gpu = time.time()
159 |
160 |             print("Time spent in GPU mode = {:.2f}s".format(end_gpu - start_gpu))
161 |             np.save("./result/pbmc_gpu_z.npy", Z_gpu)
162 |
163 |         Z_R = np.loadtxt("./result/pbmc_harmony_z.txt")
164 |
165 |     check_metrics(Z_cpu, Z_R, prefix="pbmc_cpu")
166 |     check_metrics(Z_gpu, Z_R, prefix="pbmc_gpu")
167 |
168 |     if os.path.exists("./result/pbmc_result.h5ad"):
169 |         adata = None
170 |
171 |     umap_list = [f for f in os.listdir("./plots") if re.match("pbmc.*.pdf", f)]
172 |     if len(umap_list) < 4:
173 |         plot_umap(adata, Z_cpu, Z_gpu, Z_R, prefix="pbmc", batch_key="Channel")
174 |
175 |
176 | def test_mantonbm():
177 |     print("Testing on MantonBM...")
178 |
179 |     z_files = [
180 |         f for f in os.listdir("./result") if re.match("MantonBM.*_z.(txt|npy)", f)
181 |     ]
182 |     if len(z_files) < 3:
183 |         adata = pg.read_input("./data/MantonBM/original_data.h5ad")
184 |         adata.obs["Individual"] = pd.Categorical(
185 |             adata.obs["Channel"].apply(lambda s: s.split("_")[0][-1])
186 |         )
187 |
188 |         if os.path.exists("./result/MantonBM_cpu_z.npy"):
189 |             Z_cpu = np.load("./result/MantonBM_cpu_z.npy")
190 |             print("Precalculated CPU mode result is loaded.")
191 |         else:
192 |             start_cpu =
time.time() 193 | Z_cpu = harmonize(adata.obsm["X_pca"], adata.obs, "Channel") 194 | end_cpu = time.time() 195 | 196 | print("Time spent in CPU mode = {:.2f}s.".format(end_cpu - start_cpu)) 197 | np.save("./result/MantonBM_cpu_z.npy", Z_cpu) 198 | 199 | if os.path.exists("./result/MantonBM_gpu_z.npy"): 200 | Z_gpu = np.load("./result/MantonBM_gpu_z.npy") 201 | print("Precalculated GPU mode result is loaded.") 202 | else: 203 | start_gpu = time.time() 204 | Z_gpu = harmonize(adata.obsm["X_pca"], adata.obs, "Channel", use_gpu=True) 205 | end_gpu = time.time() 206 | 207 | print("Time spent in GPU mode = {:.2f}s".format(end_gpu - start_gpu)) 208 | np.save("./result/MantonBM_gpu_z.npy", Z_gpu) 209 | 210 | Z_R = np.loadtxt("./result/MantonBM_harmony_z.txt") 211 | 212 | check_metrics(Z_cpu, Z_R, prefix="MantonBM_cpu") 213 | check_metrics(Z_gpu, Z_R, prefix="MantonBM_gpu") 214 | 215 | if os.path.exists("./result/MantonBM_result.h5ad"): 216 | adata = None 217 | 218 | umap_list = [f for f in os.listdir("./plots") if re.match("MantonBM.*.pdf", f)] 219 | if len(umap_list) < 4: 220 | plot_umap(adata, Z_cpu, Z_gpu, Z_R, prefix="MantonBM", batch_key="Individual") 221 | 222 | 223 | if __name__ == "__main__": 224 | dataset = sys.argv[1] 225 | 226 | assert dataset in ["cell_lines", "pbmc", "MantonBM"] 227 | if dataset == "cell_lines": 228 | test_cell_lines() 229 | elif dataset == "pbmc": 230 | test_pbmc() 231 | else: 232 | test_mantonbm() 233 | --------------------------------------------------------------------------------
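
These scripts benchmark harmony-pytorch against harmonypy and the R implementation on prepared datasets under `./data`. For a quick smoke test without those files, here is a minimal self-contained sketch (synthetic data; the sizes and the `batch` column name are made up for illustration):

```python
import numpy as np
import pandas as pd

from harmony import harmonize

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 20))  # synthetic N x d embedding
df = pd.DataFrame({"batch": rng.choice(["b1", "b2"], size=1000)})

Z_cpu = harmonize(X, df, "batch")
Z_gpu = harmonize(X, df, "batch", use_gpu=True)  # falls back to CPU without a GPU

# Per-dimension relative L2 error, mirroring check_metrics() above.
errors = np.linalg.norm(Z_gpu - Z_cpu, axis=0) / np.linalg.norm(Z_cpu, axis=0)
print("mean relative L2 error = {:.4f}".format(errors.mean()))
```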