#!/usr/bin/env python3
"""Create the next patch release of this repository with the GitHub CLI (``gh``)."""
import json
import subprocess

# Prefixes of the ``gh`` error output that are treated as "this repository has
# no release yet".  "HTTP 404:" is the prefix the script has always handled;
# newer gh versions appear to print "release not found" instead
# -- NOTE(review): confirm against the gh version installed on the CI runner.
_NO_RELEASE_PREFIXES = ("HTTP 404:", "release not found")


def get_last_version() -> str:
    """Return the version number of the last release.

    Runs ``gh release view --json tagName`` and returns the ``tagName`` field
    of the JSON it prints.

    Raises:
        subprocess.CalledProcessError: if ``gh`` exits with a non-zero status
            (``stderr`` is captured on the exception).
    """
    completed = subprocess.run(
        ["gh", "release", "view", "--json", "tagName"],
        check=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    json_string = completed.stdout.decode("utf8").strip()
    return json.loads(json_string)["tagName"]


def bump_patch_number(version_number: str) -> str:
    """Return a copy of `version_number` with the patch number incremented.

    `version_number` must consist of exactly three dot-separated parts whose
    last part is an integer, e.g. ``"1.1.7"`` -> ``"1.1.8"``.  The first two
    parts are copied verbatim, so a prefix such as ``"v1"`` is preserved.

    Raises:
        ValueError: if `version_number` does not have that shape.
    """
    try:
        major, minor, patch = version_number.split(".")
        next_patch = int(patch) + 1
    except ValueError as err:
        # Same exception type as before, but name the offending tag instead of
        # surfacing a bare "not enough values to unpack".
        raise ValueError(
            f"cannot bump patch number of {version_number!r}: "
            "expected '<major>.<minor>.<patch>' with an integer patch"
        ) from err
    return f"{major}.{minor}.{next_patch}"


def create_new_patch_release():
    """Create a new patch release on GitHub.

    The new version is the last release with its patch number incremented, or
    ``"0.0.1"`` when the project has no release yet.  Any other failure of
    ``gh release view`` is re-raised unchanged.
    """
    try:
        last_version_number = get_last_version()
    except subprocess.CalledProcessError as err:
        # stderr is bytes because get_last_version captures it without text mode;
        # guard against None in case the error did not come with captured output.
        stderr_text = (err.stderr or b"").decode("utf8", errors="replace").lstrip()
        if not stderr_text.startswith(_NO_RELEASE_PREFIXES):
            raise
        # The project doesn't have any releases yet.
        new_version_number = "0.0.1"
    else:
        new_version_number = bump_patch_number(last_version_number)

    subprocess.run(
        ["gh", "release", "create", "--generate-notes", new_version_number],
        check=True,
    )


if __name__ == "__main__":
    create_new_patch_release()
*.spec 16 | build/ 17 | 18 | # Installer logs 19 | pip-log.txt 20 | pip-delete-this-directory.txt 21 | 22 | # Unit test / coverage reports 23 | .cache 24 | 25 | # Sphinx documentation 26 | docs/_build/ 27 | 28 | # Emacs, vim 29 | .#* 30 | *.swp 31 | 32 | # Notebook Checkpoints 33 | .ipynb_checkpoints/ 34 | 35 | 36 | # Mac specific 37 | .DS_Store 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2022, Wang Lab 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include readme.rst 2 | include readme_pypi.rst 3 | include LICENSE -------------------------------------------------------------------------------- /_static/training_progress.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuangyuWangLab2021/cellDancer/fed4c0db1bf7a7314000128b0311c37301fca1d9/_static/training_progress.png -------------------------------------------------------------------------------- /dist/celldancer-1.1.4-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuangyuWangLab2021/cellDancer/fed4c0db1bf7a7314000128b0311c37301fca1d9/dist/celldancer-1.1.4-py3-none-any.whl -------------------------------------------------------------------------------- /dist/celldancer-1.1.4.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuangyuWangLab2021/cellDancer/fed4c0db1bf7a7314000128b0311c37301fca1d9/dist/celldancer-1.1.4.tar.gz -------------------------------------------------------------------------------- /dist/celldancer-1.1.7-py3-none-any.whl: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuangyuWangLab2021/cellDancer/fed4c0db1bf7a7314000128b0311c37301fca1d9/dist/celldancer-1.1.7-py3-none-any.whl -------------------------------------------------------------------------------- /dist/celldancer-1.1.7.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuangyuWangLab2021/cellDancer/fed4c0db1bf7a7314000128b0311c37301fca1d9/dist/celldancer-1.1.7.tar.gz -------------------------------------------------------------------------------- /readme.rst: -------------------------------------------------------------------------------- 1 | cellDancer - Estimating Cell-dependent RNA Velocity 2 | =========================================================================================== 3 | 4 | **cellDancer** is a modularized, parallelized, and scalable tool based on a deep learning framework for the RNA velocity analysis of scRNA-seq. Our website of tutorials is available at `cellDancer Website `_. 5 | 6 | 7 | .. image:: _static/training_progress.png 8 | :width: 100% 9 | :alt: cell_type_u_s_sample_df 10 | 11 | Cite 12 | 13 | Shengyu Li#, Pengzhi Zhang#, Weiqing Chen, Lingqun Ye, Kristopher W. Brannan, Nhat-Tu Le, Jun-ichi Abe, John P. Cooke, Guangyu Wang. A relay velocity model infers cell-dependent RNA velocity. Nature Biotechnology (2023) https://doi.org/10.1038/s41587-023-01728-5 14 | 15 | cellDancer's key applications 16 | ======================================================== 17 | * Enable accurate inference of dynamic cell state transitions in heterogeneous cell populations. 18 | * Estimate cell-specific transcription (α), splicing (β) and degradation (γ) rates for each gene and reveal RNA turnover strategies. 19 | * Improves downstream analysis such as vector field predictions. 
20 | 21 | To be done 22 | ======================================================== 23 | - [ ] Update an anndata-compatible version. 24 | 25 | What's new 26 | ======================================================== 27 | cellDancer is updated to v1.1.7 28 | 29 | * Added progress bar for adata_to_df_with_embed() and adata_to_raw(). 30 | * Added try except to catch genes with low quality in velocity(). 31 | 32 | Installation 33 | ======================================================== 34 | cellDancer requires Python version >= 3.7.6 to run. 35 | 36 | To run cellDancer locally, we recommend creating a `conda `_ environment: ``conda create -n cellDancer python==3.7.6``. Then activate the new environment with ``conda activate cellDancer``. The cellDancer package can be installed from PyPI with ``pip install celldancer``. 37 | 38 | Python 3.7 is not compatible with M1 Macs; on an M1 Mac, use ``conda create -n cellDancer python==3.9.16`` instead, a version that has been well tested with cellDancer. 39 | 40 | To install the latest version from GitHub, run: 41 | 42 | ``pip install git+https://github.com/GuangyuWangLab2021/cellDancer.git`` 43 | 44 | To install cellDancer from source code, run: 45 | 46 | ``pip install 'your_path/Source Code/cellDancer'``. 47 | 48 | For M1 Mac users: if you encounter a problem while installing bezier, please refer to the following link: https://bezier.readthedocs.io/en/2021.2.12/#installing 49 | 50 | If any other dependency could not be installed with ``pip install celldancer``, try ``pip install --no-deps celldancer``. Then install the dependencies by ``pip install -r requirements.txt`` or manually install each package in requirements.txt. 51 | 52 | To be compatible with Dynamo (optional), after first ``pip install celldancer`` and then ``pip install dynamo-release``, installing Dynamo will update numpy to 1.24.0, and we can downgrade numpy back to 1.20.0 with ``pip install numpy==1.20.0`` to let them be compatible. 
53 | 54 | Frequently asked questions 55 | ======================================================== 56 | Q: How should I prepare the input for my own data? 57 | 58 | A: The `Data Preparation `_ page introduces the details of how to prepare and pre-process your own data. 59 | 60 | Check more frequently asked questions at `FAQ `_ on our website. If you have any other questions related to your specific condition, you are welcome to post them on our GitHub `issue `_ page or email sli5@houstonmethodist.org 61 | 62 | Support 63 | ======================================================== 64 | Bug reports and suggestions are welcome on our GitHub issue page! 65 | -------------------------------------------------------------------------------- /readme_pypi.rst: -------------------------------------------------------------------------------- 1 | cellDancer - Estimating Cell-dependent RNA Velocity 2 | =========================================================================================== 3 | 4 | **cellDancer** is a modularized, parallelized, and scalable tool based on a deep learning framework for the RNA velocity analysis of scRNA-seq. Our website of tutorials is available at `cellDancer Website `_. 5 | 6 | 7 | cellDancer's key applications 8 | ======================================================== 9 | * Estimate cell-specific RNA velocity for each gene. 10 | * Derive cell fates in embedding space. 11 | * Estimate pseudotime for each cell in embedding space. 12 | 13 | What's new 14 | ======================================================== 15 | cellDancer is updated to v1.1.7 16 | 17 | * Added progress bar for adata_to_df_with_embed() and adata_to_raw(). 18 | * Added try except to catch genes with low quality in velocity(). 19 | 20 | Installation 21 | ======================================================== 22 | cellDancer requires Python version >= 3.7.6 to run. 
# Packaging script for the ``celldancer`` distribution (sources live in src/).
import setuptools

# Links shown in the project sidebar on PyPI.
project_urls = {
  'cellDancer': 'https://github.com/GuangyuWangLab2021/cellDancer',
  'Documentation':'https://guangyuwanglab2021.github.io/cellDancer_website/'
}

# The PyPI long description is the dedicated reST readme.
with open("readme_pypi.rst", "rt", encoding="utf8") as f:
    long_description = f.read()

setuptools.setup(
    name="celldancer",
    version="1.1.7",
    author="Wang Lab",
    author_email="gwang2@houstonmethodist.org",
    description="Study RNA velocity through neural network.",
    long_description=long_description,
    long_description_content_type="text/x-rst; charset=UTF-8",
    classifiers=[
        "Programming Language :: Python :: 3",
        # The repository's LICENSE file is the BSD 3-Clause License, so the
        # classifier must advertise BSD rather than MIT.
        "License :: OSI Approved :: BSD License",
        "Operating System :: OS Independent",
    ],
    project_urls = project_urls,
    package_dir={"": "src"},
    packages=setuptools.find_packages(where="src"),
    # Ship the pre-trained model weights found under any package's model/ dir.
    package_data={'': ['model/*.pt']},
    include_package_data=True,
    python_requires=">=3.7.6",
    # Kept in sync with requirements.txt.
    install_requires = ['pytorch-lightning==1.5.2',
                        'torch==1.10.0',
                        'pandas==1.3.4',
                        'numpy==1.20.3',
                        'anndata==0.8.0',
                        'tqdm==4.62.3',
                        'scikit-learn==1.0.1',
                        'scipy==1.7.2',
                        'joblib==1.1.0',
                        'scikit-image==0.19.2',
                        'statsmodels==0.13.1',
                        'matplotlib==3.5.3',
                        'seaborn==0.11.2',
                        'datashader==0.14.0',
                        'bezier==2021.2.12',
                        'umap-learn==0.5.2',
                        'jupyterlab',
                        'setuptools==59.5.0',
                        'setuptools-scm==6.3.2'
                        ]
)
import cdplt 11 | 12 | __all__ = [ 13 | "cdplt", 14 | "velocity_estimation", 15 | "pseudo_time", 16 | "diffusion", 17 | "compute_cell_velocity", 18 | "simulation", 19 | "embedding_kinetic_para", 20 | "sampling", 21 | "utilities", 22 | "simulation" 23 | ] 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /src/celldancer/cdplt.py: -------------------------------------------------------------------------------- 1 | from celldancer.plotting import * 2 | -------------------------------------------------------------------------------- /src/celldancer/compute_cell_velocity.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import random 4 | import pandas as pd 5 | import numpy as np 6 | from sklearn.neighbors import NearestNeighbors 7 | import matplotlib.pyplot as plt 8 | 9 | 10 | if __name__ == "__main__": 11 | sys.path.append('.') 12 | from sampling import * 13 | else: 14 | try: 15 | from .sampling import * 16 | except ImportError: 17 | from sampling import * 18 | 19 | 20 | def compute_cell_velocity( 21 | cellDancer_df, 22 | gene_list=None, 23 | speed_up=(60,60), 24 | expression_scale=None, 25 | projection_neighbor_size=200, 26 | projection_neighbor_choice='embedding'): 27 | 28 | """Project the RNA velocity onto the embedding space. 29 | 30 | Arguments 31 | --------- 32 | cellDancer_df: `pandas.DataFrame` 33 | Dataframe of velocity estimation results. Columns=['cellIndex', 'gene_name', unsplice', 'splice', 'unsplice_predict', 'splice_predict', 'alpha', 'beta', 'gamma', 'loss', 'cellID, 'clusters', 'embedding1', 'embedding2'] 34 | gene_list: optional, `list` (default: None) 35 | Genes selected to calculate the cell velocity. `None` if all genes in the cellDancer_df are to be used. 36 | speed_up: optional, `tuple` (default: (60,60)) 37 | Speed up by giving the sampling grid to downsample cells. 38 | `None` if all cells are used to compute cell velocity. 
39 | expression_scale: optional, `str` (default: None) 40 | `None` if no expression scale is to be used. 41 | `'power10'` if the 10th power is been used to scale spliced and unspliced reads. 42 | projection_neighbor_size: optional, `int` (default: '200') 43 | The number of neighboring cells used for the transition probability matrix for one cell. 44 | projection_neighbor_choice: optional, `str` (default: 'embedding') 45 | `'embedding'` if using the embedding space to obtain the neighbors. 46 | `'gene'` if using the spliced reads of all genes to obtain the neighbors. 47 | 48 | Returns 49 | ------- 50 | cellDancer_df: `pandas.DataFrame` 51 | The updated cellDancer_df with additional columns ['velocity1', 'velocity2']. 52 | """ 53 | 54 | def velocity_correlation(cell_matrix, velocity_matrix): 55 | """Calculate the correlation between the predict velocity (velocity_matrix[:,i]) 56 | and the difference between a cell and every other (cell_matrix - cell_matrix[:, i]) 57 | 58 | Arguments 59 | --------- 60 | cell_matrix: np.ndarray (ngenes, ncells) 61 | gene expression matrix 62 | velocity_matrix: np.ndarray (ngenes, ncells) 63 | Return 64 | --------- 65 | c_matrix: np.ndarray (ncells, ncells) 66 | """ 67 | c_matrix = np.zeros((cell_matrix.shape[1], velocity_matrix.shape[1])) 68 | for i in range(cell_matrix.shape[1]): 69 | c_matrix[i, :] = corr_coeff(cell_matrix, velocity_matrix, i)[0, :] 70 | np.fill_diagonal(c_matrix, 0) 71 | return c_matrix 72 | 73 | 74 | def velocity_projection(cell_matrix, velocity_matrix, embedding, knn_embedding): 75 | ''' 76 | cell_matrix: np.ndarray (ngenes, ncells) 77 | gene expression matrix 78 | velocity_matrix: np.ndarray (ngenes, ncells) 79 | ''' 80 | # cell_matrix = np_splice[:,sampling_ixs] 81 | # velocity_matrix = np_dMatrix[:,sampling_ixs] 82 | sigma_corr = 0.05 83 | cell_matrix[np.isnan(cell_matrix)] = 0 84 | velocity_matrix[np.isnan(velocity_matrix)] = 0 85 | corrcoef = velocity_correlation(cell_matrix, velocity_matrix) 86 | 
probability_matrix = np.exp(corrcoef / sigma_corr)*knn_embedding.A 87 | probability_matrix /= probability_matrix.sum(1)[:, None] 88 | unitary_vectors = embedding.T[:, None, :] - embedding.T[:, :, None] 89 | with np.errstate(divide='ignore', invalid='ignore'): 90 | unitary_vectors /= np.linalg.norm(unitary_vectors, ord=2, axis=0) 91 | np.fill_diagonal(unitary_vectors[0, ...], 0) 92 | np.fill_diagonal(unitary_vectors[1, ...], 0) 93 | velocity_embedding = (probability_matrix * unitary_vectors).sum(2) 94 | velocity_embedding -= (knn_embedding.A * unitary_vectors).sum(2) / \ 95 | knn_embedding.sum(1).A.T # embedding_knn.A * 96 | velocity_embedding = velocity_embedding.T 97 | return velocity_embedding 98 | 99 | # remove invalid prediction 100 | is_NaN = cellDancer_df[['alpha','beta']].isnull() 101 | row_has_NaN = is_NaN. any(axis=1) 102 | cellDancer_df = cellDancer_df[~row_has_NaN].reset_index(drop=True) 103 | 104 | if 'velocity1' in cellDancer_df.columns: 105 | del cellDancer_df['velocity1'] 106 | if 'velocity2' in cellDancer_df.columns: 107 | del cellDancer_df['velocity2'] 108 | 109 | if gene_list is None: 110 | gene_list=cellDancer_df.gene_name.drop_duplicates() 111 | 112 | 113 | # This creates a new dataframe 114 | cellDancer_df_input = cellDancer_df[cellDancer_df.gene_name.isin(gene_list)].reset_index(drop=True) 115 | np_splice_all, np_dMatrix_all= data_reshape(cellDancer_df_input) 116 | # print("(genes, cells): ", end="") 117 | # print(np_splice_all.shape) 118 | n_genes, n_cells = np_splice_all.shape 119 | 120 | # This creates a new dataframe 121 | data_df = cellDancer_df_input.loc[:, 122 | ['gene_name', 'unsplice', 'splice', 'cellID','embedding1', 'embedding2']] 123 | # random.seed(10) 124 | embedding_downsampling, sampling_ixs, knn_embedding = downsampling_embedding(data_df, 125 | para='neighbors', 126 | target_amount=0, 127 | step=speed_up, 128 | n_neighbors=projection_neighbor_size, 129 | projection_neighbor_choice=projection_neighbor_choice, 130 | 
def corr_coeff(ematrix, vmatrix, i):
    '''
    Calculate the Pearson correlation (across genes) between the predicted
    velocity of cell i (vmatrix[:, i]) and the displacement from cell i to
    every cell (ematrix[:, j] - ematrix[:, i]).

    Arguments
    ---------
    ematrix: np.ndarray (ngenes, ncells)
        gene expression matrix (cell_matrix)
    vmatrix: np.ndarray (ngenes, ncells)
        velocity matrix (velocity_matrix)
    i: int
        column index of the reference cell

    Return
    ---------
    np.ndarray (1, ncells), float64
        Entry [0, j] is the correlation for cell j. Entries whose correlation
        is undefined (zero variance in the displacement or in the velocity,
        which always includes j == i) are 0.
    '''
    # Work cell-major: one row per cell, one column per gene.
    ematrix = ematrix.T
    vmatrix = vmatrix.T

    # Displacement of every cell relative to cell i; velocity of cell i only.
    ematrix = ematrix - ematrix[i, :]
    vmatrix = vmatrix[i, :][None, :]

    # Center each row over genes.
    ematrix_m = ematrix - ematrix.mean(1)[:, None]
    vmatrix_m = vmatrix - vmatrix.mean(1)[:, None]

    # Sum of squares across rows
    ematrix_ss = (ematrix_m**2).sum(1)
    vmatrix_ss = (vmatrix_m**2).sum(1)
    cor = np.dot(ematrix_m, vmatrix_m.T)
    N = np.sqrt(np.dot(ematrix_ss[:, None], vmatrix_ss[None]))

    # `where=` without `out=` leaves the masked-out entries uninitialized, so
    # provide a zero-filled output: an undefined correlation is reported as 0
    # instead of whatever happened to be in memory.
    cor = np.divide(cor, N, out=np.zeros(cor.shape, dtype=float), where=N != 0)
    return cor.T
def velocity_normalization(downsampled_vel, all_vel=None, mode="max", NORM_ALL_CELLS=False):
    '''
    Normalize by the maximum absolute value in the downsampled_vel.

    Before scaling, every non-zero velocity in downsampled_vel is lengthened
    by a constant 0.1 * std(|v|) along its own direction; zero velocities are
    left untouched. The scaler is fitted on these boosted velocities.

    Parameters
    ----------
    downsampled_vel: 2D numpy array (n_cells, 2)
        velocities used to fit the scaler.
    all_vel: 2D numpy array, optional
        velocities of all cells; required when NORM_ALL_CELLS is True.
        They are transformed with the fitted scaler but are NOT boosted.
    mode: 'max', 'maximum' or 'maxabs'
        all three select sklearn's MaxAbsScaler.
        maxabs is meant for sparse data and/or centered at 0.
        Note in this program, it is pretty safe to do maxabs normalization
        since the data are free of extreme outliers.
    NORM_ALL_CELLS: bool
        if True, also return the normalized all_vel.

    Return
    ------
    em, or the tuple (em, em_all) when NORM_ALL_CELLS is True.

    Raises
    ------
    ValueError
        if mode is not supported, or NORM_ALL_CELLS is True without all_vel.
    '''
    if mode not in ['max', 'maximum', 'maxabs']:
        # Previously an unknown mode fell through to an unbound `transformer`.
        raise ValueError(
            "Unsupported mode %r; expected 'max', 'maximum' or 'maxabs'." % (mode,))
    if NORM_ALL_CELLS and all_vel is None:
        raise ValueError("all_vel is required when NORM_ALL_CELLS is True.")

    # add v_prime to vel of each cell without changing their directions.
    v_mag = np.linalg.norm(downsampled_vel, axis=1)
    v_prime = 0.1*np.std(v_mag)

    # for 0 velocity cell, nothing changed.
    # `where=` without `out=` leaves those entries uninitialized (possibly
    # inf/nan, and 0 * nan = nan), so start from an explicit zero ratio.
    v_prime = np.divide(v_prime, v_mag, out=np.zeros_like(v_mag), where=v_mag > 0)
    downsampled_vel = downsampled_vel*(v_prime + 1)[:,None]

    transformer = preprocessing.MaxAbsScaler().fit(downsampled_vel)
    em = transformer.transform(downsampled_vel)
    if NORM_ALL_CELLS:
        em_all = transformer.transform(all_vel)
        return em, em_all
    else:
        return em
110 | mesh = np.zeros(np.append(n_grids+1,len(n_grids))) 111 | 112 | cnt = np.zeros(n_grids+1) 113 | for index in range(cell_grid_idx.shape[0]): 114 | grid_index = cell_grid_idx[index] 115 | if np.any(grid_index > n_grids) or np.any(grid_index < 0): 116 | continue 117 | grid_index = toTuple(grid_index) 118 | mesh[grid_index] += velocity_embedding[index] 119 | cnt[grid_index] += 1 120 | cnt = cnt[:,:,None] 121 | mesh = np.divide(mesh, cnt, out=np.zeros_like(mesh), where=cnt>0.1) 122 | 123 | # the all cell embedding is used to generate mass 124 | mass = np.zeros(n_grids+1) 125 | all_cells_grid_idx, all_cells_grid_coor = \ 126 | discretize(embedding, xmin=xmin, xmax=xmax, n_grids=n_grids) 127 | n_cells = all_cells_grid_idx.shape[0] 128 | 129 | for index in range(n_cells): 130 | all_cells_grid_index = all_cells_grid_idx[index] 131 | 132 | # mass outside the grid is not needed. 133 | if np.any(all_cells_grid_index > n_grids) or np.any(all_cells_grid_index < 0): 134 | continue 135 | all_cells_grid_index = toTuple(all_cells_grid_index) 136 | mass[all_cells_grid_index] += 1 137 | 138 | # the all cell embedding is used to generate grid_umap 139 | if abr_umap is not None: 140 | grid_umap = np.full_like(mesh, np.NAN) 141 | n_umap_dims = all_cells_grid_idx.shape[-1] 142 | for index in range(n_cells): 143 | all_cells_grid_index = all_cells_grid_idx[index] 144 | if np.any(all_cells_grid_index > n_grids) or np.any(all_cells_grid_index < 0): 145 | all_cells_grid_index = toTuple(all_cells_grid_index) 146 | grid_umap[all_cells_grid_index] = np.full((1,n_umap_dims), np.NAN) 147 | pass 148 | all_cells_grid_index = toTuple(all_cells_grid_index) 149 | if np.any(np.isnan(grid_umap[all_cells_grid_index])): 150 | grid_umap[all_cells_grid_index] = np.full((1,n_umap_dims), 0) 151 | else: 152 | grid_umap[all_cells_grid_index] += abr_umap[index,:] 153 | 154 | # divide by 0 does not happen 155 | # because where-ever mass is 0, grid_umap is nan. 
def compute_path_divider_matrix(fmat, cutoff=0.3):
    '''
    Compare the feature vectors of every pair of grids.

    Parameters
    ----------
    fmat: numpy ndarray (*n_grids, n_features)
        one feature vector per grid; the last axis holds the features.
        Any number of leading grid axes is accepted.
    cutoff: float
        threshold on the Euclidean distance between two feature vectors.

    Return
    ------
    boolean numpy ndarray of shape (*n_grids, *n_grids).
    Entry [a..., b...] is True when the distance between the feature vectors
    of grid a and grid b is smaller than cutoff. Comparisons involving NaN
    are False.
    '''
    print("The cutoff for banning a path is ", cutoff)
    ngrids = fmat.shape[:-1]

    # Flatten all grid axes into one. reshape(-1, ...) works for any number
    # of grid dimensions, whereas np.multiply(*ngrids) only handled two.
    flat = fmat.reshape(-1, fmat.shape[-1])

    # pairwise[a, b] = || flat[b] - flat[a] ||
    pairwise = np.linalg.norm(flat - flat[:, None], axis=-1)

    ban = pairwise.reshape(ngrids + ngrids)
    path_divider_matrix = ban < cutoff
    return path_divider_matrix
def velocity_rotation(velocity, theta):
    '''
    Rotate a 2-D velocity by the angle theta.

    The velocity is treated as a row vector and right-multiplied by
    [[cos, sin], [-sin, cos]], so (x, y) becomes
    (x*cos(theta) - y*sin(theta), x*sin(theta) + y*cos(theta)).
    With x pointing right and y pointing up this is a counter-clockwise
    turn for positive theta. The magnitude is unchanged.

    Parameters
    ----------
    velocity
        velocity to rotate; its last axis must have length 2
    theta
        rotation angle in radians

    Return
    ------
    The rotated velocity

    '''
    c, s = np.cos(theta), np.sin(theta)
    rotation = np.array([[c, s], [-s, c]])
    return np.dot(velocity, rotation)
297 | 298 | The diffusion is stopped by any of the criteria: 299 | - reach t_total 300 | - the magnitude of the velocity is less than eps. 301 | - the cell goes to places where the cell mass <= MAX_IGNORED_MASS even after turning. 302 | - the cell is out of the simulation box 303 | 304 | Parameters 305 | ---------- 306 | 307 | cell_embedding: numpy ndarray (n_cells x n_dims) 308 | embedding coordinate for all the cells (downsampled) 309 | 310 | vel: numpy ndarray (n_grids x n_dims) 311 | pre-assigned velocity of each grid 312 | 313 | init: numpy ndarray (n_cells x n_dims) 314 | The initial position (cell_embedding) 315 | 316 | dt: float 317 | Step size of each integration time step 318 | 319 | t_total: int 320 | Total number of time steps 321 | 322 | grid_mass: numpy ndarray (n_grids x n_dims) 323 | mass of cells. 324 | 325 | eps 326 | Criterion to stop a trajectory before t_total (v_net < eps) 327 | 328 | 329 | Return 330 | ------ 331 | a numpy ndarray of coordinates in the trajectory, shape: 332 | (real_n_time_steps, n_dims) 333 | ''' 334 | 335 | np.random.seed(seed = random_seed) 336 | # print("random seed is set to, ", random_seed) 337 | THETA = np.pi/6 338 | 339 | XMIN = np.min(cell_embedding, axis=0) 340 | XMAX = np.max(cell_embedding, axis=0) 341 | N_GRIDS=(vel.shape[0]-1,vel.shape[1]-1) 342 | 343 | # lower 5% nonzero mass set to 0. 
def diffusion_on_grid_wallbound(
        cell_embedding,
        vel,
        init,
        grid_mass,
        dt=0.001,
        t_total=10000,
        eps = 1e-5):

    '''
    Simulate one trajectory through the gridded velocity field and record
    the grid coordinates it visits.

    Same idea as diffusion_off_grid_wallbound, but after every step the
    position is replaced by the second value returned by `discretize`
    (presumably the coordinate of the grid the step landed in), and that
    value is what gets appended to the trajectory. When the next step would
    land on a grid with too little mass, the velocity is turned by
    +/- 90 degrees. Every velocity read from `vel` is perturbed with
    velocity_add_random(., pi/6). No random seed is set here.

    The diffusion is stopped by any of the criteria:
    1. reach t_total
    2. the magnitude of the velocity is less than eps.
    3. the next grid has a mass below the 5th percentile of the non-zero
       grid masses (or indexing grid_mass raises IndexError) for the current
       direction and for both 90-degree turns.
    4. indexing `vel` at the new grid raises IndexError.

    Parameters
    ----------

    cell_embedding: numpy ndarray (n_cells x n_dims)
        embedding coordinate for all the cells (downsampled); only its
        per-axis min and max are used here.

    vel: numpy ndarray
        pre-assigned velocity of each grid, indexed as vel[i, j].

    init: numpy ndarray
        The initial position. NOTE(review): the code indexes its
        discretized form as a single 2-D grid index, so this looks like one
        position rather than an (n_cells x n_dims) array -- confirm with
        callers.

    dt: float
        Step size of each integration time step

    t_total: int
        Total number of time steps

    grid_mass: numpy ndarray
        mass of cells, indexed as grid_mass[i, j].

    eps
        Criterion to stop a trajectory before t_total (v_net < eps)


    Return
    ------
    a numpy ndarray of the recorded grid coordinates, one row per
    accepted step plus the starting grid.
    '''

    THETA = np.pi/6

    XMIN = np.min(cell_embedding, axis=0)
    XMAX = np.max(cell_embedding, axis=0)
    # vel has one more entry per axis than the grid count passed to discretize.
    N_GRIDS=(vel.shape[0]-1,vel.shape[1]-1)

    # Grids lighter than the 5th percentile of the non-zero masses are
    # treated as empty.
    MAX_IGNORED_MASS= np.percentile(grid_mass[grid_mass>0],5)

    def no_cells_around(xcur, xcur_d, vcur):
        # True when one step from xcur along vcur lands on a grid that is
        # too light or cannot be indexed. xcur_d is accepted but not used.
        xnxt = xcur + vcur*dt
        xnxt_d, dummy = discretize(xnxt, xmin=XMIN, xmax=XMAX, n_grids=N_GRIDS)
        try:
            # NOTE(review): NumPy wraps negative indices instead of raising
            # IndexError, so if discretize can return negative indices a
            # step past the lower edge is not caught here -- confirm.
            mass = grid_mass[xnxt_d[0], xnxt_d[1]]
        except IndexError:
            return True
        return mass < MAX_IGNORED_MASS

    x0 = init
    x0_d, x0_d_coor = discretize(x0, xmin=XMIN, xmax=XMAX, n_grids=N_GRIDS)
    v0 = vel[x0_d[0],x0_d[1]]
    v0 = velocity_add_random(v0, THETA)
    trajectory = [x0_d_coor]

    for i in range(int(t_total)):

        if np.linalg.norm(v0) < eps:
            #print("Velocity is too small")
            return np.array(trajectory)
        if no_cells_around(x0_d_coor, x0_d, v0):
            # Blocked ahead: try turning by +90 and -90 degrees. The
            # position is not advanced in this iteration.
            v0_cc = velocity_rotation(v0, np.pi/2)
            v0_c = velocity_rotation(v0, -np.pi/2)
            # nowhere to go but null
            CC = no_cells_around(x0_d_coor, x0_d, v0_cc)
            C = no_cells_around(x0_d_coor, x0_d, v0_c)
            if CC and C:
                return np.array(trajectory)
            elif not C:
                # The -90 degree turn is preferred when both are free.
                v0 = v0_c
            else:
                v0 = v0_cc

        else:
            x = x0_d_coor + v0*dt
            x_d, x_d_coor = discretize(x, xmin=XMIN, xmax=XMAX, n_grids=N_GRIDS)
            try:
                v = vel[x_d[0],x_d[1]]
                v = velocity_add_random(v, THETA)
            except IndexError:
                break

            trajectory.append(x_d_coor)
            # x0 is reassigned but not read again; x0_d keeps its initial
            # value for the whole walk.
            x0 = x_d
            x0_d_coor = x_d_coor
            v0 = v

    return np.array(trajectory)
525 | Embarrassingly parallel (process) are employed. 526 | 527 | Parameters 528 | ---------- 529 | 530 | cell_embedding: numpy.ndarray (n_cells, 2) 531 | embedding coordinate for all the cells (downsampled) 532 | 533 | vel: numpy.ndarray (ngrid, ngrid, 2) 534 | pre-assigned velocity of each grid 535 | 536 | dt: float 537 | Step size of each integration time step 538 | 539 | t_total: int 540 | Total number of time steps 541 | 542 | eps: float 543 | Criterion to stop a trajectory before t_total (v_net < eps) 544 | 545 | off_cell_init: Boolean 546 | Whether to spawn initial coordinates from the neighbouring space around a cell 547 | 548 | init_cell: list 549 | List of initial cell indices. If empty list, use all cell indices in the given cell_embedding. 550 | 551 | n_repeats: init 552 | Number of repeats (either on or off the cells) 553 | 554 | n_jobs: int 555 | Number of threads 556 | 557 | Return 558 | ------ 559 | a numpy array of trajectorys, shape: (num_trajs, *n_time_steps, 2) 560 | ''' 561 | import tqdm 562 | 563 | if psrng_seeds_diffusion is None: 564 | psrng_seeds_diffusion = [i*100+11 for i in range(n_repeats)] 565 | 566 | assert len(psrng_seeds_diffusion) >= n_repeats 567 | 568 | if n_jobs >= mp.cpu_count(): 569 | n_jobs = mp.cpu_count() 570 | 571 | if n_jobs < 0: 572 | n_jobs = mp.cpu_count() + 1 + n_jobs 573 | 574 | TASKS = list() 575 | # Setting up the TASKS 576 | n_cells = cell_embedding.shape[0] 577 | 578 | if not init_cell: 579 | init_cell = list(range(n_cells)) 580 | 581 | embedding_range = cell_embedding.max(axis=0) - cell_embedding.min(axis=0) 582 | n_grids = np.array([vel.shape[0], vel.shape[1]]) 583 | grid_size = embedding_range/n_grids 584 | 585 | n_trajs = 0 586 | for i in init_cell: 587 | for j in range(n_repeats): 588 | n_trajs += 1 589 | if off_cell_init: 590 | init_position = cell_embedding[i] + grid_size * np.random.uniform(-0.5,0.5,2) 591 | else: 592 | init_position = cell_embedding[i] 593 | TASKS.append((cell_embedding, vel, 
def embedding_kinetic_para(
    cellDancer_df,
    kinetic_para,
    umap_n=25
):
    """Calculate the UMAP based on the kinetic parameter(s).

    Arguments
    ---------
    cellDancer_df: `pandas.DataFrame`
        Dataframe of velocity estimation results. Columns=['cellIndex', 'gene_name', 'unsplice', 'splice', 'unsplice_predict', 'splice_predict', 'alpha', 'beta', 'gamma', 'loss', 'cellID', 'clusters', 'embedding1', 'embedding2']
    kinetic_para: `str`
        Choose which parameter is used to calculate embedding space, which could be selected from {'alpha', 'beta', 'gamma', 'alpha_beta_gamma'}.
    umap_n: optional, `int` (default: 25)
        The size of the local neighborhood (in terms of the number of neighboring sample points) used for manifold approximation in UMAP.

    Returns
    -------
    cellDancer_df: `pandas.DataFrame`
        A new dataframe with two additional columns, `<kinetic_para>_umap1` and `<kinetic_para>_umap2`. Columns of the same name already present in the input are replaced.

    Raises
    ------
    ValueError
        If `kinetic_para` is not one of the supported names.
    """
    single_paras = ('alpha', 'beta', 'gamma')
    if kinetic_para not in single_paras and kinetic_para != 'alpha_beta_gamma':
        # Previously this case only printed the message and then crashed
        # further down on an unbound variable.
        raise ValueError('kinetic_para should be set in one of alpha, beta, gamma, or alpha_beta_gamma.')

    # Validate before importing umap so a bad argument fails immediately.
    import umap

    umap_columns = [kinetic_para + '_umap1', kinetic_para + '_umap2']
    if set(umap_columns).issubset(cellDancer_df.columns):
        cellDancer_df = cellDancer_df.drop(columns=umap_columns)

    # One row per cell, one column per gene (per parameter).
    if kinetic_para in single_paras:
        para_df = cellDancer_df.pivot(index='cellIndex', columns='gene_name', values=kinetic_para)
    else:
        para_df = pd.concat(
            [cellDancer_df.pivot(index='cellIndex', columns='gene_name', values=para)
             for para in single_paras],
            axis=1)

    reducer = umap.UMAP(
        n_neighbors=umap_n,
        min_dist=0.1,
        n_components=2,
        metric='euclidean'
    )
    umap_para = reducer.fit_transform(para_df)

    # Key the embedding by cellIndex and look every row up by its own
    # cellIndex, instead of tiling the embedding and relying on the rows
    # being grouped by gene with the cells in sorted order.
    umap_info = pd.DataFrame(umap_para, columns=umap_columns, index=para_df.index)
    umap_col = umap_info.reindex(cellDancer_df['cellIndex'].to_numpy())
    umap_col.index = cellDancer_df.index

    cellDancer_df = pd.concat([cellDancer_df, umap_col], axis=1)
    return cellDancer_df
def scatter_cell(
    ax,
    cellDancer_df,
    colors=None,
    custom_xlim=None,
    custom_ylim=None,
    vmin=None,
    vmax=None,
    alpha=0.5,
    s = 5,
    legend_marker_size=5,
    gene=None,
    velocity=False,
    legend='off',
    colorbar='on',
    min_mass=2,
    arrow_grid=(30,30)
):

    """Plot the RNA velocity on the embedding space; or plot the kinetic parameters ('alpha', 'beta', 'gamma', 'splice', 'unsplice', or 'pseudotime') of one gene on the embedding space.

    Arguments
    ---------
    ax: `ax`
        ax of plt.subplots()
    cellDancer_df: `pandas.DataFrame`
        Dataframe of velocity estimation, cell velocity, and pseudotime results. Columns=['cellIndex', 'gene_name', 'unsplice', 'splice', 'unsplice_predict', 'splice_predict', 'alpha', 'beta', 'gamma', 'loss', 'cellID', 'clusters', 'embedding1', 'embedding2', 'velocity1', 'velocity2', 'pseudotime']
    colors: `list`, `tuple`, `dict`, `str`, or `None`
        When the input is a list or tuple: build a colormap dictionary for the given cell types;
        When the input is a dictionary: it is the customized color map dictionary of each cell type;
        When the input is a str: one of {'alpha', 'beta', 'gamma', 'splice', 'unsplice', 'pseudotime'} is used as input;
        When the input is None: all cells are drawn in grey.
    custom_xlim: optional, pair of `float` (default: None)
        Set the x limit of the current axes.
    custom_ylim: optional, pair of `float` (default: None)
        Set the y limit of the current axes.
    vmin: optional, `float` (default: None)
        Set the minimum color limit of the current image.
    vmax: optional, `float` (default: None)
        Set the maximum color limit of the current image.
    alpha: optional, `float` (default: 0.5)
        The alpha blending value, between 0 (transparent) and 1 (opaque).
    s: optional, `float` (default: 5)
        The marker size.
    legend_marker_size: optional, `float` (default: 5)
        The legend marker size.
    gene: optional, `str` (default: None)
        Gene name for plotting. Required when `colors` is one of 'alpha', 'beta', 'gamma', 'splice', 'unsplice'.
    velocity: optional, `bool` (default: False)
        `True` if plot velocity.
    legend: optional, `str` (default: 'off')
        Only used when `colors` is a list, tuple, or dict.
        `'off'` if the color map of cell legend is not plotted.
        `'only'` if only plot the cell type legend.
        Any other value plots the legend together with the cells.
    colorbar: optional, `str` (default: 'on')
        `'on'` if the colorbar of the plot of `alpha`, `beta`, `gamma`, `splice`, or `unsplice` is to be shown. `'off'` if the colorbar is to be not shown.
    min_mass: optional, `float` (default: 2)
        Filter by using the isotropic gaussian kernel to display the arrow on grids. The lower the min_mass, the more arrows.
    arrow_grid: optional, `tuple` (default: (30,30))
        The sparsity of the grids of velocity arrows. The larger, the more compact, and more arrows will be shown.

    Returns
    -------
    ax: matplotlib.axes.Axes
        The axes that was drawn on. When `legend` is `'only'` (and `colors` is a list, tuple, or dict) the legend object is returned instead and nothing else is drawn.

    Raises
    ------
    ValueError
        If `colors` is a str that is not one of the supported names.
    TypeError
        If `colors` is not a list, tuple, dict, str, or None.
    """

    def gen_Line2D(label, markerfacecolor):
        return Line2D([0], [0], color='w', marker='o', label=label,
            markerfacecolor=markerfacecolor,
            markeredgewidth=0,
            markersize=legend_marker_size)

    if isinstance(colors, (list, tuple)):
        # build a colormap for a list of clusters as input
        colors = build_colormap(colors)

    if isinstance(colors, dict):
        if legend != 'off':
            legend_elements = [gen_Line2D(i, colors[i]) for i in colors]
            lgd = ax.legend(handles=legend_elements,
                bbox_to_anchor=(1.01, 1),
                loc='upper left')
            if legend == 'only':
                return lgd

        c = np.vectorize(colors.get)(extract_from_df(cellDancer_df, 'clusters', gene))
        cmap = ListedColormap(list(colors.values()))
    elif isinstance(colors, str):
        if colors in ('alpha', 'beta', 'gamma'):
            assert gene, '\nError! gene is required!\n'
            cmap = LinearSegmentedColormap.from_list("mycmap", colors_alpha_beta_gamma)
        elif colors in ('splice', 'unsplice'):
            assert gene, '\nError! gene is required!\n'
            cmap = LinearSegmentedColormap.from_list("mycmap",
                    colors_splice_unsplice)
        elif colors == 'pseudotime':
            cmap = 'viridis'
        else:
            # Previously this fell through and failed later on an unbound
            # `cmap`; report the real problem instead.
            raise ValueError(
                "colors should be one of 'alpha', 'beta', 'gamma', 'splice', "
                "'unsplice', or 'pseudotime', got %r" % (colors,))
        c = extract_from_df(cellDancer_df, [colors], gene)
    elif colors is None:
        cmap = None
        c = 'Grey'
    else:
        raise TypeError(
            "colors should be a list, tuple, dict, str, or None, got %s"
            % type(colors).__name__)

    embedding = extract_from_df(cellDancer_df, ['embedding1', 'embedding2'], gene)
    n_cells = embedding.shape[0]

    im = ax.scatter(embedding[:, 0],
                    embedding[:, 1],
                    c=c,
                    cmap=cmap,
                    s=s,
                    vmin=vmin,
                    vmax=vmax,
                    alpha=alpha,
                    edgecolor="none")

    if colorbar == 'on' and isinstance(colors, str):
        # A thin, tick-less horizontal colorbar overlapping the top edge.
        ax_divider = make_axes_locatable(ax)
        cax = ax_divider.append_axes("top", size="5%", pad="-5%")
        cbar = plt.colorbar(im, cax=cax, orientation="horizontal", shrink=0.1)
        cbar.set_ticks([])

    if velocity:
        # Only the cells that carry a velocity value are used to place the
        # arrows. NOTE(review): the index labels are used as positions into
        # `embedding`, which presumably relies on a default 0..n-1 index --
        # confirm against how cellDancer_df is built.
        sample_cells = cellDancer_df['velocity1'][:n_cells].dropna().index
        embedding_ds = embedding[sample_cells]
        velocity_embedding = extract_from_df(cellDancer_df, ['velocity1', 'velocity2'], gene)
        grid_curve(ax, embedding_ds, velocity_embedding, arrow_grid, min_mass)

    if custom_xlim is not None:
        ax.set_xlim(custom_xlim[0], custom_xlim[1])
    if custom_ylim is not None:
        ax.set_ylim(custom_ylim[0], custom_ylim[1])

    return ax
M + 0.025 * np.abs(M - m) 176 | gr = np.linspace(m, M, steps[dim_i]) 177 | grs.append(gr) 178 | 179 | meshes_tuple = np.meshgrid(*grs) 180 | gridpoints_coordinates = np.vstack( 181 | [i.flat for i in meshes_tuple]).T 182 | 183 | n_neighbors = int(velocity_embedding.shape[0]/3) 184 | dists_head, neighs_head = find_nn_neighbors( 185 | embedding_ds, gridpoints_coordinates, n_neighbors) 186 | dists_tail, neighs_tail = find_nn_neighbors( 187 | embedding_ds+velocity_embedding, gridpoints_coordinates, 188 | n_neighbors) 189 | std = np.mean([(g[1] - g[0]) for g in grs]) 190 | 191 | # isotropic gaussian kernel 192 | gaussian_w_head = normal.pdf( 193 | loc=0, scale=smooth * std, x=dists_head) 194 | total_p_mass_head = gaussian_w_head.sum(1) 195 | gaussian_w_tail = normal.pdf( 196 | loc=0, scale=smooth * std, x=dists_tail) 197 | total_p_mass_tail = gaussian_w_tail.sum(1) 198 | 199 | 200 | UZ_head = (velocity_embedding[neighs_head] * gaussian_w_head[:, :, None]).sum( 201 | 1) / np.maximum(1, total_p_mass_head)[:, None] # weighed average 202 | UZ_tail = (velocity_embedding[neighs_tail] * gaussian_w_tail[:, :, None]).sum( 203 | 1) / np.maximum(1, total_p_mass_tail)[:, None] # weighed average 204 | 205 | XY = gridpoints_coordinates 206 | 207 | dists_head2, neighs_head2 = find_nn_neighbors( 208 | embedding_ds, XY+UZ_head, n_neighbors) 209 | dists_tail2, neighs_tail2 = find_nn_neighbors( 210 | embedding_ds, XY-UZ_tail, n_neighbors) 211 | 212 | gaussian_w_head2 = normal.pdf( 213 | loc=0, scale=smooth * std, x=dists_head2) 214 | total_p_mass_head2 = gaussian_w_head2.sum(1) 215 | gaussian_w_tail2 = normal.pdf( 216 | loc=0, scale=smooth * std, x=dists_tail2) 217 | total_p_mass_tail2 = gaussian_w_tail2.sum(1) 218 | 219 | UZ_head2 = (velocity_embedding[neighs_head2] * gaussian_w_head2[:, :, None]).sum( 220 | 1) / np.maximum(1, total_p_mass_head2)[:, None] # weighed average 221 | UZ_tail2 = (velocity_embedding[neighs_tail2] * gaussian_w_tail2[:, :, None]).sum( 222 | 1) / np.maximum(1, 
total_p_mass_tail2)[:, None] # weighed average 223 | 224 | mass_filter = total_p_mass_head < min_mass 225 | 226 | # filter dots 227 | UZ_head_filtered = UZ_head[~mass_filter, :] 228 | UZ_tail_filtered = UZ_tail[~mass_filter, :] 229 | UZ_head2_filtered = UZ_head2[~mass_filter, :] 230 | UZ_tail2_filtered = UZ_tail2[~mass_filter, :] 231 | XY_filtered = XY[~mass_filter, :] 232 | return(XY_filtered, UZ_head_filtered, UZ_tail_filtered, UZ_head2_filtered, UZ_tail2_filtered, mass_filter, grs) 233 | 234 | XY_filtered, UZ_head_filtered, UZ_tail_filtered, UZ_head2_filtered, UZ_tail2_filtered, mass_filter, grs = calculate_two_end_grid( 235 | embedding_ds, velocity_embedding, smooth=0.8, steps=arrow_grid, min_mass=min_mass) 236 | 237 | # connect two end grid to curve 238 | n_curves = XY_filtered.shape[0] 239 | s_vals = np.linspace(0.0, 1.5, 15) # TODO check last 240 | # get longest distance len and norm ratio 241 | XYM = XY_filtered 242 | UVT = UZ_tail_filtered 243 | UVH = UZ_head_filtered 244 | UVT2 = UZ_tail2_filtered 245 | UVH2 = UZ_head2_filtered 246 | 247 | def norm_arrow_display_ratio(XYM, UVT, UVH, UVT2, UVH2, grs, s_vals): 248 | '''get the longest distance in prediction between the five points, 249 | and normalize by using the distance between two grids''' 250 | 251 | def distance(x, y): 252 | # calc disctnce list between a set of coordinate 253 | calculate_square = np.subtract( 254 | x[0:-1], x[1:])**2 + np.subtract(y[0:-1], y[1:])**2 255 | distance_result = (calculate_square)**0.5 256 | return distance_result 257 | 258 | max_discance = 0 259 | for i in range(n_curves): 260 | nodes = np.asfortranarray([[XYM[i, 0]-UVT[i, 0]-UVT2[i, 0], XYM[i, 0]-UVT[i, 0], XYM[i, 0], XYM[i, 0]+UVH[i, 0], XYM[i, 0]+UVH[i, 0]+UVH2[i, 0]], 261 | [XYM[i, 1]-UVT[i, 1]-UVT2[i, 1], XYM[i, 1]-UVT[i, 1], XYM[i, 1], XYM[i, 1]+UVH[i, 1], XYM[i, 1]+UVH[i, 1]+UVH2[i, 1]]]) 262 | curve = bezier.Curve(nodes, degree=4) 263 | curve_dots = curve.evaluate_multi(s_vals) 264 | distance_sum = np.sum( 265 | 
def plot_kinetic_para(
    ax,
    kinetic_para,
    cellDancer_df,
    color_map=None,
    title=None,
    legend=False
):

    """Plot the UMAP calculated by the kinetic parameter(s).

    Arguments
    ---------
    ax: `ax`
        ax of plt.subplots()
    kinetic_para: `str`
        The parameter used to generate the embedding space based on UMAP, could be selected from {'alpha', 'beta', 'gamma', 'alpha_beta_gamma'}. The columns `<kinetic_para>_umap1` and `<kinetic_para>_umap2` must be present in `cellDancer_df`.
    cellDancer_df: `pandas.DataFrame`
        Dataframe of velocity estimation results. Columns=['cellIndex', 'gene_name', 'splice', 'unsplice', 'splice_predict', 'unsplice_predict', 'alpha', 'beta', 'gamma', 'loss', 'cellID', 'clusters', 'embedding1', 'embedding2']
    color_map: `dict` (optional, default: None)
        The color map dictionary of each cell type. Cell types missing from the dictionary are drawn in black. When None, a color map is built from the cell types.
    title: `str` (optional, default: None)
        Title of the plot. When None, 'UMAP of ' followed by `kinetic_para` is used.
    legend: `bool` (optional, default: False)
        `True` if the color map of cell legend is to be plotted.

    Returns
    -------
    ax: matplotlib.axes.Axes
    """
    # Every gene carries the same per-cell UMAP columns, so the rows of the
    # first gene are enough. `.iloc[0]` is positional and therefore also
    # works when the dataframe index does not contain the label 0.
    first_gene = cellDancer_df.gene_name.iloc[0]
    onegene = cellDancer_df[cellDancer_df.gene_name == first_gene]
    umap_para = onegene[[(kinetic_para+'_umap1'), (kinetic_para+'_umap2')]].to_numpy()
    onegene_cluster_info = onegene.clusters

    if color_map is None:
        from .colormap import build_colormap
        color_map = build_colormap(onegene_cluster_info)

    colors = [color_map.get(cluster, 'black') for cluster in onegene_cluster_info]

    if legend:
        markers = [Line2D([0, 0], [0, 0], color=color, marker='o', linestyle='')
                   for color in color_map.values()]
        # Attach the legend to the axes that was passed in, not to whatever
        # axes pyplot currently considers active.
        ax.legend(markers, list(color_map.keys()), numpoints=1,
                  loc='upper left', bbox_to_anchor=(1.01, 1))

    ax.scatter(umap_para[:, 0], umap_para[:, 1], c=colors, s=15, alpha=0.5, edgecolor="none")
    ax.axis('square')
    ax.axis('off')
    ax.set_title('UMAP of ' + kinetic_para if title is None else title)

    return ax
color_template = ["#08A8CE","#017351",'#56A65A',"#03c383","#aad962","#fbbf45","#ef6a32","#ed0345","#a12a5e","#710162","#3B9AB2"]


def build_colormap(cluster_list):
    """Build a ``{cluster: color}`` dictionary from ``color_template``.

    Arguments
    ---------
    cluster_list: iterable of hashable labels
        Cluster labels. Repeated labels are allowed (e.g. a per-cell
        ``clusters`` column); each distinct label gets exactly one entry.

    Returns
    -------
    colors: `dict`
        Maps every distinct label, in order of first appearance, to a color
        of ``color_template``. The palette is reused from its start when
        there are more distinct labels than colors. An empty input gives an
        empty dictionary.
    """
    from itertools import cycle

    # Deduplicate first: assigning colors by position in the raw list lets
    # the color of a cluster depend on where its last cell happens to sit,
    # so two different clusters could end up sharing a color even though
    # unused colors were still available.
    unique_clusters = list(dict.fromkeys(cluster_list))
    return dict(zip(unique_clusters, cycle(color_template)))
Columns=['cellIndex', 'gene_name', 'unsplice', 'splice', 'unsplice_predict', 'splice_predict', 'alpha', 'beta', 'gamma', 'loss', 'cellID', 'clusters', 'embedding1', 'embedding2', 'velocity1', 'velocity2', 'pseudotime'] 41 | colors: `list`, `dict`, or `str` 42 | When the input is a list: build a colormap dictionary for a list of cell type; 43 | When the input is a dictionary: the customized color map dictionary of each cell type; 44 | When the input is a str: one of {'alpha', 'beta', 'gamma', 'splice', 'unsplice', 'pseudotime'} is used as value of color. 45 | custom_xlim: optional, `float` (default: None) 46 | Set the x limit of the current axes. 47 | custom_ylim: optional, `float` (default: None) 48 | Set the y limit of the current axes. 49 | vmin: optional, `float` (default: None) 50 | Set the minimum color limit of the current image. 51 | vmax: optional, `float` (default: None) 52 | Set the maximum color limit of the current image. 53 | alpha: optional, `float` (default: 0.5) 54 | The alpha blending value, between 0 (transparent) and 1 (opaque). 55 | s: optional, `float` (default: 5) 56 | The marker size. 57 | velocity: optional, `bool` (default: False) 58 | `True` if velocity in gene level is to be plotted. 59 | gene: optional, `str` (default: None) 60 | Gene selected to be plotted. 61 | legend: optional, `str` (default: 'off') 62 | `‘off’` if the color map of cell type legend is not to be plotted; 63 | `‘only’` if only plot the cell type legend. 64 | arrow_grid: optional, `tuple` (default: (15,15)) 65 | The sparsity of the grids of velocity arrows. The larger, the more compact and more arrows will be shown. 
66 | 67 | Returns 68 | ------- 69 | ax: matplotlib.axes.Axes 70 | """ 71 | 72 | def gen_Line2D(label, markerfacecolor): 73 | return Line2D([0], [0], color='w', marker='o', label=label, 74 | markerfacecolor=markerfacecolor, 75 | markeredgewidth=0, 76 | markersize=s) 77 | 78 | if isinstance(colors, list): 79 | colors = build_colormap(colors) 80 | 81 | if isinstance(colors, dict): 82 | attr = 'clusters' 83 | legend_elements= [gen_Line2D(i, colors[i]) for i in colors] 84 | if legend != 'off': 85 | lgd=ax.legend(handles=legend_elements, 86 | bbox_to_anchor=(1.01, 1), 87 | loc='upper left') 88 | bbox_extra_artists=(lgd,) 89 | if legend == 'only': 90 | return lgd 91 | else: 92 | bbox_extra_artists=None 93 | 94 | c=np.vectorize(colors.get)(extract_from_df(cellDancer_df, 'clusters')) 95 | cmap=ListedColormap(list(colors.values())) 96 | 97 | elif isinstance(colors, str): 98 | attr = colors 99 | if colors in ['alpha', 'beta', 'gamma']: 100 | assert gene, '\nError! gene is required!\n' 101 | cmap = ListedColormap(colors_alpha_beta_gamma) 102 | if colors in ['splice', 'unsplice']: 103 | assert gene, '\nError! gene is required!\n' 104 | cmap = ListedColormap(colors_splice_unsplice) 105 | if colors in ['pseudotime']: 106 | cmap = 'viridis' 107 | else: 108 | cmap = 'viridis' 109 | 110 | c = extract_from_df(cellDancer_df, [colors], gene) 111 | elif colors is None: 112 | attr = 'basic' 113 | cmap = None 114 | c = '#95D9EF' 115 | 116 | assert gene, '\nError! 
def PTO_Graph(
        ax,
        cellDancer_df,
        node_layout='forceatlas2',
        PRNG_SEED=None,
        force_iters=2000,
        use_edge_bundling=True,
        node_colors=None,
        node_sizes=5,
        edge_length=None,
        legend='off',
        colorbar='on'):

    """
    Graph visualization of selected cells reflecting their orders in
    pseudotime (PseudoTimeOrdered_Graph: PTO_Graph). Embedding and pseudotime
    of the cells are required. Each cell makes a node and the connections between
    nodes are based on their separation in the embedding space. The strength
    of a connection is inversely proportional to the pseudotime difference (the
    larger the pseudotime difference in absolute values, the weaker the
    connection).

    Example usage:

    .. code-block:: python

        from celldancer.plotting import graph
        from matplotlib import pyplot as plt
        fig, ax = plt.subplots(figsize=(10,10))
        graph.PTO_Graph(ax,
            load_cellDancer,
            node_layout='forcedirected',
            use_edge_bundling=True,
            node_colors='clusters',
            edge_length=3,
            node_sizes='pseudotime',
            colorbar='on',
            legend='on')

    In this example, we use a force-directed node layout algorithm (ForceAtlas2).
    A connection is made between any two cells within 3 (unit in the embedding).
    The resulted edge lengths indicate the time difference between nodes (the
    closer in pseudotime, the shorter the edge length). Edge bundling is applied
    to highlight important edges (trunks). The sizes of the nodes are
    proportional to the pseudotime. The nodes are colored according to their
    cell types (if given by the input data).

    Arguments
    ---------
    ax: `matplotlib.axes.Axes`
        Axes the graph is drawn on.

    cellDancer_df: `pandas.DataFrame`
        Dataframe of velocity estimation, cell velocity, and pseudotime results.
        Columns=['cellIndex', 'gene_name',
        'unsplice', 'splice',
        'unsplice_predict', 'splice_predict',
        'alpha', 'beta', 'gamma',
        'loss', 'cellID', 'clusters', 'embedding1', 'embedding2',
        'velocity1', 'velocity2', 'pseudotime']

    node_layout: optional, `str` (default: forceatlas2)
        Layout for the graph. Currently only supports the forceatlas2 and
        embedding.

        - `'forceatlas2'` or `'forcedirected'`: treat connections as forces
          between connected nodes.

        - `'embedding'`: use the embedding as positions of the nodes. Any
          value other than the force-directed names leaves the embedding
          positions untouched.

    PRNG_SEED: optional, `int`, or `None` (default: `None`)
        Seed to initialize the pseudo-random number generator.

    force_iters: optional, `int` (default: 2000)
        Number of passes for the force-directed layout calculation.

    use_edge_bundling: optional, `bool` (default: `True`)
        `True` if bundle the edges (computational demanding).
        Edge bundling allows edges to curve and groups nearby ones together
        for better visualization of the graph structure.

    node_colors: optional, `str` or `dict` (default: `None`)
        The node fill colors.
        Possible values:

        - *clusters*: color according to the clusters information of the
          respective cells.

        - *pseudotime*: colors according to the pseudotime of the
          respective cells.

        - A single color format string.

        - A dictionary mapping each cluster to a color.

        - `None`: all nodes are grey.

    edge_length: optional, `float` (default: `None`)
        The distance cutoff in the embedding between two nodes to determine
        whether an edge should be formed (edge is formed when r < *edge_length*).
        The value is forwarded unchanged to `create_nodes_edges`.
        NOTE(review): the original description of the default was truncated
        ("By default, the mean of all the cell"); confirm what `None` means.

    node_sizes: optional, `float` or `numeric list-like` or `str` (default: 5)
        The sizes of the nodes. If it is `str`, then the `str` has to be either one of those
        {`pseudotime`, `index`, `x`, `y`} read from the `nodes` dataframe;
        the column is multiplied by 200.

    legend: optional, `str` (default: 'off')
        - `'off'`/`'on'`: Exclude/include the cell type legend on the plot.
        - `'only'`: Neglect the plot and only show the cell type legend.

    colorbar: optional, `str` (default: 'on')
        - `'off'`/`'on'`: Show the colorbar in the case nodes are colored by `pseudotime`.


    Returns
    -------
    ax: matplotlib.axes.Axes
        The legend object is returned instead when `legend='only'` and the
        nodes are colored by cluster.

    """

    nodes, edges = create_nodes_edges(cellDancer_df, edge_length)

    if node_layout in ['forceatlas2', 'forcedirected']:
        # Current version of datashader.layout does not support reading a layout (x,y) and perform layout function
        # It does not support other attributes except index.
        forcedirected = forceatlas2_layout(nodes[['index']], edges,
            weight='weight', iterations=force_iters, k=0.1, seed=PRNG_SEED)
        nodes['x'] = forcedirected['x']
        nodes['y'] = forcedirected['y']

    if use_edge_bundling:
        bundle = hammer_bundle(nodes, edges)
    else:
        bundle = connect_edges(nodes, edges)

    # For plotting settings
    def gen_Line2D(label, markerfacecolor, markersize):
        return Line2D([0], [0], color='w',
            marker='o',
            label=label,
            markerfacecolor=markerfacecolor,
            markeredgewidth=0,
            markersize=markersize)

    # Numbers and lists are handed to matplotlib as they are.
    if isinstance(node_sizes, str):
        node_sizes = nodes[node_sizes].to_numpy(dtype=float)*200

    # Defaults so that every documented `node_colors` value reaches
    # ax.scatter with `c` and `cmap` defined.
    c = None
    cmap = None
    colored_by_pseudotime = False

    if isinstance(node_colors, str):
        if node_colors == 'clusters':
            # This goes to dict case afterwards
            node_colors = build_colormap(nodes['clusters'])
        elif node_colors == 'pseudotime':
            cmap = 'viridis'
            c = nodes['pseudotime'].to_numpy(dtype=float)
            colored_by_pseudotime = True
        else:
            # A single color format string applied to every node.
            c = node_colors

    if isinstance(node_colors, dict):
        legend_elements = [gen_Line2D(i, node_colors[i], 10) for i in node_colors]

        if legend != 'off':
            lgd = ax.legend(handles=legend_elements,
                bbox_to_anchor=(1.01, 1),
                loc='upper left')
            if legend == 'only':
                return lgd

        # Explicit per-node colors; no colormap is involved.
        c = nodes['clusters'].map(node_colors).to_list()

    if node_colors is None:
        c = ['Grey']*len(nodes)

    ax.plot(bundle.x, bundle.y, zorder=1, linewidth=0.3, color='blue', alpha=1)
    im = ax.scatter(nodes.x, nodes.y, c=c, cmap=cmap, s=node_sizes, zorder=2, edgecolors='k', alpha=0.5)

    # A colorbar is only meaningful for the continuous pseudotime coloring.
    if colorbar == 'on' and colored_by_pseudotime:
        ax_divider = make_axes_locatable(ax)
        cax = ax_divider.append_axes("top", size="5%", pad="-5%")
        cbar = plt.colorbar(im, cax=cax, orientation="horizontal", shrink=0.1)
        cbar.set_ticks([])
    ax.axis('off')

    return ax
def sampling_neighbors(gene_unsplice_splice,step=(30,30),percentile=25):
    """Pick a spatially even subset of cells using a jittered grid.

    A regular grid with ``step[i]`` points along dimension ``i`` is laid over
    the (slightly padded) data range and jittered with Gaussian noise. The
    cell closest to each grid point is selected, and selected cells whose
    local density estimate does not exceed the given percentile are dropped.

    Arguments
    ---------
    gene_unsplice_splice: `numpy.ndarray`
        Cell coordinates, one row per cell. Neighbors are searched on the
        first two columns.
    step: `tuple` (default: (30,30))
        Number of grid points along each dimension.
    percentile: `float` (default: 25)
        Selected cells whose density estimate is not above this percentile
        of all selected cells are discarded.

    Returns
    -------
    ix_choice: `numpy.ndarray`
        Sorted, unique row indices of the sampled cells.
    """
    from scipy.stats import norm

    def gaussian_kernel(X, mu = 0, sigma=1):
        return np.exp(-(X - mu)**2 / (2*sigma**2)) / np.sqrt(2*np.pi*sigma**2)

    grs = []
    for dim_i in range(gene_unsplice_splice.shape[1]):
        m, M = np.min(gene_unsplice_splice[:, dim_i]), np.max(gene_unsplice_splice[:, dim_i])
        m = m - 0.025 * np.abs(M - m)
        # The upper pad is computed from the already lowered minimum
        # (kept exactly as in the original).
        M = M + 0.025 * np.abs(M - m)
        grs.append(np.linspace(m, M, step[dim_i]))
    meshes_tuple = np.meshgrid(*grs)
    gridpoints_coordinates = np.vstack([i.flat for i in meshes_tuple]).T

    # The jitter is the only random draw in this function. It used to happen
    # before the seed was set, which made the sampling irreproducible; a
    # dedicated seeded generator fixes that without consuming global state.
    jitter_rng = np.random.RandomState(10)
    gridpoints_coordinates = gridpoints_coordinates + norm.rvs(
        loc=0, scale=0.15, size=gridpoints_coordinates.shape, random_state=jitter_rng)

    # Kept from the original: the global NumPy generator is reseeded as a
    # side effect, which later code may depend on.
    np.random.seed(10) # set random seed

    points = gene_unsplice_splice[:,0:2]
    # At least one neighbor must be requested, even for a single cell.
    n_neighbors = max(1, min((points.shape[0]-1), 20))

    # One fitted model serves both queries; the data is identical.
    nn = NearestNeighbors()
    nn.fit(points)

    _, ixs = nn.kneighbors(gridpoints_coordinates, n_neighbors)
    ix_choice = np.unique(ixs[:,0])

    dist, _ = nn.kneighbors(points[ix_choice], n_neighbors)

    density_estimate = gaussian_kernel(dist, mu=0, sigma=0.5).sum(1)
    bool_density = density_estimate > np.percentile(density_estimate, percentile)
    ix_choice = ix_choice[bool_density]
    return(ix_choice)
def sampling_adata(detail,
                   para,
                   target_amount=500,
                   step=(30,30)):
    """Sample cell indices from the unsplice/splice space of one gene.

    Arguments
    ---------
    detail: `pandas.DataFrame`
        Must provide the columns ``unsplice`` and ``splice``.
    para: `str`
        Sampling method, one of {'neighbors', 'inverse', 'circle', 'random'}.
    target_amount: `int` (default: 500)
        Number of cells to draw; used by every method except 'neighbors'.
    step: `tuple` (default: (30,30))
        Grid resolution; used by the 'neighbors' method only.

    Returns
    -------
    idx: `numpy.ndarray`
        Indices of the sampled cells.

    Raises
    ------
    ValueError
        If `para` is not one of the supported methods. (The original printed
        a hint and then failed with an unrelated `UnboundLocalError`.)
    """
    supported = ('neighbors', 'inverse', 'circle', 'random')
    if para not in supported:
        raise ValueError(
            "para must be one of %s, got %r" % (', '.join(supported), para))

    data_U_S = np.array(detail[["unsplice","splice"]])
    if para == 'neighbors':
        idx = sampling_neighbors(data_U_S,step)
    elif para == 'inverse':
        idx = sampling_inverse(data_U_S,target_amount)
    elif para == 'circle':
        idx = sampling_circle(data_U_S,target_amount)
    else:
        idx = sampling_random(data_U_S,target_amount)
    return(idx)
def adata_to_detail(data, para, gene):
    """Convert one gene of an anndata object into the detail format.

    Arguments
    ---------
    data: an anndata
        Source object; it is sliced by gene name through ``data.var.index``.
    para: `list`
        Layer names of the unspliced and spliced counts, in that order,
        e.g. ``['Mu', 'Ms']``.
    gene: `str`
        Name of the gene to extract.

    Returns
    -------
    detail: `pandas.DataFrame`
        Columns ``gene_name``, ``unsplice`` and ``splice``; the two count
        columns are taken from the first column of the respective layer and
        stored as float32.
    """
    gene_mask = data.var.index.isin([gene])
    single_gene = data[:, gene_mask].copy()

    columns = {'gene_name': gene}
    for column, position in (('unsplice', 0), ('splice', 1)):
        layer = single_gene.layers[para[position]]
        columns[column] = layer[:, 0].copy().astype(np.float32)

    return pd.DataFrame(columns)
onegene=data_df[data_df.gene_name==data_df.gene_name[0]] 170 | cellAmt=len(onegene) 171 | data_df_max=data_df.groupby('gene_name')[['splice','unsplice']].max().rename(columns={'splice': 'splice_max','unsplice': 'unsplice_max'}) 172 | data_df_min=data_df.groupby('gene_name')[['splice','unsplice']].min().rename(columns={'splice': 'splice_min','unsplice': 'unsplice_min'}) 173 | data_df_fin=pd.concat([data_df_max,data_df_min],axis=1).reindex(gene_order) 174 | data_df_fin=data_df_fin.loc[data_df_fin.index.repeat(cellAmt)] 175 | data_df_combined=pd.concat([data_df.reset_index(drop=True) ,data_df_fin[['splice_max','unsplice_max','splice_min','unsplice_min']].reset_index(drop=True)],axis=1) 176 | data_df_combined['unsplice_norm']='' 177 | data_df_combined['splice_norm']='' 178 | data_df_combined.unsplice_norm=(data_df_combined.unsplice-data_df_combined.unsplice_min)/(data_df_combined.unsplice_max-data_df_combined.unsplice_min) 179 | data_df_combined.splice_norm=(data_df_combined.splice-data_df_combined.splice_min)/(data_df_combined.splice_max-data_df_combined.splice_min) 180 | data_df_combined.unsplice=2**(data_df_combined.unsplice_norm*10) 181 | data_df_combined.splice=2**(data_df_combined.splice_norm*10) 182 | data_df=data_df_combined 183 | 184 | return (data_df) 185 | 186 | data_df=transfer(data_df,expression_scale) 187 | 188 | 189 | if projection_neighbor_choice=='gene': 190 | #print('using gene projection_neighbor_choice') 191 | cellID = data_df.loc[data_df['gene_name']==gene]['cellID'] 192 | data_df_pivot=data_df.pivot(index='cellID', columns='gene_name', values='splice').reindex(cellID) 193 | embedding_downsampling = data_df_pivot.iloc[idx_downSampling_embedding] 194 | elif projection_neighbor_choice=='pca': # not use 195 | from sklearn.decomposition import PCA 196 | #print('using pca projection_neighbor_choice') 197 | cellID = data_df.loc[data_df['gene_name']==gene]['cellID'] 198 | data_df_pivot=data_df.pivot(index='cellID', columns='gene_name', 
values='splice').reindex(cellID) 199 | embedding_downsampling_0 = data_df_pivot.iloc[idx_downSampling_embedding] 200 | pca=PCA(n_components=pca_n_components) 201 | pca.fit(embedding_downsampling_0) 202 | embedding_downsampling = pca.transform(embedding_downsampling_0)[:,range(pca_n_components)] 203 | elif projection_neighbor_choice=='pca_norm': 204 | from sklearn.decomposition import PCA 205 | #print('pca_norm') 206 | cellID = data_df.loc[data_df['gene_name']==gene]['cellID'] 207 | data_df_pivot=data_df.pivot(index='cellID', columns='gene_name', values='splice').reindex(cellID) 208 | embedding_downsampling_0 = data_df_pivot.iloc[idx_downSampling_embedding] 209 | pca=PCA(n_components=pca_n_components) 210 | pca.fit(embedding_downsampling_0) 211 | embedding_downsampling_trans = pca.transform(embedding_downsampling_0)[:,range(pca_n_components)] 212 | embedding_downsampling_trans_norm=(embedding_downsampling_trans - embedding_downsampling_trans.min(0)) / embedding_downsampling_trans.ptp(0)#normalize 213 | embedding_downsampling_trans_norm_mult10=embedding_downsampling_trans_norm*10 #optional 214 | embedding_downsampling=embedding_downsampling_trans_norm_mult10**5 # optional 215 | elif projection_neighbor_choice=='embedding': 216 | embedding_downsampling = embedding.iloc[idx_downSampling_embedding][['embedding1','embedding2']] 217 | 218 | elif projection_neighbor_choice =='umap': 219 | import umap 220 | #print('using umap projection_neighbor_choice') 221 | cellID = data_df.loc[data_df['gene_name']==gene]['cellID'] 222 | data_df_pivot=data_df.pivot(index='cellID', columns='gene_name', values='splice').reindex(cellID) 223 | embedding_downsampling_0 = data_df_pivot.iloc[idx_downSampling_embedding] 224 | 225 | def get_umap(df,n_neighbors=umap_n, min_dist=0.1, n_components=umap_n_components, metric='euclidean'): 226 | fit = umap.UMAP( 227 | n_neighbors=n_neighbors, 228 | min_dist=min_dist, 229 | n_components=n_components, 230 | metric=metric 231 | ) 232 | embed = 
def downsampling(data_df, gene_list, downsampling_ixs):
    """Keep the same sampled cells for every requested gene.

    Arguments
    ---------
    data_df: `pandas.DataFrame`
        Long-format table with a ``gene_name`` column.
    gene_list: iterable of `str`
        Genes to keep, in output order.
    downsampling_ixs: list-like of `int`
        Positional indices (within each gene's rows) of the cells to keep.

    Returns
    -------
    data_df_downsampled: `pandas.DataFrame`
        The selected rows of each gene stacked in `gene_list` order, with the
        original index labels. An empty `gene_list` gives an empty dataframe.
    """
    # DataFrame.append was removed in pandas 2.0 and copied the accumulated
    # frame on every iteration; collect the pieces and concatenate once.
    pieces = [
        data_df[data_df['gene_name']==gene].iloc[downsampling_ixs]
        for gene in gene_list
    ]
    if not pieces:
        # pd.concat refuses an empty list; the original returned an empty frame.
        return pd.DataFrame()
    return pd.concat(pieces)
def _simulate_without_t( u0_start, s0_start, alpha, beta, gamma, percent_start_u, percent_end_u, samples, dt=0.001, scale=1):
    """Simulate a trajectory whose time window is given in terms of u.

    With a non-zero `alpha` the two percentages measure how far u has moved
    from `u0_start` towards ``alpha/beta``; with ``alpha == 0`` they measure
    how much of `u0_start` is gone. The matching times are obtained by
    integrating dt/du and are then passed to `_simulate` together with all
    remaining arguments.
    """

    def inversed_u(u, expr):
        # dt/du, the reciprocal of du/dt = alpha - beta*u.
        return 1/(alpha - beta*u)

    if alpha != 0:
        span = alpha/beta - u0_start

        def u_at(percent):
            return u0_start + span * percent/100
    else:
        def u_at(percent):
            return u0_start * (100-percent)/100

    u_first = u_at(percent_start_u)
    u_last = u_at(percent_end_u)

    time_of_u = solve_ivp(inversed_u, [u0_start, u_last], [0], method='RK45', dense_output=True).sol
    t1 = time_of_u(u_first)[0]
    t2 = time_of_u(u_last)[0]
    return _simulate(u0_start, s0_start, alpha, beta, gamma, t1, t2, samples, dt, scale)
def two_alpha(alpha1, alpha2, beta1, beta2, gamma1, gamma2, percent_u1, percent_u2, samples1, samples2, dt=0.001, noise_level=1):
    """Simulate two consecutive phases with different kinetic parameters.

    The first phase starts at (u, s) = (0, 0) with (alpha1, beta1, gamma1);
    the second phase starts where the first one ended and uses
    (alpha2, beta2, gamma2).

    Returns
    -------
    expr: `pandas.DataFrame`
        Rows of the first phase followed by the rows of the second one,
        re-indexed from 0.
    """
    expr1, (new_u0_start, new_s0_start) = _simulate_without_t(0, 0, alpha1, beta1, gamma1, 0, percent_u1, samples1, dt, noise_level)
    expr2, _ = _simulate_without_t(new_u0_start, new_s0_start, alpha2, beta2, gamma2, 0, percent_u2, samples2, dt, noise_level)
    # DataFrame.append was removed in pandas 2.0; ignore_index gives the
    # same 0..n-1 index the original assigned by hand.
    return pd.concat([expr1, expr2], ignore_index=True)
def two_alpha2(alpha1, alpha2, beta1, beta2, gamma1, gamma2, percent_u1, percent_u2, samples1, samples2, dt=0.001, noise_level=1):
    """Simulate two independent phases that both start at (u, s) = (0, 0).

    Returns
    -------
    expr: `pandas.DataFrame`
        Rows of the (alpha1, beta1, gamma1) phase followed by the rows of the
        (alpha2, beta2, gamma2) phase, re-indexed from 0.
    """
    expr1, _ = _simulate_without_t(0, 0, alpha1, beta1, gamma1, 0, percent_u1, samples1, dt, noise_level)
    expr2, _ = _simulate_without_t(0, 0, alpha2, beta2, gamma2, 0, percent_u2, samples2, dt, noise_level)
    # DataFrame.append was removed in pandas 2.0; ignore_index gives the
    # same 0..n-1 index the original assigned by hand.
    return pd.concat([expr1, expr2], ignore_index=True)


def two_alpha3(alpha1, alpha2, beta1, beta2, gamma1, gamma2, percent_u1, percent_u2, samples1, samples2, dt=0.001, noise_level=0.02):
    """Simulate two chained phases that start from a pre-induced state.

    A preliminary run with (alpha2, beta2, gamma2) up to 99.9 percent only
    provides the starting point and is not part of the output. From there the
    (alpha1, beta1, gamma1) phase is simulated, followed by a second
    (alpha2, beta2, gamma2) phase that starts where the first one ended.

    Returns
    -------
    expr: `pandas.DataFrame`
        Rows of the two reported phases, re-indexed from 0.
    """
    _, (new_u0_start, new_s0_start) = _simulate_without_t(0, 0, alpha2, beta2, gamma2, 0, 99.9, samples2, dt, noise_level)
    expr1, (new_u0_start2, new_s0_start2) = _simulate_without_t(new_u0_start, new_s0_start, alpha1, beta1, gamma1, 0, percent_u1, samples1, dt, noise_level)
    expr2, _ = _simulate_without_t(new_u0_start2, new_s0_start2, alpha2, beta2, gamma2, 0, percent_u2, samples2, dt, noise_level)
    # DataFrame.append was removed in pandas 2.0; ignore_index gives the
    # same 0..n-1 index the original assigned by hand.
    return pd.concat([expr1, expr2], ignore_index=True)
samples) 128 | else: 129 | if last_u is None or last_s is None: 130 | print("start_u and start_s should not be None at the first line.") 131 | return None 132 | expr_tmp, (last_u, last_s) = _simulate_without_t(last_u, last_s, alpha, beta, gamma, start_pct, end_pct, samples) 133 | expr = expr.append(expr_tmp) 134 | expr.index = range(len(expr)) 135 | expr.u0, expr.s0 = _jitter(expr.u0, expr.s0, noise_level) 136 | return gene_info, expr 137 | 138 | def adata_to_detail(data, para, gene): 139 | data2 = data[:, data.var.index.isin([gene])].copy() 140 | u0 = data2.layers[para[0]][:,0].copy().astype(np.float32) 141 | s0 = data2.layers[para[1]][:,0].copy().astype(np.float32) 142 | alpha = data2.layers[para[2]][:,0].copy().astype(np.float32) 143 | beta = data2.layers[para[3]][:,0].copy().astype(np.float32) 144 | gamma = data2.layers[para[4]][:,0].copy().astype(np.float32) 145 | detail = pd.DataFrame({'gene_list':gene, 'u0':u0, 's0':s0, 'embedding1':u0, 'embedding2':s0, 'alpha':alpha, 'beta':beta, 'gamma':gamma}) 146 | #detail['beta1'] = data2.var['beta1'].to_numpy()[0] 147 | #detail['beta2'] = data2.var['beta2'].to_numpy()[0] 148 | #detail['gamma1'] = data2.var['gamma1'].to_numpy()[0] 149 | #detail['gamma2'] = data2.var['gamma2'].to_numpy()[0] 150 | detail['path1_pct'] = data2.var['path1_pct'].to_numpy()[0] 151 | detail['path2_pct'] = data2.var['path2_pct'].to_numpy()[0] 152 | return(detail) 153 | 154 | def generate(type, gene_num, alpha1, alpha2, beta1, beta2, gamma1, gamma2, path1_pct, path2_pct, path1_sample, path2_sample, noise_level): 155 | cell_num=path1_sample+path2_sample 156 | u0s, s0s, u1s, s1s, alphas, betas, gammas = pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame() 157 | gene_info = pd.DataFrame(columns = ['gene_name', 'type', 'alpha1', 'alpha2', 'beta1', 'beta2', 'gamma1', 'gamma2', 'path1_pct', 'path2_pct', 'samples']) 158 | 159 | for i in range(gene_num): 160 | samples1, samples2 = path1_sample, 
path2_sample 161 | if type == "forwad": 162 | expr = forward(alpha=alpha1, beta=beta1, gamma=gamma1, percent_u1=0.1, percent_u2=99.9, samples=samples1, noise_level=noise_level) 163 | elif type == "backward": 164 | expr = backward(alpha=alpha1, beta=beta1, gamma=gamma1, percent_u1=0.1, percent_u2=99.9, samples=samples1, noise_level=noise_level) 165 | elif type == "two_alpha": 166 | expr = two_alpha(alpha1=alpha1, alpha2=alpha2, beta1=beta1, beta2=beta2, gamma1=gamma1, gamma2=gamma2, percent_u1=path1_pct, percent_u2=path2_pct, 167 | samples1=samples1, samples2=samples2, noise_level=noise_level) 168 | elif type == "two_alpha2": 169 | expr = two_alpha2(alpha1=alpha1, alpha2=alpha2, beta1=beta1, beta2=beta2, gamma1=gamma1, gamma2=gamma2, percent_u1=path1_pct, percent_u2=path2_pct, 170 | samples1=samples1, samples2=samples2, noise_level=noise_level) 171 | elif type == "two_alpha3": 172 | expr = two_alpha3(alpha1=alpha1, alpha2=alpha2, beta1=beta1, beta2=beta2, gamma1=gamma1, gamma2=gamma2, percent_u1=path1_pct, percent_u2=path2_pct, 173 | samples1=samples1, samples2=samples2, noise_level=noise_level) 174 | elif type == "boost": 175 | expr = boost_path(alpha1=alpha1, alpha2=alpha2, beta1=beta1, beta2=beta2, gamma1=gamma1, gamma2=gamma2, percent_u1=path1_pct, percent_u2=path2_pct, 176 | samples1=samples1, samples2=samples2, noise_level=noise_level) 177 | else: 178 | print("type not match") 179 | expr.u0, expr.s0 = _jitter(expr.u0, expr.s0, noise_level) 180 | expr = expr.head(cell_num) 181 | gene_name = "simulation"+str(i).zfill(3) 182 | u0s[gene_name] = expr.u0 183 | s0s[gene_name] = expr.s0 184 | u1s[gene_name] = expr.u1 185 | s1s[gene_name] = expr.s1 186 | alphas[gene_name] = expr.alpha 187 | betas[gene_name] = expr.beta 188 | gammas[gene_name] = expr.gamma 189 | gene_info = gene_info.append({'gene_name':gene_name, 'type':"multi_path", 'alpha1':alpha1, 'alpha2':alpha2, 'beta1':beta1, 'beta2':beta2, 'gamma1':gamma1, 'gamma2':gamma2, 'path1_pct':path1_pct, 
'path2_pct':path2_pct, 'samples':len(expr)}, ignore_index=True) 190 | 191 | #gene_info.set_index("gene_name") 192 | cell_info = pd.DataFrame() 193 | cell_info['barcode'] = s0s.index 194 | adata = anndata.AnnData( 195 | X=s0s.to_numpy(), 196 | obs = cell_info, 197 | var = gene_info, 198 | layers = { 199 | 'u0s':u0s.to_numpy(), 200 | 's0s': s0s.to_numpy(), 201 | 'u1s':u1s.to_numpy(), 202 | 's1s': s1s.to_numpy(), 203 | 'alphas': alphas.to_numpy(), 204 | 'betas': betas.to_numpy(), 205 | 'gammas': gammas.to_numpy() } 206 | ) 207 | adata.var_names = gene_info['gene_name'] 208 | 209 | genelist_all=adata.var_names 210 | data_onegene = pd.DataFrame() 211 | for g in genelist_all: 212 | data_onegene = data_onegene.append(adata_to_detail(adata, para=['u0s', 's0s', 'alphas', 'betas', "gammas"], gene=g)) 213 | data_onegene=data_onegene.rename(columns={"u0": "unsplice", "s0": "splice","gene_list": "gene_name"}) 214 | data_onegene.loc[:,'cellID']=list(range(len(data_onegene))) 215 | data_onegene.loc[:,'clusters']=None 216 | return data_onegene 217 | 218 | def generate_mono(alpha1, alpha2, beta1, beta2, gamma1, gamma2, path1_pct, path2_pct, path1_sample, path2_sample, noise_level, gene_num=1): 219 | return generate("two_alpha", gene_num, alpha1, alpha2, beta1, beta2, gamma1, gamma2, path1_pct, path2_pct, path1_sample, path2_sample, noise_level) 220 | 221 | def generate_tran_boost(alpha1, alpha2, beta1, beta2, gamma1, gamma2, path1_pct, path2_pct, path1_sample, path2_sample, noise_level, gene_num=1): 222 | return generate("two_alpha", gene_num, alpha1, alpha2, beta1, beta2, gamma1, gamma2, path1_pct, path2_pct, path1_sample, path2_sample, noise_level) 223 | 224 | def generate_forward(alpha1, alpha2, beta1, beta2, gamma1, gamma2, path1_pct, path2_pct, path1_sample, path2_sample, noise_level, gene_num=1): 225 | return generate("two_alpha2", gene_num, alpha1, alpha2, beta1, beta2, gamma1, gamma2, path1_pct, path2_pct, path1_sample, path2_sample, noise_level) 226 | 227 | def 
generate_backward(start_s1, start_s2, start_u1, start_u2,alpha1, alpha2, beta1, beta2, gamma1, gamma2, path1_sample, path2_sample,noise_level=None): 228 | gene_info = pd.DataFrame(columns = ['gene_name', 'start_u', 'start_s', 'alpha', 'beta', 'gamma', 'start_pct', 'end_pct', 'samples']) 229 | gene_info = gene_info.append({'gene_name':'g1', 'start_u':start_u1, 'start_s':start_s1, 'alpha':alpha1, 'beta':beta1, 'gamma':gamma1, 'start_pct':0, 'end_pct':99, 'samples':path1_sample}, ignore_index=True) 230 | gene_info = gene_info.append({'gene_name':'g1', 'start_u':start_u2, 'start_s':start_s2, 'alpha':alpha2, 'beta':beta2, 'gamma':gamma2, 'start_pct':0, 'end_pct':99, 'samples':path2_sample}, ignore_index=True) 231 | 232 | gene_info, expr = generate_with_df(gene_info,noise_level) 233 | expr['embedding1']=expr['u0'] 234 | expr['embedding2']=expr['s0'] 235 | expr=expr.rename(columns={"u0": "unsplice", "s0": "splice","gene_list": "gene_name"}) 236 | expr.loc[:,'cellID']=list(range(len(expr))) 237 | expr.loc[:,'clusters']=None 238 | return expr 239 | 240 | def generate_by_each_cell(df, t, dt=0.001, noise_level=1): 241 | expr = pd.DataFrame() 242 | 243 | ti = t/len(df.index) 244 | 245 | last_u0, last_s0 = 0, 0 246 | 247 | for i in range(len(df.index)): 248 | sub_expr, (u0i, s0i) = _simulate( 249 | u0_start = last_u0, s0_start=last_s0, 250 | alpha=df['alpha'][i], beta=df['beta'][i], gamma=df['gamma'][i], 251 | t1=ti, t2=ti, 252 | samples=1, 253 | dt=dt, scale=noise_level) 254 | 255 | last_u0, last_s0 = u0i, s0i 256 | expr = expr.append(sub_expr) 257 | expr.u0, expr.s0 = _jitter(expr.u0, expr.s0, noise_level) 258 | 259 | expr.index = range(len(expr.index)) 260 | expr['t'] = ti * (expr.index+1) 261 | return expr 262 | 263 | def simulate(kinetic_type, 264 | alpha1=None, 265 | alpha2=None, 266 | beta1=None, 267 | beta2=None, 268 | gamma1=None, 269 | gamma2=None, 270 | start_splice1=None, 271 | start_splice2=None, 272 | start_unsplice1=None, 273 | start_unsplice2=None, 274 | 
path1_pct=None, 275 | path2_pct=None, 276 | path1_cell_number=None, 277 | path2_cell_number=None, 278 | noise_level=0.2): 279 | 280 | """ 281 | Simulate a gene with the kinetic type of mono-kinetic, multi-forward, multi-backward, or transcriptional boost. 282 | 283 | Arguments 284 | --------- 285 | kinetic_type: `pandas.DataFrame` 286 | kinetic_type could be selected from ['mono', 'multi_forward', 'multi_backward', 'tran_boost'] 287 | 288 | alpha1: `float` (default: `None`) 289 | The simulated alpha (transcriptional rate) for the first lineage. This parameter is valid when kinetic_type is set to 'mono', 'multi_forward', or 'tran_boost'. 290 | 291 | alpha2: `float` (default: `None`) 292 | The simulated alpha (transcriptional rate) for the second lineage. This parameter is valid when kinetic_type is set to 'multi_forward' or 'tran_boost'. 293 | 294 | beta1: `float` (default: `None`) 295 | The simulated beta (splicing rate) for the first lineage. 296 | 297 | beta2: `float` (default: `None`) 298 | The simulated beta (splicing rate) for the second lineage. 299 | 300 | gamma1: `float` (default: `None`) 301 | The simulated gamma (degration rate) for the first lineage. 302 | 303 | gamma2: `float` (default: `None`) 304 | The simulated gamma (degration rate) for the second lineage. 305 | 306 | start_splice1: optional, `float` (default: `None`) 307 | The simulated spliced abundance for the first lineage. Cells start from a region at a point of (start_splice1, start_unsplice1) to decrease. This parameter is valid when kinetic_type is set to 'multi_backward'. 308 | 309 | start_splice2: optional, `float` (default: `None`) 310 | The simulated spliced abundance for the second lineage. Cells start from a region at a point of (start_splice2, start_unsplice2) to decrease. This parameter is valid when kinetic_type is set to 'multi_backward'. 311 | 312 | start_unsplice1: optional, `float` (default: `None`) 313 | The simulated unspliced abundance for the first lineage. 
Cells start from a region at a point of (start_splice1, start_unsplice1) to decrease. This parameter is valid when kinetic_type is set to 'multi_backward'. 314 | 315 | start_unsplice2: optional, `float` (default: `None`) 316 | The simulated unspliced abundance for the second lineage. Cells start from a region at a point of (start_splice2, start_unsplice2) to decrease. This parameter is valid when kinetic_type is set to 'multi_backward'. 317 | 318 | path1_pct: optional, `float` (default: `None`) 319 | To decrease the bias of cell distribution at the steady point in the first lineage. This parameter is valid when kinetic_type is set to 'mono', 'multi_forward' or 'tran_boost'. 320 | 321 | path2_pct: optional, `float` (default: `None`) 322 | To decrease the bias of cell distribution at the steady point in the second lineage. This parameter is valid when kinetic_type is set to 'mono', 'multi_forward' or 'tran_boost'. 323 | 324 | path1_cell_number: `float` (default: `None`) 325 | The number of cells to be generated in the first lineage. 326 | 327 | path2_cell_number: `float` (default: `None`) 328 | The number of cells to be generated in the second lineage. 329 | 330 | noise_level: `float` (default: `0.2`) 331 | The noise level to be set. 332 | 333 | Returns 334 | ------- 335 | df: pandas.DataFrame 336 | The dataframe of one simulated gene. 337 | 338 | 339 | ------- 340 | 341 | Example usage: 342 | 343 | .. 
code-block:: python 344 | 345 | import celldancer.simulation as cdsim 346 | import matplotlib.pyplot as plt 347 | 348 | # Mono-kinetic 349 | plt.figure(figsize=(5,5)) 350 | gene=cdsim.simulate(kinetic_type='mono', 351 | alpha1=1, 352 | alpha2=0, 353 | beta1=1, 354 | beta2=1, 355 | gamma1=1, 356 | gamma2=1, 357 | path1_pct=99, 358 | path2_pct=99, 359 | path1_cell_number=1000, 360 | path2_cell_number=1000) 361 | plt.scatter(gene.splice,gene.unsplice,c='#95D9EF',alpha=0.5) 362 | 363 | # Multi-lineage forward branching 364 | plt.figure(figsize=(5,5)) 365 | gene=cdsim.simulate(kinetic_type='multi_forward', 366 | alpha1=5, 367 | alpha2=1, 368 | beta1=1, 369 | beta2=0.5, 370 | gamma1=5, 371 | gamma2=0.25, 372 | path1_pct=99, 373 | path2_pct=99, 374 | path1_cell_number=1000, 375 | path2_cell_number=1000) 376 | plt.scatter(gene.splice,gene.unsplice,c='#95D9EF',alpha=0.5) 377 | 378 | # Multi-lineage backward branching 379 | plt.figure(figsize=(5,5)) 380 | gene=cdsim.simulate(kinetic_type='multi_backward', 381 | beta1=1, 382 | beta2=1, 383 | gamma1=1, 384 | gamma2=1, 385 | start_splice1=1, 386 | start_splice2=1.5, 387 | start_unsplice1=1, 388 | start_unsplice2=0.2, 389 | path1_cell_number=1000, 390 | path2_cell_number=1000) 391 | plt.scatter(gene.splice,gene.unsplice,c='#95D9EF',alpha=0.5) 392 | 393 | # Transcriptional boost 394 | plt.figure(figsize=(5,5)) 395 | gene=cdsim.simulate(kinetic_type='tran_boost', 396 | alpha1=2, 397 | alpha2=5, 398 | beta1=2, 399 | beta2=2, 400 | gamma1=1, 401 | gamma2=1, 402 | path1_pct=99, 403 | path2_pct=80, 404 | path1_cell_number=1000, 405 | path2_cell_number=1000) 406 | plt.scatter(gene.splice,gene.unsplice,c='#95D9EF',alpha=0.5) 407 | 408 | .. 
image:: _static/sim.png 409 | :width: 100% 410 | :alt: sim 411 | 412 | """ 413 | 414 | 415 | if kinetic_type=='mono': 416 | df=generate_mono(alpha1=alpha1, 417 | alpha2=alpha2, 418 | beta1=beta1, 419 | beta2=beta2, 420 | gamma1=gamma1, 421 | gamma2=gamma2, 422 | path1_pct=path1_pct, 423 | path2_pct=path2_pct, 424 | path1_sample=path1_cell_number, 425 | path2_sample=path2_cell_number, 426 | noise_level=noise_level) 427 | 428 | elif kinetic_type=='multi_forward': 429 | df=generate_forward(alpha1=alpha1, 430 | alpha2=alpha2, 431 | beta1=beta1, 432 | beta2=beta2, 433 | gamma1=gamma1, 434 | gamma2=gamma2, 435 | path1_pct=path1_pct, 436 | path2_pct=path2_pct, 437 | path1_sample=path2_cell_number, 438 | path2_sample=path2_cell_number, 439 | noise_level=noise_level) 440 | 441 | elif kinetic_type=='multi_backward': 442 | df=generate_backward(start_s1=start_splice1, 443 | start_s2=start_splice2, 444 | start_u1=start_unsplice1, 445 | start_u2=start_unsplice2, 446 | alpha1=0, 447 | alpha2=0, 448 | beta1=beta1, 449 | beta2=beta2, 450 | gamma1=gamma1, 451 | gamma2=gamma2, 452 | path1_sample=path1_cell_number, 453 | path2_sample=path2_cell_number, 454 | noise_level=noise_level) 455 | 456 | elif kinetic_type=='tran_boost': 457 | df=generate_tran_boost(alpha1=alpha1, 458 | alpha2=alpha2, 459 | beta1=beta1, 460 | beta2=beta2, 461 | gamma1=gamma1, 462 | gamma2=gamma2, 463 | path1_pct=path1_pct, 464 | path2_pct=path2_pct, 465 | path1_sample=path1_cell_number, 466 | path2_sample=path2_cell_number, 467 | noise_level=noise_level) 468 | 469 | 470 | else: 471 | kinetic_type_list=['mono', 'multi_forward', 'multi_backward', 'tran_boost'] 472 | print('Kinetic type in ',kinetic_type_list,' could be choose from.') 473 | 474 | return(df) -------------------------------------------------------------------------------- /src/celldancer/utilities.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.sparse import csr_matrix 3 | 
import scipy 4 | import pandas as pd 5 | import anndata as ad 6 | from sklearn.neighbors import NearestNeighbors 7 | from statsmodels.nonparametric.kernel_regression import KernelReg 8 | 9 | # progress bar 10 | import contextlib 11 | import joblib 12 | from tqdm import tqdm 13 | 14 | @contextlib.contextmanager 15 | def tqdm_joblib(tqdm_object): 16 | """Context manager to patch joblib to report into tqdm progress bar given as argument""" 17 | class TqdmBatchCompletionCallback(joblib.parallel.BatchCompletionCallBack): 18 | def __call__(self, *args, **kwargs): 19 | tqdm_object.update(n=self.batch_size) 20 | return super().__call__(*args, **kwargs) 21 | 22 | old_batch_callback = joblib.parallel.BatchCompletionCallBack 23 | joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback 24 | try: 25 | yield tqdm_object 26 | finally: 27 | joblib.parallel.BatchCompletionCallBack = old_batch_callback 28 | tqdm_object.close() 29 | 30 | def _non_para_kernel(X,Y,down_sample_idx): 31 | # (no first cls),pseudotime r square calculation 32 | # this version has downsampling section 33 | # TO DO WHEN ONLY USING ONE GENE, WILL CAUSL PROBLEM WHEN COMBINING 34 | # Usage: Gene pseudotime fitting and r square (moved to utilities) 35 | # input: X,Y 36 | # return: estimator, r_square 37 | # example: 38 | # X = pd.DataFrame(np.arange(100)*np.pi/100) 39 | # Y = pd.DataFrame(np.sin(X)+np.random.normal(loc = 0, scale = 0.5, size = (100,1))) 40 | # estimator,r_square=non_para_kernel(X,Y) 41 | 42 | # X2=pd.DataFrame(np.random.randint(0,100,size=[200,1])) 43 | # Y2=pd.DataFrame(np.random.normal(9,5,size=[200])) 44 | # X = pd.DataFrame(np.arange(100)*np.pi/100) 45 | # Y = pd.DataFrame(np.sin(X)+np.random.normal(loc = 0, scale = 0.5, size = (100,1))) 46 | from statsmodels.nonparametric.kernel_regression import KernelReg 47 | import matplotlib.pyplot as plt 48 | print('_non_para_kernel_t4') 49 | Y_sampled=Y[X['index'].isin(down_sample_idx)] 50 | 
X_sampled=X[X['index'].isin(down_sample_idx)].time 51 | kde=KernelReg(endog=Y_sampled, 52 | exog=X_sampled, 53 | var_type='c', 54 | ) 55 | #X=merged.time 56 | #Y=merged.s0 57 | #print(kde.r_squared()) 58 | n=X_sampled.shape[0] 59 | 60 | estimator = kde.fit(X_sampled) 61 | estimator = np.reshape(estimator[0],[n,1]) 62 | 63 | return(estimator,kde.r_squared()) 64 | 65 | def getidx_downSampling_embedding(load_cellDancer,cell_choice=None): 66 | # find the origional id 67 | 68 | if cell_choice is not None: 69 | load_cellDancer=load_cellDancer[load_cellDancer.cellIndex.isin(cell_choice)] 70 | 71 | embedding=load_cellDancer.loc[load_cellDancer.gene_name==list(load_cellDancer.gene_name)[0]][['embedding1','embedding2']] 72 | 73 | # get transfer id 74 | from .sampling import sampling_embedding 75 | idx_downSampling_embedding = sampling_embedding(embedding, 76 | para='neighbors', 77 | target_amount=0, 78 | step=(30,30) # TODO: default is 30 79 | ) 80 | if cell_choice is None: 81 | return(idx_downSampling_embedding) 82 | else: 83 | # transfer to the id of origional all detail list 84 | onegene=load_cellDancer[load_cellDancer.gene_name==list(load_cellDancer.gene_name)[0]].copy() 85 | onegene.loc[:,'transfer_id']=range(len(onegene)) 86 | sampled_left=onegene[onegene.transfer_id.isin(idx_downSampling_embedding)] 87 | transfered_index=sampled_left.cellIndex 88 | return(transfered_index) 89 | 90 | 91 | def combine_parallel_result(result,gene_list,sampled_idx,merged_part_time): 92 | # combine result of rsquare and non-para fitting obtained from parallel computing 93 | for i,result_i in enumerate(result): 94 | 95 | r_square=result_i[1] 96 | non_para_fit=result_i[0] 97 | #print(r_square) 98 | if i == 0: 99 | r_square_list = r_square 100 | non_para_fit_list = np.transpose(non_para_fit) 101 | else: 102 | r_square_list = np.vstack((r_square_list, r_square)) 103 | non_para_fit_list = np.vstack((non_para_fit_list, np.transpose(non_para_fit)[0])) 104 | 
r_square=pd.DataFrame({'gene_name':gene_list,'r_square':np.transpose(r_square_list)[0]}) 105 | 106 | non_para_fit_heat=pd.DataFrame(non_para_fit_list,index=gene_list) 107 | non_para_fit_heat.columns=merged_part_time[merged_part_time['index'].isin(sampled_idx)]['index'] 108 | 109 | non_para_list=pd.DataFrame(non_para_fit_list) 110 | non_para_list['combined']=non_para_list.values.tolist() 111 | r_square 112 | r_square_non_para_list=pd.concat([r_square,non_para_list['combined']],axis=1) 113 | r_square_non_para_list_sort=r_square_non_para_list.sort_values(by=['r_square'], axis=0, ascending=False) 114 | 115 | return(r_square_non_para_list_sort,non_para_fit_heat,non_para_fit_list) 116 | 117 | def get_rsquare(load_cellDancer,gene_list,s0_merged_part_time,s0_merged_part_gene,cell_choice=None,): 118 | # downsample 119 | sampled_idx=getidx_downSampling_embedding(load_cellDancer,cell_choice=cell_choice) 120 | 121 | # parallel thread 122 | from joblib import Parallel, delayed 123 | # run parallel 124 | with tqdm_joblib(tqdm(desc="Calculate rsquare", total=len(gene_list))) as progress_bar: 125 | result = Parallel(n_jobs= -1, backend="loky")( # TODO: FIND suitable njobs 126 | delayed(_non_para_kernel_t4)(s0_merged_part_time,s0_merged_part_gene[gene_list[gene_index]],sampled_idx) 127 | for gene_index in range(0,len(gene_list))) 128 | 129 | # combine 130 | r_square_non_para_list_sort,non_para_fit_heat,non_para_fit_list=combine_parallel_result(result,gene_list,sampled_idx,s0_merged_part_time) 131 | 132 | return (r_square_non_para_list_sort,non_para_fit_heat,non_para_fit_list,sampled_idx) 133 | 134 | 135 | def get_gene_s0_by_time(cell_time,load_cellDancer): 136 | cell_time_time_sort=cell_time.sort_values('pseudotime') 137 | cell_time_time_sort.columns=['index','time'] 138 | 139 | s0_heatmap_raw=load_cellDancer.pivot(index='cellIndex', columns='gene_name', values='unsplice') 140 | 141 | s0_heatmap_raw 142 | s0_merged=pd.merge(cell_time_time_sort,s0_heatmap_raw,left_on='index', 
right_on='cellIndex') # TODO: NOT cellIndex in the future 143 | 144 | s0_merged_part_gene=s0_merged.loc[:, s0_merged.columns[2:]] 145 | s0_merged_part_time=s0_merged.loc[:, s0_merged.columns[0:2]] 146 | 147 | return(s0_merged_part_gene,s0_merged_part_time) 148 | 149 | def rank_rsquare(load_cellDancer,gene_list=None,cluster_choice=None): 150 | cell_time=load_cellDancer[load_cellDancer.gene_name==load_cellDancer.gene_name[0]][['cellIndex','pseudotime']] 151 | s0_merged_part_gene,s0_merged_part_time=get_gene_s0_by_time(cell_time,load_cellDancer) 152 | 153 | onegene=load_cellDancer[load_cellDancer.gene_name==load_cellDancer.gene_name[0]] 154 | 155 | if cluster_choice is None: 156 | cluster_choice=list(onegene.clusters.drop_duplicates()) 157 | cell_idx=list(onegene[onegene.clusters.isin(cluster_choice)].cellIndex) 158 | 159 | if gene_list is None: 160 | gene_list=s0_merged_part_gene.columns 161 | r_square_non_para_list_sort,non_para_fit_heat,non_para_fit_list,sampled_idx=get_rsquare(load_cellDancer,gene_list,s0_merged_part_time,s0_merged_part_gene,cell_choice=cell_idx) 162 | return(r_square_non_para_list_sort[['gene_name','r_square']].reset_index(drop=True)) 163 | 164 | 165 | def adata_to_df_with_embed(adata, 166 | us_para=['Mu', 'Ms'], 167 | cell_type_para='celltype', 168 | embed_para='X_umap', 169 | save_path='cell_type_u_s_sample_df.csv', 170 | gene_list=None): 171 | 172 | """Convert adata to pandas.DataFrame format and save it as csv file with embedding info. 173 | 174 | Arguments 175 | --------- 176 | adata: `anndata._core.anndata.AnnData` 177 | The adata to be transferred. 178 | us_para: `list` (default: ['Mu','Ms']) 179 | The attributes of the two count matrices of pre-mature (unspliced) and mature (spliced) abundances from adata.layers. By default, splice and unsplice columns (the two count matrices of spliced and unspliced abundances) are obtained from the ['Ms', 'Mu'] attributes of adata.layers. 
180 | cell_type_para: `str` (default: 'celltype') 181 | The attribute of cell type to be obtained from adata.obs. By default, cell type information is obtained from ['celltype'] column of adata.obs. 182 | embed_para: `str` (default: 'X_umap') 183 | The attribute of embedding space to be obtained from adata.obsm. It represents the 2-dimensional representation of all cells. The embedding1 and embedding2 columns are obtained from [‘X_umap’] attribute of adata.obsm. 184 | save_path: `str` (default: 'cell_type_u_s_sample_df.csv') 185 | Path to save the result of transferred csv file. 186 | gene_list: `list` (default: None) 187 | Specific gene(s) to be transfered. 188 | Returns 189 | ------- 190 | raw_data: `pandas.DataFrame` 191 | pandas DataFrame with columns gene_name, unsplice, splice, cellID, clusters, embedding1, embedding2. 192 | """ 193 | from tqdm import tqdm 194 | def adata_to_raw_one_gene(data, us_para, gene): 195 | ''' 196 | convert adata to raw data format (one gene) 197 | data: an anndata 198 | us_para: the varable name of u0, s0, and gene name 199 | us_para = ['Mu', 'Ms'] 200 | ''' 201 | data2 = data[:, data.var.index.isin([gene])].copy() 202 | u0 = data2.layers[us_para[0]][:,0].copy().astype(np.float32) 203 | s0 = data2.layers[us_para[1]][:,0].copy().astype(np.float32) 204 | raw_data = pd.DataFrame({'gene_name':gene, 'unsplice':u0, 'splice':s0}) 205 | return(raw_data) 206 | 207 | if gene_list is None: gene_list=adata.var.index 208 | 209 | for i,gene in enumerate(tqdm(gene_list)): 210 | data_onegene = adata_to_raw_one_gene(adata, us_para=us_para, gene=gene) 211 | if i==0: 212 | data_onegene.to_csv(save_path,header=True,index=False) 213 | else: 214 | data_onegene.to_csv(save_path,mode='a',header=False,index=False) 215 | 216 | # cell info 217 | gene_num=len(gene_list) 218 | cellID=pd.DataFrame({'cellID':adata.obs.index}) 219 | celltype_meta=adata.obs[cell_type_para].reset_index(drop=True) 220 | celltype=pd.DataFrame({'clusters':celltype_meta})# 221 | 
embed_map=pd.DataFrame({'embedding1':adata.obsm[embed_para][:,0],'embedding2':adata.obsm[embed_para][:,1]}) 222 | # embed_info_df = pd.concat([embed_info]*gene_num) 223 | embed_info=pd.concat([cellID,celltype,embed_map],axis=1) 224 | embed_raw=pd.concat([embed_info]*gene_num) 225 | embed_raw=embed_raw.reset_index(drop=True) 226 | 227 | raw_data=pd.read_csv(save_path) 228 | raw_data=pd.concat([raw_data,embed_raw],axis=1) 229 | raw_data.to_csv(save_path,header=True,index=False) 230 | 231 | return(raw_data) 232 | 233 | def to_dynamo(cellDancer_df): 234 | ''' 235 | Convert the output dataframe of cellDancer to the input of dynamo. The output of this function can be directly used in the downstream analyses of dynamo. 236 | 237 | Example usage: 238 | 239 | .. code-block:: python 240 | 241 | import dynamo as dyn 242 | import numpy as np 243 | import pandas as pd 244 | import anndata as ann 245 | import matplotlib.pyplot as plt 246 | import celldancer as cd 247 | import celldancer.utilities as cdutil 248 | 249 | # load the prediction result of all genes, the data could be achieved from section 'Deciphering gene regulation through vector fields analysis in pancreatic endocrinogenesis' 250 | cellDancer_df=pd.read_csv('HgForebrainGlut_cellDancer_estimation_spliced.csv') 251 | cellDancer_df=cd.compute_cell_velocity(cellDancer_df=cellDancer_df, projection_neighbor_choice='embedding', expression_scale='power10', projection_neighbor_size=100) # compute cell velocity 252 | 253 | # transform celldancer dataframe to anndata 254 | adata_from_dancer = cdutil.to_dynamo(cellDancer_df) 255 | 256 | # plot the velocity vector 257 | dyn.pl.streamline_plot(adata_from_dancer, color=["clusters"], basis = "cdr", show_legend="on data", show_arrowed_spines=True) 258 | 259 | ------- 260 | 261 | .. image:: _static/dynamo_plt.png 262 | :width: 60% 263 | :alt: dynamo_plt 264 | 265 | Arguments 266 | --------- 267 | cellDancer_df: `pandas.DataFrame` 268 | The output dataframe of cellDancer. 
269 | 270 | cellDancer --> dynamo 271 | 272 | cellDancer_df.splice --> adata.X 273 | 274 | cellDancer_df.loss --> adata.var.loss 275 | 276 | cellDancer_df.cellID --> adata.obs 277 | 278 | cellDancer_df.clusters --> adata.obs.clusters 279 | 280 | cellDancer_df.splice --> adata.layers['X_spliced'] 281 | 282 | cellDancer_df.splice --> adata.layers['M_s'] 283 | 284 | cellDancer_df.unsplice --> adata.layers['X_unspliced'] 285 | 286 | cellDancer_df.unsplice --> adata.layers['M_u'] 287 | 288 | cellDancer_df.alpha --> adata.layers['alpha'] 289 | 290 | cellDancer_df.beta --> adata.layers['beta'] 291 | 292 | cellDancer_df.gamma --> adata.layers['gamma'] 293 | 294 | cellDancer_df.unsplice_predict - cellDancer_df.unsplice --> adata.layers['velocity_U'] 295 | 296 | cellDancer_df.splice_predict - cellDancer_df.splice --> adata.layers['velocity_S'] 297 | 298 | cellDancer_df[['embeddding1', 'embedding2']] --> adata.obsm['X_cdr'] 299 | 300 | cellDancer_df[['velocity1', 'velocity2']] --> adata.obsm['velocity_cdr'] 301 | 302 | Returns 303 | ------- 304 | adata 305 | ''' 306 | 307 | # Sort the cellDancer_df by cellID, so if it's not done already, your cellDancer_df could be changed. 
308 | # This is because pd.DataFrame.pivot does this automatically and we don't want to mess up with 309 | # the obsm etc 310 | cellDancer_df = cellDancer_df.sort_values('cellID') 311 | 312 | spliced = cellDancer_df.pivot(index='cellID', columns='gene_name', values='splice') 313 | unspliced = cellDancer_df.pivot(index='cellID', columns='gene_name', values='unsplice') 314 | 315 | spliced_predict = cellDancer_df.pivot(index='cellID', columns='gene_name', values='splice_predict') 316 | unspliced_predict = cellDancer_df.pivot(index='cellID', columns='gene_name', values='unsplice_predict') 317 | 318 | alpha = cellDancer_df.pivot(index='cellID', columns='gene_name', values='alpha') 319 | beta = cellDancer_df.pivot(index='cellID', columns='gene_name', values='beta') 320 | gamma = cellDancer_df.pivot(index='cellID', columns='gene_name', values='gamma') 321 | 322 | one_gene = cellDancer_df['gene_name'].iloc[0] 323 | one_cell = cellDancer_df['cellID'].iloc[0] 324 | 325 | adata1 = ad.AnnData(spliced) 326 | 327 | # var 328 | adata1.var['highly_variable_genes'] = True 329 | #adata1.var['loss'] = (cellDancer_df[cellDancer_df['cellID'] == one_cell]['loss']).tolist() 330 | loss = cellDancer_df.pivot(index='gene_name', columns='cellID', values='loss').iloc[:, 0] 331 | loss.index = loss.index.astype(str) 332 | adata1.var['loss'] = loss 333 | # celldancer uses all genes (high variable) for dynamics and transition. 
334 | adata1.var['use_for_dynamics'] = True 335 | adata1.var['use_for_transition'] = True 336 | 337 | # obs 338 | if 'clusters' in cellDancer_df: 339 | clusters = cellDancer_df.pivot(index='cellID', columns='gene_name', values='clusters').iloc[:, 0] 340 | clusters.index = clusters.index.astype(str) 341 | adata1.obs['clusters'] = clusters 342 | # layers 343 | adata1.layers['X_spliced'] = spliced 344 | adata1.layers['X_unspliced'] = unspliced 345 | 346 | adata1.layers['M_s'] = spliced 347 | adata1.layers['M_u'] = unspliced 348 | adata1.layers['velocity_S'] = spliced_predict - spliced 349 | 350 | adata1.layers['velocity_U'] = unspliced_predict - unspliced 351 | adata1.layers['alpha'] = alpha 352 | adata1.layers['beta'] = beta 353 | adata1.layers['gamma'] = gamma 354 | 355 | # obsm 356 | adata1.obsm['X_cdr'] = cellDancer_df[cellDancer_df['gene_name'] == one_gene][['embedding1', 'embedding2']].values 357 | # assuming no downsampling is used for the cell velocities in the cellDancer_df 358 | if 'velocity1' in cellDancer_df: 359 | adata1.obsm['velocity_cdr'] = cellDancer_df[cellDancer_df['gene_name'] == one_gene][['velocity1', 'velocity2']].values 360 | 361 | # obsp 362 | n_neighbors = 20 363 | nn = NearestNeighbors(n_neighbors=n_neighbors) 364 | nn.fit(adata1.obsm['X_cdr']) 365 | connect_knn = nn.kneighbors_graph(mode='connectivity') 366 | distance_knn = nn.kneighbors_graph(mode='distance') 367 | adata1.obsp['connectivities'] = connect_knn 368 | adata1.obsp['distances'] = distance_knn 369 | 370 | # uns 371 | dynamics_info = {'filter_gene_mode': 'final', 372 | 't': None, 373 | 'group': None, 374 | 'X_data': None, 375 | 'X_fit_data': None, 376 | 'asspt_mRNA': 'ss', 377 | 'experiment_type': 'conventional', 378 | 'normalized': True, 379 | 'model': 'static', 380 | 'est_method': 'ols', 381 | 'has_splicing': True, 382 | 'has_labeling': False, 383 | 'splicing_labeling': False, 384 | 'has_protein': False, 385 | 'use_smoothed': True, 386 | 'NTR_vel': False, 387 | 
def export_velocity_to_dynamo(cellDancer_df, adata):
    '''
    Replace the velocities in adata of dynamo ("adata" in parameters) with the cellDancer predicted velocities ("cellDancer_df" in parameters). The output can be directly used in the downstream analyses of dynamo.

    -------
    The vector field could be learned by dynamo based on the RNA velocity of cellDancer. Details are shown in the section 'Application of dynamo.'

    .. image:: _static/dynamo_vector_field_pancreas.png
      :width: 60%
      :alt: dynamo_vector_field_pancreas

    Arguments
    ---------
    cellDancer_df: `pandas.DataFrame`
        The output dataframe of cellDancer. It is read only; no column is added to it.

        cellDancer --> dynamo

        bools of the existence of cellDancer_df['gene_name'] in adata.var --> adata.var['use_for_dynamics']

        bools of the existence of cellDancer_df['gene_name'] in adata.var --> adata.var['use_for_transition']

        cellDancer_df.splice_predict - cellDancer_df.splice --> adata.layers['velocity_S']

    adata: `anndata._core.anndata.AnnData`
        The adata to be integrated with cellDancer velocity result. It is updated in place
        (``layers['velocity_S']``, ``var['use_for_dynamics']``, ``var['use_for_transition']``).

    Returns
    -------
    adata
        A copy of the updated ``adata``.
    '''

    dancer_genes = cellDancer_df['gene_name'].drop_duplicates()

    # Build the velocity column on a fresh frame so the caller's dataframe is
    # not modified as a side effect.
    velocity_s = cellDancer_df[['cellID', 'gene_name']].assign(
        velocity_S=cellDancer_df['splice_predict'] - cellDancer_df['splice'])
    pivoted = velocity_s.pivot(index='cellID', columns='gene_name', values='velocity_S')

    # Align explicitly to adata's own cell and gene order. Adding two frames
    # aligns on the (sorted) union of labels, which can reorder rows relative
    # to adata.obs and append rows for cells that adata does not contain.
    # Cells/genes without a cellDancer estimate get a velocity of 0.
    celldancer_velocity_s_df = (
        pivoted.reindex(index=adata.obs.index, columns=adata.var.index)
        .fillna(0)
        .astype(np.float64)
    )

    adata.layers['velocity_S'] = scipy.sparse.csr_matrix(celldancer_velocity_s_df.values)
    in_celldancer = adata.var.index.isin(dancer_genes)
    adata.var['use_for_dynamics'] = in_celldancer
    adata.var['use_for_transition'] = in_celldancer
    return adata.copy()


def adata_to_raw(adata, save_path, gene_list=None):
    '''
    Convert adata to the raw (long) data format and save it as a csv file.

    The layers 'Mu' and 'Ms' are read gene by gene and appended to ``save_path``;
    the file is then read back and returned.

    Arguments
    ---------
    adata: `anndata._core.anndata.AnnData`
        The adata holding the layers 'Mu' and 'Ms'.
    save_path: `str`
        Path of the csv file to write. An existing file is overwritten.
    gene_list: optional, `list` (default: None)
        Genes to export. `None` if to export all genes in ``adata.var.index``.

    Returns
    -------
    raw_data: `pandas.DataFrame`
        The content of the written csv file. Columns=['gene_name', 'u0', 's0', 'cellID']
    '''
    from tqdm import tqdm

    def adata_to_raw_one_gene(data, para, gene):
        '''
        convert adata to raw data format (one gene)
        data: an anndata
        para: the layer names of u0 and s0, e.g. para = ['Mu', 'Ms']
        gene: the gene name
        '''
        data2 = data[:, data.var.index.isin([gene])].copy()
        u0 = data2.layers[para[0]][:, 0].copy().astype(np.float32)
        s0 = data2.layers[para[1]][:, 0].copy().astype(np.float32)
        raw_data = pd.DataFrame({'gene_name': gene, 'u0': u0, 's0': s0})
        raw_data['cellID'] = data.obs.index
        return raw_data

    if gene_list is None:
        gene_list = adata.var.index

    wrote_any = False
    for gene in tqdm(gene_list):
        data_onegene = adata_to_raw_one_gene(adata, para=['Mu', 'Ms'], gene=gene)
        # The first gene (re)creates the file with a header; later genes are appended.
        data_onegene.to_csv(save_path,
                            mode='a' if wrote_any else 'w',
                            header=not wrote_any,
                            index=False)
        wrote_any = True

    if not wrote_any:
        # No gene requested: write a header-only file so that the read below
        # neither fails nor returns the content of a stale file.
        pd.DataFrame(columns=['gene_name', 'u0', 's0', 'cellID']).to_csv(save_path, index=False)

    raw_data = pd.read_csv(save_path)

    return raw_data
rawdata[['gene_list', 'u0','s0']] 544 | return(ratio2, cor2) 545 | ratio2 [['gene_choice','ratio']] 546 | ratio2 [['gene_choice','correlation']] 547 | ''' 548 | def identify_in_grid(u, s, onegene_u0_s0): 549 | select_cell =onegene_u0_s0[(onegene_u0_s0[:,0]>u[0]) & (onegene_u0_s0[:,0]s[0]) & (onegene_u0_s0[:,1] cell, 117 | # col -> neighboring cells, 118 | # value -> index of cells, 119 | # the fist col is the index of row 120 | 121 | expr = pd.merge(pd.DataFrame(splice, columns=['splice']), pd.DataFrame(unsplice, columns=['unsplice']), left_index=True, right_index=True) 122 | if barcode is not None: 123 | expr.index = barcode 124 | unsplice = torch.tensor(expr['unsplice']) 125 | splice = torch.tensor(expr['splice']) 126 | indices = torch.tensor(indices) 127 | unsplice_predict, splice_predict, alphas, beta, gamma = self.module(unsplice, splice, alpha0, beta0, gamma0, dt) 128 | 129 | def cosine_similarity(unsplice, splice, unsplice_predict, splice_predict, indices): 130 | """Cost function 131 | Return: 132 | list of cosine distance and a list of the index of the next cell 133 | """ 134 | 135 | uv, sv = unsplice_predict-unsplice, splice_predict-splice # Velocity from (unsplice, splice) to (unsplice_predict, splice_predict) 136 | unv, snv = unsplice[indices.T[1:]] - unsplice, splice[indices.T[1:]] - splice # Velocity from (unsplice, splice) to its neighbors 137 | 138 | den = torch.sqrt(unv**2 + snv**2) * torch.sqrt(uv**2+sv**2) 139 | den[den==0] = -1 140 | cosine = torch.where(den!=-1, (unv*uv + snv*sv) / den, torch.tensor(1.)) # cosine: column -> individuel cell (cellI); row -> nearby cells of cell id ; value -> cosine between col and row cells 141 | cosine_max, cosine_max_idx = torch.max(cosine, dim=0) 142 | cell_idx = torch.diag(indices[:, cosine_max_idx+1]) 143 | return 1 - cosine_max, cell_idx 144 | 145 | 146 | 147 | def rmse(unsplice, splice, unsplice_predict, splice_predict, indices): 148 | """ 149 | This loss is defined as the rmse of the predicted velocity 
vector (uv, sv) from the neighboring velocity vectors (unv, snv). 150 | 151 | This loss is used during revision. 152 | 153 | """ 154 | uv, sv = unsplice_predict-unsplice, splice_predict-splice 155 | unv, snv = unsplice[indices.T[1:]] - unsplice, splice[indices.T[1:]] - splice 156 | 157 | rmse = (uv-unv)**2 + (sv-snv)**2 158 | rmse = torch.sqrt(0.5*rmse) 159 | 160 | # normalize across all neighboring cells using a softmax function. 161 | # m = torch.nn.Softmax(dim=0) 162 | # rmse = m(rmse) 163 | 164 | rmse_min, rmse_min_idx = torch.min(rmse, dim=0) 165 | cell_idx = torch.diag(indices[:, rmse_min_idx+1]) 166 | return rmse_min, cell_idx 167 | 168 | 169 | def mix_loss(unsplice, splice, unsplice_predict, splice_predict, indices, mix_ratio = 0.5): 170 | """ 171 | This loss is defined as the mix of rmse loss and cosine loss. 172 | 173 | This loss is used during revision. 174 | 175 | Parameters: 176 | 177 | unsplice: 1d tensor [n_cells] 178 | splice: 1d tensor [n_cells] 179 | indices: 2d array [n_cells, n_neighbors] 180 | Return: 181 | list of cosine distance and a list of the index of the next cell 182 | """ 183 | 184 | #print("mix ratio, ", mix_ratio) 185 | uv, sv = unsplice_predict-unsplice, splice_predict-splice 186 | unv, snv = unsplice[indices.T[1:]] - unsplice, splice[indices.T[1:]] - splice 187 | mag_v = torch.sqrt(uv**2 + sv**2) 188 | mag_nv = torch.sqrt(unv**2 + snv**2) 189 | mag = (mag_nv - mag_v)**2 190 | 191 | # minimize mag or maximize -mag 192 | # normalize across all neighboring cells using a softmax function 193 | m = torch.nn.Softmax(dim=0) 194 | mag = m(mag) 195 | 196 | den = mag_v * mag_nv 197 | den[den==0] = -1 198 | 199 | # cosine: [n_neighbors x n_cells] 200 | cosine = torch.where(den!=-1, (unv*uv + snv*sv) / den, torch.tensor(1.)) 201 | 202 | total = mix_ratio*(1-cosine) + (1 - mix_ratio)* mag 203 | total_min, total_min_idx = torch.min(total, dim=0) 204 | 205 | cell_idx = torch.diag(indices[:, total_min_idx+1]) 206 | return total_min, cell_idx 207 | 
208 | 209 | def trace_cost(unsplice, splice, unsplice_predict, splice_predict, idx, version): 210 | 211 | # This cost has been deprecated. 212 | 213 | uv, sv = unsplice_predict-unsplice, splice_predict-splice 214 | tan = torch.where(sv!=1000000, uv/sv, torch.tensor(0.00001)) 215 | atan_theta = torch.atan(tan) + torch.pi/2 216 | atan_theta2=atan_theta[idx] 217 | atan_theta3 = atan_theta[idx[idx]] 218 | if version=="v1": 219 | cost = atan_theta2/atan_theta+atan_theta3/atan_theta2 220 | elif version=="v2": 221 | cost=torch.where(atan_theta=torch.tensor(0.0), torch.tensor(0.0), torch.tensor(-corrcoef)) 233 | return(cost) 234 | 235 | if trace_cost_ratio == 0 and corrcoef_cost_ratio == 0: 236 | 237 | if loss_func == 'cosine': 238 | cost1 = cosine_similarity(unsplice, splice, unsplice_predict, splice_predict, indices)[0] 239 | cost_fin = torch.mean(cost1) 240 | 241 | if loss_func == 'rmse': 242 | cost1 = rmse(unsplice, splice, unsplice_predict, splice_predict, indices)[0] 243 | cost_fin = torch.mean(cost1) 244 | 245 | elif 'mix' in loss_func: 246 | mix_ratio = loss_func[1] 247 | cost1 = mix_loss(unsplice, splice, unsplice_predict, splice_predict, indices, mix_ratio=mix_ratio)[0] 248 | cost_fin = torch.mean(cost1) 249 | 250 | else: # trace cost and corrcoef cost have been deprecated. 
251 | # cosine cost 252 | cost1,idx = cosine_similarity(unsplice, splice, unsplice_predict, splice_predict, indices) 253 | cost1_normalize=(cost1-torch.min(cost1))/torch.max(cost1) 254 | cost1_mean = torch.mean(cost1_normalize) 255 | 256 | # trace cost 257 | if trace_cost_ratio>0: 258 | cost2 = trace_cost(unsplice, splice, unsplice_predict, splice_predict, idx,"v2") 259 | cost2_normalize=(cost2-torch.min(cost2))/torch.max(cost2) 260 | cost2_mean = torch.mean(cost2_normalize) 261 | cost2_relu=(max((cost2_mean-cost2_cutoff), 0)) 262 | 263 | # corrcoef cost 264 | if corrcoef_cost_ratio>0: 265 | corrcoef_cost=corrcoef_cost(alphas, unsplice, beta, splice) 266 | 267 | # sum all cost 268 | cosin_cost_ratio=1-trace_cost_ratio-corrcoef_cost_ratio 269 | cost_fin = cosin_cost_ratio*cost1_mean + \ 270 | trace_cost_ratio*cost2_relu + \ 271 | corrcoef_cost_ratio*corrcoef_cost 272 | 273 | return cost_fin, unsplice_predict, splice_predict, alphas, beta, gamma 274 | 275 | 276 | def summary_para_validation(self, cost_mean): 277 | loss_df = pd.DataFrame({'cost': cost_mean}, index=[0]) 278 | return(loss_df) 279 | 280 | def summary_para(self, unsplice, splice, unsplice_predict, splice_predict, alphas, beta, gamma, cost): 281 | cellDancer_df = pd.merge(pd.DataFrame(unsplice, columns=['unsplice']),pd.DataFrame(splice, columns=['splice']), left_index=True, right_index=True) 282 | cellDancer_df['unsplice_predict'] = unsplice_predict 283 | cellDancer_df['splice_predict'] = splice_predict 284 | cellDancer_df['alpha'] = alphas 285 | cellDancer_df['beta'] = beta 286 | cellDancer_df['gamma'] = gamma 287 | cellDancer_df['cost'] = cost 288 | return cellDancer_df 289 | 290 | class ltModule(pl.LightningModule): 291 | ''' 292 | train network using "DNN_module" 293 | ''' 294 | def __init__(self, 295 | backbone=None, 296 | initial_zoom=2, 297 | initial_strech=1, 298 | learning_rate=None, 299 | dt=None, 300 | loss_func = None, 301 | cost2_cutoff=0, 302 | optimizer='Adam', 303 | trace_cost_ratio=0, 304 
| corrcoef_cost_ratio=0, 305 | cost_type='smooth', 306 | average_cost_window_size=10, 307 | smooth_weight=0.9): 308 | super().__init__() 309 | self.backbone = backbone 310 | self.validation_loss_df = pd.DataFrame() 311 | self.test_cellDancer_df = None 312 | self.test_loss_df = None 313 | self.initial_zoom = initial_zoom 314 | self.initial_strech = initial_strech 315 | self.learning_rate=learning_rate 316 | self.dt=dt 317 | self.loss_func=loss_func 318 | self.cost2_cutoff=cost2_cutoff 319 | self.optimizer=optimizer 320 | self.trace_cost_ratio=trace_cost_ratio 321 | self.corrcoef_cost_ratio=corrcoef_cost_ratio 322 | self.save_hyperparameters() 323 | self.get_loss=1000 324 | self.cost_type=cost_type 325 | self.average_cost_window_size=average_cost_window_size # will be used only when cost_tpye.isin(['average', 'median']) 326 | self.cost_window=[] 327 | self.smooth_weight=smooth_weight 328 | 329 | def save(self, model_path): 330 | self.backbone.module.save(model_path) # save network 331 | 332 | def load(self, model_path): 333 | self.backbone.module.load(model_path) # load network 334 | 335 | def configure_optimizers(self): # define optimizer 336 | if self.optimizer=="Adam": 337 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate, betas=(0.9, 0.999), eps=10**(-8), weight_decay=0.004, amsgrad=False) 338 | elif self.optimizer=="SGD": 339 | optimizer = torch.optim.SGD(self.parameters(), lr=self.learning_rate, momentum=0.8) 340 | return optimizer 341 | 342 | def training_step(self, batch, batch_idx): 343 | ''' 344 | traning network 345 | batch: [] output returned from realDataset.__getitem__ 346 | 347 | ''' 348 | 349 | unsplices, splices, gene_names, unsplicemaxs, splicemaxs, embedding1s, embedding2s = batch 350 | unsplice, splice, unsplicemax, splicemax, embedding1, embedding2 = unsplices[0], splices[0], unsplicemaxs[0], splicemaxs[0], embedding1s[0], embedding2s[0] 351 | 352 | umax = unsplicemax 353 | smax = splicemax 354 | alpha0 = 
class feedData(pl.LightningDataModule):
    '''
    load training and test data

    Wraps two `getItem` datasets built from the same dataframes: one with
    datastatus="fit_dataset" (served for training and validation) and one
    with datastatus="predict_dataset" (served for testing).
    '''
    def __init__(self, data_fit=None, data_predict=None, permutation_ratio=1, norm_u_s=True, norm_cell_distribution=False):
        super().__init__()

        self.fit_dataset = getItem(data_fit=data_fit, data_predict=data_predict, datastatus="fit_dataset", permutation_ratio=permutation_ratio, norm_u_s=norm_u_s, norm_cell_distribution=norm_cell_distribution)

        # norm_cell_distribution is only forwarded to the fit dataset.
        self.predict_dataset = getItem(data_fit=data_fit, data_predict=data_predict, datastatus="predict_dataset", permutation_ratio=permutation_ratio, norm_u_s=norm_u_s)

    def subset(self, indices):
        '''Return a shallow copy of this datamodule restricted to the dataset items in `indices`.'''
        import copy
        temp = copy.copy(self)
        temp.fit_dataset = Subset(self.fit_dataset, indices)
        temp.predict_dataset = Subset(self.predict_dataset, indices)
        return temp

    def train_dataloader(self):
        return DataLoader(self.fit_dataset, num_workers=0)

    def val_dataloader(self):
        # validation runs on the same dataset as training
        return DataLoader(self.fit_dataset, num_workers=0)

    def test_dataloader(self):
        return DataLoader(self.predict_dataset, num_workers=0)


def _train_thread(datamodule,
                  data_indices,
                  save_path=None,
                  max_epoches=None,
                  check_val_every_n_epoch=None,
                  norm_u_s=None,
                  patience=None,
                  learning_rate=None,
                  dt=None,
                  loss_func=None,
                  n_neighbors=None,
                  ini_model=None,
                  model_save_path=None):
    '''
    Train and predict the items of `datamodule` selected by `data_indices`,
    and write the loss record and the estimation to csv files under
    `save_path`/TEMP.

    Arguments
    ---------
    datamodule: `feedData`
        The datamodule to take the subset from.
    data_indices: `list`
        Indices of the dataset items to use; item 0 of the subset provides
        the gene name, the normalization maxima and the data used to select
        the initial model.
    ini_model: optional, `str` (default: None)
        `'circle'` or `'branch'` to start from the corresponding packaged
        model; any other value lets `select_initial_net` choose.
    model_save_path: optional, `str` (default: None)
        If given, the trained network is saved to this path.

    The remaining arguments are forwarded to `DNN_module`, `ltModule`,
    `EarlyStopping` and `pl.Trainer`.

    Returns
    -------
    `None` on success, or the gene name if an error occurred while
    processing the gene. An error raised before the gene name is known is
    propagated to the caller.
    '''

    this_gene_name = None
    try:
        seed = 0
        torch.manual_seed(seed)
        random.seed(seed)
        np.random.seed(seed)

        # initiate network (DNN_layer) and loss function (DNN_module)
        backbone = DNN_module(DNN_layer(100, 100), n_neighbors=n_neighbors)
        model = ltModule(backbone=backbone, dt=dt, learning_rate=learning_rate, loss_func=loss_func)

        selected_data = datamodule.subset(data_indices)

        unsplice, splice, this_gene_name, unsplicemax, splicemax, embedding1, embedding2 = selected_data.fit_dataset.__getitem__(0)

        data_df = pd.DataFrame({'unsplice': unsplice, 'splice': splice, 'embedding1': embedding1, 'embedding2': embedding2})
        data_df['gene_name'] = this_gene_name
        try:
            # Note
            # here n_neighbors in the downsampling_embedding function is for selecting initial model,
            # which is different from the n_neighbors in _train_thread for velocity calculation.
            _, sampling_ixs_select_model, _ = downsampling_embedding(data_df,  # for select model
                                                                      para='neighbors',
                                                                      step=(20, 20),
                                                                      n_neighbors=30,
                                                                      target_amount=None,
                                                                      projection_neighbor_choice='embedding')
        except Exception:
            # best effort: fall back to all cells when the downsampling fails
            sampling_ixs_select_model = list(data_df.index)

        gene_downsampling = downsampling(data_df=data_df, gene_list=[this_gene_name], downsampling_ixs=sampling_ixs_select_model)

        # One exclusive chain: an explicit 'circle' request must not fall
        # through to the automatic selection.
        if ini_model == 'circle':
            model_path = pkg_resources.resource_filename(__name__, os.path.join('model', 'circle.pt'))
        elif ini_model == 'branch':
            model_path = pkg_resources.resource_filename(__name__, os.path.join('model', 'branch.pt'))
        else:
            model_path = select_initial_net(this_gene_name, gene_downsampling, data_df)
        model.load(model_path)

        trainer_kwargs = dict(
            max_epochs=max_epoches,
            progress_bar_refresh_rate=0,
            reload_dataloaders_every_n_epochs=1,
            logger=False,
            enable_checkpointing=False,
            enable_model_summary=False,
        )
        if check_val_every_n_epoch is not None:
            # use early stop
            trainer_kwargs['check_val_every_n_epoch'] = check_val_every_n_epoch
            trainer_kwargs['callbacks'] = [EarlyStopping(monitor="loss", min_delta=0.0, patience=patience, mode='min')]
        trainer = pl.Trainer(**trainer_kwargs)

        if max_epoches > 0:
            trainer.fit(model, selected_data)   # train network

        trainer.test(model, selected_data, verbose=False)    # predict

        if model_save_path is not None:
            model.save(model_save_path)

        loss_df = model.validation_loss_df
        cellDancer_df = model.test_cellDancer_df

        if norm_u_s:
            # scale the normalized values back using the maxima of the gene
            cellDancer_df.unsplice = cellDancer_df.unsplice*unsplicemax
            cellDancer_df.splice = cellDancer_df.splice*splicemax
            cellDancer_df.unsplice_predict = cellDancer_df.unsplice_predict*unsplicemax
            cellDancer_df.splice_predict = cellDancer_df.splice_predict*splicemax
            cellDancer_df.beta = cellDancer_df.beta*unsplicemax
            cellDancer_df.gamma = cellDancer_df.gamma*splicemax

        header_loss_df = ['gene_name', 'epoch', 'loss']
        header_cellDancer_df = ['cellIndex', 'gene_name', 'unsplice', 'splice', 'unsplice_predict', 'splice_predict', 'alpha', 'beta', 'gamma', 'loss']

        loss_df.to_csv(os.path.join(save_path, 'TEMP', ('loss'+'_'+this_gene_name+'.csv')), header=header_loss_df, index=False)
        cellDancer_df.to_csv(os.path.join(save_path, 'TEMP', ('cellDancer_estimation_'+this_gene_name+'.csv')), header=header_cellDancer_df, index=False)

        return None

    except Exception:
        if this_gene_name is None:
            # The failure happened before the gene was identified; there is
            # no gene name to report, so surface the original error.
            raise
        return this_gene_name


def build_datamodule(cell_type_u_s,
                     speed_up,
                     norm_u_s,
                     permutation_ratio,
                     norm_cell_distribution=False,
                     gene_list=None,
                     downsample_method='neighbors',
                     n_neighbors_downsample=30,
                     step=(200, 200),
                     downsample_target_amount=None):

    '''
    set fitting data, data to be predicted, and sampling ratio when fitting

    Arguments
    ---------
    cell_type_u_s: `pandas.DataFrame`
        Must contain the columns 'gene_name', 'unsplice', 'splice',
        'embedding1', 'embedding2' and 'cellID'.
    speed_up: `bool`
        `True` to fit on the cells selected by `downsampling_embedding`;
        `False` to fit on all cells. All cells are always used for prediction.
    gene_list: optional, `list` (default: None)
        Genes to keep. `None` to keep all genes.

    The remaining arguments are forwarded to `downsampling_embedding` and `feedData`.

    Returns
    -------
    feed_data: `feedData`
    '''
    step_i = step[0]
    step_j = step[1]

    data_df = cell_type_u_s[['gene_name', 'unsplice', 'splice', 'embedding1', 'embedding2', 'cellID']]
    if gene_list is not None:
        data_df = data_df[cell_type_u_s.gene_name.isin(gene_list)]

    if not speed_up:
        return feedData(data_fit=data_df, data_predict=data_df, permutation_ratio=permutation_ratio, norm_u_s=norm_u_s, norm_cell_distribution=norm_cell_distribution)  # default

    _, sampling_ixs, _ = downsampling_embedding(data_df,
                                                para=downsample_method,
                                                target_amount=downsample_target_amount,
                                                step=(step_i, step_j),
                                                n_neighbors=n_neighbors_downsample,
                                                projection_neighbor_choice='embedding')

    # The sampled positions are mapped to cell IDs through the rows of a
    # single gene. Without a gene_list the first gene present in the data is
    # used (previously this case raised a TypeError).
    # NOTE(review): assumes every gene lists the same cells in the same
    # order - confirm against downsampling_embedding.
    if gene_list is None:
        reference_gene = data_df['gene_name'].iloc[0]
    else:
        reference_gene = list(gene_list)[0]
    data_df_one_gene = cell_type_u_s[cell_type_u_s['gene_name'] == reference_gene]
    downsample_cellid = data_df_one_gene.cellID.iloc[sampling_ixs]
    gene_downsampling = data_df[data_df.cellID.isin(downsample_cellid)]

    feed_data = feedData(data_fit=gene_downsampling, data_predict=data_df, permutation_ratio=permutation_ratio, norm_u_s=norm_u_s, norm_cell_distribution=norm_cell_distribution)  # default

    return feed_data
720 | patience: optional, `int` (default: 3) 721 | Number of checks with no improvement after which training will be stopped. 722 | dt: optional, `float` (default: 0.5) 723 | Step size 724 | permutation_ratio: optional, `float` (default: 0.125) 725 | Sampling ratio of cells in each epoch when training each gene. 726 | speed_up: optional, `bool` (default: True) 727 | `True` if speed up by downsampling cells. `False` if to use all cells to train the model. 728 | norm_u_s: optional, `bool` (default: True) 729 | `True` if normalize unsplice (and splice) reads by dividing max value of unspliced (and spliced) reads. 730 | norm_cell_distribution: optional, `bool` (default: True) 731 | `True` if the bias of cell distribution is to be removed on embedding space (many cells share the same position of unspliced (and spliced) reads). 732 | loss_func: optional, `str` (default: `cosine`) 733 | Currently support `'cosine'`, `'rmse'`, and (`'mix'`, mix_ratio). 734 | n_jobs: optional, `int` (default: -1) 735 | The maximum number of concurrently running jobs. 736 | save_path: optional, `str` (default: 200) 737 | Path to save the result of velocity estimation. 738 | Returns 739 | ------- 740 | loss_df: `pandas.DataFrame` 741 | The record of loss. 742 | cellDancer_df: `pandas.DataFrame` 743 | The result of velocity estimation. 
744 | """ 745 | 746 | # set output dir 747 | datestring = datetime.datetime.now().strftime("%Y-%m-%d %H-%M-%S"); 748 | folder_name='cellDancer_velocity_'+datestring 749 | 750 | if save_path is None: 751 | save_path=os.getcwd() 752 | 753 | try:shutil.rmtree(os.path.join(save_path,folder_name)) 754 | except:os.mkdir(os.path.join(save_path,folder_name)) 755 | save_path=os.path.join(save_path,folder_name) 756 | print('Using '+save_path+' as the output path.') 757 | 758 | try:shutil.rmtree(os.path.join(save_path,'TEMP')) 759 | except:os.mkdir(os.path.join(save_path,'TEMP')) 760 | 761 | # set gene_list if not given 762 | if gene_list is None: 763 | gene_list=list(cell_type_u_s.gene_name.drop_duplicates()) 764 | else: 765 | cell_type_u_s=cell_type_u_s[cell_type_u_s.gene_name.isin(gene_list)] 766 | all_gene_name_cell_type_u_s=list(cell_type_u_s.gene_name.drop_duplicates()) 767 | gene_not_in_cell_type_u_s= list(set(gene_list).difference(set(all_gene_name_cell_type_u_s))) 768 | gene_list=list(list(set(all_gene_name_cell_type_u_s).intersection(set(gene_list)))) 769 | if len(gene_not_in_cell_type_u_s)>0: print(gene_not_in_cell_type_u_s," not in the data cell_type_u_s") 770 | 771 | cell_type_u_s=cell_type_u_s.reset_index(drop=True) 772 | # buring 773 | gene_list_buring=[list(cell_type_u_s.gene_name.drop_duplicates())[0]] 774 | datamodule=build_datamodule(cell_type_u_s,speed_up,norm_u_s,permutation_ratio,norm_cell_distribution,gene_list=gene_list_buring) 775 | 776 | result = Parallel(n_jobs=n_jobs, backend="loky")( 777 | delayed(_train_thread)( 778 | datamodule = datamodule, 779 | data_indices=[data_index], 780 | max_epoches=max_epoches, 781 | check_val_every_n_epoch=check_val_every_n_epoch, 782 | patience=patience, 783 | learning_rate=learning_rate, 784 | n_neighbors=n_neighbors, 785 | dt=dt, 786 | loss_func=loss_func, 787 | save_path=save_path, 788 | norm_u_s=norm_u_s) 789 | for data_index in range(0,len(gene_list_buring))) 790 | 791 | # clean directory 792 | 
shutil.rmtree(os.path.join(save_path,'TEMP')) 793 | os.mkdir(os.path.join(save_path,'TEMP')) 794 | 795 | data_len = len(gene_list) 796 | 797 | id_ranges=list() 798 | if n_jobs==-1: 799 | interval=os.cpu_count() 800 | else: 801 | interval=n_jobs 802 | for i in range(0,data_len,interval): 803 | idx_start=i 804 | if data_lens_max_90per) & (gene_u_s_full.unsplice>u_max_90per), 'position'] = 'cells_corner' 906 | 907 | if gene_u_s_full.loc[gene_u_s_full['position']=='cells_corner'].shape[0]>0.001*gene_u_s_full.shape[0]: 908 | # model in circle shape 909 | model_path=pkg_resources.resource_stream(__name__,os.path.join('model', 'circle.pt')).name 910 | else: 911 | # model in seperated branch shape 912 | model_path=pkg_resources.resource_stream(__name__,os.path.join('model', 'branch.pt')).name 913 | return(model_path) --------------------------------------------------------------------------------