├── .github
├── scripts
│ └── release.py
└── workflows
│ ├── publish.yml
│ └── release.yml
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── _static
└── training_progress.png
├── dist
├── celldancer-1.1.4-py3-none-any.whl
├── celldancer-1.1.4.tar.gz
├── celldancer-1.1.7-py3-none-any.whl
└── celldancer-1.1.7.tar.gz
├── notebooks
├── case_study_gastrulation.ipynb
├── case_study_hgforebrian.ipynb
├── case_study_neuro.ipynb
├── case_study_pancreas.ipynb
├── case_study_pancreas_dynamo.ipynb
├── case_study_rpe1.ipynb
└── celldancer_prototype_model.ipynb
├── readme.rst
├── readme_pypi.rst
├── requirements.txt
├── setup.py
└── src
└── celldancer
├── .Rapp.history
├── __init__.py
├── cdplt.py
├── compute_cell_velocity.py
├── diffusion.py
├── embedding_kinetic_para.py
├── model
├── branch.pt
└── circle.pt
├── plotting
├── .Rapp.history
├── __init__.py
├── cell.py
├── colormap.py
├── gene.py
└── graph.py
├── pseudo_time.py
├── sampling.py
├── simulation.py
├── utilities.py
└── velocity_estimation.py
/.github/scripts/release.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import json
3 | import subprocess
4 |
5 |
def get_last_version() -> str:
    """Return the version number of the last release.

    Runs ``gh release view --json tagName`` and extracts the ``tagName``
    field from the JSON document printed on stdout.

    Raises:
        subprocess.CalledProcessError: if ``gh`` exits with a non-zero
            status. Both stdout and stderr are captured as bytes and are
            available on the exception.
    """
    completed = subprocess.run(
        ["gh", "release", "view", "--json", "tagName"],
        check=True,
        capture_output=True,
    )
    payload = completed.stdout.decode("utf8").strip()
    release_info = json.loads(payload)
    return release_info["tagName"]
20 |
21 |
def bump_patch_number(version_number: str) -> str:
    """Return a copy of `version_number` with the patch number incremented.

    `version_number` must consist of exactly three dot-separated fields
    (``major.minor.patch``); anything else raises ``ValueError`` from the
    unpacking or from ``int()``.
    """
    fields = version_number.split(".")
    major, minor, patch = fields
    next_patch = str(int(patch) + 1)
    return ".".join((major, minor, next_patch))
26 |
27 |
def create_new_patch_release():
    """Create a new patch release on GitHub.

    The new version is the last release's version with its patch number
    bumped. When ``gh`` reports ``HTTP 404:`` (no release exists yet) the
    first version ``0.0.1`` is used instead; any other ``gh`` failure is
    re-raised unchanged.
    """
    try:
        last_version_number = get_last_version()
    except subprocess.CalledProcessError as err:
        no_release_yet = err.stderr.decode("utf8").startswith("HTTP 404:")
        if not no_release_yet:
            raise
        # The project doesn't have any releases yet.
        new_version_number = "0.0.1"
    else:
        new_version_number = bump_patch_number(last_version_number)

    release_command = [
        "gh", "release", "create", "--generate-notes", new_version_number,
    ]
    subprocess.run(release_command, check=True)


if __name__ == "__main__":
    create_new_patch_release()
49 |
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Publish to PyPI.org
2 | on:
3 | release:
4 | types: [published]
5 | jobs:
6 | pypi:
7 | runs-on: ubuntu-latest
8 | steps:
9 | - name: Checkout
10 | uses: actions/checkout@v3
11 | with:
12 | fetch-depth: 0
13 | - run: python3 -m pip install --upgrade build && python3 -m build
14 | - name: Publish package
15 | uses: pypa/gh-action-pypi-publish@release/v1
16 | with:
17 | password: ${{ secrets.PYPI_API_TOKEN_CELLDANCER }}
18 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | name: Create a new patch release
2 | on: workflow_dispatch
3 | jobs:
4 | github:
5 | runs-on: ubuntu-latest
6 | steps:
7 | - name: Checkout
8 | uses: actions/checkout@v3
9 | - name: Create new patch release
10 | run: .github/scripts/release.py
11 | env:
12 | GITHUB_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
13 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | .Python
10 | .eggs/
11 | *.egg-info/
12 |
13 | # PyInstaller
14 | *.manifest
15 | *.spec
16 | build/
17 |
18 | # Installer logs
19 | pip-log.txt
20 | pip-delete-this-directory.txt
21 |
22 | # Unit test / coverage reports
23 | .cache
24 |
25 | # Sphinx documentation
26 | docs/_build/
27 |
28 | # Emacs, vim
29 | .#*
30 | *.swp
31 |
32 | # Notebook Checkpoints
33 | .ipynb_checkpoints/
34 |
35 |
36 | # Mac specific
37 | .DS_Store
38 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2022, Wang Lab
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | 3. Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include readme.rst
2 | include readme_pypi.rst
3 | include LICENSE
--------------------------------------------------------------------------------
/_static/training_progress.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuangyuWangLab2021/cellDancer/fed4c0db1bf7a7314000128b0311c37301fca1d9/_static/training_progress.png
--------------------------------------------------------------------------------
/dist/celldancer-1.1.4-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuangyuWangLab2021/cellDancer/fed4c0db1bf7a7314000128b0311c37301fca1d9/dist/celldancer-1.1.4-py3-none-any.whl
--------------------------------------------------------------------------------
/dist/celldancer-1.1.4.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuangyuWangLab2021/cellDancer/fed4c0db1bf7a7314000128b0311c37301fca1d9/dist/celldancer-1.1.4.tar.gz
--------------------------------------------------------------------------------
/dist/celldancer-1.1.7-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuangyuWangLab2021/cellDancer/fed4c0db1bf7a7314000128b0311c37301fca1d9/dist/celldancer-1.1.7-py3-none-any.whl
--------------------------------------------------------------------------------
/dist/celldancer-1.1.7.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuangyuWangLab2021/cellDancer/fed4c0db1bf7a7314000128b0311c37301fca1d9/dist/celldancer-1.1.7.tar.gz
--------------------------------------------------------------------------------
/readme.rst:
--------------------------------------------------------------------------------
1 | cellDancer - Estimating Cell-dependent RNA Velocity
2 | ===========================================================================================
3 |
4 | **cellDancer** is a modularized, parallelized, and scalable tool based on a deep learning framework for the RNA velocity analysis of scRNA-seq. Our website of tutorials is available at `cellDancer Website <https://guangyuwanglab2021.github.io/cellDancer_website/>`_.
5 |
6 |
7 | .. image:: _static/training_progress.png
8 | :width: 100%
9 | :alt: cell_type_u_s_sample_df
10 |
11 | Cite
12 |
13 | Shengyu Li#, Pengzhi Zhang#, Weiqing Chen, Lingqun Ye, Kristopher W. Brannan, Nhat-Tu Le, Jun-ichi Abe, John P. Cooke, Guangyu Wang. A relay velocity model infers cell-dependent RNA velocity. Nature Biotechnology (2023) https://doi.org/10.1038/s41587-023-01728-5
14 |
15 | cellDancer's key applications
16 | ========================================================
17 | * Enable accurate inference of dynamic cell state transitions in heterogeneous cell populations.
18 | * Estimate cell-specific transcription (α), splicing (β) and degradation (γ) rates for each gene and reveal RNA turnover strategies.
19 | * Improve downstream analysis such as vector field predictions.
20 |
21 | To be done
22 | ========================================================
23 | - [ ] Update an anndata-compatible version.
24 |
25 | What's new
26 | ========================================================
27 | cellDancer is updated to v1.1.7
28 |
29 | * Added progress bar for adata_to_df_with_embed() and adata_to_raw().
30 | * Added try except to catch genes with low quality in velocity().
31 |
32 | Installation
33 | ========================================================
34 | cellDancer requires Python version >= 3.7.6 to run.
35 |
36 | To run cellDancer locally, we recommend creating a `conda `_ environment: ``conda create -n cellDancer python==3.7.6``. Then activate the new environment with ``conda activate cellDancer``. The cellDancer package can be installed from PyPI with ``pip install celldancer``.
37 |
38 | Python 3.7 is not compatible with M1 Mac. On an M1 Mac, use ``conda create -n cellDancer python==3.9.16`` instead; this is the version that has been well tested to run cellDancer on M1 Mac.
39 |
40 | To install the latest version from GitHub, run:
41 |
42 | ``pip install git+https://github.com/GuangyuWangLab2021/cellDancer.git``
43 |
44 | To install cellDancer from source code, run:
45 |
46 | ``pip install 'your_path/Source Code/cellDancer'``.
47 |
48 | M1 Mac users who encounter a problem while installing bezier should refer to the following link: https://bezier.readthedocs.io/en/2021.2.12/#installing
49 |
50 | If any other dependency could not be installed with ``pip install celldancer``, try ``pip install --no-deps celldancer``. Then install the dependencies by ``pip install -r requirements.txt`` or manually install each package in requirements.txt.
51 |
52 | To be compatible with Dynamo (optional), after first ``pip install celldancer`` and then ``pip install dynamo-release``, installing Dynamo will update numpy to 1.24.0, and we can downgrade numpy back to 1.20.0 with ``pip install numpy==1.20.0`` to let them be compatible.
53 |
54 | Frequently asked questions
55 | ========================================================
56 | Q: How should I prepare the input for my own data?
57 |
58 | A: The `Data Preparation `_ page introduces the details of how to prepare and pre-process your own data.
59 |
60 | Check more frequently asked questions at `FAQ `_ on our website. If you have any other question related to your specific condition, you are welcome to post it on our GitHub `issue `_ page or email sli5@houstonmethodist.org
61 |
62 | Support
63 | ========================================================
64 | Welcome bug reports and suggestions to our GitHub issue page!
65 |
--------------------------------------------------------------------------------
/readme_pypi.rst:
--------------------------------------------------------------------------------
1 | cellDancer - Estimating Cell-dependent RNA Velocity
2 | ===========================================================================================
3 |
4 | **cellDancer** is a modularized, parallelized, and scalable tool based on a deep learning framework for the RNA velocity analysis of scRNA-seq. Our website of tutorials is available at `cellDancer Website <https://guangyuwanglab2021.github.io/cellDancer_website/>`_.
5 |
6 |
7 | cellDancer's key applications
8 | ========================================================
9 | * Estimate cell-specific RNA velocity for each gene.
10 | * Derive cell fates in embedding space.
11 | * Estimate pseudotime for each cell in embedding space.
12 |
13 | What's new
14 | ========================================================
15 | cellDancer is updated to v1.1.7
16 |
17 | * Added progress bar for adata_to_df_with_embed() and adata_to_raw().
18 | * Added try except to catch genes with low quality in velocity().
19 |
20 | Installation
21 | ========================================================
22 | cellDancer requires Python version >= 3.7.6 to run.
23 |
24 | To run cellDancer locally, create a `conda `_ or `Anaconda `_ environment with ``conda create -n cellDancer python==3.7.6``, and activate the new environment with ``conda activate cellDancer``. cellDancer can then be installed with ``pip install celldancer``.
25 |
26 | To install cellDancer from source code, run:
27 | ``pip install 'your_path/Source Code/cellDancer'``.
28 |
29 | M1 Mac users who encounter a problem while installing bezier should refer to the following link:
30 | https://bezier.readthedocs.io/en/2021.2.12/#installing
31 |
32 | If any other dependency could not be installed with ``pip install celldancer``, try ``pip install --no-deps celldancer``. Then install the dependencies by ``pip install -r requirements.txt``.
33 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pytorch-lightning==1.5.2
2 | torch==1.10.0
3 | pandas==1.3.4
4 | numpy==1.20.3
5 | anndata==0.8.0
6 | tqdm==4.62.3
7 | scikit-learn==1.0.1
8 | scipy==1.7.2
9 | joblib==1.1.0
10 | scikit-image==0.19.2
11 | statsmodels==0.13.1
12 | matplotlib==3.5.3
13 | seaborn==0.11.2
14 | datashader==0.14.0
15 | bezier==2021.2.12
16 | umap-learn==0.5.2
17 | jupyterlab
18 | setuptools==59.5.0
19 | setuptools-scm==6.3.2
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import setuptools

# Links shown in the sidebar of the PyPI project page.
project_urls = {
  'cellDancer': 'https://github.com/GuangyuWangLab2021/cellDancer',
  'Documentation':'https://guangyuwanglab2021.github.io/cellDancer_website/'
}

# The PyPI long description is kept in its own reST file.
with open("readme_pypi.rst", "rt", encoding="utf8") as f:
    long_description = f.read()

setuptools.setup(
    name="celldancer",
    version="1.1.7",
    author="Wang Lab",
    author_email="gwang2@houstonmethodist.org",
    description="Study RNA velocity through neural network.",
    long_description=long_description,
    long_description_content_type="text/x-rst; charset=UTF-8",
    classifiers=[
        "Programming Language :: Python :: 3",
        # The repository's LICENSE file is BSD 3-Clause, not MIT.
        "License :: OSI Approved :: BSD License",
        "Operating System :: OS Independent",
    ],
    project_urls = project_urls,
    package_dir={"": "src"},
    packages=setuptools.find_packages(where="src"),
    # Ship the pre-trained model weights stored under <package>/model/.
    package_data={'': ['model/*.pt']},
    include_package_data=True,
    python_requires=">=3.7.6",
    install_requires = ['pytorch-lightning==1.5.2',
                        'torch==1.10.0',
                        'pandas==1.3.4',
                        'numpy==1.20.3',
                        'anndata==0.8.0',
                        'tqdm==4.62.3',
                        'scikit-learn==1.0.1',
                        'scipy==1.7.2',
                        'joblib==1.1.0',
                        'scikit-image==0.19.2',
                        'statsmodels==0.13.1',
                        'matplotlib==3.5.3',
                        'seaborn==0.11.2',
                        'datashader==0.14.0',
                        'bezier==2021.2.12',
                        'umap-learn==0.5.2',
                        'jupyterlab',
                        'setuptools==59.5.0',
                        'setuptools-scm==6.3.2'
                        ]
)
51 |
52 |
--------------------------------------------------------------------------------
/src/celldancer/.Rapp.history:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuangyuWangLab2021/cellDancer/fed4c0db1bf7a7314000128b0311c37301fca1d9/src/celldancer/.Rapp.history
--------------------------------------------------------------------------------
/src/celldancer/__init__.py:
--------------------------------------------------------------------------------
1 | from . import *
2 | from .velocity_estimation import velocity
3 | from .pseudo_time import pseudo_time
4 | from .compute_cell_velocity import compute_cell_velocity
5 | from .embedding_kinetic_para import embedding_kinetic_para
6 | from .utilities import adata_to_df_with_embed
7 | from .utilities import to_dynamo
8 | from .utilities import export_velocity_to_dynamo
9 | from .simulation import simulate
10 | from . import cdplt
11 |
# Names exported by ``from celldancer import *``.
# ("simulation" used to be listed twice; each name now appears once.)
__all__ = [
    "cdplt",
    "velocity_estimation",
    "pseudo_time",
    "diffusion",
    "compute_cell_velocity",
    "simulation",
    "embedding_kinetic_para",
    "sampling",
    "utilities",
]
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/src/celldancer/cdplt.py:
--------------------------------------------------------------------------------
1 | from celldancer.plotting import *
2 |
--------------------------------------------------------------------------------
/src/celldancer/compute_cell_velocity.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import random
4 | import pandas as pd
5 | import numpy as np
6 | from sklearn.neighbors import NearestNeighbors
7 | import matplotlib.pyplot as plt
8 |
9 |
10 | if __name__ == "__main__":
11 | sys.path.append('.')
12 | from sampling import *
13 | else:
14 | try:
15 | from .sampling import *
16 | except ImportError:
17 | from sampling import *
18 |
19 |
def compute_cell_velocity(
    cellDancer_df,
    gene_list=None,
    speed_up=(60,60),
    expression_scale=None,
    projection_neighbor_size=200,
    projection_neighbor_choice='embedding'):

    """Project the RNA velocity onto the embedding space.

    Arguments
    ---------
    cellDancer_df: `pandas.DataFrame`
        Dataframe of velocity estimation results. Columns=['cellIndex', 'gene_name', 'unsplice', 'splice', 'unsplice_predict', 'splice_predict', 'alpha', 'beta', 'gamma', 'loss', 'cellID', 'clusters', 'embedding1', 'embedding2']
    gene_list: optional, `list` (default: None)
        Genes selected to calculate the cell velocity. `None` if all genes in the cellDancer_df are to be used.
    speed_up: optional, `tuple` (default: (60,60))
        Speed up by giving the sampling grid to downsample cells.
        `None` if all cells are used to compute cell velocity.
    expression_scale: optional, `str` (default: None)
        `None` if no expression scale is to be used.
        `'power10'` if the 10th power is used to scale spliced and unspliced reads.
    projection_neighbor_size: optional, `int` (default: 200)
        The number of neighboring cells used for the transition probability matrix for one cell.
    projection_neighbor_choice: optional, `str` (default: 'embedding')
        `'embedding'` if using the embedding space to obtain the neighbors.
        `'gene'` if using the spliced reads of all genes to obtain the neighbors.

    Returns
    -------
    cellDancer_df: `pandas.DataFrame`
        A new dataframe restricted to rows with non-null 'alpha'/'beta' and to
        the genes in `gene_list`, with additional columns
        ['velocity1', 'velocity2'] (NaN for cells that were not sampled) and
        the helper column 'index' added by `data_reshape`.
    """

    def velocity_correlation(cell_matrix, velocity_matrix):
        """Calculate the correlation between the predict velocity (velocity_matrix[:,i])
        and the difference between a cell and every other (cell_matrix - cell_matrix[:, i])

        Arguments
        ---------
        cell_matrix: np.ndarray (ngenes, ncells)
            gene expression matrix
        velocity_matrix: np.ndarray (ngenes, ncells)
        Return
        ---------
        c_matrix: np.ndarray (ncells, ncells)
        """
        c_matrix = np.zeros((cell_matrix.shape[1], velocity_matrix.shape[1]))
        for i in range(cell_matrix.shape[1]):
            c_matrix[i, :] = corr_coeff(cell_matrix, velocity_matrix, i)[0, :]
        # A cell has no displacement relative to itself.
        np.fill_diagonal(c_matrix, 0)
        return c_matrix


    def velocity_projection(cell_matrix, velocity_matrix, embedding, knn_embedding):
        '''
        Turn per-gene velocities into a 2D velocity for each cell.

        cell_matrix: np.ndarray (ngenes, ncells)
            gene expression matrix
        velocity_matrix: np.ndarray (ngenes, ncells)
        embedding: np.ndarray (ncells, 2)
        knn_embedding: neighbor graph exposing `.A` and `.sum(1)`
            (presumably a scipy sparse matrix -- confirm against
            `downsampling_embedding`).

        NaNs in cell_matrix and velocity_matrix are replaced by 0 in place.
        '''
        sigma_corr = 0.05
        cell_matrix[np.isnan(cell_matrix)] = 0
        velocity_matrix[np.isnan(velocity_matrix)] = 0
        corrcoef = velocity_correlation(cell_matrix, velocity_matrix)
        # Densify the neighbor graph once; it is needed twice below.
        knn_dense = knn_embedding.A
        # Transition weights restricted to the neighbors, normalised per row.
        probability_matrix = np.exp(corrcoef / sigma_corr)*knn_dense
        probability_matrix /= probability_matrix.sum(1)[:, None]
        # Unit vectors between every pair of cells in the embedding.
        unitary_vectors = embedding.T[:, None, :] - embedding.T[:, :, None]
        with np.errstate(divide='ignore', invalid='ignore'):
            unitary_vectors /= np.linalg.norm(unitary_vectors, ord=2, axis=0)
            # The self-to-self vector is 0/0 = NaN; reset it to 0.
            np.fill_diagonal(unitary_vectors[0, ...], 0)
            np.fill_diagonal(unitary_vectors[1, ...], 0)
        velocity_embedding = (probability_matrix * unitary_vectors).sum(2)
        # Subtract the mean unit vector over the neighbors.
        velocity_embedding -= (knn_dense * unitary_vectors).sum(2) / \
            knn_embedding.sum(1).A.T
        velocity_embedding = velocity_embedding.T
        return velocity_embedding

    # remove invalid prediction
    is_NaN = cellDancer_df[['alpha','beta']].isnull()
    row_has_NaN = is_NaN.any(axis=1)
    cellDancer_df = cellDancer_df[~row_has_NaN].reset_index(drop=True)

    # Discard velocities from a previous run; they are recomputed below.
    cellDancer_df = cellDancer_df.drop(
        columns=['velocity1', 'velocity2'], errors='ignore')

    if gene_list is None:
        gene_list=cellDancer_df.gene_name.drop_duplicates()


    # This creates a new dataframe
    cellDancer_df_input = cellDancer_df[cellDancer_df.gene_name.isin(gene_list)].reset_index(drop=True)
    np_splice_all, np_dMatrix_all= data_reshape(cellDancer_df_input)
    n_genes, n_cells = np_splice_all.shape

    # This creates a new dataframe
    data_df = cellDancer_df_input.loc[:,
            ['gene_name', 'unsplice', 'splice', 'cellID','embedding1', 'embedding2']]
    embedding_downsampling, sampling_ixs, knn_embedding = downsampling_embedding(data_df,
                                                                para='neighbors',
                                                                target_amount=0,
                                                                step=speed_up,
                                                                n_neighbors=projection_neighbor_size,
                                                                projection_neighbor_choice=projection_neighbor_choice,
                                                                expression_scale=expression_scale,
                                                                pca_n_components=None,
                                                                umap_n=None,
                                                                umap_n_components=None)


    # projection_neighbor_choice only provides neighborlist, use embedding(from raw data) to compute cell velocity
    # Take the first gene positionally: `gene_list[0]` would be a label
    # lookup (and may raise KeyError) when gene_list is a pandas Series
    # whose index does not contain 0.
    first_gene = list(gene_list)[0]
    embedding = cellDancer_df_input[cellDancer_df_input.gene_name ==
                                    first_gene][['embedding1', 'embedding2']]
    embedding = embedding.to_numpy()
    velocity_embedding = velocity_projection(
            np_splice_all[:, sampling_ixs],
            np_dMatrix_all[:, sampling_ixs],
            embedding[sampling_ixs, :],
            knn_embedding)

    # Write the velocity of every sampled cell onto each of its gene rows.
    sampling_ixs_all_genes = cellDancer_df_input[cellDancer_df_input.cellIndex.isin(sampling_ixs)].index
    cellDancer_df_input.loc[sampling_ixs_all_genes,'velocity1'] = np.tile(velocity_embedding[:,0], n_genes)
    cellDancer_df_input.loc[sampling_ixs_all_genes,'velocity2'] = np.tile(velocity_embedding[:,1], n_genes)
    return cellDancer_df_input
155 |
def corr_coeff(ematrix, vmatrix, i):
    '''
    Calculate the correlation between the predict velocity (velocity_matrix[:,i])
    and the displacement between a cell and every other (cell_matrix - cell_matrix[:, i])

    Arguments
    ---------
    ematrix: np.ndarray (ngenes, ncells)
        cell_matrix
    vmatrix: np.ndarray (ngenes, ncells)
        velocity_matrix
    i: int
        index of the reference cell

    Return
    ------
    np.ndarray (1, ncells)
        Correlation of cell i's velocity with the displacement to each cell.
        Entries whose correlation is undefined (zero variance in the
        displacement or in the velocity) are 0.
    '''
    ematrix = ematrix.T
    vmatrix = vmatrix.T
    ematrix = ematrix - ematrix[i, :]
    vmatrix = vmatrix[i, :][None, :]
    # Center each row over the genes.
    ematrix_m = ematrix - ematrix.mean(1)[:, None]
    vmatrix_m = vmatrix - vmatrix.mean(1)[:, None]

    # Sum of squares across rows
    ematrix_ss = (ematrix_m**2).sum(1)
    vmatrix_ss = (vmatrix_m**2).sum(1)
    cor = np.dot(ematrix_m, vmatrix_m.T)
    N = np.sqrt(np.dot(ematrix_ss[:, None], vmatrix_ss[None]))
    # `where=` without `out=` leaves the masked entries uninitialised, so
    # supply a zero-filled output for the N == 0 positions.
    safe = np.zeros(cor.shape, dtype=np.result_type(cor.dtype, N.dtype))
    cor = np.divide(cor, N, out=safe, where=N != 0)
    return cor.T
177 |
178 |
def data_reshape(cellDancer_df): # pengzhi version
    '''
    Reshape the long-format detail table into (ngenes, ncells) matrices.

    A column 'index' (0..ncells-1 repeated for every gene) is added to
    `cellDancer_df` in place and used as the cell axis of the pivot.
    The number of cells is taken from the first gene in the table.

    Returns the spliced expression matrix and the signed square-root
    scaled difference (splice_predict - splice), both (ngenes, ncells)
    with genes in the order produced by the pivot.
    '''
    pseudocount = 1
    genes = cellDancer_df['gene_name'].drop_duplicates().to_list()
    n_cells = int((cellDancer_df['gene_name'] == genes[0]).sum())
    cellDancer_df['index'] = np.tile(np.arange(n_cells), len(genes))

    def _genes_by_cells(column):
        # One row per gene, one column per cell.
        return cellDancer_df.pivot(
            index='gene_name', columns='index', values=column)

    observed = _genes_by_cells('splice')
    displacement = _genes_by_cells('splice_predict') - observed

    splice_matrix = observed.to_numpy(copy=True)
    delta = displacement.to_numpy(copy=True)
    scaled_delta = np.sqrt(np.abs(delta) + pseudocount) * np.sign(delta)
    return splice_matrix, scaled_delta
200 |
201 |
--------------------------------------------------------------------------------
/src/celldancer/diffusion.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | import os
5 | import sys
6 | import random
7 | import multiprocessing as mp
8 |
9 | import numpy as np
10 | from sklearn import preprocessing
11 |
12 | import matplotlib as mpl
13 | import matplotlib.pyplot as plt
14 |
def embedding_normalization(cell_embedding, embedding=None, mode="minmax", NORM_ALL_CELLS=False):
    '''
    Normalize the embedding with a scaler fitted on `cell_embedding`.

    Parameters
    ----------
    cell_embedding: 2D numpy array (n_cells, 2)
        Embedding the scaler is fitted on.
    embedding: 2D numpy array, optional
        Embedding of all cells; only used when NORM_ALL_CELLS is True.
    mode: string
        'minmax' (default) uses MinMaxScaler;
        'max', 'maximum' or 'maxabs' use MaxAbsScaler.
        maxabs is meant for sparse data and/or centered at 0.
        Note in this program (ML velocity), it is pretty safe to do maxabs normalization
        since the data are free of extreme outliers.
    NORM_ALL_CELLS: bool
        If True, also transform `embedding` with the same scaler.

    Returns
    -------
    The normalized `cell_embedding`, or the tuple
    (normalized cell_embedding, normalized embedding) when NORM_ALL_CELLS.

    Raises
    ------
    ValueError
        If `mode` is not one of the supported names, or if the scaler
        rejects `embedding` when NORM_ALL_CELLS is True.
    '''
    maxabs_modes = ('max', 'maximum', 'maxabs')
    minmax_modes = ('minmax',)
    # Fail early with a clear message instead of an UnboundLocalError below.
    if mode not in maxabs_modes and mode not in minmax_modes:
        raise ValueError(
            "Unknown normalization mode %r; expected one of %s."
            % (mode, maxabs_modes + minmax_modes))

    if mode in maxabs_modes:
        transformer = preprocessing.MaxAbsScaler().fit(cell_embedding)
    else:
        transformer = preprocessing.MinMaxScaler().fit(cell_embedding)
    em = transformer.transform(cell_embedding)
    if NORM_ALL_CELLS:
        try:
            em_all = transformer.transform(embedding)
        except ValueError:
            print("ERROR! Missing embedding for all cells.")
            raise
        return em, em_all
    else:
        return em
43 |
def velocity_normalization(downsampled_vel, all_vel=None, mode="max", NORM_ALL_CELLS=False):
    '''
    Normalize by the maximum absolute value in the downsampled_vel.

    Before scaling, every non-zero velocity is lengthened by
    0.1 * std(|v|) along its own direction; zero velocities stay zero.

    Parameters
    ----------
    downsampled_vel: 2D numpy array (n_cells, 2)
    all_vel: 2D numpy array, optional
        Velocities of all cells; only used when NORM_ALL_CELLS is True.
    mode: 'max', 'maximum' or 'maxabs'

        maxabs is meant for sparse data and/or centered at 0.

        Note in this program, it is pretty safe to do maxabs normalization
        since the data are free of extreme outliers.
    NORM_ALL_CELLS: bool
        If True, also transform `all_vel` with the same scaler.

    Returns
    -------
    The normalized downsampled velocities, or the tuple
    (normalized downsampled_vel, normalized all_vel) when NORM_ALL_CELLS.

    Raises
    ------
    ValueError
        If `mode` is not one of the supported names.
    '''
    supported_modes = ('max', 'maximum', 'maxabs')
    # Fail early with a clear message instead of an UnboundLocalError below.
    if mode not in supported_modes:
        raise ValueError(
            "Unknown normalization mode %r; expected one of %s."
            % (mode, supported_modes))

    # add v_prime to vel of each cell without changing their directions.
    v_mag = np.linalg.norm(downsampled_vel, axis=1)
    v_prime = 0.1*np.std(v_mag)

    # for 0 velocity cell, nothing changed.
    # `out=` is required: without it the entries where v_mag == 0 would be
    # uninitialised memory rather than 0.
    v_prime = np.divide(v_prime, v_mag, out=np.zeros_like(v_mag), where=v_mag > 0)
    downsampled_vel = downsampled_vel*(v_prime + 1)[:,None]

    transformer = preprocessing.MaxAbsScaler().fit(downsampled_vel)
    em = transformer.transform(downsampled_vel)
    if NORM_ALL_CELLS:
        em_all = transformer.transform(all_vel)
        return em, em_all
    else:
        return em
75 |
76 |
def discretize(coordinate, xmin, xmax, n_grids, capping=False):
    '''
    Map coordinates to grid indices and to the centers of their grid cells.

    Parameters
    ----------
    coordinate: numpy array of coordinates
    xmin, xmax: lower and upper bounds of the grid in each dimension
    n_grids: number of grid cells in each dimension
    capping: bool
        If True, indices above n_grids are set to n_grids and negative
        indices are set to 0.

    Return
    ------
    grid_idx: integer (np.int64) grid index of each coordinate
    grid_coor: coordinate of the center of the corresponding grid cell
    '''
    cell_width = (np.array(xmax) - np.array(xmin)) / np.array(n_grids)

    raw_index = np.floor((coordinate - xmin) / cell_width)
    grid_idx = np.int64(raw_index)

    if capping:
        too_large = grid_idx > n_grids
        grid_idx = np.where(too_large, n_grids, grid_idx)
        negative = grid_idx < 0
        grid_idx = np.where(negative, 0, grid_idx)

    centers = xmin + cell_width * (grid_idx + 0.5)
    return grid_idx, centers
91 |
92 |
def generate_grid(
    cell_embedding,
    embedding,
    velocity_embedding,
    abr_umap = None,
    n_grids = None):
    '''
    Bin cells onto a regular grid spanning the range of `cell_embedding`.

    Parameters
    ----------
    cell_embedding: numpy array (n_cells, 2)
        Embedding of the cells that carry a velocity; defines the grid range.
    embedding: numpy array (n_all_cells, 2)
        Embedding of all cells, used for the mass (cell count) per grid.
    velocity_embedding: numpy array (n_cells, 2)
        Velocity of each cell in `cell_embedding`.
    abr_umap: numpy array, optional
        Per-cell values (one row per cell of `embedding`) accumulated per grid.
    n_grids: sequence of int
        Number of grids in each dimension. The arrays returned have
        n_grids+1 entries per dimension.

    Return
    ------
    mesh: mean velocity of the cells in each grid (0 where there is none)
    mass: number of cells of `embedding` in each grid
    grid_umap: accumulated `abr_umap` divided by mass, or None
    cell_grid_idx, cell_grid_coor: grid index / grid center of each cell
        in `cell_embedding`
    all_cells_grid_idx, all_cells_grid_coor: the same for `embedding`
    '''

    xmin = np.min(cell_embedding, axis=0)
    xmax = np.max(cell_embedding, axis=0)
    n_grids = np.array(n_grids, dtype=int)

    cell_grid_idx, cell_grid_coor = discretize(cell_embedding,
                                               xmin=xmin,
                                               xmax=xmax,
                                               n_grids=n_grids)

    # The actual n_grids need to allow a leeway +1 in each dimension.
    mesh = np.zeros(np.append(n_grids+1,len(n_grids)))

    cnt = np.zeros(n_grids+1)
    for index in range(cell_grid_idx.shape[0]):
        grid_index = cell_grid_idx[index]
        if np.any(grid_index > n_grids) or np.any(grid_index < 0):
            continue
        grid_index = toTuple(grid_index)
        mesh[grid_index] += velocity_embedding[index]
        cnt[grid_index] += 1
    cnt = cnt[:,:,None]
    # Average the summed velocities; grids without cells stay 0.
    mesh = np.divide(mesh, cnt, out=np.zeros_like(mesh), where=cnt>0.1)

    # the all cell embedding is used to generate mass
    mass = np.zeros(n_grids+1)
    all_cells_grid_idx, all_cells_grid_coor = \
        discretize(embedding, xmin=xmin, xmax=xmax, n_grids=n_grids)
    n_cells = all_cells_grid_idx.shape[0]

    for index in range(n_cells):
        all_cells_grid_index = all_cells_grid_idx[index]

        # mass outside the grid is not needed.
        if np.any(all_cells_grid_index > n_grids) or np.any(all_cells_grid_index < 0):
            continue
        all_cells_grid_index = toTuple(all_cells_grid_index)
        mass[all_cells_grid_index] += 1

    # the all cell embedding is used to generate grid_umap
    if abr_umap is not None:
        grid_umap = np.full_like(mesh, np.nan)
        n_umap_dims = all_cells_grid_idx.shape[-1]
        for index in range(n_cells):
            all_cells_grid_index = all_cells_grid_idx[index]
            # Cells outside the grid are skipped, as in the mass loop above.
            # (Previously they fell through and indexed the grid anyway,
            # raising IndexError or wrapping around for negative indices.)
            if np.any(all_cells_grid_index > n_grids) or np.any(all_cells_grid_index < 0):
                continue
            all_cells_grid_index = toTuple(all_cells_grid_index)
            if np.any(np.isnan(grid_umap[all_cells_grid_index])):
                # NOTE(review): the first cell seen in a grid resets the
                # grid to 0 and its own abr_umap value is not added, while
                # mass counts it. Kept as-is; confirm whether intended.
                grid_umap[all_cells_grid_index] = np.full((1,n_umap_dims), 0)
            else:
                grid_umap[all_cells_grid_index] += abr_umap[index,:]

        # divide by 0 does not happen
        # because where-ever mass is 0, grid_umap is nan. nan/0 -> nan
        grid_umap = np.divide(grid_umap, mass[:,:,None])

    else:
        grid_umap = None

    return mesh, mass, grid_umap, \
        cell_grid_idx, cell_grid_coor, all_cells_grid_idx, all_cells_grid_coor
163 |
164 |
def toTuple(arr):
    '''
    Recursively convert an iterable into nested tuples.

    Parameters
    ----------
    arr: numpy ndarray or list

    Return
    ------
    A tuple (of nested tuples); anything whose conversion raises
    TypeError (e.g. a non-iterable scalar) is returned unchanged.

    '''

    try:
        converted = []
        for element in arr:
            converted.append(toTuple(element))
    except TypeError:
        return arr
    return tuple(converted)
181 |
182 |
def compute_path_divider_matrix(fmat, cutoff=0.3):
    '''
    Compare the feature vector of every grid point with that of every
    other grid point.

    Parameters
    ----------
    fmat: numpy ndarray (*grid_shape, n_features)
        One feature vector per grid point; the last axis holds the features.
        Any number of leading grid axes is accepted.
    cutoff: float
        Euclidean distance threshold.

    Return
    ------
    Boolean numpy ndarray of shape grid_shape + grid_shape. The entry for a
    pair of grid points is True when the distance between their feature
    vectors is strictly below `cutoff`. A distance involving NaN compares
    False.

    Note: the intermediate difference array has
    (n_points x n_points x n_features) entries.
    '''
    print("The cutoff for banning a path is ", cutoff)

    ngrids = fmat.shape[:-1]
    # np.prod handles any number of grid axes (np.multiply(*ngrids) only
    # accepted exactly two).
    flat_length = int(np.prod(ngrids))
    flat = fmat.reshape(flat_length, fmat.shape[-1])

    # pairwise[a, b] = || flat[b] - flat[a] ||
    pairwise = np.linalg.norm(flat - flat[:, None], axis=-1)

    path_divider_matrix = pairwise.reshape(ngrids + ngrids) < cutoff
    return path_divider_matrix
198 |
199 |
def plot_velocity(embedding, velocity_embedding):
    '''
    Show a quiver plot with one blue arrow per row: the arrow starts at the
    first two columns of `embedding` and points along the first two columns
    of `velocity_embedding`. Opens a new 6x6 figure and calls plt.show().
    '''
    fig, ax = plt.subplots(figsize=(6,6))
    start_x, start_y = embedding[:, 0], embedding[:, 1]
    arrow_u, arrow_v = velocity_embedding[:,0], velocity_embedding[:,1]
    plt.quiver(start_x, start_y, arrow_u, arrow_v, color='Blue')
    plt.show()
206 |
def plot_mesh_velocity(mesh, grid_mass):
    '''
    Show the gridded velocity field as red arrows drawn on top of a
    grey-scale image of the (transposed) grid mass.

    Parameters
    ----------
    mesh
        array indexed as mesh[i, j]; components 0 and 1 of each entry are
        used as the arrow direction at grid position (i, j)
    grid_mass
        2D array shown with imshow after transposition
    '''
    # Visit grid cells row by row, columns varying fastest.
    cells = [(i, j)
             for i in range(mesh.shape[0])
             for j in range(mesh.shape[1])]
    x = [i for i, _ in cells]
    y = [j for _, j in cells]
    vx = [mesh[i,j][0] for i, j in cells]
    vy = [mesh[i,j][1] for i, j in cells]

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.quiver(x,y,vx,vy,color='red',scale = 10)
    plt.imshow(grid_mass.T, interpolation=None, origin='lower',cmap="Greys")
    plt.show()
222 |
def velocity_add_random(velocity, theta):
    '''
    Rotate a 2D velocity by a random angle.

    One angle is drawn from the normal distribution N(0, theta) using the
    global numpy random state, and the velocity is rotated by that angle.
    A rotation does not change the magnitude of the velocity.

    Parameters
    ----------
    velocity
        velocity of the grid (2 components)
    theta
        standard deviation, in radians, of the random rotation angle

    WARNING
        the angle is normally distributed, so at a rare chance its
        magnitude could be much larger than theta.

    Return
    ------
    The rotated velocity

    '''
    angle = np.random.normal(0, theta, 1)

    cos_a = np.cos(angle)[0]
    sin_a = np.sin(angle)[0]

    # Row-vector convention: velocity @ rotation
    rotation = np.array([[cos_a, sin_a], [-sin_a, cos_a]])
    return np.dot(velocity, rotation)
256 |
def velocity_rotation(velocity, theta):
    '''
    Rotate a 2D velocity by the angle theta.

    With x pointing right and y pointing up, a positive theta turns the
    velocity counter-clockwise, e.g. (1, 0) rotated by pi/2 becomes (0, 1);
    a negative theta turns it clockwise.

    Parameters
    ----------
    velocity
        velocity of the grid (2 components)
    theta
        rotation angle in radians

    Return
    ------
    The rotated velocity

    '''
    cos_t, sin_t = np.cos(theta), np.sin(theta)

    # Row-vector convention: velocity @ rotation
    rotation = np.array([[cos_t, sin_t], [-sin_t, cos_t]])
    return np.dot(velocity, rotation)
280 |
281 |
def diffusion_off_grid_wallbound(
        cell_embedding,
        vel,
        init,
        grid_mass,
        dt = 0.001,
        t_total = 10000,
        eps = 1e-5,
        random_seed = None,
        pdm = None):

    '''
    Simulate the diffusion of a cell in the velocity field (off grid), i.e.
    the continuous position of the cell is recorded at every accepted step.

    Every iteration proposes the step x0 + v0*dt. If the grid cell reached
    by that step has mass <= MAX_IGNORED_MASS (or the mass lookup raises
    IndexError), the velocity is turned by 30 degrees towards a side that is
    not blocked and the step is proposed again on the next iteration.
    The velocity read from the grid receives a random rotation
    (velocity_add_random with THETA) each time it is looked up.

    The diffusion is stopped by any of the criteria:
    - reach t_total
    - the magnitude of the velocity is less than eps.
    - the cell goes to places where the cell mass <= MAX_IGNORED_MASS even
      after turning to either side.
    - looking up the velocity or mass at the new grid index raises
      IndexError (presumably: the cell left the simulation box -- this
      depends on what `discretize` returns outside the box; TODO confirm)

    Parameters
    ----------

    cell_embedding: numpy ndarray (n_cells x n_dims)
        embedding coordinate for all the cells (downsampled); only its
        per-axis minimum and maximum are used here

    vel: numpy ndarray, indexed as vel[i, j]
        pre-assigned velocity of each grid

    init: numpy ndarray
        The initial position (run_diffusion passes a single row of
        cell_embedding, optionally jittered)

    grid_mass: numpy ndarray, indexed as grid_mass[i, j]
        mass of cells.

    dt: float
        Step size of each integration time step

    t_total: int
        Total number of time steps

    eps
        Criterion to stop a trajectory before t_total (v_net < eps)

    random_seed: int or None
        Passed to np.random.seed at the start of the call; this reseeds
        the global numpy random state.

    pdm: numpy ndarray or None
        Path divider matrix. It is indexed with the grid index of the
        initial position followed by the grid index of the proposed
        position, and the step is only taken when that entry is truthy.
        None disables the check.

    Return
    ------
    a numpy ndarray of coordinates in the trajectory, shape:
    (real_n_time_steps, n_dims)
    '''

    np.random.seed(seed = random_seed)
    # print("random seed is set to, ", random_seed)
    # 30 degrees: std of the random kick and size of the wall turn
    THETA = np.pi/6

    XMIN = np.min(cell_embedding, axis=0)
    XMAX = np.max(cell_embedding, axis=0)
    N_GRIDS=(vel.shape[0]-1,vel.shape[1]-1)

    # Grid cells holding at most this much mass are treated as empty.
    # (An earlier variant used the 5th percentile of the nonzero mass:)
    #MAX_IGNORED_MASS= np.percentile(grid_mass[grid_mass>0], 5)
    MAX_IGNORED_MASS = 2

    def no_cells_around(xcur, xcur_d, vcur):
        # True when one step from xcur along vcur lands on a grid cell with
        # negligible mass, or when the mass lookup raises IndexError.
        # xcur_d is accepted but not used.
        xnxt = xcur + vcur*dt
        xnxt_d, dummy = discretize(xnxt, xmin=XMIN, xmax=XMAX, n_grids=N_GRIDS)
        try:
            mass = grid_mass[xnxt_d[0], xnxt_d[1]]
        except IndexError:
            return True
        return mass <= MAX_IGNORED_MASS

    x0 = init
    x0_d, dummy = discretize(x0, xmin=XMIN, xmax=XMAX, n_grids=N_GRIDS)
    v0 = vel[x0_d[0],x0_d[1]]
    v0 = velocity_add_random(v0, THETA)
    trajectory = [x0]

    for i in range(int(t_total)):

        if np.linalg.norm(v0) < eps:
            #print("Velocity is too small")
            return np.array(trajectory)
        if no_cells_around(x0, x0_d, v0):
            # Blocked straight ahead: try both 30-degree turns.
            v0_cc = velocity_rotation(v0, THETA)
            v0_c = velocity_rotation(v0, -THETA)

            # nowhere to go but null
            CC = no_cells_around(x0, x0_d, v0_cc)
            C = no_cells_around(x0, x0_d, v0_c)

            if CC and C:
                return np.array(trajectory)
            elif not C:
                # the -THETA turn is preferred when it is free
                v0 = v0_c
            else:
                v0 = v0_cc

        else:
            x = x0 + v0*dt
            x_d, dummy = discretize(x, xmin=XMIN, xmax=XMAX, n_grids=N_GRIDS)
            # NOTE(review): x0_d is computed once from `init` and is never
            # refreshed inside this loop, so the pdm lookup always starts
            # from the initial grid cell -- confirm this is intended.
            # NOTE(review): the nesting of the statements below under this
            # `if` is assumed; with it, a step rejected by pdm leaves x0
            # and v0 unchanged for the next iteration -- confirm.
            if (pdm is None) or (pdm[toTuple(x0_d)+toTuple(x_d)]):
                try:
                    v = vel[x_d[0],x_d[1]]
                    # only evaluated for its possible IndexError; the value
                    # itself is not used afterwards
                    mass = grid_mass[x_d[0],x_d[1]]
                    v = velocity_add_random(v, THETA)
                except IndexError:
                    break

                trajectory.append(x)
                x0 = x
                v0 = v

    return np.array(trajectory)
398 |
399 |
def diffusion_on_grid_wallbound(
        cell_embedding,
        vel,
        init,
        grid_mass,
        dt=0.001,
        t_total=10000,
        eps = 1e-5):

    '''
    same as diffusion_off_grid_wallbound, however, it returns the coordinates
    of the grid traversed by the cell, instead of the position of the cell.
    Each step also starts from the grid coordinate of the current grid
    cell. When the next step is blocked the velocity is turned by 90
    degrees (THETA is only used for the random kick here), and no
    random seed or path divider matrix is taken.

    The diffusion is stopped by any of the criteria:
    1. reach t_total
    2. the magnitude of the velocity is less than eps.
    3. the cell goes to places where the cell mass is below
       MAX_IGNORED_MASS (the 5th percentile of the nonzero grid mass)
       even after turning to either side.
    4. looking up the velocity at the new grid index raises IndexError
       (presumably: the cell is out of the simulation box -- depends on
       what `discretize` returns outside the box; TODO confirm)

    Parameters
    ----------

    cell_embedding: numpy ndarray (n_cells x n_dims)
        embedding coordinate for all the cells (downsampled); only its
        per-axis minimum and maximum are used here

    vel: numpy ndarray, indexed as vel[i, j]
        pre-assigned velocity of each grid

    init: numpy ndarray
        The initial position (presumably one row of cell_embedding, as for
        diffusion_off_grid_wallbound -- verify against caller)

    grid_mass: numpy ndarray, indexed as grid_mass[i, j]
        mass of cells.

    dt: float
        Step size of each integration time step

    t_total: int
        Total number of time steps

    eps
        Criterion to stop a trajectory before t_total (v_net < eps)


    Return
    ------
    a numpy ndarray of grid coordinates in the trajectory, shape:
    (real_n_time_steps, n_dims)
    '''

    # std (radians) of the random kick applied to every looked-up velocity
    THETA = np.pi/6

    XMIN = np.min(cell_embedding, axis=0)
    XMAX = np.max(cell_embedding, axis=0)
    N_GRIDS=(vel.shape[0]-1,vel.shape[1]-1)

    # Grid cells lighter than the 5th percentile of the nonzero mass are
    # treated as empty.
    MAX_IGNORED_MASS= np.percentile(grid_mass[grid_mass>0],5)

    def no_cells_around(xcur, xcur_d, vcur):
        # True when one step from xcur along vcur lands on a grid cell with
        # negligible mass, or when the mass lookup raises IndexError.
        # xcur_d is accepted but not used.
        xnxt = xcur + vcur*dt
        xnxt_d, dummy = discretize(xnxt, xmin=XMIN, xmax=XMAX, n_grids=N_GRIDS)
        try:
            mass = grid_mass[xnxt_d[0], xnxt_d[1]]
        except IndexError:
            return True
        return mass < MAX_IGNORED_MASS

    x0 = init
    x0_d, x0_d_coor = discretize(x0, xmin=XMIN, xmax=XMAX, n_grids=N_GRIDS)
    v0 = vel[x0_d[0],x0_d[1]]
    v0 = velocity_add_random(v0, THETA)
    trajectory = [x0_d_coor]

    for i in range(int(t_total)):

        if np.linalg.norm(v0) < eps:
            #print("Velocity is too small")
            return np.array(trajectory)
        if no_cells_around(x0_d_coor, x0_d, v0):
            # Blocked straight ahead: try both 90-degree turns.
            v0_cc = velocity_rotation(v0, np.pi/2)
            v0_c = velocity_rotation(v0, -np.pi/2)
            # nowhere to go but null
            CC = no_cells_around(x0_d_coor, x0_d, v0_cc)
            C = no_cells_around(x0_d_coor, x0_d, v0_c)
            if CC and C:
                return np.array(trajectory)
            elif not C:
                # the -90 degree turn is preferred when it is free
                v0 = v0_c
            else:
                v0 = v0_cc

        else:
            # Step from the grid coordinate, then snap back onto the grid.
            x = x0_d_coor + v0*dt
            x_d, x_d_coor = discretize(x, xmin=XMIN, xmax=XMAX, n_grids=N_GRIDS)
            try:
                v = vel[x_d[0],x_d[1]]
                v = velocity_add_random(v, THETA)
            except IndexError:
                break

            trajectory.append(x_d_coor)
            # NOTE(review): x0 is not read again after this assignment and
            # x0_d keeps its initial value; the loop state is carried by
            # x0_d_coor and v0 only.
            x0 = x_d
            x0_d_coor = x_d_coor
            v0 = v

    return np.array(trajectory)
507 |
508 |
def run_diffusion(
        cell_embedding,
        vel,
        grid_mass,
        dt,
        t_total = 10000,
        eps = 1e-5,
        off_cell_init = False,
        init_cell = [],
        n_repeats = 10,
        n_jobs = 8,
        psrng_seeds_diffusion = None,
        path_divider_matrix=None):
    '''
    Simulate many cell trajectories in the velocity field by running
    diffusion_off_grid_wallbound once per (initial cell, repeat) pair.
    The runs are independent and are distributed over a process pool.

    Parameters
    ----------

    cell_embedding: numpy.ndarray (n_cells, 2)
        embedding coordinate for all the cells (downsampled)

    vel: numpy.ndarray (ngrid, ngrid, 2)
        pre-assigned velocity of each grid

    grid_mass: numpy.ndarray
        mass of cells on the grid, forwarded to every simulation

    dt: float
        Step size of each integration time step

    t_total: int
        Total number of time steps

    eps: float
        Criterion to stop a trajectory before t_total (v_net < eps)

    off_cell_init: Boolean
        Whether to spawn initial coordinates from the neighbouring space
        around a cell (uniform jitter of at most half a grid cell per axis)

    init_cell: list or array of int
        Initial cell indices. If empty, use all cell indices in the given
        cell_embedding. The default list is never mutated.

    n_repeats: int
        Number of repeats (either on or off the cells)

    n_jobs: int
        Number of worker processes. Capped at the CPU count; a negative
        value means cpu_count + 1 + n_jobs.

    psrng_seeds_diffusion: list of int or None
        Random seeds, at least n_repeats of them. By default
        [11, 111, 211, ...] is used.

    path_divider_matrix: numpy.ndarray or None
        Forwarded to every simulation as `pdm`.

    Return
    ------
    a numpy array (dtype=object) of trajectories, one per
    (initial cell, repeat) pair
    '''
    import tqdm

    if psrng_seeds_diffusion is None:
        psrng_seeds_diffusion = [i*100+11 for i in range(n_repeats)]

    assert len(psrng_seeds_diffusion) >= n_repeats

    if n_jobs >= mp.cpu_count():
        n_jobs = mp.cpu_count()

    if n_jobs < 0:
        n_jobs = mp.cpu_count() + 1 + n_jobs

    n_cells = cell_embedding.shape[0]

    # len() instead of truthiness so that numpy index arrays are accepted.
    if init_cell is None or len(init_cell) == 0:
        init_cell = list(range(n_cells))

    embedding_range = cell_embedding.max(axis=0) - cell_embedding.min(axis=0)
    n_grids = np.array([vel.shape[0], vel.shape[1]])
    grid_size = embedding_range/n_grids

    # Setting up the TASKS: one argument tuple per trajectory.
    TASKS = list()
    n_trajs = 0
    for i in init_cell:
        for j in range(n_repeats):
            n_trajs += 1
            if off_cell_init:
                init_position = cell_embedding[i] + grid_size * np.random.uniform(-0.5,0.5,2)
            else:
                init_position = cell_embedding[i]
            # The caller's eps is forwarded (it used to be replaced by a
            # hard-coded 1e-5). The seed index is taken after the counter
            # was incremented, which keeps the seed assignment unchanged.
            TASKS.append((cell_embedding, vel, init_position, grid_mass, dt,
                          t_total, eps, psrng_seeds_diffusion[n_trajs % n_repeats],
                          path_divider_matrix))

    with mp.Pool(n_jobs) as pool:
        n_total = len(init_cell)*n_repeats
        if n_total > 5000:
            # progress bar only for large batches
            paths = pool.starmap(diffusion_off_grid_wallbound,
                                 tqdm.tqdm(TASKS, total=n_total,
                                           desc="Generating Trajectories",
                                           colour="blue")
                                 )
        else:
            paths = pool.starmap(diffusion_off_grid_wallbound, TASKS)
    return np.array(paths, dtype=object)
608 |
--------------------------------------------------------------------------------
/src/celldancer/embedding_kinetic_para.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import os
4 |
5 | os.environ['KMP_WARNINGS'] = '0'
6 |
def embedding_kinetic_para(
    cellDancer_df,
    kinetic_para,
    umap_n=25
):
    """Calculate the UMAP based on the kinetic parameter(s).

    Arguments
    ---------
    cellDancer_df: `pandas.DataFrame`
        Dataframe of velocity estimation results. Columns=['cellIndex', 'gene_name', 'unsplice', 'splice', 'unsplice_predict', 'splice_predict', 'alpha', 'beta', 'gamma', 'loss', 'cellID', 'clusters', 'embedding1', 'embedding2']
    kinetic_para: `str`
        Choose which parameter is used to calculate embedding space, which could be selected from {'alpha', 'beta', 'gamma', 'alpha_beta_gamma'}.
    umap_n: optional, `int` (default: 25)
        The size of the local neighborhood (in terms of the number of neighboring sample points) used for manifold approximation in UMAP.

    Returns
    -------
    cellDancer_df: `pandas.DataFrame`
        The updated cellDancer_df with two additional columns,
        `<kinetic_para>_umap1` and `<kinetic_para>_umap2`. Existing columns
        with these names are replaced.

    Raises
    ------
    ValueError
        If `kinetic_para` is not one of the supported names.

    """
    # Validate before the (slow) umap import so that a bad argument fails
    # immediately with a clear error instead of an unbound `para_df` later.
    if kinetic_para not in ('alpha', 'beta', 'gamma', 'alpha_beta_gamma'):
        raise ValueError('kinetic_para should be set in one of alpha, beta, gamma, or alpha_beta_gamma.')

    import umap

    umap_columns = [(kinetic_para+'_umap1'), (kinetic_para+'_umap2')]
    if set(umap_columns).issubset(cellDancer_df.columns):
        cellDancer_df = cellDancer_df.drop(columns=umap_columns)

    # One row per cell, one column per gene (and per parameter).
    if kinetic_para == 'alpha_beta_gamma':
        para_df = pd.concat(
            [cellDancer_df.pivot(index='cellIndex', columns='gene_name', values=para)
             for para in ('alpha', 'beta', 'gamma')],
            axis=1)
    else:
        para_df = cellDancer_df.pivot(index='cellIndex', columns='gene_name', values=kinetic_para)

    def get_umap(df, n_neighbors=umap_n, min_dist=0.1, n_components=2, metric='euclidean'):
        fit = umap.UMAP(
            n_neighbors=n_neighbors,
            min_dist=min_dist,
            n_components=n_components,
            metric=metric
        )
        return fit.fit_transform(df)

    umap_para = get_umap(para_df)
    umap_info = pd.DataFrame(umap_para, columns=umap_columns)

    # The per-cell coordinates are repeated once per gene and attached by
    # position.
    # NOTE(review): this assumes the rows of cellDancer_df are grouped by
    # gene, with the cells of every gene in ascending cellIndex order
    # (the row order of the pivot) -- confirm against the callers.
    gene_amt = len(cellDancer_df.gene_name.drop_duplicates())
    umap_col = pd.concat([umap_info]*gene_amt)
    umap_col.index = cellDancer_df.index
    cellDancer_df = pd.concat([cellDancer_df, umap_col], axis=1)
    return cellDancer_df
60 |
--------------------------------------------------------------------------------
/src/celldancer/model/branch.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuangyuWangLab2021/cellDancer/fed4c0db1bf7a7314000128b0311c37301fca1d9/src/celldancer/model/branch.pt
--------------------------------------------------------------------------------
/src/celldancer/model/circle.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuangyuWangLab2021/cellDancer/fed4c0db1bf7a7314000128b0311c37301fca1d9/src/celldancer/model/circle.pt
--------------------------------------------------------------------------------
/src/celldancer/plotting/.Rapp.history:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuangyuWangLab2021/cellDancer/fed4c0db1bf7a7314000128b0311c37301fca1d9/src/celldancer/plotting/.Rapp.history
--------------------------------------------------------------------------------
/src/celldancer/plotting/__init__.py:
--------------------------------------------------------------------------------
1 | from .cell import scatter_cell
2 | from .cell import plot_kinetic_para
3 | from .graph import PTO_Graph
4 | from .gene import scatter_gene
5 | from .colormap import build_colormap
6 |
7 |
# Names exported by `from celldancer.plotting import *`.
# 'colormap' is the submodule itself: it is bound as an attribute of this
# package as a side effect of `from .colormap import build_colormap`.
__all__=[
    'scatter_cell',
    'build_colormap',
    'scatter_gene',
    'PTO_Graph',
    'plot_kinetic_para',
    'colormap'
]
16 |
17 |
18 |
19 |
--------------------------------------------------------------------------------
/src/celldancer/plotting/cell.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import matplotlib.pyplot as plt
4 | from matplotlib.lines import Line2D
5 | from matplotlib.colors import ListedColormap, LinearSegmentedColormap
6 | from mpl_toolkits.axes_grid1.axes_divider import make_axes_locatable
7 | from scipy.stats import norm as normal
8 | import bezier
9 | import numpy as np
10 | import pandas as pd
11 | from .colormap import *
12 |
13 | if __name__ == "__main__":
14 | sys.path.append('..')
15 | from utilities import find_nn_neighbors, extract_from_df
16 | else:
17 | from celldancer.utilities import find_nn_neighbors, extract_from_df
18 |
def scatter_cell(
    ax,
    cellDancer_df,
    colors=None,
    custom_xlim=None,
    custom_ylim=None,
    vmin=None,
    vmax=None,
    alpha=0.5,
    s = 5,
    legend_marker_size=5,
    gene=None,
    velocity=False,
    legend='off',
    colorbar='on',
    min_mass=2,
    arrow_grid=(30,30)
):

    """Plot the RNA velocity on the embedding space; or plot the kinetic parameters ('alpha', 'beta', 'gamma', 'splice', 'unsplice', or 'pseudotime') of one gene on the embedding space.

    Arguments
    ---------
    ax: `ax`
        ax of plt.subplots()
    cellDancer_df: `pandas.DataFrame`
        Dataframe of velocity estimation, cell velocity, and pseudotime results. Columns=['cellIndex', 'gene_name', 'unsplice', 'splice', 'unsplice_predict', 'splice_predict', 'alpha', 'beta', 'gamma', 'loss', 'cellID', 'clusters', 'embedding1', 'embedding2', 'velocity1', 'velocity2', 'pseudotime']
    colors: `list`, `dict`, `str`, or None
        When the input is a list (or tuple): build a colormap dictionary for a list of cell type;
        When the input is a dictionary: it is the customized color map dictionary of each cell type;
        When the input is a str: one of {'alpha', 'beta', 'gamma', 'splice', 'unsplice', 'pseudotime'} is used as input;
        When the input is None: all cells are drawn in grey.
    custom_xlim: optional, pair of `float` (default: None)
        Set the x limit of the current axes.
    custom_ylim: optional, pair of `float` (default: None)
        Set the y limit of the current axes.
    vmin: optional, `float` (default: None)
        Set the minimum color limit of the current image.
    vmax: optional, `float` (default: None)
        Set the maximum color limit of the current image.
    alpha: optional, `float` (default: 0.5)
        The alpha blending value, between 0 (transparent) and 1 (opaque).
    s: optional, `float` (default: 5)
        The marker size.
    legend_marker_size: optional, `float` (default: 5)
        The legend marker size.
    gene: optional, `str` (default: None)
        Gene name for plotting. Required when `colors` is one of 'alpha', 'beta', 'gamma', 'splice', 'unsplice'.
    velocity: optional, `bool` (default: False)
        `True` if plot velocity.
    legend: optional, `str` (default: 'off')
        `'off'` if the color map of cell legend is not plotted.
        `'only'` if only plot the cell type legend.
        Only used when `colors` is a list or a dictionary.
    colorbar: optional, `str` (default: 'on')
        `'on'` if the colorbar of the plot of `alpha`, `beta`, `gamma`, `splice`, or `unsplice` is to be shown. `'off'` if the colorbar is to be not shown.
    min_mass: optional, `float` (default: 2)
        Filter by using the isotropic gaussian kernel to display the arrow on grids. The lower the min_mass, the more arrows.
    arrow_grid: optional, `tuple` (default: (30,30))
        The sparsity of the grids of velocity arrows. The larger, the more compact, and more arrows will be shown.

    Returns
    -------
    ax: matplotlib.axes.Axes
        (the legend object is returned instead when `legend == 'only'` and
        `colors` is a list or a dictionary)

    Raises
    ------
    TypeError
        If `colors` is not a list, tuple, dict, str, or None.
    ValueError
        If `colors` is a str that is not one of the supported names.
    """

    def gen_Line2D(label, markerfacecolor):
        return Line2D([0], [0], color='w', marker='o', label=label,
            markerfacecolor=markerfacecolor,
            markeredgewidth=0,
            markersize=legend_marker_size)

    if isinstance(colors, (list, tuple)):
        # build a colormap for a list of clusters as input
        colors = build_colormap(colors)

    if isinstance(colors, dict):
        legend_elements = [gen_Line2D(i, colors[i]) for i in colors]
        if legend != 'off':
            lgd = ax.legend(handles=legend_elements,
                bbox_to_anchor=(1.01, 1),
                loc='upper left')
            if legend == 'only':
                return lgd

        c = np.vectorize(colors.get)(extract_from_df(cellDancer_df, 'clusters', gene))
        cmap = ListedColormap(list(colors.values()))
    elif isinstance(colors, str):
        if colors in ['alpha', 'beta', 'gamma']:
            assert gene, '\nError! gene is required!\n'
            cmap = LinearSegmentedColormap.from_list("mycmap", colors_alpha_beta_gamma)
        elif colors in ['splice', 'unsplice']:
            assert gene, '\nError! gene is required!\n'
            cmap = LinearSegmentedColormap.from_list("mycmap",
                    colors_splice_unsplice)
        elif colors in ['pseudotime']:
            cmap = 'viridis'
        else:
            # Fail here rather than with an unbound `cmap` further down.
            raise ValueError(
                "colors should be one of 'alpha', 'beta', 'gamma', 'splice', "
                "'unsplice', or 'pseudotime' when it is a str.")
        c = extract_from_df(cellDancer_df, [colors], gene)
    elif colors is None:
        cmap = None
        c = 'Grey'
    else:
        # Fail here rather than with an unbound `c` further down.
        raise TypeError('colors should be a list, a dict, a str, or None.')

    embedding = extract_from_df(cellDancer_df, ['embedding1', 'embedding2'], gene)
    n_cells = embedding.shape[0]

    im = ax.scatter(embedding[:, 0],
                    embedding[:, 1],
                    c=c,
                    cmap=cmap,
                    s=s,
                    vmin=vmin,
                    vmax=vmax,
                    alpha=alpha,
                    edgecolor="none")
    if colorbar == 'on' and isinstance(colors, str):
        # thin horizontal colorbar overlapping the top edge, without ticks
        ax_divider = make_axes_locatable(ax)
        cax = ax_divider.append_axes("top", size="5%", pad="-5%")

        cbar = plt.colorbar(im, cax=cax, orientation="horizontal", shrink=0.1)
        cbar.set_ticks([])

    if velocity:
        # Rows among the first n_cells whose velocity1 is not NaN.
        sample_cells = cellDancer_df['velocity1'][:n_cells].dropna().index
        embedding_ds = embedding[sample_cells]
        velocity_embedding = extract_from_df(cellDancer_df, ['velocity1', 'velocity2'], gene)
        grid_curve(ax, embedding_ds, velocity_embedding, arrow_grid, min_mass)

    if custom_xlim is not None:
        ax.set_xlim(custom_xlim[0], custom_xlim[1])
    if custom_ylim is not None:
        ax.set_ylim(custom_ylim[0], custom_ylim[1])

    return ax
158 |
def grid_curve(
    ax,
    embedding_ds,
    velocity_embedding,
    arrow_grid,
    min_mass
):
    """Draw curved velocity arrows on a regular grid laid over the embedding.

    For every grid point a Gaussian-kernel weighted average of the cell
    velocities is computed two steps forward (head) and two steps backward
    (tail). The five resulting points are the control points of a degree-4
    Bezier curve, which is drawn on `ax` with an arrow head at its end.

    Arguments
    ---------
    ax: matplotlib axes to draw on
    embedding_ds: numpy ndarray, embedding coordinates of the cells that
        have a velocity
    velocity_embedding: numpy ndarray, velocity of those cells
    arrow_grid: number of grid points per embedding dimension
    min_mass: grid points whose kernel mass is below this are not drawn
    """

    def calculate_two_end_grid(embedding_ds, velocity_embedding, smooth=None, steps=None, min_mass=None):
        # One evenly spaced axis per embedding dimension.
        grs = []
        for dim_i in range(embedding_ds.shape[1]):
            column = embedding_ds[:, dim_i]
            m = np.min(column)-0.2
            M = np.max(column)-0.2
            m = m - 0.025 * np.abs(M - m)
            M = M + 0.025 * np.abs(M - m)
            grs.append(np.linspace(m, M, steps[dim_i]))

        gridpoints_coordinates = np.vstack(
            [axis_mesh.flat for axis_mesh in np.meshgrid(*grs)]).T

        n_neighbors = int(velocity_embedding.shape[0]/3)
        # kernel width is expressed in units of the mean grid spacing
        std = np.mean([(g[1] - g[0]) for g in grs])

        def smoothed_velocity(data_points, query_points):
            # find_nn_neighbors presumably returns, for each query point,
            # the distances to and indices of its nearest data points.
            dists, neighs = find_nn_neighbors(
                data_points, query_points, n_neighbors)
            # isotropic gaussian kernel
            weights = normal.pdf(loc=0, scale=smooth * std, x=dists)
            mass = weights.sum(1)
            # weighed average
            averaged = (velocity_embedding[neighs] * weights[:, :, None]).sum(
                1) / np.maximum(1, mass)[:, None]
            return averaged, mass

        XY = gridpoints_coordinates

        # first step forward / backward from the grid points
        UZ_head, total_p_mass_head = smoothed_velocity(embedding_ds, XY)
        UZ_tail, _ = smoothed_velocity(embedding_ds+velocity_embedding, XY)

        # second step, evaluated at the displaced grid points
        UZ_head2, _ = smoothed_velocity(embedding_ds, XY+UZ_head)
        UZ_tail2, _ = smoothed_velocity(embedding_ds, XY-UZ_tail)

        mass_filter = total_p_mass_head < min_mass

        # filter dots
        keep = ~mass_filter
        return(XY[keep, :], UZ_head[keep, :], UZ_tail[keep, :],
               UZ_head2[keep, :], UZ_tail2[keep, :], mass_filter, grs)

    XY_filtered, UZ_head_filtered, UZ_tail_filtered, UZ_head2_filtered, UZ_tail2_filtered, mass_filter, grs = calculate_two_end_grid(
        embedding_ds, velocity_embedding, smooth=0.8, steps=arrow_grid, min_mass=min_mass)

    # connect two end grid to curve
    n_curves = XY_filtered.shape[0]
    s_vals = np.linspace(0.0, 1.5, 15) # TODO check last

    def control_nodes(i, XYM, UVT, UVH, UVT2, UVH2):
        # Five control points of curve i: two tail points, the grid point,
        # two head points (row 0: x, row 1: y).
        rows = []
        for axis in (0, 1):
            centre = XYM[i, axis]
            rows.append([centre-UVT[i, axis]-UVT2[i, axis],
                         centre-UVT[i, axis],
                         centre,
                         centre+UVH[i, axis],
                         centre+UVH[i, axis]+UVH2[i, axis]])
        return np.asfortranarray(rows)

    def norm_arrow_display_ratio(XYM, UVT, UVH, UVT2, UVH2, grs, s_vals):
        '''get the longest curve length among all curves,
        and normalize by using the distance between two grids'''

        max_discance = 0
        for i in range(n_curves):
            curve = bezier.Curve(control_nodes(i, XYM, UVT, UVH, UVT2, UVH2), degree=4)
            curve_dots = curve.evaluate_multi(s_vals)
            xs, ys = curve_dots[0], curve_dots[1]
            # lengths of the segments between consecutive sampled points
            calculate_square = np.subtract(
                xs[0:-1], xs[1:])**2 + np.subtract(ys[0:-1], ys[1:])**2
            distance_sum = np.sum((calculate_square)**0.5)
            max_discance = max(max_discance, distance_sum)

        distance_grid = (
            abs(grs[0][0]-grs[0][1]) + abs(grs[1][0]-grs[1][1]))/2
        return(distance_grid/max_discance)

    # get longest distance len and norm ratio
    norm_ratio = norm_arrow_display_ratio(
        XY_filtered, UZ_tail_filtered, UZ_head_filtered,
        UZ_tail2_filtered, UZ_head2_filtered, grs, s_vals)

    # plot the curve arrow for cell velocity
    XYM = XY_filtered
    UVT = UZ_tail_filtered * norm_ratio
    UVH = UZ_head_filtered * norm_ratio
    UVT2 = UZ_tail2_filtered * norm_ratio
    UVH2 = UZ_head2_filtered * norm_ratio

    def plot_cell_velocity_curve(XYM, UVT, UVH, UVT2, UVH2, s_vals):
        # TO DO: add 'colorful cell velocity' to here, now there is only curve arrows
        for i in range(n_curves):
            curve = bezier.Curve(control_nodes(i, XYM, UVT, UVH, UVT2, UVH2), degree=4)
            curve_dots = curve.evaluate_multi(s_vals)
            ax.plot(curve_dots[0], curve_dots[1],
                    linewidth=0.5, color='black', alpha=1)

            # normalize the arrow of the last two points at the tail, to let all arrows has the same size in quiver
            U = curve_dots[0][-1]-curve_dots[0][-2]
            V = curve_dots[1][-1]-curve_dots[1][-2]
            N = np.sqrt(U**2 + V**2)
            U1, V1 = U/N*0.5, V/N*0.5  # 0.5 is to let the arrow have a suitable size
            ax.quiver(curve_dots[0][-2], curve_dots[1][-2], U1, V1, units='xy', angles='xy',
                      scale=1, linewidth=0, color='black', alpha=1, minlength=0, width=0.1)

    plot_cell_velocity_curve(XYM, UVT, UVH, UVT2, UVH2, s_vals)
300 |
301 |
def plot_kinetic_para(
    ax,
    kinetic_para,
    cellDancer_df,
    color_map=None,
    title=None,
    legend=False
):

    """Plot the UMAP calculated by the kinetic parameter(s).

    Arguments
    ---------
    ax: `ax`
        ax of plt.subplots()
    kinetic_para: `str`
        The parameter used to generate the embedding space based on UMAP, could be selected from {'alpha', 'beta', 'gamma', 'alpha_beta_gamma'}.
    cellDancer_df: `pandas.DataFrame`
        Dataframe of velocity estimation results. Columns=['cellIndex', 'gene_name', 'splice', 'unsplice', 'splice_predict', 'unsplice_predict', 'alpha', 'beta', 'gamma', 'loss', 'cellID', 'clusters', 'embedding1', 'embedding2'], plus the columns `<kinetic_para>_umap1` and `<kinetic_para>_umap2`.
    color_map: `dict` (optional, default: None)
        The color map dictionary of each cell type. Built from the clusters when omitted; clusters missing from the dictionary are drawn in black.
    title: `str` (optional, default: None)
        Title of the plot. Defaults to 'UMAP of <kinetic_para>'.
    legend: `bool` (optional, default: False)
        `True` if the color map of cell legend is to be plotted.

    Returns
    -------
    ax: matplotlib.axes.Axes
    """
    # The UMAP coordinates are repeated for every gene, so the rows of a
    # single gene are enough. `.iloc[0]` picks the first row by position,
    # independent of the index labels of the dataframe.
    first_gene = cellDancer_df.gene_name.iloc[0]
    onegene = cellDancer_df[cellDancer_df.gene_name == first_gene]
    umap_para = onegene[[(kinetic_para+'_umap1'), (kinetic_para+'_umap2')]].to_numpy()
    onegene_cluster_info = onegene.clusters

    if color_map is None:
        from .colormap import build_colormap
        color_map = build_colormap(onegene_cluster_info)

    colors = [color_map.get(cluster, 'black') for cluster in onegene_cluster_info]

    if legend:
        markers = [plt.Line2D([0,0],[0,0],color=color, marker='o', linestyle='') for color in color_map.values()]
        # Attach the legend to `ax` itself; plt.legend would use whichever
        # axes happens to be current.
        ax.legend(markers, color_map.keys(), numpoints=1, loc='upper left', bbox_to_anchor=(1.01, 1))

    ax.scatter(umap_para[:,0], umap_para[:,1], c=colors, s=15, alpha=0.5, edgecolor="none")
    ax.axis('square')
    ax.axis('off')
    ax.set_title('UMAP of '+ kinetic_para if title is None else title)

    return ax
--------------------------------------------------------------------------------
/src/celldancer/plotting/colormap.py:
--------------------------------------------------------------------------------
# Ordered hex palettes; scatter_gene turns them into matplotlib colormaps
# when cells are coloured by alpha/beta/gamma or by splice/unsplice.
colors_alpha_beta_gamma = [
    "#007EB7",
    "#3B9AB2",
    "#78B7C5",
    "#EBCC2A",
    "#E1AF00",
    "#F21A00",
]
colors_splice_unsplice = [
    "#2488F0",
    "#7F3F98",
    "#E22929",
    "#FCB31A",
]

# Each mapping below goes from a cluster label to the colour used for it.
colormap_erythroid = {
    'Haematoendothelial progenitors': '#3361A5',
    'Blood progenitors 1': '#248AF3',
    'Blood progenitors 2': '#14B3FF',
    'Erythroid1': '#88CEEF',
    'Erythroid2': '#FDB31A',
    'Erythroid3': '#E42A2A',
}

# Several labels intentionally share one colour (e.g. the granule labels).
colormap_neuro = {
    'CA': "#ed0345",
    'CA1-Sub': "#710162",
    'CA2-3-4': "#a12a5e",
    'Granule': "#ef6a32",
    'ImmGranule1': "#ef6a32",
    'ImmGranule2': "#ef6a32",
    'Nbl1': "#fbbf45",
    'Nbl2': "#fbbf45",
    'nIPC': "#aad962",
    'RadialGlia': "#03c383",
    'RadialGlia2': "#03c383",
    'GlialProg': '#56A65A',
    'OPC': "#017351",
    'ImmAstro': "#08A8CE",
}

colormap_pancreas = {
    'Ductal': '#3361A5',
    'Ngn3 low EP': '#248AF3',
    'Ngn3 high EP': '#14B3FF',
    'Pre-endocrine': '#88CEEF',
    'Alpha': '#ff4800',
    'Beta': "#B81136",
    'Delta': 'green',
    'Epsilon': '#03B3B0',
}

# Integer cluster ids 0..6 mapped, in order, onto a seven-step colour ramp.
colormap_hgForebrainGlut = dict(enumerate([
    '#9408F7',
    '#C729D6',
    '#FA4AB5',
    '#FF6A95',
    '#FF8B74',
    '#FFAC53',
    '#FFCD32',
]))
51 |
# Cluster-name -> colour map (name-keyed counterpart of colormap_hgForebrainGlut).
# The literal used to list 'Radial Glia', 'Neuroblast' and 'Immature Neuron'
# twice each; Python keeps only the last value for a repeated key, so the
# first colour of every pair was silently dropped.  The mapping below is the
# dictionary that was actually being built, written without the dead entries.
# NOTE(review): if the two shades per stage were meant to be distinct
# sub-clusters, the labels need to differ (e.g. 'Radial Glia 1' / '... 2').
colormap_hgforebrainglut = {
    'Radial Glia': '#C729D6',
    'Neuroblast': '#FF6A95',
    'Immature Neuron': '#FFAC53',
    'Neuron': '#FFCD32',
}
61 |
# Default palette that build_colormap hands out to cluster labels.
color_template = ["#08A8CE","#017351",'#56A65A',"#03c383","#aad962","#fbbf45","#ef6a32","#ed0345","#a12a5e","#710162","#3B9AB2"]

def build_colormap(cluster_list):
    """Build a ``{cluster label: colour}`` dictionary from ``color_template``.

    Arguments
    ---------
    cluster_list: iterable of hashable labels
        Cluster labels. Repeated labels are allowed (e.g. a per-cell
        ``clusters`` column); only the distinct labels, in order of first
        appearance, take part in the colour assignment.

    Returns
    -------
    colors: `dict`
        One entry per distinct label. An empty input gives an empty dict.
    """
    from itertools import cycle

    # Collapse repeats first.  Previously a per-cell label column was zipped
    # directly against the palette, so the colour of a cluster depended on
    # the row position of its last cell and two clusters could end up sharing
    # a colour while other palette entries stayed unused.
    labels = list(dict.fromkeys(cluster_list))

    if len(labels) > len(color_template):
        # More labels than colours: reuse the palette from the start.
        return dict(zip(labels, cycle(color_template)))

    # No more labels than colours: the labels are cycled across the whole
    # palette and each label keeps the last colour it is paired with.  This
    # reproduces the assignment the function has always produced for a list
    # of unique labels, and every label still gets a distinct colour.
    return dict(zip(cycle(labels), color_template))
69 |
70 |
--------------------------------------------------------------------------------
/src/celldancer/plotting/gene.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import os
3 | import sys
4 | import pandas as pd
5 | import numpy as np
6 | from matplotlib.lines import Line2D
7 | from matplotlib.colors import ListedColormap
8 | from .colormap import *
9 | from ..sampling import sampling_neighbors
10 | from ..utilities import extract_from_df
11 |
def scatter_gene(
    ax=None,
    x=None,
    y=None,
    cellDancer_df=None,
    colors=None,
    custom_xlim=None,
    custom_ylim=None,
    vmin=None,
    vmax=None,
    alpha=0.5,
    s = 5,
    velocity=False,
    gene=None,
    legend='off',
    arrow_grid = (15,15)):

    """Plot the velocity (splice-unsplice) of a gene, or plot the parameter ('alpha', 'beta', 'gamma', 'splice', 'unsplice') in pseudotime, or customize the parameters in x-axis and y-axis of a gene.

    Arguments
    ---------
    ax: `ax of plt.subplots()`
        ax to add subplot. When `None`, the current axes (`plt.gca()`) are used.
    x: `str`
        Set x axis as one of {'splice', 'unsplice', 'alpha', 'beta', 'gamma', 'pseudotime'}.
    y: `str`
        Set y axis as one of {'splice', 'unsplice', 'alpha', 'beta', 'gamma', 'pseudotime'}.
    cellDancer_df: `pandas.DataFrame`
        Dataframe of velocity estimation, cell velocity, and pseudotime results. Columns=['cellIndex', 'gene_name', 'unsplice', 'splice', 'unsplice_predict', 'splice_predict', 'alpha', 'beta', 'gamma', 'loss', 'cellID', 'clusters', 'embedding1', 'embedding2', 'velocity1', 'velocity2', 'pseudotime']
    colors: `list`, `dict`, or `str`
        When the input is a list: build a colormap dictionary for a list of cell type;
        When the input is a dictionary: the customized color map dictionary of each cell type;
        When the input is a str: the column of that name is used as value of color.
        'alpha', 'beta', 'gamma' use the `colors_alpha_beta_gamma` palette,
        'splice', 'unsplice' use the `colors_splice_unsplice` palette, and any
        other column (e.g. 'pseudotime') uses 'viridis';
        When `None`: every point is drawn in a single light-blue color.
    custom_xlim: optional, `float` (default: None)
        Set the x limit of the current axes.
    custom_ylim: optional, `float` (default: None)
        Set the y limit of the current axes.
    vmin: optional, `float` (default: None)
        Set the minimum color limit of the current image.
    vmax: optional, `float` (default: None)
        Set the maximum color limit of the current image.
    alpha: optional, `float` (default: 0.5)
        The alpha blending value, between 0 (transparent) and 1 (opaque).
    s: optional, `float` (default: 5)
        The marker size.
    velocity: optional, `bool` (default: False)
        `True` if velocity in gene level is to be plotted. Requires (x, y) to
        be ('splice', 'unsplice') or ('unsplice', 'splice').
    gene: optional, `str` (default: None)
        Gene selected to be plotted.
    legend: optional, `str` (default: 'off')
        `'off'` if the color map of cell type legend is not to be plotted;
        `'only'` if only plot the cell type legend (the legend object is
        returned instead of the axes);
        any other value draws the legend next to the scatter plot.
        Only used when `colors` is a list or a dictionary.
    arrow_grid: optional, `tuple` (default: (15,15))
        The sparsity of the grids of velocity arrows. The larger, the more compact and more arrows will be shown.

    Returns
    -------
    ax: matplotlib.axes.Axes
        (or the `matplotlib.legend.Legend` when `legend='only'`).

    Raises
    ------
    AssertionError
        If `gene` is missing, or `velocity` is requested for unsupported axes.
    TypeError
        If `colors` is not a list, dict, str or `None`.
    """

    def gen_Line2D(label, markerfacecolor):
        # Proxy artist used as a legend entry for one cluster colour.
        return Line2D([0], [0], color='w', marker='o', label=label,
            markerfacecolor=markerfacecolor,
            markeredgewidth=0,
            markersize=s)

    if ax is None:
        # The signature advertises ax=None; fall back to the current axes
        # instead of failing later with an AttributeError.
        ax = plt.gca()

    if isinstance(colors, list):
        colors = build_colormap(colors)

    if isinstance(colors, dict):
        if legend != 'off':
            legend_elements = [gen_Line2D(label, colors[label]) for label in colors]
            lgd = ax.legend(handles=legend_elements,
                bbox_to_anchor=(1.01, 1),
                loc='upper left')
            if legend == 'only':
                return lgd

        c = np.vectorize(colors.get)(extract_from_df(cellDancer_df, 'clusters'))
        cmap = ListedColormap(list(colors.values()))

    elif isinstance(colors, str):
        # These branches must be mutually exclusive: a trailing ``else`` that
        # hung off a separate ``if`` used to reset cmap to 'viridis' and so
        # discarded the two dedicated palettes.
        if colors in ['alpha', 'beta', 'gamma']:
            assert gene, '\nError! gene is required!\n'
            cmap = ListedColormap(colors_alpha_beta_gamma)
        elif colors in ['splice', 'unsplice']:
            assert gene, '\nError! gene is required!\n'
            cmap = ListedColormap(colors_splice_unsplice)
        else:
            cmap = 'viridis'

        c = extract_from_df(cellDancer_df, [colors], gene)

    elif colors is None:
        cmap = None
        c = '#95D9EF'

    else:
        raise TypeError(
            'colors must be a list, a dict, a str or None; got '
            + type(colors).__name__)

    assert gene, '\nError! gene is required!\n'
    xy = extract_from_df(cellDancer_df, [x, y], gene)
    ax.scatter(xy[:, 0],
               xy[:, 1],
               c=c,
               cmap=cmap,
               s=s,
               alpha=alpha,
               vmin=vmin,
               vmax=vmax,
               edgecolor="none")

    if custom_xlim is not None:
        ax.set_xlim(custom_xlim[0], custom_xlim[1])
    if custom_ylim is not None:
        ax.set_ylim(custom_ylim[0], custom_ylim[1])

    if velocity:
        assert (x,y) in [('unsplice', 'splice'), ('splice', 'unsplice')]
        # Columns: 0=unsplice, 1=splice, 2=unsplice_predict, 3=splice_predict.
        u_s = extract_from_df(cellDancer_df, ['unsplice','splice','unsplice_predict','splice_predict'], gene)
        sampling_idx = sampling_neighbors(u_s[:,0:2], step=arrow_grid, percentile=15) # Sampling
        u_s_downsample = u_s[sampling_idx,0:4]

        # Follow the requested axis order; the arrows used to be drawn with
        # splice on x even when the scatter had unsplice on x.
        if x == 'unsplice':
            col_x, col_y = 0, 1
        else:
            col_x, col_y = 1, 0

        # Draw on the supplied axes, not on whatever pyplot considers current.
        ax.scatter(u_s_downsample[:, col_x], u_s_downsample[:, col_y],
                   color="none", s=s, edgecolor="k")
        ax.quiver(u_s_downsample[:, col_x], u_s_downsample[:, col_y],
                  u_s_downsample[:, col_x + 2]-u_s_downsample[:, col_x],
                  u_s_downsample[:, col_y + 2]-u_s_downsample[:, col_y],
                  angles='xy', clim=(0., 1.))

    return ax
147 |
148 |
--------------------------------------------------------------------------------
/src/celldancer/plotting/graph.py:
--------------------------------------------------------------------------------
1 | import os
2 | import networkx as nx
3 | import pandas as pd
4 | import numpy as np
5 | from datashader.layout import forceatlas2_layout
6 | from datashader.bundling import hammer_bundle, connect_edges
7 | import matplotlib.pyplot as plt
8 | from matplotlib.colors import ListedColormap
9 | from matplotlib.lines import Line2D
10 | from mpl_toolkits.axes_grid1.axes_divider import make_axes_locatable
11 |
12 | from .colormap import *
13 | if __name__ == "__main__":# developer test
14 | sys.path.append('..')
15 | from utilities import extract_from_df
16 | else:
17 | from celldancer.utilities import extract_from_df
18 |
def PTO_Graph(
    ax,
    cellDancer_df,
    node_layout='forceatlas2',
    PRNG_SEED=None,
    force_iters=2000,
    use_edge_bundling=True,
    node_colors=None,
    node_sizes=5,
    edge_length=None,
    legend='off',
    colorbar='on'):

    """
    Graph visualization of selected cells reflecting their orders in
    pseudotime (PseudoTimeOrdered_Graph: PTO_Graph). Embedding and pseudotime
    of the cells are required. Each cell makes a node and the connections between
    nodes are based on their separation in the embedding space and the strength
    of the connection is proportional to the pseudotime difference (the larger
    the pseudotime difference in absolute values, the weaker the connection).

    Example usage:

    .. code-block:: python

        from celldancer.plotting import graph
        from matplotlib import pyplot as plt
        fig, ax = plt.subplots(figsize=(10,10))
        graph.PTO_Graph(ax,
            load_cellDancer,
            node_layout='forcedirected',
            use_edge_bundling=True,
            node_colors='clusters',
            edge_length=3,
            node_sizes='pseudotime',
            colorbar='on',
            legend='on')

    In this example, we use a force-directed node layout algorithm (ForceAtlas2).
    A connection is made between any two cells within 3 (unit in the embedding).
    The resulted edge lengths indicate the time difference between nodes (the
    closer in pseudotime, the shorter the edge length). Edge bundling is applied
    to highlight important edges (trunks). The sizes of the nodes are
    proportional to the pseudotime. The nodes are colored according to their
    cell types (if given by the input data).

    Arguments
    ---------
    ax: matplotlib.axes.Axes
        Axes to draw the graph on.

    cellDancer_df: `pandas.DataFrame`
        Dataframe of velocity estimation, cell velocity, and pseudotime results.
        Columns=['cellIndex', 'gene_name',
        'unsplice', 'splice',
        'unsplice_predict', 'splice_predict',
        'alpha', 'beta', 'gamma',
        'loss', 'cellID', 'clusters', 'embedding1', 'embedding2',
        'velocity1', 'velocity2', 'pseudotime']

    node_layout: optional, `str` (default: forceatlas2)
        Layout for the graph. Currently only supports the forceatlas2 and
        embedding.

        - `'forceatlas2'` or `'forcedirected'`: treat connections as forces
          between connected nodes.

        - `'embedding'`: use the embedding as positions of the nodes.

    PRNG_SEED: optional, `int`, or `None` (default: `None`)
        Seed to initialize the pseudo-random number generator.

    force_iters: optional, `int` (default: 2000)
        Number of passes for the force-directed layout calculation.

    use_edge_bundling: optional, `bool` (default: `True`)
        `True` if bundle the edges (computational demanding).
        Edge bundling allows edges to curve and groups nearby ones together
        for better visualization of the graph structure.

    node_colors: optional, `str`, `dict` or `None` (default: `None`)
        The node fill colors.
        Possible values:

        - *clusters*: color according to the clusters information of the
          respective cells.

        - *pseudotime*: colors according to the pseudotime of the
          respective cells.

        - A single color format string.

        - A dictionary mapping each cluster label to a color.

        - `None`: all nodes are grey.

    edge_length: optional, `float` (default: `None`)
        The distance cutoff in the embedding between two nodes to determine
        whether an edge should be formed (edge is formed when r < *edge_length*).
        The value is handed unchanged to `create_nodes_edges` as the neighbor
        radius. NOTE(review): the original description of the `None` default
        was left unfinished; confirm what `None` means to the neighbor search.

    node_sizes: optional, `float` or `numeric list-like` or `str` (default: 5)
        The sizes of the nodes. If it is `str`, then the `str` has to be either one of those
        {`pseudotime`, `index`, `x`, `y`} read from the `nodes` dataframe;
        the column values are multiplied by 200.

    legend: optional, `str` (default: 'off')
        - `'off'`/`'on'`: Exclude/include the cell type legend on the plot.
        - `'only'`: Neglect the plot and only show the cell type legend
          (the legend object is returned).

    colorbar: optional, `str` (default: 'on')
        - `'off'`/`'on'`: Show the colorbar in the case nodes are colored by `pseudotime`.


    Returns
    -------
    ax: matplotlib.axes.Axes

    """

    nodes, edges = create_nodes_edges(cellDancer_df, edge_length)

    if node_layout in ['forceatlas2', 'forcedirected']:
        # Current version of datashader.layout does not support reading a layout (x,y) and perform layout function
        # It does not support other attributes except index.
        forcedirected = forceatlas2_layout(nodes[['index']], edges,
            weight='weight', iterations=force_iters, k=0.1, seed=PRNG_SEED)
        nodes['x'] = forcedirected['x']
        nodes['y'] = forcedirected['y']

    if use_edge_bundling:
        bundle = hammer_bundle(nodes, edges)
    else:
        bundle = connect_edges(nodes, edges)

    # For plotting settings
    def gen_Line2D(label, markerfacecolor, markersize):
        # Proxy artist used as a legend entry for one cluster colour.
        return Line2D([0], [0], color='w',
            marker='o',
            label=label,
            markerfacecolor=markerfacecolor,
            markeredgewidth=0,
            markersize=markersize)

    # Numbers and lists are passed to scatter as they are; a string names a
    # column of the nodes table.
    if isinstance(node_sizes, str):
        node_sizes = nodes[node_sizes].to_numpy(dtype=float)*200

    # cmap is only meaningful for numeric colours; it must still be defined
    # for every other mode (it used to be unbound for node_colors=None, and
    # also for cluster colouring when the legend is off).
    cmap = None
    color_by_pseudotime = False

    if isinstance(node_colors, str):
        if node_colors in ['clusters']:
            # Becomes a dict and is handled by the dict case below.
            node_colors = build_colormap(nodes[node_colors])
        elif node_colors in ['pseudotime']:
            cmap = 'viridis'
            c = nodes[node_colors].to_numpy(dtype=float)
            color_by_pseudotime = True
        else:
            # Documented "single color format string" case.
            c = node_colors

    if isinstance(node_colors, dict):
        if legend != 'off':
            legend_elements = [gen_Line2D(label, node_colors[label], 10)
                               for label in node_colors]
            lgd = ax.legend(handles=legend_elements,
                bbox_to_anchor=(1.01, 1),
                loc='upper left')
            if legend == 'only':
                return lgd

        c = nodes['clusters'].map(node_colors).to_list()
        cmap = ListedColormap(list(node_colors.values()))
    elif node_colors is None:
        c = ['Grey']*len(nodes)
    elif not isinstance(node_colors, str):
        raise TypeError(
            'node_colors must be a str, a dict or None; got '
            + type(node_colors).__name__)

    # The colour is given once, by keyword; the former extra 'y' format string
    # was overridden by color='blue' and only triggered a matplotlib warning.
    ax.plot(bundle.x, bundle.y, zorder=1, linewidth=0.3, color='blue', alpha=1)
    im = ax.scatter(nodes.x, nodes.y, c=c, cmap=cmap, s=node_sizes, zorder=2, edgecolors='k', alpha=0.5)

    # A colorbar only makes sense for the numeric pseudotime colouring.
    if colorbar == 'on' and color_by_pseudotime:
        ax_divider = make_axes_locatable(ax)
        cax = ax_divider.append_axes("top", size="5%", pad="-5%")
        cbar = plt.colorbar(im, cax=cax, orientation="horizontal", shrink=0.1)
        cbar.set_ticks([])
    ax.axis('off')

    return ax
203 |
204 |
205 |
def create_nodes_edges(data, radius):
    """Build the node and edge tables of the pseudotime graph.

    Arguments
    ---------
    data: `pandas.DataFrame`
        cellDancer result table; the columns 'embedding1', 'embedding2',
        'velocity1', 'clusters' and 'pseudotime' are read.
    radius: `float`
        Radius handed to `sklearn.neighbors.NearestNeighbors`; two cells are
        connected when they are neighbors within that radius in the embedding.

    Returns
    -------
    nodes: `pandas.DataFrame`
        Columns ['x', 'y', 'index', 'pseudotime', 'clusters'], one row per
        selected cell.
    edges: `pandas.DataFrame`
        Columns ['source', 'target', 'weight'], one row per graph edge.
    """
    def create_KNN_based_graph():
        from sklearn.neighbors import NearestNeighbors
        neigh = NearestNeighbors(radius = radius)
        neigh.fit(embedding_ds)
        nn_graph = neigh.radius_neighbors_graph(embedding_ds, mode='connectivity')
        nn_array = nn_graph.toarray()

        # nn_array is effectively the edge list
        # Keep track of cells of 0 timeshift.
        node_list = [(i, {'pseudotime': pseudotime_ds[i,0], 'clusters':clusters_ds[i]})
                     for i in range(len(embedding_ds))]

        # dtime[i, j] = pseudotime[j] - pseudotime[i] (broadcast difference).
        dtime = pseudotime_ds[:,0] - pseudotime_ds
        # Weight used for a pair with zero time difference: the largest
        # finite 1/|dt| that occurs among all pairs.
        INF = 1./np.min(np.abs(dtime[dtime != 0]))

        # upper triangle of the knn array (i<j and nn_array[i,j] != 0)
        # NOTE(review): the statements selecting the (a, b) pairs were lost
        # when this file was extracted (text after "(i<" was stripped); the
        # three lines below are reconstructed from the surviving comment and
        # from the loop body -- confirm against the upstream source.
        src, dst = np.where(np.triu(nn_array, k=1) != 0)

        edge_list = []
        for a, b, w in zip(src, dst, dtime[src, dst]):
            # The weight is 1/|pseudotime difference|; direction is ignored.
            if w>0:
                edge_list.append((a, b, 1/w))
            elif w<0:
                edge_list.append((a, b, -1/w))
            else:
                edge_list.append((a, b, INF))

        G = nx.Graph()
        G.add_nodes_from(node_list)
        G.add_weighted_edges_from(edge_list)
        return G

    embedding = extract_from_df(data, ['embedding1', 'embedding2'])
    n_cells = embedding.shape[0]
    # Keep the cells whose velocity1 is not NaN within the first n_cells rows;
    # the resulting index labels are used to index the arrays below.
    sample_cells = data['velocity1'][:n_cells].dropna().index
    clusters = extract_from_df(data, ['clusters'])
    pseudotime = extract_from_df(data, ['pseudotime'])

    embedding_ds = embedding[sample_cells]
    pseudotime_ds = pseudotime[sample_cells]
    clusters_ds = clusters[sample_cells]

    G = create_KNN_based_graph()

    index = np.array(range(len(embedding_ds)), dtype=int)[:,None]
    nodes = pd.DataFrame(np.hstack((embedding_ds, index, pseudotime_ds, clusters_ds)),
                         columns=['x','y','index','pseudotime','clusters'])

    edges = pd.DataFrame([(i[0], i[1], G.edges[i]['weight']) for i in G.edges],
                         columns=['source', 'target', 'weight'])
    return nodes, edges
263 |
--------------------------------------------------------------------------------
/src/celldancer/sampling.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from numpy.core.fromnumeric import size
4 | import scipy
5 | from sklearn.neighbors import NearestNeighbors
6 | import matplotlib.pyplot as plt
7 |
8 |
def sampling_neighbors(gene_unsplice_splice,step=(30,30),percentile=25):
    """Pick a spatially even subset of points with the help of a jittered grid.

    A regular grid is laid over the bounding box of the data, every grid
    point is jittered with Gaussian noise, and the data point nearest to each
    grid point is selected. Selected points lying in sparse regions (local
    density estimate at or below the given percentile) are then dropped.

    Arguments
    ---------
    gene_unsplice_splice: `numpy.ndarray`
        2-D array; the first two columns are the coordinates that are sampled.
    step: `tuple` (default: (30,30))
        Number of grid points along each dimension.
    percentile: `float` (default: 25)
        Percentile of the density estimate used as the cut-off.

    Returns
    -------
    ix_choice: `numpy.ndarray`
        Sorted, unique row indices of the selected points.
    """
    from scipy.stats import norm

    def gaussian_kernel(X, mu = 0, sigma=1):
        return np.exp(-(X - mu)**2 / (2*sigma**2)) / np.sqrt(2*np.pi*sigma**2)

    grs = []
    for dim_i in range(gene_unsplice_splice.shape[1]):
        m, M = np.min(gene_unsplice_splice[:, dim_i]), np.max(gene_unsplice_splice[:, dim_i])
        m = m - 0.025 * np.abs(M - m)
        # The upper padding is computed from the already lowered minimum;
        # kept as is so the grid stays where it has always been.
        M = M + 0.025 * np.abs(M - m)
        grs.append(np.linspace(m, M, step[dim_i]))
    meshes_tuple = np.meshgrid(*grs)
    gridpoints_coordinates = np.vstack([i.flat for i in meshes_tuple]).T

    # The jitter is drawn from its own seeded generator.  Previously the seed
    # below was set only after this draw, so the jitter (and with it the
    # selection) changed from call to call despite the "set random seed".
    gridpoints_coordinates = gridpoints_coordinates + norm.rvs(
        loc=0, scale=0.15, size=gridpoints_coordinates.shape,
        random_state=np.random.RandomState(10))

    # Kept for backward compatibility: callers have always found the global
    # NumPy generator re-seeded after this function.
    np.random.seed(10) # set random seed

    points = gene_unsplice_splice[:,0:2]
    # At most 20 neighbors, but never 0 (a single data point used to make
    # kneighbors fail).
    n_neighbors = max(1, min(points.shape[0]-1, 20))

    # One fitted index serves both queries; the data set is the same.
    nn = NearestNeighbors()
    nn.fit(points)

    # Data point closest to every jittered grid point.
    dist, ixs = nn.kneighbors(gridpoints_coordinates, n_neighbors)
    ix_choice = np.unique(ixs[:,0])

    # Local density around every selected point from its neighbor distances.
    dist, ixs = nn.kneighbors(points[ix_choice], n_neighbors)
    density_extimate = gaussian_kernel(dist, mu=0, sigma=0.5).sum(1)
    bool_density = density_extimate > np.percentile(density_extimate, percentile)
    ix_choice = ix_choice[bool_density]
    return(ix_choice)
46 |
def sampling_inverse(gene_unsplice_splice,target_amount=500):
    """Draw row indices with probability inversely proportional to density.

    A Gaussian kernel density estimate is evaluated at every point (first two
    columns of the input); the sampling weight of a point is the reciprocal
    of its density, normalised to sum to one. Indices are drawn with
    replacement.

    Returns an array of `target_amount` row indices.
    """
    coords = np.vstack([gene_unsplice_splice[:,0], gene_unsplice_splice[:,1]])
    density = scipy.stats.gaussian_kde(coords)(coords)

    inverse_density = 1/density
    weights = inverse_density/sum(inverse_density)

    row_ids = np.arange(coords.shape[1])
    sampler = scipy.stats.rv_discrete(values=(row_ids, weights))
    return(sampler.rvs(size=target_amount))
59 |
def sampling_circle(gene_unsplice_splice,target_amount=500):
    """Draw row indices favouring points of low relative density.

    The kernel density estimate of every point (first two columns of the
    input) is divided by the largest density; the sampling weight is
    ``(1 - relative**2)**2 + 0.0001``, normalised to sum to one. Indices are
    drawn with replacement.

    Returns an array of `target_amount` row indices.
    """
    coords = np.vstack([gene_unsplice_splice[:,0], gene_unsplice_splice[:,1]])
    density = scipy.stats.gaussian_kde(coords)(coords)
    row_ids = np.arange(coords.shape[1])

    relative = density/(max(density))
    # The small constant keeps the densest point at a non-zero probability.
    raw_weights = np.square((1-relative**2))+0.0001
    weights = raw_weights/sum(raw_weights)

    sampler = scipy.stats.rv_discrete(values=(row_ids, weights))
    return(sampler.rvs(size=target_amount))
72 |
def sampling_random(gene_unsplice_splice, target_amount=500):
    """Return `target_amount` distinct row indices chosen uniformly at random.

    Uses the global NumPy random generator; `numpy.random.choice` raises
    ValueError when more indices are requested than there are rows.
    """
    n_rows = gene_unsplice_splice.shape[0]
    chosen = np.random.choice(n_rows, size = target_amount, replace=False)
    return(chosen)
76 |
def sampling_adata(detail,
                    para,
                    target_amount=500,
                    step=(30,30)):
    """Sample row indices of one gene in the unsplice/splice plane.

    Arguments
    ---------
    detail: `pandas.DataFrame`
        Table with the columns 'unsplice' and 'splice'.
    para: `str`
        Sampling method: 'neighbors', 'inverse', 'circle' or 'random'.
    target_amount: `int` (default: 500)
        Number of indices to draw ('inverse', 'circle' and 'random').
    step: `tuple` (default: (30,30))
        Grid resolution ('neighbors' only).

    Returns
    -------
    idx: `numpy.ndarray`
        Selected row indices.

    Raises
    ------
    ValueError
        If `para` is not one of the supported methods. (This case used to
        print a hint and then fail with an UnboundLocalError.)
    """
    if para not in ('neighbors', 'inverse', 'circle', 'random'):
        raise ValueError(
            "para must be 'neighbors', 'inverse', 'circle' or 'random'; got %r"
            % (para,))

    data_U_S = np.array(detail[["unsplice","splice"]])
    if para == 'neighbors':
        idx = sampling_neighbors(data_U_S,step)
    elif para == 'inverse':
        idx = sampling_inverse(data_U_S,target_amount)
    elif para == 'circle':
        idx = sampling_circle(data_U_S,target_amount)
    else:  # 'random'
        idx = sampling_random(data_U_S,target_amount)
    return(idx)
96 |
def sampling_embedding(detail,
                    para,
                    target_amount=500,
                    step=(30,30)):

    '''
    Sample row indices of cells in the 2-D embedding.

    Arguments
    ---------
    detail: `pandas.DataFrame`
        Table with the columns 'embedding1' and 'embedding2'.
    para: `str`
        Sampling method: 'neighbors', 'inverse', 'circle' or 'random'.
    target_amount: `int` (default: 500)
        Number of indices to draw ('inverse', 'circle' and 'random').
    step: `tuple` (default: (30,30))
        Grid resolution ('neighbors' only).

    Returns
    -------
    idx: `numpy.ndarray`
        Selected row indices.

    Raises
    ------
    ValueError
        If `para` is not one of the supported methods. (This case used to
        print a hint and then fail with an UnboundLocalError.)
    '''
    if para not in ('neighbors', 'inverse', 'circle', 'random'):
        raise ValueError(
            "para must be 'neighbors', 'inverse', 'circle' or 'random'; got %r"
            % (para,))

    data_U_S = np.array(detail[["embedding1","embedding2"]])
    if para == 'neighbors':
        idx = sampling_neighbors(data_U_S,step)
    elif para == 'inverse':
        # The leftover debug print of the method name was removed here.
        idx = sampling_inverse(data_U_S,target_amount)
    elif para == 'circle':
        idx = sampling_circle(data_U_S,target_amount)
    else:  # 'random'
        idx = sampling_random(data_U_S,target_amount)
    return(idx)
122 |
def adata_to_detail(data, para, gene):
    '''
    Extract one gene from an AnnData-like object as a long-format table.

    data: an anndata
    para: names of the layers holding the unspliced and the spliced counts,
          in that order, e.g. para = ['Mu', 'Ms']
    gene: name of the gene to extract (matched against data.var.index)

    return: DataFrame with the columns 'gene_name', 'unsplice' and 'splice'
            (counts cast to float32), one row per cell
    '''
    gene_mask = data.var.index.isin([gene])
    one_gene = data[:, gene_mask].copy()

    # Column 0 is the selected gene, the only column left after masking.
    unspliced_counts = one_gene.layers[para[0]][:,0].copy().astype(np.float32)
    spliced_counts = one_gene.layers[para[1]][:,0].copy().astype(np.float32)

    columns = {
        'gene_name': gene,
        'unsplice': unspliced_counts,
        'splice': spliced_counts,
    }
    return(pd.DataFrame(columns))
135 |
def downsampling_embedding(data_df,para,target_amount, step, n_neighbors,expression_scale=None,projection_neighbor_choice='embedding',pca_n_components=None,umap_n=None,umap_n_components=None):
    '''
    Sample cells in the embedding and build their neighbor graph.

    data_df: from load_cellDancer; needs the columns 'gene_name',
        'embedding1', 'embedding2' and, depending on the options, 'cellID',
        'splice' and 'unsplice'.
        NOTE: with expression_scale 'log', '2power' or 'power10' the
        'splice'/'unsplice' columns of the DataFrame that was passed in are
        overwritten in place.
    para: sampling method handed to sampling_embedding
    target_amount: number of cells to draw (sampling_embedding)
    step: grid resolution for the sampling; None keeps all cells
    n_neighbors: upper bound for the number of neighbors per cell; capped at
        a quarter of the number of sampled cells, and at least 1
    expression_scale: None, 'log', '2power', 'power10' or
        '2power_norm_multi10'; transformation applied to splice/unsplice
    projection_neighbor_choice: space in which neighbors are searched, one of
        'embedding', 'gene', 'pca', 'pca_norm', 'umap'
    pca_n_components: number of PCA components ('pca', 'pca_norm')
    umap_n, umap_n_components: n_neighbors and n_components of UMAP ('umap')

    return: sampled embedding, the indexs of sampled cells, and the neighbors of sampled cells

    raises: ValueError for an unknown projection_neighbor_choice (this used
        to surface as a NameError at the end of the function)
    '''
    known_projections = ('gene', 'pca', 'pca_norm', 'embedding', 'umap')
    if projection_neighbor_choice not in known_projections:
        raise ValueError(
            'projection_neighbor_choice must be one of %s; got %r'
            % (', '.join(known_projections), projection_neighbor_choice))

    # All genes share the same cells, so the first gene gives the embedding.
    gene = data_df['gene_name'].drop_duplicates().iloc[0]
    embedding = data_df.loc[data_df['gene_name']==gene][['embedding1','embedding2']]

    if step is not None:
        idx_downSampling_embedding = sampling_embedding(embedding,
                    para=para,
                    target_amount=target_amount,
                    step=step)
    else:
        idx_downSampling_embedding=range(0,embedding.shape[0]) # all cells

    def transfer(data_df,expression_scale):
        # Rescale the splice/unsplice columns; unknown scales leave them as is.
        if expression_scale=='log':
            data_df['splice']=np.log(data_df.splice+0.000001)
            data_df['unsplice']=np.log(data_df.unsplice+0.000001)
        elif expression_scale=='2power':
            data_df['splice']=2**(data_df.splice)
            data_df['unsplice']=2**(data_df.unsplice)
        elif expression_scale=='power10':
            data_df['splice']=(data_df.splice)**10
            data_df['unsplice']=(data_df.unsplice)**10
        elif expression_scale=='2power_norm_multi10':
            # Min-max normalise per gene, then map to 2**(10 * normalised).
            gene_order=data_df.gene_name.drop_duplicates()
            # Positional access: the first row, whatever the index labels are.
            onegene=data_df[data_df.gene_name==data_df.gene_name.iloc[0]]
            cellAmt=len(onegene)
            data_df_max=data_df.groupby('gene_name')[['splice','unsplice']].max().rename(columns={'splice': 'splice_max','unsplice': 'unsplice_max'})
            data_df_min=data_df.groupby('gene_name')[['splice','unsplice']].min().rename(columns={'splice': 'splice_min','unsplice': 'unsplice_min'})
            data_df_fin=pd.concat([data_df_max,data_df_min],axis=1).reindex(gene_order)
            # Repeat the per-gene extrema once per cell so they line up with
            # the rows of data_df (assumes rows are grouped gene by gene).
            data_df_fin=data_df_fin.loc[data_df_fin.index.repeat(cellAmt)]
            data_df_combined=pd.concat([data_df.reset_index(drop=True) ,data_df_fin[['splice_max','unsplice_max','splice_min','unsplice_min']].reset_index(drop=True)],axis=1)
            data_df_combined['unsplice_norm']=(data_df_combined.unsplice-data_df_combined.unsplice_min)/(data_df_combined.unsplice_max-data_df_combined.unsplice_min)
            data_df_combined['splice_norm']=(data_df_combined.splice-data_df_combined.splice_min)/(data_df_combined.splice_max-data_df_combined.splice_min)
            data_df_combined['unsplice']=2**(data_df_combined.unsplice_norm*10)
            data_df_combined['splice']=2**(data_df_combined.splice_norm*10)
            data_df=data_df_combined

        return (data_df)

    data_df=transfer(data_df,expression_scale)

    def sampled_splice_matrix():
        # Cells x genes matrix of (transformed) spliced counts, rows in the
        # cell order of the first gene, restricted to the sampled cells.
        cellID = data_df.loc[data_df['gene_name']==gene]['cellID']
        data_df_pivot=data_df.pivot(index='cellID', columns='gene_name', values='splice').reindex(cellID)
        return data_df_pivot.iloc[idx_downSampling_embedding]

    if projection_neighbor_choice=='gene':
        embedding_downsampling = sampled_splice_matrix()
    elif projection_neighbor_choice=='pca': # not use
        from sklearn.decomposition import PCA
        embedding_downsampling_0 = sampled_splice_matrix()
        pca=PCA(n_components=pca_n_components)
        pca.fit(embedding_downsampling_0)
        embedding_downsampling = pca.transform(embedding_downsampling_0)[:,range(pca_n_components)]
    elif projection_neighbor_choice=='pca_norm':
        from sklearn.decomposition import PCA
        embedding_downsampling_0 = sampled_splice_matrix()
        pca=PCA(n_components=pca_n_components)
        pca.fit(embedding_downsampling_0)
        embedding_downsampling_trans = pca.transform(embedding_downsampling_0)[:,range(pca_n_components)]
        # np.ptp instead of the ndarray.ptp method, which NumPy 2.0 removed.
        embedding_downsampling_trans_norm=(embedding_downsampling_trans - embedding_downsampling_trans.min(0)) / np.ptp(embedding_downsampling_trans, axis=0)#normalize
        embedding_downsampling_trans_norm_mult10=embedding_downsampling_trans_norm*10 #optional
        embedding_downsampling=embedding_downsampling_trans_norm_mult10**5 # optional
    elif projection_neighbor_choice=='embedding':
        embedding_downsampling = embedding.iloc[idx_downSampling_embedding][['embedding1','embedding2']]
    else:  # 'umap'
        import umap
        embedding_downsampling_0 = sampled_splice_matrix()
        reducer = umap.UMAP(
            n_neighbors=umap_n,
            min_dist=0.1,
            n_components=umap_n_components,
            metric='euclidean'
        )
        embedding_downsampling = reducer.fit_transform(embedding_downsampling_0)

    n_neighbors = min(int((embedding_downsampling.shape[0])/4), n_neighbors)
    if n_neighbors==0:
        n_neighbors=1
    nn = NearestNeighbors(n_neighbors=n_neighbors)
    nn.fit(embedding_downsampling)
    embedding_knn = nn.kneighbors_graph(mode="connectivity")
    return(embedding_downsampling, idx_downSampling_embedding, embedding_knn)
243 |
def downsampling(data_df, gene_list, downsampling_ixs):
    '''
    Keep the same subset of cells for every gene.

    data_df: long-format table with a 'gene_name' column
    gene_list: genes to keep, in output order
    downsampling_ixs: positional indices (within each gene's rows) of the
        cells to keep

    return: the selected rows of all genes stacked in gene_list order, with
        their original index labels; an empty DataFrame for an empty
        gene_list
    '''
    per_gene = [
        data_df[data_df['gene_name']==gene].iloc[downsampling_ixs]
        for gene in gene_list
    ]
    if not per_gene:
        # pd.concat rejects an empty list; the old append loop gave this.
        return(pd.DataFrame())
    # A single concat replaces repeated DataFrame.append, which was removed
    # in pandas 2.0 and copied the growing frame on every iteration.
    return(pd.concat(per_gene))
254 |
--------------------------------------------------------------------------------
/src/celldancer/simulation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | import pandas as pd
4 | from torch.utils.data import *
5 | import anndata
6 |
7 | from scipy.integrate import solve_ivp
8 |
9 | def _generate_points(u0_start, s0_start, alpha, beta, gamma, t1, t2, samples):
10 |
11 | def trans_dynamics(t, expr):
12 | s = expr[0]
13 | u = expr[1]
14 | du_dt = alpha - beta*u
15 | ds_dt = beta*u - gamma*s
16 | return [ds_dt, du_dt]
17 |
18 | #print("t1 and t2:", t1, t2)
19 | t_space = np.linspace(t1, t2, samples)
20 | num_sol = solve_ivp(trans_dynamics, [0, t2], [s0_start, u0_start], method='RK45', dense_output=True)
21 | XY_num_sol = num_sol.sol(t_space)
22 | S, U = XY_num_sol[0], XY_num_sol[1]
23 | return U, S
24 |
25 | def _jitter(U, S, scale):
26 | S = S + np.random.normal(loc=0.0, scale=scale*np.percentile(S, 99) / 10, size=np.size(S))
27 | U = U + np.random.normal(loc=0.0, scale=scale*np.percentile(U, 99) / 10, size=np.size(U))
28 | S1 = S[(S>0)&(U>0)]
29 | U1 = U[(S>0)&(U>0)]
30 | S1, U1 = np.clip(S, 0, None), np.clip(U, 0, None)
31 | return U1, S1
32 |
def _simulate(u0_start, s0_start, alpha, beta, gamma, t1, t2, samples, dt=0.001, scale=1):
    """Simulate one kinetic segment and return it as a table.

    Points (u0, s0) come from `_generate_points`; (u1, s1) are the same
    points advanced by one explicit Euler step of size ``dt``. ``scale`` is
    accepted but currently unused (the jitter call is disabled).

    Returns ``(expr, (u0_end, s0_end))``: a DataFrame with the columns
    'u0', 's0', 'u1', 's1', 'alpha', 'beta', 'gamma', and the last simulated
    (u0, s0) point.
    """
    u0, s0 = _generate_points(u0_start, s0_start, alpha, beta, gamma, t1, t2, samples)
    last_point = (u0[-1], s0[-1])

    u_velocity = alpha - beta*u0
    s_velocity = beta*u0 - gamma*s0

    expr = pd.DataFrame({
        'u0': u0,
        's0': s0,
        'u1': u0 + u_velocity*dt,
        's1': s0 + s_velocity*dt,
        'alpha': alpha,
        'beta': beta,
        'gamma': gamma,
    })
    return expr, last_point
48 |
def _simulate_without_t( u0_start, s0_start, alpha, beta, gamma, percent_start_u, percent_end_u, samples, dt=0.001, scale=1):
    '''Simulate a segment whose extent is given in percent of the u range.

    For alpha != 0 the percentages are measured from u0_start towards the
    steady state u_max = alpha/beta; for alpha == 0 they are measured from
    u0_start down towards zero. The matching times are obtained by
    integrating dt/du = 1/(alpha - beta*u) from u0_start, and the segment
    between them is produced by _simulate.

    percent_start_u, percent_end_u: start and end of the segment in percent
    return: the (expr, (u0_end, s0_end)) tuple of _simulate
    '''
    def time_per_u(u, _elapsed):
        # Right-hand side of the inverted equation: time as a function of u.
        return 1/(alpha - beta*u)

    if alpha == 0:
        u_max = u0_start
        u_start = u_max * (100-percent_start_u)/100
        u_end = u_max * (100-percent_end_u)/100
    else:
        u_max = alpha/beta
        u_start = u0_start + (u_max-u0_start) * percent_start_u/100
        u_end = u0_start + (u_max-u0_start) * percent_end_u/100

    elapsed = solve_ivp(time_per_u, [u0_start, u_end], [0], method='RK45', dense_output=True)
    t1 = elapsed.sol(u_start)[0]
    t2 = elapsed.sol(u_end)[0]
    return _simulate(u0_start, s0_start, alpha, beta, gamma, t1, t2, samples, dt, scale)
70 |
def forward(alpha, beta, gamma, percent_u1, percent_u2, samples, dt=0.001, noise_level=1):
    """Simulate an induction segment starting from u = s = 0.

    ``percent_u1`` and ``percent_u2`` delimit the segment in percent of the
    way to the steady state; ``noise_level`` is forwarded as ``scale``.

    Returns the expression DataFrame of `_simulate_without_t`.
    """
    simulated = _simulate_without_t(0, 0, alpha, beta, gamma, percent_u1, percent_u2, samples, dt, noise_level)
    expr, _end_point = simulated
    return expr
74 |
def backward(alpha, beta, gamma, percent_u1, percent_u2, samples, dt=0.001, noise_level=1):
    """Simulate a repression segment starting from the steady state.

    The start point is (alpha/beta, alpha/gamma) and the simulation runs
    with the transcription rate set to 0; ``noise_level`` is forwarded as
    ``scale``.

    Returns the expression DataFrame of `_simulate_without_t`.
    """
    steady_u = alpha/beta
    steady_s = alpha/gamma
    simulated = _simulate_without_t(steady_u, steady_s, 0, beta, gamma, percent_u1, percent_u2, samples, dt, noise_level)
    expr, _end_point = simulated
    return expr
80 |
def two_alpha(alpha1, alpha2, beta1, beta2, gamma1, gamma2, percent_u1, percent_u2, samples1, samples2, dt=0.001, noise_level=1):
    """Simulate two consecutive segments with different kinetic rates.

    The first segment starts at u = s = 0 and runs to ``percent_u1``; the
    second one starts at the end point of the first and runs to
    ``percent_u2`` with the second set of rates. ``noise_level`` is
    forwarded as ``scale``.

    Returns one DataFrame with the rows of both segments, indexed 0..n-1.
    """
    expr1, (new_u0_start, new_s0_start) = _simulate_without_t(0, 0, alpha1, beta1, gamma1, 0, percent_u1, samples1, dt, noise_level)
    expr2, end2 = _simulate_without_t(new_u0_start, new_s0_start, alpha2, beta2, gamma2, 0, percent_u2, samples2, dt, noise_level)
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed;
    # ignore_index gives the same 0..n-1 index the code assigned before.
    expr = pd.concat([expr1, expr2], ignore_index=True)
    return expr
87 |
def boost_path(alpha1, alpha2, beta1, beta2, gamma1, gamma2, percent_u1, percent_u2, samples1, samples2, dt=0.001, noise_level=1):
    """Simulate two inductions and stack the second on top of the first.

    Both segments are simulated independently from (0, 0). The second
    segment's ``u0``/``u1`` are then shifted by ``alpha1/beta1`` and its
    ``s0``/``s1`` by ``alpha1/gamma1``, i.e. by the steady state of the first
    parameter set (not by the point actually reached at ``percent_u1``).

    An alternative that chains the second segment from the first segment's
    returned end state is what ``two_alpha`` implements.

    Returns
    -------
    pandas.DataFrame
        Rows of the first segment followed by the shifted second segment,
        re-indexed from 0.
    """
    expr1, _end1 = _simulate_without_t(0, 0, alpha1, beta1, gamma1, 0, percent_u1, samples1, dt, noise_level)
    expr2, _end2 = _simulate_without_t(0, 0, alpha2, beta2, gamma2, 0, percent_u2, samples2, dt, noise_level)

    # Shift the boosted induction so it starts at the first steady state.
    u_offset = alpha1/beta1
    s_offset = alpha1/gamma1
    expr2['u0'] += u_offset
    expr2['s0'] += s_offset
    expr2['u1'] += u_offset
    expr2['s1'] += s_offset

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    return pd.concat([expr1, expr2], ignore_index=True)
103 |
def two_alpha2(alpha1, alpha2, beta1, beta2, gamma1, gamma2, percent_u1, percent_u2, samples1, samples2, dt=0.001, noise_level=1):
    """Simulate two independent segments that both start from (0, 0).

    Each segment uses its own parameter set and end percentage.

    Returns
    -------
    pandas.DataFrame
        Rows of the first segment followed by rows of the second, re-indexed
        from 0.
    """
    expr1, _end1 = _simulate_without_t(0, 0, alpha1, beta1, gamma1, 0, percent_u1, samples1, dt, noise_level)
    expr2, _end2 = _simulate_without_t(0, 0, alpha2, beta2, gamma2, 0, percent_u2, samples2, dt, noise_level)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    return pd.concat([expr1, expr2], ignore_index=True)
110 |
def two_alpha3(alpha1, alpha2, beta1, beta2, gamma1, gamma2, percent_u1, percent_u2, samples1, samples2, dt=0.001, noise_level=0.02):
    """Simulate three chained segments and return the last two.

    A preliminary segment with the second parameter set is run from (0, 0)
    to 99.9 percent; only its end state is used. From there a segment with
    the first parameter set runs to ``percent_u1``, and from that end state a
    segment with the second parameter set runs to ``percent_u2``.

    Returns
    -------
    pandas.DataFrame
        Rows of the second and third segments, re-indexed from 0.
    """
    # Rows of the warm-up segment are discarded on purpose; only its end state matters.
    _warmup, (new_u0_start, new_s0_start) = _simulate_without_t(0, 0, alpha2, beta2, gamma2, 0, 99.9, samples2, dt, noise_level)
    expr1, (new_u0_start2, new_s0_start2) = _simulate_without_t(new_u0_start, new_s0_start, alpha1, beta1, gamma1, 0, percent_u1, samples1, dt, noise_level)
    expr2, _end = _simulate_without_t(new_u0_start2, new_s0_start2, alpha2, beta2, gamma2, 0, percent_u2, samples2, dt, noise_level)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    return pd.concat([expr1, expr2], ignore_index=True)
118 |
def generate_with_df(gene_info, dt=0.001, noise_level=0.2):
    """Simulate one segment per row of ``gene_info`` and concatenate them.

    Arguments
    ---------
    gene_info: `pandas.DataFrame`
        One row per segment with columns ``start_u``, ``start_s``, ``alpha``,
        ``beta``, ``gamma``, ``start_pct``, ``end_pct`` and ``samples``.
        When ``start_u`` or ``start_s`` is missing (``None``/NaN) the segment
        starts from the end state of the previous row.
    dt: `float` (default: `0.001`)
        Accepted for signature compatibility; it is not forwarded to
        ``_simulate_without_t``, which therefore uses its own default.
    noise_level: `float` (default: `0.2`)
        Passed to ``_jitter`` together with the combined ``u0``/``s0`` columns.

    Returns
    -------
    (gene_info, expr) on success. If the first row has no start point, a
    message is printed and ``None`` is returned.
    """
    segments = []
    last_u, last_s = None, None
    # Iterate rows positionally so a non-default index on gene_info is harmless.
    for row in gene_info.itertuples(index=False):
        start_u, start_s = row.start_u, row.start_s
        if pd.isna(start_u) or pd.isna(start_s):
            if last_u is None or last_s is None:
                print("start_u and start_s should not be None at the first line.")
                return None
            start_u, start_s = last_u, last_s
        segment, (last_u, last_s) = _simulate_without_t(
            start_u, start_s, row.alpha, row.beta, row.gamma,
            row.start_pct, row.end_pct, row.samples)
        segments.append(segment)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    expr = pd.concat(segments, ignore_index=True) if segments else pd.DataFrame()
    expr.u0, expr.s0 = _jitter(expr.u0, expr.s0, noise_level)
    return gene_info, expr
137 |
def adata_to_detail(data, para, gene):
    """Flatten one gene of an AnnData-like object into a per-cell DataFrame.

    ``para`` lists five layer names, in the order u0, s0, alpha, beta, gamma.
    The first column of each layer (after subsetting to ``gene``) is taken as
    float32. ``embedding1``/``embedding2`` are filled with the u0/s0 values,
    and ``path1_pct``/``path2_pct`` are copied from the first entry of the
    matching ``var`` columns.
    """
    gene_view = data[:, data.var.index.isin([gene])].copy()

    def first_column(layer_name):
        return gene_view.layers[layer_name][:,0].copy().astype(np.float32)

    u0, s0, alpha, beta, gamma = [first_column(para[k]) for k in range(5)]

    detail = pd.DataFrame({
        'gene_list': gene,
        'u0': u0,
        's0': s0,
        'embedding1': u0,
        'embedding2': s0,
        'alpha': alpha,
        'beta': beta,
        'gamma': gamma,
    })
    for pct_column in ('path1_pct', 'path2_pct'):
        detail[pct_column] = gene_view.var[pct_column].to_numpy()[0]
    return detail
153 |
def generate(type, gene_num, alpha1, alpha2, beta1, beta2, gamma1, gamma2, path1_pct, path2_pct, path1_sample, path2_sample, noise_level):
    """Simulate ``gene_num`` genes of one kinetic type and return a long table.

    Arguments
    ---------
    type: `str`
        One of 'forward', 'backward', 'two_alpha', 'two_alpha2',
        'two_alpha3' or 'boost'. The historical misspelling 'forwad' is
        still accepted.
    gene_num: `int`
        Number of genes to simulate; they are named ``simulation000``, ...
    alpha1, alpha2, beta1, beta2, gamma1, gamma2:
        Kinetic parameters of the two paths. 'forward' and 'backward' only
        use the first set.
    path1_pct, path2_pct:
        End percentages handed to the two-path simulators.
    path1_sample, path2_sample: `int`
        Cell counts of the two paths; at most their sum is kept per gene.
    noise_level:
        Forwarded to the simulator and to ``_jitter``.

    Returns
    -------
    pandas.DataFrame
        The per-gene outputs of ``adata_to_detail`` stacked, with
        ``u0``/``s0`` renamed to ``unsplice``/``splice``, ``gene_list`` renamed
        to ``gene_name``, plus ``cellID`` and ``clusters`` columns.

    Raises
    ------
    ValueError
        If ``type`` is not a known kinetic type.
    """
    one_path_args = dict(alpha=alpha1, beta=beta1, gamma=gamma1,
                         percent_u1=0.1, percent_u2=99.9,
                         samples=path1_sample, noise_level=noise_level)
    two_path_args = dict(alpha1=alpha1, alpha2=alpha2, beta1=beta1, beta2=beta2,
                         gamma1=gamma1, gamma2=gamma2,
                         percent_u1=path1_pct, percent_u2=path2_pct,
                         samples1=path1_sample, samples2=path2_sample,
                         noise_level=noise_level)
    simulators = {
        "forward": (forward, one_path_args),
        # The original code only matched this misspelling; keep it working.
        "forwad": (forward, one_path_args),
        "backward": (backward, one_path_args),
        "two_alpha": (two_alpha, two_path_args),
        "two_alpha2": (two_alpha2, two_path_args),
        "two_alpha3": (two_alpha3, two_path_args),
        "boost": (boost_path, two_path_args),
    }
    if type not in simulators:
        raise ValueError("type not match: %r; expected one of %s" % (type, sorted(simulators)))
    simulate_one, simulator_args = simulators[type]

    cell_num = path1_sample + path2_sample
    layer_columns = ('u0', 's0', 'u1', 's1', 'alpha', 'beta', 'gamma')
    per_gene = {column: {} for column in layer_columns}
    gene_rows = []

    for i in range(gene_num):
        expr = simulate_one(**simulator_args)
        expr.u0, expr.s0 = _jitter(expr.u0, expr.s0, noise_level)
        expr = expr.head(cell_num)
        gene_name = "simulation"+str(i).zfill(3)
        for column in layer_columns:
            per_gene[column][gene_name] = expr[column]
        gene_rows.append({'gene_name':gene_name, 'type':"multi_path", 'alpha1':alpha1, 'alpha2':alpha2, 'beta1':beta1, 'beta2':beta2, 'gamma1':gamma1, 'gamma2':gamma2, 'path1_pct':path1_pct, 'path2_pct':path2_pct, 'samples':len(expr)})

    # Build each table once; DataFrame.append was removed in pandas 2.0.
    gene_info = pd.DataFrame(gene_rows, columns=['gene_name', 'type', 'alpha1', 'alpha2', 'beta1', 'beta2', 'gamma1', 'gamma2', 'path1_pct', 'path2_pct', 'samples'])
    matrices = {column: pd.DataFrame(per_gene[column]) for column in layer_columns}
    s0s = matrices['s0']

    cell_info = pd.DataFrame()
    cell_info['barcode'] = s0s.index
    adata = anndata.AnnData(
        X=s0s.to_numpy(),
        obs=cell_info,
        var=gene_info,
        layers={column + 's': matrices[column].to_numpy() for column in layer_columns},
    )
    adata.var_names = gene_info['gene_name']

    details = [adata_to_detail(adata, para=['u0s', 's0s', 'alphas', 'betas', "gammas"], gene=g)
               for g in adata.var_names]
    data_onegene = pd.concat(details) if details else pd.DataFrame()
    data_onegene = data_onegene.rename(columns={"u0": "unsplice", "s0": "splice","gene_list": "gene_name"})
    data_onegene.loc[:,'cellID'] = list(range(len(data_onegene)))
    data_onegene.loc[:,'clusters'] = None
    return data_onegene
217 |
def generate_mono(alpha1, alpha2, beta1, beta2, gamma1, gamma2, path1_pct, path2_pct, path1_sample, path2_sample, noise_level, gene_num=1):
    """Run ``generate`` with the "two_alpha" kinetic type and return its result."""
    return generate(
        "two_alpha", gene_num,
        alpha1, alpha2,
        beta1, beta2,
        gamma1, gamma2,
        path1_pct, path2_pct,
        path1_sample, path2_sample,
        noise_level,
    )
220 |
def generate_tran_boost(alpha1, alpha2, beta1, beta2, gamma1, gamma2, path1_pct, path2_pct, path1_sample, path2_sample, noise_level, gene_num=1):
    """Run ``generate`` with the "two_alpha" kinetic type and return its result.

    This is the same simulation type that ``generate_mono`` uses; the two
    entry points differ only in name.
    """
    return generate(
        "two_alpha", gene_num,
        alpha1, alpha2,
        beta1, beta2,
        gamma1, gamma2,
        path1_pct, path2_pct,
        path1_sample, path2_sample,
        noise_level,
    )
223 |
def generate_forward(alpha1, alpha2, beta1, beta2, gamma1, gamma2, path1_pct, path2_pct, path1_sample, path2_sample, noise_level, gene_num=1):
    """Run ``generate`` with the "two_alpha2" kinetic type and return its result."""
    return generate(
        "two_alpha2", gene_num,
        alpha1, alpha2,
        beta1, beta2,
        gamma1, gamma2,
        path1_pct, path2_pct,
        path1_sample, path2_sample,
        noise_level,
    )
226 |
def generate_backward(start_s1, start_s2, start_u1, start_u2,alpha1, alpha2, beta1, beta2, gamma1, gamma2, path1_sample, path2_sample,noise_level=None):
    """Simulate two segments from explicit start points via ``generate_with_df``.

    Each segment is described by its start point (``start_u*``, ``start_s*``),
    its kinetic parameters and its sample count; both run from 0 to 99
    percent. ``noise_level`` is forwarded to ``generate_with_df`` as its
    noise level; when it is ``None`` the default of ``generate_with_df`` is
    used.

    Returns
    -------
    pandas.DataFrame
        The simulated rows with ``embedding1``/``embedding2`` copied from
        ``u0``/``s0``, ``u0``/``s0`` renamed to ``unsplice``/``splice``, plus
        ``cellID`` and ``clusters`` columns.
    """
    columns = ['gene_name', 'start_u', 'start_s', 'alpha', 'beta', 'gamma', 'start_pct', 'end_pct', 'samples']
    rows = [
        {'gene_name':'g1', 'start_u':start_u1, 'start_s':start_s1, 'alpha':alpha1, 'beta':beta1, 'gamma':gamma1, 'start_pct':0, 'end_pct':99, 'samples':path1_sample},
        {'gene_name':'g1', 'start_u':start_u2, 'start_s':start_s2, 'alpha':alpha2, 'beta':beta2, 'gamma':gamma2, 'start_pct':0, 'end_pct':99, 'samples':path2_sample},
    ]
    # DataFrame.append was removed in pandas 2.0. dtype=object keeps a None
    # start value as None (not NaN), as appending dicts to an empty frame did.
    gene_info = pd.DataFrame(rows, columns=columns, dtype=object)

    # The second positional parameter of generate_with_df is dt, so the noise
    # level must be passed by keyword or it is silently ignored.
    if noise_level is None:
        gene_info, expr = generate_with_df(gene_info)
    else:
        gene_info, expr = generate_with_df(gene_info, noise_level=noise_level)

    expr['embedding1'] = expr['u0']
    expr['embedding2'] = expr['s0']
    expr = expr.rename(columns={"u0": "unsplice", "s0": "splice","gene_list": "gene_name"})
    expr.loc[:,'cellID'] = list(range(len(expr)))
    expr.loc[:,'clusters'] = None
    return expr
239 |
def generate_by_each_cell(df, t, dt=0.001, noise_level=1):
    """Simulate one cell per row of ``df``, chaining each cell from the last.

    Arguments
    ---------
    df: `pandas.DataFrame`
        One row per cell with columns ``alpha``, ``beta`` and ``gamma``.
    t: `float`
        Total time; each cell is simulated at ``t / len(df)``.
    dt: `float` (default: `0.001`)
        Forwarded to ``_simulate``.
    noise_level: `float` (default: `1`)
        Forwarded to ``_simulate`` as ``scale`` and to ``_jitter``.

    Returns
    -------
    pandas.DataFrame
        One simulated row per cell, re-indexed from 0, with an added ``t``
        column equal to ``t / len(df) * (row position + 1)``.
    """
    ti = t/len(df.index)

    cells = []
    last_u0, last_s0 = 0, 0
    # zip walks the rows positionally, independent of df's index labels.
    for alpha, beta, gamma in zip(df['alpha'], df['beta'], df['gamma']):
        sub_expr, (last_u0, last_s0) = _simulate(
            u0_start=last_u0, s0_start=last_s0,
            alpha=alpha, beta=beta, gamma=gamma,
            t1=ti, t2=ti,
            samples=1,
            dt=dt, scale=noise_level)
        cells.append(sub_expr)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    expr = pd.concat(cells, ignore_index=True) if cells else pd.DataFrame()
    # NOTE(review): jitter is applied once to the combined table, mirroring
    # generate_with_df - confirm this matches the intended nesting.
    expr.u0, expr.s0 = _jitter(expr.u0, expr.s0, noise_level)

    expr['t'] = ti * (expr.index+1)
    return expr
262 |
def simulate(kinetic_type,
             alpha1=None,
             alpha2=None,
             beta1=None,
             beta2=None,
             gamma1=None,
             gamma2=None,
             start_splice1=None,
             start_splice2=None,
             start_unsplice1=None,
             start_unsplice2=None,
             path1_pct=None,
             path2_pct=None,
             path1_cell_number=None,
             path2_cell_number=None,
             noise_level=0.2):

    """
    Simulate a gene with the kinetic type of mono-kinetic, multi-forward, multi-backward, or transcriptional boost.

    Arguments
    ---------
    kinetic_type: `str`
        kinetic_type could be selected from ['mono', 'multi_forward', 'multi_backward', 'tran_boost']

    alpha1: `float` (default: `None`)
        The simulated alpha (transcriptional rate) for the first lineage. This parameter is valid when kinetic_type is set to 'mono', 'multi_forward', or 'tran_boost'.

    alpha2: `float` (default: `None`)
        The simulated alpha (transcriptional rate) for the second lineage. This parameter is valid when kinetic_type is set to 'mono', 'multi_forward', or 'tran_boost'.

    beta1: `float` (default: `None`)
        The simulated beta (splicing rate) for the first lineage.

    beta2: `float` (default: `None`)
        The simulated beta (splicing rate) for the second lineage.

    gamma1: `float` (default: `None`)
        The simulated gamma (degradation rate) for the first lineage.

    gamma2: `float` (default: `None`)
        The simulated gamma (degradation rate) for the second lineage.

    start_splice1: optional, `float` (default: `None`)
        The simulated spliced abundance for the first lineage. Cells start from a region at a point of (start_splice1, start_unsplice1) to decrease. This parameter is valid when kinetic_type is set to 'multi_backward'.

    start_splice2: optional, `float` (default: `None`)
        The simulated spliced abundance for the second lineage. Cells start from a region at a point of (start_splice2, start_unsplice2) to decrease. This parameter is valid when kinetic_type is set to 'multi_backward'.

    start_unsplice1: optional, `float` (default: `None`)
        The simulated unspliced abundance for the first lineage. Cells start from a region at a point of (start_splice1, start_unsplice1) to decrease. This parameter is valid when kinetic_type is set to 'multi_backward'.

    start_unsplice2: optional, `float` (default: `None`)
        The simulated unspliced abundance for the second lineage. Cells start from a region at a point of (start_splice2, start_unsplice2) to decrease. This parameter is valid when kinetic_type is set to 'multi_backward'.

    path1_pct: optional, `float` (default: `None`)
        To decrease the bias of cell distribution at the steady point in the first lineage. This parameter is valid when kinetic_type is set to 'mono', 'multi_forward' or 'tran_boost'.

    path2_pct: optional, `float` (default: `None`)
        To decrease the bias of cell distribution at the steady point in the second lineage. This parameter is valid when kinetic_type is set to 'mono', 'multi_forward' or 'tran_boost'.

    path1_cell_number: `int` (default: `None`)
        The number of cells to be generated in the first lineage.

    path2_cell_number: `int` (default: `None`)
        The number of cells to be generated in the second lineage.

    noise_level: `float` (default: `0.2`)
        The noise level to be set.

    Returns
    -------
    df: pandas.DataFrame
        The dataframe of one simulated gene.

    Raises
    ------
    ValueError
        If kinetic_type is not one of the supported kinetic types.

    -------

    Example usage:

    .. code-block:: python

        import celldancer.simulation as cdsim
        import matplotlib.pyplot as plt

        # Mono-kinetic
        plt.figure(figsize=(5,5))
        gene=cdsim.simulate(kinetic_type='mono',
                            alpha1=1,
                            alpha2=0,
                            beta1=1,
                            beta2=1,
                            gamma1=1,
                            gamma2=1,
                            path1_pct=99,
                            path2_pct=99,
                            path1_cell_number=1000,
                            path2_cell_number=1000)
        plt.scatter(gene.splice,gene.unsplice,c='#95D9EF',alpha=0.5)

        # Multi-lineage forward branching
        plt.figure(figsize=(5,5))
        gene=cdsim.simulate(kinetic_type='multi_forward',
                            alpha1=5,
                            alpha2=1,
                            beta1=1,
                            beta2=0.5,
                            gamma1=5,
                            gamma2=0.25,
                            path1_pct=99,
                            path2_pct=99,
                            path1_cell_number=1000,
                            path2_cell_number=1000)
        plt.scatter(gene.splice,gene.unsplice,c='#95D9EF',alpha=0.5)

        # Multi-lineage backward branching
        plt.figure(figsize=(5,5))
        gene=cdsim.simulate(kinetic_type='multi_backward',
                            beta1=1,
                            beta2=1,
                            gamma1=1,
                            gamma2=1,
                            start_splice1=1,
                            start_splice2=1.5,
                            start_unsplice1=1,
                            start_unsplice2=0.2,
                            path1_cell_number=1000,
                            path2_cell_number=1000)
        plt.scatter(gene.splice,gene.unsplice,c='#95D9EF',alpha=0.5)

        # Transcriptional boost
        plt.figure(figsize=(5,5))
        gene=cdsim.simulate(kinetic_type='tran_boost',
                            alpha1=2,
                            alpha2=5,
                            beta1=2,
                            beta2=2,
                            gamma1=1,
                            gamma2=1,
                            path1_pct=99,
                            path2_pct=80,
                            path1_cell_number=1000,
                            path2_cell_number=1000)
        plt.scatter(gene.splice,gene.unsplice,c='#95D9EF',alpha=0.5)

    .. image:: _static/sim.png
      :width: 100%
      :alt: sim

    """

    if kinetic_type == 'multi_backward':
        # Backward branching has no transcription, so both alphas are fixed to 0.
        return generate_backward(start_s1=start_splice1,
                                 start_s2=start_splice2,
                                 start_u1=start_unsplice1,
                                 start_u2=start_unsplice2,
                                 alpha1=0,
                                 alpha2=0,
                                 beta1=beta1,
                                 beta2=beta2,
                                 gamma1=gamma1,
                                 gamma2=gamma2,
                                 path1_sample=path1_cell_number,
                                 path2_sample=path2_cell_number,
                                 noise_level=noise_level)

    # The remaining kinetic types share one call signature.
    two_path_generators = {
        'mono': generate_mono,
        'multi_forward': generate_forward,
        'tran_boost': generate_tran_boost,
    }
    if kinetic_type not in two_path_generators:
        kinetic_type_list = ['mono', 'multi_forward', 'multi_backward', 'tran_boost']
        raise ValueError('Unknown kinetic_type %r; choose one of %s.' % (kinetic_type, kinetic_type_list))

    # path1_sample must be the first lineage's cell count for every kinetic
    # type; 'multi_forward' used to receive path2_cell_number here by mistake.
    return two_path_generators[kinetic_type](alpha1=alpha1,
                                             alpha2=alpha2,
                                             beta1=beta1,
                                             beta2=beta2,
                                             gamma1=gamma1,
                                             gamma2=gamma2,
                                             path1_pct=path1_pct,
                                             path2_pct=path2_pct,
                                             path1_sample=path1_cell_number,
                                             path2_sample=path2_cell_number,
                                             noise_level=noise_level)
--------------------------------------------------------------------------------
/src/celldancer/utilities.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from scipy.sparse import csr_matrix
3 | import scipy
4 | import pandas as pd
5 | import anndata as ad
6 | from sklearn.neighbors import NearestNeighbors
7 | from statsmodels.nonparametric.kernel_regression import KernelReg
8 |
9 | # progress bar
10 | import contextlib
11 | import joblib
12 | from tqdm import tqdm
13 |
@contextlib.contextmanager
def tqdm_joblib(tqdm_object):
    """Context manager that makes joblib report batch completion to a tqdm bar.

    While the context is active, ``joblib.parallel.BatchCompletionCallBack``
    is swapped for a subclass that advances ``tqdm_object`` by the batch size
    before delegating to the original callback. On exit the original class
    is restored and the progress bar is closed.
    """
    original_callback = joblib.parallel.BatchCompletionCallBack

    class _ProgressReportingCallback(original_callback):
        def __call__(self, *args, **kwargs):
            tqdm_object.update(n=self.batch_size)
            return super().__call__(*args, **kwargs)

    joblib.parallel.BatchCompletionCallBack = _ProgressReportingCallback
    try:
        yield tqdm_object
    finally:
        # Always undo the monkey patch, even if the body raised.
        joblib.parallel.BatchCompletionCallBack = original_callback
        tqdm_object.close()
29 |
def _non_para_kernel(X,Y,down_sample_idx):
    """Fit a kernel regression of Y on time over down-sampled cells.

    Arguments
    ---------
    X: `pandas.DataFrame`
        Must contain the columns ``index`` (cell identifier) and ``time``.
    Y: `pandas.Series` or `pandas.DataFrame`
        Values to regress; filtered with the same boolean mask as ``X``.
    down_sample_idx:
        Cell identifiers to keep; rows whose ``X['index']`` is not listed are
        dropped before fitting.

    Returns
    -------
    (estimator, r_square)
        ``estimator`` is the fitted mean evaluated at the sampled times,
        reshaped to ``(n, 1)``; ``r_square`` is ``KernelReg.r_squared()``.
    """
    # TODO: using only one gene is reported to cause problems when combining.
    keep = X['index'].isin(down_sample_idx)
    Y_sampled = Y[keep]
    X_sampled = X[keep].time

    kde = KernelReg(endog=Y_sampled,
                    exog=X_sampled,
                    var_type='c',
                    )
    n = X_sampled.shape[0]

    # KernelReg.fit returns (mean, marginal effects); only the mean is used.
    estimator = kde.fit(X_sampled)
    estimator = np.reshape(estimator[0],[n,1])

    return(estimator,kde.r_squared())
64 |
def getidx_downSampling_embedding(load_cellDancer,cell_choice=None):
    """Down-sample cells on the embedding and return the kept cell indices.

    The embedding of the first gene's rows (optionally restricted to
    ``cell_choice`` via ``cellIndex``) is passed to ``sampling_embedding``.
    Without ``cell_choice`` its result is returned as is. With
    ``cell_choice`` the sampled positions are translated back to the
    ``cellIndex`` values of the restricted table.
    """
    from .sampling import sampling_embedding

    restricted = cell_choice is not None
    if restricted:
        load_cellDancer = load_cellDancer[load_cellDancer.cellIndex.isin(cell_choice)]

    # Rows of a single gene give exactly one row per cell.
    first_gene = load_cellDancer.gene_name.iloc[0]
    one_gene_rows = load_cellDancer[load_cellDancer.gene_name == first_gene]

    sampled_positions = sampling_embedding(one_gene_rows[['embedding1','embedding2']],
                                           para='neighbors',
                                           target_amount=0,
                                           step=(30,30)  # TODO: default is 30
                                           )
    if not restricted:
        return sampled_positions

    # Map positions within the restricted table back to the original cellIndex.
    positioned = one_gene_rows.assign(transfer_id=range(len(one_gene_rows)))
    return positioned[positioned.transfer_id.isin(sampled_positions)].cellIndex
89 |
90 |
def combine_parallel_result(result,gene_list,sampled_idx,merged_part_time):
    """Combine per-gene (fit, r_square) pairs into ranked tables.

    Arguments
    ---------
    result: `list`
        One ``(non_para_fit, r_square)`` pair per gene, in ``gene_list``
        order; ``non_para_fit`` has shape ``(n, 1)``.
    gene_list:
        Gene names, one per entry of ``result``.
    sampled_idx:
        Cell identifiers used for the fit; selects the column labels.
    merged_part_time: `pandas.DataFrame`
        Must contain an ``index`` column with the cell identifiers.

    Returns
    -------
    (r_square_non_para_list_sort, non_para_fit_heat, non_para_fit_list)
        The genes with their r_square and fitted values sorted by r_square
        (descending), the fitted values as a gene-by-cell DataFrame, and the
        same values as a ``(genes, n)`` array.

    Raises
    ------
    ValueError
        If ``result`` is empty.
    """
    if len(result) == 0:
        raise ValueError("result is empty; there is nothing to combine.")

    # Build both arrays in one pass. This also handles a single gene, which
    # the incremental vstack version could not (it indexed a 0-d array).
    r_square_values = np.array([np.ravel(result_i[1])[0] for result_i in result])
    non_para_fit_list = np.vstack([np.transpose(result_i[0])[0] for result_i in result])

    r_square = pd.DataFrame({'gene_name':gene_list,'r_square':r_square_values})

    non_para_fit_heat = pd.DataFrame(non_para_fit_list,index=gene_list)
    non_para_fit_heat.columns = merged_part_time[merged_part_time['index'].isin(sampled_idx)]['index']

    non_para_list = pd.DataFrame(non_para_fit_list)
    non_para_list['combined'] = non_para_list.values.tolist()
    r_square_non_para_list = pd.concat([r_square,non_para_list['combined']],axis=1)
    r_square_non_para_list_sort = r_square_non_para_list.sort_values(by=['r_square'], axis=0, ascending=False)

    return(r_square_non_para_list_sort,non_para_fit_heat,non_para_fit_list)
116 |
def get_rsquare(load_cellDancer,gene_list,s0_merged_part_time,s0_merged_part_gene,cell_choice=None,):
    """Compute per-gene kernel-regression fits and r_square in parallel.

    Cells are first down-sampled with ``getidx_downSampling_embedding``; then
    ``_non_para_kernel`` is run once per gene through joblib (all cores, loky
    backend) with a tqdm progress bar, and the results are merged by
    ``combine_parallel_result``.

    Returns
    -------
    (r_square_non_para_list_sort, non_para_fit_heat, non_para_fit_list, sampled_idx)
    """
    from joblib import Parallel, delayed

    # downsample
    sampled_idx = getidx_downSampling_embedding(load_cellDancer,cell_choice=cell_choice)

    # The worker is _non_para_kernel; the previously referenced
    # _non_para_kernel_t4 is not defined anywhere in the visible module.
    with tqdm_joblib(tqdm(desc="Calculate rsquare", total=len(gene_list))):
        result = Parallel(n_jobs=-1, backend="loky")(  # TODO: FIND suitable njobs
            delayed(_non_para_kernel)(s0_merged_part_time, s0_merged_part_gene[gene], sampled_idx)
            for gene in gene_list)

    # combine
    r_square_non_para_list_sort,non_para_fit_heat,non_para_fit_list = combine_parallel_result(result,gene_list,sampled_idx,s0_merged_part_time)

    return (r_square_non_para_list_sort,non_para_fit_heat,non_para_fit_list,sampled_idx)
133 |
134 |
def get_gene_s0_by_time(cell_time,load_cellDancer):
    """Build a time-ordered cell-by-gene table from the long-format table.

    ``cell_time`` (two columns, sorted here by ``pseudotime``) is relabelled
    to ``index``/``time`` and merged with a pivot of ``load_cellDancer`` that
    has one row per ``cellIndex`` and one column per gene, filled from the
    ``unsplice`` column.

    Returns
    -------
    (gene_part, time_part)
        The gene columns of the merged table and its first two columns
        (``index``, ``time``), in matching row order.
    """
    ordered_cells = cell_time.sort_values('pseudotime')
    ordered_cells.columns = ['index','time']

    expression_by_cell = load_cellDancer.pivot(index='cellIndex', columns='gene_name', values='unsplice')

    merged = pd.merge(ordered_cells, expression_by_cell, left_on='index', right_on='cellIndex')  # TODO: NOT cellIndex in the future

    # The first two merged columns are index/time; everything after is genes.
    time_columns = merged.columns[0:2]
    gene_columns = merged.columns[2:]
    time_part = merged.loc[:, time_columns]
    gene_part = merged.loc[:, gene_columns]

    return(gene_part, time_part)
148 |
def rank_rsquare(load_cellDancer,gene_list=None,cluster_choice=None):
    """Rank genes by the r_square of their kernel regression on pseudotime.

    Arguments
    ---------
    load_cellDancer: `pandas.DataFrame`
        Long-format table with at least ``gene_name``, ``cellIndex``,
        ``pseudotime`` and ``clusters`` columns, plus those needed by the
        helpers this function calls.
    gene_list: optional (default: `None`)
        Genes to evaluate; defaults to every gene column of the pivoted table.
    cluster_choice: optional (default: `None`)
        Clusters whose cells are used; defaults to all clusters.

    Returns
    -------
    pandas.DataFrame
        Columns ``gene_name`` and ``r_square``, ordered as returned by
        ``get_rsquare`` with a fresh 0-based index.
    """
    # Take the first row's gene by position: a label lookup (gene_name[0])
    # fails or returns several rows when the index is not a default RangeIndex.
    first_gene = load_cellDancer.gene_name.iloc[0]
    onegene = load_cellDancer[load_cellDancer.gene_name == first_gene]

    cell_time = onegene[['cellIndex','pseudotime']]
    s0_merged_part_gene,s0_merged_part_time = get_gene_s0_by_time(cell_time,load_cellDancer)

    if cluster_choice is None:
        cluster_choice = list(onegene.clusters.drop_duplicates())
    cell_idx = list(onegene[onegene.clusters.isin(cluster_choice)].cellIndex)

    if gene_list is None:
        gene_list = s0_merged_part_gene.columns
    r_square_non_para_list_sort,non_para_fit_heat,non_para_fit_list,sampled_idx = get_rsquare(load_cellDancer,gene_list,s0_merged_part_time,s0_merged_part_gene,cell_choice=cell_idx)
    return(r_square_non_para_list_sort[['gene_name','r_square']].reset_index(drop=True))
163 |
164 |
def adata_to_df_with_embed(adata,
                           us_para=['Mu', 'Ms'],
                           cell_type_para='celltype',
                           embed_para='X_umap',
                           save_path='cell_type_u_s_sample_df.csv',
                           gene_list=None):

    """Convert adata to pandas.DataFrame format and save it as csv file with embedding info.

    Arguments
    ---------
    adata: `anndata._core.anndata.AnnData`
        The adata to be transferred.
    us_para: `list` (default: ['Mu','Ms'])
        Names of the two layers in adata.layers holding the unspliced (first entry) and spliced (second entry) abundances. They fill the unsplice and splice columns of the result.
    cell_type_para: `str` (default: 'celltype')
        Column of adata.obs that provides the cell type; it fills the clusters column.
    embed_para: `str` (default: 'X_umap')
        Key of adata.obsm that provides the 2-dimensional embedding of all cells; its first two columns fill embedding1 and embedding2.
    save_path: `str` (default: 'cell_type_u_s_sample_df.csv')
        Path of the csv file to write. The file is written gene by gene, read back, and rewritten with the per-cell columns attached.
    gene_list: `list` (default: None)
        Specific gene(s) to be transferred; all genes of adata.var when None.

    Returns
    -------
    raw_data: `pandas.DataFrame`
        pandas DataFrame with columns gene_name, unsplice, splice, cellID, clusters, embedding1, embedding2.
    """
    from tqdm import tqdm

    if gene_list is None:
        gene_list = adata.var.index

    # Stream one gene at a time to disk: the first gene creates the file
    # (with header), the following genes are appended without a header.
    for position, gene in enumerate(tqdm(gene_list)):
        gene_view = adata[:, adata.var.index.isin([gene])].copy()
        unsplice = gene_view.layers[us_para[0]][:,0].copy().astype(np.float32)
        splice = gene_view.layers[us_para[1]][:,0].copy().astype(np.float32)
        one_gene = pd.DataFrame({'gene_name':gene, 'unsplice':unsplice, 'splice':splice})

        is_first = position == 0
        one_gene.to_csv(save_path, mode='w' if is_first else 'a', header=is_first, index=False)

    # Per-cell columns, repeated once per gene so they line up with the csv rows.
    coordinates = adata.obsm[embed_para]
    per_cell = pd.concat(
        [
            pd.DataFrame({'cellID':adata.obs.index}),
            pd.DataFrame({'clusters':adata.obs[cell_type_para].reset_index(drop=True)}),
            pd.DataFrame({'embedding1':coordinates[:,0],'embedding2':coordinates[:,1]}),
        ],
        axis=1)
    per_cell_all_genes = pd.concat([per_cell]*len(gene_list)).reset_index(drop=True)

    raw_data = pd.concat([pd.read_csv(save_path), per_cell_all_genes], axis=1)
    raw_data.to_csv(save_path, header=True, index=False)

    return(raw_data)
232 |
233 | def to_dynamo(cellDancer_df):
234 | '''
235 | Convert the output dataframe of cellDancer to the input of dynamo. The output of this function can be directly used in the downstream analyses of dynamo.
236 |
237 | Example usage:
238 |
239 | .. code-block:: python
240 |
241 | import dynamo as dyn
242 | import numpy as np
243 | import pandas as pd
244 | import anndata as ann
245 | import matplotlib.pyplot as plt
246 | import celldancer as cd
247 | import celldancer.utilities as cdutil
248 |
249 | # load the prediction result of all genes, the data could be achieved from section 'Deciphering gene regulation through vector fields analysis in pancreatic endocrinogenesis'
250 | cellDancer_df=pd.read_csv('HgForebrainGlut_cellDancer_estimation_spliced.csv')
251 | cellDancer_df=cd.compute_cell_velocity(cellDancer_df=cellDancer_df, projection_neighbor_choice='embedding', expression_scale='power10', projection_neighbor_size=100) # compute cell velocity
252 |
253 | # transform celldancer dataframe to anndata
254 | adata_from_dancer = cdutil.to_dynamo(cellDancer_df)
255 |
256 | # plot the velocity vector
257 | dyn.pl.streamline_plot(adata_from_dancer, color=["clusters"], basis = "cdr", show_legend="on data", show_arrowed_spines=True)
258 |
259 | -------
260 |
261 | .. image:: _static/dynamo_plt.png
262 | :width: 60%
263 | :alt: dynamo_plt
264 |
265 | Arguments
266 | ---------
267 | cellDancer_df: `pandas.DataFrame`
268 | The output dataframe of cellDancer.
269 |
270 | cellDancer --> dynamo
271 |
272 | cellDancer_df.splice --> adata.X
273 |
274 | cellDancer_df.loss --> adata.var.loss
275 |
276 | cellDancer_df.cellID --> adata.obs
277 |
278 | cellDancer_df.clusters --> adata.obs.clusters
279 |
280 | cellDancer_df.splice --> adata.layers['X_spliced']
281 |
282 | cellDancer_df.splice --> adata.layers['M_s']
283 |
284 | cellDancer_df.unsplice --> adata.layers['X_unspliced']
285 |
286 | cellDancer_df.unsplice --> adata.layers['M_u']
287 |
288 | cellDancer_df.alpha --> adata.layers['alpha']
289 |
290 | cellDancer_df.beta --> adata.layers['beta']
291 |
292 | cellDancer_df.gamma --> adata.layers['gamma']
293 |
294 | cellDancer_df.unsplice_predict - cellDancer_df.unsplice --> adata.layers['velocity_U']
295 |
296 | cellDancer_df.splice_predict - cellDancer_df.splice --> adata.layers['velocity_S']
297 |
298 | cellDancer_df[['embeddding1', 'embedding2']] --> adata.obsm['X_cdr']
299 |
300 | cellDancer_df[['velocity1', 'velocity2']] --> adata.obsm['velocity_cdr']
301 |
302 | Returns
303 | -------
304 | adata
305 | '''
306 |
307 | # Sort the cellDancer_df by cellID, so if it's not done already, your cellDancer_df could be changed.
308 | # This is because pd.DataFrame.pivot does this automatically and we don't want to mess up with
309 | # the obsm etc
310 | cellDancer_df = cellDancer_df.sort_values('cellID')
311 |
312 | spliced = cellDancer_df.pivot(index='cellID', columns='gene_name', values='splice')
313 | unspliced = cellDancer_df.pivot(index='cellID', columns='gene_name', values='unsplice')
314 |
315 | spliced_predict = cellDancer_df.pivot(index='cellID', columns='gene_name', values='splice_predict')
316 | unspliced_predict = cellDancer_df.pivot(index='cellID', columns='gene_name', values='unsplice_predict')
317 |
318 | alpha = cellDancer_df.pivot(index='cellID', columns='gene_name', values='alpha')
319 | beta = cellDancer_df.pivot(index='cellID', columns='gene_name', values='beta')
320 | gamma = cellDancer_df.pivot(index='cellID', columns='gene_name', values='gamma')
321 |
322 | one_gene = cellDancer_df['gene_name'].iloc[0]
323 | one_cell = cellDancer_df['cellID'].iloc[0]
324 |
325 | adata1 = ad.AnnData(spliced)
326 |
327 | # var
328 | adata1.var['highly_variable_genes'] = True
329 | #adata1.var['loss'] = (cellDancer_df[cellDancer_df['cellID'] == one_cell]['loss']).tolist()
330 | loss = cellDancer_df.pivot(index='gene_name', columns='cellID', values='loss').iloc[:, 0]
331 | loss.index = loss.index.astype(str)
332 | adata1.var['loss'] = loss
333 | # celldancer uses all genes (high variable) for dynamics and transition.
334 | adata1.var['use_for_dynamics'] = True
335 | adata1.var['use_for_transition'] = True
336 |
337 | # obs
338 | if 'clusters' in cellDancer_df:
339 | clusters = cellDancer_df.pivot(index='cellID', columns='gene_name', values='clusters').iloc[:, 0]
340 | clusters.index = clusters.index.astype(str)
341 | adata1.obs['clusters'] = clusters
342 | # layers
343 | adata1.layers['X_spliced'] = spliced
344 | adata1.layers['X_unspliced'] = unspliced
345 |
346 | adata1.layers['M_s'] = spliced
347 | adata1.layers['M_u'] = unspliced
348 | adata1.layers['velocity_S'] = spliced_predict - spliced
349 |
350 | adata1.layers['velocity_U'] = unspliced_predict - unspliced
351 | adata1.layers['alpha'] = alpha
352 | adata1.layers['beta'] = beta
353 | adata1.layers['gamma'] = gamma
354 |
355 | # obsm
356 | adata1.obsm['X_cdr'] = cellDancer_df[cellDancer_df['gene_name'] == one_gene][['embedding1', 'embedding2']].values
357 | # assuming no downsampling is used for the cell velocities in the cellDancer_df
358 | if 'velocity1' in cellDancer_df:
359 | adata1.obsm['velocity_cdr'] = cellDancer_df[cellDancer_df['gene_name'] == one_gene][['velocity1', 'velocity2']].values
360 |
361 | # obsp
362 | n_neighbors = 20
363 | nn = NearestNeighbors(n_neighbors=n_neighbors)
364 | nn.fit(adata1.obsm['X_cdr'])
365 | connect_knn = nn.kneighbors_graph(mode='connectivity')
366 | distance_knn = nn.kneighbors_graph(mode='distance')
367 | adata1.obsp['connectivities'] = connect_knn
368 | adata1.obsp['distances'] = distance_knn
369 |
370 | # uns
371 | dynamics_info = {'filter_gene_mode': 'final',
372 | 't': None,
373 | 'group': None,
374 | 'X_data': None,
375 | 'X_fit_data': None,
376 | 'asspt_mRNA': 'ss',
377 | 'experiment_type': 'conventional',
378 | 'normalized': True,
379 | 'model': 'static',
380 | 'est_method': 'ols',
381 | 'has_splicing': True,
382 | 'has_labeling': False,
383 | 'splicing_labeling': False,
384 | 'has_protein': False,
385 | 'use_smoothed': True,
386 | 'NTR_vel': False,
387 | 'log_unnormalized': False,
388 | 'fraction_for_deg': False}
389 |
390 | adata1.uns['dynamics']= dynamics_info
391 |
392 | return adata1
393 |
def export_velocity_to_dynamo(cellDancer_df,adata):
    '''
    Replace the velocities in a dynamo ``adata`` with the velocities predicted by cellDancer (``cellDancer_df``). The output can be directly used in the downstream analyses of dynamo.

    The vector field could be learned by dynamo based on the RNA velocity of cellDancer. Details are shown in the section 'Application of dynamo.'

    .. image:: _static/dynamo_vector_field_pancreas.png
      :width: 60%
      :alt: dynamo_vector_field_pancreas

    Arguments
    ---------
    cellDancer_df: `pandas.DataFrame`
        The output dataframe of cellDancer. It must contain the columns
        'cellID', 'gene_name', 'splice' and 'splice_predict'. It is not modified.

        cellDancer --> dynamo

        bools of the existence of cellDancer_df['gene_name'] in adata.var --> adata.var['use_for_dynamics']

        bools of the existence of cellDancer_df['gene_name'] in adata.var --> adata.var['use_for_transition']

        cellDancer_df.splice_predict - cellDancer_df.splice --> adata.layers['velocity_S']

    adata: `anndata._core.anndata.AnnData`
        The adata to be integrated with cellDancer velocity result. Its
        'velocity_S' layer and the two `var` columns above are overwritten in place.


    Returns
    -------
    adata
        A copy of the updated ``adata``.
    '''

    dancer_genes = cellDancer_df['gene_name'].drop_duplicates()

    # Build the long-format velocity table in a fresh frame so that the
    # caller's cellDancer_df does not silently gain a 'velocity_S' column.
    velocity_long = pd.DataFrame({
        'cellID': cellDancer_df['cellID'],
        'gene_name': cellDancer_df['gene_name'],
        'velocity_S': cellDancer_df['splice_predict'] - cellDancer_df['splice'],
    })
    pivoted = velocity_long.pivot(index='cellID', columns='gene_name', values='velocity_S')

    # Reindex onto adata's own cell (row) and gene (column) order: the matrix
    # then has exactly adata's shape and row i belongs to adata.obs.index[i].
    # Cells or genes that cellDancer did not estimate get a velocity of 0.
    aligned = pivoted.reindex(index=adata.obs.index, columns=adata.var.index).fillna(0)

    adata.layers['velocity_S'] = scipy.sparse.csr_matrix(aligned.values)

    estimated_by_celldancer = adata.var.index.isin(dancer_genes)
    adata.var['use_for_dynamics'] = estimated_by_celldancer
    adata.var['use_for_transition'] = estimated_by_celldancer
    return adata.copy()
439 |
def adata_to_raw(adata,save_path,gene_list=None):
    '''Export the 'Mu' and 'Ms' layers of an adata as a long-format table.

    Genes are processed one at a time; each gene's rows are appended to the
    csv file at ``save_path`` and the finished file is read back at the end.

    adata: anndata object providing the 'Mu' and 'Ms' layers
    save_path: path of the csv file that is (over)written
    gene_list (optional): genes to export; all of ``adata.var.index`` when None
    return: pandas dataframe with the columns gene_name, u0, s0, cellID
    '''
    from tqdm import tqdm

    def _single_gene_rows(data, para, gene):
        '''
        Build the rows (gene_name, u0, s0, cellID) of one gene.
        data: an anndata
        para: layer names of the unspliced and spliced values, e.g. ['Mu', 'Ms']
        gene: name of the gene to extract
        '''
        gene_mask = data.var.index.isin([gene])
        single = data[:, gene_mask].copy()
        unspliced_col = single.layers[para[0]][:,0].copy().astype(np.float32)
        spliced_col = single.layers[para[1]][:,0].copy().astype(np.float32)
        rows = pd.DataFrame({'gene_name':gene, 'u0':unspliced_col, 's0':spliced_col})
        # cell ids are taken from the enclosing adata, one per row
        rows['cellID']=adata.obs.index
        return rows

    genes = adata.var.index if gene_list is None else gene_list

    for position, gene in enumerate(tqdm(genes)):
        starts_file = position == 0
        # the first gene creates the file with a header, the others append
        _single_gene_rows(adata, para=['Mu', 'Ms'], gene=gene).to_csv(
            save_path,
            mode='w' if starts_file else 'a',
            header=starts_file,
            index=False)

    return pd.read_csv(save_path)
477 |
478 | def filter_by_neighbor_sample_parallel(load_raw_data,step_i=15,step_j=15,cutoff_s0_zero_ratio=0.2,cutoff_u0_zero_ratio=0.2,gene_amt_each_job=100):
479 | from joblib import Parallel, delayed
480 | import pandas as pd
481 | import numpy as np
482 |
483 | '''filter genes with'''
484 | # parallel filter gene_by_neighbor_sample_one_gene
def filter_gene_by_neighbor_sample_one_gene(gene,load_raw_data,step_i=None,step_j=None,cutoff_s0_zero_ratio=None,cutoff_u0_zero_ratio=None,gene_amt_each_job=None):
    """Return ``gene`` if its sampled cells are not dominated by zeros, else None.

    gene: value matched against ``load_raw_data['gene_list']``
    load_raw_data: dataframe with the columns 'gene_list', 'u0' and 's0'
    step_i, step_j: forwarded to ``sampling_neighbors``
    cutoff_s0_zero_ratio, cutoff_u0_zero_ratio: largest tolerated fraction of
        sampled cells whose s0 (resp. u0) equals 0
    gene_amt_each_job: unused here; kept so the signature stays unchanged
    """
    u_s = np.array(load_raw_data[load_raw_data['gene_list']==gene][["u0","s0"]])  # columns: u0, s0
    sampling_idx = sampling_neighbors(u_s[:,0:2], step_i=step_i,step_j=step_j,percentile=15)
    u_s_downsample = u_s[sampling_idx,0:2]

    u_s_df = pd.DataFrame({"s0":u_s_downsample[:, 1],'u0':u_s_downsample[:, 0]})
    # cells where both values are zero carry no information; drop them first
    u_s_df = u_s_df[~((u_s_df.s0==0) & (u_s_df.u0==0))]

    # fraction of the remaining sampled cells that are zero, per column
    # (NaN when nothing is left; NaN never exceeds a cutoff, so the gene is kept)
    zero_ratio = u_s_df.eq(0).sum() / len(u_s_df)

    # `not` instead of `~`: `~` is a bitwise operator and only acts as a
    # logical negation when the operand happens to be a numpy bool.
    too_sparse = bool(zero_ratio.s0 > cutoff_s0_zero_ratio or zero_ratio.u0 > cutoff_u0_zero_ratio)
    if not too_sparse:
        return gene
    return None
503 |
504 | def filter_gene_by_neighbor_sample(start_point,load_raw_data,gene_list=None,step_i=None,step_j=None,cutoff_s0_zero_ratio=None,cutoff_u0_zero_ratio=None,gene_amt_each_job=None):
505 | if start_point+gene_amt_each_job rawdata[['gene_list', 'u0','s0']]
544 | return(ratio2, cor2)
545 | ratio2 [['gene_choice','ratio']]
546 | ratio2 [['gene_choice','correlation']]
547 | '''
548 | def identify_in_grid(u, s, onegene_u0_s0):
549 | select_cell =onegene_u0_s0[(onegene_u0_s0[:,0]>u[0]) & (onegene_u0_s0[:,0]s[0]) & (onegene_u0_s0[:,1] cell,
117 | # col -> neighboring cells,
118 | # value -> index of cells,
119 | # the fist col is the index of row
120 |
121 | expr = pd.merge(pd.DataFrame(splice, columns=['splice']), pd.DataFrame(unsplice, columns=['unsplice']), left_index=True, right_index=True)
122 | if barcode is not None:
123 | expr.index = barcode
124 | unsplice = torch.tensor(expr['unsplice'])
125 | splice = torch.tensor(expr['splice'])
126 | indices = torch.tensor(indices)
127 | unsplice_predict, splice_predict, alphas, beta, gamma = self.module(unsplice, splice, alpha0, beta0, gamma0, dt)
128 |
def cosine_similarity(unsplice, splice, unsplice_predict, splice_predict, indices):
    """Cosine cost between the predicted velocity and the direction to each neighbor.

    Parameters:
        unsplice, splice: 1d tensors [n_cells]
        unsplice_predict, splice_predict: 1d tensors [n_cells]
        indices: 2d integer tensor [n_cells, 1 + n_neighbors]; column 0 is the
            cell itself, the remaining columns are its neighbors
    Return:
        (1 - best cosine) for every cell, and the index of the neighboring
        cell that gave this best cosine
    """
    # Velocity from (unsplice, splice) to (unsplice_predict, splice_predict)
    uv, sv = unsplice_predict-unsplice, splice_predict-splice
    # Velocity from (unsplice, splice) to its neighbors: [n_neighbors x n_cells]
    neighbors = indices.T[1:]
    unv, snv = unsplice[neighbors] - unsplice, splice[neighbors] - splice

    den = torch.sqrt(unv**2 + snv**2) * torch.sqrt(uv**2+sv**2)
    den[den==0] = -1  # sentinel: den is never negative otherwise
    # a zero-length vector on either side counts as perfectly aligned (cosine 1)
    cosine = torch.where(den!=-1, (unv*uv + snv*sv) / den, torch.tensor(1.))
    cosine_max, cosine_max_idx = torch.max(cosine, dim=0)

    # Pick indices[i, cosine_max_idx[i] + 1] for every cell i. gather reads
    # one entry per row instead of building an n_cells x n_cells matrix just
    # to take its diagonal.
    cell_idx = indices.gather(1, (cosine_max_idx + 1).unsqueeze(1)).squeeze(1)
    return 1 - cosine_max, cell_idx
144 |
145 |
146 |
def rmse(unsplice, splice, unsplice_predict, splice_predict, indices):
    """
    This loss is defined as the rmse of the predicted velocity vector (uv, sv) from the neighboring velocity vectors (unv, snv).

    This loss is used during revision.

    Parameters:
        unsplice, splice: 1d tensors [n_cells]
        unsplice_predict, splice_predict: 1d tensors [n_cells]
        indices: 2d integer tensor [n_cells, 1 + n_neighbors]; column 0 is the
            cell itself, the remaining columns are its neighbors
    Return:
        the smallest rmse of every cell, and the index of the neighboring cell
        that gave it
    """
    uv, sv = unsplice_predict-unsplice, splice_predict-splice
    neighbors = indices.T[1:]
    unv, snv = unsplice[neighbors] - unsplice, splice[neighbors] - splice

    # [n_neighbors x n_cells]; named so it does not shadow this function
    deviation = torch.sqrt(0.5*((uv-unv)**2 + (sv-snv)**2))

    rmse_min, rmse_min_idx = torch.min(deviation, dim=0)

    # indices[i, rmse_min_idx[i] + 1] for every cell i, read directly with
    # gather rather than through the diagonal of an n_cells x n_cells matrix
    cell_idx = indices.gather(1, (rmse_min_idx + 1).unsqueeze(1)).squeeze(1)
    return rmse_min, cell_idx
167 |
168 |
def mix_loss(unsplice, splice, unsplice_predict, splice_predict, indices, mix_ratio = 0.5):
    """
    This loss is defined as the mix of a cosine loss and a magnitude loss.

    This loss is used during revision.

    Parameters:

        unsplice: 1d tensor [n_cells]
        splice: 1d tensor [n_cells]
        unsplice_predict: 1d tensor [n_cells]
        splice_predict: 1d tensor [n_cells]
        indices: 2d integer tensor [n_cells, 1 + n_neighbors]; column 0 is the
            cell itself, the remaining columns are its neighbors
        mix_ratio: weight of the cosine term; the magnitude term is weighted
            by (1 - mix_ratio)
    Return:
        the smallest mixed cost of every cell, and the index of the
        neighboring cell that gave it
    """
    uv, sv = unsplice_predict-unsplice, splice_predict-splice
    neighbors = indices.T[1:]
    unv, snv = unsplice[neighbors] - unsplice, splice[neighbors] - splice
    mag_v = torch.sqrt(uv**2 + sv**2)
    mag_nv = torch.sqrt(unv**2 + snv**2)

    # squared difference of the vector lengths, normalized across all
    # neighboring cells of a cell using a softmax function
    mag = torch.nn.Softmax(dim=0)((mag_nv - mag_v)**2)

    den = mag_v * mag_nv
    den[den==0] = -1  # sentinel: den is never negative otherwise

    # cosine: [n_neighbors x n_cells]; zero-length vectors count as cosine 1
    cosine = torch.where(den!=-1, (unv*uv + snv*sv) / den, torch.tensor(1.))

    total = mix_ratio*(1-cosine) + (1 - mix_ratio)* mag
    total_min, total_min_idx = torch.min(total, dim=0)

    # indices[i, total_min_idx[i] + 1] for every cell i, read directly with
    # gather rather than through the diagonal of an n_cells x n_cells matrix
    cell_idx = indices.gather(1, (total_min_idx + 1).unsqueeze(1)).squeeze(1)
    return total_min, cell_idx
207 |
208 |
209 | def trace_cost(unsplice, splice, unsplice_predict, splice_predict, idx, version):
210 |
211 | # This cost has been deprecated.
212 |
213 | uv, sv = unsplice_predict-unsplice, splice_predict-splice
214 | tan = torch.where(sv!=1000000, uv/sv, torch.tensor(0.00001))
215 | atan_theta = torch.atan(tan) + torch.pi/2
216 | atan_theta2=atan_theta[idx]
217 | atan_theta3 = atan_theta[idx[idx]]
218 | if version=="v1":
219 | cost = atan_theta2/atan_theta+atan_theta3/atan_theta2
220 | elif version=="v2":
221 | cost=torch.where(atan_theta=torch.tensor(0.0), torch.tensor(0.0), torch.tensor(-corrcoef))
233 | return(cost)
234 |
235 | if trace_cost_ratio == 0 and corrcoef_cost_ratio == 0:
236 |
237 | if loss_func == 'cosine':
238 | cost1 = cosine_similarity(unsplice, splice, unsplice_predict, splice_predict, indices)[0]
239 | cost_fin = torch.mean(cost1)
240 |
241 | if loss_func == 'rmse':
242 | cost1 = rmse(unsplice, splice, unsplice_predict, splice_predict, indices)[0]
243 | cost_fin = torch.mean(cost1)
244 |
245 | elif 'mix' in loss_func:
246 | mix_ratio = loss_func[1]
247 | cost1 = mix_loss(unsplice, splice, unsplice_predict, splice_predict, indices, mix_ratio=mix_ratio)[0]
248 | cost_fin = torch.mean(cost1)
249 |
250 | else: # trace cost and corrcoef cost have been deprecated.
251 | # cosine cost
252 | cost1,idx = cosine_similarity(unsplice, splice, unsplice_predict, splice_predict, indices)
253 | cost1_normalize=(cost1-torch.min(cost1))/torch.max(cost1)
254 | cost1_mean = torch.mean(cost1_normalize)
255 |
256 | # trace cost
257 | if trace_cost_ratio>0:
258 | cost2 = trace_cost(unsplice, splice, unsplice_predict, splice_predict, idx,"v2")
259 | cost2_normalize=(cost2-torch.min(cost2))/torch.max(cost2)
260 | cost2_mean = torch.mean(cost2_normalize)
261 | cost2_relu=(max((cost2_mean-cost2_cutoff), 0))
262 |
263 | # corrcoef cost
264 | if corrcoef_cost_ratio>0:
265 | corrcoef_cost=corrcoef_cost(alphas, unsplice, beta, splice)
266 |
267 | # sum all cost
268 | cosin_cost_ratio=1-trace_cost_ratio-corrcoef_cost_ratio
269 | cost_fin = cosin_cost_ratio*cost1_mean + \
270 | trace_cost_ratio*cost2_relu + \
271 | corrcoef_cost_ratio*corrcoef_cost
272 |
273 | return cost_fin, unsplice_predict, splice_predict, alphas, beta, gamma
274 |
275 |
def summary_para_validation(self, cost_mean):
    """Wrap the validation cost in a one-row dataframe (index 0, column 'cost')."""
    record = {'cost': cost_mean}
    return pd.DataFrame(record, index=[0])
279 |
def summary_para(self, unsplice, splice, unsplice_predict, splice_predict, alphas, beta, gamma, cost):
    """Collect the per-cell inputs, predictions, rates and cost in one dataframe.

    The result has the columns unsplice, splice, unsplice_predict,
    splice_predict, alpha, beta, gamma and cost, in this order.
    """
    observed_unsplice = pd.DataFrame(unsplice, columns=['unsplice'])
    observed_splice = pd.DataFrame(splice, columns=['splice'])
    cellDancer_df = pd.merge(observed_unsplice, observed_splice, left_index=True, right_index=True)

    appended_columns = (
        ('unsplice_predict', unsplice_predict),
        ('splice_predict', splice_predict),
        ('alpha', alphas),
        ('beta', beta),
        ('gamma', gamma),
        ('cost', cost),
    )
    for column_name, column_values in appended_columns:
        cellDancer_df[column_name] = column_values
    return cellDancer_df
289 |
290 | class ltModule(pl.LightningModule):
291 | '''
292 | train network using "DNN_module"
293 | '''
def __init__(self,
            backbone=None,
            initial_zoom=2,
            initial_strech=1,
            learning_rate=None,
            dt=None,
            loss_func = None,
            cost2_cutoff=0,
            optimizer='Adam',
            trace_cost_ratio=0,
            corrcoef_cost_ratio=0,
            cost_type='smooth',
            average_cost_window_size=10,
            smooth_weight=0.9):
    """Store the training configuration on the module.

    backbone: object whose ``velocity_calculate`` computes the cost during
        training and whose ``module`` is saved/loaded by ``save``/``load``
    initial_zoom: factor applied to the unspliced maximum to form ``alpha0``
        in ``training_step``
    initial_strech: factor applied to (unspliced max / spliced max) to form
        ``gamma0`` in ``training_step``
    learning_rate: learning rate handed to the optimizer
    dt, loss_func, cost2_cutoff, trace_cost_ratio, corrcoef_cost_ratio:
        forwarded unchanged to ``backbone.velocity_calculate``
    optimizer: "Adam" or "SGD", see ``configure_optimizers``
    cost_type: how the per-step cost is aggregated; 'average' is one of the
        handled values in ``training_step``
        (NOTE(review): the full set of accepted values is not visible here)
    average_cost_window_size: window length used by the windowed cost types
    smooth_weight: presumably the weight of the smoothed cost -- TODO confirm
    """
    super().__init__()
    self.backbone = backbone
    # result holders
    self.validation_loss_df = pd.DataFrame()
    self.test_cellDancer_df = None
    self.test_loss_df = None
    self.initial_zoom = initial_zoom
    self.initial_strech = initial_strech
    self.learning_rate=learning_rate
    self.dt=dt
    self.loss_func=loss_func
    self.cost2_cutoff=cost2_cutoff
    self.optimizer=optimizer
    self.trace_cost_ratio=trace_cost_ratio
    self.corrcoef_cost_ratio=corrcoef_cost_ratio
    # records the constructor arguments as Lightning hyperparameters
    self.save_hyperparameters()
    self.get_loss=1000  # NOTE(review): looks like a large initial loss value -- confirm
    self.cost_type=cost_type
    self.average_cost_window_size=average_cost_window_size # will be used only when cost_type is 'average' or 'median'
    self.cost_window=[]
    self.smooth_weight=smooth_weight
328 |
def save(self, model_path):
    """Save the backbone's network to ``model_path``."""
    network = self.backbone.module
    network.save(model_path)
331 |
def load(self, model_path):
    """Load the backbone's network from ``model_path``."""
    network = self.backbone.module
    network.load(model_path)
334 |
def configure_optimizers(self):
    """Create the optimizer named by ``self.optimizer``.

    Returns:
        ``torch.optim.Adam`` for "Adam" or ``torch.optim.SGD`` for "SGD",
        built over ``self.parameters()`` with ``self.learning_rate``.
    Raises:
        ValueError: if ``self.optimizer`` is any other value.
    """
    if self.optimizer=="Adam":
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate, betas=(0.9, 0.999), eps=10**(-8), weight_decay=0.004, amsgrad=False)
    if self.optimizer=="SGD":
        return torch.optim.SGD(self.parameters(), lr=self.learning_rate, momentum=0.8)
    # fail with a clear message instead of an unbound local variable
    raise ValueError(f"Unsupported optimizer {self.optimizer!r}; expected 'Adam' or 'SGD'.")
341 |
342 | def training_step(self, batch, batch_idx):
343 | '''
344 | traning network
345 | batch: [] output returned from realDataset.__getitem__
346 |
347 | '''
348 |
349 | unsplices, splices, gene_names, unsplicemaxs, splicemaxs, embedding1s, embedding2s = batch
350 | unsplice, splice, unsplicemax, splicemax, embedding1, embedding2 = unsplices[0], splices[0], unsplicemaxs[0], splicemaxs[0], embedding1s[0], embedding2s[0]
351 |
352 | umax = unsplicemax
353 | smax = splicemax
354 | alpha0 = np.float32(umax*self.initial_zoom)
355 | beta0 = np.float32(1.0)
356 | gamma0 = np.float32(umax/smax*self.initial_strech)
357 |
358 | cost, unsplice_predict, splice_predict, alphas, beta, gamma = self.backbone.velocity_calculate( \
359 | unsplice, splice, alpha0, beta0, gamma0, self.dt, embedding1, embedding2, \
360 | loss_func = self.loss_func, \
361 | cost2_cutoff = self.cost2_cutoff, \
362 | trace_cost_ratio = self.trace_cost_ratio, \
363 | corrcoef_cost_ratio=self.corrcoef_cost_ratio)
364 |
365 | if self.cost_type=='average': # keep the window len <= check_val_every_n_epoch
366 | if len(self.cost_window)0):
485 | data=data_fitting.sample(frac=self.permutation_ratio) # select cells to train using random methods
486 | else:
487 | print('sampling ratio is wrong!')
488 | elif self.datastatus=="predict_dataset":
489 | data_pred=self.data_predict[self.data_predict.gene_name==gene_name] # unsplice & splice for cells for one gene
490 | data=data_pred
491 |
492 | data_pred=self.data_predict[self.data_predict.gene_name==gene_name] # unsplice & splice for cells for one gene
493 |
494 | unsplicemax = np.float32(max(data_pred["unsplice"]))
495 | splicemax = np.float32(max(data_pred["splice"]))
496 | unsplice = np.array(data.unsplice.copy().astype(np.float32))
497 | splice = np.array(data.splice.copy().astype(np.float32))
498 | if self.norm_u_s:
499 | unsplice=unsplice/unsplicemax
500 | splice=splice/splicemax
501 |
502 | # add embedding
503 | embedding1 = np.array(data.embedding1.copy().astype(np.float32))
504 | embedding2 = np.array(data.embedding2.copy().astype(np.float32))
505 |
506 | return unsplice, splice, gene_name, unsplicemax, splicemax, embedding1, embedding2
507 |
508 |
509 |
class feedData(pl.LightningDataModule):
    '''
    Data module holding one dataset for fitting and one for prediction.
    '''
    def __init__(self, data_fit=None, data_predict=None,permutation_ratio=1,norm_u_s=True,norm_cell_distribution=False):
        super().__init__()

        # arguments shared by both datasets
        shared_kwargs = dict(data_fit=data_fit,
                             data_predict=data_predict,
                             permutation_ratio=permutation_ratio,
                             norm_u_s=norm_u_s)

        # norm_cell_distribution is only handed to the fit dataset
        self.fit_dataset = getItem(datastatus="fit_dataset",
                                   norm_cell_distribution=norm_cell_distribution,
                                   **shared_kwargs)
        self.predict_dataset = getItem(datastatus="predict_dataset", **shared_kwargs)

    def subset(self, indices):
        '''Return a shallow copy whose two datasets are restricted to ``indices``.'''
        import copy
        restricted = copy.copy(self)
        restricted.fit_dataset = Subset(self.fit_dataset, indices)
        restricted.predict_dataset = Subset(self.predict_dataset, indices)
        return restricted

    def train_dataloader(self):
        '''Loader over the fit dataset.'''
        return DataLoader(self.fit_dataset, num_workers=0)

    def val_dataloader(self):
        '''Loader over the fit dataset (validation reuses the fitting data).'''
        return DataLoader(self.fit_dataset, num_workers=0)

    def test_dataloader(self):
        '''Loader over the predict dataset.'''
        return DataLoader(self.predict_dataset, num_workers=0)
534 |
def _train_thread(datamodule,
                  data_indices,
                  save_path=None,
                  max_epoches=None,
                  check_val_every_n_epoch=None,
                  norm_u_s=None,
                  patience=None,
                  learning_rate=None,
                  dt=None,
                  loss_func=None,
                  n_neighbors=None,
                  ini_model=None,
                  model_save_path=None):
    """Train the network for one gene, predict, and write the results to csv.

    Arguments
    ---------
    datamodule: data module offering ``subset(indices)``
    data_indices: dataset indices of the gene to process
    save_path: output folder; results are written to its 'TEMP' sub-folder
    max_epoches: maximum number of epochs; training is skipped when it is 0
    check_val_every_n_epoch: validation interval; ``None`` disables early stopping
    norm_u_s: whether the data were normalized by their maxima, in which case
        the results are scaled back before they are written
    patience: patience of the early-stopping callback
    learning_rate, dt, loss_func: forwarded to ``ltModule``
    n_neighbors: forwarded to ``DNN_module``
    ini_model: 'circle' or 'branch' to force that packaged initial model;
        any other value selects the model with ``select_initial_net``
    model_save_path: if not None, the trained network is saved there

    Returns
    -------
    ``None`` on success, or the gene name when processing the gene failed.
    A failure that happens before the gene name is known is re-raised.
    """
    this_gene_name = None
    try:
        seed = 0
        torch.manual_seed(seed)
        random.seed(seed)
        np.random.seed(seed)

        # initiate network (DNN_layer) and loss function (DNN_module)
        backbone = DNN_module(DNN_layer(100, 100), n_neighbors=n_neighbors)
        model = ltModule(backbone=backbone, dt=dt, learning_rate=learning_rate, loss_func=loss_func)

        selected_data = datamodule.subset(data_indices)

        unsplice, splice, this_gene_name, unsplicemax, splicemax, embedding1, embedding2=selected_data.fit_dataset.__getitem__(0)

        data_df=pd.DataFrame({'unsplice':unsplice,'splice':splice,'embedding1':embedding1,'embedding2':embedding2})
        data_df['gene_name']=this_gene_name
        try:
            # Note
            # here n_neighbors in the downsampling_embedding function is for selecting initial model.
            # which is different from the n_neighbors in _train_thread for velocity calculation.
            _, sampling_ixs_select_model, _ = downsampling_embedding(data_df, # for select model
                                para='neighbors',
                                step=(20,20),
                                n_neighbors=30,
                                target_amount=None,
                                projection_neighbor_choice='embedding')
        except Exception:
            # best effort: fall back to using every cell
            sampling_ixs_select_model=list(data_df.index)

        gene_downsampling=downsampling(data_df=data_df, gene_list=[this_gene_name], downsampling_ixs=sampling_ixs_select_model)

        # One if/elif/else chain, so that an explicit 'circle' request is not
        # overwritten by the automatically selected model.
        # resource_filename returns the path without leaving a file handle open.
        if ini_model=='circle':
            model_path=pkg_resources.resource_filename(__name__,os.path.join('model', 'circle.pt'))
        elif ini_model=='branch':
            model_path=pkg_resources.resource_filename(__name__,os.path.join('model', 'branch.pt'))
        else:
            model_path=select_initial_net(this_gene_name, gene_downsampling, data_df)
        model.load(model_path)

        early_stop_callback = EarlyStopping(monitor="loss", min_delta=0.0, patience=patience,mode='min')

        if check_val_every_n_epoch is None:
            # not use early stop
            trainer = pl.Trainer(
                max_epochs=max_epoches,
                progress_bar_refresh_rate=0,
                reload_dataloaders_every_n_epochs=1,
                logger = False,
                enable_checkpointing = False,
                enable_model_summary=False,
                )
        else:
            # use early stop
            trainer = pl.Trainer(
                max_epochs=max_epoches,
                progress_bar_refresh_rate=0,
                reload_dataloaders_every_n_epochs=1,
                logger = False,
                enable_checkpointing = False,
                check_val_every_n_epoch = check_val_every_n_epoch,
                enable_model_summary=False,
                callbacks=[early_stop_callback]
                )

        if max_epoches > 0:
            trainer.fit(model, selected_data)   # train network

        trainer.test(model, selected_data,verbose=False)    # predict

        # the network does not change after this point, so one save is enough
        if model_save_path is not None:
            model.save(model_save_path)

        loss_df = model.validation_loss_df
        cellDancer_df = model.test_cellDancer_df

        if norm_u_s:
            # scale the normalized values back with the maxima of this gene
            cellDancer_df.unsplice=cellDancer_df.unsplice*unsplicemax
            cellDancer_df.splice=cellDancer_df.splice*splicemax
            cellDancer_df.unsplice_predict=cellDancer_df.unsplice_predict*unsplicemax
            cellDancer_df.splice_predict=cellDancer_df.splice_predict*splicemax
            cellDancer_df.beta=cellDancer_df.beta*unsplicemax
            cellDancer_df.gamma=cellDancer_df.gamma*splicemax

        header_loss_df=['gene_name','epoch','loss']
        header_cellDancer_df=['cellIndex','gene_name','unsplice','splice','unsplice_predict','splice_predict','alpha','beta','gamma','loss']

        loss_df.to_csv(os.path.join(save_path,'TEMP', ('loss'+'_'+this_gene_name+'.csv')),header=header_loss_df,index=False)
        cellDancer_df.to_csv(os.path.join(save_path,'TEMP', ('cellDancer_estimation_'+this_gene_name+'.csv')),header=header_cellDancer_df,index=False)

        return None

    except Exception:
        # Report the failed gene to the caller. Without a gene name there is
        # nothing meaningful to report, so surface the original error instead.
        if this_gene_name is None:
            raise
        return this_gene_name
645 |
646 |
647 |
648 |
649 |
def build_datamodule(cell_type_u_s,
                   speed_up,
                   norm_u_s,
                   permutation_ratio,
                   norm_cell_distribution=False,
                   gene_list=None,
                   downsample_method='neighbors',
                   n_neighbors_downsample=30,
                   step=(200,200),
                   downsample_target_amount=None):

    '''
    Set the fitting data, the data to be predicted, and the sampling ratio used when fitting.

    cell_type_u_s: dataframe with at least the columns gene_name, unsplice,
        splice, embedding1, embedding2 and cellID
    speed_up: if True, the fitting data are restricted to the cells selected
        by ``downsampling_embedding``; prediction always uses all cells
    norm_u_s, permutation_ratio, norm_cell_distribution: forwarded to ``feedData``
    gene_list (optional): genes to keep; all genes when None
    downsample_method, n_neighbors_downsample, step, downsample_target_amount:
        forwarded to ``downsampling_embedding`` (``step`` as a 2-tuple)

    return: the ``feedData`` data module
    '''
    columns=['gene_name', 'unsplice','splice','embedding1','embedding2','cellID']
    if gene_list is None:
        data_df=cell_type_u_s[columns]
    else:
        data_df=cell_type_u_s[columns][cell_type_u_s.gene_name.isin(gene_list)]

    data_fit=data_df
    if speed_up:
        step_i, step_j = step[0], step[1]
        _, sampling_ixs, _ = downsampling_embedding(data_df,
                            para=downsample_method,
                            target_amount=downsample_target_amount,
                            step=(step_i,step_j),
                            n_neighbors=n_neighbors_downsample,
                            projection_neighbor_choice='embedding')

        # The sampled positions are translated to cell ids through the rows of
        # one gene. Without a gene_list, use the first gene present in the
        # data instead of failing on list(None).
        # NOTE(review): assumes every gene lists the cells in the same order -- confirm
        if gene_list is None:
            reference_gene=data_df['gene_name'].iloc[0]
        else:
            reference_gene=list(gene_list)[0]
        data_df_one_gene=cell_type_u_s[cell_type_u_s['gene_name']==reference_gene]
        downsample_cellid=data_df_one_gene.cellID.iloc[sampling_ixs]
        data_fit=data_df[data_df.cellID.isin(downsample_cellid)]

    feed_data = feedData(data_fit = data_fit, data_predict=data_df, permutation_ratio=permutation_ratio,norm_u_s=norm_u_s,norm_cell_distribution=norm_cell_distribution)

    return feed_data
688 |
689 |
690 | def velocity(
691 | cell_type_u_s,
692 | gene_list=None,
693 | max_epoches=200,
694 | check_val_every_n_epoch=10,
695 | patience=3,
696 | learning_rate=0.001,
697 | dt=0.5,
698 | n_neighbors=30,
699 | permutation_ratio=0.125,
700 | speed_up=True,
701 | norm_u_s=True,
702 | norm_cell_distribution=True,
703 | loss_func='cosine',
704 | n_jobs=-1,
705 | save_path=None,
706 | ):
707 |
708 | """Velocity estimation for each cell.
709 |
710 | Arguments
711 | ---------
712 | cell_type_u_s: `pandas.DataFrame`
713 | Dataframe that contains the unspliced abundance, spliced abundance, embedding space, and cell type information. Columns=['gene_name', 'unsplice', 'splice' ,'cellID' ,'clusters' ,'embedding1' ,'embedding2']
714 | gene_list: optional, `list` (default: None)
715 | Gene list for velocity estimation. `None` if to estimate the velocity of all genes.
716 | max_epoches: optional, `int` (default: 200)
717 | Stop to update the network once this number of epochs is reached.
718 | check_val_every_n_epoch: optional, `int` (default: 10)
719 | Check loss every n train epochs.
720 | patience: optional, `int` (default: 3)
721 | Number of checks with no improvement after which training will be stopped.
722 | dt: optional, `float` (default: 0.5)
723 | Step size
724 | permutation_ratio: optional, `float` (default: 0.125)
725 | Sampling ratio of cells in each epoch when training each gene.
726 | speed_up: optional, `bool` (default: True)
727 | `True` if speed up by downsampling cells. `False` if to use all cells to train the model.
728 | norm_u_s: optional, `bool` (default: True)
729 | `True` if normalize unsplice (and splice) reads by dividing max value of unspliced (and spliced) reads.
730 | norm_cell_distribution: optional, `bool` (default: True)
731 | `True` if the bias of cell distribution is to be removed on embedding space (many cells share the same position of unspliced (and spliced) reads).
732 | loss_func: optional, `str` (default: `cosine`)
733 | Currently support `'cosine'`, `'rmse'`, and (`'mix'`, mix_ratio).
734 | n_jobs: optional, `int` (default: -1)
735 | The maximum number of concurrently running jobs.
736 | save_path: optional, `str` (default: 200)
737 | Path to save the result of velocity estimation.
738 | Returns
739 | -------
740 | loss_df: `pandas.DataFrame`
741 | The record of loss.
742 | cellDancer_df: `pandas.DataFrame`
743 | The result of velocity estimation.
744 | """
745 |
746 | # set output dir
747 | datestring = datetime.datetime.now().strftime("%Y-%m-%d %H-%M-%S");
748 | folder_name='cellDancer_velocity_'+datestring
749 |
750 | if save_path is None:
751 | save_path=os.getcwd()
752 |
753 | try:shutil.rmtree(os.path.join(save_path,folder_name))
754 | except:os.mkdir(os.path.join(save_path,folder_name))
755 | save_path=os.path.join(save_path,folder_name)
756 | print('Using '+save_path+' as the output path.')
757 |
758 | try:shutil.rmtree(os.path.join(save_path,'TEMP'))
759 | except:os.mkdir(os.path.join(save_path,'TEMP'))
760 |
761 | # set gene_list if not given
762 | if gene_list is None:
763 | gene_list=list(cell_type_u_s.gene_name.drop_duplicates())
764 | else:
765 | cell_type_u_s=cell_type_u_s[cell_type_u_s.gene_name.isin(gene_list)]
766 | all_gene_name_cell_type_u_s=list(cell_type_u_s.gene_name.drop_duplicates())
767 | gene_not_in_cell_type_u_s= list(set(gene_list).difference(set(all_gene_name_cell_type_u_s)))
768 | gene_list=list(list(set(all_gene_name_cell_type_u_s).intersection(set(gene_list))))
769 | if len(gene_not_in_cell_type_u_s)>0: print(gene_not_in_cell_type_u_s," not in the data cell_type_u_s")
770 |
771 | cell_type_u_s=cell_type_u_s.reset_index(drop=True)
772 | # buring
773 | gene_list_buring=[list(cell_type_u_s.gene_name.drop_duplicates())[0]]
774 | datamodule=build_datamodule(cell_type_u_s,speed_up,norm_u_s,permutation_ratio,norm_cell_distribution,gene_list=gene_list_buring)
775 |
776 | result = Parallel(n_jobs=n_jobs, backend="loky")(
777 | delayed(_train_thread)(
778 | datamodule = datamodule,
779 | data_indices=[data_index],
780 | max_epoches=max_epoches,
781 | check_val_every_n_epoch=check_val_every_n_epoch,
782 | patience=patience,
783 | learning_rate=learning_rate,
784 | n_neighbors=n_neighbors,
785 | dt=dt,
786 | loss_func=loss_func,
787 | save_path=save_path,
788 | norm_u_s=norm_u_s)
789 | for data_index in range(0,len(gene_list_buring)))
790 |
791 | # clean directory
792 | shutil.rmtree(os.path.join(save_path,'TEMP'))
793 | os.mkdir(os.path.join(save_path,'TEMP'))
794 |
795 | data_len = len(gene_list)
796 |
797 | id_ranges=list()
798 | if n_jobs==-1:
799 | interval=os.cpu_count()
800 | else:
801 | interval=n_jobs
802 | for i in range(0,data_len,interval):
803 | idx_start=i
804 | if data_lens_max_90per) & (gene_u_s_full.unsplice>u_max_90per), 'position'] = 'cells_corner'
906 |
907 | if gene_u_s_full.loc[gene_u_s_full['position']=='cells_corner'].shape[0]>0.001*gene_u_s_full.shape[0]:
908 | # model in circle shape
909 | model_path=pkg_resources.resource_stream(__name__,os.path.join('model', 'circle.pt')).name
910 | else:
911 | # model in seperated branch shape
912 | model_path=pkg_resources.resource_stream(__name__,os.path.join('model', 'branch.pt')).name
913 | return(model_path)
--------------------------------------------------------------------------------