├── .github
    ├── scripts
    │   └── release.py
    └── workflows
    │   ├── publish.yml
    │   └── release.yml
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── _static
    └── training_progress.png
├── dist
    ├── celldancer-1.1.4-py3-none-any.whl
    ├── celldancer-1.1.4.tar.gz
    ├── celldancer-1.1.7-py3-none-any.whl
    └── celldancer-1.1.7.tar.gz
├── notebooks
    ├── case_study_gastrulation.ipynb
    ├── case_study_hgforebrian.ipynb
    ├── case_study_neuro.ipynb
    ├── case_study_pancreas.ipynb
    ├── case_study_pancreas_dynamo.ipynb
    ├── case_study_rpe1.ipynb
    └── celldancer_prototype_model.ipynb
├── readme.rst
├── readme_pypi.rst
├── requirements.txt
├── setup.py
└── src
    └── celldancer
        ├── .Rapp.history
        ├── __init__.py
        ├── cdplt.py
        ├── compute_cell_velocity.py
        ├── diffusion.py
        ├── embedding_kinetic_para.py
        ├── model
            ├── branch.pt
            └── circle.pt
        ├── plotting
            ├── .Rapp.history
            ├── __init__.py
            ├── cell.py
            ├── colormap.py
            ├── gene.py
            └── graph.py
        ├── pseudo_time.py
        ├── sampling.py
        ├── simulation.py
        ├── utilities.py
        └── velocity_estimation.py


/.github/scripts/release.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | import json
 3 | import subprocess
 4 | 
 5 | 
 6 | def get_last_version() -> str:
 7 |     """Return the version number of the last release."""
 8 |     json_string = (
 9 |         subprocess.run(
10 |             ["gh", "release", "view", "--json", "tagName"],
11 |             check=True,
12 |             stdout=subprocess.PIPE,
13 |             stderr=subprocess.PIPE,
14 |         )
15 |         .stdout.decode("utf8")
16 |         .strip()
17 |     )
18 | 
19 |     return json.loads(json_string)["tagName"]
20 | 
21 | 
def bump_patch_number(version_number: str) -> str:
    """Return `version_number` with its last (patch) component increased by one.

    `version_number` must consist of exactly three dot-separated parts and the
    last part must parse as an integer; otherwise a ``ValueError`` is raised.
    """
    major, minor, patch = version_number.split(".")
    next_patch = int(patch) + 1
    return ".".join((major, minor, str(next_patch)))
26 | 
27 | 
def create_new_patch_release():
    """Create a new patch release on GitHub.

    The new version is the latest release tag with its patch number bumped.
    When ``gh`` reports that no release exists (its stderr starts with
    ``HTTP 404:``), the first version ``0.0.1`` is used instead. Any other
    ``gh`` failure is re-raised unchanged.
    """
    try:
        previous_version = get_last_version()
    except subprocess.CalledProcessError as err:
        no_release_yet = err.stderr.decode("utf8").startswith("HTTP 404:")
        if not no_release_yet:
            raise
        # The project doesn't have any releases yet.
        new_version_number = "0.0.1"
    else:
        new_version_number = bump_patch_number(previous_version)

    release_command = [
        "gh", "release", "create", "--generate-notes", new_version_number,
    ]
    subprocess.run(release_command, check=True)
45 | 
46 | 
47 | if __name__ == "__main__":
48 |     create_new_patch_release()
49 | 


--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
 1 | name: Publish to PyPI.org
 2 | on:
 3 |   release:
 4 |     types: [published]
 5 | jobs:
 6 |   pypi:
 7 |     runs-on: ubuntu-latest
 8 |     steps:
 9 |       - name: Checkout
10 |         uses: actions/checkout@v3
11 |         with:
12 |           fetch-depth: 0
13 |       - run: python3 -m pip install --upgrade build && python3 -m build
14 |       - name: Publish package
15 |         uses: pypa/gh-action-pypi-publish@release/v1
16 |         with:
17 |           password: ${{ secrets.PYPI_API_TOKEN_CELLDANCER }}
18 | 


--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
 1 | name: Create a new patch release
 2 | on: workflow_dispatch
 3 | jobs:
 4 |   github:
 5 |     runs-on: ubuntu-latest
 6 |     steps:
 7 |       - name: Checkout
 8 |         uses: actions/checkout@v3
 9 |       - name: Create new patch release
10 |         run: .github/scripts/release.py
11 |         env:
12 |           GITHUB_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
13 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # C extensions
 6 | *.so
 7 | 
 8 | # Distribution / packaging
 9 | .Python
10 | .eggs/
11 | *.egg-info/
12 | 
13 | # PyInstaller
14 | *.manifest
15 | *.spec
16 | build/
17 | 
18 | # Installer logs
19 | pip-log.txt
20 | pip-delete-this-directory.txt
21 | 
22 | # Unit test / coverage reports
23 | .cache
24 | 
25 | # Sphinx documentation
26 | docs/_build/
27 | 
28 | # Emacs, vim
29 | .#*
30 | *.swp
31 | 
32 | # Notebook Checkpoints
33 | .ipynb_checkpoints/
34 | 
35 | 
36 | # Mac specific
37 | .DS_Store
38 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2022, Wang Lab
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | 1. Redistributions of source code must retain the above copyright notice, this
10 |    list of conditions and the following disclaimer.
11 | 
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 |    this list of conditions and the following disclaimer in the documentation
14 |    and/or other materials provided with the distribution.
15 | 
16 | 3. Neither the name of the copyright holder nor the names of its
17 |    contributors may be used to endorse or promote products derived from
18 |    this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include readme.rst
2 | include readme_pypi.rst
3 | include LICENSE


--------------------------------------------------------------------------------
/_static/training_progress.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuangyuWangLab2021/cellDancer/fed4c0db1bf7a7314000128b0311c37301fca1d9/_static/training_progress.png


--------------------------------------------------------------------------------
/dist/celldancer-1.1.4-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuangyuWangLab2021/cellDancer/fed4c0db1bf7a7314000128b0311c37301fca1d9/dist/celldancer-1.1.4-py3-none-any.whl


--------------------------------------------------------------------------------
/dist/celldancer-1.1.4.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuangyuWangLab2021/cellDancer/fed4c0db1bf7a7314000128b0311c37301fca1d9/dist/celldancer-1.1.4.tar.gz


--------------------------------------------------------------------------------
/dist/celldancer-1.1.7-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuangyuWangLab2021/cellDancer/fed4c0db1bf7a7314000128b0311c37301fca1d9/dist/celldancer-1.1.7-py3-none-any.whl


--------------------------------------------------------------------------------
/dist/celldancer-1.1.7.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuangyuWangLab2021/cellDancer/fed4c0db1bf7a7314000128b0311c37301fca1d9/dist/celldancer-1.1.7.tar.gz


--------------------------------------------------------------------------------
/readme.rst:
--------------------------------------------------------------------------------
 1 | cellDancer - Estimating Cell-dependent RNA Velocity
 2 | ===========================================================================================
 3 | 
 4 | **cellDancer** is a modularized, parallelized, and scalable tool based on a deep learning framework for the RNA velocity analysis of scRNA-seq. Our website of tutorials is available at `cellDancer Website <https://guangyuwanglab2021.github.io/cellDancer_website/>`_.
 5 | 
 6 | 
 7 | .. image:: _static/training_progress.png
 8 |   :width: 100%
 9 |   :alt: cell_type_u_s_sample_df
10 | 
11 | Cite
12 | 
13 | Shengyu Li#, Pengzhi Zhang#, Weiqing Chen, Lingqun Ye, Kristopher W. Brannan, Nhat-Tu Le, Jun-ichi Abe, John P. Cooke, Guangyu Wang. A relay velocity model infers cell-dependent RNA velocity. Nature Biotechnology (2023) https://doi.org/10.1038/s41587-023-01728-5
14 | 
15 | cellDancer's key applications
16 | ========================================================
17 | * Enable accurate inference of dynamic cell state transitions in heterogeneous cell populations.
18 | * Estimate cell-specific transcription (α), splicing (β) and degradation (γ) rates for each gene and reveal RNA turnover strategies.
19 | * Improve downstream analysis such as vector field predictions.
20 | 
21 | To be done
22 | ========================================================
23 | - [ ] Update an anndata-compatible version.
24 | 
25 | What's new
26 | ========================================================
27 | cellDancer is updated to v1.1.7
28 | 
29 | * Added progress bar for adata_to_df_with_embed() and adata_to_raw().
30 | * Added try except to catch genes with low quality in velocity().
31 | 
32 | Installation
33 | ========================================================
34 | cellDancer requires Python version >= 3.7.6 to run.
35 | 
36 | To run cellDancer locally, we recommend creating a `conda <https://docs.conda.io/en/latest>`_ environment: ``conda create -n cellDancer python==3.7.6``. Then activate the new environment with ``conda activate cellDancer``. The cellDancer package can be installed from PyPI with ``pip install celldancer``.
37 | 
38 |   Python 3.7 is not compatible with M1 Macs. On an M1 Mac, use ``conda create -n cellDancer python==3.9.16`` instead; this version is compatible with M1 Macs and has been well tested with cellDancer.
39 | 
40 | To install the latest version from GitHub, run:
41 | 
42 | ``pip install git+https://github.com/GuangyuWangLab2021/cellDancer.git``
43 | 
44 | To install cellDancer from source code, run:
45 | 
46 | ``pip install 'your_path/Source Code/cellDancer'``.
47 | 
48 |   For M1 Mac users: if you encounter a problem while installing bezier, please refer to the following link: https://bezier.readthedocs.io/en/2021.2.12/#installing
49 | 
50 | If any other dependency could not be installed with ``pip install celldancer``, try ``pip install --no-deps celldancer``. Then install the dependencies by ``pip install -r requirements.txt`` or manually install each package in requirements.txt.
51 | 
52 | To be compatible with Dynamo (optional), after first ``pip install celldancer`` and then ``pip install dynamo-release``, installing Dynamo will update numpy to 1.24.0, and we can downgrade numpy back to 1.20.0 with ``pip install numpy==1.20.0`` to let them be compatible.
53 | 
54 | Frequently asked questions
55 | ========================================================
56 | Q: How should I prepare the input for my own data?
57 | 
58 | A: The `Data Preparation <https://guangyuwanglab2021.github.io/cellDancer_website/data_preprocessing.html>`_ page introduces the details of how to prepare and pre-process your own data.
59 | 
60 | Check more frequently asked questions at `FAQ <https://guangyuwanglab2021.github.io/cellDancer_website/FAQ.html>`_ on our website. If you have any other question related to your specific condition, you are welcome to post it on our GitHub `issue <https://github.com/GuangyuWangLab2021/cellDancer/issues>`_ page or email sli5@houstonmethodist.org
61 | 
62 | Support
63 | ========================================================
64 | Bug reports and suggestions are welcome on our GitHub issue page!
65 | 


--------------------------------------------------------------------------------
/readme_pypi.rst:
--------------------------------------------------------------------------------
 1 | cellDancer - Estimating Cell-dependent RNA Velocity
 2 | ===========================================================================================
 3 | 
 4 | **cellDancer** is a modularized, parallelized, and scalable tool based on a deep learning framework for the RNA velocity analysis of scRNA-seq. Our website of tutorials is available at `cellDancer Website <https://guangyuwanglab2021.github.io/cellDancer_website/>`_.
 5 | 
 6 | 
 7 | cellDancer's key applications
 8 | ========================================================
 9 | * Estimate cell-specific RNA velocity for each gene.
10 | * Derive cell fates in embedding space.
11 | * Estimate pseudotime for each cell in embedding space.
12 | 
13 | What's new
14 | ========================================================
15 | cellDancer is updated to v1.1.7
16 | 
17 | * Added progress bar for adata_to_df_with_embed() and adata_to_raw().
18 | * Added try except to catch genes with low quality in velocity().
19 | 
20 | Installation
21 | ========================================================
22 | cellDancer requires Python version >= 3.7.6 to run.
23 | 
24 | To run cellDancer locally, create a `conda <https://docs.conda.io/en/latest>`_ or `Anaconda <https://www.anaconda.com/>`_ environment with ``conda create -n cellDancer python==3.7.6``, and activate the new environment with ``conda activate cellDancer``. cellDancer can then be installed with ``pip install celldancer``.
25 | 
26 | To install cellDancer from source code, run:
27 | ``pip install 'your_path/Source Code/cellDancer'``.
28 | 
29 | For M1 Mac users: if you encounter a problem while installing bezier, please refer to the following link:
30 | https://bezier.readthedocs.io/en/2021.2.12/#installing
31 | 
32 | If any other dependency could not be installed with ``pip install celldancer``, try ``pip install --no-deps celldancer``. Then install the dependencies by ``pip install -r requirements.txt``.
33 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | pytorch-lightning==1.5.2
 2 | torch==1.10.0
 3 | pandas==1.3.4
 4 | numpy==1.20.3
 5 | anndata==0.8.0
 6 | tqdm==4.62.3
 7 | scikit-learn==1.0.1
 8 | scipy==1.7.2
 9 | joblib==1.1.0
10 | scikit-image==0.19.2
11 | statsmodels==0.13.1
12 | matplotlib==3.5.3
13 | seaborn==0.11.2
14 | datashader==0.14.0
15 | bezier==2021.2.12
16 | umap-learn==0.5.2
17 | jupyterlab
18 | setuptools==59.5.0
19 | setuptools-scm==6.3.2


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import setuptools
 2 | 
 3 | project_urls = {
 4 |   'cellDancer': 'https://github.com/GuangyuWangLab2021/cellDancer',
 5 |   'Documentation':'https://guangyuwanglab2021.github.io/cellDancer_website/'
 6 | }
 7 | 
 8 | with open("readme_pypi.rst", "rt", encoding="utf8") as f:
 9 |     long_description = f.read()
10 | 
11 | setuptools.setup(
12 |     name="celldancer",
13 |     version="1.1.7",
14 |     author="Wang Lab",
15 |     author_email="gwang2@houstonmethodist.org",
16 |     description="Study RNA velocity through neural network.",
17 |     long_description=long_description,
18 |     long_description_content_type="text/x-rst; charset=UTF-8",
19 |     classifiers=[
20 |         "Programming Language :: Python :: 3",
21 |         "License :: OSI Approved :: MIT License",
22 |         "Operating System :: OS Independent",
23 |     ],
24 |     project_urls = project_urls,
25 |     package_dir={"": "src"},
26 |     packages=setuptools.find_packages(where="src"),
27 |     package_data={'': ['model/*.pt']},
28 |     include_package_data=True,
29 |     python_requires=">=3.7.6",
30 |     install_requires = ['pytorch-lightning==1.5.2',
31 |                         'torch==1.10.0',
32 |                         'pandas==1.3.4',
33 |                         'numpy==1.20.3',
34 |                         'anndata==0.8.0',
35 |                         'tqdm==4.62.3',
36 |                         'scikit-learn==1.0.1',
37 |                         'scipy==1.7.2',
38 |                         'joblib==1.1.0',
39 |                         'scikit-image==0.19.2',
40 |                         'statsmodels==0.13.1',
41 |                         'matplotlib==3.5.3',
42 |                         'seaborn==0.11.2',
43 |                         'datashader==0.14.0',
44 |                         'bezier==2021.2.12',
45 |                         'umap-learn==0.5.2',
46 |                         'jupyterlab',
47 |                         'setuptools==59.5.0',
48 |                         'setuptools-scm==6.3.2'
49 |                         ]
50 | )
51 | 
52 | 


--------------------------------------------------------------------------------
/src/celldancer/.Rapp.history:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuangyuWangLab2021/cellDancer/fed4c0db1bf7a7314000128b0311c37301fca1d9/src/celldancer/.Rapp.history


--------------------------------------------------------------------------------
/src/celldancer/__init__.py:
--------------------------------------------------------------------------------
 1 | from . import *
 2 | from .velocity_estimation import velocity
 3 | from .pseudo_time import pseudo_time
 4 | from .compute_cell_velocity import compute_cell_velocity
 5 | from .embedding_kinetic_para import embedding_kinetic_para
 6 | from .utilities import adata_to_df_with_embed
 7 | from .utilities import to_dynamo
 8 | from .utilities import export_velocity_to_dynamo
 9 | from .simulation import simulate
10 | from . import cdplt
11 | 
# Names exported by ``from celldancer import *``. Each name is listed once
# (the original list repeated "simulation").
__all__ = [
    "cdplt",
    "velocity_estimation",
    "pseudo_time",
    "diffusion",
    "compute_cell_velocity",
    "simulation",
    "embedding_kinetic_para",
    "sampling",
    "utilities",
]
24 | 
25 | 
26 | 
27 | 


--------------------------------------------------------------------------------
/src/celldancer/cdplt.py:
--------------------------------------------------------------------------------
1 | from celldancer.plotting import *
2 | 


--------------------------------------------------------------------------------
/src/celldancer/compute_cell_velocity.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import random
  4 | import pandas as pd
  5 | import numpy as np
  6 | from sklearn.neighbors import NearestNeighbors
  7 | import matplotlib.pyplot as plt
  8 | 
  9 | 
 10 | if __name__ == "__main__":
 11 |     sys.path.append('.')
 12 |     from sampling import *
 13 | else:
 14 |     try:
 15 |         from .sampling import *
 16 |     except ImportError:
 17 |         from sampling import *
 18 | 
 19 | 
 20 | def compute_cell_velocity(
 21 |     cellDancer_df,
 22 |     gene_list=None,
 23 |     speed_up=(60,60),
 24 |     expression_scale=None,
 25 |     projection_neighbor_size=200,
 26 |     projection_neighbor_choice='embedding'):
 27 | 
 28 |     """Project the RNA velocity onto the embedding space.
 29 |         
 30 |     Arguments
 31 |     ---------
 32 |     cellDancer_df: `pandas.DataFrame`
 33 |         Dataframe of velocity estimation results. Columns=['cellIndex', 'gene_name', unsplice', 'splice', 'unsplice_predict', 'splice_predict', 'alpha', 'beta', 'gamma', 'loss', 'cellID, 'clusters', 'embedding1', 'embedding2']
 34 |     gene_list: optional, `list` (default: None)
 35 |         Genes selected to calculate the cell velocity. `None` if all genes in the cellDancer_df are to be used.
 36 |     speed_up: optional, `tuple` (default: (60,60))
 37 |         Speed up by giving the sampling grid to downsample cells. 
 38 |         `None` if all cells are used to compute cell velocity. 
 39 |     expression_scale: optional, `str` (default: None)
 40 |         `None` if no expression scale is to be used. 
 41 |         `'power10'` if the 10th power is been used to scale spliced and unspliced reads.
 42 |     projection_neighbor_size: optional, `int` (default: '200')
 43 |         The number of neighboring cells used for the transition probability matrix for one cell.
 44 |     projection_neighbor_choice: optional, `str` (default: 'embedding')
 45 |         `'embedding'` if using the embedding space to obtain the neighbors. 
 46 |         `'gene'` if using the spliced reads of all genes to obtain the neighbors.
 47 | 
 48 |     Returns
 49 |     -------
 50 |     cellDancer_df: `pandas.DataFrame`
 51 |         The updated cellDancer_df with additional columns ['velocity1', 'velocity2'].
 52 |     """
 53 | 
 54 |     def velocity_correlation(cell_matrix, velocity_matrix):
 55 |         """Calculate the correlation between the predict velocity (velocity_matrix[:,i])
 56 |         and the difference between a cell and every other (cell_matrix - cell_matrix[:, i])
 57 | 
 58 |         Arguments
 59 |         ---------
 60 |         cell_matrix: np.ndarray (ngenes, ncells)
 61 |             gene expression matrix
 62 |         velocity_matrix: np.ndarray (ngenes, ncells)
 63 |         Return
 64 |         ---------
 65 |         c_matrix: np.ndarray (ncells, ncells)
 66 |         """
 67 |         c_matrix = np.zeros((cell_matrix.shape[1], velocity_matrix.shape[1]))
 68 |         for i in range(cell_matrix.shape[1]):
 69 |             c_matrix[i, :] = corr_coeff(cell_matrix, velocity_matrix, i)[0, :]
 70 |         np.fill_diagonal(c_matrix, 0)
 71 |         return c_matrix
 72 | 
 73 | 
 74 |     def velocity_projection(cell_matrix, velocity_matrix, embedding, knn_embedding):
 75 |         '''
 76 |         cell_matrix: np.ndarray (ngenes, ncells)
 77 |             gene expression matrix
 78 |         velocity_matrix: np.ndarray (ngenes, ncells)
 79 |         '''
 80 |         # cell_matrix = np_splice[:,sampling_ixs]
 81 |         # velocity_matrix = np_dMatrix[:,sampling_ixs]
 82 |         sigma_corr = 0.05
 83 |         cell_matrix[np.isnan(cell_matrix)] = 0
 84 |         velocity_matrix[np.isnan(velocity_matrix)] = 0
 85 |         corrcoef = velocity_correlation(cell_matrix, velocity_matrix)
 86 |         probability_matrix = np.exp(corrcoef / sigma_corr)*knn_embedding.A
 87 |         probability_matrix /= probability_matrix.sum(1)[:, None]
 88 |         unitary_vectors = embedding.T[:, None, :] - embedding.T[:, :, None]
 89 |         with np.errstate(divide='ignore', invalid='ignore'):
 90 |             unitary_vectors /= np.linalg.norm(unitary_vectors, ord=2, axis=0)
 91 |             np.fill_diagonal(unitary_vectors[0, ...], 0)
 92 |             np.fill_diagonal(unitary_vectors[1, ...], 0)
 93 |         velocity_embedding = (probability_matrix * unitary_vectors).sum(2)
 94 |         velocity_embedding -= (knn_embedding.A * unitary_vectors).sum(2) / \
 95 |             knn_embedding.sum(1).A.T  # embedding_knn.A *
 96 |         velocity_embedding = velocity_embedding.T
 97 |         return velocity_embedding
 98 |     
 99 |     # remove invalid prediction
100 |     is_NaN = cellDancer_df[['alpha','beta']].isnull()
101 |     row_has_NaN = is_NaN. any(axis=1)
102 |     cellDancer_df = cellDancer_df[~row_has_NaN].reset_index(drop=True)
103 |     
104 |     if 'velocity1' in cellDancer_df.columns:
105 |         del cellDancer_df['velocity1']
106 |     if 'velocity2' in cellDancer_df.columns:
107 |         del cellDancer_df['velocity2']
108 |     
109 |     if gene_list is None:
110 |         gene_list=cellDancer_df.gene_name.drop_duplicates()
111 | 
112 | 
113 |     # This creates a new dataframe
114 |     cellDancer_df_input = cellDancer_df[cellDancer_df.gene_name.isin(gene_list)].reset_index(drop=True)
115 |     np_splice_all, np_dMatrix_all= data_reshape(cellDancer_df_input)
116 |     # print("(genes, cells): ", end="")
117 |     # print(np_splice_all.shape)
118 |     n_genes, n_cells = np_splice_all.shape
119 | 
120 |     # This creates a new dataframe
121 |     data_df = cellDancer_df_input.loc[:, 
122 |             ['gene_name', 'unsplice', 'splice', 'cellID','embedding1', 'embedding2']]
123 |     # random.seed(10)
124 |     embedding_downsampling, sampling_ixs, knn_embedding = downsampling_embedding(data_df,
125 |                                                                                  para='neighbors',
126 |                                                                                  target_amount=0,
127 |                                                                                  step=speed_up,
128 |                                                                                  n_neighbors=projection_neighbor_size,
129 |                                                                                  projection_neighbor_choice=projection_neighbor_choice,
130 |                                                                                  expression_scale=expression_scale,
131 |                                                                                  pca_n_components=None,
132 |                                                                                  umap_n=None,
133 |                                                                                  umap_n_components=None)
134 |     
135 | 
136 |     # projection_neighbor_choice only provides neighborlist, use embedding(from raw data) to compute cell velocity
137 |     embedding = cellDancer_df_input[cellDancer_df_input.gene_name == 
138 |             gene_list[0]][['embedding1', 'embedding2']]
139 |     embedding = embedding.to_numpy()
140 |     velocity_embedding = velocity_projection(
141 |             np_splice_all[:, sampling_ixs], 
142 |             np_dMatrix_all[:, sampling_ixs], 
143 |             embedding[sampling_ixs, :], 
144 |             knn_embedding)
145 | 
146 |     if set(['velocity1','velocity2']).issubset(cellDancer_df.columns):
147 |         print("Caution! Overwriting the \'velocity\' columns.") 
148 |         cellDancer_df.drop(['velocity1','velocity2'], axis=1, inplace=True)
149 | 
150 |     sampling_ixs_all_genes = cellDancer_df_input[cellDancer_df_input.cellIndex.isin(sampling_ixs)].index
151 |     cellDancer_df_input.loc[sampling_ixs_all_genes,'velocity1'] = np.tile(velocity_embedding[:,0], n_genes)
152 |     cellDancer_df_input.loc[sampling_ixs_all_genes,'velocity2'] = np.tile(velocity_embedding[:,1], n_genes)
153 |     # print("After downsampling, there are ", len(sampling_ixs), "cells.")
154 |     return(cellDancer_df_input)
155 | 
def corr_coeff(ematrix, vmatrix, i):
    '''
    Calculate the Pearson correlation (across genes) between the predicted
    velocity of cell i (vmatrix[:, i]) and the displacement from cell i to
    every cell (ematrix - ematrix[:, i]).

    Arguments
    ---------
    ematrix: np.ndarray (ngenes, ncells)
        cell (expression) matrix
    vmatrix: np.ndarray (ngenes, ncells)
        velocity matrix
    i: int
        index of the reference cell

    Return
    ---------
    np.ndarray (1, ncells)
        Entry j is the correlation for the displacement towards cell j.
        Entries whose correlation is undefined (zero variance, e.g. the
        displacement of cell i to itself) are 0.
    '''
    displacement = ematrix.T - ematrix.T[i, :]
    velocity = vmatrix.T[i, :][None, :]
    displacement_m = displacement - displacement.mean(1)[:, None]
    velocity_m = velocity - velocity.mean(1)[:, None]

    # Sum of squares across rows
    displacement_ss = (displacement_m**2).sum(1)
    velocity_ss = (velocity_m**2).sum(1)
    cor = np.dot(displacement_m, velocity_m.T)
    N = np.sqrt(np.dot(displacement_ss[:, None], velocity_ss[None]))
    # `where=` without `out=` leaves the skipped entries uninitialized, so a
    # zero-filled output buffer is supplied for the N == 0 positions.
    safe = np.zeros(cor.shape, dtype=np.result_type(cor.dtype, N.dtype))
    cor = np.divide(cor, N, out=safe, where=N != 0)
    return cor.T
177 | 
178 | 
def data_reshape(cellDancer_df): # pengzhi version
    '''
    Reshape the long-format detail table into (ngenes, ncells) matrices.

    The rows are expected to be grouped gene by gene, every gene having the
    same number of cells as the first one. A helper column 'index' (position
    of the cell inside its gene block) is written into `cellDancer_df` in
    place.

    Returns the tuple (expression, velocity): the spliced counts, and the
    difference `splice_predict - splice` transformed as
    sign(d) * sqrt(|d| + 1). Rows follow the gene order produced by the pivot.
    '''
    pseudocount = 1
    genes = cellDancer_df['gene_name'].drop_duplicates().to_list()
    cells_per_gene = cellDancer_df[cellDancer_df['gene_name'] == genes[0]].shape[0]
    cellDancer_df['index'] = np.tile(range(cells_per_gene), len(genes))

    def gene_by_cell(value_column):
        # One row per gene, one column per within-gene cell position.
        return cellDancer_df.pivot(
            index='gene_name', columns='index', values=value_column)

    observed = gene_by_cell('splice')
    predicted = gene_by_cell('splice_predict')
    delta = np.array(predicted - observed)
    velocity = np.sign(delta) * np.sqrt(np.abs(delta) + pseudocount)
    return (np.array(observed), velocity)
200 | 
201 | 


--------------------------------------------------------------------------------
/src/celldancer/diffusion.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding: utf-8
  3 | 
  4 | import os
  5 | import sys
  6 | import random
  7 | import multiprocessing as mp
  8 | 
  9 | import numpy as np
 10 | from sklearn import preprocessing
 11 | 
 12 | import matplotlib as mpl
 13 | import matplotlib.pyplot as plt
 14 | 
 15 | def embedding_normalization(cell_embedding, embedding=None, mode="minmax", NORM_ALL_CELLS=False):
 16 |     '''
 17 |     Normalize by the maximum absolute value.
 18 |     
 19 |     Parameters
 20 |     ----------
 21 |     embedding: 2D numpy array (n_cells, 2)
 22 |     mode: string
 23 |           'maxabs', "minmax"
 24 |     maxabs is meant for sparse data and/or centered at 0. 
 25 |     Note in this program (ML velocity), it is pretty safe to do maxabs normalization
 26 |     since the data are free of extreme outliers.
 27 |      
 28 |     '''
 29 |     if mode in ['max', 'maximum', 'maxabs']:
 30 |         transformer = preprocessing.MaxAbsScaler().fit(cell_embedding)
 31 |     elif mode in ['minmax']:
 32 |         transformer = preprocessing.MinMaxScaler().fit(cell_embedding)
 33 |     em = transformer.transform(cell_embedding)
 34 |     if NORM_ALL_CELLS:
 35 |         try:
 36 |             em_all = transformer.transform(embedding)
 37 |         except ValueError:
 38 |             print("ERROR! Missing embedding for all cells.")
 39 |             raise
 40 |         return em, em_all
 41 |     else:
 42 |         return em
 43 |     
 44 | def velocity_normalization(downsampled_vel, all_vel=None, mode="max", NORM_ALL_CELLS=False):
 45 |     '''
 46 |     Normalize by the maximum absolute value in the downsampled_vel.
 47 |     
 48 |     Parameters
 49 |     ----------
 50 |     downsampled_vel: 2D numpy array (n_cells, 2)
 51 |     mode: 'maxabs'
 52 |     
 53 |     maxabs is meant for sparse data and/or centered at 0. 
 54 |     
 55 |     Note in this program, it is pretty safe to do maxabs normalization
 56 |     since the data are free of extreme outliers.
 57 |      
 58 |     '''
 59 |     # add v_prime to vel of each cell without changing their directions.
 60 |     v_mag = np.linalg.norm(downsampled_vel, axis=1)
 61 |     v_prime = 0.1*np.std(v_mag)
 62 | 
 63 |     # for 0 velocity cell, nothing changed.
 64 |     v_prime = np.divide(v_prime, v_mag, where=v_mag > 0)
 65 |     downsampled_vel = downsampled_vel*(v_prime + 1)[:,None]
 66 | 
 67 |     if mode in ['max', 'maximum', 'maxabs']:
 68 |         transformer = preprocessing.MaxAbsScaler().fit(downsampled_vel)
 69 |     em = transformer.transform(downsampled_vel)
 70 |     if NORM_ALL_CELLS:
 71 |         em_all = transformer.transform(all_vel)
 72 |         return em, em_all
 73 |     else:
 74 |         return em
 75 |     
 76 | 
 77 | def discretize(coordinate, xmin, xmax, n_grids, capping=False):
 78 |     '''
 79 |     '''
 80 |     grid_size = np.array(xmax) - np.array(xmin)
 81 |     grid_size = grid_size / np.array(n_grids)
 82 | 
 83 |     grid_idx = np.int64(np.floor((coordinate-xmin)/grid_size))
 84 |     
 85 |     if capping:
 86 |         grid_idx = np.where(grid_idx > n_grids, n_grids, grid_idx)
 87 |         grid_idx = np.where(grid_idx <0, 0, grid_idx)
 88 |     
 89 |     grid_coor = xmin + grid_size * (grid_idx+0.5)
 90 |     return grid_idx, grid_coor 
 91 | 
 92 | 
 93 | def generate_grid(
 94 |         cell_embedding, 
 95 |         embedding, 
 96 |         velocity_embedding, 
 97 |         abr_umap = None, 
 98 |         n_grids = None):
 99 | 
100 |     xmin = np.min(cell_embedding, axis=0)
101 |     xmax = np.max(cell_embedding, axis=0)
102 |     n_grids = np.array(n_grids, dtype=int)
103 | 
104 |     cell_grid_idx, cell_grid_coor = discretize(cell_embedding, 
105 |             xmin=xmin, 
106 |             xmax=xmax, 
107 |             n_grids=n_grids)
108 | 
109 |     # The actual n_grids need to allow a leeway +1 in each dimension.
110 |     mesh = np.zeros(np.append(n_grids+1,len(n_grids)))
111 | 
112 |     cnt = np.zeros(n_grids+1)
113 |     for index in range(cell_grid_idx.shape[0]):
114 |         grid_index = cell_grid_idx[index]
115 |         if np.any(grid_index > n_grids) or np.any(grid_index < 0):
116 |             continue
117 |         grid_index = toTuple(grid_index)
118 |         mesh[grid_index] += velocity_embedding[index]
119 |         cnt[grid_index] += 1
120 |     cnt = cnt[:,:,None]
121 |     mesh = np.divide(mesh, cnt, out=np.zeros_like(mesh), where=cnt>0.1)
122 |     
123 |     # the all cell embedding is used to generate mass
124 |     mass = np.zeros(n_grids+1)
125 |     all_cells_grid_idx, all_cells_grid_coor = \
126 |             discretize(embedding, xmin=xmin, xmax=xmax, n_grids=n_grids)
127 |     n_cells = all_cells_grid_idx.shape[0]
128 | 
129 |     for index in range(n_cells):
130 |         all_cells_grid_index = all_cells_grid_idx[index]
131 |         
132 |         # mass outside the grid is not needed.
133 |         if np.any(all_cells_grid_index > n_grids) or np.any(all_cells_grid_index < 0):
134 |             continue
135 |         all_cells_grid_index = toTuple(all_cells_grid_index)
136 |         mass[all_cells_grid_index] += 1
137 | 
138 |     # the all cell embedding is used to generate grid_umap
139 |     if abr_umap is not None:
140 |         grid_umap = np.full_like(mesh, np.NAN)
141 |         n_umap_dims = all_cells_grid_idx.shape[-1]
142 |         for index in range(n_cells):
143 |             all_cells_grid_index = all_cells_grid_idx[index]
144 |             if np.any(all_cells_grid_index > n_grids) or np.any(all_cells_grid_index < 0):
145 |                 all_cells_grid_index = toTuple(all_cells_grid_index)
146 |                 grid_umap[all_cells_grid_index] = np.full((1,n_umap_dims), np.NAN)
147 |                 pass
148 |             all_cells_grid_index = toTuple(all_cells_grid_index)
149 |             if np.any(np.isnan(grid_umap[all_cells_grid_index])):
150 |                 grid_umap[all_cells_grid_index] = np.full((1,n_umap_dims), 0)
151 |             else:
152 |                 grid_umap[all_cells_grid_index] += abr_umap[index,:]
153 | 
154 |         # divide by 0 does not happen
155 |         # because where-ever mass is 0, grid_umap is nan. nan/0 -> nan
156 |         grid_umap = np.divide(grid_umap, mass[:,:,None])
157 | 
158 |     else:
159 |         grid_umap = None
160 | 
161 |     return mesh, mass, grid_umap, \
162 |             cell_grid_idx, cell_grid_coor, all_cells_grid_idx, all_cells_grid_coor
163 | 
164 | 
def toTuple(arr):
    '''
    Recursively convert an iterable into (nested) tuples.

    Parameters
    ----------
    arr: numpy ndarray or list
        Any (nested) iterable. Non-iterable values and strings are treated
        as leaves and returned unchanged.

    Return
    ------
    A tuple (of nested tuples)

    '''

    # A string iterates into one-character strings which iterate into
    # themselves, so descending into it would never terminate.
    if isinstance(arr, str):
        return arr

    try:
        return tuple(toTuple(i) for i in arr)
    except TypeError:
        # not iterable: a scalar leaf
        return arr
181 | 
182 | 
def compute_path_divider_matrix(fmat, cutoff=0.3):
    '''
    Build a boolean matrix telling which pairs of grids may be connected.

    Parameters
    ----------
    fmat: numpy ndarray (*n_grids, n_features)
        One feature vector per grid; the last axis holds the features.
        Any number of leading grid axes is supported.
    cutoff: float
        Two grids are connectable when the Euclidean distance between their
        feature vectors is strictly below this value.

    Return
    ------
    Boolean numpy ndarray of shape (*n_grids, *n_grids). Entry
    [grid_a + grid_b] is True when the path between the two grids is allowed.
    Pairs involving a nan feature compare as False (nan < cutoff is False).

    Note
    ----
    All pairwise differences are materialised at once, so memory grows with
    the square of the number of grids.
    '''

    print("The cutoff for banning a path is ", cutoff)
    ngrids = fmat.shape[:-1]
    # np.prod handles any number of grid axes (np.multiply(*ngrids) only
    # worked for exactly two).
    flat_length = int(np.prod(ngrids))
    flat_features = fmat.reshape(flat_length, fmat.shape[-1])

    # pairwise[i, j] = || features[j] - features[i] ||
    pairwise = np.linalg.norm(flat_features - flat_features[:,None], axis=-1)

    path_divider_matrix = pairwise.reshape(ngrids+ngrids) < cutoff
    return path_divider_matrix
198 | 
199 | 
def plot_velocity(embedding, velocity_embedding):
    '''
    Show a quiver plot of the velocity of every cell.

    Parameters
    ----------
    embedding: numpy ndarray
        Columns 0 and 1 are used as the arrow origins.
    velocity_embedding: numpy ndarray
        Columns 0 and 1 are used as the arrow components.
    '''
    fig, ax = plt.subplots(figsize=(6,6))
    origin_x, origin_y = embedding[:, 0], embedding[:, 1]
    arrow_u, arrow_v = velocity_embedding[:,0], velocity_embedding[:,1]
    # plt.quiver draws on the current axes, i.e. the ones just created.
    plt.quiver(origin_x, origin_y, arrow_u, arrow_v, color='Blue')
    plt.show()
206 | 
def plot_mesh_velocity(mesh, grid_mass):
    '''
    Show the grid velocities as arrows on top of the grid mass image.

    Parameters
    ----------
    mesh: numpy ndarray
        Indexed as mesh[i, j]; components 0 and 1 of each entry are the
        arrow components drawn at grid index (i, j).
    grid_mass: numpy ndarray
        Drawn (transposed) as a grey-scale background image.
    '''
    # Enumerate grid indices row by row: (0,0), (0,1), ..., (1,0), ...
    grid_indices = [(i, j)
                    for i in range(mesh.shape[0])
                    for j in range(mesh.shape[1])]
    x = [i for i, _ in grid_indices]
    y = [j for _, j in grid_indices]
    vx = [mesh[i,j][0] for i, j in grid_indices]
    vy = [mesh[i,j][1] for i, j in grid_indices]

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.quiver(x,y,vx,vy,color='red',scale = 10)
    plt.imshow(grid_mass.T, interpolation=None, origin='lower',cmap="Greys")
    plt.show()
222 | 
def velocity_add_random(velocity, theta):
    '''
    Rotate the velocity by a random angle.

    The angle is a single draw from the normal distribution N(0, theta)
    (theta is the standard deviation), taken from numpy's global random
    state. A pure rotation is applied, so the magnitude of the velocity is
    kept the same.

    Parameters
    ----------
    velocity
        velocity of the grid (2 components)
    theta
        standard deviation of the random rotation angle, in radians

    WARNING
        at a rare chance, the rotation angle (magnitude) could be much larger than theta.

    Return
    ------
    The rotated velocity

    '''
    angle = np.random.normal(0, theta, 1)[0]

    c = np.cos(angle)
    s = np.sin(angle)

    # Rotation matrix, applied to the velocity as a row vector.
    rotation = np.array([[c, s],
                         [-s, c]])
    return np.dot(velocity, rotation)
256 | 
def velocity_rotation(velocity, theta):
    '''
    Rotate the velocity by the angle theta.

    The velocity is treated as a row vector and multiplied by
    [[cos, sin], [-sin, cos]], which maps (1, 0) to (cos(theta), sin(theta)):
    a positive theta turns the vector counter-clockwise, a negative theta
    clockwise.

    Parameters
    ----------
    velocity
        velocity of the grid (2 components)
    theta
        rotation angle in radians

    Return
    ------
    The rotated velocity

    '''
    c, s = np.cos(theta), np.sin(theta)

    # Rotation matrix
    rotation = np.array([[c, s],
                         [-s, c]])
    rotated = np.dot(velocity, rotation)
    return rotated
280 | 
281 | 
def diffusion_off_grid_wallbound(
        cell_embedding, 
        vel, 
        init, 
        grid_mass,
        dt = 0.001, 
        t_total = 10000, 
        eps = 1e-5,
        random_seed = None,
        pdm = None):
    
    '''
    Simulate the diffusion of a cell in the velocity field (off grid), the
    cell's velocity will turn 30 degrees
    if it hits the boundary the next timestep.

    The diffusion is stopped by any of the criteria:
    - reach t_total
    - the magnitude of the velocity is less than eps.
    - the cell goes to places where the cell mass <= MAX_IGNORED_MASS even after turning.
    - the cell is out of the simulation box
    - the next step is banned by `pdm` (the state can no longer change)

    Parameters
    ----------
    
    cell_embedding: numpy ndarray (n_cells x n_dims)
        embedding coordinate for all the cells (downsampled);
        its min/max define the simulation box

    vel: numpy ndarray (n_grids+1 x n_grids+1 x n_dims)
        pre-assigned velocity of each grid

    init: numpy ndarray (n_dims)
        The initial position of the trajectory

    grid_mass: numpy ndarray, indexed like the first two axes of vel
        mass of cells.

    dt: float 
        Step size of each integration time step

    t_total: int
        Total number of time steps

    eps 
        Criterion to stop a trajectory before t_total (v_net < eps)

    random_seed: int or None
        Passed to np.random.seed at the start of the simulation. Note that
        this reseeds numpy's global random state.

    pdm: numpy boolean ndarray or None
        Path divider matrix, indexed by (starting grid index + candidate grid
        index). A step is only taken where the entry is True.

    
    Return
    ------
        a numpy ndarray of coordinates in the trajectory, shape:
        (real_n_time_steps, n_dims)
    '''
    
    np.random.seed(seed = random_seed)
    THETA = np.pi/6
    
    XMIN = np.min(cell_embedding, axis=0)
    XMAX = np.max(cell_embedding, axis=0)
    N_GRIDS=(vel.shape[0]-1,vel.shape[1]-1)

    MAX_IGNORED_MASS = 2

    def inside(grid_index, arr):
        # Explicit bounds test on the first two axes. Relying on IndexError
        # is not enough: negative indices do not raise, they wrap around to
        # the opposite side of the grid.
        return (0 <= grid_index[0] < arr.shape[0]
                and 0 <= grid_index[1] < arr.shape[1])
    
    def no_cells_around(xcur, vcur):
        # True when the next position is outside the box or on a grid whose
        # mass is at most MAX_IGNORED_MASS.
        xnxt = xcur + vcur*dt
        xnxt_d, dummy = discretize(xnxt, xmin=XMIN, xmax=XMAX, n_grids=N_GRIDS)
        if not inside(xnxt_d, grid_mass):
            return True
        return grid_mass[xnxt_d[0], xnxt_d[1]] <= MAX_IGNORED_MASS
   
    x0 = init
    x0_d, dummy = discretize(x0, xmin=XMIN, xmax=XMAX, n_grids=N_GRIDS)
    if not inside(x0_d, vel):
        # The start is already out of the simulation box.
        return np.array([x0])
    v0 = vel[x0_d[0],x0_d[1]]
    v0 = velocity_add_random(v0, THETA)
    trajectory = [x0]
    
    for i in range(int(t_total)):
    
        if np.linalg.norm(v0) < eps:
            return np.array(trajectory)
        if no_cells_around(x0, v0):
            v0_cc = velocity_rotation(v0, THETA)
            v0_c = velocity_rotation(v0, -THETA)

            # nowhere to go but null
            CC = no_cells_around(x0, v0_cc) 
            C = no_cells_around(x0, v0_c)

            if CC and C:
                return np.array(trajectory)
            elif not C:
                v0 = v0_c
            else:
                v0 = v0_cc
                
        else:
            x = x0 + v0*dt
            x_d, dummy = discretize(x, xmin=XMIN, xmax=XMAX, n_grids=N_GRIDS)
            if not (inside(x_d, vel) and inside(x_d, grid_mass)):
                break
            # NOTE(review): x0_d is never advanced, so the candidate grid is
            # always compared with the starting grid, not the previous one.
            # Presumably intentional - confirm.
            if (pdm is not None) and not pdm[toTuple(x0_d)+toTuple(x_d)]:
                # Banned step: position and velocity stay as they are and no
                # random number is drawn, so every remaining iteration would
                # repeat this one. Stop here; the result is the same.
                break

            v = velocity_add_random(vel[x_d[0],x_d[1]], THETA)
            trajectory.append(x)
            x0 = x
            v0 = v

    return np.array(trajectory)
398 | 
399 | 
def diffusion_on_grid_wallbound(
        cell_embedding, 
        vel, 
        init, 
        grid_mass,
        dt=0.001, 
        t_total=10000, 
        eps = 1e-5):
    
    '''
    same as diffusion_off_grid_wallbound, however, it returns the coordinates
    of the grid traversed by the cell, instead of the position of the cell.
    Each step starts from the centre of the current grid, and the velocity
    turns 90 degrees if the cell would hit the boundary the next timestep.

    The diffusion is stopped by any of the criteria:
    1. reach t_total
    2. the magnitude of the velocity is less than eps.
    3. the cell goes to places where the cell mass is below the 5th
       percentile of the non-zero grid masses even after turning.
    4. the cell is out of the simulation box

    Parameters
    ----------
    
    cell_embedding: numpy ndarray (n_cells x n_dims)
        embedding coordinate for all the cells (downsampled);
        its min/max define the simulation box

    vel: numpy ndarray (n_grids+1 x n_grids+1 x n_dims)
        pre-assigned velocity of each grid

    init: numpy ndarray (n_dims)
        The initial position of the trajectory

    grid_mass: numpy ndarray, indexed like the first two axes of vel
        mass of cells.

    dt: float 
        Step size of each integration time step

    t_total: int
        Total number of time steps

    eps 
        Criterion to stop a trajectory before t_total (v_net < eps)

    
    Return
    ------
        a numpy ndarray of grid-centre coordinates in the trajectory, shape:
        (real_n_time_steps, n_dims)
    '''
    
    THETA = np.pi/6
    
    XMIN = np.min(cell_embedding, axis=0)
    XMAX = np.max(cell_embedding, axis=0)
    N_GRIDS=(vel.shape[0]-1,vel.shape[1]-1)
    
    # grids lighter than the lower 5% of the nonzero masses count as empty.
    MAX_IGNORED_MASS= np.percentile(grid_mass[grid_mass>0],5)

    def inside(grid_index, arr):
        # Explicit bounds test on the first two axes: negative indices do not
        # raise IndexError, they wrap around to the opposite side of the grid.
        return (0 <= grid_index[0] < arr.shape[0]
                and 0 <= grid_index[1] < arr.shape[1])

    def no_cells_around(xcur, vcur):
        # True when the next position is outside the box or on a grid whose
        # mass is below MAX_IGNORED_MASS.
        xnxt = xcur + vcur*dt
        xnxt_d, dummy = discretize(xnxt, xmin=XMIN, xmax=XMAX, n_grids=N_GRIDS)
        if not inside(xnxt_d, grid_mass):
            return True
        return grid_mass[xnxt_d[0], xnxt_d[1]] < MAX_IGNORED_MASS
   
    x0_d, x0_d_coor = discretize(init, xmin=XMIN, xmax=XMAX, n_grids=N_GRIDS)
    if not inside(x0_d, vel):
        # The start is already out of the simulation box.
        return np.array([x0_d_coor])
    v0 = vel[x0_d[0],x0_d[1]]
    v0 = velocity_add_random(v0, THETA)
    trajectory = [x0_d_coor]
    
    for i in range(int(t_total)):
    
        if np.linalg.norm(v0) < eps:
            return np.array(trajectory)
        if no_cells_around(x0_d_coor, v0):
            v0_cc = velocity_rotation(v0, np.pi/2)
            v0_c = velocity_rotation(v0, -np.pi/2)
            # nowhere to go but null
            CC = no_cells_around(x0_d_coor, v0_cc) 
            C = no_cells_around(x0_d_coor, v0_c)
            if CC and C:
                return np.array(trajectory)
            elif not C:
                v0 = v0_c
            else:
                v0 = v0_cc
                
        else:
            x = x0_d_coor + v0*dt
            x_d, x_d_coor = discretize(x, xmin=XMIN, xmax=XMAX, n_grids=N_GRIDS)
            if not inside(x_d, vel):
                break
            v = velocity_add_random(vel[x_d[0],x_d[1]], THETA)
            
            trajectory.append(x_d_coor)
            x0_d_coor = x_d_coor
            v0 = v

    return np.array(trajectory)
507 | 
508 | 
def run_diffusion(
        cell_embedding, 
        vel, 
        grid_mass, 
        dt, 
        t_total = 10000, 
        eps = 1e-5, 
        off_cell_init = False, 
        init_cell = [], 
        n_repeats = 10, 
        n_jobs = 8, 
        psrng_seeds_diffusion = None,
        path_divider_matrix=None):
    '''
    Simulation of diffusion of cells in the velocity field.
    Every trajectory is generated by diffusion_off_grid_wallbound.
    Embarrassingly parallel (process) are employed.
    
    Parameters
    ----------
    
    cell_embedding: numpy.ndarray (n_cells, 2)
        embedding coordinate for all the cells (downsampled)
        
    vel: numpy.ndarray (ngrid, ngrid, 2)
        pre-assigned velocity of each grid

    grid_mass: numpy.ndarray
        mass of cells on each grid, forwarded to the simulation
    
    dt: float
        Step size of each integration time step
    
    t_total: int
        Total number of time steps
    
    eps: float
        Criterion to stop a trajectory before t_total (v_net < eps)
    
    off_cell_init: Boolean
        Whether to spawn initial coordinates from the neighbouring space around a cell
        (uniform jitter of at most half a grid in each dimension, drawn from
        numpy's global random state)
        
    init_cell: list
        List of initial cell indices. If empty (or None), use all cell indices
        in the given cell_embedding. The default list is never mutated.
    
    n_repeats: int
        Number of repeats (either on or off the cells)
    
    n_jobs: int
        Number of worker processes. Values above the CPU count are capped;
        a negative value means cpu_count + 1 + n_jobs (at least 1).

    psrng_seeds_diffusion: list of int, optional
        At least n_repeats random seeds; the same seeds are reused for every
        initial cell. Defaults to [11, 111, 211, ...].

    path_divider_matrix: numpy boolean ndarray, optional
        Forwarded to the simulation as `pdm`.
    
    Return
    ------
        a numpy object array of trajectorys,  shape: (num_trajs, *n_time_steps, 2)
    '''
    import tqdm

    if psrng_seeds_diffusion is None:
        psrng_seeds_diffusion = [i*100+11 for i in range(n_repeats)]
        
    assert len(psrng_seeds_diffusion) >= n_repeats

    if n_jobs >= mp.cpu_count():
        n_jobs = mp.cpu_count()

    if n_jobs < 0:
        # never ask for fewer than one worker
        n_jobs = max(mp.cpu_count() + 1 + n_jobs, 1)

    TASKS = list()
    # Setting up the TASKS
    n_cells = cell_embedding.shape[0]
    
    # len() instead of truthiness so that numpy arrays are accepted as well.
    if init_cell is None or len(init_cell) == 0:
        init_cell = list(range(n_cells))

    embedding_range = cell_embedding.max(axis=0) - cell_embedding.min(axis=0)
    n_grids = np.array([vel.shape[0], vel.shape[1]])
    grid_size = embedding_range/n_grids
    
    n_trajs = 0 
    for i in init_cell:
        for j in range(n_repeats):
            n_trajs += 1
            if off_cell_init:
                init_position = cell_embedding[i] + grid_size * np.random.uniform(-0.5,0.5,2)
            else:
                init_position = cell_embedding[i]
            # Forward the caller's eps (it used to be replaced by a
            # hard-coded 1e-5, silently ignoring the argument).
            TASKS.append((cell_embedding, vel, init_position, grid_mass, dt,
                t_total, eps, psrng_seeds_diffusion[n_trajs % n_repeats],
                path_divider_matrix))
    
    with mp.Pool(n_jobs) as pool:
        n_total = len(init_cell)*n_repeats
        if n_total > 5000:
            paths = pool.starmap(diffusion_off_grid_wallbound, 
                    tqdm.tqdm(TASKS, total=n_total, 
                        desc="Generating Trajectories", 
                        colour="blue")
                    )
        else:
            paths = pool.starmap(diffusion_off_grid_wallbound, TASKS) 
    return np.array(paths, dtype=object)
608 | 


--------------------------------------------------------------------------------
/src/celldancer/embedding_kinetic_para.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import os 
 4 | 
 5 | os.environ['KMP_WARNINGS'] = '0'
 6 | 
 7 | def embedding_kinetic_para(
 8 |     cellDancer_df,
 9 |     kinetic_para,
10 |     umap_n=25
11 | ):
12 |     """Calculate the UMAP based on the kinetic parameter(s).
13 |         
14 |     Arguments
15 |     ---------
16 |     cellDancer_df: `pandas.DataFrame`
17 |         Dataframe of velocity estimation results. Columns=['cellIndex', 'gene_name', 'unsplice', 'splice', 'unsplice_predict', 'splice_predict', 'alpha', 'beta', 'gamma', 'loss', 'cellID', 'clusters', 'embedding1', 'embedding2']
18 |     kinetic_para: `str`
19 |         Choose Which parameter is used to calculate embedding space, which could be selected from {'alpha', 'beta', 'gamma', 'alpha_beta_gamma'}.
20 |     umap_n: optional, `int` (default: 25)
21 |         The size of the local neighborhood (in terms of the number of neighboring sample points) used for manifold approximation in UMAP.
22 | 
23 |     Returns
24 |     -------
25 |     cellDancer_df: `pandas.DataFrame`
26 |         The updated cellDancer_df with an additional column of UMAP based on the kinetic parameter(s).
27 | 
28 |     """  
29 |     import umap
30 |     if set([(kinetic_para+'_umap1'),(kinetic_para+'_umap2')]).issubset(cellDancer_df.columns):
31 |         cellDancer_df=cellDancer_df.drop(columns=[(kinetic_para+'_umap1'),(kinetic_para+'_umap2')])
32 | 
33 |     if kinetic_para=='alpha' or kinetic_para=='beta' or kinetic_para=='gamma':
34 |         para_df=cellDancer_df.pivot(index='cellIndex', columns='gene_name', values=kinetic_para)
35 |     elif kinetic_para=='alpha_beta_gamma':
36 |         alpha_df=cellDancer_df.pivot(index='cellIndex', columns='gene_name', values='alpha')
37 |         beta_df=cellDancer_df.pivot(index='cellIndex', columns='gene_name', values='beta')
38 |         gamma_df=cellDancer_df.pivot(index='cellIndex', columns='gene_name', values='gamma')
39 |         para_df=pd.concat([alpha_df,beta_df,gamma_df],axis=1)
40 |     else:
41 |         print('kinetic_para should be set in one of alpha, beta, gamma, or alpha_beta_gamma.')
42 | 
43 |     def get_umap(df,n_neighbors=umap_n, min_dist=0.1, n_components=2, metric='euclidean'):
44 |         fit = umap.UMAP(
45 |             n_neighbors=n_neighbors,
46 |             min_dist=min_dist,
47 |             n_components=n_components,
48 |             metric=metric
49 |         )
50 |         embed = fit.fit_transform(df);
51 |         return(embed)
52 |     umap_para=get_umap(para_df)
53 |     umap_info=pd.DataFrame(umap_para,columns=[(kinetic_para+'_umap1'),(kinetic_para+'_umap2')])
54 | 
55 |     gene_amt=len(cellDancer_df.gene_name.drop_duplicates())
56 |     umap_col=pd.concat([umap_info]*gene_amt)
57 |     umap_col.index=cellDancer_df.index
58 |     cellDancer_df=pd.concat([cellDancer_df,umap_col],axis=1)
59 |     return(cellDancer_df)
60 | 


--------------------------------------------------------------------------------
/src/celldancer/model/branch.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuangyuWangLab2021/cellDancer/fed4c0db1bf7a7314000128b0311c37301fca1d9/src/celldancer/model/branch.pt


--------------------------------------------------------------------------------
/src/celldancer/model/circle.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuangyuWangLab2021/cellDancer/fed4c0db1bf7a7314000128b0311c37301fca1d9/src/celldancer/model/circle.pt


--------------------------------------------------------------------------------
/src/celldancer/plotting/.Rapp.history:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuangyuWangLab2021/cellDancer/fed4c0db1bf7a7314000128b0311c37301fca1d9/src/celldancer/plotting/.Rapp.history


--------------------------------------------------------------------------------
/src/celldancer/plotting/__init__.py:
--------------------------------------------------------------------------------
 1 | from .cell import scatter_cell
 2 | from .cell import plot_kinetic_para
 3 | from .graph import PTO_Graph
 4 | from .gene import scatter_gene
 5 | from .colormap import build_colormap
 6 | 
 7 | 
 8 | __all__=[
 9 |         'scatter_cell',
10 |         'build_colormap',
11 |         'scatter_gene',
12 |         'PTO_Graph',
13 |         'plot_kinetic_para',
14 |         'colormap'
15 |         ]
16 | 
17 | 
18 | 
19 | 


--------------------------------------------------------------------------------
/src/celldancer/plotting/cell.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import matplotlib.pyplot as plt
  4 | from matplotlib.lines import Line2D
  5 | from matplotlib.colors import ListedColormap, LinearSegmentedColormap
  6 | from mpl_toolkits.axes_grid1.axes_divider import make_axes_locatable
  7 | from scipy.stats import norm as normal
  8 | import bezier
  9 | import numpy as np
 10 | import pandas as pd
 11 | from .colormap import *
 12 | 
 13 | if __name__ == "__main__":
 14 |     sys.path.append('..')
 15 |     from utilities import find_nn_neighbors, extract_from_df
 16 | else:
 17 |     from celldancer.utilities import find_nn_neighbors, extract_from_df
 18 | 
 19 | def scatter_cell(
 20 |     ax,
 21 |     cellDancer_df, 
 22 |     colors=None, 
 23 |     custom_xlim=None,
 24 |     custom_ylim=None,
 25 |     vmin=None,
 26 |     vmax=None,
 27 |     alpha=0.5, 
 28 |     s = 5,
 29 |     legend_marker_size=5,
 30 |     gene=None,
 31 |     velocity=False,
 32 |     legend='off',
 33 |     colorbar='on',
 34 |     min_mass=2,
 35 |     arrow_grid=(30,30)
 36 | ): 
 37 | 
 38 |     """Plot the RNA velocity on the embedding space; or plot the kinetic parameters ('alpha', 'beta', 'gamma', 'splice', 'unsplice', or 'pseudotime') of one gene on the embedding space.
 39 |         
 40 |     Arguments
 41 |     ---------
 42 |     ax: `ax`
 43 |         ax of plt.subplots()
 44 |     cellDancer_df: `pandas.DataFrame`
 45 |         Dataframe of velocity estimation, cell velocity, and pseudotime results. Columns=['cellIndex', 'gene_name', 'unsplice', 'splice', 'unsplice_predict', 'splice_predict', 'alpha', 'beta', 'gamma', 'loss', 'cellID', 'clusters', 'embedding1', 'embedding2', 'velocity1', 'velocity2', 'pseudotime']
 46 |     colors: `list`, `dict`, or `str`
 47 |         When the input is a list: build a colormap dictionary for a list of cell type;  
 48 |         When the input is a dictionary: it is the customized color map dictionary of each cell type; 
 49 |         When the input is a str: one of {'alpha', 'beta', 'gamma', 'splice', 'unsplice', 'pseudotime'} is used as input.
 50 |     custom_xlim: optional, `float` (default: None)
 51 |         Set the x limit of the current axes.
 52 |     custom_ylim: optional, `float` (default: None)
 53 |         Set the y limit of the current axes.
 54 |     vmin: optional, `float` (default: None)
 55 |         Set the minimum color limit of the current image.
 56 |     vmax: optional, `float` (default: None)
 57 |         Set the maximum color limit of the current image.
 58 |     alpha: optional, `float` (default: 0.5)
 59 |         The alpha blending value, between 0 (transparent) and 1 (opaque).
 60 |     s: optional, `float` (default: 5)
 61 |         The marker size.
 62 |     legend_marker_size: optional, `float` (default: 5)
 63 |         The lengend marker size.
 64 |     gene: optional, `str` (default: None)
 65 |         Gene name for plotting.
 66 |     velocity: optional, `bool` (default: False)
 67 |         `True` if plot velocity.
 68 |     legend: optional, `str` (default: 'off')
 69 |         `'off'` if the color map of cell legend is not plotted. 
 70 |         `'only'` if only plot the cell type legend.
 71 |     colorbar: optional, `str` (default: 'on')
 72 |         `‘on’` if the colorbar of the plot of `alpha`, `beta`, `gamma`, `splice`, or `unsplice` is to be shown. `'off'` if the colorbar is to be not shown.
 73 |     min_mass: optional, `float` (default: 2)
 74 |         Filter by using the isotropic gaussian kernel to display the arrow on grids. The lower the min_mass, the more arrows.
 75 |     arrow_grid: optional, `tuple` (default: (30,30))
 76 |         The sparsity of the grids of velocity arrows. The larger, the more compact, and more arrows will be shown.
 77 |     Returns
 78 |     -------
 79 |     ax: matplotlib.axes.Axes
 80 |     """  
 81 | 
 82 |     def gen_Line2D(label, markerfacecolor):
 83 |         return Line2D([0], [0], color='w', marker='o', label=label,
 84 |             markerfacecolor=markerfacecolor, 
 85 |             markeredgewidth=0,
 86 |             markersize=legend_marker_size)
 87 | 
 88 |     if isinstance(colors, (list, tuple)):
 89 |         #print("\nbuild a colormap for a list of clusters as input\n")
 90 |         colors = build_colormap(colors)
 91 |     
 92 |     if isinstance(colors, dict):
 93 |         attr = 'clusters'
 94 |         legend_elements= [gen_Line2D(i, colors[i]) for i in colors]
 95 |         if legend != 'off':
 96 |             lgd=ax.legend(handles=legend_elements,
 97 |                 bbox_to_anchor=(1.01, 1),
 98 |                 loc='upper left')
 99 |             bbox_extra_artists=(lgd,)
100 |             if legend == 'only':
101 |                 return lgd
102 |         else:
103 |             bbox_extra_artists=None
104 | 
105 |         c=np.vectorize(colors.get)(extract_from_df(cellDancer_df, 'clusters', gene))
106 |         cmap=ListedColormap(list(colors.values()))
107 |     elif isinstance(colors, str):
108 |         attr = colors
109 |         if colors in ['alpha', 'beta', 'gamma']:
110 |             assert gene, '\nError! gene is required!\n'
111 |             cmap = LinearSegmentedColormap.from_list("mycmap", colors_alpha_beta_gamma)
112 |         if colors in ['splice', 'unsplice']:
113 |             assert gene, '\nError! gene is required!\n'
114 |             colors = {'splice':'splice', 'unsplice':'unsplice'}[colors]
115 |             cmap = LinearSegmentedColormap.from_list("mycmap",
116 |                     colors_splice_unsplice)
117 |         if colors in ['pseudotime']:
118 |             cmap = 'viridis'
119 |         c = extract_from_df(cellDancer_df, [colors], gene)
120 |         
121 |     elif colors is None:
122 |         attr = 'basic'
123 |         cmap = None
124 |         c = 'Grey'
125 |     
126 |     embedding = extract_from_df(cellDancer_df, ['embedding1', 'embedding2'], gene)
127 |     n_cells = embedding.shape[0]
128 |     
129 |     im=ax.scatter(embedding[:, 0],
130 |                 embedding[:, 1],
131 |                 c=c,
132 |                 cmap=cmap,
133 |                 s=s,
134 |                 vmin=vmin,
135 |                 vmax=vmax,
136 |                 alpha=alpha,
137 |                 edgecolor="none")
138 |     if colorbar == 'on' and isinstance(colors, str):
139 |         ax_divider = make_axes_locatable(ax)
140 |         cax = ax_divider.append_axes("top", size="5%", pad="-5%")
141 | 
142 |         # print("   \n ")
143 |         cbar = plt.colorbar(im, cax=cax, orientation="horizontal", shrink=0.1)
144 |         cbar.set_ticks([])
145 | 
146 |     if velocity:
147 |         sample_cells = cellDancer_df['velocity1'][:n_cells].dropna().index
148 |         embedding_ds = embedding[sample_cells]
149 |         velocity_embedding= extract_from_df(cellDancer_df, ['velocity1', 'velocity2'], gene)
150 |         grid_curve(ax, embedding_ds, velocity_embedding, arrow_grid, min_mass)
151 | 
152 |     if custom_xlim is not None:
153 |         ax.set_xlim(custom_xlim[0], custom_xlim[1])
154 |     if custom_ylim is not None:
155 |         ax.set_ylim(custom_ylim[0], custom_ylim[1])
156 |     
157 |     return ax
158 | 
def grid_curve(
    ax, 
    embedding_ds, 
    velocity_embedding, 
    arrow_grid, 
    min_mass
):
    """Draw curved velocity arrows on a regular grid laid over the embedding.

    For every grid point a Gaussian-kernel-weighted average of the cell
    velocities is computed at four locations (head, tail, and one further
    step beyond each). Together with the grid point itself these give the
    five control points of a degree-4 Bezier curve, which is drawn on ``ax``
    with an arrow head on its last segment. All curves are rescaled by one
    common factor so that the longest curve is as long as the mean grid
    spacing.

    Arguments
    ---------
    ax: matplotlib axes
        Axes the curves and arrow heads are drawn on.
    embedding_ds: `numpy.ndarray`
        Cell coordinates, one row per cell; columns 0 and 1 are used as x/y.
    velocity_embedding: `numpy.ndarray`
        Cell velocities, added row-wise to ``embedding_ds``.
    arrow_grid: sequence of `int`
        Number of grid points per embedding dimension.
    min_mass: `float`
        Grid points whose summed kernel weight (head side) is below this
        value are not drawn.

    Returns
    -------
    None
    """
    smooth = 0.8  # kernel width as a fraction of the mean grid spacing
    # Curve parameter values; values above 1 extrapolate the Bezier curve
    # beyond its last control point.
    s_vals = np.linspace(0.0, 1.5, 15) # TODO check last

    def calculate_two_end_grid():
        # Prepare the grid
        grs = []
        for dim_i in range(embedding_ds.shape[1]):
            # NOTE(review): 0.2 is subtracted from BOTH ends, which shifts the
            # grid rather than padding it -- kept as in the original, confirm.
            m, M = np.min(embedding_ds[:, dim_i])-0.2, np.max(embedding_ds[:, dim_i])-0.2
            m = m - 0.025 * np.abs(M - m)
            M = M + 0.025 * np.abs(M - m)
            grs.append(np.linspace(m, M, arrow_grid[dim_i]))

        XY = np.vstack([i.flat for i in np.meshgrid(*grs)]).T

        # At least one neighbour, so tiny inputs do not request zero neighbours.
        n_neighbors = max(1, int(velocity_embedding.shape[0]/3))
        std = np.mean([(g[1] - g[0]) for g in grs])

        def kernel_average(points, queries):
            # find_nn_neighbors presumably returns, per query row, distances
            # to and indices of its nearest rows in `points` -- verify.
            dists, neighs = find_nn_neighbors(points, queries, n_neighbors)
            # isotropic gaussian kernel
            weights = normal.pdf(loc=0, scale=smooth * std, x=dists)
            mass = weights.sum(1)
            # weighted average; the divisor is clamped to >= 1
            averaged = (velocity_embedding[neighs] * weights[:, :, None]).sum(
                1) / np.maximum(1, mass)[:, None]
            return averaged, mass

        UZ_head, mass_head = kernel_average(embedding_ds, XY)
        UZ_tail, _ = kernel_average(embedding_ds+velocity_embedding, XY)
        UZ_head2, _ = kernel_average(embedding_ds, XY+UZ_head)
        UZ_tail2, _ = kernel_average(embedding_ds, XY-UZ_tail)

        # filter dots: keep grid points with enough kernel mass around them
        keep = ~(mass_head < min_mass)
        return (XY[keep, :], UZ_head[keep, :], UZ_tail[keep, :],
                UZ_head2[keep, :], UZ_tail2[keep, :], grs)

    XYM, UVH, UVT, UVH2, UVT2, grs = calculate_two_end_grid()

    n_curves = XYM.shape[0]
    if n_curves == 0:
        # Every grid point was filtered out: nothing to draw.
        return

    def curve_points(i, scale):
        # Sample the Bezier curve of grid point i with velocities scaled by `scale`.
        mid = XYM[i, :2]
        tail, head = UVT[i, :2] * scale, UVH[i, :2] * scale
        tail2, head2 = UVT2[i, :2] * scale, UVH2[i, :2] * scale
        nodes = np.asfortranarray(np.column_stack(
            [mid - tail - tail2, mid - tail, mid, mid + head, mid + head + head2]))
        return bezier.Curve(nodes, degree=4).evaluate_multi(s_vals)

    # get longest curve length and normalise it to the grid spacing
    max_distance = 0
    for i in range(n_curves):
        dots = curve_points(i, 1)
        length = np.sqrt(np.diff(dots[0])**2 + np.diff(dots[1])**2).sum()
        max_distance = max(max_distance, length)
    distance_grid = (
        abs(grs[0][0]-grs[0][1]) + abs(grs[1][0]-grs[1][1]))/2
    # All-zero velocities give zero-length curves; avoid dividing by zero.
    norm_ratio = distance_grid/max_distance if max_distance > 0 else 1.0

    # plot the curve arrow for cell velocity
    # TO DO: add 'colorful cell velocity' to here, now there is only curve arrows
    for i in range(n_curves):
        dots = curve_points(i, norm_ratio)
        ax.plot(dots[0], dots[1],
                    linewidth=0.5, color='black', alpha=1)

        # normalize the last segment so that all arrow heads have the same size in quiver
        U = dots[0][-1]-dots[0][-2]
        V = dots[1][-1]-dots[1][-2]
        N = np.sqrt(U**2 + V**2)
        if not N > 0:
            # Zero-length (or NaN) last segment: no direction to draw.
            continue
        U1, V1 = U/N*0.5, V/N*0.5  # 0.5 is to let the arrow have a suitable size
        ax.quiver(dots[0][-2], dots[1][-2], U1, V1, units='xy', angles='xy',
                    scale=1, linewidth=0, color='black', alpha=1, minlength=0, width=0.1)
300 | 
301 | 
def plot_kinetic_para(
    ax,
    kinetic_para,
    cellDancer_df,
    color_map=None,
    title=None,
    legend=False
):

    """Plot the UMAP calculated by the kinetic parameter(s).

    The rows of the first gene found in ``cellDancer_df`` are used (one row
    per cell), and each cell is coloured by its cluster.
        
    Arguments
    ---------
    ax: `ax`
        ax of plt.subplots()
    kinetic_para: `str`
        The parameter used to generate the embedding space based on UMAP, could be selected from {'alpha', 'beta', 'gamma', 'alpha_beta_gamma'}.
        The columns ``<kinetic_para>_umap1`` and ``<kinetic_para>_umap2`` must exist in ``cellDancer_df``.
    cellDancer_df: `pandas.DataFrame`
        Dataframe of velocity estimation results. Columns=['cellIndex', 'gene_name', 'splice', 'unsplice', 'splice_predict', 'unsplice_predict', 'alpha', 'beta', 'gamma', 'loss', 'cellID', 'clusters', 'embedding1', 'embedding2']
    color_map: `dict` (optional, default: None)
        The color map dictionary of each cell type. Clusters missing from the dictionary are drawn in black.
    title: `str` (optional, default: None)
        Title of the plot. When `None`, 'UMAP of <kinetic_para>' is used.
    legend: `bool` (optional, default: False)
        `True` if the color map of cell legend is to be plotted. 

    Returns
    -------
    ax: matplotlib.axes.Axes
    """    
    # Positional lookup: the dataframe index does not have to contain label 0.
    first_gene = cellDancer_df['gene_name'].iloc[0]
    onegene = cellDancer_df[cellDancer_df['gene_name'] == first_gene]
    umap_para = onegene[[(kinetic_para+'_umap1'), (kinetic_para+'_umap2')]].to_numpy()
    onegene_cluster_info = onegene['clusters']

    if color_map is None:
        from .colormap import build_colormap
        color_map = build_colormap(onegene_cluster_info)

    colors = [color_map.get(cluster, 'black') for cluster in onegene_cluster_info]

    if legend:
        markers = [plt.Line2D([0,0],[0,0],color=color, marker='o', linestyle='') for color in color_map.values()]
        # Attach the legend to the axes that was passed in, not to whichever
        # axes happens to be current in pyplot.
        ax.legend(markers, list(color_map.keys()), numpoints=1, loc='upper left', bbox_to_anchor=(1.01, 1))

    ax.scatter(umap_para[:,0], umap_para[:,1], c=colors, s=15, alpha=0.5, edgecolor="none")
    ax.axis('square')
    ax.axis('off')
    ax.set_title('UMAP of '+ kinetic_para if title is None else title)

    return ax


--------------------------------------------------------------------------------
/src/celldancer/plotting/colormap.py:
--------------------------------------------------------------------------------
# Colour ramps (hex codes) from which the plotting modules build colormaps
# when cells are coloured by 'alpha'/'beta'/'gamma' or by 'splice'/'unsplice'.
colors_alpha_beta_gamma = ["#007EB7","#3B9AB2", "#78B7C5", "#EBCC2A", "#E1AF00", "#F21A00"] 
colors_splice_unsplice = ["#2488F0","#7F3F98","#E22929","#FCB31A"]

# Ready-made cluster-label -> colour dictionaries. Judging by their names they
# belong to specific example datasets -- presumably the case-study notebooks.
colormap_erythroid={
'Haematoendothelial progenitors':'#3361A5',
'Blood progenitors 1':'#248AF3',
'Blood progenitors 2':'#14B3FF',
'Erythroid1':'#88CEEF',
'Erythroid2':'#FDB31A',
'Erythroid3':'#E42A2A'
}

# Several labels intentionally or accidentally share a colour here
# (e.g. 'Granule', 'ImmGranule1' and 'ImmGranule2').
colormap_neuro = {
'CA': "#ed0345",
'CA1-Sub': "#710162",
'CA2-3-4': "#a12a5e",
'Granule':"#ef6a32",
'ImmGranule1': "#ef6a32",
'ImmGranule2': "#ef6a32",
'Nbl1': "#fbbf45",
'Nbl2': "#fbbf45",
'nIPC': "#aad962",
'RadialGlia': "#03c383",
'RadialGlia2': "#03c383",
'GlialProg': '#56A65A',
'OPC': "#017351",
'ImmAstro': "#08A8CE"
}


colormap_pancreas={
'Ductal':'#3361A5',
'Ngn3 low EP':'#248AF3',
'Ngn3 high EP':'#14B3FF',
'Pre-endocrine':'#88CEEF',
'Alpha':'#ff4800',
'Beta':"#B81136",
'Delta':'green',
'Epsilon':'#03B3B0'
}

# Keyed by integer cluster id (0-6).
colormap_hgForebrainGlut={
0:'#9408F7',
1:'#C729D6',
2:'#FA4AB5',
3:'#FF6A95',
4:'#FF8B74',
5:'#FFAC53',
6:'#FFCD32'
}

# Keyed by cluster name, with the same seven colours as the integer-keyed map
# above. NOTE(review): the literal repeats the keys 'Radial Glia', 'Neuroblast'
# and 'Immature Neuron'; Python keeps only the LAST value of a repeated key, so
# at runtime this dict has four entries. The seven clusters probably need
# distinct names -- confirm the intended labels.
colormap_hgforebrainglut={
'Radial Glia':'#9408F7',
'Radial Glia':'#C729D6',
'Neuroblast':'#FA4AB5',
'Neuroblast':'#FF6A95',
'Immature Neuron':'#FF8B74',
'Immature Neuron':'#FFAC53',
'Neuron':'#FFCD32'
}

# Default palette that build_colormap() hands out to clusters.
color_template = ["#08A8CE","#017351",'#56A65A',"#03c383","#aad962","#fbbf45","#ef6a32","#ed0345","#a12a5e","#710162","#3B9AB2"]
63 | 
def build_colormap(cluster_list, color_list=None):
    """Build a ``{cluster: color}`` dictionary for the given cluster labels.

    ``cluster_list`` may contain each label once (a list of cell types) or
    once per cell (e.g. the ``clusters`` column of a dataframe). Labels are
    de-duplicated first, keeping first-appearance order, so the colour of a
    cluster does not depend on how many cells it has or where they sit in
    the sequence.

    Arguments
    ---------
    cluster_list: iterable of hashable
        Cluster labels, possibly repeated.
    color_list: optional, sequence (default: None)
        Palette to draw colours from. When `None`, the module-level
        ``color_template`` is used.

    Returns
    -------
    dict
        One entry per distinct label. When there are more distinct labels
        than colours, the palette is reused cyclically; otherwise every
        label receives a different palette entry.
    """
    from itertools import cycle
    palette = color_template if color_list is None else list(color_list)
    # dict.fromkeys de-duplicates while preserving first-appearance order.
    clusters = list(dict.fromkeys(cluster_list))
    if len(clusters) > len(palette):
        return dict(zip(clusters, cycle(palette)))
    # The labels (not the palette) are cycled here, as before, so a label's
    # final colour is the last palette entry that wraps onto it. This keeps
    # the assignment unchanged for inputs that were already duplicate-free.
    return dict(zip(cycle(clusters), palette))
69 | 
70 | 


--------------------------------------------------------------------------------
/src/celldancer/plotting/gene.py:
--------------------------------------------------------------------------------
  1 | import matplotlib.pyplot as plt
  2 | import os
  3 | import sys
  4 | import pandas as pd
  5 | import numpy as np
  6 | from matplotlib.lines import Line2D
  7 | from matplotlib.colors import ListedColormap
  8 | from .colormap import *
  9 | from ..sampling import sampling_neighbors
 10 | from ..utilities import extract_from_df
 11 | 
 12 | def scatter_gene(
 13 |     ax=None,
 14 |     x=None,
 15 |     y=None,
 16 |     cellDancer_df=None,
 17 |     colors=None,
 18 |     custom_xlim=None,
 19 |     custom_ylim=None,
 20 |     vmin=None,
 21 |     vmax=None,
 22 |     alpha=0.5, 
 23 |     s = 5,
 24 |     velocity=False,
 25 |     gene=None,
 26 |     legend='off',
 27 |     arrow_grid = (15,15)):
 28 | 
 29 |     """Plot the velocity (splice-unsplice) of a gene, or plot the parameter ('alpha', 'beta', 'gamma', 'splice', 'unsplice') in pseudotime, or customize the parameters in x-axis and y-axis of a gene.
 30 |         
 31 |     Arguments
 32 |     ---------
 33 |     ax: `ax of plt.subplots()`
 34 |         ax to add subplot.
 35 |     x: `str`
 36 |         Set x axis as one of {'splice', 'unsplice', 'alpha', 'beta', 'gamma', 'pseudotime'}.
 37 |     y: `str`
 38 |         Set y axis as one of {'splice', 'unsplice', 'alpha', 'beta', 'gamma', 'pseudotime'}.
 39 |     cellDancer_df: `pandas.DataFrame`
 40 |         Dataframe of velocity estimation, cell velocity, and pseudotime results. Columns=['cellIndex', 'gene_name', 'unsplice', 'splice', 'unsplice_predict', 'splice_predict', 'alpha', 'beta', 'gamma', 'loss', 'cellID', 'clusters', 'embedding1', 'embedding2', 'velocity1', 'velocity2', 'pseudotime']
 41 |     colors: `list`, `dict`, or `str`
 42 |         When the input is a list: build a colormap dictionary for a list of cell type; 
 43 |         When the input is a dictionary: the customized color map dictionary of each cell type; 
 44 |         When the input is a str: one of {'alpha', 'beta', 'gamma', 'splice', 'unsplice', 'pseudotime'} is used as value of color.
 45 |     custom_xlim: optional, `float` (default: None)
 46 |         Set the x limit of the current axes.
 47 |     custom_ylim: optional, `float` (default: None)
 48 |         Set the y limit of the current axes.
 49 |     vmin: optional, `float` (default: None)
 50 |         Set the minimum color limit of the current image.
 51 |     vmax: optional, `float` (default: None)
 52 |         Set the maximum color limit of the current image.
 53 |     alpha: optional, `float` (default: 0.5)
 54 |         The alpha blending value, between 0 (transparent) and 1 (opaque).
 55 |     s: optional, `float` (default: 5)
 56 |         The marker size.
 57 |     velocity: optional, `bool` (default: False)
 58 |         `True` if velocity in gene level is to be plotted.
 59 |     gene: optional, `str` (default: None)
 60 |         Gene selected to be plotted.
 61 |     legend: optional, `str` (default: 'off')
 62 |         `‘off’` if the color map of cell type legend is not to be plotted;
 63 |         `‘only’` if only plot the cell type legend.
 64 |     arrow_grid: optional, `tuple` (default: (15,15))
 65 |         The sparsity of the grids of velocity arrows. The larger, the more compact and more arrows will be shown.
 66 | 
 67 |     Returns
 68 |     -------
 69 |     ax: matplotlib.axes.Axes
 70 |     """ 
 71 | 
 72 |     def gen_Line2D(label, markerfacecolor):
 73 |         return Line2D([0], [0], color='w', marker='o', label=label,
 74 |             markerfacecolor=markerfacecolor,
 75 |             markeredgewidth=0,
 76 |             markersize=s)
 77 |     
 78 |     if isinstance(colors, list):
 79 |         colors = build_colormap(colors)
 80 | 
 81 |     if isinstance(colors, dict):
 82 |         attr = 'clusters'
 83 |         legend_elements= [gen_Line2D(i, colors[i]) for i in colors]
 84 |         if legend != 'off':
 85 |             lgd=ax.legend(handles=legend_elements,
 86 |                 bbox_to_anchor=(1.01, 1),
 87 |                 loc='upper left')
 88 |             bbox_extra_artists=(lgd,)
 89 |             if legend == 'only':
 90 |                 return lgd
 91 |         else:
 92 |             bbox_extra_artists=None
 93 | 
 94 |         c=np.vectorize(colors.get)(extract_from_df(cellDancer_df, 'clusters'))
 95 |         cmap=ListedColormap(list(colors.values()))
 96 | 
 97 |     elif isinstance(colors, str):
 98 |         attr = colors
 99 |         if colors in ['alpha', 'beta', 'gamma']:
100 |             assert gene, '\nError! gene is required!\n'
101 |             cmap = ListedColormap(colors_alpha_beta_gamma)
102 |         if colors in ['splice', 'unsplice']:
103 |             assert gene, '\nError! gene is required!\n'
104 |             cmap = ListedColormap(colors_splice_unsplice)
105 |         if colors in ['pseudotime']:
106 |             cmap = 'viridis'
107 |         else:
108 |             cmap = 'viridis'
109 | 
110 |         c = extract_from_df(cellDancer_df, [colors], gene)
111 |     elif colors is None:
112 |         attr = 'basic'
113 |         cmap = None
114 |         c = '#95D9EF'
115 |     
116 |     assert gene, '\nError! gene is required!\n'
117 |     xy = extract_from_df(cellDancer_df, [x, y], gene)
118 |     ax.scatter(xy[:, 0],
119 |                xy[:, 1],
120 |                c=c,
121 |                cmap=cmap,
122 |                s=s,
123 |                alpha=alpha,
124 |                vmin=vmin,
125 |                vmax=vmax,
126 |                edgecolor="none")
127 | 
128 |     if custom_xlim is not None:
129 |         ax.set_xlim(custom_xlim[0], custom_xlim[1])
130 |     if custom_ylim is not None:
131 |         ax.set_ylim(custom_ylim[0], custom_ylim[1])
132 | 
133 |                                  
134 |     if velocity:
135 |         assert (x,y) in [('unsplice', 'splice'), ('splice', 'unsplice')]
136 |         u_s = extract_from_df(cellDancer_df, ['unsplice','splice','unsplice_predict','splice_predict'], gene)
137 |         sampling_idx=sampling_neighbors(u_s[:,0:2], step=arrow_grid, percentile=15) # Sampling
138 |         u_s_downsample = u_s[sampling_idx,0:4]
139 | 
140 |         plt.scatter(u_s_downsample[:, 1], u_s_downsample[:,0], color="none", s=s, edgecolor="k")
141 |         plt.quiver(u_s_downsample[:, 1], u_s_downsample[:, 0], 
142 |                    u_s_downsample[:, 3]-u_s_downsample[:, 1], 
143 |                    u_s_downsample[:, 2]-u_s_downsample[:, 0],
144 |                    angles='xy', clim=(0., 1.))
145 | 
146 |     return ax
147 | 
148 | 


--------------------------------------------------------------------------------
/src/celldancer/plotting/graph.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import networkx as nx
  3 | import pandas as pd
  4 | import numpy as np
  5 | from datashader.layout import forceatlas2_layout
  6 | from datashader.bundling import hammer_bundle, connect_edges
  7 | import matplotlib.pyplot as plt
  8 | from matplotlib.colors import ListedColormap
  9 | from matplotlib.lines import Line2D
 10 | from mpl_toolkits.axes_grid1.axes_divider import make_axes_locatable
 11 | 
 12 | from .colormap import *
 13 | if __name__ == "__main__":# developer test
 14 |     sys.path.append('..')
 15 |     from utilities import extract_from_df
 16 | else:
 17 |     from celldancer.utilities import extract_from_df
 18 | 
 19 | def PTO_Graph(
 20 |         ax,
 21 |         cellDancer_df,
 22 |         node_layout='forceatlas2',
 23 |         PRNG_SEED=None,
 24 |         force_iters=2000,
 25 |         use_edge_bundling=True,
 26 |         node_colors=None,
 27 |         node_sizes=5,
 28 |         edge_length=None,
 29 |         legend='off',
 30 |         colorbar='on'):
 31 | 
 32 |     """ 
 33 |     Graph visualization of selected cells reflecting their orders in
 34 |     pseudotime (PseudoTimeOrdered_Graph: PTO_Graph). Embedding and pseudotime 
 35 |     of the cells are required. Each cell makes a node and the connections between 
 36 |     nodes are based on their separation in the embedding space and the strength 
 37 |     of the connection is proportional to the pseudotime difference (the larger 
 38 |     the pseudotime difference in absolute values, the weaker the connection).
 39 | 
 40 |     Example usage:
 41 | 
 42 |     .. code-block:: python
 43 | 
 44 |         from celldancer.plotting import graph
 45 |         from matplotlib import pyplot as plt
 46 |         fig, ax = plt.subplots(figsize=(10,10))
 47 |         graph.PTO_Graph(ax, 
 48 |             load_cellDancer, 
 49 |             node_layout='forcedirected', 
 50 |             use_edge_bundling=True, 
 51 |             node_colors='clusters', 
 52 |             edge_length=3, 
 53 |             node_sizes='pseudotime', 
 54 |             colorbar='on',
 55 |             legend='on')
 56 |         
 57 |     In this example, we use a force-directed node layout algorithm (`ForceAtlas2 
 58 |     <https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0098679>`_).
 59 |     A connection is made between any two cells within 3 (unit in the embedding).
 60 |     The resulted edge lengths indicate the time difference between nodes (the
 61 |     closer in pseudotime, the shorter the edge length). Edge bundling is applied
 62 |     to highlight important edges (trunks). The sizes of the nodes are
 63 |     proportional to the pseudotime. The nodes are colored according to their
 64 |     cell types (if given by the input data). 
 65 | 
 66 |     Arguments
 67 |     ---------
 68 |     cellDancer_df: `pandas.DataFrame`
 69 |         Dataframe of velocity estimation, cell velocity, and pseudotime results. 
 70 |         Columns=['cellIndex', 'gene_name', 
 71 |         'unsplice', 'splice', 
 72 |         'unsplice_predict', 'splice_predict', 
 73 |         'alpha', 'beta', 'gamma', 
 74 |         'loss', 'cellID', 'clusters', 'embedding1', 'embedding2', 
 75 |         'velocity1', 'velocity2', 'pseudotime']
 76 | 
 77 |     node_layout: optional, `str` (default: forceatlas2)
 78 |          Layout for the graph. Currently only supports the forceatlas2 and
 79 |          embedding. 
 80 | 
 81 |          - `'forceatlas2'` or `'forcedirected'`: treat connections as forces
 82 |          between connected nodes.
 83 | 
 84 |          - `'embedding'`: use the embedding as positions of the nodes.
 85 | 
 86 |     PRNG_SEED: optional, `int`, or `None` (default: `None`)
 87 |         Seed to initialize the pseudo-random number generator.
 88 | 
 89 |     force_iters: optional, `int` (default: 2000)
 90 |         Number of passes for the force-directed layout calculation.
 91 | 
 92 |     use_edge_bundling: optional, `bool` (default: `True`)
 93 |         `True` if bundle the edges (computational demanding). 
 94 |         Edge bundling allows edges to curve and groups nearby ones together 
 95 |         for better visualization of the graph structure. 
 96 | 
 97 |     node_colors: optional, `str` (default: `None`)
 98 |         The node fill colors. 
 99 |         Possible values:
100 | 
101 |             - *clusters*: color according to the clusters information of the
102 |               respective cells.
103 | 
104 |             - *pseudotime*: colors according to the pseudotime of the 
105 |               respective cells.
106 | 
107 |             - A single color format string.
108 | 
109 |     edge_length: optional, `float` (default: `None`)
110 |         The distance cutoff in the embedding between two nodes to determine 
111 |         whether an edge should be formed (edge is formed when r < *edge_length*).
112 |         By default, the mean of all the cell
113 |         
114 |     node_sizes: optional, `float` or `numeric list-like` or `str` (default: 5)
115 |         The sizes of the nodes. If it is `str`, then the `str` has to be either one of those
116 |         {`pseudotime`, `index`, `x`, `y`} read from the `nodes` dataframe.
117 | 
118 |     legend: optional, `str` (default: 'off')
119 |         - `'off'`/`'on'`: Exclude/include the cell type legend on the plot. 
120 |         - `'only'`: Negelect the plot and only show the cell type legend.
121 | 
122 |     colorbar: optional, `str` (default: 'on')
123 |         - `'off'`/`'on'`: Show the colorbar in the case nodes are colored by `pseudotime`.
124 |         
125 | 
126 |     Returns
127 |     -------
128 |     ax: matplotlib.axes.Axes
129 | 
130 |     """  
131 | 
132 |     nodes, edges = create_nodes_edges(cellDancer_df, edge_length)
133 | 
134 |     if node_layout in ['forceatlas2', 'forcedirected']:
135 |         # Current version of datashader.layout does not support reading a layout (x,y) and perform layout function
136 |         # It does not support other attributes except index.
137 |         forcedirected = forceatlas2_layout(nodes[['index']], edges,
138 |                 weight='weight', iterations=force_iters, k=0.1, seed=PRNG_SEED)
139 |         nodes['x'] = forcedirected['x']
140 |         nodes['y'] = forcedirected['y']
141 | 
142 |     if use_edge_bundling:
143 |         bundle = hammer_bundle(nodes, edges)
144 |     else:
145 |         bundle = connect_edges(nodes, edges)
146 | 
147 | 
148 |     # For plotting settings
149 |     def gen_Line2D(label, markerfacecolor, markersize):
150 |         return Line2D([0], [0], color='w', 
151 |             marker='o', 
152 |             label=label,
153 |             markerfacecolor=markerfacecolor,
154 |             markeredgewidth=0,
155 |             markersize=markersize)
156 | 
157 |     if isinstance(node_sizes, (int, float)) or isinstance(node_sizes, list):
158 |         pass
159 |     elif isinstance(node_sizes, str):
160 |         node_sizes=nodes[node_sizes].to_numpy(dtype=float)*200
161 |     
162 |     if isinstance(node_colors, str):
163 |         # This goes to dict case afterwards
164 |         if node_colors in ['clusters']:
165 |             node_colors = build_colormap(nodes[node_colors])
166 |         if node_colors in ['pseudotime']:
167 |             cmap='viridis'
168 |             c=nodes[node_colors].to_numpy(dtype=float)
169 | 
170 |     if isinstance(node_colors, dict):
171 |         legend_elements= [gen_Line2D(i, 
172 |                     node_colors[i], 
173 |                     10) 
174 |                     for i in node_colors]
175 | 
176 |         if legend != 'off':
177 |             lgd=ax.legend(handles=legend_elements,
178 |                 bbox_to_anchor=(1.01, 1),
179 |                 loc='upper left')
180 |             bbox_extra_artists=(lgd,)
181 |             if legend == 'only':
182 |                 return lgd
183 |         else:
184 |             bbox_extra_artists=None
185 | 
186 |         c=nodes['clusters'].map(node_colors).to_list()
187 |         cmap=ListedColormap(list(node_colors.values()))
188 | 
189 |     if node_colors is None:
190 |         c = ['Grey']*len(nodes)
191 | 
192 |     ax.plot(bundle.x, bundle.y, 'y', zorder=1, linewidth=0.3, color='blue', alpha=1)
193 |     im = ax.scatter(nodes.x, nodes.y, c=c, cmap=cmap, s=node_sizes, zorder=2, edgecolors='k', alpha=0.5)
194 |     
195 |     if colorbar == 'on' and isinstance(node_colors, str):
196 |         ax_divider = make_axes_locatable(ax)
197 |         cax = ax_divider.append_axes("top", size="5%", pad="-5%")
198 |         cbar = plt.colorbar(im, cax=cax, orientation="horizontal", shrink=0.1)
199 |         cbar.set_ticks([])
200 |     ax.axis('off')
201 | 
202 |     return ax
203 | 
204 | 
205 | 
def create_nodes_edges(data, radius):
    """Build the node and edge tables of the pseudotime-ordered cell graph.

    Only cells whose ``velocity1`` entry is not NaN are used. Two cells are
    linked when the radius-neighbour search on the embedding (with
    ``radius``) reports them as neighbours; the edge weight is
    ``1 / |pseudotime difference|``, and pairs with identical pseudotime get
    the largest such value found among all cell pairs.

    Arguments
    ---------
    data: `pandas.DataFrame`
        Must provide 'embedding1', 'embedding2', 'velocity1', 'clusters' and
        'pseudotime' (read through ``extract_from_df``, which presumably
        returns one row per cell -- verify against its definition).
    radius: `float`
        Radius passed to ``sklearn.neighbors.NearestNeighbors``.

    Returns
    -------
    nodes: `pandas.DataFrame` with columns ['x', 'y', 'index', 'pseudotime', 'clusters']
    edges: `pandas.DataFrame` with columns ['source', 'target', 'weight']
    """
    def build_graph(points, times, labels):
        from sklearn.neighbors import NearestNeighbors
        finder = NearestNeighbors(radius = radius)
        finder.fit(points)
        # Dense 0/1 adjacency matrix of the radius-neighbour graph.
        adjacency = finder.radius_neighbors_graph(points, mode='connectivity').toarray()

        graph_nodes = [(k, {'pseudotime': times[k,0], 'clusters':labels[k]})
                       for k in range(len(points))]

        # Pairwise pseudotime differences via broadcasting.
        dtime = times[:,0] - times
        # Weight for pairs with zero time shift: the largest finite 1/|dt|.
        INF = 1./np.min(np.abs(dtime[dtime != 0]))

        # Upper triangle only, so every unordered pair (a < b) appears once.
        rows, cols = np.where(np.triu(adjacency, k=1) != 0)

        def weight_of(dt):
            # For forcedirected layouts the weight is the inverse time gap.
            if dt > 0:
                return 1/dt
            if dt < 0:
                return -1/dt
            return INF

        weighted_edges = [(a, b, weight_of(dt))
                          for a, b, dt in zip(rows, cols, dtime[rows, cols])]

        graph = nx.Graph()
        graph.add_nodes_from(graph_nodes)
        graph.add_weighted_edges_from(weighted_edges)
        return graph

    embedding = extract_from_df(data, ['embedding1', 'embedding2'])
    # Labels of the cells that have a velocity, i.e. the sampled cells.
    sample_cells = data['velocity1'][:embedding.shape[0]].dropna().index
    clusters = extract_from_df(data, ['clusters'])
    pseudotime = extract_from_df(data, ['pseudotime'])

    embedding_ds = embedding[sample_cells]
    pseudotime_ds = pseudotime[sample_cells]
    clusters_ds = clusters[sample_cells]

    G = build_graph(embedding_ds, pseudotime_ds, clusters_ds)

    node_ids = np.arange(len(embedding_ds), dtype=int)[:,None]
    node_table = np.hstack((embedding_ds, node_ids, pseudotime_ds, clusters_ds))
    nodes = pd.DataFrame(node_table,
                         columns=['x','y','index','pseudotime','clusters'])

    edge_rows = list(G.edges(data='weight'))
    edges = pd.DataFrame(edge_rows,
                         columns=['source', 'target', 'weight'])
    return nodes, edges
263 | 


--------------------------------------------------------------------------------
/src/celldancer/sampling.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | from numpy.core.fromnumeric import size
  4 | import scipy
  5 | from sklearn.neighbors import NearestNeighbors
  6 | import matplotlib.pyplot as plt
  7 | 
  8 | 
  9 | def sampling_neighbors(gene_unsplice_splice,step=(30,30),percentile=25):
 10 | 
 11 |     from scipy.stats import norm
 12 |     def gaussian_kernel(X, mu = 0, sigma=1):
 13 |         return np.exp(-(X - mu)**2 / (2*sigma**2)) / np.sqrt(2*np.pi*sigma**2)
 14 |     grs = []
 15 |     for dim_i in range(gene_unsplice_splice.shape[1]):
 16 |         m, M = np.min(gene_unsplice_splice[:, dim_i]), np.max(gene_unsplice_splice[:, dim_i])
 17 |         m = m - 0.025 * np.abs(M - m)
 18 |         M = M + 0.025 * np.abs(M - m)
 19 |         gr = np.linspace(m, M, step[dim_i])
 20 |         grs.append(gr)
 21 |     meshes_tuple = np.meshgrid(*grs)
 22 |     gridpoints_coordinates = np.vstack([i.flat for i in meshes_tuple]).T
 23 |     gridpoints_coordinates = gridpoints_coordinates + norm.rvs(loc=0, scale=0.15, size=gridpoints_coordinates.shape)
 24 |     
 25 |     np.random.seed(10) # set random seed
 26 |     
 27 |     nn = NearestNeighbors()
 28 | 
 29 |     neighbors_1 = min((gene_unsplice_splice[:,0:2].shape[0]-1), 20)
 30 |     nn.fit(gene_unsplice_splice[:,0:2])
 31 |     dist, ixs = nn.kneighbors(gridpoints_coordinates, neighbors_1)
 32 | 
 33 |     ix_choice = ixs[:,0].flat[:]
 34 |     ix_choice = np.unique(ix_choice)
 35 | 
 36 |     nn = NearestNeighbors()
 37 | 
 38 |     neighbors_2 = min((gene_unsplice_splice[:,0:2].shape[0]-1), 20)
 39 |     nn.fit(gene_unsplice_splice[:,0:2])
 40 |     dist, ixs = nn.kneighbors(gene_unsplice_splice[ix_choice, 0:2], neighbors_2)
 41 |     
 42 |     density_extimate = gaussian_kernel(dist, mu=0, sigma=0.5).sum(1)
 43 |     bool_density = density_extimate > np.percentile(density_extimate, percentile)
 44 |     ix_choice = ix_choice[bool_density]
 45 |     return(ix_choice)
 46 | 
 47 | def sampling_inverse(gene_unsplice_splice,target_amount=500):
 48 |     unsplice = gene_unsplice_splice[:,0]
 49 |     splice = gene_unsplice_splice[:,1]
 50 |     values = np.vstack([unsplice,splice])
 51 |     kernel = scipy.stats.gaussian_kde(values)
 52 |     p = kernel(values)
 53 |     # p2 = (1/p)/sum(1/p)
 54 |     p2 = (1/p)/sum(1/p)
 55 |     idx = np.arange(values.shape[1])
 56 |     r = scipy.stats.rv_discrete(values=(idx, p2))
 57 |     idx_choice = r.rvs(size=target_amount)
 58 |     return(idx_choice)
 59 | 
 60 | def sampling_circle(gene_unsplice_splice,target_amount=500):
 61 |     unsplice = gene_unsplice_splice[:,0]
 62 |     splice = gene_unsplice_splice[:,1]
 63 |     values = np.vstack([unsplice,splice])
 64 |     kernel = scipy.stats.gaussian_kde(values)
 65 |     p = kernel(values)
 66 |     idx = np.arange(values.shape[1])
 67 |     tmp_p = np.square((1-(p/(max(p)))**2))+0.0001
 68 |     p2 = tmp_p/sum(tmp_p)
 69 |     r = scipy.stats.rv_discrete(values=(idx, p2))
 70 |     idx_choice = r.rvs(size=target_amount)
 71 |     return(idx_choice)
 72 | 
 73 | def sampling_random(gene_unsplice_splice, target_amount=500):
 74 |     idx = np.random.choice(gene_unsplice_splice.shape[0], size = target_amount, replace=False)
 75 |     return(idx)
 76 |     
 77 | def sampling_adata(detail, 
 78 |                     para,
 79 |                     target_amount=500,
 80 |                     step=(30,30)):
 81 |     if para == 'neighbors':
 82 |         data_U_S= np.array(detail[["unsplice","splice"]])
 83 |         idx = sampling_neighbors(data_U_S,step)
 84 |     elif para == 'inverse':
 85 |         data_U_S= np.array(detail[["unsplice","splice"]])
 86 |         idx = sampling_inverse(data_U_S,target_amount)
 87 |     elif para == 'circle':
 88 |         data_U_S= np.array(detail[["unsplice","splice"]])
 89 |         idx = sampling_circle(data_U_S,target_amount)
 90 |     elif para == 'random':
 91 |         data_U_S= np.array(detail[["unsplice","splice"]])
 92 |         idx = sampling_random(data_U_S,target_amount)
 93 |     else:
 94 |         print('para is neighbors or inverse or circle')
 95 |     return(idx)
 96 | 
 97 | def sampling_embedding(detail, 
 98 |                     para,
 99 |                     target_amount=500,
100 |                     step=(30,30)):
101 | 
102 |     '''
103 |     Guangyu
104 |     '''
105 |     if para == 'neighbors':
106 |         data_U_S= np.array(detail[["embedding1","embedding2"]])
107 |         idx = sampling_neighbors(data_U_S,step)
108 |     elif para == 'inverse':
109 |         print('inverse')
110 |         data_U_S= np.array(detail[["embedding1","embedding2"]])
111 |         idx = sampling_inverse(data_U_S,target_amount)
112 |     elif para == 'circle':
113 |         data_U_S= np.array(detail[["embedding1","embedding2"]])
114 |         idx = sampling_circle(data_U_S,target_amount)
115 |     elif para == 'random':
116 |         # print('random')
117 |         data_U_S= np.array(detail[["embedding1","embedding2"]])
118 |         idx = sampling_random(data_U_S,target_amount)
119 |     else:
120 |         print('para is neighbors or inverse or circle')
121 |     return(idx)
122 | 
def adata_to_detail(data, para, gene):
    '''
    Build the per-gene "detail" dataframe from an anndata object.

    data: an anndata
    para: the layer names holding the unspliced and spliced values, in that
        order, e.g. para = ['Mu', 'Ms']
    gene: the gene (an entry of data.var.index) to extract

    return: dataframe with the columns 'gene_name', 'unsplice' and 'splice'
    '''
    gene_mask = data.var.index.isin([gene])
    gene_data = data[:, gene_mask].copy()

    # First column of each layer of the single-gene subset, stored as float32.
    columns = {
        'gene_name': gene,
        'unsplice': gene_data.layers[para[0]][:,0].copy().astype(np.float32),
        'splice': gene_data.layers[para[1]][:,0].copy().astype(np.float32),
    }
    return(pd.DataFrame(columns))
135 | 
def downsampling_embedding(data_df,para,target_amount, step, n_neighbors,expression_scale=None,projection_neighbor_choice='embedding',pca_n_components=None,umap_n=None,umap_n_components=None):
    '''
    Sample cells on the embedding and build a neighbour graph of the sample.

    data_df: long dataframe with (at least) the columns 'gene_name',
        'embedding1' and 'embedding2'; 'splice'/'unsplice' are needed when
        `expression_scale` is set, 'cellID' and 'splice' for every
        `projection_neighbor_choice` other than 'embedding'.
    para: sampling method handed to sampling_embedding.
    target_amount: number of cells handed to sampling_embedding.
    step: grid size handed to sampling_embedding; None keeps all cells.
    n_neighbors: requested number of neighbours; capped at a quarter of the
        number of sampled cells and never lower than 1.
    expression_scale: None, 'log', '2power', 'power10' or '2power_norm_multi10'.
        NOTE: 'log', '2power' and 'power10' overwrite the splice/unsplice
        columns of the dataframe that was passed in.
    projection_neighbor_choice: space in which neighbours are searched, one of
        'gene', 'pca', 'pca_norm', 'embedding', 'umap'.
    pca_n_components: number of PCA components ('pca', 'pca_norm').
    umap_n, umap_n_components: n_neighbors / n_components of UMAP ('umap').

    return: sampled coordinates, the indexs of sampled cells, and the
        connectivity neighbour graph of the sampled cells
    raises: ValueError when `projection_neighbor_choice` is not supported.
    '''
    # Reject unknown choices up front; they used to surface only at the end as
    # an unbound-variable error.
    if projection_neighbor_choice not in ('gene', 'pca', 'pca_norm', 'embedding', 'umap'):
        raise ValueError(
            "projection_neighbor_choice should be one of 'gene', 'pca', 'pca_norm', "
            "'embedding' or 'umap', got %r" % (projection_neighbor_choice,))

    # Every gene carries the same cells, so the first gene provides the embedding.
    gene = data_df['gene_name'].drop_duplicates().iloc[0]
    embedding = data_df.loc[data_df['gene_name']==gene][['embedding1','embedding2']]

    if step is not None:
        idx_downSampling_embedding = sampling_embedding(embedding,
                    para=para,
                    target_amount=target_amount,
                    step=step)
    else:
        idx_downSampling_embedding=range(0,embedding.shape[0]) # all cells
        
    def transfer(data_df,expression_scale):
        # Rescale splice/unsplice according to expression_scale.
        if expression_scale=='log':
            data_df.splice=np.log(data_df.splice+0.000001)
            data_df.unsplice=np.log(data_df.unsplice+0.000001)
        elif expression_scale=='2power':
            data_df.splice=2**(data_df.splice)
            data_df.unsplice=2**(data_df.unsplice)
        elif expression_scale=='power10':
            data_df.splice=(data_df.splice)**10
            data_df.unsplice=(data_df.unsplice)**10
        elif expression_scale=='2power_norm_multi10':
            # Min-max normalise per gene, then map to 2**(10*normalised value).
            gene_order=data_df.gene_name.drop_duplicates()
            onegene=data_df[data_df.gene_name==data_df.gene_name[0]]
            cellAmt=len(onegene)
            data_df_max=data_df.groupby('gene_name')[['splice','unsplice']].max().rename(columns={'splice': 'splice_max','unsplice': 'unsplice_max'})
            data_df_min=data_df.groupby('gene_name')[['splice','unsplice']].min().rename(columns={'splice': 'splice_min','unsplice': 'unsplice_min'})
            data_df_fin=pd.concat([data_df_max,data_df_min],axis=1).reindex(gene_order)
            data_df_fin=data_df_fin.loc[data_df_fin.index.repeat(cellAmt)]
            data_df_combined=pd.concat([data_df.reset_index(drop=True) ,data_df_fin[['splice_max','unsplice_max','splice_min','unsplice_min']].reset_index(drop=True)],axis=1)
            # The columns must exist before they can be set through attribute access.
            data_df_combined['unsplice_norm']=''
            data_df_combined['splice_norm']=''
            data_df_combined.unsplice_norm=(data_df_combined.unsplice-data_df_combined.unsplice_min)/(data_df_combined.unsplice_max-data_df_combined.unsplice_min)
            data_df_combined.splice_norm=(data_df_combined.splice-data_df_combined.splice_min)/(data_df_combined.splice_max-data_df_combined.splice_min)
            data_df_combined.unsplice=2**(data_df_combined.unsplice_norm*10)
            data_df_combined.splice=2**(data_df_combined.splice_norm*10)
            data_df=data_df_combined

        return (data_df)

    data_df=transfer(data_df,expression_scale)

    def sampled_splice_matrix():
        # cells x genes matrix of spliced values, rows ordered like the cells
        # of the first gene and restricted to the sampled cells.
        cellID = data_df.loc[data_df['gene_name']==gene]['cellID']
        data_df_pivot=data_df.pivot(index='cellID', columns='gene_name', values='splice').reindex(cellID)
        return data_df_pivot.iloc[idx_downSampling_embedding]

    if projection_neighbor_choice=='gene':
        embedding_downsampling = sampled_splice_matrix()
    elif projection_neighbor_choice=='pca': # not use
        from sklearn.decomposition import PCA
        embedding_downsampling_0 = sampled_splice_matrix()
        pca=PCA(n_components=pca_n_components)
        pca.fit(embedding_downsampling_0)
        embedding_downsampling = pca.transform(embedding_downsampling_0)[:,range(pca_n_components)]
    elif projection_neighbor_choice=='pca_norm':
        from sklearn.decomposition import PCA
        embedding_downsampling_0 = sampled_splice_matrix()
        pca=PCA(n_components=pca_n_components)
        pca.fit(embedding_downsampling_0)
        embedding_downsampling_trans = pca.transform(embedding_downsampling_0)[:,range(pca_n_components)]
        # np.ptp instead of the ndarray.ptp method, which NumPy 2.0 removed.
        embedding_downsampling_trans_norm=(embedding_downsampling_trans - embedding_downsampling_trans.min(0)) / np.ptp(embedding_downsampling_trans, axis=0)#normalize
        embedding_downsampling_trans_norm_mult10=embedding_downsampling_trans_norm*10 #optional
        embedding_downsampling=embedding_downsampling_trans_norm_mult10**5 # optional
    elif projection_neighbor_choice=='embedding':
        embedding_downsampling = embedding.iloc[idx_downSampling_embedding][['embedding1','embedding2']]
    else: # 'umap'
        import umap
        embedding_downsampling_0 = sampled_splice_matrix()
        fit = umap.UMAP(
            n_neighbors=umap_n,
            min_dist=0.1,
            n_components=umap_n_components,
            metric='euclidean'
        )
        embedding_downsampling = fit.fit_transform(embedding_downsampling_0)

    # Never ask for more than a quarter of the sampled cells, but at least one.
    n_neighbors = min(int((embedding_downsampling.shape[0])/4), n_neighbors)
    if n_neighbors==0:
        n_neighbors=1
    nn = NearestNeighbors(n_neighbors=n_neighbors)
    nn.fit(embedding_downsampling) 
    embedding_knn = nn.kneighbors_graph(mode="connectivity")
    return(embedding_downsampling, idx_downSampling_embedding, embedding_knn)
243 | 
def downsampling(data_df, gene_list, downsampling_ixs):
    '''
    Keep the same sampled cells for every requested gene.

    data_df: long dataframe with a 'gene_name' column.
    gene_list: genes to keep, in output order.
    downsampling_ixs: positional indices of the cells to keep within each
        gene's rows.

    return: the selected rows of all genes stacked in `gene_list` order, with
        their original index labels; an empty dataframe when `gene_list` is empty.
    '''
    # Collect the pieces and concatenate once: DataFrame.append was removed in
    # pandas 2.0 and re-copied the accumulated frame on every iteration.
    pieces = [data_df[data_df['gene_name']==gene].iloc[downsampling_ixs]
              for gene in gene_list]
    if not pieces:
        return(pd.DataFrame())
    return(pd.concat(pieces))
254 | 


--------------------------------------------------------------------------------
/src/celldancer/simulation.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import matplotlib.pyplot as plt
  3 | import pandas as pd
  4 | from torch.utils.data import *
  5 | import anndata
  6 | 
  7 | from scipy.integrate import solve_ivp
  8 | 
  9 | def _generate_points(u0_start, s0_start, alpha, beta, gamma, t1, t2, samples):
 10 | 
 11 |     def trans_dynamics(t, expr): 
 12 |         s = expr[0]
 13 |         u = expr[1]
 14 |         du_dt = alpha - beta*u
 15 |         ds_dt = beta*u - gamma*s
 16 |         return [ds_dt, du_dt]
 17 | 
 18 |     #print("t1 and t2:", t1, t2)
 19 |     t_space = np.linspace(t1, t2, samples)
 20 |     num_sol = solve_ivp(trans_dynamics, [0, t2], [s0_start, u0_start], method='RK45', dense_output=True)
 21 |     XY_num_sol = num_sol.sol(t_space)
 22 |     S, U = XY_num_sol[0], XY_num_sol[1]
 23 |     return U, S
 24 | 
 25 | def _jitter(U, S, scale):
 26 |     S = S + np.random.normal(loc=0.0, scale=scale*np.percentile(S, 99) / 10, size=np.size(S))
 27 |     U = U + np.random.normal(loc=0.0, scale=scale*np.percentile(U, 99) / 10, size=np.size(U))
 28 |     S1 = S[(S>0)&(U>0)]
 29 |     U1 = U[(S>0)&(U>0)]
 30 |     S1, U1 = np.clip(S, 0, None), np.clip(U, 0, None)
 31 |     return U1, S1
 32 | 
 33 | def _simulate(u0_start, s0_start, alpha, beta, gamma, t1, t2, samples, dt=0.001, scale=1):
 34 |     u0, s0 = _generate_points(u0_start, s0_start, alpha, beta, gamma, t1, t2, samples)
 35 |     u0_end, s0_end = u0[-1], s0[-1]
 36 |     #u0, s0 = _jitter(u0, s0, scale)
 37 |     u1 = u0 + (alpha - beta*u0)*dt
 38 |     s1 = s0 + (beta*u0 - gamma*s0)*dt
 39 | 
 40 |     expr = pd.DataFrame(u0, columns=['u0'])
 41 |     expr['s0'] = s0
 42 |     expr['u1'] = u1
 43 |     expr['s1'] = s1
 44 |     expr['alpha'] = alpha
 45 |     expr['beta'] = beta
 46 |     expr['gamma'] = gamma
 47 |     return expr, (u0_end, s0_end)
 48 | 
 49 | def _simulate_without_t( u0_start, s0_start, alpha, beta, gamma, percent_start_u, percent_end_u, samples, dt=0.001, scale=1):
 50 |     '''percentage_u: u_end/u_max'''
 51 | 
 52 |     def inversed_u(u, expr): 
 53 |         t = expr[0]
 54 |         dt_du = 1/(alpha - beta*u)
 55 |         return dt_du
 56 | 
 57 |     if alpha != 0:
 58 |         u_max = alpha/beta
 59 |         u_start = u0_start + (u_max-u0_start) * percent_start_u/100
 60 |         u_end = u0_start + (u_max-u0_start)  * percent_end_u/100
 61 |     else:
 62 |         u_max = u0_start
 63 |         u_start = u_max * (100-percent_start_u)/100
 64 |         u_end = u_max * (100-percent_end_u)/100
 65 | 
 66 |     t_sol = solve_ivp(inversed_u, [u0_start, u_end], [0], method='RK45', dense_output=True)
 67 |     t1 = t_sol.sol(u_start)[0]  
 68 |     t2 = t_sol.sol(u_end)[0]  
 69 |     return _simulate(u0_start, s0_start, alpha, beta, gamma, t1, t2, samples, dt, scale)
 70 | 
 71 | def forward(alpha, beta, gamma, percent_u1, percent_u2, samples, dt=0.001, noise_level=1):
 72 |     expr, end = _simulate_without_t(0, 0, alpha, beta, gamma, percent_u1, percent_u2, samples, dt, noise_level)
 73 |     return expr
 74 | 
 75 | def backward(alpha, beta, gamma, percent_u1, percent_u2, samples, dt=0.001, noise_level=1):
 76 |     u0_start = alpha/beta
 77 |     s0_start = alpha/gamma
 78 |     expr, end = _simulate_without_t(u0_start, s0_start, 0, beta, gamma, percent_u1, percent_u2, samples, dt, noise_level)
 79 |     return expr
 80 | 
 81 | def two_alpha(alpha1, alpha2, beta1, beta2, gamma1, gamma2, percent_u1, percent_u2, samples1, samples2, dt=0.001, noise_level=1):
 82 |     expr1, (new_u0_start, new_s0_start) = _simulate_without_t(0, 0, alpha1, beta1, gamma1, 0, percent_u1, samples1, dt, noise_level)
 83 |     expr2, end2  = _simulate_without_t(new_u0_start, new_s0_start, alpha2, beta2, gamma2, 0, percent_u2, samples2, dt, noise_level)
 84 |     expr = expr1.append(expr2)
 85 |     expr.index = range(len(expr))
 86 |     return expr
 87 | 
 88 | def boost_path(alpha1, alpha2, beta1, beta2, gamma1, gamma2, percent_u1, percent_u2, samples1, samples2, dt=0.001, noise_level=1):
 89 | 
 90 |     #expr1, (new_u0_start, new_s0_start) = _simulate_without_t(0, 0, alpha1, beta1, gamma1, 0, percent_u1, samples1, dt, noise_level)
 91 |     #expr2, end2 = _simulate_without_t(new_u0_start, new_s0_start, alpha2, beta2, gamma2, 0, percent_u2, samples2, dt, noise_level
 92 |     expr1, end1 = _simulate_without_t(0, 0, alpha1, beta1, gamma1, 0, percent_u1, samples1, dt, noise_level)
 93 |     expr2, end2 = _simulate_without_t(0, 0, alpha2, beta2, gamma2, 0, percent_u2, samples2, dt, noise_level)
 94 | 
 95 |     # boosted induction starts from the end of the previous induction.
 96 |     expr2['u0'] += alpha1/beta1
 97 |     expr2['s0'] += alpha1/gamma1
 98 |     expr2['u1'] += alpha1/beta1
 99 |     expr2['s1'] += alpha1/gamma1
100 |     expr = expr1.append(expr2)
101 |     expr.index = range(len(expr))
102 |     return expr
103 | 
def two_alpha2(alpha1, alpha2, beta1, beta2, gamma1, gamma2, percent_u1, percent_u2, samples1, samples2, dt=0.001, noise_level=1):
    """
    Simulate two independent inductions that both start from (0, 0).

    Returns both segments stacked in one dataframe with a fresh 0..n-1 index.
    """
    expr1, _ = _simulate_without_t(0, 0, alpha1, beta1, gamma1, 0, percent_u1, samples1, dt, noise_level)
    expr2, _ = _simulate_without_t(0, 0, alpha2, beta2, gamma2, 0, percent_u2, samples2, dt, noise_level)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat([expr1, expr2], ignore_index=True)
110 | 
def two_alpha3(alpha1, alpha2, beta1, beta2, gamma1, gamma2, percent_u1, percent_u2, samples1, samples2, dt=0.001, noise_level=0.02):
    """
    Simulate two consecutive segments that start from a pre-run end point.

    A run with the *2 rates from (0, 0) up to 99.9% provides the start point
    only (its cells are discarded). The first returned segment uses the *1
    rates from there, the second continues from its end with the *2 rates.

    Returns both segments stacked in one dataframe with a fresh 0..n-1 index.
    """
    _, (new_u0_start, new_s0_start) = _simulate_without_t(0, 0, alpha2, beta2, gamma2, 0, 99.9, samples2, dt, noise_level)
    expr1, (new_u0_start2, new_s0_start2) = _simulate_without_t(new_u0_start, new_s0_start, alpha1, beta1, gamma1, 0, percent_u1, samples1, dt, noise_level)
    expr2, _ = _simulate_without_t(new_u0_start2, new_s0_start2, alpha2, beta2, gamma2, 0, percent_u2, samples2, dt, noise_level)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat([expr1, expr2], ignore_index=True)
118 | 
def generate_with_df(gene_info, dt=0.001, noise_level=0.2):
    """
    Simulate the segments described by the rows of `gene_info`.

    gene_info: dataframe with the columns start_u, start_s, alpha, beta,
        gamma, start_pct, end_pct and samples, one row per segment, read by the
        labels 0..n-1. A row whose start_u or start_s is missing (None/NaN)
        continues from the end point of the previous segment.
    dt: accepted but not used here.
    noise_level: noise scale handed to _jitter for the u0/s0 columns.

    Returns (gene_info, expr), or None (after printing a hint) when the first
    row has no start point.
    """
    segments = []
    last_u, last_s = None, None
    for i in range(len(gene_info.index)):
        start_u, start_s = gene_info['start_u'][i], gene_info['start_s'][i]
        alpha, beta, gamma = gene_info['alpha'][i], gene_info['beta'][i], gene_info['gamma'][i]
        start_pct, end_pct, samples = gene_info['start_pct'][i], gene_info['end_pct'][i], gene_info['samples'][i]
        # pd.isna also catches NaN, which is what None turns into in a numeric column.
        if pd.isna(start_u) or pd.isna(start_s):
            if last_u is None or last_s is None:
                print("start_u and start_s should not be None at the first line.")
                return None
            start_u, start_s = last_u, last_s
        expr_tmp, (last_u, last_s) = _simulate_without_t(start_u, start_s, alpha, beta, gamma, start_pct, end_pct, samples)
        segments.append(expr_tmp)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    expr = pd.concat(segments, ignore_index=True) if segments else pd.DataFrame()
    expr.u0, expr.s0 = _jitter(expr.u0, expr.s0, noise_level)
    return gene_info, expr
137 | 
def adata_to_detail(data, para, gene):
    """
    Build the per-gene dataframe of a simulated gene from an anndata object.

    data: an anndata with the layers named in `para` and the var columns
        'path1_pct' and 'path2_pct'.
    para: layer names in the order unspliced, spliced, alpha, beta, gamma.
    gene: the gene (an entry of data.var.index) to extract.

    Returns a dataframe with the columns gene_list, u0, s0, embedding1,
    embedding2, alpha, beta, gamma, path1_pct and path2_pct; the embedding
    columns are copies of u0 and s0.
    """
    gene_data = data[:, data.var.index.isin([gene])].copy()

    def first_column(layer_name):
        # First column of a layer of the single-gene subset, as float32.
        return gene_data.layers[layer_name][:,0].copy().astype(np.float32)

    u0 = first_column(para[0])
    s0 = first_column(para[1])
    alpha = first_column(para[2])
    beta = first_column(para[3])
    gamma = first_column(para[4])

    detail = pd.DataFrame({'gene_list':gene, 'u0':u0, 's0':s0, 'embedding1':u0, 'embedding2':s0, 'alpha':alpha, 'beta':beta, 'gamma':gamma})
    for pct_column in ('path1_pct', 'path2_pct'):
        detail[pct_column] = gene_data.var[pct_column].to_numpy()[0]
    return(detail)
153 | 
def generate(type, gene_num, alpha1, alpha2, beta1, beta2, gamma1, gamma2, path1_pct, path2_pct, path1_sample, path2_sample, noise_level):
    """
    Simulate `gene_num` genes of one kinetic type and return them in long format.

    type: 'forward' (the historical spelling 'forwad' is still accepted),
        'backward', 'two_alpha', 'two_alpha2', 'two_alpha3' or 'boost'.
        'forward' and 'backward' only use the *1 rates and path1_sample.
    alpha*/beta*/gamma*: kinetic rates of the first and second path.
    path1_pct, path2_pct: end percentages of the two paths.
    path1_sample, path2_sample: number of cells of the two paths.
    noise_level: noise scale handed to _jitter for the u0/s0 values.

    Returns a dataframe with one row per cell and gene; u0/s0/gene_list are
    renamed to unsplice/splice/gene_name and the columns cellID and clusters
    (None) are added.
    Raises ValueError when `type` is not supported.
    """
    two_path_kinetics = {
        "two_alpha": two_alpha,
        "two_alpha2": two_alpha2,
        "two_alpha3": two_alpha3,
        "boost": boost_path,
    }
    value_columns = ('u0', 's0', 'u1', 's1', 'alpha', 'beta', 'gamma')
    gene_info_columns = ['gene_name', 'type', 'alpha1', 'alpha2', 'beta1', 'beta2', 'gamma1', 'gamma2', 'path1_pct', 'path2_pct', 'samples']

    cell_num=path1_sample+path2_sample
    values_by_gene = {column: {} for column in value_columns}
    gene_rows = []

    for i in range(gene_num):
        # "forwad" was the only spelling the original comparison matched.
        if type in ("forward", "forwad"):
            expr = forward(alpha=alpha1, beta=beta1, gamma=gamma1, percent_u1=0.1, percent_u2=99.9,  samples=path1_sample, noise_level=noise_level)
        elif type == "backward":
            expr = backward(alpha=alpha1, beta=beta1, gamma=gamma1, percent_u1=0.1, percent_u2=99.9,  samples=path1_sample, noise_level=noise_level)
        elif type in two_path_kinetics:
            expr = two_path_kinetics[type](alpha1=alpha1, alpha2=alpha2, beta1=beta1, beta2=beta2, gamma1=gamma1, gamma2=gamma2, percent_u1=path1_pct, percent_u2=path2_pct,
                    samples1=path1_sample, samples2=path2_sample, noise_level=noise_level)
        else:
            # Used to print "type not match" and then crash on the unbound result.
            raise ValueError("type not match: %r" % (type,))

        expr.u0, expr.s0 = _jitter(expr.u0, expr.s0, noise_level)
        expr = expr.head(cell_num)
        gene_name = "simulation"+str(i).zfill(3)
        for column in value_columns:
            values_by_gene[column][gene_name] = expr[column]
        gene_rows.append({'gene_name':gene_name, 'type':"multi_path", 'alpha1':alpha1, 'alpha2':alpha2, 'beta1':beta1, 'beta2':beta2, 'gamma1':gamma1, 'gamma2':gamma2, 'path1_pct':path1_pct, 'path2_pct':path2_pct, 'samples':len(expr)})

    # One cells x genes table per quantity.
    u0s, s0s, u1s, s1s, alphas, betas, gammas = (pd.DataFrame(values_by_gene[column]) for column in value_columns)
    # Built in one go (DataFrame.append was removed in pandas 2.0); object
    # dtype keeps the values exactly as given.
    gene_info = pd.DataFrame(gene_rows, columns=gene_info_columns, dtype=object)

    cell_info = pd.DataFrame({'barcode': s0s.index})
    adata = anndata.AnnData(
        X=s0s.to_numpy(),
        obs = cell_info,
        var = gene_info,
        layers = {
            'u0s':u0s.to_numpy(),
            's0s': s0s.to_numpy(),
            'u1s':u1s.to_numpy(),
            's1s': s1s.to_numpy(),
            'alphas': alphas.to_numpy(),
            'betas': betas.to_numpy(),
            'gammas': gammas.to_numpy() }
    )
    adata.var_names = gene_info['gene_name']

    details = [adata_to_detail(adata, para=['u0s', 's0s', 'alphas', 'betas', "gammas"], gene=g)
               for g in adata.var_names]
    data_onegene = pd.concat(details) if details else pd.DataFrame()
    data_onegene=data_onegene.rename(columns={"u0": "unsplice", "s0": "splice","gene_list": "gene_name"})
    data_onegene.loc[:,'cellID']=list(range(len(data_onegene)))
    data_onegene.loc[:,'clusters']=None
    return data_onegene
217 | 
def generate_mono(alpha1, alpha2, beta1, beta2, gamma1, gamma2, path1_pct, path2_pct, path1_sample, path2_sample, noise_level, gene_num=1):
    """Generate `gene_num` genes with the "two_alpha" kinetic type of generate()."""
    return generate(
        type="two_alpha",
        gene_num=gene_num,
        alpha1=alpha1, alpha2=alpha2,
        beta1=beta1, beta2=beta2,
        gamma1=gamma1, gamma2=gamma2,
        path1_pct=path1_pct, path2_pct=path2_pct,
        path1_sample=path1_sample, path2_sample=path2_sample,
        noise_level=noise_level,
    )
220 | 
def generate_tran_boost(alpha1, alpha2, beta1, beta2, gamma1, gamma2, path1_pct, path2_pct, path1_sample, path2_sample, noise_level, gene_num=1):
    """Generate `gene_num` genes with the "two_alpha" kinetic type of generate()."""
    return generate(
        type="two_alpha",
        gene_num=gene_num,
        alpha1=alpha1, alpha2=alpha2,
        beta1=beta1, beta2=beta2,
        gamma1=gamma1, gamma2=gamma2,
        path1_pct=path1_pct, path2_pct=path2_pct,
        path1_sample=path1_sample, path2_sample=path2_sample,
        noise_level=noise_level,
    )
223 | 
def generate_forward(alpha1, alpha2, beta1, beta2, gamma1, gamma2, path1_pct, path2_pct, path1_sample, path2_sample, noise_level, gene_num=1):
    """Generate `gene_num` genes with the "two_alpha2" kinetic type of generate()."""
    return generate(
        type="two_alpha2",
        gene_num=gene_num,
        alpha1=alpha1, alpha2=alpha2,
        beta1=beta1, beta2=beta2,
        gamma1=gamma1, gamma2=gamma2,
        path1_pct=path1_pct, path2_pct=path2_pct,
        path1_sample=path1_sample, path2_sample=path2_sample,
        noise_level=noise_level,
    )
226 | 
def generate_backward(start_s1, start_s2, start_u1, start_u2,alpha1, alpha2, beta1, beta2, gamma1, gamma2, path1_sample, path2_sample,noise_level=None):
    """
    Simulate two segments that each start from their own (unsplice, splice) point.

    start_u*/start_s*: start point of each segment.
    alpha*/beta*/gamma*: kinetic rates of each segment.
    path1_sample, path2_sample: number of cells of each segment.
    noise_level: noise scale for the u0/s0 values; None uses the default of
        generate_with_df.

    Returns the simulated cells with u0/s0 renamed to unsplice/splice and the
    columns embedding1/embedding2 (copies of u0/s0), cellID and clusters (None).
    Each segment covers 0 to 99 percent of its way.
    """
    # Built in one go (DataFrame.append was removed in pandas 2.0); object
    # dtype keeps a None start value as None instead of turning it into NaN.
    gene_info = pd.DataFrame(
        [
            {'gene_name':'g1', 'start_u':start_u1, 'start_s':start_s1, 'alpha':alpha1, 'beta':beta1, 'gamma':gamma1, 'start_pct':0, 'end_pct':99, 'samples':path1_sample},
            {'gene_name':'g1', 'start_u':start_u2, 'start_s':start_s2, 'alpha':alpha2, 'beta':beta2, 'gamma':gamma2, 'start_pct':0, 'end_pct':99, 'samples':path2_sample},
        ],
        columns = ['gene_name', 'start_u', 'start_s', 'alpha', 'beta', 'gamma', 'start_pct', 'end_pct', 'samples'],
        dtype=object,
    )

    # noise_level must go by keyword: the second positional parameter of
    # generate_with_df is dt, so the noise level used to be silently dropped.
    if noise_level is None:
        gene_info, expr = generate_with_df(gene_info)
    else:
        gene_info, expr = generate_with_df(gene_info, noise_level=noise_level)

    expr['embedding1']=expr['u0']
    expr['embedding2']=expr['s0']
    expr=expr.rename(columns={"u0": "unsplice", "s0": "splice","gene_list": "gene_name"})
    expr.loc[:,'cellID']=list(range(len(expr)))
    expr.loc[:,'clusters']=None
    return expr
239 | 
def generate_by_each_cell(df, t, dt=0.001, noise_level=1):
    """
    Simulate one cell per row of `df`, each with its own kinetic rates.

    df: dataframe with the columns alpha, beta and gamma, read by the labels
        0..n-1.
    t: total time; every row advances the state by t / len(df), starting from
        u = 0, s = 0 and continuing from the previous row's end point.
    dt: step used for the u1/s1 columns.
    noise_level: noise scale handed to _jitter for the u0/s0 values.

    Returns the simulated cells with a 0..n-1 index and a column `t` holding
    the time of each cell.
    """
    ti = t/len(df.index)

    last_u0, last_s0 = 0, 0
    steps = []
    for i in range(len(df.index)):
        sub_expr, (last_u0, last_s0) = _simulate(
            u0_start = last_u0, s0_start=last_s0, 
            alpha=df['alpha'][i], beta=df['beta'][i], gamma=df['gamma'][i], 
            t1=ti, t2=ti, 
            samples=1, 
            dt=dt, scale=noise_level)
        steps.append(sub_expr)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    expr = pd.concat(steps, ignore_index=True)
    expr.u0, expr.s0 = _jitter(expr.u0, expr.s0, noise_level)
    expr['t'] = ti * (expr.index+1)
    return expr
262 | 
263 | def simulate(kinetic_type,
264 |              alpha1=None, 
265 |              alpha2=None, 
266 |              beta1=None, 
267 |              beta2=None, 
268 |              gamma1=None, 
269 |              gamma2=None, 
270 |              start_splice1=None,
271 |              start_splice2=None,
272 |              start_unsplice1=None,
273 |              start_unsplice2=None,
274 |              path1_pct=None, 
275 |              path2_pct=None, 
276 |              path1_cell_number=None,
277 |              path2_cell_number=None, 
278 |              noise_level=0.2):
279 |     
280 |     """ 
281 |     Simulate a gene with the kinetic type of mono-kinetic, multi-forward, multi-backward, or transcriptional boost.
282 | 
283 |     Arguments
284 |     ---------
285 |     kinetic_type: `pandas.DataFrame`
286 |         kinetic_type could be selected from ['mono', 'multi_forward', 'multi_backward', 'tran_boost']
287 | 
288 |     alpha1: `float` (default: `None`)
289 |         The simulated alpha (transcriptional rate) for the first lineage. This parameter is valid when kinetic_type is set to 'mono', 'multi_forward', or 'tran_boost'.
290 | 
291 |     alpha2: `float` (default: `None`)
292 |         The simulated alpha (transcriptional rate) for the second lineage. This parameter is valid when kinetic_type is set to 'multi_forward' or 'tran_boost'.
293 | 
294 |     beta1: `float` (default: `None`)
295 |         The simulated beta (splicing rate) for the first lineage.
296 | 
297 |     beta2: `float` (default: `None`)
298 |         The simulated beta (splicing rate) for the second lineage.
299 |         
300 |     gamma1: `float` (default: `None`)
301 |         The simulated gamma (degration rate) for the first lineage.
302 | 
303 |     gamma2: `float` (default: `None`)
304 |         The simulated gamma (degration rate) for the second lineage.
305 |         
306 |     start_splice1: optional, `float` (default: `None`)
307 |         The simulated spliced abundance for the first lineage. Cells start from a region at a point of (start_splice1, start_unsplice1) to decrease. This parameter is valid when kinetic_type is set to 'multi_backward'.
308 |     
309 |     start_splice2: optional, `float` (default: `None`)
310 |         The simulated spliced abundance for the second lineage. Cells start from a region at a point of (start_splice2, start_unsplice2) to decrease. This parameter is valid when kinetic_type is set to 'multi_backward'.
311 | 
312 |     start_unsplice1: optional, `float` (default: `None`)
313 |         The simulated unspliced abundance for the first lineage. Cells start from a region at a point of (start_splice1, start_unsplice1) to decrease. This parameter is valid when kinetic_type is set to 'multi_backward'.
314 |     
315 |     start_unsplice2: optional, `float` (default: `None`)
316 |         The simulated unspliced abundance for the second lineage. Cells start from a region at a point of (start_splice2, start_unsplice2) to decrease. This parameter is valid when kinetic_type is set to 'multi_backward'.
317 | 
318 |     path1_pct: optional, `float` (default: `None`)
319 |         To decrease the bias of cell distribution at the steady point in the first lineage. This parameter is valid when kinetic_type is set to 'mono', 'multi_forward' or 'tran_boost'.
320 |     
321 |     path2_pct: optional, `float` (default: `None`)
322 |         To decrease the bias of cell distribution at the steady point in the second lineage. This parameter is valid when kinetic_type is set to 'mono', 'multi_forward' or 'tran_boost'.
323 |         
324 |     path1_cell_number: `float` (default: `None`)
325 |         The number of cells to be generated in the first lineage.
326 |     
327 |     path2_cell_number: `float` (default: `None`)
328 |         The number of cells to be generated in the second lineage.
329 |         
330 |     noise_level: `float` (default: `0.2`)
331 |         The noise level to be set.
332 |             
333 |     Returns
334 |     -------
335 |     df: pandas.DataFrame
336 |         The dataframe of one simulated gene.
337 | 
338 |  
339 |     -------
340 | 
341 |     Example usage:
342 | 
343 |     .. code-block:: python
344 | 
345 |         import celldancer.simulation as cdsim
346 |         import matplotlib.pyplot as plt
347 | 
348 |         # Mono-kinetic
349 |         plt.figure(figsize=(5,5))
350 |         gene=cdsim.simulate(kinetic_type='mono',
351 |                             alpha1=1,
352 |                             alpha2=0,
353 |                             beta1=1,
354 |                             beta2=1,
355 |                             gamma1=1,
356 |                             gamma2=1,
357 |                             path1_pct=99,
358 |                             path2_pct=99,
359 |                             path1_cell_number=1000,
360 |                             path2_cell_number=1000)
361 |         plt.scatter(gene.splice,gene.unsplice,c='#95D9EF',alpha=0.5)
362 | 
363 |         # Multi-lineage forward branching
364 |         plt.figure(figsize=(5,5))
365 |         gene=cdsim.simulate(kinetic_type='multi_forward',
366 |                             alpha1=5,
367 |                             alpha2=1,
368 |                             beta1=1,
369 |                             beta2=0.5,
370 |                             gamma1=5,
371 |                             gamma2=0.25,
372 |                             path1_pct=99,
373 |                             path2_pct=99,
374 |                             path1_cell_number=1000,
375 |                             path2_cell_number=1000)
376 |         plt.scatter(gene.splice,gene.unsplice,c='#95D9EF',alpha=0.5)
377 | 
378 |         # Multi-lineage backward branching
379 |         plt.figure(figsize=(5,5))
380 |         gene=cdsim.simulate(kinetic_type='multi_backward',
381 |                             beta1=1,
382 |                             beta2=1,
383 |                             gamma1=1,
384 |                             gamma2=1,
385 |                             start_splice1=1,
386 |                             start_splice2=1.5,
387 |                             start_unsplice1=1,
388 |                             start_unsplice2=0.2,
389 |                             path1_cell_number=1000,
390 |                             path2_cell_number=1000)
391 |         plt.scatter(gene.splice,gene.unsplice,c='#95D9EF',alpha=0.5)
392 | 
393 |         # Transcriptional boost
394 |         plt.figure(figsize=(5,5))
395 |         gene=cdsim.simulate(kinetic_type='tran_boost',
396 |                             alpha1=2,
397 |                             alpha2=5,
398 |                             beta1=2,
399 |                             beta2=2,
400 |                             gamma1=1,
401 |                             gamma2=1,
402 |                             path1_pct=99,
403 |                             path2_pct=80,
404 |                             path1_cell_number=1000,
405 |                             path2_cell_number=1000)
406 |         plt.scatter(gene.splice,gene.unsplice,c='#95D9EF',alpha=0.5)
407 | 
408 |     .. image:: _static/sim.png
409 |       :width: 100%
410 |       :alt: sim
411 | 
412 |     """  
413 |     
414 | 
415 |     if kinetic_type=='mono':
416 |         df=generate_mono(alpha1=alpha1, 
417 |                          alpha2=alpha2, 
418 |                          beta1=beta1, 
419 |                          beta2=beta2, 
420 |                          gamma1=gamma1, 
421 |                          gamma2=gamma2, 
422 |                          path1_pct=path1_pct, 
423 |                          path2_pct=path2_pct, 
424 |                          path1_sample=path1_cell_number, 
425 |                          path2_sample=path2_cell_number, 
426 |                          noise_level=noise_level)
427 | 
428 |     elif kinetic_type=='multi_forward':
429 |         df=generate_forward(alpha1=alpha1, 
430 |                             alpha2=alpha2, 
431 |                             beta1=beta1, 
432 |                             beta2=beta2, 
433 |                             gamma1=gamma1, 
434 |                             gamma2=gamma2, 
435 |                             path1_pct=path1_pct, 
436 |                             path2_pct=path2_pct, 
437 |                             path1_sample=path2_cell_number, 
438 |                             path2_sample=path2_cell_number, 
439 |                             noise_level=noise_level)
440 | 
441 |     elif kinetic_type=='multi_backward':
442 |         df=generate_backward(start_s1=start_splice1, 
443 |                              start_s2=start_splice2, 
444 |                              start_u1=start_unsplice1, 
445 |                              start_u2=start_unsplice2,
446 |                              alpha1=0, 
447 |                              alpha2=0, 
448 |                              beta1=beta1, 
449 |                              beta2=beta2, 
450 |                              gamma1=gamma1, 
451 |                              gamma2=gamma2, 
452 |                              path1_sample=path1_cell_number, 
453 |                              path2_sample=path2_cell_number, 
454 |                              noise_level=noise_level)
455 |         
456 |     elif kinetic_type=='tran_boost':
457 |         df=generate_tran_boost(alpha1=alpha1, 
458 |                                alpha2=alpha2, 
459 |                                beta1=beta1, 
460 |                                beta2=beta2, 
461 |                                gamma1=gamma1, 
462 |                                gamma2=gamma2, 
463 |                                path1_pct=path1_pct, 
464 |                                path2_pct=path2_pct, 
465 |                                path1_sample=path1_cell_number, 
466 |                                path2_sample=path2_cell_number, 
467 |                                noise_level=noise_level)
468 | 
469 | 
470 |     else:
471 |         kinetic_type_list=['mono', 'multi_forward', 'multi_backward', 'tran_boost']
472 |         print('Kinetic type in ',kinetic_type_list,' could be choose from.')
473 | 
474 |     return(df)


--------------------------------------------------------------------------------
/src/celldancer/utilities.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from scipy.sparse import csr_matrix
  3 | import scipy
  4 | import pandas as pd
  5 | import anndata as ad
  6 | from sklearn.neighbors import NearestNeighbors
  7 | from statsmodels.nonparametric.kernel_regression import KernelReg
  8 | 
  9 | # progress bar
 10 | import contextlib
 11 | import joblib
 12 | from tqdm import tqdm
 13 | 
 14 | @contextlib.contextmanager
 15 | def tqdm_joblib(tqdm_object):
 16 |     """Context manager to patch joblib to report into tqdm progress bar given as argument"""
 17 |     class TqdmBatchCompletionCallback(joblib.parallel.BatchCompletionCallBack):
 18 |         def __call__(self, *args, **kwargs):
 19 |             tqdm_object.update(n=self.batch_size)
 20 |             return super().__call__(*args, **kwargs)
 21 | 
 22 |     old_batch_callback = joblib.parallel.BatchCompletionCallBack
 23 |     joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback
 24 |     try:
 25 |         yield tqdm_object
 26 |     finally:
 27 |         joblib.parallel.BatchCompletionCallBack = old_batch_callback
 28 |         tqdm_object.close()
 29 | 
 30 | def _non_para_kernel(X,Y,down_sample_idx):
 31 |     # (no first cls),pseudotime r square calculation
 32 |     # this version has downsampling section
 33 |     # TO DO WHEN ONLY USING ONE GENE, WILL CAUSL PROBLEM WHEN COMBINING
 34 |     # Usage: Gene pseudotime fitting and r square (moved to utilities)
 35 |     # input: X,Y
 36 |     # return: estimator, r_square
 37 |     # example: 
 38 |     # X = pd.DataFrame(np.arange(100)*np.pi/100)
 39 |     # Y = pd.DataFrame(np.sin(X)+np.random.normal(loc = 0, scale = 0.5, size = (100,1)))
 40 |     # estimator,r_square=non_para_kernel(X,Y)
 41 |     
 42 |     # X2=pd.DataFrame(np.random.randint(0,100,size=[200,1]))
 43 |     # Y2=pd.DataFrame(np.random.normal(9,5,size=[200]))
 44 |     # X = pd.DataFrame(np.arange(100)*np.pi/100)
 45 |     # Y = pd.DataFrame(np.sin(X)+np.random.normal(loc = 0, scale = 0.5, size = (100,1)))
 46 |     from statsmodels.nonparametric.kernel_regression import KernelReg
 47 |     import matplotlib.pyplot as plt
 48 |     print('_non_para_kernel_t4')
 49 |     Y_sampled=Y[X['index'].isin(down_sample_idx)]
 50 |     X_sampled=X[X['index'].isin(down_sample_idx)].time
 51 |     kde=KernelReg(endog=Y_sampled,
 52 |                            exog=X_sampled,
 53 |                            var_type='c',
 54 |                            )
 55 |     #X=merged.time
 56 |     #Y=merged.s0
 57 |     #print(kde.r_squared())
 58 |     n=X_sampled.shape[0]
 59 | 
 60 |     estimator = kde.fit(X_sampled)
 61 |     estimator = np.reshape(estimator[0],[n,1])
 62 | 
 63 |     return(estimator,kde.r_squared())
 64 | 
 65 | def getidx_downSampling_embedding(load_cellDancer,cell_choice=None):
 66 |     # find the origional id
 67 | 
 68 |     if cell_choice is not None:
 69 |         load_cellDancer=load_cellDancer[load_cellDancer.cellIndex.isin(cell_choice)]
 70 |         
 71 |     embedding=load_cellDancer.loc[load_cellDancer.gene_name==list(load_cellDancer.gene_name)[0]][['embedding1','embedding2']]
 72 | 
 73 |     # get transfer id
 74 |     from .sampling import sampling_embedding
 75 |     idx_downSampling_embedding = sampling_embedding(embedding,
 76 |                 para='neighbors',
 77 |                 target_amount=0,
 78 |                 step=(30,30) # TODO: default is 30 
 79 |                 )
 80 |     if cell_choice is None:
 81 |         return(idx_downSampling_embedding)
 82 |     else:
 83 |         # transfer to the id of origional all detail list
 84 |         onegene=load_cellDancer[load_cellDancer.gene_name==list(load_cellDancer.gene_name)[0]].copy()
 85 |         onegene.loc[:,'transfer_id']=range(len(onegene))
 86 |         sampled_left=onegene[onegene.transfer_id.isin(idx_downSampling_embedding)]
 87 |         transfered_index=sampled_left.cellIndex
 88 |         return(transfered_index)
 89 | 
 90 | 
 91 | def combine_parallel_result(result,gene_list,sampled_idx,merged_part_time):
 92 |     # combine result of rsquare and non-para fitting obtained from parallel computing
 93 |     for i,result_i in enumerate(result):
 94 | 
 95 |         r_square=result_i[1]
 96 |         non_para_fit=result_i[0]
 97 |         #print(r_square)
 98 |         if i == 0:
 99 |             r_square_list = r_square
100 |             non_para_fit_list = np.transpose(non_para_fit)
101 |         else:
102 |             r_square_list = np.vstack((r_square_list, r_square))
103 |             non_para_fit_list = np.vstack((non_para_fit_list, np.transpose(non_para_fit)[0]))
104 |     r_square=pd.DataFrame({'gene_name':gene_list,'r_square':np.transpose(r_square_list)[0]})
105 | 
106 |     non_para_fit_heat=pd.DataFrame(non_para_fit_list,index=gene_list)
107 |     non_para_fit_heat.columns=merged_part_time[merged_part_time['index'].isin(sampled_idx)]['index']
108 | 
109 |     non_para_list=pd.DataFrame(non_para_fit_list)
110 |     non_para_list['combined']=non_para_list.values.tolist()
111 |     r_square
112 |     r_square_non_para_list=pd.concat([r_square,non_para_list['combined']],axis=1)
113 |     r_square_non_para_list_sort=r_square_non_para_list.sort_values(by=['r_square'], axis=0, ascending=False)
114 |     
115 |     return(r_square_non_para_list_sort,non_para_fit_heat,non_para_fit_list)    
116 |     
def get_rsquare(load_cellDancer,gene_list,s0_merged_part_time,s0_merged_part_gene,cell_choice=None,):
    """Fit every gene against pseudotime in parallel and collect the results.

    Arguments
    ---------
    load_cellDancer: `pandas.DataFrame`
        Long-format cellDancer table, used to down-sample cells on the
        embedding.
    gene_list:
        Genes to fit; each must be a column of ``s0_merged_part_gene``.
    s0_merged_part_time: `pandas.DataFrame`
        Per-cell table with the 'index' and 'time' columns.
    s0_merged_part_gene: `pandas.DataFrame`
        Per-cell abundance table, one column per gene, row-aligned with
        ``s0_merged_part_time``.
    cell_choice: optional (default: None)
        Restrict the down-sampling to these cells.

    Returns
    -------
    (r_square_non_para_list_sort, non_para_fit_heat, non_para_fit_list, sampled_idx)
        The three outputs of ``combine_parallel_result`` followed by the ids
        of the down-sampled cells.
    """
    from joblib import Parallel, delayed

    # downsample
    sampled_idx = getidx_downSampling_embedding(load_cellDancer, cell_choice=cell_choice)

    # One job per gene. The kernel function defined in this module is
    # `_non_para_kernel`; the former `_non_para_kernel_t4` name is not defined
    # in the visible part of this module.
    with tqdm_joblib(tqdm(desc="Calculate rsquare", total=len(gene_list))):
        result = Parallel(n_jobs=-1, backend="loky")( # TODO: FIND suitable njobs
            delayed(_non_para_kernel)(s0_merged_part_time, s0_merged_part_gene[gene], sampled_idx)
            for gene in gene_list)

    # combine
    r_square_non_para_list_sort, non_para_fit_heat, non_para_fit_list = combine_parallel_result(
        result, gene_list, sampled_idx, s0_merged_part_time)

    return (r_square_non_para_list_sort, non_para_fit_heat, non_para_fit_list, sampled_idx)
133 | 
134 | 
def get_gene_s0_by_time(cell_time,load_cellDancer):
    """Build a pseudotime-ordered, cell-by-gene table of unspliced abundance.

    Arguments
    ---------
    cell_time: `pandas.DataFrame`
        Two columns: the cell index and its 'pseudotime'. The input is not
        modified.
    load_cellDancer: `pandas.DataFrame`
        Long-format table with the columns cellIndex, gene_name and unsplice.

    Returns
    -------
    s0_merged_part_gene: `pandas.DataFrame`
        All columns after the first two of the merged table (the per-gene
        'unsplice' values), rows ordered by pseudotime.
    s0_merged_part_time: `pandas.DataFrame`
        The first two columns of the merged table, named 'index' and 'time'.
    """
    # sort_values returns a new frame, so renaming its columns leaves the
    # caller's table untouched.
    ordered_cells = cell_time.sort_values('pseudotime')
    ordered_cells.columns = ['index', 'time']

    unsplice_wide = load_cellDancer.pivot(index='cellIndex', columns='gene_name', values='unsplice')

    # TODO: NOT cellIndex in the future
    merged = pd.merge(ordered_cells, unsplice_wide, left_on='index', right_on='cellIndex')

    time_columns = merged.columns[0:2]
    gene_columns = merged.columns[2:]

    return(merged.loc[:, gene_columns], merged.loc[:, time_columns])
148 | 
def rank_rsquare(load_cellDancer,gene_list=None,cluster_choice=None):
    """Rank genes by the r-square of their kernel fit against pseudotime.

    Arguments
    ---------
    load_cellDancer: `pandas.DataFrame`
        Long-format cellDancer table with at least the columns gene_name,
        cellIndex, pseudotime, clusters, unsplice, embedding1 and embedding2.
    gene_list: optional (default: None)
        Genes to rank. If None, every gene column of the per-cell abundance
        table is used.
    cluster_choice: optional (default: None)
        Clusters whose cells are used. If None, all clusters are used.

    Returns
    -------
    `pandas.DataFrame`
        Columns gene_name and r_square, sorted by descending r_square, with a
        fresh 0..n-1 index.
    """
    # Per-cell information is repeated for every gene, so the rows of a single
    # gene are enough. Pick that gene by position: a label lookup
    # (gene_name[0]) breaks when the index has no unique label 0, e.g. after
    # the table was filtered.
    first_gene = load_cellDancer.gene_name.iloc[0]
    onegene = load_cellDancer[load_cellDancer.gene_name == first_gene]

    cell_time = onegene[['cellIndex', 'pseudotime']]
    s0_merged_part_gene, s0_merged_part_time = get_gene_s0_by_time(cell_time, load_cellDancer)

    if cluster_choice is None:
        cluster_choice = list(onegene.clusters.drop_duplicates())
    cell_idx = list(onegene[onegene.clusters.isin(cluster_choice)].cellIndex)

    if gene_list is None:
        gene_list = s0_merged_part_gene.columns

    r_square_non_para_list_sort, non_para_fit_heat, non_para_fit_list, sampled_idx = get_rsquare(
        load_cellDancer, gene_list, s0_merged_part_time, s0_merged_part_gene, cell_choice=cell_idx)
    return(r_square_non_para_list_sort[['gene_name', 'r_square']].reset_index(drop=True))
163 | 
164 | 
def adata_to_df_with_embed(adata,
                            us_para=['Mu', 'Ms'],
                            cell_type_para='celltype',
                            embed_para='X_umap',
                            save_path='cell_type_u_s_sample_df.csv',
                            gene_list=None):

    """Convert an AnnData object to a long-format pandas.DataFrame with embedding information and save it as a csv file.

    Arguments
    ---------
    adata: `anndata._core.anndata.AnnData`
        The adata to be converted.
    us_para: `list` (default: ['Mu','Ms'])
        Names of the two layers in adata.layers, in the order [unspliced, spliced]. The first one fills the 'unsplice' column and the second one fills the 'splice' column.
    cell_type_para: `str` (default: 'celltype')
        The column of adata.obs that holds the cell type; it is written to the 'clusters' column.
    embed_para: `str` (default: 'X_umap')
        The key of adata.obsm that holds the embedding. Its first two columns are written to 'embedding1' and 'embedding2'.
    save_path: `str` (default: 'cell_type_u_s_sample_df.csv')
        Path of the csv file to be written. The file is first filled gene by gene and finally rewritten with the cell information attached.
    gene_list: `list` (default: None)
        Specific gene(s) to be converted. If None, all genes in adata.var.index are used.

    Returns
    -------
    raw_data: `pandas.DataFrame`
        pandas DataFrame with columns gene_name, unsplice, splice, cellID, clusters, embedding1, embedding2.
    """
    from tqdm import tqdm

    if gene_list is None:
        gene_list = adata.var.index

    # Stream the abundances gene by gene into the csv file: the first gene
    # creates the file with a header, the following genes are appended.
    for position, gene in enumerate(tqdm(gene_list)):
        single_gene = adata[:, adata.var.index.isin([gene])].copy()
        unsplice = single_gene.layers[us_para[0]][:,0].copy().astype(np.float32)
        splice = single_gene.layers[us_para[1]][:,0].copy().astype(np.float32)
        gene_frame = pd.DataFrame({'gene_name':gene, 'unsplice':unsplice, 'splice':splice})

        is_first = position == 0
        gene_frame.to_csv(save_path, mode='w' if is_first else 'a', header=is_first, index=False)

    # Per-cell information (id, cluster, embedding), repeated once per gene so
    # that it lines up row by row with the stacked per-gene blocks.
    embedding = adata.obsm[embed_para]
    per_cell = pd.concat(
        [pd.DataFrame({'cellID':adata.obs.index}),
         pd.DataFrame({'clusters':adata.obs[cell_type_para].reset_index(drop=True)}),
         pd.DataFrame({'embedding1':embedding[:,0], 'embedding2':embedding[:,1]})],
        axis=1)
    per_cell_repeated = pd.concat([per_cell]*len(gene_list), ignore_index=True)

    raw_data = pd.concat([pd.read_csv(save_path), per_cell_repeated], axis=1)
    raw_data.to_csv(save_path, header=True, index=False)

    return(raw_data)
232 | 
def to_dynamo(cellDancer_df):
    '''
    Build an AnnData object from the output dataframe of cellDancer so that it can be used as the input of dynamo. The returned object can be directly used in the downstream analyses of dynamo.

    Example usage:

    .. code-block:: python

        import dynamo as dyn
        import numpy as np
        import pandas as pd
        import anndata as ann
        import matplotlib.pyplot as plt
        import celldancer as cd
        import celldancer.utilities as cdutil

        # load the prediction result of all genes, the data could be achieved from section 'Deciphering gene regulation through vector fields analysis in pancreatic endocrinogenesis'
        cellDancer_df=pd.read_csv('HgForebrainGlut_cellDancer_estimation_spliced.csv')
        cellDancer_df=cd.compute_cell_velocity(cellDancer_df=cellDancer_df, projection_neighbor_choice='embedding', expression_scale='power10', projection_neighbor_size=100) # compute cell velocity

        # transform celldancer dataframe to anndata
        adata_from_dancer = cdutil.to_dynamo(cellDancer_df)

        # plot the velocity vector
        dyn.pl.streamline_plot(adata_from_dancer, color=["clusters"], basis = "cdr", show_legend="on data", show_arrowed_spines=True)

    -------

    .. image:: _static/dynamo_plt.png
      :width: 60%
      :alt: dynamo_plt

    Arguments
    ---------
    cellDancer_df: `pandas.DataFrame`
        The output dataframe of cellDancer. Its content is mapped as follows.

        cellDancer                  -->     dynamo

        cellDancer_df.splice            -->     adata.X

        cellDancer_df.loss              -->     adata.var.loss

        cellDancer_df.cellID            -->     adata.obs

        cellDancer_df.clusters          -->     adata.obs.clusters

        cellDancer_df.splice            -->     adata.layers['X_spliced']

        cellDancer_df.splice            -->     adata.layers['M_s']

        cellDancer_df.unsplice          -->     adata.layers['X_unspliced']

        cellDancer_df.unsplice          -->     adata.layers['M_u']

        cellDancer_df.alpha             -->     adata.layers['alpha']

        cellDancer_df.beta              -->     adata.layers['beta']

        cellDancer_df.gamma             -->     adata.layers['gamma']

        cellDancer_df.unsplice_predict - cellDancer_df.unsplice     -->    adata.layers['velocity_U']

        cellDancer_df.splice_predict - cellDancer_df.splice         -->    adata.layers['velocity_S']

        cellDancer_df[['embedding1', 'embedding2']]    -->     adata.obsm['X_cdr']

        cellDancer_df[['velocity1', 'velocity2']]      -->     adata.obsm['velocity_cdr']

    Returns
    -------
    adata
    '''

    # pivot() orders its result by cellID. Sorting a copy of the table the same
    # way keeps the row-wise arrays stored in obsm below aligned with the
    # pivoted matrices.
    cellDancer_df = cellDancer_df.sort_values('cellID')

    def _cell_by_gene(value_column):
        # Wide (cells x genes) table of one per-cell, per-gene quantity.
        return cellDancer_df.pivot(index='cellID', columns='gene_name', values=value_column)

    spliced = _cell_by_gene('splice')
    unspliced = _cell_by_gene('unsplice')
    spliced_predict = _cell_by_gene('splice_predict')
    unspliced_predict = _cell_by_gene('unsplice_predict')
    alpha = _cell_by_gene('alpha')
    beta = _cell_by_gene('beta')
    gamma = _cell_by_gene('gamma')

    # The rows of a single gene carry the per-cell values exactly once.
    one_gene = cellDancer_df['gene_name'].iloc[0]
    rows_of_one_gene = cellDancer_df[cellDancer_df['gene_name'] == one_gene]

    adata1 = ad.AnnData(spliced)

    # var
    adata1.var['highly_variable_genes'] = True
    # The loss is taken from the first cell column of the gene x cell table.
    loss = cellDancer_df.pivot(index='gene_name', columns='cellID', values='loss').iloc[:, 0]
    loss.index = loss.index.astype(str)
    adata1.var['loss'] = loss
    # celldancer uses all genes (high variable) for dynamics and transition.
    adata1.var['use_for_dynamics'] = True
    adata1.var['use_for_transition'] = True

    # obs
    if 'clusters' in cellDancer_df:
        clusters = _cell_by_gene('clusters').iloc[:, 0]
        clusters.index = clusters.index.astype(str)
        adata1.obs['clusters'] = clusters

    # layers
    velocity_s = spliced_predict - spliced
    velocity_u = unspliced_predict - unspliced
    for layer_name, layer_values in (('X_spliced', spliced),
                                     ('X_unspliced', unspliced),
                                     ('M_s', spliced),
                                     ('M_u', unspliced),
                                     ('velocity_S', velocity_s),
                                     ('velocity_U', velocity_u),
                                     ('alpha', alpha),
                                     ('beta', beta),
                                     ('gamma', gamma)):
        adata1.layers[layer_name] = layer_values

    # obsm
    adata1.obsm['X_cdr'] = rows_of_one_gene[['embedding1', 'embedding2']].values
    # assuming no downsampling is used for the cell velocities in the cellDancer_df
    if 'velocity1' in cellDancer_df:
        adata1.obsm['velocity_cdr'] = rows_of_one_gene[['velocity1', 'velocity2']].values

    # obsp: k-nearest-neighbour graphs computed on the embedding
    n_neighbors = 20
    nn = NearestNeighbors(n_neighbors=n_neighbors)
    nn.fit(adata1.obsm['X_cdr'])
    adata1.obsp['connectivities'] = nn.kneighbors_graph(mode='connectivity')
    adata1.obsp['distances'] = nn.kneighbors_graph(mode='distance')

    # uns
    adata1.uns['dynamics'] = {'filter_gene_mode': 'final',
                't': None,
                'group': None,
                'X_data': None,
                'X_fit_data': None,
                'asspt_mRNA': 'ss',
                'experiment_type': 'conventional',
                'normalized': True,
                'model': 'static',
                'est_method': 'ols',
                'has_splicing': True,
                'has_labeling': False,
                'splicing_labeling': False,
                'has_protein': False,
                'use_smoothed': True,
                'NTR_vel': False,
                'log_unnormalized': False,
                'fraction_for_deg': False}

    return adata1
393 | 
def export_velocity_to_dynamo(cellDancer_df,adata):
    '''
    Replace the velocities in adata of dynamo (“adata” in parameters) with the cellDancer predicted velocities (“cellDancer_df” in parameters). The output can be directly used in the downstream analyses of dynamo.

    -------
    The vector field could be learned by dynamo based on the RNA velocity of cellDancer. Details are shown in the section ‘Application of dynamo.’

    .. image:: _static/dynamo_vector_field_pancreas.png
      :width: 60%
      :alt: dynamo_vector_field_pancreas

    Arguments
    ---------
    cellDancer_df: `pandas.DataFrame`
        The output dataframe of cellDancer. It is not modified.

        cellDancer                  -->     dynamo

        bools of the existence of cellDancer_df['gene_name'] in adata.var      -->     adata.var['use_for_dynamics']

        bools of the existence of cellDancer_df['gene_name'] in adata.var      -->     adata.var['use_for_transition']

        cellDancer_df.splice_predict - cellDancer_df.splice                    -->    adata.layers['velocity_S']

    adata: `anndata._core.anndata.AnnData`
        The adata to be integrated with cellDancer velocity result. Its 'velocity_S' layer and the two var columns above are overwritten in place.


    Returns
    -------
    adata
        A copy of the updated adata.
    '''

    dancer_genes = cellDancer_df['gene_name'].drop_duplicates()

    # Build the velocity in a separate table instead of adding a 'velocity_S'
    # column to the caller's dataframe.
    dancer_velocity_s = pd.DataFrame({
        'cellID': cellDancer_df['cellID'],
        'gene_name': cellDancer_df['gene_name'],
        'velocity_S': cellDancer_df['splice_predict'] - cellDancer_df['splice'],
    })
    pivoted = dancer_velocity_s.pivot(index="cellID", columns="gene_name", values="velocity_S")

    # Align to the cells and genes of adata, in adata's own order. Reindexing
    # (rather than adding to a zero frame) guarantees the result has exactly
    # adata's shape and row order; cells or genes unknown to cellDancer get a
    # velocity of 0 and cells or genes unknown to adata are dropped.
    celldancer_velocity_s_df = pivoted.reindex(index=adata.obs.index, columns=adata.var.index).fillna(0)

    adata.layers['velocity_S'] = scipy.sparse.csr_matrix(celldancer_velocity_s_df.values)
    adata.var['use_for_dynamics'] = adata.var.index.isin(dancer_genes)
    adata.var['use_for_transition'] = adata.var.index.isin(dancer_genes)
    return(adata.copy())
439 | 
def adata_to_raw(adata,save_path,gene_list=None):
    '''Convert adata to the long "raw data" format and save it as a csv file.

    The unspliced values are read from adata.layers['Mu'] and the spliced
    values from adata.layers['Ms'].

    adata: the AnnData object to convert
    save_path: path of the csv file to write (filled gene by gene)
    gene_list (optional): genes to export; all genes in adata.var.index if None
    return: pandas dataframe read back from save_path, with the columns
            gene_name, u0, s0, cellID

    run: test=adata_to_raw(adata,'output/test.csv',gene_list=genelist_all)
    ref: mel - loom_to_celldancer_raw.py
    '''
    from tqdm import tqdm

    unspliced_layer, spliced_layer = 'Mu', 'Ms'

    if gene_list is None:
        gene_list = adata.var.index

    # The first gene creates the file with a header; later genes are appended.
    for position, gene in enumerate(tqdm(gene_list)):
        single_gene = adata[:, adata.var.index.isin([gene])].copy()
        u0 = single_gene.layers[unspliced_layer][:,0].copy().astype(np.float32)
        s0 = single_gene.layers[spliced_layer][:,0].copy().astype(np.float32)
        gene_frame = pd.DataFrame({'gene_name':gene, 'u0':u0, 's0':s0})
        gene_frame['cellID'] = adata.obs.index

        is_first = position == 0
        gene_frame.to_csv(save_path, mode='w' if is_first else 'a', header=is_first, index=False)

    return(pd.read_csv(save_path))
477 | 
def filter_by_neighbor_sample_parallel(load_raw_data,step_i=15,step_j=15,cutoff_s0_zero_ratio=0.2,cutoff_u0_zero_ratio=0.2,gene_amt_each_job=100):
    '''Filter genes by the fraction of zero values among neighbor-sampled cells.

    For every gene the (u0, s0) points are down-sampled with
    ``sampling_neighbors``, cells where both values are 0 are dropped, and the
    gene is kept unless the fraction of remaining cells with s0 == 0 exceeds
    ``cutoff_s0_zero_ratio`` or the fraction with u0 == 0 exceeds
    ``cutoff_u0_zero_ratio``. Genes are processed in parallel, in chunks of
    ``gene_amt_each_job`` genes per job.

    Arguments
    ---------
    load_raw_data: `pandas.DataFrame`
        Long-format table with the columns gene_list, u0 and s0.
    step_i, step_j: `int` (default: 15)
        Passed to ``sampling_neighbors``.
    cutoff_s0_zero_ratio, cutoff_u0_zero_ratio: `float` (default: 0.2)
        Maximal tolerated fraction of zeros for s0 and u0.
    gene_amt_each_job: `int` (default: 100)
        Number of genes handled by one parallel job.

    Returns
    -------
    gene_list_keep_fin_pd: `pandas.DataFrame`
        One column, 'gene_list', holding the genes that passed the filter.
    '''
    from joblib import Parallel, delayed
    import pandas as pd
    import numpy as np

    # NOTE(review): `sampling_neighbors` is not imported in the visible part of
    # this module -- confirm it is in scope (e.g. provided by `.sampling`)
    # before relying on this function.

    def filter_gene_by_neighbor_sample_one_gene(gene,load_raw_data,step_i=None,step_j=None,cutoff_s0_zero_ratio=None,cutoff_u0_zero_ratio=None,gene_amt_each_job=None):
        '''Return `gene` if it passes the zero-ratio cutoffs, otherwise None.'''
        u_s = np.array(load_raw_data[load_raw_data['gene_list']==gene][["u0","s0"]]) # u_s
        sampling_idx = sampling_neighbors(u_s[:,0:2], step_i=step_i,step_j=step_j,percentile=15) # Sampling
        u_s_downsample = u_s[sampling_idx,0:4]
        u_s_df = pd.DataFrame({"s0":u_s_downsample[:, 1],'u0':u_s_downsample[:, 0]})
        # cells without any signal carry no information about the gene
        u_s_df = u_s_df[~((u_s_df.s0==0) & (u_s_df.u0==0))]
        u_s_df_zero_amt = u_s_df.agg(lambda x: x.eq(0).sum())
        sampled_gene_amt = len(u_s_df)
        # NOTE: if no cell is left, the ratios are NaN, both comparisons below
        # are False and the gene is kept.
        u_s_df_zero_ratio = u_s_df_zero_amt/sampled_gene_amt
        # `not` instead of `~`: bitwise negation of a plain Python bool is
        # always truthy (~True == -2, ~False == -1).
        if not (u_s_df_zero_ratio.s0>cutoff_s0_zero_ratio or u_s_df_zero_ratio.u0>cutoff_u0_zero_ratio):
            return(gene)

    def filter_gene_by_neighbor_sample(start_point,load_raw_data,gene_list=None,step_i=None,step_j=None,cutoff_s0_zero_ratio=None,cutoff_u0_zero_ratio=None,gene_amt_each_job=None):
        '''Filter the chunk of genes that starts at position `start_point`.'''
        # A positional slice is clipped at the end automatically, so the last
        # (shorter) chunk needs no special case.
        gene_list = load_raw_data.gene_list.drop_duplicates().iloc[start_point:(start_point+gene_amt_each_job)]
        print(gene_list)
        gene_list_keep = []
        for i,gene in enumerate(gene_list):
            print(i)
            filter_result = filter_gene_by_neighbor_sample_one_gene(gene,load_raw_data,step_i=step_i,step_j=step_j,cutoff_s0_zero_ratio=cutoff_s0_zero_ratio,cutoff_u0_zero_ratio=cutoff_u0_zero_ratio,gene_amt_each_job=gene_amt_each_job)
            if filter_result is not None:
                gene_list_keep.append(filter_result)
        return(gene_list_keep)

    def parallel_get_gene(load_raw_data,gene_list=None,step_i=None,step_j=None,cutoff_s0_zero_ratio=None,cutoff_u0_zero_ratio=None,gene_amt_each_job=None):
        '''Run the chunk filter in parallel; returns one list of kept genes per chunk.'''
        if gene_list is None:
            gene_list = load_raw_data.gene_list.drop_duplicates().reset_index(drop=True)
        else:
            load_raw_data = load_raw_data[load_raw_data.gene_list.isin(gene_list)]
        print(gene_list)
        result = Parallel(n_jobs=-1, backend="loky",verbose=10)(
            delayed(filter_gene_by_neighbor_sample)(start_point,load_raw_data,gene_list=gene_list,step_i=step_i,step_j=step_j,cutoff_s0_zero_ratio=cutoff_s0_zero_ratio,cutoff_u0_zero_ratio=cutoff_u0_zero_ratio,gene_amt_each_job=gene_amt_each_job)
            for start_point in range(0,len(gene_list),gene_amt_each_job))
        return(result)

    gene_list_keep = parallel_get_gene(load_raw_data,step_i=step_i,step_j=step_j,cutoff_s0_zero_ratio=cutoff_s0_zero_ratio,cutoff_u0_zero_ratio=cutoff_u0_zero_ratio,gene_amt_each_job=gene_amt_each_job)

    # combine parallel results
    gene_list_keep_fin = []
    for segment_list in gene_list_keep:
        gene_list_keep_fin = gene_list_keep_fin+segment_list
    gene_list_keep_fin_pd = pd.DataFrame({'gene_list':gene_list_keep_fin})

    return(gene_list_keep_fin_pd)
538 | 
def calculate_occupy_ratio_and_cor(gene_choice,data, u_fragment=30, s_fragment=30):
    '''Per gene, compute the occupied fraction of a (u0, s0) grid and the u0/s0 correlation.

    For each gene the observed u0 range and s0 range are cut into
    ``u_fragment`` and ``s_fragment`` equal-width intervals. A grid cell counts
    as occupied when at least one point lies strictly inside it, so points that
    sit exactly on a grid line (including the minimum and maximum) are not counted.

    ref: analysis_calculate_occupy_ratio.py
    parameters
    gene_choice -> genes to evaluate
    data -> rawdata[['gene_list', 'u0','s0']]
    u_fragment, s_fragment -> number of intervals along u0 and s0
    return(ratio2, cor2)
    ratio2 [['gene_choice','ratio']]
    cor2 [['gene_choice','correlation']]
    '''
    def axis_intervals(values, n_fragment):
        # Consecutive [lower, upper] pairs splitting [min, max] into n_fragment pieces.
        edges = np.linspace(start=min(values), stop=max(values), num=n_fragment+1).tolist()
        return np.array([edges[:-1], edges[1:]]).T

    def cell_is_occupied(u_bounds, s_bounds, points):
        # Strict comparisons on both axes: boundary points belong to no cell.
        inside = (
            (points[:,0]>u_bounds[0]) & (points[:,0]<u_bounds[1])
            & (points[:,1]>s_bounds[0]) & (points[:,1]<s_bounds[1])
        )
        return bool(inside.any())

    n_genes = len(gene_choice)
    ratio = np.empty([n_genes, 1])
    cor = np.empty([n_genes, 1])
    for idx, gene in enumerate(gene_choice):
        print(idx)
        points = data[data.gene_list==gene][['u0','s0']].to_numpy()
        u_grid = axis_intervals(points[:,0], u_fragment)
        s_grid = axis_intervals(points[:,1], s_fragment)

        # Count the grid cells that hold at least one point.
        occupied = sum(
            1
            for s_bounds in s_grid
            for u_bounds in u_grid
            if cell_is_occupied(u_bounds, s_bounds, points)
        )
        ratio[idx,0] = occupied/(u_grid.shape[0]*s_grid.shape[0])
        cor[idx,0] = np.corrcoef(points[:,0], points[:,1])[0,1]

    ratio2 = pd.DataFrame({'gene_choice': gene_choice, 'ratio': ratio[:,0]})
    cor2 = pd.DataFrame({'gene_choice': gene_choice, 'correlation': cor[:,0]})
    return(ratio2, cor2)
588 | 
def find_neighbors(adata, n_pcs=30, n_neighbors=30):
    '''Compute a k-nearest-neighbor graph from ``X_pca`` and store it on adata.

    Uses scanpy's ``Neighbors`` with ``method="umap"`` and a euclidean metric
    on the ``X_pca`` representation.

    parameters
    adata -> annotated data object; modified in place
    n_pcs -> number of principal components passed to compute_neighbors
    n_neighbors -> number of neighbors passed to compute_neighbors

    Writes ``adata.obsp["distances"]``, ``adata.obsp["connectivities"]`` and
    the matching keys in ``adata.uns["neighbors"]``. When the Neighbors object
    exposes ``knn_indices``, the indices and the parameters used are stored too.
    Returns None.
    '''
    from scanpy import Neighbors
    import warnings

    neighbors = Neighbors(adata)
    with warnings.catch_warnings():  # ignore numba warning (umap/issues/252)
        warnings.simplefilter("ignore")
        neighbors.compute_neighbors(
            n_neighbors=n_neighbors,
            knn=True,
            n_pcs=n_pcs,
            method="umap",
            use_rep="X_pca",
            random_state=0,
            metric="euclidean",
            metric_kwds={},
            write_knn_indices=True,
        )

    adata.obsp["distances"] = neighbors.distances
    adata.obsp["connectivities"] = neighbors.connectivities

    # Nothing in this function creates the "neighbors" entry, so indexing it
    # directly fails with KeyError on an object that has none yet.
    # An existing entry is kept and updated, as before.
    if "neighbors" not in adata.uns:
        adata.uns["neighbors"] = {}
    adata.uns["neighbors"]["connectivities_key"] = "connectivities"
    adata.uns["neighbors"]["distances_key"] = "distances"

    if hasattr(neighbors, "knn_indices"):
        adata.uns["neighbors"]["indices"] = neighbors.knn_indices
        adata.uns["neighbors"]["params"] = {
            "n_neighbors": n_neighbors,
            "method": "umap",
            "metric": "euclidean",
            "n_pcs": n_pcs,
            "use_rep": "X_pca",
        }
623 | 
def find_nn_neighbors(
        data=None, 
        gridpoints_coordinates=None, 
        n_neighbors=None,
        radius=None):
    '''
    Query nearest neighbors of gridpoints_coordinates among the points in data.

    Exactly one of n_neighbors and radius must be given: n_neighbors runs a
    k-nearest-neighbor query, radius runs a fixed-radius query.

    data: numpy ndarray
    gridpoints_coordinates: numpy ndarray; defaults to data when None
    n_neighbors: int
    radius: float

    return(dists, neighs) as produced by sklearn's kneighbors / radius_neighbors
    raises ValueError when both or neither of n_neighbors and radius are given
    '''

    # Previously neither branch ran in these cases and the return statement
    # failed with UnboundLocalError; report the misuse explicitly instead.
    if (n_neighbors is None) == (radius is None):
        raise ValueError(
            "find_nn_neighbors requires exactly one of n_neighbors or radius.")

    if gridpoints_coordinates is None:
        gridpoints_coordinates = data

    if radius is not None:
        nn = NearestNeighbors(radius = radius, n_jobs = -1)
        nn.fit(data)
        dists, neighs = nn.radius_neighbors(gridpoints_coordinates)
    else:
        nn = NearestNeighbors(n_neighbors = n_neighbors, n_jobs = -1)
        nn.fit(data)
        dists, neighs = nn.kneighbors(gridpoints_coordinates)

    return(dists, neighs)
649 | 
650 | 
def extract_from_df(load_cellDancer, attr_list, gene_name=None):
    '''
    Return the attr_list columns for the rows of a single gene as a numpy array.

    The load_cellDancer data frame repeats these columns once per gene, so the
    rows of one gene are enough. When gene_name is None the gene of the first
    row is used. Rows with a missing value in any selected column are dropped.
    '''
    target_gene = load_cellDancer.gene_name.iloc[0] if gene_name is None else gene_name
    gene_rows = load_cellDancer.gene_name == target_gene
    selected = load_cellDancer.loc[gene_rows, attr_list]
    return selected.dropna().to_numpy()


--------------------------------------------------------------------------------
/src/celldancer/velocity_estimation.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import glob
  4 | import shutil
  5 | import datetime
  6 | import pandas as pd
  7 | import numpy as np
  8 | import random
  9 | import torch
 10 | import torch.nn as nn
 11 | import torch.nn.functional as F
 12 | import pytorch_lightning as pl
 13 | from torch.utils.data import *
 14 | from pytorch_lightning.callbacks.early_stopping import EarlyStopping
 15 | from pytorch_lightning.callbacks import ModelCheckpoint
 16 | from sklearn.neighbors import NearestNeighbors
 17 | from joblib import Parallel, delayed
 18 | from tqdm import tqdm
 19 | import pkg_resources
 20 | import warnings
 21 | 
# Installed at import time: ignore DeprecationWarning and UserWarning.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.simplefilter("ignore", UserWarning)
import logging
# Logger registered under the name 'cellDancer', set to report INFO and above.
handle='cellDancer'
logger_cd=logging.getLogger(handle)
logging.getLogger(handle).setLevel(logging.INFO)

# Only let WARNING and above through from the "pytorch_lightning" logger.
logging.getLogger("pytorch_lightning").setLevel(logging.WARNING)
 30 | from .sampling import *
 31 | 
 32 | class DNN_layer(nn.Module):
 33 | 
 34 |     """Define network structure.
 35 |     """
 36 | 
 37 |     def __init__(self, h1, h2):
 38 |         super().__init__()
 39 |         self.l1 = nn.Linear(2, h1)
 40 |         self.l2 = nn.Linear(h1, h2)
 41 |         self.l3 = nn.Linear(h2, 3)
 42 | 
 43 |     def forward(self, unsplice, splice, alpha0, beta0, gamma0, dt):
 44 |         #print(f"dt is {dt}")
 45 |         input = torch.tensor(np.array([np.array(unsplice), np.array(splice)]).T)
 46 |         x = self.l1(input)
 47 |         x = F.leaky_relu(x)
 48 |         x = self.l2(x)
 49 |         x = F.leaky_relu(x)
 50 |         x = self.l3(x)
 51 |         output = torch.sigmoid(x)
 52 |         beta = output[:,0]
 53 |         gamma = output[:,1]
 54 |         alphas = output[:,2]
 55 | 
 56 |         alphas = alphas * alpha0
 57 |         beta =  beta * beta0
 58 |         gamma = gamma * gamma0
 59 | 
 60 |         unsplice_predict = unsplice + (alphas - beta*unsplice)*dt
 61 |         splice_predict = splice + (beta*unsplice - gamma*splice)*dt
 62 |         return unsplice_predict, splice_predict, alphas, beta, gamma
 63 | 
 64 |     def save(self, model_path):
 65 |         torch.save({
 66 |             "l1": self.l1,
 67 |             "l2": self.l2,
 68 |             "l3": self.l3
 69 |         }, model_path)
 70 | 
 71 |     def load(self, model_path):
 72 |         checkpoint = torch.load(model_path)
 73 |         self.l1 = checkpoint["l1"]
 74 |         self.l2 = checkpoint["l2"]
 75 |         self.l3 = checkpoint["l3"]
 76 | 
 77 | class DNN_module(nn.Module):
 78 |     '''
 79 |     calculate loss function
 80 |     load network "DNN_layer"
 81 |     predict splice_predict and unsplice_predict
 82 |     '''
 83 |     def __init__(self, module, n_neighbors = None):
 84 |         super().__init__()
 85 |         self.module = module
 86 |         self.n_neighbors = n_neighbors
 87 | 
 88 |     def velocity_calculate(self, 
 89 |                            unsplice, 
 90 |                            splice, 
 91 |                            alpha0, 
 92 |                            beta0, 
 93 |                            gamma0,
 94 |                            dt,
 95 |                            embedding1,
 96 |                            embedding2, 
 97 |                            barcode = None, 
 98 |                            loss_func = None,
 99 |                            cost2_cutoff=None,
100 |                            trace_cost_ratio=None,
101 |                            corrcoef_cost_ratio=None):
102 |         '''
103 |         add embedding
104 |         for real dataset
105 |         calculate loss function
106 |         predict unsplice_predict splice_predict from network 
107 |         '''
108 |         #generate neighbor indices and expr dataframe
109 |         points = np.array([embedding1.numpy(), embedding2.numpy()]).transpose()
110 | 
111 |         self.n_neighbors=min((points.shape[0]-1), self.n_neighbors)
112 |         nbrs = NearestNeighbors(n_neighbors=self.n_neighbors, algorithm='ball_tree').fit(points)
113 |         
114 |         distances, indices = nbrs.kneighbors(points) 
115 |         # indices: 
116 |         #   row -> cell, 
117 |         #   col -> neighboring cells, 
118 |         #   value -> index of cells, 
119 |         #   the fist col is the index of row
120 | 
121 |         expr = pd.merge(pd.DataFrame(splice, columns=['splice']), pd.DataFrame(unsplice, columns=['unsplice']), left_index=True, right_index=True)
122 |         if barcode is not None:
123 |             expr.index = barcode
124 |         unsplice = torch.tensor(expr['unsplice'])
125 |         splice = torch.tensor(expr['splice'])
126 |         indices = torch.tensor(indices)
127 |         unsplice_predict, splice_predict, alphas, beta, gamma = self.module(unsplice, splice, alpha0, beta0, gamma0, dt)
128 | 
129 |         def cosine_similarity(unsplice, splice, unsplice_predict, splice_predict, indices):
130 |             """Cost function
131 |             Return:
132 |                 list of cosine distance and a list of the index of the next cell
133 |             """
134 |             
135 |             uv, sv = unsplice_predict-unsplice, splice_predict-splice # Velocity from (unsplice, splice) to (unsplice_predict, splice_predict)
136 |             unv, snv = unsplice[indices.T[1:]] - unsplice, splice[indices.T[1:]] - splice # Velocity from (unsplice, splice) to its neighbors
137 | 
138 |             den = torch.sqrt(unv**2 + snv**2) * torch.sqrt(uv**2+sv**2)
139 |             den[den==0] = -1
140 |             cosine = torch.where(den!=-1, (unv*uv + snv*sv) / den, torch.tensor(1.)) # cosine: column -> individuel cell (cellI); row -> nearby cells of cell id ; value -> cosine between col and row cells
141 |             cosine_max, cosine_max_idx = torch.max(cosine, dim=0)
142 |             cell_idx = torch.diag(indices[:, cosine_max_idx+1])
143 |             return 1 - cosine_max, cell_idx
144 | 
145 | 
146 | 
147 |         def rmse(unsplice, splice, unsplice_predict, splice_predict, indices):
148 |             """
149 |             This loss is defined as the rmse of the predicted velocity vector (uv, sv) from the neighboring velocity vectors (unv, snv).
150 | 
151 |             This loss is used during revision.
152 | 
153 |             """
154 |             uv, sv = unsplice_predict-unsplice, splice_predict-splice 
155 |             unv, snv = unsplice[indices.T[1:]] - unsplice, splice[indices.T[1:]] - splice 
156 | 
157 |             rmse = (uv-unv)**2 + (sv-snv)**2
158 |             rmse = torch.sqrt(0.5*rmse)
159 | 
160 |             # normalize across all neighboring cells using a softmax function.
161 |             # m = torch.nn.Softmax(dim=0)
162 |             # rmse = m(rmse)
163 | 
164 |             rmse_min, rmse_min_idx = torch.min(rmse, dim=0)
165 |             cell_idx = torch.diag(indices[:, rmse_min_idx+1])
166 |             return rmse_min, cell_idx
167 | 
168 | 
169 |         def mix_loss(unsplice, splice, unsplice_predict, splice_predict, indices, mix_ratio = 0.5):
170 |             """
171 |             This loss is defined as the mix of rmse loss and cosine loss.
172 | 
173 |             This loss is used during revision.
174 | 
175 |             Parameters:
176 |             
177 |             unsplice: 1d tensor [n_cells] 
178 |             splice: 1d tensor [n_cells] 
179 |             indices: 2d array [n_cells, n_neighbors]
180 |             Return:
181 |                 list of cosine distance and a list of the index of the next cell
182 |             """
183 | 
184 |             #print("mix ratio, ", mix_ratio)
185 |             uv, sv = unsplice_predict-unsplice, splice_predict-splice 
186 |             unv, snv = unsplice[indices.T[1:]] - unsplice, splice[indices.T[1:]] - splice 
187 |             mag_v = torch.sqrt(uv**2 + sv**2)
188 |             mag_nv = torch.sqrt(unv**2 + snv**2)
189 |             mag = (mag_nv - mag_v)**2
190 | 
191 |             # minimize mag or maximize -mag
192 |             # normalize across all neighboring cells using a softmax function
193 |             m = torch.nn.Softmax(dim=0)
194 |             mag = m(mag)
195 | 
196 |             den = mag_v * mag_nv
197 |             den[den==0] = -1
198 | 
199 |             # cosine: [n_neighbors x n_cells]
200 |             cosine = torch.where(den!=-1, (unv*uv + snv*sv) / den, torch.tensor(1.))
201 | 
202 |             total = mix_ratio*(1-cosine) + (1 - mix_ratio)* mag
203 |             total_min, total_min_idx = torch.min(total, dim=0)
204 | 
205 |             cell_idx = torch.diag(indices[:, total_min_idx+1])
206 |             return total_min, cell_idx
207 | 
208 |         
209 |         def trace_cost(unsplice, splice, unsplice_predict, splice_predict, idx, version):
210 | 
211 |             # This cost has been deprecated.
212 | 
213 |             uv, sv = unsplice_predict-unsplice, splice_predict-splice
214 |             tan = torch.where(sv!=1000000, uv/sv, torch.tensor(0.00001))
215 |             atan_theta = torch.atan(tan) + torch.pi/2
216 |             atan_theta2=atan_theta[idx]
217 |             atan_theta3 = atan_theta[idx[idx]]
218 |             if version=="v1":
219 |                 cost = atan_theta2/atan_theta+atan_theta3/atan_theta2
220 |             elif version=="v2":
221 |                 cost=torch.where(atan_theta<atan_theta2, 1, 0)+torch.where(atan_theta2<atan_theta3, 1, 0) 
222 |                 
223 |             return(cost)
224 | 
225 |         def corrcoef_cost(alphas, unsplice, beta, splice):
226 | 
227 |             # This cost has been deprecated.
228 |             
229 |             corrcoef1 = torch.corrcoef(torch.tensor([alphas.detach().numpy(),unsplice.detach().numpy()]))[1,0]
230 |             corrcoef2 = torch.corrcoef(torch.tensor([beta.detach().numpy(), splice.detach().numpy()]))[1,0]
231 |             corrcoef = corrcoef1 + corrcoef2
232 |             cost=torch.where(corrcoef>=torch.tensor(0.0), torch.tensor(0.0), torch.tensor(-corrcoef))
233 |             return(cost)
234 |         
235 |         if trace_cost_ratio == 0 and corrcoef_cost_ratio == 0:
236 | 
237 |             if loss_func == 'cosine':
238 |                 cost1 = cosine_similarity(unsplice, splice, unsplice_predict, splice_predict, indices)[0]
239 |                 cost_fin = torch.mean(cost1)
240 | 
241 |             if loss_func == 'rmse':
242 |                 cost1 = rmse(unsplice, splice, unsplice_predict, splice_predict, indices)[0]
243 |                 cost_fin = torch.mean(cost1)
244 | 
245 |             elif 'mix' in loss_func:
246 |                 mix_ratio = loss_func[1]
247 |                 cost1 = mix_loss(unsplice, splice, unsplice_predict, splice_predict, indices, mix_ratio=mix_ratio)[0]
248 |                 cost_fin = torch.mean(cost1)
249 | 
250 |         else: # trace cost and corrcoef cost have been deprecated.
251 |             # cosine cost
252 |             cost1,idx = cosine_similarity(unsplice, splice, unsplice_predict, splice_predict, indices)
253 |             cost1_normalize=(cost1-torch.min(cost1))/torch.max(cost1)
254 |             cost1_mean = torch.mean(cost1_normalize)
255 | 
256 |             # trace cost
257 |             if trace_cost_ratio>0:
258 |                 cost2 = trace_cost(unsplice, splice, unsplice_predict, splice_predict, idx,"v2")
259 |                 cost2_normalize=(cost2-torch.min(cost2))/torch.max(cost2)
260 |                 cost2_mean = torch.mean(cost2_normalize)
261 |                 cost2_relu=(max((cost2_mean-cost2_cutoff), 0))
262 | 
263 |             # corrcoef cost
264 |             if corrcoef_cost_ratio>0:
265 |                 corrcoef_cost=corrcoef_cost(alphas, unsplice, beta, splice)
266 | 
267 |             # sum all cost
268 |             cosin_cost_ratio=1-trace_cost_ratio-corrcoef_cost_ratio
269 |             cost_fin = cosin_cost_ratio*cost1_mean + \
270 |                        trace_cost_ratio*cost2_relu + \
271 |                        corrcoef_cost_ratio*corrcoef_cost
272 |             
273 |         return cost_fin, unsplice_predict, splice_predict, alphas, beta, gamma
274 | 
275 | 
276 |     def summary_para_validation(self, cost_mean): 
277 |         loss_df = pd.DataFrame({'cost': cost_mean}, index=[0])
278 |         return(loss_df)
279 | 
280 |     def summary_para(self, unsplice, splice, unsplice_predict, splice_predict, alphas, beta, gamma, cost): 
281 |         cellDancer_df = pd.merge(pd.DataFrame(unsplice, columns=['unsplice']),pd.DataFrame(splice, columns=['splice']), left_index=True, right_index=True) 
282 |         cellDancer_df['unsplice_predict'] = unsplice_predict
283 |         cellDancer_df['splice_predict'] = splice_predict
284 |         cellDancer_df['alpha'] = alphas
285 |         cellDancer_df['beta'] = beta
286 |         cellDancer_df['gamma'] = gamma
287 |         cellDancer_df['cost'] = cost
288 |         return cellDancer_df
289 | 
class ltModule(pl.LightningModule):
    '''
    Lightning wrapper that trains and evaluates a "DNN_module" backbone.

    Each batch holds the cells of one gene (see getItem.__getitem__). The
    training step returns the raw cost as the loss and keeps a running value
    in self.get_loss, shaped by cost_type ('average', 'median', 'smooth', or
    anything else for the raw cost), which is logged under "loss".
    '''
    def __init__(self, 
                backbone=None, 
                initial_zoom=2, 
                initial_strech=1,
                learning_rate=None,
                dt=None,
                loss_func = None,
                cost2_cutoff=0,
                optimizer='Adam',
                trace_cost_ratio=0,
                corrcoef_cost_ratio=0,
                cost_type='smooth',
                average_cost_window_size=10,
                smooth_weight=0.9):
        super().__init__()
        self.backbone = backbone
        self.validation_loss_df = pd.DataFrame()
        self.test_cellDancer_df = None
        self.test_loss_df = None
        self.initial_zoom = initial_zoom
        self.initial_strech = initial_strech
        self.learning_rate=learning_rate
        self.dt=dt
        self.loss_func=loss_func
        self.cost2_cutoff=cost2_cutoff
        self.optimizer=optimizer
        self.trace_cost_ratio=trace_cost_ratio
        self.corrcoef_cost_ratio=corrcoef_cost_ratio
        self.save_hyperparameters()
        self.get_loss=1000 # sentinel meaning "no cost recorded yet"
        self.cost_type=cost_type
        self.average_cost_window_size=average_cost_window_size # only used when cost_type is 'average' or 'median'
        self.cost_window=[]
        self.smooth_weight=smooth_weight
        
    def save(self, model_path):
        self.backbone.module.save(model_path)    # save network

    def load(self, model_path):
        self.backbone.module.load(model_path)   # load network

    def configure_optimizers(self):     # define optimizer
        '''Build the optimizer named by self.optimizer ("Adam" or "SGD").

        Raises ValueError for any other name; previously this path ended in
        an UnboundLocalError at the return statement.
        '''
        if self.optimizer=="Adam":
            optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate, betas=(0.9, 0.999), eps=10**(-8), weight_decay=0.004, amsgrad=False)
        elif self.optimizer=="SGD":
            optimizer = torch.optim.SGD(self.parameters(), lr=self.learning_rate, momentum=0.8)
        else:
            raise ValueError("Unsupported optimizer %r: expected 'Adam' or 'SGD'." % (self.optimizer,))
        return optimizer

    def _initial_rates(self, unsplicemax, splicemax):
        '''Return the (alpha0, beta0, gamma0) scales shared by training and test.'''
        alpha0 = np.float32(unsplicemax*self.initial_zoom)
        beta0 = np.float32(1.0)
        gamma0 = np.float32(unsplicemax/splicemax*self.initial_strech)
        return alpha0, beta0, gamma0

    def _push_cost(self, cost):
        '''Append cost to the sliding window, dropping the oldest entry when full.'''
        self.cost_window.append(cost)
        if len(self.cost_window)>self.average_cost_window_size:
            self.cost_window.pop(0)

    def training_step(self, batch, batch_idx):
        '''
        training network
        batch: [] output returned from realDataset.__getitem__

        Returns a dict with the raw cost under "loss" and the detached beta
        and gamma tensors.
        '''

        unsplices, splices, gene_names, unsplicemaxs, splicemaxs, embedding1s, embedding2s = batch
        unsplice, splice, unsplicemax, splicemax, embedding1, embedding2  = unsplices[0], splices[0], unsplicemaxs[0], splicemaxs[0], embedding1s[0], embedding2s[0]
        
        alpha0, beta0, gamma0 = self._initial_rates(unsplicemax, splicemax)

        cost, unsplice_predict, splice_predict, alphas, beta, gamma = self.backbone.velocity_calculate( \
                unsplice, splice, alpha0, beta0, gamma0, self.dt, embedding1, embedding2, \
                loss_func = self.loss_func, \
                cost2_cutoff = self.cost2_cutoff, \
                trace_cost_ratio = self.trace_cost_ratio, \
                corrcoef_cost_ratio=self.corrcoef_cost_ratio)

        # The running statistics are only logged and reported, never
        # differentiated. Track a detached copy so that the stored values do
        # not keep the autograd graph of every earlier step alive.
        tracked_cost = cost.detach()

        if self.cost_type=='average': # keep the window len <= average_cost_window_size
            self._push_cost(tracked_cost)
            self.get_loss = torch.mean(torch.stack(self.cost_window))
        elif self.cost_type=='median': # keep the window len <= average_cost_window_size
            self._push_cost(tracked_cost)
            self.get_loss = torch.median(torch.stack(self.cost_window))
        elif self.cost_type=='smooth':
            if self.get_loss==1000: # first step: start from the current cost
                self.get_loss=tracked_cost
            self.get_loss = tracked_cost * self.smooth_weight + (1 - self.smooth_weight) * self.get_loss  # calculate smoothed value
        else:
            self.get_loss = tracked_cost
        self.log("loss", self.get_loss)
        
        return {
            "loss": cost,
            "beta": beta.detach(),
            "gamma": gamma.detach()
        }

    def training_epoch_end(self, outputs):
        '''
        steps after finished each epoch

        The epoch means are computed but not stored or logged.
        '''
        loss = torch.stack([x["loss"] for x in outputs]).mean()
        beta = torch.stack([x["beta"] for x in outputs]).mean()
        gamma = torch.stack([x["gamma"] for x in outputs]).mean()

    def validation_step(self, batch, batch_idx):
        '''
        Record the current running loss for the gene of this batch.

        Nothing is recorded during epoch 0. Rows (gene_name, epoch, cost) are
        accumulated in self.validation_loss_df.
        '''

        unsplices, splices, gene_names, unsplicemaxs, splicemaxs, embedding1s, embedding2s = batch
        gene_name = gene_names[0]
        if self.current_epoch!=0:
            cost = self.get_loss.data.numpy()
            loss_df = self.backbone.summary_para_validation(cost)
            loss_df.insert(0, "gene_name", gene_name)
            loss_df.insert(1, "epoch", self.current_epoch)
            if self.validation_loss_df.empty:
                self.validation_loss_df = loss_df
            else:
                # DataFrame.append was removed in pandas 2.0; concat keeps the
                # same row order and index values.
                self.validation_loss_df = pd.concat([self.validation_loss_df, loss_df])

    def test_step(self, batch, batch_idx):
        '''
        Predict on the batch and store the per-cell results in
        self.test_cellDancer_df (with gene_name and cellIndex columns).
        '''
        unsplices, splices, gene_names, unsplicemaxs, splicemaxs, embedding1s, embedding2s = batch
        unsplice, splice, gene_name, unsplicemax, splicemax, embedding1, embedding2  = unsplices[0], splices[0], gene_names[0], unsplicemaxs[0], splicemaxs[0], embedding1s[0], embedding2s[0]

        # Use the same scales as training_step. The former hard-coded zoom 2
        # and stretch 1 only matched training under the default settings.
        alpha0, beta0, gamma0 = self._initial_rates(unsplicemax, splicemax)

        cost, unsplice_predict, splice_predict, alphas, beta, gamma = self.backbone.velocity_calculate( \
                unsplice, splice, alpha0, beta0, gamma0, self.dt, embedding1, embedding2, \
                loss_func = self.loss_func, \
                cost2_cutoff = self.cost2_cutoff, \
                trace_cost_ratio = self.trace_cost_ratio, \
                corrcoef_cost_ratio=self.corrcoef_cost_ratio)

        self.test_cellDancer_df= self.backbone.summary_para(
            unsplice, splice, unsplice_predict.data.numpy(), splice_predict.data.numpy(), 
            alphas.data.numpy(), beta.data.numpy(), gamma.data.numpy(), 
            cost.data.numpy())
        
        self.test_cellDancer_df.insert(0, "gene_name", gene_name)
        self.test_cellDancer_df.insert(0, "cellIndex", self.test_cellDancer_df.index)
448 | 
class getItem(Dataset): 
    '''
    Dataset with one item per gene.

    data_fit and data_predict are data frames with columns gene_name,
    unsplice, splice, embedding1 and embedding2. data_fit is required: the
    gene list is taken from it in the constructor.

    datastatus selects which cells an item holds:
        "fit_dataset"     -> cells of data_fit, optionally thinned by
                             norm_cell_distribution and subsampled by
                             permutation_ratio
        "predict_dataset" -> all cells of data_predict
    '''
    def __init__(self, data_fit=None, data_predict=None,datastatus="predict_dataset", permutation_ratio=0.1,norm_u_s=True,norm_cell_distribution=False): 
        self.data_fit=data_fit
        self.data_predict=data_predict
        self.datastatus=datastatus
        self.permutation_ratio=permutation_ratio
        self.gene_name=list(data_fit.gene_name.drop_duplicates())
        self.norm_u_s=norm_u_s
        self.norm_max_unsplice=None
        self.norm_max_splice=None
        self.norm_cell_distribution=norm_cell_distribution

    def __len__(self):
        return len(self.gene_name) # gene count

    def _select_fit_cells(self, gene_name):
        '''Return the rows of data_fit used to train on gene_name.'''
        data_fitting=self.data_fit[self.data_fit.gene_name==gene_name] # unsplice & splice for cells for one gene
        if self.norm_cell_distribution==True:    # select cells to train using norm_cell_distribution methods
            # Round both values to 1/100 of their maximum and keep one cell
            # per distinct rounded (unsplice, splice) pair.
            unsplice = data_fitting.unsplice
            splice = data_fitting.splice
            unsplicemax_fit = np.float32(max(unsplice))
            splicemax_fit = np.float32(max(splice))
            unsplice = np.round(unsplice/unsplicemax_fit, 2)*unsplicemax_fit
            splice = np.round(splice/splicemax_fit, 2)*splicemax_fit
            upoints = np.unique(np.array([unsplice, splice]), axis=1)
            unsplice = upoints[0]
            splice = upoints[1]
            # the rounded expression values also serve as the embedding here
            data_fitting = pd.DataFrame({'gene_name':gene_name,'unsplice':unsplice, 'splice':splice,'embedding1':unsplice,'embedding2':splice})

        # random sampling in each epoch
        if self.permutation_ratio==1:
            return data_fitting
        if 0<self.permutation_ratio<1:
            return data_fitting.sample(frac=self.permutation_ratio)  # select cells to train using random methods
        # Previously this case printed a message and then failed further down
        # with an UnboundLocalError.
        raise ValueError(
            "sampling ratio is wrong! permutation_ratio must be in (0, 1], got %r" % (self.permutation_ratio,))

    def __getitem__(self, idx):
        '''
        Return (unsplice, splice, gene_name, unsplicemax, splicemax,
        embedding1, embedding2) for gene number idx.

        unsplicemax and splicemax always come from data_predict. When
        norm_u_s is set, unsplice and splice are divided by them.

        Raises ValueError for an unknown datastatus or, for the fit dataset,
        a permutation_ratio outside (0, 1].
        '''
        gene_name = self.gene_name[idx]

        if self.datastatus=="fit_dataset":
            data=self._select_fit_cells(gene_name)
        elif self.datastatus!="predict_dataset":
            raise ValueError(
                "datastatus must be 'fit_dataset' or 'predict_dataset', got %r" % (self.datastatus,))

        data_pred=self.data_predict[self.data_predict.gene_name==gene_name] # unsplice & splice for cells for one gene
        if self.datastatus=="predict_dataset":
            data=data_pred

        unsplicemax = np.float32(max(data_pred["unsplice"]))
        splicemax = np.float32(max(data_pred["splice"]))
        unsplice = np.array(data.unsplice.copy().astype(np.float32))
        splice = np.array(data.splice.copy().astype(np.float32))
        if self.norm_u_s:
            unsplice=unsplice/unsplicemax
            splice=splice/splicemax

        # add embedding
        embedding1 = np.array(data.embedding1.copy().astype(np.float32))
        embedding2 = np.array(data.embedding2.copy().astype(np.float32))

        return unsplice, splice, gene_name, unsplicemax, splicemax, embedding1, embedding2
507 | 
508 | 
509 | 
class feedData(pl.LightningDataModule):
    '''
    Data module exposing the fit and predict datasets of "getItem".

    Training and validation both read fit_dataset; testing reads
    predict_dataset. norm_cell_distribution is passed to the fit dataset only.
    '''
    def __init__(self, data_fit=None, data_predict=None,permutation_ratio=1,norm_u_s=True,norm_cell_distribution=False):
        super().__init__()

        # settings common to both datasets
        shared = dict(
            data_fit=data_fit,
            data_predict=data_predict,
            permutation_ratio=permutation_ratio,
            norm_u_s=norm_u_s,
        )
        self.fit_dataset = getItem(datastatus="fit_dataset", norm_cell_distribution=norm_cell_distribution, **shared)
        self.predict_dataset = getItem(datastatus="predict_dataset", **shared)

    def subset(self, indices):
        '''Return a shallow copy whose datasets are restricted to indices.'''
        import copy
        restricted = copy.copy(self)
        restricted.fit_dataset = Subset(self.fit_dataset, indices)
        restricted.predict_dataset = Subset(self.predict_dataset, indices)
        return restricted

    def train_dataloader(self):
        return DataLoader(self.fit_dataset, num_workers=0)

    def val_dataloader(self):
        return DataLoader(self.fit_dataset, num_workers=0)

    def test_dataloader(self):
        return DataLoader(self.predict_dataset, num_workers=0)
534 | 
535 | def _train_thread(datamodule, 
536 |                   data_indices,
537 |                   save_path=None,
538 |                   max_epoches=None,
539 |                   check_val_every_n_epoch=None,
540 |                   norm_u_s=None,
541 |                   patience=None,
542 |                   learning_rate=None,
543 |                   dt=None,
544 |                   loss_func=None,
545 |                   n_neighbors=None,
546 |                   ini_model=None,
547 |                   model_save_path=None):
548 |     
549 |     try:
550 |         seed = 0
551 |         torch.manual_seed(seed)
552 |         random.seed(seed)
553 |         np.random.seed(seed)
554 | 
555 |         # iniate network (DNN_layer) and loss function (DynamicModule)
556 |         backbone = DNN_module(DNN_layer(100, 100), n_neighbors=n_neighbors)
557 |         model = ltModule(backbone=backbone, dt=dt, learning_rate=learning_rate, loss_func=loss_func)
558 | 
559 |         selected_data = datamodule.subset(data_indices)
560 | 
561 |         unsplice, splice, this_gene_name, unsplicemax, splicemax, embedding1, embedding2=selected_data.fit_dataset.__getitem__(0)
562 | 
563 |         data_df=pd.DataFrame({'unsplice':unsplice,'splice':splice,'embedding1':embedding1,'embedding2':embedding2})
564 |         data_df['gene_name']=this_gene_name
565 |         try:
566 | 
567 |             # Note
568 |             # here n_neighbors in the downsampling_embedding function is for selecting initial model.
569 |             # which is different from the n_neighbors in _train_tread for velocity calculation.
570 |             _, sampling_ixs_select_model, _ = downsampling_embedding(data_df, # for select model
571 |                                 para='neighbors',
572 |                                 step=(20,20),
573 |                                 n_neighbors=30,
574 |                                 target_amount=None,
575 |                                 projection_neighbor_choice='embedding')
576 |         except:
577 |             sampling_ixs_select_model=list(data_df.index)
578 |             
579 |         gene_downsampling=downsampling(data_df=data_df, gene_list=[this_gene_name], downsampling_ixs=sampling_ixs_select_model)
580 |         if ini_model=='circle':
581 |             model_path=model_path=pkg_resources.resource_stream(__name__,os.path.join('model', 'circle.pt')).name
582 |         if ini_model=='branch':
583 |             model_path=model_path=pkg_resources.resource_stream(__name__,os.path.join('model', 'branch.pt')).name
584 |         else:
585 |             model_path=select_initial_net(this_gene_name, gene_downsampling, data_df)
586 |         model.load(model_path)
587 | 
588 |         early_stop_callback = EarlyStopping(monitor="loss", min_delta=0.0, patience=patience,mode='min')
589 | 
590 |         if check_val_every_n_epoch is None:
591 |             # not use early stop
592 |             trainer = pl.Trainer(
593 |                 max_epochs=max_epoches, 
594 |                 progress_bar_refresh_rate=0, 
595 |                 reload_dataloaders_every_n_epochs=1, 
596 |                 logger = False,
597 |                 enable_checkpointing = False,
598 |                 enable_model_summary=False,
599 |                 )
600 |         else:
601 |             # use early stop
602 |             trainer = pl.Trainer(
603 |                 max_epochs=max_epoches, 
604 |                 progress_bar_refresh_rate=0, 
605 |                 reload_dataloaders_every_n_epochs=1, 
606 |                 logger = False,
607 |                 enable_checkpointing = False,
608 |                 check_val_every_n_epoch = check_val_every_n_epoch,
609 |                 enable_model_summary=False,
610 |                 callbacks=[early_stop_callback]
611 |                 )
612 | 
613 |         if max_epoches > 0:
614 |             trainer.fit(model, selected_data)   # train network
615 | 
616 |         trainer.test(model, selected_data,verbose=False)    # predict
617 |         
618 |         if(model_save_path != None):
619 |             model.save(model_save_path)
620 | 
621 |         loss_df = model.validation_loss_df
622 |         cellDancer_df = model.test_cellDancer_df
623 | 
624 |         if norm_u_s:
625 |             cellDancer_df.unsplice=cellDancer_df.unsplice*unsplicemax
626 |             cellDancer_df.splice=cellDancer_df.splice*splicemax
627 |             cellDancer_df.unsplice_predict=cellDancer_df.unsplice_predict*unsplicemax
628 |             cellDancer_df.splice_predict=cellDancer_df.splice_predict*splicemax
629 |             cellDancer_df.beta=cellDancer_df.beta*unsplicemax
630 |             cellDancer_df.gamma=cellDancer_df.gamma*splicemax
631 | 
632 |         if(model_save_path != None):
633 |             model.save(model_save_path)
634 |         
635 |         header_loss_df=['gene_name','epoch','loss']
636 |         header_cellDancer_df=['cellIndex','gene_name','unsplice','splice','unsplice_predict','splice_predict','alpha','beta','gamma','loss']
637 |         
638 |         loss_df.to_csv(os.path.join(save_path,'TEMP', ('loss'+'_'+this_gene_name+'.csv')),header=header_loss_df,index=False)
639 |         cellDancer_df.to_csv(os.path.join(save_path,'TEMP', ('cellDancer_estimation_'+this_gene_name+'.csv')),header=header_cellDancer_df,index=False)
640 |         
641 |         return None
642 | 
643 |     except:
644 |         return this_gene_name
645 | 
646 | 
647 | 
648 | 
649 | 
def build_datamodule(cell_type_u_s,
                   speed_up,
                   norm_u_s,
                   permutation_ratio, 
                   norm_cell_distribution=False, 
                   gene_list=None,
                   downsample_method='neighbors',
                   n_neighbors_downsample=30,
                   step=(200,200),
                   downsample_target_amount=None):
    '''
    Set fitting data, data to be predicted, and sampling ratio when fitting.

    Arguments
    ---------
    cell_type_u_s: `pandas.DataFrame`
        Input data; the columns 'gene_name', 'unsplice', 'splice',
        'embedding1', 'embedding2' and 'cellID' are used.
    speed_up: `bool`
        If true, the fitting data is restricted to the cells selected by
        `downsampling_embedding`; otherwise all cells are used for fitting.
    norm_u_s, permutation_ratio, norm_cell_distribution:
        Forwarded to `feedData`.
    gene_list: optional, `list` (default: None)
        Genes to keep. `None` keeps every gene.
    downsample_method, n_neighbors_downsample, step, downsample_target_amount:
        Forwarded to `downsampling_embedding` (as `para`, `n_neighbors`,
        `step` and `target_amount`) when `speed_up` is true.

    Returns
    -------
    feed_data: `feedData`
        The data module; prediction always uses all (selected-gene) cells.
    '''
    step_i, step_j = step[0], step[1]

    columns = ['gene_name', 'unsplice','splice','embedding1','embedding2','cellID']
    data_df = cell_type_u_s[columns]
    if gene_list is not None:
        data_df = data_df[cell_type_u_s.gene_name.isin(gene_list)]

    if speed_up:
        _, sampling_ixs, _ = downsampling_embedding(data_df,
                            para=downsample_method,
                            target_amount=downsample_target_amount,
                            step=(step_i,step_j),
                            n_neighbors=n_neighbors_downsample,
                            projection_neighbor_choice='embedding')

        # One gene is enough to translate the positional sampling indices into
        # cell IDs. Without an explicit gene_list, fall back to the first gene
        # present in the data instead of failing on list(None).
        # NOTE(review): assumes every gene lists its cells in the same order.
        if gene_list is None:
            reference_gene = data_df.gene_name.iloc[0]
        else:
            reference_gene = list(gene_list)[0]
        data_df_one_gene = cell_type_u_s[cell_type_u_s['gene_name']==reference_gene]
        downsample_cellid = data_df_one_gene.cellID.iloc[sampling_ixs]
        data_fit = data_df[data_df.cellID.isin(downsample_cellid)]
    else:
        data_fit = data_df

    feed_data = feedData(data_fit = data_fit, data_predict=data_df, permutation_ratio=permutation_ratio,norm_u_s=norm_u_s,norm_cell_distribution=norm_cell_distribution)

    return(feed_data)
688 | 
689 | 
def _recreate_dir(path):
    """Make `path` an empty directory, removing a previous one if present."""
    if os.path.isdir(path):
        shutil.rmtree(path)
    os.makedirs(path)


def _combine_csv(save_path, files):
    """Concatenate CSV `files` (sharing one header) into `save_path` and load it."""
    with open(save_path,"wb") as fout:
        # first file: copied verbatim, header included
        with open(files[0], "rb") as f:
            fout.write(f.read())
        # the rest: skip the header line
        for filepath in files[1:]:
            with open(filepath, "rb") as f:
                next(f, None)
                fout.write(f.read())
    return pd.read_csv(save_path)


def _gene_batches(data_len, n_jobs):
    """Split `range(data_len)` into (start, end) ranges of one batch per round of jobs."""
    if n_jobs is None:
        interval = 1
    elif n_jobs < 0:
        # Same convention as joblib: -1 means all CPUs, -2 all but one, ...
        interval = max(1, (os.cpu_count() or 1) + 1 + n_jobs)
    else:
        interval = n_jobs
    return [(start, min(start+interval, data_len)) for start in range(0, data_len, interval)]


def velocity(
    cell_type_u_s,
    gene_list=None,
    max_epoches=200, 
    check_val_every_n_epoch=10,
    patience=3,
    learning_rate=0.001,
    dt=0.5,
    n_neighbors=30,
    permutation_ratio=0.125,
    speed_up=True,
    norm_u_s=True,
    norm_cell_distribution=True,
    loss_func='cosine',
    n_jobs=-1,
    save_path=None,
):

    """Velocity estimation for each cell.
        
    Arguments
    ---------
    cell_type_u_s: `pandas.DataFrame`
        Dataframe that contains the unspliced abundance, spliced abundance, embedding space, and cell type information. Columns=['gene_name', 'unsplice', 'splice' ,'cellID' ,'clusters' ,'embedding1' ,'embedding2']
    gene_list: optional, `list` (default: None)
        Gene list for velocity estimation. `None` if to estimate the velocity of all genes.
    max_epoches: optional, `int` (default: 200)
        Stop to update the network once this number of epochs is reached.
    check_val_every_n_epoch: optional, `int` (default: 10)
        Check loss every n train epochs.
    patience: optional, `int` (default: 3)
        Number of checks with no improvement after which training will be stopped.
    learning_rate: optional, `float` (default: 0.001)
        Learning rate passed to the training module.
    dt: optional, `float` (default: 0.5)
        Step size
    n_neighbors: optional, `int` (default: 30)
        Number of neighbors passed to the network module for velocity calculation.
    permutation_ratio: optional, `float` (default: 0.125)
        Sampling ratio of cells in each epoch when training each gene.
    speed_up: optional, `bool` (default: True)
        `True` if speed up by downsampling cells. `False` if to use all cells to train the model.
    norm_u_s: optional, `bool` (default: True)
        `True` if normalize unsplice (and splice) reads by dividing max value of unspliced (and spliced) reads.
    norm_cell_distribution: optional, `bool` (default: True)
        `True` if the bias of cell distribution is to be removed on embedding space (many cells share the same position of unspliced (and spliced) reads).
    loss_func: optional, `str` (default: `cosine`)
        Currently support `'cosine'`, `'rmse'`, and (`'mix'`, mix_ratio).
    n_jobs: optional, `int` (default: -1)
        The maximum number of concurrently running jobs.
    save_path: optional, `str` (default: None)
        Path to save the result of velocity estimation. `None` to use the current working directory.
    Returns
    -------
    loss_df: `pandas.DataFrame`
        The record of loss. `None` if no gene could be predicted.
    cellDancer_df: `pandas.DataFrame`
        The result of velocity estimation. `None` if no gene could be predicted.
    """

    # set gene_list if not given
    if gene_list is None:
        gene_list=list(cell_type_u_s.gene_name.drop_duplicates())
    else:
        cell_type_u_s=cell_type_u_s[cell_type_u_s.gene_name.isin(gene_list)]
        all_gene_name_cell_type_u_s=list(cell_type_u_s.gene_name.drop_duplicates())
        gene_not_in_cell_type_u_s= list(set(gene_list).difference(set(all_gene_name_cell_type_u_s)))
        # The data was already filtered by gene_list, so the genes found in it
        # are exactly the requested ones that exist (kept in data order).
        gene_list=all_gene_name_cell_type_u_s
        if len(gene_not_in_cell_type_u_s)>0: print(gene_not_in_cell_type_u_s," not in the data cell_type_u_s")

    cell_type_u_s=cell_type_u_s.reset_index(drop=True)

    if len(gene_list)==0:
        # Nothing to estimate: report it the same way as "no gene predicted".
        logger_cd.error('None of the genes were predicted. Try visualizing the unspliced and spliced columns of the gene(s) to check the quality.')
        return None, None

    # set output dir
    datestring = datetime.datetime.now().strftime("%Y-%m-%d %H-%M-%S")
    folder_name='cellDancer_velocity_'+datestring

    if save_path is None:
        save_path=os.getcwd()

    save_path=os.path.join(save_path,folder_name)
    temp_path=os.path.join(save_path,'TEMP')
    _recreate_dir(save_path)
    print('Using '+save_path+' as the output path.')
    _recreate_dir(temp_path)

    # Arguments shared by every training job.
    train_kwargs = dict(
        max_epoches=max_epoches,
        check_val_every_n_epoch=check_val_every_n_epoch,
        patience=patience,
        learning_rate=learning_rate,
        n_neighbors=n_neighbors,
        dt=dt,
        loss_func=loss_func,
        save_path=save_path,
        norm_u_s=norm_u_s,
    )

    # burn-in: run the first gene once before the batched run; its outputs
    # are discarded when TEMP is reset below.
    # NOTE(review): presumably this warms up the worker pool - confirm.
    gene_list_buring=[gene_list[0]]
    datamodule=build_datamodule(cell_type_u_s,speed_up,norm_u_s,permutation_ratio,norm_cell_distribution,gene_list=gene_list_buring)

    result = Parallel(n_jobs=n_jobs, backend="loky")(
        delayed(_train_thread)(
            datamodule = datamodule,
            data_indices=[data_index], 
            **train_kwargs)
        for data_index in range(0,len(gene_list_buring)))

    # clean directory
    _recreate_dir(temp_path)

    data_len = len(gene_list)
    id_ranges = _gene_batches(data_len, n_jobs)

    print('Arranging genes for parallel job.')
    if len(id_ranges)==1:
        if data_len==1:
            print(data_len,' gene was arranged to ',len(id_ranges),' portion.')
        else:
            print(data_len,' genes were arranged to ',len(id_ranges),' portion.')
    else: 
        print(data_len,' genes were arranged to ',len(id_ranges),' portions.')

    unpredicted_gene_lst=list()
    for id_range in tqdm(id_ranges,desc="Velocity Estimation", total=len(id_ranges),position=1,leave=False, bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}'):
        gene_list_batch=gene_list[id_range[0]:id_range[1]]
        datamodule=build_datamodule(cell_type_u_s,speed_up,norm_u_s,permutation_ratio,norm_cell_distribution,gene_list=gene_list_batch)

        result = Parallel(n_jobs=n_jobs, backend="loky")(
            delayed(_train_thread)(
                datamodule = datamodule,
                data_indices=[data_index], 
                **train_kwargs)
            for data_index in range(0,len(gene_list_batch)))

        # unpredicted gene list: jobs return None on success, a gene name otherwise
        unpredicted_gene_lst.extend(x for x in result if x is not None)

    if len(unpredicted_gene_lst)!=0:
        not_pred_err='Not predicted gene list:'+str(unpredicted_gene_lst)+'. Try visualizing the unspliced and spliced columns of the gene(s) to check the quality.'
        logger_cd.error(not_pred_err)

    # summarize (sorted so that the combined files do not depend on glob order)
    cellDancer_df_files = sorted(glob.glob(os.path.join(temp_path, "cellDancer_estimation*.csv")))
    loss_df_files = sorted(glob.glob(os.path.join(temp_path, "loss*.csv")))

    if len(cellDancer_df_files)==0:
        # if no gene predicted
        logger_cd.error('None of the genes were predicted. Try visualizing the unspliced and spliced columns of the gene(s) to check the quality.')
        return None, None

    estimation_csv=os.path.join(save_path, 'cellDancer_estimation.csv')
    loss_csv=os.path.join(save_path, 'loss.csv')

    cellDancer_df=_combine_csv(estimation_csv,cellDancer_df_files)
    # The loss records get their own file instead of temporarily overwriting
    # the estimation file.
    loss_df=_combine_csv(loss_csv,loss_df_files)

    shutil.rmtree(temp_path)

    # sort_values returns a new frame; keep it, and renumber the rows so the
    # embedding columns below can be attached by position.
    # NOTE(review): assumes cellIndex follows the cell order of the input data.
    cellDancer_df=cellDancer_df.sort_values(by = ['gene_name', 'cellIndex'], ascending = [True, True]).reset_index(drop=True)

    # Cell-level annotation is taken from one gene and repeated for each gene.
    onegene=cell_type_u_s[cell_type_u_s.gene_name==cell_type_u_s.gene_name.iloc[0]]
    embedding_info=onegene[['cellID','clusters','embedding1','embedding2']]
    gene_amt=len(cellDancer_df.gene_name.drop_duplicates())
    embedding_col=pd.concat([embedding_info]*gene_amt)
    embedding_col.index=cellDancer_df.index
    cellDancer_df=pd.concat([cellDancer_df,embedding_col],axis=1)
    cellDancer_df.to_csv(estimation_csv,index=False)

    loss_df.to_csv(loss_csv,index=False)

    return loss_df, cellDancer_df
888 | 
889 |     
def select_initial_net(gene, gene_downsampling, data_df):
    '''
    Choose the bundled initial model for one gene.

    Checks whether the top-right corner of the (splice, unsplice) plane has
    cells: a cell is in the corner when both its splice and unsplice values
    exceed 90% of the respective maximum found in the downsampled data.
    circle.pt is the model for single kinetic (returned when more than 0.1%
    of the gene's cells are in the corner);
    branch.pt is the model for multiple kinetic (returned otherwise).

    Arguments
    ---------
    gene: gene name to look up in the `gene_name` column of both dataframes.
    gene_downsampling: `pandas.DataFrame`
        Downsampled data; provides the maxima of `splice` and `unsplice`.
    data_df: `pandas.DataFrame`
        Full data; its cells are the ones counted. It is not modified.

    Returns
    -------
    model_path: `str`
        File path of the selected model inside the package.
    '''
    gene_u_s = gene_downsampling[gene_downsampling.gene_name==gene]
    gene_u_s_full = data_df[data_df.gene_name==gene]

    s_max_90per = 0.9*np.max(gene_u_s.splice)
    u_max_90per = 0.9*np.max(gene_u_s.unsplice)

    # Count corner cells with a boolean mask rather than writing a helper
    # column into a filtered slice of the caller's dataframe.
    in_corner = (gene_u_s_full.splice>s_max_90per) & (gene_u_s_full.unsplice>u_max_90per)
    n_corner = int(in_corner.sum())

    if n_corner>0.001*gene_u_s_full.shape[0]:
        # model in circle shape
        model_file = 'circle.pt'
    else:
        # model in separated branch shape
        model_file = 'branch.pt'

    # resource_filename gives the path without opening (and leaking) a stream.
    model_path=pkg_resources.resource_filename(__name__,os.path.join('model', model_file))
    return(model_path)


--------------------------------------------------------------------------------