├── .gitignore
├── LICENSE
├── README.md
├── docs
│   ├── changelog.rst
│   ├── faqs.rst
│   ├── knn.png
│   ├── mnn_direct.png
│   ├── mnn_equation.png
│   ├── mnn_rescue.png
│   ├── n_neighbors_knn.png
│   ├── rescue_equation_1.png
│   ├── rescue_equation_2.png
│   ├── rescue_equation_3.png
│   ├── results
│   │   ├── SingleCellFusion_plot_1_joint_embedding_and_clusterings_overview.png
│   │   ├── SingleCellFusion_plot_2_hist.png
│   │   ├── SingleCellFusion_plot_3_embedding_by_dataset.png
│   │   ├── SingleCellFusion_plot_4_embedding_by_individual_mod_clusterings.png
│   │   └── SingleCellFusion_plot_5_confmat.png
│   └── scf_description.rst
├── environment.yml
├── environment_mini.yml
├── environment_mini_pegasus.yml
├── example-MOp_L5ET
│   ├── datasets
│   │   ├── 10x_cells_v2.h5ad
│   │   ├── smarter_cells.h5ad
│   │   ├── smarter_nuclei.h5ad
│   │   └── snmcseq_gene.h5ad
│   ├── run_scf.sh
│   └── visualize_results.ipynb
├── example-wholebrain
│   ├── 00.test_all_preproc.sh
│   ├── visualize_results_lite_3mods.ipynb
│   └── visualize_results_lite_rna_intron_exon.ipynb
├── example-wholebrainatac
│   ├── normalize_and_select_features.ipynb
│   ├── run_preproc.sh
│   ├── run_scf.sh
│   └── visualize_results.ipynb
├── scf_description.rst
├── scripts
│   ├── SCF_utils.py
│   ├── SingleCellFusion
│   ├── SingleCellFusion_prep
│   ├── __init__.py
│   ├── basic_utils.py
│   ├── cli_parser.py
│   ├── clst_utils.py
│   └── preproc_utils.py
└── setup.sh
/.gitignore:
--------------------------------------------------------------------------------
1 | # old
2 | old/
3 | scripts/old.py
4 | example-wholebrainatac/old/
5 | example-MOp_L5ET/old/
6 |
7 | # unused
8 | example-biccn_enhancer/
9 | example-testeran/
10 | example-MOp_L5ET-test2/
11 | example-wholebrain-test2/
12 |
13 | # results
14 | example-MOp_L5ET/results
15 | example-wholebrainatac/results
16 |
17 | # datasets
18 | example-wholebrainatac/datasets
19 | example-wholebrainatac/datasets_pre
20 | example-wholebrainatac/datasets_processed
21 |
22 | example-wholebrain/datasets
23 | example-wholebrain/processed
24 | example-wholebrain/results
25 | example-wholebrain/old
26 |
27 | # Byte-compiled / optimized / DLL files
28 | __pycache__/
29 | *.py[cod]
30 | *$py.class
31 |
32 |
33 | # C extensions
34 | *.so
35 |
36 | # Distribution / packaging
37 | .Python
38 | build/
39 | develop-eggs/
40 | dist/
41 | downloads/
42 | eggs/
43 | .eggs/
44 | lib/
45 | lib64/
46 | parts/
47 | sdist/
48 | var/
49 | wheels/
50 | pip-wheel-metadata/
51 | share/python-wheels/
52 | *.egg-info/
53 | .installed.cfg
54 | *.egg
55 | MANIFEST
56 |
57 | # PyInstaller
58 | # Usually these files are written by a python script from a template
59 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
60 | *.manifest
61 | *.spec
62 |
63 | # Installer logs
64 | pip-log.txt
65 | pip-delete-this-directory.txt
66 |
67 | # Unit test / coverage reports
68 | htmlcov/
69 | .tox/
70 | .nox/
71 | .coverage
72 | .coverage.*
73 | .cache
74 | nosetests.xml
75 | coverage.xml
76 | *.cover
77 | *.py,cover
78 | .hypothesis/
79 | .pytest_cache/
80 |
81 | # Translations
82 | *.mo
83 | *.pot
84 |
85 | # Django stuff:
86 | *.log
87 | local_settings.py
88 | db.sqlite3
89 | db.sqlite3-journal
90 |
91 | # Flask stuff:
92 | instance/
93 | .webassets-cache
94 |
95 | # Scrapy stuff:
96 | .scrapy
97 |
98 | # Sphinx documentation
99 | docs/_build/
100 |
101 | # PyBuilder
102 | target/
103 |
104 | # Jupyter Notebook
105 | .ipynb_checkpoints
106 |
107 | # IPython
108 | profile_default/
109 | ipython_config.py
110 |
111 | # pyenv
112 | .python-version
113 |
114 | # pipenv
115 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
116 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
117 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
118 | # install all needed dependencies.
119 | #Pipfile.lock
120 |
121 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
122 | __pypackages__/
123 |
124 | # Celery stuff
125 | celerybeat-schedule
126 | celerybeat.pid
127 |
128 | # SageMath parsed files
129 | *.sage.py
130 |
131 | # Environments
132 | .env
133 | .venv
134 | env/
135 | venv/
136 | ENV/
137 | env.bak/
138 | venv.bak/
139 |
140 | # Spyder project settings
141 | .spyderproject
142 | .spyproject
143 |
144 | # Rope project settings
145 | .ropeproject
146 |
147 | # mkdocs documentation
148 | /site
149 |
150 | # mypy
151 | .mypy_cache/
152 | .dmypy.json
153 | dmypy.json
154 |
155 | # Pyre type checker
156 | .pyre/
157 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SingleCellFusion
2 |
3 | SingleCellFusion is a computational tool to integrate single-cell transcriptome and epigenome datasets. Code in this repository is used in [Luo et al., (2019) *BioRxiv*](https://www.biorxiv.org/content/10.1101/2019.12.11.873398v1) and in [Yao et al., (2020) *BioRxiv*](https://www.biorxiv.org/content/10.1101/2020.02.29.970558v2). [Here](docs/scf_description.rst) is a brief description of how SingleCellFusion works.
4 |
5 | Related publications:
6 | - Luo, C. et al. Single nucleus multi-omics links human cortical cell regulatory genome diversity to disease risk variants. bioRxiv 2019.12.11.873398 (2019) [doi:10.1101/2019.12.11.873398](https://www.biorxiv.org/content/10.1101/2019.12.11.873398v1)
7 | - Yao, Z. et al. An integrated transcriptomic and epigenomic atlas of mouse primary motor cortex cell types. bioRxiv 2020.02.29.970558 (2020) [doi:10.1101/2020.02.29.970558](https://www.biorxiv.org/content/10.1101/2020.02.29.970558v2)
8 | - BRAIN Initiative Cell Census Network (BICCN) et al. A multimodal cell census and atlas of the mammalian primary motor cortex. bioRxiv 2020.10.19.343129 (2020) [doi:10.1101/2020.10.19.343129](https://www.biorxiv.org/content/10.1101/2020.10.19.343129v1)
9 |
10 | Code contributors: [Fangming Xie](mailto:f7xie@ucsd.edu), Aditya Chandrasekar, Wayne I. Doyle, [Ethan Armand](mailto:ejarmand@ucsd.edu)
11 |
12 | Contact: [Eran Mukamel](mailto:emukamel@ucsd.edu)
13 |
14 | ## Installation
15 | Step 1: Clone this repo.
16 | ```bash
17 | git clone https://github.com/mukamel-lab/SingleCellFusion.git
18 | cd SingleCellFusion
19 | ```
20 |
21 | **TBD**
22 | Step 2: Set up a conda environment and install the required packages. Run the following in a UNIX terminal. (Skip this step if not needed.)
23 | ```bash
24 | conda env create -f environment.yml # create an env named scf_terra
25 | source activate scf_terra
26 | ```
27 |
28 | ## Usage
29 | ```
30 | usage: SingleCellFusion [-h] -i xx.h5ad [xx.h5ad ...] -im rna/atac/mc [rna/atac/mc ...]
31 | -f xx.h5ad [xx.h5ad ...] [-o DIR] [-op OUTPUT_PREFIX]
32 | [--nearest_neighbors NEAREST_NEIGHBORS] [--relaxation RELAXATION]
33 | [--num_pcs NUM_PCS] [--smoothing_fractions SMOOTHING_FRACTIONS]
34 | [--leiden_n_neighbors LEIDEN_N_NEIGHBORS] [--leiden_resolutions LEIDEN_RESOLUTIONS]
35 | [--umap_n_neighbors UMAP_N_NEIGHBORS] [--umap_min_dist UMAP_MIN_DIST]
36 |
37 | SingleCellFusion is a computational tool to integrate single-cell transcriptome and epigenome datasets.
38 |
39 | optional arguments:
40 | -h, --help show this help message and exit
41 |
42 | required:
43 | -i xx.h5ad [xx.h5ad ...], --input_datasets xx.h5ad [xx.h5ad ...]
44 | (list of str) Paths to .h5ad files, each containing a cell-by-gene feature matrix,
45 | cell IDs and gene IDs. Cell IDs should be unique within each .h5ad file; gene IDs
46 | should be shared or partially shared across files. Multiple inputs should be listed
47 | as a space-separated list of filenames. (default: None)
48 | -im rna/atac/mc [rna/atac/mc ...], --input_modalities rna/atac/mc [rna/atac/mc ...]
49 | (list of str) Data modalities chosen from 'rna', 'atac', or 'mc'. This should be
50 | listed in the same order as input_datasets. (default: None)
51 | -f xx.h5ad [xx.h5ad ...], --feature_datasets xx.h5ad [xx.h5ad ...]
52 | (list of str) Dataset(s) whose features all other datasets will impute into. This
53 | should be a subset of --input_datasets. Enter multiple datasets as a space-separated
54 | list of filenames. The features of these datasets will be the features kept in the
55 | output imputed data table. (default: None)
56 |
57 | optional:
58 | -o DIR, --output_dir DIR
59 | (str) Directory to store output files (default: ./results)
60 | -op OUTPUT_PREFIX, --output_prefix OUTPUT_PREFIX
61 | (str) The output files will contain this prefix. (default: SingleCellFusion)
62 | --nearest_neighbors NEAREST_NEIGHBORS
63 | (integer) Number of nearest neighbors used to impute data (default: 20)
64 | --relaxation RELAXATION
65 | (float) A value between 1 and infinity. This parameter constrains the
66 | number of neighbors a cell is allowed to receive. Assume dataset 1 has N1 cells and
67 | dataset 2 has N2 cells. Finding k neighbors in dataset 2 for every cell in dataset 1
68 | means that on average each cell in dataset 2 receives (kN1/N2) connections. However, not
69 | all cells in dataset 2 get the same number of connections. We therefore set an upper
70 | bound for the number of connections a cell in dataset 2 can receive to be:
71 | (kN1/N2)*relaxation where relaxation >= 1. Relaxation=1 enforces a hard limit that
72 | every cell receives the same number of nearest neighbors, while relaxation=infinity
73 | approaches traditional kNN. (default: 3)
74 |
75 | advanced:
76 | --num_pcs NUM_PCS (integer) Number of Principal Components to keep for each dataset for smoothing
77 | and for clustering/embedding after imputation. (default: 50)
78 | --smoothing_fractions SMOOTHING_FRACTIONS
79 | (list of floats) A list of three values between 0 to 1 that controls the relative
80 | contribution from the cell itself vs. its neighbors in within-dataset smoothing,
81 | specified for 'rna', 'atac', 'mc' data, respectively. (default: [0.7, 0.1, 0.9])
82 | --leiden_n_neighbors LEIDEN_N_NEIGHBORS
83 | (integer) Number of nearest neighbors to find in the integrated space; the resulting
84 | nearest neighbor graph is used for Leiden clustering. It is passed into the python
85 | package leidenalg.
86 | (default: 30)
87 | --leiden_resolutions LEIDEN_RESOLUTIONS
88 | (list of floats) A list of resolutions to be used for Leiden Clustering. It is
89 | passed into the python package leidenalg. (default: [0.1, 0.2, 0.4, 0.8])
90 | --umap_n_neighbors UMAP_N_NEIGHBORS
91 | (integer) Number of neighbors for UMAP. It is passed into the python package
92 | umap.UMAP(n_neighbors). (default: 60)
93 | --umap_min_dist UMAP_MIN_DIST
94 | (float) Minimum distance for UMAP. It is passed into the python package
95 | umap.UMAP(min_dist). (default: 0.5)
96 |
97 | Contributors: Fangming Xie, Aditya Chandrasekar, Wayne I. Doyle, Ethan J. Armand, Eran Mukamel.
98 |
99 | Contact: Eran Mukamel (emukamel@ucsd.edu).
100 | ```
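
The `--relaxation` bound above is simple arithmetic; here is a hedged sketch of the cap it implies (the helper below is ours, not part of the package):

```python
import math

def max_incoming_neighbors(k, n1, n2, relaxation=3.0):
    """Upper bound (k*N1/N2)*relaxation on the connections a cell in
    dataset 2 can receive when each of the N1 cells in dataset 1
    requests k neighbors in dataset 2 (relaxation >= 1)."""
    return math.ceil(k * n1 / n2 * relaxation)

# with the default k=20 and relaxation=3, 10,000 query cells and 2,000 target cells:
print(max_incoming_neighbors(20, 10_000, 2_000))  # 300
```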
101 |
102 | ### Example:
103 |
104 | #### Integrating L5 ET cells from four data modalities from the mouse primary motor cortex:
105 |
106 | `./example-MOp_L5ET` contains an example of integrating layer 5 extratelencephalic-projecting (L5 ET) neurons from 4 different datasets from the mouse primary motor cortex. The example directory includes the organized datasets, code, and results, and can be used as a template for similar tasks.
107 |
108 | After running SingleCellFusion on the example data with `run_scf.sh` in `example-MOp_L5ET` (a call to SingleCellFusion with default parameters), the notebook `./example-MOp_L5ET/visualize_results.ipynb` provides a step-by-step walkthrough of manipulating and plotting the integrated data. The plots it creates are shown below, and all the code required to generate them is included in the notebook.
109 |
110 | ```
111 | cd ./example-MOp_L5ET
112 | # shell script to run SingleCellFusion using example parameters
113 | ./run_scf.sh
114 | # visualize results
115 | jupyter notebook visualize_results.ipynb
116 | ```
117 |
118 | More example datasets and prepared `run_scf.sh` scripts are included in the repository to experiment with.
119 |
120 | #### Integrated Embedding and Clustering
121 |
122 | SingleCellFusion integrates the modalities and embeds the integrated space into common UMAP coordinates. Plotting these coordinates, coloring each data point, gives a rough view of how well the modalities are integrated.
123 |
124 | In the top panel, we plot the integrated UMAP space, coloring each point by its source modality.
125 |
126 | In the bottom panels, we color the integrated UMAP space by the joint clusters found. Each plot corresponds to a separate clustering resolution, set when calling SingleCellFusion.
127 |
128 | 
129 |
130 | #### Cell Distribution in Clusters
131 |
132 | To verify that each cluster has a similar composition of all of the datasets, we plot the dataset composition of each Joint Cluster as a bar chart.
133 |
134 | Each bar corresponds to a different cluster found in our Joint Clustering, colored by the original datasets. To check that each cluster has a relatively even composition of each dataset, we plot the overall composition of datasets next to our bar charts for comparison.
135 |
136 | To ensure that SingleCellFusion does not cluster cells by their source dataset, we want the composition of each cluster to be as close to the overall composition of the data as possible (defined by the sizes of the original modalities).
137 |
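A minimal pandas sketch of this check (the metadata columns below are hypothetical, not the notebook's actual variable names):

```python
import pandas as pd

# hypothetical per-cell metadata: joint cluster assignment and source dataset
meta = pd.DataFrame({
    "cluster": ["c0", "c0", "c0", "c1", "c1"],
    "dataset": ["10x_cells_v2", "smarter_cells", "snmcseq_gene",
                "10x_cells_v2", "smarter_cells"],
})

# fraction of each dataset within each joint cluster (each row sums to 1)
per_cluster = pd.crosstab(meta["cluster"], meta["dataset"], normalize="index")
# overall dataset composition, shown alongside for comparison
overall = meta["dataset"].value_counts(normalize=True)
print(per_cluster)
print(overall)
```
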
138 | 
139 |
140 | #### Cell Embedding Colored by Dataset
141 |
142 | SingleCellFusion integrates the modalities and embeds the integrated space into common UMAP coordinates. We want to show that the integration does not segregate cells by modality, but rather mixes them according to shared expression-level features.
143 |
144 | To do this, we plot each modality separately on the same UMAP space and check that each modality is evenly distributed across the space.
145 |
146 | 
147 |
148 | #### Cell Embedding Colored by Original Annotations/Cluster
149 |
150 | To see how the original clusters from individual modalities are preserved in the integrated clustering, we display the original cell-type annotations of individual datasets in our integrated space.
151 |
152 | For the given example, we will focus on displaying L5 PT (L5 ET or its equivalent annotation) cells. The majority of the cells in our example are labeled as L5 PT/ET in each individual dataset's clusters, with a few exceptions.
153 |
154 | 
155 |
156 | #### Confusion Matrices between Integrated Clustering and Individual Dataset Clustering
157 |
158 | To continue investigating how the original clusters from individual modalities are maintained in the integrated clusters, we can plot a confusion matrix showing how the clusters of individual datasets are re-organized into new integrated clusters.
159 |
160 | The columns of each confusion matrix show the cluster labels of the individual dataset. If no such cluster labels exist, a single column is shown, so we can still examine the distribution of cells across the Joint Clusters.
161 |
162 | The y-axis shows the three clusters identified in the integrated dataset.
163 |
164 | The confusion matrices in the first row of the figure are normalized by Joint Clusters (the sum of each matrix row is 1).
165 |
166 | The confusion matrices in the second row of the figure are normalized by original clusters (the sum of each matrix column is 1).
167 |
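A minimal sketch of the two normalizations using `pandas.crosstab` (the labels are hypothetical):

```python
import pandas as pd

# hypothetical labels for the same five cells
joint = pd.Series(["joint-0", "joint-0", "joint-1", "joint-1", "joint-2"], name="joint")
orig = pd.Series(["L5 PT", "L5 IT", "L5 PT", "L5 PT", "L5 NP"], name="original")

# normalized by joint clusters: each row sums to 1
print(pd.crosstab(joint, orig, normalize="index"))
# normalized by original clusters: each column sums to 1
print(pd.crosstab(joint, orig, normalize="columns"))
```
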
168 | 
169 |
--------------------------------------------------------------------------------
/docs/changelog.rst:
--------------------------------------------------------------------------------
1 | Changelog
2 | ================
3 | * Version 1.0.0 - 2019-11-08:
4 |     * First stable release!
5 |     * Basic example of how to use SingleCellFusion was added to the README
6 |     * Minor formatting and documentation fixes throughout
7 | * Version 0.9.0 - 2019-11-07:
8 |     * Finalized tests before the first stable release
9 |     * Fixed an indexing issue in the low memory version of data integration
10 | * Version 0.8.0 - 2019-11-05:
11 |     * Performed debugging
12 |     * Added a wrapper for all steps
13 | * Version 0.7.0 - 2019-10-14:
14 |     * Added low memory versions of the integration functions
15 | * Version 0.6.0 - 2019-10-04:
16 |     * Added functions to search for common, variable features
17 |     * Added the ability to perform high and low memory mean/standard deviation of loom files
18 | * Version 0.5.0 - 2019-10-03:
19 |     * Fixed low and high memory versions of kNN
20 |     * Added integration function (currently only in high memory format)
21 |     * Added high memory PCA
22 | * Version 0.4.0 - 2019-09-30:
23 |     * Added preliminary MNN functions
24 |     * Removed recipes and integration functions pending updates
25 | * Version 0.3.0 - 2019-09-25:
26 |     * Major overhaul to make SingleCellFusion more user friendly
27 |     * Added low and high memory versions of constrained kNN search
28 |     * Removed MNN method pending version 0.4.0
29 | * Version 0.2.0 - 2019-09-19:
30 |     * Initialized changelog and versioning; many changes had accumulated since the previous version
31 |     * Fixed a number of bugs
32 | * Version 0.1.0 - 2018-09-11:
33 |     * Initial release of SingleCellFusion
34 |
35 |
--------------------------------------------------------------------------------
/docs/faqs.rst:
--------------------------------------------------------------------------------
1 | FAQs
2 | ================
3 | SingleCellFusion is under active development and function names and parameters will continue to be
4 | changed until a stable release is reached. In the interim, we have provided some answers to common
5 | questions and problems that can occur when using SingleCellFusion.
6 |
7 | Why do you use loom files and how do I make one?
8 | -------------------------------------------------
9 | The loom file format allows SingleCellFusion to have a low memory footprint when analyzing large data
10 | sets (such as 10x Genomics scRNA-seq data), and keeps all of the metadata in one centralized location.
11 | The loompy package was developed by the Sten Linnarsson group and has excellent documentation at
12 | `loompy.org <http://loompy.org>`_.
13 |
14 | Within a loom file, features are stored in rows and cells in columns. As an example, suppose you have
15 | a pandas dataframe (df) in which the features are in rows and cells are in columns. The index of
16 | this dataframe contains the unique feature IDs and the column header contains unique cell IDs. A loom file
17 | can be generated with the following code::
18 |
19 | import loompy
20 | loompy.create(filename=filename,
21 |               layers={'': df.values},
22 |               row_attrs={'Accession': df.index.values},
23 |               col_attrs={'CellID': df.columns.values})
24 |
25 | Why is my code using so much memory, even with the low_mem flag?
26 | -----------------------------------------------------------------
27 | Access to loom files is performed in batches to reduce memory overhead. In the basic recipe for
28 | SingleCellFusion (pairwise_impute), the size of these batches is controlled by the parameters batch_x and
29 | batch_y. If you are having memory issues, try reducing these values to lower your memory
30 | overhead.
31 |
32 | Why is my code using so many threads?
33 | --------------------------------------
34 | This is a known problem with Numpy (see the relevant Numpy issue on GitHub). You can solve this
35 | issue in two ways.
36 |
37 | The easiest way is to run the following lines on your command line before running any Python scripts or notebooks::
38 |
39 | export OMP_NUM_THREADS=1
40 | export OPENBLAS_NUM_THREADS=1
41 | export MKL_NUM_THREADS=1
42 | export VECLIB_MAXIMUM_THREADS=1
43 | export NUMEXPR_NUM_THREADS=1
44 |
45 | You can also run the code below in the first cell of a Python notebook or at the beginning of a Python script. It must
46 | be run before importing any other packages (including MoP)::
47 |
48 | import os
49 | os.environ["OMP_NUM_THREADS"] = '1'
50 | os.environ["OPENBLAS_NUM_THREADS"] = '1'
51 | os.environ["MKL_NUM_THREADS"] = '1'
52 | os.environ["VECLIB_MAXIMUM_THREADS"] = '1'
53 | os.environ["NUMEXPR_NUM_THREADS"] = '1'
54 |
55 | You can specify the maximum number of threads that you want to use in that script or notebook by changing the value
56 | from 1 to your desired integer.
57 |
58 | This information comes from a StackOverflow question on limiting the number of
59 | threads used by Numpy.
60 |
61 | Why is my code running slow with the low_mem flag?
62 | --------------------------------------------------
63 | Although the loom file format has a number of benefits, accessing and processing data in the file
64 | gets progressively slower as more data is added to the file. If you find that your code is
65 | running too slowly, it can be helpful to make a second loom file containing just the data relevant to running
66 | SingleCellFusion.
67 |
68 | Another cause of slow code is a batch size that is too small (see "Why is my code using so much
69 | memory?" above). If you are not having memory issues, we recommend increasing the batch size
70 | to speed up the code.
71 |
72 | Why am I not finding many neighbors?
73 | -------------------------------------
74 | If you expect that similar cell types should be present in both data sets, this could be due to
75 | the sparseness of your data. We have found that it helps to first smooth your data (we highly
76 | recommend using MAGIC). You can then use the
77 | smoothed data to find nearest neighbors, and impute on the observed data. A tutorial using our
78 | loom-based method of smoothing will be uploaded soon.
79 |
80 |
81 | Is SingleCellFusion just for integrating data from different sequencing modalities?
82 | -----------------------------------------------------------------------------------
83 | No, theoretically this pipeline could be applied to integration across species or to find common cell
84 | types across different research studies using the same sequencing technology. This is an active area
85 | of development.
86 |
87 | What happens if a cell type is present in only one modality?
88 | -------------------------------------------------------------
89 | In our experience, this situation is easily detectable. If the analysis is performed only on direct
90 | mutual nearest neighbors, these cells will not make nearest neighbors and will be dropped from the analysis.
91 | If the imputation is performed with the rescue method, these cells will still not make mutual nearest neighbors.
92 | Their imputed counts will then come from their mutual nearest neighbors within their own data set. These
93 | imputed counts will not be similar to any observed counts, and these cells will self-segregate into their
94 | own clusters and will be visually separate on a t-SNE or UMAP embedding. For the kNN method, these cells will
95 | make weak connections with a number of different cell types. This will lead to the imputation of counts that
96 | are not similar to any observed counts, also leading to segregation into unique clusters.
97 |
98 |
99 |
--------------------------------------------------------------------------------
/docs/knn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/knn.png
--------------------------------------------------------------------------------
/docs/mnn_direct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/mnn_direct.png
--------------------------------------------------------------------------------
/docs/mnn_equation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/mnn_equation.png
--------------------------------------------------------------------------------
/docs/mnn_rescue.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/mnn_rescue.png
--------------------------------------------------------------------------------
/docs/n_neighbors_knn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/n_neighbors_knn.png
--------------------------------------------------------------------------------
/docs/rescue_equation_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/rescue_equation_1.png
--------------------------------------------------------------------------------
/docs/rescue_equation_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/rescue_equation_2.png
--------------------------------------------------------------------------------
/docs/rescue_equation_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/rescue_equation_3.png
--------------------------------------------------------------------------------
/docs/results/SingleCellFusion_plot_1_joint_embedding_and_clusterings_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/results/SingleCellFusion_plot_1_joint_embedding_and_clusterings_overview.png
--------------------------------------------------------------------------------
/docs/results/SingleCellFusion_plot_2_hist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/results/SingleCellFusion_plot_2_hist.png
--------------------------------------------------------------------------------
/docs/results/SingleCellFusion_plot_3_embedding_by_dataset.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/results/SingleCellFusion_plot_3_embedding_by_dataset.png
--------------------------------------------------------------------------------
/docs/results/SingleCellFusion_plot_4_embedding_by_individual_mod_clusterings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/results/SingleCellFusion_plot_4_embedding_by_individual_mod_clusterings.png
--------------------------------------------------------------------------------
/docs/results/SingleCellFusion_plot_5_confmat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/results/SingleCellFusion_plot_5_confmat.png
--------------------------------------------------------------------------------
/docs/scf_description.rst:
--------------------------------------------------------------------------------
1 | How does SingleCellFusion work?
2 | ================================
3 | SingleCellFusion is built around the idea that for a cell profiled by a given omics technique (RNA-sequencing,
4 | snATAC-sequencing, snmC-sequencing) there are unobserved features of that cell that, if sampled, would
5 | provide a fuller picture of that cell's identity. For example, if a cell underwent RNA-sequencing we know
6 | what genes are expressed but we don't know the patterns of DNA methylation in that same cell. The methylation
7 | status of DNA in that cell is unobserved, limiting our ability to fully understand the identity of that cell.
8 |
9 | In an ideal world we would obtain the transcriptome, methylome, and chromatin accessibility of a single
10 | cell at once; while the technologies for this type of experiment develop, SingleCellFusion can provide a
11 | computational equivalent. SingleCellFusion uses known relationships between different types of multiomics
12 | data to impute unobserved data, enabling the multimodal analysis of a cell's identity.
13 |
14 | The core of SingleCellFusion is the generation of a nearest neighbors graph between different data sets.
15 | This graph is generated by finding nearest neighbors using the correlation of counts at highly variable
16 | features. For example, DNA methylation is known to be negatively correlated with gene expression. If a
17 | snmC-seq profiled cell has low methylation at a number of highly variable genes, and a snRNA-seq profiled
18 | cell has high gene expression at those same genes, we can assume that those two cells likely belong to the
19 | same cell type. We use this nearest neighbors graph to generate imputed counts by averaging among a cell's
20 | neighbors in the opposite modality. The actions of SingleCellFusion depend on the type of nearest neighbor
21 | graph specified, and are described below.
22 |
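As a rough illustration of this idea, here is a minimal numpy sketch (the variable names and the
sign flip for methylation are ours, not the package's API)::

    import numpy as np

    # toy cells-by-features matrices on shared highly variable features
    rng = np.random.default_rng(0)
    X_rna = rng.random((100, 50))  # e.g. normalized RNA counts
    X_mc = rng.random((80, 50))    # e.g. gene-body methylation levels

    def zscore_rows(X):
        # z-score each cell so a scaled dot product is a Pearson correlation
        X = X - X.mean(axis=1, keepdims=True)
        return X / X.std(axis=1, keepdims=True)

    corr = zscore_rows(X_rna) @ zscore_rows(X_mc).T / X_rna.shape[1]
    corr = -corr  # methylation is anti-correlated with expression

    # k nearest mC neighbors per RNA cell, then impute by averaging over them
    k = 5
    nn = np.argsort(-corr, axis=1)[:, :k]
    imputed_mc = X_mc[nn].mean(axis=1)  # (100 cells, 50 features)
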
23 | Direct mutual nearest neighbors
24 | -------------------------------
25 | .. image:: mnn_direct.png
26 | :width: 400
27 | :alt: cartoon of direct MNN
28 |
29 | In this method, highly variable features are identified in each data set. On a cell-to-cell basis the
30 | correlation of counts at highly variable features is calculated. These correlation values are used
31 | as the distance metric for identifying mutual neighbors.
32 |
33 | Once the correlation is calculated, neighbors across modalities are determined. We require that
34 | the two cells in each pair have high correlation with each other. In other words, a snmC-seq profiled
35 | cell can only be a neighbor with a scRNA-seq cell if the methylation levels at the highly variable
36 | features are strongly anti-correlated with gene expression at those same features in the scRNA-seq
37 | profiled cell, and vice versa. This ensures that only strong neighbors are found and that the
38 | nearest neighbors graph is not dominated by noisy or spurious correlations.
39 |
40 | Once the neighbor graph is generated, imputed counts are computed by the following equation:
41 |
42 | .. image:: mnn_equation.png
43 | :width: 400
44 | :alt: equation for imputation by MNN
45 |
46 | For cell *j* in modality *m* which has direct mutual nearest neighbors with cells in modality
47 | *m'*, the imputed *m'* counts for feature *f* are given by the average over its *k* nearest
48 | neighbors in modality *m'*.
49 |
50 | This is the most conservative method for generating imputed counts: only cells that make direct mutual
51 | nearest neighbors will receive imputed data. This method typically leads to good integration but can
52 | result in the loss of large fractions of cells from the analysis if mutual neighbors were not found for them.
53 |
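A minimal sketch of the mutual constraint, assuming a cross-modality correlation matrix ``corr``
like the one above (ours, not the package's implementation)::

    import numpy as np

    def mutual_nn_pairs(corr, k):
        # top-k neighbors in each direction
        nn_ab = np.argsort(-corr, axis=1)[:, :k]    # m -> m'
        nn_ba = np.argsort(-corr.T, axis=1)[:, :k]  # m' -> m
        ab = {(i, j) for i in range(corr.shape[0]) for j in nn_ab[i]}
        ba = {(i, j) for j in range(corr.shape[1]) for i in nn_ba[j]}
        return ab & ba  # keep only pairs that chose each other
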
54 |
55 | Mutual nearest neighbors with rescue
56 | -------------------------------------
57 | .. image:: mnn_rescue.png
58 | :width: 400
59 | :alt: cartoon of rescue MNN
60 |
61 | As with the direct method, the distance between a pair of cells is their correlation at
62 | highly variable genes. The only difference with this method is that in addition to a mutual
63 | nearest neighbors graph between modalities, a mutual nearest neighbor graph within each modality
64 | is also generated. This within modality graph allows for imputation to be performed on all cells, by
65 | using the within modality neighbors to determine what the best matched neighbors are across
66 | modalities.
67 |
68 | .. image:: rescue_equation_1.png
69 | :width: 400
70 | :alt: equation 1 of rescue
71 |
72 | where
73 |
74 | .. image:: rescue_equation_2.png
75 | :width: 400
76 | :alt: equation 2 of rescue
77 |
78 | For a cell *l* in modality *m*, which has no direct mutual neighbors with cells in modality
79 | *m'*, the imputed *m'* counts for feature *f* are given by a weighted average over its *k*
80 | nearest neighbors in modality *m* which have direct mutual neighbors with cells in modality
81 | *m'*. The cells with direct mutual nearest neighbors have imputed counts per the equation in
82 | "Direct mutual nearest neighbors:"
83 |
84 | .. image:: mnn_equation.png
85 | :width: 400
86 | :alt: equation for imputation by MNN
87 |
88 | The weights *A(l,j)* are determined by the distance *d(l,j)* between *l* and *j* via the following
89 | equation:
90 |
91 | .. image:: rescue_equation_3.png
92 | :width: 400
93 | :alt: equation 3 of rescue
94 |
95 | This is a more lenient method for generating imputed counts, as all cells receive imputed
96 | data. It enables all cells to be analyzed, and is our recommended approach.
97 |
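As a sketch of the rescue step (the decaying weight function below is an illustrative stand-in
for the equation above, not its exact form)::

    import numpy as np

    def rescue_impute(distances, neighbor_imputed):
        # weighted average over within-modality neighbors that already have
        # MNN-imputed counts; weights decay with distance and sum to 1
        w = np.exp(-np.asarray(distances, dtype=float))  # assumed decay
        w = w / w.sum()
        return w @ np.asarray(neighbor_imputed, dtype=float)

    # cell *l* with 3 rescuing neighbors and 2 imputed features
    print(rescue_impute([0.1, 0.5, 0.9], [[1., 2.], [2., 4.], [3., 6.]]))
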
98 | k-nearest neighbors
99 | -------------------
100 | .. image:: knn.png
101 | :width: 400
102 | :alt: cartoon of kNN
103 |
104 | Similar to the other methods, the distance metric between two cells is the correlation at
105 | highly variable features. The major difference with this method is that each cell is required to make
106 | *k* neighbors in the opposite modality, with the restriction that a cell in the opposite modality
107 | may make at most a set number *j* of neighbors. The maximum number of neighbors that a cell
108 | in the opposite modality can make is given by the equation:
109 |
110 | .. image:: n_neighbors_knn.png
111 | :width: 200
112 | :alt: equation 1 of knn
113 |
114 | where *j* is the maximum number of neighbors a cell in modality *m'* can make, *k* is the required
115 | number of nearest neighbors per cell in modality *m*, *n*\ :sub:`m`\ is the number of cells in
116 | modality *m*, and *n*\ :sub:`m'`\ is the number of cells in modality *m'*. *z* is a relaxation
117 | parameter that restricts cells from becoming hyperconnected. The neighbor graph is created by randomly
118 | iterating through each cell and finding its *k* nearest neighbors that are below the maximum
119 | connection threshold. Once the nearest neighbor graph is generated, imputed counts are generated by the same
120 | equation as in "Direct mutual nearest neighbors":
121 |
122 | .. image:: mnn_equation.png
123 | :width: 400
124 | :alt: equation for imputation by MNN
125 |
126 | This is the most lenient method for generating imputed counts, as all cells will make neighbors
127 | in the opposite data set.
128 |
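A greedy sketch of this constrained search (ours, not the package's implementation)::

    import numpy as np

    def constrained_knn(corr, k, cap, seed=0):
        # visit cells of modality m in random order; each takes its k best
        # neighbors in m' whose incoming degree is still below the cap
        rng = np.random.default_rng(seed)
        incoming = np.zeros(corr.shape[1], dtype=int)
        edges = []
        for i in rng.permutation(corr.shape[0]):
            picked = 0
            for j in np.argsort(-corr[i]):  # best candidates first
                if incoming[j] < cap:
                    edges.append((i, j))
                    incoming[j] += 1
                    picked += 1
                    if picked == k:
                        break
        return edges
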
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: scf_terra
2 | channels:
3 | - bioconda
4 | - conda-forge
5 | - defaults
6 | dependencies:
7 | - _libgcc_mutex=0.1=conda_forge
8 | - _openmp_mutex=4.5=1_llvm
9 | - anndata=0.6.22.post1=py_0
10 | - attrs=19.3.0=py_0
11 | - backcall=0.1.0=py_0
12 | - blas=1.0=openblas
13 | - bleach=3.1.1=py_0
14 | - blosc=1.16.3=hd408876_0
15 | - bzip2=1.0.8=h7b6447c_0
16 | - ca-certificates=2021.4.13=h06a4308_1
17 | - cairo=1.16.0=hfb77d84_1002
18 | - certifi=2020.12.5=py38h06a4308_0
19 | - cycler=0.10.0=py_2
20 | - dbus=1.13.6=he372182_0
21 | - decorator=4.4.1=py_0
22 | - defusedxml=0.6.0=py_0
23 | - entrypoints=0.3=py38_1000
24 | - expat=2.2.9=he1b5a44_2
25 | - fbpca=1.0=py_0
26 | - fontconfig=2.13.1=h86ecdb6_1001
27 | - freetype=2.10.0=he983fc9_1
28 | - gettext=0.19.8.1=hc5be6a0_1002
29 | - glib=2.58.3=py38h6f030ca_1002
30 | - gmp=6.2.0=he1b5a44_2
31 | - gst-plugins-base=1.14.5=h0935bb2_2
32 | - gstreamer=1.14.5=h36ae1b5_2
33 | - h5py=2.10.0=py38h7918eee_0
34 | - hdf5=1.10.4=hb1b8bf9_0
35 | - icu=64.2=he1b5a44_1
36 | - igraph=0.7.1=h9e3b1fc_1007
37 | - importlib_metadata=1.5.0=py38_0
38 | - ipykernel=5.1.4=py38h5ca1d4c_0
39 | - ipython=7.12.0=py38h5ca1d4c_0
40 | - ipython_genutils=0.2.0=py_1
41 | - jedi=0.16.0=py38_0
42 | - jinja2=2.11.1=py_0
43 | - joblib=0.14.1=py_0
44 | - jpeg=9c=h14c3975_1001
45 | - json5=0.9.0=py_0
46 | - jsonschema=3.2.0=py38_0
47 | - jupyter_client=5.3.4=py38_1
48 | - jupyter_contrib_core=0.3.3=py_2
49 | - jupyter_contrib_nbextensions=0.5.1=py38_0
50 | - jupyter_core=4.6.3=py38_0
51 | - jupyter_highlight_selected_word=0.2.0=py38_1000
52 | - jupyter_latex_envs=1.4.6=py38_1000
53 | - jupyter_nbextensions_configurator=0.4.1=py38_0
54 | - jupyterlab=1.2.6=py_0
55 | - jupyterlab_server=1.0.6=py_0
56 | - kiwisolver=1.1.0=py38hc9558a2_0
57 | - ld_impl_linux-64=2.33.1=h53a641e_8
58 | - leidenalg=0.7.0=py38he1b5a44_1
59 | - libblas=3.8.0=15_openblas
60 | - libcblas=3.8.0=15_openblas
61 | - libclang=9.0.1=default_hde54327_0
62 | - libffi=3.2.1=he1b5a44_1006
63 | - libgcc-ng=9.2.0=h24d8f2e_2
64 | - libgfortran-ng=7.3.0=hdf63c60_5
65 | - libiconv=1.15=h516909a_1005
66 | - liblapack=3.8.0=15_openblas
67 | - libllvm8=8.0.1=hc9558a2_0
68 | - libllvm9=9.0.1=hc9558a2_0
69 | - libopenblas=0.3.8=h5ec1e0e_0
70 | - libpng=1.6.37=hed695b0_0
71 | - libsodium=1.0.17=h516909a_0
72 | - libstdcxx-ng=9.2.0=hdf63c60_2
73 | - libuuid=2.32.1=h14c3975_1000
74 | - libxcb=1.13=h14c3975_1002
75 | - libxkbcommon=0.10.0=he1b5a44_0
76 | - libxml2=2.9.10=hee79883_0
77 | - libxslt=1.1.33=h31b3aaa_0
78 | - llvm-openmp=9.0.1=hc9558a2_2
79 | - llvmlite=0.31.0=py38h8b12597_0
80 | - lxml=4.5.0=py38hbb43d70_1
81 | - lz4-c=1.8.1.2=h14c3975_0
82 | - lzo=2.10=h7b6447c_2
83 | - markupsafe=1.1.1=py38h516909a_0
84 | - matplotlib=3.1.3=py38_0
85 | - matplotlib-base=3.1.3=py38h250f245_0
86 | - mistune=0.8.4=py38h516909a_1000
87 | - mock=4.0.3=pyhd3eb1b0_0
88 | - natsort=7.1.1=pyhd3eb1b0_0
89 | - nbconvert=5.6.1=py38_0
90 | - nbformat=5.0.4=py_0
91 | - ncurses=6.1=hf484d3e_1002
92 | - notebook=6.0.3=py38_0
93 | - nspr=4.25=he1b5a44_0
94 | - nss=3.47=he751ad9_0
95 | - numba=0.48.0=py38hb3f55d8_0
96 | - numexpr=2.7.3=py38h4be448d_1
97 | - numpy=1.18.1=py38h95a1406_0
98 | - openssl=1.1.1k=h27cfd23_0
99 | - pandas=1.0.1=py38hb3f55d8_0
100 | - pandoc=2.9.2=0
101 | - pandocfilters=1.4.2=py_1
102 | - parso=0.6.1=py_0
103 | - patsy=0.5.1=py_0
104 | - pcre=8.44=he1b5a44_0
105 | - pexpect=4.8.0=py38_0
106 | - pickleshare=0.7.5=py38_1000
107 | - pip=20.0.2=py_2
108 | - pixman=0.38.0=h516909a_1003
109 | - prometheus_client=0.7.1=py_0
110 | - prompt_toolkit=3.0.3=py_0
111 | - pthread-stubs=0.4=h14c3975_1001
112 | - ptyprocess=0.6.0=py_1001
113 | - pycairo=1.19.1=py38h438ddbb_0
114 | - pygments=2.5.2=py_0
115 | - pyparsing=2.4.6=py_0
116 | - pyqt=5.12.3=py38hcca6a23_1
117 | - pyrsistent=0.15.7=py38h516909a_0
118 | - pytables=3.6.1=py38h9fd0a39_0
119 | - python=3.8.1=h357f687_2
120 | - python-dateutil=2.8.1=py_0
121 | - python-igraph=0.8.0=py38h9e3b1fc_0
122 | - python_abi=3.8=1_cp38
123 | - pytz=2019.3=py_0
124 | - pyyaml=5.3.1=py38h1e0a361_0
125 | - pyzmq=18.1.1=py38h1768529_0
126 | - qt=5.12.5=hd8c4c69_1
127 | - readline=8.0=hf8c457e_0
128 | - scikit-learn=0.22.1=py38hcdab131_1
129 | - scipy=1.4.1=py38h921218d_0
130 | - seaborn=0.10.0=py_1
131 | - send2trash=1.5.0=py_0
132 | - setuptools=45.2.0=py38_0
133 | - six=1.14.0=py38_0
134 | - snappy=1.1.8=he6710b0_0
135 | - sqlite=3.30.1=hcee41ef_0
136 | - statsmodels=0.11.1=py38h516909a_0
137 | - terminado=0.8.3=py38_0
138 | - testpath=0.4.4=py_0
139 | - texttable=1.6.2=py_0
140 | - tk=8.6.10=hed695b0_0
141 | - tornado=6.0.3=py38h516909a_4
142 | - tqdm=4.59.0=pyhd3eb1b0_1
143 | - traitlets=4.3.3=py38_0
144 | - umap-learn=0.3.10=py38_1
145 | - wcwidth=0.1.8=py_0
146 | - webencodings=0.5.1=py_1
147 | - wheel=0.34.2=py_1
148 | - xorg-kbproto=1.0.7=h14c3975_1002
149 | - xorg-libice=1.0.10=h516909a_0
150 | - xorg-libsm=1.2.3=h84519dc_1000
151 | - xorg-libx11=1.6.9=h516909a_0
152 | - xorg-libxau=1.0.9=h14c3975_0
153 | - xorg-libxdmcp=1.1.3=h516909a_0
154 | - xorg-libxext=1.3.4=h516909a_0
155 | - xorg-libxrender=0.9.10=h516909a_1002
156 | - xorg-renderproto=0.11.1=h14c3975_1002
157 | - xorg-xextproto=7.3.0=h14c3975_1002
158 | - xorg-xproto=7.0.31=h14c3975_1007
159 | - xz=5.2.4=h14c3975_1001
160 | - yaml=0.2.4=h516909a_0
161 | - zeromq=4.3.2=he1b5a44_2
162 | - zipp=3.0.0=py_0
163 | - zlib=1.2.11=h516909a_1006
164 | - zstd=1.3.7=h0b5b093_0
165 | - pip:
166 | - annoy==1.16.3
167 | - pyqt5-sip==4.19.18
168 | - pyqtwebengine==5.12.1
169 |
--------------------------------------------------------------------------------
/environment_mini.yml:
--------------------------------------------------------------------------------
1 | name: scf_mini
2 | channels:
3 | - hcc
4 | - conda-forge
5 | - defaults
6 | dependencies:
7 | - _libgcc_mutex=0.1=main
8 | - _openmp_mutex=4.5=1_gnu
9 | - anndata=0.7.6=py39hf3d152e_0
10 | - arpack=3.7.0=hc6cf775_2
11 | - blas=1.0=mkl
12 | - brotli=1.0.9=h7f98852_5
13 | - brotli-bin=1.0.9=h7f98852_5
14 | - ca-certificates=2021.5.30=ha878542_0
15 | - cached-property=1.5.2=py_0
16 | - certifi=2021.5.30=py39hf3d152e_0
17 | - cycler=0.10.0=py_2
18 | - dbus=1.13.6=he372182_0
19 | - expat=2.4.1=h9c3ff4c_0
20 | - fbpca=1.0=py_0
21 | - fontconfig=2.13.1=hba837de_1005
22 | - fonttools=4.25.0=pyhd3eb1b0_0
23 | - freetype=2.10.4=h0708190_1
24 | - glib=2.69.1=h5202010_0
25 | - glpk=4.65=h9202a9a_1004
26 | - gmp=6.2.1=h58526e2_0
27 | - gst-plugins-base=1.14.0=hbbd80ab_1
28 | - gstreamer=1.14.0=h28cd5cc_2
29 | - h5py=3.2.1=py39h6c542dc_0
30 | - hdf5=1.10.6=hb1b8bf9_0
31 | - icu=58.2=hf484d3e_1000
32 | - igraph=0.9.4=ha184e22_0
33 | - intel-openmp=2021.3.0=h06a4308_3350
34 | - jbig=2.1=h7f98852_2003
35 | - joblib=1.0.1=pyhd8ed1ab_0
36 | - jpeg=9d=h36c2ea0_0
37 | - kiwisolver=1.3.1=py39h2531618_0
38 | - lcms2=2.12=hddcbb42_0
39 | - ld_impl_linux-64=2.35.1=h7274673_9
40 | - leidenalg=0.8.7=py39he80948d_0
41 | - lerc=2.2.1=h9c3ff4c_0
42 | - libblas=3.9.0=11_linux64_mkl
43 | - libbrotlicommon=1.0.9=h7f98852_5
44 | - libbrotlidec=1.0.9=h7f98852_5
45 | - libbrotlienc=1.0.9=h7f98852_5
46 | - libcblas=3.9.0=11_linux64_mkl
47 | - libdeflate=1.7=h7f98852_5
48 | - libffi=3.3=he6710b0_2
49 | - libgcc-ng=9.3.0=h5101ec6_17
50 | - libgfortran-ng=7.5.0=ha8ba4b0_17
51 | - libgfortran4=7.5.0=ha8ba4b0_17
52 | - libgomp=9.3.0=h5101ec6_17
53 | - liblapack=3.9.0=11_linux64_mkl
54 | - libllvm10=10.0.1=he513fc3_3
55 | - libpng=1.6.37=h21135ba_2
56 | - libstdcxx-ng=9.3.0=hd4cf53a_17
57 | - libtiff=4.3.0=hf544144_1
58 | - libuuid=2.32.1=h7f98852_1000
59 | - libwebp-base=1.2.0=h27cfd23_0
60 | - libxcb=1.13=h7f98852_1003
61 | - libxml2=2.9.12=h03d6c58_0
62 | - llvmlite=0.36.0=py39h612dafd_4
63 | - lz4-c=1.9.3=h9c3ff4c_1
64 | - matplotlib=3.4.2=py39hf3d152e_0
65 | - matplotlib-base=3.4.2=py39hab158f2_0
66 | - metis=5.1.0=h58526e2_1006
67 | - mkl=2021.3.0=h06a4308_520
68 | - mkl-service=2.4.0=py39h7f8727e_0
69 | - mkl_fft=1.3.0=py39h42c9631_2
70 | - mkl_random=1.2.2=py39h51133e4_0
71 | - mpfr=4.1.0=h9202a9a_1
72 | - munkres=1.1.4=pyh9f0ad1d_0
73 | - natsort=7.1.1=pyhd8ed1ab_0
74 | - ncurses=6.2=he6710b0_1
75 | - numba=0.53.1=py39h56b8d98_1
76 | - numpy=1.20.3=py39hf144106_0
77 | - numpy-base=1.20.3=py39h74d4b33_0
78 | - olefile=0.46=pyh9f0ad1d_1
79 | - openjpeg=2.4.0=hb52868f_1
80 | - openssl=1.1.1k=h7f98852_0
81 | - packaging=21.0=pyhd8ed1ab_0
82 | - pandas=1.3.0=py39hde0f152_0
83 | - patsy=0.5.1=py_0
84 | - pcre=8.45=h9c3ff4c_0
85 | - pillow=8.3.1=py39ha612740_0
86 | - pip=21.2.4=py37h06a4308_0
87 | - pthread-stubs=0.4=h36c2ea0_1001
88 | - pynndescent=0.5.4=pyh6c4a22f_0
89 | - pyparsing=2.4.7=pyh9f0ad1d_0
90 | - pyqt=5.9.2=py39h2531618_6
91 | - python=3.9.6=h12debd9_1
92 | - python-annoy=1.17.0=py39he80948d_2
93 | - python-dateutil=2.8.2=pyhd8ed1ab_0
94 | - python-igraph=0.9.6=py39hfef886c_0
95 | - python_abi=3.9=2_cp39
96 | - pytz=2021.1=pyhd8ed1ab_0
97 | - qt=5.9.7=h5867ecd_1
98 | - readline=8.1=h27cfd23_0
99 | - scikit-learn=0.24.2=py39h4dfa638_0
100 | - scipy=1.6.2=py39had2a1c9_1
101 | - seaborn=0.11.2=hd8ed1ab_0
102 | - seaborn-base=0.11.2=pyhd8ed1ab_0
103 | - setuptools=52.0.0=py39h06a4308_0
104 | - sip=4.19.13=py39h2531618_0
105 | - six=1.16.0=pyhd3eb1b0_0
106 | - sqlite=3.36.0=hc218d9a_0
107 | - statsmodels=0.12.2=py39hce5d2b2_0
108 | - suitesparse=5.10.1=hd8046ac_0
109 | - tbb=2020.2=h4bd325d_4
110 | - texttable=1.6.4=pyhd8ed1ab_0
111 | - threadpoolctl=2.2.0=pyh8a188c0_0
112 | - tk=8.6.10=hbc83047_0
113 | - tornado=6.1=py39h3811e60_1
114 | - tzdata=2021a=h5d7bf9c_0
115 | - umap-learn=0.5.1=py39hf3d152e_1
116 | - wheel=0.37.0=pyhd3eb1b0_0
117 | - xorg-libxau=1.0.9=h7f98852_0
118 | - xorg-libxdmcp=1.1.3=h7f98852_0
119 | - xz=5.2.5=h7b6447c_0
120 | - zlib=1.2.11=h7b6447c_3
121 | - zstd=1.5.0=ha95c52a_0
122 |
--------------------------------------------------------------------------------
/environment_mini_pegasus.yml:
--------------------------------------------------------------------------------
1 | name: scf_mini_pegasus
2 | channels:
3 | - hcc
4 | - conda-forge
5 | - defaults
6 | dependencies:
7 | - _libgcc_mutex=0.1=main
8 | - _openmp_mutex=4.5=1_gnu
9 | - anndata=0.7.6=py39hf3d152e_0
10 | - arpack=3.7.0=hc6cf775_2
11 | - blas=1.0=mkl
12 | - brotli=1.0.9=h7f98852_5
13 | - brotli-bin=1.0.9=h7f98852_5
14 | - ca-certificates=2021.5.30=ha878542_0
15 | - cached-property=1.5.2=py_0
16 | - certifi=2021.5.30=py39hf3d152e_0
17 | - cycler=0.10.0=py_2
18 | - dbus=1.13.6=he372182_0
19 | - expat=2.4.1=h9c3ff4c_0
20 | - fbpca=1.0=py_0
21 | - fontconfig=2.13.1=hba837de_1005
22 | - fonttools=4.25.0=pyhd3eb1b0_0
23 | - freetype=2.10.4=h0708190_1
24 | - glib=2.69.1=h5202010_0
25 | - glpk=4.65=h9202a9a_1004
26 | - gmp=6.2.1=h58526e2_0
27 | - gst-plugins-base=1.14.0=hbbd80ab_1
28 | - gstreamer=1.14.0=h28cd5cc_2
29 | - h5py=3.2.1=py39h6c542dc_0
30 | - hdf5=1.10.6=hb1b8bf9_0
31 | - icu=58.2=hf484d3e_1000
32 | - igraph=0.9.4=ha184e22_0
33 | - intel-openmp=2021.3.0=h06a4308_3350
34 | - jbig=2.1=h7f98852_2003
35 | - joblib=1.0.1=pyhd8ed1ab_0
36 | - jpeg=9d=h36c2ea0_0
37 | - kiwisolver=1.3.1=py39h2531618_0
38 | - lcms2=2.12=hddcbb42_0
39 | - ld_impl_linux-64=2.35.1=h7274673_9
40 | - leidenalg=0.8.7=py39he80948d_0
41 | - lerc=2.2.1=h9c3ff4c_0
42 | - libblas=3.9.0=11_linux64_mkl
43 | - libbrotlicommon=1.0.9=h7f98852_5
44 | - libbrotlidec=1.0.9=h7f98852_5
45 | - libbrotlienc=1.0.9=h7f98852_5
46 | - libcblas=3.9.0=11_linux64_mkl
47 | - libdeflate=1.7=h7f98852_5
48 | - libffi=3.3=he6710b0_2
49 | - libgcc-ng=9.3.0=h5101ec6_17
50 | - libgfortran-ng=7.5.0=ha8ba4b0_17
51 | - libgfortran4=7.5.0=ha8ba4b0_17
52 | - libgomp=9.3.0=h5101ec6_17
53 | - liblapack=3.9.0=11_linux64_mkl
54 | - libllvm10=10.0.1=he513fc3_3
55 | - libpng=1.6.37=h21135ba_2
56 | - libstdcxx-ng=9.3.0=hd4cf53a_17
57 | - libtiff=4.3.0=hf544144_1
58 | - libuuid=2.32.1=h7f98852_1000
59 | - libwebp-base=1.2.0=h27cfd23_0
60 | - libxcb=1.13=h7f98852_1003
61 | - libxml2=2.9.12=h03d6c58_0
62 | - llvmlite=0.36.0=py39h612dafd_4
63 | - louvain=0.7.0=py39he80948d_0
64 | - lz4-c=1.9.3=h9c3ff4c_1
65 | - matplotlib=3.4.2=py39hf3d152e_0
66 | - matplotlib-base=3.4.2=py39hab158f2_0
67 | - metis=5.1.0=h58526e2_1006
68 | - mkl=2021.3.0=h06a4308_520
69 | - mkl-service=2.4.0=py39h7f8727e_0
70 | - mkl_fft=1.3.0=py39h42c9631_2
71 | - mkl_random=1.2.2=py39h51133e4_0
72 | - mpfr=4.1.0=h9202a9a_1
73 | - munkres=1.1.4=pyh9f0ad1d_0
74 | - natsort=7.1.1=pyhd8ed1ab_0
75 | - ncurses=6.2=he6710b0_1
76 | - numba=0.53.1=py39h56b8d98_1
77 | - numpy=1.20.3=py39hf144106_0
78 | - numpy-base=1.20.3=py39h74d4b33_0
79 | - olefile=0.46=pyh9f0ad1d_1
80 | - openjpeg=2.4.0=hb52868f_1
81 | - openssl=1.1.1k=h7f98852_0
82 | - packaging=21.0=pyhd8ed1ab_0
83 | - pandas=1.3.0=py39hde0f152_0
84 | - patsy=0.5.1=py_0
85 | - pcre=8.45=h9c3ff4c_0
86 | - pillow=8.3.1=py39ha612740_0
87 | - pip=21.2.4=py37h06a4308_0
88 | - pthread-stubs=0.4=h36c2ea0_1001
89 | - pynndescent=0.5.4=pyh6c4a22f_0
90 | - pyparsing=2.4.7=pyh9f0ad1d_0
91 | - pyqt=5.9.2=py39h2531618_6
92 | - python=3.9.6=h12debd9_1
93 | - python-annoy=1.17.0=py39he80948d_2
94 | - python-dateutil=2.8.2=pyhd8ed1ab_0
95 | - python-igraph=0.9.6=py39hfef886c_0
96 | - python_abi=3.9=2_cp39
97 | - pytz=2021.1=pyhd8ed1ab_0
98 | - qt=5.9.7=h5867ecd_1
99 | - readline=8.1=h27cfd23_0
100 | - scikit-learn=0.24.2=py39h4dfa638_0
101 | - scipy=1.6.2=py39had2a1c9_1
102 | - seaborn=0.11.2=hd8ed1ab_0
103 | - seaborn-base=0.11.2=pyhd8ed1ab_0
104 | - setuptools=52.0.0=py39h06a4308_0
105 | - sip=4.19.13=py39h2531618_0
106 | - six=1.16.0=pyhd3eb1b0_0
107 | - sqlite=3.36.0=hc218d9a_0
108 | - statsmodels=0.12.2=py39hce5d2b2_0
109 | - suitesparse=5.10.1=hd8046ac_0
110 | - tbb=2020.2=h4bd325d_4
111 | - texttable=1.6.4=pyhd8ed1ab_0
112 | - threadpoolctl=2.2.0=pyh8a188c0_0
113 | - tk=8.6.10=hbc83047_0
114 | - tornado=6.1=py39h3811e60_1
115 | - tzdata=2021a=h5d7bf9c_0
116 | - umap-learn=0.5.1=py39hf3d152e_1
117 | - wheel=0.37.0=pyhd3eb1b0_0
118 | - xorg-libxau=1.0.9=h7f98852_0
119 | - xorg-libxdmcp=1.1.3=h7f98852_0
120 | - xz=5.2.5=h7b6447c_0
121 | - zlib=1.2.11=h7b6447c_3
122 | - zstd=1.5.0=ha95c52a_0
123 | - pip:
124 | - adjusttext==0.7.3
125 | - asciitree==0.3.3
126 | - charset-normalizer==2.0.4
127 | - click==8.0.1
128 | - cython==0.29.24
129 | - demuxem==0.1.6
130 | - docopt==0.6.2
131 | - fasteners==0.16.3
132 | - forceatlas2-python==1.1
133 | - geosketch==1.2
134 | - gprofiler-official==1.0.0
135 | - harmony-pytorch==0.1.6
136 | - hnswlib==0.5.2
137 | - idna==3.2
138 | - intervaltree==2.1.0
139 | - lightgbm==3.2.1
140 | - loompy==3.0.6
141 | - nmf-torch==0.1.1
142 | - numcodecs==0.9.0
143 | - numpy-groupies==0.9.13
144 | - pegasusio==0.3.1.post2
145 | - pegasuspy==1.4.3
146 | - psutil==5.8.0
147 | - pybind11==2.7.1
148 | - requests==2.26.0
149 | - scanorama==1.7.1
150 | - scikit-misc==0.1.4
151 | - sortedcontainers==2.4.0
152 | - torch==1.9.0
153 | - typing-extensions==3.10.0.2
154 | - urllib3==1.26.6
155 | - wordcloud==1.8.1
156 | - xlrd==1.2.0
157 | - xlsxwriter==3.0.1
158 | - zarr==2.9.5
159 |
--------------------------------------------------------------------------------
/example-MOp_L5ET/datasets/10x_cells_v2.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/example-MOp_L5ET/datasets/10x_cells_v2.h5ad
--------------------------------------------------------------------------------
/example-MOp_L5ET/datasets/smarter_cells.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/example-MOp_L5ET/datasets/smarter_cells.h5ad
--------------------------------------------------------------------------------
/example-MOp_L5ET/datasets/smarter_nuclei.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/example-MOp_L5ET/datasets/smarter_nuclei.h5ad
--------------------------------------------------------------------------------
/example-MOp_L5ET/datasets/snmcseq_gene.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/example-MOp_L5ET/datasets/snmcseq_gene.h5ad
--------------------------------------------------------------------------------
/example-MOp_L5ET/run_scf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SingleCellFusion \
4 | -i "./datasets/10x_cells_v2.h5ad" \
5 | "./datasets/smarter_cells.h5ad" \
6 | "./datasets/smarter_nuclei.h5ad" \
7 | "./datasets/snmcseq_gene.h5ad" \
8 | -im "rna" "rna" "rna" "mc" \
9 | -f "./datasets/10x_cells_v2.h5ad" \
10 | -o "./results"
11 |
--------------------------------------------------------------------------------
/example-wholebrain/00.test_all_preproc.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # everything normalized by CPMs
3 |
4 | # # mC prepare
5 | # SingleCellFusion_prep \
6 | # -i \
7 | # "./datasets/mc/genes_forBICCN2_CEMBA_3C_171206_mCG.h5ad" \
8 | # "./datasets/mc/genes_forBICCN2_CEMBA_3C_171207_mCG.h5ad" \
9 | # -icov \
10 | # "./datasets/mc/genes_forBICCN2_CEMBA_3C_171206_CG.h5ad" \
11 | # "./datasets/mc/genes_forBICCN2_CEMBA_3C_171207_CG.h5ad" \
12 | # -inorm "mc" "mc" \
13 | # -ga "./datasets/genes_biccn2.0.bed" \
14 | # -o "./processed" \
15 | # -op "hvg_mc"
16 |
17 | # # ATAC prepare
18 | # SingleCellFusion_prep \
19 | # -i \
20 | # "./datasets/atac/CEMBA171206_3C_genes_promo2kb.h5ad" \
21 | # "./datasets/atac/CEMBA171207_3C_genes_promo2kb.h5ad" \
22 | # -gi "ensid" \
23 | # -ci "cell_id" \
24 | # -inorm "tpm" "tpm" \
25 | # -sp \
26 | # -ga "./datasets/genes_promoter_2kb_biccn2.0.bed" \
27 | # -o "./processed" \
28 | # -op "hvg_atac"
29 |
30 | # # RNA prepare
31 | # SingleCellFusion_prep \
32 | # -i \
33 | # "./datasets/rna/smrt_intron_biccn2.h5ad" \
34 | # "./datasets/rna/smrt_exon_biccn2.h5ad" \
35 | # -inorm "cpm" "cpm" \
36 | # -o "./processed" \
37 | # -op "hvg_rna"
38 |
39 | # # run scf RNA mC ATAC
40 | # SingleCellFusion \
41 | # -i \
42 | # "./processed/hvg_rna_smrt_exon_biccn2.h5ad" \
43 | # "./processed/hvg_mc_genes_forBICCN2_CEMBA_3C_171206_mCG.h5ad" \
44 | # "./processed/hvg_mc_genes_forBICCN2_CEMBA_3C_171207_mCG.h5ad" \
45 | # "./processed/hvg_atac_CEMBA171206_3C_genes_promo2kb.h5ad" \
46 | # "./processed/hvg_atac_CEMBA171207_3C_genes_promo2kb.h5ad" \
47 | # -im "rna" "mc" "mc" "atac" "atac"\
48 | # -f "./processed/hvg_rna_smrt_exon_biccn2.h5ad" \
49 | # -o "./results" \
50 | # -op "SingleCellFusion"
51 |
52 | # run RNA intron and exon
53 | SingleCellFusion \
54 | -i \
55 | "./processed/hvg_rna_smrt_exon_biccn2.h5ad" \
56 | "./processed/hvg_rna_smrt_intron_biccn2.h5ad" \
57 | -im "rna" "rna"\
58 | -f "./processed/hvg_rna_smrt_exon_biccn2.h5ad" \
59 | -o "./results" \
60 | -op "intron_exon"
--------------------------------------------------------------------------------
/example-wholebrainatac/normalize_and_select_features.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "data": {
10 | "text/plain": [
11 | ""
12 | ]
13 | },
14 | "execution_count": 3,
15 | "metadata": {},
16 | "output_type": "execute_result"
17 | }
18 | ],
19 | "source": [
20 | "import sys\n",
21 | "import importlib\n",
22 | "sys.path.insert(0, '../scripts')\n",
23 | "\n",
24 | "import numpy as np\n",
25 | "from scipy import sparse\n",
26 | "import time\n",
27 | "import re\n",
28 | "import anndata\n",
29 | "\n",
30 | "from __global_variables import *\n",
31 | "from utils_new import *\n",
32 | "import basic_utils\n",
33 | "importlib.reload(basic_utils)\n",
34 | "import preproc_utils\n",
35 | "importlib.reload(preproc_utils)"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "# Task\n",
43 | "- start from prepared files \n",
44 | "```anndata```\n",
45 | "- get and store hvfeatures\n",
46 | "```anndata```"
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "metadata": {},
52 | "source": [
53 | "# Settings"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 5,
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "SRC_DIR = './datasets_pre'\n",
63 | "DST_DIR = './datasets'\n",
64 | "\n",
65 | "sys.path.insert(0, DST_DIR)\n",
66 | "# from __init__datasets import *\n",
67 | "\n",
68 | "\n",
69 | "f_data_format = '{0}/{1}.h5ad'\n",
70 | "f_hvftr_data_format = '{0}/{1}.h5ad'\n",
71 | "\n"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 6,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "mods_selected = [\n",
81 | " 'snatac',\n",
82 | "]\n",
83 | "normalization_options = {\n",
84 | " 'snatac': 'TPM',\n",
85 | "}"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 7,
91 | "metadata": {},
92 | "outputs": [
93 | {
94 | "name": "stdout",
95 | "output_type": "stream",
96 | "text": [
97 | "(32285,)\n"
98 | ]
99 | },
100 | {
101 | "data": {
102 | "text/plain": [
103 | "ensid\n",
104 | "ENSMUSG00000051951 465597\n",
105 | "ENSMUSG00000089699 46966\n",
106 | "ENSMUSG00000102331 11595\n",
107 | "ENSMUSG00000102343 80476\n",
108 | "ENSMUSG00000025900 409684\n",
109 | "dtype: int64"
110 | ]
111 | },
112 | "execution_count": 7,
113 | "metadata": {},
114 | "output_type": "execute_result"
115 | }
116 | ],
117 | "source": [
118 | "df_genes = get_gene_annotation().set_index('ensid')\n",
119 | "\n",
120 | "gene_lengths_base = (df_genes['end'] - df_genes['start'])\n",
121 | "print(gene_lengths_base.shape)\n",
122 | "gene_lengths_base.head()"
123 | ]
124 | },
125 | {
126 | "cell_type": "markdown",
127 | "metadata": {},
128 | "source": [
129 | "# highly variable features"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": 8,
135 | "metadata": {},
136 | "outputs": [
137 | {
138 | "name": "stdout",
139 | "output_type": "stream",
140 | "text": [
141 | "snatac\n",
142 | "snatac Reading in files 0.00018596649169921875\n"
143 | ]
144 | },
145 | {
146 | "ename": "OSError",
147 | "evalue": "Unable to open file (unable to open file: name = './datasets_pre/snatac.h5ad', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)",
148 | "output_type": "error",
149 | "traceback": [
150 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
151 | "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)",
152 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 44\u001b[0m \u001b[0;31m# read in files\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 45\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Reading in files {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0mti\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 46\u001b[0;31m \u001b[0mh5ad_mat\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0manndata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_h5ad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 47\u001b[0m \u001b[0mgid_col\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcid_col\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'ensid'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[0mmeta\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgxc_raw\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbasic_utils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mh5ad_to_scf_rna_format\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mh5ad_mat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgid_col\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcid_col\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
153 | "\u001b[0;32m/cndd2/fangming/venvs/routine/lib/python3.8/site-packages/anndata/readwrite/read.py\u001b[0m in \u001b[0;36mread_h5ad\u001b[0;34m(filename, backed, chunk_size)\u001b[0m\n\u001b[1;32m 445\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 446\u001b[0m \u001b[0;31m# load everything into memory\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 447\u001b[0;31m \u001b[0mconstructor_args\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_read_args_from_h5ad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunk_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 448\u001b[0m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconstructor_args\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 449\u001b[0m \u001b[0mdtype\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
154 | "\u001b[0;32m/cndd2/fangming/venvs/routine/lib/python3.8/site-packages/anndata/readwrite/read.py\u001b[0m in \u001b[0;36m_read_args_from_h5ad\u001b[0;34m(adata, filename, mode, chunk_size)\u001b[0m\n\u001b[1;32m 479\u001b[0m \u001b[0mf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0madata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfile\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_file\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 480\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 481\u001b[0;31m \u001b[0mf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mh5py\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mFile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'r'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 482\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeys\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 483\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mbacked\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mAnnData\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_BACKED_ATTRS\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
155 | "\u001b[0;32m/cndd2/fangming/venvs/routine/lib/python3.8/site-packages/anndata/h5py/h5sparse.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, name, mode, driver, libver, userblock_size, swmr, force_dense, **kwds)\u001b[0m\n\u001b[1;32m 153\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m \u001b[0;31m# Python 3.5 can’t handle trailing commas here\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 154\u001b[0m ):\n\u001b[0;32m--> 155\u001b[0;31m self.h5f = h5py.File(\n\u001b[0m\u001b[1;32m 156\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 157\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
156 | "\u001b[0;32m/cndd2/fangming/venvs/routine/lib/python3.8/site-packages/h5py/_hl/files.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, name, mode, driver, libver, userblock_size, swmr, rdcc_nslots, rdcc_nbytes, rdcc_w0, track_order, **kwds)\u001b[0m\n\u001b[1;32m 404\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mphil\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 405\u001b[0m \u001b[0mfapl\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmake_fapl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdriver\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlibver\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrdcc_nslots\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrdcc_nbytes\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrdcc_w0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 406\u001b[0;31m fid = make_fid(name, mode, userblock_size,\n\u001b[0m\u001b[1;32m 407\u001b[0m \u001b[0mfapl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfcpl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmake_fcpl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrack_order\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtrack_order\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 408\u001b[0m swmr=swmr)\n",
157 | "\u001b[0;32m/cndd2/fangming/venvs/routine/lib/python3.8/site-packages/h5py/_hl/files.py\u001b[0m in \u001b[0;36mmake_fid\u001b[0;34m(name, mode, userblock_size, fapl, fcpl, swmr)\u001b[0m\n\u001b[1;32m 171\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mswmr\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mswmr_support\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 172\u001b[0m \u001b[0mflags\u001b[0m \u001b[0;34m|=\u001b[0m \u001b[0mh5f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mACC_SWMR_READ\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 173\u001b[0;31m \u001b[0mfid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mh5f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflags\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfapl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfapl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 174\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mmode\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'r+'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 175\u001b[0m \u001b[0mfid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mh5f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mh5f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mACC_RDWR\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfapl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfapl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
158 | "\u001b[0;32mh5py/_objects.pyx\u001b[0m in \u001b[0;36mh5py._objects.with_phil.wrapper\u001b[0;34m()\u001b[0m\n",
159 | "\u001b[0;32mh5py/_objects.pyx\u001b[0m in \u001b[0;36mh5py._objects.with_phil.wrapper\u001b[0;34m()\u001b[0m\n",
160 | "\u001b[0;32mh5py/h5f.pyx\u001b[0m in \u001b[0;36mh5py.h5f.open\u001b[0;34m()\u001b[0m\n",
161 | "\u001b[0;31mOSError\u001b[0m: Unable to open file (unable to open file: name = './datasets_pre/snatac.h5ad', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)"
162 | ]
163 | }
164 | ],
165 | "source": [
166 | "for mod in mods_selected:\n",
167 | " ti = time.time()\n",
168 | " print(mod)\n",
169 | " \n",
170 | " normalization_option = normalization_options[mod]\n",
171 | " # read data matrix\n",
172 | " if normalization_option == 'MC':\n",
173 | " f_data = f_data_format.format(SRC_DIR, mod)\n",
174 | " \n",
175 | " # read in files\n",
176 | " print(mod, \"Reading in files {}\".format(time.time()-ti))\n",
177 | " gxc_raw = snmcseq_utils.load_gc_matrix_methylation(f_data_gene, f_data_cell, f_data_mc, f_data_c)\n",
178 | " print(gxc_raw.data['mc'].shape, gxc_raw.data['c'].shape)\n",
179 | " print(time.time()-ti)\n",
180 | " \n",
181 | " # output file\n",
182 | " f_hvftr_data_methylation = f_hvftr_format.format(DST_DIR, mod, 'tsv') \n",
183 | " print(time.time()-ti)\n",
184 | " \n",
185 | " # check meta cells agree with gxc cells\n",
186 | " assert np.all(meta.index.values == gxc_raw.cell)\n",
187 | " # check genes are uniq \n",
188 | " assert len(gxc_raw.gene) == len(np.unique(gxc_raw.gene)) \n",
189 | " # do\n",
190 | " gxc_hvftr = preproc_utils.preproc_methylation(\n",
191 | " gxc_raw,\n",
192 | " meta,\n",
193 | " global_value_col=settings[mod].global_mean, \n",
194 | " base_call_cutoff=20, \n",
195 | " sufficient_coverage_fraction=0.95,\n",
196 | " hv_percentile=30,\n",
197 | " n_qcut=10,\n",
198 | " )\n",
199 | " # save\n",
200 | " print(mod, \"Saving to files {}\".format(time.time()-ti))\n",
201 | "# gxc_hvftr.to_csv(f_hvftr_data_methylation, sep=\"\\t\", header=True, index=True, na_rep='NA')\n",
202 | " h5ad_mat_hvftr.write(f_hvftr_data, compression='gzip')\n",
203 | " \n",
204 | " else:\n",
205 | " # input, output files\n",
206 | " f_data = f_data_format.format(SRC_DIR, mod,) \n",
207 | " f_hvftr_data = f_hvftr_data_format.format(DST_DIR, mod) \n",
208 | " \n",
209 | " # read in files\n",
210 | " print(mod, \"Reading in files {}\".format(time.time()-ti))\n",
211 | " h5ad_mat = anndata.read_h5ad(f_data)\n",
212 | " gid_col, cid_col = 'ensid', ''\n",
213 | " meta, gxc_raw = basic_utils.h5ad_to_scf_rna_format(h5ad_mat, gid_col, cid_col)\n",
214 | " \n",
215 | " # check meta cells agree with gxc cells\n",
216 | " assert np.all(meta.index.values == gxc_raw.cell)\n",
217 | " # check genes are uniq \n",
218 | " assert len(gxc_raw.gene) == len(np.unique(gxc_raw.gene)) \n",
219 | " \n",
220 | " # get hvftrs\n",
221 | " print(mod, \"Preproc and get highly variable genes {}\".format(time.time()-ti))\n",
222 | " if normalization_option == 'CPM':\n",
223 | " gxc_hvftr = preproc_utils.preproc_rna_cpm_based(\n",
224 | " gxc_raw, \n",
225 | " sufficient_cell_coverage=0.01, \n",
226 | " hv_percentile=30, hv_ncut=10)\n",
227 | " elif normalization_option == 'TPM':\n",
228 | " gene_lengths = gene_lengths_base.reindex(gxc_raw.gene)\n",
229 | " gxc_hvftr = preproc_utils.preproc_rna_tpm_based(\n",
230 | " gxc_raw, gene_lengths, impute_gene_lengths=True, \n",
231 | " sufficient_cell_coverage=0.01, \n",
232 | " hv_percentile=30, hv_ncut=10)\n",
233 | " \n",
234 | " # save\n",
235 | " print(mod, \"Saving to file {}\".format(f_hvftr_data, time.time()-ti))\n",
236 | " h5ad_mat_hvftr = basic_utils.scf_rna_format_to_h5ad(meta, gxc_hvftr)\n",
237 | " h5ad_mat_hvftr.write(f_hvftr_data, compression='gzip')\n",
238 | " \n",
239 | " print(mod, \"Total time used: {}\".format(time.time()-ti))\n",
240 | " break\n",
241 | " "
242 | ]
243 | },
244 | {
245 | "cell_type": "markdown",
246 | "metadata": {},
247 | "source": [
248 | "## Check highly-variable genes"
249 | ]
250 | },
251 | {
252 | "cell_type": "code",
253 | "execution_count": 7,
254 | "metadata": {},
255 | "outputs": [],
256 | "source": [
257 | "# for mod in mods_selected:\n",
258 | "# print(mod)\n",
259 | "# if settings[mod].mod_category == 'mc':\n",
260 | "# f_hvftr_data = f_hvftr_format.format(SRC_DIR, mod, 'tsv') \n",
261 | "# gxc_hvftr = pd.read_csv(f_hvftr_data, sep=\"\\t\", index_col=0)\n",
262 | "# print(gxc_hvftr.index.values)\n",
263 | "# print(gxc_hvftr.columns.values)\n",
264 | "# print(gxc_hvftr.shape)\n",
265 | "# has_nan = np.isnan(gxc_hvftr.values).any()\n",
266 | "# print(\"Contains NaN? {}\".format(has_nan))\n",
267 | " \n",
268 | "# continue\n",
269 | " \n",
270 | "# f_hvftr_data = f_hvftr_format.format(SRC_DIR, mod, 'npz') \n",
271 | "# f_hvftr_gene = f_hvftr_format.format(SRC_DIR, mod, 'gene') \n",
272 | "# f_hvftr_cell = f_hvftr_format.format(SRC_DIR, mod, 'cell') \n",
273 | "# gxc_hvftr = snmcseq_utils.load_gc_matrix(f_hvftr_gene, f_hvftr_cell, f_hvftr_data)\n",
274 | "# print(gxc_hvftr.gene)\n",
275 | "# print(gxc_hvftr.cell)\n",
276 | "# print(len(gxc_hvftr.gene), len(gxc_hvftr.cell), gxc_hvftr.data.shape)\n",
277 | "# has_nan = np.isnan(gxc_hvftr.data.data).any()\n",
278 | "# print(\"Contains NaN? {}\".format(has_nan))\n",
279 | "# # break"
280 | ]
281 | }
282 | ],
283 | "metadata": {
284 | "kernelspec": {
285 | "display_name": "Python 3",
286 | "language": "python",
287 | "name": "python3"
288 | },
289 | "language_info": {
290 | "codemirror_mode": {
291 | "name": "ipython",
292 | "version": 3
293 | },
294 | "file_extension": ".py",
295 | "mimetype": "text/x-python",
296 | "name": "python",
297 | "nbconvert_exporter": "python",
298 | "pygments_lexer": "ipython3",
299 | "version": "3.8.1"
300 | }
301 | },
302 | "nbformat": 4,
303 | "nbformat_minor": 4
304 | }
305 |
--------------------------------------------------------------------------------
/example-wholebrainatac/run_preproc.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | ga="/cndd/Public_Datasets/BICCN/BICCN2.0_whole_mouse_brain/references/refdata-gex-mm10-2020-A/genes/genes_promoter_2kb_biccn2.0.bed"
4 |
5 | ../scripts/SingleCellFusion_prep \
6 | -i "./datasets_pre/CEMBA171206_3C_genes_promo2kb.h5ad" \
7 | -inorm "tpm" \
8 | -ga $ga \
9 | -op "test_preproc_may3" \
10 | -o "./datasets_processed"
--------------------------------------------------------------------------------
/example-wholebrainatac/run_scf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
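   | # Flags (see scripts/cli_parser): -i input .h5ad datasets; -im each input's modality
   | # category (rna/atac/mc); -f the feature dataset(s) whose gene space the others are
   | # imputed into; -op output file prefix; -o output directory.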
3 | ../scripts/SingleCellFusion \
4 | -i "./datasets/10x_cells_v2.h5ad" "./datasets/snatac.h5ad" \
5 | -im "rna" "atac" \
6 | -f "./datasets/10x_cells_v2.h5ad" \
7 | -op "test_april27" \
8 | -o "./results"
--------------------------------------------------------------------------------
/scf_description.rst:
--------------------------------------------------------------------------------
1 | How does SingleCellFusion work?
2 | ================================
3 | SingleCellFusion is built around the idea that for a cell profiled by a given omics technique (RNA-sequencing,
4 | snATAC-sequencing, snmC-sequencing) there are unobserved features that, if sampled, would
5 | provide a fuller picture of that cell's identity. For example, if a cell underwent RNA-sequencing we know
6 | which genes are expressed, but not the patterns of DNA methylation in that same cell. The methylation
7 | status of the DNA in that cell is unobserved, limiting our ability to fully understand its identity.
8 |
9 | In an ideal world we would obtain the transcriptome, methylome, and chromatin accessibility of a single
10 | cell at once; while the technologies for this type of experiment mature, SingleCellFusion provides a
11 | computational equivalent. SingleCellFusion uses known relationships between different types of multiomics
12 | data to impute unobserved data, enabling multimodal analysis of a cell's identity.
13 |
14 | The core of SingleCellFusion is the generation of a nearest neighbors graph between different data sets.
15 | This graph is generated by finding nearest neighbors using the correlation of counts at highly variable
16 | features. For example, DNA methylation is known to be negatively correlated with gene expression. If a
17 | snmC-seq profiled cell has low methylation at a number of highly variable genes, and a snRNA-seq profiled
18 | cell has high gene expression at those same genes, we can assume that those two cells likely belong to the
19 | same cell type. We use this nearest neighbors graph to generate imputed counts by averaging among a cell's
20 | neighbors in the opposite modality. The actions of SingleCellFusion depend on the type of nearest neighbor
21 | graph specified, and are described below.
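   | 
   | As a concrete sketch of this distance: rank each cell's counts at the shared highly variable
   | features, z-score the ranks within each cell, and flip the sign for anti-correlated modalities
   | (e.g. DNA methylation), so that a dot product between cells behaves like a signed Spearman
   | correlation. This mirrors the normalization used in ``scripts/SCF_utils.py``; the toy matrices
   | and the helper name below are illustrative only:
   | 
   | .. code-block:: python
   | 
   |     import numpy as np
   |     import pandas as pd
   | 
   |     rng = np.random.default_rng(0)
   |     mat_i = pd.DataFrame(rng.poisson(5, size=(4, 10)))   # 4 cells x 10 shared HV features (toy)
   |     mat_j = pd.DataFrame(rng.poisson(5, size=(3, 10)))   # 3 cells x 10 shared HV features (toy)
   |     direct_i, direct_j = 1, -1                           # e.g. RNA (+) vs DNA methylation (-)
   | 
   |     def normalize_for_correlation(mat, direction):
   |         """Rank, then z-score per cell (row); flip sign for anti-correlated modalities."""
   |         ranks = mat.rank(pct=True, axis=1)
   |         z = ranks.sub(ranks.mean(axis=1), axis=0).div(ranks.std(axis=1), axis=0)
   |         return z * direction
   | 
   |     zi = normalize_for_correlation(mat_i, direct_i).values
   |     zj = normalize_for_correlation(mat_j, direct_j).values
   |     sim = zi @ zj.T   # (4 x 3): larger values = cells more likely to be cross-modal neighbors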
22 |
23 | Direct mutual nearest neighbors
24 | -------------------------------
25 | .. image:: mnn_direct.png
26 | :width: 400
27 | :alt: cartoon of direct MNN
28 |
29 | In this method, highly variable features are identified in each data set. On a cell-to-cell basis the
30 | correlation of counts at highly variable features is calculated. These correlation values are used
31 | as the distance metric for identifying mutual neighbors.
32 |
33 | Once the correlation is calculated, neighbors across modalities are determined. We require that
34 | the two cells in each neighbor pair are highly correlated with each other. In other words, a snmC-seq profiled
35 | cell can only be a neighbor of a scRNA-seq cell if the methylation levels at the highly variable
36 | features are strongly anti-correlated with gene expression at those same features in the scRNA-seq
37 | profiled cell, and vice versa. This ensures that only strong neighbor pairs are kept and that the
38 | nearest neighbors graph is not dominated by noisy or spurious correlations.
39 |
40 | Once the neighbors graph is built, imputed counts are generated by the following equation:
41 |
42 | .. image:: mnn_equation.png
43 | :width: 400
44 | :alt: equation for imputation by MNN
45 |
46 | For cell *j* in modality *m* which has direct mutual nearest neighbors with cells in modality
47 | *m'*, the imputed *m'* counts for feature *f* are given by the average over its *k* nearest
48 | neighbors in modality *m'*.
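   | 
   | Written out from the description above (a reconstruction; the authoritative form is the figure):
   | the imputed count for cell *j* at feature *f* is
   | :math:`\hat{x}^{m'}_{j,f} = \frac{1}{k} \sum_{i \in N(j)} x^{m'}_{i,f}`,
   | where *N(j)* is *j*'s set of *k* mutual neighbors in modality *m'*. In matrix form this is a
   | row-normalized neighbor matrix applied to the observed counts, as in ``scripts/SCF_utils.py``;
   | a minimal sketch with an illustrative function name:
   | 
   | .. code-block:: python
   | 
   |     import numpy as np
   |     from scipy import sparse
   | 
   |     def impute_by_neighbors(knn_adj, counts_mprime):
   |         """Average each cell's mutual neighbors in the opposite modality.
   | 
   |         knn_adj       : sparse (n_m x n_m') 0/1 mutual-neighbor matrix
   |         counts_mprime : array (n_m' x n_features), observed counts in modality m'
   |         """
   |         degrees = np.ravel(knn_adj.sum(axis=1))          # number of neighbors per cell
   |         row_norm = sparse.diags(1.0 / (degrees + 1e-7))  # 1e-7 guards cells with no neighbors
   |         return row_norm.dot(knn_adj).dot(counts_mprime)  # (n_m x n_features) imputed counts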
49 |
50 | This is the most conservative method for generating imputed counts: only cells that form direct mutual
51 | nearest neighbors receive imputed data. It typically leads to good integration, but can
52 | result in the loss of large fractions of cells from the analysis when no mutual neighbors are found for them.
53 |
54 |
55 | Mutual nearest neighbors with rescue
56 | -------------------------------------
57 | .. image:: mnn_rescue.png
58 | :width: 400
59 | :alt: cartoon of rescue MNN
60 |
61 | As with the direct method, the distance between a pair of cells is their correlation at
62 | highly variable genes. The only difference in this method is that, in addition to a mutual
63 | nearest neighbors graph between modalities, a mutual nearest neighbor graph within each modality
64 | is also generated. This within-modality graph allows imputation to be performed on all cells, by
65 | using the within-modality neighbors to determine the best matched neighbors across
66 | modalities.
67 |
68 | .. image:: rescue_equation_1.png
69 | :width: 400
70 | :alt: equation 1 of rescue
71 |
72 | where
73 |
74 | .. image:: rescue_equation_2.png
75 | :width: 400
76 | :alt: equation 2 of rescue
77 |
78 | For a cell *l* in modality *m*, which has no direct mutual neighbors with cells in modality
79 | *m'*, the imputed *m'* counts for feature *f* are given by a weighted average over its *k*
80 | nearest neighbors in modality *m* which have direct mutual neighbors with cells in modality
81 | *m'*. The cells with direct mutual nearest neighbors have imputed counts per the equation in
82 | "Direct mutual nearest neighbors:"
83 |
84 | .. image:: mnn_equation.png
85 | :width: 400
86 | :alt: equation for imputation by MNN
87 |
88 | The weights *A(l,j)* are determined by the distance between *l* and *j*, *d(l,j)* by the following
89 | equation:
90 |
91 | .. image:: rescue_equation_3.png
92 | :width: 400
93 | :alt: equation 3 of rescue
94 |
95 | This is a more lenient method for generating imputed counts, as all cells receive imputed
96 | data. It keeps every cell in the analysis, and is our recommended approach.
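   | 
   | Schematically, the rescue step is a weighted average: a cell *l* without direct mutual neighbors
   | borrows the already-imputed counts of its within-modality neighbors *j*, weighted by *A(l,j)*.
   | A minimal sketch; the Gaussian kernel below is an assumption (it matches the kernel used for
   | within-modality smoothing in ``scripts/SCF_utils.py``), while the authoritative weight function
   | is the one in equation 3 above:
   | 
   | .. code-block:: python
   | 
   |     import numpy as np
   | 
   |     def rescue_impute(dists, neighbor_imputed, sigma=1.0):
   |         """Weighted average of within-modality neighbors' imputed counts for one rescued cell.
   | 
   |         dists            : (k,) distances d(l, j) to neighbors j that have direct mutual neighbors
   |         neighbor_imputed : (k, n_features) those neighbors' imputed m' counts
   |         """
   |         weights = np.exp(-(dists ** 2) / (sigma ** 2))   # assumed kernel form
   |         weights = weights / weights.sum()                # normalize weights to sum to 1
   |         return weights @ neighbor_imputed                # (n_features,) imputed counts for l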
97 |
98 | k-nearest neighbors
99 | -------------------
100 | .. image:: knn.png
101 | :width: 400
102 | :alt: cartoon of kNN
103 |
104 | Similar to the other methods, the distance metric between a pair of cells is the correlation at
105 | highly variable features. The major difference in this method is that each cell is required to make
106 | *k* neighbors in the opposite modality, with the constraint that a cell in the opposite modality
107 | can make at most a set number of neighbors, *j*. The maximum number of neighbors that a cell
108 | in the opposite modality can make is given by the equation:
109 |
110 | .. image:: n_neighbors_knn.png
111 | :width: 200
112 | :alt: equation 1 of knn
113 |
114 | where *j* is the maximum number of neighbors a cell in modality *m'* can make, *k* is the required
115 | number of nearest neighbors per cell in modality *m*, *n*\ :sub:`m`\ is the number of cells in
116 | modality *m*, and *n*\ :sub:`m'`\ is the number of cells in modality *m'*. *z* is a relaxation
117 | parameter that keeps cells from becoming hyperconnected. The neighbor graph is created by randomly
118 | iterating through the cells and finding, for each, its *k* nearest neighbors among cells that are
119 | below the maximum connection threshold. Once the nearest neighbors graph is built, imputed counts
120 | are generated by the same equation as in "Direct mutual nearest neighbors":
121 |
122 | .. image:: mnn_equation.png
123 | :width: 400
124 | :alt: equation for imputation by MNN
125 |
126 | This is the most lenient method for generating imputed counts, as all cells will make neighbors
127 | in the opposite data set.
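   | 
   | The cap *j* is computed the same way as ``maxk`` in ``scripts/SCF_utils.py``. A worked example
   | with assumed dataset sizes:
   | 
   | .. code-block:: python
   | 
   |     n_m, n_mprime = 10_000, 2_000   # assumed: 10,000 cells in m, 2,000 cells in m'
   |     k, z = 20, 3                    # required neighbors per m cell; relaxation parameter
   | 
   |     # each of the n_m cells requests k neighbors among the n_mprime cells, so the
   |     # average load per m' cell is k * n_m / n_mprime; z relaxes that cap
   |     j = int((n_m / n_mprime) * k * z) + 1   # = 301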
128 |
--------------------------------------------------------------------------------
/scripts/SCF_utils.py:
--------------------------------------------------------------------------------
1 | """Utilities for SingleCellFusion
2 | """
3 | from __init__ import *
4 |
5 | import functools
6 | import collections
7 | import itertools
8 | import re
9 | from scipy import sparse
10 | from scipy.stats import zscore
11 | import fbpca
12 | import sys
13 | import logging
14 | from memory_profiler import profile
15 | from datetime import datetime
16 |
17 | import basic_utils
18 | import clst_utils
19 |
20 | ctime = datetime.now().strftime("%Y%m%d%H%M%S")
21 | f=open('memory_profile_SCFutils_{}.log'.format(ctime), 'w+')
22 |
23 | @profile(stream=f)
24 | def sparse_adj_to_mat(adjs, row_size, col_size, dists=None):
25 |     """Turn a kNN adjacency list (an n_obs x k array of neighbor indices) into a sparse matrix
26 |     """
27 | n_obs, k = adjs.shape
28 | assert n_obs == row_size
29 | # row col 1/dist
30 | row_inds = np.repeat(np.arange(row_size), k)
31 | col_inds = np.ravel(adjs)
32 | if isinstance(dists, np.ndarray):
33 | assert dists.shape == adjs.shape
34 | data = np.ravel(dists)
35 | else:
36 | data = [1]*len(row_inds)
37 | knn_dist_mat = sparse.coo_matrix((data, (row_inds, col_inds)), shape=(row_size, col_size))
38 | return knn_dist_mat
39 |
40 | # smooth-within modality
41 | @profile(stream=f)
42 | def smooth_in_modality(counts_matrix, norm_counts_matrix, k, ka, npc=100, sigma=1.0, p=0.1, drop_npc=0):
43 | """Smooth a data matrix
44 |
45 | Arguments:
46 | - counts_matrix (pandas dataframe, feature by cell)
47 | - norm_counts_matrix (pandas dataframe, feature by cell) log10(CPM+1)
48 | - k (number of nearest neighbors)
49 | Return:
50 | - smoothed cells_matrix (pandas dataframe)
51 | - markov affinity matrix
52 | """
53 | # from sklearn.neighbors import NearestNeighbors
54 | import fbpca
55 | import clst_utils
56 |
57 | assert counts_matrix.shape[1] == norm_counts_matrix.shape[1]
58 |
59 | c = norm_counts_matrix.columns.values
60 | N = len(c)
61 |
62 | # reduce dimension fast version
63 | U, s, Vt = fbpca.pca(norm_counts_matrix.T.values, k=npc)
64 | pcs = U.dot(np.diag(s))
65 | if drop_npc != 0:
66 | pcs = pcs[:, drop_npc:]
67 |
68 | # get k nearest neighbor distances fast version
69 | inds, dists = clst_utils.gen_knn_annoy(pcs, k, form='list',
70 | metric='euclidean', n_trees=10, search_k=-1, verbose=True,
71 | include_distances=True)
72 |
73 | # remove itself
74 | dists = dists[:, 1:]
75 | inds = inds[:, 1:]
76 |
77 | # normalize by ka's distance
78 | dists = (dists/(dists[:, ka].reshape(-1, 1)))
79 |
80 | # gaussian kernel
81 | adjs = np.exp(-((dists**2)/(sigma**2)))
82 |
83 | # construct a sparse matrix
84 | cols = np.ravel(inds)
85 | rows = np.repeat(np.arange(N), k-1) # remove itself
86 | vals = np.ravel(adjs)
87 | A = sparse.csr_matrix((vals, (rows, cols)), shape=(N, N))
88 |
89 | # Symmetrize A (union of connection)
90 | A = A + A.T
91 |
92 | # normalization fast (A is now a weight matrix excluding itself)
93 | degrees = A.sum(axis=1)
94 | A = sparse.diags(1.0/np.ravel(degrees)).dot(A)
95 |
96 | # include itself
97 | eye = sparse.identity(N)
98 | A = p*eye + (1-p)*A
99 |
100 | # smooth fast (future?)
101 | counts_matrix_smoothed = pd.DataFrame((A.dot(counts_matrix.T)).T,
102 | columns=counts_matrix.columns, index=counts_matrix.index)
103 | return counts_matrix_smoothed, A
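   | 
   | # Usage sketch (df would be a gene-by-cell DataFrame of normalized counts; the
   | # arguments mirror the call in core_scf_routine below):
   | #   smoothed_df, markov_A = smooth_in_modality(df, df, k=30, ka=5, npc=50, p=0.1)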
104 |
105 | # impute across modality
106 | @profile(stream=f)
107 | def get_constrained_knn(mat_norm_j, mat_norm_i, knn, k_saturate, knn_speed_factor=10, metric='dot', verbose=False):
108 | """Get constrained knn
109 | j <- i
110 | Look for kNN in i for each cell in j, cells in i are constrained to k_saturated
111 |
112 | get knn_speed_factor*knn number of nearest neighbors internally
113 | """
114 | ti = time.time()
115 | assert mat_norm_i.shape[1] == mat_norm_j.shape[1]
116 | knn = int(knn)
117 | knn_speed_factor = int(knn_speed_factor)
118 |
119 | cells_i = np.arange(len(mat_norm_i))
120 | cells_j = np.arange(len(mat_norm_j))
121 |
122 | # record cells in j
123 | accepted_knn_ji = []
124 | accepted_cells = []
125 | rejected_cells = np.arange(len(cells_j))
126 |
127 | # record cell in i
128 | n_connects = np.zeros(len(cells_i)).astype(int) # record number of connection for each cell in i
129 | unsaturated = (n_connects < k_saturate) # unsaturated bool
130 | unsaturated_cells = np.arange(len(cells_i))[unsaturated]
131 |
132 | while rejected_cells.size != 0:
133 | if verbose:
134 | print(len(rejected_cells), len(unsaturated_cells), time.time()-ti)
135 |
136 | np.random.shuffle(rejected_cells) # random order
137 | # do something to rejected cells and unsaturated cells
138 | # knn_ji # for each cell in j, its knn in i
139 | knn_ji = clst_utils.gen_knn_annoy_train_test(mat_norm_i.values[unsaturated_cells], # look for nearest neighbors in i
140 | mat_norm_j.values[rejected_cells], # for each row in j
141 | min(knn*knn_speed_factor, len(unsaturated_cells)), #
142 | form='list', # adj matrix
143 | metric=metric, # correlation
144 | n_trees=10, search_k=-1, verbose=False,
145 | include_distances=False, # for now
146 | ).astype(int)
147 |         knn_ji = unsaturated_cells[knn_ji] # map indices into the unsaturated subset back to global indices in i
148 |
149 | rejected_local_idx = []
150 | # examine each cell in j
151 | for local_idx, cell in enumerate(rejected_cells):
152 | # get knn in i
153 | knn_in_i = knn_ji[local_idx]
154 | # filter out saturated ones
155 | knn_in_i = knn_in_i[unsaturated[knn_in_i]]
156 |
157 | if knn_in_i.size < knn:
158 | # reject
159 | rejected_local_idx.append(local_idx)
160 | else:
161 | # accept and update
162 | accepted_knn_ji.append(knn_in_i[:knn])
163 | accepted_cells.append(cell)
164 | n_connects[knn_in_i[:knn]] += 1
165 | unsaturated = (n_connects < k_saturate) # unsaturated bool
166 |
167 | unsaturated_cells = np.arange(len(cells_i))[unsaturated]
168 | rejected_cells = rejected_cells[rejected_local_idx]
169 | # break
170 |
171 | accepted_knn_ji = pd.DataFrame(np.vstack(accepted_knn_ji), index=accepted_cells)
172 | accepted_knn_ji = accepted_knn_ji.sort_index().values
173 |
174 | return accepted_knn_ji
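   | 
   | # How the loop above behaves, in brief: every cell in j requests `knn` neighbors in i;
   | # a request is rejected when too few unsaturated i-cells remain among its candidates, and
   | # rejected j-cells are re-queried against the shrinking pool of unsaturated i-cells.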
175 |
176 | #
177 | @profile(stream=f)
178 | def impute_1pair_cca(mod_i, mod_j,
179 | smoothed_features_i, smoothed_features_j,
180 | settings,
181 | knn,
182 | relaxation,
183 | n_cca,
184 | output_knn_mat_ij='',
185 | output_knn_mat_ji='',
186 | impute_j=True,
187 | ):
188 |     """Impute features across one pair of modalities, using constrained kNN in a joint CCA space
189 |     """
190 | # set up
191 | direct_i, direct_j = settings[mod_i].mod_direction, settings[mod_j].mod_direction
192 |
193 | mat_ii = smoothed_features_i.T # cell in mod i; gene in mod i
194 | mat_jj = smoothed_features_j.T # cell in mod j; gene in mod j
195 |
196 | genes_i = mat_ii.columns.values
197 | genes_j = mat_jj.columns.values
198 | genes_common = np.intersect1d(genes_i, genes_j)
199 |
200 | cells_i = mat_ii.index.values
201 | cells_j = mat_jj.index.values
202 |
203 | ## CCA euclidean distance
204 | # normalize the feature matrix
205 | X = mat_ii[genes_common].T.apply(basic_utils.zscore, axis=0)*direct_i # gene by cell, zscore across genes
206 | Y = mat_jj[genes_common].T.apply(basic_utils.zscore, axis=0)*direct_j
207 | U, s, Vt = fbpca.pca(X.T.values.dot(Y.values), k=n_cca)
208 | del X, Y
209 |
210 | mat_norm_i = pd.DataFrame(U, index=mat_ii.index)
211 | maxk_i = int((len(cells_j)/len(cells_i))*knn*relaxation)+1 # max number of NN a cell in i can get
212 | mat_norm_j = pd.DataFrame(Vt.T, index=mat_jj.index)
213 | maxk_j = int((len(cells_i)/len(cells_j))*knn*relaxation)+1 # max number of NN a cell in j can get
214 |
215 | if impute_j:
216 | # knn_i and knn_j
217 | # j <- i for each j, get kNN in i
218 | knn_ji = get_constrained_knn(mat_norm_j, mat_norm_i, knn=knn, k_saturate=maxk_i, metric='euclidean')
219 | mat_knn_ji = sparse_adj_to_mat(knn_ji, len(cells_j), len(cells_i))
220 |
221 | if output_knn_mat_ji:
222 | sparse.save_npz(output_knn_mat_ji, mat_knn_ji)
223 |
224 | # normalize
225 | degrees_j = np.ravel(mat_knn_ji.sum(axis=1)) # for each cell in j, how many cells in i it connects to
226 | mat_knn_ji = sparse.diags(1.0/(degrees_j+1e-7)).dot(mat_knn_ji)
227 |
228 | # imputation both across and within modality
229 | mat_ji = mat_knn_ji.dot(mat_ii) # cell in mod j, gene in mod i
230 |
231 |
232 | # i <- j
233 | knn_ij = get_constrained_knn(mat_norm_i, mat_norm_j, knn=knn, k_saturate=maxk_j, metric='euclidean')
234 | mat_knn_ij = sparse_adj_to_mat(knn_ij, len(cells_i), len(cells_j))
235 |
236 | if output_knn_mat_ij:
237 | sparse.save_npz(output_knn_mat_ij, mat_knn_ij)
238 |
239 | degrees_i = np.ravel(mat_knn_ij.sum(axis=1)) # for each cell in i, how many cells in j it connects to
240 | mat_knn_ij = sparse.diags(1.0/(degrees_i+1e-7)).dot(mat_knn_ij)
241 |
242 | mat_ij = mat_knn_ij.dot(mat_jj) # cell in mod i, gene in mod j
243 |
244 | if impute_j:
245 | return mat_ij, mat_ji
246 | else:
247 | return mat_ij
248 |
249 | @profile(stream=f)
250 | def impute_1pair(mod_i, mod_j,
251 | smoothed_features_i, smoothed_features_j,
252 | settings,
253 | knn, # 20
254 | relaxation, # 3
255 | output_knn_mat_ij='',
256 | output_knn_mat_ji='',
257 | impute_j=True,
258 | ):
259 |     """Impute features across one pair of modalities, using constrained kNN with a Spearman-correlation distance
260 |     """
261 | # set up
262 | direct_i, direct_j = settings[mod_i].mod_direction, settings[mod_j].mod_direction
263 |
264 | mat_ii = smoothed_features_i.T # cell in mod i; gene in mod i
265 | mat_jj = smoothed_features_j.T # cell in mod j; gene in mod j
266 |
267 | genes_i = mat_ii.columns.values
268 | genes_j = mat_jj.columns.values
269 | genes_common = np.intersect1d(genes_i, genes_j)
270 |
271 | cells_i = mat_ii.index.values
272 | cells_j = mat_jj.index.values
273 |
274 | ## spearman correlation as distance (rank -> zscore -> (flip sign?) -> "dot" distance)
275 | # normalize the feature matrix
276 | mat_norm_i = (mat_ii[genes_common].rank(pct=True, axis=1)
277 | .apply(basic_utils.zscore, axis=1)
278 | *direct_i
279 | )
280 | mat_norm_j = (mat_jj[genes_common].rank(pct=True, axis=1)
281 | .apply(basic_utils.zscore, axis=1)
282 | *direct_j
283 | )
284 | maxk_i = int((len(cells_j)/len(cells_i))*knn*relaxation)+1 # max number of NN a cell in i can get
285 | maxk_j = int((len(cells_i)/len(cells_j))*knn*relaxation)+1 # max number of NN a cell in j can get
286 |
287 | if impute_j:
288 | # knn_i and knn_j
289 | # j <- i for each j, get kNN in i
290 | knn_ji = get_constrained_knn(mat_norm_j, mat_norm_i, knn=knn, k_saturate=maxk_i, metric='dot')
291 | mat_knn_ji = sparse_adj_to_mat(knn_ji, len(cells_j), len(cells_i))
292 |
293 | if output_knn_mat_ji:
294 | sparse.save_npz(output_knn_mat_ji, mat_knn_ji)
295 |
296 | # normalize
297 | degrees_j = np.ravel(mat_knn_ji.sum(axis=1)) # for each cell in j, how many cells in i it connects to
298 | mat_knn_ji = sparse.diags(1.0/(degrees_j+1e-7)).dot(mat_knn_ji)
299 |
300 | # imputation both across and within modality
301 | mat_ji = mat_knn_ji.dot(mat_ii) # cell in mod j, gene in mod i
302 |
303 |
304 | # i <- j
305 | knn_ij = get_constrained_knn(mat_norm_i, mat_norm_j, knn=knn, k_saturate=maxk_j, metric='dot')
306 | mat_knn_ij = sparse_adj_to_mat(knn_ij, len(cells_i), len(cells_j))
307 |
308 | if output_knn_mat_ij:
309 | sparse.save_npz(output_knn_mat_ij, mat_knn_ij)
310 |
311 | degrees_i = np.ravel(mat_knn_ij.sum(axis=1)) # for each cell in i, how many cells in j it connects to
312 | mat_knn_ij = sparse.diags(1.0/(degrees_i+1e-7)).dot(mat_knn_ij)
313 |
314 | mat_ij = mat_knn_ij.dot(mat_jj) # cell in mod i, gene in mod j
315 |
316 | if impute_j:
317 | return mat_ij, mat_ji
318 | else:
319 | return mat_ij
320 |
321 | @profile(stream=f)
322 | def core_scf_routine(mods_selected, features_selected, settings,
323 | metas, gxc_hvftrs,
324 | ps, drop_npcs,
325 | cross_mod_distance_measure, knn, relaxation, n_cca,
326 | npc,
327 | output_pcX_all,
328 | output_imputed_data_format,
329 | ):
330 | """smooth within modality, impute across modalities, and construct a joint PC matrix
331 | """
332 | # GENE * CELL !!!!
333 | smoothed_features = collections.OrderedDict()
334 | logging.info("Smoothing within modalities...")
335 | for mod in mods_selected:
336 | ti = time.time()
337 | if settings[mod].mod_category == 'mc':
338 | _df = gxc_hvftrs[mod]
339 | else:
340 | _mat = gxc_hvftrs[mod].data.todense()
341 | _df = pd.DataFrame(_mat,
342 | index=gxc_hvftrs[mod].gene,
343 | columns=gxc_hvftrs[mod].cell,
344 | )
345 |         npc_mod = min(len(metas[mod]), npc)  # per-modality cap; do not shrink npc for later modalities
346 |         k_smooth = min(len(metas[mod]), 30)
347 |         ka = 5
348 |         if k_smooth >= 2*ka:
349 |             mat_smoothed, mat_knn = smooth_in_modality(_df, _df, k=k_smooth, ka=ka, npc=npc_mod,
350 | p=ps[settings[mod].mod_category],
351 | drop_npc=drop_npcs[settings[mod].mod_category])
352 | smoothed_features[mod] = mat_smoothed
353 | else:
354 | smoothed_features[mod] = _df
355 | logging.info("{} finished in {} seconds".format(mod, time.time()-ti))
356 | # delete
357 | del gxc_hvftrs[mod]
358 |
359 | # construct a joint matrix (PCA)
360 | logging.info("Constructing a joint matrix...")
361 | cells_all = np.hstack([metas[mod].index.values for mod in mods_selected]) # cell (all mods)
362 | pcX_all = []
363 |     for mod_y in features_selected: # mod_y: target modality whose feature space is imputed into
364 | logging.info("Imputing into {} space...".format(mod_y))
365 | # get all_features
366 | X = []
367 | for mod_x in mods_selected:
368 | logging.info("for {} cells...".format(mod_x))
369 | if mod_x == mod_y:
370 |                 smoothed_yy = smoothed_features[mod_y].T # smoothed_features is gene-by-cell; transpose to cell-by-gene
371 | X.append(smoothed_yy)
372 | else:
373 | # impute x cells y space
374 | smoothed_features_x = smoothed_features[mod_x]
375 | smoothed_features_y = smoothed_features[mod_y]
376 | if cross_mod_distance_measure == 'correlation':
377 | imputed_xy = impute_1pair(mod_x, mod_y,
378 | smoothed_features_x, smoothed_features_y,
379 | settings,
380 | knn=knn,
381 | relaxation=relaxation,
382 | impute_j=False,
383 | )
384 | elif cross_mod_distance_measure == 'cca':
385 | imputed_xy = impute_1pair_cca(mod_x, mod_y,
386 | smoothed_features_x, smoothed_features_y,
387 | settings,
388 | knn=knn,
389 | relaxation=relaxation,
390 | n_cca=n_cca,
391 | impute_j=False,
392 | )
393 | else:
394 | raise ValueError("Choose from correlation and cca")
395 | X.append(imputed_xy)
396 | X = np.vstack(X) # cell (all mods) by gene (mod_y)
397 |         # save X (imputed counts; for debugging only)
398 | if len(output_imputed_data_format)>0:
399 | np.save(output_imputed_data_format.format(mod_y), X)
400 | # PCA
401 | U, s, V = fbpca.pca(X, npc)
402 | del X
403 | pcX = U.dot(np.diag(s))
404 |         # normalize PCs: divide by the RMS entry (Frobenius norm / sqrt(n*p)) so each mod_y block is on a comparable scale
405 | sigma = np.sqrt(np.sum(s*s)/(pcX.shape[0]*pcX.shape[1]))
406 | pcX = pcX/sigma
407 | pcX_all.append(pcX)
408 |
409 | pcX_all = np.hstack(pcX_all)
410 | # save pcX_all
411 | df_pcX = pd.DataFrame(
412 | pcX_all,
413 | index=cells_all,
414 | columns=['PC'+str(i+1) for i in np.arange(pcX_all.shape[1])],
415 | )
416 | df_pcX.index.name = 'cell_id'
417 | df_pcX.to_csv(
418 | output_pcX_all,
419 | sep='\t', index=True, header=True,
420 | )
421 | logging.info("Saved output to: {}".format(output_pcX_all))
422 | return pcX_all, cells_all
423 |
424 | @profile(stream=f)
425 | def clustering_umap_routine(pcX_all, cells_all, mods_selected, metas,
426 | resolutions, k,
427 | umap_neighbors, min_dist,
428 | output_clst_and_umap,
429 | use_netUMAP=False,
430 | use_tsne=False,
431 | cluster_only=False,
432 | ):
433 |     """Leiden clustering (at each resolution) and UMAP/t-SNE embedding of the joint PC matrix
434 |     """
435 | # clustering
436 | df_clsts = []
437 | for resolution in resolutions:
438 | logging.info('resolution r: {}'.format(resolution))
439 | df_clst = clst_utils.clustering_routine(
440 | pcX_all,
441 | cells_all, k,
442 | resolution=resolution,
443 | metric='euclidean', option='plain', n_trees=10, search_k=-1, verbose=False)
444 | df_clsts.append(df_clst.rename(columns={'cluster':
445 | 'cluster_joint_r{}'.format(resolution)
446 | }))
447 | df_clst = pd.concat(df_clsts, axis=1)
448 |
449 | df_summary = df_clst
450 | # umap
451 | if not cluster_only:
452 | df_embed = clst_utils.run_umap_lite(
453 | pcX_all,
454 | cells_all,
455 | n_neighbors=umap_neighbors, min_dist=min_dist, n_dim=2,
456 | random_state=1,
457 | use_netUMAP=use_netUMAP,
458 | use_tsne=use_tsne,
459 | )
460 | df_summary = df_summary.join(df_embed)
461 | # add dataset info
462 | df_summary['dataset'] = ''
463 | for mod in mods_selected:
464 | _cells = metas[mod].index.values
465 | df_summary.loc[_cells, 'dataset'] = mod
466 | # name
467 | df_summary.index.name = 'cell_id'
468 | # save results
469 | df_summary.to_csv(
470 | output_clst_and_umap,
471 | sep='\t', header=True, index=True,
472 | )
473 | return df_summary
474 |
--------------------------------------------------------------------------------
/scripts/SingleCellFusion:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """SingleCellFusion main routine"""
3 |
4 | from __init__ import *
5 | # public packages
6 | import collections
7 | import os
8 | import warnings
9 | with warnings.catch_warnings():
10 | warnings.filterwarnings("ignore", category=FutureWarning)
11 | import anndata
12 |
13 | import logging
14 | from memory_profiler import profile
15 | from datetime import datetime
16 |
17 | # scripts from this package
18 | import cli_parser
19 | import basic_utils
20 | import SCF_utils
21 |
22 | ctime = datetime.now().strftime("%Y%m%d%H%M%S")
23 | f=open('memory_profile_SCF_{}.log'.format(ctime), 'w+')
24 | @profile(stream=f)
25 | def main():
26 | parser = cli_parser.create_parser()
27 | args = parser.parse_args()
28 |
29 | log = basic_utils.create_logger()
30 | logging.info('* Parsing Command Line Arguments')
31 |
32 | # specify output filenames
33 | outdir = args.output_dir
34 | if not os.path.isdir(outdir):
35 | os.makedirs(outdir)
36 | name = args.output_prefix
37 |
38 | output_clst_and_umap = outdir + '/{}_assigned_clusters_embeddings.tsv.gz'.format(name)
39 | output_pcX_all = outdir + '/{}_principal_components.tsv.gz'.format(name)
40 | output_figures = outdir + '/{}_{{}}.{{}}'.format(name)
41 |
42 | ### --- outputs for debugging only
43 | output_imputed_data_format = '' # leave it blank or set it to be: outdir + '/{}_imputed_data_{{}}.npy'.format(name)
44 | # output_cluster_centroids = outdir + '/{}_centroids.pkl'.format(name) # not used
45 | ### --- end
46 |
47 |     # get input files, modalities (internal rep of input files), and feature datasets
48 | data_files = args.input_datasets
49 | feature_files = args.feature_datasets
50 | mods_selected = [cli_parser.parse_filename(data_file) for data_file in data_files]
51 |     features_selected = [cli_parser.parse_filename(feature_file) for feature_file in feature_files]
52 | for features_modality in features_selected:
53 | assert (features_modality in mods_selected)
54 |
55 |     # get dataset metadata
56 |     mod_categories = args.input_modalities
57 |     assert len(mod_categories) == len(data_files)
58 |
59 |     for mod_category in mod_categories:
60 |         assert (mod_category in ['mc', 'atac', 'rna'])
61 |     settings = collections.OrderedDict()
62 |     Mod_info = collections.namedtuple('Mod_info', ['mod', 'mod_category', 'mod_direction',])
63 |     for mod, mod_category in zip(mods_selected, mod_categories):
64 |         mod_direction = cli_parser.modality_default_options(mod_category)
65 |         settings[mod] = Mod_info(mod, mod_category, mod_direction,)
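   |     # e.g. settings["10x_cells_v2"] = Mod_info(mod="10x_cells_v2", mod_category="rna",
   |     #                                          mod_direction=...), where mod_direction encodes
   |     #     the expected sign of correlation with gene expression (negative for mC); the actual
   |     #     values come from cli_parser.modality_default_options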
66 |
67 | # parameters
68 | # Within modality
69 | ps = {
70 | 'rna': args.smoothing_fractions[0],
71 | 'atac': args.smoothing_fractions[1],
72 | 'mc': args.smoothing_fractions[2],
73 | }
74 |
75 | # across modality
76 | knn = args.nearest_neighbors
77 | relaxation = args.relaxation
78 | # PCA
79 | npc = args.num_pcs
80 | # clustering
81 | k = args.leiden_n_neighbors
82 | resolutions = args.leiden_resolutions
83 | # umap
84 | umap_neighbors = args.umap_n_neighbors
85 | min_dist = args.umap_min_dist
86 |
87 | # precomputed_pca (skip integration)
88 | precomputed_pca_file = args.precomputed_pca_file
89 | # use netUMAP
90 | use_netUMAP = args.use_netUMAP
91 | use_tsne = args.use_tsne
92 |
93 | ### --- deprecated arguments (for testing; not open to general users)
94 | n_cca = 0 # deprecated args.n_cca
95 | drop_npcs = {
96 | 'mc': 0,
97 | 'rna': 0,
98 | 'atac': 0,
99 | }
100 | cross_mod_distance_measure = 'correlation' # or 'cca'
101 | ### --- end of deprecation
102 | logging.info(
103 | "knn = {}\n".format(knn) +
104 | "relaxation = {}\n".format(relaxation) +
105 | "number of PCs = {}\n".format(npc) +
106 | "ps = {}\n".format(ps) +
107 | "umap_n_neighbors = {}\n".format(umap_neighbors) +
108 | "umap_min_dist = {}\n".format(min_dist) +
109 | "leiden_resolutions = {}\n".format(resolutions) +
110 | "leiden_n_neighbors = {}\n".format(k)
111 | )
112 |
113 | # ## Read in data
114 | logging.info('* Begin integration')
115 | ### read in data (h5ad)
116 | metas = collections.OrderedDict()
117 | gxc_hvftrs = collections.OrderedDict()
118 | for mod, _file in zip(mods_selected, data_files):
119 | logging.info("processing {}".format(mod))
120 | # read
121 | logging.info("reading {}".format(_file))
122 | h5ad_mat = anndata.read_h5ad(_file)
123 | h5ad_mat.obs.index = [cell+"_"+mod for cell in h5ad_mat.obs.index] # resolve possible cellid conflict across datasets
124 |
125 | if settings[mod].mod_category == 'mc':
126 | # convert
127 | meta, mat = basic_utils.h5ad_to_scf_mc_format(h5ad_mat)
128 |             assert np.all(mat.columns.values == meta.index.values) # make sure cell names are in the same order as metas (important if saving knn mat)
129 | logging.info("{} genes, {} cells in the feature matrix".format(*mat.shape))
130 |
131 | metas[mod] = meta
132 | gxc_hvftrs[mod] = mat
133 |
134 | else:
135 | # convert
136 | meta, gc_mat = basic_utils.h5ad_to_scf_rna_format(h5ad_mat)
137 |             assert np.all(gc_mat.cell == meta.index.values) # make sure cell names are in the same order as metas (important if saving knn mat)
138 | logging.info("{} genes, {} cells in the feature matrix".format(*gc_mat.data.shape))
139 |
140 | metas[mod] = meta
141 | gxc_hvftrs[mod] = gc_mat
142 |
143 | logging.info('Done reading data')
144 |
145 | # ## run SCF to get integrated PCA
146 | if os.path.isfile(precomputed_pca_file):
147 | logging.info('Loading precomputed PCA matrix')
148 | precomputed_pca_df = pd.read_csv(precomputed_pca_file, sep='\t', index_col=0)
149 | pcX_all = precomputed_pca_df.values
150 | cells_all = precomputed_pca_df.index.values
151 | else:
152 | pcX_all, cells_all = SCF_utils.core_scf_routine(mods_selected, features_selected, settings,
153 | metas, gxc_hvftrs,
154 | ps, drop_npcs,
155 | cross_mod_distance_measure, knn, relaxation, n_cca,
156 | npc,
157 | output_pcX_all,
158 | output_imputed_data_format,
159 | )
160 | logging.info('Done integration into a common PC space')
161 |
162 | # run clustering and imputation
163 | df_summary = SCF_utils.clustering_umap_routine(pcX_all, cells_all, mods_selected, metas,
164 | resolutions, k,
165 | umap_neighbors, min_dist,
166 | output_clst_and_umap,
167 | use_netUMAP=use_netUMAP,
168 | use_tsne=use_tsne,
169 | )
170 | logging.info('Done clustering and UMAP')
171 |
172 | if __name__ == "__main__":
173 | main()
--------------------------------------------------------------------------------
/scripts/SingleCellFusion_prep:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # coding: utf-8
3 |
4 | import numpy as np
5 | import pandas as pd
6 | from scipy import sparse
7 | import time
8 | import re
9 | import warnings
10 | with warnings.catch_warnings():
11 | warnings.filterwarnings("ignore", category=FutureWarning)
12 |
13 | import anndata
14 | import scanpy
15 | import logging
16 | import os
17 |
18 | from __init__ import *
19 | import basic_utils
20 | import preproc_utils
21 | import cli_parser
22 |
23 | def get_gene_annotation(gene_annotation_file):
24 |     """Read a BED-like gene annotation file (columns: chr, start, end, ensid)
25 |     """
26 | genes = pd.read_csv(
27 | gene_annotation_file,
28 | sep='\t',
29 | header=None,
30 | usecols=[0,1,2,3],
31 | ).rename(columns={
32 | 0: 'chr',
33 | 1: 'start',
34 | 2: 'end',
35 | 3: 'ensid',
36 | })
37 | return genes
38 |
39 | def preproc(
40 | f_data,
41 | f_hvftr_data,
42 | normalization_option,
43 | sub_n=None,
44 | sub_frac=None,
45 | f_cov_data='',
46 | gene_lengths_base='', # required if normalization option == "tpm"
47 | gid_col='',
48 | cid_col='',
49 | global_mean_mc_col='', # required if normalization option == 'mc'
50 | ):
51 | """Generate normalized HVG matrices from raw count matrices
52 |
53 | normalization_option == 'mc' needs f_cov_data
54 | """
55 | # # highly variable features
56 | ti = time.time()
57 | logging.info("Preprocessing")
58 |
59 | # read data matrix
60 | if normalization_option == 'mc':
61 | # read in files
62 | logging.info("Reading in file {}".format(f_data))
63 | h5ad_mat = anndata.read_h5ad(f_data)
64 | ### subsampling ###
65 | if sub_n is not None or sub_frac is not None:
66 | logging.info("Subsampling to n={} frac={}".format(sub_n, sub_frac))
67 | scanpy.pp.subsample(h5ad_mat, n_obs=sub_n, fraction=sub_frac, random_state=0)
68 | ### end of subsampling ###
69 | logging.info("matrix size = {}".format(h5ad_mat.shape))
70 | meta, mat_mc = basic_utils.h5ad_to_scf_mc_format(h5ad_mat)
71 |
72 | logging.info("Reading in file {}".format(f_cov_data))
73 | h5ad_mat = anndata.read_h5ad(f_cov_data)
74 | ### subsampling ###
75 | if sub_n is not None or sub_frac is not None:
76 | logging.info("Subsampling to n={} frac={}".format(sub_n, sub_frac))
77 | scanpy.pp.subsample(h5ad_mat, n_obs=sub_n, fraction=sub_frac, random_state=0)
78 | ### end of subsampling ###
79 | logging.info("matrix size = {}".format(h5ad_mat.shape))
80 | meta, mat_c = basic_utils.h5ad_to_scf_mc_format(h5ad_mat)
81 |
82 | assert mat_mc.shape == mat_c.shape
83 | assert np.all(mat_mc.values <= mat_c.values)
84 |
85 | gxc_raw = GC_matrix(
86 | mat_mc.index.values,
87 | mat_mc.columns.values,
88 | {'mc': mat_mc.values, 'c': mat_c.values},
89 | )
90 |
91 | # check meta cells agree with gxc cells
92 | assert np.all(meta.index.values == gxc_raw.cell)
93 | # check genes are uniq
94 | assert len(gxc_raw.gene) == len(np.unique(gxc_raw.gene))
95 | # check cells are uniq
96 | assert len(gxc_raw.cell) == len(np.unique(gxc_raw.cell))
97 |
98 | # do
99 | gxc_hvftr = preproc_utils.preproc_methylation(
100 | gxc_raw,
101 | meta,
102 | global_value_col=global_mean_mc_col,
103 | base_call_cutoff=20,
104 | sufficient_coverage_fraction=0.95,
105 | hv_percentile=30,
106 | n_qcut=10,
107 | )
108 |
109 | # save
110 | logging.info("Saving to file {}".format(f_hvftr_data))
111 | h5ad_mat_hvftr = basic_utils.scf_mc_format_to_h5ad(meta, gxc_hvftr)
112 | h5ad_mat_hvftr.write(f_hvftr_data, compression='gzip')
113 |
114 | else:
115 | # read in files
116 | logging.info("Reading in file {}".format(f_data))
117 | h5ad_mat = anndata.read_h5ad(f_data)
118 | ### subsampling ###
119 | if sub_n is not None or sub_frac is not None:
120 | logging.info("Subsampling to n={} frac={}".format(sub_n, sub_frac))
121 | scanpy.pp.subsample(h5ad_mat, n_obs=sub_n, fraction=sub_frac, random_state=0)
122 | ### end of subsampling ###
123 | logging.info("matrix size = {}".format(h5ad_mat.shape))
124 |         if tosparse:  # module-level flag set from args.tosparse when run as a script
125 | h5ad_mat.X = sparse.coo_matrix(h5ad_mat.X)
126 | meta, gxc_raw = basic_utils.h5ad_to_scf_rna_format(h5ad_mat, gid_col, cid_col)
127 |
128 | # check meta cells agree with gxc cells
129 | assert np.all(meta.index.values == gxc_raw.cell)
130 | # check genes are uniq
131 | assert len(gxc_raw.gene) == len(np.unique(gxc_raw.gene))
132 | # check cells are uniq
133 | assert len(gxc_raw.cell) == len(np.unique(gxc_raw.cell))
134 |
135 | # get hvftrs
136 | logging.info("Preproc and get highly variable genes {}".format(f_data))
137 | if normalization_option == 'cpm':
138 | gxc_hvftr = preproc_utils.preproc_rna_cpm_based(
139 | gxc_raw,
140 | sufficient_cell_coverage=0.01,
141 | hv_percentile=30, hv_ncut=10)
142 | elif normalization_option == 'tpm':
143 | gene_lengths = gene_lengths_base.reindex(gxc_raw.gene)
144 | gxc_hvftr = preproc_utils.preproc_rna_tpm_based(
145 | gxc_raw, gene_lengths, impute_gene_lengths=True,
146 | sufficient_cell_coverage=0.01,
147 | hv_percentile=30, hv_ncut=10)
148 |
149 | # save
150 | logging.info("Saving to file {}".format(f_hvftr_data))
151 | h5ad_mat_hvftr = basic_utils.scf_rna_format_to_h5ad(meta, gxc_hvftr)
152 | h5ad_mat_hvftr.write(f_hvftr_data, compression='gzip')
153 | return
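   | 
   | # Example call (paths are illustrative), mirroring the tpm route taken by __main__ below:
   | #   preproc("datasets_pre/snatac.h5ad", "datasets/snatac.h5ad", "tpm",
   | #           gene_lengths_base=gene_lengths_base, gid_col="ensid")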
154 |
155 | if __name__ == "__main__":
156 | log = basic_utils.create_logger()
157 |
158 | parser = cli_parser.create_parser_preproc()
159 | args = parser.parse_args()
160 | logging.info('* Parsing Command Line Arguments')
161 |
162 | # get input files
163 | data_files = args.input_datasets
164 | data_cov_files = args.input_datasets_coverage
165 | mods_selected = [cli_parser.parse_filename(data_file) for data_file in data_files]
166 | gid_col = args.geneid_column
167 | cid_col = args.cellid_column
168 | global_mean_mc_col = args.global_mean_mc_column
169 | tosparse = args.tosparse
170 |
171 | # specify output files
172 | outdir = args.output_dir
173 | if not os.path.isdir(outdir):
174 | os.makedirs(outdir)
175 | outprefix = args.output_prefix
176 |
177 | output_files = [
178 | os.path.join(outdir, "{}_{}".format(outprefix, os.path.basename(input_file)))
179 | for input_file in data_files
180 | ]
181 |
182 | # parameters
183 | gene_annotation_file = args.gene_annotation_file
184 |
185 | # get dataset normalizations
186 | input_normalizations = args.input_normalizations
187 |
188 | # subsampling
189 | sub_n = args.sub_n
190 | sub_frac = args.sub_frac
191 |
192 | # check and set up
193 | gene_lengths_base = ''
194 | for option in input_normalizations:
195 | assert (option in ['mc', 'cpm', 'tpm'])
196 | if option == 'mc':
197 | assert len(data_cov_files) == len(data_files)
198 | elif option == 'tpm':
199 | assert gene_annotation_file
200 | df_genes = get_gene_annotation(gene_annotation_file).set_index('ensid')
201 | gene_lengths_base = (df_genes['end'] - df_genes['start'])
202 |
203 | for i, (data_file, output_file, norm_option) in enumerate(zip(
204 | data_files, output_files, input_normalizations
205 | )):
206 |
207 | if norm_option == 'mc':
208 | data_cov_file = data_cov_files[i]
209 | else:
210 | data_cov_file = ''
211 |
212 | preproc(
213 | data_file,
214 | output_file,
215 | norm_option,
216 | sub_n=sub_n,
217 | sub_frac=sub_frac,
218 | gene_lengths_base=gene_lengths_base, # required if normalization option == "tpm"
219 | f_cov_data=data_cov_file,
220 | gid_col=gid_col,
221 | cid_col=cid_col,
222 | global_mean_mc_col=global_mean_mc_col,
223 | )
--------------------------------------------------------------------------------
/scripts/__init__.py:
--------------------------------------------------------------------------------
1 | """Import commonly used libraries"""
2 |
3 | import time
4 | import logging
5 | import glob
6 | import os
7 | import numpy as np
8 | import pandas as pd
9 | import collections
10 | # from natsort import natsorted
11 |
12 | # matplotlib
13 | import matplotlib as mpl
14 | import matplotlib.pyplot as plt
15 | mpl.rcParams['pdf.fonttype'] = 42 # editable text in matplotlib
16 | mpl.rcParams['svg.fonttype'] = 'none'
17 |
18 | import matplotlib.ticker as mtick
19 | PercentFormat = mtick.FuncFormatter(lambda y, _: '{:.3%}'.format(y))
20 | ScalarFormat = mtick.ScalarFormatter()
21 |
22 | # seaborn
23 | import seaborn as sns
24 | sns.set_style('ticks', rc={'axes.grid':True})
25 | sns.set_context('talk')
26 |
27 | # data structures
28 | GC_matrix = collections.namedtuple('GC_matrix', ['gene', 'cell', 'data'])
29 |
--------------------------------------------------------------------------------
/scripts/basic_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | """
3 | from __init__ import *
4 |
5 | import numpy as np
6 | import pandas as pd
7 |
8 | import os, sys  # sys is used by get_size_in_GB
9 | from scipy import sparse
10 | import anndata
11 | import logging
12 |
13 | def get_size_in_GB(obj):
14 |     """Return the approximate size of obj in gigabytes (via sys.getsizeof)"""
15 | GB = 1024**3
16 | return sys.getsizeof(obj)/GB
17 |
18 | def scf_rna_format_to_h5ad(meta, gc_mat):
19 | """
20 | input:
21 | - meta (cell metadata)
22 | - gc_mat
23 |
24 | output:
25 | - anndata
26 | """
27 | X = gc_mat.data.T # cell by gene [scipy sparse matrix]
28 | obs = meta # cell annotation [pandas dataframe]
29 | var = pd.DataFrame(index=gc_mat.gene) # gene annotation [pandas dataframe]
30 |
31 | h5ad_mat = anndata.AnnData(X, obs, var,)
32 |
33 | return h5ad_mat
34 |
35 | def scf_mc_format_to_h5ad(meta, mat):
36 | """
37 | input:
38 | - meta (cell metadata)
39 | - mat
40 |
41 | output:
42 | - anndata
43 | """
44 | X = mat.T.values # cell by gene [numpy array]
45 | obs = meta # cell annotation [pandas dataframe]
46 | var = pd.DataFrame(index=mat.index) # gene annotation [pandas dataframe]
47 |
48 | h5ad_mat = anndata.AnnData(X, obs, var,)
49 |
50 | return h5ad_mat
51 |
52 | def h5ad_to_scf_rna_format(h5ad_mat, gid_col='', cid_col=''):
53 | """
54 | input:
55 | - anndata
56 | output:
57 | - meta (cell metadata)
58 | - gc_mat
59 |
60 | """
61 | meta = h5ad_mat.obs
62 | if gid_col:
63 | genes = h5ad_mat.var[gid_col].values
64 | else:
65 | genes = h5ad_mat.var.index.values
66 | if cid_col:
67 | cells = h5ad_mat.obs[cid_col].values
68 | meta = meta.set_index(cid_col)
69 | else:
70 | cells = h5ad_mat.obs.index.values
71 |
72 | gc_mat = GC_matrix(genes,
73 | cells,
74 | h5ad_mat.X.T,
75 | )
76 | return meta, gc_mat
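   | 
   | # Round-trip sketch (file name is illustrative):
   | #   adata = anndata.read_h5ad("hvg_features.h5ad")
   | #   meta, gc_mat = h5ad_to_scf_rna_format(adata)
   | #   adata_again = scf_rna_format_to_h5ad(meta, gc_mat)   # back to cell-by-gene AnnData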
77 |
78 | def h5ad_to_scf_mc_format(h5ad_mat, gid_col='', cid_col=''):
79 | """
80 | input:
81 | - anndata
82 | output:
83 | - meta (cell metadata)
84 | - pandas data frame
85 | """
86 |
87 | meta = h5ad_mat.obs
88 | if gid_col:
89 | genes = h5ad_mat.var[gid_col].values
90 | else:
91 | genes = h5ad_mat.var.index.values
92 | if cid_col:
93 | cells = h5ad_mat.obs[cid_col].values
94 | meta = meta.set_index(cid_col)
95 | else:
96 | cells = h5ad_mat.obs.index.values
97 | mat = pd.DataFrame(h5ad_mat.X.T,
98 | index=genes,
99 | columns=cells,
100 | )
101 | return meta, mat
102 |
103 | def diag_matrix(X, rows=np.array([]), cols=np.array([]), threshold=None):
104 | """Diagonalize a matrix as much as possible
105 | """
106 | di, dj = X.shape
107 | transposed = 0
108 |
109 | if di > dj:
110 | di, dj = dj, di
111 | X = X.T.copy()
112 | rows, cols = cols.copy(), rows.copy()
113 | transposed = 1
114 |
115 | # start (di <= dj)
116 | new_X = X.copy()
117 | new_rows = rows.copy()
118 | new_cols = cols.copy()
119 | if new_rows.size == 0:
120 | new_rows = np.arange(di)
121 | if new_cols.size == 0:
122 | new_cols = np.arange(dj)
123 |
124 |     # bring the greatest value of the lower-right submatrix to the diagonal position
125 | for idx in range(min(di, dj)):
126 |
127 | T = new_X[idx: , idx: ]
128 | i, j = np.unravel_index(T.argmax(), T.shape) # get the coords of the max element of T
129 |
130 | if threshold and T[i, j] < threshold:
131 | dm = idx # new_X[:dm, :dm] is done (0, 1, ..., dm-1) excluding dm
132 | break
133 | else:
134 | dm = idx+1 # new_X[:dm, :dm] will be done
135 |
136 | # swap row idx, idx+i
137 | tmp = new_X[idx, :].copy()
138 | new_X[idx, :] = new_X[idx+i, :].copy()
139 | new_X[idx+i, :] = tmp
140 |
141 | tmp = new_rows[idx]
142 | new_rows[idx] = new_rows[idx+i]
143 | new_rows[idx+i] = tmp
144 |
145 | # swap col idx, idx+j
146 | tmp = new_X[:, idx].copy()
147 | new_X[:, idx] = new_X[:, idx+j].copy()
148 | new_X[:, idx+j] = tmp
149 |
150 | tmp = new_cols[idx]
151 | new_cols[idx] = new_cols[idx+j]
152 | new_cols[idx+j] = tmp
153 |
154 | #
155 | if dm == dj:
156 | pass
157 | elif dm < dj: # free columns
158 |
159 | col_dict = {}
160 | sorted_col_idx = np.arange(dm)
161 | free_col_idx = np.arange(dm, dj)
162 | linked_rowcol_idx = new_X[:, dm:].argmax(axis=0)
163 |
164 | for col in sorted_col_idx:
165 | col_dict[col] = [col]
166 | for col, key in zip(free_col_idx, linked_rowcol_idx):
167 | if key < dm:
168 | col_dict[key] = col_dict[key] + [col]
169 | else:
170 | col_dict[key] = [col]
171 |
172 |
173 | new_col_order = np.hstack([col_dict[key] for key in sorted(col_dict.keys())])
174 |
175 | # update new_X new_cols
176 | new_X = new_X[:, new_col_order].copy()
177 | new_cols = new_cols[new_col_order]
178 | else:
179 | raise ValueError("Unexpected situation: dm > dj")
180 |
181 | if transposed:
182 | new_X = new_X.T
183 | new_rows, new_cols = new_cols, new_rows
184 | return new_X, new_rows, new_cols
185 |
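# The following sketch (hypothetical toy data, not part of the pipeline) shows
# how diag_matrix reorders rows/columns so that large entries land on the diagonal.
def _example_diag_matrix():
    X = np.array([[0, 5],
                  [7, 1]])
    new_X, new_rows, new_cols = diag_matrix(
        X, rows=np.array(['r0', 'r1']), cols=np.array(['c0', 'c1']))
    # expected: new_X ~ [[7, 1], [0, 5]], new_rows ~ ['r1', 'r0'], cols unchanged
    return new_X, new_rows, new_cols
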
186 | def diag_matrix_rows(X, rows=np.array([]), cols=np.array([]),):
187 | """Diagonalize a matrix as much as possible by only rearrange rows
188 | """
189 | di, dj = X.shape
190 |
191 | new_X = X.copy()
192 | new_rows = rows.copy()
193 | new_cols = cols.copy()
194 |
195 | # free to move rows
196 | row_dict = {}
197 | free_row_idx = np.arange(di)
198 | linked_rowcol_idx = new_X.argmax(axis=1) # the column with max value for each row
199 |
200 | for row, key in zip(free_row_idx, linked_rowcol_idx):
201 | if key in row_dict.keys():
202 | row_dict[key] = row_dict[key] + [row]
203 | else:
204 | row_dict[key] = [row]
205 |
206 | new_row_order = np.hstack([row_dict[key] for key in sorted(row_dict.keys())])
207 | # update new_X new_cols
208 | new_X = new_X[new_row_order, :].copy()
209 | new_rows = new_rows[new_row_order]
210 |
211 | return new_X, new_rows, new_cols
212 |
213 | def get_grad_colors(n, cmap='copper'):
214 | """Generate n colors from a given colormap (a matplotlib.cm)
215 | """
216 | from matplotlib import cm
217 | cmap = cm.get_cmap(cmap)
218 | return [cmap(int(i)) for i in np.linspace(0, 255, n)]
219 |
220 | def logcpm(counts):
221 | """
222 | Args:
223 | - gene-cell matrix
224 | """
225 | cov = counts.sum(axis=0)
226 | logcpm = np.log10(counts.divide(cov, axis=1)*1000000 + 1)
227 | return logcpm
228 |
229 | def logtpm(counts, gene_lengths):
230 | """
231 | Args:
232 | - gene-cell matrix
233 | - gene_lengths: a series indexed by gene_id
234 | """
235 | tpm = counts.divide(gene_lengths.loc[counts.index], axis=0)
236 | cov = tpm.sum(axis=0)
237 | logtpm = np.log10((tpm.divide(cov, axis=1))*1000000 + 1)
238 | return logtpm
239 |
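# A minimal sketch (toy counts, hypothetical) of the expected input orientation
# for logcpm/logtpm: a gene-by-cell pandas DataFrame of raw counts.
def _example_logcpm():
    counts = pd.DataFrame([[10, 0], [90, 100]],
                          index=['geneA', 'geneB'], columns=['cell1', 'cell2'])
    # cell1 depth = 100, so geneA: log10(10/100 * 1e6 + 1) ~ 5.0
    return logcpm(counts)
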
240 | def sparse_logcpm(gc_matrix, mode='logcpm', lib_size=[]):
241 | """
242 | """
243 | lib_size = np.array(lib_size)
244 | if np.size(lib_size) == 0:
245 | lib_size = gc_matrix.data.sum(axis=0)
246 |
247 | lib_size_inv = sparse.diags(np.ravel(1.0/(1e-7+lib_size)))
248 | cpm = (gc_matrix.data).dot(lib_size_inv*1e6).tocoo()
249 |
250 | if mode == 'logcpm':
251 | cpm.data = np.log10(cpm.data + 1)
252 | elif mode == 'cpm':
253 | pass
254 |
255 | gc_cpm = GC_matrix(
256 | gc_matrix.gene,
257 | gc_matrix.cell,
258 | cpm,
259 | )
260 |
261 | return gc_cpm
262 |
263 | def sparse_logtpm(gc_matrix, gene_lengths):
264 | """
265 | gene_lengths: array like
266 |
267 | """
268 | gene_lengths = np.array(gene_lengths)
269 | gene_length_inv = sparse.diags(np.ravel(1.0/gene_lengths))
270 | tmp = (gene_length_inv).dot(gc_matrix.data).tocoo()
271 | lib_size_inv = sparse.diags(np.ravel(1.0/tmp.sum(axis=0)))
272 |
273 | logtpm = tmp.dot(lib_size_inv*1e6).tocoo()
274 | logtpm.data = np.log10(logtpm.data + 1)
275 |
276 | gc_logtpm = GC_matrix(
277 | gc_matrix.gene,
278 | gc_matrix.cell,
279 | logtpm,
280 | )
281 |
282 | return gc_logtpm
283 |
284 | class cd:
285 | """Context manager for changing the current working directory"""
286 | def __init__(self, newPath):
287 | self.newPath = os.path.expanduser(newPath)
288 |
289 | def __enter__(self):
290 | self.savedPath = os.getcwd()
291 | os.chdir(self.newPath)
292 |
293 | def __exit__(self, etype, value, traceback):
294 | os.chdir(self.savedPath)
295 |
296 | def create_logger(name='log'):
297 | """
298 | args: logger name
299 |
300 | return: a logger object
301 | """
302 | logging.basicConfig(
303 | format='%(asctime)s %(message)s',
304 | datefmt='%m/%d/%Y %I:%M:%S %p',
305 | level=logging.INFO)
306 | return logging.getLogger(name)
307 |
308 | def set_value_by_percentile(this, lo, hi):
309 | """set `this` below or above percentiles to given values
310 | this (float)
311 | lo(float)
312 | hi(float)
313 | """
314 | if this < lo:
315 | return lo
316 | elif this > hi:
317 | return hi
318 | else:
319 | return this
320 |
321 | def mcc_percentile_norm(mcc, low_p=5, hi_p=95):
322 | """
323 | set values above and below specific percentiles to be at the value of percentiles
324 |
325 | args: mcc, low_p, hi_p
326 |
327 | return: normalized mcc levels
328 | """
329 | # mcc_norm = [np.isnan(mcc) for mcc_i in list(mcc)]
330 | mcc_norm = np.copy(mcc)
331 | mcc_norm = mcc_norm[~np.isnan(mcc_norm)]
332 |
333 | lo = np.percentile(mcc_norm, low_p)
334 | hi = np.percentile(mcc_norm, hi_p)
335 |
336 | mcc_norm = [set_value_by_percentile(mcc_i, lo, hi) for mcc_i in list(mcc)]
337 | mcc_norm = np.array(mcc_norm)
338 |
339 | return mcc_norm
340 |
341 | def plot_tsne_values(df, tx='tsne_x', ty='tsne_y', tc='mCH',
342 | low_p=5, hi_p=95,
343 | s=2,
344 | cbar_label=None,
345 | output=None, show=True, close=False,
346 | t_xlim='auto', t_ylim='auto', title=None, figsize=(8,6), **kwargs):
347 | """
348 | tSNE plot
349 |
350 |     xlim and ylim are set to facilitate displaying glial clusters only
351 |
352 | """
353 | import matplotlib.pyplot as plt
354 | import seaborn as sns
355 |
356 | fig, ax = plt.subplots(figsize=figsize)
357 |
358 | im = ax.scatter(df[tx], df[ty], s=s,
359 | c=mcc_percentile_norm(df[tc].values, low_p=low_p, hi_p=hi_p), **kwargs)
360 | if title:
361 | ax.set_title(title)
362 | else:
363 | ax.set_title(tc)
364 | ax.set_xlabel(tx)
365 | ax.set_ylabel(ty)
366 | # ax.set_aspect('auto')
367 |
368 |
369 | clb = plt.colorbar(im, ax=ax)
370 | if cbar_label:
371 | clb.set_label(cbar_label, rotation=270, labelpad=10)
372 |
373 | if t_xlim == 'auto':
374 | t_xlim = [np.nanpercentile(df[tx].values, 0.1), np.nanpercentile(df[tx].values, 99.9)]
375 | t_xlim[0] = t_xlim[0] - 0.1*(t_xlim[1] - t_xlim[0])
376 | t_xlim[1] = t_xlim[1] + 0.1*(t_xlim[1] - t_xlim[0])
377 | ax.set_xlim(t_xlim)
378 | elif t_xlim:
379 | ax.set_xlim(t_xlim)
380 | else:
381 | pass
382 |
383 | if t_ylim == 'auto':
384 | t_ylim = [np.nanpercentile(df[ty].values, 0.1), np.nanpercentile(df[ty].values, 99.9)]
385 | t_ylim[0] = t_ylim[0] - 0.1*(t_ylim[1] - t_ylim[0])
386 | t_ylim[1] = t_ylim[1] + 0.1*(t_ylim[1] - t_ylim[0])
387 | ax.set_ylim(t_ylim)
388 | elif t_ylim:
389 | ax.set_ylim(t_ylim)
390 | else:
391 | pass
392 |
393 | fig.tight_layout()
394 | if output:
395 | fig.savefig(output)
396 | print('Saved to ' + output)
397 | if show:
398 | plt.show()
399 | if close:
400 | plt.close(fig)
401 |
402 | def get_kwcolors(labels, colors):
403 | """Generate a dictinary of {label: color} using unique labels and a list of availabel colors
404 | """
405 | nc = len(colors)
406 | nl = len(labels)
407 | n_repeats = int((nl + nc - 1)/nc)
408 | colors = list(colors)*n_repeats
409 |
410 | kw_colors = {l:c for (l,c) in zip(labels, colors)}
411 | return kw_colors
412 |
413 | def rgb2hex(r,g,b):
414 | """From rgb (255, 255, 255) to hex
415 | """
416 | hex = "#{:02x}{:02x}{:02x}".format(int(r),int(g),int(b))
417 | return hex
418 |
419 | def gen_colors(n, l=0.6, s=0.6, colors=None):
420 | """Generate compatible and distinct hex colors
421 | """
422 | if not colors:
423 | import colorsys
424 | hs = np.linspace(0, 1, n, endpoint=False)
425 |         rgbs = [rgb2hex(*(255*np.array(colorsys.hls_to_rgb(h, l, s))))  # 255 (not 256) keeps channels within 0-255
426 | for h in hs]
427 | return rgbs
428 | else:
429 | clrs = [colors[i%len(colors)] for i in range(n)]
430 | return clrs
431 |
432 | def myScatter(ax, df, x, y, l,
433 | s=20,
434 | sample_frac=None,
435 | sample_n=None,
436 | legend_size=None,
437 | legend_kws=None,
438 | grey_label='unlabeled',
439 | shuffle=True,
440 | random_state=None,
441 | legend_mode=0,
442 | kw_colors=False,
443 | colors=['C0', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C8', 'C9'], **kwargs):
444 | """
445 | take an axis object and make a scatter plot
446 |
447 |     - kw_colors is a dictionary {label: color}
448 | """
449 |
450 | import matplotlib.pyplot as plt
451 | import seaborn as sns
452 | df = df.copy()
453 | # shuffle (and copy) data
454 | if sample_n:
455 | df = (df.groupby(l).apply(lambda x: x.sample(min(len(x), sample_n), random_state=random_state))
456 | .reset_index(level=0, drop=True)
457 | )
458 | if sample_frac:
459 | df = (df.groupby(l).apply(lambda x: x.sample(frac=sample_frac, random_state=random_state))
460 | .reset_index(level=0, drop=True)
461 | )
462 | if shuffle:
463 | df = df.sample(frac=1, random_state=random_state)
464 |
465 | if not kw_colors:
466 | # add a color column
467 | inds, catgs = pd.factorize(df[l])
468 | df['c'] = [colors[i%len(colors)] if catgs[i]!=grey_label else 'grey'
469 | for i in inds]
470 | else:
471 | df['c'] = [kw_colors[i] if i!=grey_label else 'grey' for i in df[l]]
472 |
473 | # take care of legend
474 | if legend_mode != -1:
475 | for ind, row in df.groupby(l).first().iterrows():
476 | ax.scatter(row[x], row[y], c=row['c'], label=ind, s=s, **kwargs)
477 |
478 | if legend_mode == -1:
479 | pass
480 | elif legend_mode == 0:
481 | lgnd = ax.legend()
482 | elif legend_mode == 1:
483 | # Shrink current axis's height by 10% on the bottom
484 | box = ax.get_position()
485 | ax.set_position([box.x0, box.y0 + box.height * 0.1,
486 | box.width, box.height * 0.9])
487 | lgnd = ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.07),
488 | ncol=6, fancybox=False, shadow=False)
489 | elif legend_mode == 2:
490 | # Shrink current axis's width by 10% on the bottom
491 | box = ax.get_position()
492 | ax.set_position([box.x0 + box.width*0.1, box.y0,
493 | box.width*0.8, box.height])
494 |
495 | if legend_kws:
496 | lgnd = ax.legend(**legend_kws)
497 |
498 | if legend_mode != -1 and legend_size:
499 | for handle in lgnd.legendHandles:
500 | handle._sizes = [legend_size]
501 |
502 | # backgroud (grey)
503 | df_grey = df.loc[df['c']=='grey']
504 | if not df_grey.empty:
505 | ax.scatter(df_grey[x],
506 | df_grey[y],
507 | c=df_grey['c'], s=s, **kwargs)
508 | # actual plot
509 | df_tmp = df.loc[df['c']!='grey']
510 | ax.scatter(df_tmp[x],
511 | df_tmp[y],
512 | c=df_tmp['c'], s=s, **kwargs)
513 |
514 | return
515 |
516 | def plot_tsne_labels_ax(df, ax, tx='tsne_x', ty='tsne_y', tc='cluster_ID',
517 | sample_frac=None,
518 | sample_n=None,
519 | legend_size=None,
520 | legend_kws=None,
521 | grey_label='unlabeled',
522 | legend_mode=0,
523 | s=1,
524 | shuffle=True,
525 | random_state=None,
526 | t_xlim='auto', t_ylim='auto', title=None,
527 | legend_loc='lower right',
528 | colors=['C0', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C8', 'C9'], **kwargs):
529 | """
530 | tSNE plot
531 |
532 |     xlim and ylim are set to facilitate displaying glial clusters only
533 |
534 | # avoid gray-like 'C7' in colors
535 | # color orders are arranged for exci-inhi-glia plot 11/1/2017
536 | """
537 | import matplotlib.pyplot as plt
538 |
539 | myScatter(ax, df, tx, ty, tc,
540 | s=s,
541 | sample_frac=sample_frac,
542 | sample_n=sample_n,
543 | legend_size=legend_size,
544 | legend_kws=legend_kws,
545 | shuffle=shuffle,
546 | grey_label=grey_label,
547 | random_state=random_state,
548 | legend_mode=legend_mode,
549 | colors=colors, **kwargs)
550 |
551 | if title:
552 | ax.set_title(title)
553 | else:
554 | ax.set_title(tc)
555 | ax.set_xlabel(tx)
556 | ax.set_ylabel(ty)
557 | # ax.set_aspect('auto')
558 |
559 | if t_xlim == 'auto':
560 | t_xlim = [np.nanpercentile(df[tx].values, 0.1), np.nanpercentile(df[tx].values, 99.9)]
561 | t_xlim[0] = t_xlim[0] - 0.1*(t_xlim[1] - t_xlim[0])
562 | t_xlim[1] = t_xlim[1] + 0.1*(t_xlim[1] - t_xlim[0])
563 | ax.set_xlim(t_xlim)
564 | elif t_xlim:
565 | ax.set_xlim(t_xlim)
566 | else:
567 | pass
568 |
569 | if t_ylim == 'auto':
570 | t_ylim = [np.nanpercentile(df[ty].values, 0.1), np.nanpercentile(df[ty].values, 99.9)]
571 | t_ylim[0] = t_ylim[0] - 0.1*(t_ylim[1] - t_ylim[0])
572 | t_ylim[1] = t_ylim[1] + 0.1*(t_ylim[1] - t_ylim[0])
573 | ax.set_ylim(t_ylim)
574 | elif t_ylim:
575 | ax.set_ylim(t_ylim)
576 | else:
577 | pass
578 |
579 | return
580 |
581 |
582 | def plot_tsne_labels(df, tx='tsne_x', ty='tsne_y', tc='cluster_ID',
583 | grey_label='unlabeled',
584 | sample_frac=None,
585 | sample_n=None,
586 | legend_size=None,
587 | legend_mode=0,
588 | legend_kws=None,
589 | s=1,
590 | random_state=None,
591 | output=None, show=True, close=False,
592 | t_xlim='auto', t_ylim='auto', title=None, figsize=(8,6),
593 | legend_loc='lower right',
594 | colors=['C0', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C8', 'C9'], **kwargs):
595 | """
596 | tSNE plot
597 |
598 |     xlim and ylim are set to facilitate displaying glial clusters only
599 |
600 | # avoid gray-like 'C7' in colors
601 | # color orders are arranged for exci-inhi-glia plot 11/1/2017
602 | """
603 | import matplotlib.pyplot as plt
604 | import seaborn as sns
605 | fig, ax = plt.subplots(figsize=figsize)
606 |
607 | myScatter(ax, df, tx, ty, tc,
608 | s=s,
609 | sample_frac=sample_frac,
610 | sample_n=sample_n,
611 | legend_size=legend_size,
612 | legend_kws=legend_kws,
613 | grey_label=grey_label,
614 | random_state=random_state,
615 | legend_mode=legend_mode,
616 | colors=colors, **kwargs)
617 |
618 | if title:
619 | ax.set_title(title)
620 | else:
621 | ax.set_title(tc)
622 | ax.set_xlabel(tx)
623 | ax.set_ylabel(ty)
624 | # ax.set_aspect('auto')
625 |
626 | if t_xlim == 'auto':
627 | t_xlim = [np.nanpercentile(df[tx].values, 0.1), np.nanpercentile(df[tx].values, 99.9)]
628 | t_xlim[0] = t_xlim[0] - 0.1*(t_xlim[1] - t_xlim[0])
629 | t_xlim[1] = t_xlim[1] + 0.1*(t_xlim[1] - t_xlim[0])
630 | ax.set_xlim(t_xlim)
631 | elif t_xlim:
632 | ax.set_xlim(t_xlim)
633 | else:
634 | pass
635 |
636 | if t_ylim == 'auto':
637 | t_ylim = [np.nanpercentile(df[ty].values, 0.1), np.nanpercentile(df[ty].values, 99.9)]
638 | t_ylim[0] = t_ylim[0] - 0.1*(t_ylim[1] - t_ylim[0])
639 | t_ylim[1] = t_ylim[1] + 0.1*(t_ylim[1] - t_ylim[0])
640 | ax.set_ylim(t_ylim)
641 | elif t_ylim:
642 | ax.set_ylim(t_ylim)
643 | else:
644 | pass
645 |
646 | if output:
647 | fig.savefig(output)
648 | print('Saved to ' + output)
649 | if show:
650 | plt.show()
651 | if close:
652 | plt.close(fig)
653 |
654 | def plot_tsne_values_ax(df, ax, tx='tsne_x', ty='tsne_y', tc='mCH',
655 | low_p=5, hi_p=95,
656 | s=2,
657 | cbar=True,
658 | cbar_ax=None,
659 | cbar_label=None,
660 | t_xlim='auto', t_ylim='auto', title=None, **kwargs):
661 | """
662 | tSNE plot
663 |
664 |     xlim and ylim are set to facilitate displaying glial clusters only
665 |
666 | """
667 | import matplotlib.pyplot as plt
668 |
669 |
670 | im = ax.scatter(df[tx], df[ty], s=s,
671 | c=mcc_percentile_norm(df[tc].values, low_p=low_p, hi_p=hi_p), **kwargs)
672 | if title:
673 | ax.set_title(title)
674 | else:
675 | ax.set_title(tc)
676 | # ax.set_aspect('auto')
677 | if cbar:
678 | if cbar_ax:
679 | clb = plt.colorbar(im, cax=cbar_ax, shrink=0.4)
680 | else:
681 | clb = plt.colorbar(im, cax=ax, shrink=1)
682 | if cbar_label:
683 | clb.set_label(cbar_label, rotation=270, labelpad=10)
684 |
685 | if t_xlim == 'auto':
686 | t_xlim = [np.nanpercentile(df[tx].values, 0.1), np.nanpercentile(df[tx].values, 99.9)]
687 | t_xlim[0] = t_xlim[0] - 0.1*(t_xlim[1] - t_xlim[0])
688 | t_xlim[1] = t_xlim[1] + 0.1*(t_xlim[1] - t_xlim[0])
689 | ax.set_xlim(t_xlim)
690 | elif t_xlim:
691 | ax.set_xlim(t_xlim)
692 | else:
693 | pass
694 |
695 | if t_ylim == 'auto':
696 | t_ylim = [np.nanpercentile(df[ty].values, 0.1), np.nanpercentile(df[ty].values, 99.9)]
697 | t_ylim[0] = t_ylim[0] - 0.1*(t_ylim[1] - t_ylim[0])
698 | t_ylim[1] = t_ylim[1] + 0.1*(t_ylim[1] - t_ylim[0])
699 | ax.set_ylim(t_ylim)
700 | elif t_ylim:
701 | ax.set_ylim(t_ylim)
702 | else:
703 | pass
704 |
705 | return im
706 |
707 |
708 | def get_mcc(df, base_call_cutoff=100, sufficient_coverage_fraction=1, suffix=True, fillna=True):
709 | """Get mcc matrix from mc_c matrix (filtering out low coverage gene or bins)
710 | """
711 | logging.info('Getting mcc matrix from mc and c')
712 | logging.info('base_call_cutoff={}, sufficient_coverage_fraction={}'.format(
713 | base_call_cutoff, sufficient_coverage_fraction))
714 |
715 | df_c = df.filter(regex="_c$")
716 | df_c.columns = [col[:-len('_c')] for col in df_c.columns]
717 | df_mc = df.filter(regex="_mc$")
718 | df_mc.columns = [col[:-len('_mc')] for col in df_mc.columns]
719 |     # keep features covered (>= base_call_cutoff) in at least this fraction of cells; df.shape[1]/2 = n_cells since df carries both _mc and _c columns
720 | condition = (df_c > base_call_cutoff).sum(axis=1) >= sufficient_coverage_fraction*(df.shape[1])/2.0
721 |
722 | logging.info("Matrix size before pruning (# features, # cells) = "+ str(df_c.shape))
723 | logging.info("Matrix size after pruning (# features, # cells) = "+ str(df_c.loc[condition].shape))
724 |
725 | # get mcc matrix with kept bins and nan values for low coverage sites
726 | df_c_nan = df_c.copy()
727 | df_c_nan[df_c < base_call_cutoff] = np.nan
728 | df_mcc = df_mc.loc[condition]/df_c_nan.loc[condition]
729 | logging.info(df_mcc.shape)
730 |
731 | # imputation (missing value -> mean value of all cells)
732 | if fillna:
733 | logging.info('Imputing data... (No effect if sufficient_coverage_fraction=1)')
734 | means = df_mcc.mean(axis=1)
735 | fill_value = pd.DataFrame({col: means for col in df_mcc.columns})
736 | df_mcc.fillna(fill_value, inplace=True)
737 |
738 | # add suffix
739 | if suffix:
740 | df_mcc.columns = df_mcc.columns.values + '_mcc'
741 |
742 | return df_mcc
743 |
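# A minimal sketch (toy data, hypothetical) of get_mcc's expected input: one
# '_mc' and one '_c' column per cell, with genes/bins as rows.
def _example_get_mcc():
    df = pd.DataFrame({
        'cellA_mc': [50, 5],  'cellA_c': [200, 150],
        'cellB_mc': [20, 1],  'cellB_c': [120, 10],  # 10 < cutoff -> NaN, imputed
    }, index=['gene1', 'gene2'])
    # returns mc/c per cell, e.g. gene1: cellA_mcc = 50/200 = 0.25
    return get_mcc(df, base_call_cutoff=100, sufficient_coverage_fraction=0.5)
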
744 | def get_mcc_lite(mc_table, c_table, base_call_cutoff=100, sufficient_coverage_fraction=1, fillna=True):
745 | """Given 2 numpy array, return mcc table
746 | Gene/region by sample matrix
747 | """
748 | df_c = pd.DataFrame(c_table)
749 | df_mc = pd.DataFrame(mc_table)
750 | assert df_c.shape == df_mc.shape
751 |
752 | # a gene is sufficiently covered in % of cells
753 | condition = (df_c > base_call_cutoff).sum(axis=1) >= sufficient_coverage_fraction*(df_c.shape[1])
754 |
755 | logging.info("Matrix size before pruning (# features, # cells) = "+ str(df_c.shape))
756 | logging.info("Matrix size after pruning (# features, # cells) = "+ str(df_c.loc[condition].shape))
757 |
758 | # get mcc matrix with kept bins and nan values for low coverage sites
759 | df_c_nan = df_c.copy()
760 | df_c_nan[df_c < base_call_cutoff] = np.nan
761 | df_mcc = df_mc.loc[condition]/df_c_nan.loc[condition]
762 | logging.info(df_mcc.shape)
763 |
764 | # imputation (missing value -> mean value of all cells)
765 | if fillna:
766 | logging.info('Imputing data... (No effect if sufficient_coverage_fraction=1)')
767 | means = df_mcc.mean(axis=1)
768 | fill_value = pd.DataFrame({col: means for col in df_mcc.columns})
769 | df_mcc.fillna(fill_value, inplace=True)
770 |
771 | # return matrix and index (regions)
772 | return df_mcc.values, df_mcc.index.values
773 |
774 | def get_mcc_lite_v2(df_c, df_mc, base_call_cutoff):
775 | """
776 | """
777 | # get mcc matrix with kept bins and nan values for low coverage sites
778 | df_c_nan = df_c.copy()
779 | df_c_nan[df_c < base_call_cutoff] = np.nan
780 | df_mcc = df_mc/df_c_nan
781 | logging.info(df_mcc.shape)
782 |
783 | # imputation (missing value -> mean value of all cells)
784 | means = df_mcc.mean(axis=1)
785 | fill_value = pd.DataFrame({col: means for col in df_mcc.columns})
786 | df_mcc.fillna(fill_value, inplace=True)
787 |
788 | return df_mcc
789 |
790 | def get_mcc_lite_v3(df_c, df_mc, base_call_cutoff):
791 | """
792 | """
793 | # get mcc matrix with kept bins and nan values for low coverage sites
794 | df_c_nan = df_c.copy()
795 | df_c_nan[df_c < base_call_cutoff] = np.nan
796 | df_mcc = df_mc/df_c_nan
797 | return df_mcc
798 |
799 |
800 | def get_clusters_mc_c_worker(df_cells, df_input, cluster_col):
801 | """reduce gene*cell or bin*cell matrix to a gene*cluster or bin*cluster matrix
802 | Arguments:
803 | - df_cells: a dataframe indexed by 'cell_name', and have '$cluster_col' as column
804 | - df_input: a dataframe with 'sample_mc', 'sample_c' ... as columns
805 | sample names are cell names
806 | """
807 | # cluster mc_c
808 | df_c = df_input.filter(regex='_c$')
809 | df_mc = df_input.filter(regex='_mc$')
810 |
811 | df_mc_c = pd.DataFrame()
812 | for label, df_sub in df_cells.groupby(cluster_col):
813 | samples = df_sub.index.values
814 | df_mc_c['{}_mc'.format(label)] = df_mc[samples+'_mc'].sum(axis=1)
815 | df_mc_c['{}_c'.format(label)] = df_c[samples+'_c'].sum(axis=1)
816 |
817 | logging.info("Output shape: {}".format(df_mc_c.shape))
818 | return df_mc_c
819 |
820 | def rank_array(array):
821 | """Return ranking of each element of an array
822 | """
823 | array = np.array(array)
824 | temp = array.argsort()
825 | ranks = np.empty_like(temp)
826 | ranks[temp] = np.arange(len(array))
827 | return ranks
828 |
829 | # added 4/5/2019
830 | def rank_rows(matrix):
831 | """Return rankings of each rwo in a 2d array
832 | """
833 | matrix = np.array(matrix)
834 | return np.apply_along_axis(rank_array, 1, matrix) # row = 1
835 |
836 | def spearman_corrcoef(X, Y):
837 | """return spearman correlation matrix for each pair of rows of X and Y
838 | """
839 | return np.corrcoef(rank_rows(X), rank_rows(Y))
840 |
841 | def spearmanr_paired_rows(X, Y):
842 | from scipy import stats
843 |
844 | X = np.array(X)
845 | Y = np.array(Y)
846 | corrs = []
847 | ps = []
848 | for x, y in zip(X, Y):
849 |         r, p = stats.spearmanr(x, y)
850 |         corrs.append(r); ps.append(p)  # p-values were silently dropped before
851 |     return np.array(corrs), np.array(ps)
852 |
853 | def get_index_from_array(arr, inqs, na_rep=-1):
854 | """Get index of array
855 | """
856 | arr = np.array(arr)
857 | arr = pd.Series(arr).reset_index().set_index(0)
858 | idxs = arr.reindex(inqs)['index'].fillna(na_rep).astype(int).values
859 | return idxs
860 |
861 | def get_genomic_distance(sa, ea, sb, eb):
862 | """Get genomic distance
863 | """
864 | assert sa < ea and sb < eb
865 | if sa > sb:
866 | sa, sb = sb, sa
867 | ea, eb = eb, ea
868 |
869 | # sa <= sb
870 | distance = max(0, sb - ea)
871 |
872 | return distance
873 |
874 | def get_reverse_comp(string):
875 | """Get reverse compliment of a string
876 | """
877 | comp_dict = {
878 | 'A': 'T',
879 | 'T': 'A',
880 | 'G': 'C',
881 | 'C': 'G',
882 | 'N': 'N',
883 | }
884 | for char in set(string):
885 | if char not in ['A', 'C', 'G', 'T', 'N']:
886 |             raise ValueError('Disallowed character in string: {}'.format(char))
887 |
888 | new_string = ''.join([comp_dict[char] for char in string[::-1]])
889 | return new_string
890 |
891 | def save_gc_matrix(gc_matrix, f_gene, f_cell, f_mat):
892 | """
893 | """
894 | sparse.save_npz(f_mat, gc_matrix.data)
895 | with open(f_gene, 'w') as f:
896 | f.write('\n'.join(gc_matrix.gene)+'\n')
897 | with open(f_cell, 'w') as f:
898 | f.write('\n'.join(gc_matrix.cell)+'\n')
899 |
900 | def save_gc_matrix_methylation(gc_matrix, f_gene, f_cell, f_mat_mc, f_mat_c):
901 | """
902 | """
903 | sparse.save_npz(f_mat_mc, gc_matrix.data['mc'])
904 | sparse.save_npz(f_mat_c, gc_matrix.data['c'])
905 | with open(f_gene, 'w') as f:
906 | f.write('\n'.join(gc_matrix.gene)+'\n')
907 | with open(f_cell, 'w') as f:
908 | f.write('\n'.join(gc_matrix.cell)+'\n')
909 |
910 | def import_single_textcol(fname, header=None, col=0):
911 | return pd.read_csv(fname, header=header, sep='\t')[col].values
912 |
913 | def export_single_textcol(fname, array):
914 | with open(fname, 'w') as f:
915 | f.write('\n'.join(array)+'\n')
916 |
917 | def load_gc_matrix(f_gene, f_cell, f_mat):
918 | """
919 | """
920 | gene = import_single_textcol(f_gene)
921 | cell = import_single_textcol(f_cell)
922 | mat = sparse.load_npz(f_mat)
923 | assert (len(gene), len(cell)) == mat.shape
924 | return GC_matrix(gene, cell, mat)
925 |
926 | def load_gc_matrix_methylation(f_gene, f_cell, f_mat_mc, f_mat_c):
927 | """
928 | """
929 | _gene = import_single_textcol(f_gene)
930 | _cell = import_single_textcol(f_cell)
931 | _mat_mc = sparse.load_npz(f_mat_mc)
932 | _mat_c = sparse.load_npz(f_mat_c)
933 | gxc_raw = GC_matrix(_gene, _cell,
934 | {'c': _mat_c, 'mc': _mat_mc})
935 | return gxc_raw
936 |
937 | def nondup_legends(ax='', **kwargs):
938 | """Assuming plt (matplotlib.pyplot) is imported
939 | """
940 | from collections import OrderedDict
941 | import matplotlib.pyplot as plt
942 |
943 | if ax == '':
944 | handles, labels = plt.gca().get_legend_handles_labels()
945 | by_label = OrderedDict(zip(labels, handles))
946 | plt.legend(by_label.values(), by_label.keys(), **kwargs)
947 | else:
948 | handles, labels = ax.get_legend_handles_labels()
949 | by_label = OrderedDict(zip(labels, handles))
950 | ax.legend(by_label.values(), by_label.keys(), **kwargs)
951 | return
952 |
953 | def dedup_array_elements(x, empty_string=''):
954 | """Replacing repeats with empty_string
955 | """
956 | newx = np.empty_like(x)
957 | newx[0] = x[0]
958 | for i in range(1, len(x)):
959 | if x[i-1] == x[i]:
960 | newx[i] = empty_string
961 | else:
962 | newx[i] = x[i]
963 | return newx
964 |
965 | def vcorrcoef(X,Y):
966 | """Compute correlation coef for each rows of X and Y
967 | """
968 | assert X.shape == Y.shape
969 | Xm = np.mean(X,axis=1).reshape(-1,1)
970 | Ym = np.mean(Y,axis=1).reshape(-1,1)
971 | Xm = X-Xm
972 | Ym = Y-Ym
973 |
974 | r_num = np.sum(Xm*Ym,axis=1)
975 | r_den = np.sqrt(np.sum(Xm**2,axis=1)*np.sum(Ym**2, axis=1))
976 | r = r_num/r_den
977 | return r
978 |
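# A minimal sketch (toy data): vcorrcoef returns one Pearson r per row pair,
# i.e. the same values as calling np.corrcoef on each (X[i], Y[i]) separately.
def _example_vcorrcoef():
    X = np.array([[1.0, 2.0, 3.0], [1.0, 0.0, 1.0]])
    Y = np.array([[2.0, 4.0, 6.0], [1.0, 2.0, 1.0]])
    return vcorrcoef(X, Y)  # -> approximately [1.0, -1.0]
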
979 | def zscore(x, offset=1e-7, ddof=1):
980 | return (x - np.mean(x))/(np.std(x, ddof=ddof) + offset)
981 |
982 |
983 | def clst_umap_pipe_lite(pcs, cells_all,
984 | resolution=1,
985 | npc=50,
986 | k=30,
987 | verbose=False, seed=0, cluster_only=False,
988 | ):
989 | # clustering
990 |     import clst_utils  # clustering/embedding utilities in this repo
991 |     # (the CEMBA_clst_utils / CEMBA_run_tsne names referenced stale modules)
992 | 
993 |     df_clst = clst_utils.clustering_routine(
994 | pcs,
995 | cells_all, k,
996 | verbose=verbose,
997 | resolution=resolution,
998 | seed=seed,
999 | metric='euclidean', option='plain', n_trees=10, search_k=-1)
1000 |
1001 | # umap
1002 | if not cluster_only:
1003 |         df_tsne = clst_utils.run_umap_lite(
1004 | pcs,
1005 | cells_all,
1006 | verbose=verbose,
1007 | n_neighbors=30, min_dist=0.5, n_dim=2,
1008 | random_state=1)
1009 |
1010 | df_summary = df_clst.join(df_tsne)
1011 | return df_summary
1012 | else:
1013 | return df_clst
1014 |
1015 | def gen_cdf(array, ax, x_range=[], n_points=1000, show=True, flip=False, **kwargs):
1016 | """
1017 | """
1018 | x = np.sort(array)
1019 | y = np.arange(len(array))/len(array)
1020 | if flip:
1021 | # x = x[::-1]
1022 | y = 1 - y
1023 |
1024 | if not x_range:
1025 | if show:
1026 | ax.plot(x, y, **kwargs)
1027 | return x, y
1028 | else:
1029 | start, end = x_range
1030 | xbins = np.linspace(start, end, n_points)
1031 | ybins = np.interp(xbins, x, y)
1032 | if show:
1033 | ax.plot(xbins, ybins, **kwargs)
1034 | return xbins, ybins
1035 |
1036 | def savefig(fig, path):
1037 | """
1038 | """
1039 | fig.savefig(path, bbox_inches='tight', dpi=300)
1040 | return
1041 |
--------------------------------------------------------------------------------
/scripts/cli_parser.py:
--------------------------------------------------------------------------------
1 | """Command line interface is defined here.
2 | """
3 | DESCRIPTION_preproc="""
4 | SingleCellFusion is a computational tool to integrate single-cell transcriptome and epigenome datasets.
5 | This is the CLI for its preprocessing module
6 | (from count matrices to normalized HVG feature matrices).
7 | """
8 |
9 | DESCRIPTION="""
10 | SingleCellFusion is a computational tool to integrate single-cell transcriptome and epigenome datasets.
11 | """
12 |
13 | EPILOG="""
14 | Contributors: Fangming Xie, Aditya Chandrasekar, Wayne I. Doyle, Ethan J. Armand, Eran Mukamel.
15 | Contact: Eran Mukamel (emukamel@ucsd.edu).
16 | """
17 |
18 | import argparse
19 | import os
20 |
21 | def create_parser_preproc():
22 | """
23 | """
24 | parser = argparse.ArgumentParser(
25 | prog="SingleCellFusion_pre",
26 | description=DESCRIPTION_preproc,
27 | epilog=EPILOG,
28 | formatter_class=argparse.ArgumentDefaultsHelpFormatter,
29 | )
30 |
31 | required = parser.add_argument_group('required')
32 | optional = parser.add_argument_group('optional')
33 |
34 | # Input/Output Dataset Settings
35 | required.add_argument(
36 | "-i", "--input_datasets",
37 | type=str,
38 | nargs="+",
39 | required=True,
40 | help='''(list of str)
41 | Paths to .h5ad files, each containing a cell-by-gene feature matrix,
42 |             cell IDs and gene IDs. Cell IDs should be unique within each .h5ad file.
43 |             Gene IDs should be shared or partially shared across files.
44 |             Multiple inputs should be listed as a space-separated list of filenames.
45 | '''
46 | )
47 | optional.add_argument(
48 | "-icov", "--input_datasets_coverage",
49 | type=str,
50 | nargs="+",
51 | help='''(list of str)
52 | Paths to .h5ad files, each containing a cell-by-gene feature matrix,
53 |             cell IDs and gene IDs. Cell IDs should be unique within each .h5ad file.
54 |             Gene IDs should be shared or partially shared across files.
55 |             Multiple inputs should be listed as a space-separated list of filenames.
56 | 
57 |             Required for "mc" datasets. Should follow the order of -i.
58 | '''
59 | )
60 | required.add_argument(
61 | "-inorm", "--input_normalizations",
62 | type=str,
63 | nargs="+",
64 | required=True,
65 | help='''(list of str)
66 |             Normalization options chosen from 'mc', 'cpm', or 'tpm'. This should be
67 | listed in the same order as input_datasets
68 | ''',
69 | )
70 | optional.add_argument(
71 | "-ci", "--cellid_column",
72 | type=str,
73 | default="",
74 | help='''(str)
75 | Cell id column - column in AnnData.obs that represents cell id.
76 | This needs to be unique within and across datasets.
77 | Empty string means the column is the index of AnnData.obs.
78 | '''
79 | )
80 | optional.add_argument(
81 | "-gi", "--geneid_column",
82 | type=str,
83 | default="",
84 | help='''(str)
85 |             Gene id column - column in AnnData.var that represents gene id.
86 |             This needs to be unique within each dataset and shared across datasets.
87 | Empty string means the column is the index of AnnData.var.
88 | '''
89 | )
90 | optional.add_argument(
91 | "-gmmc", "--global_mean_mc_column",
92 | type=str,
93 | default="",
94 | help='''(str)
95 |             Global mean mc column - column in AnnData.obs that represents the global mean methylation level.
96 |             If empty, it is estimated from the input matrix.
97 | '''
98 | )
99 | optional.add_argument(
100 | "-sp", "--tosparse",
101 | action='store_true',
102 |         help='''(bool)
103 |             Include this flag to convert the input matrices into scipy sparse format.
104 |             '''
105 | )
106 | optional.add_argument(
107 | "-o", "--output_dir",
108 | type=str,
109 | default="./preprocessed",
110 | help='''(str)
111 | Directory to store output files
112 | '''
113 | )
114 | optional.add_argument(
115 | "-op", "--output_prefix",
116 | metavar="OUT_PREFIX",
117 | type=str,
118 | default="SingleCellFusion",
119 | help='''(str)
120 | The output files will contain this prefix
121 | '''
122 | )
123 | optional.add_argument(
124 | "-ga", "--gene_annotation_file",
125 | type=str,
126 | default="",
127 | help='''(str)
128 | Gene annotation file (bed format: chr, start, end, gene_id/gene_name/any identifier)
129 |             Required if 'tpm' is chosen as the normalization option.
130 |             The fourth column is used to identify individual genes.
131 | '''
132 | )
133 | optional.add_argument(
134 | "-subn", "--sub_n",
135 | type=int,
136 | default=None,
137 | help='''(int)
138 | Subsampling this number of cells for each input dataset
139 | '''
140 | )
141 | optional.add_argument(
142 | "-subf", "--sub_frac",
143 | type=float,
144 | default=None,
145 | help='''(float)
146 | Subsampling this fraction (0~1) of cells for each input dataset
147 | '''
148 | )
149 | return parser
150 |
151 | def create_parser():
152 | """
153 | """
154 | parser = argparse.ArgumentParser(
155 | prog="SingleCellFusion",
156 | description=DESCRIPTION,
157 | epilog=EPILOG,
158 | formatter_class=argparse.ArgumentDefaultsHelpFormatter,
159 | )
160 |
161 | required = parser.add_argument_group('required')
162 | optional = parser.add_argument_group('optional')
163 | advanced = parser.add_argument_group('advanced')
164 |
165 | ## ARGUMENTS DIRECTLY FED INTO SingleCellFusion CLI
166 | # Input/Output Dataset Settings
167 | required.add_argument(
168 | "-i", "--input_datasets",
169 | metavar="xx.h5ad",
170 | type=str,
171 | nargs="+",
172 | required=True,
173 | help='''(list of str)
174 | Paths to .h5ad files, each containing a cell-by-gene feature matrix,
175 |             cell IDs and gene IDs. Cell IDs should be unique within each .h5ad file.
176 |             Gene IDs should be shared or partially shared across files.
177 |             Multiple inputs should be listed as a space-separated list of filenames.
178 | '''
179 | )
180 | required.add_argument(
181 | "-im", "--input_modalities",
182 | metavar="rna/atac/mc",
183 | type=str,
184 | nargs="+",
185 | required=True,
186 | help='''(list of str)
187 | Data modalities chosen from 'rna', 'atac', or 'mc'. This should be
188 | listed in the same order as input_datasets.
189 | '''
190 | )
191 | # may need this in the future
192 | # parser.add_argument(
193 | # "-im", "--input_meta",
194 | # type=str,
195 | # required=True,
196 | # help="(list of str) Input metadata csv file",
197 | # )
198 |
199 | required.add_argument(
200 | "-f", "--feature_datasets",
201 | metavar="xx.h5ad",
202 | type=str,
203 | nargs="+",
204 | required=True,
205 | help='''(list of str)
206 | Dataset(s) whose features all other datasets will impute into.
207 | This should be a subset of --input_datasets.
208 | Enter multiple datasets as a space-separated list of filenames.
209 | The features of these datasets will
210 |             be the features kept in the output imputed data table.
211 | '''
212 | )
213 | optional.add_argument(
214 | "-o", "--output_dir",
215 | metavar="DIR",
216 | type=str,
217 | default="./results",
218 | help='''(str)
219 | Directory to store output files
220 | '''
221 | )
222 | optional.add_argument(
223 | "-op", "--output_prefix",
224 | type=str,
225 | default="SingleCellFusion",
226 | help='''(str)
227 | The output files will contain this prefix.
228 | '''
229 | )
230 |
231 | # constraint kNN across modalities
232 | optional.add_argument(
233 | "--nearest_neighbors",
234 | type=int,
235 | default=20,
236 | help='''(integer)
237 | Number of nearest neighbors used to impute data
238 | '''
239 | )
240 | optional.add_argument(
241 | "--relaxation",
242 | type=float,
243 | default=3,
244 | help='''(float)
245 |             A value between 1 and infinity.
246 |             This parameter constrains the number of neighbors a cell is allowed to receive.
247 |             Assume dataset 1 has N1 cells and dataset 2 has N2 cells. Finding k neighbors in dataset 2 for
248 |             every cell in dataset 1 means that, on average, each cell in dataset 2 receives (k*N1/N2) connections.
249 |             However, not all cells in dataset 2 get the same number of connections. We therefore set an
250 |             upper bound on the number of connections a cell in dataset 2 can receive:
251 |                 (k*N1/N2)*relaxation
252 |             where relaxation >= 1. relaxation=1 enforces a hard limit that every cell receives
253 |             the same number of nearest neighbors, while relaxation=infinity approaches traditional kNN.
254 | '''
255 | )
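    # A worked example for --relaxation (hypothetical numbers): with k=20,
    # N1=10000 cells in dataset 1 and N2=5000 cells in dataset 2, each
    # dataset-2 cell receives 20*10000/5000 = 40 connections on average;
    # relaxation=3 caps any single dataset-2 cell at 40*3 = 120 connections.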
256 | optional.add_argument(
257 | "--precomputed_pca_file",
258 | type=str,
259 | default='',
260 | help='''(str)
261 |             Precomputed PCA matrix (tab-separated table; plain text or gzipped)
262 |             with the first row as the header, and the first column as the cell_id.
263 |             Each following row is one cell, and the columns are PCs.
264 | 
265 |             Providing this file bypasses SingleCellFusion integration,
266 |             and runs clustering and UMAP directly on this matrix instead.
267 | '''
268 | )
269 | optional.add_argument(
270 | "--use_netUMAP",
271 | action='store_true',
272 | help='''(bool)
273 | Include this argument to use Net-UMAP from Pegasus (Li et al. 2020)
274 | Net-UMAP is an approximate but fast algorithm for UMAP.
275 | It runs traditional UMAP on a subset of cells,
276 |             then uses a deep neural network to learn the embedding for all cells.
277 | The package pegasus is required.
278 | '''
279 | )
280 | optional.add_argument(
281 | "--use_tsne",
282 | action='store_true',
283 | help='''(bool)
284 | Include this argument to use tSNE instead of UMAP
285 | '''
286 | )
287 |
288 | # within modality smoothing
289 | advanced.add_argument(
290 | "--num_pcs",
291 | type=int,
292 | default=50,
293 | help='''(integer)
294 | Number of Principal Components to keep for each dataset
295 | for smoothing and for clustering/embedding after imputation.
296 | '''
297 | )
298 | advanced.add_argument(
299 | "--smoothing_fractions",
300 | nargs="+",
301 | type=float,
302 | default=[0.7, 0.1, 0.9],
303 | help='''(list of floats)
304 | A list of three values between 0 to 1 that controls the relative contribution
305 | from the cell itself vs. its neighbors in within-dataset smoothing,
306 | specified for 'rna', 'atac', 'mc' data, respectively.
307 | '''
308 | )
309 |
310 | # Arguments for Clustering
311 | advanced.add_argument(
312 | "--leiden_n_neighbors",
313 | type=int,
314 | default=30,
315 | help='''(integer)
316 |             Number of nearest neighbors used to build the kNN graph in the integrated space;
317 |             the resulting nearest-neighbor graph is used for Leiden clustering.
318 | It is passed into the python package leidenalg.
319 | '''
320 | )
321 | advanced.add_argument(
322 | "--leiden_resolutions",
323 |         nargs="+", type=float,  # type=list would split a string into characters
324 | default=[0.1, 0.2, 0.4, 0.8],
325 | help='''(list of floats)
326 | A list of resolutions to be used for Leiden Clustering.
327 | It is passed into the python package leidenalg.
328 | '''
329 | )
330 |
331 | # Arguments for UMAP
332 | advanced.add_argument(
333 | "--umap_n_neighbors",
334 | type=int,
335 | default=60,
336 | help='''(integer)
337 | Number of neighbors for UMAP. It is passed into the python package umap.UMAP(n_neighbors).
338 | '''
339 | )
340 | advanced.add_argument(
341 | "--umap_min_dist",
342 | type=float,
343 | default=0.5,
344 | help='''(float)
345 | Minimum distance for UMAP. It is passed into the python package umap.UMAP(min_dist).
346 | '''
347 | )
348 | return parser
349 |
350 | def parse_filename(data_file):
351 | """turn a xxx/xxx/XXXX.h5ad into XXXX
352 | """
353 | dataset_name = os.path.basename(data_file)
354 | if dataset_name.endswith('.h5ad'):
355 | dataset_name = dataset_name[:-len('.h5ad')]
356 | else:
357 |         raise ValueError("filename does not have the format xxxx.h5ad")
358 | return dataset_name
359 |
360 | def modality_default_options(mod):
361 | """
362 | """
363 | if mod == 'mc':
364 | mod_direction = -1
365 | # norm_option = 'mc'
366 | elif mod == 'rna':
367 | mod_direction = 1
368 | # norm_option = 'cpm'
369 | elif mod == 'atac':
370 | mod_direction = 1
371 | # norm_option = 'tpm'
372 | else:
373 | raise ValueError("choose from ['mc', 'rna', 'atac']")
374 | return mod_direction
375 |
--------------------------------------------------------------------------------
/scripts/clst_utils.py:
--------------------------------------------------------------------------------
1 | """Utility functions for clusterings and embeddings
2 | """
3 |
4 | from __init__ import *
5 | # from sklearn.decomposition import PCA
6 | import igraph as ig
7 | from scipy import sparse
8 | from annoy import AnnoyIndex
9 | from umap import UMAP
10 | import leidenalg
11 |
12 | from basic_utils import create_logger
13 |
14 | # major change in annoy functions 5/7/2019
15 | def build_knn_map(X, metric='euclidean', n_trees=10, verbose=True):
16 | """X is expected to have low feature dimensions (n_obs, n_features) with (n_features <= 50)
17 |
18 | return:
19 | t: annoy knn object, can be used in the following ways
20 | t.get_nns_by_vector
21 | t.get_nns_by_item
22 | """
23 | ti = time.time()
24 |
25 | n_obs, n_f = X.shape
26 | t = AnnoyIndex(n_f, metric=metric) # Length of item vector that will be indexed
27 | for i, X_row in enumerate(X):
28 | t.add_item(i, X_row)
29 | t.build(n_trees) # 10 trees
30 | if verbose:
31 | print("Time used to build kNN map {}".format(time.time()-ti))
32 | return t
33 |
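# A minimal usage sketch (random toy data, hypothetical parameters), using only
# the annoy calls named in the docstring above.
def _example_build_knn_map():
    X = np.random.RandomState(0).rand(100, 10)     # (n_obs, n_features)
    t = build_knn_map(X, metric='euclidean', n_trees=10, verbose=False)
    nbrs_of_item0 = t.get_nns_by_item(0, 5)        # 5 nearest items to item 0
    nbrs_of_vector = t.get_nns_by_vector(X[0], 5)  # 5 nearest items to a vector
    return nbrs_of_item0, nbrs_of_vector
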
34 | def get_knn_by_items(t, k,
35 | form='list',
36 | search_k=-1,
37 | include_distances=False,
38 | verbose=True,
39 | ):
40 | """Get kNN for each item in the knn map t
41 | """
42 | ti = time.time()
43 | # set up
44 | n_obs = t.get_n_items()
45 | n_f = t.f
46 | if k > n_obs:
47 | print("Actual k: {}->{} due to low n_obs".format(k, n_obs))
48 | k = n_obs
49 |
50 | knn = [0]*(n_obs)
51 | knn_dist = [0]*(n_obs)
52 | # this block of code can be optimized
53 | if include_distances:
54 | for i in range(n_obs):
55 | res = t.get_nns_by_item(i, k, search_k=search_k, include_distances=include_distances)
56 | knn[i] = res[0]
57 | knn_dist[i] = res[1]
58 | else:
59 | for i in range(n_obs):
60 | res = t.get_nns_by_item(i, k, search_k=search_k, include_distances=include_distances)
61 | knn[i] = res
62 |
63 | knn = np.array(knn)
64 | knn_dist = np.array(knn_dist)
65 |
66 | if verbose:
67 | print("Time used to get kNN {}".format(time.time()-ti))
68 |
69 | if form == 'adj':
70 | # row col 1/dist
71 | row_inds = np.repeat(np.arange(n_obs), k)
72 | col_inds = np.ravel(knn)
73 | if include_distances:
74 | data = np.ravel(knn_dist)
75 | else:
76 | data = [1]*len(row_inds)
77 | knn_dist_mat = sparse.coo_matrix((data, (row_inds, col_inds)), shape=(n_obs, n_obs))
78 | return knn_dist_mat
79 | elif form == 'list': #
80 | if include_distances:
81 | return knn, knn_dist
82 | else:
83 | return knn
84 | else:
85 | raise ValueError("Choose from 'adj' and 'list'")
86 |
87 | def get_knn_by_vectors(t, X, k,
88 | form='list',
89 | search_k=-1,
90 | include_distances=False,
91 | verbose=True,
92 | ):
93 | """Get kNN for each row vector of X
94 | """
95 | ti = time.time()
96 | # set up
97 | n_obs = t.get_n_items()
98 | n_f = t.f
99 | n_obs_test, n_f_test = X.shape
100 | assert n_f_test == n_f
101 |
102 | if k > n_obs:
103 | print("Actual k: {}->{} due to low n_obs".format(k, n_obs))
104 | k = n_obs
105 |
106 | knn = [0]*(n_obs_test)
107 | knn_dist = [0]*(n_obs_test)
108 | if include_distances:
109 | for i, vector in enumerate(X):
110 | res = t.get_nns_by_vector(vector, k, search_k=search_k, include_distances=include_distances)
111 | knn[i] = res[0]
112 | knn_dist[i] = res[1]
113 | else:
114 | for i, vector in enumerate(X):
115 | res = t.get_nns_by_vector(vector, k, search_k=search_k, include_distances=include_distances)
116 | knn[i] = res
117 |
118 | knn = np.array(knn)
119 | knn_dist = np.array(knn_dist)
120 |
121 | if verbose:
122 | print("Time used to get kNN {}".format(time.time()-ti))
123 |
124 | if form == 'adj':
125 | # row col 1/dist
126 | row_inds = np.repeat(np.arange(n_obs_test), k)
127 | col_inds = np.ravel(knn)
128 | if include_distances:
129 | data = np.ravel(knn_dist)
130 | else:
131 | data = [1]*len(row_inds)
132 | knn_dist_mat = sparse.coo_matrix((data, (row_inds, col_inds)), shape=(n_obs_test, n_obs))
133 | return knn_dist_mat
134 | elif form == 'list': #
135 | if include_distances:
136 | return knn, knn_dist
137 | else:
138 | return knn
139 | else:
140 | raise ValueError("Choose from 'adj' and 'list'")
141 |
142 | def gen_knn_annoy(X, k, form='list',
143 | metric='euclidean', n_trees=10, search_k=-1, verbose=True,
144 | include_distances=False,
145 | ):
146 | """X is expected to have low feature dimensions (n_obs, n_features) with (n_features <= 50)
147 | """
148 | ti = time.time()
149 |
150 | n_obs, n_f = X.shape
151 | t = build_knn_map(X, metric=metric, n_trees=n_trees, verbose=verbose)
152 |
153 | return get_knn_by_items(t, k,
154 | form=form,
155 | search_k=search_k,
156 | include_distances=include_distances,
157 | verbose=verbose,
158 | )
159 |
160 | def gen_knn_annoy_train_test(X_train, X_test, k,
161 | form='list',
162 | metric='euclidean', n_trees=10, search_k=-1, verbose=True,
163 | include_distances=False,
164 | ):
165 | """X is expected to have low feature dimensions (n_obs, n_features) with (n_features <= 50)
166 | For each row in X_test, find k nearest neighbors in X_train
167 | """
168 | ti = time.time()
169 |
170 | n_obs, n_f = X_train.shape
171 | n_obs_test, n_f_test = X_test.shape
172 | assert n_f == n_f_test
173 |
174 | t = build_knn_map(X_train, metric=metric, n_trees=n_trees, verbose=verbose)
175 | return get_knn_by_vectors(t, X_test, k,
176 | form=form,
177 | search_k=search_k,
178 | include_distances=include_distances,
179 | verbose=verbose,
180 | )
181 |
182 | def compute_jaccard_weights_from_knn(X):
183 | """compute jaccard index on a knn graph
184 | Arguments:
185 |         X (unweighted) kNN adjacency matrix (each row Xi* gives the kNNs of cell i)
186 | X has to be 0-1 valued
187 | k (number of nearest neighbors)
188 |
189 | output: numpy matrix Y
190 | """
191 | X = sparse.csr_matrix(X)
192 | ni, nj = X.shape
193 | assert ni == nj
194 |
195 | k = X[0, :].sum() # number of neighbors
196 |
197 | Y = X.dot(X.T)
198 | # Y = X.multiply(tmp/(2*k - tmp.todense()))
199 | Y.data = Y.data/(2*k - Y.data)
200 |
201 | return Y
202 |
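# A small worked check (hypothetical): if cells i and j share c of their k
# neighbors, then (X.dot(X.T))[i, j] = c, and the Jaccard weight is
# |intersection| / |union| = c/(2k - c), matching Y.data/(2*k - Y.data) above.
def _example_jaccard_weights():
    X = sparse.csr_matrix(np.array([[0, 1, 1, 0],
                                    [0, 0, 1, 1],
                                    [1, 0, 0, 1],
                                    [1, 1, 0, 0]]))  # k=2 neighbors per row
    return compute_jaccard_weights_from_knn(X)  # e.g. weight(0,1) = 1/(4-1)
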
203 | def adjacency_to_igraph(adj_mtx, weighted=False):
204 | """
205 | Converts an adjacency matrix to an igraph object
206 |
207 | Args:
208 | adj_mtx (sparse matrix): Adjacency matrix
209 |         weighted (bool): If True, carry adj_mtx values over as edge weights
210 |
211 | Returns:
212 | G (igraph object): igraph object of adjacency matrix
213 |
214 | Uses code from:
215 | https://github.com/igraph/python-igraph/issues/168
216 | https://stackoverflow.com/questions/29655111
217 |
218 | Author:
219 | Wayne Doyle
220 | (Fangming Xie modified)
221 | """
222 | nrow, ncol = adj_mtx.shape
223 | if nrow != ncol:
224 | raise ValueError('Adjacency matrix should be a square matrix')
225 | vcount = nrow
226 | sources, targets = adj_mtx.nonzero()
227 | edgelist = list(zip(sources.tolist(), targets.tolist()))
228 | G = ig.Graph(n=vcount, edges=edgelist, directed=True)
229 | if weighted:
230 | G.es['weight'] = adj_mtx.data
231 | return G
232 |
233 | def leiden_lite(g, cell_list, resolution=1, weighted=False, verbose=True, num_starts=None, seed=1):
234 | """ Code from Ethan Armand and Wayne Doyle, ./mukamel_lab/mop
235 | slightly modified by Fangming Xie 05/13/2019
236 | """
237 |
238 | ti = time.time()
239 |
240 | if num_starts is not None:
241 | np.random.seed(seed)
242 | partitions = []
243 | quality = []
244 | seeds = np.random.randint(10*num_starts, size=num_starts)
245 | for seed in seeds:
246 | if weighted:
247 | temp_partition = leidenalg.find_partition(g,
248 | leidenalg.RBConfigurationVertexPartition,
249 | weights=g.es['weight'],
250 | resolution_parameter=resolution,
251 | seed=seed,
252 | )
253 | else:
254 | temp_partition = leidenalg.find_partition(g,
255 | leidenalg.RBConfigurationVertexPartition,
256 | resolution_parameter=resolution,
257 | seed=seed,
258 | )
259 | quality.append(temp_partition.quality())
260 | partitions.append(temp_partition)
261 | partition1 = partitions[np.argmax(quality)]
262 | else:
263 | if weighted:
264 | partition1 = leidenalg.find_partition(g,
265 | leidenalg.RBConfigurationVertexPartition,
266 | weights=g.es['weight'],
267 | resolution_parameter=resolution,
268 | seed=seed,
269 | )
270 | else:
271 | partition1 = leidenalg.find_partition(g,
272 | leidenalg.RBConfigurationVertexPartition,
273 | resolution_parameter=resolution,
274 | seed=seed,
275 | )
276 |
277 | # get cluster labels from partition1
278 | labels = [0]*(len(cell_list))
279 | for i, cluster in enumerate(partition1):
280 | for element in cluster:
281 | labels[element] = i+1
282 |
283 | df_res = pd.DataFrame(index=cell_list)
284 | df_res['cluster'] = labels
285 | df_res = df_res.rename_axis('sample', inplace=False)
286 |
287 | if verbose:
288 | print("Time spent on leiden clustering: {}".format(time.time()-ti))
289 |
290 | return df_res
291 |
292 | def clustering_routine(X, cell_list, k,
293 | seed=1, verbose=True,
294 | resolution=1, metric='euclidean', option='plain', n_trees=10, search_k=-1, num_starts=None):
295 | """
296 | X is a (n_obs, n_feature) matrix, n_feature <=50 is recommended
297 | option: {'plain', 'jaccard', ...}
298 | """
299 | assert len(cell_list) == len(X)
300 |
301 | if option == 'plain':
302 | g_knn = gen_knn_annoy(X, k, form='adj', metric=metric,
303 | n_trees=n_trees, search_k=search_k, verbose=verbose)
304 | G = adjacency_to_igraph(g_knn, weighted=False)
305 | df_res = leiden_lite(G, cell_list, resolution=resolution, seed=seed,
306 | weighted=False, verbose=verbose, num_starts=num_starts)
307 |
308 | elif option == 'jaccard':
309 | g_knn = gen_knn_annoy(X, k, form='adj', metric=metric,
310 | n_trees=n_trees, search_k=search_k, verbose=verbose)
311 | gw_knn = compute_jaccard_weights_from_knn(g_knn)
312 | G = adjacency_to_igraph(gw_knn, weighted=True)
313 | df_res = leiden_lite(G, cell_list, resolution=resolution, seed=seed,
314 | weighted=True, verbose=verbose, num_starts=num_starts)
315 | else:
316 | raise ValueError('Choose from "plain" and "jaccard"')
317 |
318 | return df_res
319 |
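# A minimal usage sketch (random toy data, hypothetical parameters): cluster 200
# cells from a 10-dim embedding and get back a per-cell 'cluster' DataFrame.
def _example_clustering_routine():
    X = np.random.RandomState(0).rand(200, 10)  # e.g. top-10 PCs
    cells = np.array(['cell{}'.format(i) for i in range(200)])
    return clustering_routine(X, cells, k=15, resolution=1, option='jaccard')
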
320 | def clustering_routine_multiple_resolutions(X, cell_list, k,
321 | seed=1, verbose=True,
322 | resolutions=[1], metric='euclidean', option='plain', n_trees=10, search_k=-1, num_starts=None):
323 | """
324 | X is a (n_obs, n_feature) matrix, n_feature <=50 is recommended
325 | option: {'plain', 'jaccard', ...}
326 | """
327 | assert len(cell_list) == len(X)
328 |
329 | res = []
330 | if option == 'plain':
331 | g_knn = gen_knn_annoy(X, k, form='adj', metric=metric,
332 | n_trees=n_trees, search_k=search_k, verbose=verbose)
333 | G = adjacency_to_igraph(g_knn, weighted=False)
334 | for resolution in resolutions:
335 | df_res = leiden_lite(G, cell_list, resolution=resolution, seed=seed,
336 | weighted=False, verbose=verbose, num_starts=num_starts)
337 | df_res = df_res.rename(columns={'cluster': 'cluster_r{}'.format(resolution)})
338 | res.append(df_res)
339 |
340 | elif option == 'jaccard':
341 | g_knn = gen_knn_annoy(X, k, form='adj', metric=metric,
342 | n_trees=n_trees, search_k=search_k, verbose=verbose)
343 | gw_knn = compute_jaccard_weights_from_knn(g_knn)
344 | G = adjacency_to_igraph(gw_knn, weighted=True)
345 | for resolution in resolutions:
346 | df_res = leiden_lite(G, cell_list, resolution=resolution, seed=seed,
347 | weighted=True, verbose=verbose, num_starts=num_starts)
348 | df_res = df_res.rename(columns={'cluster': 'cluster_r{}'.format(resolution)})
349 | res.append(df_res)
350 |
351 | else:
352 | raise ValueError('Choose from "plain" and "jaccard"')
353 | res = pd.concat(res, axis=1)
354 |
355 | return res
356 |
357 | def run_net_umap_pegasus(X, **kwargs):
358 | """
359 | X (m, n) -> res_umap (m, 2)
360 | """
361 | import pegasus
362 | import pegasusio
363 | # pegasus netUMAP
364 | # construct a pegasus object (a hack - pegasus='1.4.3'; pegasusio='0.3.1.post2')
365 | m, n = X.shape
366 | pgX = pegasusio.MultimodalData(pegasusio.UnimodalData(
367 | {'barcodekey': np.arange(m).astype(str)},
368 | {'featurekey': np.arange(n).astype(str)},
369 | {"X": X},
370 | ))
371 | # (a hack) select_alpha=0 is important to resolve a
372 | # sampling bug by pegasus when n is large
373 | pegasus.net_umap(pgX, rep=None, select_alpha=0, **kwargs)
374 | res_umap = pgX.obsm['X_net_umap'] # an array
375 | return res_umap
376 |
377 | def run_umap_lite(X, cell_list, n_neighbors=15, min_dist=0.1, n_dim=2,
378 | random_state=1, output_file=None, use_netUMAP=False, use_tsne=False, **kwargs):
379 | """run umap on X (n_obs, n_features)
380 | """
381 | ti = time.time()
382 |
383 | logging.info("Running UMAP: {} n_neighbors, {} min_dist , {} dim.\n\
384 | Input shape: (# observations, # features) = {}\n\
385 | Use netUMAP from pegasus: {}\n\
386 | Use tSNE: {}\n\
387 | "
388 | .format(n_neighbors, min_dist, n_dim, X.shape, use_netUMAP, use_tsne))
389 |
390 | if use_netUMAP:
391 | umap_res = run_net_umap_pegasus(X,
392 | n_components=n_dim,
393 | random_state=random_state,
394 | n_neighbors=n_neighbors,
395 | min_dist=min_dist,
396 | **kwargs)
397 | elif use_tsne:
398 | from sklearn.manifold import TSNE
399 | umap_res = TSNE(n_components=n_dim,
400 | random_state=random_state,
401 | **kwargs,
402 | ).fit_transform(X)
403 | else:
404 | umap_res = UMAP(n_components=n_dim,
405 | random_state=random_state,
406 | n_neighbors=n_neighbors,
407 | min_dist=min_dist,
408 | **kwargs).fit_transform(X)
409 |
410 |
411 | columns = ['umap_{}'.format(i+1) for i in np.arange(n_dim)]
412 | df_umap = pd.DataFrame(umap_res, columns=columns)
413 | df_umap['sample'] = cell_list
414 | df_umap = df_umap.set_index('sample')
415 |
416 | if output_file:
417 | df_umap.to_csv(output_file, sep="\t", na_rep='NA', header=True, index=True)
418 | logging.info("Saved coordinates to file. {}".format(output_file))
419 |
420 | tf = time.time()
421 | logging.info("Done. running time: {} seconds.".format(tf - ti))
422 |
423 | return df_umap
--------------------------------------------------------------------------------
/scripts/preproc_utils.py:
--------------------------------------------------------------------------------
1 | from __init__ import *
2 | import numpy as np
3 | import pandas as pd
4 | import logging
5 | from sklearn.utils.sparsefuncs import mean_variance_axis
6 | from scipy.stats import kruskal
7 |
8 | import basic_utils
9 |
10 | def select_hvg(gbc_cpm, percentile=30, n_qcut=20,):
11 | # further select highly variable genes
12 | # variance/mean
13 | mean_cpm, var_cpm = mean_variance_axis(gbc_cpm.data.tocsr(), axis=1)
14 | vmr_cpm = (var_cpm+1)/(mean_cpm+1)
15 |     # select the top `percentile`% VMR genes within each quantile bin of mean CPM (top bin excluded below)
16 | # duplicates = 'drop' 9/21/2019 Fangming
17 | _x = pd.qcut(pd.Series(mean_cpm), n_qcut, labels=False, duplicates='drop').to_frame('decile')
18 | hvgs = []
19 | for decile, _x_sub in _x.groupby('decile'):
20 | gene_group = _x_sub.index.values
21 | mean_cpm_gg = mean_cpm[gene_group]
22 | vmr_cpm_gg = vmr_cpm[gene_group]
23 | # genes with top 30% of vmr
24 | hvg_group = gene_group[vmr_cpm_gg > np.percentile(vmr_cpm_gg, 100-percentile)]
25 |
26 | if decile != n_qcut-1:
27 | hvgs.append(hvg_group)
28 | hvgs = np.hstack(hvgs)
29 | return hvgs
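
# The idea behind the binning (a reading of the code above): thresholding VMR
# within each mean-CPM bin keeps the selection mean-independent -- every
# expression stratum contributes its own top-`percentile`% most variable genes,
# and the `decile != n_qcut-1` check drops the very top bin.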
30 |
31 | def select_hvg_methylation(df_nmcc, percentile=30, n_qcut=20,):
32 | # further select highly variable genes
33 | # standard deviation
34 |
35 | stds_nmcc = df_nmcc.std(axis=1)
36 | mean_nmcc = df_nmcc.mean(axis=1)
37 |
38 |     # select the top `percentile`% highest-std genes within each quantile bin of mean NMCC
39 | # duplicates = 'drop' 9/21/2019 Fangming
40 | _x = pd.qcut(mean_nmcc, n_qcut, labels=False, duplicates='drop').to_frame('decile')
41 | hvgs = []
42 | for decile, _x_sub in _x.groupby('decile'):
43 | gene_group = _x_sub.index.values
44 |
45 | mean_nmcc_gg = mean_nmcc.loc[gene_group]
46 | stds_nmcc_gg = stds_nmcc.loc[gene_group]
47 | # logging.info(gene_group.shape, stds_nmcc_gg.shape)
48 | # genes with top 30% of stds
49 | hvg_group = gene_group[stds_nmcc_gg > np.percentile(stds_nmcc_gg, 100-percentile)]
50 | hvgs.append(hvg_group)
51 |
52 | hvgs = np.hstack(hvgs)
53 | return hvgs
54 |
55 | def filter_genes(gxc_raw, sufficient_cell_coverage=0.01):
56 | """
57 | """
58 | n_gene, n_cell = gxc_raw.data.shape
59 | gene_cov = (gxc_raw.data > 0).sum(axis=1)
60 | gene_cov = np.array(gene_cov).squeeze()/n_cell # fraction of cells covered
61 | cond = gene_cov>sufficient_cell_coverage
62 | gxc_raw_filtered = GC_matrix(np.array(gxc_raw.gene)[cond],
63 | gxc_raw.cell,
64 | gxc_raw.data.tocsr()[cond, :],
65 | )
66 | return gxc_raw_filtered
67 |
68 | def preproc_rna_cpm_based(gxc_raw, sufficient_cell_coverage=0.01,
69 | hv_percentile=30, hv_ncut=20):
70 | # select genes expressed in > 1% of cells
71 | # raw genes
72 | # _gxc_tmp, gxc_ftr, hvgs
73 | logging.info("Removing low coverage genes...")
74 | lib_size = np.ravel(gxc_raw.data.sum(axis=0)) # library size from all genes, computed before filtering
75 | _gxc_tmp = filter_genes(gxc_raw, sufficient_cell_coverage=sufficient_cell_coverage)
76 |
77 | # CPM matrix
78 | logging.info("Getting CPM..")
79 | gxc_ftr = basic_utils.sparse_logcpm(_gxc_tmp, mode='cpm', lib_size=lib_size)
80 | del _gxc_tmp
81 |
82 | # select highly variable genes
83 | logging.info("Getting highly variable genes and logCPM...")
84 | hvgs = select_hvg(gxc_ftr, percentile=hv_percentile, n_qcut=hv_ncut)
85 |
86 | gxc_hvftr = GC_matrix(
87 | gxc_ftr.gene[hvgs],
88 | gxc_ftr.cell,
89 | gxc_ftr.data.tocsr()[hvgs, :],
90 | )
91 | del gxc_ftr
92 | gxc_hvftr.data.data = np.log10(1+gxc_hvftr.data.data) # log10(1+CPM); acting on stored entries only is exact since log10(1+0)=0
93 | logging.info("Number of genes: {}".format(len(hvgs)))
94 | return gxc_hvftr
95 |
96 | def preproc_rna_cpm_based_kruskal(metadata, cluster_col, gxc_raw, sufficient_cell_coverage=0.01,
97 | hv_percentile=30):
98 | # select genes expressed in > 1% of cells
99 | # raw genes
100 | # _gxc_tmp, gxc_ftr, hvgs
101 |
102 | logging.info("Removing low coverage genes...")
103 | lib_size = np.ravel(gxc_raw.data.sum(axis=0))
104 | _gxc_tmp = filter_genes(gxc_raw, sufficient_cell_coverage=sufficient_cell_coverage)
105 |
106 | # CPM matrix
107 | logging.info("Getting CPM..")
108 | gxc_ftr = basic_utils.sparse_logcpm(_gxc_tmp, mode='logcpm', lib_size=lib_size) # logCPM for the KW test; also kept as the output features
109 | del _gxc_tmp
110 |
111 | # select highly variable genes
112 | logging.info("Getting highly variable genes and logCPM...")
113 | # select genes with KW test
114 | datasets = []
115 | for clst, df_sub in metadata.groupby(cluster_col):
116 | cell_idx = basic_utils.get_index_from_array(gxc_ftr.cell, df_sub.index.values)
117 | datasets.append(gxc_ftr.data.tocsc()[:,cell_idx].tocsr())
118 | ps = []
119 | for i, gene in enumerate(gxc_ftr.gene):
120 | if i%1000==0:
121 | logging.info("processed {} genes".format(i))
122 | gene_data = [np.ravel(np.array(dataset[i,:].todense())) for dataset in datasets]
123 | try:
124 | _, p = kruskal(*gene_data)
125 | except ValueError: # kruskal raises ValueError when, e.g., all values are identical
126 | p = 1
127 | ps.append(p)
128 | p_th = np.percentile(ps, hv_percentile)
129 | logging.info("Pvalue threshold p_th: {}".format(p_th))
130 | hvgs = np.arange(len(ps))[ps<=p_th]
131 |
132 | gxc_hvftr = GC_matrix(
133 | gxc_ftr.gene[hvgs],
134 | gxc_ftr.cell,
135 | gxc_ftr.data.tocsr()[hvgs, :],
136 | )
137 | del gxc_ftr
138 | # data is already log10(1+CPM) from mode='logcpm' above; a second log10(1+x) would double-transform
139 | logging.info("Number of genes: {}".format(len(hvgs)))
140 | return gxc_hvftr
141 |
142 | def preproc_rna_tpm_based(gxc_raw, gene_lengths,
143 | impute_gene_lengths=True,
144 | sufficient_cell_coverage=0.01,
145 | hv_percentile=30, hv_ncut=20):
146 | """Gene lengths is a gene length pandas series indexed by gene names
147 | """
148 | # gxc_raw, gxc_logtpm
149 | # _gxc_tmp, gxc_ftr, hvgs
150 |
151 | assert np.all(gxc_raw.gene == gene_lengths.index.values)
152 | if impute_gene_lengths:
153 | logging.info("Imputing gene lengths...")
154 | gene_lengths = gene_lengths.fillna(gene_lengths.mean())
155 | lib_size = np.ravel(gxc_raw.data.sum(axis=0))
156 |
157 | # select genes expressed in > 1% of cells
158 | logging.info("Removing low coverage genes...")
159 | _gxc_tmp = filter_genes(gxc_raw, sufficient_cell_coverage=sufficient_cell_coverage)
160 |
161 | # CPM matrix
162 | logging.info("Getting CPM..")
163 | gxc_ftr = basic_utils.sparse_logcpm(_gxc_tmp, mode='cpm', lib_size=lib_size)
164 | del _gxc_tmp
165 |
166 | # select highly variable genes
167 | logging.info("Getting highly variable genes and logCPM...")
168 | hvgs = select_hvg(gxc_ftr, percentile=hv_percentile, n_qcut=hv_ncut) # index in gxc_ftr
169 | hvgs_genes = gxc_ftr.gene[hvgs]
170 | del gxc_ftr
171 |
172 | # TPM matrix from gxc_raw
173 | logging.info("Getting logTPM...")
174 | gxc_logtpm = basic_utils.sparse_logtpm(gxc_raw, gene_lengths)
175 | hvgs_idx = basic_utils.get_index_from_array(gxc_logtpm.gene, hvgs_genes)
176 |
177 | # trim logTPM matrix to the selected HVGs
178 | logging.info("Trimming logTPM matrix...")
179 | gxc_hvftr = GC_matrix(
180 | gxc_logtpm.gene[hvgs_idx],
181 | gxc_logtpm.cell,
182 | gxc_logtpm.data.tocsr()[hvgs_idx, :],
183 | )
184 | logging.info("Number of genes: {}".format(len(hvgs_idx)))
185 | return gxc_hvftr
186 |
187 | def preproc_rna_tpm_based_kruskal(metadata, cluster_col, gxc_raw, gene_lengths,
188 | impute_gene_lengths=True,
189 | sufficient_cell_coverage=0.01,
190 | hv_percentile=30):
191 | """Gene lengths is a gene length pandas series indexed by gene names
192 | """
193 |
194 | assert np.all(gxc_raw.gene == gene_lengths.index.values)
195 | if impute_gene_lengths:
196 | logging.info("Imputing gene lengths...")
197 | gene_lengths = gene_lengths.fillna(gene_lengths.mean())
198 | lib_size = np.ravel(gxc_raw.data.sum(axis=0))
199 |
200 | # select genes expressed in > 1% of cells
201 | logging.info("Removing low coverage genes...")
202 | _gxc_tmp = filter_genes(gxc_raw, sufficient_cell_coverage=sufficient_cell_coverage)
203 |
204 | # logCPM matrix (the KW test is rank-based, so the log transform does not affect it)
205 | logging.info("Getting logCPM...")
206 | gxc_ftr = basic_utils.sparse_logcpm(_gxc_tmp, mode='logcpm', lib_size=lib_size)
207 | del _gxc_tmp
208 |
209 |
210 | logging.info("Getting highly variable genes...")
211 | # select genes with KW test
212 | datasets = []
213 | for clst, df_sub in metadata.groupby(cluster_col):
214 | cell_idx = basic_utils.get_index_from_array(gxc_ftr.cell, df_sub.index.values)
215 | datasets.append(gxc_ftr.data.tocsc()[:,cell_idx].tocsr())
216 | ps = []
217 | for i, gene in enumerate(gxc_ftr.gene):
218 | if i%1000==0:
219 | logging.info("processed {} genes".format(i))
220 | gene_data = [np.ravel(np.array(dataset[i,:].todense())) for dataset in datasets]
221 | try:
222 | _, p = kruskal(*gene_data)
223 | except ValueError: # kruskal raises ValueError when, e.g., all values are identical
224 | p = 1
225 | ps.append(p)
226 |
227 | p_th = np.percentile(ps, hv_percentile)
228 | logging.info("Pvalue threshold p_th: {}".format(p_th))
229 | hvgs = np.arange(len(ps))[ps<=p_th]
230 | hvgs_genes = gxc_ftr.gene[hvgs]
231 | del gxc_ftr
232 |
233 | # TPM matrix from gxc_raw
234 | logging.info("Getting logTPM...")
235 | gxc_logtpm = basic_utils.sparse_logtpm(gxc_raw, gene_lengths)
236 | hvgs_idx = basic_utils.get_index_from_array(gxc_logtpm.gene, hvgs_genes)
237 |
238 | # trim logTPM matrix to the selected HVGs
239 | logging.info("Trimming logTPM matrix...")
240 | gxc_hvftr = GC_matrix(
241 | gxc_logtpm.gene[hvgs_idx],
242 | gxc_logtpm.cell,
243 | gxc_logtpm.data.tocsr()[hvgs_idx, :],
244 | )
245 | logging.info("Number of genes: {}".format(len(hvgs_idx)))
246 | return gxc_hvftr
247 |
248 | def preproc_methylation(
249 | gxc_raw,
250 | metadata,
251 | global_value_col='mCH',
252 | base_call_cutoff=20,
253 | sufficient_coverage_fraction=0.95,
254 | hv_percentile=30,
255 | n_qcut=10,
256 | ):
257 | """
258 | """
259 | # select genes covered (20 counts) in > 95% of cells
260 | df_mc = pd.DataFrame(gxc_raw.data['mc'], index=gxc_raw.gene, columns=gxc_raw.cell)
261 | df_c = pd.DataFrame(gxc_raw.data['c'], index=gxc_raw.gene, columns=gxc_raw.cell)
262 |
263 | n_gene, n_cell = df_c.shape
264 | gene_cov = (df_c > base_call_cutoff).sum(axis=1)/n_cell # fraction of cells covered
265 | cond = gene_cov>sufficient_coverage_fraction
266 | df_mc = df_mc[cond]
267 | df_c = df_c[cond]
268 |
269 | # compute normalized methylation matrix (no need to further select genes)
270 | df_mcc = basic_utils.get_mcc_lite_v2(df_c, df_mc, base_call_cutoff=base_call_cutoff)
271 |
272 | # normalize by global mean
273 | if global_value_col:
274 | df_nmcc = df_mcc.divide(metadata.loc[df_mcc.columns.values, global_value_col], axis=1)
275 | else:
276 | global_mean = df_mc.sum(axis=0)/df_c.sum(axis=0)
277 | df_nmcc = df_mcc.divide(global_mean, axis=1)
278 |
279 | # select highly variable genes
280 | hvgs = select_hvg_methylation(df_nmcc, percentile=hv_percentile, n_qcut=n_qcut)
281 | # trim
282 | df_hvnmcc = df_nmcc.loc[hvgs]
283 |
284 | return df_hvnmcc
285 |
--------------------------------------------------------------------------------
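
A minimal usage sketch for the CPM-based RNA path above (not part of the repository). It assumes GC_matrix, pulled in via `from __init__ import *`, is the plain (gene, cell, data) container used throughout preproc_utils.py, and that scripts/ is importable:

import numpy as np
from scipy import sparse
import preproc_utils
from __init__ import GC_matrix  # assumed (gene, cell, data) container, as used above

# toy genes-by-cells count matrix
rng = np.random.RandomState(0)
n_gene, n_cell = 2000, 300
genes = np.array(['gene_{}'.format(i) for i in range(n_gene)])
cells = np.array(['cell_{}'.format(i) for i in range(n_cell)])
counts = sparse.random(n_gene, n_cell, density=0.1, format='csr',
                       data_rvs=lambda n: (rng.poisson(5, n) + 1).astype(float))
gxc_raw = GC_matrix(genes, cells, counts)

# logCPM matrix restricted to highly variable genes, as in the pipeline above
gxc_hvftr = preproc_utils.preproc_rna_cpm_based(
    gxc_raw,
    sufficient_cell_coverage=0.01,
    hv_percentile=30,
    hv_ncut=20,
)
print(len(gxc_hvftr.gene), 'HVGs selected')
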
/setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # add the scripts directory to PATH
4 | rpath=$(realpath ./scripts)
5 | echo "$rpath"
6 | export PATH="$PATH:$rpath"
--------------------------------------------------------------------------------
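
Note: `export` only modifies the environment of the process running it, so this file should be sourced from the repository root (e.g. `source setup.sh` or `. setup.sh`) rather than executed; that way scripts/ stays on PATH in the caller's shell.
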