├── .gitignore
├── LICENSE
├── README.md
├── docs
│   ├── changelog.rst
│   ├── faqs.rst
│   ├── knn.png
│   ├── mnn_direct.png
│   ├── mnn_equation.png
│   ├── mnn_rescue.png
│   ├── n_neighbors_knn.png
│   ├── rescue_equation_1.png
│   ├── rescue_equation_2.png
│   ├── rescue_equation_3.png
│   ├── results
│   │   ├── SingleCellFusion_plot_1_joint_embedding_and_clusterings_overview.png
│   │   ├── SingleCellFusion_plot_2_hist.png
│   │   ├── SingleCellFusion_plot_3_embedding_by_dataset.png
│   │   ├── SingleCellFusion_plot_4_embedding_by_individual_mod_clusterings.png
│   │   └── SingleCellFusion_plot_5_confmat.png
│   └── scf_description.rst
├── environment.yml
├── environment_mini.yml
├── environment_mini_pegasus.yml
├── example-MOp_L5ET
│   ├── datasets
│   │   ├── 10x_cells_v2.h5ad
│   │   ├── smarter_cells.h5ad
│   │   ├── smarter_nuclei.h5ad
│   │   └── snmcseq_gene.h5ad
│   ├── run_scf.sh
│   └── visualize_results.ipynb
├── example-wholebrain
│   ├── 00.test_all_preproc.sh
│   ├── visualize_results_lite_3mods.ipynb
│   └── visualize_results_lite_rna_intron_exon.ipynb
├── example-wholebrainatac
│   ├── normalize_and_select_features.ipynb
│   ├── run_preproc.sh
│   ├── run_scf.sh
│   └── visualize_results.ipynb
├── scf_description.rst
├── scripts
│   ├── SCF_utils.py
│   ├── SingleCellFusion
│   ├── SingleCellFusion_prep
│   ├── __init__.py
│   ├── basic_utils.py
│   ├── cli_parser.py
│   ├── clst_utils.py
│   └── preproc_utils.py
└── setup.sh

/.gitignore:
--------------------------------------------------------------------------------
1 | # old
2 | old/
3 | scripts/old.py
4 | example-wholebrainatac/old/
5 | example-MOp_L5ET/old/
6 | 
7 | # unused
8 | example-biccn_enhancer/
9 | example-testeran/
10 | example-MOp_L5ET-test2/
11 | example-wholebrain-test2/
12 | 
13 | # results
14 | example-MOp_L5ET/results
15 | example-wholebrainatac/results
16 | 
17 | # datasets
18 | example-wholebrainatac/datasets
19 | example-wholebrainatac/datasets_pre
20 | example-wholebrainatac/datasets_processed
21 | 
22 | example-wholebrain/datasets
23 | example-wholebrain/processed
24 | example-wholebrain/results
25 | example-wholebrain/old
26 | 
27 | # Byte-compiled / optimized / DLL files
28 | __pycache__/
29 | *.py[cod]
30 | *$py.class
31 | 
32 | 
33 | # C extensions
34 | *.so
35 | 
36 | # Distribution / packaging
37 | .Python
38 | build/
39 | develop-eggs/
40 | dist/
41 | downloads/
42 | eggs/
43 | .eggs/
44 | lib/
45 | lib64/
46 | parts/
47 | sdist/
48 | var/
49 | wheels/
50 | pip-wheel-metadata/
51 | share/python-wheels/
52 | *.egg-info/
53 | .installed.cfg
54 | *.egg
55 | MANIFEST
56 | 
57 | # PyInstaller
58 | # Usually these files are written by a python script from a template
59 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
60 | *.manifest 61 | *.spec 62 | 63 | # Installer logs 64 | pip-log.txt 65 | pip-delete-this-directory.txt 66 | 67 | # Unit test / coverage reports 68 | htmlcov/ 69 | .tox/ 70 | .nox/ 71 | .coverage 72 | .coverage.* 73 | .cache 74 | nosetests.xml 75 | coverage.xml 76 | *.cover 77 | *.py,cover 78 | .hypothesis/ 79 | .pytest_cache/ 80 | 81 | # Translations 82 | *.mo 83 | *.pot 84 | 85 | # Django stuff: 86 | *.log 87 | local_settings.py 88 | db.sqlite3 89 | db.sqlite3-journal 90 | 91 | # Flask stuff: 92 | instance/ 93 | .webassets-cache 94 | 95 | # Scrapy stuff: 96 | .scrapy 97 | 98 | # Sphinx documentation 99 | docs/_build/ 100 | 101 | # PyBuilder 102 | target/ 103 | 104 | # Jupyter Notebook 105 | .ipynb_checkpoints 106 | 107 | # IPython 108 | profile_default/ 109 | ipython_config.py 110 | 111 | # pyenv 112 | .python-version 113 | 114 | # pipenv 115 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 116 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 117 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 118 | # install all needed dependencies. 119 | #Pipfile.lock 120 | 121 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 122 | __pypackages__/ 123 | 124 | # Celery stuff 125 | celerybeat-schedule 126 | celerybeat.pid 127 | 128 | # SageMath parsed files 129 | *.sage.py 130 | 131 | # Environments 132 | .env 133 | .venv 134 | env/ 135 | venv/ 136 | ENV/ 137 | env.bak/ 138 | venv.bak/ 139 | 140 | # Spyder project settings 141 | .spyderproject 142 | .spyproject 143 | 144 | # Rope project settings 145 | .ropeproject 146 | 147 | # mkdocs documentation 148 | /site 149 | 150 | # mypy 151 | .mypy_cache/ 152 | .dmypy.json 153 | dmypy.json 154 | 155 | # Pyre type checker 156 | .pyre/ 157 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SingleCellFusion 2 | 3 | SingleCellFusion is a computational tool to integrate single-cell transcriptome and epigenome datasets. Code in this repository is used in [Luo et al., (2019) *BioRxiv*](https://www.biorxiv.org/content/10.1101/2019.12.11.873398v1) and in [Yao et al., (2020) *BioRxiv*](https://www.biorxiv.org/content/10.1101/2020.02.29.970558v2). [Here](docs/scf_description.rst) is a brief description of how SingleCellFusion works. 4 | 5 | Related publications: 6 | - Luo, C. et al. Single nucleus multi-omics links human cortical cell regulatory genome diversity to disease risk variants. bioRxiv 2019.12.11.873398 (2019) [doi:10.1101/2019.12.11.873398](https://www.biorxiv.org/content/10.1101/2019.12.11.873398v1) 7 | - Yao, Z. et al. An integrated transcriptomic and epigenomic atlas of mouse primary motor cortex cell types. bioRxiv 2020.02.29.970558 (2020) [doi:10.1101/2020.02.29.970558](https://www.biorxiv.org/content/10.1101/2020.02.29.970558v2) 8 | - BRAIN Initiative Cell Census Network (BICCN) et al. A multimodal cell census and atlas of the mammalian primary motor cortex. bioRxiv 2020.10.19.343129 (2020) [doi:10.1101/2020.10.19.343129](https://www.biorxiv.org/content/10.1101/2020.10.19.343129v1) 9 | 10 | Code contributors: [Fangming Xie](mailto:f7xie@ucsd.edu), Aditya Chandrasekar, Wayne I. Doyle, [Ethan Armand](mailto:ejarmand@ucsd.edu) 11 | 12 | Contact: [Eran Mukamel](mailto:emukamel@ucsd.edu) 13 | 14 | ## Installation 15 | Step 1: Clone this repo. 
16 | ```bash
17 | git clone https://github.com/mukamel-lab/SingleCellFusion.git
18 | cd SingleCellFusion
19 | ```
20 | 
21 | Step 2: Set up a conda environment and install the required packages. The environment should be created from a UNIX terminal. (Skip this step if not needed.)
22 | ```bash
23 | conda env create -f environment.yml # create an env named scf_terra
24 | source activate scf_terra
25 | ```
26 | 
27 | ## Usage
28 | ```
29 | usage: SingleCellFusion [-h] -i xx.h5ad [xx.h5ad ...] -im rna/atac/mc [rna/atac/mc ...]
30 |                         -f xx.h5ad [xx.h5ad ...] [-o DIR] [-op OUTPUT_PREFIX]
31 |                         [--nearest_neighbors NEAREST_NEIGHBORS] [--relaxation RELAXATION]
32 |                         [--num_pcs NUM_PCS] [--smoothing_fractions SMOOTHING_FRACTIONS]
33 |                         [--leiden_n_neighbors LEIDEN_N_NEIGHBORS] [--leiden_resolutions LEIDEN_RESOLUTIONS]
34 |                         [--umap_n_neighbors UMAP_N_NEIGHBORS] [--umap_min_dist UMAP_MIN_DIST]
35 | 
36 | SingleCellFusion is a computational tool to integrate single-cell transcriptome and epigenome datasets.
37 | 
38 | optional arguments:
39 |   -h, --help            show this help message and exit
40 | 
41 | required:
42 |   -i xx.h5ad [xx.h5ad ...], --input_datasets xx.h5ad [xx.h5ad ...]
43 |                         (list of str) Paths to .h5ad files, each containing a cell-by-gene feature matrix,
44 |                         cell IDs, and gene IDs. Cell IDs should be unique within each .h5ad file; gene IDs
45 |                         should be shared or partially shared across files. Multiple inputs should be listed
46 |                         as a space-separated list of filenames. (default: None)
47 |   -im rna/atac/mc [rna/atac/mc ...], --input_modalities rna/atac/mc [rna/atac/mc ...]
48 |                         (list of str) Data modalities chosen from 'rna', 'atac', or 'mc'. These should be
49 |                         listed in the same order as input_datasets. (default: None)
50 |   -f xx.h5ad [xx.h5ad ...], --feature_datasets xx.h5ad [xx.h5ad ...]
51 |                         (list of str) Dataset(s) whose features all other datasets will impute into. This
52 |                         should be a subset of --input_datasets. Enter multiple datasets as a space-separated
53 |                         list of filenames. The features of these datasets will be the features kept in the
54 |                         output imputed data table. (default: None)
55 | 
56 | optional:
57 |   -o DIR, --output_dir DIR
58 |                         (str) Directory to store output files (default: ./results)
59 |   -op OUTPUT_PREFIX, --output_prefix OUTPUT_PREFIX
60 |                         (str) The output files will contain this prefix. (default: SingleCellFusion)
61 |   --nearest_neighbors NEAREST_NEIGHBORS
62 |                         (integer) Number of nearest neighbors used to impute data (default: 20)
63 |   --relaxation RELAXATION
64 |                         (float) A value between 1 and infinity. This parameter constrains the
65 |                         number of neighbors a cell is allowed to receive. Assume dataset 1 has N1 cells
66 |                         and dataset 2 has N2 cells. Finding k neighbors in dataset 2 for every cell in
67 |                         dataset 1 means that, on average, each cell in dataset 2 receives (kN1/N2)
68 |                         connections. However, not all cells in dataset 2 get the same number of
69 |                         connections. We therefore set an upper bound on the number of connections a cell
70 |                         in dataset 2 can receive: (kN1/N2)*relaxation, where relaxation >= 1.
71 |                         Relaxation=1 enforces a hard limit in which every cell receives the same number
72 |                         of nearest neighbors, while relaxation=infinity approaches traditional kNN.
73 |                         (default: 3)
74 | 
75 | advanced:
76 |   --num_pcs NUM_PCS     (integer) Number of principal components to keep for each dataset, used for
77 |                         smoothing and for clustering/embedding after imputation. (default: 50)
78 |   --smoothing_fractions SMOOTHING_FRACTIONS
79 |                         (list of floats) A list of three values between 0 and 1 that control the relative
80 |                         contribution of the cell itself vs. its neighbors in within-dataset smoothing,
81 |                         specified for 'rna', 'atac', and 'mc' data, respectively. (default: [0.7, 0.1, 0.9])
82 |   --leiden_n_neighbors LEIDEN_N_NEIGHBORS
83 |                         (integer) Number of nearest neighbors used to build the graph in the integrated
84 |                         space; the resulting nearest neighbor graph is used for Leiden clustering. It is
85 |                         passed to the python package leidenalg. (default: 30)
86 |   --leiden_resolutions LEIDEN_RESOLUTIONS
87 |                         (list of floats) A list of resolutions to be used for Leiden clustering. It is
88 |                         passed to the python package leidenalg. (default: [0.1, 0.2, 0.4, 0.8])
89 |   --umap_n_neighbors UMAP_N_NEIGHBORS
90 |                         (integer) Number of neighbors for UMAP. It is passed to the python package as
91 |                         umap.UMAP(n_neighbors). (default: 60)
92 |   --umap_min_dist UMAP_MIN_DIST
93 |                         (float) Minimum distance for UMAP. It is passed to the python package as
94 |                         umap.UMAP(min_dist). (default: 0.5)
95 | 
96 | Contributors: Fangming Xie, Aditya Chandrasekar, Wayne I. Doyle, Ethan J. Armand, Eran Mukamel.
97 | 
98 | Contact: Eran Mukamel (emukamel@ucsd.edu).
99 | ```
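
For a concrete sense of the `--relaxation` cap, here is a minimal sketch of the bound described above (the function name and the choice of rounding are ours; this is not the actual SingleCellFusion internals):

```python
import math

def max_incoming_neighbors(k, n1, n2, relaxation=3.0):
    """Upper bound on the connections a cell in dataset 2 can receive
    when each of the n1 cells in dataset 1 requests k neighbors
    among the n2 cells in dataset 2."""
    return math.ceil(k * n1 / n2 * relaxation)

# e.g., k=20 neighbors, 10,000 cells in dataset 1, 5,000 in dataset 2:
print(max_incoming_neighbors(20, 10_000, 5_000))  # 120
```

With `relaxation=1` the cap equals the average in-degree kN1/N2, so connections are spread as evenly as possible; larger values approach an unconstrained kNN.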
100 | 
101 | ### Example:
102 | 
103 | #### Integrating L5 ET cells from four data modalities from the mouse primary motor cortex
104 | 
105 | `./example-MOp_L5ET` contains an example of integrating layer 5 extratelencephalic-projecting (L5 ET) neurons from four different datasets from the mouse primary motor cortex. The example directory includes the organized datasets, code, and results, and can be used as a template for other similar tasks.
106 | 
107 | After running SingleCellFusion on the example data with `run_scf.sh` in `example-MOp_L5ET` (a call to SingleCellFusion with default parameters), the notebook `./example-MOp_L5ET/visualize_results.ipynb` provides a step-by-step walkthrough of manipulating and plotting the integrated data. The plots it creates are shown below, and all the code required to generate them is included in the notebook.
108 | 
109 | ```
110 | cd ./example-MOp_L5ET
111 | # shell script to run SingleCellFusion using example parameters
112 | ./run_scf.sh
113 | # visualize results
114 | jupyter notebook visualize_results.ipynb
115 | ```
116 | 
117 | More example datasets and prepared `run_scf.sh` scripts are included in the repository to experiment with.
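
Before running on your own data, it can help to verify that the inputs satisfy the requirements listed under `--input_datasets` (unique cell IDs within each file, gene IDs at least partially shared across files). A minimal sketch using the `anndata` package from `environment.yml`; the paths follow the example above:

```python
import anndata

paths = [
    "./datasets/10x_cells_v2.h5ad",
    "./datasets/smarter_cells.h5ad",
    "./datasets/smarter_nuclei.h5ad",
    "./datasets/snmcseq_gene.h5ad",
]

gene_sets = []
for p in paths:
    adata = anndata.read_h5ad(p)
    # cell IDs must be unique within each .h5ad file
    assert adata.obs_names.is_unique, f"duplicate cell IDs in {p}"
    gene_sets.append(set(adata.var_names))

# gene IDs should be at least partially shared across files
shared = set.intersection(*gene_sets)
print(f"{len(shared)} genes shared across all {len(paths)} datasets")
```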
118 | 
119 | #### Integrated Embedding and Clustering
120 | 
121 | SingleCellFusion integrates the modalities and embeds the integrated space into common UMAP coordinates. Plotting these UMAP coordinates, with each data point colored, gives a rough view of how well the modalities are integrated.
122 | 
123 | In the top panel, we plot the integrated UMAP space, coloring each point by its modality of origin.
124 | 
125 | In the bottom panels, we plot the integrated UMAP space colored by the joint clusters found. Each panel corresponds to a separate clustering resolution, set when calling SingleCellFusion.
126 | 
127 | ![Plot 1](./docs/results/SingleCellFusion_plot_1_joint_embedding_and_clusterings_overview.png)
128 | 
129 | #### Cell Distribution in Clusters
130 | 
131 | To ensure that each cluster has a similar composition of all the datasets, we plot the dataset composition of each joint cluster in a bar chart.
132 | 
133 | Each bar corresponds to a cluster found in the joint clustering, colored by the original datasets. To check that each cluster has a relatively even composition of each dataset, we plot the overall composition of the datasets next to the bar charts for comparison.
134 | 
135 | To confirm that SingleCellFusion does not cluster cells by their source dataset, we want the composition of each cluster to be as close as possible to the overall composition of the data (defined by the sizes of the original modalities).
136 | 
137 | ![Plot 2](./docs/results/SingleCellFusion_plot_2_hist.png)
138 | 
139 | #### Cell Embedding Colored by Dataset
140 | 
141 | SingleCellFusion integrates the modalities and embeds the integrated space into common UMAP coordinates. We want to show that the integration does not segregate cells by modality, but instead organizes them by expression-level features.
142 | 
143 | To do this, we plot each modality separately on the same UMAP space and check that each modality is evenly distributed across the space.
144 | 
145 | ![Plot 3](./docs/results/SingleCellFusion_plot_3_embedding_by_dataset.png)
146 | 
147 | #### Cell Embedding Colored by Original Annotations/Cluster
148 | 
149 | To see how the original clusters from individual modalities are preserved in the integrated clustering, we display the original cell-type annotations of the individual datasets in the integrated space.
150 | 
151 | In this example, we focus on displaying L5 PT (L5 ET, or its equivalent annotation) cells. The majority of the cells in the example are labeled as L5 PT/ET in each individual dataset's clustering, with a few exceptions.
152 | 
153 | ![Plot 4](./docs/results/SingleCellFusion_plot_4_embedding_by_individual_mod_clusterings.png)
154 | 
155 | #### Confusion Matrices between Integrated Clustering and Individual Dataset Clustering
156 | 
157 | To continue investigating how the original clusters from individual modalities are maintained in the integrated clusters, we can plot confusion matrices showing how the clusters of the individual datasets are reorganized into the new integrated clusters.
158 | 
159 | The columns of each confusion matrix show the cluster labels of the individual dataset. If no such cluster labels exist, a single column is shown, so we can still examine the distribution of cells across the joint clusters.
160 | 
161 | The rows (y-axis) show the three clusters identified in the integrated dataset.
162 | 
163 | The confusion matrices in the first row are normalized by joint clusters (the sum of each row is 1).
164 | 
165 | The confusion matrices in the second row are normalized by original clusters (the sum of each column is 1).
166 | 
167 | ![Plot 5](./docs/results/SingleCellFusion_plot_5_confmat.png)
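
The two normalizations can be reproduced with a few lines of NumPy. `confmat` below is a hypothetical (joint clusters × original clusters) count matrix, used only to illustrate the axis conventions described above:

```python
import numpy as np

# rows: joint clusters; columns: original dataset clusters
confmat = np.array([[30., 5., 0.],
                    [2., 40., 8.],
                    [1., 3., 25.]])

by_joint = confmat / confmat.sum(axis=1, keepdims=True)  # each row sums to 1
by_orig = confmat / confmat.sum(axis=0, keepdims=True)   # each column sums to 1
```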
--------------------------------------------------------------------------------
/docs/changelog.rst:
--------------------------------------------------------------------------------
1 | Changelog
2 | ================
3 | * Version 1.0.0 - 2019-11-08:
4 |     * First stable release!
5 |     * Added a basic example of how to use SingleCellFusion to the README
6 |     * Minor formatting and documentation fixes throughout
7 | * Version 0.9.0 - 2019-11-07:
8 |     * Finalized tests before the first stable release
9 |     * Fixed an indexing issue in the low-memory version of data integration
10 | * Version 0.8.0 - 2019-11-05:
11 |     * Performed debugging
12 |     * Added a wrapper for all steps
13 | * Version 0.7.0 - 2019-10-14:
14 |     * Added a low-memory version of the integration functions
15 | * Version 0.6.0 - 2019-10-04:
16 |     * Added functions to search for common, variable features
17 |     * Added high- and low-memory computation of mean/standard deviation for loom files
18 | * Version 0.5.0 - 2019-10-03:
19 |     * Fixed low- and high-memory versions of kNN
20 |     * Added integration function (currently only in high-memory format)
21 |     * Added high-memory PCA
22 | * Version 0.4.0 - 2019-09-30:
23 |     * Added preliminary MNN functions
24 |     * Removed recipes and integration functions pending updates
25 | * Version 0.3.0 - 2019-09-25:
26 |     * Major overhaul to make SingleCellFusion more user friendly
27 |     * Added low- and high-memory versions of constrained kNN search
28 |     * Removed MNN method, pending version 0.4.0
29 | * Version 0.2.0 - 2019-09-19:
30 |     * Initialized changelog and versioning; many changes had accumulated since the last version
31 |     * Fixed a number of bugs
32 | * Version 0.1.0 - 2018-09-11:
33 |     * Initial release of SingleCellFusion
34 | 

--------------------------------------------------------------------------------
/docs/faqs.rst:
--------------------------------------------------------------------------------
1 | FAQs
2 | ================
3 | SingleCellFusion is under active development, and function names and parameters will continue to
4 | change until a stable release is reached. In the interim, we have provided answers to common
5 | questions and problems that can occur when using SingleCellFusion.
6 | 
7 | Why do you use loom files and how do I make one?
8 | -------------------------------------------------
9 | The loom file format allows SingleCellFusion to keep a low memory footprint when analyzing large data
10 | sets (such as 10x Genomics scRNA-seq data), and keeps all of the metadata in one centralized location.
11 | The loompy package was developed by the Sten Linnarsson group and has excellent documentation at
12 | `loompy.org <http://loompy.org/>`_.
13 | 
14 | Within a loom file, features are stored in rows and cells in columns. As an example of creating a
15 | loom file, say you have a pandas dataframe (df) in which the features are in rows and the cells are in
16 | columns. The index of this dataframe contains the unique feature IDs, and the column header contains
17 | unique cell IDs. A loom file can be generated with the following code::
18 | 
19 |     import loompy
20 |     loompy.create(filename=filename,
21 |                   layers={'': df.values},
22 |                   row_attrs={'Accession': df.index.values},
23 |                   col_attrs={'CellID': df.columns.values})
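
If you already have a loom file and want the .h5ad inputs used by the command-line tools in this
repository, the anndata package can convert between the two formats. A minimal sketch (the file
names are placeholders)::

    import anndata

    # anndata reads loom with cells as rows (obs) and features as
    # columns (var), i.e., it transposes the loom convention for you
    adata = anndata.read_loom("my_data.loom")
    adata.write_h5ad("my_data.h5ad")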
33 | -------------------------------------- 34 | This is a problem with Numpy (see this `issue `_). You can solve this 35 | issue in two ways. 36 | 37 | The easiest way is to run the following lines on your command line before running any Python scripts or notebooks:: 38 | 39 | export OMP_NUM_THREADS=1 40 | export OPENBLAS_NUM_THREADS=1 41 | export MKL_NUM_THREADS=1 42 | export VECLIB_MAXIMUM_THREADS=1 43 | export NUMEXPR_NUM_THREADS=1 44 | 45 | You can also run the below code in the first cell of a Python notebook or the beginning or a Python script. It must 46 | be run before importing any other packages (including MoP):: 47 | 48 | import os 49 | os.environ["OMP_NUM_THREADS"] = '1' 50 | os.environ["OPENBLAS_NUM_THREADS"] = '1' 51 | os.environ["MKL_NUM_THREADS"] = '1' 52 | os.environ["VECLIB_MAXIMUM_THREADS"] = '1' 53 | os.environ["NUMEXPR_NUM_THREADS"] = '1' 54 | 55 | You can specify the maximum number of threads that you want to use in that script or notebook by changing the value 56 | from 1 to your desired integer. 57 | 58 | This information came from this `StackOverflow question 59 | `_. 60 | 61 | Why is my code running slow with the low_mem flag? 62 | -------------------------------------------------- 63 | Although the loom file format has a number of benefits, the access and processing of data in the file 64 | will get progressively slower as more data is added to the file. If you are finding that your code is 65 | running too slow it can be helpful to make a second loom file containing just the relevant data for running 66 | SingleCellFusion. 67 | 68 | Another cause of slow code is that the batch size for processing code (see "Why is my code using so much 69 | memory?" above) is too small. If you are not having memory issues, we recommend increasing the batch size 70 | to speed up the code. 71 | 72 | Why am I not finding many neighbors? 73 | ------------------------------------- 74 | If you expect that similar cell types should be present in both data sets, this could be due to 75 | the sparseness of your data. We have found that if you first smooth your data (we highly 76 | recommend using `MAGIC `_. You can then use the 77 | smoothed data to find nearest neighbors, and impute on the observed data. A tutorial using our 78 | loom-based method of smoothing will be uploaded soon. 79 | 80 | 81 | Is SingleCellFusion just for integrating data from different sequencing modalities? 82 | ----------------------------------------------------------------------------------- 83 | No, theoretically this pipeline could be applied to integration across species or to find common cell 84 | types across different research studies using the same sequencing technology. This is an active area 85 | of development. 86 | 87 | What happens if a cell type is present in only one modality? 88 | ------------------------------------------------------------- 89 | In our experience, this situation is easily detectable. If the analysis is only performed on direct 90 | mutual nearest neighbors, these cells will not make nearest neighbors and will be dropped from the analysis. 91 | If the imputation is performed with the rescue, these cells will still not make mutual nearest neighbors. 92 | Their imputed counts will then come from their mutual nearest neighbors within their own data set. These 93 | imputed counts will not be similar to any observed counts, and these cells will self-segregate into their 94 | own clusters and will be visually separate on a tSNE or uMAP embedding. 
For the kNN method, these cells will 95 | make weak connections with a number of different cell types. This will lead to the imputation of counts that 96 | are not similar to any observed counts, also leading to segregation into unique clusters. 97 | 98 | 99 | -------------------------------------------------------------------------------- /docs/knn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/knn.png -------------------------------------------------------------------------------- /docs/mnn_direct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/mnn_direct.png -------------------------------------------------------------------------------- /docs/mnn_equation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/mnn_equation.png -------------------------------------------------------------------------------- /docs/mnn_rescue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/mnn_rescue.png -------------------------------------------------------------------------------- /docs/n_neighbors_knn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/n_neighbors_knn.png -------------------------------------------------------------------------------- /docs/rescue_equation_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/rescue_equation_1.png -------------------------------------------------------------------------------- /docs/rescue_equation_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/rescue_equation_2.png -------------------------------------------------------------------------------- /docs/rescue_equation_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/rescue_equation_3.png -------------------------------------------------------------------------------- /docs/results/SingleCellFusion_plot_1_joint_embedding_and_clusterings_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/results/SingleCellFusion_plot_1_joint_embedding_and_clusterings_overview.png -------------------------------------------------------------------------------- /docs/results/SingleCellFusion_plot_2_hist.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/results/SingleCellFusion_plot_2_hist.png

--------------------------------------------------------------------------------
/docs/results/SingleCellFusion_plot_3_embedding_by_dataset.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/results/SingleCellFusion_plot_3_embedding_by_dataset.png

--------------------------------------------------------------------------------
/docs/results/SingleCellFusion_plot_4_embedding_by_individual_mod_clusterings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/results/SingleCellFusion_plot_4_embedding_by_individual_mod_clusterings.png

--------------------------------------------------------------------------------
/docs/results/SingleCellFusion_plot_5_confmat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/results/SingleCellFusion_plot_5_confmat.png

--------------------------------------------------------------------------------
/docs/scf_description.rst:
--------------------------------------------------------------------------------
1 | How does SingleCellFusion work?
2 | ================================
3 | SingleCellFusion is built around the idea that, for a cell profiled by a given omics technique (RNA-sequencing,
4 | snATAC-sequencing, snmC-sequencing), there are unobserved features of that cell that, if sampled, would
5 | provide a fuller picture of that cell's identity. For example, if a cell underwent RNA-sequencing, we know
6 | which genes are expressed, but we don't know the patterns of DNA methylation in that same cell. The methylation
7 | status of DNA in that cell is unobserved, limiting our ability to fully understand the identity of that cell.
8 | 
9 | In an ideal world we would obtain the transcriptome, methylome, and chromatin accessibility of a single
10 | cell at once; until the technologies for this type of experiment mature, SingleCellFusion can provide a
11 | computational equivalent. SingleCellFusion uses known relationships between different types of multiomics
12 | data to impute unobserved data, enabling the multimodal analysis of a cell's identity.
13 | 
14 | The core of SingleCellFusion is the generation of a nearest neighbors graph between different data sets.
15 | This graph is generated by finding nearest neighbors using the correlation of counts at highly variable
16 | features. For example, DNA methylation is known to be negatively correlated with gene expression. If a
17 | snmC-seq profiled cell has low methylation at a number of highly variable genes, and a snRNA-seq profiled
18 | cell has high gene expression at those same genes, we can assume that those two cells likely belong to the
19 | same cell type. We use this nearest neighbors graph to generate imputed counts by averaging among a cell's
20 | neighbors in the opposite modality. The actions of SingleCellFusion depend on the type of nearest neighbor
21 | graph specified, and are described below.
22 | 
23 | Direct mutual nearest neighbors
24 | -------------------------------
25 | .. image:: mnn_direct.png
26 |     :width: 400
27 |     :alt: cartoon of direct MNN
28 | 
29 | In this method, highly variable features are identified in each data set. On a cell-to-cell basis, the
30 | correlation of counts at highly variable features is calculated. These correlation values are used
31 | as the distance metric for identifying mutual neighbors.
32 | 
33 | Once the correlation is calculated, neighbors across modalities are determined. We require that
34 | neighbors have high correlation with each other. In other words, a snmC-seq profiled
35 | cell can only be a neighbor of a scRNA-seq cell if the methylation levels at the highly variable
36 | features are strongly anti-correlated with gene expression at those same features in the scRNA-seq
37 | profiled cell, and vice versa. This ensures that only strong neighbors are found and that the
38 | nearest neighbors graph is not dominated by noisy or spurious correlations.
39 | 
40 | Once the neighbors graph is generated, imputed counts are generated by the following equation:
41 | 
42 | .. image:: mnn_equation.png
43 |     :width: 400
44 |     :alt: equation for imputation by MNN
45 | 
46 | For cell *j* in modality *m*, which has direct mutual nearest neighbors with cells in modality
47 | *m'*, the imputed *m'* counts for feature *f* are given by the average over its *k* nearest
48 | neighbors in modality *m'*.
49 | 
50 | This is the most conservative method for generating imputed counts: only cells that make direct mutual
51 | nearest neighbors will receive imputed data. This method typically leads to good integration, but can
52 | result in the loss of large fractions of cells from the analysis if mutual neighbors are not found for them.
53 | 
54 | 
55 | Mutual nearest neighbors with rescue
56 | -------------------------------------
57 | .. image:: mnn_rescue.png
58 |     :width: 400
59 |     :alt: cartoon of rescue MNN
60 | 
61 | As with the direct method, the distance between two cells is their correlation at
62 | highly variable genes. The only difference in this method is that, in addition to a mutual
63 | nearest neighbors graph between modalities, a mutual nearest neighbor graph within each modality
64 | is also generated. This within-modality graph allows imputation to be performed on all cells, by
65 | using the within-modality neighbors to determine the best matched neighbors across
66 | modalities.
67 | 
68 | .. image:: rescue_equation_1.png
69 |     :width: 400
70 |     :alt: equation 1 of rescue
71 | 
72 | where
73 | 
74 | .. image:: rescue_equation_2.png
75 |     :width: 400
76 |     :alt: equation 2 of rescue
77 | 
78 | For a cell *l* in modality *m*, which has no direct mutual neighbors with cells in modality
79 | *m'*, the imputed *m'* counts for feature *f* are given by a weighted average over its *k*
80 | nearest neighbors in modality *m* which have direct mutual neighbors with cells in modality
81 | *m'*. The cells with direct mutual nearest neighbors have imputed counts per the equation in
82 | "Direct mutual nearest neighbors":
83 | 
84 | .. image:: mnn_equation.png
85 |     :width: 400
86 |     :alt: equation for imputation by MNN
87 | 
88 | The weights *A(l,j)* are determined by the distance between *l* and *j*, *d(l,j)*, by the following
89 | equation:
90 | 
91 | .. image:: rescue_equation_3.png
92 |     :width: 400
93 |     :alt: equation 3 of rescue
94 | 
95 | This is a more lenient method for generating imputed counts, as all cells will receive imputed
96 | data. This method enables all cells to be analyzed, and is our recommended approach.
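
Since the equations above are embedded as images, the following LaTeX reconstruction may be easier
to reuse. It is built from the prose descriptions; the symbol names are our own, and the exact form
of the weight kernel *w* is the one shown in rescue_equation_3.png, not reproduced here::

    % Direct MNN: average over the k mutual nearest neighbors in modality m'
    \hat{x}^{m'}_{j,f} = \frac{1}{k} \sum_{i \in \mathrm{NN}_{m'}(j)} x^{m'}_{i,f}

    % Rescue: weighted average over the k within-modality neighbors of cell l
    % that do have direct mutual neighbors in modality m'
    \hat{x}^{m'}_{l,f} = \sum_{j \in \mathrm{NN}_{m}(l)} A(l,j) \, \hat{x}^{m'}_{j,f},
    \qquad
    A(l,j) = \frac{w(d(l,j))}{\sum_{j'} w(d(l,j'))}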
97 | 
98 | k-nearest neighbors
99 | -------------------
100 | .. image:: knn.png
101 |     :width: 400
102 |     :alt: cartoon of kNN
103 | 
104 | Similar to the other methods, the distance metric between two cells is their correlation at
105 | highly variable features. The major difference in this method is that each cell is required to make
106 | *k* neighbors in the opposite modality, with the restriction that a cell in the opposite modality
107 | can make at most a set number *j* of neighbors. The maximum number of neighbors that a cell
108 | in the opposite modality can make is given by the equation:
109 | 
110 | .. image:: n_neighbors_knn.png
111 |     :width: 200
112 |     :alt: equation 1 of knn
113 | 
114 | where *j* is the maximum number of neighbors a cell in modality *m'* can make, *k* is the required
115 | number of nearest neighbors per cell in modality *m*, *n*\ :sub:`m`\ is the number of cells in
116 | modality *m*, and *n*\ :sub:`m'`\ is the number of cells in modality *m'*. *z* is a relaxation
117 | parameter that restricts cells from becoming hyperconnected. The neighbor graph is created by randomly
118 | iterating through each cell and finding its k nearest neighbors among cells below the maximum connection
119 | threshold. Once the nearest neighbors graph is generated, imputed counts are generated by the same
120 | equation as in "Direct mutual nearest neighbors":
121 | 
122 | .. image:: mnn_equation.png
123 |     :width: 400
124 |     :alt: equation for imputation by MNN
125 | 
126 | This is the most lenient method for generating imputed counts, as all cells will make neighbors
127 | in the opposite data set.

--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: scf_terra
2 | channels:
3 |   - bioconda
4 |   - conda-forge
5 |   - defaults
6 | dependencies:
7 |   - _libgcc_mutex=0.1=conda_forge
8 |   - _openmp_mutex=4.5=1_llvm
9 |   - anndata=0.6.22.post1=py_0
10 |   - attrs=19.3.0=py_0
11 |   - backcall=0.1.0=py_0
12 |   - blas=1.0=openblas
13 |   - bleach=3.1.1=py_0
14 |   - blosc=1.16.3=hd408876_0
15 |   - bzip2=1.0.8=h7b6447c_0
16 |   - ca-certificates=2021.4.13=h06a4308_1
17 |   - cairo=1.16.0=hfb77d84_1002
18 |   - certifi=2020.12.5=py38h06a4308_0
19 |   - cycler=0.10.0=py_2
20 |   - dbus=1.13.6=he372182_0
21 |   - decorator=4.4.1=py_0
22 |   - defusedxml=0.6.0=py_0
23 |   - entrypoints=0.3=py38_1000
24 |   - expat=2.2.9=he1b5a44_2
25 |   - fbpca=1.0=py_0
26 |   - fontconfig=2.13.1=h86ecdb6_1001
27 |   - freetype=2.10.0=he983fc9_1
28 |   - gettext=0.19.8.1=hc5be6a0_1002
29 |   - glib=2.58.3=py38h6f030ca_1002
30 |   - gmp=6.2.0=he1b5a44_2
31 |   - gst-plugins-base=1.14.5=h0935bb2_2
32 |   - gstreamer=1.14.5=h36ae1b5_2
33 |   - h5py=2.10.0=py38h7918eee_0
34 |   - hdf5=1.10.4=hb1b8bf9_0
35 |   - icu=64.2=he1b5a44_1
36 |   - igraph=0.7.1=h9e3b1fc_1007
37 |   - importlib_metadata=1.5.0=py38_0
38 |   - ipykernel=5.1.4=py38h5ca1d4c_0
39 |   - ipython=7.12.0=py38h5ca1d4c_0
40 |   - ipython_genutils=0.2.0=py_1
41 |   - jedi=0.16.0=py38_0
42 |   - jinja2=2.11.1=py_0
43 |   - joblib=0.14.1=py_0
44 |   - jpeg=9c=h14c3975_1001
45 |   - json5=0.9.0=py_0
46 |   - jsonschema=3.2.0=py38_0
47 |   - jupyter_client=5.3.4=py38_1
48 |   - jupyter_contrib_core=0.3.3=py_2
49 |   - jupyter_contrib_nbextensions=0.5.1=py38_0
50 |   - jupyter_core=4.6.3=py38_0
51 |   - jupyter_highlight_selected_word=0.2.0=py38_1000
52 |   - jupyter_latex_envs=1.4.6=py38_1000
53 |   - 
jupyter_nbextensions_configurator=0.4.1=py38_0 54 | - jupyterlab=1.2.6=py_0 55 | - jupyterlab_server=1.0.6=py_0 56 | - kiwisolver=1.1.0=py38hc9558a2_0 57 | - ld_impl_linux-64=2.33.1=h53a641e_8 58 | - leidenalg=0.7.0=py38he1b5a44_1 59 | - libblas=3.8.0=15_openblas 60 | - libcblas=3.8.0=15_openblas 61 | - libclang=9.0.1=default_hde54327_0 62 | - libffi=3.2.1=he1b5a44_1006 63 | - libgcc-ng=9.2.0=h24d8f2e_2 64 | - libgfortran-ng=7.3.0=hdf63c60_5 65 | - libiconv=1.15=h516909a_1005 66 | - liblapack=3.8.0=15_openblas 67 | - libllvm8=8.0.1=hc9558a2_0 68 | - libllvm9=9.0.1=hc9558a2_0 69 | - libopenblas=0.3.8=h5ec1e0e_0 70 | - libpng=1.6.37=hed695b0_0 71 | - libsodium=1.0.17=h516909a_0 72 | - libstdcxx-ng=9.2.0=hdf63c60_2 73 | - libuuid=2.32.1=h14c3975_1000 74 | - libxcb=1.13=h14c3975_1002 75 | - libxkbcommon=0.10.0=he1b5a44_0 76 | - libxml2=2.9.10=hee79883_0 77 | - libxslt=1.1.33=h31b3aaa_0 78 | - llvm-openmp=9.0.1=hc9558a2_2 79 | - llvmlite=0.31.0=py38h8b12597_0 80 | - lxml=4.5.0=py38hbb43d70_1 81 | - lz4-c=1.8.1.2=h14c3975_0 82 | - lzo=2.10=h7b6447c_2 83 | - markupsafe=1.1.1=py38h516909a_0 84 | - matplotlib=3.1.3=py38_0 85 | - matplotlib-base=3.1.3=py38h250f245_0 86 | - mistune=0.8.4=py38h516909a_1000 87 | - mock=4.0.3=pyhd3eb1b0_0 88 | - natsort=7.1.1=pyhd3eb1b0_0 89 | - nbconvert=5.6.1=py38_0 90 | - nbformat=5.0.4=py_0 91 | - ncurses=6.1=hf484d3e_1002 92 | - notebook=6.0.3=py38_0 93 | - nspr=4.25=he1b5a44_0 94 | - nss=3.47=he751ad9_0 95 | - numba=0.48.0=py38hb3f55d8_0 96 | - numexpr=2.7.3=py38h4be448d_1 97 | - numpy=1.18.1=py38h95a1406_0 98 | - openssl=1.1.1k=h27cfd23_0 99 | - pandas=1.0.1=py38hb3f55d8_0 100 | - pandoc=2.9.2=0 101 | - pandocfilters=1.4.2=py_1 102 | - parso=0.6.1=py_0 103 | - patsy=0.5.1=py_0 104 | - pcre=8.44=he1b5a44_0 105 | - pexpect=4.8.0=py38_0 106 | - pickleshare=0.7.5=py38_1000 107 | - pip=20.0.2=py_2 108 | - pixman=0.38.0=h516909a_1003 109 | - prometheus_client=0.7.1=py_0 110 | - prompt_toolkit=3.0.3=py_0 111 | - pthread-stubs=0.4=h14c3975_1001 112 | - ptyprocess=0.6.0=py_1001 113 | - pycairo=1.19.1=py38h438ddbb_0 114 | - pygments=2.5.2=py_0 115 | - pyparsing=2.4.6=py_0 116 | - pyqt=5.12.3=py38hcca6a23_1 117 | - pyrsistent=0.15.7=py38h516909a_0 118 | - pytables=3.6.1=py38h9fd0a39_0 119 | - python=3.8.1=h357f687_2 120 | - python-dateutil=2.8.1=py_0 121 | - python-igraph=0.8.0=py38h9e3b1fc_0 122 | - python_abi=3.8=1_cp38 123 | - pytz=2019.3=py_0 124 | - pyyaml=5.3.1=py38h1e0a361_0 125 | - pyzmq=18.1.1=py38h1768529_0 126 | - qt=5.12.5=hd8c4c69_1 127 | - readline=8.0=hf8c457e_0 128 | - scikit-learn=0.22.1=py38hcdab131_1 129 | - scipy=1.4.1=py38h921218d_0 130 | - seaborn=0.10.0=py_1 131 | - send2trash=1.5.0=py_0 132 | - setuptools=45.2.0=py38_0 133 | - six=1.14.0=py38_0 134 | - snappy=1.1.8=he6710b0_0 135 | - sqlite=3.30.1=hcee41ef_0 136 | - statsmodels=0.11.1=py38h516909a_0 137 | - terminado=0.8.3=py38_0 138 | - testpath=0.4.4=py_0 139 | - texttable=1.6.2=py_0 140 | - tk=8.6.10=hed695b0_0 141 | - tornado=6.0.3=py38h516909a_4 142 | - tqdm=4.59.0=pyhd3eb1b0_1 143 | - traitlets=4.3.3=py38_0 144 | - umap-learn=0.3.10=py38_1 145 | - wcwidth=0.1.8=py_0 146 | - webencodings=0.5.1=py_1 147 | - wheel=0.34.2=py_1 148 | - xorg-kbproto=1.0.7=h14c3975_1002 149 | - xorg-libice=1.0.10=h516909a_0 150 | - xorg-libsm=1.2.3=h84519dc_1000 151 | - xorg-libx11=1.6.9=h516909a_0 152 | - xorg-libxau=1.0.9=h14c3975_0 153 | - xorg-libxdmcp=1.1.3=h516909a_0 154 | - xorg-libxext=1.3.4=h516909a_0 155 | - xorg-libxrender=0.9.10=h516909a_1002 156 | - xorg-renderproto=0.11.1=h14c3975_1002 157 | - 
xorg-xextproto=7.3.0=h14c3975_1002 158 | - xorg-xproto=7.0.31=h14c3975_1007 159 | - xz=5.2.4=h14c3975_1001 160 | - yaml=0.2.4=h516909a_0 161 | - zeromq=4.3.2=he1b5a44_2 162 | - zipp=3.0.0=py_0 163 | - zlib=1.2.11=h516909a_1006 164 | - zstd=1.3.7=h0b5b093_0 165 | - pip: 166 | - annoy==1.16.3 167 | - pyqt5-sip==4.19.18 168 | - pyqtwebengine==5.12.1 169 | -------------------------------------------------------------------------------- /environment_mini.yml: -------------------------------------------------------------------------------- 1 | name: scf_mini 2 | channels: 3 | - hcc 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=main 8 | - _openmp_mutex=4.5=1_gnu 9 | - anndata=0.7.6=py39hf3d152e_0 10 | - arpack=3.7.0=hc6cf775_2 11 | - blas=1.0=mkl 12 | - brotli=1.0.9=h7f98852_5 13 | - brotli-bin=1.0.9=h7f98852_5 14 | - ca-certificates=2021.5.30=ha878542_0 15 | - cached-property=1.5.2=py_0 16 | - certifi=2021.5.30=py39hf3d152e_0 17 | - cycler=0.10.0=py_2 18 | - dbus=1.13.6=he372182_0 19 | - expat=2.4.1=h9c3ff4c_0 20 | - fbpca=1.0=py_0 21 | - fontconfig=2.13.1=hba837de_1005 22 | - fonttools=4.25.0=pyhd3eb1b0_0 23 | - freetype=2.10.4=h0708190_1 24 | - glib=2.69.1=h5202010_0 25 | - glpk=4.65=h9202a9a_1004 26 | - gmp=6.2.1=h58526e2_0 27 | - gst-plugins-base=1.14.0=hbbd80ab_1 28 | - gstreamer=1.14.0=h28cd5cc_2 29 | - h5py=3.2.1=py39h6c542dc_0 30 | - hdf5=1.10.6=hb1b8bf9_0 31 | - icu=58.2=hf484d3e_1000 32 | - igraph=0.9.4=ha184e22_0 33 | - intel-openmp=2021.3.0=h06a4308_3350 34 | - jbig=2.1=h7f98852_2003 35 | - joblib=1.0.1=pyhd8ed1ab_0 36 | - jpeg=9d=h36c2ea0_0 37 | - kiwisolver=1.3.1=py39h2531618_0 38 | - lcms2=2.12=hddcbb42_0 39 | - ld_impl_linux-64=2.35.1=h7274673_9 40 | - leidenalg=0.8.7=py39he80948d_0 41 | - lerc=2.2.1=h9c3ff4c_0 42 | - libblas=3.9.0=11_linux64_mkl 43 | - libbrotlicommon=1.0.9=h7f98852_5 44 | - libbrotlidec=1.0.9=h7f98852_5 45 | - libbrotlienc=1.0.9=h7f98852_5 46 | - libcblas=3.9.0=11_linux64_mkl 47 | - libdeflate=1.7=h7f98852_5 48 | - libffi=3.3=he6710b0_2 49 | - libgcc-ng=9.3.0=h5101ec6_17 50 | - libgfortran-ng=7.5.0=ha8ba4b0_17 51 | - libgfortran4=7.5.0=ha8ba4b0_17 52 | - libgomp=9.3.0=h5101ec6_17 53 | - liblapack=3.9.0=11_linux64_mkl 54 | - libllvm10=10.0.1=he513fc3_3 55 | - libpng=1.6.37=h21135ba_2 56 | - libstdcxx-ng=9.3.0=hd4cf53a_17 57 | - libtiff=4.3.0=hf544144_1 58 | - libuuid=2.32.1=h7f98852_1000 59 | - libwebp-base=1.2.0=h27cfd23_0 60 | - libxcb=1.13=h7f98852_1003 61 | - libxml2=2.9.12=h03d6c58_0 62 | - llvmlite=0.36.0=py39h612dafd_4 63 | - lz4-c=1.9.3=h9c3ff4c_1 64 | - matplotlib=3.4.2=py39hf3d152e_0 65 | - matplotlib-base=3.4.2=py39hab158f2_0 66 | - metis=5.1.0=h58526e2_1006 67 | - mkl=2021.3.0=h06a4308_520 68 | - mkl-service=2.4.0=py39h7f8727e_0 69 | - mkl_fft=1.3.0=py39h42c9631_2 70 | - mkl_random=1.2.2=py39h51133e4_0 71 | - mpfr=4.1.0=h9202a9a_1 72 | - munkres=1.1.4=pyh9f0ad1d_0 73 | - natsort=7.1.1=pyhd8ed1ab_0 74 | - ncurses=6.2=he6710b0_1 75 | - numba=0.53.1=py39h56b8d98_1 76 | - numpy=1.20.3=py39hf144106_0 77 | - numpy-base=1.20.3=py39h74d4b33_0 78 | - olefile=0.46=pyh9f0ad1d_1 79 | - openjpeg=2.4.0=hb52868f_1 80 | - openssl=1.1.1k=h7f98852_0 81 | - packaging=21.0=pyhd8ed1ab_0 82 | - pandas=1.3.0=py39hde0f152_0 83 | - patsy=0.5.1=py_0 84 | - pcre=8.45=h9c3ff4c_0 85 | - pillow=8.3.1=py39ha612740_0 86 | - pip=21.2.4=py37h06a4308_0 87 | - pthread-stubs=0.4=h36c2ea0_1001 88 | - pynndescent=0.5.4=pyh6c4a22f_0 89 | - pyparsing=2.4.7=pyh9f0ad1d_0 90 | - pyqt=5.9.2=py39h2531618_6 91 | - python=3.9.6=h12debd9_1 92 | - 
python-annoy=1.17.0=py39he80948d_2 93 | - python-dateutil=2.8.2=pyhd8ed1ab_0 94 | - python-igraph=0.9.6=py39hfef886c_0 95 | - python_abi=3.9=2_cp39 96 | - pytz=2021.1=pyhd8ed1ab_0 97 | - qt=5.9.7=h5867ecd_1 98 | - readline=8.1=h27cfd23_0 99 | - scikit-learn=0.24.2=py39h4dfa638_0 100 | - scipy=1.6.2=py39had2a1c9_1 101 | - seaborn=0.11.2=hd8ed1ab_0 102 | - seaborn-base=0.11.2=pyhd8ed1ab_0 103 | - setuptools=52.0.0=py39h06a4308_0 104 | - sip=4.19.13=py39h2531618_0 105 | - six=1.16.0=pyhd3eb1b0_0 106 | - sqlite=3.36.0=hc218d9a_0 107 | - statsmodels=0.12.2=py39hce5d2b2_0 108 | - suitesparse=5.10.1=hd8046ac_0 109 | - tbb=2020.2=h4bd325d_4 110 | - texttable=1.6.4=pyhd8ed1ab_0 111 | - threadpoolctl=2.2.0=pyh8a188c0_0 112 | - tk=8.6.10=hbc83047_0 113 | - tornado=6.1=py39h3811e60_1 114 | - tzdata=2021a=h5d7bf9c_0 115 | - umap-learn=0.5.1=py39hf3d152e_1 116 | - wheel=0.37.0=pyhd3eb1b0_0 117 | - xorg-libxau=1.0.9=h7f98852_0 118 | - xorg-libxdmcp=1.1.3=h7f98852_0 119 | - xz=5.2.5=h7b6447c_0 120 | - zlib=1.2.11=h7b6447c_3 121 | - zstd=1.5.0=ha95c52a_0 122 | -------------------------------------------------------------------------------- /environment_mini_pegasus.yml: -------------------------------------------------------------------------------- 1 | name: scf_mini_pegasus 2 | channels: 3 | - hcc 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=main 8 | - _openmp_mutex=4.5=1_gnu 9 | - anndata=0.7.6=py39hf3d152e_0 10 | - arpack=3.7.0=hc6cf775_2 11 | - blas=1.0=mkl 12 | - brotli=1.0.9=h7f98852_5 13 | - brotli-bin=1.0.9=h7f98852_5 14 | - ca-certificates=2021.5.30=ha878542_0 15 | - cached-property=1.5.2=py_0 16 | - certifi=2021.5.30=py39hf3d152e_0 17 | - cycler=0.10.0=py_2 18 | - dbus=1.13.6=he372182_0 19 | - expat=2.4.1=h9c3ff4c_0 20 | - fbpca=1.0=py_0 21 | - fontconfig=2.13.1=hba837de_1005 22 | - fonttools=4.25.0=pyhd3eb1b0_0 23 | - freetype=2.10.4=h0708190_1 24 | - glib=2.69.1=h5202010_0 25 | - glpk=4.65=h9202a9a_1004 26 | - gmp=6.2.1=h58526e2_0 27 | - gst-plugins-base=1.14.0=hbbd80ab_1 28 | - gstreamer=1.14.0=h28cd5cc_2 29 | - h5py=3.2.1=py39h6c542dc_0 30 | - hdf5=1.10.6=hb1b8bf9_0 31 | - icu=58.2=hf484d3e_1000 32 | - igraph=0.9.4=ha184e22_0 33 | - intel-openmp=2021.3.0=h06a4308_3350 34 | - jbig=2.1=h7f98852_2003 35 | - joblib=1.0.1=pyhd8ed1ab_0 36 | - jpeg=9d=h36c2ea0_0 37 | - kiwisolver=1.3.1=py39h2531618_0 38 | - lcms2=2.12=hddcbb42_0 39 | - ld_impl_linux-64=2.35.1=h7274673_9 40 | - leidenalg=0.8.7=py39he80948d_0 41 | - lerc=2.2.1=h9c3ff4c_0 42 | - libblas=3.9.0=11_linux64_mkl 43 | - libbrotlicommon=1.0.9=h7f98852_5 44 | - libbrotlidec=1.0.9=h7f98852_5 45 | - libbrotlienc=1.0.9=h7f98852_5 46 | - libcblas=3.9.0=11_linux64_mkl 47 | - libdeflate=1.7=h7f98852_5 48 | - libffi=3.3=he6710b0_2 49 | - libgcc-ng=9.3.0=h5101ec6_17 50 | - libgfortran-ng=7.5.0=ha8ba4b0_17 51 | - libgfortran4=7.5.0=ha8ba4b0_17 52 | - libgomp=9.3.0=h5101ec6_17 53 | - liblapack=3.9.0=11_linux64_mkl 54 | - libllvm10=10.0.1=he513fc3_3 55 | - libpng=1.6.37=h21135ba_2 56 | - libstdcxx-ng=9.3.0=hd4cf53a_17 57 | - libtiff=4.3.0=hf544144_1 58 | - libuuid=2.32.1=h7f98852_1000 59 | - libwebp-base=1.2.0=h27cfd23_0 60 | - libxcb=1.13=h7f98852_1003 61 | - libxml2=2.9.12=h03d6c58_0 62 | - llvmlite=0.36.0=py39h612dafd_4 63 | - louvain=0.7.0=py39he80948d_0 64 | - lz4-c=1.9.3=h9c3ff4c_1 65 | - matplotlib=3.4.2=py39hf3d152e_0 66 | - matplotlib-base=3.4.2=py39hab158f2_0 67 | - metis=5.1.0=h58526e2_1006 68 | - mkl=2021.3.0=h06a4308_520 69 | - mkl-service=2.4.0=py39h7f8727e_0 70 | - mkl_fft=1.3.0=py39h42c9631_2 71 | - 
mkl_random=1.2.2=py39h51133e4_0 72 | - mpfr=4.1.0=h9202a9a_1 73 | - munkres=1.1.4=pyh9f0ad1d_0 74 | - natsort=7.1.1=pyhd8ed1ab_0 75 | - ncurses=6.2=he6710b0_1 76 | - numba=0.53.1=py39h56b8d98_1 77 | - numpy=1.20.3=py39hf144106_0 78 | - numpy-base=1.20.3=py39h74d4b33_0 79 | - olefile=0.46=pyh9f0ad1d_1 80 | - openjpeg=2.4.0=hb52868f_1 81 | - openssl=1.1.1k=h7f98852_0 82 | - packaging=21.0=pyhd8ed1ab_0 83 | - pandas=1.3.0=py39hde0f152_0 84 | - patsy=0.5.1=py_0 85 | - pcre=8.45=h9c3ff4c_0 86 | - pillow=8.3.1=py39ha612740_0 87 | - pip=21.2.4=py37h06a4308_0 88 | - pthread-stubs=0.4=h36c2ea0_1001 89 | - pynndescent=0.5.4=pyh6c4a22f_0 90 | - pyparsing=2.4.7=pyh9f0ad1d_0 91 | - pyqt=5.9.2=py39h2531618_6 92 | - python=3.9.6=h12debd9_1 93 | - python-annoy=1.17.0=py39he80948d_2 94 | - python-dateutil=2.8.2=pyhd8ed1ab_0 95 | - python-igraph=0.9.6=py39hfef886c_0 96 | - python_abi=3.9=2_cp39 97 | - pytz=2021.1=pyhd8ed1ab_0 98 | - qt=5.9.7=h5867ecd_1 99 | - readline=8.1=h27cfd23_0 100 | - scikit-learn=0.24.2=py39h4dfa638_0 101 | - scipy=1.6.2=py39had2a1c9_1 102 | - seaborn=0.11.2=hd8ed1ab_0 103 | - seaborn-base=0.11.2=pyhd8ed1ab_0 104 | - setuptools=52.0.0=py39h06a4308_0 105 | - sip=4.19.13=py39h2531618_0 106 | - six=1.16.0=pyhd3eb1b0_0 107 | - sqlite=3.36.0=hc218d9a_0 108 | - statsmodels=0.12.2=py39hce5d2b2_0 109 | - suitesparse=5.10.1=hd8046ac_0 110 | - tbb=2020.2=h4bd325d_4 111 | - texttable=1.6.4=pyhd8ed1ab_0 112 | - threadpoolctl=2.2.0=pyh8a188c0_0 113 | - tk=8.6.10=hbc83047_0 114 | - tornado=6.1=py39h3811e60_1 115 | - tzdata=2021a=h5d7bf9c_0 116 | - umap-learn=0.5.1=py39hf3d152e_1 117 | - wheel=0.37.0=pyhd3eb1b0_0 118 | - xorg-libxau=1.0.9=h7f98852_0 119 | - xorg-libxdmcp=1.1.3=h7f98852_0 120 | - xz=5.2.5=h7b6447c_0 121 | - zlib=1.2.11=h7b6447c_3 122 | - zstd=1.5.0=ha95c52a_0 123 | - pip: 124 | - adjusttext==0.7.3 125 | - asciitree==0.3.3 126 | - charset-normalizer==2.0.4 127 | - click==8.0.1 128 | - cython==0.29.24 129 | - demuxem==0.1.6 130 | - docopt==0.6.2 131 | - fasteners==0.16.3 132 | - forceatlas2-python==1.1 133 | - geosketch==1.2 134 | - gprofiler-official==1.0.0 135 | - harmony-pytorch==0.1.6 136 | - hnswlib==0.5.2 137 | - idna==3.2 138 | - intervaltree==2.1.0 139 | - lightgbm==3.2.1 140 | - loompy==3.0.6 141 | - nmf-torch==0.1.1 142 | - numcodecs==0.9.0 143 | - numpy-groupies==0.9.13 144 | - pegasusio==0.3.1.post2 145 | - pegasuspy==1.4.3 146 | - psutil==5.8.0 147 | - pybind11==2.7.1 148 | - requests==2.26.0 149 | - scanorama==1.7.1 150 | - scikit-misc==0.1.4 151 | - sortedcontainers==2.4.0 152 | - torch==1.9.0 153 | - typing-extensions==3.10.0.2 154 | - urllib3==1.26.6 155 | - wordcloud==1.8.1 156 | - xlrd==1.2.0 157 | - xlsxwriter==3.0.1 158 | - zarr==2.9.5 159 | -------------------------------------------------------------------------------- /example-MOp_L5ET/datasets/10x_cells_v2.h5ad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/example-MOp_L5ET/datasets/10x_cells_v2.h5ad -------------------------------------------------------------------------------- /example-MOp_L5ET/datasets/smarter_cells.h5ad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/example-MOp_L5ET/datasets/smarter_cells.h5ad -------------------------------------------------------------------------------- 
/example-MOp_L5ET/datasets/smarter_nuclei.h5ad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/example-MOp_L5ET/datasets/smarter_nuclei.h5ad -------------------------------------------------------------------------------- /example-MOp_L5ET/datasets/snmcseq_gene.h5ad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/example-MOp_L5ET/datasets/snmcseq_gene.h5ad -------------------------------------------------------------------------------- /example-MOp_L5ET/run_scf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SingleCellFusion \ 4 | -i "./datasets/10x_cells_v2.h5ad" \ 5 | "./datasets/smarter_cells.h5ad" \ 6 | "./datasets/smarter_nuclei.h5ad" \ 7 | "./datasets/snmcseq_gene.h5ad" \ 8 | -im "rna" "rna" "rna" "mc" \ 9 | -f "./datasets/10x_cells_v2.h5ad" \ 10 | -o "./results" 11 | -------------------------------------------------------------------------------- /example-wholebrain/00.test_all_preproc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # every normalized by CPMs 3 | 4 | # # mC prepare 5 | # SingleCellFusion_prep \ 6 | # -i \ 7 | # "./datasets/mc/genes_forBICCN2_CEMBA_3C_171206_mCG.h5ad" \ 8 | # "./datasets/mc/genes_forBICCN2_CEMBA_3C_171207_mCG.h5ad" \ 9 | # -icov \ 10 | # "./datasets/mc/genes_forBICCN2_CEMBA_3C_171206_CG.h5ad" \ 11 | # "./datasets/mc/genes_forBICCN2_CEMBA_3C_171207_CG.h5ad" \ 12 | # -inorm "mc" "mc" \ 13 | # -ga "./datasets/genes_biccn2.0.bed" \ 14 | # -o "./processed" \ 15 | # -op "hvg_mc" 16 | 17 | # # ATAC prepare 18 | # SingleCellFusion_prep \ 19 | # -i \ 20 | # "./datasets/atac/CEMBA171206_3C_genes_promo2kb.h5ad" \ 21 | # "./datasets/atac/CEMBA171207_3C_genes_promo2kb.h5ad" \ 22 | # -gi "ensid" \ 23 | # -ci "cell_id" \ 24 | # -inorm "tpm" "tpm" \ 25 | # -sp \ 26 | # -ga "./datasets/genes_promoter_2kb_biccn2.0.bed" \ 27 | # -o "./processed" \ 28 | # -op "hvg_atac" 29 | 30 | # # RNA prepare 31 | # SingleCellFusion_prep \ 32 | # -i \ 33 | # "./datasets/rna/smrt_intron_biccn2.h5ad" \ 34 | # "./datasets/rna/smrt_exon_biccn2.h5ad" \ 35 | # -inorm "cpm" "cpm" \ 36 | # -o "./processed" \ 37 | # -op "hvg_rna" 38 | 39 | # # run scf RNA mC ATAC 40 | # SingleCellFusion \ 41 | # -i \ 42 | # "./processed/hvg_rna_smrt_exon_biccn2.h5ad" \ 43 | # "./processed/hvg_mc_genes_forBICCN2_CEMBA_3C_171206_mCG.h5ad" \ 44 | # "./processed/hvg_mc_genes_forBICCN2_CEMBA_3C_171207_mCG.h5ad" \ 45 | # "./processed/hvg_atac_CEMBA171206_3C_genes_promo2kb.h5ad" \ 46 | # "./processed/hvg_atac_CEMBA171207_3C_genes_promo2kb.h5ad" \ 47 | # -im "rna" "mc" "mc" "atac" "atac"\ 48 | # -f "./processed/hvg_rna_smrt_exon_biccn2.h5ad" \ 49 | # -o "./results" \ 50 | # -op "SingleCellFusion" 51 | 52 | # run RNA intron and exon 53 | SingleCellFusion \ 54 | -i \ 55 | "./processed/hvg_rna_smrt_exon_biccn2.h5ad" \ 56 | "./processed/hvg_rna_smrt_intron_biccn2.h5ad" \ 57 | -im "rna" "rna"\ 58 | -f "./processed/hvg_rna_smrt_exon_biccn2.h5ad" \ 59 | -o "./results" \ 60 | -op "intron_exon" -------------------------------------------------------------------------------- /example-wholebrainatac/normalize_and_select_features.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 
4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "" 12 | ] 13 | }, 14 | "execution_count": 3, 15 | "metadata": {}, 16 | "output_type": "execute_result" 17 | } 18 | ], 19 | "source": [ 20 | "import sys\n", 21 | "import importlib\n", 22 | "sys.path.insert(0, '../scripts')\n", 23 | "\n", 24 | "import numpy as np\n", 25 | "from scipy import sparse\n", 26 | "import time\n", 27 | "import re\n", 28 | "import anndata\n", 29 | "\n", 30 | "from __global_variables import *\n", 31 | "from utils_new import *\n", 32 | "import basic_utils\n", 33 | "importlib.reload(basic_utils)\n", 34 | "import preproc_utils\n", 35 | "importlib.reload(preproc_utils)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "# Task\n", 43 | "- start from prepared files \n", 44 | "```anndata```\n", 45 | "- get and store hvfeatures\n", 46 | "```anndata```" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "# Settings" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 5, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "SRC_DIR = './datasets_pre'\n", 63 | "DST_DIR = './datasets'\n", 64 | "\n", 65 | "sys.path.insert(0, DST_DIR)\n", 66 | "# from __init__datasets import *\n", 67 | "\n", 68 | "\n", 69 | "f_data_format = '{0}/{1}.h5ad'\n", 70 | "f_hvftr_data_format = '{0}/{1}.h5ad'\n", 71 | "\n" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 6, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "mods_selected = [\n", 81 | " 'snatac',\n", 82 | "]\n", 83 | "normalization_options = {\n", 84 | " 'snatac': 'TPM',\n", 85 | "}" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 7, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "name": "stdout", 95 | "output_type": "stream", 96 | "text": [ 97 | "(32285,)\n" 98 | ] 99 | }, 100 | { 101 | "data": { 102 | "text/plain": [ 103 | "ensid\n", 104 | "ENSMUSG00000051951 465597\n", 105 | "ENSMUSG00000089699 46966\n", 106 | "ENSMUSG00000102331 11595\n", 107 | "ENSMUSG00000102343 80476\n", 108 | "ENSMUSG00000025900 409684\n", 109 | "dtype: int64" 110 | ] 111 | }, 112 | "execution_count": 7, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "df_genes = get_gene_annotation().set_index('ensid')\n", 119 | "\n", 120 | "gene_lengths_base = (df_genes['end'] - df_genes['start'])\n", 121 | "print(gene_lengths_base.shape)\n", 122 | "gene_lengths_base.head()" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "# highly variable features" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 8, 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "name": "stdout", 139 | "output_type": "stream", 140 | "text": [ 141 | "snatac\n", 142 | "snatac Reading in files 0.00018596649169921875\n" 143 | ] 144 | }, 145 | { 146 | "ename": "OSError", 147 | "evalue": "Unable to open file (unable to open file: name = './datasets_pre/snatac.h5ad', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)", 148 | "output_type": "error", 149 | "traceback": [ 150 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 151 | "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)", 152 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 44\u001b[0m 
\u001b[0;31m# read in files\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 45\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Reading in files {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0mti\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 46\u001b[0;31m \u001b[0mh5ad_mat\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0manndata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_h5ad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 47\u001b[0m \u001b[0mgid_col\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcid_col\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'ensid'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[0mmeta\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgxc_raw\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbasic_utils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mh5ad_to_scf_rna_format\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mh5ad_mat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgid_col\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcid_col\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 153 | "\u001b[0;32m/cndd2/fangming/venvs/routine/lib/python3.8/site-packages/anndata/readwrite/read.py\u001b[0m in \u001b[0;36mread_h5ad\u001b[0;34m(filename, backed, chunk_size)\u001b[0m\n\u001b[1;32m 445\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 446\u001b[0m \u001b[0;31m# load everything into memory\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 447\u001b[0;31m \u001b[0mconstructor_args\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_read_args_from_h5ad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunk_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 448\u001b[0m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconstructor_args\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 449\u001b[0m \u001b[0mdtype\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 154 | "\u001b[0;32m/cndd2/fangming/venvs/routine/lib/python3.8/site-packages/anndata/readwrite/read.py\u001b[0m in \u001b[0;36m_read_args_from_h5ad\u001b[0;34m(adata, filename, mode, chunk_size)\u001b[0m\n\u001b[1;32m 479\u001b[0m \u001b[0mf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0madata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfile\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_file\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 480\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 481\u001b[0;31m \u001b[0mf\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mh5py\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mFile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'r'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 482\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeys\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 483\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mbacked\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mAnnData\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_BACKED_ATTRS\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 155 | "\u001b[0;32m/cndd2/fangming/venvs/routine/lib/python3.8/site-packages/anndata/h5py/h5sparse.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, name, mode, driver, libver, userblock_size, swmr, force_dense, **kwds)\u001b[0m\n\u001b[1;32m 153\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m \u001b[0;31m# Python 3.5 can’t handle trailing commas here\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 154\u001b[0m ):\n\u001b[0;32m--> 155\u001b[0;31m self.h5f = h5py.File(\n\u001b[0m\u001b[1;32m 156\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 157\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 156 | "\u001b[0;32m/cndd2/fangming/venvs/routine/lib/python3.8/site-packages/h5py/_hl/files.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, name, mode, driver, libver, userblock_size, swmr, rdcc_nslots, rdcc_nbytes, rdcc_w0, track_order, **kwds)\u001b[0m\n\u001b[1;32m 404\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mphil\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 405\u001b[0m \u001b[0mfapl\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmake_fapl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdriver\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlibver\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrdcc_nslots\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrdcc_nbytes\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrdcc_w0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 406\u001b[0;31m fid = make_fid(name, mode, userblock_size,\n\u001b[0m\u001b[1;32m 407\u001b[0m \u001b[0mfapl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfcpl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmake_fcpl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrack_order\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtrack_order\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 408\u001b[0m swmr=swmr)\n", 157 | "\u001b[0;32m/cndd2/fangming/venvs/routine/lib/python3.8/site-packages/h5py/_hl/files.py\u001b[0m in \u001b[0;36mmake_fid\u001b[0;34m(name, mode, userblock_size, fapl, fcpl, swmr)\u001b[0m\n\u001b[1;32m 171\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mswmr\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mswmr_support\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 172\u001b[0m \u001b[0mflags\u001b[0m \u001b[0;34m|=\u001b[0m 
\u001b[0mh5f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mACC_SWMR_READ\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 173\u001b[0;31m \u001b[0mfid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mh5f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflags\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfapl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfapl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 174\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mmode\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'r+'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 175\u001b[0m \u001b[0mfid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mh5f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mh5f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mACC_RDWR\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfapl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfapl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 158 | "\u001b[0;32mh5py/_objects.pyx\u001b[0m in \u001b[0;36mh5py._objects.with_phil.wrapper\u001b[0;34m()\u001b[0m\n", 159 | "\u001b[0;32mh5py/_objects.pyx\u001b[0m in \u001b[0;36mh5py._objects.with_phil.wrapper\u001b[0;34m()\u001b[0m\n", 160 | "\u001b[0;32mh5py/h5f.pyx\u001b[0m in \u001b[0;36mh5py.h5f.open\u001b[0;34m()\u001b[0m\n", 161 | "\u001b[0;31mOSError\u001b[0m: Unable to open file (unable to open file: name = './datasets_pre/snatac.h5ad', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)" 162 | ] 163 | } 164 | ], 165 | "source": [ 166 | "for mod in mods_selected:\n", 167 | " ti = time.time()\n", 168 | " print(mod)\n", 169 | " \n", 170 | " normalization_option = normalization_options[mod]\n", 171 | " # read data matrix\n", 172 | " if normalization_option == 'MC':\n", 173 | " f_data = f_data_format.format(SRC_DIR, mod)\n", 174 | " \n", 175 | " # read in files\n", 176 | " print(mod, \"Reading in files {}\".format(time.time()-ti))\n", 177 | " gxc_raw = snmcseq_utils.load_gc_matrix_methylation(f_data_gene, f_data_cell, f_data_mc, f_data_c)\n", 178 | " print(gxc_raw.data['mc'].shape, gxc_raw.data['c'].shape)\n", 179 | " print(time.time()-ti)\n", 180 | " \n", 181 | " # output file\n", 182 | " f_hvftr_data_methylation = f_hvftr_format.format(DST_DIR, mod, 'tsv') \n", 183 | " print(time.time()-ti)\n", 184 | " \n", 185 | " # check meta cells agree with gxc cells\n", 186 | " assert np.all(meta.index.values == gxc_raw.cell)\n", 187 | " # check genes are uniq \n", 188 | " assert len(gxc_raw.gene) == len(np.unique(gxc_raw.gene)) \n", 189 | " # do\n", 190 | " gxc_hvftr = preproc_utils.preproc_methylation(\n", 191 | " gxc_raw,\n", 192 | " meta,\n", 193 | " global_value_col=settings[mod].global_mean, \n", 194 | " base_call_cutoff=20, \n", 195 | " sufficient_coverage_fraction=0.95,\n", 196 | " hv_percentile=30,\n", 197 | " n_qcut=10,\n", 198 | " )\n", 199 | " # save\n", 200 | " print(mod, \"Saving to files {}\".format(time.time()-ti))\n", 201 | "# gxc_hvftr.to_csv(f_hvftr_data_methylation, sep=\"\\t\", header=True, index=True, na_rep='NA')\n", 202 | " h5ad_mat_hvftr.write(f_hvftr_data, compression='gzip')\n", 203 | " \n", 204 | " else:\n", 205 | " # input, output files\n", 206 | " f_data = f_data_format.format(SRC_DIR, mod,) \n", 207 | " f_hvftr_data = f_hvftr_data_format.format(DST_DIR, mod) \n", 208 | " \n", 209 | " # 
read in files\n", 210 | " print(mod, \"Reading in files {}\".format(time.time()-ti))\n", 211 | " h5ad_mat = anndata.read_h5ad(f_data)\n", 212 | " gid_col, cid_col = 'ensid', ''\n", 213 | " meta, gxc_raw = basic_utils.h5ad_to_scf_rna_format(h5ad_mat, gid_col, cid_col)\n", 214 | " \n", 215 | " # check meta cells agree with gxc cells\n", 216 | " assert np.all(meta.index.values == gxc_raw.cell)\n", 217 | " # check genes are uniq \n", 218 | " assert len(gxc_raw.gene) == len(np.unique(gxc_raw.gene)) \n", 219 | " \n", 220 | " # get hvftrs\n", 221 | " print(mod, \"Preproc and get highly variable genes {}\".format(time.time()-ti))\n", 222 | " if normalization_option == 'CPM':\n", 223 | " gxc_hvftr = preproc_utils.preproc_rna_cpm_based(\n", 224 | " gxc_raw, \n", 225 | " sufficient_cell_coverage=0.01, \n", 226 | " hv_percentile=30, hv_ncut=10)\n", 227 | " elif normalization_option == 'TPM':\n", 228 | " gene_lengths = gene_lengths_base.reindex(gxc_raw.gene)\n", 229 | " gxc_hvftr = preproc_utils.preproc_rna_tpm_based(\n", 230 | " gxc_raw, gene_lengths, impute_gene_lengths=True, \n", 231 | " sufficient_cell_coverage=0.01, \n", 232 | " hv_percentile=30, hv_ncut=10)\n", 233 | " \n", 234 | " # save\n", 235 | " print(mod, \"Saving to file {}\".format(f_hvftr_data, time.time()-ti))\n", 236 | " h5ad_mat_hvftr = basic_utils.scf_rna_format_to_h5ad(meta, gxc_hvftr)\n", 237 | " h5ad_mat_hvftr.write(f_hvftr_data, compression='gzip')\n", 238 | " \n", 239 | " print(mod, \"Total time used: {}\".format(time.time()-ti))\n", 240 | " break\n", 241 | " " 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "## Check highly-variable genes" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 7, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "# for mod in mods_selected:\n", 258 | "# print(mod)\n", 259 | "# if settings[mod].mod_category == 'mc':\n", 260 | "# f_hvftr_data = f_hvftr_format.format(SRC_DIR, mod, 'tsv') \n", 261 | "# gxc_hvftr = pd.read_csv(f_hvftr_data, sep=\"\\t\", index_col=0)\n", 262 | "# print(gxc_hvftr.index.values)\n", 263 | "# print(gxc_hvftr.columns.values)\n", 264 | "# print(gxc_hvftr.shape)\n", 265 | "# has_nan = np.isnan(gxc_hvftr.values).any()\n", 266 | "# print(\"Contains NaN? {}\".format(has_nan))\n", 267 | " \n", 268 | "# continue\n", 269 | " \n", 270 | "# f_hvftr_data = f_hvftr_format.format(SRC_DIR, mod, 'npz') \n", 271 | "# f_hvftr_gene = f_hvftr_format.format(SRC_DIR, mod, 'gene') \n", 272 | "# f_hvftr_cell = f_hvftr_format.format(SRC_DIR, mod, 'cell') \n", 273 | "# gxc_hvftr = snmcseq_utils.load_gc_matrix(f_hvftr_gene, f_hvftr_cell, f_hvftr_data)\n", 274 | "# print(gxc_hvftr.gene)\n", 275 | "# print(gxc_hvftr.cell)\n", 276 | "# print(len(gxc_hvftr.gene), len(gxc_hvftr.cell), gxc_hvftr.data.shape)\n", 277 | "# has_nan = np.isnan(gxc_hvftr.data.data).any()\n", 278 | "# print(\"Contains NaN? 
{}\".format(has_nan))\n", 279 | "# # break" 280 | ] 281 | } 282 | ], 283 | "metadata": { 284 | "kernelspec": { 285 | "display_name": "Python 3", 286 | "language": "python", 287 | "name": "python3" 288 | }, 289 | "language_info": { 290 | "codemirror_mode": { 291 | "name": "ipython", 292 | "version": 3 293 | }, 294 | "file_extension": ".py", 295 | "mimetype": "text/x-python", 296 | "name": "python", 297 | "nbconvert_exporter": "python", 298 | "pygments_lexer": "ipython3", 299 | "version": "3.8.1" 300 | } 301 | }, 302 | "nbformat": 4, 303 | "nbformat_minor": 4 304 | } 305 | -------------------------------------------------------------------------------- /example-wholebrainatac/run_preproc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ga="/cndd/Public_Datasets/BICCN/BICCN2.0_whole_mouse_brain/references/refdata-gex-mm10-2020-A/genes/genes_promoter_2kb_biccn2.0.bed" 4 | 5 | ../scripts/normalize_and_select_features.py \ 6 | -i "./datasets_pre/CEMBA171206_3C_genes_promo2kb.h5ad" \ 7 | -inorm "tpm" \ 8 | -ga $ga \ 9 | -op "test_preproc_may3" \ 10 | -o "./datasets_processed" -------------------------------------------------------------------------------- /example-wholebrainatac/run_scf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ../scripts/SingleCellFusion \ 4 | -i "./datasets/10x_cells_v2.h5ad" "./datasets/snatac.h5ad" \ 5 | -im "rna" "atac" \ 6 | -f "./datasets/10x_cells_v2.h5ad" \ 7 | -op "test_april27" \ 8 | -o "./results" -------------------------------------------------------------------------------- /scf_description.rst: -------------------------------------------------------------------------------- 1 | How does SingleCellFusion work? 2 | ================================ 3 | SingleCellFusion is built around the idea that for a cell profiled by a given omics technique (RNA-sequencing, 4 | snATAC-sequencing, snmC-sequencing) there are unobserved features of that cell that if sampled would 5 | provide a fuller picture of that cell's identity. For example, if a cell underwent RNA-sequencing we know 6 | what genes are expressed but we don't know the patterns of DNA methylation in that same cell. The methylation 7 | status of DNA in that cell is unobserved, limiting our ability to fully understand the identity of that cell. 8 | 9 | In an ideal world we would obtain the transcriptome, methylome, and chromatin accessibility of a single 10 | cell at once, but as the technologies for this type of experiment develop SingleCellFusion can provide a 11 | computational equivalent. SingleCellFusion uses known relationships between different types of multiomics 12 | data to impute unobserved data, enabling the multimodal analysis of a cell's identity. 13 | 14 | The core of SingleCellFusion is the generation of a nearest neighbors graph between different data sets. 15 | This graph is generated by finding nearest neighbors using the correlation of counts at highly variable 16 | features. For example, DNA methylation is known to be negatively correlated with gene expression. If a 17 | snmC-seq profiled cell has low methylation at a number of highly variable genes, and a snRNA-seq profiled 18 | cell has high gene expression at those same genes, we can assume that those two cells likely belong to the 19 | same cell type. We use this nearest neighbors graph to generate imputed counts by averaging among a cell's 20 | neighbors in the opposite modality. 
The actions of SingleCellFusion depend on the type of nearest neighbor
21 | graph specified, and are described below.
22 | 
23 | Direct mutual nearest neighbors
24 | -------------------------------
25 | .. image:: mnn_direct.png
26 |    :width: 400
27 |    :alt: cartoon of direct MNN
28 | 
29 | In this method, highly variable features are identified in each data set. On a cell-to-cell basis, the
30 | correlation of counts at highly variable features is calculated. These correlation values are used
31 | as the distance metric for identifying mutual neighbors.
32 | 
33 | Once the correlation is calculated, neighbors across modalities are determined. We require that
34 | the two cells in each neighbor pair have high correlation with each other. In other words, an snmC-seq profiled
35 | cell can only be a neighbor of an scRNA-seq cell if the methylation levels at the highly variable
36 | features are strongly anti-correlated with gene expression at those same features in the scRNA-seq
37 | profiled cell, and vice versa. This ensures that only strong neighbors are found and that the
38 | nearest neighbors graph is not dominated by noisy or spurious correlations.
39 | 
40 | Once the neighbors graph is generated, imputed counts are computed by the following equation:
41 | 
42 | .. image:: mnn_equation.png
43 |    :width: 400
44 |    :alt: equation for imputation by MNN
45 | 
46 | For cell *j* in modality *m* which has direct mutual nearest neighbors with cells in modality
47 | *m'*, the imputed *m'* counts for feature *f* are given by the average over its *k* nearest
48 | neighbors in modality *m'*.
49 | 
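Written out (the equation image above is the authoritative form; the notation below simply restates the text), the imputed value is a plain average over mutual nearest neighbors:

.. math::

   \hat{x}^{m'}_{j,f} = \frac{1}{k} \sum_{i \in \mathrm{MNN}_{m'}(j)} x^{m'}_{i,f}

where :math:`\mathrm{MNN}_{m'}(j)` denotes the *k* mutual nearest neighbors of cell *j* in modality *m'*.
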
50 | This is the most conservative method for generating imputed counts: only cells that make direct mutual
51 | nearest neighbors will receive imputed data. This method typically leads to good integration but can
52 | result in the loss of large fractions of cells from the analysis if mutual neighbors were not found for them.
53 | 
54 | 
55 | Mutual nearest neighbors with rescue
56 | -------------------------------------
57 | .. image:: mnn_rescue.png
58 |    :width: 400
59 |    :alt: cartoon of rescue MNN
60 | 
61 | As with the direct method, the distance between a pair of cells is their correlation at
62 | highly variable genes. The only difference from the direct method is that in addition to a mutual
63 | nearest neighbors graph between modalities, a mutual nearest neighbor graph within each modality
64 | is also generated. This within-modality graph allows imputation to be performed on all cells, by
65 | using the within-modality neighbors to determine the best matched neighbors across
66 | modalities.
67 | 
68 | .. image:: rescue_equation_1.png
69 |    :width: 400
70 |    :alt: equation 1 of rescue
71 | 
72 | where
73 | 
74 | .. image:: rescue_equation_2.png
75 |    :width: 400
76 |    :alt: equation 2 of rescue
77 | 
78 | For a cell *l* in modality *m*, which has no direct mutual neighbors with cells in modality
79 | *m'*, the imputed *m'* counts for feature *f* are given by a weighted average over its *k*
80 | nearest neighbors in modality *m* which have direct mutual neighbors with cells in modality
81 | *m'*. The cells with direct mutual nearest neighbors have imputed counts per the equation in
82 | "Direct mutual nearest neighbors":
83 | 
84 | .. image:: mnn_equation.png
85 |    :width: 400
86 |    :alt: equation for imputation by MNN
87 | 
88 | The weights *A(l,j)* are determined by the distance *d(l,j)* between *l* and *j*, by the following
89 | equation:
90 | 
91 | .. image:: rescue_equation_3.png
92 |    :width: 400
93 |    :alt: equation 3 of rescue
94 | 
95 | This is a more lenient method for generating imputed counts, as all cells will receive imputed
96 | data. This method enables all cells to be analyzed, and is our recommended approach.
97 | 
98 | k-nearest neighbors
99 | -------------------
100 | .. image:: knn.png
101 |    :width: 400
102 |    :alt: cartoon of kNN
103 | 
104 | Similar to the other methods, the distance metric between a pair of cells is the correlation at
105 | highly variable features. The major difference is that each cell is required to make
106 | *k* neighbors in the opposite modality, while each cell in the opposite modality is
107 | limited to making at most *j* neighbors. The maximum number of neighbors that a cell
108 | in the opposite modality can make is given by the equation:
109 | 
110 | .. image:: n_neighbors_knn.png
111 |    :width: 200
112 |    :alt: equation 1 of knn
113 | 
114 | where *j* is the maximum number of neighbors a cell in modality *m'* can make, *k* is the required
115 | number of nearest neighbors per cell in modality *m*, *n*\ :sub:`m`\ is the number of cells in
116 | modality *m*, and *n*\ :sub:`m'`\ is the number of cells in modality *m'*. *z* is a relaxation
117 | parameter that prevents cells from becoming hyperconnected. The neighbor graph is created by randomly
118 | iterating through each cell and finding its *k* nearest neighbors among the cells still below the maximum
119 | threshold. Once the nearest neighbors graph is generated, imputed counts are generated by the same
120 | equation as in "Direct mutual nearest neighbors":
121 | 
122 | .. image:: mnn_equation.png
123 |    :width: 400
124 |    :alt: equation for imputation by MNN
125 | 
126 | This is the most lenient method for generating imputed counts, as all cells will make neighbors
127 | in the opposite data set.
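
In the implementation (``scripts/SCF_utils.py``, functions ``impute_1pair`` and ``impute_1pair_cca``) this cap is computed as sketched below; ``knn`` corresponds to *k* and ``relaxation`` to *z*, while ``n_m`` and ``n_m_prime`` are illustrative names for the two dataset sizes:

.. code-block:: python

   # cap on the number of neighbors a cell in modality m' can accept,
   # i.e. j = floor(z * k * n_m / n_m') + 1  (cf. maxk_i/maxk_j in impute_1pair)
   maxk = int((n_m / n_m_prime) * knn * relaxation) + 1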
128 | -------------------------------------------------------------------------------- /scripts/SCF_utils.py: -------------------------------------------------------------------------------- 1 | """Utilities for SingleCellFusion 2 | """ 3 | from __init__ import * 4 | 5 | import functools 6 | import collections 7 | import itertools 8 | import re 9 | from scipy import sparse 10 | from scipy.stats import zscore 11 | import fbpca 12 | import sys 13 | import logging 14 | from memory_profiler import profile 15 | from datetime import datetime 16 | 17 | import basic_utils 18 | import clst_utils 19 | 20 | ctime = datetime.now().strftime("%Y%m%d%H%M%S") 21 | f=open('memory_profile_SCFutils_{}.log'.format(ctime), 'w+') 22 | 23 | @profile(stream=f) 24 | def sparse_adj_to_mat(adjs, row_size, col_size, dists=''): 25 | """Turn a knn adjacency matrix to a sparse matrix 26 | """ 27 | n_obs, k = adjs.shape 28 | assert n_obs == row_size 29 | # row col 1/dist 30 | row_inds = np.repeat(np.arange(row_size), k) 31 | col_inds = np.ravel(adjs) 32 | if isinstance(dists, np.ndarray): 33 | assert dists.shape == adjs.shape 34 | data = np.ravel(dists) 35 | else: 36 | data = [1]*len(row_inds) 37 | knn_dist_mat = sparse.coo_matrix((data, (row_inds, col_inds)), shape=(row_size, col_size)) 38 | return knn_dist_mat 39 | 40 | # smooth-within modality 41 | @profile(stream=f) 42 | def smooth_in_modality(counts_matrix, norm_counts_matrix, k, ka, npc=100, sigma=1.0, p=0.1, drop_npc=0): 43 | """Smooth a data matrix 44 | 45 | Arguments: 46 | - counts_matrix (pandas dataframe, feature by cell) 47 | - norm_counts_matrix (pandas dataframe, feature by cell) log10(CPM+1) 48 | - k (number of nearest neighbors) 49 | Return: 50 | - smoothed cells_matrix (pandas dataframe) 51 | - markov affinity matrix 52 | """ 53 | # from sklearn.neighbors import NearestNeighbors 54 | import fbpca 55 | import clst_utils 56 | 57 | assert counts_matrix.shape[1] == norm_counts_matrix.shape[1] 58 | 59 | c = norm_counts_matrix.columns.values 60 | N = len(c) 61 | 62 | # reduce dimension fast version 63 | U, s, Vt = fbpca.pca(norm_counts_matrix.T.values, k=npc) 64 | pcs = U.dot(np.diag(s)) 65 | if drop_npc != 0: 66 | pcs = pcs[:, drop_npc:] 67 | 68 | # get k nearest neighbor distances fast version 69 | inds, dists = clst_utils.gen_knn_annoy(pcs, k, form='list', 70 | metric='euclidean', n_trees=10, search_k=-1, verbose=True, 71 | include_distances=True) 72 | 73 | # remove itself 74 | dists = dists[:, 1:] 75 | inds = inds[:, 1:] 76 | 77 | # normalize by ka's distance 78 | dists = (dists/(dists[:, ka].reshape(-1, 1))) 79 | 80 | # gaussian kernel 81 | adjs = np.exp(-((dists**2)/(sigma**2))) 82 | 83 | # construct a sparse matrix 84 | cols = np.ravel(inds) 85 | rows = np.repeat(np.arange(N), k-1) # remove itself 86 | vals = np.ravel(adjs) 87 | A = sparse.csr_matrix((vals, (rows, cols)), shape=(N, N)) 88 | 89 | # Symmetrize A (union of connection) 90 | A = A + A.T 91 | 92 | # normalization fast (A is now a weight matrix excluding itself) 93 | degrees = A.sum(axis=1) 94 | A = sparse.diags(1.0/np.ravel(degrees)).dot(A) 95 | 96 | # include itself 97 | eye = sparse.identity(N) 98 | A = p*eye + (1-p)*A 99 | 100 | # smooth fast (future?) 
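# (Note: A is row-stochastic at this point: each row mixes a cell's own profile,
# with weight p, with the degree-normalized Gaussian-kernel average of its
# nearest neighbors, with weight 1-p; the line below applies it to the
# cell-by-gene matrix to produce the smoothed profiles.)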
101 | counts_matrix_smoothed = pd.DataFrame((A.dot(counts_matrix.T)).T, 102 | columns=counts_matrix.columns, index=counts_matrix.index) 103 | return counts_matrix_smoothed, A 104 | 105 | # impute across modality 106 | @profile(stream=f) 107 | def get_constrained_knn(mat_norm_j, mat_norm_i, knn, k_saturate, knn_speed_factor=10, metric='dot', verbose=False): 108 | """Get constrained knn 109 | j <- i 110 | Look for kNN in i for each cell in j, cells in i are constrained to k_saturated 111 | 112 | get knn_speed_factor*knn number of nearest neighbors internally 113 | """ 114 | ti = time.time() 115 | assert mat_norm_i.shape[1] == mat_norm_j.shape[1] 116 | knn = int(knn) 117 | knn_speed_factor = int(knn_speed_factor) 118 | 119 | cells_i = np.arange(len(mat_norm_i)) 120 | cells_j = np.arange(len(mat_norm_j)) 121 | 122 | # record cells in j 123 | accepted_knn_ji = [] 124 | accepted_cells = [] 125 | rejected_cells = np.arange(len(cells_j)) 126 | 127 | # record cell in i 128 | n_connects = np.zeros(len(cells_i)).astype(int) # record number of connection for each cell in i 129 | unsaturated = (n_connects < k_saturate) # unsaturated bool 130 | unsaturated_cells = np.arange(len(cells_i))[unsaturated] 131 | 132 | while rejected_cells.size != 0: 133 | if verbose: 134 | print(len(rejected_cells), len(unsaturated_cells), time.time()-ti) 135 | 136 | np.random.shuffle(rejected_cells) # random order 137 | # do something to rejected cells and unsaturated cells 138 | # knn_ji # for each cell in j, its knn in i 139 | knn_ji = clst_utils.gen_knn_annoy_train_test(mat_norm_i.values[unsaturated_cells], # look for nearest neighbors in i 140 | mat_norm_j.values[rejected_cells], # for each row in j 141 | min(knn*knn_speed_factor, len(unsaturated_cells)), # 142 | form='list', # adj matrix 143 | metric=metric, # correlation 144 | n_trees=10, search_k=-1, verbose=False, 145 | include_distances=False, # for now 146 | ).astype(int) 147 | knn_ji = unsaturated_cells[knn_ji] # transform it to global index, need to check this like 148 | 149 | rejected_local_idx = [] 150 | # examine each cell in j 151 | for local_idx, cell in enumerate(rejected_cells): 152 | # get knn in i 153 | knn_in_i = knn_ji[local_idx] 154 | # filter out saturated ones 155 | knn_in_i = knn_in_i[unsaturated[knn_in_i]] 156 | 157 | if knn_in_i.size < knn: 158 | # reject 159 | rejected_local_idx.append(local_idx) 160 | else: 161 | # accept and update 162 | accepted_knn_ji.append(knn_in_i[:knn]) 163 | accepted_cells.append(cell) 164 | n_connects[knn_in_i[:knn]] += 1 165 | unsaturated = (n_connects < k_saturate) # unsaturated bool 166 | 167 | unsaturated_cells = np.arange(len(cells_i))[unsaturated] 168 | rejected_cells = rejected_cells[rejected_local_idx] 169 | # break 170 | 171 | accepted_knn_ji = pd.DataFrame(np.vstack(accepted_knn_ji), index=accepted_cells) 172 | accepted_knn_ji = accepted_knn_ji.sort_index().values 173 | 174 | return accepted_knn_ji 175 | 176 | # 177 | @profile(stream=f) 178 | def impute_1pair_cca(mod_i, mod_j, 179 | smoothed_features_i, smoothed_features_j, 180 | settings, 181 | knn, 182 | relaxation, 183 | n_cca, 184 | output_knn_mat_ij='', 185 | output_knn_mat_ji='', 186 | impute_j=True, 187 | ): 188 | """ 189 | """ 190 | # set up 191 | direct_i, direct_j = settings[mod_i].mod_direction, settings[mod_j].mod_direction 192 | 193 | mat_ii = smoothed_features_i.T # cell in mod i; gene in mod i 194 | mat_jj = smoothed_features_j.T # cell in mod j; gene in mod j 195 | 196 | genes_i = mat_ii.columns.values 197 | genes_j = mat_jj.columns.values 198 
| genes_common = np.intersect1d(genes_i, genes_j) 199 | 200 | cells_i = mat_ii.index.values 201 | cells_j = mat_jj.index.values 202 | 203 | ## CCA euclidean distance 204 | # normalize the feature matrix 205 | X = mat_ii[genes_common].T.apply(basic_utils.zscore, axis=0)*direct_i # gene by cell, zscore across genes 206 | Y = mat_jj[genes_common].T.apply(basic_utils.zscore, axis=0)*direct_j 207 | U, s, Vt = fbpca.pca(X.T.values.dot(Y.values), k=n_cca) 208 | del X, Y 209 | 210 | mat_norm_i = pd.DataFrame(U, index=mat_ii.index) 211 | maxk_i = int((len(cells_j)/len(cells_i))*knn*relaxation)+1 # max number of NN a cell in i can get 212 | mat_norm_j = pd.DataFrame(Vt.T, index=mat_jj.index) 213 | maxk_j = int((len(cells_i)/len(cells_j))*knn*relaxation)+1 # max number of NN a cell in j can get 214 | 215 | if impute_j: 216 | # knn_i and knn_j 217 | # j <- i for each j, get kNN in i 218 | knn_ji = get_constrained_knn(mat_norm_j, mat_norm_i, knn=knn, k_saturate=maxk_i, metric='euclidean') 219 | mat_knn_ji = sparse_adj_to_mat(knn_ji, len(cells_j), len(cells_i)) 220 | 221 | if output_knn_mat_ji: 222 | sparse.save_npz(output_knn_mat_ji, mat_knn_ji) 223 | 224 | # normalize 225 | degrees_j = np.ravel(mat_knn_ji.sum(axis=1)) # for each cell in j, how many cells in i it connects to 226 | mat_knn_ji = sparse.diags(1.0/(degrees_j+1e-7)).dot(mat_knn_ji) 227 | 228 | # imputation both across and within modality 229 | mat_ji = mat_knn_ji.dot(mat_ii) # cell in mod j, gene in mod i 230 | 231 | 232 | # i <- j 233 | knn_ij = get_constrained_knn(mat_norm_i, mat_norm_j, knn=knn, k_saturate=maxk_j, metric='euclidean') 234 | mat_knn_ij = sparse_adj_to_mat(knn_ij, len(cells_i), len(cells_j)) 235 | 236 | if output_knn_mat_ij: 237 | sparse.save_npz(output_knn_mat_ij, mat_knn_ij) 238 | 239 | degrees_i = np.ravel(mat_knn_ij.sum(axis=1)) # for each cell in i, how many cells in j it connects to 240 | mat_knn_ij = sparse.diags(1.0/(degrees_i+1e-7)).dot(mat_knn_ij) 241 | 242 | mat_ij = mat_knn_ij.dot(mat_jj) # cell in mod i, gene in mod j 243 | 244 | if impute_j: 245 | return mat_ij, mat_ji 246 | else: 247 | return mat_ij 248 | 249 | @profile(stream=f) 250 | def impute_1pair(mod_i, mod_j, 251 | smoothed_features_i, smoothed_features_j, 252 | settings, 253 | knn, # 20 254 | relaxation, # 3 255 | output_knn_mat_ij='', 256 | output_knn_mat_ji='', 257 | impute_j=True, 258 | ): 259 | """ 260 | """ 261 | # set up 262 | direct_i, direct_j = settings[mod_i].mod_direction, settings[mod_j].mod_direction 263 | 264 | mat_ii = smoothed_features_i.T # cell in mod i; gene in mod i 265 | mat_jj = smoothed_features_j.T # cell in mod j; gene in mod j 266 | 267 | genes_i = mat_ii.columns.values 268 | genes_j = mat_jj.columns.values 269 | genes_common = np.intersect1d(genes_i, genes_j) 270 | 271 | cells_i = mat_ii.index.values 272 | cells_j = mat_jj.index.values 273 | 274 | ## spearman correlation as distance (rank -> zscore -> (flip sign?) 
-> "dot" distance) 275 | # normalize the feature matrix 276 | mat_norm_i = (mat_ii[genes_common].rank(pct=True, axis=1) 277 | .apply(basic_utils.zscore, axis=1) 278 | *direct_i 279 | ) 280 | mat_norm_j = (mat_jj[genes_common].rank(pct=True, axis=1) 281 | .apply(basic_utils.zscore, axis=1) 282 | *direct_j 283 | ) 284 | maxk_i = int((len(cells_j)/len(cells_i))*knn*relaxation)+1 # max number of NN a cell in i can get 285 | maxk_j = int((len(cells_i)/len(cells_j))*knn*relaxation)+1 # max number of NN a cell in j can get 286 | 287 | if impute_j: 288 | # knn_i and knn_j 289 | # j <- i for each j, get kNN in i 290 | knn_ji = get_constrained_knn(mat_norm_j, mat_norm_i, knn=knn, k_saturate=maxk_i, metric='dot') 291 | mat_knn_ji = sparse_adj_to_mat(knn_ji, len(cells_j), len(cells_i)) 292 | 293 | if output_knn_mat_ji: 294 | sparse.save_npz(output_knn_mat_ji, mat_knn_ji) 295 | 296 | # normalize 297 | degrees_j = np.ravel(mat_knn_ji.sum(axis=1)) # for each cell in j, how many cells in i it connects to 298 | mat_knn_ji = sparse.diags(1.0/(degrees_j+1e-7)).dot(mat_knn_ji) 299 | 300 | # imputation both across and within modality 301 | mat_ji = mat_knn_ji.dot(mat_ii) # cell in mod j, gene in mod i 302 | 303 | 304 | # i <- j 305 | knn_ij = get_constrained_knn(mat_norm_i, mat_norm_j, knn=knn, k_saturate=maxk_j, metric='dot') 306 | mat_knn_ij = sparse_adj_to_mat(knn_ij, len(cells_i), len(cells_j)) 307 | 308 | if output_knn_mat_ij: 309 | sparse.save_npz(output_knn_mat_ij, mat_knn_ij) 310 | 311 | degrees_i = np.ravel(mat_knn_ij.sum(axis=1)) # for each cell in i, how many cells in j it connects to 312 | mat_knn_ij = sparse.diags(1.0/(degrees_i+1e-7)).dot(mat_knn_ij) 313 | 314 | mat_ij = mat_knn_ij.dot(mat_jj) # cell in mod i, gene in mod j 315 | 316 | if impute_j: 317 | return mat_ij, mat_ji 318 | else: 319 | return mat_ij 320 | 321 | @profile(stream=f) 322 | def core_scf_routine(mods_selected, features_selected, settings, 323 | metas, gxc_hvftrs, 324 | ps, drop_npcs, 325 | cross_mod_distance_measure, knn, relaxation, n_cca, 326 | npc, 327 | output_pcX_all, 328 | output_imputed_data_format, 329 | ): 330 | """smooth within modality, impute across modalities, and construct a joint PC matrix 331 | """ 332 | # GENE * CELL !!!! 
333 | smoothed_features = collections.OrderedDict() 334 | logging.info("Smoothing within modalities...") 335 | for mod in mods_selected: 336 | ti = time.time() 337 | if settings[mod].mod_category == 'mc': 338 | _df = gxc_hvftrs[mod] 339 | else: 340 | _mat = gxc_hvftrs[mod].data.todense() 341 | _df = pd.DataFrame(_mat, 342 | index=gxc_hvftrs[mod].gene, 343 | columns=gxc_hvftrs[mod].cell, 344 | ) 345 | npc = min(len(metas[mod]), npc) 346 | k_smooth = min(len(metas[mod]), 30) 347 | ka = 5 348 | if k_smooth >= 2*ka: 349 | mat_smoothed, mat_knn = smooth_in_modality(_df, _df, k=k_smooth, ka=ka, npc=npc, 350 | p=ps[settings[mod].mod_category], 351 | drop_npc=drop_npcs[settings[mod].mod_category]) 352 | smoothed_features[mod] = mat_smoothed 353 | else: 354 | smoothed_features[mod] = _df 355 | logging.info("{} finished in {} seconds".format(mod, time.time()-ti)) 356 | # delete 357 | del gxc_hvftrs[mod] 358 | 359 | # construct a joint matrix (PCA) 360 | logging.info("Constructing a joint matrix...") 361 | cells_all = np.hstack([metas[mod].index.values for mod in mods_selected]) # cell (all mods) 362 | pcX_all = [] 363 | for mod_y in features_selected: ## to 364 | logging.info("Imputing into {} space...".format(mod_y)) 365 | # get all_features 366 | X = [] 367 | for mod_x in mods_selected: 368 | logging.info("for {} cells...".format(mod_x)) 369 | if mod_x == mod_y: 370 | smoothed_yy = smoothed_features[mod_y].T # gene by cell !!! VERY IMPORTANT 371 | X.append(smoothed_yy) 372 | else: 373 | # impute x cells y space 374 | smoothed_features_x = smoothed_features[mod_x] 375 | smoothed_features_y = smoothed_features[mod_y] 376 | if cross_mod_distance_measure == 'correlation': 377 | imputed_xy = impute_1pair(mod_x, mod_y, 378 | smoothed_features_x, smoothed_features_y, 379 | settings, 380 | knn=knn, 381 | relaxation=relaxation, 382 | impute_j=False, 383 | ) 384 | elif cross_mod_distance_measure == 'cca': 385 | imputed_xy = impute_1pair_cca(mod_x, mod_y, 386 | smoothed_features_x, smoothed_features_y, 387 | settings, 388 | knn=knn, 389 | relaxation=relaxation, 390 | n_cca=n_cca, 391 | impute_j=False, 392 | ) 393 | else: 394 | raise ValueError("Choose from correlation and cca") 395 | X.append(imputed_xy) 396 | X = np.vstack(X) # cell (all mods) by gene (mod_y) 397 | # save X (imputed counts; for debuggng only) 398 | if len(output_imputed_data_format)>0: 399 | np.save(output_imputed_data_format.format(mod_y), X) 400 | # PCA 401 | U, s, V = fbpca.pca(X, npc) 402 | del X 403 | pcX = U.dot(np.diag(s)) 404 | # normalize PCs 405 | sigma = np.sqrt(np.sum(s*s)/(pcX.shape[0]*pcX.shape[1])) 406 | pcX = pcX/sigma 407 | pcX_all.append(pcX) 408 | 409 | pcX_all = np.hstack(pcX_all) 410 | # save pcX_all 411 | df_pcX = pd.DataFrame( 412 | pcX_all, 413 | index=cells_all, 414 | columns=['PC'+str(i+1) for i in np.arange(pcX_all.shape[1])], 415 | ) 416 | df_pcX.index.name = 'cell_id' 417 | df_pcX.to_csv( 418 | output_pcX_all, 419 | sep='\t', index=True, header=True, 420 | ) 421 | logging.info("Saved output to: {}".format(output_pcX_all)) 422 | return pcX_all, cells_all 423 | 424 | @profile(stream=f) 425 | def clustering_umap_routine(pcX_all, cells_all, mods_selected, metas, 426 | resolutions, k, 427 | umap_neighbors, min_dist, 428 | output_clst_and_umap, 429 | use_netUMAP=False, 430 | use_tsne=False, 431 | cluster_only=False, 432 | ): 433 | """ 434 | """ 435 | # clustering 436 | df_clsts = [] 437 | for resolution in resolutions: 438 | logging.info('resolution r: {}'.format(resolution)) 439 | df_clst = 
clst_utils.clustering_routine( 440 | pcX_all, 441 | cells_all, k, 442 | resolution=resolution, 443 | metric='euclidean', option='plain', n_trees=10, search_k=-1, verbose=False) 444 | df_clsts.append(df_clst.rename(columns={'cluster': 445 | 'cluster_joint_r{}'.format(resolution) 446 | })) 447 | df_clst = pd.concat(df_clsts, axis=1) 448 | 449 | df_summary = df_clst 450 | # umap 451 | if not cluster_only: 452 | df_embed = clst_utils.run_umap_lite( 453 | pcX_all, 454 | cells_all, 455 | n_neighbors=umap_neighbors, min_dist=min_dist, n_dim=2, 456 | random_state=1, 457 | use_netUMAP=use_netUMAP, 458 | use_tsne=use_tsne, 459 | ) 460 | df_summary = df_summary.join(df_embed) 461 | # add dataset info 462 | df_summary['dataset'] = '' 463 | for mod in mods_selected: 464 | _cells = metas[mod].index.values 465 | df_summary.loc[_cells, 'dataset'] = mod 466 | # name 467 | df_summary.index.name = 'cell_id' 468 | # save results 469 | df_summary.to_csv( 470 | output_clst_and_umap, 471 | sep='\t', header=True, index=True, 472 | ) 473 | return df_summary 474 | -------------------------------------------------------------------------------- /scripts/SingleCellFusion: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """SingleCellFusion main routine""" 3 | 4 | from __init__ import * 5 | # public packages 6 | import collections 7 | import os 8 | import warnings 9 | with warnings.catch_warnings(): 10 | warnings.filterwarnings("ignore", category=FutureWarning) 11 | import anndata 12 | 13 | import logging 14 | from memory_profiler import profile 15 | from datetime import datetime 16 | 17 | # scripts from this package 18 | import cli_parser 19 | import basic_utils 20 | import SCF_utils 21 | 22 | ctime = datetime.now().strftime("%Y%m%d%H%M%S") 23 | f=open('memory_profile_SCF_{}.log'.format(ctime), 'w+') 24 | @profile(stream=f) 25 | def main(): 26 | parser = cli_parser.create_parser() 27 | args = parser.parse_args() 28 | 29 | log = basic_utils.create_logger() 30 | logging.info('* Parsing Command Line Arguments') 31 | 32 | # specify output filenames 33 | outdir = args.output_dir 34 | if not os.path.isdir(outdir): 35 | os.makedirs(outdir) 36 | name = args.output_prefix 37 | 38 | output_clst_and_umap = outdir + '/{}_assigned_clusters_embeddings.tsv.gz'.format(name) 39 | output_pcX_all = outdir + '/{}_principal_components.tsv.gz'.format(name) 40 | output_figures = outdir + '/{}_{{}}.{{}}'.format(name) 41 | 42 | ### --- outputs for debugging only 43 | output_imputed_data_format = '' # leave it blank or set it to be: outdir + '/{}_imputed_data_{{}}.npy'.format(name) 44 | # output_cluster_centroids = outdir + '/{}_centroids.pkl'.format(name) # not used 45 | ### --- end 46 | 47 | # get input files, modaltiies (internal rep of input files), and feature datasets 48 | data_files = args.input_datasets 49 | feature_files = args.feature_datasets 50 | mods_selected = [cli_parser.parse_filename(data_file) for data_file in data_files] 51 | features_selected = [cli_parser.parse_filename(data_file) for data_file in feature_files] 52 | for features_modality in features_selected: 53 | assert (features_modality in mods_selected) 54 | 55 | # get dataset metadata 56 | mod_catgories = args.input_modalities 57 | assert len(mod_catgories) == len(data_files) 58 | 59 | for mod_category in mod_catgories: 60 | assert (mod_category in ['mc', 'atac', 'rna']) 61 | settings = collections.OrderedDict() 62 | Mod_info = collections.namedtuple('Mod_info', ['mod', 'mod_category', 'mod_direction',]) 
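# Build per-dataset settings: each modality keeps its category ('mc'/'atac'/'rna')
# and a sign ("direction") used when correlating features across modalities;
# per the docs, mC is anti-correlated with expression, so its direction is
# expected to flip the sign relative to rna/atac.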
63 | for mod, mod_category in zip(mods_selected, mod_catgories): 64 | mod_direction = cli_parser.modality_default_options(mod_category) 65 | settings[mod] = Mod_info(mod, mod_category, mod_direction,) 66 | 67 | # parameters 68 | # Within modality 69 | ps = { 70 | 'rna': args.smoothing_fractions[0], 71 | 'atac': args.smoothing_fractions[1], 72 | 'mc': args.smoothing_fractions[2], 73 | } 74 | 75 | # across modality 76 | knn = args.nearest_neighbors 77 | relaxation = args.relaxation 78 | # PCA 79 | npc = args.num_pcs 80 | # clustering 81 | k = args.leiden_n_neighbors 82 | resolutions = args.leiden_resolutions 83 | # umap 84 | umap_neighbors = args.umap_n_neighbors 85 | min_dist = args.umap_min_dist 86 | 87 | # precomputed_pca (skip integration) 88 | precomputed_pca_file = args.precomputed_pca_file 89 | # use netUMAP 90 | use_netUMAP = args.use_netUMAP 91 | use_tsne = args.use_tsne 92 | 93 | ### --- deprecated arguments (for testing; not open to general users) 94 | n_cca = 0 # deprecated args.n_cca 95 | drop_npcs = { 96 | 'mc': 0, 97 | 'rna': 0, 98 | 'atac': 0, 99 | } 100 | cross_mod_distance_measure = 'correlation' # or 'cca' 101 | ### --- end of deprecation 102 | logging.info( 103 | "knn = {}\n".format(knn) + 104 | "relaxation = {}\n".format(relaxation) + 105 | "number of PCs = {}\n".format(npc) + 106 | "ps = {}\n".format(ps) + 107 | "umap_n_neighbors = {}\n".format(umap_neighbors) + 108 | "umap_min_dist = {}\n".format(min_dist) + 109 | "leiden_resolutions = {}\n".format(resolutions) + 110 | "leiden_n_neighbors = {}\n".format(k) 111 | ) 112 | 113 | # ## Read in data 114 | logging.info('* Begin integration') 115 | ### read in data (h5ad) 116 | metas = collections.OrderedDict() 117 | gxc_hvftrs = collections.OrderedDict() 118 | for mod, _file in zip(mods_selected, data_files): 119 | logging.info("processing {}".format(mod)) 120 | # read 121 | logging.info("reading {}".format(_file)) 122 | h5ad_mat = anndata.read_h5ad(_file) 123 | h5ad_mat.obs.index = [cell+"_"+mod for cell in h5ad_mat.obs.index] # resolve possible cellid conflict across datasets 124 | 125 | if settings[mod].mod_category == 'mc': 126 | # convert 127 | meta, mat = basic_utils.h5ad_to_scf_mc_format(h5ad_mat) 128 | assert np.all(mat.columns.values == meta.index.values) # make sure cell name is in the sanme order as metas (important if save knn mat) 129 | logging.info("{} genes, {} cells in the feature matrix".format(*mat.shape)) 130 | 131 | metas[mod] = meta 132 | gxc_hvftrs[mod] = mat 133 | 134 | else: 135 | # convert 136 | meta, gc_mat = basic_utils.h5ad_to_scf_rna_format(h5ad_mat) 137 | assert np.all(gc_mat.cell == meta.index.values) # make sure cell name is in the sanme order as metas (important if save knn mat) 138 | logging.info("{} genes, {} cells in the feature matrix".format(*gc_mat.data.shape)) 139 | 140 | metas[mod] = meta 141 | gxc_hvftrs[mod] = gc_mat 142 | 143 | logging.info('Done reading data') 144 | 145 | # ## run SCF to get integrated PCA 146 | if os.path.isfile(precomputed_pca_file): 147 | logging.info('Loading precomputed PCA matrix') 148 | precomputed_pca_df = pd.read_csv(precomputed_pca_file, sep='\t', index_col=0) 149 | pcX_all = precomputed_pca_df.values 150 | cells_all = precomputed_pca_df.index.values 151 | else: 152 | pcX_all, cells_all = SCF_utils.core_scf_routine(mods_selected, features_selected, settings, 153 | metas, gxc_hvftrs, 154 | ps, drop_npcs, 155 | cross_mod_distance_measure, knn, relaxation, n_cca, 156 | npc, 157 | output_pcX_all, 158 | output_imputed_data_format, 159 | ) 160 | 
logging.info('Done integration into a common PC space') 161 | 162 | # run clustering and imputation 163 | df_summary = SCF_utils.clustering_umap_routine(pcX_all, cells_all, mods_selected, metas, 164 | resolutions, k, 165 | umap_neighbors, min_dist, 166 | output_clst_and_umap, 167 | use_netUMAP=use_netUMAP, 168 | use_tsne=use_tsne, 169 | ) 170 | logging.info('Done clustering and UMAP') 171 | 172 | if __name__ == "__main__": 173 | main() -------------------------------------------------------------------------------- /scripts/SingleCellFusion_prep: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding: utf-8 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from scipy import sparse 7 | import time 8 | import re 9 | import warnings 10 | with warnings.catch_warnings(): 11 | warnings.filterwarnings("ignore", category=FutureWarning) 12 | 13 | import anndata 14 | import scanpy 15 | import logging 16 | import os 17 | 18 | from __init__ import * 19 | import basic_utils 20 | import preproc_utils 21 | import cli_parser 22 | 23 | def get_gene_annotation(gene_annotation_file): 24 | """ 25 | """ 26 | genes = pd.read_csv( 27 | gene_annotation_file, 28 | sep='\t', 29 | header=None, 30 | usecols=[0,1,2,3], 31 | ).rename(columns={ 32 | 0: 'chr', 33 | 1: 'start', 34 | 2: 'end', 35 | 3: 'ensid', 36 | }) 37 | return genes 38 | 39 | def preproc( 40 | f_data, 41 | f_hvftr_data, 42 | normalization_option, 43 | sub_n=None, 44 | sub_frac=None, 45 | f_cov_data='', 46 | gene_lengths_base='', # required if normalization option == "tpm" 47 | gid_col='', 48 | cid_col='', 49 | global_mean_mc_col='', # required if normalization option == 'mc' 50 | ): 51 | """Generate normalized HVG matrices from raw count matrices 52 | 53 | normalization_option == 'mc' needs f_cov_data 54 | """ 55 | # # highly variable features 56 | ti = time.time() 57 | logging.info("Preprocessing") 58 | 59 | # read data matrix 60 | if normalization_option == 'mc': 61 | # read in files 62 | logging.info("Reading in file {}".format(f_data)) 63 | h5ad_mat = anndata.read_h5ad(f_data) 64 | ### subsampling ### 65 | if sub_n is not None or sub_frac is not None: 66 | logging.info("Subsampling to n={} frac={}".format(sub_n, sub_frac)) 67 | scanpy.pp.subsample(h5ad_mat, n_obs=sub_n, fraction=sub_frac, random_state=0) 68 | ### end of subsampling ### 69 | logging.info("matrix size = {}".format(h5ad_mat.shape)) 70 | meta, mat_mc = basic_utils.h5ad_to_scf_mc_format(h5ad_mat) 71 | 72 | logging.info("Reading in file {}".format(f_cov_data)) 73 | h5ad_mat = anndata.read_h5ad(f_cov_data) 74 | ### subsampling ### 75 | if sub_n is not None or sub_frac is not None: 76 | logging.info("Subsampling to n={} frac={}".format(sub_n, sub_frac)) 77 | scanpy.pp.subsample(h5ad_mat, n_obs=sub_n, fraction=sub_frac, random_state=0) 78 | ### end of subsampling ### 79 | logging.info("matrix size = {}".format(h5ad_mat.shape)) 80 | meta, mat_c = basic_utils.h5ad_to_scf_mc_format(h5ad_mat) 81 | 82 | assert mat_mc.shape == mat_c.shape 83 | assert np.all(mat_mc.values <= mat_c.values) 84 | 85 | gxc_raw = GC_matrix( 86 | mat_mc.index.values, 87 | mat_mc.columns.values, 88 | {'mc': mat_mc.values, 'c': mat_c.values}, 89 | ) 90 | 91 | # check meta cells agree with gxc cells 92 | assert np.all(meta.index.values == gxc_raw.cell) 93 | # check genes are uniq 94 | assert len(gxc_raw.gene) == len(np.unique(gxc_raw.gene)) 95 | # check cells are uniq 96 | assert len(gxc_raw.cell) == len(np.unique(gxc_raw.cell)) 97 | 98 | # do 99 | 
gxc_hvftr = preproc_utils.preproc_methylation( 100 | gxc_raw, 101 | meta, 102 | global_value_col=global_mean_mc_col, 103 | base_call_cutoff=20, 104 | sufficient_coverage_fraction=0.95, 105 | hv_percentile=30, 106 | n_qcut=10, 107 | ) 108 | 109 | # save 110 | logging.info("Saving to file {}".format(f_hvftr_data)) 111 | h5ad_mat_hvftr = basic_utils.scf_mc_format_to_h5ad(meta, gxc_hvftr) 112 | h5ad_mat_hvftr.write(f_hvftr_data, compression='gzip') 113 | 114 | else: 115 | # read in files 116 | logging.info("Reading in file {}".format(f_data)) 117 | h5ad_mat = anndata.read_h5ad(f_data) 118 | ### subsampling ### 119 | if sub_n is not None or sub_frac is not None: 120 | logging.info("Subsampling to n={} frac={}".format(sub_n, sub_frac)) 121 | scanpy.pp.subsample(h5ad_mat, n_obs=sub_n, fraction=sub_frac, random_state=0) 122 | ### end of subsampling ### 123 | logging.info("matrix size = {}".format(h5ad_mat.shape)) 124 | if tosparse: 125 | h5ad_mat.X = sparse.coo_matrix(h5ad_mat.X) 126 | meta, gxc_raw = basic_utils.h5ad_to_scf_rna_format(h5ad_mat, gid_col, cid_col) 127 | 128 | # check meta cells agree with gxc cells 129 | assert np.all(meta.index.values == gxc_raw.cell) 130 | # check genes are uniq 131 | assert len(gxc_raw.gene) == len(np.unique(gxc_raw.gene)) 132 | # check cells are uniq 133 | assert len(gxc_raw.cell) == len(np.unique(gxc_raw.cell)) 134 | 135 | # get hvftrs 136 | logging.info("Preproc and get highly variable genes {}".format(f_data)) 137 | if normalization_option == 'cpm': 138 | gxc_hvftr = preproc_utils.preproc_rna_cpm_based( 139 | gxc_raw, 140 | sufficient_cell_coverage=0.01, 141 | hv_percentile=30, hv_ncut=10) 142 | elif normalization_option == 'tpm': 143 | gene_lengths = gene_lengths_base.reindex(gxc_raw.gene) 144 | gxc_hvftr = preproc_utils.preproc_rna_tpm_based( 145 | gxc_raw, gene_lengths, impute_gene_lengths=True, 146 | sufficient_cell_coverage=0.01, 147 | hv_percentile=30, hv_ncut=10) 148 | 149 | # save 150 | logging.info("Saving to file {}".format(f_hvftr_data)) 151 | h5ad_mat_hvftr = basic_utils.scf_rna_format_to_h5ad(meta, gxc_hvftr) 152 | h5ad_mat_hvftr.write(f_hvftr_data, compression='gzip') 153 | return 154 | 155 | if __name__ == "__main__": 156 | log = basic_utils.create_logger() 157 | 158 | parser = cli_parser.create_parser_preproc() 159 | args = parser.parse_args() 160 | logging.info('* Parsing Command Line Arguments') 161 | 162 | # get input files 163 | data_files = args.input_datasets 164 | data_cov_files = args.input_datasets_coverage 165 | mods_selected = [cli_parser.parse_filename(data_file) for data_file in data_files] 166 | gid_col = args.geneid_column 167 | cid_col = args.cellid_column 168 | global_mean_mc_col = args.global_mean_mc_column 169 | tosparse = args.tosparse 170 | 171 | # specify output files 172 | outdir = args.output_dir 173 | if not os.path.isdir(outdir): 174 | os.makedirs(outdir) 175 | outprefix = args.output_prefix 176 | 177 | output_files = [ 178 | os.path.join(outdir, "{}_{}".format(outprefix, os.path.basename(input_file))) 179 | for input_file in data_files 180 | ] 181 | 182 | # parameters 183 | gene_annotation_file = args.gene_annotation_file 184 | 185 | # get dataset normalizations 186 | input_normalizations = args.input_normalizations 187 | 188 | # subsampling 189 | sub_n = args.sub_n 190 | sub_frac = args.sub_frac 191 | 192 | # check and set up 193 | gene_lengths_base = '' 194 | for option in input_normalizations: 195 | assert (option in ['mc', 'cpm', 'tpm']) 196 | if option == 'mc': 197 | assert len(data_cov_files) == 
len(data_files) 198 | elif option == 'tpm': 199 | assert gene_annotation_file 200 | df_genes = get_gene_annotation(gene_annotation_file).set_index('ensid') 201 | gene_lengths_base = (df_genes['end'] - df_genes['start']) 202 | 203 | for i, (data_file, output_file, norm_option) in enumerate(zip( 204 | data_files, output_files, input_normalizations 205 | )): 206 | 207 | if norm_option == 'mc': 208 | data_cov_file = data_cov_files[i] 209 | else: 210 | data_cov_file = '' 211 | 212 | preproc( 213 | data_file, 214 | output_file, 215 | norm_option, 216 | sub_n=sub_n, 217 | sub_frac=sub_frac, 218 | gene_lengths_base=gene_lengths_base, # required if normalization option == "tpm" 219 | f_cov_data=data_cov_file, 220 | gid_col=gid_col, 221 | cid_col=cid_col, 222 | global_mean_mc_col=global_mean_mc_col, 223 | ) -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- 1 | """Import commonly used libraries""" 2 | 3 | import time 4 | import logging 5 | import glob 6 | import os 7 | import numpy as np 8 | import pandas as pd 9 | import collections 10 | # from natsort import natsorted 11 | 12 | # matplotlib 13 | import matplotlib as mpl 14 | import matplotlib.pyplot as plt 15 | mpl.rcParams['pdf.fonttype'] = 42 # editable text in matplotlib 16 | mpl.rcParams['svg.fonttype'] = 'none' 17 | 18 | import matplotlib.ticker as mtick 19 | PercentFormat = mtick.FuncFormatter(lambda y, _: '{:.3%}'.format(y)) 20 | ScalarFormat = mtick.ScalarFormatter() 21 | 22 | # seaborn 23 | import seaborn as sns 24 | sns.set_style('ticks', rc={'axes.grid':True}) 25 | sns.set_context('talk') 26 | 27 | # data structures 28 | GC_matrix = collections.namedtuple('GC_matrix', ['gene', 'cell', 'data']) 29 | -------------------------------------------------------------------------------- /scripts/basic_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | """ 3 | from __init__ import * 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | import os 9 | from scipy import sparse 10 | import anndata 11 | import logging 12 | 13 | def get_size_in_GB(obj): 14 | """""" 15 | GB = 1024**3 16 | return sys.getsizeof(obj)/GB 17 | 18 | def scf_rna_format_to_h5ad(meta, gc_mat): 19 | """ 20 | input: 21 | - meta (cell metadata) 22 | - gc_mat 23 | 24 | output: 25 | - anndata 26 | """ 27 | X = gc_mat.data.T # cell by gene [scipy sparse matrix] 28 | obs = meta # cell annotation [pandas dataframe] 29 | var = pd.DataFrame(index=gc_mat.gene) # gene annotation [pandas dataframe] 30 | 31 | h5ad_mat = anndata.AnnData(X, obs, var,) 32 | 33 | return h5ad_mat 34 | 35 | def scf_mc_format_to_h5ad(meta, mat): 36 | """ 37 | input: 38 | - meta (cell metadata) 39 | - mat 40 | 41 | output: 42 | - anndata 43 | """ 44 | X = mat.T.values # cell by gene [numpy array] 45 | obs = meta # cell annotation [pandas dataframe] 46 | var = pd.DataFrame(index=mat.index) # gene annotation [pandas dataframe] 47 | 48 | h5ad_mat = anndata.AnnData(X, obs, var,) 49 | 50 | return h5ad_mat 51 | 52 | def h5ad_to_scf_rna_format(h5ad_mat, gid_col='', cid_col=''): 53 | """ 54 | input: 55 | - anndata 56 | output: 57 | - meta (cell metadata) 58 | - gc_mat 59 | 60 | """ 61 | meta = h5ad_mat.obs 62 | if gid_col: 63 | genes = h5ad_mat.var[gid_col].values 64 | else: 65 | genes = h5ad_mat.var.index.values 66 | if cid_col: 67 | cells = h5ad_mat.obs[cid_col].values 68 | meta = meta.set_index(cid_col) 69 | else: 70 | cells = 
h5ad_mat.obs.index.values 71 | 72 | gc_mat = GC_matrix(genes, 73 | cells, 74 | h5ad_mat.X.T, 75 | ) 76 | return meta, gc_mat 77 | 78 | def h5ad_to_scf_mc_format(h5ad_mat, gid_col='', cid_col=''): 79 | """ 80 | input: 81 | - anndata 82 | output: 83 | - meta (cell metadata) 84 | - pandas data frame 85 | """ 86 | 87 | meta = h5ad_mat.obs 88 | if gid_col: 89 | genes = h5ad_mat.var[gid_col].values 90 | else: 91 | genes = h5ad_mat.var.index.values 92 | if cid_col: 93 | cells = h5ad_mat.obs[cid_col].values 94 | meta = meta.set_index(cid_col) 95 | else: 96 | cells = h5ad_mat.obs.index.values 97 | mat = pd.DataFrame(h5ad_mat.X.T, 98 | index=genes, 99 | columns=cells, 100 | ) 101 | return meta, mat 102 | 103 | def diag_matrix(X, rows=np.array([]), cols=np.array([]), threshold=None): 104 | """Diagonalize a matrix as much as possible 105 | """ 106 | di, dj = X.shape 107 | transposed = 0 108 | 109 | if di > dj: 110 | di, dj = dj, di 111 | X = X.T.copy() 112 | rows, cols = cols.copy(), rows.copy() 113 | transposed = 1 114 | 115 | # start (di <= dj) 116 | new_X = X.copy() 117 | new_rows = rows.copy() 118 | new_cols = cols.copy() 119 | if new_rows.size == 0: 120 | new_rows = np.arange(di) 121 | if new_cols.size == 0: 122 | new_cols = np.arange(dj) 123 | 124 | # bring the greatest values in the lower right matrix to diagnal position 125 | for idx in range(min(di, dj)): 126 | 127 | T = new_X[idx: , idx: ] 128 | i, j = np.unravel_index(T.argmax(), T.shape) # get the coords of the max element of T 129 | 130 | if threshold and T[i, j] < threshold: 131 | dm = idx # new_X[:dm, :dm] is done (0, 1, ..., dm-1) excluding dm 132 | break 133 | else: 134 | dm = idx+1 # new_X[:dm, :dm] will be done 135 | 136 | # swap row idx, idx+i 137 | tmp = new_X[idx, :].copy() 138 | new_X[idx, :] = new_X[idx+i, :].copy() 139 | new_X[idx+i, :] = tmp 140 | 141 | tmp = new_rows[idx] 142 | new_rows[idx] = new_rows[idx+i] 143 | new_rows[idx+i] = tmp 144 | 145 | # swap col idx, idx+j 146 | tmp = new_X[:, idx].copy() 147 | new_X[:, idx] = new_X[:, idx+j].copy() 148 | new_X[:, idx+j] = tmp 149 | 150 | tmp = new_cols[idx] 151 | new_cols[idx] = new_cols[idx+j] 152 | new_cols[idx+j] = tmp 153 | 154 | # 155 | if dm == dj: 156 | pass 157 | elif dm < dj: # free columns 158 | 159 | col_dict = {} 160 | sorted_col_idx = np.arange(dm) 161 | free_col_idx = np.arange(dm, dj) 162 | linked_rowcol_idx = new_X[:, dm:].argmax(axis=0) 163 | 164 | for col in sorted_col_idx: 165 | col_dict[col] = [col] 166 | for col, key in zip(free_col_idx, linked_rowcol_idx): 167 | if key < dm: 168 | col_dict[key] = col_dict[key] + [col] 169 | else: 170 | col_dict[key] = [col] 171 | 172 | 173 | new_col_order = np.hstack([col_dict[key] for key in sorted(col_dict.keys())]) 174 | 175 | # update new_X new_cols 176 | new_X = new_X[:, new_col_order].copy() 177 | new_cols = new_cols[new_col_order] 178 | else: 179 | raise ValueError("Unexpected situation: dm > dj") 180 | 181 | if transposed: 182 | new_X = new_X.T 183 | new_rows, new_cols = new_cols, new_rows 184 | return new_X, new_rows, new_cols 185 | 186 | def diag_matrix_rows(X, rows=np.array([]), cols=np.array([]),): 187 | """Diagonalize a matrix as much as possible by only rearrange rows 188 | """ 189 | di, dj = X.shape 190 | 191 | new_X = X.copy() 192 | new_rows = rows.copy() 193 | new_cols = cols.copy() 194 | 195 | # free to move rows 196 | row_dict = {} 197 | free_row_idx = np.arange(di) 198 | linked_rowcol_idx = new_X.argmax(axis=1) # the column with max value for each row 199 | 200 | for row, key in zip(free_row_idx, 
200 |     for row, key in zip(free_row_idx, linked_rowcol_idx):
201 |         if key in row_dict.keys():
202 |             row_dict[key] = row_dict[key] + [row]
203 |         else:
204 |             row_dict[key] = [row]
205 | 
206 |     new_row_order = np.hstack([row_dict[key] for key in sorted(row_dict.keys())])
207 |     # update new_X new_cols
208 |     new_X = new_X[new_row_order, :].copy()
209 |     new_rows = new_rows[new_row_order]
210 | 
211 |     return new_X, new_rows, new_cols
212 | 
213 | def get_grad_colors(n, cmap='copper'):
214 |     """Generate n colors from a given colormap (a matplotlib.cm)
215 |     """
216 |     from matplotlib import cm
217 |     cmap = cm.get_cmap(cmap)
218 |     return [cmap(int(i)) for i in np.linspace(0, 255, n)]
219 | 
220 | def logcpm(counts):
221 |     """
222 |     Args:
223 |         - gene-cell matrix
224 |     """
225 |     cov = counts.sum(axis=0)
226 |     logcpm = np.log10(counts.divide(cov, axis=1)*1000000 + 1)
227 |     return logcpm
228 | 
229 | def logtpm(counts, gene_lengths):
230 |     """
231 |     Args:
232 |         - gene-cell matrix
233 |         - gene_lengths: a series indexed by gene_id
234 |     """
235 |     tpm = counts.divide(gene_lengths.loc[counts.index], axis=0)
236 |     cov = tpm.sum(axis=0)
237 |     logtpm = np.log10((tpm.divide(cov, axis=1))*1000000 + 1)
238 |     return logtpm
239 | 
240 | def sparse_logcpm(gc_matrix, mode='logcpm', lib_size=[]):
241 |     """
242 |     """
243 |     lib_size = np.array(lib_size)
244 |     if np.size(lib_size) == 0:
245 |         lib_size = gc_matrix.data.sum(axis=0)
246 | 
247 |     lib_size_inv = sparse.diags(np.ravel(1.0/(1e-7+lib_size)))
248 |     cpm = (gc_matrix.data).dot(lib_size_inv*1e6).tocoo()
249 | 
250 |     if mode == 'logcpm':
251 |         cpm.data = np.log10(cpm.data + 1)
252 |     elif mode == 'cpm':
253 |         pass
254 | 
255 |     gc_cpm = GC_matrix(
256 |         gc_matrix.gene,
257 |         gc_matrix.cell,
258 |         cpm,
259 |     )
260 | 
261 |     return gc_cpm
262 | 
263 | def sparse_logtpm(gc_matrix, gene_lengths):
264 |     """
265 |     gene_lengths: array like
266 | 
267 |     """
268 |     gene_lengths = np.array(gene_lengths)
269 |     gene_length_inv = sparse.diags(np.ravel(1.0/gene_lengths))
270 |     tmp = (gene_length_inv).dot(gc_matrix.data).tocoo()
271 |     lib_size_inv = sparse.diags(np.ravel(1.0/tmp.sum(axis=0)))
272 | 
273 |     logtpm = tmp.dot(lib_size_inv*1e6).tocoo()
274 |     logtpm.data = np.log10(logtpm.data + 1)
275 | 
276 |     gc_logtpm = GC_matrix(
277 |         gc_matrix.gene,
278 |         gc_matrix.cell,
279 |         logtpm,
280 |     )
281 | 
282 |     return gc_logtpm
283 | 
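# Illustrative sketch (hypothetical toy numbers, not part of the original module):
# sparse_logcpm() rescales each cell (column) to counts-per-million and then takes
# log10(x+1). A tiny worked check:
def _demo_sparse_logcpm():
    counts = sparse.csr_matrix(np.array([[2, 0], [8, 10]]))  # gene-by-cell
    gc = GC_matrix(np.array(['g1', 'g2']), np.array(['c1', 'c2']), counts)
    out = sparse_logcpm(gc, mode='logcpm')
    dense = np.array(out.data.todense())
    # cell c1 has 10 counts total: 2 counts -> log10((2/10)*1e6 + 1) ~ 5.30
    assert np.isclose(dense[0, 0], np.log10(2e5 + 1))
    return out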
284 | class cd:
285 |     """Context manager for changing the current working directory"""
286 |     def __init__(self, newPath):
287 |         self.newPath = os.path.expanduser(newPath)
288 | 
289 |     def __enter__(self):
290 |         self.savedPath = os.getcwd()
291 |         os.chdir(self.newPath)
292 | 
293 |     def __exit__(self, etype, value, traceback):
294 |         os.chdir(self.savedPath)
295 | 
296 | def create_logger(name='log'):
297 |     """
298 |     args: logger name
299 | 
300 |     return: a logger object
301 |     """
302 |     logging.basicConfig(
303 |         format='%(asctime)s %(message)s',
304 |         datefmt='%m/%d/%Y %I:%M:%S %p',
305 |         level=logging.INFO)
306 |     return logging.getLogger(name)
307 | 
308 | def set_value_by_percentile(this, lo, hi):
309 |     """set `this` below or above percentiles to given values
310 |     this (float)
311 |     lo (float)
312 |     hi (float)
313 |     """
314 |     if this < lo:
315 |         return lo
316 |     elif this > hi:
317 |         return hi
318 |     else:
319 |         return this
320 | 
321 | def mcc_percentile_norm(mcc, low_p=5, hi_p=95):
322 |     """
323 |     set values above and below specific percentiles to be at the value of percentiles
324 | 
325 |     args: mcc, low_p, hi_p
326 | 
327 |     return: normalized mcc levels
328 |     """
329 |     # mcc_norm = [np.isnan(mcc) for mcc_i in list(mcc)]
330 |     mcc_norm = np.copy(mcc)
331 |     mcc_norm = mcc_norm[~np.isnan(mcc_norm)]
332 | 
333 |     lo = np.percentile(mcc_norm, low_p)
334 |     hi = np.percentile(mcc_norm, hi_p)
335 | 
336 |     mcc_norm = [set_value_by_percentile(mcc_i, lo, hi) for mcc_i in list(mcc)]
337 |     mcc_norm = np.array(mcc_norm)
338 | 
339 |     return mcc_norm
340 | 
341 | def plot_tsne_values(df, tx='tsne_x', ty='tsne_y', tc='mCH',
342 |                      low_p=5, hi_p=95,
343 |                      s=2,
344 |                      cbar_label=None,
345 |                      output=None, show=True, close=False,
346 |                      t_xlim='auto', t_ylim='auto', title=None, figsize=(8,6), **kwargs):
347 |     """
348 |     tSNE plot
349 | 
350 |     xlim, ylim is set to facilitate displaying glial clusters only
351 | 
352 |     """
353 |     import matplotlib.pyplot as plt
354 |     import seaborn as sns
355 | 
356 |     fig, ax = plt.subplots(figsize=figsize)
357 | 
358 |     im = ax.scatter(df[tx], df[ty], s=s,
359 |                     c=mcc_percentile_norm(df[tc].values, low_p=low_p, hi_p=hi_p), **kwargs)
360 |     if title:
361 |         ax.set_title(title)
362 |     else:
363 |         ax.set_title(tc)
364 |     ax.set_xlabel(tx)
365 |     ax.set_ylabel(ty)
366 |     # ax.set_aspect('auto')
367 | 
368 | 
369 |     clb = plt.colorbar(im, ax=ax)
370 |     if cbar_label:
371 |         clb.set_label(cbar_label, rotation=270, labelpad=10)
372 | 
373 |     if t_xlim == 'auto':
374 |         t_xlim = [np.nanpercentile(df[tx].values, 0.1), np.nanpercentile(df[tx].values, 99.9)]
375 |         t_xlim[0] = t_xlim[0] - 0.1*(t_xlim[1] - t_xlim[0])
376 |         t_xlim[1] = t_xlim[1] + 0.1*(t_xlim[1] - t_xlim[0])
377 |         ax.set_xlim(t_xlim)
378 |     elif t_xlim:
379 |         ax.set_xlim(t_xlim)
380 |     else:
381 |         pass
382 | 
383 |     if t_ylim == 'auto':
384 |         t_ylim = [np.nanpercentile(df[ty].values, 0.1), np.nanpercentile(df[ty].values, 99.9)]
385 |         t_ylim[0] = t_ylim[0] - 0.1*(t_ylim[1] - t_ylim[0])
386 |         t_ylim[1] = t_ylim[1] + 0.1*(t_ylim[1] - t_ylim[0])
387 |         ax.set_ylim(t_ylim)
388 |     elif t_ylim:
389 |         ax.set_ylim(t_ylim)
390 |     else:
391 |         pass
392 | 
393 |     fig.tight_layout()
394 |     if output:
395 |         fig.savefig(output)
396 |         print('Saved to ' + output)
397 |     if show:
398 |         plt.show()
399 |     if close:
400 |         plt.close(fig)
401 | 
402 | def get_kwcolors(labels, colors):
403 |     """Generate a dictionary of {label: color} using unique labels and a list of available colors
404 |     """
405 |     nc = len(colors)
406 |     nl = len(labels)
407 |     n_repeats = int((nl + nc - 1)/nc)
408 |     colors = list(colors)*n_repeats
409 | 
410 |     kw_colors = {l:c for (l,c) in zip(labels, colors)}
411 |     return kw_colors
412 | 
413 | def rgb2hex(r, g, b):
414 |     """From rgb (255, 255, 255) to hex
415 |     """
416 |     hex = "#{:02x}{:02x}{:02x}".format(int(r), int(g), int(b))
417 |     return hex
418 | 
419 | def gen_colors(n, l=0.6, s=0.6, colors=None):
420 |     """Generate compatible and distinct hex colors
421 |     """
422 |     if not colors:
423 |         import colorsys
424 |         hs = np.linspace(0, 1, n, endpoint=False)
425 |         rgbs = [rgb2hex(*(255*np.array(colorsys.hls_to_rgb(h, l, s))))  # bug fix: scaling by 256 could overflow the 0-255 hex range
426 |                 for h in hs]
427 |         return rgbs
428 |     else:
429 |         clrs = [colors[i%len(colors)] for i in range(n)]
430 |         return clrs
431 | 
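# Illustrative sketch (hypothetical cluster labels, not part of the original module):
# gen_colors() spaces n hues evenly around the HLS color wheel, and get_kwcolors()
# maps labels onto a (possibly recycled) palette.
def _demo_cluster_palette():
    labels = ['exc', 'inh', 'glia']
    palette = gen_colors(len(labels))          # e.g. ['#cc3d3d', ...]
    kw_colors = get_kwcolors(labels, palette)  # {'exc': '#cc3d3d', ...}
    assert set(kw_colors.keys()) == set(labels)
    return kw_colors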
432 | def myScatter(ax, df, x, y, l,
433 |               s=20,
434 |               sample_frac=None,
435 |               sample_n=None,
436 |               legend_size=None,
437 |               legend_kws=None,
438 |               grey_label='unlabeled',
439 |               shuffle=True,
440 |               random_state=None,
441 |               legend_mode=0,
442 |               kw_colors=False,
443 |               colors=['C0', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C8', 'C9'], **kwargs):
444 |     """
445 |     take an axis object and make a scatter plot
446 | 
447 |     - kw_colors is a dictionary {label: color}
448 |     """
449 | 
450 |     import matplotlib.pyplot as plt
451 |     import seaborn as sns
452 |     df = df.copy()
453 |     # shuffle (and copy) data
454 |     if sample_n:
455 |         df = (df.groupby(l).apply(lambda x: x.sample(min(len(x), sample_n), random_state=random_state))
456 |                 .reset_index(level=0, drop=True)
457 |              )
458 |     if sample_frac:
459 |         df = (df.groupby(l).apply(lambda x: x.sample(frac=sample_frac, random_state=random_state))
460 |                 .reset_index(level=0, drop=True)
461 |              )
462 |     if shuffle:
463 |         df = df.sample(frac=1, random_state=random_state)
464 | 
465 |     if not kw_colors:
466 |         # add a color column
467 |         inds, catgs = pd.factorize(df[l])
468 |         df['c'] = [colors[i%len(colors)] if catgs[i]!=grey_label else 'grey'
469 |                    for i in inds]
470 |     else:
471 |         df['c'] = [kw_colors[i] if i!=grey_label else 'grey' for i in df[l]]
472 | 
473 |     # take care of legend
474 |     if legend_mode != -1:
475 |         for ind, row in df.groupby(l).first().iterrows():
476 |             ax.scatter(row[x], row[y], c=row['c'], label=ind, s=s, **kwargs)
477 | 
478 |     if legend_mode == -1:
479 |         pass
480 |     elif legend_mode == 0:
481 |         lgnd = ax.legend()
482 |     elif legend_mode == 1:
483 |         # Shrink current axis's height by 10% on the bottom
484 |         box = ax.get_position()
485 |         ax.set_position([box.x0, box.y0 + box.height * 0.1,
486 |                          box.width, box.height * 0.9])
487 |         lgnd = ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.07),
488 |                          ncol=6, fancybox=False, shadow=False)
489 |     elif legend_mode == 2:
490 |         # Shrink current axis's width by 20% (shifted right by 10%)
491 |         box = ax.get_position()
492 |         ax.set_position([box.x0 + box.width*0.1, box.y0,
493 |                          box.width*0.8, box.height])
494 | 
495 |     if legend_kws:
496 |         lgnd = ax.legend(**legend_kws)
497 | 
498 |     if legend_mode != -1 and legend_size:
499 |         for handle in lgnd.legendHandles:
500 |             handle._sizes = [legend_size]
501 | 
502 |     # background (grey)
503 |     df_grey = df.loc[df['c']=='grey']
504 |     if not df_grey.empty:
505 |         ax.scatter(df_grey[x],
506 |                    df_grey[y],
507 |                    c=df_grey['c'], s=s, **kwargs)
508 |     # actual plot
509 |     df_tmp = df.loc[df['c']!='grey']
510 |     ax.scatter(df_tmp[x],
511 |                df_tmp[y],
512 |                c=df_tmp['c'], s=s, **kwargs)
513 | 
514 |     return
515 | 
516 | def plot_tsne_labels_ax(df, ax, tx='tsne_x', ty='tsne_y', tc='cluster_ID',
517 |                         sample_frac=None,
518 |                         sample_n=None,
519 |                         legend_size=None,
520 |                         legend_kws=None,
521 |                         grey_label='unlabeled',
522 |                         legend_mode=0,
523 |                         s=1,
524 |                         shuffle=True,
525 |                         random_state=None,
526 |                         t_xlim='auto', t_ylim='auto', title=None,
527 |                         legend_loc='lower right',
528 |                         colors=['C0', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C8', 'C9'], **kwargs):
529 |     """
530 |     tSNE plot
531 | 
532 |     xlim, ylim is set to facilitate displaying glial clusters only
533 | 
534 |     # avoid gray-like 'C7' in colors
535 |     # color orders are arranged for exci-inhi-glia plot 11/1/2017
536 |     """
537 |     import matplotlib.pyplot as plt
538 | 
539 |     myScatter(ax, df, tx, ty, tc,
540 |               s=s,
541 |               sample_frac=sample_frac,
542 |               sample_n=sample_n,
543 |               legend_size=legend_size,
544 |               legend_kws=legend_kws,
545 |               shuffle=shuffle,
546 |               grey_label=grey_label,
547 |               random_state=random_state,
548 |               legend_mode=legend_mode,
549 |               colors=colors, **kwargs)
550 | 
551 |     if title:
552 |         ax.set_title(title)
553 |     else:
554 |         ax.set_title(tc)
555 |     ax.set_xlabel(tx)
556 |     ax.set_ylabel(ty)
557 |     # ax.set_aspect('auto')
558 | 
559 |     if t_xlim == 'auto':
560 |         t_xlim = [np.nanpercentile(df[tx].values, 0.1), np.nanpercentile(df[tx].values, 99.9)]
561 |         t_xlim[0] = t_xlim[0] - 0.1*(t_xlim[1] - t_xlim[0])
562 |         t_xlim[1] = t_xlim[1] + 0.1*(t_xlim[1] - t_xlim[0])
563 |         ax.set_xlim(t_xlim)
564 |     elif t_xlim:
565 |         ax.set_xlim(t_xlim)
566 |     else:
567 |         pass
568 | 
569 |     if t_ylim == 'auto':
570 |         t_ylim = [np.nanpercentile(df[ty].values, 0.1), np.nanpercentile(df[ty].values, 99.9)]
571 |         t_ylim[0] = t_ylim[0] - 0.1*(t_ylim[1] - t_ylim[0])
572 |         t_ylim[1] = t_ylim[1] + 0.1*(t_ylim[1] - t_ylim[0])
573 |         ax.set_ylim(t_ylim)
574 |     elif t_ylim:
575 |         ax.set_ylim(t_ylim)
576 |     else:
577 |         pass
578 | 
579 |     return
580 | 
581 | 
582 | def plot_tsne_labels(df, tx='tsne_x', ty='tsne_y', tc='cluster_ID',
583 |                      grey_label='unlabeled',
584 |                      sample_frac=None,
585 |                      sample_n=None,
586 |                      legend_size=None,
587 |                      legend_mode=0,
588 |                      legend_kws=None,
589 |                      s=1,
590 |                      random_state=None,
591 |                      output=None, show=True, close=False,
592 |                      t_xlim='auto', t_ylim='auto', title=None, figsize=(8,6),
593 |                      legend_loc='lower right',
594 |                      colors=['C0', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C8', 'C9'], **kwargs):
595 |     """
596 |     tSNE plot
597 | 
598 |     xlim, ylim is set to facilitate displaying glial clusters only
599 | 
600 |     # avoid gray-like 'C7' in colors
601 |     # color orders are arranged for exci-inhi-glia plot 11/1/2017
602 |     """
603 |     import matplotlib.pyplot as plt
604 |     import seaborn as sns
605 |     fig, ax = plt.subplots(figsize=figsize)
606 | 
607 |     myScatter(ax, df, tx, ty, tc,
608 |               s=s,
609 |               sample_frac=sample_frac,
610 |               sample_n=sample_n,
611 |               legend_size=legend_size,
612 |               legend_kws=legend_kws,
613 |               grey_label=grey_label,
614 |               random_state=random_state,
615 |               legend_mode=legend_mode,
616 |               colors=colors, **kwargs)
617 | 
618 |     if title:
619 |         ax.set_title(title)
620 |     else:
621 |         ax.set_title(tc)
622 |     ax.set_xlabel(tx)
623 |     ax.set_ylabel(ty)
624 |     # ax.set_aspect('auto')
625 | 
626 |     if t_xlim == 'auto':
627 |         t_xlim = [np.nanpercentile(df[tx].values, 0.1), np.nanpercentile(df[tx].values, 99.9)]
628 |         t_xlim[0] = t_xlim[0] - 0.1*(t_xlim[1] - t_xlim[0])
629 |         t_xlim[1] = t_xlim[1] + 0.1*(t_xlim[1] - t_xlim[0])
630 |         ax.set_xlim(t_xlim)
631 |     elif t_xlim:
632 |         ax.set_xlim(t_xlim)
633 |     else:
634 |         pass
635 | 
636 |     if t_ylim == 'auto':
637 |         t_ylim = [np.nanpercentile(df[ty].values, 0.1), np.nanpercentile(df[ty].values, 99.9)]
638 |         t_ylim[0] = t_ylim[0] - 0.1*(t_ylim[1] - t_ylim[0])
639 |         t_ylim[1] = t_ylim[1] + 0.1*(t_ylim[1] - t_ylim[0])
640 |         ax.set_ylim(t_ylim)
641 |     elif t_ylim:
642 |         ax.set_ylim(t_ylim)
643 |     else:
644 |         pass
645 | 
646 |     if output:
647 |         fig.savefig(output)
648 |         print('Saved to ' + output)
649 |     if show:
650 |         plt.show()
651 |     if close:
652 |         plt.close(fig)
653 | 
654 | def plot_tsne_values_ax(df, ax, tx='tsne_x', ty='tsne_y', tc='mCH',
655 |                         low_p=5, hi_p=95,
656 |                         s=2,
657 |                         cbar=True,
658 |                         cbar_ax=None,
659 |                         cbar_label=None,
660 |                         t_xlim='auto', t_ylim='auto', title=None, **kwargs):
661 |     """
662 |     tSNE plot
663 | 
664 |     xlim, ylim is set to facilitate displaying glial clusters only
665 | 
666 |     """
667 |     import matplotlib.pyplot as plt
668 | 
669 | 
670 |     im = ax.scatter(df[tx], df[ty], s=s,
671 |                     c=mcc_percentile_norm(df[tc].values, low_p=low_p, hi_p=hi_p), **kwargs)
672 |     if title:
673 |         ax.set_title(title)
674 |     else:
675 |         ax.set_title(tc)
676 |     # ax.set_aspect('auto')
677 |     if cbar:
678 |         if cbar_ax:
679 |             clb = plt.colorbar(im, cax=cbar_ax, shrink=0.4)
680 |         else:
681 |             clb = plt.colorbar(im, cax=ax, shrink=1)
682 |         if cbar_label:
683 |             clb.set_label(cbar_label, rotation=270, labelpad=10)
684 | 
685 |     if t_xlim == 'auto':
686 |         t_xlim = [np.nanpercentile(df[tx].values, 0.1), np.nanpercentile(df[tx].values, 99.9)]
687 |         t_xlim[0] = t_xlim[0] - 0.1*(t_xlim[1] - t_xlim[0])
688 |         t_xlim[1] = t_xlim[1] + 0.1*(t_xlim[1] - t_xlim[0])
689 |         ax.set_xlim(t_xlim)
690 |     elif t_xlim:
691 |         ax.set_xlim(t_xlim)
692 |     else:
693 |         pass
694 | 
695 |     if t_ylim == 'auto':
696 |         t_ylim = [np.nanpercentile(df[ty].values, 0.1), np.nanpercentile(df[ty].values, 99.9)]
697 |         t_ylim[0] = t_ylim[0] - 0.1*(t_ylim[1] - t_ylim[0])
698 |         t_ylim[1] = t_ylim[1] + 0.1*(t_ylim[1] - t_ylim[0])
699 |         ax.set_ylim(t_ylim)
700 |     elif t_ylim:
701 |         ax.set_ylim(t_ylim)
702 |     else:
703 |         pass
704 | 
705 |     return im
706 | 
707 | 
708 | def get_mcc(df, base_call_cutoff=100, sufficient_coverage_fraction=1, suffix=True, fillna=True):
709 |     """Get mcc matrix from mc_c matrix (filtering out low coverage genes or bins)
710 |     """
711 |     logging.info('Getting mcc matrix from mc and c')
712 |     logging.info('base_call_cutoff={}, sufficient_coverage_fraction={}'.format(
713 |         base_call_cutoff, sufficient_coverage_fraction))
714 | 
715 |     df_c = df.filter(regex="_c$")
716 |     df_c.columns = [col[:-len('_c')] for col in df_c.columns]
717 |     df_mc = df.filter(regex="_mc$")
718 |     df_mc.columns = [col[:-len('_mc')] for col in df_mc.columns]
719 |     # a gene is sufficiently covered in % of cells
720 |     condition = (df_c > base_call_cutoff).sum(axis=1) >= sufficient_coverage_fraction*(df.shape[1])/2.0
721 | 
722 |     logging.info("Matrix size before pruning (# features, # cells) = " + str(df_c.shape))
723 |     logging.info("Matrix size after pruning (# features, # cells) = " + str(df_c.loc[condition].shape))
724 | 
725 |     # get mcc matrix with kept bins and nan values for low coverage sites
726 |     df_c_nan = df_c.copy()
727 |     df_c_nan[df_c < base_call_cutoff] = np.nan
728 |     df_mcc = df_mc.loc[condition]/df_c_nan.loc[condition]
729 |     logging.info(df_mcc.shape)
730 | 
731 |     # imputation (missing value -> mean value of all cells)
732 |     if fillna:
733 |         logging.info('Imputing data... (No effect if sufficient_coverage_fraction=1)')
734 |         means = df_mcc.mean(axis=1)
735 |         fill_value = pd.DataFrame({col: means for col in df_mcc.columns})
736 |         df_mcc.fillna(fill_value, inplace=True)
737 | 
738 |     # add suffix
739 |     if suffix:
740 |         df_mcc.columns = df_mcc.columns.values + '_mcc'
741 | 
742 |     return df_mcc
743 | 
744 | def get_mcc_lite(mc_table, c_table, base_call_cutoff=100, sufficient_coverage_fraction=1, fillna=True):
745 |     """Given 2 numpy arrays, return an mcc table
746 |     Gene/region by sample matrix
747 |     """
748 |     df_c = pd.DataFrame(c_table)
749 |     df_mc = pd.DataFrame(mc_table)
750 |     assert df_c.shape == df_mc.shape
751 | 
752 |     # a gene is sufficiently covered in % of cells
753 |     condition = (df_c > base_call_cutoff).sum(axis=1) >= sufficient_coverage_fraction*(df_c.shape[1])
754 | 
755 |     logging.info("Matrix size before pruning (# features, # cells) = " + str(df_c.shape))
756 |     logging.info("Matrix size after pruning (# features, # cells) = " + str(df_c.loc[condition].shape))
757 | 
758 |     # get mcc matrix with kept bins and nan values for low coverage sites
759 |     df_c_nan = df_c.copy()
760 |     df_c_nan[df_c < base_call_cutoff] = np.nan
761 |     df_mcc = df_mc.loc[condition]/df_c_nan.loc[condition]
762 |     logging.info(df_mcc.shape)
763 | 
764 |     # imputation (missing value -> mean value of all cells)
765 |     if fillna:
766 |         logging.info('Imputing data... (No effect if sufficient_coverage_fraction=1)')
767 |         means = df_mcc.mean(axis=1)
768 |         fill_value = pd.DataFrame({col: means for col in df_mcc.columns})
769 |         df_mcc.fillna(fill_value, inplace=True)
770 | 
771 |     # return matrix and index (regions)
772 |     return df_mcc.values, df_mcc.index.values
773 | 
774 | def get_mcc_lite_v2(df_c, df_mc, base_call_cutoff):
775 |     """
776 |     """
777 |     # get mcc matrix with kept bins and nan values for low coverage sites
778 |     df_c_nan = df_c.copy()
779 |     df_c_nan[df_c < base_call_cutoff] = np.nan
780 |     df_mcc = df_mc/df_c_nan
781 |     logging.info(df_mcc.shape)
782 | 
783 |     # imputation (missing value -> mean value of all cells)
784 |     means = df_mcc.mean(axis=1)
785 |     fill_value = pd.DataFrame({col: means for col in df_mcc.columns})
786 |     df_mcc.fillna(fill_value, inplace=True)
787 | 
788 |     return df_mcc
789 | 
790 | def get_mcc_lite_v3(df_c, df_mc, base_call_cutoff):
791 |     """
792 |     """
793 |     # get mcc matrix with kept bins and nan values for low coverage sites
794 |     df_c_nan = df_c.copy()
795 |     df_c_nan[df_c < base_call_cutoff] = np.nan
796 |     df_mcc = df_mc/df_c_nan
797 |     return df_mcc
798 | 
799 | 
800 | def get_clusters_mc_c_worker(df_cells, df_input, cluster_col):
801 |     """reduce gene*cell or bin*cell matrix to a gene*cluster or bin*cluster matrix
802 |     Arguments:
803 |         - df_cells: a dataframe indexed by 'cell_name', and have '$cluster_col' as column
804 |         - df_input: a dataframe with 'sample_mc', 'sample_c' ... as columns
805 |           sample names are cell names
806 |     """
807 |     # cluster mc_c
808 |     df_c = df_input.filter(regex='_c$')
809 |     df_mc = df_input.filter(regex='_mc$')
810 | 
811 |     df_mc_c = pd.DataFrame()
812 |     for label, df_sub in df_cells.groupby(cluster_col):
813 |         samples = df_sub.index.values
814 |         df_mc_c['{}_mc'.format(label)] = df_mc[samples+'_mc'].sum(axis=1)
815 |         df_mc_c['{}_c'.format(label)] = df_c[samples+'_c'].sum(axis=1)
816 | 
817 |     logging.info("Output shape: {}".format(df_mc_c.shape))
818 |     return df_mc_c
819 | 
820 | def rank_array(array):
821 |     """Return ranking of each element of an array
822 |     """
823 |     array = np.array(array)
824 |     temp = array.argsort()
825 |     ranks = np.empty_like(temp)
826 |     ranks[temp] = np.arange(len(array))
827 |     return ranks
828 | 
829 | # added 4/5/2019
830 | def rank_rows(matrix):
831 |     """Return rankings of each row in a 2d array
832 |     """
833 |     matrix = np.array(matrix)
834 |     return np.apply_along_axis(rank_array, 1, matrix)  # row = 1
835 | 
836 | def spearman_corrcoef(X, Y):
837 |     """return spearman correlation matrix for each pair of rows of X and Y
838 |     """
839 |     return np.corrcoef(rank_rows(X), rank_rows(Y))
840 | 
841 | def spearmanr_paired_rows(X, Y):
842 |     from scipy import stats
843 | 
844 |     X = np.array(X)
845 |     Y = np.array(Y)
846 |     corrs = []
847 |     ps = []
848 |     for x, y in zip(X, Y):
849 |         r, p = stats.spearmanr(x, y)
850 |         corrs.append(r); ps.append(p)  # bug fix: `ps` was never filled, so p-values were always returned empty
851 |     return np.array(corrs), np.array(ps)
852 | 
853 | def get_index_from_array(arr, inqs, na_rep=-1):
854 |     """Get index of array
855 |     """
856 |     arr = np.array(arr)
857 |     arr = pd.Series(arr).reset_index().set_index(0)
858 |     idxs = arr.reindex(inqs)['index'].fillna(na_rep).astype(int).values
859 |     return idxs
860 | 
861 | def get_genomic_distance(sa, ea, sb, eb):
862 |     """Get genomic distance
863 |     """
864 |     assert sa < ea and sb < eb
865 |     if sa > sb:
866 |         sa, sb = sb, sa
867 |         ea, eb = eb, ea
868 | 
869 |     # sa <= sb
870 |     distance = max(0, sb - ea)
871 | 
872 |     return distance
873 | 
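# Illustrative sketch (hypothetical random data, not part of the original module):
# spearman_corrcoef() computes Spearman correlations by ranking each row and taking
# Pearson correlations of the ranks; spearmanr_paired_rows() does the same pair-wise
# (row i of X vs. row i of Y) via scipy. A quick consistency check:
def _demo_vectorized_spearman():
    rng = np.random.RandomState(0)
    X = rng.rand(3, 100)
    Y = rng.rand(3, 100)
    corrs, _ = spearmanr_paired_rows(X, Y)
    full = spearman_corrcoef(X, Y)  # (6, 6) block matrix of rank correlations
    # the X-vs-Y block is the upper-right quadrant; its diagonal matches the paired results
    assert np.allclose(np.diag(full[:3, 3:]), corrs)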
| """ 877 | comp_dict = { 878 | 'A': 'T', 879 | 'T': 'A', 880 | 'G': 'C', 881 | 'C': 'G', 882 | 'N': 'N', 883 | } 884 | for char in set(string): 885 | if char not in ['A', 'C', 'G', 'T', 'N']: 886 | raise ValueError('Not allowed char in string') 887 | 888 | new_string = ''.join([comp_dict[char] for char in string[::-1]]) 889 | return new_string 890 | 891 | def save_gc_matrix(gc_matrix, f_gene, f_cell, f_mat): 892 | """ 893 | """ 894 | sparse.save_npz(f_mat, gc_matrix.data) 895 | with open(f_gene, 'w') as f: 896 | f.write('\n'.join(gc_matrix.gene)+'\n') 897 | with open(f_cell, 'w') as f: 898 | f.write('\n'.join(gc_matrix.cell)+'\n') 899 | 900 | def save_gc_matrix_methylation(gc_matrix, f_gene, f_cell, f_mat_mc, f_mat_c): 901 | """ 902 | """ 903 | sparse.save_npz(f_mat_mc, gc_matrix.data['mc']) 904 | sparse.save_npz(f_mat_c, gc_matrix.data['c']) 905 | with open(f_gene, 'w') as f: 906 | f.write('\n'.join(gc_matrix.gene)+'\n') 907 | with open(f_cell, 'w') as f: 908 | f.write('\n'.join(gc_matrix.cell)+'\n') 909 | 910 | def import_single_textcol(fname, header=None, col=0): 911 | return pd.read_csv(fname, header=header, sep='\t')[col].values 912 | 913 | def export_single_textcol(fname, array): 914 | with open(fname, 'w') as f: 915 | f.write('\n'.join(array)+'\n') 916 | 917 | def load_gc_matrix(f_gene, f_cell, f_mat): 918 | """ 919 | """ 920 | gene = import_single_textcol(f_gene) 921 | cell = import_single_textcol(f_cell) 922 | mat = sparse.load_npz(f_mat) 923 | assert (len(gene), len(cell)) == mat.shape 924 | return GC_matrix(gene, cell, mat) 925 | 926 | def load_gc_matrix_methylation(f_gene, f_cell, f_mat_mc, f_mat_c): 927 | """ 928 | """ 929 | _gene = import_single_textcol(f_gene) 930 | _cell = import_single_textcol(f_cell) 931 | _mat_mc = sparse.load_npz(f_mat_mc) 932 | _mat_c = sparse.load_npz(f_mat_c) 933 | gxc_raw = GC_matrix(_gene, _cell, 934 | {'c': _mat_c, 'mc': _mat_mc}) 935 | return gxc_raw 936 | 937 | def nondup_legends(ax='', **kwargs): 938 | """Assuming plt (matplotlib.pyplot) is imported 939 | """ 940 | from collections import OrderedDict 941 | import matplotlib.pyplot as plt 942 | 943 | if ax == '': 944 | handles, labels = plt.gca().get_legend_handles_labels() 945 | by_label = OrderedDict(zip(labels, handles)) 946 | plt.legend(by_label.values(), by_label.keys(), **kwargs) 947 | else: 948 | handles, labels = ax.get_legend_handles_labels() 949 | by_label = OrderedDict(zip(labels, handles)) 950 | ax.legend(by_label.values(), by_label.keys(), **kwargs) 951 | return 952 | 953 | def dedup_array_elements(x, empty_string=''): 954 | """Replacing repeats with empty_string 955 | """ 956 | newx = np.empty_like(x) 957 | newx[0] = x[0] 958 | for i in range(1, len(x)): 959 | if x[i-1] == x[i]: 960 | newx[i] = empty_string 961 | else: 962 | newx[i] = x[i] 963 | return newx 964 | 965 | def vcorrcoef(X,Y): 966 | """Compute correlation coef for each rows of X and Y 967 | """ 968 | assert X.shape == Y.shape 969 | Xm = np.mean(X,axis=1).reshape(-1,1) 970 | Ym = np.mean(Y,axis=1).reshape(-1,1) 971 | Xm = X-Xm 972 | Ym = Y-Ym 973 | 974 | r_num = np.sum(Xm*Ym,axis=1) 975 | r_den = np.sqrt(np.sum(Xm**2,axis=1)*np.sum(Ym**2, axis=1)) 976 | r = r_num/r_den 977 | return r 978 | 979 | def zscore(x, offset=1e-7, ddof=1): 980 | return (x - np.mean(x))/(np.std(x, ddof=ddof) + offset) 981 | 982 | 983 | def clst_umap_pipe_lite(pcs, cells_all, 984 | resolution=1, 985 | npc=50, 986 | k=30, 987 | verbose=False, seed=0, cluster_only=False, 988 | ): 989 | # clustering 990 | import CEMBA_clst_utils 991 | import 
983 | def clst_umap_pipe_lite(pcs, cells_all,
984 |                         resolution=1,
985 |                         npc=50,
986 |                         k=30,
987 |                         verbose=False, seed=0, cluster_only=False,
988 |                         ):
989 |     # clustering
990 |     import clst_utils  # assumed rename: originally `import CEMBA_clst_utils`, a module not in this repo
991 |     from clst_utils import run_umap_lite  # assumed rename: originally `import CEMBA_run_tsne`; both functions exist in scripts/clst_utils.py with matching signatures
992 | 
993 |     df_clst = clst_utils.clustering_routine(
994 |         pcs,
995 |         cells_all, k,
996 |         verbose=verbose,
997 |         resolution=resolution,
998 |         seed=seed,
999 |         metric='euclidean', option='plain', n_trees=10, search_k=-1)
1000 | 
1001 |     # umap
1002 |     if not cluster_only:
1003 |         df_tsne = run_umap_lite(
1004 |             pcs,
1005 |             cells_all,
1006 |             verbose=verbose,
1007 |             n_neighbors=30, min_dist=0.5, n_dim=2,
1008 |             random_state=1)
1009 | 
1010 |         df_summary = df_clst.join(df_tsne)
1011 |         return df_summary
1012 |     else:
1013 |         return df_clst
1014 | 
1015 | def gen_cdf(array, ax, x_range=[], n_points=1000, show=True, flip=False, **kwargs):
1016 |     """
1017 |     """
1018 |     x = np.sort(array)
1019 |     y = np.arange(len(array))/len(array)
1020 |     if flip:
1021 |         # x = x[::-1]
1022 |         y = 1 - y
1023 | 
1024 |     if not x_range:
1025 |         if show:
1026 |             ax.plot(x, y, **kwargs)
1027 |         return x, y
1028 |     else:
1029 |         start, end = x_range
1030 |         xbins = np.linspace(start, end, n_points)
1031 |         ybins = np.interp(xbins, x, y)
1032 |         if show:
1033 |             ax.plot(xbins, ybins, **kwargs)
1034 |         return xbins, ybins
1035 | 
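# Illustrative sketch (hypothetical usage, not part of the original module):
# gen_cdf() plots an empirical CDF; with flip=True it plots the survival
# function (1 - CDF) instead.
def _demo_gen_cdf():
    fig, ax = plt.subplots()
    x, y = gen_cdf(np.random.RandomState(0).rand(100), ax, show=True)
    ax.set_xlabel('value')
    ax.set_ylabel('cumulative fraction')
    return fig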
1036 | def savefig(fig, path):
1037 |     """
1038 |     """
1039 |     fig.savefig(path, bbox_inches='tight', dpi=300)
1040 |     return
1041 | 
--------------------------------------------------------------------------------
/scripts/cli_parser.py:
--------------------------------------------------------------------------------
1 | """Command line interface is defined here.
2 | """
3 | DESCRIPTION_preproc="""
4 | SingleCellFusion is a computational tool to integrate single-cell transcriptome and epigenome datasets.
5 | This is the CLI for its preprocessing module
6 | (from count matrices to normalized HVG feature matrices).
7 | """
8 | 
9 | DESCRIPTION="""
10 | SingleCellFusion is a computational tool to integrate single-cell transcriptome and epigenome datasets.
11 | """
12 | 
13 | EPILOG="""
14 | Contributors: Fangming Xie, Aditya Chandrasekar, Wayne I. Doyle, Ethan J. Armand, Eran Mukamel.
15 | Contact: Eran Mukamel (emukamel@ucsd.edu).
16 | """
17 | 
18 | import argparse
19 | import os
20 | 
21 | def create_parser_preproc():
22 |     """
23 |     """
24 |     parser = argparse.ArgumentParser(
25 |         prog="SingleCellFusion_pre",
26 |         description=DESCRIPTION_preproc,
27 |         epilog=EPILOG,
28 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
29 |     )
30 | 
31 |     required = parser.add_argument_group('required')
32 |     optional = parser.add_argument_group('optional')
33 | 
34 |     # Input/Output Dataset Settings
35 |     required.add_argument(
36 |         "-i", "--input_datasets",
37 |         type=str,
38 |         nargs="+",
39 |         required=True,
40 |         help='''(list of str)
41 |             Paths to .h5ad files, each containing a cell-by-gene feature matrix,
42 |             cell IDs and gene IDs. Cell IDs should be unique within each .h5ad file;
43 |             gene IDs should be shared or partially shared across files.
44 |             Multiple inputs should be listed as a space separated list of filenames.
45 |             '''
46 |     )
47 |     optional.add_argument(
48 |         "-icov", "--input_datasets_coverage",
49 |         type=str,
50 |         nargs="+",
51 |         help='''(list of str)
52 |             Paths to .h5ad files, each containing a cell-by-gene feature matrix,
53 |             cell IDs and gene IDs. Cell IDs should be unique within each .h5ad file;
54 |             gene IDs should be shared or partially shared across files.
55 |             Multiple inputs should be listed as a space separated list of filenames.
56 | 
57 |             Required for "mc" datasets. Should follow the order of -i.
58 |             '''
59 |     )
60 |     required.add_argument(
61 |         "-inorm", "--input_normalizations",
62 |         type=str,
63 |         nargs="+",
64 |         required=True,
65 |         help='''(list of str)
66 |             Normalization options chosen from 'mc', 'cpm', or 'tpm'. This should be
67 |             listed in the same order as input_datasets
68 |             ''',
69 |     )
70 |     optional.add_argument(
71 |         "-ci", "--cellid_column",
72 |         type=str,
73 |         default="",
74 |         help='''(str)
75 |             Cell id column - column in AnnData.obs that represents cell id.
76 |             This needs to be unique within and across datasets.
77 |             Empty string means the column is the index of AnnData.obs.
78 |             '''
79 |     )
80 |     optional.add_argument(
81 |         "-gi", "--geneid_column",
82 |         type=str,
83 |         default="",
84 |         help='''(str)
85 |             Gene id column - column in AnnData.var that represents gene id.
86 |             This needs to be unique within, and shared across, datasets.
87 |             Empty string means the column is the index of AnnData.var.
88 |             '''
89 |     )
90 |     optional.add_argument(
91 |         "-gmmc", "--global_mean_mc_column",
92 |         type=str,
93 |         default="",
94 |         help='''(str)
95 |             Global mean mc column - column in AnnData.obs that represents the global mean methylation level.
96 |             If empty, it is estimated from the input matrix.
97 |             '''
98 |     )
99 |     optional.add_argument(
100 |         "-sp", "--tosparse",
101 |         action='store_true',
102 |         help='''()
103 |             this turns the input matrix into scipy sparse matrix format
104 |             '''
105 |     )
106 |     optional.add_argument(
107 |         "-o", "--output_dir",
108 |         type=str,
109 |         default="./preprocessed",
110 |         help='''(str)
111 |             Directory to store output files
112 |             '''
113 |     )
114 |     optional.add_argument(
115 |         "-op", "--output_prefix",
116 |         metavar="OUT_PREFIX",
117 |         type=str,
118 |         default="SingleCellFusion",
119 |         help='''(str)
120 |             The output files will contain this prefix
121 |             '''
122 |     )
123 |     optional.add_argument(
124 |         "-ga", "--gene_annotation_file",
125 |         type=str,
126 |         default="",
127 |         help='''(str)
128 |             Gene annotation file (bed format: chr, start, end, gene_id/gene_name/any identifier);
129 |             required if 'tpm' is chosen as the normalization option.
130 |             The fourth column is used to identify individual genes.
131 |             '''
132 |     )
133 |     optional.add_argument(
134 |         "-subn", "--sub_n",
135 |         type=int,
136 |         default=None,
137 |         help='''(int)
138 |             Subsampling this number of cells for each input dataset
139 |             '''
140 |     )
141 |     optional.add_argument(
142 |         "-subf", "--sub_frac",
143 |         type=float,
144 |         default=None,
145 |         help='''(float)
146 |             Subsampling this fraction (0~1) of cells for each input dataset
147 |             '''
148 |     )
149 |     return parser
150 | 
151 | def create_parser():
152 |     """
153 |     """
154 |     parser = argparse.ArgumentParser(
155 |         prog="SingleCellFusion",
156 |         description=DESCRIPTION,
157 |         epilog=EPILOG,
158 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
159 |     )
160 | 
161 |     required = parser.add_argument_group('required')
162 |     optional = parser.add_argument_group('optional')
163 |     advanced = parser.add_argument_group('advanced')
164 | 
165 |     ## ARGUMENTS DIRECTLY FED INTO SingleCellFusion CLI
166 |     # Input/Output Dataset Settings
167 |     required.add_argument(
168 |         "-i", "--input_datasets",
169 |         metavar="xx.h5ad",
170 |         type=str,
171 |         nargs="+",
172 |         required=True,
173 |         help='''(list of str)
174 |             Paths to .h5ad files, each containing a cell-by-gene feature matrix,
175 |             cell IDs and gene IDs. Cell IDs should be unique within each .h5ad file;
176 |             gene IDs should be shared or partially shared across files.
177 |             Multiple inputs should be listed as a space separated list of filenames.
178 |             '''
179 |     )
180 |     required.add_argument(
181 |         "-im", "--input_modalities",
182 |         metavar="rna/atac/mc",
183 |         type=str,
184 |         nargs="+",
185 |         required=True,
186 |         help='''(list of str)
187 |             Data modalities chosen from 'rna', 'atac', or 'mc'. This should be
188 |             listed in the same order as input_datasets.
189 |             '''
190 |     )
191 |     # may need this in the future
192 |     # parser.add_argument(
193 |     #     "-im", "--input_meta",
194 |     #     type=str,
195 |     #     required=True,
196 |     #     help="(list of str) Input metadata csv file",
197 |     # )
198 | 
199 |     required.add_argument(
200 |         "-f", "--feature_datasets",
201 |         metavar="xx.h5ad",
202 |         type=str,
203 |         nargs="+",
204 |         required=True,
205 |         help='''(list of str)
206 |             Dataset(s) whose features all other datasets will impute into.
207 |             This should be a subset of --input_datasets.
208 |             Enter multiple datasets as a space-separated list of filenames.
209 |             The features of these datasets will
210 |             be the features kept in the output imputed data table.
211 |             '''
212 |     )
213 |     optional.add_argument(
214 |         "-o", "--output_dir",
215 |         metavar="DIR",
216 |         type=str,
217 |         default="./results",
218 |         help='''(str)
219 |             Directory to store output files
220 |             '''
221 |     )
222 |     optional.add_argument(
223 |         "-op", "--output_prefix",
224 |         type=str,
225 |         default="SingleCellFusion",
226 |         help='''(str)
227 |             The output files will contain this prefix.
228 |             '''
229 |     )
230 | 
231 |     # constraint kNN across modalities
232 |     optional.add_argument(
233 |         "--nearest_neighbors",
234 |         type=int,
235 |         default=20,
236 |         help='''(integer)
237 |             Number of nearest neighbors used to impute data
238 |             '''
239 |     )
240 |     optional.add_argument(
241 |         "--relaxation",
242 |         type=float,
243 |         default=3,
244 |         help='''(float)
245 |             A value between 1 and infinity.
246 |             This is a parameter that constrains the number of neighbors a cell is allowed to receive.
247 |             Assume dataset 1 has N1 cells and dataset 2 has N2 cells. Finding k neighbors in dataset 2 for
248 |             every cell in dataset 1 means that on average each cell in dataset 2 receives (k*N1/N2) connections.
249 |             However, not all cells in dataset 2 get the same number of connections. We therefore set an
250 |             upper bound on the number of connections a cell in dataset 2 can receive:
251 |                 (k*N1/N2)*relaxation
252 |             where relaxation >= 1. relaxation=1 enforces a hard limit that every cell receives
253 |             the same number of nearest neighbors, while relaxation=infinity approaches traditional kNN.
254 |             '''
255 |     )
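    # Worked example (illustrative numbers, not part of the original code):
    # with the defaults k=20 (--nearest_neighbors) and relaxation=3, if dataset 1
    # has N1=10,000 cells and dataset 2 has N2=5,000 cells, each dataset-2 cell
    # receives on average k*N1/N2 = 20*10000/5000 = 40 connections, and the
    # per-cell cap is (k*N1/N2)*relaxation = 40*3 = 120 connections.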
256 |     optional.add_argument(
257 |         "--precomputed_pca_file",
258 |         type=str,
259 |         default='',
260 |         help='''(str)
261 |             Precomputed PCA matrix (tab separated table; text file or gzipped)
262 |             with the first row as the header, and the first column as the cell_id.
263 |             Each following row is a cell, and the columns are PCs.
264 | 
265 |             Providing this file will bypass SingleCellFusion integration,
266 |             and do clustering and UMAP on this matrix instead.
267 |             '''
268 |     )
269 |     optional.add_argument(
270 |         "--use_netUMAP",
271 |         action='store_true',
272 |         help='''(bool)
273 |             Include this argument to use Net-UMAP from Pegasus (Li et al. 2020).
274 |             Net-UMAP is an approximate but fast algorithm for UMAP.
275 |             It runs traditional UMAP on a subset of cells,
276 |             then it uses a deep neural network to learn the embedding for all cells.
277 |             The package pegasus is required.
278 |             '''
279 |     )
280 |     optional.add_argument(
281 |         "--use_tsne",
282 |         action='store_true',
283 |         help='''(bool)
284 |             Include this argument to use tSNE instead of UMAP
285 |             '''
286 |     )
287 | 
288 |     # within modality smoothing
289 |     advanced.add_argument(
290 |         "--num_pcs",
291 |         type=int,
292 |         default=50,
293 |         help='''(integer)
294 |             Number of Principal Components to keep for each dataset,
295 |             for smoothing and for clustering/embedding after imputation.
296 |             '''
297 |     )
298 |     advanced.add_argument(
299 |         "--smoothing_fractions",
300 |         nargs="+",
301 |         type=float,
302 |         default=[0.7, 0.1, 0.9],
303 |         help='''(list of floats)
304 |             A list of three values between 0 and 1 that control the relative contribution
305 |             from the cell itself vs. its neighbors in within-dataset smoothing,
306 |             specified for 'rna', 'atac', 'mc' data, respectively.
307 |             '''
308 |     )
309 | 
310 |     # Arguments for Clustering
311 |     advanced.add_argument(
312 |         "--leiden_n_neighbors",
313 |         type=int,
314 |         default=30,
315 |         help='''(integer)
316 |             Number of nearest neighbors used to build the kNN graph in the integrated space;
317 |             the resulting nearest neighbor graph is used for Leiden clustering.
318 |             It is passed into the python package leidenalg.
319 |             '''
320 |     )
321 |     advanced.add_argument(
322 |         "--leiden_resolutions",
323 |         nargs="+", type=float,  # bug fix: was `type=list`, which would split a command-line string into characters
324 |         default=[0.1, 0.2, 0.4, 0.8],
325 |         help='''(list of floats)
326 |             A list of resolutions to be used for Leiden Clustering.
327 |             It is passed into the python package leidenalg.
328 |             '''
329 |     )
330 | 
331 |     # Arguments for UMAP
332 |     advanced.add_argument(
333 |         "--umap_n_neighbors",
334 |         type=int,
335 |         default=60,
336 |         help='''(integer)
337 |             Number of neighbors for UMAP. It is passed into the python package umap.UMAP(n_neighbors).
338 |             '''
339 |     )
340 |     advanced.add_argument(
341 |         "--umap_min_dist",
342 |         type=float,
343 |         default=0.5,
344 |         help='''(float)
345 |             Minimum distance for UMAP. It is passed into the python package umap.UMAP(min_dist).
346 |             '''
347 |     )
348 |     return parser
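# Illustrative sketch (hypothetical file names, not part of the original module):
# building the main parser programmatically and parsing a sample command line.
def _demo_parse_args():
    parser = create_parser()
    args = parser.parse_args([
        "-i", "rna.h5ad", "mc.h5ad",
        "-im", "rna", "mc",
        "-f", "rna.h5ad",
        "-o", "./results",
    ])
    assert args.input_modalities == ['rna', 'mc']
    return args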
349 | 
350 | def parse_filename(data_file):
351 |     """turn a xxx/xxx/XXXX.h5ad into XXXX
352 |     """
353 |     dataset_name = os.path.basename(data_file)
354 |     if dataset_name.endswith('.h5ad'):
355 |         dataset_name = dataset_name[:-len('.h5ad')]
356 |     else:
357 |         raise ValueError("filenames don't have the format xxxx.h5ad")
358 |     return dataset_name
359 | 
360 | def modality_default_options(mod):
361 |     """
362 |     """
363 |     if mod == 'mc':
364 |         mod_direction = -1
365 |         # norm_option = 'mc'
366 |     elif mod == 'rna':
367 |         mod_direction = 1
368 |         # norm_option = 'cpm'
369 |     elif mod == 'atac':
370 |         mod_direction = 1
371 |         # norm_option = 'tpm'
372 |     else:
373 |         raise ValueError("choose from ['mc', 'rna', 'atac']")
374 |     return mod_direction
--------------------------------------------------------------------------------
/scripts/clst_utils.py:
--------------------------------------------------------------------------------
1 | """Utility functions for clusterings and embeddings
2 | """
3 | 
4 | from __init__ import *
5 | # from sklearn.decomposition import PCA
6 | import igraph as ig
7 | from scipy import sparse
8 | from annoy import AnnoyIndex
9 | from umap import UMAP
10 | import leidenalg
11 | 
12 | from basic_utils import create_logger
13 | 
14 | # major change in annoy functions 5/7/2019
15 | def build_knn_map(X, metric='euclidean', n_trees=10, verbose=True):
16 |     """X is expected to have low feature dimensions (n_obs, n_features) with (n_features <= 50)
17 | 
18 |     return:
19 |         t: annoy knn object, can be used in the following ways
20 |             t.get_nns_by_vector
21 |             t.get_nns_by_item
22 |     """
23 |     ti = time.time()
24 | 
25 |     n_obs, n_f = X.shape
26 |     t = AnnoyIndex(n_f, metric=metric)  # length of the item vector that will be indexed
27 |     for i, X_row in enumerate(X):
28 |         t.add_item(i, X_row)
29 |     t.build(n_trees)  # build a forest of n_trees trees
30 |     if verbose:
31 |         print("Time used to build kNN map {}".format(time.time()-ti))
32 |     return t
33 | 
34 | def get_knn_by_items(t, k,
35 |                      form='list',
36 |                      search_k=-1,
37 |                      include_distances=False,
38 |                      verbose=True,
39 |                      ):
40 |     """Get kNN for each item in the knn map t
41 |     """
42 |     ti = time.time()
43 |     # set up
44 |     n_obs = t.get_n_items()
45 |     n_f = t.f
46 |     if k > n_obs:
47 |         print("Actual k: {}->{} due to low n_obs".format(k, n_obs))
48 |         k = n_obs
49 | 
50 |     knn = [0]*(n_obs)
51 |     knn_dist = [0]*(n_obs)
52 |     # this block of code can be optimized
53 |     if include_distances:
54 |         for i in range(n_obs):
55 |             res = t.get_nns_by_item(i, k, search_k=search_k, include_distances=include_distances)
56 |             knn[i] = res[0]
57 |             knn_dist[i] = res[1]
58 |     else:
59 |         for i in range(n_obs):
60 |             res = t.get_nns_by_item(i, k, search_k=search_k, include_distances=include_distances)
61 |             knn[i] = res
62 | 
63 |     knn = np.array(knn)
64 |     knn_dist = np.array(knn_dist)
65 | 
66 |     if verbose:
67 |         print("Time used to get kNN {}".format(time.time()-ti))
68 | 
69 |     if form == 'adj':
70 |         # row col 1/dist
71 |         row_inds = np.repeat(np.arange(n_obs), k)
72 |         col_inds = np.ravel(knn)
73 |         if include_distances:
74 |             data = np.ravel(knn_dist)
75 |         else:
76 |             data = [1]*len(row_inds)
77 |         knn_dist_mat = sparse.coo_matrix((data, (row_inds, col_inds)), shape=(n_obs, n_obs))
78 |         return knn_dist_mat
79 |     elif form == 'list':
80 |         if include_distances:
81 |             return knn, knn_dist
82 |         else:
83 |             return knn
84 |     else:
85 |         raise ValueError("Choose from 'adj' and 'list'")
86 | 
87 | def get_knn_by_vectors(t, X, k,
88 |                        form='list',
89 |                        search_k=-1,
90 |                        include_distances=False,
91 |                        verbose=True,
92 |                        ):
93 |     """Get kNN for each row vector of X
94 |     """
95 |     ti = time.time()
96 |     # set up
97 |     n_obs = t.get_n_items()
98 |     n_f = t.f
99 |     n_obs_test, n_f_test = X.shape
100 |     assert n_f_test == n_f
101 | 
102 |     if k > n_obs:
103 |         print("Actual k: {}->{} due to low n_obs".format(k, n_obs))
104 |         k = n_obs
105 | 
106 |     knn = [0]*(n_obs_test)
107 |     knn_dist = [0]*(n_obs_test)
108 |     if include_distances:
109 |         for i, vector in enumerate(X):
110 |             res = t.get_nns_by_vector(vector, k, search_k=search_k, include_distances=include_distances)
111 |             knn[i] = res[0]
112 |             knn_dist[i] = res[1]
113 |     else:
114 |         for i, vector in enumerate(X):
115 |             res = t.get_nns_by_vector(vector, k, search_k=search_k, include_distances=include_distances)
116 |             knn[i] = res
117 | 
118 |     knn = np.array(knn)
119 |     knn_dist = np.array(knn_dist)
120 | 
121 |     if verbose:
122 |         print("Time used to get kNN {}".format(time.time()-ti))
123 | 
124 |     if form == 'adj':
125 |         # row col 1/dist
126 |         row_inds = np.repeat(np.arange(n_obs_test), k)
127 |         col_inds = np.ravel(knn)
128 |         if include_distances:
129 |             data = np.ravel(knn_dist)
130 |         else:
131 |             data = [1]*len(row_inds)
132 |         knn_dist_mat = sparse.coo_matrix((data, (row_inds, col_inds)), shape=(n_obs_test, n_obs))
133 |         return knn_dist_mat
134 |     elif form == 'list':
135 |         if include_distances:
136 |             return knn, knn_dist
137 |         else:
138 |             return knn
139 |     else:
140 |         raise ValueError("Choose from 'adj' and 'list'")
141 | 
142 | def gen_knn_annoy(X, k, form='list',
143 |                   metric='euclidean', n_trees=10, search_k=-1, verbose=True,
144 |                   include_distances=False,
145 |                   ):
146 |     """X is expected to have low feature dimensions (n_obs, n_features) with (n_features <= 50)
147 |     """
148 |     ti = time.time()
149 | 
150 |     n_obs, n_f = X.shape
151 |     t = build_knn_map(X, metric=metric, n_trees=n_trees, verbose=verbose)
152 | 
153 |     return get_knn_by_items(t, k,
154 |                             form=form,
155 |                             search_k=search_k,
156 |                             include_distances=include_distances,
157 |                             verbose=verbose,
158 |                             )
159 | 
160 | def gen_knn_annoy_train_test(X_train, X_test, k,
161 |                              form='list',
162 |                              metric='euclidean', n_trees=10, search_k=-1, verbose=True,
163 |                              include_distances=False,
164 |                              ):
165 |     """X is expected to have low feature dimensions (n_obs, n_features) with (n_features <= 50)
166 |     For each row in X_test, find k nearest neighbors in X_train
167 |     """
168 |     ti = time.time()
169 | 
170 |     n_obs, n_f = X_train.shape
171 |     n_obs_test, n_f_test = X_test.shape
172 |     assert n_f == n_f_test
173 | 
174 |     t = build_knn_map(X_train, metric=metric, n_trees=n_trees, verbose=verbose)
175 |     return get_knn_by_vectors(t, X_test, k,
176 |                               form=form,
177 |                               search_k=search_k,
178 |                               include_distances=include_distances,
179 |                               verbose=verbose,
180 |                               )
181 | 
182 | def compute_jaccard_weights_from_knn(X):
183 |     """compute jaccard index on a knn graph
184 |     Arguments:
185 |         X: (unweighted) kNN adjacency matrix (each row Xi* gives the kNNs of cell i);
186 |            X has to be 0-1 valued
187 |            (k, the number of nearest neighbors, is inferred from the first row)
188 | 
189 |     output: scipy sparse matrix Y of Jaccard weights
190 |     """
191 |     X = sparse.csr_matrix(X)
192 |     ni, nj = X.shape
193 |     assert ni == nj
194 | 
195 |     k = X[0, :].sum()  # number of neighbors
196 | 
197 |     Y = X.dot(X.T)
198 |     # Y = X.multiply(tmp/(2*k - tmp.todense()))
199 |     Y.data = Y.data/(2*k - Y.data)
200 | 
201 |     return Y
202 | 
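# Illustrative worked example (hypothetical toy matrix, not part of the original module):
# for a 0/1 kNN adjacency X with k neighbors per row, X.dot(X.T)[i, j] counts shared
# neighbors c, and the Jaccard weight is c / (2k - c).
def _demo_jaccard_weights():
    X = sparse.csr_matrix(np.array([
        [1, 1, 0, 0],
        [1, 1, 0, 0],
        [0, 1, 1, 0],
        [0, 0, 1, 1],
    ]))  # each cell has k=2 neighbors (including itself)
    Y = compute_jaccard_weights_from_knn(X)
    # cells 0 and 1 share both neighbors: c=2 -> 2/(2*2 - 2) = 1.0
    assert Y[0, 1] == 1.0
    return Y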
203 | def adjacency_to_igraph(adj_mtx, weighted=False):
204 |     """
205 |     Converts an adjacency matrix to an igraph object
206 | 
207 |     Args:
208 |         adj_mtx (sparse matrix): Adjacency matrix
209 |         weighted (bool): Whether to attach the adjacency values as edge weights
210 |                          (the graph itself is always built as directed)
211 | 
212 |     Returns:
213 |         G (igraph object): igraph object of adjacency matrix
214 | 
215 |     Uses code from:
216 |         https://github.com/igraph/python-igraph/issues/168
217 |         https://stackoverflow.com/questions/29655111
218 | 
219 |     Author:
220 |         Wayne Doyle
221 |         (Fangming Xie modified)
222 |     """
223 |     nrow, ncol = adj_mtx.shape
224 |     if nrow != ncol:
225 |         raise ValueError('Adjacency matrix should be a square matrix')
226 |     vcount = nrow
227 |     sources, targets = adj_mtx.nonzero()
228 |     edgelist = list(zip(sources.tolist(), targets.tolist()))
229 |     G = ig.Graph(n=vcount, edges=edgelist, directed=True)
230 |     if weighted:
231 |         G.es['weight'] = adj_mtx.data
232 |     return G
233 | 
234 | def leiden_lite(g, cell_list, resolution=1, weighted=False, verbose=True, num_starts=None, seed=1):
235 |     """Code from Ethan Armand and Wayne Doyle, ./mukamel_lab/mop
236 |     slightly modified by Fangming Xie 05/13/2019
237 |     """
238 | 
239 |     ti = time.time()
240 | 
241 |     if num_starts is not None:
242 |         np.random.seed(seed)
243 |         partitions = []
244 |         quality = []
245 |         seeds = np.random.randint(10*num_starts, size=num_starts)
246 |         for seed in seeds:
247 |             if weighted:
248 |                 temp_partition = leidenalg.find_partition(g,
249 |                                                           leidenalg.RBConfigurationVertexPartition,
250 |                                                           weights=g.es['weight'],
251 |                                                           resolution_parameter=resolution,
252 |                                                           seed=seed,
253 |                                                           )
254 |             else:
255 |                 temp_partition = leidenalg.find_partition(g,
256 |                                                           leidenalg.RBConfigurationVertexPartition,
257 |                                                           resolution_parameter=resolution,
258 |                                                           seed=seed,
259 |                                                           )
260 |             quality.append(temp_partition.quality())
261 |             partitions.append(temp_partition)
262 |         partition1 = partitions[np.argmax(quality)]
263 |     else:
264 |         if weighted:
265 |             partition1 = leidenalg.find_partition(g,
266 |                                                   leidenalg.RBConfigurationVertexPartition,
267 |                                                   weights=g.es['weight'],
268 |                                                   resolution_parameter=resolution,
269 |                                                   seed=seed,
270 |                                                   )
271 |         else:
272 |             partition1 = leidenalg.find_partition(g,
273 |                                                   leidenalg.RBConfigurationVertexPartition,
274 |                                                   resolution_parameter=resolution,
275 |                                                   seed=seed,
276 |                                                   )
277 | 
278 |     # get cluster labels from partition1
279 |     labels = [0]*(len(cell_list))
280 |     for i, cluster in enumerate(partition1):
281 |         for element in cluster:
282 |             labels[element] = i+1
283 | 
284 |     df_res = pd.DataFrame(index=cell_list)
285 |     df_res['cluster'] = labels
286 |     df_res = df_res.rename_axis('sample', inplace=False)
287 | 
288 |     if verbose:
289 |         print("Time spent on leiden clustering: {}".format(time.time()-ti))
290 | 
291 |     return df_res
292 | 
293 | def clustering_routine(X, cell_list, k,
294 |                        seed=1, verbose=True,
295 |                        resolution=1, metric='euclidean', option='plain', n_trees=10, search_k=-1, num_starts=None):
296 |     """
297 |     X is a (n_obs, n_feature) matrix, n_feature <= 50 is recommended
298 |     option: {'plain', 'jaccard', ...}
299 |     """
300 |     assert len(cell_list) == len(X)
301 | 
302 |     if option == 'plain':
303 |         g_knn = gen_knn_annoy(X, k, form='adj', metric=metric,
304 |                               n_trees=n_trees, search_k=search_k, verbose=verbose)
305 |         G = adjacency_to_igraph(g_knn, weighted=False)
306 |         df_res = leiden_lite(G, cell_list, resolution=resolution, seed=seed,
307 |                              weighted=False, verbose=verbose, num_starts=num_starts)
308 | 
309 |     elif option == 'jaccard':
310 |         g_knn = gen_knn_annoy(X, k, form='adj', metric=metric,
311 |                               n_trees=n_trees, search_k=search_k, verbose=verbose)
312 |         gw_knn = compute_jaccard_weights_from_knn(g_knn)
313 |         G = adjacency_to_igraph(gw_knn, weighted=True)
314 |         df_res = leiden_lite(G, cell_list, resolution=resolution, seed=seed,
315 |                              weighted=True, verbose=verbose, num_starts=num_starts)
316 |     else:
317 |         raise ValueError('Choose from "plain" and "jaccard"')
318 | 
319 |     return df_res
320 | 
321 | def clustering_routine_multiple_resolutions(X, cell_list, k,
322 |                                             seed=1, verbose=True,
323 |                                             resolutions=[1], metric='euclidean', option='plain', n_trees=10, search_k=-1, num_starts=None):
324 |     """
325 |     X is a (n_obs, n_feature) matrix, n_feature <= 50 is recommended
326 |     option: {'plain', 'jaccard', ...}
327 |     """
328 |     assert len(cell_list) == len(X)
329 | 
330 |     res = []
331 |     if option == 'plain':
332 |         g_knn = gen_knn_annoy(X, k, form='adj', metric=metric,
333 |                               n_trees=n_trees, search_k=search_k, verbose=verbose)
334 |         G = adjacency_to_igraph(g_knn, weighted=False)
335 |         for resolution in resolutions:
336 |             df_res = leiden_lite(G, cell_list, resolution=resolution, seed=seed,
337 |                                  weighted=False, verbose=verbose, num_starts=num_starts)
338 |             df_res = df_res.rename(columns={'cluster': 'cluster_r{}'.format(resolution)})
339 |             res.append(df_res)
340 | 
341 |     elif option == 'jaccard':
342 |         g_knn = gen_knn_annoy(X, k, form='adj', metric=metric,
343 |                               n_trees=n_trees, search_k=search_k, verbose=verbose)
344 |         gw_knn = compute_jaccard_weights_from_knn(g_knn)
345 |         G = adjacency_to_igraph(gw_knn, weighted=True)
346 |         for resolution in resolutions:
347 |             df_res = leiden_lite(G, cell_list, resolution=resolution, seed=seed,
348 |                                  weighted=True, verbose=verbose, num_starts=num_starts)
349 |             df_res = df_res.rename(columns={'cluster': 'cluster_r{}'.format(resolution)})
350 |             res.append(df_res)
351 | 
352 |     else:
353 |         raise ValueError('Choose from "plain" and "jaccard"')
354 |     res = pd.concat(res, axis=1)
355 | 
356 |     return res
357 | 
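# Illustrative sketch (hypothetical inputs, not part of the original module):
# clustering_routine_multiple_resolutions() returns one cluster column per
# resolution (e.g. 'cluster_r0.5', 'cluster_r1'), which can be joined with a
# UMAP embedding of the same cells.
def _demo_multires_clustering(pcs, cells):
    """`pcs` is an (n_cells, n_pcs) array; `cells` is the matching cell-id array."""
    df_clsts = clustering_routine_multiple_resolutions(
        pcs, cells, 30,
        resolutions=[0.5, 1], option='jaccard')
    df_umap = run_umap_lite(pcs, cells, n_neighbors=30, min_dist=0.5)  # defined below
    return df_clsts.join(df_umap)  # columns: cluster_r0.5, cluster_r1, umap_1, umap_2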
358 | def run_net_umap_pegasus(X, **kwargs):
359 |     """
360 |     X (m, n) -> res_umap (m, 2)
361 |     """
362 |     import pegasus
363 |     import pegasusio
364 |     # pegasus netUMAP
365 |     # construct a pegasus object (a hack - pegasus='1.4.3'; pegasusio='0.3.1.post2')
366 |     m, n = X.shape
367 |     pgX = pegasusio.MultimodalData(pegasusio.UnimodalData(
368 |         {'barcodekey': np.arange(m).astype(str)},
369 |         {'featurekey': np.arange(n).astype(str)},
370 |         {"X": X},
371 |     ))
372 |     # (a hack) select_alpha=0 is important to resolve a
373 |     # sampling bug by pegasus when n is large
374 |     pegasus.net_umap(pgX, rep=None, select_alpha=0, **kwargs)
375 |     res_umap = pgX.obsm['X_net_umap']  # an array
376 |     return res_umap
377 | 
378 | def run_umap_lite(X, cell_list, n_neighbors=15, min_dist=0.1, n_dim=2,
379 |                   random_state=1, output_file=None, use_netUMAP=False, use_tsne=False, **kwargs):
380 |     """run umap on X (n_obs, n_features)
381 |     """
382 |     ti = time.time()
383 | 
384 |     logging.info("Running UMAP: {} n_neighbors, {} min_dist, {} dim.\n\
385 |         Input shape: (# observations, # features) = {}\n\
386 |         Use netUMAP from pegasus: {}\n\
387 |         Use tSNE: {}\n\
388 |         "
389 |         .format(n_neighbors, min_dist, n_dim, X.shape, use_netUMAP, use_tsne))
390 | 
391 |     if use_netUMAP:
392 |         umap_res = run_net_umap_pegasus(X,
393 |                                         n_components=n_dim,
394 |                                         random_state=random_state,
395 |                                         n_neighbors=n_neighbors,
396 |                                         min_dist=min_dist,
397 |                                         **kwargs)
398 |     elif use_tsne:
399 |         from sklearn.manifold import TSNE
400 |         umap_res = TSNE(n_components=n_dim,
401 |                         random_state=random_state,
402 |                         **kwargs,
403 |                         ).fit_transform(X)
404 |     else:
405 |         umap_res = UMAP(n_components=n_dim,
406 |                         random_state=random_state,
407 |                         n_neighbors=n_neighbors,
408 |                         min_dist=min_dist,
409 |                         **kwargs).fit_transform(X)
410 | 
411 |     columns = ['umap_{}'.format(i+1) for i in np.arange(n_dim)]
412 |     df_umap = pd.DataFrame(umap_res, columns=columns)
413 |     df_umap['sample'] = cell_list
414 |     df_umap = df_umap.set_index('sample')
415 | 
416 |     if output_file:
417 |         df_umap.to_csv(output_file, sep="\t", na_rep='NA', header=True, index=True)
418 |         logging.info("Saved coordinates to file. {}".format(output_file))
419 | 
420 |     tf = time.time()
421 |     logging.info("Done. running time: {} seconds.".format(tf - ti))
422 | 
423 |     return df_umap
--------------------------------------------------------------------------------
/scripts/preproc_utils.py:
--------------------------------------------------------------------------------
1 | from __init__ import *
2 | import numpy as np
3 | import pandas as pd
4 | import logging
5 | from sklearn.utils.sparsefuncs import mean_variance_axis
6 | from scipy.stats import kruskal
7 | 
8 | import basic_utils
9 | 
10 | def select_hvg(gbc_cpm, percentile=30, n_qcut=20,):
11 |     # further select highly variable genes
12 |     # variance/mean
13 |     mean_cpm, var_cpm = mean_variance_axis(gbc_cpm.data.tocsr(), axis=1)
14 |     vmr_cpm = (var_cpm+1)/(mean_cpm+1)
15 |     # select the top `percentile`% VMR genes within each mean-CPM quantile bin (the highest bin is skipped below)
16 |     # duplicates = 'drop' 9/21/2019 Fangming
17 |     _x = pd.qcut(pd.Series(mean_cpm), n_qcut, labels=False, duplicates='drop').to_frame('decile')
18 |     hvgs = []
19 |     for decile, _x_sub in _x.groupby('decile'):
20 |         gene_group = _x_sub.index.values
21 |         mean_cpm_gg = mean_cpm[gene_group]
22 |         vmr_cpm_gg = vmr_cpm[gene_group]
23 |         # genes with top 30% of vmr
24 |         hvg_group = gene_group[vmr_cpm_gg > np.percentile(vmr_cpm_gg, 100-percentile)]
25 | 
26 |         if decile != n_qcut-1:
27 |             hvgs.append(hvg_group)
28 |     hvgs = np.hstack(hvgs)
29 |     return hvgs
30 | 
31 | def select_hvg_methylation(df_nmcc, percentile=30, n_qcut=20,):
32 |     # further select highly variable genes
33 |     # standard deviation
34 | 
35 |     stds_nmcc = df_nmcc.std(axis=1)
36 |     mean_nmcc = df_nmcc.mean(axis=1)
37 | 
38 |     # select the top `percentile`% high-std genes within each mean-NMCC quantile bin
39 |     # duplicates = 'drop' 9/21/2019 Fangming
40 |     _x = pd.qcut(mean_nmcc, n_qcut, labels=False, duplicates='drop').to_frame('decile')
41 |     hvgs = []
42 |     for decile, _x_sub in _x.groupby('decile'):
43 |         gene_group = _x_sub.index.values
44 | 
45 |         mean_nmcc_gg = mean_nmcc.loc[gene_group]
46 |         stds_nmcc_gg = stds_nmcc.loc[gene_group]
47 |         # logging.info(gene_group.shape, stds_nmcc_gg.shape)
48 |         # genes with top 30% of stds
49 |         hvg_group = gene_group[stds_nmcc_gg > np.percentile(stds_nmcc_gg, 100-percentile)]
50 |         hvgs.append(hvg_group)
51 | 
52 |     hvgs = np.hstack(hvgs)
53 |     return hvgs
54 | 
55 | def filter_genes(gxc_raw, sufficient_cell_coverage=0.01):
56 |     """
57 |     """
58 |     n_gene, n_cell = gxc_raw.data.shape
59 |     gene_cov = (gxc_raw.data > 0).sum(axis=1)
60 |     gene_cov = np.array(gene_cov).squeeze()/n_cell  # fraction of cells covered
61 |     cond = gene_cov > sufficient_cell_coverage
62 |     gxc_raw_filtered = GC_matrix(np.array(gxc_raw.gene)[cond],
63 |                                  gxc_raw.cell,
64 |                                  gxc_raw.data.tocsr()[cond, :],
65 |                                  )
66 |     return gxc_raw_filtered
67 | 
68 | def preproc_rna_cpm_based(gxc_raw, sufficient_cell_coverage=0.01,
69 |                           hv_percentile=30, hv_ncut=20):
70 |     # select genes expressed in > 1% of cells
71 |     # raw genes
72 |     # _gxc_tmp, gxc_ftr, hvgs
73 |     logging.info("Removing low coverage genes...")
74 |     lib_size = np.ravel(gxc_raw.data.sum(axis=0))
75 |     _gxc_tmp = filter_genes(gxc_raw, sufficient_cell_coverage=sufficient_cell_coverage)
76 | 
77 |     # CPM matrix
logging.info("Getting CPM..") 79 | gxc_ftr = basic_utils.sparse_logcpm(_gxc_tmp, mode='cpm', lib_size=lib_size) 80 | del _gxc_tmp 81 | 82 | # select highy variable genes 83 | logging.info("Getting highly variable genes and logCPM...") 84 | hvgs = select_hvg(gxc_ftr, percentile=hv_percentile, n_qcut=hv_ncut) 85 | 86 | gxc_hvftr = GC_matrix( 87 | gxc_ftr.gene[hvgs], 88 | gxc_ftr.cell, 89 | gxc_ftr.data.tocsr()[hvgs, :], 90 | ) 91 | del gxc_ftr 92 | gxc_hvftr.data.data = np.log10(1+gxc_hvftr.data.data) # very important 93 | logging.info("Number of genes: {}".format(len(hvgs))) 94 | return gxc_hvftr 95 | 96 | def preproc_rna_cpm_based_kruskal(metadata, cluster_col, gxc_raw, sufficient_cell_coverage=0.01, 97 | hv_percentile=30): 98 | # select genes expressed in > 1% of cells 99 | # raw genes 100 | # _gxc_tmp, gxc_ftr, hvgs 101 | 102 | logging.info("Removing low coverage genes...") 103 | lib_size = np.ravel(gxc_raw.data.sum(axis=0)) 104 | _gxc_tmp = filter_genes(gxc_raw, sufficient_cell_coverage=sufficient_cell_coverage) 105 | 106 | # CPM matrix 107 | logging.info("Getting CPM..") 108 | gxc_ftr = basic_utils.sparse_logcpm(_gxc_tmp, mode='logcpm', lib_size=lib_size) # logcpm for kw 109 | del _gxc_tmp 110 | 111 | # select highy variable genes 112 | logging.info("Getting highly variable genes and logCPM...") 113 | # select genes with KW test 114 | datasets = [] 115 | for clst, df_sub in metadata.groupby(cluster_col): 116 | cell_idx = basic_utils.get_index_from_array(gxc_ftr.cell, df_sub.index.values) 117 | datasets.append(gxc_ftr.data.tocsc()[:,cell_idx].tocsr()) 118 | ps = [] 119 | for i, gene in enumerate(gxc_ftr.gene): 120 | if i%1000==0: 121 | logging.info(i) 122 | gene_data = [np.ravel(np.array(dataset[i,:].todense())) for dataset in datasets] 123 | try: 124 | _, p = kruskal(*gene_data) 125 | except: 126 | p = 1 127 | ps.append(p) 128 | p_th = np.percentile(ps, hv_percentile) 129 | logging.info("Pvalue threshold p_th: {}".format(p_th)) 130 | hvgs = np.arange(len(ps))[ps<=p_th] 131 | 132 | gxc_hvftr = GC_matrix( 133 | gxc_ftr.gene[hvgs], 134 | gxc_ftr.cell, 135 | gxc_ftr.data.tocsr()[hvgs, :], 136 | ) 137 | del gxc_ftr 138 | gxc_hvftr.data.data = np.log10(1+gxc_hvftr.data.data) # very important 139 | logging.info("Number of genes: {}".format(len(hvgs))) 140 | return gxc_hvftr 141 | 142 | def preproc_rna_tpm_based(gxc_raw, gene_lengths, 143 | impute_gene_lengths=True, 144 | sufficient_cell_coverage=0.01, 145 | hv_percentile=30, hv_ncut=20): 146 | """Gene lengths is a gene length pandas series indexed by gene names 147 | """ 148 | # gxc_raw, gxc_logtpm 149 | # _gxc_tmp, gxc_ftr, hvgs 150 | 151 | assert np.all(gxc_raw.gene == gene_lengths.index.values) 152 | if impute_gene_lengths: 153 | logging.info("Imputing gene lengths...") 154 | gene_lengths = gene_lengths.fillna(gene_lengths.mean()) 155 | lib_size = np.ravel(gxc_raw.data.sum(axis=0)) 156 | 157 | # select genes expressed in > 1% of cells 158 | logging.info("Removing low coverage genes...") 159 | _gxc_tmp = filter_genes(gxc_raw, sufficient_cell_coverage=sufficient_cell_coverage) 160 | 161 | # CPM matrix 162 | logging.info("Getting CPM..") 163 | gxc_ftr = basic_utils.sparse_logcpm(_gxc_tmp, mode='cpm', lib_size=lib_size) 164 | del _gxc_tmp 165 | 166 | # select highy variable genes 167 | logging.info("Getting highly variable genes and logCPM...") 168 | hvgs = select_hvg(gxc_ftr, percentile=hv_percentile, n_qcut=hv_ncut) # index in gxc_ftr 169 | hvgs_genes = gxc_ftr.gene[hvgs] 170 | del gxc_ftr 171 | 172 | # TPM matrix from gxc_raw 173 | 
logging.info("Getting logTPM...") 174 | gxc_logtpm = basic_utils.sparse_logtpm(gxc_raw, gene_lengths) 175 | hvgs_idx = basic_utils.get_index_from_array(gxc_logtpm.gene, hvgs_genes) 176 | 177 | # Trim logTPM matrix 178 | logging.info("Trim logTPM matrix...") 179 | gxc_hvftr = GC_matrix( 180 | gxc_logtpm.gene[hvgs_idx], 181 | gxc_logtpm.cell, 182 | gxc_logtpm.data.tocsr()[hvgs_idx, :], 183 | ) 184 | logging.info("Number of genes: {}".format(len(hvgs_idx))) 185 | return gxc_hvftr 186 | 187 | def preproc_rna_tpm_based_kruskal(metadata, cluster_col, gxc_raw, gene_lengths, 188 | impute_gene_lengths=True, 189 | sufficient_cell_coverage=0.01, 190 | hv_percentile=30): 191 | """Gene lengths is a gene length pandas series indexed by gene names 192 | """ 193 | 194 | assert np.all(gxc_raw.gene == gene_lengths.index.values) 195 | if impute_gene_lengths: 196 | logging.info("Imputing gene lengths...") 197 | gene_lengths = gene_lengths.fillna(gene_lengths.mean()) 198 | lib_size = np.ravel(gxc_raw.data.sum(axis=0)) 199 | 200 | # select genes expressed in > 1% of cells 201 | logging.info("Removing low coverage genes...") 202 | _gxc_tmp = filter_genes(gxc_raw, sufficient_cell_coverage=sufficient_cell_coverage) 203 | 204 | # CPM matrix 205 | logging.info("Getting CPM..") 206 | gxc_ftr = basic_utils.sparse_logcpm(_gxc_tmp, mode='logcpm', lib_size=lib_size) 207 | del _gxc_tmp 208 | 209 | 210 | logging.info("Getting highly variable genes...") 211 | # select genes with KW test 212 | datasets = [] 213 | for clst, df_sub in metadata.groupby(cluster_col): 214 | cell_idx = basic_utils.get_index_from_array(gxc_ftr.cell, df_sub.index.values) 215 | datasets.append(gxc_ftr.data.tocsc()[:,cell_idx].tocsr()) 216 | ps = [] 217 | for i, gene in enumerate(gxc_ftr.gene): 218 | if i%1000==0: 219 | logging.info(i) 220 | gene_data = [np.ravel(np.array(dataset[i,:].todense())) for dataset in datasets] 221 | try: 222 | s, p = kruskal(*gene_data) 223 | except: 224 | p = 1 225 | ps.append(p) 226 | 227 | p_th = np.percentile(ps, hv_percentile) 228 | logging.info("Pvalue threshold p_th: {}".format(p_th)) 229 | hvgs = np.arange(len(ps))[ps<=p_th] 230 | hvgs_genes = gxc_ftr.gene[hvgs] 231 | del gxc_ftr 232 | 233 | # TPM matrix from gxc_raw 234 | logging.info("Getting logTPM...") 235 | gxc_logtpm = basic_utils.sparse_logtpm(gxc_raw, gene_lengths) 236 | hvgs_idx = basic_utils.get_index_from_array(gxc_logtpm.gene, hvgs_genes) 237 | 238 | # Trim logTPM matrix 239 | logging.info("Trim logTPM matrix...") 240 | gxc_hvftr = GC_matrix( 241 | gxc_logtpm.gene[hvgs_idx], 242 | gxc_logtpm.cell, 243 | gxc_logtpm.data.tocsr()[hvgs_idx, :], 244 | ) 245 | logging.info("Number of genes: {}".format(len(hvgs_idx))) 246 | return gxc_hvftr 247 | 248 | def preproc_methylation( 249 | gxc_raw, 250 | metadata, 251 | global_value_col='mCH', 252 | base_call_cutoff=20, 253 | sufficient_coverage_fraction=0.95, 254 | hv_percentile=30, 255 | n_qcut=10, 256 | ): 257 | """ 258 | """ 259 | # select genes covered (20 counts) in > 95% of cells 260 | df_mc = pd.DataFrame(gxc_raw.data['mc'], index=gxc_raw.gene, columns=gxc_raw.cell) 261 | df_c = pd.DataFrame(gxc_raw.data['c'], index=gxc_raw.gene, columns=gxc_raw.cell) 262 | 263 | n_gene, n_cell = df_c.shape 264 | gene_cov = (df_c > base_call_cutoff).sum(axis=1)/n_cell # fraction of cells covered 265 | cond = gene_cov>sufficient_coverage_fraction 266 | df_mc = df_mc[cond] 267 | df_c = df_c[cond] 268 | 269 | # compute normalized methylation matrix (no need to further select genes) 270 | df_mcc = basic_utils.get_mcc_lite_v2(df_c, 
-------------------------------------------------------------------------------- /setup.sh: --------------------------------------------------------------------------------
#!/bin/bash

# Add the scripts/ directory (SingleCellFusion CLI entry points) to PATH.
# Source this file from the repo root ("source ./setup.sh") so the PATH
# change persists in the current shell; running it directly only affects a
# subshell.
rpath=$(realpath ./scripts)
echo "$rpath"
export PATH="$PATH:$rpath"
--------------------------------------------------------------------------------