├── .gitignore
├── LICENSE
├── README.md
├── docs
│   ├── changelog.rst
│   ├── faqs.rst
│   ├── knn.png
│   ├── mnn_direct.png
│   ├── mnn_equation.png
│   ├── mnn_rescue.png
│   ├── n_neighbors_knn.png
│   ├── rescue_equation_1.png
│   ├── rescue_equation_2.png
│   ├── rescue_equation_3.png
│   ├── results
│   │   ├── SingleCellFusion_plot_1_joint_embedding_and_clusterings_overview.png
│   │   ├── SingleCellFusion_plot_2_hist.png
│   │   ├── SingleCellFusion_plot_3_embedding_by_dataset.png
│   │   ├── SingleCellFusion_plot_4_embedding_by_individual_mod_clusterings.png
│   │   └── SingleCellFusion_plot_5_confmat.png
│   └── scf_description.rst
├── environment.yml
├── environment_mini.yml
├── environment_mini_pegasus.yml
├── example-MOp_L5ET
│   ├── datasets
│   │   ├── 10x_cells_v2.h5ad
│   │   ├── smarter_cells.h5ad
│   │   ├── smarter_nuclei.h5ad
│   │   └── snmcseq_gene.h5ad
│   ├── run_scf.sh
│   └── visualize_results.ipynb
├── example-wholebrain
│   ├── 00.test_all_preproc.sh
│   ├── visualize_results_lite_3mods.ipynb
│   └── visualize_results_lite_rna_intron_exon.ipynb
├── example-wholebrainatac
│   ├── normalize_and_select_features.ipynb
│   ├── run_preproc.sh
│   ├── run_scf.sh
│   └── visualize_results.ipynb
├── scf_description.rst
├── scripts
│   ├── SCF_utils.py
│   ├── SingleCellFusion
│   ├── SingleCellFusion_prep
│   ├── __init__.py
│   ├── basic_utils.py
│   ├── cli_parser.py
│   ├── clst_utils.py
│   └── preproc_utils.py
└── setup.sh

/.gitignore:
--------------------------------------------------------------------------------
1 | # old
2 | old/
3 | scripts/old.py
4 | example-wholebrainatac/old/
5 | example-MOp_L5ET/old/
6 | 
7 | # unused
8 | example-biccn_enhancer/
9 | example-testeran/
10 | example-MOp_L5ET-test2/
11 | example-wholebrain-test2/
12 | 
13 | # results
14 | example-MOp_L5ET/results
15 | example-wholebrainatac/results
16 | 
17 | # datasets
18 | example-wholebrainatac/datasets
19 | example-wholebrainatac/datasets_pre
20 | example-wholebrainatac/datasets_processed
21 | 
22 | example-wholebrain/datasets
23 | example-wholebrain/processed
24 | example-wholebrain/results
25 | example-wholebrain/old
26 | 
27 | # Byte-compiled / optimized / DLL files
28 | __pycache__/
29 | *.py[cod]
30 | *$py.class
31 | 
32 | 
33 | # C extensions
34 | *.so
35 | 
36 | # Distribution / packaging
37 | .Python
38 | build/
39 | develop-eggs/
40 | dist/
41 | downloads/
42 | eggs/
43 | .eggs/
44 | lib/
45 | lib64/
46 | parts/
47 | sdist/
48 | var/
49 | wheels/
50 | pip-wheel-metadata/
51 | share/python-wheels/
52 | *.egg-info/
53 | .installed.cfg
54 | *.egg
55 | MANIFEST
56 | 
57 | # PyInstaller
58 | # Usually these files are written by a python script from a template
59 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
60 | *.manifest 61 | *.spec 62 | 63 | # Installer logs 64 | pip-log.txt 65 | pip-delete-this-directory.txt 66 | 67 | # Unit test / coverage reports 68 | htmlcov/ 69 | .tox/ 70 | .nox/ 71 | .coverage 72 | .coverage.* 73 | .cache 74 | nosetests.xml 75 | coverage.xml 76 | *.cover 77 | *.py,cover 78 | .hypothesis/ 79 | .pytest_cache/ 80 | 81 | # Translations 82 | *.mo 83 | *.pot 84 | 85 | # Django stuff: 86 | *.log 87 | local_settings.py 88 | db.sqlite3 89 | db.sqlite3-journal 90 | 91 | # Flask stuff: 92 | instance/ 93 | .webassets-cache 94 | 95 | # Scrapy stuff: 96 | .scrapy 97 | 98 | # Sphinx documentation 99 | docs/_build/ 100 | 101 | # PyBuilder 102 | target/ 103 | 104 | # Jupyter Notebook 105 | .ipynb_checkpoints 106 | 107 | # IPython 108 | profile_default/ 109 | ipython_config.py 110 | 111 | # pyenv 112 | .python-version 113 | 114 | # pipenv 115 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 116 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 117 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 118 | # install all needed dependencies. 119 | #Pipfile.lock 120 | 121 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 122 | __pypackages__/ 123 | 124 | # Celery stuff 125 | celerybeat-schedule 126 | celerybeat.pid 127 | 128 | # SageMath parsed files 129 | *.sage.py 130 | 131 | # Environments 132 | .env 133 | .venv 134 | env/ 135 | venv/ 136 | ENV/ 137 | env.bak/ 138 | venv.bak/ 139 | 140 | # Spyder project settings 141 | .spyderproject 142 | .spyproject 143 | 144 | # Rope project settings 145 | .ropeproject 146 | 147 | # mkdocs documentation 148 | /site 149 | 150 | # mypy 151 | .mypy_cache/ 152 | .dmypy.json 153 | dmypy.json 154 | 155 | # Pyre type checker 156 | .pyre/ 157 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SingleCellFusion 2 | 3 | SingleCellFusion is a computational tool to integrate single-cell transcriptome and epigenome datasets. Code in this repository is used in [Luo et al., (2019) *BioRxiv*](https://www.biorxiv.org/content/10.1101/2019.12.11.873398v1) and in [Yao et al., (2020) *BioRxiv*](https://www.biorxiv.org/content/10.1101/2020.02.29.970558v2). [Here](docs/scf_description.rst) is a brief description of how SingleCellFusion works. 4 | 5 | Related publications: 6 | - Luo, C. et al. Single nucleus multi-omics links human cortical cell regulatory genome diversity to disease risk variants. bioRxiv 2019.12.11.873398 (2019) [doi:10.1101/2019.12.11.873398](https://www.biorxiv.org/content/10.1101/2019.12.11.873398v1) 7 | - Yao, Z. et al. An integrated transcriptomic and epigenomic atlas of mouse primary motor cortex cell types. bioRxiv 2020.02.29.970558 (2020) [doi:10.1101/2020.02.29.970558](https://www.biorxiv.org/content/10.1101/2020.02.29.970558v2) 8 | - BRAIN Initiative Cell Census Network (BICCN) et al. A multimodal cell census and atlas of the mammalian primary motor cortex. bioRxiv 2020.10.19.343129 (2020) [doi:10.1101/2020.10.19.343129](https://www.biorxiv.org/content/10.1101/2020.10.19.343129v1) 9 | 10 | Code contributors: [Fangming Xie](mailto:f7xie@ucsd.edu), Aditya Chandrasekar, Wayne I. Doyle, [Ethan Armand](mailto:ejarmand@ucsd.edu) 11 | 12 | Contact: [Eran Mukamel](mailto:emukamel@ucsd.edu) 13 | 14 | ## Installation 15 | Step 1: Clone this repo. 
16 | ```bash
17 | git clone https://github.com/mukamel-lab/SingleCellFusion.git
18 | cd SingleCellFusion
19 | ```
20 | 
21 | Step 2: Set up a conda environment and install the required packages. The environment should be created from a UNIX terminal. (Skip this step if not needed.)
22 | ```bash
23 | conda env create -f environment.yml # create an env named scf_terra
24 | source activate scf_terra
25 | ```
26 | 
27 | ## Usage
28 | ```
29 | usage: SingleCellFusion [-h] -i xx.h5ad [xx.h5ad ...] -im rna/atac/mc [rna/atac/mc ...]
30 |                         -f xx.h5ad [xx.h5ad ...] [-o DIR] [-op OUTPUT_PREFIX]
31 |                         [--nearest_neighbors NEAREST_NEIGHBORS] [--relaxation RELAXATION]
32 |                         [--num_pcs NUM_PCS] [--smoothing_fractions SMOOTHING_FRACTIONS]
33 |                         [--leiden_n_neighbors LEIDEN_N_NEIGHBORS] [--leiden_resolutions LEIDEN_RESOLUTIONS]
34 |                         [--umap_n_neighbors UMAP_N_NEIGHBORS] [--umap_min_dist UMAP_MIN_DIST]
35 | 
36 | SingleCellFusion is a computational tool to integrate single-cell transcriptome and epigenome datasets.
37 | 
38 | optional arguments:
39 |   -h, --help            show this help message and exit
40 | 
41 | required:
42 |   -i xx.h5ad [xx.h5ad ...], --input_datasets xx.h5ad [xx.h5ad ...]
43 |                         (list of str) Paths to .h5ad files, each containing a cell-by-gene feature matrix,
44 |                         cell IDs, and gene IDs. Cell IDs should be unique within each .h5ad file; gene IDs
45 |                         should be shared or partially shared across files. Multiple inputs should be listed
46 |                         as a space-separated list of filenames. (default: None)
47 |   -im rna/atac/mc [rna/atac/mc ...], --input_modalities rna/atac/mc [rna/atac/mc ...]
48 |                         (list of str) Data modalities chosen from 'rna', 'atac', or 'mc'. These should be
49 |                         listed in the same order as input_datasets. (default: None)
50 |   -f xx.h5ad [xx.h5ad ...], --feature_datasets xx.h5ad [xx.h5ad ...]
51 |                         (list of str) Dataset(s) whose features all other datasets will impute into. This
52 |                         should be a subset of --input_datasets. Enter multiple datasets as a space-separated
53 |                         list of filenames. The features of these datasets will be the features kept in the
54 |                         output imputed data table. (default: None)
55 | 
56 | optional:
57 |   -o DIR, --output_dir DIR
58 |                         (str) Directory to store output files (default: ./results)
59 |   -op OUTPUT_PREFIX, --output_prefix OUTPUT_PREFIX
60 |                         (str) The output files will contain this prefix. (default: SingleCellFusion)
61 |   --nearest_neighbors NEAREST_NEIGHBORS
62 |                         (integer) Number of nearest neighbors used to impute data (default: 20)
63 |   --relaxation RELAXATION
64 |                         (float) A value between 1 and infinity. This parameter constrains the
65 |                         number of neighbors a cell is allowed to receive. Assume dataset 1 has N1 cells
66 |                         and dataset 2 has N2 cells. Finding k neighbors in dataset 2 for every cell in
67 |                         dataset 1 means that, on average, each cell in dataset 2 receives (kN1/N2)
68 |                         connections. However, not all cells in dataset 2 get the same number of
69 |                         connections. We therefore set an upper bound on the number of connections a cell
70 |                         in dataset 2 can receive: (kN1/N2)*relaxation, where relaxation >= 1.
71 |                         Relaxation=1 enforces a hard limit in which every cell receives the same number
72 |                         of nearest neighbors, while relaxation=infinity approaches traditional kNN.
73 |                         (default: 3)
74 | 
75 | advanced:
76 |   --num_pcs NUM_PCS     (integer) Number of principal components to keep for each dataset, used for
77 |                         smoothing and for clustering/embedding after imputation. (default: 50)
78 |   --smoothing_fractions SMOOTHING_FRACTIONS
79 |                         (list of floats) A list of three values between 0 and 1 that control the relative
80 |                         contribution of the cell itself vs. its neighbors in within-dataset smoothing,
81 |                         specified for 'rna', 'atac', and 'mc' data, respectively. (default: [0.7, 0.1, 0.9])
82 |   --leiden_n_neighbors LEIDEN_N_NEIGHBORS
83 |                         (integer) Number of nearest neighbors used to build the graph in the integrated
84 |                         space; the resulting nearest neighbor graph is used for Leiden clustering. It is
85 |                         passed to the python package leidenalg. (default: 30)
86 |   --leiden_resolutions LEIDEN_RESOLUTIONS
87 |                         (list of floats) A list of resolutions to be used for Leiden clustering. It is
88 |                         passed to the python package leidenalg. (default: [0.1, 0.2, 0.4, 0.8])
89 |   --umap_n_neighbors UMAP_N_NEIGHBORS
90 |                         (integer) Number of neighbors for UMAP. It is passed to the python package as
91 |                         umap.UMAP(n_neighbors). (default: 60)
92 |   --umap_min_dist UMAP_MIN_DIST
93 |                         (float) Minimum distance for UMAP. It is passed to the python package as
94 |                         umap.UMAP(min_dist). (default: 0.5)
95 | 
96 | Contributors: Fangming Xie, Aditya Chandrasekar, Wayne I. Doyle, Ethan J. Armand, Eran Mukamel.
97 | 
98 | Contact: Eran Mukamel (emukamel@ucsd.edu).
99 | ```
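
For a concrete sense of the `--relaxation` cap, here is a minimal sketch of the bound described above (the function name and the choice of rounding are ours; this is not the actual SingleCellFusion internals):

```python
import math

def max_incoming_neighbors(k, n1, n2, relaxation=3.0):
    """Upper bound on the connections a cell in dataset 2 can receive
    when each of the n1 cells in dataset 1 requests k neighbors
    among the n2 cells in dataset 2."""
    return math.ceil(k * n1 / n2 * relaxation)

# e.g., k=20 neighbors, 10,000 cells in dataset 1, 5,000 in dataset 2:
print(max_incoming_neighbors(20, 10_000, 5_000))  # 120
```

With `relaxation=1` the cap equals the average in-degree kN1/N2, so connections are spread as evenly as possible; larger values approach an unconstrained kNN.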
100 | 
101 | ### Example:
102 | 
103 | #### Integrating L5 ET cells from four data modalities from the mouse primary motor cortex
104 | 
105 | `./example-MOp_L5ET` contains an example of integrating layer 5 extratelencephalic-projecting (L5 ET) neurons from four different datasets from the mouse primary motor cortex. The example directory includes the organized datasets, code, and results, and can be used as a template for other similar tasks.
106 | 
107 | After running SingleCellFusion on the example data with `run_scf.sh` in `example-MOp_L5ET` (a call to SingleCellFusion with default parameters), the notebook `./example-MOp_L5ET/visualize_results.ipynb` provides a step-by-step walkthrough of manipulating and plotting the integrated data. The plots it creates are shown below, and all the code required to generate them is included in the notebook.
108 | 
109 | ```
110 | cd ./example-MOp_L5ET
111 | # shell script to run SingleCellFusion using example parameters
112 | ./run_scf.sh
113 | # visualize results
114 | jupyter notebook visualize_results.ipynb
115 | ```
116 | 
117 | More example datasets and prepared `run_scf.sh` scripts are included in the repository to experiment with.
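
Before running on your own data, it can help to verify that the inputs satisfy the requirements listed under `--input_datasets` (unique cell IDs within each file, gene IDs at least partially shared across files). A minimal sketch using the `anndata` package from `environment.yml`; the paths follow the example above:

```python
import anndata

paths = [
    "./datasets/10x_cells_v2.h5ad",
    "./datasets/smarter_cells.h5ad",
    "./datasets/smarter_nuclei.h5ad",
    "./datasets/snmcseq_gene.h5ad",
]

gene_sets = []
for p in paths:
    adata = anndata.read_h5ad(p)
    # cell IDs must be unique within each .h5ad file
    assert adata.obs_names.is_unique, f"duplicate cell IDs in {p}"
    gene_sets.append(set(adata.var_names))

# gene IDs should be at least partially shared across files
shared = set.intersection(*gene_sets)
print(f"{len(shared)} genes shared across all {len(paths)} datasets")
```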
118 | 
119 | #### Integrated Embedding and Clustering
120 | 
121 | SingleCellFusion integrates the modalities and embeds the integrated space into common UMAP coordinates. Plotting these UMAP coordinates, with each data point colored, gives a rough view of how well the modalities are integrated.
122 | 
123 | In the top panel, we plot the integrated UMAP space, coloring each point by its modality of origin.
124 | 
125 | In the bottom panels, we plot the integrated UMAP space colored by the joint clusters found. Each panel corresponds to a separate clustering resolution, set when calling SingleCellFusion.
126 | 
127 | ![Plot 1](./docs/results/SingleCellFusion_plot_1_joint_embedding_and_clusterings_overview.png)
128 | 
129 | #### Cell Distribution in Clusters
130 | 
131 | To ensure that each cluster has a similar composition of all the datasets, we plot the dataset composition of each joint cluster in a bar chart.
132 | 
133 | Each bar corresponds to a cluster found in the joint clustering, colored by the original datasets. To check that each cluster has a relatively even composition of each dataset, we plot the overall composition of the datasets next to the bar charts for comparison.
134 | 
135 | To confirm that SingleCellFusion does not cluster cells by their source dataset, we want the composition of each cluster to be as close as possible to the overall composition of the data (defined by the sizes of the original modalities).
136 | 
137 | ![Plot 2](./docs/results/SingleCellFusion_plot_2_hist.png)
138 | 
139 | #### Cell Embedding Colored by Dataset
140 | 
141 | SingleCellFusion integrates the modalities and embeds the integrated space into common UMAP coordinates. We want to show that the integration does not segregate cells by modality, but instead organizes them by expression-level features.
142 | 
143 | To do this, we plot each modality separately on the same UMAP space and check that each modality is evenly distributed across the space.
144 | 
145 | ![Plot 3](./docs/results/SingleCellFusion_plot_3_embedding_by_dataset.png)
146 | 
147 | #### Cell Embedding Colored by Original Annotations/Cluster
148 | 
149 | To see how the original clusters from individual modalities are preserved in the integrated clustering, we display the original cell-type annotations of the individual datasets in the integrated space.
150 | 
151 | In this example, we focus on displaying L5 PT (L5 ET, or its equivalent annotation) cells. The majority of the cells in the example are labeled as L5 PT/ET in each individual dataset's clustering, with a few exceptions.
152 | 
153 | ![Plot 4](./docs/results/SingleCellFusion_plot_4_embedding_by_individual_mod_clusterings.png)
154 | 
155 | #### Confusion Matrices between Integrated Clustering and Individual Dataset Clustering
156 | 
157 | To continue investigating how the original clusters from individual modalities are maintained in the integrated clusters, we can plot confusion matrices showing how the clusters of the individual datasets are reorganized into the new integrated clusters.
158 | 
159 | The columns of each confusion matrix show the cluster labels of the individual dataset. If no such cluster labels exist, a single column is shown, so we can still examine the distribution of cells across the joint clusters.
160 | 
161 | The rows (y-axis) show the three clusters identified in the integrated dataset.
162 | 
163 | The confusion matrices in the first row are normalized by joint clusters (the sum of each row is 1).
164 | 
165 | The confusion matrices in the second row are normalized by original clusters (the sum of each column is 1).
166 | 
167 | ![Plot 5](./docs/results/SingleCellFusion_plot_5_confmat.png)
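
The two normalizations can be reproduced with a few lines of NumPy. `confmat` below is a hypothetical (joint clusters × original clusters) count matrix, used only to illustrate the axis conventions described above:

```python
import numpy as np

# rows: joint clusters; columns: original dataset clusters
confmat = np.array([[30., 5., 0.],
                    [2., 40., 8.],
                    [1., 3., 25.]])

by_joint = confmat / confmat.sum(axis=1, keepdims=True)  # each row sums to 1
by_orig = confmat / confmat.sum(axis=0, keepdims=True)   # each column sums to 1
```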
--------------------------------------------------------------------------------
/docs/changelog.rst:
--------------------------------------------------------------------------------
1 | Changelog
2 | ================
3 | * Version 1.0.0 - 2019-11-08:
4 |     * First stable release!
5 |     * Added a basic example of how to use SingleCellFusion to the README
6 |     * Minor formatting and documentation fixes throughout
7 | * Version 0.9.0 - 2019-11-07:
8 |     * Finalized tests before the first stable release
9 |     * Fixed an indexing issue in the low-memory version of data integration
10 | * Version 0.8.0 - 2019-11-05:
11 |     * Performed debugging
12 |     * Added a wrapper for all steps
13 | * Version 0.7.0 - 2019-10-14:
14 |     * Added a low-memory version of the integration functions
15 | * Version 0.6.0 - 2019-10-04:
16 |     * Added functions to search for common, variable features
17 |     * Added high- and low-memory computation of mean/standard deviation for loom files
18 | * Version 0.5.0 - 2019-10-03:
19 |     * Fixed low- and high-memory versions of kNN
20 |     * Added integration function (currently only in high-memory format)
21 |     * Added high-memory PCA
22 | * Version 0.4.0 - 2019-09-30:
23 |     * Added preliminary MNN functions
24 |     * Removed recipes and integration functions pending updates
25 | * Version 0.3.0 - 2019-09-25:
26 |     * Major overhaul to make SingleCellFusion more user friendly
27 |     * Added low- and high-memory versions of constrained kNN search
28 |     * Removed MNN method, pending version 0.4.0
29 | * Version 0.2.0 - 2019-09-19:
30 |     * Initialized changelog and versioning; many changes had accumulated since the last version
31 |     * Fixed a number of bugs
32 | * Version 0.1.0 - 2018-09-11:
33 |     * Initial release of SingleCellFusion
34 | 

--------------------------------------------------------------------------------
/docs/faqs.rst:
--------------------------------------------------------------------------------
1 | FAQs
2 | ================
3 | SingleCellFusion is under active development, and function names and parameters will continue to
4 | change until a stable release is reached. In the interim, we have provided answers to common
5 | questions and problems that can occur when using SingleCellFusion.
6 | 
7 | Why do you use loom files and how do I make one?
8 | -------------------------------------------------
9 | The loom file format allows SingleCellFusion to keep a low memory footprint when analyzing large data
10 | sets (such as 10x Genomics scRNA-seq data), and keeps all of the metadata in one centralized location.
11 | The loompy package was developed by the Sten Linnarsson group and has excellent documentation at
12 | `loompy.org <http://loompy.org/>`_.
13 | 
14 | Within a loom file, features are stored in rows and cells in columns. As an example of creating a
15 | loom file, say you have a pandas dataframe (df) in which the features are in rows and the cells are in
16 | columns. The index of this dataframe contains the unique feature IDs, and the column header contains
17 | unique cell IDs. A loom file can be generated with the following code::
18 | 
19 |     import loompy
20 |     loompy.create(filename=filename,
21 |                   layers={'': df.values},
22 |                   row_attrs={'Accession': df.index.values},
23 |                   col_attrs={'CellID': df.columns.values})
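
If you already have a loom file and want the .h5ad inputs used by the command-line tools in this
repository, the anndata package can convert between the two formats. A minimal sketch (the file
names are placeholders)::

    import anndata

    # anndata reads loom with cells as rows (obs) and features as
    # columns (var), i.e., it transposes the loom convention for you
    adata = anndata.read_loom("my_data.loom")
    adata.write_h5ad("my_data.h5ad")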
33 | -------------------------------------- 34 | This is a problem with Numpy (see this `issue `_). You can solve this 35 | issue in two ways. 36 | 37 | The easiest way is to run the following lines on your command line before running any Python scripts or notebooks:: 38 | 39 | export OMP_NUM_THREADS=1 40 | export OPENBLAS_NUM_THREADS=1 41 | export MKL_NUM_THREADS=1 42 | export VECLIB_MAXIMUM_THREADS=1 43 | export NUMEXPR_NUM_THREADS=1 44 | 45 | You can also run the below code in the first cell of a Python notebook or the beginning or a Python script. It must 46 | be run before importing any other packages (including MoP):: 47 | 48 | import os 49 | os.environ["OMP_NUM_THREADS"] = '1' 50 | os.environ["OPENBLAS_NUM_THREADS"] = '1' 51 | os.environ["MKL_NUM_THREADS"] = '1' 52 | os.environ["VECLIB_MAXIMUM_THREADS"] = '1' 53 | os.environ["NUMEXPR_NUM_THREADS"] = '1' 54 | 55 | You can specify the maximum number of threads that you want to use in that script or notebook by changing the value 56 | from 1 to your desired integer. 57 | 58 | This information came from this `StackOverflow question 59 | `_. 60 | 61 | Why is my code running slow with the low_mem flag? 62 | -------------------------------------------------- 63 | Although the loom file format has a number of benefits, the access and processing of data in the file 64 | will get progressively slower as more data is added to the file. If you are finding that your code is 65 | running too slow it can be helpful to make a second loom file containing just the relevant data for running 66 | SingleCellFusion. 67 | 68 | Another cause of slow code is that the batch size for processing code (see "Why is my code using so much 69 | memory?" above) is too small. If you are not having memory issues, we recommend increasing the batch size 70 | to speed up the code. 71 | 72 | Why am I not finding many neighbors? 73 | ------------------------------------- 74 | If you expect that similar cell types should be present in both data sets, this could be due to 75 | the sparseness of your data. We have found that if you first smooth your data (we highly 76 | recommend using `MAGIC `_. You can then use the 77 | smoothed data to find nearest neighbors, and impute on the observed data. A tutorial using our 78 | loom-based method of smoothing will be uploaded soon. 79 | 80 | 81 | Is SingleCellFusion just for integrating data from different sequencing modalities? 82 | ----------------------------------------------------------------------------------- 83 | No, theoretically this pipeline could be applied to integration across species or to find common cell 84 | types across different research studies using the same sequencing technology. This is an active area 85 | of development. 86 | 87 | What happens if a cell type is present in only one modality? 88 | ------------------------------------------------------------- 89 | In our experience, this situation is easily detectable. If the analysis is only performed on direct 90 | mutual nearest neighbors, these cells will not make nearest neighbors and will be dropped from the analysis. 91 | If the imputation is performed with the rescue, these cells will still not make mutual nearest neighbors. 92 | Their imputed counts will then come from their mutual nearest neighbors within their own data set. These 93 | imputed counts will not be similar to any observed counts, and these cells will self-segregate into their 94 | own clusters and will be visually separate on a tSNE or uMAP embedding. 
For the kNN method, these cells will 95 | make weak connections with a number of different cell types. This will lead to the imputation of counts that 96 | are not similar to any observed counts, also leading to segregation into unique clusters. 97 | 98 | 99 | -------------------------------------------------------------------------------- /docs/knn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/knn.png -------------------------------------------------------------------------------- /docs/mnn_direct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/mnn_direct.png -------------------------------------------------------------------------------- /docs/mnn_equation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/mnn_equation.png -------------------------------------------------------------------------------- /docs/mnn_rescue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/mnn_rescue.png -------------------------------------------------------------------------------- /docs/n_neighbors_knn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/n_neighbors_knn.png -------------------------------------------------------------------------------- /docs/rescue_equation_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/rescue_equation_1.png -------------------------------------------------------------------------------- /docs/rescue_equation_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/rescue_equation_2.png -------------------------------------------------------------------------------- /docs/rescue_equation_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/rescue_equation_3.png -------------------------------------------------------------------------------- /docs/results/SingleCellFusion_plot_1_joint_embedding_and_clusterings_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/results/SingleCellFusion_plot_1_joint_embedding_and_clusterings_overview.png -------------------------------------------------------------------------------- /docs/results/SingleCellFusion_plot_2_hist.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/results/SingleCellFusion_plot_2_hist.png

--------------------------------------------------------------------------------
/docs/results/SingleCellFusion_plot_3_embedding_by_dataset.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/results/SingleCellFusion_plot_3_embedding_by_dataset.png

--------------------------------------------------------------------------------
/docs/results/SingleCellFusion_plot_4_embedding_by_individual_mod_clusterings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/results/SingleCellFusion_plot_4_embedding_by_individual_mod_clusterings.png

--------------------------------------------------------------------------------
/docs/results/SingleCellFusion_plot_5_confmat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/docs/results/SingleCellFusion_plot_5_confmat.png

--------------------------------------------------------------------------------
/docs/scf_description.rst:
--------------------------------------------------------------------------------
1 | How does SingleCellFusion work?
2 | ================================
3 | SingleCellFusion is built around the idea that, for a cell profiled by a given omics technique (RNA-sequencing,
4 | snATAC-sequencing, snmC-sequencing), there are unobserved features of that cell that, if sampled, would
5 | provide a fuller picture of that cell's identity. For example, if a cell underwent RNA-sequencing, we know
6 | which genes are expressed, but we don't know the patterns of DNA methylation in that same cell. The methylation
7 | status of DNA in that cell is unobserved, limiting our ability to fully understand the identity of that cell.
8 | 
9 | In an ideal world we would obtain the transcriptome, methylome, and chromatin accessibility of a single
10 | cell at once; until the technologies for this type of experiment mature, SingleCellFusion can provide a
11 | computational equivalent. SingleCellFusion uses known relationships between different types of multiomics
12 | data to impute unobserved data, enabling the multimodal analysis of a cell's identity.
13 | 
14 | The core of SingleCellFusion is the generation of a nearest neighbors graph between different data sets.
15 | This graph is generated by finding nearest neighbors using the correlation of counts at highly variable
16 | features. For example, DNA methylation is known to be negatively correlated with gene expression. If a
17 | snmC-seq profiled cell has low methylation at a number of highly variable genes, and a snRNA-seq profiled
18 | cell has high gene expression at those same genes, we can assume that those two cells likely belong to the
19 | same cell type. We use this nearest neighbors graph to generate imputed counts by averaging among a cell's
20 | neighbors in the opposite modality. The actions of SingleCellFusion depend on the type of nearest neighbor
21 | graph specified, and are described below.
22 | 
23 | Direct mutual nearest neighbors
24 | -------------------------------
25 | .. image:: mnn_direct.png
26 |     :width: 400
27 |     :alt: cartoon of direct MNN
28 | 
29 | In this method, highly variable features are identified in each data set. On a cell-to-cell basis, the
30 | correlation of counts at highly variable features is calculated. These correlation values are used
31 | as the distance metric for identifying mutual neighbors.
32 | 
33 | Once the correlation is calculated, neighbors across modalities are determined. We require that
34 | neighbors have high correlation with each other. In other words, a snmC-seq profiled
35 | cell can only be a neighbor of a scRNA-seq cell if the methylation levels at the highly variable
36 | features are strongly anti-correlated with gene expression at those same features in the scRNA-seq
37 | profiled cell, and vice versa. This ensures that only strong neighbors are found and that the
38 | nearest neighbors graph is not dominated by noisy or spurious correlations.
39 | 
40 | Once the neighbors graph is generated, imputed counts are generated by the following equation:
41 | 
42 | .. image:: mnn_equation.png
43 |     :width: 400
44 |     :alt: equation for imputation by MNN
45 | 
46 | For cell *j* in modality *m*, which has direct mutual nearest neighbors with cells in modality
47 | *m'*, the imputed *m'* counts for feature *f* are given by the average over its *k* nearest
48 | neighbors in modality *m'*.
49 | 
50 | This is the most conservative method for generating imputed counts: only cells that make direct mutual
51 | nearest neighbors will receive imputed data. This method typically leads to good integration, but can
52 | result in the loss of large fractions of cells from the analysis if mutual neighbors are not found for them.
53 | 
54 | 
55 | Mutual nearest neighbors with rescue
56 | -------------------------------------
57 | .. image:: mnn_rescue.png
58 |     :width: 400
59 |     :alt: cartoon of rescue MNN
60 | 
61 | As with the direct method, the distance between two cells is their correlation at
62 | highly variable genes. The only difference in this method is that, in addition to a mutual
63 | nearest neighbors graph between modalities, a mutual nearest neighbor graph within each modality
64 | is also generated. This within-modality graph allows imputation to be performed on all cells, by
65 | using the within-modality neighbors to determine the best matched neighbors across
66 | modalities.
67 | 
68 | .. image:: rescue_equation_1.png
69 |     :width: 400
70 |     :alt: equation 1 of rescue
71 | 
72 | where
73 | 
74 | .. image:: rescue_equation_2.png
75 |     :width: 400
76 |     :alt: equation 2 of rescue
77 | 
78 | For a cell *l* in modality *m*, which has no direct mutual neighbors with cells in modality
79 | *m'*, the imputed *m'* counts for feature *f* are given by a weighted average over its *k*
80 | nearest neighbors in modality *m* which have direct mutual neighbors with cells in modality
81 | *m'*. The cells with direct mutual nearest neighbors have imputed counts per the equation in
82 | "Direct mutual nearest neighbors":
83 | 
84 | .. image:: mnn_equation.png
85 |     :width: 400
86 |     :alt: equation for imputation by MNN
87 | 
88 | The weights *A(l,j)* are determined by the distance between *l* and *j*, *d(l,j)*, by the following
89 | equation:
90 | 
91 | .. image:: rescue_equation_3.png
92 |     :width: 400
93 |     :alt: equation 3 of rescue
94 | 
95 | This is a more lenient method for generating imputed counts, as all cells will receive imputed
96 | data. This method enables all cells to be analyzed, and is our recommended approach.
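
Since the equations above are embedded as images, the following LaTeX reconstruction may be easier
to reuse. It is built from the prose descriptions; the symbol names are our own, and the exact form
of the weight kernel *w* is the one shown in rescue_equation_3.png, not reproduced here::

    % Direct MNN: average over the k mutual nearest neighbors in modality m'
    \hat{x}^{m'}_{j,f} = \frac{1}{k} \sum_{i \in \mathrm{NN}_{m'}(j)} x^{m'}_{i,f}

    % Rescue: weighted average over the k within-modality neighbors of cell l
    % that do have direct mutual neighbors in modality m'
    \hat{x}^{m'}_{l,f} = \sum_{j \in \mathrm{NN}_{m}(l)} A(l,j) \, \hat{x}^{m'}_{j,f},
    \qquad
    A(l,j) = \frac{w(d(l,j))}{\sum_{j'} w(d(l,j'))}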
97 | 
98 | k-nearest neighbors
99 | -------------------
100 | .. image:: knn.png
101 |     :width: 400
102 |     :alt: cartoon of kNN
103 | 
104 | Similar to the other methods, the distance metric between two cells is their correlation at
105 | highly variable features. The major difference in this method is that each cell is required to make
106 | *k* neighbors in the opposite modality, with the restriction that a cell in the opposite modality
107 | can make at most a set number *j* of neighbors. The maximum number of neighbors that a cell
108 | in the opposite modality can make is given by the equation:
109 | 
110 | .. image:: n_neighbors_knn.png
111 |     :width: 200
112 |     :alt: equation 1 of knn
113 | 
114 | where *j* is the maximum number of neighbors a cell in modality *m'* can make, *k* is the required
115 | number of nearest neighbors per cell in modality *m*, *n*\ :sub:`m`\ is the number of cells in
116 | modality *m*, and *n*\ :sub:`m'`\ is the number of cells in modality *m'*. *z* is a relaxation
117 | parameter that restricts cells from becoming hyperconnected. The neighbor graph is created by randomly
118 | iterating through each cell and finding its k nearest neighbors among cells below the maximum connection
119 | threshold. Once the nearest neighbors graph is generated, imputed counts are generated by the same
120 | equation as in "Direct mutual nearest neighbors":
121 | 
122 | .. image:: mnn_equation.png
123 |     :width: 400
124 |     :alt: equation for imputation by MNN
125 | 
126 | This is the most lenient method for generating imputed counts, as all cells will make neighbors
127 | in the opposite data set.

--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: scf_terra
2 | channels:
3 |   - bioconda
4 |   - conda-forge
5 |   - defaults
6 | dependencies:
7 |   - _libgcc_mutex=0.1=conda_forge
8 |   - _openmp_mutex=4.5=1_llvm
9 |   - anndata=0.6.22.post1=py_0
10 |   - attrs=19.3.0=py_0
11 |   - backcall=0.1.0=py_0
12 |   - blas=1.0=openblas
13 |   - bleach=3.1.1=py_0
14 |   - blosc=1.16.3=hd408876_0
15 |   - bzip2=1.0.8=h7b6447c_0
16 |   - ca-certificates=2021.4.13=h06a4308_1
17 |   - cairo=1.16.0=hfb77d84_1002
18 |   - certifi=2020.12.5=py38h06a4308_0
19 |   - cycler=0.10.0=py_2
20 |   - dbus=1.13.6=he372182_0
21 |   - decorator=4.4.1=py_0
22 |   - defusedxml=0.6.0=py_0
23 |   - entrypoints=0.3=py38_1000
24 |   - expat=2.2.9=he1b5a44_2
25 |   - fbpca=1.0=py_0
26 |   - fontconfig=2.13.1=h86ecdb6_1001
27 |   - freetype=2.10.0=he983fc9_1
28 |   - gettext=0.19.8.1=hc5be6a0_1002
29 |   - glib=2.58.3=py38h6f030ca_1002
30 |   - gmp=6.2.0=he1b5a44_2
31 |   - gst-plugins-base=1.14.5=h0935bb2_2
32 |   - gstreamer=1.14.5=h36ae1b5_2
33 |   - h5py=2.10.0=py38h7918eee_0
34 |   - hdf5=1.10.4=hb1b8bf9_0
35 |   - icu=64.2=he1b5a44_1
36 |   - igraph=0.7.1=h9e3b1fc_1007
37 |   - importlib_metadata=1.5.0=py38_0
38 |   - ipykernel=5.1.4=py38h5ca1d4c_0
39 |   - ipython=7.12.0=py38h5ca1d4c_0
40 |   - ipython_genutils=0.2.0=py_1
41 |   - jedi=0.16.0=py38_0
42 |   - jinja2=2.11.1=py_0
43 |   - joblib=0.14.1=py_0
44 |   - jpeg=9c=h14c3975_1001
45 |   - json5=0.9.0=py_0
46 |   - jsonschema=3.2.0=py38_0
47 |   - jupyter_client=5.3.4=py38_1
48 |   - jupyter_contrib_core=0.3.3=py_2
49 |   - jupyter_contrib_nbextensions=0.5.1=py38_0
50 |   - jupyter_core=4.6.3=py38_0
51 |   - jupyter_highlight_selected_word=0.2.0=py38_1000
52 |   - jupyter_latex_envs=1.4.6=py38_1000
53 |   - 
jupyter_nbextensions_configurator=0.4.1=py38_0 54 | - jupyterlab=1.2.6=py_0 55 | - jupyterlab_server=1.0.6=py_0 56 | - kiwisolver=1.1.0=py38hc9558a2_0 57 | - ld_impl_linux-64=2.33.1=h53a641e_8 58 | - leidenalg=0.7.0=py38he1b5a44_1 59 | - libblas=3.8.0=15_openblas 60 | - libcblas=3.8.0=15_openblas 61 | - libclang=9.0.1=default_hde54327_0 62 | - libffi=3.2.1=he1b5a44_1006 63 | - libgcc-ng=9.2.0=h24d8f2e_2 64 | - libgfortran-ng=7.3.0=hdf63c60_5 65 | - libiconv=1.15=h516909a_1005 66 | - liblapack=3.8.0=15_openblas 67 | - libllvm8=8.0.1=hc9558a2_0 68 | - libllvm9=9.0.1=hc9558a2_0 69 | - libopenblas=0.3.8=h5ec1e0e_0 70 | - libpng=1.6.37=hed695b0_0 71 | - libsodium=1.0.17=h516909a_0 72 | - libstdcxx-ng=9.2.0=hdf63c60_2 73 | - libuuid=2.32.1=h14c3975_1000 74 | - libxcb=1.13=h14c3975_1002 75 | - libxkbcommon=0.10.0=he1b5a44_0 76 | - libxml2=2.9.10=hee79883_0 77 | - libxslt=1.1.33=h31b3aaa_0 78 | - llvm-openmp=9.0.1=hc9558a2_2 79 | - llvmlite=0.31.0=py38h8b12597_0 80 | - lxml=4.5.0=py38hbb43d70_1 81 | - lz4-c=1.8.1.2=h14c3975_0 82 | - lzo=2.10=h7b6447c_2 83 | - markupsafe=1.1.1=py38h516909a_0 84 | - matplotlib=3.1.3=py38_0 85 | - matplotlib-base=3.1.3=py38h250f245_0 86 | - mistune=0.8.4=py38h516909a_1000 87 | - mock=4.0.3=pyhd3eb1b0_0 88 | - natsort=7.1.1=pyhd3eb1b0_0 89 | - nbconvert=5.6.1=py38_0 90 | - nbformat=5.0.4=py_0 91 | - ncurses=6.1=hf484d3e_1002 92 | - notebook=6.0.3=py38_0 93 | - nspr=4.25=he1b5a44_0 94 | - nss=3.47=he751ad9_0 95 | - numba=0.48.0=py38hb3f55d8_0 96 | - numexpr=2.7.3=py38h4be448d_1 97 | - numpy=1.18.1=py38h95a1406_0 98 | - openssl=1.1.1k=h27cfd23_0 99 | - pandas=1.0.1=py38hb3f55d8_0 100 | - pandoc=2.9.2=0 101 | - pandocfilters=1.4.2=py_1 102 | - parso=0.6.1=py_0 103 | - patsy=0.5.1=py_0 104 | - pcre=8.44=he1b5a44_0 105 | - pexpect=4.8.0=py38_0 106 | - pickleshare=0.7.5=py38_1000 107 | - pip=20.0.2=py_2 108 | - pixman=0.38.0=h516909a_1003 109 | - prometheus_client=0.7.1=py_0 110 | - prompt_toolkit=3.0.3=py_0 111 | - pthread-stubs=0.4=h14c3975_1001 112 | - ptyprocess=0.6.0=py_1001 113 | - pycairo=1.19.1=py38h438ddbb_0 114 | - pygments=2.5.2=py_0 115 | - pyparsing=2.4.6=py_0 116 | - pyqt=5.12.3=py38hcca6a23_1 117 | - pyrsistent=0.15.7=py38h516909a_0 118 | - pytables=3.6.1=py38h9fd0a39_0 119 | - python=3.8.1=h357f687_2 120 | - python-dateutil=2.8.1=py_0 121 | - python-igraph=0.8.0=py38h9e3b1fc_0 122 | - python_abi=3.8=1_cp38 123 | - pytz=2019.3=py_0 124 | - pyyaml=5.3.1=py38h1e0a361_0 125 | - pyzmq=18.1.1=py38h1768529_0 126 | - qt=5.12.5=hd8c4c69_1 127 | - readline=8.0=hf8c457e_0 128 | - scikit-learn=0.22.1=py38hcdab131_1 129 | - scipy=1.4.1=py38h921218d_0 130 | - seaborn=0.10.0=py_1 131 | - send2trash=1.5.0=py_0 132 | - setuptools=45.2.0=py38_0 133 | - six=1.14.0=py38_0 134 | - snappy=1.1.8=he6710b0_0 135 | - sqlite=3.30.1=hcee41ef_0 136 | - statsmodels=0.11.1=py38h516909a_0 137 | - terminado=0.8.3=py38_0 138 | - testpath=0.4.4=py_0 139 | - texttable=1.6.2=py_0 140 | - tk=8.6.10=hed695b0_0 141 | - tornado=6.0.3=py38h516909a_4 142 | - tqdm=4.59.0=pyhd3eb1b0_1 143 | - traitlets=4.3.3=py38_0 144 | - umap-learn=0.3.10=py38_1 145 | - wcwidth=0.1.8=py_0 146 | - webencodings=0.5.1=py_1 147 | - wheel=0.34.2=py_1 148 | - xorg-kbproto=1.0.7=h14c3975_1002 149 | - xorg-libice=1.0.10=h516909a_0 150 | - xorg-libsm=1.2.3=h84519dc_1000 151 | - xorg-libx11=1.6.9=h516909a_0 152 | - xorg-libxau=1.0.9=h14c3975_0 153 | - xorg-libxdmcp=1.1.3=h516909a_0 154 | - xorg-libxext=1.3.4=h516909a_0 155 | - xorg-libxrender=0.9.10=h516909a_1002 156 | - xorg-renderproto=0.11.1=h14c3975_1002 157 | - 
xorg-xextproto=7.3.0=h14c3975_1002 158 | - xorg-xproto=7.0.31=h14c3975_1007 159 | - xz=5.2.4=h14c3975_1001 160 | - yaml=0.2.4=h516909a_0 161 | - zeromq=4.3.2=he1b5a44_2 162 | - zipp=3.0.0=py_0 163 | - zlib=1.2.11=h516909a_1006 164 | - zstd=1.3.7=h0b5b093_0 165 | - pip: 166 | - annoy==1.16.3 167 | - pyqt5-sip==4.19.18 168 | - pyqtwebengine==5.12.1 169 | -------------------------------------------------------------------------------- /environment_mini.yml: -------------------------------------------------------------------------------- 1 | name: scf_mini 2 | channels: 3 | - hcc 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=main 8 | - _openmp_mutex=4.5=1_gnu 9 | - anndata=0.7.6=py39hf3d152e_0 10 | - arpack=3.7.0=hc6cf775_2 11 | - blas=1.0=mkl 12 | - brotli=1.0.9=h7f98852_5 13 | - brotli-bin=1.0.9=h7f98852_5 14 | - ca-certificates=2021.5.30=ha878542_0 15 | - cached-property=1.5.2=py_0 16 | - certifi=2021.5.30=py39hf3d152e_0 17 | - cycler=0.10.0=py_2 18 | - dbus=1.13.6=he372182_0 19 | - expat=2.4.1=h9c3ff4c_0 20 | - fbpca=1.0=py_0 21 | - fontconfig=2.13.1=hba837de_1005 22 | - fonttools=4.25.0=pyhd3eb1b0_0 23 | - freetype=2.10.4=h0708190_1 24 | - glib=2.69.1=h5202010_0 25 | - glpk=4.65=h9202a9a_1004 26 | - gmp=6.2.1=h58526e2_0 27 | - gst-plugins-base=1.14.0=hbbd80ab_1 28 | - gstreamer=1.14.0=h28cd5cc_2 29 | - h5py=3.2.1=py39h6c542dc_0 30 | - hdf5=1.10.6=hb1b8bf9_0 31 | - icu=58.2=hf484d3e_1000 32 | - igraph=0.9.4=ha184e22_0 33 | - intel-openmp=2021.3.0=h06a4308_3350 34 | - jbig=2.1=h7f98852_2003 35 | - joblib=1.0.1=pyhd8ed1ab_0 36 | - jpeg=9d=h36c2ea0_0 37 | - kiwisolver=1.3.1=py39h2531618_0 38 | - lcms2=2.12=hddcbb42_0 39 | - ld_impl_linux-64=2.35.1=h7274673_9 40 | - leidenalg=0.8.7=py39he80948d_0 41 | - lerc=2.2.1=h9c3ff4c_0 42 | - libblas=3.9.0=11_linux64_mkl 43 | - libbrotlicommon=1.0.9=h7f98852_5 44 | - libbrotlidec=1.0.9=h7f98852_5 45 | - libbrotlienc=1.0.9=h7f98852_5 46 | - libcblas=3.9.0=11_linux64_mkl 47 | - libdeflate=1.7=h7f98852_5 48 | - libffi=3.3=he6710b0_2 49 | - libgcc-ng=9.3.0=h5101ec6_17 50 | - libgfortran-ng=7.5.0=ha8ba4b0_17 51 | - libgfortran4=7.5.0=ha8ba4b0_17 52 | - libgomp=9.3.0=h5101ec6_17 53 | - liblapack=3.9.0=11_linux64_mkl 54 | - libllvm10=10.0.1=he513fc3_3 55 | - libpng=1.6.37=h21135ba_2 56 | - libstdcxx-ng=9.3.0=hd4cf53a_17 57 | - libtiff=4.3.0=hf544144_1 58 | - libuuid=2.32.1=h7f98852_1000 59 | - libwebp-base=1.2.0=h27cfd23_0 60 | - libxcb=1.13=h7f98852_1003 61 | - libxml2=2.9.12=h03d6c58_0 62 | - llvmlite=0.36.0=py39h612dafd_4 63 | - lz4-c=1.9.3=h9c3ff4c_1 64 | - matplotlib=3.4.2=py39hf3d152e_0 65 | - matplotlib-base=3.4.2=py39hab158f2_0 66 | - metis=5.1.0=h58526e2_1006 67 | - mkl=2021.3.0=h06a4308_520 68 | - mkl-service=2.4.0=py39h7f8727e_0 69 | - mkl_fft=1.3.0=py39h42c9631_2 70 | - mkl_random=1.2.2=py39h51133e4_0 71 | - mpfr=4.1.0=h9202a9a_1 72 | - munkres=1.1.4=pyh9f0ad1d_0 73 | - natsort=7.1.1=pyhd8ed1ab_0 74 | - ncurses=6.2=he6710b0_1 75 | - numba=0.53.1=py39h56b8d98_1 76 | - numpy=1.20.3=py39hf144106_0 77 | - numpy-base=1.20.3=py39h74d4b33_0 78 | - olefile=0.46=pyh9f0ad1d_1 79 | - openjpeg=2.4.0=hb52868f_1 80 | - openssl=1.1.1k=h7f98852_0 81 | - packaging=21.0=pyhd8ed1ab_0 82 | - pandas=1.3.0=py39hde0f152_0 83 | - patsy=0.5.1=py_0 84 | - pcre=8.45=h9c3ff4c_0 85 | - pillow=8.3.1=py39ha612740_0 86 | - pip=21.2.4=py37h06a4308_0 87 | - pthread-stubs=0.4=h36c2ea0_1001 88 | - pynndescent=0.5.4=pyh6c4a22f_0 89 | - pyparsing=2.4.7=pyh9f0ad1d_0 90 | - pyqt=5.9.2=py39h2531618_6 91 | - python=3.9.6=h12debd9_1 92 | - 
python-annoy=1.17.0=py39he80948d_2 93 | - python-dateutil=2.8.2=pyhd8ed1ab_0 94 | - python-igraph=0.9.6=py39hfef886c_0 95 | - python_abi=3.9=2_cp39 96 | - pytz=2021.1=pyhd8ed1ab_0 97 | - qt=5.9.7=h5867ecd_1 98 | - readline=8.1=h27cfd23_0 99 | - scikit-learn=0.24.2=py39h4dfa638_0 100 | - scipy=1.6.2=py39had2a1c9_1 101 | - seaborn=0.11.2=hd8ed1ab_0 102 | - seaborn-base=0.11.2=pyhd8ed1ab_0 103 | - setuptools=52.0.0=py39h06a4308_0 104 | - sip=4.19.13=py39h2531618_0 105 | - six=1.16.0=pyhd3eb1b0_0 106 | - sqlite=3.36.0=hc218d9a_0 107 | - statsmodels=0.12.2=py39hce5d2b2_0 108 | - suitesparse=5.10.1=hd8046ac_0 109 | - tbb=2020.2=h4bd325d_4 110 | - texttable=1.6.4=pyhd8ed1ab_0 111 | - threadpoolctl=2.2.0=pyh8a188c0_0 112 | - tk=8.6.10=hbc83047_0 113 | - tornado=6.1=py39h3811e60_1 114 | - tzdata=2021a=h5d7bf9c_0 115 | - umap-learn=0.5.1=py39hf3d152e_1 116 | - wheel=0.37.0=pyhd3eb1b0_0 117 | - xorg-libxau=1.0.9=h7f98852_0 118 | - xorg-libxdmcp=1.1.3=h7f98852_0 119 | - xz=5.2.5=h7b6447c_0 120 | - zlib=1.2.11=h7b6447c_3 121 | - zstd=1.5.0=ha95c52a_0 122 | -------------------------------------------------------------------------------- /environment_mini_pegasus.yml: -------------------------------------------------------------------------------- 1 | name: scf_mini_pegasus 2 | channels: 3 | - hcc 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=main 8 | - _openmp_mutex=4.5=1_gnu 9 | - anndata=0.7.6=py39hf3d152e_0 10 | - arpack=3.7.0=hc6cf775_2 11 | - blas=1.0=mkl 12 | - brotli=1.0.9=h7f98852_5 13 | - brotli-bin=1.0.9=h7f98852_5 14 | - ca-certificates=2021.5.30=ha878542_0 15 | - cached-property=1.5.2=py_0 16 | - certifi=2021.5.30=py39hf3d152e_0 17 | - cycler=0.10.0=py_2 18 | - dbus=1.13.6=he372182_0 19 | - expat=2.4.1=h9c3ff4c_0 20 | - fbpca=1.0=py_0 21 | - fontconfig=2.13.1=hba837de_1005 22 | - fonttools=4.25.0=pyhd3eb1b0_0 23 | - freetype=2.10.4=h0708190_1 24 | - glib=2.69.1=h5202010_0 25 | - glpk=4.65=h9202a9a_1004 26 | - gmp=6.2.1=h58526e2_0 27 | - gst-plugins-base=1.14.0=hbbd80ab_1 28 | - gstreamer=1.14.0=h28cd5cc_2 29 | - h5py=3.2.1=py39h6c542dc_0 30 | - hdf5=1.10.6=hb1b8bf9_0 31 | - icu=58.2=hf484d3e_1000 32 | - igraph=0.9.4=ha184e22_0 33 | - intel-openmp=2021.3.0=h06a4308_3350 34 | - jbig=2.1=h7f98852_2003 35 | - joblib=1.0.1=pyhd8ed1ab_0 36 | - jpeg=9d=h36c2ea0_0 37 | - kiwisolver=1.3.1=py39h2531618_0 38 | - lcms2=2.12=hddcbb42_0 39 | - ld_impl_linux-64=2.35.1=h7274673_9 40 | - leidenalg=0.8.7=py39he80948d_0 41 | - lerc=2.2.1=h9c3ff4c_0 42 | - libblas=3.9.0=11_linux64_mkl 43 | - libbrotlicommon=1.0.9=h7f98852_5 44 | - libbrotlidec=1.0.9=h7f98852_5 45 | - libbrotlienc=1.0.9=h7f98852_5 46 | - libcblas=3.9.0=11_linux64_mkl 47 | - libdeflate=1.7=h7f98852_5 48 | - libffi=3.3=he6710b0_2 49 | - libgcc-ng=9.3.0=h5101ec6_17 50 | - libgfortran-ng=7.5.0=ha8ba4b0_17 51 | - libgfortran4=7.5.0=ha8ba4b0_17 52 | - libgomp=9.3.0=h5101ec6_17 53 | - liblapack=3.9.0=11_linux64_mkl 54 | - libllvm10=10.0.1=he513fc3_3 55 | - libpng=1.6.37=h21135ba_2 56 | - libstdcxx-ng=9.3.0=hd4cf53a_17 57 | - libtiff=4.3.0=hf544144_1 58 | - libuuid=2.32.1=h7f98852_1000 59 | - libwebp-base=1.2.0=h27cfd23_0 60 | - libxcb=1.13=h7f98852_1003 61 | - libxml2=2.9.12=h03d6c58_0 62 | - llvmlite=0.36.0=py39h612dafd_4 63 | - louvain=0.7.0=py39he80948d_0 64 | - lz4-c=1.9.3=h9c3ff4c_1 65 | - matplotlib=3.4.2=py39hf3d152e_0 66 | - matplotlib-base=3.4.2=py39hab158f2_0 67 | - metis=5.1.0=h58526e2_1006 68 | - mkl=2021.3.0=h06a4308_520 69 | - mkl-service=2.4.0=py39h7f8727e_0 70 | - mkl_fft=1.3.0=py39h42c9631_2 71 | - 
mkl_random=1.2.2=py39h51133e4_0 72 | - mpfr=4.1.0=h9202a9a_1 73 | - munkres=1.1.4=pyh9f0ad1d_0 74 | - natsort=7.1.1=pyhd8ed1ab_0 75 | - ncurses=6.2=he6710b0_1 76 | - numba=0.53.1=py39h56b8d98_1 77 | - numpy=1.20.3=py39hf144106_0 78 | - numpy-base=1.20.3=py39h74d4b33_0 79 | - olefile=0.46=pyh9f0ad1d_1 80 | - openjpeg=2.4.0=hb52868f_1 81 | - openssl=1.1.1k=h7f98852_0 82 | - packaging=21.0=pyhd8ed1ab_0 83 | - pandas=1.3.0=py39hde0f152_0 84 | - patsy=0.5.1=py_0 85 | - pcre=8.45=h9c3ff4c_0 86 | - pillow=8.3.1=py39ha612740_0 87 | - pip=21.2.4=py37h06a4308_0 88 | - pthread-stubs=0.4=h36c2ea0_1001 89 | - pynndescent=0.5.4=pyh6c4a22f_0 90 | - pyparsing=2.4.7=pyh9f0ad1d_0 91 | - pyqt=5.9.2=py39h2531618_6 92 | - python=3.9.6=h12debd9_1 93 | - python-annoy=1.17.0=py39he80948d_2 94 | - python-dateutil=2.8.2=pyhd8ed1ab_0 95 | - python-igraph=0.9.6=py39hfef886c_0 96 | - python_abi=3.9=2_cp39 97 | - pytz=2021.1=pyhd8ed1ab_0 98 | - qt=5.9.7=h5867ecd_1 99 | - readline=8.1=h27cfd23_0 100 | - scikit-learn=0.24.2=py39h4dfa638_0 101 | - scipy=1.6.2=py39had2a1c9_1 102 | - seaborn=0.11.2=hd8ed1ab_0 103 | - seaborn-base=0.11.2=pyhd8ed1ab_0 104 | - setuptools=52.0.0=py39h06a4308_0 105 | - sip=4.19.13=py39h2531618_0 106 | - six=1.16.0=pyhd3eb1b0_0 107 | - sqlite=3.36.0=hc218d9a_0 108 | - statsmodels=0.12.2=py39hce5d2b2_0 109 | - suitesparse=5.10.1=hd8046ac_0 110 | - tbb=2020.2=h4bd325d_4 111 | - texttable=1.6.4=pyhd8ed1ab_0 112 | - threadpoolctl=2.2.0=pyh8a188c0_0 113 | - tk=8.6.10=hbc83047_0 114 | - tornado=6.1=py39h3811e60_1 115 | - tzdata=2021a=h5d7bf9c_0 116 | - umap-learn=0.5.1=py39hf3d152e_1 117 | - wheel=0.37.0=pyhd3eb1b0_0 118 | - xorg-libxau=1.0.9=h7f98852_0 119 | - xorg-libxdmcp=1.1.3=h7f98852_0 120 | - xz=5.2.5=h7b6447c_0 121 | - zlib=1.2.11=h7b6447c_3 122 | - zstd=1.5.0=ha95c52a_0 123 | - pip: 124 | - adjusttext==0.7.3 125 | - asciitree==0.3.3 126 | - charset-normalizer==2.0.4 127 | - click==8.0.1 128 | - cython==0.29.24 129 | - demuxem==0.1.6 130 | - docopt==0.6.2 131 | - fasteners==0.16.3 132 | - forceatlas2-python==1.1 133 | - geosketch==1.2 134 | - gprofiler-official==1.0.0 135 | - harmony-pytorch==0.1.6 136 | - hnswlib==0.5.2 137 | - idna==3.2 138 | - intervaltree==2.1.0 139 | - lightgbm==3.2.1 140 | - loompy==3.0.6 141 | - nmf-torch==0.1.1 142 | - numcodecs==0.9.0 143 | - numpy-groupies==0.9.13 144 | - pegasusio==0.3.1.post2 145 | - pegasuspy==1.4.3 146 | - psutil==5.8.0 147 | - pybind11==2.7.1 148 | - requests==2.26.0 149 | - scanorama==1.7.1 150 | - scikit-misc==0.1.4 151 | - sortedcontainers==2.4.0 152 | - torch==1.9.0 153 | - typing-extensions==3.10.0.2 154 | - urllib3==1.26.6 155 | - wordcloud==1.8.1 156 | - xlrd==1.2.0 157 | - xlsxwriter==3.0.1 158 | - zarr==2.9.5 159 | -------------------------------------------------------------------------------- /example-MOp_L5ET/datasets/10x_cells_v2.h5ad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/example-MOp_L5ET/datasets/10x_cells_v2.h5ad -------------------------------------------------------------------------------- /example-MOp_L5ET/datasets/smarter_cells.h5ad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/example-MOp_L5ET/datasets/smarter_cells.h5ad -------------------------------------------------------------------------------- 
/example-MOp_L5ET/datasets/smarter_nuclei.h5ad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/example-MOp_L5ET/datasets/smarter_nuclei.h5ad -------------------------------------------------------------------------------- /example-MOp_L5ET/datasets/snmcseq_gene.h5ad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukamel-lab/SingleCellFusion/a815809c33b78d0c33c738809ab6e28be98f6d57/example-MOp_L5ET/datasets/snmcseq_gene.h5ad -------------------------------------------------------------------------------- /example-MOp_L5ET/run_scf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SingleCellFusion \ 4 | -i "./datasets/10x_cells_v2.h5ad" \ 5 | "./datasets/smarter_cells.h5ad" \ 6 | "./datasets/smarter_nuclei.h5ad" \ 7 | "./datasets/snmcseq_gene.h5ad" \ 8 | -im "rna" "rna" "rna" "mc" \ 9 | -f "./datasets/10x_cells_v2.h5ad" \ 10 | -o "./results" 11 | -------------------------------------------------------------------------------- /example-wholebrain/00.test_all_preproc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # every normalized by CPMs 3 | 4 | # # mC prepare 5 | # SingleCellFusion_prep \ 6 | # -i \ 7 | # "./datasets/mc/genes_forBICCN2_CEMBA_3C_171206_mCG.h5ad" \ 8 | # "./datasets/mc/genes_forBICCN2_CEMBA_3C_171207_mCG.h5ad" \ 9 | # -icov \ 10 | # "./datasets/mc/genes_forBICCN2_CEMBA_3C_171206_CG.h5ad" \ 11 | # "./datasets/mc/genes_forBICCN2_CEMBA_3C_171207_CG.h5ad" \ 12 | # -inorm "mc" "mc" \ 13 | # -ga "./datasets/genes_biccn2.0.bed" \ 14 | # -o "./processed" \ 15 | # -op "hvg_mc" 16 | 17 | # # ATAC prepare 18 | # SingleCellFusion_prep \ 19 | # -i \ 20 | # "./datasets/atac/CEMBA171206_3C_genes_promo2kb.h5ad" \ 21 | # "./datasets/atac/CEMBA171207_3C_genes_promo2kb.h5ad" \ 22 | # -gi "ensid" \ 23 | # -ci "cell_id" \ 24 | # -inorm "tpm" "tpm" \ 25 | # -sp \ 26 | # -ga "./datasets/genes_promoter_2kb_biccn2.0.bed" \ 27 | # -o "./processed" \ 28 | # -op "hvg_atac" 29 | 30 | # # RNA prepare 31 | # SingleCellFusion_prep \ 32 | # -i \ 33 | # "./datasets/rna/smrt_intron_biccn2.h5ad" \ 34 | # "./datasets/rna/smrt_exon_biccn2.h5ad" \ 35 | # -inorm "cpm" "cpm" \ 36 | # -o "./processed" \ 37 | # -op "hvg_rna" 38 | 39 | # # run scf RNA mC ATAC 40 | # SingleCellFusion \ 41 | # -i \ 42 | # "./processed/hvg_rna_smrt_exon_biccn2.h5ad" \ 43 | # "./processed/hvg_mc_genes_forBICCN2_CEMBA_3C_171206_mCG.h5ad" \ 44 | # "./processed/hvg_mc_genes_forBICCN2_CEMBA_3C_171207_mCG.h5ad" \ 45 | # "./processed/hvg_atac_CEMBA171206_3C_genes_promo2kb.h5ad" \ 46 | # "./processed/hvg_atac_CEMBA171207_3C_genes_promo2kb.h5ad" \ 47 | # -im "rna" "mc" "mc" "atac" "atac"\ 48 | # -f "./processed/hvg_rna_smrt_exon_biccn2.h5ad" \ 49 | # -o "./results" \ 50 | # -op "SingleCellFusion" 51 | 52 | # run RNA intron and exon 53 | SingleCellFusion \ 54 | -i \ 55 | "./processed/hvg_rna_smrt_exon_biccn2.h5ad" \ 56 | "./processed/hvg_rna_smrt_intron_biccn2.h5ad" \ 57 | -im "rna" "rna"\ 58 | -f "./processed/hvg_rna_smrt_exon_biccn2.h5ad" \ 59 | -o "./results" \ 60 | -op "intron_exon" -------------------------------------------------------------------------------- /example-wholebrainatac/normalize_and_select_features.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 
4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "" 12 | ] 13 | }, 14 | "execution_count": 3, 15 | "metadata": {}, 16 | "output_type": "execute_result" 17 | } 18 | ], 19 | "source": [ 20 | "import sys\n", 21 | "import importlib\n", 22 | "sys.path.insert(0, '../scripts')\n", 23 | "\n", 24 | "import numpy as np\n", 25 | "from scipy import sparse\n", 26 | "import time\n", 27 | "import re\n", 28 | "import anndata\n", 29 | "\n", 30 | "from __global_variables import *\n", 31 | "from utils_new import *\n", 32 | "import basic_utils\n", 33 | "importlib.reload(basic_utils)\n", 34 | "import preproc_utils\n", 35 | "importlib.reload(preproc_utils)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "# Task\n", 43 | "- start from prepared files \n", 44 | "```anndata```\n", 45 | "- get and store hvfeatures\n", 46 | "```anndata```" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "# Settings" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 5, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "SRC_DIR = './datasets_pre'\n", 63 | "DST_DIR = './datasets'\n", 64 | "\n", 65 | "sys.path.insert(0, DST_DIR)\n", 66 | "# from __init__datasets import *\n", 67 | "\n", 68 | "\n", 69 | "f_data_format = '{0}/{1}.h5ad'\n", 70 | "f_hvftr_data_format = '{0}/{1}.h5ad'\n", 71 | "\n" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 6, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "mods_selected = [\n", 81 | " 'snatac',\n", 82 | "]\n", 83 | "normalization_options = {\n", 84 | " 'snatac': 'TPM',\n", 85 | "}" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 7, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "name": "stdout", 95 | "output_type": "stream", 96 | "text": [ 97 | "(32285,)\n" 98 | ] 99 | }, 100 | { 101 | "data": { 102 | "text/plain": [ 103 | "ensid\n", 104 | "ENSMUSG00000051951 465597\n", 105 | "ENSMUSG00000089699 46966\n", 106 | "ENSMUSG00000102331 11595\n", 107 | "ENSMUSG00000102343 80476\n", 108 | "ENSMUSG00000025900 409684\n", 109 | "dtype: int64" 110 | ] 111 | }, 112 | "execution_count": 7, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "df_genes = get_gene_annotation().set_index('ensid')\n", 119 | "\n", 120 | "gene_lengths_base = (df_genes['end'] - df_genes['start'])\n", 121 | "print(gene_lengths_base.shape)\n", 122 | "gene_lengths_base.head()" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "# highly variable features" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 8, 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "name": "stdout", 139 | "output_type": "stream", 140 | "text": [ 141 | "snatac\n", 142 | "snatac Reading in files 0.00018596649169921875\n" 143 | ] 144 | }, 145 | { 146 | "ename": "OSError", 147 | "evalue": "Unable to open file (unable to open file: name = './datasets_pre/snatac.h5ad', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)", 148 | "output_type": "error", 149 | "traceback": [ 150 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 151 | "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)", 152 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 44\u001b[0m 
\u001b[0;31m# read in files\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 45\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Reading in files {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0mti\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 46\u001b[0;31m \u001b[0mh5ad_mat\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0manndata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_h5ad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 47\u001b[0m \u001b[0mgid_col\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcid_col\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'ensid'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[0mmeta\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgxc_raw\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbasic_utils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mh5ad_to_scf_rna_format\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mh5ad_mat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgid_col\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcid_col\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 153 | "\u001b[0;32m/cndd2/fangming/venvs/routine/lib/python3.8/site-packages/anndata/readwrite/read.py\u001b[0m in \u001b[0;36mread_h5ad\u001b[0;34m(filename, backed, chunk_size)\u001b[0m\n\u001b[1;32m 445\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 446\u001b[0m \u001b[0;31m# load everything into memory\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 447\u001b[0;31m \u001b[0mconstructor_args\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_read_args_from_h5ad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunk_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 448\u001b[0m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconstructor_args\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 449\u001b[0m \u001b[0mdtype\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 154 | "\u001b[0;32m/cndd2/fangming/venvs/routine/lib/python3.8/site-packages/anndata/readwrite/read.py\u001b[0m in \u001b[0;36m_read_args_from_h5ad\u001b[0;34m(adata, filename, mode, chunk_size)\u001b[0m\n\u001b[1;32m 479\u001b[0m \u001b[0mf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0madata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfile\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_file\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 480\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 481\u001b[0;31m \u001b[0mf\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mh5py\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mFile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'r'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 482\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeys\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 483\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mbacked\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mAnnData\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_BACKED_ATTRS\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 155 | "\u001b[0;32m/cndd2/fangming/venvs/routine/lib/python3.8/site-packages/anndata/h5py/h5sparse.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, name, mode, driver, libver, userblock_size, swmr, force_dense, **kwds)\u001b[0m\n\u001b[1;32m 153\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m \u001b[0;31m# Python 3.5 can’t handle trailing commas here\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 154\u001b[0m ):\n\u001b[0;32m--> 155\u001b[0;31m self.h5f = h5py.File(\n\u001b[0m\u001b[1;32m 156\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 157\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 156 | "\u001b[0;32m/cndd2/fangming/venvs/routine/lib/python3.8/site-packages/h5py/_hl/files.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, name, mode, driver, libver, userblock_size, swmr, rdcc_nslots, rdcc_nbytes, rdcc_w0, track_order, **kwds)\u001b[0m\n\u001b[1;32m 404\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mphil\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 405\u001b[0m \u001b[0mfapl\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmake_fapl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdriver\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlibver\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrdcc_nslots\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrdcc_nbytes\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrdcc_w0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 406\u001b[0;31m fid = make_fid(name, mode, userblock_size,\n\u001b[0m\u001b[1;32m 407\u001b[0m \u001b[0mfapl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfcpl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmake_fcpl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrack_order\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtrack_order\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 408\u001b[0m swmr=swmr)\n", 157 | "\u001b[0;32m/cndd2/fangming/venvs/routine/lib/python3.8/site-packages/h5py/_hl/files.py\u001b[0m in \u001b[0;36mmake_fid\u001b[0;34m(name, mode, userblock_size, fapl, fcpl, swmr)\u001b[0m\n\u001b[1;32m 171\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mswmr\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mswmr_support\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 172\u001b[0m \u001b[0mflags\u001b[0m \u001b[0;34m|=\u001b[0m 
\u001b[0mh5f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mACC_SWMR_READ\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 173\u001b[0;31m \u001b[0mfid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mh5f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflags\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfapl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfapl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 174\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mmode\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'r+'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 175\u001b[0m \u001b[0mfid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mh5f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mh5f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mACC_RDWR\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfapl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfapl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 158 | "\u001b[0;32mh5py/_objects.pyx\u001b[0m in \u001b[0;36mh5py._objects.with_phil.wrapper\u001b[0;34m()\u001b[0m\n", 159 | "\u001b[0;32mh5py/_objects.pyx\u001b[0m in \u001b[0;36mh5py._objects.with_phil.wrapper\u001b[0;34m()\u001b[0m\n", 160 | "\u001b[0;32mh5py/h5f.pyx\u001b[0m in \u001b[0;36mh5py.h5f.open\u001b[0;34m()\u001b[0m\n", 161 | "\u001b[0;31mOSError\u001b[0m: Unable to open file (unable to open file: name = './datasets_pre/snatac.h5ad', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)" 162 | ] 163 | } 164 | ], 165 | "source": [ 166 | "for mod in mods_selected:\n", 167 | " ti = time.time()\n", 168 | " print(mod)\n", 169 | " \n", 170 | " normalization_option = normalization_options[mod]\n", 171 | " # read data matrix\n", 172 | " if normalization_option == 'MC':\n", 173 | " f_data = f_data_format.format(SRC_DIR, mod)\n", 174 | " \n", 175 | " # read in files\n", 176 | " print(mod, \"Reading in files {}\".format(time.time()-ti))\n", 177 | " gxc_raw = snmcseq_utils.load_gc_matrix_methylation(f_data_gene, f_data_cell, f_data_mc, f_data_c)\n", 178 | " print(gxc_raw.data['mc'].shape, gxc_raw.data['c'].shape)\n", 179 | " print(time.time()-ti)\n", 180 | " \n", 181 | " # output file\n", 182 | " f_hvftr_data_methylation = f_hvftr_format.format(DST_DIR, mod, 'tsv') \n", 183 | " print(time.time()-ti)\n", 184 | " \n", 185 | " # check meta cells agree with gxc cells\n", 186 | " assert np.all(meta.index.values == gxc_raw.cell)\n", 187 | " # check genes are uniq \n", 188 | " assert len(gxc_raw.gene) == len(np.unique(gxc_raw.gene)) \n", 189 | " # do\n", 190 | " gxc_hvftr = preproc_utils.preproc_methylation(\n", 191 | " gxc_raw,\n", 192 | " meta,\n", 193 | " global_value_col=settings[mod].global_mean, \n", 194 | " base_call_cutoff=20, \n", 195 | " sufficient_coverage_fraction=0.95,\n", 196 | " hv_percentile=30,\n", 197 | " n_qcut=10,\n", 198 | " )\n", 199 | " # save\n", 200 | " print(mod, \"Saving to files {}\".format(time.time()-ti))\n", 201 | "# gxc_hvftr.to_csv(f_hvftr_data_methylation, sep=\"\\t\", header=True, index=True, na_rep='NA')\n", 202 | " h5ad_mat_hvftr.write(f_hvftr_data, compression='gzip')\n", 203 | " \n", 204 | " else:\n", 205 | " # input, output files\n", 206 | " f_data = f_data_format.format(SRC_DIR, mod,) \n", 207 | " f_hvftr_data = f_hvftr_data_format.format(DST_DIR, mod) \n", 208 | " \n", 209 | " # 
read in files\n", 210 | " print(mod, \"Reading in files {}\".format(time.time()-ti))\n", 211 | " h5ad_mat = anndata.read_h5ad(f_data)\n", 212 | " gid_col, cid_col = 'ensid', ''\n", 213 | " meta, gxc_raw = basic_utils.h5ad_to_scf_rna_format(h5ad_mat, gid_col, cid_col)\n", 214 | " \n", 215 | " # check meta cells agree with gxc cells\n", 216 | " assert np.all(meta.index.values == gxc_raw.cell)\n", 217 | " # check genes are uniq \n", 218 | " assert len(gxc_raw.gene) == len(np.unique(gxc_raw.gene)) \n", 219 | " \n", 220 | " # get hvftrs\n", 221 | " print(mod, \"Preproc and get highly variable genes {}\".format(time.time()-ti))\n", 222 | " if normalization_option == 'CPM':\n", 223 | " gxc_hvftr = preproc_utils.preproc_rna_cpm_based(\n", 224 | " gxc_raw, \n", 225 | " sufficient_cell_coverage=0.01, \n", 226 | " hv_percentile=30, hv_ncut=10)\n", 227 | " elif normalization_option == 'TPM':\n", 228 | " gene_lengths = gene_lengths_base.reindex(gxc_raw.gene)\n", 229 | " gxc_hvftr = preproc_utils.preproc_rna_tpm_based(\n", 230 | " gxc_raw, gene_lengths, impute_gene_lengths=True, \n", 231 | " sufficient_cell_coverage=0.01, \n", 232 | " hv_percentile=30, hv_ncut=10)\n", 233 | " \n", 234 | " # save\n", 235 | " print(mod, \"Saving to file {}\".format(f_hvftr_data, time.time()-ti))\n", 236 | " h5ad_mat_hvftr = basic_utils.scf_rna_format_to_h5ad(meta, gxc_hvftr)\n", 237 | " h5ad_mat_hvftr.write(f_hvftr_data, compression='gzip')\n", 238 | " \n", 239 | " print(mod, \"Total time used: {}\".format(time.time()-ti))\n", 240 | " break\n", 241 | " " 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "## Check highly-variable genes" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 7, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "# for mod in mods_selected:\n", 258 | "# print(mod)\n", 259 | "# if settings[mod].mod_category == 'mc':\n", 260 | "# f_hvftr_data = f_hvftr_format.format(SRC_DIR, mod, 'tsv') \n", 261 | "# gxc_hvftr = pd.read_csv(f_hvftr_data, sep=\"\\t\", index_col=0)\n", 262 | "# print(gxc_hvftr.index.values)\n", 263 | "# print(gxc_hvftr.columns.values)\n", 264 | "# print(gxc_hvftr.shape)\n", 265 | "# has_nan = np.isnan(gxc_hvftr.values).any()\n", 266 | "# print(\"Contains NaN? {}\".format(has_nan))\n", 267 | " \n", 268 | "# continue\n", 269 | " \n", 270 | "# f_hvftr_data = f_hvftr_format.format(SRC_DIR, mod, 'npz') \n", 271 | "# f_hvftr_gene = f_hvftr_format.format(SRC_DIR, mod, 'gene') \n", 272 | "# f_hvftr_cell = f_hvftr_format.format(SRC_DIR, mod, 'cell') \n", 273 | "# gxc_hvftr = snmcseq_utils.load_gc_matrix(f_hvftr_gene, f_hvftr_cell, f_hvftr_data)\n", 274 | "# print(gxc_hvftr.gene)\n", 275 | "# print(gxc_hvftr.cell)\n", 276 | "# print(len(gxc_hvftr.gene), len(gxc_hvftr.cell), gxc_hvftr.data.shape)\n", 277 | "# has_nan = np.isnan(gxc_hvftr.data.data).any()\n", 278 | "# print(\"Contains NaN? 
{}\".format(has_nan))\n", 279 | "# # break" 280 | ] 281 | } 282 | ], 283 | "metadata": { 284 | "kernelspec": { 285 | "display_name": "Python 3", 286 | "language": "python", 287 | "name": "python3" 288 | }, 289 | "language_info": { 290 | "codemirror_mode": { 291 | "name": "ipython", 292 | "version": 3 293 | }, 294 | "file_extension": ".py", 295 | "mimetype": "text/x-python", 296 | "name": "python", 297 | "nbconvert_exporter": "python", 298 | "pygments_lexer": "ipython3", 299 | "version": "3.8.1" 300 | } 301 | }, 302 | "nbformat": 4, 303 | "nbformat_minor": 4 304 | } 305 | -------------------------------------------------------------------------------- /example-wholebrainatac/run_preproc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ga="/cndd/Public_Datasets/BICCN/BICCN2.0_whole_mouse_brain/references/refdata-gex-mm10-2020-A/genes/genes_promoter_2kb_biccn2.0.bed" 4 | 5 | ../scripts/normalize_and_select_features.py \ 6 | -i "./datasets_pre/CEMBA171206_3C_genes_promo2kb.h5ad" \ 7 | -inorm "tpm" \ 8 | -ga $ga \ 9 | -op "test_preproc_may3" \ 10 | -o "./datasets_processed" -------------------------------------------------------------------------------- /example-wholebrainatac/run_scf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ../scripts/SingleCellFusion \ 4 | -i "./datasets/10x_cells_v2.h5ad" "./datasets/snatac.h5ad" \ 5 | -im "rna" "atac" \ 6 | -f "./datasets/10x_cells_v2.h5ad" \ 7 | -op "test_april27" \ 8 | -o "./results" -------------------------------------------------------------------------------- /scf_description.rst: -------------------------------------------------------------------------------- 1 | How does SingleCellFusion work? 2 | ================================ 3 | SingleCellFusion is built around the idea that for a cell profiled by a given omics technique (RNA-sequencing, 4 | snATAC-sequencing, snmC-sequencing) there are unobserved features of that cell that if sampled would 5 | provide a fuller picture of that cell's identity. For example, if a cell underwent RNA-sequencing we know 6 | what genes are expressed but we don't know the patterns of DNA methylation in that same cell. The methylation 7 | status of DNA in that cell is unobserved, limiting our ability to fully understand the identity of that cell. 8 | 9 | In an ideal world we would obtain the transcriptome, methylome, and chromatin accessibility of a single 10 | cell at once, but as the technologies for this type of experiment develop SingleCellFusion can provide a 11 | computational equivalent. SingleCellFusion uses known relationships between different types of multiomics 12 | data to impute unobserved data, enabling the multimodal analysis of a cell's identity. 13 | 14 | The core of SingleCellFusion is the generation of a nearest neighbors graph between different data sets. 15 | This graph is generated by finding nearest neighbors using the correlation of counts at highly variable 16 | features. For example, DNA methylation is known to be negatively correlated with gene expression. If a 17 | snmC-seq profiled cell has low methylation at a number of highly variable genes, and a snRNA-seq profiled 18 | cell has high gene expression at those same genes, we can assume that those two cells likely belong to the 19 | same cell type. We use this nearest neighbors graph to generate imputed counts by averaging among a cell's 20 | neighbors in the opposite modality. 
The actions of SingleCellFusion depend on the type of nearest neighbor
21 | graph specified, and are described below.
22 | 
23 | Direct mutual nearest neighbors
24 | -------------------------------
25 | .. image:: mnn_direct.png
26 |    :width: 400
27 |    :alt: cartoon of direct MNN
28 | 
29 | In this method, highly variable features are identified in each data set. On a cell-to-cell basis, the
30 | correlation of counts at highly variable features is calculated. These correlation values are used
31 | as the distance metric for identifying mutual neighbors.
32 | 
33 | Once the correlation is calculated, neighbors across modalities are determined. We require that
34 | the two cells in each neighbor pair have high correlation with each other. In other words, an snmC-seq profiled
35 | cell can only be a neighbor of an scRNA-seq cell if the methylation levels at the highly variable
36 | features are strongly anti-correlated with gene expression at those same features in the scRNA-seq
37 | profiled cell, and vice versa. This ensures that only strong neighbors are found and that the
38 | nearest neighbors graph is not dominated by noisy or spurious correlations.
39 | 
40 | Once the neighbors graph is generated, imputed counts are computed by the following equation:
41 | 
42 | .. image:: mnn_equation.png
43 |    :width: 400
44 |    :alt: equation for imputation by MNN
45 | 
46 | For cell *j* in modality *m* which has direct mutual nearest neighbors with cells in modality
47 | *m'*, the imputed *m'* counts for feature *f* are given by the average over its *k* nearest
48 | neighbors in modality *m'*.
49 | 
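Written out (the equation image above is the authoritative form; the notation below simply restates the text), the imputed value is a plain average over mutual nearest neighbors:

.. math::

   \hat{x}^{m'}_{j,f} = \frac{1}{k} \sum_{i \in \mathrm{MNN}_{m'}(j)} x^{m'}_{i,f}

where :math:`\mathrm{MNN}_{m'}(j)` denotes the *k* mutual nearest neighbors of cell *j* in modality *m'*.
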
50 | This is the most conservative method for generating imputed counts: only cells that make direct mutual
51 | nearest neighbors will receive imputed data. This method typically leads to good integration but can
52 | result in the loss of large fractions of cells from the analysis if mutual neighbors were not found for them.
53 | 
54 | 
55 | Mutual nearest neighbors with rescue
56 | -------------------------------------
57 | .. image:: mnn_rescue.png
58 |    :width: 400
59 |    :alt: cartoon of rescue MNN
60 | 
61 | As with the direct method, the distance between a pair of cells is their correlation at
62 | highly variable genes. The only difference from the direct method is that in addition to a mutual
63 | nearest neighbors graph between modalities, a mutual nearest neighbor graph within each modality
64 | is also generated. This within-modality graph allows imputation to be performed on all cells, by
65 | using the within-modality neighbors to determine the best matched neighbors across
66 | modalities.
67 | 
68 | .. image:: rescue_equation_1.png
69 |    :width: 400
70 |    :alt: equation 1 of rescue
71 | 
72 | where
73 | 
74 | .. image:: rescue_equation_2.png
75 |    :width: 400
76 |    :alt: equation 2 of rescue
77 | 
78 | For a cell *l* in modality *m*, which has no direct mutual neighbors with cells in modality
79 | *m'*, the imputed *m'* counts for feature *f* are given by a weighted average over its *k*
80 | nearest neighbors in modality *m* which have direct mutual neighbors with cells in modality
81 | *m'*. The cells with direct mutual nearest neighbors have imputed counts per the equation in
82 | "Direct mutual nearest neighbors":
83 | 
84 | .. image:: mnn_equation.png
85 |    :width: 400
86 |    :alt: equation for imputation by MNN
87 | 
88 | The weights *A(l,j)* are determined by the distance *d(l,j)* between *l* and *j*, by the following
89 | equation:
90 | 
91 | .. image:: rescue_equation_3.png
92 |    :width: 400
93 |    :alt: equation 3 of rescue
94 | 
95 | This is a more lenient method for generating imputed counts, as all cells will receive imputed
96 | data. This method enables all cells to be analyzed, and is our recommended approach.
97 | 
98 | k-nearest neighbors
99 | -------------------
100 | .. image:: knn.png
101 |    :width: 400
102 |    :alt: cartoon of kNN
103 | 
104 | Similar to the other methods, the distance metric between a pair of cells is the correlation at
105 | highly variable features. The major difference is that each cell is required to make
106 | *k* neighbors in the opposite modality, while each cell in the opposite modality is
107 | limited to making at most *j* neighbors. The maximum number of neighbors that a cell
108 | in the opposite modality can make is given by the equation:
109 | 
110 | .. image:: n_neighbors_knn.png
111 |    :width: 200
112 |    :alt: equation 1 of knn
113 | 
114 | where *j* is the maximum number of neighbors a cell in modality *m'* can make, *k* is the required
115 | number of nearest neighbors per cell in modality *m*, *n*\ :sub:`m`\ is the number of cells in
116 | modality *m*, and *n*\ :sub:`m'`\ is the number of cells in modality *m'*. *z* is a relaxation
117 | parameter that prevents cells from becoming hyperconnected. The neighbor graph is created by randomly
118 | iterating through each cell and finding its *k* nearest neighbors among the cells still below the maximum
119 | threshold. Once the nearest neighbors graph is generated, imputed counts are generated by the same
120 | equation as in "Direct mutual nearest neighbors":
121 | 
122 | .. image:: mnn_equation.png
123 |    :width: 400
124 |    :alt: equation for imputation by MNN
125 | 
126 | This is the most lenient method for generating imputed counts, as all cells will make neighbors
127 | in the opposite data set.
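
In the implementation (``scripts/SCF_utils.py``, functions ``impute_1pair`` and ``impute_1pair_cca``) this cap is computed as sketched below; ``knn`` corresponds to *k* and ``relaxation`` to *z*, while ``n_m`` and ``n_m_prime`` are illustrative names for the two dataset sizes:

.. code-block:: python

   # cap on the number of neighbors a cell in modality m' can accept,
   # i.e. j = floor(z * k * n_m / n_m') + 1  (cf. maxk_i/maxk_j in impute_1pair)
   maxk = int((n_m / n_m_prime) * knn * relaxation) + 1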
128 | -------------------------------------------------------------------------------- /scripts/SCF_utils.py: -------------------------------------------------------------------------------- 1 | """Utilities for SingleCellFusion 2 | """ 3 | from __init__ import * 4 | 5 | import functools 6 | import collections 7 | import itertools 8 | import re 9 | from scipy import sparse 10 | from scipy.stats import zscore 11 | import fbpca 12 | import sys 13 | import logging 14 | from memory_profiler import profile 15 | from datetime import datetime 16 | 17 | import basic_utils 18 | import clst_utils 19 | 20 | ctime = datetime.now().strftime("%Y%m%d%H%M%S") 21 | f=open('memory_profile_SCFutils_{}.log'.format(ctime), 'w+') 22 | 23 | @profile(stream=f) 24 | def sparse_adj_to_mat(adjs, row_size, col_size, dists=''): 25 | """Turn a knn adjacency matrix to a sparse matrix 26 | """ 27 | n_obs, k = adjs.shape 28 | assert n_obs == row_size 29 | # row col 1/dist 30 | row_inds = np.repeat(np.arange(row_size), k) 31 | col_inds = np.ravel(adjs) 32 | if isinstance(dists, np.ndarray): 33 | assert dists.shape == adjs.shape 34 | data = np.ravel(dists) 35 | else: 36 | data = [1]*len(row_inds) 37 | knn_dist_mat = sparse.coo_matrix((data, (row_inds, col_inds)), shape=(row_size, col_size)) 38 | return knn_dist_mat 39 | 40 | # smooth-within modality 41 | @profile(stream=f) 42 | def smooth_in_modality(counts_matrix, norm_counts_matrix, k, ka, npc=100, sigma=1.0, p=0.1, drop_npc=0): 43 | """Smooth a data matrix 44 | 45 | Arguments: 46 | - counts_matrix (pandas dataframe, feature by cell) 47 | - norm_counts_matrix (pandas dataframe, feature by cell) log10(CPM+1) 48 | - k (number of nearest neighbors) 49 | Return: 50 | - smoothed cells_matrix (pandas dataframe) 51 | - markov affinity matrix 52 | """ 53 | # from sklearn.neighbors import NearestNeighbors 54 | import fbpca 55 | import clst_utils 56 | 57 | assert counts_matrix.shape[1] == norm_counts_matrix.shape[1] 58 | 59 | c = norm_counts_matrix.columns.values 60 | N = len(c) 61 | 62 | # reduce dimension fast version 63 | U, s, Vt = fbpca.pca(norm_counts_matrix.T.values, k=npc) 64 | pcs = U.dot(np.diag(s)) 65 | if drop_npc != 0: 66 | pcs = pcs[:, drop_npc:] 67 | 68 | # get k nearest neighbor distances fast version 69 | inds, dists = clst_utils.gen_knn_annoy(pcs, k, form='list', 70 | metric='euclidean', n_trees=10, search_k=-1, verbose=True, 71 | include_distances=True) 72 | 73 | # remove itself 74 | dists = dists[:, 1:] 75 | inds = inds[:, 1:] 76 | 77 | # normalize by ka's distance 78 | dists = (dists/(dists[:, ka].reshape(-1, 1))) 79 | 80 | # gaussian kernel 81 | adjs = np.exp(-((dists**2)/(sigma**2))) 82 | 83 | # construct a sparse matrix 84 | cols = np.ravel(inds) 85 | rows = np.repeat(np.arange(N), k-1) # remove itself 86 | vals = np.ravel(adjs) 87 | A = sparse.csr_matrix((vals, (rows, cols)), shape=(N, N)) 88 | 89 | # Symmetrize A (union of connection) 90 | A = A + A.T 91 | 92 | # normalization fast (A is now a weight matrix excluding itself) 93 | degrees = A.sum(axis=1) 94 | A = sparse.diags(1.0/np.ravel(degrees)).dot(A) 95 | 96 | # include itself 97 | eye = sparse.identity(N) 98 | A = p*eye + (1-p)*A 99 | 100 | # smooth fast (future?) 
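# (Note: A is row-stochastic at this point: each row mixes a cell's own profile,
# with weight p, with the degree-normalized Gaussian-kernel average of its
# nearest neighbors, with weight 1-p; the line below applies it to the
# cell-by-gene matrix to produce the smoothed profiles.)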
101 | counts_matrix_smoothed = pd.DataFrame((A.dot(counts_matrix.T)).T, 102 | columns=counts_matrix.columns, index=counts_matrix.index) 103 | return counts_matrix_smoothed, A 104 | 105 | # impute across modality 106 | @profile(stream=f) 107 | def get_constrained_knn(mat_norm_j, mat_norm_i, knn, k_saturate, knn_speed_factor=10, metric='dot', verbose=False): 108 | """Get constrained knn 109 | j <- i 110 | Look for kNN in i for each cell in j, cells in i are constrained to k_saturated 111 | 112 | get knn_speed_factor*knn number of nearest neighbors internally 113 | """ 114 | ti = time.time() 115 | assert mat_norm_i.shape[1] == mat_norm_j.shape[1] 116 | knn = int(knn) 117 | knn_speed_factor = int(knn_speed_factor) 118 | 119 | cells_i = np.arange(len(mat_norm_i)) 120 | cells_j = np.arange(len(mat_norm_j)) 121 | 122 | # record cells in j 123 | accepted_knn_ji = [] 124 | accepted_cells = [] 125 | rejected_cells = np.arange(len(cells_j)) 126 | 127 | # record cell in i 128 | n_connects = np.zeros(len(cells_i)).astype(int) # record number of connection for each cell in i 129 | unsaturated = (n_connects < k_saturate) # unsaturated bool 130 | unsaturated_cells = np.arange(len(cells_i))[unsaturated] 131 | 132 | while rejected_cells.size != 0: 133 | if verbose: 134 | print(len(rejected_cells), len(unsaturated_cells), time.time()-ti) 135 | 136 | np.random.shuffle(rejected_cells) # random order 137 | # do something to rejected cells and unsaturated cells 138 | # knn_ji # for each cell in j, its knn in i 139 | knn_ji = clst_utils.gen_knn_annoy_train_test(mat_norm_i.values[unsaturated_cells], # look for nearest neighbors in i 140 | mat_norm_j.values[rejected_cells], # for each row in j 141 | min(knn*knn_speed_factor, len(unsaturated_cells)), # 142 | form='list', # adj matrix 143 | metric=metric, # correlation 144 | n_trees=10, search_k=-1, verbose=False, 145 | include_distances=False, # for now 146 | ).astype(int) 147 | knn_ji = unsaturated_cells[knn_ji] # transform it to global index, need to check this like 148 | 149 | rejected_local_idx = [] 150 | # examine each cell in j 151 | for local_idx, cell in enumerate(rejected_cells): 152 | # get knn in i 153 | knn_in_i = knn_ji[local_idx] 154 | # filter out saturated ones 155 | knn_in_i = knn_in_i[unsaturated[knn_in_i]] 156 | 157 | if knn_in_i.size < knn: 158 | # reject 159 | rejected_local_idx.append(local_idx) 160 | else: 161 | # accept and update 162 | accepted_knn_ji.append(knn_in_i[:knn]) 163 | accepted_cells.append(cell) 164 | n_connects[knn_in_i[:knn]] += 1 165 | unsaturated = (n_connects < k_saturate) # unsaturated bool 166 | 167 | unsaturated_cells = np.arange(len(cells_i))[unsaturated] 168 | rejected_cells = rejected_cells[rejected_local_idx] 169 | # break 170 | 171 | accepted_knn_ji = pd.DataFrame(np.vstack(accepted_knn_ji), index=accepted_cells) 172 | accepted_knn_ji = accepted_knn_ji.sort_index().values 173 | 174 | return accepted_knn_ji 175 | 176 | # 177 | @profile(stream=f) 178 | def impute_1pair_cca(mod_i, mod_j, 179 | smoothed_features_i, smoothed_features_j, 180 | settings, 181 | knn, 182 | relaxation, 183 | n_cca, 184 | output_knn_mat_ij='', 185 | output_knn_mat_ji='', 186 | impute_j=True, 187 | ): 188 | """ 189 | """ 190 | # set up 191 | direct_i, direct_j = settings[mod_i].mod_direction, settings[mod_j].mod_direction 192 | 193 | mat_ii = smoothed_features_i.T # cell in mod i; gene in mod i 194 | mat_jj = smoothed_features_j.T # cell in mod j; gene in mod j 195 | 196 | genes_i = mat_ii.columns.values 197 | genes_j = mat_jj.columns.values 198 
| genes_common = np.intersect1d(genes_i, genes_j) 199 | 200 | cells_i = mat_ii.index.values 201 | cells_j = mat_jj.index.values 202 | 203 | ## CCA euclidean distance 204 | # normalize the feature matrix 205 | X = mat_ii[genes_common].T.apply(basic_utils.zscore, axis=0)*direct_i # gene by cell, zscore across genes 206 | Y = mat_jj[genes_common].T.apply(basic_utils.zscore, axis=0)*direct_j 207 | U, s, Vt = fbpca.pca(X.T.values.dot(Y.values), k=n_cca) 208 | del X, Y 209 | 210 | mat_norm_i = pd.DataFrame(U, index=mat_ii.index) 211 | maxk_i = int((len(cells_j)/len(cells_i))*knn*relaxation)+1 # max number of NN a cell in i can get 212 | mat_norm_j = pd.DataFrame(Vt.T, index=mat_jj.index) 213 | maxk_j = int((len(cells_i)/len(cells_j))*knn*relaxation)+1 # max number of NN a cell in j can get 214 | 215 | if impute_j: 216 | # knn_i and knn_j 217 | # j <- i for each j, get kNN in i 218 | knn_ji = get_constrained_knn(mat_norm_j, mat_norm_i, knn=knn, k_saturate=maxk_i, metric='euclidean') 219 | mat_knn_ji = sparse_adj_to_mat(knn_ji, len(cells_j), len(cells_i)) 220 | 221 | if output_knn_mat_ji: 222 | sparse.save_npz(output_knn_mat_ji, mat_knn_ji) 223 | 224 | # normalize 225 | degrees_j = np.ravel(mat_knn_ji.sum(axis=1)) # for each cell in j, how many cells in i it connects to 226 | mat_knn_ji = sparse.diags(1.0/(degrees_j+1e-7)).dot(mat_knn_ji) 227 | 228 | # imputation both across and within modality 229 | mat_ji = mat_knn_ji.dot(mat_ii) # cell in mod j, gene in mod i 230 | 231 | 232 | # i <- j 233 | knn_ij = get_constrained_knn(mat_norm_i, mat_norm_j, knn=knn, k_saturate=maxk_j, metric='euclidean') 234 | mat_knn_ij = sparse_adj_to_mat(knn_ij, len(cells_i), len(cells_j)) 235 | 236 | if output_knn_mat_ij: 237 | sparse.save_npz(output_knn_mat_ij, mat_knn_ij) 238 | 239 | degrees_i = np.ravel(mat_knn_ij.sum(axis=1)) # for each cell in i, how many cells in j it connects to 240 | mat_knn_ij = sparse.diags(1.0/(degrees_i+1e-7)).dot(mat_knn_ij) 241 | 242 | mat_ij = mat_knn_ij.dot(mat_jj) # cell in mod i, gene in mod j 243 | 244 | if impute_j: 245 | return mat_ij, mat_ji 246 | else: 247 | return mat_ij 248 | 249 | @profile(stream=f) 250 | def impute_1pair(mod_i, mod_j, 251 | smoothed_features_i, smoothed_features_j, 252 | settings, 253 | knn, # 20 254 | relaxation, # 3 255 | output_knn_mat_ij='', 256 | output_knn_mat_ji='', 257 | impute_j=True, 258 | ): 259 | """ 260 | """ 261 | # set up 262 | direct_i, direct_j = settings[mod_i].mod_direction, settings[mod_j].mod_direction 263 | 264 | mat_ii = smoothed_features_i.T # cell in mod i; gene in mod i 265 | mat_jj = smoothed_features_j.T # cell in mod j; gene in mod j 266 | 267 | genes_i = mat_ii.columns.values 268 | genes_j = mat_jj.columns.values 269 | genes_common = np.intersect1d(genes_i, genes_j) 270 | 271 | cells_i = mat_ii.index.values 272 | cells_j = mat_jj.index.values 273 | 274 | ## spearman correlation as distance (rank -> zscore -> (flip sign?) 
-> "dot" distance) 275 | # normalize the feature matrix 276 | mat_norm_i = (mat_ii[genes_common].rank(pct=True, axis=1) 277 | .apply(basic_utils.zscore, axis=1) 278 | *direct_i 279 | ) 280 | mat_norm_j = (mat_jj[genes_common].rank(pct=True, axis=1) 281 | .apply(basic_utils.zscore, axis=1) 282 | *direct_j 283 | ) 284 | maxk_i = int((len(cells_j)/len(cells_i))*knn*relaxation)+1 # max number of NN a cell in i can get 285 | maxk_j = int((len(cells_i)/len(cells_j))*knn*relaxation)+1 # max number of NN a cell in j can get 286 | 287 | if impute_j: 288 | # knn_i and knn_j 289 | # j <- i for each j, get kNN in i 290 | knn_ji = get_constrained_knn(mat_norm_j, mat_norm_i, knn=knn, k_saturate=maxk_i, metric='dot') 291 | mat_knn_ji = sparse_adj_to_mat(knn_ji, len(cells_j), len(cells_i)) 292 | 293 | if output_knn_mat_ji: 294 | sparse.save_npz(output_knn_mat_ji, mat_knn_ji) 295 | 296 | # normalize 297 | degrees_j = np.ravel(mat_knn_ji.sum(axis=1)) # for each cell in j, how many cells in i it connects to 298 | mat_knn_ji = sparse.diags(1.0/(degrees_j+1e-7)).dot(mat_knn_ji) 299 | 300 | # imputation both across and within modality 301 | mat_ji = mat_knn_ji.dot(mat_ii) # cell in mod j, gene in mod i 302 | 303 | 304 | # i <- j 305 | knn_ij = get_constrained_knn(mat_norm_i, mat_norm_j, knn=knn, k_saturate=maxk_j, metric='dot') 306 | mat_knn_ij = sparse_adj_to_mat(knn_ij, len(cells_i), len(cells_j)) 307 | 308 | if output_knn_mat_ij: 309 | sparse.save_npz(output_knn_mat_ij, mat_knn_ij) 310 | 311 | degrees_i = np.ravel(mat_knn_ij.sum(axis=1)) # for each cell in i, how many cells in j it connects to 312 | mat_knn_ij = sparse.diags(1.0/(degrees_i+1e-7)).dot(mat_knn_ij) 313 | 314 | mat_ij = mat_knn_ij.dot(mat_jj) # cell in mod i, gene in mod j 315 | 316 | if impute_j: 317 | return mat_ij, mat_ji 318 | else: 319 | return mat_ij 320 | 321 | @profile(stream=f) 322 | def core_scf_routine(mods_selected, features_selected, settings, 323 | metas, gxc_hvftrs, 324 | ps, drop_npcs, 325 | cross_mod_distance_measure, knn, relaxation, n_cca, 326 | npc, 327 | output_pcX_all, 328 | output_imputed_data_format, 329 | ): 330 | """smooth within modality, impute across modalities, and construct a joint PC matrix 331 | """ 332 | # GENE * CELL !!!! 
333 | smoothed_features = collections.OrderedDict() 334 | logging.info("Smoothing within modalities...") 335 | for mod in mods_selected: 336 | ti = time.time() 337 | if settings[mod].mod_category == 'mc': 338 | _df = gxc_hvftrs[mod] 339 | else: 340 | _mat = gxc_hvftrs[mod].data.todense() 341 | _df = pd.DataFrame(_mat, 342 | index=gxc_hvftrs[mod].gene, 343 | columns=gxc_hvftrs[mod].cell, 344 | ) 345 | npc = min(len(metas[mod]), npc) 346 | k_smooth = min(len(metas[mod]), 30) 347 | ka = 5 348 | if k_smooth >= 2*ka: 349 | mat_smoothed, mat_knn = smooth_in_modality(_df, _df, k=k_smooth, ka=ka, npc=npc, 350 | p=ps[settings[mod].mod_category], 351 | drop_npc=drop_npcs[settings[mod].mod_category]) 352 | smoothed_features[mod] = mat_smoothed 353 | else: 354 | smoothed_features[mod] = _df 355 | logging.info("{} finished in {} seconds".format(mod, time.time()-ti)) 356 | # delete 357 | del gxc_hvftrs[mod] 358 | 359 | # construct a joint matrix (PCA) 360 | logging.info("Constructing a joint matrix...") 361 | cells_all = np.hstack([metas[mod].index.values for mod in mods_selected]) # cell (all mods) 362 | pcX_all = [] 363 | for mod_y in features_selected: ## to 364 | logging.info("Imputing into {} space...".format(mod_y)) 365 | # get all_features 366 | X = [] 367 | for mod_x in mods_selected: 368 | logging.info("for {} cells...".format(mod_x)) 369 | if mod_x == mod_y: 370 | smoothed_yy = smoothed_features[mod_y].T # gene by cell !!! VERY IMPORTANT 371 | X.append(smoothed_yy) 372 | else: 373 | # impute x cells y space 374 | smoothed_features_x = smoothed_features[mod_x] 375 | smoothed_features_y = smoothed_features[mod_y] 376 | if cross_mod_distance_measure == 'correlation': 377 | imputed_xy = impute_1pair(mod_x, mod_y, 378 | smoothed_features_x, smoothed_features_y, 379 | settings, 380 | knn=knn, 381 | relaxation=relaxation, 382 | impute_j=False, 383 | ) 384 | elif cross_mod_distance_measure == 'cca': 385 | imputed_xy = impute_1pair_cca(mod_x, mod_y, 386 | smoothed_features_x, smoothed_features_y, 387 | settings, 388 | knn=knn, 389 | relaxation=relaxation, 390 | n_cca=n_cca, 391 | impute_j=False, 392 | ) 393 | else: 394 | raise ValueError("Choose from correlation and cca") 395 | X.append(imputed_xy) 396 | X = np.vstack(X) # cell (all mods) by gene (mod_y) 397 | # save X (imputed counts; for debuggng only) 398 | if len(output_imputed_data_format)>0: 399 | np.save(output_imputed_data_format.format(mod_y), X) 400 | # PCA 401 | U, s, V = fbpca.pca(X, npc) 402 | del X 403 | pcX = U.dot(np.diag(s)) 404 | # normalize PCs 405 | sigma = np.sqrt(np.sum(s*s)/(pcX.shape[0]*pcX.shape[1])) 406 | pcX = pcX/sigma 407 | pcX_all.append(pcX) 408 | 409 | pcX_all = np.hstack(pcX_all) 410 | # save pcX_all 411 | df_pcX = pd.DataFrame( 412 | pcX_all, 413 | index=cells_all, 414 | columns=['PC'+str(i+1) for i in np.arange(pcX_all.shape[1])], 415 | ) 416 | df_pcX.index.name = 'cell_id' 417 | df_pcX.to_csv( 418 | output_pcX_all, 419 | sep='\t', index=True, header=True, 420 | ) 421 | logging.info("Saved output to: {}".format(output_pcX_all)) 422 | return pcX_all, cells_all 423 | 424 | @profile(stream=f) 425 | def clustering_umap_routine(pcX_all, cells_all, mods_selected, metas, 426 | resolutions, k, 427 | umap_neighbors, min_dist, 428 | output_clst_and_umap, 429 | use_netUMAP=False, 430 | use_tsne=False, 431 | cluster_only=False, 432 | ): 433 | """ 434 | """ 435 | # clustering 436 | df_clsts = [] 437 | for resolution in resolutions: 438 | logging.info('resolution r: {}'.format(resolution)) 439 | df_clst = 
clst_utils.clustering_routine( 440 | pcX_all, 441 | cells_all, k, 442 | resolution=resolution, 443 | metric='euclidean', option='plain', n_trees=10, search_k=-1, verbose=False) 444 | df_clsts.append(df_clst.rename(columns={'cluster': 445 | 'cluster_joint_r{}'.format(resolution) 446 | })) 447 | df_clst = pd.concat(df_clsts, axis=1) 448 | 449 | df_summary = df_clst 450 | # umap 451 | if not cluster_only: 452 | df_embed = clst_utils.run_umap_lite( 453 | pcX_all, 454 | cells_all, 455 | n_neighbors=umap_neighbors, min_dist=min_dist, n_dim=2, 456 | random_state=1, 457 | use_netUMAP=use_netUMAP, 458 | use_tsne=use_tsne, 459 | ) 460 | df_summary = df_summary.join(df_embed) 461 | # add dataset info 462 | df_summary['dataset'] = '' 463 | for mod in mods_selected: 464 | _cells = metas[mod].index.values 465 | df_summary.loc[_cells, 'dataset'] = mod 466 | # name 467 | df_summary.index.name = 'cell_id' 468 | # save results 469 | df_summary.to_csv( 470 | output_clst_and_umap, 471 | sep='\t', header=True, index=True, 472 | ) 473 | return df_summary 474 | -------------------------------------------------------------------------------- /scripts/SingleCellFusion: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """SingleCellFusion main routine""" 3 | 4 | from __init__ import * 5 | # public packages 6 | import collections 7 | import os 8 | import warnings 9 | with warnings.catch_warnings(): 10 | warnings.filterwarnings("ignore", category=FutureWarning) 11 | import anndata 12 | 13 | import logging 14 | from memory_profiler import profile 15 | from datetime import datetime 16 | 17 | # scripts from this package 18 | import cli_parser 19 | import basic_utils 20 | import SCF_utils 21 | 22 | ctime = datetime.now().strftime("%Y%m%d%H%M%S") 23 | f=open('memory_profile_SCF_{}.log'.format(ctime), 'w+') 24 | @profile(stream=f) 25 | def main(): 26 | parser = cli_parser.create_parser() 27 | args = parser.parse_args() 28 | 29 | log = basic_utils.create_logger() 30 | logging.info('* Parsing Command Line Arguments') 31 | 32 | # specify output filenames 33 | outdir = args.output_dir 34 | if not os.path.isdir(outdir): 35 | os.makedirs(outdir) 36 | name = args.output_prefix 37 | 38 | output_clst_and_umap = outdir + '/{}_assigned_clusters_embeddings.tsv.gz'.format(name) 39 | output_pcX_all = outdir + '/{}_principal_components.tsv.gz'.format(name) 40 | output_figures = outdir + '/{}_{{}}.{{}}'.format(name) 41 | 42 | ### --- outputs for debugging only 43 | output_imputed_data_format = '' # leave it blank or set it to be: outdir + '/{}_imputed_data_{{}}.npy'.format(name) 44 | # output_cluster_centroids = outdir + '/{}_centroids.pkl'.format(name) # not used 45 | ### --- end 46 | 47 | # get input files, modaltiies (internal rep of input files), and feature datasets 48 | data_files = args.input_datasets 49 | feature_files = args.feature_datasets 50 | mods_selected = [cli_parser.parse_filename(data_file) for data_file in data_files] 51 | features_selected = [cli_parser.parse_filename(data_file) for data_file in feature_files] 52 | for features_modality in features_selected: 53 | assert (features_modality in mods_selected) 54 | 55 | # get dataset metadata 56 | mod_catgories = args.input_modalities 57 | assert len(mod_catgories) == len(data_files) 58 | 59 | for mod_category in mod_catgories: 60 | assert (mod_category in ['mc', 'atac', 'rna']) 61 | settings = collections.OrderedDict() 62 | Mod_info = collections.namedtuple('Mod_info', ['mod', 'mod_category', 'mod_direction',]) 
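# Build per-dataset settings: each modality keeps its category ('mc'/'atac'/'rna')
# and a sign ("direction") used when correlating features across modalities;
# per the docs, mC is anti-correlated with expression, so its direction is
# expected to flip the sign relative to rna/atac.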
63 | for mod, mod_category in zip(mods_selected, mod_catgories): 64 | mod_direction = cli_parser.modality_default_options(mod_category) 65 | settings[mod] = Mod_info(mod, mod_category, mod_direction,) 66 | 67 | # parameters 68 | # Within modality 69 | ps = { 70 | 'rna': args.smoothing_fractions[0], 71 | 'atac': args.smoothing_fractions[1], 72 | 'mc': args.smoothing_fractions[2], 73 | } 74 | 75 | # across modality 76 | knn = args.nearest_neighbors 77 | relaxation = args.relaxation 78 | # PCA 79 | npc = args.num_pcs 80 | # clustering 81 | k = args.leiden_n_neighbors 82 | resolutions = args.leiden_resolutions 83 | # umap 84 | umap_neighbors = args.umap_n_neighbors 85 | min_dist = args.umap_min_dist 86 | 87 | # precomputed_pca (skip integration) 88 | precomputed_pca_file = args.precomputed_pca_file 89 | # use netUMAP 90 | use_netUMAP = args.use_netUMAP 91 | use_tsne = args.use_tsne 92 | 93 | ### --- deprecated arguments (for testing; not open to general users) 94 | n_cca = 0 # deprecated args.n_cca 95 | drop_npcs = { 96 | 'mc': 0, 97 | 'rna': 0, 98 | 'atac': 0, 99 | } 100 | cross_mod_distance_measure = 'correlation' # or 'cca' 101 | ### --- end of deprecation 102 | logging.info( 103 | "knn = {}\n".format(knn) + 104 | "relaxation = {}\n".format(relaxation) + 105 | "number of PCs = {}\n".format(npc) + 106 | "ps = {}\n".format(ps) + 107 | "umap_n_neighbors = {}\n".format(umap_neighbors) + 108 | "umap_min_dist = {}\n".format(min_dist) + 109 | "leiden_resolutions = {}\n".format(resolutions) + 110 | "leiden_n_neighbors = {}\n".format(k) 111 | ) 112 | 113 | # ## Read in data 114 | logging.info('* Begin integration') 115 | ### read in data (h5ad) 116 | metas = collections.OrderedDict() 117 | gxc_hvftrs = collections.OrderedDict() 118 | for mod, _file in zip(mods_selected, data_files): 119 | logging.info("processing {}".format(mod)) 120 | # read 121 | logging.info("reading {}".format(_file)) 122 | h5ad_mat = anndata.read_h5ad(_file) 123 | h5ad_mat.obs.index = [cell+"_"+mod for cell in h5ad_mat.obs.index] # resolve possible cellid conflict across datasets 124 | 125 | if settings[mod].mod_category == 'mc': 126 | # convert 127 | meta, mat = basic_utils.h5ad_to_scf_mc_format(h5ad_mat) 128 | assert np.all(mat.columns.values == meta.index.values) # make sure cell name is in the sanme order as metas (important if save knn mat) 129 | logging.info("{} genes, {} cells in the feature matrix".format(*mat.shape)) 130 | 131 | metas[mod] = meta 132 | gxc_hvftrs[mod] = mat 133 | 134 | else: 135 | # convert 136 | meta, gc_mat = basic_utils.h5ad_to_scf_rna_format(h5ad_mat) 137 | assert np.all(gc_mat.cell == meta.index.values) # make sure cell name is in the sanme order as metas (important if save knn mat) 138 | logging.info("{} genes, {} cells in the feature matrix".format(*gc_mat.data.shape)) 139 | 140 | metas[mod] = meta 141 | gxc_hvftrs[mod] = gc_mat 142 | 143 | logging.info('Done reading data') 144 | 145 | # ## run SCF to get integrated PCA 146 | if os.path.isfile(precomputed_pca_file): 147 | logging.info('Loading precomputed PCA matrix') 148 | precomputed_pca_df = pd.read_csv(precomputed_pca_file, sep='\t', index_col=0) 149 | pcX_all = precomputed_pca_df.values 150 | cells_all = precomputed_pca_df.index.values 151 | else: 152 | pcX_all, cells_all = SCF_utils.core_scf_routine(mods_selected, features_selected, settings, 153 | metas, gxc_hvftrs, 154 | ps, drop_npcs, 155 | cross_mod_distance_measure, knn, relaxation, n_cca, 156 | npc, 157 | output_pcX_all, 158 | output_imputed_data_format, 159 | ) 160 | 
logging.info('Done integration into a common PC space') 161 | 162 | # run clustering and imputation 163 | df_summary = SCF_utils.clustering_umap_routine(pcX_all, cells_all, mods_selected, metas, 164 | resolutions, k, 165 | umap_neighbors, min_dist, 166 | output_clst_and_umap, 167 | use_netUMAP=use_netUMAP, 168 | use_tsne=use_tsne, 169 | ) 170 | logging.info('Done clustering and UMAP') 171 | 172 | if __name__ == "__main__": 173 | main() -------------------------------------------------------------------------------- /scripts/SingleCellFusion_prep: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding: utf-8 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from scipy import sparse 7 | import time 8 | import re 9 | import warnings 10 | with warnings.catch_warnings(): 11 | warnings.filterwarnings("ignore", category=FutureWarning) 12 | 13 | import anndata 14 | import scanpy 15 | import logging 16 | import os 17 | 18 | from __init__ import * 19 | import basic_utils 20 | import preproc_utils 21 | import cli_parser 22 | 23 | def get_gene_annotation(gene_annotation_file): 24 | """ 25 | """ 26 | genes = pd.read_csv( 27 | gene_annotation_file, 28 | sep='\t', 29 | header=None, 30 | usecols=[0,1,2,3], 31 | ).rename(columns={ 32 | 0: 'chr', 33 | 1: 'start', 34 | 2: 'end', 35 | 3: 'ensid', 36 | }) 37 | return genes 38 | 39 | def preproc( 40 | f_data, 41 | f_hvftr_data, 42 | normalization_option, 43 | sub_n=None, 44 | sub_frac=None, 45 | f_cov_data='', 46 | gene_lengths_base='', # required if normalization option == "tpm" 47 | gid_col='', 48 | cid_col='', 49 | global_mean_mc_col='', # required if normalization option == 'mc' 50 | ): 51 | """Generate normalized HVG matrices from raw count matrices 52 | 53 | normalization_option == 'mc' needs f_cov_data 54 | """ 55 | # # highly variable features 56 | ti = time.time() 57 | logging.info("Preprocessing") 58 | 59 | # read data matrix 60 | if normalization_option == 'mc': 61 | # read in files 62 | logging.info("Reading in file {}".format(f_data)) 63 | h5ad_mat = anndata.read_h5ad(f_data) 64 | ### subsampling ### 65 | if sub_n is not None or sub_frac is not None: 66 | logging.info("Subsampling to n={} frac={}".format(sub_n, sub_frac)) 67 | scanpy.pp.subsample(h5ad_mat, n_obs=sub_n, fraction=sub_frac, random_state=0) 68 | ### end of subsampling ### 69 | logging.info("matrix size = {}".format(h5ad_mat.shape)) 70 | meta, mat_mc = basic_utils.h5ad_to_scf_mc_format(h5ad_mat) 71 | 72 | logging.info("Reading in file {}".format(f_cov_data)) 73 | h5ad_mat = anndata.read_h5ad(f_cov_data) 74 | ### subsampling ### 75 | if sub_n is not None or sub_frac is not None: 76 | logging.info("Subsampling to n={} frac={}".format(sub_n, sub_frac)) 77 | scanpy.pp.subsample(h5ad_mat, n_obs=sub_n, fraction=sub_frac, random_state=0) 78 | ### end of subsampling ### 79 | logging.info("matrix size = {}".format(h5ad_mat.shape)) 80 | meta, mat_c = basic_utils.h5ad_to_scf_mc_format(h5ad_mat) 81 | 82 | assert mat_mc.shape == mat_c.shape 83 | assert np.all(mat_mc.values <= mat_c.values) 84 | 85 | gxc_raw = GC_matrix( 86 | mat_mc.index.values, 87 | mat_mc.columns.values, 88 | {'mc': mat_mc.values, 'c': mat_c.values}, 89 | ) 90 | 91 | # check meta cells agree with gxc cells 92 | assert np.all(meta.index.values == gxc_raw.cell) 93 | # check genes are uniq 94 | assert len(gxc_raw.gene) == len(np.unique(gxc_raw.gene)) 95 | # check cells are uniq 96 | assert len(gxc_raw.cell) == len(np.unique(gxc_raw.cell)) 97 | 98 | # do 99 | 
gxc_hvftr = preproc_utils.preproc_methylation( 100 | gxc_raw, 101 | meta, 102 | global_value_col=global_mean_mc_col, 103 | base_call_cutoff=20, 104 | sufficient_coverage_fraction=0.95, 105 | hv_percentile=30, 106 | n_qcut=10, 107 | ) 108 | 109 | # save 110 | logging.info("Saving to file {}".format(f_hvftr_data)) 111 | h5ad_mat_hvftr = basic_utils.scf_mc_format_to_h5ad(meta, gxc_hvftr) 112 | h5ad_mat_hvftr.write(f_hvftr_data, compression='gzip') 113 | 114 | else: 115 | # read in files 116 | logging.info("Reading in file {}".format(f_data)) 117 | h5ad_mat = anndata.read_h5ad(f_data) 118 | ### subsampling ### 119 | if sub_n is not None or sub_frac is not None: 120 | logging.info("Subsampling to n={} frac={}".format(sub_n, sub_frac)) 121 | scanpy.pp.subsample(h5ad_mat, n_obs=sub_n, fraction=sub_frac, random_state=0) 122 | ### end of subsampling ### 123 | logging.info("matrix size = {}".format(h5ad_mat.shape)) 124 | if tosparse: 125 | h5ad_mat.X = sparse.coo_matrix(h5ad_mat.X) 126 | meta, gxc_raw = basic_utils.h5ad_to_scf_rna_format(h5ad_mat, gid_col, cid_col) 127 | 128 | # check meta cells agree with gxc cells 129 | assert np.all(meta.index.values == gxc_raw.cell) 130 | # check genes are uniq 131 | assert len(gxc_raw.gene) == len(np.unique(gxc_raw.gene)) 132 | # check cells are uniq 133 | assert len(gxc_raw.cell) == len(np.unique(gxc_raw.cell)) 134 | 135 | # get hvftrs 136 | logging.info("Preproc and get highly variable genes {}".format(f_data)) 137 | if normalization_option == 'cpm': 138 | gxc_hvftr = preproc_utils.preproc_rna_cpm_based( 139 | gxc_raw, 140 | sufficient_cell_coverage=0.01, 141 | hv_percentile=30, hv_ncut=10) 142 | elif normalization_option == 'tpm': 143 | gene_lengths = gene_lengths_base.reindex(gxc_raw.gene) 144 | gxc_hvftr = preproc_utils.preproc_rna_tpm_based( 145 | gxc_raw, gene_lengths, impute_gene_lengths=True, 146 | sufficient_cell_coverage=0.01, 147 | hv_percentile=30, hv_ncut=10) 148 | 149 | # save 150 | logging.info("Saving to file {}".format(f_hvftr_data)) 151 | h5ad_mat_hvftr = basic_utils.scf_rna_format_to_h5ad(meta, gxc_hvftr) 152 | h5ad_mat_hvftr.write(f_hvftr_data, compression='gzip') 153 | return 154 | 155 | if __name__ == "__main__": 156 | log = basic_utils.create_logger() 157 | 158 | parser = cli_parser.create_parser_preproc() 159 | args = parser.parse_args() 160 | logging.info('* Parsing Command Line Arguments') 161 | 162 | # get input files 163 | data_files = args.input_datasets 164 | data_cov_files = args.input_datasets_coverage 165 | mods_selected = [cli_parser.parse_filename(data_file) for data_file in data_files] 166 | gid_col = args.geneid_column 167 | cid_col = args.cellid_column 168 | global_mean_mc_col = args.global_mean_mc_column 169 | tosparse = args.tosparse 170 | 171 | # specify output files 172 | outdir = args.output_dir 173 | if not os.path.isdir(outdir): 174 | os.makedirs(outdir) 175 | outprefix = args.output_prefix 176 | 177 | output_files = [ 178 | os.path.join(outdir, "{}_{}".format(outprefix, os.path.basename(input_file))) 179 | for input_file in data_files 180 | ] 181 | 182 | # parameters 183 | gene_annotation_file = args.gene_annotation_file 184 | 185 | # get dataset normalizations 186 | input_normalizations = args.input_normalizations 187 | 188 | # subsampling 189 | sub_n = args.sub_n 190 | sub_frac = args.sub_frac 191 | 192 | # check and set up 193 | gene_lengths_base = '' 194 | for option in input_normalizations: 195 | assert (option in ['mc', 'cpm', 'tpm']) 196 | if option == 'mc': 197 | assert len(data_cov_files) == 
len(data_files) 198 | elif option == 'tpm': 199 | assert gene_annotation_file 200 | df_genes = get_gene_annotation(gene_annotation_file).set_index('ensid') 201 | gene_lengths_base = (df_genes['end'] - df_genes['start']) 202 | 203 | for i, (data_file, output_file, norm_option) in enumerate(zip( 204 | data_files, output_files, input_normalizations 205 | )): 206 | 207 | if norm_option == 'mc': 208 | data_cov_file = data_cov_files[i] 209 | else: 210 | data_cov_file = '' 211 | 212 | preproc( 213 | data_file, 214 | output_file, 215 | norm_option, 216 | sub_n=sub_n, 217 | sub_frac=sub_frac, 218 | gene_lengths_base=gene_lengths_base, # required if normalization option == "tpm" 219 | f_cov_data=data_cov_file, 220 | gid_col=gid_col, 221 | cid_col=cid_col, 222 | global_mean_mc_col=global_mean_mc_col, 223 | ) -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- 1 | """Import commonly used libraries""" 2 | 3 | import time 4 | import logging 5 | import glob 6 | import os 7 | import numpy as np 8 | import pandas as pd 9 | import collections 10 | # from natsort import natsorted 11 | 12 | # matplotlib 13 | import matplotlib as mpl 14 | import matplotlib.pyplot as plt 15 | mpl.rcParams['pdf.fonttype'] = 42 # editable text in matplotlib 16 | mpl.rcParams['svg.fonttype'] = 'none' 17 | 18 | import matplotlib.ticker as mtick 19 | PercentFormat = mtick.FuncFormatter(lambda y, _: '{:.3%}'.format(y)) 20 | ScalarFormat = mtick.ScalarFormatter() 21 | 22 | # seaborn 23 | import seaborn as sns 24 | sns.set_style('ticks', rc={'axes.grid':True}) 25 | sns.set_context('talk') 26 | 27 | # data structures 28 | GC_matrix = collections.namedtuple('GC_matrix', ['gene', 'cell', 'data']) 29 | -------------------------------------------------------------------------------- /scripts/basic_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | """ 3 | from __init__ import * 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | import os 9 | from scipy import sparse 10 | import anndata 11 | import logging 12 | 13 | def get_size_in_GB(obj): 14 | """""" 15 | GB = 1024**3 16 | return sys.getsizeof(obj)/GB 17 | 18 | def scf_rna_format_to_h5ad(meta, gc_mat): 19 | """ 20 | input: 21 | - meta (cell metadata) 22 | - gc_mat 23 | 24 | output: 25 | - anndata 26 | """ 27 | X = gc_mat.data.T # cell by gene [scipy sparse matrix] 28 | obs = meta # cell annotation [pandas dataframe] 29 | var = pd.DataFrame(index=gc_mat.gene) # gene annotation [pandas dataframe] 30 | 31 | h5ad_mat = anndata.AnnData(X, obs, var,) 32 | 33 | return h5ad_mat 34 | 35 | def scf_mc_format_to_h5ad(meta, mat): 36 | """ 37 | input: 38 | - meta (cell metadata) 39 | - mat 40 | 41 | output: 42 | - anndata 43 | """ 44 | X = mat.T.values # cell by gene [numpy array] 45 | obs = meta # cell annotation [pandas dataframe] 46 | var = pd.DataFrame(index=mat.index) # gene annotation [pandas dataframe] 47 | 48 | h5ad_mat = anndata.AnnData(X, obs, var,) 49 | 50 | return h5ad_mat 51 | 52 | def h5ad_to_scf_rna_format(h5ad_mat, gid_col='', cid_col=''): 53 | """ 54 | input: 55 | - anndata 56 | output: 57 | - meta (cell metadata) 58 | - gc_mat 59 | 60 | """ 61 | meta = h5ad_mat.obs 62 | if gid_col: 63 | genes = h5ad_mat.var[gid_col].values 64 | else: 65 | genes = h5ad_mat.var.index.values 66 | if cid_col: 67 | cells = h5ad_mat.obs[cid_col].values 68 | meta = meta.set_index(cid_col) 69 | else: 70 | cells = 
h5ad_mat.obs.index.values 71 | 72 | gc_mat = GC_matrix(genes, 73 | cells, 74 | h5ad_mat.X.T, 75 | ) 76 | return meta, gc_mat 77 | 78 | def h5ad_to_scf_mc_format(h5ad_mat, gid_col='', cid_col=''): 79 | """ 80 | input: 81 | - anndata 82 | output: 83 | - meta (cell metadata) 84 | - pandas data frame 85 | """ 86 | 87 | meta = h5ad_mat.obs 88 | if gid_col: 89 | genes = h5ad_mat.var[gid_col].values 90 | else: 91 | genes = h5ad_mat.var.index.values 92 | if cid_col: 93 | cells = h5ad_mat.obs[cid_col].values 94 | meta = meta.set_index(cid_col) 95 | else: 96 | cells = h5ad_mat.obs.index.values 97 | mat = pd.DataFrame(h5ad_mat.X.T, 98 | index=genes, 99 | columns=cells, 100 | ) 101 | return meta, mat 102 | 103 | def diag_matrix(X, rows=np.array([]), cols=np.array([]), threshold=None): 104 | """Diagonalize a matrix as much as possible 105 | """ 106 | di, dj = X.shape 107 | transposed = 0 108 | 109 | if di > dj: 110 | di, dj = dj, di 111 | X = X.T.copy() 112 | rows, cols = cols.copy(), rows.copy() 113 | transposed = 1 114 | 115 | # start (di <= dj) 116 | new_X = X.copy() 117 | new_rows = rows.copy() 118 | new_cols = cols.copy() 119 | if new_rows.size == 0: 120 | new_rows = np.arange(di) 121 | if new_cols.size == 0: 122 | new_cols = np.arange(dj) 123 | 124 | # bring the greatest values in the lower right matrix to diagnal position 125 | for idx in range(min(di, dj)): 126 | 127 | T = new_X[idx: , idx: ] 128 | i, j = np.unravel_index(T.argmax(), T.shape) # get the coords of the max element of T 129 | 130 | if threshold and T[i, j] < threshold: 131 | dm = idx # new_X[:dm, :dm] is done (0, 1, ..., dm-1) excluding dm 132 | break 133 | else: 134 | dm = idx+1 # new_X[:dm, :dm] will be done 135 | 136 | # swap row idx, idx+i 137 | tmp = new_X[idx, :].copy() 138 | new_X[idx, :] = new_X[idx+i, :].copy() 139 | new_X[idx+i, :] = tmp 140 | 141 | tmp = new_rows[idx] 142 | new_rows[idx] = new_rows[idx+i] 143 | new_rows[idx+i] = tmp 144 | 145 | # swap col idx, idx+j 146 | tmp = new_X[:, idx].copy() 147 | new_X[:, idx] = new_X[:, idx+j].copy() 148 | new_X[:, idx+j] = tmp 149 | 150 | tmp = new_cols[idx] 151 | new_cols[idx] = new_cols[idx+j] 152 | new_cols[idx+j] = tmp 153 | 154 | # 155 | if dm == dj: 156 | pass 157 | elif dm < dj: # free columns 158 | 159 | col_dict = {} 160 | sorted_col_idx = np.arange(dm) 161 | free_col_idx = np.arange(dm, dj) 162 | linked_rowcol_idx = new_X[:, dm:].argmax(axis=0) 163 | 164 | for col in sorted_col_idx: 165 | col_dict[col] = [col] 166 | for col, key in zip(free_col_idx, linked_rowcol_idx): 167 | if key < dm: 168 | col_dict[key] = col_dict[key] + [col] 169 | else: 170 | col_dict[key] = [col] 171 | 172 | 173 | new_col_order = np.hstack([col_dict[key] for key in sorted(col_dict.keys())]) 174 | 175 | # update new_X new_cols 176 | new_X = new_X[:, new_col_order].copy() 177 | new_cols = new_cols[new_col_order] 178 | else: 179 | raise ValueError("Unexpected situation: dm > dj") 180 | 181 | if transposed: 182 | new_X = new_X.T 183 | new_rows, new_cols = new_cols, new_rows 184 | return new_X, new_rows, new_cols 185 | 186 | def diag_matrix_rows(X, rows=np.array([]), cols=np.array([]),): 187 | """Diagonalize a matrix as much as possible by only rearrange rows 188 | """ 189 | di, dj = X.shape 190 | 191 | new_X = X.copy() 192 | new_rows = rows.copy() 193 | new_cols = cols.copy() 194 | 195 | # free to move rows 196 | row_dict = {} 197 | free_row_idx = np.arange(di) 198 | linked_rowcol_idx = new_X.argmax(axis=1) # the column with max value for each row 199 | 200 | for row, key in zip(free_row_idx, 
200 |     for row, key in zip(free_row_idx, linked_rowcol_idx):
201 |         if key in row_dict.keys():
202 |             row_dict[key] = row_dict[key] + [row]
203 |         else:
204 |             row_dict[key] = [row]
205 | 
206 |     new_row_order = np.hstack([row_dict[key] for key in sorted(row_dict.keys())])
207 |     # update new_X new_cols
208 |     new_X = new_X[new_row_order, :].copy()
209 |     new_rows = new_rows[new_row_order]
210 | 
211 |     return new_X, new_rows, new_cols
212 | 
213 | def get_grad_colors(n, cmap='copper'):
214 |     """Generate n colors from a given colormap (a matplotlib.cm)
215 |     """
216 |     from matplotlib import cm
217 |     cmap = cm.get_cmap(cmap)
218 |     return [cmap(int(i)) for i in np.linspace(0, 255, n)]
219 | 
220 | def logcpm(counts):
221 |     """
222 |     Args:
223 |         - gene-cell matrix
224 |     """
225 |     cov = counts.sum(axis=0)
226 |     logcpm = np.log10(counts.divide(cov, axis=1)*1000000 + 1)
227 |     return logcpm
228 | 
229 | def logtpm(counts, gene_lengths):
230 |     """
231 |     Args:
232 |         - gene-cell matrix
233 |         - gene_lengths: a series indexed by gene_id
234 |     """
235 |     tpm = counts.divide(gene_lengths.loc[counts.index], axis=0)
236 |     cov = tpm.sum(axis=0)
237 |     logtpm = np.log10((tpm.divide(cov, axis=1))*1000000 + 1)
238 |     return logtpm
239 | 
240 | def sparse_logcpm(gc_matrix, mode='logcpm', lib_size=[]):
241 |     """
242 |     """
243 |     lib_size = np.array(lib_size)
244 |     if np.size(lib_size) == 0:
245 |         lib_size = gc_matrix.data.sum(axis=0)
246 | 
247 |     lib_size_inv = sparse.diags(np.ravel(1.0/(1e-7+lib_size)))
248 |     cpm = (gc_matrix.data).dot(lib_size_inv*1e6).tocoo()
249 | 
250 |     if mode == 'logcpm':
251 |         cpm.data = np.log10(cpm.data + 1)
252 |     elif mode == 'cpm':
253 |         pass
254 | 
255 |     gc_cpm = GC_matrix(
256 |         gc_matrix.gene,
257 |         gc_matrix.cell,
258 |         cpm,
259 |     )
260 | 
261 |     return gc_cpm
262 | 
263 | def sparse_logtpm(gc_matrix, gene_lengths):
264 |     """
265 |     gene_lengths: array like
266 | 
267 |     """
268 |     gene_lengths = np.array(gene_lengths)
269 |     gene_length_inv = sparse.diags(np.ravel(1.0/gene_lengths))
270 |     tmp = (gene_length_inv).dot(gc_matrix.data).tocoo()
271 |     lib_size_inv = sparse.diags(np.ravel(1.0/tmp.sum(axis=0)))
272 | 
273 |     logtpm = tmp.dot(lib_size_inv*1e6).tocoo()
274 |     logtpm.data = np.log10(logtpm.data + 1)
275 | 
276 |     gc_logtpm = GC_matrix(
277 |         gc_matrix.gene,
278 |         gc_matrix.cell,
279 |         logtpm,
280 |     )
281 | 
282 |     return gc_logtpm
283 | 
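# Illustrative sketch (hypothetical toy numbers, not part of the original module):
# sparse_logcpm() rescales each cell (column) to counts-per-million and then takes
# log10(x+1). A tiny worked check:
def _demo_sparse_logcpm():
    counts = sparse.csr_matrix(np.array([[2, 0], [8, 10]]))  # gene-by-cell
    gc = GC_matrix(np.array(['g1', 'g2']), np.array(['c1', 'c2']), counts)
    out = sparse_logcpm(gc, mode='logcpm')
    dense = np.array(out.data.todense())
    # cell c1 has 10 counts total: 2 counts -> log10((2/10)*1e6 + 1) ~ 5.30
    assert np.isclose(dense[0, 0], np.log10(2e5 + 1))
    return out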
284 | class cd:
285 |     """Context manager for changing the current working directory"""
286 |     def __init__(self, newPath):
287 |         self.newPath = os.path.expanduser(newPath)
288 | 
289 |     def __enter__(self):
290 |         self.savedPath = os.getcwd()
291 |         os.chdir(self.newPath)
292 | 
293 |     def __exit__(self, etype, value, traceback):
294 |         os.chdir(self.savedPath)
295 | 
296 | def create_logger(name='log'):
297 |     """
298 |     args: logger name
299 | 
300 |     return: a logger object
301 |     """
302 |     logging.basicConfig(
303 |         format='%(asctime)s %(message)s',
304 |         datefmt='%m/%d/%Y %I:%M:%S %p',
305 |         level=logging.INFO)
306 |     return logging.getLogger(name)
307 | 
308 | def set_value_by_percentile(this, lo, hi):
309 |     """set `this` below or above percentiles to given values
310 |     this (float)
311 |     lo (float)
312 |     hi (float)
313 |     """
314 |     if this < lo:
315 |         return lo
316 |     elif this > hi:
317 |         return hi
318 |     else:
319 |         return this
320 | 
321 | def mcc_percentile_norm(mcc, low_p=5, hi_p=95):
322 |     """
323 |     set values above and below specific percentiles to be at the value of percentiles
324 | 
325 |     args: mcc, low_p, hi_p
326 | 
327 |     return: normalized mcc levels
328 |     """
329 |     # mcc_norm = [np.isnan(mcc) for mcc_i in list(mcc)]
330 |     mcc_norm = np.copy(mcc)
331 |     mcc_norm = mcc_norm[~np.isnan(mcc_norm)]
332 | 
333 |     lo = np.percentile(mcc_norm, low_p)
334 |     hi = np.percentile(mcc_norm, hi_p)
335 | 
336 |     mcc_norm = [set_value_by_percentile(mcc_i, lo, hi) for mcc_i in list(mcc)]
337 |     mcc_norm = np.array(mcc_norm)
338 | 
339 |     return mcc_norm
340 | 
341 | def plot_tsne_values(df, tx='tsne_x', ty='tsne_y', tc='mCH',
342 |                      low_p=5, hi_p=95,
343 |                      s=2,
344 |                      cbar_label=None,
345 |                      output=None, show=True, close=False,
346 |                      t_xlim='auto', t_ylim='auto', title=None, figsize=(8,6), **kwargs):
347 |     """
348 |     tSNE plot
349 | 
350 |     xlim, ylim is set to facilitate displaying glial clusters only
351 | 
352 |     """
353 |     import matplotlib.pyplot as plt
354 |     import seaborn as sns
355 | 
356 |     fig, ax = plt.subplots(figsize=figsize)
357 | 
358 |     im = ax.scatter(df[tx], df[ty], s=s,
359 |                     c=mcc_percentile_norm(df[tc].values, low_p=low_p, hi_p=hi_p), **kwargs)
360 |     if title:
361 |         ax.set_title(title)
362 |     else:
363 |         ax.set_title(tc)
364 |     ax.set_xlabel(tx)
365 |     ax.set_ylabel(ty)
366 |     # ax.set_aspect('auto')
367 | 
368 | 
369 |     clb = plt.colorbar(im, ax=ax)
370 |     if cbar_label:
371 |         clb.set_label(cbar_label, rotation=270, labelpad=10)
372 | 
373 |     if t_xlim == 'auto':
374 |         t_xlim = [np.nanpercentile(df[tx].values, 0.1), np.nanpercentile(df[tx].values, 99.9)]
375 |         t_xlim[0] = t_xlim[0] - 0.1*(t_xlim[1] - t_xlim[0])
376 |         t_xlim[1] = t_xlim[1] + 0.1*(t_xlim[1] - t_xlim[0])
377 |         ax.set_xlim(t_xlim)
378 |     elif t_xlim:
379 |         ax.set_xlim(t_xlim)
380 |     else:
381 |         pass
382 | 
383 |     if t_ylim == 'auto':
384 |         t_ylim = [np.nanpercentile(df[ty].values, 0.1), np.nanpercentile(df[ty].values, 99.9)]
385 |         t_ylim[0] = t_ylim[0] - 0.1*(t_ylim[1] - t_ylim[0])
386 |         t_ylim[1] = t_ylim[1] + 0.1*(t_ylim[1] - t_ylim[0])
387 |         ax.set_ylim(t_ylim)
388 |     elif t_ylim:
389 |         ax.set_ylim(t_ylim)
390 |     else:
391 |         pass
392 | 
393 |     fig.tight_layout()
394 |     if output:
395 |         fig.savefig(output)
396 |         print('Saved to ' + output)
397 |     if show:
398 |         plt.show()
399 |     if close:
400 |         plt.close(fig)
401 | 
402 | def get_kwcolors(labels, colors):
403 |     """Generate a dictionary of {label: color} using unique labels and a list of available colors
404 |     """
405 |     nc = len(colors)
406 |     nl = len(labels)
407 |     n_repeats = int((nl + nc - 1)/nc)
408 |     colors = list(colors)*n_repeats
409 | 
410 |     kw_colors = {l:c for (l,c) in zip(labels, colors)}
411 |     return kw_colors
412 | 
413 | def rgb2hex(r, g, b):
414 |     """From rgb (255, 255, 255) to hex
415 |     """
416 |     hex = "#{:02x}{:02x}{:02x}".format(int(r), int(g), int(b))
417 |     return hex
418 | 
419 | def gen_colors(n, l=0.6, s=0.6, colors=None):
420 |     """Generate compatible and distinct hex colors
421 |     """
422 |     if not colors:
423 |         import colorsys
424 |         hs = np.linspace(0, 1, n, endpoint=False)
425 |         rgbs = [rgb2hex(*(255*np.array(colorsys.hls_to_rgb(h, l, s))))  # bug fix: scaling by 256 could overflow the 0-255 hex range
426 |                 for h in hs]
427 |         return rgbs
428 |     else:
429 |         clrs = [colors[i%len(colors)] for i in range(n)]
430 |         return clrs
431 | 
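# Illustrative sketch (hypothetical cluster labels, not part of the original module):
# gen_colors() spaces n hues evenly around the HLS color wheel, and get_kwcolors()
# maps labels onto a (possibly recycled) palette.
def _demo_cluster_palette():
    labels = ['exc', 'inh', 'glia']
    palette = gen_colors(len(labels))          # e.g. ['#cc3d3d', ...]
    kw_colors = get_kwcolors(labels, palette)  # {'exc': '#cc3d3d', ...}
    assert set(kw_colors.keys()) == set(labels)
    return kw_colors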
432 | def myScatter(ax, df, x, y, l,
433 |               s=20,
434 |               sample_frac=None,
435 |               sample_n=None,
436 |               legend_size=None,
437 |               legend_kws=None,
438 |               grey_label='unlabeled',
439 |               shuffle=True,
440 |               random_state=None,
441 |               legend_mode=0,
442 |               kw_colors=False,
443 |               colors=['C0', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C8', 'C9'], **kwargs):
444 |     """
445 |     take an axis object and make a scatter plot
446 | 
447 |     - kw_colors is a dictionary {label: color}
448 |     """
449 | 
450 |     import matplotlib.pyplot as plt
451 |     import seaborn as sns
452 |     df = df.copy()
453 |     # shuffle (and copy) data
454 |     if sample_n:
455 |         df = (df.groupby(l).apply(lambda x: x.sample(min(len(x), sample_n), random_state=random_state))
456 |                 .reset_index(level=0, drop=True)
457 |              )
458 |     if sample_frac:
459 |         df = (df.groupby(l).apply(lambda x: x.sample(frac=sample_frac, random_state=random_state))
460 |                 .reset_index(level=0, drop=True)
461 |              )
462 |     if shuffle:
463 |         df = df.sample(frac=1, random_state=random_state)
464 | 
465 |     if not kw_colors:
466 |         # add a color column
467 |         inds, catgs = pd.factorize(df[l])
468 |         df['c'] = [colors[i%len(colors)] if catgs[i]!=grey_label else 'grey'
469 |                    for i in inds]
470 |     else:
471 |         df['c'] = [kw_colors[i] if i!=grey_label else 'grey' for i in df[l]]
472 | 
473 |     # take care of legend
474 |     if legend_mode != -1:
475 |         for ind, row in df.groupby(l).first().iterrows():
476 |             ax.scatter(row[x], row[y], c=row['c'], label=ind, s=s, **kwargs)
477 | 
478 |     if legend_mode == -1:
479 |         pass
480 |     elif legend_mode == 0:
481 |         lgnd = ax.legend()
482 |     elif legend_mode == 1:
483 |         # Shrink current axis's height by 10% on the bottom
484 |         box = ax.get_position()
485 |         ax.set_position([box.x0, box.y0 + box.height * 0.1,
486 |                          box.width, box.height * 0.9])
487 |         lgnd = ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.07),
488 |                          ncol=6, fancybox=False, shadow=False)
489 |     elif legend_mode == 2:
490 |         # Shrink current axis's width by 20% (shifted right by 10%)
491 |         box = ax.get_position()
492 |         ax.set_position([box.x0 + box.width*0.1, box.y0,
493 |                          box.width*0.8, box.height])
494 | 
495 |     if legend_kws:
496 |         lgnd = ax.legend(**legend_kws)
497 | 
498 |     if legend_mode != -1 and legend_size:
499 |         for handle in lgnd.legendHandles:
500 |             handle._sizes = [legend_size]
501 | 
502 |     # background (grey)
503 |     df_grey = df.loc[df['c']=='grey']
504 |     if not df_grey.empty:
505 |         ax.scatter(df_grey[x],
506 |                    df_grey[y],
507 |                    c=df_grey['c'], s=s, **kwargs)
508 |     # actual plot
509 |     df_tmp = df.loc[df['c']!='grey']
510 |     ax.scatter(df_tmp[x],
511 |                df_tmp[y],
512 |                c=df_tmp['c'], s=s, **kwargs)
513 | 
514 |     return
515 | 
516 | def plot_tsne_labels_ax(df, ax, tx='tsne_x', ty='tsne_y', tc='cluster_ID',
517 |                         sample_frac=None,
518 |                         sample_n=None,
519 |                         legend_size=None,
520 |                         legend_kws=None,
521 |                         grey_label='unlabeled',
522 |                         legend_mode=0,
523 |                         s=1,
524 |                         shuffle=True,
525 |                         random_state=None,
526 |                         t_xlim='auto', t_ylim='auto', title=None,
527 |                         legend_loc='lower right',
528 |                         colors=['C0', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C8', 'C9'], **kwargs):
529 |     """
530 |     tSNE plot
531 | 
532 |     xlim, ylim is set to facilitate displaying glial clusters only
533 | 
534 |     # avoid gray-like 'C7' in colors
535 |     # color orders are arranged for exci-inhi-glia plot 11/1/2017
536 |     """
537 |     import matplotlib.pyplot as plt
538 | 
539 |     myScatter(ax, df, tx, ty, tc,
540 |               s=s,
541 |               sample_frac=sample_frac,
542 |               sample_n=sample_n,
543 |               legend_size=legend_size,
544 |               legend_kws=legend_kws,
545 |               shuffle=shuffle,
546 |               grey_label=grey_label,
547 |               random_state=random_state,
548 |               legend_mode=legend_mode,
549 |               colors=colors, **kwargs)
550 | 
551 |     if title:
552 |         ax.set_title(title)
553 |     else:
554 |         ax.set_title(tc)
555 |     ax.set_xlabel(tx)
556 |     ax.set_ylabel(ty)
557 |     # ax.set_aspect('auto')
558 | 
559 |     if t_xlim == 'auto':
560 |         t_xlim = [np.nanpercentile(df[tx].values, 0.1), np.nanpercentile(df[tx].values, 99.9)]
561 |         t_xlim[0] = t_xlim[0] - 0.1*(t_xlim[1] - t_xlim[0])
562 |         t_xlim[1] = t_xlim[1] + 0.1*(t_xlim[1] - t_xlim[0])
563 |         ax.set_xlim(t_xlim)
564 |     elif t_xlim:
565 |         ax.set_xlim(t_xlim)
566 |     else:
567 |         pass
568 | 
569 |     if t_ylim == 'auto':
570 |         t_ylim = [np.nanpercentile(df[ty].values, 0.1), np.nanpercentile(df[ty].values, 99.9)]
571 |         t_ylim[0] = t_ylim[0] - 0.1*(t_ylim[1] - t_ylim[0])
572 |         t_ylim[1] = t_ylim[1] + 0.1*(t_ylim[1] - t_ylim[0])
573 |         ax.set_ylim(t_ylim)
574 |     elif t_ylim:
575 |         ax.set_ylim(t_ylim)
576 |     else:
577 |         pass
578 | 
579 |     return
580 | 
581 | 
582 | def plot_tsne_labels(df, tx='tsne_x', ty='tsne_y', tc='cluster_ID',
583 |                      grey_label='unlabeled',
584 |                      sample_frac=None,
585 |                      sample_n=None,
586 |                      legend_size=None,
587 |                      legend_mode=0,
588 |                      legend_kws=None,
589 |                      s=1,
590 |                      random_state=None,
591 |                      output=None, show=True, close=False,
592 |                      t_xlim='auto', t_ylim='auto', title=None, figsize=(8,6),
593 |                      legend_loc='lower right',
594 |                      colors=['C0', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C8', 'C9'], **kwargs):
595 |     """
596 |     tSNE plot
597 | 
598 |     xlim, ylim is set to facilitate displaying glial clusters only
599 | 
600 |     # avoid gray-like 'C7' in colors
601 |     # color orders are arranged for exci-inhi-glia plot 11/1/2017
602 |     """
603 |     import matplotlib.pyplot as plt
604 |     import seaborn as sns
605 |     fig, ax = plt.subplots(figsize=figsize)
606 | 
607 |     myScatter(ax, df, tx, ty, tc,
608 |               s=s,
609 |               sample_frac=sample_frac,
610 |               sample_n=sample_n,
611 |               legend_size=legend_size,
612 |               legend_kws=legend_kws,
613 |               grey_label=grey_label,
614 |               random_state=random_state,
615 |               legend_mode=legend_mode,
616 |               colors=colors, **kwargs)
617 | 
618 |     if title:
619 |         ax.set_title(title)
620 |     else:
621 |         ax.set_title(tc)
622 |     ax.set_xlabel(tx)
623 |     ax.set_ylabel(ty)
624 |     # ax.set_aspect('auto')
625 | 
626 |     if t_xlim == 'auto':
627 |         t_xlim = [np.nanpercentile(df[tx].values, 0.1), np.nanpercentile(df[tx].values, 99.9)]
628 |         t_xlim[0] = t_xlim[0] - 0.1*(t_xlim[1] - t_xlim[0])
629 |         t_xlim[1] = t_xlim[1] + 0.1*(t_xlim[1] - t_xlim[0])
630 |         ax.set_xlim(t_xlim)
631 |     elif t_xlim:
632 |         ax.set_xlim(t_xlim)
633 |     else:
634 |         pass
635 | 
636 |     if t_ylim == 'auto':
637 |         t_ylim = [np.nanpercentile(df[ty].values, 0.1), np.nanpercentile(df[ty].values, 99.9)]
638 |         t_ylim[0] = t_ylim[0] - 0.1*(t_ylim[1] - t_ylim[0])
639 |         t_ylim[1] = t_ylim[1] + 0.1*(t_ylim[1] - t_ylim[0])
640 |         ax.set_ylim(t_ylim)
641 |     elif t_ylim:
642 |         ax.set_ylim(t_ylim)
643 |     else:
644 |         pass
645 | 
646 |     if output:
647 |         fig.savefig(output)
648 |         print('Saved to ' + output)
649 |     if show:
650 |         plt.show()
651 |     if close:
652 |         plt.close(fig)
653 | 
654 | def plot_tsne_values_ax(df, ax, tx='tsne_x', ty='tsne_y', tc='mCH',
655 |                         low_p=5, hi_p=95,
656 |                         s=2,
657 |                         cbar=True,
658 |                         cbar_ax=None,
659 |                         cbar_label=None,
660 |                         t_xlim='auto', t_ylim='auto', title=None, **kwargs):
661 |     """
662 |     tSNE plot
663 | 
664 |     xlim, ylim is set to facilitate displaying glial clusters only
665 | 
666 |     """
667 |     import matplotlib.pyplot as plt
668 | 
669 | 
670 |     im = ax.scatter(df[tx], df[ty], s=s,
671 |                     c=mcc_percentile_norm(df[tc].values, low_p=low_p, hi_p=hi_p), **kwargs)
672 |     if title:
673 |         ax.set_title(title)
674 |     else:
675 |         ax.set_title(tc)
676 |     # ax.set_aspect('auto')
677 |     if cbar:
678 |         if cbar_ax:
679 |             clb = plt.colorbar(im, cax=cbar_ax, shrink=0.4)
680 |         else:
681 |             clb = plt.colorbar(im, cax=ax, shrink=1)
682 |         if cbar_label:
683 |             clb.set_label(cbar_label, rotation=270, labelpad=10)
684 | 
685 |     if t_xlim == 'auto':
686 |         t_xlim = [np.nanpercentile(df[tx].values, 0.1), np.nanpercentile(df[tx].values, 99.9)]
687 |         t_xlim[0] = t_xlim[0] - 0.1*(t_xlim[1] - t_xlim[0])
688 |         t_xlim[1] = t_xlim[1] + 0.1*(t_xlim[1] - t_xlim[0])
689 |         ax.set_xlim(t_xlim)
690 |     elif t_xlim:
691 |         ax.set_xlim(t_xlim)
692 |     else:
693 |         pass
694 | 
695 |     if t_ylim == 'auto':
696 |         t_ylim = [np.nanpercentile(df[ty].values, 0.1), np.nanpercentile(df[ty].values, 99.9)]
697 |         t_ylim[0] = t_ylim[0] - 0.1*(t_ylim[1] - t_ylim[0])
698 |         t_ylim[1] = t_ylim[1] + 0.1*(t_ylim[1] - t_ylim[0])
699 |         ax.set_ylim(t_ylim)
700 |     elif t_ylim:
701 |         ax.set_ylim(t_ylim)
702 |     else:
703 |         pass
704 | 
705 |     return im
706 | 
707 | 
708 | def get_mcc(df, base_call_cutoff=100, sufficient_coverage_fraction=1, suffix=True, fillna=True):
709 |     """Get mcc matrix from mc_c matrix (filtering out low coverage genes or bins)
710 |     """
711 |     logging.info('Getting mcc matrix from mc and c')
712 |     logging.info('base_call_cutoff={}, sufficient_coverage_fraction={}'.format(
713 |         base_call_cutoff, sufficient_coverage_fraction))
714 | 
715 |     df_c = df.filter(regex="_c$")
716 |     df_c.columns = [col[:-len('_c')] for col in df_c.columns]
717 |     df_mc = df.filter(regex="_mc$")
718 |     df_mc.columns = [col[:-len('_mc')] for col in df_mc.columns]
719 |     # a gene is sufficiently covered in % of cells
720 |     condition = (df_c > base_call_cutoff).sum(axis=1) >= sufficient_coverage_fraction*(df.shape[1])/2.0
721 | 
722 |     logging.info("Matrix size before pruning (# features, # cells) = " + str(df_c.shape))
723 |     logging.info("Matrix size after pruning (# features, # cells) = " + str(df_c.loc[condition].shape))
724 | 
725 |     # get mcc matrix with kept bins and nan values for low coverage sites
726 |     df_c_nan = df_c.copy()
727 |     df_c_nan[df_c < base_call_cutoff] = np.nan
728 |     df_mcc = df_mc.loc[condition]/df_c_nan.loc[condition]
729 |     logging.info(df_mcc.shape)
730 | 
731 |     # imputation (missing value -> mean value of all cells)
732 |     if fillna:
733 |         logging.info('Imputing data... (No effect if sufficient_coverage_fraction=1)')
734 |         means = df_mcc.mean(axis=1)
735 |         fill_value = pd.DataFrame({col: means for col in df_mcc.columns})
736 |         df_mcc.fillna(fill_value, inplace=True)
737 | 
738 |     # add suffix
739 |     if suffix:
740 |         df_mcc.columns = df_mcc.columns.values + '_mcc'
741 | 
742 |     return df_mcc
743 | 
744 | def get_mcc_lite(mc_table, c_table, base_call_cutoff=100, sufficient_coverage_fraction=1, fillna=True):
745 |     """Given 2 numpy arrays, return an mcc table
746 |     Gene/region by sample matrix
747 |     """
748 |     df_c = pd.DataFrame(c_table)
749 |     df_mc = pd.DataFrame(mc_table)
750 |     assert df_c.shape == df_mc.shape
751 | 
752 |     # a gene is sufficiently covered in % of cells
753 |     condition = (df_c > base_call_cutoff).sum(axis=1) >= sufficient_coverage_fraction*(df_c.shape[1])
754 | 
755 |     logging.info("Matrix size before pruning (# features, # cells) = " + str(df_c.shape))
756 |     logging.info("Matrix size after pruning (# features, # cells) = " + str(df_c.loc[condition].shape))
757 | 
758 |     # get mcc matrix with kept bins and nan values for low coverage sites
759 |     df_c_nan = df_c.copy()
760 |     df_c_nan[df_c < base_call_cutoff] = np.nan
761 |     df_mcc = df_mc.loc[condition]/df_c_nan.loc[condition]
762 |     logging.info(df_mcc.shape)
763 | 
764 |     # imputation (missing value -> mean value of all cells)
765 |     if fillna:
766 |         logging.info('Imputing data... (No effect if sufficient_coverage_fraction=1)')
767 |         means = df_mcc.mean(axis=1)
768 |         fill_value = pd.DataFrame({col: means for col in df_mcc.columns})
769 |         df_mcc.fillna(fill_value, inplace=True)
770 | 
771 |     # return matrix and index (regions)
772 |     return df_mcc.values, df_mcc.index.values
773 | 
774 | def get_mcc_lite_v2(df_c, df_mc, base_call_cutoff):
775 |     """
776 |     """
777 |     # get mcc matrix with kept bins and nan values for low coverage sites
778 |     df_c_nan = df_c.copy()
779 |     df_c_nan[df_c < base_call_cutoff] = np.nan
780 |     df_mcc = df_mc/df_c_nan
781 |     logging.info(df_mcc.shape)
782 | 
783 |     # imputation (missing value -> mean value of all cells)
784 |     means = df_mcc.mean(axis=1)
785 |     fill_value = pd.DataFrame({col: means for col in df_mcc.columns})
786 |     df_mcc.fillna(fill_value, inplace=True)
787 | 
788 |     return df_mcc
789 | 
790 | def get_mcc_lite_v3(df_c, df_mc, base_call_cutoff):
791 |     """
792 |     """
793 |     # get mcc matrix with kept bins and nan values for low coverage sites
794 |     df_c_nan = df_c.copy()
795 |     df_c_nan[df_c < base_call_cutoff] = np.nan
796 |     df_mcc = df_mc/df_c_nan
797 |     return df_mcc
798 | 
799 | 
800 | def get_clusters_mc_c_worker(df_cells, df_input, cluster_col):
801 |     """reduce gene*cell or bin*cell matrix to a gene*cluster or bin*cluster matrix
802 |     Arguments:
803 |         - df_cells: a dataframe indexed by 'cell_name', and have '$cluster_col' as column
804 |         - df_input: a dataframe with 'sample_mc', 'sample_c' ... as columns
805 |           sample names are cell names
806 |     """
807 |     # cluster mc_c
808 |     df_c = df_input.filter(regex='_c$')
809 |     df_mc = df_input.filter(regex='_mc$')
810 | 
811 |     df_mc_c = pd.DataFrame()
812 |     for label, df_sub in df_cells.groupby(cluster_col):
813 |         samples = df_sub.index.values
814 |         df_mc_c['{}_mc'.format(label)] = df_mc[samples+'_mc'].sum(axis=1)
815 |         df_mc_c['{}_c'.format(label)] = df_c[samples+'_c'].sum(axis=1)
816 | 
817 |     logging.info("Output shape: {}".format(df_mc_c.shape))
818 |     return df_mc_c
819 | 
820 | def rank_array(array):
821 |     """Return ranking of each element of an array
822 |     """
823 |     array = np.array(array)
824 |     temp = array.argsort()
825 |     ranks = np.empty_like(temp)
826 |     ranks[temp] = np.arange(len(array))
827 |     return ranks
828 | 
829 | # added 4/5/2019
830 | def rank_rows(matrix):
831 |     """Return rankings of each row in a 2d array
832 |     """
833 |     matrix = np.array(matrix)
834 |     return np.apply_along_axis(rank_array, 1, matrix)  # row = 1
835 | 
836 | def spearman_corrcoef(X, Y):
837 |     """return spearman correlation matrix for each pair of rows of X and Y
838 |     """
839 |     return np.corrcoef(rank_rows(X), rank_rows(Y))
840 | 
841 | def spearmanr_paired_rows(X, Y):
842 |     from scipy import stats
843 | 
844 |     X = np.array(X)
845 |     Y = np.array(Y)
846 |     corrs = []
847 |     ps = []
848 |     for x, y in zip(X, Y):
849 |         r, p = stats.spearmanr(x, y)
850 |         corrs.append(r); ps.append(p)  # bug fix: `ps` was never filled, so p-values were always returned empty
851 |     return np.array(corrs), np.array(ps)
852 | 
853 | def get_index_from_array(arr, inqs, na_rep=-1):
854 |     """Get index of array
855 |     """
856 |     arr = np.array(arr)
857 |     arr = pd.Series(arr).reset_index().set_index(0)
858 |     idxs = arr.reindex(inqs)['index'].fillna(na_rep).astype(int).values
859 |     return idxs
860 | 
861 | def get_genomic_distance(sa, ea, sb, eb):
862 |     """Get genomic distance
863 |     """
864 |     assert sa < ea and sb < eb
865 |     if sa > sb:
866 |         sa, sb = sb, sa
867 |         ea, eb = eb, ea
868 | 
869 |     # sa <= sb
870 |     distance = max(0, sb - ea)
871 | 
872 |     return distance
873 | 
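# Illustrative sketch (hypothetical random data, not part of the original module):
# spearman_corrcoef() computes Spearman correlations by ranking each row and taking
# Pearson correlations of the ranks; spearmanr_paired_rows() does the same pair-wise
# (row i of X vs. row i of Y) via scipy. A quick consistency check:
def _demo_vectorized_spearman():
    rng = np.random.RandomState(0)
    X = rng.rand(3, 100)
    Y = rng.rand(3, 100)
    corrs, _ = spearmanr_paired_rows(X, Y)
    full = spearman_corrcoef(X, Y)  # (6, 6) block matrix of rank correlations
    # the X-vs-Y block is the upper-right quadrant; its diagonal matches the paired results
    assert np.allclose(np.diag(full[:3, 3:]), corrs)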
| """ 877 | comp_dict = { 878 | 'A': 'T', 879 | 'T': 'A', 880 | 'G': 'C', 881 | 'C': 'G', 882 | 'N': 'N', 883 | } 884 | for char in set(string): 885 | if char not in ['A', 'C', 'G', 'T', 'N']: 886 | raise ValueError('Not allowed char in string') 887 | 888 | new_string = ''.join([comp_dict[char] for char in string[::-1]]) 889 | return new_string 890 | 891 | def save_gc_matrix(gc_matrix, f_gene, f_cell, f_mat): 892 | """ 893 | """ 894 | sparse.save_npz(f_mat, gc_matrix.data) 895 | with open(f_gene, 'w') as f: 896 | f.write('\n'.join(gc_matrix.gene)+'\n') 897 | with open(f_cell, 'w') as f: 898 | f.write('\n'.join(gc_matrix.cell)+'\n') 899 | 900 | def save_gc_matrix_methylation(gc_matrix, f_gene, f_cell, f_mat_mc, f_mat_c): 901 | """ 902 | """ 903 | sparse.save_npz(f_mat_mc, gc_matrix.data['mc']) 904 | sparse.save_npz(f_mat_c, gc_matrix.data['c']) 905 | with open(f_gene, 'w') as f: 906 | f.write('\n'.join(gc_matrix.gene)+'\n') 907 | with open(f_cell, 'w') as f: 908 | f.write('\n'.join(gc_matrix.cell)+'\n') 909 | 910 | def import_single_textcol(fname, header=None, col=0): 911 | return pd.read_csv(fname, header=header, sep='\t')[col].values 912 | 913 | def export_single_textcol(fname, array): 914 | with open(fname, 'w') as f: 915 | f.write('\n'.join(array)+'\n') 916 | 917 | def load_gc_matrix(f_gene, f_cell, f_mat): 918 | """ 919 | """ 920 | gene = import_single_textcol(f_gene) 921 | cell = import_single_textcol(f_cell) 922 | mat = sparse.load_npz(f_mat) 923 | assert (len(gene), len(cell)) == mat.shape 924 | return GC_matrix(gene, cell, mat) 925 | 926 | def load_gc_matrix_methylation(f_gene, f_cell, f_mat_mc, f_mat_c): 927 | """ 928 | """ 929 | _gene = import_single_textcol(f_gene) 930 | _cell = import_single_textcol(f_cell) 931 | _mat_mc = sparse.load_npz(f_mat_mc) 932 | _mat_c = sparse.load_npz(f_mat_c) 933 | gxc_raw = GC_matrix(_gene, _cell, 934 | {'c': _mat_c, 'mc': _mat_mc}) 935 | return gxc_raw 936 | 937 | def nondup_legends(ax='', **kwargs): 938 | """Assuming plt (matplotlib.pyplot) is imported 939 | """ 940 | from collections import OrderedDict 941 | import matplotlib.pyplot as plt 942 | 943 | if ax == '': 944 | handles, labels = plt.gca().get_legend_handles_labels() 945 | by_label = OrderedDict(zip(labels, handles)) 946 | plt.legend(by_label.values(), by_label.keys(), **kwargs) 947 | else: 948 | handles, labels = ax.get_legend_handles_labels() 949 | by_label = OrderedDict(zip(labels, handles)) 950 | ax.legend(by_label.values(), by_label.keys(), **kwargs) 951 | return 952 | 953 | def dedup_array_elements(x, empty_string=''): 954 | """Replacing repeats with empty_string 955 | """ 956 | newx = np.empty_like(x) 957 | newx[0] = x[0] 958 | for i in range(1, len(x)): 959 | if x[i-1] == x[i]: 960 | newx[i] = empty_string 961 | else: 962 | newx[i] = x[i] 963 | return newx 964 | 965 | def vcorrcoef(X,Y): 966 | """Compute correlation coef for each rows of X and Y 967 | """ 968 | assert X.shape == Y.shape 969 | Xm = np.mean(X,axis=1).reshape(-1,1) 970 | Ym = np.mean(Y,axis=1).reshape(-1,1) 971 | Xm = X-Xm 972 | Ym = Y-Ym 973 | 974 | r_num = np.sum(Xm*Ym,axis=1) 975 | r_den = np.sqrt(np.sum(Xm**2,axis=1)*np.sum(Ym**2, axis=1)) 976 | r = r_num/r_den 977 | return r 978 | 979 | def zscore(x, offset=1e-7, ddof=1): 980 | return (x - np.mean(x))/(np.std(x, ddof=ddof) + offset) 981 | 982 | 983 | def clst_umap_pipe_lite(pcs, cells_all, 984 | resolution=1, 985 | npc=50, 986 | k=30, 987 | verbose=False, seed=0, cluster_only=False, 988 | ): 989 | # clustering 990 | import CEMBA_clst_utils 991 | import 
983 | def clst_umap_pipe_lite(pcs, cells_all,
984 |                         resolution=1,
985 |                         npc=50,
986 |                         k=30,
987 |                         verbose=False, seed=0, cluster_only=False,
988 |                         ):
989 |     # clustering
990 |     import clst_utils  # assumed rename: originally `import CEMBA_clst_utils`, a module not in this repo
991 |     from clst_utils import run_umap_lite  # assumed rename: originally `import CEMBA_run_tsne`; both functions exist in scripts/clst_utils.py with matching signatures
992 | 
993 |     df_clst = clst_utils.clustering_routine(
994 |         pcs,
995 |         cells_all, k,
996 |         verbose=verbose,
997 |         resolution=resolution,
998 |         seed=seed,
999 |         metric='euclidean', option='plain', n_trees=10, search_k=-1)
1000 | 
1001 |     # umap
1002 |     if not cluster_only:
1003 |         df_tsne = run_umap_lite(
1004 |             pcs,
1005 |             cells_all,
1006 |             verbose=verbose,
1007 |             n_neighbors=30, min_dist=0.5, n_dim=2,
1008 |             random_state=1)
1009 | 
1010 |         df_summary = df_clst.join(df_tsne)
1011 |         return df_summary
1012 |     else:
1013 |         return df_clst
1014 | 
1015 | def gen_cdf(array, ax, x_range=[], n_points=1000, show=True, flip=False, **kwargs):
1016 |     """
1017 |     """
1018 |     x = np.sort(array)
1019 |     y = np.arange(len(array))/len(array)
1020 |     if flip:
1021 |         # x = x[::-1]
1022 |         y = 1 - y
1023 | 
1024 |     if not x_range:
1025 |         if show:
1026 |             ax.plot(x, y, **kwargs)
1027 |         return x, y
1028 |     else:
1029 |         start, end = x_range
1030 |         xbins = np.linspace(start, end, n_points)
1031 |         ybins = np.interp(xbins, x, y)
1032 |         if show:
1033 |             ax.plot(xbins, ybins, **kwargs)
1034 |         return xbins, ybins
1035 | 
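# Illustrative sketch (hypothetical usage, not part of the original module):
# gen_cdf() plots an empirical CDF; with flip=True it plots the survival
# function (1 - CDF) instead.
def _demo_gen_cdf():
    fig, ax = plt.subplots()
    x, y = gen_cdf(np.random.RandomState(0).rand(100), ax, show=True)
    ax.set_xlabel('value')
    ax.set_ylabel('cumulative fraction')
    return fig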
1036 | def savefig(fig, path):
1037 |     """
1038 |     """
1039 |     fig.savefig(path, bbox_inches='tight', dpi=300)
1040 |     return
1041 | 
--------------------------------------------------------------------------------
/scripts/cli_parser.py:
--------------------------------------------------------------------------------
1 | """Command line interface is defined here.
2 | """
3 | DESCRIPTION_preproc="""
4 | SingleCellFusion is a computational tool to integrate single-cell transcriptome and epigenome datasets.
5 | This is the CLI for its preprocessing module
6 | (from count matrices to normalized HVG feature matrices).
7 | """
8 | 
9 | DESCRIPTION="""
10 | SingleCellFusion is a computational tool to integrate single-cell transcriptome and epigenome datasets.
11 | """
12 | 
13 | EPILOG="""
14 | Contributors: Fangming Xie, Aditya Chandrasekar, Wayne I. Doyle, Ethan J. Armand, Eran Mukamel.
15 | Contact: Eran Mukamel (emukamel@ucsd.edu).
16 | """
17 | 
18 | import argparse
19 | import os
20 | 
21 | def create_parser_preproc():
22 |     """
23 |     """
24 |     parser = argparse.ArgumentParser(
25 |         prog="SingleCellFusion_pre",
26 |         description=DESCRIPTION_preproc,
27 |         epilog=EPILOG,
28 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
29 |     )
30 | 
31 |     required = parser.add_argument_group('required')
32 |     optional = parser.add_argument_group('optional')
33 | 
34 |     # Input/Output Dataset Settings
35 |     required.add_argument(
36 |         "-i", "--input_datasets",
37 |         type=str,
38 |         nargs="+",
39 |         required=True,
40 |         help='''(list of str)
41 |             Paths to .h5ad files, each containing a cell-by-gene feature matrix,
42 |             cell IDs and gene IDs. Cell IDs should be unique within each .h5ad file;
43 |             gene IDs should be shared or partially shared across files.
44 |             Multiple inputs should be listed as a space separated list of filenames.
45 |             '''
46 |     )
47 |     optional.add_argument(
48 |         "-icov", "--input_datasets_coverage",
49 |         type=str,
50 |         nargs="+",
51 |         help='''(list of str)
52 |             Paths to .h5ad files, each containing a cell-by-gene feature matrix,
53 |             cell IDs and gene IDs. Cell IDs should be unique within each .h5ad file;
54 |             gene IDs should be shared or partially shared across files.
55 |             Multiple inputs should be listed as a space separated list of filenames.
56 | 
57 |             Required for "mc" datasets. Should follow the order of -i.
58 |             '''
59 |     )
60 |     required.add_argument(
61 |         "-inorm", "--input_normalizations",
62 |         type=str,
63 |         nargs="+",
64 |         required=True,
65 |         help='''(list of str)
66 |             Normalization options chosen from 'mc', 'cpm', or 'tpm'. This should be
67 |             listed in the same order as input_datasets
68 |             ''',
69 |     )
70 |     optional.add_argument(
71 |         "-ci", "--cellid_column",
72 |         type=str,
73 |         default="",
74 |         help='''(str)
75 |             Cell id column - column in AnnData.obs that represents cell id.
76 |             This needs to be unique within and across datasets.
77 |             Empty string means the column is the index of AnnData.obs.
78 |             '''
79 |     )
80 |     optional.add_argument(
81 |         "-gi", "--geneid_column",
82 |         type=str,
83 |         default="",
84 |         help='''(str)
85 |             Gene id column - column in AnnData.var that represents gene id.
86 |             This needs to be unique within, and shared across, datasets.
87 |             Empty string means the column is the index of AnnData.var.
88 |             '''
89 |     )
90 |     optional.add_argument(
91 |         "-gmmc", "--global_mean_mc_column",
92 |         type=str,
93 |         default="",
94 |         help='''(str)
95 |             Global mean mc column - column in AnnData.obs that represents the global mean methylation level.
96 |             If empty, it is estimated from the input matrix.
97 |             '''
98 |     )
99 |     optional.add_argument(
100 |         "-sp", "--tosparse",
101 |         action='store_true',
102 |         help='''()
103 |             this turns the input matrix into scipy sparse matrix format
104 |             '''
105 |     )
106 |     optional.add_argument(
107 |         "-o", "--output_dir",
108 |         type=str,
109 |         default="./preprocessed",
110 |         help='''(str)
111 |             Directory to store output files
112 |             '''
113 |     )
114 |     optional.add_argument(
115 |         "-op", "--output_prefix",
116 |         metavar="OUT_PREFIX",
117 |         type=str,
118 |         default="SingleCellFusion",
119 |         help='''(str)
120 |             The output files will contain this prefix
121 |             '''
122 |     )
123 |     optional.add_argument(
124 |         "-ga", "--gene_annotation_file",
125 |         type=str,
126 |         default="",
127 |         help='''(str)
128 |             Gene annotation file (bed format: chr, start, end, gene_id/gene_name/any identifier);
129 |             required if 'tpm' is chosen as the normalization option.
130 |             The fourth column is used to identify individual genes.
131 |             '''
132 |     )
133 |     optional.add_argument(
134 |         "-subn", "--sub_n",
135 |         type=int,
136 |         default=None,
137 |         help='''(int)
138 |             Subsampling this number of cells for each input dataset
139 |             '''
140 |     )
141 |     optional.add_argument(
142 |         "-subf", "--sub_frac",
143 |         type=float,
144 |         default=None,
145 |         help='''(float)
146 |             Subsampling this fraction (0~1) of cells for each input dataset
147 |             '''
148 |     )
149 |     return parser
150 | 
151 | def create_parser():
152 |     """
153 |     """
154 |     parser = argparse.ArgumentParser(
155 |         prog="SingleCellFusion",
156 |         description=DESCRIPTION,
157 |         epilog=EPILOG,
158 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
159 |     )
160 | 
161 |     required = parser.add_argument_group('required')
162 |     optional = parser.add_argument_group('optional')
163 |     advanced = parser.add_argument_group('advanced')
164 | 
165 |     ## ARGUMENTS DIRECTLY FED INTO SingleCellFusion CLI
166 |     # Input/Output Dataset Settings
167 |     required.add_argument(
168 |         "-i", "--input_datasets",
169 |         metavar="xx.h5ad",
170 |         type=str,
171 |         nargs="+",
172 |         required=True,
173 |         help='''(list of str)
174 |             Paths to .h5ad files, each containing a cell-by-gene feature matrix,
175 |             cell IDs and gene IDs. Cell IDs should be unique within each .h5ad file;
176 |             gene IDs should be shared or partially shared across files.
177 |             Multiple inputs should be listed as a space separated list of filenames.
178 |             '''
179 |     )
180 |     required.add_argument(
181 |         "-im", "--input_modalities",
182 |         metavar="rna/atac/mc",
183 |         type=str,
184 |         nargs="+",
185 |         required=True,
186 |         help='''(list of str)
187 |             Data modalities chosen from 'rna', 'atac', or 'mc'. This should be
188 |             listed in the same order as input_datasets.
189 |             '''
190 |     )
191 |     # may need this in the future
192 |     # parser.add_argument(
193 |     #     "-im", "--input_meta",
194 |     #     type=str,
195 |     #     required=True,
196 |     #     help="(list of str) Input metadata csv file",
197 |     # )
198 | 
199 |     required.add_argument(
200 |         "-f", "--feature_datasets",
201 |         metavar="xx.h5ad",
202 |         type=str,
203 |         nargs="+",
204 |         required=True,
205 |         help='''(list of str)
206 |             Dataset(s) whose features all other datasets will impute into.
207 |             This should be a subset of --input_datasets.
208 |             Enter multiple datasets as a space-separated list of filenames.
209 |             The features of these datasets will
210 |             be the features kept in the output imputed data table.
211 |             '''
212 |     )
213 |     optional.add_argument(
214 |         "-o", "--output_dir",
215 |         metavar="DIR",
216 |         type=str,
217 |         default="./results",
218 |         help='''(str)
219 |             Directory to store output files
220 |             '''
221 |     )
222 |     optional.add_argument(
223 |         "-op", "--output_prefix",
224 |         type=str,
225 |         default="SingleCellFusion",
226 |         help='''(str)
227 |             The output files will contain this prefix.
228 |             '''
229 |     )
230 | 
231 |     # constraint kNN across modalities
232 |     optional.add_argument(
233 |         "--nearest_neighbors",
234 |         type=int,
235 |         default=20,
236 |         help='''(integer)
237 |             Number of nearest neighbors used to impute data
238 |             '''
239 |     )
240 |     optional.add_argument(
241 |         "--relaxation",
242 |         type=float,
243 |         default=3,
244 |         help='''(float)
245 |             A value between 1 and infinity.
246 |             This is a parameter that constrains the number of neighbors a cell is allowed to receive.
247 |             Assume dataset 1 has N1 cells and dataset 2 has N2 cells. Finding k neighbors in dataset 2 for
248 |             every cell in dataset 1 means that on average each cell in dataset 2 receives (k*N1/N2) connections.
249 |             However, not all cells in dataset 2 get the same number of connections. We therefore set an
250 |             upper bound on the number of connections a cell in dataset 2 can receive:
251 |                 (k*N1/N2)*relaxation
252 |             where relaxation >= 1. relaxation=1 enforces a hard limit that every cell receives
253 |             the same number of nearest neighbors, while relaxation=infinity approaches traditional kNN.
254 |             '''
255 |     )
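    # Worked example (illustrative numbers, not part of the original code):
    # with the defaults k=20 (--nearest_neighbors) and relaxation=3, if dataset 1
    # has N1=10,000 cells and dataset 2 has N2=5,000 cells, each dataset-2 cell
    # receives on average k*N1/N2 = 20*10000/5000 = 40 connections, and the
    # per-cell cap is (k*N1/N2)*relaxation = 40*3 = 120 connections.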
256 |     optional.add_argument(
257 |         "--precomputed_pca_file",
258 |         type=str,
259 |         default='',
260 |         help='''(str)
261 |             Precomputed PCA matrix (tab separated table; text file or gzipped)
262 |             with the first row as the header, and the first column as the cell_id.
263 |             Each following row is a cell, and the columns are PCs.
264 | 
265 |             Providing this file will bypass SingleCellFusion integration,
266 |             and do clustering and UMAP on this matrix instead.
267 |             '''
268 |     )
269 |     optional.add_argument(
270 |         "--use_netUMAP",
271 |         action='store_true',
272 |         help='''(bool)
273 |             Include this argument to use Net-UMAP from Pegasus (Li et al. 2020).
274 |             Net-UMAP is an approximate but fast algorithm for UMAP.
275 |             It runs traditional UMAP on a subset of cells,
276 |             then it uses a deep neural network to learn the embedding for all cells.
277 |             The package pegasus is required.
278 |             '''
279 |     )
280 |     optional.add_argument(
281 |         "--use_tsne",
282 |         action='store_true',
283 |         help='''(bool)
284 |             Include this argument to use tSNE instead of UMAP
285 |             '''
286 |     )
287 | 
288 |     # within modality smoothing
289 |     advanced.add_argument(
290 |         "--num_pcs",
291 |         type=int,
292 |         default=50,
293 |         help='''(integer)
294 |             Number of Principal Components to keep for each dataset,
295 |             for smoothing and for clustering/embedding after imputation.
296 |             '''
297 |     )
298 |     advanced.add_argument(
299 |         "--smoothing_fractions",
300 |         nargs="+",
301 |         type=float,
302 |         default=[0.7, 0.1, 0.9],
303 |         help='''(list of floats)
304 |             A list of three values between 0 and 1 that control the relative contribution
305 |             from the cell itself vs. its neighbors in within-dataset smoothing,
306 |             specified for 'rna', 'atac', 'mc' data, respectively.
307 |             '''
308 |     )
309 | 
310 |     # Arguments for Clustering
311 |     advanced.add_argument(
312 |         "--leiden_n_neighbors",
313 |         type=int,
314 |         default=30,
315 |         help='''(integer)
316 |             Number of nearest neighbors used to build the kNN graph in the integrated space;
317 |             the resulting nearest neighbor graph is used for Leiden clustering.
318 |             It is passed into the python package leidenalg.
319 |             '''
320 |     )
321 |     advanced.add_argument(
322 |         "--leiden_resolutions",
323 |         nargs="+", type=float,  # bug fix: was `type=list`, which would split a command-line string into characters
324 |         default=[0.1, 0.2, 0.4, 0.8],
325 |         help='''(list of floats)
326 |             A list of resolutions to be used for Leiden Clustering.
327 |             It is passed into the python package leidenalg.
328 |             '''
329 |     )
330 | 
331 |     # Arguments for UMAP
332 |     advanced.add_argument(
333 |         "--umap_n_neighbors",
334 |         type=int,
335 |         default=60,
336 |         help='''(integer)
337 |             Number of neighbors for UMAP. It is passed into the python package umap.UMAP(n_neighbors).
338 |             '''
339 |     )
340 |     advanced.add_argument(
341 |         "--umap_min_dist",
342 |         type=float,
343 |         default=0.5,
344 |         help='''(float)
345 |             Minimum distance for UMAP. It is passed into the python package umap.UMAP(min_dist).
346 |             '''
347 |     )
348 |     return parser
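# Illustrative sketch (hypothetical file names, not part of the original module):
# building the main parser programmatically and parsing a sample command line.
def _demo_parse_args():
    parser = create_parser()
    args = parser.parse_args([
        "-i", "rna.h5ad", "mc.h5ad",
        "-im", "rna", "mc",
        "-f", "rna.h5ad",
        "-o", "./results",
    ])
    assert args.input_modalities == ['rna', 'mc']
    return args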
349 | 
350 | def parse_filename(data_file):
351 |     """turn a xxx/xxx/XXXX.h5ad into XXXX
352 |     """
353 |     dataset_name = os.path.basename(data_file)
354 |     if dataset_name.endswith('.h5ad'):
355 |         dataset_name = dataset_name[:-len('.h5ad')]
356 |     else:
357 |         raise ValueError("filenames don't have the format xxxx.h5ad")
358 |     return dataset_name
359 | 
360 | def modality_default_options(mod):
361 |     """
362 |     """
363 |     if mod == 'mc':
364 |         mod_direction = -1
365 |         # norm_option = 'mc'
366 |     elif mod == 'rna':
367 |         mod_direction = 1
368 |         # norm_option = 'cpm'
369 |     elif mod == 'atac':
370 |         mod_direction = 1
371 |         # norm_option = 'tpm'
372 |     else:
373 |         raise ValueError("choose from ['mc', 'rna', 'atac']")
374 |     return mod_direction
--------------------------------------------------------------------------------
/scripts/clst_utils.py:
--------------------------------------------------------------------------------
1 | """Utility functions for clusterings and embeddings
2 | """
3 | 
4 | from __init__ import *
5 | # from sklearn.decomposition import PCA
6 | import igraph as ig
7 | from scipy import sparse
8 | from annoy import AnnoyIndex
9 | from umap import UMAP
10 | import leidenalg
11 | 
12 | from basic_utils import create_logger
13 | 
14 | # major change in annoy functions 5/7/2019
15 | def build_knn_map(X, metric='euclidean', n_trees=10, verbose=True):
16 |     """X is expected to have low feature dimensions (n_obs, n_features) with (n_features <= 50)
17 | 
18 |     return:
19 |         t: annoy knn object, can be used in the following ways
20 |             t.get_nns_by_vector
21 |             t.get_nns_by_item
22 |     """
23 |     ti = time.time()
24 | 
25 |     n_obs, n_f = X.shape
26 |     t = AnnoyIndex(n_f, metric=metric)  # length of the item vector that will be indexed
27 |     for i, X_row in enumerate(X):
28 |         t.add_item(i, X_row)
29 |     t.build(n_trees)  # build a forest of n_trees trees
30 |     if verbose:
31 |         print("Time used to build kNN map {}".format(time.time()-ti))
32 |     return t
33 | 
34 | def get_knn_by_items(t, k,
35 |                      form='list',
36 |                      search_k=-1,
37 |                      include_distances=False,
38 |                      verbose=True,
39 |                      ):
40 |     """Get kNN for each item in the knn map t
41 |     """
42 |     ti = time.time()
43 |     # set up
44 |     n_obs = t.get_n_items()
45 |     n_f = t.f
46 |     if k > n_obs:
47 |         print("Actual k: {}->{} due to low n_obs".format(k, n_obs))
48 |         k = n_obs
49 | 
50 |     knn = [0]*(n_obs)
51 |     knn_dist = [0]*(n_obs)
52 |     # this block of code can be optimized
53 |     if include_distances:
54 |         for i in range(n_obs):
55 |             res = t.get_nns_by_item(i, k, search_k=search_k, include_distances=include_distances)
56 |             knn[i] = res[0]
57 |             knn_dist[i] = res[1]
58 |     else:
59 |         for i in range(n_obs):
60 |             res = t.get_nns_by_item(i, k, search_k=search_k, include_distances=include_distances)
61 |             knn[i] = res
62 | 
63 |     knn = np.array(knn)
64 |     knn_dist = np.array(knn_dist)
65 | 
66 |     if verbose:
67 |         print("Time used to get kNN {}".format(time.time()-ti))
68 | 
69 |     if form == 'adj':
70 |         # row col 1/dist
71 |         row_inds = np.repeat(np.arange(n_obs), k)
72 |         col_inds = np.ravel(knn)
73 |         if include_distances:
74 |             data = np.ravel(knn_dist)
75 |         else:
76 |             data = [1]*len(row_inds)
77 |         knn_dist_mat = sparse.coo_matrix((data, (row_inds, col_inds)), shape=(n_obs, n_obs))
78 |         return knn_dist_mat
79 |     elif form == 'list':
80 |         if include_distances:
81 |             return knn, knn_dist
82 |         else:
83 |             return knn
84 |     else:
85 |         raise ValueError("Choose from 'adj' and 'list'")
86 | 
87 | def get_knn_by_vectors(t, X, k,
88 |                        form='list',
89 |                        search_k=-1,
90 |                        include_distances=False,
91 |                        verbose=True,
92 |                        ):
93 |     """Get kNN for each row vector of X
94 |     """
95 |     ti = time.time()
96 |     # set up
97 |     n_obs = t.get_n_items()
98 |     n_f = t.f
99 |     n_obs_test, n_f_test = X.shape
100 |     assert n_f_test == n_f
101 | 
102 |     if k > n_obs:
103 |         print("Actual k: {}->{} due to low n_obs".format(k, n_obs))
104 |         k = n_obs
105 | 
106 |     knn = [0]*(n_obs_test)
107 |     knn_dist = [0]*(n_obs_test)
108 |     if include_distances:
109 |         for i, vector in enumerate(X):
110 |             res = t.get_nns_by_vector(vector, k, search_k=search_k, include_distances=include_distances)
111 |             knn[i] = res[0]
112 |             knn_dist[i] = res[1]
113 |     else:
114 |         for i, vector in enumerate(X):
115 |             res = t.get_nns_by_vector(vector, k, search_k=search_k, include_distances=include_distances)
116 |             knn[i] = res
117 | 
118 |     knn = np.array(knn)
119 |     knn_dist = np.array(knn_dist)
120 | 
121 |     if verbose:
122 |         print("Time used to get kNN {}".format(time.time()-ti))
123 | 
124 |     if form == 'adj':
125 |         # row col 1/dist
126 |         row_inds = np.repeat(np.arange(n_obs_test), k)
127 |         col_inds = np.ravel(knn)
128 |         if include_distances:
129 |             data = np.ravel(knn_dist)
130 |         else:
131 |             data = [1]*len(row_inds)
132 |         knn_dist_mat = sparse.coo_matrix((data, (row_inds, col_inds)), shape=(n_obs_test, n_obs))
133 |         return knn_dist_mat
134 |     elif form == 'list':
135 |         if include_distances:
136 |             return knn, knn_dist
137 |         else:
138 |             return knn
139 |     else:
140 |         raise ValueError("Choose from 'adj' and 'list'")
141 | 
142 | def gen_knn_annoy(X, k, form='list',
143 |                   metric='euclidean', n_trees=10, search_k=-1, verbose=True,
144 |                   include_distances=False,
145 |                   ):
146 |     """X is expected to have low feature dimensions (n_obs, n_features) with (n_features <= 50)
147 |     """
148 |     ti = time.time()
149 | 
150 |     n_obs, n_f = X.shape
151 |     t = build_knn_map(X, metric=metric, n_trees=n_trees, verbose=verbose)
152 | 
153 |     return get_knn_by_items(t, k,
154 |                             form=form,
155 |                             search_k=search_k,
156 |                             include_distances=include_distances,
157 |                             verbose=verbose,
158 |                             )
159 | 
160 | def gen_knn_annoy_train_test(X_train, X_test, k,
161 |                              form='list',
162 |                              metric='euclidean', n_trees=10, search_k=-1, verbose=True,
163 |                              include_distances=False,
164 |                              ):
165 |     """X is expected to have low feature dimensions (n_obs, n_features) with (n_features <= 50)
166 |     For each row in X_test, find k nearest neighbors in X_train
167 |     """
168 |     ti = time.time()
169 | 
170 |     n_obs, n_f = X_train.shape
171 |     n_obs_test, n_f_test = X_test.shape
172 |     assert n_f == n_f_test
173 | 
174 |     t = build_knn_map(X_train, metric=metric, n_trees=n_trees, verbose=verbose)
175 |     return get_knn_by_vectors(t, X_test, k,
176 |                               form=form,
177 |                               search_k=search_k,
178 |                               include_distances=include_distances,
179 |                               verbose=verbose,
180 |                               )
181 | 
182 | def compute_jaccard_weights_from_knn(X):
183 |     """compute jaccard index on a knn graph
184 |     Arguments:
185 |         X: (unweighted) kNN adjacency matrix (each row Xi* gives the kNNs of cell i);
186 |            X has to be 0-1 valued
187 |            (k, the number of nearest neighbors, is inferred from the first row)
188 | 
189 |     output: scipy sparse matrix Y of Jaccard weights
190 |     """
191 |     X = sparse.csr_matrix(X)
192 |     ni, nj = X.shape
193 |     assert ni == nj
194 | 
195 |     k = X[0, :].sum()  # number of neighbors
196 | 
197 |     Y = X.dot(X.T)
198 |     # Y = X.multiply(tmp/(2*k - tmp.todense()))
199 |     Y.data = Y.data/(2*k - Y.data)
200 | 
201 |     return Y
202 | 
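# Illustrative worked example (hypothetical toy matrix, not part of the original module):
# for a 0/1 kNN adjacency X with k neighbors per row, X.dot(X.T)[i, j] counts shared
# neighbors c, and the Jaccard weight is c / (2k - c).
def _demo_jaccard_weights():
    X = sparse.csr_matrix(np.array([
        [1, 1, 0, 0],
        [1, 1, 0, 0],
        [0, 1, 1, 0],
        [0, 0, 1, 1],
    ]))  # each cell has k=2 neighbors (including itself)
    Y = compute_jaccard_weights_from_knn(X)
    # cells 0 and 1 share both neighbors: c=2 -> 2/(2*2 - 2) = 1.0
    assert Y[0, 1] == 1.0
    return Y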
203 | def adjacency_to_igraph(adj_mtx, weighted=False):
204 |     """
205 |     Converts an adjacency matrix to an igraph object
206 | 
207 |     Args:
208 |         adj_mtx (sparse matrix): Adjacency matrix
209 |         weighted (bool): Whether to attach the adjacency values as edge weights
210 |                          (the graph itself is always built as directed)
211 | 
212 |     Returns:
213 |         G (igraph object): igraph object of adjacency matrix
214 | 
215 |     Uses code from:
216 |         https://github.com/igraph/python-igraph/issues/168
217 |         https://stackoverflow.com/questions/29655111
218 | 
219 |     Author:
220 |         Wayne Doyle
221 |         (Fangming Xie modified)
222 |     """
223 |     nrow, ncol = adj_mtx.shape
224 |     if nrow != ncol:
225 |         raise ValueError('Adjacency matrix should be a square matrix')
226 |     vcount = nrow
227 |     sources, targets = adj_mtx.nonzero()
228 |     edgelist = list(zip(sources.tolist(), targets.tolist()))
229 |     G = ig.Graph(n=vcount, edges=edgelist, directed=True)
230 |     if weighted:
231 |         G.es['weight'] = adj_mtx.data
232 |     return G
233 | 
234 | def leiden_lite(g, cell_list, resolution=1, weighted=False, verbose=True, num_starts=None, seed=1):
235 |     """Code from Ethan Armand and Wayne Doyle, ./mukamel_lab/mop
236 |     slightly modified by Fangming Xie 05/13/2019
237 |     """
238 | 
239 |     ti = time.time()
240 | 
241 |     if num_starts is not None:
242 |         np.random.seed(seed)
243 |         partitions = []
244 |         quality = []
245 |         seeds = np.random.randint(10*num_starts, size=num_starts)
246 |         for seed in seeds:
247 |             if weighted:
248 |                 temp_partition = leidenalg.find_partition(g,
249 |                                                           leidenalg.RBConfigurationVertexPartition,
250 |                                                           weights=g.es['weight'],
251 |                                                           resolution_parameter=resolution,
252 |                                                           seed=seed,
253 |                                                           )
254 |             else:
255 |                 temp_partition = leidenalg.find_partition(g,
256 |                                                           leidenalg.RBConfigurationVertexPartition,
257 |                                                           resolution_parameter=resolution,
258 |                                                           seed=seed,
259 |                                                           )
260 |             quality.append(temp_partition.quality())
261 |             partitions.append(temp_partition)
262 |         partition1 = partitions[np.argmax(quality)]
263 |     else:
264 |         if weighted:
265 |             partition1 = leidenalg.find_partition(g,
266 |                                                   leidenalg.RBConfigurationVertexPartition,
267 |                                                   weights=g.es['weight'],
268 |                                                   resolution_parameter=resolution,
269 |                                                   seed=seed,
270 |                                                   )
271 |         else:
272 |             partition1 = leidenalg.find_partition(g,
273 |                                                   leidenalg.RBConfigurationVertexPartition,
274 |                                                   resolution_parameter=resolution,
275 |                                                   seed=seed,
276 |                                                   )
277 | 
278 |     # get cluster labels from partition1
279 |     labels = [0]*(len(cell_list))
280 |     for i, cluster in enumerate(partition1):
281 |         for element in cluster:
282 |             labels[element] = i+1
283 | 
284 |     df_res = pd.DataFrame(index=cell_list)
285 |     df_res['cluster'] = labels
286 |     df_res = df_res.rename_axis('sample', inplace=False)
287 | 
288 |     if verbose:
289 |         print("Time spent on leiden clustering: {}".format(time.time()-ti))
290 | 
291 |     return df_res
292 | 
293 | def clustering_routine(X, cell_list, k,
294 |                        seed=1, verbose=True,
295 |                        resolution=1, metric='euclidean', option='plain', n_trees=10, search_k=-1, num_starts=None):
296 |     """
297 |     X is a (n_obs, n_feature) matrix, n_feature <= 50 is recommended
298 |     option: {'plain', 'jaccard', ...}
299 |     """
300 |     assert len(cell_list) == len(X)
301 | 
302 |     if option == 'plain':
303 |         g_knn = gen_knn_annoy(X, k, form='adj', metric=metric,
304 |                               n_trees=n_trees, search_k=search_k, verbose=verbose)
305 |         G = adjacency_to_igraph(g_knn, weighted=False)
306 |         df_res = leiden_lite(G, cell_list, resolution=resolution, seed=seed,
307 |                              weighted=False, verbose=verbose, num_starts=num_starts)
308 | 
309 |     elif option == 'jaccard':
310 |         g_knn = gen_knn_annoy(X, k, form='adj', metric=metric,
311 |                               n_trees=n_trees, search_k=search_k, verbose=verbose)
312 |         gw_knn = compute_jaccard_weights_from_knn(g_knn)
313 |         G = adjacency_to_igraph(gw_knn, weighted=True)
314 |         df_res = leiden_lite(G, cell_list, resolution=resolution, seed=seed,
315 |                              weighted=True, verbose=verbose, num_starts=num_starts)
316 |     else:
317 |         raise ValueError('Choose from "plain" and "jaccard"')
318 | 
319 |     return df_res
320 | 
321 | def clustering_routine_multiple_resolutions(X, cell_list, k,
322 |                                             seed=1, verbose=True,
323 |                                             resolutions=[1], metric='euclidean', option='plain', n_trees=10, search_k=-1, num_starts=None):
324 |     """
325 |     X is a (n_obs, n_feature) matrix, n_feature <= 50 is recommended
326 |     option: {'plain', 'jaccard', ...}
327 |     """
328 |     assert len(cell_list) == len(X)
329 | 
330 |     res = []
331 |     if option == 'plain':
332 |         g_knn = gen_knn_annoy(X, k, form='adj', metric=metric,
333 |                               n_trees=n_trees, search_k=search_k, verbose=verbose)
334 |         G = adjacency_to_igraph(g_knn, weighted=False)
335 |         for resolution in resolutions:
336 |             df_res = leiden_lite(G, cell_list, resolution=resolution, seed=seed,
337 |                                  weighted=False, verbose=verbose, num_starts=num_starts)
338 |             df_res = df_res.rename(columns={'cluster': 'cluster_r{}'.format(resolution)})
339 |             res.append(df_res)
340 | 
341 |     elif option == 'jaccard':
342 |         g_knn = gen_knn_annoy(X, k, form='adj', metric=metric,
343 |                               n_trees=n_trees, search_k=search_k, verbose=verbose)
344 |         gw_knn = compute_jaccard_weights_from_knn(g_knn)
345 |         G = adjacency_to_igraph(gw_knn, weighted=True)
346 |         for resolution in resolutions:
347 |             df_res = leiden_lite(G, cell_list, resolution=resolution, seed=seed,
348 |                                  weighted=True, verbose=verbose, num_starts=num_starts)
349 |             df_res = df_res.rename(columns={'cluster': 'cluster_r{}'.format(resolution)})
350 |             res.append(df_res)
351 | 
352 |     else:
353 |         raise ValueError('Choose from "plain" and "jaccard"')
354 |     res = pd.concat(res, axis=1)
355 | 
356 |     return res
357 | 
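# Illustrative sketch (hypothetical inputs, not part of the original module):
# clustering_routine_multiple_resolutions() returns one cluster column per
# resolution (e.g. 'cluster_r0.5', 'cluster_r1'), which can be joined with a
# UMAP embedding of the same cells.
def _demo_multires_clustering(pcs, cells):
    """`pcs` is an (n_cells, n_pcs) array; `cells` is the matching cell-id array."""
    df_clsts = clustering_routine_multiple_resolutions(
        pcs, cells, 30,
        resolutions=[0.5, 1], option='jaccard')
    df_umap = run_umap_lite(pcs, cells, n_neighbors=30, min_dist=0.5)  # defined below
    return df_clsts.join(df_umap)  # columns: cluster_r0.5, cluster_r1, umap_1, umap_2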
358 | def run_net_umap_pegasus(X, **kwargs):
359 |     """
360 |     X (m, n) -> res_umap (m, 2)
361 |     """
362 |     import pegasus
363 |     import pegasusio
364 |     # pegasus netUMAP
365 |     # construct a pegasus object (a hack - pegasus='1.4.3'; pegasusio='0.3.1.post2')
366 |     m, n = X.shape
367 |     pgX = pegasusio.MultimodalData(pegasusio.UnimodalData(
368 |         {'barcodekey': np.arange(m).astype(str)},
369 |         {'featurekey': np.arange(n).astype(str)},
370 |         {"X": X},
371 |     ))
372 |     # (a hack) select_alpha=0 is important to resolve a
373 |     # sampling bug by pegasus when n is large
374 |     pegasus.net_umap(pgX, rep=None, select_alpha=0, **kwargs)
375 |     res_umap = pgX.obsm['X_net_umap']  # an array
376 |     return res_umap
377 | 
378 | def run_umap_lite(X, cell_list, n_neighbors=15, min_dist=0.1, n_dim=2,
379 |                   random_state=1, output_file=None, use_netUMAP=False, use_tsne=False, **kwargs):
380 |     """run umap on X (n_obs, n_features)
381 |     """
382 |     ti = time.time()
383 | 
384 |     logging.info("Running UMAP: {} n_neighbors, {} min_dist, {} dim.\n\
385 |         Input shape: (# observations, # features) = {}\n\
386 |         Use netUMAP from pegasus: {}\n\
387 |         Use tSNE: {}\n\
388 |         "
389 |         .format(n_neighbors, min_dist, n_dim, X.shape, use_netUMAP, use_tsne))
390 | 
391 |     if use_netUMAP:
392 |         umap_res = run_net_umap_pegasus(X,
393 |                                         n_components=n_dim,
394 |                                         random_state=random_state,
395 |                                         n_neighbors=n_neighbors,
396 |                                         min_dist=min_dist,
397 |                                         **kwargs)
398 |     elif use_tsne:
399 |         from sklearn.manifold import TSNE
400 |         umap_res = TSNE(n_components=n_dim,
401 |                         random_state=random_state,
402 |                         **kwargs,
403 |                         ).fit_transform(X)
404 |     else:
405 |         umap_res = UMAP(n_components=n_dim,
406 |                         random_state=random_state,
407 |                         n_neighbors=n_neighbors,
408 |                         min_dist=min_dist,
409 |                         **kwargs).fit_transform(X)
410 | 
411 |     columns = ['umap_{}'.format(i+1) for i in np.arange(n_dim)]
412 |     df_umap = pd.DataFrame(umap_res, columns=columns)
413 |     df_umap['sample'] = cell_list
414 |     df_umap = df_umap.set_index('sample')
415 | 
416 |     if output_file:
417 |         df_umap.to_csv(output_file, sep="\t", na_rep='NA', header=True, index=True)
418 |         logging.info("Saved coordinates to file. {}".format(output_file))
419 | 
420 |     tf = time.time()
421 |     logging.info("Done. running time: {} seconds.".format(tf - ti))
422 | 
423 |     return df_umap
--------------------------------------------------------------------------------
/scripts/preproc_utils.py:
--------------------------------------------------------------------------------
1 | from __init__ import *
2 | import numpy as np
3 | import pandas as pd
4 | import logging
5 | from sklearn.utils.sparsefuncs import mean_variance_axis
6 | from scipy.stats import kruskal
7 | 
8 | import basic_utils
9 | 
10 | def select_hvg(gbc_cpm, percentile=30, n_qcut=20,):
11 |     # further select highly variable genes
12 |     # variance/mean
13 |     mean_cpm, var_cpm = mean_variance_axis(gbc_cpm.data.tocsr(), axis=1)
14 |     vmr_cpm = (var_cpm+1)/(mean_cpm+1)
15 |     # select the top `percentile`% VMR genes within each mean-CPM quantile bin (the highest bin is skipped below)
16 |     # duplicates = 'drop' 9/21/2019 Fangming
17 |     _x = pd.qcut(pd.Series(mean_cpm), n_qcut, labels=False, duplicates='drop').to_frame('decile')
18 |     hvgs = []
19 |     for decile, _x_sub in _x.groupby('decile'):
20 |         gene_group = _x_sub.index.values
21 |         mean_cpm_gg = mean_cpm[gene_group]
22 |         vmr_cpm_gg = vmr_cpm[gene_group]
23 |         # genes with top 30% of vmr
24 |         hvg_group = gene_group[vmr_cpm_gg > np.percentile(vmr_cpm_gg, 100-percentile)]
25 | 
26 |         if decile != n_qcut-1:
27 |             hvgs.append(hvg_group)
28 |     hvgs = np.hstack(hvgs)
29 |     return hvgs
30 | 
31 | def select_hvg_methylation(df_nmcc, percentile=30, n_qcut=20,):
32 |     # further select highly variable genes
33 |     # standard deviation
34 | 
35 |     stds_nmcc = df_nmcc.std(axis=1)
36 |     mean_nmcc = df_nmcc.mean(axis=1)
37 | 
38 |     # select the top `percentile`% high-std genes within each mean-NMCC quantile bin
39 |     # duplicates = 'drop' 9/21/2019 Fangming
40 |     _x = pd.qcut(mean_nmcc, n_qcut, labels=False, duplicates='drop').to_frame('decile')
41 |     hvgs = []
42 |     for decile, _x_sub in _x.groupby('decile'):
43 |         gene_group = _x_sub.index.values
44 | 
45 |         mean_nmcc_gg = mean_nmcc.loc[gene_group]
46 |         stds_nmcc_gg = stds_nmcc.loc[gene_group]
47 |         # logging.info(gene_group.shape, stds_nmcc_gg.shape)
48 |         # genes with top 30% of stds
49 |         hvg_group = gene_group[stds_nmcc_gg > np.percentile(stds_nmcc_gg, 100-percentile)]
50 |         hvgs.append(hvg_group)
51 | 
52 |     hvgs = np.hstack(hvgs)
53 |     return hvgs
54 | 
55 | def filter_genes(gxc_raw, sufficient_cell_coverage=0.01):
56 |     """
57 |     """
58 |     n_gene, n_cell = gxc_raw.data.shape
59 |     gene_cov = (gxc_raw.data > 0).sum(axis=1)
60 |     gene_cov = np.array(gene_cov).squeeze()/n_cell  # fraction of cells covered
61 |     cond = gene_cov > sufficient_cell_coverage
62 |     gxc_raw_filtered = GC_matrix(np.array(gxc_raw.gene)[cond],
63 |                                  gxc_raw.cell,
64 |                                  gxc_raw.data.tocsr()[cond, :],
65 |                                  )
66 |     return gxc_raw_filtered
67 | 
68 | def preproc_rna_cpm_based(gxc_raw, sufficient_cell_coverage=0.01,
69 |                           hv_percentile=30, hv_ncut=20):
70 |     # select genes expressed in > 1% of cells
71 |     # raw genes
72 |     # _gxc_tmp, gxc_ftr, hvgs
73 |     logging.info("Removing low coverage genes...")
74 |     lib_size = np.ravel(gxc_raw.data.sum(axis=0))
75 |     _gxc_tmp = filter_genes(gxc_raw, sufficient_cell_coverage=sufficient_cell_coverage)
76 | 
77 |     # CPM matrix
logging.info("Getting CPM..") 79 | gxc_ftr = basic_utils.sparse_logcpm(_gxc_tmp, mode='cpm', lib_size=lib_size) 80 | del _gxc_tmp 81 | 82 | # select highy variable genes 83 | logging.info("Getting highly variable genes and logCPM...") 84 | hvgs = select_hvg(gxc_ftr, percentile=hv_percentile, n_qcut=hv_ncut) 85 | 86 | gxc_hvftr = GC_matrix( 87 | gxc_ftr.gene[hvgs], 88 | gxc_ftr.cell, 89 | gxc_ftr.data.tocsr()[hvgs, :], 90 | ) 91 | del gxc_ftr 92 | gxc_hvftr.data.data = np.log10(1+gxc_hvftr.data.data) # very important 93 | logging.info("Number of genes: {}".format(len(hvgs))) 94 | return gxc_hvftr 95 | 96 | def preproc_rna_cpm_based_kruskal(metadata, cluster_col, gxc_raw, sufficient_cell_coverage=0.01, 97 | hv_percentile=30): 98 | # select genes expressed in > 1% of cells 99 | # raw genes 100 | # _gxc_tmp, gxc_ftr, hvgs 101 | 102 | logging.info("Removing low coverage genes...") 103 | lib_size = np.ravel(gxc_raw.data.sum(axis=0)) 104 | _gxc_tmp = filter_genes(gxc_raw, sufficient_cell_coverage=sufficient_cell_coverage) 105 | 106 | # CPM matrix 107 | logging.info("Getting CPM..") 108 | gxc_ftr = basic_utils.sparse_logcpm(_gxc_tmp, mode='logcpm', lib_size=lib_size) # logcpm for kw 109 | del _gxc_tmp 110 | 111 | # select highy variable genes 112 | logging.info("Getting highly variable genes and logCPM...") 113 | # select genes with KW test 114 | datasets = [] 115 | for clst, df_sub in metadata.groupby(cluster_col): 116 | cell_idx = basic_utils.get_index_from_array(gxc_ftr.cell, df_sub.index.values) 117 | datasets.append(gxc_ftr.data.tocsc()[:,cell_idx].tocsr()) 118 | ps = [] 119 | for i, gene in enumerate(gxc_ftr.gene): 120 | if i%1000==0: 121 | logging.info(i) 122 | gene_data = [np.ravel(np.array(dataset[i,:].todense())) for dataset in datasets] 123 | try: 124 | _, p = kruskal(*gene_data) 125 | except: 126 | p = 1 127 | ps.append(p) 128 | p_th = np.percentile(ps, hv_percentile) 129 | logging.info("Pvalue threshold p_th: {}".format(p_th)) 130 | hvgs = np.arange(len(ps))[ps<=p_th] 131 | 132 | gxc_hvftr = GC_matrix( 133 | gxc_ftr.gene[hvgs], 134 | gxc_ftr.cell, 135 | gxc_ftr.data.tocsr()[hvgs, :], 136 | ) 137 | del gxc_ftr 138 | gxc_hvftr.data.data = np.log10(1+gxc_hvftr.data.data) # very important 139 | logging.info("Number of genes: {}".format(len(hvgs))) 140 | return gxc_hvftr 141 | 142 | def preproc_rna_tpm_based(gxc_raw, gene_lengths, 143 | impute_gene_lengths=True, 144 | sufficient_cell_coverage=0.01, 145 | hv_percentile=30, hv_ncut=20): 146 | """Gene lengths is a gene length pandas series indexed by gene names 147 | """ 148 | # gxc_raw, gxc_logtpm 149 | # _gxc_tmp, gxc_ftr, hvgs 150 | 151 | assert np.all(gxc_raw.gene == gene_lengths.index.values) 152 | if impute_gene_lengths: 153 | logging.info("Imputing gene lengths...") 154 | gene_lengths = gene_lengths.fillna(gene_lengths.mean()) 155 | lib_size = np.ravel(gxc_raw.data.sum(axis=0)) 156 | 157 | # select genes expressed in > 1% of cells 158 | logging.info("Removing low coverage genes...") 159 | _gxc_tmp = filter_genes(gxc_raw, sufficient_cell_coverage=sufficient_cell_coverage) 160 | 161 | # CPM matrix 162 | logging.info("Getting CPM..") 163 | gxc_ftr = basic_utils.sparse_logcpm(_gxc_tmp, mode='cpm', lib_size=lib_size) 164 | del _gxc_tmp 165 | 166 | # select highy variable genes 167 | logging.info("Getting highly variable genes and logCPM...") 168 | hvgs = select_hvg(gxc_ftr, percentile=hv_percentile, n_qcut=hv_ncut) # index in gxc_ftr 169 | hvgs_genes = gxc_ftr.gene[hvgs] 170 | del gxc_ftr 171 | 172 | # TPM matrix from gxc_raw 173 | 
logging.info("Getting logTPM...") 174 | gxc_logtpm = basic_utils.sparse_logtpm(gxc_raw, gene_lengths) 175 | hvgs_idx = basic_utils.get_index_from_array(gxc_logtpm.gene, hvgs_genes) 176 | 177 | # Trim logTPM matrix 178 | logging.info("Trim logTPM matrix...") 179 | gxc_hvftr = GC_matrix( 180 | gxc_logtpm.gene[hvgs_idx], 181 | gxc_logtpm.cell, 182 | gxc_logtpm.data.tocsr()[hvgs_idx, :], 183 | ) 184 | logging.info("Number of genes: {}".format(len(hvgs_idx))) 185 | return gxc_hvftr 186 | 187 | def preproc_rna_tpm_based_kruskal(metadata, cluster_col, gxc_raw, gene_lengths, 188 | impute_gene_lengths=True, 189 | sufficient_cell_coverage=0.01, 190 | hv_percentile=30): 191 | """Gene lengths is a gene length pandas series indexed by gene names 192 | """ 193 | 194 | assert np.all(gxc_raw.gene == gene_lengths.index.values) 195 | if impute_gene_lengths: 196 | logging.info("Imputing gene lengths...") 197 | gene_lengths = gene_lengths.fillna(gene_lengths.mean()) 198 | lib_size = np.ravel(gxc_raw.data.sum(axis=0)) 199 | 200 | # select genes expressed in > 1% of cells 201 | logging.info("Removing low coverage genes...") 202 | _gxc_tmp = filter_genes(gxc_raw, sufficient_cell_coverage=sufficient_cell_coverage) 203 | 204 | # CPM matrix 205 | logging.info("Getting CPM..") 206 | gxc_ftr = basic_utils.sparse_logcpm(_gxc_tmp, mode='logcpm', lib_size=lib_size) 207 | del _gxc_tmp 208 | 209 | 210 | logging.info("Getting highly variable genes...") 211 | # select genes with KW test 212 | datasets = [] 213 | for clst, df_sub in metadata.groupby(cluster_col): 214 | cell_idx = basic_utils.get_index_from_array(gxc_ftr.cell, df_sub.index.values) 215 | datasets.append(gxc_ftr.data.tocsc()[:,cell_idx].tocsr()) 216 | ps = [] 217 | for i, gene in enumerate(gxc_ftr.gene): 218 | if i%1000==0: 219 | logging.info(i) 220 | gene_data = [np.ravel(np.array(dataset[i,:].todense())) for dataset in datasets] 221 | try: 222 | s, p = kruskal(*gene_data) 223 | except: 224 | p = 1 225 | ps.append(p) 226 | 227 | p_th = np.percentile(ps, hv_percentile) 228 | logging.info("Pvalue threshold p_th: {}".format(p_th)) 229 | hvgs = np.arange(len(ps))[ps<=p_th] 230 | hvgs_genes = gxc_ftr.gene[hvgs] 231 | del gxc_ftr 232 | 233 | # TPM matrix from gxc_raw 234 | logging.info("Getting logTPM...") 235 | gxc_logtpm = basic_utils.sparse_logtpm(gxc_raw, gene_lengths) 236 | hvgs_idx = basic_utils.get_index_from_array(gxc_logtpm.gene, hvgs_genes) 237 | 238 | # Trim logTPM matrix 239 | logging.info("Trim logTPM matrix...") 240 | gxc_hvftr = GC_matrix( 241 | gxc_logtpm.gene[hvgs_idx], 242 | gxc_logtpm.cell, 243 | gxc_logtpm.data.tocsr()[hvgs_idx, :], 244 | ) 245 | logging.info("Number of genes: {}".format(len(hvgs_idx))) 246 | return gxc_hvftr 247 | 248 | def preproc_methylation( 249 | gxc_raw, 250 | metadata, 251 | global_value_col='mCH', 252 | base_call_cutoff=20, 253 | sufficient_coverage_fraction=0.95, 254 | hv_percentile=30, 255 | n_qcut=10, 256 | ): 257 | """ 258 | """ 259 | # select genes covered (20 counts) in > 95% of cells 260 | df_mc = pd.DataFrame(gxc_raw.data['mc'], index=gxc_raw.gene, columns=gxc_raw.cell) 261 | df_c = pd.DataFrame(gxc_raw.data['c'], index=gxc_raw.gene, columns=gxc_raw.cell) 262 | 263 | n_gene, n_cell = df_c.shape 264 | gene_cov = (df_c > base_call_cutoff).sum(axis=1)/n_cell # fraction of cells covered 265 | cond = gene_cov>sufficient_coverage_fraction 266 | df_mc = df_mc[cond] 267 | df_c = df_c[cond] 268 | 269 | # compute normalized methylation matrix (no need to further select genes) 270 | df_mcc = basic_utils.get_mcc_lite_v2(df_c, 
-------------------------------------------------------------------------------- /setup.sh: --------------------------------------------------------------------------------
#!/bin/bash

# Add the scripts/ directory (SingleCellFusion CLI entry points) to PATH.
# Source this file from the repo root ("source ./setup.sh") so the PATH
# change persists in the current shell; running it directly only affects a
# subshell.
rpath=$(realpath ./scripts)
echo "$rpath"
export PATH="$PATH:$rpath"
--------------------------------------------------------------------------------