├── .github
└── workflows
│ └── CI.yml
├── .gitignore
├── .readthedocs.yml
├── LICENSE
├── README.md
├── R_scripts
├── README.md
└── scan_for_kmers_motifs.R
├── docs
├── Makefile
├── environment.yml
├── make.bat
├── requirements.txt
└── source
│ ├── API.rst
│ ├── About SIMBA.rst
│ ├── Basic concepts.rst
│ ├── Citation.rst
│ ├── Installation.rst
│ ├── Makefile
│ ├── Output.rst
│ ├── Release notes.rst
│ ├── _ext
│ └── edit_on_github.py
│ ├── _static
│ └── img
│ │ ├── Figure1.png
│ │ ├── lion_icon.svg
│ │ └── logo_simba.png
│ ├── conf.py
│ ├── index.rst
│ └── make.bat
├── pytest.ini
├── requirements.txt
├── setup.py
├── simba
├── __init__.py
├── _settings.py
├── _utils.py
├── _version.py
├── data
│ └── gene_anno
│ │ ├── hg19_genes.bed
│ │ ├── hg38_genes.bed
│ │ ├── mm10_genes.bed
│ │ └── mm9_genes.bed
├── datasets
│ ├── __init__.py
│ └── _datasets.py
├── plotting
│ ├── __init__.py
│ ├── _palettes.py
│ ├── _plot.py
│ ├── _post_training.py
│ └── _utils.py
├── preprocessing
│ ├── __init__.py
│ ├── _general.py
│ ├── _pca.py
│ ├── _qc.py
│ ├── _utils.py
│ └── _variable_genes.py
├── readwrite.py
└── tools
│ ├── __init__.py
│ ├── _gene_scores.py
│ ├── _general.py
│ ├── _integration.py
│ ├── _pbg.py
│ ├── _post_training.py
│ ├── _umap.py
│ └── _utils.py
└── tests
├── data
├── 10xpbmc_atac_subset.h5ad
├── 10xpbmc_rna_subset.h5ad
├── pbg_training
│ ├── entity_alias.txt
│ ├── graph_stats.json
│ ├── input
│ │ └── entity
│ │ │ ├── entity_count_C_0.txt
│ │ │ ├── entity_count_G_0.txt
│ │ │ ├── entity_names_C_0.json
│ │ │ └── entity_names_G_0.json
│ ├── model
│ │ ├── checkpoint_version.txt
│ │ ├── config.json
│ │ ├── embeddings_C_0.v10.h5
│ │ ├── embeddings_G_0.v10.h5
│ │ ├── model.v10.h5
│ │ └── training_stats.json
│ └── pbg_graph.txt
└── preprocessed
│ ├── atac_preprocessed.h5ad
│ └── rna_preprocessed.h5ad
├── test_pbg_training.py
├── test_post_training.py
└── test_preprocessing.py
/.github/workflows/CI.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | build-linux:
7 | runs-on: ubuntu-latest
8 | strategy:
9 | max-parallel: 5
10 | matrix:
11 | python-version: [3.7, 3.8, 3.9]
12 |
13 | steps:
14 | - uses: actions/checkout@v2
15 | - name: Set up Python ${{ matrix.python-version }}
16 | uses: actions/setup-python@v2
17 | with:
18 | python-version: ${{ matrix.python-version }}
19 | - name: Add conda to system path
20 | run: |
21 | # $CONDA is an environment variable pointing to the root of the miniconda directory
22 | echo $CONDA/bin >> $GITHUB_PATH
23 | - name: Install dependencies
24 | run: |
25 | conda config --add channels defaults
26 | conda config --add channels bioconda
27 | conda config --add channels conda-forge
28 | conda config --set channel_priority strict
29 | # conda env update --file environment.yml --name base
30 | conda install simba
31 | python -m pip install --upgrade pip
32 | pip install -r requirements.txt
33 | pip install -e .
34 | - name: Lint with flake8
35 | run: |
36 | conda install flake8
37 | # stop the build if there are Python syntax errors or undefined names
38 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
39 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
40 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
41 | - name: Test with pytest
42 | run: |
43 | conda install pytest pytest-cov
44 | pytest --cov
45 | - name: Coverage report
46 | run: |
47 | bash <(curl -s https://codecov.io/bash)
48 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by https://www.gitignore.io/api/osx,python,windows
2 |
3 | ### OSX ###
4 | *.DS_Store
5 | .AppleDouble
6 | .LSOverride
7 |
8 | # Icon must end with two \r
9 | Icon
10 |
11 | # Thumbnails
12 | ._*
13 |
14 | # Files that might appear in the root of a volume
15 | .DocumentRevisions-V100
16 | .fseventsd
17 | .Spotlight-V100
18 | .TemporaryItems
19 | .Trashes
20 | .VolumeIcon.icns
21 | .com.apple.timemachine.donotpresent
22 |
23 | # Directories potentially created on remote AFP share
24 | .AppleDB
25 | .AppleDesktop
26 | Network Trash Folder
27 | Temporary Items
28 | .apdisk
29 |
30 | ### Python ###
31 | # Byte-compiled / optimized / DLL files
32 | __pycache__/
33 | *.py[cod]
34 | *$py.class
35 |
36 | # C extensions
37 | *.so
38 |
39 | # Distribution / packaging
40 | .Python
41 | build/
42 | develop-eggs/
43 | dist/
44 | downloads/
45 | eggs/
46 | .eggs/
47 | lib/
48 | lib64/
49 | parts/
50 | sdist/
51 | var/
52 | wheels/
53 | *.egg-info/
54 | .installed.cfg
55 | *.egg
56 |
57 | # PyInstaller
58 | # Usually these files are written by a python script from a template
59 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
60 | *.manifest
61 | *.spec
62 |
63 | # Installer logs
64 | pip-log.txt
65 | pip-delete-this-directory.txt
66 |
67 | # Unit test / coverage reports
68 | htmlcov/
69 | .tox/
70 | .coverage
71 | .coverage.*
72 | .cache
73 | .pytest_cache/
74 | nosetests.xml
75 | coverage.xml
76 | *.cover
77 | .hypothesis/
78 |
79 | # Translations
80 | *.mo
81 | *.pot
82 |
83 | # Flask stuff:
84 | instance/
85 | .webassets-cache
86 |
87 | # Scrapy stuff:
88 | .scrapy
89 |
90 | # Sphinx documentation
91 | docs/_build/
92 |
93 | # PyBuilder
94 | target/
95 |
96 | # Jupyter Notebook
97 | .ipynb_checkpoints
98 |
99 | # pyenv
100 | .python-version
101 |
102 | # celery beat schedule file
103 | celerybeat-schedule.*
104 |
105 | # SageMath parsed files
106 | *.sage.py
107 |
108 | # Environments
109 | .env
110 | .venv
111 | env/
112 | venv/
113 | ENV/
114 | env.bak/
115 | venv.bak/
116 |
117 | # Spyder project settings
118 | .spyderproject
119 | .spyproject
120 |
121 | # Rope project settings
122 | .ropeproject
123 |
124 | # mkdocs documentation
125 | /site
126 |
127 | # mypy
128 | .mypy_cache/
129 |
130 | ### Windows ###
131 | # Windows thumbnail cache files
132 | Thumbs.db
133 | ehthumbs.db
134 | ehthumbs_vista.db
135 |
136 | # Folder config file
137 | Desktop.ini
138 |
139 | # Recycle Bin used on file shares
140 | $RECYCLE.BIN/
141 |
142 | # Windows Installer files
143 | *.cab
144 | *.msi
145 | *.msm
146 | *.msp
147 |
148 | # Windows shortcuts
149 | *.lnk
150 |
151 | # R
152 | *.Rhistory
153 |
154 | # Sphinx
155 | docs/source/_autosummary/
156 |
157 | # End of https://www.gitignore.io/api/osx,python,windows
158 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | conda:
4 | environment: docs/environment.yml
5 |
6 | build:
7 | image: latest
8 |
9 | sphinx:
10 | builder: html
11 | configuration: docs/source/conf.py
12 | fail_on_warning: false
13 |
14 | python:
15 | version: 3.7
16 | # install:
17 | # - method: pip
18 | # path: .
19 | # extra_requirements:
20 | # - docs
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2021, Huidong Chen, Pinello Lab
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | 3. Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [](https://simba-bio.readthedocs.io/en/latest/)
2 | [](https://github.com/pinellolab/simba/actions/workflows/CI.yml)
3 | [](https://anaconda.org/bioconda/simba)
4 | [](https://codecov.io/gh/pinellolab/simba)
5 |
6 | # SIMBA
7 |
8 | SIMBA: **SI**ngle-cell e**MB**edding **A**long with features
9 |
10 | Main website, documentation and tutorials: https://simba-bio.readthedocs.io
11 |
12 | Preprint: Huidong Chen, Jayoung Ryu, Michael E. Vinyard, Adam Lerer & Luca Pinello. ["SIMBA: SIngle-cell eMBedding Along with features. *bioRxiv, 2021.10.17.464750v1* (2021)."](https://www.biorxiv.org/content/10.1101/2021.10.17.464750v1)
13 |
14 | The scripts used for the comparison analyses in the manuscript can be found [here](https://github.com/pinellolab/simba_comparison).
15 |
16 |
17 |
18 | ## Installation
19 | Before installing SIMBA, make sure to have the correct channel priority by executing these commands:
20 | ```
21 | conda config --add channels defaults
22 | conda config --add channels bioconda
23 | conda config --add channels conda-forge
24 | conda config --set channel_priority strict
25 | ```
26 |
27 | To install the simba package with conda, run:
28 | ```
29 | conda create -n env_simba jupyter simba
30 | ```
31 |
32 | To enable the k-mer and TF analyses, please install these additional dependencies (optional):
33 | ```
34 | conda install r-essentials r-optparse bioconductor-jaspar2020 bioconductor-biostrings bioconductor-tfbstools bioconductor-motifmatchr bioconductor-summarizedexperiment r-doparallel bioconductor-rhdf5 bioconductor-hdf5array
35 | ```
36 |
37 | ## [SIMBA v1.2 (dev)](https://github.com/pinellolab/simba/tree/dev) update
38 | We have added the support for
39 | * Continuous edge weight encoding for scRNA-seq ([tutorial](https://github.com/pinellolab/simba_tutorials/blob/main/v1.2/rna_10xpmbc_edgeweigts.ipynb))
40 | * Significance testing of features' cell type specificity metrics ([tutorial](https://github.com/pinellolab/simba_tutorials/tree/main/v1.1sig))
41 |
42 | ### SIMBA v1.2 Installation
43 | To install the latest development version of simba:
44 | ```
45 | conda create -n env_simba_dev jupyter pytorch pybedtools -y
46 | pip install 'simba @ git+https://github.com/pinellolab/simba@dev'
47 | ```
48 | To enable the k-mer and TF analyses, please install these additional dependencies (optional):
49 | ```
50 | conda install r-essentials r-optparse bioconductor-jaspar2020 bioconductor-biostrings bioconductor-tfbstools bioconductor-motifmatchr bioconductor-summarizedexperiment r-doparallel bioconductor-rhdf5 bioconductor-hdf5array
51 | ```
52 |
53 | Please refer to the main documentation website to learn how to use SIMBA with the provided tutorials: https://simba-bio.readthedocs.io
54 |
55 |
--------------------------------------------------------------------------------
/R_scripts/README.md:
--------------------------------------------------------------------------------
1 | # Installation
2 |
3 | To run `scan_for_kmers_motifs.R`:
4 |
5 | step1: install all the dependencies:
6 |
7 | ```sh
8 | $ conda install r-essentials r-optparse bioconductor-jaspar2020 bioconductor-biostrings bioconductor-tfbstools bioconductor-motifmatchr bioconductor-summarizedexperiment r-doparallel bioconductor-rhdf5 bioconductor-hdf5array
9 | ```
10 |
11 | step2: run `Rscript scan_for_kmers_motifs.R -h`
12 |
13 | e.g.,
14 | ```sh
15 | $ Rscript scan_for_kmers_motifs.R -i peaks.bed -g hg19.fa -s 'Homo sapiens'
16 | ```
17 |
--------------------------------------------------------------------------------
/R_scripts/scan_for_kmers_motifs.R:
--------------------------------------------------------------------------------
1 | # This script scans specified regions for kmers and/or motifs using the JASPAR2020 database.
2 | # It outputs regions-by-kmers/motifs frequency matrix in .h5 format
3 |
4 | # Author: Huidong Chen
5 | # Contact information: hd7chen AT gmail DOT com
6 |
7 | suppressMessages(library(optparse,quietly = TRUE))
8 |
9 | main <- function(){
10 | option_list = list(
11 | make_option(c("-i", "--input"), type="character", default=NULL,
12 | help="input region file name in .bed format", metavar="character"),
13 | make_option(c("-g", "--genome"), type="character", default=NULL,
14 | help="Path to reference genome", metavar="character"),
15 | make_option(c("--no_kmer"), action = "store_true",default=FALSE,
16 | help="disable scanning for kmers"),
17 | make_option(c("--no_motif"), action = "store_true",default=FALSE,
18 | help="disable scanning for motifs"),
19 | make_option(c("-k","--k_kmer"), type="integer", default=6,
20 | help="k-mer length [default = %default].", metavar="integer"),
21 | make_option(c("-s","--species"), type="character", default=NULL,
22 | help="Species of motifs in the JASPAR database.
23 | Choose from 'Homo sapiens','Mus musculus'. Only valid when motif is used",
24 | metavar="character"),
25 | make_option(c("-o", "--output"), type="character", default='output_kmers_motifs',
26 | help="Output folder [default = %default]", metavar="character")
27 | )
28 |
29 | opt_parser = OptionParser(option_list=option_list)
30 | opt = parse_args(opt_parser)
31 |
32 | if(is.null(opt$input)){
33 | print_help(opt_parser)
34 | stop("input region file must be specified", call.=FALSE)
35 | }
36 | if(!opt$no_motif){
37 | if(any(is.null(opt$genome),is.null(opt$species))){
38 | print_help(opt_parser)
39 | stop("reference genome and species must be both specified", call.=FALSE)
40 | }
41 | }
42 |
43 | file.input = opt$input
44 | genome = opt$genome
45 | no_kmer = opt$no_kmer
46 | no_motif = opt$no_motif
47 | k = opt$k_kmer
48 | species = opt$species
49 | dir.output = opt$output
50 |
51 | suppressMessages(library(rhdf5))
52 | suppressMessages(library(HDF5Array)) # used for saving sparse matrix
53 | suppressMessages(library(Biostrings))
54 | suppressMessages(library(Matrix))
55 | suppressMessages(library(TFBSTools))
56 | suppressMessages(library(JASPAR2020))
57 | suppressMessages(library(motifmatchr))
58 | suppressMessages(library(SummarizedExperiment))
59 | suppressMessages(library(doParallel))
60 |
61 | set.seed(2020)
62 |
63 | system(paste0('mkdir -p ',dir.output))
64 |
65 | print('Converting .bed to .fasta ...')
66 | ### convert peaks bed file to fasta file
67 | file.input.fa = paste0(basename(file.input),'.fa')
68 | system(paste("bedtools getfasta -fi",genome,
69 | "-bed",file.input,
70 | "-fo",file.path(dir.output,file.input.fa)))
71 |
72 | peaks_seq <- readDNAStringSet(file.path(dir.output,file.input.fa), "fasta")
73 | peaks_name = gsub(":|-",'_',names(peaks_seq))
74 |
75 | ### count kmers
76 | if(!no_kmer){
77 | print('Scanning for kmers ...')
78 | freq_k = oligonucleotideFrequency(peaks_seq, k)
79 | rownames(freq_k) = peaks_name
80 | freq_k = as(freq_k, "sparseMatrix")
81 | }
82 |
83 | ### scan for TF motifs
84 | if(!no_motif){
85 | print('Scanning for TF motifs ...')
86 | opts <- list()
87 | opts["species"] <- species
88 | opts["collection"] <- "CORE"
89 | PFMatrixList = TFBSTools::getMatrixSet(JASPAR2020::JASPAR2020,opts = opts)
90 | motif_ix_scores <- motifmatchr::matchMotifs(PFMatrixList,peaks_seq, out = "scores")
91 | freq_motif = motifCounts(motif_ix_scores)
92 | motif_names = c()
93 | for (x in names(PFMatrixList)){
94 | motif_names = c(motif_names,PFMatrixList[[x]]@name)
95 | }
96 | colnames(freq_motif) = gsub("::",'_',motif_names)
97 | rownames(freq_motif) = peaks_name
98 | }
99 |
100 | ### save results
101 | ### save kmers
102 | if(!no_kmer){
103 | print('Saving kmer matrix ...')
104 |
105 | # output_dir = file.path(dir.output, 'freq_k')
106 | # system(paste0('mkdir -p ',output_dir))
107 | # filename = 'freq_k.mtx'
108 | # writeMM(freq_k,file = file.path(output_dir,filename))
109 | # write.table(rownames(freq_k),file.path(output_dir,'peaks.tsv'),quote=FALSE,row.names = FALSE,col.names = FALSE)
110 | # write.table(colnames(freq_k),file.path(output_dir,'kmers.tsv'),quote=FALSE,row.names = FALSE,col.names = FALSE)
111 |
112 | filename = 'freq_kmer.h5'
113 | # writeHDF5Array internally transposes the matrix so `t()` is used to counteract this operation
114 | writeHDF5Array(t(freq_k), file.path(dir.output,filename), name="mat", with.dimnames=FALSE, verbose=FALSE)
115 | # using this structure in order for anndata 'read_hdf' to recognize row names and column names
116 | h5write(rownames(freq_k), file.path(dir.output,filename), "row_names")
117 | h5write(colnames(freq_k), file.path(dir.output,filename), "col_names")
118 | }
119 |
120 | ### save motifs
121 | if(!no_motif){
122 | print('Saving motif matrix ...')
123 |
124 | # output_dir = file.path(dir.output, 'freq_motif')
125 | # system(paste0('mkdir -p ',output_dir))
126 | # filename = 'freq_motif.mtx'
127 | # writeMM(freq_motif,file = file.path(output_dir,filename))
128 | # write.table(rownames(freq_motif),file.path(output_dir,'peaks.tsv'),quote=FALSE,row.names = FALSE,col.names = FALSE)
129 | # write.table(colnames(freq_motif),file.path(output_dir,'motifs.tsv'),quote=FALSE,row.names = FALSE,col.names = FALSE)
130 |
131 | filename = 'freq_motif.h5'
132 | # writeHDF5Array internally transposes the matrix so `t()` is used to counteract this operation
133 | writeHDF5Array(t(freq_motif), file.path(dir.output,filename), name="mat", with.dimnames=FALSE, verbose=FALSE)
134 | # using this structure in order for anndata 'read_hdf' to recognize row names and column names
135 | h5write(rownames(freq_motif), file.path(dir.output,filename), "row_names")
136 | h5write(colnames(freq_motif), file.path(dir.output,filename), "col_names")
137 | }
138 |
139 | print('Finished.')
140 | }
141 |
142 | main()
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/environment.yml:
--------------------------------------------------------------------------------
1 | name: readthedocs
2 | channels:
3 | - conda-forge
4 | - bioconda
5 | - defaults
6 | dependencies:
7 | - simba
8 | - pandoc>=2.14
9 | - pip:
10 | - sphinx>=3.0
11 | - sphinx-rtd-theme>=0.5
12 | - nbsphinx>=0.8
13 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx>=3.0
2 | sphinx-rtd-theme>=0.5
3 | nbsphinx>=0.8
--------------------------------------------------------------------------------
/docs/source/API.rst:
--------------------------------------------------------------------------------
1 | .. automodule:: simba
2 |
3 | API
4 | ===
5 |
6 | Import simba as::
7 |
8 | import simba as si
9 |
10 | Configuration for SIMBA
11 | ~~~~~~~~~~~~~~~~~~~~~~~
12 | .. autosummary::
13 | :toctree: _autosummary
14 |
15 | settings.set_figure_params
16 | settings.set_pbg_params
17 | settings.set_workdir
18 |
19 |
20 | Reading
21 | ~~~~~~~
22 |
23 | .. autosummary::
24 | :toctree: _autosummary
25 |
26 | read_csv
27 | read_h5ad
28 | read_10x_h5
29 | read_mtx
30 | read_embedding
31 | load_pbg_config
32 | load_graph_stats
33 |
34 | See more at `anndata <https://anndata.readthedocs.io>`_
35 |
36 | Preprocessing
37 | ~~~~~~~~~~~~~
38 |
39 | .. autosummary::
40 | :toctree: _autosummary
41 |
42 | pp.log_transform
43 | pp.normalize
44 | pp.binarize
45 | pp.cal_qc
46 | pp.cal_qc_rna
47 | pp.cal_qc_atac
48 | pp.filter_samples
49 | pp.filter_cells_rna
50 | pp.filter_cells_atac
51 | pp.filter_features
52 | pp.filter_genes
53 | pp.filter_peaks
54 | pp.pca
55 | pp.select_pcs
56 | pp.select_pcs_features
57 | pp.select_variable_genes
58 |
59 | Tools
60 | ~~~~~
61 |
62 | .. autosummary::
63 | :toctree: _autosummary
64 |
65 | tl.discretize
66 | tl.umap
67 | tl.gene_scores
68 | tl.infer_edges
69 | tl.trim_edges
70 | tl.gen_graph
71 | tl.pbg_train
72 | tl.softmax
73 | tl.embed
74 | tl.compare_entities
75 | tl.query
76 | tl.find_master_regulators
77 | tl.find_target_genes
78 |
79 |
80 | Plotting
81 | ~~~~~~~~
82 |
83 | .. autosummary::
84 | :toctree: _autosummary
85 |
86 | pl.pca_variance_ratio
87 | pl.pcs_features
88 | pl.variable_genes
89 | pl.violin
90 | pl.hist
91 | pl.umap
92 | pl.discretize
93 | pl.node_similarity
94 | pl.svd_nodes
95 | pl.pbg_metrics
96 | pl.entity_metrics
97 | pl.entity_barcode
98 | pl.query
99 |
100 |
101 | Datasets
102 | ~~~~~~~~
103 |
104 | .. autosummary::
105 | :toctree: _autosummary
106 |
107 | datasets.rna_10xpmbc3k
108 | datasets.rna_han2018
109 | datasets.rna_tmc2018
110 | datasets.rna_baron2016
111 | datasets.rna_muraro2016
112 | datasets.rna_segerstolpe2016
113 | datasets.rna_wang2016
114 | datasets.rna_xin2016
115 | datasets.atac_buenrostro2018
116 | datasets.atac_10xpbmc5k
117 | datasets.atac_chen2019
118 | datasets.atac_cusanovich2018_subset
119 | datasets.multiome_ma2020_fig4
120 | datasets.multiome_chen2019
121 | datasets.multiome_10xpbmc10k
122 |
--------------------------------------------------------------------------------
/docs/source/About SIMBA.rst:
--------------------------------------------------------------------------------
1 | About SIMBA
2 | ===========
3 |
4 | SIMBA ( **SI**\ ngle-cell e\ **MB**\ edding **A**\ long with features) is a versatile single-cell embedding method that co-embeds cells and features into the same latent space. By formulating single-cell analyses as multi-entity graph embedding problems, SIMBA can be used to solve popular single cell tasks that appear very different in a single framework.
5 |
6 | For each task, SIMBA constructs a graph with nodes of different entities (cells and features), and edges of different types indicating relations between these entities. SIMBA then applies multi-entity graph embedding algorithms adapted from the literature on social network and knowledge graph embeddings on this graph, and introduces a Softmax-based transformation to embed these entities (nodes) into a low-dimensional space such that the embeddings of these entities are comparable.
7 |
8 | We show that the SIMBA framework can perform many important single-cell analyses, including dimensionality reduction techniques for studying cellular states; clustering-free marker detection based on the similarity between single cells and features; single-cell multimodal analysis and the study of gene regulation; batch correction and omics integration analysis and simultaneous identification of marker features. SIMBA can be adapted to these diverse analysis tasks by simply modifying how the input graph is constructed from the relevant single-cell data. We believe that SIMBA will simplify the task of adapting single-cell analysis to new tasks and single-cell modalities.
--------------------------------------------------------------------------------
/docs/source/Basic concepts.rst:
--------------------------------------------------------------------------------
1 | ================
2 | Basic concepts
3 | ================
4 |
5 |
6 | Graph construction
7 | ~~~~~~~~~~~~~~~~~~
8 | SIMBA encodes entities of different types, including genes, open chromatin regions (peaks or bins), and DNA sequences (transcription factor motifs or k-mers), into a single large graph based on the relation between them. In this graph, nodes represent different entities and edges indicate the relation between entities.
9 |
10 | * In scRNA-seq analysis, each node represents either a cell or a gene. If a gene is expressed in a cell, then an edge is added between this gene and cell. The gene expression level is encoded into the weight of this edge.
11 |
12 | * In scATAC-seq analysis, each node represents either a cell or a region (peak/bin). If a region is open in a cell, then an edge is added between this region and cell. Optionally, if DNA sequences (TF motifs or k-mers) are also used, each node represents a cell, or a region, or a DNA sequence. In addition to the relation between a cell and a region, if a DNA sequence is found within the open region, then an edge is added between this DNA sequence and open region.
13 |
14 | * In multimodal analysis, each node can be any of these entities, including a cell, a gene, an open region, a DNA sequence, etc. Edges are added similarly to scRNA-seq analysis and scATAC-seq analysis.
15 |
16 | * In batch correction analysis, in addition to the experimentally measured edges as described above, batch correction is further enhanced with the computationally inferred edges between cell nodes across datasets using a truncated randomized singular value decomposition (SVD)-based procedure.
17 |
18 | * In multiomics integration analysis (scRNA-seq and scATAC-seq), SIMBA first builds one graph for scRNA-seq data and one graph for scATAC-seq data independently as described above. To connect these two graphs, SIMBA calculates gene scores by summarizing accessible regions from scATAC-seq data and then infer edges between cells of different omics based on their shared gene expression modules through a similar procedure as in batch correction.
19 |
20 | PBG training
21 | ~~~~~~~~~~~~
22 | Following the construction of a multi-relational graph between biological entities, we adapt graph embedding techniques from the knowledge graph and recommendation systems literature to construct unsupervised representations for these entities.
23 |
24 | We use the PyTorch-BigGraph(PBG) framework, which provides efficient computation of multi-relation graph embeddings over multiple entity types and can scale to graphs with millions or billions of entities.
25 |
26 | In SIMBA, several key modifications have been made based on PBG, including:
27 |
28 | * Type-constrainted negative sampling
29 |
30 | * Negative samples are produced in two ways:
31 |
32 | * by corrupting the edge with a source or destination sampled uniformly from the nodes with the correct types for this relation;
33 |
34 | * by corrupting the edge with a source or destination node sampled with probability proportional to its degree.
35 |
36 | * Introducing a weight decay procedure to solve overfitting problem.
37 |
38 | The resulting graph embeddings have two desirable properties that we will take advantage of:
39 |
40 | #. First-order similarity: for two entity types with a relation between them, edges with high likelihood should have higher dot product.
41 | #. Second-order similarity: within a single entity type, entities that have ‘similar contexts’, i.e., a similar distribution of edge probabilities, should have similar embeddings.
42 |
43 | Evaluation during training
44 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
45 | During the PBG training procedure, a small percent of edges is held out (by default, the evaluation fraction is set to 5%) to monitor overfitting and evaluate the final model.
46 |
47 | Five metrics are computed on the reserved set of edges, including mean reciprocal rank (MRR, the average of the reciprocal of the ranks of all positives), R1 (the fraction of positives that rank better than all their negatives, i.e., have a rank of 1), R10 (the fraction of positives that rank in the top 10 among their negatives), R50 (the fraction of positives that rank in the top 50 among their negatives), and AUC (Area Under the Curve).
48 |
49 | By default, we show MRR along with training loss and validation loss while other metrics are also available in the SIMBA package (Supplementary Fig. 1a). The learning curves for validation loss and these metrics can be used to determine when training has completed. The relative values of training and validation loss along with these evaluation metrics can be used to identify issues with training (underfitting vs overfitting) and tune the hyperparameters weight decay, embedding dimension, and number of training epochs appropriately. However, for most datasets we find that the default parameters do not need tuning.
50 |
51 | Softmax transformation
52 | ~~~~~~~~~~~~~~~~~~~~~~
53 | PyTorch-BigGraph training provides initial embeddings of all entities (nodes). However, entities of different types (e.g., cells vs peaks, cells of different batches or modalities) have different edge distributions and thus may lie on different manifolds of the latent space. To make the embeddings of entities of different types comparable, we transform the embeddings of features with the Softmax function by utilizing the first-order similarity between cells (reference) and features (query). In the case of batch correction or multi-omics integration, the Softmax transformation is also performed based on the first-order similarity between cells of different batches or modalities.
54 |
--------------------------------------------------------------------------------
/docs/source/Citation.rst:
--------------------------------------------------------------------------------
1 | Citation
2 | ========
3 |
4 | Chen, H., Ryu, J., Vinyard, M.E., Lerer, A. & Pinello, L. SIMBA: SIngle-cell eMBedding Along with features. bioRxiv, 2021.2010.2017.464750 (2021).
5 |
6 | Please check out our `preprint <https://www.biorxiv.org/content/10.1101/2021.10.17.464750v1>`_ on bioRxiv to learn more.
--------------------------------------------------------------------------------
/docs/source/Installation.rst:
--------------------------------------------------------------------------------
1 | Installation
2 | ============
3 |
4 | Anaconda
5 | ~~~~~~~~
6 |
7 | To install the `simba <https://anaconda.org/bioconda/simba>`_ package with conda, run::
8 |
9 | conda install -c bioconda simba
10 |
11 | **Recommended**: install *simba* in a new virtual environment::
12 |
13 | conda create -n env_simba python simba
14 | conda activate env_simba
15 |
16 | conda config --add channels defaults
17 | conda config --add channels bioconda
18 | conda config --add channels conda-forge
19 | conda config --set channel_priority strict
20 |
21 |
22 | Dev version
23 | ~~~~~~~~~~~
24 |
25 | To install the development version on `GitHub <https://github.com/pinellolab/simba>`_, run the following on top of the stable installation::
26 |
27 | pip install 'simba @ git+https://github.com/pinellolab/simba@dev'
28 |
29 |
--------------------------------------------------------------------------------
/docs/source/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/source/Output.rst:
--------------------------------------------------------------------------------
1 | Output
2 | ======
3 |
4 | SIMBA result structure will look like this:
5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
6 |
7 | ::
8 |
9 | result_simba
10 | ├── figures
11 | └── pbg
12 | └── graph0
13 | ├── pbg_graph.txt
14 | ├── graph_stats.json
15 | ├── entity_alias.txt
16 | └── input
17 | ├── edge
18 | └── entity
19 | └── model0
20 | ├── config.json
21 | ├── training_stats.json
22 | ├── checkpoint_version.txt
23 | ├── embeddings.h5
24 | └── model.h5
25 | └── model1
26 | ├── config.json
27 | ├── training_stats.json
28 | ├── checkpoint_version.txt
29 | ├── embeddings.h5
30 | └── model.h5
31 | └── model2
32 | ├── config.json
33 | ├── training_stats.json
34 | ├── checkpoint_version.txt
35 | ├── embeddings.h5
36 | └── model.h5
37 | └── graph1
38 | ├── pbg_graph.txt
39 | ├── graph_stats.json
40 | ├── entity_alias.txt
41 | └── input
42 | ├── edge
43 | └── entity
44 | └── model
45 | ├── config.json
46 | ├── training_stats.json
47 | ├── checkpoint_version.txt
48 | ├── embeddings.h5
49 | └── model.h5
50 |
51 | By default, all figures will be saved under ``result_simba/figures``
52 |
53 | PBG training will be saved under the folder ``result_simba/pbg``. Within this folder, each constructed graph is saved into a separate folder (by default ``graph0``) under ``pbg``. For each graph:
54 |
55 | - ``pbg_graph.txt`` stores its edges on which PBG training is performed;
56 | - ``graph_stats.json`` stores the statistics related to this graph;
57 | - ``entity_alias.txt`` keeps the mapping between the original entity IDs and their aliases.
58 | - ``input`` stores the extracted nodes (entities) and edges from ``pbg_graph.txt``, which are prepared for PBG training.
59 | - each model folder (named ``model`` by default) stores the training result of one parameter configuration.
--------------------------------------------------------------------------------
/docs/source/Release notes.rst:
--------------------------------------------------------------------------------
1 | Release notes
2 | =============
--------------------------------------------------------------------------------
/docs/source/_ext/edit_on_github.py:
--------------------------------------------------------------------------------
1 | """
2 | Sphinx extension to add ReadTheDocs-style "Edit on GitHub" links to the
3 | sidebar.
4 | """
5 |
6 | import os
7 | import warnings
8 |
9 | __licence__ = "BSD (3 clause)"
10 |
11 |
12 | # def get_github_repo(app, path):
13 | # if path.endswith(".ipynb"):
14 | # return app.config.github_nb_repo, "/"
15 | # return app.config.github_repo, "/docs/source/"
16 |
17 |
def html_page_context(app, pagename, templatename, context, doctree):
    """Point the theme's "Edit on GitHub" link at the right repository.

    Notebook pages (.ipynb) are pulled from the ``simba_tutorials`` repo,
    while regular pages live in the main ``simba`` repo under docs/source.
    Non-page templates and pages without a doctree are left untouched.
    """
    if templatename != "page.html" or doctree is None:
        return

    rel_path = os.path.relpath(doctree.get("source"), app.builder.srcdir)
    if rel_path.endswith(".ipynb"):
        # Tutorials repo; only the 1.3M mouse-brain notebook lives in v1.1.
        if rel_path.endswith("rna_10x_mouse_brain_1p3M.ipynb"):
            conf_py_path = "/v1.1/"
        else:
            conf_py_path = "/v1.0/"
        link_info = dict(
            display_github=True,
            github_user="huidongchen",
            github_repo="simba_tutorials",
            github_version="main",
            conf_py_path=conf_py_path,
        )
    else:
        link_info = dict(
            display_github=True,
            github_user="pinellolab",
            github_repo="simba",
            github_version="master",
            conf_py_path="/docs/source/",
        )
    context.update(link_info)
39 |
def setup(app):
    """Sphinx extension entry point.

    Registers the two repo-name config values and hooks the GitHub-link
    injection into the ``html-page-context`` event.
    """
    # third argument is the Sphinx "rebuild" condition for the config value
    app.add_config_value("github_nb_repo", "", True)
    app.add_config_value("github_repo", "", True)
    app.connect("html-page-context", html_page_context)
44 |
--------------------------------------------------------------------------------
/docs/source/_static/img/Figure1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pinellolab/simba/7b25fd089873aba9580f9923f2e412d375de9a46/docs/source/_static/img/Figure1.png
--------------------------------------------------------------------------------
/docs/source/_static/img/lion_icon.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/source/_static/img/logo_simba.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pinellolab/simba/7b25fd089873aba9580f9923f2e412d375de9a46/docs/source/_static/img/logo_simba.png
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
sys.path.insert(0, os.path.abspath('../simba'))
sys.path.insert(0, os.path.abspath('_ext'))
import simba  # noqa: E402


# -- Project information -----------------------------------------------------

project = 'SIMBA'
copyright = '2021, Huidong Chen'
author = 'Huidong Chen'

# The full version, including alpha/beta/rc tags
release = simba.__version__


# -- Retrieve notebooks (borrowed from scVelo) -------------------------------

from urllib.request import urlretrieve  # noqa: E402

notebooks_url = "https://github.com/huidongchen/simba_tutorials/raw/main/"
notebooks_v1_0 = [
    "rna_10xpmbc_all_genes.ipynb",
    "atac_buenrostro2018_peaks_and_sequences.ipynb",
    "multiome_shareseq.ipynb",
    "multiome_shareseq_GRN.ipynb",
    "rna_mouse_atlas.ipynb",
    "rna_human_pancreas.ipynb",
    "multiome_10xpmbc10k_integration.ipynb",
]
notebooks_v1_1 = [
    "rna_10x_mouse_brain_1p3M.ipynb",
]
# Download the tutorial notebooks into the source dir so nbsphinx can render
# them. Failures are deliberately ignored so the docs still build offline.
for _version, _notebooks in (("v1.0", notebooks_v1_0),
                             ("v1.1", notebooks_v1_1)):
    for nb in _notebooks:
        try:
            urlretrieve(f"{notebooks_url}{_version}/{nb}", nb)
        except Exception:
            pass

# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.

needs_sphinx = "3.0"

extensions = [
    "sphinx.ext.autodoc",
    "sphinx.ext.autosummary",
    'sphinx.ext.napoleon',
    "sphinx.ext.intersphinx",
    "sphinx.ext.mathjax",
    "sphinx.ext.viewcode",
    "nbsphinx",
    "edit_on_github",
]

autosummary_generate = True

# Napoleon settings
napoleon_google_docstring = False

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['_build']

# Add prolog for notebooks

# nbsphinx_prolog = r"""
# {% set docname = 'github/huidongchen/simba_tutorials/blob/main/v1.0/' + env.doc2path(env.docname, base=None) %}
# """

# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
html_theme_options = {
    "navigation_depth": 1,
    "titles_only": True,
    'logo_only': True,
}
html_show_sphinx = False
html_logo = '_static/img/logo_simba.png'
html_favicon = '_static/img/lion_icon.svg'
# html_context = dict(
#     display_github=True,
#     github_user='pinellolab',
#     github_repo='simba',
#     github_version='master',
#     conf_py_path='/docs/source/',
# )
# html_context = dict(
#     display_github=True,
#     github_user='huidongchen',
#     github_repo='simba_tutorials',
#     github_version='main',
#     conf_py_path='/v1.0/',
# )
# Consumed by the local edit_on_github extension (see _ext/edit_on_github.py).
github_repo = 'simba'
github_nb_repo = 'simba_tutorials'


# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".

html_static_path = ['_static']
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | |CI| |Docs| |Install with conda| |Codecov| |Last updated| |Downloads| |License|
2 |
3 | **SIMBA**: **SI**\ ngle-cell e\ **MB**\ edding **A**\ long with features
4 | ========================================================================
5 |
6 | SIMBA is a method to embed cells along with their defining features such as gene expression, transcription factor binding sequences and chromatin accessibility peaks into the same latent space. The joint embedding of cells and features allows SIMBA to perform various types of single cell tasks, including but not limited to single-modal analysis (e.g. scRNA-seq and scATAC-seq analysis), multimodal analysis, batch correction, and multi-omic integration.
7 |
8 |
9 | .. image:: _static/img/Figure1.png
10 | :align: center
11 | :width: 600
12 | :alt: SIMBA overview
13 |
14 |
15 | .. toctree::
16 | :maxdepth: 2
17 | :caption: Overview
18 | :hidden:
19 |
20 | About SIMBA
21 | Installation
22 | API
23 | Release notes
24 | Citation
25 |
26 |
27 | .. toctree::
28 | :maxdepth: 1
29 | :caption: SIMBA primer
30 |
31 | Basic concepts
32 | Output
33 |
34 |
35 | .. toctree::
36 | :maxdepth: 1
37 | :caption: Tutorials
38 |
39 | rna_10xpmbc_all_genes
40 | atac_buenrostro2018_peaks_and_sequences
41 | multiome_shareseq
42 | multiome_shareseq_GRN
43 | rna_mouse_atlas
44 | rna_human_pancreas
45 | multiome_10xpmbc10k_integration
46 | rna_10x_mouse_brain_1p3M
47 |
48 |
49 | .. |Docs| image:: https://readthedocs.org/projects/simba-bio/badge/?version=latest
50 | :target: https://simba-bio.readthedocs.io
51 |
52 | .. |CI| image:: https://github.com/pinellolab/simba/actions/workflows/CI.yml/badge.svg
53 | :target: https://github.com/pinellolab/simba/actions/workflows/CI.yml
54 |
55 | .. |Install with conda| image:: https://anaconda.org/bioconda/simba/badges/version.svg
56 | :target: https://anaconda.org/bioconda/simba
57 |
58 | .. |Last updated| image:: https://anaconda.org/bioconda/simba/badges/latest_release_date.svg
59 | :target: https://anaconda.org/bioconda/simba
60 |
61 | .. |License| image:: https://anaconda.org/bioconda/simba/badges/license.svg
62 | :target: https://github.com/pinellolab/simba/blob/master/LICENSE
63 |
64 | .. |Downloads| image:: https://anaconda.org/bioconda/simba/badges/downloads.svg
65 | :target: https://anaconda.org/bioconda/simba
66 |
67 | .. |Codecov| image:: https://codecov.io/gh/pinellolab/simba/branch/master/graph/badge.svg?token=NDQJQPL18K
68 | :target: https://codecov.io/gh/pinellolab/simba
69 |
70 |
71 |
72 |
73 |
--------------------------------------------------------------------------------
/docs/source/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | python_files = 'test_*.py'
3 | testpaths = 'tests/'
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.17.0
2 | pandas>=1.0,!=1.1 # required by Anndata
3 | anndata>=0.7.4
4 | # h5py<3.0.0 # avoid byte strings but caused building errors
5 | # h5py>=3.4
6 | scikit-learn>=0.19
7 | scipy>=1.4
8 | kneed>=0.7
9 | seaborn>=0.11
10 | matplotlib>=3.3
11 | scikit-misc>=0.1.3
12 | adjusttext>=0.7.3
13 | umap-learn>=0.3.0
14 | #plotly>=4.14.0
15 | pybedtools>=0.8.0
16 | # bedtools>=2.29.0 # not available in pip
17 | tables
18 |
19 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import sys

# Keep this guard consistent with ``python_requires`` below (was 3.6).
if sys.version_info < (3, 7):
    sys.exit('simba requires Python >= 3.7')

from setuptools import setup, find_packages
from pathlib import Path

# Read the version without importing the package: importing simba would
# require all runtime dependencies to be installed at build time.
version = {}
with open("simba/_version.py") as fp:
    exec(fp.read(), version)


setup(
    name='simba',
    version=version['__version__'],
    author='Huidong Chen',
    # fixed typo: was ``athor_email``, which setuptools silently ignores
    author_email='hd7chen AT gmail DOT com',
    license='BSD',
    description='SIngle-cell eMBedding Along with features',
    long_description=Path('README.md').read_text('utf-8'),
    long_description_content_type="text/markdown",
    url='https://github.com/pinellolab/simba',
    packages=find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: BSD License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.7',
    # requirements.txt contains blank lines and '#' comments (including
    # inline ones); strip them so only valid requirement specs remain.
    install_requires=[
        req for req in (
            line.split('#', 1)[0].strip() for line in
            Path('requirements.txt').read_text('utf-8').splitlines()
        ) if req
    ],
    include_package_data=True,
    package_data={"simba": ["data/gene_anno/*.bed"]}
)
38 |
--------------------------------------------------------------------------------
/simba/__init__.py:
--------------------------------------------------------------------------------
1 | """SIngle-cell eMBedding Along with features"""
2 |
3 | from ._settings import settings
4 | from . import preprocessing as pp
5 | from . import tools as tl
6 | from . import plotting as pl
7 | from .readwrite import *
8 | from . import datasets
9 | from ._version import __version__
10 |
11 |
12 | import sys
13 | # needed when building doc (borrowed from scanpy)
14 | sys.modules.update(
15 | {f'{__name__}.{m}': globals()[m] for m in ['tl', 'pp', 'pl']})
16 |
--------------------------------------------------------------------------------
/simba/_settings.py:
--------------------------------------------------------------------------------
1 | """Configuration for SIMBA"""
2 |
3 | import os
4 | import seaborn as sns
5 | import matplotlib as mpl
6 |
7 |
class SimbaConfig:
    """Configuration class for SIMBA.

    Holds the working directory, figure defaults, and the PyTorch-BigGraph
    (PBG) training parameters shared across the package.
    """

    def __init__(self,
                 workdir='./result_simba',
                 save_fig=False,
                 n_jobs=1):
        self.workdir = workdir
        self.save_fig = save_fig
        self.n_jobs = n_jobs
        self.set_pbg_params()
        # per-graph statistics, populated elsewhere during graph construction
        self.graph_stats = dict()

    def set_figure_params(self,
                          context='notebook',
                          style='white',
                          palette='deep',
                          font='sans-serif',
                          font_scale=1.1,
                          color_codes=True,
                          dpi=80,
                          dpi_save=150,
                          fig_size=(5.4, 4.8),
                          rc=None):
        """ Set global parameters for figures. Modified from sns.set()

        Parameters
        ----------
        context : string or dict
            Plotting context parameters, see `seaborn.plotting_context`
        style: `string`,optional (default: 'white')
            Axes style parameters, see `seaborn.axes_style`
        palette : string or sequence
            Color palette, see `seaborn.color_palette`
        font : string, optional (default: 'sans-serif')
            Font family.
        font_scale: `float`, optional (default: 1.1)
            Separate scaling factor to independently
            scale the size of the font elements.
        color_codes : `bool`, optional (default: True)
            If ``True`` and ``palette`` is a seaborn palette,
            remap the shorthand color codes (e.g. "b", "g", "r", etc.)
            to the colors from this palette.
        dpi: `int`,optional (default: 80)
            Resolution of rendered figures.
        dpi_save: `int`,optional (default: 150)
            Resolution of saved figures.
        fig_size: `tuple`, optional (default: (5.4, 4.8))
            Default figure size (width, height) in inches.
        rc: `dict`,optional (default: None)
            rc settings properties.
            Parameter mappings to override the values in the preset style.
            Please see "`matplotlibrc file
            <https://matplotlib.org/stable/tutorials/introductory/customizing.html>`__"
        """
        # NOTE: fig_size default changed from a mutable list to a tuple to
        # avoid a shared mutable default argument; callers may still pass
        # either a list or a tuple.
        sns.set(context=context,
                style=style,
                palette=palette,
                font=font,
                font_scale=font_scale,
                color_codes=color_codes,
                rc={'figure.dpi': dpi,
                    'savefig.dpi': dpi_save,
                    'figure.figsize': fig_size,
                    'image.cmap': 'viridis',
                    'lines.markersize': 6,
                    'legend.columnspacing': 0.1,
                    'legend.borderaxespad': 0.1,
                    'legend.handletextpad': 0.1,
                    # keep text editable in exported PDFs
                    'pdf.fonttype': 42,
                    })
        if rc is not None:
            assert isinstance(rc, dict), "rc must be dict"
            for key, value in rc.items():
                if key in mpl.rcParams.keys():
                    mpl.rcParams[key] = value
                else:
                    raise Exception("unrecognized property '%s'" % key)

    def set_workdir(self, workdir=None):
        """Set working directory.

        Parameters
        ----------
        workdir: `str`, optional (default: None)
            Working directory. If None, keep the current ``self.workdir``.

        Returns
        -------
        None
        """
        if workdir is None:
            workdir = self.workdir
            print("Using default working directory.")
        os.makedirs(workdir, exist_ok=True)
        self.workdir = workdir
        # reset PBG parameters to their defaults for the new directory
        self.set_pbg_params()
        print('Saving results in: %s' % workdir)

    def set_pbg_params(self, config=None):
        """Set PBG (PyTorch-BigGraph) training parameters.

        Parameters
        ----------
        config : `dict`, optional (default: None)
            PBG training configuration parameters.
            By default it resets parameters to the default setting.

        Returns
        -------
        None
        """
        if config is None:
            config = dict(
                # I/O data
                entity_path="",
                edge_paths=["", ],
                checkpoint_path="",

                # Graph structure
                entities={},
                relations=[],
                dynamic_relations=False,

                # Scoring model
                dimension=50,
                global_emb=False,
                comparator='dot',

                # Training
                num_epochs=10,
                workers=4,
                num_batch_negs=50,
                num_uniform_negs=50,
                loss_fn='softmax',
                lr=0.1,

                early_stopping=False,
                regularization_coef=0.0,
                wd=0.0,
                wd_interval=50,

                # Evaluation during training
                eval_fraction=0.05,
                eval_num_batch_negs=50,
                eval_num_uniform_negs=50,

                checkpoint_preservation_interval=None,
            )
        assert isinstance(config, dict), "`config` must be dict"
        self.pbg_params = config


settings = SimbaConfig()
157 |
--------------------------------------------------------------------------------
/simba/_utils.py:
--------------------------------------------------------------------------------
1 | """Utility functions and classes"""
2 |
3 | import numpy as np
4 | from kneed import KneeLocator
5 | import tables
6 | from anndata import AnnData
7 |
8 |
def locate_elbow(x, y, S=10, min_elbow=0,
                 curve='convex', direction='decreasing', online=False,
                 **kwargs):
    """Detect the elbow/knee point of the curve (x, y).

    Parameters
    ----------
    x : `array-like`
        x values
    y : `array-like`
        y values
    S : `float`, optional (default: 10)
        Sensitivity
    min_elbow: `int`, optional (default: 0)
        The minimum elbow location; points before it are ignored
    curve: `str`, optional (default: 'convex')
        Choose from {'convex','concave'}
        If 'concave', algorithm will detect knees,
        If 'convex', algorithm will detect elbows.
    direction: `str`, optional (default: 'decreasing')
        Choose from {'decreasing','increasing'}
    online: `bool`, optional (default: False)
        kneed will correct old knee points if True,
        kneed will return first knee if False.
    **kwargs: `dict`, optional
        Extra arguments passed through to KneeLocator.

    Returns
    -------
    elbow: `int`
        Detected elbow point; ``len(y)`` when no elbow is found.
    """
    start = int(min_elbow)
    locator = KneeLocator(
        x[start:],
        y[start:],
        S=S,
        curve=curve,
        direction=direction,
        online=online,
        **kwargs,
    )
    # Fall back to the last position when KneeLocator detects no elbow.
    return len(y) if locator.elbow is None else int(locator.elbow)
52 |
53 |
# modified from
# scanpy https://github.com/theislab/scanpy/blob/master/scanpy/readwrite.py
def _read_legacy_10x_h5(filename, genome=None):
    """
    Read hdf5 file from Cell Ranger v2 or earlier versions.

    Parameters
    ----------
    filename
        Path to the 10x-formatted .h5 file.
    genome
        Name of the genome group to read; required when the file
        contains more than one genome.

    Returns
    -------
    AnnData object (cells as observations, features as variables).
    """
    with tables.open_file(str(filename), 'r') as f:
        try:
            # in legacy files each top-level child is one genome group
            children = [x._v_name for x in f.list_nodes(f.root)]
            if not genome:
                if len(children) > 1:
                    # fixed: the message previously showed the literal
                    # '(unknown)' instead of the actual file name
                    raise ValueError(
                        f"'{filename}' contains more than one genome. "
                        "For legacy 10x h5 "
                        "files you must specify the genome "
                        "if more than one is present. "
                        f"Available genomes are: {children}"
                    )
                genome = children[0]
            elif genome not in children:
                raise ValueError(
                    f"Could not find genome '{genome}' in '{filename}'. "
                    f'Available genomes are: {children}'
                )
            dsets = {}
            for node in f.walk_nodes('/' + genome, 'Array'):
                dsets[node.name] = node.read()
            # AnnData works with csr matrices
            # 10x stores the transposed data, so we do the transposition
            from scipy.sparse import csr_matrix

            M, N = dsets['shape']
            data = dsets['data']
            if dsets['data'].dtype == np.dtype('int32'):
                # reinterpret the int32 buffer as float32 in place, then
                # copy the integer counts into it as float values
                data = dsets['data'].view('float32')
                data[:] = dsets['data']
            matrix = csr_matrix(
                (data, dsets['indices'], dsets['indptr']),
                shape=(N, M),
            )
            # the csc matrix is automatically the transposed csr matrix
            # as scanpy expects it, so, no need for a further transposition
            adata = AnnData(
                matrix,
                obs=dict(obs_names=dsets['barcodes'].astype(str)),
                var=dict(
                    var_names=dsets['gene_names'].astype(str),
                    gene_ids=dsets['genes'].astype(str),
                ),
            )
            return adata
        except KeyError:
            raise Exception('File is missing one or more required datasets.')
107 |
108 |
# modified from
# scanpy https://github.com/theislab/scanpy/blob/master/scanpy/readwrite.py
def _read_v3_10x_h5(filename):
    """
    Read hdf5 file from Cell Ranger v3 or later versions.

    Expects the 10x layout with a '/matrix' group. Returns an AnnData
    object with barcodes as observations and features as variables.
    Raises ``Exception`` when a required dataset is missing.
    """
    with tables.open_file(str(filename), 'r') as f:
        try:
            # collect every array under /matrix into a name -> data dict
            dsets = {}
            for node in f.walk_nodes('/matrix', 'Array'):
                dsets[node.name] = node.read()
            from scipy.sparse import csr_matrix

            M, N = dsets['shape']
            data = dsets['data']
            if dsets['data'].dtype == np.dtype('int32'):
                # reinterpret the int32 buffer as float32 in place, then
                # copy the integer counts into it as float values
                data = dsets['data'].view('float32')
                data[:] = dsets['data']
            # as in the legacy reader, 10x stores the transposed matrix;
            # building csr with shape (N, M) yields cells x features
            matrix = csr_matrix(
                (data, dsets['indices'], dsets['indptr']),
                shape=(N, M),
            )
            adata = AnnData(
                matrix,
                obs=dict(obs_names=dsets['barcodes'].astype(str)),
                var=dict(
                    var_names=dsets['name'].astype(str),
                    gene_ids=dsets['id'].astype(str),
                    feature_types=dsets['feature_type'].astype(str),
                    genome=dsets['genome'].astype(str),
                ),
            )
            return adata
        except KeyError:
            raise Exception('File is missing one or more required datasets.')
144 |
--------------------------------------------------------------------------------
/simba/_version.py:
--------------------------------------------------------------------------------
1 | """Version information"""
2 |
3 | __version__ = "1.1"
4 |
--------------------------------------------------------------------------------
/simba/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | """Builtin Datasets."""
2 |
3 | from ._datasets import (
4 | rna_10xpmbc3k,
5 | rna_han2018,
6 | rna_tmc2018,
7 | rna_baron2016,
8 | rna_muraro2016,
9 | rna_segerstolpe2016,
10 | rna_wang2016,
11 | rna_xin2016,
12 | atac_buenrostro2018,
13 | atac_10xpbmc5k,
14 | atac_chen2019,
15 | atac_cusanovich2018_subset,
16 | multiome_ma2020_fig4,
17 | multiome_chen2019,
18 | multiome_10xpbmc10k
19 | )
20 |
--------------------------------------------------------------------------------
/simba/datasets/_datasets.py:
--------------------------------------------------------------------------------
1 | import urllib.request
2 | from tqdm import tqdm
3 | import os
4 |
5 | from .._settings import settings
6 | from ..readwrite import read_h5ad
7 |
8 |
class DownloadProgressBar(tqdm):
    """tqdm progress bar adapted for ``urllib.request.urlretrieve``.

    ``update_to`` matches urlretrieve's ``reporthook`` callback signature:
    (blocks transferred so far, block size in bytes, total size).
    """

    def update_to(self,
                  b=1,
                  bsize=1,
                  tsize=None):
        # total size may be unknown (None) if the server reports none
        if tsize is not None:
            self.total = tsize
        # tqdm.update expects an increment: subtract progress shown so far
        self.update(b * bsize - self.n)
17 |
18 |
def download_url(url,
                 output_path,
                 desc=None):
    """Download ``url`` to ``output_path``, showing a progress bar.

    When ``desc`` is None, the last path component of the URL is used
    as the progress-bar label.
    """
    label = desc if desc is not None else url.split('/')[-1]
    progress = DownloadProgressBar(
        unit='B',
        unit_scale=True,
        miniters=1,
        desc=label,
    )
    with progress as bar:
        urllib.request.urlretrieve(
            url,
            filename=output_path,
            reporthook=bar.update_to)
34 |
35 |
36 | def rna_10xpmbc3k():
37 | """10X human peripheral blood mononuclear cells (PBMCs) scRNA-seq data
38 |
39 | Returns
40 | -------
41 | adata: `AnnData`
42 | Anndata object
43 | """
44 | url = 'https://www.dropbox.com/s/087wuliddmbp3oe/rna_seq.h5ad?dl=1'
45 | filename = 'rna_10xpmbc3k.h5ad'
46 | filepath = os.path.join(settings.workdir, 'data')
47 | fullpath = os.path.join(filepath, filename)
48 | if(not os.path.exists(fullpath)):
49 | print('Downloading data ...')
50 | os.makedirs(filepath, exist_ok=True)
51 | download_url(url,
52 | fullpath,
53 | desc=filename)
54 | print(f'Downloaded to {filepath}.')
55 | adata = read_h5ad(fullpath)
56 | return adata
57 |
58 |
59 | def rna_han2018():
60 | """single-cell microwell-seq mouse cell atlas data
61 |
62 | ref: Han, X. et al. Mapping the mouse cell atlas by microwell-seq.
63 | Cell 172, 1091-1107. e1017 (2018).
64 |
65 | Returns
66 | -------
67 | adata: `AnnData`
68 | Anndata object
69 | """
70 | url = 'https://www.dropbox.com/s/nxbszjbir44g99n/rna_seq_mi.h5ad?dl=1'
71 | filename = 'rna_han2018.h5ad'
72 | filepath = os.path.join(settings.workdir, 'data')
73 | fullpath = os.path.join(filepath, filename)
74 | if(not os.path.exists(fullpath)):
75 | print('Downloading data ...')
76 | os.makedirs(filepath, exist_ok=True)
77 | download_url(url,
78 | fullpath,
79 | desc=filename)
80 | print(f'Downloaded to {filepath}.')
81 | adata = read_h5ad(fullpath)
82 | return adata
83 |
84 |
85 | def rna_tmc2018():
86 | """single-cell Smart-Seq2 mouse cell atlas data
87 |
88 | ref: Tabula Muris Consortium. Single-cell transcriptomics of 20 mouse
89 | organs creates a Tabula Muris. Nature 562, 367-372 (2018).
90 |
91 | Returns
92 | -------
93 | adata: `AnnData`
94 | Anndata object
95 | """
96 | url = 'https://www.dropbox.com/s/rnpyp6vfpuiptkz/rna_seq_sm.h5ad?dl=1'
97 | filename = 'rna_tmc2018.h5ad'
98 | filepath = os.path.join(settings.workdir, 'data')
99 | fullpath = os.path.join(filepath, filename)
100 | if(not os.path.exists(fullpath)):
101 | print('Downloading data ...')
102 | os.makedirs(filepath, exist_ok=True)
103 | download_url(url,
104 | fullpath,
105 | desc=filename)
106 | print(f'Downloaded to {filepath}.')
107 | adata = read_h5ad(fullpath)
108 | return adata
109 |
110 |
111 | def rna_baron2016():
112 | """single-cell RNA-seq human pancreas data
113 |
114 | ref: Baron, M. et al. A single-cell transcriptomic map of the human and
115 | mouse pancreas reveals inter-and intra-cell population structure. Cell
116 | systems 3, 346-360. e344 (2016)
117 |
118 | Returns
119 | -------
120 | adata: `AnnData`
121 | Anndata object
122 | """
123 | url = 'https://www.dropbox.com/s/bvziclu6d3fdzow/rna_seq_baron.h5ad?dl=1'
124 | filename = 'rna_baron2016.h5ad'
125 | filepath = os.path.join(settings.workdir, 'data')
126 | fullpath = os.path.join(filepath, filename)
127 | if(not os.path.exists(fullpath)):
128 | print('Downloading data ...')
129 | os.makedirs(filepath, exist_ok=True)
130 | download_url(url,
131 | fullpath,
132 | desc=filename)
133 | print(f'Downloaded to {filepath}.')
134 | adata = read_h5ad(fullpath)
135 | return adata
136 |
137 |
138 | def rna_muraro2016():
139 | """single-cell RNA-seq human pancreas data
140 |
141 | ref: Muraro, M.J. et al. A single-cell transcriptome atlas of the
142 | human pancreas.Cell systems 3, 385-394. e383 (2016).
143 |
144 | Returns
145 | -------
146 | adata: `AnnData`
147 | Anndata object
148 | """
149 | url = 'https://www.dropbox.com/s/ginc9rbo4qmobwx/rna_seq_muraro.h5ad?dl=1'
150 | filename = 'rna_muraro2016.h5ad'
151 | filepath = os.path.join(settings.workdir, 'data')
152 | fullpath = os.path.join(filepath, filename)
153 | if(not os.path.exists(fullpath)):
154 | print('Downloading data ...')
155 | os.makedirs(filepath, exist_ok=True)
156 | download_url(url,
157 | fullpath,
158 | desc=filename)
159 | print(f'Downloaded to {filepath}.')
160 | adata = read_h5ad(fullpath)
161 | return adata
162 |
163 |
def rna_segerstolpe2016():
    """single-cell RNA-seq human pancreas data

    ref: Segerstolpe, Å. et al. Single-cell transcriptome profiling of human
    pancreatic islets in health and type 2 diabetes.
    Cell metabolism 24, 593-607 (2016).

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    data_url = 'https://www.dropbox.com/s/qomnf4860jwm9pd/rna_seq_segerstolpe.h5ad?dl=1'
    fname = 'rna_segerstolpe2016.h5ad'
    data_dir = os.path.join(settings.workdir, 'data')
    local_file = os.path.join(data_dir, fname)
    # fetch the file only once; later calls read the cached copy
    if not os.path.exists(local_file):
        print('Downloading data ...')
        os.makedirs(data_dir, exist_ok=True)
        download_url(data_url,
                     local_file,
                     desc=fname)
        print(f'Downloaded to {data_dir}.')
    return read_h5ad(local_file)
189 |
190 |
def rna_wang2016():
    """single-cell RNA-seq human pancreas data

    ref: Wang, Y.J. et al. Single-cell transcriptomics of the human endocrine
    pancreas. Diabetes 65, 3028-3038 (2016).

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    data_url = 'https://www.dropbox.com/s/9tv44nugwpx9t4c/rna_seq_wang.h5ad?dl=1'
    fname = 'rna_wang2016.h5ad'
    data_dir = os.path.join(settings.workdir, 'data')
    local_file = os.path.join(data_dir, fname)
    # fetch the file only once; later calls read the cached copy
    if not os.path.exists(local_file):
        print('Downloading data ...')
        os.makedirs(data_dir, exist_ok=True)
        download_url(data_url,
                     local_file,
                     desc=fname)
        print(f'Downloaded to {data_dir}.')
    return read_h5ad(local_file)
215 |
216 |
def rna_xin2016():
    """single-cell RNA-seq human pancreas data

    ref: Xin, Y. et al. RNA sequencing of single human islet cells reveals
    type 2 diabetes genes. Cell metabolism 24, 608-615 (2016).

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    data_url = 'https://www.dropbox.com/s/j483i47mxty6rzo/rna_seq_xin.h5ad?dl=1'
    fname = 'rna_xin2016.h5ad'
    data_dir = os.path.join(settings.workdir, 'data')
    local_file = os.path.join(data_dir, fname)
    # fetch the file only once; later calls read the cached copy
    if not os.path.exists(local_file):
        print('Downloading data ...')
        os.makedirs(data_dir, exist_ok=True)
        download_url(data_url,
                     local_file,
                     desc=fname)
        print(f'Downloaded to {data_dir}.')
    return read_h5ad(local_file)
241 |
242 |
def atac_buenrostro2018():
    """single cell ATAC-seq human blood data

    ref: Buenrostro, J.D. et al. Integrated Single-Cell Analysis Maps the
    Continuous RegulatoryLandscape of Human Hematopoietic Differentiation.
    Cell 173, 1535-1548 e1516 (2018).

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    data_url = 'https://www.dropbox.com/s/7hxjqgdxtbna1tm/atac_seq.h5ad?dl=1'
    fname = 'atac_buenrostro2018.h5ad'
    data_dir = os.path.join(settings.workdir, 'data')
    local_file = os.path.join(data_dir, fname)
    # fetch the file only once; later calls read the cached copy
    if not os.path.exists(local_file):
        print('Downloading data ...')
        os.makedirs(data_dir, exist_ok=True)
        download_url(data_url,
                     local_file,
                     desc=fname)
        print(f'Downloaded to {data_dir}.')
    return read_h5ad(local_file)
268 |
269 |
def atac_10xpbmc5k():
    """10X human peripheral blood mononuclear cells (PBMCs) scATAC-seq data

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    data_url = 'https://www.dropbox.com/s/xa8u7rlskc5h7iv/atac_seq.h5ad?dl=1'
    fname = 'atac_10xpbmc5k.h5ad'
    data_dir = os.path.join(settings.workdir, 'data')
    local_file = os.path.join(data_dir, fname)
    # fetch the file only once; later calls read the cached copy
    if not os.path.exists(local_file):
        print('Downloading data ...')
        os.makedirs(data_dir, exist_ok=True)
        download_url(data_url,
                     local_file,
                     desc=fname)
        print(f'Downloaded to {data_dir}.')
    return read_h5ad(local_file)
291 |
292 |
def atac_cusanovich2018_subset():
    """downsampled sci-ATAC-seq mouse tissue data

    ref: Cusanovich, D.A. et al. A Single-Cell Atlas of In Vivo Mammalian
    Chromatin Accessibility. Cell 174, 1309-1324 e1318 (2018).

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    data_url = 'https://www.dropbox.com/s/e8iqwm93m33i5wt/atac_seq.h5ad?dl=1'
    fname = 'atac_cusanovich2018_subset.h5ad'
    data_dir = os.path.join(settings.workdir, 'data')
    local_file = os.path.join(data_dir, fname)
    # fetch the file only once; later calls read the cached copy
    if not os.path.exists(local_file):
        print('Downloading data ...')
        os.makedirs(data_dir, exist_ok=True)
        download_url(data_url,
                     local_file,
                     desc=fname)
        print(f'Downloaded to {data_dir}.')
    return read_h5ad(local_file)
317 |
318 |
def atac_chen2019():
    """simulated scATAC-seq bone marrow data with a noise level of 0.4
    and a coverage of 2500 fragments

    ref: Chen, H. et al. Assessment of computational methods for the analysis
    of single-cell ATAC-seq data. Genome Biology 20, 241 (2019).

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    data_url = 'https://www.dropbox.com/s/fthhh3mz5b39d4y/atac_seq.h5ad?dl=1'
    fname = 'atac_chen2019.h5ad'
    data_dir = os.path.join(settings.workdir, 'data')
    local_file = os.path.join(data_dir, fname)
    # fetch the file only once; later calls read the cached copy
    if not os.path.exists(local_file):
        print('Downloading data ...')
        os.makedirs(data_dir, exist_ok=True)
        download_url(data_url,
                     local_file,
                     desc=fname)
        print(f'Downloaded to {data_dir}.')
    return read_h5ad(local_file)
344 |
345 |
def multiome_ma2020_fig4():
    """single cell multiome mouse skin data (SHARE-seq)

    ref: Ma, S. et al. Chromatin Potential Identified by Shared Single-Cell
    Profiling of RNA and Chromatin. Cell (2020).

    Returns
    -------
    dict_adata: `dict`
        A dictionary of anndata objects
    """
    data_dir = os.path.join(settings.workdir, 'data')
    # one (url, local filename) pair per modality; order preserved (rna, atac)
    sources = {
        'rna': ('https://www.dropbox.com/s/gmmf77l8kzle6o7/rna_seq_fig4.h5ad?dl=1',
                'multiome_ma2020_fig4_rna.h5ad'),
        'atac': ('https://www.dropbox.com/s/ts0v2y2m5fcumcb/atac_seq_fig4.h5ad?dl=1',
                 'multiome_ma2020_fig4_atac.h5ad'),
    }
    dict_adata = dict()
    for modality, (url, fname) in sources.items():
        local_file = os.path.join(data_dir, fname)
        # fetch each file only once; later calls read the cached copy
        if not os.path.exists(local_file):
            print('Downloading data ...')
            os.makedirs(data_dir, exist_ok=True)
            download_url(url,
                         local_file,
                         desc=fname)
            print(f'Downloaded to {data_dir}.')
        dict_adata[modality] = read_h5ad(local_file)
    return dict_adata
384 |
385 |
def multiome_chen2019():
    """single cell multiome neonatal mouse cerebral cortex data (SNARE-seq)

    ref: Chen, S., Lake, B.B. & Zhang, K. High-throughput sequencing of the
    transcriptome and chromatin accessibility in the same cell.
    Nat Biotechnol (2019).

    Returns
    -------
    dict_adata: `dict`
        A dictionary of anndata objects
    """
    data_dir = os.path.join(settings.workdir, 'data')
    # one (url, local filename) pair per modality; order preserved (rna, atac)
    sources = {
        'rna': ('https://www.dropbox.com/s/b1bbcs500q0pigt/rna_seq.h5ad?dl=1',
                'multiome_chen2019_rna.h5ad'),
        'atac': ('https://www.dropbox.com/s/ljepkfber68pdvc/atac_seq.h5ad?dl=1',
                 'multiome_chen2019_atac.h5ad'),
    }
    dict_adata = dict()
    for modality, (url, fname) in sources.items():
        local_file = os.path.join(data_dir, fname)
        # fetch each file only once; later calls read the cached copy
        if not os.path.exists(local_file):
            print('Downloading data ...')
            os.makedirs(data_dir, exist_ok=True)
            download_url(url,
                         local_file,
                         desc=fname)
            print(f'Downloaded to {data_dir}.')
        dict_adata[modality] = read_h5ad(local_file)
    return dict_adata
425 |
426 |
def multiome_10xpbmc10k():
    """single cell 10X human peripheral blood mononuclear cells (PBMCs)
    multiome data

    Returns
    -------
    dict_adata: `dict`
        A dictionary of anndata objects
    """
    data_dir = os.path.join(settings.workdir, 'data')
    # one (url, local filename) pair per modality; order preserved (rna, atac)
    sources = {
        'rna': ('https://www.dropbox.com/s/zwlim6vljnbfp43/rna_seq.h5ad?dl=1',
                'multiome_10xpbmc10k_rna.h5ad'),
        'atac': ('https://www.dropbox.com/s/163msz0k9hkfrt7/atac_seq.h5ad?dl=1',
                 'multiome_10xpbmc10k_atac.h5ad'),
    }
    dict_adata = dict()
    for modality, (url, fname) in sources.items():
        local_file = os.path.join(data_dir, fname)
        # fetch each file only once; later calls read the cached copy
        if not os.path.exists(local_file):
            print('Downloading data ...')
            os.makedirs(data_dir, exist_ok=True)
            download_url(url,
                         local_file,
                         desc=fname)
            print(f'Downloaded to {data_dir}.')
        dict_adata[modality] = read_h5ad(local_file)
    return dict_adata
463 |
--------------------------------------------------------------------------------
/simba/plotting/__init__.py:
--------------------------------------------------------------------------------
1 | """Plotting"""
2 |
3 | from ._plot import (
4 | pca_variance_ratio,
5 | pcs_features,
6 | variable_genes,
7 | violin,
8 | hist,
9 | umap,
10 | discretize,
11 | node_similarity,
12 | svd_nodes,
13 | )
14 | from ._post_training import (
15 | pbg_metrics,
16 | entity_metrics,
17 | entity_barcode,
18 | query
19 | )
20 |
--------------------------------------------------------------------------------
/simba/plotting/_palettes.py:
--------------------------------------------------------------------------------
"""Color palettes in addition to matplotlib's palettes

This is modified from
scanpy palettes https://github.com/theislab/scanpy/blob/master/scanpy/plotting/palettes.py
"""

from matplotlib import cm, colors

# Colorblindness adjusted vega_10
# See https://github.com/theislab/scanpy/issues/387
vega_10 = list(map(colors.to_hex, cm.tab10.colors))
vega_10_scanpy = vega_10.copy()
vega_10_scanpy[2] = '#279e68' # green
vega_10_scanpy[4] = '#aa40fc' # purple
vega_10_scanpy[8] = '#b5bd61' # khaki

# default matplotlib 2.0 palette
# see 'category20' on https://github.com/vega/vega/wiki/Scales#scale-range-literals
vega_20 = list(map(colors.to_hex, cm.tab20.colors))

# reordered, some removed, some added
vega_20_scanpy = [
    *vega_20[0:14:2], *vega_20[16::2], # dark without grey
    *vega_20[1:15:2], *vega_20[17::2], # light without grey
    '#ad494a', '#8c6d31', # manual additions
]
# keep the colorblind-adjusted replacements consistent with vega_10_scanpy
vega_20_scanpy[2] = vega_10_scanpy[2]
vega_20_scanpy[4] = vega_10_scanpy[4]
vega_20_scanpy[7] = vega_10_scanpy[8] # khaki shifted by missing grey
# TODO: also replace pale colors if necessary

default_20 = vega_20_scanpy

# https://graphicdesign.stackexchange.com/questions/3682/where-can-i-find-a-large-palette-set-of-contrasting-colors-for-coloring-many-d
# update 1
# orig reference http://epub.wu.ac.at/1692/1/document.pdf
zeileis_28 = [
    "#023fa5", "#7d87b9", "#bec1d4", "#d6bcc0", "#bb7784", "#8e063b", "#4a6fe3",
    "#8595e1", "#b5bbe3", "#e6afb9", "#e07b91", "#d33f6a", "#11c638", "#8dd593",
    "#c6dec7", "#ead3c6", "#f0b98d", "#ef9708", "#0fcfc0", "#9cded6", "#d5eae7",
    "#f3e1eb", "#f6c4e1", "#f79cd4",
    '#7f7f7f', "#c7c7c7", "#1CE6FF", "#336600", # these last ones were added,
]

default_28 = zeileis_28

# from http://godsnotwheregodsnot.blogspot.de/2012/09/color-distribution-methodology.html
godsnot_102 = [
    # "#000000", # remove the black, as often, we have black colored annotation
    "#FFFF00", "#1CE6FF", "#FF34FF", "#FF4A46", "#008941", "#006FA6", "#A30059",
    "#FFDBE5", "#7A4900", "#0000A6", "#63FFAC", "#B79762", "#004D43", "#8FB0FF", "#997D87",
    "#5A0007", "#809693", "#6A3A4C", "#1B4400", "#4FC601", "#3B5DFF", "#4A3B53", "#FF2F80",
    "#61615A", "#BA0900", "#6B7900", "#00C2A0", "#FFAA92", "#FF90C9", "#B903AA", "#D16100",
    "#DDEFFF", "#000035", "#7B4F4B", "#A1C299", "#300018", "#0AA6D8", "#013349", "#00846F",
    "#372101", "#FFB500", "#C2FFED", "#A079BF", "#CC0744", "#C0B9B2", "#C2FF99", "#001E09",
    "#00489C", "#6F0062", "#0CBD66", "#EEC3FF", "#456D75", "#B77B68", "#7A87A1", "#788D66",
    "#885578", "#FAD09F", "#FF8A9A", "#D157A0", "#BEC459", "#456648", "#0086ED", "#886F4C",
    "#34362D", "#B4A8BD", "#00A6AA", "#452C2C", "#636375", "#A3C8C9", "#FF913F", "#938A81",
    "#575329", "#00FECF", "#B05B6F", "#8CD0FF", "#3B9700", "#04F757", "#C8A1A1", "#1E6E00",
    "#7900D7", "#A77500", "#6367A9", "#A05837", "#6B002C", "#772600", "#D790FF", "#9B9700",
    "#549E79", "#FFF69F", "#201625", "#72418F", "#BC23FF", "#99ADC0", "#3A2465", "#922329",
    "#5B4534", "#FDE8DC", "#404E55", "#0089A3", "#CB7E98", "#A4E804", "#324E72",
]

default_102 = godsnot_102
66 |
--------------------------------------------------------------------------------
/simba/plotting/_utils.py:
--------------------------------------------------------------------------------
1 | """Utility functions and classes"""
2 |
3 | import numpy as np
4 | import pandas as pd
5 | from pandas.api.types import (
6 | is_numeric_dtype,
7 | is_string_dtype,
8 | is_categorical_dtype,
9 | )
10 | import matplotlib as mpl
11 |
12 | from ._palettes import (
13 | default_20,
14 | default_28,
15 | default_102
16 | )
17 |
18 |
def get_colors(arr,
               vmin=None,
               vmax=None,
               clip=False):
    """Map every element of `arr` to a hex color string.

    Numeric input is mapped through the current matplotlib default image
    colormap; string/categorical input gets one color per category.
    """
    if not isinstance(arr, (pd.Series, np.ndarray)):
        raise TypeError("`arr` must be pd.Series or np.ndarray")
    if is_numeric_dtype(arr):
        # continuous values: normalize into [vmin, vmax] and look the
        # result up in the default image colormap (resampled to 512 bins)
        cmap = mpl.cm.get_cmap(mpl.rcParams['image.cmap'], 512)
        low = min(arr) if vmin is None else vmin
        high = max(arr) if vmax is None else vmax
        scaler = mpl.colors.Normalize(vmin=low, vmax=high, clip=clip)
        return [mpl.colors.to_hex(cmap(scaler(v))) for v in arr]
    if is_string_dtype(arr) or is_categorical_dtype(arr):
        categories = np.unique(arr)
        n_cat = len(categories)
        # prefer the active prop_cycle when it has enough colors
        # mpl.style.use('default')
        if len(mpl.rcParams['axes.prop_cycle'].by_key()['color']) >= n_cat:
            cycler = mpl.rcParams['axes.prop_cycle']()
            palette = [mpl.colors.rgb2hex(next(cycler)['color'])
                       for _ in range(n_cat)]
        elif n_cat <= 20:
            palette = default_20
        elif n_cat <= 28:
            palette = default_28
        elif n_cat <= len(default_102):
            palette = default_102
        else:
            # more categories than any fixed palette: sample the rainbow map
            rainbow = mpl.cm.rainbow(np.linspace(0, 1, n_cat))
            palette = [mpl.colors.rgb2hex(rainbow[i, :-1])
                       for i in range(n_cat)]
        out = pd.Series([''] * len(arr))
        for i, cat in enumerate(categories):
            out[np.where(arr == cat)[0]] = palette[i]
        return list(out)
    raise TypeError("unsupported data type for `arr`")
66 |
67 |
def generate_palette(arr):
    """Build a {value: hex color} mapping for a categorical array.
    """
    if not isinstance(arr, (pd.Series, np.ndarray)):
        raise TypeError("`arr` must be pd.Series or np.ndarray")
    if not (is_string_dtype(arr) or is_categorical_dtype(arr)):
        raise TypeError("unsupported data type for `arr`")
    categories = np.unique(arr)
    n_cat = len(categories)
    # prefer the active prop_cycle when it has enough colors
    # mpl.style.use('default')
    if len(mpl.rcParams['axes.prop_cycle'].by_key()['color']) >= n_cat:
        cycler = mpl.rcParams['axes.prop_cycle']()
        palette = [mpl.colors.rgb2hex(next(cycler)['color'])
                   for _ in range(n_cat)]
    elif n_cat <= 20:
        palette = default_20
    elif n_cat <= 28:
        palette = default_28
    elif n_cat <= len(default_102):
        palette = default_102
    else:
        # more categories than any fixed palette: sample the rainbow map
        rainbow = mpl.cm.rainbow(np.linspace(0, 1, n_cat))
        palette = [mpl.colors.rgb2hex(rainbow[i, :-1])
                   for i in range(n_cat)]
    per_element = pd.Series([''] * len(arr))
    for i, cat in enumerate(categories):
        per_element[np.where(arr == cat)[0]] = palette[i]
    # pair each element with its color; duplicates collapse consistently
    return dict(zip(arr, list(per_element)))
104 |
--------------------------------------------------------------------------------
/simba/preprocessing/__init__.py:
--------------------------------------------------------------------------------
1 | """Preprocessing"""
2 |
3 | from ._general import (
4 | log_transform,
5 | normalize,
6 | binarize
7 | )
8 | from ._qc import (
9 | cal_qc,
10 | cal_qc_rna,
11 | cal_qc_atac,
12 | filter_samples,
13 | filter_cells_rna,
14 | filter_cells_atac,
15 | filter_features,
16 | filter_genes,
17 | filter_peaks,
18 | )
19 | from ._pca import (
20 | pca,
21 | select_pcs,
22 | select_pcs_features,
23 | )
24 | from ._variable_genes import (
25 | select_variable_genes
26 | )
27 |
--------------------------------------------------------------------------------
/simba/preprocessing/_general.py:
--------------------------------------------------------------------------------
1 | """General preprocessing functions"""
2 |
3 | import numpy as np
4 | from sklearn.utils import sparsefuncs
5 | from sklearn import preprocessing
6 | from ._utils import (
7 | cal_tf_idf
8 | )
9 | from scipy.sparse import (
10 | issparse,
11 | csr_matrix,
12 | )
13 |
14 |
def log_transform(adata):
    """Apply log(1 + x) element-wise to the data matrix.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.

    Returns
    -------
    updates `adata` with the following fields.
    X: `numpy.ndarray` (`adata.X`)
        Store #observations × #var_genes logarithmized data matrix.
    """
    # work on a sparse matrix so log1p stays sparse (log1p(0) == 0)
    if not issparse(adata.X):
        adata.X = csr_matrix(adata.X)
    adata.X = np.log1p(adata.X)
33 |
34 |
def binarize(adata,
             threshold=1e-5):
    """Binarize the data matrix.
    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    threshold: `float`, optional (default: 1e-5)
        Values below or equal to this are replaced by 0, above it by 1.

    Returns
    -------
    updates `adata` with the following fields.
    X: `numpy.ndarray` (`adata.X`)
        Store #observations × #var_genes binarized data matrix.
    """
    # ensure a sparse matrix before thresholding
    if not issparse(adata.X):
        adata.X = csr_matrix(adata.X)
    # sklearn binarize: entries <= threshold -> 0, entries > threshold -> 1
    adata.X = preprocessing.binarize(
        adata.X, threshold=threshold, copy=True)
56 |
57 |
def normalize(adata,
              method='lib_size',
              scale_factor=1e4,
              save_raw=True):
    """Normalize count matrix.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    method: `str`, optional (default: 'lib_size')
        Choose from {{'lib_size','tf_idf'}}
        Method used for dimension reduction.
        'lib_size': Total-count normalize (library-size correct)
        'tf_idf': TF-IDF (term frequency–inverse document frequency)
        transformation
    scale_factor: `float`, optional (default: 1e4)
        Multiplier applied after library-size normalization.
    save_raw: `bool`, optional (default: True)
        If True, the unnormalized matrix is kept in `adata.layers['raw']`.

    Returns
    -------
    updates `adata` with the following fields.
    X: `numpy.ndarray` (`adata.X`)
        Store #observations × #var_genes normalized data matrix.
    """
    if method not in ['lib_size', 'tf_idf']:
        raise ValueError("unrecognized method '%s'" % method)
    if not issparse(adata.X):
        adata.X = csr_matrix(adata.X)
    if save_raw:
        adata.layers['raw'] = adata.X.copy()
    if method == 'lib_size':
        # divide each row (cell) by its total count, then rescale
        sparsefuncs.inplace_row_scale(adata.X, 1/adata.X.sum(axis=1).A)
        adata.X = adata.X*scale_factor
    else:
        adata.X = cal_tf_idf(adata.X)
92 |
--------------------------------------------------------------------------------
/simba/preprocessing/_pca.py:
--------------------------------------------------------------------------------
1 | """Principal component analysis"""
2 |
3 | import numpy as np
4 | from sklearn.decomposition import TruncatedSVD
5 | from ._utils import (
6 | locate_elbow,
7 | )
8 |
9 |
def pca(adata,
        n_components=50,
        algorithm='randomized',
        n_iter=5,
        random_state=2021,
        tol=0.0,
        feature=None,
        **kwargs,
        ):
    """perform Principal Component Analysis (PCA)

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    n_components: `int`, optional (default: 50)
        Desired dimensionality of output data
    algorithm: `str`, optional (default: 'randomized')
        SVD solver to use. Choose from {'arpack', 'randomized'}.
    n_iter: `int`, optional (default: '5')
        Number of iterations for randomized SVD solver.
        Not used by ARPACK.
    random_state: `int`, optional (default: 2021)
        Seed for the randomized SVD solver, for reproducibility.
    tol: `float`, optional (default: 0)
        Tolerance for ARPACK. 0 means machine precision.
        Ignored by randomized SVD solver.
    feature: `str`, optional (default: None)
        Feature used to perform PCA.
        The data type of `.var[feature]` needs to be `bool`
        If None, adata.X will be used.
    kwargs:
        Other keyword arguments are passed down to `TruncatedSVD()`

    Returns
    -------
    updates `adata` with the following fields:
    `.obsm['X_pca']` : `array`
        PCA transformed X.
    `.uns['pca']['PCs']` : `array`
        Principal components in feature space,
        representing the directions of maximum variance in the data.
    `.uns['pca']['variance']` : `array`
        The variance of the training samples transformed by a
        projection to each component.
    `.uns['pca']['variance_ratio']` : `array`
        Percentage of variance explained by each of the selected components.
    """
    # subset to the boolean feature mask when one is requested
    if feature is None:
        X = adata.X.copy()
    else:
        X = adata[:, adata.var[feature]].X.copy()
    svd = TruncatedSVD(n_components=n_components,
                       algorithm=algorithm,
                       n_iter=n_iter,
                       random_state=random_state,
                       tol=tol,
                       **kwargs)
    # fit, then project (TruncatedSVD works directly on sparse input)
    svd.fit(X)
    adata.obsm['X_pca'] = svd.transform(X)
    adata.uns['pca'] = {
        'n_pcs': n_components,
        'PCs': svd.components_.T,
        'variance': svd.explained_variance_,
        'variance_ratio': svd.explained_variance_ratio_,
    }
74 |
75 |
def select_pcs(adata,
               n_pcs=None,
               S=1,
               curve='convex',
               direction='decreasing',
               online=False,
               min_elbow=None,
               **kwargs):
    """select top PCs based on variance_ratio

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix; requires `.obsm['X_pca']` and
        `.uns['pca']['variance_ratio']` produced by `pca()`.
    n_pcs: `int`, optional (default: None)
        If n_pcs is None, the number of PCs will be automatically
        selected by locating the elbow of the variance-ratio curve with
        the `kneed` package (https://kneed.readthedocs.io).
    S : `float`, optional (default: 1)
        Sensitivity
    min_elbow: `int`, optional (default: None)
        The minimum elbow location
        By default, it is n_components/10
    curve: `str`, optional (default: 'convex')
        Choose from {'convex','concave'}
        If 'concave', algorithm will detect knees,
        If 'convex', algorithm will detect elbows.
    direction: `str`, optional (default: 'decreasing')
        Choose from {'decreasing','increasing'}
    online: `bool`, optional (default: False)
        kneed will correct old knee points if True,
        kneed will return first knee if False.
    **kwargs: `dict`, optional
        Extra arguments to KneeLocator.

    Returns
    -------
    updates `adata` with the following fields.
    `.uns['pca']['n_pcs']`: `int`
        The selected number of PCs.
    """
    if n_pcs is None:
        # auto-select: find the elbow of the variance-ratio curve
        n_components = adata.obsm['X_pca'].shape[1]
        if min_elbow is None:
            min_elbow = n_components/10
        n_pcs = locate_elbow(range(n_components),
                             adata.uns['pca']['variance_ratio'],
                             S=S,
                             curve=curve,
                             min_elbow=min_elbow,
                             direction=direction,
                             online=online,
                             **kwargs)
    # single assignment covers both the automatic and the explicit case
    adata.uns['pca']['n_pcs'] = n_pcs
126 |
127 |
def select_pcs_features(adata,
                        S=1,
                        curve='convex',
                        direction='decreasing',
                        online=False,
                        min_elbow=None,
                        **kwargs):
    """select features that contribute to the top PCs

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix; requires `.uns['pca']` from `pca()` and
        `.uns['pca']['n_pcs']` from `select_pcs()`.
    S : `float`, optional (default: 1)
        Sensitivity
    min_elbow: `int`, optional (default: None)
        The minimum elbow location.
        By default, it is #features/6
    curve: `str`, optional (default: 'convex')
        Choose from {'convex','concave'}
        If 'concave', algorithm will detect knees,
        If 'convex', algorithm will detect elbows.
    direction: `str`, optional (default: 'decreasing')
        Choose from {'decreasing','increasing'}
    online: `bool`, optional (default: False)
        kneed will correct old knee points if True,
        kneed will return first knee if False.
    **kwargs: `dict`, optional
        Extra arguments to KneeLocator.

    Returns
    -------
    updates `adata` with the following fields.
    `.uns['pca']['features']`: `dict`
        Selected feature ids per PC.
    `.var['top_pcs']`: `bool`
        Indicator of features contributing to the top PCs.
    """
    n_pcs = adata.uns['pca']['n_pcs']
    n_features = adata.uns['pca']['PCs'].shape[0]
    if min_elbow is None:
        min_elbow = n_features/6
    adata.uns['pca']['features'] = dict()
    all_ids = list()
    for pc in range(n_pcs):
        # rank features by the magnitude of their loading on this PC
        loadings = np.abs(adata.uns['pca']['PCs'][:, pc])
        ranked = np.argsort(loadings)[::-1]
        # elbow of the descending loading curve marks the cutoff
        elbow = locate_elbow(range(n_features),
                             loadings[ranked],
                             S=S,
                             min_elbow=min_elbow,
                             curve=curve,
                             direction=direction,
                             online=online,
                             **kwargs)
        ids_pc = list(ranked[:elbow])
        adata.uns['pca']['features'][f'pc_{pc}'] = ids_pc
        all_ids = all_ids + ids_pc
        print(f'#features selected from PC {pc}: {len(ids_pc)}')
    adata.var['top_pcs'] = False
    adata.var.loc[adata.var_names[np.unique(all_ids)], 'top_pcs'] = True
    print(f'#features in total: {adata.var["top_pcs"].sum()}')
183 |
--------------------------------------------------------------------------------
/simba/preprocessing/_utils.py:
--------------------------------------------------------------------------------
1 | """Utility functions and classes"""
2 |
3 | import numpy as np
4 | from kneed import KneeLocator
5 | from scipy.sparse import csr_matrix, diags
6 |
7 |
def locate_elbow(x, y, S=10, min_elbow=0,
                 curve='convex', direction='decreasing', online=False,
                 **kwargs):
    """Detect knee points

    Parameters
    ----------
    x : `array_like`
        x values
    y : `array_like`
        y values
    S : `float`, optional (default: 10)
        Sensitivity
    min_elbow: `int`, optional (default: 0)
        The minimum elbow location
    curve: `str`, optional (default: 'convex')
        Choose from {'convex','concave'}
        If 'concave', algorithm will detect knees,
        If 'convex', algorithm will detect elbows.
    direction: `str`, optional (default: 'decreasing')
        Choose from {'decreasing','increasing'}
    online: `bool`, optional (default: False)
        kneed will correct old knee points if True,
        kneed will return first knee if False.
    **kwargs: `dict`, optional
        Extra arguments to KneeLocator.

    Returns
    -------
    elbow: `int`
        elbow point
    """
    # ignore everything before min_elbow so early noise cannot win
    start = int(min_elbow)
    detector = KneeLocator(x[start:], y[start:],
                           S=S, curve=curve,
                           direction=direction,
                           online=online,
                           **kwargs,
                           )
    # no elbow found: fall back to keeping the full range
    return len(y) if detector.elbow is None else int(detector.elbow)
51 |
52 |
def cal_tf_idf(mat):
    """Transform a count matrix to a tf-idf representation.

    TF scales each column by its total; IDF weights each row by
    log(1 + n_cols / row_sum). Computed entirely in sparse form: the
    original `mat/(mat.sum(axis=0))` densified the whole matrix into a
    `np.matrix` before converting back to csr, which defeats the purpose
    of sparse input.

    Parameters
    ----------
    mat: array-like or sparse matrix
        Count matrix (converted to csr internally).

    Returns
    -------
    tf_idf: `csr_matrix`
        The tf-idf weighted matrix, same shape as `mat`.
    """
    mat = csr_matrix(mat)
    # term frequency: scale each column by its total count
    col_sums = np.asarray(mat.sum(axis=0)).flatten()
    tf = mat @ diags(1.0 / col_sums)
    # inverse document frequency per row
    row_sums = np.asarray(mat.sum(axis=1)).flatten()
    idf = np.log(1 + mat.shape[1] / row_sums)
    tf_idf = csr_matrix(diags(idf) @ tf)
    return tf_idf
61 |
--------------------------------------------------------------------------------
/simba/preprocessing/_variable_genes.py:
--------------------------------------------------------------------------------
1 | """Preprocess"""
2 |
3 | import numpy as np
4 | from scipy.sparse import (
5 | csr_matrix,
6 | )
7 | from sklearn.utils import sparsefuncs
8 | from skmisc.loess import loess
9 |
10 |
def select_variable_genes(adata,
                          layer='raw',
                          span=0.3,
                          n_top_genes=2000,
                          ):
    """Select highly variable genes.

    This function implements the method 'vst' in Seurat v3.
    Inspired by Scanpy.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    layer: `str`, optional (default: 'raw')
        The layer to use for calculating variable genes.
    span: `float`, optional (default: 0.3)
        Loess smoothing factor
    n_top_genes: `int`, optional (default: 2000)
        The number of genes to keep

    Returns
    -------
    updates `adata` with the following fields.

    variances_norm: `float`, (`adata.var['variances_norm']`)
        Normalized variance per gene
    variances: `float`, (`adata.var['variances']`)
        Variance per gene.
    means: `float`, (`adata.var['means']`)
        Means per gene
    highly_variable: `bool` (`adata.var['highly_variable']`)
        Indicator of variable genes
    """
    if layer is None:
        # NOTE(review): adata.X is taken without a copy here, while layers
        # are copied; the in-place clip below may mutate adata.X when it is
        # already csr (csr_matrix() does not copy by default) — confirm.
        X = adata.X
    else:
        X = adata.layers[layer].astype(np.float64).copy()
    # per-gene mean and variance computed directly on the sparse matrix
    mean, variance = sparsefuncs.mean_variance_axis(X, axis=0)
    variance_expected = np.zeros(adata.shape[1], dtype=np.float64)
    not_const = variance > 0

    # fit the mean-variance trend in log10 space, on non-constant genes only
    model = loess(np.log10(mean[not_const]),
                  np.log10(variance[not_const]),
                  span=span,
                  degree=2)
    model.fit()
    variance_expected[not_const] = 10**model.outputs.fitted_values
    N = adata.shape[0]
    # clip counts whose standardized value exceeds sqrt(N), as in Seurat vst
    clip_max = np.sqrt(N)
    clip_val = np.sqrt(variance_expected) * clip_max + mean

    X = csr_matrix(X)
    # clip only the stored (non-zero) entries of the sparse matrix
    mask = X.data > clip_val[X.indices]
    X.data[mask] = clip_val[X.indices[mask]]

    squared_X_sum = np.array(X.power(2).sum(axis=0))
    X_sum = np.array(X.sum(axis=0))

    # variance of the clipped values around the unclipped mean, normalized
    # by the loess-expected variance: sum((x - mean)^2) expanded as
    # N*mean^2 + sum(x^2) - 2*mean*sum(x)
    norm_gene_var = (1 / ((N - 1) * variance_expected)) \
        * ((N * np.square(mean))
           + squared_X_sum
           - 2 * X_sum * mean
           )
    norm_gene_var = norm_gene_var.flatten()

    adata.var['variances_norm'] = norm_gene_var
    adata.var['variances'] = variance
    adata.var['means'] = mean
    # indices of the n_top_genes genes with the largest normalized variance
    ids_top = norm_gene_var.argsort()[-n_top_genes:][::-1]
    adata.var['highly_variable'] = np.isin(range(adata.shape[1]), ids_top)
    print(f'{n_top_genes} variable genes are selected.')
83 |
--------------------------------------------------------------------------------
/simba/readwrite.py:
--------------------------------------------------------------------------------
1 | """reading and writing"""
2 |
3 | import os
4 | import pandas as pd
5 | import json
6 | from anndata import (
7 | AnnData,
8 | read_h5ad,
9 | read_csv,
10 | read_excel,
11 | read_hdf,
12 | read_loom,
13 | read_mtx,
14 | read_text,
15 | read_umi_tools,
16 | read_zarr,
17 | )
18 | from pathlib import Path
19 | import tables
20 |
21 | from ._settings import settings
22 | from ._utils import _read_legacy_10x_h5, _read_v3_10x_h5
23 |
24 |
def read_embedding(path_emb=None,
                   path_entity=None,
                   convert_alias=True,
                   path_entity_alias=None,
                   prefix=None,
                   num_epochs=None):
    """Read in entity embeddings from pbg training

    Parameters
    ----------
    path_emb: `str`, optional (default: None)
        Path to directory for pbg embedding model
        If None, .settings.pbg_params['checkpoint_path'] will be used.
    path_entity: `str`, optional (default: None)
        Path to entity name file
        If None, .settings.pbg_params['entity_path'] will be used.
    convert_alias: `bool`, optional (default: True)
        If True, it will convert entity aliases to the original indices
    path_entity_alias: `str`, optional (default: None)
        Path to the directory containing the 'entity_alias.txt' file
        If None, the parent directory of `path_emb` will be used.
    prefix: `list`, optional (default: None)
        A list of entity type prefixes to include.
        By default, it reads in the embeddings of all entities.
    num_epochs: `int`, optional (default: None)
        The embedding result associated with num_epochs to read in
        If None, .settings.pbg_params['num_epochs'] will be used.

    Returns
    -------
    dict_adata: `dict`
        A dictionary of anndata objects of shape
        (#entities x #dimensions), keyed by entity type prefix.
    """
    # fall back to the globally configured PBG training outputs
    pbg_params = settings.pbg_params
    if path_emb is None:
        path_emb = pbg_params['checkpoint_path']
    if path_entity is None:
        path_entity = pbg_params['entity_path']
    if num_epochs is None:
        num_epochs = pbg_params["num_epochs"]
    if prefix is None:
        prefix = []
    assert isinstance(prefix, list), \
        "`prefix` must be list"
    if convert_alias:
        # 'entity_alias.txt' maps original entity indices ('id', stored as
        # the file's index column) to the aliases used in the graph;
        # re-index the table by alias so aliases can be mapped back to ids
        if path_entity_alias is None:
            path_entity_alias = Path(path_emb).parent.as_posix()
        df_entity_alias = pd.read_csv(
            os.path.join(path_entity_alias, 'entity_alias.txt'),
            header=0,
            index_col=0,
            sep='\t')
        df_entity_alias['id'] = df_entity_alias.index
        df_entity_alias.index = df_entity_alias['alias'].values

    dict_adata = dict()
    # embedding files are named 'embeddings_<type>_<partition>.v<epoch>.h5';
    # only partition 0 is read here
    for x in os.listdir(path_emb):
        if x.startswith('embeddings'):
            entity_type = x.split('_')[1]
            if (len(prefix) == 0) or (entity_type in prefix):
                adata = \
                    read_hdf(os.path.join(path_emb,
                                          f'embeddings_{entity_type}_0.'
                                          f'v{num_epochs}.h5'),
                             key="embeddings")
                with open(
                    os.path.join(path_entity,
                                 f'entity_names_{entity_type}_0.json'), "rt")\
                        as tf:
                    names_entity = json.load(tf)
                if convert_alias:
                    names_entity = \
                        df_entity_alias.loc[names_entity, 'id'].tolist()
                adata.obs.index = names_entity
                dict_adata[entity_type] = adata
    return dict_adata
99 |
100 |
101 | # modifed from
102 | # scanpy https://github.com/theislab/scanpy/blob/master/scanpy/readwrite.py
def read_10x_h5(filename,
                genome=None,
                gex_only=True):
    """Read 10x-Genomics-formatted hdf5 file.

    Parameters
    ----------
    filename
        Path to a 10x hdf5 file.
    genome
        Filter expression to genes within this genome. For legacy 10x h5
        files, this must be provided if the data contains more than one genome.
    gex_only
        Only keep 'Gene Expression' data and ignore other feature types,
        e.g. 'Antibody Capture', 'CRISPR Guide Capture', or 'Custom'

    Returns
    -------
    adata: AnnData
        Annotated data matrix, where observations/cells are named by their
        barcode and variables/genes by gene name

    Raises
    ------
    ValueError
        If `genome` is not present in a CellRanger v3+ file.
    """
    # CellRanger v3+ files store everything under a single '/matrix' group;
    # legacy files have one group per genome
    with tables.open_file(str(filename), 'r') as f:
        v3 = '/matrix' in f
    if v3:
        adata = _read_v3_10x_h5(filename)
        if genome:
            if genome not in adata.var['genome'].values:
                # fix: report the actual file name instead of the literal
                # placeholder '(unknown)'
                raise ValueError(
                    f"Could not find data corresponding to "
                    f"genome '{genome}' in '{filename}'. "
                    f'Available genomes are:'
                    f' {list(adata.var["genome"].unique())}.'
                )
            adata = adata[:, adata.var['genome'] == genome]
        if gex_only:
            adata = adata[:, adata.var['feature_types'] == 'Gene Expression']
            if adata.is_view:
                adata = adata.copy()
    else:
        adata = _read_legacy_10x_h5(filename, genome=genome)
    return adata
145 |
146 |
def load_pbg_config(path=None):
    """Load PBG configuration into global setting

    Parameters
    ----------
    path: `str`, optional (default: None)
        Path to the directory for pbg configuration file
        If None, `.settings.pbg_params['checkpoint_path']` will be used

    Returns
    -------
    Updates `.settings.pbg_params`

    """
    if path is None:
        path = settings.pbg_params['checkpoint_path']
    config_file = os.path.join(os.path.normpath(path), 'config.json')
    with open(config_file, "rt") as fh:
        loaded_params = json.load(fh)
    settings.set_pbg_params(config=loaded_params)
167 |
168 |
def load_graph_stats(path=None):
    """Load graph statistics into global setting

    Parameters
    ----------
    path: `str`, optional (default: None)
        Path to the directory for graph statistics file
        If None, `.settings.pbg_params['checkpoint_path']` will be used

    Returns
    -------
    Updates `.settings.graph_stats`
    """
    if path is None:
        # by default, derive the graph directory from the entity path
        path = \
            Path(settings.pbg_params['entity_path']).parent.parent.as_posix()
    path = os.path.normpath(path)
    stats_file = os.path.join(path, 'graph_stats.json')
    with open(stats_file, "rt") as fh:
        stats = json.load(fh)
    # keyed by graph directory name, e.g. 'graph0'
    settings.graph_stats[os.path.basename(path)] = stats.copy()
190 |
191 |
def write_bed(adata,
              use_top_pcs=True,
              filename=None
              ):
    """Write peaks into .bed file

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix with peaks as variables.
        `.var` must contain the columns 'chr', 'start', 'end'.
    use_top_pcs: `bool`, optional (default: True)
        Use top-PCs-associated features
    filename: `str`, optional (default: None)
        Filename name for peaks.
        By default, a file named 'peaks.bed' will be written to
        `.settings.workdir`

    Raises
    ------
    ValueError
        If any of the required coordinate columns is missing.
    """
    if filename is None:
        filename = os.path.join(settings.workdir, 'peaks.bed')
    bed_cols = ['chr', 'start', 'end']
    for col in bed_cols:
        if col not in adata.var_keys():
            raise ValueError(f"could not find {col} in `adata.var_keys()`")
    if use_top_pcs:
        assert 'top_pcs' in adata.var_keys(), \
            "please run `si.pp.select_pcs_features()` first"
        df_out = adata.var.loc[adata.var['top_pcs'], bed_cols]
    else:
        df_out = adata.var[bed_cols]
    # BED files carry no header and no index column
    df_out.to_csv(filename,
                  sep='\t',
                  header=False,
                  index=False)
    out_dir, out_name = os.path.split(filename)
    print(f'"{out_name}" was written to "{out_dir}".')
228 |
--------------------------------------------------------------------------------
/simba/tools/__init__.py:
--------------------------------------------------------------------------------
1 | """The core functionality"""
2 |
3 | from ._general import (
4 | discretize,
5 | )
6 | from ._umap import umap
7 | from ._gene_scores import gene_scores
8 | from ._integration import (
9 | infer_edges,
10 | trim_edges
11 | )
12 | from ._pbg import (
13 | gen_graph,
14 | pbg_train
15 | )
16 | from ._post_training import (
17 | softmax,
18 | embed,
19 | compare_entities,
20 | query,
21 | find_master_regulators,
22 | find_target_genes,
23 | )
24 |
--------------------------------------------------------------------------------
/simba/tools/_gene_scores.py:
--------------------------------------------------------------------------------
1 | """Predict gene scores based on chromatin accessibility"""
2 |
3 | import numpy as np
4 | import pandas as pd
5 | import anndata as ad
6 | import io
7 | import pybedtools
8 | from scipy.sparse import (
9 | coo_matrix,
10 | csr_matrix
11 | )
12 | import pkgutil
13 |
14 | from ._utils import _uniquify
15 |
16 |
class GeneScores:
    """A class used to represent gene scores

    Gene scores summarize chromatin accessibility (cells x peaks) into a
    cells x genes activity matrix: each peak overlapping a gene's extended
    TSS region is assigned a weight that decays exponentially with its
    distance to the (upstream-extended) gene body.

    Attributes
    ----------
    adata: `AnnData`
        Input annotated data matrix (cells x peaks).
    genome: `str`
        Reference genome name. Must be one of
        ['hg19', 'hg38', 'mm9', 'mm10'] when `gene_anno` is None.
    gene_anno: `pandas.DataFrame` or None
        Gene annotation with columns
        ['chr', 'start', 'end', 'symbol', 'strand'].
        If None, the built-in annotation for `genome` is loaded.
    tss_upstream: `int`
        Number of base pairs upstream of the TSS to include.
    tss_downsteam: `int`
        Number of base pairs downstream of the TSS to include.
        (Spelling kept for backward compatibility.)
    gb_upstream: `int`
        Number of base pairs by which the gene body is extended upstream;
        also the length scale of the exponential distance decay.
    cutoff_weight: `float`
        Peak weights below this cutoff are zeroed out.
    use_top_pcs: `bool`
        If True, only peaks flagged in `adata.var['top_pcs']` are used.
    use_precomputed: `bool`
        If True, reuse the peak-gene overlap stored in
        `adata.uns['gene_scores']['overlap']`.
    use_gene_weigt: `bool`
        If True, rescale gene scores by a gene-size-dependent weight.
        (Spelling kept for backward compatibility.)
    min_w: `int`
        Minimum gene weight; only used if `use_gene_weigt` is True.
    max_w: `int`
        Maximum gene weight; only used if `use_gene_weigt` is True.

    Methods
    -------
    cal_gene_scores()
        Compute the cells x genes gene score matrix.
    """
    def __init__(self,
                 adata,
                 genome,
                 gene_anno=None,
                 tss_upstream=1e5,
                 tss_downsteam=1e5,
                 gb_upstream=5000,
                 cutoff_weight=1,
                 use_top_pcs=True,
                 use_precomputed=True,
                 use_gene_weigt=True,
                 min_w=1,
                 max_w=5):
        """
        Parameters
        ----------
        adata: `Anndata`
            Input anndata
        genome : `str`
            The genome name

        See the class docstring for the remaining parameters.
        """
        self.adata = adata
        self.genome = genome
        self.gene_anno = gene_anno
        self.tss_upstream = tss_upstream
        self.tss_downsteam = tss_downsteam
        self.gb_upstream = gb_upstream
        self.cutoff_weight = cutoff_weight
        self.use_top_pcs = use_top_pcs
        self.use_precomputed = use_precomputed
        self.use_gene_weigt = use_gene_weigt
        self.min_w = min_w
        self.max_w = max_w

    def _read_gene_anno(self):
        """Read in the built-in gene annotation for `self.genome`.

        Returns
        -------
        `pandas.DataFrame`
            Annotation with columns
            ['chr', 'start', 'end', 'symbol', 'strand'];
            also stored in `self.gene_anno`.
        """
        assert (self.genome in ['hg19', 'hg38', 'mm9', 'mm10']),\
            "`genome` must be one of ['hg19','hg38','mm9','mm10']"

        # the BED files ship inside the package
        bin_str = pkgutil.get_data('simba',
                                   f'data/gene_anno/{self.genome}_genes.bed')
        gene_anno = pd.read_csv(io.BytesIO(bin_str),
                                encoding='utf8',
                                sep='\t',
                                header=None,
                                names=['chr', 'start', 'end',
                                       'symbol', 'strand'])
        self.gene_anno = gene_anno
        return self.gene_anno

    def _extend_tss(self, pbt_gene):
        """Extend transcription start site in both directions.

        Used as a `pybedtools` `.each()` callback; mutates and returns
        the given interval.
        """
        ext_tss = pbt_gene
        if(ext_tss['strand'] == '+'):
            ext_tss.start = max(0, ext_tss.start - self.tss_upstream)
            # ensure the interval still covers the original end
            ext_tss.end = max(ext_tss.end, ext_tss.start + self.tss_downsteam)
        else:
            # on the minus strand the TSS is at `end`
            ext_tss.start = max(0, min(ext_tss.start,
                                       ext_tss.end - self.tss_downsteam))
            ext_tss.end = ext_tss.end + self.tss_upstream
        return ext_tss

    def _extend_genebody(self, pbt_gene):
        """Extend gene body upstream (strand-aware).

        Used as a `pybedtools` `.each()` callback; mutates and returns
        the given interval.
        """
        ext_gb = pbt_gene
        if(ext_gb['strand'] == '+'):
            ext_gb.start = max(0, ext_gb.start - self.gb_upstream)
        else:
            ext_gb.end = ext_gb.end + self.gb_upstream
        return ext_gb

    def _weight_genes(self):
        """Weight genes by inverse gene size.

        Returns
        -------
        `pandas.Series`
            Per-gene weights, linearly rescaled into
            [`self.min_w`, `self.max_w`] (smallest gene gets `max_w`).
        """
        gene_anno = self.gene_anno
        gene_size = gene_anno['end'] - gene_anno['start']
        w = 1/gene_size
        w_scaled = (self.max_w-self.min_w) * (w-min(w)) / (max(w)-min(w)) \
            + self.min_w
        return w_scaled

    def cal_gene_scores(self):
        """Calculate gene scores

        Returns
        -------
        adata_CG_atac: `AnnData`
            Annotated cells x genes gene score matrix.
            Also updates `self.adata.uns['gene_scores']['overlap']`.
        """
        adata = self.adata
        if self.gene_anno is None:
            gene_ann = self._read_gene_anno()
        else:
            gene_ann = self.gene_anno

        df_gene_ann = gene_ann.copy()
        df_gene_ann.index = _uniquify(df_gene_ann['symbol'].values)
        if self.use_top_pcs:
            mask_p = adata.var['top_pcs']
        else:
            mask_p = pd.Series(True, index=adata.var_names)
        df_peaks = adata.var[mask_p][['chr', 'start', 'end']].copy()

        if('gene_scores' not in adata.uns_keys()):
            print('Gene scores are being calculated for the first time')
            print('`use_precomputed` has been ignored')
            self.use_precomputed = False

        if(self.use_precomputed):
            print('Using precomputed overlap')
            df_overlap_updated = adata.uns['gene_scores']['overlap'].copy()
        else:
            # add the fifth column
            # so that pybedtool can recognize the sixth column as the strand
            df_gene_ann_for_pbt = df_gene_ann.copy()
            df_gene_ann_for_pbt['score'] = 0
            df_gene_ann_for_pbt = df_gene_ann_for_pbt[['chr', 'start', 'end',
                                                       'symbol', 'score',
                                                       'strand']]
            df_gene_ann_for_pbt['id'] = range(df_gene_ann_for_pbt.shape[0])

            df_peaks_for_pbt = df_peaks.copy()
            df_peaks_for_pbt['id'] = range(df_peaks_for_pbt.shape[0])

            pbt_gene_ann = pybedtools.BedTool.from_dataframe(
                df_gene_ann_for_pbt
            )
            pbt_gene_ann_ext = pbt_gene_ann.each(self._extend_tss)
            pbt_gene_gb_ext = pbt_gene_ann.each(self._extend_genebody)

            pbt_peaks = pybedtools.BedTool.from_dataframe(df_peaks_for_pbt)

            # peaks overlapping with extended TSS
            pbt_overlap = pbt_peaks.intersect(pbt_gene_ann_ext,
                                              wa=True,
                                              wb=True)
            df_overlap = pbt_overlap.to_dataframe(
                names=[x+'_p' for x in df_peaks_for_pbt.columns]
                + [x+'_g' for x in df_gene_ann_for_pbt.columns])
            # peaks overlapping with gene body
            pbt_overlap2 = pbt_peaks.intersect(pbt_gene_gb_ext,
                                               wa=True,
                                               wb=True)
            df_overlap2 = pbt_overlap2.to_dataframe(
                names=[x+'_p' for x in df_peaks_for_pbt.columns]
                + [x+'_g' for x in df_gene_ann_for_pbt.columns])

            # add distance and weight for each overlap
            df_overlap_updated = df_overlap.copy()
            df_overlap_updated['dist'] = 0

            for i, x in enumerate(df_overlap['symbol_g'].unique()):
                # peaks within the extended TSS
                df_overlap_x = \
                    df_overlap[df_overlap['symbol_g'] == x].copy()
                # peaks within the gene body
                df_overlap2_x = \
                    df_overlap2[df_overlap2['symbol_g'] == x].copy()
                # peaks that are not intersecting with the promoter
                # and gene body of gene x
                id_overlap = df_overlap_x.index[
                    ~np.isin(df_overlap_x['id_p'], df_overlap2_x['id_p'])]
                mask_x = (df_gene_ann['symbol'] == x)
                range_x = df_gene_ann[mask_x][['start', 'end']].values\
                    .flatten()
                # distance = minimum gap between the peak and the
                # upstream-extended gene body, strand-aware
                if(df_overlap_x['strand_g'].iloc[0] == '+'):
                    df_overlap_updated.loc[id_overlap, 'dist'] = pd.concat(
                        [abs(df_overlap_x.loc[id_overlap, 'start_p']
                             - (range_x[1])),
                         abs(df_overlap_x.loc[id_overlap, 'end_p']
                             - max(0, range_x[0]-self.gb_upstream))],
                        axis=1, sort=False).min(axis=1)
                else:
                    df_overlap_updated.loc[id_overlap, 'dist'] = pd.concat(
                        [abs(df_overlap_x.loc[id_overlap, 'start_p']
                             - (range_x[1]+self.gb_upstream)),
                         abs(df_overlap_x.loc[id_overlap, 'end_p']
                             - (range_x[0]))],
                        axis=1, sort=False).min(axis=1)

                # progress report roughly every 20% of genes;
                # fix: guard against `n_batch == 0` (ZeroDivisionError in
                # the modulo below) when there are fewer than 5 genes
                n_batch = max(1, int(df_gene_ann_for_pbt.shape[0]/5))
                if(i % n_batch == 0):
                    print(f'Processing: {i/df_gene_ann_for_pbt.shape[0]:.1%}')
            df_overlap_updated['dist'] = df_overlap_updated['dist']\
                .astype(float)

            # cache the (weight-free) overlap for later reuse
            adata.uns['gene_scores'] = dict()
            adata.uns['gene_scores']['overlap'] = df_overlap_updated.copy()

        # exponential decay with length scale `gb_upstream`
        df_overlap_updated['weight'] = np.exp(
            -(df_overlap_updated['dist'].values/self.gb_upstream))
        mask_w = (df_overlap_updated['weight'] < self.cutoff_weight)
        df_overlap_updated.loc[mask_w, 'weight'] = 0
        # construct genes-by-peaks matrix
        mat_GP = csr_matrix(coo_matrix((df_overlap_updated['weight'],
                                        (df_overlap_updated['id_g'],
                                         df_overlap_updated['id_p'])),
                                       shape=(df_gene_ann.shape[0],
                                              df_peaks.shape[0])))
        if self.use_gene_weigt:
            gene_weights = self._weight_genes()
            gene_scores = adata[:, mask_p].X * \
                (mat_GP.T.multiply(gene_weights))
        else:
            gene_scores = adata[:, mask_p].X * mat_GP.T
        adata_CG_atac = ad.AnnData(gene_scores,
                                   obs=adata.obs.copy(),
                                   var=df_gene_ann.copy())
        return adata_CG_atac
273 |
274 |
def gene_scores(adata,
                genome,
                gene_anno=None,
                tss_upstream=1e5,
                tss_downsteam=1e5,
                gb_upstream=5000,
                cutoff_weight=1,
                use_top_pcs=True,
                use_precomputed=True,
                use_gene_weigt=True,
                min_w=1,
                max_w=5):
    """Calculate gene scores

    Convenience wrapper around :class:`GeneScores`.

    Parameters
    ----------
    adata : AnnData
        Annotated data matrix.
    genome : `str`
        Reference genome. Choose from {'hg19', 'hg38', 'mm9', 'mm10'}
    gene_anno : `pandas.DataFrame`, optional (default: None)
        Dataframe of gene annotation.
        If None, built-in gene annotation will be used depending on `genome`;
        If provided, custom gene annotation will be used instead.
    tss_upstream : `int`, optional (default: 1e5)
        The number of base pairs upstream of TSS
    tss_downsteam : `int`, optional (default: 1e5)
        The number of base pairs downstream of TSS
    gb_upstream : `int`, optional (default: 5000)
        The number of base pairs upstream by which gene body is extended.
        Peaks within the extended gene body are given the weight of 1.
    cutoff_weight : `float`, optional (default: 1)
        Weight cutoff for peaks
    use_top_pcs : `bool`, optional (default: True)
        If True, only peaks associated with top PCs will be used
    use_precomputed : `bool`, optional (default: True)
        If True, overlap between peaks and genes
        (stored in `adata.uns['gene_scores']['overlap']`) will be imported
    use_gene_weigt : `bool`, optional (default: True)
        If True, for each gene, the number of peaks assigned to it
        will be rescaled based on gene size
    min_w : `int`, optional (default: 1)
        The minimum weight for each gene.
        Only valid if `use_gene_weigt` is True
    max_w : `int`, optional (default: 5)
        The maximum weight for each gene.
        Only valid if `use_gene_weigt` is True

    Returns
    -------
    adata_new: AnnData
        Annotated data matrix.
        Stores #cells x #genes gene score matrix

    updates `adata` with the following fields.
    overlap: `pandas.DataFrame`, (`adata.uns['gene_scores']['overlap']`)
        Dataframe of overlap between peaks and genes
    """
    scorer = GeneScores(
        adata,
        genome,
        gene_anno=gene_anno,
        tss_upstream=tss_upstream,
        tss_downsteam=tss_downsteam,
        gb_upstream=gb_upstream,
        cutoff_weight=cutoff_weight,
        use_top_pcs=use_top_pcs,
        use_precomputed=use_precomputed,
        use_gene_weigt=use_gene_weigt,
        min_w=min_w,
        max_w=max_w,
    )
    return scorer.cal_gene_scores()
347 |
--------------------------------------------------------------------------------
/simba/tools/_general.py:
--------------------------------------------------------------------------------
1 | """General-purpose tools"""
2 |
3 | import numpy as np
4 | from sklearn.cluster import KMeans
5 |
6 |
def discretize(adata,
               layer=None,
               n_bins=5,
               max_bins=100):
    """Discretize continous values

    The nonzero values are first summarized into a `max_bins`-bin
    histogram; the histogram bin centers are then clustered (weighted by
    bin counts) into `n_bins` groups, whose midpoints define the final
    bin edges.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
        The matrix is expected to be sparse (its `.data` attribute is
        used to access the nonzero values).
    layer: `str`, optional (default: None)
        The layer used to perform discretization
    n_bins: `int`, optional (default: 5)
        The number of bins to produce.
        It must be smaller than `max_bins`.
    max_bins: `int`, optional (default: 100)
        The number of bins used in the initial approximation.
        i.e. the number of bins to cluster.

    Returns
    -------
    updates `adata` with the following fields

    `.layer['disc']` : `array_like`
        Discretized values.
    `.uns['disc']` : `dict`
        `bin_edges`: The edges of each bin.
        `bin_count`: The number of values in each bin.
        `hist_edges`: The edges of each bin \
        in the initial approximation.
        `hist_count`: The number of values in each bin \
        for the initial approximation.
    """
    X = adata.X if layer is None else adata.layers[layer]
    values = X.data  # nonzero entries only

    # initial fine-grained approximation
    hist_count, hist_edges = np.histogram(
        values,
        bins=max_bins,
        density=False)
    centroids = (hist_edges[0:-1] + hist_edges[1:])/2

    # cluster histogram bin centers, weighted by their counts
    km = KMeans(n_clusters=n_bins, random_state=2021)
    km.fit(centroids.reshape(-1, 1), sample_weight=hist_count)
    centers = np.sort(km.cluster_centers_.flatten())

    # final edges: midpoints between adjacent cluster centers,
    # padded slightly beyond the observed range
    pad = (hist_edges[-1] - hist_edges[0])/(max_bins*10)
    inner_edges = list((centers[0:-1] + centers[1:])/2)
    bin_edges = np.array(
        [hist_edges[0]-pad] + inner_edges + [hist_edges[-1]+pad])
    disc_values = np.digitize(values, bin_edges).reshape(-1,)
    bin_count = np.unique(disc_values, return_counts=True)[1]

    adata.layers['disc'] = X.copy()
    adata.layers['disc'].data = disc_values
    adata.uns['disc'] = {
        'bin_edges': bin_edges,
        'bin_count': bin_count,
        'hist_edges': hist_edges,
        'hist_count': hist_count,
    }
72 |
--------------------------------------------------------------------------------
/simba/tools/_integration.py:
--------------------------------------------------------------------------------
1 | """Integration across experimental conditions or single cell modalities"""
2 |
3 | import numpy as np
4 | import anndata as ad
5 | # from sklearn.metrics.pairwise import pairwise_distances
6 | from sklearn.utils.extmath import randomized_svd
7 | from scipy.sparse import csr_matrix, find
8 |
9 | from ._utils import _knn
10 |
11 |
def infer_edges(adata_ref,
                adata_query,
                feature='highly_variable',
                n_components=20,
                random_state=42,
                layer=None,
                k=20,
                metric='euclidean',
                leaf_size=40,
                **kwargs):
    """Infer edges between reference and query observations

    Both datasets are embedded into a shared low-dimensional space via a
    randomized SVD of their cross-product matrix over shared features;
    edges are the mutual nearest neighbors between the two embeddings.

    Parameters
    ----------
    adata_ref: `AnnData`
        Annotated reference data matrix.
    adata_query: `AnnData`
        Annotated query data matrix.
    feature: `str`, optional (default: 'highly_variable')
        Feature used for edges inference.
        The data type of `.var[feature]` needs to be `bool`
    n_components: `int`, optional (default: 20)
        The number of components used in `randomized_svd`
        for comparing reference and query observations
    random_state: `int`, optional (default: 42)
        The seed used for truncated randomized SVD
    layer: `str`, optional (default: None)
        The layer used to perform edge inference
        If None, `.X` will be used.
    k: `int`, optional (default: 20)
        The number of nearest neighbors to consider
        when searching across the two datasets
    metric: `str`, optional (default: 'euclidean')
        The metric to use when calculating distance between
        reference and query observations
    leaf_size: `int`, optional (default: 40)
        Leaf size passed to the nearest-neighbor search (`_knn`)
    kwargs:
        Other keyword arguments are passed down to `randomized_svd()`

    Returns
    -------
    adata_ref_query: `AnnData`
        Annotated relation matrix between reference and query observations
        Store reference entity as observations and query entity as variables
    """

    # restrict to features that are both selected in the reference
    # and present in the query
    mask_ref = adata_ref.var[feature]
    feature_ref = adata_ref.var_names[mask_ref]
    feature_query = adata_query.var_names
    feature_shared = list(set(feature_ref).intersection(set(feature_query)))
    print(f'#shared features: {len(feature_shared)}')
    if layer is None:
        X_ref = adata_ref[:, feature_shared].X
        X_query = adata_query[:, feature_shared].X
    else:
        X_ref = adata_ref[:, feature_shared].layers[layer]
        X_query = adata_query[:, feature_shared].layers[layer]

    # observations with all-zero rows would yield degenerate embeddings
    if any(X_ref.sum(axis=1) == 0) or any(X_query.sum(axis=1) == 0):
        raise ValueError(
            f'Some nodes contain zero expressed {feature} features.\n'
            f'Please try to include more {feature} features.')

    print('Performing randomized SVD ...')
    # cross-dataset similarity matrix (#ref x #query)
    mat = X_ref * X_query.T
    U, Sigma, VT = randomized_svd(mat,
                                  n_components=n_components,
                                  random_state=random_state,
                                  **kwargs)
    # U embeds reference observations, VT.T embeds query observations;
    # L2-normalize each row so distances are scale-independent
    svd_data = np.vstack((U, VT.T))
    X_svd_ref = svd_data[:U.shape[0], :]
    X_svd_query = svd_data[-VT.shape[1]:, :]
    X_svd_ref = X_svd_ref / (X_svd_ref**2).sum(-1, keepdims=True)**0.5
    X_svd_query = X_svd_query / (X_svd_query**2).sum(-1, keepdims=True)**0.5

    # print('Searching for neighbors within each dataset ...')
    # knn_conn_ref, knn_dist_ref = _knn(
    #     X_ref=X_svd_ref,
    #     k=k,
    #     leaf_size=leaf_size,
    #     metric=metric)
    # knn_conn_query, knn_dist_query = _knn(
    #     X_ref=X_svd_query,
    #     k=k,
    #     leaf_size=leaf_size,
    #     metric=metric)

    print('Searching for mutual nearest neighbors ...')
    # kNN in both directions: ref -> query and query -> ref
    knn_conn_ref_query, knn_dist_ref_query = _knn(
        X_ref=X_svd_ref,
        X_query=X_svd_query,
        k=k,
        leaf_size=leaf_size,
        metric=metric)
    knn_conn_query_ref, knn_dist_query_ref = _knn(
        X_ref=X_svd_query,
        X_query=X_svd_ref,
        k=k,
        leaf_size=leaf_size,
        metric=metric)

    # entries > 1 are neighbors in BOTH directions (mutual nearest neighbors)
    sum_conn_ref_query = knn_conn_ref_query + knn_conn_query_ref.T
    id_x, id_y, values = find(sum_conn_ref_query > 1)
    print(f'{len(id_x)} edges are selected')
    conn_ref_query = csr_matrix(
        (values*1, (id_x, id_y)),
        shape=(knn_conn_ref_query.shape))
    dist_ref_query = csr_matrix(
        (knn_dist_ref_query[id_x, id_y].A.flatten(), (id_x, id_y)),
        shape=(knn_conn_ref_query.shape))
    # it's easier to distinguish zeros (no connection vs zero distance)
    # using similarity scores
    sim_ref_query = csr_matrix(
        (1/(dist_ref_query.data+1), dist_ref_query.nonzero()),
        shape=(dist_ref_query.shape))  # similarity scores

    # print('Computing similarity scores ...')
    # dist_ref_query = pairwise_distances(X_svd_ref,
    #                                     X_svd_query,
    #                                     metric=metric)
    # sim_ref_query = 1/(1+dist_ref_query)
    # # remove low similarity entries to save memory
    # sim_ref_query = np.where(
    #     sim_ref_query < np.percentile(sim_ref_query, pct_keep*100),
    #     0, sim_ref_query)
    # sim_ref_query = csr_matrix(sim_ref_query)

    adata_ref_query = ad.AnnData(X=sim_ref_query,
                                 obs=adata_ref.obs,
                                 var=adata_query.obs)
    adata_ref_query.layers['conn'] = conn_ref_query
    adata_ref_query.obsm['svd'] = X_svd_ref
    # adata_ref_query.obsp['conn'] = knn_conn_ref
    # adata_ref_query.obsp['dist'] = knn_dist_ref
    adata_ref_query.varm['svd'] = X_svd_query
    # adata_ref_query.varp['conn'] = knn_conn_query
    # adata_ref_query.varp['dist'] = knn_dist_query
    return adata_ref_query
153 |
154 |
def trim_edges(adata_ref_query,
               cutoff=None,
               n_edges=None):
    """Trim edges based on the similarity scores

    Parameters
    ----------
    adata_ref_query: `AnnData`
        Annotated relation matrix between reference and query observations.
        `.X` holds the (sparse) similarity scores.
    cutoff: `float`, optional (default: None)
        The similarity cutoff; edges with scores strictly above it
        are kept. If None, it is derived from `n_edges`.
    n_edges: `int`, optional (default: None)
        The number of edges to keep.
        Ignored when `cutoff` is specified.

    Returns
    -------
    updates `adata_ref_query` with the following field.
    `.layers['conn']` : `array_like`
        relation matrix between reference and query observations
    """
    scores = adata_ref_query.X
    if cutoff is None:
        if n_edges is None:
            raise ValueError('"cutoff" or "n_edges" has to be specified')
        # cutoff = the n_edges-th largest stored score
        kth = scores.size - n_edges
        cutoff = np.partition(scores.data, kth)[kth]
    rows, cols, vals = find(scores > cutoff)

    print(f'{len(rows)} edges are selected')
    adata_ref_query.layers['conn'] = csr_matrix(
        (vals*1, (rows, cols)),
        shape=(scores.shape))
198 |
--------------------------------------------------------------------------------
/simba/tools/_pbg.py:
--------------------------------------------------------------------------------
1 | """PyTorch-BigGraph (PBG) for learning graph embeddings"""
2 |
3 | import numpy as np
4 | import pandas as pd
5 | import os
6 | import json
7 |
8 | from pathlib import Path
9 | import attr
10 | from torchbiggraph.config import (
11 | add_to_sys_path,
12 | ConfigFileLoader
13 | )
14 | from torchbiggraph.converters.importers import (
15 | convert_input_data,
16 | TSVEdgelistReader
17 | )
18 | from torchbiggraph.train import train
19 | from torchbiggraph.util import (
20 | set_logging_verbosity,
21 | setup_logging,
22 | SubprocessInitializer,
23 | )
24 |
25 | from .._settings import settings
26 |
27 |
28 | def gen_graph(list_CP=None,
29 | list_PM=None,
30 | list_PK=None,
31 | list_CG=None,
32 | list_CC=None,
33 | prefix_C='C',
34 | prefix_P='P',
35 | prefix_M='M',
36 | prefix_K='K',
37 | prefix_G='G',
38 | copy=False,
39 | dirname='graph0',
40 | use_highly_variable=True,
41 | use_top_pcs=True,
42 | use_top_pcs_CP=None,
43 | use_top_pcs_PM=None,
44 | use_top_pcs_PK=None,
45 | ):
46 | """Generate graph for PBG training based on indices of obs and var
47 | It also generates an accompanying file 'entity_alias.tsv' to map
48 | the indices to the aliases used in the graph
49 |
50 | Parameters
51 | ----------
52 | list_CP: `list`, optional (default: None)
53 | A list of anndata objects that store ATAC-seq data (Cells by Peaks)
54 | list_PM: `list`, optional (default: None)
55 | A list of anndata objects that store relation between Peaks and Motifs
56 | list_PK: `list`, optional (default: None)
57 | A list of anndata objects that store relation between Peaks and Kmers
58 | list_CG: `list`, optional (default: None)
59 | A list of anndata objects that store RNA-seq data (Cells by Genes)
60 | list_CC: `list`, optional (default: None)
61 | A list of anndata objects that store relation between Cells
62 | from two conditions
63 | prefix_C: `str`, optional (default: 'C')
64 | Prefix to indicate the entity type of cells
65 | prefix_G: `str`, optional (default: 'G')
66 | Prefix to indicate the entity type of genes
67 | dirname: `str`, (default: 'graph0')
68 | The name of the directory in which each graph will be stored
69 | use_highly_variable: `bool`, optional (default: True)
70 | Use highly variable genes
71 | use_top_pcs: `bool`, optional (default: True)
72 | Use top-PCs-associated features for CP, PM, PK
73 | use_top_pcs_CP: `bool`, optional (default: None)
74 | Use top-PCs-associated features for CP
75 | Once specified, it will overwrite `use_top_pcs`
76 | use_top_pcs_PM: `bool`, optional (default: None)
77 | Use top-PCs-associated features for PM
78 | Once specified, it will overwrite `use_top_pcs`
79 | use_top_pcs_PK: `bool`, optional (default: None)
80 | Use top-PCs-associated features for PK
81 | Once specified, it will overwrite `use_top_pcs`
82 | copy: `bool`, optional (default: False)
83 | If True, it returns the graph file as a data frame
84 |
85 | Returns
86 | -------
87 | If `copy` is True,
88 | edges: `pd.DataFrame`
89 | The edges of the graph used for PBG training.
90 | Each line contains information about one edge.
91 | Using tabs as separators, each line contains the identifiers of
92 | the source entities, the relation types and the target entities.
93 |
94 | updates `.settings.pbg_params` with the following parameters.
95 | entity_path: `str`
96 | The path of the directory containing entity count files.
97 | edge_paths: `list`
98 | A list of paths to directories containing (partitioned) edgelists.
99 | Typically a single path is provided.
100 | entities: `dict`
101 | The entity types.
102 | relations: `list`
103 | The relation types.
104 |
105 | updates `.settings.graph_stats` with the following parameters.
106 | `dirname`: `dict`
107 | Statistics of input graph
108 | """
109 |
110 | if(sum(list(map(lambda x: x is None,
111 | [list_CP,
112 | list_PM,
113 | list_PK,
114 | list_CG,
115 | list_CC]))) == 5):
116 | return 'No graph is generated'
117 |
118 | filepath = os.path.join(settings.workdir, 'pbg', dirname)
119 | settings.pbg_params['entity_path'] = \
120 | os.path.join(filepath, "input/entity")
121 | settings.pbg_params['edge_paths'] = \
122 | [os.path.join(filepath, "input/edge"), ]
123 | if(not os.path.exists(filepath)):
124 | os.makedirs(filepath)
125 |
126 | # Collect the indices of entities
127 | dict_cells = dict() # unique cell indices from all cell-centric datasets
128 | ids_genes = pd.Index([])
129 | ids_peaks = pd.Index([])
130 | ids_kmers = pd.Index([])
131 | ids_motifs = pd.Index([])
132 |
133 | if list_CP is not None:
134 | for adata_ori in list_CP:
135 | if use_top_pcs_CP is None:
136 | flag_top_pcs = use_top_pcs
137 | else:
138 | flag_top_pcs = use_top_pcs_CP
139 | if flag_top_pcs:
140 | adata = adata_ori[:, adata_ori.var['top_pcs']].copy()
141 | else:
142 | adata = adata_ori.copy()
143 | ids_cells_i = adata.obs.index
144 | if(len(dict_cells) == 0):
145 | dict_cells[prefix_C] = ids_cells_i
146 | else:
147 | # check if cell indices are included in dict_cells
148 | flag_included = False
149 | for k in dict_cells.keys():
150 | ids_cells_k = dict_cells[k]
151 | if set(ids_cells_i) <= set(ids_cells_k):
152 | flag_included = True
153 | break
154 | if not flag_included:
155 | # create a new set of entities
156 | # when not all indices are included
157 | dict_cells[f'{prefix_C}{len(dict_cells)+1}'] = ids_cells_i
158 | ids_peaks = ids_peaks.union(adata.var.index)
159 | if list_PM is not None:
160 | for adata_ori in list_PM:
161 | if use_top_pcs_PM is None:
162 | flag_top_pcs = use_top_pcs
163 | else:
164 | flag_top_pcs = use_top_pcs_PM
165 | if flag_top_pcs:
166 | adata = adata_ori[:, adata_ori.var['top_pcs']].copy()
167 | else:
168 | adata = adata_ori.copy()
169 | ids_peaks = ids_peaks.union(adata.obs.index)
170 | ids_motifs = ids_motifs.union(adata.var.index)
171 | if list_PK is not None:
172 | for adata_ori in list_PK:
173 | if use_top_pcs_PK is None:
174 | flag_top_pcs = use_top_pcs
175 | else:
176 | flag_top_pcs = use_top_pcs_PK
177 | if flag_top_pcs:
178 | adata = adata_ori[:, adata_ori.var['top_pcs']].copy()
179 | else:
180 | adata = adata_ori.copy()
181 | ids_peaks = ids_peaks.union(adata.obs.index)
182 | ids_kmers = ids_kmers.union(adata.var.index)
183 | if list_CG is not None:
184 | for adata_ori in list_CG:
185 | if use_highly_variable:
186 | adata = adata_ori[:, adata_ori.var['highly_variable']].copy()
187 | else:
188 | adata = adata_ori.copy()
189 | ids_cells_i = adata.obs.index
190 | if(len(dict_cells) == 0):
191 | dict_cells[prefix_C] = ids_cells_i
192 | else:
193 | # check if cell indices are included in dict_cells
194 | flag_included = False
195 | for k in dict_cells.keys():
196 | ids_cells_k = dict_cells[k]
197 | if set(ids_cells_i) <= set(ids_cells_k):
198 | flag_included = True
199 | break
200 | if not flag_included:
201 | # create a new set of entities
202 | # when not all indices are included
203 | dict_cells[f'{prefix_C}{len(dict_cells)+1}'] = ids_cells_i
204 | ids_genes = ids_genes.union(adata.var.index)
205 |
206 | entity_alias = pd.DataFrame(columns=['alias'])
207 | dict_df_cells = dict() # unique cell dataframes
208 | for k in dict_cells.keys():
209 | dict_df_cells[k] = pd.DataFrame(
210 | index=dict_cells[k],
211 | columns=['alias'],
212 | data=[f'{k}.{x}' for x in range(len(dict_cells[k]))])
213 | settings.pbg_params['entities'][k] = {'num_partitions': 1}
214 | entity_alias = entity_alias.append(dict_df_cells[k],
215 | ignore_index=False)
216 | if(len(ids_genes) > 0):
217 | df_genes = pd.DataFrame(
218 | index=ids_genes,
219 | columns=['alias'],
220 | data=[f'{prefix_G}.{x}' for x in range(len(ids_genes))])
221 | settings.pbg_params['entities'][prefix_G] = {'num_partitions': 1}
222 | entity_alias = entity_alias.append(df_genes,
223 | ignore_index=False)
224 | if(len(ids_peaks) > 0):
225 | df_peaks = pd.DataFrame(
226 | index=ids_peaks,
227 | columns=['alias'],
228 | data=[f'{prefix_P}.{x}' for x in range(len(ids_peaks))])
229 | settings.pbg_params['entities'][prefix_P] = {'num_partitions': 1}
230 | entity_alias = entity_alias.append(df_peaks,
231 | ignore_index=False)
232 | if(len(ids_kmers) > 0):
233 | df_kmers = pd.DataFrame(
234 | index=ids_kmers,
235 | columns=['alias'],
236 | data=[f'{prefix_K}.{x}' for x in range(len(ids_kmers))])
237 | settings.pbg_params['entities'][prefix_K] = {'num_partitions': 1}
238 | entity_alias = entity_alias.append(df_kmers,
239 | ignore_index=False)
240 | if(len(ids_motifs) > 0):
241 | df_motifs = pd.DataFrame(
242 | index=ids_motifs,
243 | columns=['alias'],
244 | data=[f'{prefix_M}.{x}' for x in range(len(ids_motifs))])
245 | settings.pbg_params['entities'][prefix_M] = {'num_partitions': 1}
246 | entity_alias = entity_alias.append(df_motifs,
247 | ignore_index=False)
248 |
249 | # generate edges
250 | dict_graph_stats = dict()
251 | col_names = ["source", "relation", "destination"]
252 | df_edges = pd.DataFrame(columns=col_names)
253 | id_r = 0
254 | settings.pbg_params['relations'] = []
255 |
256 | if list_CP is not None:
257 | for adata_ori in list_CP:
258 | if use_top_pcs:
259 | adata = adata_ori[:, adata_ori.var['top_pcs']].copy()
260 | else:
261 | adata = adata_ori.copy()
262 | # select reference of cells
263 | for key, df_cells in dict_df_cells.items():
264 | if set(adata.obs_names) <= set(df_cells.index):
265 | break
266 | df_edges_x = pd.DataFrame(columns=col_names)
267 | df_edges_x['source'] = df_cells.loc[
268 | adata.obs_names[adata.X.nonzero()[0]],
269 | 'alias'].values
270 | df_edges_x['relation'] = f'r{id_r}'
271 | df_edges_x['destination'] = df_peaks.loc[
272 | adata.var_names[adata.X.nonzero()[1]],
273 | 'alias'].values
274 | print(f'relation{id_r}: '
275 | f'source: {key}, '
276 | f'destination: {prefix_P}\n'
277 | f'#edges: {df_edges_x.shape[0]}')
278 | dict_graph_stats[f'relation{id_r}'] = \
279 | {'source': key,
280 | 'destination': prefix_P,
281 | 'n_edges': df_edges_x.shape[0]}
282 | df_edges = df_edges.append(df_edges_x,
283 | ignore_index=True)
284 | settings.pbg_params['relations'].append(
285 | {'name': f'r{id_r}',
286 | 'lhs': f'{key}',
287 | 'rhs': f'{prefix_P}',
288 | 'operator': 'none',
289 | 'weight': 1.0
290 | })
291 | id_r += 1
292 | adata_ori.obs['pbg_id'] = ""
293 | adata_ori.var['pbg_id'] = ""
294 | adata_ori.obs.loc[adata.obs_names, 'pbg_id'] = \
295 | df_cells.loc[adata.obs_names, 'alias'].copy()
296 | adata_ori.var.loc[adata.var_names, 'pbg_id'] = \
297 | df_peaks.loc[adata.var_names, 'alias'].copy()
298 |
299 | if list_PM is not None:
300 | for adata_ori in list_PM:
301 | if use_top_pcs:
302 | adata = adata_ori[:, adata_ori.var['top_pcs']].copy()
303 | else:
304 | adata = adata_ori.copy()
305 | df_edges_x = pd.DataFrame(columns=col_names)
306 | df_edges_x['source'] = df_peaks.loc[
307 | adata.obs_names[adata.X.nonzero()[0]],
308 | 'alias'].values
309 | df_edges_x['relation'] = f'r{id_r}'
310 | df_edges_x['destination'] = df_motifs.loc[
311 | adata.var_names[adata.X.nonzero()[1]],
312 | 'alias'].values
313 | print(f'relation{id_r}: '
314 | f'source: {prefix_P}, '
315 | f'destination: {prefix_M}\n'
316 | f'#edges: {df_edges_x.shape[0]}')
317 | dict_graph_stats[f'relation{id_r}'] = \
318 | {'source': prefix_P,
319 | 'destination': prefix_M,
320 | 'n_edges': df_edges_x.shape[0]}
321 | df_edges = df_edges.append(df_edges_x,
322 | ignore_index=True)
323 | settings.pbg_params['relations'].append(
324 | {'name': f'r{id_r}',
325 | 'lhs': f'{prefix_P}',
326 | 'rhs': f'{prefix_M}',
327 | 'operator': 'none',
328 | 'weight': 0.2
329 | })
330 | id_r += 1
331 | adata_ori.obs['pbg_id'] = ""
332 | adata_ori.var['pbg_id'] = ""
333 | adata_ori.obs.loc[adata.obs_names, 'pbg_id'] = \
334 | df_peaks.loc[adata.obs_names, 'alias'].copy()
335 | adata_ori.var.loc[adata.var_names, 'pbg_id'] = \
336 | df_motifs.loc[adata.var_names, 'alias'].copy()
337 |
338 | if list_PK is not None:
339 | for adata_ori in list_PK:
340 | if use_top_pcs:
341 | adata = adata_ori[:, adata_ori.var['top_pcs']].copy()
342 | else:
343 | adata = adata_ori.copy()
344 | df_edges_x = pd.DataFrame(columns=col_names)
345 | df_edges_x['source'] = df_peaks.loc[
346 | adata.obs_names[adata.X.nonzero()[0]],
347 | 'alias'].values
348 | df_edges_x['relation'] = f'r{id_r}'
349 | df_edges_x['destination'] = df_kmers.loc[
350 | adata.var_names[adata.X.nonzero()[1]],
351 | 'alias'].values
352 | print(f'relation{id_r}: '
353 | f'source: {prefix_P}, '
354 | f'destination: {prefix_K}\n'
355 | f'#edges: {df_edges_x.shape[0]}')
356 | dict_graph_stats[f'relation{id_r}'] = \
357 | {'source': prefix_P,
358 | 'destination': prefix_K,
359 | 'n_edges': df_edges_x.shape[0]}
360 | df_edges = df_edges.append(df_edges_x,
361 | ignore_index=True)
362 | settings.pbg_params['relations'].append(
363 | {'name': f'r{id_r}',
364 | 'lhs': f'{prefix_P}',
365 | 'rhs': f'{prefix_K}',
366 | 'operator': 'none',
367 | 'weight': 0.02
368 | })
369 | id_r += 1
370 | adata_ori.obs['pbg_id'] = ""
371 | adata_ori.var['pbg_id'] = ""
372 | adata_ori.obs.loc[adata.obs_names, 'pbg_id'] = \
373 | df_peaks.loc[adata.obs_names, 'alias'].copy()
374 | adata_ori.var.loc[adata.var_names, 'pbg_id'] = \
375 | df_kmers.loc[adata.var_names, 'alias'].copy()
376 |
377 | if list_CG is not None:
378 | for adata_ori in list_CG:
379 | if use_highly_variable:
380 | adata = adata_ori[:, adata_ori.var['highly_variable']].copy()
381 | else:
382 | adata = adata_ori.copy()
383 | # select reference of cells
384 | for key, df_cells in dict_df_cells.items():
385 | if set(adata.obs_names) <= set(df_cells.index):
386 | break
387 | expr_level = np.unique(adata.layers['disc'].data)
388 | expr_weight = np.linspace(start=1, stop=5, num=len(expr_level))
389 | for i_lvl, lvl in enumerate(expr_level):
390 | df_edges_x = pd.DataFrame(columns=col_names)
391 | df_edges_x['source'] = df_cells.loc[
392 | adata.obs_names[(adata.layers['disc'] == lvl)
393 | .astype(int).nonzero()[0]],
394 | 'alias'].values
395 | df_edges_x['relation'] = f'r{id_r}'
396 | df_edges_x['destination'] = df_genes.loc[
397 | adata.var_names[(adata.layers['disc'] == lvl)
398 | .astype(int).nonzero()[1]],
399 | 'alias'].values
400 | print(f'relation{id_r}: '
401 | f'source: {key}, '
402 | f'destination: {prefix_G}\n'
403 | f'#edges: {df_edges_x.shape[0]}')
404 | dict_graph_stats[f'relation{id_r}'] = \
405 | {'source': key,
406 | 'destination': prefix_G,
407 | 'n_edges': df_edges_x.shape[0]}
408 | df_edges = df_edges.append(df_edges_x,
409 | ignore_index=True)
410 | settings.pbg_params['relations'].append(
411 | {'name': f'r{id_r}',
412 | 'lhs': f'{key}',
413 | 'rhs': f'{prefix_G}',
414 | 'operator': 'none',
415 | 'weight': round(expr_weight[i_lvl], 2),
416 | })
417 | id_r += 1
418 | adata_ori.obs['pbg_id'] = ""
419 | adata_ori.var['pbg_id'] = ""
420 | adata_ori.obs.loc[adata.obs_names, 'pbg_id'] = \
421 | df_cells.loc[adata.obs_names, 'alias'].copy()
422 | adata_ori.var.loc[adata.var_names, 'pbg_id'] = \
423 | df_genes.loc[adata.var_names, 'alias'].copy()
424 |
425 | if list_CC is not None:
426 | for adata in list_CC:
427 | # select reference of cells
428 | for key_obs, df_cells_obs in dict_df_cells.items():
429 | if set(adata.obs_names) <= set(df_cells_obs.index):
430 | break
431 | for key_var, df_cells_var in dict_df_cells.items():
432 | if set(adata.var_names) <= set(df_cells_var.index):
433 | break
434 | # edges between ref and query
435 | df_edges_x = pd.DataFrame(columns=col_names)
436 | df_edges_x['source'] = df_cells_obs.loc[
437 | adata.obs_names[adata.layers['conn'].nonzero()[0]],
438 | 'alias'].values
439 | df_edges_x['relation'] = f'r{id_r}'
440 | df_edges_x['destination'] = df_cells_var.loc[
441 | adata.var_names[adata.layers['conn'].nonzero()[1]],
442 | 'alias'].values
443 | print(f'relation{id_r}: '
444 | f'source: {key_obs}, '
445 | f'destination: {key_var}\n'
446 | f'#edges: {df_edges_x.shape[0]}')
447 | dict_graph_stats[f'relation{id_r}'] = \
448 | {'source': key_obs,
449 | 'destination': key_var,
450 | 'n_edges': df_edges_x.shape[0]}
451 | df_edges = df_edges.append(df_edges_x,
452 | ignore_index=True)
453 | settings.pbg_params['relations'].append(
454 | {'name': f'r{id_r}',
455 | 'lhs': f'{key_obs}',
456 | 'rhs': f'{key_var}',
457 | 'operator': 'none',
458 | 'weight': 10.0
459 | })
460 | id_r += 1
461 |
462 | # # edges within ref
463 | # df_edges_x = pd.DataFrame(columns=col_names)
464 | # df_edges_x['source'] = df_cells_obs.loc[
465 | # adata.obs_names[adata.obsp['conn'].nonzero()[0]],
466 | # 'alias'].values
467 | # df_edges_x['relation'] = f'r{id_r}'
468 | # df_edges_x['destination'] = df_cells_obs.loc[
469 | # adata.obs_names[adata.obsp['conn'].nonzero()[1]],
470 | # 'alias'].values
471 | # print(f'relation{id_r}: '
472 | # f'source: {key_obs}, '
473 | # f'destination: {key_obs}\n'
474 | # f'#edges: {df_edges_x.shape[0]}')
475 | # dict_graph_stats[f'relation{id_r}'] = \
476 | # {'source': key_obs,
477 | # 'destination': key_obs,
478 | # 'n_edges': df_edges_x.shape[0]}
479 | # df_edges = df_edges.append(df_edges_x,
480 | # ignore_index=True)
481 | # settings.pbg_params['relations'].append(
482 | # {'name': f'r{id_r}',
483 | # 'lhs': f'{key_obs}',
484 | # 'rhs': f'{key_obs}',
485 | # 'operator': 'none',
486 | # 'weight': 1.0
487 | # })
488 | # id_r += 1
489 |
490 | # # edges within query
491 | # df_edges_x = pd.DataFrame(columns=col_names)
492 | # df_edges_x['source'] = df_cells_var.loc[
493 | # adata.var_names[adata.varp['conn'].nonzero()[0]],
494 | # 'alias'].values
495 | # df_edges_x['relation'] = f'r{id_r}'
496 | # df_edges_x['destination'] = df_cells_var.loc[
497 | # adata.var_names[adata.varp['conn'].nonzero()[1]],
498 | # 'alias'].values
499 | # print(f'relation{id_r}: '
500 | # f'source: {key_var}, '
501 | # f'destination: {key_var}\n'
502 | # f'#edges: {df_edges_x.shape[0]}')
503 | # dict_graph_stats[f'relation{id_r}'] = \
504 | # {'source': key_var,
505 | # 'destination': key_var,
506 | # 'n_edges': df_edges_x.shape[0]}
507 | # df_edges = df_edges.append(df_edges_x,
508 | # ignore_index=True)
509 | # settings.pbg_params['relations'].append(
510 | # {'name': f'r{id_r}',
511 | # 'lhs': f'{key_var}',
512 | # 'rhs': f'{key_var}',
513 | # 'operator': 'none',
514 | # 'weight': 1.0
515 | # })
516 | # id_r += 1
517 |
518 | adata.obs['pbg_id'] = df_cells_obs.loc[adata.obs_names,
519 | 'alias'].copy()
520 | adata.var['pbg_id'] = df_cells_var.loc[adata.var_names,
521 | 'alias'].copy()
522 |
523 | print(f'Total number of edges: {df_edges.shape[0]}')
524 | dict_graph_stats['n_edges'] = df_edges.shape[0]
525 | settings.graph_stats[dirname] = dict_graph_stats
526 |
527 | print(f'Writing graph file "pbg_graph.txt" to "{filepath}" ...')
528 | df_edges.to_csv(os.path.join(filepath, "pbg_graph.txt"),
529 | header=False,
530 | index=False,
531 | sep='\t')
532 | entity_alias.to_csv(os.path.join(filepath, 'entity_alias.txt'),
533 | header=True,
534 | index=True,
535 | sep='\t')
536 | with open(os.path.join(filepath, 'graph_stats.json'), 'w') as fp:
537 | json.dump(dict_graph_stats,
538 | fp,
539 | sort_keys=True,
540 | indent=4,
541 | separators=(',', ': '))
542 | print("Finished.")
543 | if copy:
544 | return df_edges
545 | else:
546 | return None
547 |
548 |
def pbg_train(dirname=None,
              pbg_params=None,
              output='model',
              auto_wd=True,
              save_wd=False):
    """Train a PBG (PyTorch-BigGraph) model on a previously generated graph.

    Parameters
    ----------
    dirname: `str`, optional (default: None)
        The name of the directory in which graph is stored.
        If None, it will be inferred from `pbg_params['entity_path']`
    pbg_params: `dict`, optional (default: None)
        Configuration for pbg training.
        If specified, it will be used instead of the default setting
    output: `str`, optional (default: 'model')
        The name of the directory where training output will be written to.
        It overrides `pbg_params` if `checkpoint_path` is specified in it
    auto_wd: `bool`, optional (default: True)
        If True, it will override `pbg_params['wd']` with a new weight decay
        estimated based on training sample size.
        Recommended for relatively small training sample size (<1e7)
    save_wd: `bool`, optional (default: False)
        If True, estimated `wd` will be saved to `settings.pbg_params['wd']`

    Returns
    -------
    updates `settings.pbg_params` with the following parameter
    checkpoint_path:
        The path to the directory where checkpoints (and thus the output)
        will be written to.
        If checkpoints are found in it, training will resume from them.
    """
    if pbg_params is None:
        pbg_params = settings.pbg_params.copy()
    else:
        assert isinstance(pbg_params, dict), \
            "`pbg_params` must be dict"

    # Resolve the graph directory either from the explicit `dirname`
    # or from the entity path recorded in the training configuration.
    if dirname is not None:
        filepath = os.path.join(settings.workdir, 'pbg', dirname)
    else:
        filepath = Path(pbg_params['entity_path']).parent.parent.as_posix()

    checkpoint_path = os.path.join(filepath, output)
    pbg_params['checkpoint_path'] = checkpoint_path
    settings.pbg_params['checkpoint_path'] = checkpoint_path

    if auto_wd:
        # Empirical numbers from simulation experiments: scale a known-good
        # weight decay inversely with the number of training edges.
        n_edges = settings.graph_stats[os.path.basename(filepath)]['n_edges']
        if n_edges < 5e7:
            # optimal wd (0.013) for sample size (2725781)
            wd = np.around(0.013 * 2725781 / n_edges, decimals=6)
        else:
            # optimal wd (0.0004) for sample size (59103481)
            wd = np.around(0.0004 * 59103481 / n_edges, decimals=6)
        print(f'Auto-estimated weight decay is {wd}')
        pbg_params['wd'] = wd
        if save_wd:
            settings.pbg_params['wd'] = pbg_params['wd']
            print(f"`.settings.pbg_params['wd']` has been updated to {wd}")

    # Avoid oversubscription issues in workloads
    # that involve nested parallelism.
    os.environ["OMP_NUM_THREADS"] = "1"

    loader = ConfigFileLoader()
    config = loader.load_config_simba(pbg_params)
    set_logging_verbosity(config.verbose)

    input_edge_paths = [Path(os.path.join(filepath, "pbg_graph.txt"))]
    print("Converting input data ...")
    convert_input_data(
        config.entities,
        config.relations,
        config.entity_path,
        config.edge_paths,
        input_edge_paths,
        TSVEdgelistReader(lhs_col=0, rhs_col=2, rel_col=1),
        dynamic_relations=config.dynamic_relations,
    )

    subprocess_init = SubprocessInitializer()
    subprocess_init.register(setup_logging, config.verbose)
    subprocess_init.register(add_to_sys_path, loader.config_dir.name)

    train_config = attr.evolve(config, edge_paths=config.edge_paths)
    print("Starting training ...")
    train(train_config, subprocess_init=subprocess_init)
    print("Finished")
647 |
--------------------------------------------------------------------------------
/simba/tools/_umap.py:
--------------------------------------------------------------------------------
1 | """UMAP (Uniform Manifold Approximation and Projection)"""
2 |
3 | import umap as umap_learn
4 |
5 |
def umap(adata,
         n_neighbors=15,
         n_components=2,
         random_state=2020,
         layer=None,
         obsm=None,
         n_dim=None,
         **kwargs,
         ):
    """perform UMAP

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    n_neighbors: `int`, optional (default: 15)
        The size of local neighborhood for UMAP
    n_components: `int`, optional (default: 2)
        The dimension of the space to embed into for UMAP
    random_state: `int`, optional (default: 2020)
        The seed used by the random number generator for UMAP
    layer: `str`, optional (default: None)
        The layer used to perform UMAP.
        Only one of `layer` and `obsm` may be specified.
    obsm: `str`, optional (default: None)
        The multi-dimensional annotation of observations used to perform UMAP.
        Only one of `layer` and `obsm` may be specified.
    n_dim: `int`, optional (default: None)
        The number of dimensions used in `layer` or `obsm`
    kwargs:
        Other keyword arguments are passed down to `umap_learn.UMAP`

    Returns
    -------
    updates `adata` with the following fields:
    `.obsm['X_umap']` : `array`
        UMAP coordinates of samples.

    Raises
    ------
    ValueError
        If both `layer` and `obsm` are specified.
    """

    if layer is not None and obsm is not None:
        raise ValueError("Only one of `layer` and `obsm` can be used")
    if obsm is not None:
        X = adata.obsm[obsm]
    elif layer is not None:
        X = adata.layers[layer]
    else:
        X = adata.X
    if n_dim is not None:
        # keep only the leading dimensions (e.g. top components)
        X = X[:, :n_dim]
    reducer = umap_learn.UMAP(n_neighbors=n_neighbors,
                              n_components=n_components,
                              random_state=random_state,
                              **kwargs)
    reducer.fit(X)
    adata.obsm['X_umap'] = reducer.embedding_
59 |
--------------------------------------------------------------------------------
/simba/tools/_utils.py:
--------------------------------------------------------------------------------
1 | """Utility functions and classes"""
2 |
3 | import numpy as np
4 | from sklearn.neighbors import KDTree
5 | from scipy.sparse import csr_matrix
6 |
7 |
8 | def _uniquify(seq, sep='-'):
9 | """Uniquify a list of strings.
10 |
11 | Adding unique numbers to duplicate values.
12 |
13 | Parameters
14 | ----------
15 | seq : `list` or `array-like`
16 | A list of values
17 | sep : `str`
18 | Separator
19 |
20 | Returns
21 | -------
22 | seq: `list` or `array-like`
23 | A list of updated values
24 | """
25 |
26 | dups = {}
27 |
28 | for i, val in enumerate(seq):
29 | if val not in dups:
30 | # Store index of first occurrence and occurrence value
31 | dups[val] = [i, 1]
32 | else:
33 | # Increment occurrence value, index value doesn't matter anymore
34 | dups[val][1] += 1
35 |
36 | # Use stored occurrence value
37 | seq[i] += (sep+str(dups[val][1]))
38 |
39 | return(seq)
40 |
41 |
42 | def _gini(array):
43 | """Calculate the Gini coefficient of a numpy array.
44 | """
45 |
46 | array = array.flatten().astype(float)
47 | if np.amin(array) < 0:
48 | # Values cannot be negative:
49 | array -= np.amin(array)
50 | # Values cannot be 0:
51 | array += 0.0000001
52 | # Values must be sorted:
53 | array = np.sort(array)
54 | # Index per array element:
55 | index = np.arange(1, array.shape[0]+1)
56 | # Number of array elements:
57 | n = array.shape[0]
58 | # Gini coefficient:
59 | return ((np.sum((2 * index - n - 1) * array)) / (n * np.sum(array)))
60 |
61 |
def _knn(X_ref,
         X_query=None,
         k=20,
         leaf_size=40,
         metric='euclidean'):
    """Calculate K nearest neighbors of each query row within `X_ref`.

    Returns two sparse matrices of shape (n_ref, n_query):
    binary connectivities and the corresponding distances.
    Note that when `X_query` equals `X_ref`, each point's own
    self-neighbor is included.
    """
    if X_query is None:
        X_query = X_ref.copy()
    tree = KDTree(X_ref, leaf_size=leaf_size, metric=metric)
    dist, ind = tree.query(X_query, k=k, return_distance=True)
    n_query, n_nbrs = ind.shape
    # One (row, col) pair per query-neighbor edge
    rows = np.repeat(np.arange(n_query), n_nbrs)
    cols = ind.flatten()
    shape = (n_query, X_ref.shape[0])
    mat_conn_ref_query = csr_matrix(
        (np.repeat(1, len(rows)), (rows, cols)), shape=shape).T
    mat_dist_ref_query = csr_matrix(
        (dist.flatten(), (rows, cols)), shape=shape).T
    return mat_conn_ref_query, mat_dist_ref_query
86 |
--------------------------------------------------------------------------------
/tests/data/10xpbmc_atac_subset.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pinellolab/simba/7b25fd089873aba9580f9923f2e412d375de9a46/tests/data/10xpbmc_atac_subset.h5ad
--------------------------------------------------------------------------------
/tests/data/10xpbmc_rna_subset.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pinellolab/simba/7b25fd089873aba9580f9923f2e412d375de9a46/tests/data/10xpbmc_rna_subset.h5ad
--------------------------------------------------------------------------------
/tests/data/pbg_training/entity_alias.txt:
--------------------------------------------------------------------------------
1 | alias
2 | GCACCTAAGTTAGTGC-1_rna C.0
3 | TCTCCTCGTGGAGCAA-1_rna C.1
4 | GTTCTTGTCTTGACCC-1_rna C.2
5 | CGCGATTCAGGATGGC-1_rna C.3
6 | CCTACTTCAGAATGAC-1_rna C.4
7 | TGAAGGATCCTTTACG-1_rna C.5
8 | AATCCGTAGCACTAAC-1_rna C.6
9 | GCTTAAATCGGCCATA-1_rna C.7
10 | GGGCTAACACCTAAGC-1_rna C.8
11 | TAGCCTGAGGTCTTGG-1_rna C.9
12 | ACCTTGCTCGTTAGCG-1_rna C.10
13 | CTAGGCGGTAGACAAA-1_rna C.11
14 | CGATTTGCATTGCGTA-1_rna C.12
15 | CGTGAGGAGGAGCAAC-1_rna C.13
16 | GGGTCAACACATAGCC-1_rna C.14
17 | ACTCAGTAGTAGGATG-1_rna C.15
18 | ATTACCCGTAGGTTAT-1_rna C.16
19 | ATTAGGTGTTTGGCGG-1_rna C.17
20 | ACCAAACTCATTATGG-1_rna C.18
21 | AGCTTGGTCGCTAGTG-1_rna C.19
22 | TGTGGAGCAACCTGGT-1_rna C.20
23 | TTTAGCTTCCTTAAGA-1_rna C.21
24 | GTTTAACCATAATCCG-1_rna C.22
25 | CGTATTGCAGCTAATT-1_rna C.23
26 | ACATAGCTCCCTGACT-1_rna C.24
27 | GTTAAACGTTTCCACG-1_rna C.25
28 | TTAACTGAGTATTGTG-1_rna C.26
29 | GGCTATGTCCCTGGTT-1_rna C.27
30 | AGGTCATTCTAACCAA-1_rna C.28
31 | CAGCCTTTCTCACAAA-1_rna C.29
32 | TCAGTAGGTAGGTTAT-1_rna C.30
33 | TGAAGTGAGGAAGTAT-1_rna C.31
34 | GCTGGTTCAATTAAGG-1_rna C.32
35 | TGAGCCGGTGCACGCA-1_rna C.33
36 | CGCTTAACAGCCGCTA-1_rna C.34
37 | GTGCACGGTTGTAAAC-1_rna C.35
38 | CTGTTAAAGAATGACG-1_rna C.36
39 | TGTAAGCTCTTAGGAC-1_rna C.37
40 | GAGCGAAGTTCGGGAT-1_rna C.38
41 | GATTCATCATAATGTC-1_rna C.39
42 | TTCCTCAAGGTTTGAC-1_rna C.40
43 | AGGTACGCAGCCTTGG-1_rna C.41
44 | CATTTGTTCGCACACA-1_rna C.42
45 | CCGTTAACAATCCTGA-1_rna C.43
46 | GGCCTCTGTCCTCCAA-1_rna C.44
47 | GCTGTGATCAATCTCT-1_rna C.45
48 | GGGAATATCTTAATGG-1_rna C.46
49 | AGGTGAGGTGGATTCA-1_rna C.47
50 | TGCACTTGTTTACGTC-1_rna C.48
51 | GTTCGCTTCCGTTAAA-1_rna C.49
52 | CCAAATCAGCGGTTAT-1_rna C.50
53 | GGCCTAATCCCTGTTA-1_rna C.51
54 | GGGCCTAGTGTCCAGG-1_rna C.52
55 | CGGACCTAGTCACTCC-1_rna C.53
56 | ATATGGTGTCAGGAAG-1_rna C.54
57 | CGCTTACTCCTAATTC-1_rna C.55
58 | AAAGCTTGTCGACTAA-1_rna C.56
59 | AAGCATGAGGCCTAAT-1_rna C.57
60 | GTTACAGGTAGGTTAT-1_rna C.58
61 | CCATAATCATGCTATG-1_rna C.59
62 | CACCTCAGTTTGCGAA-1_rna C.60
63 | ATTGCGCCACTAGCGT-1_rna C.61
64 | CCTACTGGTGCCGCAA-1_rna C.62
65 | GCAATAGAGGCGGATG-1_rna C.63
66 | GCGCGATTCCTCCCTC-1_rna C.64
67 | TAGGTTATCTCGACCT-1_rna C.65
68 | TGCTTGCTCATGAGCT-1_rna C.66
69 | GACTCACCAGTAATAG-1_rna C.67
70 | TTCCCGCCAATAACGA-1_rna C.68
71 | CGAACCGGTAGCCATA-1_rna C.69
72 | ACCAAGCGTCATAAGT-1_rna C.70
73 | CGCCAAATCCCAGTAG-1_rna C.71
74 | ACTCACTGTTTGGTTC-1_rna C.72
75 | AGAAGGTGTAGGTTGC-1_rna C.73
76 | ATTTGCGCAGGCTTGT-1_rna C.74
77 | TAGTTGTCATCGCTCC-1_rna C.75
78 | GGACAGCCAGATTCAT-1_rna C.76
79 | TTGAGCTAGCTTACTT-1_rna C.77
80 | CATTTGTTCTAAATCG-1_rna C.78
81 | TAGGAGTCACTGACCG-1_rna C.79
82 | ATCAAGCTCGGGATTT-1_rna C.80
83 | TCTCGCCCAACCTGGT-1_rna C.81
84 | GCGGTTATCCTGATGG-1_rna C.82
85 | GATTGCAGTGGAGCAA-1_rna C.83
86 | CGATTCCTCTTGCTAT-1_rna C.84
87 | GCTCATTGTTCACCAT-1_rna C.85
88 | CAAACGCGTTTCGCGC-1_rna C.86
89 | TCTTAGCGTCCGTGAG-1_rna C.87
90 | GCGCGATTCCTTGAGG-1_rna C.88
91 | ATAGGTACAGGTCCTG-1_rna C.89
92 | TCCTTAGTCCTGAGTG-1_rna C.90
93 | TTCCCACAGCCAAATC-1_rna C.91
94 | GGGTGAAGTGCATCGG-1_rna C.92
95 | CGGATAAAGTAGAGGC-1_rna C.93
96 | AGGATGTCACAAAGAC-1_rna C.94
97 | CTTCTCAAGGGTGGAT-1_rna C.95
98 | AACCCGCAGCGGATTT-1_rna C.96
99 | GGACATAAGGGATGCG-1_rna C.97
100 | TACAAGCTCTGTGAGT-1_rna C.98
101 | CGTATTGCATTCAGCA-1_rna C.99
102 | DPH3 G.0
103 | BICD1 G.1
104 | MAML3 G.2
105 | TTN-AS1 G.3
106 | APPL2 G.4
107 | HLX G.5
108 | CHIC1 G.6
109 | DDX39B G.7
110 | SRC G.8
111 | VAPB G.9
112 | RPS10-NUDT3 G.10
113 | POLR2A G.11
114 | AC007262.2 G.12
115 | CCND2 G.13
116 | PTCD3 G.14
117 | TNFRSF10A G.15
118 | POLR3GL G.16
119 | NNT G.17
120 | IL26 G.18
121 | RPL10 G.19
122 | UHRF1BP1L G.20
123 | AC124014.1 G.21
124 | ELOVL1 G.22
125 | SGPL1 G.23
126 | USP42 G.24
127 | ATF7IP2 G.25
128 | METTL22 G.26
129 | HSCB G.27
130 | PCTP G.28
131 | FAM174B G.29
132 | TMEM184B G.30
133 | SERF2 G.31
134 | KIAA0930 G.32
135 | GNAQ G.33
136 | SCFD1 G.34
137 | UBE2R2 G.35
138 | ARL5B G.36
139 | FRMD4A G.37
140 | EML5 G.38
141 | FAM3A G.39
142 | ARHGAP22 G.40
143 | KXD1 G.41
144 | A1BG G.42
145 | C4orf3 G.43
146 | FAM153CP G.44
147 | PPP1R9A G.45
148 | IQGAP2 G.46
149 | ACTG1 G.47
150 | GPLD1 G.48
151 | SIRPG G.49
152 | CALML4 G.50
153 | IAH1 G.51
154 | LAT2 G.52
155 | AAAS G.53
156 |
--------------------------------------------------------------------------------
/tests/data/pbg_training/graph_stats.json:
--------------------------------------------------------------------------------
1 | {
2 | "n_edges": 1075,
3 | "relation0": {
4 | "destination": "G",
5 | "n_edges": 153,
6 | "source": "C"
7 | },
8 | "relation1": {
9 | "destination": "G",
10 | "n_edges": 369,
11 | "source": "C"
12 | },
13 | "relation2": {
14 | "destination": "G",
15 | "n_edges": 301,
16 | "source": "C"
17 | },
18 | "relation3": {
19 | "destination": "G",
20 | "n_edges": 166,
21 | "source": "C"
22 | },
23 | "relation4": {
24 | "destination": "G",
25 | "n_edges": 86,
26 | "source": "C"
27 | }
28 | }
--------------------------------------------------------------------------------
/tests/data/pbg_training/input/entity/entity_count_C_0.txt:
--------------------------------------------------------------------------------
1 | 100
2 |
--------------------------------------------------------------------------------
/tests/data/pbg_training/input/entity/entity_count_G_0.txt:
--------------------------------------------------------------------------------
1 | 54
2 |
--------------------------------------------------------------------------------
/tests/data/pbg_training/input/entity/entity_names_C_0.json:
--------------------------------------------------------------------------------
1 | [
2 | "C.3",
3 | "C.73",
4 | "C.5",
5 | "C.93",
6 | "C.58",
7 | "C.38",
8 | "C.14",
9 | "C.24",
10 | "C.35",
11 | "C.60",
12 | "C.70",
13 | "C.64",
14 | "C.72",
15 | "C.68",
16 | "C.79",
17 | "C.12",
18 | "C.52",
19 | "C.81",
20 | "C.83",
21 | "C.87",
22 | "C.48",
23 | "C.91",
24 | "C.11",
25 | "C.33",
26 | "C.77",
27 | "C.88",
28 | "C.9",
29 | "C.0",
30 | "C.39",
31 | "C.28",
32 | "C.36",
33 | "C.75",
34 | "C.92",
35 | "C.85",
36 | "C.10",
37 | "C.67",
38 | "C.20",
39 | "C.37",
40 | "C.46",
41 | "C.7",
42 | "C.53",
43 | "C.44",
44 | "C.23",
45 | "C.4",
46 | "C.42",
47 | "C.8",
48 | "C.50",
49 | "C.90",
50 | "C.1",
51 | "C.76",
52 | "C.61",
53 | "C.6",
54 | "C.56",
55 | "C.13",
56 | "C.89",
57 | "C.41",
58 | "C.25",
59 | "C.62",
60 | "C.84",
61 | "C.15",
62 | "C.40",
63 | "C.55",
64 | "C.96",
65 | "C.65",
66 | "C.86",
67 | "C.69",
68 | "C.98",
69 | "C.17",
70 | "C.94",
71 | "C.97",
72 | "C.18",
73 | "C.54",
74 | "C.19",
75 | "C.59",
76 | "C.49",
77 | "C.34",
78 | "C.26",
79 | "C.2",
80 | "C.95",
81 | "C.47",
82 | "C.66",
83 | "C.45",
84 | "C.51",
85 | "C.82",
86 | "C.22",
87 | "C.21",
88 | "C.57",
89 | "C.71",
90 | "C.43",
91 | "C.99",
92 | "C.27",
93 | "C.30",
94 | "C.32",
95 | "C.29",
96 | "C.16",
97 | "C.80",
98 | "C.63",
99 | "C.74",
100 | "C.31",
101 | "C.78"
102 | ]
--------------------------------------------------------------------------------
/tests/data/pbg_training/input/entity/entity_names_G_0.json:
--------------------------------------------------------------------------------
1 | [
2 | "G.34",
3 | "G.35",
4 | "G.19",
5 | "G.9",
6 | "G.11",
7 | "G.8",
8 | "G.37",
9 | "G.22",
10 | "G.48",
11 | "G.29",
12 | "G.18",
13 | "G.26",
14 | "G.23",
15 | "G.20",
16 | "G.2",
17 | "G.28",
18 | "G.13",
19 | "G.46",
20 | "G.25",
21 | "G.4",
22 | "G.52",
23 | "G.3",
24 | "G.17",
25 | "G.30",
26 | "G.36",
27 | "G.51",
28 | "G.7",
29 | "G.24",
30 | "G.53",
31 | "G.12",
32 | "G.39",
33 | "G.15",
34 | "G.16",
35 | "G.6",
36 | "G.5",
37 | "G.40",
38 | "G.38",
39 | "G.33",
40 | "G.0",
41 | "G.31",
42 | "G.27",
43 | "G.32",
44 | "G.45",
45 | "G.14",
46 | "G.47",
47 | "G.21",
48 | "G.44",
49 | "G.50",
50 | "G.43",
51 | "G.10",
52 | "G.1",
53 | "G.42",
54 | "G.41",
55 | "G.49"
56 | ]
--------------------------------------------------------------------------------
/tests/data/pbg_training/model/checkpoint_version.txt:
--------------------------------------------------------------------------------
1 | 10
2 |
--------------------------------------------------------------------------------
/tests/data/pbg_training/model/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "entities": {
3 | "C": {
4 | "num_partitions": 1,
5 | "featurized": false,
6 | "dimension": null
7 | },
8 | "G": {
9 | "num_partitions": 1,
10 | "featurized": false,
11 | "dimension": null
12 | }
13 | },
14 | "relations": [
15 | {
16 | "name": "r0",
17 | "lhs": "C",
18 | "rhs": "G",
19 | "weight": 1.0,
20 | "operator": "none",
21 | "all_negs": false
22 | },
23 | {
24 | "name": "r1",
25 | "lhs": "C",
26 | "rhs": "G",
27 | "weight": 2.0,
28 | "operator": "none",
29 | "all_negs": false
30 | },
31 | {
32 | "name": "r2",
33 | "lhs": "C",
34 | "rhs": "G",
35 | "weight": 3.0,
36 | "operator": "none",
37 | "all_negs": false
38 | },
39 | {
40 | "name": "r3",
41 | "lhs": "C",
42 | "rhs": "G",
43 | "weight": 4.0,
44 | "operator": "none",
45 | "all_negs": false
46 | },
47 | {
48 | "name": "r4",
49 | "lhs": "C",
50 | "rhs": "G",
51 | "weight": 5.0,
52 | "operator": "none",
53 | "all_negs": false
54 | }
55 | ],
56 | "dimension": 50,
57 | "init_scale": 0.001,
58 | "max_norm": null,
59 | "global_emb": false,
60 | "comparator": "dot",
61 | "bias": false,
62 | "loss_fn": "softmax",
63 | "margin": 0.1,
64 | "regularization_coef": 0.0,
65 | "regularizer": "N3",
66 | "wd": 32.962933,
67 | "wd_interval": 50,
68 | "entity_path": "./result_simba/pbg/graph0/input/entity",
69 | "edge_paths": [
70 | "./result_simba/pbg/graph0/input/edge"
71 | ],
72 | "checkpoint_path": "result_simba/pbg/graph0/model",
73 | "init_path": null,
74 | "checkpoint_preservation_interval": null,
75 | "num_epochs": 10,
76 | "num_edge_chunks": null,
77 | "max_edges_per_chunk": 1000000000,
78 | "bucket_order": "inside_out",
79 | "workers": 12,
80 | "batch_size": 1000,
81 | "num_batch_negs": 50,
82 | "num_uniform_negs": 50,
83 | "disable_lhs_negs": false,
84 | "disable_rhs_negs": false,
85 | "lr": 0.1,
86 | "relation_lr": null,
87 | "eval_fraction": 0.05,
88 | "eval_num_batch_negs": 50,
89 | "eval_num_uniform_negs": 50,
90 | "early_stopping": false,
91 | "background_io": false,
92 | "verbose": 0,
93 | "hogwild_delay": 2.0,
94 | "dynamic_relations": false,
95 | "num_machines": 1,
96 | "num_partition_servers": -1,
97 | "distributed_init_method": null,
98 | "distributed_tree_init_order": true,
99 | "num_gpus": 0,
100 | "num_groups_for_partition_server": 16,
101 | "half_precision": false
102 | }
--------------------------------------------------------------------------------
/tests/data/pbg_training/model/embeddings_C_0.v10.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pinellolab/simba/7b25fd089873aba9580f9923f2e412d375de9a46/tests/data/pbg_training/model/embeddings_C_0.v10.h5
--------------------------------------------------------------------------------
/tests/data/pbg_training/model/embeddings_G_0.v10.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pinellolab/simba/7b25fd089873aba9580f9923f2e412d375de9a46/tests/data/pbg_training/model/embeddings_G_0.v10.h5
--------------------------------------------------------------------------------
/tests/data/pbg_training/model/model.v10.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pinellolab/simba/7b25fd089873aba9580f9923f2e412d375de9a46/tests/data/pbg_training/model/model.v10.h5
--------------------------------------------------------------------------------
/tests/data/pbg_training/model/training_stats.json:
--------------------------------------------------------------------------------
1 | {"lhs_partition": 0, "rhs_partition": 0, "index": 1, "stats": {"count": 1022, "metrics": {"loss": 23.252048253546487, "reg": 0.0, "violators_lhs": 36.36497064579256, "violators_rhs": 31.131115459882583}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.4885098259404, "pos_rank": 26.50943396226415, "mrr": 0.08514296270485194, "r1": 0.009433962264150943, "r10": 0.24528301886792453, "r50": 0.9339622641509434, "auc": 0.5094339645133829}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.550433608720887, "pos_rank": 24.358490566037737, "mrr": 0.0786743724437536, "r1": 0.009433962264150943, "r10": 0.18867924528301888, "r50": 0.9433962264150944, "auc": 0.5094339647945368}}, "epoch_idx": 0, "edge_path_idx": 0, "edge_chunk_idx": 0}
2 | {"epoch_idx": 0, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 0, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.550433608720887, "pos_rank": 24.358490566037737, "mrr": 0.0786743724437536, "r1": 0.009433962264150943, "r10": 0.18867924528301888, "r50": 0.9433962264150944, "auc": 0.5094339647945368}}}
3 | {"lhs_partition": 0, "rhs_partition": 0, "index": 2, "stats": {"count": 1022, "metrics": {"loss": 22.486315262527615, "reg": 0.0, "violators_lhs": 30.62720156555773, "violators_rhs": 23.104696673189824}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.591775120429272, "pos_rank": 25.566037735849058, "mrr": 0.07682948650897674, "r1": 0.009433962264150943, "r10": 0.18867924528301888, "r50": 0.9716981132075472, "auc": 0.5660377372548265}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.554165813158143, "pos_rank": 24.27358490566038, "mrr": 0.07591504424388679, "r1": 0.0, "r10": 0.20754716981132076, "r50": 0.9811320754716981, "auc": 0.5471698152569106}}, "epoch_idx": 1, "edge_path_idx": 0, "edge_chunk_idx": 0}
4 | {"epoch_idx": 1, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 1, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.554165813158143, "pos_rank": 24.27358490566038, "mrr": 0.07591504424388679, "r1": 0.0, "r10": 0.20754716981132076, "r50": 0.9811320754716981, "auc": 0.5471698152569106}}}
5 | {"lhs_partition": 0, "rhs_partition": 0, "index": 3, "stats": {"count": 1022, "metrics": {"loss": 22.591744126172447, "reg": 0.0, "violators_lhs": 30.874755381604697, "violators_rhs": 23.437377690802347}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.54162585960244, "pos_rank": 24.30188679245283, "mrr": 0.08181398664161844, "r1": 0.018867924528301886, "r10": 0.2169811320754717, "r50": 0.9622641509433962, "auc": 0.5377358521492976}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.498737011315686, "pos_rank": 24.61320754716981, "mrr": 0.07638183331011601, "r1": 0.009433962264150943, "r10": 0.2169811320754717, "r50": 0.9433962264150944, "auc": 0.5660377380982885}}, "epoch_idx": 2, "edge_path_idx": 0, "edge_chunk_idx": 0}
6 | {"epoch_idx": 2, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 2, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.498737011315686, "pos_rank": 24.61320754716981, "mrr": 0.07638183331011601, "r1": 0.009433962264150943, "r10": 0.2169811320754717, "r50": 0.9433962264150944, "auc": 0.5660377380982885}}}
7 | {"lhs_partition": 0, "rhs_partition": 0, "index": 4, "stats": {"count": 1022, "metrics": {"loss": 22.62260271658403, "reg": 0.0, "violators_lhs": 30.645792563600782, "violators_rhs": 23.364970645792564}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.508187914794345, "pos_rank": 24.88679245283019, "mrr": 0.08207970859377452, "r1": 0.009433962264150943, "r10": 0.20754716981132076, "r50": 0.9245283018867925, "auc": 0.4905660402662349}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.506183201411986, "pos_rank": 24.67924528301887, "mrr": 0.0760200916490746, "r1": 0.0, "r10": 0.24528301886792453, "r50": 0.9339622641509434, "auc": 0.5283018881982228}}, "epoch_idx": 3, "edge_path_idx": 0, "edge_chunk_idx": 0}
8 | {"epoch_idx": 3, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 3, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.506183201411986, "pos_rank": 24.67924528301887, "mrr": 0.0760200916490746, "r1": 0.0, "r10": 0.24528301886792453, "r50": 0.9339622641509434, "auc": 0.5283018881982228}}}
9 | {"lhs_partition": 0, "rhs_partition": 0, "index": 5, "stats": {"count": 1022, "metrics": {"loss": 22.690110387634157, "reg": 0.0, "violators_lhs": 30.770058708414872, "violators_rhs": 23.117416829745597}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.526717563845075, "pos_rank": 24.89622641509434, "mrr": 0.07438522921699398, "r1": 0.0, "r10": 0.2358490566037736, "r50": 0.9622641509433962, "auc": 0.5188679262152258}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.4956772462377, "pos_rank": 24.88679245283019, "mrr": 0.0729775528405916, "r1": 0.0, "r10": 0.19811320754716982, "r50": 0.9622641509433962, "auc": 0.5094339650756908}}, "epoch_idx": 4, "edge_path_idx": 0, "edge_chunk_idx": 0}
10 | {"epoch_idx": 4, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 4, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.4956772462377, "pos_rank": 24.88679245283019, "mrr": 0.0729775528405916, "r1": 0.0, "r10": 0.19811320754716982, "r50": 0.9622641509433962, "auc": 0.5094339650756908}}}
11 | {"lhs_partition": 0, "rhs_partition": 0, "index": 6, "stats": {"count": 1022, "metrics": {"loss": 22.763349835420076, "reg": 0.0, "violators_lhs": 31.012720156555773, "violators_rhs": 22.820939334637966}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.493189523804862, "pos_rank": 24.38679245283019, "mrr": 0.0724140086896577, "r1": 0.0, "r10": 0.19811320754716982, "r50": 0.9528301886792453, "auc": 0.500000000843462}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.495307175618297, "pos_rank": 24.735849056603772, "mrr": 0.08363932611877625, "r1": 0.018867924528301886, "r10": 0.18867924528301888, "r50": 0.9245283018867925, "auc": 0.5283018893228387}}, "epoch_idx": 5, "edge_path_idx": 0, "edge_chunk_idx": 0}
12 | {"epoch_idx": 5, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 5, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.495307175618297, "pos_rank": 24.735849056603772, "mrr": 0.08363932611877625, "r1": 0.018867924528301886, "r10": 0.18867924528301888, "r50": 0.9245283018867925, "auc": 0.5283018893228387}}}
13 | {"lhs_partition": 0, "rhs_partition": 0, "index": 7, "stats": {"count": 1022, "metrics": {"loss": 22.76916164241425, "reg": 0.0, "violators_lhs": 32.49412915851272, "violators_rhs": 26.92367906066536}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.49149281123899, "pos_rank": 24.566037735849058, "mrr": 0.0773335443458186, "r1": 0.0, "r10": 0.22641509433962265, "r50": 0.9433962264150944, "auc": 0.5660377392229045}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.490976243648888, "pos_rank": 23.92452830188679, "mrr": 0.08941427604207453, "r1": 0.009433962264150943, "r10": 0.24528301886792453, "r50": 0.9433962264150944, "auc": 0.5283018876359148}}, "epoch_idx": 6, "edge_path_idx": 0, "edge_chunk_idx": 0}
14 | {"epoch_idx": 6, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 6, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.490976243648888, "pos_rank": 23.92452830188679, "mrr": 0.08941427604207453, "r1": 0.009433962264150943, "r10": 0.24528301886792453, "r50": 0.9433962264150944, "auc": 0.5283018876359148}}}
15 | {"lhs_partition": 0, "rhs_partition": 0, "index": 8, "stats": {"count": 1022, "metrics": {"loss": 22.794376134405862, "reg": 0.0, "violators_lhs": 31.71917808219178, "violators_rhs": 23.874755381604697}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.490235013781852, "pos_rank": 23.90566037735849, "mrr": 0.08131686016425209, "r1": 0.009433962264150943, "r10": 0.2641509433962264, "r50": 0.9528301886792453, "auc": 0.6792452849869458}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.488997333454638, "pos_rank": 24.28301886792453, "mrr": 0.09202509533332766, "r1": 0.009433962264150943, "r10": 0.22641509433962265, "r50": 0.9811320754716981, "auc": 0.50000000140577}}, "epoch_idx": 7, "edge_path_idx": 0, "edge_chunk_idx": 0}
16 | {"epoch_idx": 7, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 7, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.488997333454638, "pos_rank": 24.28301886792453, "mrr": 0.09202509533332766, "r1": 0.009433962264150943, "r10": 0.22641509433962265, "r50": 0.9811320754716981, "auc": 0.50000000140577}}}
17 | {"lhs_partition": 0, "rhs_partition": 0, "index": 9, "stats": {"count": 1022, "metrics": {"loss": 22.79033338020459, "reg": 0.0, "violators_lhs": 31.304305283757337, "violators_rhs": 23.480430528375734}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.48922288642739, "pos_rank": 23.849056603773583, "mrr": 0.09737446559768803, "r1": 0.018867924528301886, "r10": 0.25471698113207547, "r50": 0.9622641509433962, "auc": 0.5943396251718953}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.489219827472038, "pos_rank": 24.80188679245283, "mrr": 0.0892462246822861, "r1": 0.009433962264150943, "r10": 0.19811320754716982, "r50": 0.9433962264150944, "auc": 0.5377358524304516}}, "epoch_idx": 8, "edge_path_idx": 0, "edge_chunk_idx": 0}
18 | {"epoch_idx": 8, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 8, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.489219827472038, "pos_rank": 24.80188679245283, "mrr": 0.0892462246822861, "r1": 0.009433962264150943, "r10": 0.19811320754716982, "r50": 0.9433962264150944, "auc": 0.5377358524304516}}}
19 | {"lhs_partition": 0, "rhs_partition": 0, "index": 10, "stats": {"count": 1022, "metrics": {"loss": 22.792577922694136, "reg": 0.0, "violators_lhs": 31.645792563600782, "violators_rhs": 23.681996086105674}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.489368258782154, "pos_rank": 24.528301886792452, "mrr": 0.08778320558650314, "r1": 0.018867924528301886, "r10": 0.24528301886792453, "r50": 0.9433962264150944, "auc": 0.5660377378171345}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.48918512632262, "pos_rank": 24.132075471698112, "mrr": 0.0999531695127206, "r1": 0.018867924528301886, "r10": 0.24528301886792453, "r50": 0.9433962264150944, "auc": 0.5188679267775338}}, "epoch_idx": 9, "edge_path_idx": 0, "edge_chunk_idx": 0}
20 | {"epoch_idx": 9, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 9, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.48918512632262, "pos_rank": 24.132075471698112, "mrr": 0.0999531695127206, "r1": 0.018867924528301886, "r10": 0.24528301886792453, "r50": 0.9433962264150944, "auc": 0.5188679267775338}}}
21 |
--------------------------------------------------------------------------------
/tests/data/pbg_training/pbg_graph.txt:
--------------------------------------------------------------------------------
1 | C.2 r0 G.0
2 | C.2 r0 G.4
3 | C.2 r0 G.14
4 | C.2 r0 G.22
5 | C.2 r0 G.25
6 | C.2 r0 G.43
7 | C.3 r0 G.1
8 | C.3 r0 G.11
9 | C.3 r0 G.13
10 | C.3 r0 G.17
11 | C.3 r0 G.20
12 | C.3 r0 G.24
13 | C.3 r0 G.25
14 | C.3 r0 G.36
15 | C.3 r0 G.44
16 | C.3 r0 G.45
17 | C.4 r0 G.0
18 | C.4 r0 G.5
19 | C.4 r0 G.9
20 | C.4 r0 G.11
21 | C.4 r0 G.26
22 | C.4 r0 G.35
23 | C.4 r0 G.52
24 | C.8 r0 G.2
25 | C.8 r0 G.7
26 | C.8 r0 G.32
27 | C.8 r0 G.40
28 | C.8 r0 G.52
29 | C.16 r0 G.0
30 | C.16 r0 G.3
31 | C.16 r0 G.5
32 | C.16 r0 G.6
33 | C.16 r0 G.7
34 | C.16 r0 G.9
35 | C.16 r0 G.16
36 | C.16 r0 G.23
37 | C.16 r0 G.27
38 | C.16 r0 G.30
39 | C.16 r0 G.32
40 | C.16 r0 G.43
41 | C.16 r0 G.44
42 | C.16 r0 G.46
43 | C.16 r0 G.49
44 | C.16 r0 G.52
45 | C.22 r0 G.6
46 | C.22 r0 G.14
47 | C.22 r0 G.16
48 | C.22 r0 G.17
49 | C.22 r0 G.20
50 | C.22 r0 G.21
51 | C.22 r0 G.24
52 | C.22 r0 G.33
53 | C.22 r0 G.34
54 | C.27 r0 G.4
55 | C.27 r0 G.8
56 | C.27 r0 G.9
57 | C.27 r0 G.13
58 | C.27 r0 G.20
59 | C.27 r0 G.27
60 | C.27 r0 G.30
61 | C.27 r0 G.36
62 | C.30 r0 G.5
63 | C.30 r0 G.7
64 | C.30 r0 G.16
65 | C.30 r0 G.24
66 | C.30 r0 G.26
67 | C.30 r0 G.28
68 | C.30 r0 G.37
69 | C.30 r0 G.40
70 | C.30 r0 G.43
71 | C.45 r0 G.11
72 | C.45 r0 G.12
73 | C.45 r0 G.27
74 | C.45 r0 G.40
75 | C.47 r0 G.9
76 | C.47 r0 G.21
77 | C.47 r0 G.32
78 | C.47 r0 G.34
79 | C.47 r0 G.35
80 | C.47 r0 G.36
81 | C.47 r0 G.41
82 | C.47 r0 G.46
83 | C.52 r0 G.0
84 | C.52 r0 G.3
85 | C.52 r0 G.7
86 | C.52 r0 G.20
87 | C.52 r0 G.24
88 | C.52 r0 G.25
89 | C.52 r0 G.33
90 | C.52 r0 G.35
91 | C.52 r0 G.43
92 | C.52 r0 G.49
93 | C.68 r0 G.1
94 | C.68 r0 G.4
95 | C.68 r0 G.8
96 | C.68 r0 G.17
97 | C.68 r0 G.20
98 | C.68 r0 G.21
99 | C.68 r0 G.23
100 | C.68 r0 G.24
101 | C.68 r0 G.25
102 | C.68 r0 G.34
103 | C.68 r0 G.51
104 | C.72 r0 G.3
105 | C.72 r0 G.13
106 | C.72 r0 G.25
107 | C.72 r0 G.41
108 | C.74 r0 G.6
109 | C.74 r0 G.7
110 | C.74 r0 G.10
111 | C.74 r0 G.13
112 | C.74 r0 G.16
113 | C.74 r0 G.27
114 | C.74 r0 G.32
115 | C.74 r0 G.35
116 | C.74 r0 G.43
117 | C.74 r0 G.52
118 | C.80 r0 G.6
119 | C.80 r0 G.24
120 | C.90 r0 G.7
121 | C.90 r0 G.9
122 | C.90 r0 G.15
123 | C.90 r0 G.16
124 | C.90 r0 G.17
125 | C.90 r0 G.24
126 | C.90 r0 G.25
127 | C.90 r0 G.33
128 | C.90 r0 G.34
129 | C.90 r0 G.35
130 | C.90 r0 G.39
131 | C.91 r0 G.5
132 | C.91 r0 G.9
133 | C.91 r0 G.14
134 | C.91 r0 G.23
135 | C.91 r0 G.37
136 | C.91 r0 G.42
137 | C.95 r0 G.21
138 | C.95 r0 G.29
139 | C.95 r0 G.37
140 | C.98 r0 G.4
141 | C.98 r0 G.7
142 | C.98 r0 G.13
143 | C.98 r0 G.15
144 | C.98 r0 G.23
145 | C.98 r0 G.24
146 | C.98 r0 G.32
147 | C.98 r0 G.35
148 | C.98 r0 G.36
149 | C.98 r0 G.43
150 | C.98 r0 G.46
151 | C.98 r0 G.49
152 | C.98 r0 G.51
153 | C.98 r0 G.52
154 | C.0 r1 G.1
155 | C.0 r1 G.16
156 | C.0 r1 G.38
157 | C.2 r1 G.33
158 | C.2 r1 G.34
159 | C.2 r1 G.41
160 | C.3 r1 G.16
161 | C.3 r1 G.52
162 | C.4 r1 G.7
163 | C.4 r1 G.10
164 | C.4 r1 G.31
165 | C.4 r1 G.32
166 | C.4 r1 G.36
167 | C.4 r1 G.46
168 | C.4 r1 G.47
169 | C.5 r1 G.11
170 | C.5 r1 G.12
171 | C.5 r1 G.19
172 | C.5 r1 G.20
173 | C.5 r1 G.24
174 | C.5 r1 G.34
175 | C.5 r1 G.43
176 | C.6 r1 G.7
177 | C.6 r1 G.10
178 | C.6 r1 G.11
179 | C.6 r1 G.13
180 | C.6 r1 G.14
181 | C.6 r1 G.16
182 | C.6 r1 G.33
183 | C.6 r1 G.46
184 | C.6 r1 G.48
185 | C.7 r1 G.2
186 | C.7 r1 G.10
187 | C.7 r1 G.25
188 | C.7 r1 G.28
189 | C.7 r1 G.33
190 | C.7 r1 G.43
191 | C.7 r1 G.45
192 | C.8 r1 G.34
193 | C.8 r1 G.51
194 | C.14 r1 G.0
195 | C.14 r1 G.7
196 | C.14 r1 G.13
197 | C.14 r1 G.16
198 | C.14 r1 G.33
199 | C.14 r1 G.34
200 | C.14 r1 G.47
201 | C.14 r1 G.53
202 | C.16 r1 G.14
203 | C.16 r1 G.33
204 | C.17 r1 G.4
205 | C.17 r1 G.8
206 | C.17 r1 G.28
207 | C.17 r1 G.34
208 | C.17 r1 G.39
209 | C.17 r1 G.46
210 | C.17 r1 G.53
211 | C.18 r1 G.6
212 | C.18 r1 G.31
213 | C.18 r1 G.43
214 | C.18 r1 G.46
215 | C.18 r1 G.49
216 | C.19 r1 G.1
217 | C.19 r1 G.7
218 | C.19 r1 G.9
219 | C.19 r1 G.34
220 | C.19 r1 G.43
221 | C.19 r1 G.46
222 | C.20 r1 G.17
223 | C.20 r1 G.24
224 | C.20 r1 G.33
225 | C.21 r1 G.0
226 | C.21 r1 G.2
227 | C.21 r1 G.20
228 | C.21 r1 G.30
229 | C.21 r1 G.34
230 | C.21 r1 G.42
231 | C.21 r1 G.51
232 | C.22 r1 G.1
233 | C.22 r1 G.4
234 | C.22 r1 G.7
235 | C.22 r1 G.23
236 | C.22 r1 G.36
237 | C.22 r1 G.46
238 | C.23 r1 G.4
239 | C.23 r1 G.7
240 | C.23 r1 G.9
241 | C.23 r1 G.11
242 | C.23 r1 G.16
243 | C.23 r1 G.41
244 | C.23 r1 G.52
245 | C.26 r1 G.2
246 | C.26 r1 G.14
247 | C.26 r1 G.32
248 | C.26 r1 G.35
249 | C.27 r1 G.24
250 | C.27 r1 G.25
251 | C.27 r1 G.35
252 | C.27 r1 G.47
253 | C.27 r1 G.52
254 | C.29 r1 G.9
255 | C.29 r1 G.11
256 | C.29 r1 G.14
257 | C.29 r1 G.19
258 | C.29 r1 G.20
259 | C.29 r1 G.24
260 | C.29 r1 G.34
261 | C.29 r1 G.35
262 | C.29 r1 G.42
263 | C.29 r1 G.47
264 | C.29 r1 G.52
265 | C.30 r1 G.23
266 | C.30 r1 G.35
267 | C.30 r1 G.47
268 | C.32 r1 G.4
269 | C.32 r1 G.7
270 | C.32 r1 G.11
271 | C.32 r1 G.17
272 | C.32 r1 G.33
273 | C.32 r1 G.46
274 | C.33 r1 G.0
275 | C.33 r1 G.2
276 | C.33 r1 G.8
277 | C.33 r1 G.16
278 | C.33 r1 G.20
279 | C.33 r1 G.24
280 | C.33 r1 G.32
281 | C.33 r1 G.34
282 | C.33 r1 G.43
283 | C.33 r1 G.47
284 | C.33 r1 G.52
285 | C.33 r1 G.53
286 | C.36 r1 G.16
287 | C.36 r1 G.31
288 | C.36 r1 G.51
289 | C.37 r1 G.7
290 | C.37 r1 G.13
291 | C.37 r1 G.17
292 | C.37 r1 G.18
293 | C.37 r1 G.34
294 | C.37 r1 G.35
295 | C.37 r1 G.37
296 | C.37 r1 G.49
297 | C.37 r1 G.50
298 | C.38 r1 G.13
299 | C.38 r1 G.43
300 | C.39 r1 G.2
301 | C.39 r1 G.6
302 | C.39 r1 G.22
303 | C.39 r1 G.39
304 | C.39 r1 G.50
305 | C.40 r1 G.14
306 | C.40 r1 G.15
307 | C.40 r1 G.17
308 | C.40 r1 G.34
309 | C.40 r1 G.39
310 | C.41 r1 G.7
311 | C.41 r1 G.13
312 | C.41 r1 G.31
313 | C.41 r1 G.34
314 | C.41 r1 G.42
315 | C.41 r1 G.46
316 | C.42 r1 G.3
317 | C.42 r1 G.4
318 | C.42 r1 G.9
319 | C.42 r1 G.11
320 | C.42 r1 G.14
321 | C.42 r1 G.20
322 | C.42 r1 G.24
323 | C.42 r1 G.31
324 | C.42 r1 G.46
325 | C.45 r1 G.8
326 | C.45 r1 G.17
327 | C.45 r1 G.31
328 | C.45 r1 G.35
329 | C.45 r1 G.46
330 | C.45 r1 G.47
331 | C.45 r1 G.52
332 | C.47 r1 G.2
333 | C.47 r1 G.52
334 | C.49 r1 G.0
335 | C.49 r1 G.7
336 | C.49 r1 G.27
337 | C.49 r1 G.34
338 | C.49 r1 G.35
339 | C.49 r1 G.36
340 | C.49 r1 G.47
341 | C.50 r1 G.4
342 | C.50 r1 G.8
343 | C.50 r1 G.11
344 | C.50 r1 G.14
345 | C.50 r1 G.16
346 | C.50 r1 G.20
347 | C.50 r1 G.31
348 | C.50 r1 G.32
349 | C.50 r1 G.43
350 | C.51 r1 G.15
351 | C.51 r1 G.23
352 | C.51 r1 G.30
353 | C.51 r1 G.31
354 | C.51 r1 G.46
355 | C.52 r1 G.17
356 | C.52 r1 G.30
357 | C.53 r1 G.30
358 | C.53 r1 G.33
359 | C.53 r1 G.43
360 | C.54 r1 G.34
361 | C.54 r1 G.36
362 | C.55 r1 G.1
363 | C.55 r1 G.3
364 | C.55 r1 G.16
365 | C.55 r1 G.18
366 | C.55 r1 G.34
367 | C.55 r1 G.43
368 | C.55 r1 G.46
369 | C.55 r1 G.52
370 | C.56 r1 G.0
371 | C.56 r1 G.1
372 | C.56 r1 G.6
373 | C.56 r1 G.24
374 | C.56 r1 G.26
375 | C.56 r1 G.46
376 | C.58 r1 G.27
377 | C.58 r1 G.31
378 | C.58 r1 G.48
379 | C.58 r1 G.50
380 | C.60 r1 G.13
381 | C.60 r1 G.16
382 | C.60 r1 G.31
383 | C.60 r1 G.37
384 | C.60 r1 G.43
385 | C.60 r1 G.52
386 | C.62 r1 G.8
387 | C.62 r1 G.9
388 | C.62 r1 G.14
389 | C.62 r1 G.20
390 | C.62 r1 G.26
391 | C.62 r1 G.34
392 | C.62 r1 G.46
393 | C.62 r1 G.52
394 | C.63 r1 G.3
395 | C.63 r1 G.35
396 | C.65 r1 G.0
397 | C.65 r1 G.7
398 | C.65 r1 G.24
399 | C.65 r1 G.25
400 | C.65 r1 G.46
401 | C.67 r1 G.0
402 | C.67 r1 G.2
403 | C.67 r1 G.7
404 | C.67 r1 G.8
405 | C.67 r1 G.14
406 | C.67 r1 G.46
407 | C.67 r1 G.52
408 | C.68 r1 G.31
409 | C.69 r1 G.4
410 | C.69 r1 G.22
411 | C.69 r1 G.46
412 | C.69 r1 G.49
413 | C.70 r1 G.1
414 | C.70 r1 G.7
415 | C.70 r1 G.8
416 | C.70 r1 G.9
417 | C.70 r1 G.17
418 | C.70 r1 G.31
419 | C.70 r1 G.32
420 | C.70 r1 G.35
421 | C.70 r1 G.37
422 | C.70 r1 G.43
423 | C.71 r1 G.2
424 | C.71 r1 G.26
425 | C.71 r1 G.32
426 | C.71 r1 G.35
427 | C.71 r1 G.43
428 | C.71 r1 G.46
429 | C.72 r1 G.16
430 | C.72 r1 G.43
431 | C.72 r1 G.52
432 | C.73 r1 G.8
433 | C.73 r1 G.33
434 | C.73 r1 G.47
435 | C.75 r1 G.4
436 | C.75 r1 G.7
437 | C.75 r1 G.17
438 | C.75 r1 G.33
439 | C.75 r1 G.35
440 | C.75 r1 G.37
441 | C.75 r1 G.47
442 | C.77 r1 G.7
443 | C.77 r1 G.15
444 | C.77 r1 G.16
445 | C.77 r1 G.34
446 | C.77 r1 G.36
447 | C.77 r1 G.49
448 | C.77 r1 G.52
449 | C.79 r1 G.1
450 | C.79 r1 G.4
451 | C.79 r1 G.11
452 | C.79 r1 G.13
453 | C.79 r1 G.16
454 | C.79 r1 G.17
455 | C.79 r1 G.21
456 | C.79 r1 G.31
457 | C.79 r1 G.37
458 | C.79 r1 G.44
459 | C.79 r1 G.46
460 | C.80 r1 G.22
461 | C.80 r1 G.43
462 | C.80 r1 G.46
463 | C.81 r1 G.3
464 | C.81 r1 G.4
465 | C.81 r1 G.17
466 | C.81 r1 G.30
467 | C.81 r1 G.33
468 | C.81 r1 G.35
469 | C.82 r1 G.11
470 | C.82 r1 G.13
471 | C.82 r1 G.25
472 | C.82 r1 G.31
473 | C.82 r1 G.34
474 | C.82 r1 G.42
475 | C.83 r1 G.1
476 | C.83 r1 G.6
477 | C.83 r1 G.36
478 | C.83 r1 G.42
479 | C.84 r1 G.25
480 | C.84 r1 G.30
481 | C.84 r1 G.41
482 | C.84 r1 G.42
483 | C.84 r1 G.43
484 | C.88 r1 G.4
485 | C.88 r1 G.7
486 | C.88 r1 G.11
487 | C.88 r1 G.13
488 | C.88 r1 G.22
489 | C.88 r1 G.32
490 | C.88 r1 G.34
491 | C.88 r1 G.35
492 | C.88 r1 G.37
493 | C.88 r1 G.42
494 | C.89 r1 G.10
495 | C.89 r1 G.19
496 | C.89 r1 G.20
497 | C.89 r1 G.21
498 | C.89 r1 G.26
499 | C.89 r1 G.35
500 | C.89 r1 G.38
501 | C.90 r1 G.3
502 | C.91 r1 G.2
503 | C.91 r1 G.7
504 | C.91 r1 G.11
505 | C.91 r1 G.34
506 | C.91 r1 G.35
507 | C.91 r1 G.36
508 | C.91 r1 G.47
509 | C.91 r1 G.51
510 | C.93 r1 G.7
511 | C.93 r1 G.25
512 | C.93 r1 G.46
513 | C.93 r1 G.49
514 | C.94 r1 G.14
515 | C.94 r1 G.25
516 | C.94 r1 G.34
517 | C.94 r1 G.47
518 | C.94 r1 G.52
519 | C.95 r1 G.32
520 | C.95 r1 G.40
521 | C.95 r1 G.43
522 | C.95 r1 G.46
523 | C.0 r2 G.2
524 | C.0 r2 G.25
525 | C.0 r2 G.31
526 | C.0 r2 G.46
527 | C.0 r2 G.47
528 | C.1 r2 G.8
529 | C.1 r2 G.19
530 | C.1 r2 G.33
531 | C.1 r2 G.34
532 | C.1 r2 G.35
533 | C.1 r2 G.36
534 | C.1 r2 G.47
535 | C.2 r2 G.2
536 | C.3 r2 G.19
537 | C.3 r2 G.31
538 | C.3 r2 G.34
539 | C.3 r2 G.35
540 | C.4 r2 G.19
541 | C.4 r2 G.34
542 | C.5 r2 G.16
543 | C.5 r2 G.45
544 | C.5 r2 G.52
545 | C.6 r2 G.25
546 | C.7 r2 G.31
547 | C.8 r2 G.20
548 | C.8 r2 G.24
549 | C.8 r2 G.31
550 | C.8 r2 G.33
551 | C.8 r2 G.35
552 | C.8 r2 G.47
553 | C.9 r2 G.6
554 | C.9 r2 G.19
555 | C.9 r2 G.24
556 | C.9 r2 G.25
557 | C.9 r2 G.34
558 | C.11 r2 G.3
559 | C.11 r2 G.19
560 | C.11 r2 G.31
561 | C.11 r2 G.36
562 | C.11 r2 G.40
563 | C.11 r2 G.46
564 | C.12 r2 G.1
565 | C.12 r2 G.7
566 | C.12 r2 G.15
567 | C.12 r2 G.31
568 | C.13 r2 G.24
569 | C.13 r2 G.25
570 | C.13 r2 G.37
571 | C.14 r2 G.11
572 | C.14 r2 G.14
573 | C.15 r2 G.47
574 | C.17 r2 G.5
575 | C.17 r2 G.7
576 | C.17 r2 G.11
577 | C.17 r2 G.35
578 | C.17 r2 G.52
579 | C.19 r2 G.17
580 | C.19 r2 G.31
581 | C.19 r2 G.47
582 | C.20 r2 G.46
583 | C.21 r2 G.19
584 | C.21 r2 G.35
585 | C.21 r2 G.47
586 | C.22 r2 G.25
587 | C.22 r2 G.31
588 | C.22 r2 G.47
589 | C.23 r2 G.17
590 | C.23 r2 G.31
591 | C.23 r2 G.43
592 | C.23 r2 G.46
593 | C.24 r2 G.31
594 | C.24 r2 G.43
595 | C.24 r2 G.47
596 | C.26 r2 G.28
597 | C.27 r2 G.7
598 | C.27 r2 G.14
599 | C.27 r2 G.31
600 | C.27 r2 G.46
601 | C.28 r2 G.2
602 | C.28 r2 G.7
603 | C.28 r2 G.17
604 | C.30 r2 G.2
605 | C.30 r2 G.11
606 | C.30 r2 G.14
607 | C.30 r2 G.17
608 | C.30 r2 G.29
609 | C.30 r2 G.34
610 | C.30 r2 G.46
611 | C.30 r2 G.52
612 | C.31 r2 G.7
613 | C.31 r2 G.16
614 | C.32 r2 G.16
615 | C.32 r2 G.43
616 | C.33 r2 G.19
617 | C.33 r2 G.35
618 | C.34 r2 G.23
619 | C.34 r2 G.37
620 | C.34 r2 G.42
621 | C.34 r2 G.47
622 | C.35 r2 G.14
623 | C.35 r2 G.19
624 | C.36 r2 G.24
625 | C.36 r2 G.33
626 | C.36 r2 G.46
627 | C.37 r2 G.33
628 | C.38 r2 G.25
629 | C.39 r2 G.7
630 | C.39 r2 G.43
631 | C.40 r2 G.2
632 | C.40 r2 G.31
633 | C.41 r2 G.9
634 | C.43 r2 G.9
635 | C.43 r2 G.22
636 | C.43 r2 G.26
637 | C.43 r2 G.31
638 | C.43 r2 G.39
639 | C.44 r2 G.3
640 | C.44 r2 G.17
641 | C.44 r2 G.24
642 | C.44 r2 G.33
643 | C.44 r2 G.46
644 | C.45 r2 G.19
645 | C.45 r2 G.24
646 | C.45 r2 G.34
647 | C.46 r2 G.0
648 | C.48 r2 G.0
649 | C.48 r2 G.22
650 | C.48 r2 G.33
651 | C.48 r2 G.34
652 | C.48 r2 G.35
653 | C.48 r2 G.52
654 | C.49 r2 G.31
655 | C.49 r2 G.46
656 | C.50 r2 G.2
657 | C.50 r2 G.33
658 | C.50 r2 G.46
659 | C.50 r2 G.52
660 | C.51 r2 G.4
661 | C.51 r2 G.11
662 | C.51 r2 G.25
663 | C.51 r2 G.33
664 | C.51 r2 G.34
665 | C.52 r2 G.31
666 | C.52 r2 G.46
667 | C.53 r2 G.7
668 | C.53 r2 G.34
669 | C.53 r2 G.47
670 | C.54 r2 G.42
671 | C.55 r2 G.7
672 | C.55 r2 G.25
673 | C.56 r2 G.13
674 | C.56 r2 G.25
675 | C.56 r2 G.31
676 | C.56 r2 G.33
677 | C.56 r2 G.47
678 | C.57 r2 G.13
679 | C.57 r2 G.18
680 | C.57 r2 G.21
681 | C.57 r2 G.24
682 | C.57 r2 G.46
683 | C.58 r2 G.46
684 | C.58 r2 G.49
685 | C.59 r2 G.11
686 | C.59 r2 G.23
687 | C.59 r2 G.34
688 | C.59 r2 G.40
689 | C.59 r2 G.43
690 | C.59 r2 G.47
691 | C.60 r2 G.25
692 | C.60 r2 G.34
693 | C.60 r2 G.47
694 | C.60 r2 G.49
695 | C.61 r2 G.11
696 | C.61 r2 G.34
697 | C.61 r2 G.39
698 | C.61 r2 G.46
699 | C.62 r2 G.23
700 | C.62 r2 G.33
701 | C.62 r2 G.47
702 | C.63 r2 G.9
703 | C.64 r2 G.1
704 | C.64 r2 G.11
705 | C.64 r2 G.18
706 | C.64 r2 G.19
707 | C.64 r2 G.21
708 | C.64 r2 G.48
709 | C.64 r2 G.52
710 | C.65 r2 G.11
711 | C.65 r2 G.19
712 | C.65 r2 G.28
713 | C.65 r2 G.35
714 | C.65 r2 G.47
715 | C.66 r2 G.1
716 | C.66 r2 G.14
717 | C.66 r2 G.19
718 | C.66 r2 G.27
719 | C.67 r2 G.35
720 | C.67 r2 G.47
721 | C.68 r2 G.2
722 | C.68 r2 G.35
723 | C.68 r2 G.52
724 | C.70 r2 G.33
725 | C.70 r2 G.34
726 | C.70 r2 G.46
727 | C.71 r2 G.11
728 | C.71 r2 G.31
729 | C.71 r2 G.33
730 | C.71 r2 G.34
731 | C.72 r2 G.34
732 | C.72 r2 G.46
733 | C.73 r2 G.31
734 | C.74 r2 G.31
735 | C.74 r2 G.46
736 | C.75 r2 G.31
737 | C.76 r2 G.1
738 | C.76 r2 G.7
739 | C.76 r2 G.27
740 | C.76 r2 G.32
741 | C.76 r2 G.33
742 | C.76 r2 G.37
743 | C.76 r2 G.44
744 | C.76 r2 G.47
745 | C.77 r2 G.3
746 | C.77 r2 G.31
747 | C.77 r2 G.46
748 | C.77 r2 G.47
749 | C.78 r2 G.25
750 | C.78 r2 G.41
751 | C.78 r2 G.52
752 | C.80 r2 G.31
753 | C.80 r2 G.47
754 | C.81 r2 G.25
755 | C.81 r2 G.46
756 | C.81 r2 G.47
757 | C.82 r2 G.7
758 | C.82 r2 G.33
759 | C.82 r2 G.47
760 | C.83 r2 G.47
761 | C.83 r2 G.48
762 | C.85 r2 G.9
763 | C.85 r2 G.25
764 | C.85 r2 G.31
765 | C.85 r2 G.33
766 | C.85 r2 G.35
767 | C.85 r2 G.43
768 | C.86 r2 G.15
769 | C.86 r2 G.16
770 | C.86 r2 G.34
771 | C.86 r2 G.38
772 | C.86 r2 G.47
773 | C.87 r2 G.7
774 | C.87 r2 G.8
775 | C.87 r2 G.29
776 | C.87 r2 G.30
777 | C.87 r2 G.31
778 | C.87 r2 G.32
779 | C.87 r2 G.35
780 | C.87 r2 G.52
781 | C.88 r2 G.2
782 | C.88 r2 G.14
783 | C.88 r2 G.17
784 | C.88 r2 G.19
785 | C.88 r2 G.33
786 | C.88 r2 G.46
787 | C.88 r2 G.47
788 | C.89 r2 G.46
789 | C.90 r2 G.1
790 | C.90 r2 G.31
791 | C.90 r2 G.46
792 | C.91 r2 G.4
793 | C.91 r2 G.19
794 | C.91 r2 G.29
795 | C.91 r2 G.31
796 | C.91 r2 G.33
797 | C.92 r2 G.31
798 | C.92 r2 G.33
799 | C.93 r2 G.31
800 | C.93 r2 G.44
801 | C.94 r2 G.2
802 | C.94 r2 G.17
803 | C.94 r2 G.31
804 | C.94 r2 G.35
805 | C.95 r2 G.2
806 | C.95 r2 G.33
807 | C.95 r2 G.47
808 | C.95 r2 G.52
809 | C.96 r2 G.12
810 | C.96 r2 G.19
811 | C.97 r2 G.4
812 | C.97 r2 G.20
813 | C.97 r2 G.24
814 | C.97 r2 G.34
815 | C.97 r2 G.35
816 | C.97 r2 G.36
817 | C.98 r2 G.31
818 | C.98 r2 G.33
819 | C.99 r2 G.4
820 | C.99 r2 G.11
821 | C.99 r2 G.21
822 | C.99 r2 G.31
823 | C.99 r2 G.46
824 | C.1 r3 G.2
825 | C.1 r3 G.7
826 | C.1 r3 G.31
827 | C.2 r3 G.31
828 | C.2 r3 G.47
829 | C.3 r3 G.47
830 | C.4 r3 G.2
831 | C.4 r3 G.33
832 | C.5 r3 G.46
833 | C.5 r3 G.47
834 | C.6 r3 G.31
835 | C.7 r3 G.46
836 | C.7 r3 G.47
837 | C.8 r3 G.46
838 | C.9 r3 G.31
839 | C.9 r3 G.46
840 | C.10 r3 G.17
841 | C.11 r3 G.2
842 | C.11 r3 G.24
843 | C.12 r3 G.47
844 | C.13 r3 G.46
845 | C.14 r3 G.1
846 | C.14 r3 G.19
847 | C.14 r3 G.31
848 | C.14 r3 G.46
849 | C.15 r3 G.6
850 | C.15 r3 G.13
851 | C.15 r3 G.31
852 | C.15 r3 G.35
853 | C.16 r3 G.31
854 | C.17 r3 G.2
855 | C.17 r3 G.33
856 | C.18 r3 G.1
857 | C.18 r3 G.47
858 | C.20 r3 G.47
859 | C.21 r3 G.33
860 | C.23 r3 G.19
861 | C.23 r3 G.47
862 | C.24 r3 G.1
863 | C.26 r3 G.4
864 | C.26 r3 G.19
865 | C.26 r3 G.33
866 | C.26 r3 G.46
867 | C.27 r3 G.2
868 | C.27 r3 G.19
869 | C.27 r3 G.33
870 | C.28 r3 G.15
871 | C.28 r3 G.19
872 | C.28 r3 G.33
873 | C.28 r3 G.34
874 | C.28 r3 G.35
875 | C.28 r3 G.46
876 | C.29 r3 G.2
877 | C.29 r3 G.4
878 | C.29 r3 G.33
879 | C.29 r3 G.46
880 | C.31 r3 G.11
881 | C.31 r3 G.19
882 | C.32 r3 G.25
883 | C.32 r3 G.31
884 | C.32 r3 G.47
885 | C.33 r3 G.31
886 | C.33 r3 G.33
887 | C.34 r3 G.11
888 | C.35 r3 G.35
889 | C.36 r3 G.47
890 | C.37 r3 G.31
891 | C.37 r3 G.47
892 | C.39 r3 G.31
893 | C.39 r3 G.47
894 | C.41 r3 G.47
895 | C.43 r3 G.17
896 | C.43 r3 G.46
897 | C.43 r3 G.52
898 | C.45 r3 G.2
899 | C.46 r3 G.31
900 | C.46 r3 G.35
901 | C.47 r3 G.31
902 | C.47 r3 G.33
903 | C.47 r3 G.47
904 | C.48 r3 G.47
905 | C.49 r3 G.2
906 | C.49 r3 G.17
907 | C.49 r3 G.19
908 | C.50 r3 G.19
909 | C.50 r3 G.47
910 | C.51 r3 G.47
911 | C.52 r3 G.47
912 | C.53 r3 G.31
913 | C.53 r3 G.46
914 | C.55 r3 G.2
915 | C.55 r3 G.19
916 | C.55 r3 G.47
917 | C.57 r3 G.17
918 | C.57 r3 G.47
919 | C.58 r3 G.47
920 | C.59 r3 G.19
921 | C.59 r3 G.31
922 | C.59 r3 G.33
923 | C.59 r3 G.46
924 | C.62 r3 G.19
925 | C.62 r3 G.31
926 | C.62 r3 G.35
927 | C.63 r3 G.47
928 | C.64 r3 G.47
929 | C.65 r3 G.33
930 | C.65 r3 G.34
931 | C.65 r3 G.52
932 | C.66 r3 G.45
933 | C.67 r3 G.1
934 | C.67 r3 G.19
935 | C.67 r3 G.33
936 | C.68 r3 G.19
937 | C.68 r3 G.33
938 | C.68 r3 G.47
939 | C.69 r3 G.31
940 | C.70 r3 G.2
941 | C.70 r3 G.19
942 | C.70 r3 G.47
943 | C.71 r3 G.47
944 | C.72 r3 G.19
945 | C.72 r3 G.31
946 | C.72 r3 G.47
947 | C.73 r3 G.2
948 | C.73 r3 G.11
949 | C.73 r3 G.19
950 | C.73 r3 G.46
951 | C.75 r3 G.25
952 | C.76 r3 G.17
953 | C.76 r3 G.31
954 | C.77 r3 G.2
955 | C.78 r3 G.47
956 | C.79 r3 G.33
957 | C.79 r3 G.47
958 | C.83 r3 G.19
959 | C.83 r3 G.43
960 | C.83 r3 G.46
961 | C.84 r3 G.31
962 | C.84 r3 G.47
963 | C.85 r3 G.47
964 | C.86 r3 G.31
965 | C.86 r3 G.35
966 | C.87 r3 G.2
967 | C.87 r3 G.19
968 | C.87 r3 G.33
969 | C.89 r3 G.7
970 | C.89 r3 G.17
971 | C.89 r3 G.33
972 | C.90 r3 G.47
973 | C.91 r3 G.46
974 | C.92 r3 G.14
975 | C.92 r3 G.17
976 | C.92 r3 G.19
977 | C.94 r3 G.19
978 | C.94 r3 G.33
979 | C.94 r3 G.46
980 | C.95 r3 G.19
981 | C.95 r3 G.31
982 | C.95 r3 G.35
983 | C.96 r3 G.34
984 | C.96 r3 G.35
985 | C.96 r3 G.47
986 | C.97 r3 G.2
987 | C.97 r3 G.27
988 | C.97 r3 G.46
989 | C.98 r3 G.47
990 | C.0 r4 G.19
991 | C.2 r4 G.19
992 | C.3 r4 G.46
993 | C.6 r4 G.19
994 | C.7 r4 G.19
995 | C.8 r4 G.19
996 | C.9 r4 G.47
997 | C.10 r4 G.19
998 | C.10 r4 G.47
999 | C.12 r4 G.19
1000 | C.13 r4 G.19
1001 | C.15 r4 G.19
1002 | C.16 r4 G.19
1003 | C.16 r4 G.47
1004 | C.18 r4 G.19
1005 | C.19 r4 G.19
1006 | C.20 r4 G.19
1007 | C.21 r4 G.46
1008 | C.22 r4 G.19
1009 | C.24 r4 G.19
1010 | C.25 r4 G.19
1011 | C.25 r4 G.46
1012 | C.25 r4 G.53
1013 | C.30 r4 G.33
1014 | C.31 r4 G.47
1015 | C.32 r4 G.19
1016 | C.33 r4 G.46
1017 | C.34 r4 G.19
1018 | C.35 r4 G.2
1019 | C.35 r4 G.33
1020 | C.36 r4 G.19
1021 | C.37 r4 G.19
1022 | C.38 r4 G.19
1023 | C.38 r4 G.47
1024 | C.39 r4 G.19
1025 | C.40 r4 G.19
1026 | C.40 r4 G.47
1027 | C.41 r4 G.19
1028 | C.42 r4 G.19
1029 | C.42 r4 G.47
1030 | C.44 r4 G.19
1031 | C.44 r4 G.47
1032 | C.45 r4 G.33
1033 | C.46 r4 G.19
1034 | C.47 r4 G.19
1035 | C.48 r4 G.19
1036 | C.51 r4 G.19
1037 | C.52 r4 G.19
1038 | C.53 r4 G.19
1039 | C.54 r4 G.19
1040 | C.56 r4 G.19
1041 | C.57 r4 G.19
1042 | C.58 r4 G.19
1043 | C.60 r4 G.19
1044 | C.61 r4 G.19
1045 | C.61 r4 G.25
1046 | C.63 r4 G.19
1047 | C.63 r4 G.31
1048 | C.66 r4 G.47
1049 | C.69 r4 G.19
1050 | C.69 r4 G.47
1051 | C.71 r4 G.19
1052 | C.74 r4 G.19
1053 | C.74 r4 G.47
1054 | C.75 r4 G.19
1055 | C.75 r4 G.46
1056 | C.77 r4 G.33
1057 | C.78 r4 G.19
1058 | C.79 r4 G.19
1059 | C.80 r4 G.19
1060 | C.81 r4 G.19
1061 | C.82 r4 G.19
1062 | C.82 r4 G.46
1063 | C.83 r4 G.31
1064 | C.84 r4 G.19
1065 | C.85 r4 G.19
1066 | C.86 r4 G.19
1067 | C.89 r4 G.2
1068 | C.90 r4 G.19
1069 | C.92 r4 G.46
1070 | C.93 r4 G.19
1071 | C.93 r4 G.47
1072 | C.96 r4 G.33
1073 | C.98 r4 G.19
1074 | C.99 r4 G.2
1075 | C.99 r4 G.19
1076 |
--------------------------------------------------------------------------------
/tests/data/preprocessed/atac_preprocessed.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pinellolab/simba/7b25fd089873aba9580f9923f2e412d375de9a46/tests/data/preprocessed/atac_preprocessed.h5ad
--------------------------------------------------------------------------------
/tests/data/preprocessed/rna_preprocessed.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pinellolab/simba/7b25fd089873aba9580f9923f2e412d375de9a46/tests/data/preprocessed/rna_preprocessed.h5ad
--------------------------------------------------------------------------------
/tests/test_pbg_training.py:
--------------------------------------------------------------------------------
1 | import simba as si
2 | import pytest
3 |
4 |
@pytest.fixture
def adata_CG():
    """Preprocessed scRNA-seq (cell x gene) AnnData used as PBG input."""
    path = "tests/data/preprocessed/rna_preprocessed.h5ad"
    return si.read_h5ad(path)
9 |
10 |
@pytest.fixture
def adata_CP():
    """Preprocessed scATAC-seq (cell x peak) AnnData used as PBG input."""
    path = "tests/data/preprocessed/atac_preprocessed.h5ad"
    return si.read_h5ad(path)
15 |
16 |
def test_pbg_training_rna(adata_CG, tmp_path):
    """End-to-end PBG training on a cell-gene graph, plus the metrics plot.

    All output (graph files, model, figures) goes under pytest's tmp_path
    so the run is fully isolated.
    """
    si.settings.set_workdir(tmp_path / "simba_rna")
    # build the graph on disk, then train and plot training statistics
    si.tl.gen_graph(dirname='graph0', copy=False, list_CG=[adata_CG])
    si.tl.pbg_train(output='model', auto_wd=True)
    si.pl.pbg_metrics(save_fig=True, fig_ncol=1)
26 |
27 |
def test_pbg_training_atac(adata_CP, tmp_path):
    """End-to-end PBG training on a cell-peak graph, plus the metrics plot.

    Mirrors test_pbg_training_rna but feeds the ATAC (cell x peak) input.
    """
    si.settings.set_workdir(tmp_path / "simba_atac")
    # build the graph on disk, then train and plot training statistics
    si.tl.gen_graph(dirname='graph0', copy=False, list_CP=[adata_CP])
    si.tl.pbg_train(output='model', auto_wd=True)
    si.pl.pbg_metrics(save_fig=True, fig_ncol=1)
37 |
--------------------------------------------------------------------------------
/tests/test_post_training.py:
--------------------------------------------------------------------------------
1 | import simba as si
2 | import pytest
3 |
4 |
@pytest.fixture
def dict_adata():
    """Load entity embeddings from the bundled, pre-trained PBG output."""
    paths = {
        'path_emb': 'tests/data/pbg_training/model/',
        'path_entity': 'tests/data/pbg_training/input/entity/',
        'path_entity_alias': 'tests/data/pbg_training',
    }
    return si.read_embedding(**paths)
12 |
13 |
def test_embeddding_rna(dict_adata, tmp_path):
    """Post-training workflow on the RNA model: joint embedding, UMAP,
    entity comparison, barcode plot, and a neighbor query.

    NOTE(review): the extra 'd' in the test name is a historical typo;
    it is kept so the pytest test id stays stable.
    """
    si.settings.set_workdir(tmp_path / "simba_rna")
    cells = dict_adata['C']
    genes = dict_adata['G']

    # project gene embeddings into the cell embedding space
    joint = si.tl.embed(adata_ref=cells,
                        list_adata_query=[genes])

    # tag every row of the joint embedding as 'cell' or 'gene'
    joint.obs['entity_anno'] = ""
    joint.obs.loc[cells.obs_names, 'entity_anno'] = 'cell'
    joint.obs.loc[genes.obs_names, 'entity_anno'] = 'gene'

    si.tl.umap(joint,
               n_components=2,
               n_neighbors=15)

    # rank genes against cells and plot the comparison metrics
    cmp_adata = si.tl.compare_entities(adata_ref=cells,
                                       adata_query=genes)
    si.pl.entity_metrics(cmp_adata,
                         x='max',
                         y='gini',
                         show_contour=False,
                         texts=genes.obs_names[:2],
                         show_texts=True,
                         show_cutoff=True,
                         size=5,
                         text_expand=(1.3, 1.5),
                         cutoff_x=1.,
                         cutoff_y=0.3,
                         save_fig=True)
    si.pl.entity_barcode(cmp_adata,
                         layer='softmax',
                         entities=list(genes.obs_names[:2]),
                         show_cutoff=True,
                         cutoff=0.001,
                         fig_size=(5, 2.5),
                         save_fig=True)

    # k-NN query of genes around the first two cells in UMAP space
    neighbors = si.tl.query(joint,
                            entity=list(cells.obs_names[:2]),
                            obsm='X_umap',
                            use_radius=False,
                            k=50,
                            anno_filter='entity_anno',
                            filters=['gene'])
    print(neighbors.head())
    si.pl.query(joint,
                show_texts=False,
                color=['entity_anno'],
                alpha=0.9,
                alpha_bg=0.1,
                save_fig=True)
65 |
--------------------------------------------------------------------------------
/tests/test_preprocessing.py:
--------------------------------------------------------------------------------
1 | import simba as si
2 | import pytest
3 |
4 |
@pytest.fixture
def adata_CG():
    """Raw 10x PBMC scRNA-seq subset (cell x gene) for preprocessing tests."""
    return si.read_h5ad("tests/data/10xpbmc_rna_subset.h5ad")
8 |
9 |
@pytest.fixture
def adata_CP():
    """Raw 10x PBMC scATAC-seq subset (cell x peak) for preprocessing tests."""
    return si.read_h5ad("tests/data/10xpbmc_atac_subset.h5ad")
13 |
14 |
def test_rna(adata_CG, tmp_path):
    """Full RNA preprocessing pipeline: QC, filtering, normalization,
    variable-gene selection, and discretization, with QC figures saved
    into an isolated tmp workdir.
    """
    si.settings.set_workdir(tmp_path / "simba_rna")
    si.settings.set_figure_params(dpi=80,
                                  fig_size=[5, 5],
                                  style='white',
                                  rc={'image.cmap': 'viridis'})

    # QC + filtering
    si.pp.filter_genes(adata_CG, min_n_cells=3)
    si.pp.cal_qc_rna(adata_CG)
    si.pl.violin(adata_CG,
                 list_obs=['n_counts', 'n_genes', 'pct_mt'],
                 fig_name='plot_violin.png',
                 save_fig=True)
    si.pp.filter_cells_rna(adata_CG, min_n_genes=2)

    # normalization and feature selection
    si.pp.normalize(adata_CG, method='lib_size')
    si.pp.log_transform(adata_CG)
    si.pp.select_variable_genes(adata_CG, n_top_genes=2000)
    si.pl.variable_genes(adata_CG,
                         show_texts=True,
                         fig_name='plot_variable_genes.png',
                         save_fig=True)

    # discretize expression for graph construction
    si.tl.discretize(adata_CG, n_bins=5)
    si.pl.discretize(adata_CG,
                     fig_name='plot_discretize.png',
                     save_fig=True)
39 |
40 |
def test_atac(adata_CP, tmp_path):
    """Full ATAC preprocessing pipeline: QC, filtering, PCA and
    PC/feature selection, with figures saved into an isolated tmp workdir.
    """
    si.settings.set_workdir(tmp_path / "simba_atac")

    # QC + filtering
    si.pp.filter_peaks(adata_CP, min_n_cells=5)
    si.pp.cal_qc_atac(adata_CP)
    si.pl.hist(adata_CP,
               list_obs=['n_counts', 'n_peaks', 'pct_peaks'],
               list_var=['n_cells'],
               log=True,
               fig_size=(3, 3),
               fig_name='plot_histogram.png',
               save_fig=True)
    si.pp.filter_cells_atac(adata_CP, min_n_peaks=5)

    # dimensionality reduction and feature selection
    si.pp.pca(adata_CP, n_components=30)
    si.pl.pca_variance_ratio(adata_CP,
                             show_cutoff=True,
                             fig_name='plot_variance_ratio.png',
                             save_fig=True)
    si.pp.select_pcs(adata_CP, n_pcs=10)
    si.pp.select_pcs_features(adata_CP)
    si.pl.pcs_features(adata_CP,
                       fig_ncol=5,
                       fig_name='plot_pcs_features.png',
                       save_fig=True)
64 |
65 |
def test_genescores(adata_CP):
    """Compute gene activity scores from the preprocessed ATAC data."""
    # minimal ATAC preprocessing so top PCs/features are available
    si.pp.filter_peaks(adata_CP, min_n_cells=5)
    si.pp.cal_qc_atac(adata_CP)
    si.pp.filter_cells_atac(adata_CP, min_n_peaks=5)
    si.pp.pca(adata_CP, n_components=30)
    si.pp.select_pcs(adata_CP, n_pcs=10)
    si.pp.select_pcs_features(adata_CP)

    # NOTE(review): 'use_gene_weigt' (sic) is the parameter's spelling in
    # simba's gene_scores API — do not "fix" it here.
    adata_CG_atac = si.tl.gene_scores(adata_CP,
                                      genome='hg19',
                                      use_gene_weigt=True,
                                      use_top_pcs=True)
    print(adata_CG_atac)
79 |
80 |
def test_integration(adata_CG, tmp_path):
    """Infer, plot, and trim edges between two (here identical) RNA datasets.

    Fix: this test saves figures (``save_fig=True``) but previously never
    set a work directory, so plots were written into simba's default
    workdir, polluting the checkout and leaking state between tests.
    Like the other figure-saving tests, it now points the workdir at
    pytest's ``tmp_path`` (the fixture is injected automatically, so the
    test id and collection are unchanged).
    """
    si.settings.set_workdir(tmp_path / "simba_integration")

    # standard RNA preprocessing up to variable-gene selection
    si.pp.filter_genes(adata_CG, min_n_cells=3)
    si.pp.cal_qc_rna(adata_CG)
    si.pp.filter_cells_rna(adata_CG, min_n_genes=2)
    si.pp.normalize(adata_CG, method='lib_size')
    si.pp.log_transform(adata_CG)
    si.pp.select_variable_genes(adata_CG, n_top_genes=2000)

    # infer cross-dataset edges (self vs. self keeps the fixture small)
    adata_C1C2 = si.tl.infer_edges(
        adata_CG, adata_CG, n_components=20, k=20)
    si.pl.node_similarity(adata_C1C2,
                          cutoff=0.5,
                          save_fig=True)
    si.pl.svd_nodes(adata_C1C2,
                    cutoff=0.5,
                    save_fig=True)
    si.tl.trim_edges(adata_C1C2, cutoff=0.5)
--------------------------------------------------------------------------------