├── .codecov.yml
├── .github
└── workflows
│ └── CI.yml
├── .gitignore
├── .readthedocs.yml
├── LICENSE
├── README.md
├── R_scripts
└── scan_for_kmers_motifs.R
├── docs
├── Makefile
├── environment.yml
├── make.bat
├── requirements.txt
└── source
│ ├── API.rst
│ ├── About SIMBA.rst
│ ├── Basic concepts.rst
│ ├── Citation.rst
│ ├── Installation.rst
│ ├── Makefile
│ ├── Output.rst
│ ├── Release notes.rst
│ ├── _ext
│ └── edit_on_github.py
│ ├── _static
│ └── img
│ │ ├── Figure1.png
│ │ ├── lion_icon.svg
│ │ └── logo_simba.png
│ ├── conf.py
│ ├── index.rst
│ └── make.bat
├── pytest.ini
├── requirements.txt
├── setup.py
├── simba
├── __init__.py
├── _settings.py
├── _utils.py
├── _version.py
├── data
│ └── gene_anno
│ │ ├── hg19_genes.bed
│ │ ├── hg38_genes.bed
│ │ ├── mm10_genes.bed
│ │ └── mm9_genes.bed
├── datasets
│ ├── __init__.py
│ └── _datasets.py
├── plotting
│ ├── __init__.py
│ ├── _palettes.py
│ ├── _plot.py
│ ├── _post_training.py
│ └── _utils.py
├── preprocessing
│ ├── __init__.py
│ ├── _general.py
│ ├── _pca.py
│ ├── _qc.py
│ ├── _utils.py
│ └── _variable_genes.py
├── readwrite.py
└── tools
│ ├── __init__.py
│ ├── _gene_scores.py
│ ├── _general.py
│ ├── _integration.py
│ ├── _pbg.py
│ ├── _post_training.py
│ ├── _umap.py
│ └── _utils.py
└── tests
├── data
├── 10xpbmc_atac_subset.h5ad
├── 10xpbmc_rna_subset.h5ad
├── pbg_training
│ ├── entity_alias.txt
│ ├── graph_stats.json
│ ├── input
│ │ └── entity
│ │ │ ├── entity_count_C_0.txt
│ │ │ ├── entity_count_G_0.txt
│ │ │ ├── entity_names_C_0.json
│ │ │ └── entity_names_G_0.json
│ ├── model
│ │ ├── checkpoint_version.txt
│ │ ├── config.json
│ │ ├── embeddings_C_0.v10.h5
│ │ ├── embeddings_G_0.v10.h5
│ │ ├── model.v10.h5
│ │ └── training_stats.json
│ └── pbg_graph.txt
└── preprocessed
│ ├── atac_preprocessed.h5ad
│ └── rna_preprocessed.h5ad
├── test_pbg_training.py
├── test_post_training.py
└── test_preprocessing.py
/.codecov.yml:
--------------------------------------------------------------------------------
1 | ignore:
2 | - "simba/datasets/*"
3 | - "**/_utils.py"
--------------------------------------------------------------------------------
/.github/workflows/CI.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | build-linux:
7 | runs-on: ubuntu-latest
8 | strategy:
9 | max-parallel: 5
10 | matrix:
11 | python-version: ['3.8', '3.9', '3.10']
12 |
13 | steps:
14 | - uses: actions/checkout@v3
15 | - name: Set up Python ${{ matrix.python-version }}
16 | uses: actions/setup-python@v3
17 | with:
18 | python-version: ${{ matrix.python-version }}
19 | # - name: Add conda to system path
20 | # run: |
21 | # # $CONDA is an environment variable pointing to the root of the miniconda directory
22 | # echo $CONDA/bin >> $GITHUB_PATH
23 | - uses: mamba-org/setup-micromamba@v1
24 | with:
25 | condarc: |
26 | channels:
27 | - conda-forge
28 | - bioconda
29 | - defaults
30 | init-shell: bash
31 | environment-name: test-env
32 | create-args: >-
33 | python=${{ matrix.python-version }}
34 | simba>=1.1
35 | flake8
36 | pytest
37 | pytest-cov
38 | - name: Install SIMBA
39 | run: |
40 | python -m pip install --upgrade pip
41 | # pip install -r requirements.txt
42 | pip install -e .
43 | shell: bash -el {0}
44 | - name: Lint with flake8
45 | run: |
46 | # stop the build if there are Python syntax errors or undefined names
47 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
48 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
49 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
50 | shell: bash -el {0}
51 | - name: Test with pytest
52 | run: |
53 | pytest --cov
54 | shell: bash -el {0}
55 | - name: Coverage report
56 | run: |
57 | bash <(curl -s https://codecov.io/bash)
58 | shell: bash -el {0}
59 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by https://www.gitignore.io/api/osx,python,windows
2 |
3 | ### OSX ###
4 | *.DS_Store
5 | .AppleDouble
6 | .LSOverride
7 |
8 | # Icon must end with two \r
9 | Icon
10 |
11 | # Thumbnails
12 | ._*
13 |
14 | # Files that might appear in the root of a volume
15 | .DocumentRevisions-V100
16 | .fseventsd
17 | .Spotlight-V100
18 | .TemporaryItems
19 | .Trashes
20 | .VolumeIcon.icns
21 | .com.apple.timemachine.donotpresent
22 |
23 | # Directories potentially created on remote AFP share
24 | .AppleDB
25 | .AppleDesktop
26 | Network Trash Folder
27 | Temporary Items
28 | .apdisk
29 |
30 | ### Python ###
31 | # Byte-compiled / optimized / DLL files
32 | __pycache__/
33 | *.py[cod]
34 | *$py.class
35 |
36 | # C extensions
37 | *.so
38 |
39 | # Distribution / packaging
40 | .Python
41 | build/
42 | develop-eggs/
43 | dist/
44 | downloads/
45 | eggs/
46 | .eggs/
47 | lib/
48 | lib64/
49 | parts/
50 | sdist/
51 | var/
52 | wheels/
53 | *.egg-info/
54 | .installed.cfg
55 | *.egg
56 |
57 | # PyInstaller
58 | # Usually these files are written by a python script from a template
59 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
60 | *.manifest
61 | *.spec
62 |
63 | # Installer logs
64 | pip-log.txt
65 | pip-delete-this-directory.txt
66 |
67 | # Unit test / coverage reports
68 | htmlcov/
69 | .tox/
70 | .coverage
71 | .coverage.*
72 | .cache
73 | .pytest_cache/
74 | nosetests.xml
75 | coverage.xml
76 | *.cover
77 | .hypothesis/
78 |
79 | # Translations
80 | *.mo
81 | *.pot
82 |
83 | # Flask stuff:
84 | instance/
85 | .webassets-cache
86 |
87 | # Scrapy stuff:
88 | .scrapy
89 |
90 | # Sphinx documentation
91 | docs/_build/
92 |
93 | # PyBuilder
94 | target/
95 |
96 | # Jupyter Notebook
97 | .ipynb_checkpoints
98 |
99 | # pyenv
100 | .python-version
101 |
102 | # celery beat schedule file
103 | celerybeat-schedule.*
104 |
105 | # SageMath parsed files
106 | *.sage.py
107 |
108 | # Environments
109 | .env
110 | .venv
111 | env/
112 | venv/
113 | ENV/
114 | env.bak/
115 | venv.bak/
116 |
117 | # Spyder project settings
118 | .spyderproject
119 | .spyproject
120 |
121 | # Rope project settings
122 | .ropeproject
123 |
124 | # mkdocs documentation
125 | /site
126 |
127 | # mypy
128 | .mypy_cache/
129 |
130 | ### Windows ###
131 | # Windows thumbnail cache files
132 | Thumbs.db
133 | ehthumbs.db
134 | ehthumbs_vista.db
135 |
136 | # Folder config file
137 | Desktop.ini
138 |
139 | # Recycle Bin used on file shares
140 | $RECYCLE.BIN/
141 |
142 | # Windows Installer files
143 | *.cab
144 | *.msi
145 | *.msm
146 | *.msp
147 |
148 | # Windows shortcuts
149 | *.lnk
150 |
151 | # R
152 | *.Rhistory
153 |
154 | # Sphinx
155 | docs/source/_autosummary/
156 |
157 | # End of https://www.gitignore.io/api/osx,python,windows
158 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | conda:
4 | environment: docs/environment.yml
5 |
6 | build:
7 | os: ubuntu-22.04
8 | tools:
9 | python: "mambaforge-4.10"
10 |
11 | sphinx:
12 | builder: html
13 | configuration: docs/source/conf.py
14 | fail_on_warning: false
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2021, Huidong Chen, Pinello Lab
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | 3. Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [](https://simba-bio.readthedocs.io/en/latest/)
2 | [](https://github.com/huidongchen/simba/actions/workflows/CI.yml)
3 | [](https://anaconda.org/bioconda/simba)
4 | [](http://bioconda.github.io/recipes/simba/README.html)
5 | [](https://codecov.io/gh/huidongchen/simba)
6 |
7 | # SIMBA
8 |
9 | SIMBA: **SI**ngle-cell e**MB**edding **A**long with features
10 |
11 | Website: https://simba-bio.readthedocs.io
12 |
13 | Manuscript: Huidong Chen, Jayoung Ryu, Michael E. Vinyard, Adam Lerer & Luca Pinello. ["SIMBA: single-cell embedding along with features. *Nat Methods* (2023)"](https://doi.org/10.1038/s41592-023-01899-8).
14 |
15 |
16 |
--------------------------------------------------------------------------------
/R_scripts/scan_for_kmers_motifs.R:
--------------------------------------------------------------------------------
# This script scans specified regions for kmers and/or motifs using the JASPAR2020 database.
# It outputs a regions-by-kmers/motifs frequency matrix in .h5 format.

# Author: Huidong Chen
# Contact information: hd7chen AT gmail DOT com

suppressMessages(library(optparse, quietly = TRUE))

main <- function(){
  # ---- command-line interface ----
  # Defaults mirror the documented behavior: both kmer and motif scanning
  # are enabled unless explicitly disabled via --no_kmer / --no_motif.
  option_list = list(
    make_option(c("-i", "--input"), type="character", default=NULL,
                help="input region file name in .bed format", metavar="character"),
    make_option(c("-g", "--genome"), type="character", default=NULL,
                help="Path to reference genome", metavar="character"),
    make_option(c("--no_kmer"), action = "store_true", default=FALSE,
                help="disable scanning for kmers"),
    make_option(c("--no_motif"), action = "store_true", default=FALSE,
                help="disable scanning for motifs"),
    make_option(c("-k","--k_kmer"), type="integer", default=6,
                help="k-mer length [default = %default].", metavar="integer"),
    make_option(c("-s","--species"), type="character", default=NULL,
                help="Species of motifs in the JASPAR database.
            Choose from 'Homo sapiens','Mus musculus'. Only valid when motif is used",
                metavar="character"),
    make_option(c("-o", "--output"), type="character", default='output_kmers_motifs',
                help="Output folder [default = %default]", metavar="character")
  )

  opt_parser = OptionParser(option_list=option_list)
  opt = parse_args(opt_parser)

  # The input .bed file is always required.
  if(is.null(opt$input)){
    print_help(opt_parser)
    stop("input region file must be specified", call.=FALSE)
  }
  # Reference genome and species are only required for motif scanning.
  if(!opt$no_motif){
    if(any(is.null(opt$genome), is.null(opt$species))){
      print_help(opt_parser)
      stop("reference genome and species must be both specified", call.=FALSE)
    }
  }

  file.input = opt$input
  genome = opt$genome
  no_kmer = opt$no_kmer
  no_motif = opt$no_motif
  k = opt$k_kmer
  species = opt$species
  dir.output = opt$output

  # Heavy Bioconductor dependencies are loaded only after argument
  # validation succeeds, so bad invocations fail fast.
  suppressMessages(library(rhdf5))
  suppressMessages(library(HDF5Array))  # used for saving sparse matrix
  suppressMessages(library(Biostrings))
  suppressMessages(library(Matrix))
  suppressMessages(library(TFBSTools))
  suppressMessages(library(JASPAR2020))
  suppressMessages(library(motifmatchr))
  suppressMessages(library(SummarizedExperiment))
  suppressMessages(library(doParallel))

  set.seed(2020)

  # FIX: use dir.create() instead of shelling out to `mkdir -p`; this is
  # portable (no POSIX shell required) and safe for output paths that
  # contain spaces or shell metacharacters.
  dir.create(dir.output, recursive = TRUE, showWarnings = FALSE)

  print('Converting .bed to .fasta ...')
  ### convert peaks bed file to fasta file (requires `bedtools` on PATH)
  file.input.fa = paste0(basename(file.input), '.fa')
  system(paste("bedtools getfasta -fi", genome,
               "-bed", file.input,
               "-fo", file.path(dir.output, file.input.fa)))

  peaks_seq <- readDNAStringSet(file.path(dir.output, file.input.fa), "fasta")
  # Sanitize "chr:start-end" fasta names into "chr_start_end".
  peaks_name = gsub(":|-", '_', names(peaks_seq))

  ### count kmers
  if(!no_kmer){
    print('Scanning for kmers ...')
    freq_k = oligonucleotideFrequency(peaks_seq, k)
    rownames(freq_k) = peaks_name
    freq_k = as(freq_k, "sparseMatrix")
  }

  ### scan for TF motifs
  if(!no_motif){
    print('Scanning for TF motifs ...')
    # FIX: use `[[<-` for scalar list elements; `[<-` happens to work for
    # length-1 values but `[[<-` is the canonical, robust form.
    opts <- list()
    opts[["species"]] <- species
    opts[["collection"]] <- "CORE"
    PFMatrixList = TFBSTools::getMatrixSet(JASPAR2020::JASPAR2020, opts = opts)
    motif_ix_scores <- motifmatchr::matchMotifs(PFMatrixList, peaks_seq, out = "scores")
    freq_motif = motifCounts(motif_ix_scores)
    # FIX: vapply instead of growing a vector with c() inside a loop
    # (avoids quadratic copying and guarantees a character result).
    motif_names = vapply(names(PFMatrixList),
                         function(x) PFMatrixList[[x]]@name,
                         character(1),
                         USE.NAMES = FALSE)
    colnames(freq_motif) = gsub("::", '_', motif_names)
    rownames(freq_motif) = peaks_name
  }

  ### save results
  ### save kmers
  if(!no_kmer){
    print('Saving kmer matrix ...')
    filename = 'freq_kmer.h5'
    # writeHDF5Array internally transposes the matrix so `t()` is used to counteract this operation
    writeHDF5Array(t(freq_k), file.path(dir.output, filename), name="mat", with.dimnames=FALSE, verbose=FALSE)
    # using this structure in order for anndata 'read_hdf' to recognize row names and column names
    h5write(rownames(freq_k), file.path(dir.output, filename), "row_names")
    h5write(colnames(freq_k), file.path(dir.output, filename), "col_names")
  }

  ### save motifs
  if(!no_motif){
    print('Saving motif matrix ...')
    filename = 'freq_motif.h5'
    # writeHDF5Array internally transposes the matrix so `t()` is used to counteract this operation
    writeHDF5Array(t(freq_motif), file.path(dir.output, filename), name="mat", with.dimnames=FALSE, verbose=FALSE)
    # using this structure in order for anndata 'read_hdf' to recognize row names and column names
    h5write(rownames(freq_motif), file.path(dir.output, filename), "row_names")
    h5write(colnames(freq_motif), file.path(dir.output, filename), "col_names")
  }

  print('Finished.')
}

main()
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/environment.yml:
--------------------------------------------------------------------------------
1 | name: readthedocs
2 | channels:
3 | - conda-forge
4 | - bioconda
5 | - defaults
6 | dependencies:
7 | - pip
8 | - numpy<1.24.0 #avoid errors caused by 1.24
9 | - simba>=1.1
10 | - pandoc>=2.14
11 | - pip:
12 | - sphinx>=3.0
13 | - sphinx-rtd-theme>=0.5
14 | - nbsphinx>=0.8
15 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx>=3.0
2 | sphinx-rtd-theme>=0.5
3 | nbsphinx>=0.8
--------------------------------------------------------------------------------
/docs/source/API.rst:
--------------------------------------------------------------------------------
1 | .. automodule:: simba
2 |
3 | API
4 | ===
5 |
6 | Import simba as::
7 |
8 | import simba as si
9 |
10 | Configuration for SIMBA
11 | ~~~~~~~~~~~~~~~~~~~~~~~
12 | .. autosummary::
13 | :toctree: _autosummary
14 |
15 | settings.set_figure_params
16 | settings.set_pbg_params
17 | settings.set_workdir
18 |
19 |
20 | Reading
21 | ~~~~~~~
22 |
23 | .. autosummary::
24 | :toctree: _autosummary
25 |
26 | read_csv
27 | read_h5ad
28 | read_10x_h5
29 | read_mtx
30 | read_embedding
31 | load_pbg_config
32 | load_graph_stats
33 |
34 | See more at `anndata `_
35 |
36 | Preprocessing
37 | ~~~~~~~~~~~~~
38 |
39 | .. autosummary::
40 | :toctree: _autosummary
41 |
42 | pp.log_transform
43 | pp.normalize
44 | pp.binarize
45 | pp.cal_qc
46 | pp.cal_qc_rna
47 | pp.cal_qc_atac
48 | pp.filter_samples
49 | pp.filter_cells_rna
50 | pp.filter_cells_atac
51 | pp.filter_features
52 | pp.filter_genes
53 | pp.filter_peaks
54 | pp.pca
55 | pp.select_pcs
56 | pp.select_pcs_features
57 | pp.select_variable_genes
58 |
59 | Tools
60 | ~~~~~
61 |
62 | .. autosummary::
63 | :toctree: _autosummary
64 |
65 | tl.discretize
66 | tl.umap
67 | tl.gene_scores
68 | tl.infer_edges
69 | tl.trim_edges
70 | tl.gen_graph
71 | tl.pbg_train
72 | tl.softmax
73 | tl.embed
74 | tl.compare_entities
75 | tl.query
76 | tl.find_master_regulators
77 | tl.find_target_genes
78 |
79 |
80 | Plotting
81 | ~~~~~~~~
82 |
83 | .. autosummary::
84 | :toctree: _autosummary
85 |
86 | pl.pca_variance_ratio
87 | pl.pcs_features
88 | pl.variable_genes
89 | pl.violin
90 | pl.hist
91 | pl.umap
92 | pl.discretize
93 | pl.node_similarity
94 | pl.svd_nodes
95 | pl.pbg_metrics
96 | pl.entity_metrics
97 | pl.entity_barcode
98 | pl.query
99 |
100 |
101 | Datasets
102 | ~~~~~~~~
103 |
104 | .. autosummary::
105 | :toctree: _autosummary
106 |
107 | datasets.rna_10xpmbc3k
108 | datasets.rna_han2018
109 | datasets.rna_tmc2018
110 | datasets.rna_baron2016
111 | datasets.rna_muraro2016
112 | datasets.rna_segerstolpe2016
113 | datasets.rna_wang2016
114 | datasets.rna_xin2016
115 | datasets.atac_buenrostro2018
116 | datasets.atac_10xpbmc5k
117 | datasets.atac_chen2019
118 | datasets.atac_cusanovich2018_subset
119 | datasets.multiome_ma2020_fig4
120 | datasets.multiome_chen2019
121 | datasets.multiome_10xpbmc10k
122 |
--------------------------------------------------------------------------------
/docs/source/About SIMBA.rst:
--------------------------------------------------------------------------------
1 | About SIMBA
2 | ===========
3 |
4 | SIMBA ( **SI**\ ngle-cell e\ **MB**\ edding **A**\ long with features) is a graph embedding method that jointly embeds single cells and their defining features, such as genes, chromatin accessible regions, and DNA sequences into a common latent space. SIMBA explicitly learns low-dimensional representations of cells and features, and implicitly enables the possibility of clustering-free marker discovery, batch effect removal and multi-omics integration. Importantly, SIMBA introduces several crucial procedures including Softmax transformation, weight decay for controlling overfitting, and entity-type constraints to generate comparable embeddings (co-embeddings) of cells and features and to address unique challenges in single-cell data.
5 |
6 | SIMBA first encodes different types of entities such as cells, genes, open chromatin regions (peaks or bins), transcription factor (TF) motifs, and k-mers (short sequences of a specific length, k), into a single graph, where each node represents an individual entity and edges indicate relations between entities. Unlike existing methods that primarily focus on learning cell states, SIMBA treats both cells and features as equal nodes in the same graph.
7 |
8 | In SIMBA, edges may be added in two ways: 1) measured experimentally; 2) inferred computationally. For edges that are measured experimentally, each cell-feature edge corresponds to a single-cell measurement (e.g., the expression value of a gene or a chromatin-accessible peak observed in a cell). For example, if a gene is expressed in a cell, an edge is created between the gene and cell. The weight of this edge is determined by the gene expression level. Similarly, an edge is added between a cell and a chromatin region if the region is open in this cell. Edges are also allowed between different features to capture and model the underlying regulatory mechanisms. For example, an edge between a chromatin region and a TF-motif (or k-mer) captures the notion that a TF may bind to a regulatory region containing a specific DNA sequence. For edges that cannot be directly measured, they are inferred computationally by summarizing features of the same or different types. Each edge between cells of different batches or modalities indicates the cellular functional or structural similarity.
9 |
10 | Once the input graph is constructed, SIMBA applies a multi-entity graph embedding algorithm as well as a Softmax-based transformation to embed the nodes/entities into a common low-dimensional space wherein cells and features are comparable and can be analyzed based on their distance. Graph construction is inherently flexible, enabling SIMBA to be applied to a wide variety of single-cell tasks.
11 |
12 | Overall, SIMBA is versatile and can accommodate features of various domains as long as they can be encoded into a connected graph. It can readily extend to new single-cell modalities and tasks. SIMBA provides a single generalizable framework that allows diverse single-cell problems to be formulated in a unified way and thus simplifies the development of new analyses and extension to new single-cell modalities.
--------------------------------------------------------------------------------
/docs/source/Basic concepts.rst:
--------------------------------------------------------------------------------
1 | ================
2 | Basic concepts
3 | ================
4 |
5 |
6 | Graph construction
7 | ~~~~~~~~~~~~~~~~~~
8 | SIMBA encodes entities of different types, including genes, open chromatin regions (peaks or bins), and DNA sequences (transcription factor motifs or k-mers), into a connected large graph based on the relation between them. In this graph, nodes represent different entities and edges indicate the relation between entities.
9 |
10 | * In scRNA-seq analysis, each node represents either a cell or a gene. If a gene is expressed in a cell, then an edge is added between this gene and cell. The gene expression level is encoded into the weight of this edge.
11 |
12 | * In scATAC-seq analysis, each node represents either a cell or a region (peak/bin). If a region is open in a cell, then an edge is added between this region and cell. Optionally, if DNA sequences (TF motifs or k-mers) are also used, each node represents a cell, or a region, or a DNA sequence. In addition to the relation between a cell and a region, if a DNA sequence is found within the open region, then an edge is added between this DNA sequence and open region.
13 |
14 | * In multimodal analysis, each node can be any of these entities, including a cell, a gene, an open region, a DNA sequence, etc. Edges are added in the same way as in scRNA-seq analysis and scATAC-seq analysis.
15 |
16 | * In batch correction analysis, in addition to the experimentally measured edges as described above, batch correction is further enhanced with the computationally inferred edges between cell nodes across datasets using a truncated randomized singular value decomposition (SVD)-based procedure
17 |
18 | * In multiomics integration analysis (scRNA-seq and scATAC-seq), SIMBA first builds one graph for scRNA-seq data and one graph for scATAC-seq data independently as described above. To connect these two graphs, SIMBA calculates gene scores by summarizing accessible regions from scATAC-seq data and then infer edges between cells of different omics based on their shared gene expression modules through a similar procedure as in batch correction.
19 |
20 | PBG training
21 | ~~~~~~~~~~~~
22 | Following the construction of a multi-relational graph between biological entities, we adapt graph embedding techniques from the knowledge graph and recommendation systems literature to construct unsupervised representations for these entities.
23 |
24 | We use the PyTorch-BigGraph(PBG) framework, which provides efficient computation of multi-relation graph embeddings over multiple entity types and can scale to graphs with millions or billions of entities.
25 |
26 | In SIMBA, several key modifications have been made based on PBG, including:
27 |
28 | * Type-constrained negative sampling
29 |
30 | * Negative samples are produced in two ways:
31 |
32 | * by corrupting the edge with a source or destination sampled uniformly from the nodes with the correct types for this relation;
33 |
34 | * by corrupting the edge with a source or destination node sampled with probability proportional to its degree.
35 |
36 | * Introducing a weight decay procedure to solve overfitting problem.
37 |
38 | The resulting graph embeddings have two desirable properties that we will take advantage of:
39 |
40 | #. First-order similarity: for two entity types with a relation between them, edges with high likelihood should have higher dot product.
41 | #. Second-order similarity: within a single entity type, entities that have ‘similar contexts’, i.e., a similar distribution of edge probabilities, should have similar embeddings.
42 |
43 | Evaluation during training
44 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
45 | During the PBG training procedure, a small percent of edges is held out (by default, the evaluation fraction is set to 5%) to monitor overfitting and evaluate the final model.
46 |
47 | Five metrics are computed on the reserved set of edges, including mean reciprocal rank (MRR, the average of the reciprocal of the ranks of all positives), R1 (the fraction of positives that rank better than all their negatives, i.e., have a rank of 1), R10 (the fraction of positives that rank in the top 10 among their negatives), R50 (the fraction of positives that rank in the top 50 among their negatives), and AUC (Area Under the Curve).
48 |
49 | By default, we show MRR along with training loss and validation loss while other metrics are also available in the SIMBA package. The learning curves for validation loss and these metrics can be used to determine when training has completed. The relative values of training and validation loss along with these evaluation metrics can be used to identify issues with training (underfitting vs overfitting) and tune the hyperparameters weight decay, embedding dimension, and number of training epochs appropriately. However, for most datasets we find that the default parameters do not need tuning.
50 |
51 | Softmax transformation
52 | ~~~~~~~~~~~~~~~~~~~~~~
53 | PyTorch-BigGraph training provides initial embeddings of all entities (nodes). However, entities of different types (e.g., cells vs peaks, cells of different batches or modalities) have different edge distributions and thus may lie on different manifolds of the latent space. To make the embeddings of entities of different types comparable, we transform the embeddings of features with Softmax function by utilizing the first-order similarity between cells (reference) and features (query). In the case of batch correction or multi-omics integration, the SoftMax transformation is also performed based on the first-order similarity between cells of different batches or modalities.
54 |
--------------------------------------------------------------------------------
/docs/source/Citation.rst:
--------------------------------------------------------------------------------
1 | Citation
2 | ========
3 |
4 | Chen, H., Ryu, J., Vinyard, M. E., Lerer, A., & Pinello, L. (2023). SIMBA: SIngle-cell eMBedding Along with features. *Nature Methods*, 1-11.
5 |
6 | Please check out our `manuscript `_ to learn more.
--------------------------------------------------------------------------------
/docs/source/Installation.rst:
--------------------------------------------------------------------------------
1 | Installation
2 | ============
3 |
4 | Anaconda
5 | ~~~~~~~~
6 |
7 |
8 | For first-time *conda* users, perform a one-time set up of Bioconda with the following commands::
9 |
10 | conda config --add channels defaults
11 | conda config --add channels bioconda
12 | conda config --add channels conda-forge
13 | conda config --set channel_priority strict
14 |
15 |
16 | To install `simba `_ with conda, run::
17 |
18 | conda install -c bioconda simba
19 |
20 | **Recommended**: install *simba* in a new virtual environment::
21 |
22 | conda create -n env_simba simba
23 | conda activate env_simba
24 |
25 |
26 | Dev version
27 | ~~~~~~~~~~~
28 |
29 | To install the latest version on `GitHub `_,
30 |
31 | first install `simba_pbg `_ ::
32 |
33 | conda install -c bioconda simba_pbg
34 |
35 |
36 | then run::
37 |
38 | git clone https://github.com/huidongchen/simba.git
39 | pip install ./simba --user
40 |
41 | or::
42 |
43 | pip install git+https://github.com/huidongchen/simba
44 |
--------------------------------------------------------------------------------
/docs/source/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/source/Output.rst:
--------------------------------------------------------------------------------
1 | Output
2 | ======
3 |
4 | SIMBA result structure will look like this:
5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
6 |
7 | ::
8 |
9 | result_simba
10 | ├── figures
11 | └── pbg
12 | └── graph0
13 | ├── pbg_graph.txt
14 | ├── graph_stats.json
15 | ├── entity_alias.txt
16 | └── input
17 | ├── edge
18 | └── entity
19 | └── model0
20 | ├── config.json
21 | ├── training_stats.json
22 | ├── checkpoint_version.txt
23 | ├── embeddings.h5
24 | └── model.h5
25 | └── model1
26 | ├── config.json
27 | ├── training_stats.json
28 | ├── checkpoint_version.txt
29 | ├── embeddings.h5
30 | └── model.h5
31 | └── model2
32 | ├── config.json
33 | ├── training_stats.json
34 | ├── checkpoint_version.txt
35 | ├── embeddings.h5
36 | └── model.h5
37 | └── graph1
38 | ├── pbg_graph.txt
39 | ├── graph_stats.json
40 | ├── entity_alias.txt
41 | └── input
42 | ├── edge
43 | └── entity
44 | └── model
45 | ├── config.json
46 | ├── training_stats.json
47 | ├── checkpoint_version.txt
48 | ├── embeddings.h5
49 | └── model.h5
50 |
51 | By default, all figures will be saved under ``result_simba/figures``
52 |
53 | The PBG training results will be stored in the directory ``result_simba/pbg``. Inside this folder, each constructed graph will be saved in a distinct folder (by default ``result_simba/pbg/graph0``), and each model trained on that graph will be saved into a separate folder (by default ``result_simba/pbg/graph0/model``).
54 |
55 | Inside each graph folder (e.g., ``result_simba/pbg/graph0``):
56 |
57 | - ``pbg_graph.txt`` stores its edges on which PBG training is performed;
58 | - ``graph_stats.json`` stores the statistics associated with this graph;
59 | - ``entity_alias.txt`` keeps the mapping between the original entity IDs and their aliases.
60 | - ``input`` stores the extracted nodes (entities) and edges from ``pbg_graph.txt``, which are prepared for PBG training.
61 | - ``model`` stores the training result of one parameter configuration. (by default ``model``)
--------------------------------------------------------------------------------
/docs/source/Release notes.rst:
--------------------------------------------------------------------------------
1 | Release notes
2 | =============
--------------------------------------------------------------------------------
/docs/source/_ext/edit_on_github.py:
--------------------------------------------------------------------------------
1 | """
2 | Sphinx extension to add ReadTheDocs-style "Edit on GitHub" links to the
3 | sidebar.
4 | """
5 |
6 | import os
7 | import warnings
8 |
9 | __licence__ = "BSD (3 clause)"
10 |
11 |
12 | # def get_github_repo(app, path):
13 | # if path.endswith(".ipynb"):
14 | # return app.config.github_nb_repo, "/"
15 | # return app.config.github_repo, "/docs/source/"
16 |
17 |
def html_page_context(app, pagename, templatename, context, doctree):
    """Fill the template context so the theme shows an "Edit on GitHub" link.

    Notebook pages (``.ipynb``) point at the ``simba_tutorials`` repository
    (version-specific subfolder); all other pages point at the ``simba``
    repository's ``docs/source`` tree.
    """
    # Only regular pages get the link; skip e.g. search/genindex templates.
    if templatename != "page.html":
        return
    # Without a doctree there is no source file to link to.
    if doctree is None:
        return

    rel_path = os.path.relpath(doctree.get("source"), app.builder.srcdir)
    context["display_github"] = True
    context["github_user"] = "huidongchen"
    if rel_path.endswith(".ipynb"):
        context["github_repo"] = "simba_tutorials"
        context["github_version"] = "main"
        # The 1.3M-cell brain tutorial lives in the v1.1 folder,
        # everything else downloaded here comes from v1.0.
        is_v11 = rel_path.endswith("rna_10x_mouse_brain_1p3M.ipynb")
        context["conf_py_path"] = "/v1.1/" if is_v11 else "/v1.0/"
    else:
        context["github_repo"] = "simba"
        context["github_version"] = "master"
        context["conf_py_path"] = "/docs/source/"
39 |
def setup(app):
    """Sphinx extension entry point: register config values and the hook."""
    for config_name in ("github_nb_repo", "github_repo"):
        app.add_config_value(config_name, "", True)
    app.connect("html-page-context", html_page_context)
44 |
--------------------------------------------------------------------------------
/docs/source/_static/img/Figure1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huidongchen/simba/534e0b022ea1163face30263696f28b9a955c291/docs/source/_static/img/Figure1.png
--------------------------------------------------------------------------------
/docs/source/_static/img/lion_icon.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/source/_static/img/logo_simba.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huidongchen/simba/534e0b022ea1163face30263696f28b9a955c291/docs/source/_static/img/logo_simba.png
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# Make the package and the local extension directory importable so autodoc
# and the "edit_on_github" extension can be loaded.
import os
import sys
sys.path.insert(0, os.path.abspath('../simba'))
sys.path.insert(0, os.path.abspath('_ext'))
import simba  # noqa: E402


# -- Project information -----------------------------------------------------

project = 'SIMBA'
copyright = '2023, Huidong Chen'
author = 'Huidong Chen'

# The full version, including alpha/beta/rc tags
release = simba.__version__


# -- Retrieve notebooks (borrowed from scVelo) -------------------------------

from urllib.request import urlretrieve  # noqa: E402

notebooks_url = "https://github.com/huidongchen/simba_tutorials/raw/main/"
notebooks_v1_0 = [
    "atac_buenrostro2018_peaks_and_sequences.ipynb",
    "multiome_shareseq.ipynb",
    "multiome_shareseq_GRN.ipynb",
    "rna_mouse_atlas.ipynb",
    "rna_human_pancreas.ipynb",
    "multiome_10xpmbc10k_integration.ipynb",
]
notebooks_v1_1 = [
    "rna_10x_mouse_brain_1p3M.ipynb",
]
notebooks_v1_2 = [
    "rna_10xpmbc_all_genes_v1.2.ipynb",
    "rna_10xpmbc_edgeweigts.ipynb",
    'new_graph_generation.ipynb'
]

# Best-effort download of the tutorial notebooks next to the doc sources;
# network failures are ignored so offline/local builds still succeed.
for _version, _notebooks in (("v1.0", notebooks_v1_0),
                             ("v1.1", notebooks_v1_1),
                             ("v1.2", notebooks_v1_2)):
    for _nb in _notebooks:
        try:
            urlretrieve(notebooks_url + _version + "/" + _nb, _nb)
        except Exception:
            pass

# -- General configuration ---------------------------------------------------

needs_sphinx = "3.0"

extensions = [
    "sphinx.ext.autodoc",
    "sphinx.ext.autosummary",
    'sphinx.ext.napoleon',
    "sphinx.ext.intersphinx",
    "sphinx.ext.mathjax",
    "sphinx.ext.viewcode",
    "nbsphinx",
    "edit_on_github",
]

autosummary_generate = True

# Napoleon settings: docstrings follow the NumPy convention.
napoleon_google_docstring = False

# Paths that contain templates, relative to this directory.
templates_path = ['_templates']

# Patterns, relative to the source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['_build']

# -- Options for HTML output -------------------------------------------------

html_theme = 'sphinx_rtd_theme'
html_theme_options = {
    "navigation_depth": 1,
    "titles_only": True,
    'logo_only': True,
}
html_show_sphinx = False
html_logo = '_static/img/logo_simba.png'
html_favicon = '_static/img/lion_icon.svg'

# Repositories consumed by the "edit_on_github" extension (see _ext/).
github_repo = 'simba'
github_nb_repo = 'simba_tutorials'


# Paths that contain custom static files (such as style sheets); copied
# after the builtin static files, so a file named "default.css" here will
# overwrite the builtin "default.css".
html_static_path = ['_static']
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | |CI| |Docs| |Anaconda| |Install with conda| |Codecov| |Last updated| |Downloads| |License|
2 |
3 | **SIMBA**: **SI**\ ngle-cell e\ **MB**\ edding **A**\ long with features
4 | ========================================================================
5 |
6 | SIMBA is a method to embed cells along with their defining features such as gene expression, transcription factor binding sequences and chromatin accessibility peaks into the same latent space. The joint embedding of cells and features allows SIMBA to perform various types of single cell tasks, including but not limited to single-modal analysis (e.g. scRNA-seq and scATAC-seq analysis), multimodal analysis, batch correction, and multi-omic integration.
7 |
8 |
9 | .. image:: _static/img/Figure1.png
10 | :align: center
11 | :width: 600
12 | :alt: SIMBA overview
13 |
14 |
15 | .. toctree::
16 | :maxdepth: 2
17 | :caption: Overview
18 | :hidden:
19 |
20 | About SIMBA
21 | Installation
22 | API
23 | Release notes
24 | Citation
25 |
26 |
27 | .. toctree::
28 | :maxdepth: 1
29 | :caption: SIMBA primer
30 |
31 | Basic concepts
32 | Output
33 |
34 |
35 | .. toctree::
36 | :maxdepth: 1
37 | :caption: Tutorials
38 |
39 | rna_10xpmbc_all_genes_v1.2
40 | atac_buenrostro2018_peaks_and_sequences
41 | multiome_shareseq
42 | multiome_shareseq_GRN
43 | rna_mouse_atlas
44 | rna_human_pancreas
45 | multiome_10xpmbc10k_integration
46 | new_graph_generation
47 | rna_10xpmbc_edgeweigts
48 | rna_10x_mouse_brain_1p3M
49 |
50 |
51 | .. |Docs| image:: https://readthedocs.org/projects/simba-bio/badge/?version=latest
52 | :target: https://simba-bio.readthedocs.io
53 |
54 | .. |CI| image:: https://github.com/huidongchen/simba/actions/workflows/CI.yml/badge.svg
55 | :target: https://github.com/huidongchen/simba/actions/workflows/CI.yml
56 |
57 | .. |Anaconda| image:: https://anaconda.org/bioconda/simba/badges/version.svg
58 | :target: https://anaconda.org/bioconda/simba
59 |
60 | .. |Install with conda| image:: https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat
61 | :target: http://bioconda.github.io/recipes/simba/README.html
62 |
63 | .. |Last updated| image:: https://anaconda.org/bioconda/simba/badges/latest_release_date.svg
64 | :target: https://anaconda.org/bioconda/simba
65 |
66 | .. |License| image:: https://anaconda.org/bioconda/simba/badges/license.svg
67 | :target: https://github.com/pinellolab/simba/blob/master/LICENSE
68 |
69 | .. |Downloads| image:: https://anaconda.org/bioconda/simba/badges/downloads.svg
70 | :target: https://anaconda.org/bioconda/simba
71 |
72 | .. |Codecov| image:: https://codecov.io/gh/huidongchen/simba/branch/master/graph/badge.svg?token=ZUA70S1LUU
73 | :target: https://codecov.io/gh/huidongchen/simba
74 |
--------------------------------------------------------------------------------
/docs/source/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | python_files = 'test_*.py'
3 | testpaths = 'tests/'
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.17.0
2 | pandas>=1.0,!=1.1 # required by Anndata
3 | anndata>=0.7.4
4 | # h5py<3.0.0 # avoid byte strings but caused building errors
5 | # h5py>=3.4
6 | scikit-learn>=1.2
7 | scipy>=1.4
8 | kneed>=0.7
9 | seaborn>=0.11
10 | matplotlib>=3.3
11 | scikit-misc>=0.1.3
12 | adjusttext>=0.7.3
13 | umap-learn>=0.3.0
14 | #plotly>=4.14.0
15 | pybedtools>=0.8.0
16 | # bedtools>=2.29.0 # not available in pip
17 | tables
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import sys

# Fail fast on unsupported interpreters before touching setuptools.
if sys.version_info < (3, 7):
    sys.exit('simba requires Python >= 3.7')

from setuptools import setup, find_packages
from pathlib import Path

# Read __version__ from simba/_version.py without importing the package
# (importing would require its runtime dependencies at install time).
version = {}
with open("simba/_version.py") as fp:
    exec(fp.read(), version)


setup(
    name='simba',
    version=version['__version__'],
    author='Huidong Chen',
    # Fixed typo: this keyword was previously misspelled 'athor_email',
    # so the author email never made it into the package metadata.
    author_email='hd7chen AT gmail DOT com',
    license='BSD',
    description='SIngle-cell eMBedding Along with features',
    long_description=Path('README.md').read_text('utf-8'),
    long_description_content_type="text/markdown",
    url='https://github.com/pinellolab/simba',
    packages=find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: BSD License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.7',
    # Strip comment-only lines, inline "# ..." comments, and blank lines
    # from requirements.txt; previously those were passed through verbatim
    # as (invalid) requirement strings.
    install_requires=[
        line.split('#', 1)[0].strip() for line in
        Path('requirements.txt').read_text('utf-8').splitlines()
        if line.split('#', 1)[0].strip()
    ],
    include_package_data=True,
    package_data={"simba": ["data/gene_anno/*.bed"]}
)
38 |
--------------------------------------------------------------------------------
/simba/__init__.py:
--------------------------------------------------------------------------------
1 | """SIngle-cell eMBedding Along with features"""
2 |
3 | from ._settings import settings
4 | from . import preprocessing as pp
5 | from . import tools as tl
6 | from . import plotting as pl
7 | from .readwrite import *
8 | from . import datasets
9 | from ._version import __version__
10 |
11 |
12 | import sys
13 | # needed when building doc (borrowed from scanpy)
14 | sys.modules.update(
15 | {f'{__name__}.{m}': globals()[m] for m in ['tl', 'pp', 'pl']})
16 |
--------------------------------------------------------------------------------
/simba/_settings.py:
--------------------------------------------------------------------------------
1 | """Configuration for SIMBA"""
2 |
3 | import os
4 | import seaborn as sns
5 | import matplotlib as mpl
6 |
7 |
class SimbaConfig:
    """Configuration class for SIMBA.

    Holds the global state shared across the package: the working
    directory, figure-saving behavior, parallelism, PBG training
    parameters, and per-graph statistics.
    """

    def __init__(self,
                 workdir='./result_simba',
                 save_fig=False,
                 n_jobs=1):
        # Directory under which all results (figures, PBG output) are saved.
        self.workdir = workdir
        # Whether plotting functions save figures by default.
        self.save_fig = save_fig
        # Number of parallel jobs for computations.
        self.n_jobs = n_jobs
        # Initialize PBG training parameters with the default configuration.
        self.set_pbg_params()
        # Statistics of constructed graphs.
        self.graph_stats = dict()

    def set_figure_params(self,
                          context='notebook',
                          style='white',
                          palette='deep',
                          font='sans-serif',
                          font_scale=1.1,
                          color_codes=True,
                          dpi=80,
                          dpi_save=150,
                          fig_size=(5.4, 4.8),
                          rc=None):
        """ Set global parameters for figures. Modified from sns.set()

        Parameters
        ----------
        context : string or dict
            Plotting context parameters, see `seaborn.plotting_context`
        style: `string`,optional (default: 'white')
            Axes style parameters, see `seaborn.axes_style`
        palette : string or sequence
            Color palette, see `seaborn.color_palette`
        font_scale: `float`, optional (default: 1.1)
            Separate scaling factor to independently
            scale the size of the font elements.
        color_codes : `bool`, optional (default: True)
            If ``True`` and ``palette`` is a seaborn palette,
            remap the shorthand color codes (e.g. "b", "g", "r", etc.)
            to the colors from this palette.
        dpi: `int`,optional (default: 80)
            Resolution of rendered figures.
        dpi_save: `int`,optional (default: 150)
            Resolution of saved figures.
        fig_size: `tuple`, optional (default: (5.4, 4.8))
            Default figure size (width, height) in inches.
            A list is accepted as well.
        rc: `dict`,optional (default: None)
            rc settings properties.
            Parameter mappings to override the values in the preset style.
            Please see "`matplotlibrc file
            `__"
        """
        # NOTE: default changed from a list to a tuple to avoid the
        # mutable-default-argument pitfall; callers may still pass lists.
        sns.set(context=context,
                style=style,
                palette=palette,
                font=font,
                font_scale=font_scale,
                color_codes=color_codes,
                rc={'figure.dpi': dpi,
                    'savefig.dpi': dpi_save,
                    'figure.figsize': fig_size,
                    'image.cmap': 'viridis',
                    'lines.markersize': 6,
                    'legend.columnspacing': 0.1,
                    'legend.borderaxespad': 0.1,
                    'legend.handletextpad': 0.1,
                    'pdf.fonttype': 42,
                    })
        if rc is not None:
            assert isinstance(rc, dict), "rc must be dict"
            # Apply only recognized matplotlib rc keys; reject the rest
            # explicitly rather than silently ignoring them.
            for key, value in rc.items():
                if key in mpl.rcParams.keys():
                    mpl.rcParams[key] = value
                else:
                    raise Exception("unrecognized property '%s'" % key)

    def set_workdir(self, workdir=None):
        """Set working directory.

        Creates the directory if it does not exist and resets the PBG
        parameters to the default configuration.

        Parameters
        ----------
        workdir: `str`, optional (default: None)
            Working directory. If None, the current ``workdir`` is kept.

        Returns
        -------
        """
        if workdir is None:
            workdir = self.workdir
            print("Using default working directory.")
        if not os.path.exists(workdir):
            os.makedirs(workdir)
        self.workdir = workdir
        # A new workdir invalidates any previously customized paths.
        self.set_pbg_params()
        print('Saving results in: %s' % workdir)

    def set_pbg_params(self, config=None):
        """Set PBG parameters

        Parameters
        ----------
        config : `dict`, optional (default: None)
            PBG training configuration parameters.
            By default it resets parameters to the default setting.

        Returns
        -------
        """
        if config is None:
            config = dict(
                # I/O data
                entity_path="",
                edge_paths=["", ],
                checkpoint_path="",

                # Graph structure
                entities={},
                relations=[],
                dynamic_relations=False,

                # Scoring model
                dimension=50,
                global_emb=False,
                comparator='dot',

                # Training
                num_epochs=10,
                workers=4,
                num_batch_negs=50,
                num_uniform_negs=50,
                loss_fn='softmax',
                lr=0.1,

                early_stopping=False,
                regularization_coef=0.0,
                wd=0.0,
                wd_interval=50,

                # Evaluation during training
                eval_fraction=0.05,
                eval_num_batch_negs=50,
                eval_num_uniform_negs=50,

                checkpoint_preservation_interval=None,
            )
        assert isinstance(config, dict), "`config` must be dict"
        self.pbg_params = config


# Module-level singleton used throughout the package.
settings = SimbaConfig()
157 |
--------------------------------------------------------------------------------
/simba/_utils.py:
--------------------------------------------------------------------------------
1 | """Utility functions and classes"""
2 |
3 | import numpy as np
4 | from kneed import KneeLocator
5 | import tables
6 | from anndata import AnnData
7 |
8 |
def locate_elbow(x, y, S=10, min_elbow=0,
                 curve='convex', direction='decreasing', online=False,
                 **kwargs):
    """Detect the elbow (knee) point of a curve.

    Parameters
    ----------
    x : `array-like`
        x values
    y : `array-like`
        y values
    S : `float`, optional (default: 10)
        Sensitivity
    min_elbow: `int`, optional (default: 0)
        The minimum elbow location; points before it are excluded
        from the search.
    curve: `str`, optional (default: 'convex')
        Choose from {'convex','concave'}
        If 'concave', algorithm will detect knees,
        If 'convex', algorithm will detect elbows.
    direction: `str`, optional (default: 'decreasing')
        Choose from {'decreasing','increasing'}
    online: `bool`, optional (default: False)
        kneed will correct old knee points if True,
        kneed will return first knee if False.
    **kwargs: `dict`, optional
        Extra arguments to KneeLocator.

    Returns
    -------
    elbow: `int`
        Elbow point; ``len(y)`` when no elbow is detected.
    """
    start = int(min_elbow)
    locator = KneeLocator(
        x[start:], y[start:],
        S=S,
        curve=curve,
        direction=direction,
        online=online,
        **kwargs,
    )
    # Fall back to the full length when kneed finds no elbow.
    return len(y) if locator.elbow is None else int(locator.elbow)
52 |
53 |
54 | # modified from
55 | # scanpy https://github.com/theislab/scanpy/blob/master/scanpy/readwrite.py
def _read_legacy_10x_h5(filename, genome=None):
    """
    Read hdf5 file from Cell Ranger v2 or earlier versions.

    Parameters
    ----------
    filename: `str`
        Path to the 10x-formatted HDF5 file.
    genome: `str`, optional (default: None)
        Genome group to read; required when the file contains more
        than one genome.

    Returns
    -------
    adata: `AnnData`
        Annotated data matrix.
    """
    with tables.open_file(str(filename), 'r') as f:
        try:
            children = [x._v_name for x in f.list_nodes(f.root)]
            if not genome:
                if len(children) > 1:
                    # Fixed: the f-string previously contained the literal
                    # text '(unknown)' instead of interpolating `filename`.
                    raise ValueError(
                        f"'{filename}' contains more than one genome. "
                        "For legacy 10x h5 "
                        "files you must specify the genome "
                        "if more than one is present. "
                        f"Available genomes are: {children}"
                    )
                genome = children[0]
            elif genome not in children:
                raise ValueError(
                    f"Could not find genome '{genome}' in '{filename}'. "
                    f'Available genomes are: {children}'
                )
            dsets = {}
            for node in f.walk_nodes('/' + genome, 'Array'):
                dsets[node.name] = node.read()
            # AnnData works with csr matrices
            # 10x stores the transposed data, so we do the transposition
            from scipy.sparse import csr_matrix

            M, N = dsets['shape']
            data = dsets['data']
            if dsets['data'].dtype == np.dtype('int32'):
                # Reinterpret the int32 buffer as float32 in place, then
                # overwrite with the numeric values (avoids an extra copy).
                data = dsets['data'].view('float32')
                data[:] = dsets['data']
            matrix = csr_matrix(
                (data, dsets['indices'], dsets['indptr']),
                shape=(N, M),
            )
            # the csc matrix is automatically the transposed csr matrix
            # as scanpy expects it, so, no need for a further transpostion
            adata = AnnData(
                matrix,
                obs=dict(obs_names=dsets['barcodes'].astype(str)),
                var=dict(
                    var_names=dsets['gene_names'].astype(str),
                    gene_ids=dsets['genes'].astype(str),
                ),
            )
            return adata
        except KeyError as exc:
            # Chain the original KeyError so the missing dataset is visible.
            raise Exception(
                'File is missing one or more required datasets.') from exc
107 |
108 |
109 | # modified from
110 | # scanpy https://github.com/theislab/scanpy/blob/master/scanpy/readwrite.py
def _read_v3_10x_h5(filename):
    """
    Read hdf5 file from Cell Ranger v3 or later versions.

    Returns an AnnData object built from the '/matrix' group.
    """
    with tables.open_file(str(filename), 'r') as f:
        try:
            from scipy.sparse import csr_matrix

            # Collect every array under /matrix keyed by its node name.
            dsets = {node.name: node.read()
                     for node in f.walk_nodes('/matrix', 'Array')}

            n_vars, n_obs = dsets['shape']
            values = dsets['data']
            if values.dtype == np.dtype('int32'):
                # Reinterpret the int32 buffer as float32 in place, then
                # overwrite it with the numeric values.
                values = dsets['data'].view('float32')
                values[:] = dsets['data']
            # 10x stores genes x cells; building with the swapped shape
            # yields the cells x genes layout AnnData expects.
            matrix = csr_matrix(
                (values, dsets['indices'], dsets['indptr']),
                shape=(n_obs, n_vars),
            )
            return AnnData(
                matrix,
                obs=dict(obs_names=dsets['barcodes'].astype(str)),
                var=dict(
                    var_names=dsets['name'].astype(str),
                    gene_ids=dsets['id'].astype(str),
                    feature_types=dsets['feature_type'].astype(str),
                    genome=dsets['genome'].astype(str),
                ),
            )
        except KeyError:
            raise Exception('File is missing one or more required datasets.')
144 |
--------------------------------------------------------------------------------
/simba/_version.py:
--------------------------------------------------------------------------------
1 | """Version information"""
2 |
3 | __version__ = "1.2"
4 |
--------------------------------------------------------------------------------
/simba/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | """Builtin Datasets."""
2 |
3 | from ._datasets import (
4 | rna_10xpmbc3k,
5 | rna_han2018,
6 | rna_tmc2018,
7 | rna_baron2016,
8 | rna_muraro2016,
9 | rna_segerstolpe2016,
10 | rna_wang2016,
11 | rna_xin2016,
12 | atac_buenrostro2018,
13 | atac_10xpbmc5k,
14 | atac_chen2019,
15 | atac_cusanovich2018_subset,
16 | multiome_ma2020_fig4,
17 | multiome_chen2019,
18 | multiome_10xpbmc10k
19 | )
20 |
--------------------------------------------------------------------------------
/simba/datasets/_datasets.py:
--------------------------------------------------------------------------------
1 | import urllib.request
2 | from tqdm import tqdm
3 | import os
4 |
5 | from .._settings import settings
6 | from ..readwrite import read_h5ad
7 |
8 |
class DownloadProgressBar(tqdm):
    """tqdm progress bar adapted to urllib's ``reporthook`` protocol."""

    def update_to(self, b=1, bsize=1, tsize=None):
        # reporthook supplies (blocks transferred, block size, total size).
        if tsize is not None:
            self.total = tsize
        # Advance by the difference between bytes downloaded so far
        # and the bar's current position.
        self.update(b * bsize - self.n)
17 |
18 |
def download_url(url, output_path, desc=None):
    """Download ``url`` to ``output_path`` with a progress bar.

    ``desc`` labels the bar; it defaults to the URL's final path segment.
    """
    if desc is None:
        desc = url.split('/')[-1]
    with DownloadProgressBar(unit='B',
                             unit_scale=True,
                             miniters=1,
                             desc=desc) as bar:
        urllib.request.urlretrieve(url,
                                   filename=output_path,
                                   reporthook=bar.update_to)
34 |
35 |
36 | def rna_10xpmbc3k():
37 | """10X human peripheral blood mononuclear cells (PBMCs) scRNA-seq data
38 |
39 | Returns
40 | -------
41 | adata: `AnnData`
42 | Anndata object
43 | """
44 | url = 'https://www.dropbox.com/s/087wuliddmbp3oe/rna_seq.h5ad?dl=1'
45 | filename = 'rna_10xpmbc3k.h5ad'
46 | filepath = os.path.join(settings.workdir, 'data')
47 | fullpath = os.path.join(filepath, filename)
48 | if(not os.path.exists(fullpath)):
49 | print('Downloading data ...')
50 | os.makedirs(filepath, exist_ok=True)
51 | download_url(url,
52 | fullpath,
53 | desc=filename)
54 | print(f'Downloaded to {filepath}.')
55 | adata = read_h5ad(fullpath)
56 | return adata
57 |
58 |
59 | def rna_han2018():
60 | """single-cell microwell-seq mouse cell atlas data
61 |
62 | ref: Han, X. et al. Mapping the mouse cell atlas by microwell-seq.
63 | Cell 172, 1091-1107. e1017 (2018).
64 |
65 | Returns
66 | -------
67 | adata: `AnnData`
68 | Anndata object
69 | """
70 | url = 'https://www.dropbox.com/s/nxbszjbir44g99n/rna_seq_mi.h5ad?dl=1'
71 | filename = 'rna_han2018.h5ad'
72 | filepath = os.path.join(settings.workdir, 'data')
73 | fullpath = os.path.join(filepath, filename)
74 | if(not os.path.exists(fullpath)):
75 | print('Downloading data ...')
76 | os.makedirs(filepath, exist_ok=True)
77 | download_url(url,
78 | fullpath,
79 | desc=filename)
80 | print(f'Downloaded to {filepath}.')
81 | adata = read_h5ad(fullpath)
82 | return adata
83 |
84 |
85 | def rna_tmc2018():
86 | """single-cell Smart-Seq2 mouse cell atlas data
87 |
88 | ref: Tabula Muris Consortium. Single-cell transcriptomics of 20 mouse
89 | organs creates a Tabula Muris. Nature 562, 367-372 (2018).
90 |
91 | Returns
92 | -------
93 | adata: `AnnData`
94 | Anndata object
95 | """
96 | url = 'https://www.dropbox.com/s/rnpyp6vfpuiptkz/rna_seq_sm.h5ad?dl=1'
97 | filename = 'rna_tmc2018.h5ad'
98 | filepath = os.path.join(settings.workdir, 'data')
99 | fullpath = os.path.join(filepath, filename)
100 | if(not os.path.exists(fullpath)):
101 | print('Downloading data ...')
102 | os.makedirs(filepath, exist_ok=True)
103 | download_url(url,
104 | fullpath,
105 | desc=filename)
106 | print(f'Downloaded to {filepath}.')
107 | adata = read_h5ad(fullpath)
108 | return adata
109 |
110 |
111 | def rna_baron2016():
112 | """single-cell RNA-seq human pancreas data
113 |
114 | ref: Baron, M. et al. A single-cell transcriptomic map of the human and
115 | mouse pancreas reveals inter-and intra-cell population structure. Cell
116 | systems 3, 346-360. e344 (2016)
117 |
118 | Returns
119 | -------
120 | adata: `AnnData`
121 | Anndata object
122 | """
123 | url = 'https://www.dropbox.com/s/bvziclu6d3fdzow/rna_seq_baron.h5ad?dl=1'
124 | filename = 'rna_baron2016.h5ad'
125 | filepath = os.path.join(settings.workdir, 'data')
126 | fullpath = os.path.join(filepath, filename)
127 | if(not os.path.exists(fullpath)):
128 | print('Downloading data ...')
129 | os.makedirs(filepath, exist_ok=True)
130 | download_url(url,
131 | fullpath,
132 | desc=filename)
133 | print(f'Downloaded to {filepath}.')
134 | adata = read_h5ad(fullpath)
135 | return adata
136 |
137 |
138 | def rna_muraro2016():
139 | """single-cell RNA-seq human pancreas data
140 |
141 | ref: Muraro, M.J. et al. A single-cell transcriptome atlas of the
142 | human pancreas.Cell systems 3, 385-394. e383 (2016).
143 |
144 | Returns
145 | -------
146 | adata: `AnnData`
147 | Anndata object
148 | """
149 | url = 'https://www.dropbox.com/s/ginc9rbo4qmobwx/rna_seq_muraro.h5ad?dl=1'
150 | filename = 'rna_muraro2016.h5ad'
151 | filepath = os.path.join(settings.workdir, 'data')
152 | fullpath = os.path.join(filepath, filename)
153 | if(not os.path.exists(fullpath)):
154 | print('Downloading data ...')
155 | os.makedirs(filepath, exist_ok=True)
156 | download_url(url,
157 | fullpath,
158 | desc=filename)
159 | print(f'Downloaded to {filepath}.')
160 | adata = read_h5ad(fullpath)
161 | return adata
162 |
163 |
def rna_segerstolpe2016():
    """Human pancreas single-cell RNA-seq data.

    ref: Segerstolpe, Å. et al. Single-cell transcriptome profiling of human
    pancreatic islets in health and type 2 diabetes.
    Cell metabolism 24, 593-607 (2016).

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    filename = 'rna_segerstolpe2016.h5ad'
    url = 'https://www.dropbox.com/s/qomnf4860jwm9pd/rna_seq_segerstolpe.h5ad?dl=1'
    filepath = os.path.join(settings.workdir, 'data')
    fullpath = os.path.join(filepath, filename)
    # Download only once; subsequent calls reuse the cached local copy.
    if not os.path.exists(fullpath):
        print('Downloading data ...')
        os.makedirs(filepath, exist_ok=True)
        download_url(url, fullpath, desc=filename)
        print(f'Downloaded to {filepath}.')
    return read_h5ad(fullpath)
189 |
190 |
def rna_wang2016():
    """Human pancreas single-cell RNA-seq data.

    ref: Wang, Y.J. et al. Single-cell transcriptomics of the human endocrine
    pancreas. Diabetes 65, 3028-3038 (2016).

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    filename = 'rna_wang2016.h5ad'
    url = 'https://www.dropbox.com/s/9tv44nugwpx9t4c/rna_seq_wang.h5ad?dl=1'
    filepath = os.path.join(settings.workdir, 'data')
    fullpath = os.path.join(filepath, filename)
    # Download only once; subsequent calls reuse the cached local copy.
    if not os.path.exists(fullpath):
        print('Downloading data ...')
        os.makedirs(filepath, exist_ok=True)
        download_url(url, fullpath, desc=filename)
        print(f'Downloaded to {filepath}.')
    return read_h5ad(fullpath)
215 |
216 |
def rna_xin2016():
    """Human pancreas single-cell RNA-seq data.

    ref: Xin, Y. et al. RNA sequencing of single human islet cells reveals
    type 2 diabetes genes. Cell metabolism 24, 608-615 (2016).

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    filename = 'rna_xin2016.h5ad'
    url = 'https://www.dropbox.com/s/j483i47mxty6rzo/rna_seq_xin.h5ad?dl=1'
    filepath = os.path.join(settings.workdir, 'data')
    fullpath = os.path.join(filepath, filename)
    # Download only once; subsequent calls reuse the cached local copy.
    if not os.path.exists(fullpath):
        print('Downloading data ...')
        os.makedirs(filepath, exist_ok=True)
        download_url(url, fullpath, desc=filename)
        print(f'Downloaded to {filepath}.')
    return read_h5ad(fullpath)
241 |
242 |
def atac_buenrostro2018():
    """Human blood single-cell ATAC-seq data.

    ref: Buenrostro, J.D. et al. Integrated Single-Cell Analysis Maps the
    Continuous RegulatoryLandscape of Human Hematopoietic Differentiation.
    Cell 173, 1535-1548 e1516 (2018).

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    filename = 'atac_buenrostro2018.h5ad'
    url = 'https://www.dropbox.com/s/7hxjqgdxtbna1tm/atac_seq.h5ad?dl=1'
    filepath = os.path.join(settings.workdir, 'data')
    fullpath = os.path.join(filepath, filename)
    # Download only once; subsequent calls reuse the cached local copy.
    if not os.path.exists(fullpath):
        print('Downloading data ...')
        os.makedirs(filepath, exist_ok=True)
        download_url(url, fullpath, desc=filename)
        print(f'Downloaded to {filepath}.')
    return read_h5ad(fullpath)
268 |
269 |
def atac_10xpbmc5k():
    """10X human peripheral blood mononuclear cells (PBMCs) scATAC-seq data.

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    filename = 'atac_10xpbmc5k.h5ad'
    url = 'https://www.dropbox.com/s/xa8u7rlskc5h7iv/atac_seq.h5ad?dl=1'
    filepath = os.path.join(settings.workdir, 'data')
    fullpath = os.path.join(filepath, filename)
    # Download only once; subsequent calls reuse the cached local copy.
    if not os.path.exists(fullpath):
        print('Downloading data ...')
        os.makedirs(filepath, exist_ok=True)
        download_url(url, fullpath, desc=filename)
        print(f'Downloaded to {filepath}.')
    return read_h5ad(fullpath)
291 |
292 |
def atac_cusanovich2018_subset():
    """Downsampled sci-ATAC-seq mouse tissue data.

    ref: Cusanovich, D.A. et al. A Single-Cell Atlas of In Vivo Mammalian
    Chromatin Accessibility. Cell 174, 1309-1324 e1318 (2018).

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    filename = 'atac_cusanovich2018_subset.h5ad'
    url = 'https://www.dropbox.com/s/e8iqwm93m33i5wt/atac_seq.h5ad?dl=1'
    filepath = os.path.join(settings.workdir, 'data')
    fullpath = os.path.join(filepath, filename)
    # Download only once; subsequent calls reuse the cached local copy.
    if not os.path.exists(fullpath):
        print('Downloading data ...')
        os.makedirs(filepath, exist_ok=True)
        download_url(url, fullpath, desc=filename)
        print(f'Downloaded to {filepath}.')
    return read_h5ad(fullpath)
317 |
318 |
def atac_chen2019():
    """Simulated scATAC-seq bone marrow data with a noise level of 0.4
    and a coverage of 2500 fragments.

    ref: Chen, H. et al. Assessment of computational methods for the analysis
    of single-cell ATAC-seq data. Genome Biology 20, 241 (2019).

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    filename = 'atac_chen2019.h5ad'
    url = 'https://www.dropbox.com/s/fthhh3mz5b39d4y/atac_seq.h5ad?dl=1'
    filepath = os.path.join(settings.workdir, 'data')
    fullpath = os.path.join(filepath, filename)
    # Download only once; subsequent calls reuse the cached local copy.
    if not os.path.exists(fullpath):
        print('Downloading data ...')
        os.makedirs(filepath, exist_ok=True)
        download_url(url, fullpath, desc=filename)
        print(f'Downloaded to {filepath}.')
    return read_h5ad(fullpath)
344 |
345 |
def multiome_ma2020_fig4():
    """Single cell multiome mouse skin data (SHARE-seq).

    ref: Ma, S. et al. Chromatin Potential Identified by Shared Single-Cell
    Profiling of RNA and Chromatin. Cell (2020).

    Returns
    -------
    dict_adata: `dict`
        A dictionary of anndata objects
    """
    filepath = os.path.join(settings.workdir, 'data')
    # One (url, local filename) pair per modality; insertion order fixes
    # both the download order and the keys of the returned dict.
    sources = {
        'rna': ('https://www.dropbox.com/s/gmmf77l8kzle6o7/rna_seq_fig4.h5ad?dl=1',
                'multiome_ma2020_fig4_rna.h5ad'),
        'atac': ('https://www.dropbox.com/s/ts0v2y2m5fcumcb/atac_seq_fig4.h5ad?dl=1',
                 'multiome_ma2020_fig4_atac.h5ad'),
    }
    dict_adata = dict()
    for modality, (url, filename) in sources.items():
        fullpath = os.path.join(filepath, filename)
        # Download only once; subsequent calls reuse the cached local copy.
        if not os.path.exists(fullpath):
            print('Downloading data ...')
            os.makedirs(filepath, exist_ok=True)
            download_url(url, fullpath, desc=filename)
            print(f'Downloaded to {filepath}.')
        dict_adata[modality] = read_h5ad(fullpath)
    return dict_adata
384 |
385 |
def multiome_chen2019():
    """Single cell multiome neonatal mouse cerebral cortex data (SNARE-seq).

    ref: Chen, S., Lake, B.B. & Zhang, K. High-throughput sequencing of the
    transcriptome and chromatin accessibility in the same cell.
    Nat Biotechnol (2019).

    Returns
    -------
    dict_adata: `dict`
        A dictionary of anndata objects
    """
    filepath = os.path.join(settings.workdir, 'data')
    # One (url, local filename) pair per modality; insertion order fixes
    # both the download order and the keys of the returned dict.
    sources = {
        'rna': ('https://www.dropbox.com/s/b1bbcs500q0pigt/rna_seq.h5ad?dl=1',
                'multiome_chen2019_rna.h5ad'),
        'atac': ('https://www.dropbox.com/s/ljepkfber68pdvc/atac_seq.h5ad?dl=1',
                 'multiome_chen2019_atac.h5ad'),
    }
    dict_adata = dict()
    for modality, (url, filename) in sources.items():
        fullpath = os.path.join(filepath, filename)
        # Download only once; subsequent calls reuse the cached local copy.
        if not os.path.exists(fullpath):
            print('Downloading data ...')
            os.makedirs(filepath, exist_ok=True)
            download_url(url, fullpath, desc=filename)
            print(f'Downloaded to {filepath}.')
        dict_adata[modality] = read_h5ad(fullpath)
    return dict_adata
425 |
426 |
def multiome_10xpbmc10k():
    """Single cell 10X human peripheral blood mononuclear cells (PBMCs)
    multiome data.

    Returns
    -------
    dict_adata: `dict`
        A dictionary of anndata objects
    """
    filepath = os.path.join(settings.workdir, 'data')
    # One (url, local filename) pair per modality; insertion order fixes
    # both the download order and the keys of the returned dict.
    sources = {
        'rna': ('https://www.dropbox.com/s/zwlim6vljnbfp43/rna_seq.h5ad?dl=1',
                'multiome_10xpbmc10k_rna.h5ad'),
        'atac': ('https://www.dropbox.com/s/163msz0k9hkfrt7/atac_seq.h5ad?dl=1',
                 'multiome_10xpbmc10k_atac.h5ad'),
    }
    dict_adata = dict()
    for modality, (url, filename) in sources.items():
        fullpath = os.path.join(filepath, filename)
        # Download only once; subsequent calls reuse the cached local copy.
        if not os.path.exists(fullpath):
            print('Downloading data ...')
            os.makedirs(filepath, exist_ok=True)
            download_url(url, fullpath, desc=filename)
            print(f'Downloaded to {filepath}.')
        dict_adata[modality] = read_h5ad(fullpath)
    return dict_adata
463 |
--------------------------------------------------------------------------------
/simba/plotting/__init__.py:
--------------------------------------------------------------------------------
1 | """Plotting"""
2 |
3 | from ._plot import (
4 | pca_variance_ratio,
5 | pcs_features,
6 | variable_genes,
7 | violin,
8 | hist,
9 | umap,
10 | discretize,
11 | node_similarity,
12 | svd_nodes,
13 | )
14 | from ._post_training import (
15 | pbg_metrics,
16 | entity_metrics,
17 | entity_barcode,
18 | query
19 | )
20 |
--------------------------------------------------------------------------------
/simba/plotting/_palettes.py:
--------------------------------------------------------------------------------
1 | """Color palettes in addition to matplotlib's palettes.
2 | This is modifed from
3 | scanpy palettes https://github.com/theislab/scanpy/blob/master/scanpy/plotting/palettes.py # noqa
4 | """
5 |
6 | from matplotlib import cm, colors
7 |
# 10-color palette: matplotlib's tab10 ('vega_10'), with three entries
# swapped for colorblindness-friendly alternatives (scanpy's adjustment).
# See https://github.com/theislab/scanpy/issues/387
vega_10 = list(map(colors.to_hex, cm.tab10.colors))
vega_10_scanpy = vega_10.copy()
vega_10_scanpy[2] = "#279e68"  # green
vega_10_scanpy[4] = "#aa40fc"  # purple
vega_10_scanpy[8] = "#b5bd61"  # khaki

# default matplotlib 2.0 palette
# see 'category20' on https://github.com/vega/vega/wiki/Scales#scale-range-literals # noqa
vega_20 = list(map(colors.to_hex, cm.tab20.colors))

# reordered, some removed, some added
# (dark shades first, then light shades, grey dropped, two manual colors)
vega_20_scanpy = [
    *vega_20[0:14:2],
    *vega_20[16::2],  # dark without grey
    *vega_20[1:15:2],
    *vega_20[17::2],  # light without grey
    "#ad494a",
    "#8c6d31",  # manual additions
]
# Keep the colorblindness-adjusted entries consistent with vega_10_scanpy.
vega_20_scanpy[2] = vega_10_scanpy[2]
vega_20_scanpy[4] = vega_10_scanpy[4]
vega_20_scanpy[7] = vega_10_scanpy[8]  # khaki shifted by missing grey
# TODO: also replace pale colors if necessary

# Default palette for annotations with up to 20 categories.
default_20 = vega_20_scanpy
35 |
# 28-color palette for medium-cardinality categorical annotations.
# https://graphicdesign.stackexchange.com/questions/3682/where-can-i-find-a-large-palette-set-of-contrasting-colors-for-coloring-many-d
# update 1
# orig reference http://epub.wu.ac.at/1692/1/document.pdf
zeileis_28 = [
    "#023fa5",
    "#7d87b9",
    "#bec1d4",
    "#d6bcc0",
    "#bb7784",
    "#8e063b",
    "#4a6fe3",
    "#8595e1",
    "#b5bbe3",
    "#e6afb9",
    "#e07b91",
    "#d33f6a",
    "#11c638",
    "#8dd593",
    "#c6dec7",
    "#ead3c6",
    "#f0b98d",
    "#ef9708",
    "#0fcfc0",
    "#9cded6",
    "#d5eae7",
    "#f3e1eb",
    "#f6c4e1",
    "#f79cd4",
    "#7f7f7f",
    "#c7c7c7",
    "#1CE6FF",
    "#336600",  # these last ones were added,
]

# Default palette for annotations with up to 28 categories.
default_28 = zeileis_28
71 |
# 102 maximally-distinct colors for very-high-cardinality annotations.
# from http://godsnotwheregodsnot.blogspot.de/2012/09/color-distribution-methodology.html # noqa
godsnot_102 = [
    # "#000000",
    # remove the black, as often, we have black colored annotation
    "#FFFF00",
    "#1CE6FF",
    "#FF34FF",
    "#FF4A46",
    "#008941",
    "#006FA6",
    "#A30059",
    "#FFDBE5",
    "#7A4900",
    "#0000A6",
    "#63FFAC",
    "#B79762",
    "#004D43",
    "#8FB0FF",
    "#997D87",
    "#5A0007",
    "#809693",
    "#6A3A4C",
    "#1B4400",
    "#4FC601",
    "#3B5DFF",
    "#4A3B53",
    "#FF2F80",
    "#61615A",
    "#BA0900",
    "#6B7900",
    "#00C2A0",
    "#FFAA92",
    "#FF90C9",
    "#B903AA",
    "#D16100",
    "#DDEFFF",
    "#000035",
    "#7B4F4B",
    "#A1C299",
    "#300018",
    "#0AA6D8",
    "#013349",
    "#00846F",
    "#372101",
    "#FFB500",
    "#C2FFED",
    "#A079BF",
    "#CC0744",
    "#C0B9B2",
    "#C2FF99",
    "#001E09",
    "#00489C",
    "#6F0062",
    "#0CBD66",
    "#EEC3FF",
    "#456D75",
    "#B77B68",
    "#7A87A1",
    "#788D66",
    "#885578",
    "#FAD09F",
    "#FF8A9A",
    "#D157A0",
    "#BEC459",
    "#456648",
    "#0086ED",
    "#886F4C",
    "#34362D",
    "#B4A8BD",
    "#00A6AA",
    "#452C2C",
    "#636375",
    "#A3C8C9",
    "#FF913F",
    "#938A81",
    "#575329",
    "#00FECF",
    "#B05B6F",
    "#8CD0FF",
    "#3B9700",
    "#04F757",
    "#C8A1A1",
    "#1E6E00",
    "#7900D7",
    "#A77500",
    "#6367A9",
    "#A05837",
    "#6B002C",
    "#772600",
    "#D790FF",
    "#9B9700",
    "#549E79",
    "#FFF69F",
    "#201625",
    "#72418F",
    "#BC23FF",
    "#99ADC0",
    "#3A2465",
    "#922329",
    "#5B4534",
    "#FDE8DC",
    "#404E55",
    "#0089A3",
    "#CB7E98",
    "#A4E804",
    "#324E72",
]

# Default palette for annotations with up to 102 categories.
default_102 = godsnot_102
181 |
--------------------------------------------------------------------------------
/simba/plotting/_post_training.py:
--------------------------------------------------------------------------------
1 | """post-training plotting functions"""
2 |
3 | import os
4 | import numpy as np
5 | import pandas as pd
6 | import json
7 | import matplotlib as mpl
8 | import matplotlib.pyplot as plt
9 | import seaborn as sns
10 | from matplotlib.collections import LineCollection
11 | from adjustText import adjust_text
12 | from pandas.api.types import (
13 | is_numeric_dtype
14 | )
15 | from scipy.stats import rankdata
16 |
17 | from ._utils import (
18 | get_colors,
19 | generate_palette
20 | )
21 | from .._settings import settings
22 | from ._plot import _scatterplot2d
23 |
24 |
def pbg_metrics(metrics=None,
                path_emb=None,
                fig_size=(5, 3),
                fig_ncol=1,
                save_fig=None,
                fig_path=None,
                fig_name='pbg_metrics.pdf',
                pad=1.08,
                w_pad=None,
                h_pad=None,
                **kwargs):
    """Plot PBG training metrics

    Parameters
    ----------
    metrics: `list`, optional (default: None)
        Evaluation metrics for PBG training.
        If None, ['mrr'] is used. Possible metrics:

        - 'pos_rank' : the average of the ranks of all positives
          (lower is better, best is 1).
        - 'mrr' : the average of the reciprocal of the ranks of all positives
          (higher is better, best is 1).
        - 'r1' : the fraction of positives that rank better than
          all their negatives, i.e., have a rank of 1
          (higher is better, best is 1).
        - 'r10' : the fraction of positives that rank in the top 10
          among their negatives
          (higher is better, best is 1).
        - 'r50' : the fraction of positives that rank in the top 50
          among their negatives
          (higher is better, best is 1).
        - 'auc' : Area Under the Curve (AUC)
    path_emb: `str`, optional (default: None)
        Path to directory for pbg embedding model.
        If None, .settings.pbg_params['checkpoint_path'] will be used.
    pad: `float`, optional (default: 1.08)
        Padding between the figure edge and the edges of subplots,
        as a fraction of the font size.
    h_pad, w_pad: `float`, optional (default: None)
        Padding (height/width) between edges of adjacent subplots,
        as a fraction of the font size. Defaults to pad.
    fig_size: `tuple`, optional (default: (5, 3))
        figure size.
    fig_ncol: `int`, optional (default: 1)
        the number of columns of the figure panel
    save_fig: `bool`, optional (default: False)
        if True,save the figure.
    fig_path: `str`, optional (default: None)
        If save_fig is True, specify figure path.
    fig_name: `str`, optional (default: 'pbg_metrics.pdf')
        if save_fig is True, specify figure name.

    Returns
    -------
    None
    """
    # Fill the default per call to avoid a shared mutable default argument.
    if metrics is None:
        metrics = ['mrr']
    if save_fig is None:
        save_fig = settings.save_fig
    if fig_path is None:
        fig_path = os.path.join(settings.workdir, 'figures')

    assert isinstance(metrics, list), "`metrics` must be list"
    for x in metrics:
        if x not in ['pos_rank', 'mrr', 'r1',
                     'r10', 'r50', 'auc']:
            raise ValueError(f'unrecognized metric {x}')
    pbg_params = settings.pbg_params
    if path_emb is None:
        path_emb = pbg_params['checkpoint_path']
    # Parse PBG's line-delimited JSON stats: one record per epoch with
    # training loss and pre-epoch evaluation metrics.
    training_loss = []
    eval_stats_before = dict()
    with open(os.path.join(path_emb, 'training_stats.json'), 'r') as f:
        for line in f:
            line_json = json.loads(line)
            if 'stats' in line_json.keys():
                training_loss.append(line_json['stats']['metrics']['loss'])
                line_stats_before = line_json['eval_stats_before']['metrics']
                for x in line_stats_before.keys():
                    if x not in eval_stats_before.keys():
                        eval_stats_before[x] = [line_stats_before[x]]
                    else:
                        eval_stats_before[x].append(line_stats_before[x])
    # One column per curve; assumes the stats file covers exactly
    # `num_epochs` epochs — TODO confirm for resumed trainings.
    df_metrics = pd.DataFrame(index=range(pbg_params['num_epochs']))
    df_metrics['epoch'] = range(pbg_params['num_epochs'])
    df_metrics['training_loss'] = training_loss
    df_metrics['validation_loss'] = eval_stats_before['loss']
    for x in metrics:
        df_metrics[x] = eval_stats_before[x]

    # One subplot per curve (every column except 'epoch').
    fig_nrow = int(np.ceil((df_metrics.shape[1]-1)/fig_ncol))
    fig = plt.figure(figsize=(fig_size[0]*fig_ncol*1.05,
                              fig_size[1]*fig_nrow))
    dict_palette = generate_palette(df_metrics.columns[1:].values)
    for i, metric in enumerate(df_metrics.columns[1:]):
        ax_i = fig.add_subplot(fig_nrow, fig_ncol, i+1)
        ax_i.scatter(df_metrics['epoch'],
                     df_metrics[metric],
                     c=dict_palette[metric],
                     **kwargs)
        ax_i.set_title(metric)
        ax_i.set_xlabel('epoch')
        ax_i.set_ylabel(metric)
    plt.tight_layout(pad=pad, h_pad=h_pad, w_pad=w_pad)
    if save_fig:
        if not os.path.exists(fig_path):
            os.makedirs(fig_path)
        plt.savefig(os.path.join(fig_path, fig_name),
                    pad_inches=1,
                    bbox_inches='tight')
        plt.close(fig)
134 |
135 |
def entity_metrics(adata_cmp,
                   x,
                   y,
                   show_texts=True,
                   show_cutoff=False,
                   show_contour=True,
                   levels=4,
                   thresh=0.05,
                   cutoff_x=0,
                   cutoff_y=0,
                   n_texts=10,
                   size=8,
                   texts=None,
                   text_size=10,
                   text_expand=(1.05, 1.2),
                   fig_size=None,
                   save_fig=None,
                   fig_path=None,
                   fig_name='entity_metrics.pdf',
                   pad=1.08,
                   w_pad=None,
                   h_pad=None,
                   **kwargs):
    """Plot entity metrics

    Scatter one comparison metric against another for all entities in
    `adata_cmp.var`, optionally annotating the most extreme entities
    and overlaying a 2D density estimate.

    Parameters
    ----------
    adata_cmp: `AnnData`
        Anndata object from `compare_entities`
    x, y: `str`
        Variables that specify positions on the x and y axes.
        Possible values:
        - max (The average maximum dot product of top-rank reference entities,
        based on normalized dot product)
        - std (standard deviation of reference entities,
        based on dot product)
        - gini (Gini coefficients of reference entities,
        based on softmax probability)
        - entropy (The entropy of reference entities,
        based on softmax probability)
    show_texts : `bool`, optional (default: True)
        If True, text annotation will be shown.
    show_cutoff : `bool`, optional (default: False)
        If True, cutoff of `x` and `y` will be shown.
    show_contour : `bool`, optional (default: True)
        If True, the plot will overlaid with contours
    n_texts : `int`, optional (default: 10)
        The number of entities to annotate when `texts` is None.
    size : `int`, optional (default: 8)
        Scatter point size.
    texts: `list` optional (default: None)
        Entity names to plot
    text_size : `int`, optional (default: 10)
        The text size
    text_expand : `tuple`, optional (default: (1.05, 1.2))
        Two multipliers (x, y) by which to expand the bounding box of texts
        when repelling them from each other/points/other objects.
    cutoff_x : `float`, optional (default: 0)
        Cutoff of axis x
    cutoff_y : `float`, optional (default: 0)
        Cutoff of axis y
    levels: `int`, optional (default: 4)
        Number of contour levels or values to draw contours at
    thresh: `float`, optional ([0, 1], default: 0.05)
        Lowest iso-proportion level at which to draw a contour line.
    pad: `float`, optional (default: 1.08)
        Padding between the figure edge and the edges of subplots,
        as a fraction of the font size.
    h_pad, w_pad: `float`, optional (default: None)
        Padding (height/width) between edges of adjacent subplots,
        as a fraction of the font size. Defaults to pad.
    fig_size: `tuple`, optional (default: None)
        figure size.
        If None, `mpl.rcParams['figure.figsize']` will be used.
    save_fig: `bool`, optional (default: False)
        if True,save the figure.
    fig_path: `str`, optional (default: None)
        If save_fig is True, specify figure path.
    fig_name: `str`, optional (default: 'entity_metrics.pdf')
        if save_fig is True, specify figure name.

    Returns
    -------
    None
    """
    # Fall back to package-level plotting defaults.
    if fig_size is None:
        fig_size = mpl.rcParams['figure.figsize']
    if save_fig is None:
        save_fig = settings.save_fig
    if fig_path is None:
        fig_path = os.path.join(settings.workdir, 'figures')

    assert (x in ['max', 'std', 'gini', 'entropy']), \
        "x must be one of ['max','std','gini','entropy']"
    assert (y in ['max', 'std', 'gini', 'entropy']), \
        "y must be one of ['max','std','gini','entropy']"

    fig, ax = plt.subplots(figsize=fig_size)
    ax.scatter(adata_cmp.var[x],
               adata_cmp.var[y],
               s=size,
               **kwargs)
    if show_texts:
        if texts is not None:
            # Annotate only the user-specified entities (indexed by name).
            plt_texts = [plt.text(adata_cmp.var[x][t],
                                  adata_cmp.var[y][t],
                                  t,
                                  fontdict={'family': 'serif',
                                            'color': 'black',
                                            'weight': 'normal',
                                            'size': text_size})
                         for t in texts]
        else:
            # Auto-select the top `n_texts` entities by combined rank over
            # both metrics. Entropy is negated before ranking: lower
            # entropy indicates a more specific entity, so it should
            # contribute a *higher* rank.
            if x == 'entropy':
                ranks_x = rankdata(-adata_cmp.var[x])
            else:
                ranks_x = rankdata(adata_cmp.var[x])
            if y == 'entropy':
                ranks_y = rankdata(-adata_cmp.var[y])
            else:
                ranks_y = rankdata(adata_cmp.var[y])
            ids = np.argsort(ranks_x + ranks_y)[::-1][:n_texts]
            plt_texts = [plt.text(adata_cmp.var[x][i],
                                  adata_cmp.var[y][i],
                                  adata_cmp.var_names[i],
                                  fontdict={'family': 'serif',
                                            'color': 'black',
                                            'weight': 'normal',
                                            'size': text_size})
                         for i in ids]
        # Repel overlapping labels and draw connector lines to the points.
        adjust_text(plt_texts,
                    expand=text_expand,
                    arrowprops=dict(arrowstyle='-', color='black'))
    if show_cutoff:
        ax.axvline(x=cutoff_x, linestyle='--', color='#CE3746')
        ax.axhline(y=cutoff_y, linestyle='--', color='#CE3746')
    if show_contour:
        # Overlay a kernel-density contour of all entities.
        sns.kdeplot(ax=ax,
                    data=adata_cmp.var,
                    x=x,
                    y=y,
                    alpha=0.7,
                    color='black',
                    levels=levels,
                    thresh=thresh)
    ax.set_xlabel(x)
    ax.set_ylabel(y)
    ax.locator_params(axis='x', tight=True)
    ax.locator_params(axis='y', tight=True)
    fig.tight_layout(pad=pad, h_pad=h_pad, w_pad=w_pad)
    if save_fig:
        if not os.path.exists(fig_path):
            os.makedirs(fig_path)
        fig.savefig(os.path.join(fig_path, fig_name),
                    pad_inches=1,
                    bbox_inches='tight')
        plt.close(fig)
291 |
292 |
def entity_barcode(adata_cmp,
                   entities,
                   anno_ref=None,
                   layer='softmax',
                   palette=None,
                   alpha=0.8,
                   linewidths=1,
                   show_cutoff=False,
                   cutoff=0.5,
                   min_rank=None,
                   max_rank=None,
                   fig_size=(6, 2),
                   fig_ncol=1,
                   save_fig=None,
                   fig_path=None,
                   fig_name='plot_barcode.pdf',
                   pad=1.08,
                   w_pad=None,
                   h_pad=None,
                   **kwargs
                   ):
    """Plot query entity barcode

    For each queried entity, draw one vertical stem per observation at its
    score rank, optionally colored by a reference annotation.

    Parameters
    ----------
    adata_cmp : `AnnData`
        Anndata object from `compare_entities`
    entities : `list`
        Entity names to plot.
    anno_ref : `str`
        Annotation used for reference entity
    layer : `str`, optional (default: 'softmax')
        Layer to use make barcode plots
    palette : `dict`, optional (default: None)
        Color palette used for `anno_ref`
    alpha : `float`, optional (default: 0.8)
        0.0 transparent through 1.0 opaque
    linewidths : `int`, optional (default: 1)
        The width of each line.
    show_cutoff : `bool`, optional (default: False)
        If True, cutoff will be shown
    cutoff : `float`, optional (default: 0.5)
        Cutoff value for y axis
    min_rank : `int`, optional (default: None)
        Specify the minimum rank of observations to show.
        If None, `min_rank` is set to 0.
    max_rank : `int`, optional (default: None)
        Specify the maximum rank of observations to show.
        If None, `max_rank` is set to the number of observations.
    fig_size: `tuple`, optional (default: (6,2))
        figure size.
    fig_ncol: `int`, optional (default: 1)
        the number of columns of the figure panel
    save_fig: `bool`, optional (default: False)
        if True,save the figure.
    fig_path: `str`, optional (default: None)
        If save_fig is True, specify figure path.
    fig_name: `str`, optional (default: 'plot_barcode.pdf')
        if `save_fig` is True, specify figure name.
    **kwargs: `dict`, optional
        Other keyword arguments are passed through to
        ``mpl.collections.LineCollection``

    Returns
    -------
    None
    """
    if fig_size is None:
        fig_size = mpl.rcParams['figure.figsize']
    if save_fig is None:
        save_fig = settings.save_fig
    if fig_path is None:
        fig_path = os.path.join(settings.workdir, 'figures')

    assert isinstance(entities, list), "`entities` must be list"

    # Pull the scores to plot: raw .X or the requested layer.
    if layer is None:
        X = adata_cmp[:, entities].X.copy()
    else:
        X = adata_cmp[:, entities].layers[layer].copy()
    df_scores = pd.DataFrame(
        data=X,
        index=adata_cmp.obs_names,
        columns=entities)

    if min_rank is None:
        min_rank = 0
    if max_rank is None:
        max_rank = df_scores.shape[0]

    n_plots = len(entities)
    fig_nrow = int(np.ceil(n_plots/fig_ncol))
    fig = plt.figure(figsize=(fig_size[0]*fig_ncol*1.05,
                              fig_size[1]*fig_nrow))

    for i, x in enumerate(entities):
        ax_i = fig.add_subplot(fig_nrow, fig_ncol, i+1)
        scores_x_sorted = df_scores[x].sort_values(ascending=False)
        # One vertical stem per observation inside the rank window.
        lines = []
        for xx, yy in zip(np.arange(len(scores_x_sorted))[min_rank:max_rank],
                          scores_x_sorted.iloc[min_rank:max_rank]):
            lines.append([(xx, 0), (xx, yy)])
        # Compute colors over *all* ranked observations first, so the
        # category-to-color assignment does not depend on the rank window,
        # then slice to match `lines`.
        # Bug fix: the full-length color list used to be passed straight to
        # LineCollection, which cycles colors from index 0 — so with
        # min_rank > 0 each stem was colored by the wrong observation.
        if anno_ref is None:
            colors = get_colors(np.array([""]*len(scores_x_sorted)))
        else:
            ids_ref = scores_x_sorted.index
            if palette is None:
                colors = get_colors(adata_cmp[ids_ref, :].obs[anno_ref])
            else:
                colors = [palette[adata_cmp.obs.loc[xx, anno_ref]]
                          for xx in ids_ref]
        colors = list(colors)[min_rank:max_rank]
        stemlines = LineCollection(
            lines,
            colors=colors,
            alpha=alpha,
            linewidths=linewidths,
            **kwargs)
        ax_i.add_collection(stemlines)
        ax_i.autoscale()
        ax_i.set_title(x)
        ax_i.set_ylabel(layer)
        ax_i.locator_params(axis='y', tight=True)
        if show_cutoff:
            ax_i.axhline(y=cutoff,
                         color='#CC6F47',
                         linestyle='--')
    plt.tight_layout(pad=pad, h_pad=h_pad, w_pad=w_pad)
    if save_fig:
        if not os.path.exists(fig_path):
            os.makedirs(fig_path)
        plt.savefig(os.path.join(fig_path, fig_name),
                    pad_inches=1,
                    bbox_inches='tight')
        plt.close(fig)
427 |
428 |
429 | def query(adata,
430 | comp1=0,
431 | comp2=1,
432 | obsm='X_umap',
433 | layer=None,
434 | color=None,
435 | dict_palette=None,
436 | size=8,
437 | drawing_order='random',
438 | dict_drawing_order=None,
439 | show_texts=False,
440 | texts=None,
441 | text_expand=(1.05, 1.2),
442 | text_size=10,
443 | n_texts=8,
444 | fig_size=None,
445 | fig_ncol=3,
446 | fig_legend_ncol=1,
447 | fig_legend_order=None,
448 | alpha=0.9,
449 | alpha_bg=0.3,
450 | pad=1.08,
451 | w_pad=None,
452 | h_pad=None,
453 | save_fig=None,
454 | fig_path=None,
455 | fig_name='plot_query.pdf',
456 | vmin=None,
457 | vmax=None,
458 | **kwargs):
459 | """Plot query output
460 |
461 | Parameters
462 | ----------
463 | adata : `Anndata`
464 | Annotated data matrix.
465 | comp1 : `int`, optional (default: 0)
466 | Component used for x axis.
467 | comp2 : `int`, optional (default: 1)
468 | Component used for y axis.
469 | obsm : `str`, optional (default: 'X_umap')
470 | The field to use for plotting
471 | layer : `str`, optional (default: None)
472 | The layer to use for plotting
473 | color: `list`, optional (default: None)
474 | A list of variables that will produce points with different colors.
475 | e.g. color = ['anno1', 'anno2']
476 | dict_palette: `dict`,optional (default: None)
477 | A dictionary of palettes for different variables in `color`.
478 | Only valid for categorical/string variables
479 | e.g. dict_palette = {'ann1': {},'ann2': {}}
480 | size: `int` (default: 8)
481 | Point size.
482 | drawing_order: `str` (default: 'random')
483 | The order in which values are plotted, This can be
484 | one of the following values
485 |
486 | - 'original': plot points in the same order as in input dataframe
487 | - 'sorted' : plot points with higher values on top.
488 | - 'random' : plot points in a random order
489 | dict_drawing_order: `dict`,optional (default: None)
490 | A dictionary of drawing_order for different variables in `color`.
491 | Only valid for categorical/string variables
492 | e.g. dict_drawing_order = {'ann1': 'original','ann2': 'sorted'}
493 | show_texts : `bool`, optional (default: False)
494 | If True, text annotation will be shown.
495 | text_size : `int`, optional (default: 10)
496 | The text size.
497 | texts: `list` optional (default: None)
498 | Point names to plot.
499 | text_expand : `tuple`, optional (default: (1.05, 1.2))
500 | Two multipliers (x, y) by which to expand the bounding box of texts
501 | when repelling them from each other/points/other objects.
502 | n_texts : `int`, optional (default: 8)
503 | The number of texts to plot.
504 | fig_size: `tuple`, optional (default: (4, 4))
505 | figure size.
506 | fig_ncol: `int`, optional (default: 3)
507 | the number of columns of the figure panel
508 | fig_legend_order: `dict`,optional (default: None)
509 | Specified order for the appearance of the annotation keys.
510 | Only valid for categorical/string variable
511 | e.g. fig_legend_order = {'ann1':['a','b','c'],'ann2':['aa','bb','cc']}
512 | fig_legend_ncol: `int`, optional (default: 1)
513 | The number of columns that the legend has.
514 | vmin,vmax: `float`, optional (default: None)
515 | The min and max values are used to normalize continuous values.
516 | If None, the respective min and max of continuous values is used.
517 | alpha: `float`, optional (default: 0.9)
518 | The alpha blending value, between 0 (transparent) and 1 (opaque)
519 | for returned points.
520 | alpha_bg: `float`, optional (default: 0.3)
521 | The alpha blending value, between 0 (transparent) and 1 (opaque)
522 | for background points
523 | pad: `float`, optional (default: 1.08)
524 | Padding between the figure edge and the edges of subplots,
525 | as a fraction of the font size.
526 | h_pad, w_pad: `float`, optional (default: None)
527 | Padding (height/width) between edges of adjacent subplots,
528 | as a fraction of the font size. Defaults to pad.
529 | save_fig: `bool`, optional (default: False)
530 | if True,save the figure.
531 | fig_path: `str`, optional (default: None)
532 | If save_fig is True, specify figure path.
533 | fig_name: `str`, optional (default: 'plot_query.pdf')
534 | if save_fig is True, specify figure name.
535 |
536 | Returns
537 | -------
538 | None
539 | """
540 | if fig_size is None:
541 | fig_size = mpl.rcParams['figure.figsize']
542 | if save_fig is None:
543 | save_fig = settings.save_fig
544 | if fig_path is None:
545 | fig_path = os.path.join(settings.workdir, 'figures')
546 |
547 | if dict_palette is None:
548 | dict_palette = dict()
549 |
550 | query_output = adata.uns['query']['output']
551 | nn = query_output.index.tolist() # nearest neighbors
552 | if len(nn) == 0:
553 | print('No neighbor entities were found.')
554 | return
555 | query_params = adata.uns['query']['params']
556 | query_obsm = query_params['obsm']
557 | query_layer = query_params['layer']
558 | entity = query_params['entity']
559 | use_radius = query_params['use_radius']
560 | r = query_params['r']
561 | if (obsm == query_obsm) and (layer == query_layer):
562 | pin = query_params['pin']
563 | else:
564 | if entity is not None:
565 | if obsm is not None:
566 | pin = adata[entity, :].obsm[obsm].copy()
567 | elif layer is not None:
568 | pin = adata[entity, :].layers[layer].copy()
569 | else:
570 | pin = adata[entity, :].X.copy()
571 | else:
572 | pin = None
573 |
574 | if sum(list(map(lambda x: x is not None,
575 | [layer, obsm]))) == 2:
576 | raise ValueError("Only one of `layer` and `obsm` can be used")
577 | if obsm is not None:
578 | X = adata.obsm[obsm].copy()
579 | X_nn = adata[nn, :].obsm[obsm].copy()
580 | elif layer is not None:
581 | X = adata.layers[layer].copy()
582 | X_nn = adata[nn, :].layers[layer].copy()
583 | else:
584 | X = adata.X.copy()
585 | X_nn = adata[nn, :].X.copy()
586 | df_plot = pd.DataFrame(index=adata.obs.index,
587 | data=X[:, [comp1, comp2]],
588 | columns=[f'Dim {comp1}', f'Dim {comp2}'])
589 | df_plot_nn = pd.DataFrame(index=adata[nn, :].obs.index,
590 | data=X_nn[:, [comp1, comp2]],
591 | columns=[f'Dim {comp1}', f'Dim {comp2}'])
592 | if show_texts:
593 | if texts is None:
594 | texts = nn[:n_texts]
595 | if color is None:
596 | list_ax = _scatterplot2d(df_plot,
597 | x=f'Dim {comp1}',
598 | y=f'Dim {comp2}',
599 | drawing_order=drawing_order,
600 | size=size,
601 | fig_size=fig_size,
602 | alpha=alpha_bg,
603 | pad=pad,
604 | w_pad=w_pad,
605 | h_pad=h_pad,
606 | save_fig=False,
607 | copy=True,
608 | **kwargs)
609 | else:
610 | color = list(dict.fromkeys(color)) # remove duplicate keys
611 | for ann in color:
612 | if ann in adata.obs_keys():
613 | df_plot[ann] = adata.obs[ann]
614 | if not is_numeric_dtype(df_plot[ann]):
615 | if 'color' not in adata.uns_keys():
616 | adata.uns['color'] = dict()
617 |
618 | if ann not in dict_palette.keys():
619 | if (ann+'_color' in adata.uns['color'].keys()) \
620 | and \
621 | (all(np.isin(np.unique(df_plot[ann]),
622 | list(adata.uns['color']
623 | [ann+'_color'].keys())))):
624 | dict_palette[ann] = \
625 | adata.uns['color'][ann+'_color']
626 | else:
627 | dict_palette[ann] = \
628 | generate_palette(adata.obs[ann])
629 | adata.uns['color'][ann+'_color'] = \
630 | dict_palette[ann].copy()
631 | else:
632 | if ann+'_color' not in adata.uns['color'].keys():
633 | adata.uns['color'][ann+'_color'] = \
634 | dict_palette[ann].copy()
635 |
636 | elif ann in adata.var_names:
637 | df_plot[ann] = adata.obs_vector(ann)
638 | else:
639 | raise ValueError(f"could not find {ann} in `adata.obs.columns`"
640 | " and `adata.var_names`")
641 | list_ax = _scatterplot2d(df_plot,
642 | x=f'Dim {comp1}',
643 | y=f'Dim {comp2}',
644 | list_hue=color,
645 | hue_palette=dict_palette,
646 | drawing_order=drawing_order,
647 | dict_drawing_order=dict_drawing_order,
648 | size=size,
649 | fig_size=fig_size,
650 | fig_ncol=fig_ncol,
651 | fig_legend_ncol=fig_legend_ncol,
652 | fig_legend_order=fig_legend_order,
653 | vmin=vmin,
654 | vmax=vmax,
655 | alpha=alpha_bg,
656 | pad=pad,
657 | w_pad=w_pad,
658 | h_pad=h_pad,
659 | save_fig=False,
660 | copy=True,
661 | **kwargs)
662 | for ax in list_ax:
663 | ax.scatter(
664 | df_plot_nn[f'Dim {comp1}'],
665 | df_plot_nn[f'Dim {comp2}'],
666 | s=size,
667 | color='#AE6C68',
668 | alpha=alpha,
669 | lw=0)
670 | if pin is not None:
671 | ax.scatter(pin[:, comp1],
672 | pin[:, comp2],
673 | s=20*size,
674 | marker='+',
675 | color='#B33831')
676 | if use_radius:
677 | circle = plt.Circle((pin[:, comp1],
678 | pin[:, comp2]),
679 | radius=r,
680 | color='#B33831',
681 | fill=False)
682 | ax.add_artist(circle)
683 | if show_texts:
684 | plt_texts = [ax.text(df_plot_nn[f'Dim {comp1}'][t],
685 | df_plot_nn[f'Dim {comp2}'][t],
686 | t,
687 | fontdict={'family': 'serif',
688 | 'color': 'black',
689 | 'weight': 'normal',
690 | 'size': text_size})
691 | for t in texts]
692 | adjust_text(plt_texts,
693 | ax=ax,
694 | expand=text_expand,
695 | arrowprops=dict(arrowstyle='->', color='black'))
696 | if save_fig:
697 | fig = plt.gcf()
698 | if not os.path.exists(fig_path):
699 | os.makedirs(fig_path)
700 | fig.savefig(os.path.join(fig_path, fig_name),
701 | pad_inches=1,
702 | bbox_inches='tight')
703 | plt.close(fig)
704 |
--------------------------------------------------------------------------------
/simba/plotting/_utils.py:
--------------------------------------------------------------------------------
1 | """Utility functions and classes"""
2 |
3 | import numpy as np
4 | import pandas as pd
5 | from pandas.api.types import (
6 | is_numeric_dtype,
7 | is_string_dtype,
8 | is_categorical_dtype,
9 | )
10 | import matplotlib as mpl
11 |
12 | from ._palettes import (
13 | default_20,
14 | default_28,
15 | default_102
16 | )
17 |
18 |
def get_colors(arr,
               vmin=None,
               vmax=None,
               clip=False):
    """Map the values of an array to a list of hex colors.

    Numeric input is mapped through the current ``image.cmap`` colormap
    after min/max normalization; string/categorical input gets one
    palette color per unique category.

    Parameters
    ----------
    arr: `pd.Series` or `np.ndarray`
        Values to colorize.
    vmin, vmax: `float`, optional (default: None)
        Bounds used to normalize numeric values.
        If None, the min and max of `arr` are used.
    clip: `bool`, optional (default: False)
        Whether normalized values are clipped to [0, 1].

    Returns
    -------
    colors: `list`
        One hex color string per element of `arr`.
    """
    if not isinstance(arr, (pd.Series, np.ndarray)):
        raise TypeError("`arr` must be pd.Series or np.ndarray")
    if is_numeric_dtype(arr):
        cmap = mpl.cm.get_cmap(mpl.rcParams['image.cmap'], 512)
        lo = min(arr) if vmin is None else vmin
        hi = max(arr) if vmax is None else vmax
        norm = mpl.colors.Normalize(vmin=lo, vmax=hi, clip=clip)
        return [mpl.colors.to_hex(cmap(norm(v))) for v in arr]
    if is_string_dtype(arr) or is_categorical_dtype(arr):
        categories = np.unique(arr)
        n_cat = len(categories)
        # prefer the active matplotlib prop cycle when it is large enough
        if len(mpl.rcParams['axes.prop_cycle'].by_key()['color']) >= n_cat:
            cycler = mpl.rcParams['axes.prop_cycle']()
            palette = [mpl.colors.rgb2hex(next(cycler)['color'])
                       for _ in range(n_cat)]
        elif n_cat <= 20:
            palette = default_20
        elif n_cat <= 28:
            palette = default_28
        elif n_cat <= len(default_102):
            palette = default_102
        else:
            # more categories than any packaged palette: sample rainbow
            rgb_rainbow = mpl.cm.rainbow(np.linspace(0, 1, n_cat))
            palette = [mpl.colors.rgb2hex(rgb_rainbow[i, :-1])
                       for i in range(n_cat)]
        out = pd.Series([''] * len(arr))
        for i, cat in enumerate(categories):
            out[np.where(arr == cat)[0]] = palette[i]
        return list(out)
    raise TypeError("unsupported data type for `arr`")
66 |
67 |
def generate_palette(arr):
    """Generate a category-to-color mapping for a given array.

    Colors are assigned exactly as in `get_colors`, so palettes
    generated here stay consistent with per-element colorings
    (previously the selection logic was duplicated in both functions).

    Parameters
    ----------
    arr: `pd.Series` or `np.ndarray`
        Categorical/string values to build a palette for.

    Returns
    -------
    dict_palette: `dict`
        Mapping of each category in `arr` to a hex color string.

    Raises
    ------
    TypeError
        If `arr` is not a pd.Series/np.ndarray, or its dtype is
        neither string nor categorical.
    """
    if not isinstance(arr, (pd.Series, np.ndarray)):
        raise TypeError("`arr` must be pd.Series or np.ndarray")
    if not (is_string_dtype(arr) or is_categorical_dtype(arr)):
        # numeric arrays have no finite category set to build a palette for
        raise TypeError("unsupported data type for `arr`")
    # delegate to get_colors() so both functions share one implementation
    colors = get_colors(arr)
    dict_palette = dict(zip(arr, colors))
    return dict_palette
104 |
--------------------------------------------------------------------------------
/simba/preprocessing/__init__.py:
--------------------------------------------------------------------------------
1 | """Preprocessing"""
2 |
3 | from ._general import (
4 | log_transform,
5 | normalize,
6 | binarize
7 | )
8 | from ._qc import (
9 | cal_qc,
10 | cal_qc_rna,
11 | cal_qc_atac,
12 | filter_samples,
13 | filter_cells_rna,
14 | filter_cells_atac,
15 | filter_features,
16 | filter_genes,
17 | filter_peaks,
18 | )
19 | from ._pca import (
20 | pca,
21 | select_pcs,
22 | select_pcs_features,
23 | )
24 | from ._variable_genes import (
25 | select_variable_genes
26 | )
27 |
--------------------------------------------------------------------------------
/simba/preprocessing/_general.py:
--------------------------------------------------------------------------------
1 | """General preprocessing functions"""
2 |
3 | import numpy as np
4 | from sklearn.utils import sparsefuncs
5 | from sklearn import preprocessing
6 | from ._utils import (
7 | cal_tf_idf
8 | )
9 | from scipy.sparse import (
10 | issparse,
11 | csr_matrix,
12 | )
13 |
14 |
def log_transform(adata):
    """Apply log(1 + x) to the data matrix, element-wise.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.

    Returns
    -------
    None
        `adata.X` is replaced by the sparse, logarithmized
        #observations x #var_genes matrix.
    """
    X = adata.X
    # make sure the matrix is sparse before transforming
    if not issparse(X):
        X = csr_matrix(X)
    adata.X = np.log1p(X)
    return None
33 |
34 |
def binarize(adata,
             threshold=1e-5):
    """Binarize the data matrix.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    threshold: `float`, optional (default: 1e-5)
        Entries less than or equal to this become 0; entries above it
        become 1.

    Returns
    -------
    None
        `adata.X` is replaced by the binarized
        #observations x #var_genes matrix.
    """
    X = adata.X
    # work on a sparse matrix so binarize returns a sparse result
    if not issparse(X):
        X = csr_matrix(X)
        adata.X = X
    adata.X = preprocessing.binarize(X,
                                     threshold=threshold,
                                     copy=True)
56 |
57 |
def normalize(adata,
              method='lib_size',
              scale_factor=1e4,
              save_raw=True):
    """Normalize count matrix.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    method: `str`, optional (default: 'lib_size')
        Choose from {{'lib_size','tf_idf'}}
        Method used for normalization.
        'lib_size': Total-count normalize (library-size correct)
        'tf_idf': TF-IDF (term frequency–inverse document frequency)
        transformation
    scale_factor: `float`, optional (default: 1e4)
        Multiplier applied after library-size normalization.
        Only used when `method='lib_size'`.
    save_raw: `bool`, optional (default: True)
        If True, a copy of the unnormalized counts is stored in
        `adata.layers['raw']` before normalizing.

    Returns
    -------
    updates `adata` with the following fields.
    X: `numpy.ndarray` (`adata.X`)
        Store #observations × #var_genes normalized data matrix.
    layers['raw']: (`adata.layers['raw']`)
        A copy of the original count matrix (only if `save_raw` is True).
    """
    if method not in ['lib_size', 'tf_idf']:
        raise ValueError("unrecognized method '%s'" % method)
    if not issparse(adata.X):
        adata.X = csr_matrix(adata.X)
    if save_raw:
        # snapshot the counts before any in-place modification below
        adata.layers['raw'] = adata.X.copy()
    if method == 'lib_size':
        # scale each cell (row) by its total count, then by scale_factor
        sparsefuncs.inplace_row_scale(adata.X, 1/adata.X.sum(axis=1).A)
        adata.X = adata.X*scale_factor
    if method == 'tf_idf':
        adata.X = cal_tf_idf(adata.X)
92 |
--------------------------------------------------------------------------------
/simba/preprocessing/_pca.py:
--------------------------------------------------------------------------------
1 | """Principal component analysis"""
2 |
3 | import numpy as np
4 | from sklearn.decomposition import TruncatedSVD
5 | from ._utils import (
6 | locate_elbow,
7 | )
8 |
9 |
def pca(adata,
        n_components=50,
        algorithm='randomized',
        n_iter=5,
        random_state=2021,
        tol=0.0,
        feature=None,
        **kwargs,
        ):
    """perform Principal Component Analysis (PCA) via truncated SVD

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    n_components: `int`, optional (default: 50)
        Desired dimensionality of output data
    algorithm: `str`, optional (default: 'randomized')
        SVD solver to use. Choose from {'arpack', 'randomized'}.
    n_iter: `int`, optional (default: 5)
        Number of iterations for randomized SVD solver.
        Not used by ARPACK.
    random_state: `int`, optional (default: 2021)
        Seed of the randomized SVD solver, for reproducibility.
    tol: `float`, optional (default: 0.0)
        Tolerance for ARPACK. 0 means machine precision.
        Ignored by randomized SVD solver.
    feature: `str`, optional (default: None)
        Feature used to perform PCA.
        The data type of `.var[feature]` needs to be `bool`
        If None, adata.X will be used.
    kwargs:
        Other keyword arguments are passed down to `TruncatedSVD()`

    Returns
    -------
    updates `adata` with the following fields:
    `.obsm['X_pca']` : `array`
        PCA transformed X.
    `.uns['pca']['n_pcs']` : `int`
        The number of requested components.
    `.uns['pca']['PCs']` : `array`
        Principal components in feature space,
        representing the directions of maximum variance in the data.
    `.uns['pca']['variance']` : `array`
        The variance of the training samples transformed by a
        projection to each component.
    `.uns['pca']['variance_ratio']` : `array`
        Percentage of variance explained by each of the selected components.
    """
    # restrict to the boolean-selected features when requested
    if feature is None:
        X = adata.X.copy()
    else:
        X = adata[:, adata.var[feature]].X.copy()
    model = TruncatedSVD(n_components=n_components,
                         algorithm=algorithm,
                         n_iter=n_iter,
                         random_state=random_state,
                         tol=tol,
                         **kwargs)
    model.fit(X)
    adata.obsm['X_pca'] = model.transform(X)
    adata.uns['pca'] = {
        'n_pcs': n_components,
        'PCs': model.components_.T,
        'variance': model.explained_variance_,
        'variance_ratio': model.explained_variance_ratio_,
    }
74 |
75 |
def select_pcs(adata,
               n_pcs=None,
               S=1,
               curve='convex',
               direction='decreasing',
               online=False,
               min_elbow=None,
               **kwargs):
    """select top PCs based on variance_ratio

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix with `.obsm['X_pca']` and
        `.uns['pca']['variance_ratio']` (see `pca()`).
    n_pcs: `int`, optional (default: None)
        The number of PCs to keep. If None, it is chosen automatically
        at the elbow of the variance-ratio curve with `kneed
        <https://kneed.readthedocs.io/>`__.
    S : `float`, optional (default: 1)
        Sensitivity
    min_elbow: `int`, optional (default: None)
        The minimum elbow location
        By default, it is n_components/10
    curve: `str`, optional (default: 'convex')
        Choose from {'convex','concave'}
        If 'concave', algorithm will detect knees,
        If 'convex', algorithm will detect elbows.
    direction: `str`, optional (default: 'decreasing')
        Choose from {'decreasing','increasing'}
    online: `bool`, optional (default: False)
        kneed will correct old knee points if True,
        kneed will return first knee if False.
    **kwargs: `dict`, optional
        Extra arguments to KneeLocator.

    Returns
    -------
    updates `adata` with the following fields:
    `.uns['pca']['n_pcs']` : `int`
        The selected number of PCs.
    """
    if n_pcs is None:
        n_components = adata.obsm['X_pca'].shape[1]
        if min_elbow is None:
            min_elbow = n_components/10
        n_pcs = locate_elbow(range(n_components),
                             adata.uns['pca']['variance_ratio'],
                             S=S,
                             curve=curve,
                             min_elbow=min_elbow,
                             direction=direction,
                             online=online,
                             **kwargs)
    # single assignment covers both the automatic and explicit cases
    adata.uns['pca']['n_pcs'] = n_pcs
126 |
127 |
def select_pcs_features(adata,
                        S=1,
                        curve='convex',
                        direction='decreasing',
                        online=False,
                        min_elbow=None,
                        **kwargs):
    """select features that contribute to the top PCs

    For each of the top `n_pcs` principal components (see `select_pcs()`),
    the features with the largest absolute loadings — up to the elbow of
    the descending sorted-loading curve — are collected, and their union
    is flagged in `.var['top_pcs']`.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix with `.uns['pca']` (see `pca()`).
    S : `float`, optional (default: 1)
        Sensitivity
    min_elbow: `int`, optional (default: None)
        The minimum elbow location.
        By default, it is #features/6
    curve: `str`, optional (default: 'convex')
        Choose from {'convex','concave'}
        If 'concave', algorithm will detect knees,
        If 'convex', algorithm will detect elbows.
    direction: `str`, optional (default: 'decreasing')
        Choose from {'decreasing','increasing'}
    online: `bool`, optional (default: False)
        kneed will correct old knee points if True,
        kneed will return first knee if False.
    **kwargs: `dict`, optional
        Extra arguments to KneeLocator.

    Returns
    -------
    updates `adata` with the following fields:
    `.uns['pca']['features']` : `dict`
        Selected feature indices for each PC (keys 'pc_0', 'pc_1', ...).
    `.var['top_pcs']` : `bool`
        Indicator of features selected from the top PCs.
    """
    n_pcs = adata.uns['pca']['n_pcs']
    n_features = adata.uns['pca']['PCs'].shape[0]
    if min_elbow is None:
        min_elbow = n_features/6
    adata.uns['pca']['features'] = dict()
    ids_features = list()
    for i in range(n_pcs):
        # elbow of the descending |loading| curve for PC i gives the
        # number of features considered to contribute to this PC
        elbow = locate_elbow(range(n_features),
                             np.sort(
                                 np.abs(adata.uns['pca']['PCs'][:, i],))[::-1],
                             S=S,
                             min_elbow=min_elbow,
                             curve=curve,
                             direction=direction,
                             online=online,
                             **kwargs)
        # positional indices of the `elbow` features with largest |loading|
        ids_features_i = \
            list(np.argsort(np.abs(
                adata.uns['pca']['PCs'][:, i],))[::-1][:elbow])
        adata.uns['pca']['features'][f'pc_{i}'] = ids_features_i
        ids_features = ids_features + ids_features_i
        print(f'#features selected from PC {i}: {len(ids_features_i)}')
    adata.var['top_pcs'] = False
    # flag the union of selected features across all top PCs
    adata.var.loc[adata.var_names[np.unique(ids_features)], 'top_pcs'] = True
    print(f'#features in total: {adata.var["top_pcs"].sum()}')
183 |
--------------------------------------------------------------------------------
/simba/preprocessing/_utils.py:
--------------------------------------------------------------------------------
1 | """Utility functions and classes"""
2 |
3 | import numpy as np
4 | from kneed import KneeLocator
5 | from scipy.sparse import csr_matrix, diags
6 |
7 |
def locate_elbow(x, y, S=10, min_elbow=0,
                 curve='convex', direction='decreasing', online=False,
                 **kwargs):
    """Locate the elbow (knee) point of a curve.

    Parameters
    ----------
    x : `array_like`
        x values
    y : `array_like`
        y values
    S : `float`, optional (default: 10)
        Sensitivity
    min_elbow: `int`, optional (default: 0)
        The minimum elbow location; points before it are ignored.
    curve: `str`, optional (default: 'convex')
        Choose from {'convex','concave'}
        If 'concave', algorithm will detect knees,
        If 'convex', algorithm will detect elbows.
    direction: `str`, optional (default: 'decreasing')
        Choose from {'decreasing','increasing'}
    online: `bool`, optional (default: False)
        kneed will correct old knee points if True,
        kneed will return first knee if False.
    **kwargs: `dict`, optional
        Extra arguments to KneeLocator.

    Returns
    -------
    elbow: `int`
        The detected elbow point, or `len(y)` when none is found.
    """
    start = int(min_elbow)
    locator = KneeLocator(x[start:], y[start:],
                          S=S,
                          curve=curve,
                          direction=direction,
                          online=online,
                          **kwargs,
                          )
    # no detected elbow: fall back to keeping the whole curve
    if locator.elbow is None:
        return len(y)
    return int(locator.elbow)
51 |
52 |
def cal_tf_idf(mat):
    """Transform a count matrix to a tf-idf representation
    """
    mat = csr_matrix(mat)
    # term frequency: scale each column by its total
    tf = csr_matrix(mat / mat.sum(axis=0))
    # inverse document frequency: one weight per row, from row totals
    idf = np.asarray(np.log(1 + mat.shape[1] / mat.sum(axis=1))).flatten()
    # scale the rows of tf by idf
    return csr_matrix(diags(idf) @ tf)
61 |
--------------------------------------------------------------------------------
/simba/preprocessing/_variable_genes.py:
--------------------------------------------------------------------------------
1 | """Preprocess"""
2 |
3 | import numpy as np
4 | from scipy.sparse import (
5 | csr_matrix,
6 | )
7 | from sklearn.utils import sparsefuncs
8 | from skmisc.loess import loess
9 |
10 |
def select_variable_genes(adata,
                          layer='raw',
                          span=0.3,
                          n_top_genes=2000,
                          ):
    """Select highly variable genes.

    This function implements the method 'vst' in Seurat v3.
    Inspired by Scanpy.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    layer: `str`, optional (default: 'raw')
        The layer to use for calculating variable genes.
        If None, `adata.X` is used instead.
    span: `float`, optional (default: 0.3)
        Loess smoothing factor
    n_top_genes: `int`, optional (default: 2000)
        The number of genes to keep

    Returns
    -------
    updates `adata` with the following fields.

    variances_norm: `float`, (`adata.var['variances_norm']`)
        Normalized variance per gene
    variances: `float`, (`adata.var['variances']`)
        Variance per gene.
    means: `float`, (`adata.var['means']`)
        Means per gene
    highly_variable: `bool` (`adata.var['highly_variable']`)
        Indicator of variable genes
    """
    if layer is None:
        X = adata.X
    else:
        # float64 copy so the in-place clipping below cannot
        # modify the stored layer
        X = adata.layers[layer].astype(np.float64).copy()
    # per-gene mean and variance of the (sparse) count matrix
    mean, variance = sparsefuncs.mean_variance_axis(X, axis=0)
    variance_expected = np.zeros(adata.shape[1], dtype=np.float64)
    not_const = variance > 0

    # fit the mean-variance trend in log10 space
    # (genes with zero variance are excluded from the fit)
    model = loess(np.log10(mean[not_const]),
                  np.log10(variance[not_const]),
                  span=span,
                  degree=2)
    model.fit()
    variance_expected[not_const] = 10**model.outputs.fitted_values
    N = adata.shape[0]
    # clip observed counts at mean + sqrt(N) * expected std
    # to limit the influence of outlier cells
    clip_max = np.sqrt(N)
    clip_val = np.sqrt(variance_expected) * clip_max + mean

    X = csr_matrix(X)
    mask = X.data > clip_val[X.indices]
    X.data[mask] = clip_val[X.indices[mask]]

    squared_X_sum = np.array(X.power(2).sum(axis=0))
    X_sum = np.array(X.sum(axis=0))

    # variance of the clipped values, standardized by the
    # loess-expected variance
    norm_gene_var = (1 / ((N - 1) * variance_expected)) \
        * ((N * np.square(mean))
           + squared_X_sum
           - 2 * X_sum * mean
           )
    norm_gene_var = norm_gene_var.flatten()

    adata.var['variances_norm'] = norm_gene_var
    adata.var['variances'] = variance
    adata.var['means'] = mean
    # keep the n_top_genes genes with the largest normalized variance
    ids_top = norm_gene_var.argsort()[-n_top_genes:][::-1]
    adata.var['highly_variable'] = np.isin(range(adata.shape[1]), ids_top)
    print(f'{n_top_genes} variable genes are selected.')
83 |
--------------------------------------------------------------------------------
/simba/readwrite.py:
--------------------------------------------------------------------------------
1 | """reading and writing"""
2 |
3 | import os
4 | import pandas as pd
5 | import json
6 | from anndata import (
7 | AnnData,
8 | read_h5ad,
9 | read_csv,
10 | read_excel,
11 | read_hdf,
12 | read_loom,
13 | read_mtx,
14 | read_text,
15 | read_umi_tools,
16 | read_zarr,
17 | )
18 | from pathlib import Path
19 | import tables
20 |
21 | from ._settings import settings
22 | from ._utils import _read_legacy_10x_h5, _read_v3_10x_h5
23 |
24 |
def read_embedding(path_emb=None,
                   path_entity=None,
                   convert_alias=True,
                   path_entity_alias=None,
                   prefix=None,
                   num_epochs=None):
    """Read in entity embeddings from pbg training

    Parameters
    ----------
    path_emb: `str`, optional (default: None)
        Directory holding the pbg embedding files.
        If None, `.settings.pbg_params['checkpoint_path']` will be used.
    path_entity: `str`, optional (default: None)
        Directory holding the entity name files.
        If None, `.settings.pbg_params['entity_path']` will be used.
    convert_alias: `bool`, optional (default: True)
        If True, it will convert entity aliases to the original indices
    path_entity_alias: `str`, optional (default: None)
        Directory holding 'entity_alias.txt'.
        If None, the parent directory of `path_emb` is used.
    prefix: `list`, optional (default: None)
        A list of entity type prefixes to include.
        By default, it reads in the embeddings of all entities.
    num_epochs: `int`, optional (default: None)
        The embedding result associated with num_epochs to read in.
        If None, `.settings.pbg_params['num_epochs']` will be used.

    Returns
    -------
    dict_adata: `dict`
        A dictionary of anndata objects of shape
        (#entities x #dimensions)
    """
    pbg_params = settings.pbg_params
    if path_emb is None:
        path_emb = pbg_params['checkpoint_path']
    if path_entity is None:
        path_entity = pbg_params['entity_path']
    if num_epochs is None:
        num_epochs = pbg_params["num_epochs"]
    if prefix is None:
        prefix = []
    assert isinstance(prefix, list), \
        "`prefix` must be list"
    if convert_alias:
        if path_entity_alias is None:
            path_entity_alias = Path(path_emb).parent.as_posix()
        # alias -> original id lookup table
        df_entity_alias = pd.read_csv(
            os.path.join(path_entity_alias, 'entity_alias.txt'),
            header=0,
            index_col=0,
            sep='\t')
        df_entity_alias['id'] = df_entity_alias.index
        df_entity_alias.index = df_entity_alias['alias'].values

    dict_adata = dict()
    for fname in os.listdir(path_emb):
        if not fname.startswith('embeddings'):
            continue
        entity_type = fname.split('_')[1]
        if len(prefix) > 0 and entity_type not in prefix:
            continue
        adata = read_hdf(
            os.path.join(path_emb,
                         f'embeddings_{entity_type}_0.'
                         f'v{num_epochs}.h5'),
            key="embeddings")
        with open(os.path.join(path_entity,
                               f'entity_names_{entity_type}_0.json'),
                  "rt") as tf:
            names_entity = json.load(tf)
        if convert_alias:
            names_entity = \
                df_entity_alias.loc[names_entity, 'id'].tolist()
        adata.obs.index = names_entity
        dict_adata[entity_type] = adata
    return dict_adata
99 |
100 |
# modified from
# scanpy https://github.com/theislab/scanpy/blob/master/scanpy/readwrite.py
def read_10x_h5(filename,
                genome=None,
                gex_only=True):
    """Read 10x-Genomics-formatted hdf5 file.

    Parameters
    ----------
    filename
        Path to a 10x hdf5 file.
    genome
        Filter expression to genes within this genome. For legacy 10x h5
        files, this must be provided if the data contains more than one genome.
    gex_only
        Only keep 'Gene Expression' data and ignore other feature types,
        e.g. 'Antibody Capture', 'CRISPR Guide Capture', or 'Custom'

    Returns
    -------
    adata: AnnData
        Annotated data matrix, where observations/cells are named by their
        barcode and variables/genes by gene name

    Raises
    ------
    ValueError
        If `genome` is given but absent from a v3-format file.
    """
    with tables.open_file(str(filename), 'r') as f:
        # the '/matrix' group only exists in CellRanger v3 files
        v3 = '/matrix' in f
    if v3:
        adata = _read_v3_10x_h5(filename)
        if genome:
            if genome not in adata.var['genome'].values:
                # report the actual file path instead of a fixed
                # placeholder string
                raise ValueError(
                    f"Could not find data corresponding to "
                    f"genome '{genome}' in '{filename}'. "
                    f'Available genomes are:'
                    f' {list(adata.var["genome"].unique())}.'
                )
            adata = adata[:, adata.var['genome'] == genome]
        if gex_only:
            adata = adata[:, adata.var['feature_types'] == 'Gene Expression']
            if adata.is_view:
                adata = adata.copy()
    else:
        adata = _read_legacy_10x_h5(filename, genome=genome)
    return adata
145 |
146 |
def load_pbg_config(path=None):
    """Load PBG configuration into global setting

    Parameters
    ----------
    path: `str`, optional (default: None)
        Path to the directory containing the pbg `config.json` file.
        If None, `.settings.pbg_params['checkpoint_path']` will be used.

    Returns
    -------
    Updates `.settings.pbg_params`

    """
    if path is None:
        path = settings.pbg_params['checkpoint_path']
    config_file = os.path.join(os.path.normpath(path), 'config.json')
    with open(config_file, "rt") as tf:
        pbg_params = json.load(tf)
    settings.set_pbg_params(config=pbg_params)
167 |
168 |
def load_graph_stats(path=None):
    """Load graph statistics into global setting

    Parameters
    ----------
    path: `str`, optional (default: None)
        Path to the directory containing `graph_stats.json`.
        If None, the grandparent directory of
        `.settings.pbg_params['entity_path']` will be used.

    Returns
    -------
    Updates `.settings.graph_stats`
    """
    if path is None:
        path = Path(
            settings.pbg_params['entity_path']).parent.parent.as_posix()
    path = os.path.normpath(path)
    with open(os.path.join(path, 'graph_stats.json'), "rt") as tf:
        dict_graph_stats = json.load(tf)
    # keyed by directory name so stats of several graphs can coexist
    settings.graph_stats[os.path.basename(path)] = dict_graph_stats.copy()
190 |
191 |
def write_bed(adata,
              use_top_pcs=True,
              filename=None
              ):
    """Write peaks into .bed file

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix with peaks as variables;
        `.var` must contain the columns 'chr', 'start' and 'end'.
    use_top_pcs: `bool`, optional (default: True)
        If True, write only top-PCs-associated features
        (requires `si.pp.select_pcs_features()` to have been run).
    filename: `str`, optional (default: None)
        Filename name for peaks.
        By default, a file named 'peaks.bed' will be written to
        `.settings.workdir`
    """
    if filename is None:
        filename = os.path.join(settings.workdir, 'peaks.bed')
    bed_cols = ['chr', 'start', 'end']
    for col in bed_cols:
        if col not in adata.var_keys():
            raise ValueError(f"could not find {col} in `adata.var_keys()`")
    if use_top_pcs:
        assert 'top_pcs' in adata.var_keys(), \
            "please run `si.pp.select_pcs_features()` first"
        peaks_selected = adata.var[adata.var['top_pcs']][bed_cols]
    else:
        peaks_selected = adata.var[bed_cols]
    peaks_selected.to_csv(filename,
                          sep='\t',
                          header=False,
                          index=False)
    fp, fn = os.path.split(filename)
    print(f'"{fn}" was written to "{fp}".')
228 |
--------------------------------------------------------------------------------
/simba/tools/__init__.py:
--------------------------------------------------------------------------------
1 | """The core functionality"""
2 |
3 | from ._general import (
4 | discretize,
5 | )
6 | from ._umap import umap
7 | from ._gene_scores import gene_scores
8 | from ._integration import (
9 | infer_edges,
10 | trim_edges
11 | )
12 | from ._pbg import (
13 | gen_graph,
14 | pbg_train
15 | )
16 | from ._post_training import (
17 | softmax,
18 | embed,
19 | compare_entities,
20 | query,
21 | find_master_regulators,
22 | find_target_genes,
23 | )
24 |
--------------------------------------------------------------------------------
/simba/tools/_gene_scores.py:
--------------------------------------------------------------------------------
1 | """Predict gene scores based on chromatin accessibility"""
2 |
3 | import numpy as np
4 | import pandas as pd
5 | import anndata as ad
6 | import io
7 | import pybedtools
8 | from scipy.sparse import (
9 | coo_matrix,
10 | csr_matrix
11 | )
12 | import pkgutil
13 |
14 | from ._utils import _uniquify
15 |
16 |
class GeneScores:
    """Compute gene activity scores from chromatin accessibility data.

    Peaks are linked to genes when they overlap the extended TSS region
    or the extended gene body; each link is weighted by the distance
    between the peak and the gene, and weighted peak counts are
    aggregated per cell to produce a cell-by-gene score matrix.

    Attributes
    ----------
    adata : `AnnData`
        Input cell-by-peak annotated data matrix.
    genome : `str`
        Reference genome name; one of {'hg19', 'hg38', 'mm9', 'mm10'}.
    gene_anno : `pandas.DataFrame` or None
        Gene annotation with columns
        ['chr', 'start', 'end', 'symbol', 'strand'].

    Methods
    -------
    cal_gene_scores()
        Calculate the cell-by-gene score matrix.
    """
    def __init__(self,
                 adata,
                 genome,
                 gene_anno=None,
                 tss_upstream=1e5,
                 tss_downsteam=1e5,
                 gb_upstream=5000,
                 cutoff_weight=1,
                 use_top_pcs=True,
                 use_precomputed=True,
                 use_gene_weigt=True,
                 min_w=1,
                 max_w=5):
        """
        Parameters
        ----------
        adata: `Anndata`
            Input anndata
        genome : `str`
            The genome name
        gene_anno : `pandas.DataFrame`, optional (default: None)
            Custom gene annotation. If None, the built-in annotation
            for `genome` is loaded on demand.
        tss_upstream : `int`, optional (default: 1e5)
            The number of base pairs upstream of TSS
        tss_downsteam : `int`, optional (default: 1e5)
            The number of base pairs downstream of TSS
        gb_upstream : `int`, optional (default: 5000)
            The number of base pairs upstream by which
            the gene body is extended
        cutoff_weight : `float`, optional (default: 1)
            Peak weights below this cutoff are set to zero
        use_top_pcs : `bool`, optional (default: True)
            If True, only peaks associated with top PCs are used
        use_precomputed : `bool`, optional (default: True)
            If True, reuse the peak-gene overlap stored in
            `adata.uns['gene_scores']['overlap']`
        use_gene_weigt : `bool`, optional (default: True)
            If True, rescale gene scores by gene size
        min_w : `int`, optional (default: 1)
            The minimum per-gene weight (only if `use_gene_weigt`)
        max_w : `int`, optional (default: 5)
            The maximum per-gene weight (only if `use_gene_weigt`)
        """
        self.adata = adata
        self.genome = genome
        self.gene_anno = gene_anno
        self.tss_upstream = tss_upstream
        self.tss_downsteam = tss_downsteam
        self.gb_upstream = gb_upstream
        self.cutoff_weight = cutoff_weight
        self.use_top_pcs = use_top_pcs
        self.use_precomputed = use_precomputed
        self.use_gene_weigt = use_gene_weigt
        self.min_w = min_w
        self.max_w = max_w

    def _read_gene_anno(self):
        """Read in the built-in gene annotation for `self.genome`.

        Returns
        -------
        gene_anno: `pandas.DataFrame`
            Gene annotation with columns
            ['chr', 'start', 'end', 'symbol', 'strand'];
            also stored in `self.gene_anno`.
        """
        assert (self.genome in ['hg19', 'hg38', 'mm9', 'mm10']),\
            "`genome` must be one of ['hg19','hg38','mm9','mm10']"

        bin_str = pkgutil.get_data('simba',
                                   f'data/gene_anno/{self.genome}_genes.bed')
        gene_anno = pd.read_csv(io.BytesIO(bin_str),
                                encoding='utf8',
                                sep='\t',
                                header=None,
                                names=['chr', 'start', 'end',
                                       'symbol', 'strand'])
        self.gene_anno = gene_anno
        return self.gene_anno

    def _extend_tss(self, pbt_gene):
        """Extend the transcription start site in both directions.

        Meant to be used with `pybedtools.BedTool.each()`;
        `pbt_gene` is a single BED interval carrying a 'strand' field.
        """
        ext_tss = pbt_gene
        if ext_tss['strand'] == '+':
            ext_tss.start = max(0, ext_tss.start - self.tss_upstream)
            ext_tss.end = max(ext_tss.end, ext_tss.start + self.tss_downsteam)
        else:
            # for minus-strand genes the TSS is at `end`
            ext_tss.start = max(0, min(ext_tss.start,
                                       ext_tss.end - self.tss_downsteam))
            ext_tss.end = ext_tss.end + self.tss_upstream
        return ext_tss

    def _extend_genebody(self, pbt_gene):
        """Extend the gene body upstream by `self.gb_upstream`.

        Meant to be used with `pybedtools.BedTool.each()`.
        """
        ext_gb = pbt_gene
        if ext_gb['strand'] == '+':
            ext_gb.start = max(0, ext_gb.start - self.gb_upstream)
        else:
            ext_gb.end = ext_gb.end + self.gb_upstream
        return ext_gb

    def _weight_genes(self):
        """Weight genes by size.

        Shorter genes receive larger weights; weights are scaled
        linearly into the range [`self.min_w`, `self.max_w`].

        Returns
        -------
        w_scaled: `pandas.Series`
            Per-gene weights aligned with `self.gene_anno`.
        """
        gene_anno = self.gene_anno
        gene_size = gene_anno['end'] - gene_anno['start']
        w = 1/gene_size
        w_scaled = (self.max_w-self.min_w) * (w-min(w)) / (max(w)-min(w)) \
            + self.min_w
        return w_scaled

    def cal_gene_scores(self):
        """Calculate gene scores.

        Returns
        -------
        adata_CG_atac: `AnnData`
            Cell-by-gene annotated matrix of gene scores.
            Also updates `self.adata.uns['gene_scores']['overlap']`
            with the peak-gene overlap table.
        """
        adata = self.adata
        if self.gene_anno is None:
            gene_ann = self._read_gene_anno()
        else:
            gene_ann = self.gene_anno

        df_gene_ann = gene_ann.copy()
        df_gene_ann.index = _uniquify(df_gene_ann['symbol'].values)
        if self.use_top_pcs:
            mask_p = adata.var['top_pcs']
        else:
            mask_p = pd.Series(True, index=adata.var_names)
        df_peaks = adata.var[mask_p][['chr', 'start', 'end']].copy()

        if 'gene_scores' not in adata.uns_keys():
            print('Gene scores are being calculated for the first time')
            print('`use_precomputed` has been ignored')
            self.use_precomputed = False

        if self.use_precomputed:
            print('Using precomputed overlap')
            df_overlap_updated = adata.uns['gene_scores']['overlap'].copy()
        else:
            # add the fifth column
            # so that pybedtool can recognize the sixth column as the strand
            df_gene_ann_for_pbt = df_gene_ann.copy()
            df_gene_ann_for_pbt['score'] = 0
            df_gene_ann_for_pbt = df_gene_ann_for_pbt[['chr', 'start', 'end',
                                                       'symbol', 'score',
                                                       'strand']]
            df_gene_ann_for_pbt['id'] = range(df_gene_ann_for_pbt.shape[0])

            df_peaks_for_pbt = df_peaks.copy()
            df_peaks_for_pbt['id'] = range(df_peaks_for_pbt.shape[0])

            pbt_gene_ann = pybedtools.BedTool.from_dataframe(
                df_gene_ann_for_pbt
            )
            pbt_gene_ann_ext = pbt_gene_ann.each(self._extend_tss)
            pbt_gene_gb_ext = pbt_gene_ann.each(self._extend_genebody)

            pbt_peaks = pybedtools.BedTool.from_dataframe(df_peaks_for_pbt)

            # peaks overlapping with extended TSS
            pbt_overlap = pbt_peaks.intersect(pbt_gene_ann_ext,
                                              wa=True,
                                              wb=True)
            df_overlap = pbt_overlap.to_dataframe(
                names=[x+'_p' for x in df_peaks_for_pbt.columns]
                + [x+'_g' for x in df_gene_ann_for_pbt.columns])
            # peaks overlapping with gene body
            pbt_overlap2 = pbt_peaks.intersect(pbt_gene_gb_ext,
                                               wa=True,
                                               wb=True)
            df_overlap2 = pbt_overlap2.to_dataframe(
                names=[x+'_p' for x in df_peaks_for_pbt.columns]
                + [x+'_g' for x in df_gene_ann_for_pbt.columns])

            # add distance and weight for each overlap
            df_overlap_updated = df_overlap.copy()
            df_overlap_updated['dist'] = 0

            # report progress roughly every 20% of genes;
            # guard against a zero step (ZeroDivisionError in `i % n_batch`)
            # when there are fewer than 5 genes
            n_batch = max(1, int(df_gene_ann_for_pbt.shape[0]/5))
            for i, x in enumerate(df_overlap['symbol_g'].unique()):
                # peaks within the extended TSS
                df_overlap_x = \
                    df_overlap[df_overlap['symbol_g'] == x].copy()
                # peaks within the gene body
                df_overlap2_x = \
                    df_overlap2[df_overlap2['symbol_g'] == x].copy()
                # peaks that are not intersecting with the promoter
                # and gene body of gene x
                id_overlap = df_overlap_x.index[
                    ~np.isin(df_overlap_x['id_p'], df_overlap2_x['id_p'])]
                mask_x = (df_gene_ann['symbol'] == x)
                range_x = df_gene_ann[mask_x][['start', 'end']].values\
                    .flatten()
                if df_overlap_x['strand_g'].iloc[0] == '+':
                    df_overlap_updated.loc[id_overlap, 'dist'] = pd.concat(
                        [abs(df_overlap_x.loc[id_overlap, 'start_p']
                             - (range_x[1])),
                         abs(df_overlap_x.loc[id_overlap, 'end_p']
                             - max(0, range_x[0]-self.gb_upstream))],
                        axis=1, sort=False).min(axis=1)
                else:
                    df_overlap_updated.loc[id_overlap, 'dist'] = pd.concat(
                        [abs(df_overlap_x.loc[id_overlap, 'start_p']
                             - (range_x[1]+self.gb_upstream)),
                         abs(df_overlap_x.loc[id_overlap, 'end_p']
                             - (range_x[0]))],
                        axis=1, sort=False).min(axis=1)

                if i % n_batch == 0:
                    print(f'Processing: {i/df_gene_ann_for_pbt.shape[0]:.1%}')
            df_overlap_updated['dist'] = df_overlap_updated['dist']\
                .astype(float)

            adata.uns['gene_scores'] = dict()
            adata.uns['gene_scores']['overlap'] = df_overlap_updated.copy()

        # distance-decay weight; peaks inside the extended gene body
        # keep dist == 0 and hence weight 1
        df_overlap_updated['weight'] = np.exp(
            -(df_overlap_updated['dist'].values/self.gb_upstream))
        mask_w = (df_overlap_updated['weight'] < self.cutoff_weight)
        df_overlap_updated.loc[mask_w, 'weight'] = 0
        # construct genes-by-peaks matrix
        mat_GP = csr_matrix(coo_matrix((df_overlap_updated['weight'],
                                        (df_overlap_updated['id_g'],
                                         df_overlap_updated['id_p'])),
                                       shape=(df_gene_ann.shape[0],
                                              df_peaks.shape[0])))
        if self.use_gene_weigt:
            gene_weights = self._weight_genes()
            gene_scores = adata[:, mask_p].X * \
                (mat_GP.T.multiply(gene_weights))
        else:
            gene_scores = adata[:, mask_p].X * mat_GP.T
        adata_CG_atac = ad.AnnData(gene_scores,
                                   obs=adata.obs.copy(),
                                   var=df_gene_ann.copy())
        return adata_CG_atac
273 |
274 |
def gene_scores(adata,
                genome,
                gene_anno=None,
                tss_upstream=1e5,
                tss_downsteam=1e5,
                gb_upstream=5000,
                cutoff_weight=1,
                use_top_pcs=True,
                use_precomputed=True,
                use_gene_weigt=True,
                min_w=1,
                max_w=5):
    """Calculate gene scores

    Convenience wrapper that builds a :class:`GeneScores` object from the
    given arguments and returns the result of its ``cal_gene_scores()``.

    Parameters
    ----------
    adata : AnnData
        Annotated data matrix.
    genome : `str`
        Reference genome. Choose from {'hg19', 'hg38', 'mm9', 'mm10'}
    gene_anno : `pandas.DataFrame`, optional (default: None)
        Dataframe of gene annotation.
        If None, a built-in gene annotation matching `genome` is used;
        otherwise the provided custom annotation is used.
    tss_upstream : `int`, optional (default: 1e5)
        The number of base pairs upstream of TSS
    tss_downsteam : `int`, optional (default: 1e5)
        The number of base pairs downstream of TSS
    gb_upstream : `int`, optional (default: 5000)
        The number of base pairs upstream by which the gene body
        is extended. Peaks within the extended gene body are given
        the weight of 1.
    cutoff_weight : `float`, optional (default: 1)
        Weight cutoff for peaks
    use_top_pcs : `bool`, optional (default: True)
        If True, only peaks associated with top PCs will be used
    use_precomputed : `bool`, optional (default: True)
        If True, the overlap between peaks and genes
        (stored in `adata.uns['gene_scores']['overlap']`) is reused
    use_gene_weigt : `bool`, optional (default: True)
        If True, for each gene, the number of peaks assigned to it
        is rescaled based on gene size
    min_w : `int`, optional (default: 1)
        The minimum weight for each gene.
        Only valid if `use_gene_weigt` is True
    max_w : `int`, optional (default: 5)
        The maximum weight for each gene.
        Only valid if `use_gene_weigt` is True

    Returns
    -------
    adata_new: AnnData
        Annotated data matrix storing the #cells x #genes
        gene score matrix.

    `adata` is also updated with the following field:
    overlap: `pandas.DataFrame`, (`adata.uns['gene_scores']['overlap']`)
        Dataframe of overlap between peaks and genes
    """
    calculator = GeneScores(adata,
                            genome,
                            gene_anno=gene_anno,
                            tss_upstream=tss_upstream,
                            tss_downsteam=tss_downsteam,
                            gb_upstream=gb_upstream,
                            cutoff_weight=cutoff_weight,
                            use_top_pcs=use_top_pcs,
                            use_precomputed=use_precomputed,
                            use_gene_weigt=use_gene_weigt,
                            min_w=min_w,
                            max_w=max_w)
    return calculator.cal_gene_scores()
347 |
--------------------------------------------------------------------------------
/simba/tools/_general.py:
--------------------------------------------------------------------------------
1 | """General-purpose tools"""
2 |
3 | import numpy as np
4 | from sklearn.cluster import KMeans
5 |
6 |
def discretize(adata,
               layer=None,
               n_bins=5,
               max_bins=100):
    """Discretize continous values

    The nonzero values are first summarized into a `max_bins`-bin
    histogram; the histogram bin centers are then clustered with
    weighted k-means into `n_bins` groups, whose boundaries define
    the final discretization bins.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    layer: `str`, optional (default: None)
        The layer used to perform discretization
    n_bins: `int`, optional (default: 5)
        The number of bins to produce.
        It must be smaller than `max_bins`.
    max_bins: `int`, optional (default: 100)
        The number of bins used in the initial approximation.
        i.e. the number of bins to cluster.

    Returns
    -------
    updates `adata` with the following fields

    `.layer['simba']` : `array_like`
        The matrix of discretized values to build SIMBA graph.
    `.uns['disc']` : `dict`
        `bin_edges`: The edges of each bin.
        `bin_count`: The number of values in each bin.
        `hist_edges`: The edges of each bin \
        in the initial approximation.
        `hist_count`: The number of values in each bin \
        for the initial approximation.
    """
    X = adata.X if layer is None else adata.layers[layer]
    # NOTE(review): assumes a scipy sparse matrix —
    # only the stored (nonzero) values are discretized
    values = X.data

    hist_count, hist_edges = np.histogram(
        values,
        bins=max_bins,
        density=False)
    centroids = 0.5 * (hist_edges[:-1] + hist_edges[1:])

    km = KMeans(n_clusters=n_bins, random_state=2021, n_init='auto')
    km.fit(centroids.reshape(-1, 1), sample_weight=hist_count)
    centers = np.sort(km.cluster_centers_.flatten())

    # pad the outermost edges slightly so every value falls inside a bin
    pad = (hist_edges[-1] - hist_edges[0]) / (max_bins * 10)
    inner_edges = 0.5 * (centers[:-1] + centers[1:])
    bin_edges = np.concatenate(([hist_edges[0] - pad],
                                inner_edges,
                                [hist_edges[-1] + pad]))

    disc = np.digitize(values, bin_edges).reshape(-1,)
    bin_count = np.unique(disc, return_counts=True)[1]

    adata.layers['simba'] = X.copy()
    adata.layers['simba'].data = disc
    adata.uns['disc'] = {
        'bin_edges': bin_edges,
        'bin_count': bin_count,
        'hist_edges': hist_edges,
        'hist_count': hist_count,
    }
72 |
--------------------------------------------------------------------------------
/simba/tools/_integration.py:
--------------------------------------------------------------------------------
1 | """Integration across experimental conditions or single cell modalities"""
2 |
3 | import numpy as np
4 | import anndata as ad
5 | # from sklearn.metrics.pairwise import pairwise_distances
6 | from sklearn.utils.extmath import randomized_svd
7 | from scipy.sparse import csr_matrix, find
8 |
9 | from ._utils import _knn
10 |
11 |
def infer_edges(adata_ref,
                adata_query,
                feature='highly_variable',
                n_components=20,
                random_state=42,
                layer=None,
                k=20,
                metric='euclidean',
                leaf_size=40,
                **kwargs):
    """Infer edges between reference and query observations

    Both datasets are embedded into a shared space via randomized SVD of
    their cross-product over shared features; edges are the mutual
    nearest neighbors in that space.

    Parameters
    ----------
    adata_ref: `AnnData`
        Annotated reference data matrix.
    adata_query: `AnnData`
        Annotated query data matrix.
    feature: `str`, optional (default: 'highly_variable')
        Feature used for edges inference.
        The data type of `.var[feature]` needs to be `bool`
    n_components: `int`, optional (default: 20)
        The number of components used in `randomized_svd`
        for comparing reference and query observations
    random_state: `int`, optional (default: 42)
        The seed used for truncated randomized SVD
    layer: `str`, optional (default: None)
        The layer used to perform edge inference.
        If None, `.X` will be used.
    k: `int`, optional (default: 20)
        The number of nearest neighbors to consider
        in each direction of the search
    metric: `str`, optional (default: 'euclidean')
        The metric to use when calculating distance between
        reference and query observations
    leaf_size: `int`, optional (default: 40)
        Leaf size of the KD-tree used for the neighbor search
    kwargs:
        Other keyword arguments are passed down to `randomized_svd()`

    Returns
    -------
    adata_ref_query: `AnnData`
        Annotated relation matrix between reference and query
        observations. Stores reference entities as observations
        and query entities as variables.
    """
    mask_ref = adata_ref.var[feature]
    feature_ref = adata_ref.var_names[mask_ref]
    feature_query = adata_query.var_names
    feature_shared = list(set(feature_ref).intersection(set(feature_query)))
    print(f'#shared features: {len(feature_shared)}')
    if layer is None:
        X_ref = adata_ref[:, feature_shared].X
        X_query = adata_query[:, feature_shared].X
    else:
        X_ref = adata_ref[:, feature_shared].layers[layer]
        X_query = adata_query[:, feature_shared].layers[layer]

    if any(X_ref.sum(axis=1) == 0) or any(X_query.sum(axis=1) == 0):
        raise ValueError(
            f'Some nodes contain zero expressed {feature} features.\n'
            f'Please try to include more {feature} features.')

    print('Performing randomized SVD ...')
    # NOTE(review): assumes sparse matrices, where `*` is matrix
    # multiplication — confirm callers never pass dense arrays
    mat = X_ref * X_query.T
    U, Sigma, VT = randomized_svd(mat,
                                  n_components=n_components,
                                  random_state=random_state,
                                  **kwargs)
    # stack left/right singular vectors: rows [0, n_ref) embed the
    # reference, the last n_query rows embed the query
    svd_data = np.vstack((U, VT.T))
    X_svd_ref = svd_data[:U.shape[0], :]
    X_svd_query = svd_data[-VT.shape[1]:, :]
    # L2-normalize rows so euclidean distances reflect angular similarity
    X_svd_ref = X_svd_ref / (X_svd_ref**2).sum(-1, keepdims=True)**0.5
    X_svd_query = X_svd_query / (X_svd_query**2).sum(-1, keepdims=True)**0.5

    print('Searching for mutual nearest neighbors ...')
    knn_conn_ref_query, knn_dist_ref_query = _knn(
        X_ref=X_svd_ref,
        X_query=X_svd_query,
        k=k,
        leaf_size=leaf_size,
        metric=metric)
    knn_conn_query_ref, knn_dist_query_ref = _knn(
        X_ref=X_svd_query,
        X_query=X_svd_ref,
        k=k,
        leaf_size=leaf_size,
        metric=metric)

    # keep only edges found in both directions (mutual nearest neighbors)
    sum_conn_ref_query = knn_conn_ref_query + knn_conn_query_ref.T
    id_x, id_y, values = find(sum_conn_ref_query > 1)
    print(f'{len(id_x)} edges are selected')
    conn_ref_query = csr_matrix(
        (values*1, (id_x, id_y)),
        shape=(knn_conn_ref_query.shape))
    dist_ref_query = csr_matrix(
        (np.asarray(knn_dist_ref_query[id_x, id_y]).flatten(), (id_x, id_y)),
        shape=(knn_conn_ref_query.shape))
    # store similarities instead of distances: it's easier to distinguish
    # zeros (no connection vs zero distance)
    sim_ref_query = csr_matrix(
        (1/(dist_ref_query.data+1), dist_ref_query.nonzero()),
        shape=(dist_ref_query.shape))

    adata_ref_query = ad.AnnData(X=sim_ref_query,
                                 obs=adata_ref.obs,
                                 var=adata_query.obs)
    adata_ref_query.layers['simba'] = conn_ref_query
    adata_ref_query.obsm['svd'] = X_svd_ref
    adata_ref_query.varm['svd'] = X_svd_query
    return adata_ref_query
153 |
154 |
def trim_edges(adata_ref_query,
               cutoff=None,
               n_edges=None):
    """Trim edges based on the similarity scores

    Parameters
    ----------
    adata_ref_query: `AnnData`
        Annotated relation matrix between reference and query
        observations, with similarity scores in `.X`.
    cutoff: `float`, optional (default: None)
        The similarity cutoff; edges with scores strictly greater
        than `cutoff` are kept.
        If specified, `n_edges` will be ignored.
    n_edges: `int`, optional (default: None)
        The number of top-scoring edges to keep.
        Only used when `cutoff` is None.

    Returns
    -------
    updates `adata_ref_query` with the following field.
    `.layers['simba']` : `array_like`
        relation matrix between reference and query observations
    """
    sim_ref_query = adata_ref_query.X
    if cutoff is None:
        if n_edges is None:
            raise ValueError('"cutoff" or "n_edges" has to be specified')
        # find the n_edges-th largest stored score; clamp the partition
        # index so that n_edges > nnz simply keeps all edges
        kth = max(sim_ref_query.size - n_edges, 0)
        cutoff = np.partition(sim_ref_query.data, kth)[kth]
        # include the cutoff value itself: a strict '>' against the
        # n-th largest score would keep only n-1 edges
        mask = sim_ref_query >= cutoff
    else:
        mask = sim_ref_query > cutoff
    id_x, id_y, values = find(mask)

    print(f'{len(id_x)} edges are selected')
    conn_ref_query = csr_matrix(
        (values*1, (id_x, id_y)),
        shape=(sim_ref_query.shape))
    adata_ref_query.layers['simba'] = conn_ref_query
198 |
--------------------------------------------------------------------------------
/simba/tools/_umap.py:
--------------------------------------------------------------------------------
1 | """UMAP (Uniform Manifold Approximation and Projection)"""
2 |
3 | import umap as umap_learn
4 |
5 |
def umap(adata,
         n_neighbors=15,
         n_components=2,
         random_state=2020,
         layer=None,
         obsm=None,
         n_dim=None,
         **kwargs,
         ):
    """Perform UMAP

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    n_neighbors: `int`, optional (default: 15)
        The size of local neighborhood for UMAP
    n_components: `int`, optional (default: 2)
        The dimension of the space to embed into for UMAP
    random_state: `int`, optional (default: 2020)
        The seed used by the random number generator for UMAP
    layer: `str`, optional (default: None)
        The layer used to perform UMAP
    obsm: `str`, optional (default: None)
        The multi-dimensional annotation of observations used to perform UMAP
    n_dim: `int`, optional (default: None)
        The number of dimensions used in `layer` or `obsm`
    kwargs:
        Other keyword arguments are passed down to `umap_learn.UMAP`

    Returns
    -------
    updates `adata` with the following fields:
    `.obsm['X_umap']` : `array`
        UMAP coordinates of samples.

    Raises
    ------
    ValueError
        If both `layer` and `obsm` are specified.
    """
    # `layer` and `obsm` are mutually exclusive input sources
    if layer is not None and obsm is not None:
        raise ValueError("Only one of `layer` and `obsm` can be used")
    if obsm is not None:
        X = adata.obsm[obsm]
    elif layer is not None:
        X = adata.layers[layer]
    else:
        X = adata.X
    if n_dim is not None:
        X = X[:, :n_dim]
    reducer = umap_learn.UMAP(n_neighbors=n_neighbors,
                              n_components=n_components,
                              random_state=random_state,
                              **kwargs)
    reducer.fit(X)
    adata.obsm['X_umap'] = reducer.embedding_
59 |
--------------------------------------------------------------------------------
/simba/tools/_utils.py:
--------------------------------------------------------------------------------
1 | """Utility functions and classes"""
2 |
3 | import numpy as np
4 | from sklearn.neighbors import KDTree
5 | from scipy.sparse import csr_matrix
6 |
7 |
8 | def _uniquify(seq, sep='-'):
9 | """Uniquify a list of strings.
10 |
11 | Adding unique numbers to duplicate values.
12 |
13 | Parameters
14 | ----------
15 | seq : `list` or `array-like`
16 | A list of values
17 | sep : `str`
18 | Separator
19 |
20 | Returns
21 | -------
22 | seq: `list` or `array-like`
23 | A list of updated values
24 | """
25 |
26 | dups = {}
27 |
28 | for i, val in enumerate(seq):
29 | if val not in dups:
30 | # Store index of first occurrence and occurrence value
31 | dups[val] = [i, 1]
32 | else:
33 | # Increment occurrence value, index value doesn't matter anymore
34 | dups[val][1] += 1
35 |
36 | # Use stored occurrence value
37 | seq[i] += (sep+str(dups[val][1]))
38 |
39 | return seq
40 |
41 |
42 | def _gini(array):
43 | """Calculate the Gini coefficient of a numpy array.
44 | """
45 |
46 | array = array.flatten().astype(float)
47 | if np.amin(array) < 0:
48 | # Values cannot be negative:
49 | array -= np.amin(array)
50 | # Values cannot be 0:
51 | array += 0.0000001
52 | # Values must be sorted:
53 | array = np.sort(array)
54 | # Index per array element:
55 | index = np.arange(1, array.shape[0]+1)
56 | # Number of array elements:
57 | n = array.shape[0]
58 | # Gini coefficient:
59 | return ((np.sum((2 * index - n - 1) * array)) / (n * np.sum(array)))
60 |
61 |
def _knn(X_ref,
         X_query=None,
         k=20,
         leaf_size=40,
         metric='euclidean'):
    """Find the k nearest reference rows for each query row.

    Builds a KD-tree on `X_ref` and queries it with `X_query`
    (or with `X_ref` itself when `X_query` is None).

    Returns
    -------
    (conn, dist): sparse matrices of shape (n_ref, n_query) holding
    the 0/1 connectivities and the corresponding distances.
    """
    if X_query is None:
        X_query = X_ref.copy()
    tree = KDTree(X_ref, leaf_size=leaf_size, metric=metric)
    dist, ind = tree.query(X_query, k=k, return_distance=True)
    n_query = X_query.shape[0]
    # one sparse entry per (query row, neighbor) pair
    rows = np.repeat(np.arange(n_query), k)
    cols = ind.ravel()
    shape = (n_query, X_ref.shape[0])
    mat_conn_ref_query = csr_matrix(
        (np.ones(rows.size, dtype=int), (rows, cols)), shape=shape).T
    mat_dist_ref_query = csr_matrix(
        (dist.ravel(), (rows, cols)), shape=shape).T
    return mat_conn_ref_query, mat_dist_ref_query
86 |
--------------------------------------------------------------------------------
/tests/data/10xpbmc_atac_subset.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huidongchen/simba/534e0b022ea1163face30263696f28b9a955c291/tests/data/10xpbmc_atac_subset.h5ad
--------------------------------------------------------------------------------
/tests/data/10xpbmc_rna_subset.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huidongchen/simba/534e0b022ea1163face30263696f28b9a955c291/tests/data/10xpbmc_rna_subset.h5ad
--------------------------------------------------------------------------------
/tests/data/pbg_training/entity_alias.txt:
--------------------------------------------------------------------------------
1 | alias
2 | GCACCTAAGTTAGTGC-1_rna C.0
3 | TCTCCTCGTGGAGCAA-1_rna C.1
4 | GTTCTTGTCTTGACCC-1_rna C.2
5 | CGCGATTCAGGATGGC-1_rna C.3
6 | CCTACTTCAGAATGAC-1_rna C.4
7 | TGAAGGATCCTTTACG-1_rna C.5
8 | AATCCGTAGCACTAAC-1_rna C.6
9 | GCTTAAATCGGCCATA-1_rna C.7
10 | GGGCTAACACCTAAGC-1_rna C.8
11 | TAGCCTGAGGTCTTGG-1_rna C.9
12 | ACCTTGCTCGTTAGCG-1_rna C.10
13 | CTAGGCGGTAGACAAA-1_rna C.11
14 | CGATTTGCATTGCGTA-1_rna C.12
15 | CGTGAGGAGGAGCAAC-1_rna C.13
16 | GGGTCAACACATAGCC-1_rna C.14
17 | ACTCAGTAGTAGGATG-1_rna C.15
18 | ATTACCCGTAGGTTAT-1_rna C.16
19 | ATTAGGTGTTTGGCGG-1_rna C.17
20 | ACCAAACTCATTATGG-1_rna C.18
21 | AGCTTGGTCGCTAGTG-1_rna C.19
22 | TGTGGAGCAACCTGGT-1_rna C.20
23 | TTTAGCTTCCTTAAGA-1_rna C.21
24 | GTTTAACCATAATCCG-1_rna C.22
25 | CGTATTGCAGCTAATT-1_rna C.23
26 | ACATAGCTCCCTGACT-1_rna C.24
27 | GTTAAACGTTTCCACG-1_rna C.25
28 | TTAACTGAGTATTGTG-1_rna C.26
29 | GGCTATGTCCCTGGTT-1_rna C.27
30 | AGGTCATTCTAACCAA-1_rna C.28
31 | CAGCCTTTCTCACAAA-1_rna C.29
32 | TCAGTAGGTAGGTTAT-1_rna C.30
33 | TGAAGTGAGGAAGTAT-1_rna C.31
34 | GCTGGTTCAATTAAGG-1_rna C.32
35 | TGAGCCGGTGCACGCA-1_rna C.33
36 | CGCTTAACAGCCGCTA-1_rna C.34
37 | GTGCACGGTTGTAAAC-1_rna C.35
38 | CTGTTAAAGAATGACG-1_rna C.36
39 | TGTAAGCTCTTAGGAC-1_rna C.37
40 | GAGCGAAGTTCGGGAT-1_rna C.38
41 | GATTCATCATAATGTC-1_rna C.39
42 | TTCCTCAAGGTTTGAC-1_rna C.40
43 | AGGTACGCAGCCTTGG-1_rna C.41
44 | CATTTGTTCGCACACA-1_rna C.42
45 | CCGTTAACAATCCTGA-1_rna C.43
46 | GGCCTCTGTCCTCCAA-1_rna C.44
47 | GCTGTGATCAATCTCT-1_rna C.45
48 | GGGAATATCTTAATGG-1_rna C.46
49 | AGGTGAGGTGGATTCA-1_rna C.47
50 | TGCACTTGTTTACGTC-1_rna C.48
51 | GTTCGCTTCCGTTAAA-1_rna C.49
52 | CCAAATCAGCGGTTAT-1_rna C.50
53 | GGCCTAATCCCTGTTA-1_rna C.51
54 | GGGCCTAGTGTCCAGG-1_rna C.52
55 | CGGACCTAGTCACTCC-1_rna C.53
56 | ATATGGTGTCAGGAAG-1_rna C.54
57 | CGCTTACTCCTAATTC-1_rna C.55
58 | AAAGCTTGTCGACTAA-1_rna C.56
59 | AAGCATGAGGCCTAAT-1_rna C.57
60 | GTTACAGGTAGGTTAT-1_rna C.58
61 | CCATAATCATGCTATG-1_rna C.59
62 | CACCTCAGTTTGCGAA-1_rna C.60
63 | ATTGCGCCACTAGCGT-1_rna C.61
64 | CCTACTGGTGCCGCAA-1_rna C.62
65 | GCAATAGAGGCGGATG-1_rna C.63
66 | GCGCGATTCCTCCCTC-1_rna C.64
67 | TAGGTTATCTCGACCT-1_rna C.65
68 | TGCTTGCTCATGAGCT-1_rna C.66
69 | GACTCACCAGTAATAG-1_rna C.67
70 | TTCCCGCCAATAACGA-1_rna C.68
71 | CGAACCGGTAGCCATA-1_rna C.69
72 | ACCAAGCGTCATAAGT-1_rna C.70
73 | CGCCAAATCCCAGTAG-1_rna C.71
74 | ACTCACTGTTTGGTTC-1_rna C.72
75 | AGAAGGTGTAGGTTGC-1_rna C.73
76 | ATTTGCGCAGGCTTGT-1_rna C.74
77 | TAGTTGTCATCGCTCC-1_rna C.75
78 | GGACAGCCAGATTCAT-1_rna C.76
79 | TTGAGCTAGCTTACTT-1_rna C.77
80 | CATTTGTTCTAAATCG-1_rna C.78
81 | TAGGAGTCACTGACCG-1_rna C.79
82 | ATCAAGCTCGGGATTT-1_rna C.80
83 | TCTCGCCCAACCTGGT-1_rna C.81
84 | GCGGTTATCCTGATGG-1_rna C.82
85 | GATTGCAGTGGAGCAA-1_rna C.83
86 | CGATTCCTCTTGCTAT-1_rna C.84
87 | GCTCATTGTTCACCAT-1_rna C.85
88 | CAAACGCGTTTCGCGC-1_rna C.86
89 | TCTTAGCGTCCGTGAG-1_rna C.87
90 | GCGCGATTCCTTGAGG-1_rna C.88
91 | ATAGGTACAGGTCCTG-1_rna C.89
92 | TCCTTAGTCCTGAGTG-1_rna C.90
93 | TTCCCACAGCCAAATC-1_rna C.91
94 | GGGTGAAGTGCATCGG-1_rna C.92
95 | CGGATAAAGTAGAGGC-1_rna C.93
96 | AGGATGTCACAAAGAC-1_rna C.94
97 | CTTCTCAAGGGTGGAT-1_rna C.95
98 | AACCCGCAGCGGATTT-1_rna C.96
99 | GGACATAAGGGATGCG-1_rna C.97
100 | TACAAGCTCTGTGAGT-1_rna C.98
101 | CGTATTGCATTCAGCA-1_rna C.99
102 | DPH3 G.0
103 | BICD1 G.1
104 | MAML3 G.2
105 | TTN-AS1 G.3
106 | APPL2 G.4
107 | HLX G.5
108 | CHIC1 G.6
109 | DDX39B G.7
110 | SRC G.8
111 | VAPB G.9
112 | RPS10-NUDT3 G.10
113 | POLR2A G.11
114 | AC007262.2 G.12
115 | CCND2 G.13
116 | PTCD3 G.14
117 | TNFRSF10A G.15
118 | POLR3GL G.16
119 | NNT G.17
120 | IL26 G.18
121 | RPL10 G.19
122 | UHRF1BP1L G.20
123 | AC124014.1 G.21
124 | ELOVL1 G.22
125 | SGPL1 G.23
126 | USP42 G.24
127 | ATF7IP2 G.25
128 | METTL22 G.26
129 | HSCB G.27
130 | PCTP G.28
131 | FAM174B G.29
132 | TMEM184B G.30
133 | SERF2 G.31
134 | KIAA0930 G.32
135 | GNAQ G.33
136 | SCFD1 G.34
137 | UBE2R2 G.35
138 | ARL5B G.36
139 | FRMD4A G.37
140 | EML5 G.38
141 | FAM3A G.39
142 | ARHGAP22 G.40
143 | KXD1 G.41
144 | A1BG G.42
145 | C4orf3 G.43
146 | FAM153CP G.44
147 | PPP1R9A G.45
148 | IQGAP2 G.46
149 | ACTG1 G.47
150 | GPLD1 G.48
151 | SIRPG G.49
152 | CALML4 G.50
153 | IAH1 G.51
154 | LAT2 G.52
155 | AAAS G.53
156 |
--------------------------------------------------------------------------------
/tests/data/pbg_training/graph_stats.json:
--------------------------------------------------------------------------------
1 | {
2 | "n_edges": 1075,
3 | "relation0": {
4 | "destination": "G",
5 | "n_edges": 153,
6 | "source": "C"
7 | },
8 | "relation1": {
9 | "destination": "G",
10 | "n_edges": 369,
11 | "source": "C"
12 | },
13 | "relation2": {
14 | "destination": "G",
15 | "n_edges": 301,
16 | "source": "C"
17 | },
18 | "relation3": {
19 | "destination": "G",
20 | "n_edges": 166,
21 | "source": "C"
22 | },
23 | "relation4": {
24 | "destination": "G",
25 | "n_edges": 86,
26 | "source": "C"
27 | }
28 | }
--------------------------------------------------------------------------------
/tests/data/pbg_training/input/entity/entity_count_C_0.txt:
--------------------------------------------------------------------------------
1 | 100
2 |
--------------------------------------------------------------------------------
/tests/data/pbg_training/input/entity/entity_count_G_0.txt:
--------------------------------------------------------------------------------
1 | 54
2 |
--------------------------------------------------------------------------------
/tests/data/pbg_training/input/entity/entity_names_C_0.json:
--------------------------------------------------------------------------------
1 | [
2 | "C.3",
3 | "C.73",
4 | "C.5",
5 | "C.93",
6 | "C.58",
7 | "C.38",
8 | "C.14",
9 | "C.24",
10 | "C.35",
11 | "C.60",
12 | "C.70",
13 | "C.64",
14 | "C.72",
15 | "C.68",
16 | "C.79",
17 | "C.12",
18 | "C.52",
19 | "C.81",
20 | "C.83",
21 | "C.87",
22 | "C.48",
23 | "C.91",
24 | "C.11",
25 | "C.33",
26 | "C.77",
27 | "C.88",
28 | "C.9",
29 | "C.0",
30 | "C.39",
31 | "C.28",
32 | "C.36",
33 | "C.75",
34 | "C.92",
35 | "C.85",
36 | "C.10",
37 | "C.67",
38 | "C.20",
39 | "C.37",
40 | "C.46",
41 | "C.7",
42 | "C.53",
43 | "C.44",
44 | "C.23",
45 | "C.4",
46 | "C.42",
47 | "C.8",
48 | "C.50",
49 | "C.90",
50 | "C.1",
51 | "C.76",
52 | "C.61",
53 | "C.6",
54 | "C.56",
55 | "C.13",
56 | "C.89",
57 | "C.41",
58 | "C.25",
59 | "C.62",
60 | "C.84",
61 | "C.15",
62 | "C.40",
63 | "C.55",
64 | "C.96",
65 | "C.65",
66 | "C.86",
67 | "C.69",
68 | "C.98",
69 | "C.17",
70 | "C.94",
71 | "C.97",
72 | "C.18",
73 | "C.54",
74 | "C.19",
75 | "C.59",
76 | "C.49",
77 | "C.34",
78 | "C.26",
79 | "C.2",
80 | "C.95",
81 | "C.47",
82 | "C.66",
83 | "C.45",
84 | "C.51",
85 | "C.82",
86 | "C.22",
87 | "C.21",
88 | "C.57",
89 | "C.71",
90 | "C.43",
91 | "C.99",
92 | "C.27",
93 | "C.30",
94 | "C.32",
95 | "C.29",
96 | "C.16",
97 | "C.80",
98 | "C.63",
99 | "C.74",
100 | "C.31",
101 | "C.78"
102 | ]
--------------------------------------------------------------------------------
/tests/data/pbg_training/input/entity/entity_names_G_0.json:
--------------------------------------------------------------------------------
1 | [
2 | "G.34",
3 | "G.35",
4 | "G.19",
5 | "G.9",
6 | "G.11",
7 | "G.8",
8 | "G.37",
9 | "G.22",
10 | "G.48",
11 | "G.29",
12 | "G.18",
13 | "G.26",
14 | "G.23",
15 | "G.20",
16 | "G.2",
17 | "G.28",
18 | "G.13",
19 | "G.46",
20 | "G.25",
21 | "G.4",
22 | "G.52",
23 | "G.3",
24 | "G.17",
25 | "G.30",
26 | "G.36",
27 | "G.51",
28 | "G.7",
29 | "G.24",
30 | "G.53",
31 | "G.12",
32 | "G.39",
33 | "G.15",
34 | "G.16",
35 | "G.6",
36 | "G.5",
37 | "G.40",
38 | "G.38",
39 | "G.33",
40 | "G.0",
41 | "G.31",
42 | "G.27",
43 | "G.32",
44 | "G.45",
45 | "G.14",
46 | "G.47",
47 | "G.21",
48 | "G.44",
49 | "G.50",
50 | "G.43",
51 | "G.10",
52 | "G.1",
53 | "G.42",
54 | "G.41",
55 | "G.49"
56 | ]
--------------------------------------------------------------------------------
/tests/data/pbg_training/model/checkpoint_version.txt:
--------------------------------------------------------------------------------
1 | 10
2 |
--------------------------------------------------------------------------------
/tests/data/pbg_training/model/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "entities": {
3 | "C": {
4 | "num_partitions": 1,
5 | "featurized": false,
6 | "dimension": null
7 | },
8 | "G": {
9 | "num_partitions": 1,
10 | "featurized": false,
11 | "dimension": null
12 | }
13 | },
14 | "relations": [
15 | {
16 | "name": "r0",
17 | "lhs": "C",
18 | "rhs": "G",
19 | "weight": 1.0,
20 | "operator": "none",
21 | "all_negs": false
22 | },
23 | {
24 | "name": "r1",
25 | "lhs": "C",
26 | "rhs": "G",
27 | "weight": 2.0,
28 | "operator": "none",
29 | "all_negs": false
30 | },
31 | {
32 | "name": "r2",
33 | "lhs": "C",
34 | "rhs": "G",
35 | "weight": 3.0,
36 | "operator": "none",
37 | "all_negs": false
38 | },
39 | {
40 | "name": "r3",
41 | "lhs": "C",
42 | "rhs": "G",
43 | "weight": 4.0,
44 | "operator": "none",
45 | "all_negs": false
46 | },
47 | {
48 | "name": "r4",
49 | "lhs": "C",
50 | "rhs": "G",
51 | "weight": 5.0,
52 | "operator": "none",
53 | "all_negs": false
54 | }
55 | ],
56 | "dimension": 50,
57 | "init_scale": 0.001,
58 | "max_norm": null,
59 | "global_emb": false,
60 | "comparator": "dot",
61 | "bias": false,
62 | "loss_fn": "softmax",
63 | "margin": 0.1,
64 | "regularization_coef": 0.0,
65 | "regularizer": "N3",
66 | "wd": 32.962933,
67 | "wd_interval": 50,
68 | "entity_path": "./result_simba/pbg/graph0/input/entity",
69 | "edge_paths": [
70 | "./result_simba/pbg/graph0/input/edge"
71 | ],
72 | "checkpoint_path": "result_simba/pbg/graph0/model",
73 | "init_path": null,
74 | "checkpoint_preservation_interval": null,
75 | "num_epochs": 10,
76 | "num_edge_chunks": null,
77 | "max_edges_per_chunk": 1000000000,
78 | "bucket_order": "inside_out",
79 | "workers": 12,
80 | "batch_size": 1000,
81 | "num_batch_negs": 50,
82 | "num_uniform_negs": 50,
83 | "disable_lhs_negs": false,
84 | "disable_rhs_negs": false,
85 | "lr": 0.1,
86 | "relation_lr": null,
87 | "eval_fraction": 0.05,
88 | "eval_num_batch_negs": 50,
89 | "eval_num_uniform_negs": 50,
90 | "early_stopping": false,
91 | "background_io": false,
92 | "verbose": 0,
93 | "hogwild_delay": 2.0,
94 | "dynamic_relations": false,
95 | "num_machines": 1,
96 | "num_partition_servers": -1,
97 | "distributed_init_method": null,
98 | "distributed_tree_init_order": true,
99 | "num_gpus": 0,
100 | "num_groups_for_partition_server": 16,
101 | "half_precision": false
102 | }
--------------------------------------------------------------------------------
/tests/data/pbg_training/model/embeddings_C_0.v10.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huidongchen/simba/534e0b022ea1163face30263696f28b9a955c291/tests/data/pbg_training/model/embeddings_C_0.v10.h5
--------------------------------------------------------------------------------
/tests/data/pbg_training/model/embeddings_G_0.v10.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huidongchen/simba/534e0b022ea1163face30263696f28b9a955c291/tests/data/pbg_training/model/embeddings_G_0.v10.h5
--------------------------------------------------------------------------------
/tests/data/pbg_training/model/model.v10.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huidongchen/simba/534e0b022ea1163face30263696f28b9a955c291/tests/data/pbg_training/model/model.v10.h5
--------------------------------------------------------------------------------
/tests/data/pbg_training/model/training_stats.json:
--------------------------------------------------------------------------------
1 | {"lhs_partition": 0, "rhs_partition": 0, "index": 1, "stats": {"count": 1022, "metrics": {"loss": 23.252048253546487, "reg": 0.0, "violators_lhs": 36.36497064579256, "violators_rhs": 31.131115459882583}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.4885098259404, "pos_rank": 26.50943396226415, "mrr": 0.08514296270485194, "r1": 0.009433962264150943, "r10": 0.24528301886792453, "r50": 0.9339622641509434, "auc": 0.5094339645133829}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.550433608720887, "pos_rank": 24.358490566037737, "mrr": 0.0786743724437536, "r1": 0.009433962264150943, "r10": 0.18867924528301888, "r50": 0.9433962264150944, "auc": 0.5094339647945368}}, "epoch_idx": 0, "edge_path_idx": 0, "edge_chunk_idx": 0}
2 | {"epoch_idx": 0, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 0, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.550433608720887, "pos_rank": 24.358490566037737, "mrr": 0.0786743724437536, "r1": 0.009433962264150943, "r10": 0.18867924528301888, "r50": 0.9433962264150944, "auc": 0.5094339647945368}}}
3 | {"lhs_partition": 0, "rhs_partition": 0, "index": 2, "stats": {"count": 1022, "metrics": {"loss": 22.486315262527615, "reg": 0.0, "violators_lhs": 30.62720156555773, "violators_rhs": 23.104696673189824}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.591775120429272, "pos_rank": 25.566037735849058, "mrr": 0.07682948650897674, "r1": 0.009433962264150943, "r10": 0.18867924528301888, "r50": 0.9716981132075472, "auc": 0.5660377372548265}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.554165813158143, "pos_rank": 24.27358490566038, "mrr": 0.07591504424388679, "r1": 0.0, "r10": 0.20754716981132076, "r50": 0.9811320754716981, "auc": 0.5471698152569106}}, "epoch_idx": 1, "edge_path_idx": 0, "edge_chunk_idx": 0}
4 | {"epoch_idx": 1, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 1, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.554165813158143, "pos_rank": 24.27358490566038, "mrr": 0.07591504424388679, "r1": 0.0, "r10": 0.20754716981132076, "r50": 0.9811320754716981, "auc": 0.5471698152569106}}}
5 | {"lhs_partition": 0, "rhs_partition": 0, "index": 3, "stats": {"count": 1022, "metrics": {"loss": 22.591744126172447, "reg": 0.0, "violators_lhs": 30.874755381604697, "violators_rhs": 23.437377690802347}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.54162585960244, "pos_rank": 24.30188679245283, "mrr": 0.08181398664161844, "r1": 0.018867924528301886, "r10": 0.2169811320754717, "r50": 0.9622641509433962, "auc": 0.5377358521492976}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.498737011315686, "pos_rank": 24.61320754716981, "mrr": 0.07638183331011601, "r1": 0.009433962264150943, "r10": 0.2169811320754717, "r50": 0.9433962264150944, "auc": 0.5660377380982885}}, "epoch_idx": 2, "edge_path_idx": 0, "edge_chunk_idx": 0}
6 | {"epoch_idx": 2, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 2, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.498737011315686, "pos_rank": 24.61320754716981, "mrr": 0.07638183331011601, "r1": 0.009433962264150943, "r10": 0.2169811320754717, "r50": 0.9433962264150944, "auc": 0.5660377380982885}}}
7 | {"lhs_partition": 0, "rhs_partition": 0, "index": 4, "stats": {"count": 1022, "metrics": {"loss": 22.62260271658403, "reg": 0.0, "violators_lhs": 30.645792563600782, "violators_rhs": 23.364970645792564}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.508187914794345, "pos_rank": 24.88679245283019, "mrr": 0.08207970859377452, "r1": 0.009433962264150943, "r10": 0.20754716981132076, "r50": 0.9245283018867925, "auc": 0.4905660402662349}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.506183201411986, "pos_rank": 24.67924528301887, "mrr": 0.0760200916490746, "r1": 0.0, "r10": 0.24528301886792453, "r50": 0.9339622641509434, "auc": 0.5283018881982228}}, "epoch_idx": 3, "edge_path_idx": 0, "edge_chunk_idx": 0}
8 | {"epoch_idx": 3, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 3, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.506183201411986, "pos_rank": 24.67924528301887, "mrr": 0.0760200916490746, "r1": 0.0, "r10": 0.24528301886792453, "r50": 0.9339622641509434, "auc": 0.5283018881982228}}}
9 | {"lhs_partition": 0, "rhs_partition": 0, "index": 5, "stats": {"count": 1022, "metrics": {"loss": 22.690110387634157, "reg": 0.0, "violators_lhs": 30.770058708414872, "violators_rhs": 23.117416829745597}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.526717563845075, "pos_rank": 24.89622641509434, "mrr": 0.07438522921699398, "r1": 0.0, "r10": 0.2358490566037736, "r50": 0.9622641509433962, "auc": 0.5188679262152258}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.4956772462377, "pos_rank": 24.88679245283019, "mrr": 0.0729775528405916, "r1": 0.0, "r10": 0.19811320754716982, "r50": 0.9622641509433962, "auc": 0.5094339650756908}}, "epoch_idx": 4, "edge_path_idx": 0, "edge_chunk_idx": 0}
10 | {"epoch_idx": 4, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 4, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.4956772462377, "pos_rank": 24.88679245283019, "mrr": 0.0729775528405916, "r1": 0.0, "r10": 0.19811320754716982, "r50": 0.9622641509433962, "auc": 0.5094339650756908}}}
11 | {"lhs_partition": 0, "rhs_partition": 0, "index": 6, "stats": {"count": 1022, "metrics": {"loss": 22.763349835420076, "reg": 0.0, "violators_lhs": 31.012720156555773, "violators_rhs": 22.820939334637966}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.493189523804862, "pos_rank": 24.38679245283019, "mrr": 0.0724140086896577, "r1": 0.0, "r10": 0.19811320754716982, "r50": 0.9528301886792453, "auc": 0.500000000843462}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.495307175618297, "pos_rank": 24.735849056603772, "mrr": 0.08363932611877625, "r1": 0.018867924528301886, "r10": 0.18867924528301888, "r50": 0.9245283018867925, "auc": 0.5283018893228387}}, "epoch_idx": 5, "edge_path_idx": 0, "edge_chunk_idx": 0}
12 | {"epoch_idx": 5, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 5, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.495307175618297, "pos_rank": 24.735849056603772, "mrr": 0.08363932611877625, "r1": 0.018867924528301886, "r10": 0.18867924528301888, "r50": 0.9245283018867925, "auc": 0.5283018893228387}}}
13 | {"lhs_partition": 0, "rhs_partition": 0, "index": 7, "stats": {"count": 1022, "metrics": {"loss": 22.76916164241425, "reg": 0.0, "violators_lhs": 32.49412915851272, "violators_rhs": 26.92367906066536}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.49149281123899, "pos_rank": 24.566037735849058, "mrr": 0.0773335443458186, "r1": 0.0, "r10": 0.22641509433962265, "r50": 0.9433962264150944, "auc": 0.5660377392229045}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.490976243648888, "pos_rank": 23.92452830188679, "mrr": 0.08941427604207453, "r1": 0.009433962264150943, "r10": 0.24528301886792453, "r50": 0.9433962264150944, "auc": 0.5283018876359148}}, "epoch_idx": 6, "edge_path_idx": 0, "edge_chunk_idx": 0}
14 | {"epoch_idx": 6, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 6, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.490976243648888, "pos_rank": 23.92452830188679, "mrr": 0.08941427604207453, "r1": 0.009433962264150943, "r10": 0.24528301886792453, "r50": 0.9433962264150944, "auc": 0.5283018876359148}}}
15 | {"lhs_partition": 0, "rhs_partition": 0, "index": 8, "stats": {"count": 1022, "metrics": {"loss": 22.794376134405862, "reg": 0.0, "violators_lhs": 31.71917808219178, "violators_rhs": 23.874755381604697}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.490235013781852, "pos_rank": 23.90566037735849, "mrr": 0.08131686016425209, "r1": 0.009433962264150943, "r10": 0.2641509433962264, "r50": 0.9528301886792453, "auc": 0.6792452849869458}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.488997333454638, "pos_rank": 24.28301886792453, "mrr": 0.09202509533332766, "r1": 0.009433962264150943, "r10": 0.22641509433962265, "r50": 0.9811320754716981, "auc": 0.50000000140577}}, "epoch_idx": 7, "edge_path_idx": 0, "edge_chunk_idx": 0}
16 | {"epoch_idx": 7, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 7, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.488997333454638, "pos_rank": 24.28301886792453, "mrr": 0.09202509533332766, "r1": 0.009433962264150943, "r10": 0.22641509433962265, "r50": 0.9811320754716981, "auc": 0.50000000140577}}}
17 | {"lhs_partition": 0, "rhs_partition": 0, "index": 9, "stats": {"count": 1022, "metrics": {"loss": 22.79033338020459, "reg": 0.0, "violators_lhs": 31.304305283757337, "violators_rhs": 23.480430528375734}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.48922288642739, "pos_rank": 23.849056603773583, "mrr": 0.09737446559768803, "r1": 0.018867924528301886, "r10": 0.25471698113207547, "r50": 0.9622641509433962, "auc": 0.5943396251718953}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.489219827472038, "pos_rank": 24.80188679245283, "mrr": 0.0892462246822861, "r1": 0.009433962264150943, "r10": 0.19811320754716982, "r50": 0.9433962264150944, "auc": 0.5377358524304516}}, "epoch_idx": 8, "edge_path_idx": 0, "edge_chunk_idx": 0}
18 | {"epoch_idx": 8, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 8, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.489219827472038, "pos_rank": 24.80188679245283, "mrr": 0.0892462246822861, "r1": 0.009433962264150943, "r10": 0.19811320754716982, "r50": 0.9433962264150944, "auc": 0.5377358524304516}}}
19 | {"lhs_partition": 0, "rhs_partition": 0, "index": 10, "stats": {"count": 1022, "metrics": {"loss": 22.792577922694136, "reg": 0.0, "violators_lhs": 31.645792563600782, "violators_rhs": 23.681996086105674}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.489368258782154, "pos_rank": 24.528301886792452, "mrr": 0.08778320558650314, "r1": 0.018867924528301886, "r10": 0.24528301886792453, "r50": 0.9433962264150944, "auc": 0.5660377378171345}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.48918512632262, "pos_rank": 24.132075471698112, "mrr": 0.0999531695127206, "r1": 0.018867924528301886, "r10": 0.24528301886792453, "r50": 0.9433962264150944, "auc": 0.5188679267775338}}, "epoch_idx": 9, "edge_path_idx": 0, "edge_chunk_idx": 0}
20 | {"epoch_idx": 9, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 9, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.48918512632262, "pos_rank": 24.132075471698112, "mrr": 0.0999531695127206, "r1": 0.018867924528301886, "r10": 0.24528301886792453, "r50": 0.9433962264150944, "auc": 0.5188679267775338}}}
21 |
--------------------------------------------------------------------------------
/tests/data/pbg_training/pbg_graph.txt:
--------------------------------------------------------------------------------
1 | C.2 r0 G.0
2 | C.2 r0 G.4
3 | C.2 r0 G.14
4 | C.2 r0 G.22
5 | C.2 r0 G.25
6 | C.2 r0 G.43
7 | C.3 r0 G.1
8 | C.3 r0 G.11
9 | C.3 r0 G.13
10 | C.3 r0 G.17
11 | C.3 r0 G.20
12 | C.3 r0 G.24
13 | C.3 r0 G.25
14 | C.3 r0 G.36
15 | C.3 r0 G.44
16 | C.3 r0 G.45
17 | C.4 r0 G.0
18 | C.4 r0 G.5
19 | C.4 r0 G.9
20 | C.4 r0 G.11
21 | C.4 r0 G.26
22 | C.4 r0 G.35
23 | C.4 r0 G.52
24 | C.8 r0 G.2
25 | C.8 r0 G.7
26 | C.8 r0 G.32
27 | C.8 r0 G.40
28 | C.8 r0 G.52
29 | C.16 r0 G.0
30 | C.16 r0 G.3
31 | C.16 r0 G.5
32 | C.16 r0 G.6
33 | C.16 r0 G.7
34 | C.16 r0 G.9
35 | C.16 r0 G.16
36 | C.16 r0 G.23
37 | C.16 r0 G.27
38 | C.16 r0 G.30
39 | C.16 r0 G.32
40 | C.16 r0 G.43
41 | C.16 r0 G.44
42 | C.16 r0 G.46
43 | C.16 r0 G.49
44 | C.16 r0 G.52
45 | C.22 r0 G.6
46 | C.22 r0 G.14
47 | C.22 r0 G.16
48 | C.22 r0 G.17
49 | C.22 r0 G.20
50 | C.22 r0 G.21
51 | C.22 r0 G.24
52 | C.22 r0 G.33
53 | C.22 r0 G.34
54 | C.27 r0 G.4
55 | C.27 r0 G.8
56 | C.27 r0 G.9
57 | C.27 r0 G.13
58 | C.27 r0 G.20
59 | C.27 r0 G.27
60 | C.27 r0 G.30
61 | C.27 r0 G.36
62 | C.30 r0 G.5
63 | C.30 r0 G.7
64 | C.30 r0 G.16
65 | C.30 r0 G.24
66 | C.30 r0 G.26
67 | C.30 r0 G.28
68 | C.30 r0 G.37
69 | C.30 r0 G.40
70 | C.30 r0 G.43
71 | C.45 r0 G.11
72 | C.45 r0 G.12
73 | C.45 r0 G.27
74 | C.45 r0 G.40
75 | C.47 r0 G.9
76 | C.47 r0 G.21
77 | C.47 r0 G.32
78 | C.47 r0 G.34
79 | C.47 r0 G.35
80 | C.47 r0 G.36
81 | C.47 r0 G.41
82 | C.47 r0 G.46
83 | C.52 r0 G.0
84 | C.52 r0 G.3
85 | C.52 r0 G.7
86 | C.52 r0 G.20
87 | C.52 r0 G.24
88 | C.52 r0 G.25
89 | C.52 r0 G.33
90 | C.52 r0 G.35
91 | C.52 r0 G.43
92 | C.52 r0 G.49
93 | C.68 r0 G.1
94 | C.68 r0 G.4
95 | C.68 r0 G.8
96 | C.68 r0 G.17
97 | C.68 r0 G.20
98 | C.68 r0 G.21
99 | C.68 r0 G.23
100 | C.68 r0 G.24
101 | C.68 r0 G.25
102 | C.68 r0 G.34
103 | C.68 r0 G.51
104 | C.72 r0 G.3
105 | C.72 r0 G.13
106 | C.72 r0 G.25
107 | C.72 r0 G.41
108 | C.74 r0 G.6
109 | C.74 r0 G.7
110 | C.74 r0 G.10
111 | C.74 r0 G.13
112 | C.74 r0 G.16
113 | C.74 r0 G.27
114 | C.74 r0 G.32
115 | C.74 r0 G.35
116 | C.74 r0 G.43
117 | C.74 r0 G.52
118 | C.80 r0 G.6
119 | C.80 r0 G.24
120 | C.90 r0 G.7
121 | C.90 r0 G.9
122 | C.90 r0 G.15
123 | C.90 r0 G.16
124 | C.90 r0 G.17
125 | C.90 r0 G.24
126 | C.90 r0 G.25
127 | C.90 r0 G.33
128 | C.90 r0 G.34
129 | C.90 r0 G.35
130 | C.90 r0 G.39
131 | C.91 r0 G.5
132 | C.91 r0 G.9
133 | C.91 r0 G.14
134 | C.91 r0 G.23
135 | C.91 r0 G.37
136 | C.91 r0 G.42
137 | C.95 r0 G.21
138 | C.95 r0 G.29
139 | C.95 r0 G.37
140 | C.98 r0 G.4
141 | C.98 r0 G.7
142 | C.98 r0 G.13
143 | C.98 r0 G.15
144 | C.98 r0 G.23
145 | C.98 r0 G.24
146 | C.98 r0 G.32
147 | C.98 r0 G.35
148 | C.98 r0 G.36
149 | C.98 r0 G.43
150 | C.98 r0 G.46
151 | C.98 r0 G.49
152 | C.98 r0 G.51
153 | C.98 r0 G.52
154 | C.0 r1 G.1
155 | C.0 r1 G.16
156 | C.0 r1 G.38
157 | C.2 r1 G.33
158 | C.2 r1 G.34
159 | C.2 r1 G.41
160 | C.3 r1 G.16
161 | C.3 r1 G.52
162 | C.4 r1 G.7
163 | C.4 r1 G.10
164 | C.4 r1 G.31
165 | C.4 r1 G.32
166 | C.4 r1 G.36
167 | C.4 r1 G.46
168 | C.4 r1 G.47
169 | C.5 r1 G.11
170 | C.5 r1 G.12
171 | C.5 r1 G.19
172 | C.5 r1 G.20
173 | C.5 r1 G.24
174 | C.5 r1 G.34
175 | C.5 r1 G.43
176 | C.6 r1 G.7
177 | C.6 r1 G.10
178 | C.6 r1 G.11
179 | C.6 r1 G.13
180 | C.6 r1 G.14
181 | C.6 r1 G.16
182 | C.6 r1 G.33
183 | C.6 r1 G.46
184 | C.6 r1 G.48
185 | C.7 r1 G.2
186 | C.7 r1 G.10
187 | C.7 r1 G.25
188 | C.7 r1 G.28
189 | C.7 r1 G.33
190 | C.7 r1 G.43
191 | C.7 r1 G.45
192 | C.8 r1 G.34
193 | C.8 r1 G.51
194 | C.14 r1 G.0
195 | C.14 r1 G.7
196 | C.14 r1 G.13
197 | C.14 r1 G.16
198 | C.14 r1 G.33
199 | C.14 r1 G.34
200 | C.14 r1 G.47
201 | C.14 r1 G.53
202 | C.16 r1 G.14
203 | C.16 r1 G.33
204 | C.17 r1 G.4
205 | C.17 r1 G.8
206 | C.17 r1 G.28
207 | C.17 r1 G.34
208 | C.17 r1 G.39
209 | C.17 r1 G.46
210 | C.17 r1 G.53
211 | C.18 r1 G.6
212 | C.18 r1 G.31
213 | C.18 r1 G.43
214 | C.18 r1 G.46
215 | C.18 r1 G.49
216 | C.19 r1 G.1
217 | C.19 r1 G.7
218 | C.19 r1 G.9
219 | C.19 r1 G.34
220 | C.19 r1 G.43
221 | C.19 r1 G.46
222 | C.20 r1 G.17
223 | C.20 r1 G.24
224 | C.20 r1 G.33
225 | C.21 r1 G.0
226 | C.21 r1 G.2
227 | C.21 r1 G.20
228 | C.21 r1 G.30
229 | C.21 r1 G.34
230 | C.21 r1 G.42
231 | C.21 r1 G.51
232 | C.22 r1 G.1
233 | C.22 r1 G.4
234 | C.22 r1 G.7
235 | C.22 r1 G.23
236 | C.22 r1 G.36
237 | C.22 r1 G.46
238 | C.23 r1 G.4
239 | C.23 r1 G.7
240 | C.23 r1 G.9
241 | C.23 r1 G.11
242 | C.23 r1 G.16
243 | C.23 r1 G.41
244 | C.23 r1 G.52
245 | C.26 r1 G.2
246 | C.26 r1 G.14
247 | C.26 r1 G.32
248 | C.26 r1 G.35
249 | C.27 r1 G.24
250 | C.27 r1 G.25
251 | C.27 r1 G.35
252 | C.27 r1 G.47
253 | C.27 r1 G.52
254 | C.29 r1 G.9
255 | C.29 r1 G.11
256 | C.29 r1 G.14
257 | C.29 r1 G.19
258 | C.29 r1 G.20
259 | C.29 r1 G.24
260 | C.29 r1 G.34
261 | C.29 r1 G.35
262 | C.29 r1 G.42
263 | C.29 r1 G.47
264 | C.29 r1 G.52
265 | C.30 r1 G.23
266 | C.30 r1 G.35
267 | C.30 r1 G.47
268 | C.32 r1 G.4
269 | C.32 r1 G.7
270 | C.32 r1 G.11
271 | C.32 r1 G.17
272 | C.32 r1 G.33
273 | C.32 r1 G.46
274 | C.33 r1 G.0
275 | C.33 r1 G.2
276 | C.33 r1 G.8
277 | C.33 r1 G.16
278 | C.33 r1 G.20
279 | C.33 r1 G.24
280 | C.33 r1 G.32
281 | C.33 r1 G.34
282 | C.33 r1 G.43
283 | C.33 r1 G.47
284 | C.33 r1 G.52
285 | C.33 r1 G.53
286 | C.36 r1 G.16
287 | C.36 r1 G.31
288 | C.36 r1 G.51
289 | C.37 r1 G.7
290 | C.37 r1 G.13
291 | C.37 r1 G.17
292 | C.37 r1 G.18
293 | C.37 r1 G.34
294 | C.37 r1 G.35
295 | C.37 r1 G.37
296 | C.37 r1 G.49
297 | C.37 r1 G.50
298 | C.38 r1 G.13
299 | C.38 r1 G.43
300 | C.39 r1 G.2
301 | C.39 r1 G.6
302 | C.39 r1 G.22
303 | C.39 r1 G.39
304 | C.39 r1 G.50
305 | C.40 r1 G.14
306 | C.40 r1 G.15
307 | C.40 r1 G.17
308 | C.40 r1 G.34
309 | C.40 r1 G.39
310 | C.41 r1 G.7
311 | C.41 r1 G.13
312 | C.41 r1 G.31
313 | C.41 r1 G.34
314 | C.41 r1 G.42
315 | C.41 r1 G.46
316 | C.42 r1 G.3
317 | C.42 r1 G.4
318 | C.42 r1 G.9
319 | C.42 r1 G.11
320 | C.42 r1 G.14
321 | C.42 r1 G.20
322 | C.42 r1 G.24
323 | C.42 r1 G.31
324 | C.42 r1 G.46
325 | C.45 r1 G.8
326 | C.45 r1 G.17
327 | C.45 r1 G.31
328 | C.45 r1 G.35
329 | C.45 r1 G.46
330 | C.45 r1 G.47
331 | C.45 r1 G.52
332 | C.47 r1 G.2
333 | C.47 r1 G.52
334 | C.49 r1 G.0
335 | C.49 r1 G.7
336 | C.49 r1 G.27
337 | C.49 r1 G.34
338 | C.49 r1 G.35
339 | C.49 r1 G.36
340 | C.49 r1 G.47
341 | C.50 r1 G.4
342 | C.50 r1 G.8
343 | C.50 r1 G.11
344 | C.50 r1 G.14
345 | C.50 r1 G.16
346 | C.50 r1 G.20
347 | C.50 r1 G.31
348 | C.50 r1 G.32
349 | C.50 r1 G.43
350 | C.51 r1 G.15
351 | C.51 r1 G.23
352 | C.51 r1 G.30
353 | C.51 r1 G.31
354 | C.51 r1 G.46
355 | C.52 r1 G.17
356 | C.52 r1 G.30
357 | C.53 r1 G.30
358 | C.53 r1 G.33
359 | C.53 r1 G.43
360 | C.54 r1 G.34
361 | C.54 r1 G.36
362 | C.55 r1 G.1
363 | C.55 r1 G.3
364 | C.55 r1 G.16
365 | C.55 r1 G.18
366 | C.55 r1 G.34
367 | C.55 r1 G.43
368 | C.55 r1 G.46
369 | C.55 r1 G.52
370 | C.56 r1 G.0
371 | C.56 r1 G.1
372 | C.56 r1 G.6
373 | C.56 r1 G.24
374 | C.56 r1 G.26
375 | C.56 r1 G.46
376 | C.58 r1 G.27
377 | C.58 r1 G.31
378 | C.58 r1 G.48
379 | C.58 r1 G.50
380 | C.60 r1 G.13
381 | C.60 r1 G.16
382 | C.60 r1 G.31
383 | C.60 r1 G.37
384 | C.60 r1 G.43
385 | C.60 r1 G.52
386 | C.62 r1 G.8
387 | C.62 r1 G.9
388 | C.62 r1 G.14
389 | C.62 r1 G.20
390 | C.62 r1 G.26
391 | C.62 r1 G.34
392 | C.62 r1 G.46
393 | C.62 r1 G.52
394 | C.63 r1 G.3
395 | C.63 r1 G.35
396 | C.65 r1 G.0
397 | C.65 r1 G.7
398 | C.65 r1 G.24
399 | C.65 r1 G.25
400 | C.65 r1 G.46
401 | C.67 r1 G.0
402 | C.67 r1 G.2
403 | C.67 r1 G.7
404 | C.67 r1 G.8
405 | C.67 r1 G.14
406 | C.67 r1 G.46
407 | C.67 r1 G.52
408 | C.68 r1 G.31
409 | C.69 r1 G.4
410 | C.69 r1 G.22
411 | C.69 r1 G.46
412 | C.69 r1 G.49
413 | C.70 r1 G.1
414 | C.70 r1 G.7
415 | C.70 r1 G.8
416 | C.70 r1 G.9
417 | C.70 r1 G.17
418 | C.70 r1 G.31
419 | C.70 r1 G.32
420 | C.70 r1 G.35
421 | C.70 r1 G.37
422 | C.70 r1 G.43
423 | C.71 r1 G.2
424 | C.71 r1 G.26
425 | C.71 r1 G.32
426 | C.71 r1 G.35
427 | C.71 r1 G.43
428 | C.71 r1 G.46
429 | C.72 r1 G.16
430 | C.72 r1 G.43
431 | C.72 r1 G.52
432 | C.73 r1 G.8
433 | C.73 r1 G.33
434 | C.73 r1 G.47
435 | C.75 r1 G.4
436 | C.75 r1 G.7
437 | C.75 r1 G.17
438 | C.75 r1 G.33
439 | C.75 r1 G.35
440 | C.75 r1 G.37
441 | C.75 r1 G.47
442 | C.77 r1 G.7
443 | C.77 r1 G.15
444 | C.77 r1 G.16
445 | C.77 r1 G.34
446 | C.77 r1 G.36
447 | C.77 r1 G.49
448 | C.77 r1 G.52
449 | C.79 r1 G.1
450 | C.79 r1 G.4
451 | C.79 r1 G.11
452 | C.79 r1 G.13
453 | C.79 r1 G.16
454 | C.79 r1 G.17
455 | C.79 r1 G.21
456 | C.79 r1 G.31
457 | C.79 r1 G.37
458 | C.79 r1 G.44
459 | C.79 r1 G.46
460 | C.80 r1 G.22
461 | C.80 r1 G.43
462 | C.80 r1 G.46
463 | C.81 r1 G.3
464 | C.81 r1 G.4
465 | C.81 r1 G.17
466 | C.81 r1 G.30
467 | C.81 r1 G.33
468 | C.81 r1 G.35
469 | C.82 r1 G.11
470 | C.82 r1 G.13
471 | C.82 r1 G.25
472 | C.82 r1 G.31
473 | C.82 r1 G.34
474 | C.82 r1 G.42
475 | C.83 r1 G.1
476 | C.83 r1 G.6
477 | C.83 r1 G.36
478 | C.83 r1 G.42
479 | C.84 r1 G.25
480 | C.84 r1 G.30
481 | C.84 r1 G.41
482 | C.84 r1 G.42
483 | C.84 r1 G.43
484 | C.88 r1 G.4
485 | C.88 r1 G.7
486 | C.88 r1 G.11
487 | C.88 r1 G.13
488 | C.88 r1 G.22
489 | C.88 r1 G.32
490 | C.88 r1 G.34
491 | C.88 r1 G.35
492 | C.88 r1 G.37
493 | C.88 r1 G.42
494 | C.89 r1 G.10
495 | C.89 r1 G.19
496 | C.89 r1 G.20
497 | C.89 r1 G.21
498 | C.89 r1 G.26
499 | C.89 r1 G.35
500 | C.89 r1 G.38
501 | C.90 r1 G.3
502 | C.91 r1 G.2
503 | C.91 r1 G.7
504 | C.91 r1 G.11
505 | C.91 r1 G.34
506 | C.91 r1 G.35
507 | C.91 r1 G.36
508 | C.91 r1 G.47
509 | C.91 r1 G.51
510 | C.93 r1 G.7
511 | C.93 r1 G.25
512 | C.93 r1 G.46
513 | C.93 r1 G.49
514 | C.94 r1 G.14
515 | C.94 r1 G.25
516 | C.94 r1 G.34
517 | C.94 r1 G.47
518 | C.94 r1 G.52
519 | C.95 r1 G.32
520 | C.95 r1 G.40
521 | C.95 r1 G.43
522 | C.95 r1 G.46
523 | C.0 r2 G.2
524 | C.0 r2 G.25
525 | C.0 r2 G.31
526 | C.0 r2 G.46
527 | C.0 r2 G.47
528 | C.1 r2 G.8
529 | C.1 r2 G.19
530 | C.1 r2 G.33
531 | C.1 r2 G.34
532 | C.1 r2 G.35
533 | C.1 r2 G.36
534 | C.1 r2 G.47
535 | C.2 r2 G.2
536 | C.3 r2 G.19
537 | C.3 r2 G.31
538 | C.3 r2 G.34
539 | C.3 r2 G.35
540 | C.4 r2 G.19
541 | C.4 r2 G.34
542 | C.5 r2 G.16
543 | C.5 r2 G.45
544 | C.5 r2 G.52
545 | C.6 r2 G.25
546 | C.7 r2 G.31
547 | C.8 r2 G.20
548 | C.8 r2 G.24
549 | C.8 r2 G.31
550 | C.8 r2 G.33
551 | C.8 r2 G.35
552 | C.8 r2 G.47
553 | C.9 r2 G.6
554 | C.9 r2 G.19
555 | C.9 r2 G.24
556 | C.9 r2 G.25
557 | C.9 r2 G.34
558 | C.11 r2 G.3
559 | C.11 r2 G.19
560 | C.11 r2 G.31
561 | C.11 r2 G.36
562 | C.11 r2 G.40
563 | C.11 r2 G.46
564 | C.12 r2 G.1
565 | C.12 r2 G.7
566 | C.12 r2 G.15
567 | C.12 r2 G.31
568 | C.13 r2 G.24
569 | C.13 r2 G.25
570 | C.13 r2 G.37
571 | C.14 r2 G.11
572 | C.14 r2 G.14
573 | C.15 r2 G.47
574 | C.17 r2 G.5
575 | C.17 r2 G.7
576 | C.17 r2 G.11
577 | C.17 r2 G.35
578 | C.17 r2 G.52
579 | C.19 r2 G.17
580 | C.19 r2 G.31
581 | C.19 r2 G.47
582 | C.20 r2 G.46
583 | C.21 r2 G.19
584 | C.21 r2 G.35
585 | C.21 r2 G.47
586 | C.22 r2 G.25
587 | C.22 r2 G.31
588 | C.22 r2 G.47
589 | C.23 r2 G.17
590 | C.23 r2 G.31
591 | C.23 r2 G.43
592 | C.23 r2 G.46
593 | C.24 r2 G.31
594 | C.24 r2 G.43
595 | C.24 r2 G.47
596 | C.26 r2 G.28
597 | C.27 r2 G.7
598 | C.27 r2 G.14
599 | C.27 r2 G.31
600 | C.27 r2 G.46
601 | C.28 r2 G.2
602 | C.28 r2 G.7
603 | C.28 r2 G.17
604 | C.30 r2 G.2
605 | C.30 r2 G.11
606 | C.30 r2 G.14
607 | C.30 r2 G.17
608 | C.30 r2 G.29
609 | C.30 r2 G.34
610 | C.30 r2 G.46
611 | C.30 r2 G.52
612 | C.31 r2 G.7
613 | C.31 r2 G.16
614 | C.32 r2 G.16
615 | C.32 r2 G.43
616 | C.33 r2 G.19
617 | C.33 r2 G.35
618 | C.34 r2 G.23
619 | C.34 r2 G.37
620 | C.34 r2 G.42
621 | C.34 r2 G.47
622 | C.35 r2 G.14
623 | C.35 r2 G.19
624 | C.36 r2 G.24
625 | C.36 r2 G.33
626 | C.36 r2 G.46
627 | C.37 r2 G.33
628 | C.38 r2 G.25
629 | C.39 r2 G.7
630 | C.39 r2 G.43
631 | C.40 r2 G.2
632 | C.40 r2 G.31
633 | C.41 r2 G.9
634 | C.43 r2 G.9
635 | C.43 r2 G.22
636 | C.43 r2 G.26
637 | C.43 r2 G.31
638 | C.43 r2 G.39
639 | C.44 r2 G.3
640 | C.44 r2 G.17
641 | C.44 r2 G.24
642 | C.44 r2 G.33
643 | C.44 r2 G.46
644 | C.45 r2 G.19
645 | C.45 r2 G.24
646 | C.45 r2 G.34
647 | C.46 r2 G.0
648 | C.48 r2 G.0
649 | C.48 r2 G.22
650 | C.48 r2 G.33
651 | C.48 r2 G.34
652 | C.48 r2 G.35
653 | C.48 r2 G.52
654 | C.49 r2 G.31
655 | C.49 r2 G.46
656 | C.50 r2 G.2
657 | C.50 r2 G.33
658 | C.50 r2 G.46
659 | C.50 r2 G.52
660 | C.51 r2 G.4
661 | C.51 r2 G.11
662 | C.51 r2 G.25
663 | C.51 r2 G.33
664 | C.51 r2 G.34
665 | C.52 r2 G.31
666 | C.52 r2 G.46
667 | C.53 r2 G.7
668 | C.53 r2 G.34
669 | C.53 r2 G.47
670 | C.54 r2 G.42
671 | C.55 r2 G.7
672 | C.55 r2 G.25
673 | C.56 r2 G.13
674 | C.56 r2 G.25
675 | C.56 r2 G.31
676 | C.56 r2 G.33
677 | C.56 r2 G.47
678 | C.57 r2 G.13
679 | C.57 r2 G.18
680 | C.57 r2 G.21
681 | C.57 r2 G.24
682 | C.57 r2 G.46
683 | C.58 r2 G.46
684 | C.58 r2 G.49
685 | C.59 r2 G.11
686 | C.59 r2 G.23
687 | C.59 r2 G.34
688 | C.59 r2 G.40
689 | C.59 r2 G.43
690 | C.59 r2 G.47
691 | C.60 r2 G.25
692 | C.60 r2 G.34
693 | C.60 r2 G.47
694 | C.60 r2 G.49
695 | C.61 r2 G.11
696 | C.61 r2 G.34
697 | C.61 r2 G.39
698 | C.61 r2 G.46
699 | C.62 r2 G.23
700 | C.62 r2 G.33
701 | C.62 r2 G.47
702 | C.63 r2 G.9
703 | C.64 r2 G.1
704 | C.64 r2 G.11
705 | C.64 r2 G.18
706 | C.64 r2 G.19
707 | C.64 r2 G.21
708 | C.64 r2 G.48
709 | C.64 r2 G.52
710 | C.65 r2 G.11
711 | C.65 r2 G.19
712 | C.65 r2 G.28
713 | C.65 r2 G.35
714 | C.65 r2 G.47
715 | C.66 r2 G.1
716 | C.66 r2 G.14
717 | C.66 r2 G.19
718 | C.66 r2 G.27
719 | C.67 r2 G.35
720 | C.67 r2 G.47
721 | C.68 r2 G.2
722 | C.68 r2 G.35
723 | C.68 r2 G.52
724 | C.70 r2 G.33
725 | C.70 r2 G.34
726 | C.70 r2 G.46
727 | C.71 r2 G.11
728 | C.71 r2 G.31
729 | C.71 r2 G.33
730 | C.71 r2 G.34
731 | C.72 r2 G.34
732 | C.72 r2 G.46
733 | C.73 r2 G.31
734 | C.74 r2 G.31
735 | C.74 r2 G.46
736 | C.75 r2 G.31
737 | C.76 r2 G.1
738 | C.76 r2 G.7
739 | C.76 r2 G.27
740 | C.76 r2 G.32
741 | C.76 r2 G.33
742 | C.76 r2 G.37
743 | C.76 r2 G.44
744 | C.76 r2 G.47
745 | C.77 r2 G.3
746 | C.77 r2 G.31
747 | C.77 r2 G.46
748 | C.77 r2 G.47
749 | C.78 r2 G.25
750 | C.78 r2 G.41
751 | C.78 r2 G.52
752 | C.80 r2 G.31
753 | C.80 r2 G.47
754 | C.81 r2 G.25
755 | C.81 r2 G.46
756 | C.81 r2 G.47
757 | C.82 r2 G.7
758 | C.82 r2 G.33
759 | C.82 r2 G.47
760 | C.83 r2 G.47
761 | C.83 r2 G.48
762 | C.85 r2 G.9
763 | C.85 r2 G.25
764 | C.85 r2 G.31
765 | C.85 r2 G.33
766 | C.85 r2 G.35
767 | C.85 r2 G.43
768 | C.86 r2 G.15
769 | C.86 r2 G.16
770 | C.86 r2 G.34
771 | C.86 r2 G.38
772 | C.86 r2 G.47
773 | C.87 r2 G.7
774 | C.87 r2 G.8
775 | C.87 r2 G.29
776 | C.87 r2 G.30
777 | C.87 r2 G.31
778 | C.87 r2 G.32
779 | C.87 r2 G.35
780 | C.87 r2 G.52
781 | C.88 r2 G.2
782 | C.88 r2 G.14
783 | C.88 r2 G.17
784 | C.88 r2 G.19
785 | C.88 r2 G.33
786 | C.88 r2 G.46
787 | C.88 r2 G.47
788 | C.89 r2 G.46
789 | C.90 r2 G.1
790 | C.90 r2 G.31
791 | C.90 r2 G.46
792 | C.91 r2 G.4
793 | C.91 r2 G.19
794 | C.91 r2 G.29
795 | C.91 r2 G.31
796 | C.91 r2 G.33
797 | C.92 r2 G.31
798 | C.92 r2 G.33
799 | C.93 r2 G.31
800 | C.93 r2 G.44
801 | C.94 r2 G.2
802 | C.94 r2 G.17
803 | C.94 r2 G.31
804 | C.94 r2 G.35
805 | C.95 r2 G.2
806 | C.95 r2 G.33
807 | C.95 r2 G.47
808 | C.95 r2 G.52
809 | C.96 r2 G.12
810 | C.96 r2 G.19
811 | C.97 r2 G.4
812 | C.97 r2 G.20
813 | C.97 r2 G.24
814 | C.97 r2 G.34
815 | C.97 r2 G.35
816 | C.97 r2 G.36
817 | C.98 r2 G.31
818 | C.98 r2 G.33
819 | C.99 r2 G.4
820 | C.99 r2 G.11
821 | C.99 r2 G.21
822 | C.99 r2 G.31
823 | C.99 r2 G.46
824 | C.1 r3 G.2
825 | C.1 r3 G.7
826 | C.1 r3 G.31
827 | C.2 r3 G.31
828 | C.2 r3 G.47
829 | C.3 r3 G.47
830 | C.4 r3 G.2
831 | C.4 r3 G.33
832 | C.5 r3 G.46
833 | C.5 r3 G.47
834 | C.6 r3 G.31
835 | C.7 r3 G.46
836 | C.7 r3 G.47
837 | C.8 r3 G.46
838 | C.9 r3 G.31
839 | C.9 r3 G.46
840 | C.10 r3 G.17
841 | C.11 r3 G.2
842 | C.11 r3 G.24
843 | C.12 r3 G.47
844 | C.13 r3 G.46
845 | C.14 r3 G.1
846 | C.14 r3 G.19
847 | C.14 r3 G.31
848 | C.14 r3 G.46
849 | C.15 r3 G.6
850 | C.15 r3 G.13
851 | C.15 r3 G.31
852 | C.15 r3 G.35
853 | C.16 r3 G.31
854 | C.17 r3 G.2
855 | C.17 r3 G.33
856 | C.18 r3 G.1
857 | C.18 r3 G.47
858 | C.20 r3 G.47
859 | C.21 r3 G.33
860 | C.23 r3 G.19
861 | C.23 r3 G.47
862 | C.24 r3 G.1
863 | C.26 r3 G.4
864 | C.26 r3 G.19
865 | C.26 r3 G.33
866 | C.26 r3 G.46
867 | C.27 r3 G.2
868 | C.27 r3 G.19
869 | C.27 r3 G.33
870 | C.28 r3 G.15
871 | C.28 r3 G.19
872 | C.28 r3 G.33
873 | C.28 r3 G.34
874 | C.28 r3 G.35
875 | C.28 r3 G.46
876 | C.29 r3 G.2
877 | C.29 r3 G.4
878 | C.29 r3 G.33
879 | C.29 r3 G.46
880 | C.31 r3 G.11
881 | C.31 r3 G.19
882 | C.32 r3 G.25
883 | C.32 r3 G.31
884 | C.32 r3 G.47
885 | C.33 r3 G.31
886 | C.33 r3 G.33
887 | C.34 r3 G.11
888 | C.35 r3 G.35
889 | C.36 r3 G.47
890 | C.37 r3 G.31
891 | C.37 r3 G.47
892 | C.39 r3 G.31
893 | C.39 r3 G.47
894 | C.41 r3 G.47
895 | C.43 r3 G.17
896 | C.43 r3 G.46
897 | C.43 r3 G.52
898 | C.45 r3 G.2
899 | C.46 r3 G.31
900 | C.46 r3 G.35
901 | C.47 r3 G.31
902 | C.47 r3 G.33
903 | C.47 r3 G.47
904 | C.48 r3 G.47
905 | C.49 r3 G.2
906 | C.49 r3 G.17
907 | C.49 r3 G.19
908 | C.50 r3 G.19
909 | C.50 r3 G.47
910 | C.51 r3 G.47
911 | C.52 r3 G.47
912 | C.53 r3 G.31
913 | C.53 r3 G.46
914 | C.55 r3 G.2
915 | C.55 r3 G.19
916 | C.55 r3 G.47
917 | C.57 r3 G.17
918 | C.57 r3 G.47
919 | C.58 r3 G.47
920 | C.59 r3 G.19
921 | C.59 r3 G.31
922 | C.59 r3 G.33
923 | C.59 r3 G.46
924 | C.62 r3 G.19
925 | C.62 r3 G.31
926 | C.62 r3 G.35
927 | C.63 r3 G.47
928 | C.64 r3 G.47
929 | C.65 r3 G.33
930 | C.65 r3 G.34
931 | C.65 r3 G.52
932 | C.66 r3 G.45
933 | C.67 r3 G.1
934 | C.67 r3 G.19
935 | C.67 r3 G.33
936 | C.68 r3 G.19
937 | C.68 r3 G.33
938 | C.68 r3 G.47
939 | C.69 r3 G.31
940 | C.70 r3 G.2
941 | C.70 r3 G.19
942 | C.70 r3 G.47
943 | C.71 r3 G.47
944 | C.72 r3 G.19
945 | C.72 r3 G.31
946 | C.72 r3 G.47
947 | C.73 r3 G.2
948 | C.73 r3 G.11
949 | C.73 r3 G.19
950 | C.73 r3 G.46
951 | C.75 r3 G.25
952 | C.76 r3 G.17
953 | C.76 r3 G.31
954 | C.77 r3 G.2
955 | C.78 r3 G.47
956 | C.79 r3 G.33
957 | C.79 r3 G.47
958 | C.83 r3 G.19
959 | C.83 r3 G.43
960 | C.83 r3 G.46
961 | C.84 r3 G.31
962 | C.84 r3 G.47
963 | C.85 r3 G.47
964 | C.86 r3 G.31
965 | C.86 r3 G.35
966 | C.87 r3 G.2
967 | C.87 r3 G.19
968 | C.87 r3 G.33
969 | C.89 r3 G.7
970 | C.89 r3 G.17
971 | C.89 r3 G.33
972 | C.90 r3 G.47
973 | C.91 r3 G.46
974 | C.92 r3 G.14
975 | C.92 r3 G.17
976 | C.92 r3 G.19
977 | C.94 r3 G.19
978 | C.94 r3 G.33
979 | C.94 r3 G.46
980 | C.95 r3 G.19
981 | C.95 r3 G.31
982 | C.95 r3 G.35
983 | C.96 r3 G.34
984 | C.96 r3 G.35
985 | C.96 r3 G.47
986 | C.97 r3 G.2
987 | C.97 r3 G.27
988 | C.97 r3 G.46
989 | C.98 r3 G.47
990 | C.0 r4 G.19
991 | C.2 r4 G.19
992 | C.3 r4 G.46
993 | C.6 r4 G.19
994 | C.7 r4 G.19
995 | C.8 r4 G.19
996 | C.9 r4 G.47
997 | C.10 r4 G.19
998 | C.10 r4 G.47
999 | C.12 r4 G.19
1000 | C.13 r4 G.19
1001 | C.15 r4 G.19
1002 | C.16 r4 G.19
1003 | C.16 r4 G.47
1004 | C.18 r4 G.19
1005 | C.19 r4 G.19
1006 | C.20 r4 G.19
1007 | C.21 r4 G.46
1008 | C.22 r4 G.19
1009 | C.24 r4 G.19
1010 | C.25 r4 G.19
1011 | C.25 r4 G.46
1012 | C.25 r4 G.53
1013 | C.30 r4 G.33
1014 | C.31 r4 G.47
1015 | C.32 r4 G.19
1016 | C.33 r4 G.46
1017 | C.34 r4 G.19
1018 | C.35 r4 G.2
1019 | C.35 r4 G.33
1020 | C.36 r4 G.19
1021 | C.37 r4 G.19
1022 | C.38 r4 G.19
1023 | C.38 r4 G.47
1024 | C.39 r4 G.19
1025 | C.40 r4 G.19
1026 | C.40 r4 G.47
1027 | C.41 r4 G.19
1028 | C.42 r4 G.19
1029 | C.42 r4 G.47
1030 | C.44 r4 G.19
1031 | C.44 r4 G.47
1032 | C.45 r4 G.33
1033 | C.46 r4 G.19
1034 | C.47 r4 G.19
1035 | C.48 r4 G.19
1036 | C.51 r4 G.19
1037 | C.52 r4 G.19
1038 | C.53 r4 G.19
1039 | C.54 r4 G.19
1040 | C.56 r4 G.19
1041 | C.57 r4 G.19
1042 | C.58 r4 G.19
1043 | C.60 r4 G.19
1044 | C.61 r4 G.19
1045 | C.61 r4 G.25
1046 | C.63 r4 G.19
1047 | C.63 r4 G.31
1048 | C.66 r4 G.47
1049 | C.69 r4 G.19
1050 | C.69 r4 G.47
1051 | C.71 r4 G.19
1052 | C.74 r4 G.19
1053 | C.74 r4 G.47
1054 | C.75 r4 G.19
1055 | C.75 r4 G.46
1056 | C.77 r4 G.33
1057 | C.78 r4 G.19
1058 | C.79 r4 G.19
1059 | C.80 r4 G.19
1060 | C.81 r4 G.19
1061 | C.82 r4 G.19
1062 | C.82 r4 G.46
1063 | C.83 r4 G.31
1064 | C.84 r4 G.19
1065 | C.85 r4 G.19
1066 | C.86 r4 G.19
1067 | C.89 r4 G.2
1068 | C.90 r4 G.19
1069 | C.92 r4 G.46
1070 | C.93 r4 G.19
1071 | C.93 r4 G.47
1072 | C.96 r4 G.33
1073 | C.98 r4 G.19
1074 | C.99 r4 G.2
1075 | C.99 r4 G.19
1076 |
--------------------------------------------------------------------------------
/tests/data/preprocessed/atac_preprocessed.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huidongchen/simba/534e0b022ea1163face30263696f28b9a955c291/tests/data/preprocessed/atac_preprocessed.h5ad
--------------------------------------------------------------------------------
/tests/data/preprocessed/rna_preprocessed.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huidongchen/simba/534e0b022ea1163face30263696f28b9a955c291/tests/data/preprocessed/rna_preprocessed.h5ad
--------------------------------------------------------------------------------
/tests/test_pbg_training.py:
--------------------------------------------------------------------------------
1 | import simba as si
2 | import pytest
3 |
4 |
@pytest.fixture
def adata_CG():
    """Preprocessed scRNA-seq AnnData (cells x genes) test fixture."""
    path = "tests/data/preprocessed/rna_preprocessed.h5ad"
    return si.read_h5ad(path)
9 |
10 |
@pytest.fixture
def adata_CP():
    """Preprocessed scATAC-seq AnnData (cells x peaks) test fixture."""
    path = "tests/data/preprocessed/atac_preprocessed.h5ad"
    return si.read_h5ad(path)
15 |
16 |
def test_gen_graph(adata_CG, adata_CP, tmp_path):
    """Exercise graph generation through both the per-modality keyword
    (``list_CG``) and the generic ``list_adata`` keyword, each with and
    without edge weights, plus one multimodal (RNA + ATAC) graph.
    """
    si.settings.set_workdir(tmp_path / "simba_rna")
    # modality-specific keyword, unweighted then weighted
    si.tl.gen_graph(list_CG=[adata_CG], copy=False, dirname='graph0')
    si.tl.gen_graph(
        list_CG=[adata_CG], copy=False, add_edge_weights=True,
        dirname='graph1')
    # generic keyword, unweighted then weighted
    si.tl.gen_graph(list_adata=[adata_CG], copy=False, dirname='graph2')
    si.tl.gen_graph(
        list_adata=[adata_CG], copy=False, add_edge_weights=True,
        dirname='graph3')
    # two modalities combined into one weighted graph
    si.tl.gen_graph(
        list_adata=[adata_CG, adata_CP], copy=False,
        add_edge_weights=True, dirname='graph4')
37 |
38 |
def test_pbg_training_rna(adata_CG, tmp_path):
    """End-to-end PBG training on an RNA graph: unweighted and
    edge-weighted runs, then reload stats/config and plot metrics.
    """
    si.settings.set_workdir(tmp_path / "simba_rna")
    si.tl.gen_graph(list_CG=[adata_CG], copy=False, dirname='graph0')
    # round-trip the current PBG parameters through the setter
    dict_config = si.settings.pbg_params.copy()
    si.settings.set_pbg_params(dict_config)
    si.tl.pbg_train(auto_wd=True, output='model0')
    si.tl.pbg_train(auto_wd=True, use_edge_weights=True, output='model1')
    si.load_graph_stats()
    si.load_pbg_config()
    si.pl.pbg_metrics(fig_ncol=1, save_fig=True)
55 |
56 |
def test_pbg_training_atac(adata_CP, tmp_path):
    """Smoke test: PBG training on an ATAC (cell x peak) graph."""
    si.settings.set_workdir(tmp_path / "simba_atac")
    si.tl.gen_graph(list_CP=[adata_CP], copy=False, dirname='graph0')
    si.tl.pbg_train(auto_wd=True, output='model')
    si.pl.pbg_metrics(fig_ncol=1, save_fig=True)
66 |
--------------------------------------------------------------------------------
/tests/test_post_training.py:
--------------------------------------------------------------------------------
1 | import simba as si
2 | import pytest
3 |
4 |
@pytest.fixture
def dict_adata():
    """Entity embeddings loaded from a pre-trained PBG checkpoint."""
    base = 'tests/data/pbg_training'
    return si.read_embedding(
        path_emb=base + '/model/',
        path_entity=base + '/input/entity/',
        path_entity_alias=base)
12 |
13 |
def test_embeddding_rna(dict_adata, tmp_path):
    """Post-training workflow on RNA embeddings: joint cell/gene
    embedding, UMAP, entity comparison plots, and neighbor queries.
    """
    si.settings.set_workdir(tmp_path / "simba_rna")
    cells = dict_adata['C']
    genes = dict_adata['G']
    # project genes into the cell embedding space; first with a top-k
    # restriction, then with defaults (second call overwrites the first)
    joint = si.tl.embed(
        adata_ref=cells, list_adata_query=[genes], n_top=20)
    joint = si.tl.embed(
        adata_ref=cells, list_adata_query=[genes])
    # annotate each entity as cell or gene
    joint.obs['entity_anno'] = ""
    joint.obs.loc[cells.obs_names, 'entity_anno'] = 'cell'
    joint.obs.loc[genes.obs_names, 'entity_anno'] = 'gene'

    si.tl.umap(joint, n_neighbors=15, n_components=2)
    si.pl.umap(joint, drawing_order='random')
    si.pl.umap(joint, color=['entity_anno'], drawing_order='random')

    # compare gene entities against the cell reference
    cmp_res = si.tl.compare_entities(adata_ref=cells, adata_query=genes)
    si.pl.entity_metrics(cmp_res,
                         x='max',
                         y='gini',
                         show_contour=False,
                         texts=genes.obs_names[:2],
                         show_texts=True,
                         show_cutoff=True,
                         size=5,
                         text_expand=(1.3, 1.5),
                         cutoff_x=1.,
                         cutoff_y=0.3,
                         save_fig=True)
    si.pl.entity_barcode(cmp_res,
                         layer='softmax',
                         entities=list(genes.obs_names[:2]),
                         show_cutoff=True,
                         cutoff=0.001,
                         fig_size=(5, 2.5),
                         save_fig=True)
    # k-nearest-neighbor query restricted to gene entities
    hits = si.tl.query(joint,
                       entity=list(cells.obs_names[:2]),
                       obsm=None,
                       use_radius=False,
                       k=50,
                       anno_filter='entity_anno',
                       filters=['gene'])
    print(hits.head())
    si.pl.query(joint,
                obsm=None,
                show_texts=False,
                color=['entity_anno'],
                alpha=0.9,
                alpha_bg=0.1,
                save_fig=True)
    # radius-based query in UMAP space
    hits = si.tl.query(joint,
                       entity=cells.obs_names[0],
                       obsm='X_umap',
                       use_radius=True,
                       anno_filter='entity_anno')
    print(hits.head())
    si.pl.query(joint,
                obsm='X_umap',
                show_texts=False,
                color=['entity_anno'],
                alpha=0.9,
                alpha_bg=0.1,
                save_fig=True)
85 |
--------------------------------------------------------------------------------
/tests/test_preprocessing.py:
--------------------------------------------------------------------------------
1 | import simba as si
2 | import pytest
3 |
4 |
@pytest.fixture
def adata_CG():
    """Raw 10x PBMC scRNA-seq subset (cells x genes)."""
    path = "tests/data/10xpbmc_rna_subset.h5ad"
    return si.read_h5ad(path)
8 |
9 |
@pytest.fixture
def adata_CP():
    """Raw 10x PBMC scATAC-seq subset (cells x peaks)."""
    path = "tests/data/10xpbmc_atac_subset.h5ad"
    return si.read_h5ad(path)
13 |
14 |
def test_rna(adata_CG, tmp_path):
    """Full scRNA-seq preprocessing pipeline: QC, filtering,
    normalization, variable-gene selection and discretization.
    """
    adata = adata_CG
    si.settings.set_workdir(tmp_path / "simba_rna")
    si.settings.set_figure_params(dpi=80,
                                  style='white',
                                  fig_size=[5, 5],
                                  rc={'image.cmap': 'viridis'})
    # modality-agnostic QC and filtering
    si.pp.filter_features(adata, min_n_samples=1)
    si.pp.filter_genes(adata, min_n_cells=3)
    si.pp.cal_qc(adata)
    si.pl.violin(adata,
                 list_obs=['n_counts', 'n_features'],
                 save_fig=True,
                 fig_name='plot_violin.png')
    si.pp.filter_samples(adata, min_n_features=1)
    # RNA-specific QC and filtering
    si.pp.cal_qc_rna(adata)
    si.pl.violin(adata,
                 list_obs=['n_counts', 'n_genes', 'pct_mt'],
                 save_fig=True,
                 fig_name='plot_violin.png')
    si.pp.filter_cells_rna(adata, min_n_genes=2)
    si.pp.normalize(adata, method='lib_size')
    si.pp.log_transform(adata)
    si.pp.select_variable_genes(adata, n_top_genes=2000)
    si.pl.variable_genes(adata,
                         show_texts=True,
                         save_fig=True,
                         fig_name='plot_variable_genes.png')
    si.tl.discretize(adata, n_bins=5)
    si.pl.discretize(adata,
                     save_fig=True,
                     fig_name='plot_discretize.png')
46 |
47 |
def test_atac(adata_CP, tmp_path):
    """scATAC-seq preprocessing pipeline: peak filtering, QC,
    PCA-based feature selection and BED export.
    """
    adata = adata_CP
    si.settings.set_workdir(tmp_path / "simba_atac")
    si.pp.filter_peaks(adata, min_n_cells=5)
    si.pp.binarize(adata)
    si.pp.cal_qc_atac(adata)
    si.pl.hist(adata,
               list_obs=['n_counts', 'n_peaks', 'pct_peaks'],
               log=True,
               list_var=['n_cells'],
               fig_size=(3, 3),
               save_fig=True,
               fig_name='plot_histogram.png')
    si.pp.filter_cells_atac(adata, min_n_peaks=5)
    # PCA and selection of informative PCs / features
    si.pp.pca(adata, n_components=30)
    si.pl.pca_variance_ratio(adata,
                             show_cutoff=True,
                             save_fig=True,
                             fig_name='plot_variance_ratio.png')
    si.pp.select_pcs(adata, n_pcs=10)
    si.pp.select_pcs_features(adata)
    si.pl.pcs_features(adata,
                       fig_ncol=5,
                       save_fig=True,
                       fig_name='plot_pcs_features.png')
    si.write_bed(adata, use_top_pcs=True)
73 |
74 |
def test_genescores(adata_CP):
    """Compute gene activity scores from ATAC peaks after standard
    peak filtering, QC and PCA-based feature selection.
    """
    si.pp.filter_peaks(adata_CP, min_n_cells=5)
    si.pp.cal_qc_atac(adata_CP)
    si.pp.filter_cells_atac(adata_CP, min_n_peaks=5)
    si.pp.pca(adata_CP, n_components=30)
    si.pp.select_pcs(adata_CP, n_pcs=10)
    si.pp.select_pcs_features(adata_CP)

    # NOTE(review): 'use_gene_weigt' looks misspelled, but it presumably
    # mirrors the actual keyword accepted by si.tl.gene_scores — confirm
    # against the library before renaming it here or upstream.
    adata_CG_atac = si.tl.gene_scores(adata_CP,
                                      genome='hg19',
                                      use_gene_weigt=True,
                                      use_top_pcs=True)
    print(adata_CG_atac)
88 |
89 |
def test_integration(adata_CG):
    """Edge inference between a dataset and itself, followed by
    similarity plotting and edge trimming.
    """
    adata = adata_CG
    si.pp.filter_genes(adata, min_n_cells=3)
    si.pp.cal_qc_rna(adata)
    si.pp.filter_cells_rna(adata, min_n_genes=2)
    si.pp.normalize(adata, method='lib_size')
    si.pp.log_transform(adata)
    si.pp.select_variable_genes(adata, n_top_genes=2000)
    # infer cell-cell edges (dataset vs. itself)
    edges = si.tl.infer_edges(adata, adata, n_components=20, k=20)
    si.pl.node_similarity(edges, cutoff=0.5, save_fig=True)
    si.pl.svd_nodes(edges, cutoff=0.5, save_fig=True)
    si.tl.trim_edges(edges, cutoff=0.5)
106 |
--------------------------------------------------------------------------------