├── .codecov.yml ├── .github └── workflows │ └── CI.yml ├── .gitignore ├── .readthedocs.yml ├── LICENSE ├── README.md ├── R_scripts └── scan_for_kmers_motifs.R ├── docs ├── Makefile ├── environment.yml ├── make.bat ├── requirements.txt └── source │ ├── API.rst │ ├── About SIMBA.rst │ ├── Basic concepts.rst │ ├── Citation.rst │ ├── Installation.rst │ ├── Makefile │ ├── Output.rst │ ├── Release notes.rst │ ├── _ext │ └── edit_on_github.py │ ├── _static │ └── img │ │ ├── Figure1.png │ │ ├── lion_icon.svg │ │ └── logo_simba.png │ ├── conf.py │ ├── index.rst │ └── make.bat ├── pytest.ini ├── requirements.txt ├── setup.py ├── simba ├── __init__.py ├── _settings.py ├── _utils.py ├── _version.py ├── data │ └── gene_anno │ │ ├── hg19_genes.bed │ │ ├── hg38_genes.bed │ │ ├── mm10_genes.bed │ │ └── mm9_genes.bed ├── datasets │ ├── __init__.py │ └── _datasets.py ├── plotting │ ├── __init__.py │ ├── _palettes.py │ ├── _plot.py │ ├── _post_training.py │ └── _utils.py ├── preprocessing │ ├── __init__.py │ ├── _general.py │ ├── _pca.py │ ├── _qc.py │ ├── _utils.py │ └── _variable_genes.py ├── readwrite.py └── tools │ ├── __init__.py │ ├── _gene_scores.py │ ├── _general.py │ ├── _integration.py │ ├── _pbg.py │ ├── _post_training.py │ ├── _umap.py │ └── _utils.py └── tests ├── data ├── 10xpbmc_atac_subset.h5ad ├── 10xpbmc_rna_subset.h5ad ├── pbg_training │ ├── entity_alias.txt │ ├── graph_stats.json │ ├── input │ │ └── entity │ │ │ ├── entity_count_C_0.txt │ │ │ ├── entity_count_G_0.txt │ │ │ ├── entity_names_C_0.json │ │ │ └── entity_names_G_0.json │ ├── model │ │ ├── checkpoint_version.txt │ │ ├── config.json │ │ ├── embeddings_C_0.v10.h5 │ │ ├── embeddings_G_0.v10.h5 │ │ ├── model.v10.h5 │ │ └── training_stats.json │ └── pbg_graph.txt └── preprocessed │ ├── atac_preprocessed.h5ad │ └── rna_preprocessed.h5ad ├── test_pbg_training.py ├── test_post_training.py └── test_preprocessing.py /.codecov.yml: -------------------------------------------------------------------------------- 1 
| ignore: 2 | - "simba/datasets/*" 3 | - "**/_utils.py" -------------------------------------------------------------------------------- /.github/workflows/CI.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build-linux: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | max-parallel: 5 10 | matrix: 11 | python-version: ['3.8', '3.9', '3.10'] 12 | 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v3 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | # - name: Add conda to system path 20 | # run: | 21 | # # $CONDA is an environment variable pointing to the root of the miniconda directory 22 | # echo $CONDA/bin >> $GITHUB_PATH 23 | - uses: mamba-org/setup-micromamba@v1 24 | with: 25 | condarc: | 26 | channels: 27 | - conda-forge 28 | - bioconda 29 | - defaults 30 | init-shell: bash 31 | environment-name: test-env 32 | create-args: >- 33 | python=${{ matrix.python-version }} 34 | simba>=1.1 35 | flake8 36 | pytest 37 | pytest-cov 38 | - name: Install SIMBA 39 | run: | 40 | python -m pip install --upgrade pip 41 | # pip install -r requirements.txt 42 | pip install -e . 43 | shell: bash -el {0} 44 | - name: Lint with flake8 45 | run: | 46 | # stop the build if there are Python syntax errors or undefined names 47 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 48 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 49 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 50 | shell: bash -el {0} 51 | - name: Test with pytest 52 | run: | 53 | pytest --cov 54 | shell: bash -el {0} 55 | - name: Coverage report 56 | run: | 57 | bash <(curl -s https://codecov.io/bash) 58 | shell: bash -el {0} 59 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.gitignore.io/api/osx,python,windows 2 | 3 | ### OSX ### 4 | *.DS_Store 5 | .AppleDouble 6 | .LSOverride 7 | 8 | # Icon must end with two \r 9 | Icon 10 | 11 | # Thumbnails 12 | ._* 13 | 14 | # Files that might appear in the root of a volume 15 | .DocumentRevisions-V100 16 | .fseventsd 17 | .Spotlight-V100 18 | .TemporaryItems 19 | .Trashes 20 | .VolumeIcon.icns 21 | .com.apple.timemachine.donotpresent 22 | 23 | # Directories potentially created on remote AFP share 24 | .AppleDB 25 | .AppleDesktop 26 | Network Trash Folder 27 | Temporary Items 28 | .apdisk 29 | 30 | ### Python ### 31 | # Byte-compiled / optimized / DLL files 32 | __pycache__/ 33 | *.py[cod] 34 | *$py.class 35 | 36 | # C extensions 37 | *.so 38 | 39 | # Distribution / packaging 40 | .Python 41 | build/ 42 | develop-eggs/ 43 | dist/ 44 | downloads/ 45 | eggs/ 46 | .eggs/ 47 | lib/ 48 | lib64/ 49 | parts/ 50 | sdist/ 51 | var/ 52 | wheels/ 53 | *.egg-info/ 54 | .installed.cfg 55 | *.egg 56 | 57 | # PyInstaller 58 | # Usually these files are written by a python script from a template 59 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
60 | *.manifest 61 | *.spec 62 | 63 | # Installer logs 64 | pip-log.txt 65 | pip-delete-this-directory.txt 66 | 67 | # Unit test / coverage reports 68 | htmlcov/ 69 | .tox/ 70 | .coverage 71 | .coverage.* 72 | .cache 73 | .pytest_cache/ 74 | nosetests.xml 75 | coverage.xml 76 | *.cover 77 | .hypothesis/ 78 | 79 | # Translations 80 | *.mo 81 | *.pot 82 | 83 | # Flask stuff: 84 | instance/ 85 | .webassets-cache 86 | 87 | # Scrapy stuff: 88 | .scrapy 89 | 90 | # Sphinx documentation 91 | docs/_build/ 92 | 93 | # PyBuilder 94 | target/ 95 | 96 | # Jupyter Notebook 97 | .ipynb_checkpoints 98 | 99 | # pyenv 100 | .python-version 101 | 102 | # celery beat schedule file 103 | celerybeat-schedule.* 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | 130 | ### Windows ### 131 | # Windows thumbnail cache files 132 | Thumbs.db 133 | ehthumbs.db 134 | ehthumbs_vista.db 135 | 136 | # Folder config file 137 | Desktop.ini 138 | 139 | # Recycle Bin used on file shares 140 | $RECYCLE.BIN/ 141 | 142 | # Windows Installer files 143 | *.cab 144 | *.msi 145 | *.msm 146 | *.msp 147 | 148 | # Windows shortcuts 149 | *.lnk 150 | 151 | # R 152 | *.Rhistory 153 | 154 | # Sphinx 155 | docs/source/_autosummary/ 156 | 157 | # End of https://www.gitignore.io/api/osx,python,windows 158 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | conda: 4 | environment: docs/environment.yml 5 | 6 | build: 7 | os: ubuntu-22.04 8 | tools: 9 | python: "mambaforge-4.10" 10 | 11 | sphinx: 12 | builder: html 13 | configuration: 
docs/source/conf.py 14 | fail_on_warning: false -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2021, Huidong Chen, Pinello Lab 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![documentation](https://readthedocs.org/projects/simba-bio/badge/?version=latest)](https://simba-bio.readthedocs.io/en/latest/) 2 | [![CI](https://github.com/huidongchen/simba/actions/workflows/CI.yml/badge.svg)](https://github.com/huidongchen/simba/actions/workflows/CI.yml) 3 | [![Anaconda](https://anaconda.org/bioconda/simba/badges/version.svg)](https://anaconda.org/bioconda/simba) 4 | [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/simba/README.html) 5 | [![codecov](https://codecov.io/gh/huidongchen/simba/branch/master/graph/badge.svg?token=ZUA70S1LUU)](https://codecov.io/gh/huidongchen/simba) 6 | 7 | # SIMBA 8 | 9 | SIMBA: **SI**ngle-cell e**MB**edding **A**long with features 10 | 11 | Website: https://simba-bio.readthedocs.io 12 | 13 | Manuscript: Huidong Chen, Jayoung Ryu, Michael E. Vinyard, Adam Lerer & Luca Pinello. ["SIMBA: single-cell embedding along with features. *Nat Methods* (2023)"](https://doi.org/10.1038/s41592-023-01899-8). 14 | 15 | 16 | -------------------------------------------------------------------------------- /R_scripts/scan_for_kmers_motifs.R: -------------------------------------------------------------------------------- 1 | # This script scans specified regions for kmers or/and motifs using JASPAR2020 database. 
2 | # It outputs regions-by-kmers/motifs frequency matrix in .h5 format
3 |
4 | # Author: Huidong Chen
5 | # Contact information: hd7chen AT gmail DOT com
6 |
7 | suppressMessages(library(optparse,quietly = TRUE))
8 |
9 | # Command-line entry point.
10 | # Parses the options, converts the input .bed regions to FASTA with
11 | # `bedtools getfasta`, counts k-mers and/or JASPAR2020 motif matches per
12 | # region, and writes each regions-by-features matrix to an .h5 file
13 | # (`freq_kmer.h5`, `freq_motif.h5`) in the output folder.
14 | # Takes no arguments (options are read from the command line).
15 | # Stops with an error when a required option is missing or bedtools fails.
16 | main <- function(){
17 |     option_list = list(
18 |         make_option(c("-i", "--input"), type="character", default=NULL,
19 |                     help="input region file name in .bed format", metavar="character"),
20 |         make_option(c("-g", "--genome"), type="character", default=NULL,
21 |                     help="Path to reference genome", metavar="character"),
22 |         make_option(c("--no_kmer"), action = "store_true",default=FALSE,
23 |                     help="disable scanning for kmers"),
24 |         make_option(c("--no_motif"), action = "store_true",default=FALSE,
25 |                     help="disable scanning for motifs"),
26 |         make_option(c("-k","--k_kmer"), type="integer", default=6,
27 |                     help="k-mer length [default = %default].", metavar="integer"),
28 |         make_option(c("-s","--species"), type="character", default=NULL,
29 |                     help="Species of motifs in the JASPAR database.
30 |                     Choose from 'Homo sapiens','Mus musculus'. Only valid when motif is used",
31 |                     metavar="character"),
32 |         make_option(c("-o", "--output"), type="character", default='output_kmers_motifs',
33 |                     help="Output folder [default = %default]", metavar="character")
34 |     )
35 |
36 |     opt_parser = OptionParser(option_list=option_list)
37 |     opt = parse_args(opt_parser)
38 |
39 |     if(is.null(opt$input)){
40 |         print_help(opt_parser)
41 |         stop("input region file must be specified", call.=FALSE)
42 |     }
43 |     # the reference genome is always required: `bedtools getfasta` below reads
44 |     # the sequences from it for both k-mer and motif scanning
45 |     if(is.null(opt$genome)){
46 |         print_help(opt_parser)
47 |         stop("reference genome must be specified", call.=FALSE)
48 |     }
49 |     # the species is only used to select the JASPAR motif set
50 |     if(!opt$no_motif && is.null(opt$species)){
51 |         print_help(opt_parser)
52 |         stop("species must be specified when scanning for motifs", call.=FALSE)
53 |     }
54 |
55 |     file.input = opt$input
56 |     genome = opt$genome
57 |     no_kmer = opt$no_kmer
58 |     no_motif = opt$no_motif
59 |     k = opt$k_kmer
60 |     species = opt$species
61 |     dir.output = opt$output
62 |
63 |     suppressMessages(library(rhdf5))
64 |     suppressMessages(library(HDF5Array)) # used for saving sparse matrix
65 |     suppressMessages(library(Biostrings))
66 |     suppressMessages(library(Matrix))
67 |     suppressMessages(library(TFBSTools))
68 |     suppressMessages(library(JASPAR2020))
69 |     suppressMessages(library(motifmatchr))
70 |     suppressMessages(library(SummarizedExperiment))
71 |     suppressMessages(library(doParallel))
72 |
73 |     set.seed(2020)
74 |
75 |     # create the output folder (and parents) without going through the shell; an existing folder is not an error
76 |     dir.create(dir.output, recursive=TRUE, showWarnings=FALSE)
77 |
78 |     print('Converting .bed to .fasta ...')
79 |     ### convert peaks bed file to fasta file
80 |     file.input.fa = paste0(basename(file.input),'.fa')
81 |     path.fa = file.path(dir.output,file.input.fa)
82 |     # paths are tilde-expanded by R and then shell-quoted so that spaces or
83 |     # shell metacharacters in a path cannot break (or alter) the command
84 |     status = system(paste("bedtools getfasta",
85 |                           "-fi",shQuote(path.expand(genome)),
86 |                           "-bed",shQuote(path.expand(file.input)),
87 |                           "-fo",shQuote(path.expand(path.fa))))
88 |     if(status != 0){
89 |         stop("'bedtools getfasta' failed; check that bedtools is installed and that the genome and .bed files exist", call.=FALSE)
90 |     }
91 |
92 |     peaks_seq <- readDNAStringSet(path.fa, "fasta")
93 |     peaks_name = gsub(":|-",'_',names(peaks_seq))
94 |
95 |     ### count kmers
96 |     if(!no_kmer){
97 |         print('Scanning for kmers ...')
98 |         freq_k = oligonucleotideFrequency(peaks_seq, k)
99 |         rownames(freq_k) = peaks_name
100 |         freq_k = as(freq_k, "sparseMatrix")
101 |     }
102 |
103 |     ### scan for TF motifs
104 |     if(!no_motif){
105 |         print('Scanning for TF motifs ...')
106 |         opts <- list()
107 |         opts["species"] <- species
108 |         opts["collection"] <- "CORE"
109 |         PFMatrixList = TFBSTools::getMatrixSet(JASPAR2020::JASPAR2020,opts = opts)
110 |         motif_ix_scores <- motifmatchr::matchMotifs(PFMatrixList,peaks_seq, out = "scores")
111 |         freq_motif = motifCounts(motif_ix_scores)
112 |         motif_names = c()
113 |         for (x in names(PFMatrixList)){
114 |             motif_names = c(motif_names,PFMatrixList[[x]]@name)
115 |         }
116 |         colnames(freq_motif) = gsub("::",'_',motif_names)
117 |         rownames(freq_motif) = peaks_name
118 |     }
119 |
120 |     ### save results
121 |     ### save kmers
122 |     if(!no_kmer){
123 |         print('Saving kmer matrix ...')
124 |
125 |         # output_dir = file.path(dir.output, 'freq_k')
126 |         # system(paste0('mkdir -p ',output_dir))
127 |         # filename = 'freq_k.mtx'
128 |         # writeMM(freq_k,file = file.path(output_dir,filename))
129 |         # write.table(rownames(freq_k),file.path(output_dir,'peaks.tsv'),quote=FALSE,row.names = FALSE,col.names = FALSE)
130 |         # write.table(colnames(freq_k),file.path(output_dir,'kmers.tsv'),quote=FALSE,row.names = FALSE,col.names = FALSE)
131 |
132 |         filename = 'freq_kmer.h5'
133 |         # writeHDF5Array internally transposes the matrix so `t()` is used to counteract this operation
134 |         writeHDF5Array(t(freq_k), file.path(dir.output,filename), name="mat", with.dimnames=FALSE, verbose=FALSE)
135 |         # using this structure in order for anndata 'read_hdf' to recognize row names and column names
136 |         h5write(rownames(freq_k), file.path(dir.output,filename), "row_names")
137 |         h5write(colnames(freq_k), file.path(dir.output,filename), "col_names")
138 |     }
139 |
140 |     ### save motifs
141 |     if(!no_motif){
142 |         print('Saving motif matrix ...')
143 |
144 |         # output_dir = file.path(dir.output, 'freq_motif')
145 |         # system(paste0('mkdir -p ',output_dir))
146 |         # filename = 'freq_motif.mtx'
147 |         # writeMM(freq_motif,file = file.path(output_dir,filename))
148 |         # write.table(rownames(freq_motif),file.path(output_dir,'peaks.tsv'),quote=FALSE,row.names = FALSE,col.names = FALSE)
149 |         # write.table(colnames(freq_motif),file.path(output_dir,'motifs.tsv'),quote=FALSE,row.names = FALSE,col.names = FALSE)
150 |
151 |         filename = 'freq_motif.h5'
152 |         # writeHDF5Array internally transposes the matrix so `t()` is used to counteract this operation
153 |         writeHDF5Array(t(freq_motif), file.path(dir.output,filename), name="mat", with.dimnames=FALSE, verbose=FALSE)
154 |         # using this structure in order for anndata 'read_hdf' to recognize row names and column names
155 |         h5write(rownames(freq_motif), file.path(dir.output,filename), "row_names")
156 |         h5write(colnames(freq_motif), file.path(dir.output,filename), "col_names")
157 |     }
158 |
159 |     print('Finished.')
160 | }
161 |
162 | main()
-------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/environment.yml: -------------------------------------------------------------------------------- 1 | name: readthedocs 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - pip 8 | - numpy<1.24.0 #avoid errors caused by 1.24 9 | - simba>=1.1 10 | - pandoc>=2.14 11 | - pip: 12 | - sphinx>=3.0 13 | - sphinx-rtd-theme>=0.5 14 | - nbsphinx>=0.8 15 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx>=3.0 2 | sphinx-rtd-theme>=0.5 3 | nbsphinx>=0.8 -------------------------------------------------------------------------------- /docs/source/API.rst: -------------------------------------------------------------------------------- 1 | .. automodule:: simba 2 | 3 | API 4 | === 5 | 6 | Import simba as:: 7 | 8 | import simba as si 9 | 10 | Configuration for SIMBA 11 | ~~~~~~~~~~~~~~~~~~~~~~~ 12 | .. autosummary:: 13 | :toctree: _autosummary 14 | 15 | settings.set_figure_params 16 | settings.set_pbg_params 17 | settings.set_workdir 18 | 19 | 20 | Reading 21 | ~~~~~~~ 22 | 23 | .. autosummary:: 24 | :toctree: _autosummary 25 | 26 | read_csv 27 | read_h5ad 28 | read_10x_h5 29 | read_mtx 30 | read_embedding 31 | load_pbg_config 32 | load_graph_stats 33 | 34 | See more at `anndata `_ 35 | 36 | Preprocessing 37 | ~~~~~~~~~~~~~ 38 | 39 | .. autosummary:: 40 | :toctree: _autosummary 41 | 42 | pp.log_transform 43 | pp.normalize 44 | pp.binarize 45 | pp.cal_qc 46 | pp.cal_qc_rna 47 | pp.cal_qc_atac 48 | pp.filter_samples 49 | pp.filter_cells_rna 50 | pp.filter_cells_atac 51 | pp.filter_features 52 | pp.filter_genes 53 | pp.filter_peaks 54 | pp.pca 55 | pp.select_pcs 56 | pp.select_pcs_features 57 | pp.select_variable_genes 58 | 59 | Tools 60 | ~~~~~ 61 | 62 | .. 
autosummary:: 63 | :toctree: _autosummary 64 | 65 | tl.discretize 66 | tl.umap 67 | tl.gene_scores 68 | tl.infer_edges 69 | tl.trim_edges 70 | tl.gen_graph 71 | tl.pbg_train 72 | tl.softmax 73 | tl.embed 74 | tl.compare_entities 75 | tl.query 76 | tl.find_master_regulators 77 | tl.find_target_genes 78 | 79 | 80 | Plotting 81 | ~~~~~~~~ 82 | 83 | .. autosummary:: 84 | :toctree: _autosummary 85 | 86 | pl.pca_variance_ratio 87 | pl.pcs_features 88 | pl.variable_genes 89 | pl.violin 90 | pl.hist 91 | pl.umap 92 | pl.discretize 93 | pl.node_similarity 94 | pl.svd_nodes 95 | pl.pbg_metrics 96 | pl.entity_metrics 97 | pl.entity_barcode 98 | pl.query 99 | 100 | 101 | Datasets 102 | ~~~~~~~~ 103 | 104 | .. autosummary:: 105 | :toctree: _autosummary 106 | 107 | datasets.rna_10xpmbc3k 108 | datasets.rna_han2018 109 | datasets.rna_tmc2018 110 | datasets.rna_baron2016 111 | datasets.rna_muraro2016 112 | datasets.rna_segerstolpe2016 113 | datasets.rna_wang2016 114 | datasets.rna_xin2016 115 | datasets.atac_buenrostro2018 116 | datasets.atac_10xpbmc5k 117 | datasets.atac_chen2019 118 | datasets.atac_cusanovich2018_subset 119 | datasets.multiome_ma2020_fig4 120 | datasets.multiome_chen2019 121 | datasets.multiome_10xpbmc10k 122 | -------------------------------------------------------------------------------- /docs/source/About SIMBA.rst: -------------------------------------------------------------------------------- 1 | About SIMBA 2 | =========== 3 | 4 | SIMBA ( **SI**\ ngle-cell e\ **MB**\ edding **A**\ long with features) is a graph embedding method that jointly embeds single cells and their defining features, such as genes, chromatin accessible regions, and DNA sequences into a common latent space. SIMBA explicitly learns low-dimensional representations of cells and features, and implicitly enables the possibility of clustering-free marker discovery, batch effect removal and multi-omics integration. 
Importantly, SIMBA introduces several crucial procedures including Softmax transformation, weight decay for controlling overfitting, and entity-type constraints to generate comparable embeddings (co-embeddings) of cells and features and to address unique challenges in single-cell data. 5 | 6 | SIMBA first encodes different types of entities such as cells, genes, open chromatin regions (peaks or bins), transcription factor (TF) motifs, and k-mers (short sequences of a specific length, k), into a single graph, where each node represents an individual entity and edges indicate relations between entities. Unlike existing methods that primarily focus on learning cell states, SIMBA treats both cells and features as equal nodes in the same graph. 7 | 8 | In SIMBA, edges may be added in two ways: 1) measured experimentally; 2) inferred computationally. For edges that are measured experimentally, each cell-feature edge corresponds to a single-cell measurement (e.g., the expression value of a gene or a chromatin-accessible peak observed in a cell). For example, if a gene is expressed in a cell, an edge is created between the gene and cell. The weight of this edge is determined by the gene expression level. Similarly, an edge is added between a cell and a chromatin region if the region is open in this cell. Edges are also allowed between different features to capture and model the underlying regulatory mechanisms. For example, an edge between a chromatin region and a TF-motif (or k-mer) captures the notion that a TF may bind to a regulatory region containing a specific DNA sequence. For edges that cannot be directly measured, they are inferred computationally by summarizing features of the same or different types. Each edge between cells of different batches or modalities indicates the cellular functional or structural similarity. 
9 | 10 | Once the input graph is constructed, SIMBA applies a multi-entity graph embedding algorithm as well as a Softmax-based transformation to embed the nodes/entities into a common low-dimensional space wherein cells and features are comparable and can be analyzed based on their distance. Graph construction is inherently flexible, enabling SIMBA to be applied to a wide variety of single-cell tasks. 11 | 12 | Overall, SIMBA is versatile and can accommodate features of various domains as long as they can be encoded into a connected graph. It can readily extend to new single-cell modalities and tasks. SIMBA provides a single generalizable framework that allows diverse single-cell problems to be formulated in a unified way and thus simplifies the development of new analyses and extension to new single-cell modalities. -------------------------------------------------------------------------------- /docs/source/Basic concepts.rst: -------------------------------------------------------------------------------- 1 | ================ 2 | Basic concepts 3 | ================ 4 | 5 | 6 | Graph construction 7 | ~~~~~~~~~~~~~~~~~~ 8 | SIMBA encodes entities of different types, including genes, open chromatin regions (peaks or bins), and DNA sequences (transcription factor motifs or k-mers), into a connected large graph based on the relation between them. In this graph, nodes represent different entities and edges indicate the relation between entities. 9 | 10 | * In scRNA-seq analysis, each node represents either a cell or a gene. If a gene is expressed in a cell, then an edge is added between this gene and cell. The gene expression level is encoded into the weight of this edge. 11 | 12 | * In scATAC-seq analysis, each node represents either a cell or a region (peak/bin). If a region is open in a cell, then an edge is added between this region and cell. 
Optionally, if DNA sequences (TF motifs or k-mers) are also used, each node represents a cell, or a region, or a DNA sequence. In addition to the relation between a cell and a region, if a DNA sequence is found within the open region, then an edge is added between this DNA sequence and open region. 13 | 14 | * In multimodal analysis, each node can be any of these entities, including a cell, a gene, an open region, a DNA sequence, etc. Edges are added in the same way as in scRNA-seq analysis and scATAC-seq analysis. 15 | 16 | * In batch correction analysis, in addition to the experimentally measured edges as described above, batch correction is further enhanced with the computationally inferred edges between cell nodes across datasets using a truncated randomized singular value decomposition (SVD)-based procedure. 17 | 18 | * In multiomics integration analysis (scRNA-seq and scATAC-seq), SIMBA first builds one graph for scRNA-seq data and one graph for scATAC-seq data independently as described above. To connect these two graphs, SIMBA calculates gene scores by summarizing accessible regions from scATAC-seq data and then infers edges between cells of different omics based on their shared gene expression modules through a similar procedure as in batch correction. 19 | 20 | PBG training 21 | ~~~~~~~~~~~~ 22 | Following the construction of a multi-relational graph between biological entities, we adapt graph embedding techniques from the knowledge graph and recommendation systems literature to construct unsupervised representations for these entities. 23 | 24 | We use the PyTorch-BigGraph (PBG) framework, which provides efficient computation of multi-relation graph embeddings over multiple entity types and can scale to graphs with millions or billions of entities. 
25 | 26 | In SIMBA, several key modifications have been made based on PBG, including: 27 | 28 | * Type-constrained negative sampling 29 | 30 | * Negative samples are produced in two ways: 31 | 32 | * by corrupting the edge with a source or destination sampled uniformly from the nodes with the correct types for this relation; 33 | 34 | * by corrupting the edge with a source or destination node sampled with probability proportional to its degree. 35 | 36 | * Introducing a weight decay procedure to solve the overfitting problem. 37 | 38 | The resulting graph embeddings have two desirable properties that we will take advantage of: 39 | 40 | #. First-order similarity: for two entity types with a relation between them, edges with high likelihood should have a higher dot product. 41 | #. Second-order similarity: within a single entity type, entities that have ‘similar contexts’, i.e., a similar distribution of edge probabilities, should have similar embeddings. 42 | 43 | Evaluation during training 44 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 45 | During the PBG training procedure, a small percentage of edges is held out (by default, the evaluation fraction is set to 5%) to monitor overfitting and evaluate the final model. 46 | 47 | Five metrics are computed on the reserved set of edges, including mean reciprocal rank (MRR, the average of the reciprocal of the ranks of all positives), R1 (the fraction of positives that rank better than all their negatives, i.e., have a rank of 1), R10 (the fraction of positives that rank in the top 10 among their negatives), R50 (the fraction of positives that rank in the top 50 among their negatives), and AUC (Area Under the Curve). 48 | 49 | By default, we show MRR along with training loss and validation loss, while other metrics are also available in the SIMBA package. The learning curves for validation loss and these metrics can be used to determine when training has completed. 
The relative values of training and validation loss along with these evaluation metrics can be used to identify issues with training (underfitting vs overfitting) and tune the hyperparameters weight decay, embedding dimension, and number of training epochs appropriately. However, for most datasets we find that the default parameters do not need tuning. 50 | 51 | Softmax transformation 52 | ~~~~~~~~~~~~~~~~~~~~~~ 53 | PyTorch-BigGraph training provides initial embeddings of all entities (nodes). However, entities of different types (e.g., cells vs peaks, cells of different batches or modalities) have different edge distributions and thus may lie on different manifolds of the latent space. To make the embeddings of entities of different types comparable, we transform the embeddings of features with the Softmax function by utilizing the first-order similarity between cells (reference) and features (query). In the case of batch correction or multi-omics integration, the Softmax transformation is also performed based on the first-order similarity between cells of different batches or modalities. 54 | -------------------------------------------------------------------------------- /docs/source/Citation.rst: -------------------------------------------------------------------------------- 1 | Citation 2 | ======== 3 | 4 | Chen, H., Ryu, J., Vinyard, M. E., Lerer, A., & Pinello, L. (2023). SIMBA: SIngle-cell eMBedding Along with features. *Nature Methods*, 1-11. 5 | 6 | Please check out our `manuscript <https://doi.org/10.1038/s41592-023-01899-8>`_ to learn more. 
-------------------------------------------------------------------------------- /docs/source/Installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | Anaconda 5 | ~~~~~~~~ 6 | 7 | 8 | For first-time *conda* users, perform a one-time setup of Bioconda with the following commands:: 9 | 10 | conda config --add channels defaults 11 | conda config --add channels bioconda 12 | conda config --add channels conda-forge 13 | conda config --set channel_priority strict 14 | 15 | 16 | To install `simba <https://anaconda.org/bioconda/simba>`_ with conda, run:: 17 | 18 | conda install -c bioconda simba 19 | 20 | **Recommended**: install *simba* in a new virtual environment:: 21 | 22 | conda create -n env_simba simba 23 | conda activate env_simba 24 | 25 | 26 | Dev version 27 | ~~~~~~~~~~~ 28 | 29 | To install the latest version on `GitHub <https://github.com/huidongchen/simba>`_, 30 | 31 | first install `simba_pbg `_ :: 32 | 33 | conda install -c bioconda simba_pbg 34 | 35 | 36 | then run:: 37 | 38 | git clone https://github.com/huidongchen/simba.git 39 | pip install simba --user 40 | 41 | or:: 42 | 43 | pip install git+https://github.com/huidongchen/simba 44 | -------------------------------------------------------------------------------- /docs/source/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/source/Output.rst: -------------------------------------------------------------------------------- 1 | Output 2 | ====== 3 | 4 | SIMBA result structure will look like this: 5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 6 | 7 | :: 8 | 9 | result_simba 10 | ├── figures 11 | └── pbg 12 | └── graph0 13 | ├── pbg_graph.txt 14 | ├── graph_stats.json 15 | ├── entity_alias.txt 16 | └── input 17 | ├── edge 18 | └── entity 19 | └── model0 20 | ├── config.json 21 | ├── training_stats.json 22 | ├── checkpoint_version.txt 23 | ├── embeddings.h5 24 | └── model.h5 25 | └── model1 26 | ├── config.json 27 | ├── training_stats.json 28 | ├── checkpoint_version.txt 29 | ├── embeddings.h5 30 | └── model.h5 31 | └── model2 32 | ├── config.json 33 | ├── training_stats.json 34 | ├── checkpoint_version.txt 35 | ├── embeddings.h5 36 | └── model.h5 37 | └── graph1 38 | ├── pbg_graph.txt 39 | ├── graph_stats.json 40 | ├── entity_alias.txt 41 | └── input 42 | ├── edge 43 | └── entity 44 | └── model 45 | ├── config.json 46 | ├── training_stats.json 47 | ├── checkpoint_version.txt 48 | ├── embeddings.h5 49 | └── model.h5 50 | 51 | By default, all figures will be saved under ``result_simba/figures`` 52 | 53 | The PBG training results will be stored in the directory ``result_simba/pbg``. Inside this folder, each constructed graph will be saved in a distinct folder (by default ``result_simba/pbg/graph0``), and each model trained on that graph will be saved into a separate folder (by default ``result_simba/pbg/graph0/model``). 
54 | 55 | Inside each graph folder (e.g., ``result_simba/pbg/graph0``): 56 | 57 | - ``pbg_graph.txt`` stores the edges of the graph on which PBG training is performed; 58 | - ``graph_stats.json`` stores the statistics associated with this graph; 59 | - ``entity_alias.txt`` keeps the mapping between the original entity IDs and their aliases; 60 | - ``input`` stores the extracted nodes (entities) and edges from ``pbg_graph.txt``, which are prepared for PBG training; 61 | - ``model`` stores the training result of one parameter configuration (the folder is named ``model`` by default). -------------------------------------------------------------------------------- /docs/source/Release notes.rst: -------------------------------------------------------------------------------- 1 | Release notes 2 | ============= -------------------------------------------------------------------------------- /docs/source/_ext/edit_on_github.py: -------------------------------------------------------------------------------- 1 | """ 2 | Sphinx extension to add ReadTheDocs-style "Edit on GitHub" links to the 3 | sidebar. 
4 | """ 5 | 6 | import os 7 | import warnings 8 | 9 | __licence__ = "BSD (3 clause)" 10 | 11 | 12 | # def get_github_repo(app, path): 13 | # if path.endswith(".ipynb"): 14 | # return app.config.github_nb_repo, "/" 15 | # return app.config.github_repo, "/docs/source/" 16 | 17 | 18 | def html_page_context(app, pagename, templatename, context, doctree): 19 | if templatename != "page.html": 20 | return 21 | 22 | if doctree is not None: 23 | path = os.path.relpath(doctree.get("source"), app.builder.srcdir) 24 | if path.endswith(".ipynb"): 25 | context["display_github"] = True 26 | context["github_user"] = "huidongchen" 27 | context["github_repo"] = "simba_tutorials" 28 | context["github_version"] = "main" 29 | if path.endswith("rna_10x_mouse_brain_1p3M.ipynb"): 30 | context["conf_py_path"] = "/v1.1/" 31 | else: 32 | context["conf_py_path"] = "/v1.0/" 33 | else: 34 | context["display_github"] = True 35 | context["github_user"] = "huidongchen" 36 | context["github_repo"] = "simba" 37 | context["github_version"] = "master" 38 | context["conf_py_path"] = "/docs/source/" 39 | 40 | def setup(app): 41 | app.add_config_value("github_nb_repo", "", True) 42 | app.add_config_value("github_repo", "", True) 43 | app.connect("html-page-context", html_page_context) 44 | -------------------------------------------------------------------------------- /docs/source/_static/img/Figure1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huidongchen/simba/534e0b022ea1163face30263696f28b9a955c291/docs/source/_static/img/Figure1.png -------------------------------------------------------------------------------- /docs/source/_static/img/lion_icon.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/source/_static/img/logo_simba.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/huidongchen/simba/534e0b022ea1163face30263696f28b9a955c291/docs/source/_static/img/logo_simba.png -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('../simba')) 16 | sys.path.insert(0, os.path.abspath('_ext')) 17 | import simba # noqa: E402 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'SIMBA' 23 | copyright = '2023, Huidong Chen' 24 | author = 'Huidong Chen' 25 | 26 | # The full version, including alpha/beta/rc tags 27 | release = simba.__version__ 28 | 29 | 30 | # -- Retrieve notebooks (borrowed from scVelo) ------------------------------- 31 | 32 | from urllib.request import urlretrieve # noqa: E402 33 | 34 | notebooks_url = "https://github.com/huidongchen/simba_tutorials/raw/main/" 35 | notebooks_v1_0 = [ 36 | "atac_buenrostro2018_peaks_and_sequences.ipynb", 37 | "multiome_shareseq.ipynb", 38 | "multiome_shareseq_GRN.ipynb", 39 | "rna_mouse_atlas.ipynb", 40 | "rna_human_pancreas.ipynb", 41 | "multiome_10xpmbc10k_integration.ipynb", 42 | ] 43 | notebooks_v1_1 = [ 44 | 
"rna_10x_mouse_brain_1p3M.ipynb", 45 | ] 46 | notebooks_v1_2 = [ 47 | "rna_10xpmbc_all_genes_v1.2.ipynb", 48 | "rna_10xpmbc_edgeweigts.ipynb", 49 | 'new_graph_generation.ipynb' 50 | ] 51 | for nb in notebooks_v1_0: 52 | try: 53 | urlretrieve(notebooks_url + "v1.0/" + nb, nb) 54 | except Exception: 55 | pass 56 | 57 | for nb in notebooks_v1_1: 58 | try: 59 | urlretrieve(notebooks_url + "v1.1/" + nb, nb) 60 | except Exception: 61 | pass 62 | 63 | for nb in notebooks_v1_2: 64 | try: 65 | urlretrieve(notebooks_url + "v1.2/" + nb, nb) 66 | except Exception: 67 | pass 68 | # -- General configuration --------------------------------------------------- 69 | 70 | # Add any Sphinx extension module names here, as strings. They can be 71 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 72 | # ones. 73 | 74 | needs_sphinx = "3.0" 75 | 76 | extensions = [ 77 | "sphinx.ext.autodoc", 78 | "sphinx.ext.autosummary", 79 | 'sphinx.ext.napoleon', 80 | "sphinx.ext.intersphinx", 81 | "sphinx.ext.mathjax", 82 | "sphinx.ext.viewcode", 83 | "nbsphinx", 84 | "edit_on_github", 85 | ] 86 | 87 | autosummary_generate = True 88 | 89 | # Napoleon settings 90 | napoleon_google_docstring = False 91 | 92 | # Add any paths that contain templates here, relative to this directory. 93 | templates_path = ['_templates'] 94 | 95 | # List of patterns, relative to source directory, that match files and 96 | # directories to ignore when looking for source files. 97 | # This pattern also affects html_static_path and html_extra_path. 98 | exclude_patterns = ['_build'] 99 | 100 | # Add prolog for notebooks 101 | 102 | # nbsphinx_prolog = r""" 103 | # {% set docname = 'github/huidongchen/simba_tutorials/blob/main/v1.0/' + env.doc2path(env.docname, base=None) %} # noqa 104 | # """ 105 | 106 | # -- Options for HTML output ------------------------------------------------- 107 | 108 | # The theme to use for HTML and HTML Help pages. See the documentation for 109 | # a list of builtin themes. 
110 | # 111 | html_theme = 'sphinx_rtd_theme' 112 | html_theme_options = { 113 | "navigation_depth": 1, 114 | "titles_only": True, 115 | 'logo_only': True, 116 | } 117 | html_show_sphinx = False 118 | html_logo = '_static/img/logo_simba.png' 119 | html_favicon = '_static/img/lion_icon.svg' 120 | # html_context = dict( 121 | # display_github=True, 122 | # github_user='pinellolab', 123 | # github_repo='simba', 124 | # github_version='master', 125 | # conf_py_path='/docs/source/', 126 | # ) 127 | # html_context = dict( 128 | # display_github=True, 129 | # github_user='huidongchen', 130 | # github_repo='simba_tutorials', 131 | # github_version='main', 132 | # conf_py_path='/v1.0/', 133 | # ) 134 | github_repo = 'simba' 135 | github_nb_repo = 'simba_tutorials' 136 | 137 | 138 | # Add any paths that contain custom static files (such as style sheets) here, 139 | # relative to this directory. They are copied after the builtin static files, 140 | # so a file named "default.css" will overwrite the builtin "default.css". 141 | 142 | html_static_path = ['_static'] 143 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | |CI| |Docs| |Anaconda| |Install with conda| |Codecov| |Last updated| |Downloads| |License| 2 | 3 | **SIMBA**: **SI**\ ngle-cell e\ **MB**\ edding **A**\ long with features 4 | ======================================================================== 5 | 6 | SIMBA is a method to embed cells along with their defining features such as gene expression, transcription factor binding sequences and chromatin accessibility peaks into the same latent space. The joint embedding of cells and features allows SIMBA to perform various types of single cell tasks, including but not limited to single-modal analysis (e.g. scRNA-seq and scATAC-seq analysis), multimodal analysis, batch correction, and multi-omic integration. 7 | 8 | 9 | .. 
image:: _static/img/Figure1.png 10 | :align: center 11 | :width: 600 12 | :alt: SIMBA overview 13 | 14 | 15 | .. toctree:: 16 | :maxdepth: 2 17 | :caption: Overview 18 | :hidden: 19 | 20 | About SIMBA 21 | Installation 22 | API 23 | Release notes 24 | Citation 25 | 26 | 27 | .. toctree:: 28 | :maxdepth: 1 29 | :caption: SIMBA primer 30 | 31 | Basic concepts 32 | Output 33 | 34 | 35 | .. toctree:: 36 | :maxdepth: 1 37 | :caption: Tutorials 38 | 39 | rna_10xpmbc_all_genes_v1.2 40 | atac_buenrostro2018_peaks_and_sequences 41 | multiome_shareseq 42 | multiome_shareseq_GRN 43 | rna_mouse_atlas 44 | rna_human_pancreas 45 | multiome_10xpmbc10k_integration 46 | new_graph_generation 47 | rna_10xpmbc_edgeweigts 48 | rna_10x_mouse_brain_1p3M 49 | 50 | 51 | .. |Docs| image:: https://readthedocs.org/projects/simba-bio/badge/?version=latest 52 | :target: https://simba-bio.readthedocs.io 53 | 54 | .. |CI| image:: https://github.com/huidongchen/simba/actions/workflows/CI.yml/badge.svg 55 | :target: https://github.com/huidongchen/simba/actions/workflows/CI.yml 56 | 57 | .. |Anaconda| image:: https://anaconda.org/bioconda/simba/badges/version.svg 58 | :target: https://anaconda.org/bioconda/simba 59 | 60 | .. |Install with conda| image:: https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat 61 | :target: http://bioconda.github.io/recipes/simba/README.html 62 | 63 | .. |Last updated| image:: https://anaconda.org/bioconda/simba/badges/latest_release_date.svg 64 | :target: https://anaconda.org/bioconda/simba 65 | 66 | .. |License| image:: https://anaconda.org/bioconda/simba/badges/license.svg 67 | :target: https://github.com/pinellolab/simba/blob/master/LICENSE 68 | 69 | .. |Downloads| image:: https://anaconda.org/bioconda/simba/badges/downloads.svg 70 | :target: https://anaconda.org/bioconda/simba 71 | 72 | .. 
|Codecov| image:: https://codecov.io/gh/huidongchen/simba/branch/master/graph/badge.svg?token=ZUA70S1LUU 73 | :target: https://codecov.io/gh/huidongchen/simba 74 | -------------------------------------------------------------------------------- /docs/source/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | python_files = 'test_*.py' 3 | testpaths = 'tests/' -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.17.0 2 | pandas>=1.0,!=1.1 # required by Anndata 3 | anndata>=0.7.4 4 | # h5py<3.0.0 # avoid byte strings but caused building errors 5 | # h5py>=3.4 6 | scikit-learn>=1.2 7 | scipy>=1.4 8 | kneed>=0.7 9 | seaborn>=0.11 10 | matplotlib>=3.3 11 | scikit-misc>=0.1.3 12 | adjusttext>=0.7.3 13 | 
umap-learn>=0.3.0 14 | #plotly>=4.14.0 15 | pybedtools>=0.8.0 16 | # bedtools>=2.29.0 # not available in pip 17 | tables -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | if sys.version_info < (3, 7): 4 | sys.exit('simba requires Python >= 3.7') 5 | 6 | from setuptools import setup, find_packages 7 | from pathlib import Path 8 | 9 | version = {} 10 | with open("simba/_version.py") as fp: 11 | exec(fp.read(), version) 12 | 13 | 14 | setup( 15 | name='simba', 16 | version=version['__version__'], 17 | author='Huidong Chen', 18 | author_email='hd7chen AT gmail DOT com', 19 | license='BSD', 20 | description='SIngle-cell eMBedding Along with features', 21 | long_description=Path('README.md').read_text('utf-8'), 22 | long_description_content_type="text/markdown", 23 | url='https://github.com/pinellolab/simba', 24 | packages=find_packages(), 25 | classifiers=[ 26 | "Programming Language :: Python :: 3", 27 | "License :: OSI Approved :: BSD License", 28 | "Operating System :: OS Independent", 29 | ], 30 | python_requires='>=3.7', 31 | install_requires=[ 32 | x.strip() for x in 33 | Path('requirements.txt').read_text('utf-8').splitlines() 34 | ], 35 | include_package_data=True, 36 | package_data={"simba": ["data/gene_anno/*.bed"]} 37 | ) 38 | -------------------------------------------------------------------------------- /simba/__init__.py: -------------------------------------------------------------------------------- 1 | """SIngle-cell eMBedding Along with features""" 2 | 3 | from ._settings import settings 4 | from . import preprocessing as pp 5 | from . import tools as tl 6 | from . import plotting as pl 7 | from .readwrite import * 8 | from . 
import datasets 9 | from ._version import __version__ 10 | 11 | 12 | import sys 13 | # needed when building doc (borrowed from scanpy) 14 | sys.modules.update( 15 | {f'{__name__}.{m}': globals()[m] for m in ['tl', 'pp', 'pl']}) 16 | -------------------------------------------------------------------------------- /simba/_settings.py: -------------------------------------------------------------------------------- 1 | """Configuration for SIMBA""" 2 | 3 | import os 4 | import seaborn as sns 5 | import matplotlib as mpl 6 | 7 | 8 | class SimbaConfig: 9 | """configuration class for SIMBA""" 10 | 11 | def __init__(self, 12 | workdir='./result_simba', 13 | save_fig=False, 14 | n_jobs=1): 15 | self.workdir = workdir 16 | self.save_fig = save_fig 17 | self.n_jobs = n_jobs 18 | self.set_pbg_params() 19 | self.graph_stats = dict() 20 | 21 | def set_figure_params(self, 22 | context='notebook', 23 | style='white', 24 | palette='deep', 25 | font='sans-serif', 26 | font_scale=1.1, 27 | color_codes=True, 28 | dpi=80, 29 | dpi_save=150, 30 | fig_size=[5.4, 4.8], 31 | rc=None): 32 | """ Set global parameters for figures. Modified from sns.set() 33 | 34 | Parameters 35 | ---------- 36 | context : string or dict 37 | Plotting context parameters, see `seaborn.plotting_context` 38 | style: `string`,optional (default: 'white') 39 | Axes style parameters, see `seaborn.axes_style` 40 | palette : string or sequence 41 | Color palette, see `seaborn.color_palette` 42 | font_scale: `float`, optional (default: 1.1) 43 | Separate scaling factor to independently 44 | scale the size of the font elements. 45 | color_codes : `bool`, optional (default: True) 46 | If ``True`` and ``palette`` is a seaborn palette, 47 | remap the shorthand color codes (e.g. "b", "g", "r", etc.) 48 | to the colors from this palette. 49 | dpi: `int`,optional (default: 80) 50 | Resolution of rendered figures. 51 | dpi_save: `int`,optional (default: 150) 52 | Resolution of saved figures. 
53 | rc: `dict`,optional (default: None) 54 | rc settings properties. 55 | Parameter mappings to override the values in the preset style. 56 | Please see "`matplotlibrc file 57 | `__" 58 | """ 59 | sns.set(context=context, 60 | style=style, 61 | palette=palette, 62 | font=font, 63 | font_scale=font_scale, 64 | color_codes=color_codes, 65 | rc={'figure.dpi': dpi, 66 | 'savefig.dpi': dpi_save, 67 | 'figure.figsize': fig_size, 68 | 'image.cmap': 'viridis', 69 | 'lines.markersize': 6, 70 | 'legend.columnspacing': 0.1, 71 | 'legend.borderaxespad': 0.1, 72 | 'legend.handletextpad': 0.1, 73 | 'pdf.fonttype': 42, 74 | }) 75 | if rc is not None: 76 | assert isinstance(rc, dict), "rc must be dict" 77 | for key, value in rc.items(): 78 | if key in mpl.rcParams.keys(): 79 | mpl.rcParams[key] = value 80 | else: 81 | raise Exception("unrecognized property '%s'" % key) 82 | 83 | def set_workdir(self, workdir=None): 84 | """Set working directory. 85 | 86 | Parameters 87 | ---------- 88 | workdir: `str`, optional (default: None) 89 | Working directory. 90 | 91 | Returns 92 | ------- 93 | """ 94 | if workdir is None: 95 | workdir = self.workdir 96 | print("Using default working directory.") 97 | if not os.path.exists(workdir): 98 | os.makedirs(workdir) 99 | self.workdir = workdir 100 | self.set_pbg_params() 101 | print('Saving results in: %s' % workdir) 102 | 103 | def set_pbg_params(self, config=None): 104 | """Set PBG parameters 105 | 106 | Parameters 107 | ---------- 108 | config : `dict`, optional (default: None) 109 | PBG training configuration parameters. 110 | By default it resets parameters to the default setting. 
111 | 112 | Returns 113 | ------- 114 | """ 115 | if config is None: 116 | config = dict( 117 | # I/O data 118 | entity_path="", 119 | edge_paths=["", ], 120 | checkpoint_path="", 121 | 122 | # Graph structure 123 | entities={}, 124 | relations=[], 125 | dynamic_relations=False, 126 | 127 | # Scoring model 128 | dimension=50, 129 | global_emb=False, 130 | comparator='dot', 131 | 132 | # Training 133 | num_epochs=10, 134 | workers=4, 135 | num_batch_negs=50, 136 | num_uniform_negs=50, 137 | loss_fn='softmax', 138 | lr=0.1, 139 | 140 | early_stopping=False, 141 | regularization_coef=0.0, 142 | wd=0.0, 143 | wd_interval=50, 144 | 145 | # Evaluation during training 146 | eval_fraction=0.05, 147 | eval_num_batch_negs=50, 148 | eval_num_uniform_negs=50, 149 | 150 | checkpoint_preservation_interval=None, 151 | ) 152 | assert isinstance(config, dict), "`config` must be dict" 153 | self.pbg_params = config 154 | 155 | 156 | settings = SimbaConfig() 157 | -------------------------------------------------------------------------------- /simba/_utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions and classes""" 2 | 3 | import numpy as np 4 | from kneed import KneeLocator 5 | import tables 6 | from anndata import AnnData 7 | 8 | 9 | def locate_elbow(x, y, S=10, min_elbow=0, 10 | curve='convex', direction='decreasing', online=False, 11 | **kwargs): 12 | """Detect knee points 13 | 14 | Parameters 15 | ---------- 16 | x : `array-like` 17 | x values 18 | y : `array-like` 19 | y values 20 | S : `float`, optional (default: 10) 21 | Sensitivity 22 | min_elbow: `int`, optional (default: 0) 23 | The minimum elbow location 24 | curve: `str`, optional (default: 'convex') 25 | Choose from {'convex','concave'} 26 | If 'concave', algorithm will detect knees, 27 | If 'convex', algorithm will detect elbows. 
28 | direction: `str`, optional (default: 'decreasing') 29 | Choose from {'decreasing','increasing'} 30 | online: `bool`, optional (default: False) 31 | kneed will correct old knee points if True, 32 | kneed will return first knee if False. 33 | **kwargs: `dict`, optional 34 | Extra arguments to KneeLocator. 35 | 36 | Returns 37 | ------- 38 | elbow: `int` 39 | elbow point 40 | """ 41 | kneedle = KneeLocator(x[int(min_elbow):], y[int(min_elbow):], 42 | S=S, curve=curve, 43 | direction=direction, 44 | online=online, 45 | **kwargs, 46 | ) 47 | if kneedle.elbow is None: 48 | elbow = len(y) 49 | else: 50 | elbow = int(kneedle.elbow) 51 | return elbow 52 | 53 | 54 | # modified from 55 | # scanpy https://github.com/theislab/scanpy/blob/master/scanpy/readwrite.py 56 | def _read_legacy_10x_h5(filename, genome=None): 57 | """ 58 | Read hdf5 file from Cell Ranger v2 or earlier versions. 59 | """ 60 | with tables.open_file(str(filename), 'r') as f: 61 | try: 62 | children = [x._v_name for x in f.list_nodes(f.root)] 63 | if not genome: 64 | if len(children) > 1: 65 | raise ValueError( 66 | f"'{filename}' contains more than one genome. " 67 | "For legacy 10x h5 " 68 | "files you must specify the genome " 69 | "if more than one is present. " 70 | f"Available genomes are: {children}" 71 | ) 72 | genome = children[0] 73 | elif genome not in children: 74 | raise ValueError( 75 | f"Could not find genome '{genome}' in '{filename}'. 
" 76 | f'Available genomes are: {children}' 77 | ) 78 | dsets = {} 79 | for node in f.walk_nodes('/' + genome, 'Array'): 80 | dsets[node.name] = node.read() 81 | # AnnData works with csr matrices 82 | # 10x stores the transposed data, so we do the transposition 83 | from scipy.sparse import csr_matrix 84 | 85 | M, N = dsets['shape'] 86 | data = dsets['data'] 87 | if dsets['data'].dtype == np.dtype('int32'): 88 | data = dsets['data'].view('float32') 89 | data[:] = dsets['data'] 90 | matrix = csr_matrix( 91 | (data, dsets['indices'], dsets['indptr']), 92 | shape=(N, M), 93 | ) 94 | # the csc matrix is automatically the transposed csr matrix 95 | # as scanpy expects it, so, no need for a further transposition 96 | adata = AnnData( 97 | matrix, 98 | obs=dict(obs_names=dsets['barcodes'].astype(str)), 99 | var=dict( 100 | var_names=dsets['gene_names'].astype(str), 101 | gene_ids=dsets['genes'].astype(str), 102 | ), 103 | ) 104 | return adata 105 | except KeyError: 106 | raise Exception('File is missing one or more required datasets.') 107 | 108 | 109 | # modified from 110 | # scanpy https://github.com/theislab/scanpy/blob/master/scanpy/readwrite.py 111 | def _read_v3_10x_h5(filename): 112 | """ 113 | Read hdf5 file from Cell Ranger v3 or later versions. 
114 | """ 115 | with tables.open_file(str(filename), 'r') as f: 116 | try: 117 | dsets = {} 118 | for node in f.walk_nodes('/matrix', 'Array'): 119 | dsets[node.name] = node.read() 120 | from scipy.sparse import csr_matrix 121 | 122 | M, N = dsets['shape'] 123 | data = dsets['data'] 124 | if dsets['data'].dtype == np.dtype('int32'): 125 | data = dsets['data'].view('float32') 126 | data[:] = dsets['data'] 127 | matrix = csr_matrix( 128 | (data, dsets['indices'], dsets['indptr']), 129 | shape=(N, M), 130 | ) 131 | adata = AnnData( 132 | matrix, 133 | obs=dict(obs_names=dsets['barcodes'].astype(str)), 134 | var=dict( 135 | var_names=dsets['name'].astype(str), 136 | gene_ids=dsets['id'].astype(str), 137 | feature_types=dsets['feature_type'].astype(str), 138 | genome=dsets['genome'].astype(str), 139 | ), 140 | ) 141 | return adata 142 | except KeyError: 143 | raise Exception('File is missing one or more required datasets.') 144 | -------------------------------------------------------------------------------- /simba/_version.py: -------------------------------------------------------------------------------- 1 | """Version information""" 2 | 3 | __version__ = "1.2" 4 | -------------------------------------------------------------------------------- /simba/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | """Builtin Datasets.""" 2 | 3 | from ._datasets import ( 4 | rna_10xpmbc3k, 5 | rna_han2018, 6 | rna_tmc2018, 7 | rna_baron2016, 8 | rna_muraro2016, 9 | rna_segerstolpe2016, 10 | rna_wang2016, 11 | rna_xin2016, 12 | atac_buenrostro2018, 13 | atac_10xpbmc5k, 14 | atac_chen2019, 15 | atac_cusanovich2018_subset, 16 | multiome_ma2020_fig4, 17 | multiome_chen2019, 18 | multiome_10xpbmc10k 19 | ) 20 | -------------------------------------------------------------------------------- /simba/datasets/_datasets.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | 
from tqdm import tqdm 3 | import os 4 | 5 | from .._settings import settings 6 | from ..readwrite import read_h5ad 7 | 8 | 9 | class DownloadProgressBar(tqdm): 10 | def update_to(self, 11 | b=1, 12 | bsize=1, 13 | tsize=None): 14 | if tsize is not None: 15 | self.total = tsize 16 | self.update(b * bsize - self.n) 17 | 18 | 19 | def download_url(url, 20 | output_path, 21 | desc=None): 22 | if desc is None: 23 | desc = url.split('/')[-1] 24 | with DownloadProgressBar( 25 | unit='B', 26 | unit_scale=True, 27 | miniters=1, 28 | desc=desc 29 | ) as t: 30 | urllib.request.urlretrieve( 31 | url, 32 | filename=output_path, 33 | reporthook=t.update_to) 34 | 35 | 36 | def rna_10xpmbc3k(): 37 | """10X human peripheral blood mononuclear cells (PBMCs) scRNA-seq data 38 | 39 | Returns 40 | ------- 41 | adata: `AnnData` 42 | Anndata object 43 | """ 44 | url = 'https://www.dropbox.com/s/087wuliddmbp3oe/rna_seq.h5ad?dl=1' 45 | filename = 'rna_10xpmbc3k.h5ad' 46 | filepath = os.path.join(settings.workdir, 'data') 47 | fullpath = os.path.join(filepath, filename) 48 | if(not os.path.exists(fullpath)): 49 | print('Downloading data ...') 50 | os.makedirs(filepath, exist_ok=True) 51 | download_url(url, 52 | fullpath, 53 | desc=filename) 54 | print(f'Downloaded to {filepath}.') 55 | adata = read_h5ad(fullpath) 56 | return adata 57 | 58 | 59 | def rna_han2018(): 60 | """single-cell microwell-seq mouse cell atlas data 61 | 62 | ref: Han, X. et al. Mapping the mouse cell atlas by microwell-seq. 63 | Cell 172, 1091-1107. e1017 (2018). 
64 | 65 | Returns 66 | ------- 67 | adata: `AnnData` 68 | Anndata object 69 | """ 70 | url = 'https://www.dropbox.com/s/nxbszjbir44g99n/rna_seq_mi.h5ad?dl=1' 71 | filename = 'rna_han2018.h5ad' 72 | filepath = os.path.join(settings.workdir, 'data') 73 | fullpath = os.path.join(filepath, filename) 74 | if(not os.path.exists(fullpath)): 75 | print('Downloading data ...') 76 | os.makedirs(filepath, exist_ok=True) 77 | download_url(url, 78 | fullpath, 79 | desc=filename) 80 | print(f'Downloaded to {filepath}.') 81 | adata = read_h5ad(fullpath) 82 | return adata 83 | 84 | 85 | def rna_tmc2018(): 86 | """single-cell Smart-Seq2 mouse cell atlas data 87 | 88 | ref: Tabula Muris Consortium. Single-cell transcriptomics of 20 mouse 89 | organs creates a Tabula Muris. Nature 562, 367-372 (2018). 90 | 91 | Returns 92 | ------- 93 | adata: `AnnData` 94 | Anndata object 95 | """ 96 | url = 'https://www.dropbox.com/s/rnpyp6vfpuiptkz/rna_seq_sm.h5ad?dl=1' 97 | filename = 'rna_tmc2018.h5ad' 98 | filepath = os.path.join(settings.workdir, 'data') 99 | fullpath = os.path.join(filepath, filename) 100 | if(not os.path.exists(fullpath)): 101 | print('Downloading data ...') 102 | os.makedirs(filepath, exist_ok=True) 103 | download_url(url, 104 | fullpath, 105 | desc=filename) 106 | print(f'Downloaded to {filepath}.') 107 | adata = read_h5ad(fullpath) 108 | return adata 109 | 110 | 111 | def rna_baron2016(): 112 | """single-cell RNA-seq human pancreas data 113 | 114 | ref: Baron, M. et al. A single-cell transcriptomic map of the human and 115 | mouse pancreas reveals inter-and intra-cell population structure. Cell 116 | systems 3, 346-360. 
e344 (2016) 117 | 118 | Returns 119 | ------- 120 | adata: `AnnData` 121 | Anndata object 122 | """ 123 | url = 'https://www.dropbox.com/s/bvziclu6d3fdzow/rna_seq_baron.h5ad?dl=1' 124 | filename = 'rna_baron2016.h5ad' 125 | filepath = os.path.join(settings.workdir, 'data') 126 | fullpath = os.path.join(filepath, filename) 127 | if(not os.path.exists(fullpath)): 128 | print('Downloading data ...') 129 | os.makedirs(filepath, exist_ok=True) 130 | download_url(url, 131 | fullpath, 132 | desc=filename) 133 | print(f'Downloaded to {filepath}.') 134 | adata = read_h5ad(fullpath) 135 | return adata 136 | 137 | 138 | def rna_muraro2016(): 139 | """single-cell RNA-seq human pancreas data 140 | 141 | ref: Muraro, M.J. et al. A single-cell transcriptome atlas of the 142 | human pancreas. Cell systems 3, 385-394. e383 (2016). 143 | 144 | Returns 145 | ------- 146 | adata: `AnnData` 147 | Anndata object 148 | """ 149 | url = 'https://www.dropbox.com/s/ginc9rbo4qmobwx/rna_seq_muraro.h5ad?dl=1' 150 | filename = 'rna_muraro2016.h5ad' 151 | filepath = os.path.join(settings.workdir, 'data') 152 | fullpath = os.path.join(filepath, filename) 153 | if(not os.path.exists(fullpath)): 154 | print('Downloading data ...') 155 | os.makedirs(filepath, exist_ok=True) 156 | download_url(url, 157 | fullpath, 158 | desc=filename) 159 | print(f'Downloaded to {filepath}.') 160 | adata = read_h5ad(fullpath) 161 | return adata 162 | 163 | 164 | def rna_segerstolpe2016(): 165 | """single-cell RNA-seq human pancreas data 166 | 167 | ref: Segerstolpe, Å. et al. Single-cell transcriptome profiling of human 168 | pancreatic islets in health and type 2 diabetes. 169 | Cell metabolism 24, 593-607 (2016). 
170 | 171 | Returns 172 | ------- 173 | adata: `AnnData` 174 | Anndata object 175 | """ 176 | url = 'https://www.dropbox.com/s/qomnf4860jwm9pd/rna_seq_segerstolpe.h5ad?dl=1' 177 | filename = 'rna_segerstolpe2016.h5ad' 178 | filepath = os.path.join(settings.workdir, 'data') 179 | fullpath = os.path.join(filepath, filename) 180 | if(not os.path.exists(fullpath)): 181 | print('Downloading data ...') 182 | os.makedirs(filepath, exist_ok=True) 183 | download_url(url, 184 | fullpath, 185 | desc=filename) 186 | print(f'Downloaded to {filepath}.') 187 | adata = read_h5ad(fullpath) 188 | return adata 189 | 190 | 191 | def rna_wang2016(): 192 | """single-cell RNA-seq human pancreas data 193 | 194 | ref: Wang, Y.J. et al. Single-cell transcriptomics of the human endocrine 195 | pancreas. Diabetes 65, 3028-3038 (2016). 196 | 197 | Returns 198 | ------- 199 | adata: `AnnData` 200 | Anndata object 201 | """ 202 | url = 'https://www.dropbox.com/s/9tv44nugwpx9t4c/rna_seq_wang.h5ad?dl=1' 203 | filename = 'rna_wang2016.h5ad' 204 | filepath = os.path.join(settings.workdir, 'data') 205 | fullpath = os.path.join(filepath, filename) 206 | if(not os.path.exists(fullpath)): 207 | print('Downloading data ...') 208 | os.makedirs(filepath, exist_ok=True) 209 | download_url(url, 210 | fullpath, 211 | desc=filename) 212 | print(f'Downloaded to {filepath}.') 213 | adata = read_h5ad(fullpath) 214 | return adata 215 | 216 | 217 | def rna_xin2016(): 218 | """single-cell RNA-seq human pancreas data 219 | 220 | ref: Xin, Y. et al. RNA sequencing of single human islet cells reveals 221 | type 2 diabetes genes. Cell metabolism 24, 608-615 (2016). 
def atac_buenrostro2018():
    """Load the Buenrostro et al. (2018) human blood scATAC-seq dataset.

    The local copy is ``atac_buenrostro2018.h5ad`` inside the ``data``
    folder of ``settings.workdir``; a download is triggered only when
    that copy is absent.

    ref: Buenrostro, J.D. et al. Integrated Single-Cell Analysis Maps the
    Continuous Regulatory Landscape of Human Hematopoietic Differentiation.
    Cell 173, 1535-1548 e1516 (2018).

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    link = 'https://www.dropbox.com/s/7hxjqgdxtbna1tm/atac_seq.h5ad?dl=1'
    name = 'atac_buenrostro2018.h5ad'
    folder = os.path.join(settings.workdir, 'data')
    target = os.path.join(folder, name)

    needs_download = not os.path.exists(target)
    if needs_download:
        print('Downloading data ...')
        os.makedirs(folder, exist_ok=True)
        download_url(link, target, desc=name)
        print(f'Downloaded to {folder}.')
    return read_h5ad(target)
def atac_cusanovich2018_subset():
    """Load the downsampled Cusanovich et al. (2018) mouse sci-ATAC-seq data.

    Reads ``atac_cusanovich2018_subset.h5ad`` from the ``data`` folder under
    ``settings.workdir``, downloading it beforehand when it is not there.

    ref: Cusanovich, D.A. et al. A Single-Cell Atlas of In Vivo Mammalian
    Chromatin Accessibility. Cell 174, 1309-1324 e1318 (2018).

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    dataset_file = 'atac_cusanovich2018_subset.h5ad'
    dataset_dir = os.path.join(settings.workdir, 'data')
    dataset_path = os.path.join(dataset_dir, dataset_file)

    if not os.path.exists(dataset_path):
        print('Downloading data ...')
        os.makedirs(dataset_dir, exist_ok=True)
        download_url(
            'https://www.dropbox.com/s/e8iqwm93m33i5wt/atac_seq.h5ad?dl=1',
            dataset_path,
            desc=dataset_file,
        )
        print(f'Downloaded to {dataset_dir}.')

    return read_h5ad(dataset_path)
def multiome_ma2020_fig4():
    """Load the Ma et al. (2020) SHARE-seq mouse skin multiome dataset.

    Two files are involved, one per modality, both stored in the ``data``
    folder under ``settings.workdir``. Each one is downloaded only if it
    is missing; RNA is handled before ATAC.

    ref: Ma, S. et al. Chromatin Potential Identified by Shared Single-Cell
    Profiling of RNA and Chromatin. Cell (2020).

    Returns
    -------
    dict_adata: `dict`
        A dictionary of anndata objects, keyed by 'rna' and 'atac'
    """
    data_dir = os.path.join(settings.workdir, 'data')
    # modality -> (download link, local file name); order matters because
    # it fixes the order of downloads, reads and keys in the result.
    sources = {
        'rna': (
            'https://www.dropbox.com/s/gmmf77l8kzle6o7/rna_seq_fig4.h5ad?dl=1',
            'multiome_ma2020_fig4_rna.h5ad',
        ),
        'atac': (
            'https://www.dropbox.com/s/ts0v2y2m5fcumcb/atac_seq_fig4.h5ad?dl=1',
            'multiome_ma2020_fig4_atac.h5ad',
        ),
    }

    local_files = {}
    for modality, (link, h5ad_name) in sources.items():
        target = os.path.join(data_dir, h5ad_name)
        local_files[modality] = target
        if os.path.exists(target):
            continue
        print('Downloading data ...')
        os.makedirs(data_dir, exist_ok=True)
        download_url(link, target, desc=h5ad_name)
        print(f'Downloaded to {data_dir}.')

    # Reading happens only after both files are in place.
    return {
        modality: read_h5ad(path)
        for modality, path in local_files.items()
    }
'multiome_10xpbmc10k_rna.h5ad' 439 | filename_atac = 'multiome_10xpbmc10k_atac.h5ad' 440 | filepath = os.path.join(settings.workdir, 'data') 441 | fullpath_rna = os.path.join(filepath, filename_rna) 442 | fullpath_atac = os.path.join(filepath, filename_atac) 443 | 444 | if(not os.path.exists(fullpath_rna)): 445 | print('Downloading data ...') 446 | os.makedirs(filepath, exist_ok=True) 447 | download_url(url_rna, 448 | fullpath_rna, 449 | desc=filename_rna) 450 | print(f'Downloaded to {filepath}.') 451 | if(not os.path.exists(fullpath_atac)): 452 | print('Downloading data ...') 453 | os.makedirs(filepath, exist_ok=True) 454 | download_url(url_atac, 455 | fullpath_atac, 456 | desc=filename_atac) 457 | print(f'Downloaded to {filepath}.') 458 | adata_rna = read_h5ad(fullpath_rna) 459 | adata_atac = read_h5ad(fullpath_atac) 460 | dict_adata = {'rna': adata_rna, 461 | 'atac': adata_atac} 462 | return dict_adata 463 | -------------------------------------------------------------------------------- /simba/plotting/__init__.py: -------------------------------------------------------------------------------- 1 | """Plotting""" 2 | 3 | from ._plot import ( 4 | pca_variance_ratio, 5 | pcs_features, 6 | variable_genes, 7 | violin, 8 | hist, 9 | umap, 10 | discretize, 11 | node_similarity, 12 | svd_nodes, 13 | ) 14 | from ._post_training import ( 15 | pbg_metrics, 16 | entity_metrics, 17 | entity_barcode, 18 | query 19 | ) 20 | -------------------------------------------------------------------------------- /simba/plotting/_palettes.py: -------------------------------------------------------------------------------- 1 | """Color palettes in addition to matplotlib's palettes. 
# vega_10 with the colorblindness adjustments used by scanpy
# See https://github.com/theislab/scanpy/issues/387
vega_10 = [colors.to_hex(rgb) for rgb in cm.tab10.colors]
vega_10_scanpy = list(vega_10)
vega_10_scanpy[2] = "#279e68"  # green
vega_10_scanpy[4] = "#aa40fc"  # purple
vega_10_scanpy[8] = "#b5bd61"  # khaki

# default matplotlib 2.0 palette
# see 'category20' on https://github.com/vega/vega/wiki/Scales#scale-range-literals  # noqa
vega_20 = [colors.to_hex(rgb) for rgb in cm.tab20.colors]

# reordered: dark shades first, then light shades, grey dropped from both,
# followed by two manually added colors
vega_20_scanpy = (
    vega_20[0:14:2] + vega_20[16::2]      # dark without grey
    + vega_20[1:15:2] + vega_20[17::2]    # light without grey
    + ["#ad494a", "#8c6d31"]              # manual additions
)
vega_20_scanpy[2] = vega_10_scanpy[2]
vega_20_scanpy[4] = vega_10_scanpy[4]
vega_20_scanpy[7] = vega_10_scanpy[8]  # khaki, shifted by the missing grey
# TODO: also replace pale colors if necessary

# alias, not a copy
default_20 = vega_20_scanpy

# https://graphicdesign.stackexchange.com/questions/3682/where-can-i-find-a-large-palette-set-of-contrasting-colors-for-coloring-many-d
# update 1
# orig reference http://epub.wu.ac.at/1692/1/document.pdf
zeileis_28 = [
    "#023fa5", "#7d87b9", "#bec1d4", "#d6bcc0", "#bb7784", "#8e063b",
    "#4a6fe3", "#8595e1", "#b5bbe3", "#e6afb9", "#e07b91", "#d33f6a",
    "#11c638", "#8dd593", "#c6dec7", "#ead3c6", "#f0b98d", "#ef9708",
    "#0fcfc0", "#9cded6", "#d5eae7", "#f3e1eb", "#f6c4e1", "#f79cd4",
    "#7f7f7f", "#c7c7c7",
    "#1CE6FF", "#336600",  # these last ones were added
]

# alias, not a copy
default_28 = zeileis_28

# from http://godsnotwheregodsnot.blogspot.de/2012/09/color-distribution-methodology.html  # noqa
# "#000000" is left out on purpose: annotations are often drawn in black
godsnot_102 = [
    "#FFFF00", "#1CE6FF", "#FF34FF", "#FF4A46", "#008941", "#006FA6",
    "#A30059", "#FFDBE5", "#7A4900", "#0000A6", "#63FFAC", "#B79762",
    "#004D43", "#8FB0FF", "#997D87", "#5A0007", "#809693", "#6A3A4C",
    "#1B4400", "#4FC601", "#3B5DFF", "#4A3B53", "#FF2F80", "#61615A",
    "#BA0900", "#6B7900", "#00C2A0", "#FFAA92", "#FF90C9", "#B903AA",
    "#D16100", "#DDEFFF", "#000035", "#7B4F4B", "#A1C299", "#300018",
    "#0AA6D8", "#013349", "#00846F", "#372101", "#FFB500", "#C2FFED",
    "#A079BF", "#CC0744", "#C0B9B2", "#C2FF99", "#001E09", "#00489C",
    "#6F0062", "#0CBD66", "#EEC3FF", "#456D75", "#B77B68", "#7A87A1",
    "#788D66", "#885578", "#FAD09F", "#FF8A9A", "#D157A0", "#BEC459",
    "#456648", "#0086ED", "#886F4C", "#34362D", "#B4A8BD", "#00A6AA",
    "#452C2C", "#636375", "#A3C8C9", "#FF913F", "#938A81", "#575329",
    "#00FECF", "#B05B6F", "#8CD0FF", "#3B9700", "#04F757", "#C8A1A1",
    "#1E6E00", "#7900D7", "#A77500", "#6367A9", "#A05837", "#6B002C",
    "#772600", "#D790FF", "#9B9700", "#549E79", "#FFF69F", "#201625",
    "#72418F", "#BC23FF", "#99ADC0", "#3A2465", "#922329", "#5B4534",
    "#FDE8DC", "#404E55", "#0089A3", "#CB7E98", "#A4E804", "#324E72",
]

# alias, not a copy
default_102 = godsnot_102
def pbg_metrics(metrics=None,
                path_emb=None,
                fig_size=(5, 3),
                fig_ncol=1,
                save_fig=None,
                fig_path=None,
                fig_name='pbg_metrics.pdf',
                pad=1.08,
                w_pad=None,
                h_pad=None,
                **kwargs):
    """Plot PBG training metrics

    Reads ``training_stats.json`` from `path_emb` line by line, keeps the
    records that contain a ``'stats'`` entry, and draws one scatter panel
    for the training loss, one for the validation loss and one for every
    requested metric.

    Parameters
    ----------
    metrics: `list`, optional (default: ['mrr'])
        Evaluation metrics for PBG training. Possible metrics:

        - 'pos_rank' : the average of the ranks of all positives
          (lower is better, best is 1).
        - 'mrr' : the average of the reciprocal of the ranks of all positives
          (higher is better, best is 1).
        - 'r1' : the fraction of positives that rank better than
          all their negatives, i.e., have a rank of 1
          (higher is better, best is 1).
        - 'r10' : the fraction of positives that rank in the top 10
          among their negatives
          (higher is better, best is 1).
        - 'r50' : the fraction of positives that rank in the top 50
          among their negatives
          (higher is better, best is 1).
        - 'auc' : Area Under the Curve (AUC)
    path_emb: `str`, optional (default: None)
        Path to directory for pbg embedding model.
        If None, .settings.pbg_params['checkpoint_path'] will be used.
    pad: `float`, optional (default: 1.08)
        Padding between the figure edge and the edges of subplots,
        as a fraction of the font size.
    h_pad, w_pad: `float`, optional (default: None)
        Padding (height/width) between edges of adjacent subplots,
        as a fraction of the font size. Defaults to pad.
    fig_size: `tuple`, optional (default: (5, 3))
        figure size of a single panel.
    fig_ncol: `int`, optional (default: 1)
        the number of columns of the figure panel
    save_fig: `bool`, optional (default: None)
        if True, save the figure.
        If None, `settings.save_fig` will be used.
    fig_path: `str`, optional (default: None)
        If save_fig is True, specify figure path.
        If None, the 'figures' folder under `settings.workdir` is used.
    fig_name: `str`, optional (default: 'pbg_metrics.pdf')
        if save_fig is True, specify figure name.
    **kwargs: `dict`, optional
        Other keyword arguments are passed through to ``ax.scatter``

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If an entry of `metrics` is not one of the supported names, or if
        the stats file contains no training record.
    """
    # A None sentinel replaces the former mutable list default.
    if metrics is None:
        metrics = ['mrr']
    if save_fig is None:
        save_fig = settings.save_fig
    if fig_path is None:
        fig_path = os.path.join(settings.workdir, 'figures')

    assert isinstance(metrics, list), "`metrics` must be list"
    valid_metrics = ('pos_rank', 'mrr', 'r1', 'r10', 'r50', 'auc')
    for x in metrics:
        if x not in valid_metrics:
            raise ValueError(f'unrecognized metric {x}')
    if path_emb is None:
        path_emb = settings.pbg_params['checkpoint_path']

    path_stats = os.path.join(path_emb, 'training_stats.json')
    training_loss = []
    eval_stats_before = dict()
    with open(path_stats, 'r') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines in the stats file
            line_json = json.loads(line)
            # Only records carrying training stats contribute a data point.
            if 'stats' not in line_json:
                continue
            training_loss.append(line_json['stats']['metrics']['loss'])
            line_stats_before = line_json['eval_stats_before']['metrics']
            for key, value in line_stats_before.items():
                eval_stats_before.setdefault(key, []).append(value)
    if not training_loss:
        raise ValueError(f'no training stats found in {path_stats}')

    # Size the table from what was actually read instead of from
    # settings.pbg_params['num_epochs']: the two disagree when `path_emb`
    # points to a model trained with a different configuration, and the
    # column assignments below would then fail on a length mismatch.
    n_records = len(training_loss)
    df_metrics = pd.DataFrame(index=range(n_records))
    df_metrics['epoch'] = range(n_records)
    df_metrics['training_loss'] = training_loss
    df_metrics['validation_loss'] = eval_stats_before['loss']
    for x in metrics:
        df_metrics[x] = eval_stats_before[x]

    # One panel per column except the 'epoch' column used as x axis.
    panel_names = df_metrics.columns[1:]
    fig_nrow = int(np.ceil(len(panel_names)/fig_ncol))
    fig = plt.figure(figsize=(fig_size[0]*fig_ncol*1.05,
                              fig_size[1]*fig_nrow))
    dict_palette = generate_palette(panel_names.values)
    for i, metric in enumerate(panel_names):
        ax_i = fig.add_subplot(fig_nrow, fig_ncol, i+1)
        ax_i.scatter(df_metrics['epoch'],
                     df_metrics[metric],
                     c=dict_palette[metric],
                     **kwargs)
        ax_i.set_title(metric)
        ax_i.set_xlabel('epoch')
        ax_i.set_ylabel(metric)
    # Operate on `fig` explicitly rather than on pyplot's current figure.
    fig.tight_layout(pad=pad, h_pad=h_pad, w_pad=w_pad)
    if save_fig:
        os.makedirs(fig_path, exist_ok=True)
        fig.savefig(os.path.join(fig_path, fig_name),
                    pad_inches=1,
                    bbox_inches='tight')
        plt.close(fig)
167 | Possible values: 168 | - max (The average maximum dot product of top-rank reference entities, 169 | based on normalized dot product) 170 | - std (standard deviation of reference entities, 171 | based on dot product) 172 | - gini (Gini coefficients of reference entities, 173 | based on softmax probability) 174 | - entropy (The entropy of reference entities, 175 | based on softmax probability) 176 | show_texts : `bool`, optional (default: True) 177 | If True, text annotation will be shown. 178 | show_cutoff : `bool`, optional (default: False) 179 | If True, cutoff of `x` and `y` will be shown. 180 | show_contour : `bool`, optional (default: True) 181 | If True, the plot will overlaid with contours 182 | texts: `list` optional (default: None) 183 | Entity names to plot 184 | text_size : `int`, optional (default: 10) 185 | The text size 186 | text_expand : `tuple`, optional (default: (1.05, 1.2)) 187 | Two multipliers (x, y) by which to expand the bounding box of texts 188 | when repelling them from each other/points/other objects. 189 | cutoff_x : `float`, optional (default: 0) 190 | Cutoff of axis x 191 | cutoff_y : `float`, optional (default: 0) 192 | Cutoff of axis y 193 | levels: `int`, optional (default: 6) 194 | Number of contour levels or values to draw contours at 195 | thresh: `float`, optional ([0, 1], default: 0.05) 196 | Lowest iso-proportion level at which to draw a contour line. 197 | pad: `float`, optional (default: 1.08) 198 | Padding between the figure edge and the edges of subplots, 199 | as a fraction of the font size. 200 | h_pad, w_pad: `float`, optional (default: None) 201 | Padding (height/width) between edges of adjacent subplots, 202 | as a fraction of the font size. Defaults to pad. 203 | fig_size: `tuple`, optional (default: None) 204 | figure size. 205 | If None, `mpl.rcParams['figure.figsize']` will be used. 
206 | fig_ncol: `int`, optional (default: 1) 207 | the number of columns of the figure panel 208 | save_fig: `bool`, optional (default: False) 209 | if True,save the figure. 210 | fig_path: `str`, optional (default: None) 211 | If save_fig is True, specify figure path. 212 | fig_name: `str`, optional (default: 'plot_umap.pdf') 213 | if save_fig is True, specify figure name. 214 | 215 | Returns 216 | ------- 217 | None 218 | """ 219 | if fig_size is None: 220 | fig_size = mpl.rcParams['figure.figsize'] 221 | if save_fig is None: 222 | save_fig = settings.save_fig 223 | if fig_path is None: 224 | fig_path = os.path.join(settings.workdir, 'figures') 225 | 226 | assert (x in ['max', 'std', 'gini', 'entropy']), \ 227 | "x must be one of ['max','std','gini','entropy']" 228 | assert (y in ['max', 'std', 'gini', 'entropy']), \ 229 | "y must be one of ['max','std','gini','entropy']" 230 | 231 | fig, ax = plt.subplots(figsize=fig_size) 232 | ax.scatter(adata_cmp.var[x], 233 | adata_cmp.var[y], 234 | s=size, 235 | **kwargs) 236 | if show_texts: 237 | if texts is not None: 238 | plt_texts = [plt.text(adata_cmp.var[x][t], 239 | adata_cmp.var[y][t], 240 | t, 241 | fontdict={'family': 'serif', 242 | 'color': 'black', 243 | 'weight': 'normal', 244 | 'size': text_size}) 245 | for t in texts] 246 | else: 247 | if x == 'entropy': 248 | ranks_x = rankdata(-adata_cmp.var[x]) 249 | else: 250 | ranks_x = rankdata(adata_cmp.var[x]) 251 | if y == 'entropy': 252 | ranks_y = rankdata(-adata_cmp.var[y]) 253 | else: 254 | ranks_y = rankdata(adata_cmp.var[y]) 255 | ids = np.argsort(ranks_x + ranks_y)[::-1][:n_texts] 256 | plt_texts = [plt.text(adata_cmp.var[x][i], 257 | adata_cmp.var[y][i], 258 | adata_cmp.var_names[i], 259 | fontdict={'family': 'serif', 260 | 'color': 'black', 261 | 'weight': 'normal', 262 | 'size': text_size}) 263 | for i in ids] 264 | adjust_text(plt_texts, 265 | expand=text_expand, 266 | arrowprops=dict(arrowstyle='-', color='black')) 267 | if show_cutoff: 268 | 
def entity_barcode(adata_cmp,
                   entities,
                   anno_ref=None,
                   layer='softmax',
                   palette=None,
                   alpha=0.8,
                   linewidths=1,
                   show_cutoff=False,
                   cutoff=0.5,
                   min_rank=None,
                   max_rank=None,
                   fig_size=(6, 2),
                   fig_ncol=1,
                   save_fig=None,
                   fig_path=None,
                   fig_name='plot_barcode.pdf',
                   pad=1.08,
                   w_pad=None,
                   h_pad=None,
                   **kwargs
                   ):
    """Plot query entity barcode

    For every entity, the observations of `adata_cmp` are sorted by their
    score in decreasing order and each observation whose rank falls in
    ``[min_rank, max_rank)`` is drawn as a vertical stem from 0 to its score.

    Parameters
    ----------
    adata_cmp : `AnnData`
        Anndata object from `compare_entities`
    entities : `list`
        Entity names to plot.
    anno_ref : `str`, optional (default: None)
        Annotation (a column of `adata_cmp.obs`) used to color
        the reference entities. If None, a single color is used.
    layer : `str`, optional (default: 'softmax')
        Layer to use make barcode plots. If None, `adata_cmp.X` is used.
    palette : `dict`, optional (default: None)
        Color palette used for `anno_ref`
    alpha : `float`, optional (default: 0.8)
        0.0 transparent through 1.0 opaque
    linewidths : `int`, optional (default: 1)
        The width of each line.
    show_cutoff : `bool`, optional (default: False)
        If True, cutoff will be shown
    cutoff : `float`, optional (default: 0.5)
        Cutoff value for y axis
    min_rank : `int`, optional (default: None)
        Specify the minimum rank of observations to show.
        If None, `min_rank` is set to 0.
    max_rank : `int`, optional (default: None)
        Specify the maximum rank of observations to show.
        If None, `max_rank` is set to the number of observations.
    fig_size: `tuple`, optional (default: (6,2))
        figure size of a single panel.
        If None, `mpl.rcParams['figure.figsize']` will be used.
    fig_ncol: `int`, optional (default: 1)
        the number of columns of the figure panel
    save_fig: `bool`, optional (default: None)
        if True, save the figure.
        If None, `settings.save_fig` will be used.
    fig_path: `str`, optional (default: None)
        If save_fig is True, specify figure path.
        If None, the 'figures' folder under `settings.workdir` is used.
    fig_name: `str`, optional (default: 'plot_barcode.pdf')
        if `save_fig` is True, specify figure name.
    **kwargs: `dict`, optional
        Other keyword arguments are passed through to
        ``mpl.collections.LineCollection``

    Returns
    -------
    None
    """
    if fig_size is None:
        fig_size = mpl.rcParams['figure.figsize']
    if save_fig is None:
        save_fig = settings.save_fig
    if fig_path is None:
        fig_path = os.path.join(settings.workdir, 'figures')

    assert isinstance(entities, list), "`entities` must be list"

    if layer is None:
        X = adata_cmp[:, entities].X.copy()
    else:
        X = adata_cmp[:, entities].layers[layer].copy()
    df_scores = pd.DataFrame(
        data=X,
        index=adata_cmp.obs_names,
        columns=entities)

    if min_rank is None:
        min_rank = 0
    if max_rank is None:
        max_rank = df_scores.shape[0]
    # The same window is applied to ranks, scores and colors below.
    window = slice(min_rank, max_rank)

    n_plots = len(entities)
    fig_nrow = int(np.ceil(n_plots/fig_ncol))
    fig = plt.figure(figsize=(fig_size[0]*fig_ncol*1.05,
                              fig_size[1]*fig_nrow))

    for i, x in enumerate(entities):
        ax_i = fig.add_subplot(fig_nrow, fig_ncol, i+1)
        scores_x_sorted = df_scores[x].sort_values(ascending=False)
        ids_ref = scores_x_sorted.index
        # Positional slicing: ranks are positions in the sorted order,
        # independent of the observation labels in the index.
        ranks = np.arange(len(scores_x_sorted))[window]
        scores = scores_x_sorted.iloc[window]
        lines = [[(xx, 0), (xx, yy)] for xx, yy in zip(ranks, scores)]

        if anno_ref is not None and palette is not None:
            colors = [palette[adata_cmp.obs.loc[xx, anno_ref]]
                      for xx in ids_ref[window]]
        else:
            if anno_ref is None:
                colors = get_colors(np.array([""]*len(scores_x_sorted)))
            else:
                colors = get_colors(adata_cmp[ids_ref, :].obs[anno_ref])
            # Colors are computed on all observations so the mapping does
            # not depend on the window, then cut to the drawn stems.
            # Without this cut, stems starting at `min_rank` > 0 received
            # the colors of ranks 0, 1, 2, ...
            # NOTE(review): assumes `get_colors` returns a sliceable
            # sequence with one color per input element - confirm.
            colors = colors[window]

        stemlines = LineCollection(
            lines,
            colors=colors,
            alpha=alpha,
            linewidths=linewidths,
            **kwargs)
        ax_i.add_collection(stemlines)
        # add_collection does not update the view limits by itself
        ax_i.autoscale()
        ax_i.set_title(x)
        ax_i.set_ylabel(layer)
        ax_i.locator_params(axis='y', tight=True)
        if show_cutoff:
            ax_i.axhline(y=cutoff,
                         color='#CC6F47',
                         linestyle='--')
    # Operate on `fig` explicitly rather than on pyplot's current figure.
    fig.tight_layout(pad=pad, h_pad=h_pad, w_pad=w_pad)
    if save_fig:
        os.makedirs(fig_path, exist_ok=True)
        fig.savefig(os.path.join(fig_path, fig_name),
                    pad_inches=1,
                    bbox_inches='tight')
        plt.close(fig)
`Anndata` 464 | Annotated data matrix. 465 | comp1 : `int`, optional (default: 0) 466 | Component used for x axis. 467 | comp2 : `int`, optional (default: 1) 468 | Component used for y axis. 469 | obsm : `str`, optional (default: 'X_umap') 470 | The field to use for plotting 471 | layer : `str`, optional (default: None) 472 | The layer to use for plotting 473 | color: `list`, optional (default: None) 474 | A list of variables that will produce points with different colors. 475 | e.g. color = ['anno1', 'anno2'] 476 | dict_palette: `dict`,optional (default: None) 477 | A dictionary of palettes for different variables in `color`. 478 | Only valid for categorical/string variables 479 | e.g. dict_palette = {'ann1': {},'ann2': {}} 480 | size: `int` (default: 8) 481 | Point size. 482 | drawing_order: `str` (default: 'random') 483 | The order in which values are plotted, This can be 484 | one of the following values 485 | 486 | - 'original': plot points in the same order as in input dataframe 487 | - 'sorted' : plot points with higher values on top. 488 | - 'random' : plot points in a random order 489 | dict_drawing_order: `dict`,optional (default: None) 490 | A dictionary of drawing_order for different variables in `color`. 491 | Only valid for categorical/string variables 492 | e.g. dict_drawing_order = {'ann1': 'original','ann2': 'sorted'} 493 | show_texts : `bool`, optional (default: False) 494 | If True, text annotation will be shown. 495 | text_size : `int`, optional (default: 10) 496 | The text size. 497 | texts: `list` optional (default: None) 498 | Point names to plot. 499 | text_expand : `tuple`, optional (default: (1.05, 1.2)) 500 | Two multipliers (x, y) by which to expand the bounding box of texts 501 | when repelling them from each other/points/other objects. 502 | n_texts : `int`, optional (default: 8) 503 | The number of texts to plot. 504 | fig_size: `tuple`, optional (default: (4, 4)) 505 | figure size. 
506 | fig_ncol: `int`, optional (default: 3) 507 | the number of columns of the figure panel 508 | fig_legend_order: `dict`,optional (default: None) 509 | Specified order for the appearance of the annotation keys. 510 | Only valid for categorical/string variable 511 | e.g. fig_legend_order = {'ann1':['a','b','c'],'ann2':['aa','bb','cc']} 512 | fig_legend_ncol: `int`, optional (default: 1) 513 | The number of columns that the legend has. 514 | vmin,vmax: `float`, optional (default: None) 515 | The min and max values are used to normalize continuous values. 516 | If None, the respective min and max of continuous values is used. 517 | alpha: `float`, optional (default: 0.9) 518 | The alpha blending value, between 0 (transparent) and 1 (opaque) 519 | for returned points. 520 | alpha_bg: `float`, optional (default: 0.3) 521 | The alpha blending value, between 0 (transparent) and 1 (opaque) 522 | for background points 523 | pad: `float`, optional (default: 1.08) 524 | Padding between the figure edge and the edges of subplots, 525 | as a fraction of the font size. 526 | h_pad, w_pad: `float`, optional (default: None) 527 | Padding (height/width) between edges of adjacent subplots, 528 | as a fraction of the font size. Defaults to pad. 529 | save_fig: `bool`, optional (default: False) 530 | if True,save the figure. 531 | fig_path: `str`, optional (default: None) 532 | If save_fig is True, specify figure path. 533 | fig_name: `str`, optional (default: 'plot_query.pdf') 534 | if save_fig is True, specify figure name. 
535 | 536 | Returns 537 | ------- 538 | None 539 | """ 540 | if fig_size is None: 541 | fig_size = mpl.rcParams['figure.figsize'] 542 | if save_fig is None: 543 | save_fig = settings.save_fig 544 | if fig_path is None: 545 | fig_path = os.path.join(settings.workdir, 'figures') 546 | 547 | if dict_palette is None: 548 | dict_palette = dict() 549 | 550 | query_output = adata.uns['query']['output'] 551 | nn = query_output.index.tolist() # nearest neighbors 552 | if len(nn) == 0: 553 | print('No neighbor entities were found.') 554 | return 555 | query_params = adata.uns['query']['params'] 556 | query_obsm = query_params['obsm'] 557 | query_layer = query_params['layer'] 558 | entity = query_params['entity'] 559 | use_radius = query_params['use_radius'] 560 | r = query_params['r'] 561 | if (obsm == query_obsm) and (layer == query_layer): 562 | pin = query_params['pin'] 563 | else: 564 | if entity is not None: 565 | if obsm is not None: 566 | pin = adata[entity, :].obsm[obsm].copy() 567 | elif layer is not None: 568 | pin = adata[entity, :].layers[layer].copy() 569 | else: 570 | pin = adata[entity, :].X.copy() 571 | else: 572 | pin = None 573 | 574 | if sum(list(map(lambda x: x is not None, 575 | [layer, obsm]))) == 2: 576 | raise ValueError("Only one of `layer` and `obsm` can be used") 577 | if obsm is not None: 578 | X = adata.obsm[obsm].copy() 579 | X_nn = adata[nn, :].obsm[obsm].copy() 580 | elif layer is not None: 581 | X = adata.layers[layer].copy() 582 | X_nn = adata[nn, :].layers[layer].copy() 583 | else: 584 | X = adata.X.copy() 585 | X_nn = adata[nn, :].X.copy() 586 | df_plot = pd.DataFrame(index=adata.obs.index, 587 | data=X[:, [comp1, comp2]], 588 | columns=[f'Dim {comp1}', f'Dim {comp2}']) 589 | df_plot_nn = pd.DataFrame(index=adata[nn, :].obs.index, 590 | data=X_nn[:, [comp1, comp2]], 591 | columns=[f'Dim {comp1}', f'Dim {comp2}']) 592 | if show_texts: 593 | if texts is None: 594 | texts = nn[:n_texts] 595 | if color is None: 596 | list_ax = 
_scatterplot2d(df_plot, 597 | x=f'Dim {comp1}', 598 | y=f'Dim {comp2}', 599 | drawing_order=drawing_order, 600 | size=size, 601 | fig_size=fig_size, 602 | alpha=alpha_bg, 603 | pad=pad, 604 | w_pad=w_pad, 605 | h_pad=h_pad, 606 | save_fig=False, 607 | copy=True, 608 | **kwargs) 609 | else: 610 | color = list(dict.fromkeys(color)) # remove duplicate keys 611 | for ann in color: 612 | if ann in adata.obs_keys(): 613 | df_plot[ann] = adata.obs[ann] 614 | if not is_numeric_dtype(df_plot[ann]): 615 | if 'color' not in adata.uns_keys(): 616 | adata.uns['color'] = dict() 617 | 618 | if ann not in dict_palette.keys(): 619 | if (ann+'_color' in adata.uns['color'].keys()) \ 620 | and \ 621 | (all(np.isin(np.unique(df_plot[ann]), 622 | list(adata.uns['color'] 623 | [ann+'_color'].keys())))): 624 | dict_palette[ann] = \ 625 | adata.uns['color'][ann+'_color'] 626 | else: 627 | dict_palette[ann] = \ 628 | generate_palette(adata.obs[ann]) 629 | adata.uns['color'][ann+'_color'] = \ 630 | dict_palette[ann].copy() 631 | else: 632 | if ann+'_color' not in adata.uns['color'].keys(): 633 | adata.uns['color'][ann+'_color'] = \ 634 | dict_palette[ann].copy() 635 | 636 | elif ann in adata.var_names: 637 | df_plot[ann] = adata.obs_vector(ann) 638 | else: 639 | raise ValueError(f"could not find {ann} in `adata.obs.columns`" 640 | " and `adata.var_names`") 641 | list_ax = _scatterplot2d(df_plot, 642 | x=f'Dim {comp1}', 643 | y=f'Dim {comp2}', 644 | list_hue=color, 645 | hue_palette=dict_palette, 646 | drawing_order=drawing_order, 647 | dict_drawing_order=dict_drawing_order, 648 | size=size, 649 | fig_size=fig_size, 650 | fig_ncol=fig_ncol, 651 | fig_legend_ncol=fig_legend_ncol, 652 | fig_legend_order=fig_legend_order, 653 | vmin=vmin, 654 | vmax=vmax, 655 | alpha=alpha_bg, 656 | pad=pad, 657 | w_pad=w_pad, 658 | h_pad=h_pad, 659 | save_fig=False, 660 | copy=True, 661 | **kwargs) 662 | for ax in list_ax: 663 | ax.scatter( 664 | df_plot_nn[f'Dim {comp1}'], 665 | df_plot_nn[f'Dim {comp2}'], 666 
def get_colors(arr,
               vmin=None,
               vmax=None,
               clip=False):
    """Map every element of an array to a hex color string.

    Parameters
    ----------
    arr: `pd.Series` or `np.ndarray`
        Values to be colored.
        Numeric values are colored with the colormap named by
        `mpl.rcParams['image.cmap']` (resampled to 512 levels).
        String/categorical values get one color per unique category.
    vmin,vmax: `float`, optional (default: None)
        Limits used to normalize numeric values.
        If None, the min/max of `arr` is used.
    clip: `bool`, optional (default: False)
        Passed to `mpl.colors.Normalize`.

    Returns
    -------
    colors: `list`
        One color per element of `arr`, in the same order.

    Raises
    ------
    TypeError
        If `arr` is neither a Series nor an ndarray,
        or its dtype is neither numeric nor string/categorical.
    """
    if not isinstance(arr, (pd.Series, np.ndarray)):
        raise TypeError("`arr` must be pd.Series or np.ndarray")

    # continuous values: look each one up in the default colormap
    if is_numeric_dtype(arr):
        cmap = mpl.cm.get_cmap(mpl.rcParams['image.cmap'], 512)
        lower = min(arr) if vmin is None else vmin
        upper = max(arr) if vmax is None else vmax
        norm = mpl.colors.Normalize(vmin=lower, vmax=upper, clip=clip)
        return [mpl.colors.to_hex(cmap(norm(val))) for val in arr]

    if not (is_string_dtype(arr) or is_categorical_dtype(arr)):
        raise TypeError("unsupported data type for `arr`")

    # discrete values: pick the smallest palette that has enough colors
    categories = np.unique(arr)
    n_cat = len(categories)
    # mpl.style.use('default')
    if len(mpl.rcParams['axes.prop_cycle'].by_key()['color']) >= n_cat:
        cycler = mpl.rcParams['axes.prop_cycle']()
        palette = [mpl.colors.rgb2hex(next(cycler)['color'])
                   for _ in range(n_cat)]
    elif n_cat <= 20:
        palette = default_20
    elif n_cat <= 28:
        palette = default_28
    elif n_cat <= len(default_102):  # 103 colors
        palette = default_102
    else:
        rainbow = mpl.cm.rainbow(np.linspace(0, 1, n_cat))
        palette = [mpl.colors.rgb2hex(rainbow[k, :-1])
                   for k in range(n_cat)]

    # paint the positions of each category with its palette entry
    painted = pd.Series(['']*len(arr))
    for k, category in enumerate(categories):
        positions = np.where(arr == category)[0]
        painted[positions] = palette[k]
    return list(painted)
def generate_palette(arr):
    """Build a value -> hex color dictionary for a categorical array.

    Parameters
    ----------
    arr: `pd.Series` or `np.ndarray`
        String or categorical values.

    Returns
    -------
    dict_palette: `dict`
        Maps each value found in `arr` to its hex color.

    Raises
    ------
    TypeError
        If `arr` is neither a Series nor an ndarray,
        or its dtype is not string/categorical.
    """
    if not isinstance(arr, (pd.Series, np.ndarray)):
        raise TypeError("`arr` must be pd.Series or np.ndarray")
    if not (is_string_dtype(arr) or is_categorical_dtype(arr)):
        raise TypeError("unsupported data type for `arr`")

    categories = np.unique(arr)
    n_cat = len(categories)
    # use the default matplotlib color cycle when it is long enough,
    # otherwise fall back to progressively larger built-in palettes
    # mpl.style.use('default')
    if len(mpl.rcParams['axes.prop_cycle'].by_key()['color']) >= n_cat:
        cycler = mpl.rcParams['axes.prop_cycle']()
        palette = [mpl.colors.rgb2hex(next(cycler)['color'])
                   for _ in range(n_cat)]
    elif n_cat <= 20:
        palette = default_20
    elif n_cat <= 28:
        palette = default_28
    elif n_cat <= len(default_102):  # 103 colors
        palette = default_102
    else:
        rainbow = mpl.cm.rainbow(np.linspace(0, 1, n_cat))
        palette = [mpl.colors.rgb2hex(rainbow[k, :-1])
                   for k in range(n_cat)]

    # per-element colors first, then collapse to a value -> color mapping
    painted = pd.Series(['']*len(arr))
    for k, category in enumerate(categories):
        positions = np.where(arr == category)[0]
        painted[positions] = palette[k]
    return dict(zip(arr, list(painted)))
def log_transform(adata):
    """Apply log(1 + x) element-wise to `adata.X`, in place.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.

    Returns
    -------
    updates `adata` with the following fields.
    X: sparse matrix (`adata.X`)
        `adata.X` is converted to CSR format if it is not sparse yet,
        then replaced by its logarithmized version.
    """
    already_sparse = issparse(adata.X)
    if not already_sparse:
        # store the sparse version first; the log is taken on what is stored
        adata.X = csr_matrix(adata.X)
    adata.X = np.log1p(adata.X)
def binarize(adata,
             threshold=1e-5):
    """Binarize `adata.X` in place.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    threshold: `float`, optional (default: 1e-5)
        Values below or equal to this are replaced by 0, above it by 1.

    Returns
    -------
    updates `adata` with the following fields.
    X: sparse matrix (`adata.X`)
        `adata.X` is converted to CSR format if needed and
        replaced by the binarized matrix.
    """
    X = adata.X
    if not issparse(X):
        adata.X = csr_matrix(X)
    binary = preprocessing.binarize(adata.X,
                                    threshold=threshold,
                                    copy=True)
    adata.X = binary


def normalize(adata,
              method='lib_size',
              scale_factor=1e4,
              save_raw=True):
    """Normalize the count matrix stored in `adata.X`.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    method: `str`, optional (default: 'lib_size')
        Choose from {'lib_size','tf_idf'}
        'lib_size': Total-count normalize (library-size correct)
        'tf_idf': TF-IDF (term frequency–inverse document frequency)
        transformation
    scale_factor: `float`, optional (default: 1e4)
        Factor the row-normalized matrix is multiplied by.
        Only used by 'lib_size'.
    save_raw: `bool`, optional (default: True)
        If True, a copy of the (sparse) input matrix is stored in
        `adata.layers['raw']` before normalization.

    Returns
    -------
    updates `adata` with the following fields.
    X: sparse matrix (`adata.X`)
        The normalized data matrix.

    Raises
    ------
    ValueError
        If `method` is not one of the supported methods.
    """
    supported = ['lib_size', 'tf_idf']
    if method not in supported:
        raise ValueError("unrecognized method '%s'" % method)

    if not issparse(adata.X):
        adata.X = csr_matrix(adata.X)
    if save_raw:
        adata.layers['raw'] = adata.X.copy()

    if method == 'lib_size':
        # divide every row by its total, then rescale
        row_totals = adata.X.sum(axis=1).A
        sparsefuncs.inplace_row_scale(adata.X, 1/row_totals)
        adata.X = adata.X*scale_factor
    elif method == 'tf_idf':
        adata.X = cal_tf_idf(adata.X)
def pca(adata,
        n_components=50,
        algorithm='randomized',
        n_iter=5,
        random_state=2021,
        tol=0.0,
        feature=None,
        **kwargs,
        ):
    """Run a truncated SVD on the data matrix and store the result.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    n_components: `int`, optional (default: 50)
        Desired dimensionality of output data
    algorithm: `str`, optional (default: 'randomized')
        SVD solver to use. Choose from {'arpack', 'randomized'}.
    n_iter: `int`, optional (default: 5)
        Number of iterations for randomized SVD solver.
        Not used by ARPACK.
    random_state: `int`, optional (default: 2021)
        Seed passed to `TruncatedSVD()`.
    tol: `float`, optional (default: 0)
        Tolerance for ARPACK. 0 means machine precision.
        Ignored by randomized SVD solver.
    feature: `str`, optional (default: None)
        Name of a boolean column of `.var` selecting the features to use.
        If None, all of adata.X will be used.
    kwargs:
        Other keyword arguments are passed down to `TruncatedSVD()`

    Returns
    -------
    updates `adata` with the following fields:
    `.obsm['X_pca']` : `array`
        PCA transformed X.
    `.uns['pca']['n_pcs']` : `int`
        Set to `n_components`.
    `.uns['pca']['PCs']` : `array`
        Principal components in feature space (features x components).
    `.uns['pca']['variance']` : `array`
        The variance of the training samples transformed by a
        projection to each component.
    `.uns['pca']['variance_ratio']` : `array`
        Percentage of variance explained by each of the selected components.
    """
    if feature is not None:
        # restrict the decomposition to the flagged features
        selected = adata.var[feature]
        X = adata[:, selected].X.copy()
    else:
        X = adata.X.copy()

    svd = TruncatedSVD(n_components=n_components,
                       algorithm=algorithm,
                       n_iter=n_iter,
                       random_state=random_state,
                       tol=tol,
                       **kwargs)
    svd.fit(X)
    adata.obsm['X_pca'] = svd.transform(X)
    adata.uns['pca'] = {
        'n_pcs': n_components,
        'PCs': svd.components_.T,
        'variance': svd.explained_variance_,
        'variance_ratio': svd.explained_variance_ratio_,
    }
def select_pcs(adata,
               n_pcs=None,
               S=1,
               curve='convex',
               direction='decreasing',
               online=False,
               min_elbow=None,
               **kwargs):
    """Choose how many top principal components to keep.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix holding the result of `pca()`.
    n_pcs: `int`, optional (default: None)
        Number of PCs to keep.
        If None, it is located automatically on the curve of
        `.uns['pca']['variance_ratio']` with `locate_elbow()`.
    S : `float`, optional (default: 1)
        Sensitivity
    curve: `str`, optional (default: 'convex')
        Choose from {'convex','concave'}
        If 'concave', algorithm will detect knees,
        If 'convex', algorithm will detect elbows.
    direction: `str`, optional (default: 'decreasing')
        Choose from {'decreasing','increasing'}
    online: `bool`, optional (default: False)
        kneed will correct old knee points if True,
        kneed will return first knee if False.
    min_elbow: `int`, optional (default: None)
        The minimum elbow location
        By default, it is n_components/10
    **kwargs: `dict`, optional
        Extra arguments to KneeLocator.

    Returns
    -------
    updates `adata` with the following fields:
    `.uns['pca']['n_pcs']` : `int`
        The selected number of PCs.
    """
    if n_pcs is None:
        # no explicit choice: search for the elbow of the scree curve
        n_components = adata.obsm['X_pca'].shape[1]
        if min_elbow is None:
            min_elbow = n_components/10
        n_pcs = locate_elbow(range(n_components),
                             adata.uns['pca']['variance_ratio'],
                             S=S,
                             curve=curve,
                             min_elbow=min_elbow,
                             direction=direction,
                             online=online,
                             **kwargs)
    adata.uns['pca']['n_pcs'] = n_pcs
def select_pcs_features(adata,
                        S=1,
                        curve='convex',
                        direction='decreasing',
                        online=False,
                        min_elbow=None,
                        **kwargs):
    """Select features that contribute to the top PCs.

    For each of the first `.uns['pca']['n_pcs']` components, features are
    ranked by the absolute value of their loading and the ranking is cut
    at the elbow returned by `locate_elbow()`.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix holding the result of `pca()`.
    S : `float`, optional (default: 1)
        Sensitivity
    curve: `str`, optional (default: 'convex')
        Choose from {'convex','concave'}
        If 'concave', algorithm will detect knees,
        If 'convex', algorithm will detect elbows.
    direction: `str`, optional (default: 'decreasing')
        Choose from {'decreasing','increasing'}
    online: `bool`, optional (default: False)
        kneed will correct old knee points if True,
        kneed will return first knee if False.
    min_elbow: `int`, optional (default: None)
        The minimum elbow location.
        By default, it is #features/6
    **kwargs: `dict`, optional
        Extra arguments to KneeLocator.

    Returns
    -------
    updates `adata` with the following fields:
    `.uns['pca']['features']` : `dict`
        For each PC (`'pc_0'`, `'pc_1'`, ...), the row indices of
        `.uns['pca']['PCs']` that were selected.
    `.var['top_pcs']` : `bool`
        True for features selected from at least one PC.

    Notes
    -----
    The selected row indices are used directly as positions in
    `adata.var_names`.
    NOTE(review): this presumably requires that the PCA was computed on
    all features (i.e. `pca(feature=None)`) -- confirm against callers.
    """
    n_pcs = adata.uns['pca']['n_pcs']
    loadings = adata.uns['pca']['PCs']
    n_features = loadings.shape[0]
    if min_elbow is None:
        min_elbow = n_features/6
    adata.uns['pca']['features'] = dict()
    ids_features = list()
    for i in range(n_pcs):
        abs_loadings = np.abs(loadings[:, i])
        # sort once; the same ordering yields both the curve handed to
        # the elbow search and the ranked feature indices
        order = np.argsort(abs_loadings)[::-1]
        elbow = locate_elbow(range(n_features),
                             abs_loadings[order],
                             S=S,
                             min_elbow=min_elbow,
                             curve=curve,
                             direction=direction,
                             online=online,
                             **kwargs)
        ids_features_i = list(order[:elbow])
        adata.uns['pca']['features'][f'pc_{i}'] = ids_features_i
        ids_features.extend(ids_features_i)
        print(f'#features selected from PC {i}: {len(ids_features_i)}')
    adata.var['top_pcs'] = False
    # astype(int) keeps the index array usable when nothing was selected
    # (np.unique of an empty list is a float array)
    ids_selected = np.unique(ids_features).astype(int)
    adata.var.loc[adata.var_names[ids_selected], 'top_pcs'] = True
    print(f'#features in total: {adata.var["top_pcs"].sum()}')
def locate_elbow(x, y, S=10, min_elbow=0,
                 curve='convex', direction='decreasing', online=False,
                 **kwargs):
    """Locate the elbow (or knee) of a curve with `KneeLocator`.

    Parameters
    ----------
    x : `array_like`
        x values
    y : `array_like`
        y values
    S : `float`, optional (default: 10)
        Sensitivity
    min_elbow: `int`, optional (default: 0)
        The minimum elbow location;
        points before this position are not searched.
    curve: `str`, optional (default: 'convex')
        Choose from {'convex','concave'}
        If 'concave', algorithm will detect knees,
        If 'convex', algorithm will detect elbows.
    direction: `str`, optional (default: 'decreasing')
        Choose from {'decreasing','increasing'}
    online: `bool`, optional (default: False)
        kneed will correct old knee points if True,
        kneed will return first knee if False.
    **kwargs: `dict`, optional
        Extra arguments to KneeLocator.

    Returns
    -------
    elbow: `int`
        elbow point; `len(y)` when no elbow was found.
    """
    start = int(min_elbow)
    locator = KneeLocator(x[start:], y[start:],
                          S=S, curve=curve,
                          direction=direction,
                          online=online,
                          **kwargs,
                          )
    if locator.elbow is not None:
        return int(locator.elbow)
    # nothing detected: keep everything
    return len(y)
def cal_tf_idf(mat):
    """Transform a count matrix to a tf-idf representation.

    Every non-zero entry `mat[i, j]` becomes

        mat[i, j] / sum(mat[:, j]) * log(1 + n_cols / sum(mat[i, :]))

    The computation is carried out on the stored entries only, so the
    matrix is never densified and columns without any count stay zero
    (dividing the dense matrix would turn them into NaN).

    Parameters
    ----------
    mat: sparse matrix or `array_like`
        Count matrix. It is not modified.

    Returns
    -------
    tf_idf: `scipy.sparse.csr_matrix`
        The transformed matrix, same shape as `mat`.
    """
    # private canonical CSR copy: the caller's matrix is left untouched
    mat = csr_matrix(mat, copy=True)
    mat.sum_duplicates()
    mat.eliminate_zeros()

    col_sums = np.asarray(mat.sum(axis=0)).ravel()
    row_sums = np.asarray(mat.sum(axis=1)).ravel()
    # empty rows give an infinite idf, but they hold no stored entry,
    # so the value is never used; silence the warning it would raise
    with np.errstate(divide='ignore', invalid='ignore'):
        tf = mat.data / col_sums[mat.indices]
        idf = np.log(1 + mat.shape[1] / row_sums)
        # expand the per-row idf to one value per stored entry
        data = tf * np.repeat(idf, np.diff(mat.indptr))
    tf_idf = csr_matrix((data, mat.indices, mat.indptr), shape=mat.shape)
    return tf_idf
def select_variable_genes(adata,
                          layer='raw',
                          span=0.3,
                          n_top_genes=2000,
                          ):
    """Select highly variable genes.

    This function implements the method 'vst' in Seurat v3.
    Inspired by Scanpy.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    layer: `str`, optional (default: 'raw')
        The layer to use for calculating variable genes.
        If None, `adata.X` is used.
    span: `float`, optional (default: 0.3)
        Loess smoothing factor
    n_top_genes: `int`, optional (default: 2000)
        The number of genes to keep

    Returns
    -------
    updates `adata` with the following fields.

    variances_norm: `float`, (`adata.var['variances_norm']`)
        Normalized variance per gene
        (0 for genes whose variance is 0).
    variances: `float`, (`adata.var['variances']`)
        Variance per gene.
    means: `float`, (`adata.var['means']`)
        Means per gene
    highly_variable: `bool` (`adata.var['highly_variable']`)
        Indicator of variable genes
    """
    counts = adata.X if layer is None else adata.layers[layer]
    # astype() returns a new matrix, so the clipping below can never
    # write into the data stored in `adata` (also when layer is None)
    X = csr_matrix(counts).astype(np.float64)

    mean, variance = sparsefuncs.mean_variance_axis(X, axis=0)
    variance_expected = np.zeros(adata.shape[1], dtype=np.float64)
    not_const = variance > 0

    # fit the mean-variance trend in log10 space on non-constant genes
    model = loess(np.log10(mean[not_const]),
                  np.log10(variance[not_const]),
                  span=span,
                  degree=2)
    model.fit()
    variance_expected[not_const] = 10**model.outputs.fitted_values

    # clip every value at mean + sqrt(N) expected standard deviations
    N = adata.shape[0]
    clip_max = np.sqrt(N)
    clip_val = np.sqrt(variance_expected) * clip_max + mean
    mask = X.data > clip_val[X.indices]
    X.data[mask] = clip_val[X.indices[mask]]

    squared_X_sum = np.array(X.power(2).sum(axis=0)).flatten()
    X_sum = np.array(X.sum(axis=0)).flatten()

    # constant genes have an expected variance of 0; evaluating the
    # formula for them gives NaN, which argsort would rank as the most
    # variable genes, so they keep a normalized variance of 0 instead
    norm_gene_var = np.zeros(adata.shape[1], dtype=np.float64)
    norm_gene_var[not_const] = \
        (1 / ((N - 1) * variance_expected[not_const])) \
        * ((N * np.square(mean[not_const]))
           + squared_X_sum[not_const]
           - 2 * X_sum[not_const] * mean[not_const]
           )

    adata.var['variances_norm'] = norm_gene_var
    adata.var['variances'] = variance
    adata.var['means'] = mean
    ids_top = norm_gene_var.argsort()[-n_top_genes:][::-1]
    adata.var['highly_variable'] = np.isin(range(adata.shape[1]), ids_top)
    print(f'{n_top_genes} variable genes are selected.')
def read_embedding(path_emb=None,
                   path_entity=None,
                   convert_alias=True,
                   path_entity_alias=None,
                   prefix=None,
                   num_epochs=None):
    """Read in entity embeddings from pbg training

    Parameters
    ----------
    path_emb: `str`, optional (default: None)
        Path to directory for pbg embedding model
        If None, .settings.pbg_params['checkpoint_path'] will be used.
    path_entity: `str`, optional (default: None)
        Path to entity name file
        If None, .settings.pbg_params['entity_path'] will be used.
    convert_alias: `bool`, optional (default: True)
        If True, it will convert entity aliases to the original indices
    path_entity_alias: `str`, optional (default: None)
        Path to the directory containing 'entity_alias.txt'
        If None, the parent directory of `path_emb` will be used.
    prefix: `list`, optional (default: None)
        A list of entity type prefixes to include.
        By default, it reads in the embeddings of all entities.
    num_epochs: `int`, optional (default: None)
        The embedding result associated with num_epochs to read in
        If None, .settings.pbg_params['num_epochs'] will be used.

    Returns
    -------
    dict_adata: `dict`
        A dictionary of anndata objects of shape
        (#entities x #dimensions)
    """
    pbg_params = settings.pbg_params
    if path_emb is None:
        path_emb = pbg_params['checkpoint_path']
    if path_entity is None:
        path_entity = pbg_params['entity_path']
    if num_epochs is None:
        num_epochs = pbg_params["num_epochs"]
    if prefix is None:
        prefix = []
    assert isinstance(prefix, list), \
        "`prefix` must be list"
    if convert_alias:
        if path_entity_alias is None:
            path_entity_alias = Path(path_emb).parent.as_posix()
        df_entity_alias = pd.read_csv(
            os.path.join(path_entity_alias, 'entity_alias.txt'),
            header=0,
            index_col=0,
            sep='\t')
        # re-index the table by alias so names can be looked up directly
        df_entity_alias['id'] = df_entity_alias.index
        df_entity_alias.index = df_entity_alias['alias'].values

    # The directory may list several embedding files for one entity type
    # (e.g. one per checkpoint version). Only the file for `num_epochs`
    # is ever read, so each type is visited once, in first-seen order.
    entity_types = dict.fromkeys(
        x.split('_')[1]
        for x in os.listdir(path_emb)
        if x.startswith('embeddings'))

    dict_adata = dict()
    for entity_type in entity_types:
        if prefix and entity_type not in prefix:
            continue
        adata = \
            read_hdf(os.path.join(path_emb,
                                  f'embeddings_{entity_type}_0.'
                                  f'v{num_epochs}.h5'),
                     key="embeddings")
        with open(
            os.path.join(path_entity,
                         f'entity_names_{entity_type}_0.json'), "rt")\
                as tf:
            names_entity = json.load(tf)
        if convert_alias:
            names_entity = \
                df_entity_alias.loc[names_entity, 'id'].tolist()
        adata.obs.index = names_entity
        dict_adata[entity_type] = adata
    return dict_adata
# modified from
# scanpy https://github.com/theislab/scanpy/blob/master/scanpy/readwrite.py
def read_10x_h5(filename,
                genome=None,
                gex_only=True):
    """Read 10x-Genomics-formatted hdf5 file.

    Parameters
    ----------
    filename
        Path to a 10x hdf5 file.
    genome
        Filter expression to genes within this genome. For legacy 10x h5
        files, this must be provided if the data contains more than one genome.
    gex_only
        Only keep 'Gene Expression' data and ignore other feature types,
        e.g. 'Antibody Capture', 'CRISPR Guide Capture', or 'Custom'.
        Only applied to files that contain a '/matrix' node.

    Returns
    -------
    adata: AnnData
        Annotated data matrix, where observations/cells are named by their
        barcode and variables/genes by gene name

    Raises
    ------
    ValueError
        If `genome` is given but absent from a file with a '/matrix' node.
    """
    # the presence of a '/matrix' node tells the two layouts apart
    with tables.open_file(str(filename), 'r') as h5:
        has_matrix_node = '/matrix' in h5

    if not has_matrix_node:
        return _read_legacy_10x_h5(filename, genome=genome)

    adata = _read_v3_10x_h5(filename)
    if genome:
        if genome not in adata.var['genome'].values:
            raise ValueError(
                f"Could not find data corresponding to "
                f"genome '{genome}' in '{filename}'. "
                f'Available genomes are:'
                f' {list(adata.var["genome"].unique())}.'
            )
        keep_genome = adata.var['genome'] == genome
        adata = adata[:, keep_genome]
    if gex_only:
        keep_gex = adata.var['feature_types'] == 'Gene Expression'
        adata = adata[:, keep_gex]
    # subsetting yields a view; hand back an independent object
    if adata.is_view:
        adata = adata.copy()
    return adata
def load_pbg_config(path=None):
    """Load PBG configuration into global setting

    Parameters
    ----------
    path: `str`, optional (default: None)
        Path to the directory containing 'config.json'
        If None, `.settings.pbg_params['checkpoint_path']` will be used

    Returns
    -------
    Updates `.settings.pbg_params`

    """
    config_dir = path
    if config_dir is None:
        config_dir = settings.pbg_params['checkpoint_path']
    config_file = os.path.join(os.path.normpath(config_dir), 'config.json')
    with open(config_file, "rt") as handle:
        config = json.load(handle)
    settings.set_pbg_params(config=config)


def load_graph_stats(path=None):
    """Load graph statistics into global setting

    Parameters
    ----------
    path: `str`, optional (default: None)
        Path to the directory containing 'graph_stats.json'
        If None, the directory two levels above
        `.settings.pbg_params['entity_path']` will be used

    Returns
    -------
    Updates `.settings.graph_stats`, keyed by the name of the directory
    """
    stats_dir = path
    if stats_dir is None:
        entity_dir = Path(settings.pbg_params['entity_path'])
        stats_dir = entity_dir.parent.parent.as_posix()
    stats_dir = os.path.normpath(stats_dir)
    with open(os.path.join(stats_dir, 'graph_stats.json'), "rt") as handle:
        stats = json.load(handle)
    # the statistics are registered under the directory's own name
    key = os.path.basename(stats_dir)
    settings.graph_stats[key] = stats.copy()
def write_bed(adata,
              use_top_pcs=True,
              filename=None
              ):
    """Write peaks into .bed file

    The columns 'chr', 'start' and 'end' of `adata.var` are written
    tab-separated, without header and without index.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix with peaks as variables.
    use_top_pcs: `bool`, optional (default: True)
        If True, only write the peaks flagged in `adata.var['top_pcs']`
    filename: `str`, optional (default: None)
        Filename name for peaks.
        By default, a file named 'peaks.bed' will be written to
        `.settings.workdir`

    Raises
    ------
    ValueError
        If one of 'chr', 'start', 'end' is missing from `adata.var_keys()`.
    AssertionError
        If `use_top_pcs` is True but 'top_pcs' is missing.
    """
    bed_columns = ['chr', 'start', 'end']
    if filename is None:
        filename = os.path.join(settings.workdir, 'peaks.bed')

    for x in bed_columns:
        if x not in adata.var_keys():
            raise ValueError(f"could not find {x} in `adata.var_keys()`")

    peaks = adata.var
    if use_top_pcs:
        assert 'top_pcs' in adata.var_keys(), \
            "please run `si.pp.select_pcs_features()` first"
        peaks = peaks[adata.var['top_pcs']]
    peaks[bed_columns].to_csv(filename,
                              sep='\t',
                              header=False,
                              index=False)

    fp, fn = os.path.split(filename)
    print(f'"{fn}" was written to "{fp}".')
class GeneScores:
    """Compute gene scores from chromatin accessibility peaks.

    Peaks are intersected with (extended) gene annotations, each
    peak-gene overlap is given a distance-based weight, and the resulting
    genes-by-peaks weight matrix is multiplied with the cells-by-peaks
    matrix `adata.X` to obtain a cells-by-genes score matrix.

    Attributes
    ----------
    adata : `AnnData`
        Input data; `adata.var` must contain 'chr', 'start', 'end'
        (and 'top_pcs' when `use_top_pcs` is True).
    genome : `str`
        Name of the built-in annotation to load when `gene_anno` is None.
    gene_anno : `pandas.DataFrame` or None
        Gene annotation with columns
        ['chr', 'start', 'end', 'symbol', 'strand'].
    tss_upstream, tss_downsteam : number
        Base pairs by which the region around the TSS is extended.
    gb_upstream : number
        Base pairs by which the gene body is extended upstream;
        also the decay length of the distance weight.
    cutoff_weight : `float`
        Overlap weights below this value are set to 0.
    use_top_pcs, use_precomputed, use_gene_weigt : `bool`
        Behaviour switches, see `cal_gene_scores`.
    min_w, max_w : number
        Range of the per-gene size weights.

    Methods
    -------
    cal_gene_scores()
        Return an AnnData of cells-by-genes gene scores.
    """
    def __init__(self,
                 adata,
                 genome,
                 gene_anno=None,
                 tss_upstream=1e5,
                 tss_downsteam=1e5,
                 gb_upstream=5000,
                 cutoff_weight=1,
                 use_top_pcs=True,
                 use_precomputed=True,
                 use_gene_weigt=True,
                 min_w=1,
                 max_w=5):
        """
        Parameters
        ----------
        adata: `Anndata`
            Input anndata
        genome : `str`
            The genome name
        gene_anno : `pandas.DataFrame`, optional (default: None)
            Custom gene annotation; if None the built-in one is read lazily
        tss_upstream, tss_downsteam, gb_upstream, cutoff_weight,
        use_top_pcs, use_precomputed, use_gene_weigt, min_w, max_w
            Stored unchanged on the instance (see class docstring)
        """
        self.adata = adata
        self.genome = genome
        self.gene_anno = gene_anno
        self.tss_upstream = tss_upstream
        self.tss_downsteam = tss_downsteam
        self.gb_upstream = gb_upstream
        self.cutoff_weight = cutoff_weight
        self.use_top_pcs = use_top_pcs
        self.use_precomputed = use_precomputed
        self.use_gene_weigt = use_gene_weigt
        self.min_w = min_w
        self.max_w = max_w

    def _read_gene_anno(self):
        """Read in the built-in gene annotation for `self.genome`

        Returns
        -------
        gene_anno: `pandas.DataFrame`
            Columns ['chr', 'start', 'end', 'symbol', 'strand'];
            also stored in `self.gene_anno`.
        """
        assert (self.genome in ['hg19', 'hg38', 'mm9', 'mm10']),\
            "`genome` must be one of ['hg19','hg38','mm9','mm10']"

        bin_str = pkgutil.get_data('simba',
                                   f'data/gene_anno/{self.genome}_genes.bed')
        gene_anno = pd.read_csv(io.BytesIO(bin_str),
                                encoding='utf8',
                                sep='\t',
                                header=None,
                                names=['chr', 'start', 'end',
                                       'symbol', 'strand'])
        self.gene_anno = gene_anno
        return self.gene_anno

    def _extend_tss(self, pbt_gene):
        """Extend transcription start site in both directions

        Parameters
        ----------
        pbt_gene:
            Interval-like object exposing `.start`, `.end` and
            `['strand']`; it is modified in place.

        Returns
        -------
        ext_tss:
            The same object with updated `start` and `end`.
        """
        ext_tss = pbt_gene
        # Snapshot the original coordinates: both new bounds must be
        # derived from the unmodified gene, not from a bound that has
        # already been shifted.
        start, end = ext_tss.start, ext_tss.end
        # int(): the default extensions are floats (1e5) but genomic
        # coordinates are integers.
        if ext_tss['strand'] == '+':
            # TSS is `start` on the forward strand
            ext_tss.start = int(max(0, start - self.tss_upstream))
            ext_tss.end = int(max(end, start + self.tss_downsteam))
        else:
            # TSS is `end` on the reverse strand
            ext_tss.start = int(max(0, min(start,
                                           end - self.tss_downsteam)))
            ext_tss.end = int(end + self.tss_upstream)
        return ext_tss

    def _extend_genebody(self, pbt_gene):
        """Extend gene body upstream

        Parameters
        ----------
        pbt_gene:
            Interval-like object exposing `.start`, `.end` and
            `['strand']`; it is modified in place.

        Returns
        -------
        ext_gb:
            The same object, extended by `gb_upstream` on its 5' side.
        """
        ext_gb = pbt_gene
        if ext_gb['strand'] == '+':
            ext_gb.start = int(max(0, ext_gb.start - self.gb_upstream))
        else:
            ext_gb.end = int(ext_gb.end + self.gb_upstream)
        return ext_gb

    def _weight_genes(self):
        """Weight genes by the inverse of their size

        The inverse gene sizes are min-max rescaled to
        [`min_w`, `max_w`], so the shortest gene gets `max_w`.

        Returns
        -------
        w_scaled: `pandas.Series`
            One weight per row of `self.gene_anno`.
        """
        gene_anno = self.gene_anno
        gene_size = gene_anno['end'] - gene_anno['start']
        w = 1/gene_size
        w_min, w_max = w.min(), w.max()
        if w_max == w_min:
            # All genes have the same size: min-max scaling would divide
            # by zero, so every gene receives the same (minimum) weight.
            return pd.Series(float(self.min_w), index=w.index)
        w_scaled = (self.max_w-self.min_w) * (w-w_min) / (w_max-w_min) \
            + self.min_w
        return w_scaled

    def cal_gene_scores(self):
        """Calculate gene scores

        Returns
        -------
        adata_CG_atac: `AnnData`
            Gene score matrix with `adata.obs` as observations and the
            gene annotation as variables.

        When the overlap is (re)computed it is stored in
        `adata.uns['gene_scores']['overlap']`.
        """
        adata = self.adata
        if self.gene_anno is None:
            gene_ann = self._read_gene_anno()
        else:
            gene_ann = self.gene_anno

        df_gene_ann = gene_ann.copy()
        # `_uniquify` edits its argument in place. Work on an explicit,
        # writable copy and write the result back, instead of relying on
        # `.values` being a writable view of the column (it is read-only
        # under pandas copy-on-write). The per-gene lookup below matches
        # on these unique symbols.
        symbols = _uniquify(df_gene_ann['symbol'].to_numpy(copy=True))
        df_gene_ann['symbol'] = symbols
        df_gene_ann.index = symbols
        if self.use_top_pcs:
            mask_p = adata.var['top_pcs']
        else:
            mask_p = pd.Series(True, index=adata.var_names)
        df_peaks = adata.var[mask_p][['chr', 'start', 'end']].copy()

        if 'gene_scores' not in adata.uns_keys():
            print('Gene scores are being calculated for the first time')
            print('`use_precomputed` has been ignored')
            self.use_precomputed = False

        if self.use_precomputed:
            print('Using precomputed overlap')
            df_overlap_updated = adata.uns['gene_scores']['overlap'].copy()
        else:
            # add the fifth column
            # so that pybedtool can recognize the sixth column as the strand
            df_gene_ann_for_pbt = df_gene_ann.copy()
            df_gene_ann_for_pbt['score'] = 0
            df_gene_ann_for_pbt = df_gene_ann_for_pbt[['chr', 'start', 'end',
                                                       'symbol', 'score',
                                                       'strand']]
            df_gene_ann_for_pbt['id'] = range(df_gene_ann_for_pbt.shape[0])

            df_peaks_for_pbt = df_peaks.copy()
            df_peaks_for_pbt['id'] = range(df_peaks_for_pbt.shape[0])

            pbt_gene_ann = pybedtools.BedTool.from_dataframe(
                df_gene_ann_for_pbt
            )
            pbt_gene_ann_ext = pbt_gene_ann.each(self._extend_tss)
            pbt_gene_gb_ext = pbt_gene_ann.each(self._extend_genebody)

            pbt_peaks = pybedtools.BedTool.from_dataframe(df_peaks_for_pbt)

            overlap_names = \
                [x+'_p' for x in df_peaks_for_pbt.columns] \
                + [x+'_g' for x in df_gene_ann_for_pbt.columns]
            # peaks overlapping with extended TSS
            pbt_overlap = pbt_peaks.intersect(pbt_gene_ann_ext,
                                              wa=True,
                                              wb=True)
            df_overlap = pbt_overlap.to_dataframe(names=overlap_names)
            # peaks overlapping with gene body
            pbt_overlap2 = pbt_peaks.intersect(pbt_gene_gb_ext,
                                               wa=True,
                                               wb=True)
            df_overlap2 = pbt_overlap2.to_dataframe(names=overlap_names)

            # add distance and weight for each overlap
            df_overlap_updated = df_overlap.copy()
            df_overlap_updated['dist'] = 0

            # progress is reported roughly every 20% of the genes;
            # max(1, ...) keeps the modulo valid for fewer than 5 genes
            n_genes = df_gene_ann_for_pbt.shape[0]
            n_batch = max(1, n_genes // 5)
            for i, x in enumerate(df_overlap['symbol_g'].unique()):
                # peaks within the extended TSS
                df_overlap_x = \
                    df_overlap[df_overlap['symbol_g'] == x].copy()
                # peaks within the gene body
                df_overlap2_x = \
                    df_overlap2[df_overlap2['symbol_g'] == x].copy()
                # peaks that are not intersecting with the promoter
                # and gene body of gene x
                id_overlap = df_overlap_x.index[
                    ~np.isin(df_overlap_x['id_p'], df_overlap2_x['id_p'])]
                mask_x = (df_gene_ann['symbol'] == x)
                range_x = df_gene_ann[mask_x][['start', 'end']].values\
                    .flatten()
                # distance of each remaining peak to the nearer boundary
                # of the extended gene body
                if df_overlap_x['strand_g'].iloc[0] == '+':
                    df_overlap_updated.loc[id_overlap, 'dist'] = pd.concat(
                        [abs(df_overlap_x.loc[id_overlap, 'start_p']
                             - (range_x[1])),
                         abs(df_overlap_x.loc[id_overlap, 'end_p']
                             - max(0, range_x[0]-self.gb_upstream))],
                        axis=1, sort=False).min(axis=1)
                else:
                    df_overlap_updated.loc[id_overlap, 'dist'] = pd.concat(
                        [abs(df_overlap_x.loc[id_overlap, 'start_p']
                             - (range_x[1]+self.gb_upstream)),
                         abs(df_overlap_x.loc[id_overlap, 'end_p']
                             - (range_x[0]))],
                        axis=1, sort=False).min(axis=1)

                if i % n_batch == 0:
                    print(f'Processing: {i/n_genes:.1%}')
            df_overlap_updated['dist'] = df_overlap_updated['dist']\
                .astype(float)

            adata.uns['gene_scores'] = dict()
            adata.uns['gene_scores']['overlap'] = df_overlap_updated.copy()

        # exponential decay with distance; peaks inside the extended
        # gene body keep dist == 0 and therefore weight == 1
        df_overlap_updated['weight'] = np.exp(
            -(df_overlap_updated['dist'].values/self.gb_upstream))
        mask_w = (df_overlap_updated['weight'] < self.cutoff_weight)
        df_overlap_updated.loc[mask_w, 'weight'] = 0
        # construct genes-by-peaks matrix
        mat_GP = csr_matrix(coo_matrix((df_overlap_updated['weight'],
                                        (df_overlap_updated['id_g'],
                                         df_overlap_updated['id_p'])),
                                       shape=(df_gene_ann.shape[0],
                                              df_peaks.shape[0])))
        if self.use_gene_weigt:
            gene_weights = self._weight_genes()
            gene_scores = adata[:, mask_p].X * \
                (mat_GP.T.multiply(gene_weights))
        else:
            gene_scores = adata[:, mask_p].X * mat_GP.T
        adata_CG_atac = ad.AnnData(gene_scores,
                                   obs=adata.obs.copy(),
                                   var=df_gene_ann.copy())
        return adata_CG_atac
def gene_scores(adata, 276 | genome, 277 | gene_anno=None, 278 | tss_upstream=1e5, 279 | tss_downsteam=1e5, 280 | gb_upstream=5000, 281 | cutoff_weight=1, 282 | use_top_pcs=True, 283 | use_precomputed=True, 284 | use_gene_weigt=True, 285 | min_w=1, 286 | max_w=5): 287 | """Calculate gene scores 288 | 289 | Parameters 290 | ---------- 291 | adata : AnnData 292 | Annotated data matrix. 293 | genome : `str` 294 | Reference genome. Choose from {'hg19', 'hg38', 'mm9', 'mm10'} 295 | gene_anno : `pandas.DataFrame`, optional (default: None) 296 | Dataframe of gene annotation. 297 | If None, built-in gene annotation will be used depending on `genome`; 298 | If provided, custom gene annotation will be used instead. 299 | tss_upstream : `int`, optional (default: 1e5) 300 | The number of base pairs upstream of TSS 301 | tss_downsteam : `int`, optional (default: 1e5) 302 | The number of base pairs downstream of TSS 303 | gb_upstream : `int`, optional (default: 5000) 304 | The number of base pairs upstream by which gene body is extended. 305 | Peaks within the extended gene body are given the weight of 1. 306 | cutoff_weight : `float`, optional (default: 1) 307 | Weight cutoff for peaks 308 | use_top_pcs : `bool`, optional (default: True) 309 | If True, only peaks associated with top PCs will be used 310 | use_precomputed : `bool`, optional (default: True) 311 | If True, overlap bewteen peaks and genes 312 | (stored in `adata.uns['gene_scores']['overlap']`) will be imported 313 | use_gene_weigt : `bool`, optional (default: True) 314 | If True, for each gene, the number of peaks assigned to it 315 | will be rescaled based on gene size 316 | min_w : `int`, optional (default: 1) 317 | The minimum weight for each gene. 318 | Only valid if `use_gene_weigt` is True 319 | max_w : `int`, optional (default: 5) 320 | The maximum weight for each gene. 321 | Only valid if `use_gene_weigt` is True 322 | 323 | Returns 324 | ------- 325 | adata_new: AnnData 326 | Annotated data matrix. 
def discretize(adata,
               layer=None,
               n_bins=5,
               max_bins=100):
    """Discretize continuous values into a small number of levels

    The stored values of the chosen matrix are first summarised by a
    `max_bins`-bin histogram; the histogram bin centres, weighted by their
    counts, are then clustered with k-means into `n_bins` groups and the
    midpoints between neighbouring cluster centres become the bin edges.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    layer: `str`, optional (default: None)
        The layer used to perform discretization.
        If None, `.X` is used.
    n_bins: `int`, optional (default: 5)
        The number of bins to produce.
        It must be smaller than `max_bins`.
    max_bins: `int`, optional (default: 100)
        The number of bins used in the initial approximation.
        i.e. the number of bins to cluster.

    Returns
    -------
    updates `adata` with the following fields

    `.layers['simba']` : `array_like`
        The matrix of discretized values to build SIMBA graph.
    `.uns['disc']` : `dict`
        `bin_edges`: The edges of each bin.
        `bin_count`: The number of values in each bin.
        `hist_edges`: The edges of each bin \
                      in the initial approximation.
        `hist_count`: The number of values in each bin \
                      for the initial approximation.
    """
    X = adata.X if layer is None else adata.layers[layer]
    # only the explicitly stored values are discretized
    # (presumably X is a scipy sparse matrix -- it must expose `.data`)
    stored_values = X.data

    # coarse histogram used as a compact summary of the value distribution
    hist_count, hist_edges = np.histogram(
        stored_values,
        bins=max_bins,
        density=False)
    lower_edges = hist_edges[0:-1]
    upper_edges = hist_edges[1:]
    hist_centroids = (lower_edges + upper_edges)/2

    # weighted 1-D k-means on the histogram centroids
    model = KMeans(n_clusters=n_bins, random_state=2021, n_init='auto')
    model.fit(hist_centroids.reshape(-1, 1), sample_weight=hist_count)
    sorted_centers = np.sort(model.cluster_centers_.flatten())

    # outer edges are padded slightly so every value falls strictly inside
    padding = (hist_edges[-1] - hist_edges[0])/(max_bins*10)
    inner_edges = (sorted_centers[0:-1] + sorted_centers[1:])/2
    bin_edges = np.array(
        [hist_edges[0]-padding, *inner_edges, hist_edges[-1]+padding])
    discrete_values = np.digitize(stored_values, bin_edges).reshape(-1,)
    _, bin_count = np.unique(discrete_values, return_counts=True)

    adata.layers['simba'] = X.copy()
    adata.layers['simba'].data = discrete_values
    adata.uns['disc'] = {
        'bin_edges': bin_edges,
        'bin_count': bin_count,
        'hist_edges': hist_edges,
        'hist_count': hist_count,
    }
25 | ---------- 26 | adata_ref: `AnnData` 27 | Annotated reference data matrix. 28 | adata_query: `AnnData` 29 | Annotated query data matrix. 30 | feature: `str`, optional (default: None) 31 | Feature used for edges inference. 32 | The data type of `.var[feature]` needs to be `bool` 33 | n_components: `int`, optional (default: 20) 34 | The number of components used in `randomized_svd` 35 | for comparing reference and query observations 36 | random_state: `int`, optional (default: 42) 37 | The seed used for truncated randomized SVD 38 | n_top_edges: `int`, optional (default: None) 39 | The number of edges to keep 40 | If specified, `percentile` will be ignored 41 | percentile: `float`, optional (default: 0.01) 42 | The percentile of edges to keep 43 | k: `int`, optional (default: 5) 44 | The number of nearest neighbors to consider within each dataset 45 | metric: `str`, optional (default: 'euclidean') 46 | The metric to use when calculating distance between 47 | reference and query observations 48 | layer: `str`, optional (default: None) 49 | The layer used to perform edge inference 50 | If None, `.X` will be used. 
51 | kwargs: 52 | Other keyword arguments are passed down to `randomized_svd()` 53 | 54 | Returns 55 | ------- 56 | adata_ref_query: `AnnData` 57 | Annotated relation matrix betwewn reference and query observations 58 | Store reference entity as observations and query entity as variables 59 | """ 60 | 61 | mask_ref = adata_ref.var[feature] 62 | feature_ref = adata_ref.var_names[mask_ref] 63 | feature_query = adata_query.var_names 64 | feature_shared = list(set(feature_ref).intersection(set(feature_query))) 65 | print(f'#shared features: {len(feature_shared)}') 66 | if layer is None: 67 | X_ref = adata_ref[:, feature_shared].X 68 | X_query = adata_query[:, feature_shared].X 69 | else: 70 | X_ref = adata_ref[:, feature_shared].layers[layer] 71 | X_query = adata_query[:, feature_shared].layers[layer] 72 | 73 | if any(X_ref.sum(axis=1) == 0) or any(X_query.sum(axis=1) == 0): 74 | raise ValueError( 75 | f'Some nodes contain zero expressed {feature} features.\n' 76 | f'Please try to include more {feature} features.') 77 | 78 | print('Performing randomized SVD ...') 79 | mat = X_ref * X_query.T 80 | U, Sigma, VT = randomized_svd(mat, 81 | n_components=n_components, 82 | random_state=random_state, 83 | **kwargs) 84 | svd_data = np.vstack((U, VT.T)) 85 | X_svd_ref = svd_data[:U.shape[0], :] 86 | X_svd_query = svd_data[-VT.shape[1]:, :] 87 | X_svd_ref = X_svd_ref / (X_svd_ref**2).sum(-1, keepdims=True)**0.5 88 | X_svd_query = X_svd_query / (X_svd_query**2).sum(-1, keepdims=True)**0.5 89 | 90 | # print('Searching for neighbors within each dataset ...') 91 | # knn_conn_ref, knn_dist_ref = _knn( 92 | # X_ref=X_svd_ref, 93 | # k=k, 94 | # leaf_size=leaf_size, 95 | # metric=metric) 96 | # knn_conn_query, knn_dist_query = _knn( 97 | # X_ref=X_svd_query, 98 | # k=k, 99 | # leaf_size=leaf_size, 100 | # metric=metric) 101 | 102 | print('Searching for mutual nearest neighbors ...') 103 | knn_conn_ref_query, knn_dist_ref_query = _knn( 104 | X_ref=X_svd_ref, 105 | X_query=X_svd_query, 
def trim_edges(adata_ref_query,
               cutoff=None,
               n_edges=None):
    """Trim edges based on the similarity scores

    Parameters
    ----------
    adata_ref_query: `AnnData`
        Annotated relation matrix between reference and query observations.
        `.X` is read as the sparse matrix of similarity scores.
    cutoff: `float`, optional (default: None)
        The similarity cutoff; entries of `.X` strictly greater than
        `cutoff` are kept.
        If None, the selection is decided by `n_edges`.
        If specified, `n_edges` will be ignored.
    n_edges: `int`, optional (default: None)
        The number of edges to keep: the `n_edges` non-zero entries of
        `.X` with the highest similarity. If it exceeds the number of
        non-zero entries, all of them are kept.

    Returns
    -------
    updates `adata_ref_query` with the following field.
    `.layers['simba']` : `array_like`
        relation matrix between reference and query observations
        (1 for every kept edge)

    Raises
    ------
    ValueError
        If neither `cutoff` nor `n_edges` is given,
        or if `n_edges` is smaller than 1.
    """
    sim_ref_query = adata_ref_query.X
    if cutoff is None and n_edges is None:
        raise ValueError('"cutoff" or "n_edges" has to be specified')

    if cutoff is not None:
        id_x, id_y, _ = find(sim_ref_query > cutoff)
    else:
        if n_edges < 1:
            raise ValueError('"n_edges" must be a positive integer')
        id_x, id_y, scores = find(sim_ref_query)
        n_keep = min(int(n_edges), scores.size)
        if n_keep < scores.size:
            # Select exactly the `n_keep` largest scores by position.
            # Thresholding with `> n-th largest value` would drop the
            # n-th edge itself and return one edge too few.
            kth = scores.size - n_keep
            top = np.argpartition(scores, kth)[kth:]
            id_x, id_y = id_x[top], id_y[top]

    print(f'{len(id_x)} edges are selected')
    conn_ref_query = csr_matrix(
        (np.ones(len(id_x), dtype=int), (id_x, id_y)),
        shape=(sim_ref_query.shape))
    adata_ref_query.layers['simba'] = conn_ref_query
20 | n_neighbors: `int`, optional (default: 15) 21 | The size of local neighborhood for UMAP 22 | n_components: `int`, optional (default: None) 23 | The dimension of the space to embed into for UMAP 24 | random_state: `int`, optional (default: None) 25 | The seed used by the random number generator for UMAP 26 | layer: `str`, optional (default: None) 27 | The layer used to perform UMAP 28 | obsm: `str`, optional (default: None) 29 | The multi-dimensional annotation of observations used to perform UMAP 30 | n_dim: `str`, optional (default: None) 31 | The number of dimensions used in `layer` or `obsm` 32 | kwargs: 33 | Other keyword arguments are passed down to `umap_learn.UMAP` 34 | 35 | Returns 36 | ------- 37 | updates `adata` with the following fields: 38 | `.obsm['X_umap']` : `array` 39 | UMAP coordinates of samples. 40 | """ 41 | 42 | if sum(list(map(lambda x: x is not None, 43 | [layer, obsm]))) == 2: 44 | raise ValueError("Only one of `layer` and `obsm` can be used") 45 | elif obsm is not None: 46 | X = adata.obsm[obsm] 47 | elif layer is not None: 48 | X = adata.layers[layer] 49 | else: 50 | X = adata.X 51 | if n_dim is not None: 52 | X = X[:, :n_dim] 53 | reducer = umap_learn.UMAP(n_neighbors=n_neighbors, 54 | n_components=n_components, 55 | random_state=random_state, 56 | **kwargs) 57 | reducer.fit(X) 58 | adata.obsm['X_umap'] = reducer.embedding_ 59 | -------------------------------------------------------------------------------- /simba/tools/_utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions and classes""" 2 | 3 | import numpy as np 4 | from sklearn.neighbors import KDTree 5 | from scipy.sparse import csr_matrix 6 | 7 | 8 | def _uniquify(seq, sep='-'): 9 | """Uniquify a list of strings. 10 | 11 | Adding unique numbers to duplicate values. 
12 | 13 | Parameters 14 | ---------- 15 | seq : `list` or `array-like` 16 | A list of values 17 | sep : `str` 18 | Separator 19 | 20 | Returns 21 | ------- 22 | seq: `list` or `array-like` 23 | A list of updated values 24 | """ 25 | 26 | dups = {} 27 | 28 | for i, val in enumerate(seq): 29 | if val not in dups: 30 | # Store index of first occurrence and occurrence value 31 | dups[val] = [i, 1] 32 | else: 33 | # Increment occurrence value, index value doesn't matter anymore 34 | dups[val][1] += 1 35 | 36 | # Use stored occurrence value 37 | seq[i] += (sep+str(dups[val][1])) 38 | 39 | return seq 40 | 41 | 42 | def _gini(array): 43 | """Calculate the Gini coefficient of a numpy array. 44 | """ 45 | 46 | array = array.flatten().astype(float) 47 | if np.amin(array) < 0: 48 | # Values cannot be negative: 49 | array -= np.amin(array) 50 | # Values cannot be 0: 51 | array += 0.0000001 52 | # Values must be sorted: 53 | array = np.sort(array) 54 | # Index per array element: 55 | index = np.arange(1, array.shape[0]+1) 56 | # Number of array elements: 57 | n = array.shape[0] 58 | # Gini coefficient: 59 | return ((np.sum((2 * index - n - 1) * array)) / (n * np.sum(array))) 60 | 61 | 62 | def _knn(X_ref, 63 | X_query=None, 64 | k=20, 65 | leaf_size=40, 66 | metric='euclidean'): 67 | """Calculate K nearest neigbors for each row. 
68 | """ 69 | if X_query is None: 70 | X_query = X_ref.copy() 71 | kdt = KDTree(X_ref, leaf_size=leaf_size, metric=metric) 72 | kdt_d, kdt_i = kdt.query(X_query, k=k, return_distance=True) 73 | # kdt_i = kdt_i[:, 1:] # exclude the point itself 74 | # kdt_d = kdt_d[:, 1:] # exclude the point itself 75 | sp_row = np.repeat(np.arange(kdt_i.shape[0]), kdt_i.shape[1]) 76 | sp_col = kdt_i.flatten() 77 | sp_conn = np.repeat(1, len(sp_row)) 78 | sp_dist = kdt_d.flatten() 79 | mat_conn_ref_query = csr_matrix( 80 | (sp_conn, (sp_row, sp_col)), 81 | shape=(X_query.shape[0], X_ref.shape[0])).T 82 | mat_dist_ref_query = csr_matrix( 83 | (sp_dist, (sp_row, sp_col)), 84 | shape=(X_query.shape[0], X_ref.shape[0])).T 85 | return mat_conn_ref_query, mat_dist_ref_query 86 | -------------------------------------------------------------------------------- /tests/data/10xpbmc_atac_subset.h5ad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huidongchen/simba/534e0b022ea1163face30263696f28b9a955c291/tests/data/10xpbmc_atac_subset.h5ad -------------------------------------------------------------------------------- /tests/data/10xpbmc_rna_subset.h5ad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huidongchen/simba/534e0b022ea1163face30263696f28b9a955c291/tests/data/10xpbmc_rna_subset.h5ad -------------------------------------------------------------------------------- /tests/data/pbg_training/entity_alias.txt: -------------------------------------------------------------------------------- 1 | alias 2 | GCACCTAAGTTAGTGC-1_rna C.0 3 | TCTCCTCGTGGAGCAA-1_rna C.1 4 | GTTCTTGTCTTGACCC-1_rna C.2 5 | CGCGATTCAGGATGGC-1_rna C.3 6 | CCTACTTCAGAATGAC-1_rna C.4 7 | TGAAGGATCCTTTACG-1_rna C.5 8 | AATCCGTAGCACTAAC-1_rna C.6 9 | GCTTAAATCGGCCATA-1_rna C.7 10 | GGGCTAACACCTAAGC-1_rna C.8 11 | TAGCCTGAGGTCTTGG-1_rna C.9 12 | ACCTTGCTCGTTAGCG-1_rna C.10 13 | 
CTAGGCGGTAGACAAA-1_rna C.11 14 | CGATTTGCATTGCGTA-1_rna C.12 15 | CGTGAGGAGGAGCAAC-1_rna C.13 16 | GGGTCAACACATAGCC-1_rna C.14 17 | ACTCAGTAGTAGGATG-1_rna C.15 18 | ATTACCCGTAGGTTAT-1_rna C.16 19 | ATTAGGTGTTTGGCGG-1_rna C.17 20 | ACCAAACTCATTATGG-1_rna C.18 21 | AGCTTGGTCGCTAGTG-1_rna C.19 22 | TGTGGAGCAACCTGGT-1_rna C.20 23 | TTTAGCTTCCTTAAGA-1_rna C.21 24 | GTTTAACCATAATCCG-1_rna C.22 25 | CGTATTGCAGCTAATT-1_rna C.23 26 | ACATAGCTCCCTGACT-1_rna C.24 27 | GTTAAACGTTTCCACG-1_rna C.25 28 | TTAACTGAGTATTGTG-1_rna C.26 29 | GGCTATGTCCCTGGTT-1_rna C.27 30 | AGGTCATTCTAACCAA-1_rna C.28 31 | CAGCCTTTCTCACAAA-1_rna C.29 32 | TCAGTAGGTAGGTTAT-1_rna C.30 33 | TGAAGTGAGGAAGTAT-1_rna C.31 34 | GCTGGTTCAATTAAGG-1_rna C.32 35 | TGAGCCGGTGCACGCA-1_rna C.33 36 | CGCTTAACAGCCGCTA-1_rna C.34 37 | GTGCACGGTTGTAAAC-1_rna C.35 38 | CTGTTAAAGAATGACG-1_rna C.36 39 | TGTAAGCTCTTAGGAC-1_rna C.37 40 | GAGCGAAGTTCGGGAT-1_rna C.38 41 | GATTCATCATAATGTC-1_rna C.39 42 | TTCCTCAAGGTTTGAC-1_rna C.40 43 | AGGTACGCAGCCTTGG-1_rna C.41 44 | CATTTGTTCGCACACA-1_rna C.42 45 | CCGTTAACAATCCTGA-1_rna C.43 46 | GGCCTCTGTCCTCCAA-1_rna C.44 47 | GCTGTGATCAATCTCT-1_rna C.45 48 | GGGAATATCTTAATGG-1_rna C.46 49 | AGGTGAGGTGGATTCA-1_rna C.47 50 | TGCACTTGTTTACGTC-1_rna C.48 51 | GTTCGCTTCCGTTAAA-1_rna C.49 52 | CCAAATCAGCGGTTAT-1_rna C.50 53 | GGCCTAATCCCTGTTA-1_rna C.51 54 | GGGCCTAGTGTCCAGG-1_rna C.52 55 | CGGACCTAGTCACTCC-1_rna C.53 56 | ATATGGTGTCAGGAAG-1_rna C.54 57 | CGCTTACTCCTAATTC-1_rna C.55 58 | AAAGCTTGTCGACTAA-1_rna C.56 59 | AAGCATGAGGCCTAAT-1_rna C.57 60 | GTTACAGGTAGGTTAT-1_rna C.58 61 | CCATAATCATGCTATG-1_rna C.59 62 | CACCTCAGTTTGCGAA-1_rna C.60 63 | ATTGCGCCACTAGCGT-1_rna C.61 64 | CCTACTGGTGCCGCAA-1_rna C.62 65 | GCAATAGAGGCGGATG-1_rna C.63 66 | GCGCGATTCCTCCCTC-1_rna C.64 67 | TAGGTTATCTCGACCT-1_rna C.65 68 | TGCTTGCTCATGAGCT-1_rna C.66 69 | GACTCACCAGTAATAG-1_rna C.67 70 | TTCCCGCCAATAACGA-1_rna C.68 71 | CGAACCGGTAGCCATA-1_rna C.69 72 | ACCAAGCGTCATAAGT-1_rna C.70 73 | 
CGCCAAATCCCAGTAG-1_rna C.71 74 | ACTCACTGTTTGGTTC-1_rna C.72 75 | AGAAGGTGTAGGTTGC-1_rna C.73 76 | ATTTGCGCAGGCTTGT-1_rna C.74 77 | TAGTTGTCATCGCTCC-1_rna C.75 78 | GGACAGCCAGATTCAT-1_rna C.76 79 | TTGAGCTAGCTTACTT-1_rna C.77 80 | CATTTGTTCTAAATCG-1_rna C.78 81 | TAGGAGTCACTGACCG-1_rna C.79 82 | ATCAAGCTCGGGATTT-1_rna C.80 83 | TCTCGCCCAACCTGGT-1_rna C.81 84 | GCGGTTATCCTGATGG-1_rna C.82 85 | GATTGCAGTGGAGCAA-1_rna C.83 86 | CGATTCCTCTTGCTAT-1_rna C.84 87 | GCTCATTGTTCACCAT-1_rna C.85 88 | CAAACGCGTTTCGCGC-1_rna C.86 89 | TCTTAGCGTCCGTGAG-1_rna C.87 90 | GCGCGATTCCTTGAGG-1_rna C.88 91 | ATAGGTACAGGTCCTG-1_rna C.89 92 | TCCTTAGTCCTGAGTG-1_rna C.90 93 | TTCCCACAGCCAAATC-1_rna C.91 94 | GGGTGAAGTGCATCGG-1_rna C.92 95 | CGGATAAAGTAGAGGC-1_rna C.93 96 | AGGATGTCACAAAGAC-1_rna C.94 97 | CTTCTCAAGGGTGGAT-1_rna C.95 98 | AACCCGCAGCGGATTT-1_rna C.96 99 | GGACATAAGGGATGCG-1_rna C.97 100 | TACAAGCTCTGTGAGT-1_rna C.98 101 | CGTATTGCATTCAGCA-1_rna C.99 102 | DPH3 G.0 103 | BICD1 G.1 104 | MAML3 G.2 105 | TTN-AS1 G.3 106 | APPL2 G.4 107 | HLX G.5 108 | CHIC1 G.6 109 | DDX39B G.7 110 | SRC G.8 111 | VAPB G.9 112 | RPS10-NUDT3 G.10 113 | POLR2A G.11 114 | AC007262.2 G.12 115 | CCND2 G.13 116 | PTCD3 G.14 117 | TNFRSF10A G.15 118 | POLR3GL G.16 119 | NNT G.17 120 | IL26 G.18 121 | RPL10 G.19 122 | UHRF1BP1L G.20 123 | AC124014.1 G.21 124 | ELOVL1 G.22 125 | SGPL1 G.23 126 | USP42 G.24 127 | ATF7IP2 G.25 128 | METTL22 G.26 129 | HSCB G.27 130 | PCTP G.28 131 | FAM174B G.29 132 | TMEM184B G.30 133 | SERF2 G.31 134 | KIAA0930 G.32 135 | GNAQ G.33 136 | SCFD1 G.34 137 | UBE2R2 G.35 138 | ARL5B G.36 139 | FRMD4A G.37 140 | EML5 G.38 141 | FAM3A G.39 142 | ARHGAP22 G.40 143 | KXD1 G.41 144 | A1BG G.42 145 | C4orf3 G.43 146 | FAM153CP G.44 147 | PPP1R9A G.45 148 | IQGAP2 G.46 149 | ACTG1 G.47 150 | GPLD1 G.48 151 | SIRPG G.49 152 | CALML4 G.50 153 | IAH1 G.51 154 | LAT2 G.52 155 | AAAS G.53 156 | -------------------------------------------------------------------------------- 
/tests/data/pbg_training/graph_stats.json: -------------------------------------------------------------------------------- 1 | { 2 | "n_edges": 1075, 3 | "relation0": { 4 | "destination": "G", 5 | "n_edges": 153, 6 | "source": "C" 7 | }, 8 | "relation1": { 9 | "destination": "G", 10 | "n_edges": 369, 11 | "source": "C" 12 | }, 13 | "relation2": { 14 | "destination": "G", 15 | "n_edges": 301, 16 | "source": "C" 17 | }, 18 | "relation3": { 19 | "destination": "G", 20 | "n_edges": 166, 21 | "source": "C" 22 | }, 23 | "relation4": { 24 | "destination": "G", 25 | "n_edges": 86, 26 | "source": "C" 27 | } 28 | } -------------------------------------------------------------------------------- /tests/data/pbg_training/input/entity/entity_count_C_0.txt: -------------------------------------------------------------------------------- 1 | 100 2 | -------------------------------------------------------------------------------- /tests/data/pbg_training/input/entity/entity_count_G_0.txt: -------------------------------------------------------------------------------- 1 | 54 2 | -------------------------------------------------------------------------------- /tests/data/pbg_training/input/entity/entity_names_C_0.json: -------------------------------------------------------------------------------- 1 | [ 2 | "C.3", 3 | "C.73", 4 | "C.5", 5 | "C.93", 6 | "C.58", 7 | "C.38", 8 | "C.14", 9 | "C.24", 10 | "C.35", 11 | "C.60", 12 | "C.70", 13 | "C.64", 14 | "C.72", 15 | "C.68", 16 | "C.79", 17 | "C.12", 18 | "C.52", 19 | "C.81", 20 | "C.83", 21 | "C.87", 22 | "C.48", 23 | "C.91", 24 | "C.11", 25 | "C.33", 26 | "C.77", 27 | "C.88", 28 | "C.9", 29 | "C.0", 30 | "C.39", 31 | "C.28", 32 | "C.36", 33 | "C.75", 34 | "C.92", 35 | "C.85", 36 | "C.10", 37 | "C.67", 38 | "C.20", 39 | "C.37", 40 | "C.46", 41 | "C.7", 42 | "C.53", 43 | "C.44", 44 | "C.23", 45 | "C.4", 46 | "C.42", 47 | "C.8", 48 | "C.50", 49 | "C.90", 50 | "C.1", 51 | "C.76", 52 | "C.61", 53 | "C.6", 54 | "C.56", 55 | "C.13", 56 | 
"C.89", 57 | "C.41", 58 | "C.25", 59 | "C.62", 60 | "C.84", 61 | "C.15", 62 | "C.40", 63 | "C.55", 64 | "C.96", 65 | "C.65", 66 | "C.86", 67 | "C.69", 68 | "C.98", 69 | "C.17", 70 | "C.94", 71 | "C.97", 72 | "C.18", 73 | "C.54", 74 | "C.19", 75 | "C.59", 76 | "C.49", 77 | "C.34", 78 | "C.26", 79 | "C.2", 80 | "C.95", 81 | "C.47", 82 | "C.66", 83 | "C.45", 84 | "C.51", 85 | "C.82", 86 | "C.22", 87 | "C.21", 88 | "C.57", 89 | "C.71", 90 | "C.43", 91 | "C.99", 92 | "C.27", 93 | "C.30", 94 | "C.32", 95 | "C.29", 96 | "C.16", 97 | "C.80", 98 | "C.63", 99 | "C.74", 100 | "C.31", 101 | "C.78" 102 | ] -------------------------------------------------------------------------------- /tests/data/pbg_training/input/entity/entity_names_G_0.json: -------------------------------------------------------------------------------- 1 | [ 2 | "G.34", 3 | "G.35", 4 | "G.19", 5 | "G.9", 6 | "G.11", 7 | "G.8", 8 | "G.37", 9 | "G.22", 10 | "G.48", 11 | "G.29", 12 | "G.18", 13 | "G.26", 14 | "G.23", 15 | "G.20", 16 | "G.2", 17 | "G.28", 18 | "G.13", 19 | "G.46", 20 | "G.25", 21 | "G.4", 22 | "G.52", 23 | "G.3", 24 | "G.17", 25 | "G.30", 26 | "G.36", 27 | "G.51", 28 | "G.7", 29 | "G.24", 30 | "G.53", 31 | "G.12", 32 | "G.39", 33 | "G.15", 34 | "G.16", 35 | "G.6", 36 | "G.5", 37 | "G.40", 38 | "G.38", 39 | "G.33", 40 | "G.0", 41 | "G.31", 42 | "G.27", 43 | "G.32", 44 | "G.45", 45 | "G.14", 46 | "G.47", 47 | "G.21", 48 | "G.44", 49 | "G.50", 50 | "G.43", 51 | "G.10", 52 | "G.1", 53 | "G.42", 54 | "G.41", 55 | "G.49" 56 | ] -------------------------------------------------------------------------------- /tests/data/pbg_training/model/checkpoint_version.txt: -------------------------------------------------------------------------------- 1 | 10 2 | -------------------------------------------------------------------------------- /tests/data/pbg_training/model/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "entities": { 3 | "C": { 4 | 
"num_partitions": 1, 5 | "featurized": false, 6 | "dimension": null 7 | }, 8 | "G": { 9 | "num_partitions": 1, 10 | "featurized": false, 11 | "dimension": null 12 | } 13 | }, 14 | "relations": [ 15 | { 16 | "name": "r0", 17 | "lhs": "C", 18 | "rhs": "G", 19 | "weight": 1.0, 20 | "operator": "none", 21 | "all_negs": false 22 | }, 23 | { 24 | "name": "r1", 25 | "lhs": "C", 26 | "rhs": "G", 27 | "weight": 2.0, 28 | "operator": "none", 29 | "all_negs": false 30 | }, 31 | { 32 | "name": "r2", 33 | "lhs": "C", 34 | "rhs": "G", 35 | "weight": 3.0, 36 | "operator": "none", 37 | "all_negs": false 38 | }, 39 | { 40 | "name": "r3", 41 | "lhs": "C", 42 | "rhs": "G", 43 | "weight": 4.0, 44 | "operator": "none", 45 | "all_negs": false 46 | }, 47 | { 48 | "name": "r4", 49 | "lhs": "C", 50 | "rhs": "G", 51 | "weight": 5.0, 52 | "operator": "none", 53 | "all_negs": false 54 | } 55 | ], 56 | "dimension": 50, 57 | "init_scale": 0.001, 58 | "max_norm": null, 59 | "global_emb": false, 60 | "comparator": "dot", 61 | "bias": false, 62 | "loss_fn": "softmax", 63 | "margin": 0.1, 64 | "regularization_coef": 0.0, 65 | "regularizer": "N3", 66 | "wd": 32.962933, 67 | "wd_interval": 50, 68 | "entity_path": "./result_simba/pbg/graph0/input/entity", 69 | "edge_paths": [ 70 | "./result_simba/pbg/graph0/input/edge" 71 | ], 72 | "checkpoint_path": "result_simba/pbg/graph0/model", 73 | "init_path": null, 74 | "checkpoint_preservation_interval": null, 75 | "num_epochs": 10, 76 | "num_edge_chunks": null, 77 | "max_edges_per_chunk": 1000000000, 78 | "bucket_order": "inside_out", 79 | "workers": 12, 80 | "batch_size": 1000, 81 | "num_batch_negs": 50, 82 | "num_uniform_negs": 50, 83 | "disable_lhs_negs": false, 84 | "disable_rhs_negs": false, 85 | "lr": 0.1, 86 | "relation_lr": null, 87 | "eval_fraction": 0.05, 88 | "eval_num_batch_negs": 50, 89 | "eval_num_uniform_negs": 50, 90 | "early_stopping": false, 91 | "background_io": false, 92 | "verbose": 0, 93 | "hogwild_delay": 2.0, 94 | "dynamic_relations": 
false, 95 | "num_machines": 1, 96 | "num_partition_servers": -1, 97 | "distributed_init_method": null, 98 | "distributed_tree_init_order": true, 99 | "num_gpus": 0, 100 | "num_groups_for_partition_server": 16, 101 | "half_precision": false 102 | } -------------------------------------------------------------------------------- /tests/data/pbg_training/model/embeddings_C_0.v10.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huidongchen/simba/534e0b022ea1163face30263696f28b9a955c291/tests/data/pbg_training/model/embeddings_C_0.v10.h5 -------------------------------------------------------------------------------- /tests/data/pbg_training/model/embeddings_G_0.v10.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huidongchen/simba/534e0b022ea1163face30263696f28b9a955c291/tests/data/pbg_training/model/embeddings_G_0.v10.h5 -------------------------------------------------------------------------------- /tests/data/pbg_training/model/model.v10.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huidongchen/simba/534e0b022ea1163face30263696f28b9a955c291/tests/data/pbg_training/model/model.v10.h5 -------------------------------------------------------------------------------- /tests/data/pbg_training/model/training_stats.json: -------------------------------------------------------------------------------- 1 | {"lhs_partition": 0, "rhs_partition": 0, "index": 1, "stats": {"count": 1022, "metrics": {"loss": 23.252048253546487, "reg": 0.0, "violators_lhs": 36.36497064579256, "violators_rhs": 31.131115459882583}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.4885098259404, "pos_rank": 26.50943396226415, "mrr": 0.08514296270485194, "r1": 0.009433962264150943, "r10": 0.24528301886792453, "r50": 0.9339622641509434, "auc": 0.5094339645133829}}, "eval_stats_after": 
{"count": 53, "metrics": {"loss": 21.550433608720887, "pos_rank": 24.358490566037737, "mrr": 0.0786743724437536, "r1": 0.009433962264150943, "r10": 0.18867924528301888, "r50": 0.9433962264150944, "auc": 0.5094339647945368}}, "epoch_idx": 0, "edge_path_idx": 0, "edge_chunk_idx": 0} 2 | {"epoch_idx": 0, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 0, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.550433608720887, "pos_rank": 24.358490566037737, "mrr": 0.0786743724437536, "r1": 0.009433962264150943, "r10": 0.18867924528301888, "r50": 0.9433962264150944, "auc": 0.5094339647945368}}} 3 | {"lhs_partition": 0, "rhs_partition": 0, "index": 2, "stats": {"count": 1022, "metrics": {"loss": 22.486315262527615, "reg": 0.0, "violators_lhs": 30.62720156555773, "violators_rhs": 23.104696673189824}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.591775120429272, "pos_rank": 25.566037735849058, "mrr": 0.07682948650897674, "r1": 0.009433962264150943, "r10": 0.18867924528301888, "r50": 0.9716981132075472, "auc": 0.5660377372548265}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.554165813158143, "pos_rank": 24.27358490566038, "mrr": 0.07591504424388679, "r1": 0.0, "r10": 0.20754716981132076, "r50": 0.9811320754716981, "auc": 0.5471698152569106}}, "epoch_idx": 1, "edge_path_idx": 0, "edge_chunk_idx": 0} 4 | {"epoch_idx": 1, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 1, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.554165813158143, "pos_rank": 24.27358490566038, "mrr": 0.07591504424388679, "r1": 0.0, "r10": 0.20754716981132076, "r50": 0.9811320754716981, "auc": 0.5471698152569106}}} 5 | {"lhs_partition": 0, "rhs_partition": 0, "index": 3, "stats": {"count": 1022, "metrics": {"loss": 22.591744126172447, "reg": 0.0, "violators_lhs": 30.874755381604697, "violators_rhs": 23.437377690802347}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.54162585960244, "pos_rank": 24.30188679245283, "mrr": 0.08181398664161844, 
"r1": 0.018867924528301886, "r10": 0.2169811320754717, "r50": 0.9622641509433962, "auc": 0.5377358521492976}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.498737011315686, "pos_rank": 24.61320754716981, "mrr": 0.07638183331011601, "r1": 0.009433962264150943, "r10": 0.2169811320754717, "r50": 0.9433962264150944, "auc": 0.5660377380982885}}, "epoch_idx": 2, "edge_path_idx": 0, "edge_chunk_idx": 0} 6 | {"epoch_idx": 2, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 2, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.498737011315686, "pos_rank": 24.61320754716981, "mrr": 0.07638183331011601, "r1": 0.009433962264150943, "r10": 0.2169811320754717, "r50": 0.9433962264150944, "auc": 0.5660377380982885}}} 7 | {"lhs_partition": 0, "rhs_partition": 0, "index": 4, "stats": {"count": 1022, "metrics": {"loss": 22.62260271658403, "reg": 0.0, "violators_lhs": 30.645792563600782, "violators_rhs": 23.364970645792564}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.508187914794345, "pos_rank": 24.88679245283019, "mrr": 0.08207970859377452, "r1": 0.009433962264150943, "r10": 0.20754716981132076, "r50": 0.9245283018867925, "auc": 0.4905660402662349}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.506183201411986, "pos_rank": 24.67924528301887, "mrr": 0.0760200916490746, "r1": 0.0, "r10": 0.24528301886792453, "r50": 0.9339622641509434, "auc": 0.5283018881982228}}, "epoch_idx": 3, "edge_path_idx": 0, "edge_chunk_idx": 0} 8 | {"epoch_idx": 3, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 3, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.506183201411986, "pos_rank": 24.67924528301887, "mrr": 0.0760200916490746, "r1": 0.0, "r10": 0.24528301886792453, "r50": 0.9339622641509434, "auc": 0.5283018881982228}}} 9 | {"lhs_partition": 0, "rhs_partition": 0, "index": 5, "stats": {"count": 1022, "metrics": {"loss": 22.690110387634157, "reg": 0.0, "violators_lhs": 30.770058708414872, "violators_rhs": 23.117416829745597}}, 
"eval_stats_before": {"count": 53, "metrics": {"loss": 21.526717563845075, "pos_rank": 24.89622641509434, "mrr": 0.07438522921699398, "r1": 0.0, "r10": 0.2358490566037736, "r50": 0.9622641509433962, "auc": 0.5188679262152258}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.4956772462377, "pos_rank": 24.88679245283019, "mrr": 0.0729775528405916, "r1": 0.0, "r10": 0.19811320754716982, "r50": 0.9622641509433962, "auc": 0.5094339650756908}}, "epoch_idx": 4, "edge_path_idx": 0, "edge_chunk_idx": 0} 10 | {"epoch_idx": 4, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 4, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.4956772462377, "pos_rank": 24.88679245283019, "mrr": 0.0729775528405916, "r1": 0.0, "r10": 0.19811320754716982, "r50": 0.9622641509433962, "auc": 0.5094339650756908}}} 11 | {"lhs_partition": 0, "rhs_partition": 0, "index": 6, "stats": {"count": 1022, "metrics": {"loss": 22.763349835420076, "reg": 0.0, "violators_lhs": 31.012720156555773, "violators_rhs": 22.820939334637966}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.493189523804862, "pos_rank": 24.38679245283019, "mrr": 0.0724140086896577, "r1": 0.0, "r10": 0.19811320754716982, "r50": 0.9528301886792453, "auc": 0.500000000843462}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.495307175618297, "pos_rank": 24.735849056603772, "mrr": 0.08363932611877625, "r1": 0.018867924528301886, "r10": 0.18867924528301888, "r50": 0.9245283018867925, "auc": 0.5283018893228387}}, "epoch_idx": 5, "edge_path_idx": 0, "edge_chunk_idx": 0} 12 | {"epoch_idx": 5, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 5, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.495307175618297, "pos_rank": 24.735849056603772, "mrr": 0.08363932611877625, "r1": 0.018867924528301886, "r10": 0.18867924528301888, "r50": 0.9245283018867925, "auc": 0.5283018893228387}}} 13 | {"lhs_partition": 0, "rhs_partition": 0, "index": 7, "stats": {"count": 1022, "metrics": {"loss": 
22.76916164241425, "reg": 0.0, "violators_lhs": 32.49412915851272, "violators_rhs": 26.92367906066536}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.49149281123899, "pos_rank": 24.566037735849058, "mrr": 0.0773335443458186, "r1": 0.0, "r10": 0.22641509433962265, "r50": 0.9433962264150944, "auc": 0.5660377392229045}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.490976243648888, "pos_rank": 23.92452830188679, "mrr": 0.08941427604207453, "r1": 0.009433962264150943, "r10": 0.24528301886792453, "r50": 0.9433962264150944, "auc": 0.5283018876359148}}, "epoch_idx": 6, "edge_path_idx": 0, "edge_chunk_idx": 0} 14 | {"epoch_idx": 6, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 6, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.490976243648888, "pos_rank": 23.92452830188679, "mrr": 0.08941427604207453, "r1": 0.009433962264150943, "r10": 0.24528301886792453, "r50": 0.9433962264150944, "auc": 0.5283018876359148}}} 15 | {"lhs_partition": 0, "rhs_partition": 0, "index": 8, "stats": {"count": 1022, "metrics": {"loss": 22.794376134405862, "reg": 0.0, "violators_lhs": 31.71917808219178, "violators_rhs": 23.874755381604697}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.490235013781852, "pos_rank": 23.90566037735849, "mrr": 0.08131686016425209, "r1": 0.009433962264150943, "r10": 0.2641509433962264, "r50": 0.9528301886792453, "auc": 0.6792452849869458}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.488997333454638, "pos_rank": 24.28301886792453, "mrr": 0.09202509533332766, "r1": 0.009433962264150943, "r10": 0.22641509433962265, "r50": 0.9811320754716981, "auc": 0.50000000140577}}, "epoch_idx": 7, "edge_path_idx": 0, "edge_chunk_idx": 0} 16 | {"epoch_idx": 7, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 7, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.488997333454638, "pos_rank": 24.28301886792453, "mrr": 0.09202509533332766, "r1": 0.009433962264150943, "r10": 0.22641509433962265, "r50": 
0.9811320754716981, "auc": 0.50000000140577}}} 17 | {"lhs_partition": 0, "rhs_partition": 0, "index": 9, "stats": {"count": 1022, "metrics": {"loss": 22.79033338020459, "reg": 0.0, "violators_lhs": 31.304305283757337, "violators_rhs": 23.480430528375734}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.48922288642739, "pos_rank": 23.849056603773583, "mrr": 0.09737446559768803, "r1": 0.018867924528301886, "r10": 0.25471698113207547, "r50": 0.9622641509433962, "auc": 0.5943396251718953}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.489219827472038, "pos_rank": 24.80188679245283, "mrr": 0.0892462246822861, "r1": 0.009433962264150943, "r10": 0.19811320754716982, "r50": 0.9433962264150944, "auc": 0.5377358524304516}}, "epoch_idx": 8, "edge_path_idx": 0, "edge_chunk_idx": 0} 18 | {"epoch_idx": 8, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 8, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.489219827472038, "pos_rank": 24.80188679245283, "mrr": 0.0892462246822861, "r1": 0.009433962264150943, "r10": 0.19811320754716982, "r50": 0.9433962264150944, "auc": 0.5377358524304516}}} 19 | {"lhs_partition": 0, "rhs_partition": 0, "index": 10, "stats": {"count": 1022, "metrics": {"loss": 22.792577922694136, "reg": 0.0, "violators_lhs": 31.645792563600782, "violators_rhs": 23.681996086105674}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.489368258782154, "pos_rank": 24.528301886792452, "mrr": 0.08778320558650314, "r1": 0.018867924528301886, "r10": 0.24528301886792453, "r50": 0.9433962264150944, "auc": 0.5660377378171345}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.48918512632262, "pos_rank": 24.132075471698112, "mrr": 0.0999531695127206, "r1": 0.018867924528301886, "r10": 0.24528301886792453, "r50": 0.9433962264150944, "auc": 0.5188679267775338}}, "epoch_idx": 9, "edge_path_idx": 0, "edge_chunk_idx": 0} 20 | {"epoch_idx": 9, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 9, "eval_stats_chunk_avg": {"count": 53, 
"metrics": {"loss": 21.48918512632262, "pos_rank": 24.132075471698112, "mrr": 0.0999531695127206, "r1": 0.018867924528301886, "r10": 0.24528301886792453, "r50": 0.9433962264150944, "auc": 0.5188679267775338}}} 21 | -------------------------------------------------------------------------------- /tests/data/pbg_training/pbg_graph.txt: -------------------------------------------------------------------------------- 1 | C.2 r0 G.0 2 | C.2 r0 G.4 3 | C.2 r0 G.14 4 | C.2 r0 G.22 5 | C.2 r0 G.25 6 | C.2 r0 G.43 7 | C.3 r0 G.1 8 | C.3 r0 G.11 9 | C.3 r0 G.13 10 | C.3 r0 G.17 11 | C.3 r0 G.20 12 | C.3 r0 G.24 13 | C.3 r0 G.25 14 | C.3 r0 G.36 15 | C.3 r0 G.44 16 | C.3 r0 G.45 17 | C.4 r0 G.0 18 | C.4 r0 G.5 19 | C.4 r0 G.9 20 | C.4 r0 G.11 21 | C.4 r0 G.26 22 | C.4 r0 G.35 23 | C.4 r0 G.52 24 | C.8 r0 G.2 25 | C.8 r0 G.7 26 | C.8 r0 G.32 27 | C.8 r0 G.40 28 | C.8 r0 G.52 29 | C.16 r0 G.0 30 | C.16 r0 G.3 31 | C.16 r0 G.5 32 | C.16 r0 G.6 33 | C.16 r0 G.7 34 | C.16 r0 G.9 35 | C.16 r0 G.16 36 | C.16 r0 G.23 37 | C.16 r0 G.27 38 | C.16 r0 G.30 39 | C.16 r0 G.32 40 | C.16 r0 G.43 41 | C.16 r0 G.44 42 | C.16 r0 G.46 43 | C.16 r0 G.49 44 | C.16 r0 G.52 45 | C.22 r0 G.6 46 | C.22 r0 G.14 47 | C.22 r0 G.16 48 | C.22 r0 G.17 49 | C.22 r0 G.20 50 | C.22 r0 G.21 51 | C.22 r0 G.24 52 | C.22 r0 G.33 53 | C.22 r0 G.34 54 | C.27 r0 G.4 55 | C.27 r0 G.8 56 | C.27 r0 G.9 57 | C.27 r0 G.13 58 | C.27 r0 G.20 59 | C.27 r0 G.27 60 | C.27 r0 G.30 61 | C.27 r0 G.36 62 | C.30 r0 G.5 63 | C.30 r0 G.7 64 | C.30 r0 G.16 65 | C.30 r0 G.24 66 | C.30 r0 G.26 67 | C.30 r0 G.28 68 | C.30 r0 G.37 69 | C.30 r0 G.40 70 | C.30 r0 G.43 71 | C.45 r0 G.11 72 | C.45 r0 G.12 73 | C.45 r0 G.27 74 | C.45 r0 G.40 75 | C.47 r0 G.9 76 | C.47 r0 G.21 77 | C.47 r0 G.32 78 | C.47 r0 G.34 79 | C.47 r0 G.35 80 | C.47 r0 G.36 81 | C.47 r0 G.41 82 | C.47 r0 G.46 83 | C.52 r0 G.0 84 | C.52 r0 G.3 85 | C.52 r0 G.7 86 | C.52 r0 G.20 87 | C.52 r0 G.24 88 | C.52 r0 G.25 89 | C.52 r0 G.33 90 | C.52 r0 G.35 91 | C.52 r0 G.43 92 | 
C.52 r0 G.49 93 | C.68 r0 G.1 94 | C.68 r0 G.4 95 | C.68 r0 G.8 96 | C.68 r0 G.17 97 | C.68 r0 G.20 98 | C.68 r0 G.21 99 | C.68 r0 G.23 100 | C.68 r0 G.24 101 | C.68 r0 G.25 102 | C.68 r0 G.34 103 | C.68 r0 G.51 104 | C.72 r0 G.3 105 | C.72 r0 G.13 106 | C.72 r0 G.25 107 | C.72 r0 G.41 108 | C.74 r0 G.6 109 | C.74 r0 G.7 110 | C.74 r0 G.10 111 | C.74 r0 G.13 112 | C.74 r0 G.16 113 | C.74 r0 G.27 114 | C.74 r0 G.32 115 | C.74 r0 G.35 116 | C.74 r0 G.43 117 | C.74 r0 G.52 118 | C.80 r0 G.6 119 | C.80 r0 G.24 120 | C.90 r0 G.7 121 | C.90 r0 G.9 122 | C.90 r0 G.15 123 | C.90 r0 G.16 124 | C.90 r0 G.17 125 | C.90 r0 G.24 126 | C.90 r0 G.25 127 | C.90 r0 G.33 128 | C.90 r0 G.34 129 | C.90 r0 G.35 130 | C.90 r0 G.39 131 | C.91 r0 G.5 132 | C.91 r0 G.9 133 | C.91 r0 G.14 134 | C.91 r0 G.23 135 | C.91 r0 G.37 136 | C.91 r0 G.42 137 | C.95 r0 G.21 138 | C.95 r0 G.29 139 | C.95 r0 G.37 140 | C.98 r0 G.4 141 | C.98 r0 G.7 142 | C.98 r0 G.13 143 | C.98 r0 G.15 144 | C.98 r0 G.23 145 | C.98 r0 G.24 146 | C.98 r0 G.32 147 | C.98 r0 G.35 148 | C.98 r0 G.36 149 | C.98 r0 G.43 150 | C.98 r0 G.46 151 | C.98 r0 G.49 152 | C.98 r0 G.51 153 | C.98 r0 G.52 154 | C.0 r1 G.1 155 | C.0 r1 G.16 156 | C.0 r1 G.38 157 | C.2 r1 G.33 158 | C.2 r1 G.34 159 | C.2 r1 G.41 160 | C.3 r1 G.16 161 | C.3 r1 G.52 162 | C.4 r1 G.7 163 | C.4 r1 G.10 164 | C.4 r1 G.31 165 | C.4 r1 G.32 166 | C.4 r1 G.36 167 | C.4 r1 G.46 168 | C.4 r1 G.47 169 | C.5 r1 G.11 170 | C.5 r1 G.12 171 | C.5 r1 G.19 172 | C.5 r1 G.20 173 | C.5 r1 G.24 174 | C.5 r1 G.34 175 | C.5 r1 G.43 176 | C.6 r1 G.7 177 | C.6 r1 G.10 178 | C.6 r1 G.11 179 | C.6 r1 G.13 180 | C.6 r1 G.14 181 | C.6 r1 G.16 182 | C.6 r1 G.33 183 | C.6 r1 G.46 184 | C.6 r1 G.48 185 | C.7 r1 G.2 186 | C.7 r1 G.10 187 | C.7 r1 G.25 188 | C.7 r1 G.28 189 | C.7 r1 G.33 190 | C.7 r1 G.43 191 | C.7 r1 G.45 192 | C.8 r1 G.34 193 | C.8 r1 G.51 194 | C.14 r1 G.0 195 | C.14 r1 G.7 196 | C.14 r1 G.13 197 | C.14 r1 G.16 198 | C.14 r1 G.33 199 | C.14 r1 G.34 200 | C.14 r1 G.47 
201 | C.14 r1 G.53 202 | C.16 r1 G.14 203 | C.16 r1 G.33 204 | C.17 r1 G.4 205 | C.17 r1 G.8 206 | C.17 r1 G.28 207 | C.17 r1 G.34 208 | C.17 r1 G.39 209 | C.17 r1 G.46 210 | C.17 r1 G.53 211 | C.18 r1 G.6 212 | C.18 r1 G.31 213 | C.18 r1 G.43 214 | C.18 r1 G.46 215 | C.18 r1 G.49 216 | C.19 r1 G.1 217 | C.19 r1 G.7 218 | C.19 r1 G.9 219 | C.19 r1 G.34 220 | C.19 r1 G.43 221 | C.19 r1 G.46 222 | C.20 r1 G.17 223 | C.20 r1 G.24 224 | C.20 r1 G.33 225 | C.21 r1 G.0 226 | C.21 r1 G.2 227 | C.21 r1 G.20 228 | C.21 r1 G.30 229 | C.21 r1 G.34 230 | C.21 r1 G.42 231 | C.21 r1 G.51 232 | C.22 r1 G.1 233 | C.22 r1 G.4 234 | C.22 r1 G.7 235 | C.22 r1 G.23 236 | C.22 r1 G.36 237 | C.22 r1 G.46 238 | C.23 r1 G.4 239 | C.23 r1 G.7 240 | C.23 r1 G.9 241 | C.23 r1 G.11 242 | C.23 r1 G.16 243 | C.23 r1 G.41 244 | C.23 r1 G.52 245 | C.26 r1 G.2 246 | C.26 r1 G.14 247 | C.26 r1 G.32 248 | C.26 r1 G.35 249 | C.27 r1 G.24 250 | C.27 r1 G.25 251 | C.27 r1 G.35 252 | C.27 r1 G.47 253 | C.27 r1 G.52 254 | C.29 r1 G.9 255 | C.29 r1 G.11 256 | C.29 r1 G.14 257 | C.29 r1 G.19 258 | C.29 r1 G.20 259 | C.29 r1 G.24 260 | C.29 r1 G.34 261 | C.29 r1 G.35 262 | C.29 r1 G.42 263 | C.29 r1 G.47 264 | C.29 r1 G.52 265 | C.30 r1 G.23 266 | C.30 r1 G.35 267 | C.30 r1 G.47 268 | C.32 r1 G.4 269 | C.32 r1 G.7 270 | C.32 r1 G.11 271 | C.32 r1 G.17 272 | C.32 r1 G.33 273 | C.32 r1 G.46 274 | C.33 r1 G.0 275 | C.33 r1 G.2 276 | C.33 r1 G.8 277 | C.33 r1 G.16 278 | C.33 r1 G.20 279 | C.33 r1 G.24 280 | C.33 r1 G.32 281 | C.33 r1 G.34 282 | C.33 r1 G.43 283 | C.33 r1 G.47 284 | C.33 r1 G.52 285 | C.33 r1 G.53 286 | C.36 r1 G.16 287 | C.36 r1 G.31 288 | C.36 r1 G.51 289 | C.37 r1 G.7 290 | C.37 r1 G.13 291 | C.37 r1 G.17 292 | C.37 r1 G.18 293 | C.37 r1 G.34 294 | C.37 r1 G.35 295 | C.37 r1 G.37 296 | C.37 r1 G.49 297 | C.37 r1 G.50 298 | C.38 r1 G.13 299 | C.38 r1 G.43 300 | C.39 r1 G.2 301 | C.39 r1 G.6 302 | C.39 r1 G.22 303 | C.39 r1 G.39 304 | C.39 r1 G.50 305 | C.40 r1 G.14 306 | C.40 r1 G.15 307 | 
C.40 r1 G.17 308 | C.40 r1 G.34 309 | C.40 r1 G.39 310 | C.41 r1 G.7 311 | C.41 r1 G.13 312 | C.41 r1 G.31 313 | C.41 r1 G.34 314 | C.41 r1 G.42 315 | C.41 r1 G.46 316 | C.42 r1 G.3 317 | C.42 r1 G.4 318 | C.42 r1 G.9 319 | C.42 r1 G.11 320 | C.42 r1 G.14 321 | C.42 r1 G.20 322 | C.42 r1 G.24 323 | C.42 r1 G.31 324 | C.42 r1 G.46 325 | C.45 r1 G.8 326 | C.45 r1 G.17 327 | C.45 r1 G.31 328 | C.45 r1 G.35 329 | C.45 r1 G.46 330 | C.45 r1 G.47 331 | C.45 r1 G.52 332 | C.47 r1 G.2 333 | C.47 r1 G.52 334 | C.49 r1 G.0 335 | C.49 r1 G.7 336 | C.49 r1 G.27 337 | C.49 r1 G.34 338 | C.49 r1 G.35 339 | C.49 r1 G.36 340 | C.49 r1 G.47 341 | C.50 r1 G.4 342 | C.50 r1 G.8 343 | C.50 r1 G.11 344 | C.50 r1 G.14 345 | C.50 r1 G.16 346 | C.50 r1 G.20 347 | C.50 r1 G.31 348 | C.50 r1 G.32 349 | C.50 r1 G.43 350 | C.51 r1 G.15 351 | C.51 r1 G.23 352 | C.51 r1 G.30 353 | C.51 r1 G.31 354 | C.51 r1 G.46 355 | C.52 r1 G.17 356 | C.52 r1 G.30 357 | C.53 r1 G.30 358 | C.53 r1 G.33 359 | C.53 r1 G.43 360 | C.54 r1 G.34 361 | C.54 r1 G.36 362 | C.55 r1 G.1 363 | C.55 r1 G.3 364 | C.55 r1 G.16 365 | C.55 r1 G.18 366 | C.55 r1 G.34 367 | C.55 r1 G.43 368 | C.55 r1 G.46 369 | C.55 r1 G.52 370 | C.56 r1 G.0 371 | C.56 r1 G.1 372 | C.56 r1 G.6 373 | C.56 r1 G.24 374 | C.56 r1 G.26 375 | C.56 r1 G.46 376 | C.58 r1 G.27 377 | C.58 r1 G.31 378 | C.58 r1 G.48 379 | C.58 r1 G.50 380 | C.60 r1 G.13 381 | C.60 r1 G.16 382 | C.60 r1 G.31 383 | C.60 r1 G.37 384 | C.60 r1 G.43 385 | C.60 r1 G.52 386 | C.62 r1 G.8 387 | C.62 r1 G.9 388 | C.62 r1 G.14 389 | C.62 r1 G.20 390 | C.62 r1 G.26 391 | C.62 r1 G.34 392 | C.62 r1 G.46 393 | C.62 r1 G.52 394 | C.63 r1 G.3 395 | C.63 r1 G.35 396 | C.65 r1 G.0 397 | C.65 r1 G.7 398 | C.65 r1 G.24 399 | C.65 r1 G.25 400 | C.65 r1 G.46 401 | C.67 r1 G.0 402 | C.67 r1 G.2 403 | C.67 r1 G.7 404 | C.67 r1 G.8 405 | C.67 r1 G.14 406 | C.67 r1 G.46 407 | C.67 r1 G.52 408 | C.68 r1 G.31 409 | C.69 r1 G.4 410 | C.69 r1 G.22 411 | C.69 r1 G.46 412 | C.69 r1 G.49 413 | C.70 r1 
G.1 414 | C.70 r1 G.7 415 | C.70 r1 G.8 416 | C.70 r1 G.9 417 | C.70 r1 G.17 418 | C.70 r1 G.31 419 | C.70 r1 G.32 420 | C.70 r1 G.35 421 | C.70 r1 G.37 422 | C.70 r1 G.43 423 | C.71 r1 G.2 424 | C.71 r1 G.26 425 | C.71 r1 G.32 426 | C.71 r1 G.35 427 | C.71 r1 G.43 428 | C.71 r1 G.46 429 | C.72 r1 G.16 430 | C.72 r1 G.43 431 | C.72 r1 G.52 432 | C.73 r1 G.8 433 | C.73 r1 G.33 434 | C.73 r1 G.47 435 | C.75 r1 G.4 436 | C.75 r1 G.7 437 | C.75 r1 G.17 438 | C.75 r1 G.33 439 | C.75 r1 G.35 440 | C.75 r1 G.37 441 | C.75 r1 G.47 442 | C.77 r1 G.7 443 | C.77 r1 G.15 444 | C.77 r1 G.16 445 | C.77 r1 G.34 446 | C.77 r1 G.36 447 | C.77 r1 G.49 448 | C.77 r1 G.52 449 | C.79 r1 G.1 450 | C.79 r1 G.4 451 | C.79 r1 G.11 452 | C.79 r1 G.13 453 | C.79 r1 G.16 454 | C.79 r1 G.17 455 | C.79 r1 G.21 456 | C.79 r1 G.31 457 | C.79 r1 G.37 458 | C.79 r1 G.44 459 | C.79 r1 G.46 460 | C.80 r1 G.22 461 | C.80 r1 G.43 462 | C.80 r1 G.46 463 | C.81 r1 G.3 464 | C.81 r1 G.4 465 | C.81 r1 G.17 466 | C.81 r1 G.30 467 | C.81 r1 G.33 468 | C.81 r1 G.35 469 | C.82 r1 G.11 470 | C.82 r1 G.13 471 | C.82 r1 G.25 472 | C.82 r1 G.31 473 | C.82 r1 G.34 474 | C.82 r1 G.42 475 | C.83 r1 G.1 476 | C.83 r1 G.6 477 | C.83 r1 G.36 478 | C.83 r1 G.42 479 | C.84 r1 G.25 480 | C.84 r1 G.30 481 | C.84 r1 G.41 482 | C.84 r1 G.42 483 | C.84 r1 G.43 484 | C.88 r1 G.4 485 | C.88 r1 G.7 486 | C.88 r1 G.11 487 | C.88 r1 G.13 488 | C.88 r1 G.22 489 | C.88 r1 G.32 490 | C.88 r1 G.34 491 | C.88 r1 G.35 492 | C.88 r1 G.37 493 | C.88 r1 G.42 494 | C.89 r1 G.10 495 | C.89 r1 G.19 496 | C.89 r1 G.20 497 | C.89 r1 G.21 498 | C.89 r1 G.26 499 | C.89 r1 G.35 500 | C.89 r1 G.38 501 | C.90 r1 G.3 502 | C.91 r1 G.2 503 | C.91 r1 G.7 504 | C.91 r1 G.11 505 | C.91 r1 G.34 506 | C.91 r1 G.35 507 | C.91 r1 G.36 508 | C.91 r1 G.47 509 | C.91 r1 G.51 510 | C.93 r1 G.7 511 | C.93 r1 G.25 512 | C.93 r1 G.46 513 | C.93 r1 G.49 514 | C.94 r1 G.14 515 | C.94 r1 G.25 516 | C.94 r1 G.34 517 | C.94 r1 G.47 518 | C.94 r1 G.52 519 | C.95 r1 G.32 
520 | C.95 r1 G.40 521 | C.95 r1 G.43 522 | C.95 r1 G.46 523 | C.0 r2 G.2 524 | C.0 r2 G.25 525 | C.0 r2 G.31 526 | C.0 r2 G.46 527 | C.0 r2 G.47 528 | C.1 r2 G.8 529 | C.1 r2 G.19 530 | C.1 r2 G.33 531 | C.1 r2 G.34 532 | C.1 r2 G.35 533 | C.1 r2 G.36 534 | C.1 r2 G.47 535 | C.2 r2 G.2 536 | C.3 r2 G.19 537 | C.3 r2 G.31 538 | C.3 r2 G.34 539 | C.3 r2 G.35 540 | C.4 r2 G.19 541 | C.4 r2 G.34 542 | C.5 r2 G.16 543 | C.5 r2 G.45 544 | C.5 r2 G.52 545 | C.6 r2 G.25 546 | C.7 r2 G.31 547 | C.8 r2 G.20 548 | C.8 r2 G.24 549 | C.8 r2 G.31 550 | C.8 r2 G.33 551 | C.8 r2 G.35 552 | C.8 r2 G.47 553 | C.9 r2 G.6 554 | C.9 r2 G.19 555 | C.9 r2 G.24 556 | C.9 r2 G.25 557 | C.9 r2 G.34 558 | C.11 r2 G.3 559 | C.11 r2 G.19 560 | C.11 r2 G.31 561 | C.11 r2 G.36 562 | C.11 r2 G.40 563 | C.11 r2 G.46 564 | C.12 r2 G.1 565 | C.12 r2 G.7 566 | C.12 r2 G.15 567 | C.12 r2 G.31 568 | C.13 r2 G.24 569 | C.13 r2 G.25 570 | C.13 r2 G.37 571 | C.14 r2 G.11 572 | C.14 r2 G.14 573 | C.15 r2 G.47 574 | C.17 r2 G.5 575 | C.17 r2 G.7 576 | C.17 r2 G.11 577 | C.17 r2 G.35 578 | C.17 r2 G.52 579 | C.19 r2 G.17 580 | C.19 r2 G.31 581 | C.19 r2 G.47 582 | C.20 r2 G.46 583 | C.21 r2 G.19 584 | C.21 r2 G.35 585 | C.21 r2 G.47 586 | C.22 r2 G.25 587 | C.22 r2 G.31 588 | C.22 r2 G.47 589 | C.23 r2 G.17 590 | C.23 r2 G.31 591 | C.23 r2 G.43 592 | C.23 r2 G.46 593 | C.24 r2 G.31 594 | C.24 r2 G.43 595 | C.24 r2 G.47 596 | C.26 r2 G.28 597 | C.27 r2 G.7 598 | C.27 r2 G.14 599 | C.27 r2 G.31 600 | C.27 r2 G.46 601 | C.28 r2 G.2 602 | C.28 r2 G.7 603 | C.28 r2 G.17 604 | C.30 r2 G.2 605 | C.30 r2 G.11 606 | C.30 r2 G.14 607 | C.30 r2 G.17 608 | C.30 r2 G.29 609 | C.30 r2 G.34 610 | C.30 r2 G.46 611 | C.30 r2 G.52 612 | C.31 r2 G.7 613 | C.31 r2 G.16 614 | C.32 r2 G.16 615 | C.32 r2 G.43 616 | C.33 r2 G.19 617 | C.33 r2 G.35 618 | C.34 r2 G.23 619 | C.34 r2 G.37 620 | C.34 r2 G.42 621 | C.34 r2 G.47 622 | C.35 r2 G.14 623 | C.35 r2 G.19 624 | C.36 r2 G.24 625 | C.36 r2 G.33 626 | C.36 r2 G.46 627 | C.37 r2 
G.33 628 | C.38 r2 G.25 629 | C.39 r2 G.7 630 | C.39 r2 G.43 631 | C.40 r2 G.2 632 | C.40 r2 G.31 633 | C.41 r2 G.9 634 | C.43 r2 G.9 635 | C.43 r2 G.22 636 | C.43 r2 G.26 637 | C.43 r2 G.31 638 | C.43 r2 G.39 639 | C.44 r2 G.3 640 | C.44 r2 G.17 641 | C.44 r2 G.24 642 | C.44 r2 G.33 643 | C.44 r2 G.46 644 | C.45 r2 G.19 645 | C.45 r2 G.24 646 | C.45 r2 G.34 647 | C.46 r2 G.0 648 | C.48 r2 G.0 649 | C.48 r2 G.22 650 | C.48 r2 G.33 651 | C.48 r2 G.34 652 | C.48 r2 G.35 653 | C.48 r2 G.52 654 | C.49 r2 G.31 655 | C.49 r2 G.46 656 | C.50 r2 G.2 657 | C.50 r2 G.33 658 | C.50 r2 G.46 659 | C.50 r2 G.52 660 | C.51 r2 G.4 661 | C.51 r2 G.11 662 | C.51 r2 G.25 663 | C.51 r2 G.33 664 | C.51 r2 G.34 665 | C.52 r2 G.31 666 | C.52 r2 G.46 667 | C.53 r2 G.7 668 | C.53 r2 G.34 669 | C.53 r2 G.47 670 | C.54 r2 G.42 671 | C.55 r2 G.7 672 | C.55 r2 G.25 673 | C.56 r2 G.13 674 | C.56 r2 G.25 675 | C.56 r2 G.31 676 | C.56 r2 G.33 677 | C.56 r2 G.47 678 | C.57 r2 G.13 679 | C.57 r2 G.18 680 | C.57 r2 G.21 681 | C.57 r2 G.24 682 | C.57 r2 G.46 683 | C.58 r2 G.46 684 | C.58 r2 G.49 685 | C.59 r2 G.11 686 | C.59 r2 G.23 687 | C.59 r2 G.34 688 | C.59 r2 G.40 689 | C.59 r2 G.43 690 | C.59 r2 G.47 691 | C.60 r2 G.25 692 | C.60 r2 G.34 693 | C.60 r2 G.47 694 | C.60 r2 G.49 695 | C.61 r2 G.11 696 | C.61 r2 G.34 697 | C.61 r2 G.39 698 | C.61 r2 G.46 699 | C.62 r2 G.23 700 | C.62 r2 G.33 701 | C.62 r2 G.47 702 | C.63 r2 G.9 703 | C.64 r2 G.1 704 | C.64 r2 G.11 705 | C.64 r2 G.18 706 | C.64 r2 G.19 707 | C.64 r2 G.21 708 | C.64 r2 G.48 709 | C.64 r2 G.52 710 | C.65 r2 G.11 711 | C.65 r2 G.19 712 | C.65 r2 G.28 713 | C.65 r2 G.35 714 | C.65 r2 G.47 715 | C.66 r2 G.1 716 | C.66 r2 G.14 717 | C.66 r2 G.19 718 | C.66 r2 G.27 719 | C.67 r2 G.35 720 | C.67 r2 G.47 721 | C.68 r2 G.2 722 | C.68 r2 G.35 723 | C.68 r2 G.52 724 | C.70 r2 G.33 725 | C.70 r2 G.34 726 | C.70 r2 G.46 727 | C.71 r2 G.11 728 | C.71 r2 G.31 729 | C.71 r2 G.33 730 | C.71 r2 G.34 731 | C.72 r2 G.34 732 | C.72 r2 G.46 733 | C.73 r2 
G.31 734 | C.74 r2 G.31 735 | C.74 r2 G.46 736 | C.75 r2 G.31 737 | C.76 r2 G.1 738 | C.76 r2 G.7 739 | C.76 r2 G.27 740 | C.76 r2 G.32 741 | C.76 r2 G.33 742 | C.76 r2 G.37 743 | C.76 r2 G.44 744 | C.76 r2 G.47 745 | C.77 r2 G.3 746 | C.77 r2 G.31 747 | C.77 r2 G.46 748 | C.77 r2 G.47 749 | C.78 r2 G.25 750 | C.78 r2 G.41 751 | C.78 r2 G.52 752 | C.80 r2 G.31 753 | C.80 r2 G.47 754 | C.81 r2 G.25 755 | C.81 r2 G.46 756 | C.81 r2 G.47 757 | C.82 r2 G.7 758 | C.82 r2 G.33 759 | C.82 r2 G.47 760 | C.83 r2 G.47 761 | C.83 r2 G.48 762 | C.85 r2 G.9 763 | C.85 r2 G.25 764 | C.85 r2 G.31 765 | C.85 r2 G.33 766 | C.85 r2 G.35 767 | C.85 r2 G.43 768 | C.86 r2 G.15 769 | C.86 r2 G.16 770 | C.86 r2 G.34 771 | C.86 r2 G.38 772 | C.86 r2 G.47 773 | C.87 r2 G.7 774 | C.87 r2 G.8 775 | C.87 r2 G.29 776 | C.87 r2 G.30 777 | C.87 r2 G.31 778 | C.87 r2 G.32 779 | C.87 r2 G.35 780 | C.87 r2 G.52 781 | C.88 r2 G.2 782 | C.88 r2 G.14 783 | C.88 r2 G.17 784 | C.88 r2 G.19 785 | C.88 r2 G.33 786 | C.88 r2 G.46 787 | C.88 r2 G.47 788 | C.89 r2 G.46 789 | C.90 r2 G.1 790 | C.90 r2 G.31 791 | C.90 r2 G.46 792 | C.91 r2 G.4 793 | C.91 r2 G.19 794 | C.91 r2 G.29 795 | C.91 r2 G.31 796 | C.91 r2 G.33 797 | C.92 r2 G.31 798 | C.92 r2 G.33 799 | C.93 r2 G.31 800 | C.93 r2 G.44 801 | C.94 r2 G.2 802 | C.94 r2 G.17 803 | C.94 r2 G.31 804 | C.94 r2 G.35 805 | C.95 r2 G.2 806 | C.95 r2 G.33 807 | C.95 r2 G.47 808 | C.95 r2 G.52 809 | C.96 r2 G.12 810 | C.96 r2 G.19 811 | C.97 r2 G.4 812 | C.97 r2 G.20 813 | C.97 r2 G.24 814 | C.97 r2 G.34 815 | C.97 r2 G.35 816 | C.97 r2 G.36 817 | C.98 r2 G.31 818 | C.98 r2 G.33 819 | C.99 r2 G.4 820 | C.99 r2 G.11 821 | C.99 r2 G.21 822 | C.99 r2 G.31 823 | C.99 r2 G.46 824 | C.1 r3 G.2 825 | C.1 r3 G.7 826 | C.1 r3 G.31 827 | C.2 r3 G.31 828 | C.2 r3 G.47 829 | C.3 r3 G.47 830 | C.4 r3 G.2 831 | C.4 r3 G.33 832 | C.5 r3 G.46 833 | C.5 r3 G.47 834 | C.6 r3 G.31 835 | C.7 r3 G.46 836 | C.7 r3 G.47 837 | C.8 r3 G.46 838 | C.9 r3 G.31 839 | C.9 r3 G.46 840 | C.10 r3 
G.17 841 | C.11 r3 G.2 842 | C.11 r3 G.24 843 | C.12 r3 G.47 844 | C.13 r3 G.46 845 | C.14 r3 G.1 846 | C.14 r3 G.19 847 | C.14 r3 G.31 848 | C.14 r3 G.46 849 | C.15 r3 G.6 850 | C.15 r3 G.13 851 | C.15 r3 G.31 852 | C.15 r3 G.35 853 | C.16 r3 G.31 854 | C.17 r3 G.2 855 | C.17 r3 G.33 856 | C.18 r3 G.1 857 | C.18 r3 G.47 858 | C.20 r3 G.47 859 | C.21 r3 G.33 860 | C.23 r3 G.19 861 | C.23 r3 G.47 862 | C.24 r3 G.1 863 | C.26 r3 G.4 864 | C.26 r3 G.19 865 | C.26 r3 G.33 866 | C.26 r3 G.46 867 | C.27 r3 G.2 868 | C.27 r3 G.19 869 | C.27 r3 G.33 870 | C.28 r3 G.15 871 | C.28 r3 G.19 872 | C.28 r3 G.33 873 | C.28 r3 G.34 874 | C.28 r3 G.35 875 | C.28 r3 G.46 876 | C.29 r3 G.2 877 | C.29 r3 G.4 878 | C.29 r3 G.33 879 | C.29 r3 G.46 880 | C.31 r3 G.11 881 | C.31 r3 G.19 882 | C.32 r3 G.25 883 | C.32 r3 G.31 884 | C.32 r3 G.47 885 | C.33 r3 G.31 886 | C.33 r3 G.33 887 | C.34 r3 G.11 888 | C.35 r3 G.35 889 | C.36 r3 G.47 890 | C.37 r3 G.31 891 | C.37 r3 G.47 892 | C.39 r3 G.31 893 | C.39 r3 G.47 894 | C.41 r3 G.47 895 | C.43 r3 G.17 896 | C.43 r3 G.46 897 | C.43 r3 G.52 898 | C.45 r3 G.2 899 | C.46 r3 G.31 900 | C.46 r3 G.35 901 | C.47 r3 G.31 902 | C.47 r3 G.33 903 | C.47 r3 G.47 904 | C.48 r3 G.47 905 | C.49 r3 G.2 906 | C.49 r3 G.17 907 | C.49 r3 G.19 908 | C.50 r3 G.19 909 | C.50 r3 G.47 910 | C.51 r3 G.47 911 | C.52 r3 G.47 912 | C.53 r3 G.31 913 | C.53 r3 G.46 914 | C.55 r3 G.2 915 | C.55 r3 G.19 916 | C.55 r3 G.47 917 | C.57 r3 G.17 918 | C.57 r3 G.47 919 | C.58 r3 G.47 920 | C.59 r3 G.19 921 | C.59 r3 G.31 922 | C.59 r3 G.33 923 | C.59 r3 G.46 924 | C.62 r3 G.19 925 | C.62 r3 G.31 926 | C.62 r3 G.35 927 | C.63 r3 G.47 928 | C.64 r3 G.47 929 | C.65 r3 G.33 930 | C.65 r3 G.34 931 | C.65 r3 G.52 932 | C.66 r3 G.45 933 | C.67 r3 G.1 934 | C.67 r3 G.19 935 | C.67 r3 G.33 936 | C.68 r3 G.19 937 | C.68 r3 G.33 938 | C.68 r3 G.47 939 | C.69 r3 G.31 940 | C.70 r3 G.2 941 | C.70 r3 G.19 942 | C.70 r3 G.47 943 | C.71 r3 G.47 944 | C.72 r3 G.19 945 | C.72 r3 G.31 946 | C.72 r3 
G.47 947 | C.73 r3 G.2 948 | C.73 r3 G.11 949 | C.73 r3 G.19 950 | C.73 r3 G.46 951 | C.75 r3 G.25 952 | C.76 r3 G.17 953 | C.76 r3 G.31 954 | C.77 r3 G.2 955 | C.78 r3 G.47 956 | C.79 r3 G.33 957 | C.79 r3 G.47 958 | C.83 r3 G.19 959 | C.83 r3 G.43 960 | C.83 r3 G.46 961 | C.84 r3 G.31 962 | C.84 r3 G.47 963 | C.85 r3 G.47 964 | C.86 r3 G.31 965 | C.86 r3 G.35 966 | C.87 r3 G.2 967 | C.87 r3 G.19 968 | C.87 r3 G.33 969 | C.89 r3 G.7 970 | C.89 r3 G.17 971 | C.89 r3 G.33 972 | C.90 r3 G.47 973 | C.91 r3 G.46 974 | C.92 r3 G.14 975 | C.92 r3 G.17 976 | C.92 r3 G.19 977 | C.94 r3 G.19 978 | C.94 r3 G.33 979 | C.94 r3 G.46 980 | C.95 r3 G.19 981 | C.95 r3 G.31 982 | C.95 r3 G.35 983 | C.96 r3 G.34 984 | C.96 r3 G.35 985 | C.96 r3 G.47 986 | C.97 r3 G.2 987 | C.97 r3 G.27 988 | C.97 r3 G.46 989 | C.98 r3 G.47 990 | C.0 r4 G.19 991 | C.2 r4 G.19 992 | C.3 r4 G.46 993 | C.6 r4 G.19 994 | C.7 r4 G.19 995 | C.8 r4 G.19 996 | C.9 r4 G.47 997 | C.10 r4 G.19 998 | C.10 r4 G.47 999 | C.12 r4 G.19 1000 | C.13 r4 G.19 1001 | C.15 r4 G.19 1002 | C.16 r4 G.19 1003 | C.16 r4 G.47 1004 | C.18 r4 G.19 1005 | C.19 r4 G.19 1006 | C.20 r4 G.19 1007 | C.21 r4 G.46 1008 | C.22 r4 G.19 1009 | C.24 r4 G.19 1010 | C.25 r4 G.19 1011 | C.25 r4 G.46 1012 | C.25 r4 G.53 1013 | C.30 r4 G.33 1014 | C.31 r4 G.47 1015 | C.32 r4 G.19 1016 | C.33 r4 G.46 1017 | C.34 r4 G.19 1018 | C.35 r4 G.2 1019 | C.35 r4 G.33 1020 | C.36 r4 G.19 1021 | C.37 r4 G.19 1022 | C.38 r4 G.19 1023 | C.38 r4 G.47 1024 | C.39 r4 G.19 1025 | C.40 r4 G.19 1026 | C.40 r4 G.47 1027 | C.41 r4 G.19 1028 | C.42 r4 G.19 1029 | C.42 r4 G.47 1030 | C.44 r4 G.19 1031 | C.44 r4 G.47 1032 | C.45 r4 G.33 1033 | C.46 r4 G.19 1034 | C.47 r4 G.19 1035 | C.48 r4 G.19 1036 | C.51 r4 G.19 1037 | C.52 r4 G.19 1038 | C.53 r4 G.19 1039 | C.54 r4 G.19 1040 | C.56 r4 G.19 1041 | C.57 r4 G.19 1042 | C.58 r4 G.19 1043 | C.60 r4 G.19 1044 | C.61 r4 G.19 1045 | C.61 r4 G.25 1046 | C.63 r4 G.19 1047 | C.63 r4 G.31 1048 | C.66 r4 G.47 1049 | C.69 r4 G.19 
1050 | C.69 r4 G.47 1051 | C.71 r4 G.19 1052 | C.74 r4 G.19 1053 | C.74 r4 G.47 1054 | C.75 r4 G.19 1055 | C.75 r4 G.46 1056 | C.77 r4 G.33 1057 | C.78 r4 G.19 1058 | C.79 r4 G.19 1059 | C.80 r4 G.19 1060 | C.81 r4 G.19 1061 | C.82 r4 G.19 1062 | C.82 r4 G.46 1063 | C.83 r4 G.31 1064 | C.84 r4 G.19 1065 | C.85 r4 G.19 1066 | C.86 r4 G.19 1067 | C.89 r4 G.2 1068 | C.90 r4 G.19 1069 | C.92 r4 G.46 1070 | C.93 r4 G.19 1071 | C.93 r4 G.47 1072 | C.96 r4 G.33 1073 | C.98 r4 G.19 1074 | C.99 r4 G.2 1075 | C.99 r4 G.19 1076 | -------------------------------------------------------------------------------- /tests/data/preprocessed/atac_preprocessed.h5ad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huidongchen/simba/534e0b022ea1163face30263696f28b9a955c291/tests/data/preprocessed/atac_preprocessed.h5ad -------------------------------------------------------------------------------- /tests/data/preprocessed/rna_preprocessed.h5ad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huidongchen/simba/534e0b022ea1163face30263696f28b9a955c291/tests/data/preprocessed/rna_preprocessed.h5ad -------------------------------------------------------------------------------- /tests/test_pbg_training.py: -------------------------------------------------------------------------------- 1 | import simba as si 2 | import pytest 3 | 4 | 5 | @pytest.fixture 6 | def adata_CG(): 7 | return si.read_h5ad( 8 | "tests/data/preprocessed/rna_preprocessed.h5ad") 9 | 10 | 11 | @pytest.fixture 12 | def adata_CP(): 13 | return si.read_h5ad( 14 | "tests/data/preprocessed/atac_preprocessed.h5ad") 15 | 16 | 17 | def test_gen_graph(adata_CG, adata_CP, tmp_path): 18 | si.settings.set_workdir(tmp_path / "simba_rna") 19 | si.tl.gen_graph(list_CG=[adata_CG], 20 | copy=False, 21 | dirname='graph0') 22 | si.tl.gen_graph(list_CG=[adata_CG], 23 | copy=False, 24 | add_edge_weights=True, 
25 | dirname='graph1') 26 | si.tl.gen_graph(list_adata=[adata_CG], 27 | copy=False, 28 | dirname='graph2') 29 | si.tl.gen_graph(list_adata=[adata_CG], 30 | copy=False, 31 | add_edge_weights=True, 32 | dirname='graph3') 33 | si.tl.gen_graph(list_adata=[adata_CG, adata_CP], 34 | copy=False, 35 | add_edge_weights=True, 36 | dirname='graph4') 37 | 38 | 39 | def test_pbg_training_rna(adata_CG, tmp_path): 40 | si.settings.set_workdir(tmp_path / "simba_rna") 41 | si.tl.gen_graph(list_CG=[adata_CG], 42 | copy=False, 43 | dirname='graph0') 44 | dict_config = si.settings.pbg_params.copy() 45 | si.settings.set_pbg_params(dict_config) 46 | si.tl.pbg_train(auto_wd=True, 47 | output='model0') 48 | si.tl.pbg_train(auto_wd=True, 49 | use_edge_weights=True, 50 | output='model1') 51 | si.load_graph_stats() 52 | si.load_pbg_config() 53 | si.pl.pbg_metrics(fig_ncol=1, 54 | save_fig=True) 55 | 56 | 57 | def test_pbg_training_atac(adata_CP, tmp_path): 58 | si.settings.set_workdir(tmp_path / "simba_atac") 59 | si.tl.gen_graph(list_CP=[adata_CP], 60 | copy=False, 61 | dirname='graph0') 62 | si.tl.pbg_train(auto_wd=True, 63 | output='model') 64 | si.pl.pbg_metrics(fig_ncol=1, 65 | save_fig=True) 66 | -------------------------------------------------------------------------------- /tests/test_post_training.py: -------------------------------------------------------------------------------- 1 | import simba as si 2 | import pytest 3 | 4 | 5 | @pytest.fixture 6 | def dict_adata(): 7 | 8 | return si.read_embedding( 9 | path_emb='tests/data/pbg_training/model/', 10 | path_entity='tests/data/pbg_training/input/entity/', 11 | path_entity_alias='tests/data/pbg_training') 12 | 13 | 14 | def test_embeddding_rna(dict_adata, tmp_path): 15 | si.settings.set_workdir(tmp_path / "simba_rna") 16 | adata_C = dict_adata['C'] 17 | adata_G = dict_adata['G'] 18 | adata_all_CG = si.tl.embed( 19 | adata_ref=adata_C, 20 | list_adata_query=[adata_G], 21 | n_top=20) 22 | adata_all_CG = si.tl.embed( 23 | 
adata_ref=adata_C, 24 | list_adata_query=[adata_G]) 25 | # add annotations of cells and genes 26 | adata_all_CG.obs['entity_anno'] = "" 27 | adata_all_CG.obs.loc[adata_C.obs_names, 'entity_anno'] = 'cell' 28 | adata_all_CG.obs.loc[adata_G.obs_names, 'entity_anno'] = 'gene' 29 | 30 | si.tl.umap(adata_all_CG, 31 | n_neighbors=15, 32 | n_components=2) 33 | si.pl.umap(adata_all_CG, drawing_order='random') 34 | si.pl.umap(adata_all_CG, color=['entity_anno'], drawing_order='random') 35 | adata_cmp = si.tl.compare_entities( 36 | adata_ref=adata_C, 37 | adata_query=adata_G) 38 | si.pl.entity_metrics(adata_cmp, 39 | x='max', 40 | y='gini', 41 | show_contour=False, 42 | texts=adata_G.obs_names[:2], 43 | show_texts=True, 44 | show_cutoff=True, 45 | size=5, 46 | text_expand=(1.3, 1.5), 47 | cutoff_x=1., 48 | cutoff_y=0.3, 49 | save_fig=True) 50 | si.pl.entity_barcode(adata_cmp, 51 | layer='softmax', 52 | entities=list(adata_G.obs_names[:2]), 53 | show_cutoff=True, 54 | cutoff=0.001, 55 | fig_size=(5, 2.5), 56 | save_fig=True) 57 | query_result = si.tl.query(adata_all_CG, 58 | entity=list(adata_C.obs_names[:2]), 59 | obsm=None, 60 | use_radius=False, 61 | k=50, 62 | anno_filter='entity_anno', 63 | filters=['gene']) 64 | print(query_result.head()) 65 | si.pl.query(adata_all_CG, 66 | obsm=None, 67 | show_texts=False, 68 | color=['entity_anno'], 69 | alpha=0.9, 70 | alpha_bg=0.1, 71 | save_fig=True) 72 | query_result = si.tl.query(adata_all_CG, 73 | entity=adata_C.obs_names[0], 74 | obsm='X_umap', 75 | use_radius=True, 76 | anno_filter='entity_anno') 77 | print(query_result.head()) 78 | si.pl.query(adata_all_CG, 79 | obsm='X_umap', 80 | show_texts=False, 81 | color=['entity_anno'], 82 | alpha=0.9, 83 | alpha_bg=0.1, 84 | save_fig=True) 85 | -------------------------------------------------------------------------------- /tests/test_preprocessing.py: -------------------------------------------------------------------------------- 1 | import simba as si 2 | import pytest 3 | 4 | 5 | 
@pytest.fixture
def adata_CG():
    """Load the 10x PBMC RNA subset shipped under ``tests/data``."""
    return si.read_h5ad("tests/data/10xpbmc_rna_subset.h5ad")


@pytest.fixture
def adata_CP():
    """Load the 10x PBMC ATAC subset shipped under ``tests/data``."""
    return si.read_h5ad("tests/data/10xpbmc_atac_subset.h5ad")


def test_rna(adata_CG, tmp_path):
    """Smoke-test the RNA preprocessing calls and their plots.

    The test only checks that every call runs without raising; no output
    values are asserted.
    """
    si.settings.set_workdir(tmp_path / "simba_rna")
    si.settings.set_figure_params(dpi=80,
                                  style='white',
                                  fig_size=[5, 5],
                                  rc={'image.cmap': 'viridis'})

    # Generic feature/sample filtering and QC.
    si.pp.filter_features(adata_CG, min_n_samples=1)
    si.pp.filter_genes(adata_CG, min_n_cells=3)
    si.pp.cal_qc(adata_CG)
    si.pl.violin(adata_CG,
                 list_obs=['n_counts', 'n_features'],
                 save_fig=True,
                 fig_name='plot_violin.png')
    si.pp.filter_samples(adata_CG, min_n_features=1)

    # RNA-specific QC. NOTE(review): this violin plot reuses the same
    # fig_name as the one above, so it presumably overwrites that file.
    si.pp.cal_qc_rna(adata_CG)
    si.pl.violin(adata_CG,
                 list_obs=['n_counts', 'n_genes', 'pct_mt'],
                 save_fig=True,
                 fig_name='plot_violin.png')
    si.pp.filter_cells_rna(adata_CG, min_n_genes=2)

    # Normalization, variable gene selection and discretization.
    si.pp.normalize(adata_CG, method='lib_size')
    si.pp.log_transform(adata_CG)
    si.pp.select_variable_genes(adata_CG, n_top_genes=2000)
    si.pl.variable_genes(adata_CG,
                         show_texts=True,
                         save_fig=True,
                         fig_name='plot_variable_genes.png')
    si.tl.discretize(adata_CG, n_bins=5)
    si.pl.discretize(adata_CG,
                     save_fig=True,
                     fig_name='plot_discretize.png')


def test_atac(adata_CP, tmp_path):
    """Smoke-test the ATAC preprocessing calls, PCA plots and BED export."""
    si.settings.set_workdir(tmp_path / "simba_atac")

    # Peak filtering, binarization and QC.
    si.pp.filter_peaks(adata_CP, min_n_cells=5)
    si.pp.binarize(adata_CP)
    si.pp.cal_qc_atac(adata_CP)
    si.pl.hist(adata_CP,
               list_obs=['n_counts', 'n_peaks', 'pct_peaks'],
               log=True,
               list_var=['n_cells'],
               fig_size=(3, 3),
               save_fig=True,
               fig_name='plot_histogram.png')
    si.pp.filter_cells_atac(adata_CP, min_n_peaks=5)

    # PCA followed by selection of components and their features.
    si.pp.pca(adata_CP, n_components=30)
    si.pl.pca_variance_ratio(adata_CP,
                             show_cutoff=True,
                             save_fig=True,
                             fig_name='plot_variance_ratio.png')
    si.pp.select_pcs(adata_CP, n_pcs=10)
    si.pp.select_pcs_features(adata_CP)
    si.pl.pcs_features(adata_CP,
                       fig_ncol=5,
                       save_fig=True,
                       fig_name='plot_pcs_features.png')
    si.write_bed(adata_CP, use_top_pcs=True)


def test_genescores(adata_CP, tmp_path):
    """Smoke-test gene score computation on preprocessed ATAC data."""
    # ``si.settings`` is shared by every test in the session; point the
    # working directory at this test's own temporary folder, as the other
    # tests in this module do, so nothing depends on test execution order.
    si.settings.set_workdir(tmp_path / "simba_genescores")

    si.pp.filter_peaks(adata_CP, min_n_cells=5)
    si.pp.cal_qc_atac(adata_CP)
    si.pp.filter_cells_atac(adata_CP, min_n_peaks=5)
    si.pp.pca(adata_CP, n_components=30)
    si.pp.select_pcs(adata_CP, n_pcs=10)
    si.pp.select_pcs_features(adata_CP)

    # NOTE(review): the keyword spelling ``use_gene_weigt`` is kept exactly
    # as in the original call -- presumably it matches the library's
    # parameter name; confirm before "correcting" it.
    adata_CG_atac = si.tl.gene_scores(adata_CP,
                                      genome='hg19',
                                      use_gene_weigt=True,
                                      use_top_pcs=True)
    print(adata_CG_atac)


def test_integration(adata_CG, tmp_path):
    """Smoke-test edge inference between two datasets and its plots.

    The same RNA dataset is passed as both inputs to ``infer_edges``.
    """
    # The plots below are saved to disk (``save_fig=True``). Set a
    # per-test working directory first so the figures do not land in a
    # directory configured by an earlier test or in the library default.
    si.settings.set_workdir(tmp_path / "simba_integration")

    si.pp.filter_genes(adata_CG, min_n_cells=3)
    si.pp.cal_qc_rna(adata_CG)
    si.pp.filter_cells_rna(adata_CG, min_n_genes=2)
    si.pp.normalize(adata_CG, method='lib_size')
    si.pp.log_transform(adata_CG)
    si.pp.select_variable_genes(adata_CG, n_top_genes=2000)

    adata_C1C2 = si.tl.infer_edges(
        adata_CG, adata_CG, n_components=20, k=20)
    si.pl.node_similarity(adata_C1C2,
                          cutoff=0.5,
                          save_fig=True)
    si.pl.svd_nodes(adata_C1C2,
                    cutoff=0.5,
                    save_fig=True)
    si.tl.trim_edges(adata_C1C2, cutoff=0.5)