├── .github └── workflows │ └── CI.yml ├── .gitignore ├── .readthedocs.yml ├── LICENSE ├── README.md ├── R_scripts ├── README.md └── scan_for_kmers_motifs.R ├── docs ├── Makefile ├── environment.yml ├── make.bat ├── requirements.txt └── source │ ├── API.rst │ ├── About SIMBA.rst │ ├── Basic concepts.rst │ ├── Citation.rst │ ├── Installation.rst │ ├── Makefile │ ├── Output.rst │ ├── Release notes.rst │ ├── _ext │ └── edit_on_github.py │ ├── _static │ └── img │ │ ├── Figure1.png │ │ ├── lion_icon.svg │ │ └── logo_simba.png │ ├── conf.py │ ├── index.rst │ └── make.bat ├── pytest.ini ├── requirements.txt ├── setup.py ├── simba ├── __init__.py ├── _settings.py ├── _utils.py ├── _version.py ├── data │ └── gene_anno │ │ ├── hg19_genes.bed │ │ ├── hg38_genes.bed │ │ ├── mm10_genes.bed │ │ └── mm9_genes.bed ├── datasets │ ├── __init__.py │ └── _datasets.py ├── plotting │ ├── __init__.py │ ├── _palettes.py │ ├── _plot.py │ ├── _post_training.py │ └── _utils.py ├── preprocessing │ ├── __init__.py │ ├── _general.py │ ├── _pca.py │ ├── _qc.py │ ├── _utils.py │ └── _variable_genes.py ├── readwrite.py └── tools │ ├── __init__.py │ ├── _gene_scores.py │ ├── _general.py │ ├── _integration.py │ ├── _pbg.py │ ├── _post_training.py │ ├── _umap.py │ └── _utils.py └── tests ├── data ├── 10xpbmc_atac_subset.h5ad ├── 10xpbmc_rna_subset.h5ad ├── pbg_training │ ├── entity_alias.txt │ ├── graph_stats.json │ ├── input │ │ └── entity │ │ │ ├── entity_count_C_0.txt │ │ │ ├── entity_count_G_0.txt │ │ │ ├── entity_names_C_0.json │ │ │ └── entity_names_G_0.json │ ├── model │ │ ├── checkpoint_version.txt │ │ ├── config.json │ │ ├── embeddings_C_0.v10.h5 │ │ ├── embeddings_G_0.v10.h5 │ │ ├── model.v10.h5 │ │ └── training_stats.json │ └── pbg_graph.txt └── preprocessed │ ├── atac_preprocessed.h5ad │ └── rna_preprocessed.h5ad ├── test_pbg_training.py ├── test_post_training.py └── test_preprocessing.py /.github/workflows/CI.yml: 
-------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build-linux: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | max-parallel: 5 10 | matrix: 11 | python-version: [3.7, 3.8, 3.9] 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | - name: Add conda to system path 20 | run: | 21 | # $CONDA is an environment variable pointing to the root of the miniconda directory 22 | echo $CONDA/bin >> $GITHUB_PATH 23 | - name: Install dependencies 24 | run: | 25 | conda config --add channels defaults 26 | conda config --add channels bioconda 27 | conda config --add channels conda-forge 28 | conda config --set channel_priority strict 29 | # conda env update --file environment.yml --name base 30 | conda install simba 31 | python -m pip install --upgrade pip 32 | pip install -r requirements.txt 33 | pip install -e . 34 | - name: Lint with flake8 35 | run: | 36 | conda install flake8 37 | # stop the build if there are Python syntax errors or undefined names 38 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 39 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 40 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 41 | - name: Test with pytest 42 | run: | 43 | conda install pytest pytest-cov 44 | pytest --cov 45 | - name: Coverage report 46 | run: | 47 | bash <(curl -s https://codecov.io/bash) 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.gitignore.io/api/osx,python,windows 2 | 3 | ### OSX ### 4 | *.DS_Store 5 | .AppleDouble 6 | .LSOverride 7 | 8 | # Icon must end with two \r 9 | Icon 10 | 11 | # Thumbnails 12 | ._* 13 | 14 | # Files that might appear in the root of a volume 15 | .DocumentRevisions-V100 16 | .fseventsd 17 | .Spotlight-V100 18 | .TemporaryItems 19 | .Trashes 20 | .VolumeIcon.icns 21 | .com.apple.timemachine.donotpresent 22 | 23 | # Directories potentially created on remote AFP share 24 | .AppleDB 25 | .AppleDesktop 26 | Network Trash Folder 27 | Temporary Items 28 | .apdisk 29 | 30 | ### Python ### 31 | # Byte-compiled / optimized / DLL files 32 | __pycache__/ 33 | *.py[cod] 34 | *$py.class 35 | 36 | # C extensions 37 | *.so 38 | 39 | # Distribution / packaging 40 | .Python 41 | build/ 42 | develop-eggs/ 43 | dist/ 44 | downloads/ 45 | eggs/ 46 | .eggs/ 47 | lib/ 48 | lib64/ 49 | parts/ 50 | sdist/ 51 | var/ 52 | wheels/ 53 | *.egg-info/ 54 | .installed.cfg 55 | *.egg 56 | 57 | # PyInstaller 58 | # Usually these files are written by a python script from a template 59 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
60 | *.manifest 61 | *.spec 62 | 63 | # Installer logs 64 | pip-log.txt 65 | pip-delete-this-directory.txt 66 | 67 | # Unit test / coverage reports 68 | htmlcov/ 69 | .tox/ 70 | .coverage 71 | .coverage.* 72 | .cache 73 | .pytest_cache/ 74 | nosetests.xml 75 | coverage.xml 76 | *.cover 77 | .hypothesis/ 78 | 79 | # Translations 80 | *.mo 81 | *.pot 82 | 83 | # Flask stuff: 84 | instance/ 85 | .webassets-cache 86 | 87 | # Scrapy stuff: 88 | .scrapy 89 | 90 | # Sphinx documentation 91 | docs/_build/ 92 | 93 | # PyBuilder 94 | target/ 95 | 96 | # Jupyter Notebook 97 | .ipynb_checkpoints 98 | 99 | # pyenv 100 | .python-version 101 | 102 | # celery beat schedule file 103 | celerybeat-schedule.* 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | 130 | ### Windows ### 131 | # Windows thumbnail cache files 132 | Thumbs.db 133 | ehthumbs.db 134 | ehthumbs_vista.db 135 | 136 | # Folder config file 137 | Desktop.ini 138 | 139 | # Recycle Bin used on file shares 140 | $RECYCLE.BIN/ 141 | 142 | # Windows Installer files 143 | *.cab 144 | *.msi 145 | *.msm 146 | *.msp 147 | 148 | # Windows shortcuts 149 | *.lnk 150 | 151 | # R 152 | *.Rhistory 153 | 154 | # Sphinx 155 | docs/source/_autosummary/ 156 | 157 | # End of https://www.gitignore.io/api/osx,python,windows 158 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | conda: 4 | environment: docs/environment.yml 5 | 6 | build: 7 | image: latest 8 | 9 | sphinx: 10 | builder: html 11 | configuration: docs/source/conf.py 12 | fail_on_warning: false 
13 | 14 | python: 15 | version: 3.7 16 | # install: 17 | # - method: pip 18 | # path: . 19 | # extra_requirements: 20 | # - docs -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2021, Huidong Chen, Pinello Lab 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![documentation](https://readthedocs.org/projects/simba-bio/badge/?version=latest)](https://simba-bio.readthedocs.io/en/latest/) 2 | [![CI](https://github.com/pinellolab/simba/actions/workflows/CI.yml/badge.svg)](https://github.com/pinellolab/simba/actions/workflows/CI.yml) 3 | [![Install with conda](https://anaconda.org/bioconda/simba/badges/version.svg)](https://anaconda.org/bioconda/simba) 4 | [![codecov](https://codecov.io/gh/pinellolab/simba/branch/master/graph/badge.svg?token=NDQJQPL18K)](https://codecov.io/gh/pinellolab/simba) 5 | 6 | # SIMBA 7 | 8 | SIMBA: **SI**ngle-cell e**MB**edding **A**long with features 9 | 10 | Main website, documentation and tutorials: https://simba-bio.readthedocs.io 11 | 12 | Preprint: Huidong Chen, Jayoung Ryu, Michael E. Vinyard, Adam Lerer & Luca Pinello. ["SIMBA: SIngle-cell eMBedding Along with features. *bioRxiv, 2021.10.17.464750v1* (2021)."](https://www.biorxiv.org/content/10.1101/2021.10.17.464750v1) 13 | 14 | The scripts used for the comparison analyses in the manuscript can be found [here](https://github.com/pinellolab/simba_comparison). 
15 | 16 | 17 | 18 | ## Installation 19 | Before installing SIMBA, make sure to have the correct channel priority by executing these commands: 20 | ``` 21 | conda config --add channels defaults 22 | conda config --add channels bioconda 23 | conda config --add channels conda-forge 24 | conda config --set channel_priority strict 25 | ``` 26 | 27 | To install the simba package with conda, run: 28 | ``` 29 | conda create -n env_simba jupyter simba 30 | ``` 31 | 32 | To enable the k-mer and TF analyses, please install these additional dependencies (optional): 33 | ``` 34 | conda install r-essentials r-optparse bioconductor-jaspar2020 bioconductor-biostrings bioconductor-tfbstools bioconductor-motifmatchr bioconductor-summarizedexperiment r-doparallel bioconductor-rhdf5 bioconductor-hdf5array 35 | ``` 36 | 37 | ## [SIMBA v1.2 (dev)](https://github.com/pinellolab/simba/tree/dev) update 38 | We have added support for 39 | * Continuous edge weight encoding for scRNA-seq ([tutorial](https://github.com/pinellolab/simba_tutorials/blob/main/v1.2/rna_10xpmbc_edgeweigts.ipynb)) 40 | * Significance testing of features' cell type specificity metrics ([tutorial](https://github.com/pinellolab/simba_tutorials/tree/main/v1.1sig)) 41 | 42 | ### SIMBA v1.2 Installation 43 | To install the latest development version of simba: 44 | ``` 45 | conda create -n env_simba_dev jupyter pytorch pybedtools -y 46 | pip install 'simba @ git+https://github.com/pinellolab/simba@dev' 47 | ``` 48 | To enable the k-mer and TF analyses, please install these additional dependencies (optional): 49 | ``` 50 | conda install r-essentials r-optparse bioconductor-jaspar2020 bioconductor-biostrings bioconductor-tfbstools bioconductor-motifmatchr bioconductor-summarizedexperiment r-doparallel bioconductor-rhdf5 bioconductor-hdf5array 51 | ``` 52 | 53 | Please refer to the main documentation website to learn how to use SIMBA with the provided tutorials: https://simba-bio.readthedocs.io 54 | 55 | 
-------------------------------------------------------------------------------- /R_scripts/README.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | To run `scan_for_kmers_motifs.R`: 4 | 5 | step1: install all the dependencies: 6 | 7 | ```sh 8 | $ conda install r-essentials r-optparse bioconductor-jaspar2020 bioconductor-biostrings bioconductor-tfbstools bioconductor-motifmatchr bioconductor-summarizedexperiment r-doparallel bioconductor-rhdf5 bioconductor-hdf5array 9 | ``` 10 | 11 | step2: run `Rscript scan_for_kmers_motifs.R -h` 12 | 13 | e.g., 14 | ```sh 15 | $ Rscript scan_for_kmers_motifs.R -i peaks.bed -g hg19.fa -s 'Homo sapiens' 16 | ``` 17 | -------------------------------------------------------------------------------- /R_scripts/scan_for_kmers_motifs.R: -------------------------------------------------------------------------------- 1 | # This script scans specified regions for kmers or/and motifs using JASPAR2020 database. 
# It outputs regions-by-kmers/motifs frequency matrix in .h5 format

# Author: Huidong Chen
# Contact information: hd7chen AT gmail DOT com

suppressMessages(library(optparse,quietly = TRUE))

# Save a regions-by-features frequency matrix and its dimnames to `path`.
#
# mat  : matrix (dense or sparse) with row names (regions) and column names
#        (kmers or motifs).
# path : destination .h5 file.
#
# The file holds three datasets: "mat", "row_names" and "col_names". The
# original author chose this layout so that anndata's 'read_hdf' can recover
# the row and column names.
write_freq_h5 <- function(mat, path){
  # Remove the output of a previous run so the datasets can be created fresh
  # instead of colliding with datasets that already exist in the file.
  if(file.exists(path)){
    file.remove(path)
  }
  # writeHDF5Array internally transposes the matrix so `t()` is used to
  # counteract this operation
  writeHDF5Array(t(mat), path, name="mat", with.dimnames=FALSE, verbose=FALSE)
  h5write(rownames(mat), path, "row_names")
  h5write(colnames(mat), path, "col_names")
}

# Command-line entry point.
#
# Parses the options, converts the input .bed regions to .fasta with
# `bedtools getfasta`, then (unless disabled) counts k-mers and/or JASPAR2020
# CORE motif hits per region and writes `freq_kmer.h5` / `freq_motif.h5`
# into the output folder.
#
# Stops with an error when a required option is missing, when `bedtools` is
# not on the PATH, or when `bedtools getfasta` exits with a non-zero status.
main <- function(){
  option_list = list(
    make_option(c("-i", "--input"), type="character", default=NULL,
                help="input region file name in .bed format", metavar="character"),
    make_option(c("-g", "--genome"), type="character", default=NULL,
                help="Path to reference genome", metavar="character"),
    make_option(c("--no_kmer"), action = "store_true",default=FALSE,
                help="disable scanning for kmers"),
    make_option(c("--no_motif"), action = "store_true",default=FALSE,
                help="disable scanning for motifs"),
    make_option(c("-k","--k_kmer"), type="integer", default=6,
                help="k-mer length [default = %default].", metavar="integer"),
    make_option(c("-s","--species"), type="character", default=NULL,
                help="Species of motifs in the JASPAR database.
                Choose from 'Homo sapiens','Mus musculus'. Only valid when motif is used",
                metavar="character"),
    make_option(c("-o", "--output"), type="character", default='output_kmers_motifs',
                help="Output folder [default = %default]", metavar="character")
  )

  opt_parser = OptionParser(option_list=option_list)
  opt = parse_args(opt_parser)

  if(is.null(opt$input)){
    print_help(opt_parser)
    stop("input region file must be specified", call.=FALSE)
  }
  # The .bed -> .fasta conversion below always needs the reference genome,
  # including when only k-mers are requested, so validate it unconditionally
  # rather than failing later inside bedtools.
  if(is.null(opt$genome)){
    print_help(opt_parser)
    stop("reference genome must be specified", call.=FALSE)
  }
  if(!opt$no_motif && is.null(opt$species)){
    print_help(opt_parser)
    stop("species must be specified when scanning for motifs", call.=FALSE)
  }

  file.input = opt$input
  genome = opt$genome
  no_kmer = opt$no_kmer
  no_motif = opt$no_motif
  k = opt$k_kmer
  species = opt$species
  dir.output = opt$output

  suppressMessages(library(rhdf5))
  suppressMessages(library(HDF5Array)) # used for saving sparse matrix
  suppressMessages(library(Biostrings))
  suppressMessages(library(Matrix))
  suppressMessages(library(TFBSTools))
  suppressMessages(library(JASPAR2020))
  suppressMessages(library(motifmatchr))
  suppressMessages(library(SummarizedExperiment))
  suppressMessages(library(doParallel))

  set.seed(2020)

  # Portable replacement for `mkdir -p`; no shell involved, so paths with
  # spaces or shell metacharacters are handled correctly.
  dir.create(dir.output, recursive=TRUE, showWarnings=FALSE)

  print('Converting .bed to .fasta ...')
  ### convert peaks bed file to fasta file
  if(!nzchar(Sys.which("bedtools"))){
    stop("'bedtools' was not found on the PATH", call.=FALSE)
  }
  path.fa = file.path(dir.output, paste0(basename(file.input),'.fa'))
  # system2() passes its arguments through a shell, so every path is quoted.
  status = system2("bedtools",
                   args = c("getfasta",
                            "-fi", shQuote(genome),
                            "-bed", shQuote(file.input),
                            "-fo", shQuote(path.fa)))
  if(status != 0){
    stop(paste0("'bedtools getfasta' failed with exit status ", status), call.=FALSE)
  }

  peaks_seq <- readDNAStringSet(path.fa, "fasta")
  peaks_name = gsub(":|-",'_',names(peaks_seq))

  ### count kmers
  if(!no_kmer){
    print('Scanning for kmers ...')
    freq_k = oligonucleotideFrequency(peaks_seq, k)
    rownames(freq_k) = peaks_name
    freq_k = as(freq_k, "sparseMatrix")
  }

  ### scan for TF motifs
  if(!no_motif){
    print('Scanning for TF motifs ...')
    opts <- list(species = species, collection = "CORE")
    PFMatrixList = TFBSTools::getMatrixSet(JASPAR2020::JASPAR2020,opts = opts)
    motif_ix_scores <- motifmatchr::matchMotifs(PFMatrixList,peaks_seq, out = "scores")
    freq_motif = motifCounts(motif_ix_scores)
    # Collect the motif names in matrix order (the list names are JASPAR IDs).
    motif_names = vapply(names(PFMatrixList),
                         function(x) PFMatrixList[[x]]@name,
                         character(1), USE.NAMES = FALSE)
    colnames(freq_motif) = gsub("::",'_',motif_names)
    rownames(freq_motif) = peaks_name
  }

  ### save results
  ### save kmers
  if(!no_kmer){
    print('Saving kmer matrix ...')
    write_freq_h5(freq_k, file.path(dir.output,'freq_kmer.h5'))
  }

  ### save motifs
  if(!no_motif){
    print('Saving motif matrix ...')
    write_freq_h5(freq_motif, file.path(dir.output,'freq_motif.h5'))
  }

  print('Finished.')
}

main()
-------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/environment.yml: -------------------------------------------------------------------------------- 1 | name: readthedocs 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - simba 8 | - pandoc>=2.14 9 | - pip: 10 | - sphinx>=3.0 11 | - sphinx-rtd-theme>=0.5 12 | - nbsphinx>=0.8 13 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx>=3.0 2 | sphinx-rtd-theme>=0.5 3 | nbsphinx>=0.8 -------------------------------------------------------------------------------- /docs/source/API.rst: -------------------------------------------------------------------------------- 1 | .. automodule:: simba 2 | 3 | API 4 | === 5 | 6 | Import simba as:: 7 | 8 | import simba as si 9 | 10 | Configuration for SIMBA 11 | ~~~~~~~~~~~~~~~~~~~~~~~ 12 | .. autosummary:: 13 | :toctree: _autosummary 14 | 15 | settings.set_figure_params 16 | settings.set_pbg_params 17 | settings.set_workdir 18 | 19 | 20 | Reading 21 | ~~~~~~~ 22 | 23 | .. autosummary:: 24 | :toctree: _autosummary 25 | 26 | read_csv 27 | read_h5ad 28 | read_10x_h5 29 | read_mtx 30 | read_embedding 31 | load_pbg_config 32 | load_graph_stats 33 | 34 | See more at `anndata <https://anndata.readthedocs.io/en/latest/>`_ 35 | 36 | Preprocessing 37 | ~~~~~~~~~~~~~ 38 | 39 | .. autosummary:: 40 | :toctree: _autosummary 41 | 42 | pp.log_transform 43 | pp.normalize 44 | pp.binarize 45 | pp.cal_qc 46 | pp.cal_qc_rna 47 | pp.cal_qc_atac 48 | pp.filter_samples 49 | pp.filter_cells_rna 50 | pp.filter_cells_atac 51 | pp.filter_features 52 | pp.filter_genes 53 | pp.filter_peaks 54 | pp.pca 55 | pp.select_pcs 56 | pp.select_pcs_features 57 | pp.select_variable_genes 58 | 59 | Tools 60 | ~~~~~ 61 | 62 | .. 
autosummary:: 63 | :toctree: _autosummary 64 | 65 | tl.discretize 66 | tl.umap 67 | tl.gene_scores 68 | tl.infer_edges 69 | tl.trim_edges 70 | tl.gen_graph 71 | tl.pbg_train 72 | tl.softmax 73 | tl.embed 74 | tl.compare_entities 75 | tl.query 76 | tl.find_master_regulators 77 | tl.find_target_genes 78 | 79 | 80 | Plotting 81 | ~~~~~~~~ 82 | 83 | .. autosummary:: 84 | :toctree: _autosummary 85 | 86 | pl.pca_variance_ratio 87 | pl.pcs_features 88 | pl.variable_genes 89 | pl.violin 90 | pl.hist 91 | pl.umap 92 | pl.discretize 93 | pl.node_similarity 94 | pl.svd_nodes 95 | pl.pbg_metrics 96 | pl.entity_metrics 97 | pl.entity_barcode 98 | pl.query 99 | 100 | 101 | Datasets 102 | ~~~~~~~~ 103 | 104 | .. autosummary:: 105 | :toctree: _autosummary 106 | 107 | datasets.rna_10xpmbc3k 108 | datasets.rna_han2018 109 | datasets.rna_tmc2018 110 | datasets.rna_baron2016 111 | datasets.rna_muraro2016 112 | datasets.rna_segerstolpe2016 113 | datasets.rna_wang2016 114 | datasets.rna_xin2016 115 | datasets.atac_buenrostro2018 116 | datasets.atac_10xpbmc5k 117 | datasets.atac_chen2019 118 | datasets.atac_cusanovich2018_subset 119 | datasets.multiome_ma2020_fig4 120 | datasets.multiome_chen2019 121 | datasets.multiome_10xpbmc10k 122 | -------------------------------------------------------------------------------- /docs/source/About SIMBA.rst: -------------------------------------------------------------------------------- 1 | About SIMBA 2 | =========== 3 | 4 | SIMBA ( **SI**\ ngle-cell e\ **MB**\ edding **A**\ long with features) is a versatile single-cell embedding method that co-embeds cells and features into the same latent space. By formulating single-cell analyses as multi-entity graph embedding problems, SIMBA can be used to solve popular single cell tasks that appear very different in a single framework. 
5 | 6 | For each task, SIMBA constructs a graph with nodes of different entities (cells and features), and edges of different types indicating relations between these entities. SIMBA then applies multi-entity graph embedding algorithms adapted from the literature on social network and knowledge graph embeddings on this graph, and introduces a Softmax-based transformation to embed these entities (nodes) into a low-dimensional space such that the embeddings of these entities are comparable. 7 | 8 | We show that the SIMBA framework can perform many important single-cell analyses, including dimensionality reduction techniques for studying cellular states; clustering-free marker detection based on the similarity between single cells and features; single-cell multimodal analysis and the study of gene regulation; batch correction and omics integration analysis and simultaneous identification of marker features. SIMBA can be adapted to these diverse analysis tasks by simply modifying how the input graph is constructed from the relevant single-cell data. We believe that SIMBA will simplify the task of adapting single-cell analysis to new tasks and single-cell modalities. -------------------------------------------------------------------------------- /docs/source/Basic concepts.rst: -------------------------------------------------------------------------------- 1 | ================ 2 | Basic concepts 3 | ================ 4 | 5 | 6 | Graph construction 7 | ~~~~~~~~~~~~~~~~~~ 8 | SIMBA encodes entities of different types, including genes, open chromatin regions (peaks or bins), and DNA sequences (transcription factor motifs or k-mers), into a single large graph based on the relation between them. In this graph, nodes represent different entities and edges indicate the relation between entities. 9 | 10 | * In scRNA-seq analysis, each node represents either a cell or a gene. If a gene is expressed in a cell, then an edge is added between this gene and cell. 
The gene expression level is encoded into the weight of this edge. 11 | 12 | * In scATAC-seq analysis, each node represents either a cell or a region (peak/bin). If a region is open in a cell, then an edge is added between this region and cell. Optionally, if DNA sequences (TF motifs or k-mers) are also used, each node represents a cell, or a region, or a DNA sequence. In addition to the relation between a cell and a region, if a DNA sequence is found within the open region, then an edge is added between this DNA sequence and open region. 13 | 14 | * In multimodal analysis, each node can be any of these entities, including a cell, a gene, an open region, a DNA sequence, etc. Edges are added similarly as in scRNA-seq analysis and scATAC-seq analysis. 15 | 16 | * In batch correction analysis, in addition to the experimentally measured edges as described above, batch correction is further enhanced with the computationally inferred edges between cell nodes across datasets using a truncated randomized singular value decomposition (SVD)-based procedure. 17 | 18 | * In multiomics integration analysis (scRNA-seq and scATAC-seq), SIMBA first builds one graph for scRNA-seq data and one graph for scATAC-seq data independently as described above. To connect these two graphs, SIMBA calculates gene scores by summarizing accessible regions from scATAC-seq data and then infers edges between cells of different omics based on their shared gene expression modules through a similar procedure as in batch correction. 19 | 20 | PBG training 21 | ~~~~~~~~~~~~ 22 | Following the construction of a multi-relational graph between biological entities, we adapt graph embedding techniques from the knowledge graph and recommendation systems literature to construct unsupervised representations for these entities. 
23 | 24 | We use the PyTorch-BigGraph (PBG) framework, which provides efficient computation of multi-relation graph embeddings over multiple entity types and can scale to graphs with millions or billions of entities. 25 | 26 | In SIMBA, several key modifications have been made based on PBG, including: 27 | 28 | * Type-constrained negative sampling 29 | 30 | * Negative samples are produced in two ways: 31 | 32 | * by corrupting the edge with a source or destination sampled uniformly from the nodes with the correct types for this relation; 33 | 34 | * by corrupting the edge with a source or destination node sampled with probability proportional to its degree. 35 | 36 | * Introducing a weight decay procedure to solve the overfitting problem. 37 | 38 | The resulting graph embeddings have two desirable properties that we will take advantage of: 39 | 40 | #. First-order similarity: for two entity types with a relation between them, edges with high likelihood should have a higher dot product. 41 | #. Second-order similarity: within a single entity type, entities that have ‘similar contexts’, i.e., a similar distribution of edge probabilities, should have similar embeddings. 42 | 43 | Evaluation during training 44 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 45 | During the PBG training procedure, a small percentage of edges is held out (by default, the evaluation fraction is set to 5%) to monitor overfitting and evaluate the final model. 46 | 47 | Five metrics are computed on the reserved set of edges, including mean reciprocal rank (MRR, the average of the reciprocal of the ranks of all positives), R1 (the fraction of positives that rank better than all their negatives, i.e., have a rank of 1), R10 (the fraction of positives that rank in the top 10 among their negatives), R50 (the fraction of positives that rank in the top 50 among their negatives), and AUC (Area Under the Curve). 
48 | 49 | By default, we show MRR along with training loss and validation loss, while other metrics are also available in the SIMBA package (Supplementary Fig. 1a). The learning curves for validation loss and these metrics can be used to determine when training has completed. The relative values of training and validation loss along with these evaluation metrics can be used to identify issues with training (underfitting vs overfitting) and tune the hyperparameters weight decay, embedding dimension, and number of training epochs appropriately. However, for most datasets we find that the default parameters do not need tuning. 50 | 51 | Softmax transformation 52 | ~~~~~~~~~~~~~~~~~~~~~~ 53 | PyTorch-BigGraph training provides initial embeddings of all entities (nodes). However, entities of different types (e.g., cells vs peaks, cells of different batches or modalities) have different edge distributions and thus may lie on different manifolds of the latent space. To make the embeddings of entities of different types comparable, we transform the embeddings of features with the Softmax function by utilizing the first-order similarity between cells (reference) and features (query). In the case of batch correction or multi-omics integration, the Softmax transformation is also performed based on the first-order similarity between cells of different batches or modalities. 54 | -------------------------------------------------------------------------------- /docs/source/Citation.rst: -------------------------------------------------------------------------------- 1 | Citation 2 | ======== 3 | 4 | Chen, H., Ryu, J., Vinyard, M.E., Lerer, A. & Pinello, L. SIMBA: SIngle-cell eMBedding Along with features. bioRxiv, 2021.10.17.464750 (2021). 5 | 6 | Please check out our `preprint <https://www.biorxiv.org/content/10.1101/2021.10.17.464750v1>`_ on bioRxiv to learn more. 
-------------------------------------------------------------------------------- /docs/source/Installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | Anaconda 5 | ~~~~~~~~ 6 | 7 | To install the `simba <https://anaconda.org/bioconda/simba>`_ package with conda, run:: 8 | 9 | conda install -c bioconda simba 10 | 11 | **Recommended**: install *simba* in a new virtual environment:: 12 | 13 | conda create -n env_simba python simba 14 | conda activate env_simba 15 | 16 | conda config --add channels defaults 17 | conda config --add channels bioconda 18 | conda config --add channels conda-forge 19 | conda config --set channel_priority strict 20 | 21 | 22 | Dev version 23 | ~~~~~~~~~~~ 24 | 25 | To install the development version on `GitHub <https://github.com/pinellolab/simba>`_, run the following on top of the stable installation:: 26 | 27 | pip install 'simba @ git+https://github.com/pinellolab/simba@dev' 28 | 29 | -------------------------------------------------------------------------------- /docs/source/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/source/Output.rst: -------------------------------------------------------------------------------- 1 | Output 2 | ====== 3 | 4 | SIMBA result structure will look like this: 5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 6 | 7 | :: 8 | 9 | result_simba 10 | ├── figures 11 | └── pbg 12 | └── graph0 13 | ├── pbg_graph.txt 14 | ├── graph_stats.json 15 | ├── entity_alias.txt 16 | └── input 17 | ├── edge 18 | └── entity 19 | └── model0 20 | ├── config.json 21 | ├── training_stats.json 22 | ├── checkpoint_version.txt 23 | ├── embeddings.h5 24 | └── model.h5 25 | └── model1 26 | ├── config.json 27 | ├── training_stats.json 28 | ├── checkpoint_version.txt 29 | ├── embeddings.h5 30 | └── model.h5 31 | └── model2 32 | ├── config.json 33 | ├── training_stats.json 34 | ├── checkpoint_version.txt 35 | ├── embeddings.h5 36 | └── model.h5 37 | └── graph1 38 | ├── pbg_graph.txt 39 | ├── graph_stats.json 40 | ├── entity_alias.txt 41 | └── input 42 | ├── edge 43 | └── entity 44 | └── model 45 | ├── config.json 46 | ├── training_stats.json 47 | ├── checkpoint_version.txt 48 | ├── embeddings.h5 49 | └── model.h5 50 | 51 | By default, all figures will be saved under ``result_simba/figures`` 52 | 53 | PBG training will be saved under the folder ``result_simba/pbg``. Within this folder, each constructed graph is saved into a separate folder (by default ``graph0``) under ``pbg``. For each graph: 54 | 55 | - ``pbg_graph.txt`` stores its edges on which PBG training is performed; 56 | - ``graph_stats.json`` stores the statistics related to this graph; 57 | - ``entity_alias.txt`` keeps the mapping between the original entity IDs and their aliases. 58 | - ``input`` stores the extracted nodes (entities) and edges from ``pbg_graph.txt``, which are prepared for PBG training. 
59 | - ``model`` stores the training result of one parameter configuration. (by default ``model``) -------------------------------------------------------------------------------- /docs/source/Release notes.rst: -------------------------------------------------------------------------------- 1 | Release notes 2 | ============= -------------------------------------------------------------------------------- /docs/source/_ext/edit_on_github.py: -------------------------------------------------------------------------------- 1 | """ 2 | Sphinx extension to add ReadTheDocs-style "Edit on GitHub" links to the 3 | sidebar. 4 | """ 5 | 6 | import os 7 | import warnings 8 | 9 | __licence__ = "BSD (3 clause)" 10 | 11 | 12 | # def get_github_repo(app, path): 13 | # if path.endswith(".ipynb"): 14 | # return app.config.github_nb_repo, "/" 15 | # return app.config.github_repo, "/docs/source/" 16 | 17 | 18 | def html_page_context(app, pagename, templatename, context, doctree): 19 | if templatename != "page.html": 20 | return 21 | 22 | if doctree is not None: 23 | path = os.path.relpath(doctree.get("source"), app.builder.srcdir) 24 | if path.endswith(".ipynb"): 25 | context["display_github"] = True 26 | context["github_user"] = "huidongchen" 27 | context["github_repo"] = "simba_tutorials" 28 | context["github_version"] = "main" 29 | if path.endswith("rna_10x_mouse_brain_1p3M.ipynb"): 30 | context["conf_py_path"] = "/v1.1/" 31 | else: 32 | context["conf_py_path"] = "/v1.0/" 33 | else: 34 | context["display_github"] = True 35 | context["github_user"] = "pinellolab" 36 | context["github_repo"] = "simba" 37 | context["github_version"] = "master" 38 | context["conf_py_path"] = "/docs/source/" 39 | 40 | def setup(app): 41 | app.add_config_value("github_nb_repo", "", True) 42 | app.add_config_value("github_repo", "", True) 43 | app.connect("html-page-context", html_page_context) 44 | -------------------------------------------------------------------------------- 
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
# NOTE(review): this path is resolved against the build's working directory;
# if that is docs/source, '../simba' is not the package root and the import
# below relies on an installed ``simba`` -- confirm.
sys.path.insert(0, os.path.abspath('../simba'))
sys.path.insert(0, os.path.abspath('_ext'))
import simba  # noqa: E402


# -- Project information -----------------------------------------------------

project = 'SIMBA'
copyright = '2021, Huidong Chen'
author = 'Huidong Chen'

# The full version, including alpha/beta/rc tags
release = simba.__version__


# -- Retrieve notebooks (borrowed from scVelo) -------------------------------

from urllib.request import urlretrieve  # noqa: E402

notebooks_url = "https://github.com/huidongchen/simba_tutorials/raw/main/"
notebooks_v1_0 = [
    "rna_10xpmbc_all_genes.ipynb",
    "atac_buenrostro2018_peaks_and_sequences.ipynb",
    "multiome_shareseq.ipynb",
    "multiome_shareseq_GRN.ipynb",
    "rna_mouse_atlas.ipynb",
    "rna_human_pancreas.ipynb",
    "multiome_10xpmbc10k_integration.ipynb",
]
notebooks_v1_1 = [
    "rna_10x_mouse_brain_1p3M.ipynb",
]
# Downloading is best effort: a failed download must not abort the build,
# but it is reported so that a missing tutorial page can be traced back.
for _nb_dir, _nb_names in (("v1.0/", notebooks_v1_0),
                           ("v1.1/", notebooks_v1_1)):
    for nb in _nb_names:
        try:
            urlretrieve(notebooks_url + _nb_dir + nb, nb)
        except Exception as err:
            print(f"WARNING: could not download notebook {nb!r}: {err}",
                  file=sys.stderr)

# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.

needs_sphinx = "3.0"

extensions = [
    "sphinx.ext.autodoc",
    "sphinx.ext.autosummary",
    'sphinx.ext.napoleon',
    "sphinx.ext.intersphinx",
    "sphinx.ext.mathjax",
    "sphinx.ext.viewcode",
    "nbsphinx",
    "edit_on_github",
]

autosummary_generate = True

# Napoleon settings
napoleon_google_docstring = False

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['_build']

# Add prolog for notebooks

# nbsphinx_prolog = r"""
# {% set docname = 'github/huidongchen/simba_tutorials/blob/main/v1.0/' + env.doc2path(env.docname, base=None) %}
# """

# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
html_theme_options = {
    "navigation_depth": 1,
    "titles_only": True,
    'logo_only': True,
}
html_show_sphinx = False
html_logo = '_static/img/logo_simba.png'
html_favicon = '_static/img/lion_icon.svg'
# html_context = dict(
#     display_github=True,
#     github_user='pinellolab',
#     github_repo='simba',
#     github_version='master',
#     conf_py_path='/docs/source/',
# )
# html_context = dict(
#     display_github=True,
#     github_user='huidongchen',
#     github_repo='simba_tutorials',
#     github_version='main',
#     conf_py_path='/v1.0/',
# )
github_repo = 'simba'
github_nb_repo = 'simba_tutorials'


# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".

html_static_path = ['_static']
image:: _static/img/Figure1.png 10 | :align: center 11 | :width: 600 12 | :alt: SIMBA overview 13 | 14 | 15 | .. toctree:: 16 | :maxdepth: 2 17 | :caption: Overview 18 | :hidden: 19 | 20 | About SIMBA 21 | Installation 22 | API 23 | Release notes 24 | Citation 25 | 26 | 27 | .. toctree:: 28 | :maxdepth: 1 29 | :caption: SIMBA primer 30 | 31 | Basic concepts 32 | Output 33 | 34 | 35 | .. toctree:: 36 | :maxdepth: 1 37 | :caption: Tutorials 38 | 39 | rna_10xpmbc_all_genes 40 | atac_buenrostro2018_peaks_and_sequences 41 | multiome_shareseq 42 | multiome_shareseq_GRN 43 | rna_mouse_atlas 44 | rna_human_pancreas 45 | multiome_10xpmbc10k_integration 46 | rna_10x_mouse_brain_1p3M 47 | 48 | 49 | .. |Docs| image:: https://readthedocs.org/projects/simba-bio/badge/?version=latest 50 | :target: https://simba-bio.readthedocs.io 51 | 52 | .. |CI| image:: https://github.com/pinellolab/simba/actions/workflows/CI.yml/badge.svg 53 | :target: https://github.com/pinellolab/simba/actions/workflows/CI.yml 54 | 55 | .. |Install with conda| image:: https://anaconda.org/bioconda/simba/badges/version.svg 56 | :target: https://anaconda.org/bioconda/simba 57 | 58 | .. |Last updated| image:: https://anaconda.org/bioconda/simba/badges/latest_release_date.svg 59 | :target: https://anaconda.org/bioconda/simba 60 | 61 | .. |License| image:: https://anaconda.org/bioconda/simba/badges/license.svg 62 | :target: https://github.com/pinellolab/simba/blob/master/LICENSE 63 | 64 | .. |Downloads| image:: https://anaconda.org/bioconda/simba/badges/downloads.svg 65 | :target: https://anaconda.org/bioconda/simba 66 | 67 | .. 
|Codecov| image:: https://codecov.io/gh/pinellolab/simba/branch/master/graph/badge.svg?token=NDQJQPL18K 68 | :target: https://codecov.io/gh/pinellolab/simba 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /docs/source/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | python_files = 'test_*.py' 3 | testpaths = 'tests/' -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.17.0 2 | pandas>=1.0,!=1.1 # required by Anndata 3 | anndata>=0.7.4 4 | # h5py<3.0.0 # avoid byte strings but caused building errors 5 | # h5py>=3.4 6 | scikit-learn>=0.19 7 | scipy>=1.4 8 | kneed>=0.7 9 | seaborn>=0.11 10 | matplotlib>=3.3 11 | scikit-misc>=0.1.3 12 | 
"""Packaging script for SIMBA."""

import sys

# Keep this guard in sync with ``python_requires`` below.
if sys.version_info < (3, 7):
    sys.exit('simba requires Python >= 3.7')

from setuptools import setup, find_packages  # noqa: E402
from pathlib import Path  # noqa: E402

# Execute _version.py instead of importing the package, so that building does
# not require simba's runtime dependencies to be installed.
version = {}
with open("simba/_version.py") as fp:
    exec(fp.read(), version)


def _read_requirements(path='requirements.txt'):
    """Return the requirement specifiers listed in *path*.

    Blank lines, full-line comments and trailing ``# ...`` comments are
    dropped so that only valid specifiers reach ``install_requires``.
    """
    requirements = []
    for raw_line in Path(path).read_text('utf-8').splitlines():
        line = raw_line.strip()
        if not line or line.startswith('#'):
            continue
        # A comment starts at a '#' that is preceded by whitespace.
        line = line.split(' #', 1)[0].strip()
        if line:
            requirements.append(line)
    return requirements


setup(
    name='simba',
    version=version['__version__'],
    author='Huidong Chen',
    # was misspelt ``athor_email``, which setuptools does not recognise
    author_email='hd7chen AT gmail DOT com',
    license='BSD',
    description='SIngle-cell eMBedding Along with features',
    long_description=Path('README.md').read_text('utf-8'),
    long_description_content_type="text/markdown",
    url='https://github.com/pinellolab/simba',
    packages=find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: BSD License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.7',
    install_requires=_read_requirements(),
    include_package_data=True,
    package_data={"simba": ["data/gene_anno/*.bed"]}
)
class SimbaConfig:
    """Configuration class for SIMBA.

    Holds the working directory, figure-saving flag, number of jobs, the
    PBG training parameters (``pbg_params``) and per-graph statistics
    (``graph_stats``).
    """

    def __init__(self,
                 workdir='./result_simba',
                 save_fig=False,
                 n_jobs=1):
        self.workdir = workdir
        self.save_fig = save_fig
        self.n_jobs = n_jobs
        self.set_pbg_params()
        self.graph_stats = dict()

    def set_figure_params(self,
                          context='notebook',
                          style='white',
                          palette='deep',
                          font='sans-serif',
                          font_scale=1.1,
                          color_codes=True,
                          dpi=80,
                          dpi_save=150,
                          fig_size=(5.4, 4.8),
                          rc=None):
        """ Set global parameters for figures. Modified from sns.set()

        Parameters
        ----------
        context : string or dict
            Plotting context parameters, see `seaborn.plotting_context`
        style: `string`,optional (default: 'white')
            Axes style parameters, see `seaborn.axes_style`
        palette : string or sequence
            Color palette, see `seaborn.color_palette`
        font: `string`, optional (default: 'sans-serif')
            Font family, passed to seaborn.
        font_scale: `float`, optional (default: 1.1)
            Separate scaling factor to independently
            scale the size of the font elements.
        color_codes : `bool`, optional (default: True)
            If ``True`` and ``palette`` is a seaborn palette,
            remap the shorthand color codes (e.g. "b", "g", "r", etc.)
            to the colors from this palette.
        dpi: `int`,optional (default: 80)
            Resolution of rendered figures.
        dpi_save: `int`,optional (default: 150)
            Resolution of saved figures.
        fig_size: sequence of two floats, optional (default: (5.4, 4.8))
            Default figure size (width, height).
        rc: `dict`,optional (default: None)
            rc settings properties.
            Parameter mappings to override the values in the preset style.
            See the "matplotlibrc file" section of the matplotlib
            documentation for the available keys.

        Raises
        ------
        Exception
            If `rc` contains a key that is not a matplotlib rc parameter.
            No override is applied in that case.
        """
        sns.set(context=context,
                style=style,
                palette=palette,
                font=font,
                font_scale=font_scale,
                color_codes=color_codes,
                rc={'figure.dpi': dpi,
                    'savefig.dpi': dpi_save,
                    # pass a fresh list so the default is never shared
                    'figure.figsize': list(fig_size),
                    'image.cmap': 'viridis',
                    'lines.markersize': 6,
                    'legend.columnspacing': 0.1,
                    'legend.borderaxespad': 0.1,
                    'legend.handletextpad': 0.1,
                    'pdf.fonttype': 42,
                    })
        if rc is not None:
            assert isinstance(rc, dict), "rc must be dict"
            # Validate every key first so a bad key cannot leave the
            # overrides half applied.
            for key in rc:
                if key not in mpl.rcParams.keys():
                    raise Exception("unrecognized property '%s'" % key)
            for key, value in rc.items():
                mpl.rcParams[key] = value

    def set_workdir(self, workdir=None):
        """Set working directory.

        The directory is created if it does not exist and the PBG
        parameters are reset to their defaults.

        Parameters
        ----------
        workdir: `str`, optional (default: None)
            Working directory. If None, the current ``self.workdir`` is used.

        Returns
        -------
        """
        if workdir is None:
            workdir = self.workdir
            print("Using default working directory.")
        # exist_ok avoids the race between checking and creating
        os.makedirs(workdir, exist_ok=True)
        self.workdir = workdir
        self.set_pbg_params()
        print('Saving results in: %s' % workdir)

    def set_pbg_params(self, config=None):
        """Set PBG parameters

        Parameters
        ----------
        config : `dict`, optional (default: None)
            PBG training configuration parameters.
            By default it resets parameters to the default setting.
            The given dict is stored as is (not copied).

        Returns
        -------
        """
        if config is None:
            config = dict(
                # I/O data
                entity_path="",
                edge_paths=["", ],
                checkpoint_path="",

                # Graph structure
                entities={},
                relations=[],
                dynamic_relations=False,

                # Scoring model
                dimension=50,
                global_emb=False,
                comparator='dot',

                # Training
                num_epochs=10,
                workers=4,
                num_batch_negs=50,
                num_uniform_negs=50,
                loss_fn='softmax',
                lr=0.1,

                early_stopping=False,
                regularization_coef=0.0,
                wd=0.0,
                wd_interval=50,

                # Evaluation during training
                eval_fraction=0.05,
                eval_num_batch_negs=50,
                eval_num_uniform_negs=50,

                checkpoint_preservation_interval=None,
            )
        assert isinstance(config, dict), "`config` must be dict"
        self.pbg_params = config


settings = SimbaConfig()
def locate_elbow(x, y, S=10, min_elbow=0,
                 curve='convex', direction='decreasing', online=False,
                 **kwargs):
    """Detect knee points

    Parameters
    ----------
    x : `array-like`
        x values
    y : `array-like`
        y values
    S : `float`, optional (default: 10)
        Sensitivity
    min_elbow: `int`, optional (default: 0)
        The minimum elbow location
    curve: `str`, optional (default: 'convex')
        Choose from {'convex','concave'}
        If 'concave', algorithm will detect knees,
        If 'convex', algorithm will detect elbows.
    direction: `str`, optional (default: 'decreasing')
        Choose from {'decreasing','increasing'}
    online: `bool`, optional (default: False)
        kneed will correct old knee points if True,
        kneed will return first knee if False.
    **kwargs: `dict`, optional
        Extra arguments to KneeLocator.

    Returns
    -------
    elbow: `int`
        elbow point; ``len(y)`` when no elbow is found.
    """
    start = int(min_elbow)
    kneedle = KneeLocator(x[start:], y[start:],
                          S=S, curve=curve,
                          direction=direction,
                          online=online,
                          **kwargs,
                          )
    if kneedle.elbow is None:
        return len(y)
    return int(kneedle.elbow)


def _build_10x_matrix(dsets):
    """Build the count matrix from the arrays of a 10x hdf5 group.

    `dsets` maps dataset names to arrays and must contain 'shape', 'data',
    'indices' and 'indptr' (``KeyError`` otherwise). 10x stores the
    transposed data, so the stored arrays are read as a csr matrix of the
    swapped shape.
    """
    from scipy.sparse import csr_matrix

    M, N = dsets['shape']
    data = dsets['data']
    if data.dtype == np.dtype('int32'):
        # convert into a new float32 array rather than overwriting the
        # int32 buffer through an aliasing view
        data = data.astype('float32')
    return csr_matrix(
        (data, dsets['indices'], dsets['indptr']),
        shape=(N, M),
    )


# modifed from
# scanpy https://github.com/theislab/scanpy/blob/master/scanpy/readwrite.py
def _read_legacy_10x_h5(filename, genome=None):
    """
    Read hdf5 file from Cell Ranger v2 or earlier versions.

    Parameters
    ----------
    filename
        Path to the hdf5 file.
    genome : `str`, optional (default: None)
        Name of the genome group to read. May be omitted only when the file
        contains a single genome.

    Returns
    -------
    adata: `AnnData`
        Anndata object

    Raises
    ------
    ValueError
        If `genome` is missing while several genomes are present, or if
        `genome` is not found in the file.
    Exception
        If a required dataset is missing from the file.
    """
    with tables.open_file(str(filename), 'r') as f:
        try:
            children = [x._v_name for x in f.list_nodes(f.root)]
            if not genome:
                if len(children) > 1:
                    raise ValueError(
                        f"'{filename}' contains more than one genome. "
                        "For legacy 10x h5 "
                        "files you must specify the genome "
                        "if more than one is present. "
                        f"Available genomes are: {children}"
                    )
                genome = children[0]
            elif genome not in children:
                raise ValueError(
                    f"Could not find genome '{genome}' in '{filename}'. "
                    f'Available genomes are: {children}'
                )
            dsets = {}
            for node in f.walk_nodes('/' + genome, 'Array'):
                dsets[node.name] = node.read()
            # AnnData works with csr matrices
            matrix = _build_10x_matrix(dsets)
            adata = AnnData(
                matrix,
                obs=dict(obs_names=dsets['barcodes'].astype(str)),
                var=dict(
                    var_names=dsets['gene_names'].astype(str),
                    gene_ids=dsets['genes'].astype(str),
                ),
            )
            return adata
        except KeyError as err:
            raise Exception(
                'File is missing one or more required datasets.') from err


# modifed from
# scanpy https://github.com/theislab/scanpy/blob/master/scanpy/readwrite.py
def _read_v3_10x_h5(filename):
    """
    Read hdf5 file from Cell Ranger v3 or later versions.

    Parameters
    ----------
    filename
        Path to the hdf5 file.

    Returns
    -------
    adata: `AnnData`
        Anndata object

    Raises
    ------
    Exception
        If a required dataset is missing from the file.
    """
    with tables.open_file(str(filename), 'r') as f:
        try:
            dsets = {}
            for node in f.walk_nodes('/matrix', 'Array'):
                dsets[node.name] = node.read()
            matrix = _build_10x_matrix(dsets)
            adata = AnnData(
                matrix,
                obs=dict(obs_names=dsets['barcodes'].astype(str)),
                var=dict(
                    var_names=dsets['name'].astype(str),
                    gene_ids=dsets['id'].astype(str),
                    feature_types=dsets['feature_type'].astype(str),
                    genome=dsets['genome'].astype(str),
                ),
            )
            return adata
        except KeyError as err:
            raise Exception(
                'File is missing one or more required datasets.') from err
from tqdm import tqdm
import os

from .._settings import settings
from ..readwrite import read_h5ad


class DownloadProgressBar(tqdm):
    """`tqdm` progress bar whose `update_to` works as a `urlretrieve` hook."""

    def update_to(self,
                  b=1,
                  bsize=1,
                  tsize=None):
        """Move the bar to the position reported by `urlretrieve`.

        Parameters
        ----------
        b: `int`, optional (default: 1)
            Number of blocks transferred so far.
        bsize: `int`, optional (default: 1)
            Size of one block, in bytes.
        tsize: `int`, optional (default: None)
            Total size in bytes. When given, it becomes the bar's total.
        """
        if tsize is not None:
            self.total = tsize
        # `update` expects an increment, so subtract what is already shown.
        self.update(b * bsize - self.n)


def download_url(url,
                 output_path,
                 desc=None):
    """Download `url` to `output_path` while showing a progress bar.

    The payload is first written to ``<output_path>.part`` and only moved
    to `output_path` once the transfer has finished, so an interrupted
    download never leaves a truncated file at the final location.

    Parameters
    ----------
    url: `str`
        Address of the file to download.
    output_path: `str`
        Destination path of the downloaded file.
    desc: `str`, optional (default: None)
        Label of the progress bar.
        If None, the last component of `url` is used.
    """
    if desc is None:
        desc = url.split('/')[-1]
    tmp_path = f'{output_path}.part'
    try:
        with DownloadProgressBar(
            unit='B',
            unit_scale=True,
            miniters=1,
            desc=desc
        ) as t:
            urllib.request.urlretrieve(
                url,
                filename=tmp_path,
                reporthook=t.update_to)
        os.replace(tmp_path, output_path)
    finally:
        # Only still present when the transfer or the final move failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def _ensure_downloaded(url, filename):
    """Return the local path of `filename`, downloading it if missing.

    Files are stored in the `data` folder under `settings.workdir`.
    """
    filepath = os.path.join(settings.workdir, 'data')
    fullpath = os.path.join(filepath, filename)
    if not os.path.exists(fullpath):
        print('Downloading data ...')
        os.makedirs(filepath, exist_ok=True)
        download_url(url,
                     fullpath,
                     desc=filename)
        print(f'Downloaded to {filepath}.')
    return fullpath


def _load_h5ad(url, filename):
    """Fetch (if needed) and read a single-modality `.h5ad` dataset."""
    return read_h5ad(_ensure_downloaded(url, filename))


def _load_multiome(url_rna, url_atac, prefix):
    """Fetch (if needed) and read the RNA/ATAC pair of a multiome dataset.

    Both files are fetched before either is read; they are cached as
    `<prefix>_rna.h5ad` and `<prefix>_atac.h5ad`.
    """
    fullpath_rna = _ensure_downloaded(url_rna, f'{prefix}_rna.h5ad')
    fullpath_atac = _ensure_downloaded(url_atac, f'{prefix}_atac.h5ad')
    adata_rna = read_h5ad(fullpath_rna)
    adata_atac = read_h5ad(fullpath_atac)
    return {'rna': adata_rna,
            'atac': adata_atac}


def rna_10xpmbc3k():
    """10X human peripheral blood mononuclear cells (PBMCs) scRNA-seq data

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    return _load_h5ad(
        'https://www.dropbox.com/s/087wuliddmbp3oe/rna_seq.h5ad?dl=1',
        'rna_10xpmbc3k.h5ad')


def rna_han2018():
    """single-cell microwell-seq mouse cell atlas data

    ref: Han, X. et al. Mapping the mouse cell atlas by microwell-seq.
    Cell 172, 1091-1107. e1017 (2018).

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    return _load_h5ad(
        'https://www.dropbox.com/s/nxbszjbir44g99n/rna_seq_mi.h5ad?dl=1',
        'rna_han2018.h5ad')


def rna_tmc2018():
    """single-cell Smart-Seq2 mouse cell atlas data

    ref: Tabula Muris Consortium. Single-cell transcriptomics of 20 mouse
    organs creates a Tabula Muris. Nature 562, 367-372 (2018).

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    return _load_h5ad(
        'https://www.dropbox.com/s/rnpyp6vfpuiptkz/rna_seq_sm.h5ad?dl=1',
        'rna_tmc2018.h5ad')


def rna_baron2016():
    """single-cell RNA-seq human pancreas data

    ref: Baron, M. et al. A single-cell transcriptomic map of the human and
    mouse pancreas reveals inter-and intra-cell population structure. Cell
    systems 3, 346-360. e344 (2016)

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    return _load_h5ad(
        'https://www.dropbox.com/s/bvziclu6d3fdzow/rna_seq_baron.h5ad?dl=1',
        'rna_baron2016.h5ad')


def rna_muraro2016():
    """single-cell RNA-seq human pancreas data

    ref: Muraro, M.J. et al. A single-cell transcriptome atlas of the
    human pancreas. Cell systems 3, 385-394. e383 (2016).

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    return _load_h5ad(
        'https://www.dropbox.com/s/ginc9rbo4qmobwx/rna_seq_muraro.h5ad?dl=1',
        'rna_muraro2016.h5ad')


def rna_segerstolpe2016():
    """single-cell RNA-seq human pancreas data

    ref: Segerstolpe, Å. et al. Single-cell transcriptome profiling of human
    pancreatic islets in health and type 2 diabetes.
    Cell metabolism 24, 593-607 (2016).

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    return _load_h5ad(
        'https://www.dropbox.com/s/qomnf4860jwm9pd/'
        'rna_seq_segerstolpe.h5ad?dl=1',
        'rna_segerstolpe2016.h5ad')


def rna_wang2016():
    """single-cell RNA-seq human pancreas data

    ref: Wang, Y.J. et al. Single-cell transcriptomics of the human endocrine
    pancreas. Diabetes 65, 3028-3038 (2016).

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    return _load_h5ad(
        'https://www.dropbox.com/s/9tv44nugwpx9t4c/rna_seq_wang.h5ad?dl=1',
        'rna_wang2016.h5ad')


def rna_xin2016():
    """single-cell RNA-seq human pancreas data

    ref: Xin, Y. et al. RNA sequencing of single human islet cells reveals
    type 2 diabetes genes. Cell metabolism 24, 608-615 (2016).

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    return _load_h5ad(
        'https://www.dropbox.com/s/j483i47mxty6rzo/rna_seq_xin.h5ad?dl=1',
        'rna_xin2016.h5ad')


def atac_buenrostro2018():
    """single cell ATAC-seq human blood data

    ref: Buenrostro, J.D. et al. Integrated Single-Cell Analysis Maps the
    Continuous Regulatory Landscape of Human Hematopoietic Differentiation.
    Cell 173, 1535-1548 e1516 (2018).

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    return _load_h5ad(
        'https://www.dropbox.com/s/7hxjqgdxtbna1tm/atac_seq.h5ad?dl=1',
        'atac_buenrostro2018.h5ad')


def atac_10xpbmc5k():
    """10X human peripheral blood mononuclear cells (PBMCs) scATAC-seq data

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    return _load_h5ad(
        'https://www.dropbox.com/s/xa8u7rlskc5h7iv/atac_seq.h5ad?dl=1',
        'atac_10xpbmc5k.h5ad')


def atac_cusanovich2018_subset():
    """downsampled sci-ATAC-seq mouse tissue data

    ref: Cusanovich, D.A. et al. A Single-Cell Atlas of In Vivo Mammalian
    Chromatin Accessibility. Cell 174, 1309-1324 e1318 (2018).

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    return _load_h5ad(
        'https://www.dropbox.com/s/e8iqwm93m33i5wt/atac_seq.h5ad?dl=1',
        'atac_cusanovich2018_subset.h5ad')


def atac_chen2019():
    """simulated scATAC-seq bone marrow data with a noise level of 0.4
    and a coverage of 2500 fragments

    ref: Chen, H. et al. Assessment of computational methods for the analysis
    of single-cell ATAC-seq data. Genome Biology 20, 241 (2019).

    Returns
    -------
    adata: `AnnData`
        Anndata object
    """
    return _load_h5ad(
        'https://www.dropbox.com/s/fthhh3mz5b39d4y/atac_seq.h5ad?dl=1',
        'atac_chen2019.h5ad')


def multiome_ma2020_fig4():
    """single cell multiome mouse skin data (SHARE-seq)

    ref: Ma, S. et al. Chromatin Potential Identified by Shared Single-Cell
    Profiling of RNA and Chromatin. Cell (2020).

    Returns
    -------
    dict_adata: `dict`
        A dictionary of anndata objects
    """
    return _load_multiome(
        'https://www.dropbox.com/s/gmmf77l8kzle6o7/rna_seq_fig4.h5ad?dl=1',
        'https://www.dropbox.com/s/ts0v2y2m5fcumcb/atac_seq_fig4.h5ad?dl=1',
        'multiome_ma2020_fig4')


def multiome_chen2019():
    """single cell multiome neonatal mouse cerebral cortex data (SNARE-seq)

    ref: Chen, S., Lake, B.B. & Zhang, K. High-throughput sequencing of the
    transcriptome and chromatin accessibility in the same cell.
    Nat Biotechnol (2019).

    Returns
    -------
    dict_adata: `dict`
        A dictionary of anndata objects
    """
    return _load_multiome(
        'https://www.dropbox.com/s/b1bbcs500q0pigt/rna_seq.h5ad?dl=1',
        'https://www.dropbox.com/s/ljepkfber68pdvc/atac_seq.h5ad?dl=1',
        'multiome_chen2019')


def multiome_10xpbmc10k():
    """single cell 10X human peripheral blood mononuclear cells (PBMCs)
    multiome data

    Returns
    -------
    dict_adata: `dict`
        A dictionary of anndata objects
    """
    return _load_multiome(
        'https://www.dropbox.com/s/zwlim6vljnbfp43/rna_seq.h5ad?dl=1',
        'https://www.dropbox.com/s/163msz0k9hkfrt7/atac_seq.h5ad?dl=1',
        'multiome_10xpbmc10k')
# Colorblindness adjusted vega_10
# See https://github.com/theislab/scanpy/issues/387
vega_10 = [colors.to_hex(rgb) for rgb in cm.tab10.colors]
vega_10_scanpy = list(vega_10)
vega_10_scanpy[2] = '#279e68'  # green
vega_10_scanpy[4] = '#aa40fc'  # purple
vega_10_scanpy[8] = '#b5bd61'  # khaki

# default matplotlib 2.0 palette
# see 'category20' on https://github.com/vega/vega/wiki/Scales#scale-range-literals
vega_20 = [colors.to_hex(rgb) for rgb in cm.tab20.colors]

# reordered, some removed, some added
vega_20_scanpy = (
    vega_20[0:14:2] + vega_20[16::2]    # dark without grey
    + vega_20[1:15:2] + vega_20[17::2]  # light without grey
    + ['#ad494a', '#8c6d31']            # manual additions
)
vega_20_scanpy[2] = vega_10_scanpy[2]
vega_20_scanpy[4] = vega_10_scanpy[4]
vega_20_scanpy[7] = vega_10_scanpy[8]  # khaki shifted by missing grey
# TODO: also replace pale colors if necessary

default_20 = vega_20_scanpy

# https://graphicdesign.stackexchange.com/questions/3682/where-can-i-find-a-large-palette-set-of-contrasting-colors-for-coloring-many-d
# update 1
# orig reference http://epub.wu.ac.at/1692/1/document.pdf
zeileis_28 = [
    "#023fa5", "#7d87b9", "#bec1d4", "#d6bcc0", "#bb7784", "#8e063b", "#4a6fe3",
    "#8595e1", "#b5bbe3", "#e6afb9", "#e07b91", "#d33f6a", "#11c638", "#8dd593",
    "#c6dec7", "#ead3c6", "#f0b98d", "#ef9708", "#0fcfc0", "#9cded6", "#d5eae7",
    "#f3e1eb", "#f6c4e1", "#f79cd4",
    # the last four entries were added to the original 24
    '#7f7f7f', "#c7c7c7", "#1CE6FF", "#336600",
]

default_28 = zeileis_28

# from http://godsnotwheregodsnot.blogspot.de/2012/09/color-distribution-methodology.html
godsnot_102 = [
    # "#000000",  # black is left out because annotations are often black
    "#FFFF00", "#1CE6FF", "#FF34FF", "#FF4A46", "#008941", "#006FA6", "#A30059",
    "#FFDBE5", "#7A4900", "#0000A6", "#63FFAC", "#B79762", "#004D43", "#8FB0FF", "#997D87",
    "#5A0007", "#809693", "#6A3A4C", "#1B4400", "#4FC601", "#3B5DFF", "#4A3B53", "#FF2F80",
    "#61615A", "#BA0900", "#6B7900", "#00C2A0", "#FFAA92", "#FF90C9", "#B903AA", "#D16100",
    "#DDEFFF", "#000035", "#7B4F4B", "#A1C299", "#300018", "#0AA6D8", "#013349", "#00846F",
    "#372101", "#FFB500", "#C2FFED", "#A079BF", "#CC0744", "#C0B9B2", "#C2FF99", "#001E09",
    "#00489C", "#6F0062", "#0CBD66", "#EEC3FF", "#456D75", "#B77B68", "#7A87A1", "#788D66",
    "#885578", "#FAD09F", "#FF8A9A", "#D157A0", "#BEC459", "#456648", "#0086ED", "#886F4C",
    "#34362D", "#B4A8BD", "#00A6AA", "#452C2C", "#636375", "#A3C8C9", "#FF913F", "#938A81",
    "#575329", "#00FECF", "#B05B6F", "#8CD0FF", "#3B9700", "#04F757", "#C8A1A1", "#1E6E00",
    "#7900D7", "#A77500", "#6367A9", "#A05837", "#6B002C", "#772600", "#D790FF", "#9B9700",
    "#549E79", "#FFF69F", "#201625", "#72418F", "#BC23FF", "#99ADC0", "#3A2465", "#922329",
    "#5B4534", "#FDE8DC", "#404E55", "#0089A3", "#CB7E98", "#A4E804", "#324E72",
]

default_102 = godsnot_102
def get_colors(arr,
               vmin=None,
               vmax=None,
               clip=False):
    """Generate a list of colors for a given array

    Parameters
    ----------
    arr: `pd.Series` or `np.ndarray`
        Values to translate into colors.
        Numeric values are mapped through the colormap named by
        `mpl.rcParams['image.cmap']`; string or categorical values get
        one color per unique value.
    vmin: `float`, optional (default: None)
        Lower bound of the numeric color scale.
        If None, the smallest non-NaN value of `arr` is used.
    vmax: `float`, optional (default: None)
        Upper bound of the numeric color scale.
        If None, the largest non-NaN value of `arr` is used.
    clip: `bool`, optional (default: False)
        Passed to `matplotlib.colors.Normalize`.

    Returns
    -------
    colors: `list`
        One hex color string per element of `arr`.

    Raises
    ------
    TypeError
        If `arr` is not a `pd.Series`/`np.ndarray`, or its dtype is
        neither numeric, string nor categorical.
    """

    if not isinstance(arr, (pd.Series, np.ndarray)):
        raise TypeError("`arr` must be pd.Series or np.ndarray")

    if is_numeric_dtype(arr):
        cmap_name = mpl.rcParams['image.cmap']
        try:
            cmap = mpl.colormaps[cmap_name].resampled(512)
        except AttributeError:
            # older Matplotlib without the colormap registry / `resampled`;
            # `cm.get_cmap` itself is gone from Matplotlib 3.9 onwards
            cmap = mpl.cm.get_cmap(cmap_name, 512)
        # nan-aware bounds: builtin min/max give order-dependent results
        # as soon as a NaN is present
        if vmin is None:
            vmin = np.nanmin(arr)
        if vmax is None:
            vmax = np.nanmax(arr)
        norm = mpl.colors.Normalize(vmin=vmin, vmax=vmax, clip=clip)
        return [mpl.colors.to_hex(cmap(norm(x))) for x in arr]

    if is_string_dtype(arr) or is_categorical_dtype(arr):
        categories = np.unique(arr)
        length = len(categories)
        # prefer the active matplotlib color cycle when it is long enough
        # mpl.style.use('default')
        cycle = mpl.rcParams['axes.prop_cycle'].by_key()['color']
        if len(cycle) >= length:
            palette = [mpl.colors.rgb2hex(c) for c in cycle[:length]]
        elif length <= 20:
            palette = default_20
        elif length <= 28:
            palette = default_28
        elif length <= len(default_102):  # 102 colors
            palette = default_102
        else:
            rgb_rainbow = mpl.cm.rainbow(np.linspace(0, 1, length))
            palette = [mpl.colors.rgb2hex(rgb_rainbow[i, :-1])
                       for i in range(length)]
        color_of = dict(zip(categories, palette))
        # values that match no category (e.g. NaN) keep the '' placeholder
        return [color_of.get(x, '') for x in arr]

    raise TypeError("unsupported data type for `arr`")
def generate_palette(arr):
    """Generate a color palette for a given array

    Parameters
    ----------
    arr: `pd.Series` or `np.ndarray`
        String or categorical values.

    Returns
    -------
    dict_palette: `dict`
        Maps each distinct value of `arr` to a hex color string.
        Colors are assigned by the order of `np.unique(arr)`.

    Raises
    ------
    TypeError
        If `arr` is not a `pd.Series`/`np.ndarray`, or its dtype is
        neither string nor categorical.
    """

    if not isinstance(arr, (pd.Series, np.ndarray)):
        raise TypeError("`arr` must be pd.Series or np.ndarray")
    if not (is_string_dtype(arr) or is_categorical_dtype(arr)):
        raise TypeError("unsupported data type for `arr`")

    categories = np.unique(arr)
    n_categories = len(categories)
    # use the active matplotlib color cycle when it has enough entries
    # mpl.style.use('default')
    cycle = mpl.rcParams['axes.prop_cycle'].by_key()['color']
    if len(cycle) >= n_categories:
        palette = [mpl.colors.rgb2hex(c) for c in cycle[:n_categories]]
    elif n_categories <= 20:
        palette = default_20
    elif n_categories <= 28:
        palette = default_28
    elif n_categories <= len(default_102):  # 102 colors
        palette = default_102
    else:
        rainbow = mpl.cm.rainbow(np.linspace(0, 1, n_categories))
        palette = [mpl.colors.rgb2hex(rgba[:-1]) for rgba in rainbow]

    color_of = dict(zip(categories, palette))
    # keys follow the order of first appearance in `arr`; values matching
    # no category keep the '' placeholder
    return {x: color_of.get(x, '') for x in arr}
def log_transform(adata):
    """Apply the natural logarithm of one plus the input, element-wise.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.

    Returns
    -------
    updates `adata` with the following fields.
    X: sparse matrix (`adata.X`)
        Store #observations × #var_genes logarithmized data matrix.
        A dense `adata.X` is converted to CSR first.
    """
    X = adata.X
    if not issparse(X):
        X = csr_matrix(X)
        adata.X = X
    adata.X = np.log1p(X)


def binarize(adata,
             threshold=1e-5):
    """Binarize an array.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    threshold: `float`, optional (default: 1e-5)
        Values below or equal to this are replaced by 0, above it by 1.

    Returns
    -------
    updates `adata` with the following fields.
    X: sparse matrix (`adata.X`)
        Store #observations × #var_genes binarized data matrix.
        A dense `adata.X` is converted to CSR first.
    """
    X = adata.X
    if not issparse(X):
        X = csr_matrix(X)
        adata.X = X
    adata.X = preprocessing.binarize(X,
                                     threshold=threshold,
                                     copy=True)
def normalize(adata,
              method='lib_size',
              scale_factor=1e4,
              save_raw=True):
    """Normalize count matrix.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    method: `str`, optional (default: 'lib_size')
        Choose from {{'lib_size','tf_idf'}}
        Method used for normalization.
        'lib_size': Total-count normalize (library-size correct)
        'tf_idf': TF-IDF (term frequency–inverse document frequency)
        transformation
    scale_factor: `float`, optional (default: 1e4)
        Total that every observation is scaled to.
        Only used when `method` is 'lib_size'.
    save_raw: `bool`, optional (default: True)
        If True, a copy of the matrix before normalization is stored in
        `adata.layers['raw']`.

    Returns
    -------
    updates `adata` with the following fields.
    X: sparse matrix (`adata.X`)
        Store #observations × #var_genes normalized data matrix.
    raw: sparse matrix (`adata.layers['raw']`)
        The un-normalized matrix (only when `save_raw` is True).

    Raises
    ------
    ValueError
        If `method` is not one of the supported methods.
    """
    if method not in ('lib_size', 'tf_idf'):
        raise ValueError("unrecognized method '%s'" % method)
    if not issparse(adata.X):
        adata.X = csr_matrix(adata.X)
    if save_raw:
        adata.layers['raw'] = adata.X.copy()
    if method == 'lib_size':
        X = adata.X
        # the in-place row scaling only handles CSR/CSC ...
        if X.format not in ('csr', 'csc'):
            X = X.tocsr()
        # ... and cannot write float factors into an integer buffer
        if not np.issubdtype(X.dtype, np.floating):
            X = X.astype(np.float64)
        lib_size = np.asarray(X.sum(axis=1), dtype=np.float64).ravel()
        # rows without counts keep a neutral factor, so stored zeros stay
        # zero instead of becoming inf * 0 = NaN
        inv_lib_size = np.ones_like(lib_size)
        np.divide(1.0, lib_size, out=inv_lib_size, where=lib_size != 0)
        sparsefuncs.inplace_row_scale(X, inv_lib_size)
        adata.X = X*scale_factor
    if method == 'tf_idf':
        adata.X = cal_tf_idf(adata.X)
def pca(adata,
        n_components=50,
        algorithm='randomized',
        n_iter=5,
        random_state=2021,
        tol=0.0,
        feature=None,
        **kwargs,
        ):
    """perform Principal Component Analysis (PCA)

    The decomposition is done with `TruncatedSVD`.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    n_components: `int`, optional (default: 50)
        Desired dimensionality of output data
    algorithm: `str`, optional (default: 'randomized')
        SVD solver to use. Choose from {'arpack', 'randomized'}.
    n_iter: `int`, optional (default: '5')
        Number of iterations for randomized SVD solver.
        Not used by ARPACK.
    random_state: `int`, optional (default: 2021)
        Seed passed to `TruncatedSVD()`.
    tol: `float`, optional (default: 0)
        Tolerance for ARPACK. 0 means machine precision.
        Ignored by randomized SVD solver.
    feature: `str`, optional (default: None)
        Feature used to perform PCA.
        The data type of `.var[feature]` needs to be `bool`
        If None, adata.X will be used.
    kwargs:
        Other keyword arguments are passed down to `TruncatedSVD()`

    Returns
    -------
    updates `adata` with the following fields:
    `.obsm['X_pca']` : `array`
        PCA transformed X.
    `.uns['pca']['n_pcs']` : `int`
        Number of components, set to `n_components`.
    `.uns['pca']['PCs']` : `array`
        Principal components in feature space,
        representing the directions of maximum variance in the data.
    `.uns['pca']['variance']` : `array`
        The variance of the training samples transformed by a
        projection to each component.
    `.uns['pca']['variance_ratio']` : `array`
        Percentage of variance explained by each of the selected components.
    """
    # restrict to the flagged features only when a column name is given
    source = adata if feature is None else adata[:, adata.var[feature]]
    X = source.X.copy()

    svd_params = dict(n_components=n_components,
                      algorithm=algorithm,
                      n_iter=n_iter,
                      random_state=random_state,
                      tol=tol)
    svd = TruncatedSVD(**svd_params, **kwargs)
    svd.fit(X)

    adata.obsm['X_pca'] = svd.transform(X)
    adata.uns['pca'] = {
        'n_pcs': n_components,
        'PCs': svd.components_.T,  # rows: features, columns: components
        'variance': svd.explained_variance_,
        'variance_ratio': svd.explained_variance_ratio_,
    }
def select_pcs(adata,
               n_pcs=None,
               S=1,
               curve='convex',
               direction='decreasing',
               online=False,
               min_elbow=None,
               **kwargs):
    """select top PCs based on variance_ratio

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix holding `.obsm['X_pca']` and `.uns['pca']`.
    n_pcs: `int`, optional (default: None)
        If n_pcs is None,
        the number of PCs will be automatically selected with "`kneed
        <https://github.com/arvkevi/kneed>`__"
    S : `float`, optional (default: 1)
        Sensitivity
    min_elbow: `int`, optional (default: None)
        The minimum elbow location
        By default, it is n_components/10
    curve: `str`, optional (default: 'convex')
        Choose from {'convex','concave'}
        If 'concave', algorithm will detect knees,
        If 'convex', algorithm will detect elbows.
    direction: `str`, optional (default: 'decreasing')
        Choose from {'decreasing','increasing'}
    online: `bool`, optional (default: False)
        kneed will correct old knee points if True,
        kneed will return first knee if False.
    **kwargs: `dict`, optional
        Extra arguments to KneeLocator.

    Returns
    -------
    updates `adata` with the following fields:
    `.uns['pca']['n_pcs']` : `int`
        The number of selected PCs.
    """
    if n_pcs is None:
        # no explicit choice: look for the elbow of the variance ratios
        n_components = adata.obsm['X_pca'].shape[1]
        if min_elbow is None:
            min_elbow = n_components/10
        n_pcs = locate_elbow(range(n_components),
                             adata.uns['pca']['variance_ratio'],
                             S=S,
                             curve=curve,
                             min_elbow=min_elbow,
                             direction=direction,
                             online=online,
                             **kwargs)
    adata.uns['pca']['n_pcs'] = n_pcs
def select_pcs_features(adata,
                        S=1,
                        curve='convex',
                        direction='decreasing',
                        online=False,
                        min_elbow=None,
                        **kwargs):
    """select features that contribute to the top PCs

    For each of the first `.uns['pca']['n_pcs']` components, features are
    ranked by absolute loading and those before the elbow are kept.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix holding `.uns['pca']`.
    S : `float`, optional (default: 1)
        Sensitivity
    min_elbow: `int`, optional (default: None)
        The minimum elbow location.
        By default, it is #features/6
    curve: `str`, optional (default: 'convex')
        Choose from {'convex','concave'}
        If 'concave', algorithm will detect knees,
        If 'convex', algorithm will detect elbows.
    direction: `str`, optional (default: 'decreasing')
        Choose from {'decreasing','increasing'}
    online: `bool`, optional (default: False)
        kneed will correct old knee points if True,
        kneed will return first knee if False.
    **kwargs: `dict`, optional
        Extra arguments to KneeLocator.

    Returns
    -------
    updates `adata` with the following fields:
    `.uns['pca']['features']` : `dict`
        For every PC `i`, key `pc_{i}` holds the indices of its selected
        features, ordered by decreasing absolute loading.
    `.var['top_pcs']` : `bool`
        Indicator of features selected by at least one PC.
    """
    n_pcs = adata.uns['pca']['n_pcs']
    n_features = adata.uns['pca']['PCs'].shape[0]
    if min_elbow is None:
        min_elbow = n_features/6
    adata.uns['pca']['features'] = dict()
    ids_features = list()
    for i in range(n_pcs):
        abs_loadings = np.abs(adata.uns['pca']['PCs'][:, i])
        # a single argsort provides both the ranking and the sorted values
        ranking = np.argsort(abs_loadings)[::-1]
        elbow = locate_elbow(range(n_features),
                             abs_loadings[ranking],
                             S=S,
                             min_elbow=min_elbow,
                             curve=curve,
                             direction=direction,
                             online=online,
                             **kwargs)
        ids_features_i = list(ranking[:elbow])
        adata.uns['pca']['features'][f'pc_{i}'] = ids_features_i
        ids_features.extend(ids_features_i)
        print(f'#features selected from PC {i}: {len(ids_features_i)}')
    adata.var['top_pcs'] = False
    # explicit integer dtype keeps an empty selection usable as an index
    ids_top = np.unique(np.asarray(ids_features, dtype=int))
    # NOTE(review): row positions of `PCs` are used directly as positions
    # in `adata.var_names`; if the PCA was computed on a feature subset
    # these may not line up -- confirm against the caller.
    adata.var.loc[adata.var_names[ids_top], 'top_pcs'] = True
    print(f'#features in total: {adata.var["top_pcs"].sum()}')
def select_variable_genes(adata,
                          layer='raw',
                          span=0.3,
                          n_top_genes=2000,
                          ):
    """Select highly variable genes.

    This function implements the method 'vst' in Seurat v3.
    Inspired by Scanpy.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    layer: `str`, optional (default: 'raw')
        The layer to use for calculating variable genes.
        If None, `.X` will be used. The input itself is never modified.
    span: `float`, optional (default: 0.3)
        Loess smoothing factor
    n_top_genes: `int`, optional (default: 2000)
        The number of genes to keep

    Returns
    -------
    updates `adata` with the following fields.

    variances_norm: `float`, (`adata.var['variances_norm']`)
        Normalized variance per gene.
        Genes with zero variance are assigned 0.
    variances: `float`, (`adata.var['variances']`)
        Variance per gene.
    means: `float`, (`adata.var['means']`)
        Means per gene
    highly_variable: `bool` (`adata.var['highly_variable']`)
        Indicator of variable genes
    """
    if layer is None:
        counts = adata.X
    else:
        counts = adata.layers[layer]
    # Always work on a private float64 CSR copy: the clipping step below
    # writes into `X.data`, and `csr_matrix(...)` without `copy=True` shares
    # its buffers with the input, which would silently alter `adata`.
    X = csr_matrix(counts, dtype=np.float64, copy=True)
    n_genes = adata.shape[1]

    mean, variance = sparsefuncs.mean_variance_axis(X, axis=0)
    variance_expected = np.zeros(n_genes, dtype=np.float64)
    not_const = variance > 0

    # expected variance given the mean, fitted in log-log space
    model = loess(np.log10(mean[not_const]),
                  np.log10(variance[not_const]),
                  span=span,
                  degree=2)
    model.fit()
    variance_expected[not_const] = 10**model.outputs.fitted_values
    N = adata.shape[0]
    clip_max = np.sqrt(N)
    clip_val = np.sqrt(variance_expected) * clip_max + mean

    # clip each stored value at the threshold of its gene (column)
    mask = X.data > clip_val[X.indices]
    X.data[mask] = clip_val[X.indices[mask]]

    squared_X_sum = np.array(X.power(2).sum(axis=0)).flatten()
    X_sum = np.array(X.sum(axis=0)).flatten()

    # Constant genes have an expected variance of 0; evaluating the formula
    # for them gives 0/0 = NaN, and NaN is sorted last by `argsort`, which
    # would rank those genes as the most variable. Leave them at 0 instead.
    norm_gene_var = np.zeros(n_genes, dtype=np.float64)
    norm_gene_var[not_const] = \
        (1 / ((N - 1) * variance_expected[not_const])) \
        * ((N * np.square(mean[not_const]))
           + squared_X_sum[not_const]
           - 2 * X_sum[not_const] * mean[not_const]
           )

    adata.var['variances_norm'] = norm_gene_var
    adata.var['variances'] = variance
    adata.var['means'] = mean
    ids_top = norm_gene_var.argsort()[-n_top_genes:][::-1]
    adata.var['highly_variable'] = np.isin(np.arange(n_genes), ids_top)
    # report what was actually selected (fewer if there are fewer genes)
    print(f'{len(ids_top)} variable genes are selected.')
def read_embedding(path_emb=None,
                   path_entity=None,
                   convert_alias=True,
                   path_entity_alias=None,
                   prefix=None,
                   num_epochs=None):
    """Read in entity embeddings from pbg training

    Parameters
    ----------
    path_emb: `str`, optional (default: None)
        Path to directory for pbg embedding model
        If None, .settings.pbg_params['checkpoint_path'] will be used.
    path_entity: `str`, optional (default: None)
        Path to the directory of entity name files
        If None, .settings.pbg_params['entity_path'] will be used.
    convert_alias: `bool`, optional (default: True)
        If True, it will convert entity aliases to the original indices
    path_entity_alias: `str`, optional (default: None)
        Path to the directory containing 'entity_alias.txt'
        If None, the parent directory of `path_emb` will be used.
    prefix: `list`, optional (default: None)
        A list of entity type prefixes to include.
        By default, it reads in the embeddings of all entities.
    num_epochs: `int`, optional (default: None)
        The embedding result associated with num_epochs to read in
        If None, .settings.pbg_params['num_epochs'] will be used.

    Returns
    -------
    dict_adata: `dict`
        A dictionary of anndata objects of shape
        (#entities x #dimensions)
    """
    pbg_params = settings.pbg_params
    if path_emb is None:
        path_emb = pbg_params['checkpoint_path']
    if path_entity is None:
        path_entity = pbg_params['entity_path']
    if num_epochs is None:
        num_epochs = pbg_params["num_epochs"]
    if prefix is None:
        prefix = []
    assert isinstance(prefix, list), \
        "`prefix` must be list"
    if convert_alias:
        if path_entity_alias is None:
            path_entity_alias = Path(path_emb).parent.as_posix()
        df_entity_alias = pd.read_csv(
            os.path.join(path_entity_alias, 'entity_alias.txt'),
            header=0,
            index_col=0,
            sep='\t')
        # re-index by alias so that aliases can be looked up directly
        df_entity_alias['id'] = df_entity_alias.index
        df_entity_alias.index = df_entity_alias['alias'].values

    dict_adata = dict()
    for x in os.listdir(path_emb):
        if not x.startswith('embeddings'):
            continue
        entity_type = x.split('_')[1]
        # The directory may hold several checkpoint files per entity type,
        # but the file that gets read depends only on `entity_type` and
        # `num_epochs`; read it once instead of once per file found.
        if entity_type in dict_adata:
            continue
        if prefix and entity_type not in prefix:
            continue
        adata = \
            read_hdf(os.path.join(path_emb,
                                  f'embeddings_{entity_type}_0.'
                                  f'v{num_epochs}.h5'),
                     key="embeddings")
        with open(
            os.path.join(path_entity,
                         f'entity_names_{entity_type}_0.json'), "rt")\
                as f:
            names_entity = json.load(f)
        if convert_alias:
            names_entity = \
                df_entity_alias.loc[names_entity, 'id'].tolist()
        adata.obs.index = names_entity
        dict_adata[entity_type] = adata
    return dict_adata
def write_bed(adata,
              use_top_pcs=True,
              filename=None
              ):
    """Write peaks into .bed file

    The columns 'chr', 'start' and 'end' of `adata.var` are written as a
    tab-separated file without header and without index.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix with peaks as variables.
    use_top_pcs: `bool`, optional (default: True)
        Use top-PCs-associated features,
        i.e. only the peaks flagged in `adata.var['top_pcs']`
    filename: `str`, optional (default: None)
        Filename name for peaks.
        By default, a file named 'peaks.bed' will be written to
        `.settings.workdir`
    """
    bed_columns = ['chr', 'start', 'end']
    out_path = filename
    if out_path is None:
        out_path = os.path.join(settings.workdir, 'peaks.bed')

    # report the first required column that is absent
    missing = next(
        (col for col in bed_columns if col not in adata.var_keys()), None)
    if missing is not None:
        raise ValueError(f"could not find {missing} in `adata.var_keys()`")

    peaks = adata.var
    if use_top_pcs:
        assert 'top_pcs' in adata.var_keys(), \
            "please run `si.pp.select_pcs_features()` first"
        peaks = peaks[peaks['top_pcs']]
    peaks[bed_columns].to_csv(out_path,
                              sep='\t',
                              header=False,
                              index=False)
    directory, basename = os.path.split(out_path)
    print(f'"{basename}" was written to "{directory}".')
class GeneScores:
    """A class used to represent gene scores

    Attributes
    ----------
    The constructor arguments are stored unchanged as attributes of the
    same name (`adata`, `genome`, `gene_anno`, `tss_upstream`,
    `tss_downsteam`, `gb_upstream`, `cutoff_weight`, `use_top_pcs`,
    `use_precomputed`, `use_gene_weigt`, `min_w`, `max_w`).

    Methods
    -------
    cal_gene_scores()
        Calculate the cells-by-genes score matrix.
    """
    def __init__(self,
                 adata,
                 genome,
                 gene_anno=None,
                 tss_upstream=1e5,
                 tss_downsteam=1e5,
                 gb_upstream=5000,
                 cutoff_weight=1,
                 use_top_pcs=True,
                 use_precomputed=True,
                 use_gene_weigt=True,
                 min_w=1,
                 max_w=5):
        """
        Parameters
        ----------
        adata: `Anndata`
            Input anndata
        genome : `str`
            The genome name
        gene_anno : `pandas.DataFrame`, optional (default: None)
            Gene annotation. If None, the built-in annotation of `genome`
            is read when scores are calculated.
        tss_upstream, tss_downsteam :
            The number of base pairs by which the region around the TSS
            is extended upstream / downstream
        gb_upstream :
            The number of base pairs by which the gene body is
            extended upstream
        cutoff_weight :
            Peaks with a weight below this value get the weight 0
        use_top_pcs : `bool`
            If True, only peaks flagged in `adata.var['top_pcs']` are used
        use_precomputed : `bool`
            If True, reuse `adata.uns['gene_scores']['overlap']`
        use_gene_weigt : `bool`
            If True, rescale the contribution of each gene by its size
        min_w, max_w :
            The range of the gene weights
        """
        self.adata = adata
        self.genome = genome
        self.gene_anno = gene_anno
        self.tss_upstream = tss_upstream
        self.tss_downsteam = tss_downsteam
        self.gb_upstream = gb_upstream
        self.cutoff_weight = cutoff_weight
        self.use_top_pcs = use_top_pcs
        self.use_precomputed = use_precomputed
        self.use_gene_weigt = use_gene_weigt
        self.min_w = min_w
        self.max_w = max_w

    def _read_gene_anno(self):
        """Read in gene annotation

        Loads the packaged file `data/gene_anno/{genome}_genes.bed`.

        Returns
        -------
        gene_anno: `pandas.DataFrame`
            Columns 'chr', 'start', 'end', 'symbol', 'strand'.
            It is also stored in `self.gene_anno`.
        """
        assert (self.genome in ['hg19', 'hg38', 'mm9', 'mm10']),\
            "`genome` must be one of ['hg19','hg38','mm9','mm10']"

        bin_str = pkgutil.get_data('simba',
                                   f'data/gene_anno/{self.genome}_genes.bed')
        gene_anno = pd.read_csv(io.BytesIO(bin_str),
                                encoding='utf8',
                                sep='\t',
                                header=None,
                                names=['chr', 'start', 'end',
                                       'symbol', 'strand'])
        self.gene_anno = gene_anno
        return self.gene_anno

    def _extend_tss(self, pbt_gene):
        """Extend transcription start site in both directions

        Parameters
        ----------
        pbt_gene:
            A gene interval exposing `['strand']`, `.start` and `.end`.
            It is modified in place.

        Returns
        -------
        ext_tss:
            The same interval, covering at least `tss_upstream` bp
            upstream and `tss_downsteam` bp downstream of the TSS
            (`start` for '+' strand, `end` otherwise).
        """
        ext_tss = pbt_gene
        if ext_tss['strand'] == '+':
            # remember the TSS before `start` is overwritten; the
            # downstream window is measured from the TSS itself,
            # not from the already shifted start
            tss = ext_tss.start
            ext_tss.start = int(max(0, tss - self.tss_upstream))
            ext_tss.end = int(max(ext_tss.end, tss + self.tss_downsteam))
        else:
            ext_tss.start = int(max(0, min(ext_tss.start,
                                           ext_tss.end - self.tss_downsteam)))
            ext_tss.end = int(ext_tss.end + self.tss_upstream)
        return ext_tss

    def _extend_genebody(self, pbt_gene):
        """Extend gene body upstream

        Parameters
        ----------
        pbt_gene:
            A gene interval exposing `['strand']`, `.start` and `.end`.
            It is modified in place.

        Returns
        -------
        ext_gb:
            The same interval, extended by `gb_upstream` bp on its
            upstream side.
        """
        ext_gb = pbt_gene
        if ext_gb['strand'] == '+':
            ext_gb.start = int(max(0, ext_gb.start - self.gb_upstream))
        else:
            ext_gb.end = int(ext_gb.end + self.gb_upstream)
        return ext_gb

    def _weight_genes(self):
        """Weight genes

        The weight of a gene is the inverse of its size, rescaled
        linearly to the range [`min_w`, `max_w`].

        Returns
        -------
        w_scaled: `pandas.Series`
            One weight per row of `self.gene_anno`.
        """
        gene_anno = self.gene_anno
        gene_size = gene_anno['end'] - gene_anno['start']
        w = 1/gene_size
        w_scaled = (self.max_w-self.min_w) * (w-min(w)) / (max(w)-min(w)) \
            + self.min_w
        return w_scaled

    def cal_gene_scores(self):
        """Calculate gene scores

        Returns
        -------
        adata_CG_atac: `AnnData`
            Cells-by-genes matrix of gene scores.

        When the overlap is computed (not precomputed), it is stored in
        `self.adata.uns['gene_scores']['overlap']`.
        """
        adata = self.adata
        if self.gene_anno is None:
            gene_ann = self._read_gene_anno()
        else:
            gene_ann = self.gene_anno

        df_gene_ann = gene_ann.copy()
        df_gene_ann.index = _uniquify(df_gene_ann['symbol'].values)
        if self.use_top_pcs:
            mask_p = adata.var['top_pcs']
        else:
            mask_p = pd.Series(True, index=adata.var_names)
        df_peaks = adata.var[mask_p][['chr', 'start', 'end']].copy()

        if 'gene_scores' not in adata.uns_keys():
            print('Gene scores are being calculated for the first time')
            print('`use_precomputed` has been ignored')
            self.use_precomputed = False

        if self.use_precomputed:
            print('Using precomputed overlap')
            df_overlap_updated = adata.uns['gene_scores']['overlap'].copy()
        else:
            # add the fifth column
            # so that pybedtool can recognize the sixth column as the strand
            df_gene_ann_for_pbt = df_gene_ann.copy()
            df_gene_ann_for_pbt['score'] = 0
            df_gene_ann_for_pbt = df_gene_ann_for_pbt[['chr', 'start', 'end',
                                                       'symbol', 'score',
                                                       'strand']]
            df_gene_ann_for_pbt['id'] = range(df_gene_ann_for_pbt.shape[0])

            df_peaks_for_pbt = df_peaks.copy()
            df_peaks_for_pbt['id'] = range(df_peaks_for_pbt.shape[0])

            pbt_gene_ann = pybedtools.BedTool.from_dataframe(
                df_gene_ann_for_pbt
            )
            pbt_gene_ann_ext = pbt_gene_ann.each(self._extend_tss)
            pbt_gene_gb_ext = pbt_gene_ann.each(self._extend_genebody)

            pbt_peaks = pybedtools.BedTool.from_dataframe(df_peaks_for_pbt)

            overlap_names = \
                [x+'_p' for x in df_peaks_for_pbt.columns] \
                + [x+'_g' for x in df_gene_ann_for_pbt.columns]
            # peaks overlapping with extended TSS
            pbt_overlap = pbt_peaks.intersect(pbt_gene_ann_ext,
                                              wa=True,
                                              wb=True)
            df_overlap = pbt_overlap.to_dataframe(names=overlap_names)
            # peaks overlapping with gene body
            pbt_overlap2 = pbt_peaks.intersect(pbt_gene_gb_ext,
                                               wa=True,
                                               wb=True)
            df_overlap2 = pbt_overlap2.to_dataframe(names=overlap_names)

            # add distance and weight for each overlap
            df_overlap_updated = df_overlap.copy()
            df_overlap_updated['dist'] = 0

            n_genes = df_gene_ann_for_pbt.shape[0]
            # progress is reported about five times; at least 1 so that
            # annotations with fewer than five genes do not divide by zero
            n_batch = max(1, int(n_genes/5))
            for i, x in enumerate(df_overlap['symbol_g'].unique()):
                # peaks within the extended TSS
                df_overlap_x = \
                    df_overlap[df_overlap['symbol_g'] == x].copy()
                # peaks within the gene body
                df_overlap2_x = \
                    df_overlap2[df_overlap2['symbol_g'] == x].copy()
                # peaks that are not intersecting with the promoter
                # and gene body of gene x
                id_overlap = df_overlap_x.index[
                    ~np.isin(df_overlap_x['id_p'], df_overlap2_x['id_p'])]
                mask_x = (df_gene_ann['symbol'] == x)
                range_x = df_gene_ann[mask_x][['start', 'end']].values\
                    .flatten()
                if df_overlap_x['strand_g'].iloc[0] == '+':
                    df_overlap_updated.loc[id_overlap, 'dist'] = pd.concat(
                        [abs(df_overlap_x.loc[id_overlap, 'start_p']
                             - (range_x[1])),
                         abs(df_overlap_x.loc[id_overlap, 'end_p']
                             - max(0, range_x[0]-self.gb_upstream))],
                        axis=1, sort=False).min(axis=1)
                else:
                    df_overlap_updated.loc[id_overlap, 'dist'] = pd.concat(
                        [abs(df_overlap_x.loc[id_overlap, 'start_p']
                             - (range_x[1]+self.gb_upstream)),
                         abs(df_overlap_x.loc[id_overlap, 'end_p']
                             - (range_x[0]))],
                        axis=1, sort=False).min(axis=1)

                if i % n_batch == 0:
                    print(f'Processing: {i/n_genes:.1%}')
            df_overlap_updated['dist'] = df_overlap_updated['dist']\
                .astype(float)

            adata.uns['gene_scores'] = dict()
            adata.uns['gene_scores']['overlap'] = df_overlap_updated.copy()

        # the weight decays exponentially with the distance;
        # peaks inside the extended gene body have dist 0, i.e. weight 1
        df_overlap_updated['weight'] = np.exp(
            -(df_overlap_updated['dist'].values/self.gb_upstream))
        mask_w = (df_overlap_updated['weight'] < self.cutoff_weight)
        df_overlap_updated.loc[mask_w, 'weight'] = 0
        # construct genes-by-peaks matrix
        mat_GP = csr_matrix(coo_matrix((df_overlap_updated['weight'],
                                        (df_overlap_updated['id_g'],
                                         df_overlap_updated['id_p'])),
                                       shape=(df_gene_ann.shape[0],
                                              df_peaks.shape[0])))
        if self.use_gene_weigt:
            gene_weights = self._weight_genes()
            gene_scores = adata[:, mask_p].X * \
                (mat_GP.T.multiply(gene_weights))
        else:
            gene_scores = adata[:, mask_p].X * mat_GP.T
        adata_CG_atac = ad.AnnData(gene_scores,
                                   obs=adata.obs.copy(),
                                   var=df_gene_ann.copy())
        return adata_CG_atac
def gene_scores(adata, 276 | genome, 277 | gene_anno=None, 278 | tss_upstream=1e5, 279 | tss_downsteam=1e5, 280 | gb_upstream=5000, 281 | cutoff_weight=1, 282 | use_top_pcs=True, 283 | use_precomputed=True, 284 | use_gene_weigt=True, 285 | min_w=1, 286 | max_w=5): 287 | """Calculate gene scores 288 | 289 | Parameters 290 | ---------- 291 | adata : AnnData 292 | Annotated data matrix. 293 | genome : `str` 294 | Reference genome. Choose from {'hg19', 'hg38', 'mm9', 'mm10'} 295 | gene_anno : `pandas.DataFrame`, optional (default: None) 296 | Dataframe of gene annotation. 297 | If None, built-in gene annotation will be used depending on `genome`; 298 | If provided, custom gene annotation will be used instead. 299 | tss_upstream : `int`, optional (default: 1e5) 300 | The number of base pairs upstream of TSS 301 | tss_downsteam : `int`, optional (default: 1e5) 302 | The number of base pairs downstream of TSS 303 | gb_upstream : `int`, optional (default: 5000) 304 | The number of base pairs upstream by which gene body is extended. 305 | Peaks within the extended gene body are given the weight of 1. 306 | cutoff_weight : `float`, optional (default: 1) 307 | Weight cutoff for peaks 308 | use_top_pcs : `bool`, optional (default: True) 309 | If True, only peaks associated with top PCs will be used 310 | use_precomputed : `bool`, optional (default: True) 311 | If True, overlap bewteen peaks and genes 312 | (stored in `adata.uns['gene_scores']['overlap']`) will be imported 313 | use_gene_weigt : `bool`, optional (default: True) 314 | If True, for each gene, the number of peaks assigned to it 315 | will be rescaled based on gene size 316 | min_w : `int`, optional (default: 1) 317 | The minimum weight for each gene. 318 | Only valid if `use_gene_weigt` is True 319 | max_w : `int`, optional (default: 5) 320 | The maximum weight for each gene. 321 | Only valid if `use_gene_weigt` is True 322 | 323 | Returns 324 | ------- 325 | adata_new: AnnData 326 | Annotated data matrix. 
def discretize(adata,
               layer=None,
               n_bins=5,
               max_bins=100):
    """Discretize continuous values

    The stored values of the matrix are first summarized by a histogram
    of `max_bins` bins; the histogram bin centers, weighted by their
    counts, are then clustered into `n_bins` groups with k-means and the
    midpoints between neighbouring cluster centers become the bin edges.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    layer: `str`, optional (default: None)
        The layer used to perform discretization
        If None, `.X` will be used.
        The matrix is expected to be sparse: only the values stored in
        its `.data` attribute are discretized.
    n_bins: `int`, optional (default: 5)
        The number of bins to produce.
        It must be smaller than `max_bins`.
    max_bins: `int`, optional (default: 100)
        The number of bins used in the initial approximation.
        i.e. the number of bins to cluster.

    Returns
    -------
    updates `adata` with the following fields

    `.layers['disc']` : `array_like`
        Discretized values.
    `.uns['disc']` : `dict`
        `bin_edges`: The edges of each bin.
        `bin_count`: The number of values in each bin.
        `hist_edges`: The edges of each bin \
                      in the initial approximation.
        `hist_count`: The number of values in each bin \
                      for the initial approximation.
    """
    if layer is None:
        X = adata.X
    else:
        X = adata.layers[layer]
    nonzero_cont = X.data

    hist_count, hist_edges = np.histogram(
        nonzero_cont,
        bins=max_bins,
        density=False)
    hist_centroids = (hist_edges[0:-1] + hist_edges[1:])/2

    kmeans = KMeans(n_clusters=n_bins, random_state=2021).fit(
        hist_centroids.reshape(-1, 1),
        sample_weight=hist_count)
    cluster_centers = np.sort(kmeans.cluster_centers_.flatten())

    # pad the outer edges so that the smallest and the largest value
    # fall strictly inside the first and the last bin
    padding = (hist_edges[-1] - hist_edges[0])/(max_bins*10)
    bin_edges = np.array(
        [hist_edges[0]-padding] +
        list((cluster_centers[0:-1] + cluster_centers[1:])/2) +
        [hist_edges[-1]+padding])
    nonzero_disc = np.digitize(nonzero_cont, bin_edges).reshape(-1,)
    # Labels run from 1 to n_bins. Counting with `bincount` keeps a zero
    # for a bin that received no value, so that `bin_count[i]` always
    # describes the bin between `bin_edges[i]` and `bin_edges[i+1]`.
    bin_count = np.bincount(nonzero_disc, minlength=n_bins+1)[1:]

    adata.layers['disc'] = X.copy()
    adata.layers['disc'].data = nonzero_disc
    adata.uns['disc'] = dict()
    adata.uns['disc']['bin_edges'] = bin_edges
    adata.uns['disc']['bin_count'] = bin_count
    adata.uns['disc']['hist_edges'] = hist_edges
    adata.uns['disc']['hist_count'] = hist_count
| adata_ref: `AnnData` 27 | Annotated reference data matrix. 28 | adata_query: `AnnData` 29 | Annotated query data matrix. 30 | feature: `str`, optional (default: None) 31 | Feature used for edges inference. 32 | The data type of `.var[feature]` needs to be `bool` 33 | n_components: `int`, optional (default: 20) 34 | The number of components used in `randomized_svd` 35 | for comparing reference and query observations 36 | random_state: `int`, optional (default: 42) 37 | The seed used for truncated randomized SVD 38 | n_top_edges: `int`, optional (default: None) 39 | The number of edges to keep 40 | If specified, `percentile` will be ignored 41 | percentile: `float`, optional (default: 0.01) 42 | The percentile of edges to keep 43 | k: `int`, optional (default: 5) 44 | The number of nearest neighbors to consider within each dataset 45 | metric: `str`, optional (default: 'euclidean') 46 | The metric to use when calculating distance between 47 | reference and query observations 48 | layer: `str`, optional (default: None) 49 | The layer used to perform edge inference 50 | If None, `.X` will be used. 
51 | kwargs: 52 | Other keyword arguments are passed down to `randomized_svd()` 53 | 54 | Returns 55 | ------- 56 | adata_ref_query: `AnnData` 57 | Annotated relation matrix betwewn reference and query observations 58 | Store reference entity as observations and query entity as variables 59 | """ 60 | 61 | mask_ref = adata_ref.var[feature] 62 | feature_ref = adata_ref.var_names[mask_ref] 63 | feature_query = adata_query.var_names 64 | feature_shared = list(set(feature_ref).intersection(set(feature_query))) 65 | print(f'#shared features: {len(feature_shared)}') 66 | if layer is None: 67 | X_ref = adata_ref[:, feature_shared].X 68 | X_query = adata_query[:, feature_shared].X 69 | else: 70 | X_ref = adata_ref[:, feature_shared].layers[layer] 71 | X_query = adata_query[:, feature_shared].layers[layer] 72 | 73 | if any(X_ref.sum(axis=1) == 0) or any(X_query.sum(axis=1) == 0): 74 | raise ValueError( 75 | f'Some nodes contain zero expressed {feature} features.\n' 76 | f'Please try to include more {feature} features.') 77 | 78 | print('Performing randomized SVD ...') 79 | mat = X_ref * X_query.T 80 | U, Sigma, VT = randomized_svd(mat, 81 | n_components=n_components, 82 | random_state=random_state, 83 | **kwargs) 84 | svd_data = np.vstack((U, VT.T)) 85 | X_svd_ref = svd_data[:U.shape[0], :] 86 | X_svd_query = svd_data[-VT.shape[1]:, :] 87 | X_svd_ref = X_svd_ref / (X_svd_ref**2).sum(-1, keepdims=True)**0.5 88 | X_svd_query = X_svd_query / (X_svd_query**2).sum(-1, keepdims=True)**0.5 89 | 90 | # print('Searching for neighbors within each dataset ...') 91 | # knn_conn_ref, knn_dist_ref = _knn( 92 | # X_ref=X_svd_ref, 93 | # k=k, 94 | # leaf_size=leaf_size, 95 | # metric=metric) 96 | # knn_conn_query, knn_dist_query = _knn( 97 | # X_ref=X_svd_query, 98 | # k=k, 99 | # leaf_size=leaf_size, 100 | # metric=metric) 101 | 102 | print('Searching for mutual nearest neighbors ...') 103 | knn_conn_ref_query, knn_dist_ref_query = _knn( 104 | X_ref=X_svd_ref, 105 | X_query=X_svd_query, 
def trim_edges(adata_ref_query,
               cutoff=None,
               n_edges=None):
    """Trim edges based on the similarity scores

    Parameters
    ----------
    adata_ref_query: `AnnData`
        Annotated relation matrix between reference and query observations.
        `.X` holds the similarity scores as a sparse matrix.
    cutoff: `float`, optional (default: None)
        The similarity cutoff. Edges with a score strictly greater than
        `cutoff` are kept.
        If None, it will be decided by `n_edges`
        If specified, `n_edges` will be ignored
    n_edges: `int`, optional (default: None)
        The number of edges to keep, taken from the highest scores.
        Edges tied with the lowest kept score are kept as well.
        If it is not smaller than the number of stored edges,
        all of them are kept.

    Returns
    -------
    updates `adata_ref_query` with the following field.
    `.layers['conn']` : `array_like`
        relation matrix between reference and query observations

    Raises
    ------
    ValueError
        If neither `cutoff` nor `n_edges` is specified,
        or if `n_edges` is smaller than 1.
    """
    sim_ref_query = adata_ref_query.X
    if cutoff is None:
        if n_edges is None:
            raise ValueError('"cutoff" or "n_edges" has to be specified')
        if n_edges < 1:
            raise ValueError('"n_edges" must be a positive integer')
        stored = sim_ref_query.tocoo()
        scores = stored.data
        if n_edges >= scores.size:
            selected = np.ones(scores.size, dtype=bool)
        else:
            # the n_edges-th largest score; it has to be kept itself,
            # hence the inclusive comparison
            kth = scores.size - n_edges
            cutoff = np.partition(scores, kth)[kth]
            selected = scores >= cutoff
        id_x, id_y = stored.row[selected], stored.col[selected]
        values = np.ones(len(id_x), dtype=bool)
    else:
        id_x, id_y, values = find(sim_ref_query > cutoff)

    print(f'{len(id_x)} edges are selected')
    conn_ref_query = csr_matrix(
        (values*1, (id_x, id_y)),
        shape=(sim_ref_query.shape))
    adata_ref_query.layers['conn'] = conn_ref_query
def _select_features(adata_ori, column, flag):
    """Return a copy of `adata_ori`, restricted to `.var[column]` if `flag`"""
    if flag:
        return adata_ori[:, adata_ori.var[column]].copy()
    return adata_ori.copy()


def _register_cells(dict_cells, ids_cells, prefix_C):
    """Add `ids_cells` as a new cell entity type unless already covered"""
    if len(dict_cells) == 0:
        dict_cells[prefix_C] = ids_cells
        return
    set_cells = set(ids_cells)
    if any(set_cells <= set(ids_k) for ids_k in dict_cells.values()):
        return
    # create a new set of entities when not all indices are included
    dict_cells[f'{prefix_C}{len(dict_cells)+1}'] = ids_cells


def _find_cell_reference(dict_df_cells, names):
    """Return (key, alias table) of the cell entity type containing `names`"""
    set_names = set(names)
    for key, df_cells in dict_df_cells.items():
        if set_names <= set(df_cells.index):
            return key, df_cells
    raise ValueError(
        'No cell entity type contains all the given cell indices')


def _alias_table(ids, prefix):
    """Build the index -> alias (`<prefix>.<i>`) table of one entity type"""
    return pd.DataFrame(
        index=ids,
        columns=['alias'],
        data=[f'{prefix}.{x}' for x in range(len(ids))])


def _set_pbg_ids(adata_ori, adata, df_obs, df_var):
    """Store the aliases of the used obs/var of `adata` in `adata_ori`"""
    adata_ori.obs['pbg_id'] = ""
    adata_ori.var['pbg_id'] = ""
    adata_ori.obs.loc[adata.obs_names, 'pbg_id'] = \
        df_obs.loc[adata.obs_names, 'alias'].copy()
    adata_ori.var.loc[adata.var_names, 'pbg_id'] = \
        df_var.loc[adata.var_names, 'alias'].copy()


def gen_graph(list_CP=None,
              list_PM=None,
              list_PK=None,
              list_CG=None,
              list_CC=None,
              prefix_C='C',
              prefix_P='P',
              prefix_M='M',
              prefix_K='K',
              prefix_G='G',
              copy=False,
              dirname='graph0',
              use_highly_variable=True,
              use_top_pcs=True,
              use_top_pcs_CP=None,
              use_top_pcs_PM=None,
              use_top_pcs_PK=None,
              ):
    """Generate graph for PBG training based on indices of obs and var
    It also generates an accompanying file 'entity_alias.txt' to map
    the indices to the aliases used in the graph

    Parameters
    ----------
    list_CP: `list`, optional (default: None)
        A list of anndata objects that store ATAC-seq data (Cells by Peaks)
    list_PM: `list`, optional (default: None)
        A list of anndata objects that store relation between Peaks and Motifs
    list_PK: `list`, optional (default: None)
        A list of anndata objects that store relation between Peaks and Kmers
    list_CG: `list`, optional (default: None)
        A list of anndata objects that store RNA-seq data (Cells by Genes)
        The discretized expression is read from `.layers['disc']`
    list_CC: `list`, optional (default: None)
        A list of anndata objects that store relation between Cells
        from two conditions. Edges are read from `.layers['conn']`
    prefix_C: `str`, optional (default: 'C')
        Prefix to indicate the entity type of cells
    prefix_P: `str`, optional (default: 'P')
        Prefix to indicate the entity type of peaks
    prefix_M: `str`, optional (default: 'M')
        Prefix to indicate the entity type of motifs
    prefix_K: `str`, optional (default: 'K')
        Prefix to indicate the entity type of kmers
    prefix_G: `str`, optional (default: 'G')
        Prefix to indicate the entity type of genes
    copy: `bool`, optional (default: False)
        If True, it returns the graph file as a data frame
    dirname: `str`, (default: 'graph0')
        The name of the directory in which each graph will be stored
    use_highly_variable: `bool`, optional (default: True)
        Use highly variable genes
    use_top_pcs: `bool`, optional (default: True)
        Use top-PCs-associated features for CP, PM, PK
    use_top_pcs_CP: `bool`, optional (default: None)
        Use top-PCs-associated features for CP
        Once specified, it will overwrite `use_top_pcs`
    use_top_pcs_PM: `bool`, optional (default: None)
        Use top-PCs-associated features for PM
        Once specified, it will overwrite `use_top_pcs`
    use_top_pcs_PK: `bool`, optional (default: None)
        Use top-PCs-associated features for PK
        Once specified, it will overwrite `use_top_pcs`

    Returns
    -------
    If all input lists are None, the string 'No graph is generated'.
    If `copy` is True,
    edges: `pd.DataFrame`
        The edges of the graph used for PBG training.
        Each line contains information about one edge.
        Using tabs as separators, each line contains the identifiers of
        the source entities, the relation types and the target entities.

    updates `.settings.pbg_params` with the following parameters.
    entity_path: `str`
        The path of the directory containing entity count files.
    edge_paths: `list`
        A list of paths to directories containing (partitioned) edgelists.
        Typically a single path is provided.
    entities: `dict`
        The entity types.
    relations: `list`
        The relation types.

    updates `.settings.graph_stats` with the following parameters.
    `dirname`: `dict`
        Statistics of input graph

    updates `.obs['pbg_id']` and `.var['pbg_id']` of every input anndata
    object with the aliases used in the graph.

    writes 'pbg_graph.txt', 'entity_alias.txt' and 'graph_stats.json'
    to `<settings.workdir>/pbg/<dirname>`.
    """

    if all(x is None for x in
           [list_CP, list_PM, list_PK, list_CG, list_CC]):
        return 'No graph is generated'

    filepath = os.path.join(settings.workdir, 'pbg', dirname)
    settings.pbg_params['entity_path'] = \
        os.path.join(filepath, "input/entity")
    settings.pbg_params['edge_paths'] = \
        [os.path.join(filepath, "input/edge"), ]
    os.makedirs(filepath, exist_ok=True)

    # Resolve the feature-selection flags once, so that the collection of
    # entities and the generation of edges always use the same features
    flag_CP = use_top_pcs if use_top_pcs_CP is None else use_top_pcs_CP
    flag_PM = use_top_pcs if use_top_pcs_PM is None else use_top_pcs_PM
    flag_PK = use_top_pcs if use_top_pcs_PK is None else use_top_pcs_PK

    # Collect the indices of entities
    dict_cells = dict()  # unique cell indices from all cell-centric datasets
    ids_genes = pd.Index([])
    ids_peaks = pd.Index([])
    ids_kmers = pd.Index([])
    ids_motifs = pd.Index([])

    if list_CP is not None:
        for adata_ori in list_CP:
            adata = _select_features(adata_ori, 'top_pcs', flag_CP)
            _register_cells(dict_cells, adata.obs.index, prefix_C)
            ids_peaks = ids_peaks.union(adata.var.index)
    if list_PM is not None:
        for adata_ori in list_PM:
            adata = _select_features(adata_ori, 'top_pcs', flag_PM)
            ids_peaks = ids_peaks.union(adata.obs.index)
            ids_motifs = ids_motifs.union(adata.var.index)
    if list_PK is not None:
        for adata_ori in list_PK:
            adata = _select_features(adata_ori, 'top_pcs', flag_PK)
            ids_peaks = ids_peaks.union(adata.obs.index)
            ids_kmers = ids_kmers.union(adata.var.index)
    if list_CG is not None:
        for adata_ori in list_CG:
            adata = _select_features(
                adata_ori, 'highly_variable', use_highly_variable)
            _register_cells(dict_cells, adata.obs.index, prefix_C)
            ids_genes = ids_genes.union(adata.var.index)

    # Build the alias table of each entity type
    # NOTE(review): `settings.pbg_params['entities']` is only added to and
    # never reset here -- confirm that entity types left over from a
    # previous call are intended to stay.
    list_alias = []
    dict_df_cells = dict()  # unique cell dataframes
    for k, ids_cells_k in dict_cells.items():
        dict_df_cells[k] = _alias_table(ids_cells_k, k)
        settings.pbg_params['entities'][k] = {'num_partitions': 1}
        list_alias.append(dict_df_cells[k])
    df_genes = df_peaks = df_kmers = df_motifs = None
    if len(ids_genes) > 0:
        df_genes = _alias_table(ids_genes, prefix_G)
        settings.pbg_params['entities'][prefix_G] = {'num_partitions': 1}
        list_alias.append(df_genes)
    if len(ids_peaks) > 0:
        df_peaks = _alias_table(ids_peaks, prefix_P)
        settings.pbg_params['entities'][prefix_P] = {'num_partitions': 1}
        list_alias.append(df_peaks)
    if len(ids_kmers) > 0:
        df_kmers = _alias_table(ids_kmers, prefix_K)
        settings.pbg_params['entities'][prefix_K] = {'num_partitions': 1}
        list_alias.append(df_kmers)
    if len(ids_motifs) > 0:
        df_motifs = _alias_table(ids_motifs, prefix_M)
        settings.pbg_params['entities'][prefix_M] = {'num_partitions': 1}
        list_alias.append(df_motifs)
    if list_alias:
        entity_alias = pd.concat(list_alias, ignore_index=False)
    else:
        entity_alias = pd.DataFrame(columns=['alias'])

    # generate edges
    dict_graph_stats = dict()
    col_names = ["source", "relation", "destination"]
    list_edges = []
    relations = []
    settings.pbg_params['relations'] = relations

    def add_relation(src_alias, dst_alias, source, destination, weight):
        """Record one relation type: its edges, statistics and config"""
        id_r = len(relations)  # relations are numbered in creation order
        df_edges_x = pd.DataFrame(
            {'source': src_alias,
             'relation': [f'r{id_r}'] * len(src_alias),
             'destination': dst_alias},
            columns=col_names)
        print(f'relation{id_r}: '
              f'source: {source}, '
              f'destination: {destination}\n'
              f'#edges: {df_edges_x.shape[0]}')
        dict_graph_stats[f'relation{id_r}'] = \
            {'source': source,
             'destination': destination,
             'n_edges': df_edges_x.shape[0]}
        list_edges.append(df_edges_x)
        relations.append(
            {'name': f'r{id_r}',
             'lhs': f'{source}',
             'rhs': f'{destination}',
             'operator': 'none',
             'weight': weight
             })

    if list_CP is not None:
        for adata_ori in list_CP:
            adata = _select_features(adata_ori, 'top_pcs', flag_CP)
            # select reference of cells
            key, df_cells = _find_cell_reference(
                dict_df_cells, adata.obs_names)
            rows, cols = adata.X.nonzero()
            add_relation(
                df_cells.loc[adata.obs_names[rows], 'alias'].values,
                df_peaks.loc[adata.var_names[cols], 'alias'].values,
                key, prefix_P, 1.0)
            _set_pbg_ids(adata_ori, adata, df_cells, df_peaks)

    if list_PM is not None:
        for adata_ori in list_PM:
            adata = _select_features(adata_ori, 'top_pcs', flag_PM)
            rows, cols = adata.X.nonzero()
            add_relation(
                df_peaks.loc[adata.obs_names[rows], 'alias'].values,
                df_motifs.loc[adata.var_names[cols], 'alias'].values,
                prefix_P, prefix_M, 0.2)
            _set_pbg_ids(adata_ori, adata, df_peaks, df_motifs)

    if list_PK is not None:
        for adata_ori in list_PK:
            adata = _select_features(adata_ori, 'top_pcs', flag_PK)
            rows, cols = adata.X.nonzero()
            add_relation(
                df_peaks.loc[adata.obs_names[rows], 'alias'].values,
                df_kmers.loc[adata.var_names[cols], 'alias'].values,
                prefix_P, prefix_K, 0.02)
            _set_pbg_ids(adata_ori, adata, df_peaks, df_kmers)

    if list_CG is not None:
        for adata_ori in list_CG:
            adata = _select_features(
                adata_ori, 'highly_variable', use_highly_variable)
            # select reference of cells
            key, df_cells = _find_cell_reference(
                dict_df_cells, adata.obs_names)
            # one relation type per discretized expression level,
            # with weights evenly spaced between 1 and 5
            disc = adata.layers['disc']
            expr_level = np.unique(disc.data)
            expr_weight = np.linspace(start=1, stop=5, num=len(expr_level))
            for i_lvl, lvl in enumerate(expr_level):
                rows, cols = (disc == lvl).astype(int).nonzero()
                add_relation(
                    df_cells.loc[adata.obs_names[rows], 'alias'].values,
                    df_genes.loc[adata.var_names[cols], 'alias'].values,
                    key, prefix_G, round(expr_weight[i_lvl], 2))
            _set_pbg_ids(adata_ori, adata, df_cells, df_genes)

    if list_CC is not None:
        for adata in list_CC:
            # select reference of cells
            key_obs, df_cells_obs = _find_cell_reference(
                dict_df_cells, adata.obs_names)
            key_var, df_cells_var = _find_cell_reference(
                dict_df_cells, adata.var_names)
            # edges between ref and query
            rows, cols = adata.layers['conn'].nonzero()
            add_relation(
                df_cells_obs.loc[adata.obs_names[rows], 'alias'].values,
                df_cells_var.loc[adata.var_names[cols], 'alias'].values,
                key_obs, key_var, 10.0)
            adata.obs['pbg_id'] = df_cells_obs.loc[adata.obs_names,
                                                   'alias'].copy()
            adata.var['pbg_id'] = df_cells_var.loc[adata.var_names,
                                                   'alias'].copy()

    if list_edges:
        df_edges = pd.concat(list_edges, ignore_index=True)
    else:
        df_edges = pd.DataFrame(columns=col_names)

    print(f'Total number of edges: {df_edges.shape[0]}')
    dict_graph_stats['n_edges'] = df_edges.shape[0]
    settings.graph_stats[dirname] = dict_graph_stats

    print(f'Writing graph file "pbg_graph.txt" to "{filepath}" ...')
    df_edges.to_csv(os.path.join(filepath, "pbg_graph.txt"),
                    header=False,
                    index=False,
                    sep='\t')
    entity_alias.to_csv(os.path.join(filepath, 'entity_alias.txt'),
                        header=True,
                        index=True,
                        sep='\t')
    with open(os.path.join(filepath, 'graph_stats.json'), 'w') as fp:
        json.dump(dict_graph_stats,
                  fp,
                  sort_keys=True,
                  indent=4,
                  separators=(',', ': '))
    print("Finished.")
    if copy:
        return df_edges
    else:
        return None
563 | If specified, it will be used instead of the default setting 564 | output: `str`, optional (default: 'model') 565 | The name of the directory where training output will be written to. 566 | It overrides `pbg_params` if `checkpoint_path` is specified in it 567 | auto_wd: `bool`, optional (default: True) 568 | If True, it will override `pbg_params['wd']` with a new weight decay 569 | estimated based on training sample size 570 | Recommended for relative small training sample size (<1e7) 571 | save_wd: `bool`, optional (default: False) 572 | If True, estimated `wd` will be saved to `settings.pbg_params['wd']` 573 | 574 | Returns 575 | ------- 576 | updates `settings.pbg_params` with the following parameter 577 | checkpoint_path: 578 | The path to the directory where checkpoints (and thus the output) 579 | will be written to. 580 | If checkpoints are found in it, training will resume from them. 581 | """ 582 | 583 | if pbg_params is None: 584 | pbg_params = settings.pbg_params.copy() 585 | else: 586 | assert isinstance(pbg_params, dict),\ 587 | "`pbg_params` must be dict" 588 | 589 | if dirname is None: 590 | filepath = Path(pbg_params['entity_path']).parent.parent.as_posix() 591 | else: 592 | filepath = os.path.join(settings.workdir, 'pbg', dirname) 593 | 594 | pbg_params['checkpoint_path'] = os.path.join(filepath, output) 595 | settings.pbg_params['checkpoint_path'] = pbg_params['checkpoint_path'] 596 | 597 | if auto_wd: 598 | # empirical numbers from simulation experiments 599 | if settings.graph_stats[ 600 | os.path.basename(filepath)]['n_edges'] < 5e7: 601 | # optimial wd (0.013) for sample size (2725781) 602 | wd = np.around( 603 | 0.013 * 2725781 / settings.graph_stats[ 604 | os.path.basename(filepath)]['n_edges'], 605 | decimals=6) 606 | else: 607 | # optimial wd (0.0004) for sample size (59103481) 608 | wd = np.around( 609 | 0.0004 * 59103481 / settings.graph_stats[ 610 | os.path.basename(filepath)]['n_edges'], 611 | decimals=6) 612 | 
print(f'Auto-estimated weight decay is {wd}') 613 | pbg_params['wd'] = wd 614 | if save_wd: 615 | settings.pbg_params['wd'] = pbg_params['wd'] 616 | print(f"`.settings.pbg_params['wd']` has been updated to {wd}") 617 | 618 | # to avoid oversubscription issues in workloads 619 | # that involve nested parallelism 620 | os.environ["OMP_NUM_THREADS"] = "1" 621 | 622 | loader = ConfigFileLoader() 623 | config = loader.load_config_simba(pbg_params) 624 | set_logging_verbosity(config.verbose) 625 | 626 | list_filenames = [os.path.join(filepath, "pbg_graph.txt")] 627 | input_edge_paths = [Path(name) for name in list_filenames] 628 | print("Converting input data ...") 629 | convert_input_data( 630 | config.entities, 631 | config.relations, 632 | config.entity_path, 633 | config.edge_paths, 634 | input_edge_paths, 635 | TSVEdgelistReader(lhs_col=0, rhs_col=2, rel_col=1), 636 | dynamic_relations=config.dynamic_relations, 637 | ) 638 | 639 | subprocess_init = SubprocessInitializer() 640 | subprocess_init.register(setup_logging, config.verbose) 641 | subprocess_init.register(add_to_sys_path, loader.config_dir.name) 642 | 643 | train_config = attr.evolve(config, edge_paths=config.edge_paths) 644 | print("Starting training ...") 645 | train(train_config, subprocess_init=subprocess_init) 646 | print("Finished") 647 | -------------------------------------------------------------------------------- /simba/tools/_umap.py: -------------------------------------------------------------------------------- 1 | """UMAP (Uniform Manifold Approximation and Projection)""" 2 | 3 | import umap as umap_learn 4 | 5 | 6 | def umap(adata, 7 | n_neighbors=15, 8 | n_components=2, 9 | random_state=2020, 10 | layer=None, 11 | obsm=None, 12 | n_dim=None, 13 | **kwargs, 14 | ): 15 | """perform UMAP 16 | Parameters 17 | ---------- 18 | adata: AnnData 19 | Annotated data matrix. 
def _uniquify(seq, sep='-'):
    """Uniquify a list of strings.

    Adding unique numbers to duplicate values: the first occurrence of a
    value is kept as is, the n-th occurrence (n >= 2) gets `sep` + n
    appended.

    Parameters
    ----------
    seq : `list` or `array-like`
        A list of values. It is modified in place, except for numpy
        arrays with a fixed-width unicode dtype, which are converted to
        an object array first (the suffix would otherwise be truncated).
    sep : `str`
        Separator

    Returns
    -------
    seq: `list` or `array-like`
        A list of updated values
    """

    if isinstance(seq, np.ndarray) and seq.dtype.kind == 'U':
        # fixed-width string arrays silently cut off anything longer
        # than the longest original entry
        seq = seq.astype(object)

    n_seen = {}  # original value -> number of occurrences so far
    for i, val in enumerate(seq):
        n_seen[val] = n_seen.get(val, 0) + 1
        if n_seen[val] > 1:
            seq[i] = val + sep + str(n_seen[val])

    return seq


def _gini(array):
    """Calculate the Gini coefficient of a numpy array.

    Parameters
    ----------
    array : `array-like`
        Numeric values; any shape is flattened. The input is not modified.

    Returns
    -------
    gini: `float`
        The Gini coefficient of the (shifted) values.

    Raises
    ------
    ValueError
        If `array` is empty.
    """

    # always work on a flat float copy (also turns np.matrix into 1-D)
    values = np.array(array, dtype=float).ravel()
    if values.size == 0:
        raise ValueError('`array` must contain at least one value')
    lowest = np.amin(values)
    if lowest < 0:
        # Values cannot be negative:
        values -= lowest
    # Values cannot be 0:
    values += 0.0000001
    # Values must be sorted:
    values = np.sort(values)
    # Number of array elements:
    n = values.shape[0]
    # Index per array element:
    index = np.arange(1, n+1)
    # Gini coefficient:
    return ((np.sum((2 * index - n - 1) * values)) / (n * np.sum(values)))


def _knn(X_ref,
         X_query=None,
         k=20,
         leaf_size=40,
         metric='euclidean'):
    """Calculate K nearest neighbors for each row.

    For every row of `X_query`, its `k` nearest rows of `X_ref` are
    searched with a KD-tree built on `X_ref`.

    Parameters
    ----------
    X_ref : `array-like`
        Reference points, one per row.
    X_query : `array-like`, optional (default: None)
        Query points, one per row. If None, `X_ref` is queried against
        itself (each point is then among its own neighbors).
    k : `int`, optional (default: 20)
        The number of neighbors to search for.
    leaf_size : `int`, optional (default: 40)
        Passed to `KDTree`.
    metric : `str`, optional (default: 'euclidean')
        Passed to `KDTree`.

    Returns
    -------
    mat_conn_ref_query: sparse matrix
        Shape (#rows of `X_ref`, #rows of `X_query`);
        1 where the reference point is a neighbor of the query point.
    mat_dist_ref_query: sparse matrix
        Same shape; the corresponding distances.
    """
    if X_query is None:
        X_query = X_ref
    kdt = KDTree(X_ref, leaf_size=leaf_size, metric=metric)
    kdt_d, kdt_i = kdt.query(X_query, k=k, return_distance=True)
    n_query, n_neighbors = kdt_i.shape
    # one (query, neighbor) pair per entry of the k-NN result
    sp_row = np.repeat(np.arange(n_query), n_neighbors)
    sp_col = kdt_i.flatten()
    sp_conn = np.ones(len(sp_row), dtype=int)
    sp_dist = kdt_d.flatten()
    shape_query_ref = (X_query.shape[0], X_ref.shape[0])
    # built as query x ref, then transposed to ref x query
    mat_conn_ref_query = csr_matrix(
        (sp_conn, (sp_row, sp_col)),
        shape=shape_query_ref).T
    mat_dist_ref_query = csr_matrix(
        (sp_dist, (sp_row, sp_col)),
        shape=shape_query_ref).T
    return mat_conn_ref_query, mat_dist_ref_query
CTAGGCGGTAGACAAA-1_rna C.11 14 | CGATTTGCATTGCGTA-1_rna C.12 15 | CGTGAGGAGGAGCAAC-1_rna C.13 16 | GGGTCAACACATAGCC-1_rna C.14 17 | ACTCAGTAGTAGGATG-1_rna C.15 18 | ATTACCCGTAGGTTAT-1_rna C.16 19 | ATTAGGTGTTTGGCGG-1_rna C.17 20 | ACCAAACTCATTATGG-1_rna C.18 21 | AGCTTGGTCGCTAGTG-1_rna C.19 22 | TGTGGAGCAACCTGGT-1_rna C.20 23 | TTTAGCTTCCTTAAGA-1_rna C.21 24 | GTTTAACCATAATCCG-1_rna C.22 25 | CGTATTGCAGCTAATT-1_rna C.23 26 | ACATAGCTCCCTGACT-1_rna C.24 27 | GTTAAACGTTTCCACG-1_rna C.25 28 | TTAACTGAGTATTGTG-1_rna C.26 29 | GGCTATGTCCCTGGTT-1_rna C.27 30 | AGGTCATTCTAACCAA-1_rna C.28 31 | CAGCCTTTCTCACAAA-1_rna C.29 32 | TCAGTAGGTAGGTTAT-1_rna C.30 33 | TGAAGTGAGGAAGTAT-1_rna C.31 34 | GCTGGTTCAATTAAGG-1_rna C.32 35 | TGAGCCGGTGCACGCA-1_rna C.33 36 | CGCTTAACAGCCGCTA-1_rna C.34 37 | GTGCACGGTTGTAAAC-1_rna C.35 38 | CTGTTAAAGAATGACG-1_rna C.36 39 | TGTAAGCTCTTAGGAC-1_rna C.37 40 | GAGCGAAGTTCGGGAT-1_rna C.38 41 | GATTCATCATAATGTC-1_rna C.39 42 | TTCCTCAAGGTTTGAC-1_rna C.40 43 | AGGTACGCAGCCTTGG-1_rna C.41 44 | CATTTGTTCGCACACA-1_rna C.42 45 | CCGTTAACAATCCTGA-1_rna C.43 46 | GGCCTCTGTCCTCCAA-1_rna C.44 47 | GCTGTGATCAATCTCT-1_rna C.45 48 | GGGAATATCTTAATGG-1_rna C.46 49 | AGGTGAGGTGGATTCA-1_rna C.47 50 | TGCACTTGTTTACGTC-1_rna C.48 51 | GTTCGCTTCCGTTAAA-1_rna C.49 52 | CCAAATCAGCGGTTAT-1_rna C.50 53 | GGCCTAATCCCTGTTA-1_rna C.51 54 | GGGCCTAGTGTCCAGG-1_rna C.52 55 | CGGACCTAGTCACTCC-1_rna C.53 56 | ATATGGTGTCAGGAAG-1_rna C.54 57 | CGCTTACTCCTAATTC-1_rna C.55 58 | AAAGCTTGTCGACTAA-1_rna C.56 59 | AAGCATGAGGCCTAAT-1_rna C.57 60 | GTTACAGGTAGGTTAT-1_rna C.58 61 | CCATAATCATGCTATG-1_rna C.59 62 | CACCTCAGTTTGCGAA-1_rna C.60 63 | ATTGCGCCACTAGCGT-1_rna C.61 64 | CCTACTGGTGCCGCAA-1_rna C.62 65 | GCAATAGAGGCGGATG-1_rna C.63 66 | GCGCGATTCCTCCCTC-1_rna C.64 67 | TAGGTTATCTCGACCT-1_rna C.65 68 | TGCTTGCTCATGAGCT-1_rna C.66 69 | GACTCACCAGTAATAG-1_rna C.67 70 | TTCCCGCCAATAACGA-1_rna C.68 71 | CGAACCGGTAGCCATA-1_rna C.69 72 | ACCAAGCGTCATAAGT-1_rna C.70 73 | 
CGCCAAATCCCAGTAG-1_rna C.71 74 | ACTCACTGTTTGGTTC-1_rna C.72 75 | AGAAGGTGTAGGTTGC-1_rna C.73 76 | ATTTGCGCAGGCTTGT-1_rna C.74 77 | TAGTTGTCATCGCTCC-1_rna C.75 78 | GGACAGCCAGATTCAT-1_rna C.76 79 | TTGAGCTAGCTTACTT-1_rna C.77 80 | CATTTGTTCTAAATCG-1_rna C.78 81 | TAGGAGTCACTGACCG-1_rna C.79 82 | ATCAAGCTCGGGATTT-1_rna C.80 83 | TCTCGCCCAACCTGGT-1_rna C.81 84 | GCGGTTATCCTGATGG-1_rna C.82 85 | GATTGCAGTGGAGCAA-1_rna C.83 86 | CGATTCCTCTTGCTAT-1_rna C.84 87 | GCTCATTGTTCACCAT-1_rna C.85 88 | CAAACGCGTTTCGCGC-1_rna C.86 89 | TCTTAGCGTCCGTGAG-1_rna C.87 90 | GCGCGATTCCTTGAGG-1_rna C.88 91 | ATAGGTACAGGTCCTG-1_rna C.89 92 | TCCTTAGTCCTGAGTG-1_rna C.90 93 | TTCCCACAGCCAAATC-1_rna C.91 94 | GGGTGAAGTGCATCGG-1_rna C.92 95 | CGGATAAAGTAGAGGC-1_rna C.93 96 | AGGATGTCACAAAGAC-1_rna C.94 97 | CTTCTCAAGGGTGGAT-1_rna C.95 98 | AACCCGCAGCGGATTT-1_rna C.96 99 | GGACATAAGGGATGCG-1_rna C.97 100 | TACAAGCTCTGTGAGT-1_rna C.98 101 | CGTATTGCATTCAGCA-1_rna C.99 102 | DPH3 G.0 103 | BICD1 G.1 104 | MAML3 G.2 105 | TTN-AS1 G.3 106 | APPL2 G.4 107 | HLX G.5 108 | CHIC1 G.6 109 | DDX39B G.7 110 | SRC G.8 111 | VAPB G.9 112 | RPS10-NUDT3 G.10 113 | POLR2A G.11 114 | AC007262.2 G.12 115 | CCND2 G.13 116 | PTCD3 G.14 117 | TNFRSF10A G.15 118 | POLR3GL G.16 119 | NNT G.17 120 | IL26 G.18 121 | RPL10 G.19 122 | UHRF1BP1L G.20 123 | AC124014.1 G.21 124 | ELOVL1 G.22 125 | SGPL1 G.23 126 | USP42 G.24 127 | ATF7IP2 G.25 128 | METTL22 G.26 129 | HSCB G.27 130 | PCTP G.28 131 | FAM174B G.29 132 | TMEM184B G.30 133 | SERF2 G.31 134 | KIAA0930 G.32 135 | GNAQ G.33 136 | SCFD1 G.34 137 | UBE2R2 G.35 138 | ARL5B G.36 139 | FRMD4A G.37 140 | EML5 G.38 141 | FAM3A G.39 142 | ARHGAP22 G.40 143 | KXD1 G.41 144 | A1BG G.42 145 | C4orf3 G.43 146 | FAM153CP G.44 147 | PPP1R9A G.45 148 | IQGAP2 G.46 149 | ACTG1 G.47 150 | GPLD1 G.48 151 | SIRPG G.49 152 | CALML4 G.50 153 | IAH1 G.51 154 | LAT2 G.52 155 | AAAS G.53 156 | -------------------------------------------------------------------------------- 
/tests/data/pbg_training/graph_stats.json: -------------------------------------------------------------------------------- 1 | { 2 | "n_edges": 1075, 3 | "relation0": { 4 | "destination": "G", 5 | "n_edges": 153, 6 | "source": "C" 7 | }, 8 | "relation1": { 9 | "destination": "G", 10 | "n_edges": 369, 11 | "source": "C" 12 | }, 13 | "relation2": { 14 | "destination": "G", 15 | "n_edges": 301, 16 | "source": "C" 17 | }, 18 | "relation3": { 19 | "destination": "G", 20 | "n_edges": 166, 21 | "source": "C" 22 | }, 23 | "relation4": { 24 | "destination": "G", 25 | "n_edges": 86, 26 | "source": "C" 27 | } 28 | } -------------------------------------------------------------------------------- /tests/data/pbg_training/input/entity/entity_count_C_0.txt: -------------------------------------------------------------------------------- 1 | 100 2 | -------------------------------------------------------------------------------- /tests/data/pbg_training/input/entity/entity_count_G_0.txt: -------------------------------------------------------------------------------- 1 | 54 2 | -------------------------------------------------------------------------------- /tests/data/pbg_training/input/entity/entity_names_C_0.json: -------------------------------------------------------------------------------- 1 | [ 2 | "C.3", 3 | "C.73", 4 | "C.5", 5 | "C.93", 6 | "C.58", 7 | "C.38", 8 | "C.14", 9 | "C.24", 10 | "C.35", 11 | "C.60", 12 | "C.70", 13 | "C.64", 14 | "C.72", 15 | "C.68", 16 | "C.79", 17 | "C.12", 18 | "C.52", 19 | "C.81", 20 | "C.83", 21 | "C.87", 22 | "C.48", 23 | "C.91", 24 | "C.11", 25 | "C.33", 26 | "C.77", 27 | "C.88", 28 | "C.9", 29 | "C.0", 30 | "C.39", 31 | "C.28", 32 | "C.36", 33 | "C.75", 34 | "C.92", 35 | "C.85", 36 | "C.10", 37 | "C.67", 38 | "C.20", 39 | "C.37", 40 | "C.46", 41 | "C.7", 42 | "C.53", 43 | "C.44", 44 | "C.23", 45 | "C.4", 46 | "C.42", 47 | "C.8", 48 | "C.50", 49 | "C.90", 50 | "C.1", 51 | "C.76", 52 | "C.61", 53 | "C.6", 54 | "C.56", 55 | "C.13", 56 | 
"C.89", 57 | "C.41", 58 | "C.25", 59 | "C.62", 60 | "C.84", 61 | "C.15", 62 | "C.40", 63 | "C.55", 64 | "C.96", 65 | "C.65", 66 | "C.86", 67 | "C.69", 68 | "C.98", 69 | "C.17", 70 | "C.94", 71 | "C.97", 72 | "C.18", 73 | "C.54", 74 | "C.19", 75 | "C.59", 76 | "C.49", 77 | "C.34", 78 | "C.26", 79 | "C.2", 80 | "C.95", 81 | "C.47", 82 | "C.66", 83 | "C.45", 84 | "C.51", 85 | "C.82", 86 | "C.22", 87 | "C.21", 88 | "C.57", 89 | "C.71", 90 | "C.43", 91 | "C.99", 92 | "C.27", 93 | "C.30", 94 | "C.32", 95 | "C.29", 96 | "C.16", 97 | "C.80", 98 | "C.63", 99 | "C.74", 100 | "C.31", 101 | "C.78" 102 | ] -------------------------------------------------------------------------------- /tests/data/pbg_training/input/entity/entity_names_G_0.json: -------------------------------------------------------------------------------- 1 | [ 2 | "G.34", 3 | "G.35", 4 | "G.19", 5 | "G.9", 6 | "G.11", 7 | "G.8", 8 | "G.37", 9 | "G.22", 10 | "G.48", 11 | "G.29", 12 | "G.18", 13 | "G.26", 14 | "G.23", 15 | "G.20", 16 | "G.2", 17 | "G.28", 18 | "G.13", 19 | "G.46", 20 | "G.25", 21 | "G.4", 22 | "G.52", 23 | "G.3", 24 | "G.17", 25 | "G.30", 26 | "G.36", 27 | "G.51", 28 | "G.7", 29 | "G.24", 30 | "G.53", 31 | "G.12", 32 | "G.39", 33 | "G.15", 34 | "G.16", 35 | "G.6", 36 | "G.5", 37 | "G.40", 38 | "G.38", 39 | "G.33", 40 | "G.0", 41 | "G.31", 42 | "G.27", 43 | "G.32", 44 | "G.45", 45 | "G.14", 46 | "G.47", 47 | "G.21", 48 | "G.44", 49 | "G.50", 50 | "G.43", 51 | "G.10", 52 | "G.1", 53 | "G.42", 54 | "G.41", 55 | "G.49" 56 | ] -------------------------------------------------------------------------------- /tests/data/pbg_training/model/checkpoint_version.txt: -------------------------------------------------------------------------------- 1 | 10 2 | -------------------------------------------------------------------------------- /tests/data/pbg_training/model/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "entities": { 3 | "C": { 4 | 
"num_partitions": 1, 5 | "featurized": false, 6 | "dimension": null 7 | }, 8 | "G": { 9 | "num_partitions": 1, 10 | "featurized": false, 11 | "dimension": null 12 | } 13 | }, 14 | "relations": [ 15 | { 16 | "name": "r0", 17 | "lhs": "C", 18 | "rhs": "G", 19 | "weight": 1.0, 20 | "operator": "none", 21 | "all_negs": false 22 | }, 23 | { 24 | "name": "r1", 25 | "lhs": "C", 26 | "rhs": "G", 27 | "weight": 2.0, 28 | "operator": "none", 29 | "all_negs": false 30 | }, 31 | { 32 | "name": "r2", 33 | "lhs": "C", 34 | "rhs": "G", 35 | "weight": 3.0, 36 | "operator": "none", 37 | "all_negs": false 38 | }, 39 | { 40 | "name": "r3", 41 | "lhs": "C", 42 | "rhs": "G", 43 | "weight": 4.0, 44 | "operator": "none", 45 | "all_negs": false 46 | }, 47 | { 48 | "name": "r4", 49 | "lhs": "C", 50 | "rhs": "G", 51 | "weight": 5.0, 52 | "operator": "none", 53 | "all_negs": false 54 | } 55 | ], 56 | "dimension": 50, 57 | "init_scale": 0.001, 58 | "max_norm": null, 59 | "global_emb": false, 60 | "comparator": "dot", 61 | "bias": false, 62 | "loss_fn": "softmax", 63 | "margin": 0.1, 64 | "regularization_coef": 0.0, 65 | "regularizer": "N3", 66 | "wd": 32.962933, 67 | "wd_interval": 50, 68 | "entity_path": "./result_simba/pbg/graph0/input/entity", 69 | "edge_paths": [ 70 | "./result_simba/pbg/graph0/input/edge" 71 | ], 72 | "checkpoint_path": "result_simba/pbg/graph0/model", 73 | "init_path": null, 74 | "checkpoint_preservation_interval": null, 75 | "num_epochs": 10, 76 | "num_edge_chunks": null, 77 | "max_edges_per_chunk": 1000000000, 78 | "bucket_order": "inside_out", 79 | "workers": 12, 80 | "batch_size": 1000, 81 | "num_batch_negs": 50, 82 | "num_uniform_negs": 50, 83 | "disable_lhs_negs": false, 84 | "disable_rhs_negs": false, 85 | "lr": 0.1, 86 | "relation_lr": null, 87 | "eval_fraction": 0.05, 88 | "eval_num_batch_negs": 50, 89 | "eval_num_uniform_negs": 50, 90 | "early_stopping": false, 91 | "background_io": false, 92 | "verbose": 0, 93 | "hogwild_delay": 2.0, 94 | "dynamic_relations": 
false, 95 | "num_machines": 1, 96 | "num_partition_servers": -1, 97 | "distributed_init_method": null, 98 | "distributed_tree_init_order": true, 99 | "num_gpus": 0, 100 | "num_groups_for_partition_server": 16, 101 | "half_precision": false 102 | } -------------------------------------------------------------------------------- /tests/data/pbg_training/model/embeddings_C_0.v10.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinellolab/simba/7b25fd089873aba9580f9923f2e412d375de9a46/tests/data/pbg_training/model/embeddings_C_0.v10.h5 -------------------------------------------------------------------------------- /tests/data/pbg_training/model/embeddings_G_0.v10.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinellolab/simba/7b25fd089873aba9580f9923f2e412d375de9a46/tests/data/pbg_training/model/embeddings_G_0.v10.h5 -------------------------------------------------------------------------------- /tests/data/pbg_training/model/model.v10.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinellolab/simba/7b25fd089873aba9580f9923f2e412d375de9a46/tests/data/pbg_training/model/model.v10.h5 -------------------------------------------------------------------------------- /tests/data/pbg_training/model/training_stats.json: -------------------------------------------------------------------------------- 1 | {"lhs_partition": 0, "rhs_partition": 0, "index": 1, "stats": {"count": 1022, "metrics": {"loss": 23.252048253546487, "reg": 0.0, "violators_lhs": 36.36497064579256, "violators_rhs": 31.131115459882583}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.4885098259404, "pos_rank": 26.50943396226415, "mrr": 0.08514296270485194, "r1": 0.009433962264150943, "r10": 0.24528301886792453, "r50": 0.9339622641509434, "auc": 0.5094339645133829}}, "eval_stats_after": 
{"count": 53, "metrics": {"loss": 21.550433608720887, "pos_rank": 24.358490566037737, "mrr": 0.0786743724437536, "r1": 0.009433962264150943, "r10": 0.18867924528301888, "r50": 0.9433962264150944, "auc": 0.5094339647945368}}, "epoch_idx": 0, "edge_path_idx": 0, "edge_chunk_idx": 0} 2 | {"epoch_idx": 0, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 0, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.550433608720887, "pos_rank": 24.358490566037737, "mrr": 0.0786743724437536, "r1": 0.009433962264150943, "r10": 0.18867924528301888, "r50": 0.9433962264150944, "auc": 0.5094339647945368}}} 3 | {"lhs_partition": 0, "rhs_partition": 0, "index": 2, "stats": {"count": 1022, "metrics": {"loss": 22.486315262527615, "reg": 0.0, "violators_lhs": 30.62720156555773, "violators_rhs": 23.104696673189824}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.591775120429272, "pos_rank": 25.566037735849058, "mrr": 0.07682948650897674, "r1": 0.009433962264150943, "r10": 0.18867924528301888, "r50": 0.9716981132075472, "auc": 0.5660377372548265}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.554165813158143, "pos_rank": 24.27358490566038, "mrr": 0.07591504424388679, "r1": 0.0, "r10": 0.20754716981132076, "r50": 0.9811320754716981, "auc": 0.5471698152569106}}, "epoch_idx": 1, "edge_path_idx": 0, "edge_chunk_idx": 0} 4 | {"epoch_idx": 1, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 1, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.554165813158143, "pos_rank": 24.27358490566038, "mrr": 0.07591504424388679, "r1": 0.0, "r10": 0.20754716981132076, "r50": 0.9811320754716981, "auc": 0.5471698152569106}}} 5 | {"lhs_partition": 0, "rhs_partition": 0, "index": 3, "stats": {"count": 1022, "metrics": {"loss": 22.591744126172447, "reg": 0.0, "violators_lhs": 30.874755381604697, "violators_rhs": 23.437377690802347}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.54162585960244, "pos_rank": 24.30188679245283, "mrr": 0.08181398664161844, 
"r1": 0.018867924528301886, "r10": 0.2169811320754717, "r50": 0.9622641509433962, "auc": 0.5377358521492976}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.498737011315686, "pos_rank": 24.61320754716981, "mrr": 0.07638183331011601, "r1": 0.009433962264150943, "r10": 0.2169811320754717, "r50": 0.9433962264150944, "auc": 0.5660377380982885}}, "epoch_idx": 2, "edge_path_idx": 0, "edge_chunk_idx": 0} 6 | {"epoch_idx": 2, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 2, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.498737011315686, "pos_rank": 24.61320754716981, "mrr": 0.07638183331011601, "r1": 0.009433962264150943, "r10": 0.2169811320754717, "r50": 0.9433962264150944, "auc": 0.5660377380982885}}} 7 | {"lhs_partition": 0, "rhs_partition": 0, "index": 4, "stats": {"count": 1022, "metrics": {"loss": 22.62260271658403, "reg": 0.0, "violators_lhs": 30.645792563600782, "violators_rhs": 23.364970645792564}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.508187914794345, "pos_rank": 24.88679245283019, "mrr": 0.08207970859377452, "r1": 0.009433962264150943, "r10": 0.20754716981132076, "r50": 0.9245283018867925, "auc": 0.4905660402662349}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.506183201411986, "pos_rank": 24.67924528301887, "mrr": 0.0760200916490746, "r1": 0.0, "r10": 0.24528301886792453, "r50": 0.9339622641509434, "auc": 0.5283018881982228}}, "epoch_idx": 3, "edge_path_idx": 0, "edge_chunk_idx": 0} 8 | {"epoch_idx": 3, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 3, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.506183201411986, "pos_rank": 24.67924528301887, "mrr": 0.0760200916490746, "r1": 0.0, "r10": 0.24528301886792453, "r50": 0.9339622641509434, "auc": 0.5283018881982228}}} 9 | {"lhs_partition": 0, "rhs_partition": 0, "index": 5, "stats": {"count": 1022, "metrics": {"loss": 22.690110387634157, "reg": 0.0, "violators_lhs": 30.770058708414872, "violators_rhs": 23.117416829745597}}, 
"eval_stats_before": {"count": 53, "metrics": {"loss": 21.526717563845075, "pos_rank": 24.89622641509434, "mrr": 0.07438522921699398, "r1": 0.0, "r10": 0.2358490566037736, "r50": 0.9622641509433962, "auc": 0.5188679262152258}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.4956772462377, "pos_rank": 24.88679245283019, "mrr": 0.0729775528405916, "r1": 0.0, "r10": 0.19811320754716982, "r50": 0.9622641509433962, "auc": 0.5094339650756908}}, "epoch_idx": 4, "edge_path_idx": 0, "edge_chunk_idx": 0} 10 | {"epoch_idx": 4, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 4, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.4956772462377, "pos_rank": 24.88679245283019, "mrr": 0.0729775528405916, "r1": 0.0, "r10": 0.19811320754716982, "r50": 0.9622641509433962, "auc": 0.5094339650756908}}} 11 | {"lhs_partition": 0, "rhs_partition": 0, "index": 6, "stats": {"count": 1022, "metrics": {"loss": 22.763349835420076, "reg": 0.0, "violators_lhs": 31.012720156555773, "violators_rhs": 22.820939334637966}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.493189523804862, "pos_rank": 24.38679245283019, "mrr": 0.0724140086896577, "r1": 0.0, "r10": 0.19811320754716982, "r50": 0.9528301886792453, "auc": 0.500000000843462}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.495307175618297, "pos_rank": 24.735849056603772, "mrr": 0.08363932611877625, "r1": 0.018867924528301886, "r10": 0.18867924528301888, "r50": 0.9245283018867925, "auc": 0.5283018893228387}}, "epoch_idx": 5, "edge_path_idx": 0, "edge_chunk_idx": 0} 12 | {"epoch_idx": 5, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 5, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.495307175618297, "pos_rank": 24.735849056603772, "mrr": 0.08363932611877625, "r1": 0.018867924528301886, "r10": 0.18867924528301888, "r50": 0.9245283018867925, "auc": 0.5283018893228387}}} 13 | {"lhs_partition": 0, "rhs_partition": 0, "index": 7, "stats": {"count": 1022, "metrics": {"loss": 
22.76916164241425, "reg": 0.0, "violators_lhs": 32.49412915851272, "violators_rhs": 26.92367906066536}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.49149281123899, "pos_rank": 24.566037735849058, "mrr": 0.0773335443458186, "r1": 0.0, "r10": 0.22641509433962265, "r50": 0.9433962264150944, "auc": 0.5660377392229045}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.490976243648888, "pos_rank": 23.92452830188679, "mrr": 0.08941427604207453, "r1": 0.009433962264150943, "r10": 0.24528301886792453, "r50": 0.9433962264150944, "auc": 0.5283018876359148}}, "epoch_idx": 6, "edge_path_idx": 0, "edge_chunk_idx": 0} 14 | {"epoch_idx": 6, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 6, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.490976243648888, "pos_rank": 23.92452830188679, "mrr": 0.08941427604207453, "r1": 0.009433962264150943, "r10": 0.24528301886792453, "r50": 0.9433962264150944, "auc": 0.5283018876359148}}} 15 | {"lhs_partition": 0, "rhs_partition": 0, "index": 8, "stats": {"count": 1022, "metrics": {"loss": 22.794376134405862, "reg": 0.0, "violators_lhs": 31.71917808219178, "violators_rhs": 23.874755381604697}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.490235013781852, "pos_rank": 23.90566037735849, "mrr": 0.08131686016425209, "r1": 0.009433962264150943, "r10": 0.2641509433962264, "r50": 0.9528301886792453, "auc": 0.6792452849869458}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.488997333454638, "pos_rank": 24.28301886792453, "mrr": 0.09202509533332766, "r1": 0.009433962264150943, "r10": 0.22641509433962265, "r50": 0.9811320754716981, "auc": 0.50000000140577}}, "epoch_idx": 7, "edge_path_idx": 0, "edge_chunk_idx": 0} 16 | {"epoch_idx": 7, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 7, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.488997333454638, "pos_rank": 24.28301886792453, "mrr": 0.09202509533332766, "r1": 0.009433962264150943, "r10": 0.22641509433962265, "r50": 
0.9811320754716981, "auc": 0.50000000140577}}} 17 | {"lhs_partition": 0, "rhs_partition": 0, "index": 9, "stats": {"count": 1022, "metrics": {"loss": 22.79033338020459, "reg": 0.0, "violators_lhs": 31.304305283757337, "violators_rhs": 23.480430528375734}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.48922288642739, "pos_rank": 23.849056603773583, "mrr": 0.09737446559768803, "r1": 0.018867924528301886, "r10": 0.25471698113207547, "r50": 0.9622641509433962, "auc": 0.5943396251718953}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.489219827472038, "pos_rank": 24.80188679245283, "mrr": 0.0892462246822861, "r1": 0.009433962264150943, "r10": 0.19811320754716982, "r50": 0.9433962264150944, "auc": 0.5377358524304516}}, "epoch_idx": 8, "edge_path_idx": 0, "edge_chunk_idx": 0} 18 | {"epoch_idx": 8, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 8, "eval_stats_chunk_avg": {"count": 53, "metrics": {"loss": 21.489219827472038, "pos_rank": 24.80188679245283, "mrr": 0.0892462246822861, "r1": 0.009433962264150943, "r10": 0.19811320754716982, "r50": 0.9433962264150944, "auc": 0.5377358524304516}}} 19 | {"lhs_partition": 0, "rhs_partition": 0, "index": 10, "stats": {"count": 1022, "metrics": {"loss": 22.792577922694136, "reg": 0.0, "violators_lhs": 31.645792563600782, "violators_rhs": 23.681996086105674}}, "eval_stats_before": {"count": 53, "metrics": {"loss": 21.489368258782154, "pos_rank": 24.528301886792452, "mrr": 0.08778320558650314, "r1": 0.018867924528301886, "r10": 0.24528301886792453, "r50": 0.9433962264150944, "auc": 0.5660377378171345}}, "eval_stats_after": {"count": 53, "metrics": {"loss": 21.48918512632262, "pos_rank": 24.132075471698112, "mrr": 0.0999531695127206, "r1": 0.018867924528301886, "r10": 0.24528301886792453, "r50": 0.9433962264150944, "auc": 0.5188679267775338}}, "epoch_idx": 9, "edge_path_idx": 0, "edge_chunk_idx": 0} 20 | {"epoch_idx": 9, "edge_path_idx": 0, "edge_chunk_idx": 0, "index": 9, "eval_stats_chunk_avg": {"count": 53, 
"metrics": {"loss": 21.48918512632262, "pos_rank": 24.132075471698112, "mrr": 0.0999531695127206, "r1": 0.018867924528301886, "r10": 0.24528301886792453, "r50": 0.9433962264150944, "auc": 0.5188679267775338}}} 21 | -------------------------------------------------------------------------------- /tests/data/pbg_training/pbg_graph.txt: -------------------------------------------------------------------------------- 1 | C.2 r0 G.0 2 | C.2 r0 G.4 3 | C.2 r0 G.14 4 | C.2 r0 G.22 5 | C.2 r0 G.25 6 | C.2 r0 G.43 7 | C.3 r0 G.1 8 | C.3 r0 G.11 9 | C.3 r0 G.13 10 | C.3 r0 G.17 11 | C.3 r0 G.20 12 | C.3 r0 G.24 13 | C.3 r0 G.25 14 | C.3 r0 G.36 15 | C.3 r0 G.44 16 | C.3 r0 G.45 17 | C.4 r0 G.0 18 | C.4 r0 G.5 19 | C.4 r0 G.9 20 | C.4 r0 G.11 21 | C.4 r0 G.26 22 | C.4 r0 G.35 23 | C.4 r0 G.52 24 | C.8 r0 G.2 25 | C.8 r0 G.7 26 | C.8 r0 G.32 27 | C.8 r0 G.40 28 | C.8 r0 G.52 29 | C.16 r0 G.0 30 | C.16 r0 G.3 31 | C.16 r0 G.5 32 | C.16 r0 G.6 33 | C.16 r0 G.7 34 | C.16 r0 G.9 35 | C.16 r0 G.16 36 | C.16 r0 G.23 37 | C.16 r0 G.27 38 | C.16 r0 G.30 39 | C.16 r0 G.32 40 | C.16 r0 G.43 41 | C.16 r0 G.44 42 | C.16 r0 G.46 43 | C.16 r0 G.49 44 | C.16 r0 G.52 45 | C.22 r0 G.6 46 | C.22 r0 G.14 47 | C.22 r0 G.16 48 | C.22 r0 G.17 49 | C.22 r0 G.20 50 | C.22 r0 G.21 51 | C.22 r0 G.24 52 | C.22 r0 G.33 53 | C.22 r0 G.34 54 | C.27 r0 G.4 55 | C.27 r0 G.8 56 | C.27 r0 G.9 57 | C.27 r0 G.13 58 | C.27 r0 G.20 59 | C.27 r0 G.27 60 | C.27 r0 G.30 61 | C.27 r0 G.36 62 | C.30 r0 G.5 63 | C.30 r0 G.7 64 | C.30 r0 G.16 65 | C.30 r0 G.24 66 | C.30 r0 G.26 67 | C.30 r0 G.28 68 | C.30 r0 G.37 69 | C.30 r0 G.40 70 | C.30 r0 G.43 71 | C.45 r0 G.11 72 | C.45 r0 G.12 73 | C.45 r0 G.27 74 | C.45 r0 G.40 75 | C.47 r0 G.9 76 | C.47 r0 G.21 77 | C.47 r0 G.32 78 | C.47 r0 G.34 79 | C.47 r0 G.35 80 | C.47 r0 G.36 81 | C.47 r0 G.41 82 | C.47 r0 G.46 83 | C.52 r0 G.0 84 | C.52 r0 G.3 85 | C.52 r0 G.7 86 | C.52 r0 G.20 87 | C.52 r0 G.24 88 | C.52 r0 G.25 89 | C.52 r0 G.33 90 | C.52 r0 G.35 91 | C.52 r0 G.43 92 | 
C.52 r0 G.49 93 | C.68 r0 G.1 94 | C.68 r0 G.4 95 | C.68 r0 G.8 96 | C.68 r0 G.17 97 | C.68 r0 G.20 98 | C.68 r0 G.21 99 | C.68 r0 G.23 100 | C.68 r0 G.24 101 | C.68 r0 G.25 102 | C.68 r0 G.34 103 | C.68 r0 G.51 104 | C.72 r0 G.3 105 | C.72 r0 G.13 106 | C.72 r0 G.25 107 | C.72 r0 G.41 108 | C.74 r0 G.6 109 | C.74 r0 G.7 110 | C.74 r0 G.10 111 | C.74 r0 G.13 112 | C.74 r0 G.16 113 | C.74 r0 G.27 114 | C.74 r0 G.32 115 | C.74 r0 G.35 116 | C.74 r0 G.43 117 | C.74 r0 G.52 118 | C.80 r0 G.6 119 | C.80 r0 G.24 120 | C.90 r0 G.7 121 | C.90 r0 G.9 122 | C.90 r0 G.15 123 | C.90 r0 G.16 124 | C.90 r0 G.17 125 | C.90 r0 G.24 126 | C.90 r0 G.25 127 | C.90 r0 G.33 128 | C.90 r0 G.34 129 | C.90 r0 G.35 130 | C.90 r0 G.39 131 | C.91 r0 G.5 132 | C.91 r0 G.9 133 | C.91 r0 G.14 134 | C.91 r0 G.23 135 | C.91 r0 G.37 136 | C.91 r0 G.42 137 | C.95 r0 G.21 138 | C.95 r0 G.29 139 | C.95 r0 G.37 140 | C.98 r0 G.4 141 | C.98 r0 G.7 142 | C.98 r0 G.13 143 | C.98 r0 G.15 144 | C.98 r0 G.23 145 | C.98 r0 G.24 146 | C.98 r0 G.32 147 | C.98 r0 G.35 148 | C.98 r0 G.36 149 | C.98 r0 G.43 150 | C.98 r0 G.46 151 | C.98 r0 G.49 152 | C.98 r0 G.51 153 | C.98 r0 G.52 154 | C.0 r1 G.1 155 | C.0 r1 G.16 156 | C.0 r1 G.38 157 | C.2 r1 G.33 158 | C.2 r1 G.34 159 | C.2 r1 G.41 160 | C.3 r1 G.16 161 | C.3 r1 G.52 162 | C.4 r1 G.7 163 | C.4 r1 G.10 164 | C.4 r1 G.31 165 | C.4 r1 G.32 166 | C.4 r1 G.36 167 | C.4 r1 G.46 168 | C.4 r1 G.47 169 | C.5 r1 G.11 170 | C.5 r1 G.12 171 | C.5 r1 G.19 172 | C.5 r1 G.20 173 | C.5 r1 G.24 174 | C.5 r1 G.34 175 | C.5 r1 G.43 176 | C.6 r1 G.7 177 | C.6 r1 G.10 178 | C.6 r1 G.11 179 | C.6 r1 G.13 180 | C.6 r1 G.14 181 | C.6 r1 G.16 182 | C.6 r1 G.33 183 | C.6 r1 G.46 184 | C.6 r1 G.48 185 | C.7 r1 G.2 186 | C.7 r1 G.10 187 | C.7 r1 G.25 188 | C.7 r1 G.28 189 | C.7 r1 G.33 190 | C.7 r1 G.43 191 | C.7 r1 G.45 192 | C.8 r1 G.34 193 | C.8 r1 G.51 194 | C.14 r1 G.0 195 | C.14 r1 G.7 196 | C.14 r1 G.13 197 | C.14 r1 G.16 198 | C.14 r1 G.33 199 | C.14 r1 G.34 200 | C.14 r1 G.47 
201 | C.14 r1 G.53 202 | C.16 r1 G.14 203 | C.16 r1 G.33 204 | C.17 r1 G.4 205 | C.17 r1 G.8 206 | C.17 r1 G.28 207 | C.17 r1 G.34 208 | C.17 r1 G.39 209 | C.17 r1 G.46 210 | C.17 r1 G.53 211 | C.18 r1 G.6 212 | C.18 r1 G.31 213 | C.18 r1 G.43 214 | C.18 r1 G.46 215 | C.18 r1 G.49 216 | C.19 r1 G.1 217 | C.19 r1 G.7 218 | C.19 r1 G.9 219 | C.19 r1 G.34 220 | C.19 r1 G.43 221 | C.19 r1 G.46 222 | C.20 r1 G.17 223 | C.20 r1 G.24 224 | C.20 r1 G.33 225 | C.21 r1 G.0 226 | C.21 r1 G.2 227 | C.21 r1 G.20 228 | C.21 r1 G.30 229 | C.21 r1 G.34 230 | C.21 r1 G.42 231 | C.21 r1 G.51 232 | C.22 r1 G.1 233 | C.22 r1 G.4 234 | C.22 r1 G.7 235 | C.22 r1 G.23 236 | C.22 r1 G.36 237 | C.22 r1 G.46 238 | C.23 r1 G.4 239 | C.23 r1 G.7 240 | C.23 r1 G.9 241 | C.23 r1 G.11 242 | C.23 r1 G.16 243 | C.23 r1 G.41 244 | C.23 r1 G.52 245 | C.26 r1 G.2 246 | C.26 r1 G.14 247 | C.26 r1 G.32 248 | C.26 r1 G.35 249 | C.27 r1 G.24 250 | C.27 r1 G.25 251 | C.27 r1 G.35 252 | C.27 r1 G.47 253 | C.27 r1 G.52 254 | C.29 r1 G.9 255 | C.29 r1 G.11 256 | C.29 r1 G.14 257 | C.29 r1 G.19 258 | C.29 r1 G.20 259 | C.29 r1 G.24 260 | C.29 r1 G.34 261 | C.29 r1 G.35 262 | C.29 r1 G.42 263 | C.29 r1 G.47 264 | C.29 r1 G.52 265 | C.30 r1 G.23 266 | C.30 r1 G.35 267 | C.30 r1 G.47 268 | C.32 r1 G.4 269 | C.32 r1 G.7 270 | C.32 r1 G.11 271 | C.32 r1 G.17 272 | C.32 r1 G.33 273 | C.32 r1 G.46 274 | C.33 r1 G.0 275 | C.33 r1 G.2 276 | C.33 r1 G.8 277 | C.33 r1 G.16 278 | C.33 r1 G.20 279 | C.33 r1 G.24 280 | C.33 r1 G.32 281 | C.33 r1 G.34 282 | C.33 r1 G.43 283 | C.33 r1 G.47 284 | C.33 r1 G.52 285 | C.33 r1 G.53 286 | C.36 r1 G.16 287 | C.36 r1 G.31 288 | C.36 r1 G.51 289 | C.37 r1 G.7 290 | C.37 r1 G.13 291 | C.37 r1 G.17 292 | C.37 r1 G.18 293 | C.37 r1 G.34 294 | C.37 r1 G.35 295 | C.37 r1 G.37 296 | C.37 r1 G.49 297 | C.37 r1 G.50 298 | C.38 r1 G.13 299 | C.38 r1 G.43 300 | C.39 r1 G.2 301 | C.39 r1 G.6 302 | C.39 r1 G.22 303 | C.39 r1 G.39 304 | C.39 r1 G.50 305 | C.40 r1 G.14 306 | C.40 r1 G.15 307 | 
C.40 r1 G.17 308 | C.40 r1 G.34 309 | C.40 r1 G.39 310 | C.41 r1 G.7 311 | C.41 r1 G.13 312 | C.41 r1 G.31 313 | C.41 r1 G.34 314 | C.41 r1 G.42 315 | C.41 r1 G.46 316 | C.42 r1 G.3 317 | C.42 r1 G.4 318 | C.42 r1 G.9 319 | C.42 r1 G.11 320 | C.42 r1 G.14 321 | C.42 r1 G.20 322 | C.42 r1 G.24 323 | C.42 r1 G.31 324 | C.42 r1 G.46 325 | C.45 r1 G.8 326 | C.45 r1 G.17 327 | C.45 r1 G.31 328 | C.45 r1 G.35 329 | C.45 r1 G.46 330 | C.45 r1 G.47 331 | C.45 r1 G.52 332 | C.47 r1 G.2 333 | C.47 r1 G.52 334 | C.49 r1 G.0 335 | C.49 r1 G.7 336 | C.49 r1 G.27 337 | C.49 r1 G.34 338 | C.49 r1 G.35 339 | C.49 r1 G.36 340 | C.49 r1 G.47 341 | C.50 r1 G.4 342 | C.50 r1 G.8 343 | C.50 r1 G.11 344 | C.50 r1 G.14 345 | C.50 r1 G.16 346 | C.50 r1 G.20 347 | C.50 r1 G.31 348 | C.50 r1 G.32 349 | C.50 r1 G.43 350 | C.51 r1 G.15 351 | C.51 r1 G.23 352 | C.51 r1 G.30 353 | C.51 r1 G.31 354 | C.51 r1 G.46 355 | C.52 r1 G.17 356 | C.52 r1 G.30 357 | C.53 r1 G.30 358 | C.53 r1 G.33 359 | C.53 r1 G.43 360 | C.54 r1 G.34 361 | C.54 r1 G.36 362 | C.55 r1 G.1 363 | C.55 r1 G.3 364 | C.55 r1 G.16 365 | C.55 r1 G.18 366 | C.55 r1 G.34 367 | C.55 r1 G.43 368 | C.55 r1 G.46 369 | C.55 r1 G.52 370 | C.56 r1 G.0 371 | C.56 r1 G.1 372 | C.56 r1 G.6 373 | C.56 r1 G.24 374 | C.56 r1 G.26 375 | C.56 r1 G.46 376 | C.58 r1 G.27 377 | C.58 r1 G.31 378 | C.58 r1 G.48 379 | C.58 r1 G.50 380 | C.60 r1 G.13 381 | C.60 r1 G.16 382 | C.60 r1 G.31 383 | C.60 r1 G.37 384 | C.60 r1 G.43 385 | C.60 r1 G.52 386 | C.62 r1 G.8 387 | C.62 r1 G.9 388 | C.62 r1 G.14 389 | C.62 r1 G.20 390 | C.62 r1 G.26 391 | C.62 r1 G.34 392 | C.62 r1 G.46 393 | C.62 r1 G.52 394 | C.63 r1 G.3 395 | C.63 r1 G.35 396 | C.65 r1 G.0 397 | C.65 r1 G.7 398 | C.65 r1 G.24 399 | C.65 r1 G.25 400 | C.65 r1 G.46 401 | C.67 r1 G.0 402 | C.67 r1 G.2 403 | C.67 r1 G.7 404 | C.67 r1 G.8 405 | C.67 r1 G.14 406 | C.67 r1 G.46 407 | C.67 r1 G.52 408 | C.68 r1 G.31 409 | C.69 r1 G.4 410 | C.69 r1 G.22 411 | C.69 r1 G.46 412 | C.69 r1 G.49 413 | C.70 r1 
G.1 414 | C.70 r1 G.7 415 | C.70 r1 G.8 416 | C.70 r1 G.9 417 | C.70 r1 G.17 418 | C.70 r1 G.31 419 | C.70 r1 G.32 420 | C.70 r1 G.35 421 | C.70 r1 G.37 422 | C.70 r1 G.43 423 | C.71 r1 G.2 424 | C.71 r1 G.26 425 | C.71 r1 G.32 426 | C.71 r1 G.35 427 | C.71 r1 G.43 428 | C.71 r1 G.46 429 | C.72 r1 G.16 430 | C.72 r1 G.43 431 | C.72 r1 G.52 432 | C.73 r1 G.8 433 | C.73 r1 G.33 434 | C.73 r1 G.47 435 | C.75 r1 G.4 436 | C.75 r1 G.7 437 | C.75 r1 G.17 438 | C.75 r1 G.33 439 | C.75 r1 G.35 440 | C.75 r1 G.37 441 | C.75 r1 G.47 442 | C.77 r1 G.7 443 | C.77 r1 G.15 444 | C.77 r1 G.16 445 | C.77 r1 G.34 446 | C.77 r1 G.36 447 | C.77 r1 G.49 448 | C.77 r1 G.52 449 | C.79 r1 G.1 450 | C.79 r1 G.4 451 | C.79 r1 G.11 452 | C.79 r1 G.13 453 | C.79 r1 G.16 454 | C.79 r1 G.17 455 | C.79 r1 G.21 456 | C.79 r1 G.31 457 | C.79 r1 G.37 458 | C.79 r1 G.44 459 | C.79 r1 G.46 460 | C.80 r1 G.22 461 | C.80 r1 G.43 462 | C.80 r1 G.46 463 | C.81 r1 G.3 464 | C.81 r1 G.4 465 | C.81 r1 G.17 466 | C.81 r1 G.30 467 | C.81 r1 G.33 468 | C.81 r1 G.35 469 | C.82 r1 G.11 470 | C.82 r1 G.13 471 | C.82 r1 G.25 472 | C.82 r1 G.31 473 | C.82 r1 G.34 474 | C.82 r1 G.42 475 | C.83 r1 G.1 476 | C.83 r1 G.6 477 | C.83 r1 G.36 478 | C.83 r1 G.42 479 | C.84 r1 G.25 480 | C.84 r1 G.30 481 | C.84 r1 G.41 482 | C.84 r1 G.42 483 | C.84 r1 G.43 484 | C.88 r1 G.4 485 | C.88 r1 G.7 486 | C.88 r1 G.11 487 | C.88 r1 G.13 488 | C.88 r1 G.22 489 | C.88 r1 G.32 490 | C.88 r1 G.34 491 | C.88 r1 G.35 492 | C.88 r1 G.37 493 | C.88 r1 G.42 494 | C.89 r1 G.10 495 | C.89 r1 G.19 496 | C.89 r1 G.20 497 | C.89 r1 G.21 498 | C.89 r1 G.26 499 | C.89 r1 G.35 500 | C.89 r1 G.38 501 | C.90 r1 G.3 502 | C.91 r1 G.2 503 | C.91 r1 G.7 504 | C.91 r1 G.11 505 | C.91 r1 G.34 506 | C.91 r1 G.35 507 | C.91 r1 G.36 508 | C.91 r1 G.47 509 | C.91 r1 G.51 510 | C.93 r1 G.7 511 | C.93 r1 G.25 512 | C.93 r1 G.46 513 | C.93 r1 G.49 514 | C.94 r1 G.14 515 | C.94 r1 G.25 516 | C.94 r1 G.34 517 | C.94 r1 G.47 518 | C.94 r1 G.52 519 | C.95 r1 G.32 
520 | C.95 r1 G.40 521 | C.95 r1 G.43 522 | C.95 r1 G.46 523 | C.0 r2 G.2 524 | C.0 r2 G.25 525 | C.0 r2 G.31 526 | C.0 r2 G.46 527 | C.0 r2 G.47 528 | C.1 r2 G.8 529 | C.1 r2 G.19 530 | C.1 r2 G.33 531 | C.1 r2 G.34 532 | C.1 r2 G.35 533 | C.1 r2 G.36 534 | C.1 r2 G.47 535 | C.2 r2 G.2 536 | C.3 r2 G.19 537 | C.3 r2 G.31 538 | C.3 r2 G.34 539 | C.3 r2 G.35 540 | C.4 r2 G.19 541 | C.4 r2 G.34 542 | C.5 r2 G.16 543 | C.5 r2 G.45 544 | C.5 r2 G.52 545 | C.6 r2 G.25 546 | C.7 r2 G.31 547 | C.8 r2 G.20 548 | C.8 r2 G.24 549 | C.8 r2 G.31 550 | C.8 r2 G.33 551 | C.8 r2 G.35 552 | C.8 r2 G.47 553 | C.9 r2 G.6 554 | C.9 r2 G.19 555 | C.9 r2 G.24 556 | C.9 r2 G.25 557 | C.9 r2 G.34 558 | C.11 r2 G.3 559 | C.11 r2 G.19 560 | C.11 r2 G.31 561 | C.11 r2 G.36 562 | C.11 r2 G.40 563 | C.11 r2 G.46 564 | C.12 r2 G.1 565 | C.12 r2 G.7 566 | C.12 r2 G.15 567 | C.12 r2 G.31 568 | C.13 r2 G.24 569 | C.13 r2 G.25 570 | C.13 r2 G.37 571 | C.14 r2 G.11 572 | C.14 r2 G.14 573 | C.15 r2 G.47 574 | C.17 r2 G.5 575 | C.17 r2 G.7 576 | C.17 r2 G.11 577 | C.17 r2 G.35 578 | C.17 r2 G.52 579 | C.19 r2 G.17 580 | C.19 r2 G.31 581 | C.19 r2 G.47 582 | C.20 r2 G.46 583 | C.21 r2 G.19 584 | C.21 r2 G.35 585 | C.21 r2 G.47 586 | C.22 r2 G.25 587 | C.22 r2 G.31 588 | C.22 r2 G.47 589 | C.23 r2 G.17 590 | C.23 r2 G.31 591 | C.23 r2 G.43 592 | C.23 r2 G.46 593 | C.24 r2 G.31 594 | C.24 r2 G.43 595 | C.24 r2 G.47 596 | C.26 r2 G.28 597 | C.27 r2 G.7 598 | C.27 r2 G.14 599 | C.27 r2 G.31 600 | C.27 r2 G.46 601 | C.28 r2 G.2 602 | C.28 r2 G.7 603 | C.28 r2 G.17 604 | C.30 r2 G.2 605 | C.30 r2 G.11 606 | C.30 r2 G.14 607 | C.30 r2 G.17 608 | C.30 r2 G.29 609 | C.30 r2 G.34 610 | C.30 r2 G.46 611 | C.30 r2 G.52 612 | C.31 r2 G.7 613 | C.31 r2 G.16 614 | C.32 r2 G.16 615 | C.32 r2 G.43 616 | C.33 r2 G.19 617 | C.33 r2 G.35 618 | C.34 r2 G.23 619 | C.34 r2 G.37 620 | C.34 r2 G.42 621 | C.34 r2 G.47 622 | C.35 r2 G.14 623 | C.35 r2 G.19 624 | C.36 r2 G.24 625 | C.36 r2 G.33 626 | C.36 r2 G.46 627 | C.37 r2 
G.33 628 | C.38 r2 G.25 629 | C.39 r2 G.7 630 | C.39 r2 G.43 631 | C.40 r2 G.2 632 | C.40 r2 G.31 633 | C.41 r2 G.9 634 | C.43 r2 G.9 635 | C.43 r2 G.22 636 | C.43 r2 G.26 637 | C.43 r2 G.31 638 | C.43 r2 G.39 639 | C.44 r2 G.3 640 | C.44 r2 G.17 641 | C.44 r2 G.24 642 | C.44 r2 G.33 643 | C.44 r2 G.46 644 | C.45 r2 G.19 645 | C.45 r2 G.24 646 | C.45 r2 G.34 647 | C.46 r2 G.0 648 | C.48 r2 G.0 649 | C.48 r2 G.22 650 | C.48 r2 G.33 651 | C.48 r2 G.34 652 | C.48 r2 G.35 653 | C.48 r2 G.52 654 | C.49 r2 G.31 655 | C.49 r2 G.46 656 | C.50 r2 G.2 657 | C.50 r2 G.33 658 | C.50 r2 G.46 659 | C.50 r2 G.52 660 | C.51 r2 G.4 661 | C.51 r2 G.11 662 | C.51 r2 G.25 663 | C.51 r2 G.33 664 | C.51 r2 G.34 665 | C.52 r2 G.31 666 | C.52 r2 G.46 667 | C.53 r2 G.7 668 | C.53 r2 G.34 669 | C.53 r2 G.47 670 | C.54 r2 G.42 671 | C.55 r2 G.7 672 | C.55 r2 G.25 673 | C.56 r2 G.13 674 | C.56 r2 G.25 675 | C.56 r2 G.31 676 | C.56 r2 G.33 677 | C.56 r2 G.47 678 | C.57 r2 G.13 679 | C.57 r2 G.18 680 | C.57 r2 G.21 681 | C.57 r2 G.24 682 | C.57 r2 G.46 683 | C.58 r2 G.46 684 | C.58 r2 G.49 685 | C.59 r2 G.11 686 | C.59 r2 G.23 687 | C.59 r2 G.34 688 | C.59 r2 G.40 689 | C.59 r2 G.43 690 | C.59 r2 G.47 691 | C.60 r2 G.25 692 | C.60 r2 G.34 693 | C.60 r2 G.47 694 | C.60 r2 G.49 695 | C.61 r2 G.11 696 | C.61 r2 G.34 697 | C.61 r2 G.39 698 | C.61 r2 G.46 699 | C.62 r2 G.23 700 | C.62 r2 G.33 701 | C.62 r2 G.47 702 | C.63 r2 G.9 703 | C.64 r2 G.1 704 | C.64 r2 G.11 705 | C.64 r2 G.18 706 | C.64 r2 G.19 707 | C.64 r2 G.21 708 | C.64 r2 G.48 709 | C.64 r2 G.52 710 | C.65 r2 G.11 711 | C.65 r2 G.19 712 | C.65 r2 G.28 713 | C.65 r2 G.35 714 | C.65 r2 G.47 715 | C.66 r2 G.1 716 | C.66 r2 G.14 717 | C.66 r2 G.19 718 | C.66 r2 G.27 719 | C.67 r2 G.35 720 | C.67 r2 G.47 721 | C.68 r2 G.2 722 | C.68 r2 G.35 723 | C.68 r2 G.52 724 | C.70 r2 G.33 725 | C.70 r2 G.34 726 | C.70 r2 G.46 727 | C.71 r2 G.11 728 | C.71 r2 G.31 729 | C.71 r2 G.33 730 | C.71 r2 G.34 731 | C.72 r2 G.34 732 | C.72 r2 G.46 733 | C.73 r2 
G.31 734 | C.74 r2 G.31 735 | C.74 r2 G.46 736 | C.75 r2 G.31 737 | C.76 r2 G.1 738 | C.76 r2 G.7 739 | C.76 r2 G.27 740 | C.76 r2 G.32 741 | C.76 r2 G.33 742 | C.76 r2 G.37 743 | C.76 r2 G.44 744 | C.76 r2 G.47 745 | C.77 r2 G.3 746 | C.77 r2 G.31 747 | C.77 r2 G.46 748 | C.77 r2 G.47 749 | C.78 r2 G.25 750 | C.78 r2 G.41 751 | C.78 r2 G.52 752 | C.80 r2 G.31 753 | C.80 r2 G.47 754 | C.81 r2 G.25 755 | C.81 r2 G.46 756 | C.81 r2 G.47 757 | C.82 r2 G.7 758 | C.82 r2 G.33 759 | C.82 r2 G.47 760 | C.83 r2 G.47 761 | C.83 r2 G.48 762 | C.85 r2 G.9 763 | C.85 r2 G.25 764 | C.85 r2 G.31 765 | C.85 r2 G.33 766 | C.85 r2 G.35 767 | C.85 r2 G.43 768 | C.86 r2 G.15 769 | C.86 r2 G.16 770 | C.86 r2 G.34 771 | C.86 r2 G.38 772 | C.86 r2 G.47 773 | C.87 r2 G.7 774 | C.87 r2 G.8 775 | C.87 r2 G.29 776 | C.87 r2 G.30 777 | C.87 r2 G.31 778 | C.87 r2 G.32 779 | C.87 r2 G.35 780 | C.87 r2 G.52 781 | C.88 r2 G.2 782 | C.88 r2 G.14 783 | C.88 r2 G.17 784 | C.88 r2 G.19 785 | C.88 r2 G.33 786 | C.88 r2 G.46 787 | C.88 r2 G.47 788 | C.89 r2 G.46 789 | C.90 r2 G.1 790 | C.90 r2 G.31 791 | C.90 r2 G.46 792 | C.91 r2 G.4 793 | C.91 r2 G.19 794 | C.91 r2 G.29 795 | C.91 r2 G.31 796 | C.91 r2 G.33 797 | C.92 r2 G.31 798 | C.92 r2 G.33 799 | C.93 r2 G.31 800 | C.93 r2 G.44 801 | C.94 r2 G.2 802 | C.94 r2 G.17 803 | C.94 r2 G.31 804 | C.94 r2 G.35 805 | C.95 r2 G.2 806 | C.95 r2 G.33 807 | C.95 r2 G.47 808 | C.95 r2 G.52 809 | C.96 r2 G.12 810 | C.96 r2 G.19 811 | C.97 r2 G.4 812 | C.97 r2 G.20 813 | C.97 r2 G.24 814 | C.97 r2 G.34 815 | C.97 r2 G.35 816 | C.97 r2 G.36 817 | C.98 r2 G.31 818 | C.98 r2 G.33 819 | C.99 r2 G.4 820 | C.99 r2 G.11 821 | C.99 r2 G.21 822 | C.99 r2 G.31 823 | C.99 r2 G.46 824 | C.1 r3 G.2 825 | C.1 r3 G.7 826 | C.1 r3 G.31 827 | C.2 r3 G.31 828 | C.2 r3 G.47 829 | C.3 r3 G.47 830 | C.4 r3 G.2 831 | C.4 r3 G.33 832 | C.5 r3 G.46 833 | C.5 r3 G.47 834 | C.6 r3 G.31 835 | C.7 r3 G.46 836 | C.7 r3 G.47 837 | C.8 r3 G.46 838 | C.9 r3 G.31 839 | C.9 r3 G.46 840 | C.10 r3 
G.17 841 | C.11 r3 G.2 842 | C.11 r3 G.24 843 | C.12 r3 G.47 844 | C.13 r3 G.46 845 | C.14 r3 G.1 846 | C.14 r3 G.19 847 | C.14 r3 G.31 848 | C.14 r3 G.46 849 | C.15 r3 G.6 850 | C.15 r3 G.13 851 | C.15 r3 G.31 852 | C.15 r3 G.35 853 | C.16 r3 G.31 854 | C.17 r3 G.2 855 | C.17 r3 G.33 856 | C.18 r3 G.1 857 | C.18 r3 G.47 858 | C.20 r3 G.47 859 | C.21 r3 G.33 860 | C.23 r3 G.19 861 | C.23 r3 G.47 862 | C.24 r3 G.1 863 | C.26 r3 G.4 864 | C.26 r3 G.19 865 | C.26 r3 G.33 866 | C.26 r3 G.46 867 | C.27 r3 G.2 868 | C.27 r3 G.19 869 | C.27 r3 G.33 870 | C.28 r3 G.15 871 | C.28 r3 G.19 872 | C.28 r3 G.33 873 | C.28 r3 G.34 874 | C.28 r3 G.35 875 | C.28 r3 G.46 876 | C.29 r3 G.2 877 | C.29 r3 G.4 878 | C.29 r3 G.33 879 | C.29 r3 G.46 880 | C.31 r3 G.11 881 | C.31 r3 G.19 882 | C.32 r3 G.25 883 | C.32 r3 G.31 884 | C.32 r3 G.47 885 | C.33 r3 G.31 886 | C.33 r3 G.33 887 | C.34 r3 G.11 888 | C.35 r3 G.35 889 | C.36 r3 G.47 890 | C.37 r3 G.31 891 | C.37 r3 G.47 892 | C.39 r3 G.31 893 | C.39 r3 G.47 894 | C.41 r3 G.47 895 | C.43 r3 G.17 896 | C.43 r3 G.46 897 | C.43 r3 G.52 898 | C.45 r3 G.2 899 | C.46 r3 G.31 900 | C.46 r3 G.35 901 | C.47 r3 G.31 902 | C.47 r3 G.33 903 | C.47 r3 G.47 904 | C.48 r3 G.47 905 | C.49 r3 G.2 906 | C.49 r3 G.17 907 | C.49 r3 G.19 908 | C.50 r3 G.19 909 | C.50 r3 G.47 910 | C.51 r3 G.47 911 | C.52 r3 G.47 912 | C.53 r3 G.31 913 | C.53 r3 G.46 914 | C.55 r3 G.2 915 | C.55 r3 G.19 916 | C.55 r3 G.47 917 | C.57 r3 G.17 918 | C.57 r3 G.47 919 | C.58 r3 G.47 920 | C.59 r3 G.19 921 | C.59 r3 G.31 922 | C.59 r3 G.33 923 | C.59 r3 G.46 924 | C.62 r3 G.19 925 | C.62 r3 G.31 926 | C.62 r3 G.35 927 | C.63 r3 G.47 928 | C.64 r3 G.47 929 | C.65 r3 G.33 930 | C.65 r3 G.34 931 | C.65 r3 G.52 932 | C.66 r3 G.45 933 | C.67 r3 G.1 934 | C.67 r3 G.19 935 | C.67 r3 G.33 936 | C.68 r3 G.19 937 | C.68 r3 G.33 938 | C.68 r3 G.47 939 | C.69 r3 G.31 940 | C.70 r3 G.2 941 | C.70 r3 G.19 942 | C.70 r3 G.47 943 | C.71 r3 G.47 944 | C.72 r3 G.19 945 | C.72 r3 G.31 946 | C.72 r3 
G.47 947 | C.73 r3 G.2 948 | C.73 r3 G.11 949 | C.73 r3 G.19 950 | C.73 r3 G.46 951 | C.75 r3 G.25 952 | C.76 r3 G.17 953 | C.76 r3 G.31 954 | C.77 r3 G.2 955 | C.78 r3 G.47 956 | C.79 r3 G.33 957 | C.79 r3 G.47 958 | C.83 r3 G.19 959 | C.83 r3 G.43 960 | C.83 r3 G.46 961 | C.84 r3 G.31 962 | C.84 r3 G.47 963 | C.85 r3 G.47 964 | C.86 r3 G.31 965 | C.86 r3 G.35 966 | C.87 r3 G.2 967 | C.87 r3 G.19 968 | C.87 r3 G.33 969 | C.89 r3 G.7 970 | C.89 r3 G.17 971 | C.89 r3 G.33 972 | C.90 r3 G.47 973 | C.91 r3 G.46 974 | C.92 r3 G.14 975 | C.92 r3 G.17 976 | C.92 r3 G.19 977 | C.94 r3 G.19 978 | C.94 r3 G.33 979 | C.94 r3 G.46 980 | C.95 r3 G.19 981 | C.95 r3 G.31 982 | C.95 r3 G.35 983 | C.96 r3 G.34 984 | C.96 r3 G.35 985 | C.96 r3 G.47 986 | C.97 r3 G.2 987 | C.97 r3 G.27 988 | C.97 r3 G.46 989 | C.98 r3 G.47 990 | C.0 r4 G.19 991 | C.2 r4 G.19 992 | C.3 r4 G.46 993 | C.6 r4 G.19 994 | C.7 r4 G.19 995 | C.8 r4 G.19 996 | C.9 r4 G.47 997 | C.10 r4 G.19 998 | C.10 r4 G.47 999 | C.12 r4 G.19 1000 | C.13 r4 G.19 1001 | C.15 r4 G.19 1002 | C.16 r4 G.19 1003 | C.16 r4 G.47 1004 | C.18 r4 G.19 1005 | C.19 r4 G.19 1006 | C.20 r4 G.19 1007 | C.21 r4 G.46 1008 | C.22 r4 G.19 1009 | C.24 r4 G.19 1010 | C.25 r4 G.19 1011 | C.25 r4 G.46 1012 | C.25 r4 G.53 1013 | C.30 r4 G.33 1014 | C.31 r4 G.47 1015 | C.32 r4 G.19 1016 | C.33 r4 G.46 1017 | C.34 r4 G.19 1018 | C.35 r4 G.2 1019 | C.35 r4 G.33 1020 | C.36 r4 G.19 1021 | C.37 r4 G.19 1022 | C.38 r4 G.19 1023 | C.38 r4 G.47 1024 | C.39 r4 G.19 1025 | C.40 r4 G.19 1026 | C.40 r4 G.47 1027 | C.41 r4 G.19 1028 | C.42 r4 G.19 1029 | C.42 r4 G.47 1030 | C.44 r4 G.19 1031 | C.44 r4 G.47 1032 | C.45 r4 G.33 1033 | C.46 r4 G.19 1034 | C.47 r4 G.19 1035 | C.48 r4 G.19 1036 | C.51 r4 G.19 1037 | C.52 r4 G.19 1038 | C.53 r4 G.19 1039 | C.54 r4 G.19 1040 | C.56 r4 G.19 1041 | C.57 r4 G.19 1042 | C.58 r4 G.19 1043 | C.60 r4 G.19 1044 | C.61 r4 G.19 1045 | C.61 r4 G.25 1046 | C.63 r4 G.19 1047 | C.63 r4 G.31 1048 | C.66 r4 G.47 1049 | C.69 r4 G.19 
1050 | C.69 r4 G.47 1051 | C.71 r4 G.19 1052 | C.74 r4 G.19 1053 | C.74 r4 G.47 1054 | C.75 r4 G.19 1055 | C.75 r4 G.46 1056 | C.77 r4 G.33 1057 | C.78 r4 G.19 1058 | C.79 r4 G.19 1059 | C.80 r4 G.19 1060 | C.81 r4 G.19 1061 | C.82 r4 G.19 1062 | C.82 r4 G.46 1063 | C.83 r4 G.31 1064 | C.84 r4 G.19 1065 | C.85 r4 G.19 1066 | C.86 r4 G.19 1067 | C.89 r4 G.2 1068 | C.90 r4 G.19 1069 | C.92 r4 G.46 1070 | C.93 r4 G.19 1071 | C.93 r4 G.47 1072 | C.96 r4 G.33 1073 | C.98 r4 G.19 1074 | C.99 r4 G.2 1075 | C.99 r4 G.19 1076 | -------------------------------------------------------------------------------- /tests/data/preprocessed/atac_preprocessed.h5ad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinellolab/simba/7b25fd089873aba9580f9923f2e412d375de9a46/tests/data/preprocessed/atac_preprocessed.h5ad -------------------------------------------------------------------------------- /tests/data/preprocessed/rna_preprocessed.h5ad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinellolab/simba/7b25fd089873aba9580f9923f2e412d375de9a46/tests/data/preprocessed/rna_preprocessed.h5ad -------------------------------------------------------------------------------- /tests/test_pbg_training.py: -------------------------------------------------------------------------------- 1 | import simba as si 2 | import pytest 3 | 4 | 5 | @pytest.fixture 6 | def adata_CG(): 7 | return si.read_h5ad( 8 | "tests/data/preprocessed/rna_preprocessed.h5ad") 9 | 10 | 11 | @pytest.fixture 12 | def adata_CP(): 13 | return si.read_h5ad( 14 | "tests/data/preprocessed/atac_preprocessed.h5ad") 15 | 16 | 17 | def test_pbg_training_rna(adata_CG, tmp_path): 18 | si.settings.set_workdir(tmp_path / "simba_rna") 19 | si.tl.gen_graph(list_CG=[adata_CG], 20 | copy=False, 21 | dirname='graph0') 22 | si.tl.pbg_train(auto_wd=True, 23 | output='model') 24 | si.pl.pbg_metrics(fig_ncol=1, 
25 | save_fig=True) 26 | 27 | 28 | def test_pbg_training_atac(adata_CP, tmp_path): 29 | si.settings.set_workdir(tmp_path / "simba_atac") 30 | si.tl.gen_graph(list_CP=[adata_CP], 31 | copy=False, 32 | dirname='graph0') 33 | si.tl.pbg_train(auto_wd=True, 34 | output='model') 35 | si.pl.pbg_metrics(fig_ncol=1, 36 | save_fig=True) 37 | -------------------------------------------------------------------------------- /tests/test_post_training.py: -------------------------------------------------------------------------------- 1 | import simba as si 2 | import pytest 3 | 4 | 5 | @pytest.fixture 6 | def dict_adata(): 7 | 8 | return si.read_embedding( 9 | path_emb='tests/data/pbg_training/model/', 10 | path_entity='tests/data/pbg_training/input/entity/', 11 | path_entity_alias='tests/data/pbg_training') 12 | 13 | 14 | def test_embeddding_rna(dict_adata, tmp_path): 15 | si.settings.set_workdir(tmp_path / "simba_rna") 16 | adata_C = dict_adata['C'] 17 | adata_G = dict_adata['G'] 18 | adata_all_CG = si.tl.embed( 19 | adata_ref=adata_C, 20 | list_adata_query=[adata_G]) 21 | # add annotations of cells and genes 22 | adata_all_CG.obs['entity_anno'] = "" 23 | adata_all_CG.obs.loc[adata_C.obs_names, 'entity_anno'] = 'cell' 24 | adata_all_CG.obs.loc[adata_G.obs_names, 'entity_anno'] = 'gene' 25 | 26 | si.tl.umap(adata_all_CG, 27 | n_neighbors=15, 28 | n_components=2) 29 | adata_cmp = si.tl.compare_entities( 30 | adata_ref=adata_C, 31 | adata_query=adata_G) 32 | si.pl.entity_metrics(adata_cmp, 33 | x='max', 34 | y='gini', 35 | show_contour=False, 36 | texts=adata_G.obs_names[:2], 37 | show_texts=True, 38 | show_cutoff=True, 39 | size=5, 40 | text_expand=(1.3, 1.5), 41 | cutoff_x=1., 42 | cutoff_y=0.3, 43 | save_fig=True) 44 | si.pl.entity_barcode(adata_cmp, 45 | layer='softmax', 46 | entities=list(adata_G.obs_names[:2]), 47 | show_cutoff=True, 48 | cutoff=0.001, 49 | fig_size=(5, 2.5), 50 | save_fig=True) 51 | query_result = si.tl.query(adata_all_CG, 52 | 
entity=list(adata_C.obs_names[:2]), 53 | obsm='X_umap', 54 | use_radius=False, 55 | k=50, 56 | anno_filter='entity_anno', 57 | filters=['gene']) 58 | print(query_result.head()) 59 | si.pl.query(adata_all_CG, 60 | show_texts=False, 61 | color=['entity_anno'], 62 | alpha=0.9, 63 | alpha_bg=0.1, 64 | save_fig=True) 65 | -------------------------------------------------------------------------------- /tests/test_preprocessing.py: -------------------------------------------------------------------------------- 1 | import simba as si 2 | import pytest 3 | 4 | 5 | @pytest.fixture 6 | def adata_CG(): 7 | return si.read_h5ad("tests/data/10xpbmc_rna_subset.h5ad") 8 | 9 | 10 | @pytest.fixture 11 | def adata_CP(): 12 | return si.read_h5ad("tests/data/10xpbmc_atac_subset.h5ad") 13 | 14 | 15 | def test_rna(adata_CG, tmp_path): 16 | si.settings.set_workdir(tmp_path / "simba_rna") 17 | si.settings.set_figure_params(dpi=80, 18 | style='white', 19 | fig_size=[5, 5], 20 | rc={'image.cmap': 'viridis'}) 21 | si.pp.filter_genes(adata_CG, min_n_cells=3) 22 | si.pp.cal_qc_rna(adata_CG) 23 | si.pl.violin(adata_CG, 24 | list_obs=['n_counts', 'n_genes', 'pct_mt'], 25 | save_fig=True, 26 | fig_name='plot_violin.png') 27 | si.pp.filter_cells_rna(adata_CG, min_n_genes=2) 28 | si.pp.normalize(adata_CG, method='lib_size') 29 | si.pp.log_transform(adata_CG) 30 | si.pp.select_variable_genes(adata_CG, n_top_genes=2000) 31 | si.pl.variable_genes(adata_CG, 32 | show_texts=True, 33 | save_fig=True, 34 | fig_name='plot_variable_genes.png') 35 | si.tl.discretize(adata_CG, n_bins=5) 36 | si.pl.discretize(adata_CG, 37 | save_fig=True, 38 | fig_name='plot_discretize.png') 39 | 40 | 41 | def test_atac(adata_CP, tmp_path): 42 | si.settings.set_workdir(tmp_path / "simba_atac") 43 | si.pp.filter_peaks(adata_CP, min_n_cells=5) 44 | si.pp.cal_qc_atac(adata_CP) 45 | si.pl.hist(adata_CP, 46 | list_obs=['n_counts', 'n_peaks', 'pct_peaks'], 47 | log=True, 48 | list_var=['n_cells'], 49 | fig_size=(3, 3), 50 | 
save_fig=True, 51 | fig_name='plot_histogram.png') 52 | si.pp.filter_cells_atac(adata_CP, min_n_peaks=5) 53 | si.pp.pca(adata_CP, n_components=30) 54 | si.pl.pca_variance_ratio(adata_CP, 55 | show_cutoff=True, 56 | save_fig=True, 57 | fig_name='plot_variance_ratio.png') 58 | si.pp.select_pcs(adata_CP, n_pcs=10) 59 | si.pp.select_pcs_features(adata_CP) 60 | si.pl.pcs_features(adata_CP, 61 | fig_ncol=5, 62 | save_fig=True, 63 | fig_name='plot_pcs_features.png') 64 | 65 | 66 | def test_genescores(adata_CP): 67 | si.pp.filter_peaks(adata_CP, min_n_cells=5) 68 | si.pp.cal_qc_atac(adata_CP) 69 | si.pp.filter_cells_atac(adata_CP, min_n_peaks=5) 70 | si.pp.pca(adata_CP, n_components=30) 71 | si.pp.select_pcs(adata_CP, n_pcs=10) 72 | si.pp.select_pcs_features(adata_CP) 73 | 74 | adata_CG_atac = si.tl.gene_scores(adata_CP, 75 | genome='hg19', 76 | use_gene_weigt=True, 77 | use_top_pcs=True) 78 | print(adata_CG_atac) 79 | 80 | 81 | def test_integration(adata_CG): 82 | si.pp.filter_genes(adata_CG, min_n_cells=3) 83 | si.pp.cal_qc_rna(adata_CG) 84 | si.pp.filter_cells_rna(adata_CG, min_n_genes=2) 85 | si.pp.normalize(adata_CG, method='lib_size') 86 | si.pp.log_transform(adata_CG) 87 | si.pp.select_variable_genes(adata_CG, n_top_genes=2000) 88 | adata_C1C2 = si.tl.infer_edges( 89 | adata_CG, adata_CG, n_components=20, k=20) 90 | si.pl.node_similarity(adata_C1C2, 91 | cutoff=0.5, 92 | save_fig=True) 93 | si.pl.svd_nodes(adata_C1C2, 94 | cutoff=0.5, 95 | save_fig=True) 96 | si.tl.trim_edges(adata_C1C2, cutoff=0.5) 97 | --------------------------------------------------------------------------------