├── normbench
│   ├── __init__.py
│   ├── methods
│   │   ├── __init__.py
│   │   ├── lognorm_runner.py
│   │   ├── runBASICS.R
│   │   ├── method_runner.py
│   │   ├── glmpca_method_runner.py
│   │   ├── ad2seurat.py
│   │   ├── sctransform_method_runner.py
│   │   ├── data.py
│   │   ├── runSCNorm.R
│   │   ├── scvi_method_runner.py
│   │   └── runLocalRegression.R
│   └── testing
│       └── test_pyScTransform.py
├── requirements.txt
├── setup.py
├── packages.R
├── README.md
├── Dockerfile
├── .circleci
│   └── config.yml
├── run_benchmark.py
├── LICENSE
└── .gitignore

/normbench/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------

/normbench/methods/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------

/requirements.txt:
--------------------------------------------------------------------------------
anndata==0.6.22.post1
anndata2ri
glmpca==0.1.0
dataclasses
loompy
pytest
matplotlib>=3.1.1
numpy
requests
rpy2
scanpy
scvi==0.5.0
scikit-learn
statsmodels
torch
torchvision
--------------------------------------------------------------------------------

/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages

setup(name='normbench',
      version='0.1',
      description='Benchmarking normalization methods',
      url='https://github.com/normjam/benchmark',
      author='Normjam benchmark team',
      author_email='',
      # find_packages() also picks up normbench.methods and normbench.testing,
      # which the bare ['normbench'] list left out of the install.
      packages=find_packages(),
      zip_safe=False)
--------------------------------------------------------------------------------

/normbench/testing/test_pyScTransform.py:
--------------------------------------------------------------------------------
import numpy as np
from normbench.methods import ad2seurat as a2s
from normbench.methods.data import pbmc3k

def test_pyScTransform():
    adata = pbmc3k()

    a2s.pyScTransform(adata)

    # Test that it runs
    assert 'normalized' in adata.layers

    # Test functionality against a known value for the pbmc3k dataset
    assert np.isclose(adata.layers['normalized'][0, 0], -0.03377807)
--------------------------------------------------------------------------------

/normbench/methods/lognorm_runner.py:
--------------------------------------------------------------------------------
import scanpy as sc

from .method_runner import MethodRunner


class LogNormalizationRunner(MethodRunner):

    def __init__(self, data, verbose):
        MethodRunner.__init__(self, data, verbose)

    def run(self):
        # Normalize in place: with copy=True the normalized copy was discarded,
        # so log1p (and the dump below) ran on the raw counts.
        sc.pp.normalize_per_cell(self.data)
        sc.pp.log1p(self.data)

        # Normalized matrix
        self.dump_to_h5ad("lognorm_normalized")
--------------------------------------------------------------------------------
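
For reference, a sketch of what the two scanpy calls above compute on a dense counts matrix: each cell is scaled to the median total count (scanpy's default when no target depth is given), then log1p-transformed. The function name is illustrative, not part of the package.

import numpy as np

def lognorm_dense(X):
    # X: cells x genes matrix of raw counts; assumes every cell has a nonzero total.
    counts_per_cell = X.sum(axis=1, keepdims=True)
    target = np.median(counts_per_cell)  # scanpy's default target depth
    return np.log1p(X / counts_per_cell * target)
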
devtools::install_github("catavallejos/BASiCS", ref = "batches") 16 | 17 | devtools::install_github("hms-dbmi/pagoda2") 18 | devtools::install_github("hms-dbmi/conos", ref="master") 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Benchmark 2 | 3 | ## Usage 4 | 5 | This repository defines a docker environment that contains R and Python kernels with the dependencies required to 6 | run the benchmarking scripts. R dependencies are installed by `packages.R` and python dependencies are listed in 7 | `requirements.txt` 8 | 9 | ## Build Docker 10 | 11 | Install [Docker](https://docs.docker.com/v17.09/engine/installation/). Run, from the root of the repository: 12 | 13 | `> docker build -t normjam -f Dockerfile .` 14 | 15 | ## Use container 16 | 17 | When docker is installed on your system, the following command will drop you into a bash shell that exposes R & python 18 | installed. 19 | 20 | `> docker run -it normjam` 21 | -------------------------------------------------------------------------------- /normbench/methods/runBASICS.R: -------------------------------------------------------------------------------- 1 | 2 | runBASiCS<-function(Seurat_Obj) 3 | { 4 | library(BASiCS) 5 | umiData<-Seurat_Obj@assays$RNA@counts 6 | umiData <- SingleCellExperiment::SingleCellExperiment(assays = list('counts' = umiData)) 7 | 8 | # naive test of BASiCS, no batches, no clustering information, data are expected to be filtered with some QC 9 | Chain <- BASiCS_MCMC(Data = umiData, N = 1000, Thin = 10, Burn = 500, 10 | WithSpikes = FALSE, 11 | PrintProgress = FALSE, Regression = TRUE) 12 | 13 | DenoisedCounts <- BASiCS_DenoisedCounts(Data = umiData, Chain = Chain) 14 | str(DenoisedCounts) 15 | write.table(DenoisedCounts, "BASiCS_NormalizedCounts.tsv", sep = "\t") 16 | return(DenoisedCounts) 17 | } -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM r-base:3.6.1 2 | 3 | ENV DEBIAN_FRONTEND=noninteractive 4 | 5 | RUN apt-get update && apt-get install -y --no-install-recommends \ 6 | build-essential \ 7 | libblas-dev \ 8 | liblapack-dev \ 9 | gfortran \ 10 | python3.7 \ 11 | python3-pip \ 12 | python3-setuptools \ 13 | python3-dev \ 14 | cmake \ 15 | libcurl4-openssl-dev \ 16 | libgsl0-dev \ 17 | libeigen3-dev \ 18 | libssl-dev \ 19 | libcairo2-dev \ 20 | libxt-dev \ 21 | libxml2-dev \ 22 | libgtk2.0-dev \ 23 | libcairo2-dev \ 24 | xvfb \ 25 | xauth \ 26 | xfonts-base \ 27 | libz-dev \ 28 | libhdf5-dev 29 | 30 | WORKDIR /app 31 | 32 | COPY . . 33 | 34 | RUN pip3 install -r requirements.txt 35 | RUN Rscript packages.R 36 | RUN pip3 install . 37 | 38 | ENTRYPOINT ["/bin/bash"] 39 | -------------------------------------------------------------------------------- /normbench/methods/method_runner.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import time 3 | import loompy 4 | 5 | class MethodRunner(abc.ABC): 6 | """ Interface class encapsulating required methods to run a normalization algorithm. 
""" 7 | def __init__(self, data, verbose): 8 | self.data = data 9 | self.verbose = verbose 10 | 11 | @abc.abstractmethod 12 | def run(self): 13 | raise NotImplementedError("Subclasses of MethodRunner are required to override this method.") 14 | 15 | def dump_to_loom(self, filename, latent_matrix, row_attrs, col_attrs): 16 | filename = filename + "_" + str(int(time.time())) + ".loom" 17 | loompy.create(filename, latent_matrix, row_attrs, col_attrs) 18 | 19 | def dump_to_h5ad(self, filename): 20 | filename = filename + "_" + str(int(time.time())) + ".h5ad" 21 | self.data.write(filename) 22 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | jobs: 3 | build: 4 | working_directory: /app 5 | docker: 6 | - image: docker:17.05.0-ce-git 7 | steps: 8 | - checkout 9 | - setup_remote_docker 10 | - restore_cache: 11 | keys: 12 | - v1-{{ .Branch }} 13 | paths: 14 | - /caches/app.tar 15 | - run: 16 | name: Load Docker image layer cache 17 | command: | 18 | set +o pipefail 19 | docker load -i /caches/app.tar | true 20 | - run: 21 | name: Build application Docker image 22 | command: | 23 | docker build --cache-from=app -t app . 24 | - run: 25 | name: Save Docker image layer cache 26 | command: | 27 | mkdir -p /caches 28 | docker save -o /caches/app.tar app 29 | - run: 30 | name: Run tests 31 | command: | 32 | docker run app pytest 33 | - save_cache: 34 | key: v1-{{ .Branch }}-{{ epoch }} 35 | paths: 36 | - /caches/app.tar 37 | -------------------------------------------------------------------------------- /run_benchmark.py: -------------------------------------------------------------------------------- 1 | from normbench.methods import LogNormalizationRunner, GLMPCAMethodRunner, data, ScViMethodRunner, \ 2 | ScTransformMethodRunner 3 | 4 | VERBOSE = False 5 | 6 | # Pull datasets 7 | datasets = { 8 | "pbmc3k": data.pbmc3k(), 9 | "cite_seq_bone_marrow": data.cite_seq_bone_marrow(), 10 | "endoderm_downsampled": data.endoderm_downsampled(), 11 | "endoderm_full": data.endoderm_full() 12 | } 13 | 14 | # Run datasets on existing methods 15 | for data_name, data in datasets: 16 | print(f"Running scVI on dataset {data_name}") 17 | ScViMethodRunner(data, VERBOSE).run() 18 | 19 | print(f"Running scTransform on dataset {data_name}") 20 | ScTransformMethodRunner(data, VERBOSE).run() 21 | 22 | print(f"Running LogNorm on dataset {data_name}") 23 | LogNormalizationRunner(data, VERBOSE).run() 24 | 25 | print(f"Running GLMPCA (Poisson) on dataset {data_name}") 26 | GLMPCAMethodRunner(data, VERBOSE).run() 27 | 28 | print(f"Running GLMPCA (Negative Binomial) on dataset {data_name}") 29 | GLMPCAMethodRunner(data, VERBOSE, likelihood="nb").run() 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 normjam 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this 
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 normjam

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------

/normbench/methods/glmpca_method_runner.py:
--------------------------------------------------------------------------------
import numpy as np
from glmpca.glmpca import glmpca
from scipy.sparse import issparse

from .method_runner import MethodRunner


class GLMPCAMethodRunner(MethodRunner):

    def __init__(self, data, verbose, n_latent=10, likelihood="poi"):
        """
        Contains parameters for running GLM-PCA normalization. n_latent is the number of latent
        dimensions and likelihood is the likelihood function to use: generally "poi" for Poisson
        or "nb" for negative binomial.
        """

        MethodRunner.__init__(self, data, verbose)
        self.n_latent = n_latent
        self.likelihood = likelihood

    def run(self):
        # glmpca expects a dense genes x cells matrix; densify only if X is sparse.
        X = self.data.X
        Y = (X.todense() if issparse(X) else X).T
        res = glmpca(Y, self.n_latent, fam=self.likelihood)

        # Normalized matrix: reconstruct cells x genes from factors and loadings
        norm = np.dot(res['factors'], res['loadings'].T)
        self.data.obsm['X_norm'] = norm
        self.data.obsm['X_emb'] = res['factors']

        self.dump_to_h5ad(f"glmpca_{self.likelihood}")
--------------------------------------------------------------------------------
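
A usage sketch for the runner above, assuming an AnnData object with raw counts in .X; likelihood="nb" selects the negative-binomial fit, as in run_benchmark.py:

from normbench.methods import data
from normbench.methods.glmpca_method_runner import GLMPCAMethodRunner

adata = data.pbmc3k()
runner = GLMPCAMethodRunner(adata, verbose=False, n_latent=10, likelihood="nb")
runner.run()  # writes glmpca_nb_<timestamp>.h5ad; X_norm and X_emb land in adata.obsm
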
/normbench/methods/ad2seurat.py:
--------------------------------------------------------------------------------
import scanpy as sc
from scipy.sparse import issparse


def pyScTransform(adata, output_file=None):
    """
    Call Seurat's SCTransform from Python via rpy2/anndata2ri and store the
    resulting scale.data matrix in adata.layers['normalized'].
    """
    import rpy2.robjects as ro
    import anndata2ri

    ro.r('library(Seurat)')
    ro.r('library(scater)')
    anndata2ri.activate()

    sc.pp.filter_genes(adata, min_cells=5)

    # The conversion to R requires sparse matrices with sorted indices.
    if issparse(adata.X):
        if not adata.X.has_sorted_indices:
            adata.X.sort_indices()

    for key in adata.layers:
        if issparse(adata.layers[key]):
            if not adata.layers[key].has_sorted_indices:
                adata.layers[key].sort_indices()

    ro.globalenv['adata'] = adata

    ro.r('seurat_obj = as.Seurat(adata, counts="X", data = NULL)')

    ro.r('res <- SCTransform(object=seurat_obj, return.only.var.genes = FALSE, do.correct.umi = FALSE)')

    norm_x = ro.r('res@assays$SCT@scale.data').T

    adata.layers['normalized'] = norm_x

    if output_file:
        adata.write(output_file)
--------------------------------------------------------------------------------
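
This is the function exercised by normbench/testing/test_pyScTransform.py; a direct-call sketch, assuming an R installation with Seurat and scater on the library path (the output filename is illustrative):

from normbench.methods import ad2seurat
from normbench.methods.data import pbmc3k

adata = pbmc3k()
ad2seurat.pyScTransform(adata, output_file="pbmc3k_sctransform.h5ad")
print(adata.layers['normalized'].shape)  # cells x genes scale.data from SCTransform
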
/normbench/methods/sctransform_method_runner.py:
--------------------------------------------------------------------------------
import anndata2ri
import rpy2.robjects as ro
import scanpy as sc
from scipy.sparse import issparse

from .method_runner import MethodRunner


class ScTransformMethodRunner(MethodRunner):
    def __init__(self, data, verbose):
        MethodRunner.__init__(self, data, verbose)

    def run(self):
        """
        Call scTransform from Python and store the corrected data matrix in
        self.data.layers['normalized'].
        """

        ro.r('library(Seurat)')
        ro.r('library(scater)')
        anndata2ri.activate()

        sc.pp.filter_genes(self.data, min_cells=5)

        # The conversion to R requires sparse matrices with sorted indices.
        if issparse(self.data.X):
            if not self.data.X.has_sorted_indices:
                self.data.X.sort_indices()

        for key in self.data.layers:
            if issparse(self.data.layers[key]):
                if not self.data.layers[key].has_sorted_indices:
                    self.data.layers[key].sort_indices()

        ro.globalenv['adata'] = self.data

        ro.r('seurat_obj = as.Seurat(adata, counts="X", data = NULL)')

        ro.r('res <- SCTransform(object=seurat_obj)')

        norm_x = ro.r('res@assays$SCT@data').T

        self.data.layers['normalized'] = norm_x

        self.dump_to_h5ad("scTransform")
--------------------------------------------------------------------------------

/normbench/methods/data.py:
--------------------------------------------------------------------------------
import requests
from io import BytesIO

from anndata import AnnData, read_h5ad


def _download_adata(url) -> AnnData:
    response = requests.get(url)
    f = BytesIO(response.content)
    return read_h5ad(f)


def _no_unsigned_int(pdata):
    """Cast unsigned-integer columns to int64, likely to ease conversion to R,
    which has no unsigned integer types. Note that uint64 values above the
    int64 range would overflow."""
    for i, t in enumerate(pdata.dtypes):
        if t.name.startswith('u'):  # this is an unsigned integer
            colname = pdata.columns[i]
            pdata[colname] = pdata[colname].astype(int)


def clean_adata(adata) -> AnnData:
    """Helper function to clean an AnnData object in place."""
    _no_unsigned_int(adata.var)
    _no_unsigned_int(adata.obs)
    return adata


def pbmc3k() -> AnnData:
    return clean_adata(_download_adata("https://ndownloader.figshare.com/files/18789473?private_link=6e12f5f56bb46842f5ee"))


def cite_seq_bone_marrow() -> AnnData:
    return clean_adata(_download_adata("https://ndownloader.figshare.com/files/18788633?private_link=cfbc86f6a399343677ea"))


def endoderm_full() -> AnnData:
    return clean_adata(_download_adata("https://ndownloader.figshare.com/files/18789569?private_link=fe99143697ea99121240"))


def endoderm_downsampled() -> AnnData:
    return clean_adata(_download_adata("https://ndownloader.figshare.com/files/18788645?private_link=c204122a8d0550282502"))
--------------------------------------------------------------------------------
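
A sketch confirming the unsigned-integer cleanup on one of the hosted datasets:

from normbench.methods import data

adata = data.pbmc3k()
# clean_adata has already cast any unsigned obs/var columns to int64.
assert not any(t.name.startswith('u') for t in adata.obs.dtypes)
assert not any(t.name.startswith('u') for t in adata.var.dtypes)
print(adata)
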
/normbench/methods/runSCNorm.R:
--------------------------------------------------------------------------------
runSCNorm <- function(Seurat_Obj)
{
  library(SCnorm)
  library(Seurat)
  Conditions = Idents(Seurat_Obj)
  umiData <- SingleCellExperiment::SingleCellExperiment(assays = list('counts' = Seurat_Obj@assays$RNA@counts))

  str(umiData)
  head(Conditions)

  # pdf("check_exampleData_count-depth_evaluation.pdf", height=5, width=7)
  # countDeptEst <- plotCountDepth(Data = umiData, Conditions = Conditions,
  #                                FilterCellProportion = .1, NCores=1)
  # dev.off()
  # str(countDeptEst)
  # head(countDeptEst[[1]])

  DataNorm <- SCnorm(Data = umiData,
                     Conditions = Conditions,
                     PrintProgressPlots = TRUE,
                     FilterCellNum = 10,
                     PropToUse = .1,
                     Thresh = .1,
                     ditherCounts = TRUE)

  str(DataNorm@assays@data$counts)
  str(DataNorm@assays@data$normcounts)

  write.table(DataNorm@assays@data$normcounts, "NormalizedCounts.tsv", sep = "\t")
  return(DataNorm@assays@data$normcounts)
  # Seurat has a Convert function; converting the normalized counts back to a
  # Seurat object and exporting to loom did not work and required a Seurat
  # re-installation:
  # NormSeurat_Obj <- CreateSeuratObject(raw.data = Matrix(DataNorm@assays@data$normcounts), project = "test")
  # pfile <- Convert(from = NormSeurat_Obj, to = "loom", filename = "NormCounts.loom",
  #                  display.progress = FALSE)
  # pfile
}
--------------------------------------------------------------------------------

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
--------------------------------------------------------------------------------

/normbench/methods/scvi_method_runner.py:
--------------------------------------------------------------------------------
import anndata
import numpy as np
from scvi.dataset import AnnDatasetFromAnnData
from scvi.inference import UnsupervisedTrainer
from scvi.models import VAE
from sklearn.preprocessing import LabelEncoder

from .method_runner import MethodRunner


class ScViMethodRunner(MethodRunner):

    def __init__(self, data, verbose, batch=1, highly_variable_genes=None):
        # `batch` must name a column of data.obs; the default of 1 only passes
        # validation if such a column exists.
        MethodRunner.__init__(self, data, verbose)
        self.batch = batch
        self.highly_variable_genes = highly_variable_genes

        self.validate_method_parameters()

    def run(self):
        n_epochs = 100
        n_latent = 10
        n_hidden = 128
        n_layers = 2
        net_data = self.data.copy()
        net_data.X = self.data.layers['counts']
        del net_data.layers['counts']
        net_data.raw = None  # Ensure that the raw counts are not accidentally used

        # Define batch indices
        le = LabelEncoder()
        net_data.obs['batch_indices'] = le.fit_transform(net_data.obs[self.batch].values)
        net_data = AnnDatasetFromAnnData(net_data)
        vae = VAE(net_data.nb_genes, reconstruction_loss='nb', n_batch=net_data.n_batches, n_layers=n_layers,
                  n_latent=n_latent, n_hidden=n_hidden)
        trainer = UnsupervisedTrainer(vae, net_data, train_size=1, use_cuda=False)
        trainer.train(n_epochs=n_epochs, lr=1e-3)
        full = trainer.create_posterior(trainer.model, net_data, indices=np.arange(len(net_data)))
        latent, _, _ = full.sequential().get_latent()
        self.data.obsm['X_emb'] = latent
        self.dump_to_h5ad("scvi")

    def validate_method_parameters(self):
        """
        Checks the input parameters for the scVI method runner and raises an exception if they fail
        a basic set of sanity checks around typing.
        """

        # Validate data
        if type(self.data) is not anndata.AnnData:
            raise TypeError("Input data is not a valid AnnData object")
        if 'counts' not in self.data.layers:
            raise TypeError("Input data does not contain a counts layer in `data.layers['counts']`")

        # Validate batch
        obs = self.data.obs
        if self.batch not in obs:
            raise ValueError(f"Column {self.batch} is not in obs")
        elif self.verbose:
            print(f"Object contains {obs[self.batch].nunique()} batches.")

        # Validate highly variable genes
        if self.highly_variable_genes:
            if type(self.highly_variable_genes) is not list:
                raise TypeError("HVG list is not a list")
            else:
                data_var = self.data.var
                if not all(i in data_var.index for i in self.highly_variable_genes):
                    raise ValueError("Not all HVGs are in the data object")
--------------------------------------------------------------------------------
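
validate_method_parameters above requires a raw counts layer and a batch column in .obs before run() is called; a preparation sketch (the 'batch' column name and dummy value are illustrative):

from normbench.methods import data
from normbench.methods.scvi_method_runner import ScViMethodRunner

adata = data.pbmc3k()
adata.layers['counts'] = adata.X.copy()  # the runner trains on this layer
adata.obs['batch'] = 'batch0'            # single dummy batch; real data would vary
ScViMethodRunner(adata, verbose=True, batch='batch').run()
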
/normbench/methods/runLocalRegression.R:
--------------------------------------------------------------------------------
# runner function
# sce - single cell experiment data structure
runLocalRegressionFromSCE <- function(sce,n.cores=detectCores()) {
  # extract count matrix; the assay list is indexed with $, as elsewhere in the repo
  counts <- sce@assays@data$counts;
  runLocalRegressionFromMatrix(counts,n.cores=n.cores)
}

# cd - matrix, should be castable to dgCMatrix; rows - genes; columns - cells
runLocalRegressionFromMatrix <- function(cd,n.cores=detectCores()) {
  cd <- as(cd,'dgCMatrix'); # assert format
  # get the count matrix
  #cd <- seurat.object@assays$RNA@counts;
  rownames(cd) <- make.unique(rownames(cd))

  # run local regression with an increasing size of clusters
  r <- strawnorm(cd,res=c(0.5,1,2,3),n.cores=n.cores);
  # return normalized matrix
  x <- r$counts;
  # normalize gene variance
  x@x <- x@x*rep(r$misc[['varinfo']][colnames(x),'gsf'],diff(x@p))
  x
}

####
# the method itself
require(conos)
require(edgeR)
require(pagoda2)
require(parallel)
require(Matrix)
require(robustbase)


# strawman normalization
# local regression - iterative regression of each cell against combined expression profiles of local clusters
strawnorm <- function(cd,res=c(0.5,1,2,3),fix.init.depth=NULL,n.cores=30,nPcs=30,n.odgenes=2e3,k=30,use.edgeR.correction=TRUE,verbose=TRUE) {

  # refine lib sizes using robust regression against the cluster total
  refine.lib.sizes <- function(counts,groups) {

    # collapse all molecules within clusters into a meta-cell
    lvec <- conos:::colSumByFactor(counts,as.factor(groups))[-1,,drop=F];
    lvec <- t(lvec/pmax(1,Matrix::rowSums(lvec)))*1e4

    x <- pagoda2:::papply(1:length(levels(groups)),function(j) {
      ii <- names(groups)[which(groups==j)]
      if(length(ii)<2) { return(setNames(c(rowSums(counts[ii,])/1e4),ii)) } # if it's just one cell, don't bother
      av <- lvec[,j]
      avi <- which(av>0)
      av <- av[avi]
      cvm <- as.matrix(counts[ii,avi])

      x <- unlist(lapply(ii,function(i) {
        cv <- cvm[i,]
        as.numeric(coef(robustbase::glmrob(cv~av+0,family=poisson(link='identity'),start=sum(cv)/1e4)))
      }))
      names(x) <- ii;
      x
    },n.cores=n.cores)


    if(use.edgeR.correction) {
      # even out cluster scales against each other
      f <- edgeR::calcNormFactors(lvec)
      f <- f/exp(mean(log(f)))
      x <- lapply(1:length(x),function(i) {
        x[[i]]*f[i]
      })
    }

    lib.sizes <- unlist(x)[rownames(counts)]
    # global scaling
    lib.sizes <- lib.sizes/mean(lib.sizes)*mean(Matrix::rowSums(counts))
  }

  # subsample columns of a sparse matrix to approximately a desired depth
  subsample.cell.depth <- function(m,depth) {
    p.sample <- pmin(1,rep(depth/colSums(m),diff(m@p)))
    m@x <- as.numeric(rbinom(length(m@x),m@x,p.sample))
    m
  }


  internal.norm.loop <- function(cd,groups=NULL,res=1) {
    if(!is.null(groups)) {
      lib.sizes <- refine.lib.sizes(t(cd),groups); # regression-based estimates
    } else {
      lib.sizes <- NULL; # use number of molecules estimate
    }
    # make a p2 object
    p2process(cd,lib.sizes=lib.sizes,n.cores=n.cores,nPcs=nPcs,k=k,n.odgenes=n.odgenes,res=res,verbose=F)
  }

  ccd <- cd;
  if(!is.null(fix.init.depth)) {
    if(is.logical(fix.init.depth) & fix.init.depth) {
      init.depth <- min(colSums(cd))
      cat("limiting cell depth for the initial round to the smallest cell: ",init.depth,"\n")
    } else if(is.numeric(fix.init.depth)) {
      init.depth <- fix.init.depth;
      cat("limiting cell depth for the initial round to the specified: ",init.depth,"\n")
    }
    # subsample cells
    ccd <- subsample.cell.depth(ccd,init.depth)
  }


  res <- c(res,res[length(res)])

  cat("initial processing ...")
  r <- internal.norm.loop(ccd,res=res[1],groups=NULL)
  cat(" done\n")

  cat("processing iterations ");
  for(i in res) {
    groups <- r$clusters$PCA[[1]];
    r <- internal.norm.loop(cd,res=i,groups=groups)
    cat(".")
  }
  cat(" done\n")
  return(r);
}


# basic p2 processing
p2process <- function(cd,lib.sizes=NULL,n.cores=30,nPcs=30,n.odgenes=2e3,res=1,k=30,verbose=FALSE) {
  r <- Pagoda2$new(cd,lib.sizes=lib.sizes,log.scale=TRUE, n.cores=n.cores,verbose=verbose)
  # varnorm, PCA, kNN, cluster
  r$adjustVariance(plot=F,gam.k=10,verbose=F);
  r$calculatePcaReduction(nPcs=nPcs,n.odgenes=n.odgenes,verbose=F);
  r$makeKnnGraph(k=k,type='PCA',center=T,distance='cosine',verbose=F);
  r$getKnnClusters(type='PCA',method=conos:::leiden.community,resolution=res)
  r
}
--------------------------------------------------------------------------------