├── normbench
    ├── __init__.py
    ├── methods
    │   ├── __init__.py
    │   ├── lognorm_runner.py
    │   ├── runBASICS.R
    │   ├── method_runner.py
    │   ├── glmpca_method_runner.py
    │   ├── ad2seurat.py
    │   ├── sctransform_method_runner.py
    │   ├── data.py
    │   ├── runSCNorm.R
    │   ├── scvi_method_runner.py
    │   └── runLocalRegression.R
    └── testing
    │   └── test_pyScTransform.py
├── requirements.txt
├── setup.py
├── packages.R
├── README.md
├── Dockerfile
├── .circleci
    └── config.yml
├── run_benchmark.py
├── LICENSE
└── .gitignore


/normbench/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/normbench/methods/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | anndata==0.6.22.post1
 2 | anndata2ri
 3 | glmpca==0.1.0
 4 | dataclasses
 5 | loompy
 6 | pytest
 7 | matplotlib>=3.1.1
 8 | numpy
 9 | requests
10 | rpy2
11 | scanpy
12 | scvi==0.5.0
13 | sklearn
14 | statsmodels
15 | torch
16 | torchvision
17 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | setup(name='normbench',
 4 |       version='0.1',
 5 |       description='Benchmarking normalization methods',
 6 |       url='https://github.com/normjam/benchmark',
 7 |       author='Normjam benchmark team',
 8 |       author_email='',
 9 |       packages=['normbench'],
10 |       zip_safe=False)
11 | 
12 | 


--------------------------------------------------------------------------------
/normbench/testing/test_pyScTransform.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from normbench.methods import ad2seurat as a2s
 3 | from normbench.methods.data import pbmc3k
 4 | 
 5 | def test_pyScTransform():
 6 |     adata = pbmc3k()
 7 | 
 8 |     a2s.pyScTransform(adata)
 9 | 
10 |     # Test that it runs
11 |     assert 'normalized' in adata.layers
12 | 
13 |     # Test functionality
14 |     assert np.isclose(adata.layers['normalized'][0,0], -0.03377807)
15 | 


--------------------------------------------------------------------------------
/normbench/methods/lognorm_runner.py:
--------------------------------------------------------------------------------
 1 | import scanpy as sc
 2 | 
 3 | from .method_runner import MethodRunner
 4 | 
 5 | 
 6 | class LogNormalizationRunner(MethodRunner):
 7 | 
 8 |     def __init__(self, data, verbose):
 9 |         MethodRunner.__init__(self, data, verbose)
10 | 
11 |     def run(self):
12 |         sc.pp.normalize_per_cell(self.data, copy=True)
13 |         sc.pp.log1p(self.data)
14 | 
15 |         # Normalized matrix
16 |         self.dump_to_h5ad("lognorm_normalized")
17 | 


--------------------------------------------------------------------------------
/packages.R:
--------------------------------------------------------------------------------
 1 | install.packages("Seurat")
 2 | install.packages("devtools")
 3 | 
 4 | library("devtools")
 5 | install.packages(c("RcppEigen", "urltools", "Rtsne","BiocManager","robustbase"))
 6 | 
 7 | BiocManager::install(
 8 |   c("GO.db", "org.Hs.eg.db","org.Mm.eg.db", "pcaMethods","DESeq2","edgeR","BiocGenerics", "BASiCS", "SCnorm"),
 9 |   version = "3.10",
10 |   update = TRUE,
11 |   ask = FALSE
12 | )
13 | 
14 | devtools::install_github("rhondabacher/SCnorm")
15 | devtools::install_github("catavallejos/BASiCS", ref = "batches")
16 | 
17 | devtools::install_github("hms-dbmi/pagoda2")
18 | devtools::install_github("hms-dbmi/conos", ref="master")
19 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Benchmark
 2 | 
 3 | ## Usage
 4 | 
 5 | This repository defines a docker environment that contains R and Python kernels with the dependencies required to
 6 | run the benchmarking scripts. R dependencies are installed by `packages.R` and python dependencies are listed in
 7 | `requirements.txt`
 8 | 
 9 | ## Build Docker
10 | 
11 | Install [Docker](https://docs.docker.com/v17.09/engine/installation/). Run, from the root of the repository:
12 | 
13 | `> docker build -t normjam -f Dockerfile .`
14 | 
15 | ## Use container
16 | 
17 | When docker is installed on your system, the following command will drop you into a bash shell that exposes R & python
18 | installed.
19 | 
20 | `> docker run -it normjam`
21 | 


--------------------------------------------------------------------------------
/normbench/methods/runBASICS.R:
--------------------------------------------------------------------------------
 1 | 
 2 | runBASiCS<-function(Seurat_Obj)
 3 | {
 4 |   library(BASiCS)
 5 |   umiData<-Seurat_Obj@assays$RNA@counts
 6 |   umiData <- SingleCellExperiment::SingleCellExperiment(assays = list('counts' = umiData))
 7 |   
 8 |   # naive test of BASiCS, no batches, no clustering information, data are expected to be filtered with some QC
 9 |   Chain <- BASiCS_MCMC(Data = umiData, N = 1000, Thin = 10, Burn = 500, 
10 |                        WithSpikes = FALSE,
11 |                        PrintProgress = FALSE, Regression = TRUE)
12 |   
13 |   DenoisedCounts <- BASiCS_DenoisedCounts(Data = umiData, Chain = Chain)
14 |   str(DenoisedCounts)
15 |   write.table(DenoisedCounts, "BASiCS_NormalizedCounts.tsv", sep = "\t")
16 |    return(DenoisedCounts)
17 | }


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM r-base:3.6.1
 2 | 
 3 | ENV DEBIAN_FRONTEND=noninteractive
 4 | 
 5 | RUN apt-get update && apt-get install -y --no-install-recommends \
 6 |     build-essential \
 7 |     libblas-dev \
 8 |     liblapack-dev \
 9 |     gfortran \
10 |     python3.7 \
11 |     python3-pip \
12 |     python3-setuptools \
13 |     python3-dev \
14 |     cmake \
15 |     libcurl4-openssl-dev \
16 |     libgsl0-dev \
17 |     libeigen3-dev \
18 |     libssl-dev \
19 |     libcairo2-dev \
20 |     libxt-dev \
21 |     libxml2-dev \
22 |     libgtk2.0-dev \
23 |     libcairo2-dev \
24 |     xvfb  \
25 |     xauth \
26 |     xfonts-base \
27 |     libz-dev \
28 |     libhdf5-dev
29 | 
30 | WORKDIR /app
31 | 
32 | COPY . .
33 | 
34 | RUN pip3 install -r requirements.txt
35 | RUN Rscript packages.R
36 | RUN pip3 install .
37 | 
38 | ENTRYPOINT ["/bin/bash"]
39 | 


--------------------------------------------------------------------------------
/normbench/methods/method_runner.py:
--------------------------------------------------------------------------------
 1 | import abc
 2 | import time
 3 | import loompy
 4 | 
 5 | class MethodRunner(abc.ABC):
 6 |     """ Interface class encapsulating required methods to run a normalization algorithm. """
 7 |     def __init__(self, data, verbose):
 8 |         self.data = data
 9 |         self.verbose = verbose
10 | 
11 |     @abc.abstractmethod
12 |     def run(self):
13 |         raise NotImplementedError("Subclasses of MethodRunner are required to override this method.")
14 | 
15 |     def dump_to_loom(self, filename, latent_matrix, row_attrs, col_attrs):
16 |         filename = filename + "_" + str(int(time.time())) + ".loom"
17 |         loompy.create(filename, latent_matrix, row_attrs, col_attrs)
18 | 
19 |     def dump_to_h5ad(self, filename):
20 |         filename = filename + "_" + str(int(time.time())) + ".h5ad"
21 |         self.data.write(filename)
22 | 


--------------------------------------------------------------------------------
/.circleci/config.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | jobs:
 3 |   build:
 4 |     working_directory: /app
 5 |     docker:
 6 |       - image: docker:17.05.0-ce-git
 7 |     steps:
 8 |     - checkout
 9 |     - setup_remote_docker
10 |     - restore_cache:
11 |         keys:
12 |           - v1-{{ .Branch }}
13 |         paths:
14 |           - /caches/app.tar
15 |     - run:
16 |         name: Load Docker image layer cache
17 |         command: |
18 |           set +o pipefail
19 |           docker load -i /caches/app.tar | true
20 |     - run:
21 |         name: Build application Docker image
22 |         command: |
23 |           docker build --cache-from=app -t app .
24 |     - run:
25 |         name: Save Docker image layer cache
26 |         command: |
27 |           mkdir -p /caches
28 |           docker save -o /caches/app.tar app
29 |     - run:
30 |         name: Run tests
31 |         command: |
32 |             docker run app pytest
33 |     - save_cache:
34 |         key: v1-{{ .Branch }}-{{ epoch }}
35 |         paths:
36 |           - /caches/app.tar
37 | 


--------------------------------------------------------------------------------
/run_benchmark.py:
--------------------------------------------------------------------------------
 1 | from normbench.methods import LogNormalizationRunner, GLMPCAMethodRunner, data, ScViMethodRunner, \
 2 |     ScTransformMethodRunner
 3 | 
 4 | VERBOSE = False
 5 | 
 6 | # Pull datasets
 7 | datasets = {
 8 |     "pbmc3k": data.pbmc3k(),
 9 |     "cite_seq_bone_marrow": data.cite_seq_bone_marrow(),
10 |     "endoderm_downsampled": data.endoderm_downsampled(),
11 |     "endoderm_full": data.endoderm_full()
12 | }
13 | 
14 | # Run datasets on existing methods
15 | for data_name, data in datasets:
16 |     print(f"Running scVI on dataset {data_name}")
17 |     ScViMethodRunner(data, VERBOSE).run()
18 | 
19 |     print(f"Running scTransform on dataset {data_name}")
20 |     ScTransformMethodRunner(data, VERBOSE).run()
21 | 
22 |     print(f"Running LogNorm on dataset {data_name}")
23 |     LogNormalizationRunner(data, VERBOSE).run()
24 | 
25 |     print(f"Running GLMPCA (Poisson) on dataset {data_name}")
26 |     GLMPCAMethodRunner(data, VERBOSE).run()
27 | 
28 |     print(f"Running GLMPCA (Negative Binomial) on dataset {data_name}")
29 |     GLMPCAMethodRunner(data, VERBOSE, likelihood="nb").run()
30 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 normjam
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/normbench/methods/glmpca_method_runner.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from glmpca.glmpca import glmpca
 3 | 
 4 | from .method_runner import MethodRunner
 5 | 
 6 | 
 7 | class GLMPCAMethodRunner(MethodRunner):
 8 | 
 9 |     def __init__(self, data, verbose, n_latent=10, likelihood="poi"):
10 |         """
11 |         Contains parameters for running GLMPCA normalization. n_latent is the number of latent dimensions and the
12 |         likelihood is the likelihood function to use. Likelihood should generally be one of "poi" which represents
13 |         Poisson or "nb" which represents negative binomial.
14 |         """
15 | 
16 |         MethodRunner.__init__(self, data, verbose)
17 |         self.n_latent = n_latent
18 |         self.likelihood = likelihood
19 | 
20 |     def run(self):
21 |         Y = self.data.X.todense().T
22 |         res = glmpca(Y, self.n_latent, fam=self.likelihood)
23 | 
24 |         # Normalized matrix
25 |         norm = np.dot(res['factors'], res['loadings'].T)
26 |         self.data.obsm['X_norm'] = norm
27 |         self.data.obsm['X_emb'] = res['factors']
28 | 
29 |         self.dump_to_h5ad(f"glmpca_{self.likelihood}")
30 | 


--------------------------------------------------------------------------------
/normbench/methods/ad2seurat.py:
--------------------------------------------------------------------------------
 1 | import scanpy as sc
 2 | from scipy.sparse import issparse
 3 | 
 4 | 
 5 | def pyScTransform(adata, output_file=None):
 6 |     """
 7 |     Function to call scTransform from Python
 8 |     """
 9 |     import rpy2.robjects as ro
10 |     import anndata2ri
11 | 
12 |     ro.r('library(Seurat)')
13 |     ro.r('library(scater)')
14 |     anndata2ri.activate()
15 | 
16 |     sc.pp.filter_genes(adata, min_cells=5)
17 |     
18 |     if issparse(adata.X):
19 |         if not adata.X.has_sorted_indices:
20 |             adata.X.sort_indices()
21 | 
22 |     for key in adata.layers:
23 |         if issparse(adata.layers[key]):
24 |             if not adata.layers[key].has_sorted_indices:
25 |                 adata.layers[key].sort_indices()
26 | 
27 |     ro.globalenv['adata'] = adata
28 | 
29 |     ro.r('seurat_obj = as.Seurat(adata, counts="X", data = NULL)')
30 | 
31 |     ro.r('res <- SCTransform(object=seurat_obj, return.only.var.genes = FALSE, do.correct.umi = FALSE)')
32 | 
33 |     norm_x = ro.r('res@assays$SCT@scale.data').T
34 | 
35 |     adata.layers['normalized'] = norm_x
36 | 
37 |     if output_file:
38 |         adata.write(output_file)
39 | 


--------------------------------------------------------------------------------
/normbench/methods/sctransform_method_runner.py:
--------------------------------------------------------------------------------
 1 | import anndata2ri
 2 | import rpy2.robjects as ro
 3 | import scanpy as sc
 4 | from scipy.sparse import issparse
 5 | 
 6 | from .method_runner import MethodRunner
 7 | 
 8 | 
 9 | class ScTransformMethodRunner(MethodRunner):
10 |     def __init__(self, data, verbose):
11 |         MethodRunner.__init__(self, data, verbose)
12 | 
13 |     def run(self):
14 |         """
15 |         Function to call scTransform from Python
16 |         """
17 | 
18 |         ro.r('library(Seurat)')
19 |         ro.r('library(scater)')
20 |         anndata2ri.activate()
21 | 
22 |         sc.pp.filter_genes(self.data, min_cells=5)
23 | 
24 |         if issparse(self.data.X):
25 |             if not self.data.X.has_sorted_indices:
26 |                 self.data.X.sort_indices()
27 | 
28 |         for key in self.data.layers:
29 |             if issparse(self.data.layers[key]):
30 |                 if not self.data.layers[key].has_sorted_indices:
31 |                     self.data.layers[key].sort_indices()
32 | 
33 |         ro.globalenv['adata'] = self.data
34 | 
35 |         ro.r('seurat_obj = as.Seurat(adata, counts="X", data = NULL)')
36 | 
37 |         ro.r('res <- SCTransform(object=seurat_obj)')
38 | 
39 |         norm_x = ro.r('res@assays$SCT@data').T
40 | 
41 |         self.data.layers['normalized'] = norm_x
42 | 
43 |         self.dump_to_h5ad("scTransform")
44 | 


--------------------------------------------------------------------------------
/normbench/methods/data.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | from io import BytesIO
 3 | from anndata import AnnData, read_h5ad
 4 | import numpy as np
 5 | 
 6 | 
 7 | def _download_adata(url) -> AnnData:
 8 |     response = requests.get(url)
 9 |     f = BytesIO(response.content)
10 |     return read_h5ad(f)
11 | 
12 | 
13 | # Just cast unsigned ints to int64
14 | # Hope that this doesn't break (uint64)
15 | def _no_unsigned_int(pdata):
16 |     for i,t in enumerate(pdata.dtypes):
17 |         if t.name.startswith('u'): #this is an unsigned integer
18 |             colname = pdata.columns[i]
19 |             pdata[colname] = pdata[colname].astype(int)
20 | 
21 | 
22 | # Helper function to clean anndata object
23 | def clean_adata(adata) -> AnnData:
24 |     _no_unsigned_int(adata.var)
25 |     _no_unsigned_int(adata.obs)
26 |     return(adata)
27 |     
28 | def pbmc3k() -> AnnData:
29 |     return clean_adata(_download_adata("https://ndownloader.figshare.com/files/18789473?private_link=6e12f5f56bb46842f5ee"))
30 | 
31 | 
32 | def cite_seq_bone_marrow() -> AnnData:
33 |     return clean_adata(_download_adata("https://ndownloader.figshare.com/files/18788633?private_link=cfbc86f6a399343677ea"))
34 | 
35 | 
36 | def endoderm_full() -> AnnData:
37 |     return clean_adata(_download_adata("https://ndownloader.figshare.com/files/18789569?private_link=fe99143697ea99121240"))
38 | 
39 | 
40 | def endoderm_downsampled() -> AnnData:
41 |     return clean_adata(_download_adata("https://ndownloader.figshare.com/files/18788645?private_link=c204122a8d0550282502"))
42 | 
43 | 
44 | 


--------------------------------------------------------------------------------
/normbench/methods/runSCNorm.R:
--------------------------------------------------------------------------------
 1 | 
 2 | runSCNorm<-function(Seurat_Obj)
 3 | {
 4 |   library(SCnorm)
 5 |   library(Seurat)
 6 |   Conditions = Idents(Seurat_Obj)
 7 |   umiData <- SingleCellExperiment::SingleCellExperiment(assays = list('counts' = Seurat_Obj@assays$RNA@counts))
 8 |   
 9 |   str(umiData)
10 |   head(Conditions)
11 |   
12 |   # pdf("check_exampleData_count-depth_evaluation.pdf", height=5, width=7)
13 |   # countDeptEst <- plotCountDepth(Data = umiData, Conditions = Conditions,
14 |   #                                FilterCellProportion = .1, NCores=1)
15 |   # dev.off()
16 |   # str(countDeptEst)
17 |   # head(countDeptEst[[1]])
18 |   
19 |   DataNorm <- SCnorm(Data = umiData, 
20 |                      Conditions= Conditions,
21 |                      PrintProgressPlots = TRUE,
22 |                      FilterCellNum = 10,
23 |                      PropToUse = .1,
24 |                      Thresh = .1,
25 |                      ditherCounts = TRUE)
26 |   
27 |   str(DataNorm@assays@data$counts)
28 |   str(DataNorm@assays@data$normcounts)
29 | 
30 |   write.table(DataNorm@assays@data$normcounts, "NormalizedCounts.tsv", sep = "\t")
31 |   #NormSeurat_Obj<-CreateSeurat_Obj(raw.data = Matrix(DataNorm@assays@data$normcounts), project = "test")
32 |   return(DataNorm@assays@data$normcounts)
33 |   # Seurat has convert function, use it
34 |   # #this did not work, requires Seurat re-intallation ...
35 |   # NormSeurat_Obj<-CreateSeurat_Obj(raw.data = Matrix(DataNorm@assays@data$normcounts), project = "test")
36 |   # pfile <- Convert(from = NormSeurat_Obj, to = "loom", filename = "NormCounts.loom", 
37 |   #                  display.progress = FALSE)
38 |   # pfile
39 |   
40 | }
41 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 


--------------------------------------------------------------------------------
/normbench/methods/scvi_method_runner.py:
--------------------------------------------------------------------------------
 1 | import anndata
 2 | import numpy as np
 3 | from scvi.dataset import AnnDatasetFromAnnData
 4 | from scvi.inference import UnsupervisedTrainer
 5 | from scvi.models import VAE
 6 | from sklearn.preprocessing import LabelEncoder
 7 | 
 8 | from .method_runner import MethodRunner
 9 | 
10 | 
11 | class ScViMethodRunner(MethodRunner):
12 | 
13 |     def __init__(self, data, verbose, batch=1, highly_variable_genes=None):
14 |         MethodRunner.__init__(self, data, verbose)
15 |         self.batch = batch
16 |         self.highly_variable_genes = highly_variable_genes
17 | 
18 |         self.validate_method_parameters()
19 | 
20 |     def run(self):
21 |         n_epochs = 100
22 |         n_latent = 10
23 |         n_hidden = 128
24 |         n_layers = 2
25 |         net_data = self.data.copy()
26 |         net_data.X = self.data.layers['counts']
27 |         del net_data.layers['counts']
28 |         net_data.raw = None  # Ensure that the raw counts are not accidentally used
29 | 
30 |         # Define batch indices
31 |         le = LabelEncoder()
32 |         net_data.obs['batch_indices'] = le.fit_transform(net_data.obs[self.batch].values)
33 |         net_data = AnnDatasetFromAnnData(net_data)
34 |         vae = VAE(net_data.nb_genes, reconstruction_loss='nb', n_batch=net_data.n_batches, n_layers=n_layers,
35 |                   n_latent=n_latent, n_hidden=n_hidden)
36 |         trainer = UnsupervisedTrainer(vae, net_data, train_size=1, use_cuda=False)
37 |         trainer.train(n_epochs=n_epochs, lr=1e-3)
38 |         full = trainer.create_posterior(trainer.model, net_data, indices=np.arange(len(net_data)))
39 |         latent, _, _ = full.sequential().get_latent()
40 |         self.data.obsm['X_emb'] = latent
41 |         self.dump_to_h5ad("scvi")
42 | 
43 |     def validate_method_parameters(self):
44 |         """
45 |         Checks the input parameters for the scVI Method Runner and raises an exception if it fails a basic set of
46 |         sanity check around typing.
47 |         """
48 | 
49 |         # Validate data
50 |         if type(self.data) is not anndata.AnnData:
51 |             raise TypeError("Input data is not a valid AnnData object")
52 |         if 'counts' not in self.data.layers:
53 |             raise TypeError("Input data does not contain a `counts` layer in `data.layers[`counts`]`")
54 | 
55 |         # Validate batch
56 |         obs = self.data.obs
57 |         if self.batch not in obs:
58 |             raise ValueError(f"Column {self.batch} is not in obs")
59 |         elif self.verbose:
60 |             print(f"Object contains {obs[self.batch].nunique()} batches.")
61 | 
62 |         # Validate highly variable genes
63 |         if self.highly_variable_genes:
64 |             if type(self.highly_variable_genes) is not list:
65 |                 raise TypeError("HVG list is not a list")
66 |             else:
67 |                 data_var = self.data.var
68 |                 if not all(i in data_var.index for i in self.highly_variable_genes):
69 |                     raise ValueError("Not all HVGs are in the data object")
70 | 


--------------------------------------------------------------------------------
/normbench/methods/runLocalRegression.R:
--------------------------------------------------------------------------------
  1 | # runner function
  2 | # sce - single cell experiment data structure
  3 | runLocalRegressionFromSCE <- function(sce,n.cores=detectCores()) {
  4 |   # extract count matrix
  5 |   counts <- sce@assays@data@counts;
  6 |   runLocalRegressionFromMatrix(counts,n.cores=n.cores)
  7 | }
  8 | 
  9 | # cd - matrix, should be castable to dgCMatrix; rows - genes; columns- cells
 10 | runLocalRegressionFromMatrix <- function(cd,n.cores=detectCores()) {
 11 |   cd <- as(cd,'dgCMatrix'); # assert format
 12 |   # get the count matrix
 13 |   #cd <- seurat.object@assays$RNA@counts;
 14 |   rownames(cd) <- make.unique(rownames(cd))
 15 | 
 16 |   # run local regression with an increasing size of clusters
 17 |   r <- strawnorm(cd,res=c(0.5,1,2,3),n.cores=n.cores);
 18 |   # return normalized matrix
 19 |   x <- r$counts;
 20 |   # normalize gene variance
 21 |   x@x <- x@x*rep(r$misc[['varinfo']][colnames(x),'gsf'],diff(x@p))
 22 |   x
 23 | }
 24 | 
 25 | #### 
 26 | # the method itself
 27 | require(conos)
 28 | require(edgeR)
 29 | require(pagoda2)
 30 | require(parallel)
 31 | require(Matrix)
 32 | require(robustbase)
 33 | 
 34 | 
 35 | # strawman normalization
 36 | # local regression - iterative regression of each cell against combined expression profiles of local clusters
 37 | strawnorm <- function(cd,res=c(0.5,1,2,3),fix.init.depth=NULL,n.cores=30,nPcs=30,n.odgenes=2e3,k=30,use.edgeR.correction=TRUE,verbose=TRUE) {
 38 | 
 39 |   # refine lib sizes using robust regression against the cluster total
 40 |   refine.lib.sizes <- function(counts,groups) {
 41 | 
 42 |     # collapse all molecules within clusters into a meta-cell
 43 |     lvec <- conos:::colSumByFactor(counts,as.factor(groups))[-1,,drop=F];
 44 |     lvec <- t(lvec/pmax(1,Matrix::rowSums(lvec)))*1e4
 45 | 
 46 |     x <- pagoda2:::papply(1:length(levels(groups)),function(j) {
 47 |       ii <- names(groups)[which(groups==j)]
 48 |       if(length(ii)<2) { return(setNames(c(rowSums(counts[ii,])/1e4),ii)) } # if it's just one cell, don't bother
 49 |       av <- lvec[,j]
 50 |       avi <- which(av>0)
 51 |       av <- av[avi]
 52 |       cvm <- as.matrix(counts[ii,avi])
 53 |       
 54 |       x <- unlist(lapply(ii,function(i) {
 55 |         cv <- cvm[i,]
 56 |         as.numeric(coef(robustbase::glmrob(cv~av+0,family=poisson(link='identity'),start=sum(cv)/1e4)))
 57 |       }))
 58 |       names(x) <- ii;
 59 |       x
 60 |     },n.cores=n.cores)
 61 | 
 62 |     
 63 |     if(use.edgeR.correction) {
 64 |       # even out cluster scales against each other
 65 |       f <- edgeR::calcNormFactors(lvec)
 66 |       f <- f/exp(mean(log(f)))
 67 |       x <- lapply(1:length(x),function(i) {
 68 |         x[[i]]*f[i]
 69 |       })
 70 |     }
 71 | 
 72 |     lib.sizes <- unlist(x)[rownames(counts)]
 73 |     # global scaling
 74 |     lib.sizes <- lib.sizes/mean(lib.sizes)*mean(Matrix::rowSums(counts))
 75 |   }
 76 | 
 77 |   # subsample columns of a sparse matrix to approximately a desired depth
 78 |   subsample.cell.depth <- function(m,depth) {
 79 |     p.sample <- pmin(1,rep(depth/colSums(m),diff(m@p)))
 80 |     m@x <- as.numeric(rbinom(length(m@x),m@x,p.sample))
 81 |     m
 82 |   }
 83 | 
 84 | 
 85 | 
 86 |   internal.norm.loop <- function(cd,groups=NULL,res=1) {
 87 |     if(!is.null(groups)) {
 88 |       lib.sizes <- refine.lib.sizes(t(cd),groups); # regression-based estimates
 89 |     } else {
 90 |       lib.sizes <- NULL; # use number of molecules estimate
 91 |     }
 92 |     # make a p2 object
 93 |     p2process(cd,lib.sizes=lib.sizes,n.cores=n.cores,nPcs=nPcs,k=k,n.odgenes=n.odgenes,res=res,verbose=F)
 94 |   }
 95 | 
 96 |   ccd <- cd;
 97 |   if(!is.null(fix.init.depth)) {
 98 |     if(is.logical(fix.init.depth) & fix.init.depth) {
 99 |       init.depth <- min(colSums(cd))
100 |       cat("limiting cell depth for the initial round to the smallest cell: ",init.depth,"\n")
101 |     } else if(is.numeric(fix.init.depth)) {
102 |       init.depth <- fix.init.depth;
103 |       cat("limiting cell depth for the initial round to the specified: ",init.depth,"\n")
104 |     }
105 |     # subsample cells
106 |     ccd <- subsample.cell.depth(ccd,init.depth)
107 |   }
108 | 
109 |   
110 |   res <- c(res,res[length(res)])
111 | 
112 |   cat("initial processing ...")
113 |   r <- internal.norm.loop(ccd,res=res[1],groups=NULL)
114 |   cat(" done\n")
115 | 
116 |   cat("processing iterations ");
117 |   for(i in res) {
118 |     groups <- r$clusters$PCA[[1]];
119 |     r <- internal.norm.loop(cd,res=i,groups=groups)
120 |     cat(".")
121 |   }
122 |   cat(" done\n")
123 |   return(r);
124 | }
125 | 
126 | 
127 | # basic p2 processing
128 | p2process <- function(cd,lib.sizes=NULL,n.cores=30,nPcs=30,n.odgenes=2e3,res=1,k=30,verbose=FALSE) {
129 |   r <- Pagoda2$new(cd,lib.sizes=lib.sizes,log.scale=TRUE, n.cores=n.cores,verbose=verbose)
130 |   # varnorm, PCA, kNN, cluster
131 |   r$adjustVariance(plot=F,gam.k=10,verbose=F);
132 |   r$calculatePcaReduction(nPcs=nPcs,n.odgenes=n.odgenes,verbose=F);
133 |   r$makeKnnGraph(k=k,type='PCA',center=T,distance='cosine',verbose=F);
134 |   r$getKnnClusters(type='PCA',method=conos:::leiden.community,resolution=res)
135 |   r
136 | }
137 | 
138 | 


--------------------------------------------------------------------------------