├── .Rbuildignore ├── .gitignore ├── DESCRIPTION ├── NAMESPACE ├── R ├── DM.R ├── RcppExports.R ├── buildSNNGraph.R ├── clusterCells.R ├── combineBlocks.R ├── combineMarkers.R ├── combinePValues.R ├── combineVar.R ├── computeMinRank.R ├── computeSumFactors.R ├── convertTo.R ├── correlateGenes.R ├── correlateNull.R ├── correlatePairs.R ├── cyclone.R ├── decideTestsPerLabel.R ├── defunct.R ├── denoisePCA.R ├── expandPairings.R ├── findMarkers.R ├── fitTrendCV2.R ├── fitTrendPoisson.R ├── fitTrendVar.R ├── fixedPCA.R ├── getClusteredPCs.R ├── getMarkerEffects.R ├── getTopHVGs.R ├── getTopMarkers.R ├── modelGeneCV2.R ├── modelGeneCV2WithSpikes.R ├── modelGeneVar.R ├── modelGeneVarByPoisson.R ├── modelGeneVarWithSpikes.R ├── multiMarkerStats.R ├── namespace.R ├── pairwiseBinom.R ├── pairwiseTTests.R ├── pairwiseWilcox.R ├── pseudoBulkDGE.R ├── pseudoBulkSpecific.R ├── quickCluster.R ├── quickSubCluster.R ├── rhoToPValue.R ├── sandbag.R ├── scaledColRanks.R ├── scoreMarkers.R ├── summaryMarkerStats.R ├── testLinearModel.R ├── utils_markers.R ├── utils_parallel.R ├── utils_pca.R ├── utils_tricube.R └── utils_variance.R ├── inst ├── CITATION ├── NEWS.Rd └── exdata │ ├── generate_markers.R │ ├── human_cycle_markers.rds │ └── mouse_cycle_markers.rds ├── man ├── DM.Rd ├── buildSNNGraph.Rd ├── clusterCells.Rd ├── combineBlocks.Rd ├── combineMarkers.Rd ├── combinePValues.Rd ├── combineVar.Rd ├── computeMinRank.Rd ├── computeSumFactors.Rd ├── convertTo.Rd ├── correlateGenes.Rd ├── correlateNull.Rd ├── correlatePairs.Rd ├── cyclone.Rd ├── decideTestsPerLabel.Rd ├── defunct.Rd ├── denoisePCA.Rd ├── findMarkers.Rd ├── fitTrendCV2.Rd ├── fitTrendPoisson.Rd ├── fitTrendVar.Rd ├── fixedPCA.Rd ├── gene_selection.Rd ├── getClusteredPCs.Rd ├── getMarkerEffects.Rd ├── getTopHVGs.Rd ├── getTopMarkers.Rd ├── logBH.Rd ├── modelGeneCV2.Rd ├── modelGeneCV2WithSpikes.Rd ├── modelGeneVar.Rd ├── modelGeneVarByPoisson.Rd ├── modelGeneVarWithSpikes.Rd ├── multiMarkerStats.Rd ├── pairwiseBinom.Rd ├── pairwiseTTests.Rd ├── pairwiseWilcox.Rd ├── pseudoBulkDGE.Rd ├── pseudoBulkSpecific.Rd ├── quickCluster.Rd ├── quickSubCluster.Rd ├── rhoToPValue.Rd ├── sandbag.Rd ├── scaledColRanks.Rd ├── scoreMarkers.Rd ├── summaryMarkerStats.Rd └── testLinearModel.Rd ├── src ├── Makevars ├── RcppExports.cpp ├── choose_effect_size.cpp ├── combine_rho.cpp ├── compute_blocked_stats.cpp ├── compute_residual_stats.cpp ├── compute_rho_null.cpp ├── cyclone_scores.cpp ├── overlap_exprs.cpp ├── rand_custom.cpp ├── rand_custom.h └── utils.h ├── tests ├── testthat.R └── testthat │ ├── setup.R │ ├── test-altrep.R │ ├── test-build-snn.R │ ├── test-colranks.R │ ├── test-combine-markers.R │ ├── test-combine-p.R │ ├── test-combine-var.R │ ├── test-convert.R │ ├── test-correlate-genes.R │ ├── test-correlate-pairs.R │ ├── test-cyclone.R │ ├── test-denoise-pca.R │ ├── test-expand-pairings.R │ ├── test-fixed-pca.R │ ├── test-linear-test.R │ ├── test-markers.R │ ├── test-model-cv2.R │ ├── test-model-var.R │ ├── test-multi-markers.R │ ├── test-pairwise-binom.R │ ├── test-pairwise-t.R │ ├── test-pairwise-wilcox.R │ ├── test-pseudo-dge.R │ ├── test-pseudo-spec.R │ ├── test-quickclust.R │ ├── test-sandbag.R │ ├── test-score-markers.R │ ├── test-subclust.R │ ├── test-top-hvgs.R │ ├── test-trend-cv2.R │ └── test-trend-var.R └── vignettes ├── ref.bib └── scran.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.gitignore$ 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.so 2 | *.o 3 | *.html 4 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: scran 2 | Version: 1.33.2 3 | Date: 2024-09-05 4 | Title: Methods for Single-Cell RNA-Seq Data Analysis 5 | Description: 6 | Implements miscellaneous functions for interpretation of single-cell RNA-seq data. 7 | Methods are provided for assignment of cell cycle phase, detection of highly 8 | variable and significantly correlated genes, identification of marker genes, 9 | and other common tasks in routine single-cell analysis workflows. 10 | Authors@R: 11 | c(person("Aaron", "Lun", role = c("aut", "cre"), email = "infinite.monkeys.with.keyboards@gmail.com"), 12 | person("Karsten", "Bach", role = "aut"), 13 | person("Jong Kyoung", "Kim", role = "ctb"), 14 | person("Antonio", "Scialdone", role="ctb")) 15 | Depends: SingleCellExperiment, scuttle 16 | Imports: 17 | SummarizedExperiment, 18 | S4Vectors, 19 | BiocGenerics, 20 | BiocParallel, 21 | Rcpp, 22 | stats, 23 | methods, 24 | utils, 25 | Matrix, 26 | edgeR, 27 | limma, 28 | igraph, 29 | statmod, 30 | MatrixGenerics, 31 | S4Arrays, 32 | DelayedArray, 33 | BiocSingular, 34 | bluster, 35 | metapod, 36 | dqrng, 37 | beachmat 38 | Suggests: 39 | testthat, 40 | BiocStyle, 41 | knitr, 42 | rmarkdown, 43 | DelayedMatrixStats, 44 | HDF5Array, 45 | scRNAseq, 46 | dynamicTreeCut, 47 | ResidualMatrix, 48 | ScaledMatrix, 49 | DESeq2, 50 | pheatmap, 51 | scater 52 | biocViews: 53 | ImmunoOncology, 54 | Normalization, 55 | Sequencing, 56 | RNASeq, 57 | Software, 58 | GeneExpression, 59 | Transcriptomics, 60 | SingleCell, 61 | Clustering 62 | LinkingTo: 63 | Rcpp, 64 | beachmat, 65 | BH, 66 | dqrng, 67 | scuttle 68 | License: GPL-3 69 | NeedsCompilation: yes 70 | VignetteBuilder: knitr 71 | SystemRequirements: C++11 72 | RoxygenNote: 7.3.2 73 | URL: https://github.com/MarioniLab/scran/ 74 | BugReports: https://github.com/MarioniLab/scran/issues 75 | -------------------------------------------------------------------------------- /R/DM.R: -------------------------------------------------------------------------------- 1 | #' @importFrom stats runmed 2 | #' @export 3 | DM <- function(mean, cv2, win.size=51) 4 | # Computes the distance to median for the CV2 values across all genes, 5 | # after fitting an abundance-dependent trend. 6 | # 7 | # written by Jong Kyoung Kim 8 | # with modifications by Aaron Lun 9 | # created 12 March 2015 10 | { 11 | keep <- mean > 0 & !is.na(cv2) & cv2 > 0 12 | mean.expr <- log10(mean[keep]) 13 | cv2.expr <- log10(cv2[keep]) 14 | 15 | o <- order(mean.expr) 16 | if (win.size%%2L==0L) { 17 | win.size <- win.size+1L 18 | } 19 | med.trend <- runmed(cv2.expr[o], k=win.size) 20 | med.trend[o] <- med.trend 21 | 22 | dm.out <- cv2.expr - med.trend 23 | DM <- rep(NA_real_, length(keep)) 24 | DM[keep] <- dm.out 25 | names(DM) <- names(mean) 26 | DM 27 | } 28 | 29 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | compute_Top_statistic_from_ranks <- function(Ranks, prop) { 5 | .Call('_scran_compute_Top_statistic_from_ranks', PACKAGE = 'scran', Ranks, prop) 6 | } 7 | 8 | choose_middle_effect_size <- function(Pvals, Effects, prop) { 9 | .Call('_scran_choose_middle_effect_size', PACKAGE = 'scran', Pvals, Effects, prop) 10 | } 11 | 12 | combine_rho <- function(Ngenes, first, second, Rho, Pval, Order) { 13 | .Call('_scran_combine_rho', PACKAGE = 'scran', Ngenes, first, second, Rho, Pval, Order) 14 | } 15 | 16 | compute_blocked_stats_lognorm <- function(mat, block, nblocks, sf, pseudo) { 17 | .Call('_scran_compute_blocked_stats_lognorm', PACKAGE = 'scran', mat, block, nblocks, sf, pseudo) 18 | } 19 | 20 | compute_blocked_stats_norm <- function(mat, block, nblocks, sf) { 21 | .Call('_scran_compute_blocked_stats_norm', PACKAGE = 'scran', mat, block, nblocks, sf) 22 | } 23 | 24 | compute_blocked_stats_none <- function(mat, block, nblocks) { 25 | .Call('_scran_compute_blocked_stats_none', PACKAGE = 'scran', mat, block, nblocks) 26 | } 27 | 28 | compute_residual_stats_lognorm <- function(qr, qraux, inmat, sf, pseudo) { 29 | .Call('_scran_compute_residual_stats_lognorm', PACKAGE = 'scran', qr, qraux, inmat, sf, pseudo) 30 | } 31 | 32 | compute_residual_stats_none <- function(qr, qraux, inmat) { 33 | .Call('_scran_compute_residual_stats_none', PACKAGE = 'scran', qr, qraux, inmat) 34 | } 35 | 36 | get_null_rho <- function(Ncells, Niters, Seeds, Streams) { 37 | .Call('_scran_get_null_rho', PACKAGE = 'scran', Ncells, Niters, Seeds, Streams) 38 | } 39 | 40 | get_null_rho_design <- function(qr, qraux, Niters, Seeds, Streams) { 41 | .Call('_scran_get_null_rho_design', PACKAGE = 'scran', qr, qraux, Niters, Seeds, Streams) 42 | } 43 | 44 | cyclone_scores <- function(exprs, marker1, marker2, indices, niters, miniters, minpairs, seeds, streams) { 45 | .Call('_scran_cyclone_scores', PACKAGE = 'scran', exprs, marker1, marker2, indices, niters, miniters, minpairs, seeds, streams) 46 | } 47 | 48 | overlap_exprs <- function(exprs, groups, lfc) { 49 | .Call('_scran_overlap_exprs', PACKAGE = 'scran', exprs, groups, lfc) 50 | } 51 | 52 | overlap_exprs_paired <- function(exprs, left, right, groups, lfc) { 53 | .Call('_scran_overlap_exprs_paired', PACKAGE = 'scran', exprs, left, right, groups, lfc) 54 | } 55 | 56 | -------------------------------------------------------------------------------- /R/clusterCells.R: -------------------------------------------------------------------------------- 1 | #' Cluster cells in a SingleCellExperiment 2 | #' 3 | #' A \linkS4class{SingleCellExperiment}-compatible wrapper around \code{\link{clusterRows}} from the \pkg{bluster} package. 4 | #' 5 | #' @param x A \linkS4class{SummarizedExperiment} or \linkS4class{SingleCellExperiment} object containing cells in the columns. 6 | #' @param assay.type Integer or string specifying the assay values to use for clustering, typically log-normalized expression. 7 | #' @param use.dimred Integer or string specifying the reduced dimensions to use for clustering, typically PC scores. 8 | #' Only used when \code{assay.type=NULL}, and only applicable if \code{x} is a SingleCellExperiment. 9 | #' @param BLUSPARAM A \linkS4class{BlusterParam} object specifying the clustering algorithm to use, 10 | #' defaults to a graph-based method. 11 | #' @param ... Further arguments to pass to \code{\link{clusterRows}}. 12 | #' 13 | #' @return A factor of cluster identities for each cell in \code{x}, 14 | #' or a list containing such a factor - see the return value of \code{?\link{clusterRows}}. 15 | #' 16 | #' @details 17 | #' This is largely a convenience wrapper to avoid the need to manually extract the relevant assays or reduced dimensions from \code{x}. 18 | #' Altering \code{BLUSPARAM} can easily change the parameters or algorithm used for clustering - 19 | #' see \code{?"\link{BlusterParam-class}"} for more details. 20 | #' 21 | #' @author Aaron Lun 22 | #' 23 | #' @examples 24 | #' library(scuttle) 25 | #' sce <- mockSCE() 26 | #' sce <- logNormCounts(sce) 27 | #' 28 | #' # From log-expression values: 29 | #' clusters <- clusterCells(sce, assay.type="logcounts") 30 | #' 31 | #' # From PCs: 32 | #' sce <- scater::runPCA(sce) 33 | #' clusters2 <- clusterCells(sce, use.dimred="PCA") 34 | #' 35 | #' # With different parameters: 36 | #' library(bluster) 37 | #' clusters3 <- clusterCells(sce, use.dimred="PCA", BLUSPARAM=NNGraphParam(k=5)) 38 | #' 39 | #' # With different algorithms: 40 | #' clusters4 <- clusterCells(sce, use.dimred="PCA", BLUSPARAM=KmeansParam(centers=10)) 41 | #' 42 | #' @export 43 | #' @importFrom SingleCellExperiment reducedDim 44 | #' @importFrom SummarizedExperiment assay 45 | #' @importFrom bluster clusterRows NNGraphParam 46 | clusterCells <- function(x, assay.type=NULL, use.dimred=NULL, BLUSPARAM=NNGraphParam(), ...) { 47 | if (!is.null(assay.type)) { 48 | x <- t(assay(x, assay.type)) 49 | } else if (!is.null(use.dimred)) { 50 | x <- reducedDim(x, use.dimred) 51 | } else { 52 | stop("either 'assay.type=' or 'use.dimred=' must be specified") 53 | } 54 | 55 | x <- as.matrix(x) 56 | clusterRows(x, BLUSPARAM=BLUSPARAM, ...) 57 | } 58 | -------------------------------------------------------------------------------- /R/combineBlocks.R: -------------------------------------------------------------------------------- 1 | #' Combine blockwise statistics 2 | #' 3 | #' Combine DataFrames of statistics computed separately for each block. 4 | #' This usually refers to feature-level statistics and sample-level blocks. 5 | #' 6 | #' @param blocks A list of \linkS4class{DataFrame}s containing blockwise statistics. 7 | #' These should have the same number of rows and the same set of columns. 8 | #' @param ave.fields Character vector specifying the columns of \code{blocks} to be averaged. 9 | #' The value of each column is averaged across blocks, potentially in a weighted manner. 10 | #' @param pval.field String specifying the column of \code{blocks} containing the p-value. 11 | #' This is combined using \code{\link{combineParallelPValues}}. 12 | #' @param method String specifying how p-values should be combined, see \code{?\link{combineParallelPValues}}. 13 | #' @param geometric Logical scalar indicating whether the geometric mean should be computed when averaging \code{ave.fields}. 14 | #' @param equiweight Logical scalar indicating whether each block should be given equal weight. 15 | #' @param weights Numeric vector of length equal to \code{blocks}, containing the weight for each block. 16 | #' Only used if \code{equiweight=TRUE}. 17 | #' @param valid Logical vector indicating whether each block is valid. 18 | #' Invalid blocks are still stored in the \code{per.block} output but are not used to compute the combined statistics. 19 | #' 20 | #' @return A \linkS4class{DataFrame} containing all fields in \code{ave.fields} and the p-values, 21 | #' where each column is created by combining the corresponding block-specific columns. 22 | #' A \code{per.block} column is also reported, containing a DataFrame of the DataFrames of blockwise statistics. 23 | #' 24 | #' @author Aaron Lun 25 | #' 26 | #' @seealso 27 | #' This function is used in \code{\link{modelGeneVar}} and friends, \code{\link{combineVar}} and \code{\link{testLinearModel}}. 28 | #' 29 | #' @examples 30 | #' library(scuttle) 31 | #' sce <- mockSCE() 32 | #' 33 | #' y1 <- sce[,1:100] 34 | #' y1 <- logNormCounts(y1) # normalize separately after subsetting. 35 | #' results1 <- modelGeneVar(y1) 36 | #' 37 | #' y2 <- sce[,1:100 + 100] 38 | #' y2 <- logNormCounts(y2) # normalize separately after subsetting. 39 | #' results2 <- modelGeneVar(y2) 40 | #' 41 | #' # A manual implementation of combineVar: 42 | #' combineBlocks(list(results1, results2), 43 | #' ave.fields=c("mean", "total", "bio", "tech"), 44 | #' pval.field='p.value', 45 | #' method='fisher', 46 | #' geometric=FALSE, 47 | #' equiweight=TRUE, 48 | #' weights=NULL, 49 | #' valid=c(TRUE, TRUE)) 50 | #' 51 | #' @export 52 | #' @importFrom stats p.adjust 53 | #' @importFrom S4Vectors DataFrame I 54 | #' @importFrom metapod combineParallelPValues 55 | combineBlocks <- function(blocks, ave.fields, pval.field, method, geometric, equiweight, weights, valid) { 56 | if (length(blocks)==1L) { 57 | return(blocks[[1]]) 58 | } 59 | 60 | rn <- unique(lapply(blocks, rownames)) 61 | if (length(rn)!=1L) { 62 | stop("gene identities should be the same") 63 | } 64 | 65 | if (equiweight) { 66 | weights <- rep(1, length(blocks)) 67 | } else if (is.null(weights)) { 68 | stop("'weights' must be specified if 'equiweight=FALSE'") 69 | } 70 | 71 | original <- blocks 72 | if (length(unique(vapply(original, nrow, 0L)))!=1L) { 73 | stop("not all 'blocks' have the same number of rows") 74 | } 75 | 76 | if (!any(valid)) { 77 | stop("no entry of 'blocks' has positive weights") 78 | } 79 | blocks <- blocks[valid] 80 | weights <- weights[valid] 81 | 82 | combined <- list() 83 | for (i in ave.fields) { 84 | extracted <- lapply(blocks, "[[", i=i) 85 | 86 | if (geometric) { 87 | extracted <- lapply(extracted, log) 88 | } 89 | extracted <- mapply("*", extracted, weights, SIMPLIFY=FALSE, USE.NAMES=FALSE) 90 | averaged <- Reduce("+", extracted)/sum(weights) 91 | if (geometric) { 92 | averaged <- exp(averaged) 93 | } 94 | combined[[i]] <- averaged 95 | } 96 | 97 | extracted <- lapply(blocks, "[[", i=pval.field) 98 | 99 | if (method=="z") { 100 | .Deprecated(old='method="z"', new='method="stouffer"') 101 | method <- "stouffer" 102 | } else if (method=="holm-middle") { 103 | .Deprecated(old='method="holm-middle"', new='method="holm-min"') 104 | method <- "holm-min" 105 | } 106 | combined$p.value <- combineParallelPValues(extracted, method=method, weights=weights)$p.value 107 | combined$FDR <- p.adjust(combined$p.value, method="BH") 108 | 109 | output <- DataFrame(combined, row.names=rn[[1]]) 110 | output$per.block <- do.call(DataFrame, c(lapply(original, I), list(check.names=FALSE))) 111 | 112 | output 113 | } 114 | -------------------------------------------------------------------------------- /R/computeMinRank.R: -------------------------------------------------------------------------------- 1 | #' Compute the minimum rank 2 | #' 3 | #' Compute the minimum rank in a matrix of statistics, usually effect sizes from a set of differential comparisons. 4 | #' 5 | #' @param x A matrix of statistics from multiple differential comparisons (columns) and genes (rows). 6 | #' @param ties.method String specifying how ties should be handled. 7 | #' @param decreasing Logical scalar indicating whether to obtain ranks for decreasing magnitude of values in \code{x}. 8 | #' 9 | #' @details 10 | #' For each gene, the minimum rank, a.k.a., \dQuote{min-rank} is defined by ranking values within each column of \code{x}, and then taking the minimum rank value across columns. 11 | #' This is most useful when the columns of \code{x} contain significance statistics or effect sizes from a single differential comparison, where larger values represent stronger differences. 12 | #' In this setting, the min-rank represents the highest rank that each gene achieves in any comparison. 13 | #' Taking all genes with min-ranks less than or equal to \eqn{T} yields the union of the top \eqn{T} DE genes from all comparisons. 14 | #' 15 | #' To illustrate, the set of genes with min-rank values of 1 will contain the top gene from each pairwise comparison to every other cluster. 16 | #' If we instead take all genes with min-ranks less than or equal to, say, \eqn{T = 5}, the set will consist of the \emph{union} of the top 5 genes from each pairwise comparison. 17 | #' Multiple genes can have the same min-rank as different genes may have the same rank across different pairwise comparisons. 18 | #' Conversely, the marker set may be smaller than the product of \eqn{T} and the number of other clusters, as the same gene may be shared across different comparisons. 19 | #' 20 | #' In the context of marker detection with pairwise comparisons between groups of cells, sorting by the min-rank guarantees the inclusion of genes that can distinguish between any two groups. 21 | #' More specifically, this approach does not explicitly favour genes that are uniquely expressed in a cluster. 22 | #' Rather, it focuses on combinations of genes that - together - drive separation of a cluster from the others. 23 | #' This is more general and robust but tends to yield a less focused marker set compared to the other methods of ranking potential markers. 24 | #' 25 | #' @return A numeric vector containing the minimum (i.e., top) rank for each gene across all comparisons. 26 | #' 27 | #' @seealso 28 | #' \code{\link{scoreMarkers}}, where this function is used to compute one of the effect size summaries. 29 | #' 30 | #' \code{\link{combineMarkers}}, where the same principle is used for the \code{Top} field. 31 | #' @examples 32 | #' # Get min-rank by log-FC: 33 | #' lfcs <- matrix(rnorm(100), ncol=5) 34 | #' computeMinRank(lfcs) 35 | #' 36 | #' # Get min-rank by p-value: 37 | #' pvals <- matrix(runif(100), ncol=5) 38 | #' computeMinRank(pvals, decreasing=FALSE) 39 | #' 40 | #' @export 41 | #' @importFrom MatrixGenerics colMins colRanks 42 | computeMinRank <- function(x, ties.method="min", decreasing=TRUE) { 43 | x <- as.matrix(x) 44 | if (decreasing) x <- -x 45 | colMins(colRanks(x, ties.method=ties.method), na.rm=TRUE) 46 | } 47 | -------------------------------------------------------------------------------- /R/computeSumFactors.R: -------------------------------------------------------------------------------- 1 | #' Normalization by deconvolution 2 | #' 3 | #' Scaling normalization of single-cell RNA-seq data by deconvolving size factors from cell pools. 4 | #' These functions have been moved to the \pkg{scuttle} package and are just retained here for compatibility. 5 | #' 6 | #' @param ... Further arguments to pass to \code{\link{pooledSizeFactors}} or \code{\link{computePooledFactors}}. 7 | #' 8 | #' @return 9 | #' For \code{calculateSumFactors}, a numeric vector of size factors returned by \code{\link{pooledSizeFactors}}. 10 | #' 11 | #' For \code{computeSumFactors}, a SingleCellExperiment containing the size factors in its \code{\link{sizeFactors}}, 12 | #' as returned by \code{\link{computePooledFactors}}. 13 | #' 14 | #' @author Aaron Lun 15 | #' @export 16 | #' @importFrom scuttle computePooledFactors 17 | computeSumFactors <- function(...) { 18 | computePooledFactors(...) 19 | } 20 | 21 | #' @export 22 | #' @importFrom scuttle pooledSizeFactors 23 | #' @rdname computeSumFactors 24 | calculateSumFactors <- function(...) { 25 | pooledSizeFactors(...) 26 | } 27 | -------------------------------------------------------------------------------- /R/convertTo.R: -------------------------------------------------------------------------------- 1 | #' Convert to other classes 2 | #' 3 | #' Convert a \linkS4class{SingleCellExperiment} object into other classes for entry into other analysis pipelines. 4 | #' 5 | #' @param x A \linkS4class{SingleCellExperiment} object. 6 | #' @param type A string specifying the analysis for which the object should be prepared. 7 | #' @param ... Other arguments to be passed to pipeline-specific constructors. 8 | #' @param assay.type A string specifying which assay of \code{x} should be put in the returned object. 9 | #' @param subset.row See \code{?"\link{scran-gene-selection}"}. 10 | #' 11 | #' @return 12 | #' For \code{type="edgeR"}, a DGEList object is returned containing the count matrix. 13 | #' Size factors are converted to normalization factors. 14 | #' Gene-specific \code{rowData} is stored in the \code{genes} element, and cell-specific \code{colData} is stored in the \code{samples} element. 15 | #' 16 | #' For \code{type="DESeq2"}, a DESeqDataSet object is returned containing the count matrix and size factors. 17 | #' Additional gene- and cell-specific data is stored in the \code{mcols} and \code{colData} respectively. 18 | #' 19 | #' @details 20 | #' This function converts an SingleCellExperiment object into various other classes in preparation for entry into other analysis pipelines, as specified by \code{type}. 21 | #' 22 | #' @author 23 | #' Aaron Lun 24 | #' 25 | #' @seealso 26 | #' \code{\link[edgeR]{DGEList}}, 27 | #' \code{\link[DESeq2:DESeqDataSet]{DESeqDataSetFromMatrix}} 28 | #' for specific class constructors. 29 | #' 30 | #' @examples 31 | #' library(scuttle) 32 | #' sce <- mockSCE() 33 | #' 34 | #' # Adding some additional embellishments. 35 | #' sizeFactors(sce) <- 2^rnorm(ncol(sce)) 36 | #' rowData(sce)$SYMBOL <- paste0("X", seq_len(nrow(sce))) 37 | #' sce$other <- sample(LETTERS, ncol(sce), replace=TRUE) 38 | #' 39 | #' # Converting to various objects. 40 | #' convertTo(sce, type="edgeR") 41 | #' convertTo(sce, type="DESeq2") 42 | #' 43 | #' @export 44 | #' @importFrom BiocGenerics sizeFactors as.data.frame "sizeFactors<-" 45 | #' @importFrom SummarizedExperiment rowData colData assay rowData<- 46 | #' @importFrom S4Vectors "mcols<-" 47 | #' @importFrom edgeR DGEList "[.DGEList" scaleOffset.DGEList 48 | #' @importFrom scuttle .subset2index 49 | convertTo <- function(x, type=c("edgeR", "DESeq2", "monocle"), ..., assay.type=1, subset.row=NULL) { 50 | type <- match.arg(type) 51 | if (type=="edgeR" || type=="DESeq2") { 52 | fd <- rowData(x) 53 | pd <- colData(x) 54 | } else if (type=="monocle") { 55 | .Defunct(msg="'type=\"monocle\" is no longer supported, use monocle::newCellDataSet directly instead") 56 | } 57 | 58 | sf <- suppressWarnings(sizeFactors(x)) 59 | subset.row <- .subset2index(subset.row, x) 60 | 61 | if (type=="edgeR") { 62 | y <- DGEList(assay(x, i=assay.type)[subset.row,,drop=FALSE], ...) 63 | if (ncol(fd)) { 64 | y$genes <- fd[subset.row,,drop=FALSE] 65 | } 66 | 67 | if (!is.null(sf)) { 68 | nf <- log(sf/y$samples$lib.size) 69 | nf <- exp(nf - mean(nf)) 70 | y$samples$norm.factors <- nf 71 | } 72 | 73 | if (ncol(pd)) { 74 | y$samples <- cbind(y$samples, pd) 75 | } 76 | return(y) 77 | 78 | } else if (type=="DESeq2") { 79 | dds <- DESeq2::DESeqDataSetFromMatrix(assay(x, i=assay.type)[subset.row,,drop=FALSE], pd, ~1, ...) 80 | rowData(dds) <- fd[subset.row,,drop=FALSE] 81 | if (!is.null(sf)) { 82 | sizeFactors(dds) <- sf 83 | } 84 | return(dds) 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /R/correlateGenes.R: -------------------------------------------------------------------------------- 1 | #' Per-gene correlation statistics 2 | #' 3 | #' Compute per-gene correlation statistics by combining results from gene pair correlations. 4 | #' 5 | #' @param stats A \linkS4class{DataFrame} of pairwise correlation statistics, returned by \code{\link{correlatePairs}}. 6 | #' 7 | #' @return 8 | #' A \linkS4class{DataFrame} with one row per unique gene in \code{stats} and containing the fields: 9 | #' \describe{ 10 | #' \item{\code{gene}:}{A field of the same type as \code{stats$gene1} specifying the gene identity.} 11 | #' \item{\code{rho}:}{Numeric, the correlation with the largest magnitude across all gene pairs involving the corresponding gene.} 12 | #' \item{\code{p.value}:}{Numeric, the Simes p-value for this gene.} 13 | #' \item{\code{FDR}:}{Numeric, the adjusted \code{p.value} across all rows.} 14 | #' } 15 | #' 16 | #' @details 17 | #' For each gene, all of its pairs are identified and the corresponding p-values are combined using Simes' method. 18 | #' This tests whether the gene is involved in significant correlations to \emph{any} other gene. 19 | #' Per-gene statistics are useful for identifying correlated genes without regard to what they are correlated with (e.g., during feature selection). 20 | #' 21 | #' @seealso 22 | #' \code{\link{correlatePairs}}, to compute \code{stats}. 23 | #' 24 | #' @author 25 | #' Aaron Lun 26 | #' 27 | #' @examples 28 | #' library(scuttle) 29 | #' sce <- mockSCE() 30 | #' sce <- logNormCounts(sce) 31 | #' pairs <- correlatePairs(sce, iters=1e5, subset.row=1:100) 32 | #' 33 | #' g.out <- correlateGenes(pairs) 34 | #' head(g.out) 35 | #' 36 | #' @references 37 | #' Simes RJ (1986). 38 | #' An improved Bonferroni procedure for multiple tests of significance. 39 | #' \emph{Biometrika} 73:751-754. 40 | #' 41 | #' @export 42 | #' @importFrom S4Vectors DataFrame 43 | #' @importFrom stats p.adjust 44 | correlateGenes <- function(stats) { 45 | pool <- union(stats$gene1, stats$gene2) 46 | m1 <- match(stats$gene1, pool) 47 | m2 <- match(stats$gene2, pool) 48 | by.gene <- combine_rho(length(pool), m1 - 1L, m2 - 1L, stats$rho, stats$p.value, order(stats$p.value) - 1L) 49 | DataFrame(gene=pool, rho=by.gene[[2]], p.value=by.gene[[1]], FDR=p.adjust(by.gene[[1]], method="BH")) 50 | } 51 | -------------------------------------------------------------------------------- /R/decideTestsPerLabel.R: -------------------------------------------------------------------------------- 1 | #' Decide tests for each label 2 | #' 3 | #' Decide which tests (i.e., genes) are significant for differential expression between conditions in each label, 4 | #' using the output of \code{\link{pseudoBulkDGE}}. 5 | #' This mimics the \code{\link{decideTests}} functionality from \pkg{limma}. 6 | #' 7 | #' @param results A \linkS4class{List} containing the output of \code{\link{pseudoBulkDGE}}. 8 | #' Each entry should be a DataFrame with the same number and order of rows, 9 | #' containing at least a numeric \code{"PValue"} column (and usually a \code{"logFC"} column). 10 | #' 11 | #' For \code{summarizeTestsPerLabel}, this may also be a matrix produced by \code{decideTestsPerLabel}. 12 | #' @param method String specifying whether the Benjamini-Hochberg correction should be applied across all clustesr 13 | #' or separately within each label. 14 | #' @param threshold Numeric scalar specifying the FDR threshold to consider genes as significant. 15 | #' @param pval.field String containing the name of the column containing the p-value in each entry of \code{results}. 16 | #' Defaults to \code{"PValue"}, \code{"P.Value"} or \code{"p.value"} based on fields in the first entry of \code{results}. 17 | #' @param lfc.field String containing the name of the column containing the log-fold change. 18 | #' Ignored if the column is not available Defaults to \code{"logFC"} if this field is available. 19 | #' @param ... Further arguments to pass to \code{decideTestsPerLabel} if \code{results} is a List. 20 | #' 21 | #' @return 22 | #' For \code{decideTestsPerLabel}, 23 | #' an integer matrix indicating whether each gene (row) is significantly DE between conditions for each label (column). 24 | #' 25 | #' For \code{summarizeTestsPerLabel}, 26 | #' an integer matrix containing the number of genes of each DE status (column) in each label (row). 27 | #' 28 | #' @details 29 | #' If a log-fold change field is available and specified in \code{lfc.field}, values of \code{1}, \code{-1} and \code{0} 30 | #' indicate that the gene is significantly upregulated, downregulated or not significant, respectively. 31 | #' Note, the interpretation of \dQuote{up} and \dQuote{down} depends on the design and contrast in \code{\link{pseudoBulkDGE}}. 32 | #' 33 | #' Otherwise, if no log-fold change is available or if \code{lfc.field=NULL}, 34 | #' values of \code{1} or \code{0} indicate that a gene is significantly DE or not, respectively. 35 | #' 36 | #' \code{NA} values indicate either that the relevant gene was low-abundance for a particular label and filtered out, 37 | #' or that the DE comparison for that label was not possible (e.g., no residual d.f.). 38 | #' 39 | #' @author Aaron Lun 40 | #' 41 | #' @examples 42 | #' example(pseudoBulkDGE) 43 | #' head(decideTestsPerLabel(out)) 44 | #' summarizeTestsPerLabel(out) 45 | #' 46 | #' @seealso 47 | #' \code{\link{pseudoBulkDGE}}, which generates the input to this function. 48 | #' 49 | #' \code{\link{decideTests}}, which inspired this function. 50 | #' 51 | #' @export 52 | #' @importFrom stats p.adjust 53 | decideTestsPerLabel <- function(results, method=c("separate", "global"), threshold=0.05, 54 | pval.field=NULL, lfc.field="logFC") 55 | { 56 | method <- match.arg(method) 57 | 58 | if (is.null(pval.field)) { 59 | pval.field <- intersect(c("PValue", "P.Value", "p.value"), colnames(results[[1]])) 60 | if (length(pval.field)==0) { 61 | stop("could not automatically determine 'pval.field'") 62 | } 63 | pval.field <- pval.field[1] 64 | } 65 | all.p <- lapply(results, "[[", i=pval.field) 66 | 67 | if (method=="separate") { 68 | all.p <- lapply(all.p, p.adjust, method="BH") 69 | all.p <- do.call(cbind, all.p) 70 | 71 | } else { 72 | all.p <- do.call(cbind, all.p) 73 | all.p[] <- p.adjust(all.p, method="BH") 74 | } 75 | 76 | rownames(all.p) <- rownames(results[[1]]) 77 | sig <- all.p <= threshold 78 | 79 | if (!is.null(lfc.field) && !lfc.field %in% colnames(results[[1]])) { 80 | lfc.field <- NULL 81 | } 82 | if (!is.null(lfc.field)) { 83 | all.lfc <- do.call(cbind, lapply(results, "[[", i=lfc.field)) 84 | sig <- sig * sign(all.lfc) 85 | } 86 | 87 | storage.mode(sig) <- "integer" 88 | sig 89 | } 90 | 91 | #' @export 92 | #' @rdname decideTestsPerLabel 93 | summarizeTestsPerLabel <- function(results, ...) { 94 | if (!is.matrix(results)) { 95 | results <- decideTestsPerLabel(results, ...) 96 | } 97 | 98 | output <- list() 99 | available <- sort(unique(as.vector(results)), na.last=TRUE) 100 | for (i in available) { 101 | output[[as.character(i)]] <- if (is.na(i)) { 102 | colSums(is.na(results)) 103 | } else { 104 | colSums(results==i, na.rm=TRUE) 105 | } 106 | } 107 | 108 | do.call(cbind, output) 109 | } 110 | -------------------------------------------------------------------------------- /R/expandPairings.R: -------------------------------------------------------------------------------- 1 | .expand_pairings <- function(pairings, universe) { 2 | .SUBSET <- function(request, clean=TRUE) { 3 | if (is.null(request)) { 4 | out <- seq_along(universe) 5 | } else { 6 | out <- match(request, universe) 7 | } 8 | if (clean) { 9 | out <- unique(out[!is.na(out)]) 10 | } 11 | out 12 | } 13 | 14 | .expand_pairings_core(pairings, .SUBSET) 15 | } 16 | 17 | .expand_pairings_core <- function(pairings, .SUBSET) { 18 | .clean_expand <- function(x, y, keep.perm) { 19 | all.pairs <- expand.grid(x, y) 20 | keep <- all.pairs[,1] != all.pairs[,2] 21 | all.pairs[keep,] 22 | } 23 | 24 | if (is.matrix(pairings)) { 25 | # If matrix, we're using pre-specified pairs. 26 | if ((!is.numeric(pairings) && !is.character(pairings)) || ncol(pairings)!=2L) { 27 | stop("'pairings' should be a numeric/character matrix with 2 columns") 28 | } 29 | s1 <- .SUBSET(pairings[,1], clean=FALSE) 30 | s2 <- .SUBSET(pairings[,2], clean=FALSE) 31 | 32 | # Discarding pairs with missing elements silently. 33 | keep <- !is.na(s1) & !is.na(s2) 34 | s1 <- s1[keep] 35 | s2 <- s2[keep] 36 | mode <- "predefined pairs" 37 | 38 | } else if (is.list(pairings)) { 39 | # If list, we're correlating between one gene selected from each of two pools. 40 | if (length(pairings)!=2L) { 41 | stop("'pairings' as a list should have length 2") 42 | } 43 | converted <- lapply(pairings, FUN=.SUBSET) 44 | all.pairs <- .clean_expand(converted[[1]], converted[[2]]) 45 | s1 <- all.pairs[,1] 46 | s2 <- all.pairs[,2] 47 | mode <- "double pool" 48 | 49 | } else { 50 | available <- .SUBSET(pairings) 51 | all.pairs <- .clean_expand(available, available) 52 | s1 <- all.pairs[,1] 53 | s2 <- all.pairs[,2] 54 | mode <- "single pool" 55 | } 56 | 57 | list(id1=s1, id2=s2, mode=mode) 58 | } 59 | -------------------------------------------------------------------------------- /R/fitTrendPoisson.R: -------------------------------------------------------------------------------- 1 | #' Generate a trend for Poisson noise 2 | #' 3 | #' Create a mean-variance trend for log-normalized expression values derived from Poisson-distributed counts. 4 | #' 5 | #' @param means A numeric vector of length 2 or more, containing the range of mean counts observed in the dataset. 6 | #' @param size.factors A numeric vector of size factors for all cells in the dataset. 7 | #' @param dispersion A numeric scalar specifying the dispersion for the NB distribution. 8 | #' If zero, a Poisson distribution is used. 9 | #' @param pseudo.count A numeric scalar specifying the pseudo-count to be added to the scaled counts before log-transformation. 10 | #' @param npts An integer scalar specifying the number of interpolation points to use. 11 | #' @param ... Further arguments to pass to \code{\link{fitTrendVar}} for trend fitting. 12 | #' @param BPPARAM A \linkS4class{BiocParallelParam} object indicating how parallelization should be performed across interpolation points. 13 | #' 14 | #' @return A named list is returned containing: 15 | #' \describe{ 16 | #' \item{\code{trend}:}{A function that returns the fitted value of the trend at any value of the mean.} 17 | #' \item{\code{std.dev}:}{A numeric scalar containing the robust standard deviation of the ratio of \code{var} to the fitted value of the trend across all features used for trend fitting.} 18 | #' } 19 | #' 20 | #' @details 21 | #' This function is useful for modelling technical noise in highly diverse datasets without spike-ins, 22 | #' where fitting a trend to the endogenous genes would not be appropriate given the strong biological heterogeneity. 23 | #' It is mostly intended for UMI datasets where the technical noise is close to Poisson-distributed. 24 | #' 25 | #' This function operates by simulating Poisson or negative binomial-distributed counts, 26 | #' computing log-transformed normalized expression values from those counts, 27 | #' calculating the mean and variance and then passing those metrics to \code{\link{fitTrendVar}}. 28 | #' The log-transformation ensures that variance is modelled in the same space that is used for downstream analyses like PCA. 29 | #' 30 | #' Simulations are performed across a range of values in \code{means} to achieve reliable interpolation, 31 | #' with the stability of the trend determined by the number of simulation points in \code{npts}. 32 | #' The number of cells is determined from the length of \code{size.factors}, 33 | #' which are used to scale the distribution means prior to sampling counts. 34 | #' 35 | #' @seealso 36 | #' \code{\link{fitTrendVar}}, which is used to fit the trend. 37 | #' 38 | #' @author Aaron Lun 39 | #' 40 | #' @examples 41 | #' # Mocking up means and size factors: 42 | #' sf <- 2^rnorm(1000, sd=0.1) 43 | #' sf <- sf/mean(sf) 44 | #' means <- rexp(100, 0.1) 45 | #' 46 | #' # Using these to construct a Poisson trend: 47 | #' out <- fitTrendPoisson(means, sf) 48 | #' curve(out$trend(x), xlim=c(0, 10)) 49 | #' 50 | #' @export 51 | #' @importFrom BiocParallel SerialParam 52 | fitTrendPoisson <- function(means, size.factors, npts=1000, dispersion=0, pseudo.count=1, BPPARAM=SerialParam(), ...) { 53 | out <- .generate_poisson_values(means, size.factors, npts=npts, 54 | dispersion=dispersion, pseudo.count=pseudo.count, BPPARAM=BPPARAM) 55 | fitTrendVar(out$means, out$vars, ...) 56 | } 57 | 58 | #' @importFrom stats rpois rnbinom 59 | #' @importFrom BiocParallel SerialParam 60 | .generate_poisson_values <- function(means, size.factors, npts=1000, dispersion=0, pseudo.count=1, 61 | block=NULL, design=NULL, BPPARAM=SerialParam()) 62 | { 63 | if (dispersion==0) { 64 | FUN <- function(m) rpois(length(m), lambda=m) 65 | } else { 66 | FUN <- function(m) rnbinom(length(m), mu=m, size=1/dispersion) 67 | } 68 | 69 | means <- means[means>0] 70 | pts <- exp(seq(from=log(min(means)), to=log(max(means)), length=npts)) 71 | 72 | Y <- matrix(0, npts, length(size.factors)) 73 | for (i in seq_along(pts)) { 74 | Y[i,] <- FUN(pts[i] * size.factors) 75 | } 76 | 77 | .compute_mean_var(Y, block=block, design=design, subset.row=NULL, 78 | block.FUN=compute_blocked_stats_lognorm, 79 | residual.FUN=compute_residual_stats_lognorm, 80 | BPPARAM=BPPARAM, sf=size.factors, pseudo=pseudo.count) 81 | } 82 | -------------------------------------------------------------------------------- /R/fixedPCA.R: -------------------------------------------------------------------------------- 1 | #' PCA with a fixed number of components 2 | #' 3 | #' Perform a PCA where the desired number of components is known ahead of time. 4 | #' 5 | #' @param x A \linkS4class{SingleCellExperiment} object containing a log-expression amtrix. 6 | #' @inheritParams denoisePCA 7 | #' @param rank Integer scalar specifying the number of components. 8 | #' 9 | #' @return 10 | #' A modified \code{x} with: 11 | #' \itemize{ 12 | #' \item the PC results stored in the \code{\link{reducedDims}} as a \code{"PCA"} entry, if \code{type="pca"}. 13 | #' The attributes contain the rotation matrix, the variance explained and the percentage of variance explained. 14 | #' (Note that the last may not sum to 100\% if \code{max.rank} is smaller than the total number of PCs.) 15 | #' \item a low-rank approximation stored as a new \code{"lowrank"} assay, if \code{type="lowrank"}. 16 | #' This is represented as a \linkS4class{LowRankMatrix}. 17 | #' } 18 | #' 19 | #' @details 20 | #' In theory, there is an optimal number of components for any given application, 21 | #' but in practice, the criterion for the optimum is difficult to define. 22 | #' As a result, it is often satisfactory to take an \emph{a priori}-defined \dQuote{reasonable} number of PCs for downstream analyses. 23 | #' A good rule of thumb is to set this to the upper bound on the expected number of subpopulations in the dataset 24 | #' (see the reasoning in \code{\link{getClusteredPCs}}. 25 | #' 26 | #' We can use \code{subset.row} to perform the PCA on a subset of genes. 27 | #' This is typically used to subset to HVGs to reduce computational time and increase the signal-to-noise ratio of downstream analyses. 28 | #' If \code{preserve.shape=TRUE}, the rotation matrix is extrapolated to include loadings for \dQuote{unselected} genes, i.e., not in \code{subset.row}. 29 | #' This is done by projecting their expression profiles into the low-dimensional space defined by the SVD on the selected genes. 30 | #' By doing so, we ensure that the output always has the same number of rows as \code{x} such that any \code{value="lowrank"} can fit into the assays. 31 | #' 32 | #' Otherwise, if \code{preserve.shape=FALSE}, the output is subsetted by any non-\code{NULL} value of \code{subset.row}. 33 | #' This is equivalent to the return value after calling the function on \code{x[subset.row,]}. 34 | #' 35 | #' @author Aaron Lun 36 | #' 37 | #' @seealso 38 | #' \code{\link{denoisePCA}}, where the number of PCs is automatically chosen. 39 | #' 40 | #' \code{\link{getClusteredPCs}}, another method to choose the number of PCs. 41 | #' 42 | #' @examples 43 | #' library(scuttle) 44 | #' sce <- mockSCE() 45 | #' sce <- logNormCounts(sce) 46 | #' 47 | #' # Modelling the variance: 48 | #' var.stats <- modelGeneVar(sce) 49 | #' hvgs <- getTopHVGs(var.stats, n=1000) 50 | #' 51 | #' # Defaults to pulling out the top 50 PCs. 52 | #' set.seed(1000) 53 | #' sce <- fixedPCA(sce, subset.row=hvgs) 54 | #' reducedDimNames(sce) 55 | #' 56 | #' # Get the percentage of variance explained. 57 | #' attr(reducedDim(sce), "percentVar") 58 | #' 59 | #' @export 60 | #' @importFrom BiocSingular bsparam 61 | #' @importFrom BiocParallel SerialParam bpstop bpstart 62 | #' @importFrom SummarizedExperiment assay 63 | #' @importFrom scuttle .bpNotSharedOrUp 64 | #' @importFrom Matrix t 65 | #' @importFrom beachmat realizeFileBackedMatrix 66 | #' @importFrom MatrixGenerics colVars 67 | fixedPCA <- function(x, rank=50, value=c("pca", "lowrank"), subset.row, preserve.shape=TRUE, assay.type="logcounts", name=NULL, BSPARAM=bsparam(), BPPARAM=SerialParam()) { 68 | if (.bpNotSharedOrUp(BPPARAM)) { 69 | bpstart(BPPARAM) 70 | on.exit(bpstop(BPPARAM)) 71 | } 72 | 73 | original <- x 74 | x <- assay(x, assay.type) 75 | 76 | subset.row <- .process_subset_for_pca(subset.row, x) 77 | y <- t(x[subset.row,,drop=FALSE]) 78 | y <- realizeFileBackedMatrix(y) 79 | 80 | svd.out <- .centered_SVD(y, rank, keep.left=TRUE, keep.right=TRUE, 81 | BSPARAM=BSPARAM, BPPARAM=BPPARAM) 82 | var.exp <- svd.out$d^2 / (nrow(y) - 1) 83 | 84 | total.var <- sum(colVars(y)) 85 | 86 | pcs <- list( 87 | components=.svd_to_pca(svd.out, rank), 88 | rotation=.svd_to_rot(svd.out, rank, x, subset.row, fill.missing=preserve.shape), 89 | var.explained=var.exp, 90 | percent.var=var.exp/total.var*100 91 | ) 92 | 93 | if (!preserve.shape) { 94 | original <- original[subset.row,] 95 | } 96 | 97 | value <- match.arg(value) 98 | .pca_to_output(original, pcs, value=value, name=name) 99 | } 100 | -------------------------------------------------------------------------------- /R/getMarkerEffects.R: -------------------------------------------------------------------------------- 1 | #' Get marker effect sizes 2 | #' 3 | #' Utility function to extract the marker effect sizes as a matrix from the output of \code{\link{findMarkers}}. 4 | #' 5 | #' @param x A \linkS4class{DataFrame} containing marker statistics for a given group/cluster, 6 | #' usually one element of the List returned by \code{\link{findMarkers}}. 7 | #' @param prefix String containing the prefix for the columns containing the effect size. 8 | #' @param strip Logical scalar indicating whether the prefix should be removed from the output column names. 9 | #' @param remove.na.col Logical scalar indicating whether to remove columns containing any \code{NA}s. 10 | #' 11 | #' @details 12 | #' Setting \code{remove.na.col=TRUE} may be desirable in applications involving blocked comparisons, 13 | #' where some pairwise comparisons are not possible if the relevant levels occur in different blocks. 14 | #' In such cases, the resulting column is filled with \code{NA}s that may interfere with downstream steps like clustering. 15 | #' 16 | #' @return A numeric matrix containing the effect sizes for the comparison to every other group/cluster. 17 | #' 18 | #' @author Aaron Lun 19 | #' 20 | #' @examples 21 | #' library(scuttle) 22 | #' sce <- mockSCE() 23 | #' sce <- logNormCounts(sce) 24 | #' 25 | #' kout <- kmeans(t(logcounts(sce)), centers=4) 26 | #' out <- findMarkers(sce, groups=kout$cluster) 27 | #' 28 | #' eff1 <- getMarkerEffects(out[[1]]) 29 | #' str(eff1) 30 | #' 31 | #' @seealso 32 | #' \code{\link{findMarkers}} and \code{\link{combineMarkers}}, to generate the DataFrames. 33 | #' 34 | #' @export 35 | #' @importFrom MatrixGenerics colAnyNAs 36 | getMarkerEffects <- function(x, prefix="logFC", strip=TRUE, remove.na.col=FALSE) { 37 | regex <- paste0("^", prefix, "\\.") 38 | i <- grep(regex, colnames(x)) 39 | out <- as.matrix(x[,i]) 40 | 41 | if (strip) { 42 | colnames(out) <- sub(regex, "", colnames(out)) 43 | } 44 | if (remove.na.col) { 45 | out <- out[,!colAnyNAs(out),drop=FALSE] 46 | } 47 | 48 | out 49 | } 50 | -------------------------------------------------------------------------------- /R/getTopHVGs.R: -------------------------------------------------------------------------------- 1 | #' Identify HVGs 2 | #' 3 | #' Define a set of highly variable genes, based on variance modelling statistics 4 | #' from \code{\link{modelGeneVar}} or related functions. 5 | #' 6 | #' @param stats A \linkS4class{DataFrame} of variance modelling statistics with one row per gene. 7 | #' Alternatively, a \linkS4class{SummarizedExperiment} object, in which case it is supplied to \code{\link{modelGeneVar}} to generate the required DataFrame. 8 | #' @param var.field String specifying the column of \code{stats} containing the relevant metric of variation. 9 | #' @param n Integer scalar specifying the number of top HVGs to report. 10 | #' @param prop Numeric scalar specifying the proportion of genes to report as HVGs. 11 | #' @param var.threshold Numeric scalar specifying the minimum threshold on the metric of variation. 12 | #' @param fdr.field String specifying the column of \code{stats} containing the adjusted p-values. 13 | #' If \code{NULL}, no filtering is performed on the FDR. 14 | #' @param fdr.threshold Numeric scalar specifying the FDR threshold. 15 | #' @param row.names Logical scalar indicating whether row names should be reported. 16 | #' 17 | #' @return 18 | #' A character vector containing the names of the most variable genes, if \code{row.names=TRUE}. 19 | #' 20 | #' Otherwise, an integer vector specifying the indices of \code{stats} containing the most variable genes. 21 | #' 22 | #' @details 23 | #' This function will identify all genes where the relevant metric of variation is greater than \code{var.threshold}. 24 | #' By default, this means that we retain all genes with positive values in the \code{var.field} column of \code{stats}. 25 | #' If \code{var.threshold=NULL}, the minimum threshold on the value of the metric is not applied. 26 | #' 27 | #' If \code{fdr.threshold} is specified, we further subset to genes that have FDR less than or equal to \code{fdr.threshold}. 28 | #' By default, FDR thresholding is turned off as \code{\link{modelGeneVar}} and related functions 29 | #' determine significance of large variances \emph{relative} to other genes. 30 | #' This can be overly conservative if many genes are highly variable. 31 | #' 32 | #' If \code{n=NULL} and \code{prop=NULL}, the resulting subset of genes is directly returned. 33 | #' Otherwise, the top set of genes with the largest values of the variance metric are returned, 34 | #' where the size of the set is defined as the larger of \code{n} and \code{prop*nrow(stats)}. 35 | #' 36 | #' @seealso 37 | #' \code{\link{modelGeneVar}} and friends, to generate \code{stats}. 38 | #' 39 | #' \code{\link{modelGeneCV2}} and friends, to also generate \code{stats}. 40 | #' 41 | #' @author Aaron Lun 42 | #' @examples 43 | #' library(scuttle) 44 | #' sce <- mockSCE() 45 | #' sce <- logNormCounts(sce) 46 | #' 47 | #' stats <- modelGeneVar(sce) 48 | #' str(getTopHVGs(stats)) 49 | #' str(getTopHVGs(stats, fdr.threshold=0.05)) # more stringent 50 | #' 51 | #' # Or directly pass in the SingleCellExperiment: 52 | #' str(getTopHVGs(sce)) 53 | #' 54 | #' # Alternatively, use with the coefficient of variation: 55 | #' stats2 <- modelGeneCV2(sce) 56 | #' str(getTopHVGs(stats2, var.field="ratio")) 57 | #' 58 | #' @export 59 | #' @importFrom utils head 60 | getTopHVGs <- function(stats, var.field="bio", n=NULL, prop=NULL, var.threshold=0, 61 | fdr.field="FDR", fdr.threshold=NULL, row.names=!is.null(rownames(stats))) 62 | { 63 | if (is(stats, "SummarizedExperiment")) { 64 | stats <- modelGeneVar(stats) 65 | } 66 | 67 | survivors <- seq_len(nrow(stats)) 68 | 69 | if (!is.null(fdr.threshold)) { 70 | fdr <- stats[[fdr.field]] 71 | keep <- !is.na(fdr) & fdr <= fdr.threshold 72 | survivors <- survivors[keep] 73 | stats <- stats[keep,,drop=FALSE] 74 | } 75 | 76 | if (!is.null(var.threshold)) { 77 | var <- stats[[var.field]] 78 | keep <- !is.na(var) & var > var.threshold 79 | survivors <- survivors[keep] 80 | stats <- stats[keep,,drop=FALSE] 81 | } 82 | 83 | o <- order(stats[[var.field]], decreasing=TRUE) 84 | if (!is.null(n) || !is.null(prop)) { 85 | n <- max(n, round(prop*nrow(stats))) 86 | o <- head(o, n) 87 | } 88 | 89 | if (row.names) { 90 | rownames(stats)[o] 91 | } else { 92 | survivors[o] 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /R/namespace.R: -------------------------------------------------------------------------------- 1 | #' @import methods 2 | #' @import SingleCellExperiment 3 | #' @importFrom Matrix t which 4 | #' @importFrom MatrixGenerics colSums rowMeans 5 | #' @importFrom Rcpp sourceCpp 6 | #' @useDynLib scran 7 | NULL 8 | -------------------------------------------------------------------------------- /R/rhoToPValue.R: -------------------------------------------------------------------------------- 1 | #' Spearman's rho to a p-value 2 | #' 3 | #' Compute an approximate p-value against the null hypothesis that Spearman's rho is zero. 4 | #' This vectorizes the approximate p-value calculation in \code{\link{cor.test}} with \code{method="spearman"}. 5 | #' 6 | #' @param rho Numeric vector of rho values. 7 | #' @param n Integer scalar specifying the number of observations used to compute \code{rho}. 8 | #' @param positive Logical scalar indicating whether to perform a one-sided test for the alternative of a positive (\code{TRUE}) or negative rho (\code{FALSE}). 9 | #' Default is to return statistics for both directions. 10 | #' 11 | #' @return 12 | #' If \code{positive=NULL}, a list of two numeric vectors is returned, 13 | #' containing p-values for the test against the alternative hypothesis in each direction. 14 | #' 15 | #' Otherwise, a numeric vector is returned containing the p-values for the test in the specified direction. 16 | #' 17 | #' @author Aaron Lun 18 | #' 19 | #' @examples 20 | #' rhoToPValue(seq(-1, 1, 21), 50) 21 | #' 22 | #' @export 23 | #' @importFrom stats pt 24 | rhoToPValue <- function(rho, n, positive=NULL) { 25 | # Mildly adapted from cor.test. 26 | q <- (n^3 - n) * (1 - rho)/6 27 | den <- (n * (n^2 - 1)/6) 28 | r <- 1 - q/den 29 | tstat <- r/sqrt((1 - r^2)/(n - 2)) 30 | 31 | FUN <- function(p) pt(tstat, df = n - 2, lower.tail = !p) 32 | 33 | if (!is.null(positive)) { 34 | FUN(positive) 35 | } else { 36 | list(positive=FUN(TRUE), negative=FUN(FALSE)) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /R/scaledColRanks.R: -------------------------------------------------------------------------------- 1 | #' Compute scaled column ranks 2 | #' 3 | #' Compute scaled column ranks from each cell's expression profile for distance calculations based on rank correlations. 4 | #' 5 | #' @param x A numeric matrix-like object containing cells in columns and features in the rows. 6 | #' @param subset.row A logical, integer or character scalar indicating the rows of \code{x} to use, see \code{?"\link{scran-gene-selection}"}. 7 | #' @param min.mean A numeric scalar specifying the filter to be applied on the average normalized count for each feature prior to computing ranks. 8 | #' Disabled by setting to \code{NULL}. 9 | #' @param transposed A logical scalar specifying whether the output should be transposed. 10 | #' @param as.sparse A logical scalar indicating whether the output should be sparse. 11 | #' @param withDimnames A logical scalar specifying whether the output should contain the dimnames of \code{x}. 12 | #' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying whether and how parallelization should be performed. 13 | #' Currently only used for filtering if \code{min.mean} is not provided. 14 | #' 15 | #' @return 16 | #' A matrix of the same dimensions as \code{x}, where each column contains the centred and scaled ranks of the expression values for each cell. 17 | #' If \code{transposed=TRUE}, this matrix is transposed so that rows correspond to cells. 18 | #' If \code{as.sparse}, the columns are not centered to preserve sparsity. 19 | #' 20 | #' @details 21 | #' Euclidean distances computed based on the output rank matrix are equivalent to distances computed from Spearman's rank correlation. 22 | #' This can be used in clustering, nearest-neighbour searches, etc. as a robust alternative to Euclidean distances computed directly from \code{x}. 23 | #' 24 | #' If \code{as.sparse=TRUE}, the most common average rank is set to zero in the output. 25 | #' This can be useful for highly sparse input data where zeroes have the same rank and are themselves returned as zeroes. 26 | #' Obviously, this means that the ranks are not centred, so this will have to be done manually prior to any downstream distance calculations. 27 | #' 28 | #' @author 29 | #' Aaron Lun 30 | #' 31 | #' @seealso 32 | #' \code{\link{quickCluster}}, where this function is used. 33 | #' 34 | #' @examples 35 | #' library(scuttle) 36 | #' sce <- mockSCE() 37 | #' rout <- scaledColRanks(counts(sce), transposed=TRUE) 38 | #' 39 | #' # For use in clustering: 40 | #' d <- dist(rout) 41 | #' table(cutree(hclust(d), 4)) 42 | #' 43 | #' g <- buildSNNGraph(rout, transposed=TRUE) 44 | #' table(igraph::cluster_walktrap(g)$membership) 45 | #' 46 | #' @export 47 | #' @importFrom scuttle calculateAverage .subset2index .bpNotSharedOrUp 48 | #' @importFrom BiocParallel SerialParam bpstart bpstop 49 | scaledColRanks <- function(x, subset.row=NULL, min.mean=NULL, transposed=FALSE, as.sparse=FALSE, 50 | withDimnames=TRUE, BPPARAM=SerialParam()) 51 | { 52 | if (.bpNotSharedOrUp(BPPARAM)) { 53 | bpstart(BPPARAM) 54 | on.exit(bpstop(BPPARAM)) 55 | } 56 | 57 | subset.row <- .subset2index(subset.row, x, byrow=TRUE) 58 | if (!is.null(min.mean) && all(dim(x)>0L)) { 59 | further.subset <- calculateAverage(x, subset_row=subset.row, BPPARAM=BPPARAM) >= min.mean 60 | subset.row <- subset.row[further.subset] 61 | } 62 | 63 | out <- colBlockApply(x[subset.row,,drop=FALSE], FUN=.get_scaled_ranks, grid=as.sparse, 64 | transposed=transposed, .as.sparse=as.sparse, BPPARAM=BPPARAM) 65 | 66 | if (transposed) { 67 | rkout <- do.call(rbind, out) 68 | } else { 69 | rkout <- do.call(cbind, out) 70 | } 71 | 72 | if (withDimnames && !is.null(dimnames(x))) { 73 | dn <- list(rownames(x)[subset.row], colnames(x)) 74 | if (transposed) { 75 | dn <- rev(dn) 76 | } 77 | dimnames(rkout) <- dn 78 | } else if (!is.null(dimnames(rkout))) { 79 | dimnames(rkout) <- NULL 80 | } 81 | 82 | rkout 83 | } 84 | 85 | #' @importClassesFrom Matrix dgCMatrix 86 | #' @importFrom MatrixGenerics colRanks rowVars rowMeans 87 | #' @importFrom DelayedArray DelayedArray getAutoBPPARAM setAutoBPPARAM 88 | .get_scaled_ranks <- function(block, transposed, .as.sparse) { 89 | old <- getAutoBPPARAM() 90 | setAutoBPPARAM(NULL) # turning off any additional parallelization, just in case. 91 | on.exit(setAutoBPPARAM(old)) 92 | 93 | out <- colRanks(DelayedArray(block), ties.method="average", preserveShape=FALSE) 94 | 95 | sig <- sqrt(rowVars(out) * (ncol(out)-1)) * 2 96 | if (any(is.na(sig) | sig==0)) { 97 | stop("rank variances of zero detected for a cell") 98 | } 99 | 100 | if (.as.sparse) { 101 | # Figure out what the zeroes got transformed into. 102 | is.zero <- which(block==0, arr.ind=TRUE) 103 | offset <- numeric(ncol(block)) 104 | offset[is.zero[,2]] <- out[is.zero[,2:1]] 105 | 106 | out <- out - offset 107 | out <- as(out, "dgCMatrix") 108 | } else { 109 | out <- out - rowMeans(out) 110 | } 111 | 112 | out <- out/sig 113 | 114 | if (!transposed) { 115 | out <- t(out) 116 | } 117 | out 118 | } 119 | -------------------------------------------------------------------------------- /R/summaryMarkerStats.R: -------------------------------------------------------------------------------- 1 | #' Summary marker statistics 2 | #' 3 | #' Compute additional gene-level statistics for each group to assist in identifying marker genes, 4 | #' to complement the formal test statistics generated by \code{\link{findMarkers}}. 5 | #' 6 | #' @param x A numeric matrix-like object of expression values, 7 | #' where each column corresponds to a cell and each row corresponds to an endogenous gene. 8 | #' This is generally expected to be normalized log-expression values unless one knows better. 9 | #' 10 | #' Alternatively, a \linkS4class{SummarizedExperiment} or \linkS4class{SingleCellExperiment} object containing such a matrix. 11 | #' @inheritParams findMarkers 12 | #' @param average String specifying the type of average, to be passed to \code{\link{sumCountsAcrossCells}}. 13 | #' @param ... For the generic, further arguments to pass to specific methods. 14 | #' 15 | #' For the SummarizedExperiment method, further arguments to pass to the ANY method. 16 | #' 17 | #' @details 18 | #' This function only generates descriptive statistics for each gene to assist marker selection. 19 | #' It does not consider blocking factors or covariates that would otherwise be available from comparisons between groups. 20 | #' For the sake of brevity, statistics for the \dQuote{other} groups are summarized into a single value. 21 | #' 22 | #' @return A named \linkS4class{List} of \linkS4class{DataFrame}s, with one entry per level of \code{groups}. 23 | #' Each DataFrame has number of rows corresponding to the rows in \code{x} and contains the fields: 24 | #' \itemize{ 25 | #' \item \code{self.average}, the average (log-)expression across all cells in the current group. 26 | #' \item \code{other.average}, the grand average of the average (log-)expression across cells in the other groups. 27 | #' \item \code{self.detected}, the proportion of cells with detected expression in the current group. 28 | #' \item \code{other.detected}, the average proportion of cells with detected expression in the other groups. 29 | #' } 30 | #' 31 | #' @author Aaron Lun 32 | #' 33 | #' @seealso 34 | #' \code{\link{findMarkers}}, where the output of this function can be used in \code{row.data=}. 35 | #' 36 | #' @examples 37 | #' library(scuttle) 38 | #' sce <- mockSCE() 39 | #' sce <- logNormCounts(sce) 40 | #' 41 | #' # Any clustering method is okay. 42 | #' kout <- kmeans(t(logcounts(sce)), centers=3) 43 | #' sum.out <- summaryMarkerStats(sce, kout$cluster) 44 | #' sum.out[["1"]] 45 | #' 46 | #' # Add extra rowData if you like. 47 | #' rd <- DataFrame(Symbol=sample(LETTERS, nrow(sce), replace=TRUE), 48 | #' row.names=rownames(sce)) 49 | #' sum.out <- summaryMarkerStats(sce, kout$cluster, row.data=rd) 50 | #' sum.out[["1"]] 51 | #' 52 | #' @name summaryMarkerStats 53 | NULL 54 | 55 | #' @importFrom scuttle sumCountsAcrossCells .bpNotSharedOrUp numDetectedAcrossCells 56 | #' @importFrom SummarizedExperiment assay 57 | #' @importFrom BiocParallel bpstart bpstop SerialParam 58 | #' @importFrom S4Vectors SimpleList 59 | .summary_marker_stats <- function(x, groups, row.data=NULL, average="mean", BPPARAM=SerialParam()) { 60 | if (.bpNotSharedOrUp(BPPARAM)) { 61 | bpstart(BPPARAM) 62 | on.exit(bpstop(BPPARAM)) 63 | } 64 | 65 | ave.out <- sumCountsAcrossCells(x, ids=groups, average=average, BPPARAM=BPPARAM) 66 | ave.mat <- assay(ave.out) 67 | ave.ids <- ave.out$ids 68 | 69 | num.out <- numDetectedAcrossCells(x, ids=groups, average=TRUE, BPPARAM=BPPARAM) 70 | num.mat <- assay(num.out) 71 | num.ids <- num.out$ids 72 | 73 | collated <- list() 74 | for (i in seq_along(ave.ids)) { 75 | curid <- ave.ids[i] 76 | ave.df <- .extractor(ave.mat, ave.ids, curid, name="average") 77 | num.df <- .extractor(num.mat, num.ids, curid, name="detected") 78 | collated[[as.character(curid)]] <- cbind(ave.df, num.df) 79 | } 80 | 81 | collated <- .add_row_data(collated, row.data, match.names=FALSE) 82 | SimpleList(collated) 83 | } 84 | 85 | #' @importFrom S4Vectors DataFrame 86 | .extractor <- function(mat, ids, curid, name) { 87 | m <- which(ids==curid) 88 | out <- DataFrame(X=mat[,m], Y=rowMeans(mat[,-m,drop=FALSE])) 89 | colnames(out) <- paste0(c("self", "other"), ".", name) 90 | out 91 | } 92 | 93 | #' @export 94 | #' @rdname summaryMarkerStats 95 | setGeneric("summaryMarkerStats", function(x, ...) setGeneric("summaryMarkerStats")) 96 | 97 | #' @export 98 | #' @rdname summaryMarkerStats 99 | setMethod("summaryMarkerStats", "ANY", .summary_marker_stats) 100 | 101 | #' @export 102 | #' @rdname summaryMarkerStats 103 | #' @importFrom SummarizedExperiment assay 104 | setMethod("summaryMarkerStats", "SummarizedExperiment", function(x, ..., assay.type="logcounts") { 105 | .summary_marker_stats(assay(x, assay.type), ...) 106 | }) 107 | -------------------------------------------------------------------------------- /R/utils_parallel.R: -------------------------------------------------------------------------------- 1 | #' @importFrom dqrng generateSeedVectors 2 | .setup_pcg_state <- function(per.core) { 3 | seeds <- streams <- vector("list", length(per.core)) 4 | last <- 0L 5 | for (i in seq_along(per.core)) { 6 | N <- per.core[i] 7 | seeds[[i]] <- generateSeedVectors(N, nwords=2) 8 | streams[[i]] <- last + seq_len(N) 9 | last <- last + N 10 | } 11 | list(seeds=seeds, streams=streams) 12 | } 13 | -------------------------------------------------------------------------------- /R/utils_pca.R: -------------------------------------------------------------------------------- 1 | #' @importFrom BiocSingular runSVD bsparam 2 | #' @importFrom BiocParallel SerialParam 3 | .centered_SVD <- function(y, max.rank, BSPARAM=bsparam(), BPPARAM=SerialParam(), keep.left=TRUE, keep.right=TRUE) 4 | # Performs the PCA given a log-expression matrix. 5 | # Switches between svd() and irlba() on request. 6 | # Output format is guaranteed to be the same. 7 | { 8 | runSVD(y, center=TRUE, BSPARAM=BSPARAM, k=max.rank, 9 | nu=if (keep.left) max.rank else 0L, 10 | nv=if (keep.right) max.rank else 0L, 11 | BPPARAM=BPPARAM) 12 | } 13 | 14 | .svd_to_pca <- function(svd.out, ncomp, named=TRUE) 15 | # Converts centred results to PCs. 16 | { 17 | if (is.null(svd.out$u)) { 18 | stop("missing 'U' in SVD results") 19 | } else if (ncomp > ncol(svd.out$u)) { 20 | warning("requested number of components greater than available rank") 21 | ncomp <- ncol(svd.out$u) 22 | } 23 | 24 | ix <- seq_len(ncomp) 25 | U <- svd.out$u[,ix,drop=FALSE] 26 | D <- svd.out$d[ix] 27 | 28 | # Pulling out the PCs (column-multiplying the left eigenvectors). 29 | pcs <- sweep(U, 2L, D, FUN="*", check.margin = FALSE) 30 | if (named) { 31 | colnames(pcs) <- sprintf("PC%i", ix) 32 | } 33 | pcs 34 | } 35 | 36 | #' @importFrom MatrixGenerics rowMeans colSums 37 | .svd_to_rot <- function(svd.out, ncomp, original.mat, subset.row, fill.missing) { 38 | ncomp <- min(ncomp, ncol(svd.out$v)) 39 | 40 | ix <- seq_len(ncomp) 41 | V <- svd.out$v[,ix,drop=FALSE] 42 | if (is.null(subset.row) || !fill.missing) { 43 | rownames(V) <- rownames(original.mat)[subset.row] 44 | return(V) 45 | } 46 | 47 | U <- svd.out$u[,ix,drop=FALSE] 48 | D <- svd.out$d[ix] 49 | 50 | fullV <- matrix(0, nrow(original.mat), ncomp) 51 | rownames(fullV) <- rownames(original.mat) 52 | colnames(fullV) <- colnames(V) 53 | fullV[subset.row,] <- V 54 | 55 | # The idea is that after our SVD, we have X=UDV' where each column of X is a gene. 56 | # Leftover genes are new columns in X, which are projected on the space of U by doing U'X. 57 | # This can be treated as new columns in DV', which can be multiplied by U to give denoised values. 58 | # I've done a lot of implicit transpositions here, hence the code does not tightly follow the logic above. 59 | leftovers <- !logical(nrow(original.mat)) 60 | leftovers[subset.row] <- FALSE 61 | 62 | left.x <- original.mat[leftovers,,drop=FALSE] 63 | left.x <- as.matrix(left.x %*% U) - outer(rowMeans(left.x), colSums(U)) 64 | 65 | fullV[leftovers,] <- sweep(left.x, 2, D, "/", check.margin=FALSE) 66 | 67 | fullV 68 | } 69 | 70 | #' @importFrom BiocSingular LowRankMatrix 71 | #' @importFrom SingleCellExperiment reduced.dim.matrix reducedDim<- 72 | #' @importFrom SummarizedExperiment assay<- 73 | .pca_to_output <- function(x, pcs, value=c("pca", "lowrank"), name="PCA") { 74 | if (value=="pca"){ 75 | out <- reduced.dim.matrix(pcs$components) 76 | attr(out, "percentVar") <- pcs$percent.var 77 | attr(out, "varExplained") <- pcs$var.explained 78 | attr(out, "rotation") <- pcs$rotation 79 | } else { 80 | out <- LowRankMatrix(pcs$rotation, pcs$components) 81 | } 82 | 83 | value <- match.arg(value) 84 | if (value=="pca"){ 85 | if (is.null(name)) name <- "PCA" 86 | reducedDim(x, name) <- out 87 | } else if (value=="lowrank") { 88 | if (is.null(name)) name <- "lowrank" 89 | assay(x, i=name) <- out 90 | } 91 | x 92 | } 93 | 94 | #' @importFrom scuttle .subset2index 95 | .process_subset_for_pca <- function(subset.row, x) { 96 | if (missing(subset.row)) { 97 | warning(paste(strwrap("'subset.row=' is typically used to specify HVGs for PCA. If the use of all genes is intentional, suppress this message with 'subset.row=NULL'."), collapse="\n")) 98 | subset.row <- NULL 99 | } 100 | .subset2index(subset.row, x, byrow=TRUE) 101 | } 102 | -------------------------------------------------------------------------------- /R/utils_tricube.R: -------------------------------------------------------------------------------- 1 | .compute_tricube_average <- function(vals, indices, distances, bandwidth=NULL, ndist=3) 2 | # Centralized function to compute tricube averages. 3 | # Bandwidth is set at 'ndist' times the median distance, if not specified. 4 | { 5 | if (is.null(bandwidth)) { 6 | middle <- ceiling(ncol(indices)/2L) 7 | mid.dist <- distances[,middle] 8 | bandwidth <- mid.dist * ndist 9 | } 10 | bandwidth <- pmax(1e-8, bandwidth) 11 | 12 | rel.dist <- distances/bandwidth 13 | rel.dist[rel.dist > 1] <- 1 # don't use pmin(), as this destroys dimensions. 14 | tricube <- (1 - rel.dist^3)^3 15 | weight <- tricube/rowSums(tricube) 16 | 17 | output <- 0 18 | for (kdx in seq_len(ncol(indices))) { 19 | output <- output + vals[indices[,kdx],,drop=FALSE] * weight[,kdx] 20 | } 21 | 22 | if (is.null(dim(output))) { 23 | matrix(0, nrow(vals), ncol(vals)) 24 | } else { 25 | output 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | bibentry( 2 | bibtype = "article", 3 | author = c( 4 | person(c("Aaron", "T.", "L."), "Lun"), 5 | person(c("Davis", "J."), "McCarthy"), 6 | person(c("John", "C."), "Marioni")), 7 | title="A step-by-step workflow for low-level analysis of single-cell RNA-seq data with Bioconductor", 8 | journal="F1000Res.", 9 | year="2016", 10 | volume="5", 11 | pages="2122", 12 | doi = "10.12688/f1000research.9501.2" 13 | ) 14 | 15 | -------------------------------------------------------------------------------- /inst/exdata/generate_markers.R: -------------------------------------------------------------------------------- 1 | library(scran) 2 | dir.create("temp") 3 | 4 | # MOUSE (relies on semi-public data): 5 | 6 | zipfile <- "temp/current.zip" 7 | download.file("https://github.com/PMBio/cyclone/archive/c982e7388d8e49e1459055504313f87bb3eb0ceb.zip", zipfile) 8 | out <- unzip(zipfile, exdir="temp") 9 | 10 | pdata <- out[grep("pairs_functions.RData$", out)] 11 | load(pdata) 12 | 13 | all.pairs <- sandbag(training.data, list(G1=id.G1, S=id.S, G2M=id.G2M), fraction=0.5, subset.row=genes.training) 14 | saveRDS(file="mouse_cycle_markers.rds", all.pairs) 15 | 16 | rm(list=ls()) 17 | 18 | # HUMAN: 19 | 20 | library(GEOquery) 21 | out <- getGEOSuppFiles("GSE64016", baseDir="temp", makeDirectory=FALSE) 22 | count.file <- "temp/GSE64016_H1andFUCCI_normalized_EC.csv.gz" 23 | hs.counts <- read.csv(count.file, header=TRUE, row.names=1) 24 | hs.G1 <- grepl("G1", colnames(hs.counts)) 25 | hs.S <- grepl("S", colnames(hs.counts)) 26 | hs.G2 <- grepl("G2", colnames(hs.counts)) 27 | 28 | library(org.Hs.eg.db) 29 | anno <- select(org.Hs.eg.db, keytype="SYMBOL", key=rownames(hs.counts), column="ENSEMBL") 30 | anno <- anno[!is.na(anno$ENSEMBL),] 31 | m <- match(anno$SYMBOL, rownames(hs.counts)) 32 | hs.counts2 <- as.matrix(hs.counts[m,]) 33 | rownames(hs.counts2) <- anno$ENSEMBL 34 | hs.cycle <- select(org.Hs.eg.db, keytype="GOALL", key="GO:0007049", column="ENSEMBL") 35 | hs.training <- rownames(hs.counts2) %in% hs.cycle$ENSEMBL 36 | 37 | all.pairs <- sandbag(hs.counts2, list(G1=hs.G1, S=hs.S, G2M=hs.G2), fraction=0.5, subset.row=hs.training) 38 | saveRDS(file="human_cycle_markers.rds", all.pairs) 39 | 40 | # Cleaning up: 41 | 42 | unlink("temp", recursive=TRUE) 43 | -------------------------------------------------------------------------------- /inst/exdata/human_cycle_markers.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarioniLab/scran/60f82785843cd4cee7aa1e5a827aa3dad0430674/inst/exdata/human_cycle_markers.rds -------------------------------------------------------------------------------- /inst/exdata/mouse_cycle_markers.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarioniLab/scran/60f82785843cd4cee7aa1e5a827aa3dad0430674/inst/exdata/mouse_cycle_markers.rds -------------------------------------------------------------------------------- /man/DM.Rd: -------------------------------------------------------------------------------- 1 | \name{Distance-to-median} 2 | \alias{DM} 3 | 4 | \title{Compute the distance-to-median statistic} 5 | \description{Compute the distance-to-median statistic for the CV2 residuals of all genes} 6 | 7 | \usage{ 8 | DM(mean, cv2, win.size=51) 9 | } 10 | 11 | \arguments{ 12 | \item{mean}{A numeric vector of average counts for each gene.} 13 | \item{cv2}{A numeric vector of squared coefficients of variation for each gene.} 14 | \item{win.size}{An integer scalar specifying the window size for median-based smoothing. 15 | This should be odd or will be incremented by 1.} 16 | } 17 | 18 | \details{ 19 | This function will compute the distance-to-median (DM) statistic described by Kolodziejczyk et al. (2015). 20 | Briefly, a median-based trend is fitted to the log-transformed \code{cv2} against the log-transformed \code{mean} using \code{\link{runmed}}. 21 | The DM is defined as the residual from the trend for each gene. 22 | This statistic is a measure of the relative variability of each gene, after accounting for the empirical mean-variance relationship. 23 | Highly variable genes can then be identified as those with high DM values. 24 | } 25 | 26 | \value{ 27 | A numeric vector of DM statistics for all genes. 28 | } 29 | 30 | \author{ 31 | Jong Kyoung Kim, 32 | with modifications by Aaron Lun 33 | } 34 | 35 | \examples{ 36 | # Mocking up some data 37 | ngenes <- 1000 38 | ncells <- 100 39 | gene.means <- 2^runif(ngenes, 0, 10) 40 | dispersions <- 1/gene.means + 0.2 41 | counts <- matrix(rnbinom(ngenes*ncells, mu=gene.means, size=1/dispersions), nrow=ngenes) 42 | 43 | # Computing the DM. 44 | means <- rowMeans(counts) 45 | cv2 <- apply(counts, 1, var)/means^2 46 | dm.stat <- DM(means, cv2) 47 | head(dm.stat) 48 | } 49 | 50 | \references{ 51 | Kolodziejczyk AA, Kim JK, Tsang JCH et al. (2015). 52 | Single cell RNA-sequencing of pluripotent states unlocks modular transcriptional variation. 53 | \emph{Cell Stem Cell} 17(4), 471--85. 54 | } 55 | 56 | \keyword{variance} 57 | -------------------------------------------------------------------------------- /man/buildSNNGraph.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/buildSNNGraph.R 3 | \docType{methods} 4 | \name{buildSNNGraph} 5 | \alias{buildSNNGraph} 6 | \alias{buildSNNGraph,ANY-method} 7 | \alias{buildSNNGraph,SummarizedExperiment-method} 8 | \alias{buildSNNGraph,SingleCellExperiment-method} 9 | \alias{buildKNNGraph} 10 | \alias{buildKNNGraph,ANY-method} 11 | \alias{buildKNNGraph,SingleCellExperiment-method} 12 | \title{Build a nearest-neighbor graph} 13 | \usage{ 14 | buildSNNGraph(x, ...) 15 | 16 | \S4method{buildSNNGraph}{ANY}( 17 | x, 18 | ..., 19 | d = 50, 20 | transposed = FALSE, 21 | subset.row = NULL, 22 | BSPARAM = bsparam(), 23 | BPPARAM = SerialParam() 24 | ) 25 | 26 | \S4method{buildSNNGraph}{SummarizedExperiment}(x, ..., assay.type = "logcounts") 27 | 28 | \S4method{buildSNNGraph}{SingleCellExperiment}(x, ..., use.dimred = NULL) 29 | 30 | buildKNNGraph(x, ...) 31 | 32 | \S4method{buildKNNGraph}{ANY}( 33 | x, 34 | ..., 35 | d = 50, 36 | transposed = FALSE, 37 | subset.row = NULL, 38 | BSPARAM = bsparam(), 39 | BPPARAM = SerialParam() 40 | ) 41 | 42 | \S4method{buildKNNGraph}{SingleCellExperiment}(x, ..., use.dimred = NULL) 43 | 44 | \S4method{buildKNNGraph}{SingleCellExperiment}(x, ..., use.dimred = NULL) 45 | } 46 | \arguments{ 47 | \item{x}{A matrix-like object containing expression values for each gene (row) in each cell (column). 48 | These dimensions can be transposed if \code{transposed=TRUE}. 49 | 50 | Alternatively, a \linkS4class{SummarizedExperiment} or \linkS4class{SingleCellExperiment} containing such an expression matrix. 51 | If \code{x} is a SingleCellExperiment and \code{use.dimred} is set, its \code{\link{reducedDims}} will be used instead.} 52 | 53 | \item{...}{For the generics, additional arguments to pass to the specific methods. 54 | 55 | For the ANY methods, additional arguments to pass to \code{\link{makeSNNGraph}} or \code{\link{makeKNNGraph}}. 56 | 57 | For the SummarizedExperiment methods, additional arguments to pass to the corresponding ANY method. 58 | 59 | For the SingleCellExperiment methods, additional arguments to pass to the corresponding SummarizedExperiment method.} 60 | 61 | \item{d}{An integer scalar specifying the number of dimensions to use for a PCA on the expression matrix prior to the nearest neighbor search. 62 | Ignored for the ANY method if \code{transposed=TRUE} and for the SingleCellExperiment methods if \code{use.dimred} is set.} 63 | 64 | \item{transposed}{A logical scalar indicating whether \code{x} is transposed (i.e., rows are cells).} 65 | 66 | \item{subset.row}{See \code{?"\link{scran-gene-selection}"}. 67 | Only used when \code{transposed=FALSE}.} 68 | 69 | \item{BSPARAM}{A \linkS4class{BiocSingularParam} object specifying the algorithm to use for PCA, if \code{d} is not \code{NA}.} 70 | 71 | \item{BPPARAM}{A \linkS4class{BiocParallelParam} object to use for parallel processing.} 72 | 73 | \item{assay.type}{A string specifying which assay values to use.} 74 | 75 | \item{use.dimred}{A string specifying whether existing values in \code{reducedDims(x)} should be used.} 76 | } 77 | \value{ 78 | A \link{graph} where nodes are cells and edges represent connections between nearest neighbors, 79 | see \code{?\link{makeSNNGraph}} for more details. 80 | } 81 | \description{ 82 | \linkS4class{SingleCellExperiment}-friendly wrapper around the \code{\link{makeSNNGraph}} and \code{\link{makeKNNGraph}} functions for creating nearest-neighbor graphs. 83 | } 84 | \examples{ 85 | library(scuttle) 86 | sce <- mockSCE(ncells=500) 87 | sce <- logNormCounts(sce) 88 | 89 | g <- buildSNNGraph(sce) 90 | clusters <- igraph::cluster_fast_greedy(g)$membership 91 | table(clusters) 92 | 93 | # Any clustering method from igraph can be used: 94 | clusters <- igraph::cluster_walktrap(g)$membership 95 | table(clusters) 96 | 97 | # Smaller 'k' usually yields finer clusters: 98 | g <- buildSNNGraph(sce, k=5) 99 | clusters <- igraph::cluster_walktrap(g)$membership 100 | table(clusters) 101 | 102 | # Graph can be built off existing reducedDims results: 103 | sce <- scater::runPCA(sce) 104 | g <- buildSNNGraph(sce, use.dimred="PCA") 105 | clusters <- igraph::cluster_fast_greedy(g)$membership 106 | table(clusters) 107 | 108 | } 109 | \seealso{ 110 | \code{\link{makeSNNGraph}} and \code{\link{makeKNNGraph}}, for the underlying functions that do the work. 111 | 112 | See \code{\link{cluster_walktrap}} and related functions in \pkg{igraph} for clustering based on the produced graph. 113 | 114 | \code{\link{clusterCells}}, for a more succinct way of performing graph-based clustering. 115 | } 116 | \author{ 117 | Aaron Lun 118 | } 119 | -------------------------------------------------------------------------------- /man/clusterCells.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/clusterCells.R 3 | \name{clusterCells} 4 | \alias{clusterCells} 5 | \title{Cluster cells in a SingleCellExperiment} 6 | \usage{ 7 | clusterCells( 8 | x, 9 | assay.type = NULL, 10 | use.dimred = NULL, 11 | BLUSPARAM = NNGraphParam(), 12 | ... 13 | ) 14 | } 15 | \arguments{ 16 | \item{x}{A \linkS4class{SummarizedExperiment} or \linkS4class{SingleCellExperiment} object containing cells in the columns.} 17 | 18 | \item{assay.type}{Integer or string specifying the assay values to use for clustering, typically log-normalized expression.} 19 | 20 | \item{use.dimred}{Integer or string specifying the reduced dimensions to use for clustering, typically PC scores. 21 | Only used when \code{assay.type=NULL}, and only applicable if \code{x} is a SingleCellExperiment.} 22 | 23 | \item{BLUSPARAM}{A \linkS4class{BlusterParam} object specifying the clustering algorithm to use, 24 | defaults to a graph-based method.} 25 | 26 | \item{...}{Further arguments to pass to \code{\link{clusterRows}}.} 27 | } 28 | \value{ 29 | A factor of cluster identities for each cell in \code{x}, 30 | or a list containing such a factor - see the return value of \code{?\link{clusterRows}}. 31 | } 32 | \description{ 33 | A \linkS4class{SingleCellExperiment}-compatible wrapper around \code{\link{clusterRows}} from the \pkg{bluster} package. 34 | } 35 | \details{ 36 | This is largely a convenience wrapper to avoid the need to manually extract the relevant assays or reduced dimensions from \code{x}. 37 | Altering \code{BLUSPARAM} can easily change the parameters or algorithm used for clustering - 38 | see \code{?"\link{BlusterParam-class}"} for more details. 39 | } 40 | \examples{ 41 | library(scuttle) 42 | sce <- mockSCE() 43 | sce <- logNormCounts(sce) 44 | 45 | # From log-expression values: 46 | clusters <- clusterCells(sce, assay.type="logcounts") 47 | 48 | # From PCs: 49 | sce <- scater::runPCA(sce) 50 | clusters2 <- clusterCells(sce, use.dimred="PCA") 51 | 52 | # With different parameters: 53 | library(bluster) 54 | clusters3 <- clusterCells(sce, use.dimred="PCA", BLUSPARAM=NNGraphParam(k=5)) 55 | 56 | # With different algorithms: 57 | clusters4 <- clusterCells(sce, use.dimred="PCA", BLUSPARAM=KmeansParam(centers=10)) 58 | 59 | } 60 | \author{ 61 | Aaron Lun 62 | } 63 | -------------------------------------------------------------------------------- /man/combineBlocks.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/combineBlocks.R 3 | \name{combineBlocks} 4 | \alias{combineBlocks} 5 | \title{Combine blockwise statistics} 6 | \usage{ 7 | combineBlocks( 8 | blocks, 9 | ave.fields, 10 | pval.field, 11 | method, 12 | geometric, 13 | equiweight, 14 | weights, 15 | valid 16 | ) 17 | } 18 | \arguments{ 19 | \item{blocks}{A list of \linkS4class{DataFrame}s containing blockwise statistics. 20 | These should have the same number of rows and the same set of columns.} 21 | 22 | \item{ave.fields}{Character vector specifying the columns of \code{blocks} to be averaged. 23 | The value of each column is averaged across blocks, potentially in a weighted manner.} 24 | 25 | \item{pval.field}{String specifying the column of \code{blocks} containing the p-value. 26 | This is combined using \code{\link{combineParallelPValues}}.} 27 | 28 | \item{method}{String specifying how p-values should be combined, see \code{?\link{combineParallelPValues}}.} 29 | 30 | \item{geometric}{Logical scalar indicating whether the geometric mean should be computed when averaging \code{ave.fields}.} 31 | 32 | \item{equiweight}{Logical scalar indicating whether each block should be given equal weight.} 33 | 34 | \item{weights}{Numeric vector of length equal to \code{blocks}, containing the weight for each block. 35 | Only used if \code{equiweight=TRUE}.} 36 | 37 | \item{valid}{Logical vector indicating whether each block is valid. 38 | Invalid blocks are still stored in the \code{per.block} output but are not used to compute the combined statistics.} 39 | } 40 | \value{ 41 | A \linkS4class{DataFrame} containing all fields in \code{ave.fields} and the p-values, 42 | where each column is created by combining the corresponding block-specific columns. 43 | A \code{per.block} column is also reported, containing a DataFrame of the DataFrames of blockwise statistics. 44 | } 45 | \description{ 46 | Combine DataFrames of statistics computed separately for each block. 47 | This usually refers to feature-level statistics and sample-level blocks. 48 | } 49 | \examples{ 50 | library(scuttle) 51 | sce <- mockSCE() 52 | 53 | y1 <- sce[,1:100] 54 | y1 <- logNormCounts(y1) # normalize separately after subsetting. 55 | results1 <- modelGeneVar(y1) 56 | 57 | y2 <- sce[,1:100 + 100] 58 | y2 <- logNormCounts(y2) # normalize separately after subsetting. 59 | results2 <- modelGeneVar(y2) 60 | 61 | # A manual implementation of combineVar: 62 | combineBlocks(list(results1, results2), 63 | ave.fields=c("mean", "total", "bio", "tech"), 64 | pval.field='p.value', 65 | method='fisher', 66 | geometric=FALSE, 67 | equiweight=TRUE, 68 | weights=NULL, 69 | valid=c(TRUE, TRUE)) 70 | 71 | } 72 | \seealso{ 73 | This function is used in \code{\link{modelGeneVar}} and friends, \code{\link{combineVar}} and \code{\link{testLinearModel}}. 74 | } 75 | \author{ 76 | Aaron Lun 77 | } 78 | -------------------------------------------------------------------------------- /man/combinePValues.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/combinePValues.R 3 | \name{combinePValues} 4 | \alias{combinePValues} 5 | \title{Combine p-values} 6 | \usage{ 7 | combinePValues( 8 | ..., 9 | method = c("fisher", "z", "simes", "berger", "holm-middle"), 10 | weights = NULL, 11 | log.p = FALSE, 12 | min.prop = 0.5 13 | ) 14 | } 15 | \arguments{ 16 | \item{...}{Two or more numeric vectors of p-values of the same length.} 17 | 18 | \item{method}{A string specifying the combining strategy to use.} 19 | 20 | \item{weights}{A numeric vector of positive weights, with one value per vector in \code{...}. 21 | Alternatively, a list of numeric vectors of weights, with one vector per element in \code{...}. 22 | This is only used when \code{method="z"}.} 23 | 24 | \item{log.p}{Logical scalar indicating whether the p-values in \code{...} are log-transformed.} 25 | 26 | \item{min.prop}{Numeric scalar in [0, 1] specifying the minimum proportion of tests to reject for each set of p-values when \code{method="holm-middle"}.} 27 | } 28 | \value{ 29 | A numeric vector containing the combined p-values. 30 | } 31 | \description{ 32 | Combine p-values from independent or dependent hypothesis tests using a variety of meta-analysis methods. 33 | This is deprecated in favor of \code{\link{combineParallelPValues}} from the \pkg{metapod} package. 34 | } 35 | \details{ 36 | This function will operate across elements on \code{...} in parallel to combine p-values. 37 | That is, the set of first p-values from all vectors will be combined, followed by the second p-values and so on. 38 | This is useful for combining p-values for each gene across different hypothesis tests. 39 | 40 | Fisher's method, Stouffer's Z method and Simes' method test the global null hypothesis that all of the individual null hypotheses in the set are true. 41 | The global null is rejected if any of the individual nulls are rejected. 42 | However, each test has different characteristics: 43 | \itemize{ 44 | \item Fisher's method requires independence of the test statistic. 45 | It is useful in asymmetric scenarios, i.e., when the null is only rejected in one of the tests in the set. 46 | Thus, a low p-value in any test is sufficient to obtain a low combined p-value. 47 | \item Stouffer's Z method require independence of the test statistic. 48 | It favours symmetric rejection and is less sensitive to a single low p-value, requiring more consistently low p-values to yield a low combined p-value. 49 | It can also accommodate weighting of the different p-values. 50 | \item Simes' method technically requires independence but tends to be quite robust to dependencies between tests. 51 | See Sarkar and Chung (1997) for details, as well as work on the related Benjamini-Hochberg method. 52 | It favours asymmetric rejection and is less powerful than the other two methods under independence. 53 | } 54 | 55 | Berger's intersection-union test examines a different global null hypothesis - 56 | that at least one of the individual null hypotheses are true. 57 | Rejection in the IUT indicates that all of the individual nulls have been rejected. 58 | This is the statistically rigorous equivalent of a naive intersection operation. 59 | 60 | In the Holm-middle approach, the global null hypothesis is that more than \code{1 - min.prop} proportion of the individual nulls in the set are true. 61 | We apply the Holm-Bonferroni correction to all p-values in the set and take the \code{ceiling(min.prop * N)}-th smallest value where \code{N} is the size of the set (excluding \code{NA} values). 62 | This method works correctly in the presence of correlations between p-values. 63 | 64 | % We apply Holm until we reject the ceil(N * min.prop)-th test, which causes us to reject the global null. 65 | % The combined p-value is thus defined as the p-value at this rejection point. 66 | } 67 | \examples{ 68 | p1 <- runif(10000) 69 | p2 <- runif(10000) 70 | p3 <- runif(10000) 71 | 72 | fish <- combinePValues(p1, p2, p3) 73 | hist(fish) 74 | 75 | z <- combinePValues(p1, p2, p3, method="z", weights=1:3) 76 | hist(z) 77 | 78 | simes <- combinePValues(p1, p2, p3, method="simes") 79 | hist(simes) 80 | 81 | berger <- combinePValues(p1, p2, p3, method="berger") 82 | hist(berger) 83 | 84 | } 85 | \references{ 86 | Fisher, R.A. (1925). 87 | \emph{Statistical Methods for Research Workers.} 88 | Oliver and Boyd (Edinburgh). 89 | 90 | Whitlock MC (2005). 91 | Combining probability from independent tests: the weighted Z-method is superior to Fisher's approach. 92 | \emph{J. Evol. Biol.} 18, 5:1368-73. 93 | 94 | Simes RJ (1986). 95 | An improved Bonferroni procedure for multiple tests of significance. 96 | \emph{Biometrika} 73:751-754. 97 | 98 | Berger RL and Hsu JC (1996). 99 | Bioequivalence trials, intersection-union tests and equivalence confidence sets. 100 | \emph{Statist. Sci.} 11, 283-319. 101 | 102 | Sarkar SK and Chung CK (1997). 103 | The Simes method for multiple hypothesis testing with positively dependent test statistics. 104 | \emph{J. Am. Stat. Assoc.} 92, 1601-1608. 105 | } 106 | \author{ 107 | Aaron Lun 108 | } 109 | -------------------------------------------------------------------------------- /man/combineVar.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/combineVar.R 3 | \name{combineVar} 4 | \alias{combineVar} 5 | \alias{combineCV2} 6 | \title{Combine variance decompositions} 7 | \usage{ 8 | combineVar( 9 | ..., 10 | method = "fisher", 11 | pval.field = "p.value", 12 | other.fields = NULL, 13 | equiweight = TRUE, 14 | ncells = NULL 15 | ) 16 | 17 | combineCV2( 18 | ..., 19 | method = "fisher", 20 | pval.field = "p.value", 21 | other.fields = NULL, 22 | equiweight = TRUE, 23 | ncells = NULL 24 | ) 25 | } 26 | \arguments{ 27 | \item{...}{Two or more \linkS4class{DataFrame}s of variance modelling results. 28 | For \code{combineVar}, these should be produced by \code{\link{modelGeneVar}} or \code{\link{modelGeneVarWithSpikes}}. 29 | For \code{combineCV2}, these should be produced by \code{\link{modelGeneCV2}} or \code{\link{modelGeneCV2WithSpikes}}. 30 | 31 | Alternatively, one or more lists of DataFrames containing variance modelling results. 32 | Mixed inputs are also acceptable, e.g., lists of DataFrames alongside the DataFrames themselves.} 33 | 34 | \item{method}{String specifying how p-values are to be combined, see \code{\link{combineParallelPValues}} for options.} 35 | 36 | \item{pval.field}{A string specifying the column name of each element of \code{...} that contains the p-value.} 37 | 38 | \item{other.fields}{A character vector specifying the fields containing other statistics to combine.} 39 | 40 | \item{equiweight}{Logical scalar indicating whether each result is to be given equal weight in the combined statistics.} 41 | 42 | \item{ncells}{Numeric vector containing the number of cells used to generate each element of \code{...}. 43 | Only used if \code{equiweight=FALSE}.} 44 | } 45 | \value{ 46 | A DataFrame with the same numeric fields as that produced by \code{\link{modelGeneVar}} or \code{\link{modelGeneCV2}}. 47 | Each row corresponds to an input gene. 48 | Each field contains the (weighted) arithmetic/geometric mean across all batches except for \code{p.value}, which contains the combined p-value based on \code{method}; 49 | and \code{FDR}, which contains the adjusted p-value using the BH method. 50 | } 51 | \description{ 52 | Combine the results of multiple variance decompositions, usually generated for the same genes across separate batches of cells. 53 | } 54 | \details{ 55 | These functions are designed to merge results from separate calls to \code{\link{modelGeneVar}}, \code{\link{modelGeneCV2}} or related functions, where each result is usually computed for a different batch of cells. 56 | Separate variance decompositions are necessary in cases where the mean-variance relationships vary across batches (e.g., different concentrations of spike-in have been added to the cells in each batch), which precludes the use of a common trend fit. 57 | By combining these results into a single set of statistics, we can apply standard strategies for feature selection in multi-batch integrated analyses. 58 | 59 | By default, statistics in \code{other.fields} contain all common non-numeric fields that are not \code{pval.field} or \code{"FDR"}. 60 | This usually includes \code{"mean"}, \code{"total"}, \code{"bio"} (for \code{combineVar}) or \code{"ratio"} (for \code{combineCV2}). 61 | \itemize{ 62 | \item For \code{combineVar}, statistics are combined by averaging them across all input DataFrames. 63 | \item For \code{combineCV2}, statistics are combined by taking the geometric mean across all inputs. 64 | } 65 | This difference between functions reflects the method by which the relevant measure of overdispersion is computed. 66 | For example, \code{"bio"} is computed by subtraction, so taking the average \code{bio} remains consistent with subtraction of the total and technical averages. 67 | Similarly, \code{"ratio"} is computed by division, so the combined \code{ratio} is consistent with division of the geometric means of the total and trend values. 68 | 69 | If \code{equiweight=FALSE}, each per-batch statistic is weighted by the number of cells used to compute it. 70 | The number of cells can be explicitly set using \code{ncells}, and is otherwise assumed to be equal for all batches. 71 | No weighting is performed by default, which ensures that all batches contribute equally to the combined statistics and avoids situations where batches with many cells dominate the output. 72 | 73 | The \code{\link{combineParallelPValues}} function is used to combine p-values across batches. 74 | The default is to use Fisher's method, which will achieve a low p-value if a gene is highly variable in any batch. 75 | Only \code{method="stouffer"} will perform any weighting of batches, and only if \code{weights} is set. 76 | } 77 | \examples{ 78 | library(scuttle) 79 | sce <- mockSCE() 80 | 81 | y1 <- sce[,1:100] 82 | y1 <- logNormCounts(y1) # normalize separately after subsetting. 83 | results1 <- modelGeneVar(y1) 84 | 85 | y2 <- sce[,1:100 + 100] 86 | y2 <- logNormCounts(y2) # normalize separately after subsetting. 87 | results2 <- modelGeneVar(y2) 88 | 89 | head(combineVar(results1, results2)) 90 | head(combineVar(results1, results2, method="simes")) 91 | head(combineVar(results1, results2, method="berger")) 92 | 93 | } 94 | \seealso{ 95 | \code{\link{modelGeneVar}} and \code{\link{modelGeneCV2}}, for two possible inputs into this function. 96 | 97 | \code{\link{combineParallelPValues}}, for details on how the p-values are combined. 98 | } 99 | \author{ 100 | Aaron Lun 101 | } 102 | -------------------------------------------------------------------------------- /man/computeMinRank.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/computeMinRank.R 3 | \name{computeMinRank} 4 | \alias{computeMinRank} 5 | \title{Compute the minimum rank} 6 | \usage{ 7 | computeMinRank(x, ties.method = "min", decreasing = TRUE) 8 | } 9 | \arguments{ 10 | \item{x}{A matrix of statistics from multiple differential comparisons (columns) and genes (rows).} 11 | 12 | \item{ties.method}{String specifying how ties should be handled.} 13 | 14 | \item{decreasing}{Logical scalar indicating whether to obtain ranks for decreasing magnitude of values in \code{x}.} 15 | } 16 | \value{ 17 | A numeric vector containing the minimum (i.e., top) rank for each gene across all comparisons. 18 | } 19 | \description{ 20 | Compute the minimum rank in a matrix of statistics, usually effect sizes from a set of differential comparisons. 21 | } 22 | \details{ 23 | For each gene, the minimum rank, a.k.a., \dQuote{min-rank} is defined by ranking values within each column of \code{x}, and then taking the minimum rank value across columns. 24 | This is most useful when the columns of \code{x} contain significance statistics or effect sizes from a single differential comparison, where larger values represent stronger differences. 25 | In this setting, the min-rank represents the highest rank that each gene achieves in any comparison. 26 | Taking all genes with min-ranks less than or equal to \eqn{T} yields the union of the top \eqn{T} DE genes from all comparisons. 27 | 28 | To illustrate, the set of genes with min-rank values of 1 will contain the top gene from each pairwise comparison to every other cluster. 29 | If we instead take all genes with min-ranks less than or equal to, say, \eqn{T = 5}, the set will consist of the \emph{union} of the top 5 genes from each pairwise comparison. 30 | Multiple genes can have the same min-rank as different genes may have the same rank across different pairwise comparisons. 31 | Conversely, the marker set may be smaller than the product of \eqn{T} and the number of other clusters, as the same gene may be shared across different comparisons. 32 | 33 | In the context of marker detection with pairwise comparisons between groups of cells, sorting by the min-rank guarantees the inclusion of genes that can distinguish between any two groups. 34 | More specifically, this approach does not explicitly favour genes that are uniquely expressed in a cluster. 35 | Rather, it focuses on combinations of genes that - together - drive separation of a cluster from the others. 36 | This is more general and robust but tends to yield a less focused marker set compared to the other methods of ranking potential markers. 37 | } 38 | \examples{ 39 | # Get min-rank by log-FC: 40 | lfcs <- matrix(rnorm(100), ncol=5) 41 | computeMinRank(lfcs) 42 | 43 | # Get min-rank by p-value: 44 | pvals <- matrix(runif(100), ncol=5) 45 | computeMinRank(pvals, decreasing=FALSE) 46 | 47 | } 48 | \seealso{ 49 | \code{\link{scoreMarkers}}, where this function is used to compute one of the effect size summaries. 50 | 51 | \code{\link{combineMarkers}}, where the same principle is used for the \code{Top} field. 52 | } 53 | -------------------------------------------------------------------------------- /man/computeSumFactors.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/computeSumFactors.R 3 | \name{computeSumFactors} 4 | \alias{computeSumFactors} 5 | \alias{calculateSumFactors} 6 | \title{Normalization by deconvolution} 7 | \usage{ 8 | computeSumFactors(...) 9 | 10 | calculateSumFactors(...) 11 | } 12 | \arguments{ 13 | \item{...}{Further arguments to pass to \code{\link{pooledSizeFactors}} or \code{\link{computePooledFactors}}.} 14 | } 15 | \value{ 16 | For \code{calculateSumFactors}, a numeric vector of size factors returned by \code{\link{pooledSizeFactors}}. 17 | 18 | For \code{computeSumFactors}, a SingleCellExperiment containing the size factors in its \code{\link{sizeFactors}}, 19 | as returned by \code{\link{computePooledFactors}}. 20 | } 21 | \description{ 22 | Scaling normalization of single-cell RNA-seq data by deconvolving size factors from cell pools. 23 | These functions have been moved to the \pkg{scuttle} package and are just retained here for compatibility. 24 | } 25 | \author{ 26 | Aaron Lun 27 | } 28 | -------------------------------------------------------------------------------- /man/convertTo.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/convertTo.R 3 | \name{convertTo} 4 | \alias{convertTo} 5 | \title{Convert to other classes} 6 | \usage{ 7 | convertTo( 8 | x, 9 | type = c("edgeR", "DESeq2", "monocle"), 10 | ..., 11 | assay.type = 1, 12 | subset.row = NULL 13 | ) 14 | } 15 | \arguments{ 16 | \item{x}{A \linkS4class{SingleCellExperiment} object.} 17 | 18 | \item{type}{A string specifying the analysis for which the object should be prepared.} 19 | 20 | \item{...}{Other arguments to be passed to pipeline-specific constructors.} 21 | 22 | \item{assay.type}{A string specifying which assay of \code{x} should be put in the returned object.} 23 | 24 | \item{subset.row}{See \code{?"\link{scran-gene-selection}"}.} 25 | } 26 | \value{ 27 | For \code{type="edgeR"}, a DGEList object is returned containing the count matrix. 28 | Size factors are converted to normalization factors. 29 | Gene-specific \code{rowData} is stored in the \code{genes} element, and cell-specific \code{colData} is stored in the \code{samples} element. 30 | 31 | For \code{type="DESeq2"}, a DESeqDataSet object is returned containing the count matrix and size factors. 32 | Additional gene- and cell-specific data is stored in the \code{mcols} and \code{colData} respectively. 33 | } 34 | \description{ 35 | Convert a \linkS4class{SingleCellExperiment} object into other classes for entry into other analysis pipelines. 36 | } 37 | \details{ 38 | This function converts an SingleCellExperiment object into various other classes in preparation for entry into other analysis pipelines, as specified by \code{type}. 39 | } 40 | \examples{ 41 | library(scuttle) 42 | sce <- mockSCE() 43 | 44 | # Adding some additional embellishments. 45 | sizeFactors(sce) <- 2^rnorm(ncol(sce)) 46 | rowData(sce)$SYMBOL <- paste0("X", seq_len(nrow(sce))) 47 | sce$other <- sample(LETTERS, ncol(sce), replace=TRUE) 48 | 49 | # Converting to various objects. 50 | convertTo(sce, type="edgeR") 51 | convertTo(sce, type="DESeq2") 52 | 53 | } 54 | \seealso{ 55 | \code{\link[edgeR]{DGEList}}, 56 | \code{\link[DESeq2:DESeqDataSet]{DESeqDataSetFromMatrix}} 57 | for specific class constructors. 58 | } 59 | \author{ 60 | Aaron Lun 61 | } 62 | -------------------------------------------------------------------------------- /man/correlateGenes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/correlateGenes.R 3 | \name{correlateGenes} 4 | \alias{correlateGenes} 5 | \title{Per-gene correlation statistics} 6 | \usage{ 7 | correlateGenes(stats) 8 | } 9 | \arguments{ 10 | \item{stats}{A \linkS4class{DataFrame} of pairwise correlation statistics, returned by \code{\link{correlatePairs}}.} 11 | } 12 | \value{ 13 | A \linkS4class{DataFrame} with one row per unique gene in \code{stats} and containing the fields: 14 | \describe{ 15 | \item{\code{gene}:}{A field of the same type as \code{stats$gene1} specifying the gene identity.} 16 | \item{\code{rho}:}{Numeric, the correlation with the largest magnitude across all gene pairs involving the corresponding gene.} 17 | \item{\code{p.value}:}{Numeric, the Simes p-value for this gene.} 18 | \item{\code{FDR}:}{Numeric, the adjusted \code{p.value} across all rows.} 19 | } 20 | } 21 | \description{ 22 | Compute per-gene correlation statistics by combining results from gene pair correlations. 23 | } 24 | \details{ 25 | For each gene, all of its pairs are identified and the corresponding p-values are combined using Simes' method. 26 | This tests whether the gene is involved in significant correlations to \emph{any} other gene. 27 | Per-gene statistics are useful for identifying correlated genes without regard to what they are correlated with (e.g., during feature selection). 28 | } 29 | \examples{ 30 | library(scuttle) 31 | sce <- mockSCE() 32 | sce <- logNormCounts(sce) 33 | pairs <- correlatePairs(sce, iters=1e5, subset.row=1:100) 34 | 35 | g.out <- correlateGenes(pairs) 36 | head(g.out) 37 | 38 | } 39 | \references{ 40 | Simes RJ (1986). 41 | An improved Bonferroni procedure for multiple tests of significance. 42 | \emph{Biometrika} 73:751-754. 43 | } 44 | \seealso{ 45 | \code{\link{correlatePairs}}, to compute \code{stats}. 46 | } 47 | \author{ 48 | Aaron Lun 49 | } 50 | -------------------------------------------------------------------------------- /man/correlateNull.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/correlateNull.R 3 | \name{correlateNull} 4 | \alias{correlateNull} 5 | \title{Build null correlations} 6 | \usage{ 7 | correlateNull( 8 | ncells, 9 | iters = 1e+06, 10 | block = NULL, 11 | design = NULL, 12 | equiweight = TRUE, 13 | BPPARAM = SerialParam() 14 | ) 15 | } 16 | \arguments{ 17 | \item{ncells}{An integer scalar indicating the number of cells in the data set.} 18 | 19 | \item{iters}{An integer scalar specifying the number of values in the null distribution.} 20 | 21 | \item{block}{A factor specifying the blocking level for each cell.} 22 | 23 | \item{design}{A numeric design matrix containing uninteresting factors to be ignored.} 24 | 25 | \item{equiweight}{A logical scalar indicating whether statistics from each block should be given equal weight. 26 | Otherwise, each block is weighted according to its number of cells. 27 | Only used if \code{block} is specified.} 28 | 29 | \item{BPPARAM}{A \linkS4class{BiocParallelParam} object that specifies the manner of parallel processing to use.} 30 | } 31 | \value{ 32 | A numeric vector of length \code{iters} is returned containing the sorted correlations under the null hypothesis of no correlations. 33 | } 34 | \description{ 35 | Build a distribution of correlations under the null hypothesis of independent expression between pairs of genes. 36 | This is now deprecated as \code{\link{correlatePairs}} uses an approximation instead. 37 | } 38 | \details{ 39 | The \code{correlateNull} function constructs an empirical null distribution for Spearman's rank correlation when it is computed with \code{ncells} cells. 40 | This is done by shuffling the ranks, calculating the correlation and repeating until \code{iters} values are obtained. 41 | No consideration is given to tied ranks, which has implications for the accuracy of p-values in \code{\link{correlatePairs}}. 42 | 43 | If \code{block} is specified, a null correlation is created within each level of \code{block} using the shuffled ranks. 44 | The final correlation is then defined as the average of the per-level correlations, 45 | weighted by the number of cells in that level if \code{equiweight=FALSE}. 46 | Levels with fewer than 3 cells are ignored, and if no level has 3 or more cells, all returned correlations will be \code{NA}. 47 | 48 | If \code{design} is specified, the same process is performed on ranks derived from simulated residuals computed by fitting the linear model to a vector of normally distributed values. 49 | If there are not at least 3 residual d.f., all returned correlations will be \code{NA}. 50 | The \code{design} argument cannot be used at the same time as \code{block}. 51 | 52 | % Yeah, we could use a t-distribution for this, but the empirical distribution is probably more robust if you have few cells (or effects, after batch correction). 53 | } 54 | \examples{ 55 | set.seed(0) 56 | ncells <- 100 57 | 58 | # Simplest case: 59 | null.dist <- correlateNull(ncells, iters=10000) 60 | hist(null.dist) 61 | 62 | # With a blocking factor: 63 | block <- sample(LETTERS[1:3], ncells, replace=TRUE) 64 | null.dist <- correlateNull(block=block, iters=10000) 65 | hist(null.dist) 66 | 67 | # With a design matrix. 68 | cov <- runif(ncells) 69 | X <- model.matrix(~cov) 70 | null.dist <- correlateNull(design=X, iters=10000) 71 | hist(null.dist) 72 | 73 | } 74 | \seealso{ 75 | \code{\link{correlatePairs}}, where the null distribution is used to compute p-values. 76 | } 77 | \author{ 78 | Aaron Lun 79 | } 80 | -------------------------------------------------------------------------------- /man/decideTestsPerLabel.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/decideTestsPerLabel.R 3 | \name{decideTestsPerLabel} 4 | \alias{decideTestsPerLabel} 5 | \alias{summarizeTestsPerLabel} 6 | \title{Decide tests for each label} 7 | \usage{ 8 | decideTestsPerLabel( 9 | results, 10 | method = c("separate", "global"), 11 | threshold = 0.05, 12 | pval.field = NULL, 13 | lfc.field = "logFC" 14 | ) 15 | 16 | summarizeTestsPerLabel(results, ...) 17 | } 18 | \arguments{ 19 | \item{results}{A \linkS4class{List} containing the output of \code{\link{pseudoBulkDGE}}. 20 | Each entry should be a DataFrame with the same number and order of rows, 21 | containing at least a numeric \code{"PValue"} column (and usually a \code{"logFC"} column). 22 | 23 | For \code{summarizeTestsPerLabel}, this may also be a matrix produced by \code{decideTestsPerLabel}.} 24 | 25 | \item{method}{String specifying whether the Benjamini-Hochberg correction should be applied across all clustesr 26 | or separately within each label.} 27 | 28 | \item{threshold}{Numeric scalar specifying the FDR threshold to consider genes as significant.} 29 | 30 | \item{pval.field}{String containing the name of the column containing the p-value in each entry of \code{results}. 31 | Defaults to \code{"PValue"}, \code{"P.Value"} or \code{"p.value"} based on fields in the first entry of \code{results}.} 32 | 33 | \item{lfc.field}{String containing the name of the column containing the log-fold change. 34 | Ignored if the column is not available Defaults to \code{"logFC"} if this field is available.} 35 | 36 | \item{...}{Further arguments to pass to \code{decideTestsPerLabel} if \code{results} is a List.} 37 | } 38 | \value{ 39 | For \code{decideTestsPerLabel}, 40 | an integer matrix indicating whether each gene (row) is significantly DE between conditions for each label (column). 41 | 42 | For \code{summarizeTestsPerLabel}, 43 | an integer matrix containing the number of genes of each DE status (column) in each label (row). 44 | } 45 | \description{ 46 | Decide which tests (i.e., genes) are significant for differential expression between conditions in each label, 47 | using the output of \code{\link{pseudoBulkDGE}}. 48 | This mimics the \code{\link{decideTests}} functionality from \pkg{limma}. 49 | } 50 | \details{ 51 | If a log-fold change field is available and specified in \code{lfc.field}, values of \code{1}, \code{-1} and \code{0} 52 | indicate that the gene is significantly upregulated, downregulated or not significant, respectively. 53 | Note, the interpretation of \dQuote{up} and \dQuote{down} depends on the design and contrast in \code{\link{pseudoBulkDGE}}. 54 | 55 | Otherwise, if no log-fold change is available or if \code{lfc.field=NULL}, 56 | values of \code{1} or \code{0} indicate that a gene is significantly DE or not, respectively. 57 | 58 | \code{NA} values indicate either that the relevant gene was low-abundance for a particular label and filtered out, 59 | or that the DE comparison for that label was not possible (e.g., no residual d.f.). 60 | } 61 | \examples{ 62 | example(pseudoBulkDGE) 63 | head(decideTestsPerLabel(out)) 64 | summarizeTestsPerLabel(out) 65 | 66 | } 67 | \seealso{ 68 | \code{\link{pseudoBulkDGE}}, which generates the input to this function. 69 | 70 | \code{\link{decideTests}}, which inspired this function. 71 | } 72 | \author{ 73 | Aaron Lun 74 | } 75 | -------------------------------------------------------------------------------- /man/defunct.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/defunct.R 3 | \name{defunct} 4 | \alias{defunct} 5 | \alias{trendVar} 6 | \alias{decomposeVar} 7 | \alias{testVar} 8 | \alias{improvedCV2} 9 | \alias{technicalCV2} 10 | \alias{makeTechTrend} 11 | \alias{multiBlockVar} 12 | \alias{multiBlockNorm} 13 | \alias{overlapExprs} 14 | \alias{parallelPCA} 15 | \alias{bootstrapCluster} 16 | \alias{clusterModularity} 17 | \alias{clusterPurity} 18 | \alias{clusterKNNGraph} 19 | \alias{clusterSNNGraph} 20 | \alias{coassignProb} 21 | \alias{createClusterMST} 22 | \alias{connectClusterMST} 23 | \alias{orderClusterMST} 24 | \alias{quickPseudotime} 25 | \alias{testPseudotime} 26 | \alias{doubletCells} 27 | \alias{doubletCluster} 28 | \alias{doubletRecovery} 29 | \title{Defunct functions} 30 | \usage{ 31 | trendVar(...) 32 | 33 | decomposeVar(...) 34 | 35 | testVar(...) 36 | 37 | improvedCV2(...) 38 | 39 | technicalCV2(...) 40 | 41 | makeTechTrend(...) 42 | 43 | multiBlockVar(...) 44 | 45 | multiBlockNorm(...) 46 | 47 | overlapExprs(...) 48 | 49 | parallelPCA(...) 50 | 51 | bootstrapCluster(...) 52 | 53 | clusterModularity(...) 54 | 55 | clusterPurity(...) 56 | 57 | clusterKNNGraph(...) 58 | 59 | clusterSNNGraph(...) 60 | 61 | coassignProb(...) 62 | 63 | createClusterMST(...) 64 | 65 | connectClusterMST(...) 66 | 67 | orderClusterMST(...) 68 | 69 | quickPseudotime(...) 70 | 71 | testPseudotime(...) 72 | 73 | doubletCells(...) 74 | 75 | doubletCluster(...) 76 | 77 | doubletRecovery(...) 78 | } 79 | \arguments{ 80 | \item{...}{Ignored arguments.} 81 | } 82 | \value{ 83 | All functions error out with a defunct message pointing towards its descendent (if available). 84 | } 85 | \description{ 86 | Functions that have passed on to the function afterlife. 87 | Their successors are also listed. 88 | } 89 | \section{Variance modelling}{ 90 | 91 | \code{trendVar}, \code{decomposeVar} and \code{testVar} are succeeded by a suite of funtions related to \code{\link{modelGeneVar}} and \code{\link{fitTrendVar}}. 92 | 93 | \code{improvedCV2} and \code{technicalCV2} are succeeded by \code{\link{modelGeneCV2}} and \code{\link{fitTrendCV2}}. 94 | 95 | \code{makeTechTrend} is succeeded by \code{\link{modelGeneVarByPoisson}}. 96 | 97 | \code{multiBlockVar} is succeeded by the \code{block} argument in many of the modelling functions, and \code{multiBlockNorm} is no longer necessary. 98 | } 99 | 100 | \section{Clustering-related functions}{ 101 | 102 | \code{bootstrapCluster} has been moved over to the \pkg{bluster} package, as the \code{\link{bootstrapStability}} function. 103 | 104 | \code{neighborsToSNNGraph} and \code{neighborsToKNNGraph} have been moved over to the \pkg{bluster} package. 105 | 106 | \code{clusterModularity} has been moved over to the \pkg{bluster} package, as the \code{\link{pairwiseModularity}} function. 107 | 108 | \code{clusterPurity} has been moved over to the \pkg{bluster} package, as the \code{\link{neighborPurity}} function. 109 | 110 | \code{clusterSNNGraph} and \code{clusterKNNGraph} have been replaced by \code{\link{clusterRows}} with \linkS4class{NNGraphParam} or \linkS4class{TwoStepParam} from the \pkg{bluster} package. 111 | 112 | \code{coassignProb} and \code{clusterRand} have been replaced by \code{\link{pairwiseRand}} from the \pkg{bluster} package. 113 | } 114 | 115 | \section{Pseudotime-related functions}{ 116 | 117 | \code{createClusterMST}, \code{quickPseudotime} and \code{testPseudotime} have been moved over to the \pkg{TSCAN} package. 118 | 119 | \code{connectClusterMST} has been moved over to the \pkg{TSCAN} package, as the \code{reportEdges} function. 120 | 121 | \code{orderClusterMST} has been moved over to the \pkg{TSCAN} package, as the \code{orderCells} function. 122 | } 123 | 124 | \section{Doublet-related functions}{ 125 | 126 | \code{doubletCells} has been moved over to the \pkg{scDblFinder} package, as the \code{computeDoubletDensity} function. 127 | 128 | \code{doubletCluster} has been moved over to the \pkg{scDblFinder} package, as the \code{findDoubletClusters} function. 129 | 130 | \code{doubletRecovery} has been moved over to the \pkg{scDblFinder} package, as the \code{recoverDoublets} function. 131 | } 132 | 133 | \section{Other functions}{ 134 | 135 | \code{overlapExprs} is succeeded by \code{\link{findMarkers}} with \code{test.type="wilcox"}. 136 | 137 | \code{parallelPCA} has been moved over to the \pkg{PCAtools} package. 138 | } 139 | 140 | \examples{ 141 | try(trendVar()) 142 | } 143 | \author{ 144 | Aaron Lun 145 | } 146 | -------------------------------------------------------------------------------- /man/fitTrendCV2.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fitTrendCV2.R 3 | \name{fitTrendCV2} 4 | \alias{fitTrendCV2} 5 | \title{Fit a trend to the CV2} 6 | \usage{ 7 | fitTrendCV2( 8 | means, 9 | cv2, 10 | ncells, 11 | min.mean = 0.1, 12 | nls.args = list(), 13 | simplified = TRUE, 14 | nmads = 6, 15 | max.iter = 50 16 | ) 17 | } 18 | \arguments{ 19 | \item{means}{A numeric vector containing mean normalized expression values for all genes.} 20 | 21 | \item{cv2}{A numeric vector containing the squared coefficient of variation computed from normalized expression values for all genes.} 22 | 23 | \item{ncells}{Integer scalar specifying the number of cells used to compute \code{cv2} and \code{means}.} 24 | 25 | \item{min.mean}{Numeric scalar specifying the minimum mean to use for trend fitting.} 26 | 27 | \item{nls.args}{A list of parameters to pass to \code{\link{nls}}.} 28 | 29 | \item{simplified}{Logical scalar indicating whether the function can automatically use a simpler trend if errors are encountered for the usual paramterization.} 30 | 31 | \item{nmads}{Numeric scalar specifying the number of MADs to use to compute the tricube bandwidth during robustification.} 32 | 33 | \item{max.iter}{Integer scalar specifying the maximum number of robustness iterations to perform.} 34 | } 35 | \value{ 36 | A named list is returned containing: 37 | \describe{ 38 | \item{\code{trend}:}{A function that returns the fitted value of the trend at any value of the mean.} 39 | \item{\code{std.dev}:}{A numeric scalar containing the robust standard deviation of the ratio of \code{var} to the fitted value of the trend across all features used for trend fitting.} 40 | } 41 | } 42 | \description{ 43 | Fit a mean-dependent trend to the squared coefficient of variation, 44 | computed from count data after size factor normalization. 45 | } 46 | \details{ 47 | This function fits a mean-dependent trend to the CV2 of normalized expression values for the selected features. 48 | Specifically, it fits a trend of the form 49 | \deqn{y = A + \frac{B}{x}}{y = A + B/x} 50 | using an iteratively reweighted least-squares approach implemented via \code{\link{nls}}. 51 | This trend is based on a similar formulation from \pkg{DESeq2} and generally captures the mean-CV2 trend well. 52 | 53 | Trend fitting is performed after weighting each observation according to the inverse of the density of observations at the same mean. 54 | This avoids problems with differences in the distribution of means that would otherwise favor good fits in highly dense intervals at the expense of sparser intervals. 55 | Low-abundance genes with means below \code{min.mean} are also removed prior to fitting, to avoid problems with discreteness and the upper bound on the CV2 at low counts. 56 | 57 | Robustness iterations are also performed to protect against outliers. 58 | An initial fit is performed and each observation is weighted using tricube-transformed standardized residuals (in addition to the existing inverse-density weights). 59 | The bandwidth of the tricube scheme is defined as \code{nmads} multiplied by the median standardized residual. 60 | Iterations are performed until convergence or \code{max.iters} is reached. 61 | 62 | Occasionally, there are not enough high-abundance points to uniquely determine the \eqn{A} parameter. 63 | In such cases, the function collapses back to fitting a simpler trend 64 | \deqn{y = \frac{B}{x}}{y = B/x} 65 | to avoid errors about singular gradients in \code{\link{nls}}. 66 | If \code{simplified=FALSE}, this simplification is not allowed and the error is directly reported. 67 | } 68 | \examples{ 69 | library(scuttle) 70 | sce <- mockSCE() 71 | normcounts <- normalizeCounts(sce, log=FALSE) 72 | 73 | # Fitting a trend: 74 | library(DelayedMatrixStats) 75 | means <- rowMeans(normcounts) 76 | cv2 <- rowVars(normcounts)/means^2 77 | fit <- fitTrendCV2(means, cv2, ncol(sce)) 78 | 79 | # Examining the trend fit: 80 | plot(means, cv2, pch=16, cex=0.5, 81 | xlab="Mean", ylab="CV2", log="xy") 82 | curve(fit$trend(x), add=TRUE, col="dodgerblue", lwd=3) 83 | 84 | } 85 | \references{ 86 | Brennecke P, Anders S, Kim JK et al. (2013). 87 | Accounting for technical noise in single-cell RNA-seq experiments. 88 | \emph{Nat. Methods} 10:1093-95 89 | } 90 | \seealso{ 91 | \code{\link{modelGeneCV2}} and \code{\link{modelGeneCV2WithSpikes}}, where this function is used. 92 | } 93 | \author{ 94 | Aaron Lun 95 | } 96 | -------------------------------------------------------------------------------- /man/fitTrendPoisson.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fitTrendPoisson.R 3 | \name{fitTrendPoisson} 4 | \alias{fitTrendPoisson} 5 | \title{Generate a trend for Poisson noise} 6 | \usage{ 7 | fitTrendPoisson( 8 | means, 9 | size.factors, 10 | npts = 1000, 11 | dispersion = 0, 12 | pseudo.count = 1, 13 | BPPARAM = SerialParam(), 14 | ... 15 | ) 16 | } 17 | \arguments{ 18 | \item{means}{A numeric vector of length 2 or more, containing the range of mean counts observed in the dataset.} 19 | 20 | \item{size.factors}{A numeric vector of size factors for all cells in the dataset.} 21 | 22 | \item{npts}{An integer scalar specifying the number of interpolation points to use.} 23 | 24 | \item{dispersion}{A numeric scalar specifying the dispersion for the NB distribution. 25 | If zero, a Poisson distribution is used.} 26 | 27 | \item{pseudo.count}{A numeric scalar specifying the pseudo-count to be added to the scaled counts before log-transformation.} 28 | 29 | \item{BPPARAM}{A \linkS4class{BiocParallelParam} object indicating how parallelization should be performed across interpolation points.} 30 | 31 | \item{...}{Further arguments to pass to \code{\link{fitTrendVar}} for trend fitting.} 32 | } 33 | \value{ 34 | A named list is returned containing: 35 | \describe{ 36 | \item{\code{trend}:}{A function that returns the fitted value of the trend at any value of the mean.} 37 | \item{\code{std.dev}:}{A numeric scalar containing the robust standard deviation of the ratio of \code{var} to the fitted value of the trend across all features used for trend fitting.} 38 | } 39 | } 40 | \description{ 41 | Create a mean-variance trend for log-normalized expression values derived from Poisson-distributed counts. 42 | } 43 | \details{ 44 | This function is useful for modelling technical noise in highly diverse datasets without spike-ins, 45 | where fitting a trend to the endogenous genes would not be appropriate given the strong biological heterogeneity. 46 | It is mostly intended for UMI datasets where the technical noise is close to Poisson-distributed. 47 | 48 | This function operates by simulating Poisson or negative binomial-distributed counts, 49 | computing log-transformed normalized expression values from those counts, 50 | calculating the mean and variance and then passing those metrics to \code{\link{fitTrendVar}}. 51 | The log-transformation ensures that variance is modelled in the same space that is used for downstream analyses like PCA. 52 | 53 | Simulations are performed across a range of values in \code{means} to achieve reliable interpolation, 54 | with the stability of the trend determined by the number of simulation points in \code{npts}. 55 | The number of cells is determined from the length of \code{size.factors}, 56 | which are used to scale the distribution means prior to sampling counts. 57 | } 58 | \examples{ 59 | # Mocking up means and size factors: 60 | sf <- 2^rnorm(1000, sd=0.1) 61 | sf <- sf/mean(sf) 62 | means <- rexp(100, 0.1) 63 | 64 | # Using these to construct a Poisson trend: 65 | out <- fitTrendPoisson(means, sf) 66 | curve(out$trend(x), xlim=c(0, 10)) 67 | 68 | } 69 | \seealso{ 70 | \code{\link{fitTrendVar}}, which is used to fit the trend. 71 | } 72 | \author{ 73 | Aaron Lun 74 | } 75 | -------------------------------------------------------------------------------- /man/fixedPCA.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fixedPCA.R 3 | \name{fixedPCA} 4 | \alias{fixedPCA} 5 | \title{PCA with a fixed number of components} 6 | \usage{ 7 | fixedPCA( 8 | x, 9 | rank = 50, 10 | value = c("pca", "lowrank"), 11 | subset.row, 12 | preserve.shape = TRUE, 13 | assay.type = "logcounts", 14 | name = NULL, 15 | BSPARAM = bsparam(), 16 | BPPARAM = SerialParam() 17 | ) 18 | } 19 | \arguments{ 20 | \item{x}{A \linkS4class{SingleCellExperiment} object containing a log-expression amtrix.} 21 | 22 | \item{rank}{Integer scalar specifying the number of components.} 23 | 24 | \item{value}{String specifying the type of value to return. 25 | \code{"pca"} will return the PCs, \code{"n"} will return the number of retained components, 26 | and \code{"lowrank"} will return a low-rank approximation.} 27 | 28 | \item{subset.row}{A logical, character or integer vector specifying the rows of \code{x} to use in the PCA. 29 | Defaults to \code{NULL} (i.e., all rows used) with a warning.} 30 | 31 | \item{preserve.shape}{Logical scalar indicating whether or not the output SingleCellExperiment should be subsetted to \code{subset.row}. 32 | Only used if \code{subset.row} is not \code{NULL}.} 33 | 34 | \item{assay.type}{A string specifying which assay values to use.} 35 | 36 | \item{name}{String containing the name which which to store the results. 37 | Defaults to \code{"PCA"} in the \code{\link{reducedDimNames}} for \code{value="pca"} and \code{"lowrank"} in the \code{\link{assays}} for \code{value="lowrank"}.} 38 | 39 | \item{BSPARAM}{A \linkS4class{BiocSingularParam} object specifying the algorithm to use for PCA.} 40 | 41 | \item{BPPARAM}{A \linkS4class{BiocParallelParam} object to use for parallel processing.} 42 | } 43 | \value{ 44 | A modified \code{x} with: 45 | \itemize{ 46 | \item the PC results stored in the \code{\link{reducedDims}} as a \code{"PCA"} entry, if \code{type="pca"}. 47 | The attributes contain the rotation matrix, the variance explained and the percentage of variance explained. 48 | (Note that the last may not sum to 100\% if \code{max.rank} is smaller than the total number of PCs.) 49 | \item a low-rank approximation stored as a new \code{"lowrank"} assay, if \code{type="lowrank"}. 50 | This is represented as a \linkS4class{LowRankMatrix}. 51 | } 52 | } 53 | \description{ 54 | Perform a PCA where the desired number of components is known ahead of time. 55 | } 56 | \details{ 57 | In theory, there is an optimal number of components for any given application, 58 | but in practice, the criterion for the optimum is difficult to define. 59 | As a result, it is often satisfactory to take an \emph{a priori}-defined \dQuote{reasonable} number of PCs for downstream analyses. 60 | A good rule of thumb is to set this to the upper bound on the expected number of subpopulations in the dataset 61 | (see the reasoning in \code{\link{getClusteredPCs}}. 62 | 63 | We can use \code{subset.row} to perform the PCA on a subset of genes. 64 | This is typically used to subset to HVGs to reduce computational time and increase the signal-to-noise ratio of downstream analyses. 65 | If \code{preserve.shape=TRUE}, the rotation matrix is extrapolated to include loadings for \dQuote{unselected} genes, i.e., not in \code{subset.row}. 66 | This is done by projecting their expression profiles into the low-dimensional space defined by the SVD on the selected genes. 67 | By doing so, we ensure that the output always has the same number of rows as \code{x} such that any \code{value="lowrank"} can fit into the assays. 68 | 69 | Otherwise, if \code{preserve.shape=FALSE}, the output is subsetted by any non-\code{NULL} value of \code{subset.row}. 70 | This is equivalent to the return value after calling the function on \code{x[subset.row,]}. 71 | } 72 | \examples{ 73 | library(scuttle) 74 | sce <- mockSCE() 75 | sce <- logNormCounts(sce) 76 | 77 | # Modelling the variance: 78 | var.stats <- modelGeneVar(sce) 79 | hvgs <- getTopHVGs(var.stats, n=1000) 80 | 81 | # Defaults to pulling out the top 50 PCs. 82 | set.seed(1000) 83 | sce <- fixedPCA(sce, subset.row=hvgs) 84 | reducedDimNames(sce) 85 | 86 | # Get the percentage of variance explained. 87 | attr(reducedDim(sce), "percentVar") 88 | 89 | } 90 | \seealso{ 91 | \code{\link{denoisePCA}}, where the number of PCs is automatically chosen. 92 | 93 | \code{\link{getClusteredPCs}}, another method to choose the number of PCs. 94 | } 95 | \author{ 96 | Aaron Lun 97 | } 98 | -------------------------------------------------------------------------------- /man/gene_selection.Rd: -------------------------------------------------------------------------------- 1 | \name{Gene selection} 2 | \alias{scran-gene-selection} 3 | 4 | \title{Gene selection} 5 | \description{Details on how gene selection is performed in almost all \pkg{scran} functions.} 6 | 7 | \section{Subsetting by row}{ 8 | For functions accepting some gene-by-cell matrix \code{x}, we can choose to perform calculations only on a subset of rows (i.e., genes) with the \code{subset.row} argument. 9 | This can be a logical, integer or character vector indicating the rows of \code{x} to use. 10 | If a character vector, it must contain the names of the rows in \code{x}. 11 | Future support will be added for more esoteric subsetting vectors like the Bioconductor \linkS4class{Rle} classes. 12 | 13 | The output of running a function with \code{subset.row} will \emph{always} be the same as the output of subsetting \code{x} beforehand and passing it into the function. 14 | However, it is often more efficient to use \code{subset.row} as we can avoid constructing an intermediate subsetted matrix. 15 | The same reasoning applies for any \code{x} that is a \linkS4class{SingleCellExperiment} object. 16 | } 17 | 18 | \section{Filtering by mean}{ 19 | Some functions will have a \code{min.mean} argument to filter out low-abundance genes prior to processing. 20 | Depending on the function, the filter may be applied to the average library size-adjusted count computed by \code{\link{calculateAverage}}, the average log-count, 21 | or some other measure of abundance - see the documentation for each function for details. 22 | 23 | Any filtering on \code{min.mean} is automatically intersected with a specified \code{subset.row}. 24 | For example, only subsetted genes that pass the filter are retained if \code{subset.row} is specified alongside \code{min.mean}. 25 | } 26 | 27 | \author{ 28 | Aaron Lun 29 | } 30 | -------------------------------------------------------------------------------- /man/getClusteredPCs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/getClusteredPCs.R 3 | \name{getClusteredPCs} 4 | \alias{getClusteredPCs} 5 | \title{Use clusters to choose the number of PCs} 6 | \usage{ 7 | getClusteredPCs( 8 | pcs, 9 | FUN = NULL, 10 | ..., 11 | BLUSPARAM = NNGraphParam(), 12 | min.rank = 5, 13 | max.rank = ncol(pcs), 14 | by = 1 15 | ) 16 | } 17 | \arguments{ 18 | \item{pcs}{A numeric matrix of PCs, where rows are cells and columns are dimensions representing successive PCs.} 19 | 20 | \item{FUN}{A clustering function that takes a numeric matrix with rows as cells and 21 | returns a vector containing a cluster label for each cell. 22 | Defaults to \code{\link{clusterRows}}.} 23 | 24 | \item{...}{Further arguments to pass to \code{FUN}. 25 | Ignored if \code{FUN=NULL}, use \code{BLUSPARAM} instead.} 26 | 27 | \item{BLUSPARAM}{A \linkS4class{BlusterParam} object specifying the clustering to use when \code{FUN=NULL}.} 28 | 29 | \item{min.rank}{Integer scalar specifying the minimum number of PCs to use.} 30 | 31 | \item{max.rank}{Integer scalar specifying the maximum number of PCs to use.} 32 | 33 | \item{by}{Integer scalar specifying what intervals should be tested between \code{min.rank} and \code{max.rank}.} 34 | } 35 | \value{ 36 | A \linkS4class{DataFrame} with one row per tested number of PCs. 37 | This contains the fields: 38 | \describe{ 39 | \item{\code{n.pcs}:}{Integer scalar specifying the number of PCs used.} 40 | \item{\code{n.clusters}:}{Integer scalar specifying the number of clusters identified.} 41 | \item{\code{clusters}:}{A \linkS4class{List} containing the cluster identities for this number of PCs.} 42 | } 43 | The metadata of the DataFrame contains \code{chosen}, 44 | an integer scalar specifying the \dQuote{ideal} number of PCs to use. 45 | } 46 | \description{ 47 | Cluster cells after using varying number of PCs, 48 | and pick the number of PCs using a heuristic based on the number of clusters. 49 | } 50 | \details{ 51 | Assume that the data contains multiple subpopulations, each of which is separated from the others on a different axis. 52 | For example, each subpopulation could be defined by a unique set of marker genes that drives separation on its own PC. 53 | If we had \eqn{x} subpopulations, we would need at least \eqn{x-1} PCs to successfully distinguish all of them. 54 | This motivates the choice of the number of PCs provided we know the number of subpopulations in the data. 55 | 56 | In practice, we do not know the number of subpopulations so we use the number of clusters as a proxy instead. 57 | We apply a clustering function \code{FUN} on the first \eqn{d} PCs, 58 | and only consider the values of \eqn{d} that yield no more than \eqn{d+1} clusters. 59 | If we see more clusters with fewer dimensions, 60 | we consider this to represent overclustering rather than distinct subpopulations, 61 | as multiple subpopulations should not be distinguishable on the same axes (based on the assumption above). 62 | 63 | We choose \eqn{d} that satisfies the constraint above and maximizes the number of clusters. 64 | The idea is that more PCs should include more biological signal, allowing \code{FUN} to detect more distinct subpopulations; 65 | until the point that the extra signal outweights the added noise at high dimensions, 66 | such that resolution decreases and it becomes more difficult for \code{FUN} to distinguish between subpopulations. 67 | 68 | Any \code{FUN} can be used that automatically chooses the number of clusters based on the data. 69 | The default is a graph-based clustering method using \code{\link{makeSNNGraph}} and \code{\link{cluster_walktrap}}, 70 | where arguments in \code{...} are passed to the former. 71 | Users should not supply \code{FUN} where the number of clusters is fixed in advance, 72 | (e.g., k-means, hierarchical clustering with known \code{k} in \code{\link{cutree}}). 73 | 74 | The identities of the output clusters are returned at each step for comparison, e.g., using methods like \pkg{clustree}. 75 | } 76 | \examples{ 77 | library(scuttle) 78 | sce <- mockSCE() 79 | sce <- logNormCounts(sce) 80 | 81 | sce <- scater::runPCA(sce) 82 | output <- getClusteredPCs(reducedDim(sce)) 83 | output 84 | 85 | metadata(output)$chosen 86 | 87 | } 88 | \seealso{ 89 | \code{\link{runPCA}}, to compute the PCs in the first place. 90 | 91 | \code{\link{clusterRows}} and \linkS4class{BlusterParam}, for possible choices of \code{BLUSPARAM}. 92 | } 93 | \author{ 94 | Aaron Lun 95 | } 96 | -------------------------------------------------------------------------------- /man/getMarkerEffects.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/getMarkerEffects.R 3 | \name{getMarkerEffects} 4 | \alias{getMarkerEffects} 5 | \title{Get marker effect sizes} 6 | \usage{ 7 | getMarkerEffects(x, prefix = "logFC", strip = TRUE, remove.na.col = FALSE) 8 | } 9 | \arguments{ 10 | \item{x}{A \linkS4class{DataFrame} containing marker statistics for a given group/cluster, 11 | usually one element of the List returned by \code{\link{findMarkers}}.} 12 | 13 | \item{prefix}{String containing the prefix for the columns containing the effect size.} 14 | 15 | \item{strip}{Logical scalar indicating whether the prefix should be removed from the output column names.} 16 | 17 | \item{remove.na.col}{Logical scalar indicating whether to remove columns containing any \code{NA}s.} 18 | } 19 | \value{ 20 | A numeric matrix containing the effect sizes for the comparison to every other group/cluster. 21 | } 22 | \description{ 23 | Utility function to extract the marker effect sizes as a matrix from the output of \code{\link{findMarkers}}. 24 | } 25 | \details{ 26 | Setting \code{remove.na.col=TRUE} may be desirable in applications involving blocked comparisons, 27 | where some pairwise comparisons are not possible if the relevant levels occur in different blocks. 28 | In such cases, the resulting column is filled with \code{NA}s that may interfere with downstream steps like clustering. 29 | } 30 | \examples{ 31 | library(scuttle) 32 | sce <- mockSCE() 33 | sce <- logNormCounts(sce) 34 | 35 | kout <- kmeans(t(logcounts(sce)), centers=4) 36 | out <- findMarkers(sce, groups=kout$cluster) 37 | 38 | eff1 <- getMarkerEffects(out[[1]]) 39 | str(eff1) 40 | 41 | } 42 | \seealso{ 43 | \code{\link{findMarkers}} and \code{\link{combineMarkers}}, to generate the DataFrames. 44 | } 45 | \author{ 46 | Aaron Lun 47 | } 48 | -------------------------------------------------------------------------------- /man/getTopHVGs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/getTopHVGs.R 3 | \name{getTopHVGs} 4 | \alias{getTopHVGs} 5 | \title{Identify HVGs} 6 | \usage{ 7 | getTopHVGs( 8 | stats, 9 | var.field = "bio", 10 | n = NULL, 11 | prop = NULL, 12 | var.threshold = 0, 13 | fdr.field = "FDR", 14 | fdr.threshold = NULL, 15 | row.names = !is.null(rownames(stats)) 16 | ) 17 | } 18 | \arguments{ 19 | \item{stats}{A \linkS4class{DataFrame} of variance modelling statistics with one row per gene. 20 | Alternatively, a \linkS4class{SummarizedExperiment} object, in which case it is supplied to \code{\link{modelGeneVar}} to generate the required DataFrame.} 21 | 22 | \item{var.field}{String specifying the column of \code{stats} containing the relevant metric of variation.} 23 | 24 | \item{n}{Integer scalar specifying the number of top HVGs to report.} 25 | 26 | \item{prop}{Numeric scalar specifying the proportion of genes to report as HVGs.} 27 | 28 | \item{var.threshold}{Numeric scalar specifying the minimum threshold on the metric of variation.} 29 | 30 | \item{fdr.field}{String specifying the column of \code{stats} containing the adjusted p-values. 31 | If \code{NULL}, no filtering is performed on the FDR.} 32 | 33 | \item{fdr.threshold}{Numeric scalar specifying the FDR threshold.} 34 | 35 | \item{row.names}{Logical scalar indicating whether row names should be reported.} 36 | } 37 | \value{ 38 | A character vector containing the names of the most variable genes, if \code{row.names=TRUE}. 39 | 40 | Otherwise, an integer vector specifying the indices of \code{stats} containing the most variable genes. 41 | } 42 | \description{ 43 | Define a set of highly variable genes, based on variance modelling statistics 44 | from \code{\link{modelGeneVar}} or related functions. 45 | } 46 | \details{ 47 | This function will identify all genes where the relevant metric of variation is greater than \code{var.threshold}. 48 | By default, this means that we retain all genes with positive values in the \code{var.field} column of \code{stats}. 49 | If \code{var.threshold=NULL}, the minimum threshold on the value of the metric is not applied. 50 | 51 | If \code{fdr.threshold} is specified, we further subset to genes that have FDR less than or equal to \code{fdr.threshold}. 52 | By default, FDR thresholding is turned off as \code{\link{modelGeneVar}} and related functions 53 | determine significance of large variances \emph{relative} to other genes. 54 | This can be overly conservative if many genes are highly variable. 55 | 56 | If \code{n=NULL} and \code{prop=NULL}, the resulting subset of genes is directly returned. 57 | Otherwise, the top set of genes with the largest values of the variance metric are returned, 58 | where the size of the set is defined as the larger of \code{n} and \code{prop*nrow(stats)}. 59 | } 60 | \examples{ 61 | library(scuttle) 62 | sce <- mockSCE() 63 | sce <- logNormCounts(sce) 64 | 65 | stats <- modelGeneVar(sce) 66 | str(getTopHVGs(stats)) 67 | str(getTopHVGs(stats, fdr.threshold=0.05)) # more stringent 68 | 69 | # Or directly pass in the SingleCellExperiment: 70 | str(getTopHVGs(sce)) 71 | 72 | # Alternatively, use with the coefficient of variation: 73 | stats2 <- modelGeneCV2(sce) 74 | str(getTopHVGs(stats2, var.field="ratio")) 75 | 76 | } 77 | \seealso{ 78 | \code{\link{modelGeneVar}} and friends, to generate \code{stats}. 79 | 80 | \code{\link{modelGeneCV2}} and friends, to also generate \code{stats}. 81 | } 82 | \author{ 83 | Aaron Lun 84 | } 85 | -------------------------------------------------------------------------------- /man/getTopMarkers.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/getTopMarkers.R 3 | \name{getTopMarkers} 4 | \alias{getTopMarkers} 5 | \title{Get top markers} 6 | \usage{ 7 | getTopMarkers( 8 | de.lists, 9 | pairs, 10 | n = 10, 11 | pval.field = "p.value", 12 | fdr.field = "FDR", 13 | pairwise = TRUE, 14 | pval.type = c("any", "some", "all"), 15 | fdr.threshold = 0.05, 16 | ... 17 | ) 18 | } 19 | \arguments{ 20 | \item{de.lists}{A list-like object where each element is a data.frame or \linkS4class{DataFrame}. 21 | Each element should represent the results of a pairwise comparison between two groups/clusters, 22 | in which each row should contain the statistics for a single gene/feature. 23 | Rows should be named by the feature name in the same order for all elements.} 24 | 25 | \item{pairs}{A matrix, data.frame or \linkS4class{DataFrame} with two columns and number of rows equal to the length of \code{de.lists}. 26 | Each row should specify the pair of clusters being compared for the corresponding element of \code{de.lists}.} 27 | 28 | \item{n}{Integer scalar specifying the number of markers to obtain from each pairwise comparison, if \code{pairwise=FALSE}. 29 | 30 | Otherwise, the number of top genes to take from each cluster's combined marker set, see Details.} 31 | 32 | \item{pval.field}{String specifying the column of each DataFrame in \code{de.lists} to use to identify top markers. 33 | Smaller values are assigned higher rank.} 34 | 35 | \item{fdr.field}{String specifying the column containing the adjusted p-values.} 36 | 37 | \item{pairwise}{Logical scalar indicating whether top markers should be returned for every pairwise comparison. 38 | If \code{FALSE}, one marker set is returned for every cluster.} 39 | 40 | \item{pval.type}{String specifying how markers from pairwise comparisons are to be combined if \code{pairwise=FALSE}. 41 | This has the same effect as \code{pval.type} in \code{\link{combineMarkers}}.} 42 | 43 | \item{fdr.threshold}{Numeric scalar specifying the FDR threshold for filtering. 44 | If \code{NULL}, no filtering is performed on the FDR.} 45 | 46 | \item{...}{Further arguments to pass to \code{\link{combineMarkers}} if \code{pairwise=FALSE}.} 47 | } 48 | \value{ 49 | If \code{pairwise=TRUE}, a \linkS4class{List} of Lists of character vectors is returned. 50 | Each element of the outer list corresponds to cluster X, each element of the inner list corresponds to another cluster Y, 51 | and each character vector specifies the marker genes that distinguish X from Y. 52 | 53 | If \code{pairwise=FALSE}, a List of character vectors is returned. 54 | Each character vector contains the marker genes that distinguish X from any, some or all other clusters, 55 | depending on \code{combine.type}. 56 | } 57 | \description{ 58 | Obtain the top markers for each pairwise comparison between clusters, or for each cluster. 59 | } 60 | \details{ 61 | This is a convenience utility that converts the results of pairwise comparisons into a marker list 62 | that can be used in downstream functions, e.g., as the marker sets in \pkg{SingleR}. 63 | By default, it returns a list of lists containing the top genes for every pairwise comparison, 64 | which is useful for feature selection to select genes distinguishing between closely related clusters. 65 | The top \code{n} genes are chosen with adjusted p-values below \code{fdr.threshold}. 66 | 67 | If \code{pairwise=FALSE}, \code{\link{combineMarkers}} is called on \code{de.lists} and \code{pairs} 68 | to obtain a per-cluster ranking of genes from all pairwise comparisons involving that cluster. 69 | If \code{pval.type="any"}, the top genes with \code{Top} values no greater than \code{n} are retained; 70 | this is equivalent to taking the union of the top \code{n} genes from each pairwise comparison for each cluster. 71 | Otherwise, the top \code{n} genes with the smallest p-values are retained. 72 | In both cases, genes are further filtered by \code{fdr.threshold}. 73 | } 74 | \examples{ 75 | library(scuttle) 76 | sce <- mockSCE() 77 | sce <- logNormCounts(sce) 78 | 79 | # Any clustering method is okay. 80 | kout <- kmeans(t(logcounts(sce)), centers=3) 81 | 82 | out <- pairwiseTTests(logcounts(sce), 83 | groups=paste0("Cluster", kout$cluster)) 84 | 85 | # Getting top pairwise markers: 86 | top <- getTopMarkers(out$statistics, out$pairs) 87 | top[[1]] 88 | top[[1]][[2]] 89 | 90 | # Getting top per-cluster markers: 91 | top <- getTopMarkers(out$statistics, out$pairs, pairwise=FALSE) 92 | top[[1]] 93 | } 94 | \seealso{ 95 | \code{\link{pairwiseTTests}} and friends, to obtain \code{de.lists} and \code{pairs}. 96 | 97 | \code{\link{combineMarkers}}, for another function that consolidates pairwise DE comparisons. 98 | } 99 | \author{ 100 | Aaron Lun 101 | } 102 | -------------------------------------------------------------------------------- /man/logBH.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils_markers.R 3 | \name{.logBH} 4 | \alias{.logBH} 5 | \title{BH correction on log-p-values} 6 | \usage{ 7 | .logBH(log.p.val) 8 | } 9 | \arguments{ 10 | \item{log.p.val}{Numeric vector of log-transformed p-values.} 11 | } 12 | \value{ 13 | A numeric vector of the same length as \code{log.p.val} containing log-transformed BH-corrected p-values. 14 | } 15 | \description{ 16 | Perform a Benjamini-Hochberg correction on log-transformed p-values to get log-adjusted p-values, 17 | without the loss of precision from undoing and redoing the log-transformations. 18 | } 19 | \examples{ 20 | log.p.values <- log(runif(1000)) 21 | obs <- .logBH(log.p.values) 22 | head(obs) 23 | 24 | ref <- log(p.adjust(exp(log.p.values), method="BH")) 25 | head(ref) 26 | 27 | } 28 | \author{ 29 | Aaron Lun 30 | } 31 | -------------------------------------------------------------------------------- /man/multiMarkerStats.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/multiMarkerStats.R 3 | \name{multiMarkerStats} 4 | \alias{multiMarkerStats} 5 | \title{Combine multiple sets of marker statistics} 6 | \usage{ 7 | multiMarkerStats(..., repeated = NULL, sorted = TRUE) 8 | } 9 | \arguments{ 10 | \item{...}{Two or more lists or \linkS4class{List}s produced by \code{\link{findMarkers}} or \code{\link{combineMarkers}}. 11 | Each list should contain \linkS4class{DataFrame}s of results, one for each group/cluster of cells. 12 | 13 | The names of each List should be the same; the universe of genes in each DataFrame should be the same; 14 | and the same number of columns in each DataFrame should be named. 15 | All elements in \code{...} are also expected to be named.} 16 | 17 | \item{repeated}{Character vector of columns that are present in one or more DataFrames but should only be reported once. 18 | Typically used to avoid reporting redundant copies of annotation-related columns.} 19 | 20 | \item{sorted}{Logical scalar indicating whether each output DataFrame should be sorted by some relevant statistic.} 21 | } 22 | \value{ 23 | A named List of DataFrames with one DataFrame per group/cluster. 24 | Each DataFrame contains statistics from the corresponding entry of each List in \code{...}, 25 | prefixed with the name of the List. 26 | In addition, several combined statistics are reported: 27 | \itemize{ 28 | \item \code{Top}, the largest rank of each gene across all DataFrames for that group. 29 | This is only reported if each list in \code{...} was generated with \code{pval.type="any"} in \code{\link{combineMarkers}}. 30 | \item \code{p.value}, the largest p-value of each gene across all DataFrames for that group. 31 | This is replaced by \code{log.p.value} if p-values in \code{...} are log-transformed. 32 | \item \code{FDR}, the BH-adjusted value of \code{p.value}. 33 | This is replaced by \code{log.FDR} if p-values in \code{...} are log-transformed. 34 | } 35 | } 36 | \description{ 37 | Combine multiple sets of marker statistics, typically from different tests, 38 | into a single \linkS4class{DataFrame} for convenient inspection. 39 | } 40 | \details{ 41 | The combined statistics are designed to favor a gene that is highly ranked in each of the individual test results. 42 | This is highly conservative and aims to identify robust DE that is significant under all testing schemes. 43 | 44 | A combined \code{Top} value of T indicates that the gene is among the top T genes of one or more pairwise comparisons 45 | in each of the testing schemes. 46 | (We can be even more aggressive if the individual results were generated with a larger \code{min.prop} value.) 47 | In effect, a gene can only achieve a low \code{Top} value if it is consistently highly ranked in each test. 48 | If \code{sorted=TRUE}, this is used to order the genes in the output DataFrame. 49 | 50 | The combined \code{p.value} is effectively the result of applying an intersection-union test to the per-test results. 51 | This will only be low if the gene has a low p-value in each of the test results. 52 | If \code{sorted=TRUE} and \code{Top} is not present, this will be used to order the genes in the output DataFrame. 53 | } 54 | \examples{ 55 | library(scuttle) 56 | sce <- mockSCE() 57 | sce <- logNormCounts(sce) 58 | 59 | # Any clustering method is okay, only using k-means for convenience. 60 | kout <- kmeans(t(logcounts(sce)), centers=4) 61 | 62 | tout <- findMarkers(sce, groups=kout$cluster, direction="up") 63 | wout <- findMarkers(sce, groups=kout$cluster, direction="up", test="wilcox") 64 | 65 | combined <- multiMarkerStats(t=tout, wilcox=wout) 66 | colnames(combined[[1]]) 67 | 68 | } 69 | \seealso{ 70 | \code{\link{findMarkers}} and \code{\link{combineMarkers}}, to generate elements in \code{...}. 71 | } 72 | \author{ 73 | Aaron Lun 74 | } 75 | -------------------------------------------------------------------------------- /man/rhoToPValue.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rhoToPValue.R 3 | \name{rhoToPValue} 4 | \alias{rhoToPValue} 5 | \title{Spearman's rho to a p-value} 6 | \usage{ 7 | rhoToPValue(rho, n, positive = NULL) 8 | } 9 | \arguments{ 10 | \item{rho}{Numeric vector of rho values.} 11 | 12 | \item{n}{Integer scalar specifying the number of observations used to compute \code{rho}.} 13 | 14 | \item{positive}{Logical scalar indicating whether to perform a one-sided test for the alternative of a positive (\code{TRUE}) or negative rho (\code{FALSE}). 15 | Default is to return statistics for both directions.} 16 | } 17 | \value{ 18 | If \code{positive=NULL}, a list of two numeric vectors is returned, 19 | containing p-values for the test against the alternative hypothesis in each direction. 20 | 21 | Otherwise, a numeric vector is returned containing the p-values for the test in the specified direction. 22 | } 23 | \description{ 24 | Compute an approximate p-value against the null hypothesis that Spearman's rho is zero. 25 | This vectorizes the approximate p-value calculation in \code{\link{cor.test}} with \code{method="spearman"}. 26 | } 27 | \examples{ 28 | rhoToPValue(seq(-1, 1, 21), 50) 29 | 30 | } 31 | \author{ 32 | Aaron Lun 33 | } 34 | -------------------------------------------------------------------------------- /man/sandbag.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sandbag.R 3 | \name{sandbag} 4 | \alias{sandbag} 5 | \alias{sandbag,ANY-method} 6 | \alias{sandbag,SummarizedExperiment-method} 7 | \title{Cell cycle phase training} 8 | \usage{ 9 | sandbag(x, ...) 10 | 11 | \S4method{sandbag}{ANY}(x, phases, gene.names = rownames(x), fraction = 0.5, subset.row = NULL) 12 | 13 | \S4method{sandbag}{SummarizedExperiment}(x, ..., assay.type = "counts") 14 | } 15 | \arguments{ 16 | \item{x}{A numeric matrix of gene expression values where rows are genes and columns are cells. 17 | 18 | Alternatively, a \linkS4class{SummarizedExperiment} object containing such a matrix.} 19 | 20 | \item{...}{For the generic, additional arguments to pass to specific methods. 21 | 22 | For the SummarizedExperiment method, additional arguments to pass to the ANY method.} 23 | 24 | \item{phases}{A list of subsetting vectors specifying which cells are in each phase of the cell cycle. 25 | This should typically be of length 3, with elements named as \code{"G1"}, \code{"S"} and \code{"G2M"}.} 26 | 27 | \item{gene.names}{A character vector of gene names.} 28 | 29 | \item{fraction}{A numeric scalar specifying the minimum fraction to define a marker gene pair.} 30 | 31 | \item{subset.row}{See \code{?"\link{scran-gene-selection}"}.} 32 | 33 | \item{assay.type}{A string specifying which assay values to use, e.g., \code{"counts"} or \code{"logcounts"}.} 34 | } 35 | \value{ 36 | A named list of data.frames, where each data frame corresponds to a cell cycle phase and contains the names of the genes in each marker pair. 37 | } 38 | \description{ 39 | Use gene expression data to train a classifier for cell cycle phase. 40 | } 41 | \details{ 42 | This function implements the training step of the pair-based prediction method described by Scialdone et al. (2015). 43 | Pairs of genes (A, B) are identified from a training data set where in each pair, 44 | the fraction of cells in phase G1 with expression of A > B (based on expression values in \code{training.data}) 45 | and the fraction with B > A in each other phase exceeds \code{fraction}. 46 | These pairs are defined as the marker pairs for G1. 47 | This is repeated for each phase to obtain a separate marker pair set. 48 | 49 | Pre-defined sets of marker pairs are provided for mouse and human (see Examples). 50 | The mouse set was generated as described by Scialdone et al. (2015), while the human training set was generated with data from Leng et al. (2015). 51 | Classification from test data can be performed using the \code{\link{cyclone}} function. 52 | For each cell, this involves comparing expression values between genes in each marker pair. 53 | The cell is then assigned to the phase that is consistent with the direction of the difference in expression in the majority of pairs. 54 | } 55 | \examples{ 56 | library(scuttle) 57 | sce <- mockSCE(ncells=50, ngenes=200) 58 | 59 | is.G1 <- 1:20 60 | is.S <- 21:30 61 | is.G2M <- 31:50 62 | out <- sandbag(sce, list(G1=is.G1, S=is.S, G2M=is.G2M)) 63 | str(out) 64 | 65 | # Getting pre-trained marker sets 66 | mm.pairs <- readRDS(system.file("exdata", "mouse_cycle_markers.rds", package="scran")) 67 | hs.pairs <- readRDS(system.file("exdata", "human_cycle_markers.rds", package="scran")) 68 | 69 | } 70 | \references{ 71 | Scialdone A, Natarajana KN, Saraiva LR et al. (2015). 72 | Computational assignment of cell-cycle stage from single-cell transcriptome data. 73 | \emph{Methods} 85:54--61 74 | 75 | Leng N, Chu LF, Barry C et al. (2015). 76 | Oscope identifies oscillatory genes in unsynchronized single-cell RNA-seq experiments. 77 | \emph{Nat. Methods} 12:947--50 78 | } 79 | \seealso{ 80 | \code{\link{cyclone}}, to perform the classification on a test dataset. 81 | } 82 | \author{ 83 | Antonio Scialdone, 84 | with modifications by Aaron Lun 85 | } 86 | -------------------------------------------------------------------------------- /man/scaledColRanks.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scaledColRanks.R 3 | \name{scaledColRanks} 4 | \alias{scaledColRanks} 5 | \title{Compute scaled column ranks} 6 | \usage{ 7 | scaledColRanks( 8 | x, 9 | subset.row = NULL, 10 | min.mean = NULL, 11 | transposed = FALSE, 12 | as.sparse = FALSE, 13 | withDimnames = TRUE, 14 | BPPARAM = SerialParam() 15 | ) 16 | } 17 | \arguments{ 18 | \item{x}{A numeric matrix-like object containing cells in columns and features in the rows.} 19 | 20 | \item{subset.row}{A logical, integer or character scalar indicating the rows of \code{x} to use, see \code{?"\link{scran-gene-selection}"}.} 21 | 22 | \item{min.mean}{A numeric scalar specifying the filter to be applied on the average normalized count for each feature prior to computing ranks. 23 | Disabled by setting to \code{NULL}.} 24 | 25 | \item{transposed}{A logical scalar specifying whether the output should be transposed.} 26 | 27 | \item{as.sparse}{A logical scalar indicating whether the output should be sparse.} 28 | 29 | \item{withDimnames}{A logical scalar specifying whether the output should contain the dimnames of \code{x}.} 30 | 31 | \item{BPPARAM}{A \linkS4class{BiocParallelParam} object specifying whether and how parallelization should be performed. 32 | Currently only used for filtering if \code{min.mean} is not provided.} 33 | } 34 | \value{ 35 | A matrix of the same dimensions as \code{x}, where each column contains the centred and scaled ranks of the expression values for each cell. 36 | If \code{transposed=TRUE}, this matrix is transposed so that rows correspond to cells. 37 | If \code{as.sparse}, the columns are not centered to preserve sparsity. 38 | } 39 | \description{ 40 | Compute scaled column ranks from each cell's expression profile for distance calculations based on rank correlations. 41 | } 42 | \details{ 43 | Euclidean distances computed based on the output rank matrix are equivalent to distances computed from Spearman's rank correlation. 44 | This can be used in clustering, nearest-neighbour searches, etc. as a robust alternative to Euclidean distances computed directly from \code{x}. 45 | 46 | If \code{as.sparse=TRUE}, the most common average rank is set to zero in the output. 47 | This can be useful for highly sparse input data where zeroes have the same rank and are themselves returned as zeroes. 48 | Obviously, this means that the ranks are not centred, so this will have to be done manually prior to any downstream distance calculations. 49 | } 50 | \examples{ 51 | library(scuttle) 52 | sce <- mockSCE() 53 | rout <- scaledColRanks(counts(sce), transposed=TRUE) 54 | 55 | # For use in clustering: 56 | d <- dist(rout) 57 | table(cutree(hclust(d), 4)) 58 | 59 | g <- buildSNNGraph(rout, transposed=TRUE) 60 | table(igraph::cluster_walktrap(g)$membership) 61 | 62 | } 63 | \seealso{ 64 | \code{\link{quickCluster}}, where this function is used. 65 | } 66 | \author{ 67 | Aaron Lun 68 | } 69 | -------------------------------------------------------------------------------- /man/summaryMarkerStats.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/summaryMarkerStats.R 3 | \name{summaryMarkerStats} 4 | \alias{summaryMarkerStats} 5 | \alias{summaryMarkerStats,ANY-method} 6 | \alias{summaryMarkerStats,SummarizedExperiment-method} 7 | \title{Summary marker statistics} 8 | \usage{ 9 | summaryMarkerStats(x, ...) 10 | 11 | \S4method{summaryMarkerStats}{ANY}( 12 | x, 13 | groups, 14 | row.data = NULL, 15 | average = "mean", 16 | BPPARAM = SerialParam() 17 | ) 18 | 19 | \S4method{summaryMarkerStats}{SummarizedExperiment}(x, ..., assay.type = "logcounts") 20 | } 21 | \arguments{ 22 | \item{x}{A numeric matrix-like object of expression values, 23 | where each column corresponds to a cell and each row corresponds to an endogenous gene. 24 | This is generally expected to be normalized log-expression values unless one knows better. 25 | 26 | Alternatively, a \linkS4class{SummarizedExperiment} or \linkS4class{SingleCellExperiment} object containing such a matrix.} 27 | 28 | \item{...}{For the generic, further arguments to pass to specific methods. 29 | 30 | For the SummarizedExperiment method, further arguments to pass to the ANY method.} 31 | 32 | \item{groups}{A vector of length equal to \code{ncol(x)}, 33 | specifying the group to which each cell is assigned. 34 | If \code{x} is a \linkS4class{SingleCellExperiment}, this defaults to \code{\link{colLabels}(x)} if available.} 35 | 36 | \item{row.data}{A \linkS4class{DataFrame} containing additional row metadata for each gene in \code{x}, 37 | to be included in each of the output DataFrames. 38 | This should generally have row names identical to those of \code{x}. 39 | 40 | Alternatively, a list containing one such DataFrame per level of \code{groups}, 41 | where each DataFrame contains group-specific metadata for each gene to be included in the appropriate output DataFrame.} 42 | 43 | \item{average}{String specifying the type of average, to be passed to \code{\link{sumCountsAcrossCells}}.} 44 | 45 | \item{BPPARAM}{A \linkS4class{BiocParallelParam} object indicating whether and how parallelization should be performed across genes.} 46 | 47 | \item{assay.type}{A string specifying which assay values to use, usually \code{"logcounts"}.} 48 | } 49 | \value{ 50 | A named \linkS4class{List} of \linkS4class{DataFrame}s, with one entry per level of \code{groups}. 51 | Each DataFrame has number of rows corresponding to the rows in \code{x} and contains the fields: 52 | \itemize{ 53 | \item \code{self.average}, the average (log-)expression across all cells in the current group. 54 | \item \code{other.average}, the grand average of the average (log-)expression across cells in the other groups. 55 | \item \code{self.detected}, the proportion of cells with detected expression in the current group. 56 | \item \code{other.detected}, the average proportion of cells with detected expression in the other groups. 57 | } 58 | } 59 | \description{ 60 | Compute additional gene-level statistics for each group to assist in identifying marker genes, 61 | to complement the formal test statistics generated by \code{\link{findMarkers}}. 62 | } 63 | \details{ 64 | This function only generates descriptive statistics for each gene to assist marker selection. 65 | It does not consider blocking factors or covariates that would otherwise be available from comparisons between groups. 66 | For the sake of brevity, statistics for the \dQuote{other} groups are summarized into a single value. 67 | } 68 | \examples{ 69 | library(scuttle) 70 | sce <- mockSCE() 71 | sce <- logNormCounts(sce) 72 | 73 | # Any clustering method is okay. 74 | kout <- kmeans(t(logcounts(sce)), centers=3) 75 | sum.out <- summaryMarkerStats(sce, kout$cluster) 76 | sum.out[["1"]] 77 | 78 | # Add extra rowData if you like. 79 | rd <- DataFrame(Symbol=sample(LETTERS, nrow(sce), replace=TRUE), 80 | row.names=rownames(sce)) 81 | sum.out <- summaryMarkerStats(sce, kout$cluster, row.data=rd) 82 | sum.out[["1"]] 83 | 84 | } 85 | \seealso{ 86 | \code{\link{findMarkers}}, where the output of this function can be used in \code{row.data=}. 87 | } 88 | \author{ 89 | Aaron Lun 90 | } 91 | -------------------------------------------------------------------------------- /man/testLinearModel.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/testLinearModel.R 3 | \name{testLinearModel} 4 | \alias{testLinearModel} 5 | \alias{testLinearModel,ANY-method} 6 | \alias{testLinearModel,SummarizedExperiment-method} 7 | \title{Hypothesis tests with linear models} 8 | \usage{ 9 | testLinearModel(x, ...) 10 | 11 | \S4method{testLinearModel}{ANY}( 12 | x, 13 | design, 14 | coefs = ncol(design), 15 | contrasts = NULL, 16 | block = NULL, 17 | equiweight = FALSE, 18 | method = "stouffer", 19 | subset.row = NULL, 20 | BPPARAM = SerialParam() 21 | ) 22 | 23 | \S4method{testLinearModel}{SummarizedExperiment}(x, ..., assay.type = "logcounts") 24 | } 25 | \arguments{ 26 | \item{x}{A numeric matrix-like object containing log-expression values for cells (columns) and genes (rows). 27 | Alternatively, a \linkS4class{SummarizedExperiment} containing such a matrix.} 28 | 29 | \item{...}{For the generic, further arguments to pass to specific methods. 30 | 31 | For the SummarizedExperiment method, further arguments to pass to the ANY method.} 32 | 33 | \item{design}{A numeric design matrix with number of rows equal to \code{ncol(x)}.} 34 | 35 | \item{coefs}{An integer vector specifying the coefficients to drop to form the null model. 36 | Only used if \code{contrasts} is not specified.} 37 | 38 | \item{contrasts}{A numeric vector or matrix specifying the contrast of interest. 39 | This should have length (if vector) or number of rows (if matrix) equal to \code{ncol(x)}.} 40 | 41 | \item{block}{A factor specifying the blocking levels for each cell in \code{x}. 42 | If specified, variance modelling is performed separately within each block and statistics are combined across blocks.} 43 | 44 | \item{equiweight}{A logical scalar indicating whether statistics from each block should be given equal weight. 45 | Otherwise, each block is weighted according to its number of cells. 46 | Only used if \code{block} is specified.} 47 | 48 | \item{method}{String specifying how p-values should be combined when \code{block} is specified, see \code{\link{combineParallelPValues}}.} 49 | 50 | \item{subset.row}{See \code{?"\link{scran-gene-selection}"}, specifying the rows for which to model the variance. 51 | Defaults to all genes in \code{x}.} 52 | 53 | \item{BPPARAM}{A \linkS4class{BiocParallelParam} object indicating whether parallelization should be performed across genes.} 54 | 55 | \item{assay.type}{String or integer scalar specifying the assay containing the log-expression values.} 56 | } 57 | \value{ 58 | A \linkS4class{DataFrame} containing test results with one row per row of \code{x}. 59 | It contains the estimated values of the contrasted coefficients 60 | as well as the p-value and FDR for each gene. 61 | } 62 | \description{ 63 | Perform basic hypothesis tests with linear models in an efficient manner. 64 | } 65 | \details{ 66 | This function can be considered a more efficient version of \code{\link{lmFit}} 67 | that works on a variety of matrix representations (see \code{\link{fitLinearModel}}). 68 | It also omits the empirical Bayes shrinkage step, 69 | which is acceptable given the large number of residual d.f. in typical single-cell studies. 70 | 71 | If \code{contrasts} is specified, the null hypothesis is defined by the contrast matrix or vector in the same manner 72 | that is used in the \pkg{limma} and \pkg{edgeR} packages. 73 | Briefly, the contrast vector specifies a linear combination of coefficients that sums to zero under the null. 74 | For contrast matrices, the joint null consists of the intersection of the nulls defined by each column vector. 75 | 76 | Otherwise, if only \code{coefs} is specified, 77 | the null model is formed by simply dropping all of the specified coefficients from \code{design}. 78 | 79 | If \code{block} is specified, a linear model is fitted separately to the cells in each level. 80 | The results are combined across levels by averaging coefficients and combining p-values with \code{\link{combinePValues}}. 81 | By default, the contribution from each level is weighted by its number of cells; 82 | if \code{equiweight=TRUE}, each level is given equal weight instead. 83 | } 84 | \examples{ 85 | y <- matrix(rnorm(10000), ncol=100) 86 | 87 | # Example with categorical factors: 88 | A <- gl(2, 50) 89 | design <- model.matrix(~A) 90 | testLinearModel(y, design, contrasts=c(0, 1)) 91 | 92 | # Example with continuous variables: 93 | u <- runif(100) 94 | design <- model.matrix(~u) 95 | testLinearModel(y, design, contrasts=c(0, 1)) 96 | 97 | # Example with multiple variables: 98 | B <- gl(4, 25) 99 | design <- model.matrix(~B) 100 | testLinearModel(y, design, contrasts=cbind(c(0,1,0,0), c(0,0,1,-1))) 101 | 102 | } 103 | \seealso{ 104 | \code{\link{fitLinearModel}}, which performs the hard work of fitting the linear models. 105 | } 106 | \author{ 107 | Aaron Lun 108 | } 109 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | PKG_LIBS=$(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) 2 | -------------------------------------------------------------------------------- /src/choose_effect_size.cpp: -------------------------------------------------------------------------------- 1 | #include "Rcpp.h" 2 | 3 | #include "utils.h" 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | template 11 | size_t instantiate_list(Rcpp::List input, std::vector& output, const std::string msg) { 12 | size_t n=0; 13 | for (size_t c=0; c(current.size())) { 18 | throw std::runtime_error(msg + " vectors must be of the same length"); 19 | } 20 | } 21 | return n; 22 | } 23 | 24 | size_t define_jump (size_t ntests, double prop) { 25 | /* We want the p-value for the (prop*ntests)-th rejection, which rejects 26 | * the null that more than (1-prop) of the nulls are true. The '-1' is 27 | * for the zero-indexing but obviously does not apply if prop=0. 28 | */ 29 | size_t jump=std::ceil(ntests * prop); 30 | if (jump) { --jump; } 31 | return jump; 32 | } 33 | 34 | // [[Rcpp::export(rng=false)]] 35 | Rcpp::IntegerVector compute_Top_statistic_from_ranks(Rcpp::List Ranks, double prop) { 36 | const size_t ncon=Ranks.size(); 37 | std::vector individual(ncon); 38 | const size_t ngenes=instantiate_list(Ranks, individual, "rank"); 39 | 40 | std::vector collected(ncon); 41 | Rcpp::IntegerVector output(ngenes, NA_INTEGER); 42 | 43 | for (size_t g=0; g effects(ncon), pvals(ncon); 67 | const size_t neffects=instantiate_list(Effects, effects, "effect"); 68 | const size_t npvals=instantiate_list(Pvals, pvals, "p-value"); 69 | if (neffects!=npvals) { 70 | throw std::runtime_error("p-value and effect vectors should have the same length"); 71 | } 72 | 73 | std::vector > collected(ncon); 74 | Rcpp::NumericVector output(neffects, R_NaReal); 75 | 76 | for (size_t g=0; g 6 | #include 7 | #include 8 | 9 | /*** Combining correlated p-values for each gene into a single combined p-value. ***/ 10 | 11 | // [[Rcpp::export(rng=false)]] 12 | Rcpp::List combine_rho (int Ngenes, Rcpp::IntegerVector first, Rcpp::IntegerVector second, 13 | Rcpp::NumericVector Rho, Rcpp::NumericVector Pval, Rcpp::IntegerVector Order) 14 | { 15 | // Checking inputs. 16 | if (first.size()!=second.size()) { 17 | throw std::runtime_error("gene index vectors must be of the same length"); 18 | } 19 | if (first.size()!=Rho.size()) { 20 | throw std::runtime_error("'rho' must be a double precision vector of length equal to the number of pairs"); 21 | } 22 | if (first.size()!=Pval.size()) { 23 | throw std::runtime_error("'pval' must be a double precision vector of length equal to the number of pairs"); 24 | } 25 | if (first.size()!=Order.size()) { 26 | throw std::runtime_error("'order' must be an integer vector of length equal to the number of pairs"); 27 | } 28 | 29 | const size_t Npairs=first.size(); 30 | if (Ngenes < 0) { throw std::runtime_error("number of genes should be non-negative"); } 31 | 32 | // Going through and computing the combined p-value for each gene. 33 | Rcpp::NumericVector pout(Ngenes), rout(Ngenes); 34 | std::vector sofar(Ngenes); 35 | 36 | for (auto oIt=Order.begin(); oIt!=Order.end(); ++oIt) { 37 | const int& curp=*oIt; 38 | if (curp < 0 || static_cast(curp) >= Npairs) { 39 | throw std::runtime_error("order indices out of range"); 40 | } 41 | const double& currho=Rho[curp]; 42 | const double& curpval=Pval[curp]; 43 | 44 | for (int i=0; i<2; ++i) { 45 | const int& gx=(i==0 ? first[curp] : second[curp]); 46 | if (gx < 0 || gx >= Ngenes) { 47 | throw std::runtime_error("supplied gene index is out of range"); 48 | } 49 | 50 | // Checking if this is smaller than what is there, or if nothing is there yet. 51 | int& already_there=sofar[gx]; 52 | ++already_there; 53 | const double temp_combined=curpval/already_there; 54 | double& combined_pval=pout[gx]; 55 | 56 | if (already_there==1 || temp_combined < combined_pval) { 57 | combined_pval=temp_combined; 58 | } 59 | 60 | double& max_rho=rout[gx]; 61 | if (already_there==1 || std::abs(max_rho) < std::abs(currho)) { 62 | max_rho=currho; 63 | } 64 | } 65 | } 66 | 67 | // Multiplying by the total number of tests for each gene. 68 | auto sfIt=sofar.begin(); 69 | for (auto poIt=pout.begin(); poIt!=pout.end(); ++poIt, ++sfIt) { 70 | (*poIt)*=(*sfIt); 71 | } 72 | 73 | return Rcpp::List::create(pout, rout); 74 | } 75 | -------------------------------------------------------------------------------- /src/compute_residual_stats.cpp: -------------------------------------------------------------------------------- 1 | #include "Rcpp.h" 2 | #include "beachmat3/beachmat.h" 3 | #include "scuttle/linear_model_fit.h" 4 | 5 | template 6 | Rcpp::List compute_residual_stats(Rcpp::NumericMatrix qr, Rcpp::NumericVector qraux, Rcpp::RObject inmat, TRANSFORMER trans) { 7 | auto emat = beachmat::read_lin_block(inmat); 8 | const size_t ncells=emat->get_ncol(); 9 | const size_t ngenes=emat->get_nrow(); 10 | 11 | scuttle::linear_model_fit fitter(qr, qraux); 12 | const size_t ncoefs=fitter.get_ncoefs(); 13 | 14 | // Setting up the output objects. 15 | Rcpp::NumericMatrix outvar(1, ngenes); 16 | Rcpp::NumericMatrix outmean(1, ngenes); 17 | Rcpp::NumericVector incoming(ncells); 18 | 19 | for (size_t counter=0; counterget_row(counter, iIt); 22 | trans(ptr, ptr + ncells, iIt); 23 | 24 | auto curvarrow=outvar.column(counter); 25 | auto curvar=curvarrow.begin(); 26 | auto curmeanrow=outmean.column(counter); 27 | auto curmean=curmeanrow.begin(); 28 | 29 | auto iEnd = incoming.end(); 30 | (*curmean)=std::accumulate(iIt, iEnd, 0.0)/ncells; 31 | fitter.multiply(iIt); 32 | 33 | double& v=(*curvar); 34 | iIt+=ncoefs; 35 | while (iIt != iEnd) { // only using the residual effects. 36 | v += (*iIt) * (*iIt); 37 | ++iIt; 38 | } 39 | v /= ncells - ncoefs; 40 | } 41 | 42 | return Rcpp::List::create(outmean, outvar); 43 | } 44 | 45 | /************************************************ 46 | * Compute statistics for log-transformed counts. 47 | ***********************************************/ 48 | 49 | struct lognorm { 50 | lognorm(Rcpp::NumericVector sizefactors, double pseudo) : sf(sizefactors), ps(pseudo) {} 51 | 52 | template 53 | void operator()(IN start, IN end, OUT out) { 54 | auto sfIt = sf.begin(); 55 | while (start != end) { 56 | *out = std::log(*start/(*sfIt) + ps)/M_LN2; 57 | ++start; 58 | ++sfIt; 59 | ++out; 60 | } 61 | } 62 | private: 63 | Rcpp::NumericVector sf; 64 | double ps; 65 | }; 66 | 67 | // [[Rcpp::export(rng=false)]] 68 | Rcpp::List compute_residual_stats_lognorm(Rcpp::NumericMatrix qr, Rcpp::NumericVector qraux, Rcpp::RObject inmat, 69 | Rcpp::NumericVector sf, double pseudo) 70 | { 71 | lognorm LN(sf, pseudo); 72 | return compute_residual_stats(qr, qraux, inmat, LN); 73 | } 74 | 75 | /*********************************************** 76 | * Compute statistics for expression as provided. 77 | ***********************************************/ 78 | 79 | struct none { 80 | none() {} 81 | 82 | template 83 | void operator()(IN start, IN end, OUT out) { 84 | if (out!=start) { 85 | std::copy(start, end, out); 86 | } 87 | } 88 | }; 89 | 90 | // [[Rcpp::export(rng=false)]] 91 | Rcpp::List compute_residual_stats_none(Rcpp::NumericMatrix qr, Rcpp::NumericVector qraux, Rcpp::RObject inmat) { 92 | none N; 93 | return compute_residual_stats(qr, qraux, inmat, N); 94 | } 95 | -------------------------------------------------------------------------------- /src/compute_rho_null.cpp: -------------------------------------------------------------------------------- 1 | #include "Rcpp.h" 2 | 3 | #include "rand_custom.h" 4 | #include "scuttle/linear_model_fit.h" 5 | #include "boost/range/algorithm.hpp" 6 | #include "utils.h" 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | static double rho_mult (double Ncells) { 13 | return 6/(Ncells*(Ncells*Ncells-1)); 14 | } 15 | 16 | /*** Null distribution estimation without a design matrix. ***/ 17 | 18 | // [[Rcpp::export(rng=false)]] 19 | Rcpp::NumericVector get_null_rho (int Ncells, int Niters, Rcpp::List Seeds, Rcpp::IntegerVector Streams) { 20 | if (Ncells <= 1) { throw std::runtime_error("number of cells should be greater than 2"); } 21 | if (Niters < 0) { throw std::runtime_error("number of iterations should be non-negative"); } 22 | check_pcg_vectors(Seeds, Streams, Niters, "iterations"); 23 | 24 | // Filling rank vector. 25 | std::vector rankings(Ncells); 26 | Rcpp::NumericVector output(Niters); 27 | const double mult=rho_mult(Ncells); 28 | 29 | for (int it=0; it holding_val(Nobs); 63 | std::vector holding_idx(Nobs); 64 | std::vector rank1(Nobs), rank2(Nobs); 65 | const double mult=rho_mult(Nobs); 66 | 67 | /* Simulating residuals, using the Q-matrix to do it. We set the main effects to zero 68 | * (hence, starting from "Ncoefs") and simulate normals for the residual effects. 69 | * We then use this to reconstruct the residuals themselves - twice - and then compute 70 | * correlations between the two reconstructions. 71 | */ 72 | for (int it=0; it cpp_rnorm; 75 | 76 | for (int mode=0; mode<2; ++mode) { 77 | // Simulating the residuals. 78 | std::fill(holding_val.begin(), holding_val.begin()+Ncoef, 0); 79 | for (int col=Ncoef; col 7 | #include 8 | 9 | pcg32 create_pcg32(SEXP seed, int stream) { 10 | return pcg32(dqrng::convert_seed(seed), stream); 11 | } 12 | 13 | void check_pcg_vectors(const Rcpp::List seeds, Rcpp::IntegerVector streams, size_t N, const char* msg) { 14 | if (static_cast(seeds.size())!=N) { 15 | std::stringstream err; 16 | err << "number of " << msg << " and seeds should be the same"; 17 | throw std::runtime_error(err.str()); 18 | } 19 | 20 | if (static_cast(streams.size())!=N) { 21 | std::stringstream err; 22 | err << "number of " << msg << " and streams should be the same"; 23 | throw std::runtime_error(err.str()); 24 | } 25 | 26 | return; 27 | } 28 | -------------------------------------------------------------------------------- /src/rand_custom.h: -------------------------------------------------------------------------------- 1 | #ifndef RAND_CUSTOM_H 2 | #define RAND_CUSTOM_H 3 | 4 | #include "Rcpp.h" 5 | #include "pcg_random.hpp" 6 | #include "boost/random.hpp" 7 | #include 8 | 9 | void check_pcg_vectors(Rcpp::List, Rcpp::IntegerVector, size_t, const char*); 10 | 11 | pcg32 create_pcg32(SEXP, int); 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /src/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H 2 | #define UTILS_H 3 | 4 | #include "Rcpp.h" 5 | 6 | // Overloaded functions to check for NA'ness. 7 | 8 | inline bool isNA(int x) { 9 | return x==NA_INTEGER; 10 | } 11 | 12 | inline bool isNA(double x) { 13 | return ISNAN(x); 14 | } 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(scran) 3 | test_check("scran") 4 | -------------------------------------------------------------------------------- /tests/testthat/setup.R: -------------------------------------------------------------------------------- 1 | all_positive_integers <- function(N) sample(.Machine$integer.max, N, replace=TRUE) 2 | 3 | are_PCs_equal <- function(first, second, tol=1e-8) 4 | # Check if PCs are equal (other than sign). 5 | { 6 | expect_identical(dim(first), dim(second)) 7 | relative <- first/second 8 | expect_true(all(colSums(relative > 0) %in% c(0, nrow(first)))) 9 | expect_true(all(abs(abs(relative)-1) < tol)) 10 | } 11 | 12 | # Because SnowParam() is too slow, yet MulticoreParam() fails on Windows. 13 | # See discussion at https://github.com/Bioconductor/BiocParallel/issues/98. 14 | safeBPParam <- function(nworkers) { 15 | if (.Platform$OS.type=="windows") { 16 | BiocParallel::SerialParam() 17 | } else { 18 | BiocParallel::MulticoreParam(nworkers) 19 | } 20 | } 21 | 22 | # Using ExactParam to avoid the trouble of setting the seed for all SVD-related tests. 23 | options(BiocSingularParam.default=BiocSingular::ExactParam()) 24 | 25 | # Adding a test to flush out any uncontrolled parallelization. 26 | library(BiocParallel) 27 | failgen <- setRefClass("FailParam", 28 | contains="BiocParallelParam", 29 | fields=list(), 30 | methods=list()) 31 | 32 | FAIL <- failgen() 33 | register(FAIL) 34 | 35 | library(DelayedArray) 36 | setAutoBPPARAM(FAIL) 37 | -------------------------------------------------------------------------------- /tests/testthat/test-altrep.R: -------------------------------------------------------------------------------- 1 | # This tests that various functions are applicable with alternative matrix representations. 2 | # library(scran); library(testthat); source("test-altrep.R") 3 | 4 | set.seed(99999) 5 | library(Matrix) 6 | X <- as(matrix(rpois(100000, lambda=1), ncol=100), "dgCMatrix") 7 | X_ <- as.matrix(X) 8 | 9 | library(HDF5Array) 10 | Y <- as(matrix(rpois(100000, lambda=5), ncol=100), "HDF5Array") 11 | Y_ <- as.matrix(Y) 12 | 13 | test_that("cyclone runs properly", { 14 | mm.pairs <- readRDS(system.file("exdata", "mouse_cycle_markers.rds", package="scran")) 15 | rownames(X) <- rownames(X_) <- sample(mm.pairs$G1[,1], nrow(X)) 16 | rownames(Y) <- rownames(Y_) <- sample(mm.pairs$G1[,1], nrow(Y)) 17 | 18 | set.seed(100) 19 | assignments1 <- cyclone(X[,1:10], mm.pairs) 20 | set.seed(100) 21 | assignments2 <- cyclone(X_[,1:10], mm.pairs) 22 | expect_identical(assignments1, assignments2) 23 | 24 | set.seed(100) 25 | assignments1 <- cyclone(Y[,1:10], mm.pairs) 26 | set.seed(100) 27 | assignments2 <- cyclone(Y_[,1:10], mm.pairs) 28 | expect_identical(assignments1, assignments2) 29 | }) 30 | 31 | test_that("Variance estimation runs properly", { 32 | dec1 <- modelGeneVar(Y) 33 | dec2 <- modelGeneVar(Y_) 34 | expect_equal(dec1, dec2) 35 | 36 | dec1 <- modelGeneCV2(Y) 37 | dec2 <- modelGeneCV2(Y_) 38 | expect_equal(dec1, dec2) 39 | }) 40 | 41 | test_that("correlatePairs runs properly", { 42 | set.seed(1000) 43 | null <- correlateNull(ncol(X), iters=1e6) 44 | 45 | set.seed(100) 46 | ref <- correlatePairs(X_[1:10,], null.dist=null) 47 | set.seed(100) 48 | alt <- correlatePairs(X[1:10,], null.dist=null) 49 | expect_equal(ref, alt) 50 | 51 | set.seed(200) 52 | ref <- correlatePairs(Y_[20:50,], null.dist=null) 53 | set.seed(200) 54 | alt <- correlatePairs(Y[20:50,], null.dist=null) 55 | expect_equal(ref, alt) 56 | }) 57 | 58 | test_that("buildSNNGraph with irlba runs properly on sparse matrices", { 59 | set.seed(1000) 60 | g1 <- buildSNNGraph(X, BSPARAM=BiocSingular::IrlbaParam()) 61 | set.seed(1000) 62 | g2 <- buildSNNGraph(X_, BSPARAM=BiocSingular::IrlbaParam()) 63 | expect_identical(g1[], g2[]) 64 | 65 | set.seed(100) 66 | g1 <- buildSNNGraph(X, d=10, BSPARAM=BiocSingular::IrlbaParam()) 67 | set.seed(100) 68 | g2 <- buildSNNGraph(X_, d=10, BSPARAM=BiocSingular::IrlbaParam()) 69 | expect_identical(g1[], g2[]) 70 | }) 71 | 72 | test_that("findMarkers and overlapExprs work properly", { 73 | groups <- sample(2, ncol(X), replace=TRUE) 74 | expect_equal(findMarkers(Y, groups), findMarkers(Y_, groups)) 75 | }) 76 | -------------------------------------------------------------------------------- /tests/testthat/test-build-snn.R: -------------------------------------------------------------------------------- 1 | # Checks the construction of the SNN graph. 2 | # require(scran); require(testthat); source("setup.R"); source("test-build-snn.R") 3 | 4 | ngenes <- 500 5 | ncells <- 200 6 | 7 | are_graphs_same <- function(g1, g2) { 8 | expect_equal(g1[], g2[]) 9 | return(TRUE) 10 | } 11 | 12 | set.seed(20001) 13 | test_that("Subsetting does not change the result", { 14 | dummy <- matrix(rnorm(ngenes*ncells), ncol=ncells, nrow=ngenes) 15 | 16 | selected <- sample(ngenes, 50) 17 | g <- buildSNNGraph(dummy[selected,]) 18 | g2 <- buildSNNGraph(dummy, subset.row=selected) 19 | are_graphs_same(g, g2) 20 | 21 | selected <- rbinom(ngenes, 1, 0.5)==1 22 | g <- buildSNNGraph(dummy[selected,]) 23 | g2 <- buildSNNGraph(dummy, subset.row=selected) 24 | are_graphs_same(g, g2) 25 | }) 26 | 27 | set.seed(20002) 28 | test_that("buildSNNGraph works properly on SingleCellExperiment objects", { 29 | dummy <- matrix(rnorm(ngenes*ncells), ncol=ncells, nrow=ngenes) 30 | sce <- SingleCellExperiment(list(counts=2^dummy, logcounts=dummy)) 31 | g <- buildSNNGraph(sce) 32 | g2 <- buildSNNGraph(assay(sce, "logcounts")) 33 | are_graphs_same(g, g2) 34 | 35 | g <- buildSNNGraph(sce, assay.type="counts") 36 | g2 <- buildSNNGraph(assay(sce, "counts")) 37 | are_graphs_same(g, g2) 38 | 39 | selected <- sample(ngenes, 50) 40 | g <- buildSNNGraph(sce, subset.row=selected) 41 | g2 <- buildSNNGraph(sce[selected,]) 42 | are_graphs_same(g, g2) 43 | }) 44 | 45 | set.seed(20004) 46 | test_that("buildSNNGraph with PCA works correctly", { 47 | dummy <- matrix(rnorm(ngenes*ncells), ncol=ncells, nrow=ngenes) 48 | pc <- prcomp(t(dummy)) 49 | ref <- buildSNNGraph(t(pc$x[,1:20]), k=10, d=NA) 50 | alt <- buildSNNGraph(dummy, k=10, d=20) 51 | are_graphs_same(ref, alt) 52 | 53 | ref <- buildSNNGraph(t(pc$x[,1:50]), k=10, d=NA) 54 | alt <- buildSNNGraph(dummy, k=10, d=50) 55 | are_graphs_same(ref, alt) 56 | 57 | # Checking that it correctly extracts stuff from the reducedDimension slot. 58 | X <- SingleCellExperiment(list(logcounts=dummy)) 59 | reducedDim(X, "PCA") <- pc$x[,1:50] 60 | alt <- buildSNNGraph(X, use.dimred="PCA") 61 | are_graphs_same(ref, alt) 62 | 63 | # Unaffected by subset.row specifications (correctly). 64 | sub <- sample(ngenes, 50) 65 | alt <- buildSNNGraph(X, use.dimred="PCA", subset.row=sub) 66 | are_graphs_same(ref, alt) 67 | }) 68 | -------------------------------------------------------------------------------- /tests/testthat/test-colranks.R: -------------------------------------------------------------------------------- 1 | # This tests out the scaledColRanks function. 2 | # require(scran); require(testthat); source("setup.R"); source("test-colranks.R") 3 | 4 | ncells <- 100 5 | ngenes <- 200 6 | 7 | set.seed(430000) 8 | test_that("scaledColRanks correctly computes the ranks", { 9 | dummy <- matrix(rnbinom(ncells*ngenes, mu=10, size=20), ncol=ncells, nrow=ngenes) 10 | 11 | emp.ranks <- scaledColRanks(dummy) 12 | ref <- apply(dummy, 2, FUN=function(y) { 13 | r <- rank(y) 14 | r <- r - mean(r) 15 | r/sqrt(sum(r^2))/2 16 | }) 17 | expect_equal(emp.ranks, ref) 18 | 19 | # Behaves with many ties. 20 | dummy <- matrix(sample(50, ncells*ngenes, replace=TRUE), ncol=ncells, nrow=ngenes) 21 | emp.ranks <- scaledColRanks(dummy) 22 | ref <- apply(dummy, 2, FUN=function(y) { 23 | r <- rank(y) 24 | r <- r - mean(r) 25 | r/sqrt(sum(r^2))/2 26 | }) 27 | expect_equal(emp.ranks, ref) 28 | 29 | # Behaves with no ties. 30 | dummy <- matrix(rnorm(ncells*ngenes), ncol=ncells, nrow=ngenes) 31 | emp.ranks <- scaledColRanks(dummy) 32 | ref <- apply(dummy, 2, FUN=function(y) { 33 | r <- rank(y) 34 | r <- r - mean(r) 35 | r/sqrt(sum(r^2))/2 36 | }) 37 | expect_equal(emp.ranks, ref) 38 | 39 | # Works correctly with shuffling. 40 | shuffled <- sample(ncells) 41 | emp.ranks <- scaledColRanks(dummy, subset.row=shuffled) 42 | ref <- apply(dummy, 2, FUN=function(y) { 43 | r <- rank(y[shuffled]) 44 | r <- r - mean(r) 45 | r/sqrt(sum(r^2))/2 46 | }) 47 | expect_equal(emp.ranks, ref) 48 | 49 | # Works correctly on sparse matrices. 50 | sparse <- abs(Matrix::rsparsematrix(ngenes, ncells, density=0.1)) 51 | out <- scaledColRanks(sparse, min.mean=0) 52 | ref <- scaledColRanks(as.matrix(sparse), min.mean=0) 53 | expect_identical(unname(out), unname(ref)) 54 | }) 55 | 56 | set.seed(430001) 57 | test_that("scaledColRanks responds to other options", { 58 | mat <- matrix(rnbinom(ncells*ngenes, mu=10, size=20), ncol=ncells, nrow=ngenes) 59 | 60 | # Subsetting. 61 | keep <- sample(ngenes, ngenes/2) 62 | rnks <- scaledColRanks(mat, subset.row=keep) 63 | expect_identical(rnks, scaledColRanks(mat[keep,])) 64 | 65 | # Minimum mean. 66 | rnks <- scaledColRanks(mat, min.mean=10) 67 | expect_identical(rnks, scaledColRanks(mat, subset.row=scuttle::calculateAverage(mat) >= 10)) 68 | 69 | # Transposition. 70 | rnks <- scaledColRanks(mat, transposed=TRUE) 71 | expect_identical(rnks, t(scaledColRanks(mat))) 72 | }) 73 | 74 | set.seed(430002) 75 | test_that("scaledColRanks naming is handled correctly", { 76 | mat <- matrix(rnbinom(ncells*ngenes, mu=10, size=20), ncol=ncells, nrow=ngenes) 77 | rownames(mat) <- seq_len(nrow(mat)) 78 | colnames(mat) <- seq_len(ncol(mat)) 79 | 80 | rnks <- scaledColRanks(mat) 81 | expect_identical(dimnames(rnks), dimnames(mat)) 82 | 83 | rnks <- scaledColRanks(mat, transposed=TRUE) 84 | expect_identical(dimnames(rnks), rev(dimnames(mat))) 85 | 86 | rnks <- scaledColRanks(mat, subset.row=1:10) 87 | expect_identical(rownames(rnks), rownames(mat)[1:10]) 88 | 89 | rnks <- scaledColRanks(mat, withDimnames=FALSE) 90 | expect_identical(dimnames(rnks), NULL) 91 | }) 92 | 93 | set.seed(430003) 94 | test_that("scaledColRanks handles sparsity requests", { 95 | mat <- matrix(rnbinom(ncells*ngenes, mu=1, size=20), ncol=ncells, nrow=ngenes) 96 | ref <- scaledColRanks(mat) 97 | 98 | library(Matrix) 99 | rnks <- scaledColRanks(mat, as.sparse=TRUE) 100 | expect_s4_class(rnks, "dgCMatrix") 101 | expect_identical(rnks!=0, as(mat, "dgCMatrix")!=0) 102 | 103 | centred <- sweep(rnks, 2, Matrix::colMeans(rnks), "-") 104 | centred <- as.matrix(centred) 105 | dimnames(centred) <- NULL 106 | expect_equal(centred, ref) 107 | 108 | # With transposition. 109 | rnks <- scaledColRanks(mat, as.sparse=TRUE, transposed=TRUE) 110 | expect_identical(rnks!=0, as(t(mat), "dgCMatrix")!=0) 111 | 112 | centred <- rnks - Matrix::rowMeans(rnks) 113 | centred <- as.matrix(centred) 114 | dimnames(centred) <- NULL 115 | expect_equal(centred, t(ref)) 116 | }) 117 | 118 | set.seed(430003) 119 | test_that("scaledColRanks handles DA inputs", { 120 | dummy <- matrix(rnbinom(ncells*ngenes, mu=10, size=20), ncol=ncells, nrow=ngenes) 121 | expect_identical(scaledColRanks(dummy), scaledColRanks(DelayedArray(dummy))) 122 | expect_identical(scaledColRanks(dummy, transposed=TRUE), scaledColRanks(DelayedArray(dummy), transposed=TRUE)) 123 | expect_identical(scaledColRanks(dummy, as.sparse=TRUE), scaledColRanks(DelayedArray(dummy), as.sparse=TRUE)) 124 | }) 125 | 126 | set.seed(430004) 127 | test_that("scaledColRanks handles silly inputs", { 128 | mat <- matrix(rnbinom(ncells*ngenes, mu=10, size=20), ncol=ncells, nrow=ngenes) 129 | expect_error(scaledColRanks(mat[0,,drop=FALSE]), "rank variances of zero detected for a cell") 130 | 131 | out <- scaledColRanks(mat[,0,drop=FALSE]) 132 | expect_identical(dim(out), c(as.integer(ngenes), 0L)) 133 | }) 134 | -------------------------------------------------------------------------------- /tests/testthat/test-convert.R: -------------------------------------------------------------------------------- 1 | # Tests the convertTo function. 2 | # require(scran); require(testthat); source("test-convert.R") 3 | 4 | set.seed(40000) 5 | ncells <- 200 6 | ngenes <- 1000 7 | count.sizes <- rnbinom(ncells, mu=100, size=5) 8 | dummy <- matrix(count.sizes, ncol=ncells, nrow=ngenes, byrow=TRUE) 9 | rownames(dummy) <- paste0("X", seq_len(ngenes)) 10 | colnames(dummy) <- paste0("Y", seq_len(ncells)) 11 | 12 | X <- SingleCellExperiment(list(counts=dummy)) 13 | sizeFactors(X) <- 2^rnorm(ncells) 14 | rowData(X)$SYMBOL <- paste0("X", seq_len(ngenes)) 15 | X$other <- sample(LETTERS, ncells, replace=TRUE) 16 | 17 | # Converting to a DGEList. 18 | 19 | test_that("Can convert from SingleCellExperiment to DGEList", { 20 | y <- convertTo(X, type="edgeR") 21 | expect_identical(y$counts, counts(X)) 22 | 23 | # Checking subsetting behaves as expected. 24 | chosen <- c(50:1, 101:200) 25 | y <- convertTo(X, type="edgeR", subset.row=chosen) 26 | expect_identical(y$counts, counts(X)[chosen,]) 27 | 28 | # Checking metadata behaves as expected. 29 | y <- convertTo(X, type="edgeR") 30 | expect_identical(y$samples$other, X$other) 31 | expect_identical(y$genes$SYMBOL, rowData(X)$SYMBOL) 32 | 33 | # # Trying out silly settings. 34 | # expect_warning(y <- convertTo(X[0,], type="edgeR")) 35 | # expect_identical(y$counts, counts(X)[0,]) 36 | # expect_identical(y$genes$SYMBOL, character(0)) 37 | # 38 | # y <- convertTo(X[,0], type="edgeR") 39 | # expect_identical(y$counts, counts(X)[,0]) 40 | # expect_identical(y$samples$other, character(0)) 41 | }) 42 | 43 | test_that("Can convert SingleCellExperiment to a DESeqDataSet", { 44 | library(DESeq2) 45 | y <- convertTo(X, type="DESeq2") 46 | expect_equal(counts(y), counts(X)) 47 | expect_identical(unname(sizeFactors(y)), sizeFactors(X)) 48 | 49 | # Checking subsetting behaves as expected. 50 | chosen <- c(50:1, 101:200) 51 | y <- convertTo(X, type="DESeq2", subset.row=chosen) 52 | expect_equivalent(assay(y), counts(X)[chosen,]) 53 | 54 | # Checking metadata is extracted as expected. 55 | y <- convertTo(X, type="DESeq2") 56 | expect_identical(y$other, X$other) 57 | expect_identical(mcols(y)$SYMBOL, rowData(X)$SYMBOL) 58 | }) 59 | 60 | # Converting to a CellDataSet. 61 | 62 | catch_warning <- function(...) { 63 | expect_warning(..., "gene_short_name") 64 | } 65 | 66 | get_exprs <- function(y) { 67 | assayDataElement(y, "exprs") 68 | } 69 | 70 | test_that("Can convert SingleCellExperiment to a CellDataSet", { 71 | skip("monocle is a little broken because of clusterApply") 72 | catch_warning(y <- convertTo(X, type="monocle")) 73 | expect_equal(get_exprs(y), counts(X)) 74 | expect_equivalent(sizeFactors(y), sizeFactors(X)) 75 | 76 | # Checking that subsetting works as expected. 77 | chosen <- c(50:1, 101:200) 78 | catch_warning(y <- convertTo(X, type="monocle", subset.row=chosen)) 79 | expect_equal(get_exprs(y), counts(X)[chosen,]) 80 | expect_equivalent(sizeFactors(y), sizeFactors(X)) 81 | 82 | # Checking metadata is extracted as expected. 83 | catch_warning(y <- convertTo(X, type="monocle")) 84 | expect_identical(y$other, X$other) 85 | expect_identical(fData(y)$SYMBOL, rowData(X)$SYMBOL) 86 | 87 | # # Looks like the CellDataSet constructor just fails with no rows. 88 | # y <- convertTo(X[0,], type="monocle", row.fields="SYMBOL") 89 | # expect_identical(get_exprs(y), counts(X)[0,]) 90 | # expect_identical(fData(y)$SYMBOL, character(0)) 91 | 92 | catch_warning(y <- convertTo(X[,0], type="monocle") ) 93 | expect_identical(get_exprs(y), counts(X)[,0]) 94 | expect_identical(y$other, character(0)) 95 | 96 | X2 <- X 97 | sizeFactors(X2) <- NULL 98 | expect_warning(y2 <- convertTo(X2, type="monocle")) 99 | expect_equal(counts(X2), get_exprs(y2)) 100 | expect_true(all(is.na(sizeFactors(y2)))) 101 | }) 102 | -------------------------------------------------------------------------------- /tests/testthat/test-correlate-genes.R: -------------------------------------------------------------------------------- 1 | # Tests the correlateGenes function. 2 | # library(testthat); library(scran); source("test-correlate-genes.R") 3 | 4 | set.seed(100022) 5 | Ngenes <- 20 6 | Ncells <- 100 7 | X <- log(matrix(rpois(Ngenes*Ncells, lambda=10), nrow=Ngenes) + 1) 8 | rownames(X) <- paste0("X", seq_len(Ngenes)) 9 | ref <- correlatePairs(X) 10 | 11 | test_that("correlateGenes works correctly", { 12 | out <- correlateGenes(ref) 13 | for (x in rownames(X)) { 14 | collected <- ref$gene1 == x | ref$gene2==x 15 | 16 | simes.p <- min(p.adjust(ref$p.value[collected], method="BH")) 17 | expect_equal(simes.p, out$p.value[out$gene==x]) 18 | 19 | max.i <- which.max(abs(ref$rho[collected])) 20 | expect_equal(ref$rho[collected][max.i], out$rho[out$gene==x]) 21 | } 22 | }) 23 | 24 | test_that("correlateGenes handles silly inputs", { 25 | out <- correlateGenes(ref[1:10,]) 26 | expect_identical(out[0,], correlateGenes(ref[0,])) 27 | }) 28 | -------------------------------------------------------------------------------- /tests/testthat/test-expand-pairings.R: -------------------------------------------------------------------------------- 1 | # This tests the expand_pairings utility in scran. 2 | # library(testthat); library(scran); source("test-expand-pairings.R") 3 | 4 | test_that("expand_pairings works as expected for NULL", { 5 | out <- scran:::.expand_pairings(NULL, universe=1:5) 6 | expect_identical(out$mode, "single pool") 7 | 8 | grid <- expand.grid(1:5, 1:5) 9 | grid <- grid[grid[,1] != grid[,2],] 10 | expect_identical(out$id1, grid[,1]) 11 | expect_identical(out$id2, grid[,2]) 12 | 13 | # Even when empty. 14 | out <- scran:::.expand_pairings(NULL, universe=integer(0)) 15 | expect_identical(out$id1, integer(0)) 16 | expect_identical(out$id2, integer(0)) 17 | }) 18 | 19 | test_that("expand_pairings works as expected for vectors", { 20 | out <- scran:::.expand_pairings(2:4, universe=1:5) 21 | expect_identical(out$mode, "single pool") 22 | 23 | grid <- expand.grid(2:4, 2:4) 24 | grid <- grid[grid[,1] != grid[,2],] 25 | expect_identical(out$id1, grid[,1]) 26 | expect_identical(out$id2, grid[,2]) 27 | 28 | out2 <- scran:::.expand_pairings(c("C", "D", "E"), universe=LETTERS[2:5]) 29 | expect_identical(out, out2) 30 | 31 | ref2 <- scran:::.expand_pairings(1:3, universe=1:5) 32 | out2 <- scran:::.expand_pairings(5:3, universe=5:1) 33 | expect_identical(ref2, out2) 34 | 35 | # Works for elements out of range. 36 | out3 <- scran:::.expand_pairings(c("C", "D", "E"), universe=LETTERS[1:4]) 37 | out4 <- scran:::.expand_pairings(3:4, universe=1:4) 38 | expect_identical(out3, out4) 39 | 40 | # Even when empty. 41 | out <- scran:::.expand_pairings(integer(0), universe=1:5) 42 | expect_identical(out$id1, integer(0)) 43 | expect_identical(out$id2, integer(0)) 44 | }) 45 | 46 | test_that("expand_pairings works as expected for lists", { 47 | out <- scran:::.expand_pairings(list(c(1,5), c(2:4)), universe=1:5) 48 | expect_identical(out$mode, "double pool") 49 | 50 | grid <- expand.grid(c(1L,5L), 2:4) 51 | expect_identical(out$id1, grid[,1]) 52 | expect_identical(out$id2, grid[,2]) 53 | 54 | ref2 <- scran:::.expand_pairings(list(1:3, 4:5), universe=1:5) 55 | out2 <- scran:::.expand_pairings(list(c(5,3,1),c(4,2)), universe=c(5,3,1,4,2)) 56 | expect_identical(ref2, out2) 57 | 58 | # Works for character vectors. 59 | out2 <- scran:::.expand_pairings(list(c("B", "F"), c("C", "D", "E")), universe=LETTERS[2:6]) 60 | expect_identical(out, out2) 61 | 62 | # Works for NULL. 63 | null <- scran:::.expand_pairings(list(NULL, NULL), universe=LETTERS[2:6]) 64 | ref <- scran:::.expand_pairings(NULL, universe=LETTERS[2:6]) 65 | expect_identical(null[1:2], ref[1:2]) 66 | 67 | # Works for elements out of range. 68 | out3 <- scran:::.expand_pairings(list(c(1,3,5), c(2, 4, 6)), universe=1:4) 69 | out4 <- scran:::.expand_pairings(list(c(1,3), c(2, 4)), universe=1:4) 70 | expect_identical(out3, out4) 71 | 72 | # Even when empty. 73 | out <- scran:::.expand_pairings(list(1:5, 1:5), universe=integer(0)) 74 | expect_identical(out$id1, integer(0)) 75 | expect_identical(out$id2, integer(0)) 76 | }) 77 | 78 | test_that("expand_pairings works as expected for matrices", { 79 | mat <- cbind(1:5, 2:6) 80 | out <- scran:::.expand_pairings(mat, universe=1:6) 81 | expect_identical(out$mode, "predefined pairs") 82 | expect_identical(out$id1, mat[,1]) 83 | expect_identical(out$id2, mat[,2]) 84 | 85 | # Filters out invalid pairs. 86 | ref <- scran:::.expand_pairings(mat[1:4,], universe=1:5) 87 | out <- scran:::.expand_pairings(mat, universe=1:5) 88 | expect_identical(ref, out) 89 | 90 | # Handles empty inputs. 91 | out <- scran:::.expand_pairings(mat[0,], universe=integer(0)) 92 | expect_identical(out$id1, integer(0)) 93 | expect_identical(out$id2, integer(0)) 94 | }) 95 | -------------------------------------------------------------------------------- /tests/testthat/test-fixed-pca.R: -------------------------------------------------------------------------------- 1 | # Tests the fixedPCA function. 2 | # library(testthat); library(scran); source("test-fixed-pca.R") 3 | 4 | library(scuttle) 5 | sce <- mockSCE() 6 | sce <- logNormCounts(sce) 7 | library(BiocSingular) 8 | 9 | test_that("fixedPCA works correctly", { 10 | set.seed(100) 11 | sce2 <- fixedPCA(sce, subset.row=NULL) 12 | set.seed(100) 13 | ref <- runPCA(t(logcounts(sce2)), rank=50, BSPARAM=bsparam()) 14 | expect_equal(unclass(reducedDim(sce2))[,], ref$x) 15 | 16 | set.seed(100) 17 | sce2 <- fixedPCA(sce, subset.row=1:200) 18 | set.seed(100) 19 | ref <- runPCA(t(logcounts(sce2)[1:200,]), rank=50, BSPARAM=bsparam()) 20 | expect_equal(unclass(reducedDim(sce2))[,], ref$x) 21 | expect_equal(logcounts(sce), logcounts(sce2)) 22 | 23 | # Doesn't preserve shape if we don't ask. 24 | set.seed(100) 25 | sce2 <- fixedPCA(sce, subset.row=1:200, preserve.shape=FALSE) 26 | expect_equal(unclass(reducedDim(sce2))[,], ref$x) 27 | expect_equal(logcounts(sce2), logcounts(sce)[1:200,]) 28 | 29 | set.seed(100) 30 | sce <- fixedPCA(sce, rank=20, subset.row=1:50) 31 | set.seed(100) 32 | ref <- runPCA(t(logcounts(sce)[1:50,]), rank=20, BSPARAM=bsparam()) 33 | expect_equal(unclass(reducedDim(sce))[,], ref$x) 34 | }) 35 | 36 | test_that("fixedPCA works correctly with low rank approximations", { 37 | set.seed(100) 38 | sce <- fixedPCA(sce, subset.row=NULL) 39 | set.seed(100) 40 | sce2 <- fixedPCA(sce, value="lowrank", subset.row=NULL) 41 | rot <- attr(reducedDim(sce), "rotation") 42 | expect_identical(as.matrix(assay(sce2, "lowrank")[1:10,]), tcrossprod(rot[1:10,], reducedDim(sce))) 43 | 44 | # Works with subsetting. 45 | set.seed(100) 46 | sce3 <- fixedPCA(rbind(sce, sce[1:10,]), subset.row=seq_len(nrow(sce)), value="lowrank") 47 | expect_identical(assay(sce2, "lowrank"), assay(sce3, "lowrank")[seq_len(nrow(sce)),]) 48 | expect_equal(assay(sce2, "lowrank")[1:10,], assay(sce3, "lowrank")[nrow(sce)+1:10,], tol=1e-6) 49 | 50 | # Won't preserve the shape. 51 | set.seed(100) 52 | sce4 <- fixedPCA(rbind(sce, sce[1:10,]), subset.row=seq_len(nrow(sce)), value="lowrank", preserve.shape=FALSE) 53 | expect_identical(assay(sce2, "lowrank"), assay(sce4, "lowrank")) 54 | }) 55 | 56 | test_that("fixedPCA warns when subset.row is not specified", { 57 | expect_warning(fixedPCA(sce), "subset.row") 58 | }) 59 | 60 | -------------------------------------------------------------------------------- /tests/testthat/test-multi-markers.R: -------------------------------------------------------------------------------- 1 | # This tests the multiMarkerStats function. 2 | # library(scran); library(testthat); source("test-multi-markers.R") 3 | 4 | set.seed(1000) 5 | library(scuttle) 6 | sce <- mockSCE() 7 | sce <- logNormCounts(sce) 8 | 9 | # Any clustering method is okay, only using k-means for convenience. 10 | kout <- kmeans(t(logcounts(sce)), centers=4) 11 | 12 | tout <- findMarkers(sce, groups=kout$cluster, direction="up") 13 | wout <- findMarkers(sce, groups=kout$cluster, direction="up", test="wilcox") 14 | bout <- findMarkers(sce, groups=kout$cluster, direction="up", test="binom") 15 | 16 | test_that("multiMarkerStats preserves single inputs correctly", { 17 | single <- multiMarkerStats(t=tout) 18 | 19 | for (i in seq_along(single)) { 20 | expect_equivalent(as.matrix(single[[i]][,1:3]), as.matrix(single[[i]][,1:3+3])) 21 | expect_equal(single[[i]][,1:3], tout[[i]][,1:3]) 22 | 23 | lfc <- single[[i]][,-(1:6)] 24 | colnames(lfc) <- sub("^t.", "", colnames(lfc)) 25 | expect_equal(lfc, tout[[i]][,-(1:3)]) 26 | } 27 | }) 28 | 29 | test_that("multiMarkerStats interleaves multiple inputs correctly", { 30 | combined <- multiMarkerStats(t=tout, wilcox=wout, binom=bout) 31 | 32 | for (i in seq_along(combined)) { 33 | curcom <- combined[[i]] 34 | expect_identical( 35 | colnames(curcom)[-(1:3)], 36 | as.character(rbind( 37 | paste0("t.", colnames(tout[[i]])), 38 | paste0("wilcox.", colnames(wout[[i]])), 39 | paste0("binom.", colnames(bout[[i]])) 40 | )) 41 | ) 42 | 43 | curt <- tout[[i]][rownames(curcom),] 44 | curw <- wout[[i]][rownames(curcom),] 45 | curb <- bout[[i]][rownames(curcom),] 46 | 47 | expect_identical(curcom$Top, pmax(curt$Top, curw$Top, curb$Top)) 48 | expect_identical(curcom$p.value, pmax(curt$p.value, curw$p.value, curb$p.value)) 49 | 50 | expect_identical(curcom$t.Top, curt$Top) 51 | expect_identical(curcom$wilcox.Top, curw$Top) 52 | expect_identical(curcom$binom.Top, curb$Top) 53 | 54 | expect_equivalent( 55 | as.matrix(curt[,-(1:3)]), 56 | as.matrix(curcom[,grep("^t\\..*logFC", colnames(curcom))]) 57 | ) 58 | expect_equivalent( 59 | as.matrix(curw[,-(1:3)]), 60 | as.matrix(curcom[,grep("^wilcox\\..*AUC", colnames(curcom))]) 61 | ) 62 | expect_equivalent( 63 | as.matrix(curb[,-(1:3)]), 64 | as.matrix(curcom[,grep("^binom\\..*logFC", colnames(curcom))]) 65 | ) 66 | } 67 | }) 68 | 69 | test_that("multiMarkerStats works correctly without 'Top' inputs", { 70 | tout2 <- findMarkers(sce, groups=kout$cluster, direction="up", pval.type="all") 71 | wout2 <- findMarkers(sce, groups=kout$cluster, direction="up", test="wilcox", pval.type="all") 72 | bout2 <- findMarkers(sce, groups=kout$cluster, direction="up", test="binom", pval.type="all") 73 | 74 | combined <- multiMarkerStats(t=tout2, wilcox=wout2, binom=bout2) 75 | for (i in seq_along(combined)) { 76 | curcom <- combined[[i]] 77 | expect_false("Top" %in% colnames(curcom)) 78 | expect_false(is.unsorted(curcom$p.value)) 79 | } 80 | }) 81 | 82 | test_that("multiMarkerStats works correctly with log-transformed inputs", { 83 | ltout <- findMarkers(sce, groups=kout$cluster, log.p=TRUE, direction="up") 84 | lwout <- findMarkers(sce, groups=kout$cluster, log.p=TRUE, direction="up", test="wilcox") 85 | lbout <- findMarkers(sce, groups=kout$cluster, log.p=TRUE, direction="up", test="binom") 86 | 87 | ref <- multiMarkerStats(t=tout, wilcox=wout, binom=bout) 88 | combined <- multiMarkerStats(t=ltout, wilcox=lwout, binom=lbout) 89 | for (i in seq_along(combined)) { 90 | expect_equal(log(ref[[i]]$p.value), combined[[i]]$log.p.value) 91 | expect_equal(log(ref[[i]]$FDR), combined[[i]]$log.FDR) 92 | } 93 | }) 94 | 95 | test_that("multiMarkerStats respects annotation correctly", { 96 | stuff <- DataFrame(stuff=runif(nrow(sce))) 97 | rownames(stuff) <- rownames(sce) 98 | 99 | touta <- findMarkers(sce, groups=kout$cluster, direction="up", row.data=stuff) 100 | wouta <- findMarkers(sce, groups=kout$cluster, direction="up", test="wilcox", row.data=stuff) 101 | bouta <- findMarkers(sce, groups=kout$cluster, direction="up", test="binom", row.data=stuff) 102 | 103 | combined <- multiMarkerStats(t=touta, wilcox=wouta, binom=bouta, repeated="stuff") 104 | for (i in seq_along(combined)) { 105 | expect_equal(combined[[i]][rownames(sce),"stuff"], stuff$stuff) 106 | } 107 | }) 108 | 109 | test_that("multiMarkerStats raises the expected set of errors", { 110 | toutx <- tout 111 | colnames(toutx[[1]]) <- paste0("A:", colnames(toutx[[1]])) 112 | expect_error(multiMarkerStats(t=toutx, wilcox=wout, binom=bout), "either all or no") 113 | 114 | toutx[[1]] <- toutx[[1]][,-1] 115 | expect_error(multiMarkerStats(t=toutx, wilcox=wout, binom=bout), "different numbers of columns") 116 | 117 | toutx[[1]] <- tout[[1]] 118 | rownames(toutx[[1]]) <- paste0("X", rownames(tout[[1]])) 119 | expect_error(multiMarkerStats(t=toutx, wilcox=wout, binom=bout), "row names") 120 | rownames(toutx[[1]]) <- NULL 121 | expect_error(multiMarkerStats(t=toutx, wilcox=wout, binom=bout), "non-NULL") 122 | }) 123 | -------------------------------------------------------------------------------- /tests/testthat/test-pseudo-spec.R: -------------------------------------------------------------------------------- 1 | # This tests the pseudoBulkSpecific functions. 2 | # library(testthat); library(scran); source("test-pseudo-spec.R") 3 | 4 | set.seed(10000) 5 | library(scuttle) 6 | sce <- mockSCE(ncells=1000) 7 | sce$samples <- gl(8, 125) # Pretending we have 8 samples. 8 | 9 | # Making up some clusters. 10 | sce <- logNormCounts(sce) 11 | clusters <- kmeans(t(logcounts(sce)), centers=3)$cluster 12 | 13 | # Creating a set of pseudo-bulk profiles: 14 | info <- DataFrame(sample=sce$samples, cluster=clusters) 15 | pseudo <- sumCountsAcrossCells(sce, info) 16 | pseudo$DRUG <- gl(2,4)[pseudo$sample] 17 | 18 | test_that("pseudoBulkSpecific works correctly in vanilla cases", { 19 | # Spiking in DE for all clusters. 20 | pseudo2 <- pseudo 21 | xDRUG <- pseudo$DRUG 22 | assay(pseudo2)[1,xDRUG==1] <- assay(pseudo2)[1,xDRUG==1] * 100 23 | assay(pseudo2)[2,xDRUG==2] <- assay(pseudo2)[2,xDRUG==2] * 100 24 | 25 | ref <- pseudoBulkDGE(pseudo2, 26 | label=pseudo2$cluster, 27 | design=~DRUG, 28 | coef="DRUG2" 29 | ) 30 | 31 | out <- pseudoBulkSpecific(pseudo2, 32 | label=pseudo2$cluster, 33 | design=~DRUG, 34 | coef="DRUG2" 35 | ) 36 | 37 | expect_identical(names(ref), names(out)) 38 | for (i in names(ref)) { 39 | left <- ref[[i]] 40 | right <- out[[i]] 41 | expect_true(all(left$PValue <= right$PValue)) 42 | expect_identical(left$LogFC, right$LogFC) 43 | expect_true(all(right$PValue[1:2] > 0.01)) 44 | expect_true(all(left$PValue[1:2] < 0.01)) 45 | } 46 | 47 | # Also works for voom. 48 | ref <- pseudoBulkDGE(pseudo2, 49 | label=pseudo2$cluster, 50 | design=~DRUG, 51 | coef="DRUG2", 52 | method="voom" 53 | ) 54 | 55 | out <- pseudoBulkSpecific(pseudo2, 56 | label=pseudo2$cluster, 57 | design=~DRUG, 58 | coef="DRUG2", 59 | method="voom" 60 | ) 61 | 62 | expect_identical(names(ref), names(out)) 63 | for (i in names(ref)) { 64 | left <- ref[[i]] 65 | right <- out[[i]] 66 | expect_true(all(left$P.Value <= right$P.Value)) 67 | expect_identical(left$LogFC, right$LogFC) 68 | expect_true(all(right$P.Value[1:2] > 0.05)) 69 | expect_true(all(left$P.Value[1:2] < 0.05)) 70 | } 71 | }) 72 | 73 | test_that("pseudoBulkSpecific gives the same answers with a reference", { 74 | out <- pseudoBulkSpecific(pseudo, 75 | label=pseudo$cluster, 76 | design=~DRUG, 77 | coef="DRUG2" 78 | ) 79 | 80 | ref <- pseudoBulkDGE(pseudo, 81 | label=pseudo$cluster, 82 | design=~DRUG, 83 | coef="DRUG2" 84 | ) 85 | metadata(ref)$tag <- "I'm here!" 86 | metadata(ref[[1]])$tag <- "I'm still here!" 87 | 88 | out2 <- pseudoBulkSpecific(pseudo, 89 | label=pseudo$cluster, 90 | design=~DRUG, 91 | coef="DRUG2", 92 | reference=ref 93 | ) 94 | 95 | expect_identical(metadata(out2)$tag, "I'm here!") 96 | expect_identical(metadata(out2[[1]])$tag, "I'm still here!") 97 | 98 | metadata(out2)$tag <- NULL 99 | metadata(out2[[1]])$tag <- NULL 100 | expect_identical(out, out2) 101 | }) 102 | 103 | test_that("pseudoBulkSpecific works correctly with sorting", { 104 | ref <- pseudoBulkDGE(pseudo, 105 | label=pseudo$cluster, 106 | design=~DRUG, 107 | coef="DRUG2" 108 | ) 109 | 110 | out <- pseudoBulkSpecific(pseudo, 111 | label=pseudo$cluster, 112 | design=~DRUG, 113 | coef="DRUG2", 114 | reference=ref 115 | ) 116 | 117 | out2 <- pseudoBulkSpecific(pseudo, 118 | label=pseudo$cluster, 119 | design=~DRUG, 120 | coef="DRUG2", 121 | reference=ref, 122 | sorted=TRUE 123 | ) 124 | 125 | for (i in names(ref)) { 126 | left <- out[[i]] 127 | right <- out2[[i]] 128 | expect_identical(left[order(left$PValue),], right) 129 | } 130 | }) 131 | 132 | test_that("pseudoBulkSpecific works correctly with zero replacement", { 133 | pseudo2 <- pseudo 134 | assay(pseudo2)[1,pseudo2$cluster!=1] <- 0 135 | 136 | ref <- pseudoBulkDGE(pseudo2, 137 | label=pseudo2$cluster, 138 | design=~DRUG, 139 | coef="DRUG2" 140 | ) 141 | 142 | out <- pseudoBulkSpecific(pseudo2, 143 | label=pseudo2$cluster, 144 | design=~DRUG, 145 | coef="DRUG2", 146 | reference=ref, 147 | ) 148 | 149 | out2 <- pseudoBulkSpecific(pseudo2, 150 | label=pseudo2$cluster, 151 | design=~DRUG, 152 | coef="DRUG2", 153 | reference=ref, 154 | missing.as.zero=TRUE 155 | ) 156 | 157 | expect_identical(out[[1]]$OtherAverage[1], NA_real_) 158 | expect_identical(out2[[1]]$OtherAverage[1], 0) 159 | }) 160 | 161 | test_that("pseudoBulkSpecific errors out correctly", { 162 | expect_error(pseudoBulkSpecific(pseudo, 163 | label=pseudo$cluster, 164 | design=~DRUG, 165 | coef=1:2 166 | )) 167 | }) 168 | -------------------------------------------------------------------------------- /tests/testthat/test-sandbag.R: -------------------------------------------------------------------------------- 1 | # This tests the sandbag function, by running it and checking that the selecter markers make sense. 2 | # require(scran); require(testthat); source("test-sandbag.R") 3 | 4 | happycheck <- function(X1, X2, X3, pairings, frac) { 5 | is.okay <- logical(nrow(pairings)) 6 | thresh1 <- ceiling(ncol(X1) * frac) 7 | thresh2 <- ceiling(ncol(X2) * frac) 8 | thresh3 <- ceiling(ncol(X3) * frac) 9 | for (p in seq_along(is.okay)) { 10 | diff1 <- X1[pairings$first[p],] - X1[pairings$second[p],] 11 | diff2 <- X2[pairings$first[p],] - X2[pairings$second[p],] 12 | diff3 <- X3[pairings$first[p],] - X3[pairings$second[p],] 13 | 14 | u1 <- sum(diff1 > 0) 15 | u2 <- sum(diff2 > 0) 16 | u3 <- sum(diff3 > 0) 17 | d1 <- sum(diff1 < 0) 18 | d2 <- sum(diff2 < 0) 19 | d3 <- sum(diff3 < 0) 20 | 21 | is.okay[p] <- (u1 >= thresh1 && d2 >= thresh2 && d3 >= thresh3) || 22 | (d1 >= thresh1 && u2 >= thresh2 && u3 >= thresh3) 23 | } 24 | return(is.okay) 25 | } 26 | 27 | #################################################################################################### 28 | 29 | set.seed(100) 30 | 31 | Ngenes <- 100 32 | phases <- sample(3, 100, replace=TRUE) 33 | is.G1 <- phases==1L 34 | is.G2M <- phases==2L 35 | is.S <- phases==3L 36 | 37 | frac <- 0.5 38 | X <- matrix(rpois(Ngenes*length(phases), lambda=10), nrow=Ngenes) 39 | rownames(X) <- paste0("X", seq_len(Ngenes)) 40 | cur.classes <- list(G1=is.G1, S=is.S, G2M=is.G2M) 41 | out <- sandbag(X, cur.classes, fraction=frac) 42 | 43 | XG1 <- X[,is.G1,drop=FALSE] 44 | XS <- X[,is.S,drop=FALSE] 45 | XG2M <- X[,is.G2M,drop=FALSE] 46 | 47 | expect_true(all(happycheck(XG1, XS, XG2M, out$G1, frac))) 48 | expect_true(all(happycheck(XG2M, XS, XG1, out$G2M, frac))) 49 | expect_true(all(happycheck(XS, XG1, XG2M, out$S, frac))) 50 | 51 | # Checking silly inputs. 52 | 53 | out <- sandbag(X[0,], cur.classes, fraction=frac) 54 | expect_identical(out$G1, data.frame(first=character(0), second=character(0), stringsAsFactors=FALSE)) 55 | expect_identical(out$G2M, data.frame(first=character(0), second=character(0), stringsAsFactors=FALSE)) 56 | expect_identical(out$S, data.frame(first=character(0), second=character(0), stringsAsFactors=FALSE)) 57 | expect_error(sandbag(X, list(G1=integer(0), S=is.S, G2M=is.G2M), fraction=frac), "each class must have at least one cell") 58 | expect_error(sandbag(X, unname(cur.classes), fraction=frac), "names") 59 | 60 | is.G1 <- 1 61 | is.G2M <- 2 62 | is.S <- 3 63 | out <- sandbag(X, list(G1=is.G1, S=is.S, G2M=is.G2M), fraction=frac) 64 | XG1 <- X[,is.G1,drop=FALSE] 65 | XS <- X[,is.S,drop=FALSE] 66 | XG2M <- X[,is.G2M,drop=FALSE] 67 | expect_true(all(happycheck(XG1, XS, XG2M, out$G1, frac))) 68 | expect_true(all(happycheck(XG2M, XS, XG1, out$G2M, frac))) 69 | expect_true(all(happycheck(XS, XG1, XG2M, out$S, frac))) 70 | 71 | # Testing for a SCESet, without spike-ins. 72 | 73 | set.seed(200) 74 | test_that("sandbag works correctly with SingleCellExperiment objects", { 75 | Ngenes <- 100 76 | phases <- sample(3, 100, replace=TRUE) 77 | is.G1 <- phases==1L 78 | is.G2M <- phases==2L 79 | is.S <- phases==3L 80 | 81 | frac <- 0.5 82 | X <- matrix(rpois(Ngenes*length(phases), lambda=10), nrow=Ngenes) 83 | rownames(X) <- paste0("X", seq_len(Ngenes)) 84 | X <- SingleCellExperiment(list(counts=X)) 85 | 86 | cur.classes <- list(G1=is.G1, S=is.S, G2M=is.G2M) 87 | out <- sandbag(X, cur.classes, fraction=frac) 88 | expect_identical(sandbag(counts(X), cur.classes, fraction=frac), out) 89 | 90 | XG1 <- counts(X[,is.G1,drop=FALSE]) 91 | XS <- counts(X[,is.S,drop=FALSE]) 92 | XG2M <- counts(X[,is.G2M,drop=FALSE]) 93 | 94 | expect_true(all(happycheck(XG1, XS, XG2M, out$G1, frac))) 95 | expect_true(all(happycheck(XG2M, XS, XG1, out$G2M, frac))) 96 | expect_true(all(happycheck(XS, XG1, XG2M, out$S, frac))) 97 | }) 98 | -------------------------------------------------------------------------------- /tests/testthat/test-subclust.R: -------------------------------------------------------------------------------- 1 | # Tests the quickSubCluster utility. 2 | # require(scran); require(testthat); source("setup.R"); source("test-subclust.R") 3 | 4 | library(scran) 5 | 6 | set.seed(30000) 7 | ncells <- 700 8 | ngenes <- 1000 9 | dummy <- matrix(rnbinom(ncells*ngenes, mu=10, size=20), ncol=ncells, nrow=ngenes) 10 | 11 | set.seed(30001) 12 | test_that("quickSubCluster's defaults are consistent with quickCluster's defaults", { 13 | output <- quickCluster(dummy, min.size=0) 14 | output2 <- quickSubCluster(dummy, groups=rep(1, ncol(dummy))) 15 | expect_identical(as.character(output), sub(".*\\.", "", output2[[1]]$subcluster)) 16 | 17 | sampling <- sample(3, ncells, replace=TRUE) 18 | output <- quickSubCluster(dummy, groups=sampling) 19 | for (i in unique(sampling)) { 20 | ref <- quickCluster(dummy[,i==sampling], min.size=0) 21 | expect_identical(as.character(ref), sub(".*\\.", "", output[[i]]$subcluster)) 22 | } 23 | }) 24 | 25 | set.seed(30001) 26 | test_that("quickSubCluster's metadata output makes sense", { 27 | sampling <- sample(3, ncells, replace=TRUE) 28 | output <- quickSubCluster(dummy, groups=sampling) 29 | 30 | for (i in unique(sampling)) { 31 | expect_identical(metadata(output)$index[[i]], which(sampling==i)) 32 | expect_identical(metadata(output)$subcluster[metadata(output)$index[[i]]], output[[i]]$subcluster) 33 | } 34 | 35 | expect_identical(sub("\\..*", "", metadata(output)$subcluster), as.character(sampling)) 36 | 37 | raw.out <- quickSubCluster(dummy, groups=sampling, simplify=TRUE) 38 | expect_identical(metadata(output)$subcluster, raw.out) 39 | }) 40 | 41 | set.seed(30002) 42 | test_that("quickSubCluster behaves correctly upon changing the assay", { 43 | dummy <- matrix(rnbinom(ncells*ngenes, mu=10, size=20), ncol=ncells, nrow=ngenes) 44 | sampling <- sample(3, ncells, replace=TRUE) 45 | sce <- SingleCellExperiment(list(whee=dummy, blah=scuttle::normalizeCounts(dummy))) 46 | 47 | output <- quickSubCluster(dummy, groups=sampling) 48 | output2 <- quickSubCluster(sce, groups=sampling, assay.type="whee") 49 | expect_identical(lapply(output, reducedDim), lapply(output2, reducedDim)) 50 | 51 | # Checking that we get the same output when we turn off normalization. 52 | expect_error(output <- quickSubCluster(scuttle::normalizeCounts(dummy), groups=sampling, normalize=FALSE), NA) 53 | output2 <- quickSubCluster(sce, groups=sampling, normalize=FALSE, assay.type="blah") 54 | expect_identical(lapply(output, reducedDim), lapply(output2, reducedDim)) 55 | }) 56 | 57 | set.seed(30003) 58 | test_that("quickSubCluster avoids subclustering with too few cells", { 59 | ncells <- 99 60 | dummy <- matrix(rnbinom(ncells*ngenes, mu=10, size=20), ncol=ncells, nrow=ngenes) 61 | sampling <- sample(2, ncells, replace=TRUE) 62 | 63 | suppressWarnings(output <- quickSubCluster(dummy, groups=sampling)) 64 | has.sub1 <- any(grepl("\\.", output[[1]]$subcluster)) 65 | has.sub2 <- any(grepl("\\.", output[[2]]$subcluster)) 66 | expect_true(has.sub1!=has.sub2) 67 | 68 | # Avoids crashing with one-cell clusters. 69 | test <- quickSubCluster(dummy, groups=rep(1:2, c(1, ncells-1))) 70 | expect_equivalent(counts(test[[1]]), dummy[,1,drop=FALSE]) 71 | }) 72 | 73 | set.seed(30001) 74 | test_that("quickSubCluster restrictions work as expected", { 75 | sampling <- sample(LETTERS[1:3], ncells, replace=TRUE) 76 | 77 | set.seed(100) 78 | suppressWarnings(full <- quickSubCluster(dummy, groups=sampling)) 79 | expect_identical(names(full), LETTERS[1:3]) 80 | 81 | set.seed(100) 82 | suppressWarnings(res <- quickSubCluster(dummy, groups=sampling, restricted="A")) 83 | expect_identical(names(res), "A") 84 | expect_identical(res$A, full$A) 85 | 86 | idx <- which(sampling == "A") 87 | expect_identical(metadata(res)$index, list(A=idx)) 88 | expect_identical(metadata(res)$subcluster[idx], metadata(full)$subcluster[idx]) 89 | expect_identical(metadata(res)$subcluster[-idx], sampling[-idx]) 90 | }) 91 | 92 | -------------------------------------------------------------------------------- /tests/testthat/test-top-hvgs.R: -------------------------------------------------------------------------------- 1 | # This tests the getTopHVGs function. 2 | # library(testthat); library(scran); source("test-top-hvgs.R") 3 | 4 | library(scuttle) 5 | sce <- mockSCE() 6 | sce <- logNormCounts(sce) 7 | 8 | expect_identical_sorted <- function(x, y) expect_identical(sort(x), sort(y)) 9 | 10 | test_that("getTopHVGs works correctly", { 11 | stats <- modelGeneVar(sce) 12 | 13 | expect_identical_sorted(getTopHVGs(stats), 14 | rownames(stats)[stats$bio > 0]) 15 | 16 | expect_identical_sorted(getTopHVGs(stats, fdr.threshold=0.05), 17 | rownames(stats)[stats$bio > 0 & stats$FDR <= 0.05]) 18 | 19 | # Handles top choices correctly. 20 | expect_identical(getTopHVGs(stats, n=200, var.threshold=NULL), 21 | head(rownames(stats)[order(-stats$bio)], 200)) 22 | 23 | expect_identical(getTopHVGs(stats, n=200, prop=0.1, var.threshold=NULL), 24 | head(rownames(stats)[order(-stats$bio)], 0.1*nrow(stats))) 25 | 26 | expect_identical(getTopHVGs(stats, n=2000, prop=0.001, var.threshold=NULL), 27 | head(rownames(stats)[order(-stats$bio)], 2000)) 28 | 29 | expect_identical(getTopHVGs(stats, n=NULL, prop=0.1, var.threshold=NULL), 30 | head(rownames(stats)[order(-stats$bio)], 0.1*nrow(stats))) 31 | 32 | expect_identical(getTopHVGs(stats, n=Inf, var.threshold=NULL), 33 | rownames(stats)[order(-stats$bio)]) 34 | }) 35 | 36 | test_that("getTopHVGs handles unnamed inputs correctly", { 37 | stats <- modelGeneVar(sce) 38 | rownames(stats) <- NULL 39 | 40 | expect_identical_sorted(getTopHVGs(stats), which(stats$bio > 0)) 41 | 42 | expect_identical_sorted(getTopHVGs(stats, fdr.threshold=0.05), 43 | which(stats$bio > 0 & stats$FDR <= 0.05)) 44 | 45 | expect_identical(getTopHVGs(stats, var.threshold=NULL), 46 | head(order(stats$bio, decreasing=TRUE), 2000)) 47 | }) 48 | 49 | test_that("getTopHVGs handles NA values", { 50 | stats <- modelGeneVar(sce) 51 | stats$bio[1] <- 10000 52 | stats$FDR[1] <- NA 53 | stats$bio[2] <- NA 54 | stats$FDR[2] <- 0 55 | 56 | expect_identical_sorted( 57 | getTopHVGs(stats, fdr.threshold=0.05, var.threshold=NULL), 58 | getTopHVGs(stats[-1,], fdr.threshold=0.05, var.threshold=NULL) 59 | ) 60 | 61 | expect_identical_sorted( 62 | getTopHVGs(stats), 63 | getTopHVGs(stats[-2,]) 64 | ) 65 | 66 | expect_identical_sorted( 67 | getTopHVGs(stats, fdr.threshold=0.05), 68 | getTopHVGs(stats[-(1:2),], fdr.threshold=0.05) 69 | ) 70 | }) 71 | 72 | test_that("getTopHVGs works correctly with CV2", { 73 | stats2 <- modelGeneCV2(sce) 74 | 75 | expect_identical_sorted(getTopHVGs(stats2, var.field="ratio", var.threshold=1), 76 | rownames(stats2)[stats2$ratio > 1]) 77 | 78 | expect_identical_sorted(getTopHVGs(stats2, var.field="ratio", var.threshold=1, fdr.threshold=0.05), 79 | rownames(stats2)[stats2$ratio > 1 & stats2$FDR <= 0.05]) 80 | 81 | expect_identical_sorted(getTopHVGs(stats2, var.field="ratio", n=200, var.threshold=NULL), 82 | head(rownames(stats2)[order(-stats2$ratio)], 200)) 83 | 84 | expect_identical_sorted(getTopHVGs(stats2, var.field="ratio", n=Inf, var.threshold=NULL), 85 | rownames(stats2)[order(-stats2$ratio)]) 86 | }) 87 | -------------------------------------------------------------------------------- /tests/testthat/test-trend-cv2.R: -------------------------------------------------------------------------------- 1 | # This tests the various trendCV2() options. 2 | # require(scran); require(testthat); source("test-trend-cv2.R") 3 | 4 | set.seed(20002) 5 | ncells <- 200 6 | ngenes <- 1000 7 | means <- 2^runif(ngenes, -1, 5) 8 | dummy <- matrix(rnbinom(ngenes*ncells, mu=means, size=5), ncol=ncells, nrow=ngenes) 9 | 10 | library(DelayedMatrixStats) 11 | means <- rowMeans(dummy) 12 | cv2s <- rowVars(dummy)/means^2 13 | 14 | test_that("trendCV2 works on a basic scenario", { 15 | out <- fitTrendCV2(means, cv2s, ncells) 16 | expect_true(out$std.dev > 0) 17 | expect_is(out$trend, "function") 18 | 19 | # Hard to test it without copying the code, so I'll just check the limits. 20 | expect_equal(out$trend(0), Inf) 21 | expect_equal(out$trend(1:10), sapply(1:10, out$trend)) # Checking we get consistent results with returned function. 22 | expect_equal(out$trend(100:1/20), sapply(100:1/20, out$trend)) # More checking, reversed order. 23 | }) 24 | 25 | test_that("trendCV2 prunes out capped CV2 values", { 26 | dummy[1:10,] <- 0 27 | dummy[1:10,1] <- 1:10*100 28 | means <- rowMeans(dummy) 29 | cv2s <- rowVars(dummy)/means^2 30 | 31 | out <- fitTrendCV2(means, cv2s, ncells) 32 | ref <- fitTrendCV2(means[-(1:10)], cv2s[-(1:10)], ncells) 33 | 34 | expect_identical(out$std.dev, ref$std.dev) 35 | expect_identical(out$trend(1:100), ref$trend(1:100)) 36 | }) 37 | 38 | test_that("trendCV2 handles nls errors gracefully", { 39 | cv2.err <- jitter(10/1:10) 40 | m.err <- 1:10 41 | expect_error(fitTrendCV2(m.err, cv2.err, 10, simplified=FALSE), "singular gradient") 42 | expect_error(fit <- fitTrendCV2(m.err, cv2.err, 10), NA) 43 | }) 44 | -------------------------------------------------------------------------------- /tests/testthat/test-trend-var.R: -------------------------------------------------------------------------------- 1 | # This tests the various fitTrendVar() options. 2 | # require(scran); require(testthat); source("test-trend-var.R") 3 | 4 | set.seed(20000) 5 | ncells <- 200 6 | ngenes <- 1000 7 | means <- 2^runif(ngenes, -1, 5) 8 | dummy <- matrix(rnbinom(ngenes*ncells, mu=means, size=5), ncol=ncells, nrow=ngenes) 9 | 10 | library(scuttle) 11 | out <- normalizeCounts(dummy) 12 | 13 | library(DelayedMatrixStats) 14 | means <- rowMeans(out) 15 | vars <- rowVars(out) 16 | 17 | test_that("fitTrendVar works on a basic scenario", { 18 | out <- fitTrendVar(means, vars) 19 | expect_true(out$std.dev > 0) 20 | expect_is(out$trend, "function") 21 | 22 | # Hard to test it without copying the code, so I'll just check the limits. 23 | expect_equal(out$trend(0), 0) 24 | expect_equal(out$trend(1:10), sapply(1:10, out$trend)) # Checking we get consistent results with returned function. 25 | expect_equal(out$trend(100:1/20), sapply(100:1/20, out$trend)) # More checking, reversed order. 26 | 27 | # Parametric left edge limits work (almost) correctly (below 0.1). 28 | expect_equal(out$trend(0.01)*2, out$trend(0.02), tol=1e-4) 29 | expect_equal(out$trend(0.01)*5, out$trend(0.05), tol=1e-4) 30 | 31 | # Parametric right limits do not use rule=2. 32 | mx <- max(means) 33 | expect_false(isTRUE(all.equal(out$trend(mx), out$trend(mx+1)))) 34 | }) 35 | 36 | test_that("fitTrendVar works when parametric mode is turned off", { 37 | out <- fitTrendVar(means, vars, parametric=FALSE) 38 | expect_equal(out$trend(0), 0) 39 | expect_equal(out$trend(1:10), sapply(1:10, out$trend)) 40 | expect_equal(out$trend(100:1/20), sapply(100:1/20, out$trend)) 41 | 42 | # Parametric left edge limits work correctly (below 0.1). 43 | expect_equal(out$trend(0.01)*2, out$trend(0.02)) 44 | expect_equal(out$trend(0.01)*5, out$trend(0.05)) 45 | expect_equal(out$trend(0.01)*10, out$trend(0.1)) 46 | 47 | # Right edge goes to rule=2. 48 | mx <- max(means) 49 | expect_equal(out$trend(mx), out$trend(mx+1)) 50 | }) 51 | 52 | test_that("fitTrendVar works when lowess mode is turned off", { 53 | out <- fitTrendVar(means, vars, lowess=FALSE) 54 | expect_equal(out$trend(0), 0) 55 | expect_equal(out$trend(1:10), sapply(1:10, out$trend)) 56 | expect_equal(out$trend(100:1/20), sapply(100:1/20, out$trend)) 57 | 58 | # Right limits do not use rule=2. 59 | mx <- max(means) 60 | expect_false(isTRUE(all.equal(out$trend(mx), out$trend(mx+1)))) 61 | 62 | expect_error(fitTrendVar(means, vars, lowess=FALSE, parametric=FALSE), "at least one") 63 | }) 64 | 65 | test_that("fitTrendVar works when density weights are turned off", { 66 | out <- fitTrendVar(means, vars, density.weights=FALSE) 67 | expect_equal(out$trend(0), 0) 68 | expect_equal(out$trend(1:10), sapply(1:10, out$trend)) 69 | expect_equal(out$trend(100:1/20), sapply(100:1/20, out$trend)) 70 | 71 | # Right limits do not use rule=2. 72 | mx <- max(means) 73 | expect_false(isTRUE(all.equal(out$trend(mx), out$trend(mx+1)))) 74 | }) 75 | 76 | set.seed(91919) 77 | test_that("fitTrendVar handles nls errors gracefully", { 78 | # This requires the set.seed(), as sometimes it *doesn't* fail. 79 | # Damn the robustness of this algorithm! 80 | X <- runif(100) 81 | Y <- runif(100) 82 | expect_error(out <- fitTrendVar(X, Y), NA) 83 | expect_true("Aest" %in% ls(environment(environment(environment(out$trend)$FUN)$PARAMFUN))) 84 | 85 | expect_error(fitTrendVar(runif(2), runif(2)), "need at least 4") 86 | expect_error(fitTrendVar(runif(1), runif(1)), "need at least 2") 87 | }) 88 | -------------------------------------------------------------------------------- /vignettes/ref.bib: -------------------------------------------------------------------------------- 1 | @Article{scialdone2015computational, 2 | Author="Scialdone, A. and Natarajan, K. N. and Saraiva, L. R. and Proserpio, V. and Teichmann, S. A. and Stegle, O. and Marioni, J. C. and Buettner, F. ", 3 | Title="{{C}omputational assignment of cell-cycle stage from single-cell transcriptome data}", 4 | Journal="Methods", 5 | Year="2015", 6 | Volume="85", 7 | Pages="54--61", 8 | Month="Sep" 9 | } 10 | 11 | @Article{lun2016pooling, 12 | Author="Lun, A. T. and Bach, K. and Marioni, J. C. ", 13 | Title="{{P}ooling across cells to normalize single-cell {R}{N}{A} sequencing data with many zero counts}", 14 | Journal="Genome Biol.", 15 | Year="2016", 16 | Volume="17", 17 | Pages="75", 18 | Month="Apr" 19 | } 20 | 21 | @Article{lun2017assessing, 22 | Author="Lun, A. T. L. and Calero-Nieto, F. J. and Haim-Vilmovsky, L. and Gottgens, B. and Marioni, J. C. ", 23 | Title="{{A}ssessing the reliability of spike-in normalization for analyses of single-cell {R}{N}{A} sequencing data}", 24 | Journal="Genome Res.", 25 | Year="2017", 26 | Volume="27", 27 | Number="11", 28 | Pages="1795--1806", 29 | Month="Nov" 30 | } 31 | 32 | @Article{mccarthy2017scater, 33 | Author="McCarthy, D. J. and Campbell, K. R. and Lun, A. T. and Wills, Q. F. ", 34 | Title="{{S}cater: pre-processing, quality control, normalization and visualization of single-cell {R}{N}{A}-seq data in {R}}", 35 | Journal="Bioinformatics", 36 | Year="2017", 37 | Volume="33", 38 | Number="8", 39 | Pages="1179--1186", 40 | Month="Apr" 41 | } 42 | 43 | @Article{lun2016step, 44 | Author="Lun, A. T. and McCarthy, D. J. and Marioni, J. C. ", 45 | Title="{{A} step-by-step workflow for low-level analysis of single-cell {R}{N}{A}-seq data with {B}ioconductor}", 46 | Journal="F1000Res", 47 | Year="2016", 48 | Volume="5", 49 | Pages="2122" 50 | } 51 | 52 | @Article{grun2016denovo, 53 | Author="Grun, D. and Muraro, M. J. and Boisset, J. C. and Wiebrands, K. and Lyubimova, A. and Dharmadhikari, G. and van den Born, M. and van Es, J. and Jansen, E. and Clevers, H. and de Koning, E. J. P. and van Oudenaarden, A. ", 54 | Title="{{D}e {N}ovo {P}rediction of {S}tem {C}ell {I}dentity using {S}ingle-{C}ell {T}ranscriptome {D}ata}", 55 | Journal="Cell Stem Cell", 56 | Year="2016", 57 | Volume="19", 58 | Number="2", 59 | Pages="266--277", 60 | Month="08" 61 | } 62 | 63 | @Article{xu2015identification, 64 | Author="Xu, C. and Su, Z. ", 65 | Title="{{I}dentification of cell types from single-cell transcriptomes using a novel clustering method}", 66 | Journal="Bioinformatics", 67 | Year="2015", 68 | Volume="31", 69 | Number="12", 70 | Pages="1974--1980", 71 | Month="Jun" 72 | } 73 | --------------------------------------------------------------------------------