├── .Rbuildignore
├── .gitignore
├── DESCRIPTION
├── NAMESPACE
├── NEWS.md
├── R
    ├── RcppExports.R
    ├── acoshp1.R
    ├── delta_method_transform.R
    ├── helpers.R
    ├── residual_transform.R
    ├── sparse_sweep.R
    ├── transformGamPoi-package.R
    └── transformGamPoi.R
├── README.Rmd
├── README.md
├── inst
    └── CITATION
├── man
    ├── acosh_transform.Rd
    ├── dot-handle_data_parameter.Rd
    ├── estimate_size_factors.Rd
    ├── figures
    │   └── README-plotMeanVar-1.png
    ├── residual_transform.Rd
    └── transformGamPoi.Rd
├── src
    ├── .gitignore
    ├── RcppExports.cpp
    └── sparse_divide_out_size_factor.cpp
├── tests
    ├── testthat.R
    └── testthat
    │   ├── test-acoshp1.R
    │   ├── test-sparse_sweep.R
    │   ├── test-transform.R
    │   └── test-transformGamPoi.R
└── vignettes
    ├── .gitignore
    └── transformGamPoi.Rmd


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^README\.Rmd$
4 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.Rproj
2 | .Rproj.user
3 | .Rhistory
4 | .RData
5 | .Ruserdata
6 | inst/doc
7 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: transformGamPoi
 2 | Type: Package
 3 | Title: Variance Stabilizing Transformation for Gamma-Poisson Models
 4 | Version: 1.5.1
 5 | Authors@R: person("Constantin", "Ahlmann-Eltze", email = "artjom31415@googlemail.com", 
 6 |                role = c("aut", "cre"), comment = c(ORCID = "0000-0002-3762-068X"))
 7 | Description: Variance-stabilizing transformations help with the analysis of 
 8 |   heteroskedastic data (i.e., data where the variance is not constant, like count data).
 9 |   This package provide two types of variance stabilizing transformations: (1) methods based on the
10 |   delta method (e.g., 'acosh', 'log(x+1)'), (2) model residual based (Pearson and randomized 
11 |   quantile residuals).
12 | BugReports: https://github.com/const-ae/transformGamPoi/issues
13 | URL: https://github.com/const-ae/transformGamPoi
14 | License: GPL-3
15 | Encoding: UTF-8
16 | Imports:
17 |     glmGamPoi,
18 |     DelayedArray,
19 |     Matrix,
20 |     MatrixGenerics,
21 |     SummarizedExperiment,
22 |     HDF5Array,
23 |     methods,
24 |     utils,
25 |     Rcpp
26 | Suggests: 
27 |     testthat,
28 |     TENxPBMCData,
29 |     scran,
30 |     knitr,
31 |     rmarkdown,
32 |     BiocStyle
33 | Roxygen: list(markdown = TRUE)
34 | RoxygenNote: 7.1.2
35 | Config/testthat/edition: 3
36 | biocViews: SingleCell, Normalization, Preprocessing, Regression
37 | VignetteBuilder: knitr
38 | LinkingTo: 
39 |     Rcpp
40 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(acosh_transform)
 4 | export(residual_transform)
 5 | export(shifted_log_transform)
 6 | export(transformGamPoi)
 7 | importClassesFrom(Matrix,dgCMatrix)
 8 | importFrom(MatrixGenerics,colSums2)
 9 | importFrom(Rcpp,sourceCpp)
10 | importFrom(methods,as)
11 | importFrom(methods,canCoerce)
12 | importFrom(methods,is)
13 | importFrom(utils,head)
14 | useDynLib(transformGamPoi, .registration = TRUE)
15 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
 1 | # transformGamPoi 1.1.x
 2 | 
 3 | * Implement faster scaling with size factors for acosh and log-based transformations
 4 | * Implement analytic Pearson residuals
 5 | 
 6 | 
 7 | # transformGamPoi 0.1.x
 8 | 
 9 | * Add clipping functionality to `residual_transform()`
10 | * Add check against `residual_type` argument in `transformGamPoi()` 
11 | * Fix bug in `acosh_transform()` related to sparse input and `on_disk = FALSE`
12 | * Change default of `overdispersion_shrinkage` to `TRUE` if `overdispersion = TRUE` 
13 | for `acosh_transform()` and `shifted_log_transform()`
14 | 
15 | # transformGamPoi 0.1.0
16 | 
17 | * Initial release of transformGamPoi on GitHub https://github.com/const-ae/transformGamPoi
18 | 


--------------------------------------------------------------------------------
/R/RcppExports.R:
--------------------------------------------------------------------------------
1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand
2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
3 | 
4 | sparse_divide_out_size_factor_impl <- function(x, p, s) {
5 |     .Call(`_transformGamPoi_sparse_divide_out_size_factor_impl`, x, p, s)
6 | }
7 | 
8 | 


--------------------------------------------------------------------------------
/R/acoshp1.R:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | acoshp1 <- function(x){
 5 |   acosh(x + 1)
 6 | }
 7 | 
 8 | setGeneric("acoshp1", function(x){
 9 |   standardGeneric("acoshp1")
10 | })
11 | 
12 | setMethod("acoshp1", signature = "sparseMatrix", function(x){
13 |   acoshp1(as(x, "CsparseMatrix"))
14 | })
15 | 
16 | 
17 | setMethod("acoshp1", signature = "CsparseMatrix", function(x){
18 |   # acosh(0 + 1) == 0
19 |   x@x <- acosh(x@x + 1)
20 |   x
21 | })
22 | 


--------------------------------------------------------------------------------
/R/delta_method_transform.R:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | 
  4 | 
  5 | 
  6 | .acosh_trans_impl <- function(x, alpha){
  7 |   1/sqrt(alpha) * acoshp1(2 * alpha * x)
  8 | }
  9 | 
 10 | .sqrt_trans_impl <- function(x){
 11 |   2 * sqrt(x)
 12 | }
 13 | 
 14 | 
 15 | #' Delta method-based variance stabilizing transformation
 16 | #'
 17 | #'
 18 | #' @inherit transformGamPoi
 19 | #' @param pseudo_count instead of specifying the overdispersion, the
 20 | #'   `shifted_log_transform` is commonly parameterized with a pseudo-count
 21 | #'   (\eqn{pseudo-count = 1/(4 * overdispersion)}). If both the `pseudo-count`
 22 | #'   and `overdispersion` is specified, the `overdispersion` is ignored.
 23 | #'   Default: `1/(4 * overdispersion)`
 24 | #' @param minimum_overdispersion the `acosh_transform` converges against
 25 | #'   \eqn{2 * sqrt(x)} for `overdispersion == 0`. However, the `shifted_log_transform`
 26 | #'   would just become `0`, thus here we apply the `minimum_overdispersion` to avoid
 27 | #'   this behavior.
 28 | #' @param ... additional parameters for `glmGamPoi::glm_gp()` which is called in
 29 | #'   case `overdispersion = TRUE`.
 30 | #'
 31 | #' @describeIn  acosh_transform \eqn{1/sqrt(alpha)} acosh(2 * alpha * x + 1)
 32 | #'
 33 | #' @return a matrix (or a vector if the input is a vector) with the transformed values.
 34 | #'
 35 | #' @examples
 36 | #'   # Load a single cell dataset
 37 | #'   sce <- TENxPBMCData::TENxPBMCData("pbmc4k")
 38 | #'   # Reduce size for this example
 39 | #'   set.seed(1)
 40 | #'   sce_red <- sce[sample(which(rowSums2(counts(sce)) > 0), 1000),
 41 | #'                  sample(ncol(sce), 200)]
 42 | #'
 43 | #'   assay(sce_red, "acosh") <- acosh_transform(sce_red)
 44 | #'   assay(sce_red, "shifted_log") <- shifted_log_transform(sce_red)
 45 | #'   plot(rowMeans2(assay(sce_red, "acosh")), rowVars(assay(sce_red, "acosh")), log = "x")
 46 | #'   points(rowMeans2(assay(sce_red, "shifted_log")), rowVars(assay(sce_red, "shifted_log")),
 47 | #'          col = "red")
 48 | #'
 49 | #'   # Sqrt transformation
 50 | #'   sqrt_dat <- acosh_transform(sce_red, overdispersion = 0, size_factor = 1)
 51 | #'   plot(2 * sqrt(assay(sce_red))[,1], sqrt_dat[,1]); abline(0,1)
 52 | #'
 53 | #' @export
 54 | acosh_transform <- function(data, overdispersion = 0.05,
 55 |                             size_factors = TRUE,
 56 |                             ...,
 57 |                             on_disk = NULL,
 58 |                             verbose = FALSE){
 59 | 
 60 |   counts <- .handle_data_parameter(data, on_disk, allow_sparse = TRUE)
 61 | 
 62 |   if(inherits(data, "glmGamPoi")){
 63 |     size_factors <- data$size_factors
 64 |   }else{
 65 |     size_factors <- .handle_size_factors(size_factors, counts)
 66 |   }
 67 | 
 68 |   if(all(isTRUE(overdispersion)) || all(overdispersion == "global")){
 69 |     if(HDF5Array::is_sparse(counts)){
 70 |       counts <- .handle_data_parameter(data, on_disk, allow_sparse = FALSE)
 71 |     }
 72 |     dots <- list(...)
 73 |     overdispersion_shrinkage <- if("overdispersion_shrinkage" %in% names(dots)){
 74 |       dots[["overdispersion_shrinkage"]]
 75 |     }else{
 76 |       TRUE
 77 |     }
 78 |     fit <- glmGamPoi::glm_gp(counts, design = ~ 1, size_factors = size_factors,
 79 |                              overdispersion = overdispersion,
 80 |                              overdispersion_shrinkage = overdispersion_shrinkage,
 81 |                              verbose = verbose)
 82 |     if(overdispersion_shrinkage){
 83 |       # Use the dispersion trend when calculating the residuals
 84 |       fit$overdispersion_shrinkage_list$original_overdispersions <- fit$overdispersions
 85 |       fit$overdispersions <- fit$overdispersion_shrinkage_list$dispersion_trend
 86 |     }
 87 |     overdispersion <- fit$overdispersions
 88 |   }else{
 89 |     overdispersion <- .handle_overdispersion(overdispersion, counts)
 90 |   }
 91 | 
 92 |   if(is(counts, "dgCMatrix")){
 93 |     norm_counts <- sparse_divide_out_size_factor(counts, size_factors)
 94 |   }else{
 95 |     norm_counts <- DelayedArray::sweep(counts, 2, size_factors, FUN = "/")
 96 |   }
 97 | 
 98 |   overdispersion_near_zero <- .near(overdispersion, 0)
 99 | 
100 |   result <- if(! any(overdispersion_near_zero)){
101 |     # no overdispersion is zero
102 |     .acosh_trans_impl(norm_counts, overdispersion)
103 |   }else if(all(overdispersion_near_zero)){
104 |     # all overdispersion is zero
105 |     .sqrt_trans_impl(norm_counts)
106 |   }else{
107 |     # overdispersion is a mix of zeros and other values.
108 |     if(is.matrix(overdispersion)){
109 |       norm_counts[overdispersion_near_zero] <- .sqrt_trans_impl(norm_counts[overdispersion_near_zero])
110 |       norm_counts[!overdispersion_near_zero] <- .acosh_trans_impl(norm_counts[!overdispersion_near_zero],
111 |                                                                   overdispersion[! overdispersion_near_zero])
112 |       norm_counts
113 |     }else{
114 |       norm_counts[overdispersion_near_zero, ] <- .sqrt_trans_impl(norm_counts[overdispersion_near_zero, ])
115 |       norm_counts[!overdispersion_near_zero, ] <- .acosh_trans_impl(norm_counts[!overdispersion_near_zero, ],
116 |                                                                     overdispersion[! overdispersion_near_zero])
117 |       norm_counts
118 |     }
119 |   }
120 | 
121 |   .convert_to_output(result, data)
122 | }
123 | 
124 | 
125 | 
126 | .log_plus_alpha_impl <- function(x, alpha){
127 |   1/sqrt(alpha) * log1p(4 * alpha * x)
128 | }
129 | 
130 | 
131 | #' @describeIn acosh_transform \eqn{1/sqrt(alpha) log(4 * alpha * x + 1)}
132 | #' @export
133 | shifted_log_transform <- function(data,
134 |                                   overdispersion = 0.05,
135 |                                   pseudo_count = 1/(4 * overdispersion),
136 |                                   size_factors = TRUE,
137 |                                   minimum_overdispersion = 0.001,
138 |                                   ...,
139 |                                   on_disk = NULL,
140 |                                   verbose = FALSE){
141 | 
142 |   counts <- .handle_data_parameter(data, on_disk, allow_sparse = TRUE)
143 | 
144 |   if(inherits(data, "glmGamPoi")){
145 |     size_factors <- data$size_factors
146 |   }else{
147 |     size_factors <- .handle_size_factors(size_factors, counts)
148 |   }
149 | 
150 |   if(all(isTRUE(overdispersion)) || all(overdispersion == "global")){
151 |     if(HDF5Array::is_sparse(counts)){
152 |       counts <- .handle_data_parameter(data, on_disk, allow_sparse = FALSE)
153 |     }
154 |     dots <- list(...)
155 |     overdispersion_shrinkage <- if("overdispersion_shrinkage" %in% names(dots)){
156 |       dots[["overdispersion_shrinkage"]]
157 |     }else{
158 |       TRUE
159 |     }
160 |     fit <- glmGamPoi::glm_gp(counts, design = ~ 1, size_factors = size_factors,
161 |                              overdispersion = overdispersion,
162 |                              overdispersion_shrinkage = overdispersion_shrinkage,
163 |                              verbose = verbose)
164 |     if(overdispersion_shrinkage){
165 |       fit$overdispersion_shrinkage_list$original_overdispersions <- fit$overdispersions
166 |       fit$overdispersions <- fit$overdispersion_shrinkage_list$dispersion_trend
167 |     }
168 |     overdispersion <- fit$overdispersions
169 |   }else{
170 |     overdispersion <- 1/(4 * pseudo_count)
171 |     overdispersion <- .handle_overdispersion(overdispersion, counts)
172 |   }
173 | 
174 |   if(is(counts, "dgCMatrix")){
175 |     norm_counts <- sparse_divide_out_size_factor(counts, size_factors)
176 |   }else{
177 |     norm_counts <- DelayedArray::sweep(counts, 2, size_factors, FUN = "/")
178 |   }
179 | 
180 | 
181 |   overdispersion[overdispersion < minimum_overdispersion] <- minimum_overdispersion
182 | 
183 |   result <- .log_plus_alpha_impl(norm_counts, overdispersion)
184 |   .convert_to_output(result, data)
185 | }
186 | 
187 | 
188 | 
189 | 
190 | .convert_to_output <- function(result, data){
191 |   if(is.vector(data)){
192 |     as.vector(result)
193 |   }else{
194 |     result
195 |   }
196 | }
197 | 
198 | 
199 | .near <- function (x, y, tol = .Machine$double.eps^0.5){
200 |   abs(x - y) < tol
201 | }
202 | 


--------------------------------------------------------------------------------
/R/helpers.R:
--------------------------------------------------------------------------------
  1 | 
  2 | #' Take any kind of data and extract the matrix
  3 | #'
  4 | #'
  5 | #' Adapted from glmGamPoi:::handle_data_parameter
  6 | #'
  7 | #' @return A matrix.
  8 | #'
  9 | #' @keywords internal
 10 | .handle_data_parameter <- function(data, on_disk, allow_sparse = TRUE){
 11 |   if(is.vector(data)){
 12 |     data <- matrix(data, nrow = 1)
 13 |   }
 14 |   if(is.matrix(data)){
 15 |     if(! is.numeric(data)){
 16 |       stop("The data argument must consist of numeric values and not of ", mode(data), " values")
 17 |     }
 18 |     if(is.null(on_disk) || isFALSE(on_disk)){
 19 |       data_mat <- data
 20 |     }else if(isTRUE(on_disk)){
 21 |       data_mat <- HDF5Array::writeHDF5Array(data)
 22 |     }else{
 23 |       stop("Illegal argument type for on_disk. Can only handle 'NULL', 'TRUE', or 'FALSE'")
 24 |     }
 25 |   }else if(is(data, "DelayedArray")){
 26 |     if(is.null(on_disk) || isTRUE(on_disk)){
 27 |       data_mat <- data
 28 |     }else if(isFALSE(on_disk)){
 29 |       data_mat <- as.matrix(data)
 30 |     }else{
 31 |       stop("Illegal argument type for on_disk. Can only handle 'NULL', 'TRUE', or 'FALSE'")
 32 |     }
 33 |   }else if(is(data, "SummarizedExperiment")){
 34 |     data_mat <- .handle_data_parameter(SummarizedExperiment::assay(data, "counts"), on_disk, allow_sparse)
 35 |   }else if(canCoerce(data, "SummarizedExperiment")){
 36 |     se <- as(data, "SummarizedExperiment")
 37 |     data_mat <- .handle_data_parameter(SummarizedExperiment::assay(se, "counts"), on_disk, allow_sparse)
 38 |   }else if(is(data, "dgCMatrix") || is(data, "dgTMatrix")) {
 39 |     if(isTRUE(on_disk)){
 40 |       data_mat <- HDF5Array::writeHDF5Array(data)
 41 |     }else if(isFALSE(on_disk)){
 42 |       if(allow_sparse){
 43 |         data_mat <- data
 44 |       }else{
 45 |         data_mat <- as.matrix(data)
 46 |       }
 47 |     }else{
 48 |       if(allow_sparse){
 49 |         data_mat <- data
 50 |       }else{
 51 |         stop("transformGamPoi does not yet support sparse input data of type '", class(data),"'. ",
 52 |              "Please explicitly set the 'on_disk' parameter to force a conversion to a dense format either in-memory ('on_disk = FALSE') ",
 53 |              "or on-disk ('on_disk = TRUE')")
 54 |       }
 55 |     }
 56 |   }else if(inherits(data, "glmGamPoi")){
 57 |     data_mat <- .handle_data_parameter(data$data, on_disk, allow_sparse)
 58 |   }else{
 59 |     stop("Cannot handle data of class '", class(data), "'.",
 60 |          "It must be of a matrix object (i.e., a base matrix or DelayedArray),",
 61 |          " or a container for such a matrix (for example: SummarizedExperiment).")
 62 |   }
 63 |   data_mat
 64 | }
 65 | 
 66 | 
 67 | 
 68 | 
 69 | .handle_size_factors <- function(size_factors, Y, verbose = FALSE){
 70 |   n_samples <- ncol(Y)
 71 | 
 72 |   if(isTRUE(size_factors) || is.character(size_factors)){
 73 |     method <- if(isTRUE(size_factors)){
 74 |       "normed_sum"
 75 |     }else if(all(size_factors == c("normed_sum", "deconvolution", "poscounts"))){
 76 |       "normed_sum"
 77 |     }else if(length(size_factors) == 1 && size_factors %in% c("normed_sum", "deconvolution", "poscounts")){
 78 |       size_factors
 79 |     }else{
 80 |       stop("Cannot handle size factor ", paste0(size_factors, collapse = ", "))
 81 |     }
 82 |     if(verbose){ message("Calculate Size Factors (", method, ")") }
 83 |     sf <- estimate_size_factors(Y, method = method, verbose = verbose)
 84 |   }else if(isFALSE(size_factors)){
 85 |     sf <- rep(1, n_samples)
 86 |   }else{
 87 |     stopifnot(is.numeric(size_factors) && (length(size_factors) == 1 || length(size_factors) == n_samples))
 88 |     if(any(size_factors < 0)){
 89 |       stop("size factor 'size_factors' must be larger than 0")
 90 |     }
 91 |     if(length(size_factors) == 1){
 92 |       sf <- rep(size_factors, n_samples)
 93 |     }else{
 94 |       sf <- size_factors
 95 |     }
 96 |   }
 97 |   sf
 98 | }
 99 | 
100 | 
101 | 
102 | 
103 | 
104 | 
105 | 
106 | #' Estimate the Size Factors
107 | #'
108 | #' @param Y any matrix-like object (\code{base::matrix()}, \code{DelayedArray}, \code{HDF5Matrix},
109 | #'   \code{Matrix::Matrix()}, etc.)
110 | #' @param method one of \code{c("normed_sum", "deconvolution", "poscounts")}
111 | #'
112 | #'
113 | #' @return a vector with one size factor per column of `Y`
114 | #'
115 | #' @keywords internal
116 | estimate_size_factors <- function(Y, method, verbose = FALSE){
117 |   if(nrow(Y) <= 1){
118 |     if(verbose) {
119 |       message("Skipping size factor estimation, because there is only a single gene.",
120 |               call. = FALSE)
121 |     }
122 |     return(rep(1, ncol(Y)))
123 |   }
124 |   stopifnot(length(method) == 1 && is.character(method))
125 | 
126 |   if(method == "poscounts"){
127 |     # Accept any matrix-like object
128 |     log_geometric_means <- MatrixGenerics::rowMeans2(log(Y + 0.5))
129 |     Y2 <- Y
130 |     Y2[Y2 == 0] <- NA
131 |     sf <- exp(MatrixGenerics::colMedians(subtract_vector_from_each_column(log(Y2), log_geometric_means), na.rm = TRUE))
132 |   }else if(method == "deconvolution"){
133 |     if(requireNamespace("scran", quietly = TRUE)){
134 |       tryCatch({
135 |         sf <- scran::calculateSumFactors(Y)
136 |       }, error = function(err){
137 |         stop("Error in size factor estimation with 'size_factors = \"deconvolution\"'.\n",
138 |              "Alternative size factor estimation procedures are: \"normed_sum\" or \"poscounts\"\n",
139 |              "'scran::calculateSumFactors(Y)' threw the following error: \n\t",
140 |              err, call. = FALSE)
141 |       })
142 |     }else{
143 |       stop("To use the \"deconvolution\" method for size factor calculation, you need to install the ",
144 |            "'scran' package from Bioconductor. Alternatively, you can use \"normed_sum\" or \"poscounts\"",
145 |            "to calculate the size factors.", call. = FALSE)
146 |     }
147 |   }else if(method == "normed_sum"){
148 |     sf <- colSums2(Y)
149 |     # sf <- matrixStats::colSums2(as.matrix(Y))
150 |   }else{
151 |     stop("Unknown size factor estimation method: ", method)
152 |   }
153 | 
154 | 
155 | 
156 |   # stabilize size factors to have geometric mean of 1
157 |   all_zero_column <- is.nan(sf) | sf <= 0
158 |   sf[all_zero_column] <- NA
159 |   if(any(all_zero_column)){
160 |     sf <- sf/exp(mean(log(sf), na.rm=TRUE))
161 |     sf[all_zero_column] <- 0.001
162 |   }else{
163 |     sf <- sf/exp(mean(log(sf)))
164 |   }
165 |   sf
166 | }
167 | 
168 | 
169 | .handle_overdispersion <- function(overdispersion, counts, verbose = FALSE){
170 |   stopifnot(length(overdispersion) == 1 || length(overdispersion) == nrow(counts) || all(dim(overdispersion) == dim(counts)))
171 |   if(length(overdispersion) == 1){
172 |     rep(overdispersion, nrow(counts))
173 |   }else{
174 |     overdispersion
175 |   }
176 | 
177 |   if(verbose && any(overdispersion > 1)){
178 |     warning("The overdispersion at position", paste0(head(which(overdispersion > 1)), collapse = ", "), " seems unusually large.")
179 |   }
180 |   overdispersion
181 | }
182 | 
183 | 
184 | subtract_vector_from_each_column <- function(matrix, vector){
185 |   stopifnot(length(vector) == 1 || length(vector) == nrow(matrix))
186 |   matrix - vector
187 | }
188 | 
189 | 
190 | 
191 | delayed_matrix_multiply <- function(x, y){
192 |   res_sink <- HDF5Array::HDF5RealizationSink(c(nrow(x), ncol(y)))
193 |   on.exit({
194 |     DelayedArray::close(res_sink)
195 |   }, add = TRUE)
196 | 
197 |   res_grid <- DelayedArray::defaultAutoGrid(res_sink)
198 | 
199 |   row_ticks <- cumsum(vapply(seq_len(dim(res_grid)[1]), function(idx){
200 |     dim(res_grid[[idx, 1L]])[1]
201 |   }, FUN.VALUE = 0L))
202 |   col_ticks <- cumsum(vapply(seq_len(dim(res_grid)[2]), function(idx){
203 |     dim(res_grid[[1L, idx]])[2]
204 |   }, FUN.VALUE = 0L))
205 | 
206 | 
207 | 
208 |   x_grid <- DelayedArray::ArbitraryArrayGrid(tickmarks = list(row_ticks, ncol(x)))
209 |   y_grid <- DelayedArray::ArbitraryArrayGrid(tickmarks = list(nrow(y), col_ticks))
210 | 
211 | 
212 |   for (coord1 in seq_len(ncol(res_grid))) {
213 |     for(coord2 in seq_len(nrow(res_grid))){
214 |       x_block <- DelayedArray::read_block(x, x_grid[[coord2]])
215 |       y_block <- DelayedArray::read_block(y, y_grid[[coord1]])
216 |       res_block <- x_block %*% y_block
217 |       DelayedArray::write_block(res_sink, res_grid[[coord2, coord1]], res_block)
218 |     }
219 |   }
220 | 
221 |   as(res_sink, "DelayedArray")
222 | }
223 | 
224 | 


--------------------------------------------------------------------------------
/R/residual_transform.R:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | #'  Residual-based Variance Stabilizing Transformation
  4 | #'
  5 | #'  Fit an intercept Gamma-Poisson model that corrects for sequencing depth and return the residuals
  6 | #'  as variance stabilized results for further downstream application, for which no proper count-based
  7 | #'  method exist or is performant enough (e.g., clustering, dimensionality reduction).
  8 | #'
  9 | #' @param offset_model boolean to decide if \eqn{\beta_1} in \eqn{y = \beta_0 + \beta_1 log(sf)},
 10 | #'   is set to 1 (i.e., treating the log of the size factors as an offset ) or is estimated per gene.
 11 | #'   From a theoretical point, it should be fine to treat \eqn{\beta_1} as an offset, because a cell that is
 12 | #'   twice as big, should have twice as many counts per gene (without any gene-specific effects).
 13 | #'   However, `sctransform` suggested that it would be advantageous to nonetheless estimate
 14 | #'   \eqn{\beta_0} as it may counter data artifacts. On the other side, Lause et al. (2020)
 15 | #'   demonstrated that the estimating \eqn{\beta_0} and \eqn{\beta_1} together can be difficult. If
 16 | #'   you still want to fit `sctransform`'s model, you can set the `ridge_penalty` argument to a
 17 | #'   non-zero value, which shrinks \eqn{\beta_1} towards 1 and resolves the degeneracy. \cr
 18 | #'   Default: `TRUE`.
 19 | #' @param residual_type a string that specifies what kind of residual is returned as variance stabilized-value.
 20 | #'   \describe{
 21 | #'     \item{`"randomized_quantile"`}{The discrete nature of count distribution stops simple transformations from
 22 | #'     obtaining a truly standard normal residuals. The trick of of quantile randomized residuals is to match the
 23 | #'     cumulative density function of the Gamma-Poisson and the Normal distribution. Due to the discrete nature of
 24 | #'     Gamma-Poisson distribution, a count does not correspond to a single quantile of the Normal distribution, but
 25 | #'     to a range of possible value. This is resolved by randomly choosing one of the mapping values from the
 26 | #'     Normal distribution as the residual. This ensures perfectly normal distributed
 27 | #'     residuals, for the cost of introducing randomness. More details are available in the documentation
 28 | #'     of [`statmod::qresiduals()`] and the corresponding publication by Dunn and Smyth (1996).}
 29 | #'     \item{`"pearson"`}{The Pearson residuals are defined as \eqn{res = (y - m) / sqrt(m + m^2 * theta)}.}
 30 | #'     \item{`"analytic_pearson"`}{Similar to the method above, however, instead of estimating \eqn{m} using a
 31 | #'     GLM model fit, \eqn{m} is approximated by \eqn{m_ij = (\sum_j y_{ij}) (\sum_i y_{ij}) / (\sum_{i,j} y_{ij})}.
 32 | #'     For all details, see Lause et al. (2021).
 33 | #'     Note that `overdispersion_shrinkage` and `ridge_penalty` are ignored when fitting analytic Pearson residuals and
 34 | #'     `alpha=TRUE` is interpreted as `alpha=0.01`, unlike the other methods which estimate the overdispersion from the data.}
 35 | #'   }
 36 | #'   The two above options are the most common choices, however you can use any `residual_type` supported by
 37 | #'   [`glmGamPoi::residuals.glmGamPoi()`]. Default: `"randomized_quantile"`
 38 | #' @param clipping a single boolean or numeric value specifying that all residuals are in the range
 39 | #'   `[-clipping, +clipping]`. If `clipping = TRUE`, we use the default of `clipping = sqrt(ncol(data))`
 40 | #'   which is the default behavior for `sctransform`. Default: `FALSE`, which means no clipping is applied.
 41 | #' @param overdispersion_shrinkage,size_factors arguments that are passed to the underlying
 42 | #'   call to [`glmGamPoi::glm_gp()`]. Default for each: `TRUE`.
 43 | #' @param ridge_penalty another argument that is passed to [`glmGamPoi::glm_gp()`]. It is ignored if
 44 | #'   `offset_model = TRUE`. Default: `2`.
 45 | #' @param return_fit boolean to decide if the matrix of residuals is returned directly (`return_fit = FALSE`)
 46 | #'   or if in addition the `glmGamPoi`-fit is returned (`return_fit = TRUE`) . Default: `FALSE`.
 47 | #' @param ... additional parameters passed to [`glmGamPoi::glm_gp()`].
 48 | #' @inherit transformGamPoi
 49 | #'
 50 | #' @details
 51 | #'  Internally, this method uses the `glmGamPoi` package. The function goes through the following steps
 52 | #'  \enumerate{
 53 | #'    \item fit model using [`glmGamPoi::glm_gp()`]
 54 | #'    \item plug in the trended overdispersion estimates
 55 | #'    \item call [`glmGamPoi::residuals.glmGamPoi()`] to calculate the residuals.
 56 | #'  }
 57 | #'
 58 | #' @return a matrix (or a vector if the input is a vector) with the transformed values. If `return_fit = TRUE`,
 59 | #'   a list is returned with two elements: `fit` and `Residuals`.
 60 | #'
 61 | #' @seealso [`glmGamPoi::glm_gp()`], [`glmGamPoi::residuals.glmGamPoi()`], `sctransform::vst()`,
 62 | #'   `statmod::qresiduals()`
 63 | #'
 64 | #' @references
 65 | #'   Ahlmann-Eltze, Constantin, and Wolfgang Huber. "glmGamPoi: Fitting Gamma-Poisson Generalized Linear
 66 | #'   Models on Single Cell Count Data." Bioinformatics (2020)
 67 | #'
 68 | #'   Dunn, Peter K., and Gordon K. Smyth. "Randomized quantile residuals." Journal of Computational and
 69 | #'   Graphical Statistics 5.3 (1996): 236-244.
 70 | #'
 71 | #'   Hafemeister, Christoph, and Rahul Satija. "Normalization and variance stabilization of single-cell
 72 | #'   RNA-seq data using regularized negative binomial regression." Genome biology 20.1 (2019): 1-15.
 73 | #'
 74 | #'   Hafemeister, Christoph, and Rahul Satija. "Analyzing scRNA-seq data with the sctransform and offset
 75 | #'   models." (2020)
 76 | #'
 77 | #'   Lause, Jan, Philipp Berens, and Dmitry Kobak. "Analytic Pearson residuals for normalization of
 78 | #'   single-cell RNA-seq UMI data." Genome Biology (2021).
 79 | #'
 80 | #' @examples
 81 | #'   # Load a single cell dataset
 82 | #'   sce <- TENxPBMCData::TENxPBMCData("pbmc4k")
 83 | #'   # Reduce size for this example
 84 | #'   set.seed(1)
 85 | #'   sce_red <- sce[sample(which(rowSums2(counts(sce)) > 0), 1000),
 86 | #'                  sample(ncol(sce), 200)]
 87 | #'   counts(sce_red) <- as.matrix(counts(sce_red))
 88 | #'
 89 | #'   # Residual Based Variance Stabilizing Transformation
 90 | #'   rq <- residual_transform(sce_red, residual_type = "randomized_quantile",
 91 | #'                            verbose = TRUE)
 92 | #'   pearson <- residual_transform(sce_red, residual_type = "pearson", verbose = TRUE)
 93 | #'
 94 | #'   # Plot first two principal components
 95 | #'   pearson_pca <- prcomp(t(pearson), rank. = 2)
 96 | #'   rq_pca <- prcomp(t(rq), rank. = 2)
 97 | #'   plot(rq_pca$x, asp = 1)
 98 | #'   points(pearson_pca$x, col = "red")
 99 | #'
100 | #' @export
101 | residual_transform <- function(data,
102 |                                residual_type = c("randomized_quantile", "pearson", "analytic_pearson"),
103 |                                clipping = FALSE,
104 |                                overdispersion = 0.05,
105 |                                size_factors = TRUE,
106 |                                offset_model = TRUE,
107 |                                overdispersion_shrinkage = TRUE,
108 |                                ridge_penalty = 2,
109 |                                on_disk = NULL,
110 |                                return_fit = FALSE,
111 |                                verbose = FALSE, ...){
112 | 
113 |   # Allow any valid argument from glmGamPoi::residual.glmGamPoi()
114 |   residual_type <- match.arg(residual_type[1], c("deviance", "pearson", "randomized_quantile",
115 |                                                  "working", "response", "quantile", "analytic_pearson"))
116 | 
117 |   if(residual_type == "analytic_pearson"){
118 |     return(analytic_pearson_residual_transform(data = data, clipping = clipping, overdispersion = overdispersion, size_factors = size_factors, on_disk = on_disk, return_fit = return_fit, verbose = verbose))
119 |   }
120 | 
121 |   if(inherits(data, "glmGamPoi")){
122 |     fit <- data
123 |   }else if(offset_model){
124 |     counts <- .handle_data_parameter(data, on_disk, allow_sparse = FALSE  )
125 |     size_factors <- .handle_size_factors(size_factors, counts)
126 | 
127 |     fit <- glmGamPoi::glm_gp(counts, design = ~ 1, size_factors = size_factors,
128 |                              overdispersion = overdispersion,
129 |                              overdispersion_shrinkage = overdispersion_shrinkage,
130 |                              verbose = verbose, ...)
131 |   }else{
132 |     counts <- .handle_data_parameter(data, on_disk, allow_sparse = FALSE  )
133 |     size_factors <- .handle_size_factors(size_factors, counts)
134 | 
135 |     log_sf <- log(size_factors)
136 |     attr(ridge_penalty, "target") <- c(0, 1)
137 | 
138 |     fit <- glmGamPoi::glm_gp(counts, design = ~ log_sf + 1, size_factors = 1,
139 |                              overdispersion = overdispersion,
140 |                              overdispersion_shrinkage = overdispersion_shrinkage,
141 |                              ridge_penalty = ridge_penalty,
142 |                              verbose = verbose, ...)
143 |   }
144 | 
145 |   if(overdispersion_shrinkage){
146 |     # Use the dispersion trend when calculating the residuals
147 |     fit$overdispersion_shrinkage_list$original_overdispersions <- fit$overdispersions
148 |     fit$overdispersions <- fit$overdispersion_shrinkage_list$dispersion_trend
149 |   }
150 | 
151 | 
152 |   if(verbose){message("Calculate ", residual_type, " residuals")}
153 | 
154 |   resid <- stats::residuals(fit, type = residual_type)
155 |   resid <- clip_residuals(resid, clipping)
156 |   resid <- .convert_to_output(resid, data)
157 | 
158 |   if(! return_fit){
159 |     resid
160 |   }else{
161 |     list(Residuals = resid, fit = fit)
162 |   }
163 | }
164 | 
165 | # Original implementation in scanpy by Jan Lause
166 | # https://github.com/scverse/scanpy/blob/bd06cc3d1e0bd990f6994e54414512fa0b25fea0/scanpy/experimental/pp/_normalization.py
167 | # Translated to R by Constantin Ahlmann-Eltze
168 | analytic_pearson_residual_transform <- function(data,
169 |                                                 clipping = FALSE,
170 |                                                 overdispersion = 0.05,
171 |                                                 size_factors = TRUE,
172 |                                                 on_disk = NULL,
173 |                                                 return_fit = FALSE,
174 |                                                 verbose = FALSE){
175 |   if(isFALSE(overdispersion)){
176 |     overdispersion <- 0
177 |   }
178 |   if(isTRUE(overdispersion)){
179 |     overdispersion <- 0.01
180 |   }
181 | 
182 |   counts <- .handle_data_parameter(data, on_disk, allow_sparse = TRUE)
183 |   size_factors <- .handle_size_factors(size_factors, counts)
184 | 
185 |   make_offset_hdf5_mat <- is(counts, "DelayedMatrix") && is(DelayedArray::seed(counts), "HDF5ArraySeed")
186 | 
187 |   sum_genes <- MatrixGenerics::rowSums2(counts)
188 |   size_factors <- size_factors / sum(size_factors)
189 | 
190 |   Mu <- if(make_offset_hdf5_mat){
191 |     delayed_matrix_multiply(DelayedArray::DelayedArray(matrix(sum_genes, ncol = 1)), DelayedArray::DelayedArray(matrix(size_factors, nrow = 1)))
192 |   }else{
193 |     tcrossprod(sum_genes, size_factors)
194 |   }
195 |   resid <- (counts - Mu) / sqrt(Mu + Mu^2 * overdispersion)
196 |   resid <- clip_residuals(resid, clipping)
197 |   resid <- .convert_to_output(resid, data)
198 | 
199 |   if(! return_fit){
200 |     resid
201 |   }else{
202 |     list(Residuals = resid, fit = NULL)
203 |   }
204 | }
205 | 
206 | 
207 | 
208 | clip_residuals <- function(resid, clipping){
209 |   if(isFALSE(clipping)){
210 |     # Do nothing
211 |   }else{
212 |     if(isTRUE(clipping)){
213 |       clipping <- sqrt(ncol(resid))
214 |     }
215 |     if(! is.numeric(clipping) || length(clipping) != 1){
216 |       stop("Clipping has to be 'TRUE'/'FALSE' or a single numeric value.")
217 |     }
218 |     if(clipping < 0){
219 |       stop("'clipping = ", clipping, "' is negative. Only positive values are allowed.")
220 |     }
221 |     resid[resid > clipping] <- clipping
222 |     resid[resid < -clipping] <- -clipping
223 |   }
224 |   resid
225 | }
226 | 
227 | 
228 | 


--------------------------------------------------------------------------------
/R/sparse_sweep.R:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | sparse_divide_out_size_factor <- function(sp_mat, sf){
 4 |   if(length(sf) == 1){
 5 |     sp_mat@x <- sp_mat@x / sf
 6 |   }else if(length(sf) == ncol(sp_mat)){
 7 |     sp_mat@x <- sparse_divide_out_size_factor_impl(sp_mat@x, sp_mat@p, sf)
 8 |   }else{
 9 |     stop("Length of sf does not match the number of columns in sp_mat")
10 |   }
11 |   sp_mat
12 | }
13 | 
14 | 
15 | 


--------------------------------------------------------------------------------
/R/transformGamPoi-package.R:
--------------------------------------------------------------------------------
 1 | 
 2 | #' @importFrom methods is as canCoerce
 3 | #' @importFrom utils head
 4 | #' @importClassesFrom Matrix dgCMatrix
 5 | #' @importFrom MatrixGenerics colSums2
 6 | NULL
 7 | 
 8 | 
 9 | ## usethis namespace: start
10 | #' @importFrom Rcpp sourceCpp
11 | #' @useDynLib transformGamPoi, .registration = TRUE
12 | ## usethis namespace: end
13 | NULL
14 | 


--------------------------------------------------------------------------------
/R/transformGamPoi.R:
--------------------------------------------------------------------------------
  1 | 
  2 | #'  Variance Stabilizing Transformation for Gamma Poisson Data
  3 | #'
  4 | #' @param data any matrix-like object (e.g. matrix, dgCMatrix, DelayedArray, HDF5Matrix)
  5 | #'   with one column per sample and row per gene. It can also be an object of type `glmGamPoi`,
  6 | #'   in which case it is directly used to calculate the variance-stabilized values.
  7 | #' @param transformation one of `c("acosh", "shifted_log", "randomized_quantile_residuals", "pearson_residuals", "analytic_pearson_residuals")`.
  8 | #'   See [`acosh_transform`], [`shifted_log_transform`], or [`residual_transform`] for more information.
  9 | #' @param overdispersion the simplest count model is the Poisson model. However, the Poisson model
 10 | #'   assumes that \eqn{variance = mean}. For many applications this is too rigid and the Gamma-Poisson
 11 | #'   allows a more flexible mean-variance relation (\eqn{variance = mean + mean^2 * overdispersion}). \cr
 12 | #'   `overdispersion` can either be
 13 | #'   \itemize{
 14 | #'      \item a single boolean that indicates if an overdispersion is estimated for each gene.
 15 | #'      \item a numeric vector of length `nrow(data)` fixing the overdispersion to those values.
 16 | #'      \item the string `"global"` to indicate that one dispersion is fit across all genes.
 17 | #'   }
 18 | #'   Note that `overdispersion = 0` and `overdispersion = FALSE` are equivalent and both reduce
 19 | #'   the Gamma-Poisson to the classical Poisson model. Default: `0.05` which is roughly the
 20 | #'   overdispersion observed on ostensibly homogeneous cell lines.
 21 | #' @param verbose boolean that decides if information about the individual steps are printed.
 22 | #'   Default: `FALSE`
 23 | #' @param ... additional parameters passed to [`acosh_transform`], [`shifted_log_transform`], or [`residual_transform`]
 24 | #' @inheritParams glmGamPoi::glm_gp
 25 | #'
 26 | #' @return a matrix (or a vector if the input is a vector) with the transformed values.
 27 | #'
 28 | #' @seealso [`acosh_transform`], [`shifted_log_transform`], and [`residual_transform`]
 29 | #'
 30 | #' @references
 31 | #'   Ahlmann-Eltze, Constantin, and Wolfgang Huber. "Transformation and Preprocessing of Single-Cell
 32 | #'   RNA-Seq Data." bioRxiv (2021).
 33 | #'
 34 | #'   Ahlmann-Eltze, Constantin, and Wolfgang Huber. "glmGamPoi: Fitting Gamma-Poisson Generalized Linear
 35 | #'   Models on Single Cell Count Data." Bioinformatics (2020)
 36 | #'
 37 | #'   Dunn, Peter K., and Gordon K. Smyth. "Randomized quantile residuals." Journal of Computational and
 38 | #'   Graphical Statistics 5.3 (1996): 236-244.
 39 | #'
 40 | #'   Hafemeister, Christoph, and Rahul Satija. "Normalization and variance stabilization of single-cell
 41 | #'   RNA-seq data using regularized negative binomial regression." Genome biology 20.1 (2019): 1-15.
 42 | #'
 43 | #'   Hafemeister, Christoph, and Rahul Satija. "Analyzing scRNA-seq data with the sctransform and offset
 44 | #'   models." (2020)
 45 | #'
 46 | #'   Lause, Jan, Philipp Berens, and Dmitry Kobak. "Analytic Pearson residuals for normalization of
 47 | #'   single-cell RNA-seq UMI data." Genome Biology (2021).
 48 | #'
 49 | #' @examples
 50 | #'   # Load a single cell dataset
 51 | #'   sce <- TENxPBMCData::TENxPBMCData("pbmc4k")
 52 | #'   # Reduce size for this example
 53 | #'   set.seed(1)
 54 | #'   sce_red <- sce[sample(which(rowSums2(counts(sce)) > 0), 1000),
 55 | #'                  sample(ncol(sce), 200)]
 56 | #'
 57 | #'   assay(sce_red, "acosh") <- transformGamPoi(sce_red, "acosh")
 58 | #'   assay(sce_red, "shifted_log") <- transformGamPoi(sce_red, "shifted_log")
 59 | #'
 60 | #'   # Residual Based Variance Stabilizing Transformation
 61 | #'   rq <- transformGamPoi(sce_red, transformation = "randomized_quantile", on_disk = FALSE,
 62 | #'                         verbose = TRUE)
 63 | #'   pearson <- transformGamPoi(sce_red, transformation = "pearson", on_disk = FALSE, verbose = TRUE)
 64 | #'
 65 | #'   plot(rowMeans2(counts(sce_red)), rowVars(assay(sce_red, "acosh")), log = "x")
 66 | #'   points(rowMeans2(counts(sce_red)), rowVars(assay(sce_red, "shifted_log")), col = "red")
 67 | #'   points(rowMeans2(counts(sce_red)), rowVars(rq), col = "blue")
 68 | #'
 69 | #'
 70 | #'   # Plot first two principal components
 71 | #'   acosh_pca <- prcomp(t(assay(sce_red, "acosh")), rank. = 2)
 72 | #'   rq_pca <- prcomp(t(rq), rank. = 2)
 73 | #'   pearson_pca <- prcomp(t(pearson), rank. = 2)
 74 | #'
 75 | #'   plot(acosh_pca$x, asp = 1)
 76 | #'   points(rq_pca$x, col = "blue")
 77 | #'   points(pearson_pca$x, col = "green")
 78 | #'
 79 | #' @export
 80 | transformGamPoi <- function(data,
 81 |                             transformation = c("acosh", "shifted_log", "randomized_quantile_residuals", "pearson_residuals", "analytic_pearson_residuals"),
 82 |                             overdispersion = 0.05, size_factors = TRUE, ..., on_disk = NULL, verbose = FALSE){
 83 | 
 84 |   transformation <- match.arg(transformation)
 85 | 
 86 |   dots <- list(...)
 87 |   if(! is.null(dots$residual_type)){
 88 |     stop("You specified 'residual_type = \"", dots$residual_type, "\"'. However, ",
 89 |          "for the 'transformGamPoi()' function rather specify 'transformation = \"",
 90 |          "randomized_quantile_residuals\"' or 'transformation = \"pearson_residuals\"'.")
 91 |   }
 92 | 
 93 |   if(transformation == "acosh"){
 94 |     acosh_transform(data, overdispersion = overdispersion, size_factors = size_factors,
 95 |                     on_disk = on_disk, verbose = verbose)
 96 |   }else if(transformation == "shifted_log"){
 97 |     shifted_log_transform(data, overdispersion = overdispersion, size_factors = size_factors,
 98 |                     on_disk = on_disk, verbose = verbose, ...)
 99 |   }else if(transformation == "randomized_quantile_residuals"){
100 |     residual_transform(data, residual_type = "randomized_quantile",
101 |                        overdispersion = overdispersion, size_factors = size_factors,
102 |                        on_disk = on_disk, verbose = verbose, ...)
103 |   }else if(transformation == "pearson_residuals"){
104 |     residual_transform(data, residual_type = "pearson", overdispersion = overdispersion, size_factors = size_factors,
105 |                        on_disk = on_disk, verbose = verbose, ...)
106 |   }else if(transformation == "pearson_residuals"){
107 |     residual_transform(data, residual_type = "analytic_pearson", overdispersion = overdispersion, size_factors = size_factors,
108 |                        on_disk = on_disk, verbose = verbose, ...)
109 |   }else{
110 |     stop("Unsupported transformation of type: ", transformation, ". The available options are: \n",
111 |          paste0(c("acosh", "shifted_log", "randomized_quantile_residuals", "pearson_residuals", "analytic_pearson_residuals"), collapse = ", "))
112 |   }
113 | 
114 | }
115 | 
116 | 


--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | output: github_document
 3 | ---
 4 | 
 5 | ```{r, include = FALSE}
 6 | knitr::opts_chunk$set(
 7 |   collapse = TRUE,
 8 |   comment = "#>",
 9 |   fig.path = "man/figures/README-"
10 | )
11 | ```
12 | 
13 | # transformGamPoi
14 | 
15 | <!-- badges: start -->
16 | <!-- badges: end -->
17 | 
18 | R package that accompanies our paper 'Comparison of transformations for single-cell RNA-seq data
19 | ' (https://www.nature.com/articles/s41592-023-01814-1).
20 | 
21 | `transformGamPoi` provides methods to stabilize the variance of single cell count data:
22 | 
23 | * acosh transformation based on the delta method
24 | * shifted logarithm (log(x + c)) with a pseudo-count c, so that it approximates the acosh transformation
25 | * randomized quantile and Pearson residuals 
26 | 
27 | ## Installation
28 | 
29 | You can install the current development version of `transformGamPoi` by typing the following into the [_R_](https://cloud.r-project.org/) console:
30 | ``` r
31 | # install.packages("devtools")
32 | devtools::install_github("const-ae/transformGamPoi")
33 | ```
34 | The installation should only take a few seconds and work across all major operating systems (MacOS, Linux, Windows). 
35 | 
36 | ## Example
37 | 
38 | Let's compare the different variance-stabilizing transformations.
39 | 
40 | We start by loading the `transformGamPoi` package and setting a seed to make sure the results are reproducible.
41 | ```{r loadLibraries}
42 | library(transformGamPoi)
43 | set.seed(1)
44 | ```
45 | 
46 | We then load some example data, which we subset to 1000 genes and 500 cells
47 | ```{r loadData}
48 | sce <- TENxPBMCData::TENxPBMCData("pbmc4k")
49 | sce_red <- sce[sample(which(rowSums2(counts(sce)) > 0), 1000),
50 |                sample(ncol(sce), 500)]
51 | ```
52 | 
53 | We calculate the different variance-stabilizing transformations. We can either use the generic `transformGamPoi()` method and specify the `transformation`, or we use the low-level functions `acosh_transform()`, `shifted_log_transform()`, and `residual_transform()` which provide more settings. All functions return a matrix, which we can for example insert back into the `SingleCellExperiment` object:
54 | ```{r applyVSTs}
55 | assay(sce_red, "acosh") <- transformGamPoi(sce_red, transformation = "acosh")
56 | assay(sce_red, "shifted_log") <- shifted_log_transform(sce_red, overdispersion = 0.1)
57 | # For large datasets, we can also do the processing without 
58 | # loading the full dataset into memory (on_disk = TRUE)
59 | assay(sce_red, "rand_quant") <- residual_transform(sce_red, "randomized_quantile", on_disk = FALSE)
60 | assay(sce_red, "pearson") <- residual_transform(sce_red, "pearson", clipping = TRUE, on_disk = FALSE)
61 | ```
62 | 
63 | Finally, we compare the variance of the genes after transformation using a scatter plot
64 | ```{r plotMeanVar, warning=FALSE}
65 | par(pch = 20, cex = 1.15)
66 | mus <- rowMeans2(counts(sce_red))
67 | plot(mus, rowVars(assay(sce_red, "acosh")), log = "x", col = "#1b9e77aa", cex = 0.6,
68 |      xlab =  "Log Gene Means", ylab = "Variance after transformation")
69 | points(mus, rowVars(assay(sce_red, "shifted_log")), col = "#d95f02aa", cex = 0.6)
70 | points(mus, rowVars(assay(sce_red, "pearson")), col = "#7570b3aa", cex = 0.6)
71 | points(mus, rowVars(assay(sce_red, "rand_quant")), col = "#e7298aaa", cex = 0.6)
72 | legend("topleft", legend = c("acosh", "shifted log", "Pearson Resid.", "Rand. Quantile Resid."),
73 |        col = c("#1b9e77", "#d95f02", "#7570b3", "#e7298a"), pch = 16)
74 | ```
75 | 
76 | # See also
77 | 
78 | There are a number of preprocessing methods and packages out there. Of particular interests are
79 | 
80 | * [sctransform](https://github.com/ChristophH/sctransform) by Christoph Hafemeister and the [Satija lab](https://satijalab.org/). The original developers of the Pearson residual variance-stabilizing transformation approach for single cell data.
81 | * [scuttle::logNormCounts()](https://bioconductor.org/packages/release/bioc/html/scuttle.html) by Aaron Lun. This is an alternative to the `shifted_log_transform()` and plays nicely together with the Bioconductor universe. For more information, I highly recommend to take a look at the [normalization](https://bioconductor.org/books/release/OSCA/normalization.html) section of the [OSCA book](https://bioconductor.org/books/release/OSCA/).
82 | * [Sanity](https://github.com/jmbreda/Sanity) by Jérémie Breda _et al._. This method is not directly concerned with variance stabilization, but still provides an interesting approach for single cell data preprocessing.
83 | 
84 | 
85 | 
86 | # Session Info
87 | 
88 | ```{r}
89 | sessionInfo()
90 | ```
91 | 
92 | 
93 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | # transformGamPoi
  3 | 
  4 | <!-- badges: start -->
  5 | <!-- badges: end -->
  6 | 
  7 | R package that accompanies our paper ‘Comparison of transformations for
  8 | single-cell RNA-seq data’
  9 | (<https://www.nature.com/articles/s41592-023-01814-1>).
 10 | 
 11 | `transformGamPoi` provides methods to stabilize the variance of single
 12 | cell count data:
 13 | 
 14 | - acosh transformation based on the delta method
 15 | - shifted logarithm (log(x + c)) with a pseudo-count c, so that it
 16 |   approximates the acosh transformation
 17 | - randomized quantile and Pearson residuals
 18 | 
 19 | ## Installation
 20 | 
 21 | You can install the current development version of `transformGamPoi` by
 22 | typing the following into the [*R*](https://cloud.r-project.org/)
 23 | console:
 24 | 
 25 | ``` r
 26 | # install.packages("devtools")
 27 | devtools::install_github("const-ae/transformGamPoi")
 28 | ```
 29 | 
 30 | The installation should only take a few seconds and work across all
 31 | major operating systems (MacOS, Linux, Windows).
 32 | 
 33 | ## Example
 34 | 
 35 | Let’s compare the different variance-stabilizing transformations.
 36 | 
 37 | We start by loading the `transformGamPoi` package and setting a seed to
 38 | make sure the results are reproducible.
 39 | 
 40 | ``` r
 41 | library(transformGamPoi)
 42 | set.seed(1)
 43 | ```
 44 | 
 45 | We then load some example data, which we subset to 1000 genes and 500
 46 | cells
 47 | 
 48 | ``` r
 49 | sce <- TENxPBMCData::TENxPBMCData("pbmc4k")
 50 | #> snapshotDate(): 2022-10-31
 51 | #> Warning: package 'GenomicRanges' was built under R version 4.2.2
 52 | #> Warning: package 'S4Vectors' was built under R version 4.2.2
 53 | #> Warning: package 'GenomeInfoDb' was built under R version 4.2.2
 54 | #> see ?TENxPBMCData and browseVignettes('TENxPBMCData') for documentation
 55 | #> loading from cache
 56 | sce_red <- sce[sample(which(rowSums2(counts(sce)) > 0), 1000),
 57 |                sample(ncol(sce), 500)]
 58 | ```
 59 | 
 60 | We calculate the different variance-stabilizing transformations. We can
 61 | either use the generic `transformGamPoi()` method and specify the
 62 | `transformation`, or we use the low-level functions `acosh_transform()`,
 63 | `shifted_log_transform()`, and `residual_transform()` which provide more
 64 | settings. All functions return a matrix, which we can for example insert
 65 | back into the `SingleCellExperiment` object:
 66 | 
 67 | ``` r
 68 | assay(sce_red, "acosh") <- transformGamPoi(sce_red, transformation = "acosh")
 69 | assay(sce_red, "shifted_log") <- shifted_log_transform(sce_red, overdispersion = 0.1)
 70 | # For large datasets, we can also do the processing without 
 71 | # loading the full dataset into memory (on_disk = TRUE)
 72 | assay(sce_red, "rand_quant") <- residual_transform(sce_red, "randomized_quantile", on_disk = FALSE)
 73 | assay(sce_red, "pearson") <- residual_transform(sce_red, "pearson", clipping = TRUE, on_disk = FALSE)
 74 | ```
 75 | 
 76 | Finally, we compare the variance of the genes after transformation using
 77 | a scatter plot
 78 | 
 79 | ``` r
 80 | par(pch = 20, cex = 1.15)
 81 | mus <- rowMeans2(counts(sce_red))
 82 | plot(mus, rowVars(assay(sce_red, "acosh")), log = "x", col = "#1b9e77aa", cex = 0.6,
 83 |      xlab =  "Log Gene Means", ylab = "Variance after transformation")
 84 | points(mus, rowVars(assay(sce_red, "shifted_log")), col = "#d95f02aa", cex = 0.6)
 85 | points(mus, rowVars(assay(sce_red, "pearson")), col = "#7570b3aa", cex = 0.6)
 86 | points(mus, rowVars(assay(sce_red, "rand_quant")), col = "#e7298aaa", cex = 0.6)
 87 | legend("topleft", legend = c("acosh", "shifted log", "Pearson Resid.", "Rand. Quantile Resid."),
 88 |        col = c("#1b9e77", "#d95f02", "#7570b3", "#e7298a"), pch = 16)
 89 | ```
 90 | 
 91 | ![](man/figures/README-plotMeanVar-1.png)<!-- -->
 92 | 
 93 | # See also
 94 | 
 95 | There are a number of preprocessing methods and packages out there. Of
 96 | particular interests are
 97 | 
 98 | - [sctransform](https://github.com/ChristophH/sctransform) by Christoph
 99 |   Hafemeister and the [Satija lab](https://satijalab.org/). The original
100 |   developers of the Pearson residual variance-stabilizing transformation
101 |   approach for single cell data.
102 | - [scuttle::logNormCounts()](https://bioconductor.org/packages/release/bioc/html/scuttle.html)
103 |   by Aaron Lun. This is an alternative to the `shifted_log_transform()`
104 |   and plays nicely together with the Bioconductor universe. For more
105 |   information, I highly recommend to take a look at the
106 |   [normalization](https://bioconductor.org/books/release/OSCA/normalization.html)
107 |   section of the [OSCA
108 |   book](https://bioconductor.org/books/release/OSCA/).
109 | - [Sanity](https://github.com/jmbreda/Sanity) by Jérémie Breda *et al.*.
110 |   This method is not directly concerned with variance stabilization, but
111 |   still provides an interesting approach for single cell data
112 |   preprocessing.
113 | 
114 | # Session Info
115 | 
116 | ``` r
117 | sessionInfo()
118 | #> R version 4.2.1 RC (2022-06-17 r82503)
119 | #> Platform: x86_64-apple-darwin17.0 (64-bit)
120 | #> Running under: macOS Big Sur ... 10.16
121 | #> 
122 | #> Matrix products: default
123 | #> BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
124 | #> LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
125 | #> 
126 | #> locale:
127 | #> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
128 | #> 
129 | #> attached base packages:
130 | #> [1] stats4    stats     graphics  grDevices utils     datasets  methods  
131 | #> [8] base     
132 | #> 
133 | #> other attached packages:
134 | #>  [1] TENxPBMCData_1.16.0         HDF5Array_1.26.0           
135 | #>  [3] rhdf5_2.42.0                DelayedArray_0.24.0        
136 | #>  [5] Matrix_1.5-3                SingleCellExperiment_1.20.0
137 | #>  [7] SummarizedExperiment_1.28.0 Biobase_2.58.0             
138 | #>  [9] GenomicRanges_1.50.2        GenomeInfoDb_1.34.9        
139 | #> [11] IRanges_2.32.0              S4Vectors_0.36.2           
140 | #> [13] BiocGenerics_0.44.0         MatrixGenerics_1.10.0      
141 | #> [15] matrixStats_0.63.0          transformGamPoi_1.4.0      
142 | #> 
143 | #> loaded via a namespace (and not attached):
144 | #>  [1] httr_1.4.5                    bit64_4.0.5                  
145 | #>  [3] AnnotationHub_3.6.0           DelayedMatrixStats_1.20.0    
146 | #>  [5] shiny_1.7.4                   interactiveDisplayBase_1.36.0
147 | #>  [7] highr_0.10                    BiocManager_1.30.20          
148 | #>  [9] BiocFileCache_2.6.1           blob_1.2.3                   
149 | #> [11] GenomeInfoDbData_1.2.9        yaml_2.3.7                   
150 | #> [13] BiocVersion_3.16.0            pillar_1.8.1                 
151 | #> [15] RSQLite_2.3.0                 lattice_0.20-45              
152 | #> [17] glue_1.6.2                    digest_0.6.31                
153 | #> [19] promises_1.2.0.1              XVector_0.38.0               
154 | #> [21] htmltools_0.5.4               httpuv_1.6.9                 
155 | #> [23] pkgconfig_2.0.3               zlibbioc_1.44.0              
156 | #> [25] purrr_1.0.1                   xtable_1.8-4                 
157 | #> [27] later_1.3.0                   tibble_3.1.8                 
158 | #> [29] KEGGREST_1.38.0               generics_0.1.3               
159 | #> [31] ellipsis_0.3.2                withr_2.5.0                  
160 | #> [33] cachem_1.0.7                  cli_3.6.0                    
161 | #> [35] magrittr_2.0.3                crayon_1.5.2                 
162 | #> [37] mime_0.12                     memoise_2.0.1                
163 | #> [39] evaluate_0.20                 fansi_1.0.4                  
164 | #> [41] tools_4.2.1                   lifecycle_1.0.3              
165 | #> [43] Rhdf5lib_1.20.0               AnnotationDbi_1.60.0         
166 | #> [45] Biostrings_2.66.0             compiler_4.2.1               
167 | #> [47] rlang_1.0.6                   grid_4.2.1                   
168 | #> [49] RCurl_1.98-1.10               rhdf5filters_1.10.0          
169 | #> [51] rstudioapi_0.14               rappdirs_0.3.3               
170 | #> [53] glmGamPoi_1.11.7              bitops_1.0-7                 
171 | #> [55] rmarkdown_2.20                ExperimentHub_2.6.0          
172 | #> [57] DBI_1.1.3                     curl_5.0.0                   
173 | #> [59] R6_2.5.1                      knitr_1.42                   
174 | #> [61] dplyr_1.1.0                   fastmap_1.1.1                
175 | #> [63] bit_4.0.5                     utf8_1.2.3                   
176 | #> [65] filelock_1.0.2                Rcpp_1.0.10                  
177 | #> [67] vctrs_0.5.2                   png_0.1-8                    
178 | #> [69] sparseMatrixStats_1.10.0      dbplyr_2.3.1                 
179 | #> [71] tidyselect_1.2.0              xfun_0.37
180 | ```
181 | 


--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
 1 | citHeader("To cite transformGamPoi in publications use:")
 2 | 
 3 | citEntry(
 4 |   entry    = "Article",
 5 |   title    = "Comparison of transformations for single-cell {RNA-seq} data",
 6 |   author   = c(person("Constantin", "Ahlmann-Eltze", comment = c(ORCID = "0000-0002-3762-068X")),
 7 |                person("Wolfgang", "Huber", comment = c(ORCID = "0000-0002-0474-2218"))),
 8 |   journal  = "Nature Methods",
 9 |   year     = 2023,
10 |   publisher = "Springer Nature",
11 |   url      = "https://www.nature.com/articles/s41592-023-01814-1",
12 |   doi      = "10.1038/s41592-023-01814-1",
13 |   textVersion = paste(
14 |     "Constantin Ahlmann-Eltze and Wolfgang Huber.",
15 |     "\"Comparison of transformations for single-cell RNA-seq data.\" Nat Methods (2023)"
16 |   )
17 | )
18 | 


--------------------------------------------------------------------------------
/man/acosh_transform.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/delta_method_transform.R
  3 | \name{acosh_transform}
  4 | \alias{acosh_transform}
  5 | \alias{shifted_log_transform}
  6 | \title{Delta method-based variance stabilizing transformation}
  7 | \usage{
  8 | acosh_transform(
  9 |   data,
 10 |   overdispersion = 0.05,
 11 |   size_factors = TRUE,
 12 |   ...,
 13 |   on_disk = NULL,
 14 |   verbose = FALSE
 15 | )
 16 | 
 17 | shifted_log_transform(
 18 |   data,
 19 |   overdispersion = 0.05,
 20 |   pseudo_count = 1/(4 * overdispersion),
 21 |   size_factors = TRUE,
 22 |   minimum_overdispersion = 0.001,
 23 |   ...,
 24 |   on_disk = NULL,
 25 |   verbose = FALSE
 26 | )
 27 | }
 28 | \arguments{
 29 | \item{data}{any matrix-like object (e.g. matrix, dgCMatrix, DelayedArray, HDF5Matrix)
 30 | with one column per sample and row per gene. It can also be an object of type \code{glmGamPoi},
 31 | in which case it is directly used to calculate the variance-stabilized values.}
 32 | 
 33 | \item{overdispersion}{the simplest count model is the Poisson model. However, the Poisson model
 34 | assumes that \eqn{variance = mean}. For many applications this is too rigid and the Gamma-Poisson
 35 | allows a more flexible mean-variance relation (\eqn{variance = mean + mean^2 * overdispersion}). \cr
 36 | \code{overdispersion} can either be
 37 | \itemize{
 38 | \item a single boolean that indicates if an overdispersion is estimated for each gene.
 39 | \item a numeric vector of length \code{nrow(data)} fixing the overdispersion to those values.
 40 | \item the string \code{"global"} to indicate that one dispersion is fit across all genes.
 41 | }
 42 | Note that \code{overdispersion = 0} and \code{overdispersion = FALSE} are equivalent and both reduce
 43 | the Gamma-Poisson to the classical Poisson model. Default: \code{0.05} which is roughly the
 44 | overdispersion observed on ostensibly homogeneous cell lines.}
 45 | 
 46 | \item{size_factors}{in large scale experiments, each sample is typically of different size
 47 | (for example different sequencing depths). A size factor is an internal mechanism of GLMs to
 48 | correct for this effect.\cr
 49 | \code{size_factors} is either a numeric vector with positive entries that has the same lengths as columns in the data
 50 | that specifies the size factors that are used.
 51 | Or it can be a string that species the method that is used to estimate the size factors
 52 | (one of \code{c("normed_sum", "deconvolution", "poscounts")}).
 53 | Note that \code{"normed_sum"} and \code{"poscounts"} are fairly
 54 | simple methods and can lead to suboptimal results. For the best performance, I recommend to use
 55 | \code{size_factors = "deconvolution"} which calls \code{scran::calculateSumFactors()}. However, you need
 56 | to separately install the \code{scran} package from Bioconductor for this method to work.
 57 | Also note that \code{size_factors = 1} and \code{size_factors = FALSE} are equivalent. If only a single gene is given,
 58 | no size factor is estimated (ie. \code{size_factors = 1}). Default: \code{"normed_sum"}.}
 59 | 
 60 | \item{...}{additional parameters for \code{glmGamPoi::glm_gp()} which is called in
 61 | case \code{overdispersion = TRUE}.}
 62 | 
 63 | \item{on_disk}{a boolean that indicates if the dataset is loaded into memory or if it is kept on disk
 64 | to reduce the memory usage. Processing in memory can be significantly faster than on disk.
 65 | Default: \code{NULL} which means that the data is only processed in memory if \code{data} is an in-memory
 66 | data structure.}
 67 | 
 68 | \item{verbose}{boolean that decides if information about the individual steps are printed.
 69 | Default: \code{FALSE}}
 70 | 
 71 | \item{pseudo_count}{instead of specifying the overdispersion, the
 72 | \code{shifted_log_transform} is commonly parameterized with a pseudo-count
 73 | (\eqn{pseudo-count = 1/(4 * overdispersion)}). If both the \code{pseudo-count}
 74 | and \code{overdispersion} is specified, the \code{overdispersion} is ignored.
 75 | Default: \code{1/(4 * overdispersion)}}
 76 | 
 77 | \item{minimum_overdispersion}{the \code{acosh_transform} converges against
 78 | \eqn{2 * sqrt(x)} for \code{overdispersion == 0}. However, the \code{shifted_log_transform}
 79 | would just become \code{0}, thus here we apply the \code{minimum_overdispersion} to avoid
 80 | this behavior.}
 81 | }
 82 | \value{
 83 | a matrix (or a vector if the input is a vector) with the transformed values.
 84 | }
 85 | \description{
 86 | Delta method-based variance stabilizing transformation
 87 | }
 88 | \section{Functions}{
 89 | \itemize{
 90 | \item \code{acosh_transform}: \eqn{1/sqrt(alpha)} acosh(2 * alpha * x + 1)
 91 | 
 92 | \item \code{shifted_log_transform}: \eqn{1/sqrt(alpha) log(4 * alpha * x + 1)}
 93 | }}
 94 | 
 95 | \examples{
 96 |   # Load a single cell dataset
 97 |   sce <- TENxPBMCData::TENxPBMCData("pbmc4k")
 98 |   # Reduce size for this example
 99 |   set.seed(1)
100 |   sce_red <- sce[sample(which(rowSums2(counts(sce)) > 0), 1000),
101 |                  sample(ncol(sce), 200)]
102 | 
103 |   assay(sce_red, "acosh") <- acosh_transform(sce_red)
104 |   assay(sce_red, "shifted_log") <- shifted_log_transform(sce_red)
105 |   plot(rowMeans2(assay(sce_red, "acosh")), rowVars(assay(sce_red, "acosh")), log = "x")
106 |   points(rowMeans2(assay(sce_red, "shifted_log")), rowVars(assay(sce_red, "shifted_log")),
107 |          col = "red")
108 | 
109 |   # Sqrt transformation
110 |   sqrt_dat <- acosh_transform(sce_red, overdispersion = 0, size_factor = 1)
111 |   plot(2 * sqrt(assay(sce_red))[,1], sqrt_dat[,1]); abline(0,1)
112 | 
113 | }
114 | \references{
115 | Ahlmann-Eltze, Constantin, and Wolfgang Huber. "Transformation and Preprocessing of Single-Cell
116 | RNA-Seq Data." bioRxiv (2021).
117 | 
118 | Ahlmann-Eltze, Constantin, and Wolfgang Huber. "glmGamPoi: Fitting Gamma-Poisson Generalized Linear
119 | Models on Single Cell Count Data." Bioinformatics (2020)
120 | 
121 | Dunn, Peter K., and Gordon K. Smyth. "Randomized quantile residuals." Journal of Computational and
122 | Graphical Statistics 5.3 (1996): 236-244.
123 | 
124 | Hafemeister, Christoph, and Rahul Satija. "Normalization and variance stabilization of single-cell
125 | RNA-seq data using regularized negative binomial regression." Genome biology 20.1 (2019): 1-15.
126 | 
127 | Hafemeister, Christoph, and Rahul Satija. "Analyzing scRNA-seq data with the sctransform and offset
128 | models." (2020)
129 | 
130 | Lause, Jan, Philipp Berens, and Dmitry Kobak. "Analytic Pearson residuals for normalization of
131 | single-cell RNA-seq UMI data." Genome Biology (2021).
132 | }
133 | \seealso{
134 | \code{\link{acosh_transform}}, \code{\link{shifted_log_transform}}, and \code{\link{residual_transform}}
135 | }
136 | 


--------------------------------------------------------------------------------
/man/dot-handle_data_parameter.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/helpers.R
 3 | \name{.handle_data_parameter}
 4 | \alias{.handle_data_parameter}
 5 | \title{Take any kind of data and extract the matrix}
 6 | \usage{
 7 | .handle_data_parameter(data, on_disk, allow_sparse = TRUE)
 8 | }
 9 | \value{
10 | A matrix.
11 | }
12 | \description{
13 | Adapted from glmGamPoi:::handle_data_parameter
14 | }
15 | \keyword{internal}
16 | 


--------------------------------------------------------------------------------
/man/estimate_size_factors.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/helpers.R
 3 | \name{estimate_size_factors}
 4 | \alias{estimate_size_factors}
 5 | \title{Estimate the Size Factors}
 6 | \usage{
 7 | estimate_size_factors(Y, method, verbose = FALSE)
 8 | }
 9 | \arguments{
10 | \item{Y}{any matrix-like object (\code{base::matrix()}, \code{DelayedArray}, \code{HDF5Matrix},
11 | \code{Matrix::Matrix()}, etc.)}
12 | 
13 | \item{method}{one of \code{c("normed_sum", "deconvolution", "poscounts")}}
14 | }
15 | \value{
16 | a vector with one size factor per column of \code{Y}
17 | }
18 | \description{
19 | Estimate the Size Factors
20 | }
21 | \keyword{internal}
22 | 


--------------------------------------------------------------------------------
/man/figures/README-plotMeanVar-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/const-ae/transformGamPoi/10bcccd4fc02e2659c2803ec97ccfbc6215e9600/man/figures/README-plotMeanVar-1.png


--------------------------------------------------------------------------------
/man/residual_transform.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/residual_transform.R
  3 | \name{residual_transform}
  4 | \alias{residual_transform}
  5 | \title{Residual-based Variance Stabilizing Transformation}
  6 | \usage{
  7 | residual_transform(
  8 |   data,
  9 |   residual_type = c("randomized_quantile", "pearson", "analytic_pearson"),
 10 |   clipping = FALSE,
 11 |   overdispersion = 0.05,
 12 |   size_factors = TRUE,
 13 |   offset_model = TRUE,
 14 |   overdispersion_shrinkage = TRUE,
 15 |   ridge_penalty = 2,
 16 |   on_disk = NULL,
 17 |   return_fit = FALSE,
 18 |   verbose = FALSE,
 19 |   ...
 20 | )
 21 | }
 22 | \arguments{
 23 | \item{data}{any matrix-like object (e.g. matrix, dgCMatrix, DelayedArray, HDF5Matrix)
 24 | with one column per sample and row per gene. It can also be an object of type \code{glmGamPoi},
 25 | in which case it is directly used to calculate the variance-stabilized values.}
 26 | 
 27 | \item{residual_type}{a string that specifies what kind of residual is returned as variance stabilized-value.
 28 | \describe{
 29 | \item{\code{"randomized_quantile"}}{The discrete nature of count distribution stops simple transformations from
 30 | obtaining a truly standard normal residuals. The trick of of quantile randomized residuals is to match the
 31 | cumulative density function of the Gamma-Poisson and the Normal distribution. Due to the discrete nature of
 32 | Gamma-Poisson distribution, a count does not correspond to a single quantile of the Normal distribution, but
 33 | to a range of possible value. This is resolved by randomly choosing one of the mapping values from the
 34 | Normal distribution as the residual. This ensures perfectly normal distributed
 35 | residuals, for the cost of introducing randomness. More details are available in the documentation
 36 | of \code{\link[statmod:qresiduals]{statmod::qresiduals()}} and the corresponding publication by Dunn and Smyth (1996).}
 37 | \item{\code{"pearson"}}{The Pearson residuals are defined as \eqn{res = (y - m) / sqrt(m + m^2 * theta)}.}
 38 | \item{\code{"analytic_pearson"}}{Similar to the method above, however, instead of estimating \eqn{m} using a
 39 | GLM model fit, \eqn{m} is approximated by \eqn{m_ij = (\sum_j y_{ij}) (\sum_i y_{ij}) / (\sum_{i,j} y_{ij})}.
 40 | For all details, see Lause et al. (2021).
 41 | Note that \code{overdispersion_shrinkage} and \code{ridge_penalty} are ignored when fitting analytic Pearson residuals.}
 42 | }
 43 | The two above options are the most common choices, however you can use any \code{residual_type} supported by
 44 | \code{\link[glmGamPoi:residuals.glmGamPoi]{glmGamPoi::residuals.glmGamPoi()}}. Default: \code{"randomized_quantile"}}
 45 | 
 46 | \item{clipping}{a single boolean or numeric value specifying that all residuals are in the range
 47 | \verb{[-clipping, +clipping]}. If \code{clipping = TRUE}, we use the default of \code{clipping = sqrt(ncol(data))}
 48 | which is the default behavior for \code{sctransform}. Default: \code{FALSE}, which means no clipping is applied.}
 49 | 
 50 | \item{overdispersion}{the simplest count model is the Poisson model. However, the Poisson model
 51 | assumes that \eqn{variance = mean}. For many applications this is too rigid and the Gamma-Poisson
 52 | allows a more flexible mean-variance relation (\eqn{variance = mean + mean^2 * overdispersion}). \cr
 53 | \code{overdispersion} can either be
 54 | \itemize{
 55 | \item a single boolean that indicates if an overdispersion is estimated for each gene.
 56 | \item a numeric vector of length \code{nrow(data)} fixing the overdispersion to those values.
 57 | \item the string \code{"global"} to indicate that one dispersion is fit across all genes.
 58 | }
 59 | Note that \code{overdispersion = 0} and \code{overdispersion = FALSE} are equivalent and both reduce
 60 | the Gamma-Poisson to the classical Poisson model. Default: \code{0.05} which is roughly the
 61 | overdispersion observed on ostensibly homogeneous cell lines.}
 62 | 
 63 | \item{offset_model}{boolean to decide if \eqn{\beta_1} in \eqn{y = \beta_0 + \beta_1 log(sf)},
 64 | is set to 1 (i.e., treating the log of the size factors as an offset ) or is estimated per gene.
 65 | From a theoretical point, it should be fine to treat \eqn{\beta_1} as an offset, because a cell that is
 66 | twice as big, should have twice as many counts per gene (without any gene-specific effects).
 67 | However, \code{sctransform} suggested that it would be advantageous to nonetheless estimate
 68 | \eqn{\beta_0} as it may counter data artifacts. On the other side, Lause et al. (2020)
 69 | demonstrated that the estimating \eqn{\beta_0} and \eqn{\beta_1} together can be difficult. If
 70 | you still want to fit \code{sctransform}'s model, you can set the \code{ridge_penalty} argument to a
 71 | non-zero value, which shrinks \eqn{\beta_1} towards 1 and resolves the degeneracy. \cr
 72 | Default: \code{TRUE}.}
 73 | 
 74 | \item{overdispersion_shrinkage, size_factors}{arguments that are passed to the underlying
 75 | call to \code{\link[glmGamPoi:glm_gp]{glmGamPoi::glm_gp()}}. Default for each: \code{TRUE}.}
 76 | 
 77 | \item{ridge_penalty}{another argument that is passed to \code{\link[glmGamPoi:glm_gp]{glmGamPoi::glm_gp()}}. It is ignored if
 78 | \code{offset_model = TRUE}. Default: \code{2}.}
 79 | 
 80 | \item{on_disk}{a boolean that indicates if the dataset is loaded into memory or if it is kept on disk
 81 | to reduce the memory usage. Processing in memory can be significantly faster than on disk.
 82 | Default: \code{NULL} which means that the data is only processed in memory if \code{data} is an in-memory
 83 | data structure.}
 84 | 
 85 | \item{return_fit}{boolean to decide if the matrix of residuals is returned directly (\code{return_fit = FALSE})
 86 | or if in addition the \code{glmGamPoi}-fit is returned (\code{return_fit = TRUE}) . Default: \code{FALSE}.}
 87 | 
 88 | \item{verbose}{boolean that decides if information about the individual steps are printed.
 89 | Default: \code{FALSE}}
 90 | 
 91 | \item{...}{additional parameters passed to \code{\link[glmGamPoi:glm_gp]{glmGamPoi::glm_gp()}}.}
 92 | }
 93 | \value{
 94 | a matrix (or a vector if the input is a vector) with the transformed values. If \code{return_fit = TRUE},
 95 | a list is returned with two elements: \code{fit} and \code{Residuals}.
 96 | }
 97 | \description{
 98 | Fit an intercept Gamma-Poisson model that corrects for sequencing depth and return the residuals
 99 | as variance stabilized results for further downstream application, for which no proper count-based
100 | method exist or is performant enough (e.g., clustering, dimensionality reduction).
101 | }
102 | \details{
103 | Internally, this method uses the \code{glmGamPoi} package. The function goes through the following steps
104 | \enumerate{
105 | \item fit model using \code{\link[glmGamPoi:glm_gp]{glmGamPoi::glm_gp()}}
106 | \item plug in the trended overdispersion estimates
107 | \item call \code{\link[glmGamPoi:residuals.glmGamPoi]{glmGamPoi::residuals.glmGamPoi()}} to calculate the residuals.
108 | }
109 | }
110 | \examples{
111 |   # Load a single cell dataset
112 |   sce <- TENxPBMCData::TENxPBMCData("pbmc4k")
113 |   # Reduce size for this example
114 |   set.seed(1)
115 |   sce_red <- sce[sample(which(rowSums2(counts(sce)) > 0), 1000),
116 |                  sample(ncol(sce), 200)]
117 |   counts(sce_red) <- as.matrix(counts(sce_red))
118 | 
119 |   # Residual Based Variance Stabilizing Transformation
120 |   rq <- residual_transform(sce_red, residual_type = "randomized_quantile",
121 |                            verbose = TRUE)
122 |   pearson <- residual_transform(sce_red, residual_type = "pearson", verbose = TRUE)
123 | 
124 |   # Plot first two principal components
125 |   pearson_pca <- prcomp(t(pearson), rank. = 2)
126 |   rq_pca <- prcomp(t(rq), rank. = 2)
127 |   plot(rq_pca$x, asp = 1)
128 |   points(pearson_pca$x, col = "red")
129 | 
130 | }
131 | \references{
132 | Ahlmann-Eltze, Constantin, and Wolfgang Huber. "glmGamPoi: Fitting Gamma-Poisson Generalized Linear
133 | Models on Single Cell Count Data." Bioinformatics (2020)
134 | 
135 | Dunn, Peter K., and Gordon K. Smyth. "Randomized quantile residuals." Journal of Computational and
136 | Graphical Statistics 5.3 (1996): 236-244.
137 | 
138 | Hafemeister, Christoph, and Rahul Satija. "Normalization and variance stabilization of single-cell
139 | RNA-seq data using regularized negative binomial regression." Genome biology 20.1 (2019): 1-15.
140 | 
141 | Hafemeister, Christoph, and Rahul Satija. "Analyzing scRNA-seq data with the sctransform and offset
142 | models." (2020)
143 | 
144 | Lause, Jan, Philipp Berens, and Dmitry Kobak. "Analytic Pearson residuals for normalization of
145 | single-cell RNA-seq UMI data." Genome Biology (2021).
146 | }
147 | \seealso{
148 | \code{\link[glmGamPoi:glm_gp]{glmGamPoi::glm_gp()}}, \code{\link[glmGamPoi:residuals.glmGamPoi]{glmGamPoi::residuals.glmGamPoi()}}, \code{sctransform::vst()},
149 | \code{statmod::qresiduals()}
150 | }
151 | 


--------------------------------------------------------------------------------
/man/transformGamPoi.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/transformGamPoi.R
  3 | \name{transformGamPoi}
  4 | \alias{transformGamPoi}
  5 | \title{Variance Stabilizing Transformation for Gamma Poisson Data}
  6 | \usage{
  7 | transformGamPoi(
  8 |   data,
  9 |   transformation = c("acosh", "shifted_log", "randomized_quantile_residuals",
 10 |     "pearson_residuals", "analytic_pearson_residuals"),
 11 |   overdispersion = 0.05,
 12 |   size_factors = TRUE,
 13 |   ...,
 14 |   on_disk = NULL,
 15 |   verbose = FALSE
 16 | )
 17 | }
 18 | \arguments{
 19 | \item{data}{any matrix-like object (e.g. matrix, dgCMatrix, DelayedArray, HDF5Matrix)
 20 | with one column per sample and row per gene. It can also be an object of type \code{glmGamPoi},
 21 | in which case it is directly used to calculate the variance-stabilized values.}
 22 | 
 23 | \item{transformation}{one of \code{c("acosh", "shifted_log", "randomized_quantile_residuals", "pearson_residuals", "analytic_pearson_residuals")}.
 24 | See \code{\link{acosh_transform}}, \code{\link{shifted_log_transform}}, or \code{\link{residual_transform}} for more information.}
 25 | 
 26 | \item{overdispersion}{the simplest count model is the Poisson model. However, the Poisson model
 27 | assumes that \eqn{variance = mean}. For many applications this is too rigid and the Gamma-Poisson
 28 | allows a more flexible mean-variance relation (\eqn{variance = mean + mean^2 * overdispersion}). \cr
 29 | \code{overdispersion} can either be
 30 | \itemize{
 31 | \item a single boolean that indicates if an overdispersion is estimated for each gene.
 32 | \item a numeric vector of length \code{nrow(data)} fixing the overdispersion to those values.
 33 | \item the string \code{"global"} to indicate that one dispersion is fit across all genes.
 34 | }
 35 | Note that \code{overdispersion = 0} and \code{overdispersion = FALSE} are equivalent and both reduce
 36 | the Gamma-Poisson to the classical Poisson model. Default: \code{0.05} which is roughly the
 37 | overdispersion observed on ostensibly homogeneous cell lines.}
 38 | 
 39 | \item{size_factors}{in large scale experiments, each sample is typically of different size
 40 | (for example different sequencing depths). A size factor is an internal mechanism of GLMs to
 41 | correct for this effect.\cr
 42 | \code{size_factors} is either a numeric vector with positive entries that has the same lengths as columns in the data
 43 | that specifies the size factors that are used.
 44 | Or it can be a string that species the method that is used to estimate the size factors
 45 | (one of \code{c("normed_sum", "deconvolution", "poscounts")}).
 46 | Note that \code{"normed_sum"} and \code{"poscounts"} are fairly
 47 | simple methods and can lead to suboptimal results. For the best performance, I recommend to use
 48 | \code{size_factors = "deconvolution"} which calls \code{scran::calculateSumFactors()}. However, you need
 49 | to separately install the \code{scran} package from Bioconductor for this method to work.
 50 | Also note that \code{size_factors = 1} and \code{size_factors = FALSE} are equivalent. If only a single gene is given,
 51 | no size factor is estimated (ie. \code{size_factors = 1}). Default: \code{"normed_sum"}.}
 52 | 
 53 | \item{...}{additional parameters passed to \code{\link{acosh_transform}}, \code{\link{shifted_log_transform}}, or \code{\link{residual_transform}}}
 54 | 
 55 | \item{on_disk}{a boolean that indicates if the dataset is loaded into memory or if it is kept on disk
 56 | to reduce the memory usage. Processing in memory can be significantly faster than on disk.
 57 | Default: \code{NULL} which means that the data is only processed in memory if \code{data} is an in-memory
 58 | data structure.}
 59 | 
 60 | \item{verbose}{boolean that decides if information about the individual steps are printed.
 61 | Default: \code{FALSE}}
 62 | }
 63 | \value{
 64 | a matrix (or a vector if the input is a vector) with the transformed values.
 65 | }
 66 | \description{
 67 | Variance Stabilizing Transformation for Gamma Poisson Data
 68 | }
 69 | \examples{
 70 |   # Load a single cell dataset
 71 |   sce <- TENxPBMCData::TENxPBMCData("pbmc4k")
 72 |   # Reduce size for this example
 73 |   set.seed(1)
 74 |   sce_red <- sce[sample(which(rowSums2(counts(sce)) > 0), 1000),
 75 |                  sample(ncol(sce), 200)]
 76 | 
 77 |   assay(sce_red, "acosh") <- transformGamPoi(sce_red, "acosh")
 78 |   assay(sce_red, "shifted_log") <- transformGamPoi(sce_red, "shifted_log")
 79 | 
 80 |   # Residual Based Variance Stabilizing Transformation
 81 |   rq <- transformGamPoi(sce_red, transformation = "randomized_quantile", on_disk = FALSE,
 82 |                         verbose = TRUE)
 83 |   pearson <- transformGamPoi(sce_red, transformation = "pearson", on_disk = FALSE, verbose = TRUE)
 84 | 
 85 |   plot(rowMeans2(counts(sce_red)), rowVars(assay(sce_red, "acosh")), log = "x")
 86 |   points(rowMeans2(counts(sce_red)), rowVars(assay(sce_red, "shifted_log")), col = "red")
 87 |   points(rowMeans2(counts(sce_red)), rowVars(rq), col = "blue")
 88 | 
 89 | 
 90 |   # Plot first two principal components
 91 |   acosh_pca <- prcomp(t(assay(sce_red, "acosh")), rank. = 2)
 92 |   rq_pca <- prcomp(t(rq), rank. = 2)
 93 |   pearson_pca <- prcomp(t(pearson), rank. = 2)
 94 | 
 95 |   plot(acosh_pca$x, asp = 1)
 96 |   points(rq_pca$x, col = "blue")
 97 |   points(pearson_pca$x, col = "green")
 98 | 
 99 | }
100 | \references{
101 | Ahlmann-Eltze, Constantin, and Wolfgang Huber. "Transformation and Preprocessing of Single-Cell
102 | RNA-Seq Data." bioRxiv (2021).
103 | 
104 | Ahlmann-Eltze, Constantin, and Wolfgang Huber. "glmGamPoi: Fitting Gamma-Poisson Generalized Linear
105 | Models on Single Cell Count Data." Bioinformatics (2020)
106 | 
107 | Dunn, Peter K., and Gordon K. Smyth. "Randomized quantile residuals." Journal of Computational and
108 | Graphical Statistics 5.3 (1996): 236-244.
109 | 
110 | Hafemeister, Christoph, and Rahul Satija. "Normalization and variance stabilization of single-cell
111 | RNA-seq data using regularized negative binomial regression." Genome biology 20.1 (2019): 1-15.
112 | 
113 | Hafemeister, Christoph, and Rahul Satija. "Analyzing scRNA-seq data with the sctransform and offset
114 | models." (2020)
115 | 
116 | Lause, Jan, Philipp Berens, and Dmitry Kobak. "Analytic Pearson residuals for normalization of
117 | single-cell RNA-seq UMI data." Genome Biology (2021).
118 | }
119 | \seealso{
120 | \code{\link{acosh_transform}}, \code{\link{shifted_log_transform}}, and \code{\link{residual_transform}}
121 | }
122 | 


--------------------------------------------------------------------------------
/src/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.so
3 | *.dll
4 | 


--------------------------------------------------------------------------------
/src/RcppExports.cpp:
--------------------------------------------------------------------------------
 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand
 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 3 | 
 4 | #include <Rcpp.h>
 5 | 
 6 | using namespace Rcpp;
 7 | 
 8 | #ifdef RCPP_USE_GLOBAL_ROSTREAM
 9 | Rcpp::Rostream<true>&  Rcpp::Rcout = Rcpp::Rcpp_cout_get();
10 | Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
11 | #endif
12 | 
13 | // sparse_divide_out_size_factor_impl
14 | NumericVector sparse_divide_out_size_factor_impl(const NumericVector& x, const IntegerVector& p, const NumericVector& s);
15 | RcppExport SEXP _transformGamPoi_sparse_divide_out_size_factor_impl(SEXP xSEXP, SEXP pSEXP, SEXP sSEXP) {
16 | BEGIN_RCPP
17 |     Rcpp::RObject rcpp_result_gen;
18 |     Rcpp::RNGScope rcpp_rngScope_gen;
19 |     Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP);
20 |     Rcpp::traits::input_parameter< const IntegerVector& >::type p(pSEXP);
21 |     Rcpp::traits::input_parameter< const NumericVector& >::type s(sSEXP);
22 |     rcpp_result_gen = Rcpp::wrap(sparse_divide_out_size_factor_impl(x, p, s));
23 |     return rcpp_result_gen;
24 | END_RCPP
25 | }
26 | 
27 | static const R_CallMethodDef CallEntries[] = {
28 |     {"_transformGamPoi_sparse_divide_out_size_factor_impl", (DL_FUNC) &_transformGamPoi_sparse_divide_out_size_factor_impl, 3},
29 |     {NULL, NULL, 0}
30 | };
31 | 
32 | RcppExport void R_init_transformGamPoi(DllInfo *dll) {
33 |     R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
34 |     R_useDynamicSymbols(dll, FALSE);
35 | }
36 | 


--------------------------------------------------------------------------------
/src/sparse_divide_out_size_factor.cpp:
--------------------------------------------------------------------------------
 1 | #include <Rcpp.h>
 2 | using namespace Rcpp;
 3 | 
 4 | 
 5 | // [[Rcpp::export]]
 6 | NumericVector sparse_divide_out_size_factor_impl(const NumericVector& x, const IntegerVector& p, const NumericVector& s){
 7 |   const int n_elem = x.size();
 8 |   NumericVector res(n_elem);
 9 |   int col_idx = 0;
10 |   int pointer_loc = p[1];
11 |   for(int idx = 0; idx < n_elem; ++idx){
12 |     while(idx >= pointer_loc){
13 |       pointer_loc = p[++col_idx + 1];
14 |     }
15 |     res[idx] = x[idx] / s[col_idx];
16 |   }
17 |   return res;
18 | }
19 | 
20 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(transformGamPoi)
3 | 
4 | test_check("transformGamPoi")
5 | 


--------------------------------------------------------------------------------
/tests/testthat/test-acoshp1.R:
--------------------------------------------------------------------------------
 1 | 
 2 | test_that("acoshp1 works", {
 3 | 
 4 |   expect_equal(acoshp1(17), acosh(17 + 1))
 5 | 
 6 | })
 7 | 
 8 | test_that("sparse acosh plus 1 works", {
 9 | 
10 |   mat <- matrix(rpois(n = 10 * 4, lambda = 0.3), nrow = 10, ncol = 4)
11 |   expect_equal(acoshp1(mat), acosh(mat + 1))
12 | 
13 |   sp_mat <- as(mat, "dgCMatrix")
14 |   expect_s4_class(acoshp1(sp_mat), "CsparseMatrix")
15 |   expect_equal(acoshp1(sp_mat), as(acosh(mat + 1), "dgCMatrix"))
16 | })
17 | 


--------------------------------------------------------------------------------
/tests/testthat/test-sparse_sweep.R:
--------------------------------------------------------------------------------
 1 | test_that("sweep for dgCMatrices works", {
 2 |   mat <- matrix(0, nrow = 10, ncol = 17)
 3 |   mat[sample.int(length(mat), 20)] <- rnorm(n = 20)
 4 |   sp_mat <- as(mat, "dgCMatrix")
 5 | 
 6 |   expect_equal(sparse_divide_out_size_factor(sp_mat, 8), DelayedArray::sweep(sp_mat, 2, 8, FUN = "/"))
 7 |   expect_equal(sparse_divide_out_size_factor(sp_mat, 1:17), DelayedArray::sweep(sp_mat, 2, 1:17, FUN = "/"))
 8 | })
 9 | 
10 | 
11 | 
12 | 
13 | 
14 | 


--------------------------------------------------------------------------------
/tests/testthat/test-transform.R:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | test_that("acosh_transformation works", {
  4 | 
  5 | 
  6 |   mat <- matrix(rpois(n = 10 * 4, lambda = 0.3), nrow = 10, ncol = 4)
  7 |   expect_equal(acosh_transform(mat, size_factors = 1), .acosh_trans_impl(mat, 0.05))
  8 | 
  9 |   # Setting the overdispersion works
 10 |   alpha <- rnorm(10)^2
 11 |   expect_equal(acosh_transform(mat, overdispersion = alpha, size_factors = 1), .acosh_trans_impl(mat, alpha))
 12 | 
 13 |   # Exact zeros in overdispersion work
 14 |   sel <- which.max(rowSums(mat))
 15 |   alpha[sel] <- 0
 16 |   res <- acosh_transform(mat, overdispersion = alpha, size_factors = 1)
 17 |   expect_equal(res[-sel, ], .acosh_trans_impl(mat[-sel, ], alpha = alpha[-sel]))
 18 |   expect_equal(res[sel, ], .sqrt_trans_impl(mat[sel, ]))
 19 | 
 20 | 
 21 |   alpha <- matrix(rnorm(10 * 4)^2, nrow = 10, ncol = 4)
 22 |   expect_equal(acosh_transform(mat, overdispersion = alpha, size_factors = 1), .acosh_trans_impl(mat, alpha))
 23 | 
 24 |   # Check vector
 25 |   expect_equal(acosh_transform(1:3), .acosh_trans_impl(1:3, alpha = 0.05))
 26 | })
 27 | 
 28 | test_that("acosh_transformation works for sparse input", {
 29 |   mat <- matrix(rpois(n = 10 * 4, lambda = 0.3), nrow = 10, ncol = 4)
 30 |   sp_mat <- as(mat, "dgCMatrix")
 31 |   expect_equal(acosh_transform(sp_mat), as(acosh_transform(mat), "dgCMatrix"))
 32 |   expect_s4_class(acosh_transform(sp_mat), "CsparseMatrix")
 33 | 
 34 |   # Setting the overdispersion works
 35 |   alpha <- rnorm(10)^2
 36 |   expect_equal(acosh_transform(sp_mat, overdispersion = alpha),
 37 |                as(acosh_transform(mat, overdispersion = alpha), "dgCMatrix"))
 38 | 
 39 |   # Exact zeros in overdispersion work
 40 |   sel <- which.max(rowSums(mat))
 41 |   alpha[sel] <- 0
 42 |   expect_equal(acosh_transform(sp_mat, overdispersion = alpha),
 43 |                as(acosh_transform(mat, overdispersion = alpha), "dgCMatrix"))
 44 | 
 45 | 
 46 | })
 47 | 
 48 | test_that("acosh_transform calling to glm_gp works as expected", {
 49 |   mat <- matrix(rpois(n = 10 * 4, lambda = 0.3), nrow = 10, ncol = 4)
 50 |   sp_mat <- as(mat, "dgCMatrix")
 51 | 
 52 |   expect_equal(acosh_transform(sp_mat, overdispersion = TRUE, on_disk = FALSE),
 53 |                acosh_transform(mat, overdispersion = TRUE), ignore_attr = TRUE)
 54 | 
 55 |   fit <- glmGamPoi::glm_gp(mat, overdispersion_shrinkage = TRUE)
 56 |   expect_equal(acosh_transform(mat, overdispersion = TRUE),
 57 |                acosh_transform(mat, overdispersion = fit$overdispersion_shrinkage_list$dispersion_trend), ignore_attr = TRUE)
 58 | 
 59 |   fit <- glmGamPoi::glm_gp(mat, overdispersion_shrinkage = FALSE)
 60 |   expect_equal(acosh_transform(mat, overdispersion = TRUE, overdispersion_shrinkage = FALSE),
 61 |                acosh_transform(mat, overdispersion = fit$overdispersions), ignore_attr = TRUE)
 62 | })
 63 | 
 64 | 
 65 | test_that("the transition from acosh to sqrt is smooth", {
 66 | 
 67 |   expect_lt(acosh_transform(3, overdispersion = 1e-5), acosh_transform(3, overdispersion = 0))
 68 |   expect_equal(acosh_transform(3, overdispersion = 1e-5), acosh_transform(3, overdispersion = 0), tolerance = 1e-4)
 69 | 
 70 | })
 71 | 
 72 | 
 73 | test_that("shifted log is correct", {
 74 |   # Values between 1e-10 and 1e6
 75 |   xg <- rchisq(100, df = 0.3) * 1e6
 76 |   alpha <- 0.3
 77 |   expect_equal(.log_plus_alpha_impl(xg, alpha = alpha), 1/sqrt(alpha) * (log(xg + 1/(4 * alpha)) + log(4 * alpha)))
 78 | 
 79 |   # Zero stays zero
 80 |   expect_equal(.log_plus_alpha_impl(0, alpha = alpha), 1/sqrt(alpha) * (log(0 + 1/(4 * alpha)) + log(4 * alpha)))
 81 | })
 82 | 
 83 | 
 84 | test_that("shifted_log_transform errors if overdispersion and pseudo_count are specified", {
 85 |   expect_silent(shifted_log_transform(3, overdispersion = 0.01))
 86 |   expect_silent(shifted_log_transform(3, pseudo_count = 1))
 87 |   expect_silent(shifted_log_transform(3, overdispersion = 0.01, pseudo_count = 1/(4 * 0.01)))
 88 | })
 89 | 
 90 | 
 91 | test_that("acosh, sqrt, and shifted log converge to each other", {
 92 | 
 93 |   expect_equal(shifted_log_transform(1e5),
 94 |                acosh_transform(1e5), tolerance = 1e-4)
 95 | 
 96 |   expect_equal(acosh_transform(5, overdispersion = 1e-6),
 97 |                2 * sqrt(5), tolerance = 1e-4)
 98 | 
 99 | })
100 | 
101 | 
102 | test_that("different input types work", {
103 | 
104 |   n_genes <- 100
105 |   n_cells <- 500
106 | 
107 |   beta0 <- rnorm(n = n_genes, mean = 2, sd = 0.3)
108 |   sf <- rchisq(n = n_cells, df = 100)
109 |   sf <- sf / mean(sf)
110 | 
111 |   Mu <- exp( beta0 %*% t(log(sf)) )
112 | 
113 |   Y <- matrix(rnbinom(n = n_genes * n_cells, mu = Mu, size = 0.1), nrow = n_genes, ncol = n_cells)
114 | 
115 |   fit <- glmGamPoi::glm_gp(Y, design = ~ 1)
116 | 
117 |   # matrix
118 |   res <- acosh_transform(Y, verbose = TRUE)
119 |   # glmGamPoi
120 |   res2 <- acosh_transform(fit)
121 |   # SummarizedExperiment
122 |   res3 <- acosh_transform(fit$data)
123 | 
124 |   expect_equal(res, res2)
125 |   expect_equal(res, res3)
126 | 
127 | })
128 | 
129 | 
130 | test_that("overdispersion handling works", {
131 | 
132 |   n_genes <- 100
133 |   n_cells <- 500
134 |   beta0 <- rnorm(n = n_genes, mean = 2, sd = 0.3)
135 |   sf <- rchisq(n = n_cells, df = 100)
136 |   sf <- sf / mean(sf)
137 |   Mu <- exp( beta0 %*% t(log(sf)) )
138 |   Y <- matrix(rnbinom(n = n_genes * n_cells, mu = Mu, size = 0.1), nrow = n_genes, ncol = n_cells)
139 | 
140 |   fit <- glmGamPoi::glm_gp(Y, design = ~ 1)
141 |   res1 <- acosh_transform(Y, overdispersion = TRUE)
142 |   res2 <- acosh_transform(Y, overdispersion = fit$overdispersion_shrinkage_list$dispersion_trend)
143 | 
144 |   expect_equal(res1, res2)
145 | 
146 |   res3 <- shifted_log_transform(Y, overdispersion = TRUE)
147 |   res4 <- shifted_log_transform(Y, overdispersion = fit$overdispersion_shrinkage_list$dispersion_trend)
148 |   expect_equal(res3, res4)
149 | 
150 |   fit <- glmGamPoi::glm_gp(Y, design = ~ 1, overdispersion_shrinkage = TRUE)
151 |   res5 <- acosh_transform(Y, overdispersion = TRUE)
152 |   res6 <- acosh_transform(Y, overdispersion = fit$overdispersion_shrinkage_list$dispersion_trend)
153 | 
154 |   expect_equal(res5, res6)
155 | 
156 |   res7 <- shifted_log_transform(Y, overdispersion = TRUE)
157 |   res8 <- shifted_log_transform(Y, overdispersion = fit$overdispersion_shrinkage_list$dispersion_trend)
158 |   expect_equal(res7, res8)
159 | 
160 | })
161 | 
162 | 
163 | 
164 | test_that("on_disk works", {
165 |   n_genes <- 100
166 |   n_cells <- 30
167 |   beta0 <- rnorm(n = n_genes, mean = 2, sd = 0.3)
168 |   sf <- rchisq(n = n_cells, df = 100)
169 |   sf <- sf / mean(sf)
170 |   Mu <- exp( beta0 %*% t(log(sf)) )
171 |   Y <- matrix(rnbinom(n = n_genes * n_cells, mu = Mu, size = 0.1), nrow = n_genes, ncol = n_cells)
172 | 
173 |   Y_hdf5 <- HDF5Array::writeHDF5Array(Y)
174 | 
175 |   res <- acosh_transform(Y)
176 |   res1 <- acosh_transform(Y_hdf5)
177 |   res2 <- acosh_transform(Y, on_disk = TRUE)
178 | 
179 |   expect_s4_class(res1, "DelayedMatrix")
180 |   expect_s4_class(res2, "DelayedMatrix")
181 | 
182 |   expect_equal(res, as.matrix(res1))
183 |   expect_equal(res, as.matrix(res2))
184 | 
185 |   Y_sp <- as(Y, "dgCMatrix")
186 |   Y_sp_hdf5 <- HDF5Array::writeHDF5Array(Y_sp)
187 |   res3 <- acosh_transform(Y_sp)
188 |   res4 <- acosh_transform(Y_sp_hdf5)
189 |   res5 <- acosh_transform(Y_sp, on_disk = TRUE)
190 | 
191 |   expect_true(HDF5Array::is_sparse(res3))
192 |   expect_true(HDF5Array::is_sparse(res4))
193 |   expect_true(HDF5Array::is_sparse(res5))
194 |   expect_equal(res3, as(res4, "dgCMatrix"), ignore_attr = TRUE)
195 |   expect_equal(res3, as(res5, "dgCMatrix"), ignore_attr = TRUE)
196 | })
197 | 
198 | 
199 | test_that("Clipping works for Pearson residuals", {
200 | 
201 |   Y <- matrix(rnbinom(n = 10 * 30, mu = 3, size = 1/0.15), nrow = 10, ncol = 5)
202 |   resid1 <- residual_transform(Y, "pearson", clipping = FALSE)
203 |   resid2 <- residual_transform(Y, "pearson", clipping = TRUE)
204 | 
205 |   clip <- sqrt(5)
206 |   large_clip <- resid1 > clip
207 |   small_clip <- resid1 < -clip
208 | 
209 |   expect_equal(resid2[large_clip], rep(clip, sum(large_clip)))
210 |   expect_equal(resid2[small_clip], rep(-clip, sum(small_clip)))
211 |   expect_equal(resid2[! (large_clip | small_clip)],
212 |                resid1[! (large_clip | small_clip)])
213 | 
214 | 
215 |   resid3 <- residual_transform(Y, "pearson", clipping = 0.234)
216 |   clip <- 0.234
217 |   large_clip <- resid1 > clip
218 |   small_clip <- resid1 < -clip
219 | 
220 |   expect_equal(resid3[large_clip], rep(clip, sum(large_clip)))
221 |   expect_equal(resid3[small_clip], rep(-clip, sum(small_clip)))
222 |   expect_equal(resid3[! (large_clip | small_clip)],
223 |                resid1[! (large_clip | small_clip)])
224 | 
225 | })
226 | 
227 | 
228 | 
229 | test_that("Analytic Pearson residual implementation works", {
230 | 
231 |   Y <- matrix(rnbinom(n = 10 * 30, mu = 3, size = 1/0.15), nrow = 10, ncol = 5)
232 |   Y_sp <- as(Y, "dgCMatrix")
233 |   Y_hdf5 <- HDF5Array::writeHDF5Array(Y)
234 |   Y_sp_hdf5 <- HDF5Array::writeHDF5Array(Y_sp)
235 | 
236 |   resid1 <- residual_transform(Y, "pearson")
237 |   resid2 <- residual_transform(Y, "analytic_pearson")
238 |   resid3 <- residual_transform(Y, "analytic_pearson", size_factors = "poscounts")
239 |   resid4 <- residual_transform(Y_sp, "analytic_pearson")
240 |   resid5 <- residual_transform(Y_hdf5, "analytic_pearson")
241 |   resid6 <- residual_transform(Y_sp_hdf5, "analytic_pearson")
242 |   resid7 <- residual_transform(Y, "analytic_pearson", overdispersion = 0.05 + rnorm(10, sd = 0.001))
243 | 
244 |   expect_equal(resid2, as.matrix(resid4), ignore_attr = TRUE)
245 |   expect_equal(resid2, as.matrix(resid5), ignore_attr = TRUE)
246 |   expect_equal(resid2, as.matrix(resid6), ignore_attr = TRUE)
247 | 
248 |   expect_true(all(diag(cor(resid1, resid2)) > 0.95))
249 |   expect_true(all(diag(cor(resid1, resid3)) > 0.95))
250 |   expect_true(all(diag(cor(resid1, as.matrix(resid4))) > 0.95))
251 |   expect_true(all(diag(cor(resid1, as.matrix(resid5))) > 0.95))
252 |   expect_true(all(diag(cor(resid1, as.matrix(resid6))) > 0.95))
253 |   expect_true(all(diag(cor(resid1, as.matrix(resid7))) > 0.95))
254 | })
255 | 
256 | 


--------------------------------------------------------------------------------
/tests/testthat/test-transformGamPoi.R:
--------------------------------------------------------------------------------
 1 | test_that("residual_transform works", {
 2 |     n_genes <- 100
 3 |     n_cells <- 500
 4 | 
 5 |     beta0 <- rnorm(n = n_genes, mean = 2, sd = 0.3)
 6 |     sf <- rchisq(n = n_cells, df = 100)
 7 |     sf <- sf / mean(sf)
 8 | 
 9 |     Mu <- exp( beta0 %*% t(log(sf)) )
10 | 
11 |     Y <- matrix(rnbinom(n = n_genes * n_cells, mu = Mu, size = 0.1), nrow = n_genes, ncol = n_cells)
12 | 
13 |     summary(MatrixGenerics::colMeans2(Y))
14 |     summary(MatrixGenerics::rowMeans2(Y))
15 | 
16 | 
17 |     resids <- residual_transform(Y, verbose = FALSE)
18 |     res2 <- residual_transform(Y, offset_model = FALSE, verbose = FALSE, return_fit = TRUE)
19 |     expect_true(all(abs(res2$fit$Beta[,2] - 1) < 0.1))
20 | 
21 | })
22 | 
23 | 
24 | test_that("different input types work", {
25 |     n_genes <- 100
26 |     n_cells <- 500
27 | 
28 |     beta0 <- rnorm(n = n_genes, mean = 2, sd = 0.3)
29 |     sf <- rchisq(n = n_cells, df = 100)
30 |     sf <- sf / mean(sf)
31 | 
32 |     Mu <- exp( beta0 %*% t(log(sf)) )
33 | 
34 |     Y <- matrix(rnbinom(n = n_genes * n_cells, mu = Mu, size = 0.1), nrow = n_genes, ncol = n_cells)
35 | 
36 |     # matrix
37 |     res <- residual_transform(Y, verbose = FALSE, return_fit = TRUE, residual_type = "pearson")
38 |     # glmGamPoi
39 |     res2 <- residual_transform(res$fit, residual_type = "pearson")
40 |     # SummarizedExperiment
41 |     res3 <- residual_transform(res$fit$data, residual_type = "pearson")
42 | 
43 |     expect_equal(res$Residuals, res2)
44 |     expect_equal(res$Residuals, res3)
45 | })
46 | 
47 | 
48 | test_that("overdisperion = 'global' works", {
49 |     set.seed(1)
50 |     n_genes <- 100
51 |     n_cells <- 500
52 | 
53 |     beta0 <- rnorm(n = n_genes, mean = 2, sd = 0.3)
54 |     sf <- rchisq(n = n_cells, df = 100)
55 |     sf <- sf / mean(sf)
56 | 
57 |     Mu <- exp( beta0 %*% t(log(sf)) )
58 | 
59 |     Y <- matrix(rnbinom(n = n_genes * n_cells, mu = Mu, size = 1/0.1), nrow = n_genes, ncol = n_cells)
60 | 
61 |     tmp <- transformGamPoi(Y, "rand", overdispersion = "global", verbose = FALSE, on_disk = FALSE, return_fit = TRUE)
62 |     expect_equal(tmp$fit$overdispersions, rep(0.1, n_genes), tolerance = 0.1)
63 | })
64 | 
65 | 
66 | test_that("transformGamPoi errors if 'residual_type' is specified", {
67 | 
68 |     set.seed(1)
69 |     Y <- matrix(rnbinom(n = 24, mu = 3, size = 1/0.1), nrow = 3, ncol = 8)
70 |     expect_error(transformGamPoi(Y, residual_type = "pearson"))
71 | 
72 | })
73 | 
74 | 


--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 | 


--------------------------------------------------------------------------------
/vignettes/transformGamPoi.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "transformGamPoi Quickstart"
  3 | author: Constantin Ahlmann-Eltze
  4 | date: "`r Sys.Date()`"
  5 | output: BiocStyle::html_document
  6 | vignette: >
  7 |   %\VignetteIndexEntry{glmGamPoi Quickstart}
  8 |   %\VignetteEngine{knitr::rmarkdown}
  9 |   %\VignetteEncoding{UTF-8}
 10 | ---
 11 | 
 12 | ```{r, include = FALSE}
 13 | knitr::opts_chunk$set(
 14 |   collapse = TRUE,
 15 |   comment = "#>"
 16 | )
 17 | set.seed(1)
 18 | par(cex = 1.5)
 19 | ```
 20 | 
 21 | 
 22 | _transformGamPoi_ provides variance stabilizing transformations to handle the heteroskedasticity of count data. For example single-cell RNA sequencing counts vary more for highly expressed genes than for lowly expressed genes. However, many classical statistical methods perform best on data with uniform variance. This package provides a set of different variance stabilizing transformations to make the subsequent application of generic statistical methods more palatable.
 23 | 
 24 | 
 25 | # Installation
 26 | 
 27 | You can install _transformGamPoi_ from [Bioconductor](https://bioconductor.org/packages/transformGamPoi/) after it has been accepted using the following command
 28 | 
 29 | ```r
 30 | if (!requireNamespace("BiocManager", quietly = TRUE))
 31 |     install.packages("BiocManager")
 32 | BiocManager::install("transformGamPoi")
 33 | ```
 34 | 
 35 | In the mean time or to get the latest development version, you can install _transformGamPoi_ directly from [GitHub](https://github.com/const-ae/transformGamPoi) using the [devtools](https://devtools.r-lib.org/) package
 36 | 
 37 | ```r
 38 | # install.packages("devtools")
 39 | devtools::install_github("const-ae/transformGamPoi")
 40 | ```
 41 | 
 42 | # Example
 43 | 
 44 | The functions in _transformGamPoi_ take any kind of matrix-like object (e.g., `matrix`, `dgCMatrix`, `DelayedArray`, `SummarizedExperiment`, `SingleCellExperiment`) and return the corresponding transformed matrix objects. For sparse input the functions try to preserve sparsity. For container objects like `SummarizedExperiment`, _transformGamPoi_ extracts the `"counts"` assay and returns an object of the same type as that `"counts"` assay.
 45 | 
 46 | We start by loading the package to make the transformation functions available in our R session:
 47 | ```{r setup}
 48 | library(transformGamPoi)
 49 | ```
 50 | 
 51 | In the next step, we load some example data. Here, we use a single-cell RNA sequencing experiment of 4096 blood cells. For convenience, we subset the data to 1,000 genes and 5,00 cells
 52 | ```{r loadData}
 53 | # Load the 'TENxPBMCData' as a SingleCellExperiment object
 54 | sce <- TENxPBMCData::TENxPBMCData("pbmc4k")
 55 | # Subset the data to 1,000 x 500 random genes and cells
 56 | sce <- sce[sample(nrow(sce), 1000), sample(ncol(sce), 500)]
 57 | ```
 58 | If we take a look at the stored counts (mainly zeros) stored as a sparse matrix of type `DelayedMatrix`. Fortunately, the precise meaning of that storage type is not important, because _transformGamPoi_ handles this automatically.
 59 | ```{r}
 60 | assay(sce, "counts")[1:10, 1:5]
 61 | ```
 62 | 
 63 | To see what we mean by heteroskedasticity, let us compare the mean and variance for each gene across cells. We will use the [_MatrixGenerics_](https://bioconductor.org/packages/MatrixGenerics/) package to calculate the row means and row variances. You might be familiar with the [_matrixStats_](https://cran.r-project.org/package=matrixStats) package; _MatrixGenerics_ provides the same set of functions but depending on the type of the matrix automatically dispatches the call to either _matrixStats_, [_DelayedMatrixStats_](https://bioconductor.org/packages/DelayedMatrixStats/), or [_sparseMatrixStats_](https://bioconductor.org/packages/sparseMatrixStats/).
 64 | ```{r}
 65 | library(MatrixGenerics)
 66 | # Exclude genes where all counts are zero
 67 | sce <- sce[rowMeans2(counts(sce)) > 0, ]
 68 | gene_means <- rowMeans2(counts(sce))
 69 | gene_var <- rowVars(counts(sce))
 70 | plot(gene_means, gene_var, log = "xy", main = "Log-log scatter plot of mean vs variance")
 71 | abline(a = 0, b = 1)
 72 | sorted_means <- sort(gene_means)
 73 | lines(sorted_means, sorted_means + 0.2 * sorted_means^2, col = "purple")
 74 | ```
 75 | The purple line shows a quadratic mean-variance relation ($\text{Var} = \mu + 0.2 \mu^2$) typical for data that is Gamma-Poisson distributed. For example a gene with a mean expression of 5 the corresponding variance is 10, whereas for a gene with a mean expression of 500 the variance ~50,000. Here we used an overdispersion of $\alpha = 0.2$, _transformGamPoi_ provides options to either fit $\alpha$ on the data or fix it to a user-defined value.
 76 | 
 77 | _transformGamPoi_ implements two approaches for variance stabilization: (1) based on the delta method, (2) based on model residuals.
 78 | 
 79 | ## Delta method-based variance stabilizing transformations
 80 | 
 81 | The delta method relates the standard deviation of a transformed random variable $g(X_i)$ to the standard deviation of the original random variable $X_i$. This can be used to find a function such that $g(X_i) = \text{const}$. For a quadratic mean variance relation this function is
 82 | $$
 83 | g(x) = \frac{1}{\sqrt{\alpha}} \text{acosh}(2 \alpha x + 1).
 84 | $$
 85 | 
 86 | We can apply this transformation to the counts:
 87 | ```{r}
 88 | assay(sce, "acosh") <- acosh_transform(assay(sce, "counts"))
 89 | # Equivalent to 'assay(sce, "acosh") <- acosh_transform(sce)'
 90 | ```
 91 | 
 92 | 
 93 | We plot the variance of the `acosh` transformed counts and find that for $\mu < 0.5$ the variance still increases for higher average gene expression. However, for larger expression values the variance for a gene is approximately independent of the corresponding average gene expression (note that the y-axis is not log transformed anymore!).
 94 | ```{r}
 95 | acosh_var <- rowVars(assay(sce, "acosh"))
 96 | plot(gene_means, acosh_var, log = "x", main = "Log expression vs variance of acosh stabilized values")
 97 | abline(h = 1)
 98 | ```
 99 | 
100 | The most popular transformation for single cell data is $g(x) = \log(x + c)$ with pseudo-count $c=1$. It turns out that this transformation is closely related to the `acosh` transformation. When we choose $c = 1/(4\alpha)$ the two converge rapidly, only for small values the `acosh` is closer to $g(x) = 2\sqrt{x}$:
101 | ```{r}
102 | x <- seq(0, 30, length.out = 1000)
103 | y_acosh <- acosh_transform(x, overdispersion = 0.1)
104 | y_shiftLog <- shifted_log_transform(x, pseudo_count = 1/(4 * 0.1))
105 | y_sqrt <- 2 * sqrt(x) # Identical to acosh_transform(x, overdispersion = 0)
106 | ```
107 | The plot looks as follows:
108 | ```{r}
109 | plot(x, y_acosh, type = "l", col = "black", lwd = 3, ylab = "g(x)", ylim = c(0, 10))
110 | lines(x, y_shiftLog, col = "red", lwd = 3)
111 | lines(x, y_sqrt, col = "blue", lwd = 3)
112 | legend("bottomright", legend = c(expression(2*sqrt(x)), 
113 |                                  expression(1/sqrt(alpha)~acosh(2*alpha*x+1)),
114 |                                  expression(1/sqrt(alpha)~log(x+1/(4*alpha))+b)),
115 |        col = c("blue", "black", "red"), lty = 1, inset = 0.1, lwd = 3)
116 | ```
117 | The offset $b$ for the shifted logarithm has no influence on the variance stabilization. We choose $b$ such that sparsity of the input is retained (i.e., $g(0) = 0$).
118 | 
119 | 
120 | 
121 | ## Model residuals-based variance stabilizing transformations
122 | 
123 | An alternative approach for variance stabilization was suggested by Hafemeister and Satija (2019). They used the Pearson residuals from a Gamma-Poisson generalized linear model fit as the variance stabilized values. The advantage of this approach is that the variance is also stabilized for lowly expressed genes unlike the delta method-based transformations:
124 | ```{r}
125 | assay(sce, "pearson") <- residual_transform(sce, "pearson", clipping = TRUE, on_disk = FALSE)
126 | ```
127 | 
128 | ```{r}
129 | pearson_var <- rowVars(assay(sce, "pearson"))
130 | plot(gene_means, pearson_var, log = "x", main = "Log expression vs variance of Pearson residuals")
131 | abline(h = 1)
132 | ```
133 | 
134 | Pearson residuals are by definition a linear transformation. This means that for genes with strong expression differences across subgroups they cannot achieve variance stabilization. As an alternative, _transformGamPoi_ provides  randomized quantile residuals which are non-linear and exploit randomization to work around the discrete nature of counts:
135 | ```{r}
136 | assay(sce, "rand_quantile") <- residual_transform(sce, "randomized_quantile", on_disk = FALSE)
137 | ```
138 | 
139 | 
140 | ```{r}
141 | rand_quant_var <- rowVars(assay(sce, "rand_quantile"))
142 | plot(gene_means, rand_quant_var, log = "x", main = "Log expression vs variance of Randomized Quantile residuals")
143 | abline(h = 1)
144 | ```
145 | 
146 | 
147 | 
148 | 
149 | # Session Info
150 | 
151 | ```{r}
152 | sessionInfo()
153 | ```
154 | 
155 | 
156 | 


--------------------------------------------------------------------------------