├── tests ├── testthat.R └── testthat │ └── testcorral.R ├── .Rbuildignore ├── corral_sticker.png ├── vignettes ├── corral_IO.png ├── corralm_IO.png ├── ref.bib ├── corralm_alignment.Rmd └── corral_dimred.Rmd ├── inst ├── NEWS.md └── CITATION ├── man ├── na2zero.Rd ├── get_weights.Rd ├── all_are.Rd ├── pairwise_rv.Rd ├── rv.Rd ├── scal_var_mat.Rd ├── list2mat.Rd ├── var_stabilize.Rd ├── get_pct_var_exp_svd.Rd ├── sce2matlist.Rd ├── obs2probs.Rd ├── trim_matdist.Rd ├── add_embeddings2scelist.Rd ├── scal_var.Rd ├── compsvd.Rd ├── earthmover_dist.Rd ├── biplot_corral.Rd ├── corral_preproc.Rd ├── plot_embedding_sce.Rd ├── plot_embedding.Rd ├── corral.Rd └── corralm.Rd ├── README.md ├── NAMESPACE ├── R ├── checkers.R ├── scal_var.R ├── evaluation.R ├── corralm.R ├── utils.R ├── plot_embedding.R └── corral.R └── DESCRIPTION /tests/testthat.R: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | -------------------------------------------------------------------------------- /corral_sticker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laurenhsu1/corral/HEAD/corral_sticker.png -------------------------------------------------------------------------------- /vignettes/corral_IO.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laurenhsu1/corral/HEAD/vignettes/corral_IO.png -------------------------------------------------------------------------------- /vignettes/corralm_IO.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laurenhsu1/corral/HEAD/vignettes/corralm_IO.png -------------------------------------------------------------------------------- /inst/NEWS.md: -------------------------------------------------------------------------------- 1 | Changes in version 1.5.1 (2021-11-08) 2 | + Made significant updates to include methods described in our manuscript. 
Both are described in dim reduction vignette 3 | o New variations of CA that address overdispersion 4 | o Biplots - visualize relationships between cells and genes 5 | o Scaled variance plots - visualize batch integration embedding space 6 | -------------------------------------------------------------------------------- /tests/testthat/testcorral.R: -------------------------------------------------------------------------------- 1 | context("corral") 2 | 3 | library(corral) 4 | library(ade4) 5 | 6 | test_that('same eigens as dudi.coa',{ 7 | mat <- matrix(sample(0:10, 500, replace=TRUE), ncol=25) 8 | a <- corral(mat,ncomp = 2) 9 | b <- dudi.coa(mat, scannf = FALSE, nf = 2) 10 | expect_equal(all.equal(b$eig[1:2],a$d[1:2]^2),TRUE) 11 | }) 12 | 13 | 14 | # add checks that it takes sce / mat 15 | 16 | # add checks for thin and thick matrix 17 | 18 | -------------------------------------------------------------------------------- /man/na2zero.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{na2zero} 4 | \alias{na2zero} 5 | \title{Set na to 0} 6 | \usage{ 7 | na2zero(x) 8 | } 9 | \arguments{ 10 | \item{x}{matrix of values for which na values should be changed to 0} 11 | } 12 | \value{ 13 | matrix, where na values are set to 0 14 | } 15 | \description{ 16 | Set na to 0 17 | } 18 | \examples{ 19 | x <- matrix(sample(0:10, 5000, replace = TRUE), ncol = 25) 20 | x[sample(1:5000, 10)] <- NA 21 | 22 | na2zero(x) 23 | } 24 | -------------------------------------------------------------------------------- /man/get_weights.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{get_weights} 4 | \alias{get_weights} 5 | \title{Get weights} 6 | \usage{ 7 | get_weights(inp_mat) 8 | } 9 | \arguments{ 10 | \item{inp_mat}{matrix for which weights should be calculated (sparse or full)} 11 | } 12 | \value{ 13 | list of 2 elements: 'row.w' and 'col.w' contain the row and column weights respectively 14 | } 15 | \description{ 16 | Computes row weights and column weights 17 | } 18 | \examples{ 19 | mat <- matrix(sample(seq(0,20,1),100,replace = TRUE),nrow = 10) 20 | ws <- get_weights(mat) 21 | } 22 | -------------------------------------------------------------------------------- /man/all_are.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{all_are} 4 | \alias{all_are} 5 | \title{all_are} 6 | \usage{ 7 | all_are(inplist, typechar) 8 | } 9 | \arguments{ 10 | \item{inplist}{list or List to be checked} 11 | 12 | \item{typechar}{char of the type to check for} 13 | } 14 | \value{ 15 | boolean, for whether the elements of \code{inplist} are all \code{typechar} 16 | } 17 | \description{ 18 | Checks if all elements of a list or List are of a (single) particular type \code{typechar} 19 | } 20 | \examples{ 21 | x <- list(1,2) 22 | all_are(x,'numeric') 23 | all_are(x,'char') 24 | 25 | y <- list(1,2,'c') 26 | all_are(y,'numeric') 27 | all_are(y,'char') 28 | } 29 | -------------------------------------------------------------------------------- /man/pairwise_rv.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation 
in R/utils.R 3 | \name{pairwise_rv} 4 | \alias{pairwise_rv} 5 | \title{Pairwise rv coefficient} 6 | \usage{ 7 | pairwise_rv(matlist) 8 | } 9 | \arguments{ 10 | \item{matlist}{list of matrices (or matrix-like; see \code{rv} function) for which to compute pairwise RV coefficients} 11 | } 12 | \value{ 13 | matrix of the pairwise coefficients 14 | } 15 | \description{ 16 | Pairwise rv coefficient 17 | } 18 | \examples{ 19 | a <- matrix(sample(1:10,100,TRUE), nrow = 10) 20 | b <- matrix(sample(1:10,50,TRUE), nrow = 5) 21 | c <- matrix(sample(1:10,20,TRUE), nrow = 2) 22 | 23 | matlist <- list(a,b,c) 24 | pairwise_rv(matlist) 25 | pairwise_rv(lapply(matlist, t)) 26 | } 27 | -------------------------------------------------------------------------------- /man/rv.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{rv} 4 | \alias{rv} 5 | \title{rv coefficient} 6 | \usage{ 7 | rv(mat1, mat2) 8 | } 9 | \arguments{ 10 | \item{mat1}{matrix (or matrix-like, e.g., df); either columns or rows should be matched with \code{mat2}} 11 | 12 | \item{mat2}{matrix (or matrix-like, e.g., df); either columns or rows should be matched with \code{mat1}} 13 | } 14 | \value{ 15 | numeric; RV coefficient between the matched matrices 16 | } 17 | \description{ 18 | rv coefficient 19 | } 20 | \examples{ 21 | a <- matrix(sample(1:10,100, TRUE), nrow = 10) 22 | b <- matrix(sample(1:10,50, TRUE), nrow = 5) 23 | 24 | rv(a, b) # matched by columns 25 | rv(t(a), t(b)) # matched by rows 26 | } 27 | -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | citEntry(entry="article", 2 | title = "Correspondence analysis for dimension reduction, batch integration, and visualization of single-cell RNA-seq data", 3 | author = personList( as.person("Lauren L. Hsu"), 4 | as.person("Aedin C. Culhane")), 5 | year = "2023", 6 | journal = "Scientific Reports", 7 | doi = "10.1038/s41598-022-26434-1", 8 | issue = "13", 9 | number = "1197", 10 | url = "https://www.nature.com/articles/s41598-022-26434-1", 11 | textVersion = 12 | paste("Hsu, L.L., Culhane, A.C.", 13 | "Correspondence analysis for dimension reduction, batch integration, and visualization of single-cell RNA-seq data", 14 | "Scientific Reports 13, 1197 (2023)" ) ) 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

![corral sticker](corral_sticker.png)
2 | 
3 | 
4 | 5 | # corral 6 | *Dimensionality reduction and batch integration methods for single cell data* 7 | 8 | corral is a package to perform correspondence analysis on count data, particularly for single-cell (e.g., RNA-seq). 9 | 10 | It can be used on a single table for dimensionality reduction, or it can be used for integration across batches, samples, and modes. 11 | 12 | To install the dev version: `devtools::install_github('laurenhsu1/corral')` 13 | 14 | Or it can be installed with `BiocManager::install("laurenhsu1/corral", dependencies = TRUE)` 15 | 16 | # How to 17 | 18 | The package contains two main function calls: 19 | 20 | 1. `corral` for dimensionality reduction on a single table 21 | 2. `corralm` for alignment and batch integration of multiple tables 22 | 23 | See the vignettes for details on outputs and how to plot. 24 | -------------------------------------------------------------------------------- /man/scal_var_mat.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scal_var.R 3 | \name{scal_var_mat} 4 | \alias{scal_var_mat} 5 | \title{Generate a matrix of the scaled variance values} 6 | \usage{ 7 | scal_var_mat(inp, batchvec = NULL) 8 | } 9 | \arguments{ 10 | \item{inp}{corralm object or matrix; embedding to compute scaled variances} 11 | 12 | \item{batchvec}{vector; batch labels (can be numeric or char). Defaults to `NULL`, which is appropriate for using a corralm object. If using an embedding matrix for inp, then this argument must be given and length must correspond to number of rows in `inp`.} 13 | } 14 | \value{ 15 | matrix of the scaled variance values by PC (batches in rows; PCs in columns) 16 | } 17 | \description{ 18 | Generate a matrix of the scaled variance values 19 | } 20 | \examples{ 21 | dat <- matrix(rnorm(5000), ncol = 50) 22 | bv <- rep(seq(3),c(10,30,60)) 23 | scal_var_mat(dat, bv) 24 | } 25 | -------------------------------------------------------------------------------- /man/list2mat.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{list2mat} 4 | \alias{list2mat} 5 | \title{List to Matrix} 6 | \usage{ 7 | list2mat(matlist, direction = c("c", "r")[1]) 8 | } 9 | \arguments{ 10 | \item{matlist}{list of matrices to concatenate} 11 | 12 | \item{direction}{character, r or c, to indicate whether should be row-wise (i.e., rbind to match on columns) or column-wise (i.e., cbind to match on rows). 
Defaults to columnwise (matching on rows) to match convention of SingleCellExperiments} 13 | } 14 | \value{ 15 | matrix 16 | } 17 | \description{ 18 | List to Matrix 19 | } 20 | \examples{ 21 | listofmats <- list(matrix(sample(seq(0,20,1),100,replace = TRUE),nrow = 10), 22 | matrix(sample(seq(0,20,1),1000,replace = TRUE),nrow = 10)) 23 | newmat <- list2mat(listofmats) # to "cbind" them 24 | listofmats_t <- lapply(listofmats,t) 25 | newmat_t <- list2mat(listofmats_t, 'r') # to "rbind" them 26 | } 27 | -------------------------------------------------------------------------------- /man/var_stabilize.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/corral.R 3 | \name{var_stabilize} 4 | \alias{var_stabilize} 5 | \title{Apply a variance stabilizing transformation} 6 | \usage{ 7 | var_stabilize(inp, transform = c("sqrt", "freemantukey", "anscombe")) 8 | } 9 | \arguments{ 10 | \item{inp}{matrix, numeric, counts or logcounts; can be sparse Matrix or matrix} 11 | 12 | \item{transform}{character indicating which method should be applied. Defaults to the square root transform (`"sqrt"`). Other options include `"freemantukey"` and `"anscombe"`.} 13 | } 14 | \value{ 15 | variance-stabilized matrix; sparse if possible 16 | } 17 | \description{ 18 | Prior to running CA, there is an option to apply a variance stabilizing transformation. This function can be called explicitly or used with the `vst_mth` argument in \code{corral} and \code{corral_preproc}. 19 | } 20 | \examples{ 21 | x <- as.matrix(rpois(100, lambda = 50), ncol = 10) 22 | vst_x <- var_stabilize(x) 23 | } 24 | -------------------------------------------------------------------------------- /man/get_pct_var_exp_svd.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{get_pct_var_exp_svd} 4 | \alias{get_pct_var_exp_svd} 5 | \title{Compute percent of variance explained} 6 | \usage{ 7 | get_pct_var_exp_svd(thissvd, preproc_mat = thissvd$d) 8 | } 9 | \arguments{ 10 | \item{thissvd}{list outputted from an svd function (svd, irlba; can also take output from \code{\link{corral_mat}} and \code{\link{corralm_matlist}})} 11 | 12 | \item{preproc_mat}{matrix of pre-processed values (optional) - important to include if the svd is only partial as this is used to compute the sum of eigenvalues} 13 | } 14 | \value{ 15 | vector of percent variance explained values, indexed by PC 16 | } 17 | \description{ 18 | Compute percent of variance explained 19 | } 20 | \examples{ 21 | mat <- matrix(sample(seq(0,20,1),100,replace = TRUE),nrow = 10) 22 | my_svd <- svd(mat) 23 | get_pct_var_exp_svd(my_svd) # this works if my_svd is a full svd 24 | my_irl <- irlba::irlba(mat,nv = 2) 25 | get_pct_var_exp_svd(my_irl, preproc_mat = mat) # ... 
otherwise use this 26 | } 27 | -------------------------------------------------------------------------------- /man/sce2matlist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{sce2matlist} 4 | \alias{sce2matlist} 5 | \title{SingleCellExperiment to list of matrices} 6 | \usage{ 7 | sce2matlist(sce, splitby, to_include = NULL, whichmat = "counts") 8 | } 9 | \arguments{ 10 | \item{sce}{SingleCellExperiment that is to be separated into list of count matrices} 11 | 12 | \item{splitby}{character; name of the attribute from colData that should be used to separate the SCE} 13 | 14 | \item{to_include}{(optional) character vector; determines which values from the "splitby" column will be included in the outputted matlist. NULL is the default, and will result in selecting all elements} 15 | 16 | \item{whichmat}{character; defaults to \code{counts}, can also use \code{logcounts} or \code{normcounts} if stored in the \code{sce} object} 17 | } 18 | \value{ 19 | list of matrices 20 | } 21 | \description{ 22 | SingleCellExperiment to list of matrices 23 | } 24 | \examples{ 25 | library(DuoClustering2018) 26 | sce <- sce_full_Zhengmix4eq() 27 | matlist <- sce2matlist(sce = sce, splitby = 'phenoid', whichmat = 'logcounts') 28 | } 29 | -------------------------------------------------------------------------------- /man/obs2probs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/evaluation.R 3 | \name{obs2probs} 4 | \alias{obs2probs} 5 | \title{Observations --> discrete probabilities} 6 | \usage{ 7 | obs2probs(obs, numbins = 100, startbin = min(obs), endbin = max(obs) + 1e-05) 8 | } 9 | \arguments{ 10 | \item{obs}{vector of numeric, with the observations} 11 | 12 | \item{numbins}{int, the number of evenly sized bins to discretize the observations to} 13 | 14 | \item{startbin}{numeric, the starting value for the smallest bin. Defaults to taking the minimum of obs} 15 | 16 | \item{endbin}{numeric, the ending value for the largest bin. Defaults to taking the maximum of obs (plus a tiny decimal to ensure full range of obs is captured)} 17 | } 18 | \value{ 19 | dataframe, results has rows corresponding to each bin with columns for probability ('prob'), cumulative frequency ('cumfreq'), and frequency ('freq') of observations falling into that bin. The 'bins' column indicates the end of the bin (start is the preceding column) 20 | } 21 | \description{ 22 | usage: 23 | embedding <- matrix(sample(x = seq(0,10,.1),200, replace = TRUE)) 24 | disc_probs <- obs2probs(embedding) 25 | } 26 | \keyword{internal} 27 | -------------------------------------------------------------------------------- /man/trim_matdist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{trim_matdist} 4 | \alias{trim_matdist} 5 | \title{Trim extreme values in a pre-processed matrix} 6 | \usage{ 7 | trim_matdist(mat, pct_trim = 0.01) 8 | } 9 | \arguments{ 10 | \item{mat}{matrix; should be pre-processed/normalized to some sort of approximately normally distributed statistic (e.g., chi-squared transformation with `corral_preproc` or Z-score normalization)} 11 | 12 | \item{pct_trim}{numeric; the percent of observations to smooth. 
Defaults to `pct_trim` = .01, which corresponds to smoothing all observations to be between the .5 percentile and 99.5 percentile range of the input matrix} 13 | } 14 | \value{ 15 | smoothed matrix 16 | } 17 | \description{ 18 | Smooths the extreme values in a chi-square-transformed matrix to lessen the influence of "rare objects." 19 | } 20 | \details{ 21 | (Usually not called directly; can be included by using the `smooth` argument in the `corral`, `corralm`, and `corral_preproc` functions) 22 | } 23 | \examples{ 24 | count_mat <- matrix(rpois(10000, 300)*rbinom(10000,1,.1), ncol = 100) 25 | smoothed_preproc_mat <- corral_preproc(count_mat, smooth = TRUE) 26 | } 27 | -------------------------------------------------------------------------------- /man/add_embeddings2scelist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{add_embeddings2scelist} 4 | \alias{add_embeddings2scelist} 5 | \title{Add embeddings to list of SCEs} 6 | \usage{ 7 | add_embeddings2scelist(scelist, embeddings, slotname = "corralm") 8 | } 9 | \arguments{ 10 | \item{scelist}{list of SingleCellExperiments; to which the corresponding embeddings should be added} 11 | 12 | \item{embeddings}{matrix; the embeddings outputted from a dimension reduction, e.g. \code{\link{corralm}}. Rows in this table correspond to columns in the SCEs in \code{scelist} (if all the SCEs were column-bound), and row indices should correspond to cells.} 13 | 14 | \item{slotname}{character; name of the slot for the reduced dim embedding; defaults to \code{corral}} 15 | } 16 | \value{ 17 | list of SingleCellExperiments with respective embeddings stored in them 18 | } 19 | \description{ 20 | Add embeddings to list of SCEs 21 | } 22 | \examples{ 23 | library(DuoClustering2018) 24 | sce <- sce_full_Zhengmix4eq() 25 | scelist <- list(sce,sce) 26 | embeddings <- matrix(sample(seq(0,20,1),dim(sce)[2]*6,replace = TRUE),nrow = dim(sce)[2]*2) 27 | scelist <- add_embeddings2scelist(scelist, embeddings) 28 | } 29 | -------------------------------------------------------------------------------- /man/scal_var.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scal_var.R 3 | \name{scal_var} 4 | \alias{scal_var} 5 | \title{Generate a scaled variance plot for an integrative embedding} 6 | \usage{ 7 | scal_var( 8 | inp, 9 | batchvec = NULL, 10 | pcs = seq(3), 11 | returngg = FALSE, 12 | showplot = TRUE, 13 | plot_subtitle = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{inp}{corralm object or matrix; embedding to compute scaled variances} 18 | 19 | \item{batchvec}{vector; batch labels (can be numeric or char). Defaults to `NULL`, which is appropriate for using a corralm object. If using an embedding matrix for inp, then this argument must be given and length must correspond to number of rows in `inp`.} 20 | 21 | \item{pcs}{numeric; vector of which PCs should be shown. Defaults to 1:3} 22 | 23 | \item{returngg}{boolean; whether or not to return a \code{\link{ggplot2}} object, defaults \code{FALSE}} 24 | 25 | \item{showplot}{boolean; whether or not to show the plot, defaults \code{TRUE}} 26 | 27 | \item{plot_subtitle}{string; the text that should show in the subtitle for the plot. 
defaults to NULL} 28 | } 29 | \value{ 30 | N/A or a ggplot object 31 | } 32 | \description{ 33 | Generate a scaled variance plot for an integrative embedding 34 | } 35 | \examples{ 36 | dat <- matrix(rnorm(10000), ncol = 50) 37 | bv <- rep(seq(4),c(10,30,60,100)) 38 | scal_var(dat,bv, pcs = seq(4)) 39 | } 40 | -------------------------------------------------------------------------------- /man/compsvd.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/corral.R 3 | \name{compsvd} 4 | \alias{compsvd} 5 | \title{compsvd: Compute Singular Value Decomposition (SVD)} 6 | \usage{ 7 | compsvd(mat, method = c("irl", "svd"), ncomp = 30, ...) 8 | } 9 | \arguments{ 10 | \item{mat}{matrix, pre-processed input; can be sparse or full (pre-processing can be performed using \code{\link{corral_preproc}} from this package)} 11 | 12 | \item{method}{character, the algorithm to be used for svd. Default is irl. Currently supports 'irl' for irlba::irlba or 'svd' for stats::svd} 13 | 14 | \item{ncomp}{numeric, number of components; Default is 30} 15 | 16 | \item{...}{(additional arguments for methods)} 17 | } 18 | \value{ 19 | SVD result - a list with the following elements: 20 | \describe{ 21 | \item{\code{d}}{a vector of the diagonal singular values of the input \code{mat}. Note that using \code{svd} will result in the full set of singular values, while \code{irlba} will only compute the first \code{ncomp} singular values.} 22 | \item{\code{u}}{a matrix of with the left singular vectors of \code{mat} in the columns} 23 | \item{\code{v}}{a matrix of with the right singular vectors of \code{mat} in the columns} 24 | \item{\code{eigsum}}{sum of the eigenvalues, for calculating percent variance explained} 25 | } 26 | } 27 | \description{ 28 | Computes SVD. 29 | } 30 | \examples{ 31 | mat <- matrix(sample(0:10, 2500, replace=TRUE), ncol=50) 32 | compsvd(mat, method = 'irl', ncomp = 5) 33 | } 34 | -------------------------------------------------------------------------------- /man/earthmover_dist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/evaluation.R 3 | \name{earthmover_dist} 4 | \alias{earthmover_dist} 5 | \title{Earthmover distance (and general Wasserstein distance)} 6 | \usage{ 7 | earthmover_dist(batch1, batch2, whichdim = 1, numbins = 100, p_param = 1) 8 | } 9 | \arguments{ 10 | \item{batch1}{matrix; subset of observations from an embedding correponding to some attribute (e.g., batch or phenotype)} 11 | 12 | \item{batch2}{matrix; subset of observations from an embedding correponding to some attribute (e.g., batch or phenotype)} 13 | 14 | \item{whichdim}{int; which dimension (i.e., column) from the embeddings is used. defaults on first} 15 | 16 | \item{numbins}{int; number of bins for the probability discretization (defaults to 100)} 17 | 18 | \item{p_param}{int; penalty parameter for general Wasserstein distance. Defaults to 1, which corresonds to earthmover.} 19 | } 20 | \value{ 21 | num; the distance 22 | } 23 | \description{ 24 | i.e., wasserstein distance with L1 (p_param = 1); can also use other penalties > 1 25 | (Not technically earthmover distance if using other p_param values) 26 | } 27 | \examples{ 28 | # To compare distributions of reduced dimension values to assess similarity, 29 | # e.g. 
as a metric for batch integration 30 | embedding <- matrix(sample(x = seq(0,10,.1),1000, replace = TRUE),ncol = 5) 31 | batch <- matrix(sample(c(1,2),200, replace = TRUE)) 32 | earthmover_dist(embedding[which(batch == 1),],embedding[which(batch == 2),]) 33 | } 34 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(print,corral) 4 | S3method(print,corralm) 5 | export(add_embeddings2scelist) 6 | export(all_are) 7 | export(biplot_corral) 8 | export(compsvd) 9 | export(corral) 10 | export(corral_mat) 11 | export(corral_preproc) 12 | export(corral_sce) 13 | export(corralm) 14 | export(corralm_matlist) 15 | export(corralm_sce) 16 | export(earthmover_dist) 17 | export(get_pct_var_exp_svd) 18 | export(get_weights) 19 | export(list2mat) 20 | export(na2zero) 21 | export(pairwise_rv) 22 | export(plot_embedding) 23 | export(plot_embedding_sce) 24 | export(rv) 25 | export(scal_var) 26 | export(scal_var_mat) 27 | export(sce2matlist) 28 | export(trim_matdist) 29 | export(var_stabilize) 30 | import(SingleCellExperiment) 31 | import(ggplot2) 32 | import(ggthemes) 33 | import(gridExtra) 34 | import(pals) 35 | import(reshape2) 36 | importClassesFrom(Matrix,dgCMatrix) 37 | importClassesFrom(MultiAssayExperiment,MultiAssayExperiment) 38 | importClassesFrom(SingleCellExperiment,SingleCellExperiment) 39 | importFrom(Matrix,Matrix) 40 | importFrom(Matrix,colSums) 41 | importFrom(Matrix,rowSums) 42 | importFrom(MultiAssayExperiment,assays) 43 | importFrom(MultiAssayExperiment,experiments) 44 | importFrom(MultiAssayExperiment,intersectRows) 45 | importFrom(SingleCellExperiment,colData) 46 | importFrom(SingleCellExperiment,reducedDim) 47 | importFrom(SummarizedExperiment,assay) 48 | importFrom(ggthemes,scale_color_few) 49 | importFrom(grDevices,colorRampPalette) 50 | importFrom(graphics,plot) 51 | importFrom(irlba,irlba) 52 | importFrom(methods,is) 53 | importFrom(stats,quantile) 54 | importFrom(stats,var) 55 | importFrom(transport,wasserstein) 56 | importFrom(utils,combn) 57 | -------------------------------------------------------------------------------- /R/checkers.R: -------------------------------------------------------------------------------- 1 | #' @keywords internal 2 | .check_vals <- function(mat){ 3 | if(!is.numeric(mat)){ 4 | stop('Matrix input should contain only numbers.') 5 | } 6 | else if (min(mat) < 0){ 7 | stop('Matrix input should only contain positive values.') 8 | } 9 | } 10 | 11 | #' @keywords internal 12 | .check_mat <- function(mat){ 13 | .check_vals(mat) 14 | if (min(dim(mat)) < 2){ 15 | stop('Matrix input must have more than 1 element and more than 1 feature.') 16 | } 17 | } 18 | 19 | #' @keywords internal 20 | .check_dims <- function(matlist){ 21 | dims <- unlist(lapply(matlist,nrow)) 22 | if(length(unique(dims)) > 1) { 23 | stop('If performing multi-table analysis, the matrices must be matched by rows; currently the dimensions do not match. \nIf they are matched by columns, then transpose the matrices.')} 24 | } 25 | 26 | #' @keywords internal 27 | .check_rw_contrib <- function(matlist, rw_contrib){ 28 | # Verifying valid input for rw_contrib, otherwise setting to equal weight. 
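  #   rw_contrib is expected to be a numeric vector with one weight per matrix in matlist;
  #   a length mismatch or non-numeric entries both fall back to equal weights, rep(1, length(matlist)).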
29 | matlist_len <- length(matlist) 30 | if(matlist_len != length(rw_contrib)){ 31 | cat('\nThe provided weights did not match number of batches (i.e., number of matrices).\nThey will be set to equal weight.\n') 32 | return(rep(1, matlist_len)) 33 | } 34 | else if(sum(is.na(as.numeric(rw_contrib)))){ 35 | cat('\nNon-numeric values provided in rw_contrib.\nThey will be set to equal weight.\n') 36 | return(rep(1, matlist_len)) 37 | } 38 | else{ 39 | return(rw_contrib) 40 | } 41 | } 42 | 43 | #' @keywords internal 44 | .check_ncomp <- function(mat, ncomp){ 45 | max_comps <- min(dim(mat)) - 1 46 | if(ncomp > max_comps){ 47 | ncomp <- max_comps 48 | cat('Too many components requested; setting ncomp =', max_comps, '\n') 49 | } 50 | return(ncomp) 51 | } 52 | 53 | -------------------------------------------------------------------------------- /vignettes/ref.bib: -------------------------------------------------------------------------------- 1 | @article{zmdata, 2 | AUTHOR = {Duò, A and Robinson, MD and Soneson, C}, 3 | TITLE = {A systematic performance evaluation of clustering methods for single-cell RNA-seq data [version 2; peer review: 2 approved], 4 | JOURNAL = {F1000Research}, 5 | VOLUME = {7}, 6 | YEAR = {2018}, 7 | NUMBER = {1141}, 8 | DOI = {10.12688/f1000research.15666.2} 9 | }} 10 | 11 | @Article{zheng, 12 | title = {Massively parallel digital transcriptional profiling of single cells}, 13 | volume = {8}, 14 | issn = {2041-1723}, 15 | url = {http://www.nature.com/articles/ncomms14049}, 16 | doi = {10.1038/ncomms14049}, 17 | language = {en}, 18 | number = {1}, 19 | urldate = {2020-04-24}, 20 | journal = {Nature Communications}, 21 | author = {Zheng, Grace X. Y. and Terry, Jessica M. and Belgrader, Phillip and Ryvkin, Paul and Bent, Zachary W. and Wilson, Ryan and Ziraldo, Solongo B. and Wheeler, Tobias D. and McDermott, Geoff P. and Zhu, Junjie and Gregory, Mark T. and Shuga, Joe and Montesclaros, Luz and Underwood, Jason G. and Masquelier, Donald A. and Nishimura, Stefanie Y. and Schnall-Levin, Michael and Wyatt, Paul W. and Hindson, Christopher M. and Bharadwaj, Rajiv and Wong, Alexander and Ness, Kevin D. and Beppu, Lan W. and Deeg, H. Joachim and McFarland, Christopher and Loeb, Keith R. and Valente, William J. and Ericson, Nolan G. and Stevens, Emily A. and Radich, Jerald P. and Mikkelsen, Tarjei S. and Hindson, Benjamin J. and Bielas, Jason H.}, 22 | month = apr, 23 | year = {2017}, 24 | pages = {14049} 25 | } 26 | 27 | @article{scmix, 28 | title = {Benchmarking single cell {RNA}-sequencing analysis pipelines using mixture control experiments}, 29 | volume = {16}, 30 | issn = {1548-7091, 1548-7105}, 31 | url = {http://www.nature.com/articles/s41592-019-0425-8}, 32 | doi = {10.1038/s41592-019-0425-8}, 33 | language = {en}, 34 | number = {6}, 35 | urldate = {2019-11-21}, 36 | journal = {Nature Methods}, 37 | author = {Tian, Luyi and Dong, Xueyi and Freytag, Saskia and Lê Cao, Kim-Anh and Su, Shian and JalalAbadi, Abolfazl and Amann-Zalcenstein, Daniela and Weber, Tom S. and Seidi, Azadeh and Jabbari, Jafar S. and Naik, Shalin H. 
and Ritchie, Matthew E.}, 38 | month = jun, 39 | year = {2019}, 40 | pages = {479--487} 41 | } -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: corral 2 | Title: Correspondence Analysis for Single Cell Data 3 | Version: 1.9.1 4 | Date: 2023-02-09 5 | Authors@R: 6 | c(person(given = "Lauren", family = "Hsu", 7 | role = c("aut", "cre"), 8 | email = "lrnshoe@gmail.com", 9 | comment = c(ORCID = "0000-0002-6035-7381")), 10 | person(given = "Aedin", family = "Culhane", 11 | role = c("aut"), 12 | email = "Aedin.Culhane@ul.ie", 13 | comment = c(ORCID = "0000-0002-1395-9734"))) 14 | Description: 15 | Correspondence analysis (CA) is a matrix factorization method, and is 16 | similar to principal components analysis (PCA). Whereas PCA is designed for 17 | application to continuous, approximately normally distributed data, CA is 18 | appropriate for non-negative, count-based data that are in the same additive scale. 19 | The corral package implements CA for dimensionality reduction of a single matrix of 20 | single-cell data, as well as a multi-table adaptation of CA that leverages 21 | data-optimized scaling to align data generated from different sequencing platforms 22 | by projecting into a shared latent space. corral utilizes sparse matrices and a 23 | fast implementation of SVD, and can be called directly on Bioconductor objects 24 | (e.g., SingleCellExperiment) for easy pipeline integration. 25 | The package also includes additional options, including variations of CA to 26 | address overdispersion in count data (e.g., Freeman-Tukey chi-squared residual), 27 | as well as the option to apply CA-style processing to continuous data 28 | (e.g., proteomic TOF intensities) with the Hellinger distance adaptation of CA. 29 | Imports: 30 | ggplot2, 31 | ggthemes, 32 | grDevices, 33 | gridExtra, 34 | irlba, 35 | Matrix, 36 | methods, 37 | MultiAssayExperiment, 38 | pals, 39 | reshape2, 40 | SingleCellExperiment, 41 | SummarizedExperiment, 42 | transport 43 | Suggests: 44 | ade4, 45 | BiocStyle, 46 | CellBench, 47 | DuoClustering2018, 48 | knitr, 49 | rmarkdown, 50 | scater, 51 | testthat 52 | License: GPL-2 53 | RoxygenNote: 7.1.2 54 | VignetteBuilder: knitr 55 | biocViews: 56 | BatchEffect, 57 | DimensionReduction, 58 | GeneExpression, 59 | Preprocessing, 60 | PrincipalComponent, 61 | Sequencing, 62 | SingleCell, 63 | Software, 64 | Visualization 65 | Encoding: UTF-8 66 | -------------------------------------------------------------------------------- /man/biplot_corral.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot_embedding.R 3 | \name{biplot_corral} 4 | \alias{biplot_corral} 5 | \title{Generate biplot for corral object} 6 | \usage{ 7 | biplot_corral( 8 | corral_obj, 9 | color_vec, 10 | text_vec, 11 | feat_name = "(genes)", 12 | nfeat = 20, 13 | xpc = 1, 14 | plot_title = "Biplot", 15 | text_size = 2, 16 | xjitter = 0.005, 17 | yjitter = 0.005, 18 | coords = c("svd", "PC", "SC") 19 | ) 20 | } 21 | \arguments{ 22 | \item{corral_obj}{list outputted by the \code{corral} function} 23 | 24 | \item{color_vec}{vector; length should correspond to the number of rows in v of \code{corral_obj}, and each element of the vector classifies that cell (entry) in the embedding to that particular class, which will be colored the same. 
(e.g., cell type)} 25 | 26 | \item{text_vec}{vector; length should correspond to the number of rows in u of \code{corral_obj}, and each element of the vector is the label for the respective feature that would show on the biplot.} 27 | 28 | \item{feat_name}{char; the label will in the legend. Defaults to \code{(genes)}.} 29 | 30 | \item{nfeat}{int; the number of features to include. The function will first order them by distance from origin in the selected dimensions, then select the top n to be displayed.} 31 | 32 | \item{xpc}{int; which PC to put on the x-axis (defaults to 1)} 33 | 34 | \item{plot_title}{char; title of plot (defaults to *Biplot*)} 35 | 36 | \item{text_size}{numeric; size of the feature labels given in \code{text_vec} (defaults to 2; for \code{ggplot2})} 37 | 38 | \item{xjitter}{numeric; the amount of jitter for the text labels in x direction (defaults to .005; for \code{ggplot2})} 39 | 40 | \item{yjitter}{numeric; the amount of jitter for the text labels in y direction (defaults to .005; for \code{ggplot2})} 41 | 42 | \item{coords}{char; indicator for sets of coordinates to use. \code{svd} plots the left and right singular vectors as outputted by SVD (\code{u} and \code{v}), which \code{PC} and \code{SC} use the principal and standard coordinates, respectively (defaults to \code{svd})} 43 | } 44 | \value{ 45 | ggplot2 object of the biplot 46 | } 47 | \description{ 48 | Generate biplot for corral object 49 | } 50 | \examples{ 51 | library(DuoClustering2018) 52 | library(SingleCellExperiment) 53 | zm4eq.sce <- sce_full_Zhengmix4eq() 54 | zm4eq.countmat <- counts(zm4eq.sce) 55 | zm4eq.corral_obj <- corral(zm4eq.countmat) 56 | gene_names <- rowData(zm4eq.sce)$symbol 57 | ctvec <- zm4eq.sce$phenoid 58 | 59 | biplot_corral(corral_obj = zm4eq.corral_obj, color_vec = ctvec, text_vec = gene_names) 60 | } 61 | -------------------------------------------------------------------------------- /man/corral_preproc.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/corral.R 3 | \name{corral_preproc} 4 | \alias{corral_preproc} 5 | \title{Preprocess a matrix for SVD to perform Correspondence Analysis (CA)} 6 | \usage{ 7 | corral_preproc( 8 | inp, 9 | rtype = c("standardized", "indexed", "hellinger", "freemantukey", "pearson"), 10 | vst_mth = c("none", "sqrt", "freemantukey", "anscombe"), 11 | powdef_alpha = NULL, 12 | row.w = NULL, 13 | col.w = NULL, 14 | smooth = FALSE, 15 | ... 16 | ) 17 | } 18 | \arguments{ 19 | \item{inp}{matrix, numeric, counts or logcounts; can be sparse Matrix or matrix} 20 | 21 | \item{rtype}{character indicating what type of residual should be computed; options are `"indexed"`, `"standardized"` (or `"pearson"` is equivalent), `"freemantukey"`, and `"hellinger"`; defaults to `"standardized"` for \code{\link{corral}} and `"indexed"` for \code{\link{corralm}}. `"indexed"`, `"standardized"`, and `"freemantukey"` compute the respective chi-squared residuals and are appropriate for count data. The `"hellinger"` option is appropriate for continuous data.} 22 | 23 | \item{vst_mth}{character indicating whether a variance-stabilizing transform should be applied prior to calculating chi-squared residuals; defaults to `"none"`} 24 | 25 | \item{powdef_alpha}{numeric for the power that should be applied if using power deflation. Must be in (0,1), and if provided a number outside this range, will be ignored. 
Defaults to `NULL` which does not perform this step.} 26 | 27 | \item{row.w}{numeric vector; Default is \code{NULL}, to compute row.w based on \code{inp}. Use this parameter to replace computed row weights with custom row weights} 28 | 29 | \item{col.w}{numeric vector; Default is \code{NULL}, to compute col.w based on \code{inp}. Use this parameter to replace computed column weights with custom column weights} 30 | 31 | \item{smooth}{logical; Whether or not to perform the additional smoothing step with `trim_matdist`. Default is \code{FALSE}. Incompatible with `powdef_alpha`, so that parameter takes precedence over this one.} 32 | 33 | \item{...}{(additional arguments for methods)} 34 | } 35 | \value{ 36 | matrix, processed for input to \code{compsvd} to finish CA routine 37 | } 38 | \description{ 39 | This function performs the row and column scaling pre-processing operations, prior to SVD, for the corral methods. See \code{\link{corral}} for single matrix correspondence analysis and \code{\link{corralm}} for multi-matrix correspondence analysis. 40 | } 41 | \details{ 42 | 43 | } 44 | \examples{ 45 | mat <- matrix(sample(0:10, 500, replace=TRUE), ncol=25) 46 | mat_corral <- corral_preproc(mat) 47 | corral_output <- compsvd(mat_corral, ncomp = 5) 48 | } 49 | -------------------------------------------------------------------------------- /man/plot_embedding_sce.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot_embedding.R 3 | \name{plot_embedding_sce} 4 | \alias{plot_embedding_sce} 5 | \title{Plot selected PCs from an embedding saved in a SingleCellExperiment object} 6 | \usage{ 7 | plot_embedding_sce( 8 | sce, 9 | which_embedding, 10 | color_attr = NULL, 11 | color_title = color_attr, 12 | ellipse_attr = NULL, 13 | facet_attr = NULL, 14 | ... 
15 | ) 16 | } 17 | \arguments{ 18 | \item{sce}{\code{\link{SingleCellExperiment}} object; contains the embedding within the \code{reducedDim} slot} 19 | 20 | \item{which_embedding}{character; for the embedding to plot} 21 | 22 | \item{color_attr}{character; name of the attribute within \code{colData} to use for assigning colors (in lieu of \code{color_vec} in the \code{\link{plot_embedding}} function)} 23 | 24 | \item{color_title}{character; title to use for colors legend, defaults to the same as \code{color_attr}} 25 | 26 | \item{ellipse_attr}{character; name of the attribute within \code{colData} to use for drawing ellipse(s) (in lieu of \code{ellipse_vec} in the \code{\link{plot_embedding}} function)} 27 | 28 | \item{facet_attr}{character; name of the attribute within \code{colData} to use for faceting (in lieu of \code{facet_vec} in the \code{\link{plot_embedding}} function)} 29 | 30 | \item{...}{additional optional arguments - see \code{\link{plot_embedding}} function for details on other potential arguments: \code{xpc}, \code{ypc}, \code{plot_title}, \code{color_title} (if title is different from \code{color_attr}), \code{ptsize}, \code{saveplot}, \code{plotfn}, \code{showplot}, \code{returngg}, \code{color_pal_vec}, \code{dimname}} 31 | } 32 | \value{ 33 | default none; options to display plot (\code{showplot}), save plot (\code{saveplot}), and/or return \code{\link{ggplot2}} object (\code{returngg}) 34 | } 35 | \description{ 36 | Plot selected PCs from an embedding saved in a SingleCellExperiment object 37 | } 38 | \examples{ 39 | library(DuoClustering2018) 40 | library(SingleCellExperiment) 41 | sce <- sce_full_Zhengmix4eq()[1:100,sample(1:3500,100,replace = FALSE)] 42 | colData(sce)$Method <- matrix(sample(c('Method1','Method2'),100,replace = TRUE)) 43 | sce <- corralm(sce, splitby = 'Method') 44 | 45 | # to plot and show only 46 | plot_embedding_sce(sce = sce, 47 | which_embedding = 'corralm', 48 | xpc = 1, 49 | plot_title = 'corralm: PC1 by PC2', 50 | color_attr = "Method", 51 | ellipse_attr = 'phenoid', 52 | saveplot = FALSE) 53 | 54 | # to return ggplot2 object and display, but not save 55 | corralm_ggplot <- plot_embedding_sce(sce = sce, 56 | which_embedding = 'corralm', 57 | xpc = 1, 58 | plot_title = 'corralm: PC1 by PC2', 59 | color_attr = 'Method', 60 | ellipse_attr = 'phenoid', 61 | returngg = TRUE, 62 | saveplot = FALSE) 63 | 64 | 65 | } 66 | -------------------------------------------------------------------------------- /R/scal_var.R: -------------------------------------------------------------------------------- 1 | #' Generate a matrix of the scaled variance values 2 | #' 3 | #' @param inp corralm object or matrix; embedding to compute scaled variances 4 | #' @param batchvec vector; batch labels (can be numeric or char). Defaults to `NULL`, which is appropriate for using a corralm object. If using an embedding matrix for inp, then this argument must be given and length must correspond to number of rows in `inp`. 
5 | #' 6 | #' @return matrix of the scaled variance values by PC (batches in rows; PCs in columns) 7 | #' @export 8 | #' 9 | #' @examples 10 | #' dat <- matrix(rnorm(5000), ncol = 50) 11 | #' bv <- rep(seq(3),c(10,30,60)) 12 | #' scal_var_mat(dat, bv) 13 | scal_var_mat <- function(inp, batchvec = NULL){ 14 | if('corralm' %in% class(inp)){ 15 | splitvec <- rep(rownames(inp$batch_sizes),inp$batch_sizes[,2]) 16 | emb <- inp$v 17 | } 18 | else if(!is.null(batchvec)){ 19 | if(length(batchvec) != nrow(inp)){ 20 | stop('batchvec must be same length as number of rows in inp') 21 | } 22 | emb <- inp 23 | splitvec <- batchvec 24 | } 25 | else{ 26 | stop('Please provide a valid input: either corralm object or embedding with batchvec arg') 27 | } 28 | sepemb <- apply(emb, MARGIN = 2, FUN = split, f = splitvec) 29 | sep_var <- lapply(sepemb, FUN = function(x) rbind(unlist(lapply(x, var)))) 30 | sv_mat <- Reduce(cbind,lapply(sep_var, t)) 31 | sv_mat[,1] 32 | 33 | overall_var <- apply(emb, MARGIN = 2, FUN = var) 34 | 35 | sv_mat <- sweep(sv_mat, MARGIN = 2, STATS = overall_var, FUN = '/') 36 | sv_mat 37 | } 38 | 39 | #' Generate a scaled variance plot for an integrative embedding 40 | #' 41 | #' @param pcs numeric; vector of which PCs should be shown. Defaults to 1:3 42 | #' @param plot_subtitle string; the text that should show in the subtitle for the plot. defaults to NULL 43 | #' @inheritParams scal_var_mat 44 | #' @inheritParams plot_embedding 45 | #' 46 | #' @import ggplot2 47 | #' @import ggthemes 48 | #' @import reshape2 49 | #' @return N/A or a ggplot object 50 | #' @export 51 | #' 52 | #' @importFrom stats quantile var 53 | #' 54 | #' @examples 55 | #' dat <- matrix(rnorm(10000), ncol = 50) 56 | #' bv <- rep(seq(4),c(10,30,60,100)) 57 | #' scal_var(dat,bv, pcs = seq(4)) 58 | scal_var <- function(inp, batchvec = NULL, pcs = seq(3), returngg = FALSE, showplot = TRUE, plot_subtitle = NULL){ 59 | svmat <- scal_var_mat(inp, batchvec) 60 | svmelt <- reshape2::melt(svmat[,pcs]) 61 | colnames(svmelt) <- c('Batch','PC','sv') 62 | svmelt$Batch <- as.factor(svmelt$Batch) 63 | 64 | svmelt$PC_ind <- svmelt$PC + seq(length(svmelt$PC)) 65 | 66 | pc_seplines <- seq(2,max(svmelt$PC_ind))[-which(seq(2,max(svmelt$PC_ind)) %in% svmelt$PC_ind)] 67 | pc_seplines <- c(pc_seplines, max(svmelt$PC_ind)+1) 68 | pc_lablocs <- pc_seplines - nrow(svmat)/2 69 | 70 | ggobj <- ggplot(svmelt, aes(x = PC_ind, y = sv, colour = Batch, group = Batch)) + 71 | geom_hline(yintercept=1, color = 'gray') + 72 | geom_segment(aes(x=PC_ind, xend=PC_ind, y=1, yend=sv), color="gray") + 73 | geom_point(size = 2) + ggthemes::scale_color_hc() + # other color option - scale_color_few() 74 | theme_classic() + 75 | geom_vline(xintercept = pc_seplines, color = '#555555') + 76 | labs(x = 'Component', 77 | y = 'Scaled variance by group for top PCs', 78 | title = 'Scaled variance by batch', 79 | subtitle = plot_subtitle, 80 | colour = 'Batch') + 81 | scale_x_continuous(breaks = pc_lablocs, 82 | labels = paste0('PC',pcs)) + 83 | theme(axis.ticks.x = element_blank(), axis.title.x = element_text(size = rel(1.2))) + 84 | ylim(0,NA) 85 | 86 | if(showplot){ 87 | show(ggobj) 88 | } 89 | if(returngg){ 90 | return(ggobj) 91 | } 92 | } 93 | 94 | 95 | -------------------------------------------------------------------------------- /man/plot_embedding.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot_embedding.R 3 | \name{plot_embedding} 4 | 
\alias{plot_embedding} 5 | \title{Plot selected PCs from an embedding} 6 | \usage{ 7 | plot_embedding( 8 | embedding, 9 | xpc = 1, 10 | ypc = xpc + 1, 11 | plot_title = paste0("Dim", xpc, " by Dim", ypc), 12 | color_vec = NULL, 13 | color_title = NULL, 14 | ellipse_vec = NULL, 15 | facet_vec = NULL, 16 | ptsize = 0.8, 17 | saveplot = FALSE, 18 | plotfn = paste(plot_title, xpc, sep = "_"), 19 | showplot = TRUE, 20 | returngg = FALSE, 21 | color_pal_vec = NULL, 22 | dimname = "Dim" 23 | ) 24 | } 25 | \arguments{ 26 | \item{embedding}{matrix or other tabular format where columns correspond to PCs and rows correspond to cells (entries). \code{corral} and \code{corralm} objects are also accepted.} 27 | 28 | \item{xpc}{int; which PC to put on the x-axis (defaults to 1)} 29 | 30 | \item{ypc}{int; which PC to put on the y-axis (defaults to the one after \code{xpc})} 31 | 32 | \item{plot_title}{char; title of plot (defaults to titling based on \code{xpc} and \code{ypc})} 33 | 34 | \item{color_vec}{vector; length should correspond to the number of rows in embedding, and each element of the vector classifies that cell (entry) in the embedding to that particular class, which will be colored the same. (e.g., this could be indicating which batch each cell is from)} 35 | 36 | \item{color_title}{char; what attribute the colors represent} 37 | 38 | \item{ellipse_vec}{vector; length should correspond to the number of rows in embedding, and each element of the vector classifies that cell (entry) in the embedding to that particular class, and elements of the same class will be circled in an ellipse. (e.g., this could be indicating the cell type or cell line; works best for attributes intended to be compact)} 39 | 40 | \item{facet_vec}{vector; length should correspond to the number of rows in embedding, and each element of the vector classifies that cell (entry) in the embedding to that particular class. Plot will be faceted by this attribute.} 41 | 42 | \item{ptsize}{numeric; the size of the points as passed to \code{geom_point()}. Defaults to 0.8.} 43 | 44 | \item{saveplot}{boolean; whether or not to save the plot, defaults \code{FALSE}} 45 | 46 | \item{plotfn}{char; what the filename is to be called. (defaults to making a name based on \code{plot_title} and \code{xpc})} 47 | 48 | \item{showplot}{boolean; whether or not to show the plot, defaults \code{TRUE}} 49 | 50 | \item{returngg}{boolean; whether or not to return a \code{\link{ggplot2}} object, defaults \code{FALSE}} 51 | 52 | \item{color_pal_vec}{char; hex codes for the color palette to be used. Default is to use the ggthemes few for plots with less than 9 colors, and to use/"stretch" pals polychrome if more colors are needed.} 53 | 54 | \item{dimname}{char; the name of the dimensions. 
defaults to "Dim"} 55 | } 56 | \value{ 57 | default none; options to display plot (\code{showplot}), save plot (\code{saveplot}), and/or return \code{\link{ggplot2}} object (\code{returngg}) 58 | } 59 | \description{ 60 | Plot selected PCs from an embedding 61 | } 62 | \examples{ 63 | listofmats <- list(matrix(sample(seq(0,20,1),1000,replace = TRUE),nrow = 20), 64 | matrix(sample(seq(0,20,1),1000,replace = TRUE),nrow = 20)) 65 | corralm_obj <- corralm(listofmats, ncomp = 5) 66 | embed_mat <- corralm_obj$v 67 | cell_type_vec <- sample(c('type1','type2','type3'),100,replace = TRUE) 68 | plot_embedding(embedding = embed_mat, 69 | xpc = 1, 70 | plot_title = 'corralm plot', 71 | color_vec = cell_type_vec, 72 | color_title = 'cell type', 73 | saveplot = FALSE) 74 | 75 | # or, call directly on the corralm object 76 | plot_embedding(corralm_obj) 77 | 78 | } 79 | -------------------------------------------------------------------------------- /R/evaluation.R: -------------------------------------------------------------------------------- 1 | # Functions for evaluating performance with respect to software / computing, as well as results for dimensionality reduction and batch integration. 2 | 3 | #' @keywords internal 4 | .cumtotal <- function(vals, ref){ 5 | return(sum(vals < ref)) 6 | } 7 | 8 | #' Observations --> discrete probabilities 9 | #' 10 | #' usage: 11 | #' embedding <- matrix(sample(x = seq(0,10,.1),200, replace = TRUE)) 12 | #' disc_probs <- obs2probs(embedding) 13 | #' 14 | #' @param obs vector of numeric, with the observations 15 | #' @param numbins int, the number of evenly sized bins to discretize the observations to 16 | #' @param startbin numeric, the starting value for the smallest bin. Defaults to taking the minimum of obs 17 | #' @param endbin numeric, the ending value for the largest bin. Defaults to taking the maximum of obs (plus a tiny decimal to ensure full range of obs is captured) 18 | #' 19 | #' @return dataframe, results has rows corresponding to each bin with columns for probability ('prob'), cumulative frequency ('cumfreq'), and frequency ('freq') of observations falling into that bin. 
The 'bins' column indicates the end of the bin (start is the preceding column) 20 | #' @keywords internal 21 | #' 22 | obs2probs <- function(obs, numbins = 100, startbin = min(obs), endbin = max(obs) + .00001){ 23 | bins <- seq(from = startbin, to = endbin, length.out = numbins) 24 | result <- data.frame(bins) 25 | result[1,'cumfreq'] <- 0 26 | result[1, 'freq'] <- 0 27 | for (ind in seq(2,numbins,1)){ 28 | cumsum <- .cumtotal(obs, bins[ind]) 29 | result[ind, 'cumfreq'] <- cumsum 30 | result[ind, 'freq'] <- cumsum - result[ind - 1, 'cumfreq'] 31 | } 32 | result$probs <- result$freq / length(obs) 33 | return(result) 34 | } 35 | 36 | #' @keywords internal 37 | .make_costmat <- function(matdim, mincost = 1, maxcost = 10){ 38 | abval_dif <- function(x,y) {return(abs(x-y))} 39 | costmat <- matrix(seq(mincost, maxcost,length.out = matdim), matdim, matdim) 40 | costmat <- sweep(costmat, 41 | MARGIN = 2, 42 | STATS = seq(mincost, maxcost, length.out = matdim), 43 | FUN = abval_dif) 44 | return(costmat) 45 | } 46 | 47 | 48 | #' Earthmover distance (and general Wasserstein distance) 49 | #' 50 | #' i.e., wasserstein distance with L1 (p_param = 1); can also use other penalties > 1 51 | #' (Not technically earthmover distance if using other p_param values) 52 | #' 53 | #' @param batch1 matrix; subset of observations from an embedding correponding to some attribute (e.g., batch or phenotype) 54 | #' @param batch2 matrix; subset of observations from an embedding correponding to some attribute (e.g., batch or phenotype) 55 | #' @param whichdim int; which dimension (i.e., column) from the embeddings is used. defaults on first 56 | #' @param numbins int; number of bins for the probability discretization (defaults to 100) 57 | #' @param p_param int; penalty parameter for general Wasserstein distance. Defaults to 1, which corresonds to earthmover. 58 | #' 59 | #' @return num; the distance 60 | #' @export 61 | #' 62 | #' @importFrom transport wasserstein 63 | #' 64 | #' @examples 65 | #' # To compare distributions of reduced dimension values to assess similarity, 66 | #' # e.g. 
as a metric for batch integration 67 | #' embedding <- matrix(sample(x = seq(0,10,.1),1000, replace = TRUE),ncol = 5) 68 | #' batch <- matrix(sample(c(1,2),200, replace = TRUE)) 69 | #' earthmover_dist(embedding[which(batch == 1),],embedding[which(batch == 2),]) 70 | earthmover_dist <- function(batch1, batch2, whichdim = 1, numbins = 100, p_param = 1){ 71 | minval <- min(min(batch1), min(batch2)) 72 | maxval <- max(max(batch1), max(batch2)) + .00001 73 | df_1 <- obs2probs(obs = batch1[,whichdim], numbins = numbins, startbin = minval, endbin = maxval) 74 | df_2 <- obs2probs(obs = batch2[,whichdim], numbins = numbins, startbin = minval, endbin = maxval) 75 | costmat <- .make_costmat(matdim = numbins) 76 | transport::wasserstein(df_1$probs, df_2$probs, costm = costmat, p = p_param) 77 | } 78 | -------------------------------------------------------------------------------- /man/corral.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/corral.R 3 | \name{corral_mat} 4 | \alias{corral_mat} 5 | \alias{corral_sce} 6 | \alias{corral} 7 | \alias{print.corral} 8 | \title{corral: Correspondence analysis on a single matrix} 9 | \usage{ 10 | corral_mat( 11 | inp, 12 | method = c("irl", "svd"), 13 | ncomp = 30, 14 | row.w = NULL, 15 | col.w = NULL, 16 | rtype = c("standardized", "indexed", "hellinger", "freemantukey", "pearson"), 17 | vst_mth = c("none", "sqrt", "freemantukey", "anscombe"), 18 | ... 19 | ) 20 | 21 | corral_sce( 22 | inp, 23 | method = c("irl", "svd"), 24 | ncomp = 30, 25 | whichmat = "counts", 26 | fullout = FALSE, 27 | subset_row = NULL, 28 | ... 29 | ) 30 | 31 | corral(inp, ...) 32 | 33 | \method{print}{corral}(x, ...) 34 | } 35 | \arguments{ 36 | \item{inp}{matrix (any type), \code{SingleCellExperiment}, or \code{SummarizedExperiment}. If using \code{SingleCellExperiment} or \code{SummarizedExperiment}, then include the \code{whichmat} argument to specify which slot to use (defaults to \code{counts}).} 37 | 38 | \item{method}{character, the algorithm to be used for svd. Default is irl. Currently supports 'irl' for irlba::irlba or 'svd' for stats::svd} 39 | 40 | \item{ncomp}{numeric, number of components; Default is 30} 41 | 42 | \item{row.w}{numeric vector; the row weights to use in chi-squared scaling. Defaults to `NULL`, in which case row weights are computed from the input matrix.} 43 | 44 | \item{col.w}{numeric vector; the column weights to use in chi-squared scaling. For instance, size factors could be given here. Defaults to `NULL`, in which case column weights are computed from the input matrix.} 45 | 46 | \item{rtype}{character indicating what type of residual should be computed; options are `"indexed"`, `"standardized"` (or `"pearson"` is equivalent), `"freemantukey"`, and `"hellinger"`; defaults to `"standardized"` for \code{\link{corral}} and `"indexed"` for \code{\link{corralm}}. `"indexed"`, `"standardized"`, and `"freemantukey"` compute the respective chi-squared residuals and are appropriate for count data. 
The `"hellinger"` option is appropriate for continuous data.} 47 | 48 | \item{vst_mth}{character indicating whether a variance-stabilizing transform should be applied prior to calculating chi-squared residuals; defaults to `"none"`} 49 | 50 | \item{...}{(additional arguments for methods)} 51 | 52 | \item{whichmat}{character; defaults to \code{counts}, can also use \code{logcounts} or \code{normcounts} if stored in the \code{sce} object} 53 | 54 | \item{fullout}{boolean; whether the function will return the full \code{corral} output as a list, or a SingleCellExperiment; defaults to SingleCellExperiment (\code{FALSE}). To get back the \code{\link{corral_mat}}-style output, set this to \code{TRUE}.} 55 | 56 | \item{subset_row}{numeric, character, or boolean vector; the rows to include in corral, as indices (numeric), rownames (character), or with booleans (same length as the number of rows in the matrix). If this parameter is \code{NULL}, then all rows will be used.} 57 | 58 | \item{x}{(print method) corral object; the list output from \code{corral_mat}} 59 | } 60 | \value{ 61 | When run on a matrix, a list with the correspondence analysis matrix decomposition result: 62 | \describe{ 63 | \item{\code{d}}{a vector of the diagonal singular values of the input \code{mat} (from SVD output)} 64 | \item{\code{u}}{a matrix of with the left singular vectors of \code{mat} in the columns (from SVD output)} 65 | \item{\code{v}}{a matrix of with the right singular vectors of \code{mat} in the columns. When cells are in the columns, these are the cell embeddings. (from SVD output)} 66 | \item{\code{eigsum}}{sum of the eigenvalues for calculating percent variance explained} 67 | \item{\code{SCu and SCv}}{standard coordinates, left and right, respectively} 68 | \item{\code{PCu and PCv}}{principal coordinates, left and right, respectively} 69 | } 70 | 71 | When run on a \code{\link{SingleCellExperiment}}, returns a SCE with the embeddings (PCv from the full corral output) in the \code{reducedDim} slot \code{corral} (default). Also can return the same output as \code{\link{corral_mat}} when \code{fullout} is set to \code{TRUE}. 72 | 73 | For matrix and \code{SummarizedExperiment} input, returns list with the correspondence analysis matrix decomposition result (u,v,d are the raw svd output; SCu and SCv are the standard coordinates; PCu and PCv are the principal coordinates) 74 | 75 | For \code{SummarizedExperiment} input, returns the same as for a matrix. 76 | 77 | . 78 | } 79 | \description{ 80 | corral can be used for dimension reduction to find a set of low-dimensional embeddings for a count matrix. 81 | 82 | \code{corral} is a wrapper for \code{\link{corral_mat}} and \code{\link{corral_sce}}, and can be called on any of the acceptable input types. 
83 | } 84 | \examples{ 85 | mat <- matrix(sample(0:10, 5000, replace=TRUE), ncol=50) 86 | result <- corral_mat(mat) 87 | result <- corral_mat(mat, method = 'irl', ncomp = 5) 88 | 89 | library(DuoClustering2018) 90 | sce <- sce_full_Zhengmix4eq()[1:100,1:100] 91 | result_1 <- corral_sce(sce) 92 | result_2 <- corral_sce(sce, method = 'svd') 93 | result_3 <- corral_sce(sce, method = 'irl', ncomp = 30, whichmat = 'logcounts') 94 | 95 | 96 | library(DuoClustering2018) 97 | sce <- sce_full_Zhengmix4eq()[1:100,1:100] 98 | corral_sce <- corral(sce,whichmat = 'counts') 99 | 100 | mat <- matrix(sample(0:10, 500, replace=TRUE), ncol=25) 101 | corral_mat <- corral(mat, ncomp=5) 102 | 103 | mat <- matrix(sample(1:100, 10000, replace = TRUE), ncol = 100) 104 | corral(mat) 105 | } 106 | -------------------------------------------------------------------------------- /man/corralm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/corralm.R 3 | \name{corralm_matlist} 4 | \alias{corralm_matlist} 5 | \alias{corralm_sce} 6 | \alias{corralm} 7 | \alias{print.corralm} 8 | \title{Multi-table correspondence analysis (list of matrices)} 9 | \usage{ 10 | corralm_matlist( 11 | matlist, 12 | method = c("irl", "svd"), 13 | ncomp = 30, 14 | rtype = c("indexed", "standardized", "hellinger", "freemantukey", "pearson"), 15 | vst_mth = c("none", "sqrt", "freemantukey", "anscombe"), 16 | rw_contrib = NULL, 17 | ... 18 | ) 19 | 20 | corralm_sce( 21 | sce, 22 | splitby, 23 | method = c("irl", "svd"), 24 | ncomp = 30, 25 | whichmat = "counts", 26 | fullout = FALSE, 27 | rw_contrib = NULL, 28 | ... 29 | ) 30 | 31 | corralm(inp, whichmat = "counts", fullout = FALSE, ...) 32 | 33 | \method{print}{corralm}(x, ...) 34 | } 35 | \arguments{ 36 | \item{matlist}{(for \code{corralm_matlist}) list of input matrices; input matrices should be counts (raw or log). Matrices should be aligned row-wise by common features (either by sample or by gene)} 37 | 38 | \item{method}{character, the algorithm to be used for svd. Default is irl. Currently supports 'irl' for irlba::irlba or 'svd' for stats::svd} 39 | 40 | \item{ncomp}{numeric, number of components; Default is 30} 41 | 42 | \item{rtype}{character indicating what type of residual should be computed; options are `"indexed"`, `"standardized"` (or `"pearson"` is equivalent), `"freemantukey"`, and `"hellinger"`; defaults to `"standardized"` for \code{\link{corral}} and `"indexed"` for \code{\link{corralm}}. `"indexed"`, `"standardized"`, and `"freemantukey"` compute the respective chi-squared residuals and are appropriate for count data. The `"hellinger"` option is appropriate for continuous data.} 43 | 44 | \item{vst_mth}{character indicating whether a variance-stabilizing transform should be applied prior to calculating chi-squared residuals; defaults to `"none"`} 45 | 46 | \item{rw_contrib}{numeric vector, same length as the matlist. Indicates the weight that each dataset should contribute to the row weights. When set to NULL the row weights are *not* combined and each matrix is scaled independently (i.e., using their observed row weights, respectively). When set to a vector of all the same values, this is equivalent to taking the mean. Another option is to the number of observations per matrix to create a weighted mean. Regardless of input scale, row weights for each table must sum to 1 and thus are scaled. 
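For example, for two batches with 100 and 400 observations respectively, `rw_contrib = c(100, 400)` (values shown only for illustration) would form a weighted mean of the observed row weights, weighted by batch size.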
When this option is specified (i.e., not `NULL`), the `rtype` argument will automatically be set to `standardized`, and whatever argument is given will be ignored.} 47 | 48 | \item{...}{(additional arguments for methods)} 49 | 50 | \item{sce}{(for \code{corralm_sce}) SingleCellExperiment; containing the data to be integrated. Default is to use the counts, and to include all of the data in the integration. These can be changed by passing additional arguments. See \code{\link{sce2matlist}} function documentation for list of available parameters.} 51 | 52 | \item{splitby}{character; name of the attribute from \code{colData} that should be used to separate the SCE.} 53 | 54 | \item{whichmat}{char, when using SingleCellExperiment or other SummarizedExperiment, can be specified. default is 'counts'.} 55 | 56 | \item{fullout}{boolean; whether the function will return the full \code{corralm} output as a list, or a SingleCellExperiment; defaults to SingleCellExperiment (\code{FALSE}). To get back the \code{\link{corralm_matlist}}-style output, set this to \code{TRUE}.} 57 | 58 | \item{inp}{list of matrices (any type), a \code{SingleCellExperiment}, list of \code{SingleCellExperiment}s, list of \code{SummarizedExperiment}s, or \code{MultiAssayExperiment}. If using \code{SingleCellExperiment} or \code{SummarizedExperiment}, then include the \code{whichmat} argument to specify which slot to use (defaults to \code{counts}). Additionally, if it is one \code{SingleCellExperiment}, then it is also necessary to include the \code{splitby} argument to specify the batches. For a \code{MultiAssayExperiment}, it will take the intersect of the features across all the assays, and use those to match the matrices; to use a different subset, select desired subsets then call \code{corral}} 59 | 60 | \item{x}{(print method) corralm object; the list output from \code{corralm_matlist}} 61 | } 62 | \value{ 63 | When run on a list of matrices, a list with the correspondence analysis matrix decomposition result, with indices corresponding to the concatenated matrices (in order of the list): 64 | \describe{ 65 | \item{\code{d}}{a vector of the diagonal singular values of the input \code{mat} (from SVD output)} 66 | \item{\code{u}}{a matrix of with the left singular vectors of \code{mat} in the columns (from SVD output)} 67 | \item{\code{v}}{a matrix of with the right singular vectors of \code{mat} in the columns. When cells are in the columns, these are the cell embeddings. (from SVD output)} 68 | \item{\code{eigsum}}{sum of the eigenvalues for calculating percent variance explained} 69 | } 70 | 71 | For SingleCellExperiment input, returns the SCE with embeddings in the reducedDim slot 'corralm' 72 | 73 | For a list of \code{\link{SingleCellExperiment}}s, returns a list of the SCEs with the embeddings in the respective \code{reducedDim} slot 'corralm' 74 | 75 | . 76 | } 77 | \description{ 78 | This multi-table adaptation of correpondence analysis applies the same scaling technique and enables data alignment by finding a set of embeddings for each dataset within shared latent space. 79 | } 80 | \details{ 81 | \code{corralm} is a wrapper for \code{\link{corralm_matlist}} and \code{\link{corralm_sce}}, and can be called on any of the acceptable input types (see \code{inp} below). 
82 | } 83 | \examples{ 84 | listofmats <- list(matrix(sample(seq(0,20,1),1000,replace = TRUE),nrow = 25), 85 | matrix(sample(seq(0,20,1),1000,replace = TRUE),nrow = 25)) 86 | result <- corralm_matlist(listofmats) 87 | library(DuoClustering2018) 88 | library(SingleCellExperiment) 89 | sce <- sce_full_Zhengmix4eq()[1:100,sample(1:3500,100,replace = FALSE)] 90 | colData(sce)$Method <- matrix(sample(c('Method1','Method2'),100,replace = TRUE)) 91 | result <- corralm_sce(sce, splitby = 'Method') 92 | 93 | 94 | listofmats <- list(matrix(sample(seq(0,20,1),1000,replace = TRUE),nrow = 20), 95 | matrix(sample(seq(0,20,1),1000,replace = TRUE),nrow = 20)) 96 | corralm(listofmats) 97 | 98 | library(DuoClustering2018) 99 | library(SingleCellExperiment) 100 | sce <- sce_full_Zhengmix4eq()[seq(1,100,1),sample(seq(1,3500,1),100,replace = FALSE)] 101 | colData(sce)$Method <- matrix(sample(c('Method1','Method2'),100,replace = TRUE)) 102 | result <- corralm(sce, splitby = 'Method') 103 | 104 | # default print method for corralm objects 105 | } 106 | -------------------------------------------------------------------------------- /vignettes/corralm_alignment.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Alignment & batch integration of single cell data with corralm" 3 | author: 4 | - name: Lauren Hsu 5 | affiliation: Harvard TH Chan School of Public Health; Dana-Farber Cancer Institute 6 | - name: Aedin Culhane 7 | affiliation: Department of Data Sciences, Dana-Farber Cancer Institute, Department of Biostatistics, Harvard TH Chan School of Public Health 8 | date: "4/28/2020" 9 | output: 10 | BiocStyle::html_document: 11 | toc_float: true 12 | BiocStyle::pdf_document: default 13 | package: BiocStyle 14 | bibliography: ref.bib 15 | vignette: | 16 | %\VignetteIndexEntry{alignment with corralm} 17 | %\VignetteEngine{knitr::rmarkdown} 18 | %\VignetteEncoding{UTF-8} 19 | --- 20 | --- 21 | 22 | ```{r setup, include=FALSE} 23 | knitr::opts_chunk$set(echo = TRUE) 24 | library(gridExtra) 25 | ``` 26 | 27 | # Introduction 28 | 29 | Data from different experimental platforms and/or batches exhibit systematic 30 | variation -- i.e., batch effects. Therefore, when conducting joint analysis 31 | of data from different batches, a key first step is to align the datasets. 32 | 33 | `corralm` is a multi-table adaptation of correspondence analysis designed 34 | for single-cell data, which applies multi-dimensional optimized scaling and 35 | matrix factorization to compute integrated embeddings across the datasets. 36 | These embeddings can then be used in downstream analyses, such as clustering, 37 | cell type classification, trajectory analysis, etc. 38 | 39 | See the vignette for `corral` for dimensionality reduction of a single matrix of single-cell data. 40 | 41 | # Loading packages and data 42 | 43 | We will use the `SCMixology` datasets from the `r Biocpkg('CellBench')` package [@scmix]. 44 | 45 | ```{r, message = FALSE} 46 | library(corral) 47 | library(SingleCellExperiment) 48 | library(ggplot2) 49 | library(CellBench) 50 | library(MultiAssayExperiment) 51 | 52 | scmix_dat <- load_all_data()[1:3] 53 | 54 | ``` 55 | 56 | These datasets include a mixture of three lung cancer cell lines: 57 | 58 | * H2228 59 | * H1975 60 | * HCC827 61 | 62 | which was sequenced using three platforms: 63 | 64 | * 10X 65 | * CELseq2 66 | * Dropseq 67 | 68 | ```{r} 69 | scmix_dat 70 | ``` 71 | 72 | Each sequencing platform captures a different set of genes. 
73 | In order to apply this method, the matrices need to be matched by features (i.e., genes). 74 | We'll find the intersect of the three datasets, then subset for that as we proceed. 75 | 76 | First, we will prepare the data by: 77 | 1. adding to the colData the sequencing platform (`Method` in `colData` for each SCE), and 78 | 2. subsetting by the intersect of the genes. 79 | 80 | ```{r} 81 | platforms <- c('10X','CELseq2','Dropseq') 82 | for(i in seq_along(scmix_dat)) { 83 | colData(scmix_dat[[i]])$Method<- rep(platforms[i], ncol(scmix_dat[[i]])) 84 | } 85 | 86 | scmix_mae <- as(scmix_dat,'MultiAssayExperiment') 87 | scmix_dat <- as.list(MultiAssayExperiment::experiments(MultiAssayExperiment::intersectRows(scmix_mae))) 88 | 89 | ``` 90 | 91 | ![](corralm_IO.png) 92 | 93 | `corralm` can be applied to the following types of objects: 94 | 95 | * **a single `r Biocpkg('SingleCellExperiment')`** requires specifying `splitby` (also see documentation of `corralm_matlist` for additional optional arguments that can be passed), which is a character string for the attribute in `colData` that is tracking the batches. In our case, this would be the "Method" attribute we just added. The output from this type of input is the same `SingleCellExperiment`, with the result added to the `reducedDim` slot under `corralm`. 96 | * **a list of `r Biocpkg('SingleCellExperiment')`** does not require any specific arguments. The output is a list of the input `SingleCellExperiment`s, with the result added to the `reducedDim` slot under `corralm`. 97 | * **a list of matrices** (or other matrix-like objects: matrix, Matrix, tibble, data.frame, etc.) also does not require specific arguments. The output will be a concatenated list of SVD output matrices (`u`,`d`,`v`) where `v` contains a concatenated vector of the embeddings for the cells 98 | * **a list of `r Biocpkg('SummarizedExperiment')`s** does not require specific arguments. The output is the same as for a list of matrices. 99 | * **`r Biocpkg('MultiAssayExperiment')` or `ExperimentList`** does not require any specific arguments. `corralm` will identify the intersect of the rows, and use these to match the matrices. The output will be the same as for a list of matrices. 100 | 101 | For purposes of illustration, we will walk through using `corralm` with a single SCE, and with a list of matrices. 102 | 103 | # `corralm` on a single `r Biocpkg('SingleCellExperiment')` 104 | 105 | First, setting up the data to demonstrate this: 106 | ```{r} 107 | colData(scmix_dat[[2]])$non_ERCC_percent <- NULL 108 | # need to remove this column so the objects can be concatenated 109 | 110 | scmix_sce <- SingleCellExperiment::cbind(scmix_dat[[1]], 111 | scmix_dat[[2]], 112 | scmix_dat[[3]]) 113 | ``` 114 | 115 | Running `corralm`, and specifying the `splitby` argument: 116 | (Note that the default is for the `counts` matrix to be used. 117 | To change this default, use the `whichmat` argument.) 
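For example, `corralm(scmix_sce, splitby = 'Method', whichmat = 'logcounts')` would use a `logcounts` assay instead; this call is shown only for illustration, since no such assay is computed in this vignette.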
118 | ```{r} 119 | scmix_sce <- corralm(scmix_sce, splitby = 'Method') 120 | ``` 121 | 122 | Visualizing the results: 123 | ```{r} 124 | plot_embedding_sce(sce = scmix_sce, 125 | which_embedding = 'corralm', 126 | color_attr = 'Method', 127 | color_title = 'platform', 128 | ellipse_attr = 'cell_line', 129 | plot_title = 'corralm on scmix', 130 | saveplot = FALSE) 131 | ``` 132 | 133 | # `corralm` on a list of matrices 134 | 135 | Again, preparing the data to be in this input format: 136 | ```{r} 137 | scmix_matlist <- sce2matlist(sce = scmix_sce, 138 | splitby = 'Method', 139 | whichmat = 'counts') 140 | 141 | # for plotting purposes later, while we're here 142 | platforms <- colData(scmix_sce)$Method 143 | cell_lines <- colData(scmix_sce)$cell_line 144 | ``` 145 | 146 | Running corralm and visualizing output... 147 | (the embeddings are in the `v` matrix because these data are matched by genes 148 | in the rows and have cells in the columns; if this were reversed, with cells 149 | in the rows and genes/features in the column, then the cell embeddings would 150 | instead be in the `u` matrix.) 151 | ```{r} 152 | scmix_corralm <- corralm(scmix_matlist) 153 | scmix_corralm 154 | plot_embedding(embedding = scmix_corralm$v, 155 | plot_title = 'corralm on scmix', 156 | color_vec = platforms, 157 | color_title = 'platform', 158 | ellipse_vec = cell_lines, 159 | saveplot = FALSE) 160 | ``` 161 | 162 | As expected, we get the same results as above. (Note that in performing SVD, 163 | the direction of the axes doesn't matter and they may be flipped between runs, 164 | as `corral` and `corralm` use `irlba` to perform fast approximation.) 165 | 166 | # Scaled variance plots to evaluate integration 167 | 168 | Scaled variance plots provide a simple and fast visual summary of the integration of embeddings. It can be called using the `scal_var` function, and works on both `corralm` objects and custom embeddings (with a vector indicating batch). 169 | 170 | When integrating embedding representations across batches, measures for cluster evaluation are effective for assessing group compactness and recovery of cell populations via clustering. However, they do not directly assess how well dataset embeddings are integrated across batches. To focus specifically on batch integration, we developed and applied a heuristic scaled variance metric, which captures the relative dispersion of each batch with respect to the entire dataset. The scaled variance of component dimension $d^*$ for the subset of observations in batch $b^*$, $SV_{b^*,d}$, is computed with: 171 | $$SV_{b^*,d} = \frac{\mathrm{Var}(\mathbf{E_{b=b^*,d=d^*}})}{\mathrm{Var}(\mathbf{E_{d=d^*}})}$$ 172 | where $\mathbf{E}$ is the matrix of embeddings, and $b$ indexes the rows (observations by batch) while $d$ indexes the columns to indicate which component dimension to evaluate. 173 | 174 | ```{r} 175 | scal_var(scmix_corralm) 176 | ``` 177 | 178 | When the datasets are well integrated, SV values for each batch are close to 1, indicating that each batch's subset has similar dispersion as compared to the entire embedding. In contrast, if there is poorer integration, the scaled variance values will be more extreme away from 1 because the variance within batches will differ more from the variance overall. 
This metric is appropriate when the types of cells represented in different datasets are expected to be similar, but cannot account for situations where the expected distribution of cell types (and therefore, embeddings) is fundamentally different between batches. 179 | 180 | # Session information 181 | 182 | ```{r} 183 | sessionInfo() 184 | ``` 185 | 186 | # References 187 | -------------------------------------------------------------------------------- /R/corralm.R: -------------------------------------------------------------------------------- 1 | #' Multi-table correspondence analysis (list of matrices) 2 | #' 3 | #' @param matlist (for \code{corralm_matlist}) list of input matrices; input matrices should be counts (raw or log). Matrices should be aligned row-wise by common features (either by sample or by gene) 4 | #' @param rw_contrib numeric vector, same length as the matlist. Indicates the weight that each dataset should contribute to the row weights. When set to NULL the row weights are *not* combined and each matrix is scaled independently (i.e., using their observed row weights, respectively). When set to a vector of all the same values, this is equivalent to taking the mean. Another option is to the number of observations per matrix to create a weighted mean. Regardless of input scale, row weights for each table must sum to 1 and thus are scaled. When this option is specified (i.e., not `NULL`), the `rtype` argument will automatically be set to `standardized`, and whatever argument is given will be ignored. 5 | #' @inheritParams compsvd 6 | #' @inheritParams corral_preproc 7 | #' 8 | #' @return When run on a list of matrices, a list with the correspondence analysis matrix decomposition result, with indices corresponding to the concatenated matrices (in order of the list): 9 | #' \describe{ 10 | #' \item{\code{d}}{a vector of the diagonal singular values of the input \code{mat} (from SVD output)} 11 | #' \item{\code{u}}{a matrix of with the left singular vectors of \code{mat} in the columns (from SVD output)} 12 | #' \item{\code{v}}{a matrix of with the right singular vectors of \code{mat} in the columns. When cells are in the columns, these are the cell embeddings. (from SVD output)} 13 | #' \item{\code{eigsum}}{sum of the eigenvalues for calculating percent variance explained} 14 | #' } 15 | #' @rdname corralm 16 | #' @export 17 | #' 18 | #' @importFrom irlba irlba 19 | #' @importFrom Matrix Matrix rowSums colSums 20 | #' @importClassesFrom Matrix dgCMatrix 21 | #' 22 | #' @examples 23 | #' listofmats <- list(matrix(sample(seq(0,20,1),1000,replace = TRUE),nrow = 25), 24 | #' matrix(sample(seq(0,20,1),1000,replace = TRUE),nrow = 25)) 25 | #' result <- corralm_matlist(listofmats) 26 | corralm_matlist <- function(matlist, method = c('irl','svd'), ncomp = 30, rtype = c('indexed','standardized','hellinger','freemantukey','pearson'), vst_mth = c('none','sqrt','freemantukey','anscombe'), rw_contrib = NULL, ...){ 27 | method <- match.arg(method, c('irl','svd')) 28 | rtype <- match.arg(rtype, c('indexed','standardized','hellinger','freemantukey','pearson')) 29 | vst_mth <- match.arg(vst_mth, c('none','sqrt','freemantukey','anscombe')) 30 | 31 | .check_dims(matlist) 32 | if(is.null(rw_contrib)){ 33 | preproc_mats <- lapply(matlist, corral_preproc, rtype = rtype, vst_mth = vst_mth, ...) 
34 | } 35 | else{ 36 | rw_contrib <- .check_rw_contrib(matlist, rw_contrib) 37 | 38 | ws <- lapply(matlist, get_weights) 39 | rwlist <- lapply(ws, '[[', 'row.w') 40 | row.ws <- Reduce(cbind, rwlist) 41 | row.ws <- row.ws %*% rw_contrib 42 | comp_rw <- rowSums(row.ws) / sum(row.ws) 43 | 44 | preproc_mats <- lapply(matlist, corral_preproc, rtype = 'standardized', row.w = comp_rw, vst_mth = vst_mth, ...) 45 | if(rtype != 'standardized'){ 46 | cat('\nSwitched residual type to standardized; using shared row weights.\n') 47 | } 48 | } 49 | concatted <- list2mat(matlist = preproc_mats, direction = 'c') 50 | result <- compsvd(concatted, method = method, ncomp = ncomp) 51 | result[['batch_sizes']] <- .batch_sizes(matlist) 52 | class(result) <- c(class(result),"corralm") 53 | return(result) 54 | } 55 | 56 | #' Multi-table correspondence analysis (SingleCellExperiment) 57 | #' 58 | #' @param sce (for \code{corralm_sce}) SingleCellExperiment; containing the data to be integrated. Default is to use the counts, and to include all of the data in the integration. These can be changed by passing additional arguments. See \code{\link{sce2matlist}} function documentation for list of available parameters. 59 | #' @param splitby character; name of the attribute from \code{colData} that should be used to separate the SCE. 60 | #' @param whichmat character; defaults to \code{counts}, can also use \code{logcounts} or \code{normcounts} if stored in the \code{sce} object 61 | #' @param fullout boolean; whether the function will return the full \code{corralm} output as a list, or a SingleCellExperiment; defaults to SingleCellExperiment (\code{FALSE}). To get back the \code{\link{corralm_matlist}}-style output, set this to \code{TRUE}. 62 | #' @inheritParams compsvd 63 | #' @param ... (additional arguments for methods) 64 | #' 65 | #' @return For SingleCellExperiment input, returns the SCE with embeddings in the reducedDim slot 'corralm' 66 | #' @rdname corralm 67 | #' @export 68 | #' 69 | #' @importFrom irlba irlba 70 | #' @importFrom Matrix Matrix rowSums colSums 71 | #' @importFrom SingleCellExperiment reducedDim 72 | #' @importClassesFrom Matrix dgCMatrix 73 | #' @importClassesFrom SingleCellExperiment SingleCellExperiment 74 | #' 75 | #' @examples 76 | #' library(DuoClustering2018) 77 | #' library(SingleCellExperiment) 78 | #' sce <- sce_full_Zhengmix4eq()[1:100,sample(1:3500,100,replace = FALSE)] 79 | #' colData(sce)$Method <- matrix(sample(c('Method1','Method2'),100,replace = TRUE)) 80 | #' result <- corralm_sce(sce, splitby = 'Method') 81 | #' 82 | #' 83 | corralm_sce <- function(sce, splitby, method = c('irl','svd'), ncomp = 30, whichmat = 'counts', fullout = FALSE, rw_contrib = NULL, ...){ 84 | method <- match.arg(method, c('irl','svd')) 85 | if(missing(splitby)) {stop('If performing multi-table analysis with a single SCE, the splitby variable must be specified. \nUse corral to analyze as a single table.')} 86 | mat_list <- sce2matlist(sce, splitby = splitby, whichmat = whichmat) 87 | svd_output <- corralm_matlist(mat_list, method = method, ncomp = ncomp, rw_contrib = rw_contrib, ...) 
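  # fullout = TRUE returns the full decomposition list as-is;
  # otherwise the cell embeddings (v) are stored in reducedDim(sce, 'corralm') below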
88 | if(fullout){ 89 | class(svd_output) <- c(class(svd_output),'corralm') 90 | return(svd_output) 91 | } 92 | else{ 93 | ind_order <- .indsbysplitby(sce, splitby) 94 | SingleCellExperiment::reducedDim(sce, 'corralm') <- svd_output$v[ind_order,] 95 | return(sce) 96 | } 97 | } 98 | 99 | #' Multi-table adaptation of correspondence analysis 100 | #' 101 | #' This multi-table adaptation of correpondence analysis applies the same scaling technique and enables data alignment by finding a set of embeddings for each dataset within shared latent space. 102 | #' 103 | #' \code{corralm} is a wrapper for \code{\link{corralm_matlist}} and \code{\link{corralm_sce}}, and can be called on any of the acceptable input types (see \code{inp} below). 104 | #' 105 | #' @param inp list of matrices (any type), a \code{SingleCellExperiment}, list of \code{SingleCellExperiment}s, list of \code{SummarizedExperiment}s, or \code{MultiAssayExperiment}. If using \code{SingleCellExperiment} or \code{SummarizedExperiment}, then include the \code{whichmat} argument to specify which slot to use (defaults to \code{counts}). Additionally, if it is one \code{SingleCellExperiment}, then it is also necessary to include the \code{splitby} argument to specify the batches. For a \code{MultiAssayExperiment}, it will take the intersect of the features across all the assays, and use those to match the matrices; to use a different subset, select desired subsets then call \code{corral} 106 | #' @param whichmat char, when using SingleCellExperiment or other SummarizedExperiment, can be specified. default is 'counts'. 107 | #' @param ... (additional arguments for methods) 108 | #' @inheritParams corralm_matlist 109 | #' @inheritParams corralm_sce 110 | #' 111 | #' @return For a list of \code{\link{SingleCellExperiment}}s, returns a list of the SCEs with the embeddings in the respective \code{reducedDim} slot 'corralm' 112 | #' @rdname corralm 113 | #' @export 114 | #' 115 | #' @importFrom irlba irlba 116 | #' @importFrom Matrix Matrix rowSums colSums 117 | #' @importFrom methods is 118 | #' @importFrom SingleCellExperiment reducedDim 119 | #' @importFrom MultiAssayExperiment experiments intersectRows assays 120 | #' @importFrom SummarizedExperiment assay 121 | #' @importClassesFrom Matrix dgCMatrix 122 | #' @importClassesFrom SingleCellExperiment SingleCellExperiment 123 | #' @importClassesFrom MultiAssayExperiment MultiAssayExperiment 124 | #' 125 | #' @examples 126 | #' listofmats <- list(matrix(sample(seq(0,20,1),1000,replace = TRUE),nrow = 20), 127 | #' matrix(sample(seq(0,20,1),1000,replace = TRUE),nrow = 20)) 128 | #' corralm(listofmats) 129 | #' 130 | #' library(DuoClustering2018) 131 | #' library(SingleCellExperiment) 132 | #' sce <- sce_full_Zhengmix4eq()[seq(1,100,1),sample(seq(1,3500,1),100,replace = FALSE)] 133 | #' colData(sce)$Method <- matrix(sample(c('Method1','Method2'),100,replace = TRUE)) 134 | #' result <- corralm(sce, splitby = 'Method') 135 | #' 136 | corralm <- function(inp, whichmat = 'counts', fullout = FALSE,...){ 137 | if(is(inp,'SingleCellExperiment')){ 138 | corralm_sce(sce = inp, ...) 139 | } 140 | else if (is(inp,'ExperimentList')){ 141 | matlist <- as.list(MultiAssayExperiment::assays(MultiAssayExperiment::intersectRows(inp))) 142 | corralm_matlist(matlist = matlist, ...) 143 | } 144 | else if (is(inp,'MultiAssayExperiment')){ 145 | matlist <- as.list(MultiAssayExperiment::assays(MultiAssayExperiment::experiments(MultiAssayExperiment::intersectRows(inp)))) 146 | corralm_matlist(matlist = matlist, ...) 
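    # assays are matched on their shared features via intersectRows before factorization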
147 | } 148 | else if(is(inp,'list') | is(inp,'List')){ 149 | if(all_are(inp,'SingleCellExperiment')){ 150 | matlist <- lapply(inp, SummarizedExperiment::assay, whichmat) 151 | res <- corralm_matlist(matlist = matlist, ...) 152 | if(fullout) { 153 | class(res) <- c(class(res),"corralm") 154 | return(res) 155 | } 156 | else{ 157 | add_embeddings2scelist(scelist = inp, embeddings = res$v) 158 | } 159 | } 160 | else if (all_are(inp,'SummarizedExperiment')){ 161 | matlist <- lapply(inp, SummarizedExperiment::assay, whichmat) 162 | corralm_matlist(matlist = matlist, ...) 163 | } 164 | else{ 165 | corralm_matlist(matlist = inp, ...) 166 | } 167 | } 168 | } 169 | 170 | #' Print method for S3 object corralm 171 | #' 172 | #' @param x (print method) corralm object; the list output from \code{corralm_matlist} 173 | #' 174 | #' @rdname corralm 175 | #' 176 | #' @return . 177 | #' @export 178 | #' 179 | #' @examples 180 | #' # default print method for corralm objects 181 | print.corralm <- function(x,...){ 182 | inp <- x 183 | pct_var_exp <- inp$pct_var_exp 184 | ncomp <- length(inp$d) 185 | cat('corralm output summary==========================================\n') 186 | cat(' Output "list" includes SVD output (u, d, v) & a table of the\n') 187 | cat(' dimensions of the input matrices (batch_sizes)\n') 188 | cat('Variance explained----------------------------------------------\n') 189 | print(round(pct_var_exp[,seq(1,min(8,ncomp),1)],2)) 190 | cat('\n') 191 | cat('Dimensions of output elements-----------------------------------\n') 192 | cat(' Singular values (d) :: ') 193 | cat(ncomp) 194 | cat('\n Left singular vectors (u) :: ') 195 | cat(dim(inp$u)) 196 | cat('\n Right singular vectors (v) :: ') 197 | cat(dim(inp$v)) 198 | cat('\n See corralm help for details on each output element.') 199 | cat('\n\n') 200 | cat('Original batches & sizes (in order)-----------------------------') 201 | cat('\n ') 202 | cat(paste0(' ',rownames(inp$batch_sizes),' :: ',inp$batch_sizes[,2],'\n')) 203 | cat('\n Use plot_embedding to visualize; see docs for details.\n') 204 | cat('================================================================\n') 205 | } 206 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | #' Set na to 0 2 | #' 3 | #' @param x matrix of values for which na values should be changed to 0 4 | #' 5 | #' @return matrix, where na values are set to 0 6 | #' @export 7 | #' 8 | #' @examples 9 | #' x <- matrix(sample(0:10, 5000, replace = TRUE), ncol = 25) 10 | #' x[sample(1:5000, 10)] <- NA 11 | #' 12 | #' na2zero(x) 13 | na2zero <- function(x) { 14 | func <- function(x){ 15 | if (is.na(x)) return(0) 16 | else return(x)} 17 | 18 | return(apply(x, c(1,2), func)) 19 | } 20 | 21 | #' List to Matrix 22 | #' 23 | #' @param matlist list of matrices to concatenate 24 | #' @param direction character, r or c, to indicate whether should be row-wise (i.e., rbind to match on columns) or column-wise (i.e., cbind to match on rows). 
Defaults to columnwise (matching on rows) to match convention of SingleCellExperiments 25 | #' 26 | #' @return matrix 27 | #' @export 28 | #' 29 | #' @examples 30 | #' listofmats <- list(matrix(sample(seq(0,20,1),100,replace = TRUE),nrow = 10), 31 | #' matrix(sample(seq(0,20,1),1000,replace = TRUE),nrow = 10)) 32 | #' newmat <- list2mat(listofmats) # to "cbind" them 33 | #' listofmats_t <- lapply(listofmats,t) 34 | #' newmat_t <- list2mat(listofmats_t, 'r') # to "rbind" them 35 | list2mat <- function(matlist, direction = c('c','r')[1]){ 36 | if (direction == 'r') concatted <- Reduce(rbind, matlist) 37 | if (direction == 'c') concatted <- Reduce(cbind, matlist) 38 | return(concatted) 39 | } 40 | 41 | #' Compute percent of variance explained 42 | #' 43 | #' @param thissvd list outputted from an svd function (svd, irlba; can also take output from \code{\link{corral_mat}} and \code{\link{corralm_matlist}}) 44 | #' @param preproc_mat matrix of pre-processed values (optional) - important to include if the svd is only partial as this is used to compute the sum of eigenvalues 45 | #' 46 | #' @return vector of percent variance explained values, indexed by PC 47 | #' @export 48 | #' 49 | #' @examples 50 | #' mat <- matrix(sample(seq(0,20,1),100,replace = TRUE),nrow = 10) 51 | #' my_svd <- svd(mat) 52 | #' get_pct_var_exp_svd(my_svd) # this works if my_svd is a full svd 53 | #' my_irl <- irlba::irlba(mat,nv = 2) 54 | #' get_pct_var_exp_svd(my_irl, preproc_mat = mat) # ... otherwise use this 55 | get_pct_var_exp_svd <- function(thissvd, preproc_mat = thissvd$d){ 56 | denom <- sum(preproc_mat^2) 57 | return(thissvd$d^2 / denom) 58 | } 59 | 60 | 61 | #' Get weights 62 | #' 63 | #' Computes row weights and column weights 64 | #' 65 | #' @param inp_mat matrix for which weights should be calculated (sparse or full) 66 | #' 67 | #' @return list of 2 elements: 'row.w' and 'col.w' contain the row and column weights respectively 68 | #' @export 69 | #' 70 | #' @importFrom Matrix Matrix rowSums colSums 71 | #' @importFrom methods is 72 | #' @importClassesFrom Matrix dgCMatrix 73 | #' 74 | #' @examples 75 | #' mat <- matrix(sample(seq(0,20,1),100,replace = TRUE),nrow = 10) 76 | #' ws <- get_weights(mat) 77 | get_weights <- function(inp_mat){ 78 | w <- list() 79 | if(!is(inp_mat, "dgCMatrix")) {sp_mat <- Matrix(inp_mat, sparse = TRUE)} # convert to a sparse matrix 80 | else {sp_mat <- inp_mat} 81 | N <- sum(sp_mat) 82 | sp_mat <- sp_mat/N 83 | w[['row.w']] <- Matrix::rowSums(sp_mat) 84 | w[['col.w']] <- Matrix::colSums(sp_mat) 85 | return(w) 86 | } 87 | 88 | 89 | #' SingleCellExperiment to list of matrices 90 | #' 91 | #' @param sce SingleCellExperiment that is to be separated into list of count matrices 92 | #' @param splitby character; name of the attribute from colData that should be used to separate the SCE 93 | #' @param to_include (optional) character vector; determines which values from the "splitby" column will be included in the outputted matlist. 
NULL is the default, and will result in selecting all elements 94 | #' @param whichmat character; defaults to \code{counts}, can also use \code{logcounts} or \code{normcounts} if stored in the \code{sce} object 95 | #' 96 | #' @return list of matrices 97 | #' @export 98 | #' 99 | #' @importFrom SingleCellExperiment colData 100 | #' @importFrom SummarizedExperiment assay 101 | #' @importClassesFrom SingleCellExperiment SingleCellExperiment 102 | #' 103 | #' @examples 104 | #' library(DuoClustering2018) 105 | #' sce <- sce_full_Zhengmix4eq() 106 | #' matlist <- sce2matlist(sce = sce, splitby = 'phenoid', whichmat = 'logcounts') 107 | sce2matlist <- function(sce,splitby,to_include = NULL,whichmat = 'counts'){ 108 | if(is.null(to_include)){ 109 | to_include <- unique(as.character(colData(sce)[,splitby])) 110 | } 111 | matlist <- list() 112 | countmatrix <- SummarizedExperiment::assay(sce, whichmat) 113 | for(gn in to_include){ 114 | matlist[[gn]] <- countmatrix[,which(colData(sce)[,splitby] == gn)] 115 | } 116 | return(matlist) 117 | } 118 | 119 | 120 | 121 | #' @keywords internal 122 | #' @import SingleCellExperiment 123 | .indsbysplitby <- function(sce, splitby, to_include = NULL){ 124 | if(is.null(to_include)){ 125 | to_include <- unique(as.character(colData(sce)[,splitby])) 126 | } 127 | inds <- c() 128 | for(gn in to_include){ 129 | inds <- c(inds, which(colData(sce)[,splitby] == gn)) 130 | } 131 | return(inds) 132 | } 133 | 134 | #' Add embeddings to list of SCEs 135 | #' 136 | #' @param scelist list of SingleCellExperiments; to which the corresponding embeddings should be added 137 | #' @param embeddings matrix; the embeddings outputted from a dimension reduction, e.g. \code{\link{corralm}}. Rows in this table correspond to columns in the SCEs in \code{scelist} (if all the SCEs were column-bound), and row indices should correspond to cells. 138 | #' @param slotname character; name of the slot for the reduced dim embedding; defaults to \code{corral} 139 | #' 140 | #' @return list of SingleCellExperiments with respective embeddings stored in them 141 | #' @export 142 | #' 143 | #' @importFrom SingleCellExperiment reducedDim 144 | #' @importClassesFrom SingleCellExperiment SingleCellExperiment 145 | #' 146 | #' @examples 147 | #' library(DuoClustering2018) 148 | #' sce <- sce_full_Zhengmix4eq() 149 | #' scelist <- list(sce,sce) 150 | #' embeddings <- matrix(sample(seq(0,20,1),dim(sce)[2]*6,replace = TRUE),nrow = dim(sce)[2]*2) 151 | #' scelist <- add_embeddings2scelist(scelist, embeddings) 152 | add_embeddings2scelist <- function(scelist,embeddings, slotname = 'corralm'){ 153 | if(!all_are(scelist,'SingleCellExperiment')) { 154 | cat('Warning! 
You may have non-SCE elements in the list.') 155 | } 156 | dimvec <- unlist(lapply(scelist,ncol)) 157 | for (i in seq(1,length(scelist),1)){ 158 | if(i == 1) {start_ind <- 1} 159 | else{start_ind <- (sum(dimvec[seq(1,i-1,1)]) + 1)} 160 | 161 | end_ind <- start_ind + dimvec[i] - 1 162 | 163 | SingleCellExperiment::reducedDim(scelist[[i]],slotname) <- embeddings[seq(start_ind,end_ind,1),] 164 | } 165 | return(scelist) 166 | } 167 | 168 | #' @keywords internal 169 | .batch_sizes <- function(matlist){ 170 | df <- as.data.frame(do.call(rbind, lapply(matlist,dim))) 171 | colnames(df) <- NULL 172 | rownames(df) <- names(matlist) 173 | return(df) 174 | } 175 | 176 | #' @keywords internal 177 | #' @import ggthemes 178 | #' @importFrom grDevices colorRampPalette 179 | .generate_palette_func <- function(ncolors, color_values){ 180 | # adapted from ggthemes::scale_color_* functions 181 | if(missing(color_values)){ 182 | values <- ggthemes::ggthemes_data$few$colors[['Medium']][['value']] 183 | } 184 | else{values <- color_values} 185 | values <- c(values[1],colorRampPalette(values[seq(2,length(values),1)])(1 + ncolors)) # expanding palette to fit 186 | max_n <- length(values) - 1L 187 | f <- function(n) { 188 | ggthemes:::check_pal_n(n, max_n) 189 | if (n == 1L) { 190 | values[[1L]] 191 | } 192 | else { 193 | unname(values[2L:(n + 1L)]) 194 | } 195 | } 196 | attr(f, "max_n") <- length(values) - 1L 197 | f 198 | } 199 | 200 | 201 | #' all_are 202 | #' 203 | #' Checks if all elements of a list or List are of a (single) particular type \code{typechar} 204 | #' 205 | #' @param inplist list or List to be checked 206 | #' @param typechar char of the type to check for 207 | #' 208 | #' @return boolean, for whether the elements of \code{inplist} are all \code{typechar} 209 | #' @export 210 | #' 211 | #' @examples 212 | #' x <- list(1,2) 213 | #' all_are(x,'numeric') 214 | #' all_are(x,'char') 215 | #' 216 | #' y <- list(1,2,'c') 217 | #' all_are(y,'numeric') 218 | #' all_are(y,'char') 219 | all_are <- function(inplist,typechar){ 220 | return(all(unlist(lapply(inplist,is,typechar)))) 221 | } 222 | 223 | #' rv coefficient 224 | #' 225 | #' @param mat1 matrix (or matrix-like, e.g., df); either columns or rows should be matched with \code{mat2} 226 | #' @param mat2 matrix (or matrix-like, e.g., df); either columns or rows should be matched with \code{mat1} 227 | #' 228 | #' @return numeric; RV coefficient between the matched matrices 229 | #' @export 230 | #' 231 | #' @examples 232 | #' a <- matrix(sample(1:10,100, TRUE), nrow = 10) 233 | #' b <- matrix(sample(1:10,50, TRUE), nrow = 5) 234 | #' 235 | #' rv(a, b) # matched by columns 236 | #' rv(t(a), t(b)) # matched by rows 237 | rv <- function(mat1, mat2) { 238 | if(ncol(mat1) != ncol(mat2)){ 239 | if(nrow(mat1) == nrow(mat2)){ 240 | mat1 <- t(mat1) 241 | mat2 <- t(mat2) 242 | } 243 | else{stop('Matrices must be aligned by rows or columns.')} 244 | } 245 | nscm1 <- crossprod(as.matrix(mat1)) 246 | nscm2 <- crossprod(as.matrix(mat2)) 247 | rv <- sum(nscm1 * nscm2)/(sum(nscm1 * nscm1) * sum(nscm2 * nscm2))^0.5 248 | return(rv) 249 | } 250 | 251 | 252 | #' Pairwise rv coefficient 253 | #' 254 | #' @param matlist list of matrices (or matrix-like; see \code{rv} function) for which to compute pairwise RV coefficients 255 | #' 256 | #' @return matrix of the pairwise coefficients 257 | #' @export 258 | #' 259 | #' @importFrom utils combn 260 | #' 261 | #' @examples 262 | #' a <- matrix(sample(1:10,100,TRUE), nrow = 10) 263 | #' b <- matrix(sample(1:10,50,TRUE), nrow = 5) 
264 | #' c <- matrix(sample(1:10,20,TRUE), nrow = 2) 265 | #' 266 | #' matlist <- list(a,b,c) 267 | #' pairwise_rv(matlist) 268 | #' pairwise_rv(lapply(matlist, t)) 269 | pairwise_rv <- function(matlist){ 270 | n <- length(matlist) 271 | a <- utils::combn(seq_len(n), 2, 272 | FUN = function(x) rv(matlist[[x[1]]], matlist[[x[2]]]), simplify = TRUE) 273 | m <- matrix(1, n, n) 274 | m[lower.tri(m)] <- a 275 | m[upper.tri(m)] <- t(m)[upper.tri(m)] 276 | colnames(m) <- rownames(m) <- names(matlist) 277 | return(m) 278 | } 279 | 280 | 281 | #' Trim extreme values in a pre-processed matrix 282 | #' 283 | #' Smooths the extreme values in a chi-square-transformed matrix to lessen the influence of "rare objects." 284 | #' 285 | #' (Usually not called directly; can be included by using the `smooth` argument in the `corral`, `corralm`, and `corral_preproc` functions) 286 | #' 287 | #' @param mat matrix; should be pre-processed/normalized to some sort of approximately normally distributed statistic (e.g., chi-squared transformation with `corral_preproc` or Z-score normalization) 288 | #' @param pct_trim numeric; the percent of observations to smooth. Defaults to `pct_trim` = .01, which corresponds to smoothing all observations to be between the .5 percentile and 99.5 percentile range of the input matrix 289 | #' 290 | #' @return smoothed matrix 291 | #' @export 292 | #' 293 | #' @examples 294 | #' count_mat <- matrix(rpois(10000, 300)*rbinom(10000,1,.1), ncol = 100) 295 | #' smoothed_preproc_mat <- corral_preproc(count_mat, smooth = TRUE) 296 | trim_matdist <- function(mat, pct_trim = .01){ 297 | up_qt <- 1-(pct_trim/2) 298 | upthresh <- quantile(mat, up_qt) 299 | downthresh <- quantile(mat, 1-up_qt) 300 | if(sum(mat > upthresh) > 0){ 301 | mat[mat > upthresh] <- upthresh 302 | } 303 | if(sum(mat < downthresh) > 0){ 304 | mat[mat < downthresh] <- downthresh 305 | } 306 | return(mat) 307 | } 308 | -------------------------------------------------------------------------------- /vignettes/corral_dimred.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Dimension reduction of single cell data with corral" 3 | author: 4 | - name: Lauren Hsu 5 | affiliation: Harvard TH Chan School of Public Health; Dana-Farber Cancer Institute 6 | - name: Aedin Culhane 7 | affiliation: Department of Data Science, Dana-Farber Cancer Institute, Department of Biostatistics, Harvard TH Chan School of Public Health 8 | date: "5/17/2021" 9 | output: 10 | BiocStyle::html_document: 11 | toc_float: true 12 | BiocStyle::pdf_document: default 13 | package: BiocStyle 14 | bibliography: ref.bib 15 | vignette: | 16 | %\VignetteIndexEntry{dim reduction with corral} 17 | %\VignetteEngine{knitr::rmarkdown} 18 | %\VignetteEncoding{UTF-8} 19 | --- 20 | 21 | ```{r setup, include=FALSE} 22 | knitr::opts_chunk$set(echo = TRUE) 23 | library(gridExtra) 24 | ``` 25 | 26 | # Introduction 27 | 28 | Single-cell 'omics analysis enables high-resolution characterization of heterogeneous 29 | populations of cells by quantifying measurements in individual cells and thus 30 | provides a fuller, more nuanced picture into the complexity and heterogeneity between 31 | cells. However, the data also present new and significant challenges as compared to 32 | previous approaches, especially as single-cell data are much larger and sparser than 33 | data generated from bulk sequencing methods. 
Dimension reduction is a key step 34 | in the single-cell analysis to address the high dimension and sparsity of these 35 | data, and to enable the application of more complex, computationally expensive downstream pipelines. 36 | 37 | Correspondence analysis (CA) is a matrix factorization method, and is similar to 38 | principal components analysis (PCA). Whereas PCA is designed for application to 39 | continuous, approximately normally distributed data, CA is appropriate for 40 | non-negative, count-based data that are in the same additive scale. `corral` 41 | implements CA for dimensionality reduction of a single matrix of single-cell data. 42 | 43 | See the vignette for `corralm` for the multi-table adaptation of CA for single-cell batch alignment/integration. 44 | 45 | corral can be used with various types of input. When called on a matrix (or other matrix-like object), it returns a list with the SVD output, principal coordinates, and standard coordinates. When called on a `r Biocpkg('SingleCellExperiment')`, it returns the `r Biocpkg('SingleCellExperiment')` with the corral embeddings in the `reducedDim` slot named `corral`. To retrieve the full list output from a `SingleCellExperiment` input, the `fullout` argument can be set to `TRUE`. 46 | ![](corral_IO.png) 47 | 48 | # Loading packages and data 49 | 50 | We will use the `Zhengmix4eq` dataset from the `r Biocpkg('DuoClustering2018')` package. 51 | 52 | ```{r, message = FALSE} 53 | library(corral) 54 | library(SingleCellExperiment) 55 | library(ggplot2) 56 | library(DuoClustering2018) 57 | zm4eq.sce <- sce_full_Zhengmix4eq() 58 | zm8eq <- sce_full_Zhengmix8eq() 59 | ``` 60 | 61 | This dataset includes approximately 4,000 pre-sorted and annotated cells of 62 | 4 types mixed by Duo et al. in approximately equal proportions [@zmdata]. 63 | The cells were sampled from a "Massively parallel digital transcriptional 64 | profiling of single cells" [@zheng]. 
65 | 66 | ```{r} 67 | zm4eq.sce 68 | table(colData(zm4eq.sce)$phenoid) 69 | ``` 70 | 71 | 72 | # `corral` on `r Biocpkg('SingleCellExperiment')` 73 | 74 | We will run `corral` directly on the raw count data: 75 | 76 | ```{r} 77 | zm4eq.sce <- corral(inp = zm4eq.sce, 78 | whichmat = 'counts') 79 | 80 | zm4eq.sce 81 | ``` 82 | 83 | We can use `plot_embedding` to visualize the output: 84 | 85 | ```{r} 86 | plot_embedding_sce(sce = zm4eq.sce, 87 | which_embedding = 'corral', 88 | plot_title = 'corral on Zhengmix4eq', 89 | color_attr = 'phenoid', 90 | color_title = 'cell type', 91 | saveplot = FALSE) 92 | ``` 93 | 94 | Using the `scater` package, we can also add and visualize `umap` and `tsne` embeddings based on the `corral` output: 95 | 96 | ```{r} 97 | library(scater) 98 | library(gridExtra) # so we can arrange the plots side by side 99 | 100 | zm4eq.sce <- runUMAP(zm4eq.sce, 101 | dimred = 'corral', 102 | name = 'corral_UMAP') 103 | zm4eq.sce <- runTSNE(zm4eq.sce, 104 | dimred = 'corral', 105 | name = 'corral_TSNE') 106 | 107 | ggplot_umap <- plot_embedding_sce(sce = zm4eq.sce, 108 | which_embedding = 'corral_UMAP', 109 | plot_title = 'Zhengmix4eq corral with UMAP', 110 | color_attr = 'phenoid', 111 | color_title = 'cell type', 112 | returngg = TRUE, 113 | showplot = FALSE, 114 | saveplot = FALSE) 115 | 116 | ggplot_tsne <- plot_embedding_sce(sce = zm4eq.sce, 117 | which_embedding = 'corral_TSNE', 118 | plot_title = 'Zhengmix4eq corral with tSNE', 119 | color_attr = 'phenoid', 120 | color_title = 'cell type', 121 | returngg = TRUE, 122 | showplot = FALSE, 123 | saveplot = FALSE) 124 | 125 | gridExtra::grid.arrange(ggplot_umap, ggplot_tsne, ncol = 2) 126 | 127 | ``` 128 | 129 | The `corral` embeddings stored in the `reducedDim` slot can be used in 130 | downstream analysis, such as for clustering or trajectory analysis. 131 | 132 | `corral` can also be run on a `SummarizedExperiment` object. 133 | 134 | # `corral` on matrix 135 | 136 | `corral` can also be performed on a matrix (or matrix-like) input. 137 | 138 | ```{r} 139 | zm4eq.countmat <- assay(zm4eq.sce,'counts') 140 | zm4eq.countcorral <- corral(zm4eq.countmat) 141 | ``` 142 | 143 | The output is in a `list` format, including the SVD output (`u`,`d`,`v`), 144 | the standard coordinates (`SCu`,`SCv`), and the principal coordinates (`PCu`,`PCv`). 145 | 146 | ```{r} 147 | zm4eq.countcorral 148 | ``` 149 | 150 | We can use `plot_embedding` to visualize the output: 151 | (the embeddings are in the `v` matrix because these data are by genes in the 152 | rows and have cells in the columns; if this were reversed, with cells in the 153 | rows and genes/features in the column, then the cell embeddings would instead 154 | be in the `u` matrix.) 155 | ```{r} 156 | celltype_vec <- zm4eq.sce$phenoid 157 | plot_embedding(embedding = zm4eq.countcorral$v, 158 | plot_title = 'corral on Zhengmix4eq', 159 | color_vec = celltype_vec, 160 | color_title = 'cell type', 161 | saveplot = FALSE) 162 | ``` 163 | 164 | The output is the same as above with the `SingleCellExperiment`, and can be 165 | passed as the low-dimension embedding for downstream analysis. Similarly, 166 | UMAP and tSNE can be computed for visualization. (Note that in performing SVD, 167 | the direction of the axes doesn't matter so they may be flipped between runs, 168 | as `corral` and `corralm` use `irlba` to perform fast approximation.) 169 | 170 | # Updates to CA to address overdispersion 171 | 172 | Correspondence analysis is known to be sensitive to "rare objects" (Greenacre, 2013). 
Sometimes this can be beneficial because the method can detect small perturbations of rare populations. However, in other cases, a couple outlier cells can be allowed to exert undue influence on a particular dimension. 173 | 174 | In the `corral` manuscript, we describe three general approaches, included below; see our manuscript for more details and results. In this vignette we also present a fourth approach (Trimming extreme values with `smooth` mode) 175 | 176 | ## Changing the residual type (`rtype`) 177 | 178 | Standard correspondence analysis decomposes Pearson $\chi^2$ residuals, computed with the formula: 179 | $$r_{p; ij} = \frac{\mathrm{observed} - \mathrm{expected}}{\sqrt{\mathrm{expected}}} = \frac{p_{ij} - p_{i.} \ p_{.j}}{\sqrt{p_{i.} \ p_{.j}}}$$ 180 | 181 | where $p_{ij} = \frac{x_{ij}}{N}$, $N = \sum_{i=1}^m \sum_{j=1}^n x_{ij}$, $p_{i.} = \mathrm{row \ weights} = \sum_{i=1}^m p_{ij}$, and $p_{.j} = \mathrm{col \ weights} = \sum_{j=1}^n p_{ij}$. 182 | 183 | In `corral`, this is the default setting. It can also be explicitly selected by setting `rtype = 'standardized'` or `rtype = 'pearson'`. 184 | 185 | Another $\chi^2$ residual is the Freeman-Tukey: 186 | $$r_{f; ij} = \sqrt{p_{ij}} + \sqrt{p_{ij} + \frac{1}{N}} - \sqrt{4 p_{i.} \ p_{.j} + \frac{1}{N}}$$ 187 | 188 | It is more robust to overdispersion than the Pearson residuals, and therefore outperforms standard CA in many scRNAseq datasets. 189 | 190 | In `corral`, this option can be selected by setting `rtype = 'freemantukey'`. 191 | 192 | ## Variance stabilization before CA (`vst_mth`) 193 | 194 | Another approach for addressing overdispersed counts is to apply a variance stabilizing transformation. The options included in the package: 195 | 196 | - Square root transform ($\sqrt{x}$): `vst_mth = 'sqrt'` 197 | - Anscombe transform ($2 \sqrt{x + \frac{3}{8}}$): `vst_mth = 'anscombe'` 198 | - Freeman-Tukey transform ($\sqrt{x} + \sqrt{x + 1}$): `vst_mth = 'freemantukey'` **Note that this option is different from setting the `rtype` parameter to `'freemantukey'` 199 | 200 | ## Power deflation (`powdef_alpha`) 201 | 202 | To apply a smoothing effect to the $\chi^2$ residuals, another approach is to transform the residual matrix by a power of $\alpha \in (0,1)$. To achieve a "soft" smoothing effect, we suggest $\alpha \in [0.9,0.99]$. This option is controlled with the `powdef_alpha` parameter, which takes the default value of `NULL` (not used). To set it, use this parameter and set it equal to the desired value for $\alpha$ as a numeric. e.g., `powdef_alpha = 0.95` would be including this option and setting $\alpha = 0.95$. 203 | 204 | ## Trimming extreme values (`smooth` mode) 205 | 206 | One adaptation (not described in the manuscript) that addresses unduly influential outliers is to apply an alternative smoothing procedure that narrows the range of the $\chi^2$-transformed values by symmetrically trimming the top $n$ fraction of extreme values ($n$ defaults to $.01$ and can be set with the `pct_trim` argument). Since the `corral` matrix pre-processing procedure transforms the values into standardized $\chi^2$ space, they can be considered proportional to the significance of difference between observed and expected abundance for a given gene in a given cell. This approach differs from power deflation in that it only adjusts the most extreme values, and explicitly so, whereas power deflation shifts the distribution of all values to be less extreme. 
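As a minimal sketch of the effect (reusing the simulated count matrix from the `trim_matdist` help page rather than real data), trimming narrows the range of the transformed values while leaving most entries untouched:

```{r}
count_mat <- matrix(rpois(10000, 300) * rbinom(10000, 1, .1), ncol = 100)
preproc_mat <- corral_preproc(count_mat)   # chi-squared residual transformation (package defaults)
trimmed_mat <- trim_matdist(preproc_mat, pct_trim = .01)
range(preproc_mat)   # full range of the transformed values
range(trimmed_mat)   # clipped to the .5 and 99.5 percentiles
```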
207 | 208 | This additional pre-processing step can be applied in `corral` by setting the `smooth` argument to `TRUE` (it defaults to `FALSE`), and this mode only works with standardized and indexed residuals options. 209 | 210 | ```{r} 211 | zm8eq.corral <- corral(zm8eq, fullout = TRUE) 212 | zm8eq.corralsmooth <- corral(zm8eq, fullout = TRUE, smooth = TRUE) 213 | ``` 214 | 215 | # Visualizing links between features and sub-populations with biplots 216 | 217 | Reduced dimension embeddings are often used to find relationships between observations. `corral` can be used to additionally explore relationships between the features and the observations. By plotting both embeddings into the same space, the biplot reveals potential associations between the "rows" and "columns" -- in this case, cells and genes. 218 | 219 | Influential features in a given dimension will be further from the origin, and features strongly associated to a particular cluster of observations will be close in terms of vector direction (e.g., cosine similarity). 220 | 221 | ```{r} 222 | gene_names <- rowData(zm4eq.sce)$symbol 223 | 224 | biplot_corral(corral_obj = zm4eq.countcorral, color_vec = celltype_vec, text_vec = gene_names) 225 | ``` 226 | 227 | For example, in this case, in the general direction of the B cells, there is the MALAT1 gene, which is a B cell marker. Considering the side of the plot with the CD14 monocytes, by showing which genes are closer to which subpopulation, the biplot can help characterize the different groups of cells. 228 | 229 | # Session information 230 | 231 | ```{r} 232 | sessionInfo() 233 | ``` 234 | 235 | # References 236 | 237 | -------------------------------------------------------------------------------- /R/plot_embedding.R: -------------------------------------------------------------------------------- 1 | #' Plot selected PCs from an embedding 2 | #' 3 | #' @param embedding matrix or other tabular format where columns correspond to PCs and rows correspond to cells (entries). \code{corral} and \code{corralm} objects are also accepted. 4 | #' @param xpc int; which PC to put on the x-axis (defaults to 1) 5 | #' @param ypc int; which PC to put on the y-axis (defaults to the one after \code{xpc}) 6 | #' @param plot_title char; title of plot (defaults to titling based on \code{xpc} and \code{ypc}) 7 | #' @param color_vec vector; length should correspond to the number of rows in embedding, and each element of the vector classifies that cell (entry) in the embedding to that particular class, which will be colored the same. (e.g., this could be indicating which batch each cell is from) 8 | #' @param color_title char; what attribute the colors represent 9 | #' @param ellipse_vec vector; length should correspond to the number of rows in embedding, and each element of the vector classifies that cell (entry) in the embedding to that particular class, and elements of the same class will be circled in an ellipse. (e.g., this could be indicating the cell type or cell line; works best for attributes intended to be compact) 10 | #' @param facet_vec vector; length should correspond to the number of rows in embedding, and each element of the vector classifies that cell (entry) in the embedding to that particular class. Plot will be faceted by this attribute. 11 | #' @param ptsize numeric; the size of the points as passed to \code{geom_point()}. Defaults to 0.8. 
12 | #' @param saveplot boolean; whether or not to save the plot, defaults \code{FALSE} 13 | #' @param plotfn char; what the filename is to be called. (defaults to making a name based on \code{plot_title} and \code{xpc}) 14 | #' @param showplot boolean; whether or not to show the plot, defaults \code{TRUE} 15 | #' @param returngg boolean; whether or not to return a \code{\link{ggplot2}} object, defaults \code{FALSE} 16 | #' @param color_pal_vec char; hex codes for the color palette to be used. Default is to use the ggthemes few for plots with less than 9 colors, and to use/"stretch" pals polychrome if more colors are needed. 17 | #' @param dimname char; the name of the dimensions. defaults to "Dim" 18 | #' 19 | #' @return default none; options to display plot (\code{showplot}), save plot (\code{saveplot}), and/or return \code{\link{ggplot2}} object (\code{returngg}) 20 | #' @export 21 | #' 22 | #' @import ggplot2 23 | #' @import gridExtra 24 | #' @import ggthemes 25 | #' @import pals 26 | #' @importFrom graphics plot 27 | #' 28 | #' @examples 29 | #' listofmats <- list(matrix(sample(seq(0,20,1),1000,replace = TRUE),nrow = 20), 30 | #' matrix(sample(seq(0,20,1),1000,replace = TRUE),nrow = 20)) 31 | #' corralm_obj <- corralm(listofmats, ncomp = 5) 32 | #' embed_mat <- corralm_obj$v 33 | #' cell_type_vec <- sample(c('type1','type2','type3'),100,replace = TRUE) 34 | #' plot_embedding(embedding = embed_mat, 35 | #' xpc = 1, 36 | #' plot_title = 'corralm plot', 37 | #' color_vec = cell_type_vec, 38 | #' color_title = 'cell type', 39 | #' saveplot = FALSE) 40 | #' 41 | #' # or, call directly on the corralm object 42 | #' plot_embedding(corralm_obj) 43 | #' 44 | plot_embedding <- function(embedding, xpc = 1, ypc = xpc + 1, plot_title = paste0('Dim',xpc,' by Dim',ypc), color_vec = NULL, color_title = NULL, ellipse_vec = NULL, facet_vec = NULL, ptsize = 0.8, saveplot = FALSE, plotfn = paste(plot_title,xpc, sep = '_'), showplot = TRUE, returngg = FALSE, color_pal_vec = NULL, dimname = 'Dim'){ 45 | if('corral' %in% class(embedding)){ 46 | embedding <- embedding$PCv 47 | } 48 | if('corralm' %in% class(embedding)){ 49 | corralm_obj <- embedding 50 | embedding <- corralm_obj$v 51 | if(is.null(color_vec)){ 52 | color_vec <- rep(rownames(corralm_obj$batch_sizes), corralm_obj$batch_sizes[,2]) 53 | color_title <- 'Batch' 54 | } 55 | } 56 | if(is.null(color_vec)){ 57 | color_vec <- rep('emb', nrow(embedding)) 58 | } 59 | 60 | xlab <- paste0(dimname, xpc) 61 | ylab <- paste0(dimname, ypc) 62 | 63 | df <- cbind(data.frame(embedding[,c(xpc, ypc)]), color_vec) 64 | 65 | colnames(df) <- c('Xdim','Ydim', 'color_vec') 66 | 67 | if(!is.null(ellipse_vec)){ 68 | df <- cbind(df, ellipse_vec) 69 | } 70 | 71 | if(!is.null(facet_vec)){ 72 | df <- cbind(df, facet_vec) 73 | } 74 | 75 | # Setting up the colors 76 | .colscale <- function(palette){ggplot2::discrete_scale('colour','colscale',palette)} 77 | 78 | if(!is.null(color_pal_vec)){ 79 | palette_func <- .generate_palette_func(ncolors = length(unique(color_vec)), color_pal_vec) 80 | } else if(length(unique(color_vec)) < 9){ 81 | palette_func <- ggthemes::few_pal('Medium') 82 | } else{ 83 | palette_func <- .generate_palette_func(ncolors = length(unique(color_vec)), pals::alphabet2()) 84 | } 85 | 86 | gg_obj <- ggplot(df, aes(x = Xdim, y = Ydim, colour = color_vec)) + 87 | geom_point(size = ptsize) + 88 | theme_classic() + .colscale(palette_func) + 89 | geom_hline(yintercept = 0, color = 'gray') + 90 | geom_vline(xintercept = 0, color = 'gray') + 91 | labs(x = xlab, y = 
ylab, 92 | title = plot_title, 93 | color = color_title) + 94 | theme(axis.text.x = element_text(size = rel(1.4), colour = 'black'), 95 | axis.text.y = element_text(size = rel(1.4), colour = 'black'), 96 | axis.title = element_text(size = rel(1.4), colour = 'black')) + 97 | guides(color = guide_legend(override.aes = list(size = 5))) 98 | 99 | if(!is.null(ellipse_vec)){ 100 | gg_obj <- gg_obj + stat_ellipse(aes(x = Xdim, y = Ydim, group = ellipse_vec), type = 'norm', linetype = 2) 101 | } 102 | 103 | if(!is.null(facet_vec)){ 104 | gg_obj <- gg_obj + facet_wrap(facets = vars(facet_vec)) 105 | } 106 | 107 | if(saveplot){ 108 | ggsave(paste0(plotfn,'.png')) 109 | } 110 | 111 | if(showplot){ 112 | plot(gg_obj) 113 | } 114 | 115 | if(returngg){ 116 | return(gg_obj) 117 | } 118 | } 119 | 120 | 121 | #' Plot selected PCs from an embedding saved in a SingleCellExperiment object 122 | #' 123 | #' @param sce \code{\link{SingleCellExperiment}} object; contains the embedding within the \code{reducedDim} slot 124 | #' @param which_embedding character; for the embedding to plot 125 | #' @param color_attr character; name of the attribute within \code{colData} to use for assigning colors (in lieu of \code{color_vec} in the \code{\link{plot_embedding}} function) 126 | #' @param color_title character; title to use for colors legend, defaults to the same as \code{color_attr} 127 | #' @param ellipse_attr character; name of the attribute within \code{colData} to use for drawing ellipse(s) (in lieu of \code{ellipse_vec} in the \code{\link{plot_embedding}} function) 128 | #' @param facet_attr character; name of the attribute within \code{colData} to use for faceting (in lieu of \code{facet_vec} in the \code{\link{plot_embedding}} function) 129 | #' @param ... additional optional arguments - see \code{\link{plot_embedding}} function for details on other potential arguments: \code{xpc}, \code{ypc}, \code{plot_title}, \code{color_title} (if title is different from \code{color_attr}), \code{ptsize}, \code{saveplot}, \code{plotfn}, \code{showplot}, \code{returngg}, \code{color_pal_vec}, \code{dimname} 130 | #' 131 | #' @return default none; options to display plot (\code{showplot}), save plot (\code{saveplot}), and/or return \code{\link{ggplot2}} object (\code{returngg}) 132 | #' @export 133 | #' 134 | #' @import ggplot2 135 | #' @importFrom ggthemes scale_color_few 136 | #' @importFrom SingleCellExperiment colData reducedDim 137 | #' @importClassesFrom SingleCellExperiment SingleCellExperiment 138 | #' 139 | #' @examples 140 | #' library(DuoClustering2018) 141 | #' library(SingleCellExperiment) 142 | #' sce <- sce_full_Zhengmix4eq()[1:100,sample(1:3500,100,replace = FALSE)] 143 | #' colData(sce)$Method <- matrix(sample(c('Method1','Method2'),100,replace = TRUE)) 144 | #' sce <- corralm(sce, splitby = 'Method') 145 | #' 146 | #' # to plot and show only 147 | #' plot_embedding_sce(sce = sce, 148 | #' which_embedding = 'corralm', 149 | #' xpc = 1, 150 | #' plot_title = 'corralm: PC1 by PC2', 151 | #' color_attr = "Method", 152 | #' ellipse_attr = 'phenoid', 153 | #' saveplot = FALSE) 154 | #' 155 | #' # to return ggplot2 object and display, but not save 156 | #' corralm_ggplot <- plot_embedding_sce(sce = sce, 157 | #' which_embedding = 'corralm', 158 | #' xpc = 1, 159 | #' plot_title = 'corralm: PC1 by PC2', 160 | #' color_attr = 'Method', 161 | #' ellipse_attr = 'phenoid', 162 | #' returngg = TRUE, 163 | #' saveplot = FALSE) 164 | #' 165 | #' 166 | plot_embedding_sce <- function(sce, which_embedding, color_attr = NULL, 
color_title = color_attr, ellipse_attr = NULL, facet_attr = NULL, ...){ 167 | embed_mat <- SingleCellExperiment::reducedDim(sce, which_embedding) 168 | 169 | if(is.null(color_attr)){ 170 | color_vec <- NULL 171 | } 172 | else{ 173 | color_vec <- SingleCellExperiment::colData(sce)[, color_attr] 174 | } 175 | 176 | if(!is.null(ellipse_attr)){ 177 | ellipse_vec <- SingleCellExperiment::colData(sce)[, ellipse_attr] 178 | } 179 | else{ 180 | ellipse_vec <- NULL 181 | } 182 | 183 | if(!is.null(facet_attr)){ 184 | facet_vec <- SingleCellExperiment::colData(sce)[, facet_attr] 185 | } 186 | else{ 187 | facet_vec <- NULL 188 | } 189 | 190 | plot_embedding(embed_mat, color_vec = color_vec, color_title = color_title, ellipse_vec = ellipse_vec, facet_vec = facet_vec, ...) 191 | } 192 | 193 | 194 | #' Generate biplot for corral object 195 | #' 196 | #' @param corral_obj list outputted by the \code{corral} function 197 | #' @param color_vec vector; length should correspond to the number of rows in v of \code{corral_obj}, and each element of the vector classifies that cell (entry) in the embedding to that particular class, which will be colored the same. (e.g., cell type) 198 | #' @param text_vec vector; length should correspond to the number of rows in u of \code{corral_obj}, and each element of the vector is the label for the respective feature that would show on the biplot. 199 | #' @param feat_name char; the label to show in the legend. Defaults to \code{(genes)}. 200 | #' @param nfeat int; the number of features to include. The function will first order them by distance from the origin in the selected dimensions, then select the top n to be displayed. 201 | #' @param xpc int; which PC to put on the x-axis (defaults to 1) 202 | #' @param plot_title char; title of plot (defaults to *Biplot*) 203 | #' @param text_size numeric; size of the feature labels given in \code{text_vec} (defaults to 2; for \code{ggplot2}) 204 | #' @param xjitter numeric; the amount of jitter for the text labels in the x direction (defaults to .005; for \code{ggplot2}) 205 | #' @param yjitter numeric; the amount of jitter for the text labels in the y direction (defaults to .005; for \code{ggplot2}) 206 | #' @param coords char; indicator for which set of coordinates to use.
\code{svd} plots the left and right singular vectors as outputted by SVD (\code{u} and \code{v}), while \code{PC} and \code{SC} use the principal and standard coordinates, respectively (defaults to \code{svd}) 207 | #' 208 | #' @return ggplot2 object of the biplot 209 | #' @export 210 | #' 211 | #' @import ggplot2 212 | #' 213 | #' @examples 214 | #' library(DuoClustering2018) 215 | #' library(SingleCellExperiment) 216 | #' zm4eq.sce <- sce_full_Zhengmix4eq() 217 | #' zm4eq.countmat <- counts(zm4eq.sce) 218 | #' zm4eq.corral_obj <- corral(zm4eq.countmat) 219 | #' gene_names <- rowData(zm4eq.sce)$symbol 220 | #' ctvec <- zm4eq.sce$phenoid 221 | #' 222 | #' biplot_corral(corral_obj = zm4eq.corral_obj, color_vec = ctvec, text_vec = gene_names) 223 | biplot_corral <- function(corral_obj, color_vec, text_vec, feat_name = '(genes)', nfeat = 20, xpc = 1, plot_title = 'Biplot', text_size = 2, xjitter = .005, yjitter = .005, coords = c('svd','PC','SC')){ 224 | n <- nfeat 225 | coords <- match.arg(coords, c('svd','PC','SC')) 226 | 227 | if(coords != 'svd'){ 228 | umat <- paste0(coords, 'u') 229 | vmat <- paste0(coords, 'v') 230 | } 231 | else{ 232 | umat <- 'u' 233 | vmat <- 'v' 234 | } 235 | 236 | gene_dists <- sqrt(corral_obj[[umat]][,xpc]^2 + corral_obj[[umat]][,xpc + 1]^2) 237 | gene_dists_ordinds <- order(gene_dists, decreasing = TRUE) 238 | 239 | inflgenes <- corral_obj[[umat]][gene_dists_ordinds[1:n],] 240 | rownames(inflgenes) <- text_vec[gene_dists_ordinds][1:n] 241 | biplot_labs_filt <- c(color_vec, rep(feat_name,n)) 242 | biplot_dat_filt <- rbind(corral_obj[[vmat]], inflgenes) 243 | bipfilt_gg <- plot_embedding(biplot_dat_filt, 244 | color_vec = biplot_labs_filt, returngg = TRUE, 245 | dimname = 'corral', xpc = xpc, showplot = FALSE, 246 | plot_title = plot_title) 247 | 248 | bipfilt_gg$data$Name <- rownames(bipfilt_gg$data) 249 | 250 | bipfilt_gg <- bipfilt_gg + 251 | geom_text(aes(label=ifelse(color_vec == feat_name,as.character(Name),'')), 252 | hjust=0,vjust=0, 253 | size = text_size, 254 | position=position_jitter(width=xjitter,height=yjitter)) 255 | 256 | return(bipfilt_gg) 257 | } 258 | 259 | 260 | 261 | 262 | -------------------------------------------------------------------------------- /R/corral.R: -------------------------------------------------------------------------------- 1 | #' compsvd: Compute Singular Value Decomposition (SVD) 2 | #' 3 | #' Computes SVD. 4 | #' 5 | #' @param mat matrix, pre-processed input; can be sparse or full (pre-processing can be performed using \code{\link{corral_preproc}} from this package) 6 | #' @param method character, the algorithm to be used for svd. Default is irl. Currently supports 'irl' for irlba::irlba or 'svd' for stats::svd 7 | #' @param ncomp numeric, number of components; Default is 30 8 | #' @param ... (additional arguments for methods) 9 | #' 10 | #' @return SVD result - a list with the following elements: 11 | #' \describe{ 12 | #' \item{\code{d}}{a vector of the diagonal singular values of the input \code{mat}.
Note that using \code{svd} will result in the full set of singular values, while \code{irlba} will only compute the first \code{ncomp} singular values.} 13 | #' \item{\code{u}}{a matrix with the left singular vectors of \code{mat} in the columns} 14 | #' \item{\code{v}}{a matrix with the right singular vectors of \code{mat} in the columns} 15 | #' \item{\code{eigsum}}{sum of the eigenvalues, for calculating percent variance explained} 16 | #' } 17 | #' @export 18 | #' 19 | #' @importFrom irlba irlba 20 | #' 21 | #' @examples 22 | #' mat <- matrix(sample(0:10, 2500, replace=TRUE), ncol=50) 23 | #' compsvd(mat, method = 'irl', ncomp = 5) 24 | compsvd <- function(mat, method = c('irl','svd'), ncomp = 30, ...){ 25 | method <- match.arg(method, c('irl','svd')) 26 | ncomp <- .check_ncomp(mat, ncomp) 27 | if(method == 'irl'){ 28 | result <- irlba::irlba(mat, nv = ncomp, ...) 29 | } 30 | else if(method == 'svd'){ 31 | result <- svd(mat, nv = ncomp, nu = ncomp, ...) 32 | } 33 | else { 34 | print('Provided method was not understood; used irlba.') 35 | result <- irlba::irlba(mat, nv = ncomp, ...) 36 | } 37 | result[['eigsum']] <- sum(mat^2) 38 | if(!is.null(rownames(mat))){rownames(result$u) <- rownames(mat)} 39 | if(!is.null(colnames(mat))){rownames(result$v) <- colnames(mat)} 40 | 41 | # add percent variance explained 42 | pct_var_exp <- t(data.frame('percent.Var.explained' = result$d^2 / result$eigsum)) 43 | colnames(pct_var_exp) <- paste0(rep('PC',ncomp),seq(1,ncomp,1)) 44 | result$pct_var_exp <- rbind(pct_var_exp,t(data.frame('cumulative.Var.explained' = cumsum(pct_var_exp[1,])))) 45 | 46 | return(result) 47 | } 48 | 49 | #' Apply a variance stabilizing transformation 50 | #' 51 | #' Prior to running CA, there is an option to apply a variance stabilizing transformation. This function can be called explicitly or used with the `vst_mth` argument in \code{corral} and \code{corral_preproc}. 52 | #' 53 | #' @param inp matrix, numeric, counts or logcounts; can be sparse Matrix or matrix 54 | #' @param transform character indicating which method should be applied. Defaults to the square root transform (`"sqrt"`). Other options include `"freemantukey"` and `"anscombe"`. 55 | #' 56 | #' @return variance-stabilized matrix; sparse if possible 57 | #' @export 58 | #' 59 | #' @examples 60 | #' x <- matrix(rpois(100, lambda = 50), ncol = 10) 61 | #' vst_x <- var_stabilize(x) 62 | var_stabilize <- function(inp, transform = c('sqrt','freemantukey','anscombe')){ 63 | transform <- match.arg(transform, c('sqrt','freemantukey','anscombe')) 64 | .check_vals(inp) 65 | if(transform == 'sqrt'){ 66 | return(inp^.5) 67 | } 68 | else if(transform == 'freemantukey'){ 69 | return(inp^.5 + (inp + 1)^.5) 70 | } 71 | else if(transform == 'anscombe'){ 72 | return(2 * (inp + 3/8)^.5) 73 | } 74 | } 75 | 76 | 77 | #' Preprocess a matrix for SVD to perform Correspondence Analysis (CA) 78 | #' 79 | #' This function performs the row and column scaling pre-processing operations, prior to SVD, for the corral methods. See \code{\link{corral}} for single matrix correspondence analysis and \code{\link{corralm}} for multi-matrix correspondence analysis. 80 | #' 81 | #' In addition to standard CA (SVD of Pearson residuals), which is the default setting, this function also provides options to customize processing steps -- including the variations described in the corresponding manuscript for \code{corral}.
Parameters that may be changed: 82 | #' \describe{ 83 | #' \item{Residual (`rtype`)}{Which analytic residual should be calculated? The default is standardized Pearson residuals. Freeman-Tukey is a more robust choice for handling overdispersed scRNAseq counts.} 84 | #' \item{Variance stabilization method (`vst_mth`)}{Should a variance stabilizing transformation be applied to the count matrix before computing residuals? Defaults to no.} 85 | #' \item{Power deflation $\alpha$ (`powdef_alpha`)}{Should a power deflation transformation be applied after computing residuals? Defaults to no. If using, set $\alpha \in (0,1)$. For a "soft" smoothing effect, we suggest $\alpha \in [0.9,0.99]$} 86 | #' \item{Trimming-based smoothing option (`smooth`)}{As an alternative to power deflation, this option provides a more subtle correction: the most extreme 1% of values (distributed equally between both tails) are set to the 0.5 percentile and 99.5 percentile values, respectively} 87 | #' } 88 | #' 89 | #' @param inp matrix, numeric, counts or logcounts; can be sparse Matrix or matrix 90 | #' @param rtype character indicating what type of residual should be computed; options are `"indexed"`, `"standardized"` (or `"pearson"` is equivalent), `"freemantukey"`, and `"hellinger"`; defaults to `"standardized"` for \code{\link{corral}} and `"indexed"` for \code{\link{corralm}}. `"indexed"`, `"standardized"`, and `"freemantukey"` compute the respective chi-squared residuals and are appropriate for count data. The `"hellinger"` option is appropriate for continuous data. 91 | #' @param vst_mth character indicating whether a variance-stabilizing transform should be applied prior to calculating chi-squared residuals; defaults to `"none"` 92 | #' @param powdef_alpha numeric for the power that should be applied if using power deflation. Must be in (0,1), and if provided a number outside this range, will be ignored. Defaults to `NULL` which does not perform this step. 93 | #' @param row.w numeric vector; Default is \code{NULL}, to compute row.w based on \code{inp}. Use this parameter to replace computed row weights with custom row weights 94 | #' @param col.w numeric vector; Default is \code{NULL}, to compute col.w based on \code{inp}. Use this parameter to replace computed column weights with custom column weights 95 | #' @param smooth logical; Whether or not to perform the additional smoothing step with `trim_matdist`. Default is \code{FALSE}. Incompatible with `powdef_alpha`, so that parameter takes precedence over this one. 96 | #' @param ... 
(additional arguments for methods) 97 | #' 98 | #' @return matrix, processed for input to \code{compsvd} to finish CA routine 99 | #' @export 100 | #' 101 | #' @importFrom Matrix Matrix rowSums colSums 102 | #' @importFrom methods is 103 | #' @importClassesFrom Matrix dgCMatrix 104 | #' 105 | #' @examples 106 | #' mat <- matrix(sample(0:10, 500, replace=TRUE), ncol=25) 107 | #' mat_corral <- corral_preproc(mat) 108 | #' corral_output <- compsvd(mat_corral, ncomp = 5) 109 | corral_preproc <- function(inp, rtype = c('standardized','indexed','hellinger','freemantukey','pearson'), vst_mth = c('none','sqrt','freemantukey','anscombe'), powdef_alpha = NULL, row.w = NULL, col.w = NULL, smooth = FALSE, ...){ 110 | rtype <- match.arg(rtype, c('standardized','indexed','hellinger','freemantukey','pearson')) 111 | vst_mth <- match.arg(vst_mth, c('none','sqrt','freemantukey','anscombe')) 112 | if(!is.null(powdef_alpha)){ 113 | if(powdef_alpha > 1 | powdef_alpha < 0){ 114 | powdef_alpha <- NULL 115 | cat('Invalid choice for power deflation parameter alpha (powdef_alpha) so ignoring; should be in (0,1)') 116 | } 117 | } 118 | if(!is(inp, "dgCMatrix")){ 119 | x_mat <- Matrix::Matrix(inp, sparse = TRUE) 120 | } else {x_mat <- inp} 121 | N <- sum(x_mat) 122 | 123 | if(vst_mth != 'none'){ 124 | x_mat <- var_stabilize(x_mat, transform = vst_mth) 125 | } 126 | 127 | p_mat <- x_mat/N 128 | w <- get_weights(inp) 129 | if(is.null(row.w)) {row.w <- w$row.w} 130 | if(is.null(col.w)) {col.w<- w$col.w} 131 | else {col.w <- col.w / sum(col.w)} 132 | 133 | res <- NULL 134 | 135 | if(rtype == 'hellinger'){ 136 | p_mat <- p_mat / row.w 137 | res <- sqrt(p_mat) 138 | } 139 | if(rtype == 'freemantukey'){ 140 | expectedp <- row.w %*% t(col.w) 141 | res <- p_mat^.5 + (p_mat + 1/N)^.5 - (4*expectedp + 1/N)^.5 142 | } 143 | 144 | p_mat <- p_mat/row.w 145 | p_mat <- sweep(p_mat, 2, col.w, "/") - 1 146 | if (any(is.na(p_mat))) { 147 | p_mat <- na2zero(p_mat) 148 | } 149 | 150 | if (rtype == 'indexed'){ 151 | res <- p_mat 152 | } 153 | else if (rtype %in% c('standardized','pearson')){ 154 | p_mat <- p_mat * sqrt(row.w) 155 | p_mat <- sweep(p_mat, 2, sqrt(col.w), "*") 156 | res <- p_mat 157 | } 158 | 159 | if(!is.null(powdef_alpha)){ 160 | return(sign(res) * (abs(res)^powdef_alpha)) 161 | } 162 | else if (smooth){ 163 | return(trim_matdist(res, ...)) 164 | } 165 | return(res) 166 | } 167 | 168 | 169 | #' corral: Correspondence analysis on a single matrix 170 | #' 171 | #' corral can be used for dimension reduction to find a set of low-dimensional embeddings for a count matrix. 172 | #' 173 | #' @param inp matrix (or any matrix-like object that can be coerced using Matrix), numeric raw or lognormed counts (no negative values) 174 | #' @param row.w numeric vector; the row weights to use in chi-squared scaling. Defaults to `NULL`, in which case row weights are computed from the input matrix. 175 | #' @param col.w numeric vector; the column weights to use in chi-squared scaling. For instance, size factors could be given here. Defaults to `NULL`, in which case column weights are computed from the input matrix. 
176 | #' @inheritParams compsvd 177 | #' @inheritParams corral_preproc 178 | #' 179 | #' @return When run on a matrix, a list with the correspondence analysis matrix decomposition result: 180 | #' \describe{ 181 | #' \item{\code{d}}{a vector of the diagonal singular values of the input \code{mat} (from SVD output)} 182 | #' \item{\code{u}}{a matrix with the left singular vectors of \code{mat} in the columns (from SVD output)} 183 | #' \item{\code{v}}{a matrix with the right singular vectors of \code{mat} in the columns. When cells are in the columns, these are the cell embeddings. (from SVD output)} 184 | #' \item{\code{eigsum}}{sum of the eigenvalues for calculating percent variance explained} 185 | #' \item{\code{SCu and SCv}}{standard coordinates, left and right, respectively} 186 | #' \item{\code{PCu and PCv}}{principal coordinates, left and right, respectively} 187 | #' } 188 | #' 189 | #' @rdname corral 190 | #' @export 191 | #' 192 | #' @importFrom irlba irlba 193 | #' @importFrom Matrix Matrix rowSums colSums 194 | #' @importClassesFrom Matrix dgCMatrix 195 | #' 196 | #' @examples 197 | #' mat <- matrix(sample(0:10, 5000, replace=TRUE), ncol=50) 198 | #' result <- corral_mat(mat) 199 | #' result <- corral_mat(mat, method = 'irl', ncomp = 5) 200 | #' 201 | corral_mat <- function(inp, method = c('irl','svd'), ncomp = 30, row.w = NULL, col.w = NULL, rtype = c('standardized','indexed','hellinger','freemantukey','pearson'), vst_mth = c('none','sqrt','freemantukey','anscombe'), ...){ 202 | method <- match.arg(method, c('irl','svd')) 203 | rtype <- match.arg(rtype, c('standardized','indexed','hellinger','freemantukey','pearson')) 204 | vst_mth <- match.arg(vst_mth, c('none','sqrt','freemantukey','anscombe')) 205 | preproc_mat <- corral_preproc(inp, row.w = row.w, col.w = col.w, rtype = rtype, vst_mth = vst_mth, ...) 206 | result <- compsvd(preproc_mat, method, ncomp, ...) 207 | w <- get_weights(inp) 208 | if(is.null(row.w)) {row.w <- w$row.w} 209 | if(is.null(col.w)) {col.w <- w$col.w} 210 | result[['SCu']] <- sweep(result$u,1,sqrt(row.w),'/') # Standard coordinates 211 | result[['SCv']] <- sweep(result$v,1,sqrt(col.w),'/') 212 | result[['PCu']] <- sweep(result[['SCu']],2,result$d[seq(1,ncol(result$u),1)],'*') # Principal coordinates 213 | result[['PCv']] <- sweep(result[['SCv']],2,result$d[seq(1,ncol(result$v),1)],'*') 214 | class(result) <- c(class(result),'corral') 215 | return(result) 216 | } 217 | 218 | #' corral: Correspondence analysis on a single matrix (SingleCellExperiment) 219 | #' 220 | #' @param inp SingleCellExperiment; raw or lognormed counts (no negative values) 221 | #' @inheritParams compsvd 222 | #' @param whichmat character; defaults to \code{counts}, can also use \code{logcounts} or \code{normcounts} if stored in the \code{sce} object 223 | #' @param fullout boolean; whether the function will return the full \code{corral} output as a list, or a SingleCellExperiment; defaults to SingleCellExperiment (\code{FALSE}). To get back the \code{\link{corral_mat}}-style output, set this to \code{TRUE}. 224 | #' @param subset_row numeric, character, or boolean vector; the rows to include in corral, as indices (numeric), rownames (character), or with booleans (same length as the number of rows in the matrix). If this parameter is \code{NULL}, then all rows will be used. 225 | #' 226 | #' @return When run on a \code{\link{SingleCellExperiment}}, returns a SCE with the embeddings (PCv from the full corral output) in the \code{reducedDim} slot \code{corral} (default).
It can also return the same output as \code{\link{corral_mat}} when \code{fullout} is set to \code{TRUE}. 227 | #' 228 | #' @rdname corral 229 | #' @export 230 | #' 231 | #' @importFrom irlba irlba 232 | #' @importFrom Matrix Matrix rowSums colSums 233 | #' @importFrom SingleCellExperiment reducedDim 234 | #' @importClassesFrom Matrix dgCMatrix 235 | #' @importClassesFrom SingleCellExperiment SingleCellExperiment 236 | #' 237 | #' @examples 238 | #' library(DuoClustering2018) 239 | #' sce <- sce_full_Zhengmix4eq()[1:100,1:100] 240 | #' result_1 <- corral_sce(sce) 241 | #' result_2 <- corral_sce(sce, method = 'svd') 242 | #' result_3 <- corral_sce(sce, method = 'irl', ncomp = 30, whichmat = 'logcounts') 243 | #' 244 | #' 245 | corral_sce <- function(inp, method = c('irl','svd'), ncomp = 30, whichmat = 'counts', fullout = FALSE, subset_row = NULL, ...){ 246 | method <- match.arg(method, c('irl','svd')) 247 | inp_mat <- SummarizedExperiment::assay(inp, whichmat) 248 | if(!is.null(subset_row)){ 249 | inp_mat <- inp_mat[subset_row,] 250 | } 251 | svd_output <- corral_mat(inp_mat, ...) 252 | if(fullout){ 253 | return(svd_output) 254 | } 255 | else{ 256 | SingleCellExperiment::reducedDim(inp,'corral') <- svd_output$PCv 257 | return(inp) 258 | } 259 | } 260 | 261 | 262 | #' corral: Correspondence analysis for a single matrix 263 | #' 264 | #' \code{corral} is a wrapper for \code{\link{corral_mat}} and \code{\link{corral_sce}}, and can be called on any of the acceptable input types. 265 | #' 266 | #' @param inp matrix (any type), \code{SingleCellExperiment}, or \code{SummarizedExperiment}. If using \code{SingleCellExperiment} or \code{SummarizedExperiment}, then include the \code{whichmat} argument to specify which slot to use (defaults to \code{counts}). 267 | #' @param ... (additional arguments for methods) 268 | #' 269 | #' @return For matrix and \code{SummarizedExperiment} input, returns a list with the correspondence analysis matrix decomposition result (u,v,d are the raw svd output; SCu and SCv are the standard coordinates; PCu and PCv are the principal coordinates) 270 | #' @return 271 | #' For \code{SingleCellExperiment} input, returns the SCE with the embeddings (PCv) in the \code{reducedDim} slot \code{corral}, unless \code{fullout = TRUE}, in which case the full list is returned as for a matrix. 272 | #' @rdname corral 273 | #' @export 274 | #' 275 | #' @importFrom irlba irlba 276 | #' @importFrom Matrix Matrix rowSums colSums 277 | #' @importFrom methods is 278 | #' @importFrom SingleCellExperiment reducedDim 279 | #' @importFrom SummarizedExperiment assay 280 | #' @importClassesFrom Matrix dgCMatrix 281 | #' @importClassesFrom SingleCellExperiment SingleCellExperiment 282 | #' 283 | #' @examples 284 | #' library(DuoClustering2018) 285 | #' sce <- sce_full_Zhengmix4eq()[1:100,1:100] 286 | #' corral_sce <- corral(sce,whichmat = 'counts') 287 | #' 288 | #' mat <- matrix(sample(0:10, 500, replace=TRUE), ncol=25) 289 | #' corral_mat <- corral(mat, ncomp=5) 290 | #' 291 | corral <- function(inp,...){ 292 | if(is(inp,"SingleCellExperiment")){ 293 | corral_sce(inp = inp, ...) 294 | } else if(is(inp,"SummarizedExperiment")){ 295 | dots <- list(...); whichmat <- if(is.null(dots$whichmat)) 'counts' else dots$whichmat; dots$whichmat <- NULL 296 | do.call(corral_mat, c(list(inp = SummarizedExperiment::assay(inp, whichmat)), dots)) 297 | } else{ 298 | corral_mat(inp = inp, ...) 299 | } 300 | } 301 | 302 | 303 | #' Print method for S3 object corral 304 | #' 305 | #' @param x (print method) corral object; the list output from \code{corral_mat} 306 | #' 307 | #' @rdname corral 308 | #' 309 | #' @return no return value; prints a summary of the \code{corral} output
310 | #' @export 311 | #' 312 | #' @examples 313 | #' mat <- matrix(sample(1:100, 10000, replace = TRUE), ncol = 100) 314 | #' corral(mat) 315 | print.corral <- function(x,...){ 316 | inp <- x 317 | pct_var_exp <- inp$pct_var_exp 318 | ncomp <- length(inp$d) 319 | cat('corral output summary===========================================\n') 320 | cat(' Output "list" includes standard coordinates (SCu, SCv),\n') 321 | cat(' principal coordinates (PCu, PCv), & SVD output (u, d, v)\n') 322 | cat('Variance explained----------------------------------------------\n') 323 | print(round(pct_var_exp[,seq(1,min(8,ncomp),1)],2)) 324 | cat('\n') 325 | cat('Dimensions of output elements-----------------------------------\n') 326 | cat(' Singular values (d) :: ') 327 | cat(ncomp) 328 | cat('\n Left singular vectors & coordinates (u, SCu, PCu) :: ') 329 | cat(dim(inp$u)) 330 | cat('\n Right singular vectors & coordinates (v, SCv, PCv) :: ') 331 | cat(dim(inp$v)) 332 | cat('\n See corral help for details on each output element.') 333 | cat('\n Use plot_embedding to visualize; see docs for details.') 334 | cat('\n================================================================\n') 335 | } 336 | 337 | --------------------------------------------------------------------------------
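To tie together the functions defined in `R/corral.R` and `R/plot_embedding.R`, here is a minimal usage sketch on simulated data. It assumes the package is installed; the object names, toy counts, and group labels below are illustrative and are not taken from the package examples.

```r
library(corral)

# simulate a small count matrix: 100 "genes" (rows) x 50 "cells" (columns)
set.seed(1)
mat <- matrix(rpois(5000, lambda = 5), ncol = 50)
rownames(mat) <- paste0("gene", seq_len(nrow(mat)))

# run correspondence analysis; result is a list with d, u, v, SCu/SCv, PCu/PCv
res <- corral(mat, ncomp = 10)
res  # print.corral shows percent variance explained and output dimensions

# hypothetical cell groupings, only used here to color the plots
grp <- sample(c("groupA", "groupB"), ncol(mat), replace = TRUE)

# plot the cell embeddings
plot_embedding(res, xpc = 1, color_vec = grp, color_title = "group")

# biplot of cells plus the 20 genes farthest from the origin in dims 1-2
biplot_corral(corral_obj = res, color_vec = grp, text_vec = rownames(mat))
```

Note that `plot_embedding` recognizes the `corral` class and plots the principal coordinates (`PCv`), so the full result list can be passed directly rather than extracting the embedding matrix first.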