├── tests ├── testthat.R └── testthat │ └── testcorral.R ├── .Rbuildignore ├── corral_sticker.png ├── vignettes ├── corral_IO.png ├── corralm_IO.png ├── ref.bib ├── corralm_alignment.Rmd └── corral_dimred.Rmd ├── inst ├── NEWS.md └── CITATION ├── man ├── na2zero.Rd ├── get_weights.Rd ├── all_are.Rd ├── pairwise_rv.Rd ├── rv.Rd ├── scal_var_mat.Rd ├── list2mat.Rd ├── var_stabilize.Rd ├── get_pct_var_exp_svd.Rd ├── sce2matlist.Rd ├── obs2probs.Rd ├── trim_matdist.Rd ├── add_embeddings2scelist.Rd ├── scal_var.Rd ├── compsvd.Rd ├── earthmover_dist.Rd ├── biplot_corral.Rd ├── corral_preproc.Rd ├── plot_embedding_sce.Rd ├── plot_embedding.Rd ├── corral.Rd └── corralm.Rd ├── README.md ├── NAMESPACE ├── R ├── checkers.R ├── scal_var.R ├── evaluation.R ├── corralm.R ├── utils.R ├── plot_embedding.R └── corral.R └── DESCRIPTION /tests/testthat.R: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | -------------------------------------------------------------------------------- /corral_sticker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laurenhsu1/corral/HEAD/corral_sticker.png -------------------------------------------------------------------------------- /vignettes/corral_IO.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laurenhsu1/corral/HEAD/vignettes/corral_IO.png -------------------------------------------------------------------------------- /vignettes/corralm_IO.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laurenhsu1/corral/HEAD/vignettes/corralm_IO.png -------------------------------------------------------------------------------- /inst/NEWS.md: -------------------------------------------------------------------------------- 1 | Changes in version 1.5.1 (2021-11-08) 2 | + Made significant updates to include methods described in our manuscript. 
Both are described in dim reduction vignette 3 | o New variations of CA that address overdispersion 4 | o Biplots - visualize relationships between cells and genes 5 | o Scaled variance plots - visualize batch integration embedding space 6 | -------------------------------------------------------------------------------- /tests/testthat/testcorral.R: -------------------------------------------------------------------------------- 1 | context("corral") 2 | 3 | library(corral) 4 | library(ade4) 5 | 6 | test_that('same eigens as dudi.coa',{ 7 | mat <- matrix(sample(0:10, 500, replace=TRUE), ncol=25) 8 | a <- corral(mat,ncomp = 2) 9 | b <- dudi.coa(mat, scannf = FALSE, nf = 2) 10 | expect_equal(all.equal(b$eig[1:2],a$d[1:2]^2),TRUE) 11 | }) 12 | 13 | 14 | # add checks that it takes sce / mat 15 | 16 | # add checks for thin and thick matrix 17 | 18 | -------------------------------------------------------------------------------- /man/na2zero.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{na2zero} 4 | \alias{na2zero} 5 | \title{Set na to 0} 6 | \usage{ 7 | na2zero(x) 8 | } 9 | \arguments{ 10 | \item{x}{matrix of values for which na values should be changed to 0} 11 | } 12 | \value{ 13 | matrix, where na values are set to 0 14 | } 15 | \description{ 16 | Set na to 0 17 | } 18 | \examples{ 19 | x <- matrix(sample(0:10, 5000, replace = TRUE), ncol = 25) 20 | x[sample(1:5000, 10)] <- NA 21 | 22 | na2zero(x) 23 | } 24 | -------------------------------------------------------------------------------- /man/get_weights.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{get_weights} 4 | \alias{get_weights} 5 | \title{Get weights} 6 | \usage{ 7 | get_weights(inp_mat) 8 | } 9 | \arguments{ 10 | \item{inp_mat}{matrix for which weights should be calculated (sparse or full)} 11 | } 12 | \value{ 13 | list of 2 elements: 'row.w' and 'col.w' contain the row and column weights respectively 14 | } 15 | \description{ 16 | Computes row weights and column weights 17 | } 18 | \examples{ 19 | mat <- matrix(sample(seq(0,20,1),100,replace = TRUE),nrow = 10) 20 | ws <- get_weights(mat) 21 | } 22 | -------------------------------------------------------------------------------- /man/all_are.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{all_are} 4 | \alias{all_are} 5 | \title{all_are} 6 | \usage{ 7 | all_are(inplist, typechar) 8 | } 9 | \arguments{ 10 | \item{inplist}{list or List to be checked} 11 | 12 | \item{typechar}{char of the type to check for} 13 | } 14 | \value{ 15 | boolean, for whether the elements of \code{inplist} are all \code{typechar} 16 | } 17 | \description{ 18 | Checks if all elements of a list or List are of a (single) particular type \code{typechar} 19 | } 20 | \examples{ 21 | x <- list(1,2) 22 | all_are(x,'numeric') 23 | all_are(x,'char') 24 | 25 | y <- list(1,2,'c') 26 | all_are(y,'numeric') 27 | all_are(y,'char') 28 | } 29 | -------------------------------------------------------------------------------- /man/pairwise_rv.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation 
in R/utils.R 3 | \name{pairwise_rv} 4 | \alias{pairwise_rv} 5 | \title{Pairwise rv coefficient} 6 | \usage{ 7 | pairwise_rv(matlist) 8 | } 9 | \arguments{ 10 | \item{matlist}{list of matrices (or matrix-like; see \code{rv} function) for which to compute pairwise RV coefficients} 11 | } 12 | \value{ 13 | matrix of the pairwise coefficients 14 | } 15 | \description{ 16 | Pairwise rv coefficient 17 | } 18 | \examples{ 19 | a <- matrix(sample(1:10,100,TRUE), nrow = 10) 20 | b <- matrix(sample(1:10,50,TRUE), nrow = 5) 21 | c <- matrix(sample(1:10,20,TRUE), nrow = 2) 22 | 23 | matlist <- list(a,b,c) 24 | pairwise_rv(matlist) 25 | pairwise_rv(lapply(matlist, t)) 26 | } 27 | -------------------------------------------------------------------------------- /man/rv.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{rv} 4 | \alias{rv} 5 | \title{rv coefficient} 6 | \usage{ 7 | rv(mat1, mat2) 8 | } 9 | \arguments{ 10 | \item{mat1}{matrix (or matrix-like, e.g., df); either columns or rows should be matched with \code{mat2}} 11 | 12 | \item{mat2}{matrix (or matrix-like, e.g., df); either columns or rows should be matched with \code{mat1}} 13 | } 14 | \value{ 15 | numeric; RV coefficient between the matched matrices 16 | } 17 | \description{ 18 | rv coefficient 19 | } 20 | \examples{ 21 | a <- matrix(sample(1:10,100, TRUE), nrow = 10) 22 | b <- matrix(sample(1:10,50, TRUE), nrow = 5) 23 | 24 | rv(a, b) # matched by columns 25 | rv(t(a), t(b)) # matched by rows 26 | } 27 | -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | citEntry(entry="article", 2 | title = "Correspondence analysis for dimension reduction, batch integration, and visualization of single-cell RNA-seq data", 3 | author = personList( as.person("Lauren L. Hsu"), 4 | as.person("Aedin C. Culhane")), 5 | year = "2023", 6 | journal = "Scientific Reports", 7 | doi = "10.1038/s41598-022-26434-1", 8 | issue = "13", 9 | number = "1197", 10 | url = "https://www.nature.com/articles/s41598-022-26434-1", 11 | textVersion = 12 | paste("Hsu, L.L., Culhane, A.C.", 13 | "Correspondence analysis for dimension reduction, batch integration, and visualization of single-cell RNA-seq data", 14 | "Scientific Reports 13, 1197 (2023)" ) ) 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

![corral sticker](corral_sticker.png)
2 | 
3 | 
4 | 5 | # corral 6 | *Dimensionality reduction and batch integration methods for single cell data* 7 | 8 | corral is a package to perform correspondence analysis on count data, particularly for single-cell (e.g., RNA-seq). 9 | 10 | It can be used on a single table for dimensionality reduction, or it can be used for integration across batches, samples, and modes. 11 | 12 | To install the dev version: `devtools::install_github('laurenhsu1/corral')` 13 | 14 | Or it can be installed with `BiocManager::install("laurenhsu1/corral", dependencies = TRUE)` 15 | 16 | # How to 17 | 18 | The package contains two main function calls: 19 | 20 | 1. `corral` for dimensionality reduction on a single table 21 | 2. `corralm` for alignment and batch integration of multiple tables 22 | 23 | See the vignettes for details on outputs and how to plot. 24 | -------------------------------------------------------------------------------- /man/scal_var_mat.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scal_var.R 3 | \name{scal_var_mat} 4 | \alias{scal_var_mat} 5 | \title{Generate a matrix of the scaled variance values} 6 | \usage{ 7 | scal_var_mat(inp, batchvec = NULL) 8 | } 9 | \arguments{ 10 | \item{inp}{corralm object or matrix; embedding to compute scaled variances} 11 | 12 | \item{batchvec}{vector; batch labels (can be numeric or char). Defaults to `NULL`, which is appropriate for using a corralm object. If using an embedding matrix for inp, then this argument must be given and length must correspond to number of rows in `inp`.} 13 | } 14 | \value{ 15 | matrix of the scaled variance values by PC (batches in rows; PCs in columns) 16 | } 17 | \description{ 18 | Generate a matrix of the scaled variance values 19 | } 20 | \examples{ 21 | dat <- matrix(rnorm(5000), ncol = 50) 22 | bv <- rep(seq(3),c(10,30,60)) 23 | scal_var_mat(dat, bv) 24 | } 25 | -------------------------------------------------------------------------------- /man/list2mat.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{list2mat} 4 | \alias{list2mat} 5 | \title{List to Matrix} 6 | \usage{ 7 | list2mat(matlist, direction = c("c", "r")[1]) 8 | } 9 | \arguments{ 10 | \item{matlist}{list of matrices to concatenate} 11 | 12 | \item{direction}{character, r or c, to indicate whether should be row-wise (i.e., rbind to match on columns) or column-wise (i.e., cbind to match on rows). 
Defaults to columnwise (matching on rows) to match convention of SingleCellExperiments} 13 | } 14 | \value{ 15 | matrix 16 | } 17 | \description{ 18 | List to Matrix 19 | } 20 | \examples{ 21 | listofmats <- list(matrix(sample(seq(0,20,1),100,replace = TRUE),nrow = 10), 22 | matrix(sample(seq(0,20,1),1000,replace = TRUE),nrow = 10)) 23 | newmat <- list2mat(listofmats) # to "cbind" them 24 | listofmats_t <- lapply(listofmats,t) 25 | newmat_t <- list2mat(listofmats_t, 'r') # to "rbind" them 26 | } 27 | -------------------------------------------------------------------------------- /man/var_stabilize.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/corral.R 3 | \name{var_stabilize} 4 | \alias{var_stabilize} 5 | \title{Apply a variance stabilizing transformation} 6 | \usage{ 7 | var_stabilize(inp, transform = c("sqrt", "freemantukey", "anscombe")) 8 | } 9 | \arguments{ 10 | \item{inp}{matrix, numeric, counts or logcounts; can be sparse Matrix or matrix} 11 | 12 | \item{transform}{character indicating which method should be applied. Defaults to the square root transform (`"sqrt"`). Other options include `"freemantukey"` and `"anscombe"`.} 13 | } 14 | \value{ 15 | variance-stabilized matrix; sparse if possible 16 | } 17 | \description{ 18 | Prior to running CA, there is an option to apply a variance stabilizing transformation. This function can be called explicitly or used with the `vst_mth` argument in \code{corral} and \code{corral_preproc}. 19 | } 20 | \examples{ 21 | x <- as.matrix(rpois(100, lambda = 50), ncol = 10) 22 | vst_x <- var_stabilize(x) 23 | } 24 | -------------------------------------------------------------------------------- /man/get_pct_var_exp_svd.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{get_pct_var_exp_svd} 4 | \alias{get_pct_var_exp_svd} 5 | \title{Compute percent of variance explained} 6 | \usage{ 7 | get_pct_var_exp_svd(thissvd, preproc_mat = thissvd$d) 8 | } 9 | \arguments{ 10 | \item{thissvd}{list outputted from an svd function (svd, irlba; can also take output from \code{\link{corral_mat}} and \code{\link{corralm_matlist}})} 11 | 12 | \item{preproc_mat}{matrix of pre-processed values (optional) - important to include if the svd is only partial as this is used to compute the sum of eigenvalues} 13 | } 14 | \value{ 15 | vector of percent variance explained values, indexed by PC 16 | } 17 | \description{ 18 | Compute percent of variance explained 19 | } 20 | \examples{ 21 | mat <- matrix(sample(seq(0,20,1),100,replace = TRUE),nrow = 10) 22 | my_svd <- svd(mat) 23 | get_pct_var_exp_svd(my_svd) # this works if my_svd is a full svd 24 | my_irl <- irlba::irlba(mat,nv = 2) 25 | get_pct_var_exp_svd(my_irl, preproc_mat = mat) # ... 
otherwise use this 26 | } 27 | -------------------------------------------------------------------------------- /man/sce2matlist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{sce2matlist} 4 | \alias{sce2matlist} 5 | \title{SingleCellExperiment to list of matrices} 6 | \usage{ 7 | sce2matlist(sce, splitby, to_include = NULL, whichmat = "counts") 8 | } 9 | \arguments{ 10 | \item{sce}{SingleCellExperiment that is to be separated into list of count matrices} 11 | 12 | \item{splitby}{character; name of the attribute from colData that should be used to separate the SCE} 13 | 14 | \item{to_include}{(optional) character vector; determines which values from the "splitby" column will be included in the outputted matlist. NULL is the default, and will result in selecting all elements} 15 | 16 | \item{whichmat}{character; defaults to \code{counts}, can also use \code{logcounts} or \code{normcounts} if stored in the \code{sce} object} 17 | } 18 | \value{ 19 | list of matrices 20 | } 21 | \description{ 22 | SingleCellExperiment to list of matrices 23 | } 24 | \examples{ 25 | library(DuoClustering2018) 26 | sce <- sce_full_Zhengmix4eq() 27 | matlist <- sce2matlist(sce = sce, splitby = 'phenoid', whichmat = 'logcounts') 28 | } 29 | -------------------------------------------------------------------------------- /man/obs2probs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/evaluation.R 3 | \name{obs2probs} 4 | \alias{obs2probs} 5 | \title{Observations --> discrete probabilities} 6 | \usage{ 7 | obs2probs(obs, numbins = 100, startbin = min(obs), endbin = max(obs) + 1e-05) 8 | } 9 | \arguments{ 10 | \item{obs}{vector of numeric, with the observations} 11 | 12 | \item{numbins}{int, the number of evenly sized bins to discretize the observations to} 13 | 14 | \item{startbin}{numeric, the starting value for the smallest bin. Defaults to taking the minimum of obs} 15 | 16 | \item{endbin}{numeric, the ending value for the largest bin. Defaults to taking the maximum of obs (plus a tiny decimal to ensure full range of obs is captured)} 17 | } 18 | \value{ 19 | dataframe, results has rows corresponding to each bin with columns for probability ('prob'), cumulative frequency ('cumfreq'), and frequency ('freq') of observations falling into that bin. The 'bins' column indicates the end of the bin (start is the preceding column) 20 | } 21 | \description{ 22 | usage: 23 | embedding <- matrix(sample(x = seq(0,10,.1),200, replace = TRUE)) 24 | disc_probs <- obs2probs(embedding) 25 | } 26 | \keyword{internal} 27 | -------------------------------------------------------------------------------- /man/trim_matdist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{trim_matdist} 4 | \alias{trim_matdist} 5 | \title{Trim extreme values in a pre-processed matrix} 6 | \usage{ 7 | trim_matdist(mat, pct_trim = 0.01) 8 | } 9 | \arguments{ 10 | \item{mat}{matrix; should be pre-processed/normalized to some sort of approximately normally distributed statistic (e.g., chi-squared transformation with `corral_preproc` or Z-score normalization)} 11 | 12 | \item{pct_trim}{numeric; the percent of observations to smooth. 
Defaults to `pct_trim` = .01, which corresponds to smoothing all observations to be between the .5 percentile and 99.5 percentile range of the input matrix} 13 | } 14 | \value{ 15 | smoothed matrix 16 | } 17 | \description{ 18 | Smooths the extreme values in a chi-square-transformed matrix to lessen the influence of "rare objects." 19 | } 20 | \details{ 21 | (Usually not called directly; can be included by using the `smooth` argument in the `corral`, `corralm`, and `corral_preproc` functions) 22 | } 23 | \examples{ 24 | count_mat <- matrix(rpois(10000, 300)*rbinom(10000,1,.1), ncol = 100) 25 | smoothed_preproc_mat <- corral_preproc(count_mat, smooth = TRUE) 26 | } 27 | -------------------------------------------------------------------------------- /man/add_embeddings2scelist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{add_embeddings2scelist} 4 | \alias{add_embeddings2scelist} 5 | \title{Add embeddings to list of SCEs} 6 | \usage{ 7 | add_embeddings2scelist(scelist, embeddings, slotname = "corralm") 8 | } 9 | \arguments{ 10 | \item{scelist}{list of SingleCellExperiments; to which the corresponding embeddings should be added} 11 | 12 | \item{embeddings}{matrix; the embeddings outputted from a dimension reduction, e.g. \code{\link{corralm}}. Rows in this table correspond to columns in the SCEs in \code{scelist} (if all the SCEs were column-bound), and row indices should correspond to cells.} 13 | 14 | \item{slotname}{character; name of the slot for the reduced dim embedding; defaults to \code{corral}} 15 | } 16 | \value{ 17 | list of SingleCellExperiments with respective embeddings stored in them 18 | } 19 | \description{ 20 | Add embeddings to list of SCEs 21 | } 22 | \examples{ 23 | library(DuoClustering2018) 24 | sce <- sce_full_Zhengmix4eq() 25 | scelist <- list(sce,sce) 26 | embeddings <- matrix(sample(seq(0,20,1),dim(sce)[2]*6,replace = TRUE),nrow = dim(sce)[2]*2) 27 | scelist <- add_embeddings2scelist(scelist, embeddings) 28 | } 29 | -------------------------------------------------------------------------------- /man/scal_var.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scal_var.R 3 | \name{scal_var} 4 | \alias{scal_var} 5 | \title{Generate a scaled variance plot for an integrative embedding} 6 | \usage{ 7 | scal_var( 8 | inp, 9 | batchvec = NULL, 10 | pcs = seq(3), 11 | returngg = FALSE, 12 | showplot = TRUE, 13 | plot_subtitle = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{inp}{corralm object or matrix; embedding to compute scaled variances} 18 | 19 | \item{batchvec}{vector; batch labels (can be numeric or char). Defaults to `NULL`, which is appropriate for using a corralm object. If using an embedding matrix for inp, then this argument must be given and length must correspond to number of rows in `inp`.} 20 | 21 | \item{pcs}{numeric; vector of which PCs should be shown. Defaults to 1:3} 22 | 23 | \item{returngg}{boolean; whether or not to return a \code{\link{ggplot2}} object, defaults \code{FALSE}} 24 | 25 | \item{showplot}{boolean; whether or not to show the plot, defaults \code{TRUE}} 26 | 27 | \item{plot_subtitle}{string; the text that should show in the subtitle for the plot. 
defaults to NULL} 28 | } 29 | \value{ 30 | N/A or a ggplot object 31 | } 32 | \description{ 33 | Generate a scaled variance plot for an integrative embedding 34 | } 35 | \examples{ 36 | dat <- matrix(rnorm(10000), ncol = 50) 37 | bv <- rep(seq(4),c(10,30,60,100)) 38 | scal_var(dat,bv, pcs = seq(4)) 39 | } 40 | -------------------------------------------------------------------------------- /man/compsvd.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/corral.R 3 | \name{compsvd} 4 | \alias{compsvd} 5 | \title{compsvd: Compute Singular Value Decomposition (SVD)} 6 | \usage{ 7 | compsvd(mat, method = c("irl", "svd"), ncomp = 30, ...) 8 | } 9 | \arguments{ 10 | \item{mat}{matrix, pre-processed input; can be sparse or full (pre-processing can be performed using \code{\link{corral_preproc}} from this package)} 11 | 12 | \item{method}{character, the algorithm to be used for svd. Default is irl. Currently supports 'irl' for irlba::irlba or 'svd' for stats::svd} 13 | 14 | \item{ncomp}{numeric, number of components; Default is 30} 15 | 16 | \item{...}{(additional arguments for methods)} 17 | } 18 | \value{ 19 | SVD result - a list with the following elements: 20 | \describe{ 21 | \item{\code{d}}{a vector of the diagonal singular values of the input \code{mat}. Note that using \code{svd} will result in the full set of singular values, while \code{irlba} will only compute the first \code{ncomp} singular values.} 22 | \item{\code{u}}{a matrix of with the left singular vectors of \code{mat} in the columns} 23 | \item{\code{v}}{a matrix of with the right singular vectors of \code{mat} in the columns} 24 | \item{\code{eigsum}}{sum of the eigenvalues, for calculating percent variance explained} 25 | } 26 | } 27 | \description{ 28 | Computes SVD. 29 | } 30 | \examples{ 31 | mat <- matrix(sample(0:10, 2500, replace=TRUE), ncol=50) 32 | compsvd(mat, method = 'irl', ncomp = 5) 33 | } 34 | -------------------------------------------------------------------------------- /man/earthmover_dist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/evaluation.R 3 | \name{earthmover_dist} 4 | \alias{earthmover_dist} 5 | \title{Earthmover distance (and general Wasserstein distance)} 6 | \usage{ 7 | earthmover_dist(batch1, batch2, whichdim = 1, numbins = 100, p_param = 1) 8 | } 9 | \arguments{ 10 | \item{batch1}{matrix; subset of observations from an embedding correponding to some attribute (e.g., batch or phenotype)} 11 | 12 | \item{batch2}{matrix; subset of observations from an embedding correponding to some attribute (e.g., batch or phenotype)} 13 | 14 | \item{whichdim}{int; which dimension (i.e., column) from the embeddings is used. defaults on first} 15 | 16 | \item{numbins}{int; number of bins for the probability discretization (defaults to 100)} 17 | 18 | \item{p_param}{int; penalty parameter for general Wasserstein distance. Defaults to 1, which corresonds to earthmover.} 19 | } 20 | \value{ 21 | num; the distance 22 | } 23 | \description{ 24 | i.e., wasserstein distance with L1 (p_param = 1); can also use other penalties > 1 25 | (Not technically earthmover distance if using other p_param values) 26 | } 27 | \examples{ 28 | # To compare distributions of reduced dimension values to assess similarity, 29 | # e.g. 
as a metric for batch integration 30 | embedding <- matrix(sample(x = seq(0,10,.1),1000, replace = TRUE),ncol = 5) 31 | batch <- matrix(sample(c(1,2),200, replace = TRUE)) 32 | earthmover_dist(embedding[which(batch == 1),],embedding[which(batch == 2),]) 33 | } 34 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(print,corral) 4 | S3method(print,corralm) 5 | export(add_embeddings2scelist) 6 | export(all_are) 7 | export(biplot_corral) 8 | export(compsvd) 9 | export(corral) 10 | export(corral_mat) 11 | export(corral_preproc) 12 | export(corral_sce) 13 | export(corralm) 14 | export(corralm_matlist) 15 | export(corralm_sce) 16 | export(earthmover_dist) 17 | export(get_pct_var_exp_svd) 18 | export(get_weights) 19 | export(list2mat) 20 | export(na2zero) 21 | export(pairwise_rv) 22 | export(plot_embedding) 23 | export(plot_embedding_sce) 24 | export(rv) 25 | export(scal_var) 26 | export(scal_var_mat) 27 | export(sce2matlist) 28 | export(trim_matdist) 29 | export(var_stabilize) 30 | import(SingleCellExperiment) 31 | import(ggplot2) 32 | import(ggthemes) 33 | import(gridExtra) 34 | import(pals) 35 | import(reshape2) 36 | importClassesFrom(Matrix,dgCMatrix) 37 | importClassesFrom(MultiAssayExperiment,MultiAssayExperiment) 38 | importClassesFrom(SingleCellExperiment,SingleCellExperiment) 39 | importFrom(Matrix,Matrix) 40 | importFrom(Matrix,colSums) 41 | importFrom(Matrix,rowSums) 42 | importFrom(MultiAssayExperiment,assays) 43 | importFrom(MultiAssayExperiment,experiments) 44 | importFrom(MultiAssayExperiment,intersectRows) 45 | importFrom(SingleCellExperiment,colData) 46 | importFrom(SingleCellExperiment,reducedDim) 47 | importFrom(SummarizedExperiment,assay) 48 | importFrom(ggthemes,scale_color_few) 49 | importFrom(grDevices,colorRampPalette) 50 | importFrom(graphics,plot) 51 | importFrom(irlba,irlba) 52 | importFrom(methods,is) 53 | importFrom(stats,quantile) 54 | importFrom(stats,var) 55 | importFrom(transport,wasserstein) 56 | importFrom(utils,combn) 57 | -------------------------------------------------------------------------------- /R/checkers.R: -------------------------------------------------------------------------------- 1 | #' @keywords internal 2 | .check_vals <- function(mat){ 3 | if(!is.numeric(mat)){ 4 | stop('Matrix input should contain only numbers.') 5 | } 6 | else if (min(mat) < 0){ 7 | stop('Matrix input should only contain positive values.') 8 | } 9 | } 10 | 11 | #' @keywords internal 12 | .check_mat <- function(mat){ 13 | .check_vals(mat) 14 | if (min(dim(mat)) < 2){ 15 | stop('Matrix input must have more than 1 element and more than 1 feature.') 16 | } 17 | } 18 | 19 | #' @keywords internal 20 | .check_dims <- function(matlist){ 21 | dims <- unlist(lapply(matlist,nrow)) 22 | if(length(unique(dims)) > 1) { 23 | stop('If performing multi-table analysis, the matrices must be matched by rows; currently the dimensions do not match. \nIf they are matched by columns, then transpose the matrices.')} 24 | } 25 | 26 | #' @keywords internal 27 | .check_rw_contrib <- function(matlist, rw_contrib){ 28 | # Verifying valid input for rw_contrib, otherwise setting to equal weight. 
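  #   rw_contrib is expected to be a numeric vector with one weight per matrix in matlist;
  #   a length mismatch or non-numeric entries both fall back to equal weights, rep(1, length(matlist)).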
29 | matlist_len <- length(matlist) 30 | if(matlist_len != length(rw_contrib)){ 31 | cat('\nThe provided weights did not match number of batches (i.e., number of matrices).\nThey will be set to equal weight.\n') 32 | return(rep(1, matlist_len)) 33 | } 34 | else if(sum(is.na(as.numeric(rw_contrib)))){ 35 | cat('\nNon-numeric values provided in rw_contrib.\nThey will be set to equal weight.\n') 36 | return(rep(1, matlist_len)) 37 | } 38 | else{ 39 | return(rw_contrib) 40 | } 41 | } 42 | 43 | #' @keywords internal 44 | .check_ncomp <- function(mat, ncomp){ 45 | max_comps <- min(dim(mat)) - 1 46 | if(ncomp > max_comps){ 47 | ncomp <- max_comps 48 | cat('Too many components requested; setting ncomp =', max_comps, '\n') 49 | } 50 | return(ncomp) 51 | } 52 | 53 | -------------------------------------------------------------------------------- /vignettes/ref.bib: -------------------------------------------------------------------------------- 1 | @article{zmdata, 2 | AUTHOR = {Duò, A and Robinson, MD and Soneson, C}, 3 | TITLE = {A systematic performance evaluation of clustering methods for single-cell RNA-seq data [version 2; peer review: 2 approved], 4 | JOURNAL = {F1000Research}, 5 | VOLUME = {7}, 6 | YEAR = {2018}, 7 | NUMBER = {1141}, 8 | DOI = {10.12688/f1000research.15666.2} 9 | }} 10 | 11 | @Article{zheng, 12 | title = {Massively parallel digital transcriptional profiling of single cells}, 13 | volume = {8}, 14 | issn = {2041-1723}, 15 | url = {http://www.nature.com/articles/ncomms14049}, 16 | doi = {10.1038/ncomms14049}, 17 | language = {en}, 18 | number = {1}, 19 | urldate = {2020-04-24}, 20 | journal = {Nature Communications}, 21 | author = {Zheng, Grace X. Y. and Terry, Jessica M. and Belgrader, Phillip and Ryvkin, Paul and Bent, Zachary W. and Wilson, Ryan and Ziraldo, Solongo B. and Wheeler, Tobias D. and McDermott, Geoff P. and Zhu, Junjie and Gregory, Mark T. and Shuga, Joe and Montesclaros, Luz and Underwood, Jason G. and Masquelier, Donald A. and Nishimura, Stefanie Y. and Schnall-Levin, Michael and Wyatt, Paul W. and Hindson, Christopher M. and Bharadwaj, Rajiv and Wong, Alexander and Ness, Kevin D. and Beppu, Lan W. and Deeg, H. Joachim and McFarland, Christopher and Loeb, Keith R. and Valente, William J. and Ericson, Nolan G. and Stevens, Emily A. and Radich, Jerald P. and Mikkelsen, Tarjei S. and Hindson, Benjamin J. and Bielas, Jason H.}, 22 | month = apr, 23 | year = {2017}, 24 | pages = {14049} 25 | } 26 | 27 | @article{scmix, 28 | title = {Benchmarking single cell {RNA}-sequencing analysis pipelines using mixture control experiments}, 29 | volume = {16}, 30 | issn = {1548-7091, 1548-7105}, 31 | url = {http://www.nature.com/articles/s41592-019-0425-8}, 32 | doi = {10.1038/s41592-019-0425-8}, 33 | language = {en}, 34 | number = {6}, 35 | urldate = {2019-11-21}, 36 | journal = {Nature Methods}, 37 | author = {Tian, Luyi and Dong, Xueyi and Freytag, Saskia and Lê Cao, Kim-Anh and Su, Shian and JalalAbadi, Abolfazl and Amann-Zalcenstein, Daniela and Weber, Tom S. and Seidi, Azadeh and Jabbari, Jafar S. and Naik, Shalin H. 
and Ritchie, Matthew E.}, 38 | month = jun, 39 | year = {2019}, 40 | pages = {479--487} 41 | } -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: corral 2 | Title: Correspondence Analysis for Single Cell Data 3 | Version: 1.9.1 4 | Date: 2023-02-09 5 | Authors@R: 6 | c(person(given = "Lauren", family = "Hsu", 7 | role = c("aut", "cre"), 8 | email = "lrnshoe@gmail.com", 9 | comment = c(ORCID = "0000-0002-6035-7381")), 10 | person(given = "Aedin", family = "Culhane", 11 | role = c("aut"), 12 | email = "Aedin.Culhane@ul.ie", 13 | comment = c(ORCID = "0000-0002-1395-9734"))) 14 | Description: 15 | Correspondence analysis (CA) is a matrix factorization method, and is 16 | similar to principal components analysis (PCA). Whereas PCA is designed for 17 | application to continuous, approximately normally distributed data, CA is 18 | appropriate for non-negative, count-based data that are in the same additive scale. 19 | The corral package implements CA for dimensionality reduction of a single matrix of 20 | single-cell data, as well as a multi-table adaptation of CA that leverages 21 | data-optimized scaling to align data generated from different sequencing platforms 22 | by projecting into a shared latent space. corral utilizes sparse matrices and a 23 | fast implementation of SVD, and can be called directly on Bioconductor objects 24 | (e.g., SingleCellExperiment) for easy pipeline integration. 25 | The package also includes additional options, including variations of CA to 26 | address overdispersion in count data (e.g., Freeman-Tukey chi-squared residual), 27 | as well as the option to apply CA-style processing to continuous data 28 | (e.g., proteomic TOF intensities) with the Hellinger distance adaptation of CA. 29 | Imports: 30 | ggplot2, 31 | ggthemes, 32 | grDevices, 33 | gridExtra, 34 | irlba, 35 | Matrix, 36 | methods, 37 | MultiAssayExperiment, 38 | pals, 39 | reshape2, 40 | SingleCellExperiment, 41 | SummarizedExperiment, 42 | transport 43 | Suggests: 44 | ade4, 45 | BiocStyle, 46 | CellBench, 47 | DuoClustering2018, 48 | knitr, 49 | rmarkdown, 50 | scater, 51 | testthat 52 | License: GPL-2 53 | RoxygenNote: 7.1.2 54 | VignetteBuilder: knitr 55 | biocViews: 56 | BatchEffect, 57 | DimensionReduction, 58 | GeneExpression, 59 | Preprocessing, 60 | PrincipalComponent, 61 | Sequencing, 62 | SingleCell, 63 | Software, 64 | Visualization 65 | Encoding: UTF-8 66 | -------------------------------------------------------------------------------- /man/biplot_corral.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot_embedding.R 3 | \name{biplot_corral} 4 | \alias{biplot_corral} 5 | \title{Generate biplot for corral object} 6 | \usage{ 7 | biplot_corral( 8 | corral_obj, 9 | color_vec, 10 | text_vec, 11 | feat_name = "(genes)", 12 | nfeat = 20, 13 | xpc = 1, 14 | plot_title = "Biplot", 15 | text_size = 2, 16 | xjitter = 0.005, 17 | yjitter = 0.005, 18 | coords = c("svd", "PC", "SC") 19 | ) 20 | } 21 | \arguments{ 22 | \item{corral_obj}{list outputted by the \code{corral} function} 23 | 24 | \item{color_vec}{vector; length should correspond to the number of rows in v of \code{corral_obj}, and each element of the vector classifies that cell (entry) in the embedding to that particular class, which will be colored the same. 
(e.g., cell type)} 25 | 26 | \item{text_vec}{vector; length should correspond to the number of rows in u of \code{corral_obj}, and each element of the vector is the label for the respective feature that would show on the biplot.} 27 | 28 | \item{feat_name}{char; the label will in the legend. Defaults to \code{(genes)}.} 29 | 30 | \item{nfeat}{int; the number of features to include. The function will first order them by distance from origin in the selected dimensions, then select the top n to be displayed.} 31 | 32 | \item{xpc}{int; which PC to put on the x-axis (defaults to 1)} 33 | 34 | \item{plot_title}{char; title of plot (defaults to *Biplot*)} 35 | 36 | \item{text_size}{numeric; size of the feature labels given in \code{text_vec} (defaults to 2; for \code{ggplot2})} 37 | 38 | \item{xjitter}{numeric; the amount of jitter for the text labels in x direction (defaults to .005; for \code{ggplot2})} 39 | 40 | \item{yjitter}{numeric; the amount of jitter for the text labels in y direction (defaults to .005; for \code{ggplot2})} 41 | 42 | \item{coords}{char; indicator for sets of coordinates to use. \code{svd} plots the left and right singular vectors as outputted by SVD (\code{u} and \code{v}), which \code{PC} and \code{SC} use the principal and standard coordinates, respectively (defaults to \code{svd})} 43 | } 44 | \value{ 45 | ggplot2 object of the biplot 46 | } 47 | \description{ 48 | Generate biplot for corral object 49 | } 50 | \examples{ 51 | library(DuoClustering2018) 52 | library(SingleCellExperiment) 53 | zm4eq.sce <- sce_full_Zhengmix4eq() 54 | zm4eq.countmat <- counts(zm4eq.sce) 55 | zm4eq.corral_obj <- corral(zm4eq.countmat) 56 | gene_names <- rowData(zm4eq.sce)$symbol 57 | ctvec <- zm4eq.sce$phenoid 58 | 59 | biplot_corral(corral_obj = zm4eq.corral_obj, color_vec = ctvec, text_vec = gene_names) 60 | } 61 | -------------------------------------------------------------------------------- /man/corral_preproc.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/corral.R 3 | \name{corral_preproc} 4 | \alias{corral_preproc} 5 | \title{Preprocess a matrix for SVD to perform Correspondence Analysis (CA)} 6 | \usage{ 7 | corral_preproc( 8 | inp, 9 | rtype = c("standardized", "indexed", "hellinger", "freemantukey", "pearson"), 10 | vst_mth = c("none", "sqrt", "freemantukey", "anscombe"), 11 | powdef_alpha = NULL, 12 | row.w = NULL, 13 | col.w = NULL, 14 | smooth = FALSE, 15 | ... 16 | ) 17 | } 18 | \arguments{ 19 | \item{inp}{matrix, numeric, counts or logcounts; can be sparse Matrix or matrix} 20 | 21 | \item{rtype}{character indicating what type of residual should be computed; options are `"indexed"`, `"standardized"` (or `"pearson"` is equivalent), `"freemantukey"`, and `"hellinger"`; defaults to `"standardized"` for \code{\link{corral}} and `"indexed"` for \code{\link{corralm}}. `"indexed"`, `"standardized"`, and `"freemantukey"` compute the respective chi-squared residuals and are appropriate for count data. The `"hellinger"` option is appropriate for continuous data.} 22 | 23 | \item{vst_mth}{character indicating whether a variance-stabilizing transform should be applied prior to calculating chi-squared residuals; defaults to `"none"`} 24 | 25 | \item{powdef_alpha}{numeric for the power that should be applied if using power deflation. Must be in (0,1), and if provided a number outside this range, will be ignored. 
Defaults to `NULL` which does not perform this step.} 26 | 27 | \item{row.w}{numeric vector; Default is \code{NULL}, to compute row.w based on \code{inp}. Use this parameter to replace computed row weights with custom row weights} 28 | 29 | \item{col.w}{numeric vector; Default is \code{NULL}, to compute col.w based on \code{inp}. Use this parameter to replace computed column weights with custom column weights} 30 | 31 | \item{smooth}{logical; Whether or not to perform the additional smoothing step with `trim_matdist`. Default is \code{FALSE}. Incompatible with `powdef_alpha`, so that parameter takes precedence over this one.} 32 | 33 | \item{...}{(additional arguments for methods)} 34 | } 35 | \value{ 36 | matrix, processed for input to \code{compsvd} to finish CA routine 37 | } 38 | \description{ 39 | This function performs the row and column scaling pre-processing operations, prior to SVD, for the corral methods. See \code{\link{corral}} for single matrix correspondence analysis and \code{\link{corralm}} for multi-matrix correspondence analysis. 40 | } 41 | \details{ 42 | 43 | } 44 | \examples{ 45 | mat <- matrix(sample(0:10, 500, replace=TRUE), ncol=25) 46 | mat_corral <- corral_preproc(mat) 47 | corral_output <- compsvd(mat_corral, ncomp = 5) 48 | } 49 | -------------------------------------------------------------------------------- /man/plot_embedding_sce.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot_embedding.R 3 | \name{plot_embedding_sce} 4 | \alias{plot_embedding_sce} 5 | \title{Plot selected PCs from an embedding saved in a SingleCellExperiment object} 6 | \usage{ 7 | plot_embedding_sce( 8 | sce, 9 | which_embedding, 10 | color_attr = NULL, 11 | color_title = color_attr, 12 | ellipse_attr = NULL, 13 | facet_attr = NULL, 14 | ... 
15 | ) 16 | } 17 | \arguments{ 18 | \item{sce}{\code{\link{SingleCellExperiment}} object; contains the embedding within the \code{reducedDim} slot} 19 | 20 | \item{which_embedding}{character; for the embedding to plot} 21 | 22 | \item{color_attr}{character; name of the attribute within \code{colData} to use for assigning colors (in lieu of \code{color_vec} in the \code{\link{plot_embedding}} function)} 23 | 24 | \item{color_title}{character; title to use for colors legend, defaults to the same as \code{color_attr}} 25 | 26 | \item{ellipse_attr}{character; name of the attribute within \code{colData} to use for drawing ellipse(s) (in lieu of \code{ellipse_vec} in the \code{\link{plot_embedding}} function)} 27 | 28 | \item{facet_attr}{character; name of the attribute within \code{colData} to use for faceting (in lieu of \code{facet_vec} in the \code{\link{plot_embedding}} function)} 29 | 30 | \item{...}{additional optional arguments - see \code{\link{plot_embedding}} function for details on other potential arguments: \code{xpc}, \code{ypc}, \code{plot_title}, \code{color_title} (if title is different from \code{color_attr}), \code{ptsize}, \code{saveplot}, \code{plotfn}, \code{showplot}, \code{returngg}, \code{color_pal_vec}, \code{dimname}} 31 | } 32 | \value{ 33 | default none; options to display plot (\code{showplot}), save plot (\code{saveplot}), and/or return \code{\link{ggplot2}} object (\code{returngg}) 34 | } 35 | \description{ 36 | Plot selected PCs from an embedding saved in a SingleCellExperiment object 37 | } 38 | \examples{ 39 | library(DuoClustering2018) 40 | library(SingleCellExperiment) 41 | sce <- sce_full_Zhengmix4eq()[1:100,sample(1:3500,100,replace = FALSE)] 42 | colData(sce)$Method <- matrix(sample(c('Method1','Method2'),100,replace = TRUE)) 43 | sce <- corralm(sce, splitby = 'Method') 44 | 45 | # to plot and show only 46 | plot_embedding_sce(sce = sce, 47 | which_embedding = 'corralm', 48 | xpc = 1, 49 | plot_title = 'corralm: PC1 by PC2', 50 | color_attr = "Method", 51 | ellipse_attr = 'phenoid', 52 | saveplot = FALSE) 53 | 54 | # to return ggplot2 object and display, but not save 55 | corralm_ggplot <- plot_embedding_sce(sce = sce, 56 | which_embedding = 'corralm', 57 | xpc = 1, 58 | plot_title = 'corralm: PC1 by PC2', 59 | color_attr = 'Method', 60 | ellipse_attr = 'phenoid', 61 | returngg = TRUE, 62 | saveplot = FALSE) 63 | 64 | 65 | } 66 | -------------------------------------------------------------------------------- /R/scal_var.R: -------------------------------------------------------------------------------- 1 | #' Generate a matrix of the scaled variance values 2 | #' 3 | #' @param inp corralm object or matrix; embedding to compute scaled variances 4 | #' @param batchvec vector; batch labels (can be numeric or char). Defaults to `NULL`, which is appropriate for using a corralm object. If using an embedding matrix for inp, then this argument must be given and length must correspond to number of rows in `inp`. 
5 | #' 6 | #' @return matrix of the scaled variance values by PC (batches in rows; PCs in columns) 7 | #' @export 8 | #' 9 | #' @examples 10 | #' dat <- matrix(rnorm(5000), ncol = 50) 11 | #' bv <- rep(seq(3),c(10,30,60)) 12 | #' scal_var_mat(dat, bv) 13 | scal_var_mat <- function(inp, batchvec = NULL){ 14 | if('corralm' %in% class(inp)){ 15 | splitvec <- rep(rownames(inp$batch_sizes),inp$batch_sizes[,2]) 16 | emb <- inp$v 17 | } 18 | else if(!is.null(batchvec)){ 19 | if(length(batchvec) != nrow(inp)){ 20 | stop('batchvec must be same length as number of rows in inp') 21 | } 22 | emb <- inp 23 | splitvec <- batchvec 24 | } 25 | else{ 26 | stop('Please provide a valid input: either corralm object or embedding with batchvec arg') 27 | } 28 | sepemb <- apply(emb, MARGIN = 2, FUN = split, f = splitvec) 29 | sep_var <- lapply(sepemb, FUN = function(x) rbind(unlist(lapply(x, var)))) 30 | sv_mat <- Reduce(cbind,lapply(sep_var, t)) 31 | sv_mat[,1] 32 | 33 | overall_var <- apply(emb, MARGIN = 2, FUN = var) 34 | 35 | sv_mat <- sweep(sv_mat, MARGIN = 2, STATS = overall_var, FUN = '/') 36 | sv_mat 37 | } 38 | 39 | #' Generate a scaled variance plot for an integrative embedding 40 | #' 41 | #' @param pcs numeric; vector of which PCs should be shown. Defaults to 1:3 42 | #' @param plot_subtitle string; the text that should show in the subtitle for the plot. defaults to NULL 43 | #' @inheritParams scal_var_mat 44 | #' @inheritParams plot_embedding 45 | #' 46 | #' @import ggplot2 47 | #' @import ggthemes 48 | #' @import reshape2 49 | #' @return N/A or a ggplot object 50 | #' @export 51 | #' 52 | #' @importFrom stats quantile var 53 | #' 54 | #' @examples 55 | #' dat <- matrix(rnorm(10000), ncol = 50) 56 | #' bv <- rep(seq(4),c(10,30,60,100)) 57 | #' scal_var(dat,bv, pcs = seq(4)) 58 | scal_var <- function(inp, batchvec = NULL, pcs = seq(3), returngg = FALSE, showplot = TRUE, plot_subtitle = NULL){ 59 | svmat <- scal_var_mat(inp, batchvec) 60 | svmelt <- reshape2::melt(svmat[,pcs]) 61 | colnames(svmelt) <- c('Batch','PC','sv') 62 | svmelt$Batch <- as.factor(svmelt$Batch) 63 | 64 | svmelt$PC_ind <- svmelt$PC + seq(length(svmelt$PC)) 65 | 66 | pc_seplines <- seq(2,max(svmelt$PC_ind))[-which(seq(2,max(svmelt$PC_ind)) %in% svmelt$PC_ind)] 67 | pc_seplines <- c(pc_seplines, max(svmelt$PC_ind)+1) 68 | pc_lablocs <- pc_seplines - nrow(svmat)/2 69 | 70 | ggobj <- ggplot(svmelt, aes(x = PC_ind, y = sv, colour = Batch, group = Batch)) + 71 | geom_hline(yintercept=1, color = 'gray') + 72 | geom_segment(aes(x=PC_ind, xend=PC_ind, y=1, yend=sv), color="gray") + 73 | geom_point(size = 2) + ggthemes::scale_color_hc() + # other color option - scale_color_few() 74 | theme_classic() + 75 | geom_vline(xintercept = pc_seplines, color = '#555555') + 76 | labs(x = 'Component', 77 | y = 'Scaled variance by group for top PCs', 78 | title = 'Scaled variance by batch', 79 | subtitle = plot_subtitle, 80 | colour = 'Batch') + 81 | scale_x_continuous(breaks = pc_lablocs, 82 | labels = paste0('PC',pcs)) + 83 | theme(axis.ticks.x = element_blank(), axis.title.x = element_text(size = rel(1.2))) + 84 | ylim(0,NA) 85 | 86 | if(showplot){ 87 | show(ggobj) 88 | } 89 | if(returngg){ 90 | return(ggobj) 91 | } 92 | } 93 | 94 | 95 | -------------------------------------------------------------------------------- /man/plot_embedding.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot_embedding.R 3 | \name{plot_embedding} 4 | 
\alias{plot_embedding} 5 | \title{Plot selected PCs from an embedding} 6 | \usage{ 7 | plot_embedding( 8 | embedding, 9 | xpc = 1, 10 | ypc = xpc + 1, 11 | plot_title = paste0("Dim", xpc, " by Dim", ypc), 12 | color_vec = NULL, 13 | color_title = NULL, 14 | ellipse_vec = NULL, 15 | facet_vec = NULL, 16 | ptsize = 0.8, 17 | saveplot = FALSE, 18 | plotfn = paste(plot_title, xpc, sep = "_"), 19 | showplot = TRUE, 20 | returngg = FALSE, 21 | color_pal_vec = NULL, 22 | dimname = "Dim" 23 | ) 24 | } 25 | \arguments{ 26 | \item{embedding}{matrix or other tabular format where columns correspond to PCs and rows correspond to cells (entries). \code{corral} and \code{corralm} objects are also accepted.} 27 | 28 | \item{xpc}{int; which PC to put on the x-axis (defaults to 1)} 29 | 30 | \item{ypc}{int; which PC to put on the y-axis (defaults to the one after \code{xpc})} 31 | 32 | \item{plot_title}{char; title of plot (defaults to titling based on \code{xpc} and \code{ypc})} 33 | 34 | \item{color_vec}{vector; length should correspond to the number of rows in embedding, and each element of the vector classifies that cell (entry) in the embedding to that particular class, which will be colored the same. (e.g., this could be indicating which batch each cell is from)} 35 | 36 | \item{color_title}{char; what attribute the colors represent} 37 | 38 | \item{ellipse_vec}{vector; length should correspond to the number of rows in embedding, and each element of the vector classifies that cell (entry) in the embedding to that particular class, and elements of the same class will be circled in an ellipse. (e.g., this could be indicating the cell type or cell line; works best for attributes intended to be compact)} 39 | 40 | \item{facet_vec}{vector; length should correspond to the number of rows in embedding, and each element of the vector classifies that cell (entry) in the embedding to that particular class. Plot will be faceted by this attribute.} 41 | 42 | \item{ptsize}{numeric; the size of the points as passed to \code{geom_point()}. Defaults to 0.8.} 43 | 44 | \item{saveplot}{boolean; whether or not to save the plot, defaults \code{FALSE}} 45 | 46 | \item{plotfn}{char; what the filename is to be called. (defaults to making a name based on \code{plot_title} and \code{xpc})} 47 | 48 | \item{showplot}{boolean; whether or not to show the plot, defaults \code{TRUE}} 49 | 50 | \item{returngg}{boolean; whether or not to return a \code{\link{ggplot2}} object, defaults \code{FALSE}} 51 | 52 | \item{color_pal_vec}{char; hex codes for the color palette to be used. Default is to use the ggthemes few for plots with less than 9 colors, and to use/"stretch" pals polychrome if more colors are needed.} 53 | 54 | \item{dimname}{char; the name of the dimensions. 
defaults to "Dim"} 55 | } 56 | \value{ 57 | default none; options to display plot (\code{showplot}), save plot (\code{saveplot}), and/or return \code{\link{ggplot2}} object (\code{returngg}) 58 | } 59 | \description{ 60 | Plot selected PCs from an embedding 61 | } 62 | \examples{ 63 | listofmats <- list(matrix(sample(seq(0,20,1),1000,replace = TRUE),nrow = 20), 64 | matrix(sample(seq(0,20,1),1000,replace = TRUE),nrow = 20)) 65 | corralm_obj <- corralm(listofmats, ncomp = 5) 66 | embed_mat <- corralm_obj$v 67 | cell_type_vec <- sample(c('type1','type2','type3'),100,replace = TRUE) 68 | plot_embedding(embedding = embed_mat, 69 | xpc = 1, 70 | plot_title = 'corralm plot', 71 | color_vec = cell_type_vec, 72 | color_title = 'cell type', 73 | saveplot = FALSE) 74 | 75 | # or, call directly on the corralm object 76 | plot_embedding(corralm_obj) 77 | 78 | } 79 | -------------------------------------------------------------------------------- /R/evaluation.R: -------------------------------------------------------------------------------- 1 | # Functions for evaluating performance with respect to software / computing, as well as results for dimensionality reduction and batch integration. 2 | 3 | #' @keywords internal 4 | .cumtotal <- function(vals, ref){ 5 | return(sum(vals < ref)) 6 | } 7 | 8 | #' Observations --> discrete probabilities 9 | #' 10 | #' usage: 11 | #' embedding <- matrix(sample(x = seq(0,10,.1),200, replace = TRUE)) 12 | #' disc_probs <- obs2probs(embedding) 13 | #' 14 | #' @param obs vector of numeric, with the observations 15 | #' @param numbins int, the number of evenly sized bins to discretize the observations to 16 | #' @param startbin numeric, the starting value for the smallest bin. Defaults to taking the minimum of obs 17 | #' @param endbin numeric, the ending value for the largest bin. Defaults to taking the maximum of obs (plus a tiny decimal to ensure full range of obs is captured) 18 | #' 19 | #' @return dataframe, results has rows corresponding to each bin with columns for probability ('prob'), cumulative frequency ('cumfreq'), and frequency ('freq') of observations falling into that bin. 
The 'bins' column indicates the end of the bin (start is the preceding column) 20 | #' @keywords internal 21 | #' 22 | obs2probs <- function(obs, numbins = 100, startbin = min(obs), endbin = max(obs) + .00001){ 23 | bins <- seq(from = startbin, to = endbin, length.out = numbins) 24 | result <- data.frame(bins) 25 | result[1,'cumfreq'] <- 0 26 | result[1, 'freq'] <- 0 27 | for (ind in seq(2,numbins,1)){ 28 | cumsum <- .cumtotal(obs, bins[ind]) 29 | result[ind, 'cumfreq'] <- cumsum 30 | result[ind, 'freq'] <- cumsum - result[ind - 1, 'cumfreq'] 31 | } 32 | result$probs <- result$freq / length(obs) 33 | return(result) 34 | } 35 | 36 | #' @keywords internal 37 | .make_costmat <- function(matdim, mincost = 1, maxcost = 10){ 38 | abval_dif <- function(x,y) {return(abs(x-y))} 39 | costmat <- matrix(seq(mincost, maxcost,length.out = matdim), matdim, matdim) 40 | costmat <- sweep(costmat, 41 | MARGIN = 2, 42 | STATS = seq(mincost, maxcost, length.out = matdim), 43 | FUN = abval_dif) 44 | return(costmat) 45 | } 46 | 47 | 48 | #' Earthmover distance (and general Wasserstein distance) 49 | #' 50 | #' i.e., wasserstein distance with L1 (p_param = 1); can also use other penalties > 1 51 | #' (Not technically earthmover distance if using other p_param values) 52 | #' 53 | #' @param batch1 matrix; subset of observations from an embedding correponding to some attribute (e.g., batch or phenotype) 54 | #' @param batch2 matrix; subset of observations from an embedding correponding to some attribute (e.g., batch or phenotype) 55 | #' @param whichdim int; which dimension (i.e., column) from the embeddings is used. defaults on first 56 | #' @param numbins int; number of bins for the probability discretization (defaults to 100) 57 | #' @param p_param int; penalty parameter for general Wasserstein distance. Defaults to 1, which corresonds to earthmover. 58 | #' 59 | #' @return num; the distance 60 | #' @export 61 | #' 62 | #' @importFrom transport wasserstein 63 | #' 64 | #' @examples 65 | #' # To compare distributions of reduced dimension values to assess similarity, 66 | #' # e.g. 
as a metric for batch integration 67 | #' embedding <- matrix(sample(x = seq(0,10,.1),1000, replace = TRUE),ncol = 5) 68 | #' batch <- matrix(sample(c(1,2),200, replace = TRUE)) 69 | #' earthmover_dist(embedding[which(batch == 1),],embedding[which(batch == 2),]) 70 | earthmover_dist <- function(batch1, batch2, whichdim = 1, numbins = 100, p_param = 1){ 71 | minval <- min(min(batch1), min(batch2)) 72 | maxval <- max(max(batch1), max(batch2)) + .00001 73 | df_1 <- obs2probs(obs = batch1[,whichdim], numbins = numbins, startbin = minval, endbin = maxval) 74 | df_2 <- obs2probs(obs = batch2[,whichdim], numbins = numbins, startbin = minval, endbin = maxval) 75 | costmat <- .make_costmat(matdim = numbins) 76 | transport::wasserstein(df_1$probs, df_2$probs, costm = costmat, p = p_param) 77 | } 78 | -------------------------------------------------------------------------------- /man/corral.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/corral.R 3 | \name{corral_mat} 4 | \alias{corral_mat} 5 | \alias{corral_sce} 6 | \alias{corral} 7 | \alias{print.corral} 8 | \title{corral: Correspondence analysis on a single matrix} 9 | \usage{ 10 | corral_mat( 11 | inp, 12 | method = c("irl", "svd"), 13 | ncomp = 30, 14 | row.w = NULL, 15 | col.w = NULL, 16 | rtype = c("standardized", "indexed", "hellinger", "freemantukey", "pearson"), 17 | vst_mth = c("none", "sqrt", "freemantukey", "anscombe"), 18 | ... 19 | ) 20 | 21 | corral_sce( 22 | inp, 23 | method = c("irl", "svd"), 24 | ncomp = 30, 25 | whichmat = "counts", 26 | fullout = FALSE, 27 | subset_row = NULL, 28 | ... 29 | ) 30 | 31 | corral(inp, ...) 32 | 33 | \method{print}{corral}(x, ...) 34 | } 35 | \arguments{ 36 | \item{inp}{matrix (any type), \code{SingleCellExperiment}, or \code{SummarizedExperiment}. If using \code{SingleCellExperiment} or \code{SummarizedExperiment}, then include the \code{whichmat} argument to specify which slot to use (defaults to \code{counts}).} 37 | 38 | \item{method}{character, the algorithm to be used for svd. Default is irl. Currently supports 'irl' for irlba::irlba or 'svd' for stats::svd} 39 | 40 | \item{ncomp}{numeric, number of components; Default is 30} 41 | 42 | \item{row.w}{numeric vector; the row weights to use in chi-squared scaling. Defaults to `NULL`, in which case row weights are computed from the input matrix.} 43 | 44 | \item{col.w}{numeric vector; the column weights to use in chi-squared scaling. For instance, size factors could be given here. Defaults to `NULL`, in which case column weights are computed from the input matrix.} 45 | 46 | \item{rtype}{character indicating what type of residual should be computed; options are `"indexed"`, `"standardized"` (or `"pearson"` is equivalent), `"freemantukey"`, and `"hellinger"`; defaults to `"standardized"` for \code{\link{corral}} and `"indexed"` for \code{\link{corralm}}. `"indexed"`, `"standardized"`, and `"freemantukey"` compute the respective chi-squared residuals and are appropriate for count data. 
The `"hellinger"` option is appropriate for continuous data.} 47 | 48 | \item{vst_mth}{character indicating whether a variance-stabilizing transform should be applied prior to calculating chi-squared residuals; defaults to `"none"`} 49 | 50 | \item{...}{(additional arguments for methods)} 51 | 52 | \item{whichmat}{character; defaults to \code{counts}, can also use \code{logcounts} or \code{normcounts} if stored in the \code{sce} object} 53 | 54 | \item{fullout}{boolean; whether the function will return the full \code{corral} output as a list, or a SingleCellExperiment; defaults to SingleCellExperiment (\code{FALSE}). To get back the \code{\link{corral_mat}}-style output, set this to \code{TRUE}.} 55 | 56 | \item{subset_row}{numeric, character, or boolean vector; the rows to include in corral, as indices (numeric), rownames (character), or with booleans (same length as the number of rows in the matrix). If this parameter is \code{NULL}, then all rows will be used.} 57 | 58 | \item{x}{(print method) corral object; the list output from \code{corral_mat}} 59 | } 60 | \value{ 61 | When run on a matrix, a list with the correspondence analysis matrix decomposition result: 62 | \describe{ 63 | \item{\code{d}}{a vector of the diagonal singular values of the input \code{mat} (from SVD output)} 64 | \item{\code{u}}{a matrix of with the left singular vectors of \code{mat} in the columns (from SVD output)} 65 | \item{\code{v}}{a matrix of with the right singular vectors of \code{mat} in the columns. When cells are in the columns, these are the cell embeddings. (from SVD output)} 66 | \item{\code{eigsum}}{sum of the eigenvalues for calculating percent variance explained} 67 | \item{\code{SCu and SCv}}{standard coordinates, left and right, respectively} 68 | \item{\code{PCu and PCv}}{principal coordinates, left and right, respectively} 69 | } 70 | 71 | When run on a \code{\link{SingleCellExperiment}}, returns a SCE with the embeddings (PCv from the full corral output) in the \code{reducedDim} slot \code{corral} (default). Also can return the same output as \code{\link{corral_mat}} when \code{fullout} is set to \code{TRUE}. 72 | 73 | For matrix and \code{SummarizedExperiment} input, returns list with the correspondence analysis matrix decomposition result (u,v,d are the raw svd output; SCu and SCv are the standard coordinates; PCu and PCv are the principal coordinates) 74 | 75 | For \code{SummarizedExperiment} input, returns the same as for a matrix. 76 | 77 | . 78 | } 79 | \description{ 80 | corral can be used for dimension reduction to find a set of low-dimensional embeddings for a count matrix. 81 | 82 | \code{corral} is a wrapper for \code{\link{corral_mat}} and \code{\link{corral_sce}}, and can be called on any of the acceptable input types. 
83 | } 84 | \examples{ 85 | mat <- matrix(sample(0:10, 5000, replace=TRUE), ncol=50) 86 | result <- corral_mat(mat) 87 | result <- corral_mat(mat, method = 'irl', ncomp = 5) 88 | 89 | library(DuoClustering2018) 90 | sce <- sce_full_Zhengmix4eq()[1:100,1:100] 91 | result_1 <- corral_sce(sce) 92 | result_2 <- corral_sce(sce, method = 'svd') 93 | result_3 <- corral_sce(sce, method = 'irl', ncomp = 30, whichmat = 'logcounts') 94 | 95 | 96 | library(DuoClustering2018) 97 | sce <- sce_full_Zhengmix4eq()[1:100,1:100] 98 | corral_sce <- corral(sce,whichmat = 'counts') 99 | 100 | mat <- matrix(sample(0:10, 500, replace=TRUE), ncol=25) 101 | corral_mat <- corral(mat, ncomp=5) 102 | 103 | mat <- matrix(sample(1:100, 10000, replace = TRUE), ncol = 100) 104 | corral(mat) 105 | } 106 | -------------------------------------------------------------------------------- /man/corralm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/corralm.R 3 | \name{corralm_matlist} 4 | \alias{corralm_matlist} 5 | \alias{corralm_sce} 6 | \alias{corralm} 7 | \alias{print.corralm} 8 | \title{Multi-table correspondence analysis (list of matrices)} 9 | \usage{ 10 | corralm_matlist( 11 | matlist, 12 | method = c("irl", "svd"), 13 | ncomp = 30, 14 | rtype = c("indexed", "standardized", "hellinger", "freemantukey", "pearson"), 15 | vst_mth = c("none", "sqrt", "freemantukey", "anscombe"), 16 | rw_contrib = NULL, 17 | ... 18 | ) 19 | 20 | corralm_sce( 21 | sce, 22 | splitby, 23 | method = c("irl", "svd"), 24 | ncomp = 30, 25 | whichmat = "counts", 26 | fullout = FALSE, 27 | rw_contrib = NULL, 28 | ... 29 | ) 30 | 31 | corralm(inp, whichmat = "counts", fullout = FALSE, ...) 32 | 33 | \method{print}{corralm}(x, ...) 34 | } 35 | \arguments{ 36 | \item{matlist}{(for \code{corralm_matlist}) list of input matrices; input matrices should be counts (raw or log). Matrices should be aligned row-wise by common features (either by sample or by gene)} 37 | 38 | \item{method}{character, the algorithm to be used for svd. Default is irl. Currently supports 'irl' for irlba::irlba or 'svd' for stats::svd} 39 | 40 | \item{ncomp}{numeric, number of components; Default is 30} 41 | 42 | \item{rtype}{character indicating what type of residual should be computed; options are `"indexed"`, `"standardized"` (or `"pearson"` is equivalent), `"freemantukey"`, and `"hellinger"`; defaults to `"standardized"` for \code{\link{corral}} and `"indexed"` for \code{\link{corralm}}. `"indexed"`, `"standardized"`, and `"freemantukey"` compute the respective chi-squared residuals and are appropriate for count data. The `"hellinger"` option is appropriate for continuous data.} 43 | 44 | \item{vst_mth}{character indicating whether a variance-stabilizing transform should be applied prior to calculating chi-squared residuals; defaults to `"none"`} 45 | 46 | \item{rw_contrib}{numeric vector, same length as the matlist. Indicates the weight that each dataset should contribute to the row weights. When set to NULL the row weights are *not* combined and each matrix is scaled independently (i.e., using their observed row weights, respectively). When set to a vector of all the same values, this is equivalent to taking the mean. Another option is to the number of observations per matrix to create a weighted mean. Regardless of input scale, row weights for each table must sum to 1 and thus are scaled. 
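For example, for two batches with 100 and 400 observations respectively, `rw_contrib = c(100, 400)` (values shown only for illustration) would form a weighted mean of the observed row weights, weighted by batch size.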
When this option is specified (i.e., not `NULL`), the `rtype` argument will automatically be set to `standardized`, and whatever argument is given will be ignored.} 47 | 48 | \item{...}{(additional arguments for methods)} 49 | 50 | \item{sce}{(for \code{corralm_sce}) SingleCellExperiment; containing the data to be integrated. Default is to use the counts, and to include all of the data in the integration. These can be changed by passing additional arguments. See \code{\link{sce2matlist}} function documentation for list of available parameters.} 51 | 52 | \item{splitby}{character; name of the attribute from \code{colData} that should be used to separate the SCE.} 53 | 54 | \item{whichmat}{char, when using SingleCellExperiment or other SummarizedExperiment, can be specified. default is 'counts'.} 55 | 56 | \item{fullout}{boolean; whether the function will return the full \code{corralm} output as a list, or a SingleCellExperiment; defaults to SingleCellExperiment (\code{FALSE}). To get back the \code{\link{corralm_matlist}}-style output, set this to \code{TRUE}.} 57 | 58 | \item{inp}{list of matrices (any type), a \code{SingleCellExperiment}, list of \code{SingleCellExperiment}s, list of \code{SummarizedExperiment}s, or \code{MultiAssayExperiment}. If using \code{SingleCellExperiment} or \code{SummarizedExperiment}, then include the \code{whichmat} argument to specify which slot to use (defaults to \code{counts}). Additionally, if it is one \code{SingleCellExperiment}, then it is also necessary to include the \code{splitby} argument to specify the batches. For a \code{MultiAssayExperiment}, it will take the intersect of the features across all the assays, and use those to match the matrices; to use a different subset, select desired subsets then call \code{corral}} 59 | 60 | \item{x}{(print method) corralm object; the list output from \code{corralm_matlist}} 61 | } 62 | \value{ 63 | When run on a list of matrices, a list with the correspondence analysis matrix decomposition result, with indices corresponding to the concatenated matrices (in order of the list): 64 | \describe{ 65 | \item{\code{d}}{a vector of the diagonal singular values of the input \code{mat} (from SVD output)} 66 | \item{\code{u}}{a matrix of with the left singular vectors of \code{mat} in the columns (from SVD output)} 67 | \item{\code{v}}{a matrix of with the right singular vectors of \code{mat} in the columns. When cells are in the columns, these are the cell embeddings. (from SVD output)} 68 | \item{\code{eigsum}}{sum of the eigenvalues for calculating percent variance explained} 69 | } 70 | 71 | For SingleCellExperiment input, returns the SCE with embeddings in the reducedDim slot 'corralm' 72 | 73 | For a list of \code{\link{SingleCellExperiment}}s, returns a list of the SCEs with the embeddings in the respective \code{reducedDim} slot 'corralm' 74 | 75 | . 76 | } 77 | \description{ 78 | This multi-table adaptation of correpondence analysis applies the same scaling technique and enables data alignment by finding a set of embeddings for each dataset within shared latent space. 79 | } 80 | \details{ 81 | \code{corralm} is a wrapper for \code{\link{corralm_matlist}} and \code{\link{corralm_sce}}, and can be called on any of the acceptable input types (see \code{inp} below). 
82 | } 83 | \examples{ 84 | listofmats <- list(matrix(sample(seq(0,20,1),1000,replace = TRUE),nrow = 25), 85 | matrix(sample(seq(0,20,1),1000,replace = TRUE),nrow = 25)) 86 | result <- corralm_matlist(listofmats) 87 | library(DuoClustering2018) 88 | library(SingleCellExperiment) 89 | sce <- sce_full_Zhengmix4eq()[1:100,sample(1:3500,100,replace = FALSE)] 90 | colData(sce)$Method <- matrix(sample(c('Method1','Method2'),100,replace = TRUE)) 91 | result <- corralm_sce(sce, splitby = 'Method') 92 | 93 | 94 | listofmats <- list(matrix(sample(seq(0,20,1),1000,replace = TRUE),nrow = 20), 95 | matrix(sample(seq(0,20,1),1000,replace = TRUE),nrow = 20)) 96 | corralm(listofmats) 97 | 98 | library(DuoClustering2018) 99 | library(SingleCellExperiment) 100 | sce <- sce_full_Zhengmix4eq()[seq(1,100,1),sample(seq(1,3500,1),100,replace = FALSE)] 101 | colData(sce)$Method <- matrix(sample(c('Method1','Method2'),100,replace = TRUE)) 102 | result <- corralm(sce, splitby = 'Method') 103 | 104 | # default print method for corralm objects 105 | } 106 | -------------------------------------------------------------------------------- /vignettes/corralm_alignment.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Alignment & batch integration of single cell data with corralm" 3 | author: 4 | - name: Lauren Hsu 5 | affiliation: Harvard TH Chan School of Public Health; Dana-Farber Cancer Institute 6 | - name: Aedin Culhane 7 | affiliation: Department of Data Sciences, Dana-Farber Cancer Institute, Department of Biostatistics, Harvard TH Chan School of Public Health 8 | date: "4/28/2020" 9 | output: 10 | BiocStyle::html_document: 11 | toc_float: true 12 | BiocStyle::pdf_document: default 13 | package: BiocStyle 14 | bibliography: ref.bib 15 | vignette: | 16 | %\VignetteIndexEntry{alignment with corralm} 17 | %\VignetteEngine{knitr::rmarkdown} 18 | %\VignetteEncoding{UTF-8} 19 | --- 20 | --- 21 | 22 | ```{r setup, include=FALSE} 23 | knitr::opts_chunk$set(echo = TRUE) 24 | library(gridExtra) 25 | ``` 26 | 27 | # Introduction 28 | 29 | Data from different experimental platforms and/or batches exhibit systematic 30 | variation -- i.e., batch effects. Therefore, when conducting joint analysis 31 | of data from different batches, a key first step is to align the datasets. 32 | 33 | `corralm` is a multi-table adaptation of correspondence analysis designed 34 | for single-cell data, which applies multi-dimensional optimized scaling and 35 | matrix factorization to compute integrated embeddings across the datasets. 36 | These embeddings can then be used in downstream analyses, such as clustering, 37 | cell type classification, trajectory analysis, etc. 38 | 39 | See the vignette for `corral` for dimensionality reduction of a single matrix of single-cell data. 40 | 41 | # Loading packages and data 42 | 43 | We will use the `SCMixology` datasets from the `r Biocpkg('CellBench')` package [@scmix]. 44 | 45 | ```{r, message = FALSE} 46 | library(corral) 47 | library(SingleCellExperiment) 48 | library(ggplot2) 49 | library(CellBench) 50 | library(MultiAssayExperiment) 51 | 52 | scmix_dat <- load_all_data()[1:3] 53 | 54 | ``` 55 | 56 | These datasets include a mixture of three lung cancer cell lines: 57 | 58 | * H2228 59 | * H1975 60 | * HCC827 61 | 62 | which was sequenced using three platforms: 63 | 64 | * 10X 65 | * CELseq2 66 | * Dropseq 67 | 68 | ```{r} 69 | scmix_dat 70 | ``` 71 | 72 | Each sequencing platform captures a different set of genes. 
73 | In order to apply this method, the matrices need to be matched by features (i.e., genes). 74 | We'll find the intersect of the three datasets, then subset for that as we proceed. 75 | 76 | First, we will prepare the data by: 77 | 1. adding to the colData the sequencing platform (`Method` in `colData` for each SCE), and 78 | 2. subsetting by the intersect of the genes. 79 | 80 | ```{r} 81 | platforms <- c('10X','CELseq2','Dropseq') 82 | for(i in seq_along(scmix_dat)) { 83 | colData(scmix_dat[[i]])$Method<- rep(platforms[i], ncol(scmix_dat[[i]])) 84 | } 85 | 86 | scmix_mae <- as(scmix_dat,'MultiAssayExperiment') 87 | scmix_dat <- as.list(MultiAssayExperiment::experiments(MultiAssayExperiment::intersectRows(scmix_mae))) 88 | 89 | ``` 90 | 91 | ![](corralm_IO.png) 92 | 93 | `corralm` can be applied to the following types of objects: 94 | 95 | * **a single `r Biocpkg('SingleCellExperiment')`** requires specifying `splitby` (also see documentation of `corralm_matlist` for additional optional arguments that can be passed), which is a character string for the attribute in `colData` that is tracking the batches. In our case, this would be the "Method" attribute we just added. The output from this type of input is the same `SingleCellExperiment`, with the result added to the `reducedDim` slot under `corralm`. 96 | * **a list of `r Biocpkg('SingleCellExperiment')`** does not require any specific arguments. The output is a list of the input `SingleCellExperiment`s, with the result added to the `reducedDim` slot under `corralm`. 97 | * **a list of matrices** (or other matrix-like objects: matrix, Matrix, tibble, data.frame, etc.) also does not require specific arguments. The output will be a concatenated list of SVD output matrices (`u`,`d`,`v`) where `v` contains a concatenated vector of the embeddings for the cells 98 | * **a list of `r Biocpkg('SummarizedExperiment')`s** does not require specific arguments. The output is the same as for a list of matrices. 99 | * **`r Biocpkg('MultiAssayExperiment')` or `ExperimentList`** does not require any specific arguments. `corralm` will identify the intersect of the rows, and use these to match the matrices. The output will be the same as for a list of matrices. 100 | 101 | For purposes of illustration, we will walk through using `corralm` with a single SCE, and with a list of matrices. 102 | 103 | # `corralm` on a single `r Biocpkg('SingleCellExperiment')` 104 | 105 | First, setting up the data to demonstrate this: 106 | ```{r} 107 | colData(scmix_dat[[2]])$non_ERCC_percent <- NULL 108 | # need to remove this column so the objects can be concatenated 109 | 110 | scmix_sce <- SingleCellExperiment::cbind(scmix_dat[[1]], 111 | scmix_dat[[2]], 112 | scmix_dat[[3]]) 113 | ``` 114 | 115 | Running `corralm`, and specifying the `splitby` argument: 116 | (Note that the default is for the `counts` matrix to be used. 117 | To change this default, use the `whichmat` argument.) 
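For example, `corralm(scmix_sce, splitby = 'Method', whichmat = 'logcounts')` would use a `logcounts` assay instead; this call is shown only for illustration, since no such assay is computed in this vignette.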
118 | ```{r} 119 | scmix_sce <- corralm(scmix_sce, splitby = 'Method') 120 | ``` 121 | 122 | Visualizing the results: 123 | ```{r} 124 | plot_embedding_sce(sce = scmix_sce, 125 | which_embedding = 'corralm', 126 | color_attr = 'Method', 127 | color_title = 'platform', 128 | ellipse_attr = 'cell_line', 129 | plot_title = 'corralm on scmix', 130 | saveplot = FALSE) 131 | ``` 132 | 133 | # `corralm` on a list of matrices 134 | 135 | Again, preparing the data to be in this input format: 136 | ```{r} 137 | scmix_matlist <- sce2matlist(sce = scmix_sce, 138 | splitby = 'Method', 139 | whichmat = 'counts') 140 | 141 | # for plotting purposes later, while we're here 142 | platforms <- colData(scmix_sce)$Method 143 | cell_lines <- colData(scmix_sce)$cell_line 144 | ``` 145 | 146 | Running corralm and visualizing output... 147 | (the embeddings are in the `v` matrix because these data are matched by genes 148 | in the rows and have cells in the columns; if this were reversed, with cells 149 | in the rows and genes/features in the column, then the cell embeddings would 150 | instead be in the `u` matrix.) 151 | ```{r} 152 | scmix_corralm <- corralm(scmix_matlist) 153 | scmix_corralm 154 | plot_embedding(embedding = scmix_corralm$v, 155 | plot_title = 'corralm on scmix', 156 | color_vec = platforms, 157 | color_title = 'platform', 158 | ellipse_vec = cell_lines, 159 | saveplot = FALSE) 160 | ``` 161 | 162 | As expected, we get the same results as above. (Note that in performing SVD, 163 | the direction of the axes doesn't matter and they may be flipped between runs, 164 | as `corral` and `corralm` use `irlba` to perform fast approximation.) 165 | 166 | # Scaled variance plots to evaluate integration 167 | 168 | Scaled variance plots provide a simple and fast visual summary of the integration of embeddings. It can be called using the `scal_var` function, and works on both `corralm` objects and custom embeddings (with a vector indicating batch). 169 | 170 | When integrating embedding representations across batches, measures for cluster evaluation are effective for assessing group compactness and recovery of cell populations via clustering. However, they do not directly assess how well dataset embeddings are integrated across batches. To focus specifically on batch integration, we developed and applied a heuristic scaled variance metric, which captures the relative dispersion of each batch with respect to the entire dataset. The scaled variance of component dimension $d^*$ for the subset of observations in batch $b^*$, $SV_{b^*,d}$, is computed with: 171 | $$SV_{b^*,d} = \frac{\mathrm{Var}(\mathbf{E_{b=b^*,d=d^*}})}{\mathrm{Var}(\mathbf{E_{d=d^*}})}$$ 172 | where $\mathbf{E}$ is the matrix of embeddings, and $b$ indexes the rows (observations by batch) while $d$ indexes the columns to indicate which component dimension to evaluate. 173 | 174 | ```{r} 175 | scal_var(scmix_corralm) 176 | ``` 177 | 178 | When the datasets are well integrated, SV values for each batch are close to 1, indicating that each batch's subset has similar dispersion as compared to the entire embedding. In contrast, if there is poorer integration, the scaled variance values will be more extreme away from 1 because the variance within batches will differ more from the variance overall. 
This metric is appropriate when the types of cells represented in different datasets are expected to be similar, but cannot account for situations where the expected distribution of cell types (and therefore, embeddings) is fundamentally different between batches. 179 | 180 | # Session information 181 | 182 | ```{r} 183 | sessionInfo() 184 | ``` 185 | 186 | # References 187 | -------------------------------------------------------------------------------- /R/corralm.R: -------------------------------------------------------------------------------- 1 | #' Multi-table correspondence analysis (list of matrices) 2 | #' 3 | #' @param matlist (for \code{corralm_matlist}) list of input matrices; input matrices should be counts (raw or log). Matrices should be aligned row-wise by common features (either by sample or by gene) 4 | #' @param rw_contrib numeric vector, same length as the matlist. Indicates the weight that each dataset should contribute to the row weights. When set to NULL the row weights are *not* combined and each matrix is scaled independently (i.e., using their observed row weights, respectively). When set to a vector of all the same values, this is equivalent to taking the mean. Another option is to the number of observations per matrix to create a weighted mean. Regardless of input scale, row weights for each table must sum to 1 and thus are scaled. When this option is specified (i.e., not `NULL`), the `rtype` argument will automatically be set to `standardized`, and whatever argument is given will be ignored. 5 | #' @inheritParams compsvd 6 | #' @inheritParams corral_preproc 7 | #' 8 | #' @return When run on a list of matrices, a list with the correspondence analysis matrix decomposition result, with indices corresponding to the concatenated matrices (in order of the list): 9 | #' \describe{ 10 | #' \item{\code{d}}{a vector of the diagonal singular values of the input \code{mat} (from SVD output)} 11 | #' \item{\code{u}}{a matrix of with the left singular vectors of \code{mat} in the columns (from SVD output)} 12 | #' \item{\code{v}}{a matrix of with the right singular vectors of \code{mat} in the columns. When cells are in the columns, these are the cell embeddings. (from SVD output)} 13 | #' \item{\code{eigsum}}{sum of the eigenvalues for calculating percent variance explained} 14 | #' } 15 | #' @rdname corralm 16 | #' @export 17 | #' 18 | #' @importFrom irlba irlba 19 | #' @importFrom Matrix Matrix rowSums colSums 20 | #' @importClassesFrom Matrix dgCMatrix 21 | #' 22 | #' @examples 23 | #' listofmats <- list(matrix(sample(seq(0,20,1),1000,replace = TRUE),nrow = 25), 24 | #' matrix(sample(seq(0,20,1),1000,replace = TRUE),nrow = 25)) 25 | #' result <- corralm_matlist(listofmats) 26 | corralm_matlist <- function(matlist, method = c('irl','svd'), ncomp = 30, rtype = c('indexed','standardized','hellinger','freemantukey','pearson'), vst_mth = c('none','sqrt','freemantukey','anscombe'), rw_contrib = NULL, ...){ 27 | method <- match.arg(method, c('irl','svd')) 28 | rtype <- match.arg(rtype, c('indexed','standardized','hellinger','freemantukey','pearson')) 29 | vst_mth <- match.arg(vst_mth, c('none','sqrt','freemantukey','anscombe')) 30 | 31 | .check_dims(matlist) 32 | if(is.null(rw_contrib)){ 33 | preproc_mats <- lapply(matlist, corral_preproc, rtype = rtype, vst_mth = vst_mth, ...) 
34 | } 35 | else{ 36 | rw_contrib <- .check_rw_contrib(matlist, rw_contrib) 37 | 38 | ws <- lapply(matlist, get_weights) 39 | rwlist <- lapply(ws, '[[', 'row.w') 40 | row.ws <- Reduce(cbind, rwlist) 41 | row.ws <- row.ws %*% rw_contrib 42 | comp_rw <- rowSums(row.ws) / sum(row.ws) 43 | 44 | preproc_mats <- lapply(matlist, corral_preproc, rtype = 'standardized', row.w = comp_rw, vst_mth = vst_mth, ...) 45 | if(rtype != 'standardized'){ 46 | cat('\nSwitched residual type to standardized; using shared row weights.\n') 47 | } 48 | } 49 | concatted <- list2mat(matlist = preproc_mats, direction = 'c') 50 | result <- compsvd(concatted, method = method, ncomp = ncomp) 51 | result[['batch_sizes']] <- .batch_sizes(matlist) 52 | class(result) <- c(class(result),"corralm") 53 | return(result) 54 | } 55 | 56 | #' Multi-table correspondence analysis (SingleCellExperiment) 57 | #' 58 | #' @param sce (for \code{corralm_sce}) SingleCellExperiment; containing the data to be integrated. Default is to use the counts, and to include all of the data in the integration. These can be changed by passing additional arguments. See \code{\link{sce2matlist}} function documentation for list of available parameters. 59 | #' @param splitby character; name of the attribute from \code{colData} that should be used to separate the SCE. 60 | #' @param whichmat character; defaults to \code{counts}, can also use \code{logcounts} or \code{normcounts} if stored in the \code{sce} object 61 | #' @param fullout boolean; whether the function will return the full \code{corralm} output as a list, or a SingleCellExperiment; defaults to SingleCellExperiment (\code{FALSE}). To get back the \code{\link{corralm_matlist}}-style output, set this to \code{TRUE}. 62 | #' @inheritParams compsvd 63 | #' @param ... (additional arguments for methods) 64 | #' 65 | #' @return For SingleCellExperiment input, returns the SCE with embeddings in the reducedDim slot 'corralm' 66 | #' @rdname corralm 67 | #' @export 68 | #' 69 | #' @importFrom irlba irlba 70 | #' @importFrom Matrix Matrix rowSums colSums 71 | #' @importFrom SingleCellExperiment reducedDim 72 | #' @importClassesFrom Matrix dgCMatrix 73 | #' @importClassesFrom SingleCellExperiment SingleCellExperiment 74 | #' 75 | #' @examples 76 | #' library(DuoClustering2018) 77 | #' library(SingleCellExperiment) 78 | #' sce <- sce_full_Zhengmix4eq()[1:100,sample(1:3500,100,replace = FALSE)] 79 | #' colData(sce)$Method <- matrix(sample(c('Method1','Method2'),100,replace = TRUE)) 80 | #' result <- corralm_sce(sce, splitby = 'Method') 81 | #' 82 | #' 83 | corralm_sce <- function(sce, splitby, method = c('irl','svd'), ncomp = 30, whichmat = 'counts', fullout = FALSE, rw_contrib = NULL, ...){ 84 | method <- match.arg(method, c('irl','svd')) 85 | if(missing(splitby)) {stop('If performing multi-table analysis with a single SCE, the splitby variable must be specified. \nUse corral to analyze as a single table.')} 86 | mat_list <- sce2matlist(sce, splitby = splitby, whichmat = whichmat) 87 | svd_output <- corralm_matlist(mat_list, method = method, ncomp = ncomp, rw_contrib = rw_contrib, ...) 
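  # fullout = TRUE returns the full decomposition list as-is;
  # otherwise the cell embeddings (v) are stored in reducedDim(sce, 'corralm') below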
88 | if(fullout){ 89 | class(svd_output) <- c(class(svd_output),'corralm') 90 | return(svd_output) 91 | } 92 | else{ 93 | ind_order <- .indsbysplitby(sce, splitby) 94 | SingleCellExperiment::reducedDim(sce, 'corralm') <- svd_output$v[ind_order,] 95 | return(sce) 96 | } 97 | } 98 | 99 | #' Multi-table adaptation of correspondence analysis 100 | #' 101 | #' This multi-table adaptation of correpondence analysis applies the same scaling technique and enables data alignment by finding a set of embeddings for each dataset within shared latent space. 102 | #' 103 | #' \code{corralm} is a wrapper for \code{\link{corralm_matlist}} and \code{\link{corralm_sce}}, and can be called on any of the acceptable input types (see \code{inp} below). 104 | #' 105 | #' @param inp list of matrices (any type), a \code{SingleCellExperiment}, list of \code{SingleCellExperiment}s, list of \code{SummarizedExperiment}s, or \code{MultiAssayExperiment}. If using \code{SingleCellExperiment} or \code{SummarizedExperiment}, then include the \code{whichmat} argument to specify which slot to use (defaults to \code{counts}). Additionally, if it is one \code{SingleCellExperiment}, then it is also necessary to include the \code{splitby} argument to specify the batches. For a \code{MultiAssayExperiment}, it will take the intersect of the features across all the assays, and use those to match the matrices; to use a different subset, select desired subsets then call \code{corral} 106 | #' @param whichmat char, when using SingleCellExperiment or other SummarizedExperiment, can be specified. default is 'counts'. 107 | #' @param ... (additional arguments for methods) 108 | #' @inheritParams corralm_matlist 109 | #' @inheritParams corralm_sce 110 | #' 111 | #' @return For a list of \code{\link{SingleCellExperiment}}s, returns a list of the SCEs with the embeddings in the respective \code{reducedDim} slot 'corralm' 112 | #' @rdname corralm 113 | #' @export 114 | #' 115 | #' @importFrom irlba irlba 116 | #' @importFrom Matrix Matrix rowSums colSums 117 | #' @importFrom methods is 118 | #' @importFrom SingleCellExperiment reducedDim 119 | #' @importFrom MultiAssayExperiment experiments intersectRows assays 120 | #' @importFrom SummarizedExperiment assay 121 | #' @importClassesFrom Matrix dgCMatrix 122 | #' @importClassesFrom SingleCellExperiment SingleCellExperiment 123 | #' @importClassesFrom MultiAssayExperiment MultiAssayExperiment 124 | #' 125 | #' @examples 126 | #' listofmats <- list(matrix(sample(seq(0,20,1),1000,replace = TRUE),nrow = 20), 127 | #' matrix(sample(seq(0,20,1),1000,replace = TRUE),nrow = 20)) 128 | #' corralm(listofmats) 129 | #' 130 | #' library(DuoClustering2018) 131 | #' library(SingleCellExperiment) 132 | #' sce <- sce_full_Zhengmix4eq()[seq(1,100,1),sample(seq(1,3500,1),100,replace = FALSE)] 133 | #' colData(sce)$Method <- matrix(sample(c('Method1','Method2'),100,replace = TRUE)) 134 | #' result <- corralm(sce, splitby = 'Method') 135 | #' 136 | corralm <- function(inp, whichmat = 'counts', fullout = FALSE,...){ 137 | if(is(inp,'SingleCellExperiment')){ 138 | corralm_sce(sce = inp, ...) 139 | } 140 | else if (is(inp,'ExperimentList')){ 141 | matlist <- as.list(MultiAssayExperiment::assays(MultiAssayExperiment::intersectRows(inp))) 142 | corralm_matlist(matlist = matlist, ...) 143 | } 144 | else if (is(inp,'MultiAssayExperiment')){ 145 | matlist <- as.list(MultiAssayExperiment::assays(MultiAssayExperiment::experiments(MultiAssayExperiment::intersectRows(inp)))) 146 | corralm_matlist(matlist = matlist, ...) 
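    # assays are matched on their shared features via intersectRows before factorization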
147 | } 148 | else if(is(inp,'list') | is(inp,'List')){ 149 | if(all_are(inp,'SingleCellExperiment')){ 150 | matlist <- lapply(inp, SummarizedExperiment::assay, whichmat) 151 | res <- corralm_matlist(matlist = matlist, ...) 152 | if(fullout) { 153 | class(res) <- c(class(res),"corralm") 154 | return(res) 155 | } 156 | else{ 157 | add_embeddings2scelist(scelist = inp, embeddings = res$v) 158 | } 159 | } 160 | else if (all_are(inp,'SummarizedExperiment')){ 161 | matlist <- lapply(inp, SummarizedExperiment::assay, whichmat) 162 | corralm_matlist(matlist = matlist, ...) 163 | } 164 | else{ 165 | corralm_matlist(matlist = inp, ...) 166 | } 167 | } 168 | } 169 | 170 | #' Print method for S3 object corralm 171 | #' 172 | #' @param x (print method) corralm object; the list output from \code{corralm_matlist} 173 | #' 174 | #' @rdname corralm 175 | #' 176 | #' @return . 177 | #' @export 178 | #' 179 | #' @examples 180 | #' # default print method for corralm objects 181 | print.corralm <- function(x,...){ 182 | inp <- x 183 | pct_var_exp <- inp$pct_var_exp 184 | ncomp <- length(inp$d) 185 | cat('corralm output summary==========================================\n') 186 | cat(' Output "list" includes SVD output (u, d, v) & a table of the\n') 187 | cat(' dimensions of the input matrices (batch_sizes)\n') 188 | cat('Variance explained----------------------------------------------\n') 189 | print(round(pct_var_exp[,seq(1,min(8,ncomp),1)],2)) 190 | cat('\n') 191 | cat('Dimensions of output elements-----------------------------------\n') 192 | cat(' Singular values (d) :: ') 193 | cat(ncomp) 194 | cat('\n Left singular vectors (u) :: ') 195 | cat(dim(inp$u)) 196 | cat('\n Right singular vectors (v) :: ') 197 | cat(dim(inp$v)) 198 | cat('\n See corralm help for details on each output element.') 199 | cat('\n\n') 200 | cat('Original batches & sizes (in order)-----------------------------') 201 | cat('\n ') 202 | cat(paste0(' ',rownames(inp$batch_sizes),' :: ',inp$batch_sizes[,2],'\n')) 203 | cat('\n Use plot_embedding to visualize; see docs for details.\n') 204 | cat('================================================================\n') 205 | } 206 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | #' Set na to 0 2 | #' 3 | #' @param x matrix of values for which na values should be changed to 0 4 | #' 5 | #' @return matrix, where na values are set to 0 6 | #' @export 7 | #' 8 | #' @examples 9 | #' x <- matrix(sample(0:10, 5000, replace = TRUE), ncol = 25) 10 | #' x[sample(1:5000, 10)] <- NA 11 | #' 12 | #' na2zero(x) 13 | na2zero <- function(x) { 14 | func <- function(x){ 15 | if (is.na(x)) return(0) 16 | else return(x)} 17 | 18 | return(apply(x, c(1,2), func)) 19 | } 20 | 21 | #' List to Matrix 22 | #' 23 | #' @param matlist list of matrices to concatenate 24 | #' @param direction character, r or c, to indicate whether should be row-wise (i.e., rbind to match on columns) or column-wise (i.e., cbind to match on rows). 
Defaults to columnwise (matching on rows) to match convention of SingleCellExperiments 25 | #' 26 | #' @return matrix 27 | #' @export 28 | #' 29 | #' @examples 30 | #' listofmats <- list(matrix(sample(seq(0,20,1),100,replace = TRUE),nrow = 10), 31 | #' matrix(sample(seq(0,20,1),1000,replace = TRUE),nrow = 10)) 32 | #' newmat <- list2mat(listofmats) # to "cbind" them 33 | #' listofmats_t <- lapply(listofmats,t) 34 | #' newmat_t <- list2mat(listofmats_t, 'r') # to "rbind" them 35 | list2mat <- function(matlist, direction = c('c','r')[1]){ 36 | if (direction == 'r') concatted <- Reduce(rbind, matlist) 37 | if (direction == 'c') concatted <- Reduce(cbind, matlist) 38 | return(concatted) 39 | } 40 | 41 | #' Compute percent of variance explained 42 | #' 43 | #' @param thissvd list outputted from an svd function (svd, irlba; can also take output from \code{\link{corral_mat}} and \code{\link{corralm_matlist}}) 44 | #' @param preproc_mat matrix of pre-processed values (optional) - important to include if the svd is only partial as this is used to compute the sum of eigenvalues 45 | #' 46 | #' @return vector of percent variance explained values, indexed by PC 47 | #' @export 48 | #' 49 | #' @examples 50 | #' mat <- matrix(sample(seq(0,20,1),100,replace = TRUE),nrow = 10) 51 | #' my_svd <- svd(mat) 52 | #' get_pct_var_exp_svd(my_svd) # this works if my_svd is a full svd 53 | #' my_irl <- irlba::irlba(mat,nv = 2) 54 | #' get_pct_var_exp_svd(my_irl, preproc_mat = mat) # ... otherwise use this 55 | get_pct_var_exp_svd <- function(thissvd, preproc_mat = thissvd$d){ 56 | denom <- sum(preproc_mat^2) 57 | return(thissvd$d^2 / denom) 58 | } 59 | 60 | 61 | #' Get weights 62 | #' 63 | #' Computes row weights and column weights 64 | #' 65 | #' @param inp_mat matrix for which weights should be calculated (sparse or full) 66 | #' 67 | #' @return list of 2 elements: 'row.w' and 'col.w' contain the row and column weights respectively 68 | #' @export 69 | #' 70 | #' @importFrom Matrix Matrix rowSums colSums 71 | #' @importFrom methods is 72 | #' @importClassesFrom Matrix dgCMatrix 73 | #' 74 | #' @examples 75 | #' mat <- matrix(sample(seq(0,20,1),100,replace = TRUE),nrow = 10) 76 | #' ws <- get_weights(mat) 77 | get_weights <- function(inp_mat){ 78 | w <- list() 79 | if(!is(inp_mat, "dgCMatrix")) {sp_mat <- Matrix(inp_mat, sparse = TRUE)} # convert to a sparse matrix 80 | else {sp_mat <- inp_mat} 81 | N <- sum(sp_mat) 82 | sp_mat <- sp_mat/N 83 | w[['row.w']] <- Matrix::rowSums(sp_mat) 84 | w[['col.w']] <- Matrix::colSums(sp_mat) 85 | return(w) 86 | } 87 | 88 | 89 | #' SingleCellExperiment to list of matrices 90 | #' 91 | #' @param sce SingleCellExperiment that is to be separated into list of count matrices 92 | #' @param splitby character; name of the attribute from colData that should be used to separate the SCE 93 | #' @param to_include (optional) character vector; determines which values from the "splitby" column will be included in the outputted matlist. 
NULL is the default, and will result in selecting all elements 94 | #' @param whichmat character; defaults to \code{counts}, can also use \code{logcounts} or \code{normcounts} if stored in the \code{sce} object 95 | #' 96 | #' @return list of matrices 97 | #' @export 98 | #' 99 | #' @importFrom SingleCellExperiment colData 100 | #' @importFrom SummarizedExperiment assay 101 | #' @importClassesFrom SingleCellExperiment SingleCellExperiment 102 | #' 103 | #' @examples 104 | #' library(DuoClustering2018) 105 | #' sce <- sce_full_Zhengmix4eq() 106 | #' matlist <- sce2matlist(sce = sce, splitby = 'phenoid', whichmat = 'logcounts') 107 | sce2matlist <- function(sce,splitby,to_include = NULL,whichmat = 'counts'){ 108 | if(is.null(to_include)){ 109 | to_include <- unique(as.character(colData(sce)[,splitby])) 110 | } 111 | matlist <- list() 112 | countmatrix <- SummarizedExperiment::assay(sce, whichmat) 113 | for(gn in to_include){ 114 | matlist[[gn]] <- countmatrix[,which(colData(sce)[,splitby] == gn)] 115 | } 116 | return(matlist) 117 | } 118 | 119 | 120 | 121 | #' @keywords internal 122 | #' @import SingleCellExperiment 123 | .indsbysplitby <- function(sce, splitby, to_include = NULL){ 124 | if(is.null(to_include)){ 125 | to_include <- unique(as.character(colData(sce)[,splitby])) 126 | } 127 | inds <- c() 128 | for(gn in to_include){ 129 | inds <- c(inds, which(colData(sce)[,splitby] == gn)) 130 | } 131 | return(inds) 132 | } 133 | 134 | #' Add embeddings to list of SCEs 135 | #' 136 | #' @param scelist list of SingleCellExperiments; to which the corresponding embeddings should be added 137 | #' @param embeddings matrix; the embeddings outputted from a dimension reduction, e.g. \code{\link{corralm}}. Rows in this table correspond to columns in the SCEs in \code{scelist} (if all the SCEs were column-bound), and row indices should correspond to cells. 138 | #' @param slotname character; name of the slot for the reduced dim embedding; defaults to \code{corral} 139 | #' 140 | #' @return list of SingleCellExperiments with respective embeddings stored in them 141 | #' @export 142 | #' 143 | #' @importFrom SingleCellExperiment reducedDim 144 | #' @importClassesFrom SingleCellExperiment SingleCellExperiment 145 | #' 146 | #' @examples 147 | #' library(DuoClustering2018) 148 | #' sce <- sce_full_Zhengmix4eq() 149 | #' scelist <- list(sce,sce) 150 | #' embeddings <- matrix(sample(seq(0,20,1),dim(sce)[2]*6,replace = TRUE),nrow = dim(sce)[2]*2) 151 | #' scelist <- add_embeddings2scelist(scelist, embeddings) 152 | add_embeddings2scelist <- function(scelist,embeddings, slotname = 'corralm'){ 153 | if(!all_are(scelist,'SingleCellExperiment')) { 154 | cat('Warning! 
You may have non-SCE elements in the list.') 155 | } 156 | dimvec <- unlist(lapply(scelist,ncol)) 157 | for (i in seq(1,length(scelist),1)){ 158 | if(i == 1) {start_ind <- 1} 159 | else{start_ind <- (sum(dimvec[seq(1,i-1,1)]) + 1)} 160 | 161 | end_ind <- start_ind + dimvec[i] - 1 162 | 163 | SingleCellExperiment::reducedDim(scelist[[i]],slotname) <- embeddings[seq(start_ind,end_ind,1),] 164 | } 165 | return(scelist) 166 | } 167 | 168 | #' @keywords internal 169 | .batch_sizes <- function(matlist){ 170 | df <- as.data.frame(do.call(rbind, lapply(matlist,dim))) 171 | colnames(df) <- NULL 172 | rownames(df) <- names(matlist) 173 | return(df) 174 | } 175 | 176 | #' @keywords internal 177 | #' @import ggthemes 178 | #' @importFrom grDevices colorRampPalette 179 | .generate_palette_func <- function(ncolors, color_values){ 180 | # adapted from ggthemes::scale_color_* functions 181 | if(missing(color_values)){ 182 | values <- ggthemes::ggthemes_data$few$colors[['Medium']][['value']] 183 | } 184 | else{values <- color_values} 185 | values <- c(values[1],colorRampPalette(values[seq(2,length(values),1)])(1 + ncolors)) # expanding palette to fit 186 | max_n <- length(values) - 1L 187 | f <- function(n) { 188 | ggthemes:::check_pal_n(n, max_n) 189 | if (n == 1L) { 190 | values[[1L]] 191 | } 192 | else { 193 | unname(values[2L:(n + 1L)]) 194 | } 195 | } 196 | attr(f, "max_n") <- length(values) - 1L 197 | f 198 | } 199 | 200 | 201 | #' all_are 202 | #' 203 | #' Checks if all elements of a list or List are of a (single) particular type \code{typechar} 204 | #' 205 | #' @param inplist list or List to be checked 206 | #' @param typechar char of the type to check for 207 | #' 208 | #' @return boolean, for whether the elements of \code{inplist} are all \code{typechar} 209 | #' @export 210 | #' 211 | #' @examples 212 | #' x <- list(1,2) 213 | #' all_are(x,'numeric') 214 | #' all_are(x,'char') 215 | #' 216 | #' y <- list(1,2,'c') 217 | #' all_are(y,'numeric') 218 | #' all_are(y,'char') 219 | all_are <- function(inplist,typechar){ 220 | return(all(unlist(lapply(inplist,is,typechar)))) 221 | } 222 | 223 | #' rv coefficient 224 | #' 225 | #' @param mat1 matrix (or matrix-like, e.g., df); either columns or rows should be matched with \code{mat2} 226 | #' @param mat2 matrix (or matrix-like, e.g., df); either columns or rows should be matched with \code{mat1} 227 | #' 228 | #' @return numeric; RV coefficient between the matched matrices 229 | #' @export 230 | #' 231 | #' @examples 232 | #' a <- matrix(sample(1:10,100, TRUE), nrow = 10) 233 | #' b <- matrix(sample(1:10,50, TRUE), nrow = 5) 234 | #' 235 | #' rv(a, b) # matched by columns 236 | #' rv(t(a), t(b)) # matched by rows 237 | rv <- function(mat1, mat2) { 238 | if(ncol(mat1) != ncol(mat2)){ 239 | if(nrow(mat1) == nrow(mat2)){ 240 | mat1 <- t(mat1) 241 | mat2 <- t(mat2) 242 | } 243 | else{stop('Matrices must be aligned by rows or columns.')} 244 | } 245 | nscm1 <- crossprod(as.matrix(mat1)) 246 | nscm2 <- crossprod(as.matrix(mat2)) 247 | rv <- sum(nscm1 * nscm2)/(sum(nscm1 * nscm1) * sum(nscm2 * nscm2))^0.5 248 | return(rv) 249 | } 250 | 251 | 252 | #' Pairwise rv coefficient 253 | #' 254 | #' @param matlist list of matrices (or matrix-like; see \code{rv} function) for which to compute pairwise RV coefficients 255 | #' 256 | #' @return matrix of the pairwise coefficients 257 | #' @export 258 | #' 259 | #' @importFrom utils combn 260 | #' 261 | #' @examples 262 | #' a <- matrix(sample(1:10,100,TRUE), nrow = 10) 263 | #' b <- matrix(sample(1:10,50,TRUE), nrow = 5) 
264 | #' c <- matrix(sample(1:10,20,TRUE), nrow = 2) 265 | #' 266 | #' matlist <- list(a,b,c) 267 | #' pairwise_rv(matlist) 268 | #' pairwise_rv(lapply(matlist, t)) 269 | pairwise_rv <- function(matlist){ 270 | n <- length(matlist) 271 | a <- utils::combn(seq_len(n), 2, 272 | FUN = function(x) rv(matlist[[x[1]]], matlist[[x[2]]]), simplify = TRUE) 273 | m <- matrix(1, n, n) 274 | m[lower.tri(m)] <- a 275 | m[upper.tri(m)] <- t(m)[upper.tri(m)] 276 | colnames(m) <- rownames(m) <- names(matlist) 277 | return(m) 278 | } 279 | 280 | 281 | #' Trim extreme values in a pre-processed matrix 282 | #' 283 | #' Smooths the extreme values in a chi-square-transformed matrix to lessen the influence of "rare objects." 284 | #' 285 | #' (Usually not called directly; can be included by using the `smooth` argument in the `corral`, `corralm`, and `corral_preproc` functions) 286 | #' 287 | #' @param mat matrix; should be pre-processed/normalized to some sort of approximately normally distributed statistic (e.g., chi-squared transformation with `corral_preproc` or Z-score normalization) 288 | #' @param pct_trim numeric; the percent of observations to smooth. Defaults to `pct_trim` = .01, which corresponds to smoothing all observations to be between the .5 percentile and 99.5 percentile range of the input matrix 289 | #' 290 | #' @return smoothed matrix 291 | #' @export 292 | #' 293 | #' @examples 294 | #' count_mat <- matrix(rpois(10000, 300)*rbinom(10000,1,.1), ncol = 100) 295 | #' smoothed_preproc_mat <- corral_preproc(count_mat, smooth = TRUE) 296 | trim_matdist <- function(mat, pct_trim = .01){ 297 | up_qt <- 1-(pct_trim/2) 298 | upthresh <- quantile(mat, up_qt) 299 | downthresh <- quantile(mat, 1-up_qt) 300 | if(sum(mat > upthresh) > 0){ 301 | mat[mat > upthresh] <- upthresh 302 | } 303 | if(sum(mat < downthresh) > 0){ 304 | mat[mat < downthresh] <- downthresh 305 | } 306 | return(mat) 307 | } 308 | -------------------------------------------------------------------------------- /vignettes/corral_dimred.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Dimension reduction of single cell data with corral" 3 | author: 4 | - name: Lauren Hsu 5 | affiliation: Harvard TH Chan School of Public Health; Dana-Farber Cancer Institute 6 | - name: Aedin Culhane 7 | affiliation: Department of Data Science, Dana-Farber Cancer Institute, Department of Biostatistics, Harvard TH Chan School of Public Health 8 | date: "5/17/2021" 9 | output: 10 | BiocStyle::html_document: 11 | toc_float: true 12 | BiocStyle::pdf_document: default 13 | package: BiocStyle 14 | bibliography: ref.bib 15 | vignette: | 16 | %\VignetteIndexEntry{dim reduction with corral} 17 | %\VignetteEngine{knitr::rmarkdown} 18 | %\VignetteEncoding{UTF-8} 19 | --- 20 | 21 | ```{r setup, include=FALSE} 22 | knitr::opts_chunk$set(echo = TRUE) 23 | library(gridExtra) 24 | ``` 25 | 26 | # Introduction 27 | 28 | Single-cell 'omics analysis enables high-resolution characterization of heterogeneous 29 | populations of cells by quantifying measurements in individual cells and thus 30 | provides a fuller, more nuanced picture into the complexity and heterogeneity between 31 | cells. However, the data also present new and significant challenges as compared to 32 | previous approaches, especially as single-cell data are much larger and sparser than 33 | data generated from bulk sequencing methods. 
Dimension reduction is a key step 34 | in the single-cell analysis to address the high dimension and sparsity of these 35 | data, and to enable the application of more complex, computationally expensive downstream pipelines. 36 | 37 | Correspondence analysis (CA) is a matrix factorization method, and is similar to 38 | principal components analysis (PCA). Whereas PCA is designed for application to 39 | continuous, approximately normally distributed data, CA is appropriate for 40 | non-negative, count-based data that are in the same additive scale. `corral` 41 | implements CA for dimensionality reduction of a single matrix of single-cell data. 42 | 43 | See the vignette for `corralm` for the multi-table adaptation of CA for single-cell batch alignment/integration. 44 | 45 | corral can be used with various types of input. When called on a matrix (or other matrix-like object), it returns a list with the SVD output, principal coordinates, and standard coordinates. When called on a `r Biocpkg('SingleCellExperiment')`, it returns the `r Biocpkg('SingleCellExperiment')` with the corral embeddings in the `reducedDim` slot named `corral`. To retrieve the full list output from a `SingleCellExperiment` input, the `fullout` argument can be set to `TRUE`. 46 | ![](corral_IO.png) 47 | 48 | # Loading packages and data 49 | 50 | We will use the `Zhengmix4eq` dataset from the `r Biocpkg('DuoClustering2018')` package. 51 | 52 | ```{r, message = FALSE} 53 | library(corral) 54 | library(SingleCellExperiment) 55 | library(ggplot2) 56 | library(DuoClustering2018) 57 | zm4eq.sce <- sce_full_Zhengmix4eq() 58 | zm8eq <- sce_full_Zhengmix8eq() 59 | ``` 60 | 61 | This dataset includes approximately 4,000 pre-sorted and annotated cells of 62 | 4 types mixed by Duo et al. in approximately equal proportions [@zmdata]. 63 | The cells were sampled from a "Massively parallel digital transcriptional 64 | profiling of single cells" [@zheng]. 
65 | 66 | ```{r} 67 | zm4eq.sce 68 | table(colData(zm4eq.sce)$phenoid) 69 | ``` 70 | 71 | 72 | # `corral` on `r Biocpkg('SingleCellExperiment')` 73 | 74 | We will run `corral` directly on the raw count data: 75 | 76 | ```{r} 77 | zm4eq.sce <- corral(inp = zm4eq.sce, 78 | whichmat = 'counts') 79 | 80 | zm4eq.sce 81 | ``` 82 | 83 | We can use `plot_embedding` to visualize the output: 84 | 85 | ```{r} 86 | plot_embedding_sce(sce = zm4eq.sce, 87 | which_embedding = 'corral', 88 | plot_title = 'corral on Zhengmix4eq', 89 | color_attr = 'phenoid', 90 | color_title = 'cell type', 91 | saveplot = FALSE) 92 | ``` 93 | 94 | Using the `scater` package, we can also add and visualize `umap` and `tsne` embeddings based on the `corral` output: 95 | 96 | ```{r} 97 | library(scater) 98 | library(gridExtra) # so we can arrange the plots side by side 99 | 100 | zm4eq.sce <- runUMAP(zm4eq.sce, 101 | dimred = 'corral', 102 | name = 'corral_UMAP') 103 | zm4eq.sce <- runTSNE(zm4eq.sce, 104 | dimred = 'corral', 105 | name = 'corral_TSNE') 106 | 107 | ggplot_umap <- plot_embedding_sce(sce = zm4eq.sce, 108 | which_embedding = 'corral_UMAP', 109 | plot_title = 'Zhengmix4eq corral with UMAP', 110 | color_attr = 'phenoid', 111 | color_title = 'cell type', 112 | returngg = TRUE, 113 | showplot = FALSE, 114 | saveplot = FALSE) 115 | 116 | ggplot_tsne <- plot_embedding_sce(sce = zm4eq.sce, 117 | which_embedding = 'corral_TSNE', 118 | plot_title = 'Zhengmix4eq corral with tSNE', 119 | color_attr = 'phenoid', 120 | color_title = 'cell type', 121 | returngg = TRUE, 122 | showplot = FALSE, 123 | saveplot = FALSE) 124 | 125 | gridExtra::grid.arrange(ggplot_umap, ggplot_tsne, ncol = 2) 126 | 127 | ``` 128 | 129 | The `corral` embeddings stored in the `reducedDim` slot can be used in 130 | downstream analysis, such as for clustering or trajectory analysis. 131 | 132 | `corral` can also be run on a `SummarizedExperiment` object. 133 | 134 | # `corral` on matrix 135 | 136 | `corral` can also be performed on a matrix (or matrix-like) input. 137 | 138 | ```{r} 139 | zm4eq.countmat <- assay(zm4eq.sce,'counts') 140 | zm4eq.countcorral <- corral(zm4eq.countmat) 141 | ``` 142 | 143 | The output is in a `list` format, including the SVD output (`u`,`d`,`v`), 144 | the standard coordinates (`SCu`,`SCv`), and the principal coordinates (`PCu`,`PCv`). 145 | 146 | ```{r} 147 | zm4eq.countcorral 148 | ``` 149 | 150 | We can use `plot_embedding` to visualize the output: 151 | (the embeddings are in the `v` matrix because these data are by genes in the 152 | rows and have cells in the columns; if this were reversed, with cells in the 153 | rows and genes/features in the column, then the cell embeddings would instead 154 | be in the `u` matrix.) 155 | ```{r} 156 | celltype_vec <- zm4eq.sce$phenoid 157 | plot_embedding(embedding = zm4eq.countcorral$v, 158 | plot_title = 'corral on Zhengmix4eq', 159 | color_vec = celltype_vec, 160 | color_title = 'cell type', 161 | saveplot = FALSE) 162 | ``` 163 | 164 | The output is the same as above with the `SingleCellExperiment`, and can be 165 | passed as the low-dimension embedding for downstream analysis. Similarly, 166 | UMAP and tSNE can be computed for visualization. (Note that in performing SVD, 167 | the direction of the axes doesn't matter so they may be flipped between runs, 168 | as `corral` and `corralm` use `irlba` to perform fast approximation.) 169 | 170 | # Updates to CA to address overdispersion 171 | 172 | Correspondence analysis is known to be sensitive to "rare objects" (Greenacre, 2013). 
Sometimes this can be beneficial because the method can detect small perturbations of rare populations. However, in other cases, a couple outlier cells can be allowed to exert undue influence on a particular dimension. 173 | 174 | In the `corral` manuscript, we describe three general approaches, included below; see our manuscript for more details and results. In this vignette we also present a fourth approach (Trimming extreme values with `smooth` mode) 175 | 176 | ## Changing the residual type (`rtype`) 177 | 178 | Standard correspondence analysis decomposes Pearson $\chi^2$ residuals, computed with the formula: 179 | $$r_{p; ij} = \frac{\mathrm{observed} - \mathrm{expected}}{\sqrt{\mathrm{expected}}} = \frac{p_{ij} - p_{i.} \ p_{.j}}{\sqrt{p_{i.} \ p_{.j}}}$$ 180 | 181 | where $p_{ij} = \frac{x_{ij}}{N}$, $N = \sum_{i=1}^m \sum_{j=1}^n x_{ij}$, $p_{i.} = \mathrm{row \ weights} = \sum_{i=1}^m p_{ij}$, and $p_{.j} = \mathrm{col \ weights} = \sum_{j=1}^n p_{ij}$. 182 | 183 | In `corral`, this is the default setting. It can also be explicitly selected by setting `rtype = 'standardized'` or `rtype = 'pearson'`. 184 | 185 | Another $\chi^2$ residual is the Freeman-Tukey: 186 | $$r_{f; ij} = \sqrt{p_{ij}} + \sqrt{p_{ij} + \frac{1}{N}} - \sqrt{4 p_{i.} \ p_{.j} + \frac{1}{N}}$$ 187 | 188 | It is more robust to overdispersion than the Pearson residuals, and therefore outperforms standard CA in many scRNAseq datasets. 189 | 190 | In `corral`, this option can be selected by setting `rtype = 'freemantukey'`. 191 | 192 | ## Variance stabilization before CA (`vst_mth`) 193 | 194 | Another approach for addressing overdispersed counts is to apply a variance stabilizing transformation. The options included in the package: 195 | 196 | - Square root transform ($\sqrt{x}$): `vst_mth = 'sqrt'` 197 | - Anscombe transform ($2 \sqrt{x + \frac{3}{8}}$): `vst_mth = 'anscombe'` 198 | - Freeman-Tukey transform ($\sqrt{x} + \sqrt{x + 1}$): `vst_mth = 'freemantukey'` **Note that this option is different from setting the `rtype` parameter to `'freemantukey'` 199 | 200 | ## Power deflation (`powdef_alpha`) 201 | 202 | To apply a smoothing effect to the $\chi^2$ residuals, another approach is to transform the residual matrix by a power of $\alpha \in (0,1)$. To achieve a "soft" smoothing effect, we suggest $\alpha \in [0.9,0.99]$. This option is controlled with the `powdef_alpha` parameter, which takes the default value of `NULL` (not used). To set it, use this parameter and set it equal to the desired value for $\alpha$ as a numeric. e.g., `powdef_alpha = 0.95` would be including this option and setting $\alpha = 0.95$. 203 | 204 | ## Trimming extreme values (`smooth` mode) 205 | 206 | One adaptation (not described in the manuscript) that addresses unduly influential outliers is to apply an alternative smoothing procedure that narrows the range of the $\chi^2$-transformed values by symmetrically trimming the top $n$ fraction of extreme values ($n$ defaults to $.01$ and can be set with the `pct_trim` argument). Since the `corral` matrix pre-processing procedure transforms the values into standardized $\chi^2$ space, they can be considered proportional to the significance of difference between observed and expected abundance for a given gene in a given cell. This approach differs from power deflation in that it only adjusts the most extreme values, and explicitly so, whereas power deflation shifts the distribution of all values to be less extreme. 
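As a minimal sketch of the effect (reusing the simulated count matrix from the `trim_matdist` help page rather than real data), trimming narrows the range of the transformed values while leaving most entries untouched:

```{r}
count_mat <- matrix(rpois(10000, 300) * rbinom(10000, 1, .1), ncol = 100)
preproc_mat <- corral_preproc(count_mat)   # chi-squared residual transformation (package defaults)
trimmed_mat <- trim_matdist(preproc_mat, pct_trim = .01)
range(preproc_mat)   # full range of the transformed values
range(trimmed_mat)   # clipped to the .5 and 99.5 percentiles
```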
207 | 208 | This additional pre-processing step can be applied in `corral` by setting the `smooth` argument to `TRUE` (it defaults to `FALSE`), and this mode only works with standardized and indexed residuals options. 209 | 210 | ```{r} 211 | zm8eq.corral <- corral(zm8eq, fullout = TRUE) 212 | zm8eq.corralsmooth <- corral(zm8eq, fullout = TRUE, smooth = TRUE) 213 | ``` 214 | 215 | # Visualizing links between features and sub-populations with biplots 216 | 217 | Reduced dimension embeddings are often used to find relationships between observations. `corral` can be used to additionally explore relationships between the features and the observations. By plotting both embeddings into the same space, the biplot reveals potential associations between the "rows" and "columns" -- in this case, cells and genes. 218 | 219 | Influential features in a given dimension will be further from the origin, and features strongly associated to a particular cluster of observations will be close in terms of vector direction (e.g., cosine similarity). 220 | 221 | ```{r} 222 | gene_names <- rowData(zm4eq.sce)$symbol 223 | 224 | biplot_corral(corral_obj = zm4eq.countcorral, color_vec = celltype_vec, text_vec = gene_names) 225 | ``` 226 | 227 | For example, in this case, in the general direction of the B cells, there is the MALAT1 gene, which is a B cell marker. Considering the side of the plot with the CD14 monocytes, by showing which genes are closer to which subpopulation, the biplot can help characterize the different groups of cells. 228 | 229 | # Session information 230 | 231 | ```{r} 232 | sessionInfo() 233 | ``` 234 | 235 | # References 236 | 237 | -------------------------------------------------------------------------------- /R/plot_embedding.R: -------------------------------------------------------------------------------- 1 | #' Plot selected PCs from an embedding 2 | #' 3 | #' @param embedding matrix or other tabular format where columns correspond to PCs and rows correspond to cells (entries). \code{corral} and \code{corralm} objects are also accepted. 4 | #' @param xpc int; which PC to put on the x-axis (defaults to 1) 5 | #' @param ypc int; which PC to put on the y-axis (defaults to the one after \code{xpc}) 6 | #' @param plot_title char; title of plot (defaults to titling based on \code{xpc} and \code{ypc}) 7 | #' @param color_vec vector; length should correspond to the number of rows in embedding, and each element of the vector classifies that cell (entry) in the embedding to that particular class, which will be colored the same. (e.g., this could be indicating which batch each cell is from) 8 | #' @param color_title char; what attribute the colors represent 9 | #' @param ellipse_vec vector; length should correspond to the number of rows in embedding, and each element of the vector classifies that cell (entry) in the embedding to that particular class, and elements of the same class will be circled in an ellipse. (e.g., this could be indicating the cell type or cell line; works best for attributes intended to be compact) 10 | #' @param facet_vec vector; length should correspond to the number of rows in embedding, and each element of the vector classifies that cell (entry) in the embedding to that particular class. Plot will be faceted by this attribute. 11 | #' @param ptsize numeric; the size of the points as passed to \code{geom_point()}. Defaults to 0.8. 
12 | #' @param saveplot boolean; whether or not to save the plot, defaults \code{FALSE} 13 | #' @param plotfn char; what the filename is to be called. (defaults to making a name based on \code{plot_title} and \code{xpc}) 14 | #' @param showplot boolean; whether or not to show the plot, defaults \code{TRUE} 15 | #' @param returngg boolean; whether or not to return a \code{\link{ggplot2}} object, defaults \code{FALSE} 16 | #' @param color_pal_vec char; hex codes for the color palette to be used. Default is to use the ggthemes few for plots with less than 9 colors, and to use/"stretch" pals polychrome if more colors are needed. 17 | #' @param dimname char; the name of the dimensions. defaults to "Dim" 18 | #' 19 | #' @return default none; options to display plot (\code{showplot}), save plot (\code{saveplot}), and/or return \code{\link{ggplot2}} object (\code{returngg}) 20 | #' @export 21 | #' 22 | #' @import ggplot2 23 | #' @import gridExtra 24 | #' @import ggthemes 25 | #' @import pals 26 | #' @importFrom graphics plot 27 | #' 28 | #' @examples 29 | #' listofmats <- list(matrix(sample(seq(0,20,1),1000,replace = TRUE),nrow = 20), 30 | #' matrix(sample(seq(0,20,1),1000,replace = TRUE),nrow = 20)) 31 | #' corralm_obj <- corralm(listofmats, ncomp = 5) 32 | #' embed_mat <- corralm_obj$v 33 | #' cell_type_vec <- sample(c('type1','type2','type3'),100,replace = TRUE) 34 | #' plot_embedding(embedding = embed_mat, 35 | #' xpc = 1, 36 | #' plot_title = 'corralm plot', 37 | #' color_vec = cell_type_vec, 38 | #' color_title = 'cell type', 39 | #' saveplot = FALSE) 40 | #' 41 | #' # or, call directly on the corralm object 42 | #' plot_embedding(corralm_obj) 43 | #' 44 | plot_embedding <- function(embedding, xpc = 1, ypc = xpc + 1, plot_title = paste0('Dim',xpc,' by Dim',ypc), color_vec = NULL, color_title = NULL, ellipse_vec = NULL, facet_vec = NULL, ptsize = 0.8, saveplot = FALSE, plotfn = paste(plot_title,xpc, sep = '_'), showplot = TRUE, returngg = FALSE, color_pal_vec = NULL, dimname = 'Dim'){ 45 | if('corral' %in% class(embedding)){ 46 | embedding <- embedding$PCv 47 | } 48 | if('corralm' %in% class(embedding)){ 49 | corralm_obj <- embedding 50 | embedding <- corralm_obj$v 51 | if(is.null(color_vec)){ 52 | color_vec <- rep(rownames(corralm_obj$batch_sizes), corralm_obj$batch_sizes[,2]) 53 | color_title <- 'Batch' 54 | } 55 | } 56 | if(is.null(color_vec)){ 57 | color_vec <- rep('emb', nrow(embedding)) 58 | } 59 | 60 | xlab <- paste0(dimname, xpc) 61 | ylab <- paste0(dimname, ypc) 62 | 63 | df <- cbind(data.frame(embedding[,c(xpc, ypc)]), color_vec) 64 | 65 | colnames(df) <- c('Xdim','Ydim', 'color_vec') 66 | 67 | if(!is.null(ellipse_vec)){ 68 | df <- cbind(df, ellipse_vec) 69 | } 70 | 71 | if(!is.null(facet_vec)){ 72 | df <- cbind(df, facet_vec) 73 | } 74 | 75 | # Setting up the colors 76 | .colscale <- function(palette){ggplot2::discrete_scale('colour','colscale',palette)} 77 | 78 | if(!is.null(color_pal_vec)){ 79 | palette_func <- .generate_palette_func(ncolors = length(unique(color_vec)), color_pal_vec) 80 | } else if(length(unique(color_vec)) < 9){ 81 | palette_func <- ggthemes::few_pal('Medium') 82 | } else{ 83 | palette_func <- .generate_palette_func(ncolors = length(unique(color_vec)), pals::alphabet2()) 84 | } 85 | 86 | gg_obj <- ggplot(df, aes(x = Xdim, y = Ydim, colour = color_vec)) + 87 | geom_point(size = ptsize) + 88 | theme_classic() + .colscale(palette_func) + 89 | geom_hline(yintercept = 0, color = 'gray') + 90 | geom_vline(xintercept = 0, color = 'gray') + 91 | labs(x = xlab, y = 
ylab, 92 | title = plot_title, 93 | color = color_title) + 94 | theme(axis.text.x = element_text(size = rel(1.4), colour = 'black'), 95 | axis.text.y = element_text(size = rel(1.4), colour = 'black'), 96 | axis.title = element_text(size = rel(1.4), colour = 'black')) + 97 | guides(color = guide_legend(override.aes = list(size = 5))) 98 | 99 | if(!is.null(ellipse_vec)){ 100 | gg_obj <- gg_obj + stat_ellipse(aes(x = Xdim, y = Ydim, group = ellipse_vec), type = 'norm', linetype = 2) 101 | } 102 | 103 | if(!is.null(facet_vec)){ 104 | gg_obj <- gg_obj + facet_wrap(facets = vars(facet_vec)) 105 | } 106 | 107 | if(saveplot){ 108 | ggsave(paste0(plotfn,'.png')) 109 | } 110 | 111 | if(showplot){ 112 | plot(gg_obj) 113 | } 114 | 115 | if(returngg){ 116 | return(gg_obj) 117 | } 118 | } 119 | 120 | 121 | #' Plot selected PCs from an embedding saved in a SingleCellExperiment object 122 | #' 123 | #' @param sce \code{\link{SingleCellExperiment}} object; contains the embedding within the \code{reducedDim} slot 124 | #' @param which_embedding character; for the embedding to plot 125 | #' @param color_attr character; name of the attribute within \code{colData} to use for assigning colors (in lieu of \code{color_vec} in the \code{\link{plot_embedding}} function) 126 | #' @param color_title character; title to use for colors legend, defaults to the same as \code{color_attr} 127 | #' @param ellipse_attr character; name of the attribute within \code{colData} to use for drawing ellipse(s) (in lieu of \code{ellipse_vec} in the \code{\link{plot_embedding}} function) 128 | #' @param facet_attr character; name of the attribute within \code{colData} to use for faceting (in lieu of \code{facet_vec} in the \code{\link{plot_embedding}} function) 129 | #' @param ... additional optional arguments - see \code{\link{plot_embedding}} function for details on other potential arguments: \code{xpc}, \code{ypc}, \code{plot_title}, \code{color_title} (if title is different from \code{color_attr}), \code{ptsize}, \code{saveplot}, \code{plotfn}, \code{showplot}, \code{returngg}, \code{color_pal_vec}, \code{dimname} 130 | #' 131 | #' @return default none; options to display plot (\code{showplot}), save plot (\code{saveplot}), and/or return \code{\link{ggplot2}} object (\code{returngg}) 132 | #' @export 133 | #' 134 | #' @import ggplot2 135 | #' @importFrom ggthemes scale_color_few 136 | #' @importFrom SingleCellExperiment colData reducedDim 137 | #' @importClassesFrom SingleCellExperiment SingleCellExperiment 138 | #' 139 | #' @examples 140 | #' library(DuoClustering2018) 141 | #' library(SingleCellExperiment) 142 | #' sce <- sce_full_Zhengmix4eq()[1:100,sample(1:3500,100,replace = FALSE)] 143 | #' colData(sce)$Method <- matrix(sample(c('Method1','Method2'),100,replace = TRUE)) 144 | #' sce <- corralm(sce, splitby = 'Method') 145 | #' 146 | #' # to plot and show only 147 | #' plot_embedding_sce(sce = sce, 148 | #' which_embedding = 'corralm', 149 | #' xpc = 1, 150 | #' plot_title = 'corralm: PC1 by PC2', 151 | #' color_attr = "Method", 152 | #' ellipse_attr = 'phenoid', 153 | #' saveplot = FALSE) 154 | #' 155 | #' # to return ggplot2 object and display, but not save 156 | #' corralm_ggplot <- plot_embedding_sce(sce = sce, 157 | #' which_embedding = 'corralm', 158 | #' xpc = 1, 159 | #' plot_title = 'corralm: PC1 by PC2', 160 | #' color_attr = 'Method', 161 | #' ellipse_attr = 'phenoid', 162 | #' returngg = TRUE, 163 | #' saveplot = FALSE) 164 | #' 165 | #' 166 | plot_embedding_sce <- function(sce, which_embedding, color_attr = NULL, 
color_title = color_attr, ellipse_attr = NULL, facet_attr = NULL, ...){ 167 | embed_mat <- SingleCellExperiment::reducedDim(sce, which_embedding) 168 | 169 | if(is.null(color_attr)){ 170 | color_vec <- NULL 171 | } 172 | else{ 173 | color_vec <- SingleCellExperiment::colData(sce)[, color_attr] 174 | } 175 | 176 | if(!is.null(ellipse_attr)){ 177 | ellipse_vec <- SingleCellExperiment::colData(sce)[, ellipse_attr] 178 | } 179 | else{ 180 | ellipse_vec <- NULL 181 | } 182 | 183 | if(!is.null(facet_attr)){ 184 | facet_vec <- SingleCellExperiment::colData(sce)[, facet_attr] 185 | } 186 | else{ 187 | facet_vec <- NULL 188 | } 189 | 190 | plot_embedding(embed_mat, color_vec = color_vec, color_title = color_title, ellipse_vec = ellipse_vec, facet_vec = facet_vec, ...) 191 | } 192 | 193 | 194 | #' Generate biplot for corral object 195 | #' 196 | #' @param corral_obj list outputted by the \code{corral} function 197 | #' @param color_vec vector; length should correspond to the number of rows in v of \code{corral_obj}, and each element of the vector classifies that cell (entry) in the embedding to that particular class, which will be colored the same. (e.g., cell type) 198 | #' @param text_vec vector; length should correspond to the number of rows in u of \code{corral_obj}, and each element of the vector is the label for the respective feature that would show on the biplot. 199 | #' @param feat_name char; the label to show in the legend. Defaults to \code{(genes)}. 200 | #' @param nfeat int; the number of features to include. The function will first order them by distance from the origin in the selected dimensions, then select the top n to be displayed. 201 | #' @param xpc int; which PC to put on the x-axis (defaults to 1) 202 | #' @param plot_title char; title of plot (defaults to *Biplot*) 203 | #' @param text_size numeric; size of the feature labels given in \code{text_vec} (defaults to 2; for \code{ggplot2}) 204 | #' @param xjitter numeric; the amount of jitter for the text labels in the x direction (defaults to .005; for \code{ggplot2}) 205 | #' @param yjitter numeric; the amount of jitter for the text labels in the y direction (defaults to .005; for \code{ggplot2}) 206 | #' @param coords char; indicator for which set of coordinates to use.
\code{svd} plots the left and right singular vectors as outputted by SVD (\code{u} and \code{v}), while \code{PC} and \code{SC} use the principal and standard coordinates, respectively (defaults to \code{svd}) 207 | #' 208 | #' @return ggplot2 object of the biplot 209 | #' @export 210 | #' 211 | #' @import ggplot2 212 | #' 213 | #' @examples 214 | #' library(DuoClustering2018) 215 | #' library(SingleCellExperiment) 216 | #' zm4eq.sce <- sce_full_Zhengmix4eq() 217 | #' zm4eq.countmat <- counts(zm4eq.sce) 218 | #' zm4eq.corral_obj <- corral(zm4eq.countmat) 219 | #' gene_names <- rowData(zm4eq.sce)$symbol 220 | #' ctvec <- zm4eq.sce$phenoid 221 | #' 222 | #' biplot_corral(corral_obj = zm4eq.corral_obj, color_vec = ctvec, text_vec = gene_names) 223 | biplot_corral <- function(corral_obj, color_vec, text_vec, feat_name = '(genes)', nfeat = 20, xpc = 1, plot_title = 'Biplot', text_size = 2, xjitter = .005, yjitter = .005, coords = c('svd','PC','SC')){ 224 | n <- nfeat 225 | coords <- match.arg(coords, c('svd','PC','SC')) 226 | 227 | if(coords != 'svd'){ 228 | umat <- paste0(coords, 'u') 229 | vmat <- paste0(coords, 'v') 230 | } 231 | else{ 232 | umat <- 'u' 233 | vmat <- 'v' 234 | } 235 | 236 | gene_dists <- sqrt(corral_obj[[umat]][,xpc]^2 + corral_obj[[umat]][,xpc + 1]^2) 237 | gene_dists_ordinds <- order(gene_dists, decreasing = TRUE) 238 | 239 | inflgenes <- corral_obj[[umat]][gene_dists_ordinds[1:n],] 240 | rownames(inflgenes) <- text_vec[gene_dists_ordinds][1:n] 241 | biplot_labs_filt <- c(color_vec, rep(feat_name,n)) 242 | biplot_dat_filt <- rbind(corral_obj[[vmat]], inflgenes) 243 | bipfilt_gg <- plot_embedding(biplot_dat_filt, 244 | color_vec = biplot_labs_filt, returngg = TRUE, 245 | dimname = 'corral', xpc = xpc, showplot = FALSE, 246 | plot_title = plot_title) 247 | 248 | bipfilt_gg$data$Name <- rownames(bipfilt_gg$data) 249 | 250 | bipfilt_gg <- bipfilt_gg + 251 | geom_text(aes(label=ifelse(color_vec == feat_name,as.character(Name),'')), 252 | hjust=0,vjust=0, 253 | size = text_size, 254 | position=position_jitter(width=xjitter,height=yjitter)) 255 | 256 | return(bipfilt_gg) 257 | } 258 | 259 | 260 | 261 | 262 | -------------------------------------------------------------------------------- /R/corral.R: -------------------------------------------------------------------------------- 1 | #' compsvd: Compute Singular Value Decomposition (SVD) 2 | #' 3 | #' Computes SVD. 4 | #' 5 | #' @param mat matrix, pre-processed input; can be sparse or full (pre-processing can be performed using \code{\link{corral_preproc}} from this package) 6 | #' @param method character, the algorithm to be used for svd. Default is irl. Currently supports 'irl' for irlba::irlba or 'svd' for stats::svd 7 | #' @param ncomp numeric, number of components; Default is 30 8 | #' @param ... (additional arguments for methods) 9 | #' 10 | #' @return SVD result - a list with the following elements: 11 | #' \describe{ 12 | #' \item{\code{d}}{a vector of the diagonal singular values of the input \code{mat}.
Note that using \code{svd} will result in the full set of singular values, while \code{irlba} will only compute the first \code{ncomp} singular values.} 13 | #' \item{\code{u}}{a matrix with the left singular vectors of \code{mat} in the columns} 14 | #' \item{\code{v}}{a matrix with the right singular vectors of \code{mat} in the columns} 15 | #' \item{\code{eigsum}}{sum of the eigenvalues, for calculating percent variance explained} 16 | #' } 17 | #' @export 18 | #' 19 | #' @importFrom irlba irlba 20 | #' 21 | #' @examples 22 | #' mat <- matrix(sample(0:10, 2500, replace=TRUE), ncol=50) 23 | #' compsvd(mat, method = 'irl', ncomp = 5) 24 | compsvd <- function(mat, method = c('irl','svd'), ncomp = 30, ...){ 25 | method <- match.arg(method, c('irl','svd')) 26 | ncomp <- .check_ncomp(mat, ncomp) 27 | if(method == 'irl'){ 28 | result <- irlba::irlba(mat, nv = ncomp, ...) 29 | } 30 | else if(method == 'svd'){ 31 | result <- svd(mat, nv = ncomp, nu = ncomp, ...) 32 | } 33 | else { 34 | print('Provided method was not understood; used irlba.') 35 | result <- irlba::irlba(mat, nv = ncomp, ...) 36 | } 37 | result[['eigsum']] <- sum(mat^2) 38 | if(!is.null(rownames(mat))){rownames(result$u) <- rownames(mat)} 39 | if(!is.null(colnames(mat))){rownames(result$v) <- colnames(mat)} 40 | 41 | # add percent variance explained 42 | pct_var_exp <- t(data.frame('percent.Var.explained' = result$d^2 / result$eigsum)) 43 | colnames(pct_var_exp) <- paste0(rep('PC',ncomp),seq(1,ncomp,1)) 44 | result$pct_var_exp <- rbind(pct_var_exp,t(data.frame('cumulative.Var.explained' = cumsum(pct_var_exp[1,])))) 45 | 46 | return(result) 47 | } 48 | 49 | #' Apply a variance stabilizing transformation 50 | #' 51 | #' Prior to running CA, there is an option to apply a variance stabilizing transformation. This function can be called explicitly or used with the `vst_mth` argument in \code{corral} and \code{corral_preproc}. 52 | #' 53 | #' @param inp matrix, numeric, counts or logcounts; can be sparse Matrix or matrix 54 | #' @param transform character indicating which method should be applied. Defaults to the square root transform (`"sqrt"`). Other options include `"freemantukey"` and `"anscombe"`. 55 | #' 56 | #' @return variance-stabilized matrix; sparse if possible 57 | #' @export 58 | #' 59 | #' @examples 60 | #' x <- matrix(rpois(100, lambda = 50), ncol = 10) 61 | #' vst_x <- var_stabilize(x) 62 | var_stabilize <- function(inp, transform = c('sqrt','freemantukey','anscombe')){ 63 | transform <- match.arg(transform, c('sqrt','freemantukey','anscombe')) 64 | .check_vals(inp) 65 | if(transform == 'sqrt'){ 66 | return(inp^.5) 67 | } 68 | else if(transform == 'freemantukey'){ 69 | return(inp^.5 + (inp + 1)^.5) 70 | } 71 | else if(transform == 'anscombe'){ 72 | return(2 * (inp + 3/8)^.5) 73 | } 74 | } 75 | 76 | 77 | #' Preprocess a matrix for SVD to perform Correspondence Analysis (CA) 78 | #' 79 | #' This function performs the row and column scaling pre-processing operations, prior to SVD, for the corral methods. See \code{\link{corral}} for single matrix correspondence analysis and \code{\link{corralm}} for multi-matrix correspondence analysis. 80 | #' 81 | #' In addition to standard CA (SVD of Pearson residuals), which is the default setting, this function also provides options to customize processing steps -- including the variations described in the corresponding manuscript for \code{corral}.
Parameters that may be changed: 82 | #' \describe{ 83 | #' \item{Residual (`rtype`)}{Which analytic residual should be calculated? The default is standardized Pearson residuals. Freeman-Tukey is a more robust choice for handling overdispersed scRNAseq counts.} 84 | #' \item{Variance stabilization method (`vst_mth`)}{Should a variance stabilizing transformation be applied to the count matrix before computing residuals? Defaults to no.} 85 | #' \item{Power deflation $\alpha$ (`powdef_alpha`)}{Should a power deflation transformation be applied after computing residuals? Defaults to no. If using, set $\alpha \in (0,1)$. For a "soft" smoothing effect, we suggest $\alpha \in [0.9,0.99]$} 86 | #' \item{Trimming-based smoothing option (`smooth`)}{As an alternative to power deflation, this option provides a more subtle correction: the most extreme 1% of values (distributed equally between both tails) are set to the 0.5 percentile and 99.5 percentile values, respectively} 87 | #' } 88 | #' 89 | #' @param inp matrix, numeric, counts or logcounts; can be sparse Matrix or matrix 90 | #' @param rtype character indicating what type of residual should be computed; options are `"indexed"`, `"standardized"` (or `"pearson"` is equivalent), `"freemantukey"`, and `"hellinger"`; defaults to `"standardized"` for \code{\link{corral}} and `"indexed"` for \code{\link{corralm}}. `"indexed"`, `"standardized"`, and `"freemantukey"` compute the respective chi-squared residuals and are appropriate for count data. The `"hellinger"` option is appropriate for continuous data. 91 | #' @param vst_mth character indicating whether a variance-stabilizing transform should be applied prior to calculating chi-squared residuals; defaults to `"none"` 92 | #' @param powdef_alpha numeric for the power that should be applied if using power deflation. Must be in (0,1), and if provided a number outside this range, will be ignored. Defaults to `NULL` which does not perform this step. 93 | #' @param row.w numeric vector; Default is \code{NULL}, to compute row.w based on \code{inp}. Use this parameter to replace computed row weights with custom row weights 94 | #' @param col.w numeric vector; Default is \code{NULL}, to compute col.w based on \code{inp}. Use this parameter to replace computed column weights with custom column weights 95 | #' @param smooth logical; Whether or not to perform the additional smoothing step with `trim_matdist`. Default is \code{FALSE}. Incompatible with `powdef_alpha`, so that parameter takes precedence over this one. 96 | #' @param ... 
(additional arguments for methods) 97 | #' 98 | #' @return matrix, processed for input to \code{compsvd} to finish CA routine 99 | #' @export 100 | #' 101 | #' @importFrom Matrix Matrix rowSums colSums 102 | #' @importFrom methods is 103 | #' @importClassesFrom Matrix dgCMatrix 104 | #' 105 | #' @examples 106 | #' mat <- matrix(sample(0:10, 500, replace=TRUE), ncol=25) 107 | #' mat_corral <- corral_preproc(mat) 108 | #' corral_output <- compsvd(mat_corral, ncomp = 5) 109 | corral_preproc <- function(inp, rtype = c('standardized','indexed','hellinger','freemantukey','pearson'), vst_mth = c('none','sqrt','freemantukey','anscombe'), powdef_alpha = NULL, row.w = NULL, col.w = NULL, smooth = FALSE, ...){ 110 | rtype <- match.arg(rtype, c('standardized','indexed','hellinger','freemantukey','pearson')) 111 | vst_mth <- match.arg(vst_mth, c('none','sqrt','freemantukey','anscombe')) 112 | if(!is.null(powdef_alpha)){ 113 | if(powdef_alpha > 1 | powdef_alpha < 0){ 114 | powdef_alpha <- NULL 115 | cat('Invalid choice for power deflation parameter alpha (powdef_alpha) so ignoring; should be in (0,1)') 116 | } 117 | } 118 | if(!is(inp, "dgCMatrix")){ 119 | x_mat <- Matrix::Matrix(inp, sparse = TRUE) 120 | } else {x_mat <- inp} 121 | N <- sum(x_mat) 122 | 123 | if(vst_mth != 'none'){ 124 | x_mat <- var_stabilize(x_mat, transform = vst_mth) 125 | } 126 | 127 | p_mat <- x_mat/N 128 | w <- get_weights(inp) 129 | if(is.null(row.w)) {row.w <- w$row.w} 130 | if(is.null(col.w)) {col.w<- w$col.w} 131 | else {col.w <- col.w / sum(col.w)} 132 | 133 | res <- NULL 134 | 135 | if(rtype == 'hellinger'){ 136 | p_mat <- p_mat / row.w 137 | res <- sqrt(p_mat) 138 | } 139 | if(rtype == 'freemantukey'){ 140 | expectedp <- row.w %*% t(col.w) 141 | res <- p_mat^.5 + (p_mat + 1/N)^.5 - (4*expectedp + 1/N)^.5 142 | } 143 | 144 | p_mat <- p_mat/row.w 145 | p_mat <- sweep(p_mat, 2, col.w, "/") - 1 146 | if (any(is.na(p_mat))) { 147 | p_mat <- na2zero(p_mat) 148 | } 149 | 150 | if (rtype == 'indexed'){ 151 | res <- p_mat 152 | } 153 | else if (rtype %in% c('standardized','pearson')){ 154 | p_mat <- p_mat * sqrt(row.w) 155 | p_mat <- sweep(p_mat, 2, sqrt(col.w), "*") 156 | res <- p_mat 157 | } 158 | 159 | if(!is.null(powdef_alpha)){ 160 | return(sign(res) * (abs(res)^powdef_alpha)) 161 | } 162 | else if (smooth){ 163 | return(trim_matdist(res, ...)) 164 | } 165 | return(res) 166 | } 167 | 168 | 169 | #' corral: Correspondence analysis on a single matrix 170 | #' 171 | #' corral can be used for dimension reduction to find a set of low-dimensional embeddings for a count matrix. 172 | #' 173 | #' @param inp matrix (or any matrix-like object that can be coerced using Matrix), numeric raw or lognormed counts (no negative values) 174 | #' @param row.w numeric vector; the row weights to use in chi-squared scaling. Defaults to `NULL`, in which case row weights are computed from the input matrix. 175 | #' @param col.w numeric vector; the column weights to use in chi-squared scaling. For instance, size factors could be given here. Defaults to `NULL`, in which case column weights are computed from the input matrix. 
176 | #' @inheritParams compsvd 177 | #' @inheritParams corral_preproc 178 | #' 179 | #' @return When run on a matrix, a list with the correspondence analysis matrix decomposition result: 180 | #' \describe{ 181 | #' \item{\code{d}}{a vector of the diagonal singular values of the input \code{mat} (from SVD output)} 182 | #' \item{\code{u}}{a matrix with the left singular vectors of \code{mat} in the columns (from SVD output)} 183 | #' \item{\code{v}}{a matrix with the right singular vectors of \code{mat} in the columns. When cells are in the columns, these are the cell embeddings. (from SVD output)} 184 | #' \item{\code{eigsum}}{sum of the eigenvalues for calculating percent variance explained} 185 | #' \item{\code{SCu and SCv}}{standard coordinates, left and right, respectively} 186 | #' \item{\code{PCu and PCv}}{principal coordinates, left and right, respectively} 187 | #' } 188 | #' 189 | #' @rdname corral 190 | #' @export 191 | #' 192 | #' @importFrom irlba irlba 193 | #' @importFrom Matrix Matrix rowSums colSums 194 | #' @importClassesFrom Matrix dgCMatrix 195 | #' 196 | #' @examples 197 | #' mat <- matrix(sample(0:10, 5000, replace=TRUE), ncol=50) 198 | #' result <- corral_mat(mat) 199 | #' result <- corral_mat(mat, method = 'irl', ncomp = 5) 200 | #' 201 | corral_mat <- function(inp, method = c('irl','svd'), ncomp = 30, row.w = NULL, col.w = NULL, rtype = c('standardized','indexed','hellinger','freemantukey','pearson'), vst_mth = c('none','sqrt','freemantukey','anscombe'), ...){ 202 | method <- match.arg(method, c('irl','svd')) 203 | rtype <- match.arg(rtype, c('standardized','indexed','hellinger','freemantukey','pearson')) 204 | vst_mth <- match.arg(vst_mth, c('none','sqrt','freemantukey','anscombe')) 205 | preproc_mat <- corral_preproc(inp, row.w = row.w, col.w = col.w, rtype = rtype, vst_mth = vst_mth, ...) 206 | result <- compsvd(preproc_mat, method, ncomp, ...) 207 | w <- get_weights(inp) 208 | if(is.null(row.w)) {row.w <- w$row.w} 209 | if(is.null(col.w)) {col.w <- w$col.w} 210 | result[['SCu']] <- sweep(result$u,1,sqrt(row.w),'/') # Standard coordinates 211 | result[['SCv']] <- sweep(result$v,1,sqrt(col.w),'/') 212 | result[['PCu']] <- sweep(result[['SCu']],2,result$d[seq(1,ncol(result$u),1)],'*') # Principal coordinates 213 | result[['PCv']] <- sweep(result[['SCv']],2,result$d[seq(1,ncol(result$v),1)],'*') 214 | class(result) <- c(class(result),'corral') 215 | return(result) 216 | } 217 | 218 | #' corral: Correspondence analysis on a single matrix (SingleCellExperiment) 219 | #' 220 | #' @param inp SingleCellExperiment; raw or lognormed counts (no negative values) 221 | #' @inheritParams compsvd 222 | #' @param whichmat character; defaults to \code{counts}, can also use \code{logcounts} or \code{normcounts} if stored in the \code{sce} object 223 | #' @param fullout boolean; whether the function will return the full \code{corral} output as a list, or a SingleCellExperiment; defaults to SingleCellExperiment (\code{FALSE}). To get back the \code{\link{corral_mat}}-style output, set this to \code{TRUE}. 224 | #' @param subset_row numeric, character, or boolean vector; the rows to include in corral, as indices (numeric), rownames (character), or with booleans (same length as the number of rows in the matrix). If this parameter is \code{NULL}, then all rows will be used. 225 | #' 226 | #' @return When run on a \code{\link{SingleCellExperiment}}, returns a SCE with the embeddings (PCv from the full corral output) in the \code{reducedDim} slot \code{corral} (default).
It can also return the same output as \code{\link{corral_mat}} when \code{fullout} is set to \code{TRUE}. 227 | #' 228 | #' @rdname corral 229 | #' @export 230 | #' 231 | #' @importFrom irlba irlba 232 | #' @importFrom Matrix Matrix rowSums colSums 233 | #' @importFrom SingleCellExperiment reducedDim 234 | #' @importClassesFrom Matrix dgCMatrix 235 | #' @importClassesFrom SingleCellExperiment SingleCellExperiment 236 | #' 237 | #' @examples 238 | #' library(DuoClustering2018) 239 | #' sce <- sce_full_Zhengmix4eq()[1:100,1:100] 240 | #' result_1 <- corral_sce(sce) 241 | #' result_2 <- corral_sce(sce, method = 'svd') 242 | #' result_3 <- corral_sce(sce, method = 'irl', ncomp = 30, whichmat = 'logcounts') 243 | #' 244 | #' 245 | corral_sce <- function(inp, method = c('irl','svd'), ncomp = 30, whichmat = 'counts', fullout = FALSE, subset_row = NULL, ...){ 246 | method <- match.arg(method, c('irl','svd')) 247 | inp_mat <- SummarizedExperiment::assay(inp, whichmat) 248 | if(!is.null(subset_row)){ 249 | inp_mat <- inp_mat[subset_row,] 250 | } 251 | svd_output <- corral_mat(inp_mat, ...) 252 | if(fullout){ 253 | return(svd_output) 254 | } 255 | else{ 256 | SingleCellExperiment::reducedDim(inp,'corral') <- svd_output$PCv 257 | return(inp) 258 | } 259 | } 260 | 261 | 262 | #' corral: Correspondence analysis for a single matrix 263 | #' 264 | #' \code{corral} is a wrapper for \code{\link{corral_mat}} and \code{\link{corral_sce}}, and can be called on any of the acceptable input types. 265 | #' 266 | #' @param inp matrix (any type), \code{SingleCellExperiment}, or \code{SummarizedExperiment}. If using \code{SingleCellExperiment} or \code{SummarizedExperiment}, then include the \code{whichmat} argument to specify which slot to use (defaults to \code{counts}). 267 | #' @param ... (additional arguments for methods) 268 | #' 269 | #' @return For matrix and \code{SummarizedExperiment} input, returns a list with the correspondence analysis matrix decomposition result (u,v,d are the raw svd output; SCu and SCv are the standard coordinates; PCu and PCv are the principal coordinates) 270 | #' @return 271 | #' For \code{SingleCellExperiment} input, returns the SCE with the embeddings (PCv) in the \code{reducedDim} slot \code{corral}, unless \code{fullout = TRUE}, in which case the full list is returned as for a matrix. 272 | #' @rdname corral 273 | #' @export 274 | #' 275 | #' @importFrom irlba irlba 276 | #' @importFrom Matrix Matrix rowSums colSums 277 | #' @importFrom methods is 278 | #' @importFrom SingleCellExperiment reducedDim 279 | #' @importFrom SummarizedExperiment assay 280 | #' @importClassesFrom Matrix dgCMatrix 281 | #' @importClassesFrom SingleCellExperiment SingleCellExperiment 282 | #' 283 | #' @examples 284 | #' library(DuoClustering2018) 285 | #' sce <- sce_full_Zhengmix4eq()[1:100,1:100] 286 | #' corral_sce <- corral(sce,whichmat = 'counts') 287 | #' 288 | #' mat <- matrix(sample(0:10, 500, replace=TRUE), ncol=25) 289 | #' corral_mat <- corral(mat, ncomp=5) 290 | #' 291 | corral <- function(inp,...){ 292 | if(is(inp,"SingleCellExperiment")){ 293 | corral_sce(inp = inp, ...) 294 | } else if(is(inp,"SummarizedExperiment")){ 295 | dots <- list(...); whichmat <- if(is.null(dots$whichmat)) 'counts' else dots$whichmat; dots$whichmat <- NULL 296 | do.call(corral_mat, c(list(inp = SummarizedExperiment::assay(inp, whichmat)), dots)) 297 | } else{ 298 | corral_mat(inp = inp, ...) 299 | } 300 | } 301 | 302 | 303 | #' Print method for S3 object corral 304 | #' 305 | #' @param x (print method) corral object; the list output from \code{corral_mat} 306 | #' 307 | #' @rdname corral 308 | #' 309 | #' @return no return value; prints a summary of the \code{corral} output
310 | #' @export 311 | #' 312 | #' @examples 313 | #' mat <- matrix(sample(1:100, 10000, replace = TRUE), ncol = 100) 314 | #' corral(mat) 315 | print.corral <- function(x,...){ 316 | inp <- x 317 | pct_var_exp <- inp$pct_var_exp 318 | ncomp <- length(inp$d) 319 | cat('corral output summary===========================================\n') 320 | cat(' Output "list" includes standard coordinates (SCu, SCv),\n') 321 | cat(' principal coordinates (PCu, PCv), & SVD output (u, d, v)\n') 322 | cat('Variance explained----------------------------------------------\n') 323 | print(round(pct_var_exp[,seq(1,min(8,ncomp),1)],2)) 324 | cat('\n') 325 | cat('Dimensions of output elements-----------------------------------\n') 326 | cat(' Singular values (d) :: ') 327 | cat(ncomp) 328 | cat('\n Left singular vectors & coordinates (u, SCu, PCu) :: ') 329 | cat(dim(inp$u)) 330 | cat('\n Right singular vectors & coordinates (v, SCv, PCv) :: ') 331 | cat(dim(inp$v)) 332 | cat('\n See corral help for details on each output element.') 333 | cat('\n Use plot_embedding to visualize; see docs for details.') 334 | cat('\n================================================================\n') 335 | } 336 | 337 | --------------------------------------------------------------------------------
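To tie together the functions defined in `R/corral.R` and `R/plot_embedding.R`, here is a minimal usage sketch on simulated data. It assumes the package is installed; the object names, toy counts, and group labels below are illustrative and are not taken from the package examples.

```r
library(corral)

# simulate a small count matrix: 100 "genes" (rows) x 50 "cells" (columns)
set.seed(1)
mat <- matrix(rpois(5000, lambda = 5), ncol = 50)
rownames(mat) <- paste0("gene", seq_len(nrow(mat)))

# run correspondence analysis; result is a list with d, u, v, SCu/SCv, PCu/PCv
res <- corral(mat, ncomp = 10)
res  # print.corral shows percent variance explained and output dimensions

# hypothetical cell groupings, only used here to color the plots
grp <- sample(c("groupA", "groupB"), ncol(mat), replace = TRUE)

# plot the cell embeddings
plot_embedding(res, xpc = 1, color_vec = grp, color_title = "group")

# biplot of cells plus the 20 genes farthest from the origin in dims 1-2
biplot_corral(corral_obj = res, color_vec = grp, text_vec = rownames(mat))
```

Note that `plot_embedding` recognizes the `corral` class and plots the principal coordinates (`PCv`), so the full result list can be passed directly rather than extracting the embedding matrix first.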