├── .Rbuildignore
├── .gitignore
├── .travis.yml
├── DESCRIPTION
├── NAMESPACE
├── R
    ├── marker_based.R
    ├── reference_based.R
    ├── simulation.R
    └── utils.R
├── README.md
├── man
    ├── CalculateSCCellProportions.Rd
    ├── CorTri.Rd
    ├── CountsToCPM.Rd
    ├── EstimatePCACellTypeProportions.Rd
    ├── FilterUnexpressedGenes.Rd
    ├── FilterZeroVarianceGenes.Rd
    ├── GenerateSCReference.Rd
    ├── GetCTP.Rd
    ├── GetNumGenes.Rd
    ├── GetNumGenesWeighted.Rd
    ├── GetOverlappingGenes.Rd
    ├── GetOverlappingSamples.Rd
    ├── GetUniqueMarkers.Rd
    ├── MarkerBasedDecomposition.Rd
    ├── ReferenceBasedDecomposition.Rd
    ├── SemisupervisedTransformBulk.Rd
    ├── SeuratToExpressionSet.Rd
    ├── SimulateBarcode.Rd
    ├── SimulateData.Rd
    └── SupervisedTransformBulk.Rd
├── tests
    ├── testthat.R
    └── testthat
    │   ├── test_marker_based.R
    │   ├── test_ref_based.R
    │   ├── test_sim.R
    │   └── test_utils.R
└── vignettes
    ├── .gitignore
    └── bisque.Rmd


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^\.travis\.yml$
2 | ^codecov\.yml$
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | inst/doc
2 | .Rproj.user
3 | .Rhistory
4 | .RData
5 | .Ruserdata
6 | *.Rproj
7 | *.html
8 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r
 2 | 
 3 | language: R
 4 | sudo: false
 5 | cache: packages
 6 | jobs:
 7 |   include:
 8 |     - stage: "Build"
 9 |       bioc_packages:
10 |         - Biobase
11 | r_packages:
12 |   - covr
13 | 
14 | after_success:
15 |   - Rscript -e 'library(covr); codecov()'
16 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: BisqueRNA
 2 | Title: Decomposition of Bulk Expression with Single-Cell Sequencing
 3 | Version: 1.0.5
 4 | Authors@R: c(
 5 |   person(given = 'Brandon', family = 'Jew', email = 'brandon.jew@ucla.edu', role = c('aut', 'cre')),
 6 |   person(given = 'Marcus', family = 'Alvarez', role = 'aut')
 7 |   )
 8 | Description: Provides tools to accurately estimate cell type abundances 
 9 |     from heterogeneous bulk expression. A reference-based method utilizes
10 |     single-cell information to generate a signature matrix and transformation
11 |     of bulk expression for accurate regression based estimates. A marker-based
12 |     method utilizes known cell-specific marker genes to measure relative
13 |     abundances across samples.
14 |     For more details, see Jew and Alvarez et al (2019) <doi:10.1101/669911>.
15 | Depends: R (>= 3.5.0)
16 | License: GPL-3
17 | Encoding: UTF-8
18 | LazyData: true
19 | RoxygenNote: 7.0.2
20 | biocViews:
21 | Imports:
22 |   Biobase,
23 |   limSolve,
24 |   methods,
25 |   stats
26 | Suggests:
27 |     Seurat,
28 |     plyr,
29 |     knitr,
30 |     rmarkdown,
31 |     testthat
32 | URL: https://www.biorxiv.org/content/10.1101/669911v1
33 | BugReports: https://github.com/cozygene/bisque/issues
34 | VignetteBuilder: knitr
35 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 | 
3 | export(CalculateSCCellProportions)
4 | export(GenerateSCReference)
5 | export(MarkerBasedDecomposition)
6 | export(ReferenceBasedDecomposition)
7 | export(SeuratToExpressionSet)
8 | export(SimulateData)
9 | 


--------------------------------------------------------------------------------
/R/marker_based.R:
--------------------------------------------------------------------------------
  1 | #' Estimate cell type proportions using first PC of expression matrix
  2 | #' 
  3 | #' @param x A sample by gene bulk expression matrix. Genes should be marker
  4 | #'   genes
  5 | #' @param weighted Boolean. If weighted=TRUE, multiply scaled gene expression by
  6 | #'   gene weights
  7 | #' @param w Numeric vector. Weights of genes 
  8 | #' @return ret List. Attribute \strong{pcs} contains matrix of PCs, where PC1
  9 | #'   should be used as estimates for cell type abundances
 10 | #'   Attribute \strong{sdev} contains eigenvalues of eigendecomposition of
 11 | #'   var-covar matrix. The 1st eigenvalue should explain most of the variance.
 12 | #'   Attribute \strong{genes} contains names of genes.
 13 | EstimatePCACellTypeProportions <- function(x, weighted=FALSE, w=NULL){
 14 |   x <- base::scale(x)
 15 |   if (weighted) {
 16 |     # Intersect gene names of weights and column names of x
 17 |     common.markers <- base::intersect( base::colnames(x), base::names(w) )
 18 |     # not sure if this will ever happen, since we check this at line 268
 19 |     #if ( length(common.markers) == 0 ) {
 20 |     #  base::stop(base::paste0("Genes from weights w do not match with column ",
 21 |     #                         "names of expression matrix x."))
 22 |     #}
 23 |     x <- x[,common.markers,drop=F]
 24 |     w <- w[common.markers,drop=F]
 25 |     wd <- base::diag(w)
 26 |     if (base::all.equal(base::dim(wd), c(0,0)) == TRUE){
 27 |       wd <- w
 28 |     }
 29 |     xw <- x %*% wd
 30 |     varcov <- t(xw) %*% xw
 31 |   }
 32 |   else {
 33 |     varcov <- t(x) %*% x
 34 |   }
 35 |   varcov.ed <- base::eigen(varcov)
 36 |   rot <- varcov.ed$vectors
 37 |   wpcs <- x %*% rot
 38 |   sds <- varcov.ed$values
 39 |   # x contains PCs, sdev contains eigenvalues of eigendecomposition
 40 |   return(list( pcs = wpcs, sdev = sds, genes=colnames(x)))
 41 | }
 42 | 
 43 | #' Get number of genes to use with weighted PCA
 44 | #'
 45 | #' @param x Numeric Matrix. A sample by gene expression matrix containing the
 46 | #'   marker genes.
 47 | #' @param w Numeric Vector. The weights of the genes that correspond to the
 48 | #'   columns of x.
 49 | #' @param min.gene Numeric. Minimum number of genes to consider as markers.
 50 | #' @param max.gene Numeric. Maximum number of genes to consider as markers.
 51 | #' @return best.n Numeric. Number of genes to use
 52 | GetNumGenesWeighted = function(x, w, min.gene = 25, max.gene = 200){
 53 |   max.gene = base::min(max.gene, base::ncol(x))
 54 |   if (max.gene == 1) return(1)
 55 |   ratios = base::lapply(min.gene:max.gene,
 56 |                         function(i) {
 57 |                           ret = EstimatePCACellTypeProportions(x[,1:i,drop=FALSE],
 58 |                                         weighted=TRUE,
 59 |                                         w=w[1:i])
 60 |                           vars = ret$sdev
 61 |                           return(vars[1] / vars[2])
 62 |                         })
 63 |   best.n = base::which.max(ratios) + min.gene - 1
 64 |   return(best.n)
 65 | }
 66 | 
 67 | #' Get number of genes to use with no weighted information
 68 | #'
 69 | #' @param x Numeric Matrix. A sample by gene expression matrix containing the
 70 | #'   marker genes.
 71 | #' @param min.gene Numeric. Minimum number of genes to consider as markers.
 72 | #' @param max.gene Numeric. Maximum number of genes to consider as markers.
 73 | #' @return best.n Numeric. Number of genes to use
 74 | GetNumGenes = function(x, min.gene = 25, max.gene = 200){
 75 |   max.gene = base::min(max.gene, base::ncol(x))
 76 |   if (max.gene == 1) return(1)
 77 |   ratios = base::lapply(min.gene:max.gene,
 78 |                         function(i) {
 79 |                           ret = EstimatePCACellTypeProportions(x[,1:i])
 80 |                           vars = ret$sdev
 81 |                           return(vars[1] / vars[2])
 82 |                         })
 83 |   best.n = base::which.max(ratios) + min.gene - 1
 84 |   return(best.n)
 85 | }
 86 | 
 87 | #' Get unique markers present in only 1 cell type
 88 | #'
 89 | #' Given a data frame of marker genes for cell types, 
 90 | #' returns a new data frame with non-unique markers removed.
 91 | #'
 92 | #' @param x Data frame. Contains column with marker gene names
 93 | #' @param gene_col Character string. Name of the column that contains
 94 | #'   the marker genes
 95 | #' @return x Data frame. Markers with non-unique markers removed
 96 | #'
 97 | GetUniqueMarkers <- function(x, gene_col="gene"){
 98 |   keep <- ! (duplicated(x[,gene_col], fromLast = FALSE) |
 99 |              duplicated(x[,gene_col], fromLast = TRUE) )
100 |   return(x[keep,])
101 | }
102 | 
103 | #' Correlate columns of data frame
104 | #'
105 | #' This function runs correlation between markers of a data frame or matrix, 
106 | #' returning the values of the lower/upper triangular of the correlation matrix
107 | #' in a vector.
108 | #'
109 | #' @param x Data frame or matrix. Column vectors are correlated
110 | #' @param method Character string. Name of method passed to cor. 
111 | #'   Pearson by default.
112 | #' @return cors Numeric vector. Correlation coefficients of pairs
113 | #'
114 | CorTri <- function(x, method="pearson"){
115 |   cors <- stats::cor(x, method=method)
116 |   cors <- cors[base::lower.tri(cors)]
117 |   return(cors)
118 | }
119 | 
120 | #' Return cell type proportions from bulk
121 | #'
122 | #' Calculate cell type proportions from a data frame containing bulk expression
123 | #' values. Uses PCA (weighted or regular) to estimate relative proportions
124 | #' within each cell type.
125 | #'
126 | #' @param bulk Expression Set containing bulk data
127 | #' @param cell_types Character vector. Names of cell types.
128 | #' @param markers Data frame with columns specifying cluster and gene,
129 | #'   and optionally a column for weights, typically the fold-change of the gene.
130 | #'   Important that the genes for each cell type are row-sorted by signficance.
131 | #' @param ct_col Character string. Column name specifying cluster/cell type
132 | #'   corresponding to each marker gene in \strong{markers}. 
133 | #' @param gene_col Character string. Column name specifying gene names in
134 | #'   \strong{markers}.
135 | #' @param min_gene Numeric. Min number of genes to use for each cell type.
136 | #' @param max_gene Numeric. Max number of genes to use for each cell type.
137 | #' @param weighted Boolean. Whether to use weights for gene prioritization
138 | #' @param w_col Character string. Column name for weights, such as "avg_logFC",
139 | #'  in \strong{markers}
140 | #' @param verbose Boolean. Whether to print log info during decomposition.
141 | #'   Errors will be printed regardless. 
142 | #'
143 | #' @return A List. Slot \strong{cors} contains list of vectors with correlation
144 | #'   coefficients. Slot \strong{ctps} contains list of CTP objects returned by
145 | #'   GetCTP
146 | GetCTP <- function(bulk,
147 |                    cell_types,
148 |                    markers,
149 |                    ct_col,
150 |                    gene_col,
151 |                    min_gene,
152 |                    max_gene,
153 |                    weighted,
154 |                    w_col,
155 |                    verbose){
156 |   ctp = base::lapply(cell_types, function(ct){
157 |                      # Get marker genes
158 |                      markers_ct = markers[ markers[,ct_col] == ct , , drop=FALSE]
159 |                      ctm = base::make.unique(as.character(markers_ct[, gene_col]))
160 |                      # Get markers in common between bulk and markers data frame
161 |                      common_markers = base::intersect(ctm, Biobase::featureNames(bulk))
162 |                      if ( base::length(common_markers) == 0 ){
163 |                        base::stop("No marker genes found in bulk expression data")
164 |                      }
165 |                      expr = base::t(Biobase::exprs(bulk)[common_markers, , drop = FALSE])
166 |                      if ( base::ncol(expr) < min_gene ){
167 |                        base::stop(base::paste0(base::sprintf("For cell type %s, There are less marker genes in ", ct),
168 |                                                base::sprintf("the bulk expression set (%i) than the ", base::ncol(expr)),
169 |                                                base::sprintf("minimum number of genes set (%i) ", min_gene),
170 |                                                "for PCA-based decomposition\nSet the min_gene parameter to a lower integer."))
171 |                      }
172 |                      if (weighted){
173 |                        # Get gene weights
174 |                        ctw = markers_ct[, w_col]; names(ctw) = ctm; ctw = ctw[common_markers]
175 |                        ng = GetNumGenesWeighted(expr, ctw, min_gene, max_gene) # Number of markers for PCA
176 |                        expr = expr[, 1:ng, drop = FALSE]
177 |                        if (verbose){
178 |                          base::message(base::sprintf("Using %i genes for cell type %s; ", ng, ct))
179 |                        }
180 |                        ret = EstimatePCACellTypeProportions(expr, weighted = TRUE, w = ctw[1:ng])
181 |                      }
182 |                      else{
183 |                        ng = GetNumGenes(expr, min_gene, max_gene)
184 |                        expr = expr[, 1:ng, drop = FALSE]
185 |                        if (verbose){
186 |                          base::message(base::sprintf("Using %i genes for cell type %s; ", ng, ct))
187 |                        }
188 |                        ret = EstimatePCACellTypeProportions(expr)
189 |                      }
190 |                      # Flip the sign of the first PC if negatively correlated with most genes
191 |                      cors = stats::cor(expr, ret$pcs[,1])
192 |                      n_pos = sum(cors[,1] > 0)
193 |                      if (n_pos/base::length(cors[,1]) < (0.5)) {
194 |                        ret$pcs[,1] = ret$pcs[,1] * -1
195 |                      }
196 |                      if (verbose){
197 |                        cors = stats::cor(expr, ret$pcs[,1]); n_pos = sum(cors[,1] > 0)
198 |                        pct <- as.character(as.integer(100 * round(n_pos/base::length(cors[,1]), 2)))
199 |                        clen <- as.character(base::length(cors[,1]))
200 |                        base::message(base::paste0(pct, "% of ", clen, " marker genes correlate positively with PC1 for cell type ", ct))
201 |                      }
202 |                      return(ret)
203 |                    })
204 |   return(ctp)
205 | }
206 | 
207 | #' Performs marker-based decomposition of bulk expression using marker genes
208 | #' 
209 | #' Estimates relative abundances of cell types from PCA-based decomposition.
210 | #' Uses a list of marker genes to subset the expression data, and returns the 
211 | #' first PC of each sub-matrix as the cell type fraction estimates.
212 | #' Optionally, weights for each marker gene can be used to prioritize genes
213 | #' that are highly expressed in the given cell type.
214 | #'
215 | #' Note that this method expects the input bulk data to be normalized, unlike
216 | #' the reference-based method.
217 | #' 
218 | #' @param bulk.eset Expression Set. Normalized bulk expression data.
219 | #' @param markers Data frame with columns specifying cluster and gene,
220 | #'   and optionally a column for weights, typically the fold-change of the gene.
221 | #'   Important that the genes for each cell type are row-sorted by signficance.
222 | #' @param ct_col Character string. Column name specifying cluster/cell type
223 | #'   corresponding to each marker gene in \strong{markers}. 
224 | #' @param gene_col Character string. Column name specifying gene names in
225 | #'   \strong{markers}.
226 | #' @param min_gene Numeric. Min number of genes to use for each cell type.
227 | #' @param max_gene Numeric. Max number of genes to use for each cell type.
228 | #' @param weighted Boolean. Whether to use weights for gene prioritization
229 | #' @param w_col Character string. Column name for weights, such as "avg_logFC",
230 | #'  in \strong{markers}
231 | #' @param unique_markers Boolean. If TRUE, subset markers to include only genes 
232 | #'   that are markers for only one cell type
233 | #' @param verbose Boolean. Whether to print log info during decomposition.
234 | #'   Errors will be printed regardless. 
235 | #' @return A List. Slot \strong{bulk.props} contains estimated relative cell
236 | #'   type abundances. Slot \strong{var.explained} contains variance explained by
237 | #'   first 20 PCs for cell type marker genes. Slot \strong{genes.used} contains
238 | #'   vector of genes used for decomposition.
239 | #' @examples
240 | #' library(Biobase)
241 | #' sim.data <- SimulateData(n.ind=10, n.genes=100, n.cells=100,
242 | #'                          cell.types=c("Neurons", "Astrocytes", "Microglia"),
243 | #'                          avg.props=c(.5, .3, .2))
244 | #' res <- MarkerBasedDecomposition(sim.data$bulk.eset, sim.data$markers, weighted=FALSE)
245 | #' estimated.cell.proportions <- res$bulk.props
246 | #' 
247 | #' @export
248 | MarkerBasedDecomposition <- function(bulk.eset, 
249 |                                      markers,
250 |                                      ct_col="cluster",
251 |                                      gene_col="gene",
252 |                                      min_gene = 5,
253 |                                      max_gene = 200,
254 |                                      weighted=FALSE,
255 |                                      w_col = "avg_logFC",
256 |                                      unique_markers = TRUE,
257 |                                      verbose=TRUE){
258 |   # Check input
259 |   if ( ! methods::is(bulk.eset, "ExpressionSet") ) {
260 |     base::stop("Expression data should be in ExpressionSet")
261 |   }
262 |   if (min_gene > max_gene){
263 |     base::stop(base::paste0(base::sprintf("min_gene (set at %i) ", min_gene),
264 |                             "must be less than or equal to max_gene ",
265 |                             base::sprintf("(set at %i)\n", max_gene)))
266 |   }
267 |   if (min_gene <= 0){
268 |       base::stop("'min_gene' must be greater than or equal to 1")
269 |   }
270 | 
271 |   # Get unique markers if applicable
272 |   if (unique_markers){
273 |     if (verbose) message("Getting unique markers")
274 |     # Remove gene rows from markers data frame that are shared
275 |     markers <- GetUniqueMarkers(markers, gene_col=gene_col)
276 |   }
277 |   cg <- intersect(unique(markers[,gene_col]), rownames(bulk.eset))
278 |   if (length(cg) == 0) {
279 |     base::stop("No overlapping genes between markers and bulk.eset")
280 |   }
281 |   markers <- markers[markers[,gene_col] %in% cg,]
282 | 
283 |   # Check if there are enough markers
284 |   n_genes_clust <- table(markers[,ct_col])
285 |   if (any(n_genes_clust < min_gene)) {
286 |     nctbelow <- sum(n_genes_clust < min_gene)
287 |     ctbelow <- names(n_genes_clust)[n_genes_clust < min_gene]
288 |     base::stop(ngettext(nctbelow, "Cell type ", "Cell types "), 
289 |                paste(ctbelow, collapse = ", "), 
290 |                ngettext(nctbelow, " has ", " have "), 
291 |                "less than min_gene=", 
292 |                as.character(min_gene), 
293 |                " marker genes")
294 |   }
295 | 
296 |   # Throw warning if less than 5 markers used for decomposition
297 |   if (verbose){
298 |     if (any(n_genes_clust < 5)){
299 |       nctbelow <- sum(n_genes_clust < 5)
300 |       ctbelow <- names(n_genes_clust)[n_genes_clust < 5]
301 |       base::warning("WARN: Less than 5 marker genes available for ", 
302 |                     "PCA-based decomposition in ", 
303 |                     ngettext(nctbelow, "cell type ", "cell types "), 
304 |                     paste(ctbelow, collapse = ", "), 
305 |                     ". Estimated cell type proportions may be ", 
306 |                     "unreliable")
307 |     }
308 |   }
309 | 
310 |   markers[,ct_col] <- as.character(markers[,ct_col])
311 |   cell_types <- sort(unique(markers[,ct_col]))
312 |   n_ct <- length(cell_types)
313 |   n_s <- base::ncol(bulk.eset)
314 |   if (verbose){
315 |     base::message(base::paste0("Estimating proportions for ", 
316 |                                base::sprintf("%i cell types in %i samples", 
317 |                                              n_ct, n_s)))
318 |   }
319 | 
320 |   # Remove zero-variance genes
321 |   bulk.eset <- FilterZeroVarianceGenes(bulk.eset, verbose)
322 | 
323 |   # Get cell type proportions
324 |   ctp <- GetCTP(bulk = bulk.eset, 
325 |                 cell_types = cell_types, 
326 |                 markers = markers, 
327 |                 ct_col = ct_col, 
328 |                 gene_col = gene_col,
329 |                 min_gene = min_gene, 
330 |                 max_gene = max_gene, 
331 |                 weighted = weighted, 
332 |                 w_col = w_col, 
333 |                 verbose = verbose)
334 |   names(ctp) <- cell_types
335 |   ctp_pc1 <- base::lapply(ctp, function(x) x$pcs[,1])
336 |   ctp_pc1 <- base::do.call(cbind, ctp_pc1)
337 | 
338 |   # Check if all proportions are correlated with each other
339 |   ctp_cors <- stats::cor(ctp_pc1)
340 |   ctp_cors <- CorTri(ctp_cors)
341 |   if (verbose & all(ctp_cors > 0)){
342 |     base::warning("WARN: All cell type proportion estimates are correlated ",
343 |                   "positively with each other. Check to make sure the ",
344 |                   "expression data is properly normalized")
345 |   }
346 | 
347 |   # Return results in list
348 |   ctp_pc1 <- base::t(ctp_pc1)
349 |   ctp_varexpl <- base::sapply(ctp, function(x) x$sdev[1:20])
350 |   rownames(ctp_varexpl) <- base::paste0("PC", base::as.character(1:20))
351 |   genes_used <- base::lapply(ctp, function(x) x$genes)
352 |   if (verbose) message("Finished estimating cell type proportions using PCA")
353 |   return(list(bulk.props=ctp_pc1,
354 |               var.explained=ctp_varexpl,
355 |               genes.used=genes_used))
356 | }
357 | 
358 | 


--------------------------------------------------------------------------------
/R/reference_based.R:
--------------------------------------------------------------------------------
  1 | #' Find overlapping samples in single-cell and bulk data
  2 | #'
  3 | #' @param sc.eset Expression Set with single-cell data
  4 | #' @param bulk.eset Expression Set with bulk data
  5 | #' @param subject.names A character string. Name of phenoData attribute in
  6 | #'   sc.eset that indicates individual ID (that would be found in bulk.eset
  7 | #'   if overlapping)
  8 | #' @param verbose Boolean. Print logging info
  9 | #' @return samples A list with attributes \emph{overlapping} and
 10 | #'   \emph{remaining}. Each attribute refers to a character vector that lists
 11 | #'   the samples found in both datasets and samples found only in bulk,
 12 | #'   respectively
 13 | GetOverlappingSamples <- function(sc.eset, bulk.eset, subject.names, verbose) {
 14 |   bulk.samples <- Biobase::sampleNames(bulk.eset)
 15 |   sc.samples <- base::levels(base::factor(sc.eset[[subject.names]]))
 16 |   overlapping.samples <- base::intersect(bulk.samples, sc.samples)
 17 |   if (base::length(overlapping.samples) == 0) {
 18 |     base::stop("No overlapping samples in bulk and single-cell expression.")
 19 |   }
 20 |   remaining.samples <- base::setdiff(Biobase::sampleNames(bulk.eset),
 21 |                                      overlapping.samples)
 22 |   if (base::length(remaining.samples) == 0) {
 23 |     base::stop("All samples have single-cell data, nothing to process.")
 24 |   }
 25 |   samples <- base::list("overlapping"=overlapping.samples,
 26 |                         "remaining"=remaining.samples)
 27 |   if (verbose) {
 28 |     n.overlapping <- base::length(samples$overlapping)
 29 |     n.remaining <- base::length(samples$remaining)
 30 |     base::message(base::sprintf("Found %i samples ", n.overlapping),
 31 |                                 "with bulk and single-cell expression.")
 32 |     base::message(base::sprintf("Remaining %i ", n.remaining),
 33 |                                 "bulk samples will be decomposed.")
 34 |   }
 35 |   return(samples)
 36 | }
 37 | 
 38 | #' Find overlapping genes in single-cell data, bulk data, and marker genes
 39 | #'
 40 | #' @param sc.eset Expression Set with single-cell data
 41 | #' @param bulk.eset Expression Set with bulk data
 42 | #' @param markers Character vector. List of relevant marker genes
 43 | #' @param verbose Boolean. Print logging info
 44 | #' @return overlapping.genes Character vector. List of genes found in markers
 45 | #'   and both datasets. 
 46 | GetOverlappingGenes <- function(sc.eset, bulk.eset, markers, verbose) {
 47 |   bulk.genes <- Biobase::featureNames(bulk.eset)
 48 |   sc.genes <- Biobase::featureNames(sc.eset)
 49 |   overlapping.genes <- base::intersect(bulk.genes, sc.genes)
 50 |   if (base::length(overlapping.genes) == 0) {
 51 |     base::stop(base::paste0("No overlapping genes found between bulk and ",
 52 |                            "single-cell expression."))
 53 |   }
 54 |   overlapping.genes <- base::intersect(overlapping.genes, markers)
 55 |   if (base::length(overlapping.genes) == 0) {
 56 |     base::stop(base::paste0("No marker genes found in both bulk and ",
 57 |                            "single-cell expression."))
 58 |   }
 59 |   if (verbose) {
 60 |     n.genes <- base::length(overlapping.genes)
 61 |     base::message(base::sprintf("Using %i genes in both", n.genes),
 62 |                                 " bulk and single-cell expression.")
 63 |   }
 64 |   return(overlapping.genes)
 65 | }
 66 | 
 67 | #' Generate reference profile for cell types identified in single-cell data
 68 | #'
 69 | #' Averages expression within each cell type across all samples to use as 
 70 | #' reference profile.
 71 | #'
 72 | #' @param sc.eset Expression Set with single-cell data
 73 | #' @param cell.types A character string. Name of phenoData attribute in sc.eset
 74 | #'   that indicates cell type
 75 | #' @return sc.ref Matrix. Reference profile with number of gene rows by number
 76 | #'   of cell types columns. 
 77 | #' @export
 78 | GenerateSCReference <- function(sc.eset, cell.types) {
 79 |   cell.labels <- base::factor(sc.eset[[cell.types]])
 80 |   all.cell.types <- base::levels(cell.labels)
 81 |   aggr.fn <- function(cell.type) {
 82 |     base::rowMeans(Biobase::exprs(sc.eset)[,cell.labels == cell.type, drop=F])
 83 |   }
 84 |   template <- base::numeric(base::nrow(sc.eset))
 85 |   sc.ref <- base::vapply(all.cell.types, aggr.fn, template)
 86 |   return(sc.ref)
 87 | }
 88 | 
 89 | #' Calculate cell proportions based on single-cell data
 90 | #'
 91 | #' Returns proportion of each cell type out of total cells for each individual
 92 | #' in the single-cell Expression Set
 93 | #'
 94 | #' @param sc.eset Expression Set with single-cell data
 95 | #' @param subject.names A character string. Name of phenoData attribute in 
 96 | #'   sc.eset that indicates individual ID.
 97 | #' @param cell.types A character string. Name of phenoData attribute in sc.eset
 98 | #'   that indicates cell type
 99 | #' @return sc.props Matrix. Cell proportions with number of cell types rows 
100 | #'   by number of individuals columns
101 | #' @export
102 | CalculateSCCellProportions <- function(sc.eset, subject.names, cell.types) {
103 |   individual.labels <- base::factor(sc.eset[[subject.names]])
104 |   individuals <- base::levels(individual.labels)
105 |   cell.labels <- base::as.factor(sc.eset[[cell.types]])
106 |   aggr.fn <- function(individual) {
107 |      base::table(cell.labels[individual.labels == individual]) /
108 |        base::length(cell.labels[individual.labels == individual])
109 |   }
110 |   sc.props <- base::sapply(individuals, aggr.fn)
111 |   return(sc.props)
112 | }
113 | 
114 | #' Transforms bulk expression of a gene given overlapping data
115 | #'
116 | #' For a specific gene, this function uses linear regression to learn a
117 | #' transformation of the bulk expression to match the values produced
118 | #' by the single-cell based reference and observed single-cell based cell
119 | #' proportions.
120 | #'
121 | #' If a linear transformation cannot be learned for a gene (zero variance in
122 | #' observed bulk or single-cell based weighted sums), a vector of NaNs will 
123 | #' be returned of the expected length (length of X.pred)
124 | #'
125 | #' @param gene Character string. Gene name that corresponds to row in Y.train
126 | #' @param Y.train Numeric Matrix. Number of gene rows by number of overlapping
127 | #'   individuals columns. Contains weighted sum of reference profile by 
128 | #'   single-cell based cell proportion estimates for each individual
129 | #' @param X.train Numeric Matrix. Number of gene rows by number of overlapping
130 | #'   individuals columns. Contains observed bulk expression for each individual
131 | #' @param X.pred Numeric Matrix. Number of gene rows by number of remaining
132 | #'   individuals columns. Contains observed bulk expression for each individual
133 | #'   to be transformed.
134 | #' @return Y.pred Numeric Matrix. One row for given gene by number of remaining
135 | #'   individuals columns. Contains transformed bulk expression for each
136 | #'   individual.
137 | SupervisedTransformBulk <- function(gene, Y.train, X.train, X.pred) {
138 |   Y.train.scaled <- base::scale(Y.train[gene,,drop=T])
139 |   Y.center <- base::attr(Y.train.scaled, "scaled:center")
140 |   Y.scale <- base::attr(Y.train.scaled, "scaled:scale")
141 |   X.train.scaled <- base::scale(X.train[gene,,drop=T])
142 |   X.center <- base::attr(X.train.scaled, "scaled:center")
143 |   X.scale <- base::attr(X.train.scaled, "scaled:scale")
144 |   # If zero variance in both X and Y train, just solve coefficient directly
145 |   # for one individual.
146 |   if (base::anyNA(X.train.scaled) & base::anyNA(Y.train.scaled)) {
147 |     coeff <- Y.train[gene,,drop=T][1]/X.train[gene,,drop=T][1]
148 |     if (coeff == 0 || ! is.finite(coeff)) {
149 |       coeff = NaN
150 |     }
151 |     Y.pred <- base::matrix(X.pred[gene,,drop=T] * coeff,
152 |                            dimnames=base::list(base::colnames(X.pred), gene))
153 |   }
154 |   # If only one of X or Y has zero variance, return NaN. We shouldn't use this 
155 |   # gene for decomposition.
156 |   else if (anyNA(X.train.scaled) || anyNA(Y.train.scaled)) {
157 |     Y.pred <- base::matrix(X.pred[gene,,drop=T] * NaN,
158 |                            dimnames=base::list(base::colnames(X.pred), gene))
159 |   }
160 |   # Otherwise, do standard linear model on scaled data, then unscale.
161 |   else {
162 |     X.pred.scaled <- base::scale(X.pred[gene,,drop=T],
163 |                                  center=X.center,
164 |                                  scale=X.scale)
165 |     model <- stats::lm(Y.train.scaled ~ X.train.scaled +0)
166 |     coeff <- base::as.numeric(stats::coefficients(model))
167 |     Y.pred.scaled <- X.pred.scaled * coeff
168 |     Y.pred <- base::matrix((Y.pred.scaled * Y.scale) + Y.center,
169 |                            dimnames=base::list(base::colnames(X.pred), gene))
170 |   }
171 |   return(Y.pred)
172 | }
173 | 
174 | #' Transforms bulk expression of a gene using only single-cell data
175 | #'
176 | #' For a specific gene, this function learns a transformation of
177 | #' the bulk expression to match the distribution produced
178 | #' by the single-cell based reference and observed single-cell based cell
179 | #' proportions.
180 | #'
181 | #' 02/23/2021 - Github user @empircalbayes pointed out typo in shrink.scale
182 | #'              Changed denominator from n+1 to (n+1). Thanks @empiricalbayes!
183 | #'
184 | #' @param gene Character string. Gene name that corresponds to row in Y.train
185 | #' @param Y.train Numeric Matrix. Number of gene rows by number of overlapping
186 | #'   individuals columns. Contains weighted sum of reference profile by 
187 | #'   single-cell based cell proportion estimates for each individual
188 | #' @param X.pred Numeric Matrix. Number of gene rows by number of remaining
189 | #'   individuals columns. Contains observed bulk expression for each individual
190 | #'   to be transformed.
191 | #' @return Y.pred Numeric Matrix. One row for given gene by number of remaining
192 | #'   individuals columns. Contains transformed bulk expression for each
193 | #'   individual.
194 | SemisupervisedTransformBulk <- function(gene, Y.train, X.pred) {
195 |   # Learns linear transformation of observed bulk to match distribution of
196 |   # weighted sum of reference
197 |   #
198 |   # Used with vapply, processes one gene
199 |   Y.train.scaled <- base::scale(Y.train[gene,,drop=T])
200 |   Y.center <- base::attr(Y.train.scaled, "scaled:center")
201 |   Y.scale <- base::attr(Y.train.scaled, "scaled:scale")
202 |   n <- base::length(Y.train.scaled)
203 |   # Shrinkage estimator that minimizes MSE for scaling factor
204 |   shrink.scale <- base::sqrt(base::sum((Y.train[gene,,drop=T]-Y.center)^2)/(n+1))
205 |   X.pred.scaled <- base::scale(X.pred[gene,,drop=T])
206 |   Y.pred <- base::matrix((X.pred.scaled * shrink.scale) + Y.center,
207 |                          dimnames=base::list(base::colnames(X.pred), gene))
208 |   return(Y.pred)
209 | }
210 | 
211 | #' Performs reference-based decomposition of bulk expression using single-cell
212 | #' data
213 | #'
214 | #' Generates a reference profile based on single-cell data. Learns a
215 | #' transformation of bulk expression based on observed single-cell proportions
216 | #' and performs  NNLS regression on these transformed values to estimate cell
217 | #' proportions.
218 | #'
219 | #' Expects read counts for both datasets, as they will be converted to 
220 | #' counts per million (CPM). Two options available: Use overlapping indivudals
221 | #' found in both single-cell and bulk datasets to learn transformation or 
222 | #' learn transformation from single-cell alone. The overlapping option is
223 | #' expected to have better performance. 
224 | #'
225 | #' @param bulk.eset Expression Set containin bulk data. No PhenoData required
226 | #'   but if overlapping option used, IDs returned by sampleNames(bulk.eset) 
227 | #'   should match those found in sc.eset phenoData individual labels.
228 | #' @param sc.eset Expression Set containing single-cell data. PhenoData of this
229 | #'   Expression Set should contain cell type and individual labels for each
230 | #'   cell. Names of these fields specified by arguments below.
231 | #' @param markers Structure, such as character vector, containing marker genes
232 | #'   to be used in decomposition. `base::unique(base::unlist(markers))` should
233 | #'   return a simple vector containing each gene name. If no argument or NULL
234 | #'   provided, the method will use all available genes for decomposition.
235 | #' @param cell.types Character string. Name of phenoData attribute in sc.eset
236 | #'   indicating cell type label for each cell
237 | #' @param subject.names Character string. Name of phenoData attribute in sc.eset
238 | #'   indicating individual label for each cell
239 | #' @param use.overlap Boolean. Whether to use and expect overlapping samples 
240 | #'   in decomposition.
241 | #' @param verbose Boolean. Whether to print log info during decomposition.
242 | #'   Errors will be printed regardless.
243 | #' @param old.cpm Prior to version 1.0.4 (updated in July 2020), the package
244 | #'   converted counts to CPM after subsetting the marker genes. Github user
245 | #'   randel pointed out that the order of these operations should be switched.
246 | #'   Thanks randel! This option is provided for replication of older BisqueRNA
247 | #'   but should be enabled, especially for small marker gene sets.
248 | #'   We briefly tested this change on the cortex and adipose datasets.
249 | #'   The original and new order of operations produce estimates that have an
250 | #'   average correlation of 0.87 for the cortex and 0.84 for the adipose within
251 | #'   each cell type.
252 | #' @return A list. Slot \strong{bulk.props} contains a matrix of cell type
253 | #'   proportion estimates with cell types as rows and individuals as columns.
254 | #'   Slot \strong{sc.props} contains a matrix of cell type proportions 
255 | #'   estimated directly from counting single-cell data. 
256 | #'   Slot \strong{rnorm} contains Euclidean norm of the residuals for each
257 | #'   individual's proportion estimates. Slot \strong{genes.used} contains
258 | #'   vector of genes used in decomposition. Slot \strong{transformed.bulk}
259 | #'   contains the transformed bulk expression used for decomposition. These
260 | #'   values are generated by applying a linear transformation to the CPM
261 | #'   expression.
262 | #' @examples
263 | #' library(Biobase)
264 | #' sim.data <- SimulateData(n.ind=10, n.genes=100, n.cells=100,
265 | #'                          cell.types=c("Neurons", "Astrocytes", "Microglia"),
266 | #'                          avg.props=c(.5, .3, .2))
267 | #' sim.data$sc.eset <- sim.data$sc.eset[,sim.data$sc.eset$SubjectName %in% as.character(6:10)]
268 | #' res <- ReferenceBasedDecomposition(sim.data$bulk.eset, sim.data$sc.eset)
269 | #' estimated.cell.proportions <- res$bulk.props
270 | #' 
271 | #' @export
272 | ReferenceBasedDecomposition <- function(bulk.eset,
273 |                                         sc.eset,
274 |                                         markers=NULL,
275 |                                         cell.types="cellType",
276 |                                         subject.names="SubjectName",
277 |                                         use.overlap=TRUE, 
278 |                                         verbose=TRUE,
279 |                                         old.cpm=TRUE) {
280 |   if ((! methods::is(sc.eset, "ExpressionSet")) || 
281 |       (! methods::is(bulk.eset, "ExpressionSet"))) {
282 |     base::stop("Expression data should be in ExpressionSet")
283 |   }
284 |   else if (! cell.types %in% Biobase::varLabels(sc.eset)) {
285 |     base::stop(base::sprintf("Cell type label \"%s\" ", cell.types),
286 |                "not found in single-cell ExpressionSet varLabels.")
287 |   }
288 |   else if (! subject.names %in% Biobase::varLabels(sc.eset)) {
289 |     base::stop(base::sprintf("Individual label \"%s\"", subject.names),
290 |                " not found in single-cell ExpressionSet varLabels.")
291 |   }
292 |   n.sc.individuals <-
293 |     base::length(base::levels(base::factor(sc.eset[[subject.names]])))
294 |   if (n.sc.individuals == 1) {
295 |     base::stop("Only one individual detected in single-cell data. At least ",
296 |                "two subjects are needed (three or more recommended).")
297 |   }
298 |   else if (n.sc.individuals == 2) {
299 |     base::warning("Only two individuals detected in single-cell data. While ",
300 |                   "Bisque will run, we recommend at least three subjects for",
301 |                   " reliable performance.")
302 |   }
303 |   n.cell.types <-
304 |     base::length(base::levels(base::factor(sc.eset[[cell.types]])))
305 |   if (n.cell.types == 1) {
306 |     base::stop("Single-cell pheno data indicates only one cell type",
307 |                " present. No need for decomposition.")
308 |   }
309 |   if (verbose) {
310 |     base::message(base::sprintf("Decomposing into %i cell types.",
311 |                                 n.cell.types))
312 |   }
313 |   if (use.overlap) {
314 |     samples <- GetOverlappingSamples(sc.eset, bulk.eset, subject.names, verbose)
315 |   }
316 |   if (base::is.null(markers)) {
317 |     markers <- Biobase::featureNames(sc.eset)
318 |   }
319 |   else {
320 |     markers <- base::unique(base::unlist(markers))
321 |   }
322 |   genes <- GetOverlappingGenes(sc.eset, bulk.eset, markers, verbose)
323 |   if (old.cpm) {
324 |     sc.eset <-
325 |       Biobase::ExpressionSet(assayData=Biobase::exprs(sc.eset)[genes,],
326 |                              phenoData=sc.eset@phenoData)
327 |     bulk.eset <-
328 |       Biobase::ExpressionSet(assayData=Biobase::exprs(bulk.eset)[genes,],
329 |                              phenoData=bulk.eset@phenoData)
330 |   }
331 |   if (verbose) {
332 |     base::message("Converting single-cell counts to CPM and ",
333 |               "filtering zero variance genes.")
334 |   }
335 |   sc.eset <- CountsToCPM(sc.eset)
336 |   if (!old.cpm) {
337 |     sc.eset <-
338 |       Biobase::ExpressionSet(assayData=Biobase::exprs(sc.eset)[genes,],
339 |                              phenoData=sc.eset@phenoData)
340 |   }
341 |   sc.eset <- FilterZeroVarianceGenes(sc.eset, verbose)
342 |   if (verbose) {
343 |     base::message("Converting bulk counts to CPM and filtering",
344 |                   " unexpressed genes.")
345 |   }
346 |   bulk.eset <- CountsToCPM(bulk.eset)
347 |   if (!old.cpm) {
348 |     bulk.eset <-
349 |       Biobase::ExpressionSet(assayData=Biobase::exprs(bulk.eset)[genes,],
350 |                              phenoData=bulk.eset@phenoData)
351 |   }
352 |   bulk.eset <- FilterUnexpressedGenes(bulk.eset, verbose)
353 |   genes <- base::intersect(Biobase::featureNames(sc.eset),
354 |                            Biobase::featureNames(bulk.eset))
355 |   if (base::length(genes) == 0) {
356 |     base::stop("Zero genes remaining after filtering and ",
357 |                "intersecting bulk, single-cell, and marker genes.")
358 |   }
359 |   if (verbose) {
360 |     n.cells <- base::ncol(sc.eset)
361 |     base::message("Generating single-cell based reference from ",
362 |                   sprintf("%i cells.\n", n.cells))
363 |   }
364 |   sc.ref <- GenerateSCReference(sc.eset, cell.types)[genes,,drop=F]
365 |   sc.props <- CalculateSCCellProportions(sc.eset, subject.names, cell.types)
366 |   sc.props <- sc.props[base::colnames(sc.ref),,drop=F]
367 |   if (use.overlap) {
368 |     if (verbose) {
369 |       base::message("Learning bulk transformation from overlapping samples.")
370 |     }
371 |     # Y.train is pseudo-bulk expression based on reference profile weighted by
372 |     #   cell type proportions estimated for single-cell samples.
373 |     Y.train <- sc.ref %*% sc.props[,samples$overlapping,drop=F]
374 |     # X.train is the actual bulk for the single-cell samples.
375 |     X.train <- Biobase::exprs(bulk.eset)[genes,samples$overlapping,drop=F]
376 |     # X.pred is the bulk for the remaining samples to be decomposed.
377 |     X.pred <- Biobase::exprs(bulk.eset)[genes,samples$remaining,drop=F]
378 |     template <- base::numeric(base::length(samples$remaining))
379 |     base::names(template) <- samples$remaining
380 |     if (verbose) {
381 |       base::message("Applying transformation to bulk samples and decomposing.")
382 |     }
383 |     # Y.pred is the transformed bulk for samples to be decomposed.
384 |     Y.pred <- base::matrix(base::vapply(X=genes, FUN=SupervisedTransformBulk,
385 |                                         FUN.VALUE=template,
386 |                                         Y.train, X.train, X.pred,
387 |                                         USE.NAMES=TRUE),
388 |                            nrow=base::length(samples$remaining))
389 |     sample.names <- samples$remaining
390 |   }
391 |   else {
392 |     if (verbose) {
393 |       base::message("Inferring bulk transformation from single-cell alone.")
394 |     }
395 |     # Y.train is pseudo-bulk expression based on reference profile weighted by
396 |     #   cell type proportions estimated for single-cell samples.
397 |     Y.train <- sc.ref %*% sc.props
398 |     # X.pred is the bulk for the remaining samples to be decomposed.
399 |     X.pred <- Biobase::exprs(bulk.eset)[genes,,drop=F]
400 |     sample.names <- base::colnames(Biobase::exprs(bulk.eset))
401 |     template <- base::numeric(base::length(sample.names))
402 |     base::names(template) <- sample.names
403 |     if (verbose) {
404 |       base::message("Applying transformation to bulk samples and decomposing.")
405 |     }
406 |     # Y.pred is the transformed bulk for samples to be decomposed.
407 |     Y.pred <- base::matrix(base::vapply(X=genes,
408 |                                         FUN=SemisupervisedTransformBulk,
409 |                                         FUN.VALUE=template,
410 |                                         Y.train, X.pred,
411 |                                         USE.NAMES=TRUE),
412 |                            nrow=base::length(sample.names))
413 |   }
414 |   # Columns in Y.pred with NaN indicate transformation could not be learned
415 |   #   for that gene. 
416 |   indices <- base::apply(Y.pred, MARGIN=2,
417 |                          FUN=function(column) {base::anyNA(column)})
418 |   if (base::any(indices)) {
419 |     if (verbose) {
420 |       n.dropped <- base::sum(indices)
421 |       base::message(base::sprintf("Dropped an additional %i genes", n.dropped),
422 |                     " for which a transformation could not be learned.")
423 |     }
424 |     if (sum(!indices) == 0) {
425 |       base::stop("Zero genes left for decomposition.")
426 |     }
427 |     Y.pred <- Y.pred[,!indices,drop=F]
428 |     sc.ref <- sc.ref[!indices,,drop=F]
429 |   }
430 |   # limsolve nnls matrices and vectors
431 |   E <- base::matrix(1,nrow=n.cell.types, ncol=n.cell.types)
432 |   f <- base::rep(1, n.cell.types)
433 |   G <- base::diag(n.cell.types)
434 |   h <- base::rep(0, n.cell.types)
435 |   results <- base::as.matrix(base::apply(Y.pred, 1,
436 |                                          function(b) {
437 |                                            sol <- limSolve::lsei(sc.ref, b,
438 |                                                                  E, f, G, h)
439 |                                            sol.p <- sol$X
440 |                                            sol.r <- base::sqrt(sol$solutionNorm)
441 |                                            return(base::append(sol.p, sol.r))
442 |                                          }))
443 |   base::rownames(results) <- base::append(base::colnames(sc.ref), "rnorm")
444 |   base::colnames(results) <- sample.names
445 |   rnorm <- results["rnorm",,drop=T]
446 |   base::names(rnorm) <- sample.names
447 |   Y.pred <- base::t(Y.pred) 
448 |   base::rownames(Y.pred) <- base::rownames(sc.ref)
449 |   base::colnames(Y.pred) <- sample.names
450 |   results <- base::list(bulk.props=results[base::colnames(sc.ref),,drop=F],
451 |                         sc.props=sc.props,
452 |                         rnorm=rnorm,
453 |                         genes.used=base::rownames(sc.ref),
454 |                         transformed.bulk=Y.pred)
455 |   return(results)
456 | }
457 | 


--------------------------------------------------------------------------------
/R/simulation.R:
--------------------------------------------------------------------------------
  1 | #' Simulate barcode for decomposition illustration
  2 | #'
  3 | #' Generates a nucleotide barcode similar to those generated by
  4 | #' 10x chromium sequencing platforms for illustration purposes.
  5 | #' Generates barcode and individual ID separated by '-' delimiter.
  6 | #'
  7 | #' @param index Integer. Index of cell ID from 0 to barcode.length to the
  8 | #'   fourth power. Will generate a unique nucleotide barcode for each
  9 | #'   index.
 10 | #' @param individual Character. ID of individual that the cell is from.
 11 | #' @param barcode.length Integer. Length of nucleotide barcode.
 12 | #' @return Simulated barcode for cell from an individual
 13 | SimulateBarcode <- function(index, individual, barcode.length){
 14 |   index <- index - 1
 15 |   barcode <- rep(0, barcode.length)
 16 |   for (i in 1:barcode.length){
 17 |     barcode[i] <- index %% 4
 18 |     index <- index %/% 4
 19 |   }
 20 |   barcode <- plyr::mapvalues(barcode, from=0:3, to=c("A","T","C","G"),
 21 |                              warn_missing = FALSE)
 22 |   barcode <- paste(c(barcode,'-', individual), collapse="")
 23 |   return(barcode)
 24 | }
 25 | 
 26 | #' Simulate data for decomposition illustration
 27 | #'
 28 | #' Simulates bulk and single-cell expression, as well as marker genes and
 29 | #' true proportions that can be used as an example of decomposition
 30 | #'
 31 | #' @param n.ind Integer. Number of individuals to simulate
 32 | #' @param n.genes Integer. Number of genes to simulate
 33 | #' @param n.cells Integer. Number of cells per individual for single-cell data
 34 | #' @param cell.types Character vector. List of cell types to simulate
 35 | #' @param avg.props Numeric vector. List of average proportions for given 
 36 | #'   cell types. Should be same length as cell.types and sum to 1
 37 | #' @return A list with simulated single-cell in slot `sc.eset` and bulk in
 38 | #'   `bulk.eset`, as well as true proportions in `props` and marker genes
 39 | #'   in `markers`.
 40 | #' @examples
 41 | #' library(Biobase)
 42 | #' sim.data <- SimulateData(n.ind=10, n.genes=100, n.cells=100,
 43 | #'                          cell.types=c("Neurons", "Astrocytes", "Microglia"),
 44 | #'                          avg.props=c(.5, .3, .2))
 45 | #'
 46 | #' @export
 47 | SimulateData <- function(n.ind, n.genes, n.cells, cell.types, avg.props){
 48 |   true.props <- stats::rmultinom(n.ind, n.cells, avg.props)/n.cells
 49 |   colnames(true.props) <- as.character(1:n.ind)
 50 |   rownames(true.props) <- cell.types
 51 |   reference <- replicate(length(cell.types), stats::rgeom(n.genes, prob=0.5))
 52 |   colnames(reference) <- cell.types
 53 |   rownames(reference) <- paste("Gene", 1:n.genes)
 54 |   markers <- data.frame(gene=paste("Gene", 1:n.genes),
 55 |                         cluster=sapply(1:n.genes,
 56 |                                        function(x){
 57 |                                          names(which.max(reference[x,]))
 58 |                                        }),
 59 |                         avg_logFC=sapply(1:n.genes,
 60 |                                          function(x){
 61 |                                            cell.type <- names(which.max(reference[x,]))
 62 |                                            max.expr <- reference[x,cell.type]
 63 |                                            avg.expr <- mean(reference[x,colnames(reference) != cell.type])
 64 |                                            return(log(max.expr/avg.expr))
 65 |                                          }))
 66 |   barcode.length <- max(ceiling(log(n.cells, base=4)), 4)
 67 |   sc.data <- NULL
 68 |   bulk.data <- NULL
 69 |   individual.labels <- NULL
 70 |   cell.type.labels <- NULL
 71 |   for (ind in 1:n.ind){
 72 |     counts <- NULL
 73 |     for (cell.type in cell.types){
 74 |       n.cells.type <- round(n.cells*true.props[cell.type,ind])
 75 |       if (n.cells.type > 0) {
 76 |         new.counts <- t(sapply(reference[,cell.type], 
 77 |                                function(l){
 78 |                                  stats::rpois(n.cells.type,l)
 79 |                                }))
 80 |         if (n.cells.type == 1){
 81 |           new.counts <- t(new.counts)
 82 |         }
 83 |         counts <- cbind(counts, new.counts)
 84 |         cell.type.labels <- c(cell.type.labels, rep(cell.type, n.cells.type))
 85 |       }
 86 |     }
 87 |     colnames(counts) <- sapply(1:n.cells, function(x) SimulateBarcode(x, ind, barcode.length))
 88 |     bulk.data <- cbind(bulk.data, rowSums(counts))
 89 |     sc.data <- cbind(sc.data, counts)
 90 |     individual.labels <- c(individual.labels, rep(as.character(ind), n.cells))
 91 |   }
 92 |   rownames(bulk.data) <- paste("Gene", 1:n.genes)
 93 |   rownames(sc.data) <- paste("Gene", 1:n.genes)
 94 |   sc.pheno <- data.frame(check.names=F, check.rows=F,
 95 |                          stringsAsFactors=F,
 96 |                          row.names=colnames(sc.data),
 97 |                          SubjectName=individual.labels,
 98 |                          cellType=cell.type.labels)
 99 |   sc.meta <- data.frame(labelDescription=c("SubjectName",
100 |                                            "cellType"),
101 |                         row.names=c("SubjectName",
102 |                                     "cellType"))
103 |   sc.pdata <- methods::new("AnnotatedDataFrame",
104 |                            data=sc.pheno,
105 |                            varMetadata=sc.meta)
106 |   sc.eset <- Biobase::ExpressionSet(assayData=sc.data,
107 |                                     phenoData=sc.pdata)
108 |   bulk.eset <- Biobase::ExpressionSet(assayData=bulk.data)
109 |   return(list(sc.eset=sc.eset, bulk.eset=bulk.eset, props=true.props, markers=markers))
110 | }
111 | 


--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
  1 | #' Converts Seurat object to Expression Set
  2 | #'
  3 | #' `SeuratToExpressionSet()` returns an Expression Set with phenotype data
  4 | #' indicating cell type (cellType) and individual (SubjectName) for each cell
  5 | #' in a Seurat object. Raw counts data is used for assay data.
  6 | #'
  7 | #' Note that the \emph{Seurat} and \emph{Biobase} libraries should be attached
  8 | #' before running this function. The \emph{delimiter} and \emph{position} 
  9 | #' arguments are used to infer the individual ID from the cell ID. For example,
 10 | #' a delimiter of "-" and position of "2" indicates that the individual ID for
 11 | #' the cell ID \strong{ACTG-2} would be \strong{2}. 
 12 | #'
 13 | #' @param seurat.object Seurat object with attributes \emph{raw.data},
 14 | #'   \emph{ident}, and \emph{cell.names}
 15 | #' @param delimiter Character to split cell names with to find individual ID.
 16 | #' @param position Integer indicating 1-indexed position of individual ID after
 17 | #'   splitting cell name with \emph{delimiter}.
 18 | #' @param version Character string. Either "v2" or "v3. Seurat version used to
 19 | #'   create Seurat object. 
 20 | #' @return sc.eset Expression set containing relevant phenotype and individual
 21 | #'   data, \emph{cellType} and \emph{SubjectName}.
 22 | #' @examples
 23 | #' \donttest{
 24 | #'   library(Seurat)
 25 | #'   library(Biobase)
 26 | #'
 27 | #'   # We make a class to emulate a Seurat v2 object for illustration only
 28 | #'   setClass("testSeuratv2", representation(cell.names = "character",
 29 | #'                                           ident = "character",
 30 | #'                                           raw.data = "matrix"))
 31 | #'   sc.counts <- matrix(0,nrow=3,ncol=3)
 32 | #'   # These barcodes correspond to a delimiter of "-" and position 2 for individual id.
 33 | #'   test.cell.names <- c("ATCG-1", "TAGC-2", "GTCA-3")
 34 | #'   test.ident <- c("cell type a", "cell type b", "cell type c")
 35 | #'   names(test.ident) <- test.cell.names
 36 | #'   colnames(sc.counts) <- test.cell.names
 37 | #'   test.seurat.obj <- new("testSeuratv2",
 38 | #'                          cell.names=test.cell.names,
 39 | #'                          ident=test.ident,
 40 | #'                          raw.data=sc.counts)
 41 | #'
 42 | #'   single.cell.expression.set <- SeuratToExpressionSet(test.seurat.obj, delimiter='-',
 43 | #'                                                       position=2, version="v2")
 44 | #' }
 45 | #'
 46 | #' @export
 47 | SeuratToExpressionSet <- function(seurat.object, delimiter, position,
 48 |                                   version = c("v2", "v3")) {
 49 |   if (! "Seurat" %in% base::.packages()) {
 50 |     base::warning("Seurat package is not attached. ",
 51 |                   "ExpressionSet may be malformed. ",
 52 |                   "Please load Seurat library.")
 53 |   }
 54 |   version <- base::match.arg(version)
 55 |   if (version == "v2") {
 56 |     get.cell.names <- function(obj) obj@cell.names
 57 |     get.ident <- function(obj) obj@ident
 58 |     get.raw.data <- function(obj) obj@raw.data
 59 |   }
 60 |   else if (version == "v3") {
 61 |     get.cell.names <- function(obj) base::colnames(obj)
 62 |     get.ident <- function(obj) Seurat::Idents(object=obj)
 63 |     get.raw.data <- function(obj) Seurat::GetAssayData(object = obj,
 64 |                                                        slot = "counts")
 65 |   }
 66 |   individual.ids <- base::sapply(base::strsplit(get.cell.names(seurat.object),
 67 |                                                 delimiter),
 68 |                                  `[[`, position)
 69 |   base::names(individual.ids) <- get.cell.names(seurat.object)
 70 |   individual.ids <- base::factor(individual.ids)
 71 |   n.individuals <- base::length(base::levels(individual.ids))
 72 |   base::message(base::sprintf("Split sample names by \"%s\"", delimiter),
 73 |                 base::sprintf(" and checked position %i.", position),
 74 |                 base::sprintf(" Found %i individuals.", n.individuals))
 75 |   base::message(base::sprintf("Example: \"%s\" corresponds to individual \"%s\".",
 76 |                               get.cell.names(seurat.object)[1], individual.ids[1]))
 77 |   sample.ids <- base::names(get.ident(seurat.object))
 78 |   sc.pheno <- base::data.frame(check.names=F, check.rows=F,
 79 |                                stringsAsFactors=F,
 80 |                                row.names=sample.ids,
 81 |                                SubjectName=individual.ids,
 82 |                                cellType=get.ident(seurat.object))
 83 |   sc.meta <- base::data.frame(labelDescription=base::c("SubjectName",
 84 |                                                        "cellType"),
 85 |                               row.names=base::c("SubjectName",
 86 |                                                 "cellType"))
 87 |   sc.pdata <- methods::new("AnnotatedDataFrame",
 88 |                            data=sc.pheno,
 89 |                            varMetadata=sc.meta)
 90 |   sc.data <- base::as.matrix(get.raw.data(seurat.object)[,sample.ids,drop=F])
 91 |   sc.eset <- Biobase::ExpressionSet(assayData=sc.data,
 92 |                                     phenoData=sc.pdata)
 93 |   return(sc.eset)
 94 | }
 95 | 
 96 | #' Convert counts data in Expression Set to counts per million (CPM)
 97 | #'
 98 | #' @param eset Expression Set containing counts assay data.
 99 | #' @return eset Expression Set containing CPM assay data
100 | CountsToCPM <- function(eset) {
101 |   Biobase::exprs(eset) <- base::sweep(Biobase::exprs(eset),
102 |                                       2, base::colSums(Biobase::exprs(eset)),
103 |                                       `/`) * 1000000
104 |   indices <- base::apply(Biobase::exprs(eset), MARGIN=2,
105 |                          FUN=function(column) {base::anyNA(column)})
106 |   if (base::any(indices)) {
107 |     n.cells <- base::sum(indices)
108 |     base::stop(base::sprintf("Zero expression in selected genes for %i cells",
109 |                              n.cells))
110 |   }
111 |   return(eset)
112 | }
113 | 
114 | #' Remove genes in Expression Set with zero variance across samples
115 | #'
116 | #' @param eset Expression Set 
117 | #' @param verbose Boolean. Print logging info
118 | #' @return eset Expression Set with zero variance genes removed
119 | FilterZeroVarianceGenes <- function(eset, verbose=TRUE) {
120 |   indices <- (base::apply(Biobase::exprs(eset), 1, stats::var) != 0)
121 |   indices <- indices & (! base::is.na(indices))
122 |   if (base::sum(indices) < base::length(indices)) {
123 |     eset <-
124 |       Biobase::ExpressionSet(assayData=Biobase::exprs(eset)[indices,,drop=F],
125 |                              phenoData=eset@phenoData)
126 |   }
127 |   if (verbose) {
128 |     genes.filtered <- base::length(indices) - base::nrow(eset)
129 |     base::message(base::sprintf("Filtered %i zero variance genes.",
130 |                                 genes.filtered))
131 |   }
132 |   return(eset)
133 | }
134 | 
135 | #' Remove genes in Expression Set with zero expression in all samples
136 | #'
137 | #' @param eset Expression Set
138 | #' @param verbose Boolean. Print logging info
139 | #' @return eset Expression Set with zero expression genes removed
140 | FilterUnexpressedGenes <- function(eset, verbose=TRUE) {
141 |   indices <- (base::apply(Biobase::exprs(eset), 1, base::sum) != 0)
142 |   indices <- indices & (! base::is.na(indices))
143 |   if (base::sum(indices) < base::length(indices)) {
144 |     eset <-
145 |       Biobase::ExpressionSet(assayData=Biobase::exprs(eset)[indices,,drop=F],
146 |                              phenoData=eset@phenoData)
147 |   }
148 |   if (verbose) {
149 |     genes.filtered <- base::length(indices) - base::nrow(eset)
150 |     base::message(base::sprintf("Filtered %i unexpressed genes.",
151 |                                 genes.filtered))
152 |   }
153 |   return(eset)
154 | }
155 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Bisque
 2 | 
 3 | [![Build Status](https://travis-ci.org/cozygene/bisque.svg?branch=master)](https://travis-ci.org/cozygene/bisque)
 4 | [![codecov](https://codecov.io/gh/cozygene/bisque/branch/master/graph/badge.svg)](https://codecov.io/gh/cozygene/bisque)
 5 | [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/r-bisquerna/README.html)
 6 | [![CRAN Version](https://www.r-pkg.org/badges/version/BisqueRNA)](https://CRAN.R-project.org/package=BisqueRNA)
 7 | 
 8 | 
 9 | An R toolkit for accurate and efficient estimation of cell composition ('decomposition') from bulk expression data with single-cell information.
10 | 
11 | Bisque provides two modes of operation:
12 | 
13 | ### Reference-based decomposition
14 | This method utilizes single-cell data to decompose bulk expression.
15 | We assume that both single-cell and bulk counts are measured from the same tissue.
16 | Specifically, the cell composition of the labeled single-cell data should match the expected physiological composition.
17 | While we don't explicitly require matched samples, we expect having samples with both single-cell and bulk expression measured will provide more accurate results.
18 | 
19 | ### Marker-based decomposition
20 | This method utilizes marker genes alone to decompose bulk expression when a reference profile is not available.
21 | Single-cell data is not explicitly required but can be used to identify these marker genes.
22 | This method captures relative abundances of a cell type across individuals. Note that these abundances are not proportions, so they cannot be compared between different cell types. 
23 | 
24 | ## Installation
25 | 
26 | The Bisque R package is available on CRAN
27 | ```r
28 | install.packages("BisqueRNA")
29 | ```
30 | 
31 | as well as Bioconda
32 | ```
33 | conda install r-bisquerna
34 | ```
35 | 
36 | The package can also be installed from the GitHub repository
37 | ```r
38 | devtools::install_github("cozygene/bisque")
39 | ```
40 | 
41 | ## Getting Started
42 | You can load Bisque as follows:
43 | 
44 | ```r
45 | library(BisqueRNA)
46 | ```
47 | 
48 | The two modes of operation described above are called as follows:
49 | 
50 | ```r
51 | res <- BisqueRNA::ReferenceBasedDecomposition(bulk.eset, sc.eset, markers)
52 | ```
53 | 
54 | ```r
55 | res <- BisqueRNA::MarkerBasedDecomposition(bulk.eset, markers)
56 | ```
57 | 
58 | Each method returns a list of results with estimated cell proportions/abundances stored in `res$bulk.props`.
59 | 
60 | To see examples of these methods on simulated data, check out the vignette:
61 | 
62 | ```r
63 | browseVignettes("BisqueRNA")
64 | ```
65 | 


--------------------------------------------------------------------------------
/man/CalculateSCCellProportions.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/reference_based.R
 3 | \name{CalculateSCCellProportions}
 4 | \alias{CalculateSCCellProportions}
 5 | \title{Calculate cell proportions based on single-cell data}
 6 | \usage{
 7 | CalculateSCCellProportions(sc.eset, subject.names, cell.types)
 8 | }
 9 | \arguments{
10 | \item{sc.eset}{Expression Set with single-cell data}
11 | 
12 | \item{subject.names}{A character string. Name of phenoData attribute in 
13 | sc.eset that indicates individual ID.}
14 | 
15 | \item{cell.types}{A character string. Name of phenoData attribute in sc.eset
16 | that indicates cell type}
17 | }
18 | \value{
19 | sc.props Matrix. Cell proportions with number of cell types rows 
20 |   by number of individuals columns
21 | }
22 | \description{
23 | Returns proportion of each cell type out of total cells for each individual
24 | in the single-cell Expression Set
25 | }
26 | 


--------------------------------------------------------------------------------
/man/CorTri.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/marker_based.R
 3 | \name{CorTri}
 4 | \alias{CorTri}
 5 | \title{Correlate columns of data frame}
 6 | \usage{
 7 | CorTri(x, method = "pearson")
 8 | }
 9 | \arguments{
10 | \item{x}{Data frame or matrix. Column vectors are correlated}
11 | 
12 | \item{method}{Character string. Name of method passed to cor. 
13 | Pearson by default.}
14 | }
15 | \value{
16 | cors Numeric vector. Correlation coefficients of pairs
17 | }
18 | \description{
19 | This function runs correlation between markers of a data frame or matrix, 
20 | returning the values of the lower/upper triangular of the correlation matrix
21 | in a vector.
22 | }
23 | 


--------------------------------------------------------------------------------
/man/CountsToCPM.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{CountsToCPM}
 4 | \alias{CountsToCPM}
 5 | \title{Convert counts data in Expression Set to counts per million (CPM)}
 6 | \usage{
 7 | CountsToCPM(eset)
 8 | }
 9 | \arguments{
10 | \item{eset}{Expression Set containing counts assay data.}
11 | }
12 | \value{
13 | eset Expression Set containing CPM assay data
14 | }
15 | \description{
16 | Convert counts data in Expression Set to counts per million (CPM)
17 | }
18 | 


--------------------------------------------------------------------------------
/man/EstimatePCACellTypeProportions.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/marker_based.R
 3 | \name{EstimatePCACellTypeProportions}
 4 | \alias{EstimatePCACellTypeProportions}
 5 | \title{Estimate cell type proportions using first PC of expression matrix}
 6 | \usage{
 7 | EstimatePCACellTypeProportions(x, weighted = FALSE, w = NULL)
 8 | }
 9 | \arguments{
10 | \item{x}{A sample by gene bulk expression matrix. Genes should be marker
11 | genes}
12 | 
13 | \item{weighted}{Boolean. If weighted=TRUE, multiply scaled gene expression by
14 | gene weights}
15 | 
16 | \item{w}{Numeric vector. Weights of genes}
17 | }
18 | \value{
19 | ret List. Attribute \strong{pcs} contains matrix of PCs, where PC1
20 |   should be used as estimates for cell type abundances
21 |   Attribute \strong{sdev} contains eigenvalues of eigendecomposition of
22 |   var-covar matrix. The 1st eigenvalue should explain most of the variance.
23 |   Attribute \strong{genes} contains names of genes.
24 | }
25 | \description{
26 | Estimate cell type proportions using first PC of expression matrix
27 | }
28 | 


--------------------------------------------------------------------------------
/man/FilterUnexpressedGenes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{FilterUnexpressedGenes}
 4 | \alias{FilterUnexpressedGenes}
 5 | \title{Remove genes in Expression Set with zero expression in all samples}
 6 | \usage{
 7 | FilterUnexpressedGenes(eset, verbose = TRUE)
 8 | }
 9 | \arguments{
10 | \item{eset}{Expression Set}
11 | 
12 | \item{verbose}{Boolean. Print logging info}
13 | }
14 | \value{
15 | eset Expression Set with zero expression genes removed
16 | }
17 | \description{
18 | Remove genes in Expression Set with zero expression in all samples
19 | }
20 | 


--------------------------------------------------------------------------------
/man/FilterZeroVarianceGenes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{FilterZeroVarianceGenes}
 4 | \alias{FilterZeroVarianceGenes}
 5 | \title{Remove genes in Expression Set with zero variance across samples}
 6 | \usage{
 7 | FilterZeroVarianceGenes(eset, verbose = TRUE)
 8 | }
 9 | \arguments{
10 | \item{eset}{Expression Set}
11 | 
12 | \item{verbose}{Boolean. Print logging info}
13 | }
14 | \value{
15 | eset Expression Set with zero variance genes removed
16 | }
17 | \description{
18 | Remove genes in Expression Set with zero variance across samples
19 | }
20 | 


--------------------------------------------------------------------------------
/man/GenerateSCReference.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/reference_based.R
 3 | \name{GenerateSCReference}
 4 | \alias{GenerateSCReference}
 5 | \title{Generate reference profile for cell types identified in single-cell data}
 6 | \usage{
 7 | GenerateSCReference(sc.eset, cell.types)
 8 | }
 9 | \arguments{
10 | \item{sc.eset}{Expression Set with single-cell data}
11 | 
12 | \item{cell.types}{A character string. Name of phenoData attribute in sc.eset
13 | that indicates cell type}
14 | }
15 | \value{
16 | sc.ref Matrix. Reference profile with number of gene rows by number
17 |   of cell types columns.
18 | }
19 | \description{
20 | Averages expression within each cell type across all samples to use as 
21 | reference profile.
22 | }
23 | 


--------------------------------------------------------------------------------
/man/GetCTP.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/marker_based.R
 3 | \name{GetCTP}
 4 | \alias{GetCTP}
 5 | \title{Return cell type proportions from bulk}
 6 | \usage{
 7 | GetCTP(
 8 |   bulk,
 9 |   cell_types,
10 |   markers,
11 |   ct_col,
12 |   gene_col,
13 |   min_gene,
14 |   max_gene,
15 |   weighted,
16 |   w_col,
17 |   verbose
18 | )
19 | }
20 | \arguments{
21 | \item{bulk}{Expression Set containing bulk data}
22 | 
23 | \item{cell_types}{Character vector. Names of cell types.}
24 | 
25 | \item{markers}{Data frame with columns specifying cluster and gene,
26 | and optionally a column for weights, typically the fold-change of the gene.
27 | Important that the genes for each cell type are row-sorted by signficance.}
28 | 
29 | \item{ct_col}{Character string. Column name specifying cluster/cell type
30 | corresponding to each marker gene in \strong{markers}.}
31 | 
32 | \item{gene_col}{Character string. Column name specifying gene names in
33 | \strong{markers}.}
34 | 
35 | \item{min_gene}{Numeric. Min number of genes to use for each cell type.}
36 | 
37 | \item{max_gene}{Numeric. Max number of genes to use for each cell type.}
38 | 
39 | \item{weighted}{Boolean. Whether to use weights for gene prioritization}
40 | 
41 | \item{w_col}{Character string. Column name for weights, such as "avg_logFC",
42 | in \strong{markers}}
43 | 
44 | \item{verbose}{Boolean. Whether to print log info during decomposition.
45 | Errors will be printed regardless.}
46 | }
47 | \value{
48 | A List. Slot \strong{cors} contains list of vectors with correlation
49 |   coefficients. Slot \strong{ctps} contains list of CTP objects returned by
50 |   GetCTP
51 | }
52 | \description{
53 | Calculate cell type proportions from a data frame containing bulk expression
54 | values. Uses PCA (weighted or regular) to estimate relative proportions
55 | within each cell type.
56 | }
57 | 


--------------------------------------------------------------------------------
/man/GetNumGenes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/marker_based.R
 3 | \name{GetNumGenes}
 4 | \alias{GetNumGenes}
 5 | \title{Get number of genes to use with no weighted information}
 6 | \usage{
 7 | GetNumGenes(x, min.gene = 25, max.gene = 200)
 8 | }
 9 | \arguments{
10 | \item{x}{Numeric Matrix. A sample by gene expression matrix containing the
11 | marker genes.}
12 | 
13 | \item{min.gene}{Numeric. Minimum number of genes to consider as markers.}
14 | 
15 | \item{max.gene}{Numeric. Maximum number of genes to consider as markers.}
16 | }
17 | \value{
18 | best.n Numeric. Number of genes to use
19 | }
20 | \description{
21 | Get number of genes to use with no weighted information
22 | }
23 | 


--------------------------------------------------------------------------------
/man/GetNumGenesWeighted.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/marker_based.R
 3 | \name{GetNumGenesWeighted}
 4 | \alias{GetNumGenesWeighted}
 5 | \title{Get number of genes to use with weighted PCA}
 6 | \usage{
 7 | GetNumGenesWeighted(x, w, min.gene = 25, max.gene = 200)
 8 | }
 9 | \arguments{
10 | \item{x}{Numeric Matrix. A sample by gene expression matrix containing the
11 | marker genes.}
12 | 
13 | \item{w}{Numeric Vector. The weights of the genes that correspond to the
14 | columns of x.}
15 | 
16 | \item{min.gene}{Numeric. Minimum number of genes to consider as markers.}
17 | 
18 | \item{max.gene}{Numeric. Maximum number of genes to consider as markers.}
19 | }
20 | \value{
21 | best.n Numeric. Number of genes to use
22 | }
23 | \description{
24 | Get number of genes to use with weighted PCA
25 | }
26 | 


--------------------------------------------------------------------------------
/man/GetOverlappingGenes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/reference_based.R
 3 | \name{GetOverlappingGenes}
 4 | \alias{GetOverlappingGenes}
 5 | \title{Find overlapping genes in single-cell data, bulk data, and marker genes}
 6 | \usage{
 7 | GetOverlappingGenes(sc.eset, bulk.eset, markers, verbose)
 8 | }
 9 | \arguments{
10 | \item{sc.eset}{Expression Set with single-cell data}
11 | 
12 | \item{bulk.eset}{Expression Set with bulk data}
13 | 
14 | \item{markers}{Character vector. List of relevant marker genes}
15 | 
16 | \item{verbose}{Boolean. Print logging info}
17 | }
18 | \value{
19 | overlapping.genes Character vector. List of genes found in markers
20 |   and both datasets.
21 | }
22 | \description{
23 | Find overlapping genes in single-cell data, bulk data, and marker genes
24 | }
25 | 


--------------------------------------------------------------------------------
/man/GetOverlappingSamples.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/reference_based.R
 3 | \name{GetOverlappingSamples}
 4 | \alias{GetOverlappingSamples}
 5 | \title{Find overlapping samples in single-cell and bulk data}
 6 | \usage{
 7 | GetOverlappingSamples(sc.eset, bulk.eset, subject.names, verbose)
 8 | }
 9 | \arguments{
10 | \item{sc.eset}{Expression Set with single-cell data}
11 | 
12 | \item{bulk.eset}{Expression Set with bulk data}
13 | 
14 | \item{subject.names}{A character string. Name of phenoData attribute in
15 | sc.eset that indicates individual ID (that would be found in bulk.eset
16 | if overlapping)}
17 | 
18 | \item{verbose}{Boolean. Print logging info}
19 | }
20 | \value{
21 | samples A list with attributes \emph{overlapping} and
22 |   \emph{remaining}. Each attribute refers to a character vector that lists
23 |   the samples found in both datasets and samples found only in bulk,
24 |   respectively
25 | }
26 | \description{
27 | Find overlapping samples in single-cell and bulk data
28 | }
29 | 


--------------------------------------------------------------------------------
/man/GetUniqueMarkers.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/marker_based.R
 3 | \name{GetUniqueMarkers}
 4 | \alias{GetUniqueMarkers}
 5 | \title{Get unique markers present in only 1 cell type}
 6 | \usage{
 7 | GetUniqueMarkers(x, gene_col = "gene")
 8 | }
 9 | \arguments{
10 | \item{x}{Data frame. Contains column with marker gene names}
11 | 
12 | \item{gene_col}{Character string. Name of the column that contains
13 | the marker genes}
14 | }
15 | \value{
16 | x Data frame. Markers with non-unique markers removed
17 | }
18 | \description{
19 | Given a data frame of marker genes for cell types, 
20 | returns a new data frame with non-unique markers removed.
21 | }
22 | 


--------------------------------------------------------------------------------
/man/MarkerBasedDecomposition.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/marker_based.R
 3 | \name{MarkerBasedDecomposition}
 4 | \alias{MarkerBasedDecomposition}
 5 | \title{Performs marker-based decomposition of bulk expression using marker genes}
 6 | \usage{
 7 | MarkerBasedDecomposition(
 8 |   bulk.eset,
 9 |   markers,
10 |   ct_col = "cluster",
11 |   gene_col = "gene",
12 |   min_gene = 5,
13 |   max_gene = 200,
14 |   weighted = FALSE,
15 |   w_col = "avg_logFC",
16 |   unique_markers = TRUE,
17 |   verbose = TRUE
18 | )
19 | }
20 | \arguments{
21 | \item{bulk.eset}{Expression Set. Normalized bulk expression data.}
22 | 
23 | \item{markers}{Data frame with columns specifying cluster and gene,
24 | and optionally a column for weights, typically the fold-change of the gene.
25 | Important that the genes for each cell type are row-sorted by signficance.}
26 | 
27 | \item{ct_col}{Character string. Column name specifying cluster/cell type
28 | corresponding to each marker gene in \strong{markers}.}
29 | 
30 | \item{gene_col}{Character string. Column name specifying gene names in
31 | \strong{markers}.}
32 | 
33 | \item{min_gene}{Numeric. Min number of genes to use for each cell type.}
34 | 
35 | \item{max_gene}{Numeric. Max number of genes to use for each cell type.}
36 | 
37 | \item{weighted}{Boolean. Whether to use weights for gene prioritization}
38 | 
39 | \item{w_col}{Character string. Column name for weights, such as "avg_logFC",
40 | in \strong{markers}}
41 | 
42 | \item{unique_markers}{Boolean. If TRUE, subset markers to include only genes 
43 | that are markers for only one cell type}
44 | 
45 | \item{verbose}{Boolean. Whether to print log info during decomposition.
46 | Errors will be printed regardless.}
47 | }
48 | \value{
49 | A List. Slot \strong{bulk.props} contains estimated relative cell
50 |   type abundances. Slot \strong{var.explained} contains variance explained by
51 |   first 20 PCs for cell type marker genes. Slot \strong{genes.used} contains
52 |   vector of genes used for decomposition.
53 | }
54 | \description{
55 | Estimates relative abundances of cell types from PCA-based decomposition.
56 | Uses a list of marker genes to subset the expression data, and returns the 
57 | first PC of each sub-matrix as the cell type fraction estimates.
58 | Optionally, weights for each marker gene can be used to prioritize genes
59 | that are highly expressed in the given cell type.
60 | }
61 | \details{
62 | Note that this method expects the input bulk data to be normalized, unlike
63 | the reference-based method.
64 | }
65 | \examples{
66 | library(Biobase)
67 | sim.data <- SimulateData(n.ind=10, n.genes=100, n.cells=100,
68 |                          cell.types=c("Neurons", "Astrocytes", "Microglia"),
69 |                          avg.props=c(.5, .3, .2))
70 | res <- MarkerBasedDecomposition(sim.data$bulk.eset, sim.data$markers, weighted=FALSE)
71 | estimated.cell.proportions <- res$bulk.props
72 | 
73 | }
74 | 


--------------------------------------------------------------------------------
/man/ReferenceBasedDecomposition.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/reference_based.R
 3 | \name{ReferenceBasedDecomposition}
 4 | \alias{ReferenceBasedDecomposition}
 5 | \title{Performs reference-based decomposition of bulk expression using single-cell
 6 | data}
 7 | \usage{
 8 | ReferenceBasedDecomposition(
 9 |   bulk.eset,
10 |   sc.eset,
11 |   markers = NULL,
12 |   cell.types = "cellType",
13 |   subject.names = "SubjectName",
14 |   use.overlap = TRUE,
15 |   verbose = TRUE,
16 |   old.cpm = TRUE
17 | )
18 | }
19 | \arguments{
20 | \item{bulk.eset}{Expression Set containin bulk data. No PhenoData required
21 | but if overlapping option used, IDs returned by sampleNames(bulk.eset) 
22 | should match those found in sc.eset phenoData individual labels.}
23 | 
24 | \item{sc.eset}{Expression Set containing single-cell data. PhenoData of this
25 | Expression Set should contain cell type and individual labels for each
26 | cell. Names of these fields specified by arguments below.}
27 | 
28 | \item{markers}{Structure, such as character vector, containing marker genes
29 | to be used in decomposition. `base::unique(base::unlist(markers))` should
30 | return a simple vector containing each gene name. If no argument or NULL
31 | provided, the method will use all available genes for decomposition.}
32 | 
33 | \item{cell.types}{Character string. Name of phenoData attribute in sc.eset
34 | indicating cell type label for each cell}
35 | 
36 | \item{subject.names}{Character string. Name of phenoData attribute in sc.eset
37 | indicating individual label for each cell}
38 | 
39 | \item{use.overlap}{Boolean. Whether to use and expect overlapping samples 
40 | in decomposition.}
41 | 
42 | \item{verbose}{Boolean. Whether to print log info during decomposition.
43 | Errors will be printed regardless.}
44 | 
45 | \item{old.cpm}{Prior to version 1.0.4 (updated in July 2020), the package
46 | converted counts to CPM after subsetting the marker genes. Github user
47 | randel pointed out that the order of these operations should be switched.
48 | Thanks randel! This option is provided for replication of older BisqueRNA
49 | but should be enabled, especially for small marker gene sets.
50 | We briefly tested this change on the cortex and adipose datasets.
51 | The original and new order of operations produce estimates that have an
52 | average correlation of 0.87 for the cortex and 0.84 for the adipose within
53 | each cell type.}
54 | }
55 | \value{
56 | A list. Slot \strong{bulk.props} contains a matrix of cell type
57 |   proportion estimates with cell types as rows and individuals as columns.
58 |   Slot \strong{sc.props} contains a matrix of cell type proportions 
59 |   estimated directly from counting single-cell data. 
60 |   Slot \strong{rnorm} contains Euclidean norm of the residuals for each
61 |   individual's proportion estimates. Slot \strong{genes.used} contains
62 |   vector of genes used in decomposition. Slot \strong{transformed.bulk}
63 |   contains the transformed bulk expression used for decomposition. These
64 |   values are generated by applying a linear transformation to the CPM
65 |   expression.
66 | }
67 | \description{
68 | Generates a reference profile based on single-cell data. Learns a
69 | transformation of bulk expression based on observed single-cell proportions
70 | and performs  NNLS regression on these transformed values to estimate cell
71 | proportions.
72 | }
73 | \details{
74 | Expects read counts for both datasets, as they will be converted to 
75 | counts per million (CPM). Two options available: Use overlapping indivudals
76 | found in both single-cell and bulk datasets to learn transformation or 
77 | learn transformation from single-cell alone. The overlapping option is
78 | expected to have better performance.
79 | }
80 | \examples{
81 | library(Biobase)
82 | sim.data <- SimulateData(n.ind=10, n.genes=100, n.cells=100,
83 |                          cell.types=c("Neurons", "Astrocytes", "Microglia"),
84 |                          avg.props=c(.5, .3, .2))
85 | sim.data$sc.eset <- sim.data$sc.eset[,sim.data$sc.eset$SubjectName \%in\% as.character(6:10)]
86 | res <- ReferenceBasedDecomposition(sim.data$bulk.eset, sim.data$sc.eset)
87 | estimated.cell.proportions <- res$bulk.props
88 | 
89 | }
90 | 


--------------------------------------------------------------------------------
/man/SemisupervisedTransformBulk.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/reference_based.R
 3 | \name{SemisupervisedTransformBulk}
 4 | \alias{SemisupervisedTransformBulk}
 5 | \title{Transforms bulk expression of a gene using only single-cell data}
 6 | \usage{
 7 | SemisupervisedTransformBulk(gene, Y.train, X.pred)
 8 | }
 9 | \arguments{
10 | \item{gene}{Character string. Gene name that corresponds to row in Y.train}
11 | 
12 | \item{Y.train}{Numeric Matrix. Number of gene rows by number of overlapping
13 | individuals columns. Contains weighted sum of reference profile by 
14 | single-cell based cell proportion estimates for each individual}
15 | 
16 | \item{X.pred}{Numeric Matrix. Number of gene rows by number of remaining
17 | individuals columns. Contains observed bulk expression for each individual
18 | to be transformed.}
19 | }
20 | \value{
21 | Y.pred Numeric Matrix. One row for given gene by number of remaining
22 |   individuals columns. Contains transformed bulk expression for each
23 |   individual.
24 | }
25 | \description{
26 | For a specific gene, this function learns a transformation of
27 | the bulk expression to match the distribution produced
28 | by the single-cell based reference and observed single-cell based cell
29 | proportions.
30 | }
31 | 


--------------------------------------------------------------------------------
/man/SeuratToExpressionSet.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{SeuratToExpressionSet}
 4 | \alias{SeuratToExpressionSet}
 5 | \title{Converts Seurat object to Expression Set}
 6 | \usage{
 7 | SeuratToExpressionSet(
 8 |   seurat.object,
 9 |   delimiter,
10 |   position,
11 |   version = c("v2", "v3")
12 | )
13 | }
14 | \arguments{
15 | \item{seurat.object}{Seurat object with attributes \emph{raw.data},
16 | \emph{ident}, and \emph{cell.names}}
17 | 
18 | \item{delimiter}{Character to split cell names with to find individual ID.}
19 | 
20 | \item{position}{Integer indicating 1-indexed position of individual ID after
21 | splitting cell name with \emph{delimiter}.}
22 | 
23 | \item{version}{Character string. Either "v2" or "v3. Seurat version used to
24 | create Seurat object.}
25 | }
26 | \value{
27 | sc.eset Expression set containing relevant phenotype and individual
28 |   data, \emph{cellType} and \emph{SubjectName}.
29 | }
30 | \description{
31 | `SeuratToExpressionSet()` returns an Expression Set with phenotype data
32 | indicating cell type (cellType) and individual (SubjectName) for each cell
33 | in a Seurat object. Raw counts data is used for assay data.
34 | }
35 | \details{
36 | Note that the \emph{Seurat} and \emph{Biobase} libraries should be attached
37 | before running this function. The \emph{delimiter} and \emph{position} 
38 | arguments are used to infer the individual ID from the cell ID. For example,
39 | a delimiter of "-" and position of "2" indicates that the individual ID for
40 | the cell ID \strong{ACTG-2} would be \strong{2}.
41 | }
42 | \examples{
43 | \donttest{
44 |   library(Seurat)
45 |   library(Biobase)
46 | 
47 |   # We make a class to emulate a Seurat v2 object for illustration only
48 |   setClass("testSeuratv2", representation(cell.names = "character",
49 |                                           ident = "character",
50 |                                           raw.data = "matrix"))
51 |   sc.counts <- matrix(0,nrow=3,ncol=3)
52 |   # These barcodes correspond to a delimiter of "-" and position 2 for individual id.
53 |   test.cell.names <- c("ATCG-1", "TAGC-2", "GTCA-3")
54 |   test.ident <- c("cell type a", "cell type b", "cell type c")
55 |   names(test.ident) <- test.cell.names
56 |   colnames(sc.counts) <- test.cell.names
57 |   test.seurat.obj <- new("testSeuratv2",
58 |                          cell.names=test.cell.names,
59 |                          ident=test.ident,
60 |                          raw.data=sc.counts)
61 | 
62 |   single.cell.expression.set <- SeuratToExpressionSet(test.seurat.obj, delimiter='-',
63 |                                                       position=2, version="v2")
64 | }
65 | 
66 | }
67 | 


--------------------------------------------------------------------------------
/man/SimulateBarcode.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/simulation.R
 3 | \name{SimulateBarcode}
 4 | \alias{SimulateBarcode}
 5 | \title{Simulate barcode for decomposition illustration}
 6 | \usage{
 7 | SimulateBarcode(index, individual, barcode.length)
 8 | }
 9 | \arguments{
10 | \item{index}{Integer. Index of cell ID from 0 to barcode.length to the
11 | fourth power. Will generate a unique nucleotide barcode for each
12 | index.}
13 | 
14 | \item{individual}{Character. ID of individual that the cell is from.}
15 | 
16 | \item{barcode.length}{Integer. Length of nucleotide barcode.}
17 | }
18 | \value{
19 | Simulated barcode for cell from an individual
20 | }
21 | \description{
22 | Generates a nucleotide barcode similar to those generated by
23 | 10x chromium sequencing platforms for illustration purposes.
24 | Generates barcode and individual ID separated by '-' delimiter.
25 | }
26 | 


--------------------------------------------------------------------------------
/man/SimulateData.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/simulation.R
 3 | \name{SimulateData}
 4 | \alias{SimulateData}
 5 | \title{Simulate data for decomposition illustration}
 6 | \usage{
 7 | SimulateData(n.ind, n.genes, n.cells, cell.types, avg.props)
 8 | }
 9 | \arguments{
10 | \item{n.ind}{Integer. Number of individuals to simulate}
11 | 
12 | \item{n.genes}{Integer. Number of genes to simulate}
13 | 
14 | \item{n.cells}{Integer. Number of cells per individual for single-cell data}
15 | 
16 | \item{cell.types}{Character vector. List of cell types to simulate}
17 | 
18 | \item{avg.props}{Numeric vector. List of average proportions for given 
19 | cell types. Should be same length as cell.types and sum to 1}
20 | }
21 | \value{
22 | A list with simulated single-cell in slot `sc.eset` and bulk in
23 |   `bulk.eset`, as well as true proportions in `props` and marker genes
24 |   in `markers`.
25 | }
26 | \description{
27 | Simulates bulk and single-cell expression, as well as marker genes and
28 | true proportions that can be used as an example of decomposition
29 | }
30 | \examples{
31 | library(Biobase)
32 | sim.data <- SimulateData(n.ind=10, n.genes=100, n.cells=100,
33 |                          cell.types=c("Neurons", "Astrocytes", "Microglia"),
34 |                          avg.props=c(.5, .3, .2))
35 | 
36 | }
37 | 


--------------------------------------------------------------------------------
/man/SupervisedTransformBulk.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/reference_based.R
 3 | \name{SupervisedTransformBulk}
 4 | \alias{SupervisedTransformBulk}
 5 | \title{Transforms bulk expression of a gene given overlapping data}
 6 | \usage{
 7 | SupervisedTransformBulk(gene, Y.train, X.train, X.pred)
 8 | }
 9 | \arguments{
10 | \item{gene}{Character string. Gene name that corresponds to row in Y.train}
11 | 
12 | \item{Y.train}{Numeric Matrix. Number of gene rows by number of overlapping
13 | individuals columns. Contains weighted sum of reference profile by 
14 | single-cell based cell proportion estimates for each individual}
15 | 
16 | \item{X.train}{Numeric Matrix. Number of gene rows by number of overlapping
17 | individuals columns. Contains observed bulk expression for each individual}
18 | 
19 | \item{X.pred}{Numeric Matrix. Number of gene rows by number of remaining
20 | individuals columns. Contains observed bulk expression for each individual
21 | to be transformed.}
22 | }
23 | \value{
24 | Y.pred Numeric Matrix. One row for given gene by number of remaining
25 |   individuals columns. Contains transformed bulk expression for each
26 |   individual.
27 | }
28 | \description{
29 | For a specific gene, this function uses linear regression to learn a
30 | transformation of the bulk expression to match the values produced
31 | by the single-cell based reference and observed single-cell based cell
32 | proportions.
33 | }
34 | \details{
35 | If a linear transformation cannot be learned for a gene (zero variance in
36 | observed bulk or single-cell based weighted sums), a vector of NaNs will 
37 | be returned of the expected length (length of X.pred)
38 | }
39 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(BisqueRNA)
3 | 
4 | test_check("BisqueRNA")
5 | 


--------------------------------------------------------------------------------
/tests/testthat/test_marker_based.R:
--------------------------------------------------------------------------------
 1 | context("Marker-based decomposition")
 2 | library(Biobase)
 3 | 
 4 | test_that("Catches input errors", {
 5 |   # Expression not in eset
 6 |   bulk.counts <- matrix(0,nrow=2,ncol=2)
 7 |   bulk.eset <- Biobase::ExpressionSet(assayData = bulk.counts)
 8 |   markers <- data.frame(gene=c("1","2"), cluster=c("a", "b"))
 9 |   expect_error(BisqueRNA::MarkerBasedDecomposition(bulk.counts, markers))
10 |   # Trying to use min_gene = 0
11 |   expect_error(BisqueRNA::MarkerBasedDecomposition(bulk.eset, markers,
12 |                                                    min_gene=0, max_gene=5))
13 |   # Min gene greater than max gene
14 |   expect_error(BisqueRNA::MarkerBasedDecomposition(bulk.eset, markers,
15 |                                                    min_gene=6, max_gene=5))
16 |   # Not enough markers
17 |   expect_error(BisqueRNA::MarkerBasedDecomposition(bulk.eset, markers))
18 |   # No overlapping markers
19 |   markers <- data.frame(gene=c("3","4"), cluster=c("a", "b")) 
20 |   expect_error(BisqueRNA::MarkerBasedDecomposition(bulk.eset, markers, min_gene=1))
21 |   # No marker genes after filtering for zero variance
22 |   markers <- data.frame(gene=c("1","2"), cluster=c("a", "b"))
23 |   expect_error(BisqueRNA::MarkerBasedDecomposition(bulk.eset, markers, min_gene=1))
24 |   # One cell type loses too many markers due to zero variance
25 |   bulk.counts <- matrix(1:36,nrow=6,ncol=6)
26 |   bulk.counts[1,] = rep(0, 6)
27 |   bulk.eset <- Biobase::ExpressionSet(assayData = bulk.counts)
28 |   markers <- data.frame(gene=as.character(1:6), cluster=c("a", "a", "a",
29 |                                                           "b", "b", "b"))
30 |   expect_error(BisqueRNA::MarkerBasedDecomposition(bulk.eset, markers, min_gene=3))
31 | })
32 | 
33 | test_that("Produces output for simulated data", {
34 |   bulk.eset <- Biobase::ExpressionSet(assayData = matrix(1:16, nrow=4, ncol=4))
35 |   markers <- data.frame(gene=as.character(1:4), cluster=rep('a', 4), avg_logFC=rep(.3, 4))
36 |   # weighted
37 |   expect_warning(res <- BisqueRNA::MarkerBasedDecomposition(bulk.eset, markers, min_gene=4, weighted=T))
38 |   expect_true("bulk.props" %in% unlist(attributes(res)))
39 |   # unweighted
40 |   expect_warning(res <- BisqueRNA::MarkerBasedDecomposition(bulk.eset, markers, min_gene=4, weighted=F))
41 |   expect_true("bulk.props" %in% unlist(attributes(res)))
42 |   # weighted with max.gene = 1
43 |   expect_warning(res <- BisqueRNA::MarkerBasedDecomposition(bulk.eset, markers, min_gene=1, max_gene=1, weighted=T))
44 |   expect_true("bulk.props" %in% unlist(attributes(res)))
45 |   # unweighted with max.gene = 1
46 |   expect_warning(res <- BisqueRNA::MarkerBasedDecomposition(bulk.eset, markers, min_gene=1, max_gene=1, weighted=F))
47 |   expect_true("bulk.props" %in% unlist(attributes(res)))
48 | })
49 | 


--------------------------------------------------------------------------------
/tests/testthat/test_ref_based.R:
--------------------------------------------------------------------------------
  1 | context("Reference-based decomposition")
  2 | library(Biobase)
  3 | 
  4 | test_that("Catches expression input not in expressionset", {
  5 |   bulk.counts <- matrix(0,nrow=2,ncol=2)
  6 |   sc.counts <- matrix(0,nrow=2,ncol=2)
  7 |   sc.eset <- Biobase::ExpressionSet(assayData = sc.counts)
  8 |   expect_error(BisqueRNA::ReferenceBasedDecomposition(bulk.counts, sc.eset))
  9 | })
 10 | 
 11 | test_that("Catches expressionset missing given labels", {
 12 |   bulk.counts <- matrix(0,nrow=2,ncol=2)
 13 |   sc.counts <- matrix(0,nrow=2,ncol=2)
 14 |   bulk.eset <- Biobase::ExpressionSet(assayData = bulk.counts)
 15 |   sc.pheno <- data.frame(SubjectName=c('a', 'b'), CellType=c('a', 'b'))
 16 |   sc.meta <- data.frame(labelDescription=c("SubjectName", "CellType"),
 17 |                         row.names=c("SubjectName", "CellType"))
 18 |   sc.pdata <- new("AnnotatedDataFrame", data=sc.pheno, varMetadata=sc.meta)
 19 |   sc.eset <- Biobase::ExpressionSet(assayData = sc.counts, phenoData = sc.pdata)
 20 |   expect_error(BisqueRNA::ReferenceBasedDecomposition(bulk.eset, sc.eset))
 21 |   sc.pheno <- data.frame(subjectName=c('a', 'b'), cellType=c('a', 'b'))
 22 |   sc.meta <- data.frame(labelDescription=c("subjectName", "cellType"),
 23 |                         row.names=c("subjectName", "cellType"))
 24 |   sc.pdata <- new("AnnotatedDataFrame", data=sc.pheno, varMetadata=sc.meta)
 25 |   sc.eset <- Biobase::ExpressionSet(assayData = sc.counts, phenoData = sc.pdata)
 26 |   expect_error(BisqueRNA::ReferenceBasedDecomposition(bulk.eset, sc.eset))
 27 | })
 28 | 
 29 | test_that("Catches single-cell data with only one or two subjects", {
 30 |   bulk.counts <- matrix(1:4,nrow=2,ncol=2)
 31 |   sc.counts <- matrix(0,nrow=2,ncol=2)
 32 |   bulk.eset <- Biobase::ExpressionSet(assayData = bulk.counts)
 33 |   # One Subject
 34 |   sc.pheno <- data.frame(SubjectName=c('a', 'a'), cellType=c('a', 'b'))
 35 |   sc.meta <- data.frame(labelDescription=c("SubjectName", "cellType"),
 36 |                         row.names=c("SubjectName", "cellType"))
 37 |   sc.pdata <- new("AnnotatedDataFrame", data=sc.pheno, varMetadata=sc.meta)
 38 |   sc.eset <- Biobase::ExpressionSet(assayData = sc.counts, phenoData = sc.pdata)
 39 |   expect_error(BisqueRNA::ReferenceBasedDecomposition(bulk.eset, sc.eset,
 40 |                                                       use.overlap=FALSE))
 41 |   # Two subjects
 42 |   sc.counts <- matrix(1:8,nrow=2,ncol=4)
 43 |   sc.pheno <- data.frame(SubjectName=c('a', 'a', 'b', 'b'),
 44 |                          cellType=c('a', 'b', 'a', 'b'))
 45 |   sc.meta <- data.frame(labelDescription=c("SubjectName", "cellType"),
 46 |                         row.names=c("SubjectName", "cellType"))
 47 |   sc.pdata <- new("AnnotatedDataFrame", data=sc.pheno, varMetadata=sc.meta)
 48 |   sc.eset <- Biobase::ExpressionSet(assayData = sc.counts, phenoData = sc.pdata)
 49 |   expect_warning(BisqueRNA::ReferenceBasedDecomposition(bulk.eset, sc.eset,
 50 |                                                         use.overlap=FALSE,
 51 |                                                         old.cpm=FALSE),
 52 |                  regexp="Only two individuals detected in single-cell data")
 53 | })
 54 | 
 55 | test_that("Catches single-cell data with only one cell type label", {
 56 |   bulk.counts <- matrix(0,nrow=2,ncol=2)
 57 |   sc.counts <- matrix(0,nrow=2,ncol=2)
 58 |   bulk.eset <- Biobase::ExpressionSet(assayData = bulk.counts)
 59 |   sc.pheno <- data.frame(SubjectName=c('a', 'b'), cellType=c('a', 'a'))
 60 |   sc.meta <- data.frame(labelDescription=c("SubjectName", "cellType"),
 61 |                         row.names=c("SubjectName", "cellType"))
 62 |   sc.pdata <- new("AnnotatedDataFrame", data=sc.pheno, varMetadata=sc.meta)
 63 |   sc.eset <- Biobase::ExpressionSet(assayData = sc.counts, phenoData = sc.pdata)
 64 |   expect_error(BisqueRNA::ReferenceBasedDecomposition(bulk.eset, sc.eset))
 65 | })
 66 | 
 67 | test_that("Catches no overlapping samples in overlap mode", {
 68 |   bulk.counts <- matrix(0,nrow=2,ncol=2)
 69 |   sc.counts <- matrix(0,nrow=2,ncol=2)
 70 |   bulk.eset <- Biobase::ExpressionSet(assayData = bulk.counts)
 71 |   sc.pheno <- data.frame(SubjectName=c('a', 'b'), cellType=c('a', 'b'))
 72 |   sc.meta <- data.frame(labelDescription=c("SubjectName", "cellType"),
 73 |                         row.names=c("SubjectName", "cellType"))
 74 |   sc.pdata <- new("AnnotatedDataFrame", data=sc.pheno, varMetadata=sc.meta)
 75 |   sc.eset <- Biobase::ExpressionSet(assayData = sc.counts, phenoData = sc.pdata)
 76 |   expect_error(BisqueRNA::ReferenceBasedDecomposition(bulk.eset, sc.eset))
 77 |   colnames(bulk.counts) <- c("a","b")
 78 |   bulk.eset <- Biobase::ExpressionSet(assayData = bulk.counts)
 79 |   expect_error(BisqueRNA::ReferenceBasedDecomposition(bulk.eset, sc.eset))
 80 | })
 81 | 
 82 | test_that("Catches no overlapping genes between bulk and single-cell and markers", {
 83 |   markers <- c("a", "b")
 84 |   bulk.counts <- matrix(0,nrow=2,ncol=2)
 85 |   sc.counts <- matrix(0,nrow=2,ncol=2)
 86 |   colnames(bulk.counts) <- c("a", "c")
 87 |   rownames(bulk.counts) <- c("a", "b")
 88 |   bulk.eset <- Biobase::ExpressionSet(assayData = bulk.counts)
 89 |   sc.pheno <- data.frame(SubjectName=c('a', 'b'), cellType=c('a', 'b'))
 90 |   sc.meta <- data.frame(labelDescription=c("SubjectName", "cellType"),
 91 |                         row.names=c("SubjectName", "cellType"))
 92 |   sc.pdata <- new("AnnotatedDataFrame", data=sc.pheno, varMetadata=sc.meta)
 93 |   sc.eset <- Biobase::ExpressionSet(assayData = sc.counts, phenoData = sc.pdata)
 94 |   expect_error(BisqueRNA::ReferenceBasedDecomposition(bulk.eset, sc.eset))
 95 |   rownames(bulk.counts) <- c("1", "2")
 96 |   bulk.eset <- Biobase::ExpressionSet(assayData = bulk.counts)
 97 |   expect_error(BisqueRNA::ReferenceBasedDecomposition(bulk.eset, sc.eset, markers))
 98 |   
 99 | })
100 | 
101 | test_that("Catches data with no expressed/zero variance genes", {
102 |   bulk.counts <- matrix(1,nrow=2,ncol=2)
103 |   sc.counts <- matrix(1,nrow=2,ncol=2)
104 |   colnames(bulk.counts) <- c("a", "c")
105 |   bulk.eset <- Biobase::ExpressionSet(assayData = bulk.counts)
106 |   sc.pheno <- data.frame(SubjectName=c('a', 'b'), cellType=c('a', 'b'))
107 |   sc.meta <- data.frame(labelDescription=c("SubjectName", "cellType"),
108 |                         row.names=c("SubjectName", "cellType"))
109 |   sc.pdata <- new("AnnotatedDataFrame", data=sc.pheno, varMetadata=sc.meta)
110 |   sc.eset <- Biobase::ExpressionSet(assayData = sc.counts, phenoData = sc.pdata)
111 |   expect_error(BisqueRNA::ReferenceBasedDecomposition(bulk.eset, sc.eset))
112 | })
113 | 
114 | test_that("Catches data where zero variance in training data", {
115 |   # all genes good
116 |   bulk.counts <- matrix(1:9,nrow=3,ncol=3)
117 |   sc.counts <- matrix(1:9,nrow=3,ncol=3)
118 |   colnames(bulk.counts) <- c("a", "b", "c")
119 |   bulk.eset <- Biobase::ExpressionSet(assayData = bulk.counts)
120 |   sc.pheno <- data.frame(SubjectName=c('b', 'c', 'd'), cellType=c('a', 'b', 'c'))
121 |   sc.meta <- data.frame(labelDescription=c("SubjectName", "cellType"),
122 |                         row.names=c("SubjectName", "cellType"))
123 |   sc.pdata <- new("AnnotatedDataFrame", data=sc.pheno, varMetadata=sc.meta)
124 |   sc.eset <- Biobase::ExpressionSet(assayData = sc.counts, phenoData = sc.pdata)
125 |   # all genes good, no overlap
126 |   expect_equal(BisqueRNA::ReferenceBasedDecomposition(bulk.eset, sc.eset, use.overlap=F)$genes.used,
127 |                c("1","3"))
128 |   # Bulk has some zero variance
129 |   bulk.counts[1,c("b","c")] <- c(0,0)
130 |   bulk.eset <- Biobase::ExpressionSet(assayData = bulk.counts)
131 |   expect_equal(BisqueRNA::ReferenceBasedDecomposition(bulk.eset, sc.eset)$genes.used, c("3"))
132 |   # Single cell has some zero variance
133 |   bulk.counts <- matrix(1:9,nrow=3,ncol=3)
134 |   colnames(bulk.counts) <- c("a", "b", "c")
135 |   bulk.eset <- Biobase::ExpressionSet(assayData = bulk.counts)
136 |   sc.counts[1,c(1,2)] <- c(0,0)
137 |   sc.eset <- Biobase::ExpressionSet(assayData = sc.counts, phenoData = sc.pdata)
138 |   expect_equal(BisqueRNA::ReferenceBasedDecomposition(bulk.eset, sc.eset)$genes.used, c("3"))
139 |   # All single cell training data has zero variance
140 |   sc.counts[1,c(1,2)] <- c(1,1)
141 |   sc.counts[2,c(1,2)] <- c(1,1)
142 |   sc.counts[3,c(1,2)] <- c(1,1)
143 |   sc.eset <- Biobase::ExpressionSet(assayData = sc.counts, phenoData = sc.pdata)
144 |   expect_error(BisqueRNA::ReferenceBasedDecomposition(bulk.eset, sc.eset))
145 |   # Both have zero variance and undefined coefficient
146 |   sc.counts <- matrix(1:9,nrow=3,ncol=3)
147 |   sc.counts[1,c(1,2)] <- c(0,0)
148 |   sc.eset <- Biobase::ExpressionSet(assayData = sc.counts, phenoData = sc.pdata)
149 |   bulk.counts[1,c("b","c")] <- c(0,0)
150 |   bulk.eset <- Biobase::ExpressionSet(assayData = bulk.counts)
151 |   expect_equal(BisqueRNA::ReferenceBasedDecomposition(bulk.eset, sc.eset)$genes.used, c("2", "3"))
152 |   # Both have zero variance but defined coefficient
153 |   bulk.counts[1,c("b","c")] <- c(2,2)
154 |   bulk.counts[c(2,3), "c"] <- c(6,5)
155 |   bulk.eset <- Biobase::ExpressionSet(assayData = bulk.counts)
156 |   sc.counts[1,c(1,2)] <- c(1,1)
157 |   sc.counts[c(2,3), 2] <- c(3,2)
158 |   sc.eset <- Biobase::ExpressionSet(assayData = sc.counts, phenoData = sc.pdata)
159 |   expect_equal(BisqueRNA::ReferenceBasedDecomposition(bulk.eset, sc.eset)$genes.used, c("1","2", "3"))
160 |   })
161 | 


--------------------------------------------------------------------------------
/tests/testthat/test_sim.R:
--------------------------------------------------------------------------------
 1 | context("Basic simulation framwork")
 2 | library(Biobase)
 3 | 
 4 | test_that("Simulation provides output", {
 5 |   set.seed(42)
 6 |   sim.data <- BisqueRNA::SimulateData(3,3,2,c('a','b'), c(.5,.5))
 7 |   expect_match(class(sim.data$sc.eset), "ExpressionSet")
 8 |   expect_match(class(sim.data$bulk.eset), "ExpressionSet")
 9 |   expect("matrix" %in% class(sim.data$props),
10 |          failure_message="sim.data$props not of class matrix")
11 |   expect_match(class(sim.data$markers), "data.frame")
12 | })
13 | 


--------------------------------------------------------------------------------
/tests/testthat/test_utils.R:
--------------------------------------------------------------------------------
  1 | context("Expression matrix utlities")
  2 | library(Biobase)
  3 | 
  4 | test_that("CountsToCPM converts properly for valid data", {
  5 |   # 3 by 3
  6 |   example.counts <- matrix(1:9, nrow = 3, ncol = 3)
  7 |   example.eset <- Biobase::ExpressionSet(assayData = example.counts)
  8 |   example.eset <- CountsToCPM(example.eset)
  9 |   expect_equal(unname(colSums(Biobase::exprs(example.eset))), rep(1000000, 3))
 10 |   # 3 by 1
 11 |   example.counts <- matrix(1:3, nrow = 3, ncol = 1)
 12 |   example.eset <- Biobase::ExpressionSet(assayData = example.counts)
 13 |   example.eset <- CountsToCPM(example.eset)
 14 |   expect_equal(unname(colSums(Biobase::exprs(example.eset))), rep(1000000, 1))
 15 |   # 1 by 3
 16 |   example.counts <- matrix(1:3, nrow = 1, ncol = 3)
 17 |   example.eset <- Biobase::ExpressionSet(assayData = example.counts)
 18 |   example.eset <- CountsToCPM(example.eset)
 19 |   expect_equal(unname(colSums(Biobase::exprs(example.eset))), rep(1000000, 3))
 20 |   # 1 by 1
 21 |   example.counts <- matrix(1, nrow = 1, ncol = 1)
 22 |   example.eset <- Biobase::ExpressionSet(assayData = example.counts)
 23 |   example.eset <- CountsToCPM(example.eset)
 24 |   expect_equal(unname(colSums(Biobase::exprs(example.eset))), rep(1000000, 1))
 25 | })
 26 | 
 27 | test_that("CountsToCPM catches invalid samples with no expression", {
 28 |   # 3 by 3
 29 |   example.counts <- matrix(1:9, nrow = 3, ncol = 3)
 30 |   example.counts[,1] <- rep(0,3)
 31 |   example.eset <- Biobase::ExpressionSet(assayData = example.counts)
 32 |   expect_error(CountsToCPM(example.eset))
 33 |   # 3 by 1
 34 |   example.counts <- matrix(rep(0,3), nrow = 3, ncol = 1)
 35 |   example.eset <- Biobase::ExpressionSet(assayData = example.counts)
 36 |   expect_error(CountsToCPM(example.eset))
 37 |   # 1 by 3
 38 |   example.counts <- matrix(1:3, nrow = 1, ncol = 3)
 39 |   example.counts[1,1] <- 0
 40 |   example.eset <- Biobase::ExpressionSet(assayData = example.counts)
 41 |   expect_error(CountsToCPM(example.eset))
 42 |   # 1 by 1
 43 |   example.counts <- matrix(0, nrow = 1, ncol = 1)
 44 |   example.eset <- Biobase::ExpressionSet(assayData = example.counts)
 45 |   expect_error(CountsToCPM(example.eset))
 46 | })
 47 | 
 48 | test_that("FilterZeroVarianceGenes works", {
 49 |   # All have nonzero variance
 50 |   example.counts <- matrix(1:9, nrow = 3, ncol = 3)
 51 |   example.eset <- Biobase::ExpressionSet(assayData = example.counts)
 52 |   expect_equal(dim(example.eset), dim(FilterZeroVarianceGenes(example.eset)))
 53 |   # Should remove 1 row
 54 |   example.counts[1,] <- rep(1,3)
 55 |   example.eset <- Biobase::ExpressionSet(assayData = example.counts)
 56 |   expect_equal(2, unname(nrow(FilterZeroVarianceGenes(example.eset))))
 57 |   # If only one sample, should remove all genes
 58 |   example.counts <- matrix(1:3, nrow = 3, ncol = 1)
 59 |   example.eset <- Biobase::ExpressionSet(assayData = example.counts)
 60 |   expect_equal(0, unname(nrow(FilterZeroVarianceGenes(example.eset))))
 61 |   # If all have 0 variance, get rid of all genes
 62 |   example.counts <- matrix(1, nrow = 3, ncol = 3)
 63 |   example.eset <- Biobase::ExpressionSet(assayData = example.counts)
 64 |   expect_equal(0, unname(nrow(FilterZeroVarianceGenes(example.eset))))
 65 | })
 66 | 
 67 | test_that("FilterUnexpressedGenes works", {
 68 |   # All are expressed
 69 |   example.counts <- matrix(1, nrow = 3, ncol = 3)
 70 |   example.eset <- Biobase::ExpressionSet(assayData = example.counts)
 71 |   expect_equal(dim(example.eset), dim(FilterUnexpressedGenes(example.eset)))
 72 |   # Should remove 1 row
 73 |   example.counts[1,] <- rep(0,3)
 74 |   example.eset <- Biobase::ExpressionSet(assayData = example.counts)
 75 |   expect_equal(2, unname(nrow(FilterUnexpressedGenes(example.eset))))
 76 |   # Only one sample, should still remove first row
 77 |   example.counts <- matrix(c(0,1,2), nrow = 3, ncol = 1)
 78 |   example.eset <- Biobase::ExpressionSet(assayData = example.counts)
 79 |   expect_equal(2, unname(nrow(FilterUnexpressedGenes(example.eset))))
 80 |   # All are unexpressed, should remove all rows
 81 |   example.counts <- matrix(0, nrow = 3, ncol = 3)
 82 |   example.eset <- Biobase::ExpressionSet(assayData = example.counts)
 83 |   expect_equal(0, unname(nrow(FilterUnexpressedGenes(example.eset))))
 84 | })
 85 | 
 86 | test_that("SeuratToExpressionSet works", {
 87 |   # v2 test
 88 |   setClass("testthatSeuratv2", representation(cell.names = "character", 
 89 |                                               ident = "character", 
 90 |                                               raw.data = "matrix"))
 91 |   sc.counts <- matrix(0,nrow=3,ncol=3)
 92 |   test.cell.names <- c("a-1", "b-2", "c-3")
 93 |   test.ident <- c("a", "b", "c")
 94 |   names(test.ident) <- test.cell.names
 95 |   colnames(sc.counts) <- test.cell.names
 96 |   test.seurat.obj <- new("testthatSeuratv2", 
 97 |                          cell.names=test.cell.names,
 98 |                          ident=test.ident,
 99 |                          raw.data=sc.counts
100 |                          )
101 |   expect_warning({test.eset <- SeuratToExpressionSet(test.seurat.obj, delimiter = '-',
102 |                                      position=2, version = "v2")})
103 |   expect_match(class(test.eset), "ExpressionSet")
104 |   # v3 failure test for 100% code coverage
105 |   expect_error({test.eset <- SeuratToExpressionSet(test.seurat.obj, delimiter = '-',
106 |                                                      position=2, version = "v3")})
107 | })
108 | 


--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 | 


--------------------------------------------------------------------------------
/vignettes/bisque.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Bisque Example Usage"
  3 | output: rmarkdown::html_vignette
  4 | vignette: >
  5 |   %\VignetteIndexEntry{Bisque Example Usage}
  6 |   %\VignetteEngine{knitr::rmarkdown}
  7 |   %\VignetteEncoding{UTF-8}
  8 | ---
  9 | 
 10 | ```{r setup, include = FALSE}
 11 | knitr::opts_chunk$set(
 12 |   collapse = FALSE,
 13 |   comment = "##",
 14 |   highlight = TRUE,
 15 |   prompt = FALSE,
 16 |   results = "markup"
 17 | )
 18 | ```
 19 | 
 20 | This vignette provides a basic example of using Bisque to decompose bulk expression. Bisque offers two modes of operation: Reference-based and Marker-based decomposition. We will provide brief examples of both.
 21 | 
 22 | ```{r load, echo=T, results='hide', message=F, warning=F}
 23 | library(Biobase)
 24 | library(BisqueRNA)
 25 | ```
 26 | 
 27 | ## Input Format
 28 | 
 29 | Bisque requires expression data in the ExpressionSet format from the Biobase package. 
 30 | 
 31 | Bulk RNA-seq data can be converted from a matrix (columns are samples, rows are genes) to an ExpressionSet as follows:
 32 | 
 33 | ```{r eval=FALSE}
 34 | bulk.eset <- Biobase::ExpressionSet(assayData = bulk.matrix)
 35 | ```
 36 | 
 37 | Single-cell data requires additional information in the ExpressionSet, specificially cell type labels and individual labels. Individual labels indicate which individual each cell originated from. To add this information, Biobase requires it to be stored in a data frame format. Assuming we have character vectors of cell type labels (```cell.type.labels```) and individual labels (```individual.labels```), we can convert scRNA-seq data (with counts also in matrix format) as follows:
 38 | 
 39 | ```{r eval=FALSE}
 40 | sample.ids <- colnames(sc.counts.matrix)
 41 | # individual.ids and cell.types should be in the same order as in sample.ids
 42 | sc.pheno <- data.frame(check.names=F, check.rows=F,
 43 |                        stringsAsFactors=F,
 44 |                        row.names=sample.ids,
 45 |                        SubjectName=individual.labels,
 46 |                        cellType=cell.type.labels)
 47 | sc.meta <- data.frame(labelDescription=c("SubjectName",
 48 |                                          "cellType"),
 49 |                       row.names=c("SubjectName",
 50 |                                   "cellType"))
 51 | sc.pdata <- new("AnnotatedDataFrame",
 52 |                 data=sc.pheno,
 53 |                 varMetadata=sc.meta)
 54 | sc.eset <- Biobase::ExpressionSet(assayData=sc.counts.matrix,
 55 |                                   phenoData=sc.pdata)
 56 | ```
 57 | 
 58 | If your single-cell data (from 10x platform) is in a Seurat object with cell type assignments, Bisque includes a function that will automatically convert this object to an ExpressionSet:
 59 | 
 60 | ```{r eval=FALSE}
 61 | sc.eset <- BisqueRNA::SeuratToExpressionSet(seurat.obj, delimiter="-", position=2, version="v3")
 62 | ```
 63 | 
 64 | The delimiter and position arguments describe the barcode format of 10x single-cell data. For example, barcodes of "ATCGATCG-1" and "ATGCAAGT-2" have the individual ID in position 2 after splitting by the delimiter '-'. 
 65 | 
 66 | ```{r simulate, echo=FALSE}
 67 | set.seed(42)
 68 | cell.types <- c("Neurons", "Astrocytes", "Oligodendrocytes", "Microglia", "Endothelial Cells")
 69 | avg.props <- c(.5, .2, .2, .07, .03)
 70 | 
 71 | expr.data <- BisqueRNA::SimulateData(n.ind=2, n.genes=10, n.cells=10, cell.types=cell.types, avg.props=avg.props)
 72 | sc.eset <- expr.data$sc.eset
 73 | bulk.eset <- expr.data$bulk.eset
 74 | ```
 75 | Here is an example of input single-cell and bulk data for 2 individuals with 10 cells sequenced each:
 76 | 
 77 | ```{r example_input}
 78 | sampleNames(sc.eset)
 79 | sc.eset$SubjectName
 80 | sc.eset$cellType
 81 | sampleNames(bulk.eset)
 82 | ```
 83 | 
 84 | Note that if you have samples with both single-cell and bulk RNA-seq data, their IDs should be found in both ```sc.eset$SubjectName``` and ```sampleNames(bulk.eset)``` .
 85 | 
 86 | ## Reference-based decomposition
 87 | 
 88 | We will use data simulated under a simple model (code for SimulateData() can be found in R/simulation.R).
 89 | We simulate single-cell and bulk RNA-seq counts for 10 individuals. We remove 5 individuals from the single-cell data. We will estimate the cell composition for these 5 individuals.
 90 | 
 91 | ```{r example_input_2}
 92 | cell.types <- c("Neurons", "Astrocytes", "Oligodendrocytes", "Microglia", "Endothelial Cells")
 93 | avg.props <- c(.5, .2, .2, .07, .03)
 94 | sim.data <- SimulateData(n.ind=10, n.genes=100, n.cells=500, cell.types=cell.types, avg.props=avg.props)
 95 | sc.eset <- sim.data$sc.eset[,sim.data$sc.eset$SubjectName %in% as.character(6:10)]
 96 | bulk.eset <- sim.data$bulk.eset
 97 | true.props <- sim.data$props
 98 | markers <- sim.data$markers
 99 | ```
100 | ```{r cleanup, echo=FALSE}
101 | rm(sim.data)
102 | ```
103 | 
104 | By default, Bisque uses all genes for decomposition. However, you may supply a list of genes (such as marker genes) to be used with the ```markers``` parameter. Also, since we have samples with both bulk and single-cell RNA-seq data, we set the ```use.overlap``` parameter to ```TRUE```. If there are no overlapping samples, you can set this parameter to ```FALSE``` (we expect performance to be better if overlapping samples are available).
105 | 
106 | Here's how to call the reference-based decomposition method:
107 | 
108 | ```{r reference_based}
109 | res <- BisqueRNA::ReferenceBasedDecomposition(bulk.eset, sc.eset, markers=NULL, use.overlap=TRUE)
110 | ```
111 | 
112 | A list is returned with decomposition estimates in slot ```bulk.props```.
113 | 
114 | ```{r ref_results}
115 | ref.based.estimates <- res$bulk.props
116 | knitr::kable(ref.based.estimates, digits=2)
117 | ```
118 | 
119 | Just to make sure this worked, we can correlate all the estimates with the true proportions.
120 | ```{r ref_results_2}
121 | r <- cor(as.vector(ref.based.estimates), 
122 |          as.vector(true.props[row.names(ref.based.estimates),colnames(ref.based.estimates)]))
123 | knitr::knit_print(sprintf("R: %f", r))
124 | ```
125 | 
126 | ## Marker-based decomposition
127 | 
128 | BisqueMarker can provide estimates of relative cell type abundances using only known marker genes when a reference profile is not available. Marker genes are stored in a data frame with columns that specify gene, cluster that the gene is a marker for, and an optional column for weights (typically fold-change). Here's what this data frame might look like:
129 | 
130 | ```{r marker_example, echo=FALSE}
131 | marker.data.frame <- data.frame(gene=paste("Gene", 1:6),
132 |                                 cluster=c("Neurons", "Neurons", "Astrocytes", "Oligodendrocytes", "Microglia", "Endothelial Cells"),
133 |                                 avg_logFC=c(0.82, 0.59, 0.68, 0.66, 0.71, 0.62))
134 | knitr::kable(marker.data.frame)
135 | ```
136 | 
137 | 
138 | Here's how to call the marker-based decomposition method:
139 | ```{r marker_based}
140 | res <- BisqueRNA::MarkerBasedDecomposition(bulk.eset, markers, weighted=F)
141 | ```
142 | A list is returned with decomposition estimates in slot ```bulk.props```.
143 | 
144 | ```{r marker_results}
145 | marker.based.estimates <- res$bulk.props
146 | knitr::kable(marker.based.estimates, digits = 2)
147 | ```
148 | 
149 | Note that these estimates are relative within each cell type, so you cannot immediately compare abundance estimates between cell types.
150 | 
151 | Just to make sure this worked, we can correlate these estimates with the scaled true proportions.
152 | 
153 | ```{r marker_comparison}
154 | scaled.true.props <- t(scale(t(true.props)))[rownames(marker.based.estimates),]
155 | r <- cor(as.vector(marker.based.estimates),
156 |          as.vector(scaled.true.props))
157 | knitr::knit_print(sprintf("R: %f", r))
158 | ```
159 | 


--------------------------------------------------------------------------------