├── .covrignore
├── CellBench.png
├── vignettes
    ├── .gitignore
    ├── Timing.Rmd
    ├── WritingWrappers.Rmd
    ├── DataManipulation.Rmd
    ├── TidyversePatterns.Rmd
    └── Introduction.Rmd
├── tests
    ├── testthat.R
    └── testthat
    │   ├── test-memoise.R
    │   ├── test-data_list.R
    │   ├── test-utils_sampling.R
    │   ├── test-benchmark_tbl_methods.R
    │   ├── test-cellbench_file.R
    │   ├── test-utils_check.R
    │   ├── test-fn_list.R
    │   ├── test-benchmark_timing_tbl_methods.R
    │   ├── test-time_methods.R
    │   ├── test-set_options.R
    │   ├── test-fn_arg_seq.R
    │   ├── test-utils_filter.R
    │   ├── test-utils.R
    │   └── test-apply_methods.R
├── data
    └── sample_sce_data.rda
├── inst
    ├── extdata
    │   ├── 10x_sce_sample.rds
    │   └── celseq_sce_sample.rds
    ├── prebuilt-doc
    │   ├── UsersGuide.pdf
    │   ├── ParameterTesting.pdf
    │   ├── UsersGuide.Rmd
    │   ├── CreatingBenchmarks.Rmd
    │   ├── Examples.Rmd
    │   ├── ImputeBenchmark.Rmd
    │   └── ParameterTesting.Rmd
    └── CITATION
├── .gitignore
├── codecov.yml
├── .Rbuildignore
├── R
    ├── zzz.R
    ├── cellbench_casestudy.R
    ├── is_task_error.R
    ├── alias.R
    ├── cellbench_file.R
    ├── sample_sce_data.R
    ├── package.R
    ├── any_task_errors.R
    ├── fn_list.R
    ├── data_list.R
    ├── print.R
    ├── utils_sampling.R
    ├── split_step.R
    ├── utils_check.R
    ├── benchmark_tbl_methods.R
    ├── memoise.R
    ├── fn_arg_seq.R
    ├── set_options.R
    ├── benchmark_timing_tbl_methods.R
    ├── utils_filter.R
    ├── time_methods.R
    ├── load_data.R
    ├── apply_methods.R
    └── utils.R
├── .travis.yml
├── man
    ├── clear_cellbench_cache.Rd
    ├── clear_cached_datasets.Rd
    ├── print.task_error.Rd
    ├── cellbench_case_study.Rd
    ├── all_unique.Rd
    ├── print.fn_arg_seq.Rd
    ├── fn_list.Rd
    ├── is.task_error.Rd
    ├── filter_zero_genes.Rd
    ├── data_list.Rd
    ├── cellbench_file.Rd
    ├── sample_genes.Rd
    ├── sample_cells.Rd
    ├── mhead.Rd
    ├── keep_high_count_cells.Rd
    ├── keep_high_count_genes.Rd
    ├── arrow_sep.Rd
    ├── keep_high_var_genes.Rd
    ├── any_task_errors.Rd
    ├── set_cellbench_cache_path.Rd
    ├── set_cellbench_bpparam.Rd
    ├── summary.benchmark_tbl.Rd
    ├── load_all_data.Rd
    ├── set_cellbench_threads.Rd
    ├── CellBench-package.Rd
    ├── strip_timing.Rd
    ├── as_pipeline_list.Rd
    ├── check_class.Rd
    ├── sample_sce_data.Rd
    ├── split_step.Rd
    ├── unpack_timing.Rd
    ├── cache_method.Rd
    ├── fn_arg_seq.Rd
    ├── collapse_pipeline.Rd
    ├── time_methods.Rd
    └── apply_methods.Rd
├── DESCRIPTION
├── NAMESPACE
├── NEWS.md
└── README.md


/.covrignore:
--------------------------------------------------------------------------------
1 | R/alias.R
2 | R/load_data.R
3 | R/print.R
4 | R/zzz.R
5 | 


--------------------------------------------------------------------------------
/CellBench.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shians/CellBench/HEAD/CellBench.png


--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 | *.pdf
4 | *.log
5 | CaseStudy.Rmd
6 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(CellBench)
3 | 
4 | test_check("CellBench")
5 | 


--------------------------------------------------------------------------------
/data/sample_sce_data.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shians/CellBench/HEAD/data/sample_sce_data.rda


--------------------------------------------------------------------------------
/inst/extdata/10x_sce_sample.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shians/CellBench/HEAD/inst/extdata/10x_sce_sample.rds


--------------------------------------------------------------------------------
/inst/prebuilt-doc/UsersGuide.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shians/CellBench/HEAD/inst/prebuilt-doc/UsersGuide.pdf


--------------------------------------------------------------------------------
/inst/extdata/celseq_sce_sample.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shians/CellBench/HEAD/inst/extdata/celseq_sce_sample.rds


--------------------------------------------------------------------------------
/inst/prebuilt-doc/ParameterTesting.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shians/CellBench/HEAD/inst/prebuilt-doc/ParameterTesting.pdf


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | Meta
 2 | debug
 3 | CellBench.BiocCheck
 4 | .Rproj.user
 5 | .Rhistory
 6 | .RData
 7 | .Ruserdata
 8 | .CellBenchCache
 9 | .DS_Store
10 | ..Rcheck
11 | *.RData
12 | *.Rproj
13 | 


--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
 1 | comment: false
 2 | 
 3 | coverage:
 4 |   status:
 5 |     project:
 6 |       default:
 7 |         target: auto
 8 |         threshold: 1%
 9 |     patch:
10 |       default:
11 |         target: auto
12 |         threshold: 1%
13 | 


--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^LICENSE\.md$
 2 | ^codecov\.yml$
 3 | ^\.travis\.yml$
 4 | ^Meta$
 5 | ^doc$
 6 | ^debug$
 7 | ^.*\.Rproj$
 8 | ^\.Rproj\.user$
 9 | ^\.CellBench$
10 | ^\.git$
11 | ^\.covrignore$
12 | ^\.DS_Store$
13 | ^CellBench.BiocCheck$
14 | ^CellBench.png$
15 | 


--------------------------------------------------------------------------------
/tests/testthat/test-memoise.R:
--------------------------------------------------------------------------------
 1 | context("Memoise")
 2 | 
 3 | test_that(
 4 |     "Memoised functions product the same result", {
 5 |     cache_path <- file.path(tempdir(), ".CellBenchCache")
 6 |     set_cellbench_cache_path(cache_path)
 7 |     fn <- function(x) { x }
 8 |     mfn <- cache_method(fn)
 9 | 
10 |     expect_identical(mfn(1), fn(1))
11 |     expect_message(clear_cellbench_cache())
12 | })
13 | 


--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
 1 | .onLoad <- function(libname, pkgname) {
 2 |     op <- options()
 3 |     op.CellBench <- list(
 4 |         CellBench.threads = 1,
 5 |         CellBench.bpparam = BiocParallel::SerialParam(stop.on.error = FALSE)
 6 |     )
 7 | 
 8 |     # set any options not already set
 9 |     toset <- !(names(op.CellBench) %in% names(op))
10 |     if (any(toset)) options(op.CellBench[toset])
11 | 
12 |     invisible()
13 | }
14 | 


--------------------------------------------------------------------------------
/tests/testthat/test-data_list.R:
--------------------------------------------------------------------------------
 1 | context("Data list constructor")
 2 | 
 3 | test_that(
 4 |     "Data list constructor works properly", {
 5 |     expect_is(data_list(a = 1, b = 2), "list")
 6 |     expect_is(data_list(a = "a", b = "b"), "list")
 7 |     expect_error(data_list(1, b = 2))
 8 |     expect_error(data_list(a = 1, 2))
 9 |     expect_error(data_list(a = 1, b = "b"))
10 |     expect_error(data_list(1))
11 | })
12 | 


--------------------------------------------------------------------------------
/R/cellbench_casestudy.R:
--------------------------------------------------------------------------------
 1 | #' Open vignetted containing a case study using CellBench
 2 | #'
 3 | #' @return opens a vignette containing a case study
 4 | #'
 5 | #' @examples
 6 | #' \dontrun{
 7 | #' cellbench_case_study()
 8 | #' }
 9 | #'
10 | #' @importFrom utils browseURL
11 | #'
12 | #' @export
13 | cellbench_case_study <- function() {
14 |     browseURL(system.file("case-study", "CellBenchCaseStudy.html", package = "CellBench"))
15 | }
16 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r
 2 | 
 3 | language: R
 4 | r:
 5 |     - bioc-devel
 6 | sudo: false
 7 | cache: packages
 8 | compiler: clang
 9 | 
10 | bioc_packages:
11 |     - BiocFileCache
12 |     - BiocParallel
13 |     - SummarizedExperiment
14 |     - SingleCellExperiment
15 | 
16 | apt_packages:
17 |     - libhdf5-dev
18 | 
19 | after_success:
20 |     - Rscript -e 'covr::codecov()'
21 | 


--------------------------------------------------------------------------------
/man/clear_cellbench_cache.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/memoise.R
 3 | \name{clear_cellbench_cache}
 4 | \alias{clear_cellbench_cache}
 5 | \title{Clear CellBench Cache}
 6 | \usage{
 7 | clear_cellbench_cache()
 8 | }
 9 | \value{
10 | None
11 | }
12 | \description{
13 | Clears the method cache for CellBench
14 | }
15 | \examples{
16 | \dontrun{
17 | clear_cellbench_cache()
18 | }
19 | 
20 | }
21 | 


--------------------------------------------------------------------------------
/inst/prebuilt-doc/UsersGuide.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "CellBench User's Guide"
 3 | author: "Shian Su"
 4 | date: "`r Sys.Date()`"
 5 | output: BiocStyle::pdf_document
 6 | vignette: >
 7 |   %\VignetteIndexEntry{Vignette Title}
 8 |   %\VignetteEngine{knitr::rmarkdown}
 9 |   %\VignetteEncoding{UTF-8}
10 | ---
11 | 
12 | ```{r setup, include=FALSE}
13 | knitr::opts_chunk$set(echo = TRUE)
14 | ```
15 | 
16 | # Introduction
17 | 
18 | # Benchmarking Framework
19 | 
20 | 


--------------------------------------------------------------------------------
/man/clear_cached_datasets.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/load_data.R
 3 | \name{clear_cached_datasets}
 4 | \alias{clear_cached_datasets}
 5 | \title{Clear cached datasets}
 6 | \usage{
 7 | clear_cached_datasets()
 8 | }
 9 | \value{
10 | None
11 | }
12 | \description{
13 | Delete the datasets cached by the load_*_data set of functions
14 | }
15 | \examples{
16 | \dontrun{
17 | clear_cached_datasets()
18 | }
19 | }
20 | 


--------------------------------------------------------------------------------
/man/print.task_error.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/print.R
 3 | \name{print.task_error}
 4 | \alias{print.task_error}
 5 | \title{Print method for task_error object}
 6 | \usage{
 7 | \method{print}{task_error}(x, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{a task_error object}
11 | 
12 | \item{...}{not used}
13 | }
14 | \value{
15 | None
16 | }
17 | \description{
18 | task_error are objects that result from failed methods
19 | }
20 | \keyword{internal}
21 | 


--------------------------------------------------------------------------------
/R/is_task_error.R:
--------------------------------------------------------------------------------
 1 | #' Check for task errors
 2 | #'
 3 | #' This is a helper function for checking the result column of a benchmark_tbl
 4 | #' for task_error objects. This is useful for filtering out rows where the
 5 | #' result is a task error.
 6 | #'
 7 | #' @param x the object to be tested
 8 | #'
 9 | #' @return vector of logicals denoting if elements of the list are task_error objects
10 | #' @export
11 | is.task_error <- function(x) {
12 |     purrr::map_lgl(x, function(xx) is(xx, "task_error"))
13 | }
14 | 


--------------------------------------------------------------------------------
/R/alias.R:
--------------------------------------------------------------------------------
 1 | ## File containing simple function aliases that do not need to be tested.
 2 | 
 3 | # apply function across rows
 4 | row_apply <- purrr::partial(apply, MARGIN = 1)
 5 | 
 6 | # apply function down columns
 7 | col_apply <- purrr::partial(apply, MARGIN = 2)
 8 | 
 9 | # glue collapse with default sep and last
10 | collapse_with_comma <- purrr::partial(
11 |     glue::glue_collapse,
12 |     sep = ", ",
13 |     last = " and "
14 | )
15 | 
16 | is.error <- function(x) {
17 |     is(x, "error")
18 | }
19 | 
20 | 


--------------------------------------------------------------------------------
/tests/testthat/test-utils_sampling.R:
--------------------------------------------------------------------------------
 1 | context("Random sampling")
 2 | 
 3 | test_that(
 4 |     "Correct number of rows sampled", {
 5 |     data("sample_sce_data")
 6 | 
 7 |     x <- sample_cells(sample_sce_data, n = 5)
 8 |     expect_equal(ncol(x), 5)
 9 | 
10 |     x <- sample_cells(sample_sce_data, n = 51)
11 |     expect_equal(ncol(x), 50)
12 | 
13 |     y <- sample_genes(sample_sce_data, n = 5)
14 |     expect_equal(nrow(y), 5)
15 | 
16 |     y <- sample_genes(sample_sce_data, n = 201)
17 |     expect_equal(nrow(y), 200)
18 | })
19 | 


--------------------------------------------------------------------------------
/man/cellbench_case_study.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/cellbench_casestudy.R
 3 | \name{cellbench_case_study}
 4 | \alias{cellbench_case_study}
 5 | \title{Open vignetted containing a case study using CellBench}
 6 | \usage{
 7 | cellbench_case_study()
 8 | }
 9 | \value{
10 | opens a vignette containing a case study
11 | }
12 | \description{
13 | Open vignetted containing a case study using CellBench
14 | }
15 | \examples{
16 | \dontrun{
17 | cellbench_case_study()
18 | }
19 | 
20 | }
21 | 


--------------------------------------------------------------------------------
/man/all_unique.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils_check.R
 3 | \name{all_unique}
 4 | \alias{all_unique}
 5 | \title{Check if all values in a vector are unique}
 6 | \usage{
 7 | all_unique(x)
 8 | }
 9 | \arguments{
10 | \item{x}{the vector to check}
11 | }
12 | \value{
13 | TRUE if all values in the vector are unique
14 | }
15 | \description{
16 | Check if all values in a vector are unique
17 | }
18 | \examples{
19 | all_unique(c(1, 2, 3)) # TRUE
20 | all_unique(c(1, 2, 2)) # FALSE
21 | }
22 | \keyword{internal}
23 | 


--------------------------------------------------------------------------------
/tests/testthat/test-benchmark_tbl_methods.R:
--------------------------------------------------------------------------------
 1 | context("benchmark_tbl methods")
 2 | 
 3 | test_that(
 4 |     "benchmark_tbl summary works properly", {
 5 |     sample_sce_data <- readRDS(cellbench_file("10x_sce_sample.rds"))
 6 |     data_list <- list(
 7 |         data = sample_sce_data
 8 |     )
 9 | 
10 |     method_list <- list(
11 |         sample_cells = purrr::partial(sample_cells, n = 10),
12 |         sample_genes = purrr::partial(sample_genes, n = 10)
13 |     )
14 | 
15 |     res <- apply_methods(data_list, method_list)
16 | 
17 |     expect_output(summary(res))
18 | })
19 | 


--------------------------------------------------------------------------------
/tests/testthat/test-cellbench_file.R:
--------------------------------------------------------------------------------
 1 | context("Helper for CellBench internal files")
 2 | 
 3 | test_that(
 4 |     "cellbench_file works properly", {
 5 |     expect_is(cellbench_file(), "character")
 6 |     expect_gt(length(cellbench_file()), 0)
 7 | 
 8 |     expect_is(cellbench_file("10x_sce_sample.rds"), "character")
 9 |     expect_length(cellbench_file("10x_sce_sample.rds"), 1)
10 | 
11 |     expect_error(
12 |         cellbench_file("10x_sce_samples.rds"),
13 |         "file not found, run cellbench_file() to see available files",
14 |         fixed = TRUE
15 |     )
16 | })
17 | 


--------------------------------------------------------------------------------
/man/print.fn_arg_seq.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/print.R
 3 | \name{print.fn_arg_seq}
 4 | \alias{print.fn_arg_seq}
 5 | \title{Print method for fn_arg_seq output}
 6 | \usage{
 7 | \method{print}{fn_arg_seq}(x, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{fn_arg_seq object}
11 | 
12 | \item{...}{addition arguments for print}
13 | }
14 | \value{
15 | None
16 | }
17 | \description{
18 | Print method for fn_arg_seq output
19 | }
20 | \examples{
21 | fn_seq <- fn_arg_seq(kmeans, centers = 1:3)
22 | fn_seq
23 | }
24 | \keyword{internal}
25 | 


--------------------------------------------------------------------------------
/man/fn_list.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/fn_list.R
 3 | \name{fn_list}
 4 | \alias{fn_list}
 5 | \title{Constructor for a function list}
 6 | \usage{
 7 | fn_list(...)
 8 | }
 9 | \arguments{
10 | \item{...}{objects, must all be named}
11 | }
12 | \value{
13 | a list of named functions
14 | }
15 | \description{
16 | Constructor for a list of functions, a thin wrapper around list() which
17 | checks that all the inputs are functions and have names
18 | }
19 | \examples{
20 | flist <- fn_list(
21 |     mean = mean,
22 |     median = median
23 | )
24 | }
25 | 


--------------------------------------------------------------------------------
/man/is.task_error.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/is_task_error.R
 3 | \name{is.task_error}
 4 | \alias{is.task_error}
 5 | \title{Check for task errors}
 6 | \usage{
 7 | is.task_error(x)
 8 | }
 9 | \arguments{
10 | \item{x}{the object to be tested}
11 | }
12 | \value{
13 | vector of logicals denoting if elements of the list are task_error objects
14 | }
15 | \description{
16 | This is a helper function for checking the result column of a benchmark_tbl
17 | for task_error objects. This is useful for filtering out rows where the
18 | result is a task error.
19 | }
20 | 


--------------------------------------------------------------------------------
/man/filter_zero_genes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils_filter.R
 3 | \name{filter_zero_genes}
 4 | \alias{filter_zero_genes}
 5 | \title{Filter out zero count genes}
 6 | \usage{
 7 | filter_zero_genes(x)
 8 | }
 9 | \arguments{
10 | \item{x}{the SingleCellExperiment or matrix to filter}
11 | }
12 | \value{
13 | object of same type as input with all zero count genes removed
14 | }
15 | \description{
16 | Remove all genes (rows) where the total count is 0
17 | }
18 | \examples{
19 | x <- matrix(rep(0:5, times = 5), nrow = 6, ncol = 5)
20 | filter_zero_genes(x)
21 | }
22 | 


--------------------------------------------------------------------------------
/man/data_list.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data_list.R
 3 | \name{data_list}
 4 | \alias{data_list}
 5 | \title{Constructor for a data list}
 6 | \usage{
 7 | data_list(...)
 8 | }
 9 | \arguments{
10 | \item{...}{objects, must all be named}
11 | }
12 | \value{
13 | a list of named data
14 | }
15 | \description{
16 | Constructor for a list of data, a thin wrapper around list() which
17 | checks that all the inputs are of the same type and have names
18 | }
19 | \examples{
20 | data(iris)
21 | flist <- data_list(
22 |     data1 = iris[1:20, ],
23 |     data2 = iris[21:40, ]
24 | )
25 | }
26 | 


--------------------------------------------------------------------------------
/inst/prebuilt-doc/CreatingBenchmarks.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Creating Benchmarks"
 3 | author: "Shian Su"
 4 | date: "`r Sys.Date()`"
 5 | output: BiocStyle::pdf_document
 6 | vignette: >
 7 |   %\VignetteIndexEntry{Vignette Title}
 8 |   %\VignetteEngine{knitr::rmarkdown}
 9 |   %\VignetteEncoding{UTF-8}
10 | ---
11 | 
12 | # Introduction
13 | 
14 | This vignette will provide a tutorial on how to construct your own benchmarks. There are two essentialy components that need to be prepared for benchmarking, data prepared in a consistent fashion and methods that have consistent inputs and outputs. 
15 | 
16 | # Preparing Data
17 | 
18 | # Preparing Methods
19 | 


--------------------------------------------------------------------------------
/tests/testthat/test-utils_check.R:
--------------------------------------------------------------------------------
 1 | context("Checking functions")
 2 | 
 3 | test_that(
 4 | "Test class testing functions", {
 5 |     expect_true(is_one_of(1, c("numeric", "character")))
 6 |     expect_true(is_one_of("A", c("numeric", "character")))
 7 |     expect_false(is_one_of(TRUE, c("numeric", "character")))
 8 |     expect_false(is_one_of("A", c("numeric", "logical")))
 9 |     expect_error(is_one_of(1, 1))
10 | 
11 |     x <- 0
12 |     class(x) <- c("foo", "numeric")
13 |     expect_true(is_all_of(x, c("foo", "numeric")))
14 |     expect_true(is_all_of(x, c("numeric", "foo")))
15 |     expect_false(is_all_of(x, c("foo", "numeric", "bar")))
16 |     expect_error(is_all_of(1, 1))
17 | })
18 | 


--------------------------------------------------------------------------------
/man/cellbench_file.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/cellbench_file.R
 3 | \name{cellbench_file}
 4 | \alias{cellbench_file}
 5 | \title{Get path to CellBench packaged data}
 6 | \usage{
 7 | cellbench_file(filename = NULL)
 8 | }
 9 | \arguments{
10 | \item{filename}{the name of the file to look for}
11 | }
12 | \value{
13 | string containing the path to the packaged data
14 | }
15 | \description{
16 | Search CellBench package for packaged data, leaving argument empty will list
17 | the available data.
18 | }
19 | \examples{
20 | cellbench_file() # shows available files
21 | cellbench_file("10x_sce_sample.rds") # returns path to 10x sample data
22 | }
23 | 


--------------------------------------------------------------------------------
/man/sample_genes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils_sampling.R
 3 | \name{sample_genes}
 4 | \alias{sample_genes}
 5 | \title{Sample genes from a SingleCellExperiment}
 6 | \usage{
 7 | sample_genes(x, n)
 8 | }
 9 | \arguments{
10 | \item{x}{the SingleCellExperiment object}
11 | 
12 | \item{n}{the number of genes to sample}
13 | }
14 | \value{
15 | SingleCellExperiment object
16 | }
17 | \description{
18 | Sample n genes from a SingleCellExperiment object with no replacement
19 | }
20 | \examples{
21 | sample_sce_data <- readRDS(cellbench_file("10x_sce_sample.rds"))
22 | dim(sample_sce_data)
23 | x <- sample_genes(sample_sce_data, 50)
24 | dim(x)
25 | }
26 | 


--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
 1 | citHeader("To cite the 'CellBench' package please use:")
 2 | 
 3 | bibentry(
 4 |     bibtype = "Article",
 5 |     doi = "10.1093/bioinformatics/btz889",
 6 |     author = personList(
 7 |         as.person("Shian Su"),
 8 |         as.person("Luyi Tian"),
 9 |         as.person("Xueyi Dong"),
10 |         as.person("Peter F. Hickey"),
11 |         as.person("Saskia Freytag"),
12 |         as.person("Matthew E. Ritchie")
13 |     ),
14 |     journal = "Bioinformatics",
15 |     title = "CellBench: R/Bioconductor software for comparing single-cell RNA-seq analysis methods",
16 |     year = "2019",
17 |     month = "11",
18 |     volume = "36",
19 |     pages = "2288-2290",
20 |     number = "7"
21 | )
22 | 


--------------------------------------------------------------------------------
/man/sample_cells.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils_sampling.R
 3 | \name{sample_cells}
 4 | \alias{sample_cells}
 5 | \title{Sample cells from a SingleCellExperiment}
 6 | \usage{
 7 | sample_cells(x, n)
 8 | }
 9 | \arguments{
10 | \item{x}{the SingleCellExperiment object}
11 | 
12 | \item{n}{the number of cells to sample}
13 | }
14 | \value{
15 | SingleCellExperiment object
16 | }
17 | \description{
18 | Sample n cells from a SingleCellExperiment object with no replacement.
19 | }
20 | \examples{
21 | sample_sce_data <- readRDS(cellbench_file("celseq_sce_sample.rds"))
22 | dim(sample_sce_data)
23 | x <- sample_cells(sample_sce_data, 10)
24 | dim(x)
25 | }
26 | 


--------------------------------------------------------------------------------
/tests/testthat/test-fn_list.R:
--------------------------------------------------------------------------------
 1 | context("Function list constructor")
 2 | 
 3 | test_that(
 4 |     "fn_list constructor works properly", {
 5 |     flist <- fn_list(
 6 |         mean = mean,
 7 |         median = median
 8 |     )
 9 | 
10 |     expected <- list(
11 |         mean = mean,
12 |         median = median
13 |     )
14 | 
15 |     expect_identical(flist, expected)
16 | 
17 |     expect_error(
18 |         fn_list(1),
19 |         "all fn_list members must be functions"
20 |     )
21 | 
22 |     expect_error(
23 |         fn_list(log),
24 |         "all fn_list members must have names, e.g. fn_list(fn1 = log)",
25 |         fixed = TRUE
26 |     )
27 | 
28 |     expect_error(fn_list(a = log, mean))
29 | })
30 | 


--------------------------------------------------------------------------------
/man/mhead.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{mhead}
 4 | \alias{mhead}
 5 | \title{Get head of 2 dimensional object as a square block}
 6 | \usage{
 7 | mhead(x, n = 6)
 8 | }
 9 | \arguments{
10 | \item{x}{the object with 2 dimensions}
11 | 
12 | \item{n}{the size of the n-by-n block to extract}
13 | }
14 | \value{
15 | an n-by-n sized subset of x
16 | }
17 | \description{
18 | head prints all columns which may flood the console, mhead takes a square
19 | block which can look nicer and still provide a good inspection of the
20 | contents
21 | }
22 | \examples{
23 | x <- matrix(runif(100), nrow = 10, ncol = 10)
24 | 
25 | mhead(x)
26 | mhead(x, n = 3)
27 | }
28 | 


--------------------------------------------------------------------------------
/man/keep_high_count_cells.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils_filter.R
 3 | \name{keep_high_count_cells}
 4 | \alias{keep_high_count_cells}
 5 | \title{Filter down to the highest count cells}
 6 | \usage{
 7 | keep_high_count_cells(x, n)
 8 | }
 9 | \arguments{
10 | \item{x}{the SingleCellExperiment or matrix}
11 | 
12 | \item{n}{the number of highest count cells to keep}
13 | }
14 | \value{
15 | object of same type as input containing the highest count cells
16 | }
17 | \description{
18 | Filter a SingleCellExperiment or matrix down to the cells (columns) with the
19 | highest counts
20 | }
21 | \examples{
22 | data(sample_sce_data)
23 | keep_high_count_cells(sample_sce_data, 10)
24 | }
25 | 


--------------------------------------------------------------------------------
/man/keep_high_count_genes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils_filter.R
 3 | \name{keep_high_count_genes}
 4 | \alias{keep_high_count_genes}
 5 | \title{Filter down to the highest count genes}
 6 | \usage{
 7 | keep_high_count_genes(x, n)
 8 | }
 9 | \arguments{
10 | \item{x}{the SingleCellExperiment or matrix}
11 | 
12 | \item{n}{the number of highest count genes to keep}
13 | }
14 | \value{
15 | object of same type as input containing the highest count genes
16 | }
17 | \description{
18 | Filter a SingleCellExperiment or matrix down to the genes (rows) with the
19 | highest counts
20 | }
21 | \examples{
22 | data(sample_sce_data)
23 | keep_high_count_genes(sample_sce_data, 300)
24 | }
25 | 


--------------------------------------------------------------------------------
/man/arrow_sep.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{arrow_sep}
 4 | \alias{arrow_sep}
 5 | \title{Unicode arrow separators}
 6 | \usage{
 7 | arrow_sep(towards = c("right", "left"), unicode = FALSE)
 8 | }
 9 | \arguments{
10 | \item{towards}{the direction the unicode arrow points towards}
11 | 
12 | \item{unicode}{whether unicode arrows should be used. Does not work inside
13 | plots within knitted PDF documents.}
14 | }
15 | \value{
16 | a string containing an unicode arrow surrounded by two spaces
17 | }
18 | \description{
19 | Utility function for generating unicode arrow separators.
20 | }
21 | \examples{
22 | arrow_sep("left") # left arrrow
23 | arrow_sep("right") # right arrrow
24 | }
25 | 


--------------------------------------------------------------------------------
/man/keep_high_var_genes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils_filter.R
 3 | \name{keep_high_var_genes}
 4 | \alias{keep_high_var_genes}
 5 | \title{Filter down to the most variable genes}
 6 | \usage{
 7 | keep_high_var_genes(x, n)
 8 | }
 9 | \arguments{
10 | \item{x}{the SingleCellExperiment or matrix}
11 | 
12 | \item{n}{the number of most variable genes to keep}
13 | }
14 | \value{
15 | object of same type as input containing the most variable genes
16 | }
17 | \description{
18 | Filter a SingleCellExperiment or matrix down to the most variable genes
19 | (rows), variability is determined by var() scaled by the total counts for the
20 | gene.
21 | }
22 | \examples{
23 | data(sample_sce_data)
24 | keep_high_var_genes(sample_sce_data, 50)
25 | }
26 | 


--------------------------------------------------------------------------------
/man/any_task_errors.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/any_task_errors.R
 3 | \name{any_task_errors}
 4 | \alias{any_task_errors}
 5 | \alias{any_task_errors.benchmark_tbl}
 6 | \title{Check if any tasks produced errors}
 7 | \usage{
 8 | any_task_errors(x, verbose)
 9 | 
10 | \method{any_task_errors}{benchmark_tbl}(x, verbose = FALSE)
11 | }
12 | \arguments{
13 | \item{x}{the tibble to check}
14 | 
15 | \item{verbose}{TRUE if the rows with errors should be reported}
16 | }
17 | \value{
18 | TRUE if any entry in the result column is a task_error object
19 | }
20 | \description{
21 | Check the results column of a benchmark tibble for any task_error objects.
22 | }
23 | \section{Methods (by class)}{
24 | \itemize{
25 | \item \code{any_task_errors(benchmark_tbl)}: 
26 | 
27 | }}
28 | 


--------------------------------------------------------------------------------
/man/set_cellbench_cache_path.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/set_options.R
 3 | \name{set_cellbench_cache_path}
 4 | \alias{set_cellbench_cache_path}
 5 | \title{Set CellBench cache path}
 6 | \usage{
 7 | set_cellbench_cache_path(path = "./.CellBenchCache")
 8 | }
 9 | \arguments{
10 | \item{path}{the path to where method caches should be stored}
11 | }
12 | \value{
13 | None
14 | }
15 | \description{
16 | Set CellBench cache path
17 | }
18 | \examples{
19 | \dontrun{
20 | # hidden folder in local path
21 | set_cellbench_cache_path(".CellBenchCache"))
22 | }
23 | # store in temp directory valid for this session
24 | set_cellbench_cache_path(file.path(tempdir(), ".CellBenchCache"))
25 | 
26 | }
27 | \seealso{
28 | \code{\link{cache_method}} for constructing cached methods.
29 | }
30 | 


--------------------------------------------------------------------------------
/man/set_cellbench_bpparam.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/set_options.R
 3 | \name{set_cellbench_bpparam}
 4 | \alias{set_cellbench_bpparam}
 5 | \title{Set BiocParallel parameter used CellBench}
 6 | \usage{
 7 | set_cellbench_bpparam(param)
 8 | }
 9 | \arguments{
10 | \item{param}{the BiocParallel parameter object}
11 | }
12 | \value{
13 | None
14 | }
15 | \description{
16 | This is a more advanced interface for changing CellBench's parallelism
17 | settings. Internally CellBench uses BiocParallel for parallelism, consult
18 | the documentation of BiocParallel to see what settings are available.
19 | }
20 | \examples{
21 | set_cellbench_threads(1) # CellBench runs on a single thread
22 | 
23 | }
24 | \seealso{
25 | \code{\link{set_cellbench_threads}} for more basic interface
26 | }
27 | 


--------------------------------------------------------------------------------
/R/cellbench_file.R:
--------------------------------------------------------------------------------
 1 | #' Get path to CellBench packaged data
 2 | #'
 3 | #' Search CellBench package for packaged data, leaving argument empty will list
 4 | #' the available data.
 5 | #'
 6 | #' @param filename the name of the file to look for
 7 | #'
 8 | #' @return string containing the path to the packaged data
 9 | #' @export
10 | #'
11 | #' @examples
12 | #' cellbench_file() # shows available files
13 | #' cellbench_file("10x_sce_sample.rds") # returns path to 10x sample data
14 | cellbench_file <- function(filename = NULL) {
15 |     if (is.null(filename)) {
16 |         dir(system.file("extdata", package = "CellBench"))
17 |     } else {
18 |         output <- system.file("extdata", filename, package = "CellBench")
19 |         if (output == "") {
20 |             stop("file not found, run cellbench_file() to see available files")
21 |         }
22 |         output
23 |     }
24 | }
25 | 


--------------------------------------------------------------------------------
/R/sample_sce_data.R:
--------------------------------------------------------------------------------
 1 | #' This is data for testing functions in CellBench.
 2 | #'
 3 | #' A dataset containing 200 genes and 50 cells randomly sampled from the CelSeq
 4 | #' mRNA mixture dataset, each sample is a mixture of mRNA material from 3
 5 | #' different human adenocarcinoma cell lines. Useful for quick prototyping of
 6 | #' method wrappers.
 7 | #'
 8 | #' @usage data(sample_sce_data)
 9 | #'
10 | #' @format A SingleCellExperiment object with sample annotations in
11 | #'   \code{colData(sample_sce_data)}. The annotation contains various QC metrics
12 | #'   as well as the cell line mixture proportions
13 | #' \describe{
14 | #'   \item{H2228_prop}{proportion of mRNA from H2228 cell line}
15 | #'   \item{H1975_prop}{proportion of mRNA from H1975 cell line}
16 | #'   \item{HCC827_prop}{proportion of mRNA from HCC827 cell line}
17 | #' }
18 | #'
19 | #' @seealso \code{\link{load_mrna_mix_data}}
20 | "sample_sce_data"
21 | 


--------------------------------------------------------------------------------
/man/summary.benchmark_tbl.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/benchmark_tbl_methods.R
 3 | \name{summary.benchmark_tbl}
 4 | \alias{summary.benchmark_tbl}
 5 | \title{Summary of benchmark_tbl}
 6 | \usage{
 7 | \method{summary}{benchmark_tbl}(object, ...)
 8 | }
 9 | \arguments{
10 | \item{object}{the benchmark_tbl to be summarised}
11 | 
12 | \item{...}{additional arguments affecting the summary produced.}
13 | }
14 | \value{
15 | None
16 | }
17 | \description{
18 | Summary of benchmark_tbl
19 | }
20 | \examples{
21 | # list of data
22 | datasets <- list(
23 |     set1 = rnorm(500, mean = 2, sd = 1),
24 |     set2 = rnorm(500, mean = 1, sd = 2)
25 | )
26 | 
27 | # list of functions
28 | add_noise <- list(
29 |     none = identity,
30 |     add_bias = function(x) { x + 1 }
31 | )
32 | 
33 | res <- apply_methods(datasets, add_noise)
34 | summary(res)
35 | }
36 | \keyword{internal}
37 | 


--------------------------------------------------------------------------------
/man/load_all_data.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/load_data.R
 3 | \name{load_sc_data}
 4 | \alias{load_sc_data}
 5 | \alias{load_cell_mix_data}
 6 | \alias{load_mrna_mix_data}
 7 | \alias{load_all_data}
 8 | \title{Load CellBench Data}
 9 | \usage{
10 | load_sc_data()
11 | 
12 | load_cell_mix_data()
13 | 
14 | load_mrna_mix_data()
15 | 
16 | load_all_data()
17 | }
18 | \value{
19 | list of SingleCellExperiment
20 | }
21 | \description{
22 | Load in all CellBench data described at <https://github.com/LuyiTian/CellBench_data/blob/master/README.md>.
23 | }
24 | \section{Functions}{
25 | \itemize{
26 | \item \code{load_sc_data()}: Load single cell data
27 | 
28 | \item \code{load_cell_mix_data()}: Load cell mixture data
29 | 
30 | \item \code{load_mrna_mix_data()}: Load mrna mixture data
31 | 
32 | }}
33 | \examples{
34 | \dontrun{
35 | cellbench_file <- load_all_data()
36 | }
37 | }
38 | 


--------------------------------------------------------------------------------
/man/set_cellbench_threads.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/set_options.R
 3 | \name{set_cellbench_threads}
 4 | \alias{set_cellbench_threads}
 5 | \title{Set number of threads used by CellBench}
 6 | \usage{
 7 | set_cellbench_threads(nthreads = 1)
 8 | }
 9 | \arguments{
10 | \item{nthreads}{the number of threads used by CellBench}
11 | }
12 | \value{
13 | None
14 | }
15 | \description{
16 | Sets global parameter for CellBench to use multiple threads for applying
17 | methods. If any methods applied are multi-threaded then it's recommended to
18 | set CellBench threads to 1. It only recommended to use CellBench with
19 | multiple threads if methods applied can be set to run on single threads.
20 | }
21 | \examples{
22 | set_cellbench_threads(1) # CellBench runs on a single thread
23 | 
24 | }
25 | \seealso{
26 | \code{\link{set_cellbench_bpparam}} for more advanced interface
27 | }
28 | 


--------------------------------------------------------------------------------
/tests/testthat/test-benchmark_timing_tbl_methods.R:
--------------------------------------------------------------------------------
 1 | context("Bechmark timing table methods")
 2 | 
 3 | test_that("benchmark_timing_tbl methods work", {
 4 |     datasets <- list(
 5 |         data1 = 1:10,
 6 |         data2 = 11:20
 7 |     )
 8 | 
 9 |     transform <- list(
10 |         identity = identity
11 |     )
12 | 
13 |     res <- datasets %>%
14 |         time_methods(transform)
15 | 
16 |     res_timing_stripped <- res %>% strip_timing()
17 |     res_timing_unpacked <- res %>% unpack_timing()
18 | 
19 |     expect_equal(
20 |         class(res_timing_stripped),
21 |         c("benchmark_tbl", "tbl_df", "tbl", "data.frame")
22 |     )
23 | 
24 |     expect_equal(
25 |         datasets,
26 |         res_timing_stripped$result,
27 |         check.names = FALSE
28 |     )
29 | 
30 |     expect_equal(
31 |         colnames(res_timing_unpacked),
32 |         c("data", "transform", "user", "system", "elapsed")
33 |     )
34 | })
35 | 
36 | 


--------------------------------------------------------------------------------
/R/package.R:
--------------------------------------------------------------------------------
 1 | #' A framework for benchmarking combinations of methods in multi-stage pipelines
 2 | #'
 3 | #' This package contains a framework for benchmarking combinations of methods in
 4 | #' a multi-stage pipeline. It is mainly based around the \code{apply_methods}
 5 | #' function, which takes lists of functions to be applied in stages of a
 6 | #' pipeline.
 7 | #'
 8 | #' @docType package
 9 | #' @name CellBench-package
10 | #' @aliases CellBench
11 | #'
12 | #' @author Shian Su <\url{https://www.github.com/shians}>
13 | #' @seealso The core function in this package is \code{\link{apply_methods}},
14 | #'   see \code{vignette("Introduction", package = "CellBench")} for basic usage.
15 | #'   Run \code{cellbench_case_study()} to see a case study using CellBench. The
16 | #'   data loading functions from \code{\link{load_all_data}} may also be of
17 | #'   interest.
18 | #' @importFrom methods is
19 | #' @importFrom magrittr %>%
20 | #' @importFrom assertthat assert_that
21 | NULL
22 | 


--------------------------------------------------------------------------------
/man/CellBench-package.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/package.R
 3 | \docType{package}
 4 | \name{CellBench-package}
 5 | \alias{CellBench-package}
 6 | \alias{CellBench}
 7 | \title{A framework for benchmarking combinations of methods in multi-stage pipelines}
 8 | \description{
 9 | This package contains a framework for benchmarking combinations of methods in
10 | a multi-stage pipeline. It is mainly based around the \code{apply_methods}
11 | function, which takes lists of functions to be applied in stages of a
12 | pipeline.
13 | }
14 | \seealso{
15 | The core function in this package is \code{\link{apply_methods}},
16 |   see \code{vignette("Introduction", package = "CellBench")} for basic usage.
17 |   Run \code{cellbench_case_study()} to see a case study using CellBench. The
18 |   data loading functions from \code{\link{load_all_data}} may also be of
19 |   interest.
20 | }
21 | \author{
22 | Shian Su <\url{https://www.github.com/shians}>
23 | }
24 | 


--------------------------------------------------------------------------------
/R/any_task_errors.R:
--------------------------------------------------------------------------------
 1 | #' Check if any tasks produced errors
 2 | #'
 3 | #' Check the results column of a benchmark tibble for any task_error objects.
 4 | #'
 5 | #' @param x the tibble to check
 6 | #' @param verbose TRUE if the rows with errors should be reported
 7 | #'
 8 | #' @return TRUE if any entry in the result column is a task_error object
 9 | #' @export
10 | any_task_errors <- function(x, verbose) {
11 |     UseMethod("any_task_errors", x)
12 | }
13 | 
14 | #' @describeIn any_task_errors
15 | #'
16 | #' @export
17 | any_task_errors.benchmark_tbl <- function(x, verbose = FALSE) {
18 |     any_errors <- FALSE
19 | 
20 |     for (i in seq_along(x$result)) {
21 |         res <- x$result[[i]]
22 | 
23 |         if (is(res, "task_error")) {
24 |             any_errors <- TRUE
25 | 
26 |             if (verbose) {
27 |                 message(glue::glue("task_error in row {i}"))
28 |             }
29 |         }
30 |     }
31 | 
32 |     any_errors
33 | }
34 | 
35 | # TODO: Implement for benchmark_timing_tbl
36 | 


--------------------------------------------------------------------------------
/man/strip_timing.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/benchmark_timing_tbl_methods.R
 3 | \name{strip_timing}
 4 | \alias{strip_timing}
 5 | \alias{strip_timing.benchmark_timing_tbl}
 6 | \title{Strip timing information}
 7 | \usage{
 8 | strip_timing(x)
 9 | 
10 | \method{strip_timing}{benchmark_timing_tbl}(x)
11 | }
12 | \arguments{
13 | \item{x}{the benchmark_timing_tbl object}
14 | }
15 | \value{
16 | benchmark_tbl
17 | }
18 | \description{
19 | Takes the result of a time_methods() call and remove timing information from
20 | the `timed_result` column, replacing it with a `result` column and converting
21 | it to a benchmark_tbl.
22 | }
23 | \examples{
24 | \dontrun{
25 | datasets <- list(
26 |     data1 = 1:1e8,
27 | )
28 | 
29 | transforms <- list(
30 |     log = log,
31 |     sqrt = sqrt
32 | )
33 | 
34 | datasets \%>\%
35 |     time_methods(transforms) \%>\%
36 |     strip_timing()
37 | }
38 | 
39 | }
40 | \seealso{
41 | \code{\link{unpack_timing}}
42 | }
43 | \keyword{internal}
44 | 


--------------------------------------------------------------------------------
/man/as_pipeline_list.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{as_pipeline_list}
 4 | \alias{as_pipeline_list}
 5 | \title{convert benchmark_tbl to list}
 6 | \usage{
 7 | as_pipeline_list(x)
 8 | }
 9 | \arguments{
10 | \item{x}{the benchmark_tbl object to convert}
11 | }
12 | \value{
13 | list containing the results with names set to data and pipeline steps
14 |   separated by ..
15 | }
16 | \description{
17 | convert a benchmark_tbl to a list where the name of the elements represent the pipeline steps separated by "..". This can be useful for using the apply family of functions.
18 | }
19 | \examples{
20 | # list of data
21 | datasets <- list(
22 |     set1 = rnorm(500, mean = 2, sd = 1),
23 |     set2 = rnorm(500, mean = 1, sd = 2)
24 | )
25 | 
26 | # list of functions
27 | add_noise <- list(
28 |     none = identity,
29 |     add_bias = function(x) { x + 1 }
30 | )
31 | 
32 | res <- apply_methods(datasets, add_noise)
33 | as_pipeline_list(res)
34 | }
35 | \seealso{
36 | \code{\link{collapse_pipeline}}
37 | }
38 | 


--------------------------------------------------------------------------------
/tests/testthat/test-time_methods.R:
--------------------------------------------------------------------------------
 1 | context("Time methods")
 2 | 
 3 | # System.time has rather large overhead and it's difficult to write extensive
 4 | # unit tests without enormous cost.
 5 | 
 6 | test_that("Time methods works", {
 7 |     datasets <- list(
 8 |         data1 = 1:10,
 9 |         data2 = 11:20
10 |     )
11 | 
12 |     transform <- list(
13 |         exp = exp
14 |     )
15 | 
16 |     res <- datasets %>%
17 |         time_methods(transform)
18 | 
19 |     expect_identical(res$data, factor(c("data1", "data2")))
20 |     expect_identical(res$transform, factor(c("exp", "exp")))
21 |     expect_equal(
22 |         unlist(lapply(res$timed_result, function(x) x$timing["elapsed"])),
23 |         c(0, 0),
24 |         check.names = FALSE,
25 |         tolerance = 0.2
26 |     )
27 | 
28 |     transform2 <- list(
29 |         log = log
30 |     )
31 | 
32 |     res2 <- res %>%
33 |         time_methods(transform2)
34 | 
35 |     expect_equal(
36 |         res2 %>% strip_timing() %>% extract2("result"),
37 |         datasets,
38 |         check.names = FALSE
39 |     )
40 | })
41 | 


--------------------------------------------------------------------------------
/tests/testthat/test-set_options.R:
--------------------------------------------------------------------------------
 1 | context("Option setting")
 2 | 
 3 | test_that(
 4 |     "Threads can be set properly", {
 5 |     set_cellbench_threads(1)
 6 |     expect_identical(getOption("CellBench.threads"), 1)
 7 | 
 8 |     set_cellbench_threads(2)
 9 |     expect_identical(getOption("CellBench.threads"), 2)
10 | 
11 |     expect_error(
12 |         set_cellbench_threads("foo"),
13 |         "is.numeric(nthreads) is not TRUE",
14 |         fixed = TRUE
15 |     )
16 | 
17 |     expect_error(
18 |         set_cellbench_bpparam(1),
19 |         'is(param, "BiocParallelParam") is not TRUE',
20 |         fixed = TRUE
21 |     )
22 | 
23 |     expect_silent(set_cellbench_bpparam(BiocParallel::SerialParam()))
24 | })
25 | 
26 | test_that(
27 |     "Cache path can be set properly", {
28 |     expect_error(
29 |         set_cellbench_cache_path(1),
30 |         "is.character(path) is not TRUE",
31 |         fixed = TRUE
32 |     )
33 | 
34 |     p <- file.path(tempdir(), ".CellBenchCache")
35 |     set_cellbench_cache_path(p)
36 | 
37 |     expect_true(dir.exists(p))
38 |     unlink(p)
39 | })
40 | 


--------------------------------------------------------------------------------
/man/check_class.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils_check.R
 3 | \name{check_class}
 4 | \alias{check_class}
 5 | \alias{is_one_of}
 6 | \alias{is_any_of}
 7 | \alias{is_all_of}
 8 | \title{Check class of object}
 9 | \usage{
10 | is_one_of(x, classes)
11 | 
12 | is_any_of(x, classes)
13 | 
14 | is_all_of(x, classes)
15 | }
16 | \arguments{
17 | \item{x}{the object to check}
18 | 
19 | \item{classes}{the vector of strings of class names}
20 | }
21 | \value{
22 | boolean value for the result of the check
23 | }
24 | \description{
25 | Check an object against a vector of class names. Testing if they
26 |   match any or all of the classes. For is_all_of, the object needs to be at
27 |   least every class specified, but it can have addition classes and still
28 |   pass the check.
29 | }
30 | \examples{
31 | is_one_of(1, c("numeric", "logical")) # TRUE
32 | is_one_of(1, c("character", "logical")) # FALSE
33 | 
34 | is_all_of(1, c("numeric", "logical")) # FALSE
35 | is_all_of(tibble::tibble(), c("tbl", "data.frame")) # TRUE
36 | }
37 | \keyword{internal}
38 | 


--------------------------------------------------------------------------------
/man/sample_sce_data.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/sample_sce_data.R
 3 | \docType{data}
 4 | \name{sample_sce_data}
 5 | \alias{sample_sce_data}
 6 | \title{This is data for testing functions in CellBench.}
 7 | \format{
 8 | A SingleCellExperiment object with sample annotations in
 9 |   \code{colData(sample_sce_data)}. The annotation contains various QC metrics
10 |   as well as the cell line mixture proportions
11 | \describe{
12 |   \item{H2228_prop}{proportion of mRNA from H2228 cell line}
13 |   \item{H1975_prop}{proportion of mRNA from H1975 cell line}
14 |   \item{HCC827_prop}{proportion of mRNA from HCC827 cell line}
15 | }
16 | }
17 | \usage{
18 | data(sample_sce_data)
19 | }
20 | \description{
21 | A dataset containing 200 genes and 50 cells randomly sampled from the CelSeq
22 | mRNA mixture dataset, each sample is a mixture of mRNA material from 3
23 | different human adenocarcinoma cell lines. Useful for quick prototyping of
24 | method wrappers.
25 | }
26 | \seealso{
27 | \code{\link{load_mrna_mix_data}}
28 | }
29 | \keyword{datasets}
30 | 


--------------------------------------------------------------------------------
/tests/testthat/test-fn_arg_seq.R:
--------------------------------------------------------------------------------
 1 | context("Generating functions with sequence of arguments")
 2 | 
 3 | test_that(
 4 |     "Function sequence generated correctly", {
 5 |     fn <- function(x, y) {
 6 |         x + y
 7 |     }
 8 | 
 9 |     fn_list <- fn_arg_seq(fn, y = 1:3)
10 | 
11 |     expect_identical(
12 |         names(fn_list),
13 |         c("fn(y = 1)", "fn(y = 2)", "fn(y = 3)")
14 |     )
15 | 
16 |     expect_identical(
17 |         purrr::map_dbl(fn_list, function(f) {f(1)}),
18 |         c(`fn(y = 1)` = 2, `fn(y = 2)` = 3, `fn(y = 3)` = 4)
19 |     )
20 | })
21 | 
22 | test_that(
23 |     "Argument vector ordering is respected", {
24 |     fn <- function(x, y) {
25 |         x + y
26 |     }
27 |     fn_list <- fn_arg_seq(fn, x = 3:1)
28 | 
29 |     expect_identical(
30 |         names(fn_list),
31 |         c("fn(x = 3)", "fn(x = 2)", "fn(x = 1)")
32 |     )
33 | })
34 | 
35 | test_that(
36 |     "Errors are properly reported", {
37 |     fn <- function(x, y) {
38 |         x + y
39 |     }
40 | 
41 |     expect_error(fn_arg_seq(fn, z = 1:3, .strict = TRUE), "args not used in fn: 'z'", fixed = TRUE)
42 | })
43 | 


--------------------------------------------------------------------------------
/R/fn_list.R:
--------------------------------------------------------------------------------
 1 | #' Constructor for a function list
 2 | #'
 3 | #' Constructor for a list of functions, a thin wrapper around list() which
 4 | #' checks that all the inputs are functions and have names
 5 | #'
 6 | #' @param ... objects, must all be named
 7 | #'
 8 | #' @return a list of named functions
 9 | #' @export
10 | #'
11 | #' @examples
12 | #' flist <- fn_list(
13 | #'     mean = mean,
14 | #'     median = median
15 | #' )
16 | fn_list <- function(...) {
17 |     out <- list(...)
18 | 
19 |     if (!purrr::every(out, is.function)) {
20 |         stop("all fn_list members must be functions")
21 |     }
22 | 
23 |     if (is.null(names(out))) {
24 |         fn_name <- deparse(substitute(...))
25 |         stop(glue::glue("all fn_list members must have names, e.g. fn_list(fn1 = {fn_name})"))
26 |     }
27 | 
28 |     if (any(names(out) == "")) {
29 |         missing_names <- which(names(out) == "")
30 |         missing_names <- glue::glue(collapse_with_comma("{missing_names}"))
31 |         stop(glue::glue("all fn_list members must have names, indices of members without name: {missing_names}"))
32 |     }
33 | 
34 |     out
35 | }
36 | 


--------------------------------------------------------------------------------
/R/data_list.R:
--------------------------------------------------------------------------------
 1 | #' Constructor for a data list
 2 | #'
 3 | #' Constructor for a list of data, a thin wrapper around list() which
 4 | #' checks that all the inputs are of the same type and have names
 5 | #'
 6 | #' @param ... objects, must all be named
 7 | #'
 8 | #' @return a list of named data
 9 | #' @export
10 | #'
11 | #' @examples
12 | #' data(iris)
13 | #' flist <- data_list(
14 | #'     data1 = iris[1:20, ],
15 | #'     data2 = iris[21:40, ]
16 | #' )
17 | data_list <- function(...) {
18 |     out <- list(...)
19 | 
20 |     if (!all_same_class(out)) {
21 |         stop("all data_list members must be have the same class")
22 |     }
23 | 
24 |     if (is.null(names(out))) {
25 |         data_name <- deparse(substitute(...))
26 |         stop(glue::glue("all data_list members must have names, e.g. data_list(fn1 = {data_name})"))
27 |     }
28 | 
29 |     if (any(names(out) == "")) {
30 |         missing_names <- which(names(out) == "")
31 |         missing_names <- glue::glue(collapse_with_comma("{missing_names}"))
32 |         stop(glue::glue("all data_list members must have names, indices of members without name: {missing_names}"))
33 |     }
34 | 
35 |     out
36 | }
37 | 


--------------------------------------------------------------------------------
/man/split_step.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/split_step.R
 3 | \name{split_step}
 4 | \alias{split_step}
 5 | \title{Split combined pipeline step}
 6 | \usage{
 7 | split_step(x, step, into)
 8 | }
 9 | \arguments{
10 | \item{x}{a results data.frame from `apply_methods()`.}
11 | 
12 | \item{step}{the name of the column to split.}
13 | 
14 | \item{into}{the name of the columns to split into.}
15 | }
16 | \value{
17 | a results data.frame where the `step` column has been split into
18 | the `into` columns with duplicated values.
19 | }
20 | \description{
21 | Some methods perform multiple steps of a pipeline. This function assists with
22 | splitting the combined pipeline step into multiple steps with duplicated
23 | method names.
24 | }
25 | \examples{
26 | datasets <- list(
27 |     set1 = rnorm(500, mean = 2, sd = 1),
28 |     set2 = rnorm(500, mean = 1, sd = 2)
29 | )
30 | 
31 | # list of functions
32 | add_noise <- list(
33 |     none = identity,
34 |     add_bias = function(x) { x + 1 }
35 | )
36 | 
37 | res <- apply_methods(datasets, add_noise)
38 | 
39 | res \%>\%
40 |     split_step("add_noise", c("split1", "split2"))
41 | }
42 | 


--------------------------------------------------------------------------------
/man/unpack_timing.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/benchmark_timing_tbl_methods.R
 3 | \name{unpack_timing}
 4 | \alias{unpack_timing}
 5 | \alias{unpack_timing.benchmark_timing_tbl}
 6 | \title{Unpack timing information}
 7 | \usage{
 8 | unpack_timing(x)
 9 | 
10 | \method{unpack_timing}{benchmark_timing_tbl}(x)
11 | }
12 | \arguments{
13 | \item{x}{the benchmark_timing_tbl object}
14 | }
15 | \value{
16 | a tibble containing pipeline steps and timing information
17 | }
18 | \description{
19 | Takes the result of a time_methods() call and remove the `timed_result`
20 | column, replacing it with three columns of durations representing the
21 | `system`, `user` and `elapsed` times from a system.time() call.
22 | }
23 | \examples{
24 | \dontrun{
25 | datasets <- list(
26 |     data1 = c(1, 2, 3)
27 | )
28 | 
29 | transforms <- list(
30 |     log = function(x) { Sys.sleep(0.1); log(x) },
31 |     sqrt = function(x) { Sys.sleep(0.1); sqrt(x) }
32 | )
33 | 
34 | datasets \%>\%
35 |     time_methods(transforms) \%>\%
36 |     unpack_timing()
37 | }
38 | 
39 | }
40 | \seealso{
41 | \code{\link{strip_timing}}
42 | }
43 | \keyword{internal}
44 | 


--------------------------------------------------------------------------------
/man/cache_method.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/memoise.R
 3 | \name{cache_method}
 4 | \alias{cache_method}
 5 | \title{Create a cached function for CellBench}
 6 | \usage{
 7 | cache_method(f, cache = getOption("CellBench.cache"))
 8 | }
 9 | \arguments{
10 | \item{f}{the function to be cached}
11 | 
12 | \item{cache}{the cache information (from memoise package)}
13 | }
14 | \value{
15 | function whose results are cached and is called identically to f
16 | }
17 | \description{
18 | Take a function and return a cached version. The arguments and results of a
19 | cached method is saved to disk and if the cached function is called again
20 | with the same arguments then the results will be retrieved from the cache
21 | rather than be recomputed.
22 | }
23 | \details{
24 | \bold{(CAUTION)} Because cached functions called
25 | with the same argument will always return the same output, pseudo-random
26 | methods will not return varying results over repeated runs as one might
27 | expect.
28 | 
29 | This function is a thin wrapper around \code{\link[memoise]{memoise}}
30 | }
31 | \examples{
32 | # sets cache path to a temporary directory
33 | set_cellbench_cache_path(file.path(tempdir(), ".CellBenchCache"))
34 | f <- function(x) { x + 1 }
35 | cached_f <- cache_method(f)
36 | 
37 | }
38 | \seealso{
39 | \code{\link{set_cellbench_cache_path}}
40 | }
41 | 


--------------------------------------------------------------------------------
/tests/testthat/test-utils_filter.R:
--------------------------------------------------------------------------------
 1 | context("Filtering utilities")
 2 | 
 3 | test_that(
 4 |     "Filtering zero genes works", {
 5 |     x <- matrix(runif(100), nrow = 10, ncol = 10)
 6 | 
 7 |     x[1, ] <- 0
 8 | 
 9 |     expect_identical(
10 |         filter_zero_genes(x),
11 |         x[-1, ]
12 |     )
13 | 
14 |     sample_sce_data <- readRDS(cellbench_file("10x_sce_sample.rds"))
15 | 
16 |     x <- sample_sce_data
17 |     counts(x)[1, ] <- 0
18 |     expect_equal(
19 |         filter_zero_genes(x),
20 |         x[-1, ]
21 |     )
22 | 
23 |     y <- "foo"
24 |     expect_error(
25 |         filter_zero_genes(y),
26 |         "is_one_of(x, c(\"SingleCellExperiment\", \"matrix\")) is not TRUE",
27 |         fixed = TRUE
28 |     )
29 | })
30 | 
31 | test_that(
32 |     "Keeping high count genes works", {
33 |     x <- matrix(rep(1:5, times = 10), nrow = 5)
34 |     expect_identical(
35 |         keep_high_count_genes(x, n = 2),
36 |         x[4:5, ]
37 |     )
38 | })
39 | 
40 | test_that(
41 |     "Keeping high count cells works", {
42 |     x <- matrix(rep(1:5, times = 10), ncol = 5, byrow = TRUE)
43 |     expect_identical(
44 |         keep_high_count_cells(x, n = 2),
45 |         x[, 4:5]
46 |     )
47 | })
48 | 
49 | test_that(
50 |     "Keeping high variance genes works", {
51 |     x <- matrix(rep(1:5, times = 10), ncol = 5, byrow = TRUE)
52 |     x[1:8, ] <- 1
53 |     expect_identical(
54 |         keep_high_var_genes(x, n = 2),
55 |         x[9:10, ]
56 |     )
57 | })
58 | 


--------------------------------------------------------------------------------
/R/print.R:
--------------------------------------------------------------------------------
 1 | #' Print method for task_error object
 2 | #'
 3 | #' task_error are objects that result from failed methods
 4 | #'
 5 | #' @param x a task_error object
 6 | #' @param ... not used
 7 | #'
 8 | #' @return None
 9 | #'
10 | #' @export
11 | #' @keywords internal
12 | print.task_error <- function(x, ...) {
13 |     cat(
14 |         glue::glue(
15 |             "error at step: '{x$error_location}'",
16 |             "message: 'error: {x$message}'",
17 |             "traceback() is available for this object",
18 |             .sep = "\n"
19 |         )
20 |     )
21 | 
22 |     invisible() # guard against implicit return
23 | }
24 | 
25 | #' Print method for fn_arg_seq output
26 | #'
27 | #' @param x fn_arg_seq object
28 | #' @param ... addition arguments for print
29 | #'
30 | #' @return None
31 | #'
32 | #' @export
33 | #' @keywords internal
34 | #'
35 | #' @examples
36 | #' fn_seq <- fn_arg_seq(kmeans, centers = 1:3)
37 | #' fn_seq
38 | print.fn_arg_seq <- function(x, ...) {
39 |     print(glue::glue("List of {length(x)} partial functions"))
40 |     print(glue::glue(" $ {names(x)}"))
41 | }
42 | 
43 | #' @exportS3Method vctrs::vec_ptype_abbr
44 | vec_ptype_abbr.task_error <- function(x, ...) {
45 |     "task_error"
46 | }
47 | 
48 | #' @exportS3Method vctrs::vec_ptype_abbr
49 | vec_ptype_abbr.remote_error <- function(x, ...) {
50 |     "remote_error"
51 | }
52 | 
53 | #' @exportS3Method vctrs::vec_ptype_abbr
54 | vec_ptype_abbr.SingleCellExperiment <- function(x, ...) {
55 |     "SingleCellExperiment"
56 | }
57 | 


--------------------------------------------------------------------------------
/R/utils_sampling.R:
--------------------------------------------------------------------------------
 1 | # sample n rows from data
 2 | sample_rows <- function(x, n) {
 3 |     if (nrow(x) > n) {
 4 |         x[sample(seq_nrow(x), n), ]
 5 |     } else {
 6 |         x
 7 |     }
 8 | }
 9 | 
10 | # sample n cols from data
11 | sample_cols <- function(x, n) {
12 |     if (ncol(x) > n) {
13 |         x[, sample(seq_ncol(x), n)]
14 |     } else {
15 |         x
16 |     }
17 | }
18 | 
19 | #' Sample cells from a SingleCellExperiment
20 | #'
21 | #' Sample n cells from a SingleCellExperiment object with no replacement.
22 | #'
23 | #' @param x the SingleCellExperiment object
24 | #' @param n the number of cells to sample
25 | #'
26 | #' @return SingleCellExperiment object
27 | #' @export
28 | #'
29 | #' @examples
30 | #' sample_sce_data <- readRDS(cellbench_file("celseq_sce_sample.rds"))
31 | #' dim(sample_sce_data)
32 | #' x <- sample_cells(sample_sce_data, 10)
33 | #' dim(x)
34 | sample_cells <- function(x, n) {
35 |     stopifnot(is(x, "SingleCellExperiment"))
36 |     sample_cols(x, n)
37 | }
38 | 
39 | #' Sample genes from a SingleCellExperiment
40 | #'
41 | #' Sample n genes from a SingleCellExperiment object with no replacement
42 | #'
43 | #' @param x the SingleCellExperiment object
44 | #' @param n the number of genes to sample
45 | #'
46 | #' @return SingleCellExperiment object
47 | #' @export
48 | #'
49 | #' @examples
50 | #' sample_sce_data <- readRDS(cellbench_file("10x_sce_sample.rds"))
51 | #' dim(sample_sce_data)
52 | #' x <- sample_genes(sample_sce_data, 50)
53 | #' dim(x)
54 | sample_genes <- function(x, n) {
55 |     stopifnot(is(x, "SingleCellExperiment"))
56 |     sample_rows(x, n)
57 | }
58 | 


--------------------------------------------------------------------------------
/R/split_step.R:
--------------------------------------------------------------------------------
 1 | #' Split combined pipeline step
 2 | #'
 3 | #' Some methods perform multiple steps of a pipeline. This function assists with
 4 | #' splitting the combined pipeline step into multiple steps with duplicated
 5 | #' method names.
 6 | #'
 7 | #' @param x a results data.frame from `apply_methods()`.
 8 | #' @param step the name of the column to split.
 9 | #' @param into the name of the columns to split into.
10 | #'
11 | #' @return a results data.frame where the `step` column has been split into
12 | #' the `into` columns with duplicated values.
13 | #'
14 | #' @importFrom rlang :=
15 | #' @export
16 | #'
17 | #' @examples
18 | #' datasets <- list(
19 | #'     set1 = rnorm(500, mean = 2, sd = 1),
20 | #'     set2 = rnorm(500, mean = 1, sd = 2)
21 | #' )
22 | #'
23 | #' # list of functions
24 | #' add_noise <- list(
25 | #'     none = identity,
26 | #'     add_bias = function(x) { x + 1 }
27 | #' )
28 | #'
29 | #' res <- apply_methods(datasets, add_noise)
30 | #'
31 | #' res %>%
32 | #'     split_step("add_noise", c("split1", "split2"))
33 | split_step <- function(x, step, into) {
34 |     assert_that(is(x, "data.frame"), msg = "`x` must be a data.frame.")
35 |     assert_that(
36 |         is(step, "character"),
37 |         length(step) == 1,
38 |         msg = "`step` must be character of length 1."
39 |     )
40 |     assert_that(
41 |         is(into, "character"),
42 |         length(into) > 1,
43 |         msg = "`into` must be character of length >1."
44 |     )
45 | 
46 |     x <- x %>%
47 |         dplyr::rename(!!into[1] := !!step)
48 | 
49 |     for (i in length(into):2) {
50 |         x <- x %>%
51 |             dplyr::mutate(!!into[i] := .data[[!!into[1]]], .after = !!into[1])
52 |     }
53 | 
54 |     x
55 | }
56 | 


--------------------------------------------------------------------------------
/man/fn_arg_seq.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/fn_arg_seq.R
 3 | \name{fn_arg_seq}
 4 | \alias{fn_arg_seq}
 5 | \title{Create a list of functions with arguments varying over a sequence}
 6 | \usage{
 7 | fn_arg_seq(func, ..., .strict = FALSE)
 8 | }
 9 | \arguments{
10 | \item{func}{function to generate list from}
11 | 
12 | \item{...}{vectors of values to use as arguments}
13 | 
14 | \item{.strict}{TRUE if argument names are checked, giving an error if
15 | specified argument does not appear in function signature. Note that
16 | functions with multiple methods generally have only f(x, ...) as their
17 | signature, so the check would fail even if the arguments are passed on.}
18 | }
19 | \value{
20 | list of functions with the specified arguments pre-applied. Names of
21 |     the list indicate the values that have been pre-applied.
22 | }
23 | \description{
24 | Generate a list of functions where specific arguments have been pre-applied
25 | from a sequences of arguments, i.e. a function f(x, n) may have the 'n'
26 | argument pre-applied with specific values to obtain functions f1(x, n = 1)
27 | and f2(x, n = 2) stored in a list.
28 | }
29 | \details{
30 | If multiple argument vectors are provided
31 | then the combinations of arguments in the sequences will be generated.
32 | }
33 | \examples{
34 | f <- function(x) {
35 |     cat("x:", x)
36 | }
37 | 
38 | f_list <- fn_arg_seq(f, x = c(1, 2))
39 | f_list
40 | f_list[[1]]() # x: 1
41 | f_list[[2]]() # x: 2
42 | 
43 | g <- function(x, y) {
44 |     cat("x:", x, "y:", y)
45 | }
46 | 
47 | g_list <- fn_arg_seq(g, x = c(1, 2), y = c(3, 4))
48 | g_list
49 | g_list[[1]]() # x: 1 y: 3
50 | g_list[[2]]() # x: 1 y: 4
51 | g_list[[3]]() # x: 2 y: 3
52 | g_list[[4]]() # x: 2 y: 4
53 | }
54 | 


--------------------------------------------------------------------------------
/man/collapse_pipeline.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{collapse_pipeline}
 4 | \alias{collapse_pipeline}
 5 | \alias{pipeline_collapse}
 6 | \title{Collapse benchmark_tbl into a two column summary}
 7 | \usage{
 8 | collapse_pipeline(
 9 |   x,
10 |   sep = arrow_sep("right"),
11 |   drop.steps = TRUE,
12 |   data.name = TRUE
13 | )
14 | 
15 | pipeline_collapse(
16 |   x,
17 |   sep = arrow_sep("right"),
18 |   drop.steps = TRUE,
19 |   data.name = TRUE
20 | )
21 | }
22 | \arguments{
23 | \item{x}{the benchmark_tbl to collapse}
24 | 
25 | \item{sep}{the separator to use for concatenating the pipeline steps}
26 | 
27 | \item{drop.steps}{if the data name and methods steps should be dropped from
28 | the output. TRUE by default.}
29 | 
30 | \item{data.name}{if the dataset name should be included in the pipeline
31 | string. Useful if only a single dataset is used.}
32 | }
33 | \value{
34 | benchmark_tbl with pipeline and result columns (and all other columns
35 |   if drop.steps is FALSE)
36 | }
37 | \description{
38 | Collapse benchmark_tbl into two columns: "pipeline" and "result". The
39 | "pipeline" column will be the concatenated values from the data and methods
40 | columns while the "result" column remains unchanged from the benchmark_tbl.
41 | This is useful for having a string summary of the pipeline for annotating.
42 | }
43 | \examples{
44 | # list of data
45 | datasets <- list(
46 |     set1 = rnorm(500, mean = 2, sd = 1),
47 |     set2 = rnorm(500, mean = 1, sd = 2)
48 | )
49 | 
50 | # list of functions
51 | add_noise <- list(
52 |     none = identity,
53 |     add_bias = function(x) { x + 1 }
54 | )
55 | 
56 | res <- apply_methods(datasets, add_noise)
57 | collapse_pipeline(res)
58 | }
59 | \seealso{
60 | \code{\link{as_pipeline_list}}
61 | }
62 | 


--------------------------------------------------------------------------------
/R/utils_check.R:
--------------------------------------------------------------------------------
 1 | #' @name check_class
 2 | #' @title Check class of object
 3 | #' @description Check an object against a vector of class names. Testing if they
 4 | #'   match any or all of the classes. For is_all_of, the object needs to be at
 5 | #'   least every class specified, but it can have addition classes and still
 6 | #'   pass the check.
 7 | #'
 8 | #' @param x the object to check
 9 | #' @param classes the vector of strings of class names
10 | #'
11 | #' @return boolean value for the result of the check
12 | NULL
13 | 
14 | #' @rdname check_class
15 | #'
16 | #' @importFrom methods is
17 | #' @export
18 | #' @keywords internal
19 | #'
20 | #' @examples
21 | #' is_one_of(1, c("numeric", "logical")) # TRUE
22 | #' is_one_of(1, c("character", "logical")) # FALSE
23 | #'
24 | #' is_all_of(1, c("numeric", "logical")) # FALSE
25 | #' is_all_of(tibble::tibble(), c("tbl", "data.frame")) # TRUE
26 | is_one_of <- function(x, classes) {
27 |     stopifnot(is(classes, "character"))
28 |     purrr::map_lgl(classes, function(class) is(x, class)) %>% any()
29 | }
30 | 
31 | #' @rdname check_class
32 | #' @keywords internal
33 | is_any_of <- is_one_of
34 | 
35 | #' @rdname check_class
36 | #'
37 | #' @importFrom methods is
38 | #' @keywords internal
39 | #' @export
40 | is_all_of <- function(x, classes) {
41 |     stopifnot(is(classes, "character"))
42 |     purrr::map_lgl(classes, function(class) is(x, class)) %>% all()
43 | }
44 | 
45 | 
46 | #' Check if all values in a vector are unique
47 | #'
48 | #' @param x the vector to check
49 | #'
50 | #' @return TRUE if all values in the vector are unique
51 | #'
52 | #' @export
53 | #' @keywords internal
54 | #'
55 | #' @examples
56 | #' all_unique(c(1, 2, 3)) # TRUE
57 | #' all_unique(c(1, 2, 2)) # FALSE
58 | all_unique <- function(x) {
59 |     stopifnot(is.vector(x))
60 | 
61 |     length(x) == length(unique(x))
62 | }
63 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: CellBench
 2 | Type: Package
 3 | Title: Construct Benchmarks for Single Cell Analysis Methods
 4 | Version: 1.17.1
 5 | Authors@R: c(
 6 |     person("Shian", "Su", email = "su.s@wehi.edu.au", role = c("cre", "aut")),
 7 |     person("Saskia", "Freytag", role = "aut"),
 8 |     person("Luyi", "Tian", role = "aut"),
 9 |     person("Xueyi", "Dong", role = "aut"),
10 |     person("Matthew", "Ritchie", role = "aut"),
11 |     person("Peter", "Hickey", role = "ctb"),
12 |     person("Stuart", "Lee", role = "ctb"))
13 | Description: This package contains infrastructure for benchmarking analysis 
14 |     methods and access to single cell mixture benchmarking data. It provides
15 |     a framework for organising analysis methods and testing combinations of
16 |     methods in a pipeline without explicitly laying out each combination. It
17 |     also provides utilities for sampling and filtering SingleCellExperiment
18 |     objects, constructing lists of functions with varying parameters, and
19 |     multithreaded evaluation of analysis methods.
20 | biocViews:
21 |     Software,
22 |     Infrastructure,
23 |     SingleCell
24 | URL: https://github.com/shians/cellbench
25 | BugReports: https://github.com/Shians/CellBench/issues
26 | License: GPL-3
27 | Encoding: UTF-8
28 | Depends:
29 |     R (>= 3.6),
30 |     SingleCellExperiment,
31 |     magrittr,
32 |     methods,
33 |     stats,
34 |     tibble,
35 |     utils
36 | Imports:
37 |     assertthat,
38 |     BiocGenerics,
39 |     BiocFileCache,
40 |     BiocParallel,
41 |     dplyr,
42 |     rlang,
43 |     glue,
44 |     memoise,
45 |     purrr (>= 0.3.0),
46 |     rappdirs,
47 |     tidyr,
48 |     tidyselect,
49 |     lubridate
50 | Suggests:
51 |     BiocStyle,
52 |     covr,
53 |     knitr,
54 |     rmarkdown,
55 |     testthat,
56 |     limma,
57 |     ggplot2
58 | VignetteBuilder: knitr
59 | RoxygenNote: 7.2.3
60 | 


--------------------------------------------------------------------------------
/R/benchmark_tbl_methods.R:
--------------------------------------------------------------------------------
 1 | #' Summary of benchmark_tbl
 2 | #'
 3 | #' @param object the benchmark_tbl to be summarised
 4 | #' @param ... additional arguments affecting the summary produced.
 5 | #'
 6 | #' @return None
 7 | #'
 8 | #' @export
 9 | #' @keywords internal
10 | #'
11 | #' @examples
12 | #' # list of data
13 | #' datasets <- list(
14 | #'     set1 = rnorm(500, mean = 2, sd = 1),
15 | #'     set2 = rnorm(500, mean = 1, sd = 2)
16 | #' )
17 | #'
18 | #' # list of functions
19 | #' add_noise <- list(
20 | #'     none = identity,
21 | #'     add_bias = function(x) { x + 1 }
22 | #' )
23 | #'
24 | #' res <- apply_methods(datasets, add_noise)
25 | #' summary(res)
26 | summary.benchmark_tbl <- function(object, ...) {
27 |     if (dplyr::last(colnames(object)) != "result") {
28 |         # if benchmark_tbl has been manipulated by user to non-standard form
29 |         print(summary.data.frame(object))
30 |         return()
31 |     }
32 | 
33 |     method_names <- names(object)
34 |     method_names <- method_names[-1]
35 |     method_names <- method_names[-length(method_names)]
36 | 
37 |     out <- ""
38 | 
39 |     out <- c(glue::glue("Pipeline summary:"), out)
40 |     pipeline_str_vec <- c("data", glue::glue("{method_names}"), "result")
41 |     out <- c(out, glue::glue_collapse(pipeline_str_vec, sep = " \u2192 "))
42 | 
43 |     names(method_names) <- method_names
44 |     unique_method_list <- purrr::map(
45 |         method_names,
46 |         function(nm) unique(object[, nm]) %>% dplyr::pull(1)
47 |         )
48 | 
49 |     for (method_name in method_names) {
50 |         unique_methods <- unique_method_list[[method_name]]
51 |         out <- c(out, glue::glue(""))
52 |         out <- c(out, glue::glue("{method_name} variants:"))
53 |         out <- c(out, glue::glue_collapse(glue::glue(" * {unique_methods}"), sep = "\n"))
54 |     }
55 | 
56 |     cat(out, sep = "\n")
57 | }
58 | 


--------------------------------------------------------------------------------
/man/time_methods.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/time_methods.R
 3 | \name{time_methods}
 4 | \alias{time_methods}
 5 | \alias{time_methods.list}
 6 | \alias{time_methods.benchmark_timing_tbl}
 7 | \title{Time methods}
 8 | \usage{
 9 | time_methods(x, fn_list, name = NULL, suppress.messages = TRUE)
10 | 
11 | \method{time_methods}{list}(x, fn_list, name = NULL, suppress.messages = TRUE)
12 | 
13 | \method{time_methods}{benchmark_timing_tbl}(x, fn_list, name = NULL, suppress.messages = TRUE)
14 | }
15 | \arguments{
16 | \item{x}{the list of data or benchmark timing tibble to apply methods to}
17 | 
18 | \item{fn_list}{the list of methods to be applied}
19 | 
20 | \item{name}{(optional) the name of the column for methods applied}
21 | 
22 | \item{suppress.messages}{TRUE if messages from running methods should be
23 | suppressed}
24 | }
25 | \value{
26 | benchmark_timing_tbl object containing results from methods applied,
27 |   the first column is the name of the dataset as factors, middle columns
28 |   contain method names as factors and the final column is a list of lists
29 |   containing the results of applying the methods and timings from calling
30 |   system.time().
31 | }
32 | \description{
33 | time_methods() take either lists of datasets or benchmark_timing_tbl objects
34 | and applies a list of functions. The output is a benchmark_timing_tbl where
35 | each method has been applied to each dataset or preceding result. Unlike
36 | apply_methods(), time_methods() is always single threaded as to produce fair
37 | and more consistent timings.
38 | }
39 | \examples{
40 | datasets <- list(
41 |     set1 = 1:1e7
42 | )
43 | 
44 | transform <- list(
45 |     sqrt = sqrt,
46 |     log = log
47 | )
48 | 
49 | time_methods(datasets, transform) \%>\%
50 |     unpack_timing() # extract timings out of list
51 | 
52 | }
53 | \seealso{
54 | \code{\link{apply_methods}}
55 | }
56 | 


--------------------------------------------------------------------------------
/inst/prebuilt-doc/Examples.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "CellBench User's Guide"
 3 | author: "Shian Su"
 4 | date: "`r Sys.Date()`"
 5 | output: BiocStyle::pdf_document
 6 | vignette: >
 7 |   %\VignetteIndexEntry{Vignette Title}
 8 |   %\VignetteEngine{knitr::rmarkdown}
 9 |   %\VignetteEncoding{UTF-8}
10 | ---
11 | 
12 | ```{r setup, include=FALSE}
13 | knitr::opts_chunk$set(echo = TRUE)
14 | ```
15 | 
16 | ```{r}
17 | library(CellBench)
18 | library(dplyr)
19 | library(ggplot2)
20 | 
21 | set_cellbench_threads(4)
22 | 
23 | cellbench_mrna_mix_data <- load_mrna_mix_data()
24 | 
25 | data <- list(
26 |     mrna_mix_celseq = cellbench_mrna_mix_data$mrna_mix_celseq
27 | )
28 | 
29 | norm_method <- list(
30 |     scran = scran_norm_expr
31 | )
32 | 
33 | impute_method <- fn_arg_seq(
34 |     impute_knn_smooth,
35 |     k = c(4, 8, 16, 32)
36 | )
37 | 
38 | res <- data %>%
39 |     apply_methods(norm_method)
40 | 
41 | res <- res %>%
42 |     apply_methods(impute_method)
43 | 
44 | dim_red <- list(
45 |     pca = compute_pca
46 | )
47 | 
48 | res <- res %>%
49 |     apply_methods(dim_red)
50 | 
51 | append_anno <- function(data_key, result) {
52 |     mRNA_amount <- colData(cellbench_mrna_mix_data$mrna_mix_celseq)$mRNA_amount
53 |     truth <- with(
54 |         colData(cellbench_mrna_mix_data$mrna_mix_celseq),
55 |         paste(H2228_prop, H1975_prop, HCC827_prop)
56 |     )
57 | 
58 |     result %>%
59 |         tibble::add_column(mRNA_amount, .before = TRUE) %>%
60 |         tibble::add_column(truth, .before = TRUE)
61 | }
62 | 
63 | annotated_res <- res %>%
64 |     dplyr::mutate(data_key = paste(data)) %>%
65 |     dplyr::mutate(result = map2(data_key, result, append_anno)) %>%
66 |     dplyr::select(-data_key)
67 | 
68 | plot_df <- tidyr::unnest(annotated_res)
69 | 
70 | plot_df %>%
71 |     ggplot(aes(x = Dim1, y = Dim2, col = truth)) +
72 |     geom_point() +
73 |     facet_wrap(~impute_method, nrow = 2) +
74 |     ggtitle("KNN Smooth Imputation")
75 | 
76 | ```
77 | 
78 | 
79 | 


--------------------------------------------------------------------------------
/inst/prebuilt-doc/ImputeBenchmark.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "ImputeBenchmark"
 3 | author: "Shian Su"
 4 | date: "08/11/2018"
 5 | output: html_document
 6 | ---
 7 | 
 8 | ```{r setup, include=FALSE}
 9 | library(CellBench)
10 | library(dplyr)
11 | library(ggplot2)
12 | library(purrr)
13 | knitr::opts_chunk$set(echo = TRUE)
14 | ```
15 | 
16 | ```{r}
17 | CellBench:::load_mrna_mix_data()
18 | 
19 | set_cellbench_threads(4)
20 | 
21 | data <- list(
22 |     mrna_mix_celseq = cellbench_mrna_mix_data$mrna_mix_celseq
23 |         %>% sample_cells(n = 150)
24 |         %>% sample_genes(n = 2000)
25 | )
26 | 
27 | impute_method <- list(
28 |     scran_drimpute = impute_drimpute,
29 |     basics = impute_basics,
30 |     knn_smooth = impute_knn_smooth
31 | )
32 | 
33 | res1 <- apply_methods(data, impute_method)
34 | 
35 | res1
36 | ```
37 | 
38 | ```{r}
39 | dim_red <- list(
40 |     pca = compute_pca,
41 |     pca_500_most_var = partial(compute_pca_most_var, ngenes = 500),
42 |     pca_300_most_var = partial(compute_pca_most_var, ngenes = 300),
43 |     pca_100_most_var = partial(compute_pca_most_var, ngenes = 100)
44 | )
45 | 
46 | res2 <- res1 %>%
47 |     apply_methods(dim_red)
48 | 
49 | res2
50 | ```
51 | 
52 | ```{r}
53 | mrna_mix_data <- data$mrna_mix_celseq
54 | append_anno <- function(data, result) {
55 |     
56 |     mRNA_amount <- colData(mrna_mix_data)$mRNA_amount
57 |     truth <- with(
58 |         colData(mrna_mix_data),
59 |         paste(H2228_prop, H1975_prop, HCC827_prop)
60 |     )
61 |     
62 |     result %>%
63 |         tibble::add_column(mRNA_amount, .before = TRUE) %>%
64 |         tibble::add_column(truth, .before = TRUE)
65 | }
66 | 
67 | annotated_res <- res2 %>%
68 |    mutate(result = map2(data, result, append_anno))
69 | 
70 | annotated_res
71 | ```
72 | 
73 | ```{r}
74 | plot_df <- tidyr::unnest(annotated_res)
75 | 
76 | plot_df %>%
77 |     ggplot(aes(x = Dim1, y = Dim2, col = truth)) +
78 |     geom_point() +
79 |     facet_grid(dim_red~impute_method, scales = "free") + 
80 |     theme_bw()
81 | ```
82 | 
83 | ```{r}
84 | 
85 | ```
86 | 


--------------------------------------------------------------------------------
/R/memoise.R:
--------------------------------------------------------------------------------
 1 | #' Create a cached function for CellBench
 2 | #'
 3 | #' Take a function and return a cached version. The arguments and results of a
 4 | #' cached method is saved to disk and if the cached function is called again
 5 | #' with the same arguments then the results will be retrieved from the cache
 6 | #' rather than be recomputed.
 7 | #'
 8 | #' \bold{(CAUTION)} Because cached functions called
 9 | #' with the same argument will always return the same output, pseudo-random
10 | #' methods will not return varying results over repeated runs as one might
11 | #' expect.
12 | #'
13 | #' This function is a thin wrapper around \code{\link[memoise]{memoise}}
14 | #'
15 | #' @param f the function to be cached
16 | #' @param cache the cache information (from memoise package)
17 | #'
18 | #' @return function whose results are cached and is called identically to f
19 | #' @export
20 | #'
21 | #' @seealso \code{\link{set_cellbench_cache_path}}
22 | #'
23 | #' @examples
24 | #' # sets cache path to a temporary directory
25 | #' set_cellbench_cache_path(file.path(tempdir(), ".CellBenchCache"))
26 | #' f <- function(x) { x + 1 }
27 | #' cached_f <- cache_method(f)
28 | #'
29 | cache_method <- function(f, cache = getOption("CellBench.cache")) {
30 |     stopifnot(is.function(f))
31 | 
32 |     if (is.null(cache)) {
33 |         stop("CellBench cache path has not been set, please run `set_cellbench_cache_path()`")
34 |     }
35 | 
36 |     memoise::memoise(f, cache = cache)
37 | }
38 | 
39 | #' Clear CellBench Cache
40 | #'
41 | #' Clears the method cache for CellBench
42 | #'
43 | #' @return None
44 | #'
45 | #' @export
46 | #'
47 | #' @examples
48 | #' \dontrun{
49 | #' clear_cellbench_cache()
50 | #' }
51 | #'
52 | clear_cellbench_cache <- function() {
53 |     cache_path <- getOption("CellBench.cache_path")
54 |     if (is.null(cache_path)) {
55 |         return()
56 |     }
57 | 
58 |     files <- file.path(cache_path, dir(cache_path))
59 | 
60 |     if (length(files) > 0) {
61 |         rem <- file.remove(files)
62 |     }
63 | 
64 |     message(glue::glue("{sum(rem)} files removed from cache"))
65 | 
66 |     invisible()
67 | }


--------------------------------------------------------------------------------
/man/apply_methods.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/apply_methods.R
 3 | \name{apply_methods}
 4 | \alias{apply_methods}
 5 | \alias{apply_methods.list}
 6 | \alias{apply_methods.benchmark_tbl}
 7 | \alias{apply_methods.tbl_df}
 8 | \alias{apply_metrics}
 9 | \alias{begin_benchmark}
10 | \title{Apply methods}
11 | \usage{
12 | apply_methods(x, fn_list, name = NULL, suppress.messages = TRUE)
13 | 
14 | \method{apply_methods}{list}(x, fn_list, name = NULL, suppress.messages = TRUE)
15 | 
16 | \method{apply_methods}{benchmark_tbl}(x, fn_list, name = NULL, suppress.messages = TRUE)
17 | 
18 | \method{apply_methods}{tbl_df}(x, fn_list, name = NULL, suppress.messages = TRUE)
19 | 
20 | apply_metrics(x, fn_list, name = NULL, suppress.messages = TRUE)
21 | 
22 | begin_benchmark(x, fn_list, name = NULL, suppress.messages = TRUE)
23 | }
24 | \arguments{
25 | \item{x}{the list of data or benchmark tibble to apply methods to}
26 | 
27 | \item{fn_list}{the list of methods to be applied}
28 | 
29 | \item{name}{(optional) the name of the column for methods applied}
30 | 
31 | \item{suppress.messages}{TRUE if messages from running methods should be
32 | suppressed}
33 | }
34 | \value{
35 | benchmark_tbl object containing results from methods applied, the
36 |   first column is the name of the dataset as factors, middle columns contain
37 |   method names as factors and the final column is a list of results of
38 |   applying the methods.
39 | }
40 | \description{
41 | apply_methods() and its aliases apply_metrics and begin_benchmark take either
42 | lists of datasets or benchmark_tbl objects and applies a list of functions.
43 | The output is a benchmark_tbl where each method has been applied to each
44 | dataset or preceeding result.
45 | }
46 | \examples{
47 | # list of data
48 | datasets <- list(
49 |     set1 = rnorm(500, mean = 2, sd = 1),
50 |     set2 = rnorm(500, mean = 1, sd = 2)
51 | )
52 | 
53 | # list of functions
54 | add_noise <- list(
55 |     none = identity,
56 |     add_bias = function(x) { x + 1 }
57 | )
58 | 
59 | res <- apply_methods(datasets, add_noise)
60 | 
61 | }
62 | \seealso{
63 | \code{\link{time_methods}}
64 | }
65 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | S3method(any_task_errors,benchmark_tbl)
 4 | S3method(apply_methods,benchmark_tbl)
 5 | S3method(apply_methods,list)
 6 | S3method(apply_methods,tbl_df)
 7 | S3method(print,fn_arg_seq)
 8 | S3method(print,task_error)
 9 | S3method(strip_timing,benchmark_timing_tbl)
10 | S3method(summary,benchmark_tbl)
11 | S3method(time_methods,benchmark_timing_tbl)
12 | S3method(time_methods,list)
13 | S3method(unpack_timing,benchmark_timing_tbl)
14 | S3method(vctrs::vec_ptype_abbr,SingleCellExperiment)
15 | S3method(vctrs::vec_ptype_abbr,remote_error)
16 | S3method(vctrs::vec_ptype_abbr,task_error)
17 | export(all_unique)
18 | export(any_task_errors)
19 | export(apply_methods)
20 | export(apply_metrics)
21 | export(arrow_sep)
22 | export(as_pipeline_list)
23 | export(begin_benchmark)
24 | export(cache_method)
25 | export(cellbench_case_study)
26 | export(cellbench_file)
27 | export(clear_cached_datasets)
28 | export(clear_cellbench_cache)
29 | export(collapse_pipeline)
30 | export(data_list)
31 | export(filter_zero_genes)
32 | export(fn_arg_seq)
33 | export(fn_list)
34 | export(is.task_error)
35 | export(is_all_of)
36 | export(is_one_of)
37 | export(keep_high_count_cells)
38 | export(keep_high_count_genes)
39 | export(keep_high_var_genes)
40 | export(load_all_data)
41 | export(load_cell_mix_data)
42 | export(load_mrna_mix_data)
43 | export(load_sc_data)
44 | export(mhead)
45 | export(pipeline_collapse)
46 | export(sample_cells)
47 | export(sample_genes)
48 | export(set_cellbench_bpparam)
49 | export(set_cellbench_cache_path)
50 | export(set_cellbench_threads)
51 | export(split_step)
52 | export(strip_timing)
53 | export(time_methods)
54 | export(unpack_timing)
55 | importClassesFrom(SingleCellExperiment,SingleCellExperiment)
56 | importFrom(BiocFileCache,BiocFileCache)
57 | importFrom(BiocFileCache,bfcadd)
58 | importFrom(BiocFileCache,bfcquery)
59 | importFrom(BiocFileCache,bfcrpath)
60 | importFrom(BiocGenerics,updateObject)
61 | importFrom(BiocParallel,MulticoreParam)
62 | importFrom(BiocParallel,SerialParam)
63 | importFrom(BiocParallel,SnowParam)
64 | importFrom(BiocParallel,bplapply)
65 | importFrom(BiocParallel,bpnworkers)
66 | importFrom(BiocParallel,bptry)
67 | importFrom(assertthat,assert_that)
68 | importFrom(dplyr,mutate)
69 | importFrom(dplyr,select)
70 | importFrom(glue,glue)
71 | importFrom(lubridate,as.duration)
72 | importFrom(lubridate,seconds)
73 | importFrom(magrittr,"%>%")
74 | importFrom(magrittr,extract)
75 | importFrom(magrittr,set_names)
76 | importFrom(methods,is)
77 | importFrom(purrr,map)
78 | importFrom(purrr,map_dbl)
79 | importFrom(rappdirs,user_cache_dir)
80 | importFrom(rlang,":=")
81 | importFrom(rlang,.data)
82 | importFrom(rlang,exprs)
83 | importFrom(stats,setNames)
84 | importFrom(stats,var)
85 | importFrom(tibble,tibble)
86 | importFrom(tidyselect,all_of)
87 | importFrom(utils,browseURL)
88 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
 1 | # CellBench 1.3.2
 2 | 
 3 | ## Bug Fixes
 4 | * apply_methods and time_methods now correctly produces results for custom data.frames whose columns are not sorted from left to right
 5 | 
 6 | # CellBench 1.1.3
 7 | 
 8 | ## Bug Fixes
 9 | * Data loading functions now appear in package index and documentation
10 | 
11 | # CellBench 1.1.2
12 | 
13 | ## Bug Fixes
14 | * Updated make_combinations to work with tidyr 1.0.0
15 | 
16 | ## Modifications
17 | * Updated the WritingWrappers vignette.
18 | * Added a case study precompiled vignette.
19 | 
20 | # CellBench 1.1.1
21 | 
22 | ## New Features
23 | * Added any_task_errors() function to check if any tasks failed in benchmark tibble.
24 | 
25 | # CellBench 0.99.10
26 | * Accepted into Bioconductor.
27 | 
28 | ## New Features
29 | * Added new vignettes for Tidyverse Patterns and Method Wrappers.
30 | 
31 | ## Modifications
32 | * `fn_arg_seq()` now has a `.strict` argument to check if arguments supplied are actually used in the function. Default is `FALSE`, previously this check is always done, but it failed for functions that use methods dispatch.
33 | * `pipeline_collapse()` now has a `data.name` argument for if the name of the dataset should be kept in the pipeline string. Useful if only one dataset is used.
34 | * `arrow_sep()` now uses ascii glyphs (only left and right available) instead of unicode. Unicode arrows fail when ggplots in rmarkdown is compiled into PDF, a common enough use-case for this to be concerning.
35 | 
36 | # CellBench 0.0.7
37 | 
38 | ## Modifications
39 | * purrr version number requirement set to (>= 0.3.0) because of argument name change in `partial()`
40 | * Documentation reorganised to clean up package documentation index.
41 | * Added landing page for `?CellBench`
42 | 
43 | # CellBench 0.0.6
44 | 
45 | ## New Features
46 | 
47 | * Added propagation for errors.
48 | * Added task_error class for errors.
49 | * Added print method for task_error objects.
50 | 
51 | # CellBench 0.0.5
52 | 
53 | ## New Features
54 | * Added "Timing" vignette to explain time_methods.
55 | * Changed apply_methods() to continue on errors and return error object in result column.
56 | 
57 | # CellBench 0.0.4
58 | 
59 | ## Breaking Changes
60 | * Changed `.name` arguments in `time_methods()` and `apply_methods()` to `name`.
61 | 
62 | ## New Features
63 | * Added time_methods function.
64 | * Added set_cellbench_bpparam for more advanced parallelism options.
65 | 
66 | # CellBench 0.0.3
67 | 
68 | ## New Features
69 | * Implemented parallel application of methods to benchmark_tbl, previously only worked for dataset lists.
70 | * Added fn_list constructor.
71 | * Added data_list constructor.
72 | 
73 | # CellBench 0.0.2
74 | 
75 | ## Bug Fixes
76 | * Fixed bug in apply_methods() causing it to fail when more than 1 thread is used.
77 | 
78 | ## Modifications
79 | * Updated introduction vignette to describe multithreading and function caching.
80 | 
81 | # CellBench 0.0.1
82 | 
83 | * Minimal functioning package created.
84 | * Compliant with BiocCheck::BiocCheck() and goodpractice::goodpractice().
85 | 


--------------------------------------------------------------------------------
/R/fn_arg_seq.R:
--------------------------------------------------------------------------------
 1 | #' Create a list of functions with arguments varying over a sequence
 2 | #'
 3 | #' Generate a list of functions where specific arguments have been pre-applied
 4 | #' from a sequences of arguments, i.e. a function f(x, n) may have the 'n'
 5 | #' argument pre-applied with specific values to obtain functions f1(x, n = 1)
 6 | #' and f2(x, n = 2) stored in a list.
 7 | #'
 8 | #' If multiple argument vectors are provided
 9 | #' then the combinations of arguments in the sequences will be generated.
10 | #'
11 | #' @param func function to generate list from
12 | #' @param ... vectors of values to use as arguments
13 | #' @param .strict TRUE if argument names are checked, giving an error if
14 | #'   specified argument does not appear in function signature. Note that
15 | #'   functions with multiple methods generally have only f(x, ...) as their
16 | #'   signature, so the check would fail even if the arguments are passed on.
17 | #'
18 | #' @return list of functions with the specified arguments pre-applied. Names of
19 | #'     the list indicate the values that have been pre-applied.
20 | #'
21 | #' @importFrom stats setNames
22 | #' @export
23 | #'
24 | #' @examples
25 | #' f <- function(x) {
26 | #'     cat("x:", x)
27 | #' }
28 | #'
29 | #' f_list <- fn_arg_seq(f, x = c(1, 2))
30 | #' f_list
31 | #' f_list[[1]]() # x: 1
32 | #' f_list[[2]]() # x: 2
33 | #'
34 | #' g <- function(x, y) {
35 | #'     cat("x:", x, "y:", y)
36 | #' }
37 | #'
38 | #' g_list <- fn_arg_seq(g, x = c(1, 2), y = c(3, 4))
39 | #' g_list
40 | #' g_list[[1]]() # x: 1 y: 3
41 | #' g_list[[2]]() # x: 1 y: 4
42 | #' g_list[[3]]() # x: 2 y: 3
43 | #' g_list[[4]]() # x: 2 y: 4
44 | fn_arg_seq <- function(func, ..., .strict = FALSE) {
45 |     stopifnot(is.function(func))
46 | 
47 |     # get actual function name
48 |     func_name <- deparse(substitute(func))
49 |     args <- list(...)
50 | 
51 |     # find arguments input but not used by func
52 |     names_args <- names(args)
53 |     names_f_args <- names(formals(func))
54 |     invalid_args <- setdiff(names_args, names_f_args)
55 | 
56 |     # stop if there are invalid args
57 |     if (.strict && length(invalid_args) != 0) {
58 |         invalid_args <- glue::glue("'{invalid_args}'") %>% collapse_with_comma()
59 |         stop(glue::glue("args not used in {func_name}: {invalid_args}"))
60 |     }
61 | 
62 |     arg_combs <- do.call(
63 |         purrr::partial(expand.grid, stringsAsFactors = FALSE),
64 |         args
65 |     )
66 |     cnames <- colnames(arg_combs)
67 | 
68 |     rows_as_list <- function(df) {
69 |         row_apply(df, function(x) { as.list(x) %>% setNames(nm = cnames) }) %>%
70 |             purrr::map(function(x) { append(list(".f" = func), x) })
71 |     }
72 | 
73 |     out <- purrr::map(
74 |         rows_as_list(arg_combs),
75 |         function(x) { do.call(purrr::partial, x) }
76 |     )
77 | 
78 |     arg_sigs <- row_apply(
79 |         dplyr::mutate(
80 |             arg_combs,
81 |             dplyr::across(dplyr::everything(), as.character)
82 |         ),
83 |         function(x) {
84 |             paste(glue::glue("{cnames} = {x}"), collapse = ", ")
85 |         }
86 |     )
87 | 
88 |     call_sigs <- glue::glue("{func_name}({arg_sigs})")
89 | 
90 |     names(out) <- call_sigs
91 |     out <- add_class(out, "fn_arg_seq")
92 | 
93 |     out
94 | }
95 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # CellBench
 2 | 
 3 | [![Travis build status](https://travis-ci.org/Shians/CellBench.svg?branch=master)](https://travis-ci.org/Shians/CellBench)
 4 | [![Coverage status](https://codecov.io/gh/Shians/CellBench/branch/master/graph/badge.svg)](https://codecov.io/github/Shians/CellBench?branch=master)
 5 | 
 6 | <img src="https://github.com/Shians/CellBench/raw/master/CellBench.png" width="150" />
 7 | 
 8 | R package for benchmarking single cell analysis methods, primarily inspired by the modelling structure used in [DSC](https://github.com/stephenslab/dsc).
 9 | 
10 | # Installation
11 | 
12 | ```r
13 | if (!require(remotes)) install.packages("remotes")
14 | remotes::install_github("shians/CellBench", ref = "R-3.5", build_opts = c("--no-resave-data", "--no-manual"))
15 | ```
16 | 
17 | # Introduction
18 | 
19 | This package revolves around one object and one function. The `benchmark_tbl` (benchmark [tibble](https://tibble.tidyverse.org)) and the `apply_methods(x, methods)` function.
20 | 
21 | We expect data to to be stored in lists, and we apply functions stored in lists to the data. This creates a `benchmark_tbl` where the names of the lists items are stored as columns and the final column contains the result of the computations.
22 | 
23 | ```r
24 | library(CellBench)
25 | 
26 | sample1 <- data.frame(
27 |     x = matrix(runif(25), nrow = 5, ncol = 5)
28 | )
29 | 
30 | sample2 <- data.frame(
31 |     x = matrix(runif(25), nrow = 5, ncol = 5)
32 | )
33 | 
34 | datasets <- list(
35 |     sample1 = sample1,
36 |     sample2 = sample2
37 | )
38 | 
39 | transform <- list(
40 |     correlation = cor,
41 |     covariance = cov
42 | )
43 | 
44 | datasets %>% apply_methods(transform)
45 | 
46 | ## # A tibble: 4 x 3
47 | ##   data    metric      result       
48 | ##   <fct>   <fct>       <list>       
49 | ## 1 sample1 correlation <dbl [5 × 5]>
50 | ## 2 sample1 covariance  <dbl [5 × 5]>
51 | ## 3 sample2 correlation <dbl [5 × 5]>
52 | ## 4 sample2 covariance  <dbl [5 × 5]>
53 | ```
54 | 
55 | We can additionally chain method applications and this will combinatorially expand our `benchmark_tbl` so that combinations of methods can easily be computed.
56 | 
57 | ```r
58 | metric <- list(
59 |     mean = mean,
60 |     median = median
61 | )
62 | 
63 | datasets %>%
64 |     apply_methods(transform) %>%
65 |     apply_methods(metric)
66 | 
67 | ## # A tibble: 8 x 4
68 | ##   data    transform   metric   result
69 | ##   <fct>   <fct>       <fct>     <dbl>
70 | ## 1 sample1 correlation mean    0.0602 
71 | ## 2 sample1 correlation median -0.0520 
72 | ## 3 sample1 covariance  mean    0.00823
73 | ## 4 sample1 covariance  median -0.00219
74 | ## 5 sample2 correlation mean    0.303  
75 | ## 6 sample2 correlation median  0.482  
76 | ## 7 sample2 covariance  mean    0.0115 
77 | ## 8 sample2 covariance  median  0.0132 
78 | ```
79 | 
80 | The result table is essentially a regular `tibble` and works with all `tidyverse` packages.
81 | 
82 | See
83 | 
84 | ```r
85 | vignette("Introduction", package = "CellBench")
86 | ```
87 | 
88 | for a more detailed introduction and example with biological data.
89 | 
90 | # Features
91 | 
92 | * High compatibility with dplyr and rest of tidyverse, fundamental data object can be used with dplyr verbs
93 | * Multithreading, methods can be applied in parallel
94 | 
95 | # License
96 | 
97 | This package is licensed under GNU General Public License v3.0 (GPL-3.0).
98 | 


--------------------------------------------------------------------------------
/R/set_options.R:
--------------------------------------------------------------------------------
  1 | #' Set number of threads used by CellBench
  2 | #'
  3 | #' Sets global parameter for CellBench to use multiple threads for applying
  4 | #' methods. If any methods applied are multi-threaded then it's recommended to
  5 | #' set CellBench threads to 1. It only recommended to use CellBench with
  6 | #' multiple threads if methods applied can be set to run on single threads.
  7 | #'
  8 | #' @param nthreads the number of threads used by CellBench
  9 | #'
 10 | #' @return None
 11 | #'
 12 | #' @seealso \code{\link{set_cellbench_bpparam}} for more advanced interface
 13 | #'
 14 | #' @export
 15 | #'
 16 | #' @examples
 17 | #' set_cellbench_threads(1) # CellBench runs on a single thread
 18 | #'
 19 | set_cellbench_threads <- function(nthreads = 1) {
 20 |     stopifnot(
 21 |         is.numeric(nthreads),
 22 |         nthreads >= 1
 23 |     )
 24 | 
 25 |     options("CellBench.threads" = nthreads)
 26 | 
 27 |     if (nthreads == 1) {
 28 |         options(
 29 |             "CellBench.bpparam" = BiocParallel::SerialParam(
 30 |                 stop.on.error = FALSE
 31 |             )
 32 |         )
 33 |     } else {
 34 |         if (.Platform$OS.type == "windows") {
 35 |             options(
 36 |                 "CellBench.bpparam" = BiocParallel::SnowParam(
 37 |                     workers = nthreads,
 38 |                     stop.on.error = FALSE
 39 |                 )
 40 |             )
 41 |         } else {
 42 |             options(
 43 |                 "CellBench.bpparam" = BiocParallel::MulticoreParam(
 44 |                     workers = nthreads,
 45 |                     stop.on.error = FALSE
 46 |                 )
 47 |             )
 48 |         }
 49 |     }
 50 | 
 51 |     invisible() # guard against implicit returns
 52 | }
 53 | 
 54 | #' Set BiocParallel parameter used CellBench
 55 | #'
 56 | #' This is a more advanced interface for changing CellBench's parallelism
 57 | #' settings. Internally CellBench uses BiocParallel for parallelism, consult
 58 | #' the documentation of BiocParallel to see what settings are available.
 59 | #'
 60 | #' @param param the BiocParallel parameter object
 61 | #'
 62 | #' @seealso \code{\link{set_cellbench_threads}} for more basic interface
 63 | #'
 64 | #' @return None
 65 | #'
 66 | #' @importFrom BiocParallel bpnworkers
 67 | #' @export
 68 | #'
 69 | #' @examples
 70 | #' set_cellbench_threads(1) # CellBench runs on a single thread
 71 | #'
 72 | set_cellbench_bpparam <- function(param) {
 73 |     stopifnot(is(param, "BiocParallelParam"))
 74 | 
 75 |     options("CellBench.threads" = BiocParallel::bpnworkers(param))
 76 |     options("CellBench.bpparam" = param)
 77 | 
 78 |     invisible() # guard against implicit returns
 79 | }
 80 | 
 81 | #' Set CellBench cache path
 82 | #'
 83 | #' @return None
 84 | #'
 85 | #' @param path the path to where method caches should be stored
 86 | #'
 87 | #' @export
 88 | #'
 89 | #' @seealso \code{\link{cache_method}} for constructing cached methods.
 90 | #'
 91 | #' @examples
 92 | #' \dontrun{
 93 | #' # hidden folder in local path
 94 | #' set_cellbench_cache_path(".CellBenchCache"))
 95 | #' }
 96 | #' # store in temp directory valid for this session
 97 | #' set_cellbench_cache_path(file.path(tempdir(), ".CellBenchCache"))
 98 | #'
 99 | set_cellbench_cache_path <- function(path = "./.CellBenchCache") {
100 |     stopifnot(is.character(path))
101 | 
102 |     options("CellBench.cache" = memoise::cache_filesystem(path = path))
103 |     options("CellBench.cache_path" = path)
104 | 
105 |     invisible() # guard against implicit returns
106 | }
107 | 
108 | 


--------------------------------------------------------------------------------
/R/benchmark_timing_tbl_methods.R:
--------------------------------------------------------------------------------
  1 | #' Strip timing information
  2 | #'
  3 | #' Takes the result of a time_methods() call and remove timing information from
  4 | #' the `timed_result` column, replacing it with a `result` column and converting
  5 | #' it to a benchmark_tbl.
  6 | #'
  7 | #' @param x the benchmark_timing_tbl object
  8 | #'
  9 | #' @return benchmark_tbl
 10 | #'
 11 | #' @seealso \code{\link{unpack_timing}}
 12 | #'
 13 | #' @export
 14 | #'
 15 | #' @examples
 16 | #' \dontrun{
 17 | #' datasets <- list(
 18 | #'     data1 = 1:1e8,
 19 | #' )
 20 | #'
 21 | #' transforms <- list(
 22 | #'     log = log,
 23 | #'     sqrt = sqrt
 24 | #' )
 25 | #'
 26 | #' datasets %>%
 27 | #'     time_methods(transforms) %>%
 28 | #'     strip_timing()
 29 | #' }
 30 | #'
 31 | strip_timing <- function(x) {
 32 |     UseMethod("strip_timing", x)
 33 | }
 34 | 
 35 | #' @rdname strip_timing
 36 | #' @importFrom rlang .data
 37 | #' @export
 38 | #' @keywords internal
 39 | strip_timing.benchmark_timing_tbl <- function(x) {
 40 |     x <- x %>%
 41 |         dplyr::mutate(result = purrr::map(.data$timed_result, function(x) x$result)) %>%
 42 |         dplyr::select(-"timed_result")
 43 | 
 44 |     if (all_length_one(x$result)) {
 45 |         x$result <- unlist(x$result)
 46 |     }
 47 | 
 48 |     x <- drop_class(x, "benchmark_timing_tbl")
 49 |     x <- add_class(x, "benchmark_tbl")
 50 | 
 51 |     x
 52 | }
 53 | 
 54 | 
 55 | #' Unpack timing information
 56 | #'
 57 | #' Takes the result of a time_methods() call and remove the `timed_result`
 58 | #' column, replacing it with three columns of durations representing the
 59 | #' `system`, `user` and `elapsed` times from a system.time() call.
 60 | #'
 61 | #' @param x the benchmark_timing_tbl object
 62 | #'
 63 | #' @return a tibble containing pipeline steps and timing information
 64 | #'
 65 | #' @seealso \code{\link{strip_timing}}
 66 | #'
 67 | #' @export
 68 | #'
 69 | #' @examples
 70 | #' \dontrun{
 71 | #' datasets <- list(
 72 | #'     data1 = c(1, 2, 3)
 73 | #' )
 74 | #'
 75 | #' transforms <- list(
 76 | #'     log = function(x) { Sys.sleep(0.1); log(x) },
 77 | #'     sqrt = function(x) { Sys.sleep(0.1); sqrt(x) }
 78 | #' )
 79 | #'
 80 | #' datasets %>%
 81 | #'     time_methods(transforms) %>%
 82 | #'     unpack_timing()
 83 | #' }
 84 | #'
 85 | unpack_timing <- function(x) {
 86 |     UseMethod("unpack_timing", x)
 87 | }
 88 | 
 89 | #' @rdname unpack_timing
 90 | #' @importFrom dplyr mutate select
 91 | #' @importFrom purrr map map_dbl
 92 | #' @export
 93 | #' @keywords internal
 94 | unpack_timing.benchmark_timing_tbl <- function(x) {
 95 |     extract_timing <- function(x) {
 96 |         if (is(x, "error")) {
 97 |             list(
 98 |                 user = NaN,
 99 |                 system = NaN,
100 |                 elapsed = NaN
101 |             )
102 |         } else {
103 |             x$timing
104 |         }
105 |     }
106 | 
107 |     x %>%
108 |         dplyr::mutate(
109 |             timing = purrr::map(.data$timed_result, extract_timing)
110 |         ) %>%
111 |         dplyr::mutate(
112 |             user = duration_seconds(
113 |                 purrr::map_dbl(.data$timing, function(x) x[["user"]])
114 |             ),
115 |             system = duration_seconds(
116 |                 purrr::map_dbl(.data$timing, function(x) x[["system"]])
117 |             ),
118 |             elapsed = duration_seconds(
119 |                 purrr::map_dbl(.data$timing, function(x) x[["elapsed"]])
120 |             )
121 |         ) %>%
122 |         dplyr::select(-"timing", -"timed_result") %>%
123 |         drop_class("benchmark_timing_tbl")
124 | }
125 | 


--------------------------------------------------------------------------------
/vignettes/Timing.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Timing methods in CellBench"
 3 | author: "Shian Su"
 4 | date: "`r Sys.Date()`"
 5 | output: BiocStyle::html_document
 6 | vignette: >
 7 |   %\VignetteIndexEntry{Timing}
 8 |   %\VignetteEngine{knitr::rmarkdown}
 9 |   %\VignetteEncoding{UTF-8}
10 | ---
11 | 
12 | ```{r, include = FALSE}
13 | library(CellBench)
14 | ```
15 | 
16 | # Introduction
17 | 
18 | CellBench provides the ability to measure the running time of pipelines. This is done using the `time_methods()` function which runs in the same way that `apply_methods()` does, with the difference that it does not run in parallel. This is an intentional design choice because running things in parallel usually results in some competition for computer resource and therefore produces less reliable or stable timings.
19 | 
20 | # Timing methods
21 | 
22 | The setup for timing methods is identical to applying methods. You have a list of data and a list of functions, then you use `time_methods()` instead of `apply_methods()`. 
23 | 
24 | ```{r}
25 | library(CellBench)
26 | 
27 | # wrap a simple vector in a list
28 | datasets <- list(
29 |     data1 = c(1, 2, 3)
30 | )
31 | 
32 | # use Sys.sleep in functions to simulate long-running functions
33 | transform <- list(
34 |     log = function(x) { Sys.sleep(0.1); log(x) },
35 |     sqrt = function(x) { Sys.sleep(0.1); sqrt(x) }
36 | )
37 | 
38 | # time the functions
39 | res <- datasets %>%
40 |     time_methods(transform)
41 | 
42 | res
43 | ```
44 | 
45 | Where we usually have the `result` column we now have `timed_result`, this is a list of two objects: the timing object and the result. It is necessary to keep the result so that we can chain computations together.
46 | 
47 | ```{r}
48 | res$timed_result[[1]]
49 | ```
50 | 
51 | As is the case with `apply_methods()`, more lists of methods can be applied and results will expand out combinatorially. The timings in this case will be cumulative over the methods applied.
52 | 
53 | ```{r}
54 | transform2 <- list(
55 |     plus = function(x) { Sys.sleep(0.1); x + 1 },
56 |     minus = function(x) { Sys.sleep(0.1); x - 1 }
57 | )
58 | 
59 | res2 <- datasets %>%
60 |     time_methods(transform) %>%
61 |     time_methods(transform2)
62 | 
63 | res2
64 | ```
65 | 
66 | The class of results from `time_methods()` is `benchmark_timing_tbl`. Once all methods have been applied, the result may be discarded using `unpack_timing()` and the object can be transformed into a more flat `tbl` representation. See `?proc_time` for an explanation of what `user`, `system` and `elapsed` refer to. 
67 | 
68 | The timing values have been converted to `Duration` objects from the `lubridate` package, these behave as numeric measurements in seconds but have nicer printing properties (try `lubridate::duration(1000, units = "seconds")`).
69 | 
70 | ```{r}
71 | # discard results and expand out timings into columns
72 | res2 %>%
73 |     unpack_timing()
74 | ```
75 | 
76 | Alternatively the timing information can be discarded and a `benchmark_tbl` can be produced using `strip_timing()`.
77 | 
78 | ```{r}
79 | # discard timings and produce benchmark_tbl object
80 | res2 %>%
81 |     strip_timing()
82 | ```
83 | 
84 | # Summary
85 | 
86 | CellBench provides a simple way to measure the running times of pipelines from various combinations of methods. This is done with the `time_methods()` function which is called in the same way as `apply_methods()` and has the same chaining properties. The resultant object can be transformed in two useful ways, as a flat `tibble` with timings expanded out as columns and discarding the results, or as a `benchmark_tbl` with the results as a `list-column` and discarding the timings.
87 | 


--------------------------------------------------------------------------------
/R/utils_filter.R:
--------------------------------------------------------------------------------
  1 | #' Filter out zero count genes
  2 | #'
  3 | #' Remove all genes (rows) where the total count is 0
  4 | #'
  5 | #' @param x the SingleCellExperiment or matrix to filter
  6 | #'
  7 | #' @return object of same type as input with all zero count genes removed
  8 | #' @export
  9 | #'
 10 | #' @examples
 11 | #' x <- matrix(rep(0:5, times = 5), nrow = 6, ncol = 5)
 12 | #' filter_zero_genes(x)
 13 | filter_zero_genes <- function(x) {
 14 |     stopifnot(is_one_of(x, c("SingleCellExperiment", "matrix")))
 15 | 
 16 |     if (is(x, "SingleCellExperiment")) {
 17 |         zero_genes <- rowSums(SingleCellExperiment::counts(x)) == 0
 18 |     } else {
 19 |         zero_genes <- rowSums(x) == 0
 20 |     }
 21 | 
 22 |     x[!zero_genes, ]
 23 | }
 24 | 
 25 | #' Filter down to the highest count genes
 26 | #'
 27 | #' Filter a SingleCellExperiment or matrix down to the genes (rows) with the
 28 | #' highest counts
 29 | #'
 30 | #' @param x the SingleCellExperiment or matrix
 31 | #' @param n the number of highest count genes to keep
 32 | #'
 33 | #' @return object of same type as input containing the highest count genes
 34 | #'
 35 | #' @importFrom magrittr extract
 36 | #' @export
 37 | #'
 38 | #' @examples
 39 | #' data(sample_sce_data)
 40 | #' keep_high_count_genes(sample_sce_data, 300)
 41 | keep_high_count_genes <- function(x, n) {
 42 |     stopifnot(is_one_of(x, c("SingleCellExperiment", "matrix")))
 43 | 
 44 |     if (is(x, "SingleCellExperiment")) {
 45 |         counts <- SingleCellExperiment::counts(x)
 46 |     } else {
 47 |         counts <- x
 48 |     }
 49 | 
 50 |     highest <- rowSums(counts) %>%
 51 |         order(decreasing = TRUE) %>%
 52 |         magrittr::extract(seq_len(n)) %>%
 53 |         sort()
 54 | 
 55 |     x[highest, ]
 56 | }
 57 | 
 58 | #' Filter down to the highest count cells
 59 | #'
 60 | #' Filter a SingleCellExperiment or matrix down to the cells (columns) with the
 61 | #' highest counts
 62 | #'
 63 | #' @param x the SingleCellExperiment or matrix
 64 | #' @param n the number of highest count cells to keep
 65 | #'
 66 | #' @return object of same type as input containing the highest count cells
 67 | #'
 68 | #' @importFrom magrittr extract
 69 | #' @export
 70 | #'
 71 | #' @examples
 72 | #' data(sample_sce_data)
 73 | #' keep_high_count_cells(sample_sce_data, 10)
 74 | keep_high_count_cells <- function(x, n) {
 75 |     stopifnot(is_one_of(x, c("SingleCellExperiment", "matrix")))
 76 | 
 77 |     if (is(x, "SingleCellExperiment")) {
 78 |         counts <- SingleCellExperiment::counts(x)
 79 |     } else {
 80 |         counts <- x
 81 |     }
 82 | 
 83 |     highest <- colSums(counts) %>%
 84 |         order(decreasing = TRUE) %>%
 85 |         magrittr::extract(seq_len(n))  %>%
 86 |         sort()
 87 | 
 88 |     x[, highest]
 89 | }
 90 | 
 91 | #' Filter down to the most variable genes
 92 | #'
 93 | #' Filter a SingleCellExperiment or matrix down to the most variable genes
 94 | #' (rows), variability is determined by var() scaled by the total counts for the
 95 | #' gene.
 96 | #'
 97 | #' @param x the SingleCellExperiment or matrix
 98 | #' @param n the number of most variable genes to keep
 99 | #'
100 | #' @return object of same type as input containing the most variable genes
101 | #'
102 | #' @importFrom stats var
103 | #' @importFrom magrittr extract
104 | #' @export
105 | #'
106 | #' @examples
107 | #' data(sample_sce_data)
108 | #' keep_high_var_genes(sample_sce_data, 50)
109 | keep_high_var_genes <- function(x, n) {
110 |     stopifnot(is_one_of(x, c("SingleCellExperiment", "matrix")))
111 |     stopifnot(n < nrow(x))
112 | 
113 |     if (is(x, "SingleCellExperiment")) {
114 |         counts <- SingleCellExperiment::counts(x)
115 |     } else {
116 |         counts <- x
117 |     }
118 | 
119 |     scaled_var <- row_apply(counts, stats::var) / rowSums(counts)
120 |     highest <- scaled_var %>%
121 |         order(decreasing = TRUE) %>%
122 |         magrittr::extract(seq_len(n))
123 | 
124 |     x[highest, ]
125 | }
126 | 


--------------------------------------------------------------------------------
/vignettes/WritingWrappers.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Writing Wrappers"
  3 | author: "Shian Su"
  4 | date: "`r Sys.Date()`"
  5 | output: BiocStyle::html_document
  6 | vignette: >
  7 |   %\VignetteIndexEntry{Writing Wrappers}
  8 |   %\VignetteEngine{knitr::rmarkdown}
  9 |   %\VignetteEncoding{UTF-8}
 10 | ---
 11 | 
 12 | ```{r, include = FALSE}
 13 | library(CellBench)
 14 | library(purrr)
 15 | ```
 16 | 
 17 | # Introduction
 18 | 
 19 | This vignette will introduce the reason for creating method wrappers and
 20 | conventions to follow. Method wrappers are a way to subtlely alter the behaviour
 21 | of functions from other libraries. CellBench requires that all methods within a
 22 | step accept the same type of input and produce the same type of output, this is
 23 | often not the case for functions of different libraries which perform the same
 24 | task, therefore it is necessary to write code that "wraps" these methods to
 25 | conform to our requirements.
 26 | 
 27 | Examples of wrappers can be found at the following Github repositories
 28 | 
 29 | * [NormCBM](https://github.com/Shians/NormCBM)
 30 | * [ImputeCBM](https://github.com/Shians/ImputeCBM)
 31 | * [ClusterCBM](https://github.com/Shians/ClusterCBM)
 32 | 
 33 | # Wrapper Guidelines
 34 | 
 35 | There are some requirements for wrappers used in CellBench:
 36 | 
 37 | * All method wrappers must be able to run with a single argument. All other
 38 | arguments need to be optional with sensible defaults.
 39 | * All methods of the same pipeline step must accept objects of the same type and
 40 | produce objects of the same type. They should at the very least be the same
 41 | `class()`, also be careful of methods that return raw counts compared to
 42 | normalised counts.
 43 | 
 44 | To ensure flexibility and compatibility of wrappers, the following conventions are recommended:
 45 | 
 46 | * Wrappers take SingleCellExperiment objects.
 47 | * Wrappers should do the minimal amount of work possible. Ideally it does a little bit of data manipulation, runs a meaningful function and post-processes the results slightly. Wrappers should not perform multiple steps of a pipeline.
 48 | 
 49 | # Practical Examples
 50 | 
 51 | ## Simple Wrapper
 52 | 
 53 | Wrappers should only require a single argument, additional arguments should be set to sensible defaults, if more arguments non-defaultable are absolutely necessary then wrappers should take a list as its first argument.
 54 | 
 55 | ```{r}
 56 | # generic skeleton for a wrapper
 57 | wrapper <- function(sce, ...) {
 58 |     stopifnot(is(sce, "SingleCellExperiment"))
 59 |     
 60 |     res <- method_function(sce, ...)
 61 |     
 62 |     return(res)
 63 | }
 64 | ```
 65 | 
 66 | We can write a simple wrapper for `DrImpute()`:
 67 | 
 68 | ```{r}
 69 | # one possible wrapper implmentation
 70 | drimpute_wrapper <- function(sce, ...) {
 71 |     # wrapper only accepts SingleCellExperiment or matrix type objects
 72 |     stopifnot(is(sce, "SingleCellExperiment"))
 73 | 
 74 |     expr <- SingleCellExperiment::normcounts(sce)
 75 |     expr_processed <- DrImpute::preprocessSC(expr)
 76 |     logcounts(sce) <- DrImpute::DrImpute(expr_processed, ...)
 77 | 
 78 |     return(sce)
 79 | }
 80 | ```
 81 | 
 82 | Any argument passed into `...` will be passed onto `DrImpute()`. Sometimes it helps to explicitly name the arguments we may want to change, or limit what we allow to be changed in order to guarantee better consistency.
 83 | 
 84 | ```{r}
 85 | # another possible implementation
 86 | # DrImpute's default ks is 10:15, we can use 5:15 for robustness
 87 | drimpute_wrapper <- function(sce, ks = 5:15, method = c("mean", "med")) {
 88 |     stopifnot(is(sce, "SingleCellExperiment"))
 89 | 
 90 |     expr <- SingleCellExperiment::normcounts(sce)
 91 | 
 92 |     expr_processed <- DrImpute::preprocessSC(expr)
 93 |     method <- match.arg(method)
 94 |     logcounts(sce) <- DrImpute::DrImpute(expr_processed, ks = ks, method = method)
 95 | 
 96 |     return(sce)
 97 | }
 98 | ```
 99 | 
100 | Then we can alter these wrappers on-the-fly using `purrr::partial()`
101 | 
102 | ```{r}
103 | imputation_method <- fn_list(
104 |     dr_impute_mean = purrr::partial(drimpute_wrapper, method = "mean"),
105 |     dr_impute_median = purrr::partial(drimpute_wrapper, method = "med")
106 | )
107 | ```
108 | 
109 | # Final remarks
110 | 
111 | Wrappers for methods should take `SingleCellExperiment` and return `SingleCellExperiments` with results stored in the the appropriate slots of `assays`, `colData` or `rowData`. If the computational results don't fit nicely into these slots the they should be placed in an appropriate property in `metadata`.
112 | 


--------------------------------------------------------------------------------
/R/time_methods.R:
--------------------------------------------------------------------------------
  1 | #' Time methods
  2 | #'
  3 | #' time_methods() take either lists of datasets or benchmark_timing_tbl objects
  4 | #' and applies a list of functions. The output is a benchmark_timing_tbl where
  5 | #' each method has been applied to each dataset or preceding result. Unlike
  6 | #' apply_methods(), time_methods() is always single threaded as to produce fair
  7 | #' and more consistent timings.
  8 | #'
  9 | #' @param x the list of data or benchmark timing tibble to apply methods to
 10 | #' @param fn_list the list of methods to be applied
 11 | #' @param name (optional) the name of the column for methods applied
 12 | #' @param suppress.messages TRUE if messages from running methods should be
 13 | #'   suppressed
 14 | #'
 15 | #' @return benchmark_timing_tbl object containing results from methods applied,
 16 | #'   the first column is the name of the dataset as factors, middle columns
 17 | #'   contain method names as factors and the final column is a list of lists
 18 | #'   containing the results of applying the methods and timings from calling
 19 | #'   system.time().
 20 | #'
 21 | #' @importFrom magrittr %>%
 22 | #' @importFrom BiocParallel SerialParam bplapply bptry
 23 | #'
 24 | #' @seealso \code{\link{apply_methods}}
 25 | #'
 26 | #' @export
 27 | #'
 28 | #' @examples
 29 | #' datasets <- list(
 30 | #'     set1 = 1:1e7
 31 | #' )
 32 | #'
 33 | #' transform <- list(
 34 | #'     sqrt = sqrt,
 35 | #'     log = log
 36 | #' )
 37 | #'
 38 | #' time_methods(datasets, transform) %>%
 39 | #'     unpack_timing() # extract timings out of list
 40 | #'
 41 | time_methods <- function(x, fn_list, name = NULL, suppress.messages = TRUE) {
 42 |     method_names <- names(fn_list)
 43 |     if (length(method_names) != length(fn_list)) {
 44 |         stop("every element of fn_list must be named")
 45 |     }
 46 | 
 47 |     UseMethod("time_methods", x)
 48 | }
 49 | 
 50 | #' @rdname time_methods
 51 | #' @export
 52 | time_methods.list <- function(
 53 |     x,
 54 |     fn_list,
 55 |     name = NULL,
 56 |     suppress.messages = TRUE
 57 | ) {
 58 |     data_names <- names(x)
 59 |     method_names <- names(fn_list)
 60 | 
 61 |     if (is.null(name)) {
 62 |         name <- deparse(substitute(fn_list))
 63 |     }
 64 | 
 65 |     output <- make_combinations(data_names, method_names)
 66 |     colnames(output) <- c("data", name)
 67 | 
 68 |     tasks <- .generate_tasks(output, x, fn_list, name)
 69 | 
 70 |     timed_result <-
 71 |         .bp_try_apply(
 72 |             BPPARAM = BiocParallel::SerialParam(stop.on.error = FALSE),
 73 |             X = tasks,
 74 |             FUN = function(task) {
 75 |                 list(
 76 |                     timing = simple_time(res <- task$method(task$data)),
 77 |                     result = res
 78 |                 )
 79 |             }
 80 |         )
 81 | 
 82 |     output <- .make_output(output, timed_result, name, timed = TRUE)
 83 |     output <- add_class(output, "benchmark_timing_tbl")
 84 | 
 85 |     output
 86 | }
 87 | 
 88 | #' @rdname time_methods
 89 | #' @importFrom rlang .data
 90 | #' @importFrom BiocParallel SerialParam bplapply bptry
 91 | #' @export
 92 | time_methods.benchmark_timing_tbl <- function(
 93 |     x,
 94 |     fn_list,
 95 |     name = NULL,
 96 |     suppress.messages = TRUE
 97 | ) {
 98 |     stopifnot(all_unique(names(fn_list)))
 99 | 
100 |     method_names <- names(fn_list)
101 | 
102 |     if (missing("name")) {
103 |         # get name from variable name
104 |         name <- deparse(substitute(fn_list))
105 |         name <- gsub("methods$", "method", name)
106 |     }
107 | 
108 |     tasks <- list()
109 |     for (data in x$timed_result) {
110 |         for (fn in fn_list) {
111 |             tasks <- append(
112 |                 tasks,
113 |                 list(
114 |                     list(
115 |                         method = fn,
116 |                         data = data$result,
117 |                         timing = data$timing
118 |                     )
119 |                 )
120 |             )
121 |         }
122 |     }
123 | 
124 |     results <-
125 |         .bp_try_apply(
126 |             BPPARAM = BiocParallel::SerialParam(stop.on.error = FALSE),
127 |             X = tasks,
128 |             FUN = function(task) {
129 |                 list(
130 |                     timing = simple_time(res <- task$method(task$data)) + task$timing,
131 |                     result = res
132 |                 )
133 |             }
134 |         )
135 | 
136 |     output <- x %>% dplyr::select(-"timed_result")
137 |     output <- tidyr::crossing(output, factor_no_sort(method_names))
138 |     names(output)[ncol(output)] <- name
139 |     output <- output %>%
140 |         tibble::add_column(timed_result = results)
141 | 
142 |     if (!"benchmark_timing_tbl" %in% class(output)) {
143 |         output <- add_class(output, "benchmark_timing_tbl")
144 |     }
145 | 
146 |     output
147 | }
148 | 


--------------------------------------------------------------------------------
/R/load_data.R:
--------------------------------------------------------------------------------
  1 | #' @importFrom rappdirs user_cache_dir
  2 | #' @importFrom BiocFileCache BiocFileCache
  3 | .get_cache <- function() {
  4 |     cache <- rappdirs::user_cache_dir(appname = "CellBench")
  5 |     BiocFileCache::BiocFileCache(cache)
  6 | }
  7 | 
  8 | #' @importFrom BiocFileCache bfcquery bfcadd bfcrpath
  9 | #' @importFrom glue glue
 10 | #' @importClassesFrom SingleCellExperiment SingleCellExperiment
 11 | get_data <- function(url, filename) {
 12 |     bfc <- .get_cache()
 13 |     rid <- BiocFileCache::bfcquery(bfc, filename, "rname")$rid
 14 | 
 15 |     if (length(rid) == 0) {
 16 |         message(glue::glue("Downloading data file from {url}"))
 17 |         rid <- names(BiocFileCache::bfcadd(bfc, filename, url))
 18 |     }
 19 | 
 20 |     BiocFileCache::bfcrpath(bfc, rids = rid)
 21 | }
 22 | 
 23 | #' @describeIn load_all_data Load single cell data
 24 | #' @importFrom BiocGenerics updateObject
 25 | #' @export
 26 | load_sc_data <- function() {
 27 |     data_path <- get_data(
 28 |         "https://github.com/Shians/scBenchData/raw/master/single_cell_data.RData",
 29 |         "sc_data"
 30 |     )
 31 | 
 32 |     # dummy bindings
 33 |     sc_10x <- NULL
 34 |     sc_celseq <- NULL
 35 |     sc_dropseq <- NULL
 36 | 
 37 |     load(data_path)
 38 | 
 39 |     out <- list(
 40 |         "sc_10x" = updateObject(sc_10x),
 41 |         "sc_celseq" = updateObject(sc_celseq),
 42 |         "sc_dropseq" = updateObject(sc_dropseq)
 43 |     )
 44 | 
 45 |     invisible(out)
 46 | }
 47 | 
 48 | #' @describeIn load_all_data Load cell mixture data
 49 | #' @export
 50 | load_cell_mix_data <- function() {
 51 |     data_path <- get_data(
 52 |         "https://github.com/Shians/scBenchData/raw/master/mix_9cell_data.RData",
 53 |         "mrna_mix_data"
 54 |     )
 55 | 
 56 |     # dummy bindings
 57 |     mix_9cell_07clean_1cell_mat <- NULL
 58 |     mix_9cell_08clean_1cell_mat <- NULL
 59 |     mix_9cell_09clean_1cell_mat <- NULL
 60 |     mix_9cell_07clean_3cell_mat <- NULL
 61 |     mix_9cell_07clean_90cell_mat <- NULL
 62 | 
 63 |     load(data_path)
 64 | 
 65 |     out <- list(
 66 |         "cell_mix1" = mix_9cell_07clean_1cell_mat,
 67 |         "cell_mix2" = mix_9cell_08clean_1cell_mat,
 68 |         "cell_mix3" = mix_9cell_09clean_1cell_mat,
 69 |         "cell_mix4" = mix_9cell_07clean_3cell_mat,
 70 |         "cell_mix5" = mix_9cell_07clean_90cell_mat
 71 |     )
 72 | 
 73 |     invisible(out)
 74 | }
 75 | 
 76 | #' @describeIn load_all_data Load mrna mixture data
 77 | #' @export
 78 | load_mrna_mix_data <- function() {
 79 |     data_path <- get_data(
 80 |         "https://github.com/Shians/scBenchData/raw/master/mrna_mix_data.RData",
 81 |         "cell_mix_data"
 82 |     )
 83 | 
 84 |     # dummy bindings
 85 |     mrna_mix_celseq <- NULL
 86 |     mrna_mix_sortseq <- NULL
 87 | 
 88 |     load(data_path)
 89 | 
 90 |     out <- list(
 91 |         "mrna_mix_celseq" = mrna_mix_celseq,
 92 |         "mrna_mix_sortseq" = mrna_mix_sortseq
 93 |     )
 94 | 
 95 |     invisible(out)
 96 | }
 97 | 
 98 | 
 99 | #' Load CellBench Data
100 | #'
101 | #' Load in all CellBench data described at <https://github.com/LuyiTian/CellBench_data/blob/master/README.md>.
102 | #'
103 | #' @return list of SingleCellExperiment
104 | #' @importFrom BiocGenerics updateObject
105 | #' @export
106 | #'
107 | #' @examples
108 | #' \dontrun{
109 | #' cellbench_file <- load_all_data()
110 | #' }
111 | load_all_data <- function() {
112 |     data_path1 <- get_data(
113 |         "https://github.com/Shians/scBenchData/raw/master/single_cell_data.RData",
114 |         "sc_data"
115 |     )
116 |     data_path2 <- get_data(
117 |         "https://github.com/Shians/scBenchData/raw/master/mrna_mix_data.RData",
118 |         "mrna_mix_data"
119 |     )
120 |     data_path3 <- get_data(
121 |         "https://github.com/Shians/scBenchData/raw/master/mix_9cell_data.RData",
122 |         "cell_mix_data"
123 |     )
124 | 
125 |     # dummy bindings
126 |     sc_10x <- NULL
127 |     sc_celseq <- NULL
128 |     sc_dropseq <- NULL
129 |     mix_9cell_07clean_1cell_mat <- NULL
130 |     mix_9cell_08clean_1cell_mat <- NULL
131 |     mix_9cell_09clean_1cell_mat <- NULL
132 |     mix_9cell_07clean_3cell_mat <- NULL
133 |     mix_9cell_07clean_90cell_mat <- NULL
134 |     mrna_mix_celseq <- NULL
135 |     mrna_mix_sortseq <- NULL
136 | 
137 |     load(data_path1)
138 |     load(data_path2)
139 |     load(data_path3)
140 | 
141 |     out <- list(
142 |         "sc_10x" = updateObject(sc_10x),
143 |         "sc_celseq" = updateObject(sc_celseq),
144 |         "sc_dropseq" = updateObject(sc_dropseq),
145 |         "cell_mix1" = mix_9cell_07clean_1cell_mat,
146 |         "cell_mix2" = mix_9cell_08clean_1cell_mat,
147 |         "cell_mix3" = mix_9cell_09clean_1cell_mat,
148 |         "cell_mix4" = mix_9cell_07clean_3cell_mat,
149 |         "cell_mix5" = mix_9cell_07clean_90cell_mat,
150 |         "mrna_mix_celseq" = mrna_mix_celseq,
151 |         "mrna_mix_sortseq" = mrna_mix_sortseq
152 |     )
153 | 
154 |     invisible(out)
155 | }
156 | 
157 | 
158 | #' Clear cached datasets
159 | #'
160 | #' Delete the datasets cached by the load_*_data set of functions
161 | #'
162 | #' @return None
163 | #'
164 | #' @export
165 | #'
166 | #' @examples
167 | #' \dontrun{
168 | #' clear_cached_datasets()
169 | #' }
170 | clear_cached_datasets <- function() {
171 |     bfc <- .get_cache()
172 | 
173 |     get_query_rid <- function(query) {
174 |         BiocFileCache::bfcquery(bfc, query)$rid
175 |     }
176 | 
177 |     BiocFileCache::bfcremove(bfc, get_query_rid("sc_data"))
178 |     BiocFileCache::bfcremove(bfc, get_query_rid("mrna_mix_data"))
179 |     BiocFileCache::bfcremove(bfc, get_query_rid("cell_mix_data"))
180 | }
181 | 


--------------------------------------------------------------------------------
/tests/testthat/test-utils.R:
--------------------------------------------------------------------------------
  1 | context("Utilities")
  2 | 
  3 | test_that(
  4 |     "Matrix head works", {
  5 |     x <- matrix(runif(100), nrow = 10, ncol = 10)
  6 | 
  7 |     expect_identical(mhead(x), x[1:6, 1:6])
  8 |     expect_identical(mhead(x, n = 10), x)
  9 |     expect_identical(mhead(x, n = 11), x)
 10 | 
 11 |     y <- c(1, 2, 3)
 12 |     expect_error(mhead(y), "!is.null(dim(x)) is not TRUE", fixed = TRUE)
 13 |     expect_error(mhead(x, n = 0), "n > 0 is not TRUE", fixed = TRUE)
 14 |     expect_error(mhead(x, n = "a"), "is.numeric(n) is not TRUE", fixed = TRUE)
 15 | })
 16 | 
 17 | test_that(
 18 |     "Pipeline summarisation works", {
 19 |     methods1 <- list(
 20 |         mean = mean,
 21 |         median = median
 22 |     )
 23 | 
 24 |     methods2 <- list(
 25 |         add1 = function(x) { x+1 },
 26 |         times2 = function(x) { x*2 }
 27 |     )
 28 | 
 29 |     data <- list(
 30 |         data1 = c(1, 2, 3)
 31 |     )
 32 | 
 33 |     expect_identical(
 34 |         structure(
 35 |             list(
 36 |                 pipeline = factor(c(
 37 |                     "data1 » mean » add1", "data1 » mean » times2",
 38 |                     "data1 » median » add1", "data1 » median » times2"
 39 |                 )),
 40 |                 result = c(3, 4, 3, 4)
 41 |             ),
 42 |             row.names = c(NA, -4L),
 43 |             class = c("tbl_df", "tbl", "data.frame")
 44 |         ),
 45 |         data %>%
 46 |             apply_methods(methods1) %>%
 47 |             apply_methods(methods2) %>%
 48 |             pipeline_collapse()
 49 |     )
 50 | 
 51 |     expect_identical(
 52 |         structure(
 53 |             list(
 54 |                 pipeline = factor(c(
 55 |                     "mean » add1", "mean » times2",
 56 |                     "median » add1", "median » times2"
 57 |                 )),
 58 |                 result = c(3, 4, 3, 4)
 59 |             ),
 60 |             row.names = c(NA, -4L),
 61 |             class = c("tbl_df", "tbl", "data.frame")
 62 |         ),
 63 |         data %>%
 64 |             apply_methods(methods1) %>%
 65 |             apply_methods(methods2) %>%
 66 |             pipeline_collapse(data.name = FALSE)
 67 |     )
 68 | })
 69 | 
 70 | test_that(
 71 |     "all_same_class works properly", {
 72 |     x <- list(
 73 |         1, 2, 3
 74 |     )
 75 |     expect_true(all_same_class(x))
 76 | 
 77 |     x <- list(
 78 |         1, 2, "a"
 79 |     )
 80 |     expect_false(all_same_class(x))
 81 | })
 82 | 
 83 | test_that(
 84 |     "class manipulators work properly", {
 85 |     expect_identical(class(add_class(1, "a")), c("a", "numeric"))
 86 |     expect_identical(
 87 |         1, 1 %>% add_class("a") %>% drop_class("a")
 88 |     )
 89 | 
 90 |     expect_identical(
 91 |         1, 1 %>% drop_class("numeric") %>% drop_class("numeric")
 92 |     )
 93 | 
 94 |     expect_identical(
 95 |         1, 1 %>% add_class("numeric")
 96 |     )
 97 | 
 98 |     expect_identical(
 99 |         1, 1 %>% drop_class("a")
100 |     )
101 | })
102 | 
103 | test_that(
104 |     "if_null_then works properly", {
105 |     expect_identical(if_null_then(NULL, 10), 10)
106 |     expect_identical(if_null_then(1, 10), 1)
107 | })
108 | 
109 | test_that(
110 |     "if_empty_then works properly", {
111 |     expect_identical(if_empty_then(NULL, 10), 10)
112 |     expect_identical(if_empty_then(1, 10), 1)
113 | })
114 | 
115 | test_that(
116 |     "make_combinations works properly", {
117 |     x <- factor_no_sort(c("b", "a"))
118 |     y <- factor_no_sort(c("y", "z"))
119 |     z <- factor_no_sort(c("j", "i"))
120 | 
121 |     # explicitly set stringsAsFactors as default changed from TRUE to FALSE in
122 |     # R 4.0.0
123 |     xy_df <- data.frame(x, y)
124 | 
125 |     expect_equal(
126 |         make_combinations(xy_df, z),
127 |         tibble::tibble(
128 |             x = factor_no_sort(c("b", "b", "a", "a")),
129 |             y = factor_no_sort(c("y", "y", "z", "z")),
130 |             z = factor_no_sort(c("j", "i", "j", "i"))
131 |         )
132 |     )
133 | 
134 |     expect_equal(
135 |         make_combinations(horse = xy_df, z),
136 |         tibble::tibble(
137 |             x = factor_no_sort(c("b", "b", "a", "a")),
138 |             y = factor_no_sort(c("y", "y", "z", "z")),
139 |             z = factor_no_sort(c("j", "i", "j", "i"))
140 |         )
141 |     )
142 | 
143 |     expect_equal(
144 |         make_combinations(xy_df, shoe = z),
145 |         tibble::tibble(
146 |             x = factor_no_sort(c("b", "b", "a", "a")),
147 |             y = factor_no_sort(c("y", "y", "z", "z")),
148 |             shoe = factor_no_sort(c("j", "i", "j", "i"))
149 |         )
150 |     )
151 | 
152 |     expect_equal(
153 |         make_combinations(horse = xy_df, shoe = z),
154 |         tibble::tibble(
155 |             x = factor_no_sort(c("b", "b", "a", "a")),
156 |             y = factor_no_sort(c("y", "y", "z", "z")),
157 |             shoe = factor_no_sort(c("j", "i", "j", "i"))
158 |         )
159 |     )
160 | 
161 |     # check make_combination handles unsorted input
162 |     xy_comb_unsrt <- make_combinations(x, y) %>%
163 |         select(y, x)
164 | 
165 |     expect_equal(
166 |         make_combinations(xy_comb_unsrt, z),
167 |         tibble::tibble(
168 |             y = factor(
169 |                 c("y", "y", "y", "y", "z", "z", "z", "z"),
170 |                 levels = c("y", "z")),
171 |             x = factor(
172 |                 c("b", "b", "a", "a", "b", "b", "a", "a"),
173 |                 levels = c("b", "a")),
174 |             z = factor(
175 |                 c("j", "i", "j", "i", "j", "i", "j", "i"),
176 |                 levels = c("j", "i"))
177 |         )
178 |     )
179 | })
180 | 
181 | test_that(
182 |     "infer_names_from_dots works properly", {
183 |     x <- 1
184 |     y <- "a"
185 |     df <- data.frame(
186 |         foo = "foo",
187 |         bar = "bar"
188 |     )
189 | 
190 |     expect_identical(infer_names_from_dots(x, y), c("x", "y"))
191 |     expect_identical(infer_names_from_dots(X = x, y), c("X", "y"))
192 |     expect_identical(infer_names_from_dots(X = x, y, df), c("X", "y", "df"))
193 |     expect_identical(infer_names_from_dots(X = x, y, DF = df), c("X", "y", "DF"))
194 | 
195 |     expect_warning(
196 |         infer_names_from_dots(y = x, y, DF = df),
197 |         "not all names were unique, numbers appended to duplicates"
198 |     )
199 | })
200 | 
201 | test_that(
202 |     "seq utils work", {
203 |     x <- matrix(1, ncol = 10, nrow = 8)
204 | 
205 |     expect_identical(seq_nrow(x), 1:8)
206 |     expect_identical(seq_ncol(x), 1:10)
207 | 
208 |     expect_length(seq_nrow(c(1,2,3)), 0)
209 |     expect_length(seq_ncol(c(1,2,3)), 0)
210 | })
211 | 


--------------------------------------------------------------------------------
/tests/testthat/test-apply_methods.R:
--------------------------------------------------------------------------------
  1 | context("Apply methods")
  2 | 
  3 | test_that(
  4 |     "Apply methods works for most basic case", {
  5 |     x <- list(
  6 |         data = c(1, 2, 3)
  7 |     )
  8 | 
  9 |     f_list <- list(
 10 |         mean = mean
 11 |     )
 12 | 
 13 |     expected <- structure(
 14 |         list(
 15 |             data = structure(1L, .Label = "data", class = "factor"),
 16 |             f_list = structure(1L, .Label = "mean", class = "factor"),
 17 |             result = 2
 18 |         ),
 19 |         row.names = c(NA, -1L),
 20 |         class = c("benchmark_tbl", "tbl_df", "tbl", "data.frame")
 21 |     )
 22 | 
 23 |     expect_identical(apply_methods(x, f_list), expected)
 24 | })
 25 | 
 26 | test_that(
 27 |     "Apply methods works for expanding functions", {
 28 |     x <- list(
 29 |         data = c(1, 2, 3)
 30 |     )
 31 | 
 32 |     f_list <- list(
 33 |         mean = mean,
 34 |         median = median
 35 |     )
 36 | 
 37 |     expected <- structure(
 38 |         list(
 39 |             data = structure(c(1L, 1L), .Label = "data", class = "factor"),
 40 |             f_list = structure(1:2, .Label = c("mean", "median"), class = "factor"),
 41 |             result = c(2, 2)
 42 |         ),
 43 |         row.names = c(NA, -2L),
 44 |         class = c("benchmark_tbl", "tbl_df", "tbl", "data.frame")
 45 |     )
 46 | 
 47 |     expect_identical(apply_methods(x, f_list), expected)
 48 | })
 49 | 
 50 | test_that(
 51 |     "Apply methods works for expanding data", {
 52 |     x <- list(
 53 |         data1 = c(1, 2, 3),
 54 |         data2 = c(1, 2, 3)
 55 |     )
 56 | 
 57 |     f_list <- list(
 58 |         mean = mean
 59 |     )
 60 | 
 61 |     expected <- structure(
 62 |         list(
 63 |             data = structure(1:2, .Label = c("data1", "data2"), class = "factor"),
 64 |             f_list = structure(c(1L, 1L), .Label = "mean", class = "factor"),
 65 |             result = c(2, 2)
 66 |         ),
 67 |         row.names = c(NA, -2L),
 68 |         class = c("benchmark_tbl", "tbl_df", "tbl", "data.frame")
 69 |     )
 70 | 
 71 |     expect_identical(apply_methods(x, f_list), expected)
 72 | })
 73 | 
 74 | test_that(
 75 |     "Apply methods works for expanding data and functions", {
 76 |     x <- list(
 77 |         data1 = c(1, 2, 3),
 78 |         data2 = c(1, 2, 3)
 79 |     )
 80 | 
 81 |     f_list <- list(
 82 |         mean = mean,
 83 |         median = median
 84 |     )
 85 | 
 86 |     expected <- structure(
 87 |         list(
 88 |             data = structure(c(1L, 1L, 2L, 2L), .Label = c("data1", "data2"), class = "factor"),
 89 |             f_list = structure(c(1L, 2L, 1L, 2L), .Label = c("mean", "median"), class = "factor"),
 90 |             result = c(2, 2, 2, 2)
 91 |         ),
 92 |         row.names = c(NA, -4L),
 93 |         class = c("benchmark_tbl", "tbl_df", "tbl", "data.frame")
 94 |     )
 95 | 
 96 |     expect_identical(apply_methods(x, f_list), expected)
 97 | })
 98 | 
 99 | test_that(
100 |     "Apply methods works for chain expanding", {
101 |     x <- list(
102 |         data1 = c(1, 2, 3)
103 |     )
104 | 
105 |     f_list1 <- list(
106 |         mean = mean,
107 |         median = median
108 |     )
109 | 
110 |     f_list2 <- list(
111 |         double = function(x) x * 2,
112 |         add_one = function(x) x + 1
113 |     )
114 | 
115 |     expected <- structure(
116 |         list(
117 |             data = structure(c(1L, 1L, 1L, 1L), .Label = "data1", class = "factor"),
118 |             f_list1 = structure(c(1L, 1L, 2L, 2L), .Label = c("mean", "median"), class = "factor"),
119 |             f_list2 = structure(c(1L, 2L, 1L, 2L), .Label = c("double", "add_one"), class = "factor"),
120 |             result = c(4, 3, 4, 3)
121 |         ),
122 |         row.names = c(NA, -4L),
123 |         class = c("benchmark_tbl", "tbl_df", "tbl", "data.frame")
124 |     )
125 | 
126 |     res <- x %>%
127 |         apply_methods(f_list1) %>%
128 |         apply_methods(f_list2)
129 | 
130 |     expect_identical(res, expected)
131 | })
132 | 
133 | test_that(
134 |     "Errors are properly reported", {
135 |     data_list <- list(
136 |         x = 1:5
137 |     )
138 | 
139 |     method_list <- list(
140 |         mean
141 |     )
142 | 
143 |     expect_error(
144 |         apply_methods(data_list, method_list),
145 |         "every element of fn_list must be named"
146 |     )
147 | 
148 |     expect_error(
149 |         apply_methods(list(1:5), list(mean = mean)),
150 |         "every element of x must be named"
151 |     )
152 | })
153 | 
154 | test_that(
155 |     "Apply methods works for expanding functions", {
156 |     x <- list(
157 |         data = c(1, 2, 3)
158 |     )
159 | 
160 |     f_list <- list(
161 |         mean = mean,
162 |         median = median
163 |     )
164 | 
165 |     expected <- structure(
166 |         list(
167 |             data = structure(c(1L, 1L), .Label = "data", class = "factor"),
168 |             f_list = structure(1:2, .Label = c("mean", "median"), class = "factor"),
169 |             result = c(2, 2)
170 |         ),
171 |         row.names = c(NA, -2L),
172 |         class = c("benchmark_tbl", "tbl_df", "tbl", "data.frame")
173 |     )
174 | 
175 |     expect_identical(apply_methods(x, f_list), expected)
176 | })
177 | 
178 | test_that(
179 |     "Multithreading works", {
180 | 
181 |     x <- list(
182 |         data1 = c(1, 2, 3),
183 |         data2 = c(1, 2, 3)
184 |     )
185 | 
186 |     f_list <- list(
187 |         mean = mean,
188 |         median = median
189 |     )
190 | 
191 |     set_cellbench_threads(2)
192 |     res <- apply_methods(x, f_list)
193 | 
194 |     set_cellbench_threads(1)
195 |     expected <- apply_methods(x, f_list)
196 | 
197 |     expect_identical(res, expected)
198 | 
199 |     f_list2 <- fn_list(
200 |         f1 = function(x) x + 1
201 |     )
202 | 
203 |     set_cellbench_threads(2)
204 |     res <- x %>%
205 |         apply_methods(f_list) %>%
206 |         apply_methods(f_list2)
207 | 
208 |     set_cellbench_threads(1)
209 |     expected <- x %>%
210 |         apply_methods(f_list) %>%
211 |         apply_methods(f_list2)
212 | 
213 |     expect_identical(res, expected)
214 | })
215 | 
216 | test_that(
217 |     "Error propagation works", {
218 |     x <- list(
219 |         data1 = 1,
220 |         data2 = "a"
221 |     )
222 | 
223 |     method <- list(
224 |         log = log
225 |     )
226 | 
227 |     output <- apply_methods(x, method)
228 |     expect_is(output$result[[2]], "task_error")
229 |     expect_identical(output$result[[2]]$error_location, "method")
230 | 
231 |     method2 <- list(
232 |         sqrt = sqrt
233 |     )
234 | 
235 |     output2 <- apply_methods(output, method2)
236 |     expect_is(output2$result[[2]], "task_error")
237 |     expect_identical(output2$result[[2]]$error_location, "method")
238 | })
239 | 


--------------------------------------------------------------------------------
/R/apply_methods.R:
--------------------------------------------------------------------------------
  1 | #' Apply methods
  2 | #'
  3 | #' apply_methods() and its aliases apply_metrics and begin_benchmark take either
  4 | #' lists of datasets or benchmark_tbl objects and applies a list of functions.
  5 | #' The output is a benchmark_tbl where each method has been applied to each
  6 | #' dataset or preceeding result.
  7 | #'
  8 | #' @param x the list of data or benchmark tibble to apply methods to
  9 | #' @param fn_list the list of methods to be applied
 10 | #' @param name (optional) the name of the column for methods applied
 11 | #' @param suppress.messages TRUE if messages from running methods should be
 12 | #'   suppressed
 13 | #'
 14 | #' @return benchmark_tbl object containing results from methods applied, the
 15 | #'   first column is the name of the dataset as factors, middle columns contain
 16 | #'   method names as factors and the final column is a list of results of
 17 | #'   applying the methods.
 18 | #'
 19 | #' @importFrom magrittr %>%
 20 | #' @importFrom tidyselect all_of
 21 | #'
 22 | #' @seealso \code{\link{time_methods}}
 23 | #'
 24 | #' @export
 25 | #'
 26 | #' @examples
 27 | #' # list of data
 28 | #' datasets <- list(
 29 | #'     set1 = rnorm(500, mean = 2, sd = 1),
 30 | #'     set2 = rnorm(500, mean = 1, sd = 2)
 31 | #' )
 32 | #'
 33 | #' # list of functions
 34 | #' add_noise <- list(
 35 | #'     none = identity,
 36 | #'     add_bias = function(x) { x + 1 }
 37 | #' )
 38 | #'
 39 | #' res <- apply_methods(datasets, add_noise)
 40 | #'
 41 | apply_methods <- function(x, fn_list, name = NULL, suppress.messages = TRUE) {
 42 |     method_names <- names(fn_list)
 43 |     if (length(method_names) != length(fn_list)) {
 44 |         stop("every element of fn_list must be named")
 45 |     }
 46 | 
 47 |     UseMethod("apply_methods", x)
 48 | }
 49 | 
 50 | #' @rdname apply_methods
 51 | #' @importFrom BiocParallel SnowParam MulticoreParam
 52 | #' @importFrom tibble tibble
 53 | #' @export
 54 | apply_methods.list <- function(
 55 |     x,
 56 |     fn_list,
 57 |     name = NULL,
 58 |     suppress.messages = TRUE
 59 | ) {
 60 |     data_names <- names(x)
 61 |     if (length(data_names) != length(x)) {
 62 |         stop("every element of x must be named")
 63 |     }
 64 | 
 65 |     method_names <- names(fn_list)
 66 | 
 67 |     if (is.null(name)) {
 68 |         name <- deparse(substitute(fn_list))
 69 |         name <- gsub("methods$", "method", name)
 70 |     }
 71 | 
 72 |     multithread_param <- getOption("CellBench.bpparam")
 73 | 
 74 |     output <- make_combinations(data_names, method_names) %>%
 75 |         magrittr::set_colnames(c("data", name))
 76 | 
 77 |     tasks <- .generate_tasks(output, x, fn_list, name)
 78 | 
 79 |     result <- .bp_try_apply(
 80 |         BPPARAM = multithread_param,
 81 |         X = tasks,
 82 |         suppress.messages = suppress.messages,
 83 |         FUN = function(task, suppress.messages) {
 84 |             suppressMsgAndPrint(
 85 |                 task$method(task$data),
 86 |                 suppress = suppress.messages
 87 |             )
 88 |         }
 89 |     )
 90 | 
 91 |     result <- purrr::map(
 92 |         result,
 93 |         name = name,
 94 |         function(res, name) {
 95 |             if (is.error(res) && is.null(res$error_location)) {
 96 |                 res$error_location <- name
 97 |                 res <- add_class(res, "error")
 98 |                 res <- add_class(res, "task_error")
 99 |             }
100 |             res
101 |         }
102 |     )
103 | 
104 |     output <- .make_output(output, result, name)
105 |     output <- add_class(output, "benchmark_tbl")
106 | 
107 |     output
108 | }
109 | 
110 | #' @rdname apply_methods
111 | #' @importFrom rlang .data
112 | #' @export
113 | apply_methods.benchmark_tbl <- function(
114 |     x,
115 |     fn_list,
116 |     name = NULL,
117 |     suppress.messages = TRUE
118 | ) {
119 |     stopifnot(all_unique(names(fn_list)))
120 | 
121 |     method_names <- names(fn_list)
122 | 
123 |     if (missing("name")) {
124 |         # get name from variable name
125 |         name <- deparse(substitute(fn_list))
126 |     }
127 | 
128 |     multithread_param <- getOption("CellBench.bpparam")
129 | 
130 |     # sort columns from left to right, for when users input unsorted data
131 |     # otherwise order will not match tidyr::crossing
132 |     methods_columns <- names(x)[-ncol(x)]
133 |     x <- dplyr::arrange_at(x, dplyr::vars(tidyselect::all_of(methods_columns)))
134 | 
135 |     tasks <- list()
136 |     for (data in x$result) {
137 |         for (fn in fn_list) {
138 |             tasks <- append(
139 |                 tasks,
140 |                 list(list(method = fn, data = data))
141 |             )
142 |         }
143 |     }
144 | 
145 |     results <- .bp_try_apply(
146 |         BPPARAM = multithread_param,
147 |         X = tasks,
148 |         suppress.messages = suppress.messages,
149 |         FUN = function(task, suppress.messages) {
150 |             if (is.error(task$data)) {
151 |                 task$data
152 |             } else {
153 |                 suppressMsgAndPrint(
154 |                     task$method(task$data),
155 |                     suppress = suppress.messages
156 |                 )
157 |             }
158 |         }
159 |     )
160 | 
161 |     results <- purrr::map(
162 |         results,
163 |         name = name,
164 |         function(res, name) {
165 |             if (is.error(res) && is.null(res$error_location)) {
166 |                 res$error_location <- name
167 |                 res <- add_class(res, "task_error")
168 |             }
169 |             res
170 |         }
171 |     )
172 | 
173 |     output <- x %>% dplyr::select(-"result")
174 |     output <- tidyr::crossing(output, factor_no_sort(method_names))
175 |     names(output)[ncol(output)] <- name
176 |     output <- output %>%
177 |         tibble::add_column(result = results)
178 | 
179 |     if (all_length_one(output$result)) {
180 |         output$result <- unlist(output$result)
181 |     }
182 | 
183 | 
184 |     output <- add_class(output, "benchmark_tbl")
185 | 
186 |     output
187 | }
188 | 
189 | #' @rdname apply_methods
190 | #' @export
191 | apply_methods.tbl_df <- apply_methods.benchmark_tbl
192 | 
193 | #' @rdname apply_methods
194 | #'
195 | #' @export
196 | apply_metrics <- apply_methods
197 | 
198 | #' @rdname apply_methods
199 | #'
200 | #' @export
201 | begin_benchmark <- apply_methods
202 | 
203 | # wrapper for task generation
204 | .generate_tasks <- function(output_tbl, x, fn_list, name) {
205 |     purrr::map2(
206 |         output_tbl$data,
207 |         output_tbl[[name]],
208 |         function(dname, fname) {
209 |             list(
210 |                 data = x[[dname]],
211 |                 method = fn_list[[fname]]
212 |             )
213 |         }
214 |     )
215 | }
216 | 
217 | # wrapper for bptry-bplapply pattern
218 | .bp_try_apply <- function(...) {
219 |     BiocParallel::bptry(
220 |         BiocParallel::bplapply(
221 |             ...
222 |         )
223 |     )
224 | }
225 | 
226 | # assemble output
227 | .make_output <- function(output, result, name, timed = FALSE) {
228 |     output <- tibble::as_tibble(output)
229 | 
230 |     if (timed) {
231 |         output <- tibble::add_column(output, timed_result = result)
232 |     } else {
233 |         output <- tibble::add_column(output, result = result)
234 |         if (all_length_one(output$result)) {
235 |             output$result <- unlist(output$result)
236 |         }
237 |     }
238 | 
239 |     output$data <- factor_no_sort(output$data)
240 |     output[[name]] <- factor_no_sort(output[[name]])
241 | 
242 |     output
243 | }
244 | 


--------------------------------------------------------------------------------
/inst/prebuilt-doc/ParameterTesting.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Use case: Benchmarking method parameters"
  3 | author: "Shian Su"
  4 | date: "`r Sys.Date()`"
  5 | output: BiocStyle::pdf_document
  6 | vignette: >
  7 |   %\VignetteIndexEntry{Vignette Title}
  8 |   %\VignetteEngine{knitr::rmarkdown}
  9 |   %\VignetteEncoding{UTF-8}
 10 | ---
 11 | 
 12 | ```{r, include = FALSE}
 13 | library(CellBench)
 14 | library(SingleCellExperiment)
 15 | library(dplyr)
 16 | library(purrr)
 17 | library(ggplot2)
 18 | set_cellbench_threads(4)
 19 | ```
 20 | 
 21 | # Introduction
 22 | 
 23 | This use case shows how to test a range of parameters for a given method. We will use the CelSeq2 mRNA mixture data from 3 lung adenocarcinoma cell-lines and apply knn-smooth with various `k` parameter to see its effects on the output.
 24 | 
 25 | # Setting up benchmark
 26 | 
 27 | ```{r}
 28 | library(CellBench)
 29 | library(SingleCellExperiment)
 30 | library(dplyr)
 31 | library(purrr)
 32 | library(ggplot2)
 33 | ```
 34 | 
 35 | We load in the data and create a list of 1 SingleCellExperiment object.
 36 | 
 37 | ```{r}
 38 | cellbench_mrna_mix_data <- load_mrna_mix_data()
 39 | 
 40 | data_list <- list(
 41 |     mrna_mix_celseq = cellbench_mrna_mix_data$mrna_mix_celseq
 42 | )
 43 | ```
 44 | 
 45 | We need to write some small wrappers to help run pipelines and make methods uniform in input and output. This is necessary because each step of analysis should take in the same type of data and output the same type of data, however different methods may differ in how they are called, how many steps need to run and what they output. Wrappers help manage this, in this example we want our normalisation step to take in a SingleCellExperiment and output a normalised count matrix. The imputation step should take a count matrix and return an imputed counts matrix.
 46 | 
 47 | ```{r}
 48 | # take in a SingleCellExperiment and return a scran normalised 
 49 | # expression matrix
 50 | scran_norm_expr <- function(x) {
 51 |     stopifnot(is(x, "SingleCellExperiment"))
 52 | 
 53 |     x <- scran::computeSumFactors(x)
 54 |     x <- scater::normalize(x, return_log = FALSE)
 55 | 
 56 |     SingleCellExperiment::normcounts(x)
 57 | }
 58 | 
 59 | # take in an expression matrix and return the imputed expression matrix
 60 | impute_knn_smooth <- function(expr, k) {
 61 |     source("https://raw.github.com/yanailab/knn-smoothing/master/knn_smooth.R")
 62 |     smoothed_mat <- knn_smoothing(mat = expr, k = k)
 63 |     smoothed_mat
 64 | }
 65 | ```
 66 | 
 67 | We then create the lists of functions to use with CellBench. We only have one normalisation method, but for imputation we can create a series of partially applied functions with different `k` parameters. Here assuming we have `f(x, y)`, `partial(f, y = 1)` is equivalent to `function(x) f(x, y = 1)`, partial application "fills in" parameter values and returns a function that can be called.
 68 | 
 69 | ```{r}
 70 | norm_method <- list(
 71 |     scran = scran_norm_expr
 72 | )
 73 | 
 74 | # identity simply returns its argument, here it's used to represent 
 75 | # no imputation
 76 | impute_method <- list(
 77 |     "none" = identity,
 78 |     "impute_knn_smooth(k = 2)" = partial(impute_knn_smooth, k = 2),
 79 |     "impute_knn_smooth(k = 4)" = partial(impute_knn_smooth, k = 4),
 80 |     "impute_knn_smooth(k = 8)" = partial(impute_knn_smooth, k = 8),
 81 |     "impute_knn_smooth(k = 16)" = partial(impute_knn_smooth, k = 16),
 82 |     "impute_knn_smooth(k = 32)" = partial(impute_knn_smooth, k = 32)
 83 | )
 84 | ```
 85 | 
 86 | First we apply the normalisation method we can store the result of this in `res_norm` and print it.
 87 | 
 88 | ```{r}
 89 | res_norm <- data_list %>%
 90 |     apply_methods(norm_method)
 91 | 
 92 | res_norm
 93 | ```
 94 | 
 95 | We see that it's created a tibble where the column name is taken from the name of the lists and values in the columns correspond to names within the lists. The result of the computation performed is stored in the last column. If we had another normalisation method to test then we could go back and add it to the `norm_method` list and run this code again.
 96 | 
 97 | We can then apply our next set of methods to the results of the normalisation. This expands our table to more rows, keeping track of the combinations of `data`, `norm_method` and `impute_method`, as well as updating the results with the latest methods applied.
 98 | 
 99 | ```{r}
100 | res_impute <- res_norm %>%
101 |     apply_methods(impute_method)
102 | 
103 | res_impute
104 | ```
105 | 
106 | Now we have the computation output of all the pipelines of methods, each producing a different imputed expression in the result column. From here we could calculate some metrics, in this instance we will create principal component plots of the imputed values for a visual assessment.
107 | 
108 | We create a new method list containing a PCA transformation that returns a data.frame of two columns containing the principal component coordinates. We could add more dimensionality reduction methods into our list later on to look at it in different ways. We also transform our counts into log-scale before we perform PCA for more stable scaling.
109 | 
110 | ```{r}
111 | dim_red <- list(
112 |     pca = compute_pca
113 | )
114 | 
115 | # log-transform the counts
116 | res_impute$result <- lapply(res_impute$result, function(x) log2(x + 1))
117 | 
118 | res <- res_impute %>%
119 |     apply_methods(dim_red)
120 | 
121 | res
122 | ```
123 | 
124 | We now have a benchmark_tbl where the results column contains two PCA coordinates. If we plot the PCA values in the results column as is, we will have a bunch of points on a plot with no additional context. We actually have annotation for these cells in terms of mixture proportions so we can append it to our results and create a more informative visualisation. This could be done using a for-loop where you iterate through the results and reassign it with the annotated version, but I will introduce a functional programming approach to this task.
125 | 
126 | We define a function that takes in a data_key, the name of the dataset and the result data.frame. This function will add a new column to the result data.frame called "truth" which is the concatenated proportion values, so each combination of mixutre proportions is a unique group. Because we need information from both data and result columns, we use `map2` from `purrr` which allows use to map functions with two arguments to two columns.
127 | 
128 | ```{r}
129 | # function to add annotation to results table
130 | # we could have used a single variable because we're only looking at one dataset
131 | # but this gives us more flexibility if we want to add more datasets to our list
132 | append_anno <- function(data_key, result) {
133 |     # extract the desired SingleCellExperiment from our data list
134 |     data <- data_list[[data_key]]
135 |     
136 |     # take the sample annotations from colData
137 |     sample_anno <- colData(data)
138 | 
139 |     # create the truth values by concatenating cell line proportions
140 |     truth <- paste(
141 |         sample_anno$H2228_prop,
142 |         sample_anno$H1975_prop,
143 |         sample_anno$HCC827_prop
144 |     )
145 |     
146 |     # add truth values as a column to result and return the result
147 |     result$truth <- truth
148 |     result
149 | }
150 | 
151 | # replace the result column with annotated results
152 | annotated_res_list <- map2(res$data, res$result, append_anno)
153 | 
154 | annotated_res <- res %>%
155 |     dplyr::mutate(result = annotated_res_list)
156 | 
157 | annotated_res
158 | ```
159 | 
160 | There's now a new column in the results data.frames containing the "truth" mixture groups. We want to plot all of these PCA values, it's possible to use a for-loop to iterate through the columns and call many plots, but here we can use `unnest` from `tidy` to get a nice tabular format that allows us to use ggplots. The data.frames in the result column are unnested in that they are combined row-wise, and the remaining columns are duplicated to match the new structure. For this to work the data.frames in the result column must have the same columns.
161 | 
162 | ```{r}
163 | plot_df <- tidyr::unnest(annotated_res)
164 | 
165 | plot_df
166 | ```
167 | 
168 | With this tidy data.frame it is easy to use ggplots to visualise the PCA, colouring the points by the truth group. We use `facet_wrap` to create a facetted plot so that each plot has a common scale, the impute_method is used as the facetting variable so each plot shows a different imputation.
169 | 
170 | ```{r}
171 | plot_df %>%
172 |     ggplot(aes(x = Dim1, y = Dim2, col = truth)) +
173 |     geom_point() +
174 |     facet_wrap(~impute_method, nrow = 2) +
175 |     ggtitle("KNN Smooth Imputation")
176 | ```
177 | 
178 | We see that performing PCA on the unimputed data already separates the groups out, but not quite in the structure we expected and with quite a bit of spread within the groups. It appears that increasing the k parameter in the knn-smooth algorithm will pull the points toward the designed structure reflecting the mrna proportions. Though it appears if k is set too high then points will be pulled into almost singular points and potentially lose much of their true variability.
179 | 
180 | # Summary
181 | 
182 | We have seen an application of CellBench to test various parameters values in the knn-smooth algorithm. Multiple steps of methods were applied to the original data and at each step it's possible to store the result and print it in an easy to interpret form. The main complexity is in writing the wrappers, once wrappers for methods are written they are simply placed into lists and applied to data or upstream benchmark_tbls. With the wrappers and annotation function written, we could have compressed this entire example into the following code.
183 | 
184 | ```{r}
185 | res <- data_list %>%
186 |     apply_methods(norm_method) %>%
187 |     apply_methods(impute_method) %>%
188 |     mutate(result = lapply(result, function(x) log2(x + 1))) %>%
189 |     apply_methods(dim_red) %>%
190 |     mutate(result = map2(data, result, append_anno))
191 | 
192 | res
193 | 
194 | plot_df <- tidyr::unnest(res)
195 | plot_df %>%
196 |     ggplot(aes(x = Dim1, y = Dim2, col = truth)) +
197 |     geom_point() +
198 |     facet_wrap(~impute_method, nrow = 2) +
199 |     ggtitle("KNN Smooth Imputation")
200 | ```
201 | 
202 | Then we could easily add new data or methods to the pipeline by modifying our methods lists.
203 | 


--------------------------------------------------------------------------------
/vignettes/DataManipulation.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Benchmark Data Manipulation"
  3 | author: "Shian Su"
  4 | date: "`r Sys.Date()`"
  5 | output: BiocStyle::html_document
  6 | vignette: >
  7 |   %\VignetteIndexEntry{Data Manipulation}
  8 |   %\VignetteEngine{knitr::rmarkdown}
  9 |   %\VignetteEncoding{UTF-8}
 10 | ---
 11 | 
 12 | ```{r, include = FALSE}
 13 | library(CellBench)
 14 | library(dplyr)
 15 | library(purrr)
 16 | ```
 17 | 
 18 | # Introduction
 19 | 
 20 | This vignette contains some examples of common manipulations of data objects in this package. This package is built around the tidy data ideas established by Hadley Wickham's [tidyverse](https://www.tidyverse.org), the primary goals are to keep data in an organised manner and enable concise manipulations for achieving a wide range of outcomes.
 21 | 
 22 | # Benchmark Tibble
 23 | 
 24 | ## Basics
 25 | 
 26 | The fundamental object in this package is the benchmark tibble. [Tibbles](https://cran.r-project.org/web/packages/tibble/vignettes/tibble.html) are variant of the `data.frame`, they are used here for their nicer printing properties.
 27 | 
 28 | The benchmark tibble is a `data.frame` structure where the `result` column is a special _list-column_ which is allowed to contain arbitrary data types compared to a regular column which may only contain [atomic data types](https://cran.r-project.org/doc/manuals/r-release/R-lang.html#Vector-objects) such as `numeric`, `logical` and `character`. A _list-column_ is a list with the same number of elements as there are rows in the data frame, they exist because the typical column is a vector which cannot contain complicated data types. However if the results of a computation are simple atomic values, then the column will be coerced to a regular vector with all the expected behaviours.
 29 | 
 30 | **IMPORTANT!** Because the result column is a list, care must be taken when performing certain operations. Most vectorised operations do not work on lists, and when we will cover how to properly work with these columns in the [Operations On Benchmark Tibbles](#ops-on-benchmark-tibbles) section.
 31 | 
 32 | We demonstrate how the benchmark tibble works:
 33 | 
 34 | ```{r}
 35 | library(CellBench)
 36 | datasets <- list(
 37 |     random_mat1 = matrix(runif(100), 10, 10),
 38 |     random_mat2 = matrix(runif(100), 10, 10)
 39 | )
 40 | 
 41 | cor_method <- list(
 42 |     pearson = function(x) { cor(x, method = "pearson") },
 43 |     kendall = function(x) { cor(x, method = "kendall") }
 44 | )
 45 | 
 46 | res <- datasets %>%
 47 |     apply_methods(cor_method)
 48 | ```
 49 | 
 50 | As we can see, the table contains the data used, methods applied and the result of the computation. The reason for using tibbles is so that the result column is printed in a summarised form rather than fully expanded as would be the case for non-tibble list-columns.
 51 | 
 52 | ## Operations On Benchmark Tibbles {#ops-on-benchmark-tibbles}
 53 | 
 54 | ```{r}
 55 | class(res)
 56 | ```
 57 | 
 58 | The benchmark tibble inherits from tibbles which inherit from data.frame, so operations expected to work on the parent classes should be expected to work on the benchmark tibble.
 59 | 
 60 | ```{r}
 61 | res[1:2, ]
 62 | ```
 63 | 
 64 | By default tibbles only print the first 10 rows, this doesn't change with how many elements you subset. Instead you should use `print(res, n = Inf)` if you wish to print the whole tibble, or a desired number of rows.
 65 | 
 66 | We can also make use of the `dplyr` functions along with piping to write concise expressions for manipulating the benchmark tibble.
 67 | 
 68 | ```{r}
 69 | library(dplyr)
 70 | res %>%
 71 |     filter(cor_method == "pearson")
 72 | ```
 73 | 
 74 | It is also possible to cbind two benchmark tibbles together, for example if you had added another set of methods 
 75 | 
 76 | ```{r}
 77 | cor_method <- list(
 78 |     spearman = function(x) cor(x, method = "spearman")
 79 | )
 80 | 
 81 | res2 <- datasets %>%
 82 |     apply_methods(cor_method)
 83 | 
 84 | res2
 85 | ```
 86 | 
 87 | ```{r}
 88 | rbind(res, res2)
 89 | ```
 90 | 
 91 | This allows new methods to be added without having to recompute results for old methods.
 92 | 
 93 | ## Operations On list-columns
 94 | 
 95 | We note again that the benchmark column is a list.
 96 | 
 97 | ```{r}
 98 | class(res$result)
 99 | ```
100 | 
101 | This means some simple vectorised functions will not quite work as expected. For example if we wished to take the exponential of all the matrices using `dplyr::mutate()`. Because `dplyr` feeds entire columns into the functions and expects the entire column to be returned, the result of the following code will attempt to run `exp()` on a `list` which it cannot handle.
102 | 
103 | ```{r, eval = FALSE}
104 | # this code will fail
105 | res %>%
106 |     mutate(exp_result = exp(result))
107 | ```
108 | 
109 | Instead we must reformulate these to expressions that take in list arguments and return lists or vectors of the same length. This can be done using either `lapply` from the base R library or `map` from the `purrr` package.
110 | 
111 | ```{r}
112 | res %>%
113 |     mutate(exp_result = lapply(result, exp)) %>%
114 |     mutate(sum_of_exp = unlist(lapply(exp_result, sum)))
115 | ```
116 | 
117 | ## Unnesting with Lists of data.frames
118 | 
119 | One of the most useful representations that can be created in the tibble framework is to have data frames with consistent columns as the `result` list-column. This allows the data to be unnested such that the contents of the result data frames are row-contenated and the information in the remaining rows are duplicated accordingly.
120 | 
121 | ```{r}
122 | library(tibble)
123 | 
124 | df1 <- data.frame(
125 |     little = c(1, 3),
126 |     big = c(5, 7)
127 | )
128 | 
129 | df1
130 | ```
131 | 
132 | ```{r}
133 | df2 <- data.frame(
134 |     little = c(2, 4),
135 |     big = c(6, 8)
136 | )
137 | 
138 | df2
139 | ```
140 | 
141 | ```{r}
142 | tbl <- tibble(
143 |     type = c("odds", "evens"),
144 |     values = list(df1, df2)
145 | )
146 | 
147 | tbl
148 | ```
149 | 
150 | ```{r}
151 | tidyr::unnest(tbl)
152 | ```
153 | 
154 | \newpage
155 | # Manipulating Functions
156 | 
157 | ## Basics of Functional Programming
158 | 
159 | The book Advanced R contains an excellent section on [Functional Programming](http://adv-r.had.co.nz/Functional-programming.html). The primary idea we want to make use of is that functions are objects, not too different from numbers or character strings. For example we can think of anonymous functions like raw literal values.
160 | 
161 | ```{r, result = 'hide'}
162 | # a numeric literal
163 | 1
164 | 
165 | # a character literal
166 | "a"
167 | 
168 | # a function literal
169 | function(x) { print(x) }
170 | ```
171 | 
172 | We can assign these to variables in the same way
173 | 
174 | ```{r, result = 'hide'}
175 | # assigning numeric literal
176 | x <- 1
177 | 
178 | # assigning character literal
179 | x <- "a"
180 | 
181 | # assigning function literal
182 | f <- function(x) { print(x) }
183 | ```
184 | 
185 | We can also reassign variables to other variables
186 | 
187 | ```{r, result = 'hide'}
188 | # assigning numeric literal
189 | x <- 1
190 | y <- x # y = 1
191 | 
192 | # assigning character literal
193 | x <- "a"
194 | y <- x # y = "a"
195 | 
196 | # assigning function literal
197 | f <- function(x) { print(x) }
198 | g <- f # g = function(x) { print(x) }
199 | ```
200 | 
201 | Being able to accept functions as regular objects is fundamental to making effective use of this package.
202 | 
203 | ## Partial Application
204 | 
205 | Partial application is a way to manipulate function objects. The idea is that you take a function which accepts multiple arguments, and "partially" apply some arguments to it. The simplest way to perform a partial application is to write a new function that wraps around the original function but with some arguments already filled in.
206 | 
207 | ```{r}
208 | # function to add two things
209 | plus <- function(x, y) { x + y }
210 | 
211 | # function that adds 2 to x
212 | plus_two <- function(x) { plus(x, y = 2) }
213 | 
214 | plus_two(1)
215 | ```
216 | 
217 | Equivalently, we could use `purrr::partial()` which properly encapsulates this idea into a helper function. This is preferable to writing the wrapping function because it's very explicit in what its purpose is.
218 | 
219 | Wrapper function can do all sorts of computations beyond just filling in a variable, whereas `purrr::partial()` performs the singular duty of partially filling in arguments, there is no room to sneak in additional work that might complicate the process.
220 | 
221 | ```{r}
222 | library(purrr)
223 | 
224 | plus_two <- partial(plus, y = 2)
225 | 
226 | plus_two(1)
227 | ```
228 | 
229 | ## Sequence of Partial Applications
230 | 
231 | CellBench offers a function to help construct partially-applied functions with oen or more sequences of arguments.
232 | 
233 | ```{r}
234 | # define a function that multiplies 3 numbers together
235 | g <- function(x, y, z) {
236 |     x * y * z
237 | }
238 | 
239 | g(1, 2, 3)
240 | ```
241 | 
242 | ```{r}
243 | # create a list of functions with the second and third values partially applied
244 | # all combinations of y and z are generates, resulting in a list of 4 functions
245 | g_list <- fn_arg_seq(g, y = c(1, 2), z = c(3, 4))
246 | 
247 | # apply each of the functions in the list to the value 1
248 | lapply(g_list, function(func) { func(x = 1) })
249 | ```
250 | 
251 | This can be very useful for testing out a range or grid of parameters with very little code repetition.
252 | 
253 | ## Memoisation
254 | 
255 | Memoisation is the functional programming techinque of caching the result of computations. When a memoised function is called with arguments it had previously been evaluated with, it will simply recall the return value from the cache rather than redo the computations. 
256 | 
257 | Memoisation is a operation on functions, taking in a regular function and returning a memoised version. CellBench offers memoisation through the `cache_method()` command which wraps around functionality from the `memoise` CRAN package. Memoised functions store their cache on disk, so be careful with functions that return large output objects.
258 | 
259 | ```{r, eval = FALSE}
260 | # initialise the CellBench cache
261 | cellbench_cache_init()
262 | 
263 | # dummy simulation of a slow function
264 | f <- function(x) {
265 |     Sys.sleep(2)
266 |     return(x)
267 | }
268 | 
269 | # create the memoised version of the function
270 | cached_f <- cache_method(f)
271 | 
272 | # running the first time will be slow
273 | cached_f(1)
274 | 
275 | # running the second time will be fast
276 | cached_f(1)
277 | ```
278 | 
279 | # Further Reading
280 | 
281 | * Introduction to tibbles: https://tibble.tidyverse.org
282 | * Introduction to purrr: https://purrr.tidyverse.org
283 | 


--------------------------------------------------------------------------------
/vignettes/TidyversePatterns.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Tidyverse Patterns"
  3 | author: "Shian Su"
  4 | date: "`r Sys.Date()`"
  5 | output: BiocStyle::html_document
  6 | vignette: >
  7 |   %\VignetteIndexEntry{Tidyverse Patterns}
  8 |   %\VignetteEngine{knitr::rmarkdown}
  9 |   %\VignetteEncoding{UTF-8}
 10 | ---
 11 | 
 12 | ```{r setup, include=FALSE}
 13 | knitr::opts_chunk$set(echo = TRUE)
 14 | library(purrr)
 15 | library(dplyr)
 16 | library(tidyr)
 17 | library(ggplot2)
 18 | library(CellBench)
 19 | ```
 20 | 
 21 | # Introduction
 22 | 
 23 | This vignette will introduce tidyverse patterns that are useful for making full
 24 | use of the CellBench framework. CellBench was developed with tidyverse
 25 | compatibility as a fundamental goal. `purrr` provides functional programming
 26 | tools to manipulate the methods and lists produced in this framework. `dplyr` is
 27 | very useful for working with the `tibble`-based structures produced by 
 28 | CellBench. Since the outputs are mostly in `tibble` structure, they are very
 29 | easily visualised using `ggplot2`.
 30 | 
 31 | For detailed explanations of tidyverse packages please see resources at
 32 | https://www.tidyverse.org/learn/, in particular 
 33 | [R for Data Science](https://r4ds.had.co.nz).
 34 | 
 35 | For quick concise references of tidyverse features and functions I recommend
 36 | all of the cheatsheets available at 
 37 | https://www.rstudio.com/resources/cheatsheets/.
 38 | 
 39 | \newpage
 40 | # Functional Programming with purrr
 41 | 
 42 | ## Methods as Function Objects
 43 | 
 44 | In CellBench we require methods to take in only a single argument. The idea is
 45 | that all methods within a single pipeline step should take the same kind of
 46 | input and produce the same type of output. In practice most methods have
 47 | additional parameters that can be tuned, and we may use `purrr::partial()` to
 48 | help pre-fill these parameters.
 49 | 
 50 | `partial()` takes a function, some variable values, and returns the function
 51 | with the specified arguments pre-filled. This reduces the number of free
 52 | parameters in your function.
 53 | 
 54 | We demonstrate a trivial application of `partial()`:
 55 | 
 56 | ```{r}
 57 | library(CellBench)
 58 | library(purrr)
 59 | 
 60 | # function to raise number to a power
 61 | pow <- function(x, n) {
 62 |     x^n
 63 | }
 64 | 
 65 | pow2 <- partial(pow, n = 2)
 66 | pow3 <- partial(pow, n = 3)
 67 | 
 68 | pow2(2)
 69 | pow3(2)
 70 | ```
 71 | 
 72 | Here `partial()` allowed us to turn a two-parameter function into a single
 73 | parameter function. Using `partial()` is good practice compared to the two
 74 | alternatives:
 75 | 
 76 | 1. Writing duplicate functions
 77 | 2. Writing function wrappers
 78 | 
 79 | Consider writing duplicate functions
 80 | 
 81 | ```{r, eval = FALSE}
 82 | pow2 <- function(x) {
 83 |     x^2
 84 | }
 85 | 
 86 | pow3 <- function(x) {
 87 |     x^3
 88 | }
 89 | ```
 90 | 
 91 | Now say you wanted to add an argument check `stopifnot(is.numeric(x))`, you
 92 | would need to edit the code in two places. The chances for errors leading to
 93 | inconsistencies increases dramatically with the number of duplications and
 94 | changes required.
 95 | 
 96 | Consider writing function wrappers
 97 | 
 98 | ```{r, eval = FALSE}
 99 | pow <- function(x, n) {
100 |     x^n
101 | }
102 | 
103 | pow2 <- function(x) {
104 |     pow(x, 2)
105 | }
106 | 
107 | pow3 <- function(x) {
108 |     pow(x, 3)
109 | }
110 | ```
111 | 
112 | The issue here is that these functions can contain much more in their bodies
113 | than the simple function call. When working in collaboration or sharing the code
114 | in general, it's not immediately clear that the intention of the derived
115 | functions is only to pre-fill certain variables. Using `partial()` is concise
116 | and unambiguous in purpose.
117 | 
118 | See also:
119 | 
120 | * `?partial` in the example section for more ways to use partial
121 | * `?fn_arg_seq` for the CellBench utility to construct a list of functions with
122 | varying parameter values
123 | 
124 | ## Function Composition
125 | 
126 | When functions have just one argument, they can easily be composed to create
127 | new functions. We use `purrr::compose()` for this, and it takes a series of
128 | functions as input. `compose()` will then return a function that applies
129 | the functions given to it in a right-to-left fashion, such that the right-most
130 | function is applied first and the left-most is applied last in succession.
131 | 
132 | ```{r}
133 | # find the maximum absolute value
134 | max_absolute <- compose(max, abs)
135 | 
136 | max_absolute(rnorm(100))
137 | ```
138 | 
139 | This is useful for stitching together steps of a pipeline manually, for example
140 | if some methods require normalisation but some do not, then you may write
141 | wrappers as follows
142 | 
143 | ```{r, eval = FALSE}
144 | method1 <- function(x) {
145 |     x <- normalise(x)
146 |     method_func1(x)
147 | }
148 | 
149 | method2 <- function(x) {
150 |     method_func2(x)
151 | }
152 | 
153 | method3 <- function(x) {
154 |     x <- normalise(x)
155 |     method_func3(x)
156 | }
157 | ```
158 | 
159 | alternatively you could write
160 | 
161 | ```{r, eval = FALSE}
162 | # identity simply returns its argument, useful here for code consistency
163 | method1 <- compose(method_func1, normalise)
164 | method2 <- compose(method_func2, identity)
165 | method3 <- compose(method_func3, normalise)
166 | ```
167 | 
168 | which is more succinct and likely to be less error-prone. This is useful when
169 | two or more steps in a pipeline would only work in specific combinations, then
170 | these combinations can be fused together to form a single step in the desired
171 | combinations.
172 | 
173 | \newpage
174 | ## Mapping Over Lists
175 | 
176 | The majority of `purrr`'s functionality revolves around mapping functions over
177 | a list of inputs. This is a represented by the `map()` family of functions that
178 | usually take a list and a function as arguments, returning the result of
179 | applying the function to each element of the list.
180 | 
181 | `map()` is the primary function which performs basic mapping of lists and 
182 | returns list. It functions almost identically to `lapply`, but is accompanied by
183 | a family of suffixed variants that are useful for various situations.
184 | 
185 | ```{r}
186 | x <- list(1, 2, 3)
187 | 
188 | map(x, function(x) { x * 2 })
189 | ```
190 | 
191 | One useful variant is `map2()` which takes two lists and applies a two-argument
192 | function to the first elements of both lists, second elements and so on. This
193 | can be used to pass additional variables into the function.
194 | 
195 | ```{r}
196 | # list of random values from different distributions
197 | x <- list(
198 |     rpois(100, lambda = 5),
199 |     rpois(100, lambda = 5),
200 |     rgamma(100, shape = 5),
201 |     rgamma(100, shape = 5)
202 | )
203 | 
204 | # list of additional parameters
205 | y <- list(
206 |     "mean",
207 |     "median",
208 |     "mean",
209 |     "median"
210 | )
211 | 
212 | # function that takes values and a mode argument
213 | centrality <- function(x, mode = c("mean", "median")) {
214 |     mode <- match.arg(mode)
215 |     
216 |     if (mode == "mean") {
217 |        mean = mean(x)
218 |     } else if (mode == "median") {
219 |        median = median(x)
220 |     }
221 | }
222 | 
223 | # using map2 to apply function to two lists
224 | map2(x, y, centrality)
225 | ```
226 | 
227 | \newpage
228 | # Table Manipulation with dplyr
229 | 
230 | ## Operations on the Benchmark tibble
231 | 
232 | The fundamental `benchmark_tbl()` is derived from the `tibble` object which
233 | acts mostly indentical to a regular `data.frame`. Therefore it is compatible
234 | with the `dplyr` set of table manipulation functions.
235 | 
236 | ```{r}
237 | library(dplyr)
238 | # list of data
239 | datasets <- list(
240 |     set1 = rnorm(500, mean = 2, sd = 1),
241 |     set2 = rnorm(500, mean = 1, sd = 2)
242 | )
243 | 
244 | # list of functions
245 | add_noise <- list(
246 |     none = identity,
247 |     add_bias = function(x) { x + 1 }
248 | )
249 | 
250 | res <- apply_methods(datasets, add_noise)
251 | class(res)
252 | res
253 | ```
254 | 
255 | From our results we can filter the rows or manipulat the columns with regular
256 | `dplyr` operations.
257 | 
258 | ```{r}
259 | # filtering rows to only data from set 1
260 | res %>%
261 |     filter(data == "set1")
262 | 
263 | # filtering rows to only add_bias method
264 | res %>%
265 |     filter(add_noise == "add_bias")
266 | 
267 | # mutating data column to prepend "data" to data set names
268 | res %>%
269 |     mutate(data = paste0("data", data))
270 | ```
271 | 
272 | ## Calculating multiple columns of metrics
273 | 
274 | We often want to plot two or more metrics against each other, for this purpose it is most useful to have each metric in its own column. The default CellBench model does not appear to support this, but it can be done quite easily using `spread()` from the `tidyr` package.
275 | 
276 | ```{r}
277 | metric <- list(
278 |     mean = mean,
279 |     median = median
280 | )
281 | 
282 | # simply applying the metrics results in a single column
283 | res %>%
284 |     apply_methods(metric)
285 | 
286 | # spread metrics across columns
287 | res %>%
288 |     apply_methods(metric) %>%
289 |     spread(metric, result)
290 | ```
291 | 
292 | \newpage
293 | # Plotting with ggplot2
294 | 
295 | ## Basic Plotting
296 | 
297 | Tibble results are easy to use with ggplot2. For a more extensive introduction to ggplot2 see [R for Data Science: Chapter 3](https://r4ds.had.co.nz/data-visualisation.html). Here we will plot results of our pipelines in a single plot, because the result column is a list-column, it needs to be unnested to produce a "flat" table, see `?tidyr::unnest` for more explanation on unnesting. For convenience, we also use `pipeline_collapse()` from CellBench to concatenate the method names at each stop to produce a single character string representing a pipeline.
298 | 
299 | ```{r}
300 | library(tidyr)
301 | library(ggplot2)
302 | 
303 | # I prefer my own theme for ggplot2, following theme code is optional
304 | theme_set(theme_bw() + theme(
305 |     plot.title = element_text(face = "plain", size = rel(20/12),
306 |                               hjust = 1/2, margin = margin(t = 10, b = 20)),
307 |     axis.text = element_text(size = rel(14/12)),
308 |     strip.text.x = element_text(size = rel(16/12)),
309 |     axis.title = element_text(size = rel(16/12))
310 | ))
311 | 
312 | scale_colour_discrete <- function(...) scale_colour_brewer(..., palette="Set1")
313 | scale_fill_discrete <- function(...) scale_fill_brewer(... , palette="Set1")
314 | ```
315 | 
316 | ```{r}
317 | # pipeline collapse constructs a single string from the pipeline steps,
318 | # unnest expands the list-column of results, transforming the result 
319 | # into a flat table.
320 | collapsed_res <- pipeline_collapse(res) %>%
321 |     unnest()
322 | 
323 | ggplot(collapsed_res, aes(x = pipeline, y = result)) +
324 |     geom_boxplot()
325 | ```
326 | 
327 | 
328 | ## Facetting
329 | 
330 | We can "facet" the above plot by sectioning off related graphics. This general idea covered in the [facet section](https://r4ds.had.co.nz/data-visualisation.html#facets) of the previously linked text. This also demonstrates the benefits of having the data in a tibble format.
331 | 
332 | ```{r}
333 | # remember that we have to unnest the data before it's appropriate
334 | # for plotting
335 | ggplot(unnest(res), aes(x = add_noise, y = result)) +
336 |     geom_boxplot() +
337 |     facet_grid(~data)
338 | ```
339 | 


--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
  1 | #' Get head of 2 dimensional object as a square block
  2 | #'
  3 | #' head prints all columns which may flood the console, mhead takes a square
  4 | #' block which can look nicer and still provide a good inspection of the
  5 | #' contents
  6 | #'
  7 | #' @param x the object with 2 dimensions
  8 | #' @param n the size of the n-by-n block to extract
  9 | #'
 10 | #' @return an n-by-n sized subset of x
 11 | #' @export
 12 | #'
 13 | #' @examples
 14 | #' x <- matrix(runif(100), nrow = 10, ncol = 10)
 15 | #'
 16 | #' mhead(x)
 17 | #' mhead(x, n = 3)
 18 | mhead <- function(x, n = 6) {
 19 |     stopifnot(
 20 |         !is.null(dim(x)),
 21 |         is.numeric(n),
 22 |         n > 0
 23 |     )
 24 | 
 25 |     n1 <- min(n, nrow(x))
 26 |     n2 <- min(n, ncol(x))
 27 |     x[seq_len(n1), seq_len(n2)]
 28 | }
 29 | 
 30 | # left to right function composition
 31 | chain <- function(...) {
 32 |     do.call(purrr::compose, rev(list(...)))
 33 | }
 34 | 
 35 | # check if object is a list of functions
 36 | is_fn_list <- function(x) {
 37 |     is(x, "list") && purrr::every(x, is.function)
 38 | }
 39 | 
 40 | #' Collapse benchmark_tbl into a two column summary
 41 | #'
 42 | #' Collapse benchmark_tbl into two columns: "pipeline" and "result". The
 43 | #' "pipeline" column will be the concatenated values from the data and methods
 44 | #' columns while the "result" column remains unchanged from the benchmark_tbl.
 45 | #' This is useful for having a string summary of the pipeline for annotating.
 46 | #'
 47 | #' @param x the benchmark_tbl to collapse
 48 | #' @param sep the separator to use for concatenating the pipeline steps
 49 | #' @param drop.steps if the data name and methods steps should be dropped from
 50 | #'   the output. TRUE by default.
 51 | #' @param data.name if the dataset name should be included in the pipeline
 52 | #'   string. Useful if only a single dataset is used.
 53 | #'
 54 | #' @return benchmark_tbl with pipeline and result columns (and all other columns
 55 | #'   if drop.steps is FALSE)
 56 | #'
 57 | #' @importFrom rlang .data
 58 | #' @export
 59 | #'
 60 | #' @seealso \code{\link{as_pipeline_list}}
 61 | #'
 62 | #' @examples
 63 | #' # list of data
 64 | #' datasets <- list(
 65 | #'     set1 = rnorm(500, mean = 2, sd = 1),
 66 | #'     set2 = rnorm(500, mean = 1, sd = 2)
 67 | #' )
 68 | #'
 69 | #' # list of functions
 70 | #' add_noise <- list(
 71 | #'     none = identity,
 72 | #'     add_bias = function(x) { x + 1 }
 73 | #' )
 74 | #'
 75 | #' res <- apply_methods(datasets, add_noise)
 76 | #' collapse_pipeline(res)
 77 | collapse_pipeline <- function(
 78 |     x,
 79 |     sep = arrow_sep("right"),
 80 |     drop.steps = TRUE,
 81 |     data.name = TRUE
 82 | ) {
 83 |     stopifnot(
 84 |         is(x, "data.frame"),
 85 |         dplyr::last(colnames(x)) == "result"
 86 |     )
 87 | 
 88 |     results <- dplyr::pull(x, "result")
 89 | 
 90 |     if (!data.name) {
 91 |         data <- x$data
 92 |         x <- dplyr::select(x, -"data")
 93 |     }
 94 | 
 95 |     x <- dplyr::select(x, -"result") %>%
 96 |         tidyr::unite("pipeline", dplyr::everything(), sep = sep, remove = drop.steps) %>%
 97 |         dplyr::select(-"pipeline", "pipeline") %>% # put "pipeline" on last column
 98 |         tibble::as_tibble() %>%
 99 |         dplyr::mutate(pipeline = factor_no_sort(.data$pipeline))
100 | 
101 |     if (!data.name) {
102 |         tibble::add_column(x, data = data, before = 1)
103 |     }
104 | 
105 |     tibble::add_column(x, result = results)
106 | }
107 | 
108 | unicode_arrow <- function(towards = c("right", "left", "up", "down")) {
109 |     towards <- match.arg(towards)
110 |     switch(
111 |         towards,
112 |         "left" = "\u2190",
113 |         "up" = "\u2191",
114 |         "right" = "\u2192",
115 |         "down" = "\u2193"
116 |     )
117 | }
118 | 
119 | ascii_arrow <- function(towards = c("right", "left")) {
120 |     switch(
121 |         towards,
122 |         "left" = "\u00AB",
123 |         "right" = "\u00BB"
124 |     )
125 | }
126 | 
127 | #' Unicode arrow separators
128 | #'
129 | #' Utility function for generating unicode arrow separators.
130 | #'
131 | #' @param towards the direction the unicode arrow points towards
132 | #' @param unicode whether unicode arrows should be used. Does not work inside
133 | #'   plots within knitted PDF documents.
134 | #'
135 | #' @return a string containing an unicode arrow surrounded by two spaces
136 | #' @export
137 | #'
138 | #' @examples
139 | #' arrow_sep("left") # left arrrow
140 | #' arrow_sep("right") # right arrrow
141 | arrow_sep <- function(towards = c("right", "left"), unicode = FALSE) {
142 |     towards <- match.arg(towards)
143 |     if (!unicode) {
144 |         arrow <- ascii_arrow(towards)
145 |     } else {
146 |         arrow <- unicode_arrow(towards)
147 |     }
148 |     glue::glue(" {arrow} ")
149 | }
150 | 
151 | # create factor with levels in order they appear rather than alphabetically
152 | # sorted
153 | factor_no_sort <- function(x) {
154 |     factor(x, levels = unique(x))
155 | }
156 | 
157 | #' convert benchmark_tbl to list
158 | #'
159 | #' convert a benchmark_tbl to a list where the name of the elements represent the pipeline steps separated by "..". This can be useful for using the apply family of functions.
160 | #'
161 | #' @param x the benchmark_tbl object to convert
162 | #'
163 | #' @importFrom stats setNames
164 | #' @export
165 | #'
166 | #' @return list containing the results with names set to data and pipeline steps
167 | #'   separated by ..
168 | #'
169 | #' @seealso \code{\link{collapse_pipeline}}
170 | #'
171 | #' @examples
172 | #' # list of data
173 | #' datasets <- list(
174 | #'     set1 = rnorm(500, mean = 2, sd = 1),
175 | #'     set2 = rnorm(500, mean = 1, sd = 2)
176 | #' )
177 | #'
178 | #' # list of functions
179 | #' add_noise <- list(
180 | #'     none = identity,
181 | #'     add_bias = function(x) { x + 1 }
182 | #' )
183 | #'
184 | #' res <- apply_methods(datasets, add_noise)
185 | #' as_pipeline_list(res)
186 | as_pipeline_list <- function(x) {
187 |     stopifnot(is(x, "benchmark_tbl"))
188 | 
189 |     if (dplyr::last(colnames(x)) != "result") {
190 |         # if benchmark_tbl has been manipulated by user to non-standard form
191 |         stop("final column should contain 'result'")
192 |     }
193 | 
194 |     x <- collapse_pipeline(x, sep = "..")
195 | 
196 |     setNames(x$result, nm = x$pipeline)
197 | }
198 | 
199 | # suppress prints, which many people use as if they were messages
200 | suppressPrint <- function(expr) {
201 |     utils::capture.output(x <- expr)
202 |     x
203 | }
204 | 
205 | # suppresses messages and prints
206 | suppressMsgAndPrint <- function(expr, suppress = TRUE) {
207 |     if (suppress) {
208 |         suppressMessages(suppressPrint(expr))
209 |     } else {
210 |         expr
211 |     }
212 | }
213 | 
214 | # generate sequence along number of rows
215 | seq_nrow <- function(x) {
216 |     if (!is.numeric(nrow(x))) return(integer(0))
217 |     seq_len(nrow(x))
218 | }
219 | 
220 | # generate sequence along number of columns
221 | seq_ncol <- function(x) {
222 |     if (!is.numeric(ncol(x))) return(integer(0))
223 |     seq_len(ncol(x))
224 | }
225 | 
226 | # @importFrom tibble as_tibble
227 | # @importFrom magrittr set_names
228 | make_combinations <- function(...) {
229 |     input_names <- infer_names_from_dots(...)
230 |     input <- list(...) %>%
231 |         magrittr::set_names(input_names)
232 | 
233 |     # unnaming data.frame list elements required for tidyr >= 1.0.0
234 |     names(input)[purrr::map_lgl(input, is.data.frame)] <- ""
235 | 
236 |     is.valid.input <- function(x) {
237 |         is.character(x) || is.data.frame(x) || is.factor(x)
238 |     }
239 | 
240 |     if (!purrr::every(input, is.valid.input)) {
241 |         stop("all arguments must be either data.frames, character or factor vectors")
242 |     }
243 | 
244 |     input <- purrr::map(
245 |         input,
246 |         function(x) {
247 |             if (is.data.frame(x)) {
248 |                 return(x)
249 |             } else {
250 |                 return(factor_no_sort(x))
251 |             }
252 |         }
253 |     )
254 | 
255 |     tibble::as_tibble(do.call(tidyr::crossing, input))
256 | }
257 | 
258 | # @importFrom tibble as_tibble
259 | # @importFrom magrittr set_names
260 | # make_combinations_df <- function(...) {
261 | #     input_names <- infer_names_from_dots(...)
262 | #     input <- list(...) %>%
263 | #         magrittr::set_names(input_names)
264 | #
265 | #     # unnaming data.frame list elements required for tidyr >= 1.0.0
266 | #     names(input)[purrr::map_lgl(input, is.data.frame)] <- ""
267 | #
268 | #     is.valid.input <- function(x) {
269 | #         is.character(x) || is.data.frame(x) || is.factor(x)
270 | #     }
271 | #
272 | #     if (!purrr::every(input, is.valid.input)) {
273 | #         stop("all arguments must be either data.frames, character or factor vectors")
274 | #     }
275 | #
276 | #     input <- purrr::map(
277 | #         input,
278 | #         function(x) {
279 | #             if (is.data.frame(x)) {
280 | #                 return(x)
281 | #             } else {
282 | #                 return(factor_no_sort(x))
283 | #             }
284 | #         }
285 | #     )
286 | #
287 | #     df <- input[[1]] %>%
288 | #         dplyr::mutate(
289 | #             method_names = map(1:nrow(input[[1]]), function(x) {
290 | #                 out <- tibble::tibble(col = input[[2]])
291 | #                 names(out) <- names(input)[2]
292 | #                 out
293 | #             })
294 | #         )
295 | #
296 | #     df %>% tidyr::unnest(method_names)
297 | # }
298 | 
299 | all_same_class <- function(x) {
300 |     classes <- purrr::map(x, class)
301 | 
302 |     intersect_classes <- sort(purrr::reduce(classes, intersect))
303 |     first_classes <- sort(classes[[1]])
304 | 
305 |     # all elements have the same class if the intersection of all classes
306 |     # has the same length as the first classes and same values
307 |     (length(intersect_classes) == length(first_classes)) &&
308 |         all.equal(intersect_classes, first_classes)
309 | }
310 | 
311 | # check that all elements of a list have length one
312 | all_length_one <- function(x) {
313 |     stopifnot(is(x, "list"))
314 |     purrr::every(x, function(x) { length(x) == 1 })
315 | }
316 | 
317 | # add class to the classes of an object
318 | add_class <- function(x, class) {
319 |     stopifnot(is.character(class))
320 | 
321 |     existing_class <- class(x)
322 |     if (class %in% existing_class) {
323 |         return(x)
324 |     } else {
325 |         class(x) <- c(class, existing_class)
326 |         return(x)
327 |     }
328 | }
329 | 
330 | # drop class from classes of an object
331 | drop_class <- function(x, class) {
332 |     stopifnot(is.character(class))
333 | 
334 |     classes <- class(x)
335 |     if (!class %in% classes) {
336 |         return(x)
337 |     } else {
338 |         class(x) <- setdiff(classes, class)
339 |         return(x)
340 |     }
341 | }
342 | 
343 | # convert to duration in seconds
344 | #' @importFrom lubridate seconds as.duration
345 | duration_seconds <- function(x, digits = 3) {
346 |     round(x, digits = digits) %>%
347 |         lubridate::seconds() %>%
348 |         lubridate::as.duration()
349 | }
350 | 
351 | # wrapper to return summaried time in numerics rath
352 | simple_time <- function(...) {
353 |     summary(system.time(...))
354 | }
355 | 
356 | # replace null values with default value
357 | if_null_then <- function(x, value) {
358 |     if (is.null(x)) {
359 |         x <- value
360 |     }
361 |     x
362 | }
363 | 
364 | # replace empty values with default value
365 | if_empty_then <- function(x, value) {
366 |     if (length(x) == 0) {
367 |         x <- value
368 |     }
369 |     x
370 | }
371 | 
372 | # take variadic ellipses and return a vector of names,
373 | #' @importFrom rlang exprs
374 | infer_names_from_dots <- function(...) {
375 |     var_names <- purrr::map_chr(as.list(substitute(list(...))[-1L]), deparse)
376 |     given_names <- names(list(...))
377 |     if (is.null(given_names)) {
378 |         output <- var_names
379 |     } else {
380 |         output <- ifelse(given_names == "", var_names, given_names)
381 |     }
382 | 
383 |     if (!all_unique(output)) {
384 |         warning("not all names were unique, numbers appended to duplicates")
385 |     }
386 |     make.names(output, unique = TRUE)
387 | }
388 | 
389 | # convert rows of a data frame into list
390 | #' @importFrom magrittr set_names
391 | df_to_tasks <- function(df, names = rownames(df)) {
392 |     stopifnot(is(df, "data.frame"))
393 |     split(df, seq(nrow(df))) %>%
394 |         magrittr::set_names(names)
395 | }
396 | 
397 | # Retain for backward compatibility
398 | #' @rdname collapse_pipeline
399 | #' @export
400 | pipeline_collapse <- collapse_pipeline
401 | 


--------------------------------------------------------------------------------
/vignettes/Introduction.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Introduction to CellBench"
  3 | author: "Shian Su"
  4 | date: "`r Sys.Date()`"
  5 | output: BiocStyle::html_document
  6 | vignette: >
  7 |   %\VignetteIndexEntry{Introduction}
  8 |   %\VignetteEngine{knitr::rmarkdown}
  9 |   %\VignetteEncoding{UTF-8}
 10 | ---
 11 | 
 12 | ```{r, include = FALSE}
 13 | library(CellBench)
 14 | library(limma)
 15 | library(dplyr)
 16 | library(purrr)
 17 | ```
 18 | 
 19 | 
 20 | ```{r setup, include = FALSE}
 21 | knitr::opts_chunk$set(
 22 |   collapse = TRUE,
 23 |   comment = "#>",
 24 |   fig.retina = 1
 25 | )
 26 | ```
 27 | 
 28 | # Introduction
 29 | 
 30 | CellBench is a package to assist with creating benchmarks for single cell analysis methods. We provide functions for working with `SingleCellExperiments` objects and a framework for constructing benchmarks for different single cell datasets across different methods or combinations of methods.
 31 | 
 32 | The aim of this package is to make it simpler for developers to construct combinatorial designs and provide a flat data structure to store the organised outputs of analysis methods. We provide some fully constructed benchmarking pipelines for a set of single-cell benchmark datasets, and we hope that the framework will allow users to easily construct benchmarks in an organised and expressive manner.
 33 | 
 34 | For more realistic examples, run `cellbench_case_study()` to see a case study using CellBench.
 35 | 
 36 | # Quick start
 37 | 
 38 | There are 3 fundamental components to the benchmarks in this package, `list`s of data, `list`s of functions and a `tibble` with a `list-column` that we will call a `benchmark_tbl`. For simplicity we use randomly generated data and simple functions, but hopefully it's clear how the idea extends into more complex functions and benchmarks.
 39 | 
 40 | As a motivating example, we will have a simple look at what count-per-million library size normalisation and log-transformation does for our MDS plots. In addition to `CellBench` we will require the `limma` package.
 41 | 
 42 | ```{r}
 43 | library(CellBench)
 44 | library(limma)
 45 | 
 46 | datasets <- list(
 47 |     sample_10x = readRDS(cellbench_file("10x_sce_sample.rds"))
 48 | )
 49 | 
 50 | norm_method <- list(
 51 |     none = counts,
 52 |     cpm = function(x) t(t(1e6 * counts(x)) / colSums(counts(x)))
 53 | )
 54 | 
 55 | transform <- list(
 56 |     none = identity,
 57 |     log2 = function(x) log2(x+1)
 58 | )
 59 | ```
 60 | 
 61 | Now we have 3 `list`s.
 62 | 
 63 | * A `list` of datasets. In this context it is a single `SingleCellExperiment`, but it can be any arbitrary object. Ideally all objects in the list are of the same type, this makes it success more likely when the same methods are applied across all datasets.
 64 | * Two `list`s of functions. These are the functions that will perform one step of our pipeline stored neatly in a single object.
 65 |     * The first list of functions will either extract the count (no normalisation) or perform count-per-million (cpm) library size normalisation.
 66 |     * The second list of functions will wither return the object as-is (no transformation) or log2-transform the counts/cpm values with an offset of 1 to account for 0 counts/cpms.
 67 | 
 68 | ```{r}
 69 | res1 <- datasets %>%
 70 |     apply_methods(norm_method)
 71 | 
 72 | res1
 73 | ```
 74 | 
 75 | So we see that we have a `result` for every combination of `data` and `norm_method` method applied. We can then apply the transform methods.
 76 | 
 77 | ```{r}
 78 | res2 <- res1 %>%
 79 |     apply_methods(transform)
 80 | 
 81 | res2
 82 | ```
 83 | 
 84 | Now the `result` column has been updated to reflect the matrix produced from each transform method applied each result from the previous table. Thus it is simple to generate combinatorial benchmarking schemes simply by successively applying further `list`s of functions.
 85 | 
 86 | Finally we want to visualise the final results of each pipeline, here we will use `plotMDS` from the `limma` package and colour points by the `cell_line` column extracted from the `colData()` (column data) of the original data.
 87 | 
 88 | To set this up we generate colours from the `cell_line` information in the original data. Then we use `pipeline_collapse()` to collapse the data and method names into a single columns to be used as the title in our plots.
 89 | 
 90 | ```{r}
 91 | # generate colour values from cell line information
 92 | cell_line <- factor(colData(datasets$sample_10x)$cell_line)
 93 | cell_line_col <- c("red", "blue", "green")[cell_line]
 94 | 
 95 | collapsed_res <- res2 %>%
 96 |     pipeline_collapse()
 97 | 
 98 | collapsed_res
 99 | ```
100 | 
101 | We can then loop through the rows and generate plots showing how each combination of normalisation and transformation affects our MDS visualisation.
102 | 
103 | ```{r, fig.height = 9, fig.width = 8}
104 | par(mfrow = c(2, 2)) # declare 2x2 plotting grid
105 | 
106 | # loop through row of summarised pipeline table
107 | for (i in 1:nrow(collapsed_res)) {
108 |     title <- collapsed_res$pipeline[i]
109 |     expr_mat <- collapsed_res$result[[i]] # note the use of [[]] due to list
110 |     
111 |     limma::plotMDS(
112 |         expr_mat,
113 |         main = title,
114 |         pch = 19, # draw circles rather than label name
115 |         col = cell_line_col
116 |     )
117 | }
118 | 
119 | par(mfrow = c(1, 1)) # undo plotting grid for future plots
120 | ```
121 | 
122 | So we can see that applying a simple library size normalisation and log2 transform can dramatically improve the visual inference in our PCA plots.
123 | 
124 | # Downloading benchmark data
125 | 
126 | This package provides access to the single cell mixology data produced by Tian et al. (2018). These can be accessed through the `load_*_data()` functions, when run for the first time they will download the data from the web. On subsequent runs they will load the data from a local cache.
127 | 
128 | Each set of data is loaded as a list of SingleCellExperiment objects. They are grouped into the mixing strategy used to produce the datasets, each dataset within the same mixing strategy can be expected to have the same columns in their colData.
129 | 
130 | ```{r, eval = FALSE}
131 | # loading the individual sets of data
132 | sc_data <- load_sc_data()
133 | mrna_mix_data <- load_mrna_mix_data()
134 | cell_mix_data <- load_cell_mix_data()
135 | 
136 | # loading all datasets
137 | all_data <- load_all_data()
138 | ```
139 | 
140 | To clear the data from the local cache, you can run `clear_cached_datasets()`.
141 | 
142 | ```{r, eval = FALSE}
143 | # removes all locally cached CellBench datasets
144 | clear_cached_datasets()
145 | ```
146 | 
147 | # Key objects and concepts
148 | 
149 | ## Function piping
150 | 
151 | In this package many examples make heavy use of the pipe operator `%>%` from [magrittr](https://magrittr.tidyverse.org). This is useful for writing cleaner code that is easier to debug.
152 | 
153 | ```{r, eval = FALSE}
154 | # the following two statements are equivalent
155 | f(x)
156 | x %>% f()
157 | 
158 | # as are these
159 | f(x, y)
160 | x %>% f(y)
161 | 
162 | # and these
163 | h(g(f(x)))
164 | x %>% f() %>% g() %>% h()
165 | 
166 | # or these
167 | h(g(f(x, a), b), c)
168 | x %>% f(a) %>% g(b) %>% h(c)
169 | ```
170 | 
171 | We can see in the last example that with many functions composed together, the piped form reads from left to right and it's clear which arguments belong to which function, whereas in the nested form it is more difficult to clearly identify what is happening. In general piping data into a function calls the function with the data serving as the first argument, more complex behaviour can be achieved and is describe on the [magrittr](https://magrittr.tidyverse.org) web page.
172 | 
173 | ## Mapping or list-apply
174 | 
175 | Lists in R are containers for a collection of arbitrary objects. In this package we encourage users to use lists as containers for a series of identically-typed objects, using them as if they were vectors for data types that vectors cannot contain. For example we store our datasets in lists of SingleCellExperiment objects and analysis methods in lists of functions, these data types would not be accepted within a vector.
176 | 
177 | To work with lists we encourage using `lapply` or `purrr::map`, these allow functions to be applied to each element of a list and return the result in a list.
178 | 
179 | ```{r}
180 | x <- list(
181 |     a = 1,
182 |     b = 2,
183 |     c = 3
184 | )
185 | 
186 | lapply(x, sqrt)
187 | ```
188 | 
189 | ## List of datasets
190 | 
191 | The benchmarking workflow starts with a list of datasets, even if you only have one dataset you will need to store it in a list for workflow to function. In our example the dataset was a sample of the 10X cell mixture dataset.
192 | 
193 | ```{r, result = 'hide'}
194 | sample_10x <- readRDS(cellbench_file("10x_sce_sample.rds"))
195 | 
196 | # even with a single dataset we need to construct a list
197 | datasets <- list(
198 |     sample_10x = sample_10x
199 | )
200 | 
201 | # we can add more datasets to the pipeline by adding to the list
202 | # here we have two datasets that are random samplings of the genes in the 10x
203 | # sample data
204 | datasets <- list(
205 |     subsample1_10x = sample_genes(sample_10x, n = 1000),
206 |     subsample2_10x = sample_genes(sample_10x, n = 1000)
207 | )
208 | 
209 | # could have been any other kind of object as long as they are consistent
210 | datasets <- list(
211 |     set1 = matrix(rnorm(500, mean = 2, sd = 1), ncol = 5, nrow = 100),
212 |     set2 = matrix(rnorm(500, mean = 2, sd = 1), ncol = 5, nrow = 100)
213 | )
214 | ```
215 | 
216 | Any kind of object can be stored in a list, so there is great flexibility in what kind of starting point can be used for the benchmarking workflow.
217 | 
218 | ## List of functions
219 | 
220 | In R functions themselves are a type of object, so they too can be stored in lists, this is rarely used in common R but this allows very simple addition of methods.
221 | 
222 | ```{r, result = 'hide'}
223 | # counts is a function that can be run with counts(x) here it is named 
224 | # "none" as it denotes the lack of normalisation
225 | norm_method <- list(
226 |     none = counts,
227 |     cpm = function(x) t(t(1e6 * counts(x)) / colSums(counts(x)))
228 | )
229 | 
230 | # "identity" is a useful function that simply returns its input 
231 | # it allows the comparison between applying and not applying a method
232 | transform <- list(
233 |     none = identity,
234 |     log2 = function(x) log2(x+1)
235 | )
236 | ```
237 | 
238 | The key thing to note is that the function must be callable and take a single argument. This may mean you need to write a wrapper function or use `purrr::partial()` to fill in some arguments. For example both `mean` and `sd` have `na.rm` arguments, because the element of the list must itself be a function, simply writing something like `mean(na.rm = TRUE)` will not work, as it is an incomplete function call. Instead we have two main options:
239 | 
240 | ```{r, result = 'hide'}
241 | # using anonymous function wrappers
242 | metric <- list(
243 |     mean = function(x) { mean(x, na.rm = TRUE) },
244 |     sd = function(x) { sd(x, na.rm = TRUE) }
245 | )
246 | 
247 | # using purrr partial function
248 | partial <- purrr::partial # explicit namespacing to avoid ambiguity
249 | metric <- list(
250 |     mean = partial(mean, na.rm = TRUE),
251 |     sd = partial(sd, na.rm = TRUE)
252 | )
253 | 
254 | # example use with kmeans
255 | clustering <- list(
256 |     kmeans_4 = partial(kmeans, centers = 4),
257 |     kmeans_5 = partial(kmeans, centers = 5),
258 |     kmeans_6 = partial(kmeans, centers = 6)
259 | )
260 | ```
261 | 
262 | `purrr::partial()` is known as partial-application of a function: it takes a function and arguments to that function, then returns a new function that is the function with the provided arguments filled in. This is slightly more explicit than creating the function wrapper, since the function wrapper can perform many more tasks within its body than just setting arguments, whereas `purrr::partial()` makes it clear all you're doing is setting some arguments.
263 | 
264 | ## Benchmark tibble and list-columns
265 | 
266 | The `benchmark_tbl` is a very light wrapper around the standard tibble provided by `tibble::tibble()`. This is like a regular `data.frame()` except it has some pretty printing features that are particularly useful for [list-columns](https://jennybc.github.io/purrr-tutorial/ls13_list-columns.html). A list column is a special type of column where the values are not atomic, i.e. cannot be stored in a vector. This allows arbitrary data types to be stored in a column but with the caveat that pulling out that column returns a list rather than a vector. This has implications for how to perform mutations using `dplyr` verbs and in general will not behave expectedly with vectorised functions.
267 | 
268 | In the framework established by this package, the first column will be the name of the data, followed by columns specifying the names of the analysis steps and ending with a list-column containing the result of the specified dataset after processing by the chain of analysis methods.
269 | 
270 | ```{r}
271 | class(res2)
272 | ```
273 | 
274 | Because they are tibbles, they respond well to `dplyr` verbs, or most regular `data.frame` manipulations.
275 | 
276 | ```{r}
277 | res2 %>% dplyr::filter(norm_method == "cpm")
278 | ```
279 | 
280 | ## Applying methods
281 | 
282 | The final idea that ties together the CellBench framework is the `apply_methods()` function, which takes a `benchmark_tbl` and applies a `list` of functions. The result is that each row is processed through each method, a new column is added specifying the method applied and the result is updated to the new value.
283 | 
284 | ```{r}
285 | # datasets
286 | datasets <- list(
287 |     sample_10x = readRDS(cellbench_file("10x_sce_sample.rds"))
288 | )
289 | 
290 | # first set of methods in pipeline
291 | norm_method <- list(
292 |     none = counts,
293 |     cpm = function(x) t(t(1e6 * counts(x)) / colSums(counts(x)))
294 | )
295 | 
296 | # second set of methods in pipeline
297 | transform <- list(
298 |     none = identity,
299 |     log2 = function(x) log2(x+1)
300 | )
301 | 
302 | datasets %>%
303 |     apply_methods(norm_method)
304 | ```
305 | 
306 | `apply_methods` takes the name of the variable holding the methods list and puts uses it as the column name for those methods, and the names of the methods within the list are used as the values in that column. The `data` column will store the name of the names within the dataset list, but not inherit the name of the variable holding the dataset list.
307 | 
308 | The way that the `apply_methods` is written means that you can simply pipe data through the methods without saving any intermediate results.
309 | 
310 | ```{r}
311 | datasets %>%
312 |     apply_methods(norm_method) %>%
313 |     apply_methods(transform)
314 | ```
315 | 
316 | # Advanced usage
317 | 
318 | ## Multithreading
319 | 
320 | Application of methods can be done in parallel, this is done by setting the global threads used by CellBench. This option may cause conflicts if the applied methods have their own internal parallelism. If any of the methods have internal parallelism then it is recommended to leave CellBench in single threaded mode.
321 | 
322 | **CAUTION**: Multi-threading with CellBench uses significantly more memory than one might expect, each thread can potentially make a full copy of all data in the environment. Be aware of this when working on memory-intensive tasks.
323 | 
324 | ```{r, eval = FALSE}
325 | # set cellbench to use 4 threads
326 | set_cellbench_threads(4)
327 | ```
328 | 
329 | ## Function return caching
330 | 
331 | CellBench can use the `memoise` to cache function results so that calling functions with the same arguments simply loads results from the local cache rather than repeating computation. Because of the atypical way that CellBench calls functions (as a member of a list), caching in memory using memoise doesn't appear to work, so it is necessary to cache on disk.
332 | 
333 | To use function return value caching in CellBench we first declare a folder to store our return values and then replace our regular methods with their cached versions.
334 | 
335 | **NOTE**: Caching a method that has pseudo-random behaviours means that the same result will be retrieved from the cache, negating the pseudo-random property of the method. This is generally undesirable.
336 | 
337 | **CAUTION**: Be careful when using caching with multiple threads, if more than one instance of a function runs with the exact same arguments, then the instances will attempt to write to the cache simultaneously and corrupt it.
338 | 
339 | **CAUTION**: Since only the function call signature and input value is considered for retrieving cached results, if the body of the underlying function is altered then CellBench will retrieve an outdated result.
340 | 
341 | **CAUTION**: As each result is save to disk, be careful with caching functions that produce large output and need to be run on many different inputs.
342 | 
343 | ```{r, eval = FALSE}
344 | set_cellbench_cache_path(".CellBenchCache")
345 | methods <- list(
346 |     method1 = cache_method(method1),
347 |     method2 = cache_method(method2)
348 | )
349 | ```
350 | 
351 | The function cache can be cleared using `clear_cellbench_cache()`. This will only work if the cache was set in the same session as it is cleared. Otherwise the cache folder will need to be located manually and deleted.
352 | 
353 | ```{r, eval = FALSE}
354 | # clears the cache set by set_cellbench_cache_path() in the same session
355 | clear_cellbench_cache()
356 | ```
357 | 
358 | ## Constructing functions with parameter range
359 | 
360 | CellBench provides a helper function `fn_arg_seq` to create a list of functions with varying parameters values, making it easy to search out the parameters space.
361 | 
362 | It takes a function as its first argument, then vectors of argument values with the name of the argument used by the function. A list of functions is returned with the specified argument filled in using each value in the vector. If multiple argument vectors are given then a vector of functions is returned with each combination of parameter values applied.
363 | 
364 | ```{r}
365 | # f is a function of three parameters
366 | f <- function(x, y, z) {
367 |     x + y + z
368 | }
369 | 
370 | # f_list is a list of functions with two of the parameters pre-filled
371 | f_list <- fn_arg_seq(f, y = 1:2, z = 3:4)
372 | 
373 | f_list
374 | ```
375 | 
376 | ```{r}
377 | names(f_list)[1]
378 | g <- f_list[[1]]
379 | g(10)
380 | 
381 | names(f_list)[2]
382 | h <- f_list[[2]]
383 | h(20)
384 | ```
385 | 
386 | # Summary
387 | 
388 | CellBench provides a lightweight and flexible framework for working with benchmarks that have multiple steps and result in combinatorial designs for application of methods. It makes use of simple and transparent R objects that are easy to understand and manipulate, using basic data and function list constructs as its input. The resulting tables are compatible with the popular `dplyr` manipulations and in general encourages a clean coding style that is easy to understand, debug and extend.
389 | 


--------------------------------------------------------------------------------