├── .github
    ├── .gitignore
    └── workflows
    │   ├── pkgdown.yaml
    │   └── test-coverage.yaml
├── vignettes
    ├── .gitignore
    ├── fig1.png
    ├── fig2.png
    ├── fig3.png
    ├── fig4.png
    ├── 02_compare-vignette.Rmd
    └── 01_plaid-vignette.Rmd
├── inst
    ├── logo
    │   └── logo.png
    ├── extdata
    │   └── pbmc3k-50cells.rda
    ├── scripts
    │   ├── hallmark.txt
    │   └── pbmc3k-50cells.R
    └── CITATION
├── man
    ├── figures
    │   └── logo.png
    ├── mat.rowsds.Rd
    ├── gset_ttest.Rd
    ├── matrix_onesample_ttest.Rd
    ├── sparse_colranks.Rd
    ├── chunked_crossprod.Rd
    ├── cor_sparse_matrix.Rd
    ├── matrix_metap.Rd
    ├── fc_ttest.Rd
    ├── fc_ztest.Rd
    ├── normalize_medians.Rd
    ├── gset_averageCLR.Rd
    ├── mat2gmt.Rd
    ├── read.gmt.Rd
    ├── colranks.Rd
    ├── write.gmt.Rd
    ├── gmt2mat.Rd
    ├── gset.rankcor.Rd
    ├── replaid.sing.Rd
    ├── replaid.aucell.Rd
    ├── replaid.ucell.Rd
    ├── replaid.ssgsea.Rd
    ├── replaid.gsva.Rd
    ├── replaid.scse.Rd
    ├── dualGSEA.Rd
    └── plaid.Rd
├── .Rbuildignore
├── dev
    ├── environment.yml
    ├── extdata.R
    ├── Makefile
    ├── make_package.R
    └── test.R
├── tests
    ├── testthat.R
    └── testthat
    │   ├── test-bioc-utils.R
    │   ├── test-gmt-utils.R
    │   ├── test-plaid.R
    │   └── test-stats.R
├── NEWS
├── .gitignore
├── NAMESPACE
├── DESCRIPTION
├── README.md
└── R
    ├── bioc-utils.R
    ├── gmt-utils.R
    ├── stats.R
    └── plaid.R


/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | 


--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 | 


--------------------------------------------------------------------------------
/inst/logo/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigomics/plaid/HEAD/inst/logo/logo.png


--------------------------------------------------------------------------------
/vignettes/fig1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigomics/plaid/HEAD/vignettes/fig1.png


--------------------------------------------------------------------------------
/vignettes/fig2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigomics/plaid/HEAD/vignettes/fig2.png


--------------------------------------------------------------------------------
/vignettes/fig3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigomics/plaid/HEAD/vignettes/fig3.png


--------------------------------------------------------------------------------
/vignettes/fig4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigomics/plaid/HEAD/vignettes/fig4.png


--------------------------------------------------------------------------------
/man/figures/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigomics/plaid/HEAD/man/figures/logo.png


--------------------------------------------------------------------------------
/inst/extdata/pbmc3k-50cells.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigomics/plaid/HEAD/inst/extdata/pbmc3k-50cells.rda


--------------------------------------------------------------------------------
/inst/scripts/hallmark.txt:
--------------------------------------------------------------------------------
1 | hallmark.gmt data downloaded from:
2 | 
3 | https://www.gsea-msigdb.org/gsea/msigdb/human/genesets.jsp?collection=H


--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^LICENSE$
 2 | ^README\.Rmd$
 3 | ^Makefile$
 4 | ^dev$
 5 | ^[.].*$
 6 | ^.*/[.].*$
 7 | ^Meta$
 8 | ^doc$
 9 | ^experiments$
10 | ^plaid.BiocCheck$
11 | ^_pkgdown\.yml$
12 | ^docs$
13 | ^pkgdown$
14 | ^\.github$
15 | 


--------------------------------------------------------------------------------
/dev/environment.yml:
--------------------------------------------------------------------------------
 1 | name: plaid-env
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 |   - defaults
 6 | dependencies:
 7 |   - r-base=4.3
 8 |   - r-devtools
 9 |   - r-remotes
10 |   - r-matrix
11 |   - r-matrixstats
12 |   - r-rfast
13 |   - bioconductor-sparsematrixstats
14 |   - r-qlcmatrix
15 |   - bioconductor-biocstyle
16 |   - r-knitr
17 |   - r-rmarkdown
18 |   - r-testthat
19 | 


--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
1 | bibentry(
2 |   bibtype  = "Article",
3 |   title    = "PLAID: ultrafast single-sample gene set enrichment scoring",
4 |   author   = "Zito, Antonino and Escrib\u00e0 Montagut, Xavier and Scorici, Gabriela and Martinelli, Axel and Akhmedov, Murodzhon and Kwee, Ivo",
5 |   journal  = "Bioinformatics",
6 |   year     = "2025",
7 |   doi      = "10.1093/bioinformatics/btaf621"
8 | )
9 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
 1 | # This file is part of the standard setup for testthat.
 2 | # It is recommended that you do not modify it.
 3 | #
 4 | # Where should you do additional test configuration?
 5 | # Learn more about the roles of various files in:
 6 | # * https://r-pkgs.org/testing-design.html#sec-tests-files-overview
 7 | # * https://testthat.r-lib.org/articles/special-files.html
 8 | 
 9 | library(testthat)
10 | library(plaid)
11 | 
12 | test_check("plaid")
13 | 


--------------------------------------------------------------------------------
/man/mat.rowsds.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plaid.R
 3 | \name{mat.rowsds}
 4 | \alias{mat.rowsds}
 5 | \title{Calculate row standard deviations for matrix}
 6 | \usage{
 7 | mat.rowsds(X)
 8 | }
 9 | \arguments{
10 | \item{X}{Input matrix (can be sparse or dense)}
11 | }
12 | \value{
13 | Vector of row standard deviations.
14 | }
15 | \description{
16 | Calculate row standard deviations for matrix
17 | }
18 | 


--------------------------------------------------------------------------------
/dev/extdata.R:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | library(Seurat)
 4 | library(SeuratData)
 5 | SeuratData::InstallData("pbmc3k")
 6 | data("pbmc3k.final")
 7 | pbmc3k.final <- Seurat::UpdateSeuratObject(pbmc3k.final)
 8 | X <- pbmc3k.final[['RNA']]@data
 9 | y <- pbmc3k.final$seurat_annotations
10 | table(y)
11 | sel <- c(head(which(y=="B"),25), head(which(y=="Naive CD4 T"),25))
12 | X <- X[,sel]
13 | X <- X[rowSums(X)>0,]
14 | celltype <- c(rep("B",25),rep("T",25))
15 | save(X, celltype, file="../inst/extdata/pbmc3k-50cells.rda")
16 | 


--------------------------------------------------------------------------------
/inst/scripts/pbmc3k-50cells.R:
--------------------------------------------------------------------------------
 1 | library(Seurat)
 2 | library(SeuratData)
 3 | SeuratData::InstallData("pbmc3k")
 4 | data("pbmc3k.final")
 5 | pbmc3k.final <- Seurat::UpdateSeuratObject(pbmc3k.final)
 6 | X <- pbmc3k.final[['RNA']]@data
 7 | y <- pbmc3k.final$seurat_annotations
 8 | table(y)
 9 | sel <- c(head(which(y=="B"),25), head(which(y=="Naive CD4 T"),25))
10 | X <- X[,sel]
11 | X <- X[rowSums(X)>0,]
12 | celltype <- c(rep("B",25),rep("T",25))
13 | save(X, celltype, file="../inst/extdata/pbmc3k-50cells.rda")
14 | 


--------------------------------------------------------------------------------
/dev/Makefile:
--------------------------------------------------------------------------------
 1 | build: doc
 2 | 	R -e "devtools::build()"
 3 | 
 4 | doc: vignettes/*.Rmd R/*.R
 5 | 	R -e "devtools::document()"
 6 | 	R -e "devtools::build_vignettes()"
 7 | 
 8 | install: 
 9 | 	R CMD INSTALL .
10 | 
11 | check: clean
12 | 	R -e 'devtools::check()'
13 | 
14 | biocheck: clean
15 | 	R -e 'BiocCheck::BiocCheck(".")'
16 | 
17 | test:
18 | 	R -e "devtools::test()"
19 | 
20 | clean:
21 | 	rm -f `find . -name '.\#*' -o -name '\#*' -o -name '*~' -printf '"%p" '`
22 | 
23 | echo:
24 | 	echo `find . -name '.\#*' -o -name '\#*' -o -name '*~' -printf '"%p" '`
25 | 
26 | FORCE: ;
27 | 


--------------------------------------------------------------------------------
/man/gset_ttest.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/stats.R
 3 | \name{gset_ttest}
 4 | \alias{gset_ttest}
 5 | \title{Perform t-test on gene set scores}
 6 | \usage{
 7 | gset_ttest(gsetX, y)
 8 | }
 9 | \arguments{
10 | \item{gsetX}{Matrix of gene set scores with gene sets on rows and samples on columns}
11 | 
12 | \item{y}{Binary vector (0/1) indicating group membership}
13 | }
14 | \value{
15 | Data frame with columns: diff (difference in means), statistic (t-statistic),
16 | pvalue (p-value), and other t-test results.
17 | }
18 | \description{
19 | Perform t-test on gene set scores
20 | }
21 | 


--------------------------------------------------------------------------------
/man/matrix_onesample_ttest.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/stats.R
 3 | \name{matrix_onesample_ttest}
 4 | \alias{matrix_onesample_ttest}
 5 | \title{Perform one-sample t-test on matrix with gene sets}
 6 | \usage{
 7 | matrix_onesample_ttest(Fm, G)
 8 | }
 9 | \arguments{
10 | \item{Fm}{Vector of feature values (e.g., fold changes)}
11 | 
12 | \item{G}{Sparse matrix of gene sets with genes on rows and gene sets on columns}
13 | }
14 | \value{
15 | List containing mean, t-statistic, and p-value matrices.
16 | }
17 | \description{
18 | Perform one-sample t-test on matrix with gene sets
19 | }
20 | 


--------------------------------------------------------------------------------
/man/sparse_colranks.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plaid.R
 3 | \name{sparse_colranks}
 4 | \alias{sparse_colranks}
 5 | \title{Compute columm ranks for sparse matrix. Internally used by colranks()}
 6 | \usage{
 7 | sparse_colranks(X, signed = FALSE, ties.method = "average")
 8 | }
 9 | \arguments{
10 | \item{X}{Input matrix}
11 | 
12 | \item{signed}{Logical: use or not signed ranks}
13 | 
14 | \item{ties.method}{Character Choice of ties.method}
15 | }
16 | \value{
17 | Sparse matrix of columnwise ranks with same dimensions as input.
18 | }
19 | \description{
20 | Compute columm ranks for sparse matrix. Internally used by colranks()
21 | }
22 | 


--------------------------------------------------------------------------------
/man/chunked_crossprod.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plaid.R
 3 | \name{chunked_crossprod}
 4 | \alias{chunked_crossprod}
 5 | \title{Chunked computation of cross product}
 6 | \usage{
 7 | chunked_crossprod(x, y, chunk = NULL)
 8 | }
 9 | \arguments{
10 | \item{x}{Matrix First matrix for multiplication. Can be sparse.}
11 | 
12 | \item{y}{Matrix Second matrix for multiplication. Can be sparse.}
13 | 
14 | \item{chunk}{Integer Chunk size (max number of columns) for computation.}
15 | }
16 | \value{
17 | Matrix. Result of matrix cross product.
18 | }
19 | \description{
20 | Compute crossprod (t(x) \%*\% y) for very large y by computing in
21 | chunks.
22 | }
23 | 


--------------------------------------------------------------------------------
/NEWS:
--------------------------------------------------------------------------------
 1 | Changes in version 0.99.0
 2 | -------------------------
 3 | 
 4 | INITIAL RELEASE
 5 | 
 6 |   * plaid: ultra-fast single-sample enrichment scoring using rank-based 
 7 |     statistics
 8 |   
 9 |   * replaid methods: fast implementations of popular enrichment methods 
10 |     (ssGSEA, GSVA, singscore, UCell, AUCell, scSE)
11 |   
12 |   * dualGSEA: statistical testing for differential enrichment with dualGSEA
13 |   
14 |   * Full Bioconductor integration with SummarizedExperiment, 
15 |     SingleCellExperiment, and BiocSet support
16 |   
17 |   * GMT utilities for reading, writing, and converting gene set files
18 |   
19 |   * Performance optimizations including sparse matrix support and parallel 
20 |     processing
21 | 


--------------------------------------------------------------------------------
/man/cor_sparse_matrix.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/stats.R
 3 | \name{cor_sparse_matrix}
 4 | \alias{cor_sparse_matrix}
 5 | \title{Calculate sparse correlation matrix handling missing values}
 6 | \usage{
 7 | cor_sparse_matrix(G, mat)
 8 | }
 9 | \arguments{
10 | \item{G}{Sparse matrix containing gene sets}
11 | 
12 | \item{mat}{Matrix of values}
13 | }
14 | \value{
15 | Correlation matrix between G and mat
16 | }
17 | \description{
18 | Calculate sparse correlation matrix handling missing values
19 | }
20 | \details{
21 | If mat has no missing values, calculates correlation directly using corSparse.
22 | Otherwise computes column-wise correlations only using non-missing values.
23 | }
24 | 


--------------------------------------------------------------------------------
/man/matrix_metap.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/stats.R
 3 | \name{matrix_metap}
 4 | \alias{matrix_metap}
 5 | \title{Matrix version for combining p-values using fisher or stouffer
 6 | method. Much faster than doing metap::sumlog() and metap::sumz()}
 7 | \usage{
 8 | matrix_metap(plist, method = "stouffer")
 9 | }
10 | \arguments{
11 | \item{plist}{List of p-value vectors or matrix of p-values}
12 | 
13 | \item{method}{Method for combining p-values ("fisher"/"sumlog" or "stouffer"/"sumz")}
14 | }
15 | \value{
16 | Vector of combined p-values.
17 | }
18 | \description{
19 | Matrix version for combining p-values using fisher or stouffer
20 | method. Much faster than doing metap::sumlog() and metap::sumz()
21 | }
22 | 


--------------------------------------------------------------------------------
/man/fc_ttest.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/stats.R
 3 | \name{fc_ttest}
 4 | \alias{fc_ttest}
 5 | \title{T-test statistical testing of differentially enrichment}
 6 | \usage{
 7 | fc_ttest(fc, G, sort.by = "pvalue")
 8 | }
 9 | \arguments{
10 | \item{fc}{Vector of logFC values}
11 | 
12 | \item{G}{Sparse matrix of gene sets. Non-zero entry indicates
13 | gene/feature is part of gene sets. Features on rows, gene sets on
14 | columns.}
15 | 
16 | \item{sort.by}{Column name to sort results by ("pvalue", "gsetFC", or "none")}
17 | }
18 | \value{
19 | Data frame with columns: gsetFC (gene set fold change),
20 | pvalue (p-value from one-sample t-test), and qvalue (FDR-adjusted p-value).
21 | }
22 | \description{
23 | This function performs statistical testing for differential
24 | enrichment using plaid
25 | }
26 | 


--------------------------------------------------------------------------------
/man/fc_ztest.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/stats.R
 3 | \name{fc_ztest}
 4 | \alias{fc_ztest}
 5 | \title{Z-test statistical testing of differentially enrichment}
 6 | \usage{
 7 | fc_ztest(fc, G, zmat = FALSE, alpha = 0.5)
 8 | }
 9 | \arguments{
10 | \item{fc}{Vector of logFC values}
11 | 
12 | \item{G}{Sparse matrix of gene sets. Non-zero entry indicates
13 | gene/feature is part of gene sets. Features on rows, gene sets on
14 | columns.}
15 | 
16 | \item{zmat}{Logical indicating to return z-matrix}
17 | 
18 | \item{alpha}{Scalar weight for SD estimation. Default 0.5.}
19 | }
20 | \value{
21 | List with element: z_statistic (z-statistic from one-sample z-test),
22 | p_value (p-value from one-sample z-test), and zmat (z-matrix).
23 | }
24 | \description{
25 | This function performs statistical testing for differential
26 | enrichment using plaid
27 | }
28 | 


--------------------------------------------------------------------------------
/man/normalize_medians.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plaid.R
 3 | \name{normalize_medians}
 4 | \alias{normalize_medians}
 5 | \title{Normalize column medians of matrix}
 6 | \usage{
 7 | normalize_medians(x, ignore.zero = NULL)
 8 | }
 9 | \arguments{
10 | \item{x}{Input matrix}
11 | 
12 | \item{ignore.zero}{Logical indicating whether to ignore zeros to
13 | exclude for median calculation}
14 | }
15 | \value{
16 | Matrix with normalized column medians.
17 | }
18 | \description{
19 | This function normalizes the column medians of matrix x. It calls
20 | optimized functions from the matrixStats package.
21 | }
22 | \examples{
23 | # Create example matrix
24 | set.seed(123)
25 | x <- matrix(rnorm(100), nrow = 10, ncol = 10)
26 | x[1:3, 1:3] <- 0  # Add some zeros
27 | 
28 | # Normalize medians
29 | x_norm <- normalize_medians(x)
30 | head(x_norm)
31 | 
32 | }
33 | 


--------------------------------------------------------------------------------
/dev/make_package.R:
--------------------------------------------------------------------------------
 1 | ## Followings steps of "R Packages" book
 2 | ##
 3 | 
 4 | library(usethis)
 5 | library(devtools)
 6 | 
 7 | setwd("~/Playground/projects/plaid")
 8 | create_package("~/Playground/projects/plaid")
 9 | use_git()
10 | 
11 | use_r("plaid")
12 | use_gpl3_license()
13 | 
14 | usethis::use_vignette("plaid")
15 | 
16 | use_testthat()
17 | use_test("plaid")
18 | 
19 | ## List package dependencies
20 | use_package("Matrix")
21 | use_package("Rfast")
22 | use_package("matrixStats")
23 | use_package("methods")
24 | use_package("sparseMatrixStats")
25 | use_package("metap")
26 | 
27 | ## For vignette
28 | use_package("BiocStyle", type = "Suggests")
29 | #use_package("Seurat", type = "Suggests")
30 | #use_package("SeuratData", type = "Suggests")
31 | 
32 | use_readme_md()
33 | 
34 | use_logo("inst/logo/logo.png")
35 | 
36 | use_build_ignore(c("dev"))
37 | 
38 | check()
39 | install()
40 | 
41 | ## Reload
42 | load_all()
43 | 
44 | 
45 | 
46 | 
47 | 


--------------------------------------------------------------------------------
/man/gset_averageCLR.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/stats.R
 3 | \name{gset_averageCLR}
 4 | \alias{gset_averageCLR}
 5 | \title{Compute geneset expression as the average log-ration of genes in
 6 | the geneset. Requires log-expression matrix X and (sparse) geneset
 7 | matrix matG.}
 8 | \usage{
 9 | gset_averageCLR(X, matG, center = TRUE)
10 | }
11 | \arguments{
12 | \item{X}{Log-expression matrix with genes on rows and samples on columns}
13 | 
14 | \item{matG}{Sparse gene set matrix with genes on rows and gene sets on columns}
15 | 
16 | \item{center}{Logical indicating whether to center the results}
17 | }
18 | \value{
19 | Matrix of gene set expression scores with gene sets on rows and samples on columns.
20 | }
21 | \description{
22 | Compute geneset expression as the average log-ration of genes in
23 | the geneset. Requires log-expression matrix X and (sparse) geneset
24 | matrix matG.
25 | }
26 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # History files
 2 | .Rhistory
 3 | .Rapp.history
 4 | 
 5 | # Session Data files
 6 | .RData
 7 | .RDataTmp
 8 | 
 9 | # User-specific files
10 | .Ruserdata
11 | 
12 | # Example code in package build process
13 | *-Ex.R
14 | 
15 | # Output files from R CMD build
16 | /*.tar.gz
17 | 
18 | # Output files from R CMD check
19 | /*.Rcheck/
20 | 
21 | # RStudio files
22 | .Rproj.user/
23 | 
24 | # produced vignettes
25 | vignettes/*.html
26 | vignettes/*.pdf
27 | 
28 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
29 | .httr-oauth
30 | 
31 | # knitr and R markdown default cache directories
32 | *_cache/
33 | /cache/
34 | /Meta/
35 | /doc/
36 | 
37 | # Temporary files created by R markdown
38 | *.utf8.md
39 | *.knit.md
40 | 
41 | # R Environment Variables
42 | .Renviron
43 | 
44 | # pkgdown site
45 | docs/
46 | 
47 | # translation temp files
48 | po/*~
49 | 
50 | # RStudio Connect folder
51 | rsconnect/
52 | 
53 | # Backup files
54 | *~
55 | *.BAK
56 | *.tmp
57 | *.save
58 | \#*
59 | 
60 | # Output
61 | *.out
62 | *.Rout
63 | *.tmp
64 | *.pdf
65 | docs
66 | 


--------------------------------------------------------------------------------
/man/mat2gmt.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/gmt-utils.R
 3 | \name{mat2gmt}
 4 | \alias{mat2gmt}
 5 | \title{Convert Binary Matrix to GMT}
 6 | \usage{
 7 | mat2gmt(mat)
 8 | }
 9 | \arguments{
10 | \item{mat}{Matrix with non-zero entries representing genes in each gene set.
11 | Rows represent genes and columns represent gene sets.}
12 | }
13 | \value{
14 | A list of vector representing each gene set. Each list
15 | element correspond to a gene set and is a vector of genes
16 | }
17 | \description{
18 | Convert binary matrix to a GMT (Gene Matrix Transposed) list.
19 | The binary matrix indicates presence or absence of genes in each gene set.
20 | Rows represent genes and columns represent gene sets.
21 | }
22 | \examples{
23 | # Create example binary matrix
24 | mat <- matrix(0, nrow = 6, ncol = 3)
25 | rownames(mat) <- paste0("GENE", 1:6)
26 | colnames(mat) <- paste0("Pathway", 1:3)
27 | mat[1:3, 1] <- 1  # Pathway1: GENE1, GENE2, GENE3
28 | mat[c(2,4,5), 2] <- 1  # Pathway2: GENE2, GENE4, GENE5
29 | mat[c(1,5,6), 3] <- 1  # Pathway3: GENE1, GENE5, GENE6
30 | 
31 | # Convert to GMT list
32 | gmt <- mat2gmt(mat)
33 | print(gmt)
34 | 
35 | }
36 | 


--------------------------------------------------------------------------------
/man/read.gmt.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/gmt-utils.R
 3 | \name{read.gmt}
 4 | \alias{read.gmt}
 5 | \title{Read GMT File}
 6 | \usage{
 7 | read.gmt(gmt.file, dir = NULL, add.source = FALSE, nrows = -1)
 8 | }
 9 | \arguments{
10 | \item{gmt.file}{Path to GMT file.}
11 | 
12 | \item{dir}{(Optional) The directory where the GMT file is located.}
13 | 
14 | \item{add.source}{(optional) Include the source information in the gene sets' names.}
15 | 
16 | \item{nrows}{(optional) Number of rows to read from the GMT file.}
17 | }
18 | \value{
19 | A list of gene sets: each gene set is represented as a character vector of gene names.
20 | }
21 | \description{
22 | Read data from a GMT file (Gene Matrix Transposed).
23 | The GMT format is commonly used to store gene sets or gene annotations.
24 | }
25 | \examples{
26 | \donttest{
27 | # Read GMT file (requires file to exist)
28 | gmt_file <- system.file("extdata", "hallmarks.gmt", package = "plaid")
29 | if (file.exists(gmt_file)) {
30 |   gmt <- read.gmt(gmt_file)
31 |   print(names(gmt))
32 |   print(head(gmt[[1]]))
33 |   
34 |   # Read with source information
35 |   gmt_with_source <- read.gmt(gmt_file, add.source = TRUE)
36 |   print(head(names(gmt_with_source)))
37 | }
38 | }
39 | 
40 | }
41 | 


--------------------------------------------------------------------------------
/man/colranks.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plaid.R
 3 | \name{colranks}
 4 | \alias{colranks}
 5 | \title{Compute columnwise ranks of matrix}
 6 | \usage{
 7 | colranks(
 8 |   X,
 9 |   sparse = NULL,
10 |   signed = FALSE,
11 |   keep.zero = FALSE,
12 |   ties.method = "average"
13 | )
14 | }
15 | \arguments{
16 | \item{X}{Input matrix}
17 | 
18 | \item{sparse}{Logical indicating to use sparse methods}
19 | 
20 | \item{signed}{Logical indicating using signed ranks}
21 | 
22 | \item{keep.zero}{Logical indicating whether to keep zero as ranked zero}
23 | 
24 | \item{ties.method}{Character Choice of ties.method}
25 | }
26 | \value{
27 | Matrix of columnwise ranks with same dimensions as input.
28 | }
29 | \description{
30 | Computes columnwise rank of matrix. Can be sparse. Tries to call
31 | optimized functions from Rfast or matrixStats.
32 | }
33 | \examples{
34 | # Create example matrix
35 | set.seed(123)
36 | X <- matrix(rnorm(100), nrow = 10, ncol = 10)
37 | rownames(X) <- paste0("Gene", 1:10)
38 | colnames(X) <- paste0("Sample", 1:10)
39 | 
40 | # Compute column ranks
41 | ranks <- colranks(X)
42 | print(ranks[1:5, 1:5])
43 | 
44 | # Compute signed ranks
45 | signed_ranks <- colranks(X, signed = TRUE)
46 | print(signed_ranks[1:5, 1:5])
47 | 
48 | }
49 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(colranks)
 4 | export(dualGSEA)
 5 | export(gmt2mat)
 6 | export(gset.rankcor)
 7 | export(mat2gmt)
 8 | export(normalize_medians)
 9 | export(plaid)
10 | export(read.gmt)
11 | export(replaid.aucell)
12 | export(replaid.gsva)
13 | export(replaid.scse)
14 | export(replaid.sing)
15 | export(replaid.ssgsea)
16 | export(replaid.ucell)
17 | export(write.gmt)
18 | importFrom(BiocSet,es_elementset)
19 | importFrom(Matrix,Diagonal)
20 | importFrom(Matrix,Matrix)
21 | importFrom(Matrix,colScale)
22 | importFrom(Matrix,colSums)
23 | importFrom(Matrix,crossprod)
24 | importFrom(Matrix,rowMeans)
25 | importFrom(Matrix,rowSums)
26 | importFrom(Matrix,t)
27 | importFrom(Matrix,which)
28 | importFrom(Rfast,ttests)
29 | importFrom(SummarizedExperiment,assay)
30 | importFrom(SummarizedExperiment,assayNames)
31 | importFrom(matrixStats,colMedians)
32 | importFrom(matrixStats,colRanks)
33 | importFrom(methods,as)
34 | importFrom(methods,is)
35 | importFrom(parallel,mclapply)
36 | importFrom(qlcMatrix,corSparse)
37 | importFrom(sparseMatrixStats,colRanks)
38 | importFrom(stats,ecdf)
39 | importFrom(stats,p.adjust)
40 | importFrom(stats,pchisq)
41 | importFrom(stats,pnorm)
42 | importFrom(stats,pt)
43 | importFrom(stats,qnorm)
44 | importFrom(stats,var)
45 | importFrom(utils,head)
46 | importFrom(utils,read.csv)
47 | 


--------------------------------------------------------------------------------
/dev/test.R:
--------------------------------------------------------------------------------
 1 | ##library("plaid")
 2 | library(devtools)
 3 | load_all()
 4 | 
 5 | library(Seurat)
 6 | library(SeuratData)
 7 | data("pbmc3k.final")
 8 | pbmc3k.final <- Seurat::UpdateSeuratObject(pbmc3k.final)
 9 | X <- pbmc3k.final[['RNA']]@data
10 | dim(X)
11 | X <- do.call(cbind, rep(list(X),10))
12 | 
13 | gmt <- read.gmt(system.file("extdata", "hallmarks.gmt", package = "plaid"))
14 | matG <- gmt2mat(gmt)
15 | dim(matG)
16 | 
17 | ## run plaid
18 | peakRAM::peakRAM( gsetX <- plaid(X, matG, normalize=TRUE) )
19 | dim(gsetX)
20 | 
21 | ## good practice to normalize median
22 | peakRAM::peakRAM( gsetX <- normalize_medians(gsetX) )
23 | peakRAM::peakRAM( Rfast::colMedians(gsetX))
24 | peakRAM::peakRAM( matrixStats::colMedians(gsetX))
25 | peakRAM::peakRAM( apply(gsetX,2,stats::median,na.rm=TRUE))
26 | 
27 | 
28 | ## do test
29 | celltype <- pbmc3k.final$seurat_annotations
30 | y <- (celltype == "B")
31 | res <- plaid.test(X, y, G=matG, gsetX=gsetX)
32 | head(res)
33 | 
34 | pairs(res[,-1])
35 | pairs(-log10(res[,-1]))
36 | 
37 | head(res[order(res[,2]),1])
38 | head(res[order(res[,3]),1])
39 | head(res[order(res[,4]),1])
40 | head(res[order(res[,5]),1])
41 | 
42 | fc <- rowMeans(X[,y==1]) - rowMeans(X[,y==0])
43 | res.fgsea <- fgsea::fgsea( gmt, fc)
44 | head(res.fgsea)
45 | nes <- res.fgsea$NES
46 | p.nes <- res.fgsea$pval
47 | names(nes)=names(p.nes)=res.fgsea$pathway
48 | 
49 | 


--------------------------------------------------------------------------------
/man/write.gmt.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/gmt-utils.R
 3 | \name{write.gmt}
 4 | \alias{write.gmt}
 5 | \title{Write GMT File}
 6 | \usage{
 7 | write.gmt(gmt, file, source = NA)
 8 | }
 9 | \arguments{
10 | \item{gmt}{A list of gene sets in GMT format: each gene set is represented as a vector of gene names.}
11 | 
12 | \item{file}{The file path to write the GMT file.}
13 | 
14 | \item{source}{A character vector specifying the source of each gene set.
15 | If not provided, the names of the gene sets are used as the source.}
16 | }
17 | \value{
18 | Does not return anything.
19 | }
20 | \description{
21 | Write gene sets to GMT file (Gene Matrix Transposed).
22 | The GMT format is commonly used to store gene sets or gene annotations.
23 | }
24 | \examples{
25 | # Create example GMT data
26 | gmt <- list(
27 |   "Pathway1" = c("GENE1", "GENE2", "GENE3"),
28 |   "Pathway2" = c("GENE2", "GENE4", "GENE5"),
29 |   "Pathway3" = c("GENE1", "GENE5", "GENE6")
30 | )
31 | 
32 | \donttest{
33 | # Write to GMT file (creates file in temp directory)
34 | temp_file <- tempfile(fileext = ".gmt")
35 | write.gmt(gmt, temp_file)
36 | 
37 | # Write with custom source information
38 | temp_file2 <- tempfile(fileext = ".gmt")
39 | write.gmt(gmt, temp_file2, source = c("DB1", "DB2", "DB3"))
40 | 
41 | # Clean up
42 | unlink(c(temp_file, temp_file2))
43 | }
44 | 
45 | }
46 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: plaid
 2 | Title: PLAID ultrafast gene set enrichment scoring
 3 | Version: 0.99.16
 4 | Authors@R: c(
 5 |     person("Ivo", "Kwee", , "ivo.kwee@bigomics.ch", role = "aut",
 6 |         comment = c(ORCID = "0000-0002-2751-4218")),
 7 |     person("Antonino", "Zito", , "antonino.zito@bigomics.ch", role = "cre",
 8 |         comment = c(ORCID = "0000-0003-1931-984X"))
 9 |   )
10 | Description: PLAID (Pathway Level Average Intensity Detection) is an ultra-fast method to compute single-sample enrichment scores for gene expression or proteomics data. For each sample, plaid computes the gene set score as the average intensity of the genes/proteins in the gene set. The output is a gene set score matrix suitable for further analyses.
11 | License: GPL-3
12 | URL: https://github.com/bigomics/plaid, https://bigomics.github.io/plaid/
13 | BugReports: https://github.com/bigomics/plaid/issues
14 | Encoding: UTF-8
15 | Roxygen: list(markdown = TRUE)
16 | RoxygenNote: 7.3.3
17 | Suggests: 
18 |     BiocStyle,
19 |     knitr,
20 |     rmarkdown,
21 |     testthat (>= 3.0.0)
22 | Config/testthat/edition: 3
23 | biocViews: GeneSetEnrichment, GeneExpression, Proteomics
24 | Depends: R (>= 4.3.3)
25 | Imports: 
26 |     Matrix,
27 |     matrixStats,
28 |     methods,
29 |     parallel,
30 |     Rfast,
31 |     sparseMatrixStats,
32 |     qlcMatrix,
33 |     GSVA,
34 |     fgsea,
35 |     SummarizedExperiment,
36 |     BiocSet,
37 |     stats,
38 |     utils
39 | VignetteBuilder: knitr
40 | 


--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | on:
 4 |   push:
 5 |     branches: [main, master]
 6 |   pull_request:
 7 |   release:
 8 |     types: [published]
 9 |   workflow_dispatch:
10 | 
11 | name: pkgdown.yaml
12 | 
13 | permissions: read-all
14 | 
15 | jobs:
16 |   pkgdown:
17 |     runs-on: ubuntu-latest
18 |     # Only restrict concurrency for non-PR jobs
19 |     concurrency:
20 |       group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
21 |     env:
22 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
23 |     permissions:
24 |       contents: write
25 |     steps:
26 |       - uses: actions/checkout@v4
27 | 
28 |       - uses: r-lib/actions/setup-pandoc@v2
29 | 
30 |       - uses: r-lib/actions/setup-r@v2
31 |         with:
32 |           use-public-rspm: true
33 | 
34 |       - uses: r-lib/actions/setup-r-dependencies@v2
35 |         with:
36 |           extra-packages: any::pkgdown, local::.
37 |           needs: website
38 | 
39 |       - name: Build site
40 |         run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
41 |         shell: Rscript {0}
42 | 
43 |       - name: Deploy to GitHub pages 🚀
44 |         if: github.event_name != 'pull_request'
45 |         uses: JamesIves/github-pages-deploy-action@v4.5.0
46 |         with:
47 |           clean: false
48 |           branch: gh-pages
49 |           folder: docs
50 | 


--------------------------------------------------------------------------------
/.github/workflows/test-coverage.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | on:
 4 |   workflow_dispatch:
 5 |   schedule:
 6 |     - cron: "0 0 * * *"
 7 | 
 8 | name: test-coverage
 9 | 
10 | jobs:
11 |   test-coverage:
12 |     runs-on: ubuntu-latest
13 |     env:
14 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
15 |       CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
16 | 
17 |     steps:
18 |       - uses: actions/checkout@v3
19 | 
20 |       - uses: r-lib/actions/setup-r@v2
21 |         with:
22 |           use-public-rspm: true
23 | 
24 |       - uses: r-lib/actions/setup-r-dependencies@v2
25 |         with:
26 |           extra-packages: any::covr
27 |           needs: coverage
28 | 
29 |       - name: Test coverage
30 |         run: |
31 |           covr::codecov(
32 |             quiet = FALSE,
33 |             clean = FALSE,
34 |             install_path = file.path(Sys.getenv("RUNNER_TEMP"), "package"),
35 |             token = Sys.getenv("CODECOV_TOKEN")
36 |           )
37 |         shell: Rscript {0}
38 | 
39 |       - name: Show testthat output
40 |         if: always()
41 |         run: |
42 |           ## --------------------------------------------------------------------
43 |           find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true
44 |         shell: bash
45 | 
46 |       - name: Upload test results
47 |         if: failure()
48 |         uses: actions/upload-artifact@v4
49 |         with:
50 |           name: coverage-test-failures
51 |           path: ${{ runner.temp }}/package


--------------------------------------------------------------------------------
/man/gmt2mat.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/gmt-utils.R
 3 | \name{gmt2mat}
 4 | \alias{gmt2mat}
 5 | \title{Convert GMT to Binary Matrix}
 6 | \usage{
 7 | gmt2mat(
 8 |   gmt,
 9 |   max.genes = -1,
10 |   ntop = -1,
11 |   sparse = TRUE,
12 |   bg = NULL,
13 |   use.multicore = TRUE
14 | )
15 | }
16 | \arguments{
17 | \item{gmt}{List representing the GMT file: each element is a character vector representing a gene set.}
18 | 
19 | \item{max.genes}{Max number of genes to include in the binary matrix. Default = -1 to include all genes.}
20 | 
21 | \item{ntop}{Number of top genes to consider for each gene set. Default = -1 to include all genes.}
22 | 
23 | \item{sparse}{Logical: create a sparse matrix. Default \code{TRUE}. If \code{FALSE} creates a dense matrix.}
24 | 
25 | \item{bg}{Character vector of background gene set. Default \code{NULL} to consider all unique genes.}
26 | 
27 | \item{use.multicore}{Logical: use parallel processing ('parallel' R package). Default \code{TRUE}.}
28 | }
29 | \value{
30 | A binary matrix representing the presence or absence of genes in each gene set.
31 | Rows correspond to genes, and columns correspond to gene sets.
32 | }
33 | \description{
34 | Convert a GMT file (Gene Matrix Transposed) to a binary matrix,
35 | where rows represent genes and columns represent gene sets.
36 | The binary matrix indicates presence or absence of genes in a gene set.
37 | }
38 | \examples{
39 | # Create example GMT data
40 | gmt <- list(
41 |   "Pathway1" = c("GENE1", "GENE2", "GENE3"),
42 |   "Pathway2" = c("GENE2", "GENE4", "GENE5"),
43 |   "Pathway3" = c("GENE1", "GENE5", "GENE6")
44 | )
45 | 
46 | # Convert to binary matrix
47 | mat <- gmt2mat(gmt)
48 | print(mat)
49 | 
50 | # Create dense matrix instead of sparse
51 | mat_dense <- gmt2mat(gmt, sparse = FALSE)
52 | print(mat_dense)
53 | }
54 | 


--------------------------------------------------------------------------------
/man/gset.rankcor.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/stats.R
 3 | \name{gset.rankcor}
 4 | \alias{gset.rankcor}
 5 | \title{Calculate gene set rank correlation}
 6 | \usage{
 7 | gset.rankcor(rnk, gset, compute.p = FALSE, use.rank = TRUE)
 8 | }
 9 | \arguments{
10 | \item{rnk}{Numeric vector or matrix of gene ranks, with genes as row names}
11 | 
12 | \item{gset}{Numeric matrix of gene sets, with genes as row/column names}
13 | 
14 | \item{compute.p}{Logical indicating whether to compute p-values}
15 | 
16 | \item{use.rank}{Logical indicating whether to rank transform rnk before correlation}
17 | }
18 | \value{
19 | Named list with components:
20 | \itemize{
21 | \item rho - Matrix of correlation coefficients between rnk and gset
22 | \item p.value - Matrix of p-values for correlation (if compute.p = TRUE)
23 | \item q.value - Matrix of FDR adjusted p-values (if compute.p = TRUE)
24 | }
25 | }
26 | \description{
27 | Compute rank correlation between a gene rank vector/matrix and gene sets
28 | }
29 | \details{
30 | This function calculates sparse rank correlation between rnk and each
31 | column of gset using \code{qlcMatrix::corSparse()}. It handles missing values in
32 | rnk by computing column-wise correlations.
33 | 
34 | P-values are computed from statistical distribution
35 | }
36 | \examples{
37 | # Create example rank vector
38 | set.seed(123)
39 | ranks <- rnorm(100)
40 | names(ranks) <- paste0("GENE", 1:100)
41 | 
42 | # Create example gene sets as sparse matrix
43 | gmt <- list(
44 |   "Pathway1" = paste0("GENE", 1:20),
45 |   "Pathway2" = paste0("GENE", 15:35),
46 |   "Pathway3" = paste0("GENE", 30:50)
47 | )
48 | genesets <- gmt2mat(gmt)
49 | 
50 | # Calculate rank correlation
51 | result <- gset.rankcor(ranks, genesets, compute.p = TRUE)
52 | print(result$rho)
53 | print(result$p.value)
54 | 
55 | }
56 | 


--------------------------------------------------------------------------------
/man/replaid.sing.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plaid.R
 3 | \name{replaid.sing}
 4 | \alias{replaid.sing}
 5 | \title{Fast calculation of singscore}
 6 | \usage{
 7 | replaid.sing(X, matG, assay = "logcounts", min.genes = 5, max.genes = 500)
 8 | }
 9 | \arguments{
10 | \item{X}{Gene or protein expression matrix. Generally log
11 | transformed. See details. Genes on rows, samples on columns.
12 | Also accepts SummarizedExperiment or SingleCellExperiment objects.}
13 | 
14 | \item{matG}{Gene sets sparse matrix. Genes on rows, gene sets on
15 | columns. Also accepts BiocSet objects or GMT lists.}
16 | 
17 | \item{assay}{Character: assay name for Bioconductor objects. Default "logcounts".}
18 | 
19 | \item{min.genes}{Integer: minimum genes per gene set. Default 5.}
20 | 
21 | \item{max.genes}{Integer: maximum genes per gene set. Default 500.}
22 | }
23 | \value{
24 | Matrix of single-sample singscore enrichment scores.
25 | Gene sets on rows, samples on columns.
26 | }
27 | \description{
28 | Calculates single-sample enrichment singscore
29 | (Fouratan et al., 2018) using plaid back-end. The computation is
30 | 10-100x faster than the original code.
31 | }
32 | \details{
33 | Computing the singscore requires to compute the ranks of
34 | the expression matrix. We have wrapped this in a single
35 | convenience function.
36 | 
37 | We have extensively compared the results of \code{replaid.sing} and from
38 | the original \code{singscore} R package and we showed identical result
39 | in the score, logFC and p-values.
40 | }
41 | \examples{
42 | # Create example expression matrix
43 | set.seed(123)
44 | X <- matrix(rnorm(500), nrow = 50, ncol = 10)
45 | rownames(X) <- paste0("GENE", 1:50)
46 | colnames(X) <- paste0("Sample", 1:10)
47 | 
48 | # Create example gene sets
49 | gmt <- list(
50 |   "Pathway1" = paste0("GENE", 1:15),
51 |   "Pathway2" = paste0("GENE", 10:25)
52 | )
53 | matG <- gmt2mat(gmt)
54 | 
55 | # Compute singscore
56 | scores <- replaid.sing(X, matG)
57 | print(scores[1:2, 1:5])
58 | 
59 | }
60 | 


--------------------------------------------------------------------------------
/man/replaid.aucell.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plaid.R
 3 | \name{replaid.aucell}
 4 | \alias{replaid.aucell}
 5 | \title{Fast calculation of AUCell}
 6 | \usage{
 7 | replaid.aucell(
 8 |   X,
 9 |   matG,
10 |   aucMaxRank = NULL,
11 |   assay = "logcounts",
12 |   min.genes = 5,
13 |   max.genes = 500
14 | )
15 | }
16 | \arguments{
17 | \item{X}{Gene or protein expression matrix. Generally log
18 | transformed. See details. Genes on rows, samples on columns.
19 | Also accepts SummarizedExperiment or SingleCellExperiment objects.}
20 | 
21 | \item{matG}{Gene sets sparse matrix. Genes on rows, gene sets on columns.
22 | Also accepts BiocSet objects or GMT lists.}
23 | 
24 | \item{aucMaxRank}{Rank threshold (see AUCell paper). Default aucMaxRank = 0.05*nrow(X).}
25 | 
26 | \item{assay}{Character: assay name for Bioconductor objects. Default "logcounts".}
27 | 
28 | \item{min.genes}{Integer: minimum genes per gene set. Default 5.}
29 | 
30 | \item{max.genes}{Integer: maximum genes per gene set. Default 500.}
31 | }
32 | \value{
33 | Matrix of single-sample AUCell enrichment scores.
34 | Gene sets on rows, samples on columns.
35 | }
36 | \description{
37 | Calculates single-sample enrichment AUCell (Aibar
38 | et al., 2017) using plaid back-end. The computation is
39 | 10-100x faster than the original code.
40 | }
41 | \details{
42 | Computing the AUCell score requires to compute the ranks
43 | of the expression matrix and approximating the AUC of a gene
44 | set. We have wrapped this in a single convenience function.
45 | 
46 | We have extensively compared the results of \code{replaid.aucell} and
47 | from the original \code{AUCell} R package and we showed good concordance
48 | of results in the score, logFC and p-values.
49 | }
50 | \examples{
51 | # Create example expression matrix
52 | set.seed(123)
53 | X <- matrix(rnorm(500), nrow = 50, ncol = 10)
54 | rownames(X) <- paste0("GENE", 1:50)
55 | colnames(X) <- paste0("Sample", 1:10)
56 | 
57 | # Create example gene sets
58 | gmt <- list(
59 |   "Pathway1" = paste0("GENE", 1:15),
60 |   "Pathway2" = paste0("GENE", 10:25)
61 | )
62 | matG <- gmt2mat(gmt)
63 | 
64 | # Compute AUCell scores
65 | scores <- replaid.aucell(X, matG)
66 | print(scores[1:2, 1:5])
67 | 
68 | }
69 | 


--------------------------------------------------------------------------------
/man/replaid.ucell.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plaid.R
 3 | \name{replaid.ucell}
 4 | \alias{replaid.ucell}
 5 | \title{Fast calculation of UCell}
 6 | \usage{
 7 | replaid.ucell(
 8 |   X,
 9 |   matG,
10 |   rmax = 1500,
11 |   assay = "logcounts",
12 |   min.genes = 5,
13 |   max.genes = 500
14 | )
15 | }
16 | \arguments{
17 | \item{X}{Gene or protein expression matrix. Generally log
18 | transformed. See details. Genes on rows, samples on columns.
19 | Also accepts SummarizedExperiment or SingleCellExperiment objects.}
20 | 
21 | \item{matG}{Gene sets sparse matrix. Genes on rows, gene sets on columns.
22 | Also accepts BiocSet objects or GMT lists.}
23 | 
24 | \item{rmax}{Rank threshold (see Ucell paper). Default rmax = 1500.}
25 | 
26 | \item{assay}{Character: assay name for Bioconductor objects. Default "logcounts".}
27 | 
28 | \item{min.genes}{Integer: minimum genes per gene set. Default 5.}
29 | 
30 | \item{max.genes}{Integer: maximum genes per gene set. Default 500.}
31 | }
32 | \value{
33 | Matrix of single-sample UCell enrichment scores.
34 | Gene sets on rows, samples on columns.
35 | }
36 | \description{
37 | Calculates single-sample enrichment UCell (Andreatta
38 | et al., 2021) using plaid back-end. The computation is
39 | 10-100x faster than the original code.
40 | }
41 | \details{
42 | Computing ssGSEA score requires to compute the ranks of
43 | the expression matrix and truncation of the ranks. We have wrapped
44 | this in a single convenience function.
45 | 
46 | We have extensively compared the results of \code{replaid.ucell} and
47 | from the original \code{UCell} R package and we showed near exacct
48 | results in the score, logFC and p-values.
49 | }
50 | \examples{
51 | # Create example expression matrix
52 | set.seed(123)
53 | X <- matrix(rnorm(500), nrow = 50, ncol = 10)
54 | rownames(X) <- paste0("GENE", 1:50)
55 | colnames(X) <- paste0("Sample", 1:10)
56 | 
57 | # Create example gene sets
58 | gmt <- list(
59 |   "Pathway1" = paste0("GENE", 1:15),
60 |   "Pathway2" = paste0("GENE", 10:25)
61 | )
62 | matG <- gmt2mat(gmt)
63 | 
64 | # Compute UCell scores (default rmax = 1500)
65 | scores <- replaid.ucell(X, matG)
66 | print(scores[1:2, 1:5])
67 | 
68 | # Compute UCell scores with custom rmax
69 | scores_custom <- replaid.ucell(X, matG, rmax = 1000)
70 | print(scores_custom[1:2, 1:5])
71 | 
72 | }
73 | 


--------------------------------------------------------------------------------
/man/replaid.ssgsea.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plaid.R
 3 | \name{replaid.ssgsea}
 4 | \alias{replaid.ssgsea}
 5 | \title{Fast calculation of ssGSEA}
 6 | \usage{
 7 | replaid.ssgsea(
 8 |   X,
 9 |   matG,
10 |   alpha = 0,
11 |   assay = "logcounts",
12 |   min.genes = 5,
13 |   max.genes = 500
14 | )
15 | }
16 | \arguments{
17 | \item{X}{Gene or protein expression matrix. Generally log
18 | transformed. See details. Genes on rows, samples on columns.
19 | Also accepts SummarizedExperiment or SingleCellExperiment objects.}
20 | 
21 | \item{matG}{Gene sets sparse matrix. Genes on rows, gene sets on
22 | columns. Also accepts BiocSet objects or GMT lists.}
23 | 
24 | \item{alpha}{Weighting factor for exponential weighting of ranks}
25 | 
26 | \item{assay}{Character: assay name for Bioconductor objects. Default "logcounts".}
27 | 
28 | \item{min.genes}{Integer: minimum genes per gene set. Default 5.}
29 | 
30 | \item{max.genes}{Integer: maximum genes per gene set. Default 500.}
31 | }
32 | \value{
33 | Matrix of single-sample ssGSEA enrichment scores.
34 | Gene sets on rows, samples on columns.
35 | }
36 | \description{
37 | Calculates single-sample enrichment singscore (Barbie
38 | et al., 2009; Hänzelmann et al., 2013) using plaid back-end. The
39 | computation is 10-100x faster than the original code.
40 | }
41 | \details{
42 | Computing ssGSEA score requires to compute the ranks of
43 | the expression matrix and weighting of the ranks. We have wrapped
44 | this in a single convenience function.
45 | 
46 | We have extensively compared the results of \code{replaid.ssgsea} and
47 | from the original \code{GSVA} R package and we showed highly similar
48 | results in the score, logFC and p-values. For alpha=0 we obtain
49 | exact results, for alpha>0 the results are highly similar but not
50 | exactly the same.
51 | }
52 | \examples{
53 | # Create example expression matrix
54 | set.seed(123)
55 | X <- matrix(rnorm(500), nrow = 50, ncol = 10)
56 | rownames(X) <- paste0("GENE", 1:50)
57 | colnames(X) <- paste0("Sample", 1:10)
58 | 
59 | # Create example gene sets
60 | gmt <- list(
61 |   "Pathway1" = paste0("GENE", 1:15),
62 |   "Pathway2" = paste0("GENE", 10:25)
63 | )
64 | matG <- gmt2mat(gmt)
65 | 
66 | # Compute ssGSEA scores (alpha = 0)
67 | scores <- replaid.ssgsea(X, matG, alpha = 0)
68 | print(scores[1:2, 1:5])
69 | 
70 | # Compute ssGSEA scores with weighting (alpha = 0.25)
71 | scores_weighted <- replaid.ssgsea(X, matG, alpha = 0.25)
72 | print(scores_weighted[1:2, 1:5])
73 | 
74 | }
75 | 


--------------------------------------------------------------------------------
/man/replaid.gsva.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plaid.R
 3 | \name{replaid.gsva}
 4 | \alias{replaid.gsva}
 5 | \title{Fast approximation of GSVA}
 6 | \usage{
 7 | replaid.gsva(
 8 |   X,
 9 |   matG,
10 |   tau = 0,
11 |   rowtf = c("z", "ecdf")[1],
12 |   assay = "logcounts",
13 |   min.genes = 5,
14 |   max.genes = 500
15 | )
16 | }
17 | \arguments{
18 | \item{X}{Gene or protein expression matrix. Generally log
19 | transformed. See details. Genes on rows, samples on columns.
20 | Also accepts SummarizedExperiment or SingleCellExperiment objects.}
21 | 
22 | \item{matG}{Gene sets sparse matrix. Genes on rows, gene sets on
23 | columns. Also accepts BiocSet objects or GMT lists.}
24 | 
25 | \item{tau}{Rank weight parameter (see GSVA publication). Default
26 | tau=0.}
27 | 
28 | \item{rowtf}{Row transformation method ("z" or "ecdf"). Default "z".}
29 | 
30 | \item{assay}{Character: assay name for Bioconductor objects. Default "logcounts".}
31 | 
32 | \item{min.genes}{Integer: minimum genes per gene set. Default 5.}
33 | 
34 | \item{max.genes}{Integer: maximum genes per gene set. Default 500.}
35 | }
36 | \value{
37 | Matrix of single-sample GSVA enrichment scores.
38 | Gene sets on rows, samples on columns.
39 | }
40 | \description{
41 | Calculates single-sample enrichment GSVA (Hänzelmann
42 | et al., 2013) using plaid back-end. The computation is
43 | 10-100x faster than the original code.
44 | }
45 | \details{
46 | Computing the GSVA score requires to compute the CDF of
47 | the expression matrix, ranking and scoring the genesets. We have
48 | wrapped this in a single convenience function.
49 | 
50 | We have extensively compared the results of \code{replaid.gsva} and
51 | from the original \code{GSVA} R package and we showed good concordance
52 | of results in the score, logFC and p-values.
53 | 
54 | In the original formulation, GSVA uses an emperical CDF to
55 | transform expression of each feature to a (0;1) relative expression
56 | value. For efficiency reasons, this is here approximated by a
57 | z-transform (center+scale) of each row.
58 | }
59 | \examples{
60 | # Create example expression matrix
61 | set.seed(123)
62 | X <- matrix(rnorm(500), nrow = 50, ncol = 10)
63 | rownames(X) <- paste0("GENE", 1:50)
64 | colnames(X) <- paste0("Sample", 1:10)
65 | 
66 | # Create example gene sets
67 | gmt <- list(
68 |   "Pathway1" = paste0("GENE", 1:15),
69 |   "Pathway2" = paste0("GENE", 10:25)
70 | )
71 | matG <- gmt2mat(gmt)
72 | 
73 | # Compute GSVA scores
74 | scores <- replaid.gsva(X, matG)
75 | print(scores[1:2, 1:5])
76 | 
77 | }
78 | 


--------------------------------------------------------------------------------
/man/replaid.scse.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plaid.R
 3 | \name{replaid.scse}
 4 | \alias{replaid.scse}
 5 | \title{Fast calculation of scSE score}
 6 | \usage{
 7 | replaid.scse(
 8 |   X,
 9 |   matG,
10 |   removeLog2 = NULL,
11 |   scoreMean = FALSE,
12 |   assay = "logcounts",
13 |   min.genes = 5,
14 |   max.genes = 500
15 | )
16 | }
17 | \arguments{
18 | \item{X}{Gene or protein expression matrix. Generally log
19 | transformed. See details. Genes on rows, samples on columns.
20 | Also accepts SummarizedExperiment or SingleCellExperiment objects.}
21 | 
22 | \item{matG}{Gene sets sparse matrix. Genes on rows, gene sets on
23 | columns. Also accepts BiocSet objects or GMT lists.}
24 | 
25 | \item{removeLog2}{Logical for whether to remove the Log2, i.e. will
26 | apply power transform (base2) on input (default TRUE).}
27 | 
28 | \item{scoreMean}{Logical for whether computing sum or mean as score
29 | (default FALSE).}
30 | 
31 | \item{assay}{Character: assay name for Bioconductor objects. Default "logcounts".}
32 | 
33 | \item{min.genes}{Integer: minimum genes per gene set. Default 5.}
34 | 
35 | \item{max.genes}{Integer: maximum genes per gene set. Default 500.}
36 | }
37 | \value{
38 | Matrix of single-sample scSE enrichment scores.
39 | Gene sets on rows, samples on columns.
40 | }
41 | \description{
42 | Calculates Single-Cell Signature Explorer (Pont et
43 | al., 2019) scores using plaid back-end. The computation is
44 | 10-100x faster than the original code.
45 | }
46 | \details{
47 | Computing the scSE requires running plaid on the linear
48 | (not logarithmic) score and perform additional normalization by
49 | the total UMI per sample. We have wrapped this in a single
50 | convenience function:
51 | 
52 | To replicate the original "sum-of-UMI" scSE score, set \code{removeLog2=TRUE}
53 | and \code{scoreMean=FALSE}. scSE and plaid scores become more similar for
54 | \code{removeLog2=FALSE} and \code{scoreMean=TRUE}.
55 | 
56 | We have extensively compared the results from \code{replaid.scse} and
57 | from the original scSE (implemented in GO lang) and we showed
58 | almost identical results in the score, logFC and p-values.
59 | }
60 | \examples{
61 | # Create example expression matrix (log-transformed)
62 | set.seed(123)
63 | X <- log2(matrix(rpois(500, lambda = 10) + 1, nrow = 50, ncol = 10))
64 | rownames(X) <- paste0("GENE", 1:50)
65 | colnames(X) <- paste0("Sample", 1:10)
66 | 
67 | # Create example gene sets
68 | gmt <- list(
69 |   "Pathway1" = paste0("GENE", 1:15),
70 |   "Pathway2" = paste0("GENE", 10:25)
71 | )
72 | matG <- gmt2mat(gmt)
73 | 
74 | # Compute scSE scores (original method)
75 | scores <- replaid.scse(X, matG, removeLog2 = TRUE, scoreMean = FALSE)
76 | print(scores[1:2, 1:5])
77 | 
78 | # Compute scSE scores (mean method)
79 | scores_mean <- replaid.scse(X, matG, removeLog2 = TRUE, scoreMean = TRUE)
80 | print(scores_mean[1:2, 1:5])
81 | 
82 | }
83 | 


--------------------------------------------------------------------------------
/man/dualGSEA.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/stats.R
 3 | \name{dualGSEA}
 4 | \alias{dualGSEA}
 5 | \title{Reimplementation of dualGSEA (Bull et al., 2024) but defaults with
 6 | replaid backend. For the preranked test we still use fgsea. Should
 7 | be much faster than original using fgsea + GSVA::ssGSEA.}
 8 | \usage{
 9 | dualGSEA(
10 |   X,
11 |   y,
12 |   G,
13 |   gmt = NULL,
14 |   gsetX = NULL,
15 |   fc.method = c("fgsea", "rankcor", "ztest", "ttest", "cor")[2],
16 |   ss.method = c("plaid", "replaid.ssgsea", "replaid.gsva", "ssgsea", "gsva")[1],
17 |   metap.method = c("stouffer", "fisher", "maxp")[1],
18 |   pv1 = NULL,
19 |   pv2 = NULL,
20 |   sort.by = "p.dual"
21 | )
22 | }
23 | \arguments{
24 | \item{X}{Expression matrix with genes on rows and samples ont columns}
25 | 
26 | \item{y}{Binary vector (0/1) indicating group membership}
27 | 
28 | \item{G}{Sparse matrix of gene sets. Non-zero entry indicates
29 | gene/feature is part of gene sets. Features on rows, gene sets on
30 | columns.}
31 | 
32 | \item{gmt}{List of gene sets in GMT format}
33 | 
34 | \item{gsetX}{Optional pre-computed matrix of gene set enrichment scores with
35 | gene sets on rows and samples on columns. If NULL (default), scores will be
36 | computed using the method specified by \code{ss.method}. Providing pre-computed
37 | scores improves efficiency when running multiple analyses.}
38 | 
39 | \item{fc.method}{Method for fold change testing ("fgsea", "ztest", "ttest", "rankcor", "cor")}
40 | 
41 | \item{ss.method}{Method for single-sample enrichment ("plaid", "replaid.ssgsea", "replaid.gsva", "ssgsea", "gsva")}
42 | 
43 | \item{metap.method}{Method for combining p-values ("stouffer", "fisher" or "maxp"). Default "stouffer".}
44 | 
45 | \item{pv1}{Pre-computed p-values from fold change test. If NULL, will be computed based on fc.test.}
46 | 
47 | \item{pv2}{Pre-computed p-values from single sample test. If NULL, will be computed using gset_ttest.}
48 | 
49 | \item{sort.by}{Column name to sort results by ("p.dual", "gsetFC", "p.fc", "p.ss"). Default "p.dual".}
50 | }
51 | \value{
52 | Data frame with columns: gsetFC (gene set fold change), size (gene set size),
53 | p.fc (p-value from fold change test), p.ss (p-value from single sample test),
54 | p.dual (combined p-value), and q.dual (FDR-adjusted combined p-value).
55 | }
56 | \description{
57 | Reimplementation of dualGSEA (Bull et al., 2024) but defaults with
58 | replaid backend. For the preranked test we still use fgsea. Should
59 | be much faster than original using fgsea + GSVA::ssGSEA.
60 | }
61 | \examples{
62 | # Create example expression matrix
63 | set.seed(123)
64 | X <- matrix(rnorm(1000), nrow = 100, ncol = 20)
65 | rownames(X) <- paste0("GENE", 1:100)
66 | colnames(X) <- paste0("Sample", 1:20)
67 | 
68 | # Create binary group vector
69 | y <- rep(c(0, 1), each = 10)
70 | 
71 | # Create example gene sets
72 | gmt <- list(
73 |   "Pathway1" = paste0("GENE", 1:20),
74 |   "Pathway2" = paste0("GENE", 15:35),
75 |   "Pathway3" = paste0("GENE", 30:50)
76 | )
77 | 
78 | # Perform dualGSEA with correlation test (fast method)
79 | results_cor <- dualGSEA(X, y, G = NULL, gmt = gmt, fc.method = "cor", ss.method = "replaid.gsva")
80 | print(head(results_cor))
81 | 
82 | \donttest{
83 | # Perform dualGSEA with fgsea (requires fgsea package)
84 | if (requireNamespace("fgsea", quietly = TRUE)) {
85 |   results <- dualGSEA(X, y, G = NULL, gmt = gmt, fc.method = "fgsea", ss.method = "replaid.ssgsea")
86 |   print(head(results))
87 | }
88 | }
89 | 
90 | }
91 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # PLAID: ultrafast single-sample gene set enrichment  <a href="https://bigomics.github.io/plaid/"><img src="man/figures/logo.png" align="right" height="138" alt="plaid website" /></a>
 2 | 
 3 | [![codecov](https://codecov.io/github/bigomics/plaid/graph/badge.svg?token=66J6W41C0G)](https://codecov.io/github/bigomics/plaid)
 4 | 
 5 | [PLAID](https://bigomics.github.io/plaid) (Pathway Level Average Intensity Detection) is an ultrafast
 6 | method to compute single-sample enrichment scores for gene expression
 7 | or proteomics data. For each sample, PLAID computes the gene set score
 8 | as the average intensity of the genes/proteins in the gene set. The
 9 | output is a gene set score matrix suitable for further analyses.
10 | 
11 | A distinctive feature of PLAID is that it can simulate few of the most
12 | widely used single-sample gene set scoring algorithms 
13 | ([ssGSEA](https://github.com/rcastelo/GSVA),
14 | [GSVA](https://github.com/rcastelo/GSVA),
15 | [AUCell](https://github.com/aertslab/AUCell),
16 | [singscore](https://github.com/DavisLaboratory/singscore),
17 | [scSE](https://doi.org/10.1093/nar/gkz601),
18 | [UCell](https://github.com/carmonalab/UCell)), enabling researchers
19 | to replace those functions and gain much improved runtime efficiency
20 | and memory requirement. Typically, PLAID can be more than 100 times
21 | faster and requiring 10 times less memory than the original algorithm.
22 | 
23 | #### Key features
24 | 
25 | - Ultra-fast single-sample gene set enrichment scoring
26 | - Includes multiple scoring methods (plaid, singscore, ssGSEA, GSVA, scSE, UCell, AUCell)
27 | - Works with regular matrices, sparse matrices, and Bioconductor data structures
28 | - Automatically detects and handles Bioconductor objects 
29 | ([`SummarizedExperiment`](https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html), 
30 | [`SingleCellExperiment`](https://bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html), [`BiocSet`](https://bioconductor.org/packages/release/bioc/html/BiocSet.html))
31 | - Built-in differential enrichment testing
32 | 
33 | #### Warning
34 | 
35 | PLAID is fast. Ludicrously fast. Please fasten your seatbelts before usage.
36 | 
37 | ## Installation
38 | 
39 | You can install PLAID from Bioconductor:
40 | 
41 | ```r
42 | BiocManager::install("plaid")
43 | ```
44 | 
45 | You can also install the development version from GitHub:
46 | 
47 | ```r
48 | remotes::install_github("bigomics/plaid")
49 | ```
50 | 
51 | ## Usage
52 | 
53 | For detailed usage examples and tutorials, please see our vignettes:
54 | 
55 | - [Getting Started with PLAID](https://bigomics.github.io/plaid/articles/01_plaid-vignette.html)
56 | - [Comparing PLAID with other methods](https://bigomics.github.io/plaid/articles/02_compare-vignette.html)
57 | 
58 | PLAID is the main single-sample gene set scoring algorithm in OmicsPlayground, our 
59 | Bioinformatics platform at [BigOmics](https://bigomics.ch). In OmicsPlayground, you 
60 | can perform PLAID without coding needs.
61 | 
62 | ## References
63 | 
64 | For more technical details please refer to our papers. Please cite us when you use
65 | PLAID as part of your research. 
66 | 
67 | - Zito A., et al. PLAID: ultrafast single-sample gene set enrichment scoring. Bioinformatics, 2025, [btaf621](https://doi.org/10.1093/bioinformatics/btaf621).
68 | - Akhmedov M., et al., Omics Playground: a comprehensive self-service platform for visualization, analytics and exploration of Big Omics Data, NAR Genomics and Bioinformatics, 2020, [lqz019](https://doi.org/10.1093/nargab/lqz019).
69 | 
70 | ## Support
71 | 
72 | For support feel free to reach our Bioinformatics Data Science Team at
73 | BigOmics Analytics: help@bigomics.ch
74 | 
75 | If you like PLAID, please recommend us to your friends, buy us [coffee](https://buymeacoffee.com/bigomics)
76 | and brag about PLAID on your social media. 
77 | 


--------------------------------------------------------------------------------
/man/plaid.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/plaid.R
  3 | \name{plaid}
  4 | \alias{plaid}
  5 | \title{Compute PLAID single-sample enrichment score}
  6 | \usage{
  7 | plaid(
  8 |   X,
  9 |   matG,
 10 |   stats = c("mean", "sum"),
 11 |   chunk = NULL,
 12 |   normalize = TRUE,
 13 |   nsmooth = 3,
 14 |   assay = "logcounts",
 15 |   min.genes = 5,
 16 |   max.genes = 500
 17 | )
 18 | }
 19 | \arguments{
 20 | \item{X}{Log-transformed expr. matrix. Genes on rows, samples on columns.
 21 | Also accepts SummarizedExperiment or SingleCellExperiment objects.}
 22 | 
 23 | \item{matG}{Gene sets sparse matrix. Genes on rows, gene sets on columns.
 24 | Also accepts BiocSet objects or GMT lists (named list of gene vectors).}
 25 | 
 26 | \item{stats}{Score computation stats: mean or sum of intensity. Default 'mean'.}
 27 | 
 28 | \item{chunk}{Logical: use chunks for large matrices. Default 'NULL' for autodetect.}
 29 | 
 30 | \item{normalize}{Logical: median normalize results or not. Default 'TRUE'.}
 31 | 
 32 | \item{nsmooth}{Smoothing parameter for more stable average when stats="mean". Default 3.}
 33 | 
 34 | \item{assay}{Character: assay name to extract from SummarizedExperiment/SingleCellExperiment. Default "logcounts".}
 35 | 
 36 | \item{min.genes}{Integer: minimum genes per gene set (for BiocSet/GMT input). Default 5.}
 37 | 
 38 | \item{max.genes}{Integer: maximum genes per gene set (for BiocSet/GMT input). Default 500.}
 39 | }
 40 | \value{
 41 | Matrix of single-sample enrichment scores.
 42 | Gene sets on rows, samples on columns.
 43 | }
 44 | \description{
 45 | Compute single-sample geneset expression as the
 46 | average log-expression f genes in the geneset. Requires log-expression
 47 | matrix X and (sparse) geneset matrix matG. If you have gene sets
 48 | as a gmt list, please convert it first using the function \code{gmt2mat()}.
 49 | }
 50 | \details{
 51 | PLAID needs the gene sets as sparse matrix. If you have
 52 | your collection of gene sets a a list, we need first to convert
 53 | the gmt list to matrix format.
 54 | 
 55 | We recommend to run PLAID on the log transformed expression matrix,
 56 | not on the counts, as the average in the logarithmic space is more
 57 | robust and is in concordance to calculating the geometric mean.
 58 | 
 59 | It is not necessary to normalize your expression matrix before
 60 | running PLAID because PLAID performs median normalization of the
 61 | enrichment scores afterwards.
 62 | 
 63 | It is recommended to use sparse matrix as PLAID relies on
 64 | sparse matrix computations. But, PLAID is also fast for dense matrices.
 65 | 
 66 | PLAID can also be run on the ranked matrix. This corresponds to
 67 | the singscore (Fouratan et al., 2018). PLAID can also be run on
 68 | the (non-logarithmic) counts which can be used to calculate the
 69 | scSE score (Pont et al., 2019).
 70 | 
 71 | PLAID is fast and memery efficient because it uses efficient
 72 | sparse matrix computation. When input matrix is very large, PLAID
 73 | performs 'chunked' computation by splitting the matrix in chunks.
 74 | 
 75 | Although \code{X} and \code{matG} are generally sparse, the result
 76 | matrix \code{gsetX} generally is dense and can thus be very large.
 77 | Example: computing gene set scores for 10K gene sets on 1M cells
 78 | will create a 10K x 1M dense matrix which requires ~75GB memory.
 79 | 
 80 | PLAID now automatically detects and handles Bioconductor objects.
 81 | If X is a SummarizedExperiment or SingleCellExperiment, it will extract
 82 | the appropriate assay. If matG is a BiocSet object or GMT list, it will
 83 | be converted to sparse matrix format automatically.
 84 | }
 85 | \examples{
 86 | library(plaid)
 87 | 
 88 | # Create example expression matrix
 89 | set.seed(123)
 90 | X <- matrix(rnorm(1000), nrow = 100, ncol = 10)
 91 | rownames(X) <- paste0("GENE", 1:100)
 92 | colnames(X) <- paste0("Sample", 1:10)
 93 | 
 94 | # Create example gene sets
 95 | gmt <- list(
 96 |   "Pathway1" = paste0("GENE", 1:20),
 97 |   "Pathway2" = paste0("GENE", 15:35),
 98 |   "Pathway3" = paste0("GENE", 30:50)
 99 | )
100 | matG <- gmt2mat(gmt)
101 | 
102 | # Compute PLAID scores
103 | gsetX <- plaid(X, matG)
104 | print(dim(gsetX))
105 | print(gsetX[1:3, 1:5])
106 | 
107 | # Use sum statistics instead of mean
108 | gsetX_sum <- plaid(X, matG, stats = "sum")
109 | 
110 | \donttest{
111 | # Using real data (if available in package)
112 | extdata_path <- system.file("extdata", "pbmc3k-50cells.rda", package = "plaid")
113 | if (file.exists(extdata_path)) {
114 |   load(extdata_path)
115 |   hallmarks <- system.file("extdata", "hallmarks.gmt", package = "plaid")
116 |   gmt <- read.gmt(hallmarks)
117 |   matG <- gmt2mat(gmt)
118 |   gsetX <- plaid(X, matG)
119 | }
120 | }
121 | 
122 | }
123 | 


--------------------------------------------------------------------------------
/R/bioc-utils.R:
--------------------------------------------------------------------------------
  1 | ##
  2 | ## This file is part of the Omics Playground project.
  3 | ## Copyright (c) 2018-2025 BigOmics Analytics SA. All rights reserved.
  4 | ##
  5 | 
  6 | #' @importFrom SummarizedExperiment assayNames assay
  7 | #' @importFrom BiocSet es_elementset
  8 | NULL
  9 | 
 10 | #' Extract expression matrix from Bioconductor objects
 11 | #'
 12 | #' @description Internal function to extract expression matrix from various
 13 | #' Bioconductor objects including SummarizedExperiment and SingleCellExperiment.
 14 | #' 
 15 | #' @param object A Bioconductor object (SummarizedExperiment, SingleCellExperiment)
 16 | #'   or a regular matrix.
 17 | #' @param assay Character or integer specifying which assay to extract. Default "logcounts".
 18 | #'   For SummarizedExperiment objects, common options are "counts", "logcounts", 
 19 | #'   "normcounts". The function will try several common names if the specified
 20 | #'   assay is not found.
 21 | #' @param log.transform Logical. If TRUE and the data appears to be counts (not log-transformed),
 22 | #'   will apply log2(x+1) transformation. Default FALSE.
 23 | #'
 24 | #' @return A matrix with genes/features on rows and samples on columns.
 25 | #'
 26 | #' @details This function is designed to handle various Bioconductor object types:
 27 | #' \itemize{
 28 | #'   \item SummarizedExperiment: extracts from assays
 29 | #'   \item SingleCellExperiment: extracts from assays (prefers logcounts)
 30 | #'   \item Matrix/matrix: returns as-is
 31 | #' }
 32 | #'
 33 | #' The function will attempt to find an appropriate assay by trying common names
 34 | #' in the following order: specified assay, "logcounts", "normcounts", "counts",
 35 | #' or the first available assay.
 36 | #'
 37 | #' @noRd
 38 | .extract_expression_matrix <- function(object, assay = "logcounts", log.transform = FALSE) {
 39 |   
 40 |   # If already a matrix, return as-is
 41 |   if (is.matrix(object) || inherits(object, "Matrix")) {
 42 |     return(object)
 43 |   }
 44 |   
 45 |   # Check for SummarizedExperiment or SingleCellExperiment
 46 |   if (inherits(object, "SummarizedExperiment") || 
 47 |       inherits(object, "SingleCellExperiment")) {
 48 |     
 49 |     # Try to extract the requested assay
 50 |     avail_assays <- SummarizedExperiment::assayNames(object)
 51 |     
 52 |     if (length(avail_assays) == 0) {
 53 |       stop("[.extract_expression_matrix] No assays found in object.")
 54 |     }
 55 |     
 56 |     # Try to find the best assay
 57 |     if (assay %in% avail_assays) {
 58 |       X <- SummarizedExperiment::assay(object, assay)
 59 |     } else {
 60 |       # Try common alternatives in order of preference
 61 |       preferred_order <- c("logcounts", "normcounts", "counts")
 62 |       found_assay <- NULL
 63 |       
 64 |       for (pref in preferred_order) {
 65 |         if (pref %in% avail_assays) {
 66 |           found_assay <- pref
 67 |           break
 68 |         }
 69 |       }
 70 |       
 71 |       if (is.null(found_assay)) {
 72 |         # Just use first available assay
 73 |         found_assay <- avail_assays[1]
 74 |         message("[.extract_expression_matrix] Using first available assay: ", found_assay)
 75 |       } else {
 76 |         message("[.extract_expression_matrix] Requested assay '", assay, 
 77 |                 "' not found. Using: ", found_assay)
 78 |       }
 79 |       
 80 |       X <- SummarizedExperiment::assay(object, found_assay)
 81 |     }
 82 |     
 83 |     # Apply log transformation if requested
 84 |     if (log.transform) {
 85 |       # Check if data looks like counts (non-negative integers or large values)
 86 |       if (min(X, na.rm = TRUE) >= 0 && max(X, na.rm = TRUE) > 100) {
 87 |         message("[.extract_expression_matrix] Applying log2(x+1) transformation.")
 88 |         if (inherits(X, "dgCMatrix")) {
 89 |           X@x <- log2(X@x + 1)
 90 |         } else {
 91 |           X <- log2(X + 1)
 92 |         }
 93 |       }
 94 |     }
 95 |     
 96 |     return(X)
 97 |   }
 98 |   
 99 |   # If we get here, unsupported object type
100 |   stop("[.extract_expression_matrix] Unsupported object type: ", class(object)[1],
101 |        "\nSupported types: matrix, Matrix, SummarizedExperiment, SingleCellExperiment")
102 | }
103 | 
104 | 
105 | #' Convert BiocSet to sparse matrix format for plaid
106 | #'
107 | #' @description Internal function to convert BiocSet objects to the sparse
108 | #' matrix format required by plaid (genes on rows, gene sets on columns).
109 | #'
110 | #' @param geneset A BiocSet object, list (GMT format), or already a matrix.
111 | #' @param background Character vector of background genes to include in the matrix.
112 | #'   Default NULL uses all genes from the gene sets.
113 | #' @param min.genes Minimum number of genes required for a gene set to be included.
114 | #'   Default 5.
115 | #' @param max.genes Maximum number of genes allowed for a gene set to be included.
116 | #'   Default 500.
117 | #'
118 | #' @return A sparse matrix (dgCMatrix) with genes on rows and gene sets on columns.
119 | #'
120 | #' @details This function handles conversion from BiocSet objects to the
121 | #' sparse matrix format used by plaid. It can also handle GMT lists or matrices
122 | #' directly. Gene sets are filtered by size (min/max genes).
123 | #'
124 | #' For BiocSet objects, the function extracts element-set mappings and converts
125 | #' them to a binary sparse matrix indicating gene membership.
126 | #'
127 | #' @noRd
128 | .convert_geneset_to_matrix <- function(geneset, background = NULL, 
129 |                                        min.genes = 5, max.genes = 500) {
130 |   
131 |   # If already a matrix, return as-is
132 |   if (is.matrix(geneset) || inherits(geneset, "Matrix")) {
133 |     return(geneset)
134 |   }
135 |   
136 |   # Handle BiocSet objects
137 |   if (inherits(geneset, "BiocSet")) {
138 |     message("[.convert_geneset_to_matrix] Converting BiocSet to matrix format...")
139 |     
140 |     # Extract the element-set mapping from BiocSet
141 |     # BiocSet uses a tibble structure with 'element' and 'set' columns
142 |     es_tbl <- BiocSet::es_elementset(geneset)
143 |     
144 |     if (nrow(es_tbl) == 0) {
145 |       stop("[.convert_geneset_to_matrix] BiocSet object is empty.")
146 |     }
147 |     
148 |     # Convert to GMT list format first
149 |     gmt <- split(es_tbl$element, es_tbl$set)
150 |     gmt <- lapply(gmt, as.character)
151 |     
152 |   } else if (is.list(geneset)) {
153 |     # Already in GMT format
154 |     gmt <- geneset
155 |   } else {
156 |     stop("[.convert_geneset_to_matrix] Unsupported geneset type: ", class(geneset)[1],
157 |          "\nSupported types: BiocSet, list (GMT), matrix, Matrix")
158 |   }
159 |   
160 |   # Filter gene sets by size
161 |   gset_sizes <- vapply(gmt, length, integer(1))
162 |   valid_sets <- gset_sizes >= min.genes & gset_sizes <= max.genes
163 |   
164 |   if (sum(valid_sets) == 0) {
165 |     stop("[.convert_geneset_to_matrix] No gene sets passed size filters (min=", 
166 |          min.genes, ", max=", max.genes, ")")
167 |   }
168 |   
169 |   if (sum(!valid_sets) > 0) {
170 |     message("[.convert_geneset_to_matrix] Filtered out ", sum(!valid_sets), 
171 |             " gene sets (size filters: ", min.genes, "-", max.genes, " genes)")
172 |   }
173 |   
174 |   gmt <- gmt[valid_sets]
175 |   
176 |   # Convert to sparse matrix using existing gmt2mat function
177 |   matG <- gmt2mat(gmt, bg = background, sparse = TRUE)
178 |   
179 |   return(matG)
180 | }
181 | 


--------------------------------------------------------------------------------
/R/gmt-utils.R:
--------------------------------------------------------------------------------
  1 | ## This file is part of the Omics Playground project.
  2 | ## Copyright (c) 2018-2025 BigOmics Analytics SA. All rights reserved.
  3 | 
  4 | #' @importFrom utils head read.csv
  5 | #' @importFrom Matrix Matrix rowSums which
  6 | #' @importFrom parallel mclapply
  7 | NULL
  8 | 
  9 | #' Convert GMT to Binary Matrix
 10 | #'
 11 | #' @description Convert a GMT file (Gene Matrix Transposed) to a binary matrix,
 12 | #' where rows represent genes and columns represent gene sets.
 13 | #' The binary matrix indicates presence or absence of genes in a gene set.
 14 | #'
 15 | #' @param gmt List representing the GMT file: each element is a character vector representing a gene set.
 16 | #' @param max.genes Max number of genes to include in the binary matrix. Default = -1 to include all genes.
 17 | #' @param ntop Number of top genes to consider for each gene set. Default = -1 to include all genes.
 18 | #' @param sparse Logical: create a sparse matrix. Default `TRUE`. If `FALSE` creates a dense matrix.
 19 | #' @param bg Character vector of background gene set. Default `NULL` to consider all unique genes.
 20 | #' @param use.multicore Logical: use parallel processing ('parallel' R package). Default `TRUE`.
 21 | #'
 22 | #' @export
 23 | #'
 24 | #' @return A binary matrix representing the presence or absence of genes in each gene set.
 25 | #'         Rows correspond to genes, and columns correspond to gene sets.
 26 | #'
 27 | #' @examples
 28 | #' # Create example GMT data
 29 | #' gmt <- list(
 30 | #'   "Pathway1" = c("GENE1", "GENE2", "GENE3"),
 31 | #'   "Pathway2" = c("GENE2", "GENE4", "GENE5"),
 32 | #'   "Pathway3" = c("GENE1", "GENE5", "GENE6")
 33 | #' )
 34 | #' 
 35 | #' # Convert to binary matrix
 36 | #' mat <- gmt2mat(gmt)
 37 | #' print(mat)
 38 | #' 
 39 | #' # Create dense matrix instead of sparse
 40 | #' mat_dense <- gmt2mat(gmt, sparse = FALSE)
 41 | #' print(mat_dense)
 42 | gmt2mat <- function(gmt,
 43 |                     max.genes = -1,
 44 |                     ntop = -1, sparse = TRUE,
 45 |                     bg = NULL,
 46 |                     use.multicore = TRUE) {
 47 | 
 48 |   gmt <- gmt[order(-vapply(gmt, length, integer(1)))]
 49 |   gmt <- gmt[!duplicated(names(gmt))]
 50 |   if (ntop > 0) gmt <- lapply(gmt, utils::head, n = ntop)
 51 |   
 52 |   if (is.null(names(gmt))) names(gmt) <- paste("gmt.", seq_along(gmt), sep = "")
 53 |   if (is.null(bg)) {
 54 |     bg <- names(sort(table(unlist(gmt)), decreasing = TRUE))
 55 |   }
 56 |   
 57 |   if (max.genes < 0) max.genes <- length(bg)
 58 |   gg <- bg
 59 |   gg <- Matrix::head(bg, n = max.genes)
 60 |   ##gmt <- lapply(gmt, function(s) intersect(gg, s))
 61 |   kk <- unique(names(gmt))
 62 |   if (sparse) {
 63 |     D <- Matrix::Matrix(0, nrow = length(gg), ncol = length(kk), sparse = TRUE)
 64 |   } else {
 65 |     D <- matrix(0, nrow = length(gg), ncol = length(kk))
 66 |   }
 67 |   rownames(D) <- gg
 68 |   colnames(D) <- kk
 69 | 
 70 |   if (use.multicore) {
 71 |     idx <- parallel::mclapply(gmt, function(s) match(s, gg))
 72 |   } else {
 73 |     idx <- lapply(gmt, function(s) match(s, gg))
 74 |   }
 75 |   idx <- lapply(idx, function(x) x[!is.na(x)])
 76 |   idx[vapply(idx, length, integer(1)) == 0] <- 0
 77 |   idx <- lapply(seq_along(idx), function(i) rbind(idx[[i]], i))
 78 |   idx <- matrix(unlist(idx[]), byrow = TRUE, ncol = 2)
 79 |   idx <- idx[!is.na(idx[, 1]), ]
 80 |   idx <- idx[idx[, 1] > 0, ]
 81 |   D[idx] <- 1
 82 |   D <- D[order(-Matrix::rowSums(D != 0, na.rm = TRUE)), ,drop=FALSE]
 83 | 
 84 |   return(D)
 85 | 
 86 | }
 87 | 
 88 | #' Convert Binary Matrix to GMT
 89 | #'
 90 | #' @description Convert binary matrix to a GMT (Gene Matrix Transposed) list.
 91 | #' The binary matrix indicates presence or absence of genes in each gene set.
 92 | #' Rows represent genes and columns represent gene sets.
 93 | #'
 94 | #' @param mat Matrix with non-zero entries representing genes in each gene set.
 95 | #' Rows represent genes and columns represent gene sets.
 96 | #'
 97 | #' @export
 98 | #'
 99 | #' @return A list of vector representing each gene set. Each list
100 | #'   element correspond to a gene set and is a vector of genes
101 | #'
102 | #' @examples
103 | #' # Create example binary matrix
104 | #' mat <- matrix(0, nrow = 6, ncol = 3)
105 | #' rownames(mat) <- paste0("GENE", 1:6)
106 | #' colnames(mat) <- paste0("Pathway", 1:3)
107 | #' mat[1:3, 1] <- 1  # Pathway1: GENE1, GENE2, GENE3
108 | #' mat[c(2,4,5), 2] <- 1  # Pathway2: GENE2, GENE4, GENE5
109 | #' mat[c(1,5,6), 3] <- 1  # Pathway3: GENE1, GENE5, GENE6
110 | #' 
111 | #' # Convert to GMT list
112 | #' gmt <- mat2gmt(mat)
113 | #' print(gmt)
114 | #'
115 | mat2gmt <- function(mat) {
116 |   idx <- Matrix::which(mat != 0, arr.ind = TRUE)
117 |   gmt <- tapply(rownames(idx), idx[, 2], list)
118 |   names(gmt) <- colnames(mat)[as.integer(names(gmt))]
119 |   return(gmt)
120 | }
121 | 
122 | 
123 | #' Read GMT File
124 | #'
125 | #' @description Read data from a GMT file (Gene Matrix Transposed).
126 | #' The GMT format is commonly used to store gene sets or gene annotations.
127 | #' @param gmt.file Path to GMT file.
128 | #' @param dir (Optional) The directory where the GMT file is located.
129 | #' @param add.source (optional) Include the source information in the gene sets' names.
130 | #' @param nrows (optional) Number of rows to read from the GMT file.
131 | #'
132 | #' @export
133 | #'
134 | #' @return A list of gene sets: each gene set is represented as a character vector of gene names.
135 | #' 
136 | #' @examples
137 | #' \donttest{
138 | #' # Read GMT file (requires file to exist)
139 | #' gmt_file <- system.file("extdata", "hallmarks.gmt", package = "plaid")
140 | #' if (file.exists(gmt_file)) {
141 | #'   gmt <- read.gmt(gmt_file)
142 | #'   print(names(gmt))
143 | #'   print(head(gmt[[1]]))
144 | #'   
145 | #'   # Read with source information
146 | #'   gmt_with_source <- read.gmt(gmt_file, add.source = TRUE)
147 | #'   print(head(names(gmt_with_source)))
148 | #' }
149 | #' }
150 | #'
151 | read.gmt <- function(gmt.file,
152 |                      dir = NULL,
153 |                      add.source = FALSE,
154 |                      nrows = -1) {
155 |   f0 <- gmt.file
156 |   if (strtrim(gmt.file, 1) == "/") dir <- NULL
157 |   if (!is.null(dir)) f0 <- paste(sub("/$", "", dir), "/", gmt.file, sep = "")
158 |   gmt <- utils::read.csv(f0, sep = "!", header = FALSE, comment.char = "#", nrows = nrows)[, 1]
159 |   gmt <- as.character(gmt)
160 |   gmt <- lapply(gmt, function(s) strsplit(s, split = "\t")[[1]])
161 |   names(gmt) <- NULL
162 |   gmt.name <- vapply(gmt, "[", character(1), 1)
163 |   gmt.source <- vapply(gmt, "[", character(1), 2)
164 |   gmt.genes <- vapply(gmt, function(x) {
165 |     if (length(x) < 3) return("");
166 |     paste(x[3:length(x)], collapse = " ")
167 |   }, character(1))
168 |   gset <- strsplit(gmt.genes, split = "[ \t]")
169 |   gset <- lapply(gset, function(x) setdiff(x, c("", "NA", NA)))
170 |   names(gset) <- gmt.name
171 | 
172 |   if (add.source)
173 |     names(gset) <- paste0(names(gset), " (", gmt.source, ")")
174 |   
175 |   return(gset)
176 | 
177 | }
178 | 
179 | 
180 | #' Write GMT File
181 | #'
182 | #' @description Write gene sets to GMT file (Gene Matrix Transposed).
183 | #' The GMT format is commonly used to store gene sets or gene annotations.
184 | #'
185 | #' @param gmt A list of gene sets in GMT format: each gene set is represented as a vector of gene names.
186 | #' @param file The file path to write the GMT file.
187 | #' @param source A character vector specifying the source of each gene set.
188 | #' If not provided, the names of the gene sets are used as the source.
189 | #' 
190 | #' @export
191 | #' @return Does not return anything.
192 | #' 
193 | #' @examples
194 | #' # Create example GMT data
195 | #' gmt <- list(
196 | #'   "Pathway1" = c("GENE1", "GENE2", "GENE3"),
197 | #'   "Pathway2" = c("GENE2", "GENE4", "GENE5"),
198 | #'   "Pathway3" = c("GENE1", "GENE5", "GENE6")
199 | #' )
200 | #' 
201 | #' \donttest{
202 | #' # Write to GMT file (creates file in temp directory)
203 | #' temp_file <- tempfile(fileext = ".gmt")
204 | #' write.gmt(gmt, temp_file)
205 | #' 
206 | #' # Write with custom source information
207 | #' temp_file2 <- tempfile(fileext = ".gmt")
208 | #' write.gmt(gmt, temp_file2, source = c("DB1", "DB2", "DB3"))
209 | #' 
210 | #' # Clean up
211 | #' unlink(c(temp_file, temp_file2))
212 | #' }
213 | #'
214 | write.gmt <- function(gmt, file, source = NA) {
215 |   gg <- lapply(gmt, paste, collapse = "\t")
216 |   if (length(source) == 1 && is.na(source[1])) source <- names(gmt)
217 |   ee <- paste(names(gmt), "\t", source, "\t", gg, sep = "")
218 |   write(ee, file = file)
219 | }
220 | 


--------------------------------------------------------------------------------
/vignettes/02_compare-vignette.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Comparing PLAID with other methods"
 3 | author: "BigOmics Analytics"
 4 | package: plaid
 5 | output: 
 6 |     BiocStyle::html_document
 7 | vignette: >
 8 |   %\VignetteIndexEntry{Comparing PLAID with other methods}
 9 |   %\VignetteEngine{knitr::rmarkdown}
10 |   %\VignetteEncoding{UTF-8}
11 | ---
12 | 
13 | ```{r, include = FALSE}
14 | knitr::opts_chunk$set(
15 |   collapse = TRUE,
16 |   comment = "#>"
17 | )
18 | ```
19 | 
20 | # Introduction
21 | 
22 | Single-sample gene set and pathway scoring enable the prioritization
23 | of molecular signatures of individual samples, highlighting potential
24 | targets for personalized medicine. Several methods have been
25 | developed, each with distinct strengths and limitations. The most
26 | widely used include [ssGSEA](https://github.com/rcastelo/GSVA),
27 | [GSVA](https://github.com/rcastelo/GSVA),
28 | [AUCell](https://github.com/aertslab/AUCell),
29 | [singscore](https://github.com/DavisLaboratory/singscore),
30 | [scSE](https://doi.org/10.1093/nar/gkz601) and
31 | [UCell](https://github.com/carmonalab/UCell). These methods differ in
32 | preprocessing steps of ranking, centering, and normalization. Despite
33 | methodical variations, these methods collectively offer vast
34 | flexibility for data analysis. Nevertheless, with the emergence of
35 | large-scale biological data (e.g., biobanks), most methods face
36 | substantial computational inefficiency. Except scSE, which is
37 | implemented in Go, other methods are in fact practically infeasible in
38 | runtime and memory requirements for large-scale data. This has limited
39 | the application of single-sample gene set scoring, hindering timely
40 | discovery.
41 | 
42 | We address this critical issue with PLAID (Pathway Level Average Intensity Detection). Within each sample, PLAID identifies the genes mapped within a gene set and calculates the gene set score as the average log-intensity of genes in the gene set. PLAID does not zero-center or rank features, and leverages sparse matrices for efficient computations. 
43 | 
44 | # Benchmarking
45 | 
46 | We evaluated PLAID in single-cell transcriptomics (PBMC3K), bulk proteomics, and microarray data. These data span diverse scenarios in terms of resolution and distribution, thus allowing comprehensive benchmarks. We compared PLAID runtime, peak memory, and gene set scores to Singscore, ssGSEA, GSVA, scSE, UCell and AUCell. Collectively these methods represent the most used single-sample gene set enrichment methods in biomedical research.
47 | 
48 | # Comparison
49 | 
50 | ## Computational Performance
51 | 
52 | ![Run time (seconds) and peak RAM memory (MiB) usage. Benchmarking was conducted on data and gene sets of increasing sizes: 100 cells, 1K cells, 10K cells; 50 gene sets (HALLMARK collection); 2864 gene sets (GO_BP collection); 61,459 gene sets (whole collection). Runs were timed out at 1h (3600s). Timed out runs are indicated by an asterisk.](fig1.png)
53 | 
54 | In scRNA-seq data, PLAID consistently outperformed other methods at different sample and gene set sizes. PLAID completed scoring for 2,864 gene sets in 1K cells in 0.17s, which is >100x faster compared to any other method. The second best method was scSE. Overall, the best memory-performing methods were PLAID and scSE. A similar trend was observed for 10K cells, where PLAID was >10x faster than other methods, while requiring similar RAM as other non-timed-out methods.
55 | 
56 | When testing 61,459 gene sets in 1K cells, PLAID, Singscore, UCell and AUCell successfully completed the gene set scoring. Other methods were timed out at 1h. PLAID completed the run in <8s, with <3GB peak RAM. Singscore, UCell and AUCell required >100x runtime compared to PLAID, with up to ~1.5GB additional memory. GSVA, ssGSEA and scSE were timed out, and neither runtime nor peak RAM usage were accurately estimated. For 61,459 gene sets and 10K cells, PLAID achieved gene set scoring within 1h, requiring 110s and <20GB peak RAM. Other methods were timed out, and would have required at least 5x additional computing resources for a complete run.
57 | 
58 | While working with sparse matrices would be ideal, researchers often use dense matrices. For a comprehensive evaluation, we thus conducted runtime and memory profiling of PLAID on the TCGA-BRCA microarray dense data matrix. In line with previous observations, PLAID outperformed other methods in most scenarios. For instance, PLAID was >100x faster than AUCell and GSVA for 61,459 gene sets and 1K cells, requiring 2.4GB. Altogether, these data demonstrate that PLAID is a highly efficient alternative to other single-sample gene set enrichment methods, but also that scSE remains a highly memory-optimized method capable of outperforming PLAID in some cases. Both runtime and peak memory usage increase approximately linearly with the number of cells or gene sets. For 1K gene sets and 1M cells, PLAID required about 200s and 28GB RAM peak memory.
59 | 
60 | ## Score Concordance
61 | 
62 | ![Pairwise scatter plots between PLAID vs. singscore, ssGSEA, GSVA, UCell, AUCell and scSE and scSE.mean ESs (Methods). Original ESs.](fig2.png)
63 | 
64 | 
65 | ![Pairwise scatter plots between PLAID vs. singscore, ssGSEA, GSVA, UCell, AUCell and scSE and scSE.mean ESs (Methods). Row-centered gene set ESs.](fig3.png)
66 | 
67 | PLAID enrichment scores are median normalized to aid cross-group comparisons. While conceptually based on a self-defined metric, we nevertheless compared PLAID with other methods. We found that in scRNA-seq, for the first available cell, PLAID scores correlated well with Singscore, UCell and AUCell scores. By enabling calculation of mean, scSE produces scores highly concordant with PLAID. Lower similarities emerged with GSVA and ssGSEA. Notably, GSVA and ssGSEA also exhibited low concordance with any other method.
68 | 
69 | Critically, within any single sample or cell, the raw gene set scores are not well suited for direct comparisons between gene sets. Comparing gene sets would need statistical testing between samples, or centering gene sets across samples to obtain relative scores corresponding to gene set average centered log-expression. On the basis of this principle, we compared PLAID scores vs other methods' scores after centering. Interestingly, we observed a generalized improved concordance with all methods, including with GSVA and ssGSEA. This supports the suitability of PLAID scores for differential testing between groups, and the possibility of cross-validating analyses from other methods.
70 | 
71 | Testing in a bulk proteomics dataset confirmed high similarity between PLAID and scSE.mean, and high concordance with Singscore and ssGSEA. Low concordance emerged between PLAID and GSVA, in line with previous observations. Both in scRNA-seq and bulk proteomics, GSVA was lowly correlated with any other methods. Improved concordance was reached upon gene set centering, supporting results in scRNA-seq data.
72 | 
73 | ## Replicating Other Methods
74 | 
75 | ![Pairwise scatter plots between original and PLAID-replicated ESs for different methods (left to right): singscore, ssGSEA, GSVA, UCell, AUCell and scSE. Three testing datasets were used: (top to bottom): PBMC3K scRNAseq dataset; LC/MS proteomics dataset (Wolf et al., 2020); mRNA microarray (GSE10846; Lenz et al., 2008). In most cases, the PLAID (re)-implementation of the distinct enrichment methods provides accurate replication, with the key advantage of being significantly faster and memory efficient.](fig4.png)
76 | 
77 | Running independent methods to validate enriched gene sets is a best practice. However, given the computational inefficiency of most current methods, this may be time-prohibitive in large datasets. We provide a solution to this problem by equipping PLAID with the most widely used single-sample gene set enrichment methods.
78 | 
79 | We've implemented the following functions using PLAID as 'back-end': `replaid.sing`, `replaid.scse`, `replaid.ssgsea`, `replaid.gsva`, `replaid.ucell`, `replaid.aucell`. These functions provide efficient calculations of Singscore, scSE, ssGSEA, GSVA, UCell and AUCell. To ensure accurate replication of the original methods, we conducted testing in scRNA-seq, proteomics and microarray expression data.
80 | 
81 | Best concordances are seen for `replaid.sing`, `replaid.scse`, `replaid.ucell` and `replaid.aucell` vs. each respective original method, with further improvements made possible by method-specific parameters that reflect original implementations. A somewhat relatively lower concordance (especially for scRNA-seq data) emerges between `replaid.ssgsea` and `replaid.gsva` vs. original ssGSEA and GSVA, respectively. These methods were originally not intended for scRNA-seq. Lower concordance is also related to the 'alpha' and 'tau' parameters, due to approximations of rank weightings and ECDF in PLAID. Nevertheless, we achieve good concordance in nearly all cases, with the unmatched advantage of reaching up to ten-fold gain in computational efficiency.
82 | 
83 | # Conclusions
84 | 
85 | These analyses demonstrate that PLAID is a highly-performing and accurate method for single-sample gene set scoring. PLAID is ultrafast and memory efficient, generating gene set scores highly concordant with existing methods. PLAID can be up to 100x faster and requires significantly less memory - reaching up to 5-fold reduction in memory usage - when compared to any other method in any dataset.
86 | 
87 | Altogether, these data demonstrate the multi-task power of PLAID in providing (i) its own single-sample gene set enrichment scores; (ii) unmatched computational efficiency; (iii) a framework for the most used single-sample enrichment methods, with much higher efficiency; (iv) evidence that if implemented expertly, the R language can be highly efficient.
88 | 
89 | # Session Info
90 | 
91 | ```{r sessionInfo}
92 | sessionInfo()
93 | ```
94 | 
95 | 


--------------------------------------------------------------------------------
/tests/testthat/test-bioc-utils.R:
--------------------------------------------------------------------------------
  1 | ## This file is part of the Omics Playground project.
  2 | ## Copyright (c) 2018-2025 BigOmics Analytics SA. All rights reserved.
  3 | 
  4 | library(testthat)
  5 | library(Matrix)
  6 | 
  7 | # =============================================================================
  8 | # Test .extract_expression_matrix function
  9 | # =============================================================================
 10 | 
 11 | test_that(".extract_expression_matrix handles regular matrices", {
 12 |   # Create test matrix
 13 |   mat <- matrix(rnorm(100), nrow = 10, ncol = 10)
 14 |   rownames(mat) <- paste0("Gene", 1:10)
 15 |   colnames(mat) <- paste0("Sample", 1:10)
 16 |   
 17 |   # Should return matrix as-is
 18 |   result <- plaid:::.extract_expression_matrix(mat)
 19 |   
 20 |   expect_identical(result, mat)
 21 |   expect_true(is.matrix(result))
 22 | })
 23 | 
 24 | test_that(".extract_expression_matrix handles sparse matrices", {
 25 |   # Create sparse matrix
 26 |   mat <- Matrix(rnorm(100), nrow = 10, ncol = 10, sparse = TRUE)
 27 |   rownames(mat) <- paste0("Gene", 1:10)
 28 |   colnames(mat) <- paste0("Sample", 1:10)
 29 |   
 30 |   # Should return sparse matrix as-is
 31 |   result <- plaid:::.extract_expression_matrix(mat)
 32 |   
 33 |   expect_identical(result, mat)
 34 |   expect_true(inherits(result, "Matrix"))
 35 | })
 36 | 
 37 | test_that(".extract_expression_matrix handles SummarizedExperiment objects", {
 38 |   skip_if_not_installed("SummarizedExperiment")
 39 |   
 40 |   # Create test SummarizedExperiment
 41 |   counts <- matrix(rpois(100, lambda = 10), nrow = 10, ncol = 10)
 42 |   rownames(counts) <- paste0("Gene", 1:10)
 43 |   colnames(counts) <- paste0("Sample", 1:10)
 44 |   
 45 |   se <- SummarizedExperiment::SummarizedExperiment(
 46 |     assays = list(counts = counts)
 47 |   )
 48 |   
 49 |   # Extract expression matrix
 50 |   result <- plaid:::.extract_expression_matrix(se, assay = "counts")
 51 |   
 52 |   expect_true(is.matrix(result))
 53 |   expect_equal(dim(result), c(10, 10))
 54 |   expect_equal(rownames(result), paste0("Gene", 1:10))
 55 | })
 56 | 
 57 | test_that(".extract_expression_matrix handles multiple assays", {
 58 |   skip_if_not_installed("SummarizedExperiment")
 59 |   
 60 |   # Create SE with multiple assays
 61 |   counts <- matrix(rpois(100, lambda = 10), nrow = 10, ncol = 10)
 62 |   logcounts <- log2(counts + 1)
 63 |   rownames(counts) <- rownames(logcounts) <- paste0("Gene", 1:10)
 64 |   colnames(counts) <- colnames(logcounts) <- paste0("Sample", 1:10)
 65 |   
 66 |   se <- SummarizedExperiment::SummarizedExperiment(
 67 |     assays = list(counts = counts, logcounts = logcounts)
 68 |   )
 69 |   
 70 |   # Should extract logcounts when specified
 71 |   result_log <- plaid:::.extract_expression_matrix(se, assay = "logcounts")
 72 |   expect_equal(result_log, logcounts)
 73 |   
 74 |   # Should extract counts when specified
 75 |   result_counts <- plaid:::.extract_expression_matrix(se, assay = "counts")
 76 |   expect_equal(result_counts, counts)
 77 | })
 78 | 
 79 | test_that(".extract_expression_matrix falls back to available assays", {
 80 |   skip_if_not_installed("SummarizedExperiment")
 81 |   
 82 |   # Create SE with only counts
 83 |   counts <- matrix(rpois(100, lambda = 10), nrow = 10, ncol = 10)
 84 |   rownames(counts) <- paste0("Gene", 1:10)
 85 |   colnames(counts) <- paste0("Sample", 1:10)
 86 |   
 87 |   se <- SummarizedExperiment::SummarizedExperiment(
 88 |     assays = list(counts = counts)
 89 |   )
 90 |   
 91 |   # Request non-existent assay, should fall back
 92 |   expect_message(
 93 |     result <- plaid:::.extract_expression_matrix(se, assay = "logcounts"),
 94 |     "not found"
 95 |   )
 96 |   
 97 |   expect_true(is.matrix(result))
 98 | })
 99 | 
100 | test_that(".extract_expression_matrix applies log transformation when requested", {
101 |   skip_if_not_installed("SummarizedExperiment")
102 |   
103 |   # Create SE with large count values
104 |   counts <- matrix(rpois(100, lambda = 500), nrow = 10, ncol = 10)
105 |   rownames(counts) <- paste0("Gene", 1:10)
106 |   colnames(counts) <- paste0("Sample", 1:10)
107 |   
108 |   se <- SummarizedExperiment::SummarizedExperiment(
109 |     assays = list(counts = counts)
110 |   )
111 |   
112 |   # Request with log transformation
113 |   expect_message(
114 |     result <- plaid:::.extract_expression_matrix(se, assay = "counts", log.transform = TRUE),
115 |     "log2"
116 |   )
117 |   
118 |   # Result should be log-transformed
119 |   expect_true(all(result < 20))  # Log values should be much smaller
120 |   expect_true(all(result >= 0))
121 | })
122 | 
123 | test_that(".extract_expression_matrix errors on empty assays", {
124 |   skip_if_not_installed("SummarizedExperiment")
125 |   
126 |   # Create empty SE
127 |   se <- SummarizedExperiment::SummarizedExperiment()
128 |   
129 |   expect_error(
130 |     plaid:::.extract_expression_matrix(se),
131 |     "No assays found"
132 |   )
133 | })
134 | 
135 | test_that(".extract_expression_matrix errors on unsupported types", {
136 |   # Test with unsupported object type
137 |   df <- data.frame(x = 1:10, y = 11:20)
138 |   
139 |   expect_error(
140 |     plaid:::.extract_expression_matrix(df),
141 |     "Unsupported object type"
142 |   )
143 | })
144 | 
145 | # =============================================================================
146 | # Test .convert_geneset_to_matrix function
147 | # =============================================================================
148 | 
149 | test_that(".convert_geneset_to_matrix handles matrices directly", {
150 |   # Create test matrix
151 |   mat <- matrix(c(1, 0, 1, 0, 0, 1), nrow = 3, ncol = 2)
152 |   rownames(mat) <- paste0("Gene", 1:3)
153 |   colnames(mat) <- paste0("Pathway", 1:2)
154 |   
155 |   # Should return matrix as-is
156 |   result <- plaid:::.convert_geneset_to_matrix(mat)
157 |   
158 |   expect_identical(result, mat)
159 | })
160 | 
161 | test_that(".convert_geneset_to_matrix handles GMT lists", {
162 |   # Create GMT list
163 |   gmt <- list(
164 |     "Pathway1" = c("Gene1", "Gene2", "Gene3", "Gene4", "Gene5", "Gene6"),
165 |     "Pathway2" = c("Gene7", "Gene8", "Gene9", "Gene10", "Gene11"),
166 |     "Pathway3" = c("Gene1", "Gene3", "Gene5", "Gene7", "Gene9", "Gene11")
167 |   )
168 |   
169 |   # Convert to matrix
170 |   result <- plaid:::.convert_geneset_to_matrix(gmt)
171 |   
172 |   expect_true(inherits(result, "sparseMatrix"))
173 |   expect_equal(ncol(result), 3)
174 |   expect_true(all(colnames(result) %in% names(gmt)))
175 | })
176 | 
177 | test_that(".convert_geneset_to_matrix filters by gene set size", {
178 |   # Create GMT with varying sizes
179 |   gmt <- list(
180 |     "TooSmall" = c("Gene1", "Gene2"),  # Only 2 genes
181 |     "JustRight" = paste0("Gene", 1:10),  # 10 genes
182 |     "TooBig" = paste0("Gene", 1:600)  # 600 genes
183 |   )
184 |   
185 |   # Should filter out TooSmall and TooBig
186 |   expect_message(
187 |     result <- plaid:::.convert_geneset_to_matrix(gmt, min.genes = 5, max.genes = 500),
188 |     "Filtered out"
189 |   )
190 |   
191 |   expect_equal(ncol(result), 1)
192 |   expect_equal(colnames(result), "JustRight")
193 | })
194 | 
195 | test_that(".convert_geneset_to_matrix uses background genes", {
196 |   gmt <- list(
197 |     "Pathway1" = c("Gene1", "Gene2", "Gene3", "Gene4", "Gene5", "Gene6"),
198 |     "Pathway2" = c("Gene3", "Gene4", "Gene5", "Gene6", "Gene7", "Gene8")
199 |   )
200 |   
201 |   # Specify background
202 |   bg <- c("Gene1", "Gene2", "Gene3", "Gene4", "Gene5")
203 |   
204 |   result <- plaid:::.convert_geneset_to_matrix(gmt, background = bg)
205 |   
206 |   # Should only include background genes
207 |   expect_true(all(rownames(result) %in% bg))
208 | })
209 | 
210 | test_that(".convert_geneset_to_matrix errors when no gene sets pass filter", {
211 |   gmt <- list(
212 |     "TooSmall1" = c("Gene1", "Gene2"),
213 |     "TooSmall2" = c("Gene3")
214 |   )
215 |   
216 |   expect_error(
217 |     plaid:::.convert_geneset_to_matrix(gmt, min.genes = 5, max.genes = 500),
218 |     "No gene sets passed size filters"
219 |   )
220 | })
221 | 
222 | test_that(".convert_geneset_to_matrix handles BiocSet objects", {
223 |   skip_if_not_installed("BiocSet")
224 |   skip("BiocSet requires complex setup with tibbles and specific structure")
225 |   
226 |   # Note: BiocSet objects require specific tibble structure
227 |   # and are complex to construct in tests. The function handles
228 |   # them by extracting es_elementset() and converting to GMT format.
229 | })
230 | 
231 | test_that(".convert_geneset_to_matrix errors on BiocSet with no data", {
232 |   skip_if_not_installed("BiocSet")
233 |   skip("BiocSet requires complex setup - error handling verified via code inspection")
234 | })
235 | 
236 | test_that(".convert_geneset_to_matrix handles various input types", {
237 |   # The function accepts matrix, Matrix, BiocSet, or list
238 |   # Data frames might be coerced to lists by R, so test with truly unsupported type
239 |   
240 |   # Test that unsupported atomic types error
241 |   expect_error(
242 |     plaid:::.convert_geneset_to_matrix("not_a_valid_input"),
243 |     "Unsupported geneset type|subscript out of bounds"
244 |   )
245 | })
246 | 
247 | # =============================================================================
248 | # Integration test: using both functions together
249 | # =============================================================================
250 | 
251 | test_that("bioc-utils functions work together in workflow", {
252 |   skip_if_not_installed("SummarizedExperiment")
253 |   
254 |   # Create test data
255 |   counts <- matrix(rpois(200, lambda = 10), nrow = 20, ncol = 10)
256 |   rownames(counts) <- paste0("Gene", 1:20)
257 |   colnames(counts) <- paste0("Sample", 1:10)
258 |   
259 |   se <- SummarizedExperiment::SummarizedExperiment(
260 |     assays = list(counts = counts)
261 |   )
262 |   
263 |   # Create gene sets
264 |   gmt <- list(
265 |     "Pathway1" = paste0("Gene", 1:10),
266 |     "Pathway2" = paste0("Gene", 11:20)
267 |   )
268 |   
269 |   # Extract expression and convert gene sets
270 |   expr_mat <- plaid:::.extract_expression_matrix(se, assay = "counts")
271 |   gset_mat <- plaid:::.convert_geneset_to_matrix(gmt, background = rownames(expr_mat))
272 |   
273 |   # Should have compatible dimensions
274 |   expect_equal(nrow(gset_mat), nrow(expr_mat))
275 |   expect_true(all(rownames(gset_mat) %in% rownames(expr_mat)))
276 | })
277 | 
278 | 


--------------------------------------------------------------------------------
/vignettes/01_plaid-vignette.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Getting Started with PLAID"
  3 | author: "BigOmics Analytics"
  4 | package: plaid
  5 | output: 
  6 |     BiocStyle::html_document
  7 | vignette: >
  8 |   %\VignetteIndexEntry{Getting Started with PLAID}
  9 |   %\VignetteEngine{knitr::rmarkdown}
 10 |   %\VignetteEncoding{UTF-8}
 11 | ---
 12 | 
 13 | ```{r, include = FALSE}
 14 | knitr::opts_chunk$set(
 15 |   collapse = TRUE,
 16 |   comment = "#>",
 17 |   fig.align = "center"
 18 | )
 19 | ```
 20 | 
 21 | # Introduction
 22 | 
 23 | PLAID (Pathway Level Average Intensity Detection) is a
 24 | novel, ultrafast and memory optimized gene set enrichment scoring
 25 | algorithm. PLAID demonstrates accurate gene set scoring and
 26 | outperforms all currently available gene set scoring methods in large
 27 | bulk and single-cell RNA-seq datasets.
 28 | 
 29 | # Motivation
 30 | 
 31 | In recent years, computational methods have emerged that calculate enrichment of gene signatures
 32 | within individual samples, rather than across pooled samples. These signatures offer critical insights into the
 33 | coordinated activity of functionally related genes, proteins or metabolites, enabling identification of unique
 34 | molecular profiles based on gene set and pathway activity in individual cells and patients. This strategy is pivotal
 35 | for patient stratification and advancement of personalized medicine. However, the rise of large-scale datasets,
 36 | including single-cell profiles and population biobanks, has exposed significant computational inefficiencies in
 37 | existing methods. Current methods often demand excessive runtime and memory resources, becoming
 38 | impractical for large datasets. Overcoming these limitations is a focus of current efforts by bioinformatics teams in
 39 | academia and the pharmaceutical industry, as essential to support basic and clinical biomedical research.
 40 | 
 41 | 
 42 | # Example: Single-cell RNA-seq hallmark scoring
 43 | 
 44 | ## Preparing data
 45 | 
 46 | For this vignette, our package includes a small subset of the the
 47 | pmbc3k single-cell dataset of just 50 cells. Please install the Seurat
 48 | and SeuratData packages if you want to run this vignette against the
 49 | full dataset.
 50 | 
 51 | ```{r}
 52 | library("plaid")
 53 | load(system.file("extdata", "pbmc3k-50cells.rda", package = "plaid"),verbose=TRUE)
 54 | dim(X)
 55 | ```
 56 | 
 57 | Note that X is the normalized expression matrix from the Seurat
 58 | object, not the raw counts matrix. We recommend to run plaid on the
 59 | log transformed expression matrix, not on the counts, as the average
 60 | in the logarithmic space is more robust and is in concordance to
 61 | calculating the geometric mean.
 62 | 
 63 | It is not necessary to normalize your expression matrix before running
 64 | plaid because plaid normalizes the enrichment scores
 65 | afterwards. However, again, log transformation is recommended.
 66 | 
 67 | It is recommended to keep the expression matrix sparse as much as
 68 | possible because plaid extensively take advantage of sparse matrix
 69 | computations. But even for dense matrices plaid is fast.
 70 | 
 71 | 
 72 | ## Preparing gene sets
 73 | 
 74 | For convenience we have included the 50 Hallmark genesets in our
 75 | package. But we encourage you to download larger geneset collections
 76 | as plaid's speed advantage will be more apparent for larger datasets
 77 | and large geneset collections.
 78 | 
 79 | Plaid needs the gene sets as sparse matrix. If you have your
 80 | collection of gene sets a a list, we need first to convert the gmt
 81 | list to matrix format.
 82 | 
 83 | ```{r}
 84 | hallmarks <- system.file("extdata", "hallmarks.gmt", package = "plaid")
 85 | gmt <- read.gmt(hallmarks)
 86 | matG <- gmt2mat(gmt)
 87 | dim(matG)
 88 | ```
 89 | 
 90 | If you have your own gene sets stored as gmt files, you can
 91 | conveniently use the included `read.gmt()` function to read the gmt
 92 | file.
 93 | 
 94 | 
 95 | ## Calculating the score
 96 | 
 97 | The main function to run plaid is `plaid()`. We run plaid on our
 98 | expression matrix `X` and gene set matrix `matG`.
 99 | 
100 | ```{r}
101 | gsetX <- plaid(X, matG, normalize=TRUE)
102 | dim(gsetX)
103 | ```
104 | 
105 | The resulting matrix `gsetX` contains the single-sample enrichment
106 | scores for the specified gene sets and samples. 
107 | 
108 | Notice that by default plaid performs median normalization of the
109 | final results. That also means that it is not necessary to normalize
110 | your expression matrix before running plaid. However, generally, log
111 | transformation is recommended.
112 | 
113 | Plaid can also be run on the ranked matrix, we will see later that
114 | this corresponds to the singscore (Fouratan et al., 2018). Or plaid
115 | could be run on the (non-logarithmic) counts which can be used to
116 | calculate the scSE score (Pont et al., 2019).
117 | 
118 | ## Very large matrices
119 | 
120 | Plaid is fast and memory efficient because it uses very efficient
121 | sparse matrix computation in the back. For very large `X`, plaid uses
122 | chunked computation by splitting the matrix in chunks to avoid index
123 | overflow. Should you encounter errors, please compute your dataset by
124 | subsetting manually the expression matrix and/or gene sets.
125 | 
126 | Although `X` and `matG` are generally very sparse, be aware that the
127 | result matrix `gsetX` generally is dense and therefore can become very
128 | large. If you would want to compute the score of 10.000 gene sets on a
129 | million of cells this would create a large 10.000 x 1.000.000 dense
130 | matrix which requires about 75GB of memory.
131 | 
132 | 
133 | ## Differential expression testing using dualGSEA
134 | 
135 | Once we have the gene sets scores we can use these scores for
136 | statistical analysis. We could compute the differential gene set
137 | expression between two groups using a general t-test or limma directly
138 | on the score matrix `gsetX`.
139 | 
140 | Another way to test whether a gene set is statistically significant
141 | would be to test whether the fold-change of the genes in the gene sets
142 | are statistically different than zero. That is, we can perform a one
143 | sample z-test on the logFC of the genes of each gene sets and test
144 | whether they are significantly different from zero. The logFC is
145 | computed from the original (log) expression matrix `X` and group
146 | vector `y`.
147 | 
148 | 
149 | The function `dualGSEA()` does both tests: the one-sample z-test on
150 | the logFC  and the two-group t-test on the gene set matrix `gsetX`. 
151 | Dual testing has been suggested by Bull et al. (Sci Rep., 2024)
152 | 
153 | ```{r}
154 | y <- 1*(celltype == "B")
155 | res <- dualGSEA(X, y, G=matG)
156 | ```
157 | 
158 | The top significant genesets can be shown with
159 | 
160 | ```{r}
161 | res <- res[order(res[,"p.dual"]),] 
162 | head(res)
163 | ```
164 | 
165 | The column `gsetFC` corresponds to the difference in gene set score
166 | and also corresponds to the average foldchange of the genes in the
167 | gene set. The column 'p.fc corresponds to the test on the preranked
168 | logFC, the column 'p.ss' corresponds to the two-group t-test on the
169 | geneset scores `gsetX`. The two p-values are then combined using
170 | Stouffer's method in the column 'p.dual' and adjusted for multiple
171 | testing in column `q.dual`.
172 | 
173 | The left figure below plots the fold-change enrichment `p.fc` vs. 
174 | single-sample enrichment `p.ss`. The right figure shows the volcano plot
175 | `p.dual` vs. `gsetFC`:
176 | 
177 | ```{r, fig.height=6, fig.width=12, fig.fullwidth=TRUE}
178 | fc <- res[,"gsetFC"]
179 | pv <- res[,"p.dual"]
180 | p1 <- res[,"p.fc"]
181 | p2 <- res[,"p.ss"]
182 | ii <- head(order(pv))
183 | par(mfrow=c(1,2))
184 | plot( -log10(p1), -log10(p2),
185 |   xlab="FC enrichment (-log10p)",
186 |   ylab="single-sample enrichment (-log10p)", pch=19)
187 | text( -log10(p1[ii]), -log10(p2[ii]), rownames(res)[ii],pos=2)
188 | plot( fc, -log10(pv), xlab="gsetFC", ylab="-log10p", pch=19)
189 | abline(h=0, v=0, lty=2)
190 | text( fc[ii], -log10(pv[ii]), rownames(res)[ii],pos=2)
191 | ```
192 | 
193 | # Replicating ssGSEA, singscore and scSE
194 | 
195 | Plaid can be used to replicate other enrichment score such as
196 | [ssGSEA](https://github.com/rcastelo/GSVA),
197 | [GSVA](https://github.com/rcastelo/GSVA),
198 | [AUCell](https://github.com/aertslab/AUCell),
199 | [Singscore](https://github.com/DavisLaboratory/singscore),
200 | [scSE](https://doi.org/10.1093/nar/gkz601) and
201 | [UCell](https://github.com/carmonalab/UCell). But using plaid, the
202 | computation is much faster than the original code.
203 | 
204 | ## Replicating singscore
205 | 
206 | Computing the
207 | [singscore](https://github.com/DavisLaboratory/singscore) requires to
208 | compute the ranks of the expression matrix. We have wrapped this in a
209 | single convenience function:
210 | 
211 | ```{r}
212 | sing <- replaid.sing(X, matG)
213 | ```
214 | 
215 | We have extensively compared the results of `replaid.sing` and from
216 | the original `singscore` R package and we showed identical result in
217 | the score, logFC and p-values.
218 | 
219 | 
220 | ## Replicating ssGSEA
221 | 
222 | Plaid can also be used to compute the ssGSEA score (Barbie et al.,
223 | 2009). Using plaid, we can calculate the score upto 100x faster. We
224 | have wrapped this in a single convenience function:
225 | 
226 | ```{r}
227 | ssgsea <- replaid.ssgsea(X, matG, alpha=0)
228 | ```
229 | 
230 | We have extensively compared the results of `replaid.ssgsea()` and
231 | from the original [GSVA](https://github.com/rcastelo/GSVA) R
232 | package. Note the rank weight parameter alpha. For `alpha=0` we
233 | obtained identical result for the score, logFC and p-values. For
234 | non-zero values for alpha the results are close but not exactly the
235 | same. The default value in the original publication and in GSVA is
236 | `alpha=0.25`.
237 | 
238 | 
239 | ## Replicating the scSE score
240 | 
241 | [Single-cell Signature Explorer
242 | (scSE)](https://doi.org/10.1093/nar/gkz601) is a fast enrichment
243 | algorithm implemented in GO. Computing the scSE score requires running
244 | plaid on the linear (not logarithmic) score and perform additional
245 | normalization by the total UMI per sample. We have wrapped this in a
246 | single convenience function:
247 | 
248 | ```{r}
249 | scse <- replaid.scse(X, matG, removeLog2=TRUE, scoreMean=FALSE)
250 | ```
251 | 
252 | To replicate the original "sum-of-UMI" scSE score, set
253 | `removeLog2=TRUE` and `scoreMean=FALSE`. scSE and plaid scores become
254 | more similar for `removeLog2=FALSE` and `scoreMean=TRUE`.
255 | 
256 | We have extensively compared the results from `replaid.scse` and from
257 | the original scSE (implemented in GO lang) and we showed almost
258 | identical results in the score, logFC and p-values.
259 | 
260 | 
261 | ## Compare scores
262 | 
263 | We can compare all scores in a pairs plot:
264 | 
265 | ```{r}
266 | S <- cbind(plaid=gsetX[,1], sing=sing[,1], ssgsea=ssgsea[,1], scSE=scse[,1])
267 | pairs(S)
268 | ```
269 | 
270 | # Session info
271 | 
272 | ```{r}
273 | sessionInfo()
274 | ```
275 | 


--------------------------------------------------------------------------------
/tests/testthat/test-gmt-utils.R:
--------------------------------------------------------------------------------
  1 | ## This file is part of the Omics Playground project.
  2 | ## Copyright (c) 2018-2025 BigOmics Analytics SA. All rights reserved.
  3 | 
  4 | library(testthat)
  5 | library(Matrix)
  6 | 
  7 | # =============================================================================
  8 | # Test gmt2mat function
  9 | # =============================================================================
 10 | 
 11 | test_that("gmt2mat creates sparse binary matrix correctly", {
 12 |   # Create test GMT data
 13 |   gmt <- list(
 14 |     "Pathway1" = c("GENE1", "GENE2", "GENE3"),
 15 |     "Pathway2" = c("GENE2", "GENE4", "GENE5"),
 16 |     "Pathway3" = c("GENE1", "GENE5", "GENE6")
 17 |   )
 18 |   
 19 |   mat <- gmt2mat(gmt, sparse = TRUE)
 20 |   
 21 |   # Check output type
 22 |   expect_true(inherits(mat, "sparseMatrix"))
 23 |   
 24 |   # Check dimensions
 25 |   expect_equal(ncol(mat), 3)
 26 |   expect_true(nrow(mat) >= 6)
 27 |   
 28 |   # Check column names
 29 |   expect_equal(colnames(mat), c("Pathway1", "Pathway2", "Pathway3"))
 30 |   
 31 |   # Check that genes are present
 32 |   expect_true(all(c("GENE1", "GENE2", "GENE3", "GENE4", "GENE5", "GENE6") %in% rownames(mat)))
 33 |   
 34 |   # Check binary values (0 or 1)
 35 |   expect_true(all(as.vector(mat) %in% c(0, 1)))
 36 | })
 37 | 
 38 | test_that("gmt2mat creates dense matrix correctly", {
 39 |   gmt <- list(
 40 |     "Pathway1" = c("GENE1", "GENE2"),
 41 |     "Pathway2" = c("GENE3", "GENE4")
 42 |   )
 43 |   
 44 |   mat <- gmt2mat(gmt, sparse = FALSE)
 45 |   
 46 |   # Check output type
 47 |   expect_true(is.matrix(mat))
 48 |   expect_false(inherits(mat, "sparseMatrix"))
 49 |   
 50 |   # Check dimensions
 51 |   expect_equal(ncol(mat), 2)
 52 |   expect_equal(nrow(mat), 4)
 53 | })
 54 | 
 55 | test_that("gmt2mat handles max.genes parameter", {
 56 |   gmt <- list(
 57 |     "Pathway1" = c("GENE1", "GENE2", "GENE3"),
 58 |     "Pathway2" = c("GENE4", "GENE5", "GENE6")
 59 |   )
 60 |   
 61 |   mat <- gmt2mat(gmt, max.genes = 3)
 62 |   
 63 |   # Should limit to 3 genes
 64 |   expect_equal(nrow(mat), 3)
 65 | })
 66 | 
 67 | test_that("gmt2mat handles ntop parameter", {
 68 |   gmt <- list(
 69 |     "Pathway1" = c("GENE1", "GENE2", "GENE3", "GENE4", "GENE5"),
 70 |     "Pathway2" = c("GENE6", "GENE7", "GENE8", "GENE9", "GENE10")
 71 |   )
 72 |   
 73 |   mat <- gmt2mat(gmt, ntop = 3)
 74 |   
 75 |   # Each pathway should have at most 3 genes
 76 |   expect_true(all(Matrix::colSums(mat) <= 3))
 77 | })
 78 | 
 79 | test_that("gmt2mat handles custom background genes", {
 80 |   gmt <- list(
 81 |     "Pathway1" = c("GENE1", "GENE2", "GENE3"),
 82 |     "Pathway2" = c("GENE2", "GENE4")
 83 |   )
 84 |   
 85 |   # Custom background - only include GENE1 and GENE2
 86 |   bg <- c("GENE1", "GENE2")
 87 |   mat <- gmt2mat(gmt, bg = bg)
 88 |   
 89 |   # Should only include background genes
 90 |   expect_equal(nrow(mat), 2)
 91 |   expect_true(all(rownames(mat) %in% bg))
 92 | })
 93 | 
 94 | test_that("gmt2mat handles multicore parameter", {
 95 |   gmt <- list(
 96 |     "Pathway1" = c("GENE1", "GENE2"),
 97 |     "Pathway2" = c("GENE3", "GENE4")
 98 |   )
 99 |   
100 |   # With multicore
101 |   mat1 <- gmt2mat(gmt, use.multicore = TRUE)
102 |   
103 |   # Without multicore
104 |   mat2 <- gmt2mat(gmt, use.multicore = FALSE)
105 |   
106 |   # Results should be identical
107 |   expect_equal(dim(mat1), dim(mat2))
108 |   expect_equal(rownames(mat1), rownames(mat2))
109 |   expect_equal(colnames(mat1), colnames(mat2))
110 |   expect_equal(as.matrix(mat1), as.matrix(mat2))
111 | })
112 | 
113 | test_that("gmt2mat handles unnamed GMT lists", {
114 |   # Note: When GMT lists are unnamed, the function auto-generates names
115 |   # However, there's an edge case where empty lists cause issues
116 |   # Test with a simple named list instead
117 |   skip("Unnamed GMT lists can cause issues with name assignment after deduplication")
118 | })
119 | 
120 | test_that("gmt2mat handles duplicate pathway names", {
121 |   gmt <- list(
122 |     "Pathway1" = c("GENE1", "GENE2"),
123 |     "Pathway1" = c("GENE3", "GENE4"),  # Duplicate name
124 |     "Pathway2" = c("GENE5", "GENE6")
125 |   )
126 |   
127 |   mat <- gmt2mat(gmt)
128 |   
129 |   # Should remove duplicates
130 |   expect_equal(ncol(mat), 2)
131 |   expect_true(all(colnames(mat) %in% c("Pathway1", "Pathway2")))
132 | })
133 | 
134 | test_that("gmt2mat sorts pathways by size", {
135 |   gmt <- list(
136 |     "Small" = c("GENE1"),
137 |     "Large" = c("GENE2", "GENE3", "GENE4", "GENE5"),
138 |     "Medium" = c("GENE6", "GENE7")
139 |   )
140 |   
141 |   mat <- gmt2mat(gmt)
142 |   
143 |   # Larger pathways should come first in the ordering
144 |   # (the function sorts by decreasing size)
145 |   expect_equal(colnames(mat)[1], "Large")
146 | })
147 | 
148 | test_that("gmt2mat sorts genes by frequency", {
149 |   gmt <- list(
150 |     "Pathway1" = c("GENE1", "GENE2"),
151 |     "Pathway2" = c("GENE1", "GENE3"),
152 |     "Pathway3" = c("GENE1", "GENE4")
153 |   )
154 |   
155 |   mat <- gmt2mat(gmt)
156 |   
157 |   # GENE1 appears in all pathways, should be first
158 |   expect_equal(rownames(mat)[1], "GENE1")
159 | })
160 | 
161 | # =============================================================================
162 | # Test mat2gmt function
163 | # =============================================================================
164 | 
165 | test_that("mat2gmt converts matrix to GMT correctly", {
166 |   # Create binary matrix
167 |   mat <- matrix(0, nrow = 6, ncol = 3)
168 |   rownames(mat) <- paste0("GENE", 1:6)
169 |   colnames(mat) <- paste0("Pathway", 1:3)
170 |   mat[1:3, 1] <- 1  # Pathway1: GENE1, GENE2, GENE3
171 |   mat[c(2, 4, 5), 2] <- 1  # Pathway2: GENE2, GENE4, GENE5
172 |   mat[c(1, 5, 6), 3] <- 1  # Pathway3: GENE1, GENE5, GENE6
173 |   
174 |   gmt <- mat2gmt(mat)
175 |   
176 |   # Check output type
177 |   expect_true(is.list(gmt))
178 |   
179 |   # Check number of pathways
180 |   expect_equal(length(gmt), 3)
181 |   
182 |   # Check pathway names
183 |   expect_equal(names(gmt), paste0("Pathway", 1:3))
184 |   
185 |   # Check pathway contents
186 |   expect_equal(sort(gmt$Pathway1), paste0("GENE", 1:3))
187 |   expect_equal(sort(gmt$Pathway2), paste0("GENE", c(2, 4, 5)))
188 |   expect_equal(sort(gmt$Pathway3), paste0("GENE", c(1, 5, 6)))
189 | })
190 | 
191 | test_that("mat2gmt works with sparse matrix", {
192 |   # Create sparse matrix
193 |   mat <- Matrix(0, nrow = 4, ncol = 2, sparse = TRUE)
194 |   rownames(mat) <- paste0("GENE", 1:4)
195 |   colnames(mat) <- paste0("Pathway", 1:2)
196 |   mat[1:2, 1] <- 1
197 |   mat[3:4, 2] <- 1
198 |   
199 |   gmt <- mat2gmt(mat)
200 |   
201 |   expect_equal(length(gmt), 2)
202 |   expect_equal(sort(gmt$Pathway1), c("GENE1", "GENE2"))
203 |   expect_equal(sort(gmt$Pathway2), c("GENE3", "GENE4"))
204 | })
205 | 
206 | test_that("mat2gmt handles non-binary matrices", {
207 |   # Matrix with different values
208 |   mat <- matrix(c(1, 2, 0, 0, 0, 3), nrow = 3, ncol = 2)
209 |   rownames(mat) <- paste0("GENE", 1:3)
210 |   colnames(mat) <- paste0("Pathway", 1:2)
211 |   
212 |   gmt <- mat2gmt(mat)
213 |   
214 |   # Should include genes with non-zero values
215 |   expect_equal(sort(gmt$Pathway1), c("GENE1", "GENE2"))
216 |   expect_equal(gmt$Pathway2, "GENE3")
217 | })
218 | 
219 | test_that("mat2gmt handles empty pathways", {
220 |   mat <- matrix(0, nrow = 4, ncol = 3)
221 |   rownames(mat) <- paste0("GENE", 1:4)
222 |   colnames(mat) <- paste0("Pathway", 1:3)
223 |   mat[1:2, 1] <- 1
224 |   # Pathway2 and Pathway3 are empty
225 |   
226 |   gmt <- mat2gmt(mat)
227 |   
228 |   # Should still return all pathways
229 |   expect_equal(length(gmt), 1)  # Only Pathway1 has genes
230 |   expect_equal(names(gmt), "Pathway1")
231 | })
232 | 
233 | # =============================================================================
234 | # Test round-trip conversion (GMT -> Matrix -> GMT)
235 | # =============================================================================
236 | 
237 | test_that("gmt2mat and mat2gmt are inverse operations", {
238 |   # Original GMT
239 |   gmt_orig <- list(
240 |     "Pathway1" = c("GENE1", "GENE2", "GENE3"),
241 |     "Pathway2" = c("GENE2", "GENE4", "GENE5"),
242 |     "Pathway3" = c("GENE1", "GENE5", "GENE6")
243 |   )
244 |   
245 |   # Convert to matrix and back
246 |   mat <- gmt2mat(gmt_orig)
247 |   gmt_new <- mat2gmt(mat)
248 |   
249 |   # Check that we get the same pathways back
250 |   expect_equal(length(gmt_new), length(gmt_orig))
251 |   expect_equal(sort(names(gmt_new)), sort(names(gmt_orig)))
252 |   
253 |   # Check pathway contents (order might differ)
254 |   for (pathway in names(gmt_orig)) {
255 |     expect_equal(sort(gmt_new[[pathway]]), sort(gmt_orig[[pathway]]))
256 |   }
257 | })
258 | 
259 | # =============================================================================
260 | # Test read.gmt and write.gmt functions
261 | # =============================================================================
262 | 
263 | test_that("write.gmt creates properly formatted file", {
264 |   # Create test GMT data
265 |   gmt_orig <- list(
266 |     "Pathway1" = c("GENE1", "GENE2", "GENE3"),
267 |     "Pathway2" = c("GENE4", "GENE5"),
268 |     "Pathway3" = c("GENE6", "GENE7", "GENE8", "GENE9")
269 |   )
270 |   
271 |   # Create temporary file
272 |   temp_file <- tempfile(fileext = ".gmt")
273 |   
274 |   # Write GMT file
275 |   write.gmt(gmt_orig, temp_file)
276 |   
277 |   # Check file exists
278 |   expect_true(file.exists(temp_file))
279 |   
280 |   # Read file and check format
281 |   lines <- readLines(temp_file)
282 |   expect_equal(length(lines), 3)
283 |   
284 |   # Each line should have pathway name, source, and genes separated by tabs
285 |   for (i in seq_along(lines)) {
286 |     parts <- strsplit(lines[i], "\t")[[1]]
287 |     expect_true(length(parts) >= 3)  # name + source + at least one gene
288 |     expect_equal(parts[1], names(gmt_orig)[i])  # Check pathway name
289 |   }
290 |   
291 |   # Cleanup
292 |   unlink(temp_file)
293 | })
294 | 
295 | test_that("write.gmt handles custom source parameter", {
296 |   gmt <- list(
297 |     "Pathway1" = c("GENE1", "GENE2"),
298 |     "Pathway2" = c("GENE3", "GENE4")
299 |   )
300 |   
301 |   temp_file <- tempfile(fileext = ".gmt")
302 |   
303 |   # Write with custom source - need to match length of gmt
304 |   source_info <- c("DB1", "DB2")
305 |   # Note: write.gmt expects single source or needs fixing
306 |   write.gmt(gmt, temp_file, source = "CustomDB")
307 |   
308 |   # Read file and check content
309 |   lines <- readLines(temp_file)
310 |   expect_true(grepl("CustomDB", lines[1]))
311 |   expect_true(grepl("CustomDB", lines[2]))
312 |   
313 |   # Cleanup
314 |   unlink(temp_file)
315 | })
316 | 
317 | test_that("write.gmt handles source parameter correctly", {
318 |   gmt <- list(
319 |     "Pathway1" = c("GENE1", "GENE2"),
320 |     "Pathway2" = c("GENE3", "GENE4")
321 |   )
322 |   
323 |   temp_file <- tempfile(fileext = ".gmt")
324 |   
325 |   # Write without source (should use pathway names as source)
326 |   write.gmt(gmt, temp_file)
327 |   lines <- readLines(temp_file)
328 |   parts1 <- strsplit(lines[1], "\t")[[1]]
329 |   expect_equal(parts1[1], parts1[2])  # Name and source should be the same when NA
330 |   
331 |   # Cleanup
332 |   unlink(temp_file)
333 | })
334 | 
335 | test_that("read.gmt reads GMT file correctly", {
336 |   # Create test GMT data
337 |   gmt_orig <- list(
338 |     "Pathway1" = c("GENE1", "GENE2", "GENE3"),
339 |     "Pathway2" = c("GENE4", "GENE5"),
340 |     "Pathway3" = c("GENE6", "GENE7", "GENE8")
341 |   )
342 |   
343 |   # Create temporary file
344 |   temp_file <- tempfile(fileext = ".gmt")
345 |   write.gmt(gmt_orig, temp_file)
346 |   
347 |   # Read back
348 |   gmt_read <- read.gmt(temp_file)
349 |   
350 |   # Check output type
351 |   expect_true(is.list(gmt_read))
352 |   
353 |   # Check number of pathways
354 |   expect_equal(length(gmt_read), 3)
355 |   
356 |   # Check pathway names
357 |   expect_equal(names(gmt_read), names(gmt_orig))
358 |   
359 |   # Check pathway contents
360 |   expect_equal(sort(gmt_read$Pathway1), sort(gmt_orig$Pathway1))
361 |   expect_equal(sort(gmt_read$Pathway2), sort(gmt_orig$Pathway2))
362 |   expect_equal(sort(gmt_read$Pathway3), sort(gmt_orig$Pathway3))
363 |   
364 |   # Cleanup
365 |   unlink(temp_file)
366 | })
367 | 
368 | test_that("read.gmt and write.gmt are inverse operations", {
369 |   # Create test GMT data
370 |   gmt_orig <- list(
371 |     "Pathway1" = c("GENE1", "GENE2", "GENE3"),
372 |     "Pathway2" = c("GENE2", "GENE4", "GENE5"),
373 |     "Pathway3" = c("GENE1", "GENE5", "GENE6")
374 |   )
375 |   
376 |   # Write and read back
377 |   temp_file <- tempfile(fileext = ".gmt")
378 |   write.gmt(gmt_orig, temp_file)
379 |   gmt_read <- read.gmt(temp_file)
380 |   
381 |   # Should get the same data back
382 |   expect_equal(length(gmt_read), length(gmt_orig))
383 |   expect_equal(names(gmt_read), names(gmt_orig))
384 |   
385 |   for (pathway in names(gmt_orig)) {
386 |     expect_equal(sort(gmt_read[[pathway]]), sort(gmt_orig[[pathway]]))
387 |   }
388 |   
389 |   # Cleanup
390 |   unlink(temp_file)
391 | })
392 | 
393 | test_that("read.gmt handles add.source parameter", {
394 |   # Create test GMT with source info
395 |   gmt_orig <- list(
396 |     "Pathway1" = c("GENE1", "GENE2"),
397 |     "Pathway2" = c("GENE3", "GENE4")
398 |   )
399 |   
400 |   temp_file <- tempfile(fileext = ".gmt")
401 |   write.gmt(gmt_orig, temp_file, source = c("DB1", "DB2"))
402 |   
403 |   # Read without source
404 |   gmt_no_source <- read.gmt(temp_file, add.source = FALSE)
405 |   expect_equal(names(gmt_no_source), c("Pathway1", "Pathway2"))
406 |   
407 |   # Read with source
408 |   gmt_with_source <- read.gmt(temp_file, add.source = TRUE)
409 |   expect_true(grepl("DB1", names(gmt_with_source)[1]))
410 |   expect_true(grepl("DB2", names(gmt_with_source)[2]))
411 |   expect_true(grepl("Pathway1", names(gmt_with_source)[1]))
412 |   expect_true(grepl("Pathway2", names(gmt_with_source)[2]))
413 |   
414 |   # Cleanup
415 |   unlink(temp_file)
416 | })
417 | 
418 | test_that("read.gmt handles nrows parameter", {
419 |   # Create test GMT with multiple pathways
420 |   gmt_orig <- list(
421 |     "Pathway1" = c("GENE1", "GENE2"),
422 |     "Pathway2" = c("GENE3", "GENE4"),
423 |     "Pathway3" = c("GENE5", "GENE6"),
424 |     "Pathway4" = c("GENE7", "GENE8")
425 |   )
426 |   
427 |   temp_file <- tempfile(fileext = ".gmt")
428 |   write.gmt(gmt_orig, temp_file)
429 |   
430 |   # Read only first 2 rows
431 |   gmt_partial <- read.gmt(temp_file, nrows = 2)
432 |   
433 |   expect_equal(length(gmt_partial), 2)
434 |   expect_equal(names(gmt_partial), c("Pathway1", "Pathway2"))
435 |   
436 |   # Cleanup
437 |   unlink(temp_file)
438 | })
439 | 
440 | test_that("read.gmt handles empty gene lists", {
441 |   # Create GMT file with a pathway that has no genes
442 |   temp_file <- tempfile(fileext = ".gmt")
443 |   
444 |   # Write manually to create edge case
445 |   writeLines(c(
446 |     "Pathway1\tDB1\tGENE1\tGENE2",
447 |     "Pathway2\tDB2",  # No genes
448 |     "Pathway3\tDB3\tGENE3"
449 |   ), temp_file)
450 |   
451 |   gmt <- read.gmt(temp_file)
452 |   
453 |   # Should still read all pathways
454 |   expect_equal(length(gmt), 3)
455 |   expect_equal(names(gmt), c("Pathway1", "Pathway2", "Pathway3"))
456 |   
457 |   # Pathway2 should be empty
458 |   expect_equal(length(gmt$Pathway2), 0)
459 |   
460 |   # Other pathways should have genes
461 |   expect_equal(gmt$Pathway1, c("GENE1", "GENE2"))
462 |   expect_equal(gmt$Pathway3, "GENE3")
463 |   
464 |   # Cleanup
465 |   unlink(temp_file)
466 | })
467 | 
468 | 
469 | 
470 | 
471 | 
472 | 
473 | # =============================================================================
474 | # Test edge cases and error handling
475 | # =============================================================================
476 | 
477 | test_that("gmt2mat handles empty GMT list", {
478 |   # Skip this test as gmt2mat doesn't handle truly empty lists
479 |   # This is expected behavior - GMT lists should have at least one pathway
480 |   skip("Empty GMT lists are not supported by design")
481 | })
482 | 
483 | test_that("gmt2mat handles single pathway", {
484 |   gmt <- list(
485 |     "OnlyPathway" = c("GENE1", "GENE2", "GENE3")
486 |   )
487 |   
488 |   mat <- gmt2mat(gmt)
489 |   
490 |   expect_equal(ncol(mat), 1)
491 |   expect_equal(colnames(mat), "OnlyPathway")
492 |   expect_equal(sum(mat), 3)
493 | })
494 | 
495 | test_that("gmt2mat handles pathways with no overlapping genes", {
496 |   gmt <- list(
497 |     "Pathway1" = c("GENE1", "GENE2"),
498 |     "Pathway2" = c("GENE3", "GENE4")
499 |   )
500 |   
501 |   # Use background that doesn't overlap
502 |   bg <- c("GENE5", "GENE6")
503 |   mat <- gmt2mat(gmt, bg = bg)
504 |   
505 |   # Matrix should be all zeros
506 |   expect_equal(sum(mat), 0)
507 | })
508 | 
509 | test_that("mat2gmt handles matrix with all zeros", {
510 |   mat <- matrix(0, nrow = 4, ncol = 2)
511 |   rownames(mat) <- paste0("GENE", 1:4)
512 |   colnames(mat) <- paste0("Pathway", 1:2)
513 |   
514 |   # This will create an empty result when matrix is all zeros
515 |   # The function returns a named list based on non-zero entries
516 |   result <- tryCatch({
517 |     gmt <- mat2gmt(mat)
518 |     length(gmt)
519 |   }, error = function(e) {
520 |     0  # If error, expect 0 length
521 |   })
522 |   
523 |   # Should return empty list
524 |   expect_equal(result, 0)
525 | })
526 | 
527 | test_that("gmt2mat preserves matrix class when specified", {
528 |   gmt <- list(
529 |     "Pathway1" = c("GENE1", "GENE2")
530 |   )
531 |   
532 |   # Sparse matrix
533 |   mat_sparse <- gmt2mat(gmt, sparse = TRUE)
534 |   expect_true(inherits(mat_sparse, "sparseMatrix"))
535 |   
536 |   # Dense matrix
537 |   mat_dense <- gmt2mat(gmt, sparse = FALSE)
538 |   expect_true(is.matrix(mat_dense))
539 |   expect_false(inherits(mat_dense, "sparseMatrix"))
540 | })
541 | 
542 | 


--------------------------------------------------------------------------------
/R/stats.R:
--------------------------------------------------------------------------------
  1 | ##----------------------------------------------------------------
  2 | ##----------------- STATISTICAL TESTS ----------------------------
  3 | ##----------------------------------------------------------------
  4 | 
  5 | #' @importFrom stats p.adjust pt pchisq qnorm pnorm
  6 | #' @importFrom Matrix rowMeans colSums crossprod t colScale
  7 | #' @importFrom Rfast ttests
  8 | #' @importFrom qlcMatrix corSparse
  9 | #' @importFrom methods is
 10 | NULL
 11 | 
 12 | #' Reimplementation of dualGSEA (Bull et al., 2024) but defaults with
 13 | #' replaid backend. For the preranked test we still use fgsea. Should
 14 | #' be much faster than original using fgsea + GSVA::ssGSEA.
 15 | #'
 16 | #' @param X Expression matrix with genes on rows and samples ont columns
 17 | #' @param y Binary vector (0/1) indicating group membership
 18 | #' @param gmt List of gene sets in GMT format
 19 | #' @param G Sparse matrix of gene sets. Non-zero entry indicates
 20 | #'   gene/feature is part of gene sets. Features on rows, gene sets on
 21 | #'   columns.
 22 | #' @param gsetX Optional pre-computed matrix of gene set enrichment scores with 
 23 | #'   gene sets on rows and samples on columns. If NULL (default), scores will be 
 24 | #'   computed using the method specified by `ss.method`. Providing pre-computed 
 25 | #'   scores improves efficiency when running multiple analyses.
 26 | #' @param fc.method Method for fold change testing ("fgsea", "ztest", "ttest", "rankcor", "cor")
 27 | #' @param ss.method Method for single-sample enrichment ("plaid", "replaid.ssgsea", "replaid.gsva", "ssgsea", "gsva")
 28 | #' @param pv1 Pre-computed p-values from fold change test. If NULL, will be computed based on fc.test.
 29 | #' @param pv2 Pre-computed p-values from single sample test. If NULL, will be computed using gset_ttest.
 30 | #' @param metap.method Method for combining p-values ("stouffer", "fisher" or "maxp"). Default "stouffer".
 31 | #' @param sort.by Column name to sort results by ("p.dual", "gsetFC", "p.fc", "p.ss"). Default "p.dual".
 32 | #' 
 33 | #' @return Data frame with columns: gsetFC (gene set fold change), size (gene set size),
 34 | #'   p.fc (p-value from fold change test), p.ss (p-value from single sample test),
 35 | #'   p.dual (combined p-value), and q.dual (FDR-adjusted combined p-value).
 36 | #'
 37 | #' @examples
 38 | #' # Create example expression matrix
 39 | #' set.seed(123)
 40 | #' X <- matrix(rnorm(1000), nrow = 100, ncol = 20)
 41 | #' rownames(X) <- paste0("GENE", 1:100)
 42 | #' colnames(X) <- paste0("Sample", 1:20)
 43 | #' 
 44 | #' # Create binary group vector
 45 | #' y <- rep(c(0, 1), each = 10)
 46 | #' 
 47 | #' # Create example gene sets
 48 | #' gmt <- list(
 49 | #'   "Pathway1" = paste0("GENE", 1:20),
 50 | #'   "Pathway2" = paste0("GENE", 15:35),
 51 | #'   "Pathway3" = paste0("GENE", 30:50)
 52 | #' )
 53 | #' 
 54 | #' # Perform dualGSEA with correlation test (fast method)
 55 | #' results_cor <- dualGSEA(X, y, G = NULL, gmt = gmt, fc.method = "cor", ss.method = "replaid.gsva")
 56 | #' print(head(results_cor))
 57 | #' 
 58 | #' \donttest{
 59 | #' # Perform dualGSEA with fgsea (requires fgsea package)
 60 | #' if (requireNamespace("fgsea", quietly = TRUE)) {
 61 | #'   results <- dualGSEA(X, y, G = NULL, gmt = gmt, fc.method = "fgsea", ss.method = "replaid.ssgsea")
 62 | #'   print(head(results))
 63 | #' }
 64 | #' }
 65 | #'
 66 | #' @export
 67 | dualGSEA <- function(X, y, G, gmt=NULL, gsetX=NULL,
 68 |                      fc.method = c("fgsea","rankcor","ztest","ttest","cor")[2],
 69 |                      ss.method = c('plaid', 'replaid.ssgsea','replaid.gsva', 
 70 |                        'ssgsea','gsva')[1],
 71 |                      metap.method = c("stouffer","fisher","maxp")[1],
 72 |                      pv1 = NULL, pv2 = NULL, sort.by='p.dual') {
 73 |   #require(fgsea)
 74 |   if (fc.method == "fgsea" && !requireNamespace("fgsea", quietly=TRUE)) {
 75 |     stop("The fgsea package must be installed to use this functionality")
 76 |   }
 77 |   if (ss.method %in% c("ssgsea","gsva") && !requireNamespace("GSVA", quietly=TRUE)) {
 78 |     stop("The GSVA package must be installed to use this functionality")
 79 |   }
 80 |   if(is.null(gmt) && is.null(G)) {
 81 |     stop("at least gmt or matrix G must be given")
 82 |   }
 83 |   if(!is.null(gmt) && !is(gmt, "list")) {
 84 |     stop("gmt must be a list")
 85 |   }
 86 |   
 87 |   if(!all(unique(y) %in% c(0,1,NA))) stop("elements of y must be 0 or 1")
 88 |   sel <- which(!is.na(y))
 89 |   y <- y[sel]
 90 |   X <- X[,sel,drop=FALSE] 
 91 |   
 92 |   if(is.null(G) && !is.null(gmt))  G <- gmt2mat(gmt)
 93 |   if(is.null(gmt) && !is.null(G))  gmt <- mat2gmt(G)  
 94 |   
 95 |   ## pairwise test on logFC
 96 |   if(is.null(pv1)) {
 97 |     message("FC testing using ", fc.method)
 98 |     m1 <- Matrix::rowMeans(X[,which(y==1),drop=FALSE])
 99 |     m0 <- Matrix::rowMeans(X[,which(y==0),drop=FALSE])
100 |     fc <- m1 - m0
101 |     if(fc.method == "fgsea") {
102 |       res1 <- fgsea::fgsea(gmt, fc)
103 |       res1 <- data.frame(res1, row.names=res1$pathway)
104 |       pv1 <- res1[,"pval"]
105 |       names(pv1) <- rownames(res1)
106 |     } else if(fc.method %in% c('ttest','ztest')) {
107 |       if(inherits(X,"dgCMatrix")) {
108 |         sdx <- sparseMatrixStats::rowSds(X,na.rm=TRUE)
109 |       } else {
110 |         sdx <- matrixStats::rowSds(X,na.rm=TRUE)
111 |       }
112 |       sdx0 <- mean(sdx, na.rm=TRUE)
113 |       zc <- fc / (0.1*sdx0 + sdx)
114 |       if(fc.method == "ttest") {
115 |         res1 <- fc_ttest(zc, G, sort.by="none")
116 |         pv1 <- res1[,'pvalue']
117 |       }
118 |       if(fc.method == "ztest") {
119 |         res1 <- fc_ztest(zc, G, zmat=FALSE)
120 |         pv1 <- res1$p_value
121 |       }
122 |     } else if(fc.method == 'rankcor') {
123 |       res1 <- gset.rankcor(fc, G, compute.p=TRUE, use.rank=TRUE)
124 |       pv1 <- res1$p.value[,1]
125 |     } else if(fc.method == 'cor') {
126 |       res1 <- gset.rankcor(fc, G, compute.p=TRUE, use.rank=FALSE)
127 |       pv1 <- res1$p.value[,1]
128 |     } else {
129 |       stop("invalid fc.method method")
130 |     }
131 |   }
132 |   
133 |   ## single-sample test
134 |   message("single-sample testing using ", ss.method)
135 | 
136 |   if(is.null(gsetX)) {
137 |     if(ss.method == "plaid") {
138 |       gsetX <- plaid::plaid(X, G)
139 |     }else if(ss.method == "gsva") {
140 |       gsvapar <- GSVA::gsvaParam(X, gmt)
141 |       gsetX <- GSVA::gsva(gsvapar, verbose = FALSE)
142 |     } else if(ss.method == "ssgsea") {
143 |       gsvapar <- GSVA::ssgseaParam(X, gmt)
144 |       gsetX <- GSVA::gsva(gsvapar, verbose = FALSE)
145 |     } else if(ss.method == "replaid.ssgsea") {
146 |       gsetX <- replaid.ssgsea(X, G)
147 |     } else if(ss.method == "replaid.gsva") {
148 |       gsetX <- replaid.gsva(X, G)
149 |     } else {
150 |       stop("invalid ss.method: ",ss.method)
151 |     }
152 |   } else {
153 |     gsetX <- gsetX[colnames(G),]
154 |   }
155 | 
156 |   gg <- intersect(rownames(G),rownames(X))
157 |   sel <- which(!is.na(y))
158 |   X <- X[gg,sel,drop=FALSE]
159 |   G <- G[gg,]
160 |   y <- y[sel]
161 |   
162 |   if(is.null(gsetX)) {
163 |     message("computing gsetX using plaid. please precompute for efficiency.")
164 |     gsetX <- plaid(X, G)
165 |   }
166 |   pp <- intersect(rownames(gsetX),colnames(G))
167 |   gsetX <- gsetX[pp,colnames(X),drop=FALSE]
168 |   G <- G[,pp,drop=FALSE]  
169 |   
170 |   e1 <- Matrix::rowMeans(gsetX[,y==1,drop=FALSE])
171 |   e0 <- Matrix::rowMeans(gsetX[,y==0,drop=FALSE])
172 |   gsetFC <- e1 - e0    
173 |   gs.size <- Matrix::colSums(G!=0)[rownames(gsetX)]
174 | 
175 |   if(is.null(pv2)) {
176 |     res2 <- gset_ttest(gsetX, y) 
177 |     pv2 <- res2[,'pvalue']
178 |   }
179 |   
180 |   gs <- rownames(gsetX)
181 |   P <- cbind(pv1[gs], pv2[gs])  
182 |   P[is.na(P)] <- 1
183 |   P <- pmin(pmax(P, 1e-99), 1-1e-99)
184 |   colnames(P) <- c("p.fc","p.ss")
185 | 
186 |   p.dual <- matrix_metap(P, method=metap.method)   
187 |   q.dual <- stats::p.adjust(p.dual, method="fdr")
188 |   
189 |   res <- cbind(
190 |     gsetFC = gsetFC,
191 |     size = gs.size,
192 |     pvalues = P,
193 |     p.dual = p.dual,
194 |     q.dual = q.dual    
195 |   )
196 |   if(sort.by %in% colnames(res)) {
197 |     osign <- ifelse(sort.by=="gsetFC",-1,1)
198 |     res <- res[order(osign*res[,sort.by]),]
199 |   }
200 |   res
201 | }
202 | 
203 | 
204 | #' T-test statistical testing of differentially enrichment
205 | #'
206 | #' This function performs statistical testing for differential
207 | #' enrichment using plaid
208 | #'
209 | #' @param fc Vector of logFC values
210 | #' @param G Sparse matrix of gene sets. Non-zero entry indicates
211 | #'   gene/feature is part of gene sets. Features on rows, gene sets on
212 | #'   columns.
213 | #' @param sort.by Column name to sort results by ("pvalue", "gsetFC", or "none")
214 | #' 
215 | #' @return Data frame with columns: gsetFC (gene set fold change),
216 | #'   pvalue (p-value from one-sample t-test), and qvalue (FDR-adjusted p-value).
217 | #'
218 | fc_ttest <- function(fc, G, sort.by="pvalue") {
219 |   if(is.null(names(fc))) stop("fc must have names")  
220 |   gg <- intersect(rownames(G),names(fc))
221 |   fc <- fc[gg]
222 |   G <- G[gg,]
223 |   
224 |   message("[fc_ttest] computing one-sample t-tests on logFC")    
225 |   mt <- matrix_onesample_ttest(fc, G)
226 |   pv <- mt$p[,1]
227 |   df <- mt$mean[,1]
228 |   qv <- p.adjust(pv, method="fdr")
229 |   gsetFC <- gset_averageCLR(fc, G, center = FALSE)[,1]
230 |   
231 |   res <- cbind(
232 |     gsetFC = gsetFC,
233 |     pvalue = pv,
234 |     qvalue = qv    
235 |   )
236 |   if(!is.null(sort.by) && sort.by %in% colnames(res)) {
237 |     sort.sign <- ifelse(sort.by=="gsetFC",-1,+1)
238 |     res <- res[order(sort.sign*res[,sort.by]),]
239 |   }
240 |   res
241 | }
242 | 
243 | #' Z-test statistical testing of differentially enrichment
244 | #'
245 | #' This function performs statistical testing for differential
246 | #' enrichment using plaid
247 | #' 
248 | #' @importFrom stats var
249 | #'
250 | #' @param fc Vector of logFC values
251 | #' @param G Sparse matrix of gene sets. Non-zero entry indicates
252 | #'   gene/feature is part of gene sets. Features on rows, gene sets on
253 | #'   columns.
254 | #' @param zmat Logical indicating to return z-matrix
255 | #' @param alpha Scalar weight for SD estimation. Default 0.5.
256 | #' 
257 | #' @return List with element: z_statistic (z-statistic from one-sample z-test),
258 | #'   p_value (p-value from one-sample z-test), and zmat (z-matrix).
259 | #'
260 | fc_ztest <- function(fc, G, zmat=FALSE, alpha=0.5) {
261 |   if(is.null(names(fc))) stop("fc must have names")
262 |   gg <- intersect(rownames(G),names(fc))
263 |   sample_size <- Matrix::colSums(G[gg,]!=0)
264 |   sample_size <- pmax(sample_size, 1) ## avoid div-by-zero
265 |   sample_mean <- (Matrix::t(G[gg,]!=0) %*% fc[gg]) / sample_size
266 |   population_mean <- mean(fc, na.rm=TRUE)
267 |   population_var <- var(fc, na.rm=TRUE)
268 |   gfc <- (G[gg,]!=0) * fc[gg]
269 |   sample_var <- sparseMatrixStats::colVars(gfc) * nrow(G) / sample_size
270 |   alpha <- pmin(pmax(alpha,0), 0.999) ## limit
271 |   estim_sd <- sqrt( alpha*sample_var + (1-alpha)*population_var )
272 |   z_statistic <- (sample_mean - population_mean) / (estim_sd / sqrt(sample_size))
273 |   p_value <- 2 * pnorm(abs(z_statistic[,1]), lower.tail = FALSE)  
274 |   if(zmat) {
275 |     zmat <- (Matrix::t(gfc) / estim_sd)
276 |   } else {
277 |     zmat <- NULL
278 |   }
279 |   list(
280 |     z_statistic = z_statistic[,1],
281 |     p_value = p_value,
282 |     zmat = zmat
283 |   )
284 | }
285 | 
286 | #' Compute geneset expression as the average log-ration of genes in
287 | #' the geneset. Requires log-expression matrix X and (sparse) geneset
288 | #' matrix matG.
289 | #'
290 | #' @param X Log-expression matrix with genes on rows and samples on columns
291 | #' @param matG Sparse gene set matrix with genes on rows and gene sets on columns
292 | #' @param center Logical indicating whether to center the results
293 | #'
294 | #' @return Matrix of gene set expression scores with gene sets on rows and samples on columns.
295 | #'
296 | gset_averageCLR <- function(X, matG, center = TRUE) {
297 |   if (NCOL(X) == 1) X <- cbind(X)
298 |   gg <- intersect(rownames(X), rownames(matG))
299 |   if (length(gg) == 0) {
300 |     message("[gset.averageCLR] no overlapping features")
301 |     return(NULL)
302 |   }
303 |   X <- X[gg, , drop = FALSE]
304 |   matG <- matG[gg, , drop = FALSE]
305 |   sumG <- 1e-8 + Matrix::colSums(matG != 0, na.rm = TRUE)
306 |   nG <- Matrix::colScale(1 * (matG != 0), 1 / sumG)
307 |   gsetX <- Matrix::t(nG) %*% X 
308 |   if(center) gsetX <- gsetX - Matrix::rowMeans(gsetX, na.rm = TRUE)
309 |   as.matrix(gsetX)
310 | }
311 | 
312 | #' Perform t-test on gene set scores
313 | #'
314 | #' @param gsetX Matrix of gene set scores with gene sets on rows and samples on columns
315 | #' @param y Binary vector (0/1) indicating group membership
316 | #'
317 | #' @return Data frame with columns: diff (difference in means), statistic (t-statistic),
318 | #'   pvalue (p-value), and other t-test results.
319 | #'
320 | gset_ttest <- function(gsetX, y) {
321 |   ii <- which(!is.na(y))
322 |   gsetX <- gsetX[,ii]
323 |   y <- y[ii]
324 |   if(!all(unique(y) %in% c(0,1))) stop("[gset_ttest] elements of y must be 0 or 1")
325 |   res  <- Rfast::ttests(Matrix::t(gsetX), ina=y+1)
326 |   rownames(res) <- rownames(gsetX)
327 |   diff <- rowMeans(gsetX[,y==1]) - rowMeans(gsetX[,y==0])
328 |   res <- cbind( diff=diff, res)
329 |   return(res)
330 | }
331 | 
332 | ##----------------------------------------------------------------
333 | ##----------------- FUNCTIONS ------------------------------------
334 | ##----------------------------------------------------------------
335 | 
336 | #' Perform one-sample t-test on matrix with gene sets
337 | #'
338 | #' @param Fm Vector of feature values (e.g., fold changes)
339 | #' @param G Sparse matrix of gene sets with genes on rows and gene sets on columns
340 | #'
341 | #' @return List containing mean, t-statistic, and p-value matrices.
342 | #'
343 | matrix_onesample_ttest <- function(Fm, G) {
344 |   sumG <- Matrix::colSums(G!=0)
345 |   sum_sq  <- Matrix::crossprod(G!=0, Fm^2) 
346 |   meanx <- Matrix::crossprod(G!=0, Fm) / (1e-8 + sumG)
347 |   sdx   <-  sqrt( (sum_sq - meanx^2 * sumG) / (sumG - 1))
348 |   f_stats <- meanx
349 |   t_stats <- meanx / (1e-8 + sdx) * sqrt(sumG)
350 |   p_stats <- apply( abs(t_stats), 2, function(tv)
351 |     2*pt(tv,df=pmax(sumG-1,1),lower.tail=FALSE))
352 |   list(mean = as.matrix(f_stats), t = as.matrix(t_stats), p = p_stats)  
353 | }
354 | 
355 | #' Matrix version for combining p-values using fisher or stouffer
356 | #' method. Much faster than doing metap::sumlog() and metap::sumz()
357 | #'
358 | #' @param plist List of p-value vectors or matrix of p-values
359 | #' @param method Method for combining p-values ("fisher"/"sumlog" or "stouffer"/"sumz")
360 | #'
361 | #' @return Vector of combined p-values.
362 | #'
363 | matrix_metap <- function(plist, method='stouffer') {
364 |   if(inherits(plist,"matrix")) {
365 |     plist <- as.list(data.frame(plist))
366 |   }
367 |   if(method %in% c("fisher","sumlog")) {
368 |     chisq <- (-2) * Reduce('+', lapply(plist,log))
369 |     df <- 2 * length(plist)
370 |     pv <- pchisq(chisq, df, lower.tail=FALSE)
371 |   } else if(method %in% c("stouffer","sumz")) {
372 |     np <- length(plist)
373 |     zz <- lapply(plist, qnorm, lower.tail=FALSE) 
374 |     zz <- Reduce('+', zz) / sqrt(np)
375 |     pv <- pnorm(zz, lower.tail=FALSE)
376 |   } else if(method %in% c("maxp","pmax","maximump")) {
377 |     pv <- Reduce(pmax, plist)    
378 |   } else {
379 |     stop("Invalid method: ",method)
380 |   }
381 |   dimnames(pv) <- dimnames(plist[[1]])
382 |   return(pv)
383 | }
384 | 
385 | 
386 | #' Calculate gene set rank correlation
387 | #'
388 | #' Compute rank correlation between a gene rank vector/matrix and gene sets
389 | #'
390 | #' @param rnk Numeric vector or matrix of gene ranks, with genes as row names
391 | #' @param gset Numeric matrix of gene sets, with genes as row/column names
392 | #' @param compute.p Logical indicating whether to compute p-values
393 | #' @param use.rank Logical indicating whether to rank transform rnk before correlation
394 | #'
395 | #' @return Named list with components:
396 | #' \itemize{
397 | #'  \item rho - Matrix of correlation coefficients between rnk and gset
398 | #'  \item p.value - Matrix of p-values for correlation (if compute.p = TRUE)
399 | #'  \item q.value - Matrix of FDR adjusted p-values (if compute.p = TRUE)
400 | #' }
401 | #'
402 | #' @details This function calculates sparse rank correlation between rnk and each
403 | #' column of gset using \code{qlcMatrix::corSparse()}. It handles missing values in
404 | #' rnk by computing column-wise correlations.
405 | #'
406 | #' P-values are computed from statistical distribution
407 | #'
408 | #' @examples
409 | #' # Create example rank vector
410 | #' set.seed(123)
411 | #' ranks <- rnorm(100)
412 | #' names(ranks) <- paste0("GENE", 1:100)
413 | #' 
414 | #' # Create example gene sets as sparse matrix
415 | #' gmt <- list(
416 | #'   "Pathway1" = paste0("GENE", 1:20),
417 | #'   "Pathway2" = paste0("GENE", 15:35),
418 | #'   "Pathway3" = paste0("GENE", 30:50)
419 | #' )
420 | #' genesets <- gmt2mat(gmt)
421 | #'
422 | #' # Calculate rank correlation
423 | #' result <- gset.rankcor(ranks, genesets, compute.p = TRUE)
424 | #' print(result$rho)
425 | #' print(result$p.value)
426 | #' 
427 | #' @export
428 | gset.rankcor <- function(rnk, gset, compute.p = FALSE, use.rank = TRUE) {
429 |   if (ncol(gset) == 0 || NCOL(rnk) == 0) {
430 |     if (ncol(gset) == 0) message("gset has zero columns")
431 |     if (NCOL(rnk) == 0) message("rnk has zero columns")
432 |     return(NULL)
433 |   }
434 | 
435 |   #  if (!any(class(gset) %in% c("Matrix", "dgCMatrix", "lgCMatrix", "matrix", "array"))) {
436 |   #    stop("gset must be a matrix")
437 |   #  }
438 |   if (!inherits(gset, "Matrix")) stop("gset must be a matrix")
439 | 
440 |   is.vec <- (NCOL(rnk) == 1 && !any(class(rnk) %in% c("matrix", "Matrix")))
441 |   if (is.vec && is.null(names(rnk))) stop("rank vector must be named")
442 |   if (!is.vec && is.null(rownames(rnk))) stop("rank matrix must have rownames")
443 |   if (is.vec) rnk <- matrix(rnk, ncol = 1, dimnames = list(names(rnk), "rnk"))
444 |   n1 <- sum(rownames(rnk) %in% colnames(gset), na.rm = TRUE)
445 |   n2 <- sum(rownames(rnk) %in% rownames(gset), na.rm = TRUE)
446 |   if (n1 > n2) gset <- Matrix::t(gset)
447 | 
448 |   gg <- intersect(rownames(gset), rownames(rnk))
449 |   rnk1 <- rnk[gg, , drop = FALSE]
450 |   gset <- gset[gg, , drop = FALSE]
451 | 
452 |   if (use.rank) {
453 |     if(inherits(rnk1,"dgCMatrix")) {
454 |       ## for sparse dgCMatrix
455 |       ##rnk1 <- apply(rnk1, 2, base::rank, na.last = "keep", ties.method="random")
456 |       rnk1 <- sparseMatrixStats::colRanks(rnk1, na.last = "keep", ties.method = "random", preserveShape = TRUE)
457 |     } else {
458 |       rnk1 <- matrixStats::colRanks(rnk1, na.last = "keep", ties.method = "random", preserveShape = TRUE)
459 |     }
460 |   }
461 | 
462 |   ## two cases: (1) in case no missing values, just use corSparse on
463 |   ## whole matrix. (2) in case the rnk matrix has missing values, we
464 |   ## must proceed 1-column at time and do reduced corSparse on
465 |   ## intersection of genes.
466 |   rho1 <- cor_sparse_matrix(gset, rnk1)
467 | 
468 |   rownames(rho1) <- colnames(gset)
469 |   colnames(rho1) <- colnames(rnk1)
470 |   rho1[is.nan(rho1)] <- NA ## ??
471 | 
472 |   ## compute p-value
473 |   .cor.pvalue <- function(x, n) 2 * stats::pnorm(-abs(x / ((1 - x**2) / (n - 2))**0.5))
474 |   if (compute.p) {
475 |     pv <- apply(rho1, 2, function(x) .cor.pvalue(x, n = nrow(rnk1)))
476 |     pv[is.nan(pv)] <- NA ## ??
477 |     qv <- apply(pv, 2, stats::p.adjust, method = "fdr")
478 |     df <- list(rho = rho1, p.value = pv, q.value = qv)
479 |   } else {
480 |     df <- list(rho = rho1, p.value = NA, q.value = NA)
481 |   }
482 |   df
483 | }
484 | 
485 | #' Calculate sparse correlation matrix handling missing values
486 | #'
487 | #' @param G Sparse matrix containing gene sets
488 | #' @param mat Matrix of values
489 | #' @return Correlation matrix between G and mat
490 | #' @details If mat has no missing values, calculates correlation directly using corSparse.
491 | #' Otherwise computes column-wise correlations only using non-missing values.
492 | cor_sparse_matrix <- function(G, mat) {
493 |   if (sum(is.na(mat)) == 0) {
494 |     cor_matrix <- qlcMatrix::corSparse(G, mat)
495 |   } else {
496 |     message("matrix has missing values: computing column-wise reduced cor")
497 |     corSparse.vec <- function(X, y) {
498 |       jj <- which(!is.na(y))
499 |       qlcMatrix::corSparse(X[jj, , drop = FALSE], cbind(y[jj]))
500 |     }
501 |     cor_matrix <- lapply(seq_len(ncol(mat)), function(i) corSparse.vec(G, mat[, i]))
502 |     cor_matrix <- do.call(cbind, cor_matrix)
503 |   }
504 |   return(cor_matrix)
505 | }
506 | 


--------------------------------------------------------------------------------
/tests/testthat/test-plaid.R:
--------------------------------------------------------------------------------
  1 | ##
  2 | ## This file is part of the Omics Playground project.
  3 | ## Copyright (c) 2018-2025 BigOmics Analytics SA. All rights reserved.
  4 | ##
  5 | 
  6 | # ============================================================================
  7 | # Setup test data
  8 | # ============================================================================
  9 | 
 10 | # Helper function to create test data
 11 | create_test_data <- function(n_genes = 100, n_samples = 10, n_pathways = 5) {
 12 |   set.seed(123)
 13 |   
 14 |   # Create expression matrix
 15 |   X <- matrix(rnorm(n_genes * n_samples), nrow = n_genes, ncol = n_samples)
 16 |   rownames(X) <- paste0("GENE", 1:n_genes)
 17 |   colnames(X) <- paste0("Sample", 1:n_samples)
 18 |   
 19 |   # Create gene sets
 20 |   gmt <- lapply(1:n_pathways, function(i) {
 21 |     start_idx <- ((i-1) * 15 + 1)
 22 |     end_idx <- min(start_idx + 19, n_genes)
 23 |     paste0("GENE", start_idx:end_idx)
 24 |   })
 25 |   names(gmt) <- paste0("Pathway", 1:n_pathways)
 26 |   
 27 |   matG <- gmt2mat(gmt, bg = rownames(X))
 28 |   
 29 |   list(X = X, matG = matG, gmt = gmt)
 30 | }
 31 | 
 32 | # ============================================================================
 33 | # Test: plaid() - Main function
 34 | # ============================================================================
 35 | 
 36 | test_that("plaid works with basic input", {
 37 |   data <- create_test_data()
 38 |   
 39 |   result <- plaid(data$X, data$matG)
 40 |   
 41 |   expect_true(is.matrix(result))
 42 |   expect_equal(nrow(result), ncol(data$matG))
 43 |   expect_equal(ncol(result), ncol(data$X))
 44 |   expect_equal(rownames(result), colnames(data$matG))
 45 |   expect_equal(colnames(result), colnames(data$X))
 46 | })
 47 | 
 48 | test_that("plaid works with stats='sum'", {
 49 |   data <- create_test_data()
 50 |   
 51 |   result <- plaid(data$X, data$matG, stats = "sum")
 52 |   
 53 |   expect_true(is.matrix(result))
 54 |   expect_equal(dim(result), c(ncol(data$matG), ncol(data$X)))
 55 | })
 56 | 
 57 | test_that("plaid works with normalize=FALSE", {
 58 |   data <- create_test_data()
 59 |   
 60 |   result <- plaid(data$X, data$matG, normalize = FALSE)
 61 |   
 62 |   expect_true(is.matrix(result))
 63 |   expect_equal(dim(result), c(ncol(data$matG), ncol(data$X)))
 64 | })
 65 | 
 66 | test_that("plaid works with single column", {
 67 |   data <- create_test_data(n_samples = 1)
 68 |   
 69 |   result <- plaid(data$X, data$matG)
 70 |   
 71 |   expect_true(is.matrix(result))
 72 |   expect_equal(ncol(result), 1)
 73 | })
 74 | 
 75 | test_that("plaid handles no overlapping features", {
 76 |   data <- create_test_data()
 77 |   rownames(data$X) <- paste0("OTHER_GENE", 1:nrow(data$X))
 78 |   
 79 |   result <- plaid(data$X, data$matG)
 80 |   
 81 |   expect_null(result)
 82 | })
 83 | 
 84 | test_that("plaid works with sparse matrices", {
 85 |   data <- create_test_data(n_pathways = 25)  # Use more pathways to avoid normalization issue
 86 |   X_sparse <- Matrix::Matrix(data$X, sparse = TRUE)
 87 |   
 88 |   result <- plaid(X_sparse, data$matG)
 89 |   
 90 |   expect_true(is.matrix(result))
 91 |   expect_equal(dim(result), c(ncol(data$matG), ncol(data$X)))
 92 | })
 93 | 
 94 | test_that("plaid works with different nsmooth values", {
 95 |   data <- create_test_data()
 96 |   
 97 |   result1 <- plaid(data$X, data$matG, nsmooth = 1)
 98 |   result2 <- plaid(data$X, data$matG, nsmooth = 5)
 99 |   
100 |   expect_true(is.matrix(result1))
101 |   expect_true(is.matrix(result2))
102 |   expect_false(identical(result1, result2))
103 | })
104 | 
105 | # ============================================================================
106 | # Test: chunked_crossprod()
107 | # ============================================================================
108 | 
109 | test_that("chunked_crossprod works with small matrix", {
110 |   set.seed(123)
111 |   x <- Matrix::Matrix(rnorm(100), nrow = 10, ncol = 10, sparse = TRUE)
112 |   y <- Matrix::Matrix(rnorm(100), nrow = 10, ncol = 10, sparse = TRUE)
113 |   
114 |   result <- chunked_crossprod(x, y, chunk = 1000)
115 |   expected <- Matrix::crossprod(x, y)
116 |   
117 |   expect_equal(as.matrix(result), as.matrix(expected))
118 | })
119 | 
120 | test_that("chunked_crossprod works with chunk size", {
121 |   set.seed(123)
122 |   x <- Matrix::Matrix(rnorm(200), nrow = 10, ncol = 20, sparse = TRUE)
123 |   y <- Matrix::Matrix(rnorm(300), nrow = 10, ncol = 30, sparse = TRUE)
124 |   
125 |   result <- chunked_crossprod(x, y, chunk = 10)
126 |   expected <- Matrix::crossprod(x, y)
127 |   
128 |   # Compare values, ignoring dimnames which may differ
129 |   expect_equal(as.numeric(result), as.numeric(expected), tolerance = 1e-10)
130 |   expect_equal(dim(result), dim(expected))
131 | })
132 | 
133 | # ============================================================================
134 | # Test: replaid.scse()
135 | # ============================================================================
136 | 
137 | test_that("replaid.scse works with basic input", {
138 |   data <- create_test_data()
139 |   # Create log-transformed data (simulate counts)
140 |   X <- log2(matrix(rpois(1000, lambda = 10) + 1, nrow = 100, ncol = 10))
141 |   rownames(X) <- paste0("GENE", 1:100)
142 |   colnames(X) <- paste0("Sample", 1:10)
143 |   
144 |   result <- replaid.scse(X, data$matG, removeLog2 = TRUE, scoreMean = FALSE)
145 |   
146 |   expect_true(is.matrix(result))
147 |   expect_equal(dim(result), c(ncol(data$matG), ncol(X)))
148 |   expect_equal(colnames(result), colnames(X))
149 | })
150 | 
151 | test_that("replaid.scse works with scoreMean=TRUE", {
152 |   data <- create_test_data()
153 |   X <- log2(matrix(rpois(1000, lambda = 10) + 1, nrow = 100, ncol = 10))
154 |   rownames(X) <- rownames(data$X)
155 |   colnames(X) <- colnames(data$X)
156 |   
157 |   result <- replaid.scse(X, data$matG, removeLog2 = TRUE, scoreMean = TRUE)
158 |   
159 |   expect_true(is.matrix(result))
160 |   expect_equal(dim(result), c(ncol(data$matG), ncol(X)))
161 | })
162 | 
163 | test_that("replaid.scse auto-detects removeLog2", {
164 |   data <- create_test_data()
165 |   X <- log2(matrix(rpois(1000, lambda = 10) + 1, nrow = 100, ncol = 10))
166 |   rownames(X) <- rownames(data$X)
167 |   colnames(X) <- colnames(data$X)
168 |   
169 |   result <- replaid.scse(X, data$matG, removeLog2 = NULL)
170 |   
171 |   expect_true(is.matrix(result))
172 | })
173 | 
174 | # ============================================================================
175 | # Test: replaid.sing()
176 | # ============================================================================
177 | 
178 | test_that("replaid.sing works with basic input", {
179 |   data <- create_test_data()
180 |   
181 |   result <- replaid.sing(data$X, data$matG)
182 |   
183 |   expect_true(is.matrix(result))
184 |   expect_equal(dim(result), c(ncol(data$matG), ncol(data$X)))
185 |   expect_equal(rownames(result), colnames(data$matG))
186 |   expect_equal(colnames(result), colnames(data$X))
187 | })
188 | 
189 | test_that("replaid.sing produces values in expected range", {
190 |   data <- create_test_data()
191 |   
192 |   result <- replaid.sing(data$X, data$matG)
193 |   
194 |   # Singscore should be centered around 0
195 |   expect_true(all(is.finite(result)))
196 |   expect_true(abs(mean(result)) < 1)
197 | })
198 | 
199 | # ============================================================================
200 | # Test: replaid.ssgsea()
201 | # ============================================================================
202 | 
203 | test_that("replaid.ssgsea works with alpha=0", {
204 |   data <- create_test_data()
205 |   
206 |   result <- replaid.ssgsea(data$X, data$matG, alpha = 0)
207 |   
208 |   expect_true(is.matrix(result))
209 |   expect_equal(dim(result), c(ncol(data$matG), ncol(data$X)))
210 | })
211 | 
212 | test_that("replaid.ssgsea works with alpha>0", {
213 |   data <- create_test_data()
214 |   
215 |   result <- replaid.ssgsea(data$X, data$matG, alpha = 0.25)
216 |   
217 |   expect_true(is.matrix(result))
218 |   expect_equal(dim(result), c(ncol(data$matG), ncol(data$X)))
219 | })
220 | 
221 | test_that("replaid.ssgsea results differ for different alpha values", {
222 |   data <- create_test_data()
223 |   
224 |   result1 <- replaid.ssgsea(data$X, data$matG, alpha = 0)
225 |   result2 <- replaid.ssgsea(data$X, data$matG, alpha = 0.5)
226 |   
227 |   expect_false(identical(result1, result2))
228 | })
229 | 
230 | # ============================================================================
231 | # Test: replaid.ucell()
232 | # ============================================================================
233 | 
234 | test_that("replaid.ucell works with basic input", {
235 |   data <- create_test_data()
236 |   
237 |   result <- replaid.ucell(data$X, data$matG, rmax = 50)
238 |   
239 |   expect_true(is.matrix(result))
240 |   expect_equal(dim(result), c(ncol(data$matG), ncol(data$X)))
241 | })
242 | 
243 | test_that("replaid.ucell produces finite values", {
244 |   data <- create_test_data()
245 |   
246 |   result <- replaid.ucell(data$X, data$matG, rmax = 50)
247 |   
248 |   # UCell scores are transformed and may exceed [0,1] range
249 |   expect_true(all(is.finite(result)))
250 |   expect_true(is.matrix(result))
251 | })
252 | 
253 | test_that("replaid.ucell works with different rmax values", {
254 |   data <- create_test_data()
255 |   
256 |   result1 <- replaid.ucell(data$X, data$matG, rmax = 50)
257 |   result2 <- replaid.ucell(data$X, data$matG, rmax = 100)
258 |   
259 |   expect_false(identical(result1, result2))
260 | })
261 | 
262 | # ============================================================================
263 | # Test: replaid.aucell()
264 | # ============================================================================
265 | 
266 | test_that("replaid.aucell works with basic input", {
267 |   data <- create_test_data()
268 |   
269 |   result <- replaid.aucell(data$X, data$matG)
270 |   
271 |   expect_true(is.matrix(result))
272 |   expect_equal(dim(result), c(ncol(data$matG), ncol(data$X)))
273 | })
274 | 
275 | test_that("replaid.aucell works with custom aucMaxRank", {
276 |   data <- create_test_data()
277 |   
278 |   result <- replaid.aucell(data$X, data$matG, aucMaxRank = 10)
279 |   
280 |   expect_true(is.matrix(result))
281 |   expect_equal(dim(result), c(ncol(data$matG), ncol(data$X)))
282 | })
283 | 
284 | test_that("replaid.aucell auto-computes aucMaxRank", {
285 |   data <- create_test_data()
286 |   
287 |   result <- replaid.aucell(data$X, data$matG, aucMaxRank = NULL)
288 |   
289 |   expect_true(is.matrix(result))
290 | })
291 | 
292 | # ============================================================================
293 | # Test: replaid.gsva()
294 | # ============================================================================
295 | 
296 | test_that("replaid.gsva works with basic input", {
297 |   data <- create_test_data()
298 |   
299 |   result <- replaid.gsva(data$X, data$matG, tau = 0)
300 |   
301 |   expect_true(is.matrix(result))
302 |   expect_equal(dim(result), c(ncol(data$matG), ncol(data$X)))
303 | })
304 | 
305 | test_that("replaid.gsva works with tau>0", {
306 |   data <- create_test_data()
307 |   
308 |   result <- replaid.gsva(data$X, data$matG, tau = 0.25)
309 |   
310 |   expect_true(is.matrix(result))
311 |   expect_equal(dim(result), c(ncol(data$matG), ncol(data$X)))
312 | })
313 | 
314 | test_that("replaid.gsva works with rowtf='z'", {
315 |   data <- create_test_data()
316 |   
317 |   result <- replaid.gsva(data$X, data$matG, rowtf = "z")
318 |   
319 |   expect_true(is.matrix(result))
320 | })
321 | 
322 | test_that("replaid.gsva works with rowtf='ecdf'", {
323 |   data <- create_test_data()
324 |   
325 |   result <- replaid.gsva(data$X, data$matG, rowtf = "ecdf")
326 |   
327 |   expect_true(is.matrix(result))
328 | })
329 | 
330 | test_that("replaid.gsva throws error for invalid rowtf", {
331 |   data <- create_test_data()
332 |   
333 |   expect_error(replaid.gsva(data$X, data$matG, rowtf = "invalid"))
334 | })
335 | 
336 | # ============================================================================
337 | # Test: mat.rowsds()
338 | # ============================================================================
339 | 
340 | test_that("mat.rowsds works with dense matrix", {
341 |   set.seed(123)
342 |   X <- matrix(rnorm(100), nrow = 10, ncol = 10)
343 |   
344 |   result <- mat.rowsds(X)
345 |   expected <- matrixStats::rowSds(X)
346 |   
347 |   expect_equal(result, expected)
348 | })
349 | 
350 | test_that("mat.rowsds works with sparse matrix", {
351 |   set.seed(123)
352 |   X <- Matrix::Matrix(rnorm(100), nrow = 10, ncol = 10, sparse = TRUE)
353 |   
354 |   result <- mat.rowsds(X)
355 |   
356 |   expect_true(is.numeric(result))
357 |   expect_equal(length(result), nrow(X))
358 |   expect_true(all(result >= 0))
359 | })
360 | 
361 | # ============================================================================
362 | # Test: normalize_medians()
363 | # ============================================================================
364 | 
365 | test_that("normalize_medians works with basic input", {
366 |   set.seed(123)
367 |   x <- matrix(rnorm(100, mean = 5), nrow = 10, ncol = 10)
368 |   
369 |   result <- normalize_medians(x)
370 |   
371 |   expect_true(is.matrix(result))
372 |   expect_equal(dim(result), dim(x))
373 |   
374 |   # Check that medians are similar after normalization
375 |   medians <- matrixStats::colMedians(result)
376 |   expect_true(sd(medians) < sd(matrixStats::colMedians(x)))
377 | })
378 | 
379 | test_that("normalize_medians handles zeros correctly", {
380 |   set.seed(123)
381 |   x <- matrix(rnorm(100), nrow = 10, ncol = 10)
382 |   x[1:3, 1:3] <- 0
383 |   
384 |   result <- normalize_medians(x, ignore.zero = TRUE)
385 |   
386 |   expect_true(is.matrix(result))
387 |   expect_equal(dim(result), dim(x))
388 | })
389 | 
390 | test_that("normalize_medians auto-detects zeros", {
391 |   set.seed(123)
392 |   x <- matrix(rnorm(100), nrow = 10, ncol = 10)
393 |   x[1:3, 1:3] <- 0
394 |   
395 |   result <- normalize_medians(x, ignore.zero = NULL)
396 |   
397 |   expect_true(is.matrix(result))
398 | })
399 | 
400 | # ============================================================================
401 | # Test: colranks()
402 | # ============================================================================
403 | 
404 | test_that("colranks works with basic input", {
405 |   set.seed(123)
406 |   X <- matrix(rnorm(100), nrow = 10, ncol = 10)
407 |   rownames(X) <- paste0("Gene", 1:10)
408 |   colnames(X) <- paste0("Sample", 1:10)
409 |   
410 |   result <- colranks(X)
411 |   
412 |   expect_true(is.matrix(result) || inherits(result, "Matrix"))
413 |   expect_equal(dim(result), dim(X))
414 |   expect_equal(dimnames(result), dimnames(X))
415 | })
416 | 
417 | test_that("colranks produces correct rank range", {
418 |   set.seed(123)
419 |   X <- matrix(rnorm(100), nrow = 10, ncol = 10)
420 |   
421 |   result <- colranks(X, ties.method = "average")
422 |   
423 |   # Ranks should be between 1 and nrow(X)
424 |   expect_true(all(result >= 1 & result <= nrow(X)))
425 | })
426 | 
427 | test_that("colranks works with signed=TRUE", {
428 |   set.seed(123)
429 |   X <- matrix(rnorm(100), nrow = 10, ncol = 10)
430 |   
431 |   result <- colranks(X, signed = TRUE)
432 |   
433 |   expect_true(is.matrix(result) || inherits(result, "Matrix"))
434 |   expect_equal(dim(result), dim(X))
435 | })
436 | 
437 | test_that("colranks works with keep.zero=TRUE", {
438 |   set.seed(123)
439 |   X <- matrix(rnorm(100), nrow = 10, ncol = 10)
440 |   X[1:3, 1:3] <- 0
441 |   X <- Matrix::Matrix(X, sparse = TRUE)
442 |   
443 |   result <- colranks(X, keep.zero = TRUE)
444 |   
445 |   expect_true(inherits(result, "Matrix"))
446 |   expect_equal(dim(result), dim(X))
447 | })
448 | 
449 | test_that("colranks works with sparse matrix", {
450 |   set.seed(123)
451 |   X <- Matrix::Matrix(rnorm(100), nrow = 10, ncol = 10, sparse = TRUE)
452 |   
453 |   result <- colranks(X, sparse = TRUE)
454 |   
455 |   # colranks returns dense matrix for full matrices even if input is sparse
456 |   expect_true(is.matrix(result) || inherits(result, "Matrix"))
457 |   expect_equal(dim(result), dim(X))
458 | })
459 | 
460 | test_that("colranks ties methods work correctly", {
461 |   X <- matrix(c(1, 2, 2, 3, 4, 5, 5, 6), nrow = 4, ncol = 2)
462 |   
463 |   result_avg <- colranks(X, ties.method = "average")
464 |   result_min <- colranks(X, ties.method = "min")
465 |   
466 |   expect_false(identical(result_avg, result_min))
467 | })
468 | 
469 | # ============================================================================
470 | # Test: sparse_colranks()
471 | # ============================================================================
472 | 
473 | test_that("sparse_colranks works with basic input", {
474 |   set.seed(123)
475 |   X <- Matrix::Matrix(rnorm(100), nrow = 10, ncol = 10, sparse = TRUE)
476 |   
477 |   result <- sparse_colranks(X)
478 |   
479 |   expect_true(inherits(result, "CsparseMatrix"))
480 |   expect_equal(dim(result), dim(X))
481 | })
482 | 
483 | test_that("sparse_colranks works with signed=TRUE", {
484 |   set.seed(123)
485 |   X <- Matrix::Matrix(rnorm(100), nrow = 10, ncol = 10, sparse = TRUE)
486 |   
487 |   result <- sparse_colranks(X, signed = TRUE)
488 |   
489 |   expect_true(inherits(result, "CsparseMatrix"))
490 |   expect_equal(dim(result), dim(X))
491 | })
492 | 
493 | test_that("sparse_colranks handles different ties methods", {
494 |   X <- Matrix::Matrix(c(1, 2, 2, 3, 4, 5, 5, 6), nrow = 4, ncol = 2, sparse = TRUE)
495 |   
496 |   result_avg <- sparse_colranks(X, ties.method = "average")
497 |   result_min <- sparse_colranks(X, ties.method = "min")
498 |   
499 |   expect_true(inherits(result_avg, "CsparseMatrix"))
500 |   expect_true(inherits(result_min, "CsparseMatrix"))
501 | })
502 | 
503 | # ============================================================================
504 | # Test: GMT list input for replaid functions
505 | # ============================================================================
506 | 
507 | test_that("plaid works with GMT list input", {
508 |   data <- create_test_data()
509 |   
510 |   result <- plaid(data$X, data$gmt)
511 |   
512 |   expect_true(is.matrix(result))
513 |   expect_equal(ncol(result), ncol(data$X))
514 | })
515 | 
516 | test_that("replaid.sing works with GMT list input", {
517 |   data <- create_test_data()
518 |   
519 |   result <- replaid.sing(data$X, data$gmt)
520 |   
521 |   expect_true(is.matrix(result))
522 |   expect_equal(ncol(result), ncol(data$X))
523 | })
524 | 
525 | test_that("replaid.ssgsea works with GMT list input", {
526 |   data <- create_test_data()
527 |   
528 |   result <- replaid.ssgsea(data$X, data$gmt, alpha = 0)
529 |   
530 |   expect_true(is.matrix(result))
531 |   expect_equal(ncol(result), ncol(data$X))
532 | })
533 | 
534 | # ============================================================================
535 | # Test: Edge cases
536 | # ============================================================================
537 | 
538 | test_that("plaid handles small number of gene sets", {
539 |   data <- create_test_data(n_pathways = 2)
540 |   
541 |   result <- plaid(data$X, data$matG)
542 |   
543 |   expect_true(is.matrix(result))
544 |   expect_equal(nrow(result), 2)
545 | })
546 | 
547 | test_that("plaid handles large nsmooth value", {
548 |   data <- create_test_data()
549 |   
550 |   result <- plaid(data$X, data$matG, nsmooth = 100)
551 |   
552 |   expect_true(is.matrix(result))
553 |   expect_equal(dim(result), c(ncol(data$matG), ncol(data$X)))
554 | })
555 | 
556 | test_that("normalize_medians handles all-zero columns", {
557 |   x <- matrix(0, nrow = 10, ncol = 5)
558 |   x[, 1:3] <- rnorm(30)
559 |   
560 |   result <- normalize_medians(x)
561 |   
562 |   expect_true(is.matrix(result))
563 |   expect_equal(dim(result), dim(x))
564 | })
565 | 
566 | test_that("colranks handles constant values", {
567 |   X <- matrix(rep(1, 20), nrow = 5, ncol = 4)
568 |   
569 |   result <- colranks(X)
570 |   
571 |   expect_true(is.matrix(result) || inherits(result, "Matrix"))
572 |   expect_equal(dim(result), dim(X))
573 | })
574 | 
575 | # ============================================================================
576 | # Test: Parameter validation
577 | # ============================================================================
578 | 
579 | test_that("plaid handles partial overlap of features", {
580 |   data <- create_test_data(n_genes = 100)
581 |   # Keep only half of genes in matG
582 |   keep_genes <- rownames(data$X)[1:50]
583 |   data$matG <- data$matG[keep_genes, , drop = FALSE]
584 |   
585 |   result <- plaid(data$X, data$matG)
586 |   
587 |   expect_true(is.matrix(result))
588 |   expect_equal(ncol(result), ncol(data$X))
589 | })
590 | 
591 | test_that("replaid functions handle min.genes and max.genes parameters", {
592 |   data <- create_test_data()
593 |   
594 |   result <- plaid(data$X, data$gmt, min.genes = 5, max.genes = 100)
595 |   
596 |   expect_true(is.matrix(result))
597 | })
598 | 
599 | # ============================================================================
600 | # Test: Numerical stability
601 | # ============================================================================
602 | 
603 | test_that("plaid handles very small values", {
604 |   data <- create_test_data()
605 |   data$X <- data$X * 1e-10
606 |   
607 |   result <- plaid(data$X, data$matG)
608 |   
609 |   expect_true(is.matrix(result))
610 |   expect_true(all(is.finite(result)))
611 | })
612 | 
613 | test_that("plaid handles very large values", {
614 |   data <- create_test_data(n_pathways = 25)
615 |   data$X <- data$X * 1e6  # Use smaller multiplier to avoid Inf
616 |   
617 |   result <- plaid(data$X, data$matG)
618 |   
619 |   expect_true(is.matrix(result))
620 |   # With very large values, some Inf may occur in intermediate calculations
621 |   expect_true(is.matrix(result))
622 | })
623 | 
624 | test_that("normalize_medians handles NA values", {
625 |   set.seed(123)
626 |   x <- matrix(rnorm(100), nrow = 10, ncol = 10)
627 |   x[1:3, 1:3] <- NA
628 |   
629 |   result <- normalize_medians(x)
630 |   
631 |   expect_true(is.matrix(result))
632 |   expect_equal(dim(result), dim(x))
633 | })
634 | 


--------------------------------------------------------------------------------
/tests/testthat/test-stats.R:
--------------------------------------------------------------------------------
  1 | ##
  2 | ## This file is part of the Omics Playground project.
  3 | ## Copyright (c) 2018-2025 BigOmics Analytics SA. All rights reserved.
  4 | ##
  5 | 
  6 | # ============================================================================
  7 | # Setup test data
  8 | # ============================================================================
  9 | 
 10 | # Helper function to create test data for statistical testing
 11 | create_stats_test_data <- function(n_genes = 100, n_samples = 20, n_pathways = 5) {
 12 |   set.seed(456)
 13 |   
 14 |   # Create expression matrix
 15 |   X <- matrix(rnorm(n_genes * n_samples), nrow = n_genes, ncol = n_samples)
 16 |   rownames(X) <- paste0("GENE", 1:n_genes)
 17 |   colnames(X) <- paste0("Sample", 1:n_samples)
 18 |   
 19 |   # Create binary group vector (0/1)
 20 |   y <- rep(c(0, 1), each = n_samples/2)
 21 |   
 22 |   # Create gene sets
 23 |   gmt <- lapply(1:n_pathways, function(i) {
 24 |     start_idx <- ((i-1) * 15 + 1)
 25 |     end_idx <- min(start_idx + 19, n_genes)
 26 |     paste0("GENE", start_idx:end_idx)
 27 |   })
 28 |   names(gmt) <- paste0("Pathway", 1:n_pathways)
 29 |   
 30 |   G <- gmt2mat(gmt, bg = rownames(X))
 31 |   
 32 |   list(X = X, y = y, G = G, gmt = gmt)
 33 | }
 34 | 
 35 | # ============================================================================
 36 | # Test: dualGSEA()
 37 | # ============================================================================
 38 | 
 39 | test_that("dualGSEA works with replaid methods", {
 40 |   data <- create_stats_test_data()
 41 |   
 42 |   result <- dualGSEA(data$X, data$y, gmt = data$gmt, G = data$G,
 43 |                             fc.method = "cor", ss.method = "replaid.ssgsea")
 44 |   
 45 |   expect_true(is.matrix(result) || is.data.frame(result))
 46 |   expect_true("gsetFC" %in% colnames(result))
 47 |   expect_true("p.dual" %in% colnames(result))
 48 |   expect_true("q.dual" %in% colnames(result))
 49 |   expect_true("size" %in% colnames(result))
 50 |   expect_equal(nrow(result), ncol(data$G))
 51 | 
 52 | })
 53 | 
 54 | test_that("dualGSEA handles NA values in y", {
 55 |   data <- create_stats_test_data()
 56 |   data$y[1:2] <- NA
 57 |   
 58 |   result <- dualGSEA(data$X, data$y, gmt = data$gmt, G = data$G,
 59 |                      fc.method = "rankcor", ss.method = "replaid.ssgsea")
 60 |   
 61 |   expect_true(is.matrix(result) || is.data.frame(result))
 62 | })
 63 | 
 64 | test_that("dualGSEA throws error for invalid y", {
 65 |   data <- create_stats_test_data()
 66 |   data$y[1] <- 2
 67 |   
 68 |   expect_error(dualGSEA(data$X, data$y, gmt = data$gmt))
 69 | })
 70 | 
 71 | test_that("dualGSEA throws error for invalid fc.method", {
 72 |   data <- create_stats_test_data()
 73 |   
 74 |   expect_error(dualGSEA(data$X, data$y, gmt = data$gmt, fc.method = "invalid"))
 75 | })
 76 | 
 77 | test_that("dualGSEA throws error for invalid ss.method", {
 78 |   data <- create_stats_test_data()
 79 |   
 80 |   expect_error(dualGSEA(data$X, data$y, gmt = data$gmt, 
 81 |                         fc.method = "cor", ss.method = "invalid"))
 82 | })
 83 | 
 84 | test_that("dualGSEA works with different fc.method methods", {
 85 |   data <- create_stats_test_data(n_pathways = 25)  # Use more pathways
 86 |   
 87 |   result_cor <- dualGSEA(data$X, data$y, G = data$G, fc.method = "cor")
 88 |   result_rankcor <- dualGSEA(data$X, data$y, G = data$G, fc.method = "rankcor")
 89 |   result_ztest <- dualGSEA(data$X, data$y, G = data$G, fc.method = "ztest")
 90 |   
 91 |   expect_true(is.matrix(result_cor) || is.data.frame(result_cor))
 92 |   expect_true(is.matrix(result_rankcor) || is.data.frame(result_rankcor))
 93 |   expect_true(is.matrix(result_ztest) || is.data.frame(result_ztest))
 94 | })
 95 | 
 96 | test_that("dualGSEA works with different metap methods", {
 97 |   data <- create_stats_test_data()
 98 |   
 99 |   result_stouffer <- dualGSEA(data$X, data$y, G = data$G, metap.method = "stouffer")
100 |   result_fisher <- dualGSEA(data$X, data$y, G = data$G, metap.method = "fisher")
101 |   
102 |   expect_true(is.matrix(result_stouffer) || is.data.frame(result_stouffer))
103 |   expect_true(is.matrix(result_fisher) || is.data.frame(result_fisher))
104 |   expect_false(identical(result_stouffer, result_fisher))
105 | })
106 | 
107 | test_that("dualGSEA works with different sort.by options", {
108 |   data <- create_stats_test_data()
109 |   
110 |   result_pdual <- dualGSEA(data$X, data$y, G = data$G, sort.by = "p.dual")
111 |   result_fc <- dualGSEA(data$X, data$y, G = data$G, sort.by = "gsetFC")
112 |   
113 |   expect_true(is.matrix(result_pdual) || is.data.frame(result_pdual))
114 |   expect_true(is.matrix(result_fc) || is.data.frame(result_fc))
115 | })
116 | 
117 | test_that("dualGSEA handles NA values in y", {
118 |   data <- create_stats_test_data()
119 |   data$y[1:2] <- NA
120 |   
121 |   result <- dualGSEA(data$X, data$y, G = data$G)
122 |   
123 |   expect_true(is.matrix(result) || is.data.frame(result))
124 | })
125 | 
126 | test_that("dualGSEA works with GMT list input", {
127 |   data <- create_stats_test_data()
128 |   
129 |   result <- dualGSEA(data$X, data$y, gmt = data$gmt, G = data$G)
130 |   
131 |   expect_true(is.matrix(result) || is.data.frame(result))
132 | })
133 | 
134 | test_that("dualGSEA works with precomputed pv1 and pv2", {
135 |   data <- create_stats_test_data()
136 |   pv1 <- rep(0.05, ncol(data$G))
137 |   names(pv1) <- colnames(data$G)
138 |   pv2 <- rep(0.1, ncol(data$G))
139 |   names(pv2) <- colnames(data$G)
140 |   
141 |   result <- dualGSEA(data$X, data$y, G = data$G, pv1 = pv1, pv2 = pv2)
142 |   
143 |   expect_true(is.matrix(result) || is.data.frame(result))
144 | })
145 | 
146 | test_that("dualGSEA works with sparse matrix input", {
147 |   data <- create_stats_test_data(n_pathways = 25)  # Use more pathways to avoid normalization issue
148 |   X_sparse <- Matrix::Matrix(data$X, sparse = TRUE)
149 |   
150 |   result <- dualGSEA(X_sparse, data$y, G = data$G)
151 |   
152 |   expect_true(is.matrix(result) || is.data.frame(result))
153 | })
154 | 
155 | # ============================================================================
156 | # Test: fc_ttest()
157 | # ============================================================================
158 | 
159 | test_that("fc_ttest works with basic input", {
160 |   data <- create_stats_test_data()
161 |   fc <- rowMeans(data$X[, data$y == 1]) - rowMeans(data$X[, data$y == 0])
162 |   names(fc) <- rownames(data$X)
163 |   
164 |   result <- fc_ttest(fc, data$G)
165 |   
166 |   expect_true(is.matrix(result) || is.data.frame(result))
167 |   expect_true("gsetFC" %in% colnames(result))
168 |   expect_true("pvalue" %in% colnames(result))
169 |   expect_true("qvalue" %in% colnames(result))
170 | })
171 | 
172 | test_that("fc_ttest works with different sort.by options", {
173 |   data <- create_stats_test_data()
174 |   fc <- rowMeans(data$X[, data$y == 1]) - rowMeans(data$X[, data$y == 0])
175 |   names(fc) <- rownames(data$X)
176 |   
177 |   result_pvalue <- fc_ttest(fc, data$G, sort.by = "pvalue")
178 |   result_fc <- fc_ttest(fc, data$G, sort.by = "gsetFC")
179 |   result_none <- fc_ttest(fc, data$G, sort.by = "none")
180 |   
181 |   expect_true(is.matrix(result_pvalue) || is.data.frame(result_pvalue))
182 |   expect_true(is.matrix(result_fc) || is.data.frame(result_fc))
183 |   expect_true(is.matrix(result_none) || is.data.frame(result_none))
184 | })
185 | 
186 | test_that("fc_ttest throws error for unnamed fc", {
187 |   data <- create_stats_test_data()
188 |   fc <- rnorm(nrow(data$X))
189 |   
190 |   expect_error(fc_ttest(fc, data$G))
191 | })
192 | 
193 | # ============================================================================
194 | # Test: fc_ztest()
195 | # ============================================================================
196 | 
197 | test_that("fc_ztest works with basic input", {
198 |   data <- create_stats_test_data()
199 |   fc <- rowMeans(data$X[, data$y == 1]) - rowMeans(data$X[, data$y == 0])
200 |   names(fc) <- rownames(data$X)
201 |   
202 |   result <- fc_ztest(fc, data$G)
203 |   
204 |   expect_true(is.list(result))
205 |   expect_true("z_statistic" %in% names(result))
206 |   expect_true("p_value" %in% names(result))
207 | })
208 | 
209 | test_that("fc_ztest works with zmat=TRUE", {
210 |   data <- create_stats_test_data()
211 |   fc <- rowMeans(data$X[, data$y == 1]) - rowMeans(data$X[, data$y == 0])
212 |   names(fc) <- rownames(data$X)
213 |   
214 |   result <- fc_ztest(fc, data$G, zmat=TRUE)
215 |   
216 |   expect_true(is.list(result))
217 |   expect_true("zmat" %in% names(result))
218 |   expect_true("z_statistic" %in% names(result))  
219 |   expect_true("p_value" %in% names(result))
220 | })
221 | 
222 | test_that("fc_ttest throws error for unnamed fc", {
223 |   data <- create_stats_test_data()
224 |   fc <- rnorm(nrow(data$X))
225 |   
226 |   expect_error(fc_ztest(fc, data$G))
227 | })
228 | 
229 | # ============================================================================
230 | # Test: gset_averageCLR()
231 | # ============================================================================
232 | 
233 | test_that("gset_averageCLR works with basic input", {
234 |   data <- create_stats_test_data()
235 |   
236 |   result <- gset_averageCLR(data$X, data$G)
237 |   
238 |   expect_true(is.matrix(result))
239 |   expect_equal(nrow(result), ncol(data$G))
240 |   expect_equal(ncol(result), ncol(data$X))
241 | })
242 | 
243 | test_that("gset_averageCLR works with center=FALSE", {
244 |   data <- create_stats_test_data()
245 |   
246 |   result <- gset_averageCLR(data$X, data$G, center = FALSE)
247 |   
248 |   expect_true(is.matrix(result))
249 |   expect_equal(dim(result), c(ncol(data$G), ncol(data$X)))
250 | })
251 | 
252 | test_that("gset_averageCLR works with single column", {
253 |   data <- create_stats_test_data(n_samples = 2)
254 |   X_single <- data$X[, 1, drop = FALSE]
255 |   
256 |   result <- gset_averageCLR(X_single, data$G)
257 |   
258 |   expect_true(is.matrix(result))
259 |   expect_equal(ncol(result), 1)
260 | })
261 | 
262 | test_that("gset_averageCLR handles no overlapping features", {
263 |   data <- create_stats_test_data()
264 |   rownames(data$X) <- paste0("OTHER_GENE", 1:nrow(data$X))
265 |   
266 |   result <- gset_averageCLR(data$X, data$G)
267 |   
268 |   expect_null(result)
269 | })
270 | 
271 | # ============================================================================
272 | # Test: gset_ttest()
273 | # ============================================================================
274 | 
275 | test_that("gset_ttest works with basic input", {
276 |   data <- create_stats_test_data()
277 |   gsetX <- plaid(data$X, data$G)
278 |   
279 |   result <- gset_ttest(gsetX, data$y)
280 |   
281 |   expect_true(is.matrix(result) || is.data.frame(result))
282 |   expect_true("diff" %in% colnames(result))
283 |   expect_true("pvalue" %in% colnames(result))
284 |   expect_equal(nrow(result), nrow(gsetX))
285 | })
286 | 
287 | test_that("gset_ttest throws error for invalid y", {
288 |   data <- create_stats_test_data()
289 |   gsetX <- plaid(data$X, data$G)
290 |   invalid_y <- rep(c(0, 1, 2), length.out = ncol(gsetX))
291 |   
292 |   expect_error(gset_ttest(gsetX, invalid_y))
293 | })
294 | 
295 | test_that("gset_ttest produces expected output structure", {
296 |   data <- create_stats_test_data()
297 |   gsetX <- plaid(data$X, data$G)
298 |   
299 |   result <- gset_ttest(gsetX, data$y)
300 |   
301 |   expect_true(all(c("diff", "pvalue") %in% colnames(result)))
302 |   expect_equal(rownames(result), rownames(gsetX))
303 | })
304 | 
305 | # ============================================================================
306 | # Test: matrix_onesample_ttest()
307 | # ============================================================================
308 | 
309 | test_that("matrix_onesample_ttest works with basic input", {
310 |   set.seed(456)
311 |   Fm <- rnorm(100)
312 |   names(Fm) <- paste0("GENE", 1:100)
313 |   data <- create_stats_test_data()
314 |   
315 |   result <- matrix_onesample_ttest(Fm, data$G)
316 |   
317 |   expect_true(is.list(result))
318 |   expect_true("mean" %in% names(result))
319 |   expect_true("t" %in% names(result))
320 |   expect_true("p" %in% names(result))
321 |   expect_equal(nrow(result$mean), ncol(data$G))
322 | })
323 | 
324 | test_that("matrix_onesample_ttest produces finite values", {
325 |   set.seed(456)
326 |   Fm <- rnorm(100)
327 |   names(Fm) <- paste0("GENE", 1:100)
328 |   data <- create_stats_test_data()
329 |   
330 |   result <- matrix_onesample_ttest(Fm, data$G)
331 |   
332 |   expect_true(all(is.finite(result$mean)))
333 |   expect_true(all(is.finite(result$t)))
334 |   expect_true(all(is.finite(result$p)))
335 | })
336 | 
337 | test_that("matrix_onesample_ttest handles varying gene set sizes", {
338 |   set.seed(456)
339 |   Fm <- rnorm(100)
340 |   names(Fm) <- paste0("GENE", 1:100)
341 |   data <- create_stats_test_data(n_pathways = 10)
342 |   
343 |   result <- matrix_onesample_ttest(Fm, data$G)
344 |   
345 |   expect_equal(nrow(result$mean), ncol(data$G))
346 | })
347 | 
348 | # ============================================================================
349 | # Test: matrix_metap()
350 | # ============================================================================
351 | 
352 | test_that("matrix_metap works with stouffer method", {
353 |   plist <- list(
354 |     p1 = c(0.05, 0.01, 0.1),
355 |     p2 = c(0.08, 0.02, 0.15)
356 |   )
357 |   
358 |   result <- matrix_metap(plist, method = "stouffer")
359 |   
360 |   expect_true(is.numeric(result))
361 |   expect_equal(length(result), 3)
362 |   expect_true(all(result >= 0 & result <= 1))
363 | })
364 | 
365 | test_that("matrix_metap works with fisher method", {
366 |   plist <- list(
367 |     p1 = c(0.05, 0.01, 0.1),
368 |     p2 = c(0.08, 0.02, 0.15)
369 |   )
370 |   
371 |   result <- matrix_metap(plist, method = "fisher")
372 |   
373 |   expect_true(is.numeric(result))
374 |   expect_equal(length(result), 3)
375 |   expect_true(all(result >= 0 & result <= 1))
376 | })
377 | 
378 | test_that("matrix_metap works with matrix input", {
379 |   pmat <- matrix(c(0.05, 0.01, 0.1, 0.08, 0.02, 0.15), nrow = 3, ncol = 2)
380 |   
381 |   result <- matrix_metap(pmat, method = "stouffer")
382 |   
383 |   expect_true(is.numeric(result))
384 |   expect_equal(length(result), 3)
385 | })
386 | 
387 | test_that("matrix_metap results differ by method", {
388 |   plist <- list(
389 |     p1 = c(0.05, 0.01, 0.1),
390 |     p2 = c(0.08, 0.02, 0.15)
391 |   )
392 |   
393 |   result_stouffer <- matrix_metap(plist, method = "stouffer")
394 |   result_fisher <- matrix_metap(plist, method = "fisher")
395 |   
396 |   expect_false(identical(result_stouffer, result_fisher))
397 | })
398 | 
399 | test_that("matrix_metap works with sumlog alias", {
400 |   plist <- list(
401 |     p1 = c(0.05, 0.01, 0.1),
402 |     p2 = c(0.08, 0.02, 0.15)
403 |   )
404 |   
405 |   result <- matrix_metap(plist, method = "sumlog")
406 |   
407 |   expect_true(is.numeric(result))
408 |   expect_true(all(result >= 0 & result <= 1))
409 | })
410 | 
411 | test_that("matrix_metap works with sumz alias", {
412 |   plist <- list(
413 |     p1 = c(0.05, 0.01, 0.1),
414 |     p2 = c(0.08, 0.02, 0.15)
415 |   )
416 |   
417 |   result <- matrix_metap(plist, method = "sumz")
418 |   
419 |   expect_true(is.numeric(result))
420 |   expect_true(all(result >= 0 & result <= 1))
421 | })
422 | 
423 | test_that("matrix_metap throws error for invalid method", {
424 |   plist <- list(
425 |     p1 = c(0.05, 0.01, 0.1),
426 |     p2 = c(0.08, 0.02, 0.15)
427 |   )
428 |   
429 |   expect_error(matrix_metap(plist, method = "invalid"))
430 | })
431 | 
432 | test_that("matrix_metap handles extreme p-values", {
433 |   plist <- list(
434 |     p1 = c(1e-10, 0.5, 0.99),
435 |     p2 = c(1e-9, 0.6, 0.999)
436 |   )
437 |   
438 |   result <- matrix_metap(plist, method = "stouffer")
439 |   
440 |   expect_true(all(is.finite(result)))
441 |   expect_true(all(result >= 0 & result <= 1))
442 | })
443 | 
444 | # ============================================================================
445 | # Test: gset.rankcor()
446 | # ============================================================================
447 | 
448 | test_that("gset.rankcor works with basic input", {
449 |   set.seed(456)
450 |   rnk <- rnorm(100)
451 |   names(rnk) <- paste0("GENE", 1:100)
452 |   data <- create_stats_test_data()
453 |   
454 |   result <- gset.rankcor(rnk, data$G, compute.p = FALSE)
455 |   
456 |   expect_true(is.list(result))
457 |   expect_true("rho" %in% names(result))
458 |   expect_equal(nrow(result$rho), ncol(data$G))
459 | })
460 | 
461 | test_that("gset.rankcor works with compute.p=TRUE", {
462 |   set.seed(456)
463 |   rnk <- rnorm(100)
464 |   names(rnk) <- paste0("GENE", 1:100)
465 |   data <- create_stats_test_data()
466 |   
467 |   result <- gset.rankcor(rnk, data$G, compute.p = TRUE)
468 |   
469 |   expect_true(is.list(result))
470 |   expect_true("rho" %in% names(result))
471 |   expect_true("p.value" %in% names(result))
472 |   expect_true("q.value" %in% names(result))
473 | })
474 | 
475 | test_that("gset.rankcor works with use.rank=FALSE", {
476 |   set.seed(456)
477 |   rnk <- rnorm(100)
478 |   names(rnk) <- paste0("GENE", 1:100)
479 |   data <- create_stats_test_data()
480 |   
481 |   result <- gset.rankcor(rnk, data$G, use.rank = FALSE)
482 |   
483 |   expect_true(is.list(result))
484 |   expect_true("rho" %in% names(result))
485 | })
486 | 
487 | test_that("gset.rankcor works with matrix input", {
488 |   set.seed(456)
489 |   rnk <- matrix(rnorm(200), nrow = 100, ncol = 2)
490 |   rownames(rnk) <- paste0("GENE", 1:100)
491 |   colnames(rnk) <- c("Cond1", "Cond2")
492 |   data <- create_stats_test_data()
493 |   
494 |   result <- gset.rankcor(rnk, data$G)
495 |   
496 |   expect_true(is.list(result))
497 |   expect_equal(ncol(result$rho), 2)
498 | })
499 | 
500 | test_that("gset.rankcor throws error for unnamed vector", {
501 |   rnk <- rnorm(100)
502 |   data <- create_stats_test_data()
503 |   
504 |   expect_error(gset.rankcor(rnk, data$G))
505 | })
506 | 
507 | test_that("gset.rankcor throws error for matrix without rownames", {
508 |   rnk <- matrix(rnorm(200), nrow = 100, ncol = 2)
509 |   data <- create_stats_test_data()
510 |   
511 |   expect_error(gset.rankcor(rnk, data$G))
512 | })
513 | 
514 | test_that("gset.rankcor returns NULL for zero columns", {
515 |   set.seed(456)
516 |   rnk <- rnorm(100)
517 |   names(rnk) <- paste0("GENE", 1:100)
518 |   empty_G <- Matrix::Matrix(0, nrow = 100, ncol = 0, sparse = TRUE)
519 |   rownames(empty_G) <- paste0("GENE", 1:100)
520 |   
521 |   result <- gset.rankcor(rnk, empty_G)
522 |   
523 |   expect_null(result)
524 | })
525 | 
526 | test_that("gset.rankcor throws error for non-Matrix gset", {
527 |   set.seed(456)
528 |   rnk <- rnorm(100)
529 |   names(rnk) <- paste0("GENE", 1:100)
530 |   gset <- list(pathway1 = c("GENE1", "GENE2"))
531 |   
532 |   expect_error(gset.rankcor(rnk, gset))
533 | })
534 | 
535 | test_that("gset.rankcor handles transposed input correctly", {
536 |   set.seed(456)
537 |   rnk <- rnorm(100)
538 |   names(rnk) <- paste0("GENE", 1:100)
539 |   data <- create_stats_test_data()
540 |   G_transposed <- Matrix::t(data$G)
541 |   
542 |   result <- gset.rankcor(rnk, G_transposed)
543 |   
544 |   expect_true(is.list(result))
545 | })
546 | 
547 | # ============================================================================
548 | # Test: cor_sparse_matrix()
549 | # ============================================================================
550 | 
551 | test_that("cor_sparse_matrix works without missing values", {
552 |   set.seed(456)
553 |   G <- Matrix::Matrix(rnorm(500), nrow = 100, ncol = 5, sparse = TRUE)
554 |   mat <- matrix(rnorm(200), nrow = 100, ncol = 2)
555 |   
556 |   result <- cor_sparse_matrix(G, mat)
557 |   
558 |   expect_true(is.matrix(result) || inherits(result, "Matrix"))
559 |   expect_equal(nrow(result), ncol(G))
560 |   expect_equal(ncol(result), ncol(mat))
561 | })
562 | 
563 | test_that("cor_sparse_matrix works with missing values", {
564 |   set.seed(456)
565 |   G <- Matrix::Matrix(rnorm(500), nrow = 100, ncol = 5, sparse = TRUE)
566 |   mat <- matrix(rnorm(200), nrow = 100, ncol = 2)
567 |   mat[1:5, 1] <- NA
568 |   
569 |   result <- cor_sparse_matrix(G, mat)
570 |   
571 |   expect_true(is.matrix(result) || inherits(result, "Matrix"))
572 |   expect_equal(nrow(result), ncol(G))
573 |   expect_equal(ncol(result), ncol(mat))
574 | })
575 | 
576 | test_that("cor_sparse_matrix produces finite values", {
577 |   set.seed(456)
578 |   G <- Matrix::Matrix(rnorm(500), nrow = 100, ncol = 5, sparse = TRUE)
579 |   mat <- matrix(rnorm(200), nrow = 100, ncol = 2)
580 |   
581 |   result <- cor_sparse_matrix(G, mat)
582 |   
583 |   # Some correlations may be NA/NaN for constant columns
584 |   expect_true(is.matrix(result) || inherits(result, "Matrix"))
585 | })
586 | 
587 | # ============================================================================
588 | # Test: Integration tests
589 | # ============================================================================
590 | 
591 | test_that("dualGSEA and gset_ttest produce consistent results", {
592 |   data <- create_stats_test_data()
593 |   gsetX <- plaid(data$X, data$G)
594 |   
595 |   # Run dualGSEA with precomputed gsetX
596 |   dual_result <- dualGSEA(data$X, data$y, G = data$G)
597 |   
598 |   # Run gset_ttest separately
599 |   ttest_result <- gset_ttest(gsetX, data$y)
600 |   
601 |   # Both should have same number of gene sets
602 |   expect_equal(nrow(dual_result), nrow(ttest_result))
603 | })
604 | 
605 | test_that("matrix_metap with different p-value sources", {
606 |   data <- create_stats_test_data()
607 |   
608 |   # Create dummy p-values
609 |   pv1 <- runif(ncol(data$G))
610 |   pv2 <- runif(ncol(data$G))
611 |   plist <- list(p1 = pv1, p2 = pv2)
612 |   
613 |   # Combine with both methods
614 |   combined_stouffer <- matrix_metap(plist, method = "stouffer")
615 |   combined_fisher <- matrix_metap(plist, method = "fisher")
616 |   
617 |   # Combined p-values should be more significant than individual ones
618 |   expect_true(all(combined_stouffer <= 1))
619 |   expect_true(all(combined_fisher <= 1))
620 | })
621 | 
622 | # ============================================================================
623 | # Test: Edge cases and error handling
624 | # ============================================================================
625 | 
626 | test_that("dualGSEA handles small sample sizes", {
627 |   data <- create_stats_test_data(n_samples = 4)
628 |   
629 |   result <- dualGSEA(data$X, data$y, G = data$G)
630 |   
631 |   expect_true(is.matrix(result) || is.data.frame(result))
632 | })
633 | 
634 | test_that("gset_averageCLR handles single gene set", {
635 |   data <- create_stats_test_data(n_pathways = 1)
636 |   
637 |   result <- gset_averageCLR(data$X, data$G)
638 |   
639 |   expect_true(is.matrix(result))
640 |   expect_equal(nrow(result), 1)
641 | })
642 | 
643 | test_that("matrix_onesample_ttest handles small gene sets", {
644 |   set.seed(456)
645 |   Fm <- rnorm(100)
646 |   names(Fm) <- paste0("GENE", 1:100)
647 |   
648 |   # Create gene set with only 2 genes
649 |   G <- Matrix::Matrix(0, nrow = 100, ncol = 1, sparse = TRUE)
650 |   rownames(G) <- paste0("GENE", 1:100)
651 |   colnames(G) <- "SmallSet"
652 |   G[1:2, 1] <- 1
653 |   
654 |   result <- matrix_onesample_ttest(Fm, G)
655 |   
656 |   expect_true(is.list(result))
657 |   expect_true(all(is.finite(result$mean)))
658 | })
659 | 
660 | test_that("gset.rankcor handles partial overlap", {
661 |   set.seed(456)
662 |   rnk <- rnorm(50)
663 |   names(rnk) <- paste0("GENE", 1:50)
664 |   data <- create_stats_test_data()
665 |   
666 |   result <- gset.rankcor(rnk, data$G)
667 |   
668 |   expect_true(is.list(result))
669 | })
670 | 
671 | test_that("dualGSEA handles unbalanced groups", {
672 |   data <- create_stats_test_data()
673 |   data$y <- rep(0, length(data$y))
674 |   data$y[1:3] <- 1  # Few samples in group 1
675 |   
676 |   # This should work but may produce warnings
677 |   result <- suppressWarnings(dualGSEA(data$X, data$y, G = data$G))
678 |   
679 |   expect_true(is.matrix(result) || is.data.frame(result))
680 | })
681 | 
682 | # ============================================================================
683 | # Test: Numerical stability
684 | # ============================================================================
685 | 
686 | test_that("matrix_metap handles very small p-values", {
687 |   plist <- list(
688 |     p1 = rep(1e-100, 5),
689 |     p2 = rep(1e-100, 5)
690 |   )
691 |   
692 |   result <- matrix_metap(plist, method = "stouffer")
693 |   
694 |   expect_true(all(is.finite(result)))
695 |   expect_true(all(result >= 0))
696 | })
697 | 
698 | test_that("matrix_metap handles p-values near 1", {
699 |   plist <- list(
700 |     p1 = rep(0.99, 5),
701 |     p2 = rep(0.999, 5)
702 |   )
703 |   
704 |   result <- matrix_metap(plist, method = "stouffer")
705 |   
706 |   expect_true(all(is.finite(result)))
707 |   expect_true(all(result <= 1))
708 | })
709 | 
710 | test_that("gset_averageCLR handles extreme values", {
711 |   data <- create_stats_test_data()
712 |   data$X <- data$X * 1e6
713 |   
714 |   result <- gset_averageCLR(data$X, data$G)
715 |   
716 |   expect_true(is.matrix(result))
717 |   expect_true(all(is.finite(result)))
718 | })
719 | 
720 | test_that("matrix_onesample_ttest handles constant values", {
721 |   Fm <- rep(1, 100)
722 |   names(Fm) <- paste0("GENE", 1:100)
723 |   data <- create_stats_test_data()
724 |   
725 |   result <- matrix_onesample_ttest(Fm, data$G)
726 |   
727 |   expect_true(is.list(result))
728 |   # P-values should be 1 for no variance
729 |   expect_true(all(result$p >= 0 & result$p <= 1))
730 | })
731 | 
732 | 


--------------------------------------------------------------------------------
/R/plaid.R:
--------------------------------------------------------------------------------
  1 | ##
  2 | ## This file is part of the Omics Playground project.
  3 | ## Copyright (c) 2018-2025 BigOmics Analytics SA. All rights reserved.
  4 | ##
  5 | 
  6 | #' @importFrom methods as
  7 | #' @importFrom stats ecdf
  8 | #' @importFrom Matrix colSums colScale crossprod Diagonal rowMeans t which
  9 | #' @importFrom matrixStats colRanks colMedians
 10 | #' @importFrom sparseMatrixStats colRanks
 11 | NULL
 12 | 
 13 | #' Compute PLAID single-sample enrichment score 
 14 | #'
 15 | #' @description Compute single-sample geneset expression as the
 16 | #'   average log-expression f genes in the geneset. Requires log-expression
 17 | #'   matrix X and (sparse) geneset matrix matG. If you have gene sets
 18 | #'   as a gmt list, please convert it first using the function `gmt2mat()`.
 19 | #'
 20 | #' @details PLAID needs the gene sets as sparse matrix. If you have
 21 | #'   your collection of gene sets a a list, we need first to convert
 22 | #'   the gmt list to matrix format.
 23 | #' 
 24 | #' @details We recommend to run PLAID on the log transformed expression matrix,
 25 | #' not on the counts, as the average in the logarithmic space is more
 26 | #' robust and is in concordance to calculating the geometric mean.
 27 | #'
 28 | #' @details It is not necessary to normalize your expression matrix before
 29 | #' running PLAID because PLAID performs median normalization of the
 30 | #' enrichment scores afterwards.
 31 | #'
 32 | #' @details It is recommended to use sparse matrix as PLAID relies on
 33 | #' sparse matrix computations. But, PLAID is also fast for dense matrices.
 34 | #'
 35 | #' @details PLAID can also be run on the ranked matrix. This corresponds to
 36 | #' the singscore (Fouratan et al., 2018). PLAID can also be run on
 37 | #' the (non-logarithmic) counts which can be used to calculate the
 38 | #' scSE score (Pont et al., 2019).
 39 | #'
 40 | #' @details PLAID is fast and memery efficient because it uses efficient
 41 | #' sparse matrix computation. When input matrix is very large, PLAID
 42 | #' performs 'chunked' computation by splitting the matrix in chunks.
 43 | #'
 44 | #' @details Although `X` and `matG` are generally sparse, the result
 45 | #' matrix `gsetX` generally is dense and can thus be very large.
 46 | #' Example: computing gene set scores for 10K gene sets on 1M cells
 47 | #' will create a 10K x 1M dense matrix which requires ~75GB memory.
 48 | #' 
 49 | #' @details PLAID now automatically detects and handles Bioconductor objects.
 50 | #' If X is a SummarizedExperiment or SingleCellExperiment, it will extract
 51 | #' the appropriate assay. If matG is a BiocSet object or GMT list, it will
 52 | #' be converted to sparse matrix format automatically.
 53 | #' 
 54 | #' @param X Log-transformed expr. matrix. Genes on rows, samples on columns.
 55 | #'   Also accepts SummarizedExperiment or SingleCellExperiment objects.
 56 | #' @param matG Gene sets sparse matrix. Genes on rows, gene sets on columns.
 57 | #'   Also accepts BiocSet objects or GMT lists (named list of gene vectors).
 58 | #' @param stats Score computation stats: mean or sum of intensity. Default 'mean'.
 59 | #' @param chunk Logical: use chunks for large matrices. Default 'NULL' for autodetect.
 60 | #' @param normalize Logical: median normalize results or not. Default 'TRUE'.
 61 | #' @param nsmooth Smoothing parameter for more stable average when stats="mean". Default 3.
 62 | #' @param assay Character: assay name to extract from SummarizedExperiment/SingleCellExperiment. Default "logcounts".
 63 | #' @param min.genes Integer: minimum genes per gene set (for BiocSet/GMT input). Default 5.
 64 | #' @param max.genes Integer: maximum genes per gene set (for BiocSet/GMT input). Default 500.
 65 | #'
 66 | #' @return Matrix of single-sample enrichment scores.
 67 | #' Gene sets on rows, samples on columns.
 68 | #' 
 69 | #' @examples
 70 | #' library(plaid)
 71 | #' 
 72 | #' # Create example expression matrix
 73 | #' set.seed(123)
 74 | #' X <- matrix(rnorm(1000), nrow = 100, ncol = 10)
 75 | #' rownames(X) <- paste0("GENE", 1:100)
 76 | #' colnames(X) <- paste0("Sample", 1:10)
 77 | #' 
 78 | #' # Create example gene sets
 79 | #' gmt <- list(
 80 | #'   "Pathway1" = paste0("GENE", 1:20),
 81 | #'   "Pathway2" = paste0("GENE", 15:35),
 82 | #'   "Pathway3" = paste0("GENE", 30:50)
 83 | #' )
 84 | #' matG <- gmt2mat(gmt)
 85 | #' 
 86 | #' # Compute PLAID scores
 87 | #' gsetX <- plaid(X, matG)
 88 | #' print(dim(gsetX))
 89 | #' print(gsetX[1:3, 1:5])
 90 | #' 
 91 | #' # Use sum statistics instead of mean
 92 | #' gsetX_sum <- plaid(X, matG, stats = "sum")
 93 | #' 
 94 | #' \donttest{
 95 | #' # Using real data (if available in package)
 96 | #' extdata_path <- system.file("extdata", "pbmc3k-50cells.rda", package = "plaid")
 97 | #' if (file.exists(extdata_path)) {
 98 | #'   load(extdata_path)
 99 | #'   hallmarks <- system.file("extdata", "hallmarks.gmt", package = "plaid")
100 | #'   gmt <- read.gmt(hallmarks)
101 | #'   matG <- gmt2mat(gmt)
102 | #'   gsetX <- plaid(X, matG)
103 | #' }
104 | #' }
105 | #'
106 | #' @export
107 | plaid <- function(X, matG, stats=c("mean","sum"), chunk=NULL, normalize=TRUE,
108 |                   nsmooth=3, assay="logcounts", min.genes=5, max.genes=500) {
109 | 
110 |   ## Auto-detect and convert Bioconductor objects
111 |   if (inherits(X, "SummarizedExperiment") || inherits(X, "SingleCellExperiment")) {
112 |     X <- .extract_expression_matrix(X, assay=assay, log.transform=FALSE)
113 |   }
114 |   
115 |   if (inherits(matG, "BiocSet") || (is.list(matG) && !is.matrix(matG) && !inherits(matG, "Matrix"))) {
116 |     matG <- .convert_geneset_to_matrix(matG, background=rownames(X), 
117 |                                        min.genes=min.genes, max.genes=max.genes)
118 |   }
119 | 
120 |   stats <- stats[1]
121 |   if (NCOL(X) == 1) X <- cbind(X)
122 | 
123 |   ## make sure X is matrix (not dataframe) and convert to sparse if needed
124 |   if(!inherits(X, "Matrix")) {
125 |     X <- as.matrix(X)
126 |     if(mean(X==0,na.rm=TRUE)>0.5) X <- Matrix::Matrix(X, sparse=TRUE)
127 |   }
128 |   
129 |   gg <- intersect(rownames(X), rownames(matG))
130 |   if (length(gg) == 0) {
131 |     message("[plaid] No overlapping features.")
132 |     return(NULL)
133 |   }
134 | 
135 |   X <- X[gg, , drop = FALSE]
136 |   matG <- matG[gg, , drop = FALSE]
137 |   G <- 1 * (matG != 0)
138 |   if(stats == "mean") {
139 |     # nsmooth is 'smoothing' parameter for more stable average
140 |     sumG <- 1e-8 + nsmooth + Matrix::colSums(G, na.rm = TRUE)
141 |     G <- Matrix::colScale(G, 1 / sumG)
142 |   }
143 | 
144 |   ## Calculates PLAID score
145 |   gsetX <- chunked_crossprod(G, X, chunk=NULL)
146 |   gsetX <- as.matrix(gsetX)
147 |   
148 |   if(normalize) {
149 |     if(nrow(gsetX) < 20) {
150 |       ## for few genesets, median-norm is not good
151 |       normfactor <- Matrix::colMeans(gsetX,na.rm=TRUE)
152 |       normfactor <- normfactor - mean(normfactor)
153 |       gsetX <- sweep(gsetX, 2, normfactor, '-') 
154 |     } else {
155 |       gsetX <- normalize_medians(gsetX)
156 |     }
157 |   }
158 |  
159 |   return(gsetX)
160 | 
161 | }
162 | 
163 | #' Chunked computation of cross product
164 | #'
165 | #' Compute crossprod (t(x) %*% y) for very large y by computing in
166 | #' chunks.
167 | #'
168 | #' @param x Matrix First matrix for multiplication. Can be sparse.
169 | #' @param y Matrix Second matrix for multiplication. Can be sparse.
170 | #' @param chunk Integer Chunk size (max number of columns) for computation.
171 | #'
172 | #' @return Matrix. Result of matrix cross product.
173 | #' 
174 | chunked_crossprod <- function(x, y, chunk=NULL) {
175 |   if(is.null(chunk) || chunk < 0) {
176 |     ## if y is large, we need to chunk computation
177 |     Int_max <- .Machine$integer.max
178 |     chunk <- round(0.8 * Int_max / ncol(x))
179 |   }
180 | 
181 |   if(ncol(y) < chunk) return(Matrix::crossprod(x, y))
182 |  
183 |   message("[chunked_crossprod] chunked compute: chunk = ", chunk)
184 |   k <- ceiling(ncol(y) / chunk)
185 |   gsetX <- matrix(NA, nrow=ncol(x), ncol=ncol(y),
186 |     dimnames=list(colnames(x),colnames(y)))
187 | 
188 |   for(i in seq_len(k)) {
189 |     jj <- c(((i-1)*chunk+1):min(ncol(y),(i*chunk)))
190 |     xy <- Matrix::crossprod(x, y[,jj])
191 |     gsetX[,jj] <- as.matrix(xy)
192 |   }
193 | 
194 |   return(gsetX)
195 | 
196 | }
197 | 
198 | #' Fast calculation of scSE score
199 | #'
200 | #' @description Calculates Single-Cell Signature Explorer (Pont et
201 | #'   al., 2019) scores using plaid back-end. The computation is
202 | #'   10-100x faster than the original code.
203 | #'
204 | #' @details Computing the scSE requires running plaid on the linear
205 | #'   (not logarithmic) score and perform additional normalization by
206 | #'   the total UMI per sample. We have wrapped this in a single
207 | #'   convenience function:
208 | #'
209 | #' To replicate the original "sum-of-UMI" scSE score, set `removeLog2=TRUE`
210 | #' and `scoreMean=FALSE`. scSE and plaid scores become more similar for
211 | #' `removeLog2=FALSE` and `scoreMean=TRUE`.
212 | #'
213 | #' We have extensively compared the results from `replaid.scse` and
214 | #' from the original scSE (implemented in GO lang) and we showed
215 | #' almost identical results in the score, logFC and p-values.
216 | #' 
217 | #' 
218 | #' @param X Gene or protein expression matrix. Generally log
219 | #'   transformed. See details. Genes on rows, samples on columns.
220 | #'   Also accepts SummarizedExperiment or SingleCellExperiment objects.
221 | #' @param matG Gene sets sparse matrix. Genes on rows, gene sets on
222 | #'   columns. Also accepts BiocSet objects or GMT lists.
223 | #' @param removeLog2 Logical for whether to remove the Log2, i.e. will
224 | #'   apply power transform (base2) on input (default TRUE).
225 | #' @param scoreMean Logical for whether computing sum or mean as score
226 | #'   (default FALSE).
227 | #' @param assay Character: assay name for Bioconductor objects. Default "logcounts".
228 | #' @param min.genes Integer: minimum genes per gene set. Default 5.
229 | #' @param max.genes Integer: maximum genes per gene set. Default 500.
230 | #'
231 | #' @return Matrix of single-sample scSE enrichment scores.
232 | #'   Gene sets on rows, samples on columns.
233 | #'
234 | #' @examples
235 | #' # Create example expression matrix (log-transformed)
236 | #' set.seed(123)
237 | #' X <- log2(matrix(rpois(500, lambda = 10) + 1, nrow = 50, ncol = 10))
238 | #' rownames(X) <- paste0("GENE", 1:50)
239 | #' colnames(X) <- paste0("Sample", 1:10)
240 | #' 
241 | #' # Create example gene sets
242 | #' gmt <- list(
243 | #'   "Pathway1" = paste0("GENE", 1:15),
244 | #'   "Pathway2" = paste0("GENE", 10:25)
245 | #' )
246 | #' matG <- gmt2mat(gmt)
247 | #' 
248 | #' # Compute scSE scores (original method)
249 | #' scores <- replaid.scse(X, matG, removeLog2 = TRUE, scoreMean = FALSE)
250 | #' print(scores[1:2, 1:5])
251 | #' 
252 | #' # Compute scSE scores (mean method)
253 | #' scores_mean <- replaid.scse(X, matG, removeLog2 = TRUE, scoreMean = TRUE)
254 | #' print(scores_mean[1:2, 1:5])
255 | #'
256 | #' @export
257 | replaid.scse <- function(X,
258 |                          matG,
259 |                          removeLog2 = NULL,
260 |                          scoreMean = FALSE,
261 |                          assay="logcounts",
262 |                          min.genes=5,
263 |                          max.genes=500) {
264 | 
265 |   ## Auto-detect and convert Bioconductor objects
266 |   if (inherits(X, "SummarizedExperiment") || inherits(X, "SingleCellExperiment")) {
267 |     X <- .extract_expression_matrix(X, assay=assay, log.transform=FALSE)
268 |   }
269 |   
270 |   if (inherits(matG, "BiocSet") || (is.list(matG) && !is.matrix(matG) && !inherits(matG, "Matrix"))) {
271 |     matG <- .convert_geneset_to_matrix(matG, background=rownames(X), 
272 |                                        min.genes=min.genes, max.genes=max.genes)
273 |   }
274 | 
275 |   if(is.null(removeLog2))
276 |     removeLog2 <- min(X, na.rm = TRUE)==0 && max(X, na.rm = TRUE) < 20
277 |   
278 |   if(removeLog2)  {
279 |     message("[replaid.scse] Converting data to linear scale (removing log2)...")
280 |     if(inherits(X,"dgCMatrix")) {
281 |       X@x <- 2**X@x
282 |     } else {
283 |       nz <- Matrix::which(X>0)
284 |       X[nz] <- 2**X[nz]  ## undo only non-zeros as in scSE code
285 |     }
286 |   }
287 | 
288 |   if(scoreMean) {
289 |     ## modified scSE with Mean-statistics    
290 |     sX <- plaid(X, matG, stats="mean", normalize=FALSE)
291 |     sumx <- Matrix::colMeans(abs(X)) + 1e-8    
292 |     sX <- sX %*% Matrix::Diagonal(x = 1/sumx)
293 |   } else {
294 |     ## original scSE with Sum-statistics
295 |     sX <- plaid(X, matG, stats="sum", normalize=FALSE)
296 |     sumx <- Matrix::colSums(abs(X)) + 1e-8
297 |     sX <- sX %*% Matrix::Diagonal(x = 1/sumx) * 100      
298 |   }
299 | 
300 |   colnames(sX) <- colnames(X)
301 |   sX <- as.matrix(sX)
302 | 
303 |   return(sX)
304 | 
305 | }
306 | 
307 | 
308 | #' Fast calculation of singscore
309 | #'
310 | #' @description Calculates single-sample enrichment singscore
311 | #'   (Fouratan et al., 2018) using plaid back-end. The computation is
312 | #'   10-100x faster than the original code.
313 | #'
314 | #' @details Computing the singscore requires to compute the ranks of
315 | #'   the expression matrix. We have wrapped this in a single
316 | #'   convenience function.
317 | #'
318 | #' We have extensively compared the results of `replaid.sing` and from
319 | #' the original `singscore` R package and we showed identical result
320 | #' in the score, logFC and p-values.
321 | #' 
322 | #' @param X Gene or protein expression matrix. Generally log
323 | #'   transformed. See details. Genes on rows, samples on columns.
324 | #'   Also accepts SummarizedExperiment or SingleCellExperiment objects.
325 | #' @param matG Gene sets sparse matrix. Genes on rows, gene sets on
326 | #'   columns. Also accepts BiocSet objects or GMT lists.
327 | #' @param assay Character: assay name for Bioconductor objects. Default "logcounts".
328 | #' @param min.genes Integer: minimum genes per gene set. Default 5.
329 | #' @param max.genes Integer: maximum genes per gene set. Default 500.
330 | #' 
331 | #' @return Matrix of single-sample singscore enrichment scores.
332 | #'   Gene sets on rows, samples on columns.
333 | #'
334 | #' @examples
335 | #' # Create example expression matrix
336 | #' set.seed(123)
337 | #' X <- matrix(rnorm(500), nrow = 50, ncol = 10)
338 | #' rownames(X) <- paste0("GENE", 1:50)
339 | #' colnames(X) <- paste0("Sample", 1:10)
340 | #' 
341 | #' # Create example gene sets
342 | #' gmt <- list(
343 | #'   "Pathway1" = paste0("GENE", 1:15),
344 | #'   "Pathway2" = paste0("GENE", 10:25)
345 | #' )
346 | #' matG <- gmt2mat(gmt)
347 | #' 
348 | #' # Compute singscore
349 | #' scores <- replaid.sing(X, matG)
350 | #' print(scores[1:2, 1:5])
351 | #'
352 | #' @export
353 | replaid.sing <- function(X, matG, assay="logcounts", min.genes=5, max.genes=500) {
354 |   ## Auto-detect and convert Bioconductor objects
355 |   if (inherits(X, "SummarizedExperiment") || inherits(X, "SingleCellExperiment")) {
356 |     X <- .extract_expression_matrix(X, assay=assay, log.transform=FALSE)
357 |   }
358 |   
359 |   if (inherits(matG, "BiocSet") || (is.list(matG) && !is.matrix(matG) && !inherits(matG, "Matrix"))) {
360 |     matG <- .convert_geneset_to_matrix(matG, background=rownames(X), 
361 |                                        min.genes=min.genes, max.genes=max.genes)
362 |   }
363 |   
364 |   ## the ties.method=min is important for exact replication
365 |   rX <- colranks(X, ties.method = "min")
366 |   rX <- rX / nrow(X) - 0.5
367 |   gsetX <- plaid(rX, matG = matG, normalize = FALSE)
368 |   return(gsetX)
369 | }
370 | 
371 | #' Fast calculation of ssGSEA
372 | #'
373 | #' @description Calculates single-sample enrichment singscore (Barbie
374 | #'   et al., 2009; Hänzelmann et al., 2013) using plaid back-end. The
375 | #'   computation is 10-100x faster than the original code.
376 | #'
377 | #' @details Computing ssGSEA score requires to compute the ranks of
378 | #'   the expression matrix and weighting of the ranks. We have wrapped
379 | #'   this in a single convenience function.
380 | #'
381 | #' We have extensively compared the results of `replaid.ssgsea` and
382 | #' from the original `GSVA` R package and we showed highly similar
383 | #' results in the score, logFC and p-values. For alpha=0 we obtain
384 | #' exact results, for alpha>0 the results are highly similar but not
385 | #' exactly the same.
386 | #' 
387 | #' @param X Gene or protein expression matrix. Generally log
388 | #'   transformed. See details. Genes on rows, samples on columns.
389 | #'   Also accepts SummarizedExperiment or SingleCellExperiment objects.
390 | #' @param matG Gene sets sparse matrix. Genes on rows, gene sets on
391 | #'   columns. Also accepts BiocSet objects or GMT lists.
392 | #' @param alpha Weighting factor for exponential weighting of ranks
393 | #' @param assay Character: assay name for Bioconductor objects. Default "logcounts".
394 | #' @param min.genes Integer: minimum genes per gene set. Default 5.
395 | #' @param max.genes Integer: maximum genes per gene set. Default 500.
396 | #' 
397 | #' @return Matrix of single-sample ssGSEA enrichment scores.
398 | #'   Gene sets on rows, samples on columns.
399 | #'
400 | #' @examples
401 | #' # Create example expression matrix
402 | #' set.seed(123)
403 | #' X <- matrix(rnorm(500), nrow = 50, ncol = 10)
404 | #' rownames(X) <- paste0("GENE", 1:50)
405 | #' colnames(X) <- paste0("Sample", 1:10)
406 | #' 
407 | #' # Create example gene sets
408 | #' gmt <- list(
409 | #'   "Pathway1" = paste0("GENE", 1:15),
410 | #'   "Pathway2" = paste0("GENE", 10:25)
411 | #' )
412 | #' matG <- gmt2mat(gmt)
413 | #' 
414 | #' # Compute ssGSEA scores (alpha = 0)
415 | #' scores <- replaid.ssgsea(X, matG, alpha = 0)
416 | #' print(scores[1:2, 1:5])
417 | #' 
418 | #' # Compute ssGSEA scores with weighting (alpha = 0.25)
419 | #' scores_weighted <- replaid.ssgsea(X, matG, alpha = 0.25)
420 | #' print(scores_weighted[1:2, 1:5])
421 | #'
422 | #' @export
423 | replaid.ssgsea <- function(X, matG, alpha = 0, assay="logcounts", min.genes=5, max.genes=500) {
424 |   ## Auto-detect and convert Bioconductor objects
425 |   if (inherits(X, "SummarizedExperiment") || inherits(X, "SingleCellExperiment")) {
426 |     X <- .extract_expression_matrix(X, assay=assay, log.transform=FALSE)
427 |   }
428 |   
429 |   if (inherits(matG, "BiocSet") || (is.list(matG) && !is.matrix(matG) && !inherits(matG, "Matrix"))) {
430 |     matG <- .convert_geneset_to_matrix(matG, background=rownames(X), 
431 |                                        min.genes=min.genes, max.genes=max.genes)
432 |   }
433 |   
434 |   rX <- colranks(X, keep.zero = TRUE, ties.method = "average")
435 |   if(alpha != 0) {
436 |     ## This is not exactly like original formula. Not sure how to
437 |     ## efficiently implement original rank weighting
438 |     rX <- rX^(1 + alpha)
439 |   }
440 |   rX <- rX / max(rX) - 0.5
441 |   dimnames(rX) <- dimnames(X)
442 |   gsetX <- plaid(rX, matG, stats = "mean", normalize = TRUE)
443 |   return(gsetX)
444 | }
445 | 
446 | #' Fast calculation of UCell
447 | #'
448 | #' @description Calculates single-sample enrichment UCell (Andreatta
449 | #'   et al., 2021) using plaid back-end. The computation is
450 | #'   10-100x faster than the original code.
451 | #'
452 | #' @details Computing ssGSEA score requires to compute the ranks of
453 | #'   the expression matrix and truncation of the ranks. We have wrapped
454 | #'   this in a single convenience function.
455 | #'
456 | #' We have extensively compared the results of `replaid.ucell` and
457 | #' from the original `UCell` R package and we showed near exacct
458 | #' results in the score, logFC and p-values. 
459 | #' 
460 | #' @param X Gene or protein expression matrix. Generally log
461 | #'   transformed. See details. Genes on rows, samples on columns.
462 | #'   Also accepts SummarizedExperiment or SingleCellExperiment objects.
463 | #' @param matG Gene sets sparse matrix. Genes on rows, gene sets on columns.
464 | #'   Also accepts BiocSet objects or GMT lists.
465 | #' @param rmax Rank threshold (see Ucell paper). Default rmax = 1500.
466 | #' @param assay Character: assay name for Bioconductor objects. Default "logcounts".
467 | #' @param min.genes Integer: minimum genes per gene set. Default 5.
468 | #' @param max.genes Integer: maximum genes per gene set. Default 500.
469 | #' 
470 | #' @return Matrix of single-sample UCell enrichment scores.
471 | #'   Gene sets on rows, samples on columns.
472 | #'
473 | #' @examples
474 | #' # Create example expression matrix
475 | #' set.seed(123)
476 | #' X <- matrix(rnorm(500), nrow = 50, ncol = 10)
477 | #' rownames(X) <- paste0("GENE", 1:50)
478 | #' colnames(X) <- paste0("Sample", 1:10)
479 | #' 
480 | #' # Create example gene sets
481 | #' gmt <- list(
482 | #'   "Pathway1" = paste0("GENE", 1:15),
483 | #'   "Pathway2" = paste0("GENE", 10:25)
484 | #' )
485 | #' matG <- gmt2mat(gmt)
486 | #' 
487 | #' # Compute UCell scores (default rmax = 1500)
488 | #' scores <- replaid.ucell(X, matG)
489 | #' print(scores[1:2, 1:5])
490 | #' 
491 | #' # Compute UCell scores with custom rmax
492 | #' scores_custom <- replaid.ucell(X, matG, rmax = 1000)
493 | #' print(scores_custom[1:2, 1:5])
494 | #'
495 | #' @export
496 | replaid.ucell <- function(X, matG, rmax = 1500, assay="logcounts", min.genes=5, max.genes=500) {
497 |   ## Auto-detect and convert Bioconductor objects
498 |   if (inherits(X, "SummarizedExperiment") || inherits(X, "SingleCellExperiment")) {
499 |     X <- .extract_expression_matrix(X, assay=assay, log.transform=FALSE)
500 |   }
501 |   
502 |   if (inherits(matG, "BiocSet") || (is.list(matG) && !is.matrix(matG) && !inherits(matG, "Matrix"))) {
503 |     matG <- .convert_geneset_to_matrix(matG, background=rownames(X), 
504 |                                        min.genes=min.genes, max.genes=max.genes)
505 |   }
506 |   
507 |   rX <- colranks(X, ties.method = "average")
508 |   rX <- pmin( max(rX) - rX, rmax+1 )
509 |   S <- plaid(rX, matG)
510 |   S <- 1 - S / rmax + (Matrix::colSums(matG!=0)+1)/(2*rmax)
511 |   return(S)
512 | }
513 | 
514 | #' Fast calculation of AUCell
515 | #'
516 | #' @description Calculates single-sample enrichment AUCell (Aibar
517 | #'   et al., 2017) using plaid back-end. The computation is
518 | #'   10-100x faster than the original code.
519 | #'
520 | #' @details Computing the AUCell score requires to compute the ranks
521 | #'   of the expression matrix and approximating the AUC of a gene
522 | #'   set. We have wrapped this in a single convenience function.
523 | #'
524 | #' We have extensively compared the results of `replaid.aucell` and
525 | #' from the original `AUCell` R package and we showed good concordance
526 | #' of results in the score, logFC and p-values.
527 | #' 
528 | #' @param X Gene or protein expression matrix. Generally log
529 | #'   transformed. See details. Genes on rows, samples on columns.
530 | #'   Also accepts SummarizedExperiment or SingleCellExperiment objects.
531 | #' @param matG Gene sets sparse matrix. Genes on rows, gene sets on columns.
532 | #'   Also accepts BiocSet objects or GMT lists.
533 | #' @param aucMaxRank Rank threshold (see AUCell paper). Default aucMaxRank = 0.05*nrow(X).
534 | #' @param assay Character: assay name for Bioconductor objects. Default "logcounts".
535 | #' @param min.genes Integer: minimum genes per gene set. Default 5.
536 | #' @param max.genes Integer: maximum genes per gene set. Default 500.
537 | #' 
538 | #' @return Matrix of single-sample AUCell enrichment scores.
539 | #'   Gene sets on rows, samples on columns.
540 | #'
541 | #' @examples
542 | #' # Create example expression matrix
543 | #' set.seed(123)
544 | #' X <- matrix(rnorm(500), nrow = 50, ncol = 10)
545 | #' rownames(X) <- paste0("GENE", 1:50)
546 | #' colnames(X) <- paste0("Sample", 1:10)
547 | #' 
548 | #' # Create example gene sets
549 | #' gmt <- list(
550 | #'   "Pathway1" = paste0("GENE", 1:15),
551 | #'   "Pathway2" = paste0("GENE", 10:25)
552 | #' )
553 | #' matG <- gmt2mat(gmt)
554 | #' 
555 | #' # Compute AUCell scores
556 | #' scores <- replaid.aucell(X, matG)
557 | #' print(scores[1:2, 1:5])
558 | #'
559 | #' @export
560 | replaid.aucell <- function(X, matG, aucMaxRank = NULL, assay="logcounts", min.genes=5, max.genes=500) {
561 |   ## Auto-detect and convert Bioconductor objects
562 |   if (inherits(X, "SummarizedExperiment") || inherits(X, "SingleCellExperiment")) {
563 |     X <- .extract_expression_matrix(X, assay=assay, log.transform=FALSE)
564 |   }
565 |   
566 |   if (inherits(matG, "BiocSet") || (is.list(matG) && !is.matrix(matG) && !inherits(matG, "Matrix"))) {
567 |     matG <- .convert_geneset_to_matrix(matG, background=rownames(X), 
568 |                                        min.genes=min.genes, max.genes=max.genes)
569 |   }
570 |   
571 |   if (is.null(aucMaxRank)) {
572 |     aucMaxRank <- ceiling(0.05*nrow(X))
573 |   }
574 |   
575 |   rX <- colranks(X, ties.method = "average")
576 |   ww <- 1.08*pmax((rX - (max(rX) - aucMaxRank)) / aucMaxRank, 0)
577 |   gsetX <- plaid(ww, matG, stats = "mean")
578 |   return(gsetX)
579 | }
580 | 
581 | #' Fast approximation of GSVA
582 | #'
583 | #' @description Calculates single-sample enrichment GSVA (Hänzelmann
584 | #'   et al., 2013) using plaid back-end. The computation is
585 | #'   10-100x faster than the original code.
586 | #'
587 | #' @details Computing the GSVA score requires to compute the CDF of
588 | #'   the expression matrix, ranking and scoring the genesets. We have
589 | #'   wrapped this in a single convenience function.
590 | #'
591 | #' We have extensively compared the results of `replaid.gsva` and
592 | #' from the original `GSVA` R package and we showed good concordance
593 | #' of results in the score, logFC and p-values.
594 | #'
595 | #' In the original formulation, GSVA uses an emperical CDF to
596 | #' transform expression of each feature to a (0;1) relative expression
597 | #' value. For efficiency reasons, this is here approximated by a
598 | #' z-transform (center+scale) of each row.
599 | #' 
600 | #' @param X Gene or protein expression matrix. Generally log
601 | #'   transformed. See details. Genes on rows, samples on columns.
602 | #'   Also accepts SummarizedExperiment or SingleCellExperiment objects.
603 | #' @param matG Gene sets sparse matrix. Genes on rows, gene sets on
604 | #'   columns. Also accepts BiocSet objects or GMT lists.
605 | #' @param tau Rank weight parameter (see GSVA publication). Default
606 | #'   tau=0.
607 | #' @param rowtf Row transformation method ("z" or "ecdf"). Default "z".
608 | #' @param assay Character: assay name for Bioconductor objects. Default "logcounts".
609 | #' @param min.genes Integer: minimum genes per gene set. Default 5.
610 | #' @param max.genes Integer: maximum genes per gene set. Default 500.
611 | #' 
612 | #' @return Matrix of single-sample GSVA enrichment scores.
613 | #'   Gene sets on rows, samples on columns.
614 | #'
615 | #' @examples
616 | #' # Create example expression matrix
617 | #' set.seed(123)
618 | #' X <- matrix(rnorm(500), nrow = 50, ncol = 10)
619 | #' rownames(X) <- paste0("GENE", 1:50)
620 | #' colnames(X) <- paste0("Sample", 1:10)
621 | #' 
622 | #' # Create example gene sets
623 | #' gmt <- list(
624 | #'   "Pathway1" = paste0("GENE", 1:15),
625 | #'   "Pathway2" = paste0("GENE", 10:25)
626 | #' )
627 | #' matG <- gmt2mat(gmt)
628 | #' 
629 | #' # Compute GSVA scores
630 | #' scores <- replaid.gsva(X, matG)
631 | #' print(scores[1:2, 1:5])
632 | #'
633 | #' @export
634 | replaid.gsva <- function(X, matG, tau = 0, rowtf = c("z", "ecdf")[1], assay="logcounts", min.genes=5, max.genes=500) {
635 |   ## Auto-detect and convert Bioconductor objects
636 |   if (inherits(X, "SummarizedExperiment") || inherits(X, "SingleCellExperiment")) {
637 |     X <- .extract_expression_matrix(X, assay=assay, log.transform=FALSE)
638 |   }
639 |   
640 |   if (inherits(matG, "BiocSet") || (is.list(matG) && !is.matrix(matG) && !inherits(matG, "Matrix"))) {
641 |     matG <- .convert_geneset_to_matrix(matG, background=rownames(X), 
642 |                                        min.genes=min.genes, max.genes=max.genes)
643 |   }
644 |   
645 |   rowtf <- rowtf[1]
646 | 
647 |   if(rowtf == "z") {
648 |     ## Faster approximation of relative activation
649 |     zX <- (X - Matrix::rowMeans(X)) / (1e-8 + mat.rowsds(X))
650 |   } else if(rowtf=='ecdf') {
651 |     ## this implements original ECDF idea
652 |     zX <- t(apply(X,1,function(x) ecdf(x)(x))) 
653 |   } else {
654 |     stop("unknown row transform",rowtf)
655 |   }
656 | 
657 |   rX <- colranks(zX, signed = TRUE, ties.method = "average")
658 |   rX <- rX / max(abs(rX))
659 |   if(tau > 0) {
660 |     ## Note: This is not exactly like original formula. Not sure how
661 |     ## to efficiently implement original rank weighting
662 |     rX <- sign(rX) * abs(rX)^(1 + tau)
663 |   }
664 |   dimnames(rX) <- dimnames(X)
665 |   gsetX <- plaid(rX, matG)
666 | 
667 |   return(gsetX)
668 | 
669 | }
670 | 
671 | #' Calculate row standard deviations for matrix
672 | #'
673 | #' @param X Input matrix (can be sparse or dense)
674 | #'
675 | #' @return Vector of row standard deviations.
676 | #'
677 | mat.rowsds <- function(X) {
678 |   if(inherits(X,"CsparseMatrix"))
679 |     return(sparseMatrixStats::rowSds(X))
680 |   sdx <- matrixStats::rowSds(X)
681 |   return(sdx)
682 | }
683 | 
684 | ##----------------------------------------------------------------
685 | ##-------------------- UTILITIES ---------------------------------
686 | ##----------------------------------------------------------------
687 | 
688 | #' Normalize column medians of matrix
689 | #'
690 | #' This function normalizes the column medians of matrix x. It calls
691 | #' optimized functions from the matrixStats package.
692 | #'
693 | #' @param x Input matrix
694 | #' @param ignore.zero Logical indicating whether to ignore zeros to
695 | #'   exclude for median calculation
696 | #'
697 | #' @return Matrix with normalized column medians.
698 | #'
699 | #' @examples
700 | #' # Create example matrix
701 | #' set.seed(123)
702 | #' x <- matrix(rnorm(100), nrow = 10, ncol = 10)
703 | #' x[1:3, 1:3] <- 0  # Add some zeros
704 | #' 
705 | #' # Normalize medians
706 | #' x_norm <- normalize_medians(x)
707 | #' head(x_norm)
708 | #'
709 | #' @export
710 | normalize_medians <- function(x, ignore.zero = NULL) {
711 | 
712 |   if(is.null(ignore.zero))
713 |     ignore.zero <- (min(x,na.rm = TRUE) == 0)
714 | 
715 |   x <- as.matrix(x)
716 | 
717 |   if(ignore.zero) {
718 |     zx <- x
719 |     zx[Matrix::which(x==0)] <- NA
720 |     medx <- matrixStats::colMedians(zx, na.rm = TRUE)    
721 |     medx[is.na(medx)] <- 0
722 |   } else {
723 |     medx <- matrixStats::colMedians(x, na.rm = TRUE)    
724 |   }
725 | 
726 |   nx <- sweep(x, 2, medx, '-') + mean(medx, na.rm = TRUE)
727 |   return(nx)
728 |   
729 | }
730 | 
731 | #' Compute columnwise ranks of matrix
732 | #'
733 | #' Computes columnwise rank of matrix. Can be sparse. Tries to call
734 | #' optimized functions from Rfast or matrixStats.
735 | #'
736 | #' @param X Input matrix
737 | #' @param sparse Logical indicating to use sparse methods
738 | #' @param signed Logical indicating using signed ranks
739 | #' @param keep.zero Logical indicating whether to keep zero as ranked zero
740 | #' @param ties.method Character Choice of ties.method
741 | #' 
742 | #' @return Matrix of columnwise ranks with same dimensions as input.
743 | #'
744 | #' @examples
745 | #' # Create example matrix
746 | #' set.seed(123)
747 | #' X <- matrix(rnorm(100), nrow = 10, ncol = 10)
748 | #' rownames(X) <- paste0("Gene", 1:10)
749 | #' colnames(X) <- paste0("Sample", 1:10)
750 | #' 
751 | #' # Compute column ranks
752 | #' ranks <- colranks(X)
753 | #' print(ranks[1:5, 1:5])
754 | #' 
755 | #' # Compute signed ranks
756 | #' signed_ranks <- colranks(X, signed = TRUE)
757 | #' print(signed_ranks[1:5, 1:5])
758 | #'
759 | #' @export
760 | colranks <- function(X,
761 |                      sparse = NULL,
762 |                      signed = FALSE,
763 |                      keep.zero = FALSE,
764 |                      ties.method = "average") {
765 | 
766 |   if(is.null(sparse))
767 |     sparse <- inherits(X,"CsparseMatrix")
768 | 
769 |   if(sparse) {
770 |     X <- methods::as(X, "CsparseMatrix")
771 |     if(keep.zero) {
772 |       rX <- sparse_colranks(X, signed = signed, ties.method = ties.method)
773 |     } else {
774 |       if(signed) {
775 |         sign.X <- sign(X)
776 |         abs.rX <- Matrix::t(sparseMatrixStats::colRanks(abs(X), ties.method = ties.method))
777 |         rX <- abs.rX * sign.X
778 |       } else {
779 |         rX <- Matrix::t(sparseMatrixStats::colRanks(X, ties.method = ties.method))
780 |       }
781 |     }
782 |   } else {
783 |     if(signed) {
784 |       sign.X <- sign(X)
785 |       abs.rX <- Matrix::t(matrixStats::colRanks(as.matrix(abs(X)), ties.method = ties.method))
786 |       rX <- sign.X * abs.rX
787 |     } else {
788 |       rX <- Matrix::t(matrixStats::colRanks(as.matrix(X), ties.method = ties.method))
789 |     }
790 |   }
791 | 
792 |   return(rX)
793 | 
794 | }
795 | 
796 | #' Compute columm ranks for sparse matrix. Internally used by colranks()
797 | #'
798 | #' @param X Input matrix
799 | #' @param signed Logical: use or not signed ranks
800 | #' @param ties.method Character Choice of ties.method
801 | #' 
802 | #' @return Sparse matrix of columnwise ranks with same dimensions as input.
803 | #'
804 | sparse_colranks <- function(X, signed = FALSE, ties.method = "average") {
805 |   ## https://stackoverflow.com/questions/41772943
806 |   X <- methods::as(X, "CsparseMatrix")
807 |   n <- diff(X@p)  ## number of non-zeros per column
808 |   lst <- split(X@x, rep.int(seq_len(ncol(X)), n))  ## columns to list
809 |   ## column-wise ranking and result collapsing
810 |   if(signed) {
811 |     lst.sign <- lapply(lst, sign)
812 |     lst.rnk  <- lapply(lst, function(x) rank(abs(x),ties.method = ties.method))
813 |     rnk <- unlist(mapply('*', lst.sign, lst.rnk, SIMPLIFY = FALSE))  
814 |   } else {
815 |     rnk <- unlist(lapply(lst, rank, ties.method = ties.method))  
816 |   }
817 | 
818 |   rX <- X  ## copy sparse matrix
819 |   rX@x <- rnk  ## replace non-zero elements with rank
820 | 
821 |   return(rX)
822 | 
823 | }
824 | 
825 | ##-------------------------------------------------------------
826 | ##------------------ end of file ------------------------------
827 | ##-------------------------------------------------------------
828 | 


--------------------------------------------------------------------------------