├── .github ├── .gitignore └── workflows │ ├── pkgdown.yaml │ └── test-coverage.yaml ├── vignettes ├── .gitignore ├── fig1.png ├── fig2.png ├── fig3.png ├── fig4.png ├── 02_compare-vignette.Rmd └── 01_plaid-vignette.Rmd ├── inst ├── logo │ └── logo.png ├── extdata │ └── pbmc3k-50cells.rda ├── scripts │ ├── hallmark.txt │ └── pbmc3k-50cells.R └── CITATION ├── man ├── figures │ └── logo.png ├── mat.rowsds.Rd ├── gset_ttest.Rd ├── matrix_onesample_ttest.Rd ├── sparse_colranks.Rd ├── chunked_crossprod.Rd ├── cor_sparse_matrix.Rd ├── matrix_metap.Rd ├── fc_ttest.Rd ├── fc_ztest.Rd ├── normalize_medians.Rd ├── gset_averageCLR.Rd ├── mat2gmt.Rd ├── read.gmt.Rd ├── colranks.Rd ├── write.gmt.Rd ├── gmt2mat.Rd ├── gset.rankcor.Rd ├── replaid.sing.Rd ├── replaid.aucell.Rd ├── replaid.ucell.Rd ├── replaid.ssgsea.Rd ├── replaid.gsva.Rd ├── replaid.scse.Rd ├── dualGSEA.Rd └── plaid.Rd ├── .Rbuildignore ├── dev ├── environment.yml ├── extdata.R ├── Makefile ├── make_package.R └── test.R ├── tests ├── testthat.R └── testthat │ ├── test-bioc-utils.R │ ├── test-gmt-utils.R │ ├── test-plaid.R │ └── test-stats.R ├── NEWS ├── .gitignore ├── NAMESPACE ├── DESCRIPTION ├── README.md └── R ├── bioc-utils.R ├── gmt-utils.R ├── stats.R └── plaid.R /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /inst/logo/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigomics/plaid/HEAD/inst/logo/logo.png -------------------------------------------------------------------------------- /vignettes/fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigomics/plaid/HEAD/vignettes/fig1.png -------------------------------------------------------------------------------- /vignettes/fig2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigomics/plaid/HEAD/vignettes/fig2.png -------------------------------------------------------------------------------- /vignettes/fig3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigomics/plaid/HEAD/vignettes/fig3.png -------------------------------------------------------------------------------- /vignettes/fig4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigomics/plaid/HEAD/vignettes/fig4.png -------------------------------------------------------------------------------- /man/figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigomics/plaid/HEAD/man/figures/logo.png -------------------------------------------------------------------------------- /inst/extdata/pbmc3k-50cells.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigomics/plaid/HEAD/inst/extdata/pbmc3k-50cells.rda -------------------------------------------------------------------------------- /inst/scripts/hallmark.txt: -------------------------------------------------------------------------------- 1 | hallmark.gmt data downloaded from: 2 | 3 | https://www.gsea-msigdb.org/gsea/msigdb/human/genesets.jsp?collection=H -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^LICENSE$ 2 | ^README\.Rmd$ 3 | ^Makefile$ 4 | ^dev$ 5 | ^[.].*$ 6 | ^.*/[.].*$ 7 | ^Meta$ 8 | ^doc$ 9 | ^experiments$ 10 | ^plaid.BiocCheck$ 11 | ^_pkgdown\.yml$ 12 | ^docs$ 13 | ^pkgdown$ 14 | ^\.github$ 15 | -------------------------------------------------------------------------------- /dev/environment.yml: -------------------------------------------------------------------------------- 1 | name: plaid-env 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - r-base=4.3 8 | - r-devtools 9 | - r-remotes 10 | - r-matrix 11 | - r-matrixstats 12 | - r-rfast 13 | - bioconductor-sparsematrixstats 14 | - r-qlcmatrix 15 | - bioconductor-biocstyle 16 | - r-knitr 17 | - r-rmarkdown 18 | - r-testthat 19 | -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | bibentry( 2 | bibtype = "Article", 3 | title = "PLAID: ultrafast single-sample gene set enrichment scoring", 4 | author = "Zito, Antonino and Escrib\u00e0 Montagut, Xavier and Scorici, Gabriela and Martinelli, Axel and Akhmedov, Murodzhon and Kwee, Ivo", 5 | journal = "Bioinformatics", 6 | year = "2025", 7 | doi = "10.1093/bioinformatics/btaf621" 8 | ) 9 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | # This file is part of the standard setup for testthat. 2 | # It is recommended that you do not modify it. 3 | # 4 | # Where should you do additional test configuration? 5 | # Learn more about the roles of various files in: 6 | # * https://r-pkgs.org/testing-design.html#sec-tests-files-overview 7 | # * https://testthat.r-lib.org/articles/special-files.html 8 | 9 | library(testthat) 10 | library(plaid) 11 | 12 | test_check("plaid") 13 | -------------------------------------------------------------------------------- /man/mat.rowsds.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plaid.R 3 | \name{mat.rowsds} 4 | \alias{mat.rowsds} 5 | \title{Calculate row standard deviations for matrix} 6 | \usage{ 7 | mat.rowsds(X) 8 | } 9 | \arguments{ 10 | \item{X}{Input matrix (can be sparse or dense)} 11 | } 12 | \value{ 13 | Vector of row standard deviations. 14 | } 15 | \description{ 16 | Calculate row standard deviations for matrix 17 | } 18 | -------------------------------------------------------------------------------- /dev/extdata.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | library(Seurat) 4 | library(SeuratData) 5 | SeuratData::InstallData("pbmc3k") 6 | data("pbmc3k.final") 7 | pbmc3k.final <- Seurat::UpdateSeuratObject(pbmc3k.final) 8 | X <- pbmc3k.final[['RNA']]@data 9 | y <- pbmc3k.final$seurat_annotations 10 | table(y) 11 | sel <- c(head(which(y=="B"),25), head(which(y=="Naive CD4 T"),25)) 12 | X <- X[,sel] 13 | X <- X[rowSums(X)>0,] 14 | celltype <- c(rep("B",25),rep("T",25)) 15 | save(X, celltype, file="../inst/extdata/pbmc3k-50cells.rda") 16 | -------------------------------------------------------------------------------- /inst/scripts/pbmc3k-50cells.R: -------------------------------------------------------------------------------- 1 | library(Seurat) 2 | library(SeuratData) 3 | SeuratData::InstallData("pbmc3k") 4 | data("pbmc3k.final") 5 | pbmc3k.final <- Seurat::UpdateSeuratObject(pbmc3k.final) 6 | X <- pbmc3k.final[['RNA']]@data 7 | y <- pbmc3k.final$seurat_annotations 8 | table(y) 9 | sel <- c(head(which(y=="B"),25), head(which(y=="Naive CD4 T"),25)) 10 | X <- X[,sel] 11 | X <- X[rowSums(X)>0,] 12 | celltype <- c(rep("B",25),rep("T",25)) 13 | save(X, celltype, file="../inst/extdata/pbmc3k-50cells.rda") 14 | -------------------------------------------------------------------------------- /dev/Makefile: -------------------------------------------------------------------------------- 1 | build: doc 2 | R -e "devtools::build()" 3 | 4 | doc: vignettes/*.Rmd R/*.R 5 | R -e "devtools::document()" 6 | R -e "devtools::build_vignettes()" 7 | 8 | install: 9 | R CMD INSTALL . 10 | 11 | check: clean 12 | R -e 'devtools::check()' 13 | 14 | biocheck: clean 15 | R -e 'BiocCheck::BiocCheck(".")' 16 | 17 | test: 18 | R -e "devtools::test()" 19 | 20 | clean: 21 | rm -f `find . -name '.\#*' -o -name '\#*' -o -name '*~' -printf '"%p" '` 22 | 23 | echo: 24 | echo `find . -name '.\#*' -o -name '\#*' -o -name '*~' -printf '"%p" '` 25 | 26 | FORCE: ; 27 | -------------------------------------------------------------------------------- /man/gset_ttest.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stats.R 3 | \name{gset_ttest} 4 | \alias{gset_ttest} 5 | \title{Perform t-test on gene set scores} 6 | \usage{ 7 | gset_ttest(gsetX, y) 8 | } 9 | \arguments{ 10 | \item{gsetX}{Matrix of gene set scores with gene sets on rows and samples on columns} 11 | 12 | \item{y}{Binary vector (0/1) indicating group membership} 13 | } 14 | \value{ 15 | Data frame with columns: diff (difference in means), statistic (t-statistic), 16 | pvalue (p-value), and other t-test results. 17 | } 18 | \description{ 19 | Perform t-test on gene set scores 20 | } 21 | -------------------------------------------------------------------------------- /man/matrix_onesample_ttest.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stats.R 3 | \name{matrix_onesample_ttest} 4 | \alias{matrix_onesample_ttest} 5 | \title{Perform one-sample t-test on matrix with gene sets} 6 | \usage{ 7 | matrix_onesample_ttest(Fm, G) 8 | } 9 | \arguments{ 10 | \item{Fm}{Vector of feature values (e.g., fold changes)} 11 | 12 | \item{G}{Sparse matrix of gene sets with genes on rows and gene sets on columns} 13 | } 14 | \value{ 15 | List containing mean, t-statistic, and p-value matrices. 16 | } 17 | \description{ 18 | Perform one-sample t-test on matrix with gene sets 19 | } 20 | -------------------------------------------------------------------------------- /man/sparse_colranks.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plaid.R 3 | \name{sparse_colranks} 4 | \alias{sparse_colranks} 5 | \title{Compute columm ranks for sparse matrix. Internally used by colranks()} 6 | \usage{ 7 | sparse_colranks(X, signed = FALSE, ties.method = "average") 8 | } 9 | \arguments{ 10 | \item{X}{Input matrix} 11 | 12 | \item{signed}{Logical: use or not signed ranks} 13 | 14 | \item{ties.method}{Character Choice of ties.method} 15 | } 16 | \value{ 17 | Sparse matrix of columnwise ranks with same dimensions as input. 18 | } 19 | \description{ 20 | Compute columm ranks for sparse matrix. Internally used by colranks() 21 | } 22 | -------------------------------------------------------------------------------- /man/chunked_crossprod.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plaid.R 3 | \name{chunked_crossprod} 4 | \alias{chunked_crossprod} 5 | \title{Chunked computation of cross product} 6 | \usage{ 7 | chunked_crossprod(x, y, chunk = NULL) 8 | } 9 | \arguments{ 10 | \item{x}{Matrix First matrix for multiplication. Can be sparse.} 11 | 12 | \item{y}{Matrix Second matrix for multiplication. Can be sparse.} 13 | 14 | \item{chunk}{Integer Chunk size (max number of columns) for computation.} 15 | } 16 | \value{ 17 | Matrix. Result of matrix cross product. 18 | } 19 | \description{ 20 | Compute crossprod (t(x) \%*\% y) for very large y by computing in 21 | chunks. 22 | } 23 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | Changes in version 0.99.0 2 | ------------------------- 3 | 4 | INITIAL RELEASE 5 | 6 | * plaid: ultra-fast single-sample enrichment scoring using rank-based 7 | statistics 8 | 9 | * replaid methods: fast implementations of popular enrichment methods 10 | (ssGSEA, GSVA, singscore, UCell, AUCell, scSE) 11 | 12 | * dualGSEA: statistical testing for differential enrichment with dualGSEA 13 | 14 | * Full Bioconductor integration with SummarizedExperiment, 15 | SingleCellExperiment, and BiocSet support 16 | 17 | * GMT utilities for reading, writing, and converting gene set files 18 | 19 | * Performance optimizations including sparse matrix support and parallel 20 | processing 21 | -------------------------------------------------------------------------------- /man/cor_sparse_matrix.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stats.R 3 | \name{cor_sparse_matrix} 4 | \alias{cor_sparse_matrix} 5 | \title{Calculate sparse correlation matrix handling missing values} 6 | \usage{ 7 | cor_sparse_matrix(G, mat) 8 | } 9 | \arguments{ 10 | \item{G}{Sparse matrix containing gene sets} 11 | 12 | \item{mat}{Matrix of values} 13 | } 14 | \value{ 15 | Correlation matrix between G and mat 16 | } 17 | \description{ 18 | Calculate sparse correlation matrix handling missing values 19 | } 20 | \details{ 21 | If mat has no missing values, calculates correlation directly using corSparse. 22 | Otherwise computes column-wise correlations only using non-missing values. 23 | } 24 | -------------------------------------------------------------------------------- /man/matrix_metap.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stats.R 3 | \name{matrix_metap} 4 | \alias{matrix_metap} 5 | \title{Matrix version for combining p-values using fisher or stouffer 6 | method. Much faster than doing metap::sumlog() and metap::sumz()} 7 | \usage{ 8 | matrix_metap(plist, method = "stouffer") 9 | } 10 | \arguments{ 11 | \item{plist}{List of p-value vectors or matrix of p-values} 12 | 13 | \item{method}{Method for combining p-values ("fisher"/"sumlog" or "stouffer"/"sumz")} 14 | } 15 | \value{ 16 | Vector of combined p-values. 17 | } 18 | \description{ 19 | Matrix version for combining p-values using fisher or stouffer 20 | method. Much faster than doing metap::sumlog() and metap::sumz() 21 | } 22 | -------------------------------------------------------------------------------- /man/fc_ttest.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stats.R 3 | \name{fc_ttest} 4 | \alias{fc_ttest} 5 | \title{T-test statistical testing of differentially enrichment} 6 | \usage{ 7 | fc_ttest(fc, G, sort.by = "pvalue") 8 | } 9 | \arguments{ 10 | \item{fc}{Vector of logFC values} 11 | 12 | \item{G}{Sparse matrix of gene sets. Non-zero entry indicates 13 | gene/feature is part of gene sets. Features on rows, gene sets on 14 | columns.} 15 | 16 | \item{sort.by}{Column name to sort results by ("pvalue", "gsetFC", or "none")} 17 | } 18 | \value{ 19 | Data frame with columns: gsetFC (gene set fold change), 20 | pvalue (p-value from one-sample t-test), and qvalue (FDR-adjusted p-value). 21 | } 22 | \description{ 23 | This function performs statistical testing for differential 24 | enrichment using plaid 25 | } 26 | -------------------------------------------------------------------------------- /man/fc_ztest.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stats.R 3 | \name{fc_ztest} 4 | \alias{fc_ztest} 5 | \title{Z-test statistical testing of differentially enrichment} 6 | \usage{ 7 | fc_ztest(fc, G, zmat = FALSE, alpha = 0.5) 8 | } 9 | \arguments{ 10 | \item{fc}{Vector of logFC values} 11 | 12 | \item{G}{Sparse matrix of gene sets. Non-zero entry indicates 13 | gene/feature is part of gene sets. Features on rows, gene sets on 14 | columns.} 15 | 16 | \item{zmat}{Logical indicating to return z-matrix} 17 | 18 | \item{alpha}{Scalar weight for SD estimation. Default 0.5.} 19 | } 20 | \value{ 21 | List with element: z_statistic (z-statistic from one-sample z-test), 22 | p_value (p-value from one-sample z-test), and zmat (z-matrix). 23 | } 24 | \description{ 25 | This function performs statistical testing for differential 26 | enrichment using plaid 27 | } 28 | -------------------------------------------------------------------------------- /man/normalize_medians.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plaid.R 3 | \name{normalize_medians} 4 | \alias{normalize_medians} 5 | \title{Normalize column medians of matrix} 6 | \usage{ 7 | normalize_medians(x, ignore.zero = NULL) 8 | } 9 | \arguments{ 10 | \item{x}{Input matrix} 11 | 12 | \item{ignore.zero}{Logical indicating whether to ignore zeros to 13 | exclude for median calculation} 14 | } 15 | \value{ 16 | Matrix with normalized column medians. 17 | } 18 | \description{ 19 | This function normalizes the column medians of matrix x. It calls 20 | optimized functions from the matrixStats package. 21 | } 22 | \examples{ 23 | # Create example matrix 24 | set.seed(123) 25 | x <- matrix(rnorm(100), nrow = 10, ncol = 10) 26 | x[1:3, 1:3] <- 0 # Add some zeros 27 | 28 | # Normalize medians 29 | x_norm <- normalize_medians(x) 30 | head(x_norm) 31 | 32 | } 33 | -------------------------------------------------------------------------------- /dev/make_package.R: -------------------------------------------------------------------------------- 1 | ## Followings steps of "R Packages" book 2 | ## 3 | 4 | library(usethis) 5 | library(devtools) 6 | 7 | setwd("~/Playground/projects/plaid") 8 | create_package("~/Playground/projects/plaid") 9 | use_git() 10 | 11 | use_r("plaid") 12 | use_gpl3_license() 13 | 14 | usethis::use_vignette("plaid") 15 | 16 | use_testthat() 17 | use_test("plaid") 18 | 19 | ## List package dependencies 20 | use_package("Matrix") 21 | use_package("Rfast") 22 | use_package("matrixStats") 23 | use_package("methods") 24 | use_package("sparseMatrixStats") 25 | use_package("metap") 26 | 27 | ## For vignette 28 | use_package("BiocStyle", type = "Suggests") 29 | #use_package("Seurat", type = "Suggests") 30 | #use_package("SeuratData", type = "Suggests") 31 | 32 | use_readme_md() 33 | 34 | use_logo("inst/logo/logo.png") 35 | 36 | use_build_ignore(c("dev")) 37 | 38 | check() 39 | install() 40 | 41 | ## Reload 42 | load_all() 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /man/gset_averageCLR.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stats.R 3 | \name{gset_averageCLR} 4 | \alias{gset_averageCLR} 5 | \title{Compute geneset expression as the average log-ration of genes in 6 | the geneset. Requires log-expression matrix X and (sparse) geneset 7 | matrix matG.} 8 | \usage{ 9 | gset_averageCLR(X, matG, center = TRUE) 10 | } 11 | \arguments{ 12 | \item{X}{Log-expression matrix with genes on rows and samples on columns} 13 | 14 | \item{matG}{Sparse gene set matrix with genes on rows and gene sets on columns} 15 | 16 | \item{center}{Logical indicating whether to center the results} 17 | } 18 | \value{ 19 | Matrix of gene set expression scores with gene sets on rows and samples on columns. 20 | } 21 | \description{ 22 | Compute geneset expression as the average log-ration of genes in 23 | the geneset. Requires log-expression matrix X and (sparse) geneset 24 | matrix matG. 25 | } 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | .RDataTmp 8 | 9 | # User-specific files 10 | .Ruserdata 11 | 12 | # Example code in package build process 13 | *-Ex.R 14 | 15 | # Output files from R CMD build 16 | /*.tar.gz 17 | 18 | # Output files from R CMD check 19 | /*.Rcheck/ 20 | 21 | # RStudio files 22 | .Rproj.user/ 23 | 24 | # produced vignettes 25 | vignettes/*.html 26 | vignettes/*.pdf 27 | 28 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 29 | .httr-oauth 30 | 31 | # knitr and R markdown default cache directories 32 | *_cache/ 33 | /cache/ 34 | /Meta/ 35 | /doc/ 36 | 37 | # Temporary files created by R markdown 38 | *.utf8.md 39 | *.knit.md 40 | 41 | # R Environment Variables 42 | .Renviron 43 | 44 | # pkgdown site 45 | docs/ 46 | 47 | # translation temp files 48 | po/*~ 49 | 50 | # RStudio Connect folder 51 | rsconnect/ 52 | 53 | # Backup files 54 | *~ 55 | *.BAK 56 | *.tmp 57 | *.save 58 | \#* 59 | 60 | # Output 61 | *.out 62 | *.Rout 63 | *.tmp 64 | *.pdf 65 | docs 66 | -------------------------------------------------------------------------------- /man/mat2gmt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gmt-utils.R 3 | \name{mat2gmt} 4 | \alias{mat2gmt} 5 | \title{Convert Binary Matrix to GMT} 6 | \usage{ 7 | mat2gmt(mat) 8 | } 9 | \arguments{ 10 | \item{mat}{Matrix with non-zero entries representing genes in each gene set. 11 | Rows represent genes and columns represent gene sets.} 12 | } 13 | \value{ 14 | A list of vector representing each gene set. Each list 15 | element correspond to a gene set and is a vector of genes 16 | } 17 | \description{ 18 | Convert binary matrix to a GMT (Gene Matrix Transposed) list. 19 | The binary matrix indicates presence or absence of genes in each gene set. 20 | Rows represent genes and columns represent gene sets. 21 | } 22 | \examples{ 23 | # Create example binary matrix 24 | mat <- matrix(0, nrow = 6, ncol = 3) 25 | rownames(mat) <- paste0("GENE", 1:6) 26 | colnames(mat) <- paste0("Pathway", 1:3) 27 | mat[1:3, 1] <- 1 # Pathway1: GENE1, GENE2, GENE3 28 | mat[c(2,4,5), 2] <- 1 # Pathway2: GENE2, GENE4, GENE5 29 | mat[c(1,5,6), 3] <- 1 # Pathway3: GENE1, GENE5, GENE6 30 | 31 | # Convert to GMT list 32 | gmt <- mat2gmt(mat) 33 | print(gmt) 34 | 35 | } 36 | -------------------------------------------------------------------------------- /man/read.gmt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gmt-utils.R 3 | \name{read.gmt} 4 | \alias{read.gmt} 5 | \title{Read GMT File} 6 | \usage{ 7 | read.gmt(gmt.file, dir = NULL, add.source = FALSE, nrows = -1) 8 | } 9 | \arguments{ 10 | \item{gmt.file}{Path to GMT file.} 11 | 12 | \item{dir}{(Optional) The directory where the GMT file is located.} 13 | 14 | \item{add.source}{(optional) Include the source information in the gene sets' names.} 15 | 16 | \item{nrows}{(optional) Number of rows to read from the GMT file.} 17 | } 18 | \value{ 19 | A list of gene sets: each gene set is represented as a character vector of gene names. 20 | } 21 | \description{ 22 | Read data from a GMT file (Gene Matrix Transposed). 23 | The GMT format is commonly used to store gene sets or gene annotations. 24 | } 25 | \examples{ 26 | \donttest{ 27 | # Read GMT file (requires file to exist) 28 | gmt_file <- system.file("extdata", "hallmarks.gmt", package = "plaid") 29 | if (file.exists(gmt_file)) { 30 | gmt <- read.gmt(gmt_file) 31 | print(names(gmt)) 32 | print(head(gmt[[1]])) 33 | 34 | # Read with source information 35 | gmt_with_source <- read.gmt(gmt_file, add.source = TRUE) 36 | print(head(names(gmt_with_source))) 37 | } 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /man/colranks.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plaid.R 3 | \name{colranks} 4 | \alias{colranks} 5 | \title{Compute columnwise ranks of matrix} 6 | \usage{ 7 | colranks( 8 | X, 9 | sparse = NULL, 10 | signed = FALSE, 11 | keep.zero = FALSE, 12 | ties.method = "average" 13 | ) 14 | } 15 | \arguments{ 16 | \item{X}{Input matrix} 17 | 18 | \item{sparse}{Logical indicating to use sparse methods} 19 | 20 | \item{signed}{Logical indicating using signed ranks} 21 | 22 | \item{keep.zero}{Logical indicating whether to keep zero as ranked zero} 23 | 24 | \item{ties.method}{Character Choice of ties.method} 25 | } 26 | \value{ 27 | Matrix of columnwise ranks with same dimensions as input. 28 | } 29 | \description{ 30 | Computes columnwise rank of matrix. Can be sparse. Tries to call 31 | optimized functions from Rfast or matrixStats. 32 | } 33 | \examples{ 34 | # Create example matrix 35 | set.seed(123) 36 | X <- matrix(rnorm(100), nrow = 10, ncol = 10) 37 | rownames(X) <- paste0("Gene", 1:10) 38 | colnames(X) <- paste0("Sample", 1:10) 39 | 40 | # Compute column ranks 41 | ranks <- colranks(X) 42 | print(ranks[1:5, 1:5]) 43 | 44 | # Compute signed ranks 45 | signed_ranks <- colranks(X, signed = TRUE) 46 | print(signed_ranks[1:5, 1:5]) 47 | 48 | } 49 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(colranks) 4 | export(dualGSEA) 5 | export(gmt2mat) 6 | export(gset.rankcor) 7 | export(mat2gmt) 8 | export(normalize_medians) 9 | export(plaid) 10 | export(read.gmt) 11 | export(replaid.aucell) 12 | export(replaid.gsva) 13 | export(replaid.scse) 14 | export(replaid.sing) 15 | export(replaid.ssgsea) 16 | export(replaid.ucell) 17 | export(write.gmt) 18 | importFrom(BiocSet,es_elementset) 19 | importFrom(Matrix,Diagonal) 20 | importFrom(Matrix,Matrix) 21 | importFrom(Matrix,colScale) 22 | importFrom(Matrix,colSums) 23 | importFrom(Matrix,crossprod) 24 | importFrom(Matrix,rowMeans) 25 | importFrom(Matrix,rowSums) 26 | importFrom(Matrix,t) 27 | importFrom(Matrix,which) 28 | importFrom(Rfast,ttests) 29 | importFrom(SummarizedExperiment,assay) 30 | importFrom(SummarizedExperiment,assayNames) 31 | importFrom(matrixStats,colMedians) 32 | importFrom(matrixStats,colRanks) 33 | importFrom(methods,as) 34 | importFrom(methods,is) 35 | importFrom(parallel,mclapply) 36 | importFrom(qlcMatrix,corSparse) 37 | importFrom(sparseMatrixStats,colRanks) 38 | importFrom(stats,ecdf) 39 | importFrom(stats,p.adjust) 40 | importFrom(stats,pchisq) 41 | importFrom(stats,pnorm) 42 | importFrom(stats,pt) 43 | importFrom(stats,qnorm) 44 | importFrom(stats,var) 45 | importFrom(utils,head) 46 | importFrom(utils,read.csv) 47 | -------------------------------------------------------------------------------- /dev/test.R: -------------------------------------------------------------------------------- 1 | ##library("plaid") 2 | library(devtools) 3 | load_all() 4 | 5 | library(Seurat) 6 | library(SeuratData) 7 | data("pbmc3k.final") 8 | pbmc3k.final <- Seurat::UpdateSeuratObject(pbmc3k.final) 9 | X <- pbmc3k.final[['RNA']]@data 10 | dim(X) 11 | X <- do.call(cbind, rep(list(X),10)) 12 | 13 | gmt <- read.gmt(system.file("extdata", "hallmarks.gmt", package = "plaid")) 14 | matG <- gmt2mat(gmt) 15 | dim(matG) 16 | 17 | ## run plaid 18 | peakRAM::peakRAM( gsetX <- plaid(X, matG, normalize=TRUE) ) 19 | dim(gsetX) 20 | 21 | ## good practice to normalize median 22 | peakRAM::peakRAM( gsetX <- normalize_medians(gsetX) ) 23 | peakRAM::peakRAM( Rfast::colMedians(gsetX)) 24 | peakRAM::peakRAM( matrixStats::colMedians(gsetX)) 25 | peakRAM::peakRAM( apply(gsetX,2,stats::median,na.rm=TRUE)) 26 | 27 | 28 | ## do test 29 | celltype <- pbmc3k.final$seurat_annotations 30 | y <- (celltype == "B") 31 | res <- plaid.test(X, y, G=matG, gsetX=gsetX) 32 | head(res) 33 | 34 | pairs(res[,-1]) 35 | pairs(-log10(res[,-1])) 36 | 37 | head(res[order(res[,2]),1]) 38 | head(res[order(res[,3]),1]) 39 | head(res[order(res[,4]),1]) 40 | head(res[order(res[,5]),1]) 41 | 42 | fc <- rowMeans(X[,y==1]) - rowMeans(X[,y==0]) 43 | res.fgsea <- fgsea::fgsea( gmt, fc) 44 | head(res.fgsea) 45 | nes <- res.fgsea$NES 46 | p.nes <- res.fgsea$pval 47 | names(nes)=names(p.nes)=res.fgsea$pathway 48 | 49 | -------------------------------------------------------------------------------- /man/write.gmt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gmt-utils.R 3 | \name{write.gmt} 4 | \alias{write.gmt} 5 | \title{Write GMT File} 6 | \usage{ 7 | write.gmt(gmt, file, source = NA) 8 | } 9 | \arguments{ 10 | \item{gmt}{A list of gene sets in GMT format: each gene set is represented as a vector of gene names.} 11 | 12 | \item{file}{The file path to write the GMT file.} 13 | 14 | \item{source}{A character vector specifying the source of each gene set. 15 | If not provided, the names of the gene sets are used as the source.} 16 | } 17 | \value{ 18 | Does not return anything. 19 | } 20 | \description{ 21 | Write gene sets to GMT file (Gene Matrix Transposed). 22 | The GMT format is commonly used to store gene sets or gene annotations. 23 | } 24 | \examples{ 25 | # Create example GMT data 26 | gmt <- list( 27 | "Pathway1" = c("GENE1", "GENE2", "GENE3"), 28 | "Pathway2" = c("GENE2", "GENE4", "GENE5"), 29 | "Pathway3" = c("GENE1", "GENE5", "GENE6") 30 | ) 31 | 32 | \donttest{ 33 | # Write to GMT file (creates file in temp directory) 34 | temp_file <- tempfile(fileext = ".gmt") 35 | write.gmt(gmt, temp_file) 36 | 37 | # Write with custom source information 38 | temp_file2 <- tempfile(fileext = ".gmt") 39 | write.gmt(gmt, temp_file2, source = c("DB1", "DB2", "DB3")) 40 | 41 | # Clean up 42 | unlink(c(temp_file, temp_file2)) 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: plaid 2 | Title: PLAID ultrafast gene set enrichment scoring 3 | Version: 0.99.16 4 | Authors@R: c( 5 | person("Ivo", "Kwee", , "ivo.kwee@bigomics.ch", role = "aut", 6 | comment = c(ORCID = "0000-0002-2751-4218")), 7 | person("Antonino", "Zito", , "antonino.zito@bigomics.ch", role = "cre", 8 | comment = c(ORCID = "0000-0003-1931-984X")) 9 | ) 10 | Description: PLAID (Pathway Level Average Intensity Detection) is an ultra-fast method to compute single-sample enrichment scores for gene expression or proteomics data. For each sample, plaid computes the gene set score as the average intensity of the genes/proteins in the gene set. The output is a gene set score matrix suitable for further analyses. 11 | License: GPL-3 12 | URL: https://github.com/bigomics/plaid, https://bigomics.github.io/plaid/ 13 | BugReports: https://github.com/bigomics/plaid/issues 14 | Encoding: UTF-8 15 | Roxygen: list(markdown = TRUE) 16 | RoxygenNote: 7.3.3 17 | Suggests: 18 | BiocStyle, 19 | knitr, 20 | rmarkdown, 21 | testthat (>= 3.0.0) 22 | Config/testthat/edition: 3 23 | biocViews: GeneSetEnrichment, GeneExpression, Proteomics 24 | Depends: R (>= 4.3.3) 25 | Imports: 26 | Matrix, 27 | matrixStats, 28 | methods, 29 | parallel, 30 | Rfast, 31 | sparseMatrixStats, 32 | qlcMatrix, 33 | GSVA, 34 | fgsea, 35 | SummarizedExperiment, 36 | BiocSet, 37 | stats, 38 | utils 39 | VignetteBuilder: knitr 40 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | release: 8 | types: [published] 9 | workflow_dispatch: 10 | 11 | name: pkgdown.yaml 12 | 13 | permissions: read-all 14 | 15 | jobs: 16 | pkgdown: 17 | runs-on: ubuntu-latest 18 | # Only restrict concurrency for non-PR jobs 19 | concurrency: 20 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 21 | env: 22 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 23 | permissions: 24 | contents: write 25 | steps: 26 | - uses: actions/checkout@v4 27 | 28 | - uses: r-lib/actions/setup-pandoc@v2 29 | 30 | - uses: r-lib/actions/setup-r@v2 31 | with: 32 | use-public-rspm: true 33 | 34 | - uses: r-lib/actions/setup-r-dependencies@v2 35 | with: 36 | extra-packages: any::pkgdown, local::. 37 | needs: website 38 | 39 | - name: Build site 40 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 41 | shell: Rscript {0} 42 | 43 | - name: Deploy to GitHub pages 🚀 44 | if: github.event_name != 'pull_request' 45 | uses: JamesIves/github-pages-deploy-action@v4.5.0 46 | with: 47 | clean: false 48 | branch: gh-pages 49 | folder: docs 50 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: "0 0 * * *" 7 | 8 | name: test-coverage 9 | 10 | jobs: 11 | test-coverage: 12 | runs-on: ubuntu-latest 13 | env: 14 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 15 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | 20 | - uses: r-lib/actions/setup-r@v2 21 | with: 22 | use-public-rspm: true 23 | 24 | - uses: r-lib/actions/setup-r-dependencies@v2 25 | with: 26 | extra-packages: any::covr 27 | needs: coverage 28 | 29 | - name: Test coverage 30 | run: | 31 | covr::codecov( 32 | quiet = FALSE, 33 | clean = FALSE, 34 | install_path = file.path(Sys.getenv("RUNNER_TEMP"), "package"), 35 | token = Sys.getenv("CODECOV_TOKEN") 36 | ) 37 | shell: Rscript {0} 38 | 39 | - name: Show testthat output 40 | if: always() 41 | run: | 42 | ## -------------------------------------------------------------------- 43 | find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true 44 | shell: bash 45 | 46 | - name: Upload test results 47 | if: failure() 48 | uses: actions/upload-artifact@v4 49 | with: 50 | name: coverage-test-failures 51 | path: ${{ runner.temp }}/package -------------------------------------------------------------------------------- /man/gmt2mat.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gmt-utils.R 3 | \name{gmt2mat} 4 | \alias{gmt2mat} 5 | \title{Convert GMT to Binary Matrix} 6 | \usage{ 7 | gmt2mat( 8 | gmt, 9 | max.genes = -1, 10 | ntop = -1, 11 | sparse = TRUE, 12 | bg = NULL, 13 | use.multicore = TRUE 14 | ) 15 | } 16 | \arguments{ 17 | \item{gmt}{List representing the GMT file: each element is a character vector representing a gene set.} 18 | 19 | \item{max.genes}{Max number of genes to include in the binary matrix. Default = -1 to include all genes.} 20 | 21 | \item{ntop}{Number of top genes to consider for each gene set. Default = -1 to include all genes.} 22 | 23 | \item{sparse}{Logical: create a sparse matrix. Default \code{TRUE}. If \code{FALSE} creates a dense matrix.} 24 | 25 | \item{bg}{Character vector of background gene set. Default \code{NULL} to consider all unique genes.} 26 | 27 | \item{use.multicore}{Logical: use parallel processing ('parallel' R package). Default \code{TRUE}.} 28 | } 29 | \value{ 30 | A binary matrix representing the presence or absence of genes in each gene set. 31 | Rows correspond to genes, and columns correspond to gene sets. 32 | } 33 | \description{ 34 | Convert a GMT file (Gene Matrix Transposed) to a binary matrix, 35 | where rows represent genes and columns represent gene sets. 36 | The binary matrix indicates presence or absence of genes in a gene set. 37 | } 38 | \examples{ 39 | # Create example GMT data 40 | gmt <- list( 41 | "Pathway1" = c("GENE1", "GENE2", "GENE3"), 42 | "Pathway2" = c("GENE2", "GENE4", "GENE5"), 43 | "Pathway3" = c("GENE1", "GENE5", "GENE6") 44 | ) 45 | 46 | # Convert to binary matrix 47 | mat <- gmt2mat(gmt) 48 | print(mat) 49 | 50 | # Create dense matrix instead of sparse 51 | mat_dense <- gmt2mat(gmt, sparse = FALSE) 52 | print(mat_dense) 53 | } 54 | -------------------------------------------------------------------------------- /man/gset.rankcor.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stats.R 3 | \name{gset.rankcor} 4 | \alias{gset.rankcor} 5 | \title{Calculate gene set rank correlation} 6 | \usage{ 7 | gset.rankcor(rnk, gset, compute.p = FALSE, use.rank = TRUE) 8 | } 9 | \arguments{ 10 | \item{rnk}{Numeric vector or matrix of gene ranks, with genes as row names} 11 | 12 | \item{gset}{Numeric matrix of gene sets, with genes as row/column names} 13 | 14 | \item{compute.p}{Logical indicating whether to compute p-values} 15 | 16 | \item{use.rank}{Logical indicating whether to rank transform rnk before correlation} 17 | } 18 | \value{ 19 | Named list with components: 20 | \itemize{ 21 | \item rho - Matrix of correlation coefficients between rnk and gset 22 | \item p.value - Matrix of p-values for correlation (if compute.p = TRUE) 23 | \item q.value - Matrix of FDR adjusted p-values (if compute.p = TRUE) 24 | } 25 | } 26 | \description{ 27 | Compute rank correlation between a gene rank vector/matrix and gene sets 28 | } 29 | \details{ 30 | This function calculates sparse rank correlation between rnk and each 31 | column of gset using \code{qlcMatrix::corSparse()}. It handles missing values in 32 | rnk by computing column-wise correlations. 33 | 34 | P-values are computed from statistical distribution 35 | } 36 | \examples{ 37 | # Create example rank vector 38 | set.seed(123) 39 | ranks <- rnorm(100) 40 | names(ranks) <- paste0("GENE", 1:100) 41 | 42 | # Create example gene sets as sparse matrix 43 | gmt <- list( 44 | "Pathway1" = paste0("GENE", 1:20), 45 | "Pathway2" = paste0("GENE", 15:35), 46 | "Pathway3" = paste0("GENE", 30:50) 47 | ) 48 | genesets <- gmt2mat(gmt) 49 | 50 | # Calculate rank correlation 51 | result <- gset.rankcor(ranks, genesets, compute.p = TRUE) 52 | print(result$rho) 53 | print(result$p.value) 54 | 55 | } 56 | -------------------------------------------------------------------------------- /man/replaid.sing.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plaid.R 3 | \name{replaid.sing} 4 | \alias{replaid.sing} 5 | \title{Fast calculation of singscore} 6 | \usage{ 7 | replaid.sing(X, matG, assay = "logcounts", min.genes = 5, max.genes = 500) 8 | } 9 | \arguments{ 10 | \item{X}{Gene or protein expression matrix. Generally log 11 | transformed. See details. Genes on rows, samples on columns. 12 | Also accepts SummarizedExperiment or SingleCellExperiment objects.} 13 | 14 | \item{matG}{Gene sets sparse matrix. Genes on rows, gene sets on 15 | columns. Also accepts BiocSet objects or GMT lists.} 16 | 17 | \item{assay}{Character: assay name for Bioconductor objects. Default "logcounts".} 18 | 19 | \item{min.genes}{Integer: minimum genes per gene set. Default 5.} 20 | 21 | \item{max.genes}{Integer: maximum genes per gene set. Default 500.} 22 | } 23 | \value{ 24 | Matrix of single-sample singscore enrichment scores. 25 | Gene sets on rows, samples on columns. 26 | } 27 | \description{ 28 | Calculates single-sample enrichment singscore 29 | (Fouratan et al., 2018) using plaid back-end. The computation is 30 | 10-100x faster than the original code. 31 | } 32 | \details{ 33 | Computing the singscore requires to compute the ranks of 34 | the expression matrix. We have wrapped this in a single 35 | convenience function. 36 | 37 | We have extensively compared the results of \code{replaid.sing} and from 38 | the original \code{singscore} R package and we showed identical result 39 | in the score, logFC and p-values. 40 | } 41 | \examples{ 42 | # Create example expression matrix 43 | set.seed(123) 44 | X <- matrix(rnorm(500), nrow = 50, ncol = 10) 45 | rownames(X) <- paste0("GENE", 1:50) 46 | colnames(X) <- paste0("Sample", 1:10) 47 | 48 | # Create example gene sets 49 | gmt <- list( 50 | "Pathway1" = paste0("GENE", 1:15), 51 | "Pathway2" = paste0("GENE", 10:25) 52 | ) 53 | matG <- gmt2mat(gmt) 54 | 55 | # Compute singscore 56 | scores <- replaid.sing(X, matG) 57 | print(scores[1:2, 1:5]) 58 | 59 | } 60 | -------------------------------------------------------------------------------- /man/replaid.aucell.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plaid.R 3 | \name{replaid.aucell} 4 | \alias{replaid.aucell} 5 | \title{Fast calculation of AUCell} 6 | \usage{ 7 | replaid.aucell( 8 | X, 9 | matG, 10 | aucMaxRank = NULL, 11 | assay = "logcounts", 12 | min.genes = 5, 13 | max.genes = 500 14 | ) 15 | } 16 | \arguments{ 17 | \item{X}{Gene or protein expression matrix. Generally log 18 | transformed. See details. Genes on rows, samples on columns. 19 | Also accepts SummarizedExperiment or SingleCellExperiment objects.} 20 | 21 | \item{matG}{Gene sets sparse matrix. Genes on rows, gene sets on columns. 22 | Also accepts BiocSet objects or GMT lists.} 23 | 24 | \item{aucMaxRank}{Rank threshold (see AUCell paper). Default aucMaxRank = 0.05*nrow(X).} 25 | 26 | \item{assay}{Character: assay name for Bioconductor objects. Default "logcounts".} 27 | 28 | \item{min.genes}{Integer: minimum genes per gene set. Default 5.} 29 | 30 | \item{max.genes}{Integer: maximum genes per gene set. Default 500.} 31 | } 32 | \value{ 33 | Matrix of single-sample AUCell enrichment scores. 34 | Gene sets on rows, samples on columns. 35 | } 36 | \description{ 37 | Calculates single-sample enrichment AUCell (Aibar 38 | et al., 2017) using plaid back-end. The computation is 39 | 10-100x faster than the original code. 40 | } 41 | \details{ 42 | Computing the AUCell score requires to compute the ranks 43 | of the expression matrix and approximating the AUC of a gene 44 | set. We have wrapped this in a single convenience function. 45 | 46 | We have extensively compared the results of \code{replaid.aucell} and 47 | from the original \code{AUCell} R package and we showed good concordance 48 | of results in the score, logFC and p-values. 49 | } 50 | \examples{ 51 | # Create example expression matrix 52 | set.seed(123) 53 | X <- matrix(rnorm(500), nrow = 50, ncol = 10) 54 | rownames(X) <- paste0("GENE", 1:50) 55 | colnames(X) <- paste0("Sample", 1:10) 56 | 57 | # Create example gene sets 58 | gmt <- list( 59 | "Pathway1" = paste0("GENE", 1:15), 60 | "Pathway2" = paste0("GENE", 10:25) 61 | ) 62 | matG <- gmt2mat(gmt) 63 | 64 | # Compute AUCell scores 65 | scores <- replaid.aucell(X, matG) 66 | print(scores[1:2, 1:5]) 67 | 68 | } 69 | -------------------------------------------------------------------------------- /man/replaid.ucell.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plaid.R 3 | \name{replaid.ucell} 4 | \alias{replaid.ucell} 5 | \title{Fast calculation of UCell} 6 | \usage{ 7 | replaid.ucell( 8 | X, 9 | matG, 10 | rmax = 1500, 11 | assay = "logcounts", 12 | min.genes = 5, 13 | max.genes = 500 14 | ) 15 | } 16 | \arguments{ 17 | \item{X}{Gene or protein expression matrix. Generally log 18 | transformed. See details. Genes on rows, samples on columns. 19 | Also accepts SummarizedExperiment or SingleCellExperiment objects.} 20 | 21 | \item{matG}{Gene sets sparse matrix. Genes on rows, gene sets on columns. 22 | Also accepts BiocSet objects or GMT lists.} 23 | 24 | \item{rmax}{Rank threshold (see Ucell paper). Default rmax = 1500.} 25 | 26 | \item{assay}{Character: assay name for Bioconductor objects. Default "logcounts".} 27 | 28 | \item{min.genes}{Integer: minimum genes per gene set. Default 5.} 29 | 30 | \item{max.genes}{Integer: maximum genes per gene set. Default 500.} 31 | } 32 | \value{ 33 | Matrix of single-sample UCell enrichment scores. 34 | Gene sets on rows, samples on columns. 35 | } 36 | \description{ 37 | Calculates single-sample enrichment UCell (Andreatta 38 | et al., 2021) using plaid back-end. The computation is 39 | 10-100x faster than the original code. 40 | } 41 | \details{ 42 | Computing ssGSEA score requires to compute the ranks of 43 | the expression matrix and truncation of the ranks. We have wrapped 44 | this in a single convenience function. 45 | 46 | We have extensively compared the results of \code{replaid.ucell} and 47 | from the original \code{UCell} R package and we showed near exacct 48 | results in the score, logFC and p-values. 49 | } 50 | \examples{ 51 | # Create example expression matrix 52 | set.seed(123) 53 | X <- matrix(rnorm(500), nrow = 50, ncol = 10) 54 | rownames(X) <- paste0("GENE", 1:50) 55 | colnames(X) <- paste0("Sample", 1:10) 56 | 57 | # Create example gene sets 58 | gmt <- list( 59 | "Pathway1" = paste0("GENE", 1:15), 60 | "Pathway2" = paste0("GENE", 10:25) 61 | ) 62 | matG <- gmt2mat(gmt) 63 | 64 | # Compute UCell scores (default rmax = 1500) 65 | scores <- replaid.ucell(X, matG) 66 | print(scores[1:2, 1:5]) 67 | 68 | # Compute UCell scores with custom rmax 69 | scores_custom <- replaid.ucell(X, matG, rmax = 1000) 70 | print(scores_custom[1:2, 1:5]) 71 | 72 | } 73 | -------------------------------------------------------------------------------- /man/replaid.ssgsea.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plaid.R 3 | \name{replaid.ssgsea} 4 | \alias{replaid.ssgsea} 5 | \title{Fast calculation of ssGSEA} 6 | \usage{ 7 | replaid.ssgsea( 8 | X, 9 | matG, 10 | alpha = 0, 11 | assay = "logcounts", 12 | min.genes = 5, 13 | max.genes = 500 14 | ) 15 | } 16 | \arguments{ 17 | \item{X}{Gene or protein expression matrix. Generally log 18 | transformed. See details. Genes on rows, samples on columns. 19 | Also accepts SummarizedExperiment or SingleCellExperiment objects.} 20 | 21 | \item{matG}{Gene sets sparse matrix. Genes on rows, gene sets on 22 | columns. Also accepts BiocSet objects or GMT lists.} 23 | 24 | \item{alpha}{Weighting factor for exponential weighting of ranks} 25 | 26 | \item{assay}{Character: assay name for Bioconductor objects. Default "logcounts".} 27 | 28 | \item{min.genes}{Integer: minimum genes per gene set. Default 5.} 29 | 30 | \item{max.genes}{Integer: maximum genes per gene set. Default 500.} 31 | } 32 | \value{ 33 | Matrix of single-sample ssGSEA enrichment scores. 34 | Gene sets on rows, samples on columns. 35 | } 36 | \description{ 37 | Calculates single-sample enrichment singscore (Barbie 38 | et al., 2009; Hänzelmann et al., 2013) using plaid back-end. The 39 | computation is 10-100x faster than the original code. 40 | } 41 | \details{ 42 | Computing ssGSEA score requires to compute the ranks of 43 | the expression matrix and weighting of the ranks. We have wrapped 44 | this in a single convenience function. 45 | 46 | We have extensively compared the results of \code{replaid.ssgsea} and 47 | from the original \code{GSVA} R package and we showed highly similar 48 | results in the score, logFC and p-values. For alpha=0 we obtain 49 | exact results, for alpha>0 the results are highly similar but not 50 | exactly the same. 51 | } 52 | \examples{ 53 | # Create example expression matrix 54 | set.seed(123) 55 | X <- matrix(rnorm(500), nrow = 50, ncol = 10) 56 | rownames(X) <- paste0("GENE", 1:50) 57 | colnames(X) <- paste0("Sample", 1:10) 58 | 59 | # Create example gene sets 60 | gmt <- list( 61 | "Pathway1" = paste0("GENE", 1:15), 62 | "Pathway2" = paste0("GENE", 10:25) 63 | ) 64 | matG <- gmt2mat(gmt) 65 | 66 | # Compute ssGSEA scores (alpha = 0) 67 | scores <- replaid.ssgsea(X, matG, alpha = 0) 68 | print(scores[1:2, 1:5]) 69 | 70 | # Compute ssGSEA scores with weighting (alpha = 0.25) 71 | scores_weighted <- replaid.ssgsea(X, matG, alpha = 0.25) 72 | print(scores_weighted[1:2, 1:5]) 73 | 74 | } 75 | -------------------------------------------------------------------------------- /man/replaid.gsva.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plaid.R 3 | \name{replaid.gsva} 4 | \alias{replaid.gsva} 5 | \title{Fast approximation of GSVA} 6 | \usage{ 7 | replaid.gsva( 8 | X, 9 | matG, 10 | tau = 0, 11 | rowtf = c("z", "ecdf")[1], 12 | assay = "logcounts", 13 | min.genes = 5, 14 | max.genes = 500 15 | ) 16 | } 17 | \arguments{ 18 | \item{X}{Gene or protein expression matrix. Generally log 19 | transformed. See details. Genes on rows, samples on columns. 20 | Also accepts SummarizedExperiment or SingleCellExperiment objects.} 21 | 22 | \item{matG}{Gene sets sparse matrix. Genes on rows, gene sets on 23 | columns. Also accepts BiocSet objects or GMT lists.} 24 | 25 | \item{tau}{Rank weight parameter (see GSVA publication). Default 26 | tau=0.} 27 | 28 | \item{rowtf}{Row transformation method ("z" or "ecdf"). Default "z".} 29 | 30 | \item{assay}{Character: assay name for Bioconductor objects. Default "logcounts".} 31 | 32 | \item{min.genes}{Integer: minimum genes per gene set. Default 5.} 33 | 34 | \item{max.genes}{Integer: maximum genes per gene set. Default 500.} 35 | } 36 | \value{ 37 | Matrix of single-sample GSVA enrichment scores. 38 | Gene sets on rows, samples on columns. 39 | } 40 | \description{ 41 | Calculates single-sample enrichment GSVA (Hänzelmann 42 | et al., 2013) using plaid back-end. The computation is 43 | 10-100x faster than the original code. 44 | } 45 | \details{ 46 | Computing the GSVA score requires to compute the CDF of 47 | the expression matrix, ranking and scoring the genesets. We have 48 | wrapped this in a single convenience function. 49 | 50 | We have extensively compared the results of \code{replaid.gsva} and 51 | from the original \code{GSVA} R package and we showed good concordance 52 | of results in the score, logFC and p-values. 53 | 54 | In the original formulation, GSVA uses an emperical CDF to 55 | transform expression of each feature to a (0;1) relative expression 56 | value. For efficiency reasons, this is here approximated by a 57 | z-transform (center+scale) of each row. 58 | } 59 | \examples{ 60 | # Create example expression matrix 61 | set.seed(123) 62 | X <- matrix(rnorm(500), nrow = 50, ncol = 10) 63 | rownames(X) <- paste0("GENE", 1:50) 64 | colnames(X) <- paste0("Sample", 1:10) 65 | 66 | # Create example gene sets 67 | gmt <- list( 68 | "Pathway1" = paste0("GENE", 1:15), 69 | "Pathway2" = paste0("GENE", 10:25) 70 | ) 71 | matG <- gmt2mat(gmt) 72 | 73 | # Compute GSVA scores 74 | scores <- replaid.gsva(X, matG) 75 | print(scores[1:2, 1:5]) 76 | 77 | } 78 | -------------------------------------------------------------------------------- /man/replaid.scse.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plaid.R 3 | \name{replaid.scse} 4 | \alias{replaid.scse} 5 | \title{Fast calculation of scSE score} 6 | \usage{ 7 | replaid.scse( 8 | X, 9 | matG, 10 | removeLog2 = NULL, 11 | scoreMean = FALSE, 12 | assay = "logcounts", 13 | min.genes = 5, 14 | max.genes = 500 15 | ) 16 | } 17 | \arguments{ 18 | \item{X}{Gene or protein expression matrix. Generally log 19 | transformed. See details. Genes on rows, samples on columns. 20 | Also accepts SummarizedExperiment or SingleCellExperiment objects.} 21 | 22 | \item{matG}{Gene sets sparse matrix. Genes on rows, gene sets on 23 | columns. Also accepts BiocSet objects or GMT lists.} 24 | 25 | \item{removeLog2}{Logical for whether to remove the Log2, i.e. will 26 | apply power transform (base2) on input (default TRUE).} 27 | 28 | \item{scoreMean}{Logical for whether computing sum or mean as score 29 | (default FALSE).} 30 | 31 | \item{assay}{Character: assay name for Bioconductor objects. Default "logcounts".} 32 | 33 | \item{min.genes}{Integer: minimum genes per gene set. Default 5.} 34 | 35 | \item{max.genes}{Integer: maximum genes per gene set. Default 500.} 36 | } 37 | \value{ 38 | Matrix of single-sample scSE enrichment scores. 39 | Gene sets on rows, samples on columns. 40 | } 41 | \description{ 42 | Calculates Single-Cell Signature Explorer (Pont et 43 | al., 2019) scores using plaid back-end. The computation is 44 | 10-100x faster than the original code. 45 | } 46 | \details{ 47 | Computing the scSE requires running plaid on the linear 48 | (not logarithmic) score and perform additional normalization by 49 | the total UMI per sample. We have wrapped this in a single 50 | convenience function: 51 | 52 | To replicate the original "sum-of-UMI" scSE score, set \code{removeLog2=TRUE} 53 | and \code{scoreMean=FALSE}. scSE and plaid scores become more similar for 54 | \code{removeLog2=FALSE} and \code{scoreMean=TRUE}. 55 | 56 | We have extensively compared the results from \code{replaid.scse} and 57 | from the original scSE (implemented in GO lang) and we showed 58 | almost identical results in the score, logFC and p-values. 59 | } 60 | \examples{ 61 | # Create example expression matrix (log-transformed) 62 | set.seed(123) 63 | X <- log2(matrix(rpois(500, lambda = 10) + 1, nrow = 50, ncol = 10)) 64 | rownames(X) <- paste0("GENE", 1:50) 65 | colnames(X) <- paste0("Sample", 1:10) 66 | 67 | # Create example gene sets 68 | gmt <- list( 69 | "Pathway1" = paste0("GENE", 1:15), 70 | "Pathway2" = paste0("GENE", 10:25) 71 | ) 72 | matG <- gmt2mat(gmt) 73 | 74 | # Compute scSE scores (original method) 75 | scores <- replaid.scse(X, matG, removeLog2 = TRUE, scoreMean = FALSE) 76 | print(scores[1:2, 1:5]) 77 | 78 | # Compute scSE scores (mean method) 79 | scores_mean <- replaid.scse(X, matG, removeLog2 = TRUE, scoreMean = TRUE) 80 | print(scores_mean[1:2, 1:5]) 81 | 82 | } 83 | -------------------------------------------------------------------------------- /man/dualGSEA.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stats.R 3 | \name{dualGSEA} 4 | \alias{dualGSEA} 5 | \title{Reimplementation of dualGSEA (Bull et al., 2024) but defaults with 6 | replaid backend. For the preranked test we still use fgsea. Should 7 | be much faster than original using fgsea + GSVA::ssGSEA.} 8 | \usage{ 9 | dualGSEA( 10 | X, 11 | y, 12 | G, 13 | gmt = NULL, 14 | gsetX = NULL, 15 | fc.method = c("fgsea", "rankcor", "ztest", "ttest", "cor")[2], 16 | ss.method = c("plaid", "replaid.ssgsea", "replaid.gsva", "ssgsea", "gsva")[1], 17 | metap.method = c("stouffer", "fisher", "maxp")[1], 18 | pv1 = NULL, 19 | pv2 = NULL, 20 | sort.by = "p.dual" 21 | ) 22 | } 23 | \arguments{ 24 | \item{X}{Expression matrix with genes on rows and samples ont columns} 25 | 26 | \item{y}{Binary vector (0/1) indicating group membership} 27 | 28 | \item{G}{Sparse matrix of gene sets. Non-zero entry indicates 29 | gene/feature is part of gene sets. Features on rows, gene sets on 30 | columns.} 31 | 32 | \item{gmt}{List of gene sets in GMT format} 33 | 34 | \item{gsetX}{Optional pre-computed matrix of gene set enrichment scores with 35 | gene sets on rows and samples on columns. If NULL (default), scores will be 36 | computed using the method specified by \code{ss.method}. Providing pre-computed 37 | scores improves efficiency when running multiple analyses.} 38 | 39 | \item{fc.method}{Method for fold change testing ("fgsea", "ztest", "ttest", "rankcor", "cor")} 40 | 41 | \item{ss.method}{Method for single-sample enrichment ("plaid", "replaid.ssgsea", "replaid.gsva", "ssgsea", "gsva")} 42 | 43 | \item{metap.method}{Method for combining p-values ("stouffer", "fisher" or "maxp"). Default "stouffer".} 44 | 45 | \item{pv1}{Pre-computed p-values from fold change test. If NULL, will be computed based on fc.test.} 46 | 47 | \item{pv2}{Pre-computed p-values from single sample test. If NULL, will be computed using gset_ttest.} 48 | 49 | \item{sort.by}{Column name to sort results by ("p.dual", "gsetFC", "p.fc", "p.ss"). Default "p.dual".} 50 | } 51 | \value{ 52 | Data frame with columns: gsetFC (gene set fold change), size (gene set size), 53 | p.fc (p-value from fold change test), p.ss (p-value from single sample test), 54 | p.dual (combined p-value), and q.dual (FDR-adjusted combined p-value). 55 | } 56 | \description{ 57 | Reimplementation of dualGSEA (Bull et al., 2024) but defaults with 58 | replaid backend. For the preranked test we still use fgsea. Should 59 | be much faster than original using fgsea + GSVA::ssGSEA. 60 | } 61 | \examples{ 62 | # Create example expression matrix 63 | set.seed(123) 64 | X <- matrix(rnorm(1000), nrow = 100, ncol = 20) 65 | rownames(X) <- paste0("GENE", 1:100) 66 | colnames(X) <- paste0("Sample", 1:20) 67 | 68 | # Create binary group vector 69 | y <- rep(c(0, 1), each = 10) 70 | 71 | # Create example gene sets 72 | gmt <- list( 73 | "Pathway1" = paste0("GENE", 1:20), 74 | "Pathway2" = paste0("GENE", 15:35), 75 | "Pathway3" = paste0("GENE", 30:50) 76 | ) 77 | 78 | # Perform dualGSEA with correlation test (fast method) 79 | results_cor <- dualGSEA(X, y, G = NULL, gmt = gmt, fc.method = "cor", ss.method = "replaid.gsva") 80 | print(head(results_cor)) 81 | 82 | \donttest{ 83 | # Perform dualGSEA with fgsea (requires fgsea package) 84 | if (requireNamespace("fgsea", quietly = TRUE)) { 85 | results <- dualGSEA(X, y, G = NULL, gmt = gmt, fc.method = "fgsea", ss.method = "replaid.ssgsea") 86 | print(head(results)) 87 | } 88 | } 89 | 90 | } 91 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PLAID: ultrafast single-sample gene set enrichment plaid website 2 | 3 | [![codecov](https://codecov.io/github/bigomics/plaid/graph/badge.svg?token=66J6W41C0G)](https://codecov.io/github/bigomics/plaid) 4 | 5 | [PLAID](https://bigomics.github.io/plaid) (Pathway Level Average Intensity Detection) is an ultrafast 6 | method to compute single-sample enrichment scores for gene expression 7 | or proteomics data. For each sample, PLAID computes the gene set score 8 | as the average intensity of the genes/proteins in the gene set. The 9 | output is a gene set score matrix suitable for further analyses. 10 | 11 | A distinctive feature of PLAID is that it can simulate few of the most 12 | widely used single-sample gene set scoring algorithms 13 | ([ssGSEA](https://github.com/rcastelo/GSVA), 14 | [GSVA](https://github.com/rcastelo/GSVA), 15 | [AUCell](https://github.com/aertslab/AUCell), 16 | [singscore](https://github.com/DavisLaboratory/singscore), 17 | [scSE](https://doi.org/10.1093/nar/gkz601), 18 | [UCell](https://github.com/carmonalab/UCell)), enabling researchers 19 | to replace those functions and gain much improved runtime efficiency 20 | and memory requirement. Typically, PLAID can be more than 100 times 21 | faster and requiring 10 times less memory than the original algorithm. 22 | 23 | #### Key features 24 | 25 | - Ultra-fast single-sample gene set enrichment scoring 26 | - Includes multiple scoring methods (plaid, singscore, ssGSEA, GSVA, scSE, UCell, AUCell) 27 | - Works with regular matrices, sparse matrices, and Bioconductor data structures 28 | - Automatically detects and handles Bioconductor objects 29 | ([`SummarizedExperiment`](https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html), 30 | [`SingleCellExperiment`](https://bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html), [`BiocSet`](https://bioconductor.org/packages/release/bioc/html/BiocSet.html)) 31 | - Built-in differential enrichment testing 32 | 33 | #### Warning 34 | 35 | PLAID is fast. Ludicrously fast. Please fasten your seatbelts before usage. 36 | 37 | ## Installation 38 | 39 | You can install PLAID from Bioconductor: 40 | 41 | ```r 42 | BiocManager::install("plaid") 43 | ``` 44 | 45 | You can also install the development version from GitHub: 46 | 47 | ```r 48 | remotes::install_github("bigomics/plaid") 49 | ``` 50 | 51 | ## Usage 52 | 53 | For detailed usage examples and tutorials, please see our vignettes: 54 | 55 | - [Getting Started with PLAID](https://bigomics.github.io/plaid/articles/01_plaid-vignette.html) 56 | - [Comparing PLAID with other methods](https://bigomics.github.io/plaid/articles/02_compare-vignette.html) 57 | 58 | PLAID is the main single-sample gene set scoring algorithm in OmicsPlayground, our 59 | Bioinformatics platform at [BigOmics](https://bigomics.ch). In OmicsPlayground, you 60 | can perform PLAID without coding needs. 61 | 62 | ## References 63 | 64 | For more technical details please refer to our papers. Please cite us when you use 65 | PLAID as part of your research. 66 | 67 | - Zito A., et al. PLAID: ultrafast single-sample gene set enrichment scoring. Bioinformatics, 2025, [btaf621](https://doi.org/10.1093/bioinformatics/btaf621). 68 | - Akhmedov M., et al., Omics Playground: a comprehensive self-service platform for visualization, analytics and exploration of Big Omics Data, NAR Genomics and Bioinformatics, 2020, [lqz019](https://doi.org/10.1093/nargab/lqz019). 69 | 70 | ## Support 71 | 72 | For support feel free to reach our Bioinformatics Data Science Team at 73 | BigOmics Analytics: help@bigomics.ch 74 | 75 | If you like PLAID, please recommend us to your friends, buy us [coffee](https://buymeacoffee.com/bigomics) 76 | and brag about PLAID on your social media. 77 | -------------------------------------------------------------------------------- /man/plaid.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plaid.R 3 | \name{plaid} 4 | \alias{plaid} 5 | \title{Compute PLAID single-sample enrichment score} 6 | \usage{ 7 | plaid( 8 | X, 9 | matG, 10 | stats = c("mean", "sum"), 11 | chunk = NULL, 12 | normalize = TRUE, 13 | nsmooth = 3, 14 | assay = "logcounts", 15 | min.genes = 5, 16 | max.genes = 500 17 | ) 18 | } 19 | \arguments{ 20 | \item{X}{Log-transformed expr. matrix. Genes on rows, samples on columns. 21 | Also accepts SummarizedExperiment or SingleCellExperiment objects.} 22 | 23 | \item{matG}{Gene sets sparse matrix. Genes on rows, gene sets on columns. 24 | Also accepts BiocSet objects or GMT lists (named list of gene vectors).} 25 | 26 | \item{stats}{Score computation stats: mean or sum of intensity. Default 'mean'.} 27 | 28 | \item{chunk}{Logical: use chunks for large matrices. Default 'NULL' for autodetect.} 29 | 30 | \item{normalize}{Logical: median normalize results or not. Default 'TRUE'.} 31 | 32 | \item{nsmooth}{Smoothing parameter for more stable average when stats="mean". Default 3.} 33 | 34 | \item{assay}{Character: assay name to extract from SummarizedExperiment/SingleCellExperiment. Default "logcounts".} 35 | 36 | \item{min.genes}{Integer: minimum genes per gene set (for BiocSet/GMT input). Default 5.} 37 | 38 | \item{max.genes}{Integer: maximum genes per gene set (for BiocSet/GMT input). Default 500.} 39 | } 40 | \value{ 41 | Matrix of single-sample enrichment scores. 42 | Gene sets on rows, samples on columns. 43 | } 44 | \description{ 45 | Compute single-sample geneset expression as the 46 | average log-expression f genes in the geneset. Requires log-expression 47 | matrix X and (sparse) geneset matrix matG. If you have gene sets 48 | as a gmt list, please convert it first using the function \code{gmt2mat()}. 49 | } 50 | \details{ 51 | PLAID needs the gene sets as sparse matrix. If you have 52 | your collection of gene sets a a list, we need first to convert 53 | the gmt list to matrix format. 54 | 55 | We recommend to run PLAID on the log transformed expression matrix, 56 | not on the counts, as the average in the logarithmic space is more 57 | robust and is in concordance to calculating the geometric mean. 58 | 59 | It is not necessary to normalize your expression matrix before 60 | running PLAID because PLAID performs median normalization of the 61 | enrichment scores afterwards. 62 | 63 | It is recommended to use sparse matrix as PLAID relies on 64 | sparse matrix computations. But, PLAID is also fast for dense matrices. 65 | 66 | PLAID can also be run on the ranked matrix. This corresponds to 67 | the singscore (Fouratan et al., 2018). PLAID can also be run on 68 | the (non-logarithmic) counts which can be used to calculate the 69 | scSE score (Pont et al., 2019). 70 | 71 | PLAID is fast and memery efficient because it uses efficient 72 | sparse matrix computation. When input matrix is very large, PLAID 73 | performs 'chunked' computation by splitting the matrix in chunks. 74 | 75 | Although \code{X} and \code{matG} are generally sparse, the result 76 | matrix \code{gsetX} generally is dense and can thus be very large. 77 | Example: computing gene set scores for 10K gene sets on 1M cells 78 | will create a 10K x 1M dense matrix which requires ~75GB memory. 79 | 80 | PLAID now automatically detects and handles Bioconductor objects. 81 | If X is a SummarizedExperiment or SingleCellExperiment, it will extract 82 | the appropriate assay. If matG is a BiocSet object or GMT list, it will 83 | be converted to sparse matrix format automatically. 84 | } 85 | \examples{ 86 | library(plaid) 87 | 88 | # Create example expression matrix 89 | set.seed(123) 90 | X <- matrix(rnorm(1000), nrow = 100, ncol = 10) 91 | rownames(X) <- paste0("GENE", 1:100) 92 | colnames(X) <- paste0("Sample", 1:10) 93 | 94 | # Create example gene sets 95 | gmt <- list( 96 | "Pathway1" = paste0("GENE", 1:20), 97 | "Pathway2" = paste0("GENE", 15:35), 98 | "Pathway3" = paste0("GENE", 30:50) 99 | ) 100 | matG <- gmt2mat(gmt) 101 | 102 | # Compute PLAID scores 103 | gsetX <- plaid(X, matG) 104 | print(dim(gsetX)) 105 | print(gsetX[1:3, 1:5]) 106 | 107 | # Use sum statistics instead of mean 108 | gsetX_sum <- plaid(X, matG, stats = "sum") 109 | 110 | \donttest{ 111 | # Using real data (if available in package) 112 | extdata_path <- system.file("extdata", "pbmc3k-50cells.rda", package = "plaid") 113 | if (file.exists(extdata_path)) { 114 | load(extdata_path) 115 | hallmarks <- system.file("extdata", "hallmarks.gmt", package = "plaid") 116 | gmt <- read.gmt(hallmarks) 117 | matG <- gmt2mat(gmt) 118 | gsetX <- plaid(X, matG) 119 | } 120 | } 121 | 122 | } 123 | -------------------------------------------------------------------------------- /R/bioc-utils.R: -------------------------------------------------------------------------------- 1 | ## 2 | ## This file is part of the Omics Playground project. 3 | ## Copyright (c) 2018-2025 BigOmics Analytics SA. All rights reserved. 4 | ## 5 | 6 | #' @importFrom SummarizedExperiment assayNames assay 7 | #' @importFrom BiocSet es_elementset 8 | NULL 9 | 10 | #' Extract expression matrix from Bioconductor objects 11 | #' 12 | #' @description Internal function to extract expression matrix from various 13 | #' Bioconductor objects including SummarizedExperiment and SingleCellExperiment. 14 | #' 15 | #' @param object A Bioconductor object (SummarizedExperiment, SingleCellExperiment) 16 | #' or a regular matrix. 17 | #' @param assay Character or integer specifying which assay to extract. Default "logcounts". 18 | #' For SummarizedExperiment objects, common options are "counts", "logcounts", 19 | #' "normcounts". The function will try several common names if the specified 20 | #' assay is not found. 21 | #' @param log.transform Logical. If TRUE and the data appears to be counts (not log-transformed), 22 | #' will apply log2(x+1) transformation. Default FALSE. 23 | #' 24 | #' @return A matrix with genes/features on rows and samples on columns. 25 | #' 26 | #' @details This function is designed to handle various Bioconductor object types: 27 | #' \itemize{ 28 | #' \item SummarizedExperiment: extracts from assays 29 | #' \item SingleCellExperiment: extracts from assays (prefers logcounts) 30 | #' \item Matrix/matrix: returns as-is 31 | #' } 32 | #' 33 | #' The function will attempt to find an appropriate assay by trying common names 34 | #' in the following order: specified assay, "logcounts", "normcounts", "counts", 35 | #' or the first available assay. 36 | #' 37 | #' @noRd 38 | .extract_expression_matrix <- function(object, assay = "logcounts", log.transform = FALSE) { 39 | 40 | # If already a matrix, return as-is 41 | if (is.matrix(object) || inherits(object, "Matrix")) { 42 | return(object) 43 | } 44 | 45 | # Check for SummarizedExperiment or SingleCellExperiment 46 | if (inherits(object, "SummarizedExperiment") || 47 | inherits(object, "SingleCellExperiment")) { 48 | 49 | # Try to extract the requested assay 50 | avail_assays <- SummarizedExperiment::assayNames(object) 51 | 52 | if (length(avail_assays) == 0) { 53 | stop("[.extract_expression_matrix] No assays found in object.") 54 | } 55 | 56 | # Try to find the best assay 57 | if (assay %in% avail_assays) { 58 | X <- SummarizedExperiment::assay(object, assay) 59 | } else { 60 | # Try common alternatives in order of preference 61 | preferred_order <- c("logcounts", "normcounts", "counts") 62 | found_assay <- NULL 63 | 64 | for (pref in preferred_order) { 65 | if (pref %in% avail_assays) { 66 | found_assay <- pref 67 | break 68 | } 69 | } 70 | 71 | if (is.null(found_assay)) { 72 | # Just use first available assay 73 | found_assay <- avail_assays[1] 74 | message("[.extract_expression_matrix] Using first available assay: ", found_assay) 75 | } else { 76 | message("[.extract_expression_matrix] Requested assay '", assay, 77 | "' not found. Using: ", found_assay) 78 | } 79 | 80 | X <- SummarizedExperiment::assay(object, found_assay) 81 | } 82 | 83 | # Apply log transformation if requested 84 | if (log.transform) { 85 | # Check if data looks like counts (non-negative integers or large values) 86 | if (min(X, na.rm = TRUE) >= 0 && max(X, na.rm = TRUE) > 100) { 87 | message("[.extract_expression_matrix] Applying log2(x+1) transformation.") 88 | if (inherits(X, "dgCMatrix")) { 89 | X@x <- log2(X@x + 1) 90 | } else { 91 | X <- log2(X + 1) 92 | } 93 | } 94 | } 95 | 96 | return(X) 97 | } 98 | 99 | # If we get here, unsupported object type 100 | stop("[.extract_expression_matrix] Unsupported object type: ", class(object)[1], 101 | "\nSupported types: matrix, Matrix, SummarizedExperiment, SingleCellExperiment") 102 | } 103 | 104 | 105 | #' Convert BiocSet to sparse matrix format for plaid 106 | #' 107 | #' @description Internal function to convert BiocSet objects to the sparse 108 | #' matrix format required by plaid (genes on rows, gene sets on columns). 109 | #' 110 | #' @param geneset A BiocSet object, list (GMT format), or already a matrix. 111 | #' @param background Character vector of background genes to include in the matrix. 112 | #' Default NULL uses all genes from the gene sets. 113 | #' @param min.genes Minimum number of genes required for a gene set to be included. 114 | #' Default 5. 115 | #' @param max.genes Maximum number of genes allowed for a gene set to be included. 116 | #' Default 500. 117 | #' 118 | #' @return A sparse matrix (dgCMatrix) with genes on rows and gene sets on columns. 119 | #' 120 | #' @details This function handles conversion from BiocSet objects to the 121 | #' sparse matrix format used by plaid. It can also handle GMT lists or matrices 122 | #' directly. Gene sets are filtered by size (min/max genes). 123 | #' 124 | #' For BiocSet objects, the function extracts element-set mappings and converts 125 | #' them to a binary sparse matrix indicating gene membership. 126 | #' 127 | #' @noRd 128 | .convert_geneset_to_matrix <- function(geneset, background = NULL, 129 | min.genes = 5, max.genes = 500) { 130 | 131 | # If already a matrix, return as-is 132 | if (is.matrix(geneset) || inherits(geneset, "Matrix")) { 133 | return(geneset) 134 | } 135 | 136 | # Handle BiocSet objects 137 | if (inherits(geneset, "BiocSet")) { 138 | message("[.convert_geneset_to_matrix] Converting BiocSet to matrix format...") 139 | 140 | # Extract the element-set mapping from BiocSet 141 | # BiocSet uses a tibble structure with 'element' and 'set' columns 142 | es_tbl <- BiocSet::es_elementset(geneset) 143 | 144 | if (nrow(es_tbl) == 0) { 145 | stop("[.convert_geneset_to_matrix] BiocSet object is empty.") 146 | } 147 | 148 | # Convert to GMT list format first 149 | gmt <- split(es_tbl$element, es_tbl$set) 150 | gmt <- lapply(gmt, as.character) 151 | 152 | } else if (is.list(geneset)) { 153 | # Already in GMT format 154 | gmt <- geneset 155 | } else { 156 | stop("[.convert_geneset_to_matrix] Unsupported geneset type: ", class(geneset)[1], 157 | "\nSupported types: BiocSet, list (GMT), matrix, Matrix") 158 | } 159 | 160 | # Filter gene sets by size 161 | gset_sizes <- vapply(gmt, length, integer(1)) 162 | valid_sets <- gset_sizes >= min.genes & gset_sizes <= max.genes 163 | 164 | if (sum(valid_sets) == 0) { 165 | stop("[.convert_geneset_to_matrix] No gene sets passed size filters (min=", 166 | min.genes, ", max=", max.genes, ")") 167 | } 168 | 169 | if (sum(!valid_sets) > 0) { 170 | message("[.convert_geneset_to_matrix] Filtered out ", sum(!valid_sets), 171 | " gene sets (size filters: ", min.genes, "-", max.genes, " genes)") 172 | } 173 | 174 | gmt <- gmt[valid_sets] 175 | 176 | # Convert to sparse matrix using existing gmt2mat function 177 | matG <- gmt2mat(gmt, bg = background, sparse = TRUE) 178 | 179 | return(matG) 180 | } 181 | -------------------------------------------------------------------------------- /R/gmt-utils.R: -------------------------------------------------------------------------------- 1 | ## This file is part of the Omics Playground project. 2 | ## Copyright (c) 2018-2025 BigOmics Analytics SA. All rights reserved. 3 | 4 | #' @importFrom utils head read.csv 5 | #' @importFrom Matrix Matrix rowSums which 6 | #' @importFrom parallel mclapply 7 | NULL 8 | 9 | #' Convert GMT to Binary Matrix 10 | #' 11 | #' @description Convert a GMT file (Gene Matrix Transposed) to a binary matrix, 12 | #' where rows represent genes and columns represent gene sets. 13 | #' The binary matrix indicates presence or absence of genes in a gene set. 14 | #' 15 | #' @param gmt List representing the GMT file: each element is a character vector representing a gene set. 16 | #' @param max.genes Max number of genes to include in the binary matrix. Default = -1 to include all genes. 17 | #' @param ntop Number of top genes to consider for each gene set. Default = -1 to include all genes. 18 | #' @param sparse Logical: create a sparse matrix. Default `TRUE`. If `FALSE` creates a dense matrix. 19 | #' @param bg Character vector of background gene set. Default `NULL` to consider all unique genes. 20 | #' @param use.multicore Logical: use parallel processing ('parallel' R package). Default `TRUE`. 21 | #' 22 | #' @export 23 | #' 24 | #' @return A binary matrix representing the presence or absence of genes in each gene set. 25 | #' Rows correspond to genes, and columns correspond to gene sets. 26 | #' 27 | #' @examples 28 | #' # Create example GMT data 29 | #' gmt <- list( 30 | #' "Pathway1" = c("GENE1", "GENE2", "GENE3"), 31 | #' "Pathway2" = c("GENE2", "GENE4", "GENE5"), 32 | #' "Pathway3" = c("GENE1", "GENE5", "GENE6") 33 | #' ) 34 | #' 35 | #' # Convert to binary matrix 36 | #' mat <- gmt2mat(gmt) 37 | #' print(mat) 38 | #' 39 | #' # Create dense matrix instead of sparse 40 | #' mat_dense <- gmt2mat(gmt, sparse = FALSE) 41 | #' print(mat_dense) 42 | gmt2mat <- function(gmt, 43 | max.genes = -1, 44 | ntop = -1, sparse = TRUE, 45 | bg = NULL, 46 | use.multicore = TRUE) { 47 | 48 | gmt <- gmt[order(-vapply(gmt, length, integer(1)))] 49 | gmt <- gmt[!duplicated(names(gmt))] 50 | if (ntop > 0) gmt <- lapply(gmt, utils::head, n = ntop) 51 | 52 | if (is.null(names(gmt))) names(gmt) <- paste("gmt.", seq_along(gmt), sep = "") 53 | if (is.null(bg)) { 54 | bg <- names(sort(table(unlist(gmt)), decreasing = TRUE)) 55 | } 56 | 57 | if (max.genes < 0) max.genes <- length(bg) 58 | gg <- bg 59 | gg <- Matrix::head(bg, n = max.genes) 60 | ##gmt <- lapply(gmt, function(s) intersect(gg, s)) 61 | kk <- unique(names(gmt)) 62 | if (sparse) { 63 | D <- Matrix::Matrix(0, nrow = length(gg), ncol = length(kk), sparse = TRUE) 64 | } else { 65 | D <- matrix(0, nrow = length(gg), ncol = length(kk)) 66 | } 67 | rownames(D) <- gg 68 | colnames(D) <- kk 69 | 70 | if (use.multicore) { 71 | idx <- parallel::mclapply(gmt, function(s) match(s, gg)) 72 | } else { 73 | idx <- lapply(gmt, function(s) match(s, gg)) 74 | } 75 | idx <- lapply(idx, function(x) x[!is.na(x)]) 76 | idx[vapply(idx, length, integer(1)) == 0] <- 0 77 | idx <- lapply(seq_along(idx), function(i) rbind(idx[[i]], i)) 78 | idx <- matrix(unlist(idx[]), byrow = TRUE, ncol = 2) 79 | idx <- idx[!is.na(idx[, 1]), ] 80 | idx <- idx[idx[, 1] > 0, ] 81 | D[idx] <- 1 82 | D <- D[order(-Matrix::rowSums(D != 0, na.rm = TRUE)), ,drop=FALSE] 83 | 84 | return(D) 85 | 86 | } 87 | 88 | #' Convert Binary Matrix to GMT 89 | #' 90 | #' @description Convert binary matrix to a GMT (Gene Matrix Transposed) list. 91 | #' The binary matrix indicates presence or absence of genes in each gene set. 92 | #' Rows represent genes and columns represent gene sets. 93 | #' 94 | #' @param mat Matrix with non-zero entries representing genes in each gene set. 95 | #' Rows represent genes and columns represent gene sets. 96 | #' 97 | #' @export 98 | #' 99 | #' @return A list of vector representing each gene set. Each list 100 | #' element correspond to a gene set and is a vector of genes 101 | #' 102 | #' @examples 103 | #' # Create example binary matrix 104 | #' mat <- matrix(0, nrow = 6, ncol = 3) 105 | #' rownames(mat) <- paste0("GENE", 1:6) 106 | #' colnames(mat) <- paste0("Pathway", 1:3) 107 | #' mat[1:3, 1] <- 1 # Pathway1: GENE1, GENE2, GENE3 108 | #' mat[c(2,4,5), 2] <- 1 # Pathway2: GENE2, GENE4, GENE5 109 | #' mat[c(1,5,6), 3] <- 1 # Pathway3: GENE1, GENE5, GENE6 110 | #' 111 | #' # Convert to GMT list 112 | #' gmt <- mat2gmt(mat) 113 | #' print(gmt) 114 | #' 115 | mat2gmt <- function(mat) { 116 | idx <- Matrix::which(mat != 0, arr.ind = TRUE) 117 | gmt <- tapply(rownames(idx), idx[, 2], list) 118 | names(gmt) <- colnames(mat)[as.integer(names(gmt))] 119 | return(gmt) 120 | } 121 | 122 | 123 | #' Read GMT File 124 | #' 125 | #' @description Read data from a GMT file (Gene Matrix Transposed). 126 | #' The GMT format is commonly used to store gene sets or gene annotations. 127 | #' @param gmt.file Path to GMT file. 128 | #' @param dir (Optional) The directory where the GMT file is located. 129 | #' @param add.source (optional) Include the source information in the gene sets' names. 130 | #' @param nrows (optional) Number of rows to read from the GMT file. 131 | #' 132 | #' @export 133 | #' 134 | #' @return A list of gene sets: each gene set is represented as a character vector of gene names. 135 | #' 136 | #' @examples 137 | #' \donttest{ 138 | #' # Read GMT file (requires file to exist) 139 | #' gmt_file <- system.file("extdata", "hallmarks.gmt", package = "plaid") 140 | #' if (file.exists(gmt_file)) { 141 | #' gmt <- read.gmt(gmt_file) 142 | #' print(names(gmt)) 143 | #' print(head(gmt[[1]])) 144 | #' 145 | #' # Read with source information 146 | #' gmt_with_source <- read.gmt(gmt_file, add.source = TRUE) 147 | #' print(head(names(gmt_with_source))) 148 | #' } 149 | #' } 150 | #' 151 | read.gmt <- function(gmt.file, 152 | dir = NULL, 153 | add.source = FALSE, 154 | nrows = -1) { 155 | f0 <- gmt.file 156 | if (strtrim(gmt.file, 1) == "/") dir <- NULL 157 | if (!is.null(dir)) f0 <- paste(sub("/$", "", dir), "/", gmt.file, sep = "") 158 | gmt <- utils::read.csv(f0, sep = "!", header = FALSE, comment.char = "#", nrows = nrows)[, 1] 159 | gmt <- as.character(gmt) 160 | gmt <- lapply(gmt, function(s) strsplit(s, split = "\t")[[1]]) 161 | names(gmt) <- NULL 162 | gmt.name <- vapply(gmt, "[", character(1), 1) 163 | gmt.source <- vapply(gmt, "[", character(1), 2) 164 | gmt.genes <- vapply(gmt, function(x) { 165 | if (length(x) < 3) return(""); 166 | paste(x[3:length(x)], collapse = " ") 167 | }, character(1)) 168 | gset <- strsplit(gmt.genes, split = "[ \t]") 169 | gset <- lapply(gset, function(x) setdiff(x, c("", "NA", NA))) 170 | names(gset) <- gmt.name 171 | 172 | if (add.source) 173 | names(gset) <- paste0(names(gset), " (", gmt.source, ")") 174 | 175 | return(gset) 176 | 177 | } 178 | 179 | 180 | #' Write GMT File 181 | #' 182 | #' @description Write gene sets to GMT file (Gene Matrix Transposed). 183 | #' The GMT format is commonly used to store gene sets or gene annotations. 184 | #' 185 | #' @param gmt A list of gene sets in GMT format: each gene set is represented as a vector of gene names. 186 | #' @param file The file path to write the GMT file. 187 | #' @param source A character vector specifying the source of each gene set. 188 | #' If not provided, the names of the gene sets are used as the source. 189 | #' 190 | #' @export 191 | #' @return Does not return anything. 192 | #' 193 | #' @examples 194 | #' # Create example GMT data 195 | #' gmt <- list( 196 | #' "Pathway1" = c("GENE1", "GENE2", "GENE3"), 197 | #' "Pathway2" = c("GENE2", "GENE4", "GENE5"), 198 | #' "Pathway3" = c("GENE1", "GENE5", "GENE6") 199 | #' ) 200 | #' 201 | #' \donttest{ 202 | #' # Write to GMT file (creates file in temp directory) 203 | #' temp_file <- tempfile(fileext = ".gmt") 204 | #' write.gmt(gmt, temp_file) 205 | #' 206 | #' # Write with custom source information 207 | #' temp_file2 <- tempfile(fileext = ".gmt") 208 | #' write.gmt(gmt, temp_file2, source = c("DB1", "DB2", "DB3")) 209 | #' 210 | #' # Clean up 211 | #' unlink(c(temp_file, temp_file2)) 212 | #' } 213 | #' 214 | write.gmt <- function(gmt, file, source = NA) { 215 | gg <- lapply(gmt, paste, collapse = "\t") 216 | if (length(source) == 1 && is.na(source[1])) source <- names(gmt) 217 | ee <- paste(names(gmt), "\t", source, "\t", gg, sep = "") 218 | write(ee, file = file) 219 | } 220 | -------------------------------------------------------------------------------- /vignettes/02_compare-vignette.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Comparing PLAID with other methods" 3 | author: "BigOmics Analytics" 4 | package: plaid 5 | output: 6 | BiocStyle::html_document 7 | vignette: > 8 | %\VignetteIndexEntry{Comparing PLAID with other methods} 9 | %\VignetteEngine{knitr::rmarkdown} 10 | %\VignetteEncoding{UTF-8} 11 | --- 12 | 13 | ```{r, include = FALSE} 14 | knitr::opts_chunk$set( 15 | collapse = TRUE, 16 | comment = "#>" 17 | ) 18 | ``` 19 | 20 | # Introduction 21 | 22 | Single-sample gene set and pathway scoring enable the prioritization 23 | of molecular signatures of individual samples, highlighting potential 24 | targets for personalized medicine. Several methods have been 25 | developed, each with distinct strengths and limitations. The most 26 | widely used include [ssGSEA](https://github.com/rcastelo/GSVA), 27 | [GSVA](https://github.com/rcastelo/GSVA), 28 | [AUCell](https://github.com/aertslab/AUCell), 29 | [singscore](https://github.com/DavisLaboratory/singscore), 30 | [scSE](https://doi.org/10.1093/nar/gkz601) and 31 | [UCell](https://github.com/carmonalab/UCell). These methods differ in 32 | preprocessing steps of ranking, centering, and normalization. Despite 33 | methodical variations, these methods collectively offer vast 34 | flexibility for data analysis. Nevertheless, with the emergence of 35 | large-scale biological data (e.g., biobanks), most methods face 36 | substantial computational inefficiency. Except scSE, which is 37 | implemented in Go, other methods are in fact practically infeasible in 38 | runtime and memory requirements for large-scale data. This has limited 39 | the application of single-sample gene set scoring, hindering timely 40 | discovery. 41 | 42 | We address this critical issue with PLAID (Pathway Level Average Intensity Detection). Within each sample, PLAID identifies the genes mapped within a gene set and calculates the gene set score as the average log-intensity of genes in the gene set. PLAID does not zero-center or rank features, and leverages sparse matrices for efficient computations. 43 | 44 | # Benchmarking 45 | 46 | We evaluated PLAID in single-cell transcriptomics (PBMC3K), bulk proteomics, and microarray data. These data span diverse scenarios in terms of resolution and distribution, thus allowing comprehensive benchmarks. We compared PLAID runtime, peak memory, and gene set scores to Singscore, ssGSEA, GSVA, scSE, UCell and AUCell. Collectively these methods represent the most used single-sample gene set enrichment methods in biomedical research. 47 | 48 | # Comparison 49 | 50 | ## Computational Performance 51 | 52 | ![Run time (seconds) and peak RAM memory (MiB) usage. Benchmarking was conducted on data and gene sets of increasing sizes: 100 cells, 1K cells, 10K cells; 50 gene sets (HALLMARK collection); 2864 gene sets (GO_BP collection); 61,459 gene sets (whole collection). Runs were timed out at 1h (3600s). Timed out runs are indicated by an asterisk.](fig1.png) 53 | 54 | In scRNA-seq data, PLAID consistently outperformed other methods at different sample and gene set sizes. PLAID completed scoring for 2,864 gene sets in 1K cells in 0.17s, which is >100x faster compared to any other method. The second best method was scSE. Overall, the best memory-performing methods were PLAID and scSE. A similar trend was observed for 10K cells, where PLAID was >10x faster than other methods, while requiring similar RAM as other non-timed-out methods. 55 | 56 | When testing 61,459 gene sets in 1K cells, PLAID, Singscore, UCell and AUCell successfully completed the gene set scoring. Other methods were timed out at 1h. PLAID completed the run in <8s, with <3GB peak RAM. Singscore, UCell and AUCell required >100x runtime compared to PLAID, with up to ~1.5GB additional memory. GSVA, ssGSEA and scSE were timed out, and neither runtime nor peak RAM usage were accurately estimated. For 61,459 gene sets and 10K cells, PLAID achieved gene set scoring within 1h, requiring 110s and <20GB peak RAM. Other methods were timed out, and would have required at least 5x additional computing resources for a complete run. 57 | 58 | While working with sparse matrices would be ideal, researchers often use dense matrices. For a comprehensive evaluation, we thus conducted runtime and memory profiling of PLAID on the TCGA-BRCA microarray dense data matrix. In line with previous observations, PLAID outperformed other methods in most scenarios. For instance, PLAID was >100x faster than AUCell and GSVA for 61,459 gene sets and 1K cells, requiring 2.4GB. Altogether, these data demonstrate that PLAID is a highly efficient alternative to other single-sample gene set enrichment methods, but also that scSE remains a highly memory-optimized method capable of outperforming PLAID in some cases. Both runtime and peak memory usage increase approximately linearly with the number of cells or gene sets. For 1K gene sets and 1M cells, PLAID required about 200s and 28GB RAM peak memory. 59 | 60 | ## Score Concordance 61 | 62 | ![Pairwise scatter plots between PLAID vs. singscore, ssGSEA, GSVA, UCell, AUCell and scSE and scSE.mean ESs (Methods). Original ESs.](fig2.png) 63 | 64 | 65 | ![Pairwise scatter plots between PLAID vs. singscore, ssGSEA, GSVA, UCell, AUCell and scSE and scSE.mean ESs (Methods). Row-centered gene set ESs.](fig3.png) 66 | 67 | PLAID enrichment scores are median normalized to aid cross-group comparisons. While conceptually based on a self-defined metric, we nevertheless compared PLAID with other methods. We found that in scRNA-seq, for the first available cell, PLAID scores correlated well with Singscore, UCell and AUCell scores. By enabling calculation of mean, scSE produces scores highly concordant with PLAID. Lower similarities emerged with GSVA and ssGSEA. Notably, GSVA and ssGSEA also exhibited low concordance with any other method. 68 | 69 | Critically, within any single sample or cell, the raw gene set scores are not well suited for direct comparisons between gene sets. Comparing gene sets would need statistical testing between samples, or centering gene sets across samples to obtain relative scores corresponding to gene set average centered log-expression. On the basis of this principle, we compared PLAID scores vs other methods' scores after centering. Interestingly, we observed a generalized improved concordance with all methods, including with GSVA and ssGSEA. This supports the suitability of PLAID scores for differential testing between groups, and the possibility of cross-validating analyses from other methods. 70 | 71 | Testing in a bulk proteomics dataset confirmed high similarity between PLAID and scSE.mean, and high concordance with Singscore and ssGSEA. Low concordance emerged between PLAID and GSVA, in line with previous observations. Both in scRNA-seq and bulk proteomics, GSVA was lowly correlated with any other methods. Improved concordance was reached upon gene set centering, supporting results in scRNA-seq data. 72 | 73 | ## Replicating Other Methods 74 | 75 | ![Pairwise scatter plots between original and PLAID-replicated ESs for different methods (left to right): singscore, ssGSEA, GSVA, UCell, AUCell and scSE. Three testing datasets were used: (top to bottom): PBMC3K scRNAseq dataset; LC/MS proteomics dataset (Wolf et al., 2020); mRNA microarray (GSE10846; Lenz et al., 2008). In most cases, the PLAID (re)-implementation of the distinct enrichment methods provides accurate replication, with the key advantage of being significantly faster and memory efficient.](fig4.png) 76 | 77 | Running independent methods to validate enriched gene sets is a best practice. However, given the computational inefficiency of most current methods, this may be time-prohibitive in large datasets. We provide a solution to this problem by equipping PLAID with the most widely used single-sample gene set enrichment methods. 78 | 79 | We've implemented the following functions using PLAID as 'back-end': `replaid.sing`, `replaid.scse`, `replaid.ssgsea`, `replaid.gsva`, `replaid.ucell`, `replaid.aucell`. These functions provide efficient calculations of Singscore, scSE, ssGSEA, GSVA, UCell and AUCell. To ensure accurate replication of the original methods, we conducted testing in scRNA-seq, proteomics and microarray expression data. 80 | 81 | Best concordances are seen for `replaid.sing`, `replaid.scse`, `replaid.ucell` and `replaid.aucell` vs. each respective original method, with further improvements made possible by method-specific parameters that reflect original implementations. A somewhat relatively lower concordance (especially for scRNA-seq data) emerges between `replaid.ssgsea` and `replaid.gsva` vs. original ssGSEA and GSVA, respectively. These methods were originally not intended for scRNA-seq. Lower concordance is also related to the 'alpha' and 'tau' parameters, due to approximations of rank weightings and ECDF in PLAID. Nevertheless, we achieve good concordance in nearly all cases, with the unmatched advantage of reaching up to ten-fold gain in computational efficiency. 82 | 83 | # Conclusions 84 | 85 | These analyses demonstrate that PLAID is a highly-performing and accurate method for single-sample gene set scoring. PLAID is ultrafast and memory efficient, generating gene set scores highly concordant with existing methods. PLAID can be up to 100x faster and requires significantly less memory - reaching up to 5-fold reduction in memory usage - when compared to any other method in any dataset. 86 | 87 | Altogether, these data demonstrate the multi-task power of PLAID in providing (i) its own single-sample gene set enrichment scores; (ii) unmatched computational efficiency; (iii) a framework for the most used single-sample enrichment methods, with much higher efficiency; (iv) evidence that if implemented expertly, the R language can be highly efficient. 88 | 89 | # Session Info 90 | 91 | ```{r sessionInfo} 92 | sessionInfo() 93 | ``` 94 | 95 | -------------------------------------------------------------------------------- /tests/testthat/test-bioc-utils.R: -------------------------------------------------------------------------------- 1 | ## This file is part of the Omics Playground project. 2 | ## Copyright (c) 2018-2025 BigOmics Analytics SA. All rights reserved. 3 | 4 | library(testthat) 5 | library(Matrix) 6 | 7 | # ============================================================================= 8 | # Test .extract_expression_matrix function 9 | # ============================================================================= 10 | 11 | test_that(".extract_expression_matrix handles regular matrices", { 12 | # Create test matrix 13 | mat <- matrix(rnorm(100), nrow = 10, ncol = 10) 14 | rownames(mat) <- paste0("Gene", 1:10) 15 | colnames(mat) <- paste0("Sample", 1:10) 16 | 17 | # Should return matrix as-is 18 | result <- plaid:::.extract_expression_matrix(mat) 19 | 20 | expect_identical(result, mat) 21 | expect_true(is.matrix(result)) 22 | }) 23 | 24 | test_that(".extract_expression_matrix handles sparse matrices", { 25 | # Create sparse matrix 26 | mat <- Matrix(rnorm(100), nrow = 10, ncol = 10, sparse = TRUE) 27 | rownames(mat) <- paste0("Gene", 1:10) 28 | colnames(mat) <- paste0("Sample", 1:10) 29 | 30 | # Should return sparse matrix as-is 31 | result <- plaid:::.extract_expression_matrix(mat) 32 | 33 | expect_identical(result, mat) 34 | expect_true(inherits(result, "Matrix")) 35 | }) 36 | 37 | test_that(".extract_expression_matrix handles SummarizedExperiment objects", { 38 | skip_if_not_installed("SummarizedExperiment") 39 | 40 | # Create test SummarizedExperiment 41 | counts <- matrix(rpois(100, lambda = 10), nrow = 10, ncol = 10) 42 | rownames(counts) <- paste0("Gene", 1:10) 43 | colnames(counts) <- paste0("Sample", 1:10) 44 | 45 | se <- SummarizedExperiment::SummarizedExperiment( 46 | assays = list(counts = counts) 47 | ) 48 | 49 | # Extract expression matrix 50 | result <- plaid:::.extract_expression_matrix(se, assay = "counts") 51 | 52 | expect_true(is.matrix(result)) 53 | expect_equal(dim(result), c(10, 10)) 54 | expect_equal(rownames(result), paste0("Gene", 1:10)) 55 | }) 56 | 57 | test_that(".extract_expression_matrix handles multiple assays", { 58 | skip_if_not_installed("SummarizedExperiment") 59 | 60 | # Create SE with multiple assays 61 | counts <- matrix(rpois(100, lambda = 10), nrow = 10, ncol = 10) 62 | logcounts <- log2(counts + 1) 63 | rownames(counts) <- rownames(logcounts) <- paste0("Gene", 1:10) 64 | colnames(counts) <- colnames(logcounts) <- paste0("Sample", 1:10) 65 | 66 | se <- SummarizedExperiment::SummarizedExperiment( 67 | assays = list(counts = counts, logcounts = logcounts) 68 | ) 69 | 70 | # Should extract logcounts when specified 71 | result_log <- plaid:::.extract_expression_matrix(se, assay = "logcounts") 72 | expect_equal(result_log, logcounts) 73 | 74 | # Should extract counts when specified 75 | result_counts <- plaid:::.extract_expression_matrix(se, assay = "counts") 76 | expect_equal(result_counts, counts) 77 | }) 78 | 79 | test_that(".extract_expression_matrix falls back to available assays", { 80 | skip_if_not_installed("SummarizedExperiment") 81 | 82 | # Create SE with only counts 83 | counts <- matrix(rpois(100, lambda = 10), nrow = 10, ncol = 10) 84 | rownames(counts) <- paste0("Gene", 1:10) 85 | colnames(counts) <- paste0("Sample", 1:10) 86 | 87 | se <- SummarizedExperiment::SummarizedExperiment( 88 | assays = list(counts = counts) 89 | ) 90 | 91 | # Request non-existent assay, should fall back 92 | expect_message( 93 | result <- plaid:::.extract_expression_matrix(se, assay = "logcounts"), 94 | "not found" 95 | ) 96 | 97 | expect_true(is.matrix(result)) 98 | }) 99 | 100 | test_that(".extract_expression_matrix applies log transformation when requested", { 101 | skip_if_not_installed("SummarizedExperiment") 102 | 103 | # Create SE with large count values 104 | counts <- matrix(rpois(100, lambda = 500), nrow = 10, ncol = 10) 105 | rownames(counts) <- paste0("Gene", 1:10) 106 | colnames(counts) <- paste0("Sample", 1:10) 107 | 108 | se <- SummarizedExperiment::SummarizedExperiment( 109 | assays = list(counts = counts) 110 | ) 111 | 112 | # Request with log transformation 113 | expect_message( 114 | result <- plaid:::.extract_expression_matrix(se, assay = "counts", log.transform = TRUE), 115 | "log2" 116 | ) 117 | 118 | # Result should be log-transformed 119 | expect_true(all(result < 20)) # Log values should be much smaller 120 | expect_true(all(result >= 0)) 121 | }) 122 | 123 | test_that(".extract_expression_matrix errors on empty assays", { 124 | skip_if_not_installed("SummarizedExperiment") 125 | 126 | # Create empty SE 127 | se <- SummarizedExperiment::SummarizedExperiment() 128 | 129 | expect_error( 130 | plaid:::.extract_expression_matrix(se), 131 | "No assays found" 132 | ) 133 | }) 134 | 135 | test_that(".extract_expression_matrix errors on unsupported types", { 136 | # Test with unsupported object type 137 | df <- data.frame(x = 1:10, y = 11:20) 138 | 139 | expect_error( 140 | plaid:::.extract_expression_matrix(df), 141 | "Unsupported object type" 142 | ) 143 | }) 144 | 145 | # ============================================================================= 146 | # Test .convert_geneset_to_matrix function 147 | # ============================================================================= 148 | 149 | test_that(".convert_geneset_to_matrix handles matrices directly", { 150 | # Create test matrix 151 | mat <- matrix(c(1, 0, 1, 0, 0, 1), nrow = 3, ncol = 2) 152 | rownames(mat) <- paste0("Gene", 1:3) 153 | colnames(mat) <- paste0("Pathway", 1:2) 154 | 155 | # Should return matrix as-is 156 | result <- plaid:::.convert_geneset_to_matrix(mat) 157 | 158 | expect_identical(result, mat) 159 | }) 160 | 161 | test_that(".convert_geneset_to_matrix handles GMT lists", { 162 | # Create GMT list 163 | gmt <- list( 164 | "Pathway1" = c("Gene1", "Gene2", "Gene3", "Gene4", "Gene5", "Gene6"), 165 | "Pathway2" = c("Gene7", "Gene8", "Gene9", "Gene10", "Gene11"), 166 | "Pathway3" = c("Gene1", "Gene3", "Gene5", "Gene7", "Gene9", "Gene11") 167 | ) 168 | 169 | # Convert to matrix 170 | result <- plaid:::.convert_geneset_to_matrix(gmt) 171 | 172 | expect_true(inherits(result, "sparseMatrix")) 173 | expect_equal(ncol(result), 3) 174 | expect_true(all(colnames(result) %in% names(gmt))) 175 | }) 176 | 177 | test_that(".convert_geneset_to_matrix filters by gene set size", { 178 | # Create GMT with varying sizes 179 | gmt <- list( 180 | "TooSmall" = c("Gene1", "Gene2"), # Only 2 genes 181 | "JustRight" = paste0("Gene", 1:10), # 10 genes 182 | "TooBig" = paste0("Gene", 1:600) # 600 genes 183 | ) 184 | 185 | # Should filter out TooSmall and TooBig 186 | expect_message( 187 | result <- plaid:::.convert_geneset_to_matrix(gmt, min.genes = 5, max.genes = 500), 188 | "Filtered out" 189 | ) 190 | 191 | expect_equal(ncol(result), 1) 192 | expect_equal(colnames(result), "JustRight") 193 | }) 194 | 195 | test_that(".convert_geneset_to_matrix uses background genes", { 196 | gmt <- list( 197 | "Pathway1" = c("Gene1", "Gene2", "Gene3", "Gene4", "Gene5", "Gene6"), 198 | "Pathway2" = c("Gene3", "Gene4", "Gene5", "Gene6", "Gene7", "Gene8") 199 | ) 200 | 201 | # Specify background 202 | bg <- c("Gene1", "Gene2", "Gene3", "Gene4", "Gene5") 203 | 204 | result <- plaid:::.convert_geneset_to_matrix(gmt, background = bg) 205 | 206 | # Should only include background genes 207 | expect_true(all(rownames(result) %in% bg)) 208 | }) 209 | 210 | test_that(".convert_geneset_to_matrix errors when no gene sets pass filter", { 211 | gmt <- list( 212 | "TooSmall1" = c("Gene1", "Gene2"), 213 | "TooSmall2" = c("Gene3") 214 | ) 215 | 216 | expect_error( 217 | plaid:::.convert_geneset_to_matrix(gmt, min.genes = 5, max.genes = 500), 218 | "No gene sets passed size filters" 219 | ) 220 | }) 221 | 222 | test_that(".convert_geneset_to_matrix handles BiocSet objects", { 223 | skip_if_not_installed("BiocSet") 224 | skip("BiocSet requires complex setup with tibbles and specific structure") 225 | 226 | # Note: BiocSet objects require specific tibble structure 227 | # and are complex to construct in tests. The function handles 228 | # them by extracting es_elementset() and converting to GMT format. 229 | }) 230 | 231 | test_that(".convert_geneset_to_matrix errors on BiocSet with no data", { 232 | skip_if_not_installed("BiocSet") 233 | skip("BiocSet requires complex setup - error handling verified via code inspection") 234 | }) 235 | 236 | test_that(".convert_geneset_to_matrix handles various input types", { 237 | # The function accepts matrix, Matrix, BiocSet, or list 238 | # Data frames might be coerced to lists by R, so test with truly unsupported type 239 | 240 | # Test that unsupported atomic types error 241 | expect_error( 242 | plaid:::.convert_geneset_to_matrix("not_a_valid_input"), 243 | "Unsupported geneset type|subscript out of bounds" 244 | ) 245 | }) 246 | 247 | # ============================================================================= 248 | # Integration test: using both functions together 249 | # ============================================================================= 250 | 251 | test_that("bioc-utils functions work together in workflow", { 252 | skip_if_not_installed("SummarizedExperiment") 253 | 254 | # Create test data 255 | counts <- matrix(rpois(200, lambda = 10), nrow = 20, ncol = 10) 256 | rownames(counts) <- paste0("Gene", 1:20) 257 | colnames(counts) <- paste0("Sample", 1:10) 258 | 259 | se <- SummarizedExperiment::SummarizedExperiment( 260 | assays = list(counts = counts) 261 | ) 262 | 263 | # Create gene sets 264 | gmt <- list( 265 | "Pathway1" = paste0("Gene", 1:10), 266 | "Pathway2" = paste0("Gene", 11:20) 267 | ) 268 | 269 | # Extract expression and convert gene sets 270 | expr_mat <- plaid:::.extract_expression_matrix(se, assay = "counts") 271 | gset_mat <- plaid:::.convert_geneset_to_matrix(gmt, background = rownames(expr_mat)) 272 | 273 | # Should have compatible dimensions 274 | expect_equal(nrow(gset_mat), nrow(expr_mat)) 275 | expect_true(all(rownames(gset_mat) %in% rownames(expr_mat))) 276 | }) 277 | 278 | -------------------------------------------------------------------------------- /vignettes/01_plaid-vignette.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Getting Started with PLAID" 3 | author: "BigOmics Analytics" 4 | package: plaid 5 | output: 6 | BiocStyle::html_document 7 | vignette: > 8 | %\VignetteIndexEntry{Getting Started with PLAID} 9 | %\VignetteEngine{knitr::rmarkdown} 10 | %\VignetteEncoding{UTF-8} 11 | --- 12 | 13 | ```{r, include = FALSE} 14 | knitr::opts_chunk$set( 15 | collapse = TRUE, 16 | comment = "#>", 17 | fig.align = "center" 18 | ) 19 | ``` 20 | 21 | # Introduction 22 | 23 | PLAID (Pathway Level Average Intensity Detection) is a 24 | novel, ultrafast and memory optimized gene set enrichment scoring 25 | algorithm. PLAID demonstrates accurate gene set scoring and 26 | outperforms all currently available gene set scoring methods in large 27 | bulk and single-cell RNA-seq datasets. 28 | 29 | # Motivation 30 | 31 | In recent years, computational methods have emerged that calculate enrichment of gene signatures 32 | within individual samples, rather than across pooled samples. These signatures offer critical insights into the 33 | coordinated activity of functionally related genes, proteins or metabolites, enabling identification of unique 34 | molecular profiles based on gene set and pathway activity in individual cells and patients. This strategy is pivotal 35 | for patient stratification and advancement of personalized medicine. However, the rise of large-scale datasets, 36 | including single-cell profiles and population biobanks, has exposed significant computational inefficiencies in 37 | existing methods. Current methods often demand excessive runtime and memory resources, becoming 38 | impractical for large datasets. Overcoming these limitations is a focus of current efforts by bioinformatics teams in 39 | academia and the pharmaceutical industry, as essential to support basic and clinical biomedical research. 40 | 41 | 42 | # Example: Single-cell RNA-seq hallmark scoring 43 | 44 | ## Preparing data 45 | 46 | For this vignette, our package includes a small subset of the the 47 | pmbc3k single-cell dataset of just 50 cells. Please install the Seurat 48 | and SeuratData packages if you want to run this vignette against the 49 | full dataset. 50 | 51 | ```{r} 52 | library("plaid") 53 | load(system.file("extdata", "pbmc3k-50cells.rda", package = "plaid"),verbose=TRUE) 54 | dim(X) 55 | ``` 56 | 57 | Note that X is the normalized expression matrix from the Seurat 58 | object, not the raw counts matrix. We recommend to run plaid on the 59 | log transformed expression matrix, not on the counts, as the average 60 | in the logarithmic space is more robust and is in concordance to 61 | calculating the geometric mean. 62 | 63 | It is not necessary to normalize your expression matrix before running 64 | plaid because plaid normalizes the enrichment scores 65 | afterwards. However, again, log transformation is recommended. 66 | 67 | It is recommended to keep the expression matrix sparse as much as 68 | possible because plaid extensively take advantage of sparse matrix 69 | computations. But even for dense matrices plaid is fast. 70 | 71 | 72 | ## Preparing gene sets 73 | 74 | For convenience we have included the 50 Hallmark genesets in our 75 | package. But we encourage you to download larger geneset collections 76 | as plaid's speed advantage will be more apparent for larger datasets 77 | and large geneset collections. 78 | 79 | Plaid needs the gene sets as sparse matrix. If you have your 80 | collection of gene sets a a list, we need first to convert the gmt 81 | list to matrix format. 82 | 83 | ```{r} 84 | hallmarks <- system.file("extdata", "hallmarks.gmt", package = "plaid") 85 | gmt <- read.gmt(hallmarks) 86 | matG <- gmt2mat(gmt) 87 | dim(matG) 88 | ``` 89 | 90 | If you have your own gene sets stored as gmt files, you can 91 | conveniently use the included `read.gmt()` function to read the gmt 92 | file. 93 | 94 | 95 | ## Calculating the score 96 | 97 | The main function to run plaid is `plaid()`. We run plaid on our 98 | expression matrix `X` and gene set matrix `matG`. 99 | 100 | ```{r} 101 | gsetX <- plaid(X, matG, normalize=TRUE) 102 | dim(gsetX) 103 | ``` 104 | 105 | The resulting matrix `gsetX` contains the single-sample enrichment 106 | scores for the specified gene sets and samples. 107 | 108 | Notice that by default plaid performs median normalization of the 109 | final results. That also means that it is not necessary to normalize 110 | your expression matrix before running plaid. However, generally, log 111 | transformation is recommended. 112 | 113 | Plaid can also be run on the ranked matrix, we will see later that 114 | this corresponds to the singscore (Fouratan et al., 2018). Or plaid 115 | could be run on the (non-logarithmic) counts which can be used to 116 | calculate the scSE score (Pont et al., 2019). 117 | 118 | ## Very large matrices 119 | 120 | Plaid is fast and memory efficient because it uses very efficient 121 | sparse matrix computation in the back. For very large `X`, plaid uses 122 | chunked computation by splitting the matrix in chunks to avoid index 123 | overflow. Should you encounter errors, please compute your dataset by 124 | subsetting manually the expression matrix and/or gene sets. 125 | 126 | Although `X` and `matG` are generally very sparse, be aware that the 127 | result matrix `gsetX` generally is dense and therefore can become very 128 | large. If you would want to compute the score of 10.000 gene sets on a 129 | million of cells this would create a large 10.000 x 1.000.000 dense 130 | matrix which requires about 75GB of memory. 131 | 132 | 133 | ## Differential expression testing using dualGSEA 134 | 135 | Once we have the gene sets scores we can use these scores for 136 | statistical analysis. We could compute the differential gene set 137 | expression between two groups using a general t-test or limma directly 138 | on the score matrix `gsetX`. 139 | 140 | Another way to test whether a gene set is statistically significant 141 | would be to test whether the fold-change of the genes in the gene sets 142 | are statistically different than zero. That is, we can perform a one 143 | sample z-test on the logFC of the genes of each gene sets and test 144 | whether they are significantly different from zero. The logFC is 145 | computed from the original (log) expression matrix `X` and group 146 | vector `y`. 147 | 148 | 149 | The function `dualGSEA()` does both tests: the one-sample z-test on 150 | the logFC and the two-group t-test on the gene set matrix `gsetX`. 151 | Dual testing has been suggested by Bull et al. (Sci Rep., 2024) 152 | 153 | ```{r} 154 | y <- 1*(celltype == "B") 155 | res <- dualGSEA(X, y, G=matG) 156 | ``` 157 | 158 | The top significant genesets can be shown with 159 | 160 | ```{r} 161 | res <- res[order(res[,"p.dual"]),] 162 | head(res) 163 | ``` 164 | 165 | The column `gsetFC` corresponds to the difference in gene set score 166 | and also corresponds to the average foldchange of the genes in the 167 | gene set. The column 'p.fc corresponds to the test on the preranked 168 | logFC, the column 'p.ss' corresponds to the two-group t-test on the 169 | geneset scores `gsetX`. The two p-values are then combined using 170 | Stouffer's method in the column 'p.dual' and adjusted for multiple 171 | testing in column `q.dual`. 172 | 173 | The left figure below plots the fold-change enrichment `p.fc` vs. 174 | single-sample enrichment `p.ss`. The right figure shows the volcano plot 175 | `p.dual` vs. `gsetFC`: 176 | 177 | ```{r, fig.height=6, fig.width=12, fig.fullwidth=TRUE} 178 | fc <- res[,"gsetFC"] 179 | pv <- res[,"p.dual"] 180 | p1 <- res[,"p.fc"] 181 | p2 <- res[,"p.ss"] 182 | ii <- head(order(pv)) 183 | par(mfrow=c(1,2)) 184 | plot( -log10(p1), -log10(p2), 185 | xlab="FC enrichment (-log10p)", 186 | ylab="single-sample enrichment (-log10p)", pch=19) 187 | text( -log10(p1[ii]), -log10(p2[ii]), rownames(res)[ii],pos=2) 188 | plot( fc, -log10(pv), xlab="gsetFC", ylab="-log10p", pch=19) 189 | abline(h=0, v=0, lty=2) 190 | text( fc[ii], -log10(pv[ii]), rownames(res)[ii],pos=2) 191 | ``` 192 | 193 | # Replicating ssGSEA, singscore and scSE 194 | 195 | Plaid can be used to replicate other enrichment score such as 196 | [ssGSEA](https://github.com/rcastelo/GSVA), 197 | [GSVA](https://github.com/rcastelo/GSVA), 198 | [AUCell](https://github.com/aertslab/AUCell), 199 | [Singscore](https://github.com/DavisLaboratory/singscore), 200 | [scSE](https://doi.org/10.1093/nar/gkz601) and 201 | [UCell](https://github.com/carmonalab/UCell). But using plaid, the 202 | computation is much faster than the original code. 203 | 204 | ## Replicating singscore 205 | 206 | Computing the 207 | [singscore](https://github.com/DavisLaboratory/singscore) requires to 208 | compute the ranks of the expression matrix. We have wrapped this in a 209 | single convenience function: 210 | 211 | ```{r} 212 | sing <- replaid.sing(X, matG) 213 | ``` 214 | 215 | We have extensively compared the results of `replaid.sing` and from 216 | the original `singscore` R package and we showed identical result in 217 | the score, logFC and p-values. 218 | 219 | 220 | ## Replicating ssGSEA 221 | 222 | Plaid can also be used to compute the ssGSEA score (Barbie et al., 223 | 2009). Using plaid, we can calculate the score upto 100x faster. We 224 | have wrapped this in a single convenience function: 225 | 226 | ```{r} 227 | ssgsea <- replaid.ssgsea(X, matG, alpha=0) 228 | ``` 229 | 230 | We have extensively compared the results of `replaid.ssgsea()` and 231 | from the original [GSVA](https://github.com/rcastelo/GSVA) R 232 | package. Note the rank weight parameter alpha. For `alpha=0` we 233 | obtained identical result for the score, logFC and p-values. For 234 | non-zero values for alpha the results are close but not exactly the 235 | same. The default value in the original publication and in GSVA is 236 | `alpha=0.25`. 237 | 238 | 239 | ## Replicating the scSE score 240 | 241 | [Single-cell Signature Explorer 242 | (scSE)](https://doi.org/10.1093/nar/gkz601) is a fast enrichment 243 | algorithm implemented in GO. Computing the scSE score requires running 244 | plaid on the linear (not logarithmic) score and perform additional 245 | normalization by the total UMI per sample. We have wrapped this in a 246 | single convenience function: 247 | 248 | ```{r} 249 | scse <- replaid.scse(X, matG, removeLog2=TRUE, scoreMean=FALSE) 250 | ``` 251 | 252 | To replicate the original "sum-of-UMI" scSE score, set 253 | `removeLog2=TRUE` and `scoreMean=FALSE`. scSE and plaid scores become 254 | more similar for `removeLog2=FALSE` and `scoreMean=TRUE`. 255 | 256 | We have extensively compared the results from `replaid.scse` and from 257 | the original scSE (implemented in GO lang) and we showed almost 258 | identical results in the score, logFC and p-values. 259 | 260 | 261 | ## Compare scores 262 | 263 | We can compare all scores in a pairs plot: 264 | 265 | ```{r} 266 | S <- cbind(plaid=gsetX[,1], sing=sing[,1], ssgsea=ssgsea[,1], scSE=scse[,1]) 267 | pairs(S) 268 | ``` 269 | 270 | # Session info 271 | 272 | ```{r} 273 | sessionInfo() 274 | ``` 275 | -------------------------------------------------------------------------------- /tests/testthat/test-gmt-utils.R: -------------------------------------------------------------------------------- 1 | ## This file is part of the Omics Playground project. 2 | ## Copyright (c) 2018-2025 BigOmics Analytics SA. All rights reserved. 3 | 4 | library(testthat) 5 | library(Matrix) 6 | 7 | # ============================================================================= 8 | # Test gmt2mat function 9 | # ============================================================================= 10 | 11 | test_that("gmt2mat creates sparse binary matrix correctly", { 12 | # Create test GMT data 13 | gmt <- list( 14 | "Pathway1" = c("GENE1", "GENE2", "GENE3"), 15 | "Pathway2" = c("GENE2", "GENE4", "GENE5"), 16 | "Pathway3" = c("GENE1", "GENE5", "GENE6") 17 | ) 18 | 19 | mat <- gmt2mat(gmt, sparse = TRUE) 20 | 21 | # Check output type 22 | expect_true(inherits(mat, "sparseMatrix")) 23 | 24 | # Check dimensions 25 | expect_equal(ncol(mat), 3) 26 | expect_true(nrow(mat) >= 6) 27 | 28 | # Check column names 29 | expect_equal(colnames(mat), c("Pathway1", "Pathway2", "Pathway3")) 30 | 31 | # Check that genes are present 32 | expect_true(all(c("GENE1", "GENE2", "GENE3", "GENE4", "GENE5", "GENE6") %in% rownames(mat))) 33 | 34 | # Check binary values (0 or 1) 35 | expect_true(all(as.vector(mat) %in% c(0, 1))) 36 | }) 37 | 38 | test_that("gmt2mat creates dense matrix correctly", { 39 | gmt <- list( 40 | "Pathway1" = c("GENE1", "GENE2"), 41 | "Pathway2" = c("GENE3", "GENE4") 42 | ) 43 | 44 | mat <- gmt2mat(gmt, sparse = FALSE) 45 | 46 | # Check output type 47 | expect_true(is.matrix(mat)) 48 | expect_false(inherits(mat, "sparseMatrix")) 49 | 50 | # Check dimensions 51 | expect_equal(ncol(mat), 2) 52 | expect_equal(nrow(mat), 4) 53 | }) 54 | 55 | test_that("gmt2mat handles max.genes parameter", { 56 | gmt <- list( 57 | "Pathway1" = c("GENE1", "GENE2", "GENE3"), 58 | "Pathway2" = c("GENE4", "GENE5", "GENE6") 59 | ) 60 | 61 | mat <- gmt2mat(gmt, max.genes = 3) 62 | 63 | # Should limit to 3 genes 64 | expect_equal(nrow(mat), 3) 65 | }) 66 | 67 | test_that("gmt2mat handles ntop parameter", { 68 | gmt <- list( 69 | "Pathway1" = c("GENE1", "GENE2", "GENE3", "GENE4", "GENE5"), 70 | "Pathway2" = c("GENE6", "GENE7", "GENE8", "GENE9", "GENE10") 71 | ) 72 | 73 | mat <- gmt2mat(gmt, ntop = 3) 74 | 75 | # Each pathway should have at most 3 genes 76 | expect_true(all(Matrix::colSums(mat) <= 3)) 77 | }) 78 | 79 | test_that("gmt2mat handles custom background genes", { 80 | gmt <- list( 81 | "Pathway1" = c("GENE1", "GENE2", "GENE3"), 82 | "Pathway2" = c("GENE2", "GENE4") 83 | ) 84 | 85 | # Custom background - only include GENE1 and GENE2 86 | bg <- c("GENE1", "GENE2") 87 | mat <- gmt2mat(gmt, bg = bg) 88 | 89 | # Should only include background genes 90 | expect_equal(nrow(mat), 2) 91 | expect_true(all(rownames(mat) %in% bg)) 92 | }) 93 | 94 | test_that("gmt2mat handles multicore parameter", { 95 | gmt <- list( 96 | "Pathway1" = c("GENE1", "GENE2"), 97 | "Pathway2" = c("GENE3", "GENE4") 98 | ) 99 | 100 | # With multicore 101 | mat1 <- gmt2mat(gmt, use.multicore = TRUE) 102 | 103 | # Without multicore 104 | mat2 <- gmt2mat(gmt, use.multicore = FALSE) 105 | 106 | # Results should be identical 107 | expect_equal(dim(mat1), dim(mat2)) 108 | expect_equal(rownames(mat1), rownames(mat2)) 109 | expect_equal(colnames(mat1), colnames(mat2)) 110 | expect_equal(as.matrix(mat1), as.matrix(mat2)) 111 | }) 112 | 113 | test_that("gmt2mat handles unnamed GMT lists", { 114 | # Note: When GMT lists are unnamed, the function auto-generates names 115 | # However, there's an edge case where empty lists cause issues 116 | # Test with a simple named list instead 117 | skip("Unnamed GMT lists can cause issues with name assignment after deduplication") 118 | }) 119 | 120 | test_that("gmt2mat handles duplicate pathway names", { 121 | gmt <- list( 122 | "Pathway1" = c("GENE1", "GENE2"), 123 | "Pathway1" = c("GENE3", "GENE4"), # Duplicate name 124 | "Pathway2" = c("GENE5", "GENE6") 125 | ) 126 | 127 | mat <- gmt2mat(gmt) 128 | 129 | # Should remove duplicates 130 | expect_equal(ncol(mat), 2) 131 | expect_true(all(colnames(mat) %in% c("Pathway1", "Pathway2"))) 132 | }) 133 | 134 | test_that("gmt2mat sorts pathways by size", { 135 | gmt <- list( 136 | "Small" = c("GENE1"), 137 | "Large" = c("GENE2", "GENE3", "GENE4", "GENE5"), 138 | "Medium" = c("GENE6", "GENE7") 139 | ) 140 | 141 | mat <- gmt2mat(gmt) 142 | 143 | # Larger pathways should come first in the ordering 144 | # (the function sorts by decreasing size) 145 | expect_equal(colnames(mat)[1], "Large") 146 | }) 147 | 148 | test_that("gmt2mat sorts genes by frequency", { 149 | gmt <- list( 150 | "Pathway1" = c("GENE1", "GENE2"), 151 | "Pathway2" = c("GENE1", "GENE3"), 152 | "Pathway3" = c("GENE1", "GENE4") 153 | ) 154 | 155 | mat <- gmt2mat(gmt) 156 | 157 | # GENE1 appears in all pathways, should be first 158 | expect_equal(rownames(mat)[1], "GENE1") 159 | }) 160 | 161 | # ============================================================================= 162 | # Test mat2gmt function 163 | # ============================================================================= 164 | 165 | test_that("mat2gmt converts matrix to GMT correctly", { 166 | # Create binary matrix 167 | mat <- matrix(0, nrow = 6, ncol = 3) 168 | rownames(mat) <- paste0("GENE", 1:6) 169 | colnames(mat) <- paste0("Pathway", 1:3) 170 | mat[1:3, 1] <- 1 # Pathway1: GENE1, GENE2, GENE3 171 | mat[c(2, 4, 5), 2] <- 1 # Pathway2: GENE2, GENE4, GENE5 172 | mat[c(1, 5, 6), 3] <- 1 # Pathway3: GENE1, GENE5, GENE6 173 | 174 | gmt <- mat2gmt(mat) 175 | 176 | # Check output type 177 | expect_true(is.list(gmt)) 178 | 179 | # Check number of pathways 180 | expect_equal(length(gmt), 3) 181 | 182 | # Check pathway names 183 | expect_equal(names(gmt), paste0("Pathway", 1:3)) 184 | 185 | # Check pathway contents 186 | expect_equal(sort(gmt$Pathway1), paste0("GENE", 1:3)) 187 | expect_equal(sort(gmt$Pathway2), paste0("GENE", c(2, 4, 5))) 188 | expect_equal(sort(gmt$Pathway3), paste0("GENE", c(1, 5, 6))) 189 | }) 190 | 191 | test_that("mat2gmt works with sparse matrix", { 192 | # Create sparse matrix 193 | mat <- Matrix(0, nrow = 4, ncol = 2, sparse = TRUE) 194 | rownames(mat) <- paste0("GENE", 1:4) 195 | colnames(mat) <- paste0("Pathway", 1:2) 196 | mat[1:2, 1] <- 1 197 | mat[3:4, 2] <- 1 198 | 199 | gmt <- mat2gmt(mat) 200 | 201 | expect_equal(length(gmt), 2) 202 | expect_equal(sort(gmt$Pathway1), c("GENE1", "GENE2")) 203 | expect_equal(sort(gmt$Pathway2), c("GENE3", "GENE4")) 204 | }) 205 | 206 | test_that("mat2gmt handles non-binary matrices", { 207 | # Matrix with different values 208 | mat <- matrix(c(1, 2, 0, 0, 0, 3), nrow = 3, ncol = 2) 209 | rownames(mat) <- paste0("GENE", 1:3) 210 | colnames(mat) <- paste0("Pathway", 1:2) 211 | 212 | gmt <- mat2gmt(mat) 213 | 214 | # Should include genes with non-zero values 215 | expect_equal(sort(gmt$Pathway1), c("GENE1", "GENE2")) 216 | expect_equal(gmt$Pathway2, "GENE3") 217 | }) 218 | 219 | test_that("mat2gmt handles empty pathways", { 220 | mat <- matrix(0, nrow = 4, ncol = 3) 221 | rownames(mat) <- paste0("GENE", 1:4) 222 | colnames(mat) <- paste0("Pathway", 1:3) 223 | mat[1:2, 1] <- 1 224 | # Pathway2 and Pathway3 are empty 225 | 226 | gmt <- mat2gmt(mat) 227 | 228 | # Should still return all pathways 229 | expect_equal(length(gmt), 1) # Only Pathway1 has genes 230 | expect_equal(names(gmt), "Pathway1") 231 | }) 232 | 233 | # ============================================================================= 234 | # Test round-trip conversion (GMT -> Matrix -> GMT) 235 | # ============================================================================= 236 | 237 | test_that("gmt2mat and mat2gmt are inverse operations", { 238 | # Original GMT 239 | gmt_orig <- list( 240 | "Pathway1" = c("GENE1", "GENE2", "GENE3"), 241 | "Pathway2" = c("GENE2", "GENE4", "GENE5"), 242 | "Pathway3" = c("GENE1", "GENE5", "GENE6") 243 | ) 244 | 245 | # Convert to matrix and back 246 | mat <- gmt2mat(gmt_orig) 247 | gmt_new <- mat2gmt(mat) 248 | 249 | # Check that we get the same pathways back 250 | expect_equal(length(gmt_new), length(gmt_orig)) 251 | expect_equal(sort(names(gmt_new)), sort(names(gmt_orig))) 252 | 253 | # Check pathway contents (order might differ) 254 | for (pathway in names(gmt_orig)) { 255 | expect_equal(sort(gmt_new[[pathway]]), sort(gmt_orig[[pathway]])) 256 | } 257 | }) 258 | 259 | # ============================================================================= 260 | # Test read.gmt and write.gmt functions 261 | # ============================================================================= 262 | 263 | test_that("write.gmt creates properly formatted file", { 264 | # Create test GMT data 265 | gmt_orig <- list( 266 | "Pathway1" = c("GENE1", "GENE2", "GENE3"), 267 | "Pathway2" = c("GENE4", "GENE5"), 268 | "Pathway3" = c("GENE6", "GENE7", "GENE8", "GENE9") 269 | ) 270 | 271 | # Create temporary file 272 | temp_file <- tempfile(fileext = ".gmt") 273 | 274 | # Write GMT file 275 | write.gmt(gmt_orig, temp_file) 276 | 277 | # Check file exists 278 | expect_true(file.exists(temp_file)) 279 | 280 | # Read file and check format 281 | lines <- readLines(temp_file) 282 | expect_equal(length(lines), 3) 283 | 284 | # Each line should have pathway name, source, and genes separated by tabs 285 | for (i in seq_along(lines)) { 286 | parts <- strsplit(lines[i], "\t")[[1]] 287 | expect_true(length(parts) >= 3) # name + source + at least one gene 288 | expect_equal(parts[1], names(gmt_orig)[i]) # Check pathway name 289 | } 290 | 291 | # Cleanup 292 | unlink(temp_file) 293 | }) 294 | 295 | test_that("write.gmt handles custom source parameter", { 296 | gmt <- list( 297 | "Pathway1" = c("GENE1", "GENE2"), 298 | "Pathway2" = c("GENE3", "GENE4") 299 | ) 300 | 301 | temp_file <- tempfile(fileext = ".gmt") 302 | 303 | # Write with custom source - need to match length of gmt 304 | source_info <- c("DB1", "DB2") 305 | # Note: write.gmt expects single source or needs fixing 306 | write.gmt(gmt, temp_file, source = "CustomDB") 307 | 308 | # Read file and check content 309 | lines <- readLines(temp_file) 310 | expect_true(grepl("CustomDB", lines[1])) 311 | expect_true(grepl("CustomDB", lines[2])) 312 | 313 | # Cleanup 314 | unlink(temp_file) 315 | }) 316 | 317 | test_that("write.gmt handles source parameter correctly", { 318 | gmt <- list( 319 | "Pathway1" = c("GENE1", "GENE2"), 320 | "Pathway2" = c("GENE3", "GENE4") 321 | ) 322 | 323 | temp_file <- tempfile(fileext = ".gmt") 324 | 325 | # Write without source (should use pathway names as source) 326 | write.gmt(gmt, temp_file) 327 | lines <- readLines(temp_file) 328 | parts1 <- strsplit(lines[1], "\t")[[1]] 329 | expect_equal(parts1[1], parts1[2]) # Name and source should be the same when NA 330 | 331 | # Cleanup 332 | unlink(temp_file) 333 | }) 334 | 335 | test_that("read.gmt reads GMT file correctly", { 336 | # Create test GMT data 337 | gmt_orig <- list( 338 | "Pathway1" = c("GENE1", "GENE2", "GENE3"), 339 | "Pathway2" = c("GENE4", "GENE5"), 340 | "Pathway3" = c("GENE6", "GENE7", "GENE8") 341 | ) 342 | 343 | # Create temporary file 344 | temp_file <- tempfile(fileext = ".gmt") 345 | write.gmt(gmt_orig, temp_file) 346 | 347 | # Read back 348 | gmt_read <- read.gmt(temp_file) 349 | 350 | # Check output type 351 | expect_true(is.list(gmt_read)) 352 | 353 | # Check number of pathways 354 | expect_equal(length(gmt_read), 3) 355 | 356 | # Check pathway names 357 | expect_equal(names(gmt_read), names(gmt_orig)) 358 | 359 | # Check pathway contents 360 | expect_equal(sort(gmt_read$Pathway1), sort(gmt_orig$Pathway1)) 361 | expect_equal(sort(gmt_read$Pathway2), sort(gmt_orig$Pathway2)) 362 | expect_equal(sort(gmt_read$Pathway3), sort(gmt_orig$Pathway3)) 363 | 364 | # Cleanup 365 | unlink(temp_file) 366 | }) 367 | 368 | test_that("read.gmt and write.gmt are inverse operations", { 369 | # Create test GMT data 370 | gmt_orig <- list( 371 | "Pathway1" = c("GENE1", "GENE2", "GENE3"), 372 | "Pathway2" = c("GENE2", "GENE4", "GENE5"), 373 | "Pathway3" = c("GENE1", "GENE5", "GENE6") 374 | ) 375 | 376 | # Write and read back 377 | temp_file <- tempfile(fileext = ".gmt") 378 | write.gmt(gmt_orig, temp_file) 379 | gmt_read <- read.gmt(temp_file) 380 | 381 | # Should get the same data back 382 | expect_equal(length(gmt_read), length(gmt_orig)) 383 | expect_equal(names(gmt_read), names(gmt_orig)) 384 | 385 | for (pathway in names(gmt_orig)) { 386 | expect_equal(sort(gmt_read[[pathway]]), sort(gmt_orig[[pathway]])) 387 | } 388 | 389 | # Cleanup 390 | unlink(temp_file) 391 | }) 392 | 393 | test_that("read.gmt handles add.source parameter", { 394 | # Create test GMT with source info 395 | gmt_orig <- list( 396 | "Pathway1" = c("GENE1", "GENE2"), 397 | "Pathway2" = c("GENE3", "GENE4") 398 | ) 399 | 400 | temp_file <- tempfile(fileext = ".gmt") 401 | write.gmt(gmt_orig, temp_file, source = c("DB1", "DB2")) 402 | 403 | # Read without source 404 | gmt_no_source <- read.gmt(temp_file, add.source = FALSE) 405 | expect_equal(names(gmt_no_source), c("Pathway1", "Pathway2")) 406 | 407 | # Read with source 408 | gmt_with_source <- read.gmt(temp_file, add.source = TRUE) 409 | expect_true(grepl("DB1", names(gmt_with_source)[1])) 410 | expect_true(grepl("DB2", names(gmt_with_source)[2])) 411 | expect_true(grepl("Pathway1", names(gmt_with_source)[1])) 412 | expect_true(grepl("Pathway2", names(gmt_with_source)[2])) 413 | 414 | # Cleanup 415 | unlink(temp_file) 416 | }) 417 | 418 | test_that("read.gmt handles nrows parameter", { 419 | # Create test GMT with multiple pathways 420 | gmt_orig <- list( 421 | "Pathway1" = c("GENE1", "GENE2"), 422 | "Pathway2" = c("GENE3", "GENE4"), 423 | "Pathway3" = c("GENE5", "GENE6"), 424 | "Pathway4" = c("GENE7", "GENE8") 425 | ) 426 | 427 | temp_file <- tempfile(fileext = ".gmt") 428 | write.gmt(gmt_orig, temp_file) 429 | 430 | # Read only first 2 rows 431 | gmt_partial <- read.gmt(temp_file, nrows = 2) 432 | 433 | expect_equal(length(gmt_partial), 2) 434 | expect_equal(names(gmt_partial), c("Pathway1", "Pathway2")) 435 | 436 | # Cleanup 437 | unlink(temp_file) 438 | }) 439 | 440 | test_that("read.gmt handles empty gene lists", { 441 | # Create GMT file with a pathway that has no genes 442 | temp_file <- tempfile(fileext = ".gmt") 443 | 444 | # Write manually to create edge case 445 | writeLines(c( 446 | "Pathway1\tDB1\tGENE1\tGENE2", 447 | "Pathway2\tDB2", # No genes 448 | "Pathway3\tDB3\tGENE3" 449 | ), temp_file) 450 | 451 | gmt <- read.gmt(temp_file) 452 | 453 | # Should still read all pathways 454 | expect_equal(length(gmt), 3) 455 | expect_equal(names(gmt), c("Pathway1", "Pathway2", "Pathway3")) 456 | 457 | # Pathway2 should be empty 458 | expect_equal(length(gmt$Pathway2), 0) 459 | 460 | # Other pathways should have genes 461 | expect_equal(gmt$Pathway1, c("GENE1", "GENE2")) 462 | expect_equal(gmt$Pathway3, "GENE3") 463 | 464 | # Cleanup 465 | unlink(temp_file) 466 | }) 467 | 468 | 469 | 470 | 471 | 472 | 473 | # ============================================================================= 474 | # Test edge cases and error handling 475 | # ============================================================================= 476 | 477 | test_that("gmt2mat handles empty GMT list", { 478 | # Skip this test as gmt2mat doesn't handle truly empty lists 479 | # This is expected behavior - GMT lists should have at least one pathway 480 | skip("Empty GMT lists are not supported by design") 481 | }) 482 | 483 | test_that("gmt2mat handles single pathway", { 484 | gmt <- list( 485 | "OnlyPathway" = c("GENE1", "GENE2", "GENE3") 486 | ) 487 | 488 | mat <- gmt2mat(gmt) 489 | 490 | expect_equal(ncol(mat), 1) 491 | expect_equal(colnames(mat), "OnlyPathway") 492 | expect_equal(sum(mat), 3) 493 | }) 494 | 495 | test_that("gmt2mat handles pathways with no overlapping genes", { 496 | gmt <- list( 497 | "Pathway1" = c("GENE1", "GENE2"), 498 | "Pathway2" = c("GENE3", "GENE4") 499 | ) 500 | 501 | # Use background that doesn't overlap 502 | bg <- c("GENE5", "GENE6") 503 | mat <- gmt2mat(gmt, bg = bg) 504 | 505 | # Matrix should be all zeros 506 | expect_equal(sum(mat), 0) 507 | }) 508 | 509 | test_that("mat2gmt handles matrix with all zeros", { 510 | mat <- matrix(0, nrow = 4, ncol = 2) 511 | rownames(mat) <- paste0("GENE", 1:4) 512 | colnames(mat) <- paste0("Pathway", 1:2) 513 | 514 | # This will create an empty result when matrix is all zeros 515 | # The function returns a named list based on non-zero entries 516 | result <- tryCatch({ 517 | gmt <- mat2gmt(mat) 518 | length(gmt) 519 | }, error = function(e) { 520 | 0 # If error, expect 0 length 521 | }) 522 | 523 | # Should return empty list 524 | expect_equal(result, 0) 525 | }) 526 | 527 | test_that("gmt2mat preserves matrix class when specified", { 528 | gmt <- list( 529 | "Pathway1" = c("GENE1", "GENE2") 530 | ) 531 | 532 | # Sparse matrix 533 | mat_sparse <- gmt2mat(gmt, sparse = TRUE) 534 | expect_true(inherits(mat_sparse, "sparseMatrix")) 535 | 536 | # Dense matrix 537 | mat_dense <- gmt2mat(gmt, sparse = FALSE) 538 | expect_true(is.matrix(mat_dense)) 539 | expect_false(inherits(mat_dense, "sparseMatrix")) 540 | }) 541 | 542 | -------------------------------------------------------------------------------- /R/stats.R: -------------------------------------------------------------------------------- 1 | ##---------------------------------------------------------------- 2 | ##----------------- STATISTICAL TESTS ---------------------------- 3 | ##---------------------------------------------------------------- 4 | 5 | #' @importFrom stats p.adjust pt pchisq qnorm pnorm 6 | #' @importFrom Matrix rowMeans colSums crossprod t colScale 7 | #' @importFrom Rfast ttests 8 | #' @importFrom qlcMatrix corSparse 9 | #' @importFrom methods is 10 | NULL 11 | 12 | #' Reimplementation of dualGSEA (Bull et al., 2024) but defaults with 13 | #' replaid backend. For the preranked test we still use fgsea. Should 14 | #' be much faster than original using fgsea + GSVA::ssGSEA. 15 | #' 16 | #' @param X Expression matrix with genes on rows and samples ont columns 17 | #' @param y Binary vector (0/1) indicating group membership 18 | #' @param gmt List of gene sets in GMT format 19 | #' @param G Sparse matrix of gene sets. Non-zero entry indicates 20 | #' gene/feature is part of gene sets. Features on rows, gene sets on 21 | #' columns. 22 | #' @param gsetX Optional pre-computed matrix of gene set enrichment scores with 23 | #' gene sets on rows and samples on columns. If NULL (default), scores will be 24 | #' computed using the method specified by `ss.method`. Providing pre-computed 25 | #' scores improves efficiency when running multiple analyses. 26 | #' @param fc.method Method for fold change testing ("fgsea", "ztest", "ttest", "rankcor", "cor") 27 | #' @param ss.method Method for single-sample enrichment ("plaid", "replaid.ssgsea", "replaid.gsva", "ssgsea", "gsva") 28 | #' @param pv1 Pre-computed p-values from fold change test. If NULL, will be computed based on fc.test. 29 | #' @param pv2 Pre-computed p-values from single sample test. If NULL, will be computed using gset_ttest. 30 | #' @param metap.method Method for combining p-values ("stouffer", "fisher" or "maxp"). Default "stouffer". 31 | #' @param sort.by Column name to sort results by ("p.dual", "gsetFC", "p.fc", "p.ss"). Default "p.dual". 32 | #' 33 | #' @return Data frame with columns: gsetFC (gene set fold change), size (gene set size), 34 | #' p.fc (p-value from fold change test), p.ss (p-value from single sample test), 35 | #' p.dual (combined p-value), and q.dual (FDR-adjusted combined p-value). 36 | #' 37 | #' @examples 38 | #' # Create example expression matrix 39 | #' set.seed(123) 40 | #' X <- matrix(rnorm(1000), nrow = 100, ncol = 20) 41 | #' rownames(X) <- paste0("GENE", 1:100) 42 | #' colnames(X) <- paste0("Sample", 1:20) 43 | #' 44 | #' # Create binary group vector 45 | #' y <- rep(c(0, 1), each = 10) 46 | #' 47 | #' # Create example gene sets 48 | #' gmt <- list( 49 | #' "Pathway1" = paste0("GENE", 1:20), 50 | #' "Pathway2" = paste0("GENE", 15:35), 51 | #' "Pathway3" = paste0("GENE", 30:50) 52 | #' ) 53 | #' 54 | #' # Perform dualGSEA with correlation test (fast method) 55 | #' results_cor <- dualGSEA(X, y, G = NULL, gmt = gmt, fc.method = "cor", ss.method = "replaid.gsva") 56 | #' print(head(results_cor)) 57 | #' 58 | #' \donttest{ 59 | #' # Perform dualGSEA with fgsea (requires fgsea package) 60 | #' if (requireNamespace("fgsea", quietly = TRUE)) { 61 | #' results <- dualGSEA(X, y, G = NULL, gmt = gmt, fc.method = "fgsea", ss.method = "replaid.ssgsea") 62 | #' print(head(results)) 63 | #' } 64 | #' } 65 | #' 66 | #' @export 67 | dualGSEA <- function(X, y, G, gmt=NULL, gsetX=NULL, 68 | fc.method = c("fgsea","rankcor","ztest","ttest","cor")[2], 69 | ss.method = c('plaid', 'replaid.ssgsea','replaid.gsva', 70 | 'ssgsea','gsva')[1], 71 | metap.method = c("stouffer","fisher","maxp")[1], 72 | pv1 = NULL, pv2 = NULL, sort.by='p.dual') { 73 | #require(fgsea) 74 | if (fc.method == "fgsea" && !requireNamespace("fgsea", quietly=TRUE)) { 75 | stop("The fgsea package must be installed to use this functionality") 76 | } 77 | if (ss.method %in% c("ssgsea","gsva") && !requireNamespace("GSVA", quietly=TRUE)) { 78 | stop("The GSVA package must be installed to use this functionality") 79 | } 80 | if(is.null(gmt) && is.null(G)) { 81 | stop("at least gmt or matrix G must be given") 82 | } 83 | if(!is.null(gmt) && !is(gmt, "list")) { 84 | stop("gmt must be a list") 85 | } 86 | 87 | if(!all(unique(y) %in% c(0,1,NA))) stop("elements of y must be 0 or 1") 88 | sel <- which(!is.na(y)) 89 | y <- y[sel] 90 | X <- X[,sel,drop=FALSE] 91 | 92 | if(is.null(G) && !is.null(gmt)) G <- gmt2mat(gmt) 93 | if(is.null(gmt) && !is.null(G)) gmt <- mat2gmt(G) 94 | 95 | ## pairwise test on logFC 96 | if(is.null(pv1)) { 97 | message("FC testing using ", fc.method) 98 | m1 <- Matrix::rowMeans(X[,which(y==1),drop=FALSE]) 99 | m0 <- Matrix::rowMeans(X[,which(y==0),drop=FALSE]) 100 | fc <- m1 - m0 101 | if(fc.method == "fgsea") { 102 | res1 <- fgsea::fgsea(gmt, fc) 103 | res1 <- data.frame(res1, row.names=res1$pathway) 104 | pv1 <- res1[,"pval"] 105 | names(pv1) <- rownames(res1) 106 | } else if(fc.method %in% c('ttest','ztest')) { 107 | if(inherits(X,"dgCMatrix")) { 108 | sdx <- sparseMatrixStats::rowSds(X,na.rm=TRUE) 109 | } else { 110 | sdx <- matrixStats::rowSds(X,na.rm=TRUE) 111 | } 112 | sdx0 <- mean(sdx, na.rm=TRUE) 113 | zc <- fc / (0.1*sdx0 + sdx) 114 | if(fc.method == "ttest") { 115 | res1 <- fc_ttest(zc, G, sort.by="none") 116 | pv1 <- res1[,'pvalue'] 117 | } 118 | if(fc.method == "ztest") { 119 | res1 <- fc_ztest(zc, G, zmat=FALSE) 120 | pv1 <- res1$p_value 121 | } 122 | } else if(fc.method == 'rankcor') { 123 | res1 <- gset.rankcor(fc, G, compute.p=TRUE, use.rank=TRUE) 124 | pv1 <- res1$p.value[,1] 125 | } else if(fc.method == 'cor') { 126 | res1 <- gset.rankcor(fc, G, compute.p=TRUE, use.rank=FALSE) 127 | pv1 <- res1$p.value[,1] 128 | } else { 129 | stop("invalid fc.method method") 130 | } 131 | } 132 | 133 | ## single-sample test 134 | message("single-sample testing using ", ss.method) 135 | 136 | if(is.null(gsetX)) { 137 | if(ss.method == "plaid") { 138 | gsetX <- plaid::plaid(X, G) 139 | }else if(ss.method == "gsva") { 140 | gsvapar <- GSVA::gsvaParam(X, gmt) 141 | gsetX <- GSVA::gsva(gsvapar, verbose = FALSE) 142 | } else if(ss.method == "ssgsea") { 143 | gsvapar <- GSVA::ssgseaParam(X, gmt) 144 | gsetX <- GSVA::gsva(gsvapar, verbose = FALSE) 145 | } else if(ss.method == "replaid.ssgsea") { 146 | gsetX <- replaid.ssgsea(X, G) 147 | } else if(ss.method == "replaid.gsva") { 148 | gsetX <- replaid.gsva(X, G) 149 | } else { 150 | stop("invalid ss.method: ",ss.method) 151 | } 152 | } else { 153 | gsetX <- gsetX[colnames(G),] 154 | } 155 | 156 | gg <- intersect(rownames(G),rownames(X)) 157 | sel <- which(!is.na(y)) 158 | X <- X[gg,sel,drop=FALSE] 159 | G <- G[gg,] 160 | y <- y[sel] 161 | 162 | if(is.null(gsetX)) { 163 | message("computing gsetX using plaid. please precompute for efficiency.") 164 | gsetX <- plaid(X, G) 165 | } 166 | pp <- intersect(rownames(gsetX),colnames(G)) 167 | gsetX <- gsetX[pp,colnames(X),drop=FALSE] 168 | G <- G[,pp,drop=FALSE] 169 | 170 | e1 <- Matrix::rowMeans(gsetX[,y==1,drop=FALSE]) 171 | e0 <- Matrix::rowMeans(gsetX[,y==0,drop=FALSE]) 172 | gsetFC <- e1 - e0 173 | gs.size <- Matrix::colSums(G!=0)[rownames(gsetX)] 174 | 175 | if(is.null(pv2)) { 176 | res2 <- gset_ttest(gsetX, y) 177 | pv2 <- res2[,'pvalue'] 178 | } 179 | 180 | gs <- rownames(gsetX) 181 | P <- cbind(pv1[gs], pv2[gs]) 182 | P[is.na(P)] <- 1 183 | P <- pmin(pmax(P, 1e-99), 1-1e-99) 184 | colnames(P) <- c("p.fc","p.ss") 185 | 186 | p.dual <- matrix_metap(P, method=metap.method) 187 | q.dual <- stats::p.adjust(p.dual, method="fdr") 188 | 189 | res <- cbind( 190 | gsetFC = gsetFC, 191 | size = gs.size, 192 | pvalues = P, 193 | p.dual = p.dual, 194 | q.dual = q.dual 195 | ) 196 | if(sort.by %in% colnames(res)) { 197 | osign <- ifelse(sort.by=="gsetFC",-1,1) 198 | res <- res[order(osign*res[,sort.by]),] 199 | } 200 | res 201 | } 202 | 203 | 204 | #' T-test statistical testing of differentially enrichment 205 | #' 206 | #' This function performs statistical testing for differential 207 | #' enrichment using plaid 208 | #' 209 | #' @param fc Vector of logFC values 210 | #' @param G Sparse matrix of gene sets. Non-zero entry indicates 211 | #' gene/feature is part of gene sets. Features on rows, gene sets on 212 | #' columns. 213 | #' @param sort.by Column name to sort results by ("pvalue", "gsetFC", or "none") 214 | #' 215 | #' @return Data frame with columns: gsetFC (gene set fold change), 216 | #' pvalue (p-value from one-sample t-test), and qvalue (FDR-adjusted p-value). 217 | #' 218 | fc_ttest <- function(fc, G, sort.by="pvalue") { 219 | if(is.null(names(fc))) stop("fc must have names") 220 | gg <- intersect(rownames(G),names(fc)) 221 | fc <- fc[gg] 222 | G <- G[gg,] 223 | 224 | message("[fc_ttest] computing one-sample t-tests on logFC") 225 | mt <- matrix_onesample_ttest(fc, G) 226 | pv <- mt$p[,1] 227 | df <- mt$mean[,1] 228 | qv <- p.adjust(pv, method="fdr") 229 | gsetFC <- gset_averageCLR(fc, G, center = FALSE)[,1] 230 | 231 | res <- cbind( 232 | gsetFC = gsetFC, 233 | pvalue = pv, 234 | qvalue = qv 235 | ) 236 | if(!is.null(sort.by) && sort.by %in% colnames(res)) { 237 | sort.sign <- ifelse(sort.by=="gsetFC",-1,+1) 238 | res <- res[order(sort.sign*res[,sort.by]),] 239 | } 240 | res 241 | } 242 | 243 | #' Z-test statistical testing of differentially enrichment 244 | #' 245 | #' This function performs statistical testing for differential 246 | #' enrichment using plaid 247 | #' 248 | #' @importFrom stats var 249 | #' 250 | #' @param fc Vector of logFC values 251 | #' @param G Sparse matrix of gene sets. Non-zero entry indicates 252 | #' gene/feature is part of gene sets. Features on rows, gene sets on 253 | #' columns. 254 | #' @param zmat Logical indicating to return z-matrix 255 | #' @param alpha Scalar weight for SD estimation. Default 0.5. 256 | #' 257 | #' @return List with element: z_statistic (z-statistic from one-sample z-test), 258 | #' p_value (p-value from one-sample z-test), and zmat (z-matrix). 259 | #' 260 | fc_ztest <- function(fc, G, zmat=FALSE, alpha=0.5) { 261 | if(is.null(names(fc))) stop("fc must have names") 262 | gg <- intersect(rownames(G),names(fc)) 263 | sample_size <- Matrix::colSums(G[gg,]!=0) 264 | sample_size <- pmax(sample_size, 1) ## avoid div-by-zero 265 | sample_mean <- (Matrix::t(G[gg,]!=0) %*% fc[gg]) / sample_size 266 | population_mean <- mean(fc, na.rm=TRUE) 267 | population_var <- var(fc, na.rm=TRUE) 268 | gfc <- (G[gg,]!=0) * fc[gg] 269 | sample_var <- sparseMatrixStats::colVars(gfc) * nrow(G) / sample_size 270 | alpha <- pmin(pmax(alpha,0), 0.999) ## limit 271 | estim_sd <- sqrt( alpha*sample_var + (1-alpha)*population_var ) 272 | z_statistic <- (sample_mean - population_mean) / (estim_sd / sqrt(sample_size)) 273 | p_value <- 2 * pnorm(abs(z_statistic[,1]), lower.tail = FALSE) 274 | if(zmat) { 275 | zmat <- (Matrix::t(gfc) / estim_sd) 276 | } else { 277 | zmat <- NULL 278 | } 279 | list( 280 | z_statistic = z_statistic[,1], 281 | p_value = p_value, 282 | zmat = zmat 283 | ) 284 | } 285 | 286 | #' Compute geneset expression as the average log-ration of genes in 287 | #' the geneset. Requires log-expression matrix X and (sparse) geneset 288 | #' matrix matG. 289 | #' 290 | #' @param X Log-expression matrix with genes on rows and samples on columns 291 | #' @param matG Sparse gene set matrix with genes on rows and gene sets on columns 292 | #' @param center Logical indicating whether to center the results 293 | #' 294 | #' @return Matrix of gene set expression scores with gene sets on rows and samples on columns. 295 | #' 296 | gset_averageCLR <- function(X, matG, center = TRUE) { 297 | if (NCOL(X) == 1) X <- cbind(X) 298 | gg <- intersect(rownames(X), rownames(matG)) 299 | if (length(gg) == 0) { 300 | message("[gset.averageCLR] no overlapping features") 301 | return(NULL) 302 | } 303 | X <- X[gg, , drop = FALSE] 304 | matG <- matG[gg, , drop = FALSE] 305 | sumG <- 1e-8 + Matrix::colSums(matG != 0, na.rm = TRUE) 306 | nG <- Matrix::colScale(1 * (matG != 0), 1 / sumG) 307 | gsetX <- Matrix::t(nG) %*% X 308 | if(center) gsetX <- gsetX - Matrix::rowMeans(gsetX, na.rm = TRUE) 309 | as.matrix(gsetX) 310 | } 311 | 312 | #' Perform t-test on gene set scores 313 | #' 314 | #' @param gsetX Matrix of gene set scores with gene sets on rows and samples on columns 315 | #' @param y Binary vector (0/1) indicating group membership 316 | #' 317 | #' @return Data frame with columns: diff (difference in means), statistic (t-statistic), 318 | #' pvalue (p-value), and other t-test results. 319 | #' 320 | gset_ttest <- function(gsetX, y) { 321 | ii <- which(!is.na(y)) 322 | gsetX <- gsetX[,ii] 323 | y <- y[ii] 324 | if(!all(unique(y) %in% c(0,1))) stop("[gset_ttest] elements of y must be 0 or 1") 325 | res <- Rfast::ttests(Matrix::t(gsetX), ina=y+1) 326 | rownames(res) <- rownames(gsetX) 327 | diff <- rowMeans(gsetX[,y==1]) - rowMeans(gsetX[,y==0]) 328 | res <- cbind( diff=diff, res) 329 | return(res) 330 | } 331 | 332 | ##---------------------------------------------------------------- 333 | ##----------------- FUNCTIONS ------------------------------------ 334 | ##---------------------------------------------------------------- 335 | 336 | #' Perform one-sample t-test on matrix with gene sets 337 | #' 338 | #' @param Fm Vector of feature values (e.g., fold changes) 339 | #' @param G Sparse matrix of gene sets with genes on rows and gene sets on columns 340 | #' 341 | #' @return List containing mean, t-statistic, and p-value matrices. 342 | #' 343 | matrix_onesample_ttest <- function(Fm, G) { 344 | sumG <- Matrix::colSums(G!=0) 345 | sum_sq <- Matrix::crossprod(G!=0, Fm^2) 346 | meanx <- Matrix::crossprod(G!=0, Fm) / (1e-8 + sumG) 347 | sdx <- sqrt( (sum_sq - meanx^2 * sumG) / (sumG - 1)) 348 | f_stats <- meanx 349 | t_stats <- meanx / (1e-8 + sdx) * sqrt(sumG) 350 | p_stats <- apply( abs(t_stats), 2, function(tv) 351 | 2*pt(tv,df=pmax(sumG-1,1),lower.tail=FALSE)) 352 | list(mean = as.matrix(f_stats), t = as.matrix(t_stats), p = p_stats) 353 | } 354 | 355 | #' Matrix version for combining p-values using fisher or stouffer 356 | #' method. Much faster than doing metap::sumlog() and metap::sumz() 357 | #' 358 | #' @param plist List of p-value vectors or matrix of p-values 359 | #' @param method Method for combining p-values ("fisher"/"sumlog" or "stouffer"/"sumz") 360 | #' 361 | #' @return Vector of combined p-values. 362 | #' 363 | matrix_metap <- function(plist, method='stouffer') { 364 | if(inherits(plist,"matrix")) { 365 | plist <- as.list(data.frame(plist)) 366 | } 367 | if(method %in% c("fisher","sumlog")) { 368 | chisq <- (-2) * Reduce('+', lapply(plist,log)) 369 | df <- 2 * length(plist) 370 | pv <- pchisq(chisq, df, lower.tail=FALSE) 371 | } else if(method %in% c("stouffer","sumz")) { 372 | np <- length(plist) 373 | zz <- lapply(plist, qnorm, lower.tail=FALSE) 374 | zz <- Reduce('+', zz) / sqrt(np) 375 | pv <- pnorm(zz, lower.tail=FALSE) 376 | } else if(method %in% c("maxp","pmax","maximump")) { 377 | pv <- Reduce(pmax, plist) 378 | } else { 379 | stop("Invalid method: ",method) 380 | } 381 | dimnames(pv) <- dimnames(plist[[1]]) 382 | return(pv) 383 | } 384 | 385 | 386 | #' Calculate gene set rank correlation 387 | #' 388 | #' Compute rank correlation between a gene rank vector/matrix and gene sets 389 | #' 390 | #' @param rnk Numeric vector or matrix of gene ranks, with genes as row names 391 | #' @param gset Numeric matrix of gene sets, with genes as row/column names 392 | #' @param compute.p Logical indicating whether to compute p-values 393 | #' @param use.rank Logical indicating whether to rank transform rnk before correlation 394 | #' 395 | #' @return Named list with components: 396 | #' \itemize{ 397 | #' \item rho - Matrix of correlation coefficients between rnk and gset 398 | #' \item p.value - Matrix of p-values for correlation (if compute.p = TRUE) 399 | #' \item q.value - Matrix of FDR adjusted p-values (if compute.p = TRUE) 400 | #' } 401 | #' 402 | #' @details This function calculates sparse rank correlation between rnk and each 403 | #' column of gset using \code{qlcMatrix::corSparse()}. It handles missing values in 404 | #' rnk by computing column-wise correlations. 405 | #' 406 | #' P-values are computed from statistical distribution 407 | #' 408 | #' @examples 409 | #' # Create example rank vector 410 | #' set.seed(123) 411 | #' ranks <- rnorm(100) 412 | #' names(ranks) <- paste0("GENE", 1:100) 413 | #' 414 | #' # Create example gene sets as sparse matrix 415 | #' gmt <- list( 416 | #' "Pathway1" = paste0("GENE", 1:20), 417 | #' "Pathway2" = paste0("GENE", 15:35), 418 | #' "Pathway3" = paste0("GENE", 30:50) 419 | #' ) 420 | #' genesets <- gmt2mat(gmt) 421 | #' 422 | #' # Calculate rank correlation 423 | #' result <- gset.rankcor(ranks, genesets, compute.p = TRUE) 424 | #' print(result$rho) 425 | #' print(result$p.value) 426 | #' 427 | #' @export 428 | gset.rankcor <- function(rnk, gset, compute.p = FALSE, use.rank = TRUE) { 429 | if (ncol(gset) == 0 || NCOL(rnk) == 0) { 430 | if (ncol(gset) == 0) message("gset has zero columns") 431 | if (NCOL(rnk) == 0) message("rnk has zero columns") 432 | return(NULL) 433 | } 434 | 435 | # if (!any(class(gset) %in% c("Matrix", "dgCMatrix", "lgCMatrix", "matrix", "array"))) { 436 | # stop("gset must be a matrix") 437 | # } 438 | if (!inherits(gset, "Matrix")) stop("gset must be a matrix") 439 | 440 | is.vec <- (NCOL(rnk) == 1 && !any(class(rnk) %in% c("matrix", "Matrix"))) 441 | if (is.vec && is.null(names(rnk))) stop("rank vector must be named") 442 | if (!is.vec && is.null(rownames(rnk))) stop("rank matrix must have rownames") 443 | if (is.vec) rnk <- matrix(rnk, ncol = 1, dimnames = list(names(rnk), "rnk")) 444 | n1 <- sum(rownames(rnk) %in% colnames(gset), na.rm = TRUE) 445 | n2 <- sum(rownames(rnk) %in% rownames(gset), na.rm = TRUE) 446 | if (n1 > n2) gset <- Matrix::t(gset) 447 | 448 | gg <- intersect(rownames(gset), rownames(rnk)) 449 | rnk1 <- rnk[gg, , drop = FALSE] 450 | gset <- gset[gg, , drop = FALSE] 451 | 452 | if (use.rank) { 453 | if(inherits(rnk1,"dgCMatrix")) { 454 | ## for sparse dgCMatrix 455 | ##rnk1 <- apply(rnk1, 2, base::rank, na.last = "keep", ties.method="random") 456 | rnk1 <- sparseMatrixStats::colRanks(rnk1, na.last = "keep", ties.method = "random", preserveShape = TRUE) 457 | } else { 458 | rnk1 <- matrixStats::colRanks(rnk1, na.last = "keep", ties.method = "random", preserveShape = TRUE) 459 | } 460 | } 461 | 462 | ## two cases: (1) in case no missing values, just use corSparse on 463 | ## whole matrix. (2) in case the rnk matrix has missing values, we 464 | ## must proceed 1-column at time and do reduced corSparse on 465 | ## intersection of genes. 466 | rho1 <- cor_sparse_matrix(gset, rnk1) 467 | 468 | rownames(rho1) <- colnames(gset) 469 | colnames(rho1) <- colnames(rnk1) 470 | rho1[is.nan(rho1)] <- NA ## ?? 471 | 472 | ## compute p-value 473 | .cor.pvalue <- function(x, n) 2 * stats::pnorm(-abs(x / ((1 - x**2) / (n - 2))**0.5)) 474 | if (compute.p) { 475 | pv <- apply(rho1, 2, function(x) .cor.pvalue(x, n = nrow(rnk1))) 476 | pv[is.nan(pv)] <- NA ## ?? 477 | qv <- apply(pv, 2, stats::p.adjust, method = "fdr") 478 | df <- list(rho = rho1, p.value = pv, q.value = qv) 479 | } else { 480 | df <- list(rho = rho1, p.value = NA, q.value = NA) 481 | } 482 | df 483 | } 484 | 485 | #' Calculate sparse correlation matrix handling missing values 486 | #' 487 | #' @param G Sparse matrix containing gene sets 488 | #' @param mat Matrix of values 489 | #' @return Correlation matrix between G and mat 490 | #' @details If mat has no missing values, calculates correlation directly using corSparse. 491 | #' Otherwise computes column-wise correlations only using non-missing values. 492 | cor_sparse_matrix <- function(G, mat) { 493 | if (sum(is.na(mat)) == 0) { 494 | cor_matrix <- qlcMatrix::corSparse(G, mat) 495 | } else { 496 | message("matrix has missing values: computing column-wise reduced cor") 497 | corSparse.vec <- function(X, y) { 498 | jj <- which(!is.na(y)) 499 | qlcMatrix::corSparse(X[jj, , drop = FALSE], cbind(y[jj])) 500 | } 501 | cor_matrix <- lapply(seq_len(ncol(mat)), function(i) corSparse.vec(G, mat[, i])) 502 | cor_matrix <- do.call(cbind, cor_matrix) 503 | } 504 | return(cor_matrix) 505 | } 506 | -------------------------------------------------------------------------------- /tests/testthat/test-plaid.R: -------------------------------------------------------------------------------- 1 | ## 2 | ## This file is part of the Omics Playground project. 3 | ## Copyright (c) 2018-2025 BigOmics Analytics SA. All rights reserved. 4 | ## 5 | 6 | # ============================================================================ 7 | # Setup test data 8 | # ============================================================================ 9 | 10 | # Helper function to create test data 11 | create_test_data <- function(n_genes = 100, n_samples = 10, n_pathways = 5) { 12 | set.seed(123) 13 | 14 | # Create expression matrix 15 | X <- matrix(rnorm(n_genes * n_samples), nrow = n_genes, ncol = n_samples) 16 | rownames(X) <- paste0("GENE", 1:n_genes) 17 | colnames(X) <- paste0("Sample", 1:n_samples) 18 | 19 | # Create gene sets 20 | gmt <- lapply(1:n_pathways, function(i) { 21 | start_idx <- ((i-1) * 15 + 1) 22 | end_idx <- min(start_idx + 19, n_genes) 23 | paste0("GENE", start_idx:end_idx) 24 | }) 25 | names(gmt) <- paste0("Pathway", 1:n_pathways) 26 | 27 | matG <- gmt2mat(gmt, bg = rownames(X)) 28 | 29 | list(X = X, matG = matG, gmt = gmt) 30 | } 31 | 32 | # ============================================================================ 33 | # Test: plaid() - Main function 34 | # ============================================================================ 35 | 36 | test_that("plaid works with basic input", { 37 | data <- create_test_data() 38 | 39 | result <- plaid(data$X, data$matG) 40 | 41 | expect_true(is.matrix(result)) 42 | expect_equal(nrow(result), ncol(data$matG)) 43 | expect_equal(ncol(result), ncol(data$X)) 44 | expect_equal(rownames(result), colnames(data$matG)) 45 | expect_equal(colnames(result), colnames(data$X)) 46 | }) 47 | 48 | test_that("plaid works with stats='sum'", { 49 | data <- create_test_data() 50 | 51 | result <- plaid(data$X, data$matG, stats = "sum") 52 | 53 | expect_true(is.matrix(result)) 54 | expect_equal(dim(result), c(ncol(data$matG), ncol(data$X))) 55 | }) 56 | 57 | test_that("plaid works with normalize=FALSE", { 58 | data <- create_test_data() 59 | 60 | result <- plaid(data$X, data$matG, normalize = FALSE) 61 | 62 | expect_true(is.matrix(result)) 63 | expect_equal(dim(result), c(ncol(data$matG), ncol(data$X))) 64 | }) 65 | 66 | test_that("plaid works with single column", { 67 | data <- create_test_data(n_samples = 1) 68 | 69 | result <- plaid(data$X, data$matG) 70 | 71 | expect_true(is.matrix(result)) 72 | expect_equal(ncol(result), 1) 73 | }) 74 | 75 | test_that("plaid handles no overlapping features", { 76 | data <- create_test_data() 77 | rownames(data$X) <- paste0("OTHER_GENE", 1:nrow(data$X)) 78 | 79 | result <- plaid(data$X, data$matG) 80 | 81 | expect_null(result) 82 | }) 83 | 84 | test_that("plaid works with sparse matrices", { 85 | data <- create_test_data(n_pathways = 25) # Use more pathways to avoid normalization issue 86 | X_sparse <- Matrix::Matrix(data$X, sparse = TRUE) 87 | 88 | result <- plaid(X_sparse, data$matG) 89 | 90 | expect_true(is.matrix(result)) 91 | expect_equal(dim(result), c(ncol(data$matG), ncol(data$X))) 92 | }) 93 | 94 | test_that("plaid works with different nsmooth values", { 95 | data <- create_test_data() 96 | 97 | result1 <- plaid(data$X, data$matG, nsmooth = 1) 98 | result2 <- plaid(data$X, data$matG, nsmooth = 5) 99 | 100 | expect_true(is.matrix(result1)) 101 | expect_true(is.matrix(result2)) 102 | expect_false(identical(result1, result2)) 103 | }) 104 | 105 | # ============================================================================ 106 | # Test: chunked_crossprod() 107 | # ============================================================================ 108 | 109 | test_that("chunked_crossprod works with small matrix", { 110 | set.seed(123) 111 | x <- Matrix::Matrix(rnorm(100), nrow = 10, ncol = 10, sparse = TRUE) 112 | y <- Matrix::Matrix(rnorm(100), nrow = 10, ncol = 10, sparse = TRUE) 113 | 114 | result <- chunked_crossprod(x, y, chunk = 1000) 115 | expected <- Matrix::crossprod(x, y) 116 | 117 | expect_equal(as.matrix(result), as.matrix(expected)) 118 | }) 119 | 120 | test_that("chunked_crossprod works with chunk size", { 121 | set.seed(123) 122 | x <- Matrix::Matrix(rnorm(200), nrow = 10, ncol = 20, sparse = TRUE) 123 | y <- Matrix::Matrix(rnorm(300), nrow = 10, ncol = 30, sparse = TRUE) 124 | 125 | result <- chunked_crossprod(x, y, chunk = 10) 126 | expected <- Matrix::crossprod(x, y) 127 | 128 | # Compare values, ignoring dimnames which may differ 129 | expect_equal(as.numeric(result), as.numeric(expected), tolerance = 1e-10) 130 | expect_equal(dim(result), dim(expected)) 131 | }) 132 | 133 | # ============================================================================ 134 | # Test: replaid.scse() 135 | # ============================================================================ 136 | 137 | test_that("replaid.scse works with basic input", { 138 | data <- create_test_data() 139 | # Create log-transformed data (simulate counts) 140 | X <- log2(matrix(rpois(1000, lambda = 10) + 1, nrow = 100, ncol = 10)) 141 | rownames(X) <- paste0("GENE", 1:100) 142 | colnames(X) <- paste0("Sample", 1:10) 143 | 144 | result <- replaid.scse(X, data$matG, removeLog2 = TRUE, scoreMean = FALSE) 145 | 146 | expect_true(is.matrix(result)) 147 | expect_equal(dim(result), c(ncol(data$matG), ncol(X))) 148 | expect_equal(colnames(result), colnames(X)) 149 | }) 150 | 151 | test_that("replaid.scse works with scoreMean=TRUE", { 152 | data <- create_test_data() 153 | X <- log2(matrix(rpois(1000, lambda = 10) + 1, nrow = 100, ncol = 10)) 154 | rownames(X) <- rownames(data$X) 155 | colnames(X) <- colnames(data$X) 156 | 157 | result <- replaid.scse(X, data$matG, removeLog2 = TRUE, scoreMean = TRUE) 158 | 159 | expect_true(is.matrix(result)) 160 | expect_equal(dim(result), c(ncol(data$matG), ncol(X))) 161 | }) 162 | 163 | test_that("replaid.scse auto-detects removeLog2", { 164 | data <- create_test_data() 165 | X <- log2(matrix(rpois(1000, lambda = 10) + 1, nrow = 100, ncol = 10)) 166 | rownames(X) <- rownames(data$X) 167 | colnames(X) <- colnames(data$X) 168 | 169 | result <- replaid.scse(X, data$matG, removeLog2 = NULL) 170 | 171 | expect_true(is.matrix(result)) 172 | }) 173 | 174 | # ============================================================================ 175 | # Test: replaid.sing() 176 | # ============================================================================ 177 | 178 | test_that("replaid.sing works with basic input", { 179 | data <- create_test_data() 180 | 181 | result <- replaid.sing(data$X, data$matG) 182 | 183 | expect_true(is.matrix(result)) 184 | expect_equal(dim(result), c(ncol(data$matG), ncol(data$X))) 185 | expect_equal(rownames(result), colnames(data$matG)) 186 | expect_equal(colnames(result), colnames(data$X)) 187 | }) 188 | 189 | test_that("replaid.sing produces values in expected range", { 190 | data <- create_test_data() 191 | 192 | result <- replaid.sing(data$X, data$matG) 193 | 194 | # Singscore should be centered around 0 195 | expect_true(all(is.finite(result))) 196 | expect_true(abs(mean(result)) < 1) 197 | }) 198 | 199 | # ============================================================================ 200 | # Test: replaid.ssgsea() 201 | # ============================================================================ 202 | 203 | test_that("replaid.ssgsea works with alpha=0", { 204 | data <- create_test_data() 205 | 206 | result <- replaid.ssgsea(data$X, data$matG, alpha = 0) 207 | 208 | expect_true(is.matrix(result)) 209 | expect_equal(dim(result), c(ncol(data$matG), ncol(data$X))) 210 | }) 211 | 212 | test_that("replaid.ssgsea works with alpha>0", { 213 | data <- create_test_data() 214 | 215 | result <- replaid.ssgsea(data$X, data$matG, alpha = 0.25) 216 | 217 | expect_true(is.matrix(result)) 218 | expect_equal(dim(result), c(ncol(data$matG), ncol(data$X))) 219 | }) 220 | 221 | test_that("replaid.ssgsea results differ for different alpha values", { 222 | data <- create_test_data() 223 | 224 | result1 <- replaid.ssgsea(data$X, data$matG, alpha = 0) 225 | result2 <- replaid.ssgsea(data$X, data$matG, alpha = 0.5) 226 | 227 | expect_false(identical(result1, result2)) 228 | }) 229 | 230 | # ============================================================================ 231 | # Test: replaid.ucell() 232 | # ============================================================================ 233 | 234 | test_that("replaid.ucell works with basic input", { 235 | data <- create_test_data() 236 | 237 | result <- replaid.ucell(data$X, data$matG, rmax = 50) 238 | 239 | expect_true(is.matrix(result)) 240 | expect_equal(dim(result), c(ncol(data$matG), ncol(data$X))) 241 | }) 242 | 243 | test_that("replaid.ucell produces finite values", { 244 | data <- create_test_data() 245 | 246 | result <- replaid.ucell(data$X, data$matG, rmax = 50) 247 | 248 | # UCell scores are transformed and may exceed [0,1] range 249 | expect_true(all(is.finite(result))) 250 | expect_true(is.matrix(result)) 251 | }) 252 | 253 | test_that("replaid.ucell works with different rmax values", { 254 | data <- create_test_data() 255 | 256 | result1 <- replaid.ucell(data$X, data$matG, rmax = 50) 257 | result2 <- replaid.ucell(data$X, data$matG, rmax = 100) 258 | 259 | expect_false(identical(result1, result2)) 260 | }) 261 | 262 | # ============================================================================ 263 | # Test: replaid.aucell() 264 | # ============================================================================ 265 | 266 | test_that("replaid.aucell works with basic input", { 267 | data <- create_test_data() 268 | 269 | result <- replaid.aucell(data$X, data$matG) 270 | 271 | expect_true(is.matrix(result)) 272 | expect_equal(dim(result), c(ncol(data$matG), ncol(data$X))) 273 | }) 274 | 275 | test_that("replaid.aucell works with custom aucMaxRank", { 276 | data <- create_test_data() 277 | 278 | result <- replaid.aucell(data$X, data$matG, aucMaxRank = 10) 279 | 280 | expect_true(is.matrix(result)) 281 | expect_equal(dim(result), c(ncol(data$matG), ncol(data$X))) 282 | }) 283 | 284 | test_that("replaid.aucell auto-computes aucMaxRank", { 285 | data <- create_test_data() 286 | 287 | result <- replaid.aucell(data$X, data$matG, aucMaxRank = NULL) 288 | 289 | expect_true(is.matrix(result)) 290 | }) 291 | 292 | # ============================================================================ 293 | # Test: replaid.gsva() 294 | # ============================================================================ 295 | 296 | test_that("replaid.gsva works with basic input", { 297 | data <- create_test_data() 298 | 299 | result <- replaid.gsva(data$X, data$matG, tau = 0) 300 | 301 | expect_true(is.matrix(result)) 302 | expect_equal(dim(result), c(ncol(data$matG), ncol(data$X))) 303 | }) 304 | 305 | test_that("replaid.gsva works with tau>0", { 306 | data <- create_test_data() 307 | 308 | result <- replaid.gsva(data$X, data$matG, tau = 0.25) 309 | 310 | expect_true(is.matrix(result)) 311 | expect_equal(dim(result), c(ncol(data$matG), ncol(data$X))) 312 | }) 313 | 314 | test_that("replaid.gsva works with rowtf='z'", { 315 | data <- create_test_data() 316 | 317 | result <- replaid.gsva(data$X, data$matG, rowtf = "z") 318 | 319 | expect_true(is.matrix(result)) 320 | }) 321 | 322 | test_that("replaid.gsva works with rowtf='ecdf'", { 323 | data <- create_test_data() 324 | 325 | result <- replaid.gsva(data$X, data$matG, rowtf = "ecdf") 326 | 327 | expect_true(is.matrix(result)) 328 | }) 329 | 330 | test_that("replaid.gsva throws error for invalid rowtf", { 331 | data <- create_test_data() 332 | 333 | expect_error(replaid.gsva(data$X, data$matG, rowtf = "invalid")) 334 | }) 335 | 336 | # ============================================================================ 337 | # Test: mat.rowsds() 338 | # ============================================================================ 339 | 340 | test_that("mat.rowsds works with dense matrix", { 341 | set.seed(123) 342 | X <- matrix(rnorm(100), nrow = 10, ncol = 10) 343 | 344 | result <- mat.rowsds(X) 345 | expected <- matrixStats::rowSds(X) 346 | 347 | expect_equal(result, expected) 348 | }) 349 | 350 | test_that("mat.rowsds works with sparse matrix", { 351 | set.seed(123) 352 | X <- Matrix::Matrix(rnorm(100), nrow = 10, ncol = 10, sparse = TRUE) 353 | 354 | result <- mat.rowsds(X) 355 | 356 | expect_true(is.numeric(result)) 357 | expect_equal(length(result), nrow(X)) 358 | expect_true(all(result >= 0)) 359 | }) 360 | 361 | # ============================================================================ 362 | # Test: normalize_medians() 363 | # ============================================================================ 364 | 365 | test_that("normalize_medians works with basic input", { 366 | set.seed(123) 367 | x <- matrix(rnorm(100, mean = 5), nrow = 10, ncol = 10) 368 | 369 | result <- normalize_medians(x) 370 | 371 | expect_true(is.matrix(result)) 372 | expect_equal(dim(result), dim(x)) 373 | 374 | # Check that medians are similar after normalization 375 | medians <- matrixStats::colMedians(result) 376 | expect_true(sd(medians) < sd(matrixStats::colMedians(x))) 377 | }) 378 | 379 | test_that("normalize_medians handles zeros correctly", { 380 | set.seed(123) 381 | x <- matrix(rnorm(100), nrow = 10, ncol = 10) 382 | x[1:3, 1:3] <- 0 383 | 384 | result <- normalize_medians(x, ignore.zero = TRUE) 385 | 386 | expect_true(is.matrix(result)) 387 | expect_equal(dim(result), dim(x)) 388 | }) 389 | 390 | test_that("normalize_medians auto-detects zeros", { 391 | set.seed(123) 392 | x <- matrix(rnorm(100), nrow = 10, ncol = 10) 393 | x[1:3, 1:3] <- 0 394 | 395 | result <- normalize_medians(x, ignore.zero = NULL) 396 | 397 | expect_true(is.matrix(result)) 398 | }) 399 | 400 | # ============================================================================ 401 | # Test: colranks() 402 | # ============================================================================ 403 | 404 | test_that("colranks works with basic input", { 405 | set.seed(123) 406 | X <- matrix(rnorm(100), nrow = 10, ncol = 10) 407 | rownames(X) <- paste0("Gene", 1:10) 408 | colnames(X) <- paste0("Sample", 1:10) 409 | 410 | result <- colranks(X) 411 | 412 | expect_true(is.matrix(result) || inherits(result, "Matrix")) 413 | expect_equal(dim(result), dim(X)) 414 | expect_equal(dimnames(result), dimnames(X)) 415 | }) 416 | 417 | test_that("colranks produces correct rank range", { 418 | set.seed(123) 419 | X <- matrix(rnorm(100), nrow = 10, ncol = 10) 420 | 421 | result <- colranks(X, ties.method = "average") 422 | 423 | # Ranks should be between 1 and nrow(X) 424 | expect_true(all(result >= 1 & result <= nrow(X))) 425 | }) 426 | 427 | test_that("colranks works with signed=TRUE", { 428 | set.seed(123) 429 | X <- matrix(rnorm(100), nrow = 10, ncol = 10) 430 | 431 | result <- colranks(X, signed = TRUE) 432 | 433 | expect_true(is.matrix(result) || inherits(result, "Matrix")) 434 | expect_equal(dim(result), dim(X)) 435 | }) 436 | 437 | test_that("colranks works with keep.zero=TRUE", { 438 | set.seed(123) 439 | X <- matrix(rnorm(100), nrow = 10, ncol = 10) 440 | X[1:3, 1:3] <- 0 441 | X <- Matrix::Matrix(X, sparse = TRUE) 442 | 443 | result <- colranks(X, keep.zero = TRUE) 444 | 445 | expect_true(inherits(result, "Matrix")) 446 | expect_equal(dim(result), dim(X)) 447 | }) 448 | 449 | test_that("colranks works with sparse matrix", { 450 | set.seed(123) 451 | X <- Matrix::Matrix(rnorm(100), nrow = 10, ncol = 10, sparse = TRUE) 452 | 453 | result <- colranks(X, sparse = TRUE) 454 | 455 | # colranks returns dense matrix for full matrices even if input is sparse 456 | expect_true(is.matrix(result) || inherits(result, "Matrix")) 457 | expect_equal(dim(result), dim(X)) 458 | }) 459 | 460 | test_that("colranks ties methods work correctly", { 461 | X <- matrix(c(1, 2, 2, 3, 4, 5, 5, 6), nrow = 4, ncol = 2) 462 | 463 | result_avg <- colranks(X, ties.method = "average") 464 | result_min <- colranks(X, ties.method = "min") 465 | 466 | expect_false(identical(result_avg, result_min)) 467 | }) 468 | 469 | # ============================================================================ 470 | # Test: sparse_colranks() 471 | # ============================================================================ 472 | 473 | test_that("sparse_colranks works with basic input", { 474 | set.seed(123) 475 | X <- Matrix::Matrix(rnorm(100), nrow = 10, ncol = 10, sparse = TRUE) 476 | 477 | result <- sparse_colranks(X) 478 | 479 | expect_true(inherits(result, "CsparseMatrix")) 480 | expect_equal(dim(result), dim(X)) 481 | }) 482 | 483 | test_that("sparse_colranks works with signed=TRUE", { 484 | set.seed(123) 485 | X <- Matrix::Matrix(rnorm(100), nrow = 10, ncol = 10, sparse = TRUE) 486 | 487 | result <- sparse_colranks(X, signed = TRUE) 488 | 489 | expect_true(inherits(result, "CsparseMatrix")) 490 | expect_equal(dim(result), dim(X)) 491 | }) 492 | 493 | test_that("sparse_colranks handles different ties methods", { 494 | X <- Matrix::Matrix(c(1, 2, 2, 3, 4, 5, 5, 6), nrow = 4, ncol = 2, sparse = TRUE) 495 | 496 | result_avg <- sparse_colranks(X, ties.method = "average") 497 | result_min <- sparse_colranks(X, ties.method = "min") 498 | 499 | expect_true(inherits(result_avg, "CsparseMatrix")) 500 | expect_true(inherits(result_min, "CsparseMatrix")) 501 | }) 502 | 503 | # ============================================================================ 504 | # Test: GMT list input for replaid functions 505 | # ============================================================================ 506 | 507 | test_that("plaid works with GMT list input", { 508 | data <- create_test_data() 509 | 510 | result <- plaid(data$X, data$gmt) 511 | 512 | expect_true(is.matrix(result)) 513 | expect_equal(ncol(result), ncol(data$X)) 514 | }) 515 | 516 | test_that("replaid.sing works with GMT list input", { 517 | data <- create_test_data() 518 | 519 | result <- replaid.sing(data$X, data$gmt) 520 | 521 | expect_true(is.matrix(result)) 522 | expect_equal(ncol(result), ncol(data$X)) 523 | }) 524 | 525 | test_that("replaid.ssgsea works with GMT list input", { 526 | data <- create_test_data() 527 | 528 | result <- replaid.ssgsea(data$X, data$gmt, alpha = 0) 529 | 530 | expect_true(is.matrix(result)) 531 | expect_equal(ncol(result), ncol(data$X)) 532 | }) 533 | 534 | # ============================================================================ 535 | # Test: Edge cases 536 | # ============================================================================ 537 | 538 | test_that("plaid handles small number of gene sets", { 539 | data <- create_test_data(n_pathways = 2) 540 | 541 | result <- plaid(data$X, data$matG) 542 | 543 | expect_true(is.matrix(result)) 544 | expect_equal(nrow(result), 2) 545 | }) 546 | 547 | test_that("plaid handles large nsmooth value", { 548 | data <- create_test_data() 549 | 550 | result <- plaid(data$X, data$matG, nsmooth = 100) 551 | 552 | expect_true(is.matrix(result)) 553 | expect_equal(dim(result), c(ncol(data$matG), ncol(data$X))) 554 | }) 555 | 556 | test_that("normalize_medians handles all-zero columns", { 557 | x <- matrix(0, nrow = 10, ncol = 5) 558 | x[, 1:3] <- rnorm(30) 559 | 560 | result <- normalize_medians(x) 561 | 562 | expect_true(is.matrix(result)) 563 | expect_equal(dim(result), dim(x)) 564 | }) 565 | 566 | test_that("colranks handles constant values", { 567 | X <- matrix(rep(1, 20), nrow = 5, ncol = 4) 568 | 569 | result <- colranks(X) 570 | 571 | expect_true(is.matrix(result) || inherits(result, "Matrix")) 572 | expect_equal(dim(result), dim(X)) 573 | }) 574 | 575 | # ============================================================================ 576 | # Test: Parameter validation 577 | # ============================================================================ 578 | 579 | test_that("plaid handles partial overlap of features", { 580 | data <- create_test_data(n_genes = 100) 581 | # Keep only half of genes in matG 582 | keep_genes <- rownames(data$X)[1:50] 583 | data$matG <- data$matG[keep_genes, , drop = FALSE] 584 | 585 | result <- plaid(data$X, data$matG) 586 | 587 | expect_true(is.matrix(result)) 588 | expect_equal(ncol(result), ncol(data$X)) 589 | }) 590 | 591 | test_that("replaid functions handle min.genes and max.genes parameters", { 592 | data <- create_test_data() 593 | 594 | result <- plaid(data$X, data$gmt, min.genes = 5, max.genes = 100) 595 | 596 | expect_true(is.matrix(result)) 597 | }) 598 | 599 | # ============================================================================ 600 | # Test: Numerical stability 601 | # ============================================================================ 602 | 603 | test_that("plaid handles very small values", { 604 | data <- create_test_data() 605 | data$X <- data$X * 1e-10 606 | 607 | result <- plaid(data$X, data$matG) 608 | 609 | expect_true(is.matrix(result)) 610 | expect_true(all(is.finite(result))) 611 | }) 612 | 613 | test_that("plaid handles very large values", { 614 | data <- create_test_data(n_pathways = 25) 615 | data$X <- data$X * 1e6 # Use smaller multiplier to avoid Inf 616 | 617 | result <- plaid(data$X, data$matG) 618 | 619 | expect_true(is.matrix(result)) 620 | # With very large values, some Inf may occur in intermediate calculations 621 | expect_true(is.matrix(result)) 622 | }) 623 | 624 | test_that("normalize_medians handles NA values", { 625 | set.seed(123) 626 | x <- matrix(rnorm(100), nrow = 10, ncol = 10) 627 | x[1:3, 1:3] <- NA 628 | 629 | result <- normalize_medians(x) 630 | 631 | expect_true(is.matrix(result)) 632 | expect_equal(dim(result), dim(x)) 633 | }) 634 | -------------------------------------------------------------------------------- /tests/testthat/test-stats.R: -------------------------------------------------------------------------------- 1 | ## 2 | ## This file is part of the Omics Playground project. 3 | ## Copyright (c) 2018-2025 BigOmics Analytics SA. All rights reserved. 4 | ## 5 | 6 | # ============================================================================ 7 | # Setup test data 8 | # ============================================================================ 9 | 10 | # Helper function to create test data for statistical testing 11 | create_stats_test_data <- function(n_genes = 100, n_samples = 20, n_pathways = 5) { 12 | set.seed(456) 13 | 14 | # Create expression matrix 15 | X <- matrix(rnorm(n_genes * n_samples), nrow = n_genes, ncol = n_samples) 16 | rownames(X) <- paste0("GENE", 1:n_genes) 17 | colnames(X) <- paste0("Sample", 1:n_samples) 18 | 19 | # Create binary group vector (0/1) 20 | y <- rep(c(0, 1), each = n_samples/2) 21 | 22 | # Create gene sets 23 | gmt <- lapply(1:n_pathways, function(i) { 24 | start_idx <- ((i-1) * 15 + 1) 25 | end_idx <- min(start_idx + 19, n_genes) 26 | paste0("GENE", start_idx:end_idx) 27 | }) 28 | names(gmt) <- paste0("Pathway", 1:n_pathways) 29 | 30 | G <- gmt2mat(gmt, bg = rownames(X)) 31 | 32 | list(X = X, y = y, G = G, gmt = gmt) 33 | } 34 | 35 | # ============================================================================ 36 | # Test: dualGSEA() 37 | # ============================================================================ 38 | 39 | test_that("dualGSEA works with replaid methods", { 40 | data <- create_stats_test_data() 41 | 42 | result <- dualGSEA(data$X, data$y, gmt = data$gmt, G = data$G, 43 | fc.method = "cor", ss.method = "replaid.ssgsea") 44 | 45 | expect_true(is.matrix(result) || is.data.frame(result)) 46 | expect_true("gsetFC" %in% colnames(result)) 47 | expect_true("p.dual" %in% colnames(result)) 48 | expect_true("q.dual" %in% colnames(result)) 49 | expect_true("size" %in% colnames(result)) 50 | expect_equal(nrow(result), ncol(data$G)) 51 | 52 | }) 53 | 54 | test_that("dualGSEA handles NA values in y", { 55 | data <- create_stats_test_data() 56 | data$y[1:2] <- NA 57 | 58 | result <- dualGSEA(data$X, data$y, gmt = data$gmt, G = data$G, 59 | fc.method = "rankcor", ss.method = "replaid.ssgsea") 60 | 61 | expect_true(is.matrix(result) || is.data.frame(result)) 62 | }) 63 | 64 | test_that("dualGSEA throws error for invalid y", { 65 | data <- create_stats_test_data() 66 | data$y[1] <- 2 67 | 68 | expect_error(dualGSEA(data$X, data$y, gmt = data$gmt)) 69 | }) 70 | 71 | test_that("dualGSEA throws error for invalid fc.method", { 72 | data <- create_stats_test_data() 73 | 74 | expect_error(dualGSEA(data$X, data$y, gmt = data$gmt, fc.method = "invalid")) 75 | }) 76 | 77 | test_that("dualGSEA throws error for invalid ss.method", { 78 | data <- create_stats_test_data() 79 | 80 | expect_error(dualGSEA(data$X, data$y, gmt = data$gmt, 81 | fc.method = "cor", ss.method = "invalid")) 82 | }) 83 | 84 | test_that("dualGSEA works with different fc.method methods", { 85 | data <- create_stats_test_data(n_pathways = 25) # Use more pathways 86 | 87 | result_cor <- dualGSEA(data$X, data$y, G = data$G, fc.method = "cor") 88 | result_rankcor <- dualGSEA(data$X, data$y, G = data$G, fc.method = "rankcor") 89 | result_ztest <- dualGSEA(data$X, data$y, G = data$G, fc.method = "ztest") 90 | 91 | expect_true(is.matrix(result_cor) || is.data.frame(result_cor)) 92 | expect_true(is.matrix(result_rankcor) || is.data.frame(result_rankcor)) 93 | expect_true(is.matrix(result_ztest) || is.data.frame(result_ztest)) 94 | }) 95 | 96 | test_that("dualGSEA works with different metap methods", { 97 | data <- create_stats_test_data() 98 | 99 | result_stouffer <- dualGSEA(data$X, data$y, G = data$G, metap.method = "stouffer") 100 | result_fisher <- dualGSEA(data$X, data$y, G = data$G, metap.method = "fisher") 101 | 102 | expect_true(is.matrix(result_stouffer) || is.data.frame(result_stouffer)) 103 | expect_true(is.matrix(result_fisher) || is.data.frame(result_fisher)) 104 | expect_false(identical(result_stouffer, result_fisher)) 105 | }) 106 | 107 | test_that("dualGSEA works with different sort.by options", { 108 | data <- create_stats_test_data() 109 | 110 | result_pdual <- dualGSEA(data$X, data$y, G = data$G, sort.by = "p.dual") 111 | result_fc <- dualGSEA(data$X, data$y, G = data$G, sort.by = "gsetFC") 112 | 113 | expect_true(is.matrix(result_pdual) || is.data.frame(result_pdual)) 114 | expect_true(is.matrix(result_fc) || is.data.frame(result_fc)) 115 | }) 116 | 117 | test_that("dualGSEA handles NA values in y", { 118 | data <- create_stats_test_data() 119 | data$y[1:2] <- NA 120 | 121 | result <- dualGSEA(data$X, data$y, G = data$G) 122 | 123 | expect_true(is.matrix(result) || is.data.frame(result)) 124 | }) 125 | 126 | test_that("dualGSEA works with GMT list input", { 127 | data <- create_stats_test_data() 128 | 129 | result <- dualGSEA(data$X, data$y, gmt = data$gmt, G = data$G) 130 | 131 | expect_true(is.matrix(result) || is.data.frame(result)) 132 | }) 133 | 134 | test_that("dualGSEA works with precomputed pv1 and pv2", { 135 | data <- create_stats_test_data() 136 | pv1 <- rep(0.05, ncol(data$G)) 137 | names(pv1) <- colnames(data$G) 138 | pv2 <- rep(0.1, ncol(data$G)) 139 | names(pv2) <- colnames(data$G) 140 | 141 | result <- dualGSEA(data$X, data$y, G = data$G, pv1 = pv1, pv2 = pv2) 142 | 143 | expect_true(is.matrix(result) || is.data.frame(result)) 144 | }) 145 | 146 | test_that("dualGSEA works with sparse matrix input", { 147 | data <- create_stats_test_data(n_pathways = 25) # Use more pathways to avoid normalization issue 148 | X_sparse <- Matrix::Matrix(data$X, sparse = TRUE) 149 | 150 | result <- dualGSEA(X_sparse, data$y, G = data$G) 151 | 152 | expect_true(is.matrix(result) || is.data.frame(result)) 153 | }) 154 | 155 | # ============================================================================ 156 | # Test: fc_ttest() 157 | # ============================================================================ 158 | 159 | test_that("fc_ttest works with basic input", { 160 | data <- create_stats_test_data() 161 | fc <- rowMeans(data$X[, data$y == 1]) - rowMeans(data$X[, data$y == 0]) 162 | names(fc) <- rownames(data$X) 163 | 164 | result <- fc_ttest(fc, data$G) 165 | 166 | expect_true(is.matrix(result) || is.data.frame(result)) 167 | expect_true("gsetFC" %in% colnames(result)) 168 | expect_true("pvalue" %in% colnames(result)) 169 | expect_true("qvalue" %in% colnames(result)) 170 | }) 171 | 172 | test_that("fc_ttest works with different sort.by options", { 173 | data <- create_stats_test_data() 174 | fc <- rowMeans(data$X[, data$y == 1]) - rowMeans(data$X[, data$y == 0]) 175 | names(fc) <- rownames(data$X) 176 | 177 | result_pvalue <- fc_ttest(fc, data$G, sort.by = "pvalue") 178 | result_fc <- fc_ttest(fc, data$G, sort.by = "gsetFC") 179 | result_none <- fc_ttest(fc, data$G, sort.by = "none") 180 | 181 | expect_true(is.matrix(result_pvalue) || is.data.frame(result_pvalue)) 182 | expect_true(is.matrix(result_fc) || is.data.frame(result_fc)) 183 | expect_true(is.matrix(result_none) || is.data.frame(result_none)) 184 | }) 185 | 186 | test_that("fc_ttest throws error for unnamed fc", { 187 | data <- create_stats_test_data() 188 | fc <- rnorm(nrow(data$X)) 189 | 190 | expect_error(fc_ttest(fc, data$G)) 191 | }) 192 | 193 | # ============================================================================ 194 | # Test: fc_ztest() 195 | # ============================================================================ 196 | 197 | test_that("fc_ztest works with basic input", { 198 | data <- create_stats_test_data() 199 | fc <- rowMeans(data$X[, data$y == 1]) - rowMeans(data$X[, data$y == 0]) 200 | names(fc) <- rownames(data$X) 201 | 202 | result <- fc_ztest(fc, data$G) 203 | 204 | expect_true(is.list(result)) 205 | expect_true("z_statistic" %in% names(result)) 206 | expect_true("p_value" %in% names(result)) 207 | }) 208 | 209 | test_that("fc_ztest works with zmat=TRUE", { 210 | data <- create_stats_test_data() 211 | fc <- rowMeans(data$X[, data$y == 1]) - rowMeans(data$X[, data$y == 0]) 212 | names(fc) <- rownames(data$X) 213 | 214 | result <- fc_ztest(fc, data$G, zmat=TRUE) 215 | 216 | expect_true(is.list(result)) 217 | expect_true("zmat" %in% names(result)) 218 | expect_true("z_statistic" %in% names(result)) 219 | expect_true("p_value" %in% names(result)) 220 | }) 221 | 222 | test_that("fc_ttest throws error for unnamed fc", { 223 | data <- create_stats_test_data() 224 | fc <- rnorm(nrow(data$X)) 225 | 226 | expect_error(fc_ztest(fc, data$G)) 227 | }) 228 | 229 | # ============================================================================ 230 | # Test: gset_averageCLR() 231 | # ============================================================================ 232 | 233 | test_that("gset_averageCLR works with basic input", { 234 | data <- create_stats_test_data() 235 | 236 | result <- gset_averageCLR(data$X, data$G) 237 | 238 | expect_true(is.matrix(result)) 239 | expect_equal(nrow(result), ncol(data$G)) 240 | expect_equal(ncol(result), ncol(data$X)) 241 | }) 242 | 243 | test_that("gset_averageCLR works with center=FALSE", { 244 | data <- create_stats_test_data() 245 | 246 | result <- gset_averageCLR(data$X, data$G, center = FALSE) 247 | 248 | expect_true(is.matrix(result)) 249 | expect_equal(dim(result), c(ncol(data$G), ncol(data$X))) 250 | }) 251 | 252 | test_that("gset_averageCLR works with single column", { 253 | data <- create_stats_test_data(n_samples = 2) 254 | X_single <- data$X[, 1, drop = FALSE] 255 | 256 | result <- gset_averageCLR(X_single, data$G) 257 | 258 | expect_true(is.matrix(result)) 259 | expect_equal(ncol(result), 1) 260 | }) 261 | 262 | test_that("gset_averageCLR handles no overlapping features", { 263 | data <- create_stats_test_data() 264 | rownames(data$X) <- paste0("OTHER_GENE", 1:nrow(data$X)) 265 | 266 | result <- gset_averageCLR(data$X, data$G) 267 | 268 | expect_null(result) 269 | }) 270 | 271 | # ============================================================================ 272 | # Test: gset_ttest() 273 | # ============================================================================ 274 | 275 | test_that("gset_ttest works with basic input", { 276 | data <- create_stats_test_data() 277 | gsetX <- plaid(data$X, data$G) 278 | 279 | result <- gset_ttest(gsetX, data$y) 280 | 281 | expect_true(is.matrix(result) || is.data.frame(result)) 282 | expect_true("diff" %in% colnames(result)) 283 | expect_true("pvalue" %in% colnames(result)) 284 | expect_equal(nrow(result), nrow(gsetX)) 285 | }) 286 | 287 | test_that("gset_ttest throws error for invalid y", { 288 | data <- create_stats_test_data() 289 | gsetX <- plaid(data$X, data$G) 290 | invalid_y <- rep(c(0, 1, 2), length.out = ncol(gsetX)) 291 | 292 | expect_error(gset_ttest(gsetX, invalid_y)) 293 | }) 294 | 295 | test_that("gset_ttest produces expected output structure", { 296 | data <- create_stats_test_data() 297 | gsetX <- plaid(data$X, data$G) 298 | 299 | result <- gset_ttest(gsetX, data$y) 300 | 301 | expect_true(all(c("diff", "pvalue") %in% colnames(result))) 302 | expect_equal(rownames(result), rownames(gsetX)) 303 | }) 304 | 305 | # ============================================================================ 306 | # Test: matrix_onesample_ttest() 307 | # ============================================================================ 308 | 309 | test_that("matrix_onesample_ttest works with basic input", { 310 | set.seed(456) 311 | Fm <- rnorm(100) 312 | names(Fm) <- paste0("GENE", 1:100) 313 | data <- create_stats_test_data() 314 | 315 | result <- matrix_onesample_ttest(Fm, data$G) 316 | 317 | expect_true(is.list(result)) 318 | expect_true("mean" %in% names(result)) 319 | expect_true("t" %in% names(result)) 320 | expect_true("p" %in% names(result)) 321 | expect_equal(nrow(result$mean), ncol(data$G)) 322 | }) 323 | 324 | test_that("matrix_onesample_ttest produces finite values", { 325 | set.seed(456) 326 | Fm <- rnorm(100) 327 | names(Fm) <- paste0("GENE", 1:100) 328 | data <- create_stats_test_data() 329 | 330 | result <- matrix_onesample_ttest(Fm, data$G) 331 | 332 | expect_true(all(is.finite(result$mean))) 333 | expect_true(all(is.finite(result$t))) 334 | expect_true(all(is.finite(result$p))) 335 | }) 336 | 337 | test_that("matrix_onesample_ttest handles varying gene set sizes", { 338 | set.seed(456) 339 | Fm <- rnorm(100) 340 | names(Fm) <- paste0("GENE", 1:100) 341 | data <- create_stats_test_data(n_pathways = 10) 342 | 343 | result <- matrix_onesample_ttest(Fm, data$G) 344 | 345 | expect_equal(nrow(result$mean), ncol(data$G)) 346 | }) 347 | 348 | # ============================================================================ 349 | # Test: matrix_metap() 350 | # ============================================================================ 351 | 352 | test_that("matrix_metap works with stouffer method", { 353 | plist <- list( 354 | p1 = c(0.05, 0.01, 0.1), 355 | p2 = c(0.08, 0.02, 0.15) 356 | ) 357 | 358 | result <- matrix_metap(plist, method = "stouffer") 359 | 360 | expect_true(is.numeric(result)) 361 | expect_equal(length(result), 3) 362 | expect_true(all(result >= 0 & result <= 1)) 363 | }) 364 | 365 | test_that("matrix_metap works with fisher method", { 366 | plist <- list( 367 | p1 = c(0.05, 0.01, 0.1), 368 | p2 = c(0.08, 0.02, 0.15) 369 | ) 370 | 371 | result <- matrix_metap(plist, method = "fisher") 372 | 373 | expect_true(is.numeric(result)) 374 | expect_equal(length(result), 3) 375 | expect_true(all(result >= 0 & result <= 1)) 376 | }) 377 | 378 | test_that("matrix_metap works with matrix input", { 379 | pmat <- matrix(c(0.05, 0.01, 0.1, 0.08, 0.02, 0.15), nrow = 3, ncol = 2) 380 | 381 | result <- matrix_metap(pmat, method = "stouffer") 382 | 383 | expect_true(is.numeric(result)) 384 | expect_equal(length(result), 3) 385 | }) 386 | 387 | test_that("matrix_metap results differ by method", { 388 | plist <- list( 389 | p1 = c(0.05, 0.01, 0.1), 390 | p2 = c(0.08, 0.02, 0.15) 391 | ) 392 | 393 | result_stouffer <- matrix_metap(plist, method = "stouffer") 394 | result_fisher <- matrix_metap(plist, method = "fisher") 395 | 396 | expect_false(identical(result_stouffer, result_fisher)) 397 | }) 398 | 399 | test_that("matrix_metap works with sumlog alias", { 400 | plist <- list( 401 | p1 = c(0.05, 0.01, 0.1), 402 | p2 = c(0.08, 0.02, 0.15) 403 | ) 404 | 405 | result <- matrix_metap(plist, method = "sumlog") 406 | 407 | expect_true(is.numeric(result)) 408 | expect_true(all(result >= 0 & result <= 1)) 409 | }) 410 | 411 | test_that("matrix_metap works with sumz alias", { 412 | plist <- list( 413 | p1 = c(0.05, 0.01, 0.1), 414 | p2 = c(0.08, 0.02, 0.15) 415 | ) 416 | 417 | result <- matrix_metap(plist, method = "sumz") 418 | 419 | expect_true(is.numeric(result)) 420 | expect_true(all(result >= 0 & result <= 1)) 421 | }) 422 | 423 | test_that("matrix_metap throws error for invalid method", { 424 | plist <- list( 425 | p1 = c(0.05, 0.01, 0.1), 426 | p2 = c(0.08, 0.02, 0.15) 427 | ) 428 | 429 | expect_error(matrix_metap(plist, method = "invalid")) 430 | }) 431 | 432 | test_that("matrix_metap handles extreme p-values", { 433 | plist <- list( 434 | p1 = c(1e-10, 0.5, 0.99), 435 | p2 = c(1e-9, 0.6, 0.999) 436 | ) 437 | 438 | result <- matrix_metap(plist, method = "stouffer") 439 | 440 | expect_true(all(is.finite(result))) 441 | expect_true(all(result >= 0 & result <= 1)) 442 | }) 443 | 444 | # ============================================================================ 445 | # Test: gset.rankcor() 446 | # ============================================================================ 447 | 448 | test_that("gset.rankcor works with basic input", { 449 | set.seed(456) 450 | rnk <- rnorm(100) 451 | names(rnk) <- paste0("GENE", 1:100) 452 | data <- create_stats_test_data() 453 | 454 | result <- gset.rankcor(rnk, data$G, compute.p = FALSE) 455 | 456 | expect_true(is.list(result)) 457 | expect_true("rho" %in% names(result)) 458 | expect_equal(nrow(result$rho), ncol(data$G)) 459 | }) 460 | 461 | test_that("gset.rankcor works with compute.p=TRUE", { 462 | set.seed(456) 463 | rnk <- rnorm(100) 464 | names(rnk) <- paste0("GENE", 1:100) 465 | data <- create_stats_test_data() 466 | 467 | result <- gset.rankcor(rnk, data$G, compute.p = TRUE) 468 | 469 | expect_true(is.list(result)) 470 | expect_true("rho" %in% names(result)) 471 | expect_true("p.value" %in% names(result)) 472 | expect_true("q.value" %in% names(result)) 473 | }) 474 | 475 | test_that("gset.rankcor works with use.rank=FALSE", { 476 | set.seed(456) 477 | rnk <- rnorm(100) 478 | names(rnk) <- paste0("GENE", 1:100) 479 | data <- create_stats_test_data() 480 | 481 | result <- gset.rankcor(rnk, data$G, use.rank = FALSE) 482 | 483 | expect_true(is.list(result)) 484 | expect_true("rho" %in% names(result)) 485 | }) 486 | 487 | test_that("gset.rankcor works with matrix input", { 488 | set.seed(456) 489 | rnk <- matrix(rnorm(200), nrow = 100, ncol = 2) 490 | rownames(rnk) <- paste0("GENE", 1:100) 491 | colnames(rnk) <- c("Cond1", "Cond2") 492 | data <- create_stats_test_data() 493 | 494 | result <- gset.rankcor(rnk, data$G) 495 | 496 | expect_true(is.list(result)) 497 | expect_equal(ncol(result$rho), 2) 498 | }) 499 | 500 | test_that("gset.rankcor throws error for unnamed vector", { 501 | rnk <- rnorm(100) 502 | data <- create_stats_test_data() 503 | 504 | expect_error(gset.rankcor(rnk, data$G)) 505 | }) 506 | 507 | test_that("gset.rankcor throws error for matrix without rownames", { 508 | rnk <- matrix(rnorm(200), nrow = 100, ncol = 2) 509 | data <- create_stats_test_data() 510 | 511 | expect_error(gset.rankcor(rnk, data$G)) 512 | }) 513 | 514 | test_that("gset.rankcor returns NULL for zero columns", { 515 | set.seed(456) 516 | rnk <- rnorm(100) 517 | names(rnk) <- paste0("GENE", 1:100) 518 | empty_G <- Matrix::Matrix(0, nrow = 100, ncol = 0, sparse = TRUE) 519 | rownames(empty_G) <- paste0("GENE", 1:100) 520 | 521 | result <- gset.rankcor(rnk, empty_G) 522 | 523 | expect_null(result) 524 | }) 525 | 526 | test_that("gset.rankcor throws error for non-Matrix gset", { 527 | set.seed(456) 528 | rnk <- rnorm(100) 529 | names(rnk) <- paste0("GENE", 1:100) 530 | gset <- list(pathway1 = c("GENE1", "GENE2")) 531 | 532 | expect_error(gset.rankcor(rnk, gset)) 533 | }) 534 | 535 | test_that("gset.rankcor handles transposed input correctly", { 536 | set.seed(456) 537 | rnk <- rnorm(100) 538 | names(rnk) <- paste0("GENE", 1:100) 539 | data <- create_stats_test_data() 540 | G_transposed <- Matrix::t(data$G) 541 | 542 | result <- gset.rankcor(rnk, G_transposed) 543 | 544 | expect_true(is.list(result)) 545 | }) 546 | 547 | # ============================================================================ 548 | # Test: cor_sparse_matrix() 549 | # ============================================================================ 550 | 551 | test_that("cor_sparse_matrix works without missing values", { 552 | set.seed(456) 553 | G <- Matrix::Matrix(rnorm(500), nrow = 100, ncol = 5, sparse = TRUE) 554 | mat <- matrix(rnorm(200), nrow = 100, ncol = 2) 555 | 556 | result <- cor_sparse_matrix(G, mat) 557 | 558 | expect_true(is.matrix(result) || inherits(result, "Matrix")) 559 | expect_equal(nrow(result), ncol(G)) 560 | expect_equal(ncol(result), ncol(mat)) 561 | }) 562 | 563 | test_that("cor_sparse_matrix works with missing values", { 564 | set.seed(456) 565 | G <- Matrix::Matrix(rnorm(500), nrow = 100, ncol = 5, sparse = TRUE) 566 | mat <- matrix(rnorm(200), nrow = 100, ncol = 2) 567 | mat[1:5, 1] <- NA 568 | 569 | result <- cor_sparse_matrix(G, mat) 570 | 571 | expect_true(is.matrix(result) || inherits(result, "Matrix")) 572 | expect_equal(nrow(result), ncol(G)) 573 | expect_equal(ncol(result), ncol(mat)) 574 | }) 575 | 576 | test_that("cor_sparse_matrix produces finite values", { 577 | set.seed(456) 578 | G <- Matrix::Matrix(rnorm(500), nrow = 100, ncol = 5, sparse = TRUE) 579 | mat <- matrix(rnorm(200), nrow = 100, ncol = 2) 580 | 581 | result <- cor_sparse_matrix(G, mat) 582 | 583 | # Some correlations may be NA/NaN for constant columns 584 | expect_true(is.matrix(result) || inherits(result, "Matrix")) 585 | }) 586 | 587 | # ============================================================================ 588 | # Test: Integration tests 589 | # ============================================================================ 590 | 591 | test_that("dualGSEA and gset_ttest produce consistent results", { 592 | data <- create_stats_test_data() 593 | gsetX <- plaid(data$X, data$G) 594 | 595 | # Run dualGSEA with precomputed gsetX 596 | dual_result <- dualGSEA(data$X, data$y, G = data$G) 597 | 598 | # Run gset_ttest separately 599 | ttest_result <- gset_ttest(gsetX, data$y) 600 | 601 | # Both should have same number of gene sets 602 | expect_equal(nrow(dual_result), nrow(ttest_result)) 603 | }) 604 | 605 | test_that("matrix_metap with different p-value sources", { 606 | data <- create_stats_test_data() 607 | 608 | # Create dummy p-values 609 | pv1 <- runif(ncol(data$G)) 610 | pv2 <- runif(ncol(data$G)) 611 | plist <- list(p1 = pv1, p2 = pv2) 612 | 613 | # Combine with both methods 614 | combined_stouffer <- matrix_metap(plist, method = "stouffer") 615 | combined_fisher <- matrix_metap(plist, method = "fisher") 616 | 617 | # Combined p-values should be more significant than individual ones 618 | expect_true(all(combined_stouffer <= 1)) 619 | expect_true(all(combined_fisher <= 1)) 620 | }) 621 | 622 | # ============================================================================ 623 | # Test: Edge cases and error handling 624 | # ============================================================================ 625 | 626 | test_that("dualGSEA handles small sample sizes", { 627 | data <- create_stats_test_data(n_samples = 4) 628 | 629 | result <- dualGSEA(data$X, data$y, G = data$G) 630 | 631 | expect_true(is.matrix(result) || is.data.frame(result)) 632 | }) 633 | 634 | test_that("gset_averageCLR handles single gene set", { 635 | data <- create_stats_test_data(n_pathways = 1) 636 | 637 | result <- gset_averageCLR(data$X, data$G) 638 | 639 | expect_true(is.matrix(result)) 640 | expect_equal(nrow(result), 1) 641 | }) 642 | 643 | test_that("matrix_onesample_ttest handles small gene sets", { 644 | set.seed(456) 645 | Fm <- rnorm(100) 646 | names(Fm) <- paste0("GENE", 1:100) 647 | 648 | # Create gene set with only 2 genes 649 | G <- Matrix::Matrix(0, nrow = 100, ncol = 1, sparse = TRUE) 650 | rownames(G) <- paste0("GENE", 1:100) 651 | colnames(G) <- "SmallSet" 652 | G[1:2, 1] <- 1 653 | 654 | result <- matrix_onesample_ttest(Fm, G) 655 | 656 | expect_true(is.list(result)) 657 | expect_true(all(is.finite(result$mean))) 658 | }) 659 | 660 | test_that("gset.rankcor handles partial overlap", { 661 | set.seed(456) 662 | rnk <- rnorm(50) 663 | names(rnk) <- paste0("GENE", 1:50) 664 | data <- create_stats_test_data() 665 | 666 | result <- gset.rankcor(rnk, data$G) 667 | 668 | expect_true(is.list(result)) 669 | }) 670 | 671 | test_that("dualGSEA handles unbalanced groups", { 672 | data <- create_stats_test_data() 673 | data$y <- rep(0, length(data$y)) 674 | data$y[1:3] <- 1 # Few samples in group 1 675 | 676 | # This should work but may produce warnings 677 | result <- suppressWarnings(dualGSEA(data$X, data$y, G = data$G)) 678 | 679 | expect_true(is.matrix(result) || is.data.frame(result)) 680 | }) 681 | 682 | # ============================================================================ 683 | # Test: Numerical stability 684 | # ============================================================================ 685 | 686 | test_that("matrix_metap handles very small p-values", { 687 | plist <- list( 688 | p1 = rep(1e-100, 5), 689 | p2 = rep(1e-100, 5) 690 | ) 691 | 692 | result <- matrix_metap(plist, method = "stouffer") 693 | 694 | expect_true(all(is.finite(result))) 695 | expect_true(all(result >= 0)) 696 | }) 697 | 698 | test_that("matrix_metap handles p-values near 1", { 699 | plist <- list( 700 | p1 = rep(0.99, 5), 701 | p2 = rep(0.999, 5) 702 | ) 703 | 704 | result <- matrix_metap(plist, method = "stouffer") 705 | 706 | expect_true(all(is.finite(result))) 707 | expect_true(all(result <= 1)) 708 | }) 709 | 710 | test_that("gset_averageCLR handles extreme values", { 711 | data <- create_stats_test_data() 712 | data$X <- data$X * 1e6 713 | 714 | result <- gset_averageCLR(data$X, data$G) 715 | 716 | expect_true(is.matrix(result)) 717 | expect_true(all(is.finite(result))) 718 | }) 719 | 720 | test_that("matrix_onesample_ttest handles constant values", { 721 | Fm <- rep(1, 100) 722 | names(Fm) <- paste0("GENE", 1:100) 723 | data <- create_stats_test_data() 724 | 725 | result <- matrix_onesample_ttest(Fm, data$G) 726 | 727 | expect_true(is.list(result)) 728 | # P-values should be 1 for no variance 729 | expect_true(all(result$p >= 0 & result$p <= 1)) 730 | }) 731 | 732 | -------------------------------------------------------------------------------- /R/plaid.R: -------------------------------------------------------------------------------- 1 | ## 2 | ## This file is part of the Omics Playground project. 3 | ## Copyright (c) 2018-2025 BigOmics Analytics SA. All rights reserved. 4 | ## 5 | 6 | #' @importFrom methods as 7 | #' @importFrom stats ecdf 8 | #' @importFrom Matrix colSums colScale crossprod Diagonal rowMeans t which 9 | #' @importFrom matrixStats colRanks colMedians 10 | #' @importFrom sparseMatrixStats colRanks 11 | NULL 12 | 13 | #' Compute PLAID single-sample enrichment score 14 | #' 15 | #' @description Compute single-sample geneset expression as the 16 | #' average log-expression f genes in the geneset. Requires log-expression 17 | #' matrix X and (sparse) geneset matrix matG. If you have gene sets 18 | #' as a gmt list, please convert it first using the function `gmt2mat()`. 19 | #' 20 | #' @details PLAID needs the gene sets as sparse matrix. If you have 21 | #' your collection of gene sets a a list, we need first to convert 22 | #' the gmt list to matrix format. 23 | #' 24 | #' @details We recommend to run PLAID on the log transformed expression matrix, 25 | #' not on the counts, as the average in the logarithmic space is more 26 | #' robust and is in concordance to calculating the geometric mean. 27 | #' 28 | #' @details It is not necessary to normalize your expression matrix before 29 | #' running PLAID because PLAID performs median normalization of the 30 | #' enrichment scores afterwards. 31 | #' 32 | #' @details It is recommended to use sparse matrix as PLAID relies on 33 | #' sparse matrix computations. But, PLAID is also fast for dense matrices. 34 | #' 35 | #' @details PLAID can also be run on the ranked matrix. This corresponds to 36 | #' the singscore (Fouratan et al., 2018). PLAID can also be run on 37 | #' the (non-logarithmic) counts which can be used to calculate the 38 | #' scSE score (Pont et al., 2019). 39 | #' 40 | #' @details PLAID is fast and memery efficient because it uses efficient 41 | #' sparse matrix computation. When input matrix is very large, PLAID 42 | #' performs 'chunked' computation by splitting the matrix in chunks. 43 | #' 44 | #' @details Although `X` and `matG` are generally sparse, the result 45 | #' matrix `gsetX` generally is dense and can thus be very large. 46 | #' Example: computing gene set scores for 10K gene sets on 1M cells 47 | #' will create a 10K x 1M dense matrix which requires ~75GB memory. 48 | #' 49 | #' @details PLAID now automatically detects and handles Bioconductor objects. 50 | #' If X is a SummarizedExperiment or SingleCellExperiment, it will extract 51 | #' the appropriate assay. If matG is a BiocSet object or GMT list, it will 52 | #' be converted to sparse matrix format automatically. 53 | #' 54 | #' @param X Log-transformed expr. matrix. Genes on rows, samples on columns. 55 | #' Also accepts SummarizedExperiment or SingleCellExperiment objects. 56 | #' @param matG Gene sets sparse matrix. Genes on rows, gene sets on columns. 57 | #' Also accepts BiocSet objects or GMT lists (named list of gene vectors). 58 | #' @param stats Score computation stats: mean or sum of intensity. Default 'mean'. 59 | #' @param chunk Logical: use chunks for large matrices. Default 'NULL' for autodetect. 60 | #' @param normalize Logical: median normalize results or not. Default 'TRUE'. 61 | #' @param nsmooth Smoothing parameter for more stable average when stats="mean". Default 3. 62 | #' @param assay Character: assay name to extract from SummarizedExperiment/SingleCellExperiment. Default "logcounts". 63 | #' @param min.genes Integer: minimum genes per gene set (for BiocSet/GMT input). Default 5. 64 | #' @param max.genes Integer: maximum genes per gene set (for BiocSet/GMT input). Default 500. 65 | #' 66 | #' @return Matrix of single-sample enrichment scores. 67 | #' Gene sets on rows, samples on columns. 68 | #' 69 | #' @examples 70 | #' library(plaid) 71 | #' 72 | #' # Create example expression matrix 73 | #' set.seed(123) 74 | #' X <- matrix(rnorm(1000), nrow = 100, ncol = 10) 75 | #' rownames(X) <- paste0("GENE", 1:100) 76 | #' colnames(X) <- paste0("Sample", 1:10) 77 | #' 78 | #' # Create example gene sets 79 | #' gmt <- list( 80 | #' "Pathway1" = paste0("GENE", 1:20), 81 | #' "Pathway2" = paste0("GENE", 15:35), 82 | #' "Pathway3" = paste0("GENE", 30:50) 83 | #' ) 84 | #' matG <- gmt2mat(gmt) 85 | #' 86 | #' # Compute PLAID scores 87 | #' gsetX <- plaid(X, matG) 88 | #' print(dim(gsetX)) 89 | #' print(gsetX[1:3, 1:5]) 90 | #' 91 | #' # Use sum statistics instead of mean 92 | #' gsetX_sum <- plaid(X, matG, stats = "sum") 93 | #' 94 | #' \donttest{ 95 | #' # Using real data (if available in package) 96 | #' extdata_path <- system.file("extdata", "pbmc3k-50cells.rda", package = "plaid") 97 | #' if (file.exists(extdata_path)) { 98 | #' load(extdata_path) 99 | #' hallmarks <- system.file("extdata", "hallmarks.gmt", package = "plaid") 100 | #' gmt <- read.gmt(hallmarks) 101 | #' matG <- gmt2mat(gmt) 102 | #' gsetX <- plaid(X, matG) 103 | #' } 104 | #' } 105 | #' 106 | #' @export 107 | plaid <- function(X, matG, stats=c("mean","sum"), chunk=NULL, normalize=TRUE, 108 | nsmooth=3, assay="logcounts", min.genes=5, max.genes=500) { 109 | 110 | ## Auto-detect and convert Bioconductor objects 111 | if (inherits(X, "SummarizedExperiment") || inherits(X, "SingleCellExperiment")) { 112 | X <- .extract_expression_matrix(X, assay=assay, log.transform=FALSE) 113 | } 114 | 115 | if (inherits(matG, "BiocSet") || (is.list(matG) && !is.matrix(matG) && !inherits(matG, "Matrix"))) { 116 | matG <- .convert_geneset_to_matrix(matG, background=rownames(X), 117 | min.genes=min.genes, max.genes=max.genes) 118 | } 119 | 120 | stats <- stats[1] 121 | if (NCOL(X) == 1) X <- cbind(X) 122 | 123 | ## make sure X is matrix (not dataframe) and convert to sparse if needed 124 | if(!inherits(X, "Matrix")) { 125 | X <- as.matrix(X) 126 | if(mean(X==0,na.rm=TRUE)>0.5) X <- Matrix::Matrix(X, sparse=TRUE) 127 | } 128 | 129 | gg <- intersect(rownames(X), rownames(matG)) 130 | if (length(gg) == 0) { 131 | message("[plaid] No overlapping features.") 132 | return(NULL) 133 | } 134 | 135 | X <- X[gg, , drop = FALSE] 136 | matG <- matG[gg, , drop = FALSE] 137 | G <- 1 * (matG != 0) 138 | if(stats == "mean") { 139 | # nsmooth is 'smoothing' parameter for more stable average 140 | sumG <- 1e-8 + nsmooth + Matrix::colSums(G, na.rm = TRUE) 141 | G <- Matrix::colScale(G, 1 / sumG) 142 | } 143 | 144 | ## Calculates PLAID score 145 | gsetX <- chunked_crossprod(G, X, chunk=NULL) 146 | gsetX <- as.matrix(gsetX) 147 | 148 | if(normalize) { 149 | if(nrow(gsetX) < 20) { 150 | ## for few genesets, median-norm is not good 151 | normfactor <- Matrix::colMeans(gsetX,na.rm=TRUE) 152 | normfactor <- normfactor - mean(normfactor) 153 | gsetX <- sweep(gsetX, 2, normfactor, '-') 154 | } else { 155 | gsetX <- normalize_medians(gsetX) 156 | } 157 | } 158 | 159 | return(gsetX) 160 | 161 | } 162 | 163 | #' Chunked computation of cross product 164 | #' 165 | #' Compute crossprod (t(x) %*% y) for very large y by computing in 166 | #' chunks. 167 | #' 168 | #' @param x Matrix First matrix for multiplication. Can be sparse. 169 | #' @param y Matrix Second matrix for multiplication. Can be sparse. 170 | #' @param chunk Integer Chunk size (max number of columns) for computation. 171 | #' 172 | #' @return Matrix. Result of matrix cross product. 173 | #' 174 | chunked_crossprod <- function(x, y, chunk=NULL) { 175 | if(is.null(chunk) || chunk < 0) { 176 | ## if y is large, we need to chunk computation 177 | Int_max <- .Machine$integer.max 178 | chunk <- round(0.8 * Int_max / ncol(x)) 179 | } 180 | 181 | if(ncol(y) < chunk) return(Matrix::crossprod(x, y)) 182 | 183 | message("[chunked_crossprod] chunked compute: chunk = ", chunk) 184 | k <- ceiling(ncol(y) / chunk) 185 | gsetX <- matrix(NA, nrow=ncol(x), ncol=ncol(y), 186 | dimnames=list(colnames(x),colnames(y))) 187 | 188 | for(i in seq_len(k)) { 189 | jj <- c(((i-1)*chunk+1):min(ncol(y),(i*chunk))) 190 | xy <- Matrix::crossprod(x, y[,jj]) 191 | gsetX[,jj] <- as.matrix(xy) 192 | } 193 | 194 | return(gsetX) 195 | 196 | } 197 | 198 | #' Fast calculation of scSE score 199 | #' 200 | #' @description Calculates Single-Cell Signature Explorer (Pont et 201 | #' al., 2019) scores using plaid back-end. The computation is 202 | #' 10-100x faster than the original code. 203 | #' 204 | #' @details Computing the scSE requires running plaid on the linear 205 | #' (not logarithmic) score and perform additional normalization by 206 | #' the total UMI per sample. We have wrapped this in a single 207 | #' convenience function: 208 | #' 209 | #' To replicate the original "sum-of-UMI" scSE score, set `removeLog2=TRUE` 210 | #' and `scoreMean=FALSE`. scSE and plaid scores become more similar for 211 | #' `removeLog2=FALSE` and `scoreMean=TRUE`. 212 | #' 213 | #' We have extensively compared the results from `replaid.scse` and 214 | #' from the original scSE (implemented in GO lang) and we showed 215 | #' almost identical results in the score, logFC and p-values. 216 | #' 217 | #' 218 | #' @param X Gene or protein expression matrix. Generally log 219 | #' transformed. See details. Genes on rows, samples on columns. 220 | #' Also accepts SummarizedExperiment or SingleCellExperiment objects. 221 | #' @param matG Gene sets sparse matrix. Genes on rows, gene sets on 222 | #' columns. Also accepts BiocSet objects or GMT lists. 223 | #' @param removeLog2 Logical for whether to remove the Log2, i.e. will 224 | #' apply power transform (base2) on input (default TRUE). 225 | #' @param scoreMean Logical for whether computing sum or mean as score 226 | #' (default FALSE). 227 | #' @param assay Character: assay name for Bioconductor objects. Default "logcounts". 228 | #' @param min.genes Integer: minimum genes per gene set. Default 5. 229 | #' @param max.genes Integer: maximum genes per gene set. Default 500. 230 | #' 231 | #' @return Matrix of single-sample scSE enrichment scores. 232 | #' Gene sets on rows, samples on columns. 233 | #' 234 | #' @examples 235 | #' # Create example expression matrix (log-transformed) 236 | #' set.seed(123) 237 | #' X <- log2(matrix(rpois(500, lambda = 10) + 1, nrow = 50, ncol = 10)) 238 | #' rownames(X) <- paste0("GENE", 1:50) 239 | #' colnames(X) <- paste0("Sample", 1:10) 240 | #' 241 | #' # Create example gene sets 242 | #' gmt <- list( 243 | #' "Pathway1" = paste0("GENE", 1:15), 244 | #' "Pathway2" = paste0("GENE", 10:25) 245 | #' ) 246 | #' matG <- gmt2mat(gmt) 247 | #' 248 | #' # Compute scSE scores (original method) 249 | #' scores <- replaid.scse(X, matG, removeLog2 = TRUE, scoreMean = FALSE) 250 | #' print(scores[1:2, 1:5]) 251 | #' 252 | #' # Compute scSE scores (mean method) 253 | #' scores_mean <- replaid.scse(X, matG, removeLog2 = TRUE, scoreMean = TRUE) 254 | #' print(scores_mean[1:2, 1:5]) 255 | #' 256 | #' @export 257 | replaid.scse <- function(X, 258 | matG, 259 | removeLog2 = NULL, 260 | scoreMean = FALSE, 261 | assay="logcounts", 262 | min.genes=5, 263 | max.genes=500) { 264 | 265 | ## Auto-detect and convert Bioconductor objects 266 | if (inherits(X, "SummarizedExperiment") || inherits(X, "SingleCellExperiment")) { 267 | X <- .extract_expression_matrix(X, assay=assay, log.transform=FALSE) 268 | } 269 | 270 | if (inherits(matG, "BiocSet") || (is.list(matG) && !is.matrix(matG) && !inherits(matG, "Matrix"))) { 271 | matG <- .convert_geneset_to_matrix(matG, background=rownames(X), 272 | min.genes=min.genes, max.genes=max.genes) 273 | } 274 | 275 | if(is.null(removeLog2)) 276 | removeLog2 <- min(X, na.rm = TRUE)==0 && max(X, na.rm = TRUE) < 20 277 | 278 | if(removeLog2) { 279 | message("[replaid.scse] Converting data to linear scale (removing log2)...") 280 | if(inherits(X,"dgCMatrix")) { 281 | X@x <- 2**X@x 282 | } else { 283 | nz <- Matrix::which(X>0) 284 | X[nz] <- 2**X[nz] ## undo only non-zeros as in scSE code 285 | } 286 | } 287 | 288 | if(scoreMean) { 289 | ## modified scSE with Mean-statistics 290 | sX <- plaid(X, matG, stats="mean", normalize=FALSE) 291 | sumx <- Matrix::colMeans(abs(X)) + 1e-8 292 | sX <- sX %*% Matrix::Diagonal(x = 1/sumx) 293 | } else { 294 | ## original scSE with Sum-statistics 295 | sX <- plaid(X, matG, stats="sum", normalize=FALSE) 296 | sumx <- Matrix::colSums(abs(X)) + 1e-8 297 | sX <- sX %*% Matrix::Diagonal(x = 1/sumx) * 100 298 | } 299 | 300 | colnames(sX) <- colnames(X) 301 | sX <- as.matrix(sX) 302 | 303 | return(sX) 304 | 305 | } 306 | 307 | 308 | #' Fast calculation of singscore 309 | #' 310 | #' @description Calculates single-sample enrichment singscore 311 | #' (Fouratan et al., 2018) using plaid back-end. The computation is 312 | #' 10-100x faster than the original code. 313 | #' 314 | #' @details Computing the singscore requires to compute the ranks of 315 | #' the expression matrix. We have wrapped this in a single 316 | #' convenience function. 317 | #' 318 | #' We have extensively compared the results of `replaid.sing` and from 319 | #' the original `singscore` R package and we showed identical result 320 | #' in the score, logFC and p-values. 321 | #' 322 | #' @param X Gene or protein expression matrix. Generally log 323 | #' transformed. See details. Genes on rows, samples on columns. 324 | #' Also accepts SummarizedExperiment or SingleCellExperiment objects. 325 | #' @param matG Gene sets sparse matrix. Genes on rows, gene sets on 326 | #' columns. Also accepts BiocSet objects or GMT lists. 327 | #' @param assay Character: assay name for Bioconductor objects. Default "logcounts". 328 | #' @param min.genes Integer: minimum genes per gene set. Default 5. 329 | #' @param max.genes Integer: maximum genes per gene set. Default 500. 330 | #' 331 | #' @return Matrix of single-sample singscore enrichment scores. 332 | #' Gene sets on rows, samples on columns. 333 | #' 334 | #' @examples 335 | #' # Create example expression matrix 336 | #' set.seed(123) 337 | #' X <- matrix(rnorm(500), nrow = 50, ncol = 10) 338 | #' rownames(X) <- paste0("GENE", 1:50) 339 | #' colnames(X) <- paste0("Sample", 1:10) 340 | #' 341 | #' # Create example gene sets 342 | #' gmt <- list( 343 | #' "Pathway1" = paste0("GENE", 1:15), 344 | #' "Pathway2" = paste0("GENE", 10:25) 345 | #' ) 346 | #' matG <- gmt2mat(gmt) 347 | #' 348 | #' # Compute singscore 349 | #' scores <- replaid.sing(X, matG) 350 | #' print(scores[1:2, 1:5]) 351 | #' 352 | #' @export 353 | replaid.sing <- function(X, matG, assay="logcounts", min.genes=5, max.genes=500) { 354 | ## Auto-detect and convert Bioconductor objects 355 | if (inherits(X, "SummarizedExperiment") || inherits(X, "SingleCellExperiment")) { 356 | X <- .extract_expression_matrix(X, assay=assay, log.transform=FALSE) 357 | } 358 | 359 | if (inherits(matG, "BiocSet") || (is.list(matG) && !is.matrix(matG) && !inherits(matG, "Matrix"))) { 360 | matG <- .convert_geneset_to_matrix(matG, background=rownames(X), 361 | min.genes=min.genes, max.genes=max.genes) 362 | } 363 | 364 | ## the ties.method=min is important for exact replication 365 | rX <- colranks(X, ties.method = "min") 366 | rX <- rX / nrow(X) - 0.5 367 | gsetX <- plaid(rX, matG = matG, normalize = FALSE) 368 | return(gsetX) 369 | } 370 | 371 | #' Fast calculation of ssGSEA 372 | #' 373 | #' @description Calculates single-sample enrichment singscore (Barbie 374 | #' et al., 2009; Hänzelmann et al., 2013) using plaid back-end. The 375 | #' computation is 10-100x faster than the original code. 376 | #' 377 | #' @details Computing ssGSEA score requires to compute the ranks of 378 | #' the expression matrix and weighting of the ranks. We have wrapped 379 | #' this in a single convenience function. 380 | #' 381 | #' We have extensively compared the results of `replaid.ssgsea` and 382 | #' from the original `GSVA` R package and we showed highly similar 383 | #' results in the score, logFC and p-values. For alpha=0 we obtain 384 | #' exact results, for alpha>0 the results are highly similar but not 385 | #' exactly the same. 386 | #' 387 | #' @param X Gene or protein expression matrix. Generally log 388 | #' transformed. See details. Genes on rows, samples on columns. 389 | #' Also accepts SummarizedExperiment or SingleCellExperiment objects. 390 | #' @param matG Gene sets sparse matrix. Genes on rows, gene sets on 391 | #' columns. Also accepts BiocSet objects or GMT lists. 392 | #' @param alpha Weighting factor for exponential weighting of ranks 393 | #' @param assay Character: assay name for Bioconductor objects. Default "logcounts". 394 | #' @param min.genes Integer: minimum genes per gene set. Default 5. 395 | #' @param max.genes Integer: maximum genes per gene set. Default 500. 396 | #' 397 | #' @return Matrix of single-sample ssGSEA enrichment scores. 398 | #' Gene sets on rows, samples on columns. 399 | #' 400 | #' @examples 401 | #' # Create example expression matrix 402 | #' set.seed(123) 403 | #' X <- matrix(rnorm(500), nrow = 50, ncol = 10) 404 | #' rownames(X) <- paste0("GENE", 1:50) 405 | #' colnames(X) <- paste0("Sample", 1:10) 406 | #' 407 | #' # Create example gene sets 408 | #' gmt <- list( 409 | #' "Pathway1" = paste0("GENE", 1:15), 410 | #' "Pathway2" = paste0("GENE", 10:25) 411 | #' ) 412 | #' matG <- gmt2mat(gmt) 413 | #' 414 | #' # Compute ssGSEA scores (alpha = 0) 415 | #' scores <- replaid.ssgsea(X, matG, alpha = 0) 416 | #' print(scores[1:2, 1:5]) 417 | #' 418 | #' # Compute ssGSEA scores with weighting (alpha = 0.25) 419 | #' scores_weighted <- replaid.ssgsea(X, matG, alpha = 0.25) 420 | #' print(scores_weighted[1:2, 1:5]) 421 | #' 422 | #' @export 423 | replaid.ssgsea <- function(X, matG, alpha = 0, assay="logcounts", min.genes=5, max.genes=500) { 424 | ## Auto-detect and convert Bioconductor objects 425 | if (inherits(X, "SummarizedExperiment") || inherits(X, "SingleCellExperiment")) { 426 | X <- .extract_expression_matrix(X, assay=assay, log.transform=FALSE) 427 | } 428 | 429 | if (inherits(matG, "BiocSet") || (is.list(matG) && !is.matrix(matG) && !inherits(matG, "Matrix"))) { 430 | matG <- .convert_geneset_to_matrix(matG, background=rownames(X), 431 | min.genes=min.genes, max.genes=max.genes) 432 | } 433 | 434 | rX <- colranks(X, keep.zero = TRUE, ties.method = "average") 435 | if(alpha != 0) { 436 | ## This is not exactly like original formula. Not sure how to 437 | ## efficiently implement original rank weighting 438 | rX <- rX^(1 + alpha) 439 | } 440 | rX <- rX / max(rX) - 0.5 441 | dimnames(rX) <- dimnames(X) 442 | gsetX <- plaid(rX, matG, stats = "mean", normalize = TRUE) 443 | return(gsetX) 444 | } 445 | 446 | #' Fast calculation of UCell 447 | #' 448 | #' @description Calculates single-sample enrichment UCell (Andreatta 449 | #' et al., 2021) using plaid back-end. The computation is 450 | #' 10-100x faster than the original code. 451 | #' 452 | #' @details Computing ssGSEA score requires to compute the ranks of 453 | #' the expression matrix and truncation of the ranks. We have wrapped 454 | #' this in a single convenience function. 455 | #' 456 | #' We have extensively compared the results of `replaid.ucell` and 457 | #' from the original `UCell` R package and we showed near exacct 458 | #' results in the score, logFC and p-values. 459 | #' 460 | #' @param X Gene or protein expression matrix. Generally log 461 | #' transformed. See details. Genes on rows, samples on columns. 462 | #' Also accepts SummarizedExperiment or SingleCellExperiment objects. 463 | #' @param matG Gene sets sparse matrix. Genes on rows, gene sets on columns. 464 | #' Also accepts BiocSet objects or GMT lists. 465 | #' @param rmax Rank threshold (see Ucell paper). Default rmax = 1500. 466 | #' @param assay Character: assay name for Bioconductor objects. Default "logcounts". 467 | #' @param min.genes Integer: minimum genes per gene set. Default 5. 468 | #' @param max.genes Integer: maximum genes per gene set. Default 500. 469 | #' 470 | #' @return Matrix of single-sample UCell enrichment scores. 471 | #' Gene sets on rows, samples on columns. 472 | #' 473 | #' @examples 474 | #' # Create example expression matrix 475 | #' set.seed(123) 476 | #' X <- matrix(rnorm(500), nrow = 50, ncol = 10) 477 | #' rownames(X) <- paste0("GENE", 1:50) 478 | #' colnames(X) <- paste0("Sample", 1:10) 479 | #' 480 | #' # Create example gene sets 481 | #' gmt <- list( 482 | #' "Pathway1" = paste0("GENE", 1:15), 483 | #' "Pathway2" = paste0("GENE", 10:25) 484 | #' ) 485 | #' matG <- gmt2mat(gmt) 486 | #' 487 | #' # Compute UCell scores (default rmax = 1500) 488 | #' scores <- replaid.ucell(X, matG) 489 | #' print(scores[1:2, 1:5]) 490 | #' 491 | #' # Compute UCell scores with custom rmax 492 | #' scores_custom <- replaid.ucell(X, matG, rmax = 1000) 493 | #' print(scores_custom[1:2, 1:5]) 494 | #' 495 | #' @export 496 | replaid.ucell <- function(X, matG, rmax = 1500, assay="logcounts", min.genes=5, max.genes=500) { 497 | ## Auto-detect and convert Bioconductor objects 498 | if (inherits(X, "SummarizedExperiment") || inherits(X, "SingleCellExperiment")) { 499 | X <- .extract_expression_matrix(X, assay=assay, log.transform=FALSE) 500 | } 501 | 502 | if (inherits(matG, "BiocSet") || (is.list(matG) && !is.matrix(matG) && !inherits(matG, "Matrix"))) { 503 | matG <- .convert_geneset_to_matrix(matG, background=rownames(X), 504 | min.genes=min.genes, max.genes=max.genes) 505 | } 506 | 507 | rX <- colranks(X, ties.method = "average") 508 | rX <- pmin( max(rX) - rX, rmax+1 ) 509 | S <- plaid(rX, matG) 510 | S <- 1 - S / rmax + (Matrix::colSums(matG!=0)+1)/(2*rmax) 511 | return(S) 512 | } 513 | 514 | #' Fast calculation of AUCell 515 | #' 516 | #' @description Calculates single-sample enrichment AUCell (Aibar 517 | #' et al., 2017) using plaid back-end. The computation is 518 | #' 10-100x faster than the original code. 519 | #' 520 | #' @details Computing the AUCell score requires to compute the ranks 521 | #' of the expression matrix and approximating the AUC of a gene 522 | #' set. We have wrapped this in a single convenience function. 523 | #' 524 | #' We have extensively compared the results of `replaid.aucell` and 525 | #' from the original `AUCell` R package and we showed good concordance 526 | #' of results in the score, logFC and p-values. 527 | #' 528 | #' @param X Gene or protein expression matrix. Generally log 529 | #' transformed. See details. Genes on rows, samples on columns. 530 | #' Also accepts SummarizedExperiment or SingleCellExperiment objects. 531 | #' @param matG Gene sets sparse matrix. Genes on rows, gene sets on columns. 532 | #' Also accepts BiocSet objects or GMT lists. 533 | #' @param aucMaxRank Rank threshold (see AUCell paper). Default aucMaxRank = 0.05*nrow(X). 534 | #' @param assay Character: assay name for Bioconductor objects. Default "logcounts". 535 | #' @param min.genes Integer: minimum genes per gene set. Default 5. 536 | #' @param max.genes Integer: maximum genes per gene set. Default 500. 537 | #' 538 | #' @return Matrix of single-sample AUCell enrichment scores. 539 | #' Gene sets on rows, samples on columns. 540 | #' 541 | #' @examples 542 | #' # Create example expression matrix 543 | #' set.seed(123) 544 | #' X <- matrix(rnorm(500), nrow = 50, ncol = 10) 545 | #' rownames(X) <- paste0("GENE", 1:50) 546 | #' colnames(X) <- paste0("Sample", 1:10) 547 | #' 548 | #' # Create example gene sets 549 | #' gmt <- list( 550 | #' "Pathway1" = paste0("GENE", 1:15), 551 | #' "Pathway2" = paste0("GENE", 10:25) 552 | #' ) 553 | #' matG <- gmt2mat(gmt) 554 | #' 555 | #' # Compute AUCell scores 556 | #' scores <- replaid.aucell(X, matG) 557 | #' print(scores[1:2, 1:5]) 558 | #' 559 | #' @export 560 | replaid.aucell <- function(X, matG, aucMaxRank = NULL, assay="logcounts", min.genes=5, max.genes=500) { 561 | ## Auto-detect and convert Bioconductor objects 562 | if (inherits(X, "SummarizedExperiment") || inherits(X, "SingleCellExperiment")) { 563 | X <- .extract_expression_matrix(X, assay=assay, log.transform=FALSE) 564 | } 565 | 566 | if (inherits(matG, "BiocSet") || (is.list(matG) && !is.matrix(matG) && !inherits(matG, "Matrix"))) { 567 | matG <- .convert_geneset_to_matrix(matG, background=rownames(X), 568 | min.genes=min.genes, max.genes=max.genes) 569 | } 570 | 571 | if (is.null(aucMaxRank)) { 572 | aucMaxRank <- ceiling(0.05*nrow(X)) 573 | } 574 | 575 | rX <- colranks(X, ties.method = "average") 576 | ww <- 1.08*pmax((rX - (max(rX) - aucMaxRank)) / aucMaxRank, 0) 577 | gsetX <- plaid(ww, matG, stats = "mean") 578 | return(gsetX) 579 | } 580 | 581 | #' Fast approximation of GSVA 582 | #' 583 | #' @description Calculates single-sample enrichment GSVA (Hänzelmann 584 | #' et al., 2013) using plaid back-end. The computation is 585 | #' 10-100x faster than the original code. 586 | #' 587 | #' @details Computing the GSVA score requires to compute the CDF of 588 | #' the expression matrix, ranking and scoring the genesets. We have 589 | #' wrapped this in a single convenience function. 590 | #' 591 | #' We have extensively compared the results of `replaid.gsva` and 592 | #' from the original `GSVA` R package and we showed good concordance 593 | #' of results in the score, logFC and p-values. 594 | #' 595 | #' In the original formulation, GSVA uses an emperical CDF to 596 | #' transform expression of each feature to a (0;1) relative expression 597 | #' value. For efficiency reasons, this is here approximated by a 598 | #' z-transform (center+scale) of each row. 599 | #' 600 | #' @param X Gene or protein expression matrix. Generally log 601 | #' transformed. See details. Genes on rows, samples on columns. 602 | #' Also accepts SummarizedExperiment or SingleCellExperiment objects. 603 | #' @param matG Gene sets sparse matrix. Genes on rows, gene sets on 604 | #' columns. Also accepts BiocSet objects or GMT lists. 605 | #' @param tau Rank weight parameter (see GSVA publication). Default 606 | #' tau=0. 607 | #' @param rowtf Row transformation method ("z" or "ecdf"). Default "z". 608 | #' @param assay Character: assay name for Bioconductor objects. Default "logcounts". 609 | #' @param min.genes Integer: minimum genes per gene set. Default 5. 610 | #' @param max.genes Integer: maximum genes per gene set. Default 500. 611 | #' 612 | #' @return Matrix of single-sample GSVA enrichment scores. 613 | #' Gene sets on rows, samples on columns. 614 | #' 615 | #' @examples 616 | #' # Create example expression matrix 617 | #' set.seed(123) 618 | #' X <- matrix(rnorm(500), nrow = 50, ncol = 10) 619 | #' rownames(X) <- paste0("GENE", 1:50) 620 | #' colnames(X) <- paste0("Sample", 1:10) 621 | #' 622 | #' # Create example gene sets 623 | #' gmt <- list( 624 | #' "Pathway1" = paste0("GENE", 1:15), 625 | #' "Pathway2" = paste0("GENE", 10:25) 626 | #' ) 627 | #' matG <- gmt2mat(gmt) 628 | #' 629 | #' # Compute GSVA scores 630 | #' scores <- replaid.gsva(X, matG) 631 | #' print(scores[1:2, 1:5]) 632 | #' 633 | #' @export 634 | replaid.gsva <- function(X, matG, tau = 0, rowtf = c("z", "ecdf")[1], assay="logcounts", min.genes=5, max.genes=500) { 635 | ## Auto-detect and convert Bioconductor objects 636 | if (inherits(X, "SummarizedExperiment") || inherits(X, "SingleCellExperiment")) { 637 | X <- .extract_expression_matrix(X, assay=assay, log.transform=FALSE) 638 | } 639 | 640 | if (inherits(matG, "BiocSet") || (is.list(matG) && !is.matrix(matG) && !inherits(matG, "Matrix"))) { 641 | matG <- .convert_geneset_to_matrix(matG, background=rownames(X), 642 | min.genes=min.genes, max.genes=max.genes) 643 | } 644 | 645 | rowtf <- rowtf[1] 646 | 647 | if(rowtf == "z") { 648 | ## Faster approximation of relative activation 649 | zX <- (X - Matrix::rowMeans(X)) / (1e-8 + mat.rowsds(X)) 650 | } else if(rowtf=='ecdf') { 651 | ## this implements original ECDF idea 652 | zX <- t(apply(X,1,function(x) ecdf(x)(x))) 653 | } else { 654 | stop("unknown row transform",rowtf) 655 | } 656 | 657 | rX <- colranks(zX, signed = TRUE, ties.method = "average") 658 | rX <- rX / max(abs(rX)) 659 | if(tau > 0) { 660 | ## Note: This is not exactly like original formula. Not sure how 661 | ## to efficiently implement original rank weighting 662 | rX <- sign(rX) * abs(rX)^(1 + tau) 663 | } 664 | dimnames(rX) <- dimnames(X) 665 | gsetX <- plaid(rX, matG) 666 | 667 | return(gsetX) 668 | 669 | } 670 | 671 | #' Calculate row standard deviations for matrix 672 | #' 673 | #' @param X Input matrix (can be sparse or dense) 674 | #' 675 | #' @return Vector of row standard deviations. 676 | #' 677 | mat.rowsds <- function(X) { 678 | if(inherits(X,"CsparseMatrix")) 679 | return(sparseMatrixStats::rowSds(X)) 680 | sdx <- matrixStats::rowSds(X) 681 | return(sdx) 682 | } 683 | 684 | ##---------------------------------------------------------------- 685 | ##-------------------- UTILITIES --------------------------------- 686 | ##---------------------------------------------------------------- 687 | 688 | #' Normalize column medians of matrix 689 | #' 690 | #' This function normalizes the column medians of matrix x. It calls 691 | #' optimized functions from the matrixStats package. 692 | #' 693 | #' @param x Input matrix 694 | #' @param ignore.zero Logical indicating whether to ignore zeros to 695 | #' exclude for median calculation 696 | #' 697 | #' @return Matrix with normalized column medians. 698 | #' 699 | #' @examples 700 | #' # Create example matrix 701 | #' set.seed(123) 702 | #' x <- matrix(rnorm(100), nrow = 10, ncol = 10) 703 | #' x[1:3, 1:3] <- 0 # Add some zeros 704 | #' 705 | #' # Normalize medians 706 | #' x_norm <- normalize_medians(x) 707 | #' head(x_norm) 708 | #' 709 | #' @export 710 | normalize_medians <- function(x, ignore.zero = NULL) { 711 | 712 | if(is.null(ignore.zero)) 713 | ignore.zero <- (min(x,na.rm = TRUE) == 0) 714 | 715 | x <- as.matrix(x) 716 | 717 | if(ignore.zero) { 718 | zx <- x 719 | zx[Matrix::which(x==0)] <- NA 720 | medx <- matrixStats::colMedians(zx, na.rm = TRUE) 721 | medx[is.na(medx)] <- 0 722 | } else { 723 | medx <- matrixStats::colMedians(x, na.rm = TRUE) 724 | } 725 | 726 | nx <- sweep(x, 2, medx, '-') + mean(medx, na.rm = TRUE) 727 | return(nx) 728 | 729 | } 730 | 731 | #' Compute columnwise ranks of matrix 732 | #' 733 | #' Computes columnwise rank of matrix. Can be sparse. Tries to call 734 | #' optimized functions from Rfast or matrixStats. 735 | #' 736 | #' @param X Input matrix 737 | #' @param sparse Logical indicating to use sparse methods 738 | #' @param signed Logical indicating using signed ranks 739 | #' @param keep.zero Logical indicating whether to keep zero as ranked zero 740 | #' @param ties.method Character Choice of ties.method 741 | #' 742 | #' @return Matrix of columnwise ranks with same dimensions as input. 743 | #' 744 | #' @examples 745 | #' # Create example matrix 746 | #' set.seed(123) 747 | #' X <- matrix(rnorm(100), nrow = 10, ncol = 10) 748 | #' rownames(X) <- paste0("Gene", 1:10) 749 | #' colnames(X) <- paste0("Sample", 1:10) 750 | #' 751 | #' # Compute column ranks 752 | #' ranks <- colranks(X) 753 | #' print(ranks[1:5, 1:5]) 754 | #' 755 | #' # Compute signed ranks 756 | #' signed_ranks <- colranks(X, signed = TRUE) 757 | #' print(signed_ranks[1:5, 1:5]) 758 | #' 759 | #' @export 760 | colranks <- function(X, 761 | sparse = NULL, 762 | signed = FALSE, 763 | keep.zero = FALSE, 764 | ties.method = "average") { 765 | 766 | if(is.null(sparse)) 767 | sparse <- inherits(X,"CsparseMatrix") 768 | 769 | if(sparse) { 770 | X <- methods::as(X, "CsparseMatrix") 771 | if(keep.zero) { 772 | rX <- sparse_colranks(X, signed = signed, ties.method = ties.method) 773 | } else { 774 | if(signed) { 775 | sign.X <- sign(X) 776 | abs.rX <- Matrix::t(sparseMatrixStats::colRanks(abs(X), ties.method = ties.method)) 777 | rX <- abs.rX * sign.X 778 | } else { 779 | rX <- Matrix::t(sparseMatrixStats::colRanks(X, ties.method = ties.method)) 780 | } 781 | } 782 | } else { 783 | if(signed) { 784 | sign.X <- sign(X) 785 | abs.rX <- Matrix::t(matrixStats::colRanks(as.matrix(abs(X)), ties.method = ties.method)) 786 | rX <- sign.X * abs.rX 787 | } else { 788 | rX <- Matrix::t(matrixStats::colRanks(as.matrix(X), ties.method = ties.method)) 789 | } 790 | } 791 | 792 | return(rX) 793 | 794 | } 795 | 796 | #' Compute columm ranks for sparse matrix. Internally used by colranks() 797 | #' 798 | #' @param X Input matrix 799 | #' @param signed Logical: use or not signed ranks 800 | #' @param ties.method Character Choice of ties.method 801 | #' 802 | #' @return Sparse matrix of columnwise ranks with same dimensions as input. 803 | #' 804 | sparse_colranks <- function(X, signed = FALSE, ties.method = "average") { 805 | ## https://stackoverflow.com/questions/41772943 806 | X <- methods::as(X, "CsparseMatrix") 807 | n <- diff(X@p) ## number of non-zeros per column 808 | lst <- split(X@x, rep.int(seq_len(ncol(X)), n)) ## columns to list 809 | ## column-wise ranking and result collapsing 810 | if(signed) { 811 | lst.sign <- lapply(lst, sign) 812 | lst.rnk <- lapply(lst, function(x) rank(abs(x),ties.method = ties.method)) 813 | rnk <- unlist(mapply('*', lst.sign, lst.rnk, SIMPLIFY = FALSE)) 814 | } else { 815 | rnk <- unlist(lapply(lst, rank, ties.method = ties.method)) 816 | } 817 | 818 | rX <- X ## copy sparse matrix 819 | rX@x <- rnk ## replace non-zero elements with rank 820 | 821 | return(rX) 822 | 823 | } 824 | 825 | ##------------------------------------------------------------- 826 | ##------------------ end of file ------------------------------ 827 | ##------------------------------------------------------------- 828 | --------------------------------------------------------------------------------