├── .github ├── .gitignore └── workflows │ ├── windows-check.yaml │ ├── check-release.yaml │ ├── R-CMD-check.yaml │ ├── pkgdown.yaml │ └── test-coverage.yaml ├── vignettes ├── .gitignore ├── figure │ ├── figt-1.png │ └── figrho-1.png ├── covariate_adjustment.Rmd.orig ├── covariate_adjustment.Rmd └── CSCORE.Rmd ├── LICENSE ├── src ├── CSCORE.so ├── WLS_mean.o ├── WSL_cov.o ├── RcppExports.o ├── CSCORE_IRLS_cpp.o ├── Makevars.win ├── WLS_cov.h ├── WLS_mean.h ├── Makevars ├── WLS_mean.cpp ├── WLS_cov.cpp ├── RcppExports.cpp └── CSCORE_IRLS_cpp.cpp ├── R ├── zzz.R ├── RcppExports.R ├── data.R ├── WLS_helper.R ├── CSCORE_IRLS_base.R ├── CSCORE.R ├── CSCORE_IRLS.R └── CSCORE_IRLS_cpp.R ├── data ├── ind_gene_pair.rda └── top_enrich_go.rda ├── .gitignore ├── .Rbuildignore ├── tests ├── testthat │ ├── fixtures │ │ └── mic_highly_expressed_500_AD_control │ ├── test-CSCORE.R │ ├── test-IRLS_versions.R │ ├── test-CSCORE_IRLS.R │ └── test-covariates.R └── testthat.R ├── NAMESPACE ├── man ├── has_non_integer.Rd ├── WLS_mean.Rd ├── top_enrich_go.Rd ├── ind_gene_pair.Rd ├── WLS_cov.Rd ├── post_process_est.Rd ├── set_D.Rd ├── CSCORE_IRLS_base.Rd ├── CSCORE.Rd ├── CSCORE_IRLS.Rd └── CSCORE_IRLS_cpp.Rd ├── CSCORE.Rproj ├── data-raw └── ind_gene_pair.R ├── LICENSE.md ├── DESCRIPTION ├── _pkgdown.yml ├── NEWS.md └── README.md /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2025 2 | COPYRIGHT HOLDER: CS-CORE authors 3 | -------------------------------------------------------------------------------- /src/CSCORE.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChangSuBiostats/CS-CORE/HEAD/src/CSCORE.so -------------------------------------------------------------------------------- /src/WLS_mean.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChangSuBiostats/CS-CORE/HEAD/src/WLS_mean.o -------------------------------------------------------------------------------- /src/WSL_cov.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChangSuBiostats/CS-CORE/HEAD/src/WSL_cov.o -------------------------------------------------------------------------------- /src/RcppExports.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChangSuBiostats/CS-CORE/HEAD/src/RcppExports.o -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | #' @useDynLib CSCORE, .registration = TRUE 2 | #' @importFrom Rcpp evalCpp 3 | NULL 4 | -------------------------------------------------------------------------------- /data/ind_gene_pair.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChangSuBiostats/CS-CORE/HEAD/data/ind_gene_pair.rda -------------------------------------------------------------------------------- /data/top_enrich_go.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChangSuBiostats/CS-CORE/HEAD/data/top_enrich_go.rda -------------------------------------------------------------------------------- /src/CSCORE_IRLS_cpp.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChangSuBiostats/CS-CORE/HEAD/src/CSCORE_IRLS_cpp.o -------------------------------------------------------------------------------- /vignettes/figure/figt-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChangSuBiostats/CS-CORE/HEAD/vignettes/figure/figt-1.png -------------------------------------------------------------------------------- /vignettes/figure/figrho-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChangSuBiostats/CS-CORE/HEAD/vignettes/figure/figrho-1.png -------------------------------------------------------------------------------- /src/Makevars.win: -------------------------------------------------------------------------------- 1 | CXX_STD = CXX17 2 | PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) 3 | PKG_CPPFLAGS = -DARMA_USE_CURRENT 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .Rdata 4 | .rds 5 | .httr-oauth 6 | .DS_Store 7 | inst/doc 8 | docs 9 | src/*.o 10 | /doc/ 11 | /Meta/ 12 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^CSCORE\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^LICENSE\.md$ 4 | ^data-raw$ 5 | ^\.github$ 6 | ^_pkgdown\.yml$ 7 | ^docs$ 8 | ^pkgdown$ 9 | ^doc$ 10 | ^Meta$ 11 | -------------------------------------------------------------------------------- /tests/testthat/fixtures/mic_highly_expressed_500_AD_control: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChangSuBiostats/CS-CORE/HEAD/tests/testthat/fixtures/mic_highly_expressed_500_AD_control -------------------------------------------------------------------------------- /src/WLS_cov.h: -------------------------------------------------------------------------------- 1 | #ifndef WLS_COV_H 2 | #define WLS_COV_H 3 | 4 | #include 5 | 6 | // Declaration of the function 7 | Rcpp::List WLS_cov(arma::mat D, arma::mat X, arma::mat W); 8 | 9 | #endif 10 | -------------------------------------------------------------------------------- /src/WLS_mean.h: -------------------------------------------------------------------------------- 1 | #ifndef WLS_MEAN_H 2 | #define WLS_MEAN_H 3 | 4 | #include 5 | 6 | // Declaration of the function 7 | arma::mat WLS_mean(arma::mat D, arma::mat X, arma::mat W); 8 | 9 | #endif 10 | -------------------------------------------------------------------------------- /tests/testthat/test-CSCORE.R: -------------------------------------------------------------------------------- 1 | test_that("input must be a Seurat object", { 2 | test_dat = readRDS("fixtures/mic_highly_expressed_500_AD_control") 3 | count = test_dat[[1]] 4 | genes = colnames(count) 5 | expect_error(CSCORE(count, genes)) 6 | }) 7 | 8 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(CSCORE) 4 | export(CSCORE_IRLS_base) 5 | export(CSCORE_IRLS_cpp) 6 | export(WLS_cov) 7 | export(WLS_mean) 8 | export(post_process_est) 9 | importFrom(Rcpp,evalCpp) 10 | useDynLib(CSCORE, .registration = TRUE) 11 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | ## Use RcppArmadillo for C++ compilation 2 | PKG_CPPFLAGS = $(shell ${R_HOME}/bin/Rscript -e "RcppArmadillo:::CxxFlags()") 3 | 4 | ## Link against the same LAPACK/BLAS libraries that R was built with 5 | PKG_LIBS = $(shell ${R_HOME}/bin/Rscript -e "RcppArmadillo:::LdFlags()") \ 6 | $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) 7 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | # This file is part of the standard setup for testthat. 2 | # It is recommended that you do not modify it. 3 | # 4 | # Where should you do additional test configuration? 5 | # Learn more about the roles of various files in: 6 | # * https://r-pkgs.org/tests.html 7 | # * https://testthat.r-lib.org/reference/test_package.html#special-files 8 | 9 | library(testthat) 10 | library(CSCORE) 11 | 12 | test_check("CSCORE") 13 | -------------------------------------------------------------------------------- /man/has_non_integer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/CSCORE_IRLS_cpp.R 3 | \name{has_non_integer} 4 | \alias{has_non_integer} 5 | \title{Check for non-integer values in a matrix} 6 | \usage{ 7 | has_non_integer(mat) 8 | } 9 | \arguments{ 10 | \item{mat}{A numeric matrix.} 11 | } 12 | \value{ 13 | A logical value: \code{TRUE} if any element is non-integer, otherwise \code{FALSE}. 14 | } 15 | \description{ 16 | This function checks whether a numeric matrix contains any non-integer values. 17 | } 18 | \keyword{internal} 19 | -------------------------------------------------------------------------------- /CSCORE.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | ProjectId: e3c151dd-dd48-4daa-8123-01d6ab649e4b 3 | 4 | RestoreWorkspace: No 5 | SaveWorkspace: No 6 | AlwaysSaveHistory: Default 7 | 8 | EnableCodeIndexing: Yes 9 | UseSpacesForTab: Yes 10 | NumSpacesForTab: 2 11 | Encoding: UTF-8 12 | 13 | RnwWeave: Sweave 14 | LaTeX: pdfLaTeX 15 | 16 | AutoAppendNewline: Yes 17 | StripTrailingWhitespace: Yes 18 | LineEndingConversion: Posix 19 | 20 | BuildType: Package 21 | PackageUseDevtools: Yes 22 | PackageInstallArgs: --no-multiarch --with-keep.source 23 | PackageRoxygenize: rd,collate,namespace 24 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | CSCORE_IRLS_cpp_impl <- function(X, seq_depth_sq, D_mu, D_sigma2, D_sigma, post_process = TRUE, n_iter = 10L, eps = 0.05, verbose = FALSE, conv = "q95", return_all = FALSE) { 5 | .Call(`_CSCORE_CSCORE_IRLS_cpp_impl`, X, seq_depth_sq, D_mu, D_sigma2, D_sigma, post_process, n_iter, eps, verbose, conv, return_all) 6 | } 7 | 8 | WLS_cov <- function(D, X, W) { 9 | .Call(`_CSCORE_WLS_cov`, D, X, W) 10 | } 11 | 12 | WLS_mean <- function(D, X, W) { 13 | .Call(`_CSCORE_WLS_mean`, D, X, W) 14 | } 15 | 16 | -------------------------------------------------------------------------------- /man/WLS_mean.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/WLS_helper.R 3 | \name{WLS_mean} 4 | \alias{WLS_mean} 5 | \title{Weighted Least Squares Mean and Variance Estimation (Rcpp)} 6 | \usage{ 7 | WLS_mean(D, X, W) 8 | } 9 | \arguments{ 10 | \item{D}{Design matrix for WLS (size n x k), where the first column represents baseline mean/var, 11 | and others represents covariates.} 12 | 13 | \item{X}{Response matrix for WLS (size n x p)} 14 | 15 | \item{W}{Weight matrix (n x p)} 16 | } 17 | \value{ 18 | A k x p matrix of WLS estimates 19 | } 20 | \description{ 21 | Compute WLS estimates of mean or variance for all p genes 22 | } 23 | -------------------------------------------------------------------------------- /man/top_enrich_go.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{top_enrich_go} 5 | \alias{top_enrich_go} 6 | \title{Top GO enrichment result in the CS-CORE vignette} 7 | \format{ 8 | \subsection{\code{top_enrich_go}}{ 9 | 10 | A list with five elements, where each element holds the top 3 GO terms enriched in a gene module with strong enrichment signals 11 | } 12 | } 13 | \usage{ 14 | top_enrich_go 15 | } 16 | \description{ 17 | This data set stores the top GO enrichment result from the analysis in CS-CORE vignette. 18 | It is saved for efficiently rendering the vignette without re-evaluating the codes. 19 | } 20 | \keyword{datasets} 21 | -------------------------------------------------------------------------------- /man/ind_gene_pair.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{ind_gene_pair} 5 | \alias{ind_gene_pair} 6 | \title{A simulated independent gene pair} 7 | \format{ 8 | \subsection{\code{ind_gene_pair}}{ 9 | 10 | A list with 2 elements: 11 | \describe{ 12 | \item{counts}{A 1972 by 2 count matrix with 1972 cells and 2 genes} 13 | \item{seq_depths}{A length 1972 vector of sequencing depths} 14 | } 15 | } 16 | } 17 | \usage{ 18 | ind_gene_pair 19 | } 20 | \description{ 21 | This data set is created only for illustrative purposes and is used to test CSCORE_IRLS.R. 22 | The source code for generating this data set is in data-raw/ on Github. 23 | } 24 | \keyword{datasets} 25 | -------------------------------------------------------------------------------- /man/WLS_cov.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/WLS_helper.R 3 | \name{WLS_cov} 4 | \alias{WLS_cov} 5 | \title{Weighted Least Squares Covariance Estimation (Rcpp)} 6 | \usage{ 7 | WLS_cov(D, X, W) 8 | } 9 | \arguments{ 10 | \item{D}{Design matrix for WLS (size n x k)} 11 | 12 | \item{X}{Gene expression matrix (n x p)} 13 | 14 | \item{W}{Weight matrix (n x p)} 15 | } 16 | \value{ 17 | A list with: 18 | \describe{ 19 | \item{cov_hat}{A k x p x p array of WLS estimates} 20 | \item{test_stat}{A p x p matrix of test statistics} 21 | } 22 | } 23 | \description{ 24 | Computes WLS estimates of gene-gene covariance and test statistics that assess 25 | the statistical significance of co-expression using moment-based regressions. 26 | } 27 | -------------------------------------------------------------------------------- /data-raw/ind_gene_pair.R: -------------------------------------------------------------------------------- 1 | ## code to prepare `ind_gene_pair` dataset goes here 2 | 3 | # Generate highly variable seq depths from a log normal distribution 4 | set.seed(202212) 5 | sim_seq_depths <- exp(rnorm(2000, 7.3, sd=0.6)) 6 | sim_seq_depths <- sim_seq_depths[!sim_seq_depths < 400] 7 | sim_seq_depths <- round(sim_seq_depths) 8 | summary(sim_seq_depths) 9 | 10 | # Generate an independent gene pair 11 | n <- length(sim_seq_depths) 12 | z_mat <- matrix(rgamma(2*n, shape=4.83, scale=7.56e-05), 13 | nrow = n, ncol = 2) 14 | x_mat <- matrix(rpois(n = 2*n, lambda=z_mat * sim_seq_depths), nrow = n, ncol = 2) 15 | summary(x_mat[,1]) 16 | summary(x_mat[,2]) 17 | 18 | ind_gene_pair <- list(counts = x_mat, seq_depths = sim_seq_depths) 19 | 20 | usethis::use_data(ind_gene_pair, overwrite = TRUE) 21 | -------------------------------------------------------------------------------- /.github/workflows/windows-check.yaml: -------------------------------------------------------------------------------- 1 | name: Windows (R-release) 2 | 3 | on: 4 | push: 5 | pull_request: 6 | workflow_dispatch: 7 | 8 | jobs: 9 | check: 10 | runs-on: windows-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | - uses: r-lib/actions/setup-r@v2 14 | with: 15 | r-version: 'release' 16 | - uses: r-lib/actions/setup-r-dependencies@v2 17 | with: 18 | extra-packages: any::rcmdcheck, any::pak 19 | needs: check 20 | - name: Ensure latest Rcpp/RcppArmadillo 21 | run: | 22 | Rscript -e "pak::pkg_install(c('Rcpp','RcppArmadillo'))" 23 | - name: Check package 24 | env: 25 | _R_CHECK_FORCE_SUGGESTS_: false 26 | run: | 27 | Rscript -e "rcmdcheck::rcmdcheck('.', args=c('--no-manual','--as-cran'), error_on='error')" 28 | -------------------------------------------------------------------------------- /.github/workflows/check-release.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | R_KEEP_PKG_SOURCE: yes 17 | steps: 18 | - uses: actions/checkout@v3 19 | 20 | - uses: r-lib/actions/setup-r@v2 21 | with: 22 | use-public-rspm: true 23 | 24 | - uses: r-lib/actions/setup-r-dependencies@v2 25 | with: 26 | extra-packages: any::rcmdcheck 27 | needs: check 28 | 29 | - uses: r-lib/actions/check-r-package@v2 30 | -------------------------------------------------------------------------------- /man/post_process_est.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/CSCORE_IRLS_cpp.R 3 | \name{post_process_est} 4 | \alias{post_process_est} 5 | \title{Post-process IRLS estimates} 6 | \usage{ 7 | post_process_est(est) 8 | } 9 | \arguments{ 10 | \item{est}{Estimated co-expression matrix from IRLS} 11 | } 12 | \value{ 13 | Post-processed correlation matrix 14 | } 15 | \description{ 16 | The IRLS procedure does not guarantee the variance estimates to be postive nor the co-expression parameters to be bounded. 17 | To address this, this function evaluates the percentage of genes with negative variance estimates; 18 | sets the their co-expressions to 0 as these genes do not have sufficient biological variations. 19 | This function also evaluates the percentage of gene pairs with out-of-bound co-expression estimates; 20 | sets the co-expressions greater than 1 to 1; set the co-expressions smaller than -1 to -1. 21 | } 22 | -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | #' A simulated independent gene pair 2 | #' 3 | #' This data set is created only for illustrative purposes and is used to test CSCORE_IRLS.R. 4 | #' The source code for generating this data set is in data-raw/ on Github. 5 | #' 6 | #' @format ## `ind_gene_pair` 7 | #' A list with 2 elements: 8 | #' \describe{ 9 | #' \item{counts}{A 1972 by 2 count matrix with 1972 cells and 2 genes} 10 | #' \item{seq_depths}{A length 1972 vector of sequencing depths} 11 | #' } 12 | "ind_gene_pair" 13 | 14 | #' Top GO enrichment result in the CS-CORE vignette 15 | #' 16 | #' This data set stores the top GO enrichment result from the analysis in CS-CORE vignette. 17 | #' It is saved for efficiently rendering the vignette without re-evaluating the codes. 18 | #' 19 | #' @format ## `top_enrich_go` 20 | #' A list with five elements, where each element holds the top 3 GO terms enriched in a gene module with strong enrichment signals 21 | #' 22 | "top_enrich_go" 23 | 24 | -------------------------------------------------------------------------------- /man/set_D.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/CSCORE_IRLS_cpp.R 3 | \name{set_D} 4 | \alias{set_D} 5 | \title{Set the design matrix for moment-based regressions} 6 | \usage{ 7 | set_D(s, D, adjust_setting, covariate_level) 8 | } 9 | \arguments{ 10 | \item{s}{A numeric vector of sequencing depths (for mean regression) or squared sequencing depths (for variance and covariance)} 11 | 12 | \item{D}{A numeric matrix of intercept and covariates (\code{n x K})} 13 | 14 | \item{adjust_setting}{Logical; whether to adjust for covariates} 15 | 16 | \item{covariate_level}{A character string indicating whether covariates are assumed to affect 17 | the underlying gene expression levels (\code{"z"}) or the observed counts (\code{"x"}).} 18 | } 19 | \value{ 20 | Design matrix (n by K) for moment-based regressions 21 | } 22 | \description{ 23 | Set the design matrix for moment-based regressions 24 | } 25 | \keyword{internal} 26 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | R_KEEP_PKG_SOURCE: yes 17 | steps: 18 | - uses: actions/checkout@v3 19 | 20 | - uses: r-lib/actions/setup-r@v2 21 | with: 22 | use-public-rspm: true 23 | 24 | - name: Install system dependencies 25 | run: sudo apt-get update && sudo apt-get install -y liblapack-dev libopenblas-dev 26 | 27 | - uses: r-lib/actions/setup-r-dependencies@v2 28 | with: 29 | extra-packages: any::rcmdcheck 30 | needs: check 31 | 32 | - uses: r-lib/actions/check-r-package@v2 33 | with: 34 | error-on: '"error"' 35 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2025 CS-CORE authors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: CSCORE 2 | Title: Cell-type-specific co-expression inference from single cell RNA-sequencing data 3 | Version: 1.0.2 4 | Authors@R: c( 5 | person("Chang", "Su", , "chang.su@emory.edu", role = c("aut", "cre"), 6 | comment = c(ORCID = "0000-0002-8704-1512")), 7 | person("Zichun", "Xu", , "zichun.xu@yale.edu", role = c("aut"), 8 | comment = c(ORCID = "0000-0002-4001-0321")), 9 | person("Xinning", "Shan", , "xinning.shan@yale.edu", role = c("aut"), 10 | comment = c(ORCID = "0000-0001-6270-0094")) 11 | ) 12 | Description: This R package provides an implementation for the statistical method CS-CORE that estimates and tests cell-type-speicific co-expression networks using single cell RNA sequencing data. 13 | License: MIT + file LICENSE 14 | Encoding: UTF-8 15 | Roxygen: list(markdown = TRUE) 16 | RoxygenNote: 7.3.2 17 | Imports: 18 | stats, 19 | Rcpp 20 | LinkingTo: Rcpp, RcppArmadillo 21 | Depends: 22 | R (>= 2.10) 23 | LazyData: true 24 | Suggests: 25 | knitr, 26 | rmarkdown, 27 | MASS, 28 | Seurat, 29 | testthat (>= 3.0.0) 30 | Config/testthat/edition: 3 31 | VignetteBuilder: knitr 32 | URL: https://changsubiostats.github.io/CS-CORE/ 33 | -------------------------------------------------------------------------------- /R/WLS_helper.R: -------------------------------------------------------------------------------- 1 | #' Weighted Least Squares Covariance Estimation (Rcpp) 2 | #' 3 | #' Computes WLS estimates of gene-gene covariance and test statistics that assess 4 | #' the statistical significance of co-expression using moment-based regressions. 5 | #' 6 | #' @param D Design matrix for WLS (size n x k) 7 | #' @param X Gene expression matrix (n x p) 8 | #' @param W Weight matrix (n x p) 9 | #' 10 | #' @return A list with: 11 | #' \describe{ 12 | #' \item{cov_hat}{A k x p x p array of WLS estimates} 13 | #' \item{test_stat}{A p x p matrix of test statistics} 14 | #' } 15 | #' 16 | #' @export 17 | WLS_cov <- function(D, X, W) { 18 | .Call(`_CSCORE_WLS_cov`, D, X, W) 19 | } 20 | 21 | #' Weighted Least Squares Mean and Variance Estimation (Rcpp) 22 | #' 23 | #' Compute WLS estimates of mean or variance for all p genes 24 | #' 25 | #' @param D Design matrix for WLS (size n x k), where the first column represents baseline mean/var, 26 | #' and others represents covariates. 27 | #' @param X Response matrix for WLS (size n x p) 28 | #' @param W Weight matrix (n x p) 29 | #' 30 | #' @return A k x p matrix of WLS estimates 31 | #' 32 | #' @export 33 | WLS_mean <- function(D, X, W) { 34 | .Call(`_CSCORE_WLS_mean`, D, X, W) 35 | } 36 | -------------------------------------------------------------------------------- /tests/testthat/test-IRLS_versions.R: -------------------------------------------------------------------------------- 1 | test_that("CSCORE_IRLS Rcpp version matches base version on a simulated independent gene pair", { 2 | expect_equal(CSCORE_IRLS_cpp(ind_gene_pair$counts, ind_gene_pair$seq_depths, 3 | IRLS_par = list(n_iter = 10, eps = 0.05, verbose = FALSE, conv = 'max')), 4 | CSCORE_IRLS_base(ind_gene_pair$counts, ind_gene_pair$seq_depths), tolerance = 1e-8) 5 | }) 6 | 7 | test_that("CSCORE_IRLS Rcpp version matches base version on real data from 3k cells and on 500 genes", { 8 | test_dat = readRDS("fixtures/mic_highly_expressed_500_AD_control") 9 | count = test_dat[[1]] 10 | seq_depth = test_dat[[2]] 11 | expect_equal(CSCORE_IRLS_cpp(count, seq_depth, 12 | IRLS_par = list(n_iter = 10, eps = 0.05, verbose = FALSE, conv = 'max')), 13 | CSCORE_IRLS_base(count, seq_depth), tolerance = 1e-8) 14 | }) 15 | 16 | test_that("CSCORE_IRLS Rcpp version matches base version on real data from 3k cells and on 500 genes", { 17 | test_dat = readRDS("fixtures/mic_highly_expressed_500_AD_control") 18 | count = test_dat[[1]] 19 | seq_depth = test_dat[[2]] 20 | expect_equal(CSCORE_IRLS_cpp(count, seq_depth, 21 | IRLS_par = list(n_iter = 10, eps = 0.05, verbose = FALSE, conv = 'max')), 22 | CSCORE_IRLS_base(count, seq_depth), tolerance = 1e-8) 23 | }) 24 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: https://changsubiostats.github.io/CS-CORE/ 2 | template: 3 | bootstrap: 5 4 | bootswatch: simplex 5 | math-rendering: mathjax 6 | articles: 7 | - title: Getting started 8 | contents: CSCORE 9 | - title: Covariate Adjustment 10 | contents: covariate_adjustment 11 | navbar: 12 | structure: 13 | left: 14 | - home 15 | - intro 16 | - articles 17 | - reference 18 | - news 19 | right: github 20 | components: 21 | articles: 22 | text: Vignettes 23 | menu: 24 | - text: CS-CORE 25 | href: articles/CSCORE.html 26 | - text: Covariates adjustment 27 | href: articles/covariate_adjustment.html 28 | home: 29 | sidebar: 30 | structure: 31 | - license 32 | - authors 33 | - links 34 | links: 35 | - text: CS-CORE for co-expression 36 | href: https://doi.org/10.1038/s41467-023-40503-7 37 | - text: scMultiMap for peak-gene association 38 | href: https://doi.org/10.1038/s41467-025-59306-z 39 | reference: 40 | - title: CS-CORE functions 41 | desc: | 42 | Functions for inferring cell-type-specific co-expression networks 43 | contents: 44 | - CSCORE 45 | - CSCORE_IRLS_cpp 46 | - CSCORE_IRLS_base 47 | - post_process_est 48 | - WLS_mean 49 | - WLS_cov 50 | - title: Supporting data sets 51 | desc: | 52 | Data sets for illustrative purposes 53 | contents: 54 | - ind_gene_pair 55 | - top_enrich_go 56 | 57 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # CSCORE 1.0.2 (2025-09-13) 2 | 3 | ## Improvements 4 | * Fixed BLAS/LAPACK linking on Windows by adding `src/Makevars.win`. 5 | * Standardized dimension and loop variables to use `arma::uword`, removing signed/unsigned comparison warnings. 6 | * Ensured compatibility with R 4.5.1 and RcppArmadillo 15.x (validated via win-builder and GitHub Actions CI). 7 | * Improved consistency of header includes (`RcppArmadillo.h` first) for more robust builds. 8 | 9 | ## Infrastructure 10 | * Added GitHub Actions workflow for Windows CI with latest R and RcppArmadillo. 11 | * Added Windows build status badge to README. 12 | 13 | 14 | # CSCORE 1.0.1 (2025-06-26) 15 | ## Features 16 | * Added support for covariate adjustment in moment-based regressions for estimating mean, variance, and covariance. 17 | 18 | ## Improvements 19 | * Introduced Rcpp implementations of `WLS_mean()` and `WLS_cov()` to enhance memory efficiency while preserving computational performance. 20 | * Implemented `CSCORE_IRLS_cpp()`, the cpp implementation of the full IRLS procedure to replace the original base R implementation. 21 | * Added tests to ensure consistency with the original base R implementation. 22 | * Updated documentation. 23 | 24 | # CSCORE 1.0.0 25 | * The first official release of the CS-CORE R pakcage (with NC revision). 26 | 27 | # CSCORE 0.0.0.9000 28 | 29 | * The first version of the CS-CORE R package. 30 | * Added a `NEWS.md` file to track changes to the package. 31 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | release: 9 | types: [published] 10 | workflow_dispatch: 11 | 12 | name: pkgdown 13 | 14 | jobs: 15 | pkgdown: 16 | runs-on: ubuntu-latest 17 | # Only restrict concurrency for non-PR jobs 18 | concurrency: 19 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 20 | env: 21 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 22 | steps: 23 | - uses: actions/checkout@v3 24 | 25 | - uses: r-lib/actions/setup-pandoc@v2 26 | 27 | - uses: r-lib/actions/setup-r@v2 28 | with: 29 | use-public-rspm: true 30 | 31 | - uses: r-lib/actions/setup-r-dependencies@v2 32 | with: 33 | extra-packages: any::pkgdown, local::. 34 | needs: website 35 | 36 | - name: Build site 37 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 38 | shell: Rscript {0} 39 | 40 | - name: Deploy to GitHub pages 🚀 41 | if: github.event_name != 'pull_request' 42 | uses: JamesIves/github-pages-deploy-action@v4.4.1 43 | with: 44 | clean: false 45 | branch: gh-pages 46 | folder: docs 47 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: test-coverage 10 | 11 | jobs: 12 | test-coverage: 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | 20 | - uses: r-lib/actions/setup-r@v2 21 | with: 22 | use-public-rspm: true 23 | 24 | - uses: r-lib/actions/setup-r-dependencies@v2 25 | with: 26 | extra-packages: any::covr 27 | needs: coverage 28 | 29 | - name: Test coverage 30 | run: | 31 | covr::codecov( 32 | quiet = FALSE, 33 | clean = FALSE, 34 | install_path = file.path(Sys.getenv("RUNNER_TEMP"), "package") 35 | ) 36 | shell: Rscript {0} 37 | 38 | - name: Show testthat output 39 | if: always() 40 | run: | 41 | ## -------------------------------------------------------------------- 42 | find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true 43 | shell: bash 44 | 45 | - name: Upload test results 46 | if: failure() 47 | uses: actions/upload-artifact@v4 48 | with: 49 | name: coverage-test-failures 50 | path: ${{ runner.temp }}/package 51 | -------------------------------------------------------------------------------- /src/WLS_mean.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | // [[Rcpp::depends(RcppArmadillo)]] 3 | 4 | using namespace Rcpp; 5 | 6 | 7 | /* 8 | * Compute WLS estimates of mean or variance for all p genes 9 | * 10 | * D: Design matrix for WLS (size n x k), where the first column represents baseline mean/var, and others represents covariates 11 | * X: Response matrix for WLS (size n x p) 12 | * W: Weight matrix (size n x p) 13 | * 14 | * Returns: numeric matrix of size k x p, where (, j) stores the WLS estimates for gene j 15 | */ 16 | // [[Rcpp::export]] 17 | arma::mat WLS_mean(arma::mat D, arma::mat X, arma::mat W) { 18 | const arma::uword n = D.n_rows; // number of rows of D (i.e., number of cells) 19 | const arma::uword k = D.n_cols; // number of columns of D (i.e., number of covariates) 20 | const arma::uword p = X.n_cols; // number of genes 21 | 22 | if (X.n_rows != n) { 23 | stop("Dimensions of design matrix and gene expression data do not match."); 24 | } 25 | 26 | arma::mat result(k, p, arma::fill::zeros); // k x p matrix, WLS estimates 27 | 28 | arma::mat D_T = D.t(); // precompute transpose once 29 | 30 | for (arma::uword j = 0; j < p; ++j) { 31 | // Compute D^T * diag(w) * D 32 | arma::mat DTD_w = D_T * (D.each_col() % W.col(j)); 33 | 34 | // Inverse of DTD_w 35 | arma::mat inv_DTD_w = arma::inv(DTD_w); 36 | 37 | // Compute D^T * (w % x_prod) 38 | arma::vec Dt_w_x = D_T * (X.col(j) % W.col(j)); 39 | 40 | // WLS estimator 41 | arma::vec res_vec = inv_DTD_w * Dt_w_x; 42 | 43 | // Save WLS estimates 44 | result.col(j) = res_vec; 45 | } 46 | 47 | return result; 48 | } 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CS-CORE 2 | [![R-CMD-check](https://github.com/ChangSuBiostats/CS-CORE/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/ChangSuBiostats/CS-CORE/actions/workflows/R-CMD-check.yaml) 3 | ![Windows CI](https://github.com/ChangSuBiostats/CS-CORE/actions/workflows/windows-check.yaml/badge.svg) 4 | [![DOI](https://zenodo.org/badge/576326164.svg)](https://zenodo.org/badge/latestdoi/576326164) 5 | 6 | `CS-CORE` is a R package for cell-type-specific co-expression inference from single cell RNA-sequencing data. 7 | 8 | ## Reference 9 | Su, Chang, et al. "Cell-type-specific co-expression inference from single cell RNA-sequencing data." *Nature Communications* 14.1 (2023): 4846. (https://doi.org/10.1038/s41467-023-40503-7) 10 | 11 | ## Installation 12 | 13 | `CS-CORE` is available on GitHub. You can install it using 14 | 15 | ``` r 16 | ## Load devtools for installing R packages from Github 17 | library(devtools) 18 | ## Install CS-CORE from Github 19 | install_github("ChangSuBiostats/CS-CORE") 20 | ``` 21 | 22 | ## Vignettes 23 | 24 | 1. [Get started](https://changsubiostats.github.io/CS-CORE/articles/CSCORE.html) shows an example of using CS-CORE for cell-type-specific co-expression analysis with single cell RNA-sequencing data. 25 | It includes inferring co-expressions, extracting co-expressed gene modules and functional enrichment analysis. 26 | 27 | 2. [Covariate adjustment](https://changsubiostats.github.io/CS-CORE/articles/covariate_adjustment.html) shows how to adjust for 28 | covariates in co-expression inference with CS-CORE. 29 | 30 | 31 | ## Contact us 32 | 33 | For issues or feature requests, please visit GitHub Issues. If an issue remains unanswered for a while, 34 | you are welcome to email the maintainer at 35 | 36 | ## A Python version 37 | 38 | A Python implementation of CS-CORE is also provided [here](https://github.com/ChangSuBiostats/CS-CORE_python). 39 | -------------------------------------------------------------------------------- /tests/testthat/test-CSCORE_IRLS.R: -------------------------------------------------------------------------------- 1 | test_that("Correct dimension", { 2 | test_dat = readRDS("fixtures/mic_highly_expressed_500_AD_control") 3 | count = test_dat[[1]] 4 | seq_depth = test_dat[[2]] 5 | CSCORE_result = CSCORE_IRLS(X = count, seq_depth = seq_depth) 6 | rho = CSCORE_result$est 7 | pvalues = CSCORE_result$p_value 8 | test_stats = CSCORE_result$test_stat 9 | p = c(dim(count)[2], dim(count)[2]) 10 | expect_true(all(dim(rho) == p, 11 | dim(pvalues) == p, 12 | dim(test_stats) == p)) 13 | }) 14 | 15 | 16 | test_that("P-values are in the correct range", { 17 | test_dat = readRDS("fixtures/mic_highly_expressed_500_AD_control") 18 | count = test_dat[[1]] 19 | seq_depth = test_dat[[2]] 20 | CSCORE_result = CSCORE_IRLS(X = count, seq_depth = seq_depth) 21 | pvalues = CSCORE_result$p_value 22 | expect_true(mean(pvalues <= 1, na.rm = T) == 1) 23 | }) 24 | 25 | test_that("Co-expression estimates are bounded between -1 and 1 and the matrix is symmetric", { 26 | test_dat = readRDS("fixtures/mic_highly_expressed_500_AD_control") 27 | count = test_dat[[1]] 28 | seq_depth = test_dat[[2]] 29 | CSCORE_result = CSCORE_IRLS(X = count, seq_depth = seq_depth, post_process = TRUE) 30 | est = CSCORE_result$est 31 | expect_true(all(all(diag(est) == 1), 32 | abs(est) <= 1, 33 | all.equal(est[upper.tri(est)], t(est)[upper.tri(est)]))) 34 | }) 35 | 36 | test_that("The length of the sequencing depth must match the number of cells", { 37 | test_dat = readRDS("fixtures/mic_highly_expressed_500_AD_control") 38 | count = test_dat[[1]] 39 | seq_depth = c(test_dat[[2]], 0) 40 | expect_error(CSCORE_IRLS(X = count, seq_depth = seq_depth), 41 | "The length of the sequencing depth must match the number of cells.") 42 | }) 43 | 44 | test_that("The co-expression estimates and p values are computed correctly", { 45 | cscore_example <- CSCORE_IRLS(ind_gene_pair$counts, ind_gene_pair$seq_depths) 46 | expect_equal(cscore_example$est[1,2], 0.0078201236) 47 | expect_equal(cscore_example$p_value[1,2], 0.96198097) 48 | }) 49 | -------------------------------------------------------------------------------- /man/CSCORE_IRLS_base.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/CSCORE_IRLS_base.R 3 | \name{CSCORE_IRLS_base} 4 | \alias{CSCORE_IRLS_base} 5 | \title{Iteratively reweighted least squares (IRLS) procedure in CS-CORE (base R, archived)} 6 | \source{ 7 | Su, C., Xu, Z., Shan, X., Cai, B., Zhao, H., & Zhang, J. (2023). 8 | Cell-type-specific co-expression inference from single cell RNA-sequencing data. 9 | \emph{Nature Communications}. 10 | doi: \url{https://doi.org/10.1038/s41467-023-40503-7} 11 | } 12 | \usage{ 13 | CSCORE_IRLS_base(X, seq_depth, post_process = TRUE) 14 | } 15 | \arguments{ 16 | \item{X}{A n by p matrix of UMI counts, where n denotes the number of cells and p denotes the number of genes} 17 | 18 | \item{seq_depth}{A length n vector of sequencing depths} 19 | 20 | \item{post_process}{Whether to process the estimated co-expressions such that the estimates are between -1 and 1. Default to TRUE.} 21 | } 22 | \value{ 23 | A list of three p by p matrices: 24 | \describe{ 25 | \item{est}{co-expression estimates} 26 | \item{p_value}{p values} 27 | \item{test_stat}{test statistics} 28 | } 29 | } 30 | \description{ 31 | This function was originally implemented in 2023 and included in the first release of the CS-CORE R package. 32 | It has since been replaced by \code{\link{CSCORE_IRLS}} for two main reasons: 33 | } 34 | \details{ 35 | \enumerate{ 36 | \item It relies on base R for regression, which is slower and more memory-intensive than the Rcpp-based implementation in \code{CSCORE_IRLS}. 37 | \item It does not support covariate adjustment, a feature supported in \code{CSCORE_IRLS}. 38 | } 39 | } 40 | \note{ 41 | This function is retained for reference and backward compatibility, but users are encouraged to use \code{\link{CSCORE_IRLS}} for new analyses. 42 | } 43 | \examples{ 44 | ## Toy example: 45 | ## run CSCORE on a simulated independent gene pair 46 | cscore_example <- CSCORE_IRLS_base(ind_gene_pair$counts, ind_gene_pair$seq_depths) 47 | 48 | ## Estimated co-expression between two genes 49 | cscore_example$est[1,2] 50 | # close to 0: 0.007820124 51 | 52 | ## p-values 53 | cscore_example$p_value[1,2] 54 | # not significant: 0.961981 55 | 56 | } 57 | \keyword{internal} 58 | -------------------------------------------------------------------------------- /tests/testthat/test-covariates.R: -------------------------------------------------------------------------------- 1 | test_that("The covaiates' effect sizes are computed correctly", { 2 | set.seed(202212) 3 | n <- 6000 4 | sim_seq_depths <- exp(rnorm(n, 7.3, sd=0.6)) 5 | sim_seq_depths <- sim_seq_depths[!sim_seq_depths < 400] 6 | sim_seq_depths <- round(sim_seq_depths) 7 | 8 | n <- length(sim_seq_depths) 9 | mu <- 4.83 * 7.56e-05 # extracted from a specific gene 10 | sigma2 <- 4.83 * 7.56e-05^2 11 | # Simulate a covariate g 12 | g <- rbinom(n, 2, 0.4) 13 | effect_sizes <- matrix(c(0.05, 0.1, 0.2, 0.04), nrow = 2) 14 | rownames(effect_sizes) <- c('mu', 'sigma2') 15 | effect_sizes 16 | covar <- sigma2 * 0.1 17 | covar_effect_size <- 1 18 | 19 | z_mat <- matrix(nrow = n, ncol = 2) 20 | for(g_val in unique(g)){ 21 | g_val_inds <- which(g == g_val) 22 | covar_g <- covar + covar * covar_effect_size * g_val 23 | sigma2_val_vec <- numeric(2) 24 | for(j in 1:2){ 25 | sigma2_val_vec[j] <- sigma2 + sigma2 * effect_sizes['sigma2', j] * g_val 26 | } 27 | rho_g <- covar_g / sqrt(prod(sigma2_val_vec)) 28 | z_mat_gaussian <- MASS::mvrnorm(length(g_val_inds), rep(0,2), 29 | matrix(c(1, rho_g, rho_g, 1), nrow = 2)) 30 | 31 | for(j in 1:2){ 32 | mu_val <- mu + mu * effect_sizes['mu', j] * g_val 33 | sigma2_val <- sigma2 + sigma2 * effect_sizes['sigma2', j] * g_val 34 | z_mat[g_val_inds,j] <- qgamma(pnorm(z_mat_gaussian[,j]), shape = mu_val^2 / sigma2_val, scale = sigma2_val / mu_val) 35 | } 36 | } 37 | x_mat <- matrix(rpois(n = 2*n, lambda=z_mat * sim_seq_depths), nrow = n, ncol = 2) 38 | 39 | cscore_example <- CSCORE_IRLS_cpp(x_mat, sim_seq_depths, 40 | covariates = g, 41 | IRLS_par = list(n_iter = 10, eps = 0.05, verbose = FALSE, conv = 'max'), 42 | return_all = TRUE) 43 | expect_equal(cscore_example$est[1,2], 0.1068046, tolerance = 1e-6) 44 | expect_equal(cscore_example$p_value[1,2], 0.5056709, tolerance = 1e-6) 45 | expect_equal(cscore_example$mu_beta, 46 | matrix(c(3.654416e-04, 3.484695e-04, 1.090923e-05, 7.330102e-05), 2, 2, byrow=T), tolerance = 1e-6) 47 | expect_equal(cscore_example$sigma2_beta, 48 | matrix(c(2.483174e-08, 2.850686e-08, 1.477678e-10, -6.450020e-09), 2, 2, byrow=T), tolerance = 1e-6) 49 | expect_equal(cscore_example$cov_beta[,,1], 50 | matrix(c(1.861900e-07, 2.855003e-09, 2.254392e-09, 8.866591e-09), 2, 2, byrow=T), tolerance = 1e-6) 51 | }) 52 | -------------------------------------------------------------------------------- /man/CSCORE.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/CSCORE.R 3 | \name{CSCORE} 4 | \alias{CSCORE} 5 | \title{CS-CORE for cell-type-specific co-expression network inference} 6 | \usage{ 7 | CSCORE( 8 | object, 9 | genes, 10 | seq_depth = NULL, 11 | covariate_names = NULL, 12 | adjust_setting = c(mean = T, var = T, covar = T), 13 | IRLS_version = "Rcpp", 14 | IRLS_par = list(n_iter = 10, eps = 0.05, verbose = FALSE) 15 | ) 16 | } 17 | \arguments{ 18 | \item{object}{A Seurat object containing single-cell RNA-seq data. 19 | The object should be subsetted to cells of a single cell type to ensure cell-type-specific inference. 20 | CS-CORE requires raw UMI counts as input, and assumes that the raw count matrix is stored in the \code{"counts"} slot of the \code{"RNA"} assay 21 | (i.e., \code{object[["RNA"]]@counts}).} 22 | 23 | \item{genes}{A character vector of gene names (length \eqn{p}) for which the co-expression network will be estimated.} 24 | 25 | \item{seq_depth}{A numeric vector of sequencing depths (length \eqn{n}). 26 | If \code{NULL}, sequencing depth will be computed as the total UMI count per cell. 27 | Defaults to \code{NULL}.} 28 | 29 | \item{covariate_names}{Optional. A character vector specifying the names of cell-level covariates to adjust for in the regression models. 30 | These variables will be extracted from \code{object@meta.data[, covariate_names]}. Defaults to \code{NULL}.} 31 | 32 | \item{adjust_setting}{Optional. A named logical vector of length 3 indicating whether to adjust for covariates in the estimation of mean, variance, and covariance. 33 | Must be named \code{c("mean", "var", "covar")}. Defaults to \code{c(mean = TRUE, var = TRUE, covar = TRUE)}.} 34 | 35 | \item{IRLS_version}{Optional. A character string specifying the IRLS implementation to use: \code{"Rcpp"} or \code{"base_R"}. 36 | Only the \code{"Rcpp"} version supports covariate adjustment. The \code{"base_R"} version does not. 37 | When applicable, \code{"Rcpp"} offers improved memory efficiency (~10-100 times) but may be slower (~10 times), 38 | while \code{"base_R"} is faster but more memory intensive. 39 | Defaults to \code{"Rcpp"}.} 40 | 41 | \item{IRLS_par}{Optional. A named list of length 3 specifying parameters for the IRLS algorithm: 42 | \describe{ 43 | \item{\code{n_iter}}{Maximum number of iterations.} 44 | \item{\code{eps}}{Convergence threshold for log-ratio change \code{delta}, computed as \code{abs(log(theta / theta_prev))}.} 45 | \item{\code{verbose}}{Logical; whether to print the convergence metric (\code{delta}) at each iteration.} 46 | } 47 | Defaults to \code{list(n_iter = 10, eps = 0.05, verbose = FALSE)}.} 48 | } 49 | \value{ 50 | A list of three p by p matrices: 51 | \describe{ 52 | \item{est}{Matrix of co-expression estimates.} 53 | \item{p_value}{Matrix of p-values for testing co-expression.} 54 | \item{test_stat}{Matrix of test statistics for evaluating the significance of co-expression.} 55 | } 56 | } 57 | \description{ 58 | Run CS-CORE on a Seurat object to infer the cell-type-specific co-expression network for a specified set of genes, 59 | with optional adjustment for covariates. 60 | For more details on the covariate adjustment and the moment-based regression, 61 | please refer to \link{CSCORE_IRLS}. 62 | } 63 | \examples{ 64 | # See a full example at: 65 | # https://changsubiostats.github.io/CS-CORE/articles/CSCORE.html 66 | 67 | } 68 | \seealso{ 69 | \link{CSCORE_IRLS} 70 | 71 | \href{https://changsubiostats.github.io/CS-CORE/articles/CSCORE.html}{CS-CORE online tutorial} 72 | } 73 | -------------------------------------------------------------------------------- /src/WLS_cov.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | // [[Rcpp::depends(RcppArmadillo)]] 3 | 4 | using namespace Rcpp; 5 | 6 | /* 7 | * Compute WLS estimates of covariance for all p by p gene pairs 8 | * 9 | * D: Design matrix for WLS (size n x k) 10 | * X: Gene expression matrix (size n x p) 11 | * W: Weight matrix (size n x p) 12 | * 13 | * Returns: a list with two components 14 | * cov_hat: numeric array of size k x p x p, where (, j ,j') stores the WLS estimates for gene pair j, j' across k covariates 15 | * test_stat: numeric matrix of size p x p, where (j, j') stores the test statistics for gene j, j' (H_0: independence) 16 | */ 17 | // [[Rcpp::export]] 18 | Rcpp::List WLS_cov(arma::mat D, arma::mat X, arma::mat W) { 19 | const arma::uword n = D.n_rows; // number of rows of D (i.e., number of cells) 20 | const arma::uword k = D.n_cols; // number of columns of D (i.e., number of covariates) 21 | const arma::uword p = X.n_cols; // number of genes 22 | 23 | if (X.n_rows != n) { 24 | stop("Dimensions of design matrix and gene expression datado not match."); 25 | } 26 | 27 | arma::cube result(k, p, p, arma::fill::zeros); // k x p x p cube, WLS estimates 28 | //arma::cube inv_DTD_w_cube(k, k, p * p, arma::fill::zeros); // k x k x p x p cube, variance of WLS estimators under the null 29 | arma::mat ts_mat(p, p, arma::fill::zeros); // p x p matrix, test statistics 30 | 31 | arma::mat D_T = D.t(); // precompute transpose once 32 | 33 | for (arma::uword j = 0; j < p; ++j) { 34 | for (arma::uword jprime = j; jprime < p; ++jprime) { 35 | 36 | // Compute x_prod = X[, j] * X[, j'] 37 | arma::vec x_prod = X.col(j) % X.col(jprime); // element-wise product 38 | 39 | // Compute weights 40 | arma::vec w = W.col(j) % W.col(jprime); 41 | 42 | // Compute D^T * diag(w) * D 43 | arma::mat D_weighted = D; 44 | D_weighted.each_col() %= w; 45 | 46 | // Compute inverse of DTD_w 47 | arma::mat DTD_w = D_T * D_weighted; 48 | //arma::mat inv_DTD_w = arma::inv_sympd(DTD_w); // safer inverse 49 | arma::mat inv_DTD_w; 50 | //try { 51 | // inv_DTD_w = arma::inv_sympd(DTD_w); 52 | //} catch (...) { 53 | //Rcpp::Rcout << "j" << j << std::endl; 54 | //Rcpp::Rcout << "jprime" << jprime << std::endl; 55 | //Rcpp::Rcout << "DTD_w matrix:\n" << DTD_w << std::endl; 56 | // Rcpp::Rcout << "[WARNING] inv_sympd failed, using general inverse" << std::endl; 57 | // inv_DTD_w = arma::inv(DTD_w); // more stable but slower 58 | //} 59 | inv_DTD_w = arma::inv(DTD_w); // more stable but slower 60 | 61 | // Compute D^T * (w % x_prod) 62 | arma::vec Dt_w_x = D_T * (w % x_prod); 63 | 64 | // WLS estimator 65 | arma::vec res_vec = inv_DTD_w * Dt_w_x; 66 | 67 | // Save covariance estimates 68 | result.slice(jprime).col(j) = res_vec; 69 | 70 | // Save inverse matrix 71 | //int pair_idx = j * p + jprime; 72 | //inv_DTD_w_cube.slice(pair_idx) = inv_DTD_w; 73 | 74 | // Save test statistics 75 | ts_mat(j, jprime) = res_vec(0) / std::sqrt(inv_DTD_w(0, 0)); 76 | } 77 | } 78 | 79 | for (arma::uword j = 0; j < p; ++j) { 80 | for (arma::uword jprime = j + 1; jprime < p; ++jprime) { 81 | result.slice(j).col(jprime) = result.slice(jprime).col(j); 82 | ts_mat(jprime, j) = ts_mat(j, jprime); 83 | } 84 | } 85 | 86 | // Return both result and inverse cube 87 | return List::create(Named("cov_hat") = result, 88 | //Named("var_cov_hat") = inv_DTD_w_cube, 89 | Named("test_stat") = ts_mat); 90 | } 91 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include 5 | #include 6 | 7 | using namespace Rcpp; 8 | 9 | #ifdef RCPP_USE_GLOBAL_ROSTREAM 10 | Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); 11 | Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); 12 | #endif 13 | 14 | // CSCORE_IRLS_cpp_impl 15 | Rcpp::List CSCORE_IRLS_cpp_impl(const arma::mat& X, const arma::vec& seq_depth_sq, const arma::mat& D_mu, const arma::mat& D_sigma2, const arma::mat& D_sigma, const bool post_process, const int n_iter, const double eps, const bool verbose, const std::string& conv, const bool return_all); 16 | RcppExport SEXP _CSCORE_CSCORE_IRLS_cpp_impl(SEXP XSEXP, SEXP seq_depth_sqSEXP, SEXP D_muSEXP, SEXP D_sigma2SEXP, SEXP D_sigmaSEXP, SEXP post_processSEXP, SEXP n_iterSEXP, SEXP epsSEXP, SEXP verboseSEXP, SEXP convSEXP, SEXP return_allSEXP) { 17 | BEGIN_RCPP 18 | Rcpp::RObject rcpp_result_gen; 19 | Rcpp::RNGScope rcpp_rngScope_gen; 20 | Rcpp::traits::input_parameter< const arma::mat& >::type X(XSEXP); 21 | Rcpp::traits::input_parameter< const arma::vec& >::type seq_depth_sq(seq_depth_sqSEXP); 22 | Rcpp::traits::input_parameter< const arma::mat& >::type D_mu(D_muSEXP); 23 | Rcpp::traits::input_parameter< const arma::mat& >::type D_sigma2(D_sigma2SEXP); 24 | Rcpp::traits::input_parameter< const arma::mat& >::type D_sigma(D_sigmaSEXP); 25 | Rcpp::traits::input_parameter< const bool >::type post_process(post_processSEXP); 26 | Rcpp::traits::input_parameter< const int >::type n_iter(n_iterSEXP); 27 | Rcpp::traits::input_parameter< const double >::type eps(epsSEXP); 28 | Rcpp::traits::input_parameter< const bool >::type verbose(verboseSEXP); 29 | Rcpp::traits::input_parameter< const std::string& >::type conv(convSEXP); 30 | Rcpp::traits::input_parameter< const bool >::type return_all(return_allSEXP); 31 | rcpp_result_gen = Rcpp::wrap(CSCORE_IRLS_cpp_impl(X, seq_depth_sq, D_mu, D_sigma2, D_sigma, post_process, n_iter, eps, verbose, conv, return_all)); 32 | return rcpp_result_gen; 33 | END_RCPP 34 | } 35 | // WLS_cov 36 | Rcpp::List WLS_cov(arma::mat D, arma::mat X, arma::mat W); 37 | RcppExport SEXP _CSCORE_WLS_cov(SEXP DSEXP, SEXP XSEXP, SEXP WSEXP) { 38 | BEGIN_RCPP 39 | Rcpp::RObject rcpp_result_gen; 40 | Rcpp::RNGScope rcpp_rngScope_gen; 41 | Rcpp::traits::input_parameter< arma::mat >::type D(DSEXP); 42 | Rcpp::traits::input_parameter< arma::mat >::type X(XSEXP); 43 | Rcpp::traits::input_parameter< arma::mat >::type W(WSEXP); 44 | rcpp_result_gen = Rcpp::wrap(WLS_cov(D, X, W)); 45 | return rcpp_result_gen; 46 | END_RCPP 47 | } 48 | // WLS_mean 49 | arma::mat WLS_mean(arma::mat D, arma::mat X, arma::mat W); 50 | RcppExport SEXP _CSCORE_WLS_mean(SEXP DSEXP, SEXP XSEXP, SEXP WSEXP) { 51 | BEGIN_RCPP 52 | Rcpp::RObject rcpp_result_gen; 53 | Rcpp::RNGScope rcpp_rngScope_gen; 54 | Rcpp::traits::input_parameter< arma::mat >::type D(DSEXP); 55 | Rcpp::traits::input_parameter< arma::mat >::type X(XSEXP); 56 | Rcpp::traits::input_parameter< arma::mat >::type W(WSEXP); 57 | rcpp_result_gen = Rcpp::wrap(WLS_mean(D, X, W)); 58 | return rcpp_result_gen; 59 | END_RCPP 60 | } 61 | 62 | static const R_CallMethodDef CallEntries[] = { 63 | {"_CSCORE_CSCORE_IRLS_cpp_impl", (DL_FUNC) &_CSCORE_CSCORE_IRLS_cpp_impl, 11}, 64 | {"_CSCORE_WLS_cov", (DL_FUNC) &_CSCORE_WLS_cov, 3}, 65 | {"_CSCORE_WLS_mean", (DL_FUNC) &_CSCORE_WLS_mean, 3}, 66 | {NULL, NULL, 0} 67 | }; 68 | 69 | RcppExport void R_init_CSCORE(DllInfo *dll) { 70 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 71 | R_useDynamicSymbols(dll, FALSE); 72 | } 73 | -------------------------------------------------------------------------------- /R/CSCORE_IRLS_base.R: -------------------------------------------------------------------------------- 1 | #' Iteratively reweighted least squares (IRLS) procedure in CS-CORE (base R, archived) 2 | #' 3 | #' This function was originally implemented in 2023 and included in the first release of the CS-CORE R package. 4 | #' It has since been replaced by \code{\link{CSCORE_IRLS}} for two main reasons: 5 | #' 6 | #' \enumerate{ 7 | #' \item It relies on base R for regression, which is slower and more memory-intensive than the Rcpp-based implementation in \code{CSCORE_IRLS}. 8 | #' \item It does not support covariate adjustment, a feature supported in \code{CSCORE_IRLS}. 9 | #' } 10 | #' 11 | #' @note This function is retained for reference and backward compatibility, but users are encouraged to use \code{\link{CSCORE_IRLS}} for new analyses. 12 | #' 13 | #' @keywords internal 14 | #' 15 | #' @param X A n by p matrix of UMI counts, where n denotes the number of cells and p denotes the number of genes 16 | #' @param seq_depth A length n vector of sequencing depths 17 | #' @param post_process Whether to process the estimated co-expressions such that the estimates are between -1 and 1. Default to TRUE. 18 | #' 19 | #' @return A list of three p by p matrices: 20 | #' \describe{ 21 | #' \item{est}{co-expression estimates} 22 | #' \item{p_value}{p values} 23 | #' \item{test_stat}{test statistics} 24 | #' } 25 | #' @export 26 | #' 27 | #' @examples 28 | #' ## Toy example: 29 | #' ## run CSCORE on a simulated independent gene pair 30 | #' cscore_example <- CSCORE_IRLS_base(ind_gene_pair$counts, ind_gene_pair$seq_depths) 31 | #' 32 | #' ## Estimated co-expression between two genes 33 | #' cscore_example$est[1,2] 34 | #' # close to 0: 0.007820124 35 | #' 36 | #' ## p-values 37 | #' cscore_example$p_value[1,2] 38 | #' # not significant: 0.961981 39 | #' 40 | #' @source Su, C., Xu, Z., Shan, X., Cai, B., Zhao, H., & Zhang, J. (2023). 41 | #' Cell-type-specific co-expression inference from single cell RNA-sequencing data. 42 | #' \emph{Nature Communications}. 43 | #' doi: 44 | #' 45 | CSCORE_IRLS_base <- function(X, seq_depth, post_process = TRUE){ 46 | if (is.null(seq_depth)) { 47 | seq_depth = apply(X, 1, sum, na.rm = T) 48 | } 49 | if(nrow(X) != length(seq_depth)){ 50 | stop('The length of the sequencing depth must match the number of cells.') 51 | } 52 | n_cell = nrow(X) 53 | n_gene = ncol(X) 54 | seq_depth_sq = seq_depth^2 55 | seq_2 = sum(seq_depth_sq) 56 | seq_4 = sum(seq_depth^4) 57 | mu = colSums(X * seq_depth)/seq_2 58 | M = outer(seq_depth, mu) 59 | X_centered = X - M 60 | sigma2 = colSums(((X_centered^2 - M) * seq_depth_sq))/seq_4 61 | theta = mu^2/sigma2 62 | j = 0 63 | delta = Inf 64 | 65 | while( delta > 0.05 & j <= 10 ){ 66 | theta_previous = theta 67 | theta_median = stats::quantile(theta[theta > 0], na.rm = T, probs = 0.5) 68 | theta[theta < 0] = Inf 69 | w = M + outer(seq_depth_sq, mu^2/theta_median) 70 | w[is.na(w)|w <= 0] = 1 71 | mu = colSums((X/w) * seq_depth)/colSums(seq_depth_sq/w) 72 | M = outer(seq_depth, mu) 73 | X_centered = X - M 74 | h = (M^2/theta_median + M)^2 75 | h[h <= 0] = 1 76 | sigma2 = colSums(((X_centered^2 - M)/h * seq_depth_sq))/colSums(seq_depth_sq^2/h) 77 | theta = mu^2/sigma2 78 | j = j+1 79 | # print(paste0("Iteration: ", j, ", Median: ", theta_median)) 80 | delta = max(abs(log((theta/theta_previous)[theta > 0 & theta_previous > 0])), na.rm = T) 81 | # print(delta) 82 | } 83 | if(j == 10 & delta > 0.05){ 84 | print('IRLS failed to converge after 10 iterations. Please check your data.') 85 | }else{ 86 | print(sprintf('IRLS converged after %i iterations.', j)) 87 | } 88 | 89 | theta_median = stats::quantile(theta[theta > 0], na.rm = T, probs = 0.5) 90 | theta[theta < 0] = Inf 91 | w = M + outer(seq_depth_sq, mu^2/theta_median) 92 | w[is.na(w)|w <= 0] = 1 93 | 94 | covar = matrix(NA, nrow = n_gene, ncol = n_gene) 95 | covar <- (t(seq_depth_sq * X_centered/w) %*% (X_centered/w))/(t(seq_depth_sq/w) %*% (seq_depth_sq/w)) 96 | 97 | # Evaluate test statistics and p values 98 | Sigma <- M + outer(seq_depth_sq, sigma2) 99 | ele_inv_Sigma <- 1/Sigma 100 | X_centered_scaled <- X_centered * ele_inv_Sigma 101 | num <- t(seq_depth_sq * X_centered_scaled) %*% X_centered_scaled 102 | deno <- sqrt(t(seq_depth^4 * ele_inv_Sigma) %*% ele_inv_Sigma) 103 | test_stat <- num/deno 104 | p_value <- 2 * stats::pnorm(abs(test_stat), lower.tail = F) 105 | 106 | # Evaluate co-expression estimates 107 | neg_gene_inds <- which(sigma2 < 0) 108 | sigma2[neg_gene_inds] <- 0 109 | sigma <- sqrt(sigma2) 110 | est <- covar/outer(sigma, sigma) 111 | 112 | # Post-process the co-expression estimates 113 | if(post_process) est <- post_process_est(est) 114 | return(list(est = est, p_value = p_value, test_stat = test_stat)) 115 | } 116 | -------------------------------------------------------------------------------- /man/CSCORE_IRLS.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/CSCORE_IRLS.R 3 | \name{CSCORE_IRLS} 4 | \alias{CSCORE_IRLS} 5 | \title{Iteratively reweighted least squares (IRLS) procedure in CS-CORE} 6 | \source{ 7 | Su, C., Xu, Z., Shan, X., Cai, B., Zhao, H., & Zhang, J. (2023). 8 | Cell-type-specific co-expression inference from single cell RNA-sequencing data. 9 | \emph{Nature Communications}. 10 | doi: \url{https://doi.org/10.1038/s41467-023-40503-7} 11 | } 12 | \usage{ 13 | CSCORE_IRLS( 14 | X, 15 | seq_depth, 16 | covariates = NULL, 17 | post_process = TRUE, 18 | covariate_level = "z", 19 | adjust_setting = c(mean = T, var = T, covar = T), 20 | return_all = FALSE 21 | ) 22 | } 23 | \arguments{ 24 | \item{X}{A numeric matrix of UMI counts (\code{n x p}), 25 | where \code{n} is the number of cells and \code{p} is the number of genes.} 26 | 27 | \item{seq_depth}{A numeric vector of sequencing depths of length \code{n}.} 28 | 29 | \item{covariates}{Optional. A numeric matrix of covariates (\code{n x K}) to be adjusted for in the moment-based regressions. 30 | Can be a length n vector if \eqn{K=1}. 31 | If \code{NULL}, no covariates are adjusted for in the regression. 32 | Defaults to \code{NULL}.} 33 | 34 | \item{post_process}{Optional. Logical; whether to rescale the estimated co-expressions to lie between –1 and 1. 35 | Defaults to \code{TRUE}.} 36 | 37 | \item{covariate_level}{Optional. A character string indicating whether covariates are assumed to affect 38 | the underlying gene expression levels (\code{"z"}) or the observed counts (\code{"x"}). 39 | See the \emph{Details} section for further explanation. 40 | Defaults to \code{"z"}.} 41 | 42 | \item{adjust_setting}{Optional. A named logical vector of length 3; 43 | whether to adjust for covariates at the mean, variance, and covariance level. 44 | Must be named \code{c("mean", "var", "covar")}. 45 | Defaults to \code{c(mean = T,var = T, covar = T)}.} 46 | 47 | \item{return_all}{Logical; whether to return all estimates, including the effect sizes for covariates. 48 | Defaults to \code{FALSE}.} 49 | } 50 | \value{ 51 | A list containing the following components: 52 | \describe{ 53 | \item{est}{A \eqn{p \times p} matrix of co-expression estimates.} 54 | \item{p_value}{A \eqn{p \times p} matrix of p-values.} 55 | \item{test_stat}{A \eqn{p \times p} matrix of test statistics evaluating the statistical significance of co-expression.} 56 | \item{mu_beta}{A \eqn{k \times p} matrix of regression coefficients from the mean model. Returned if \code{return_all = TRUE}.} 57 | \item{sigma2_beta}{A \eqn{k \times p} matrix of regression coefficients from the variance model. Returned if \code{return_all = TRUE}.} 58 | \item{cov_beta}{A \eqn{k \times p \times p} array of regression coefficients from the covariance model. Returned if \code{return_all = TRUE}.} 59 | } 60 | } 61 | \description{ 62 | This function implements the IRLS procedure used in CS-CORE for estimating and testing 63 | cell-type-specific co-expression from single-cell RNA sequencing data. 64 | } 65 | \details{ 66 | Let \eqn{x_{ij}} denote the UMI count of gene \eqn{j} in cell \eqn{i}; 67 | \eqn{s_i} denote the sequencing depth; 68 | \eqn{\mu_j,\sigma_{jj}, \sigma_{jj'}} denote the mean, variance and covariance; 69 | \eqn{c_{ik}} denote additional covariate \eqn{k} for cell \eqn{i} (e.g. disease status or cellular state). 70 | The procedure consists of two main steps: 71 | \enumerate{ 72 | \item \strong{Mean and variance estimation:} Estimate gene-specific mean and variance parameters 73 | using two moment-based regressions: 74 | \itemize{ 75 | \item \strong{Mean model:} \eqn{x_{ij} = s_i (\mu_j + \sum_k c_{ik} \beta_k) + \epsilon_{ij}} 76 | \item \strong{Variance model:} \eqn{(x_{ij} - s_i \mu_{ij})^2 = s_i \mu_{ij} + s_i^2 (\sigma_{jj} + \sum_k c_{ik} \gamma_k) + \eta_{ij}}, 77 | where \eqn{\mu_{ij} = \mu_j + \sum_k c_{ik} \beta_k} 78 | } 79 | 80 | \item \strong{Covariance estimation and hypothesis testing:} Estimate gene-gene covariance 81 | and compute test statistics to assess the statistical significance of gene co-expression using a third moment-based regression: 82 | \itemize{ 83 | \item \strong{Covariance model:} \eqn{(x_{ij} - s_i \mu_{ij})(x_{ij'} - s_i \mu_{ij'}) = s_i^2 (\sigma_{jj'} + \sum_k c_{ik} \theta_k) + \xi_{ijj'}} 84 | } 85 | } 86 | We note that 87 | \enumerate{ 88 | \item The formulation above assumes that the covariates alter the mean / variance / covariance of underlying gene expression, 89 | rather than observed counts. If you believe that the covariates directly affect the observed counts independent of underlying gene expression 90 | (e.g. \eqn{x_{ij}|z_{ij} \sim \text{Poisson}(s_i z_{ij}+\sum_k c_{ik} \beta_k)} or \eqn{x_{ij} = s_i \mu_j + \sum_k c_{ik} \beta_k + \epsilon_{ij}}), 91 | please specify \code{covariate_level="x"}. 92 | \item The original CS-CORE published in \url{https://doi.org/10.1038/s41467-023-40503-7} 93 | did not consider adjusting for covariates \eqn{c_{ik}}'s. This is equivalent to setting \code{covariates} to \code{NULL}. 94 | } 95 | } 96 | \keyword{internal} 97 | -------------------------------------------------------------------------------- /R/CSCORE.R: -------------------------------------------------------------------------------- 1 | #' CS-CORE for cell-type-specific co-expression network inference 2 | #' 3 | #' Run CS-CORE on a Seurat object to infer the cell-type-specific co-expression network for a specified set of genes, 4 | #' with optional adjustment for covariates. 5 | #' For more details on the covariate adjustment and the moment-based regression, 6 | #' please refer to \link{CSCORE_IRLS}. 7 | #' @seealso \link{CSCORE_IRLS} 8 | #' @seealso \href{https://changsubiostats.github.io/CS-CORE/articles/CSCORE.html}{CS-CORE online tutorial} 9 | #' 10 | #' @param object A Seurat object containing single-cell RNA-seq data. 11 | #' The object should be subsetted to cells of a single cell type to ensure cell-type-specific inference. 12 | #' CS-CORE requires raw UMI counts as input, and assumes that the raw count matrix is stored in the \code{"counts"} slot of the \code{"RNA"} assay 13 | #' (i.e., \code{object[["RNA"]]@counts}). 14 | #' @param genes A character vector of gene names (length \eqn{p}) for which the co-expression network will be estimated. 15 | #' @param seq_depth A numeric vector of sequencing depths (length \eqn{n}). 16 | #' If \code{NULL}, sequencing depth will be computed as the total UMI count per cell. 17 | #' Defaults to \code{NULL}. 18 | #' @param covariate_names Optional. A character vector specifying the names of cell-level covariates to adjust for in the regression models. 19 | #' These variables will be extracted from \code{object@meta.data[, covariate_names]}. Defaults to \code{NULL}. 20 | #' @param adjust_setting Optional. A named logical vector of length 3 indicating whether to adjust for covariates in the estimation of mean, variance, and covariance. 21 | #' Must be named \code{c("mean", "var", "covar")}. Defaults to \code{c(mean = TRUE, var = TRUE, covar = TRUE)}. 22 | #' @param IRLS_version Optional. A character string specifying the IRLS implementation to use: \code{"Rcpp"} or \code{"base_R"}. 23 | #' Only the \code{"Rcpp"} version supports covariate adjustment. The \code{"base_R"} version does not. 24 | #' When applicable, \code{"Rcpp"} offers improved memory efficiency (~10-100 times) but may be slower (~10 times), 25 | #' while \code{"base_R"} is faster but more memory intensive. 26 | #' Defaults to \code{"Rcpp"}. 27 | #' @param IRLS_par Optional. A named list of length 3 specifying parameters for the IRLS algorithm: 28 | #' \describe{ 29 | #' \item{\code{n_iter}}{Maximum number of iterations.} 30 | #' \item{\code{eps}}{Convergence threshold for log-ratio change \code{delta}, computed as \code{abs(log(theta / theta_prev))}.} 31 | #' \item{\code{verbose}}{Logical; whether to print the convergence metric (\code{delta}) at each iteration.} 32 | #' } 33 | #' Defaults to \code{list(n_iter = 10, eps = 0.05, verbose = FALSE)}. 34 | #' 35 | #' @return A list of three p by p matrices: 36 | #' \describe{ 37 | #' \item{est}{Matrix of co-expression estimates.} 38 | #' \item{p_value}{Matrix of p-values for testing co-expression.} 39 | #' \item{test_stat}{Matrix of test statistics for evaluating the significance of co-expression.} 40 | #' } 41 | #' @export 42 | #' 43 | #' @examples 44 | #' # See a full example at: 45 | #' # https://changsubiostats.github.io/CS-CORE/articles/CSCORE.html 46 | #' 47 | CSCORE <- function(object, genes, seq_depth = NULL, 48 | covariate_names = NULL, 49 | adjust_setting = c('mean' = T, 'var' = T, 'covar' = T), 50 | IRLS_version = 'Rcpp', 51 | IRLS_par = list(n_iter = 10, eps = 0.05, verbose = FALSE)){ 52 | # Extract the UMI count matrix from the single cell object 53 | # with RNA as the default assay 54 | # such that slot `counts` corresponds to UMI counts 55 | count_matrix <- t(as.matrix(GetAssayData(object = object, assay = "RNA",slot='counts'))) 56 | # Extract / calculate the sequencing depths 57 | if(is.null(seq_depth)){ 58 | if('nCount_RNA' %in% colnames(object@meta.data)){ 59 | seq_depth <- object$nCount_RNA 60 | }else{ 61 | seq_depth <- rowSums(count_matrix) 62 | } 63 | }else{ 64 | if(length(seq_depth) != nrow(count_matrix)) stop("The length of the sequencing depth must match the number of cells.") 65 | } 66 | # Run CS-CORE 67 | if(is.null(covariate_names)){ 68 | if(IRLS_version == 'base_R'){ 69 | CSCORE_result <- CSCORE_IRLS_base(count_matrix[,genes], seq_depth) 70 | }else{ 71 | IRLS_par[['conv']] <- 'max' 72 | CSCORE_result <- CSCORE_IRLS_cpp(count_matrix[,genes], seq_depth, 73 | IRLS_par = IRLS_par) 74 | } 75 | }else{ 76 | if(any(!covariate_names %in% colnames(object@meta.data))){ 77 | stop("[ERROR] Not all covariates are included in the Seurat object. Please check `colnames(object@meta.data)`") 78 | } 79 | covariates <- object@meta.data[, covariate_names] 80 | cat("[INFO] Adjust for covariates:", paste(colnames(covariates), collapse = ", "), "\n") 81 | covariate_matrix <- stats::model.matrix(~ ., data = covariates)[,-1] 82 | covariate_matrix <- scale(covariate_matrix, center = T, scale = T) 83 | cat("[INFO] Variables in the design matrix:", paste(colnames(covariate_matrix), collapse = ", "), "\n") 84 | 85 | IRLS_par[['conv']] <- 'q95' 86 | CSCORE_result <- CSCORE_IRLS_cpp(count_matrix[,genes], seq_depth, 87 | covariates = covariate_matrix, 88 | adjust_setting = adjust_setting, 89 | IRLS_par = IRLS_par) 90 | } 91 | return(CSCORE_result) 92 | } 93 | -------------------------------------------------------------------------------- /man/CSCORE_IRLS_cpp.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/CSCORE_IRLS_cpp.R 3 | \name{CSCORE_IRLS_cpp} 4 | \alias{CSCORE_IRLS_cpp} 5 | \title{Iteratively reweighted least squares (IRLS) procedure in CS-CORE (Rcpp)} 6 | \source{ 7 | Su, C., Xu, Z., Shan, X., Cai, B., Zhao, H., & Zhang, J. (2023). 8 | Cell-type-specific co-expression inference from single cell RNA-sequencing data. 9 | \emph{Nature Communications}. 10 | doi: \url{https://doi.org/10.1038/s41467-023-40503-7} 11 | } 12 | \usage{ 13 | CSCORE_IRLS_cpp( 14 | X, 15 | seq_depth, 16 | covariates = NULL, 17 | post_process = TRUE, 18 | covariate_level = "z", 19 | adjust_setting = c(mean = T, var = T, covar = T), 20 | IRLS_par = list(n_iter = 10, eps = 0.05, verbose = FALSE, conv = "q95"), 21 | return_all = FALSE 22 | ) 23 | } 24 | \arguments{ 25 | \item{X}{A numeric matrix of UMI counts (\code{n x p}), 26 | where \code{n} is the number of cells and \code{p} is the number of genes.} 27 | 28 | \item{seq_depth}{A numeric vector of sequencing depths of length \code{n}.} 29 | 30 | \item{covariates}{Optional. A numeric matrix of covariates (\code{n x K}) to be adjusted for in the moment-based regressions. 31 | Can be a length n vector if \eqn{K=1}. 32 | If \code{NULL}, no covariates are adjusted for in the regression. 33 | Defaults to \code{NULL}.} 34 | 35 | \item{post_process}{Optional. Logical; whether to rescale the estimated co-expressions to lie between –1 and 1. 36 | Defaults to \code{TRUE}.} 37 | 38 | \item{covariate_level}{Optional. A character string indicating whether covariates are assumed to affect 39 | the underlying gene expression levels (\code{"z"}) or the observed counts (\code{"x"}). 40 | See the \emph{Details} section for further explanation. 41 | Defaults to \code{"z"}.} 42 | 43 | \item{adjust_setting}{Optional. A named logical vector of length 3; 44 | whether to adjust for covariates at the mean, variance, and covariance level. 45 | Must be named \code{c("mean", "var", "covar")}. 46 | Defaults to \code{c(mean = T,var = T, covar = T)}.} 47 | 48 | \item{IRLS_par}{Optional. A named list of length 4 specifying parameters for the IRLS algorithm: 49 | \describe{ 50 | \item{\code{n_iter}}{Maximum number of iterations.} 51 | \item{\code{eps}}{Convergence threshold for log-ratio change \code{delta}, computed as \code{abs(log(theta / theta_prev))}.} 52 | \item{\code{verbose}}{Logical; whether to print the convergence metric (\code{delta}) at each iteration.} 53 | \item{\code{conv}}{Character string; determine convergence based on \code{q95} (0.95 quantile) or \code{max} of \code{delta}.} 54 | } 55 | Defaults to \code{list(n_iter = 10, eps = 0.05, verbose = FALSE, cov = "q95")}.} 56 | 57 | \item{return_all}{Logical; whether to return all estimates, including the effect sizes for covariates. 58 | Defaults to \code{FALSE}.} 59 | } 60 | \value{ 61 | A list containing the following components: 62 | \describe{ 63 | \item{est}{A \eqn{p \times p} matrix of co-expression estimates.} 64 | \item{p_value}{A \eqn{p \times p} matrix of p-values.} 65 | \item{test_stat}{A \eqn{p \times p} matrix of test statistics evaluating the statistical significance of co-expression.} 66 | \item{mu_beta}{A \eqn{k \times p} matrix of regression coefficients from the mean model. Returned if \code{return_all = TRUE}.} 67 | \item{sigma2_beta}{A \eqn{k \times p} matrix of regression coefficients from the variance model. Returned if \code{return_all = TRUE}.} 68 | \item{cov_beta}{A \eqn{k \times p \times p} array of regression coefficients from the covariance model. Returned if \code{return_all = TRUE}.} 69 | } 70 | } 71 | \description{ 72 | Estimate and test cell-type-specific co-expression using the IRLS procedure with optional covariate adjustment. 73 | } 74 | \details{ 75 | Let \eqn{x_{ij}} denote the UMI count of gene \eqn{j} in cell \eqn{i}; 76 | \eqn{s_i} denote the sequencing depth; 77 | \eqn{\mu_j,\sigma_{jj}, \sigma_{jj'}} denote the mean, variance and covariance; 78 | \eqn{c_{ik}} denote additional covariate \eqn{k} for cell \eqn{i} (e.g. disease status or cellular state). 79 | The procedure consists of two main steps: 80 | \enumerate{ 81 | \item \strong{Mean and variance estimation:} Estimate gene-specific mean and variance parameters 82 | using two moment-based regressions: 83 | \itemize{ 84 | \item \strong{Mean model:} \eqn{x_{ij} = s_i (\mu_j + \sum_k c_{ik} \beta_k) + \epsilon_{ij}} 85 | \item \strong{Variance model:} \eqn{(x_{ij} - s_i \mu_{ij})^2 = s_i \mu_{ij} + s_i^2 (\sigma_{jj} + \sum_k c_{ik} \gamma_k) + \eta_{ij}}, 86 | where \eqn{\mu_{ij} = \mu_j + \sum_k c_{ik} \beta_k} 87 | } 88 | 89 | \item \strong{Covariance estimation and hypothesis testing:} Estimate gene-gene covariance 90 | and compute test statistics to assess the statistical significance of gene co-expression using a third moment-based regression: 91 | \itemize{ 92 | \item \strong{Covariance model:} \eqn{(x_{ij} - s_i \mu_{ij})(x_{ij'} - s_i \mu_{ij'}) = s_i^2 (\sigma_{jj'} + \sum_k c_{ik} \theta_k) + \xi_{ijj'}} 93 | } 94 | } 95 | We note that 96 | \enumerate{ 97 | \item The formulation above assumes that the covariates alter the mean / variance / covariance of underlying gene expression, 98 | rather than observed counts. If you believe that the covariates directly affect the observed counts independent of underlying gene expression 99 | (e.g. \eqn{x_{ij}|z_{ij} \sim \text{Poisson}(s_i z_{ij}+\sum_k c_{ik} \beta_k)} or \eqn{x_{ij} = s_i \mu_j + \sum_k c_{ik} \beta_k + \epsilon_{ij}}), 100 | please specify \code{covariate_level="x"}. 101 | \item The original CS-CORE published in \url{https://doi.org/10.1038/s41467-023-40503-7} 102 | did not consider adjusting for covariates \eqn{c_{ik}}'s. This is equivalent to setting \code{covariates} to \code{NULL}. 103 | } 104 | Note: This is an R wrapper for the \code{CSCORE_IRLS_cpp_impl()} function implemented in Rcpp. 105 | } 106 | \examples{ 107 | ## Toy example: 108 | ## run CSCORE on a simulated independent gene pair 109 | cscore_example <- CSCORE_IRLS_cpp(ind_gene_pair$counts, ind_gene_pair$seq_depths) 110 | 111 | ## Estimated co-expression between two genes 112 | cscore_example$est[1,2] 113 | # close to 0: 0.007820124 114 | 115 | ## p-values 116 | cscore_example$p_value[1,2] 117 | # not significant: 0.961981 118 | 119 | } 120 | -------------------------------------------------------------------------------- /vignettes/covariate_adjustment.Rmd.orig: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Covariate adjustment" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{covariate_adjustment} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | --- 9 | 10 | ```{r, include = FALSE} 11 | knitr::opts_chunk$set( 12 | collapse = TRUE, 13 | comment = "#>" 14 | ) 15 | ``` 16 | 17 | # Motivation 18 | The original implementation of CS-CORE does not consider covariate adjustment. For example, in the [paper](https://www.nature.com/articles/s41467-023-40503-7), we assumed 19 | 20 | \[ 21 | {\bf z_{i}}\sim F_p({\bf \mu}, \Sigma), x_{ij}|z_{ij} \sim \text{Poisson}(s_i z_{ij}) 22 | \] 23 | 24 | such that the mean ${\bf \mu}$ and the variance-covariance matrix $\Sigma$ of underlying gene expression are exactly the same for all cells in the population. However, in real data, it is possible that the mean, variance, and co-expression of underlying gene expression could be affected by technical covariates (such as percent.mt) or biological covariates (such as sex) that vary across cell. 25 | 26 | # New moment-based regressions for covariate adjustment 27 | 28 | In June 2025, we added a new feature to CS-CORE that allows adjusting for covariates in co-expression inference. In particular, we adjust for covariates through modelling ${\bf mu}$ and $\Sigma$ as a function of covariates $c_{ik}$'s: 29 | $$ 30 | {\bf \mu}_{i}={\bf \mu}+\sum_k c_{ik} {\bf \beta_k}, \ 31 | \Sigma_{i}=\Sigma +\sum_k c_{ik} {\bf \gamma_k} 32 | $$ 33 | This leads to the following moment-based regression: 34 | 35 | $$ 36 | x_{ij} = s_i (\mu_j + \sum_k c_{ik} \beta_k) + \epsilon_{ij} 37 | $$ 38 | 39 | $$(x_{ij} - s_i \mu_{ij})^2 = s_i \mu_{ij} + s_i^2 (\sigma_{jj} + \sum_k c_{ik} \gamma_k) + \eta_{ij}$$ 40 | $$(x_{ij} - s_i \mu_{ij})(x_{ij'} - s_i \mu_{ij'}) = s_i^2 (\sigma_{jj'} + \sum_k c_{ik} \theta_k) + \xi_{ijj'},$$ 41 | 42 | where $\mu_{ij} = \mu_j + \sum_k c_{ik} \beta_k$. These allow adjusting for $K$ covariates $c_{ik}$'s for their effects on the underlying gene expression mean ($\beta_k$'s), variance ($\gamma_k$'s) and covariance ($\theta_k$'s). 43 | 44 | # Demonstration on real data 45 | 46 | Here, we demonstrates how to adjust for covariates in CS-CORE with the same dataset as in [Getting started](CSCORE.html). 47 | 48 | ```{r message=F} 49 | library(CSCORE) 50 | library(Seurat) 51 | ``` 52 | 53 | ```{r message=F, warning=F} 54 | # wget https://hosted-matrices-prod.s3-us-west-2.amazonaws.com/Single_cell_atlas_of_peripheral_immune_response_to_SARS_CoV_2_infection-25/blish_covid.seu.rds 55 | 56 | pbmc <- readRDS('/Users/csu30/Library/CloudStorage/OneDrive-Emory/Projects/CS-CORE/Rpackage/blish_covid.seu.rds') 57 | pbmc <- UpdateSeuratObject(pbmc) # update the obsolete Seurat object 58 | pbmc_B = pbmc[,pbmc$cell.type.coarse %in% 'B'] 59 | mean_exp = rowMeans(pbmc_B@assays$RNA@counts/pbmc_B$nCount_RNA) 60 | genes_selected = names(sort.int(mean_exp, decreasing = T))[1:200] 61 | pbmc_B_healthy <- pbmc_B[, pbmc_B$Status == "Healthy"] 62 | ``` 63 | 64 | This dataset comes with detailed cell-level covariates: 65 | ```{r} 66 | colnames(pbmc_B_healthy@meta.data) 67 | ``` 68 | 69 | As an example, we choose to adjust for `Sex` and `percent.mt`. To run CS-CORE with covariate adjustment, use 70 | 71 | ```{r} 72 | CSCORE_result_adj <- CSCORE(pbmc_B_healthy, 73 | genes = genes_selected, 74 | covariate_names = c('percent.mt', 'Sex')) 75 | ``` 76 | 77 | By default, `CSCORE` extracts the covariates from the Seurat object and construct a design matrix with scaled and centered covariates. To understand the detailed impact of covariate adjustment, we also compare with the results without covariate adjustment. 78 | 79 | ```{r} 80 | CSCORE_result <- CSCORE(pbmc_B_healthy, genes = genes_selected) 81 | ``` 82 | 83 | ```{r figrho} 84 | # compare co-expression estimates for a random set of gene pairs 85 | set.seed(42002) 86 | p <- length(genes_selected) 87 | random_pairs <- sample(which(upper.tri(matrix(1:p^2, p, p))), 1000) 88 | plot(CSCORE_result$est[random_pairs], 89 | CSCORE_result_adj$est[random_pairs], 90 | xlab = 'CS-CORE', ylab = 'CS-CORE adjusted', 91 | main = 'Co-expression estimates') 92 | abline(0,1,col='red') 93 | ``` 94 | 95 | ```{r echo=F, eval=F} 96 | png('vignettes/figure/figrho-1.png', width = 480, height = 480) 97 | plot(CSCORE_result$est[random_pairs], 98 | CSCORE_result_adj$est[random_pairs], 99 | xlab = 'CS-CORE', ylab = 'CS-CORE adjusted', 100 | main = 'Co-expression estimates') 101 | abline(0,1,col='red') 102 | dev.off() 103 | ``` 104 | 105 | ```{r figt} 106 | plot(CSCORE_result$test_stat[random_pairs], 107 | CSCORE_result_adj$test_stat[random_pairs], 108 | xlab = 'CS-CORE', ylab = 'CS-CORE adjusted', 109 | main = 'Test_stat') 110 | abline(0,1,col='red') 111 | ``` 112 | 113 | ```{r echo=F, eval=F} 114 | png('vignettes/figure/figt-1.png', width = 480, height = 480) 115 | plot(CSCORE_result$test_stat[random_pairs], 116 | CSCORE_result_adj$test_stat[random_pairs], 117 | xlab = 'CS-CORE', ylab = 'CS-CORE adjusted', 118 | main = 'Test_stat') 119 | abline(0,1,col='red') 120 | dev.off() 121 | ``` 122 | 123 | It seems that the co-expression for most gene pairs are similar with and without covariate adjustment. We recommend users to sanity check this and examine the impact of covariate adjustment on co-expression inference. 124 | 125 | 126 | # Advanced topics 127 | 128 | The application above adjusts for covariates in the underlying expression levels' mean, variance, and covariance. For users who wish to have more fine-grained control on the regression models, we provide two additional parameters: `adjust_setting` and `covariate_level`. 129 | 130 | ## `adjust_setting` 131 | 132 | We provide these two options in [CSCORE_IRLS_cpp](CSCORE_IRLS_cpp.html), which is the function underlying `CSCORE`. `adjust_setting` allows you to choose which regression model to adjust covariates for. For example, if `adjust_setting=c(mean = T, var = F, covar = T)`, this is equivalent to running the following regressions: 133 | 134 | $$ 135 | x_{ij} = s_i (\mu_j + \sum_k c_{ik} \beta_k) + \epsilon_{ij} 136 | $$ 137 | 138 | $$ 139 | (x_{ij} - s_i \mu_{ij})^2 = s_i \mu_{ij} + s_i^2 \sigma_{jj} + \eta_{ij} 140 | $$ 141 | 142 | $$ 143 | (x_{ij} - s_i \mu_{ij})(x_{ij'} - s_i \mu_{ij'}) = s_i^2 (\sigma_{jj'} + \sum_k c_{ik} \theta_k) + \xi_{ijj'}, 144 | $$ 145 | 146 | ## `covariate_level` 147 | 148 | In the adjustment models above, we assume ${\bf z_{i}}\sim F_p({\bf \mu}, \Sigma), x_{ij}|z_{ij} \sim \text{Poisson}(s_i z_{ij})$ and ${\bf \mu}_{i}={\bf \mu}+\sum_k c_{ik} {\bf \beta_k}$. This implies that the underlying mean expression is associated with covariates. 149 | 150 | Another possible model is to assume $\text{Poisson}(s_i z_{ij} + \sum_k c_{ik} {\bf \beta_k})$, which implies that the covariates operate in the measurement process, independent of underlying gene expression. Even though we think the default model is more natural, we allow for this flexibility by specifying `covariate_level = "x"`. This will run 151 | $$ 152 | x_{ij} = s_i \mu_j + \sum_k c_{ik} \beta_k + \epsilon_{ij}, 153 | $$ 154 | and similarly for variance and covariance. 155 | -------------------------------------------------------------------------------- /vignettes/covariate_adjustment.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Covariate adjustment" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{covariate_adjustment} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | --- 9 | 10 | 11 | 12 | # Motivation 13 | The original implementation of CS-CORE does not consider covariate adjustment. For example, in the [paper](https://www.nature.com/articles/s41467-023-40503-7), we assumed 14 | 15 | \[ 16 | {\bf z_{i}}\sim F_p({\bf \mu}, \Sigma), x_{ij}|z_{ij} \sim \text{Poisson}(s_i z_{ij}) 17 | \] 18 | 19 | such that the mean ${\bf \mu}$ and the variance-covariance matrix $\Sigma$ of underlying gene expression are exactly the same for all cells in the population. However, in real data, it is possible that the mean, variance, and co-expression of underlying gene expression could be affected by technical covariates (such as percent.mt) or biological covariates (such as sex) that vary across cell. 20 | 21 | # New moment-based regressions for covariate adjustment 22 | 23 | In June 2025, we added a new feature to CS-CORE that allows adjusting for covariates in co-expression inference. In particular, we adjust for covariates through modelling ${\bf mu}$ and $\Sigma$ as a function of covariates $c_{ik}$'s: 24 | $$ 25 | {\bf \mu}_{i}={\bf \mu}+\sum_k c_{ik} {\bf \beta_k}, \ 26 | \Sigma_{i}=\Sigma +\sum_k c_{ik} {\bf \gamma_k} 27 | $$ 28 | This leads to the following moment-based regression: 29 | 30 | $$ 31 | x_{ij} = s_i (\mu_j + \sum_k c_{ik} \beta_k) + \epsilon_{ij} 32 | $$ 33 | 34 | $$(x_{ij} - s_i \mu_{ij})^2 = s_i \mu_{ij} + s_i^2 (\sigma_{jj} + \sum_k c_{ik} \gamma_k) + \eta_{ij}$$ 35 | $$(x_{ij} - s_i \mu_{ij})(x_{ij'} - s_i \mu_{ij'}) = s_i^2 (\sigma_{jj'} + \sum_k c_{ik} \theta_k) + \xi_{ijj'},$$ 36 | 37 | where $\mu_{ij} = \mu_j + \sum_k c_{ik} \beta_k$. These allow adjusting for $K$ covariates $c_{ik}$'s for their effects on the underlying gene expression mean ($\beta_k$'s), variance ($\gamma_k$'s) and covariance ($\theta_k$'s). 38 | 39 | # Demonstration on real data 40 | 41 | Here, we demonstrates how to adjust for covariates in CS-CORE with the same dataset as in [Getting started](CSCORE.html). 42 | 43 | 44 | ``` r 45 | library(CSCORE) 46 | library(Seurat) 47 | ``` 48 | 49 | 50 | ``` r 51 | # wget https://hosted-matrices-prod.s3-us-west-2.amazonaws.com/Single_cell_atlas_of_peripheral_immune_response_to_SARS_CoV_2_infection-25/blish_covid.seu.rds 52 | 53 | pbmc <- readRDS('blish_covid.seu.rds') 54 | pbmc <- UpdateSeuratObject(pbmc) # update the obsolete Seurat object 55 | pbmc_B = pbmc[,pbmc$cell.type.coarse %in% 'B'] 56 | mean_exp = rowMeans(pbmc_B@assays$RNA@counts/pbmc_B$nCount_RNA) 57 | genes_selected = names(sort.int(mean_exp, decreasing = T))[1:200] 58 | pbmc_B_healthy <- pbmc_B[, pbmc_B$Status == "Healthy"] 59 | ``` 60 | 61 | This dataset comes with detailed cell-level covariates: 62 | 63 | ``` r 64 | colnames(pbmc_B_healthy@meta.data) 65 | #> [1] "orig.ident" "nCount_RNA" "nFeature_RNA" "percent.mt" "percent.rps" "percent.rpl" 66 | #> [7] "percent.rrna" "nCount_SCT" "nFeature_SCT" "SCT_snn_res.1" "seurat_clusters" "singler" 67 | #> [13] "Admission.level" "cell.type.fine" "cell.type.coarse" "cell.type" "IFN1" "HLA1" 68 | #> [19] "Donor.orig" "Donor.full" "Donor" "Status" "Sex" "DPS" 69 | #> [25] "DTF" "Admission" "Ventilated" 70 | ``` 71 | 72 | As an example, we choose to adjust for `Sex` and `percent.mt`. To run CS-CORE with covariate adjustment, use 73 | 74 | 75 | ``` r 76 | CSCORE_result_adj <- CSCORE(pbmc_B_healthy, 77 | genes = genes_selected, 78 | covariate_names = c('percent.mt', 'Sex')) 79 | #> [INFO] Adjust for covariates: percent.mt, Sex 80 | #> [INFO] Variables in the design matrix: percent.mt, SexM 81 | #> [INFO] IRLS converged after 3 iterations. 82 | #> [INFO] Starting WLS for covariance at Thu Jun 26 17:09:48 2025 83 | #> [INFO] 1 among 200 genes have invalid variance estimates. Their co-expressions with other genes were set to 0. 84 | #> [INFO] 0.0854% co-expression estimates were greater than 1 and were set to 1. 85 | #> [INFO] 0.0101% co-expression estimates were smaller than -1 and were set to -1. 86 | #> [INFO] Finished WLS. Elapsed time: 1.8762 seconds. 87 | ``` 88 | 89 | By default, `CSCORE` extracts the covariates from the Seurat object and construct a design matrix with scaled and centered covariates. To understand the detailed impact of covariate adjustment, we also compare with the results without covariate adjustment. 90 | 91 | 92 | ``` r 93 | CSCORE_result <- CSCORE(pbmc_B_healthy, genes = genes_selected) 94 | #> [INFO] IRLS converged after 3 iterations. 95 | #> [INFO] Starting WLS for covariance at Thu Jun 26 17:09:50 2025 96 | #> [INFO] 0.0101% co-expression estimates were greater than 1 and were set to 1. 97 | #> [INFO] 0.0000% co-expression estimates were smaller than -1 and were set to -1. 98 | #> [INFO] Finished WLS. Elapsed time: 1.2249 seconds. 99 | ``` 100 | 101 | 102 | ``` r 103 | # compare co-expression estimates for a random set of gene pairs 104 | set.seed(42002) 105 | p <- length(genes_selected) 106 | random_pairs <- sample(which(upper.tri(matrix(1:p^2, p, p))), 1000) 107 | plot(CSCORE_result$est[random_pairs], 108 | CSCORE_result_adj$est[random_pairs], 109 | xlab = 'CS-CORE', ylab = 'CS-CORE adjusted', 110 | main = 'Co-expression estimates') 111 | abline(0,1,col='red') 112 | ``` 113 | 114 | ![](figure/figrho-1.png) 115 | 116 | 117 | 118 | 119 | ``` r 120 | plot(CSCORE_result$test_stat[random_pairs], 121 | CSCORE_result_adj$test_stat[random_pairs], 122 | xlab = 'CS-CORE', ylab = 'CS-CORE adjusted', 123 | main = 'Test_stat') 124 | abline(0,1,col='red') 125 | ``` 126 | 127 | ![](figure/figt-1.png) 128 | 129 | 130 | 131 | It seems that the co-expression for most gene pairs are similar with and without covariate adjustment. We recommend users to sanity check this and examine the impact of covariate adjustment on co-expression inference. 132 | 133 | 134 | # Advanced topics 135 | 136 | The application above adjusts for covariates in the underlying expression levels' mean, variance, and covariance. For users who wish to have more fine-grained control on the regression models, we provide two additional parameters: `adjust_setting` and `covariate_level`. 137 | 138 | ## `adjust_setting` 139 | 140 | We provide these two options in [CSCORE_IRLS_cpp](CSCORE_IRLS_cpp.html), which is the function underlying `CSCORE`. `adjust_setting` allows you to choose which regression model to adjust covariates for. For example, if `adjust_setting=c(mean = T, var = F, covar = T)`, this is equivalent to running the following regressions: 141 | 142 | $$ 143 | x_{ij} = s_i (\mu_j + \sum_k c_{ik} \beta_k) + \epsilon_{ij} 144 | $$ 145 | 146 | $$ 147 | (x_{ij} - s_i \mu_{ij})^2 = s_i \mu_{ij} + s_i^2 \sigma_{jj} + \eta_{ij} 148 | $$ 149 | 150 | $$ 151 | (x_{ij} - s_i \mu_{ij})(x_{ij'} - s_i \mu_{ij'}) = s_i^2 (\sigma_{jj'} + \sum_k c_{ik} \theta_k) + \xi_{ijj'}, 152 | $$ 153 | 154 | ## `covariate_level` 155 | 156 | In the adjustment models above, we assume ${\bf z_{i}}\sim F_p({\bf \mu}, \Sigma), x_{ij}|z_{ij} \sim \text{Poisson}(s_i z_{ij})$ and ${\bf \mu}_{i}={\bf \mu}+\sum_k c_{ik} {\bf \beta_k}$. This implies that the underlying mean expression is associated with covariates. 157 | 158 | Another possible model is to assume $\text{Poisson}(s_i z_{ij} + \sum_k c_{ik} {\bf \beta_k})$, which implies that the covariates operate in the measurement process, independent of underlying gene expression. Even though we think the default model is more natural, we allow for this flexibility by specifying `covariate_level = "x"`. This will run 159 | $$ 160 | x_{ij} = s_i \mu_j + \sum_k c_{ik} \beta_k + \epsilon_{ij}, 161 | $$ 162 | and similarly for variance and covariance. 163 | -------------------------------------------------------------------------------- /src/CSCORE_IRLS_cpp.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | // [[Rcpp::depends(RcppArmadillo)]] 3 | #include "WLS_mean.h" 4 | #include "WLS_cov.h" 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | using namespace Rcpp; 11 | 12 | arma::mat post_process_est(arma::mat est) { 13 | int p = est.n_rows; 14 | 15 | // Identify diagonal elements that are NA or Inf 16 | std::vector neg_gene_inds; 17 | for (int i = 0; i < p; ++i) { 18 | double val = est(i, i); 19 | if (std::isnan(val) || !std::isfinite(val)) { 20 | neg_gene_inds.push_back(i); 21 | } 22 | } 23 | 24 | if (!neg_gene_inds.empty()) { 25 | Rcpp::Rcout << "[INFO] " << neg_gene_inds.size() << " among " << p 26 | << " genes have invalid variance estimates. Their co-expressions with other genes were set to 0." << std::endl; 27 | } 28 | 29 | // Set entire rows and columns of invalid genes to 0 30 | for (int idx : neg_gene_inds) { 31 | est.row(idx).zeros(); 32 | est.col(idx).zeros(); 33 | } 34 | 35 | // Set diagonal to 1 36 | est.diag().ones(); 37 | 38 | // Clip values outside [-1, 1] and count them 39 | int upper_count = 0, lower_count = 0, total = 0; 40 | for (int i = 0; i < p; ++i) { 41 | for (int j = i + 1; j < p; ++j) { 42 | total++; 43 | if (est(i, j) > 1.0) { 44 | upper_count++; 45 | est(i, j) = 1.0; 46 | est(j, i) = 1.0; 47 | } else if (est(i, j) < -1.0) { 48 | lower_count++; 49 | est(i, j) = -1.0; 50 | est(j, i) = -1.0; 51 | } 52 | } 53 | } 54 | 55 | Rcpp::Rcout << "[INFO] " 56 | << std::fixed << std::setprecision(4) 57 | << 100.0 * upper_count / total 58 | << "% co-expression estimates were greater than 1 and were set to 1." << std::endl; 59 | Rcpp::Rcout << "[INFO] " 60 | << 100.0 * lower_count / total 61 | << "% co-expression estimates were smaller than -1 and were set to -1." << std::endl; 62 | 63 | return est; 64 | } 65 | 66 | 67 | // ---- inline quantile approximation ---- 68 | inline double arma_quantile(const arma::vec& v, double prob) { 69 | arma::uword n = v.n_elem; 70 | if (n == 0) return NA_REAL; 71 | 72 | arma::vec sorted = arma::sort(v); 73 | double index = (n - 1) * prob; 74 | arma::uword lo = std::floor(index); 75 | arma::uword hi = std::ceil(index); 76 | 77 | //Rcpp::Rcout << "[DEBUG] Input vector size: " << v.n_elem << std::endl; 78 | //Rcpp::Rcout << "[DEBUG] Sorted vector head: " << sorted.head(5).t(); 79 | //Rcpp::Rcout << "[DEBUG] index: " << index << ", lo: " << lo << ", hi: " << hi << std::endl; 80 | 81 | if (lo == hi) { 82 | return sorted[lo]; 83 | } else { 84 | double weight = index - lo; 85 | return (1.0 - weight) * sorted[lo] + weight * sorted[hi]; 86 | } 87 | } 88 | 89 | 90 | // [[Rcpp::export]] 91 | Rcpp::List CSCORE_IRLS_cpp_impl(const arma::mat& X, 92 | const arma::vec& seq_depth_sq, 93 | const arma::mat& D_mu, 94 | const arma::mat& D_sigma2, 95 | const arma::mat& D_sigma, 96 | const bool post_process = true, 97 | const int n_iter = 10, 98 | const double eps = 0.05, 99 | const bool verbose = false, 100 | const std::string& conv = "q95", 101 | const bool return_all = false) { 102 | 103 | const arma::uword n = X.n_rows; 104 | const arma::uword p = X.n_cols; 105 | 106 | arma::mat W = arma::ones(X.n_rows, X.n_cols); 107 | arma::mat mu_beta = WLS_mean(D_mu, X, W); 108 | arma::mat M = D_mu * mu_beta; 109 | arma::rowvec mu = mu_beta.row(0); 110 | 111 | arma::mat X_centered = X - M; 112 | arma::mat sigma2_beta = WLS_mean(D_sigma2, (arma::square(X_centered) - M), W); 113 | arma::rowvec sigma2 = sigma2_beta.row(0); 114 | arma::rowvec theta = arma::square(mu) / sigma2; 115 | 116 | int iter = 0; 117 | double delta = std::numeric_limits::infinity(); 118 | 119 | while (delta > eps && iter <= n_iter) { 120 | arma::rowvec theta_prev = theta; 121 | arma::vec theta_pos = theta.elem(arma::find(theta > 0)); 122 | double theta_median = arma::median(theta_pos); 123 | 124 | arma::rowvec mu_sq_over_theta = arma::square(mu) / theta_median; 125 | arma::mat weight = M + arma::repmat(seq_depth_sq, 1, p) % arma::repmat(mu_sq_over_theta, n, 1); 126 | weight.elem(arma::find_nonfinite(weight)).fill(1); 127 | weight.elem(arma::find(weight <= 0)).fill(1); 128 | 129 | mu_beta = WLS_mean(D_mu, X, 1 / weight); 130 | mu = mu_beta.row(0); 131 | M = D_mu * mu_beta; 132 | X_centered = X - M; 133 | 134 | arma::mat h = arma::square(arma::square(M) / theta_median + M); 135 | h.elem(arma::find(h <= 0)).fill(1); 136 | sigma2_beta = WLS_mean(D_sigma2, (arma::square(X_centered) - M), 1 / h); 137 | sigma2 = sigma2_beta.row(0); 138 | theta = arma::square(mu) / sigma2; 139 | 140 | arma::uvec valid_idx = arma::find(theta > 0 && theta_prev > 0); 141 | arma::vec ratio = arma::log(theta.elem(valid_idx) / theta_prev.elem(valid_idx)); 142 | if (conv == "max") { 143 | delta = arma::max(arma::abs(ratio)); 144 | } else if (conv == "q95") { 145 | delta = arma_quantile(arma::abs(ratio), 0.95);; 146 | } 147 | iter++; 148 | if (verbose) { 149 | Rcpp::Rcout << "[INFO] " << iter << ": delta=" << delta << "\n"; 150 | } 151 | } 152 | 153 | if (iter > n_iter && delta > eps) { 154 | Rcpp::Rcout << "[WARNING] IRLS did not converge after " << n_iter << " iterations. \n Please increase the number of iterations or check your data.\n"; 155 | } else { 156 | Rcpp::Rcout << "[INFO] IRLS converged after " << iter << " iterations.\n"; 157 | } 158 | 159 | 160 | arma::rowvec mu_sq_over_theta = arma::square(mu) / arma::median(theta.elem(arma::find(theta > 0))); 161 | arma::mat w_cov = M + arma::repmat(seq_depth_sq, 1, p) % arma::repmat(mu_sq_over_theta, n, 1); 162 | w_cov.elem(arma::find_nonfinite(w_cov)).fill(1); 163 | w_cov.elem(arma::find(w_cov <= 0)).fill(1); 164 | X_centered = X - M; 165 | 166 | auto start = std::chrono::system_clock::now(); 167 | std::time_t start_time = std::chrono::system_clock::to_time_t(start); 168 | Rcpp::Rcout << "[INFO] Starting WLS for covariance at " << std::ctime(&start_time); // includes newline 169 | auto tic = std::chrono::high_resolution_clock::now(); 170 | 171 | // Covariance estimation 172 | Rcpp::List covar_list = WLS_cov(D_sigma, X_centered, 1 / w_cov); 173 | arma::cube covar_beta = Rcpp::as(covar_list["cov_hat"]); 174 | arma::mat covar = covar_beta.row(0); // Extract k = 0 slice => p × p matrix 175 | 176 | // Statistical testing 177 | arma::mat Sigma = M + arma::repmat(seq_depth_sq, 1, p) % arma::repmat(sigma2, n, 1); 178 | arma::mat ele_inv_Sigma = 1 / Sigma; 179 | 180 | Rcpp::List ts_list = WLS_cov(D_sigma, X_centered, ele_inv_Sigma); 181 | arma::vec sigma = arma::sqrt(arma::clamp(sigma2.t(), 0, arma::datum::inf)); 182 | //arma::cube covar_beta = Rcpp::as(ts_list["cov_hat"]); 183 | //arma::mat covar = covar_beta.row(0); // Extract k = 0 slice => p × p matrix 184 | arma::mat est = covar / (sigma * sigma.t()); 185 | arma::mat test_stat = ts_list["test_stat"]; 186 | arma::mat p_value = 2 * arma::normcdf(-arma::abs(test_stat)); 187 | 188 | if (post_process) { 189 | est = post_process_est(est); 190 | } 191 | 192 | auto toc = std::chrono::high_resolution_clock::now(); 193 | std::chrono::duration elapsed = toc - tic; 194 | Rcpp::Rcout << "[INFO] Finished WLS. Elapsed time: " << elapsed.count() << " seconds.\n"; 195 | 196 | Rcpp::List result = List::create(Named("est") = est, 197 | Named("p_value") = p_value, 198 | Named("test_stat") = test_stat); 199 | 200 | if (return_all) { 201 | result["mu_beta"] = mu_beta; 202 | result["sigma2_beta"] = sigma2_beta; 203 | result["cov_beta"] = covar_beta; 204 | } 205 | 206 | return result; 207 | } 208 | 209 | 210 | -------------------------------------------------------------------------------- /R/CSCORE_IRLS.R: -------------------------------------------------------------------------------- 1 | #' Iteratively reweighted least squares (IRLS) procedure in CS-CORE 2 | #' 3 | #' This function implements the IRLS procedure used in CS-CORE for estimating and testing 4 | #' cell-type-specific co-expression from single-cell RNA sequencing data. 5 | #' 6 | #' Let \eqn{x_{ij}} denote the UMI count of gene \eqn{j} in cell \eqn{i}; 7 | #' \eqn{s_i} denote the sequencing depth; 8 | #' \eqn{\mu_j,\sigma_{jj}, \sigma_{jj'}} denote the mean, variance and covariance; 9 | #' \eqn{c_{ik}} denote additional covariate \eqn{k} for cell \eqn{i} (e.g. disease status or cellular state). 10 | #' The procedure consists of two main steps: 11 | #' \enumerate{ 12 | #' \item \strong{Mean and variance estimation:} Estimate gene-specific mean and variance parameters 13 | #' using two moment-based regressions: 14 | #' \itemize{ 15 | #' \item \strong{Mean model:} \eqn{x_{ij} = s_i (\mu_j + \sum_k c_{ik} \beta_k) + \epsilon_{ij}} 16 | #' \item \strong{Variance model:} \eqn{(x_{ij} - s_i \mu_{ij})^2 = s_i \mu_{ij} + s_i^2 (\sigma_{jj} + \sum_k c_{ik} \gamma_k) + \eta_{ij}}, 17 | #' where \eqn{\mu_{ij} = \mu_j + \sum_k c_{ik} \beta_k} 18 | #' } 19 | #' 20 | #' \item \strong{Covariance estimation and hypothesis testing:} Estimate gene-gene covariance 21 | #' and compute test statistics to assess the statistical significance of gene co-expression using a third moment-based regression: 22 | #' \itemize{ 23 | #' \item \strong{Covariance model:} \eqn{(x_{ij} - s_i \mu_{ij})(x_{ij'} - s_i \mu_{ij'}) = s_i^2 (\sigma_{jj'} + \sum_k c_{ik} \theta_k) + \xi_{ijj'}} 24 | #' } 25 | #' } 26 | #' We note that 27 | #' \enumerate{ 28 | #' \item The formulation above assumes that the covariates alter the mean / variance / covariance of underlying gene expression, 29 | #' rather than observed counts. If you believe that the covariates directly affect the observed counts independent of underlying gene expression 30 | #' (e.g. \eqn{x_{ij}|z_{ij} \sim \text{Poisson}(s_i z_{ij}+\sum_k c_{ik} \beta_k)} or \eqn{x_{ij} = s_i \mu_j + \sum_k c_{ik} \beta_k + \epsilon_{ij}}), 31 | #' please specify \code{covariate_level="x"}. 32 | #' \item The original CS-CORE published in 33 | #' did not consider adjusting for covariates \eqn{c_{ik}}'s. This is equivalent to setting \code{covariates} to \code{NULL}. 34 | #' } 35 | #' @source Su, C., Xu, Z., Shan, X., Cai, B., Zhao, H., & Zhang, J. (2023). 36 | #' Cell-type-specific co-expression inference from single cell RNA-sequencing data. 37 | #' \emph{Nature Communications}. 38 | #' doi: 39 | #' 40 | #' @param X A numeric matrix of UMI counts (\code{n x p}), 41 | #' where \code{n} is the number of cells and \code{p} is the number of genes. 42 | #' @param seq_depth A numeric vector of sequencing depths of length \code{n}. 43 | #' @param covariates Optional. A numeric matrix of covariates (\code{n x K}) to be adjusted for in the moment-based regressions. 44 | #' Can be a length n vector if \eqn{K=1}. 45 | #' If \code{NULL}, no covariates are adjusted for in the regression. 46 | #' Defaults to \code{NULL}. 47 | #' @param post_process Optional. Logical; whether to rescale the estimated co-expressions to lie between –1 and 1. 48 | #' Defaults to \code{TRUE}. 49 | #' @param covariate_level Optional. A character string indicating whether covariates are assumed to affect 50 | #' the underlying gene expression levels (\code{"z"}) or the observed counts (\code{"x"}). 51 | #' See the *Details* section for further explanation. 52 | #' Defaults to \code{"z"}. 53 | #' @param adjust_setting Optional. A named logical vector of length 3; 54 | #' whether to adjust for covariates at the mean, variance, and covariance level. 55 | #' Must be named \code{c("mean", "var", "covar")}. 56 | #' Defaults to \code{c(mean = T,var = T, covar = T)}. 57 | #' @param return_all Logical; whether to return all estimates, including the effect sizes for covariates. 58 | #' Defaults to \code{FALSE}. 59 | #' 60 | #' @return A list containing the following components: 61 | #' \describe{ 62 | #' \item{est}{A \eqn{p \times p} matrix of co-expression estimates.} 63 | #' \item{p_value}{A \eqn{p \times p} matrix of p-values.} 64 | #' \item{test_stat}{A \eqn{p \times p} matrix of test statistics evaluating the statistical significance of co-expression.} 65 | #' \item{mu_beta}{A \eqn{k \times p} matrix of regression coefficients from the mean model. Returned if \code{return_all = TRUE}.} 66 | #' \item{sigma2_beta}{A \eqn{k \times p} matrix of regression coefficients from the variance model. Returned if \code{return_all = TRUE}.} 67 | #' \item{cov_beta}{A \eqn{k \times p \times p} array of regression coefficients from the covariance model. Returned if \code{return_all = TRUE}.} 68 | #' } 69 | #' @keywords internal 70 | #' 71 | 72 | CSCORE_IRLS <- function(X, seq_depth, 73 | covariates = NULL, 74 | post_process = TRUE, 75 | covariate_level = 'z', 76 | adjust_setting = c('mean' = T, 'var' = T, 'covar' = T), 77 | return_all = FALSE){ 78 | if(is.null(seq_depth)){ 79 | seq_depth = apply(X, 1, sum, na.rm = T) 80 | } 81 | if(nrow(X) != length(seq_depth)){ 82 | stop('The length of the sequencing depth must match the number of cells.') 83 | } 84 | if(has_non_integer(X)){ 85 | stop("CS-CORE takes the UMI count matrix as input. The input matrix X contains non-integer values.") 86 | } 87 | if(!is.null(covariates)){ 88 | if(is.vector(covariates)){ 89 | if(nrow(X) != length(covariates)) stop('The length of the covariate vector should match the number of cells.') 90 | }else{ 91 | if(nrow(X) != nrow(covariates)) stop('The number of rows in the covariate matrix should match the number of cells.') 92 | message(sprintf('The design matrix for regression has %i columns', ncol(covariates) + 1)) 93 | } 94 | } 95 | n_cell = nrow(X) 96 | n_gene = ncol(X) 97 | seq_depth_sq = seq_depth^2 98 | seq_2 = sum(seq_depth_sq) 99 | seq_4 = sum(seq_depth^4) 100 | # Construct design matrix 101 | if(is.null(covariates)){ 102 | D <- matrix(rep(1, n_cell), ncol = 1) 103 | }else{ 104 | D <- cbind(1, covariates) 105 | } 106 | D_mu <- set_D(seq_depth, D, adjust_setting['mean'], covariate_level) 107 | D_sigma2 <- set_D(seq_depth_sq, D, adjust_setting['var'], covariate_level) 108 | D_sigma <- set_D(seq_depth_sq, D, adjust_setting['covar'], covariate_level) 109 | # initialize with OLS 110 | mu_beta <- WLS_mean(D_mu, X, matrix(1, nrow = n_cell, ncol = n_gene)) 111 | M <- D_mu %*% mu_beta 112 | mu <- mu_beta[1,] 113 | X_centered = X - M 114 | sigma2_beta <- WLS_mean(D_sigma2, X_centered^2 - M, matrix(1, nrow = n_cell, ncol = n_gene)) 115 | sigma2 <- sigma2_beta[1,] 116 | theta = mu^2/sigma2 117 | j = 0 118 | delta = Inf 119 | 120 | # IRLS for estimating mean and variance parameters 121 | while( delta > 0.05 & j <= 10 ){ 122 | theta_previous = theta 123 | theta_median = stats::quantile(theta[theta > 0], na.rm = T, probs = 0.5) 124 | theta[theta < 0] = Inf 125 | w = M + outer(seq_depth_sq, mu^2/theta_median) 126 | w[is.na(w)|w <= 0] = 1 127 | mu_beta <- WLS_mean(D_mu, X, 1/w) 128 | mu <- mu_beta[1,] 129 | M <- D_mu %*% mu_beta 130 | X_centered = X - M 131 | h = (M^2/theta_median + M)^2 132 | h[h <= 0] = 1 133 | sigma2_beta <- WLS_mean(D_sigma2, X_centered^2 - M, 1/h) 134 | sigma2 <- sigma2_beta[1,] 135 | theta = mu^2/sigma2 136 | j = j+1 137 | delta = max(abs(log((theta/theta_previous)[theta > 0 & theta_previous > 0])), na.rm = T) 138 | } 139 | if(j == 10 & delta > 0.05){ 140 | print('IRLS failed to converge after 10 iterations. Please check your data.') 141 | }else{ 142 | print(sprintf('IRLS converged after %i iterations.', j)) 143 | } 144 | 145 | # Update the weights for estimating covariance 146 | theta_median = stats::quantile(theta[theta > 0], na.rm = T, probs = 0.5) 147 | theta[theta < 0] = Inf 148 | w = M + outer(seq_depth_sq, mu^2/theta_median) 149 | w[is.na(w)|w <= 0] = 1 150 | 151 | X_centered <- X - M 152 | covar <- WLS_cov(D_sigma, X_centered, 1/w)$cov_hat[1,,] 153 | # Evaluate test statistics and p values 154 | Sigma <- M + outer(seq_depth_sq, sigma2) 155 | ele_inv_Sigma <- 1/Sigma 156 | ts_res <- WLS_cov(D_sigma, X_centered, ele_inv_Sigma) 157 | est <- ts_res$cov_hat[1,,] 158 | p_value <- 2 * stats::pnorm(abs(ts_res$test_stat), lower.tail = F) 159 | 160 | # Evaluate co-expression estimates 161 | neg_gene_inds <- which(sigma2 < 0) 162 | sigma2[neg_gene_inds] <- 0 163 | sigma <- sqrt(sigma2) 164 | diag(covar) <- sigma 165 | est <- covar/outer(sigma, sigma) 166 | test_stat <- ts_res$test_stat 167 | 168 | # Post-process the co-expression estimates 169 | #diag(est) <- 1 170 | if(post_process) est <- post_process_est(est) 171 | rownames(est) <- colnames(est) <- rownames(p_value) <- colnames(p_value) <- rownames(test_stat) <- colnames(test_stat) <- colnames(X) 172 | result_list <- list(est = est, p_value = p_value, test_stat = test_stat) 173 | if(return_all){ 174 | result_list$mu_beta <- mu_beta 175 | result_list$sigma2_beta <- sigma2_beta 176 | result_list$cov_beta <- ts_res$cov_hat 177 | } 178 | return(result_list) 179 | } 180 | 181 | -------------------------------------------------------------------------------- /vignettes/CSCORE.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "CS-CORE for cell-type-specific co-expression analysis" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{CS-CORE for cell-type-specific co-expression analysis} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | --- 9 | 10 | ```{r, include = FALSE} 11 | knitr::opts_chunk$set( 12 | collapse = TRUE, 13 | comment = "#>" 14 | ) 15 | ``` 16 | 17 | This vignette shows an example of applying `CS-CORE` to infer cell-type-specific co-expression networks and extracting co-expressed gene modules that are enriched for biological functions in cell types. 18 | 19 | # 1. Load packages and data 20 | 21 | ```{r setup, warning = FALSE, message = FALSE} 22 | library(CSCORE) 23 | library(Seurat) 24 | ``` 25 | 26 | In this vignette, we use the single cell RNA-sequencing data on Peripheral blood mononuclear cells (PBMC) from COVID patients and healthy controls from [Wilk et al.](https://www.nature.com/articles/s41591-020-0944-y), which were also studied in our [paper](https://www.nature.com/articles/s41467-023-40503-7). This data set can be downloaded via the following bash script 27 | 28 | ```{r, engine = 'bash', eval = FALSE} 29 | wget https://hosted-matrices-prod.s3-us-west-2.amazonaws.com/Single_cell_atlas_of_peripheral_immune_response_to_SARS_CoV_2_infection-25/blish_covid.seu.rds 30 | ``` 31 | 32 | After downloading blish_covid.seu.rds, we load it into the R session 33 | ```{r eval = FALSE} 34 | pbmc <- readRDS('blish_covid.seu.rds') 35 | pbmc <- UpdateSeuratObject(pbmc) # update the obsolete Seurat object 36 | ``` 37 | 38 | # 2. Select cell types and gene sets to study 39 | 40 | In this example, we focus on B cells and infer the B cell–specific co-expression network. In general, we strongly recommend subsetting to the cell type of interest before inferring co-expression. Otherwise, the estimates may be affected by [Simpson's paradox](https://pmc.ncbi.nlm.nih.gov/articles/PMC4579334/#:~:text=(A)%20Simpson's%20Paradox%20describes%20), capturing associations driven by differences between cell types (e.g.correlations between cell type marker genes), rather than gene-gene co-expression within a homogeneous cell population. 41 | 42 | ```{r eval = FALSE} 43 | pbmc_B = pbmc[,pbmc$cell.type.coarse %in% 'B'] 44 | ``` 45 | 46 | Depending on the biological question of interest, one may choose to study the co-expression network for any gene set. Here, we chose to infer the co-expression network for the genes with meaningful expression levels in B cells (top 5000 among 26361 genes). There are several reasons for our choice: 47 | 48 | 1. All genes with moderate to high expression levels provides a comprehensive and unbiased set of genes that could have meaningful biological functions in a cell type. 49 | 50 | 2. If the genes have much lower expression levels, it would be statistically more challenging and biologically less interesting to infer their co-expressions, as these genes might have almost all UMI counts equal to 0. 51 | 52 | In general, it will be up to the users's choice to select the gene sets to study. We recommend choosing the gene sets that are of interest to your application. 53 | 54 | ```{r eval = FALSE} 55 | mean_exp = rowMeans(pbmc_B@assays$RNA@counts/pbmc_B$nCount_RNA) 56 | genes_selected = names(sort.int(mean_exp, decreasing = T))[1:5000] 57 | ``` 58 | 59 | 60 | # 3. Run `CS-CORE` to infer cell-type-specific co-expression network on the specified gene set 61 | 62 | We further subset the B cells to those from healthy control subjects in order to study B-cell specific co-expression network among healthy control B cells. 63 | 64 | ```{r eval = FALSE} 65 | pbmc_B_healthy <- pbmc_B[, pbmc_B$Status == "Healthy"] 66 | ``` 67 | 68 | Run `CS-CORE` with the subsetted Seurat object and a gene set of interest. We note that CSCORE operates on the raw UMI counts, i.e. `object[['RNA']]@counts` for the Seurat object `object`. 69 | 70 | ```{r eval = FALSE} 71 | CSCORE_result <- CSCORE(pbmc_B_healthy, genes = genes_selected) 72 | # faster yet more memory intensive: 73 | # CSCORE_result <- CSCORE(pbmc_B_healthy, genes = genes_selected, IRLS_version = 'bash_R') 74 | ``` 75 | 76 | Since version 1.0.1 (updated June 2025), CSCORE provides a new Rcpp implementation of moment-based regressions that offers improved memory efficiency and supports covariate adjustment (see [Covariate adjustment](covariate_adjustment.html)). The original implementation at the time of publication (version 0.0.0.9) was written in base R, and while it is approximately 10 times faster, it is also significantly more memory intensive (by a factor of 10–100). We recommend choosing between the two implementations based on available memory resources and the need for computational speed and covariate adjustment. 77 | 78 | # 4. Downstream analysis on the co-expression network 79 | 80 | ## 4.1 Extract co-expressed gene module 81 | 82 | Given the `CS-CORE` $p$-values, we first set co-expressions that are not statistically significant (Benjamini \& Hochberg-adjusted $p$-values $>0.05$) to 0. 83 | 84 | ```{r eval = FALSE} 85 | # Obtain CS-CORE co-expression estimates 86 | CSCORE_coexp <- CSCORE_result$est 87 | 88 | # Obtain BH-adjusted p values 89 | CSCORE_p <- CSCORE_result$p_value 90 | p_matrix_BH = matrix(0, length(genes_selected), length(genes_selected)) 91 | p_matrix_BH[upper.tri(p_matrix_BH)] = p.adjust(CSCORE_p[upper.tri(CSCORE_p)], method = "BH") 92 | p_matrix_BH <- p_matrix_BH + t(p_matrix_BH) 93 | 94 | # Set co-expression entires with BH-adjusted p-values greater than 0.05 to 0 95 | CSCORE_coexp[p_matrix_BH > 0.05] <- 0 96 | ``` 97 | 98 | Next, based on the thresholded co-expression matrix, we apply [WGCNA](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-9-559) to extract co-expressed gene modules. In particular, we use `CS-CORE` estimates to measure co-expressions for single cell RNA-sequencing data, which replace the Pearson correlations used in traditional WGNCA workflow, that suffer from inflated false positives and attenuation bias on single cell data as demonstrated in our [manuscript](https://www.biorxiv.org/content/10.1101/2022.12.13.520181v1). 99 | 100 | ```{r eval = FALSE} 101 | if (!require(WGCNA)) { 102 | install.packages("WGCNA") 103 | library(WGCNA) 104 | }else{ 105 | library(WGCNA) 106 | } 107 | ``` 108 | 109 | ```{r eval = FALSE} 110 | # Compute the adjacency matrix based on the co-expression matrix 111 | adj = WGCNA::adjacency.fromSimilarity(abs(CSCORE_coexp), power = 1) 112 | # Compute the topological overlap matrix 113 | TOM = WGCNA::TOMsimilarity(adj) 114 | dissTOM = 1-TOM 115 | rownames(dissTOM) <- colnames(dissTOM) <- genes_selected 116 | # Run hierarchical clustering as in the WGCNA workflow 117 | hclust_dist = hclust(as.dist(dissTOM), method = "average") 118 | memb = dynamicTreeCut::cutreeDynamic(dendro = hclust_dist, 119 | distM = dissTOM, 120 | deepSplit = 2, 121 | pamRespectsDendro = FALSE, 122 | minClusterSize = 10) 123 | # For more instructions on how to tune the parameters in the WGCNA workflow, 124 | # please refer to https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA/Tutorials/ 125 | 126 | names(memb) = genes_selected 127 | memb_tab <- table(memb) 128 | module_list = lapply(sort(unique(memb)), function(i_k) names(which(memb == i_k))) 129 | ``` 130 | One can also apply other clustering methods to extract co-expressed gene modules. 131 | 132 | ## 4.2 Functional enrichment analysis 133 | 134 | Gene Ontology (GO) enrichment analysis is one of the common downstream functional enrichment analyses for interpreting the biological pathways implied by co-expressed gene modules. Here, we showcase the GO pathways enriched in `CS-CORE` co-expressed gene modules using the R implementation from [Wu et al.](https://pubmed.ncbi.nlm.nih.gov/34557778/). 135 | 136 | ```{r eval = FALSE} 137 | if (!require(clusterProfiler)) { 138 | BiocManager::install("clusterProfiler") 139 | library(clusterProfiler) 140 | }else{ 141 | library(clusterProfiler) 142 | } 143 | ``` 144 | 145 | ```{r eval = FALSE} 146 | # Set all genes in clustering analysis as background, 147 | # such that the enrichment result of any module is not attributed to its high expression levels. 148 | universe <- genes_selected 149 | 150 | # Filter GO terms based on BH-adjusted p values < 0.05 151 | #### 152 | ## Note: the following codes can take a long time to run as 153 | ## in this example there are more than 100 co-expressed gene modules from WGCNA 154 | #### 155 | ego_result <- lapply(1:length(module_list), function(i){ 156 | enrichGO(gene = module_list[[i]], 157 | OrgDb = 'org.Hs.eg.db', # human 158 | keyType = "SYMBOL", 159 | ont = "ALL", 160 | pAdjustMethod = "BH", 161 | universe = universe, 162 | pvalueCutoff = 0.05) 163 | }) 164 | ``` 165 | 166 | There are in total 144 gene modules inferred by WGCNA. For illustrative purposes, we focus on the modules with the strongest enrichment signals (with at least one GO term having adjusted $p$-value smaller than $10^{-3}$ and with at least 10 enriched GO terms) and print the top 3 GO terms. 167 | 168 | ```{r eval = FALSE} 169 | top_enrich_clusters <- which(sapply(ego_result, function(x) 170 | (x@result$p.adjust[1] < 0.001) & (dim(x)[1]>10))) 171 | top_enrich_go <- lapply(top_enrich_clusters, function(i) ego_result[[i]]@result[1:3,]) 172 | ``` 173 | 174 | ```{r} 175 | for(i in 1:length(top_enrich_go)){ 176 | print(top_enrich_go[[i]][, c('Description', 'GeneRatio', 'p.adjust')]) 177 | cat('\n') 178 | } 179 | ``` 180 | 181 | At this point, we have reproduced the results in our [manuscript](https://doi.org/10.1038/s41467-023-40503-7), Table S6. 182 | 183 | This concludes our vignette of using `CS-CORE` to infer cell-type-specific co-expression networks and a typical pipeline for extracting co-expressed gene modules and performing functional enrichment analysis. 184 | 185 | One can also perform a differential co-expression analysis based on the codes provided here. For example, the inferred network of healthy B cells can be constrasted to the network inferred with B cells from the COVID-19 patients to study dysregulation in B cells' co-expression due to COVID-19 infection. For more details please refer the the methods in our [manuscript](https://doi.org/10.1038/s41467-023-40503-7). 186 | 187 | **Stay tuned!** We are also working on a pipeline for cell-type-specific module-trait association analyses with single cell RNA-seq data based on `CS-CORE` developed here. 188 | 189 | 190 | -------------------------------------------------------------------------------- /R/CSCORE_IRLS_cpp.R: -------------------------------------------------------------------------------- 1 | #' Iteratively reweighted least squares (IRLS) procedure in CS-CORE (Rcpp) 2 | #' 3 | #' Estimate and test cell-type-specific co-expression using the IRLS procedure with optional covariate adjustment. 4 | #' 5 | #' Let \eqn{x_{ij}} denote the UMI count of gene \eqn{j} in cell \eqn{i}; 6 | #' \eqn{s_i} denote the sequencing depth; 7 | #' \eqn{\mu_j,\sigma_{jj}, \sigma_{jj'}} denote the mean, variance and covariance; 8 | #' \eqn{c_{ik}} denote additional covariate \eqn{k} for cell \eqn{i} (e.g. disease status or cellular state). 9 | #' The procedure consists of two main steps: 10 | #' \enumerate{ 11 | #' \item \strong{Mean and variance estimation:} Estimate gene-specific mean and variance parameters 12 | #' using two moment-based regressions: 13 | #' \itemize{ 14 | #' \item \strong{Mean model:} \eqn{x_{ij} = s_i (\mu_j + \sum_k c_{ik} \beta_k) + \epsilon_{ij}} 15 | #' \item \strong{Variance model:} \eqn{(x_{ij} - s_i \mu_{ij})^2 = s_i \mu_{ij} + s_i^2 (\sigma_{jj} + \sum_k c_{ik} \gamma_k) + \eta_{ij}}, 16 | #' where \eqn{\mu_{ij} = \mu_j + \sum_k c_{ik} \beta_k} 17 | #' } 18 | #' 19 | #' \item \strong{Covariance estimation and hypothesis testing:} Estimate gene-gene covariance 20 | #' and compute test statistics to assess the statistical significance of gene co-expression using a third moment-based regression: 21 | #' \itemize{ 22 | #' \item \strong{Covariance model:} \eqn{(x_{ij} - s_i \mu_{ij})(x_{ij'} - s_i \mu_{ij'}) = s_i^2 (\sigma_{jj'} + \sum_k c_{ik} \theta_k) + \xi_{ijj'}} 23 | #' } 24 | #' } 25 | #' We note that 26 | #' \enumerate{ 27 | #' \item The formulation above assumes that the covariates alter the mean / variance / covariance of underlying gene expression, 28 | #' rather than observed counts. If you believe that the covariates directly affect the observed counts independent of underlying gene expression 29 | #' (e.g. \eqn{x_{ij}|z_{ij} \sim \text{Poisson}(s_i z_{ij}+\sum_k c_{ik} \beta_k)} or \eqn{x_{ij} = s_i \mu_j + \sum_k c_{ik} \beta_k + \epsilon_{ij}}), 30 | #' please specify \code{covariate_level="x"}. 31 | #' \item The original CS-CORE published in 32 | #' did not consider adjusting for covariates \eqn{c_{ik}}'s. This is equivalent to setting \code{covariates} to \code{NULL}. 33 | #' } 34 | #' Note: This is an R wrapper for the `CSCORE_IRLS_cpp_impl()` function implemented in Rcpp. 35 | #' 36 | #' @source Su, C., Xu, Z., Shan, X., Cai, B., Zhao, H., & Zhang, J. (2023). 37 | #' Cell-type-specific co-expression inference from single cell RNA-sequencing data. 38 | #' \emph{Nature Communications}. 39 | #' doi: 40 | #' 41 | #' @param X A numeric matrix of UMI counts (\code{n x p}), 42 | #' where \code{n} is the number of cells and \code{p} is the number of genes. 43 | #' @param seq_depth A numeric vector of sequencing depths of length \code{n}. 44 | #' @param covariates Optional. A numeric matrix of covariates (\code{n x K}) to be adjusted for in the moment-based regressions. 45 | #' Can be a length n vector if \eqn{K=1}. 46 | #' If \code{NULL}, no covariates are adjusted for in the regression. 47 | #' Defaults to \code{NULL}. 48 | #' @param post_process Optional. Logical; whether to rescale the estimated co-expressions to lie between –1 and 1. 49 | #' Defaults to \code{TRUE}. 50 | #' @param covariate_level Optional. A character string indicating whether covariates are assumed to affect 51 | #' the underlying gene expression levels (\code{"z"}) or the observed counts (\code{"x"}). 52 | #' See the *Details* section for further explanation. 53 | #' Defaults to \code{"z"}. 54 | #' @param adjust_setting Optional. A named logical vector of length 3; 55 | #' whether to adjust for covariates at the mean, variance, and covariance level. 56 | #' Must be named \code{c("mean", "var", "covar")}. 57 | #' Defaults to \code{c(mean = T,var = T, covar = T)}. 58 | #' @param IRLS_par Optional. A named list of length 4 specifying parameters for the IRLS algorithm: 59 | #' \describe{ 60 | #' \item{\code{n_iter}}{Maximum number of iterations.} 61 | #' \item{\code{eps}}{Convergence threshold for log-ratio change \code{delta}, computed as \code{abs(log(theta / theta_prev))}.} 62 | #' \item{\code{verbose}}{Logical; whether to print the convergence metric (\code{delta}) at each iteration.} 63 | #' \item{\code{conv}}{Character string; determine convergence based on \code{q95} (0.95 quantile) or \code{max} of \code{delta}.} 64 | #' } 65 | #' Defaults to \code{list(n_iter = 10, eps = 0.05, verbose = FALSE, cov = "q95")}. 66 | #' @param return_all Logical; whether to return all estimates, including the effect sizes for covariates. 67 | #' Defaults to \code{FALSE}. 68 | #' 69 | #' @return A list containing the following components: 70 | #' \describe{ 71 | #' \item{est}{A \eqn{p \times p} matrix of co-expression estimates.} 72 | #' \item{p_value}{A \eqn{p \times p} matrix of p-values.} 73 | #' \item{test_stat}{A \eqn{p \times p} matrix of test statistics evaluating the statistical significance of co-expression.} 74 | #' \item{mu_beta}{A \eqn{k \times p} matrix of regression coefficients from the mean model. Returned if \code{return_all = TRUE}.} 75 | #' \item{sigma2_beta}{A \eqn{k \times p} matrix of regression coefficients from the variance model. Returned if \code{return_all = TRUE}.} 76 | #' \item{cov_beta}{A \eqn{k \times p \times p} array of regression coefficients from the covariance model. Returned if \code{return_all = TRUE}.} 77 | #' } 78 | #' @export 79 | #' 80 | #' @examples 81 | #' ## Toy example: 82 | #' ## run CSCORE on a simulated independent gene pair 83 | #' cscore_example <- CSCORE_IRLS_cpp(ind_gene_pair$counts, ind_gene_pair$seq_depths) 84 | #' 85 | #' ## Estimated co-expression between two genes 86 | #' cscore_example$est[1,2] 87 | #' # close to 0: 0.007820124 88 | #' 89 | #' ## p-values 90 | #' cscore_example$p_value[1,2] 91 | #' # not significant: 0.961981 92 | #' 93 | 94 | CSCORE_IRLS_cpp <- function(X, 95 | seq_depth, 96 | covariates = NULL, 97 | post_process = TRUE, 98 | covariate_level = 'z', 99 | adjust_setting = c('mean' = T, 'var' = T, 'covar' = T), 100 | IRLS_par = list('n_iter' = 10, 'eps' = 0.05, 'verbose' = FALSE, conv = 'q95'), 101 | return_all = FALSE) { 102 | if(is.null(seq_depth)){ 103 | seq_depth = apply(X, 1, sum, na.rm = T) 104 | } 105 | if(nrow(X) != length(seq_depth)){ 106 | stop('[ERROR] The length of the sequencing depth must match the number of cells.') 107 | } 108 | if(! all.equal(names(IRLS_par), c('n_iter', 'eps', 'verbose', 'conv'))){ 109 | stop('[ERROR] IRLS_par must be a named list with integer n_iter, double eps, and logical verbose.') 110 | } 111 | if(has_non_integer(X)){ 112 | stop("[ERROR] CS-CORE takes the UMI count matrix as input. The input matrix X contains non-integer values.") 113 | } 114 | if(!is.null(covariates)){ 115 | if(is.vector(covariates)){ 116 | if(nrow(X) != length(covariates)) stop('[ERROR] The length of the covariate vector should match the number of cells.') 117 | }else{ 118 | if(nrow(X) != nrow(covariates)) stop('[ERROR] The number of rows in the covariate matrix should match the number of cells.') 119 | #cat(sprintf('[INFO] The design matrix for regression has %i columns', ncol(covariates) + 1)) 120 | } 121 | } 122 | if (length(adjust_setting) != 3) stop("[ERROR] adjust_setting must be of length 3.") 123 | 124 | # Construct design matrix 125 | n_cell <- nrow(X) 126 | if(is.null(covariates)){ 127 | D <- matrix(rep(1, n_cell), ncol = 1) 128 | }else{ 129 | D <- cbind(1, covariates) 130 | } 131 | seq_depth_sq <- seq_depth^2 132 | D_mu <- set_D(seq_depth, D, adjust_setting['mean'], covariate_level) 133 | D_sigma2 <- set_D(seq_depth_sq, D, adjust_setting['var'], covariate_level) 134 | D_sigma <- set_D(seq_depth_sq, D, adjust_setting['covar'], covariate_level) 135 | 136 | res <- .Call(`_CSCORE_CSCORE_IRLS_cpp_impl`, 137 | X, seq_depth_sq, D_mu, D_sigma2, D_sigma, # regression parameters 138 | post_process, # post processing 139 | IRLS_par[['n_iter']], IRLS_par[['eps']], IRLS_par[['verbose']], IRLS_par[['conv']], # IRLS parameters 140 | return_all) 141 | 142 | # Assign row/colnames to square matrices 143 | gene_names <- colnames(X) 144 | rownames(res$est) <- colnames(res$est) <- gene_names 145 | rownames(res$p_value) <- colnames(res$p_value) <- gene_names 146 | rownames(res$test_stat) <- colnames(res$test_stat) <- gene_names 147 | 148 | return(res) 149 | } 150 | 151 | #' Set the design matrix for moment-based regressions 152 | #' 153 | #' 154 | #' @param s A numeric vector of sequencing depths (for mean regression) or squared sequencing depths (for variance and covariance) 155 | #' @param D A numeric matrix of intercept and covariates (\code{n x K}) 156 | #' @param adjust_setting Logical; whether to adjust for covariates 157 | #' @param covariate_level A character string indicating whether covariates are assumed to affect 158 | #' the underlying gene expression levels (\code{"z"}) or the observed counts (\code{"x"}). 159 | #' 160 | #' @return Design matrix (n by K) for moment-based regressions 161 | #' 162 | #' @keywords internal 163 | #' 164 | set_D <- function(s, D, adjust_setting, covariate_level){ 165 | if(!adjust_setting){ 166 | return(matrix(s, ncol = 1)) 167 | }else{ 168 | if(covariate_level == 'z'){ 169 | return(s * D) 170 | }else if(covariate_level == 'x'){ 171 | return(cbind(s, D[,-1])) 172 | } 173 | } 174 | } 175 | 176 | 177 | #' Check for non-integer values in a matrix 178 | #' 179 | #' This function checks whether a numeric matrix contains any non-integer values. 180 | #' 181 | #' @param mat A numeric matrix. 182 | #' 183 | #' @return A logical value: \code{TRUE} if any element is non-integer, otherwise \code{FALSE}. 184 | #' 185 | #' @keywords internal 186 | #' 187 | has_non_integer <- function(mat) { 188 | for (val in mat) { 189 | if (abs(val - round(val)) > .Machine$double.eps^0.5) return(TRUE) 190 | } 191 | return(FALSE) 192 | } 193 | 194 | 195 | #' Post-process IRLS estimates 196 | #' 197 | #' The IRLS procedure does not guarantee the variance estimates to be postive nor the co-expression parameters to be bounded. 198 | #' To address this, this function evaluates the percentage of genes with negative variance estimates; 199 | #' sets the their co-expressions to 0 as these genes do not have sufficient biological variations. 200 | #' This function also evaluates the percentage of gene pairs with out-of-bound co-expression estimates; 201 | #' sets the co-expressions greater than 1 to 1; set the co-expressions smaller than -1 to -1. 202 | #' 203 | #' @param est Estimated co-expression matrix from IRLS 204 | #' 205 | #' @return Post-processed correlation matrix 206 | #' 207 | #' @export 208 | #' 209 | post_process_est <- function(est){ 210 | p <- nrow(est) 211 | # Post-process CS-CORE estimates 212 | neg_gene_inds <- which(sapply(diag(est), function(x) is.infinite(x) | is.na(x))) 213 | #which(is.infinite(diag(est))) 214 | if(length(neg_gene_inds) > 0){ 215 | print(sprintf('%i among %i genes have negative variance estimates. Their co-expressions with other genes were set to 0.', 216 | length(neg_gene_inds), p)) 217 | } 218 | # Negative variances suggest insufficient biological variation, 219 | # and also lack of correlation 220 | est[neg_gene_inds, ] <- 0 221 | est[, neg_gene_inds] <- 0 222 | # Set all diagonal values to 1 223 | diag(est) <- 1 224 | # Gene pairs with out-of-bound estimates 225 | print(sprintf('%.4f%% co-expression estimates were greater than 1 and were set to 1.', 226 | mean(est[upper.tri(est)] > 1, na.rm = T) * 100)) 227 | print(sprintf('%.4f%% co-expression estimates were smaller than -1 and were set to -1.', 228 | mean(est[upper.tri(est)] < -1, na.rm = T) * 100)) 229 | est[est > 1] <- 1 230 | est[est < -1] <- -1 231 | return(est) 232 | } 233 | 234 | --------------------------------------------------------------------------------