├── .github
    ├── .gitignore
    └── workflows
    │   ├── windows-check.yaml
    │   ├── check-release.yaml
    │   ├── R-CMD-check.yaml
    │   ├── pkgdown.yaml
    │   └── test-coverage.yaml
├── vignettes
    ├── .gitignore
    ├── figure
    │   ├── figt-1.png
    │   └── figrho-1.png
    ├── covariate_adjustment.Rmd.orig
    ├── covariate_adjustment.Rmd
    └── CSCORE.Rmd
├── LICENSE
├── src
    ├── CSCORE.so
    ├── WLS_mean.o
    ├── WSL_cov.o
    ├── RcppExports.o
    ├── CSCORE_IRLS_cpp.o
    ├── Makevars.win
    ├── WLS_cov.h
    ├── WLS_mean.h
    ├── Makevars
    ├── WLS_mean.cpp
    ├── WLS_cov.cpp
    ├── RcppExports.cpp
    └── CSCORE_IRLS_cpp.cpp
├── R
    ├── zzz.R
    ├── RcppExports.R
    ├── data.R
    ├── WLS_helper.R
    ├── CSCORE_IRLS_base.R
    ├── CSCORE.R
    ├── CSCORE_IRLS.R
    └── CSCORE_IRLS_cpp.R
├── data
    ├── ind_gene_pair.rda
    └── top_enrich_go.rda
├── .gitignore
├── .Rbuildignore
├── tests
    ├── testthat
    │   ├── fixtures
    │   │   └── mic_highly_expressed_500_AD_control
    │   ├── test-CSCORE.R
    │   ├── test-IRLS_versions.R
    │   ├── test-CSCORE_IRLS.R
    │   └── test-covariates.R
    └── testthat.R
├── NAMESPACE
├── man
    ├── has_non_integer.Rd
    ├── WLS_mean.Rd
    ├── top_enrich_go.Rd
    ├── ind_gene_pair.Rd
    ├── WLS_cov.Rd
    ├── post_process_est.Rd
    ├── set_D.Rd
    ├── CSCORE_IRLS_base.Rd
    ├── CSCORE.Rd
    ├── CSCORE_IRLS.Rd
    └── CSCORE_IRLS_cpp.Rd
├── CSCORE.Rproj
├── data-raw
    └── ind_gene_pair.R
├── LICENSE.md
├── DESCRIPTION
├── _pkgdown.yml
├── NEWS.md
└── README.md


/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | 


--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2025
2 | COPYRIGHT HOLDER: CS-CORE authors
3 | 


--------------------------------------------------------------------------------
/src/CSCORE.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChangSuBiostats/CS-CORE/HEAD/src/CSCORE.so


--------------------------------------------------------------------------------
/src/WLS_mean.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChangSuBiostats/CS-CORE/HEAD/src/WLS_mean.o


--------------------------------------------------------------------------------
/src/WSL_cov.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChangSuBiostats/CS-CORE/HEAD/src/WSL_cov.o


--------------------------------------------------------------------------------
/src/RcppExports.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChangSuBiostats/CS-CORE/HEAD/src/RcppExports.o


--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
1 | #' @useDynLib CSCORE, .registration = TRUE
2 | #' @importFrom Rcpp evalCpp
3 | NULL
4 | 


--------------------------------------------------------------------------------
/data/ind_gene_pair.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChangSuBiostats/CS-CORE/HEAD/data/ind_gene_pair.rda


--------------------------------------------------------------------------------
/data/top_enrich_go.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChangSuBiostats/CS-CORE/HEAD/data/top_enrich_go.rda


--------------------------------------------------------------------------------
/src/CSCORE_IRLS_cpp.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChangSuBiostats/CS-CORE/HEAD/src/CSCORE_IRLS_cpp.o


--------------------------------------------------------------------------------
/vignettes/figure/figt-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChangSuBiostats/CS-CORE/HEAD/vignettes/figure/figt-1.png


--------------------------------------------------------------------------------
/vignettes/figure/figrho-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChangSuBiostats/CS-CORE/HEAD/vignettes/figure/figrho-1.png


--------------------------------------------------------------------------------
/src/Makevars.win:
--------------------------------------------------------------------------------
1 | CXX_STD = CXX17
2 | PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
3 | PKG_CPPFLAGS = -DARMA_USE_CURRENT
4 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .Rproj.user
 2 | .Rhistory
 3 | .Rdata
 4 | .rds
 5 | .httr-oauth
 6 | .DS_Store
 7 | inst/doc
 8 | docs
 9 | src/*.o
10 | /doc/
11 | /Meta/
12 | 


--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^CSCORE\.Rproj$
 2 | ^\.Rproj\.user$
 3 | ^LICENSE\.md$
 4 | ^data-raw$
 5 | ^\.github$
 6 | ^_pkgdown\.yml$
 7 | ^docs$
 8 | ^pkgdown$
 9 | ^doc$
10 | ^Meta$
11 | 


--------------------------------------------------------------------------------
/tests/testthat/fixtures/mic_highly_expressed_500_AD_control:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChangSuBiostats/CS-CORE/HEAD/tests/testthat/fixtures/mic_highly_expressed_500_AD_control


--------------------------------------------------------------------------------
/src/WLS_cov.h:
--------------------------------------------------------------------------------
 1 | #ifndef WLS_COV_H
 2 | #define WLS_COV_H
 3 | 
 4 | #include <RcppArmadillo.h>
 5 | 
 6 | // Declaration of the function
 7 | Rcpp::List WLS_cov(arma::mat D, arma::mat X, arma::mat W);
 8 | 
 9 | #endif
10 | 


--------------------------------------------------------------------------------
/src/WLS_mean.h:
--------------------------------------------------------------------------------
 1 | #ifndef WLS_MEAN_H
 2 | #define WLS_MEAN_H
 3 | 
 4 | #include <RcppArmadillo.h>
 5 | 
 6 | // Declaration of the function
 7 | arma::mat WLS_mean(arma::mat D, arma::mat X, arma::mat W);
 8 | 
 9 | #endif
10 | 


--------------------------------------------------------------------------------
/tests/testthat/test-CSCORE.R:
--------------------------------------------------------------------------------
1 | test_that("input must be a Seurat object", {
2 |   test_dat = readRDS("fixtures/mic_highly_expressed_500_AD_control")
3 |   count = test_dat[[1]]
4 |   genes = colnames(count)
5 |   expect_error(CSCORE(count, genes))
6 | })
7 | 
8 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(CSCORE)
 4 | export(CSCORE_IRLS_base)
 5 | export(CSCORE_IRLS_cpp)
 6 | export(WLS_cov)
 7 | export(WLS_mean)
 8 | export(post_process_est)
 9 | importFrom(Rcpp,evalCpp)
10 | useDynLib(CSCORE, .registration = TRUE)
11 | 


--------------------------------------------------------------------------------
/src/Makevars:
--------------------------------------------------------------------------------
1 | ## Use RcppArmadillo for C++ compilation
2 | PKG_CPPFLAGS = $(shell ${R_HOME}/bin/Rscript -e "RcppArmadillo:::CxxFlags()")
3 | 
4 | ## Link against the same LAPACK/BLAS libraries that R was built with
5 | PKG_LIBS = $(shell ${R_HOME}/bin/Rscript -e "RcppArmadillo:::LdFlags()") \
6 |             $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
7 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
 1 | # This file is part of the standard setup for testthat.
 2 | # It is recommended that you do not modify it.
 3 | #
 4 | # Where should you do additional test configuration?
 5 | # Learn more about the roles of various files in:
 6 | # * https://r-pkgs.org/tests.html
 7 | # * https://testthat.r-lib.org/reference/test_package.html#special-files
 8 | 
 9 | library(testthat)
10 | library(CSCORE)
11 | 
12 | test_check("CSCORE")
13 | 


--------------------------------------------------------------------------------
/man/has_non_integer.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/CSCORE_IRLS_cpp.R
 3 | \name{has_non_integer}
 4 | \alias{has_non_integer}
 5 | \title{Check for non-integer values in a matrix}
 6 | \usage{
 7 | has_non_integer(mat)
 8 | }
 9 | \arguments{
10 | \item{mat}{A numeric matrix.}
11 | }
12 | \value{
13 | A logical value: \code{TRUE} if any element is non-integer, otherwise \code{FALSE}.
14 | }
15 | \description{
16 | This function checks whether a numeric matrix contains any non-integer values.
17 | }
18 | \keyword{internal}
19 | 


--------------------------------------------------------------------------------
/CSCORE.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | ProjectId: e3c151dd-dd48-4daa-8123-01d6ab649e4b
 3 | 
 4 | RestoreWorkspace: No
 5 | SaveWorkspace: No
 6 | AlwaysSaveHistory: Default
 7 | 
 8 | EnableCodeIndexing: Yes
 9 | UseSpacesForTab: Yes
10 | NumSpacesForTab: 2
11 | Encoding: UTF-8
12 | 
13 | RnwWeave: Sweave
14 | LaTeX: pdfLaTeX
15 | 
16 | AutoAppendNewline: Yes
17 | StripTrailingWhitespace: Yes
18 | LineEndingConversion: Posix
19 | 
20 | BuildType: Package
21 | PackageUseDevtools: Yes
22 | PackageInstallArgs: --no-multiarch --with-keep.source
23 | PackageRoxygenize: rd,collate,namespace
24 | 


--------------------------------------------------------------------------------
/R/RcppExports.R:
--------------------------------------------------------------------------------
 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 3 | 
 4 | CSCORE_IRLS_cpp_impl <- function(X, seq_depth_sq, D_mu, D_sigma2, D_sigma, post_process = TRUE, n_iter = 10L, eps = 0.05, verbose = FALSE, conv = "q95", return_all = FALSE) {
 5 |     .Call(`_CSCORE_CSCORE_IRLS_cpp_impl`, X, seq_depth_sq, D_mu, D_sigma2, D_sigma, post_process, n_iter, eps, verbose, conv, return_all)
 6 | }
 7 | 
 8 | WLS_cov <- function(D, X, W) {
 9 |     .Call(`_CSCORE_WLS_cov`, D, X, W)
10 | }
11 | 
12 | WLS_mean <- function(D, X, W) {
13 |     .Call(`_CSCORE_WLS_mean`, D, X, W)
14 | }
15 | 
16 | 


--------------------------------------------------------------------------------
/man/WLS_mean.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/WLS_helper.R
 3 | \name{WLS_mean}
 4 | \alias{WLS_mean}
 5 | \title{Weighted Least Squares Mean and Variance Estimation (Rcpp)}
 6 | \usage{
 7 | WLS_mean(D, X, W)
 8 | }
 9 | \arguments{
10 | \item{D}{Design matrix for WLS (size n x k), where the first column represents baseline mean/var,
11 | and others represents covariates.}
12 | 
13 | \item{X}{Response matrix for WLS (size n x p)}
14 | 
15 | \item{W}{Weight matrix (n x p)}
16 | }
17 | \value{
18 | A k x p matrix of WLS estimates
19 | }
20 | \description{
21 | Compute WLS estimates of mean or variance for all p genes
22 | }
23 | 


--------------------------------------------------------------------------------
/man/top_enrich_go.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{top_enrich_go}
 5 | \alias{top_enrich_go}
 6 | \title{Top GO enrichment result in the CS-CORE vignette}
 7 | \format{
 8 | \subsection{\code{top_enrich_go}}{
 9 | 
10 | A list with five elements, where each element holds the top 3 GO terms enriched in a gene module with strong enrichment signals
11 | }
12 | }
13 | \usage{
14 | top_enrich_go
15 | }
16 | \description{
17 | This data set stores the top GO enrichment result from the analysis in CS-CORE vignette.
18 | It is saved for efficiently rendering the vignette without re-evaluating the codes.
19 | }
20 | \keyword{datasets}
21 | 


--------------------------------------------------------------------------------
/man/ind_gene_pair.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{ind_gene_pair}
 5 | \alias{ind_gene_pair}
 6 | \title{A simulated independent gene pair}
 7 | \format{
 8 | \subsection{\code{ind_gene_pair}}{
 9 | 
10 | A list with 2 elements:
11 | \describe{
12 | \item{counts}{A 1972 by 2 count matrix with 1972 cells and 2 genes}
13 | \item{seq_depths}{A length 1972 vector of sequencing depths}
14 | }
15 | }
16 | }
17 | \usage{
18 | ind_gene_pair
19 | }
20 | \description{
21 | This data set is created only for illustrative purposes and is used to test CSCORE_IRLS.R.
22 | The source code for generating this data set is in data-raw/ on Github.
23 | }
24 | \keyword{datasets}
25 | 


--------------------------------------------------------------------------------
/man/WLS_cov.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/WLS_helper.R
 3 | \name{WLS_cov}
 4 | \alias{WLS_cov}
 5 | \title{Weighted Least Squares Covariance Estimation (Rcpp)}
 6 | \usage{
 7 | WLS_cov(D, X, W)
 8 | }
 9 | \arguments{
10 | \item{D}{Design matrix for WLS (size n x k)}
11 | 
12 | \item{X}{Gene expression matrix (n x p)}
13 | 
14 | \item{W}{Weight matrix (n x p)}
15 | }
16 | \value{
17 | A list with:
18 | \describe{
19 | \item{cov_hat}{A k x p x p array of WLS estimates}
20 | \item{test_stat}{A p x p matrix of test statistics}
21 | }
22 | }
23 | \description{
24 | Computes WLS estimates of gene-gene covariance and test statistics that assess
25 | the statistical significance of co-expression using moment-based regressions.
26 | }
27 | 


--------------------------------------------------------------------------------
/data-raw/ind_gene_pair.R:
--------------------------------------------------------------------------------
 1 | ## code to prepare `ind_gene_pair` dataset goes here
 2 | 
 3 | # Generate highly variable seq depths from a log normal distribution
 4 | set.seed(202212)
 5 | sim_seq_depths <- exp(rnorm(2000, 7.3, sd=0.6))
 6 | sim_seq_depths <- sim_seq_depths[!sim_seq_depths < 400]
 7 | sim_seq_depths <- round(sim_seq_depths)
 8 | summary(sim_seq_depths)
 9 | 
10 | # Generate an independent gene pair
11 | n <- length(sim_seq_depths)
12 | z_mat <- matrix(rgamma(2*n, shape=4.83, scale=7.56e-05),
13 |                 nrow = n, ncol = 2)
14 | x_mat <- matrix(rpois(n = 2*n, lambda=z_mat * sim_seq_depths), nrow = n, ncol = 2)
15 | summary(x_mat[,1])
16 | summary(x_mat[,2])
17 | 
18 | ind_gene_pair <- list(counts = x_mat, seq_depths = sim_seq_depths)
19 | 
20 | usethis::use_data(ind_gene_pair, overwrite = TRUE)
21 | 


--------------------------------------------------------------------------------
/.github/workflows/windows-check.yaml:
--------------------------------------------------------------------------------
 1 | name: Windows (R-release)
 2 | 
 3 | on:
 4 |   push:
 5 |   pull_request:
 6 |   workflow_dispatch:
 7 | 
 8 | jobs:
 9 |   check:
10 |     runs-on: windows-latest
11 |     steps:
12 |       - uses: actions/checkout@v4
13 |       - uses: r-lib/actions/setup-r@v2
14 |         with:
15 |           r-version: 'release'
16 |       - uses: r-lib/actions/setup-r-dependencies@v2
17 |         with:
18 |           extra-packages: any::rcmdcheck, any::pak
19 |           needs: check
20 |       - name: Ensure latest Rcpp/RcppArmadillo
21 |         run: |
22 |           Rscript -e "pak::pkg_install(c('Rcpp','RcppArmadillo'))"
23 |       - name: Check package
24 |         env:
25 |           _R_CHECK_FORCE_SUGGESTS_: false
26 |         run: |
27 |           Rscript -e "rcmdcheck::rcmdcheck('.', args=c('--no-manual','--as-cran'), error_on='error')"
28 | 


--------------------------------------------------------------------------------
/.github/workflows/check-release.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | on:
 4 |   push:
 5 |     branches: [main, master]
 6 |   pull_request:
 7 |     branches: [main, master]
 8 | 
 9 | name: R-CMD-check
10 | 
11 | jobs:
12 |   R-CMD-check:
13 |     runs-on: ubuntu-latest
14 |     env:
15 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
16 |       R_KEEP_PKG_SOURCE: yes
17 |     steps:
18 |       - uses: actions/checkout@v3
19 | 
20 |       - uses: r-lib/actions/setup-r@v2
21 |         with:
22 |           use-public-rspm: true
23 | 
24 |       - uses: r-lib/actions/setup-r-dependencies@v2
25 |         with:
26 |           extra-packages: any::rcmdcheck
27 |           needs: check
28 | 
29 |       - uses: r-lib/actions/check-r-package@v2
30 | 


--------------------------------------------------------------------------------
/man/post_process_est.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/CSCORE_IRLS_cpp.R
 3 | \name{post_process_est}
 4 | \alias{post_process_est}
 5 | \title{Post-process IRLS estimates}
 6 | \usage{
 7 | post_process_est(est)
 8 | }
 9 | \arguments{
10 | \item{est}{Estimated co-expression matrix from IRLS}
11 | }
12 | \value{
13 | Post-processed correlation matrix
14 | }
15 | \description{
16 | The IRLS procedure does not guarantee the variance estimates to be postive nor the co-expression parameters to be bounded.
17 | To address this, this function evaluates the percentage of genes with negative variance estimates;
18 | sets the their co-expressions to 0 as these genes do not have sufficient biological variations.
19 | This function also evaluates the percentage of gene pairs with out-of-bound co-expression estimates;
20 | sets the co-expressions greater than 1 to 1; set the co-expressions smaller than -1 to -1.
21 | }
22 | 


--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
 1 | #' A simulated independent gene pair
 2 | #'
 3 | #' This data set is created only for illustrative purposes and is used to test CSCORE_IRLS.R.
 4 | #' The source code for generating this data set is in data-raw/ on Github.
 5 | #'
 6 | #' @format ## `ind_gene_pair`
 7 | #' A list with 2 elements:
 8 | #' \describe{
 9 | #'   \item{counts}{A 1972 by 2 count matrix with 1972 cells and 2 genes}
10 | #'   \item{seq_depths}{A length 1972 vector of sequencing depths}
11 | #' }
12 | "ind_gene_pair"
13 | 
14 | #' Top GO enrichment result in the CS-CORE vignette
15 | #'
16 | #' This data set stores the top GO enrichment result from the analysis in CS-CORE vignette.
17 | #' It is saved for efficiently rendering the vignette without re-evaluating the codes.
18 | #'
19 | #' @format ## `top_enrich_go`
20 | #' A list with five elements, where each element holds the top 3 GO terms enriched in a gene module with strong enrichment signals
21 | #'
22 | "top_enrich_go"
23 | 
24 | 


--------------------------------------------------------------------------------
/man/set_D.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/CSCORE_IRLS_cpp.R
 3 | \name{set_D}
 4 | \alias{set_D}
 5 | \title{Set the design matrix for moment-based regressions}
 6 | \usage{
 7 | set_D(s, D, adjust_setting, covariate_level)
 8 | }
 9 | \arguments{
10 | \item{s}{A numeric vector of sequencing depths (for mean regression) or squared sequencing depths (for variance and covariance)}
11 | 
12 | \item{D}{A numeric matrix of intercept and covariates (\code{n x K})}
13 | 
14 | \item{adjust_setting}{Logical; whether to adjust for covariates}
15 | 
16 | \item{covariate_level}{A character string indicating whether covariates are assumed to affect
17 | the underlying gene expression levels (\code{"z"}) or the observed counts (\code{"x"}).}
18 | }
19 | \value{
20 | Design matrix (n by K) for moment-based regressions
21 | }
22 | \description{
23 | Set the design matrix for moment-based regressions
24 | }
25 | \keyword{internal}
26 | 


--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | on:
 4 |   push:
 5 |     branches: [main, master]
 6 |   pull_request:
 7 |     branches: [main, master]
 8 | 
 9 | name: R-CMD-check
10 | 
11 | jobs:
12 |   R-CMD-check:
13 |     runs-on: ubuntu-latest
14 |     env:
15 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
16 |       R_KEEP_PKG_SOURCE: yes
17 |     steps:
18 |       - uses: actions/checkout@v3
19 | 
20 |       - uses: r-lib/actions/setup-r@v2
21 |         with:
22 |           use-public-rspm: true
23 | 
24 |       - name: Install system dependencies
25 |         run: sudo apt-get update && sudo apt-get install -y liblapack-dev libopenblas-dev
26 | 
27 |       - uses: r-lib/actions/setup-r-dependencies@v2
28 |         with:
29 |           extra-packages: any::rcmdcheck
30 |           needs: check
31 | 
32 |       - uses: r-lib/actions/check-r-package@v2
33 |         with:
34 |           error-on: '"error"'
35 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | Copyright (c) 2025 CS-CORE authors
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: CSCORE
 2 | Title: Cell-type-specific co-expression inference from single cell RNA-sequencing data
 3 | Version: 1.0.2
 4 | Authors@R: c(
 5 |     person("Chang", "Su", , "chang.su@emory.edu", role = c("aut", "cre"),
 6 |            comment = c(ORCID = "0000-0002-8704-1512")),
 7 |     person("Zichun", "Xu", , "zichun.xu@yale.edu", role = c("aut"),
 8 |            comment = c(ORCID = "0000-0002-4001-0321")),
 9 |     person("Xinning", "Shan", , "xinning.shan@yale.edu", role = c("aut"),
10 |            comment = c(ORCID = "0000-0001-6270-0094"))
11 |     )
12 | Description: This R package provides an implementation for the statistical method CS-CORE that estimates and tests cell-type-speicific co-expression networks using single cell RNA sequencing data.
13 | License: MIT + file LICENSE
14 | Encoding: UTF-8
15 | Roxygen: list(markdown = TRUE)
16 | RoxygenNote: 7.3.2
17 | Imports: 
18 |     stats,
19 |     Rcpp
20 | LinkingTo: Rcpp, RcppArmadillo
21 | Depends: 
22 |     R (>= 2.10)
23 | LazyData: true
24 | Suggests: 
25 |     knitr,
26 |     rmarkdown,
27 |     MASS,
28 |     Seurat,
29 |     testthat (>= 3.0.0)
30 | Config/testthat/edition: 3
31 | VignetteBuilder: knitr
32 | URL: https://changsubiostats.github.io/CS-CORE/
33 | 


--------------------------------------------------------------------------------
/R/WLS_helper.R:
--------------------------------------------------------------------------------
 1 | #' Weighted Least Squares Covariance Estimation (Rcpp)
 2 | #'
 3 | #' Computes WLS estimates of gene-gene covariance and test statistics that assess
 4 | #' the statistical significance of co-expression using moment-based regressions.
 5 | #'
 6 | #' @param D Design matrix for WLS (size n x k)
 7 | #' @param X Gene expression matrix (n x p)
 8 | #' @param W Weight matrix (n x p)
 9 | #'
10 | #' @return A list with:
11 | #' \describe{
12 | #'   \item{cov_hat}{A k x p x p array of WLS estimates}
13 | #'   \item{test_stat}{A p x p matrix of test statistics}
14 | #' }
15 | #'
16 | #' @export
17 | WLS_cov <- function(D, X, W) {
18 |   .Call(`_CSCORE_WLS_cov`, D, X, W)
19 | }
20 | 
21 | #' Weighted Least Squares Mean and Variance Estimation (Rcpp)
22 | #'
23 | #' Compute WLS estimates of mean or variance for all p genes
24 | #'
25 | #' @param D Design matrix for WLS (size n x k), where the first column represents baseline mean/var,
26 | #'   and others represents covariates.
27 | #' @param X Response matrix for WLS (size n x p)
28 | #' @param W Weight matrix (n x p)
29 | #'
30 | #' @return A k x p matrix of WLS estimates
31 | #'
32 | #' @export
33 | WLS_mean <- function(D, X, W) {
34 |   .Call(`_CSCORE_WLS_mean`, D, X, W)
35 | }
36 | 


--------------------------------------------------------------------------------
/tests/testthat/test-IRLS_versions.R:
--------------------------------------------------------------------------------
 1 | test_that("CSCORE_IRLS Rcpp version matches base version on a simulated independent gene pair", {
 2 |   expect_equal(CSCORE_IRLS_cpp(ind_gene_pair$counts, ind_gene_pair$seq_depths,
 3 |                                IRLS_par = list(n_iter = 10, eps = 0.05, verbose = FALSE, conv = 'max')),
 4 |                CSCORE_IRLS_base(ind_gene_pair$counts, ind_gene_pair$seq_depths), tolerance = 1e-8)
 5 | })
 6 | 
 7 | test_that("CSCORE_IRLS Rcpp version matches base version on real data from 3k cells and on 500 genes", {
 8 |   test_dat = readRDS("fixtures/mic_highly_expressed_500_AD_control")
 9 |   count = test_dat[[1]]
10 |   seq_depth = test_dat[[2]]
11 |   expect_equal(CSCORE_IRLS_cpp(count, seq_depth,
12 |                                IRLS_par = list(n_iter = 10, eps = 0.05, verbose = FALSE, conv = 'max')),
13 |                CSCORE_IRLS_base(count, seq_depth), tolerance = 1e-8)
14 | })
15 | 
16 | test_that("CSCORE_IRLS Rcpp version matches base version on real data from 3k cells and on 500 genes", {
17 |   test_dat = readRDS("fixtures/mic_highly_expressed_500_AD_control")
18 |   count = test_dat[[1]]
19 |   seq_depth = test_dat[[2]]
20 |   expect_equal(CSCORE_IRLS_cpp(count, seq_depth,
21 |                                IRLS_par = list(n_iter = 10, eps = 0.05, verbose = FALSE, conv = 'max')),
22 |                CSCORE_IRLS_base(count, seq_depth), tolerance = 1e-8)
23 | })
24 | 


--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
 1 | url: https://changsubiostats.github.io/CS-CORE/
 2 | template:
 3 |   bootstrap: 5
 4 |   bootswatch: simplex
 5 |   math-rendering: mathjax
 6 | articles:
 7 | - title: Getting started
 8 |   contents: CSCORE
 9 | - title: Covariate Adjustment
10 |   contents: covariate_adjustment
11 | navbar:
12 |   structure:
13 |     left:
14 |     - home
15 |     - intro
16 |     - articles
17 |     - reference
18 |     - news
19 |     right: github
20 |   components:
21 |     articles:
22 |       text: Vignettes
23 |       menu:
24 |       - text: CS-CORE
25 |         href: articles/CSCORE.html
26 |       - text: Covariates adjustment
27 |         href: articles/covariate_adjustment.html
28 | home:
29 |   sidebar:
30 |     structure:
31 |     - license
32 |     - authors
33 |     - links
34 |   links:
35 |   - text: CS-CORE for co-expression
36 |     href: https://doi.org/10.1038/s41467-023-40503-7
37 |   - text: scMultiMap for peak-gene association
38 |     href: https://doi.org/10.1038/s41467-025-59306-z
39 | reference:
40 | - title: CS-CORE functions
41 |   desc: |
42 |     Functions for inferring cell-type-specific co-expression networks
43 |   contents:
44 |   - CSCORE
45 |   - CSCORE_IRLS_cpp
46 |   - CSCORE_IRLS_base
47 |   - post_process_est
48 |   - WLS_mean
49 |   - WLS_cov
50 | - title: Supporting data sets
51 |   desc: |
52 |     Data sets for illustrative purposes
53 |   contents:
54 |   - ind_gene_pair
55 |   - top_enrich_go
56 | 
57 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
 1 | # CSCORE 1.0.2 (2025-09-13)
 2 | 
 3 | ## Improvements
 4 | * Fixed BLAS/LAPACK linking on Windows by adding `src/Makevars.win`.
 5 | * Standardized dimension and loop variables to use `arma::uword`, removing signed/unsigned comparison warnings.
 6 | * Ensured compatibility with R 4.5.1 and RcppArmadillo 15.x (validated via win-builder and GitHub Actions CI).
 7 | * Improved consistency of header includes (`RcppArmadillo.h` first) for more robust builds.
 8 | 
 9 | ## Infrastructure
10 | * Added GitHub Actions workflow for Windows CI with latest R and RcppArmadillo.
11 | * Added Windows build status badge to README.
12 | 
13 | 
14 | # CSCORE 1.0.1 (2025-06-26)
15 | ## Features
16 | * Added support for covariate adjustment in moment-based regressions for estimating mean, variance, and covariance.
17 | 
18 | ## Improvements
19 | * Introduced Rcpp implementations of `WLS_mean()` and `WLS_cov()` to enhance memory efficiency while preserving computational performance.
20 | * Implemented `CSCORE_IRLS_cpp()`, the cpp implementation of the full IRLS procedure to replace the original base R implementation.
21 | * Added tests to ensure consistency with the original base R implementation.
22 | * Updated documentation.
23 | 
24 | # CSCORE 1.0.0
25 | * The first official release of the CS-CORE R pakcage (with NC revision).
26 | 
27 | # CSCORE 0.0.0.9000
28 | 
29 | * The first version of the CS-CORE R package.
30 | * Added a `NEWS.md` file to track changes to the package.
31 | 


--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | on:
 4 |   push:
 5 |     branches: [main, master]
 6 |   pull_request:
 7 |     branches: [main, master]
 8 |   release:
 9 |     types: [published]
10 |   workflow_dispatch:
11 | 
12 | name: pkgdown
13 | 
14 | jobs:
15 |   pkgdown:
16 |     runs-on: ubuntu-latest
17 |     # Only restrict concurrency for non-PR jobs
18 |     concurrency:
19 |       group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
20 |     env:
21 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
22 |     steps:
23 |       - uses: actions/checkout@v3
24 | 
25 |       - uses: r-lib/actions/setup-pandoc@v2
26 | 
27 |       - uses: r-lib/actions/setup-r@v2
28 |         with:
29 |           use-public-rspm: true
30 | 
31 |       - uses: r-lib/actions/setup-r-dependencies@v2
32 |         with:
33 |           extra-packages: any::pkgdown, local::.
34 |           needs: website
35 | 
36 |       - name: Build site
37 |         run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
38 |         shell: Rscript {0}
39 | 
40 |       - name: Deploy to GitHub pages 🚀
41 |         if: github.event_name != 'pull_request'
42 |         uses: JamesIves/github-pages-deploy-action@v4.4.1
43 |         with:
44 |           clean: false
45 |           branch: gh-pages
46 |           folder: docs
47 | 


--------------------------------------------------------------------------------
/.github/workflows/test-coverage.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | on:
 4 |   push:
 5 |     branches: [main, master]
 6 |   pull_request:
 7 |     branches: [main, master]
 8 | 
 9 | name: test-coverage
10 | 
11 | jobs:
12 |   test-coverage:
13 |     runs-on: ubuntu-latest
14 |     env:
15 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
16 | 
17 |     steps:
18 |       - uses: actions/checkout@v3
19 | 
20 |       - uses: r-lib/actions/setup-r@v2
21 |         with:
22 |           use-public-rspm: true
23 | 
24 |       - uses: r-lib/actions/setup-r-dependencies@v2
25 |         with:
26 |           extra-packages: any::covr
27 |           needs: coverage
28 | 
29 |       - name: Test coverage
30 |         run: |
31 |           covr::codecov(
32 |             quiet = FALSE,
33 |             clean = FALSE,
34 |             install_path = file.path(Sys.getenv("RUNNER_TEMP"), "package")
35 |           )
36 |         shell: Rscript {0}
37 | 
38 |       - name: Show testthat output
39 |         if: always()
40 |         run: |
41 |           ## --------------------------------------------------------------------
42 |           find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true
43 |         shell: bash
44 | 
45 |       - name: Upload test results
46 |         if: failure()
47 |         uses: actions/upload-artifact@v4
48 |         with:
49 |           name: coverage-test-failures
50 |           path: ${{ runner.temp }}/package
51 | 


--------------------------------------------------------------------------------
/src/WLS_mean.cpp:
--------------------------------------------------------------------------------
 1 | #include <RcppArmadillo.h>
 2 | // [[Rcpp::depends(RcppArmadillo)]]
 3 | 
 4 | using namespace Rcpp;
 5 | 
 6 | 
 7 | /*
 8 |  * Compute WLS estimates of mean or variance for all p genes
 9 |  *
10 |  * D: Design matrix for WLS (size n x k), where the first column represents baseline mean/var, and others represents covariates
11 |  * X: Response matrix for WLS (size n x p)
12 |  * W: Weight matrix (size n x p)
13 |  *
14 |  * Returns: numeric matrix of size k x p, where (, j) stores the WLS estimates for gene j
15 |  */
16 | // [[Rcpp::export]]
17 | arma::mat WLS_mean(arma::mat D, arma::mat X, arma::mat W) {
18 |   const arma::uword n = D.n_rows;   // number of rows of D (i.e., number of cells)
19 |   const arma::uword k = D.n_cols;   // number of columns of D (i.e., number of covariates)
20 |   const arma::uword p = X.n_cols;   // number of genes
21 | 
22 |   if (X.n_rows != n) {
23 |     stop("Dimensions of design matrix and gene expression data do not match.");
24 |   }
25 | 
26 |   arma::mat result(k, p, arma::fill::zeros);  // k x p matrix, WLS estimates
27 | 
28 |   arma::mat D_T = D.t();  // precompute transpose once
29 | 
30 |   for (arma::uword j = 0; j < p; ++j) {
31 |     // Compute D^T * diag(w) * D
32 |     arma::mat DTD_w = D_T * (D.each_col() % W.col(j));
33 | 
34 |     // Inverse of DTD_w
35 |     arma::mat inv_DTD_w = arma::inv(DTD_w);
36 | 
37 |     // Compute D^T * (w % x_prod)
38 |     arma::vec Dt_w_x = D_T * (X.col(j) % W.col(j));
39 | 
40 |     // WLS estimator
41 |     arma::vec res_vec = inv_DTD_w * Dt_w_x;
42 | 
43 |     // Save WLS estimates
44 |     result.col(j) = res_vec;
45 |   }
46 | 
47 |   return result;
48 | }
49 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # CS-CORE
 2 | [![R-CMD-check](https://github.com/ChangSuBiostats/CS-CORE/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/ChangSuBiostats/CS-CORE/actions/workflows/R-CMD-check.yaml)
 3 | ![Windows CI](https://github.com/ChangSuBiostats/CS-CORE/actions/workflows/windows-check.yaml/badge.svg)
 4 | [![DOI](https://zenodo.org/badge/576326164.svg)](https://zenodo.org/badge/latestdoi/576326164)
 5 | 
 6 | `CS-CORE` is a R package for cell-type-specific co-expression inference from single cell RNA-sequencing data. 
 7 | 
 8 | ## Reference
 9 | Su, Chang, et al. "Cell-type-specific co-expression inference from single cell RNA-sequencing data." *Nature Communications* 14.1 (2023): 4846. (https://doi.org/10.1038/s41467-023-40503-7)
10 | 
11 | ## Installation
12 | 
13 | `CS-CORE` is available on GitHub. You can install it using
14 | 
15 | ``` r
16 | ## Load devtools for installing R packages from Github
17 | library(devtools)
18 | ## Install CS-CORE from Github
19 | install_github("ChangSuBiostats/CS-CORE")
20 | ```
21 | 
22 | ## Vignettes
23 | 
24 | 1. [Get started](https://changsubiostats.github.io/CS-CORE/articles/CSCORE.html) shows an example of using CS-CORE for cell-type-specific co-expression analysis with single cell RNA-sequencing data. 
25 | It includes inferring co-expressions, extracting co-expressed gene modules and functional enrichment analysis.
26 | 
27 | 2. [Covariate adjustment](https://changsubiostats.github.io/CS-CORE/articles/covariate_adjustment.html) shows how to adjust for 
28 | covariates in co-expression inference with CS-CORE.
29 | 
30 | 
31 | ## Contact us 
32 | 
33 | For issues or feature requests, please visit GitHub Issues. If an issue remains unanswered for a while, 
34 | you are welcome to email the maintainer at <chang.su@emory.edu>
35 | 
36 | ## A Python version
37 | 
38 | A Python implementation of CS-CORE is also provided [here](https://github.com/ChangSuBiostats/CS-CORE_python).
39 | 


--------------------------------------------------------------------------------
/tests/testthat/test-CSCORE_IRLS.R:
--------------------------------------------------------------------------------
 1 | test_that("Correct dimension", {
 2 |   test_dat = readRDS("fixtures/mic_highly_expressed_500_AD_control")
 3 |   count = test_dat[[1]]
 4 |   seq_depth = test_dat[[2]]
 5 |   CSCORE_result = CSCORE_IRLS(X = count, seq_depth = seq_depth)
 6 |   rho = CSCORE_result$est
 7 |   pvalues = CSCORE_result$p_value
 8 |   test_stats = CSCORE_result$test_stat
 9 |   p = c(dim(count)[2], dim(count)[2])
10 |   expect_true(all(dim(rho) == p,
11 |                   dim(pvalues) == p,
12 |                   dim(test_stats) == p))
13 | })
14 | 
15 | 
16 | test_that("P-values are in the correct range", {
17 |   test_dat = readRDS("fixtures/mic_highly_expressed_500_AD_control")
18 |   count = test_dat[[1]]
19 |   seq_depth = test_dat[[2]]
20 |   CSCORE_result = CSCORE_IRLS(X = count, seq_depth = seq_depth)
21 |   pvalues = CSCORE_result$p_value
22 |   expect_true(mean(pvalues <= 1, na.rm = T) == 1)
23 | })
24 | 
25 | test_that("Co-expression estimates are bounded between -1 and 1 and the matrix is symmetric", {
26 |   test_dat = readRDS("fixtures/mic_highly_expressed_500_AD_control")
27 |   count = test_dat[[1]]
28 |   seq_depth = test_dat[[2]]
29 |   CSCORE_result = CSCORE_IRLS(X = count, seq_depth = seq_depth, post_process = TRUE)
30 |   est = CSCORE_result$est
31 |   expect_true(all(all(diag(est) == 1),
32 |                   abs(est) <= 1,
33 |                   all.equal(est[upper.tri(est)], t(est)[upper.tri(est)])))
34 | })
35 | 
36 | test_that("The length of the sequencing depth must match the number of cells", {
37 |   test_dat = readRDS("fixtures/mic_highly_expressed_500_AD_control")
38 |   count = test_dat[[1]]
39 |   seq_depth = c(test_dat[[2]], 0)
40 |   expect_error(CSCORE_IRLS(X = count, seq_depth = seq_depth),
41 |                "The length of the sequencing depth must match the number of cells.")
42 | })
43 | 
44 | test_that("The co-expression estimates and p values are computed correctly", {
45 |   cscore_example <- CSCORE_IRLS(ind_gene_pair$counts, ind_gene_pair$seq_depths)
46 |   expect_equal(cscore_example$est[1,2], 0.0078201236)
47 |   expect_equal(cscore_example$p_value[1,2], 0.96198097)
48 | })
49 | 


--------------------------------------------------------------------------------
/man/CSCORE_IRLS_base.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/CSCORE_IRLS_base.R
 3 | \name{CSCORE_IRLS_base}
 4 | \alias{CSCORE_IRLS_base}
 5 | \title{Iteratively reweighted least squares (IRLS) procedure in CS-CORE (base R, archived)}
 6 | \source{
 7 | Su, C., Xu, Z., Shan, X., Cai, B., Zhao, H., & Zhang, J. (2023).
 8 | Cell-type-specific co-expression inference from single cell RNA-sequencing data.
 9 | \emph{Nature Communications}.
10 | doi: \url{https://doi.org/10.1038/s41467-023-40503-7}
11 | }
12 | \usage{
13 | CSCORE_IRLS_base(X, seq_depth, post_process = TRUE)
14 | }
15 | \arguments{
16 | \item{X}{A n by p matrix of UMI counts, where n denotes the number of cells and p denotes the number of genes}
17 | 
18 | \item{seq_depth}{A length n vector of sequencing depths}
19 | 
20 | \item{post_process}{Whether to process the estimated co-expressions such that the estimates are between -1 and 1. Default to TRUE.}
21 | }
22 | \value{
23 | A list of three p by p matrices:
24 | \describe{
25 | \item{est}{co-expression estimates}
26 | \item{p_value}{p values}
27 | \item{test_stat}{test statistics}
28 | }
29 | }
30 | \description{
31 | This function was originally implemented in 2023 and included in the first release of the CS-CORE R package.
32 | It has since been replaced by \code{\link{CSCORE_IRLS}} for two main reasons:
33 | }
34 | \details{
35 | \enumerate{
36 | \item It relies on base R for regression, which is slower and more memory-intensive than the Rcpp-based implementation in \code{CSCORE_IRLS}.
37 | \item It does not support covariate adjustment, a feature supported in \code{CSCORE_IRLS}.
38 | }
39 | }
40 | \note{
41 | This function is retained for reference and backward compatibility, but users are encouraged to use \code{\link{CSCORE_IRLS}} for new analyses.
42 | }
43 | \examples{
44 | ## Toy example:
45 | ## run CSCORE on a simulated independent gene pair
46 | cscore_example <- CSCORE_IRLS_base(ind_gene_pair$counts, ind_gene_pair$seq_depths)
47 | 
48 | ## Estimated co-expression between two genes
49 | cscore_example$est[1,2]
50 | # close to 0: 0.007820124
51 | 
52 | ## p-values
53 | cscore_example$p_value[1,2]
54 | # not significant: 0.961981
55 | 
56 | }
57 | \keyword{internal}
58 | 


--------------------------------------------------------------------------------
/tests/testthat/test-covariates.R:
--------------------------------------------------------------------------------
 1 | test_that("The covaiates' effect sizes are computed correctly", {
 2 |   set.seed(202212)
 3 |   n <- 6000
 4 |   sim_seq_depths <- exp(rnorm(n, 7.3, sd=0.6))
 5 |   sim_seq_depths <- sim_seq_depths[!sim_seq_depths < 400]
 6 |   sim_seq_depths <- round(sim_seq_depths)
 7 | 
 8 |   n <- length(sim_seq_depths)
 9 |   mu <- 4.83 * 7.56e-05 # extracted from a specific gene
10 |   sigma2 <- 4.83 * 7.56e-05^2
11 |   # Simulate a covariate g
12 |   g <- rbinom(n, 2, 0.4)
13 |   effect_sizes <- matrix(c(0.05, 0.1, 0.2, 0.04), nrow = 2)
14 |   rownames(effect_sizes) <- c('mu', 'sigma2')
15 |   effect_sizes
16 |   covar <- sigma2 * 0.1
17 |   covar_effect_size <- 1
18 | 
19 |   z_mat <- matrix(nrow = n, ncol = 2)
20 |   for(g_val in unique(g)){
21 |     g_val_inds <- which(g == g_val)
22 |     covar_g <- covar + covar * covar_effect_size * g_val
23 |     sigma2_val_vec <- numeric(2)
24 |     for(j in 1:2){
25 |       sigma2_val_vec[j] <- sigma2 + sigma2 * effect_sizes['sigma2', j] * g_val
26 |     }
27 |     rho_g <- covar_g / sqrt(prod(sigma2_val_vec))
28 |     z_mat_gaussian <- MASS::mvrnorm(length(g_val_inds), rep(0,2),
29 |                               matrix(c(1, rho_g, rho_g, 1), nrow = 2))
30 | 
31 |     for(j in 1:2){
32 |       mu_val <- mu + mu * effect_sizes['mu', j] * g_val
33 |       sigma2_val <- sigma2 + sigma2 * effect_sizes['sigma2', j] * g_val
34 |       z_mat[g_val_inds,j] <- qgamma(pnorm(z_mat_gaussian[,j]), shape = mu_val^2 / sigma2_val, scale = sigma2_val / mu_val)
35 |     }
36 |   }
37 |   x_mat <- matrix(rpois(n = 2*n, lambda=z_mat * sim_seq_depths), nrow = n, ncol = 2)
38 | 
39 |   cscore_example <- CSCORE_IRLS_cpp(x_mat, sim_seq_depths,
40 |                                     covariates = g,
41 |                                     IRLS_par = list(n_iter = 10, eps = 0.05, verbose = FALSE, conv = 'max'),
42 |                                     return_all = TRUE)
43 |   expect_equal(cscore_example$est[1,2], 0.1068046, tolerance = 1e-6)
44 |   expect_equal(cscore_example$p_value[1,2], 0.5056709, tolerance = 1e-6)
45 |   expect_equal(cscore_example$mu_beta,
46 |                matrix(c(3.654416e-04, 3.484695e-04, 1.090923e-05, 7.330102e-05), 2, 2, byrow=T), tolerance = 1e-6)
47 |   expect_equal(cscore_example$sigma2_beta,
48 |                matrix(c(2.483174e-08, 2.850686e-08, 1.477678e-10, -6.450020e-09), 2, 2, byrow=T), tolerance = 1e-6)
49 |   expect_equal(cscore_example$cov_beta[,,1],
50 |                matrix(c(1.861900e-07, 2.855003e-09, 2.254392e-09, 8.866591e-09), 2, 2, byrow=T), tolerance = 1e-6)
51 | })
52 | 


--------------------------------------------------------------------------------
/man/CSCORE.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/CSCORE.R
 3 | \name{CSCORE}
 4 | \alias{CSCORE}
 5 | \title{CS-CORE for cell-type-specific co-expression network inference}
 6 | \usage{
 7 | CSCORE(
 8 |   object,
 9 |   genes,
10 |   seq_depth = NULL,
11 |   covariate_names = NULL,
12 |   adjust_setting = c(mean = T, var = T, covar = T),
13 |   IRLS_version = "Rcpp",
14 |   IRLS_par = list(n_iter = 10, eps = 0.05, verbose = FALSE)
15 | )
16 | }
17 | \arguments{
18 | \item{object}{A Seurat object containing single-cell RNA-seq data.
19 | The object should be subsetted to cells of a single cell type to ensure cell-type-specific inference.
20 | CS-CORE requires raw UMI counts as input, and assumes that the raw count matrix is stored in the \code{"counts"} slot of the \code{"RNA"} assay
21 | (i.e., \code{object[["RNA"]]@counts}).}
22 | 
23 | \item{genes}{A character vector of gene names (length \eqn{p}) for which the co-expression network will be estimated.}
24 | 
25 | \item{seq_depth}{A numeric vector of sequencing depths (length \eqn{n}).
26 | If \code{NULL}, sequencing depth will be computed as the total UMI count per cell.
27 | Defaults to \code{NULL}.}
28 | 
29 | \item{covariate_names}{Optional. A character vector specifying the names of cell-level covariates to adjust for in the regression models.
30 | These variables will be extracted from \code{object@meta.data[, covariate_names]}. Defaults to \code{NULL}.}
31 | 
32 | \item{adjust_setting}{Optional. A named logical vector of length 3 indicating whether to adjust for covariates in the estimation of mean, variance, and covariance.
33 | Must be named \code{c("mean", "var", "covar")}. Defaults to \code{c(mean = TRUE, var = TRUE, covar = TRUE)}.}
34 | 
35 | \item{IRLS_version}{Optional. A character string specifying the IRLS implementation to use: \code{"Rcpp"} or \code{"base_R"}.
36 | Only the \code{"Rcpp"} version supports covariate adjustment. The \code{"base_R"} version does not.
37 | When applicable, \code{"Rcpp"} offers improved memory efficiency (~10-100 times) but may be slower (~10 times),
38 | while \code{"base_R"} is faster but more memory intensive.
39 | Defaults to \code{"Rcpp"}.}
40 | 
41 | \item{IRLS_par}{Optional. A named list of length 3 specifying parameters for the IRLS algorithm:
42 | \describe{
43 | \item{\code{n_iter}}{Maximum number of iterations.}
44 | \item{\code{eps}}{Convergence threshold for log-ratio change \code{delta}, computed as \code{abs(log(theta / theta_prev))}.}
45 | \item{\code{verbose}}{Logical; whether to print the convergence metric (\code{delta}) at each iteration.}
46 | }
47 | Defaults to \code{list(n_iter = 10, eps = 0.05, verbose = FALSE)}.}
48 | }
49 | \value{
50 | A list of three p by p matrices:
51 | \describe{
52 | \item{est}{Matrix of co-expression estimates.}
53 | \item{p_value}{Matrix of p-values for testing co-expression.}
54 | \item{test_stat}{Matrix of test statistics for evaluating the significance of co-expression.}
55 | }
56 | }
57 | \description{
58 | Run CS-CORE on a Seurat object to infer the cell-type-specific co-expression network for a specified set of genes,
59 | with optional adjustment for covariates.
60 | For more details on the covariate adjustment and the moment-based regression,
61 | please refer to \link{CSCORE_IRLS}.
62 | }
63 | \examples{
64 | # See a full example at:
65 | # https://changsubiostats.github.io/CS-CORE/articles/CSCORE.html
66 | 
67 | }
68 | \seealso{
69 | \link{CSCORE_IRLS}
70 | 
71 | \href{https://changsubiostats.github.io/CS-CORE/articles/CSCORE.html}{CS-CORE online tutorial}
72 | }
73 | 


--------------------------------------------------------------------------------
/src/WLS_cov.cpp:
--------------------------------------------------------------------------------
 1 | #include <RcppArmadillo.h>
 2 | // [[Rcpp::depends(RcppArmadillo)]]
 3 | 
 4 | using namespace Rcpp;
 5 | 
 6 | /*
 7 |  * Compute WLS estimates of covariance for all p by p gene pairs
 8 |  *
 9 |  * D: Design matrix for WLS (size n x k)
10 |  * X: Gene expression matrix (size n x p)
11 |  * W: Weight matrix (size n x p)
12 |  *
13 |  * Returns: a list with two components
14 |  * cov_hat: numeric array of size k x p x p, where (, j ,j') stores the WLS estimates for gene pair j, j' across k covariates
15 |  * test_stat: numeric matrix of size p x p, where (j, j') stores the test statistics for gene j, j' (H_0: independence)
16 |  */
17 | // [[Rcpp::export]]
18 | Rcpp::List WLS_cov(arma::mat D, arma::mat X, arma::mat W) {
19 |   const arma::uword n = D.n_rows;   // number of rows of D (i.e., number of cells)
20 |   const arma::uword k = D.n_cols;   // number of columns of D (i.e., number of covariates)
21 |   const arma::uword p = X.n_cols;   // number of genes
22 | 
23 |   if (X.n_rows != n) {
24 |     stop("Dimensions of design matrix and gene expression datado not match.");
25 |   }
26 | 
27 |   arma::cube result(k, p, p, arma::fill::zeros);  // k x p x p cube, WLS estimates
28 |   //arma::cube inv_DTD_w_cube(k, k, p * p, arma::fill::zeros); // k x k x p x p cube, variance of WLS estimators under the null
29 |   arma::mat ts_mat(p, p, arma::fill::zeros); // p x p matrix, test statistics
30 | 
31 |   arma::mat D_T = D.t();  // precompute transpose once
32 | 
33 |   for (arma::uword j = 0; j < p; ++j) {
34 |     for (arma::uword jprime = j; jprime < p; ++jprime) {
35 | 
36 |       // Compute x_prod = X[, j] * X[, j']
37 |       arma::vec x_prod = X.col(j) % X.col(jprime);  // element-wise product
38 | 
39 |       // Compute weights
40 |       arma::vec w = W.col(j) % W.col(jprime);
41 | 
42 |       // Compute D^T * diag(w) * D
43 |       arma::mat D_weighted = D;
44 |       D_weighted.each_col() %= w;
45 | 
46 |       // Compute inverse of DTD_w
47 |       arma::mat DTD_w = D_T * D_weighted;
48 |       //arma::mat inv_DTD_w = arma::inv_sympd(DTD_w);  // safer inverse
49 |       arma::mat inv_DTD_w;
50 |       //try {
51 |       //  inv_DTD_w = arma::inv_sympd(DTD_w);
52 |       //} catch (...) {
53 |         //Rcpp::Rcout << "j" << j << std::endl;
54 |         //Rcpp::Rcout << "jprime" << jprime << std::endl;
55 |         //Rcpp::Rcout << "DTD_w matrix:\n" << DTD_w << std::endl;
56 |         // Rcpp::Rcout << "[WARNING] inv_sympd failed, using general inverse" << std::endl;
57 |       //  inv_DTD_w = arma::inv(DTD_w);  // more stable but slower
58 |       //}
59 |       inv_DTD_w = arma::inv(DTD_w);  // more stable but slower
60 | 
61 |       // Compute D^T * (w % x_prod)
62 |       arma::vec Dt_w_x = D_T * (w % x_prod);
63 | 
64 |       // WLS estimator
65 |       arma::vec res_vec = inv_DTD_w * Dt_w_x;
66 | 
67 |       // Save covariance estimates
68 |       result.slice(jprime).col(j) = res_vec;
69 | 
70 |       // Save inverse matrix
71 |       //int pair_idx = j * p + jprime;
72 |       //inv_DTD_w_cube.slice(pair_idx) = inv_DTD_w;
73 | 
74 |       // Save test statistics
75 |       ts_mat(j, jprime) = res_vec(0) / std::sqrt(inv_DTD_w(0, 0));
76 |     }
77 |   }
78 | 
79 |   for (arma::uword j = 0; j < p; ++j) {
80 |     for (arma::uword jprime = j + 1; jprime < p; ++jprime) {
81 |       result.slice(j).col(jprime) = result.slice(jprime).col(j);
82 |       ts_mat(jprime, j) = ts_mat(j, jprime);
83 |     }
84 |   }
85 | 
86 |   // Return both result and inverse cube
87 |   return List::create(Named("cov_hat") = result,
88 |                       //Named("var_cov_hat") = inv_DTD_w_cube,
89 |                       Named("test_stat") = ts_mat);
90 | }
91 | 


--------------------------------------------------------------------------------
/src/RcppExports.cpp:
--------------------------------------------------------------------------------
 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand
 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 3 | 
 4 | #include <RcppArmadillo.h>
 5 | #include <Rcpp.h>
 6 | 
 7 | using namespace Rcpp;
 8 | 
 9 | #ifdef RCPP_USE_GLOBAL_ROSTREAM
10 | Rcpp::Rostream<true>&  Rcpp::Rcout = Rcpp::Rcpp_cout_get();
11 | Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
12 | #endif
13 | 
14 | // CSCORE_IRLS_cpp_impl
15 | Rcpp::List CSCORE_IRLS_cpp_impl(const arma::mat& X, const arma::vec& seq_depth_sq, const arma::mat& D_mu, const arma::mat& D_sigma2, const arma::mat& D_sigma, const bool post_process, const int n_iter, const double eps, const bool verbose, const std::string& conv, const bool return_all);
16 | RcppExport SEXP _CSCORE_CSCORE_IRLS_cpp_impl(SEXP XSEXP, SEXP seq_depth_sqSEXP, SEXP D_muSEXP, SEXP D_sigma2SEXP, SEXP D_sigmaSEXP, SEXP post_processSEXP, SEXP n_iterSEXP, SEXP epsSEXP, SEXP verboseSEXP, SEXP convSEXP, SEXP return_allSEXP) {
17 | BEGIN_RCPP
18 |     Rcpp::RObject rcpp_result_gen;
19 |     Rcpp::RNGScope rcpp_rngScope_gen;
20 |     Rcpp::traits::input_parameter< const arma::mat& >::type X(XSEXP);
21 |     Rcpp::traits::input_parameter< const arma::vec& >::type seq_depth_sq(seq_depth_sqSEXP);
22 |     Rcpp::traits::input_parameter< const arma::mat& >::type D_mu(D_muSEXP);
23 |     Rcpp::traits::input_parameter< const arma::mat& >::type D_sigma2(D_sigma2SEXP);
24 |     Rcpp::traits::input_parameter< const arma::mat& >::type D_sigma(D_sigmaSEXP);
25 |     Rcpp::traits::input_parameter< const bool >::type post_process(post_processSEXP);
26 |     Rcpp::traits::input_parameter< const int >::type n_iter(n_iterSEXP);
27 |     Rcpp::traits::input_parameter< const double >::type eps(epsSEXP);
28 |     Rcpp::traits::input_parameter< const bool >::type verbose(verboseSEXP);
29 |     Rcpp::traits::input_parameter< const std::string& >::type conv(convSEXP);
30 |     Rcpp::traits::input_parameter< const bool >::type return_all(return_allSEXP);
31 |     rcpp_result_gen = Rcpp::wrap(CSCORE_IRLS_cpp_impl(X, seq_depth_sq, D_mu, D_sigma2, D_sigma, post_process, n_iter, eps, verbose, conv, return_all));
32 |     return rcpp_result_gen;
33 | END_RCPP
34 | }
35 | // WLS_cov
36 | Rcpp::List WLS_cov(arma::mat D, arma::mat X, arma::mat W);
37 | RcppExport SEXP _CSCORE_WLS_cov(SEXP DSEXP, SEXP XSEXP, SEXP WSEXP) {
38 | BEGIN_RCPP
39 |     Rcpp::RObject rcpp_result_gen;
40 |     Rcpp::RNGScope rcpp_rngScope_gen;
41 |     Rcpp::traits::input_parameter< arma::mat >::type D(DSEXP);
42 |     Rcpp::traits::input_parameter< arma::mat >::type X(XSEXP);
43 |     Rcpp::traits::input_parameter< arma::mat >::type W(WSEXP);
44 |     rcpp_result_gen = Rcpp::wrap(WLS_cov(D, X, W));
45 |     return rcpp_result_gen;
46 | END_RCPP
47 | }
48 | // WLS_mean
49 | arma::mat WLS_mean(arma::mat D, arma::mat X, arma::mat W);
50 | RcppExport SEXP _CSCORE_WLS_mean(SEXP DSEXP, SEXP XSEXP, SEXP WSEXP) {
51 | BEGIN_RCPP
52 |     Rcpp::RObject rcpp_result_gen;
53 |     Rcpp::RNGScope rcpp_rngScope_gen;
54 |     Rcpp::traits::input_parameter< arma::mat >::type D(DSEXP);
55 |     Rcpp::traits::input_parameter< arma::mat >::type X(XSEXP);
56 |     Rcpp::traits::input_parameter< arma::mat >::type W(WSEXP);
57 |     rcpp_result_gen = Rcpp::wrap(WLS_mean(D, X, W));
58 |     return rcpp_result_gen;
59 | END_RCPP
60 | }
61 | 
62 | static const R_CallMethodDef CallEntries[] = {
63 |     {"_CSCORE_CSCORE_IRLS_cpp_impl", (DL_FUNC) &_CSCORE_CSCORE_IRLS_cpp_impl, 11},
64 |     {"_CSCORE_WLS_cov", (DL_FUNC) &_CSCORE_WLS_cov, 3},
65 |     {"_CSCORE_WLS_mean", (DL_FUNC) &_CSCORE_WLS_mean, 3},
66 |     {NULL, NULL, 0}
67 | };
68 | 
69 | RcppExport void R_init_CSCORE(DllInfo *dll) {
70 |     R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
71 |     R_useDynamicSymbols(dll, FALSE);
72 | }
73 | 


--------------------------------------------------------------------------------
/R/CSCORE_IRLS_base.R:
--------------------------------------------------------------------------------
  1 | #' Iteratively reweighted least squares (IRLS) procedure in CS-CORE (base R, archived)
  2 | #'
  3 | #' This function was originally implemented in 2023 and included in the first release of the CS-CORE R package.
  4 | #' It has since been replaced by \code{\link{CSCORE_IRLS}} for two main reasons:
  5 | #'
  6 | #' \enumerate{
  7 | #'   \item It relies on base R for regression, which is slower and more memory-intensive than the Rcpp-based implementation in \code{CSCORE_IRLS}.
  8 | #'   \item It does not support covariate adjustment, a feature supported in \code{CSCORE_IRLS}.
  9 | #' }
 10 | #'
 11 | #' @note This function is retained for reference and backward compatibility, but users are encouraged to use \code{\link{CSCORE_IRLS}} for new analyses.
 12 | #'
 13 | #' @keywords internal
 14 | #'
 15 | #' @param X A n by p matrix of UMI counts, where n denotes the number of cells and p denotes the number of genes
 16 | #' @param seq_depth A length n vector of sequencing depths
 17 | #' @param post_process Whether to process the estimated co-expressions such that the estimates are between -1 and 1. Default to TRUE.
 18 | #'
 19 | #' @return A list of three p by p matrices:
 20 | #' \describe{
 21 | #'   \item{est}{co-expression estimates}
 22 | #'   \item{p_value}{p values}
 23 | #'   \item{test_stat}{test statistics}
 24 | #' }
 25 | #' @export
 26 | #'
 27 | #' @examples
 28 | #' ## Toy example:
 29 | #' ## run CSCORE on a simulated independent gene pair
 30 | #' cscore_example <- CSCORE_IRLS_base(ind_gene_pair$counts, ind_gene_pair$seq_depths)
 31 | #'
 32 | #' ## Estimated co-expression between two genes
 33 | #' cscore_example$est[1,2]
 34 | #' # close to 0: 0.007820124
 35 | #'
 36 | #' ## p-values
 37 | #' cscore_example$p_value[1,2]
 38 | #' # not significant: 0.961981
 39 | #'
 40 | #' @source Su, C., Xu, Z., Shan, X., Cai, B., Zhao, H., & Zhang, J. (2023).
 41 | #' Cell-type-specific co-expression inference from single cell RNA-sequencing data.
 42 | #' \emph{Nature Communications}.
 43 | #' doi: <https://doi.org/10.1038/s41467-023-40503-7>
 44 | #'
 45 | CSCORE_IRLS_base <- function(X, seq_depth, post_process = TRUE){
 46 |   if (is.null(seq_depth)) {
 47 |     seq_depth = apply(X, 1, sum, na.rm = T)
 48 |   }
 49 |   if(nrow(X) != length(seq_depth)){
 50 |     stop('The length of the sequencing depth must match the number of cells.')
 51 |   }
 52 |   n_cell = nrow(X)
 53 |   n_gene = ncol(X)
 54 |   seq_depth_sq = seq_depth^2
 55 |   seq_2 = sum(seq_depth_sq)
 56 |   seq_4 = sum(seq_depth^4)
 57 |   mu = colSums(X * seq_depth)/seq_2
 58 |   M = outer(seq_depth, mu)
 59 |   X_centered = X - M
 60 |   sigma2 = colSums(((X_centered^2 - M) * seq_depth_sq))/seq_4
 61 |   theta = mu^2/sigma2
 62 |   j = 0
 63 |   delta = Inf
 64 | 
 65 |   while( delta > 0.05 & j <= 10 ){
 66 |     theta_previous = theta
 67 |     theta_median = stats::quantile(theta[theta > 0], na.rm = T, probs = 0.5)
 68 |     theta[theta < 0] = Inf
 69 |     w = M + outer(seq_depth_sq, mu^2/theta_median)
 70 |     w[is.na(w)|w <= 0] = 1
 71 |     mu = colSums((X/w) * seq_depth)/colSums(seq_depth_sq/w)
 72 |     M = outer(seq_depth, mu)
 73 |     X_centered = X - M
 74 |     h = (M^2/theta_median + M)^2
 75 |     h[h <= 0] = 1
 76 |     sigma2 = colSums(((X_centered^2 - M)/h * seq_depth_sq))/colSums(seq_depth_sq^2/h)
 77 |     theta = mu^2/sigma2
 78 |     j = j+1
 79 |     # print(paste0("Iteration: ", j, ", Median: ", theta_median))
 80 |     delta = max(abs(log((theta/theta_previous)[theta > 0 & theta_previous > 0])), na.rm = T)
 81 |     # print(delta)
 82 |   }
 83 |   if(j == 10 & delta > 0.05){
 84 |     print('IRLS failed to converge after 10 iterations. Please check your data.')
 85 |   }else{
 86 |     print(sprintf('IRLS converged after %i iterations.', j))
 87 |   }
 88 | 
 89 |   theta_median = stats::quantile(theta[theta > 0], na.rm = T, probs = 0.5)
 90 |   theta[theta < 0] = Inf
 91 |   w = M + outer(seq_depth_sq, mu^2/theta_median)
 92 |   w[is.na(w)|w <= 0] = 1
 93 | 
 94 |   covar = matrix(NA, nrow = n_gene, ncol = n_gene)
 95 |   covar <- (t(seq_depth_sq * X_centered/w) %*% (X_centered/w))/(t(seq_depth_sq/w) %*% (seq_depth_sq/w))
 96 | 
 97 |   # Evaluate test statistics and p values
 98 |   Sigma <- M + outer(seq_depth_sq, sigma2)
 99 |   ele_inv_Sigma <- 1/Sigma
100 |   X_centered_scaled <- X_centered * ele_inv_Sigma
101 |   num <- t(seq_depth_sq * X_centered_scaled) %*% X_centered_scaled
102 |   deno <- sqrt(t(seq_depth^4 * ele_inv_Sigma) %*% ele_inv_Sigma)
103 |   test_stat <- num/deno
104 |   p_value <- 2 * stats::pnorm(abs(test_stat), lower.tail = F)
105 | 
106 |   # Evaluate co-expression estimates
107 |   neg_gene_inds <- which(sigma2 < 0)
108 |   sigma2[neg_gene_inds] <- 0
109 |   sigma <- sqrt(sigma2)
110 |   est <- covar/outer(sigma, sigma)
111 | 
112 |   # Post-process the co-expression estimates
113 |   if(post_process) est <- post_process_est(est)
114 |   return(list(est = est, p_value = p_value, test_stat = test_stat))
115 | }
116 | 


--------------------------------------------------------------------------------
/man/CSCORE_IRLS.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/CSCORE_IRLS.R
 3 | \name{CSCORE_IRLS}
 4 | \alias{CSCORE_IRLS}
 5 | \title{Iteratively reweighted least squares (IRLS) procedure in CS-CORE}
 6 | \source{
 7 | Su, C., Xu, Z., Shan, X., Cai, B., Zhao, H., & Zhang, J. (2023).
 8 | Cell-type-specific co-expression inference from single cell RNA-sequencing data.
 9 | \emph{Nature Communications}.
10 | doi: \url{https://doi.org/10.1038/s41467-023-40503-7}
11 | }
12 | \usage{
13 | CSCORE_IRLS(
14 |   X,
15 |   seq_depth,
16 |   covariates = NULL,
17 |   post_process = TRUE,
18 |   covariate_level = "z",
19 |   adjust_setting = c(mean = T, var = T, covar = T),
20 |   return_all = FALSE
21 | )
22 | }
23 | \arguments{
24 | \item{X}{A numeric matrix of UMI counts (\code{n x p}),
25 | where \code{n} is the number of cells and \code{p} is the number of genes.}
26 | 
27 | \item{seq_depth}{A numeric vector of sequencing depths of length \code{n}.}
28 | 
29 | \item{covariates}{Optional. A numeric matrix of covariates (\code{n x K}) to be adjusted for in the moment-based regressions.
30 | Can be a length n vector if \eqn{K=1}.
31 | If \code{NULL}, no covariates are adjusted for in the regression.
32 | Defaults to \code{NULL}.}
33 | 
34 | \item{post_process}{Optional. Logical; whether to rescale the estimated co-expressions to lie between –1 and 1.
35 | Defaults to \code{TRUE}.}
36 | 
37 | \item{covariate_level}{Optional. A character string indicating whether covariates are assumed to affect
38 | the underlying gene expression levels (\code{"z"}) or the observed counts (\code{"x"}).
39 | See the \emph{Details} section for further explanation.
40 | Defaults to \code{"z"}.}
41 | 
42 | \item{adjust_setting}{Optional. A named logical vector of length 3;
43 | whether to adjust for covariates at the mean, variance, and covariance level.
44 | Must be named \code{c("mean", "var", "covar")}.
45 | Defaults to \code{c(mean = T,var = T, covar = T)}.}
46 | 
47 | \item{return_all}{Logical; whether to return all estimates, including the effect sizes for covariates.
48 | Defaults to \code{FALSE}.}
49 | }
50 | \value{
51 | A list containing the following components:
52 | \describe{
53 | \item{est}{A \eqn{p \times p} matrix of co-expression estimates.}
54 | \item{p_value}{A \eqn{p \times p} matrix of p-values.}
55 | \item{test_stat}{A \eqn{p \times p} matrix of test statistics evaluating the statistical significance of co-expression.}
56 | \item{mu_beta}{A \eqn{k \times p} matrix of regression coefficients from the mean model. Returned if \code{return_all = TRUE}.}
57 | \item{sigma2_beta}{A \eqn{k \times p} matrix of regression coefficients from the variance model. Returned if \code{return_all = TRUE}.}
58 | \item{cov_beta}{A \eqn{k \times p \times p} array of regression coefficients from the covariance model. Returned if \code{return_all = TRUE}.}
59 | }
60 | }
61 | \description{
62 | This function implements the IRLS procedure used in CS-CORE for estimating and testing
63 | cell-type-specific co-expression from single-cell RNA sequencing data.
64 | }
65 | \details{
66 | Let \eqn{x_{ij}} denote the UMI count of gene \eqn{j} in cell \eqn{i};
67 | \eqn{s_i} denote the sequencing depth;
68 | \eqn{\mu_j,\sigma_{jj}, \sigma_{jj'}} denote the mean, variance and covariance;
69 | \eqn{c_{ik}} denote additional covariate \eqn{k} for cell \eqn{i} (e.g. disease status or cellular state).
70 | The procedure consists of two main steps:
71 | \enumerate{
72 | \item \strong{Mean and variance estimation:} Estimate gene-specific mean and variance parameters
73 | using two moment-based regressions:
74 | \itemize{
75 | \item \strong{Mean model:} \eqn{x_{ij} = s_i (\mu_j + \sum_k c_{ik} \beta_k) + \epsilon_{ij}}
76 | \item \strong{Variance model:} \eqn{(x_{ij} - s_i \mu_{ij})^2 = s_i \mu_{ij} + s_i^2 (\sigma_{jj} + \sum_k c_{ik} \gamma_k) + \eta_{ij}},
77 | where \eqn{\mu_{ij} = \mu_j + \sum_k c_{ik} \beta_k}
78 | }
79 | 
80 | \item \strong{Covariance estimation and hypothesis testing:} Estimate gene-gene covariance
81 | and compute test statistics to assess the statistical significance of gene co-expression using a third moment-based regression:
82 | \itemize{
83 | \item \strong{Covariance model:} \eqn{(x_{ij} - s_i \mu_{ij})(x_{ij'} - s_i \mu_{ij'}) = s_i^2 (\sigma_{jj'} + \sum_k c_{ik} \theta_k) + \xi_{ijj'}}
84 | }
85 | }
86 | We note that
87 | \enumerate{
88 | \item The formulation above assumes that the covariates alter the mean / variance / covariance of underlying gene expression,
89 | rather than observed counts. If you believe that the covariates directly affect the observed counts independent of underlying gene expression
90 | (e.g. \eqn{x_{ij}|z_{ij} \sim \text{Poisson}(s_i z_{ij}+\sum_k c_{ik} \beta_k)} or \eqn{x_{ij} = s_i \mu_j  + \sum_k c_{ik} \beta_k + \epsilon_{ij}}),
91 | please specify \code{covariate_level="x"}.
92 | \item The original CS-CORE published in \url{https://doi.org/10.1038/s41467-023-40503-7}
93 | did not consider adjusting for covariates \eqn{c_{ik}}'s. This is equivalent to setting \code{covariates} to \code{NULL}.
94 | }
95 | }
96 | \keyword{internal}
97 | 


--------------------------------------------------------------------------------
/R/CSCORE.R:
--------------------------------------------------------------------------------
 1 | #' CS-CORE for cell-type-specific co-expression network inference
 2 | #'
 3 | #' Run CS-CORE on a Seurat object to infer the cell-type-specific co-expression network for a specified set of genes,
 4 | #' with optional adjustment for covariates.
 5 | #' For more details on the covariate adjustment and the moment-based regression,
 6 | #' please refer to \link{CSCORE_IRLS}.
 7 | #' @seealso \link{CSCORE_IRLS}
 8 | #' @seealso \href{https://changsubiostats.github.io/CS-CORE/articles/CSCORE.html}{CS-CORE online tutorial}
 9 | #'
10 | #' @param object A Seurat object containing single-cell RNA-seq data.
11 | #'   The object should be subsetted to cells of a single cell type to ensure cell-type-specific inference.
12 | #'   CS-CORE requires raw UMI counts as input, and assumes that the raw count matrix is stored in the \code{"counts"} slot of the \code{"RNA"} assay
13 | #'   (i.e., \code{object[["RNA"]]@counts}).
14 | #' @param genes A character vector of gene names (length \eqn{p}) for which the co-expression network will be estimated.
15 | #' @param seq_depth A numeric vector of sequencing depths (length \eqn{n}).
16 | #'   If \code{NULL}, sequencing depth will be computed as the total UMI count per cell.
17 | #'   Defaults to \code{NULL}.
18 | #' @param covariate_names Optional. A character vector specifying the names of cell-level covariates to adjust for in the regression models.
19 | #'   These variables will be extracted from \code{object@meta.data[, covariate_names]}. Defaults to \code{NULL}.
20 | #' @param adjust_setting Optional. A named logical vector of length 3 indicating whether to adjust for covariates in the estimation of mean, variance, and covariance.
21 | #'   Must be named \code{c("mean", "var", "covar")}. Defaults to \code{c(mean = TRUE, var = TRUE, covar = TRUE)}.
22 | #' @param IRLS_version Optional. A character string specifying the IRLS implementation to use: \code{"Rcpp"} or \code{"base_R"}.
23 | #'   Only the \code{"Rcpp"} version supports covariate adjustment. The \code{"base_R"} version does not.
24 | #'   When applicable, \code{"Rcpp"} offers improved memory efficiency (~10-100 times) but may be slower (~10 times),
25 | #'   while \code{"base_R"} is faster but more memory intensive.
26 | #'   Defaults to \code{"Rcpp"}.
27 | #' @param IRLS_par Optional. A named list of length 3 specifying parameters for the IRLS algorithm:
28 | #'   \describe{
29 | #'     \item{\code{n_iter}}{Maximum number of iterations.}
30 | #'     \item{\code{eps}}{Convergence threshold for log-ratio change \code{delta}, computed as \code{abs(log(theta / theta_prev))}.}
31 | #'     \item{\code{verbose}}{Logical; whether to print the convergence metric (\code{delta}) at each iteration.}
32 | #'   }
33 | #'   Defaults to \code{list(n_iter = 10, eps = 0.05, verbose = FALSE)}.
34 | #'
35 | #' @return A list of three p by p matrices:
36 | #' \describe{
37 | #'   \item{est}{Matrix of co-expression estimates.}
38 | #'   \item{p_value}{Matrix of p-values for testing co-expression.}
39 | #'   \item{test_stat}{Matrix of test statistics for evaluating the significance of co-expression.}
40 | #' }
41 | #' @export
42 | #'
43 | #' @examples
44 | #' # See a full example at:
45 | #' # https://changsubiostats.github.io/CS-CORE/articles/CSCORE.html
46 | #'
47 | CSCORE <- function(object, genes, seq_depth = NULL,
48 |                    covariate_names = NULL,
49 |                    adjust_setting = c('mean' = T, 'var' = T, 'covar' = T),
50 |                    IRLS_version = 'Rcpp',
51 |                    IRLS_par = list(n_iter = 10, eps = 0.05, verbose = FALSE)){
52 |   # Extract the UMI count matrix from the single cell object
53 |   # with RNA as the default assay
54 |   # such that slot `counts` corresponds to UMI counts
55 |   count_matrix <- t(as.matrix(GetAssayData(object = object, assay = "RNA",slot='counts')))
56 |   # Extract / calculate the sequencing depths
57 |   if(is.null(seq_depth)){
58 |     if('nCount_RNA' %in% colnames(object@meta.data)){
59 |       seq_depth <- object$nCount_RNA
60 |     }else{
61 |       seq_depth <- rowSums(count_matrix)
62 |     }
63 |   }else{
64 |     if(length(seq_depth) != nrow(count_matrix)) stop("The length of the sequencing depth must match the number of cells.")
65 |   }
66 |   # Run CS-CORE
67 |   if(is.null(covariate_names)){
68 |     if(IRLS_version == 'base_R'){
69 |       CSCORE_result <- CSCORE_IRLS_base(count_matrix[,genes], seq_depth)
70 |     }else{
71 |       IRLS_par[['conv']] <- 'max'
72 |       CSCORE_result <- CSCORE_IRLS_cpp(count_matrix[,genes], seq_depth,
73 |                                        IRLS_par = IRLS_par)
74 |     }
75 |   }else{
76 |     if(any(!covariate_names %in% colnames(object@meta.data))){
77 |       stop("[ERROR] Not all covariates are included in the Seurat object. Please check `colnames(object@meta.data)`")
78 |     }
79 |     covariates <- object@meta.data[, covariate_names]
80 |     cat("[INFO] Adjust for covariates:", paste(colnames(covariates), collapse = ", "), "\n")
81 |     covariate_matrix <- stats::model.matrix(~ ., data = covariates)[,-1]
82 |     covariate_matrix <- scale(covariate_matrix, center = T, scale = T)
83 |     cat("[INFO] Variables in the design matrix:", paste(colnames(covariate_matrix), collapse = ", "), "\n")
84 | 
85 |     IRLS_par[['conv']] <- 'q95'
86 |     CSCORE_result <- CSCORE_IRLS_cpp(count_matrix[,genes], seq_depth,
87 |                                      covariates = covariate_matrix,
88 |                                      adjust_setting = adjust_setting,
89 |                                      IRLS_par = IRLS_par)
90 |   }
91 |   return(CSCORE_result)
92 | }
93 | 


--------------------------------------------------------------------------------
/man/CSCORE_IRLS_cpp.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/CSCORE_IRLS_cpp.R
  3 | \name{CSCORE_IRLS_cpp}
  4 | \alias{CSCORE_IRLS_cpp}
  5 | \title{Iteratively reweighted least squares (IRLS) procedure in CS-CORE (Rcpp)}
  6 | \source{
  7 | Su, C., Xu, Z., Shan, X., Cai, B., Zhao, H., & Zhang, J. (2023).
  8 | Cell-type-specific co-expression inference from single cell RNA-sequencing data.
  9 | \emph{Nature Communications}.
 10 | doi: \url{https://doi.org/10.1038/s41467-023-40503-7}
 11 | }
 12 | \usage{
 13 | CSCORE_IRLS_cpp(
 14 |   X,
 15 |   seq_depth,
 16 |   covariates = NULL,
 17 |   post_process = TRUE,
 18 |   covariate_level = "z",
 19 |   adjust_setting = c(mean = T, var = T, covar = T),
 20 |   IRLS_par = list(n_iter = 10, eps = 0.05, verbose = FALSE, conv = "q95"),
 21 |   return_all = FALSE
 22 | )
 23 | }
 24 | \arguments{
 25 | \item{X}{A numeric matrix of UMI counts (\code{n x p}),
 26 | where \code{n} is the number of cells and \code{p} is the number of genes.}
 27 | 
 28 | \item{seq_depth}{A numeric vector of sequencing depths of length \code{n}.}
 29 | 
 30 | \item{covariates}{Optional. A numeric matrix of covariates (\code{n x K}) to be adjusted for in the moment-based regressions.
 31 | Can be a length n vector if \eqn{K=1}.
 32 | If \code{NULL}, no covariates are adjusted for in the regression.
 33 | Defaults to \code{NULL}.}
 34 | 
 35 | \item{post_process}{Optional. Logical; whether to rescale the estimated co-expressions to lie between –1 and 1.
 36 | Defaults to \code{TRUE}.}
 37 | 
 38 | \item{covariate_level}{Optional. A character string indicating whether covariates are assumed to affect
 39 | the underlying gene expression levels (\code{"z"}) or the observed counts (\code{"x"}).
 40 | See the \emph{Details} section for further explanation.
 41 | Defaults to \code{"z"}.}
 42 | 
 43 | \item{adjust_setting}{Optional. A named logical vector of length 3;
 44 | whether to adjust for covariates at the mean, variance, and covariance level.
 45 | Must be named \code{c("mean", "var", "covar")}.
 46 | Defaults to \code{c(mean = T,var = T, covar = T)}.}
 47 | 
 48 | \item{IRLS_par}{Optional. A named list of length 4 specifying parameters for the IRLS algorithm:
 49 | \describe{
 50 | \item{\code{n_iter}}{Maximum number of iterations.}
 51 | \item{\code{eps}}{Convergence threshold for log-ratio change \code{delta}, computed as \code{abs(log(theta / theta_prev))}.}
 52 | \item{\code{verbose}}{Logical; whether to print the convergence metric (\code{delta}) at each iteration.}
 53 | \item{\code{conv}}{Character string; determine convergence based on \code{q95} (0.95 quantile) or \code{max} of \code{delta}.}
 54 | }
 55 | Defaults to \code{list(n_iter = 10, eps = 0.05, verbose = FALSE, cov = "q95")}.}
 56 | 
 57 | \item{return_all}{Logical; whether to return all estimates, including the effect sizes for covariates.
 58 | Defaults to \code{FALSE}.}
 59 | }
 60 | \value{
 61 | A list containing the following components:
 62 | \describe{
 63 | \item{est}{A \eqn{p \times p} matrix of co-expression estimates.}
 64 | \item{p_value}{A \eqn{p \times p} matrix of p-values.}
 65 | \item{test_stat}{A \eqn{p \times p} matrix of test statistics evaluating the statistical significance of co-expression.}
 66 | \item{mu_beta}{A \eqn{k \times p} matrix of regression coefficients from the mean model. Returned if \code{return_all = TRUE}.}
 67 | \item{sigma2_beta}{A \eqn{k \times p} matrix of regression coefficients from the variance model. Returned if \code{return_all = TRUE}.}
 68 | \item{cov_beta}{A \eqn{k \times p \times p} array of regression coefficients from the covariance model. Returned if \code{return_all = TRUE}.}
 69 | }
 70 | }
 71 | \description{
 72 | Estimate and test cell-type-specific co-expression using the IRLS procedure with optional covariate adjustment.
 73 | }
 74 | \details{
 75 | Let \eqn{x_{ij}} denote the UMI count of gene \eqn{j} in cell \eqn{i};
 76 | \eqn{s_i} denote the sequencing depth;
 77 | \eqn{\mu_j,\sigma_{jj}, \sigma_{jj'}} denote the mean, variance and covariance;
 78 | \eqn{c_{ik}} denote additional covariate \eqn{k} for cell \eqn{i} (e.g. disease status or cellular state).
 79 | The procedure consists of two main steps:
 80 | \enumerate{
 81 | \item \strong{Mean and variance estimation:} Estimate gene-specific mean and variance parameters
 82 | using two moment-based regressions:
 83 | \itemize{
 84 | \item \strong{Mean model:} \eqn{x_{ij} = s_i (\mu_j + \sum_k c_{ik} \beta_k) + \epsilon_{ij}}
 85 | \item \strong{Variance model:} \eqn{(x_{ij} - s_i \mu_{ij})^2 = s_i \mu_{ij} + s_i^2 (\sigma_{jj} + \sum_k c_{ik} \gamma_k) + \eta_{ij}},
 86 | where \eqn{\mu_{ij} = \mu_j + \sum_k c_{ik} \beta_k}
 87 | }
 88 | 
 89 | \item \strong{Covariance estimation and hypothesis testing:} Estimate gene-gene covariance
 90 | and compute test statistics to assess the statistical significance of gene co-expression using a third moment-based regression:
 91 | \itemize{
 92 | \item \strong{Covariance model:} \eqn{(x_{ij} - s_i \mu_{ij})(x_{ij'} - s_i \mu_{ij'}) = s_i^2 (\sigma_{jj'} + \sum_k c_{ik} \theta_k) + \xi_{ijj'}}
 93 | }
 94 | }
 95 | We note that
 96 | \enumerate{
 97 | \item The formulation above assumes that the covariates alter the mean / variance / covariance of underlying gene expression,
 98 | rather than observed counts. If you believe that the covariates directly affect the observed counts independent of underlying gene expression
 99 | (e.g. \eqn{x_{ij}|z_{ij} \sim \text{Poisson}(s_i z_{ij}+\sum_k c_{ik} \beta_k)} or \eqn{x_{ij} = s_i \mu_j  + \sum_k c_{ik} \beta_k + \epsilon_{ij}}),
100 | please specify \code{covariate_level="x"}.
101 | \item The original CS-CORE published in \url{https://doi.org/10.1038/s41467-023-40503-7}
102 | did not consider adjusting for covariates \eqn{c_{ik}}'s. This is equivalent to setting \code{covariates} to \code{NULL}.
103 | }
104 | Note: This is an R wrapper for the \code{CSCORE_IRLS_cpp_impl()} function implemented in Rcpp.
105 | }
106 | \examples{
107 | ## Toy example:
108 | ## run CSCORE on a simulated independent gene pair
109 | cscore_example <- CSCORE_IRLS_cpp(ind_gene_pair$counts, ind_gene_pair$seq_depths)
110 | 
111 | ## Estimated co-expression between two genes
112 | cscore_example$est[1,2]
113 | # close to 0: 0.007820124
114 | 
115 | ## p-values
116 | cscore_example$p_value[1,2]
117 | # not significant: 0.961981
118 | 
119 | }
120 | 


--------------------------------------------------------------------------------
/vignettes/covariate_adjustment.Rmd.orig:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Covariate adjustment"
  3 | output: rmarkdown::html_vignette
  4 | vignette: >
  5 |   %\VignetteIndexEntry{covariate_adjustment}
  6 |   %\VignetteEngine{knitr::rmarkdown}
  7 |   %\VignetteEncoding{UTF-8}
  8 | ---
  9 | 
 10 | ```{r, include = FALSE}
 11 | knitr::opts_chunk$set(
 12 |   collapse = TRUE,
 13 |   comment = "#>"
 14 | )
 15 | ```
 16 | 
 17 | # Motivation
 18 | The original implementation of CS-CORE does not consider covariate adjustment. For example, in the [paper](https://www.nature.com/articles/s41467-023-40503-7), we assumed
 19 | 
 20 | \[
 21 | {\bf z_{i}}\sim F_p({\bf \mu}, \Sigma), x_{ij}|z_{ij} \sim \text{Poisson}(s_i z_{ij})
 22 | \]
 23 | 
 24 | such that the mean ${\bf \mu}$ and the variance-covariance matrix $\Sigma$ of underlying gene expression are exactly the same for all cells in the population. However, in real data, it is possible that the mean, variance, and co-expression of underlying gene expression could be affected by technical covariates (such as percent.mt) or biological covariates (such as sex) that vary across cell.
 25 | 
 26 | # New moment-based regressions for covariate adjustment
 27 | 
 28 | In June 2025, we added a new feature to CS-CORE that allows adjusting for covariates in co-expression inference. In particular, we adjust for covariates through modelling ${\bf mu}$ and $\Sigma$ as a function of covariates $c_{ik}$'s:
 29 | $$
 30 | {\bf \mu}_{i}={\bf \mu}+\sum_k c_{ik} {\bf \beta_k}, \
 31 | \Sigma_{i}=\Sigma +\sum_k c_{ik} {\bf \gamma_k}
 32 | $$
 33 | This leads to the following moment-based regression:
 34 | 
 35 | $$
 36 | x_{ij} = s_i (\mu_j + \sum_k c_{ik} \beta_k) + \epsilon_{ij}
 37 | $$
 38 | 
 39 | $$(x_{ij} - s_i \mu_{ij})^2 = s_i \mu_{ij} + s_i^2 (\sigma_{jj} + \sum_k c_{ik} \gamma_k) + \eta_{ij}$$
 40 | $$(x_{ij} - s_i \mu_{ij})(x_{ij'} - s_i \mu_{ij'}) = s_i^2 (\sigma_{jj'} + \sum_k c_{ik} \theta_k) + \xi_{ijj'},$$
 41 | 
 42 | where $\mu_{ij} = \mu_j + \sum_k c_{ik} \beta_k$.  These allow adjusting for $K$ covariates $c_{ik}$'s for their effects on the underlying gene expression mean ($\beta_k$'s), variance ($\gamma_k$'s) and covariance ($\theta_k$'s).
 43 | 
 44 | # Demonstration on real data
 45 | 
 46 | Here, we demonstrates how to adjust for covariates in CS-CORE with the same dataset as in [Getting started](CSCORE.html).
 47 | 
 48 | ```{r message=F}
 49 | library(CSCORE)
 50 | library(Seurat)
 51 | ```
 52 | 
 53 | ```{r message=F, warning=F}
 54 | # wget https://hosted-matrices-prod.s3-us-west-2.amazonaws.com/Single_cell_atlas_of_peripheral_immune_response_to_SARS_CoV_2_infection-25/blish_covid.seu.rds
 55 | 
 56 | pbmc <- readRDS('/Users/csu30/Library/CloudStorage/OneDrive-Emory/Projects/CS-CORE/Rpackage/blish_covid.seu.rds')
 57 | pbmc <- UpdateSeuratObject(pbmc) # update the obsolete Seurat object
 58 | pbmc_B = pbmc[,pbmc$cell.type.coarse %in% 'B']
 59 | mean_exp = rowMeans(pbmc_B@assays$RNA@counts/pbmc_B$nCount_RNA)
 60 | genes_selected = names(sort.int(mean_exp, decreasing = T))[1:200]
 61 | pbmc_B_healthy <- pbmc_B[, pbmc_B$Status == "Healthy"]
 62 | ```
 63 | 
 64 | This dataset comes with detailed cell-level covariates:
 65 | ```{r}
 66 | colnames(pbmc_B_healthy@meta.data)
 67 | ```
 68 | 
 69 | As an example, we choose to adjust for `Sex` and `percent.mt`. To run CS-CORE with covariate adjustment, use
 70 | 
 71 | ```{r}
 72 | CSCORE_result_adj <- CSCORE(pbmc_B_healthy,
 73 |                             genes = genes_selected,
 74 |                             covariate_names = c('percent.mt', 'Sex'))
 75 | ```
 76 | 
 77 | By default, `CSCORE` extracts the covariates from the Seurat object and construct a design matrix with scaled and centered covariates. To understand the detailed impact of covariate adjustment, we also compare with the results without covariate adjustment.
 78 | 
 79 | ```{r}
 80 | CSCORE_result <- CSCORE(pbmc_B_healthy, genes = genes_selected)
 81 | ```
 82 | 
 83 | ```{r figrho}
 84 | # compare co-expression estimates for a random set of gene pairs
 85 | set.seed(42002)
 86 | p <- length(genes_selected)
 87 | random_pairs <- sample(which(upper.tri(matrix(1:p^2, p, p))), 1000)
 88 | plot(CSCORE_result$est[random_pairs],
 89 |      CSCORE_result_adj$est[random_pairs],
 90 |      xlab = 'CS-CORE', ylab = 'CS-CORE adjusted',
 91 |      main = 'Co-expression estimates')
 92 | abline(0,1,col='red')
 93 | ```
 94 | 
 95 | ```{r echo=F, eval=F}
 96 | png('vignettes/figure/figrho-1.png', width = 480, height = 480)
 97 | plot(CSCORE_result$est[random_pairs],
 98 |      CSCORE_result_adj$est[random_pairs],
 99 |      xlab = 'CS-CORE', ylab = 'CS-CORE adjusted',
100 |      main = 'Co-expression estimates')
101 | abline(0,1,col='red')
102 | dev.off()
103 | ```
104 | 
105 | ```{r figt}
106 | plot(CSCORE_result$test_stat[random_pairs],
107 |      CSCORE_result_adj$test_stat[random_pairs],
108 |      xlab = 'CS-CORE', ylab = 'CS-CORE adjusted',
109 |      main = 'Test_stat')
110 | abline(0,1,col='red')
111 | ```
112 | 
113 | ```{r echo=F, eval=F}
114 | png('vignettes/figure/figt-1.png', width = 480, height = 480)
115 | plot(CSCORE_result$test_stat[random_pairs],
116 |      CSCORE_result_adj$test_stat[random_pairs],
117 |      xlab = 'CS-CORE', ylab = 'CS-CORE adjusted',
118 |      main = 'Test_stat')
119 | abline(0,1,col='red')
120 | dev.off()
121 | ```
122 | 
123 | It seems that the co-expression for most gene pairs are similar with and without covariate adjustment. We recommend users to sanity check this and examine the impact of covariate adjustment on co-expression inference.
124 | 
125 | 
126 | # Advanced topics
127 | 
128 | The application above adjusts for covariates in the underlying expression levels' mean, variance, and covariance. For users who wish to have more fine-grained control on the regression models, we provide two additional parameters: `adjust_setting` and `covariate_level`.
129 | 
130 | ## `adjust_setting`
131 | 
132 | We provide these two options in [CSCORE_IRLS_cpp](CSCORE_IRLS_cpp.html), which is the function underlying `CSCORE`. `adjust_setting` allows you to choose which regression model to adjust covariates for. For example, if `adjust_setting=c(mean = T, var = F, covar = T)`, this is equivalent to running the following regressions:
133 | 
134 | $$
135 | x_{ij} = s_i (\mu_j + \sum_k c_{ik} \beta_k) + \epsilon_{ij}
136 | $$
137 | 
138 | $$
139 | (x_{ij} - s_i \mu_{ij})^2 = s_i \mu_{ij} + s_i^2 \sigma_{jj}  + \eta_{ij}
140 | $$
141 | 
142 | $$
143 | (x_{ij} - s_i \mu_{ij})(x_{ij'} - s_i \mu_{ij'}) = s_i^2 (\sigma_{jj'} + \sum_k c_{ik} \theta_k) + \xi_{ijj'},
144 | $$
145 | 
146 | ## `covariate_level`
147 | 
148 | In the adjustment models above, we assume ${\bf z_{i}}\sim F_p({\bf \mu}, \Sigma), x_{ij}|z_{ij} \sim \text{Poisson}(s_i z_{ij})$ and ${\bf \mu}_{i}={\bf \mu}+\sum_k c_{ik} {\bf \beta_k}$. This implies that the underlying mean expression is associated with covariates.
149 | 
150 | Another possible model is to assume $\text{Poisson}(s_i z_{ij} + \sum_k c_{ik} {\bf \beta_k})$, which implies that the covariates operate in the measurement process, independent of underlying gene expression. Even though we think the default model is more natural, we allow for this flexibility by specifying `covariate_level = "x"`. This will run
151 | $$
152 | x_{ij} = s_i \mu_j  + \sum_k c_{ik} \beta_k + \epsilon_{ij},
153 | $$
154 | and similarly for variance and covariance.
155 | 


--------------------------------------------------------------------------------
/vignettes/covariate_adjustment.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Covariate adjustment"
  3 | output: rmarkdown::html_vignette
  4 | vignette: >
  5 |   %\VignetteIndexEntry{covariate_adjustment}
  6 |   %\VignetteEngine{knitr::rmarkdown}
  7 |   %\VignetteEncoding{UTF-8}
  8 | ---
  9 | 
 10 | 
 11 | 
 12 | # Motivation
 13 | The original implementation of CS-CORE does not consider covariate adjustment. For example, in the [paper](https://www.nature.com/articles/s41467-023-40503-7), we assumed
 14 | 
 15 | \[
 16 | {\bf z_{i}}\sim F_p({\bf \mu}, \Sigma), x_{ij}|z_{ij} \sim \text{Poisson}(s_i z_{ij})
 17 | \]
 18 | 
 19 | such that the mean ${\bf \mu}$ and the variance-covariance matrix $\Sigma$ of underlying gene expression are exactly the same for all cells in the population. However, in real data, it is possible that the mean, variance, and co-expression of underlying gene expression could be affected by technical covariates (such as percent.mt) or biological covariates (such as sex) that vary across cell.
 20 | 
 21 | # New moment-based regressions for covariate adjustment
 22 | 
 23 | In June 2025, we added a new feature to CS-CORE that allows adjusting for covariates in co-expression inference. In particular, we adjust for covariates through modelling ${\bf mu}$ and $\Sigma$ as a function of covariates $c_{ik}$'s:
 24 | $$
 25 | {\bf \mu}_{i}={\bf \mu}+\sum_k c_{ik} {\bf \beta_k}, \
 26 | \Sigma_{i}=\Sigma +\sum_k c_{ik} {\bf \gamma_k}
 27 | $$
 28 | This leads to the following moment-based regression:
 29 | 
 30 | $$
 31 | x_{ij} = s_i (\mu_j + \sum_k c_{ik} \beta_k) + \epsilon_{ij}
 32 | $$
 33 | 
 34 | $$(x_{ij} - s_i \mu_{ij})^2 = s_i \mu_{ij} + s_i^2 (\sigma_{jj} + \sum_k c_{ik} \gamma_k) + \eta_{ij}$$
 35 | $$(x_{ij} - s_i \mu_{ij})(x_{ij'} - s_i \mu_{ij'}) = s_i^2 (\sigma_{jj'} + \sum_k c_{ik} \theta_k) + \xi_{ijj'},$$
 36 | 
 37 | where $\mu_{ij} = \mu_j + \sum_k c_{ik} \beta_k$.  These allow adjusting for $K$ covariates $c_{ik}$'s for their effects on the underlying gene expression mean ($\beta_k$'s), variance ($\gamma_k$'s) and covariance ($\theta_k$'s).
 38 | 
 39 | # Demonstration on real data
 40 | 
 41 | Here, we demonstrates how to adjust for covariates in CS-CORE with the same dataset as in [Getting started](CSCORE.html).
 42 | 
 43 | 
 44 | ``` r
 45 | library(CSCORE)
 46 | library(Seurat)
 47 | ```
 48 | 
 49 | 
 50 | ``` r
 51 | # wget https://hosted-matrices-prod.s3-us-west-2.amazonaws.com/Single_cell_atlas_of_peripheral_immune_response_to_SARS_CoV_2_infection-25/blish_covid.seu.rds
 52 | 
 53 | pbmc <- readRDS('blish_covid.seu.rds')
 54 | pbmc <- UpdateSeuratObject(pbmc) # update the obsolete Seurat object
 55 | pbmc_B = pbmc[,pbmc$cell.type.coarse %in% 'B']
 56 | mean_exp = rowMeans(pbmc_B@assays$RNA@counts/pbmc_B$nCount_RNA)
 57 | genes_selected = names(sort.int(mean_exp, decreasing = T))[1:200]
 58 | pbmc_B_healthy <- pbmc_B[, pbmc_B$Status == "Healthy"]
 59 | ```
 60 | 
 61 | This dataset comes with detailed cell-level covariates:
 62 | 
 63 | ``` r
 64 | colnames(pbmc_B_healthy@meta.data)
 65 | #>  [1] "orig.ident"       "nCount_RNA"       "nFeature_RNA"     "percent.mt"       "percent.rps"      "percent.rpl"     
 66 | #>  [7] "percent.rrna"     "nCount_SCT"       "nFeature_SCT"     "SCT_snn_res.1"    "seurat_clusters"  "singler"         
 67 | #> [13] "Admission.level"  "cell.type.fine"   "cell.type.coarse" "cell.type"        "IFN1"             "HLA1"            
 68 | #> [19] "Donor.orig"       "Donor.full"       "Donor"            "Status"           "Sex"              "DPS"             
 69 | #> [25] "DTF"              "Admission"        "Ventilated"
 70 | ```
 71 | 
 72 | As an example, we choose to adjust for `Sex` and `percent.mt`. To run CS-CORE with covariate adjustment, use
 73 | 
 74 | 
 75 | ``` r
 76 | CSCORE_result_adj <- CSCORE(pbmc_B_healthy,
 77 |                             genes = genes_selected,
 78 |                             covariate_names = c('percent.mt', 'Sex'))
 79 | #> [INFO] Adjust for covariates: percent.mt, Sex 
 80 | #> [INFO] Variables in the design matrix: percent.mt, SexM 
 81 | #> [INFO] IRLS converged after 3 iterations.
 82 | #> [INFO] Starting WLS for covariance at Thu Jun 26 17:09:48 2025
 83 | #> [INFO] 1 among 200 genes have invalid variance estimates. Their co-expressions with other genes were set to 0.
 84 | #> [INFO] 0.0854% co-expression estimates were greater than 1 and were set to 1.
 85 | #> [INFO] 0.0101% co-expression estimates were smaller than -1 and were set to -1.
 86 | #> [INFO] Finished WLS. Elapsed time: 1.8762 seconds.
 87 | ```
 88 | 
 89 | By default, `CSCORE` extracts the covariates from the Seurat object and construct a design matrix with scaled and centered covariates. To understand the detailed impact of covariate adjustment, we also compare with the results without covariate adjustment.
 90 | 
 91 | 
 92 | ``` r
 93 | CSCORE_result <- CSCORE(pbmc_B_healthy, genes = genes_selected)
 94 | #> [INFO] IRLS converged after 3 iterations.
 95 | #> [INFO] Starting WLS for covariance at Thu Jun 26 17:09:50 2025
 96 | #> [INFO] 0.0101% co-expression estimates were greater than 1 and were set to 1.
 97 | #> [INFO] 0.0000% co-expression estimates were smaller than -1 and were set to -1.
 98 | #> [INFO] Finished WLS. Elapsed time: 1.2249 seconds.
 99 | ```
100 | 
101 | 
102 | ``` r
103 | # compare co-expression estimates for a random set of gene pairs
104 | set.seed(42002)
105 | p <- length(genes_selected)
106 | random_pairs <- sample(which(upper.tri(matrix(1:p^2, p, p))), 1000)
107 | plot(CSCORE_result$est[random_pairs],
108 |      CSCORE_result_adj$est[random_pairs],
109 |      xlab = 'CS-CORE', ylab = 'CS-CORE adjusted',
110 |      main = 'Co-expression estimates')
111 | abline(0,1,col='red')
112 | ```
113 | 
114 | ![](figure/figrho-1.png)
115 | 
116 | 
117 | 
118 | 
119 | ``` r
120 | plot(CSCORE_result$test_stat[random_pairs],
121 |      CSCORE_result_adj$test_stat[random_pairs],
122 |      xlab = 'CS-CORE', ylab = 'CS-CORE adjusted',
123 |      main = 'Test_stat')
124 | abline(0,1,col='red')
125 | ```
126 | 
127 | ![](figure/figt-1.png)
128 | 
129 | 
130 | 
131 | It seems that the co-expression for most gene pairs are similar with and without covariate adjustment. We recommend users to sanity check this and examine the impact of covariate adjustment on co-expression inference.
132 | 
133 | 
134 | # Advanced topics
135 | 
136 | The application above adjusts for covariates in the underlying expression levels' mean, variance, and covariance. For users who wish to have more fine-grained control on the regression models, we provide two additional parameters: `adjust_setting` and `covariate_level`.
137 | 
138 | ## `adjust_setting`
139 | 
140 | We provide these two options in [CSCORE_IRLS_cpp](CSCORE_IRLS_cpp.html), which is the function underlying `CSCORE`. `adjust_setting` allows you to choose which regression model to adjust covariates for. For example, if `adjust_setting=c(mean = T, var = F, covar = T)`, this is equivalent to running the following regressions:
141 | 
142 | $$
143 | x_{ij} = s_i (\mu_j + \sum_k c_{ik} \beta_k) + \epsilon_{ij}
144 | $$
145 | 
146 | $$
147 | (x_{ij} - s_i \mu_{ij})^2 = s_i \mu_{ij} + s_i^2 \sigma_{jj}  + \eta_{ij}
148 | $$
149 | 
150 | $$
151 | (x_{ij} - s_i \mu_{ij})(x_{ij'} - s_i \mu_{ij'}) = s_i^2 (\sigma_{jj'} + \sum_k c_{ik} \theta_k) + \xi_{ijj'},
152 | $$
153 | 
154 | ## `covariate_level`
155 | 
156 | In the adjustment models above, we assume ${\bf z_{i}}\sim F_p({\bf \mu}, \Sigma), x_{ij}|z_{ij} \sim \text{Poisson}(s_i z_{ij})$ and ${\bf \mu}_{i}={\bf \mu}+\sum_k c_{ik} {\bf \beta_k}$. This implies that the underlying mean expression is associated with covariates.
157 | 
158 | Another possible model is to assume $\text{Poisson}(s_i z_{ij} + \sum_k c_{ik} {\bf \beta_k})$, which implies that the covariates operate in the measurement process, independent of underlying gene expression. Even though we think the default model is more natural, we allow for this flexibility by specifying `covariate_level = "x"`. This will run
159 | $$
160 | x_{ij} = s_i \mu_j  + \sum_k c_{ik} \beta_k + \epsilon_{ij},
161 | $$
162 | and similarly for variance and covariance.
163 | 


--------------------------------------------------------------------------------
/src/CSCORE_IRLS_cpp.cpp:
--------------------------------------------------------------------------------
  1 | #include <RcppArmadillo.h>
  2 | // [[Rcpp::depends(RcppArmadillo)]]
  3 | #include "WLS_mean.h"
  4 | #include "WLS_cov.h"
  5 | #include <chrono>
  6 | #include <ctime>
  7 | #include <iomanip>
  8 | #include <string>
  9 | 
 10 | using namespace Rcpp;
 11 | 
 12 | arma::mat post_process_est(arma::mat est) {
 13 |   int p = est.n_rows;
 14 | 
 15 |   // Identify diagonal elements that are NA or Inf
 16 |   std::vector<int> neg_gene_inds;
 17 |   for (int i = 0; i < p; ++i) {
 18 |     double val = est(i, i);
 19 |     if (std::isnan(val) || !std::isfinite(val)) {
 20 |       neg_gene_inds.push_back(i);
 21 |     }
 22 |   }
 23 | 
 24 |   if (!neg_gene_inds.empty()) {
 25 |     Rcpp::Rcout << "[INFO] " << neg_gene_inds.size() << " among " << p
 26 |                 << " genes have invalid variance estimates. Their co-expressions with other genes were set to 0." << std::endl;
 27 |   }
 28 | 
 29 |   // Set entire rows and columns of invalid genes to 0
 30 |   for (int idx : neg_gene_inds) {
 31 |     est.row(idx).zeros();
 32 |     est.col(idx).zeros();
 33 |   }
 34 | 
 35 |   // Set diagonal to 1
 36 |   est.diag().ones();
 37 | 
 38 |   // Clip values outside [-1, 1] and count them
 39 |   int upper_count = 0, lower_count = 0, total = 0;
 40 |   for (int i = 0; i < p; ++i) {
 41 |     for (int j = i + 1; j < p; ++j) {
 42 |       total++;
 43 |       if (est(i, j) > 1.0) {
 44 |         upper_count++;
 45 |         est(i, j) = 1.0;
 46 |         est(j, i) = 1.0;
 47 |       } else if (est(i, j) < -1.0) {
 48 |         lower_count++;
 49 |         est(i, j) = -1.0;
 50 |         est(j, i) = -1.0;
 51 |       }
 52 |     }
 53 |   }
 54 | 
 55 |   Rcpp::Rcout << "[INFO] "
 56 |               << std::fixed << std::setprecision(4)
 57 |               << 100.0 * upper_count / total
 58 |               << "% co-expression estimates were greater than 1 and were set to 1." << std::endl;
 59 |   Rcpp::Rcout << "[INFO] "
 60 |               << 100.0 * lower_count / total
 61 |               << "% co-expression estimates were smaller than -1 and were set to -1." << std::endl;
 62 | 
 63 |   return est;
 64 | }
 65 | 
 66 | 
 67 | // ---- inline quantile approximation ----
 68 | inline double arma_quantile(const arma::vec& v, double prob) {
 69 |   arma::uword n = v.n_elem;
 70 |   if (n == 0) return NA_REAL;
 71 | 
 72 |   arma::vec sorted = arma::sort(v);
 73 |   double index = (n - 1) * prob;
 74 |   arma::uword lo = std::floor(index);
 75 |   arma::uword hi = std::ceil(index);
 76 | 
 77 |   //Rcpp::Rcout << "[DEBUG] Input vector size: " << v.n_elem << std::endl;
 78 |   //Rcpp::Rcout << "[DEBUG] Sorted vector head: " << sorted.head(5).t();
 79 |   //Rcpp::Rcout << "[DEBUG] index: " << index << ", lo: " << lo << ", hi: " << hi << std::endl;
 80 | 
 81 |   if (lo == hi) {
 82 |     return sorted[lo];
 83 |   } else {
 84 |     double weight = index - lo;
 85 |     return (1.0 - weight) * sorted[lo] + weight * sorted[hi];
 86 |   }
 87 | }
 88 | 
 89 | 
 90 | // [[Rcpp::export]]
 91 | Rcpp::List CSCORE_IRLS_cpp_impl(const arma::mat& X,
 92 |                                 const arma::vec& seq_depth_sq,
 93 |                                 const arma::mat& D_mu,
 94 |                                 const arma::mat& D_sigma2,
 95 |                                 const arma::mat& D_sigma,
 96 |                                 const bool post_process = true,
 97 |                                 const int n_iter = 10,
 98 |                                 const double eps = 0.05,
 99 |                                 const bool verbose = false,
100 |                                 const std::string& conv = "q95",
101 |                                 const bool return_all = false) {
102 | 
103 |   const arma::uword n = X.n_rows;
104 |   const arma::uword p = X.n_cols;
105 | 
106 |   arma::mat W = arma::ones<arma::mat>(X.n_rows, X.n_cols);
107 |   arma::mat mu_beta = WLS_mean(D_mu, X, W);
108 |   arma::mat M = D_mu * mu_beta;
109 |   arma::rowvec mu = mu_beta.row(0);
110 | 
111 |   arma::mat X_centered = X - M;
112 |   arma::mat sigma2_beta = WLS_mean(D_sigma2, (arma::square(X_centered) - M), W);
113 |   arma::rowvec sigma2 = sigma2_beta.row(0);
114 |   arma::rowvec theta = arma::square(mu) / sigma2;
115 | 
116 |   int iter = 0;
117 |   double delta = std::numeric_limits<double>::infinity();
118 | 
119 |   while (delta > eps && iter <= n_iter) {
120 |     arma::rowvec theta_prev = theta;
121 |     arma::vec theta_pos = theta.elem(arma::find(theta > 0));
122 |     double theta_median = arma::median(theta_pos);
123 | 
124 |     arma::rowvec mu_sq_over_theta = arma::square(mu) / theta_median;
125 |     arma::mat weight = M + arma::repmat(seq_depth_sq, 1, p) % arma::repmat(mu_sq_over_theta, n, 1);
126 |     weight.elem(arma::find_nonfinite(weight)).fill(1);
127 |     weight.elem(arma::find(weight <= 0)).fill(1);
128 | 
129 |     mu_beta = WLS_mean(D_mu, X, 1 / weight);
130 |     mu = mu_beta.row(0);
131 |     M = D_mu * mu_beta;
132 |     X_centered = X - M;
133 | 
134 |     arma::mat h = arma::square(arma::square(M) / theta_median + M);
135 |     h.elem(arma::find(h <= 0)).fill(1);
136 |     sigma2_beta = WLS_mean(D_sigma2, (arma::square(X_centered) - M), 1 / h);
137 |     sigma2 = sigma2_beta.row(0);
138 |     theta = arma::square(mu) / sigma2;
139 | 
140 |     arma::uvec valid_idx = arma::find(theta > 0 && theta_prev > 0);
141 |     arma::vec ratio = arma::log(theta.elem(valid_idx) / theta_prev.elem(valid_idx));
142 |     if (conv == "max") {
143 |       delta = arma::max(arma::abs(ratio));
144 |     } else if (conv == "q95") {
145 |       delta = arma_quantile(arma::abs(ratio), 0.95);;
146 |     }
147 |     iter++;
148 |     if (verbose) {
149 |       Rcpp::Rcout << "[INFO] " << iter << ": delta=" << delta << "\n";
150 |     }
151 |   }
152 | 
153 |   if (iter > n_iter && delta > eps) {
154 |     Rcpp::Rcout << "[WARNING] IRLS did not converge after " << n_iter << " iterations. \n Please increase the number of iterations or check your data.\n";
155 |   } else {
156 |     Rcpp::Rcout << "[INFO] IRLS converged after " << iter << " iterations.\n";
157 |   }
158 | 
159 | 
160 |   arma::rowvec mu_sq_over_theta = arma::square(mu) / arma::median(theta.elem(arma::find(theta > 0)));
161 |   arma::mat w_cov = M + arma::repmat(seq_depth_sq, 1, p) % arma::repmat(mu_sq_over_theta, n, 1);
162 |   w_cov.elem(arma::find_nonfinite(w_cov)).fill(1);
163 |   w_cov.elem(arma::find(w_cov <= 0)).fill(1);
164 |   X_centered = X - M;
165 | 
166 |   auto start = std::chrono::system_clock::now();
167 |   std::time_t start_time = std::chrono::system_clock::to_time_t(start);
168 |   Rcpp::Rcout << "[INFO] Starting WLS for covariance at " << std::ctime(&start_time);  // includes newline
169 |   auto tic = std::chrono::high_resolution_clock::now();
170 | 
171 |   // Covariance estimation
172 |   Rcpp::List covar_list = WLS_cov(D_sigma, X_centered, 1 / w_cov);
173 |   arma::cube covar_beta = Rcpp::as<arma::cube>(covar_list["cov_hat"]);
174 |   arma::mat covar = covar_beta.row(0);  // Extract k = 0 slice => p × p matrix
175 | 
176 |   // Statistical testing
177 |   arma::mat Sigma = M + arma::repmat(seq_depth_sq, 1, p) % arma::repmat(sigma2, n, 1);
178 |   arma::mat ele_inv_Sigma = 1 / Sigma;
179 | 
180 |   Rcpp::List ts_list = WLS_cov(D_sigma, X_centered, ele_inv_Sigma);
181 |   arma::vec sigma = arma::sqrt(arma::clamp(sigma2.t(), 0, arma::datum::inf));
182 |   //arma::cube covar_beta = Rcpp::as<arma::cube>(ts_list["cov_hat"]);
183 |   //arma::mat covar = covar_beta.row(0);  // Extract k = 0 slice => p × p matrix
184 |   arma::mat est = covar / (sigma * sigma.t());
185 |   arma::mat test_stat = ts_list["test_stat"];
186 |   arma::mat p_value = 2 * arma::normcdf(-arma::abs(test_stat));
187 | 
188 |   if (post_process) {
189 |     est = post_process_est(est);
190 |   }
191 | 
192 |   auto toc = std::chrono::high_resolution_clock::now();
193 |   std::chrono::duration<double> elapsed = toc - tic;
194 |   Rcpp::Rcout << "[INFO] Finished WLS. Elapsed time: " << elapsed.count() << " seconds.\n";
195 | 
196 |   Rcpp::List result = List::create(Named("est") = est,
197 |                                    Named("p_value") = p_value,
198 |                                    Named("test_stat") = test_stat);
199 | 
200 |   if (return_all) {
201 |     result["mu_beta"] = mu_beta;
202 |     result["sigma2_beta"] = sigma2_beta;
203 |     result["cov_beta"] = covar_beta;
204 |   }
205 | 
206 |   return result;
207 | }
208 | 
209 | 
210 | 


--------------------------------------------------------------------------------
/R/CSCORE_IRLS.R:
--------------------------------------------------------------------------------
  1 | #' Iteratively reweighted least squares (IRLS) procedure in CS-CORE
  2 | #'
  3 | #' This function implements the IRLS procedure used in CS-CORE for estimating and testing
  4 | #' cell-type-specific co-expression from single-cell RNA sequencing data.
  5 | #'
  6 | #' Let \eqn{x_{ij}} denote the UMI count of gene \eqn{j} in cell \eqn{i};
  7 | #' \eqn{s_i} denote the sequencing depth;
  8 | #' \eqn{\mu_j,\sigma_{jj}, \sigma_{jj'}} denote the mean, variance and covariance;
  9 | #' \eqn{c_{ik}} denote additional covariate \eqn{k} for cell \eqn{i} (e.g. disease status or cellular state).
 10 | #' The procedure consists of two main steps:
 11 | #' \enumerate{
 12 | #'   \item \strong{Mean and variance estimation:} Estimate gene-specific mean and variance parameters
 13 | #'         using two moment-based regressions:
 14 | #'     \itemize{
 15 | #'       \item \strong{Mean model:} \eqn{x_{ij} = s_i (\mu_j + \sum_k c_{ik} \beta_k) + \epsilon_{ij}}
 16 | #'       \item \strong{Variance model:} \eqn{(x_{ij} - s_i \mu_{ij})^2 = s_i \mu_{ij} + s_i^2 (\sigma_{jj} + \sum_k c_{ik} \gamma_k) + \eta_{ij}},
 17 | #'       where \eqn{\mu_{ij} = \mu_j + \sum_k c_{ik} \beta_k}
 18 | #'     }
 19 | #'
 20 | #'   \item \strong{Covariance estimation and hypothesis testing:} Estimate gene-gene covariance
 21 | #'         and compute test statistics to assess the statistical significance of gene co-expression using a third moment-based regression:
 22 | #'     \itemize{
 23 | #'       \item \strong{Covariance model:} \eqn{(x_{ij} - s_i \mu_{ij})(x_{ij'} - s_i \mu_{ij'}) = s_i^2 (\sigma_{jj'} + \sum_k c_{ik} \theta_k) + \xi_{ijj'}}
 24 | #'     }
 25 | #' }
 26 | #' We note that
 27 | #' \enumerate{
 28 | #'   \item The formulation above assumes that the covariates alter the mean / variance / covariance of underlying gene expression,
 29 | #'   rather than observed counts. If you believe that the covariates directly affect the observed counts independent of underlying gene expression
 30 | #'   (e.g. \eqn{x_{ij}|z_{ij} \sim \text{Poisson}(s_i z_{ij}+\sum_k c_{ik} \beta_k)} or \eqn{x_{ij} = s_i \mu_j  + \sum_k c_{ik} \beta_k + \epsilon_{ij}}),
 31 | #'   please specify \code{covariate_level="x"}.
 32 | #'   \item The original CS-CORE published in <https://doi.org/10.1038/s41467-023-40503-7>
 33 | #'   did not consider adjusting for covariates \eqn{c_{ik}}'s. This is equivalent to setting \code{covariates} to \code{NULL}.
 34 | #' }
 35 | #' @source Su, C., Xu, Z., Shan, X., Cai, B., Zhao, H., & Zhang, J. (2023).
 36 | #' Cell-type-specific co-expression inference from single cell RNA-sequencing data.
 37 | #' \emph{Nature Communications}.
 38 | #' doi: <https://doi.org/10.1038/s41467-023-40503-7>
 39 | #'
 40 | #' @param X A numeric matrix of UMI counts (\code{n x p}),
 41 | #'   where \code{n} is the number of cells and \code{p} is the number of genes.
 42 | #' @param seq_depth A numeric vector of sequencing depths of length \code{n}.
 43 | #' @param covariates Optional. A numeric matrix of covariates (\code{n x K}) to be adjusted for in the moment-based regressions.
 44 | #'   Can be a length n vector if \eqn{K=1}.
 45 | #'   If \code{NULL}, no covariates are adjusted for in the regression.
 46 | #'   Defaults to \code{NULL}.
 47 | #' @param post_process Optional. Logical; whether to rescale the estimated co-expressions to lie between –1 and 1.
 48 | #'   Defaults to \code{TRUE}.
 49 | #' @param covariate_level Optional. A character string indicating whether covariates are assumed to affect
 50 | #'   the underlying gene expression levels (\code{"z"}) or the observed counts (\code{"x"}).
 51 | #'   See the *Details* section for further explanation.
 52 | #'   Defaults to \code{"z"}.
 53 | #' @param adjust_setting Optional. A named logical vector of length 3;
 54 | #'   whether to adjust for covariates at the mean, variance, and covariance level.
 55 | #'   Must be named \code{c("mean", "var", "covar")}.
 56 | #'   Defaults to \code{c(mean = T,var = T, covar = T)}.
 57 | #' @param return_all Logical; whether to return all estimates, including the effect sizes for covariates.
 58 | #'   Defaults to \code{FALSE}.
 59 | #'
 60 | #' @return A list containing the following components:
 61 | #' \describe{
 62 | #'   \item{est}{A \eqn{p \times p} matrix of co-expression estimates.}
 63 | #'   \item{p_value}{A \eqn{p \times p} matrix of p-values.}
 64 | #'   \item{test_stat}{A \eqn{p \times p} matrix of test statistics evaluating the statistical significance of co-expression.}
 65 | #'   \item{mu_beta}{A \eqn{k \times p} matrix of regression coefficients from the mean model. Returned if \code{return_all = TRUE}.}
 66 | #'   \item{sigma2_beta}{A \eqn{k \times p} matrix of regression coefficients from the variance model. Returned if \code{return_all = TRUE}.}
 67 | #'   \item{cov_beta}{A \eqn{k \times p \times p} array of regression coefficients from the covariance model. Returned if \code{return_all = TRUE}.}
 68 | #' }
 69 | #' @keywords internal
 70 | #'
 71 | 
 72 | CSCORE_IRLS <- function(X, seq_depth,
 73 |                         covariates = NULL,
 74 |                         post_process = TRUE,
 75 |                         covariate_level = 'z',
 76 |                         adjust_setting = c('mean' = T, 'var' = T, 'covar' = T),
 77 |                         return_all = FALSE){
 78 |   if(is.null(seq_depth)){
 79 |     seq_depth = apply(X, 1, sum, na.rm = T)
 80 |   }
 81 |   if(nrow(X) != length(seq_depth)){
 82 |     stop('The length of the sequencing depth must match the number of cells.')
 83 |   }
 84 |   if(has_non_integer(X)){
 85 |     stop("CS-CORE takes the UMI count matrix as input. The input matrix X contains non-integer values.")
 86 |   }
 87 |   if(!is.null(covariates)){
 88 |     if(is.vector(covariates)){
 89 |       if(nrow(X) != length(covariates)) stop('The length of the covariate vector should match the number of cells.')
 90 |     }else{
 91 |       if(nrow(X) != nrow(covariates)) stop('The number of rows in the covariate matrix should match the number of cells.')
 92 |       message(sprintf('The design matrix for regression has %i columns', ncol(covariates) + 1))
 93 |     }
 94 |   }
 95 |   n_cell = nrow(X)
 96 |   n_gene = ncol(X)
 97 |   seq_depth_sq = seq_depth^2
 98 |   seq_2 = sum(seq_depth_sq)
 99 |   seq_4 = sum(seq_depth^4)
100 |   # Construct design matrix
101 |   if(is.null(covariates)){
102 |     D <- matrix(rep(1, n_cell), ncol = 1)
103 |   }else{
104 |     D <- cbind(1, covariates)
105 |   }
106 |   D_mu <- set_D(seq_depth, D, adjust_setting['mean'], covariate_level)
107 |   D_sigma2 <- set_D(seq_depth_sq, D, adjust_setting['var'], covariate_level)
108 |   D_sigma <- set_D(seq_depth_sq, D, adjust_setting['covar'], covariate_level)
109 |   # initialize with OLS
110 |   mu_beta <- WLS_mean(D_mu, X, matrix(1, nrow = n_cell, ncol = n_gene))
111 |   M <- D_mu %*% mu_beta
112 |   mu <- mu_beta[1,]
113 |   X_centered = X - M
114 |   sigma2_beta <- WLS_mean(D_sigma2, X_centered^2 - M, matrix(1, nrow = n_cell, ncol = n_gene))
115 |   sigma2 <- sigma2_beta[1,]
116 |   theta = mu^2/sigma2
117 |   j = 0
118 |   delta = Inf
119 | 
120 |   # IRLS for estimating mean and variance parameters
121 |   while( delta > 0.05 & j <= 10 ){
122 |     theta_previous = theta
123 |     theta_median = stats::quantile(theta[theta > 0], na.rm = T, probs = 0.5)
124 |     theta[theta < 0] = Inf
125 |     w = M + outer(seq_depth_sq, mu^2/theta_median)
126 |     w[is.na(w)|w <= 0] = 1
127 |     mu_beta <- WLS_mean(D_mu, X, 1/w)
128 |     mu <- mu_beta[1,]
129 |     M <- D_mu %*% mu_beta
130 |     X_centered = X - M
131 |     h = (M^2/theta_median + M)^2
132 |     h[h <= 0] = 1
133 |     sigma2_beta <- WLS_mean(D_sigma2, X_centered^2 - M, 1/h)
134 |     sigma2 <- sigma2_beta[1,]
135 |     theta = mu^2/sigma2
136 |     j = j+1
137 |     delta = max(abs(log((theta/theta_previous)[theta > 0 & theta_previous > 0])), na.rm = T)
138 |   }
139 |   if(j == 10 & delta > 0.05){
140 |     print('IRLS failed to converge after 10 iterations. Please check your data.')
141 |   }else{
142 |     print(sprintf('IRLS converged after %i iterations.', j))
143 |   }
144 | 
145 |   # Update the weights for estimating covariance
146 |   theta_median = stats::quantile(theta[theta > 0], na.rm = T, probs = 0.5)
147 |   theta[theta < 0] = Inf
148 |   w = M + outer(seq_depth_sq, mu^2/theta_median)
149 |   w[is.na(w)|w <= 0] = 1
150 | 
151 |   X_centered <- X - M
152 |   covar <- WLS_cov(D_sigma, X_centered, 1/w)$cov_hat[1,,]
153 |   # Evaluate test statistics and p values
154 |   Sigma <- M + outer(seq_depth_sq, sigma2)
155 |   ele_inv_Sigma <- 1/Sigma
156 |   ts_res <- WLS_cov(D_sigma, X_centered, ele_inv_Sigma)
157 |   est <- ts_res$cov_hat[1,,]
158 |   p_value <- 2 * stats::pnorm(abs(ts_res$test_stat), lower.tail = F)
159 | 
160 |   # Evaluate co-expression estimates
161 |   neg_gene_inds <- which(sigma2 < 0)
162 |   sigma2[neg_gene_inds] <- 0
163 |   sigma <- sqrt(sigma2)
164 |   diag(covar) <- sigma
165 |   est <- covar/outer(sigma, sigma)
166 |   test_stat <- ts_res$test_stat
167 | 
168 |   # Post-process the co-expression estimates
169 |   #diag(est) <- 1
170 |   if(post_process) est <- post_process_est(est)
171 |   rownames(est) <- colnames(est) <- rownames(p_value) <- colnames(p_value) <- rownames(test_stat) <- colnames(test_stat) <- colnames(X)
172 |   result_list <- list(est = est, p_value = p_value, test_stat = test_stat)
173 |   if(return_all){
174 |     result_list$mu_beta <- mu_beta
175 |     result_list$sigma2_beta <- sigma2_beta
176 |     result_list$cov_beta <- ts_res$cov_hat
177 |   }
178 |   return(result_list)
179 | }
180 | 
181 | 


--------------------------------------------------------------------------------
/vignettes/CSCORE.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "CS-CORE for cell-type-specific co-expression analysis"
  3 | output: rmarkdown::html_vignette
  4 | vignette: >
  5 |   %\VignetteIndexEntry{CS-CORE for cell-type-specific co-expression analysis}
  6 |   %\VignetteEngine{knitr::rmarkdown}
  7 |   %\VignetteEncoding{UTF-8}
  8 | ---
  9 | 
 10 | ```{r, include = FALSE}
 11 | knitr::opts_chunk$set(
 12 |   collapse = TRUE,
 13 |   comment = "#>"
 14 | )
 15 | ```
 16 | 
 17 | This vignette shows an example of applying `CS-CORE` to infer cell-type-specific co-expression networks and extracting co-expressed gene modules that are enriched for biological functions in cell types.
 18 | 
 19 | # 1. Load packages and data
 20 | 
 21 | ```{r setup, warning = FALSE, message = FALSE}
 22 | library(CSCORE)
 23 | library(Seurat)
 24 | ```
 25 | 
 26 | In this vignette, we use the single cell RNA-sequencing data on Peripheral blood mononuclear cells (PBMC) from COVID patients and healthy controls from [Wilk et al.](https://www.nature.com/articles/s41591-020-0944-y), which were also studied in our [paper](https://www.nature.com/articles/s41467-023-40503-7). This data set can be downloaded via the following bash script 
 27 | 
 28 | ```{r, engine = 'bash', eval = FALSE}
 29 | wget https://hosted-matrices-prod.s3-us-west-2.amazonaws.com/Single_cell_atlas_of_peripheral_immune_response_to_SARS_CoV_2_infection-25/blish_covid.seu.rds
 30 | ```
 31 | 
 32 | After downloading blish_covid.seu.rds, we load it into the R session
 33 | ```{r eval = FALSE}
 34 | pbmc <- readRDS('blish_covid.seu.rds')
 35 | pbmc <- UpdateSeuratObject(pbmc) # update the obsolete Seurat object
 36 | ```
 37 | 
 38 | # 2. Select cell types and gene sets to study
 39 | 
 40 | In this example, we focus on B cells and infer the B cell–specific co-expression network. In general, we strongly recommend subsetting to the cell type of interest before inferring co-expression. Otherwise, the estimates may be affected by [Simpson's paradox](https://pmc.ncbi.nlm.nih.gov/articles/PMC4579334/#:~:text=(A)%20Simpson's%20Paradox%20describes%20), capturing associations driven by differences between cell types (e.g.correlations between cell type marker genes), rather than gene-gene co-expression within a homogeneous cell population.
 41 | 
 42 | ```{r eval = FALSE}
 43 | pbmc_B = pbmc[,pbmc$cell.type.coarse %in% 'B']
 44 | ```
 45 | 
 46 | Depending on the biological question of interest, one may choose to study the co-expression network for any gene set. Here, we chose to infer the co-expression network for the genes with meaningful expression levels in B cells (top 5000 among 26361 genes). There are several reasons for our choice: 
 47 | 
 48 | 1. All genes with moderate to high expression levels provides a comprehensive and unbiased set of genes that could have meaningful biological functions in a cell type.
 49 | 
 50 | 2. If the genes have much lower expression levels, it would be statistically more challenging and biologically less interesting to infer their co-expressions, as these genes might have almost all UMI counts equal to 0. 
 51 | 
 52 | In general, it will be up to the users's choice to select the gene sets to study. We recommend choosing the gene sets that are of interest to your application.
 53 | 
 54 | ```{r eval = FALSE}
 55 | mean_exp = rowMeans(pbmc_B@assays$RNA@counts/pbmc_B$nCount_RNA)
 56 | genes_selected = names(sort.int(mean_exp, decreasing = T))[1:5000]
 57 | ```
 58 | 
 59 | 
 60 | # 3. Run `CS-CORE` to infer cell-type-specific co-expression network on the specified gene set
 61 | 
 62 | We further subset the B cells to those from healthy control subjects in order to study B-cell specific co-expression network among healthy control B cells. 
 63 | 
 64 | ```{r eval = FALSE}
 65 | pbmc_B_healthy <- pbmc_B[, pbmc_B$Status == "Healthy"]
 66 | ```
 67 | 
 68 | Run `CS-CORE` with the subsetted Seurat object and a gene set of interest. We note that CSCORE operates on the raw UMI counts, i.e. `object[['RNA']]@counts` for the Seurat object `object`.
 69 | 
 70 | ```{r eval = FALSE}
 71 | CSCORE_result <- CSCORE(pbmc_B_healthy, genes = genes_selected)
 72 | # faster yet more memory intensive:
 73 | # CSCORE_result <- CSCORE(pbmc_B_healthy, genes = genes_selected, IRLS_version = 'bash_R')
 74 | ```
 75 | 
 76 | Since version 1.0.1 (updated June 2025), CSCORE provides a new Rcpp implementation of moment-based regressions that offers improved memory efficiency and supports covariate adjustment (see [Covariate adjustment](covariate_adjustment.html)). The original implementation at the time of publication (version 0.0.0.9) was written in base R, and while it is approximately 10 times faster, it is also significantly more memory intensive (by a factor of 10–100). We recommend choosing between the two implementations based on available memory resources and the need for computational speed and covariate adjustment.
 77 | 
 78 | # 4. Downstream analysis on the co-expression network
 79 | 
 80 | ## 4.1 Extract co-expressed gene module
 81 | 
 82 | Given the `CS-CORE` $p$-values, we first set co-expressions that are not statistically significant (Benjamini \& Hochberg-adjusted $p$-values $>0.05$) to 0.
 83 | 
 84 | ```{r eval = FALSE}
 85 | # Obtain CS-CORE co-expression estimates
 86 | CSCORE_coexp <- CSCORE_result$est
 87 | 
 88 | # Obtain BH-adjusted p values
 89 | CSCORE_p <- CSCORE_result$p_value
 90 | p_matrix_BH = matrix(0, length(genes_selected), length(genes_selected))
 91 | p_matrix_BH[upper.tri(p_matrix_BH)] = p.adjust(CSCORE_p[upper.tri(CSCORE_p)], method = "BH")
 92 | p_matrix_BH <- p_matrix_BH + t(p_matrix_BH)
 93 | 
 94 | # Set co-expression entires with BH-adjusted p-values greater than 0.05 to 0
 95 | CSCORE_coexp[p_matrix_BH > 0.05] <- 0
 96 | ```
 97 | 
 98 | Next, based on the thresholded co-expression matrix, we apply [WGCNA](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-9-559) to extract co-expressed gene modules. In particular, we use `CS-CORE` estimates to measure co-expressions for single cell RNA-sequencing data, which replace the Pearson correlations used in traditional WGNCA workflow, that suffer from inflated false positives and attenuation bias on single cell data as demonstrated in our [manuscript](https://www.biorxiv.org/content/10.1101/2022.12.13.520181v1).
 99 | 
100 | ```{r eval = FALSE}
101 | if (!require(WGCNA)) {
102 |   install.packages("WGCNA")
103 |   library(WGCNA)
104 | }else{
105 |   library(WGCNA)
106 | }
107 | ```
108 | 
109 | ```{r eval = FALSE}
110 | # Compute the adjacency matrix based on the co-expression matrix
111 | adj = WGCNA::adjacency.fromSimilarity(abs(CSCORE_coexp), power = 1)
112 | # Compute the topological overlap matrix
113 | TOM = WGCNA::TOMsimilarity(adj)
114 | dissTOM = 1-TOM
115 | rownames(dissTOM) <- colnames(dissTOM) <- genes_selected
116 | # Run hierarchical clustering as in the WGCNA workflow
117 | hclust_dist = hclust(as.dist(dissTOM), method = "average") 
118 | memb = dynamicTreeCut::cutreeDynamic(dendro = hclust_dist, 
119 |                      distM = dissTOM, 
120 |                      deepSplit = 2,
121 |                      pamRespectsDendro = FALSE,
122 |                      minClusterSize = 10)
123 | # For more instructions on how to tune the parameters in the WGCNA workflow,
124 | # please refer to https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA/Tutorials/
125 | 
126 | names(memb) = genes_selected
127 | memb_tab <- table(memb)
128 | module_list = lapply(sort(unique(memb)), function(i_k) names(which(memb == i_k)))
129 | ```
130 | One can also apply other clustering methods to extract co-expressed gene modules.
131 | 
132 | ## 4.2 Functional enrichment analysis
133 | 
134 | Gene Ontology (GO) enrichment analysis is one of the common downstream functional enrichment analyses for interpreting the biological pathways implied by co-expressed gene modules. Here, we showcase the GO pathways enriched in `CS-CORE` co-expressed gene modules using the R implementation from [Wu et al.](https://pubmed.ncbi.nlm.nih.gov/34557778/).
135 | 
136 | ```{r eval = FALSE}
137 | if (!require(clusterProfiler)) {
138 |   BiocManager::install("clusterProfiler")
139 |   library(clusterProfiler)
140 | }else{
141 |   library(clusterProfiler)
142 | }
143 | ```
144 | 
145 | ```{r eval = FALSE}
146 | # Set all genes in clustering analysis as background,
147 | # such that the enrichment result of any module is not attributed to its high expression levels.
148 | universe <- genes_selected
149 | 
150 | # Filter GO terms based on BH-adjusted p values < 0.05
151 | ####
152 | ## Note: the following codes can take a long time to run as 
153 | ## in this example there are more than 100 co-expressed gene modules from WGCNA
154 | ####
155 | ego_result <- lapply(1:length(module_list), function(i){
156 |   enrichGO(gene = module_list[[i]],
157 |          OrgDb = 'org.Hs.eg.db', # human
158 |          keyType = "SYMBOL",
159 |          ont = "ALL",
160 |          pAdjustMethod = "BH",
161 |          universe = universe,
162 |          pvalueCutoff = 0.05)
163 | })
164 | ```
165 | 
166 | There are in total 144 gene modules inferred by WGCNA. For illustrative purposes, we focus on the modules with the strongest enrichment signals (with at least one GO term having adjusted $p$-value smaller than $10^{-3}$ and with at least 10 enriched GO terms) and print the top 3 GO terms.
167 | 
168 | ```{r eval = FALSE}
169 | top_enrich_clusters <- which(sapply(ego_result, function(x) 
170 |   (x@result$p.adjust[1] < 0.001) & (dim(x)[1]>10)))
171 | top_enrich_go <- lapply(top_enrich_clusters, function(i) ego_result[[i]]@result[1:3,])
172 | ```
173 | 
174 | ```{r}
175 | for(i in 1:length(top_enrich_go)){
176 |   print(top_enrich_go[[i]][, c('Description', 'GeneRatio', 'p.adjust')])
177 |   cat('\n')
178 | }
179 | ```
180 | 
181 | At this point, we have reproduced the results in our [manuscript](https://doi.org/10.1038/s41467-023-40503-7), Table S6.
182 | 
183 | This concludes our vignette of using `CS-CORE` to infer cell-type-specific co-expression networks and a typical pipeline for extracting co-expressed gene modules and performing functional enrichment analysis.
184 | 
185 | One can also perform a differential co-expression analysis based on the codes provided here. For example, the inferred network of healthy B cells can be constrasted to the network inferred with B cells from the COVID-19 patients to study dysregulation in B cells' co-expression due to COVID-19 infection. For more details please refer the the methods in our [manuscript](https://doi.org/10.1038/s41467-023-40503-7).
186 | 
187 | **Stay tuned!** We are also working on a pipeline for cell-type-specific module-trait association analyses with single cell RNA-seq data based on `CS-CORE` developed here.
188 | 
189 | 
190 | 


--------------------------------------------------------------------------------
/R/CSCORE_IRLS_cpp.R:
--------------------------------------------------------------------------------
  1 | #' Iteratively reweighted least squares (IRLS) procedure in CS-CORE (Rcpp)
  2 | #'
  3 | #' Estimate and test cell-type-specific co-expression using the IRLS procedure with optional covariate adjustment.
  4 | #'
  5 | #' Let \eqn{x_{ij}} denote the UMI count of gene \eqn{j} in cell \eqn{i};
  6 | #' \eqn{s_i} denote the sequencing depth;
  7 | #' \eqn{\mu_j,\sigma_{jj}, \sigma_{jj'}} denote the mean, variance and covariance;
  8 | #' \eqn{c_{ik}} denote additional covariate \eqn{k} for cell \eqn{i} (e.g. disease status or cellular state).
  9 | #' The procedure consists of two main steps:
 10 | #' \enumerate{
 11 | #'   \item \strong{Mean and variance estimation:} Estimate gene-specific mean and variance parameters
 12 | #'         using two moment-based regressions:
 13 | #'     \itemize{
 14 | #'       \item \strong{Mean model:} \eqn{x_{ij} = s_i (\mu_j + \sum_k c_{ik} \beta_k) + \epsilon_{ij}}
 15 | #'       \item \strong{Variance model:} \eqn{(x_{ij} - s_i \mu_{ij})^2 = s_i \mu_{ij} + s_i^2 (\sigma_{jj} + \sum_k c_{ik} \gamma_k) + \eta_{ij}},
 16 | #'       where \eqn{\mu_{ij} = \mu_j + \sum_k c_{ik} \beta_k}
 17 | #'     }
 18 | #'
 19 | #'   \item \strong{Covariance estimation and hypothesis testing:} Estimate gene-gene covariance
 20 | #'         and compute test statistics to assess the statistical significance of gene co-expression using a third moment-based regression:
 21 | #'     \itemize{
 22 | #'       \item \strong{Covariance model:} \eqn{(x_{ij} - s_i \mu_{ij})(x_{ij'} - s_i \mu_{ij'}) = s_i^2 (\sigma_{jj'} + \sum_k c_{ik} \theta_k) + \xi_{ijj'}}
 23 | #'     }
 24 | #' }
 25 | #' We note that
 26 | #' \enumerate{
 27 | #'   \item The formulation above assumes that the covariates alter the mean / variance / covariance of underlying gene expression,
 28 | #'   rather than observed counts. If you believe that the covariates directly affect the observed counts independent of underlying gene expression
 29 | #'   (e.g. \eqn{x_{ij}|z_{ij} \sim \text{Poisson}(s_i z_{ij}+\sum_k c_{ik} \beta_k)} or \eqn{x_{ij} = s_i \mu_j  + \sum_k c_{ik} \beta_k + \epsilon_{ij}}),
 30 | #'   please specify \code{covariate_level="x"}.
 31 | #'   \item The original CS-CORE published in <https://doi.org/10.1038/s41467-023-40503-7>
 32 | #'   did not consider adjusting for covariates \eqn{c_{ik}}'s. This is equivalent to setting \code{covariates} to \code{NULL}.
 33 | #' }
 34 | #' Note: This is an R wrapper for the `CSCORE_IRLS_cpp_impl()` function implemented in Rcpp.
 35 | #'
 36 | #' @source Su, C., Xu, Z., Shan, X., Cai, B., Zhao, H., & Zhang, J. (2023).
 37 | #' Cell-type-specific co-expression inference from single cell RNA-sequencing data.
 38 | #' \emph{Nature Communications}.
 39 | #' doi: <https://doi.org/10.1038/s41467-023-40503-7>
 40 | #'
 41 | #' @param X A numeric matrix of UMI counts (\code{n x p}),
 42 | #'   where \code{n} is the number of cells and \code{p} is the number of genes.
 43 | #' @param seq_depth A numeric vector of sequencing depths of length \code{n}.
 44 | #' @param covariates Optional. A numeric matrix of covariates (\code{n x K}) to be adjusted for in the moment-based regressions.
 45 | #'   Can be a length n vector if \eqn{K=1}.
 46 | #'   If \code{NULL}, no covariates are adjusted for in the regression.
 47 | #'   Defaults to \code{NULL}.
 48 | #' @param post_process Optional. Logical; whether to rescale the estimated co-expressions to lie between –1 and 1.
 49 | #'   Defaults to \code{TRUE}.
 50 | #' @param covariate_level Optional. A character string indicating whether covariates are assumed to affect
 51 | #'   the underlying gene expression levels (\code{"z"}) or the observed counts (\code{"x"}).
 52 | #'   See the *Details* section for further explanation.
 53 | #'   Defaults to \code{"z"}.
 54 | #' @param adjust_setting Optional. A named logical vector of length 3;
 55 | #'   whether to adjust for covariates at the mean, variance, and covariance level.
 56 | #'   Must be named \code{c("mean", "var", "covar")}.
 57 | #'   Defaults to \code{c(mean = T,var = T, covar = T)}.
 58 | #' @param IRLS_par Optional. A named list of length 4 specifying parameters for the IRLS algorithm:
 59 | #'   \describe{
 60 | #'     \item{\code{n_iter}}{Maximum number of iterations.}
 61 | #'     \item{\code{eps}}{Convergence threshold for log-ratio change \code{delta}, computed as \code{abs(log(theta / theta_prev))}.}
 62 | #'     \item{\code{verbose}}{Logical; whether to print the convergence metric (\code{delta}) at each iteration.}
 63 | #'     \item{\code{conv}}{Character string; determine convergence based on \code{q95} (0.95 quantile) or \code{max} of \code{delta}.}
 64 | #'   }
 65 | #'   Defaults to \code{list(n_iter = 10, eps = 0.05, verbose = FALSE, cov = "q95")}.
 66 | #' @param return_all Logical; whether to return all estimates, including the effect sizes for covariates.
 67 | #'   Defaults to \code{FALSE}.
 68 | #'
 69 | #' @return A list containing the following components:
 70 | #' \describe{
 71 | #'   \item{est}{A \eqn{p \times p} matrix of co-expression estimates.}
 72 | #'   \item{p_value}{A \eqn{p \times p} matrix of p-values.}
 73 | #'   \item{test_stat}{A \eqn{p \times p} matrix of test statistics evaluating the statistical significance of co-expression.}
 74 | #'   \item{mu_beta}{A \eqn{k \times p} matrix of regression coefficients from the mean model. Returned if \code{return_all = TRUE}.}
 75 | #'   \item{sigma2_beta}{A \eqn{k \times p} matrix of regression coefficients from the variance model. Returned if \code{return_all = TRUE}.}
 76 | #'   \item{cov_beta}{A \eqn{k \times p \times p} array of regression coefficients from the covariance model. Returned if \code{return_all = TRUE}.}
 77 | #' }
 78 | #' @export
 79 | #'
 80 | #' @examples
 81 | #' ## Toy example:
 82 | #' ## run CSCORE on a simulated independent gene pair
 83 | #' cscore_example <- CSCORE_IRLS_cpp(ind_gene_pair$counts, ind_gene_pair$seq_depths)
 84 | #'
 85 | #' ## Estimated co-expression between two genes
 86 | #' cscore_example$est[1,2]
 87 | #' # close to 0: 0.007820124
 88 | #'
 89 | #' ## p-values
 90 | #' cscore_example$p_value[1,2]
 91 | #' # not significant: 0.961981
 92 | #'
 93 | 
 94 | CSCORE_IRLS_cpp <- function(X,
 95 |                             seq_depth,
 96 |                             covariates = NULL,
 97 |                             post_process = TRUE,
 98 |                             covariate_level = 'z',
 99 |                             adjust_setting = c('mean' = T, 'var' = T, 'covar' = T),
100 |                             IRLS_par = list('n_iter' = 10, 'eps' = 0.05, 'verbose' = FALSE, conv = 'q95'),
101 |                             return_all = FALSE) {
102 |   if(is.null(seq_depth)){
103 |     seq_depth = apply(X, 1, sum, na.rm = T)
104 |   }
105 |   if(nrow(X) != length(seq_depth)){
106 |     stop('[ERROR] The length of the sequencing depth must match the number of cells.')
107 |   }
108 |   if(! all.equal(names(IRLS_par), c('n_iter', 'eps', 'verbose', 'conv'))){
109 |     stop('[ERROR] IRLS_par must be a named list with integer n_iter, double eps, and logical verbose.')
110 |   }
111 |   if(has_non_integer(X)){
112 |     stop("[ERROR] CS-CORE takes the UMI count matrix as input. The input matrix X contains non-integer values.")
113 |   }
114 |   if(!is.null(covariates)){
115 |     if(is.vector(covariates)){
116 |       if(nrow(X) != length(covariates)) stop('[ERROR] The length of the covariate vector should match the number of cells.')
117 |     }else{
118 |       if(nrow(X) != nrow(covariates)) stop('[ERROR] The number of rows in the covariate matrix should match the number of cells.')
119 |       #cat(sprintf('[INFO] The design matrix for regression has %i columns', ncol(covariates) + 1))
120 |     }
121 |   }
122 |   if (length(adjust_setting) != 3) stop("[ERROR] adjust_setting must be of length 3.")
123 | 
124 |   # Construct design matrix
125 |   n_cell <- nrow(X)
126 |   if(is.null(covariates)){
127 |     D <- matrix(rep(1, n_cell), ncol = 1)
128 |   }else{
129 |     D <- cbind(1, covariates)
130 |   }
131 |   seq_depth_sq <- seq_depth^2
132 |   D_mu <- set_D(seq_depth, D, adjust_setting['mean'], covariate_level)
133 |   D_sigma2 <- set_D(seq_depth_sq, D, adjust_setting['var'], covariate_level)
134 |   D_sigma <- set_D(seq_depth_sq, D, adjust_setting['covar'], covariate_level)
135 | 
136 |   res <- .Call(`_CSCORE_CSCORE_IRLS_cpp_impl`,
137 |                X, seq_depth_sq, D_mu, D_sigma2, D_sigma, # regression parameters
138 |                post_process, # post processing
139 |                IRLS_par[['n_iter']], IRLS_par[['eps']], IRLS_par[['verbose']], IRLS_par[['conv']], # IRLS parameters
140 |                return_all)
141 | 
142 |   # Assign row/colnames to square matrices
143 |   gene_names <- colnames(X)
144 |   rownames(res$est) <- colnames(res$est) <- gene_names
145 |   rownames(res$p_value) <- colnames(res$p_value) <- gene_names
146 |   rownames(res$test_stat) <- colnames(res$test_stat) <- gene_names
147 | 
148 |   return(res)
149 | }
150 | 
151 | #' Set the design matrix for moment-based regressions
152 | #'
153 | #'
154 | #' @param s A numeric vector of sequencing depths (for mean regression) or squared sequencing depths (for variance and covariance)
155 | #' @param D A numeric matrix of intercept and covariates (\code{n x K})
156 | #' @param adjust_setting Logical; whether to adjust for covariates
157 | #' @param covariate_level A character string indicating whether covariates are assumed to affect
158 | #'   the underlying gene expression levels (\code{"z"}) or the observed counts (\code{"x"}).
159 | #'
160 | #' @return Design matrix (n by K) for moment-based regressions
161 | #'
162 | #' @keywords internal
163 | #'
164 | set_D <- function(s, D, adjust_setting, covariate_level){
165 |   if(!adjust_setting){
166 |     return(matrix(s, ncol = 1))
167 |   }else{
168 |     if(covariate_level == 'z'){
169 |       return(s * D)
170 |     }else if(covariate_level == 'x'){
171 |       return(cbind(s, D[,-1]))
172 |     }
173 |   }
174 | }
175 | 
176 | 
177 | #' Check for non-integer values in a matrix
178 | #'
179 | #' This function checks whether a numeric matrix contains any non-integer values.
180 | #'
181 | #' @param mat A numeric matrix.
182 | #'
183 | #' @return A logical value: \code{TRUE} if any element is non-integer, otherwise \code{FALSE}.
184 | #'
185 | #' @keywords internal
186 | #'
187 | has_non_integer <- function(mat) {
188 |   for (val in mat) {
189 |     if (abs(val - round(val)) > .Machine$double.eps^0.5) return(TRUE)
190 |   }
191 |   return(FALSE)
192 | }
193 | 
194 | 
195 | #' Post-process IRLS estimates
196 | #'
197 | #' The IRLS procedure does not guarantee the variance estimates to be postive nor the co-expression parameters to be bounded.
198 | #' To address this, this function evaluates the percentage of genes with negative variance estimates;
199 | #' sets the their co-expressions to 0 as these genes do not have sufficient biological variations.
200 | #' This function also evaluates the percentage of gene pairs with out-of-bound co-expression estimates;
201 | #' sets the co-expressions greater than 1 to 1; set the co-expressions smaller than -1 to -1.
202 | #'
203 | #' @param est Estimated co-expression matrix from IRLS
204 | #'
205 | #' @return Post-processed correlation matrix
206 | #'
207 | #' @export
208 | #'
209 | post_process_est <- function(est){
210 |   p <- nrow(est)
211 |   # Post-process CS-CORE estimates
212 |   neg_gene_inds <- which(sapply(diag(est), function(x) is.infinite(x) | is.na(x)))
213 |   #which(is.infinite(diag(est)))
214 |   if(length(neg_gene_inds) > 0){
215 |     print(sprintf('%i among %i genes have negative variance estimates. Their co-expressions with other genes were set to 0.',
216 |                   length(neg_gene_inds), p))
217 |   }
218 |   # Negative variances suggest insufficient biological variation,
219 |   # and also lack of correlation
220 |   est[neg_gene_inds, ] <- 0
221 |   est[, neg_gene_inds] <- 0
222 |   # Set all diagonal values to 1
223 |   diag(est) <- 1
224 |   # Gene pairs with out-of-bound estimates
225 |   print(sprintf('%.4f%% co-expression estimates were greater than 1 and were set to 1.',
226 |                 mean(est[upper.tri(est)] > 1, na.rm = T) * 100))
227 |   print(sprintf('%.4f%% co-expression estimates were smaller than -1 and were set to -1.',
228 |                 mean(est[upper.tri(est)] < -1, na.rm = T) * 100))
229 |   est[est > 1] <- 1
230 |   est[est < -1] <- -1
231 |   return(est)
232 | }
233 | 
234 | 


--------------------------------------------------------------------------------