├── .Rbuildignore ├── .github └── workflows │ └── check-bioc.yml ├── .gitignore ├── DESCRIPTION ├── NAMESPACE ├── NEWS ├── R ├── aggregateData.R ├── calcExprFreqs.R ├── data.R ├── mmDS.R ├── pbDS.R ├── pbFlatten.R ├── pbHeatmap.R ├── pbMDS.R ├── prepSCE.R ├── prepSim.R ├── resDS.R ├── simData.R ├── stagewiseDD.R ├── utils-mmDS.R ├── utils-pbDS.R ├── utils-simData.R ├── utils.R └── validity-checks.R ├── README.md ├── data └── example_sce.rda ├── inst └── extdata │ ├── 1a.png │ ├── 1b.png │ ├── 1d.png │ ├── muscat.png │ └── refs.bib ├── man ├── aggregateData.Rd ├── calcExprFreqs.Rd ├── data.Rd ├── mmDS.Rd ├── pbDS.Rd ├── pbFlatten.Rd ├── pbHeatmap.Rd ├── pbMDS.Rd ├── prepSCE.Rd ├── prepSim.Rd ├── resDS.Rd ├── simData.Rd └── stagewise_DS_DD.Rd ├── tests ├── testthat.R └── testthat │ ├── test-aggregateData.R │ ├── test-calcExprFreqs.R │ ├── test-mmDS.R │ ├── test-pbDS.R │ ├── test-pbHeatmap.R │ ├── test-pbMDS.R │ ├── test-prepSCE.R │ ├── test-prepSim.R │ ├── test-resDS.R │ ├── test-simData.R │ ├── test-utils.R │ ├── test-validityChecks.R │ └── test_utils-simData.R └── vignettes ├── analysis.Rmd ├── detection.Rmd └── simulation.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^\.git$ 2 | ^\.github$ 3 | ^\.gitignore$ 4 | -------------------------------------------------------------------------------- /.github/workflows/check-bioc.yml: -------------------------------------------------------------------------------- 1 | ## Read more about GitHub actions the features of this GitHub Actions workflow 2 | ## at https://lcolladotor.github.io/biocthis/articles/biocthis.html#use_bioc_github_action 3 | ## 4 | ## For more details, check the biocthis developer notes vignette at 5 | ## https://lcolladotor.github.io/biocthis/articles/biocthis_dev_notes.html 6 | ## 7 | ## You can add this workflow to other packages using: 8 | ## > biocthis::use_bioc_github_action() 9 | ## 10 | ## Using GitHub Actions exposes you to many details about how R packages are 11 | ## compiled and installed in several operating system.s 12 | ### If you need help, please follow the steps listed at 13 | ## https://github.com/r-lib/actions#where-to-find-help 14 | ## 15 | ## If you found an issue specific to biocthis's GHA workflow, please report it 16 | ## with the information that will make it easier for others to help you. 17 | ## Thank you! 18 | 19 | ## Acronyms: 20 | ## * GHA: GitHub Action 21 | ## * OS: operating system 22 | 23 | on: 24 | push: 25 | pull_request: 26 | 27 | name: R-CMD-check-bioc 28 | 29 | ## These environment variables control whether to run GHA code later on that is 30 | ## specific to testthat, covr, and pkgdown. 31 | ## 32 | ## If you need to clear the cache of packages, update the number inside 33 | ## cache-version as discussed at https://github.com/r-lib/actions/issues/86. 34 | ## Note that you can always run a GHA test without the cache by using the word 35 | ## "/nocache" in the commit message. 36 | env: 37 | has_testthat: 'true' 38 | run_covr: 'false' 39 | run_pkgdown: 'false' 40 | has_RUnit: 'false' 41 | cache-version: 'cache-v1' 42 | run_docker: 'false' 43 | bioc_version: 'bioc-release' 44 | ## Valid options are: 45 | ## "bioc-release" 46 | ## "bioc-devel" 47 | ## or a specific number like "3.20" 48 | 49 | jobs: 50 | bioc-config: 51 | runs-on: ubuntu-latest 52 | outputs: 53 | matrix: ${{ steps.set-bioc-matrix.outputs.matrix }} 54 | steps: 55 | ## Adapted from 56 | ## https://runs-on.com/github-actions/the-matrix-strategy/#dynamic-matrix-generation 57 | - id: set-bioc-matrix 58 | run: | 59 | bioc=$(curl -L https://bioconductor.org/config.yaml) 60 | if [[ "$bioc_version" == "bioc-release" ]]; then 61 | echo "Finding the latest BioC release version and the corresponding R version" 62 | biocversion=$(echo "$bioc" | grep "release_version: " | grep -Eo "[0-9]{1}\.[0-9]{2}") 63 | rversion=$(echo "$bioc" | grep "r_version_associated_with_release: " | grep -Eo "[0-9]{1}\.[0-9]{1}") 64 | biocmajor=$(echo "$biocversion" | cut -c 1-1) 65 | biocminor=$(echo "$biocversion" | cut -c 3-4) 66 | bioccont=$(echo "bioconductor/bioconductor_docker:RELEASE_${biocmajor}_${biocminor}") 67 | elif [[ "$bioc_version" == "bioc-devel" ]]; then 68 | echo "Finding the latest BioC devel version and the corresponding R version" 69 | biocversion=$(echo "$bioc" | grep "devel_version: " | grep -Eo "[0-9]{1}\.[0-9]{2}") 70 | rversion_release=$(echo "$bioc" | grep "r_version_associated_with_release: " | grep -Eo "[0-9]{1}\.[0-9]{1}") 71 | rversion_devel=$(echo "$bioc" | grep "r_version_associated_with_devel: " | grep -Eo "[0-9]{1}\.[0-9]{1}") 72 | if [[ "$rversion_release" == "$rversion_devel" ]]; then 73 | rversion=$(echo "$rversion_devel") 74 | else 75 | rversion="devel" 76 | fi 77 | bioccont="bioconductor/bioconductor_docker:devel" 78 | else 79 | echo "Finding the the R version for bioc version ${bioc_version}" 80 | biocversion=$(echo "$bioc_version") 81 | rversion=$(echo "$bioc" | sed -En "/r_ver_for_bioc_ver/,/release_dates/p" | grep "$bioc_version\":" | grep -Eo ": \"[0-9]{1}\.[0-9]{1}" | grep -Eo "[0-9]{1}\.[0-9]{1}") 82 | biocmajor=$(echo "$biocversion" | cut -c 1-1) 83 | biocminor=$(echo "$biocversion" | cut -c 3-4) 84 | bioccont=$(echo "bioconductor/bioconductor_docker:RELEASE_${biocmajor}_${biocminor}") 85 | fi 86 | echo "Found these settings:" 87 | echo "Bioconductor version: $biocversion, R version: $rversion, Bioconductor docker name: $bioccont" 88 | echo "matrix={ \"config\": [{\"os\" : \"ubuntu-latest\", \"r\" : \"${rversion}\", \"bioc\" : \"${biocversion}\", \"cont\" : \"${bioccont}\"} , {\"os\" : \"macOS-latest\", \"r\" : \"${rversion}\", \"bioc\" : \"${biocversion}\"} , {\"os\" : \"windows-latest\", \"r\" : \"${rversion}\", \"bioc\" : \"${biocversion}\" }] }" >> "$GITHUB_OUTPUT" 89 | ## If an OS is failing and you don't want to test it, manually remove it from the 'matrix' JSON entries above 90 | 91 | build-check: 92 | needs: bioc-config 93 | strategy: 94 | fail-fast: false 95 | matrix: ${{fromJson(needs.bioc-config.outputs.matrix)}} 96 | runs-on: ${{ matrix.config.os }} 97 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 98 | container: ${{ matrix.config.cont }} 99 | env: 100 | R_REMOTES_NO_ERRORS_FROM_WARNINGS: true 101 | NOT_CRAN: true 102 | TZ: UTC 103 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 104 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 105 | 106 | steps: 107 | 108 | ## Most of these steps are the same as the ones in 109 | ## https://github.com/r-lib/actions/blob/master/examples/check-standard.yaml 110 | ## If they update their steps, we will also need to update ours. 111 | - name: Checkout Repository 112 | uses: actions/checkout@v3 113 | 114 | ## R is already included in the Bioconductor docker images 115 | - name: Setup R from r-lib 116 | if: runner.os != 'Linux' 117 | uses: r-lib/actions/setup-r@v2 118 | with: 119 | r-version: ${{ matrix.config.r }} 120 | http-user-agent: ${{ matrix.config.http-user-agent }} 121 | 122 | ## pandoc is already included in the Bioconductor docker images 123 | - name: Setup pandoc from r-lib 124 | if: runner.os != 'Linux' 125 | uses: r-lib/actions/setup-pandoc@v2 126 | 127 | ## Create the path that will be used for caching packages on Linux 128 | - name: Create R_LIBS_USER on Linux 129 | if: runner.os == 'Linux' 130 | run: | 131 | R_LIBS_USER=/__w/_temp/Library 132 | echo "R_LIBS_USER=$R_LIBS_USER" >> "$GITHUB_ENV" 133 | mkdir -p $R_LIBS_USER 134 | 135 | ## Use cached R packages 136 | - name: Restore R package cache 137 | if: "!contains(github.event.head_commit.message, '/nocache')" 138 | uses: actions/cache@v4 139 | with: 140 | path: ${{ env.R_LIBS_USER }} 141 | key: ${{ matrix.config.os }}-${{ matrix.config.r }}-${{ matrix.config.bioc }}-${{ inputs.cache-version }} 142 | restore-keys: ${{ matrix.config.os }}-${{ matrix.config.r }}-${{ matrix.config.bioc }}--${{ inputs.cache-version }} 143 | 144 | ## remotes is needed for isntalling the Linux system dependencies 145 | ## as well as other R packages later on. 146 | - name: Install remotes 147 | run: | 148 | message(paste('****', Sys.time(), 'installing remotes ****')) 149 | install.packages('remotes') 150 | shell: Rscript {0} 151 | 152 | ## This will work again once https://github.com/r-lib/remotes/commit/0e4e23051041d9f1b15a5ab796defec31af6190d 153 | ## makes it to the CRAN version of remotes 154 | 155 | # - name: Install Linux system dependencies 156 | # if: runner.os == 'Linux' 157 | # run: | 158 | # sysreqs=$(Rscript -e 'cat("apt-get update -y && apt-get install -y", paste(gsub("apt-get install -y ", "", remotes::system_requirements("ubuntu", "24.04")), collapse = " "))') 159 | # echo $sysreqs 160 | # sudo -s eval "$sysreqs" 161 | 162 | - name: Install macOS system dependencies 163 | if: matrix.config.os == 'macOS-latest' 164 | run: | 165 | ## Enable installing XML from source if needed 166 | brew install libxml2 167 | echo "XML_CONFIG=/usr/local/opt/libxml2/bin/xml2-config" >> $GITHUB_ENV 168 | 169 | ## Required to install magick as noted at 170 | ## https://github.com/r-lib/usethis/commit/f1f1e0d10c1ebc75fd4c18fa7e2de4551fd9978f#diff-9bfee71065492f63457918efcd912cf2 171 | brew install imagemagick@6 172 | 173 | ## For textshaping, required by ragg, and required by pkgdown 174 | brew install harfbuzz fribidi 175 | 176 | ## For installing usethis's dependency gert 177 | brew install libgit2 178 | 179 | ## Required for tcltk 180 | brew install xquartz --cask 181 | 182 | - name: Install Windows system dependencies 183 | if: runner.os == 'Windows' 184 | run: | 185 | ## Edit below if you have any Windows system dependencies 186 | shell: Rscript {0} 187 | 188 | - name: Install BiocManager 189 | run: | 190 | message(paste('****', Sys.time(), 'installing BiocManager ****')) 191 | remotes::install_cran("BiocManager") 192 | shell: Rscript {0} 193 | 194 | - name: Set BiocVersion 195 | run: | 196 | BiocManager::install(version = "${{ matrix.config.bioc }}", ask = FALSE, force = TRUE) 197 | shell: Rscript {0} 198 | 199 | - name: Install dependencies pass 1 200 | run: | 201 | ## Try installing the package dependencies in steps. First the local 202 | ## dependencies, then any remaining dependencies to avoid the 203 | ## issues described at 204 | ## https://stat.ethz.ch/pipermail/bioc-devel/2020-April/016675.html 205 | ## https://github.com/r-lib/remotes/issues/296 206 | ## Ideally, all dependencies should get installed in the first pass. 207 | 208 | ## For running the checks 209 | message(paste('****', Sys.time(), 'installing rcmdcheck and BiocCheck ****')) 210 | install.packages(c("rcmdcheck", "BiocCheck"), repos = BiocManager::repositories()) 211 | 212 | ## Pass #1 at installing dependencies 213 | message(paste('****', Sys.time(), 'pass number 1 at installing dependencies: local dependencies ****')) 214 | remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = FALSE, upgrade = TRUE) 215 | continue-on-error: true 216 | shell: Rscript {0} 217 | 218 | - name: Install dependencies pass 2 219 | run: | 220 | ## Pass #2 at installing dependencies 221 | message(paste('****', Sys.time(), 'pass number 2 at installing dependencies: any remaining dependencies ****')) 222 | remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = TRUE, upgrade = TRUE, force = TRUE) 223 | shell: Rscript {0} 224 | 225 | - name: Install BiocGenerics 226 | if: env.has_RUnit == 'true' 227 | run: | 228 | ## Install BiocGenerics 229 | BiocManager::install("BiocGenerics") 230 | shell: Rscript {0} 231 | 232 | - name: Install covr 233 | if: github.ref == 'refs/heads/devel' && env.run_covr == 'true' && runner.os == 'Linux' 234 | run: | 235 | remotes::install_cran("covr") 236 | shell: Rscript {0} 237 | 238 | - name: Install pkgdown 239 | if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux' 240 | run: | 241 | remotes::install_cran("pkgdown") 242 | shell: Rscript {0} 243 | 244 | - name: Session info 245 | run: | 246 | options(width = 100) 247 | pkgs <- installed.packages()[, "Package"] 248 | sessioninfo::session_info(pkgs, include_base = TRUE) 249 | shell: Rscript {0} 250 | 251 | - name: Run CMD check 252 | env: 253 | _R_CHECK_CRAN_INCOMING_: false 254 | DISPLAY: 99.0 255 | run: | 256 | options(crayon.enabled = TRUE) 257 | rcmdcheck::rcmdcheck( 258 | args = c("--no-manual", "--no-vignettes", "--timings"), 259 | build_args = c("--no-manual", "--keep-empty-dirs", "--no-resave-data"), 260 | error_on = "warning", 261 | check_dir = "check" 262 | ) 263 | shell: Rscript {0} 264 | 265 | ## Might need an to add this to the if: && runner.os == 'Linux' 266 | - name: Reveal testthat details 267 | if: env.has_testthat == 'true' 268 | run: find . -name testthat.Rout -exec cat '{}' ';' 269 | 270 | - name: Run RUnit tests 271 | if: env.has_RUnit == 'true' 272 | run: | 273 | BiocGenerics:::testPackage() 274 | shell: Rscript {0} 275 | 276 | - name: Run BiocCheck 277 | env: 278 | DISPLAY: 99.0 279 | run: | 280 | BiocCheck::BiocCheck( 281 | dir('check', 'tar.gz$', full.names = TRUE), 282 | `quit-with-status` = TRUE, 283 | `no-check-R-ver` = TRUE, 284 | `no-check-bioc-help` = TRUE 285 | ) 286 | shell: Rscript {0} 287 | 288 | - name: Test coverage 289 | if: github.ref == 'refs/heads/devel' && env.run_covr == 'true' && runner.os == 'Linux' 290 | run: | 291 | covr::codecov(coverage = covr::package_coverage(type = "all")) 292 | shell: Rscript {0} 293 | 294 | - name: Install package 295 | if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux' 296 | run: R CMD INSTALL . 297 | 298 | - name: Build pkgdown site 299 | if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux' 300 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 301 | shell: Rscript {0} 302 | ## Note that you need to run pkgdown::deploy_to_branch(new_process = FALSE) 303 | ## at least one locally before this will work. This creates the gh-pages 304 | ## branch (erasing anything you haven't version controlled!) and 305 | ## makes the git history recognizable by pkgdown. 306 | 307 | - name: Install deploy dependencies 308 | if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux' 309 | run: | 310 | apt-get update && apt-get -y install rsync 311 | 312 | - name: Deploy pkgdown site to GitHub pages 🚀 313 | if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux' 314 | uses: JamesIves/github-pages-deploy-action@releases/v4 315 | with: 316 | clean: false 317 | branch: gh-pages 318 | folder: docs 319 | 320 | - name: Upload check results 321 | if: failure() 322 | uses: actions/upload-artifact@master 323 | with: 324 | name: ${{ runner.os }}-${{ matrix.config.r }}-${{ matrix.config.bioc }}-results 325 | path: check 326 | 327 | 328 | ## Code adapted from 329 | ## https://github.com/waldronlab/cBioPortalData/blob/e0440a4445f0cc731e426363a76faa22ee5e0f9d/.github/workflows/devel_check_dock.yml#L65-L92 330 | docker-build-and-push: 331 | runs-on: ubuntu-latest 332 | needs: build-check 333 | steps: 334 | - name: Checkout Repository 335 | if: "!contains(github.event.head_commit.message, '/nodocker') && env.run_docker == 'true' && github.ref == 'refs/heads/devel'" 336 | uses: actions/checkout@v3 337 | 338 | - name: Register repo name 339 | if: "!contains(github.event.head_commit.message, '/nodocker') && env.run_docker == 'true' && github.ref == 'refs/heads/devel'" 340 | id: reg_repo_name 341 | run: | 342 | echo CONT_IMG_NAME=$(echo ${{ github.event.repository.name }} | tr '[:upper:]' '[:lower:]') >> $GITHUB_ENV 343 | 344 | - name: Set up QEMU 345 | if: "!contains(github.event.head_commit.message, '/nodocker') && env.run_docker == 'true' && github.ref == 'refs/heads/devel'" 346 | uses: docker/setup-qemu-action@v2 347 | 348 | - name: Set up Docker Buildx 349 | if: "!contains(github.event.head_commit.message, '/nodocker') && env.run_docker == 'true' && github.ref == 'refs/heads/devel'" 350 | uses: docker/setup-buildx-action@v2 351 | 352 | - name: Login to Docker Hub 353 | if: "!contains(github.event.head_commit.message, '/nodocker') && env.run_docker == 'true' && github.ref == 'refs/heads/devel'" 354 | uses: docker/login-action@v2 355 | with: 356 | username: ${{ secrets.DOCKERHUB_USERNAME }} 357 | password: ${{ secrets.DOCKERHUB_TOKEN }} 358 | ## Note that DOCKERHUB_TOKEN is really a token for your dockerhub 359 | ## account, not your actual dockerhub account password. You can get it 360 | ## from https://hub.docker.com/settings/security. 361 | ## Check https://github.com/docker/build-push-action/tree/v4.0.0 362 | ## for more details. 363 | ## Alternatively, try checking 364 | ## https://seandavi.github.io/BuildABiocWorkshop/articles/HOWTO_BUILD_WORKSHOP.html. 365 | 366 | - name: Build and Push Docker 367 | if: "!contains(github.event.head_commit.message, '/nodocker') && env.run_docker == 'true' && github.ref == 'refs/heads/devel' && success()" 368 | uses: docker/build-push-action@v4 369 | with: 370 | context: . 371 | push: true 372 | tags: > 373 | ${{ secrets.DOCKERHUB_USERNAME }}/${{ env.CONT_IMG_NAME }}:latest, 374 | ${{ secrets.DOCKERHUB_USERNAME }}/${{ env.CONT_IMG_NAME }}:devel 375 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .RData 2 | .Rhistory 3 | .DS_Store 4 | .Ruserdata 5 | .Rproj.user 6 | vignettes/*/ 7 | vignettes/*.html -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: muscat 2 | Title: Multi-sample multi-group scRNA-seq data analysis tools 3 | Description: `muscat` provides various methods and visualization tools 4 | for DS analysis in multi-sample, multi-group, multi-(cell-)subpopulation 5 | scRNA-seq data, including cell-level mixed models and methods based on 6 | aggregated “pseudobulk” data, as well as a flexible simulation platform 7 | that mimics both single and multi-sample scRNA-seq data. 8 | Type: Package 9 | Version: 1.22.0 10 | Depends: R (>= 4.4) 11 | Authors@R: c( 12 | person("Helena L.", "Crowell", 13 | role=c("aut", "cre"), 14 | email="helena@crowell.eu", 15 | comment=c(ORCID="0000-0002-4801-1767")), 16 | person("Pierre-Luc", "Germain", role="aut"), 17 | person("Charlotte", "Soneson", role="aut"), 18 | person("Anthony", "Sonrel", role="aut"), 19 | person("Jeroen", "Gilis", role="aut"), 20 | person("Davide", "Risso", role="aut"), 21 | person("Lieven", "Clement", role="aut"), 22 | person("Mark D.", "Robinson", role=c("aut", "fnd"), 23 | email="mark.robinson@imls.uzh.ch")) 24 | Imports: 25 | BiocParallel, blme, 26 | ComplexHeatmap, 27 | data.table, DESeq2, dplyr, 28 | edgeR, 29 | ggplot2, glmmTMB, grDevices, grid, 30 | limma, lmerTest, lme4, 31 | Matrix, matrixStats, methods, 32 | progress, purrr, 33 | rlang, 34 | S4Vectors, scales, scater, scuttle, sctransform, stats, 35 | SingleCellExperiment, SummarizedExperiment, 36 | variancePartition, viridis 37 | Suggests: 38 | BiocStyle, 39 | countsimQC, 40 | ExperimentHub, 41 | iCOBRA, 42 | knitr, 43 | patchwork, 44 | phylogram, 45 | RColorBrewer, 46 | reshape2, 47 | rmarkdown, 48 | statmod, 49 | stageR, 50 | testthat, 51 | UpSetR 52 | biocViews: ImmunoOncology, DifferentialExpression, Sequencing, 53 | SingleCell, Software, StatisticalMethod, Visualization 54 | License: GPL-3 55 | VignetteBuilder: knitr 56 | RoxygenNote: 7.3.2 57 | Encoding: UTF-8 58 | URL: https://github.com/HelenaLC/muscat 59 | BugReports: https://github.com/HelenaLC/muscat/issues 60 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(aggregateData) 4 | export(calcExprFreqs) 5 | export(mmDS) 6 | export(pbDD) 7 | export(pbDS) 8 | export(pbFlatten) 9 | export(pbHeatmap) 10 | export(pbMDS) 11 | export(prepSCE) 12 | export(prepSim) 13 | export(resDS) 14 | export(simData) 15 | export(stagewise_DS_DD) 16 | import(ggplot2) 17 | importFrom(BiocParallel,MulticoreParam) 18 | importFrom(BiocParallel,SerialParam) 19 | importFrom(BiocParallel,bplapply) 20 | importFrom(ComplexHeatmap,Heatmap) 21 | importFrom(ComplexHeatmap,columnAnnotation) 22 | importFrom(ComplexHeatmap,rowAnnotation) 23 | importFrom(DESeq2,DESeq) 24 | importFrom(DESeq2,DESeqDataSetFromMatrix) 25 | importFrom(DESeq2,estimateDispersions) 26 | importFrom(DESeq2,results) 27 | importFrom(DESeq2,sizeFactors) 28 | importFrom(DESeq2,varianceStabilizingTransformation) 29 | importFrom(Matrix,colSums) 30 | importFrom(Matrix,qr) 31 | importFrom(Matrix,rowMeans) 32 | importFrom(Matrix,rowSums) 33 | importFrom(Matrix,t) 34 | importFrom(S4Vectors,"metadata<-") 35 | importFrom(S4Vectors,DataFrame) 36 | importFrom(S4Vectors,as.list) 37 | importFrom(S4Vectors,metadata) 38 | importFrom(S4Vectors,split) 39 | importFrom(S4Vectors,unfactor) 40 | importFrom(SingleCellExperiment,"counts<-") 41 | importFrom(SingleCellExperiment,"int_colData<-") 42 | importFrom(SingleCellExperiment,"sizeFactors<-") 43 | importFrom(SingleCellExperiment,SingleCellExperiment) 44 | importFrom(SingleCellExperiment,colData) 45 | importFrom(SingleCellExperiment,counts) 46 | importFrom(SingleCellExperiment,int_colData) 47 | importFrom(SingleCellExperiment,reducedDims) 48 | importFrom(SingleCellExperiment,sizeFactors) 49 | importFrom(SummarizedExperiment,"assay<-") 50 | importFrom(SummarizedExperiment,"colData<-") 51 | importFrom(SummarizedExperiment,"rowData<-") 52 | importFrom(SummarizedExperiment,SummarizedExperiment) 53 | importFrom(SummarizedExperiment,assay) 54 | importFrom(SummarizedExperiment,assayNames) 55 | importFrom(SummarizedExperiment,assays) 56 | importFrom(SummarizedExperiment,colData) 57 | importFrom(SummarizedExperiment,rowData) 58 | importFrom(blme,bglmer) 59 | importFrom(blme,blmer) 60 | importFrom(data.table,data.table) 61 | importFrom(dplyr,"%>%") 62 | importFrom(dplyr,all_of) 63 | importFrom(dplyr,bind_cols) 64 | importFrom(dplyr,bind_rows) 65 | importFrom(dplyr,filter) 66 | importFrom(dplyr,full_join) 67 | importFrom(dplyr,inner_join) 68 | importFrom(dplyr,last) 69 | importFrom(dplyr,mutate) 70 | importFrom(dplyr,mutate_all) 71 | importFrom(dplyr,mutate_at) 72 | importFrom(dplyr,mutate_if) 73 | importFrom(dplyr,pull) 74 | importFrom(dplyr,rename) 75 | importFrom(dplyr,select) 76 | importFrom(edgeR,DGEList) 77 | importFrom(edgeR,calcNormFactors) 78 | importFrom(edgeR,cpm) 79 | importFrom(edgeR,estimateDisp) 80 | importFrom(edgeR,filterByExpr) 81 | importFrom(edgeR,glmFit) 82 | importFrom(edgeR,glmQLFTest) 83 | importFrom(edgeR,glmQLFit) 84 | importFrom(edgeR,glmTreat) 85 | importFrom(edgeR,plotMDS.DGEList) 86 | importFrom(edgeR,topTags) 87 | importFrom(glmmTMB,glmmTMB) 88 | importFrom(glmmTMB,nbinom1) 89 | importFrom(grDevices,colorRampPalette) 90 | importFrom(grid,gpar) 91 | importFrom(limma,contrasts.fit) 92 | importFrom(limma,duplicateCorrelation) 93 | importFrom(limma,eBayes) 94 | importFrom(limma,lmFit) 95 | importFrom(limma,makeContrasts) 96 | importFrom(limma,topTable) 97 | importFrom(limma,topTreat) 98 | importFrom(limma,treat) 99 | importFrom(limma,voom) 100 | importFrom(lme4,.makeCC) 101 | importFrom(lme4,lmerControl) 102 | importFrom(lmerTest,contest) 103 | importFrom(lmerTest,lmer) 104 | importFrom(matrixStats,colAnys) 105 | importFrom(matrixStats,rowAnyNAs) 106 | importFrom(matrixStats,rowMedians) 107 | importFrom(matrixStats,rowMins) 108 | importFrom(matrixStats,rowQuantiles) 109 | importFrom(matrixStats,rowSds) 110 | importFrom(methods,is) 111 | importFrom(progress,progress_bar) 112 | importFrom(purrr,map) 113 | importFrom(purrr,map_depth) 114 | importFrom(purrr,modify_depth) 115 | importFrom(purrr,negate) 116 | importFrom(purrr,reduce) 117 | importFrom(purrr,set_names) 118 | importFrom(rlang,.data) 119 | importFrom(scales,hue_pal) 120 | importFrom(scater,computeLibraryFactors) 121 | importFrom(scater,isOutlier) 122 | importFrom(sctransform,vst) 123 | importFrom(scuttle,summarizeAssayByGroup) 124 | importFrom(stats,as.formula) 125 | importFrom(stats,coef) 126 | importFrom(stats,df.residual) 127 | importFrom(stats,model.matrix) 128 | importFrom(stats,p.adjust) 129 | importFrom(stats,residuals) 130 | importFrom(stats,rgamma) 131 | importFrom(stats,rnbinom) 132 | importFrom(stats,sd) 133 | importFrom(stats,setNames) 134 | importFrom(variancePartition,dream) 135 | importFrom(variancePartition,getContrast) 136 | importFrom(variancePartition,voomWithDreamWeights) 137 | importFrom(viridis,viridis) 138 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | Changes in version 1.19.1 2 | 3 | + added J Gilis, D Risso, L Clement as authors 4 | + differential detection with 'pbDS(..., method="DD")' or 'pbDD()' 5 | & stagewise testing [Gilis et al.], plus corresponding vignette 6 | + replace 'aes_string()' in 'ggplot()' by '.data$.' from 'rlang' 7 | 8 | Changes in version 1.15.1 9 | 10 | + bug fix in 'pbDS': too stringent filtering causing no genes in any clusters 11 | to be tested previously resulted in a 'subscript out of bounds' error; 12 | execution is stopped and an informative error thrown instead. 13 | 14 | + bug fix in 'mmDS': 'dream' (new version?) wouldn't recognize 15 | model variables provided as data; fixed via adding 'as.formula()'. 16 | 17 | + "analysis" vignette: replaced suspended 'dplyr' function 18 | 'top_n' with 'slice_min' when filtering for top DS hits; 19 | fixed some typos; updated preprint to journal reference. 20 | 21 | Changes in version 1.15.0 22 | 23 | + Bioconductor release v3.17 24 | 25 | Changes in version 1.12.1 26 | 27 | + fixed various typos in both vignettes 28 | + internal fixes to keep up with 'ggplot2' & 'dplyr' updates 29 | + bug fix in 'simDS' computing means when one group is missing 30 | + bug fix in 'resDS' until testing when 'cpm/frq = TRUE' 31 | 32 | Changes in version 1.11.1 33 | 34 | + bug fix in pbHeatmap(): previously failed for results from 'mmDS()' 35 | 36 | Changes in version 1.9.3 37 | 38 | + bug fix in pbDS(): drop samples w/o any detected features, 39 | otherwise edgeR::calcNormFactors() fails when lib.size 0 40 | 41 | Changes in version 1.8.1 42 | 43 | + bug fix in prepSim(): removal of genes with NA coefficients 44 | was previously not propagated to the dispersion estimates 45 | 46 | + bug fix in test-resDR.R: set 'min_cells = 0' to assure that 47 | everything is being tested, otherwise unit tests could fail 48 | 49 | Changes in version 1.8.0 50 | 51 | + Bioconductor 3.14 release 52 | 53 | Changes in version 1.7.2 54 | 55 | + bug fix in prepSim(): removal of NA coefficients and 56 | subsetting of the input SCE was previously out of synch 57 | 58 | Changes in version 1.5.2 59 | 60 | + added edgeR::calcNormFactors() step in prepSim() 61 | 62 | + added argument 'dd' to simData() specifying 63 | whether or not to simulate 2 groups 64 | 65 | + prepSim() and simData() now support simulation of "singular" design 66 | (no samples, no clusters), as well as only samples/clusters 67 | 68 | + simData() defaults to simulating as many samples as available 69 | in order to avoid re-use (duplication) of reference samples 70 | 71 | Changes in version 1.5.1 72 | 73 | + significant speed-up of aggregateData() by replacing usage 74 | of rowX() over a list with scuttle::summarizeAssayByGroup() 75 | 76 | + added options use "prop.detected" and "num.detected" 77 | as summary statistic (argument 'fun') in aggregateData() 78 | 79 | + added parallelization support in aggregateData() and pbDS() through argument BBPARAM 80 | (passed to scater::sumCountsAcrossCells() and BiocParallel::bplapply, respectively) 81 | 82 | + aggregateData() now stores the number of cells that went into aggregation under 83 | int_colData(.)$n_cells (vs. metadata(.)$n_cells) to support automated subsetting 84 | 85 | + replaced argument n_threads with BPPARAM throughout all 86 | parallelizable functions (aggregateData(), pbDS(), mmDS()) 87 | 88 | + bug fix in prepSim(): the function previously failed when 89 | cluster/sample/group_id cell metadata columns were non-factors 90 | 91 | + bug fix in resDS(): cpm = TRUE previously didn't handle 92 | missing cluster-sample combinations correctly -------------------------------------------------------------------------------- /R/aggregateData.R: -------------------------------------------------------------------------------- 1 | #' @rdname aggregateData 2 | #' @title Aggregation of single-cell to pseudobulk data 3 | #' 4 | #' @description ... 5 | #' 6 | #' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}. 7 | #' @param assay character string specifying the assay slot to use as 8 | #' input data. Defaults to the 1st available (\code{assayNames(x)[1]}). 9 | #' @param by character vector specifying which 10 | #' \code{colData(x)} columns to summarize by (at most 2!). 11 | #' @param fun a character string. 12 | #' Specifies the function to use as summary statistic. 13 | #' Passed to \code{\link[scuttle]{summarizeAssayByGroup}}. 14 | #' @param scale logical. Should pseudo-bulks be scaled 15 | #' with the effective library size & multiplied by 1M? 16 | #' @param BPPARAM a \code{\link[BiocParallel]{BiocParallelParam}} 17 | #' object specifying how aggregation should be parallelized. 18 | #' @param verbose logical. Should information on progress be reported? 19 | #' 20 | #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}. 21 | #' \itemize{ 22 | #' \item{If \code{length(by) == 2}, each sheet (\code{assay}) contains 23 | #' pseudobulks for each of \code{by[1]}, e.g., for each cluster when 24 | #' \code{by = "cluster_id"}. Rows correspond to genes, columns to 25 | #' \code{by[2]}, e.g., samples when \code{by = "sample_id"}}. 26 | #' \item{If \code{length(by) == 1}, the returned SCE will contain only 27 | #' a single \code{assay} with rows = genes and colums = \code{by}.}} 28 | #' 29 | #' Aggregation parameters (\code{assay, by, fun, scaled}) are stored in 30 | #' \code{metadata()$agg_pars}, and the number of cells that were aggregated 31 | #' are accessible in \code{int_colData()$n_cells}. 32 | #' 33 | #' @examples 34 | #' # pseudobulk counts by cluster-sample 35 | #' data(example_sce) 36 | #' pb <- aggregateData(example_sce) 37 | #' 38 | #' library(SingleCellExperiment) 39 | #' assayNames(example_sce) # one sheet per cluster 40 | #' head(assay(example_sce)) # n_genes x n_samples 41 | #' 42 | #' # scaled CPM 43 | #' cpm <- edgeR::cpm(assay(example_sce)) 44 | #' assays(example_sce)$cpm <- cpm 45 | #' pb <- aggregateData(example_sce, assay = "cpm", scale = TRUE) 46 | #' head(assay(pb)) 47 | #' 48 | #' # aggregate by cluster only 49 | #' pb <- aggregateData(example_sce, by = "cluster_id") 50 | #' length(assays(pb)) # single assay 51 | #' head(assay(pb)) # n_genes x n_clusters 52 | #' 53 | #' @author Helena L Crowell & Mark D Robinson 54 | #' 55 | #' @references 56 | #' Crowell, HL, Soneson, C, Germain, P-L, Calini, D, 57 | #' Collin, L, Raposo, C, Malhotra, D & Robinson, MD: 58 | #' On the discovery of population-specific state transitions from 59 | #' multi-sample multi-condition single-cell RNA sequencing data. 60 | #' \emph{bioRxiv} \strong{713412} (2018). 61 | #' doi: \url{https://doi.org/10.1101/713412} 62 | #' 63 | #' @importFrom Matrix colSums 64 | #' @importFrom purrr map 65 | #' @importFrom S4Vectors DataFrame metadata 66 | #' @importFrom SingleCellExperiment SingleCellExperiment int_colData<- 67 | #' @importFrom SummarizedExperiment colData colData<- 68 | #' @export 69 | 70 | aggregateData <- function(x, 71 | assay = NULL, by = c("cluster_id", "sample_id"), 72 | fun = c("sum", "mean", "median", "prop.detected", "num.detected"), 73 | scale = FALSE, verbose = TRUE, BPPARAM = SerialParam(progressbar = verbose)) { 74 | 75 | # check validity of input arguments 76 | fun <- match.arg(fun) 77 | if (is.null(assay)) 78 | assay <- assayNames(x)[1] 79 | .check_arg_assay(x, assay) 80 | .check_args_aggData(as.list(environment())) 81 | stopifnot(is(BPPARAM, "BiocParallelParam")) 82 | 83 | # assure 'by' colData columns are factors 84 | # so that missing combinations aren't dropped 85 | for (i in by) 86 | if (!is.factor(x[[i]])) 87 | x[[i]] <- factor(x[[i]]) 88 | 89 | # compute pseudo-bulks 90 | pb <- .pb(x, by, assay, fun, BPPARAM) 91 | if (scale & length(by) == 2) { 92 | # compute library sizes 93 | cs <- if (assay == "counts" && fun == "sum") 94 | pb else .pb(x, by, "counts", "sum", BPPARAM) 95 | ls <- lapply(cs, colSums) 96 | # scale pseudobulks by CPM 97 | pb <- lapply(seq_along(pb), function(i) pb[[i]] / 1e6 * ls[[i]]) 98 | names(pb) <- names(ls) 99 | } 100 | 101 | # construct SCE 102 | md <- metadata(x) 103 | md$agg_pars <- list(assay = assay, by = by, fun = fun, scale = scale) 104 | pb <- SingleCellExperiment(pb, rowData = rowData(x), metadata = md) 105 | 106 | # tabulate number of cells 107 | cd <- data.frame(colData(x)[, by]) 108 | for (i in names(cd)) 109 | if (is.factor(cd[[i]])) 110 | cd[[i]] <- droplevels(cd[[i]]) 111 | ns <- table(cd) 112 | if (length(by) == 2) { 113 | ns <- asplit(ns, 2) 114 | ns <- map(ns, ~c(unclass(.))) 115 | } else ns <- c(unclass(ns)) 116 | int_colData(pb)$n_cells <- ns 117 | 118 | # propagate 'colData' columns that are unique across 2nd 'by' 119 | if (length(by) == 2) { 120 | cd <- colData(x) 121 | ids <- colnames(pb) 122 | counts <- vapply(ids, function(u) { 123 | m <- as.logical(match(cd[, by[2]], u, nomatch = 0)) 124 | vapply(cd[m, ], function(u) length(unique(u)), numeric(1)) 125 | }, numeric(ncol(colData(x)))) 126 | cd_keep <- apply(counts, 1, function(u) all(u == 1)) 127 | cd_keep <- setdiff(names(which(cd_keep)), by) 128 | if (length(cd_keep) != 0) { 129 | m <- match(ids, cd[, by[2]], nomatch = 0) 130 | cd <- cd[m, cd_keep, drop = FALSE] 131 | rownames(cd) <- ids 132 | colData(pb) <- cd 133 | } 134 | } 135 | return(pb) 136 | } 137 | -------------------------------------------------------------------------------- /R/calcExprFreqs.R: -------------------------------------------------------------------------------- 1 | #' calcExprFreqs 2 | #' 3 | #' Calculates gene expression frequencies 4 | #' 5 | #' \code{calcExprFreq} computes, for each sample and group (in each cluster), 6 | #' the fraction of cells that express a given gene. Here, a gene is considered 7 | #' to be expressed when the specified measurement value (\code{assay}) 8 | #' lies above the specified threshold value (\code{th}). 9 | #' 10 | #' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}. 11 | #' @param assay a character string specifying which assay to use. 12 | #' @param th numeric threshold value above which 13 | #' a gene should be considered to be expressed. 14 | #' 15 | #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}} 16 | #' containing, for each cluster, an assay of dimensions #genes x #samples 17 | #' giving the fraction of cells that express each gene in each sample. 18 | #' If \code{colData(x)} contains a \code{"group_id"} column, the fraction 19 | #' of expressing cells in each each group will be included as well. 20 | #' 21 | #' @examples 22 | #' data(example_sce) 23 | #' library(SingleCellExperiment) 24 | #' 25 | #' frq <- calcExprFreqs(example_sce) 26 | #' 27 | #' # one assay per cluster 28 | #' assayNames(frq) 29 | #' 30 | #' # expression frequencies by 31 | #' # sample & group; 1st cluster: 32 | #' head(assay(frq)) 33 | #' 34 | #' @author Helena L Crowell & Mark D Robinson 35 | #' 36 | #' @importFrom Matrix rowMeans 37 | #' @importFrom methods is 38 | #' @importFrom purrr set_names 39 | #' @importFrom SummarizedExperiment assays colData SummarizedExperiment 40 | #' @export 41 | 42 | calcExprFreqs <- function(x, assay = "counts", th = 0) { 43 | # check validity of input arguments 44 | .check_sce(x, req_group = FALSE) 45 | .check_arg_assay(x, assay) 46 | stopifnot(is.numeric(th), length(th) == 1) 47 | 48 | # split cells by cluster-sample 49 | cs_by_ks <- .split_cells(x) 50 | 51 | # for each gene, compute fraction of cells 52 | # w/ assay value above threshold in each sample 53 | y <- assays(x)[[assay]] 54 | fq <- lapply(cs_by_ks, vapply, function(i) { 55 | if (is.null(i)) return(rep(0, nrow(x))) 56 | Matrix::rowMeans(y[, i, drop = FALSE] > th) 57 | }, numeric(nrow(x))) 58 | 59 | # same for ea. group (if colData column "group_id" exists) 60 | if ("group_id" %in% colnames(colData(x))) { 61 | kids <- x$cluster_id 62 | nc_by_ks <- table(kids, x$sample_id) 63 | nc_by_kg <- table(kids, x$group_id) 64 | 65 | ei <- metadata(x)$experiment_info 66 | s_by_g <- split(ei$sample_id, ei$group_id) 67 | gids <- names(s_by_g) 68 | kids <- set_names(levels(kids)) 69 | 70 | fq <- lapply(kids, function(k) { 71 | nc_by_s <- fq[[k]] * nc_by_ks[k, ][col(fq[[k]])] 72 | fq_by_g <- vapply(gids, function(g) { 73 | nc_g <- nc_by_s[, s_by_g[[g]], drop = FALSE] 74 | rowSums(nc_g) / nc_by_kg[k, g] 75 | }, numeric(nrow(x))) 76 | cbind(fq[[k]], fq_by_g) 77 | }) 78 | } 79 | # return SCE 80 | SingleCellExperiment(assays = fq) 81 | } 82 | -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | #' @rdname data 2 | #' @name data 3 | #' @aliases data example_sce 4 | #' 5 | #' @title Example datasets 6 | #' 7 | #' @description 8 | #' A \code{\link[SingleCellExperiment]{SingleCellExperiment}} containing 9 | #' 10x droplet-based scRNA-seq PBCM data from 8 Lupus patients befor and after 10 | #' 6h-treatment with INF-beta (16 samples in total). 11 | #' 12 | #' The original data has been filtered to 13 | #' \itemize{ 14 | #' \item{remove unassigned cells & cell multiplets} 15 | #' \item{retain only 4 out of 8 samples per experimental group} 16 | #' \item{retain only 5 out of 8 subpopulations (clusters)} 17 | #' \item{retain genes with a count > 1 in > 50 cells} 18 | #' \item{retain cells with > 200 detected genes} 19 | #' \item{retain at most 100 cells per cluster-sample instance} 20 | #' } 21 | #' 22 | #' Assay \code{logcounts} corresponds to log-normalized values 23 | #' obtained from \code{\link[scater]{logNormCounts}} with default parameters. 24 | #' 25 | #' The original measurement data, as well as gene and cell metadata 26 | #' is available through the NCBI GEO accession number GSE96583; 27 | #' code to reproduce this example dataset from the original data 28 | #' is provided in the examples section. 29 | #' 30 | #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}. 31 | #' 32 | #' @examples 33 | #' \donttest{ 34 | #' # set random seed for cell sampling 35 | #' set.seed(2929) 36 | #' 37 | #' # load data 38 | #' library(ExperimentHub) 39 | #' eh <- ExperimentHub() 40 | #' sce <- eh[["EH2259"]] 41 | #' 42 | #' # drop unassigned cells & multiplets 43 | #' sce <- sce[, !is.na(sce$cell)] 44 | #' sce <- sce[, sce$multiplets == "singlet"] 45 | #' 46 | #' # keep 4 samples per group 47 | #' sce$id <- paste0(sce$stim, sce$ind) 48 | #' inds <- sample(sce$ind, 4) 49 | #' ids <- paste0(levels(sce$stim), rep(inds, each = 2)) 50 | #' sce <- sce[, sce$id %in% ids] 51 | #' 52 | #' # keep 5 clusters 53 | #' kids <- c("B cells", "CD4 T cells", "CD8 T cells", 54 | #' "CD14+ Monocytes", "FCGR3A+ Monocytes") 55 | #' sce <- sce[, sce$cell %in% kids] 56 | #' sce$cell <- droplevels(sce$cell) 57 | #' 58 | #' # basic filtering on genes & cells 59 | #' gs <- rowSums(counts(sce) > 1) > 50 60 | #' cs <- colSums(counts(sce) > 0) > 200 61 | #' sce <- sce[gs, cs] 62 | #' 63 | #' # sample max. 100 cells per cluster-sample 64 | #' cs_by_ks <- split(colnames(sce), list(sce$cell, sce$id)) 65 | #' cs <- sapply(cs_by_ks, function(u) 66 | #' sample(u, min(length(u), 100))) 67 | #' sce <- sce[, unlist(cs)] 68 | #' 69 | #' # compute logcounts 70 | #' library(scater) 71 | #' sce <- computeLibraryFactors(sce) 72 | #' sce <- logNormCounts(sce) 73 | #' 74 | #' # re-format for 'muscat' 75 | #' sce <- prepSCE(sce, 76 | #' kid = "cell", 77 | #' sid = "id", 78 | #' gid = "stim", 79 | #' drop = TRUE) 80 | #' } 81 | #' 82 | #' @references 83 | #' Kang et al. (2018). Multiplexed droplet single-cell RNA-sequencing 84 | #' using natural genetic variation. \emph{Nature Biotechnology}, 85 | #' \bold{36}(1): 89-94. DOI: 10.1038/nbt.4042. 86 | #' 87 | #' @author Helena L Crowell 88 | 89 | NULL 90 | -------------------------------------------------------------------------------- /R/mmDS.R: -------------------------------------------------------------------------------- 1 | #' @rdname mmDS 2 | #' @title DS analysis using mixed-models (MM) 3 | #' 4 | #' @description Performs cluster-wise DE analysis by fitting cell-level models. 5 | #' 6 | #' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}. 7 | #' @param coef character specifying the coefficient to test. 8 | #' If NULL (default), will test the last level of \code{"group_id"}. 9 | #' @param covs character vector of \code{colData(x)} 10 | #' column names to use as covariates. 11 | #' @param method a character string. 12 | #' Either \code{"dream2"} (default, lme4 with voom-weights), 13 | #' \code{"dream"} (previous implementation of the dream method), 14 | #' \code{"vst"} (variance-stabilizing transformation), 15 | #' \code{"poisson"} (poisson GLM-MM), 16 | #' \code{"nbinom"} (negative binomial GLM-MM), 17 | #' \code{"hybrid"} (combination of pseudobulk and poisson methods) 18 | #' or a function accepting the same arguments. 19 | #' @param n_cells number of cells per cluster-sample 20 | #' required to consider a sample for testing. 21 | #' @param n_samples number of samples per group 22 | #' required to consider a cluster for testing. 23 | #' @param min_count numeric. For a gene to be tested in a given cluster, 24 | #' at least \code{min_cells} must have a count >= \code{min_count}. 25 | #' @param min_cells number (or fraction, if < 1) of cells with a count > 26 | #' \code{min_count} required for a gene to be tested in a given cluster. 27 | #' @param BPPARAM a \code{\link[BiocParallel]{BiocParallelParam}} 28 | #' object specifying how differential testing should be parallelized. 29 | #' @param verbose logical specifying whether messages 30 | #' on progress and a progress bar should be displayed. 31 | #' 32 | #' @return a data.frame 33 | #' 34 | #' @examples 35 | #' # subset "B cells" cluster 36 | #' data(example_sce) 37 | #' b_cells <- example_sce$cluster_id == "B cells" 38 | #' sub <- example_sce[, b_cells] 39 | #' sub$cluster_id <- droplevels(sub$cluster_id) 40 | #' 41 | #' # downsample to 100 genes 42 | #' gs <- sample(nrow(sub), 100) 43 | #' sub <- sub[gs, ] 44 | #' 45 | #' # run DS analysis using cell-level mixed-model 46 | #' res <- mmDS(sub, method = "dream", verbose = FALSE) 47 | #' head(res$`B cells`) 48 | #' 49 | #' @author Pierre-Luc Germain & Helena L Crowell 50 | #' 51 | #' @references 52 | #' Crowell, HL, Soneson, C, Germain, P-L, Calini, D, 53 | #' Collin, L, Raposo, C, Malhotra, D & Robinson, MD: 54 | #' On the discovery of population-specific state transitions from 55 | #' multi-sample multi-condition single-cell RNA sequencing data. 56 | #' \emph{bioRxiv} \strong{713412} (2018). 57 | #' doi: \url{https://doi.org/10.1101/713412} 58 | #' 59 | #' @importFrom DESeq2 DESeqDataSetFromMatrix estimateDispersions 60 | #' sizeFactors varianceStabilizingTransformation 61 | #' @importFrom dplyr %>% mutate bind_rows 62 | #' @importFrom matrixStats rowMins 63 | #' @importFrom progress progress_bar 64 | #' @importFrom purrr map_depth 65 | #' @importFrom sctransform vst 66 | #' @importFrom SingleCellExperiment counts counts<- 67 | #' colData sizeFactors sizeFactors<- 68 | #' @importFrom stats p.adjust 69 | #' @export 70 | 71 | mmDS <- function(x, coef = NULL, covs = NULL, 72 | method = c("dream2", "dream", "vst", "poisson", "nbinom", "hybrid"), 73 | n_cells = 10, n_samples = 2, min_count = 1, min_cells = 20, 74 | verbose = TRUE, BPPARAM = SerialParam(progressbar = verbose), 75 | vst = c("sctransform", "DESeq2"), 76 | ddf = c("Satterthwaite", "Kenward-Roger", "lme4"), 77 | dup_corr = FALSE, trended = FALSE, bayesian = FALSE, 78 | blind = TRUE, REML = TRUE, moderate = FALSE) { 79 | 80 | # check validity of input arguments 81 | .check_sce(x, req_group = TRUE) 82 | .check_arg_assay(x, "counts") 83 | .check_args_mmDS(as.list(environment())) 84 | stopifnot(is(BPPARAM, "BiocParallelParam")) 85 | 86 | args <- as.list(environment()) 87 | args$method <- match.arg(method) 88 | args$vst <- match.arg(vst) 89 | args$ddf <- match.arg(ddf) 90 | 91 | # counts cells per cluster-sample 92 | n_cells_by_ks <- table(x$cluster_id, x$sample_id) 93 | 94 | # filter clusters w/ >= n_cells in >= n_samples 95 | if (!is.null(metadata(x)$experiment_info$group_id)) { 96 | ei <- metadata(x)$experiment_info 97 | m <- match(levels(x$sample_id), ei$sample_id) 98 | gids <- ei$group_id[m] 99 | } else { 100 | gids <- x$group_id 101 | } 102 | ks_keep <- apply(n_cells_by_ks > n_cells, 1, 103 | function(u) all(tabulate(gids[u]) >= n_samples)) 104 | if (sum(ks_keep) == 0) 105 | stop("No cluster has at least ", n_samples, 106 | " samples with at least ", n_cells, " cells.") 107 | 108 | kids <- levels(x$cluster_id) 109 | if (verbose && sum(ks_keep) < length(kids)) 110 | message(paste("Skipping cluster(s)", 111 | paste(dQuote(kids[!ks_keep]), collapse = ", "), 112 | "\ndue to an insufficient number of samples", 113 | "with a sufficient number of cells.")) 114 | kids <- kids[ks_keep] 115 | names(kids) <- kids 116 | 117 | # split cells by cluster 118 | cells_by_k <- split(colnames(x), x$cluster_id) 119 | 120 | if (min_count < 1) { 121 | min_count <- floor(min_count * rowMins(n_cells_by_ks)) 122 | } else { 123 | min_count <- rep(min_count, length(kids)) 124 | } 125 | names(min_count) <- kids 126 | 127 | # variance-stabilizing transformation 128 | if (args$method == "vst" && !"vstresiduals" %in% assayNames(x)) { 129 | vst_call <- switch(args$vst, 130 | sctransform = expression(.vst_sctransform(x, verbose)), 131 | DESeq2 = expression(.vst_DESeq2(x, covs, blind))) 132 | if (verbose) { 133 | assay(x, "vstresiduals", FALSE) <- eval(vst_call) 134 | } else { 135 | assay(x, "vstresiduals", FALSE) <- suppressMessages(eval(vst_call)) 136 | } 137 | } 138 | 139 | # get method function & construct correct call 140 | if (!is.function(fun <- args$method)) 141 | fun <- get(paste0(".mm_", fun)) 142 | args_use <- names(formals(fun)) 143 | args <- args[names(args) %in% args_use] 144 | 145 | if (verbose) pb <- progress_bar$new(total = length(kids)) 146 | res <- lapply(kids, function(k) { 147 | y <- x[, cells_by_k[[k]]] 148 | y <- y[rowSums(counts(y) >= min_count[k]) > min_cells, ] 149 | if (verbose) 150 | message("Testing ", nrow(y), " genes across ", 151 | ncol(y), " cells in cluster ", dQuote(k), "...") 152 | 153 | # call to .mm_dream/.mm_vst 154 | args$x <- y 155 | z <- do.call(fun, args) 156 | data.frame( 157 | gene = rownames(y), 158 | cluster_id = k, z, 159 | row.names = NULL, 160 | stringsAsFactors = FALSE) 161 | }) 162 | if (verbose) pb$terminate() 163 | 164 | # assemble results from all cluster 165 | res <- bind_rows(res) 166 | # global p-value adjustment 167 | p_adj.glb <- p.adjust(res$p_val, method = "BH") 168 | i <- which(colnames(res) == "p_adj.loc") 169 | res[["p_adj.glb"]] <- p_adj.glb 170 | res <- res[, c(seq_len(i), ncol(res), seq_len(ncol(res)-1)[-seq_len(i)])] 171 | # re-split by cluster 172 | split(res, res$cluster_id) 173 | } 174 | -------------------------------------------------------------------------------- /R/pbDS.R: -------------------------------------------------------------------------------- 1 | #' @rdname pbDS 2 | #' @title pseudobulk DS analysis 3 | #' 4 | #' @description \code{pbDS} tests for DS after aggregating single-cell 5 | #' measurements to pseudobulk data, by applying bulk RNA-seq DE methods, 6 | #' such as \code{edgeR}, \code{DESeq2} and \code{limma}. 7 | #' 8 | #' @param pb a \code{\link[SingleCellExperiment]{SingleCellExperiment}} 9 | #' containing pseudobulks as returned by \code{\link{aggregateData}}. 10 | #' @param method a character string. 11 | #' @param design For methods \code{"edegR"} and \code{"limma"}, a design matrix 12 | #' with row & column names(!) created with \code{\link[stats]{model.matrix}}; 13 | #' For \code{"DESeq2"}, a formula with variables in \code{colData(pb)}. 14 | #' Defaults to \code{~ group_id} or the corresponding \code{model.matrix}. 15 | #' @param contrast a matrix of contrasts to test for 16 | #' created with \code{\link[limma]{makeContrasts}}. 17 | #' @param coef passed to \code{\link[edgeR]{glmQLFTest}}, 18 | #' \code{\link[limma]{contrasts.fit}}, \code{\link[DESeq2]{results}} 19 | #' for \code{method = "edgeR", "limma-x", "DESeq2"}, respectively. 20 | #' Can be a list for multiple, independent comparisons. 21 | #' @param min_cells a numeric. Specifies the minimum number of cells in a given 22 | #' cluster-sample required to consider the sample for differential testing. 23 | #' @param filter character string specifying whether 24 | #' to filter on genes, samples, both or neither. 25 | #' @param treat logical specifying whether empirical Bayes moderated-t 26 | #' p-values should be computed relative to a minimum fold-change threshold. 27 | #' Only applicable for methods \code{"limma-x"} 28 | #' (\code{\link[limma:eBayes]{treat}}) and \code{"edgeR"} 29 | #' (\code{\link[edgeR]{glmTreat}}), and ignored otherwise. 30 | #' @param BPPARAM a \code{\link[BiocParallel]{BiocParallelParam}} 31 | #' object specifying how differential testing should be parallelized. 32 | #' @param verbose logical. Should information on progress be reported? 33 | #' 34 | #' @return a list containing \itemize{ 35 | #' \item a data.frame with differential testing results, 36 | #' \item a \code{\link[edgeR]{DGEList}} object of length nb.-clusters, and 37 | #' \item the \code{design} matrix, and \code{contrast} or \code{coef} used.} 38 | #' 39 | #' @examples 40 | #' # simulate 5 clusters, 20% of DE genes 41 | #' data(example_sce) 42 | #' 43 | #' # compute pseudobulk sum-counts & run DS analysis 44 | #' pb <- aggregateData(example_sce) 45 | #' res <- pbDS(pb, method = "limma-trend") 46 | #' 47 | #' names(res) 48 | #' names(res$table) 49 | #' head(res$table$stim$`B cells`) 50 | #' 51 | #' # count nb. of DE genes by cluster 52 | #' vapply(res$table$stim, function(u) 53 | #' sum(u$p_adj.loc < 0.05), numeric(1)) 54 | #' 55 | #' # get top 5 hits for ea. cluster w/ abs(logFC) > 1 56 | #' library(dplyr) 57 | #' lapply(res$table$stim, function(u) 58 | #' filter(u, abs(logFC) > 1) %>% 59 | #' arrange(p_adj.loc) %>% 60 | #' slice(seq_len(5))) 61 | #' 62 | #' @author Helena L Crowell & Mark D Robinson 63 | #' 64 | #' @references 65 | #' Crowell, HL, Soneson, C, Germain, P-L, Calini, D, 66 | #' Collin, L, Raposo, C, Malhotra, D & Robinson, MD: 67 | #' On the discovery of population-specific state transitions from 68 | #' multi-sample multi-condition single-cell RNA sequencing data. 69 | #' \emph{bioRxiv} \strong{713412} (2018). 70 | #' doi: \url{https://doi.org/10.1101/713412} 71 | #' 72 | #' @importFrom BiocParallel bplapply SerialParam 73 | #' @importFrom edgeR filterByExpr 74 | #' @importFrom dplyr last rename 75 | #' @importFrom limma makeContrasts 76 | #' @importFrom matrixStats colAnys 77 | #' @importFrom Matrix qr rowSums 78 | #' @importFrom methods is 79 | #' @importFrom scater isOutlier 80 | #' @importFrom stats model.matrix 81 | #' @importFrom SummarizedExperiment assay colData 82 | #' @export 83 | 84 | pbDS <- function(pb, 85 | method=c("edgeR", "DESeq2", "limma-trend", "limma-voom", "DD"), 86 | design=NULL, coef=NULL, contrast=NULL, min_cells=10, 87 | filter=c("both", "genes", "samples", "none"), treat=FALSE, 88 | verbose=TRUE, BPPARAM=SerialParam(progressbar=verbose)) { 89 | 90 | # check validity of input arguments 91 | args <- as.list(environment()) 92 | method <- match.arg(method) 93 | filter <- match.arg(filter) 94 | .check_pbs(pb, check_by=TRUE) 95 | .check_args_pbDS(args) 96 | stopifnot(is(BPPARAM, "BiocParallelParam")) 97 | 98 | if (is.null(design)) { 99 | formula <- ~ group_id 100 | cd <- as.data.frame(colData(pb)) 101 | design <- model.matrix(formula, cd) 102 | colnames(design) <- levels(pb$group_id) 103 | args$design <- design 104 | } 105 | if (is.null(coef) & is.null(contrast)) { 106 | c <- colnames(design)[ncol(design)] 107 | contrast <- makeContrasts(contrasts=c, levels=design) 108 | args$contrast <- contrast 109 | } 110 | 111 | # ct: type of comparison - "contrast" or "coef" 112 | # cs: named list of 'coef's or 'contrast's 113 | if (!is.null(contrast)) { 114 | coef <- NULL 115 | names(cs) <- cs <- colnames(contrast) 116 | } else if (!is.null(coef)) { 117 | if (!is.list(coef)) 118 | coef <- list(coef) 119 | cs <- vapply(coef, function(i) 120 | paste(colnames(design)[i], collapse="-"), 121 | character(1)) 122 | names(cs) <- names(coef) <- cs 123 | } 124 | ct <- ifelse(is.null(coef), "contrast", "coef") 125 | 126 | if (!is.function(method)) { 127 | fun <- switch(method, 128 | "DD"=.edgeR_NB, 129 | "edgeR"=.edgeR, 130 | "DESeq2"=.DESeq2, 131 | "limma-voom"=.limma_voom, 132 | "limma-trend"=.limma_trend) 133 | } else { 134 | fun_call <- 1 135 | } 136 | fun_args <- names(as.list(args(fun))) 137 | fun_args <- fun_args[-length(fun_args)] 138 | 139 | # for ea. cluster, run DEA 140 | n_cells <- .n_cells(pb) 141 | names(kids) <- kids <- assayNames(pb) 142 | res <- bplapply( 143 | BPPARAM=BPPARAM, 144 | kids, function (k) { 145 | rmv <- n_cells[k, ] < min_cells 146 | d <- design[colnames(y <- pb[ , !rmv]), , drop=FALSE] 147 | if (filter %in% c("samples", "both")) { 148 | ls <- colSums(assay(y, k)) 149 | ol <- isOutlier(ls, log=TRUE, type="lower", nmads=3) 150 | d <- d[colnames(y <- y[, !ol]), , drop=FALSE] 151 | } 152 | if (any(tabulate(y$group_id) < 2) 153 | || qr(d)$rank== nrow(d) 154 | || qr(d)$rank < ncol(d)) 155 | return(NULL) 156 | y <- y[rowSums(assay(y, k)) != 0, , drop=FALSE] 157 | if (filter %in% c("genes", "both") & max(assay(y, k)) > 100) 158 | y <- y[filterByExpr(assay(y, k), d), , drop=FALSE] 159 | # drop samples without any detected features 160 | keep <- colAnys(assay(y, k) > 0) 161 | y <- y[, keep, drop=FALSE] 162 | d <- d[keep, , drop=FALSE] 163 | args <- list( 164 | x=y, k=k, design=d, coef=coef, 165 | contrast=contrast, ct=ct, cs=cs, 166 | treat=treat, nc=n_cells[k, !rmv]) 167 | args <- args[intersect(names(args), fun_args)] 168 | suppressWarnings(do.call(fun, args)) 169 | }) 170 | 171 | # remove empty clusters 172 | rmv <- vapply(res, is.null, logical(1)) 173 | res <- res[!rmv] 174 | 175 | if (length(res)== 0) stop( 176 | "Specified filtering options result in no genes in any clusters ", 177 | "being tested. To force testing, consider modifying arguments ", 178 | "'min_cells' and/or 'filter'. See '?pbDS' for details.") 179 | 180 | # reorganize & do global p-value adjustment 181 | names(i) <- i <- c("table", "data", "fit") 182 | res <- lapply(i, map, .x=res) 183 | res$table <- .p_adj_global(res$table) 184 | return(c(res, list(args=args))) 185 | } 186 | 187 | #' @rdname pbDS 188 | #' @export 189 | pbDD <- function(pb, design=NULL, coef=NULL, contrast=NULL, 190 | min_cells=10, filter=c("both", "genes", "samples", "none"), 191 | verbose=TRUE, BPPARAM=SerialParam(progressbar=verbose)) 192 | { 193 | args <- as.list(environment()) 194 | do.call(pbDS, c(args, list(method="DD"))) 195 | } 196 | -------------------------------------------------------------------------------- /R/pbFlatten.R: -------------------------------------------------------------------------------- 1 | #' pbFlatten 2 | #' Flatten pseudobulk SCE 3 | #' 4 | #' Flattens a pseudobulk \code{\link[SingleCellExperiment]{SingleCellExperiment}} 5 | #' as returned by \code{\link{aggregateData}} such that all cell subpopulations 6 | #' are represented as a single assay. 7 | #' 8 | #' @param pb a pseudobulk \code{\link[SingleCellExperiment]{SingleCellExperiment}} 9 | #' as returned by \code{\link{aggregateData}}, with different subpopulations as assays. 10 | #' @param normalize logical specifying whether to compute a \code{logcpm} assay. 11 | #' 12 | #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}. 13 | #' 14 | #' @examples 15 | #' data(example_sce) 16 | #' library(SingleCellExperiment) 17 | #' pb_stack <- aggregateData(example_sce) 18 | #' (pb_flat <- pbFlatten(pb_stack)) 19 | #' ncol(pb_flat) == ncol(pb_stack)*length(assays(pb_stack)) 20 | #' 21 | #' @importFrom methods is 22 | #' @importFrom edgeR cpm calcNormFactors DGEList 23 | #' @importFrom S4Vectors DataFrame metadata as.list 24 | #' @importFrom SingleCellExperiment SingleCellExperiment 25 | #' @importFrom SummarizedExperiment assay assay<- assays colData rowData 26 | #' @export 27 | 28 | pbFlatten <- function(pb, normalize = TRUE){ 29 | # check validity of input arguments 30 | stopifnot(is.logical(normalize), length(normalize) == 1, 31 | is(pb, "SingleCellExperiment"), length(assays(pb)) > 1) 32 | # concatenate assay data 33 | as <- assays(pb) 34 | a <- do.call(cbind, as.list(as)) 35 | sids <- rep(colnames(pb), length(as)) 36 | kids <- rep(names(as), each = ncol(pb)) 37 | colnames(a) <- paste(sids, kids, sep = ".") 38 | pb$sample_id <- colnames(pb) 39 | # construct cell metadata 40 | cd <- lapply(seq_along(as), function(u) colData(pb)) 41 | cd <- do.call(rbind, cd) 42 | rownames(cd) <- colnames(a) 43 | cd$cluster_id <- kids 44 | # construct single-assay SCE 45 | sce <- SingleCellExperiment( 46 | assays = list(counts = a), 47 | colData = cd, 48 | rowData = rowData(pb), 49 | metadata = metadata(pb)) 50 | # (optionally) add number of cells per cluster-sample 51 | if (!is.null(n_cells <- .n_cells(pb))) { 52 | n_cells <- tryCatch(mapply( 53 | function(k, s) n_cells[k, s], 54 | k = as.character(kids), s = as.character(sids)), 55 | error = function(e) { warning(e); NULL }) 56 | if (!is.null(n_cells)) 57 | sce$n_cells <- as.numeric(n_cells) 58 | } 59 | # (optionally) do log-CPM normalization 60 | if (normalize) { 61 | # remove empty columns (samples that lack a cluster) 62 | sce <- sce[, colSums(a != 0) > 0] 63 | dgl <- DGEList(assay(sce)) 64 | dgl <- calcNormFactors(dgl) 65 | assay(sce, "logcpm") <- log1p(cpm(dgl)) 66 | } 67 | return(sce) 68 | } 69 | -------------------------------------------------------------------------------- /R/pbHeatmap.R: -------------------------------------------------------------------------------- 1 | #' @rdname pbHeatmap 2 | #' @title Heatmap of cluster-sample pseudobulks 3 | #' 4 | #' @description ... 5 | #' 6 | #' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}. 7 | #' @param y a list of DS analysis results as returned by 8 | #' \code{\link{pbDS}} or \code{\link{mmDS}}. 9 | #' @param k character vector; specifies which cluster ID(s) to retain. 10 | #' Defaults to \code{levels(x$cluster_id)}. 11 | #' @param g character vector; specifies which genes to retain. 12 | #' Defaults to considering all genes. 13 | #' @param c character string; specifies which contrast/coefficient to retain. 14 | #' Defaults to \code{names(y$table)[1]}. 15 | #' @param top_n single numeric; number of genes to retain per cluster. 16 | #' @param fdr,lfc single numeric; FDR and logFC cutoffs to filter results by. 17 | #' The specified FDR threshold is applied to \code{p_adj.loc} values. 18 | #' @param sort_by character string specifying 19 | #' a numeric results table column to sort by; 20 | #' \code{"none"} to retain original ordering. 21 | #' @param decreasing logical; whether to sort 22 | #' in decreasing order of \code{sort_by}. 23 | #' @param assay character string; specifies which assay to use; 24 | #' should be one of \code{assayNames(x)}. 25 | #' @param fun function to use as summary statistic, 26 | #' e.g., mean, median, sum (depending on the input assay). 27 | #' @param normalize logical; whether to apply a z-normalization 28 | #' to each row (gene) of the cluster-sample pseudobulk data. 29 | #' @param col character vector of colors or color mapping function 30 | #' generated with \code{\link[circlize]{colorRamp2}}. Passed to 31 | #' argument \code{col} in \code{\link[ComplexHeatmap]{Heatmap}} 32 | #' (see \code{?ComplexHeatmap::Heatmap} for details). 33 | #' @param row_anno,col_anno logical; whether to render 34 | #' annotations of cluster and group IDs, respectively. 35 | #' 36 | #' @return a \code{\link{HeatmapList-class}} object. 37 | #' 38 | #' @examples 39 | #' # compute pseudobulks & run DS analysis 40 | #' data(example_sce) 41 | #' pb <- aggregateData(example_sce) 42 | #' res <- pbDS(pb) 43 | #' 44 | #' # cluster-sample expression means 45 | #' pbHeatmap(example_sce, res) 46 | #' 47 | #' # include only a single cluster 48 | #' pbHeatmap(example_sce, res, k = "B cells") 49 | #' 50 | #' # plot specific gene across all clusters 51 | #' pbHeatmap(example_sce, res, g = "ISG20") 52 | #' 53 | #' @author Helena L Crowell 54 | #' 55 | #' @importFrom ComplexHeatmap Heatmap columnAnnotation rowAnnotation 56 | #' @importFrom dplyr bind_rows filter 57 | #' @importFrom grid gpar 58 | #' @importFrom purrr map 59 | #' @importFrom scales hue_pal 60 | #' @importFrom viridis viridis 61 | #' @export 62 | 63 | pbHeatmap <- function(x, y, 64 | k = NULL, g = NULL, c = NULL, 65 | top_n = 20, fdr = 0.05, lfc = 1, 66 | sort_by = "p_adj.loc", decreasing = FALSE, 67 | assay = "logcounts", fun = mean, normalize = TRUE, 68 | col = viridis(10), row_anno = TRUE, col_anno = TRUE) { 69 | 70 | # check validity of input arguments 71 | .check_sce(x, req_group = TRUE) 72 | .check_arg_assay(x, assay) 73 | .check_args_pbHeatmap(as.list(environment())) 74 | 75 | # subset specified contrast/coef & cluster(s) 76 | if (is.null(k)) k <- levels(x$cluster_id) 77 | if (names(y)[1] == "table") { 78 | if (is.null(c)) c <- names(y$table)[1] 79 | y <- y$table[[c]][k] 80 | } else y <- y[k] 81 | y <- y[!vapply(y, is.null, logical(1))] 82 | 83 | # filter results 84 | if (!is.null(g)) y <- lapply(y, filter, gene %in% g) 85 | y <- lapply(y, filter, p_adj.loc < fdr, abs(logFC) > lfc) 86 | 87 | # subset 'top_n' results 88 | if (is.null(top_n)) { 89 | ns <- vapply(y, nrow, numeric(1)) 90 | } else { 91 | ns <- vapply(y, function(u) min(nrow(u), top_n), numeric(1)) 92 | } 93 | 94 | # get cluster IDs & nb. of clusters 95 | ex <- ns == 0; ns <- ns[!ex]; y <- y[!ex] 96 | nk <- length(names(kids) <- kids <- names(y)) 97 | 98 | # re-order results 99 | if (sort_by == "none") { 100 | y <- lapply(kids, function(k) 101 | y[[k]][seq_len(ns[k]), ]) 102 | } else { 103 | vs <- map(y, sort_by) 104 | os <- lapply(vs, order, decreasing = decreasing) 105 | y <- lapply(kids, function(k) { 106 | o <- os[[k]][seq_len(ns[k])] 107 | y[[k]][o, ] 108 | }) 109 | } 110 | y <- bind_rows(y) 111 | 112 | # subset 'assay' data 113 | es <- assays(x)[[assay]] 114 | es <- es[unlist(y$gene), , drop = FALSE] 115 | 116 | # compute cluster-sample summary values (e.g., means) 117 | cells_by_ks <- .split_cells(x) 118 | xs <- t(mapply(function(g, k) 119 | vapply(cells_by_ks[[k]], function(cs) 120 | fun(es[g, cs]), numeric(1)), 121 | g = y$gene, k = y$cluster_id)) 122 | if (normalize) xs <- .z_norm(xs) 123 | 124 | # plotting ----------------------------------------------------------------- 125 | # row & column annotation 126 | lgd_aes <- list(labels_gp = gpar(fontsize = 6), 127 | title_gp = gpar(fontface = "bold", fontsize = 8)) 128 | 129 | if (row_anno & nk > 1) { 130 | cols <- .cluster_colors 131 | if (nk > length(cols)) 132 | cols <- colorRampPalette(cols)(nk) 133 | cols <- setNames(cols[seq_len(nk)], kids) 134 | row_anno <- rowAnnotation( 135 | df = data.frame(cluster_id = y$cluster_id), 136 | col = list(cluster_id = cols), 137 | gp = gpar(col = "white"), 138 | show_annotation_name = FALSE, 139 | annotation_legend_param = lgd_aes) 140 | } else { 141 | row_anno <- NULL 142 | } 143 | 144 | ei <- metadata(x)$experiment_info 145 | m <- match(levels(x$sample_id), ei$sample_id) 146 | if (col_anno) { 147 | cols <- setNames(hue_pal()(nlevels(x$group_id)), levels(x$group_id)) 148 | col_anno <- columnAnnotation( 149 | df = data.frame(group_id = ei$group_id[m]), 150 | col = list(group_id = cols), 151 | gp = gpar(col = "white"), 152 | show_annotation_name = FALSE, 153 | annotation_legend_param = lgd_aes) 154 | } else { 155 | col_anno <- NULL 156 | } 157 | 158 | Heatmap(matrix = xs, col = col, 159 | name = sprintf("%s%s %s", 160 | c("", "z-normalized\n")[normalize + 1], 161 | deparse(substitute(fun)), assay), 162 | row_title = NULL, column_title = NULL, 163 | cluster_rows = FALSE, cluster_columns = FALSE, 164 | left_annotation = row_anno, top_annotation = col_anno, 165 | split = y$cluster_id, column_split = ei$group_id[m], 166 | heatmap_legend_param = lgd_aes, 167 | row_names_gp = gpar(fontsize = 6), 168 | column_names_gp = gpar(fontsize = 8), 169 | column_title_gp = gpar(fontface = "bold", fontsize = 10)) 170 | } -------------------------------------------------------------------------------- /R/pbMDS.R: -------------------------------------------------------------------------------- 1 | #' @rdname pbMDS 2 | #' @title Pseudobulk-level MDS plot 3 | #' 4 | #' @description Renders a multidimensional scaling (MDS) 5 | #' where each point represents a cluster-sample instance; 6 | #' with points colored by cluster ID and shaped by group ID. 7 | #' 8 | #' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}} 9 | #' containing cluster-sample pseudobulks as returned by 10 | #' \code{\link{aggregateData}} with argument 11 | #' \code{by = c("cluster_id", "sample_id")}. 12 | #' 13 | #' @return a \code{ggplot} object. 14 | #' 15 | #' @examples 16 | #' data(example_sce) 17 | #' pb <- aggregateData(example_sce) 18 | #' pbMDS(pb) 19 | #' 20 | #' @author Helena L Crowell & Mark D Robinson 21 | #' 22 | #' @importFrom edgeR calcNormFactors cpm DGEList plotMDS.DGEList 23 | #' @importFrom SummarizedExperiment assays 24 | #' @importFrom grDevices colorRampPalette 25 | #' @importFrom S4Vectors metadata 26 | #' @importFrom Matrix rowSums 27 | #' @importFrom rlang .data 28 | #' @import ggplot2 29 | #' @export 30 | 31 | pbMDS <- function(x) { 32 | # check validity of input pseudobulk-SCE 33 | # (cells should have been aggregated by cluster-sample) 34 | .check_pbs(x, check_by = TRUE) 35 | 36 | y <- as.list(assays(x)) 37 | y <- do.call("cbind", y) 38 | y <- y[, (j <- c(t(.n_cells(x))) != 0)] 39 | d <- DGEList(unname(y), remove.zeros = TRUE) 40 | d <- calcNormFactors(d) 41 | 42 | mds <- plotMDS.DGEList(d, plot = FALSE) 43 | nk <- length(kids <- assayNames(x)) 44 | 45 | ss <- rep(colnames(x), nk) 46 | ks <- rep(kids, each = ncol(x)) 47 | 48 | if (any(!j)) { 49 | txt <- paste(sQuote(ks[!j]), sQuote(ss[!j]), sep = "-") 50 | message("Removing cluster-sample instance(s) ", 51 | paste(txt, collapse = ", ")) 52 | } 53 | 54 | df <- data.frame( 55 | MDS1 = mds$x, MDS2 = mds$y, 56 | cluster_id = factor(ks[j], levels = kids), 57 | group_id = rep(x$group_id, nk)[j]) 58 | 59 | cols <- .cluster_colors 60 | if (nk > length(cols)) 61 | cols <- colorRampPalette(cols)(nk) 62 | 63 | ggplot(df, aes(.data$MDS1, .data$MDS2, 64 | col=.data$cluster_id, shape=.data$group_id)) + 65 | scale_color_manual(values = cols) + 66 | geom_point(size = 3, alpha = 0.8) + 67 | guides(color = guide_legend(override.aes = list(alpha = 1))) + 68 | theme_bw() + theme(aspect.ratio = 1, 69 | axis.text = element_text(color = "black"), 70 | panel.grid.minor = element_blank(), 71 | panel.grid.major = element_line(linewidth = 0.2, color = "lightgrey")) 72 | } 73 | -------------------------------------------------------------------------------- /R/prepSCE.R: -------------------------------------------------------------------------------- 1 | #' @rdname prepSCE 2 | #' @title Prepare SCE for DS analysis 3 | #' 4 | #' @description ... 5 | #' 6 | #' @param x a \linkS4class{SingleCellExperiment}. 7 | #' @param kid,sid,gid character strings specifying 8 | #' the \code{colData(x)} columns containing cluster assignments, 9 | #' unique sample identifiers, and group IDs (e.g., treatment). 10 | #' @param drop logical. Specifies whether \code{colData(x)} columns 11 | #' besides those specified as \code{cluster_id,sample_id,group_id} 12 | #' should be retained (default \code{drop = FALSE}) 13 | #' or removed (\code{drop = TRUE}). 14 | #' 15 | #' @examples 16 | #' # generate random counts 17 | #' ng <- 50 18 | #' nc <- 200 19 | #' 20 | #' # generate some cell metadata 21 | #' gids <- sample(c("groupA", "groupB"), nc, TRUE) 22 | #' sids <- sample(paste0("sample", seq_len(3)), nc, TRUE) 23 | #' kids <- sample(paste0("cluster", seq_len(5)), nc, TRUE) 24 | #' batch <- sample(seq_len(3), nc, TRUE) 25 | #' cd <- data.frame(group = gids, id = sids, cluster = kids, batch) 26 | #' 27 | #' # construct SCE 28 | #' library(scuttle) 29 | #' sce <- mockSCE(ncells = nc, ngenes = ng) 30 | #' colData(sce) <- cbind(colData(sce), cd) 31 | #' 32 | #' # prep. for workflow 33 | #' sce <- prepSCE(sce, kid = "cluster", sid = "id", gid = "group") 34 | #' head(colData(sce)) 35 | #' metadata(sce)$experiment_info 36 | #' sce 37 | #' 38 | #' @author Helena L Crowell 39 | #' 40 | #' @return a \linkS4class{SingleCellExperiment}. 41 | #' 42 | #' @importFrom dplyr mutate_all 43 | #' @importFrom S4Vectors DataFrame metadata<- 44 | #' @importFrom SingleCellExperiment reducedDims SingleCellExperiment 45 | #' @importFrom SummarizedExperiment assays colData rowData 46 | #' @export 47 | 48 | prepSCE <- function(x, 49 | kid = "cluster_id", 50 | sid = "sample_id", 51 | gid = "group_id", 52 | drop = FALSE) { 53 | 54 | stopifnot(is(x, "SingleCellExperiment")) 55 | 56 | args <- as.list(environment()) 57 | ids <- args[grep("[a-z]id", names(args))] 58 | ids <- unlist(ids) 59 | 60 | stopifnot(is.character(ids)) 61 | stopifnot(all(ids %in% colnames(colData(x)))) 62 | 63 | cd0 <- colData(x) 64 | cd <- data.frame(cd0[ids], check.names = FALSE) 65 | cd <- mutate_all(cd, as.factor) 66 | colnames(cd) <- unlist(formals()[names(ids)]) 67 | 68 | if (!drop) 69 | cd <- data.frame(cd, 70 | cd0[setdiff(colnames(cd0), ids)], 71 | check.names = FALSE) 72 | 73 | # replace colData in SCE 74 | colData(x) <- DataFrame(cd) 75 | 76 | # construct metadata 77 | ei <- .make_ei(x) 78 | metadata(x)$experiment_info <- ei 79 | 80 | return(x) 81 | } 82 | -------------------------------------------------------------------------------- /R/prepSim.R: -------------------------------------------------------------------------------- 1 | #' @name prepSim 2 | #' 3 | #' @title SCE preparation for \code{\link{simData}} 4 | #' 5 | #' @description \code{prepSim} prepares an input SCE for simulation 6 | #' with \code{muscat}'s \code{\link{simData}} function by 7 | #' \enumerate{ 8 | #' \item{basic filtering of genes and cells} 9 | #' \item{(optional) filtering of subpopulation-sample instances} 10 | #' \item{estimation of cell (library sizes) and gene parameters 11 | #' (dispersions and sample-specific means), respectively.} 12 | #' } 13 | #' 14 | #' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}. 15 | #' @param min_count,min_cells used for filtering of genes; only genes with 16 | #' a count > \code{min_count} in >= \code{min_cells} will be retained. 17 | #' @param min_genes used for filtering cells; 18 | #' only cells with a count > 0 in >= \code{min_genes} will be retained. 19 | #' @param min_size used for filtering subpopulation-sample combinations; 20 | #' only instances with >= \code{min_size} cells will be retained. 21 | #' Specifying \code{min_size = NULL} skips this step. 22 | #' @param group_keep character string; if \code{nlevels(x$group_id) > 1}, 23 | #' specifies which group of samples to keep (see details). The default 24 | #' NULL retains samples from \code{levels(x$group_id)[1]}; otherwise, 25 | #' if `colData(x)$group_id` is not specified, all samples will be kept. 26 | #' @param verbose logical; should information on progress be reported? 27 | #' 28 | #' @details For each gene \eqn{g}, \code{prepSim} fits a model to estimate 29 | #' sample-specific means \eqn{\beta_g^s}, for each sample \eqn{s}, 30 | #' and dispersion parameters \eqn{\phi_g} using \code{edgeR}'s 31 | #' \code{\link[edgeR]{estimateDisp}} function with default parameters. 32 | #' Thus, the reference count data is modeled as NB distributed: 33 | #' \deqn{Y_{gc} \sim NB(\mu_{gc}, \phi_g)} 34 | #' for gene \eqn{g} and cell \eqn{c}, where the mean 35 | #' \eqn{\mu_{gc} = \exp(\beta_{g}^{s(c)}) \cdot \lambda_c}. Here, 36 | #' \eqn{\beta_{g}^{s(c)}} is the relative abundance of gene \eqn{g} 37 | #' in sample \eqn{s(c)}, \eqn{\lambda_c} is the library size 38 | #' (total number of counts), and \eqn{\phi_g} is the dispersion. 39 | #' 40 | #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}} 41 | #' containing, for each cell, library size (\code{colData(x)$offset}) 42 | #' and, for each gene, dispersion and sample-specific mean estimates 43 | #' (\code{rowData(x)$dispersion} and \code{$beta.sample_id}, respectively). 44 | #' 45 | #' @examples 46 | #' # estimate simulation parameters 47 | #' data(example_sce) 48 | #' ref <- prepSim(example_sce) 49 | #' 50 | #' # tabulate number of genes/cells before vs. after 51 | #' ns <- cbind( 52 | #' before = dim(example_sce), 53 | #' after = dim(ref)) 54 | #' rownames(ns) <- c("#genes", "#cells") 55 | #' ns 56 | #' 57 | #' library(SingleCellExperiment) 58 | #' head(rowData(ref)) # gene parameters 59 | #' head(colData(ref)) # cell parameters 60 | #' 61 | #' @author Helena L Crowell 62 | #' 63 | #' @references 64 | #' Crowell, HL, Soneson, C, Germain, P-L, Calini, D, 65 | #' Collin, L, Raposo, C, Malhotra, D & Robinson, MD: 66 | #' On the discovery of population-specific state transitions from 67 | #' multi-sample multi-condition single-cell RNA sequencing data. 68 | #' \emph{bioRxiv} \strong{713412} (2018). 69 | #' doi: \url{https://doi.org/10.1101/713412} 70 | #' 71 | #' @importFrom edgeR DGEList estimateDisp glmFit 72 | #' @importFrom Matrix colSums rowSums 73 | #' @importFrom matrixStats rowAnyNAs 74 | #' @importFrom SingleCellExperiment SingleCellExperiment counts 75 | #' @importFrom SummarizedExperiment colData rowData<- 76 | #' @importFrom stats model.matrix 77 | #' @importFrom S4Vectors DataFrame 78 | #' @export 79 | 80 | prepSim <- function(x, 81 | min_count = 1, min_cells = 10, 82 | min_genes = 100, min_size = 100, 83 | group_keep = NULL, verbose = TRUE) { 84 | 85 | .check_sce(x, req_group = FALSE) 86 | stopifnot(is.numeric(min_count), 87 | is.numeric(min_cells), is.numeric(min_genes), 88 | is.null(min_size) || is.numeric(min_size), 89 | is.logical(verbose), length(verbose) == 1) 90 | 91 | # get model variables 92 | vars <- c("sample_id", "cluster_id") 93 | names(vars) <- vars <- intersect(vars, names(colData(x))) 94 | 95 | # assure these are factors 96 | for (v in vars) { 97 | # drop singular variables from model 98 | n <- length(unique(x[[v]])) 99 | if (n == 1) { 100 | rmv <- grep(v, vars) 101 | vars <- vars[-rmv] 102 | } 103 | if (!is.factor(x[[v]])) 104 | x[[v]] <- as.factor(x[[v]]) 105 | x[[v]] <- droplevels(x[[v]]) 106 | } 107 | 108 | n_cells0 <- ncol(x) 109 | x <- .update_sce(x) 110 | if (is.null(group_keep)) { 111 | if ("group_id" %in% colnames(colData(x))) { 112 | group_keep <- levels(x$group_id)[1] 113 | if (verbose) { 114 | fmt <- paste( 115 | "Argument `group_keep` unspecified;", 116 | "defaulting to retaining %s-group samples.") 117 | message(sprintf(fmt, dQuote(group_keep))) 118 | } 119 | cells_keep <- x$group_id == group_keep 120 | } else { 121 | cells_keep <- seq_len(ncol(x)) 122 | } 123 | } else { 124 | stopifnot(is.character(group_keep), 125 | group_keep %in% levels(x$group_id)) 126 | cells_keep <- x$group_id %in% group_keep 127 | } 128 | x <- x[, cells_keep] 129 | x <- .update_sce(x) 130 | 131 | # keep genes w/ count > `min_count` in at least `min_cells`; 132 | # keep cells w/ at least `min_genes` detected genes 133 | if (verbose) message("Filtering...") 134 | genes_keep <- rowSums(counts(x) > min_count) >= min_cells 135 | cells_keep <- colSums(counts(x) > 0) >= min_genes 136 | if (verbose) message(sprintf( 137 | "- %s/%s genes and %s/%s cells retained.", 138 | sum(genes_keep), nrow(x), sum(cells_keep), n_cells0)) 139 | x <- x[genes_keep, cells_keep, drop = FALSE] 140 | 141 | # keep cluster-samples w/ at least 'min_size' cells 142 | if (!is.null(min_size)) { 143 | n_cells <- table(x$cluster_id, x$sample_id) 144 | n_cells <- .filter_matrix(n_cells, n = min_size) 145 | if (ncol(n_cells) == 1) 146 | stop("Current 'min_size' retains only 1 sample,\nbut", 147 | " mean-dispersion estimation requires at least 2.") 148 | if (verbose) message(sprintf( 149 | "- %s/%s subpopulations and %s/%s samples retained.", 150 | nrow(n_cells), nlevels(x$cluster_id), 151 | ncol(n_cells), nlevels(x$sample_id))) 152 | x <- .filter_sce(x, rownames(n_cells), colnames(n_cells)) 153 | } 154 | 155 | if (is.null(rownames(x))) rownames(x) <- paste0("gene", seq(nrow(x))) 156 | if (is.null(colnames(x))) colnames(x) <- paste0("cell", seq(ncol(x))) 157 | 158 | # construct model formula 159 | f <- "~ 1" 160 | for (v in vars) 161 | f <- paste(f, v, sep = "+") 162 | cd <- as.data.frame(droplevels(colData(x))) 163 | mm <- model.matrix(as.formula(f), data = cd) 164 | 165 | # fit NB model 166 | if (verbose) 167 | message("Estimating gene and cell parameters...") 168 | y <- DGEList(counts(x)) 169 | y <- calcNormFactors(y) 170 | y <- estimateDisp(y, mm) 171 | y <- glmFit(y, prior.count = 0) 172 | 173 | # drop genes for which estimation failed 174 | cs <- y$coefficients 175 | i <- !rowAnyNAs(cs) 176 | x <- x[i, , drop = FALSE] 177 | cs <- cs[i, , drop = FALSE] 178 | ds <- y$dispersion[i] 179 | names(ds) <- rownames(x) 180 | 181 | # group betas by variable 182 | bs <- DataFrame( 183 | beta0 = cs[, 1], 184 | row.names = rownames(x)) 185 | for (v in vars) { 186 | pat <- paste0("^", v) 187 | i <- grep(pat, colnames(cs)) 188 | df <- DataFrame(cs[, i]) 189 | nms <- colnames(cs)[i] 190 | names(df) <- gsub(pat, "", nms) 191 | bs[[v]] <- df 192 | } 193 | # store betas & dispersions in rowData 194 | rowData(x)$beta <- bs 195 | rowData(x)$disp <- ds 196 | 197 | # store offsets in colData 198 | os <- c(y$offset) 199 | names(os) <- colnames(x) 200 | x$offset <- os 201 | 202 | # drop singular variables from cell metadata 203 | for (v in names(colData(x))) { 204 | n <- length(unique(x[[v]])) 205 | if (n == 1) x[[v]] <- NULL 206 | } 207 | 208 | # return SCE 209 | return(x) 210 | } 211 | -------------------------------------------------------------------------------- /R/resDS.R: -------------------------------------------------------------------------------- 1 | #' resDS 2 | #' Formatting of DS analysis results 3 | #' 4 | #' \code{resDS} provides a simple wrapper to format cluster-level 5 | #' differential testing results into an easily filterable table, and 6 | #' to optionally append gene expression frequencies by cluster-sample 7 | #' & -group, as well as cluster-sample-wise CPM. 8 | #' 9 | #' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}. 10 | #' @param y a list of DS testing results as returned 11 | #' by \code{\link{pbDS}} or \code{\link{mmDS}}. 12 | #' @param bind character string specifying the output format (see details). 13 | #' @param frq logical or a pre-computed list of expression frequencies 14 | #' as returned by \code{\link{calcExprFreqs}}. 15 | #' @param cpm logical specifying whether CPM by cluster-sample 16 | #' should be appendeded to the output result table(s). 17 | #' @param digits integer value specifying the 18 | #' number of significant digits to maintain. 19 | #' @param sep character string to use as separator 20 | #' when constructing new column names. 21 | #' @param ... optional arguments passed to 22 | #' \code{\link{calcExprFreqs}} if \code{frq = TRUE}. 23 | #' 24 | #' @details When \code{bind = "col"}, the list of DS testing results at 25 | #' \code{y$table} will be merge vertically (by column) into a single table 26 | #' in tidy format with column \code{contrast/coef} specifying the comparison. 27 | #' 28 | #' Otherwise, when \code{bind = "row"}, an identifier of the respective 29 | #' contrast or coefficient will be appended to the column names, 30 | #' and all tables will be merge horizontally (by row). 31 | #' 32 | #' Expression frequencies pre-computed with \code{\link{calcExprFreqs}} 33 | #' may be provided with \code{frq}. Alternatively, when \code{frq = TRUE}, 34 | #' expression frequencies can be computed directly, and additional arguments 35 | #' may be passed to \code{\link{calcExprFreqs}} (see examples below). 36 | #' 37 | #' @return returns a `data.frame`. 38 | #' 39 | #' @examples 40 | #' # compute pseudobulks (sum of counts) 41 | #' data(example_sce) 42 | #' pb <- aggregateData(example_sce, 43 | #' assay = "counts", fun = "sum") 44 | #' 45 | #' # run DS analysis (edgeR on pseudobulks) 46 | #' res <- pbDS(pb, method = "edgeR") 47 | #' 48 | #' head(resDS(example_sce, res, bind = "row")) # tidy format 49 | #' head(resDS(example_sce, res, bind = "col", digits = Inf)) 50 | #' 51 | #' # append CPMs & expression frequencies 52 | #' head(resDS(example_sce, res, cpm = TRUE)) 53 | #' head(resDS(example_sce, res, frq = TRUE)) 54 | #' 55 | #' # pre-computed expression frequencies & append 56 | #' frq <- calcExprFreqs(example_sce, assay = "counts", th = 0) 57 | #' head(resDS(example_sce, res, frq = frq)) 58 | #' 59 | #' @author Helena L Crowell & Mark D Robinson 60 | #' 61 | #' @importFrom dplyr %>% bind_rows inner_join full_join mutate mutate_if select all_of 62 | #' @importFrom edgeR cpm 63 | #' @importFrom methods is 64 | #' @importFrom purrr reduce 65 | #' @importFrom SummarizedExperiment colData 66 | #' @importFrom S4Vectors metadata 67 | #' @export 68 | 69 | resDS <- function(x, y, bind = c("row", "col"), 70 | frq = FALSE, cpm = FALSE, digits = 3, sep = "__", ...) { 71 | 72 | # check validity of input arguments 73 | .check_sce(x, req_group = TRUE) 74 | #.check_res(x, y) 75 | bind <- match.arg(bind) 76 | if (!is.logical(frq)) 77 | .check_frq(x, frq) 78 | stopifnot(is.infinite(digits) || is.numeric(digits) & 79 | digits > 0 & as.integer(digits) == digits) 80 | 81 | ei <- metadata(x)$experiment_info 82 | kids <- levels(x$cluster_id) 83 | 84 | res <- switch(bind, 85 | row = { 86 | bind_rows(lapply(y$table, bind_rows)) 87 | }, 88 | col = { 89 | ct <- ifelse(!is.null(y$args$contrast), "contrast", "coef") 90 | cs <- names(y$table) 91 | res <- lapply(cs, function(c) { 92 | df <- bind_rows(y$table[[c]]) 93 | df <- select(df, !all_of(ct)) 94 | i <- !colnames(df) %in% c("gene", "cluster_id") 95 | colnames(df)[i] <- paste(colnames(df)[i], c, sep = sep) 96 | return(df) 97 | }) 98 | reduce(res, full_join, by = c("gene", "cluster_id")) 99 | }) 100 | 101 | .tidy <- function(u, ei, append = "") { 102 | m1 <- match(ei$sample_id, colnames(u), nomatch = 0) 103 | m2 <- match(levels(ei$group_id), colnames(u)) 104 | if (all(is.na(m2))) m2 <- 0 105 | colnames(u)[m1] <- paste0(ei$sample_id, append)[m1 != 0] 106 | colnames(u)[m2] <- paste0(colnames(u)[m2], append) 107 | k <- seq_len(ncol(u))[-c(m1, m2)] 108 | u[, c(k, m1[order(ei$group)], m2)] 109 | } 110 | 111 | # append expression frequencies 112 | if (is.logical(frq)) 113 | if (frq) frq <- calcExprFreqs(x, ...) else frq <- NULL 114 | if (!is.null(frq)) { 115 | frq <- data.frame( 116 | gene = rep(rownames(x), length(assays(frq))), 117 | cluster_id = rep(assayNames(frq), each = nrow(x)), 118 | do.call("rbind", as.list(assays(frq))), 119 | row.names = NULL, check.names = FALSE, stringsAsFactors = FALSE) 120 | frq <- .tidy(frq, ei, append = ".frq") 121 | res <- inner_join(frq, res, 122 | multiple = "all", 123 | by = c("gene", "cluster_id")) 124 | } 125 | 126 | # append CPMs 127 | if (cpm) { 128 | cpm <- lapply(kids, function(k) { 129 | if (is.null(y$data[[k]])) 130 | return(NULL) 131 | cpm <- cpm(y$data[[k]]) 132 | data.frame(cpm, 133 | gene = rownames(cpm), 134 | cluster_id = k, 135 | row.names = NULL, 136 | check.names = FALSE, 137 | stringsAsFactors = FALSE) 138 | }) 139 | cpm <- bind_rows(cpm) 140 | cpm <- .tidy(cpm, ei, append = ".cpm") 141 | res <- inner_join(cpm, res, 142 | multiple = "all", 143 | by = c("gene", "cluster_id")) 144 | } 145 | mutate_if(res, is.numeric, signif, digits) 146 | } 147 | -------------------------------------------------------------------------------- /R/stagewiseDD.R: -------------------------------------------------------------------------------- 1 | #' @importFrom dplyr bind_rows 2 | .res_DX <- function(res_DS, res_DD) { 3 | # for each contrast... 4 | names(cts) <- cts <- names(res_DS$table) 5 | lapply(cts, function(ct) { 6 | # for each cluster... 7 | names(kids) <- kids <- names(res_DS$table[[ct]]) 8 | lapply(kids, function(kid) { 9 | # get DS/DD results 10 | DS <- res_DS$table[[ct]][[kid]] 11 | DD <- res_DD$table[[ct]][[kid]] 12 | # add missing genes 13 | DS <- bind_rows(DS, data.frame(gene=setdiff(DD$gene, DS$gene))) 14 | DD <- bind_rows(DD, data.frame(gene=setdiff(DS$gene, DD$gene))) 15 | # reorder & return both 16 | DD <- DD[match(DS$gene, DD$gene), ] 17 | return(list(DS=DS, DD=DD)) 18 | }) 19 | }) 20 | } 21 | 22 | #' @rdname stagewise_DS_DD 23 | #' @title Perform two-stage testing on DS and DD analysis results 24 | #' 25 | #' @param res_DS a list of DS testing results as returned 26 | #' by \code{\link{pbDS}} or \code{\link{mmDS}}. 27 | #' @param res_DD a list of DD testing results as returned 28 | #' by \code{\link{pbDD}} (or \code{\link{pbDS}} with \code{method="DD"}). 29 | #' @param sce (optional) \code{SingleCellExperiment} object containing the data 30 | #' that underlies testing, prior to summarization with \code{\link{aggregateData}}. 31 | #' Used for validation of inputs in order to prevent unexpected failure/results. 32 | #' @param verbose logical. Should information on progress be reported? 33 | #' 34 | #' @return 35 | #' A list of \code{DFrame}s containing results for each contrast and cluster. 36 | #' Each table contains DS and DD results for genes shared between analyses, 37 | #' as well as results from stagewise testing analysis, namely: 38 | #' \itemize{ 39 | #' \item{\code{p_adj}: FDR adjusted p-values for the 40 | #' screening hypothesis that a gene is neither DS nor DD 41 | #' (see \code{?stageR::getAdjustedPValues} for details)} 42 | #' \item{\code{p_val.DS/D}: confirmation stage p-values for DS/D}} 43 | #' 44 | #' @examples 45 | #' data(example_sce) 46 | #' 47 | #' pbs_sum <- aggregateData(example_sce, assay="counts", fun="sum") 48 | #' pbs_det <- aggregateData(example_sce, assay="counts", fun="num.detected") 49 | #' 50 | #' res_DS <- pbDS(pbs_sum, min_cells=0, filter="none", verbose=FALSE) 51 | #' res_DD <- pbDD(pbs_det, min_cells=0, filter="none", verbose=FALSE) 52 | #' 53 | #' res <- stagewise_DS_DD(res_DS, res_DD) 54 | #' head(res[[1]][[1]]) # results for 1st cluster 55 | #' 56 | #' @importFrom S4Vectors DataFrame 57 | #' @importFrom purrr map_depth 58 | #' @export 59 | 60 | stagewise_DS_DD <- function(res_DS, res_DD, sce=NULL, verbose=FALSE) { 61 | if (!requireNamespace("stageR", quietly=TRUE)) 62 | stop("Install 'stageR' to use this function.") 63 | 64 | # validity checks 65 | # TODO: helper to check validity of 'res_DS/D' 66 | # against each other and, optionally, 'sce' 67 | stopifnot( 68 | # same coefs/constrasts 69 | names(x <- res_DS$table) == 70 | names(y <- res_DD$table), 71 | # any shared clusters 72 | sum(mapply(\(i, j) 73 | length(intersect(i, j)), 74 | i=lapply(x, names), 75 | j=lapply(y, names))) > 0) 76 | if (!is.null(sce)) { 77 | .check_sce(sce) 78 | . <- map_depth(list(x, y), 3, \(df) df$gene %in% rownames(sce)) 79 | stopifnot("gene(s) present in 'res_DS/D' not found in 'sce'"=unlist(.)) 80 | . <- map_depth(list(x, y), 3, \(df) df$cluster_id %in% sce$cluster_id) 81 | stopifnot("cluster(s) present in 'res_DS/D' not found in 'sce'"=unlist(.)) 82 | } 83 | 84 | # assure that results contain same set of genes, in the same order 85 | # (indepedent of different filtering criteria for the two analyses) 86 | res_DX <- .res_DX(res_DS=res_DS, res_DD=res_DD) 87 | 88 | # perform harmonic mean p-value aggregation according to 89 | # (https://www.pnas.org/doi/full/10.1073/pnas.1814092116) 90 | .mu <- \(x) 1/mean(1/x, na.rm=TRUE) 91 | 92 | # perform stagewise testing 93 | res <- map_depth(res_DX, 2, \(x) { 94 | ps <- data.frame( 95 | p_val.DS=x$DS$p_val, 96 | p_val.DD=x$DD$p_val, 97 | row.names=x$DS$gene) 98 | qs <- apply(ps, 1, .mu); names(qs) <- x$DS$gene 99 | obj <- stageR::stageR(qs, as.matrix(ps), FALSE) 100 | eva <- expression({ 101 | obj <- stageR::stageWiseAdjustment(obj, 102 | method="none", alpha=0.05, allowNA=TRUE) 103 | res <- stageR::getAdjustedPValues(obj, 104 | onlySignificantGenes=FALSE, order=FALSE) 105 | }) 106 | res <- if (verbose) eval(eva) else suppressMessages({eval(eva)}) 107 | # TODO: communicate this better with the user? 108 | if (is.null(res)) res <- NA 109 | colnames(res)[1] <- "p_adj" 110 | return(res) 111 | }) 112 | names(cs) <- cs <- names(res) 113 | lapply(cs, \(c) { 114 | names(ks) <- ks <- names(res[[c]]) 115 | lapply(ks, \(k) { 116 | gs <- rownames(df <- res[[c]][[k]]) 117 | res_DS <- I((. <- res_DS$table[[c]][[k]])[match(gs, .$gene), ]) 118 | res_DD <- I((. <- res_DD$table[[c]][[k]])[match(gs, .$gene), ]) 119 | DataFrame(gene=gs, df, cluster_id=k, contrast=c, res_DS, res_DD) 120 | }) 121 | }) 122 | } 123 | -------------------------------------------------------------------------------- /R/utils-pbDS.R: -------------------------------------------------------------------------------- 1 | #' @importFrom BiocParallel SerialParam 2 | #' @importFrom purrr map 3 | #' @importFrom scuttle summarizeAssayByGroup 4 | #' @importFrom SummarizedExperiment assay colData 5 | .pb <- function(x, by, assay, fun, BPPARAM = SerialParam()) { 6 | # compute pseudobulks 7 | suppressWarnings( 8 | # temporarily suppressing warnings b/c 'median' 9 | # warns about unspecified 'useNames' argument 10 | y <- summarizeAssayByGroup(x, 11 | assay.type = assay, 12 | ids = (ids <- colData(x)[by]), 13 | statistics = fun, 14 | BPPARAM = BPPARAM)) 15 | colnames(y) <- y[[by[length(by)]]] 16 | 17 | if (length(by) == 1) 18 | return(assay(y)) 19 | 20 | # reformat into one assay per 'by[1]' 21 | if (is.factor(ids <- y[[by[1]]])) 22 | ids <- droplevels(ids) 23 | is <- split(seq_len(ncol(y)), ids) 24 | ys <- map(is, ~assay(y)[, ., drop=FALSE]) 25 | 26 | # fill in missing combinations 27 | for (i in seq_along(ys)) { 28 | fill <- setdiff( 29 | unique(y[[by[2]]]), 30 | colnames(ys[[i]])) 31 | if (length(fill != 0)) { 32 | foo <- matrix(0, nrow(x), length(fill)) 33 | colnames(foo) <- fill 34 | foo <- cbind(ys[[i]], foo) 35 | o <- paste(sort(unique(y[[by[2]]]))) 36 | ys[[i]] <- foo[, o] 37 | } 38 | } 39 | return(ys) 40 | } 41 | 42 | # extract table of cell counts from 'int_colData' 43 | # of pseudobulks as returned by 'aggregateData' 44 | #' @importFrom S4Vectors metadata 45 | #' @importFrom SingleCellExperiment int_colData 46 | .n_cells <- function(x) { 47 | y <- int_colData(x)$n_cells 48 | if (is.null(y)) return(NULL) 49 | if (length(metadata(x)$agg_pars$by) == 2) 50 | y <- as.matrix(data.frame(y, check.names = FALSE)) 51 | return(as.table(y)) 52 | } 53 | 54 | # wrapper to create output tables 55 | # k: cluster ID 56 | # tt: topTable data.frame 57 | .res_df <- function(tbl, k, ct, c) { 58 | df <- data.frame( 59 | gene = rownames(tbl), cluster_id = k, tbl, 60 | row.names = NULL, stringsAsFactors = FALSE) 61 | df[[ct]] <- c; df 62 | } 63 | 64 | #' @importFrom DESeq2 DESeq results 65 | #' @importFrom dplyr rename 66 | #' @importFrom edgeR calcNormFactors DGEList estimateDisp 67 | #' filterByExpr glmQLFit glmQLFTest glmTreat topTags 68 | #' @importFrom scater isOutlier 69 | #' @importFrom SummarizedExperiment assay 70 | #' @importFrom S4Vectors metadata 71 | .edgeR <- function(x, k, design, coef, contrast, ct, cs, treat) { 72 | y <- assay(x, k) 73 | y <- suppressMessages(DGEList(y, 74 | group = x$group_id[colnames(y)], 75 | remove.zeros = TRUE)) 76 | y <- calcNormFactors(y) 77 | y <- estimateDisp(y, design) 78 | fit <- glmQLFit(y, design) 79 | # treat: test for DE relative to logFC threshold 80 | # else: genewise NB GLM with quasi-likelihood test 81 | .fun <- ifelse(treat, glmTreat, glmQLFTest) 82 | tbl <- lapply(cs, function(c) { 83 | fit <- .fun(fit, coef[[c]], contrast[, c]) 84 | tbl <- topTags(fit, n = Inf, sort.by = "none") 85 | # combine tables & reformat 86 | tbl <- rename(tbl$table, p_val = "PValue", p_adj.loc = "FDR") 87 | tbl <- .res_df(tbl, k, ct, c) 88 | }) 89 | list(table = tbl, data = y, fit = fit) 90 | } 91 | 92 | #' @importFrom matrixStats rowMedians 93 | .edgeR_NB <- \(x, k, design, coef, contrast, ct, cs, nc) { 94 | y <- assay(x, k) 95 | # Gene_level filtering to remove genes detected in 96 | # almost all cells of almost all pseudobulk samples 97 | med_detection <- rowMedians(sweep(y, 2, nc, "/")) 98 | gene_filter <- med_detection < 0.9 99 | # Normalization offset to remove systematic differences between pseudobulk 100 | # samples that are due to technical or nuisance biological variability. 101 | # Idea obtained from cellular detection rate (CDR) normalization from MAST. 102 | # Note that this normalization is used instead of 'edgeR::calcNormFactors()'. 103 | of <- colMeans(sweep(y[gene_filter, ], 2, nc, "/")) 104 | # construct 'DGEList' 105 | y <- suppressMessages(DGEList( 106 | counts = y[gene_filter, ], 107 | group = x$group_id[colnames(y)], 108 | remove.zeros = TRUE)) 109 | # add offsets to 'DGEList' 110 | y$offset <- log(nc * of) 111 | # run an 'edgeR' analysis 112 | y <- estimateDisp(y, design) 113 | fit <- glmQLFit(y, design, robust = TRUE) 114 | tbl <- lapply(cs, function(c) { 115 | fit <- glmQLFTest(fit, 116 | coef[[c]], 117 | contrast[, c], 118 | poisson.bound = FALSE) 119 | tbl <- topTags(fit, n = Inf, sort.by = "none") 120 | tbl <- rename(tbl$table, p_val = "PValue", p_adj.loc = "FDR") 121 | tbl <- .res_df(tbl, k, ct, c) 122 | }) 123 | list(table = tbl, data = y, fit = fit) 124 | } 125 | 126 | #' @importFrom dplyr rename 127 | #' @importFrom edgeR calcNormFactors DGEList 128 | #' @importFrom limma contrasts.fit eBayes lmFit topTable topTreat voom treat 129 | #' @importFrom SummarizedExperiment assay 130 | #' @importFrom S4Vectors metadata 131 | .limma <- function(x, k, design, coef, contrast, ct, cs, method, treat) { 132 | y <- assay(x, k) 133 | trend <- robust <- TRUE 134 | if (method == "voom") { 135 | trend <- robust <- FALSE 136 | y <- suppressMessages(DGEList(y, remove.zeros = TRUE)) 137 | y <- calcNormFactors(y) 138 | y <- voom(y, design) 139 | } 140 | w <- .n_cells(x)[k, colnames(x)] 141 | fit <- lmFit(y, design, weights = w) 142 | # treat: eBayes moderated-t p-val relative to min logFC threshold 143 | # else: eBayes moderated t-stat testing each contrast equal to 0 144 | .fun <- ifelse(treat, limma::treat, eBayes) 145 | .tbl <- ifelse(treat, topTreat, topTable) 146 | tbl <- lapply(cs, function(c) { 147 | fit <- contrasts.fit(fit, contrast[, c], coef[[c]]) 148 | fit <- .fun(fit, trend = trend, robust = robust) 149 | tbl <- .tbl(fit, number = Inf, sort.by = "none") 150 | tbl <- rename(tbl, p_val = "P.Value", p_adj.loc = "adj.P.Val") 151 | tbl <- .res_df(tbl, k, ct, c) 152 | }) 153 | list(table = tbl, data = y, fit = fit) 154 | } 155 | 156 | .limma_trend <- function(x, k, design, coef, contrast, ct, cs, treat) 157 | .limma(x, k, design, coef, contrast, ct, cs, method = "trend", treat) 158 | 159 | .limma_voom <- function(x, k, design, coef, contrast, ct, cs, treat) 160 | .limma(x, k, design, coef, contrast, ct, cs, method = "voom", treat) 161 | 162 | #' @importFrom dplyr rename 163 | #' @importFrom DESeq2 DESeq DESeqDataSetFromMatrix results 164 | #' @importFrom SummarizedExperiment assay colData 165 | .DESeq2 <- function(x, k, design, contrast, ct, cs) { 166 | cd <- colData(x) 167 | y <- assay(x, k) 168 | mode(y) <- "integer" 169 | y <- DESeqDataSetFromMatrix(y, cd, design) 170 | y <- suppressMessages(DESeq(y)) 171 | tbl <- lapply(cs, function(c) { 172 | tbl <- results(y, contrast[, c]) 173 | tbl <- .res_df(tbl, k, ct, c) 174 | rename(tbl, logFC = "log2FoldChange", 175 | p_val = "pvalue", p_adj.loc = "padj") 176 | }) 177 | list(table = tbl, data = y) 178 | } 179 | 180 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | # ============================================================================== 2 | # filters rows/columns from input matrix `m` until all entries >= `n`, 3 | # such that ea. iteration removes the row/column w/ the smallest summed value. 4 | # ------------------------------------------------------------------------------ 5 | .filter_matrix <- function(m, n = 100) { 6 | while (any(m < n)) { 7 | # get candidate rows/cols for removal 8 | i <- m < n 9 | r <- apply(i, 1, any) 10 | c <- apply(i, 2, any) 11 | # get smallest row/col 12 | rs <- rowSums(m) 13 | cs <- colSums(m) 14 | r <- which(r)[which.min(rs[r])] 15 | c <- which(c)[which.min(cs[c])] 16 | # priorities removal of rows over cols 17 | if (rs[r] <= cs[c]) { 18 | m <- m[-r, , drop = FALSE] 19 | } else { 20 | m <- m[, -c, drop = FALSE] 21 | } 22 | if (any(dim(m) == 1)) 23 | break 24 | } 25 | return(m) 26 | } 27 | 28 | #' @importFrom dplyr mutate_if 29 | #' @importFrom purrr negate 30 | #' @importFrom S4Vectors DataFrame metadata metadata<- 31 | #' @importFrom SummarizedExperiment colData colData<- 32 | .update_sce <- function(sce) { 33 | # update colData 34 | cd <- as.data.frame(colData(sce)) 35 | cd <- mutate_if(cd, is.factor, droplevels) 36 | cd <- mutate_if(cd, negate(is.factor), factor) 37 | colData(sce) <- DataFrame(cd, row.names = colnames(sce)) 38 | # update metadata 39 | if(!is.null(ei <- metadata(sce)$experiment_info)){ 40 | ei <- ei[ei$sample_id %in% levels(sce$sample_id), ] 41 | ei <- mutate_if(ei, is.factor, droplevels) 42 | metadata(sce)$experiment_info <- ei 43 | } 44 | return(sce) 45 | } 46 | 47 | .filter_sce <- function(sce, kids, sids) { 48 | cs1 <- sce$cluster_id %in% kids 49 | cs2 <- sce$sample_id %in% sids 50 | sce <- sce[, cs1 & cs2] 51 | sce <- .update_sce(sce) 52 | return(sce) 53 | } 54 | 55 | .cluster_colors <- c( 56 | "#DC050C", "#FB8072", "#1965B0", "#7BAFDE", "#882E72", 57 | "#B17BA6", "#FF7F00", "#FDB462", "#E7298A", "#E78AC3", 58 | "#33A02C", "#B2DF8A", "#55A1B1", "#8DD3C7", "#A6761D", 59 | "#E6AB02", "#7570B3", "#BEAED4", "#666666", "#999999", 60 | "#aa8282", "#d4b7b7", "#8600bf", "#ba5ce3", "#808000", 61 | "#aeae5c", "#1e90ff", "#00bfff", "#56ff0d", "#ffff00") 62 | 63 | # ============================================================================== 64 | # scale values b/w 0 and 1 using 65 | # low (1%) and high (99%) quantiles as boundaries 66 | # ------------------------------------------------------------------------------ 67 | #' @importFrom matrixStats rowQuantiles 68 | .scale <- function(x) { 69 | qs <- rowQuantiles(as.matrix(x), probs = c(.01, .99), na.rm = TRUE) 70 | x <- (x - qs[, 1]) / (qs[, 2] - qs[, 1]) 71 | x[x < 0] <- 0 72 | x[x > 1] <- 1 73 | return(x) 74 | } 75 | 76 | # ============================================================================== 77 | # wrapper for z-normalization 78 | # ------------------------------------------------------------------------------ 79 | .z_norm <- function(x, th = 2.5) { 80 | x <- as.matrix(x) 81 | sds <- rowSds(x, na.rm = TRUE) 82 | sds[sds == 0] <- 1 83 | x <- t(t(x - rowMeans(x, na.rm = TRUE)) / sds) 84 | #x <- (x - rowMeans(x, na.rm = TRUE)) / sds 85 | x[x > th] <- th 86 | x[x < -th] <- -th 87 | return(x) 88 | } 89 | 90 | # ------------------------------------------------------------------------------ 91 | # generate experimental design metadata table 92 | # for an input SCE or colData data.frame 93 | # ------------------------------------------------------------------------------ 94 | #' @importFrom dplyr mutate_at 95 | #' @importFrom methods is 96 | #' @importFrom SummarizedExperiment colData 97 | .make_ei <- function(x) { 98 | if (is(x, "SingleCellExperiment")) 99 | x <- colData(x) 100 | sids <- unique(x$sample_id) 101 | m <- match(sids, x$sample_id) 102 | df <- data.frame( 103 | stringsAsFactors = FALSE, 104 | sample_id = sids, 105 | group_id = x$group_id[m], 106 | n_cells = as.numeric(table(x$sample_id)[sids])) 107 | for (i in c("sample_id", "group_id")) 108 | if (is.factor(x[[i]])) 109 | df <- mutate_at(df, i, factor, levels = levels(x[[i]])) 110 | return(df) 111 | } 112 | 113 | # ------------------------------------------------------------------------------ 114 | # split cells by cluster-sample 115 | # ------------------------------------------------------------------------------ 116 | # x: a SingleCellExperiment or colData 117 | # by: character vector specifying colData column(s) to split by 118 | # > If length(by) == 1, a list of length nlevels(colData$by), else, 119 | # a nested list with 2nd level of length nlevels(colData$by[2]) 120 | # ------------------------------------------------------------------------------ 121 | #' @importFrom data.table data.table 122 | #' @importFrom purrr map_depth 123 | .split_cells <- function(x, 124 | by = c("cluster_id", "sample_id")) { 125 | if (is(x, "SingleCellExperiment")) 126 | x <- colData(x) 127 | cd <- data.frame(x[by], check.names = FALSE) 128 | cd <- data.table(cd, cell = rownames(x)) %>% 129 | split(by = by, sorted = TRUE, flatten = FALSE) 130 | map_depth(cd, length(by), "cell") 131 | } 132 | 133 | # ------------------------------------------------------------------------------ 134 | # global p-value adjustment 135 | # ------------------------------------------------------------------------------ 136 | # x: results table; a nested list w/ 137 | # 1st level = comparisons and 2nd level = clusters 138 | # > adds 'p_adj.glb' column containing globally adjusted p-values 139 | # to the result table of ea. cluster for each comparison 140 | # ------------------------------------------------------------------------------ 141 | #' @importFrom purrr map map_depth 142 | #' @importFrom stats p.adjust 143 | .p_adj_global <- function(x) { 144 | names(ks) <- ks <- names(x) 145 | names(cs) <- cs <- names(x[[1]]) 146 | lapply(cs, function(c) { 147 | # get p-values 148 | tbl <- map_depth(x, 1, c) 149 | p_val <- map(tbl, "p_val") 150 | # adjust for each comparison 151 | p_adj <- p.adjust(unlist(p_val), method = "BH") 152 | # re-split by cluster 153 | ns <- vapply(p_val, length, numeric(1)) 154 | p_adj <- split(p_adj, rep.int(ks, ns)) 155 | # insert into results tables 156 | lapply(ks, function(k) { 157 | u <- x[[k]][[c]] 158 | i <- which(colnames(u) == "p_adj.loc") 159 | u[["p_adj.glb"]] <- p_adj[[k]] 160 | u[, c(seq_len(i), ncol(u), seq(i+1, ncol(u)-1))] 161 | }) 162 | }) 163 | } 164 | 165 | # ------------------------------------------------------------------------------ 166 | # toy SCE for unit-testing 167 | # ------------------------------------------------------------------------------ 168 | #' @importFrom SingleCellExperiment SingleCellExperiment 169 | .toySCE <- function(dim = c(200, 800)) { 170 | gs <- paste0("gene", seq_len(ngs <- dim[1])) 171 | cs <- paste0("cell", seq_len(ncs <- dim[2])) 172 | 173 | y <- rnbinom(ngs * ncs, size = 2, mu = 4) 174 | y <- matrix(y, ngs, ncs, TRUE, list(gs, cs)) 175 | 176 | cd <- mapply(function(i, n) 177 | sample(paste0(i, seq_len(n)), ncs, TRUE), 178 | i = c("k", "s", "g"), n = c(5, 4, 3)) 179 | 180 | cd <- data.frame(cd, stringsAsFactors = TRUE) 181 | cd$s <- factor(paste(cd$s, cd$g, sep = ".")) 182 | colnames(cd) <- paste(c("cluster", "sample", "group"), "id", sep = "_") 183 | 184 | SingleCellExperiment( 185 | assay = list(counts = y), colData = cd, 186 | metadata = list(experiment_info = .make_ei(cd))) 187 | } 188 | -------------------------------------------------------------------------------- /R/validity-checks.R: -------------------------------------------------------------------------------- 1 | # validity checks for objects & function arguments 2 | # ============================================================================== 3 | 4 | # check input SCE 5 | #' @importFrom methods is 6 | #' @importFrom SummarizedExperiment colData 7 | .check_sce <- function(x, req_group = TRUE) { 8 | stopifnot(is(x, "SingleCellExperiment")) 9 | stopifnot(c("cluster_id", "sample_id") %in% colnames(colData(x))) 10 | if (req_group) 11 | stopifnot("group_id" %in% colnames(colData(x))) 12 | } 13 | 14 | # check of 'assay' argument 15 | #' @importFrom SummarizedExperiment assayNames 16 | .check_arg_assay <- function(x, y) { 17 | stopifnot(is.character(y), length(y) == 1, y %in% assayNames(x)) 18 | if (sum(assayNames(x) == y) > 1) 19 | stop("Argument 'assay' was matched to multiple times.\n ", 20 | " Please assure that the input SCE has unique 'assayNames'.") 21 | } 22 | 23 | # check pseudo-bulks for DS analysis 24 | # (must have be aggregated by cluster-sample) 25 | # x = SCE used for aggregation 26 | # y = SCE containing pseudo-bulks as returned by 27 | #` aggregateData(x, by = c("cluster_id", "sample_id")) 28 | #' @importFrom methods is 29 | #' @importFrom S4Vectors metadata 30 | #' @importFrom SummarizedExperiment assayNames 31 | .check_pbs <- function(pbs, sce = NULL, check_by = TRUE) { 32 | stopifnot(is(pbs, "SingleCellExperiment"), 33 | !is.null(ei <- metadata(pbs)$experiment_info), 34 | !is.null(agg_pars <- metadata(pbs)$agg_pars), 35 | !is.null(n_cells <- .n_cells(pbs)), 36 | identical(assayNames(pbs), rownames(n_cells)), 37 | identical(colnames(pbs), colnames(n_cells))) 38 | if (!is.null(sce)) { 39 | stopifnot(identical(ei, metadata(sce)$experiment_info), 40 | identical(assayNames(pbs), levels(sce[[agg_pars$by[1]]])), 41 | identical(rownames(pbs), rownames(sce))) 42 | if (length(agg_pars$by == 2)) 43 | stopifnot(identical(colnames(pbs), levels(sce[[agg_pars$by[2]]]))) 44 | } 45 | if (check_by) 46 | stopifnot(!is.null(pbs[["group_id"]]), 47 | identical(agg_pars$by, c("cluster_id", "sample_id"))) 48 | } 49 | 50 | # check validity of runDS() output 51 | #' @importFrom methods is 52 | #' @importFrom S4Vectors metadata 53 | .check_res <- function(x, y) { 54 | ei <- metadata(x)$experiment_info 55 | nk <- length(kids <- levels(x$cluster_id)) 56 | nms <- c("table", "data", "method", "design", "contrast", "coef") 57 | stopifnot(is(y, "list"), all.equal(names(y), nms)) 58 | # table 59 | stopifnot(is(y$table, "list"), 60 | vapply(y$table, is, class = "list", logical(1)), 61 | identical(names(y$table), colnames(y$contrast)) 62 | | identical(names(y$table), names(y$coef)), 63 | apply(vapply(y$table, names, character(nk)), 2, identical, kids)) 64 | # data 65 | stopifnot(is(y$data, "list"), names(y$data) %in% kids, 66 | vapply(y$data, is, class = "DGEList", logical(1))) 67 | # design 68 | stopifnot(is(y$design, "matrix"), 69 | colnames(y$design) %in% ei$group_id, 70 | rownames(y$design) %in% ei$sample_id) 71 | # contrast & coef 72 | stopifnot(is.null(y$contrast) | is(y$contrast, "matrix")) 73 | stopifnot(is.null(y$coef) | is(y$coef, "numeric") | is(y$coef, "list")) 74 | } 75 | 76 | # check validity of calcExprFreqs() output 77 | #' @importFrom methods is 78 | #' @importFrom SummarizedExperiment assays colData 79 | .check_frq <- function(x, y) { 80 | stopifnot( 81 | is(x, "SingleCellExperiment"), 82 | is(y, "SingleCellExperiment")) 83 | kids <- levels(x$cluster_id) 84 | 85 | ids <- levels(x$sample_id) 86 | if ("group_id" %in% colnames(colData(x))) 87 | ids <- c(ids, levels(x$group_id)) 88 | stopifnot(identical(ids, colnames(y))) 89 | 90 | vals <- unlist(assays(y)) 91 | stopifnot(all(vals <= 1), all(vals >= 0)) 92 | } 93 | 94 | .check_args_simData <- function(u) { 95 | if (!is.null(u$ns)) 96 | stopifnot( 97 | is.numeric(u$ns), length(u$ns) == 1, 98 | u$ns > 0, as.integer(u$ns) == u$ns) 99 | if (!is.null(u$nk)) { 100 | stopifnot( 101 | is.numeric(u$nk), length(u$nk) == 1, 102 | u$nk > 0, as.integer(u$nk) == u$nk) 103 | } else u$nk <- nlevels(u$x$cluster_id) 104 | 105 | if (!u$force && u$ng != nrow(u$x)) 106 | stop("Number of simulated genes should match with reference,\n", 107 | " but 'ng != nrow(x)'; please specify 'force = TRUE' if\n", 108 | " simulation should be forced regardlessly (see '?simData').") 109 | if (!is.null(u$phylo_tree) && u$p_type != 0) 110 | stop("Only one of arguments 'p_type' or 'phylo_tree'\n", 111 | " can be specified; see '?simData' for 'Details'.") 112 | # assure number of simulated clusters matches with specified phylogeny 113 | if (!is.null(u$phylo_tree)) { 114 | kids_phylo <- .get_clusters_from_phylo(u$phylo_tree) 115 | nk_phylo <- length(kids_phylo) 116 | ns_phylo <- as.numeric(gsub("[a-z]", "", kids_phylo)) 117 | if (!all(sort(ns_phylo) == seq_len(nk_phylo))) 118 | stop("Some clusters appear to be missing from 'phylo_tree';\n", 119 | " please make sure all clusters up to ", 120 | dQuote(kids_phylo[which.max(ns_phylo)]), " are present.") 121 | # possibly update number of clusters 'nk' 122 | if (nk_phylo != u$nk) u$nk <- nk_phylo 123 | } 124 | stopifnot( 125 | is.numeric(u$ng), length(u$ng) == 1, u$ng > 0, as.integer(u$ng) == u$ng, 126 | is.numeric(u$nc), length(u$nc) == 1, u$nc > 0, as.integer(u$nc) == u$nc, 127 | is.numeric(u$p_dd), length(u$p_dd) == 6, u$p_dd >= 0, u$p_dd <= 1, 128 | abs(1-sum(u$p_dd)) < 1e-12, is.logical(u$paired), length(u$paired) == 1, 129 | is.numeric(u$p_ep), length(u$p_ep) == 1, u$p_ep > 0, u$p_ep < 1, 130 | is.numeric(u$p_dp), length(u$p_dp) == 1, u$p_dp > 0, u$p_dp < 1, 131 | is.numeric(u$p_dm), length(u$p_dm) == 1, u$p_dm > 0, u$p_dm < 1, 132 | is.numeric(u$p_type), length(u$p_type) == 1, u$p_type >= 0, u$p_type <= 1, 133 | is.numeric(u$lfc), is.numeric(u$lfc), length(u$lfc) == 1, u$lfc >= 1, 134 | is.logical(u$force), length(u$force) == 1, 135 | is.numeric(u$phylo_pars), length(u$phylo_pars) == 2, u$phylo_pars >= 0) 136 | if (!is.null(u$rel_lfc)) 137 | stopifnot(is.numeric(u$rel_lfc), 138 | length(u$rel_lfc) == u$nk, u$rel_lfc >= 0) 139 | return(list(nk = u$nk, ns = u$ns)) 140 | } 141 | 142 | #' @importFrom SummarizedExperiment colData 143 | .check_args_aggData <- function(u) { 144 | stopifnot(is.character(u$by), length(u$by) <= 2, 145 | u$by %in% colnames(colData(u$x))) 146 | stopifnot(is.logical(u$scale), length(u$scale) == 1) 147 | if (u$scale & (!u$assay %in% c("cpm", "CPM") | u$fun != "sum")) 148 | stop("Option 'scale = TRUE' only valid for", 149 | " 'assay = \"cpm/CPM\"' and 'fun = \"sum\"'.") 150 | } 151 | 152 | .check_args_pbDS <- function(u) { 153 | if (!is.null(u$design)) 154 | stopifnot(is.matrix(u$design), 155 | !is.null(rownames(u$design)), 156 | !is.null(colnames(u$design))) 157 | stopifnot( 158 | is.null(u$contrast) | is.matrix(u$contrast), 159 | is.null(u$coef) | is.numeric(unlist(u$coef)), 160 | is.numeric(u$min_cells), length(u$min_cells) == 1, 161 | is.logical(u$verbose), length(u$verbose) == 1, 162 | is.logical(u$treat), length(u$treat) == 1) 163 | } 164 | 165 | .check_args_mmDS <- function(u) { 166 | stopifnot( 167 | is.null(u$covs) || is.character(u$covs) & all(u$covs %in% names(colData(u$x))), 168 | is.numeric(u$coef) & u$coef %in% seq_len(nlevels(u$x$group_id)) 169 | | is.character(u$coef) & u$coef %in% c("(Intercept)", 170 | paste0("group_id", levels(u$x$group_id)[-1])), 171 | !is.null(metadata(u$x)$experiment_info$group_id) | !is.null(u$x$group_id), 172 | is.numeric(u$n_cells), length(u$n_cells) == 1, u$n_cells >= 0, 173 | is.numeric(u$n_samples), length(u$n_samples) == 1, u$n_samples >= 2, 174 | is.numeric(u$min_count), length(u$min_count) == 1, u$min_count >= 0, 175 | is.numeric(u$min_cells), length(u$min_cells) == 1, u$min_cells >= 0, 176 | is.logical(u$verbose), length(u$verbose) == 1, 177 | is.logical(u$dup_corr), length(u$dup_corr) == 1, 178 | is.logical(u$trended), length(u$trended) == 1, 179 | is.logical(u$bayesian), length(u$bayesian) == 1, 180 | is.logical(u$blind), length(u$blind) == 1, 181 | is.logical(u$REML), length(u$REML) == 1) 182 | } 183 | 184 | .check_args_pbHeatmap <- function(u) { 185 | if (!is.null(u$k)) 186 | stopifnot(is.character(u$k), u$k %in% levels(u$x$cluster_id)) 187 | if (!is.null(u$g)) 188 | stopifnot(is.character(u$g), u$g %in% rownames(u$x)) 189 | if (!is.null(u$c)) 190 | stopifnot(is.character(u$c), u$c %in% names(u$y$table)) 191 | stopifnot( 192 | is.numeric(u$top_n), length(u$top_n) == 1, u$top_n > 1, 193 | is.numeric(u$fdr), length(u$fdr) == 1, u$fdr > 0, 194 | is.numeric(u$lfc), length(u$lfc) == 1, 195 | is.character(u$sort_by), length(u$sort_by) == 1, 196 | u$sort_by == "none" | 197 | !is.null(dim(u$y[[1]])) & 198 | u$sort_by %in% names(u$y[[1]]) & 199 | is.numeric(u$y[[1]][[u$sort_by]]) | 200 | u$sort_by %in% names(u$y$table[[1]][[1]]) & 201 | is.numeric(u$y$table[[1]][[1]][[u$sort_by]]), 202 | is.function(u$fun), 203 | is.logical(u$decreasing), length(u$decreasing) == 1, 204 | is.logical(u$normalize), length(u$normalize) == 1, 205 | is.logical(u$row_anno), length(u$row_anno) == 1, 206 | is.logical(u$col_anno), length(u$col_anno) == 1) 207 | } 208 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | **`muscat` (**Mu**lti-sample **mu**lti-group **sc**RNA-seq **a**nalysis **t**ools )** 4 | 5 | ...provides methods for *Differential State* (DS) analyses in scRNA-seq data 6 | with multiple samples, groups, and (cell)-subpopulations, as elaborated in: 7 | 8 | > Crowell HL, Soneson C\*, Germain P-L\*, 9 | Calini D, Collin L, Raposo C, Malhotra D & Robinson MD: 10 | "*muscat* detects subpopulation-specific state transitions from 11 | multi-sample multi-condition single-cell transcriptomics data" 12 | *Nature Communications* **11**, 6077 (2020) 13 | [DOI: 10.1038/s41467-020-19894-4](https://doi.org/10.1038/s41467-020-19894-4) 14 | 15 | *These authors contributed equally. 16 | 17 | ### installation 18 | 19 | `muscat` is available through Bioconductor, and 20 | can be installed using the following commands: 21 | 22 | ```r 23 | if (!requireNamespace("BiocManager", quietly = TRUE)) 24 | install.packages("BiocManager") 25 | BiocManager::install("muscat") 26 | ``` 27 | 28 | ### quick guide 29 | 30 | Let `sce` be a [`SingleCellExperiment`](https://www.bioconductor.org/packages/SingleCellExperiment.html) object with cell metadata (`colData`) columns 31 | 32 | 1. `"sample_id"` specifying unique sample identifiers (e.g., PeterPan1, Nautilus7, ...) 33 | 2. `"group_id"` specifying each sample's experimental condition (e.g., reference/stimulated, healthy/diseased, ...) 34 | 3. `"cluster_id"` specifying subpopulation (cluster) assignments (e.g., B cells, dendritic cells, ...) 35 | 36 | Aggregation-based methods come down to the following simple commands: 37 | 38 | ```r 39 | # compute pseudobulks (sum of counts) 40 | pb <- aggregateData(sce, 41 | assay = "counts", fun = "sum", 42 | by = c("cluster_id", "sample_id")) 43 | 44 | # run pseudobulk (aggregation-based) DS analysis 45 | ds_pb <- pbDS(pb, method = "edgeR") 46 | ``` 47 | 48 | Mixed models can be run directly on cell-level measurements, e.g.: 49 | 50 | ```r 51 | ds_mm <- mmDS(sce, method = "dream") 52 | ``` 53 | 54 | For details, please see the package vignettes. 55 | 56 | ### differential detection 57 | 58 | `muscat` also supports testing for differential detection as proposed in 59 | 60 | > Gilis J, Perin L, Malfait M, Van den Berge K, 61 | Assefa AT, Verbist B, Risso D, and Clement L: 62 | Differential detection workflows for 63 | multi-sample single-cell RNA-seq data. 64 | *bioRxiv* (2023). [DOI: 10.1101/2023.12.17.572043](https://doi.org/10.1101/2023.12.17.572043) 65 | 66 | Key alterations to the commands above are highlighted below (!!!), 67 | however, we recommend users consult the corresponding publication 68 | and package vignette for more details. 69 | 70 | ```r 71 | # sum binarized counts 72 | pb <- aggregateData(sce, 73 | assay = "counts", 74 | fun = "num.detected", # !!! 75 | by = c("cluster_id", "sample_id")) 76 | # test for differential detection 77 | dd <- pbDD(pb) # or.. 78 | dd <- pbDS(pb, method = "DD") 79 | ``` -------------------------------------------------------------------------------- /data/example_sce.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HelenaLC/muscat/49a3b323133921fdcd9772441df0aa2ee09ab1e1/data/example_sce.rda -------------------------------------------------------------------------------- /inst/extdata/1a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HelenaLC/muscat/49a3b323133921fdcd9772441df0aa2ee09ab1e1/inst/extdata/1a.png -------------------------------------------------------------------------------- /inst/extdata/1b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HelenaLC/muscat/49a3b323133921fdcd9772441df0aa2ee09ab1e1/inst/extdata/1b.png -------------------------------------------------------------------------------- /inst/extdata/1d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HelenaLC/muscat/49a3b323133921fdcd9772441df0aa2ee09ab1e1/inst/extdata/1d.png -------------------------------------------------------------------------------- /inst/extdata/muscat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HelenaLC/muscat/49a3b323133921fdcd9772441df0aa2ee09ab1e1/inst/extdata/muscat.png -------------------------------------------------------------------------------- /inst/extdata/refs.bib: -------------------------------------------------------------------------------- 1 | % Generated by Paperpile. Check out https://paperpile.com for more information. 2 | % BibTeX export options can be customized via Settings -> BibTeX. 3 | 4 | @ARTICLE{Crowell2020-muscat, 5 | title = "muscat detects subpopulation-specific state transitions from 6 | multi-sample multi-condition single-cell transcriptomics data", 7 | author = "Crowell, Helena L and Soneson, Charlotte and Germain, Pierre-Luc 8 | and Calini, Daniela and Collin, Ludovic and Raposo, Catarina and 9 | Malhotra, Dheeraj and Robinson, Mark D", 10 | journal = "Nature Communications", 11 | volume = 11, 12 | number = 1, 13 | pages = "6077", 14 | year = 2020 15 | } 16 | 17 | @ARTICLE{Zhang2019, 18 | title = "Probabilistic cell type assignment of single-cell transcriptomic 19 | data reveals spatiotemporal microenvironment dynamics in human 20 | cancers", 21 | author = "Zhang, Allen W and O\textbackslashtextquoterightFlanagan, Ciara 22 | and Chavez, Elizabeth and Lim, Jamie L P and McPherson, Andrew and 23 | Wiens, Matt and Walters, Pascale and Chan, Tim and Hewitson, 24 | Brittany and Lai, Daniel and Mottok, Anja and Sarkozy, Clementine 25 | and Chong, Lauren and Aoki, Tomohiro and Wang, Xuehai and Weng, 26 | Andrew P and McAlpine, Jessica N and Aparicio, Samuel and Steidl, 27 | Christian and Campbell, Kieran R and Shah, Sohrab P", 28 | journal = "bioRxiv", 29 | volume = 521914, 30 | year = 2019 31 | } 32 | 33 | @ARTICLE{Soneson2018-countsimQC, 34 | title = "Towards unified quality verification of synthetic count data with 35 | {countsimQC}", 36 | author = "Soneson, Charlotte and Robinson, Mark D", 37 | journal = "Bioinformatics", 38 | volume = 34, 39 | number = 4, 40 | pages = "691--692", 41 | year = 2018 42 | } 43 | 44 | @ARTICLE{Butler2018-Seurat, 45 | title = "Integrating single-cell transcriptomic data across different 46 | conditions, technologies, and species", 47 | author = "Butler, Andrew and Hoffman, Paul and Smibert, Peter and Papalexi, 48 | Efthymia and Satija, Rahul", 49 | journal = "Nature Biotechnology", 50 | volume = 36, 51 | number = 5, 52 | pages = "411--420", 53 | year = 2018 54 | } 55 | 56 | @ARTICLE{Stuart2019, 57 | title = "Comprehensive {I}ntegration of {{S}ingle-{C}ell} {D}ata", 58 | author = "Stuart, Tim and Butler, Andrew and Hoffman, Paul and Hafemeister, 59 | Christoph and Papalexi, Efthymia and Mauck, 3rd, William M and 60 | Hao, Yuhan and Stoeckius, Marlon and Smibert, Peter and Satija, 61 | Rahul", 62 | journal = "Cell", 63 | volume = 177, 64 | number = 7, 65 | pages = "1888--1902.e21", 66 | year = 2019 67 | } 68 | 69 | @ARTICLE{Stegle2015, 70 | title = "Computational and analytical challenges in single-cell 71 | transcriptomics", 72 | author = "Stegle, Oliver and Teichmann, Sarah A and Marioni, John C", 73 | journal = "Nature Reviews Genetics", 74 | volume = 16, 75 | number = 3, 76 | pages = "133--145", 77 | year = 2015 78 | } 79 | 80 | @ARTICLE{Trapnell2015, 81 | title = "Defining cell types and states with single-cell genomics", 82 | author = "Trapnell, Cole", 83 | journal = "Genome Research", 84 | volume = 25, 85 | number = 10, 86 | pages = "1491--1498", 87 | year = 2015 88 | } 89 | 90 | @ARTICLE{Wagner2016, 91 | title = "Revealing the vectors of cellular identity with single-cell 92 | genomics", 93 | author = "Wagner, Allon and Regev, Aviv and Yosef, Nir", 94 | journal = "Nature Biotechnology", 95 | volume = 34, 96 | number = 11, 97 | pages = "1145--1160", 98 | year = 2016 99 | } 100 | 101 | @ARTICLE{McCarthy2017-scater, 102 | title = "Scater: pre-processing, quality control, normalization and 103 | visualization of single-cell {RNA-seq} data in {R}", 104 | author = "McCarthy, Davis J and Campbell, Kieran R and Lun, Aaron T L and 105 | Wills, Quin F", 106 | journal = "Bioinformatics", 107 | volume = 33, 108 | number = 8, 109 | pages = "1179--1186", 110 | year = 2017 111 | } 112 | 113 | @ARTICLE{Love2014-DESeq2, 114 | title = "Moderated estimation of fold change and dispersion for {RNA-seq} 115 | data with {DESeq2}", 116 | author = "Love, Michael I and Huber, Wolfgang and Anders, Simon", 117 | journal = "Genome Biology", 118 | volume = 15, 119 | number = 12, 120 | pages = "550", 121 | year = 2014 122 | } 123 | 124 | @ARTICLE{Korthauer2016-scDD, 125 | title = "A statistical approach for identifying differential distributions 126 | in single-cell {RNA-seq} experiments", 127 | author = "Korthauer, Keegan D and Chu, Li-Fang and Newton, Michael A and Li, 128 | Yuan and Thomson, James and Stewart, Ron and Kendziorski, 129 | Christina", 130 | journal = "Genome Biology", 131 | volume = 17, 132 | number = 1, 133 | pages = "222", 134 | year = 2016 135 | } 136 | 137 | @ARTICLE{Kang2018-demuxlet, 138 | title = "Multiplexed droplet single-cell {RNA-sequencing} using natural 139 | genetic variation", 140 | author = "Kang, Hyun Min and Subramaniam, Meena and Targ, Sasha and Nguyen, 141 | Michelle and Maliskova, Lenka and McCarthy, Elizabeth and Wan, 142 | Eunice and Wong, Simon and Byrnes, Lauren and Lanata, Cristina M 143 | and Gate, Rachel E and Mostafavi, Sara and Marson, Alexander and 144 | Zaitlen, Noah and Criswell, Lindsey A and Ye, Chun Jimmie", 145 | journal = "Nature Biotechnology", 146 | volume = 36, 147 | number = 1, 148 | pages = "89--94", 149 | year = 2018 150 | } 151 | 152 | @ARTICLE{Soneson2016-iCOBRA, 153 | title = "{iCOBRA}: open, reproducible, standardized and live method 154 | benchmarking", 155 | author = "Soneson, Charlotte and Robinson, Mark D", 156 | journal = "Nature Methods", 157 | volume = 13, 158 | number = 4, 159 | pages = "283", 160 | year = 2016 161 | } 162 | 163 | @ARTICLE{Hafemeister2019-sctransform, 164 | title = "Normalization and variance stabilization of single-cell {RNA-seq} 165 | data using regularized negative binomial regression", 166 | author = "Hafemeister, Christoph and Satija, Rahul", 167 | journal = "bioRxiv", 168 | volume = 576827, 169 | year = 2019 170 | } 171 | 172 | @ARTICLE{Freytag2018-clustering, 173 | title = "Comparison of clustering tools in {R} for medium-sized 10x 174 | Genomics single-cell {RNA-sequencing} data", 175 | author = "Freytag, Saskia and Tian, Luyi and L{\"o}nnstedt, Ingrid and Ng, 176 | Milica and Bahlo, Melanie", 177 | journal = "F1000Research", 178 | volume = 7, 179 | pages = "1297", 180 | year = 2018 181 | } 182 | 183 | @ARTICLE{Duo2018, 184 | title = "A systematic performance evaluation of clustering methods for 185 | single-cell {RNA-seq} data", 186 | author = "Du{\`o}, Angelo and Robinson, Mark D and Soneson, Charlotte", 187 | journal = "F1000Research", 188 | volume = 7, 189 | pages = "1141", 190 | year = 2018 191 | } 192 | 193 | @ARTICLE{Diaz-Mejia2019, 194 | title = "Evaluation of methods to assign cell type labels to cell clusters 195 | from single-cell {RNA-sequencing} data", 196 | author = "Diaz-Mejia, J Javier and Javier Diaz-Mejia, J and Meng, Elaine C 197 | and Pico, Alexander R and MacParland, Sonya A and Ketela, Troy and 198 | Pugh, Trevor J and Bader, Gary D and Morris, John H", 199 | journal = "F1000Research", 200 | volume = 8, 201 | pages = "296", 202 | year = 2019 203 | } 204 | 205 | @ARTICLE{Soneson2018, 206 | title = "Bias, robustness and scalability in single-cell differential 207 | expression analysis", 208 | author = "Soneson, Charlotte and Robinson, Mark D", 209 | journal = "Nature Methods", 210 | volume = 15, 211 | number = 4, 212 | pages = "255--261", 213 | year = 2018 214 | } 215 | 216 | @ARTICLE{Ritchie2015-limma, 217 | title = "limma powers differential expression analyses for {RNA-sequencing} 218 | and microarray studies", 219 | author = "Ritchie, Matthew E and Phipson, Belinda and Wu, Di and Hu, Yifang 220 | and Law, Charity W and Shi, Wei and Smyth, Gordon K", 221 | journal = "Nucleic Acids Research", 222 | volume = 43, 223 | number = 7, 224 | pages = "e47", 225 | year = 2015 226 | } 227 | 228 | @ARTICLE{Robinson2010-edgeR, 229 | title = "edge{R}: a {B}ioconductor package for differential expression 230 | analysis of digital gene expression data", 231 | author = "Robinson, Mark D and McCarthy, Davis J and Smyth, Gordon K", 232 | journal = "Bioinformatics", 233 | volume = 26, 234 | number = 1, 235 | pages = "139--140", 236 | year = 2010 237 | } 238 | 239 | @ARTICLE{Gilis2023, 240 | title = "Differential detection workflows for multi-sample single-cell 241 | {RNA-seq} data", 242 | author = "Gilis, Jeroen and Perin, Laura and Malfait, Milan and Van den 243 | Berge, Koen and Takele Assefa, Alemu and Verbist, Bie and Risso, 244 | Davide and Clement, Lieven", 245 | journal = "bioRxiv", 246 | year = 2023 247 | } 248 | 249 | @ARTICLE{Bouland2021, 250 | title = "Differential analysis of binarized single-cell {RNA} sequencing 251 | data captures biological variation", 252 | author = "Bouland, Gerard A and Mahfouz, Ahmed and Reinders, Marcel J T", 253 | journal = "NAR Genomics and Bioinformatics", 254 | volume = 3, 255 | number = 4, 256 | pages = "lqab118", 257 | year = 2021 258 | } 259 | 260 | @ARTICLE{Qiu2020, 261 | title = "Embracing the dropouts in single-cell {RNA-seq} analysis", 262 | author = "Qiu, Peng", 263 | journal = "Nature Communications", 264 | volume = 11, 265 | number = 1, 266 | pages = "1169", 267 | year = 2020, 268 | language = "en" 269 | } 270 | 271 | @ARTICLE{Vandenberge2017, 272 | title = "stageR: a general stage-wise method for controlling the gene-level 273 | false discovery rate in differential expression and differential 274 | transcript usage", 275 | author = "Van den Berge, Koen and Soneson, Charlotte and Robinson, Mark D 276 | and Clement, Lieven", 277 | journal = "Genome Biology", 278 | volume = 18, 279 | number = 151, 280 | year = 2017, 281 | language = "en" 282 | } -------------------------------------------------------------------------------- /man/aggregateData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/aggregateData.R 3 | \name{aggregateData} 4 | \alias{aggregateData} 5 | \title{Aggregation of single-cell to pseudobulk data} 6 | \usage{ 7 | aggregateData( 8 | x, 9 | assay = NULL, 10 | by = c("cluster_id", "sample_id"), 11 | fun = c("sum", "mean", "median", "prop.detected", "num.detected"), 12 | scale = FALSE, 13 | verbose = TRUE, 14 | BPPARAM = SerialParam(progressbar = verbose) 15 | ) 16 | } 17 | \arguments{ 18 | \item{x}{a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.} 19 | 20 | \item{assay}{character string specifying the assay slot to use as 21 | input data. Defaults to the 1st available (\code{assayNames(x)[1]}).} 22 | 23 | \item{by}{character vector specifying which 24 | \code{colData(x)} columns to summarize by (at most 2!).} 25 | 26 | \item{fun}{a character string. 27 | Specifies the function to use as summary statistic. 28 | Passed to \code{\link[scuttle]{summarizeAssayByGroup}}.} 29 | 30 | \item{scale}{logical. Should pseudo-bulks be scaled 31 | with the effective library size & multiplied by 1M?} 32 | 33 | \item{verbose}{logical. Should information on progress be reported?} 34 | 35 | \item{BPPARAM}{a \code{\link[BiocParallel]{BiocParallelParam}} 36 | object specifying how aggregation should be parallelized.} 37 | } 38 | \value{ 39 | a \code{\link[SingleCellExperiment]{SingleCellExperiment}}. 40 | \itemize{ 41 | \item{If \code{length(by) == 2}, each sheet (\code{assay}) contains 42 | pseudobulks for each of \code{by[1]}, e.g., for each cluster when 43 | \code{by = "cluster_id"}. Rows correspond to genes, columns to 44 | \code{by[2]}, e.g., samples when \code{by = "sample_id"}}. 45 | \item{If \code{length(by) == 1}, the returned SCE will contain only 46 | a single \code{assay} with rows = genes and colums = \code{by}.}} 47 | 48 | Aggregation parameters (\code{assay, by, fun, scaled}) are stored in 49 | \code{metadata()$agg_pars}, and the number of cells that were aggregated 50 | are accessible in \code{int_colData()$n_cells}. 51 | } 52 | \description{ 53 | ... 54 | } 55 | \examples{ 56 | # pseudobulk counts by cluster-sample 57 | data(example_sce) 58 | pb <- aggregateData(example_sce) 59 | 60 | library(SingleCellExperiment) 61 | assayNames(example_sce) # one sheet per cluster 62 | head(assay(example_sce)) # n_genes x n_samples 63 | 64 | # scaled CPM 65 | cpm <- edgeR::cpm(assay(example_sce)) 66 | assays(example_sce)$cpm <- cpm 67 | pb <- aggregateData(example_sce, assay = "cpm", scale = TRUE) 68 | head(assay(pb)) 69 | 70 | # aggregate by cluster only 71 | pb <- aggregateData(example_sce, by = "cluster_id") 72 | length(assays(pb)) # single assay 73 | head(assay(pb)) # n_genes x n_clusters 74 | 75 | } 76 | \references{ 77 | Crowell, HL, Soneson, C, Germain, P-L, Calini, D, 78 | Collin, L, Raposo, C, Malhotra, D & Robinson, MD: 79 | On the discovery of population-specific state transitions from 80 | multi-sample multi-condition single-cell RNA sequencing data. 81 | \emph{bioRxiv} \strong{713412} (2018). 82 | doi: \url{https://doi.org/10.1101/713412} 83 | } 84 | \author{ 85 | Helena L Crowell & Mark D Robinson 86 | } 87 | -------------------------------------------------------------------------------- /man/calcExprFreqs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/calcExprFreqs.R 3 | \name{calcExprFreqs} 4 | \alias{calcExprFreqs} 5 | \title{calcExprFreqs} 6 | \usage{ 7 | calcExprFreqs(x, assay = "counts", th = 0) 8 | } 9 | \arguments{ 10 | \item{x}{a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.} 11 | 12 | \item{assay}{a character string specifying which assay to use.} 13 | 14 | \item{th}{numeric threshold value above which 15 | a gene should be considered to be expressed.} 16 | } 17 | \value{ 18 | a \code{\link[SingleCellExperiment]{SingleCellExperiment}} 19 | containing, for each cluster, an assay of dimensions #genes x #samples 20 | giving the fraction of cells that express each gene in each sample. 21 | If \code{colData(x)} contains a \code{"group_id"} column, the fraction 22 | of expressing cells in each each group will be included as well. 23 | } 24 | \description{ 25 | Calculates gene expression frequencies 26 | } 27 | \details{ 28 | \code{calcExprFreq} computes, for each sample and group (in each cluster), 29 | the fraction of cells that express a given gene. Here, a gene is considered 30 | to be expressed when the specified measurement value (\code{assay}) 31 | lies above the specified threshold value (\code{th}). 32 | } 33 | \examples{ 34 | data(example_sce) 35 | library(SingleCellExperiment) 36 | 37 | frq <- calcExprFreqs(example_sce) 38 | 39 | # one assay per cluster 40 | assayNames(frq) 41 | 42 | # expression frequencies by 43 | # sample & group; 1st cluster: 44 | head(assay(frq)) 45 | 46 | } 47 | \author{ 48 | Helena L Crowell & Mark D Robinson 49 | } 50 | -------------------------------------------------------------------------------- /man/data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \name{data} 4 | \alias{data} 5 | \alias{example_sce} 6 | \title{Example datasets} 7 | \value{ 8 | a \code{\link[SingleCellExperiment]{SingleCellExperiment}}. 9 | } 10 | \description{ 11 | A \code{\link[SingleCellExperiment]{SingleCellExperiment}} containing 12 | 10x droplet-based scRNA-seq PBCM data from 8 Lupus patients befor and after 13 | 6h-treatment with INF-beta (16 samples in total). 14 | 15 | The original data has been filtered to 16 | \itemize{ 17 | \item{remove unassigned cells & cell multiplets} 18 | \item{retain only 4 out of 8 samples per experimental group} 19 | \item{retain only 5 out of 8 subpopulations (clusters)} 20 | \item{retain genes with a count > 1 in > 50 cells} 21 | \item{retain cells with > 200 detected genes} 22 | \item{retain at most 100 cells per cluster-sample instance} 23 | } 24 | 25 | Assay \code{logcounts} corresponds to log-normalized values 26 | obtained from \code{\link[scater]{logNormCounts}} with default parameters. 27 | 28 | The original measurement data, as well as gene and cell metadata 29 | is available through the NCBI GEO accession number GSE96583; 30 | code to reproduce this example dataset from the original data 31 | is provided in the examples section. 32 | } 33 | \examples{ 34 | \donttest{ 35 | # set random seed for cell sampling 36 | set.seed(2929) 37 | 38 | # load data 39 | library(ExperimentHub) 40 | eh <- ExperimentHub() 41 | sce <- eh[["EH2259"]] 42 | 43 | # drop unassigned cells & multiplets 44 | sce <- sce[, !is.na(sce$cell)] 45 | sce <- sce[, sce$multiplets == "singlet"] 46 | 47 | # keep 4 samples per group 48 | sce$id <- paste0(sce$stim, sce$ind) 49 | inds <- sample(sce$ind, 4) 50 | ids <- paste0(levels(sce$stim), rep(inds, each = 2)) 51 | sce <- sce[, sce$id \%in\% ids] 52 | 53 | # keep 5 clusters 54 | kids <- c("B cells", "CD4 T cells", "CD8 T cells", 55 | "CD14+ Monocytes", "FCGR3A+ Monocytes") 56 | sce <- sce[, sce$cell \%in\% kids] 57 | sce$cell <- droplevels(sce$cell) 58 | 59 | # basic filtering on genes & cells 60 | gs <- rowSums(counts(sce) > 1) > 50 61 | cs <- colSums(counts(sce) > 0) > 200 62 | sce <- sce[gs, cs] 63 | 64 | # sample max. 100 cells per cluster-sample 65 | cs_by_ks <- split(colnames(sce), list(sce$cell, sce$id)) 66 | cs <- sapply(cs_by_ks, function(u) 67 | sample(u, min(length(u), 100))) 68 | sce <- sce[, unlist(cs)] 69 | 70 | # compute logcounts 71 | library(scater) 72 | sce <- computeLibraryFactors(sce) 73 | sce <- logNormCounts(sce) 74 | 75 | # re-format for 'muscat' 76 | sce <- prepSCE(sce, 77 | kid = "cell", 78 | sid = "id", 79 | gid = "stim", 80 | drop = TRUE) 81 | } 82 | 83 | } 84 | \references{ 85 | Kang et al. (2018). Multiplexed droplet single-cell RNA-sequencing 86 | using natural genetic variation. \emph{Nature Biotechnology}, 87 | \bold{36}(1): 89-94. DOI: 10.1038/nbt.4042. 88 | } 89 | \author{ 90 | Helena L Crowell 91 | } 92 | -------------------------------------------------------------------------------- /man/mmDS.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mmDS.R, R/utils-mmDS.R 3 | \name{mmDS} 4 | \alias{mmDS} 5 | \alias{.mm_dream} 6 | \alias{.mm_dream2} 7 | \alias{.mm_vst} 8 | \alias{.mm_glmm} 9 | \title{DS analysis using mixed-models (MM)} 10 | \usage{ 11 | mmDS( 12 | x, 13 | coef = NULL, 14 | covs = NULL, 15 | method = c("dream2", "dream", "vst", "poisson", "nbinom", "hybrid"), 16 | n_cells = 10, 17 | n_samples = 2, 18 | min_count = 1, 19 | min_cells = 20, 20 | verbose = TRUE, 21 | BPPARAM = SerialParam(progressbar = verbose), 22 | vst = c("sctransform", "DESeq2"), 23 | ddf = c("Satterthwaite", "Kenward-Roger", "lme4"), 24 | dup_corr = FALSE, 25 | trended = FALSE, 26 | bayesian = FALSE, 27 | blind = TRUE, 28 | REML = TRUE, 29 | moderate = FALSE 30 | ) 31 | 32 | .mm_dream( 33 | x, 34 | coef = NULL, 35 | covs = NULL, 36 | dup_corr = FALSE, 37 | trended = FALSE, 38 | ddf = c("Satterthwaite", "Kenward-Roger"), 39 | verbose = FALSE, 40 | BPPARAM = SerialParam(progressbar = verbose) 41 | ) 42 | 43 | .mm_dream2( 44 | x, 45 | coef = NULL, 46 | covs = NULL, 47 | ddf = c("Satterthwaite", "Kenward-Roger"), 48 | verbose = FALSE, 49 | BPPARAM = SerialParam(progressbar = verbose) 50 | ) 51 | 52 | .mm_vst( 53 | x, 54 | vst = c("sctransform", "DESeq2"), 55 | coef = NULL, 56 | covs = NULL, 57 | bayesian = FALSE, 58 | blind = TRUE, 59 | REML = TRUE, 60 | ddf = c("Satterthwaite", "Kenward-Roger", "lme4"), 61 | verbose = FALSE, 62 | BPPARAM = SerialParam(progressbar = verbose) 63 | ) 64 | 65 | .mm_glmm( 66 | x, 67 | coef = NULL, 68 | covs = NULL, 69 | family = c("poisson", "nbinom"), 70 | moderate = FALSE, 71 | verbose = TRUE, 72 | BPPARAM = SerialParam(progressbar = verbose) 73 | ) 74 | } 75 | \arguments{ 76 | \item{x}{a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.} 77 | 78 | \item{coef}{character specifying the coefficient to test. 79 | If NULL (default), will test the last level of \code{"group_id"}.} 80 | 81 | \item{covs}{character vector of \code{colData(x)} 82 | column names to use as covariates.} 83 | 84 | \item{method}{a character string. 85 | Either \code{"dream2"} (default, lme4 with voom-weights), 86 | \code{"dream"} (previous implementation of the dream method), 87 | \code{"vst"} (variance-stabilizing transformation), 88 | \code{"poisson"} (poisson GLM-MM), 89 | \code{"nbinom"} (negative binomial GLM-MM), 90 | \code{"hybrid"} (combination of pseudobulk and poisson methods) 91 | or a function accepting the same arguments.} 92 | 93 | \item{n_cells}{number of cells per cluster-sample 94 | required to consider a sample for testing.} 95 | 96 | \item{n_samples}{number of samples per group 97 | required to consider a cluster for testing.} 98 | 99 | \item{min_count}{numeric. For a gene to be tested in a given cluster, 100 | at least \code{min_cells} must have a count >= \code{min_count}.} 101 | 102 | \item{min_cells}{number (or fraction, if < 1) of cells with a count > 103 | \code{min_count} required for a gene to be tested in a given cluster.} 104 | 105 | \item{verbose}{logical specifying whether messages 106 | on progress and a progress bar should be displayed.} 107 | 108 | \item{BPPARAM}{a \code{\link[BiocParallel]{BiocParallelParam}} 109 | object specifying how differential testing should be parallelized.} 110 | 111 | \item{vst}{method to use as variance-stabilizing transformations. 112 | \code{"sctransform"} for \code{\link[sctransform]{vst}}; \code{"DESeq2"} 113 | for \code{\link[DESeq2]{varianceStabilizingTransformation}}.} 114 | 115 | \item{ddf}{character string specifying the method for estimating 116 | the effective degrees of freedom. For \code{method = "dream"}, 117 | either \code{"Satterthwaite"} (faster) or \code{"Kenward-Roger"} 118 | (more accurate); see \code{?variancePartition::dream} for details. 119 | For \code{method = "vst"}, method \code{"lme4"} is also valid; 120 | see \code{\link[lmerTest]{contest.lmerModLmerTest}}.} 121 | 122 | \item{dup_corr}{logical; whether to use 123 | \code{\link[limma:dupcor]{duplicateCorrelation}}.} 124 | 125 | \item{trended}{logical; whether to use expression-dependent variance priors 126 | in \code{\link[limma]{eBayes}}.} 127 | 128 | \item{bayesian}{logical; whether to use bayesian mixed models.} 129 | 130 | \item{blind}{logical; whether to ignore experimental design for the vst.} 131 | 132 | \item{REML}{logical; whether to maximize REML instead of log-likelihood.} 133 | 134 | \item{moderate}{logical; whether to perform empirical Bayes moderation.} 135 | 136 | \item{family}{character string specifying which GLMM to fit: 137 | \code{"poisson"} for \code{\link[blme:blmer]{bglmer}}, 138 | \code{"nbinom"} for \code{\link[glmmTMB]{glmmTMB}}.} 139 | } 140 | \value{ 141 | a data.frame 142 | } 143 | \description{ 144 | Performs cluster-wise DE analysis by fitting cell-level models. 145 | } 146 | \details{ 147 | The \code{.mm_*} functions (e.g. \code{.mm_dream}) expect cells from a single 148 | cluster, and do not perform filtering or handle incorrect parameters well. 149 | Meant to be called by \code{mmDS} with \code{method = c("dream", "vst")} and 150 | \code{vst = c("sctransform", "DESeq2")} to be applied across all clusters. 151 | \describe{ 152 | \item{\code{method = "dream2"}}{ 153 | \code{variancePartition}'s (>=1.14.1) voom-lme4-implementation 154 | of mixed models for RNA-seq data; function \code{dream}.} 155 | \item{\code{method = "dream"}}{ 156 | \code{variancePartition}'s older voom-lme4-implementation 157 | of mixed models for RNA-seq data; function \code{dream}.} 158 | \item{\code{method = "vst"}}{ 159 | \describe{ 160 | \item{\code{vst = "sctransform"}}{ 161 | \code{lmer} or \code{blmer} mixed models on 162 | \code{\link[sctransform]{vst}} transformed counts.} 163 | \item{\code{vst = "DESeq2"}}{ 164 | \code{\link[DESeq2]{varianceStabilizingTransformation}} 165 | followed by \code{lme4} mixed models.}}}} 166 | } 167 | \section{Functions}{ 168 | \itemize{ 169 | \item \code{.mm_dream()}: see details. 170 | 171 | \item \code{.mm_dream2()}: see details. 172 | 173 | \item \code{.mm_vst()}: see details. 174 | 175 | \item \code{.mm_glmm()}: see details. 176 | 177 | }} 178 | \examples{ 179 | # subset "B cells" cluster 180 | data(example_sce) 181 | b_cells <- example_sce$cluster_id == "B cells" 182 | sub <- example_sce[, b_cells] 183 | sub$cluster_id <- droplevels(sub$cluster_id) 184 | 185 | # downsample to 100 genes 186 | gs <- sample(nrow(sub), 100) 187 | sub <- sub[gs, ] 188 | 189 | # run DS analysis using cell-level mixed-model 190 | res <- mmDS(sub, method = "dream", verbose = FALSE) 191 | head(res$`B cells`) 192 | 193 | } 194 | \references{ 195 | Crowell, HL, Soneson, C, Germain, P-L, Calini, D, 196 | Collin, L, Raposo, C, Malhotra, D & Robinson, MD: 197 | On the discovery of population-specific state transitions from 198 | multi-sample multi-condition single-cell RNA sequencing data. 199 | \emph{bioRxiv} \strong{713412} (2018). 200 | doi: \url{https://doi.org/10.1101/713412} 201 | } 202 | \author{ 203 | Pierre-Luc Germain & Helena L Crowell 204 | } 205 | -------------------------------------------------------------------------------- /man/pbDS.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pbDS.R 3 | \name{pbDS} 4 | \alias{pbDS} 5 | \alias{pbDD} 6 | \title{pseudobulk DS analysis} 7 | \usage{ 8 | pbDS( 9 | pb, 10 | method = c("edgeR", "DESeq2", "limma-trend", "limma-voom", "DD"), 11 | design = NULL, 12 | coef = NULL, 13 | contrast = NULL, 14 | min_cells = 10, 15 | filter = c("both", "genes", "samples", "none"), 16 | treat = FALSE, 17 | verbose = TRUE, 18 | BPPARAM = SerialParam(progressbar = verbose) 19 | ) 20 | 21 | pbDD( 22 | pb, 23 | design = NULL, 24 | coef = NULL, 25 | contrast = NULL, 26 | min_cells = 10, 27 | filter = c("both", "genes", "samples", "none"), 28 | verbose = TRUE, 29 | BPPARAM = SerialParam(progressbar = verbose) 30 | ) 31 | } 32 | \arguments{ 33 | \item{pb}{a \code{\link[SingleCellExperiment]{SingleCellExperiment}} 34 | containing pseudobulks as returned by \code{\link{aggregateData}}.} 35 | 36 | \item{method}{a character string.} 37 | 38 | \item{design}{For methods \code{"edegR"} and \code{"limma"}, a design matrix 39 | with row & column names(!) created with \code{\link[stats]{model.matrix}}; 40 | For \code{"DESeq2"}, a formula with variables in \code{colData(pb)}. 41 | Defaults to \code{~ group_id} or the corresponding \code{model.matrix}.} 42 | 43 | \item{coef}{passed to \code{\link[edgeR]{glmQLFTest}}, 44 | \code{\link[limma]{contrasts.fit}}, \code{\link[DESeq2]{results}} 45 | for \code{method = "edgeR", "limma-x", "DESeq2"}, respectively. 46 | Can be a list for multiple, independent comparisons.} 47 | 48 | \item{contrast}{a matrix of contrasts to test for 49 | created with \code{\link[limma]{makeContrasts}}.} 50 | 51 | \item{min_cells}{a numeric. Specifies the minimum number of cells in a given 52 | cluster-sample required to consider the sample for differential testing.} 53 | 54 | \item{filter}{character string specifying whether 55 | to filter on genes, samples, both or neither.} 56 | 57 | \item{treat}{logical specifying whether empirical Bayes moderated-t 58 | p-values should be computed relative to a minimum fold-change threshold. 59 | Only applicable for methods \code{"limma-x"} 60 | (\code{\link[limma:eBayes]{treat}}) and \code{"edgeR"} 61 | (\code{\link[edgeR]{glmTreat}}), and ignored otherwise.} 62 | 63 | \item{verbose}{logical. Should information on progress be reported?} 64 | 65 | \item{BPPARAM}{a \code{\link[BiocParallel]{BiocParallelParam}} 66 | object specifying how differential testing should be parallelized.} 67 | } 68 | \value{ 69 | a list containing \itemize{ 70 | \item a data.frame with differential testing results, 71 | \item a \code{\link[edgeR]{DGEList}} object of length nb.-clusters, and 72 | \item the \code{design} matrix, and \code{contrast} or \code{coef} used.} 73 | } 74 | \description{ 75 | \code{pbDS} tests for DS after aggregating single-cell 76 | measurements to pseudobulk data, by applying bulk RNA-seq DE methods, 77 | such as \code{edgeR}, \code{DESeq2} and \code{limma}. 78 | } 79 | \examples{ 80 | # simulate 5 clusters, 20\% of DE genes 81 | data(example_sce) 82 | 83 | # compute pseudobulk sum-counts & run DS analysis 84 | pb <- aggregateData(example_sce) 85 | res <- pbDS(pb, method = "limma-trend") 86 | 87 | names(res) 88 | names(res$table) 89 | head(res$table$stim$`B cells`) 90 | 91 | # count nb. of DE genes by cluster 92 | vapply(res$table$stim, function(u) 93 | sum(u$p_adj.loc < 0.05), numeric(1)) 94 | 95 | # get top 5 hits for ea. cluster w/ abs(logFC) > 1 96 | library(dplyr) 97 | lapply(res$table$stim, function(u) 98 | filter(u, abs(logFC) > 1) \%>\% 99 | arrange(p_adj.loc) \%>\% 100 | slice(seq_len(5))) 101 | 102 | } 103 | \references{ 104 | Crowell, HL, Soneson, C, Germain, P-L, Calini, D, 105 | Collin, L, Raposo, C, Malhotra, D & Robinson, MD: 106 | On the discovery of population-specific state transitions from 107 | multi-sample multi-condition single-cell RNA sequencing data. 108 | \emph{bioRxiv} \strong{713412} (2018). 109 | doi: \url{https://doi.org/10.1101/713412} 110 | } 111 | \author{ 112 | Helena L Crowell & Mark D Robinson 113 | } 114 | -------------------------------------------------------------------------------- /man/pbFlatten.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pbFlatten.R 3 | \name{pbFlatten} 4 | \alias{pbFlatten} 5 | \title{pbFlatten 6 | Flatten pseudobulk SCE} 7 | \usage{ 8 | pbFlatten(pb, normalize = TRUE) 9 | } 10 | \arguments{ 11 | \item{pb}{a pseudobulk \code{\link[SingleCellExperiment]{SingleCellExperiment}} 12 | as returned by \code{\link{aggregateData}}, with different subpopulations as assays.} 13 | 14 | \item{normalize}{logical specifying whether to compute a \code{logcpm} assay.} 15 | } 16 | \value{ 17 | a \code{\link[SingleCellExperiment]{SingleCellExperiment}}. 18 | } 19 | \description{ 20 | Flattens a pseudobulk \code{\link[SingleCellExperiment]{SingleCellExperiment}} 21 | as returned by \code{\link{aggregateData}} such that all cell subpopulations 22 | are represented as a single assay. 23 | } 24 | \examples{ 25 | data(example_sce) 26 | library(SingleCellExperiment) 27 | pb_stack <- aggregateData(example_sce) 28 | (pb_flat <- pbFlatten(pb_stack)) 29 | ncol(pb_flat) == ncol(pb_stack)*length(assays(pb_stack)) 30 | 31 | } 32 | -------------------------------------------------------------------------------- /man/pbHeatmap.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pbHeatmap.R 3 | \name{pbHeatmap} 4 | \alias{pbHeatmap} 5 | \title{Heatmap of cluster-sample pseudobulks} 6 | \usage{ 7 | pbHeatmap( 8 | x, 9 | y, 10 | k = NULL, 11 | g = NULL, 12 | c = NULL, 13 | top_n = 20, 14 | fdr = 0.05, 15 | lfc = 1, 16 | sort_by = "p_adj.loc", 17 | decreasing = FALSE, 18 | assay = "logcounts", 19 | fun = mean, 20 | normalize = TRUE, 21 | col = viridis(10), 22 | row_anno = TRUE, 23 | col_anno = TRUE 24 | ) 25 | } 26 | \arguments{ 27 | \item{x}{a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.} 28 | 29 | \item{y}{a list of DS analysis results as returned by 30 | \code{\link{pbDS}} or \code{\link{mmDS}}.} 31 | 32 | \item{k}{character vector; specifies which cluster ID(s) to retain. 33 | Defaults to \code{levels(x$cluster_id)}.} 34 | 35 | \item{g}{character vector; specifies which genes to retain. 36 | Defaults to considering all genes.} 37 | 38 | \item{c}{character string; specifies which contrast/coefficient to retain. 39 | Defaults to \code{names(y$table)[1]}.} 40 | 41 | \item{top_n}{single numeric; number of genes to retain per cluster.} 42 | 43 | \item{fdr, lfc}{single numeric; FDR and logFC cutoffs to filter results by. 44 | The specified FDR threshold is applied to \code{p_adj.loc} values.} 45 | 46 | \item{sort_by}{character string specifying 47 | a numeric results table column to sort by; 48 | \code{"none"} to retain original ordering.} 49 | 50 | \item{decreasing}{logical; whether to sort 51 | in decreasing order of \code{sort_by}.} 52 | 53 | \item{assay}{character string; specifies which assay to use; 54 | should be one of \code{assayNames(x)}.} 55 | 56 | \item{fun}{function to use as summary statistic, 57 | e.g., mean, median, sum (depending on the input assay).} 58 | 59 | \item{normalize}{logical; whether to apply a z-normalization 60 | to each row (gene) of the cluster-sample pseudobulk data.} 61 | 62 | \item{col}{character vector of colors or color mapping function 63 | generated with \code{\link[circlize]{colorRamp2}}. Passed to 64 | argument \code{col} in \code{\link[ComplexHeatmap]{Heatmap}} 65 | (see \code{?ComplexHeatmap::Heatmap} for details).} 66 | 67 | \item{row_anno, col_anno}{logical; whether to render 68 | annotations of cluster and group IDs, respectively.} 69 | } 70 | \value{ 71 | a \code{\link{HeatmapList-class}} object. 72 | } 73 | \description{ 74 | ... 75 | } 76 | \examples{ 77 | # compute pseudobulks & run DS analysis 78 | data(example_sce) 79 | pb <- aggregateData(example_sce) 80 | res <- pbDS(pb) 81 | 82 | # cluster-sample expression means 83 | pbHeatmap(example_sce, res) 84 | 85 | # include only a single cluster 86 | pbHeatmap(example_sce, res, k = "B cells") 87 | 88 | # plot specific gene across all clusters 89 | pbHeatmap(example_sce, res, g = "ISG20") 90 | 91 | } 92 | \author{ 93 | Helena L Crowell 94 | } 95 | -------------------------------------------------------------------------------- /man/pbMDS.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pbMDS.R 3 | \name{pbMDS} 4 | \alias{pbMDS} 5 | \title{Pseudobulk-level MDS plot} 6 | \usage{ 7 | pbMDS(x) 8 | } 9 | \arguments{ 10 | \item{x}{a \code{\link[SingleCellExperiment]{SingleCellExperiment}} 11 | containing cluster-sample pseudobulks as returned by 12 | \code{\link{aggregateData}} with argument 13 | \code{by = c("cluster_id", "sample_id")}.} 14 | } 15 | \value{ 16 | a \code{ggplot} object. 17 | } 18 | \description{ 19 | Renders a multidimensional scaling (MDS) 20 | where each point represents a cluster-sample instance; 21 | with points colored by cluster ID and shaped by group ID. 22 | } 23 | \examples{ 24 | data(example_sce) 25 | pb <- aggregateData(example_sce) 26 | pbMDS(pb) 27 | 28 | } 29 | \author{ 30 | Helena L Crowell & Mark D Robinson 31 | } 32 | -------------------------------------------------------------------------------- /man/prepSCE.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/prepSCE.R 3 | \name{prepSCE} 4 | \alias{prepSCE} 5 | \title{Prepare SCE for DS analysis} 6 | \usage{ 7 | prepSCE( 8 | x, 9 | kid = "cluster_id", 10 | sid = "sample_id", 11 | gid = "group_id", 12 | drop = FALSE 13 | ) 14 | } 15 | \arguments{ 16 | \item{x}{a \linkS4class{SingleCellExperiment}.} 17 | 18 | \item{kid, sid, gid}{character strings specifying 19 | the \code{colData(x)} columns containing cluster assignments, 20 | unique sample identifiers, and group IDs (e.g., treatment).} 21 | 22 | \item{drop}{logical. Specifies whether \code{colData(x)} columns 23 | besides those specified as \code{cluster_id,sample_id,group_id} 24 | should be retained (default \code{drop = FALSE}) 25 | or removed (\code{drop = TRUE}).} 26 | } 27 | \value{ 28 | a \linkS4class{SingleCellExperiment}. 29 | } 30 | \description{ 31 | ... 32 | } 33 | \examples{ 34 | # generate random counts 35 | ng <- 50 36 | nc <- 200 37 | 38 | # generate some cell metadata 39 | gids <- sample(c("groupA", "groupB"), nc, TRUE) 40 | sids <- sample(paste0("sample", seq_len(3)), nc, TRUE) 41 | kids <- sample(paste0("cluster", seq_len(5)), nc, TRUE) 42 | batch <- sample(seq_len(3), nc, TRUE) 43 | cd <- data.frame(group = gids, id = sids, cluster = kids, batch) 44 | 45 | # construct SCE 46 | library(scuttle) 47 | sce <- mockSCE(ncells = nc, ngenes = ng) 48 | colData(sce) <- cbind(colData(sce), cd) 49 | 50 | # prep. for workflow 51 | sce <- prepSCE(sce, kid = "cluster", sid = "id", gid = "group") 52 | head(colData(sce)) 53 | metadata(sce)$experiment_info 54 | sce 55 | 56 | } 57 | \author{ 58 | Helena L Crowell 59 | } 60 | -------------------------------------------------------------------------------- /man/prepSim.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/prepSim.R 3 | \name{prepSim} 4 | \alias{prepSim} 5 | \title{SCE preparation for \code{\link{simData}}} 6 | \usage{ 7 | prepSim( 8 | x, 9 | min_count = 1, 10 | min_cells = 10, 11 | min_genes = 100, 12 | min_size = 100, 13 | group_keep = NULL, 14 | verbose = TRUE 15 | ) 16 | } 17 | \arguments{ 18 | \item{x}{a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.} 19 | 20 | \item{min_count, min_cells}{used for filtering of genes; only genes with 21 | a count > \code{min_count} in >= \code{min_cells} will be retained.} 22 | 23 | \item{min_genes}{used for filtering cells; 24 | only cells with a count > 0 in >= \code{min_genes} will be retained.} 25 | 26 | \item{min_size}{used for filtering subpopulation-sample combinations; 27 | only instances with >= \code{min_size} cells will be retained. 28 | Specifying \code{min_size = NULL} skips this step.} 29 | 30 | \item{group_keep}{character string; if \code{nlevels(x$group_id) > 1}, 31 | specifies which group of samples to keep (see details). The default 32 | NULL retains samples from \code{levels(x$group_id)[1]}; otherwise, 33 | if `colData(x)$group_id` is not specified, all samples will be kept.} 34 | 35 | \item{verbose}{logical; should information on progress be reported?} 36 | } 37 | \value{ 38 | a \code{\link[SingleCellExperiment]{SingleCellExperiment}} 39 | containing, for each cell, library size (\code{colData(x)$offset}) 40 | and, for each gene, dispersion and sample-specific mean estimates 41 | (\code{rowData(x)$dispersion} and \code{$beta.sample_id}, respectively). 42 | } 43 | \description{ 44 | \code{prepSim} prepares an input SCE for simulation 45 | with \code{muscat}'s \code{\link{simData}} function by 46 | \enumerate{ 47 | \item{basic filtering of genes and cells} 48 | \item{(optional) filtering of subpopulation-sample instances} 49 | \item{estimation of cell (library sizes) and gene parameters 50 | (dispersions and sample-specific means), respectively.} 51 | } 52 | } 53 | \details{ 54 | For each gene \eqn{g}, \code{prepSim} fits a model to estimate 55 | sample-specific means \eqn{\beta_g^s}, for each sample \eqn{s}, 56 | and dispersion parameters \eqn{\phi_g} using \code{edgeR}'s 57 | \code{\link[edgeR]{estimateDisp}} function with default parameters. 58 | Thus, the reference count data is modeled as NB distributed: 59 | \deqn{Y_{gc} \sim NB(\mu_{gc}, \phi_g)} 60 | for gene \eqn{g} and cell \eqn{c}, where the mean 61 | \eqn{\mu_{gc} = \exp(\beta_{g}^{s(c)}) \cdot \lambda_c}. Here, 62 | \eqn{\beta_{g}^{s(c)}} is the relative abundance of gene \eqn{g} 63 | in sample \eqn{s(c)}, \eqn{\lambda_c} is the library size 64 | (total number of counts), and \eqn{\phi_g} is the dispersion. 65 | } 66 | \examples{ 67 | # estimate simulation parameters 68 | data(example_sce) 69 | ref <- prepSim(example_sce) 70 | 71 | # tabulate number of genes/cells before vs. after 72 | ns <- cbind( 73 | before = dim(example_sce), 74 | after = dim(ref)) 75 | rownames(ns) <- c("#genes", "#cells") 76 | ns 77 | 78 | library(SingleCellExperiment) 79 | head(rowData(ref)) # gene parameters 80 | head(colData(ref)) # cell parameters 81 | 82 | } 83 | \references{ 84 | Crowell, HL, Soneson, C, Germain, P-L, Calini, D, 85 | Collin, L, Raposo, C, Malhotra, D & Robinson, MD: 86 | On the discovery of population-specific state transitions from 87 | multi-sample multi-condition single-cell RNA sequencing data. 88 | \emph{bioRxiv} \strong{713412} (2018). 89 | doi: \url{https://doi.org/10.1101/713412} 90 | } 91 | \author{ 92 | Helena L Crowell 93 | } 94 | -------------------------------------------------------------------------------- /man/resDS.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/resDS.R 3 | \name{resDS} 4 | \alias{resDS} 5 | \title{resDS 6 | Formatting of DS analysis results} 7 | \usage{ 8 | resDS( 9 | x, 10 | y, 11 | bind = c("row", "col"), 12 | frq = FALSE, 13 | cpm = FALSE, 14 | digits = 3, 15 | sep = "__", 16 | ... 17 | ) 18 | } 19 | \arguments{ 20 | \item{x}{a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.} 21 | 22 | \item{y}{a list of DS testing results as returned 23 | by \code{\link{pbDS}} or \code{\link{mmDS}}.} 24 | 25 | \item{bind}{character string specifying the output format (see details).} 26 | 27 | \item{frq}{logical or a pre-computed list of expression frequencies 28 | as returned by \code{\link{calcExprFreqs}}.} 29 | 30 | \item{cpm}{logical specifying whether CPM by cluster-sample 31 | should be appendeded to the output result table(s).} 32 | 33 | \item{digits}{integer value specifying the 34 | number of significant digits to maintain.} 35 | 36 | \item{sep}{character string to use as separator 37 | when constructing new column names.} 38 | 39 | \item{...}{optional arguments passed to 40 | \code{\link{calcExprFreqs}} if \code{frq = TRUE}.} 41 | } 42 | \value{ 43 | returns a `data.frame`. 44 | } 45 | \description{ 46 | \code{resDS} provides a simple wrapper to format cluster-level 47 | differential testing results into an easily filterable table, and 48 | to optionally append gene expression frequencies by cluster-sample 49 | & -group, as well as cluster-sample-wise CPM. 50 | } 51 | \details{ 52 | When \code{bind = "col"}, the list of DS testing results at 53 | \code{y$table} will be merge vertically (by column) into a single table 54 | in tidy format with column \code{contrast/coef} specifying the comparison. 55 | 56 | Otherwise, when \code{bind = "row"}, an identifier of the respective 57 | contrast or coefficient will be appended to the column names, 58 | and all tables will be merge horizontally (by row). 59 | 60 | Expression frequencies pre-computed with \code{\link{calcExprFreqs}} 61 | may be provided with \code{frq}. Alternatively, when \code{frq = TRUE}, 62 | expression frequencies can be computed directly, and additional arguments 63 | may be passed to \code{\link{calcExprFreqs}} (see examples below). 64 | } 65 | \examples{ 66 | # compute pseudobulks (sum of counts) 67 | data(example_sce) 68 | pb <- aggregateData(example_sce, 69 | assay = "counts", fun = "sum") 70 | 71 | # run DS analysis (edgeR on pseudobulks) 72 | res <- pbDS(pb, method = "edgeR") 73 | 74 | head(resDS(example_sce, res, bind = "row")) # tidy format 75 | head(resDS(example_sce, res, bind = "col", digits = Inf)) 76 | 77 | # append CPMs & expression frequencies 78 | head(resDS(example_sce, res, cpm = TRUE)) 79 | head(resDS(example_sce, res, frq = TRUE)) 80 | 81 | # pre-computed expression frequencies & append 82 | frq <- calcExprFreqs(example_sce, assay = "counts", th = 0) 83 | head(resDS(example_sce, res, frq = frq)) 84 | 85 | } 86 | \author{ 87 | Helena L Crowell & Mark D Robinson 88 | } 89 | -------------------------------------------------------------------------------- /man/simData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/simData.R 3 | \name{simData} 4 | \alias{simData} 5 | \title{simData} 6 | \usage{ 7 | simData( 8 | x, 9 | ng = nrow(x), 10 | nc = ncol(x), 11 | ns = NULL, 12 | nk = NULL, 13 | probs = NULL, 14 | dd = TRUE, 15 | p_dd = diag(6)[1, ], 16 | paired = FALSE, 17 | p_ep = 0.5, 18 | p_dp = 0.3, 19 | p_dm = 0.5, 20 | p_type = 0, 21 | lfc = 2, 22 | rel_lfc = NULL, 23 | phylo_tree = NULL, 24 | phylo_pars = c(ifelse(is.null(phylo_tree), 0, 0.1), 3), 25 | force = FALSE 26 | ) 27 | } 28 | \arguments{ 29 | \item{x}{a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.} 30 | 31 | \item{ng}{number of genes to simulate. Importantly, for the library sizes 32 | computed by \code{\link{prepSim}} (= \code{exp(x$offset)}) to make sense, 33 | the number of simulated genes should match with the number of genes 34 | in the reference. To simulate a reduced number of genes, e.g. for 35 | testing and development purposes, please set \code{force = TRUE}.} 36 | 37 | \item{nc}{number of cells to simulate.} 38 | 39 | \item{ns}{number of samples to simulate; defaults to as many as 40 | available in the reference to avoid duplicated reference samples. 41 | Specifically, the number of samples will be set to 42 | \code{n = nlevels(x$sample_id)} when \code{dd = FALSE}, 43 | \code{n} per group when \code{dd, paired = TRUE}, and 44 | \code{floor(n/2)} per group when \code{dd = TRUE, paired = FALSE}. 45 | When a larger number samples should be simulated, set \code{force = TRUE}.} 46 | 47 | \item{nk}{number of clusters to simulate; defaults to the number 48 | of available reference clusters (\code{nlevels(x$cluster_id)}).} 49 | 50 | \item{probs}{a list of length 3 containing probabilities of a cell belonging 51 | to each cluster, sample, and group, respectively. List elements must be 52 | NULL (equal probabilities) or numeric values in [0, 1] that sum to 1.} 53 | 54 | \item{dd}{whether or not to simulate differential distributions; if TRUE, 55 | two groups are simulated and \code{ns} corresponds to the number of 56 | samples per group, else one group with \code{ns} samples is simulated.} 57 | 58 | \item{p_dd}{numeric vector of length 6. 59 | Specifies the probability of a gene being 60 | EE, EP, DE, DP, DM, or DB, respectively.} 61 | 62 | \item{paired}{logical specifying whether a paired design should 63 | be simulated (both groups use the same set of reference samples) 64 | or not (reference samples are drawn at random).} 65 | 66 | \item{p_ep, p_dp, p_dm}{numeric specifying the proportion of cells 67 | to be shifted to a different expression state in one group (see details).} 68 | 69 | \item{p_type}{numeric. Probability of EE/EP gene being a type-gene. 70 | If a gene is of class "type" in a given cluster, a unique mean 71 | will be used for that gene in the respective cluster.} 72 | 73 | \item{lfc}{numeric value to use as mean logFC 74 | (logarithm base 2) for DE, DP, DM, and DB type of genes.} 75 | 76 | \item{rel_lfc}{numeric vector of relative logFCs for each cluster. 77 | Should be of length \code{nlevels(x$cluster_id)} with 78 | \code{levels(x$cluster_id)} as names. 79 | Defaults to factor of 1 for all clusters.} 80 | 81 | \item{phylo_tree}{newick tree text representing cluster relations 82 | and their relative distance. An explanation of the syntax can be found 83 | \href{http://evolution.genetics.washington.edu/phylip/newicktree.html}{here}. 84 | The distance between the nodes, except for the original branch, will be 85 | translated in the number of shared genes between the clusters belonging to 86 | these nodes (this relation is controlled with \code{phylo_pars}). 87 | The distance between two clusters is defined as the sum 88 | of the branches lengths separating them.} 89 | 90 | \item{phylo_pars}{vector of length 2 providing the parameters that control 91 | the number of type genes. Passed to an exponential PDF (see details).} 92 | 93 | \item{force}{logical specifying whether to force simulation 94 | when \code{ng} and/or \code{ns} don't match the number of 95 | available reference genes and samples, respectively.} 96 | } 97 | \value{ 98 | a \code{\link[SingleCellExperiment]{SingleCellExperiment}} 99 | containing multiple clusters & samples across 2 groups 100 | as well as the following metadata: \describe{ 101 | \item{cell metadata (\code{colData(.)})}{a \code{DataFrame} containing, 102 | containing, for each cell, it's cluster, sample, and group ID.} 103 | \item{gene metadata (\code{rowData(.)})}{a \code{DataFrame} containing, 104 | for each gene, it's \code{class} (one of "state", "type", "none") and 105 | specificity (\code{specs}; NA for genes of type "state", otherwise 106 | a character vector of clusters that share the given gene).} 107 | \item{experiment metadata (\code{metadata(.)})}{ 108 | \describe{ 109 | \item{\code{experiment_info}}{a \code{data.frame} 110 | summarizing the experimental design.} 111 | \item{\code{n_cells}}{the number of cells for each sample.} 112 | \item{\code{gene_info}}{a \code{data.frame} containing, for each gene 113 | in each cluster, it's differential distribution \code{category}, 114 | mean \code{logFC} (NA for genes for categories "ee" and "ep"), 115 | gene used as reference (\code{sim_gene}), dispersion \code{sim_disp}, 116 | and simulation means for each group \code{sim_mean.A/B}.} 117 | \item{\code{ref_sids/kidskids}}{the sample/cluster IDs used as reference.} 118 | \item{\code{args}}{a list of the function call's input arguments.}}}} 119 | } 120 | \description{ 121 | Simulation of complex scRNA-seq data 122 | } 123 | \details{ 124 | \code{simData} simulates multiple clusters and samples 125 | across 2 experimental conditions from a real scRNA-seq data set. 126 | 127 | The simulation of type genes can be performed in 2 ways; 128 | (1) via \code{p_type} to simulate independent clusters, OR 129 | (2) via \code{phylo_tree} to simulate a hierarchical cluster structure. 130 | 131 | For (1), a subset of \code{p_type} \% of genes are selected per cluster 132 | to use a different references genes than the remainder of clusters, 133 | giving rise to cluster-specific NB means for count sampling. 134 | 135 | For (2), the number of shared/type genes at each node 136 | are given by \code{a*G*e^(-b*d)}, where \itemize{ 137 | \item{\code{a} -- controls the percentage of shared genes between nodes. 138 | By default, at most 10\% of the genes are reserved as type genes 139 | (when \code{b} = 0). However, it is advised to tune this parameter 140 | depending on the input \code{prep_sce}.} 141 | \item{\code{b} -- determines how the number of shared genes 142 | decreases with increasing distance d between clusters 143 | (defined through \code{phylo_tree}).}} 144 | } 145 | \examples{ 146 | data(example_sce) 147 | library(SingleCellExperiment) 148 | 149 | # prep. SCE for simulation 150 | ref <- prepSim(example_sce) 151 | 152 | # simulate data 153 | (sim <- simData(ref, nc = 200, 154 | p_dd = c(0.9, 0, 0.1, 0, 0, 0), 155 | ng = 100, force = TRUE, 156 | probs = list(NULL, NULL, c(1, 0)))) 157 | 158 | # simulation metadata 159 | head(gi <- metadata(sim)$gene_info) 160 | 161 | # should be ~10\% DE 162 | table(gi$category) 163 | 164 | # unbalanced sample sizes 165 | sim <- simData(ref, nc = 100, ns = 2, 166 | probs = list(NULL, c(0.25, 0.75), NULL), 167 | ng = 10, force = TRUE) 168 | table(sim$sample_id) 169 | 170 | # one group only 171 | sim <- simData(ref, nc = 100, 172 | probs = list(NULL, NULL, c(1, 0)), 173 | ng = 10, force = TRUE) 174 | levels(sim$group_id) 175 | 176 | # HIERARCHICAL CLUSTER STRUCTURE 177 | # define phylogram specifying cluster relations 178 | phylo_tree <- "(('cluster1':0.1,'cluster2':0.1):0.4,'cluster3':0.5);" 179 | # verify syntax & visualize relations 180 | library(phylogram) 181 | plot(read.dendrogram(text = phylo_tree)) 182 | 183 | # let's use a more complex phylogeny 184 | phylo_tree <- "(('cluster1':0.4,'cluster2':0.4):0.4,('cluster3': 185 | 0.5,('cluster4':0.2,'cluster5':0.2,'cluster6':0.2):0.4):0.4);" 186 | plot(read.dendrogram(text = phylo_tree)) 187 | 188 | # simulate clusters accordingly 189 | sim <- simData(ref, 190 | phylo_tree = phylo_tree, 191 | phylo_pars = c(0.1, 3), 192 | ng = 500, force = TRUE) 193 | # view information about shared 'type' genes 194 | table(rowData(sim)$class) 195 | 196 | } 197 | \references{ 198 | Crowell, HL, Soneson, C, Germain, P-L, Calini, D, 199 | Collin, L, Raposo, C, Malhotra, D & Robinson, MD: 200 | On the discovery of population-specific state transitions from 201 | multi-sample multi-condition single-cell RNA sequencing data. 202 | \emph{bioRxiv} \strong{713412} (2018). 203 | doi: \url{https://doi.org/10.1101/713412} 204 | } 205 | \author{ 206 | Helena L Crowell & Anthony Sonrel 207 | } 208 | -------------------------------------------------------------------------------- /man/stagewise_DS_DD.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stagewiseDD.R 3 | \name{stagewise_DS_DD} 4 | \alias{stagewise_DS_DD} 5 | \title{Perform two-stage testing on DS and DD analysis results} 6 | \usage{ 7 | stagewise_DS_DD(res_DS, res_DD, sce = NULL, verbose = FALSE) 8 | } 9 | \arguments{ 10 | \item{res_DS}{a list of DS testing results as returned 11 | by \code{\link{pbDS}} or \code{\link{mmDS}}.} 12 | 13 | \item{res_DD}{a list of DD testing results as returned 14 | by \code{\link{pbDD}} (or \code{\link{pbDS}} with \code{method="DD"}).} 15 | 16 | \item{sce}{(optional) \code{SingleCellExperiment} object containing the data 17 | that underlies testing, prior to summarization with \code{\link{aggregateData}}. 18 | Used for validation of inputs in order to prevent unexpected failure/results.} 19 | 20 | \item{verbose}{logical. Should information on progress be reported?} 21 | } 22 | \value{ 23 | A list of \code{DFrame}s containing results for each contrast and cluster. 24 | Each table contains DS and DD results for genes shared between analyses, 25 | as well as results from stagewise testing analysis, namely: 26 | \itemize{ 27 | \item{\code{p_adj}: FDR adjusted p-values for the 28 | screening hypothesis that a gene is neither DS nor DD 29 | (see \code{?stageR::getAdjustedPValues} for details)} 30 | \item{\code{p_val.DS/D}: confirmation stage p-values for DS/D}} 31 | } 32 | \description{ 33 | Perform two-stage testing on DS and DD analysis results 34 | } 35 | \examples{ 36 | data(example_sce) 37 | 38 | pbs_sum <- aggregateData(example_sce, assay="counts", fun="sum") 39 | pbs_det <- aggregateData(example_sce, assay="counts", fun="num.detected") 40 | 41 | res_DS <- pbDS(pbs_sum, min_cells=0, filter="none", verbose=FALSE) 42 | res_DD <- pbDD(pbs_det, min_cells=0, filter="none", verbose=FALSE) 43 | 44 | res <- stagewise_DS_DD(res_DS, res_DD) 45 | head(res[[1]][[1]]) # results for 1st cluster 46 | 47 | } 48 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(muscat) 3 | 4 | test_check("muscat") 5 | -------------------------------------------------------------------------------- /tests/testthat/test-aggregateData.R: -------------------------------------------------------------------------------- 1 | # load packages 2 | suppressMessages({ 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | # generate toy dataset 7 | seed <- as.numeric(format(Sys.Date(), "%s")) 8 | set.seed(seed) 9 | sce <- .toySCE() 10 | nk <- length(kids <- levels(sce$cluster_id)) 11 | ns <- length(sids <- levels(sce$sample_id)) 12 | ng <- length(gids <- levels(sce$group_id)) 13 | 14 | test_that("aggregation across 2 factors", { 15 | for (fun in c("sum", "mean", "median")) { 16 | pb <- aggregateData(sce, by = c("cluster_id", "sample_id"), fun = fun) 17 | expect_error(aggregateData(sce, assay = "x")) 18 | expect_error(aggregateData(sce, fun = "x")) 19 | expect_error(aggregateData(sce, by = "x")) 20 | 21 | pars <- metadata(pb)$agg_pars 22 | expect_identical(pars$by, c("cluster_id", "sample_id")) 23 | expect_identical(pars$assay, "counts") 24 | expect_identical(pars$fun, fun) 25 | expect_false(pars$scale) 26 | 27 | expect_is(pb, "SingleCellExperiment") 28 | expect_identical(assayNames(pb), kids) 29 | 30 | expect_identical(nrow(pb), nrow(sce)) 31 | expect_identical(ncol(pb), ns) 32 | 33 | expect_identical(rownames(pb), rownames(sce)) 34 | expect_identical(colnames(pb), sids) 35 | 36 | expect_equivalent(.n_cells(pb), table(sce$cluster_id, sce$sample_id)) 37 | 38 | # 10x random spot check 39 | replicate(10, { 40 | k <- sample(kids, 1) 41 | s <- sample(sids, 1) 42 | g <- sample(rownames(sce), 1) 43 | i <- sce$sample_id == s & sce$cluster_id == k 44 | expect_equal(assays(pb)[[k]][g, s], get(fun)(assay(sce)[g, i])) 45 | }) 46 | } 47 | }) 48 | 49 | test_that("aggregation across 1 factor", { 50 | for (fun in c("sum", "mean", "median")) { 51 | pb <- aggregateData(sce, by = "cluster_id", fun = fun) 52 | expect_is(pb, "SingleCellExperiment") 53 | expect_identical(nrow(pb), nrow(sce)) 54 | expect_identical(ncol(pb), nk) 55 | expect_identical(rownames(pb), rownames(sce)) 56 | expect_identical(colnames(pb), kids) 57 | expect_equivalent(table(sce$cluster_id), .n_cells(pb)) 58 | # random spot check 59 | k <- sample(kids, 1) 60 | g <- sample(rownames(sce), 1) 61 | i <- sce$cluster_id == k 62 | expect_equal(assay(pb)[g, k], get(fun)(assay(sce)[g, i])) 63 | } 64 | }) 65 | 66 | test_that("pbFlatten()", { 67 | x <- aggregateData(sce, by = "cluster_id") 68 | expect_error(pbFlatten(x)) 69 | x <- aggregateData(sce) 70 | cd <- colData(y <- pbFlatten(x)) 71 | expect_is(y, "SingleCellExperiment") 72 | expect_true(length(assays(y)) == 2) 73 | expect_identical(rownames(y), rownames(x)) 74 | expect_true(ncol(y) == nk*ncol(x)) 75 | a <- do.call(cbind, as.list(assays(x))) 76 | expect_equivalent(assay(y), a) 77 | expect_true(all(table(cd$sample_id) == nk)) 78 | expect_true(all(table(cd$cluster_id) == ns)) 79 | expect_true(all(table(cd$group_id) == ns/ng*nk)) 80 | expect_equivalent(y$n_cells, c(table(sce$sample_id, sce$cluster_id))) 81 | # without normalization 82 | y <- pbFlatten(x, normalize = FALSE) 83 | expect_true(assayNames(y) == "counts") 84 | }) 85 | -------------------------------------------------------------------------------- /tests/testthat/test-calcExprFreqs.R: -------------------------------------------------------------------------------- 1 | # load packages 2 | suppressMessages({ 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | # generate toy dataset 7 | seed <- as.numeric(format(Sys.Date(), "%s")) 8 | set.seed(seed) 9 | sce <- .toySCE() 10 | 11 | # put in 50% random 0s 12 | n <- length(assay(sce)) 13 | i <- sample(n, round(n * 0.5)) 14 | assay(sce)[i] <- 0 15 | 16 | # store nb. / IDs of clusters, samples, groups 17 | nk <- length(kids <- levels(sce$cluster_id)) 18 | ns <- length(sids <- levels(sce$sample_id)) 19 | ng <- length(gids <- levels(sce$group_id)) 20 | 21 | # calculate expression frequencies 22 | x <- calcExprFreqs(sce, assay = "counts", th = 0) 23 | 24 | test_that("Output is correctly structured SCE", { 25 | expect_is(x, "SingleCellExperiment") 26 | expect_identical(assayNames(x), kids) 27 | 28 | expect_identical(nrow(x), nrow(sce)) 29 | expect_identical(ncol(x), ns + ng) 30 | 31 | expect_identical(rownames(x), rownames(sce)) 32 | expect_identical(colnames(x), c(sids, gids)) 33 | }) 34 | 35 | test_that("Frequencies lie in [0, 1] w/o any NAs", { 36 | v <- unlist(assays(x)) 37 | expect_true(all(v >= 0)) 38 | expect_true(all(v <= 1)) 39 | expect_true(!any(is.na(v))) 40 | }) 41 | 42 | test_that("10x random spot checks", { 43 | replicate(10, { 44 | # sample cluster, sample & group 45 | k <- sample(kids, 1) 46 | s <- sample(sids, 1) 47 | g <- sample(gids, 1) 48 | ki <- sce$cluster_id == k 49 | si <- sce$sample_id == s & ki 50 | gi <- sce$group_id == g & ki 51 | # sample gene & check frequencies vs. truth 52 | i <- sample(rownames(sce), 1) 53 | expect_identical(mean(counts(sce)[i, si] > 0), assays(x)[[k]][i, s]) 54 | expect_identical(mean(counts(sce)[i, gi] > 0), assays(x)[[k]][i, g]) 55 | }) 56 | }) 57 | -------------------------------------------------------------------------------- /tests/testthat/test-mmDS.R: -------------------------------------------------------------------------------- 1 | # load packages 2 | suppressPackageStartupMessages({ 3 | library(dplyr) 4 | library(purrr) 5 | library(SingleCellExperiment) 6 | }) 7 | 8 | # generate toy dataset 9 | seed <- as.numeric(format(Sys.time(), "%s")) 10 | set.seed(seed) 11 | x <- .toySCE(c(200, 2e3)) 12 | x <- x[, x$group_id != "g3"] 13 | 14 | nk <- length(kids <- x$cluster_id) 15 | ns <- length(sids <- x$sample_id) 16 | ng <- length(gids <- x$group_id) 17 | 18 | test_that(".check_args_mmDS()", { 19 | y <- x; class(y) <- "x" 20 | expect_error(mmDS(y)) 21 | y <- x; assayNames(y) <- "x" 22 | expect_error(mmDS(y)) 23 | expect_error(mmDS(x, "x")) 24 | expect_error(mmDS(x, 100)) 25 | expect_error(mmDS(x, covs = "x")) 26 | expect_error(mmDS(x, method = "x")) 27 | expect_error(mmDS(x, method = "vst", vst = "x")) 28 | for (u in c("verbose", "dup_corr", "trended", "bayesian", "blind", "REML")) 29 | for (v in list("x", 1, c(TRUE, TRUE))) { 30 | w <- list(x = x); w[[u]] <- v 31 | expect_error(do.call(mmDS, w)) 32 | } 33 | }) 34 | 35 | test_that("mmDS() - filtering", { 36 | expect_error(mmDS(x, n_cells = Inf)) 37 | ks <- sample(kids, 2) 38 | ks <- as.character(ks) 39 | cs <- x$cluster_id %in% ks 40 | ls <- rowSums(assay(x[, cs])) 41 | o <- order(ls, decreasing = TRUE) 42 | gs <- rownames(x)[o][seq_len(5)] 43 | z <- suppressWarnings( 44 | mmDS(x[gs, cs], verbose = FALSE)) 45 | expect_setequal(names(z), ks) 46 | expect_true(all(vapply(map(z, "gene"), 47 | function(u) all(u == gs), logical(1)))) 48 | expect_true(all(vapply(ks, function(k) 49 | all(z[[k]]$cluster_id == k), logical(1)))) 50 | y <- x[gs, cs]; metadata(y) <- list() 51 | y$group_id <- NULL; expect_error(mmDS(y)) 52 | }) 53 | 54 | # randomly select 'n_de' genes & bump counts for group 2 55 | n_de <- 5; g2 <- gids == "g2" 56 | de_gs <- sample(rownames(x), n_de) 57 | de_es <- rnbinom(n_de * sum(g2), size = 4, mu = 8) 58 | de_es <- matrix(de_es, n_de, sum(g2)) 59 | assay(x[de_gs, g2], withDimnames = FALSE) <- de_es 60 | vst <- .vst_sctransform(x, verbose = FALSE) 61 | assay(x, "vstresiduals") <- vst 62 | 63 | for (fun in paste0(".mm_", c("dream", "dream2", "vst"))) { 64 | test_that(paste("mmDS-utils;", fun), { 65 | cs <- x$cluster_id == kids[1] 66 | gs <- c(de_gs, sample(setdiff(rownames(x), de_gs), 15)) 67 | # currently not passing; likely because 68 | # the toy data is too simplistic... 69 | # "poisson", "hybrid", "nbinom" 70 | y <- suppressWarnings( 71 | get(fun)(x[gs, cs], verbose = FALSE)) 72 | expect_is(y, "data.frame") 73 | expect_identical(rownames(y), gs) 74 | if (grepl("dream", fun)) 75 | y <- y[y$logFC > 0, ] 76 | top <- order(y$p_adj.loc)[seq_len(n_de)] 77 | expect_setequal(rownames(y)[top], de_gs) 78 | }) 79 | } 80 | -------------------------------------------------------------------------------- /tests/testthat/test-pbDS.R: -------------------------------------------------------------------------------- 1 | # load packages 2 | suppressMessages({ 3 | library(dplyr) 4 | library(purrr) 5 | library(SingleCellExperiment) 6 | }) 7 | 8 | # generate toy dataset 9 | seed <- as.numeric(format(Sys.time(), "%s")) 10 | set.seed(seed) 11 | sce <- .toySCE(c(100, 2e3)) 12 | 13 | nk <- length(kids <- levels(sce$cluster_id)) 14 | ns <- length(sids <- levels(sce$sample_id)) 15 | ng <- length(gids <- levels(sce$group_id)) 16 | 17 | g3 <- sce$group_id == "g3" 18 | g23 <- sce$group_id %in% c("g2", "g3") 19 | 20 | # sample 'nde' genes & multiply counts by 10 for 'g2'- & 'g3'-cells 21 | degs <- sample(rownames(sce), (nde <- 5)) 22 | assay(sce[degs, g23]) <- assay(sce[degs, g23]) * 10 23 | pb <- aggregateData(sce, assay = "counts", fun = "sum") 24 | 25 | # specify design & contrast matrix 26 | ei <- metadata(sce)$experiment_info 27 | design <- model.matrix(~ 0 + ei$group_id) 28 | dimnames(design) <- list(ei$sample_id, levels(ei$group_id)) 29 | contrast <- limma::makeContrasts("g2-g1", "g3-g1", levels = design) 30 | 31 | # default method settings ------------------------------------------------------ 32 | ms <- eval(as.list(args(pbDS))$method) 33 | ms <- setdiff(ms, "DD") 34 | for (method in ms) { 35 | test_that(paste("defaults - pbDS", method, sep = "."), { 36 | res <- pbDS(pb, 37 | method = method, 38 | min_cells = 0, 39 | filter = "none", 40 | verbose = FALSE) 41 | tbl <- res$table[[1]] 42 | expect_identical(names(tbl), kids) 43 | top <- map(tbl, function(u) 44 | dplyr::arrange(u, p_adj.loc) %>% 45 | dplyr::slice(seq_len(5)) %>% 46 | pull("gene")) 47 | expect_true(all(vapply(top, setequal, y = degs, logical(1)))) 48 | }) 49 | } 50 | 51 | # multiple contrast w/ & w/o 'treat' ------------------------------------------- 52 | for (method in ms) { 53 | if (grepl("edgeR|limma", method)) { 54 | treat <- c(FALSE, TRUE) 55 | } else treat <- FALSE 56 | test_that(paste("pbDS", method, sep = "."), { 57 | for (t in treat) { 58 | res <- pbDS(pb, 59 | treat = t, filter = "none", 60 | method = method, verbose = FALSE, 61 | design = design, contrast = contrast) 62 | 63 | expect_identical(length(res$table), ncol(contrast)) 64 | expect_identical(names(res$table), colnames(contrast)) 65 | expect_true(all(vapply(map(res$table, names), "==", 66 | levels(kids), FUN.VALUE = logical(nlevels(kids))))) 67 | 68 | # check that top genes equal 'degs' in ea. comparison & cluster 69 | top <- map_depth(res$table, 2, function(u) { 70 | dplyr::arrange(u, p_adj.loc) %>% 71 | dplyr::slice(seq_len(nde)) %>% 72 | pull("gene") 73 | }) %>% Reduce(f = "c") 74 | expect_true(all(unlist(map(top, setequal, degs)))) 75 | } 76 | }) 77 | } 78 | 79 | # global p-value adjustment ---------------------------------------------------- 80 | test_that(".p_adj_global", { 81 | names(cs) <- cs <- paste0("c", seq_len(5)) 82 | names(ks) <- ks <- paste0("k", seq_len(8)) 83 | ns <- sample(1e3, length(ks)) 84 | names(ns) <- ks 85 | df <- lapply(ks, function(k) lapply(cs, function(c) { 86 | df <- data.frame(p_val = rgamma(ns[k], 0.1)) 87 | df$p_adj.loc <- p.adjust(df$p_val, method = "BH") 88 | return(df) 89 | })) %>% .p_adj_global 90 | for (c in cs) 91 | expect_identical( 92 | p.adjust(bind_rows(df[[c]])$p_val, method = "BH"), 93 | bind_rows(df[[c]])$p_adj.glb) 94 | }) 95 | 96 | # filtering -------------------------------------------------------------------- 97 | test_that("pbDS() filtering", { 98 | sub <- sce[, sample(ncol(sce), 100)] 99 | pbs <- aggregateData(sub, assay = "counts", fun = "sum") 100 | expect_error(pbDS(pbs, verbose = FALSE)) 101 | expect_silent(pbDS(pbs, verbose = FALSE, 102 | min_cells = 0, filter = "none")) 103 | }) 104 | 105 | # pbDD ------------------------------------------------------------------------- 106 | 107 | set.seed(1968) 108 | sce <- .toySCE(c(100, 2e3)) 109 | cs <- which( 110 | sce$group_id == (g <- "g3") & 111 | sce$cluster_id == (k <- "k1")) 112 | cs <- sample(cs, length(cs)) 113 | gs <- sample(rownames(sce), 5) 114 | assay(sce)[gs, cs] <- 0 115 | 116 | test_that("differential testing", { 117 | pbs <- aggregateData(sce, fun="num.detected") 118 | expect_identical( 119 | tbl <- pbDD(pbs, verbose=FALSE)$table[[g]][[k]], 120 | pbDS(pbs, method="DD", verbose=FALSE)$table[[g]][[k]]) 121 | top <- tbl$gene[order(tbl$p_adj.loc)] 122 | expect_setequal(top[seq_along(gs)], gs) 123 | }) 124 | -------------------------------------------------------------------------------- /tests/testthat/test-pbHeatmap.R: -------------------------------------------------------------------------------- 1 | # load packages 2 | suppressMessages({ 3 | library(purrr) 4 | library(scater) 5 | library(SingleCellExperiment) 6 | }) 7 | 8 | # generate toy dataset 9 | set.seed(as.numeric(format(Sys.time(), "%s"))) 10 | sce <- .toySCE() 11 | 12 | nk <- length(kids <- levels(sce$cluster_id)) 13 | ns <- length(sids <- levels(sce$sample_id)) 14 | ng <- length(gids <- levels(sce$group_id)) 15 | 16 | g3 <- sce$group_id == "g3" 17 | g23 <- sce$group_id %in% c("g2", "g3") 18 | 19 | # sample 'nde' genes & multiply counts by 10 for 'g2'- & 'g3'-cells 20 | degs <- sample(rownames(sce), (nde <- 5)) 21 | assay(sce[degs, g23]) <- assay(sce[degs, g23]) * 10 22 | sce <- logNormCounts(computeLibraryFactors(sce)) 23 | 24 | # run DS analysis using 'edgeR' on pseudobulks 25 | pb <- aggregateData(sce, assay = "counts", fun = "sum") 26 | res <- pbDS(pb, method = "edgeR", min_cells = 0, filter = "none", verbose = FALSE) 27 | 28 | # compute pseudobulk mean of logcounts 29 | pb <- aggregateData(sce, assay = "logcounts", fun = "mean") 30 | 31 | test_that("pbHeatmap() - input arguments", { 32 | # get list of default parameters 33 | defs <- as.list(eval(formals(pbHeatmap))) 34 | defs$x <- sce; defs$y <- res 35 | expect_is(do.call(pbHeatmap, defs), "Heatmap") 36 | # check that invalid arguments throw error 37 | fail <- list(g = "x", k = "x", c = "x", 38 | sort_by = "x", sort_by = "gene", 39 | sort_by = c("logFC", "p_val"), 40 | lfc = "x", lfc = c(1, 2), 41 | assay = 1, assay = "x", 42 | decreasing = "x", decreasing = c(TRUE, FALSE), 43 | row_anno = "x", row_anno = c(TRUE, FALSE), 44 | col_anno = "x", col_anno = c(TRUE, FALSE), 45 | normalize = "x", normalize = c(TRUE, FALSE)) 46 | for (i in seq_along(fail)) { 47 | args <- defs; args[[names(fail)[i]]] <- fail[[i]] 48 | expect_error(do.call(pbHeatmap, args)) 49 | } 50 | }) 51 | 52 | test_that("pbHeatmap() - subset of clusters", { 53 | ks <- sample(kids, (nks <- 3)) 54 | p <- pbHeatmap(sce, res, k = ks, 55 | lfc = 0, fdr = Inf, sort_by = "none", 56 | top_n = (nds <- 10), normalize = FALSE) 57 | expect_is(p, "Heatmap") 58 | gs <- rownames(y <- p@matrix)[seq_len(nds)] 59 | expect_equal(dim(y), c(nks*nds, ns)) 60 | expect_identical(rownames(y), rep(rownames(sce)[seq_len(nds)], nks)) 61 | expect_identical(colnames(y), sids) 62 | for (i in seq_len(nks)) { 63 | k <- kids[match(ks, kids)][i] 64 | expect_equal(y[seq_len(nds)+nds*(i-1), ], assay(pb, k)[gs, ]) 65 | } 66 | }) 67 | 68 | test_that("pbHeatmap() - subset of genes", { 69 | gs <- sample(rownames(sce), (ngs <- 20)) 70 | p <- pbHeatmap(sce, res, g = gs, 71 | lfc = 0, fdr = Inf, sort_by = "none", 72 | top_n = Inf, normalize = FALSE) 73 | expect_is(p, "Heatmap") 74 | gs <- rownames(y <- p@matrix)[seq_len(ngs)] 75 | expect_equal(dim(y), c(nk*ngs, ns)) 76 | expect_identical(rownames(y), rep(gs, nk)) 77 | expect_identical(colnames(y), sids) 78 | }) 79 | -------------------------------------------------------------------------------- /tests/testthat/test-pbMDS.R: -------------------------------------------------------------------------------- 1 | # generate toy dataset 2 | seed <- as.numeric(format(Sys.time(), "%s")) 3 | set.seed(seed); x <- .toySCE() 4 | 5 | nk <- length(kids <- levels(x$cluster_id)) 6 | ns <- length(sids <- levels(x$sample_id)) 7 | ng <- length(gids <- levels(x$group_id)) 8 | 9 | test_that("pbMDS()", { 10 | y <- x; class(y) <- "x" 11 | expect_error(pbMDS(y)) 12 | y <- aggregateData(x) 13 | expect_is(p <- pbMDS(y), "ggplot") 14 | expect_true(nrow(p$data) == nk*ns) 15 | expect_true(all(table(p$data$cluster_id) == ns)) 16 | expect_true(all(table(p$data$group_id) == ns*nk/ng)) 17 | cs1 <- x$group_id != gids[1] # remove group 18 | cs2 <- x$sample_id != sids[1] # remove sample 19 | cs3 <- x$cluster_id != kids[1] # remove cluster 20 | cs4 <- cs1 & cs2 # remove sample in single group 21 | cs5 <- cs1 & cs3 # remove cluster in single group 22 | cs6 <- cs2 & cs3 # remove cluster-sample instance 23 | for (cs in paste0("cs", seq_len(6))) 24 | expect_silent(pbMDS(aggregateData(x[, get(cs)]))) 25 | }) 26 | -------------------------------------------------------------------------------- /tests/testthat/test-prepSCE.R: -------------------------------------------------------------------------------- 1 | # generate toy dataset 2 | seed <- as.numeric(format(Sys.time(), "%s")) 3 | set.seed(seed); x <- .toySCE() 4 | 5 | nk <- length(kids <- levels(x$cluster_id)) 6 | ns <- length(sids <- levels(x$sample_id)) 7 | ng <- length(gids <- levels(x$group_id)) 8 | 9 | test_that("prepSCE()", { 10 | y <- x; class(y) <- "x" 11 | expect_error(prepSCE(y)) 12 | expect_is(y <- prepSCE(x), "SingleCellExperiment") 13 | expect_identical(dim(y), dim(x)) 14 | expect_identical(dimnames(y), dimnames(x)) 15 | ids <- formals("prepSCE") 16 | ids <- ids[grep("[a-z]id", names(ids))] 17 | ids <- unname(unlist(ids)) 18 | expect_identical(colnames(colData(y)), ids) 19 | x$foo <- sample(ncol(x)) 20 | y <- prepSCE(x, drop = TRUE) 21 | expect_true(is.null(y$foo)) 22 | y <- prepSCE(x, drop = FALSE) 23 | expect_identical(y$foo, x$foo) 24 | expect_false(is.null(metadata(y)$experiment_info)) 25 | }) 26 | -------------------------------------------------------------------------------- /tests/testthat/test-prepSim.R: -------------------------------------------------------------------------------- 1 | # generate toy dataset 2 | seed <- as.numeric(format(Sys.time(), "%s")) 3 | set.seed(seed); x <- .toySCE() 4 | 5 | nk <- length(kids <- levels(x$cluster_id)) 6 | ns <- length(sids <- levels(x$sample_id)) 7 | ng <- length(gids <- levels(x$group_id)) 8 | 9 | test_that("prepSim()", { 10 | z <- x; class(z) <- "x" 11 | expect_error(prepSim(z)) 12 | expect_error(prepSim(x, group_keep = "x")) 13 | y <- prepSim(x, 0, 0, 0, 0) 14 | expect_is(y, "SingleCellExperiment") 15 | expect_true(nrow(y) == nrow(x)) 16 | expect_true(ncol(y) == sum(x$group_id == gids[1])) 17 | g <- sample(setdiff(gids, gids[1]), 1) 18 | y <- prepSim(x, 0, 0, 0, 0, group_keep = g) 19 | expect_true(ncol(y) == sum(x$group_id == g)) 20 | }) 21 | 22 | test_that("non-factor ID columns work", { 23 | y <- x; for (i in names(colData(y))) 24 | y[[i]] <- as.character(y[[i]]) 25 | expect_silent(prepSim(y, 0, 0, 0, 0, verbose = FALSE)) 26 | }) 27 | 28 | test_that("single cluster", { 29 | g <- sample(levels(x$group_id), 1) 30 | k <- sample(levels(x$cluster_id), 1) 31 | y <- x[, i <- x$cluster_id == k & x$group_id == g] 32 | z <- prepSim(y, min_size = 0, min_genes = 0, verbose = FALSE) 33 | expect_true(ncol(z) == sum(i)) 34 | expect_true(!"cluster_id" %in% names(colData(z))) 35 | }) 36 | -------------------------------------------------------------------------------- /tests/testthat/test-resDS.R: -------------------------------------------------------------------------------- 1 | # load packages 2 | suppressMessages({ 3 | library(dplyr) 4 | library(purrr) 5 | library(SingleCellExperiment) 6 | }) 7 | 8 | # generate toy dataset 9 | seed <- as.numeric(format(Sys.time(), "%s")) 10 | set.seed(seed) 11 | x <- .toySCE() 12 | 13 | nk <- length(kids <- levels(x$cluster_id)) 14 | ns <- length(sids <- levels(x$sample_id)) 15 | ng <- length(gids <- levels(x$group_id)) 16 | 17 | # sample 'n_de' genes & multiply counts by 10 for 'g2/3'-cells 18 | g23 <- x$group_id != "g1" 19 | de_gs <- sample(rownames(x), (n_de <- 5)) 20 | assay(x[de_gs, g23]) <- assay(x[de_gs, g23]) * 10 21 | 22 | # aggregate & run pseudobulk DS analysis 23 | nc <- length(cs <- list(2, 3)) 24 | pbs <- aggregateData(x, assay = "counts", fun = "sum") 25 | y <- pbDS(pbs, 26 | coef = cs, verbose = FALSE, 27 | # assure everything is being tested 28 | min_cells = 0, filter = "none") 29 | 30 | test_that("resDS()", { 31 | v <- list(col = list(nr = nrow(x)*nk, ng = nrow(x), nk = nk)) 32 | v$row <- lapply(v$col, "*", nc) 33 | v$col$char_cols <- c("gene", "cluster_id") 34 | v$row$char_cols <- c(v$col$char_cols, "coef") 35 | for (bind in c("row", "col")) { 36 | z <- resDS(x, y, bind, frq = FALSE, cpm = FALSE) 37 | expect_is(z, "data.frame") 38 | expect_identical(nrow(z), v[[bind]]$nr) 39 | expect_true(all(table(z$gene) == v[[bind]]$nk)) 40 | expect_true(all(table(z$cluster_id) == v[[bind]]$ng)) 41 | is_char <- colnames(z) %in% v[[bind]]$char_cols 42 | expect_true(all(apply(z[, !is_char], 2, class) == "numeric")) 43 | expect_true(all(apply(z[, is_char], 2, class) == "character")) 44 | } 45 | }) 46 | test_that("resDS() - 'frq = TRUE'", { 47 | z <- resDS(x, y, frq = TRUE) 48 | u <- z[, grep("frq", colnames(z))] 49 | expect_true(ncol(u) == ns + ng) 50 | expect_true(all(u <= 1 & u >= 0 | is.na(u))) 51 | # remove single cluster-sample instance 52 | s <- sample(sids, 1); k <- sample(kids, 1) 53 | x_ <- x[, !(x$sample_id == s & x$cluster_id == k)] 54 | y_ <- aggregateData(x_, assay = "counts", fun = "sum") 55 | y_ <- pbDS(y_, coef = cs, verbose = FALSE) 56 | z <- resDS(x_, y_, frq = TRUE) 57 | u <- z[, grep("frq", colnames(z))] 58 | expect_true(ncol(u) == ns + ng) 59 | expect_true(all(u <= 1 & u >= 0 | is.na(u))) 60 | expect_true(all(is.na(z[z$cluster_id == k, paste0(s, ".frq")]))) 61 | }) 62 | test_that("resDS() - 'cpm = TRUE'", { 63 | z <- resDS(x, y, cpm = TRUE) 64 | u <- z[, grep("cpm", colnames(z))] 65 | expect_true(ncol(u) == ns) 66 | expect_true(all(u %% 1 == 0 | is.na(u))) 67 | }) 68 | 69 | test_that("missing cluster is handled", { 70 | k <- sample(kids, 1) 71 | i <- !(x$cluster_id == k) 72 | pbs <- aggregateData(x[, i], verbose = FALSE) 73 | res <- pbDS(pbs, verbose = FALSE) 74 | tbl <- resDS(x, res, cpm = TRUE) 75 | expect_true(!k %in% unique(tbl$cluster_id)) 76 | }) 77 | test_that("missing sample is handled", { 78 | s <- sample(sids, 1) 79 | i <- !(x$sample_id == s) 80 | pbs <- aggregateData(x[, i], verbose = FALSE) 81 | res <- pbDS(pbs, verbose = FALSE) 82 | tbl <- resDS(x, res, cpm = TRUE) 83 | expect_true(sum(grepl(s, names(tbl))) == 0) 84 | }) 85 | test_that("missing cluster-sample is handled", { 86 | k <- sample(kids, 1) 87 | s <- sample(sids, 1) 88 | i <- !(x$cluster_id == k & x$sample_id == s) 89 | pbs <- aggregateData(x[, i], verbose = FALSE) 90 | res <- pbDS(pbs, verbose = FALSE) 91 | tbl <- resDS(x, res, cpm = TRUE) 92 | sub <- tbl[tbl$cluster_id == k, ] 93 | expect_true(all(is.na(sub[, grep(s, names(sub))]))) 94 | }) 95 | -------------------------------------------------------------------------------- /tests/testthat/test-simData.R: -------------------------------------------------------------------------------- 1 | suppressMessages({ 2 | library(dplyr) 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | data(example_sce) 7 | ref <- prepSim(example_sce, verbose = FALSE) 8 | ng <- 200; nc <- 2e3; ns <- 3; nk <- 2 9 | 10 | test_that("no groups, no samples, no clusters", { 11 | s <- sample(levels(ref$sample_id), 1) 12 | k <- sample(levels(ref$cluster_id), 1) 13 | sub <- ref[, ref$sample_id == s & ref$cluster_id == k] 14 | sim <- simData(sub, ng, nc, dd = FALSE, force = TRUE) 15 | expect_null(sim$group_id) 16 | expect_null(sim$sample_id) 17 | expect_null(sim$cluster_id) 18 | }) 19 | 20 | test_that("no groups, yes samples and clusters", { 21 | sids <- levels(ref$sample_id) 22 | kids <- levels(ref$cluster_id) 23 | sim <- simData(ref, ng, nc, dd = FALSE, force = TRUE) 24 | expect_null(sim$group_id) 25 | expect_true(setequal(metadata(sim)$ref_sids, sids)) 26 | expect_true(setequal(metadata(sim)$ref_kids, kids)) 27 | }) 28 | 29 | test_that("only samples", { 30 | k <- sample(levels(ref$cluster_id), 1) 31 | sub <- ref[, ref$cluster_id == k] 32 | sim <- simData(sub, ng, nc, dd = FALSE, force = TRUE) 33 | expect_null(sim$group_id) 34 | expect_null(sim$culster_id) 35 | expect_setequal(metadata(sim)$ref_sids, levels(sub$sample_id)) 36 | }) 37 | 38 | test_that("only clusters", { 39 | s <- sample(levels(ref$sample_id), 1) 40 | sub <- ref[, ref$sample_id == s] 41 | sim <- simData(sub, ng, nc, dd = FALSE, force = TRUE) 42 | expect_null(sim$group_id) 43 | expect_null(sim$sample_id) 44 | expect_setequal(metadata(sim)$ref_kids, levels(sub$cluster_id)) 45 | }) 46 | 47 | 48 | test_that("pbDS() gets at least 50% right for 10% DE genes", { 49 | replicate(3, { 50 | sim <- simData(ref, ng, nc, ns, nk, 51 | p_dd = c(0.9,0,0.1,0,0,0), force = TRUE) 52 | gi <- metadata(sim)$gene_info 53 | pb <- aggregateData(sim) 54 | re <- pbDS(pb, verbose = FALSE) 55 | re <- bind_rows(re$table[[1]]) 56 | re <- arrange(re, p_adj.loc) 57 | gi$id <- with(gi, paste0(gene, cluster_id)) 58 | re$id <- with(re, paste0(gene, cluster_id)) 59 | n_de <- sum(de <- gi$category == "de") 60 | expect_gte(sum(re$id[seq_len(n_de)] %in% gi$id[de]), round(n_de/2)) 61 | }) 62 | }) 63 | 64 | test_that("Single group simulation", { 65 | gs <- c("A", "B") 66 | ps <- list(c(1, 0), c(0, 1)) 67 | names(ps) <- gs 68 | for (g in gs) { 69 | x <- simData(ref, 70 | nc, ns, nk, ng = 10, force = TRUE, 71 | probs = list(NULL, NULL, ps[[g]])) 72 | expect_identical(levels(x$group_id), g) 73 | sids <- sprintf("sample%s.%s", seq_len(ns), g) 74 | expect_identical(levels(x$sample_id), sids) 75 | gi <- metadata(x)$gene_info 76 | ms <- paste0("sim_mean.", setdiff(gs, g)) 77 | expect_true(all(is.na(gi[[ms]]))) 78 | } 79 | }) 80 | 81 | test_that("Pure simulations give single DD category", { 82 | for (c in cats) { 83 | sim <- simData(ref, nc, ns, nk, 84 | p_dd = as.numeric(cats == c), 85 | ng = ng, force = TRUE) 86 | gi <- metadata(sim)$gene_info 87 | expect_equal(table(gi$category)[[c]], ng*nk) 88 | } 89 | }) 90 | 91 | test_that("simData() - input arguments", { 92 | # number of genes mismatch b/w simulation & reference 93 | expect_error(simData(ref, ng = 100, force = FALSE)) 94 | expect_silent(simData(ref, ng = 100, force = TRUE)) 95 | nk <- length(kids <- levels(ref$cluster_id)) 96 | # named 'rel_lfc's (mis)match cluster names 97 | lfc <- rep(1, nk); names(lfc) <- kids; lfc2 <- lfc; names(lfc2)[1] <- "x" 98 | expect_silent(simData(ref, nk = nk, rel_lfc = lfc, ng = 10, force = TRUE)) 99 | expect_error(simData(ref, nk = nk, rel_lfc = lfc2, ng = 10, force = TRUE)) 100 | }) 101 | 102 | test_that("Type genes & cluster phylogeny", { 103 | ng <- 1e3; pt <- 0.05 104 | t <- "(('cluster1':0.1,'cluster2':0.1):0.4,'cluster3':0.5);" 105 | args <- list( 106 | list(pt = pt, t = NULL), # type genes but no phylogeny 107 | list(t = t, pt = 0)) # both, type genes and phylogeny 108 | cs <- c("type", "state") # possible gene classes 109 | for (i in seq_along(args)) { 110 | rd <- rowData(x <- simData(ref, ng = ng, force = TRUE, 111 | p_type = args[[i]]$pt, phylo_tree = args[[i]]$t)) 112 | expect_is(rd, "DataFrame") 113 | expect_equal(dim(rd), c(ng, 2)) 114 | expect_equal(colnames(rd), c("class", "specs")) 115 | ns <- as.list(table(rd$class)) 116 | is_state <- rd$class == "state" 117 | expect_true(all(is.na(unlist(rd$specs[is_state])))) 118 | expect_true(all(unlist(rd$specs[!is_state]) %in% levels(x$cluster_id))) 119 | } 120 | # missing clusters in input phylogeny 121 | t <- "(('cluster1':0.1,'cluster2':0.1):0.4,'cluster4':0.5);" 122 | expect_error(simData(ref, phylo_tree = t)) 123 | # specification of both, 'p_type' and 'phylo_tree' 124 | expect_error(simData(ref, p_type = pt, phylo_tree = t)) 125 | }) 126 | -------------------------------------------------------------------------------- /tests/testthat/test-utils.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(SingleCellExperiment) 3 | }) 4 | 5 | x <- .toySCE() 6 | kids <- levels(x$cluster_id) 7 | sids <- levels(x$sample_id) 8 | 9 | test_that(".filter_matrix()", { 10 | replicate(5, { 11 | x <- matrix(sample(seq_len(200)), 10, 20) 12 | y <- .filter_matrix(x, (n <- sample(200, 1))) 13 | expect_true(all(y >= n) | any(dim(y) == 1)) 14 | }) 15 | }) 16 | test_that(".update_sce()", { 17 | replicate(5, { 18 | ks <- sample(kids, (nk <- 2)) 19 | ss <- sample(sids, (ns <- 3)) 20 | cs <- x$cluster_id %in% ks & x$sample_id %in% ss 21 | y <- .update_sce(x[, cs]) 22 | expect_equal(ncol(y), sum(cs)) 23 | expect_true(setequal(levels(y$cluster_id), ks)) 24 | expect_true(setequal(levels(y$sample_id), ss)) 25 | }) 26 | }) 27 | test_that(".filter_sce()", { 28 | replicate(5, { 29 | ks <- sample(kids, (nk <- 2)) 30 | ss <- sample(sids, (ns <- 3)) 31 | y <- .filter_sce(x, ks, ss) 32 | expect_true(setequal(levels(y$cluster_id), ks)) 33 | expect_true(setequal(levels(y$sample_id), ss)) 34 | cs <- x$cluster_id %in% ks & x$sample_id %in% ss 35 | expect_equal(ncol(y), sum(cs)) 36 | expect_equal(colnames(y), colnames(x)[cs]) 37 | ei <- metadata(y)$experiment_info 38 | expect_true(setequal(ei$sample_id, ss)) 39 | }) 40 | }) 41 | test_that(".scale()", { 42 | replicate(5, { 43 | y <- .scale(x <- matrix(runif(200), (n <- 10), (m <- 20))) 44 | expect_true(all(apply(y, 1, min) == 0)) 45 | expect_true(all(apply(y, 1, max) == 1)) 46 | qs <- quantile(x[1, ], c(0.01, 0.99)) 47 | z <- (x[1, ] - qs[1]) / diff(qs) 48 | z[z < 0] <- 0; z[z > 1] <- 1 49 | expect_true(all(y[1, ] == z)) 50 | i <- sample(n, 1); j <- sample(m, 1) 51 | x[i, j] <- NA; z <- .scale(x) 52 | expect_true(sum(is.na(z)) == 1) 53 | os <- lapply(list(y, z), function(u) { 54 | rngs <- colRanges(u[-i, -j], 55 | na.rm = TRUE, useNames = FALSE) 56 | apply(rngs, 2, order) 57 | }) 58 | expect_identical(os[[1]], os[[2]]) 59 | }) 60 | }) 61 | -------------------------------------------------------------------------------- /tests/testthat/test-validityChecks.R: -------------------------------------------------------------------------------- 1 | # load packages 2 | suppressMessages({ 3 | library(SingleCellExperiment) 4 | }) 5 | sce <- .toySCE() 6 | 7 | test_that(".check_sce", { 8 | # object is not 'SingleCellExperiment' 9 | x <- sce; class(x) <- "x" 10 | expect_error(.check_sce(x)) 11 | # "group_id" 'colData' column not required 12 | x <- sce; x$group_id <- NULL 13 | expect_silent(.check_sce(x, FALSE)) 14 | # missing "x_id" 'colData' column; 15 | # x = "sample", "cluster", "group" 16 | for (i in colnames(colData(sce))) { 17 | x <- sce; x[[i]] <- NULL 18 | expect_error(.check_sce(x)) 19 | } 20 | }) 21 | 22 | test_that(".check_arg_assay", { 23 | expect_error(.check_arg_assay(sce, 1)) 24 | expect_error(.check_arg_assay(sce, "x")) 25 | expect_error(.check_arg_assay(sce, c(assayNames(sce)[1], "x"))) 26 | expect_silent(.check_arg_assay(sce, assayNames(sce)[1])) 27 | }) 28 | 29 | test_that(".check_args_simData", { 30 | u <- list(x = sce, ng = 10, nc = 100, ns = 3, nk = 2, probs = NULL, 31 | p_dd = diag(6)[1, ], p_type = 0.1, lfc = 1, rel_lfc = NULL, 32 | p_ep = 0.1, p_dp = 0.1, p_dm = 0.1, paired = FALSE, force = TRUE, 33 | phylo_tree = NULL, phylo_pars = c(0, 3)) 34 | expect_silent(.check_args_simData(u)) 35 | # 'ng', 'nc', 'ns', 'nk' should be single numerics > 0 36 | for (arg in c("ng", "nc", "ns", "nk")) { 37 | v <- u; v[[arg]] <- 1; expect_silent(.check_args_simData(v)) 38 | for (val in list(numeric(2)+1, numeric(1), NA, "a")) 39 | v[[arg]] <- val; expect_error(.check_args_simData(v)) 40 | } 41 | # 'p_dd' should be length of DD categories, sum to 1, and be in [0,1] 42 | v <- u; for (val in list(rep(1,5)/5, rep(1,6)/5, c(-1,2,rep(0,4)))) 43 | v$p_dd <- val; expect_error(.check_args_simData(v)) 44 | # 'paired' should be length-one logical 45 | v <- u; for (val in list("x", 123, c(TRUE, FALSE))) 46 | v$paired <- val; expect_error(.check_args_simData(v)) 47 | # 'rel_lfc' should be >0 and be of length 'nk' 48 | u$rel_lfc <- rep(1, u$nk) 49 | expect_silent(.check_args_simData(u)) 50 | v <- u; for (val in list(-1, rep(1,u$nk+1))) 51 | v$rel_lfc <- val; expect_error(.check_args_simData(v)) 52 | }) 53 | 54 | test_that(".check_frq()", { 55 | y <- calcExprFreqs(x <- sce) 56 | expect_silent(.check_frq(x, y)) 57 | z <- x; class(z) <- "x" 58 | expect_error(.check_frq(z, y)) 59 | z <- y; class(z) <- "x" 60 | expect_error(.check_frq(x, z)) 61 | for (v in c(-1, 10)) { 62 | z <- y; assay(z)[1, 1] <- v 63 | expect_error(.check_frq(x, z)) 64 | } 65 | }) 66 | -------------------------------------------------------------------------------- /tests/testthat/test_utils-simData.R: -------------------------------------------------------------------------------- 1 | nk <- length(k <- paste0("cluster", seq_len(5))) 2 | ns <- length(s <- paste0("sample", seq_len(4))) 3 | ng <- length(g <- paste0("group", seq_len(3))) 4 | 5 | test_that(".get_ns", { 6 | replicate(10, { 7 | # default to using all samples when 8 | # dd = FALSE or dd = TRUE, paired = TRUE 9 | ns_ref <- sample(100, 1) 10 | args <- list( 11 | list(dd = TRUE, paired = TRUE), 12 | list(dd = FALSE, paired = TRUE), 13 | list(dd = FALSE, paired = FALSE)) 14 | for (. in args) { 15 | .$ns_ref <- ns_ref 16 | .$force <- FALSE 17 | . <- c(., list(ns_sim = NULL)) 18 | ns <- do.call(.get_ns, .) 19 | expect_identical(ns_ref, ns) 20 | } 21 | # error when force = FALSE and 22 | # desired number of samples to simulate 23 | # exceeds available reference samples 24 | ns_sim <- ns_ref + 1 25 | args <- list( 26 | list(dd = TRUE, paired = TRUE), 27 | list(dd = TRUE, paired = FALSE), 28 | list(dd = FALSE, paired = TRUE), 29 | list(dd = FALSE, paired = FALSE)) 30 | for (. in args) { 31 | .$ns_ref <- ns_ref 32 | .$ns_sim <- ns_sim 33 | .$force <- FALSE 34 | expect_error(do.call(.get_ns, .)) 35 | # pass when force = TRUE 36 | .$force <- TRUE 37 | ns <- do.call(.get_ns, .) 38 | expect_identical(ns_sim, ns) 39 | } 40 | }) 41 | }) 42 | 43 | test_that(".split_cells", { 44 | n_cells <- 1e3 45 | cells <- paste0("cell", seq_len(n_cells)) 46 | x <- matrix(0, 47 | nrow = 1, ncol = n_cells, 48 | dimnames = list(NULL, cells)) 49 | cd <- data.frame( 50 | row.names = cells, 51 | cluster_id = sample(k, n_cells, TRUE), 52 | sample_id = sample(s, n_cells, TRUE)) 53 | 54 | cs <- .split_cells(cd, "cluster_id") 55 | expect_identical(names(cs), k) 56 | expect_identical( 57 | as.numeric(vapply(cs, length, numeric(1))), 58 | as.numeric(table(cd$cluster_id))) 59 | 60 | cs <- .split_cells(cd, c("cluster_id", "sample_id")) 61 | expect_identical(names(cs), k) 62 | nms_lvl2 <- vapply(cs, names, character(length(s))) 63 | expect_true(all(apply(nms_lvl2, 2, identical, s))) 64 | 65 | cs <- .split_cells(cd, c("sample_id", "cluster_id")) 66 | expect_identical(names(cs), s) 67 | nms_lvl2 <- vapply(cs, names, character(length(k))) 68 | expect_true(all(apply(nms_lvl2, 2, identical, k))) 69 | }) 70 | 71 | test_that(".sample_cell_md", { 72 | ids <- list(k, s, g) 73 | md <- .sample_cell_md((n <- 1e3), ids) 74 | ms <- vapply(apply(md, 2, table), mean, numeric(1)) 75 | expect_true(all(vapply(seq_along(ids), function(i) 76 | ms[[i]] == n/length(ids[[i]]), logical(1)))) 77 | set.seed(1); a <- .sample_cell_md(n, ids) 78 | set.seed(1); b <- .sample_cell_md(n, ids, 79 | list(rep(1/nk,nk),rep(1/ns,ns),rep(1/ng,ng))) 80 | expect_identical(a, b) 81 | }) 82 | -------------------------------------------------------------------------------- /vignettes/detection.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Differential detection analysis" 3 | author: 4 | - name: Jeroen Gilis 5 | affiliation: 6 | - &one Applied Mathematics, Computer science and Statistics, Ghent University, Ghent, Belgium 7 | - &two Bioinformatics Institute, Ghent University, Ghent, Belgium 8 | - Data Mining and Modeling for Biomedicine, VIB Flemish Institute for Biotechnology, Ghent, Belgium 9 | - name: Helena L Crowell 10 | affiliation: 11 | - National Center for Genomic Analysis (CNAG), Barcelona, Spain 12 | - name: Davide Risso 13 | affiliation: 14 | - Department of Statistical Sciences, University of Padova, Padova, Italy 15 | - Padua Center for Network Medicine, University of Padova, Padova, Italy 16 | - name: Lieven Clement 17 | affiliation: 18 | - *one 19 | - *two 20 | package: "`r BiocStyle::pkg_ver('muscat')`" 21 | date: "`r format(Sys.Date(), '%B %d, %Y')`" 22 | output: 23 | BiocStyle::html_document 24 | vignette: > 25 | %\VignetteIndexEntry{"3. Differential detection"} 26 | %\VignettePackage{muscat} 27 | %\VignetteEngine{knitr::rmarkdown} 28 | %\VignetteEncoding{UTF-8} 29 | bibliography: "`r file.path(system.file('extdata', package='muscat'), 'refs.bib')`" 30 | abstract: > 31 |

In this vignette, we display how `muscat` can be used to perform differrential detection (DD) analyses in multi-sample, multi-group, multi-(cell-)subpopulation scRNA-seq data. Furthermore, we show how DD and differential state (DS) analysis results on the same data can be effectively combined. This vignette thus introduces a workflow that allows users to jointly assess two biological hypotheses that often contain orthogonal information, which thus can be expected to improve their understanding of complex biological phenomena, at no extra cost. 32 | --- 33 | 34 | 39 | 40 | ```{r cache, include=FALSE} 41 | knitr::opts_chunk$set(cache=TRUE) 42 | ``` 43 | 44 | ```{r echo=FALSE, message=FALSE, warning=FALSE} 45 | library(BiocStyle) 46 | ``` 47 | 48 | *** 49 | 50 | Based on @Gilis2023 51 | 52 | > Gilis J, Perin L, Malfait M, Van den Berge K, 53 | Assefa AT, Verbist B, Risso D, and Clement L: 54 | Differential detection workflows for 55 | multi-sample single-cell RNA-seq data. 56 | *bioRxiv* (2023). [DOI: 10.1101/2023.12.17.572043](https://doi.org/10.1101/2023.12.17.572043) 57 | 58 | # Load packages {-} 59 | 60 | ```{r load-libs, message=FALSE, warning=FALSE} 61 | library(dplyr) 62 | library(purrr) 63 | library(tidyr) 64 | library(scater) 65 | library(muscat) 66 | library(ggplot2) 67 | library(patchwork) 68 | ``` 69 | 70 | # Introduction 71 | 72 | Single-cell RNA-sequencing (scRNA-seq) has improved our understanding of complex biological processes by elucidating cell-level heterogeneity in gene expression. One of the key tasks in the downstream analysis of scRNA-seq data is studying differential gene expression (DE). Most DE analysis methods aim to identify genes for which the *average* expression differs between biological groups of interest, e.g., between cell types or between diseased and healthy cells. As such, most methods allow for assessing only one aspect of the gene expression distribution: the mean. However, in scRNA-seq data, differences in other characteristics between count distributions can commonly be observed. 73 | 74 | One such characteristic is gene detection, i.e., the number of cells in which a gene is (detectably) expressed. Analogous to a DE analysis, a differential detection (DD) analysis aims to identify genes for which the *average fraction of cells in which the gene is detected* changes between groups. In @Gilis2023, we show how DD analysis contain information that is biologically relevant, and that is largely orthogonal to the information obtained from DE analysis on the same data. 75 | 76 | In this vignette, we display how `muscat` can be used to perform DD analyses in multi-sample, multi-group, multi-(cell-)subpopulation scRNA-seq data. Furthermore, we show how DD and DS analysis results on the same data can be effectively combined using a two-stage testing approach. This workflow thus allows users to jointly assess two biological hypotheses containing orthogonal information, which thus can be expected to improve their understanding of complex biological phenomena, at no extra cost. 77 | 78 | # Setup 79 | 80 | We will use the same data as in the differential state (DS) analyses described in `r Biocpkg("muscat", vignette = "analysis.html")`, namely, scRNA-seq data acquired on PBMCs from 8 patients before and after IFN-$\beta$ treatment. For a more detailed description of these data and subsequent preprocessing, we refer to `r Biocpkg("muscat", vignette = "analysis.html")`. 81 | 82 | ```{r load-data, message=FALSE} 83 | library(ExperimentHub) 84 | eh <- ExperimentHub() 85 | query(eh, "Kang") 86 | (sce <- eh[["EH2259"]]) 87 | ``` 88 | 89 | We further apply some minimal filtering to remove low-quality genes and cells, and use `prepSCE()` to standardize cell metadata such that slots specifying cluster (`cell`), sample (`stim`+`ind`), and group (`stim`) identifiers conform with the `muscat` framework: 90 | 91 | ```{r prep-data} 92 | sce <- sce[rowSums(counts(sce) > 0) > 0, ] 93 | qc <- perCellQCMetrics(sce) 94 | sce <- sce[, !isOutlier(qc$detected, nmads=2, log=TRUE)] 95 | sce <- sce[rowSums(counts(sce) > 1) >= 10, ] 96 | sce$id <- paste0(sce$stim, sce$ind) 97 | sce <- prepSCE(sce, "cell", "id", "stim") 98 | table(sce$cluster_id, sce$group_id) 99 | table(sce$sample_id) 100 | ``` 101 | 102 | ## Aggregation 103 | 104 | In general, `aggregateData()` will aggregate the data by the `colData` variables specified with argument `by`, and return a `SingleCellExperiment` containing pseudobulk data. 105 | 106 | To perform a pseudobulk-level analysis, measurements must be aggregated at the cluster-sample level (default `by = c("cluster_id", "sample_id"`). In this case, the returned `SingleCellExperiment` will contain one assay per cluster, where rows = genes and columns = samples. Arguments `assay` and `fun` specify the input data and summary statistic, respectively, to use for aggregation. 107 | 108 | In a differential detection (DD) analysis, the default choice of the summary statistic used for aggregation is `fun = "num.detected"`. This strategy can be thought of as first binarizing the gene expression values (1: expressed, 0: not expressed), and subsequently performing a simple summation of the binarized gene expression counts for cells belonging to the same cluster-sample level. Hence, the resulting pseudobulk-level expression count reflects the total number of cells in a particular cluster-sample level with a non-zero gene expression value. 109 | 110 | In a differential state (DS) analysis, the default choice for aggregation is `fun = "sum"`, which amounts to the simple summation of the raw gene expression counts of cells belonging to the same cluster-sample level. 111 | 112 | ```{r pbs-det} 113 | pb_sum <- aggregateData(sce, 114 | assay="counts", fun="sum", 115 | by=c("cluster_id", "sample_id")) 116 | pb_det <- aggregateData(sce, 117 | assay="counts", fun="num.detected", 118 | by=c("cluster_id", "sample_id")) 119 | t(head(assay(pb_det))) 120 | ``` 121 | 122 | @Qiu2020 demonstrated that binarizing scRNA-seq counts generates expression profiles that still accurately reflect biological variation. 123 | This finding was confirmed by @Bouland2021, who showed that the frequencies of zero counts capture biological variability, and further claimed that a binarized representation of the single-cell expression data allows for a more robust description of the relative abundance of transcripts than counts. 124 | 125 | ```{r pbs-mds, fig.width=8, fig.height=4, fig.cap="Pseudobulk-level multidimensional scaling (MDS) plot based on (A) sum of counts and (B) sum of binarized counts (i.e., counting the number of detected features) in each cluster-sample."} 126 | pbMDS(pb_sum) + ggtitle("Σ counts") + 127 | pbMDS(pb_det) + ggtitle("# detected") + 128 | plot_layout(guides="collect") + 129 | plot_annotation(tag_levels="A") & 130 | theme(legend.key.size=unit(0.5, "lines")) 131 | ``` 132 | 133 | ## Analysis 134 | 135 | Once we have assembled the pseudobulk data, we can test for DD using `pbDD()`. By default, a $\sim$`group_id` model is fit, and the last coefficient of the linear model is tested to be equal to zero. 136 | 137 | ```{r pbDD} 138 | res_DD <- pbDD(pb_det, min_cells=0, filter="none", verbose=FALSE) 139 | ``` 140 | 141 | ## Handling and visualizing results 142 | 143 | Inspection, manipulation, and visualization of DD analysis results follows the same principles as for a DS analysis. For a detailed description, we refer to the DS analysis vignette`r Biocpkg("muscat", vignette = "analysis.html")`. Below, some basic functionalities are being displayed. 144 | 145 | ```{r} 146 | tbl <- res_DD$table[[1]] 147 | # one data.frame per cluster 148 | names(tbl) 149 | ``` 150 | 151 | ```{r} 152 | # view results for 1st cluster 153 | k1 <- tbl[[1]] 154 | head(format(k1[, -ncol(k1)], digits = 2)) 155 | ``` 156 | 157 | ```{r} 158 | # filter FDR < 5%, |logFC| > 1 & sort by adj. p-value 159 | tbl_fil <- lapply(tbl, \(u) 160 | filter(u, 161 | p_adj.loc < 0.05, 162 | abs(logFC) > 1) |> 163 | arrange(p_adj.loc)) 164 | 165 | # nb. of DS genes & % of total by cluster 166 | n_de <- vapply(tbl_fil, nrow, numeric(1)) 167 | p_de <- format(n_de / nrow(sce) * 100, digits = 3) 168 | data.frame("#DD" = n_de, "%DD" = p_de, check.names = FALSE) 169 | ``` 170 | 171 | ```{r} 172 | library(UpSetR) 173 | de_gs_by_k <- map(tbl_fil, "gene") 174 | upset(fromList(de_gs_by_k)) 175 | ``` 176 | 177 | # Stagewise anaysis 178 | 179 | While DD analysis results may contain biologically relevant information in their own right, we show in @Gilis2023 that combing DD and DS analysis results on the same data can further improve our understanding of complex biological phenomena. In the remainder of this vignette, we show how DD and DS analysis results on the same data can be effectively combined. 180 | 181 | For this, we build on the two-stage testing paradigm proposed by @Vandenberge2017. In the first stage of this testing procedure, we identify differential genes by using an omnibus test for differential detection and differential expression (DE). The null hypothesis for this test is that the gene is neither differentially detected, nor differentially expressed. 182 | 183 | In the second stage, we perform post-hoc tests on the differential genes from stage one to unravel whether they are DD, DE or both. Compared to the individual DD and DS analysis results, the two-stage approach increases statistical power and provides better type 1 error control. 184 | 185 | ```{r pbDS} 186 | res_DS <- pbDS(pb_sum, min_cells=0, filter="none", verbose=FALSE) 187 | ``` 188 | 189 | ```{r} 190 | res <- stagewise_DS_DD(res_DS, res_DD, verbose=FALSE) 191 | head(res[[1]][[1]]) # results for 1st cluster 192 | ``` 193 | 194 | ## Comparison 195 | 196 | ```{r} 197 | # for each approach, get adjusted p-values across clusters 198 | ps <- map_depth(res, 2, \(df) { 199 | data.frame( 200 | df[, c("gene", "cluster_id")], 201 | p_adj.stagewise=df$p_adj, 202 | p_adj.DS=df$res_DS$p_adj.loc, 203 | p_adj.DD=df$res_DD$p_adj.loc) 204 | }) |> 205 | lapply(do.call, what=rbind) |> 206 | do.call(what=rbind) |> 207 | data.frame(row.names=NULL) 208 | head(ps) 209 | ``` 210 | 211 | To get an overview of how different approaches compare, we can count the number of genes found differential in each cluster for a given FDR threshold: 212 | 213 | ```{r fig.width=12, fig.height=4} 214 | # for each approach & cluster, count number 215 | # of genes falling below 5% FDR threshold 216 | ns <- lapply(seq(0, 0.2, 0.005), \(th) { 217 | ps |> 218 | mutate(th=th) |> 219 | group_by(cluster_id, th) |> 220 | summarise( 221 | .groups="drop", 222 | across(starts_with("p_"), 223 | \(.) sum(. < th, na.rm=TRUE))) 224 | }) |> 225 | do.call(what=rbind) |> 226 | pivot_longer(starts_with("p_")) 227 | ggplot(ns, aes(th, value, col=name)) + 228 | geom_line(linewidth=0.8, key_glyph="point") + 229 | geom_vline(xintercept=0.05, lty=2, linewidth=0.4) + 230 | guides(col=guide_legend(NULL, override.aes=list(size=3))) + 231 | labs(x="FDR threshold", y="number of significantly\ndifferential genes") + 232 | facet_wrap(~cluster_id, scales="free_y", nrow=2) + 233 | theme_bw() + theme( 234 | panel.grid.minor=element_blank(), 235 | legend.key.size=unit(0.5, "lines")) 236 | ``` 237 | 238 | We can further identify which hits are shared between or unique to a given approach. 239 | In the example below, for instance, the vast majority of hits is common to all approaches, many hits are shared between DD and stagewise testing, and only few genes are specific to any one approach: 240 | 241 | ```{r upset, fig.width = 5, fig.height = 3, fig.cap = "Upset plot of differential findings (FDR < 0.05) across DS, DD, and stagewise analysis for an exemplary cluster; shown are the 50 most frequent interactions."} 242 | # subset adjuster p-values for cluster of interest 243 | qs <- ps[grep("CD4", ps$cluster_id), grep("p_", names(ps))] 244 | # for each approach, extract genes at 5% FDR threshold 245 | gs <- apply(qs, 2, \(.) ps$gene[. < 0.05]) 246 | # visualize set intersections between approaches 247 | UpSetR::upset(UpSetR::fromList(gs), order.by="freq") 248 | ``` 249 | 250 | ```{r} 251 | # extract genes unique to stagewise testing 252 | sw <- grep("stagewise", names(gs)) 253 | setdiff(gs[[sw]], unlist(gs[-sw])) 254 | ``` 255 | 256 | # Session info {- .smaller} 257 | 258 | ```{r session-info} 259 | sessionInfo() 260 | ``` 261 | 262 | # References --------------------------------------------------------------------------------