├── .Rbuildignore
├── .github
    └── workflows
    │   └── check-bioc.yml
├── .gitignore
├── DESCRIPTION
├── NAMESPACE
├── NEWS
├── R
    ├── aggregateData.R
    ├── calcExprFreqs.R
    ├── data.R
    ├── mmDS.R
    ├── pbDS.R
    ├── pbFlatten.R
    ├── pbHeatmap.R
    ├── pbMDS.R
    ├── prepSCE.R
    ├── prepSim.R
    ├── resDS.R
    ├── simData.R
    ├── stagewiseDD.R
    ├── utils-mmDS.R
    ├── utils-pbDS.R
    ├── utils-simData.R
    ├── utils.R
    └── validity-checks.R
├── README.md
├── data
    └── example_sce.rda
├── inst
    └── extdata
    │   ├── 1a.png
    │   ├── 1b.png
    │   ├── 1d.png
    │   ├── muscat.png
    │   └── refs.bib
├── man
    ├── aggregateData.Rd
    ├── calcExprFreqs.Rd
    ├── data.Rd
    ├── mmDS.Rd
    ├── pbDS.Rd
    ├── pbFlatten.Rd
    ├── pbHeatmap.Rd
    ├── pbMDS.Rd
    ├── prepSCE.Rd
    ├── prepSim.Rd
    ├── resDS.Rd
    ├── simData.Rd
    └── stagewise_DS_DD.Rd
├── tests
    ├── testthat.R
    └── testthat
    │   ├── test-aggregateData.R
    │   ├── test-calcExprFreqs.R
    │   ├── test-mmDS.R
    │   ├── test-pbDS.R
    │   ├── test-pbHeatmap.R
    │   ├── test-pbMDS.R
    │   ├── test-prepSCE.R
    │   ├── test-prepSim.R
    │   ├── test-resDS.R
    │   ├── test-simData.R
    │   ├── test-utils.R
    │   ├── test-validityChecks.R
    │   └── test_utils-simData.R
└── vignettes
    ├── analysis.Rmd
    ├── detection.Rmd
    └── simulation.Rmd


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^\.git$
2 | ^\.github$
3 | ^\.gitignore$
4 | 


--------------------------------------------------------------------------------
/.github/workflows/check-bioc.yml:
--------------------------------------------------------------------------------
  1 | ## Read more about GitHub actions the features of this GitHub Actions workflow
  2 | ## at https://lcolladotor.github.io/biocthis/articles/biocthis.html#use_bioc_github_action
  3 | ##
  4 | ## For more details, check the biocthis developer notes vignette at
  5 | ## https://lcolladotor.github.io/biocthis/articles/biocthis_dev_notes.html
  6 | ##
  7 | ## You can add this workflow to other packages using:
  8 | ## > biocthis::use_bioc_github_action()
  9 | ##
 10 | ## Using GitHub Actions exposes you to many details about how R packages are
 11 | ## compiled and installed in several operating system.s
 12 | ### If you need help, please follow the steps listed at
 13 | ## https://github.com/r-lib/actions#where-to-find-help
 14 | ##
 15 | ## If you found an issue specific to biocthis's GHA workflow, please report it
 16 | ## with the information that will make it easier for others to help you.
 17 | ## Thank you!
 18 | 
 19 | ## Acronyms:
 20 | ## * GHA: GitHub Action
 21 | ## * OS: operating system
 22 | 
 23 | on:
 24 |   push:
 25 |   pull_request:
 26 | 
 27 | name: R-CMD-check-bioc
 28 | 
 29 | ## These environment variables control whether to run GHA code later on that is
 30 | ## specific to testthat, covr, and pkgdown.
 31 | ##
 32 | ## If you need to clear the cache of packages, update the number inside
 33 | ## cache-version as discussed at https://github.com/r-lib/actions/issues/86.
 34 | ## Note that you can always run a GHA test without the cache by using the word
 35 | ## "/nocache" in the commit message.
 36 | env:
 37 |   has_testthat: 'true'
 38 |   run_covr: 'false'
 39 |   run_pkgdown: 'false'
 40 |   has_RUnit: 'false'
 41 |   cache-version: 'cache-v1'
 42 |   run_docker: 'false'
 43 |   bioc_version: 'bioc-release'
 44 |   ## Valid options are:
 45 |   ##    "bioc-release"
 46 |   ##    "bioc-devel"
 47 |   ##    or a specific number like "3.20"
 48 | 
 49 | jobs:
 50 |   bioc-config:
 51 |     runs-on: ubuntu-latest
 52 |     outputs:
 53 |       matrix: ${{ steps.set-bioc-matrix.outputs.matrix }}
 54 |     steps:
 55 |       ## Adapted from
 56 |       ## https://runs-on.com/github-actions/the-matrix-strategy/#dynamic-matrix-generation
 57 |       - id: set-bioc-matrix
 58 |         run: |
 59 |          bioc=$(curl -L https://bioconductor.org/config.yaml)
 60 |          if [[ "$bioc_version" == "bioc-release" ]]; then
 61 |            echo "Finding the latest BioC release version and the corresponding R version"
 62 |            biocversion=$(echo "$bioc" | grep "release_version: " | grep -Eo "[0-9]{1}\.[0-9]{2}")
 63 |            rversion=$(echo "$bioc" | grep "r_version_associated_with_release: " | grep -Eo "[0-9]{1}\.[0-9]{1}")
 64 |            biocmajor=$(echo "$biocversion" | cut -c 1-1)
 65 |            biocminor=$(echo "$biocversion" | cut -c 3-4)
 66 |            bioccont=$(echo "bioconductor/bioconductor_docker:RELEASE_${biocmajor}_${biocminor}")
 67 |          elif [[ "$bioc_version" == "bioc-devel" ]]; then
 68 |            echo "Finding the latest BioC devel version and the corresponding R version"
 69 |            biocversion=$(echo "$bioc" | grep "devel_version: " | grep -Eo "[0-9]{1}\.[0-9]{2}")
 70 |            rversion_release=$(echo "$bioc" | grep "r_version_associated_with_release: " | grep -Eo "[0-9]{1}\.[0-9]{1}")
 71 |            rversion_devel=$(echo "$bioc" | grep "r_version_associated_with_devel: " | grep -Eo "[0-9]{1}\.[0-9]{1}")
 72 |            if [[ "$rversion_release" == "$rversion_devel" ]]; then
 73 |              rversion=$(echo "$rversion_devel")
 74 |            else
 75 |              rversion="devel"
 76 |            fi
 77 |            bioccont="bioconductor/bioconductor_docker:devel"
 78 |          else
 79 |            echo "Finding the the R version for bioc version ${bioc_version}"
 80 |            biocversion=$(echo "$bioc_version")
 81 |            rversion=$(echo "$bioc" | sed -En "/r_ver_for_bioc_ver/,/release_dates/p" | grep "$bioc_version\":" | grep -Eo ": \"[0-9]{1}\.[0-9]{1}" | grep -Eo "[0-9]{1}\.[0-9]{1}")
 82 |            biocmajor=$(echo "$biocversion" | cut -c 1-1)
 83 |            biocminor=$(echo "$biocversion" | cut -c 3-4)
 84 |            bioccont=$(echo "bioconductor/bioconductor_docker:RELEASE_${biocmajor}_${biocminor}")
 85 |          fi
 86 |          echo "Found these settings:"
 87 |          echo "Bioconductor version: $biocversion, R version: $rversion, Bioconductor docker name: $bioccont"
 88 |          echo "matrix={ \"config\": [{\"os\" : \"ubuntu-latest\", \"r\" : \"${rversion}\", \"bioc\" : \"${biocversion}\", \"cont\" : \"${bioccont}\"} , {\"os\" : \"macOS-latest\", \"r\" : \"${rversion}\", \"bioc\" : \"${biocversion}\"} , {\"os\" : \"windows-latest\", \"r\" : \"${rversion}\", \"bioc\" : \"${biocversion}\" }] }" >> "$GITHUB_OUTPUT"
 89 |          ## If an OS is failing and you don't want to test it, manually remove it from the 'matrix' JSON entries above
 90 | 
 91 |   build-check:
 92 |     needs: bioc-config
 93 |     strategy:
 94 |       fail-fast: false
 95 |       matrix: ${{fromJson(needs.bioc-config.outputs.matrix)}}
 96 |     runs-on: ${{ matrix.config.os }}
 97 |     name: ${{ matrix.config.os }} (${{ matrix.config.r }})
 98 |     container: ${{ matrix.config.cont }}
 99 |     env:
100 |       R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
101 |       NOT_CRAN: true
102 |       TZ: UTC
103 |       GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
104 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
105 | 
106 |     steps:
107 | 
108 |       ## Most of these steps are the same as the ones in
109 |       ## https://github.com/r-lib/actions/blob/master/examples/check-standard.yaml
110 |       ## If they update their steps, we will also need to update ours.
111 |       - name: Checkout Repository
112 |         uses: actions/checkout@v3
113 | 
114 |       ## R is already included in the Bioconductor docker images
115 |       - name: Setup R from r-lib
116 |         if: runner.os != 'Linux'
117 |         uses: r-lib/actions/setup-r@v2
118 |         with:
119 |           r-version: ${{ matrix.config.r }}
120 |           http-user-agent: ${{ matrix.config.http-user-agent }}
121 | 
122 |       ## pandoc is already included in the Bioconductor docker images
123 |       - name: Setup pandoc from r-lib
124 |         if: runner.os != 'Linux'
125 |         uses: r-lib/actions/setup-pandoc@v2
126 | 
127 |       ## Create the path that will be used for caching packages on Linux
128 |       - name: Create R_LIBS_USER on Linux
129 |         if: runner.os == 'Linux'
130 |         run: |
131 |           R_LIBS_USER=/__w/_temp/Library
132 |           echo "R_LIBS_USER=$R_LIBS_USER" >> "$GITHUB_ENV"
133 |           mkdir -p $R_LIBS_USER
134 | 
135 |       ## Use cached R packages
136 |       - name: Restore R package cache
137 |         if: "!contains(github.event.head_commit.message, '/nocache')"
138 |         uses: actions/cache@v4
139 |         with:
140 |           path: ${{ env.R_LIBS_USER }}
141 |           key: ${{ matrix.config.os }}-${{ matrix.config.r }}-${{ matrix.config.bioc }}-${{ inputs.cache-version }}
142 |           restore-keys: ${{ matrix.config.os }}-${{ matrix.config.r }}-${{ matrix.config.bioc }}--${{ inputs.cache-version }}
143 | 
144 |       ## remotes is needed for isntalling the Linux system dependencies
145 |       ## as well as other R packages later on.
146 |       - name: Install remotes
147 |         run: |
148 |           message(paste('****', Sys.time(), 'installing remotes ****'))
149 |           install.packages('remotes')
150 |         shell: Rscript {0}
151 | 
152 |       ## This will work again once https://github.com/r-lib/remotes/commit/0e4e23051041d9f1b15a5ab796defec31af6190d
153 |       ## makes it to the CRAN version of remotes
154 | 
155 |       # - name: Install Linux system dependencies
156 |       #   if: runner.os == 'Linux'
157 |       #   run: |
158 |       #     sysreqs=$(Rscript -e 'cat("apt-get update -y && apt-get install -y", paste(gsub("apt-get install -y ", "", remotes::system_requirements("ubuntu", "24.04")), collapse = " "))')
159 |       #     echo $sysreqs
160 |       #     sudo -s eval "$sysreqs"
161 | 
162 |       - name: Install macOS system dependencies
163 |         if: matrix.config.os == 'macOS-latest'
164 |         run: |
165 |           ## Enable installing XML from source if needed
166 |           brew install libxml2
167 |           echo "XML_CONFIG=/usr/local/opt/libxml2/bin/xml2-config" >> $GITHUB_ENV
168 | 
169 |           ## Required to install magick as noted at
170 |           ## https://github.com/r-lib/usethis/commit/f1f1e0d10c1ebc75fd4c18fa7e2de4551fd9978f#diff-9bfee71065492f63457918efcd912cf2
171 |           brew install imagemagick@6
172 | 
173 |           ## For textshaping, required by ragg, and required by pkgdown
174 |           brew install harfbuzz fribidi
175 | 
176 |           ## For installing usethis's dependency gert
177 |           brew install libgit2
178 | 
179 |           ## Required for tcltk
180 |           brew install xquartz --cask
181 | 
182 |       - name: Install Windows system dependencies
183 |         if: runner.os == 'Windows'
184 |         run: |
185 |           ## Edit below if you have any Windows system dependencies
186 |         shell: Rscript {0}
187 | 
188 |       - name: Install BiocManager
189 |         run: |
190 |           message(paste('****', Sys.time(), 'installing BiocManager ****'))
191 |           remotes::install_cran("BiocManager")
192 |         shell: Rscript {0}
193 | 
194 |       - name: Set BiocVersion
195 |         run: |
196 |           BiocManager::install(version = "${{ matrix.config.bioc }}", ask = FALSE, force = TRUE)
197 |         shell: Rscript {0}
198 | 
199 |       - name: Install dependencies pass 1
200 |         run: |
201 |           ## Try installing the package dependencies in steps. First the local
202 |           ## dependencies, then any remaining dependencies to avoid the
203 |           ## issues described at
204 |           ## https://stat.ethz.ch/pipermail/bioc-devel/2020-April/016675.html
205 |           ## https://github.com/r-lib/remotes/issues/296
206 |           ## Ideally, all dependencies should get installed in the first pass.
207 | 
208 |           ## For running the checks
209 |           message(paste('****', Sys.time(), 'installing rcmdcheck and BiocCheck ****'))
210 |           install.packages(c("rcmdcheck", "BiocCheck"), repos = BiocManager::repositories())
211 | 
212 |           ## Pass #1 at installing dependencies
213 |           message(paste('****', Sys.time(), 'pass number 1 at installing dependencies: local dependencies ****'))
214 |           remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = FALSE, upgrade = TRUE)
215 |         continue-on-error: true
216 |         shell: Rscript {0}
217 | 
218 |       - name: Install dependencies pass 2
219 |         run: |
220 |           ## Pass #2 at installing dependencies
221 |           message(paste('****', Sys.time(), 'pass number 2 at installing dependencies: any remaining dependencies ****'))
222 |           remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = TRUE, upgrade = TRUE, force = TRUE)
223 |         shell: Rscript {0}
224 | 
225 |       - name: Install BiocGenerics
226 |         if:  env.has_RUnit == 'true'
227 |         run: |
228 |           ## Install BiocGenerics
229 |           BiocManager::install("BiocGenerics")
230 |         shell: Rscript {0}
231 | 
232 |       - name: Install covr
233 |         if: github.ref == 'refs/heads/devel' && env.run_covr == 'true' && runner.os == 'Linux'
234 |         run: |
235 |           remotes::install_cran("covr")
236 |         shell: Rscript {0}
237 | 
238 |       - name: Install pkgdown
239 |         if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux'
240 |         run: |
241 |           remotes::install_cran("pkgdown")
242 |         shell: Rscript {0}
243 | 
244 |       - name: Session info
245 |         run: |
246 |           options(width = 100)
247 |           pkgs <- installed.packages()[, "Package"]
248 |           sessioninfo::session_info(pkgs, include_base = TRUE)
249 |         shell: Rscript {0}
250 | 
251 |       - name: Run CMD check
252 |         env:
253 |           _R_CHECK_CRAN_INCOMING_: false
254 |           DISPLAY: 99.0
255 |         run: |
256 |           options(crayon.enabled = TRUE)
257 |           rcmdcheck::rcmdcheck(
258 |               args = c("--no-manual", "--no-vignettes", "--timings"),
259 |               build_args = c("--no-manual", "--keep-empty-dirs", "--no-resave-data"),
260 |               error_on = "warning",
261 |               check_dir = "check"
262 |           )
263 |         shell: Rscript {0}
264 | 
265 |       ## Might need an to add this to the if:  && runner.os == 'Linux'
266 |       - name: Reveal testthat details
267 |         if:  env.has_testthat == 'true'
268 |         run: find . -name testthat.Rout -exec cat '{}' ';'
269 | 
270 |       - name: Run RUnit tests
271 |         if:  env.has_RUnit == 'true'
272 |         run: |
273 |           BiocGenerics:::testPackage()
274 |         shell: Rscript {0}
275 | 
276 |       - name: Run BiocCheck
277 |         env:
278 |           DISPLAY: 99.0
279 |         run: |
280 |           BiocCheck::BiocCheck(
281 |               dir('check', 'tar.gz$', full.names = TRUE),
282 |               `quit-with-status` = TRUE,
283 |               `no-check-R-ver` = TRUE,
284 |               `no-check-bioc-help` = TRUE
285 |           )
286 |         shell: Rscript {0}
287 | 
288 |       - name: Test coverage
289 |         if: github.ref == 'refs/heads/devel' && env.run_covr == 'true' && runner.os == 'Linux'
290 |         run: |
291 |           covr::codecov(coverage = covr::package_coverage(type = "all"))
292 |         shell: Rscript {0}
293 | 
294 |       - name: Install package
295 |         if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux'
296 |         run: R CMD INSTALL .
297 | 
298 |       - name: Build pkgdown site
299 |         if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux'
300 |         run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
301 |         shell: Rscript {0}
302 |         ## Note that you need to run pkgdown::deploy_to_branch(new_process = FALSE)
303 |         ## at least one locally before this will work. This creates the gh-pages
304 |         ## branch (erasing anything you haven't version controlled!) and
305 |         ## makes the git history recognizable by pkgdown.
306 | 
307 |       - name: Install deploy dependencies
308 |         if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux'
309 |         run: |
310 |           apt-get update && apt-get -y install rsync
311 | 
312 |       - name: Deploy pkgdown site to GitHub pages 🚀
313 |         if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux'
314 |         uses: JamesIves/github-pages-deploy-action@releases/v4
315 |         with:
316 |           clean: false
317 |           branch: gh-pages
318 |           folder: docs
319 | 
320 |       - name: Upload check results
321 |         if: failure()
322 |         uses: actions/upload-artifact@master
323 |         with:
324 |           name: ${{ runner.os }}-${{ matrix.config.r }}-${{ matrix.config.bioc }}-results
325 |           path: check
326 | 
327 | 
328 |   ## Code adapted from
329 |   ## https://github.com/waldronlab/cBioPortalData/blob/e0440a4445f0cc731e426363a76faa22ee5e0f9d/.github/workflows/devel_check_dock.yml#L65-L92
330 |   docker-build-and-push:
331 |     runs-on: ubuntu-latest
332 |     needs: build-check
333 |     steps:
334 |       - name: Checkout Repository
335 |         if: "!contains(github.event.head_commit.message, '/nodocker') && env.run_docker == 'true' && github.ref == 'refs/heads/devel'"
336 |         uses: actions/checkout@v3
337 | 
338 |       - name: Register repo name
339 |         if: "!contains(github.event.head_commit.message, '/nodocker') && env.run_docker == 'true' && github.ref == 'refs/heads/devel'"
340 |         id: reg_repo_name
341 |         run: |
342 |           echo CONT_IMG_NAME=$(echo ${{ github.event.repository.name }} | tr '[:upper:]' '[:lower:]') >> $GITHUB_ENV
343 | 
344 |       - name: Set up QEMU
345 |         if: "!contains(github.event.head_commit.message, '/nodocker') && env.run_docker == 'true' && github.ref == 'refs/heads/devel'"
346 |         uses: docker/setup-qemu-action@v2
347 | 
348 |       - name: Set up Docker Buildx
349 |         if: "!contains(github.event.head_commit.message, '/nodocker') && env.run_docker == 'true' && github.ref == 'refs/heads/devel'"
350 |         uses: docker/setup-buildx-action@v2
351 | 
352 |       - name: Login to Docker Hub
353 |         if: "!contains(github.event.head_commit.message, '/nodocker') && env.run_docker == 'true' && github.ref == 'refs/heads/devel'"
354 |         uses: docker/login-action@v2
355 |         with:
356 |           username: ${{ secrets.DOCKERHUB_USERNAME }}
357 |           password: ${{ secrets.DOCKERHUB_TOKEN }}
358 |       ## Note that DOCKERHUB_TOKEN is really a token for your dockerhub
359 |       ## account, not your actual dockerhub account password. You can get it
360 |       ## from https://hub.docker.com/settings/security.
361 |       ## Check https://github.com/docker/build-push-action/tree/v4.0.0
362 |       ## for more details.
363 |       ## Alternatively, try checking
364 |       ## https://seandavi.github.io/BuildABiocWorkshop/articles/HOWTO_BUILD_WORKSHOP.html.
365 | 
366 |       - name: Build and Push Docker
367 |         if: "!contains(github.event.head_commit.message, '/nodocker') && env.run_docker == 'true' && github.ref == 'refs/heads/devel' && success()"
368 |         uses: docker/build-push-action@v4
369 |         with:
370 |           context: .
371 |           push: true
372 |           tags: >
373 |             ${{ secrets.DOCKERHUB_USERNAME }}/${{ env.CONT_IMG_NAME }}:latest,
374 |             ${{ secrets.DOCKERHUB_USERNAME }}/${{ env.CONT_IMG_NAME }}:devel
375 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .RData
2 | .Rhistory
3 | .DS_Store
4 | .Ruserdata
5 | .Rproj.user
6 | vignettes/*/
7 | vignettes/*.html


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: muscat
 2 | Title: Multi-sample multi-group scRNA-seq data analysis tools
 3 | Description: `muscat` provides various methods and visualization tools 
 4 |     for DS analysis in multi-sample, multi-group, multi-(cell-)subpopulation 
 5 |     scRNA-seq data, including cell-level mixed models and methods based on 
 6 |     aggregated “pseudobulk” data, as well as a flexible simulation platform 
 7 |     that mimics both single and multi-sample scRNA-seq data.
 8 | Type: Package
 9 | Version: 1.22.0
10 | Depends: R (>= 4.4)
11 | Authors@R: c(
12 | 	person("Helena L.", "Crowell", 
13 | 	    role=c("aut", "cre"),
14 | 	    email="helena@crowell.eu", 
15 | 	    comment=c(ORCID="0000-0002-4801-1767")), 
16 | 	person("Pierre-Luc", "Germain", role="aut"),
17 | 	person("Charlotte", "Soneson", role="aut"),
18 | 	person("Anthony", "Sonrel", role="aut"),
19 | 	person("Jeroen", "Gilis", role="aut"),
20 | 	person("Davide", "Risso", role="aut"),
21 | 	person("Lieven", "Clement", role="aut"),
22 | 	person("Mark D.", "Robinson", role=c("aut", "fnd"), 
23 | 	    email="mark.robinson@imls.uzh.ch"))
24 | Imports: 
25 | 	BiocParallel, blme, 
26 | 	ComplexHeatmap, 
27 | 	data.table, DESeq2, dplyr, 
28 | 	edgeR, 
29 | 	ggplot2, glmmTMB, grDevices, grid, 
30 | 	limma, lmerTest, lme4, 
31 | 	Matrix, matrixStats, methods, 
32 | 	progress, purrr,  
33 | 	rlang,
34 | 	S4Vectors, scales, scater, scuttle, sctransform, stats, 
35 | 	SingleCellExperiment, SummarizedExperiment, 
36 | 	variancePartition, viridis
37 | Suggests: 
38 | 	BiocStyle,
39 | 	countsimQC,
40 | 	ExperimentHub, 
41 | 	iCOBRA,
42 | 	knitr, 
43 | 	patchwork,
44 | 	phylogram,
45 | 	RColorBrewer,
46 | 	reshape2,
47 | 	rmarkdown,
48 | 	statmod,
49 | 	stageR,
50 | 	testthat, 
51 | 	UpSetR
52 | biocViews: ImmunoOncology, DifferentialExpression, Sequencing, 
53 |     SingleCell, Software, StatisticalMethod, Visualization
54 | License: GPL-3
55 | VignetteBuilder: knitr
56 | RoxygenNote: 7.3.2
57 | Encoding: UTF-8
58 | URL: https://github.com/HelenaLC/muscat
59 | BugReports: https://github.com/HelenaLC/muscat/issues
60 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
  1 | # Generated by roxygen2: do not edit by hand
  2 | 
  3 | export(aggregateData)
  4 | export(calcExprFreqs)
  5 | export(mmDS)
  6 | export(pbDD)
  7 | export(pbDS)
  8 | export(pbFlatten)
  9 | export(pbHeatmap)
 10 | export(pbMDS)
 11 | export(prepSCE)
 12 | export(prepSim)
 13 | export(resDS)
 14 | export(simData)
 15 | export(stagewise_DS_DD)
 16 | import(ggplot2)
 17 | importFrom(BiocParallel,MulticoreParam)
 18 | importFrom(BiocParallel,SerialParam)
 19 | importFrom(BiocParallel,bplapply)
 20 | importFrom(ComplexHeatmap,Heatmap)
 21 | importFrom(ComplexHeatmap,columnAnnotation)
 22 | importFrom(ComplexHeatmap,rowAnnotation)
 23 | importFrom(DESeq2,DESeq)
 24 | importFrom(DESeq2,DESeqDataSetFromMatrix)
 25 | importFrom(DESeq2,estimateDispersions)
 26 | importFrom(DESeq2,results)
 27 | importFrom(DESeq2,sizeFactors)
 28 | importFrom(DESeq2,varianceStabilizingTransformation)
 29 | importFrom(Matrix,colSums)
 30 | importFrom(Matrix,qr)
 31 | importFrom(Matrix,rowMeans)
 32 | importFrom(Matrix,rowSums)
 33 | importFrom(Matrix,t)
 34 | importFrom(S4Vectors,"metadata<-")
 35 | importFrom(S4Vectors,DataFrame)
 36 | importFrom(S4Vectors,as.list)
 37 | importFrom(S4Vectors,metadata)
 38 | importFrom(S4Vectors,split)
 39 | importFrom(S4Vectors,unfactor)
 40 | importFrom(SingleCellExperiment,"counts<-")
 41 | importFrom(SingleCellExperiment,"int_colData<-")
 42 | importFrom(SingleCellExperiment,"sizeFactors<-")
 43 | importFrom(SingleCellExperiment,SingleCellExperiment)
 44 | importFrom(SingleCellExperiment,colData)
 45 | importFrom(SingleCellExperiment,counts)
 46 | importFrom(SingleCellExperiment,int_colData)
 47 | importFrom(SingleCellExperiment,reducedDims)
 48 | importFrom(SingleCellExperiment,sizeFactors)
 49 | importFrom(SummarizedExperiment,"assay<-")
 50 | importFrom(SummarizedExperiment,"colData<-")
 51 | importFrom(SummarizedExperiment,"rowData<-")
 52 | importFrom(SummarizedExperiment,SummarizedExperiment)
 53 | importFrom(SummarizedExperiment,assay)
 54 | importFrom(SummarizedExperiment,assayNames)
 55 | importFrom(SummarizedExperiment,assays)
 56 | importFrom(SummarizedExperiment,colData)
 57 | importFrom(SummarizedExperiment,rowData)
 58 | importFrom(blme,bglmer)
 59 | importFrom(blme,blmer)
 60 | importFrom(data.table,data.table)
 61 | importFrom(dplyr,"%>%")
 62 | importFrom(dplyr,all_of)
 63 | importFrom(dplyr,bind_cols)
 64 | importFrom(dplyr,bind_rows)
 65 | importFrom(dplyr,filter)
 66 | importFrom(dplyr,full_join)
 67 | importFrom(dplyr,inner_join)
 68 | importFrom(dplyr,last)
 69 | importFrom(dplyr,mutate)
 70 | importFrom(dplyr,mutate_all)
 71 | importFrom(dplyr,mutate_at)
 72 | importFrom(dplyr,mutate_if)
 73 | importFrom(dplyr,pull)
 74 | importFrom(dplyr,rename)
 75 | importFrom(dplyr,select)
 76 | importFrom(edgeR,DGEList)
 77 | importFrom(edgeR,calcNormFactors)
 78 | importFrom(edgeR,cpm)
 79 | importFrom(edgeR,estimateDisp)
 80 | importFrom(edgeR,filterByExpr)
 81 | importFrom(edgeR,glmFit)
 82 | importFrom(edgeR,glmQLFTest)
 83 | importFrom(edgeR,glmQLFit)
 84 | importFrom(edgeR,glmTreat)
 85 | importFrom(edgeR,plotMDS.DGEList)
 86 | importFrom(edgeR,topTags)
 87 | importFrom(glmmTMB,glmmTMB)
 88 | importFrom(glmmTMB,nbinom1)
 89 | importFrom(grDevices,colorRampPalette)
 90 | importFrom(grid,gpar)
 91 | importFrom(limma,contrasts.fit)
 92 | importFrom(limma,duplicateCorrelation)
 93 | importFrom(limma,eBayes)
 94 | importFrom(limma,lmFit)
 95 | importFrom(limma,makeContrasts)
 96 | importFrom(limma,topTable)
 97 | importFrom(limma,topTreat)
 98 | importFrom(limma,treat)
 99 | importFrom(limma,voom)
100 | importFrom(lme4,.makeCC)
101 | importFrom(lme4,lmerControl)
102 | importFrom(lmerTest,contest)
103 | importFrom(lmerTest,lmer)
104 | importFrom(matrixStats,colAnys)
105 | importFrom(matrixStats,rowAnyNAs)
106 | importFrom(matrixStats,rowMedians)
107 | importFrom(matrixStats,rowMins)
108 | importFrom(matrixStats,rowQuantiles)
109 | importFrom(matrixStats,rowSds)
110 | importFrom(methods,is)
111 | importFrom(progress,progress_bar)
112 | importFrom(purrr,map)
113 | importFrom(purrr,map_depth)
114 | importFrom(purrr,modify_depth)
115 | importFrom(purrr,negate)
116 | importFrom(purrr,reduce)
117 | importFrom(purrr,set_names)
118 | importFrom(rlang,.data)
119 | importFrom(scales,hue_pal)
120 | importFrom(scater,computeLibraryFactors)
121 | importFrom(scater,isOutlier)
122 | importFrom(sctransform,vst)
123 | importFrom(scuttle,summarizeAssayByGroup)
124 | importFrom(stats,as.formula)
125 | importFrom(stats,coef)
126 | importFrom(stats,df.residual)
127 | importFrom(stats,model.matrix)
128 | importFrom(stats,p.adjust)
129 | importFrom(stats,residuals)
130 | importFrom(stats,rgamma)
131 | importFrom(stats,rnbinom)
132 | importFrom(stats,sd)
133 | importFrom(stats,setNames)
134 | importFrom(variancePartition,dream)
135 | importFrom(variancePartition,getContrast)
136 | importFrom(variancePartition,voomWithDreamWeights)
137 | importFrom(viridis,viridis)
138 | 


--------------------------------------------------------------------------------
/NEWS:
--------------------------------------------------------------------------------
 1 |                         Changes in version 1.19.1
 2 |                         
 3 | + added J Gilis, D Risso, L Clement as authors
 4 | + differential detection with 'pbDS(..., method="DD")' or 'pbDD()'
 5 |   & stagewise testing [Gilis et al.], plus corresponding vignette
 6 | + replace 'aes_string()' in 'ggplot()' by '.data$.' from 'rlang'
 7 |   
 8 |                         Changes in version 1.15.1
 9 |                         
10 | + bug fix in 'pbDS': too stringent filtering causing no genes in any clusters
11 |   to be tested previously resulted in a 'subscript out of bounds' error;
12 |   execution is stopped and an informative error thrown instead.
13 |   
14 | + bug fix in 'mmDS': 'dream' (new version?) wouldn't recognize
15 |   model variables provided as data; fixed via adding 'as.formula()'.
16 |   
17 | + "analysis" vignette: replaced suspended 'dplyr' function
18 |   'top_n' with 'slice_min' when filtering for top DS hits;
19 |   fixed some typos; updated preprint to journal reference.
20 | 
21 |                         Changes in version 1.15.0
22 |                         
23 | + Bioconductor release v3.17
24 | 
25 |                         Changes in version 1.12.1
26 | 
27 | + fixed various typos in both vignettes
28 | + internal fixes to keep up with 'ggplot2' & 'dplyr' updates
29 | + bug fix in 'simDS' computing means when one group is missing
30 | + bug fix in 'resDS' until testing when 'cpm/frq = TRUE'
31 | 
32 |                         Changes in version 1.11.1
33 | 
34 | + bug fix in pbHeatmap(): previously failed for results from 'mmDS()'
35 | 
36 |                         Changes in version 1.9.3
37 | 
38 | + bug fix in pbDS(): drop samples w/o any detected features,
39 |   otherwise edgeR::calcNormFactors() fails when lib.size 0
40 |   
41 |                         Changes in version 1.8.1
42 | 
43 | + bug fix in prepSim(): removal of genes with NA coefficients
44 |   was previously not propagated to the dispersion estimates 
45 | 
46 | + bug fix in test-resDR.R: set 'min_cells = 0' to assure that
47 |   everything is being tested, otherwise unit tests could fail
48 | 
49 |                         Changes in version 1.8.0
50 | 
51 | + Bioconductor 3.14 release
52 | 
53 |                         Changes in version 1.7.2
54 | 
55 | + bug fix in prepSim(): removal of NA coefficients and 
56 |   subsetting of the input SCE was previously out of synch
57 |   
58 |                         Changes in version 1.5.2
59 | 
60 | + added edgeR::calcNormFactors() step in prepSim()
61 | 
62 | + added argument 'dd' to simData() specifying  
63 |   whether or not to simulate 2 groups
64 | 
65 | + prepSim() and simData() now support simulation of "singular" design  
66 |   (no samples, no clusters), as well as only samples/clusters
67 |   
68 | + simData() defaults to simulating as many samples as available  
69 |   in order to avoid re-use (duplication) of reference samples
70 | 
71 |                         Changes in version 1.5.1
72 | 
73 | + significant speed-up of aggregateData() by replacing usage  
74 |   of rowX() over a list with scuttle::summarizeAssayByGroup()
75 |   
76 | + added options use "prop.detected" and "num.detected" 
77 |   as summary statistic (argument 'fun') in aggregateData()
78 | 
79 | + added parallelization support in aggregateData() and pbDS() through argument BBPARAM
80 |   (passed to scater::sumCountsAcrossCells() and BiocParallel::bplapply, respectively)
81 | 
82 | + aggregateData() now stores the number of cells that went into aggregation under
83 |   int_colData(.)$n_cells (vs. metadata(.)$n_cells) to support automated subsetting
84 |   
85 | + replaced argument n_threads with BPPARAM throughout all 
86 |   parallelizable functions (aggregateData(), pbDS(), mmDS())
87 |   
88 | + bug fix in prepSim(): the function previously failed when 
89 |   cluster/sample/group_id cell metadata columns were non-factors
90 |   
91 | + bug fix in resDS(): cpm = TRUE previously didn't handle
92 |   missing cluster-sample combinations correctly


--------------------------------------------------------------------------------
/R/aggregateData.R:
--------------------------------------------------------------------------------
  1 | #' @rdname aggregateData
  2 | #' @title Aggregation of single-cell to pseudobulk data
  3 | #' 
  4 | #' @description ...
  5 | #' 
  6 | #' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
  7 | #' @param assay character string specifying the assay slot to use as 
  8 | #'   input data. Defaults to the 1st available (\code{assayNames(x)[1]}).
  9 | #' @param by character vector specifying which 
 10 | #'   \code{colData(x)} columns to summarize by (at most 2!).
 11 | #' @param fun a character string.
 12 | #'   Specifies the function to use as summary statistic.
 13 | #'   Passed to \code{\link[scuttle]{summarizeAssayByGroup}}.
 14 | #' @param scale logical. Should pseudo-bulks be scaled
 15 | #'   with the effective library size & multiplied by 1M?
 16 | #' @param BPPARAM a \code{\link[BiocParallel]{BiocParallelParam}}
 17 | #'   object specifying how aggregation should be parallelized.
 18 | #' @param verbose logical. Should information on progress be reported?
 19 | #' 
 20 | #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
 21 | #' \itemize{
 22 | #' \item{If \code{length(by) == 2}, each sheet (\code{assay}) contains 
 23 | #'   pseudobulks for each of \code{by[1]}, e.g., for each cluster when 
 24 | #'   \code{by = "cluster_id"}. Rows correspond to genes, columns to 
 25 | #'   \code{by[2]}, e.g., samples when \code{by = "sample_id"}}.
 26 | #' \item{If \code{length(by) == 1}, the returned SCE will contain only 
 27 | #'   a single \code{assay} with rows = genes and colums = \code{by}.}}
 28 | #'   
 29 | #'   Aggregation parameters (\code{assay, by, fun, scaled}) are stored in 
 30 | #'   \code{metadata()$agg_pars}, and the number of cells that were aggregated 
 31 | #'   are accessible in \code{int_colData()$n_cells}.
 32 | #' 
 33 | #' @examples 
 34 | #' # pseudobulk counts by cluster-sample
 35 | #' data(example_sce)
 36 | #' pb <- aggregateData(example_sce)
 37 | #' 
 38 | #' library(SingleCellExperiment)
 39 | #' assayNames(example_sce)  # one sheet per cluster
 40 | #' head(assay(example_sce)) # n_genes x n_samples
 41 | #' 
 42 | #' # scaled CPM
 43 | #' cpm <- edgeR::cpm(assay(example_sce))
 44 | #' assays(example_sce)$cpm <- cpm
 45 | #' pb <- aggregateData(example_sce, assay = "cpm", scale = TRUE)
 46 | #' head(assay(pb)) 
 47 | #' 
 48 | #' # aggregate by cluster only
 49 | #' pb <- aggregateData(example_sce, by = "cluster_id")
 50 | #' length(assays(pb)) # single assay
 51 | #' head(assay(pb))    # n_genes x n_clusters
 52 | #' 
 53 | #' @author Helena L Crowell & Mark D Robinson
 54 | #' 
 55 | #' @references 
 56 | #' Crowell, HL, Soneson, C, Germain, P-L, Calini, D, 
 57 | #' Collin, L, Raposo, C, Malhotra, D & Robinson, MD: 
 58 | #' On the discovery of population-specific state transitions from 
 59 | #' multi-sample multi-condition single-cell RNA sequencing data. 
 60 | #' \emph{bioRxiv} \strong{713412} (2018). 
 61 | #' doi: \url{https://doi.org/10.1101/713412}
 62 | #' 
 63 | #' @importFrom Matrix colSums
 64 | #' @importFrom purrr map
 65 | #' @importFrom S4Vectors DataFrame metadata
 66 | #' @importFrom SingleCellExperiment SingleCellExperiment int_colData<-
 67 | #' @importFrom SummarizedExperiment colData colData<-
 68 | #' @export
 69 | 
 70 | aggregateData <- function(x, 
 71 |     assay = NULL, by = c("cluster_id", "sample_id"), 
 72 |     fun = c("sum", "mean", "median", "prop.detected", "num.detected"), 
 73 |     scale = FALSE, verbose = TRUE, BPPARAM = SerialParam(progressbar = verbose)) {
 74 |     
 75 |     # check validity of input arguments
 76 |     fun <- match.arg(fun)
 77 |     if (is.null(assay)) 
 78 |         assay <- assayNames(x)[1] 
 79 |     .check_arg_assay(x, assay)
 80 |     .check_args_aggData(as.list(environment()))
 81 |     stopifnot(is(BPPARAM, "BiocParallelParam"))
 82 |     
 83 |     # assure 'by' colData columns are factors
 84 |     # so that missing combinations aren't dropped
 85 |     for (i in by) 
 86 |         if (!is.factor(x[[i]])) 
 87 |             x[[i]] <- factor(x[[i]])
 88 |     
 89 |     # compute pseudo-bulks
 90 |     pb <- .pb(x, by, assay, fun, BPPARAM)
 91 |     if (scale & length(by) == 2) {
 92 |         # compute library sizes
 93 |         cs <- if (assay == "counts" && fun == "sum")
 94 |             pb else .pb(x, by, "counts", "sum", BPPARAM)
 95 |         ls <- lapply(cs, colSums)
 96 |         # scale pseudobulks by CPM
 97 |         pb <- lapply(seq_along(pb), function(i) pb[[i]] / 1e6 * ls[[i]])
 98 |         names(pb) <- names(ls)
 99 |     }
100 |     
101 |     # construct SCE
102 |     md <- metadata(x)
103 |     md$agg_pars <- list(assay = assay, by = by, fun = fun, scale = scale)
104 |     pb <- SingleCellExperiment(pb, rowData = rowData(x), metadata = md)
105 |     
106 |     # tabulate number of cells
107 |     cd <- data.frame(colData(x)[, by])
108 |     for (i in names(cd))
109 |         if (is.factor(cd[[i]]))
110 |             cd[[i]] <- droplevels(cd[[i]])
111 |     ns <- table(cd)
112 |     if (length(by) == 2) {
113 |         ns <- asplit(ns, 2)
114 |         ns <- map(ns, ~c(unclass(.)))
115 |     } else ns <- c(unclass(ns))
116 |     int_colData(pb)$n_cells <- ns
117 |     
118 |     # propagate 'colData' columns that are unique across 2nd 'by'
119 |     if (length(by) == 2) {
120 |         cd <- colData(x)
121 |         ids <- colnames(pb)
122 |         counts <- vapply(ids, function(u) {
123 |             m <- as.logical(match(cd[, by[2]], u, nomatch = 0))
124 |             vapply(cd[m, ], function(u) length(unique(u)), numeric(1))
125 |         }, numeric(ncol(colData(x))))
126 |         cd_keep <- apply(counts, 1, function(u) all(u == 1))
127 |         cd_keep <- setdiff(names(which(cd_keep)), by)
128 |         if (length(cd_keep) != 0) {
129 |             m <- match(ids, cd[, by[2]], nomatch = 0)
130 |             cd <- cd[m, cd_keep, drop = FALSE]
131 |             rownames(cd) <- ids
132 |             colData(pb) <- cd
133 |         }
134 |     }
135 |     return(pb)
136 | }
137 | 


--------------------------------------------------------------------------------
/R/calcExprFreqs.R:
--------------------------------------------------------------------------------
 1 | #' calcExprFreqs
 2 | #' 
 3 | #' Calculates gene expression frequencies
 4 | #' 
 5 | #' \code{calcExprFreq} computes, for each sample and group (in each cluster),
 6 | #' the fraction of cells that express a given gene. Here, a gene is considered 
 7 | #' to be expressed when the specified measurement value (\code{assay}) 
 8 | #' lies above the specified threshold value (\code{th}).
 9 | #' 
10 | #' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
11 | #' @param assay a character string specifying which assay to use.
12 | #' @param th numeric threshold value above which
13 | #'   a gene should be considered to be expressed.
14 | #' 
15 | #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
16 | #'   containing, for each cluster, an assay of dimensions #genes x #samples 
17 | #'   giving the fraction of cells that express each gene in each sample. 
18 | #'   If \code{colData(x)} contains a \code{"group_id"} column, the fraction 
19 | #'   of expressing cells in each each group will be included as well.
20 | #'   
21 | #' @examples
22 | #' data(example_sce)
23 | #' library(SingleCellExperiment)
24 | #' 
25 | #' frq <- calcExprFreqs(example_sce)
26 | #' 
27 | #' # one assay per cluster
28 | #' assayNames(frq)  
29 | #' 
30 | #' # expression frequencies by
31 | #' # sample & group; 1st cluster:
32 | #' head(assay(frq))
33 | #' 
34 | #' @author Helena L Crowell & Mark D Robinson
35 | #' 
36 | #' @importFrom Matrix rowMeans
37 | #' @importFrom methods is
38 | #' @importFrom purrr set_names
39 | #' @importFrom SummarizedExperiment assays colData SummarizedExperiment
40 | #' @export
41 |     
42 | calcExprFreqs <- function(x, assay = "counts", th = 0) {
43 |     # check validity of input arguments
44 |     .check_sce(x, req_group = FALSE)
45 |     .check_arg_assay(x, assay)
46 |     stopifnot(is.numeric(th), length(th) == 1)
47 |     
48 |     # split cells by cluster-sample
49 |     cs_by_ks <- .split_cells(x)
50 | 
51 |     # for each gene, compute fraction of cells 
52 |     # w/ assay value above threshold in each sample
53 |     y <- assays(x)[[assay]]
54 |     fq <- lapply(cs_by_ks, vapply, function(i) {
55 |         if (is.null(i)) return(rep(0, nrow(x)))
56 |         Matrix::rowMeans(y[, i, drop = FALSE] > th)
57 |     }, numeric(nrow(x)))
58 | 
59 |     # same for ea. group (if colData column "group_id" exists)
60 |     if ("group_id" %in% colnames(colData(x))) {
61 |         kids <- x$cluster_id
62 |         nc_by_ks <- table(kids, x$sample_id)
63 |         nc_by_kg <- table(kids, x$group_id)
64 |         
65 |         ei <- metadata(x)$experiment_info
66 |         s_by_g <- split(ei$sample_id, ei$group_id) 
67 |         gids <- names(s_by_g)
68 |         kids <- set_names(levels(kids))
69 |         
70 |         fq <- lapply(kids, function(k) {
71 |             nc_by_s <- fq[[k]] * nc_by_ks[k, ][col(fq[[k]])]
72 |             fq_by_g <- vapply(gids, function(g) {
73 |                 nc_g <- nc_by_s[, s_by_g[[g]], drop = FALSE]
74 |                 rowSums(nc_g) / nc_by_kg[k, g]
75 |             }, numeric(nrow(x)))
76 |             cbind(fq[[k]], fq_by_g)
77 |         })
78 |     }
79 |     # return SCE
80 |     SingleCellExperiment(assays = fq)
81 | }
82 | 


--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
 1 | #' @rdname data
 2 | #' @name data
 3 | #' @aliases data example_sce
 4 | #' 
 5 | #' @title Example datasets
 6 | #' 
 7 | #' @description 
 8 | #' A \code{\link[SingleCellExperiment]{SingleCellExperiment}} containing 
 9 | #' 10x droplet-based scRNA-seq PBCM data from 8 Lupus patients befor and after 
10 | #' 6h-treatment with INF-beta (16 samples in total).
11 | #' 
12 | #' The original data has been filtered to
13 | #' \itemize{
14 | #' \item{remove unassigned cells & cell multiplets}
15 | #' \item{retain only 4 out of 8 samples per experimental group}
16 | #' \item{retain only 5 out of 8 subpopulations (clusters)}
17 | #' \item{retain genes with a count > 1 in > 50 cells}
18 | #' \item{retain cells with > 200 detected genes}
19 | #' \item{retain at most 100 cells per cluster-sample instance}
20 | #' }
21 | #' 
22 | #' Assay \code{logcounts} corresponds to log-normalized values 
23 | #' obtained from \code{\link[scater]{logNormCounts}} with default parameters.
24 | #'   
25 | #' The original measurement data, as well as gene and cell metadata 
26 | #' is available through the NCBI GEO accession number GSE96583;
27 | #' code to reproduce this example dataset from the original data 
28 | #' is provided in the examples section.
29 | #' 
30 | #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
31 | #' 
32 | #' @examples
33 | #' \donttest{
34 | #' # set random seed for cell sampling
35 | #' set.seed(2929)
36 | #' 
37 | #' # load data
38 | #' library(ExperimentHub)
39 | #' eh <- ExperimentHub()
40 | #' sce <- eh[["EH2259"]]
41 | #' 
42 | #' # drop unassigned cells & multiplets
43 | #' sce <- sce[, !is.na(sce$cell)]
44 | #' sce <- sce[, sce$multiplets == "singlet"]
45 | #' 
46 | #' # keep 4 samples per group
47 | #' sce$id <- paste0(sce$stim, sce$ind)
48 | #' inds <- sample(sce$ind, 4)
49 | #' ids <- paste0(levels(sce$stim), rep(inds, each = 2))
50 | #' sce <- sce[, sce$id %in% ids]
51 | #' 
52 | #' # keep 5 clusters
53 | #' kids <- c("B cells", "CD4 T cells", "CD8 T cells",
54 | #'     "CD14+ Monocytes", "FCGR3A+ Monocytes")
55 | #' sce <- sce[, sce$cell %in% kids]
56 | #' sce$cell <- droplevels(sce$cell)
57 | #' 
58 | #' # basic filtering on  genes & cells
59 | #' gs <- rowSums(counts(sce) > 1) > 50
60 | #' cs <- colSums(counts(sce) > 0) > 200
61 | #' sce <- sce[gs, cs]
62 | #' 
63 | #' # sample max. 100 cells per cluster-sample
64 | #' cs_by_ks <- split(colnames(sce), list(sce$cell, sce$id))
65 | #' cs <- sapply(cs_by_ks, function(u) 
66 | #'     sample(u, min(length(u), 100)))
67 | #' sce <- sce[, unlist(cs)]
68 | #' 
69 | #' # compute logcounts
70 | #' library(scater)
71 | #' sce <- computeLibraryFactors(sce)
72 | #' sce <- logNormCounts(sce)
73 | #' 
74 | #' # re-format for 'muscat'
75 | #' sce <- prepSCE(sce, 
76 | #'     kid = "cell", 
77 | #'     sid = "id", 
78 | #'     gid = "stim", 
79 | #'     drop = TRUE)
80 | #' } 
81 | #' 
82 | #' @references 
83 | #' Kang et al. (2018). Multiplexed droplet single-cell RNA-sequencing 
84 | #' using natural genetic variation. \emph{Nature Biotechnology},
85 | #' \bold{36}(1): 89-94. DOI: 10.1038/nbt.4042.
86 | #'  
87 | #' @author Helena L Crowell
88 | 
89 | NULL
90 | 


--------------------------------------------------------------------------------
/R/mmDS.R:
--------------------------------------------------------------------------------
  1 | #' @rdname mmDS
  2 | #' @title DS analysis using mixed-models (MM)
  3 | #'
  4 | #' @description Performs cluster-wise DE analysis by fitting cell-level models.
  5 | #'
  6 | #' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
  7 | #' @param coef character specifying the coefficient to test.
  8 | #'   If NULL (default), will test the last level of \code{"group_id"}.
  9 | #' @param covs character vector of \code{colData(x)}
 10 | #'   column names to use as covariates.
 11 | #' @param method a character string.
 12 | #'   Either \code{"dream2"} (default, lme4 with voom-weights),
 13 | #'   \code{"dream"} (previous implementation of the dream method),
 14 | #'   \code{"vst"} (variance-stabilizing transformation),
 15 | #'   \code{"poisson"} (poisson GLM-MM),
 16 | #'   \code{"nbinom"} (negative binomial GLM-MM),
 17 | #'   \code{"hybrid"} (combination of pseudobulk and poisson methods)
 18 | #'   or a function accepting the same arguments.
 19 | #' @param n_cells number of cells per cluster-sample
 20 | #'   required to consider a sample for testing.
 21 | #' @param n_samples number of samples per group
 22 | #'   required to consider a cluster for testing.
 23 | #' @param min_count numeric. For a gene to be tested in a given cluster,
 24 | #'   at least \code{min_cells} must have a count >= \code{min_count}.
 25 | #' @param min_cells number (or fraction, if < 1) of cells with a count >
 26 | #'   \code{min_count} required for a gene to be tested in a given cluster.
 27 | #' @param BPPARAM a \code{\link[BiocParallel]{BiocParallelParam}}
 28 | #'   object specifying how differential testing should be parallelized.
 29 | #' @param verbose logical specifying whether messages
 30 | #'   on progress and a progress bar should be displayed.
 31 | #'
 32 | #' @return a data.frame
 33 | #'
 34 | #' @examples
 35 | #' # subset "B cells" cluster
 36 | #' data(example_sce)
 37 | #' b_cells <- example_sce$cluster_id == "B cells"
 38 | #' sub <- example_sce[, b_cells]
 39 | #' sub$cluster_id <- droplevels(sub$cluster_id)
 40 | #'
 41 | #' # downsample to 100 genes
 42 | #' gs <- sample(nrow(sub), 100)
 43 | #' sub <- sub[gs, ]
 44 | #'
 45 | #' # run DS analysis using cell-level mixed-model
 46 | #' res <- mmDS(sub, method = "dream", verbose = FALSE)
 47 | #' head(res$`B cells`)
 48 | #'
 49 | #' @author Pierre-Luc Germain & Helena L Crowell
 50 | #'
 51 | #' @references
 52 | #' Crowell, HL, Soneson, C, Germain, P-L, Calini, D,
 53 | #' Collin, L, Raposo, C, Malhotra, D & Robinson, MD:
 54 | #' On the discovery of population-specific state transitions from
 55 | #' multi-sample multi-condition single-cell RNA sequencing data.
 56 | #' \emph{bioRxiv} \strong{713412} (2018).
 57 | #' doi: \url{https://doi.org/10.1101/713412}
 58 | #'
 59 | #' @importFrom DESeq2 DESeqDataSetFromMatrix estimateDispersions
 60 | #'   sizeFactors varianceStabilizingTransformation
 61 | #' @importFrom dplyr %>% mutate bind_rows
 62 | #' @importFrom matrixStats rowMins
 63 | #' @importFrom progress progress_bar
 64 | #' @importFrom purrr map_depth
 65 | #' @importFrom sctransform vst
 66 | #' @importFrom SingleCellExperiment counts counts<-
 67 | #'   colData sizeFactors sizeFactors<-
 68 | #' @importFrom stats p.adjust
 69 | #' @export
 70 | 
 71 | mmDS <- function(x, coef = NULL, covs = NULL,
 72 |     method = c("dream2", "dream", "vst", "poisson", "nbinom", "hybrid"),
 73 |     n_cells = 10, n_samples = 2, min_count = 1, min_cells = 20,
 74 |     verbose = TRUE, BPPARAM = SerialParam(progressbar = verbose), 
 75 |     vst = c("sctransform", "DESeq2"),
 76 |     ddf = c("Satterthwaite", "Kenward-Roger", "lme4"),
 77 |     dup_corr = FALSE, trended = FALSE, bayesian = FALSE, 
 78 |     blind = TRUE, REML = TRUE, moderate = FALSE) {
 79 |     
 80 |     # check validity of input arguments
 81 |     .check_sce(x, req_group = TRUE)
 82 |     .check_arg_assay(x, "counts")
 83 |     .check_args_mmDS(as.list(environment()))
 84 |     stopifnot(is(BPPARAM, "BiocParallelParam"))
 85 |     
 86 |     args <- as.list(environment())
 87 |     args$method <- match.arg(method)
 88 |     args$vst <- match.arg(vst)
 89 |     args$ddf <- match.arg(ddf)
 90 |     
 91 |     # counts cells per cluster-sample
 92 |     n_cells_by_ks <- table(x$cluster_id, x$sample_id)
 93 |     
 94 |     # filter clusters w/ >= n_cells in >= n_samples
 95 |     if (!is.null(metadata(x)$experiment_info$group_id)) {
 96 |         ei <- metadata(x)$experiment_info
 97 |         m <- match(levels(x$sample_id), ei$sample_id)
 98 |         gids <- ei$group_id[m]
 99 |     } else {
100 |         gids <- x$group_id
101 |     }
102 |     ks_keep <- apply(n_cells_by_ks > n_cells, 1,
103 |         function(u) all(tabulate(gids[u]) >= n_samples))
104 |     if (sum(ks_keep) == 0)
105 |         stop("No cluster has at least ", n_samples,
106 |             " samples with at least ", n_cells, " cells.")
107 |     
108 |     kids <- levels(x$cluster_id)
109 |     if (verbose && sum(ks_keep) < length(kids))
110 |         message(paste("Skipping cluster(s)",
111 |             paste(dQuote(kids[!ks_keep]), collapse = ", "),
112 |             "\ndue to an insufficient number of samples",
113 |             "with a sufficient number of cells."))
114 |     kids <- kids[ks_keep]
115 |     names(kids) <- kids
116 |     
117 |     # split cells by cluster
118 |     cells_by_k <- split(colnames(x), x$cluster_id)
119 |     
120 |     if (min_count < 1) {
121 |         min_count <- floor(min_count * rowMins(n_cells_by_ks))
122 |     } else {
123 |         min_count <- rep(min_count, length(kids))
124 |     }
125 |     names(min_count) <- kids
126 |     
127 |     # variance-stabilizing transformation
128 |     if (args$method == "vst" && !"vstresiduals" %in% assayNames(x)) {
129 |         vst_call <- switch(args$vst,
130 |             sctransform = expression(.vst_sctransform(x, verbose)),
131 |             DESeq2 = expression(.vst_DESeq2(x, covs, blind)))
132 |         if (verbose) {
133 |             assay(x, "vstresiduals", FALSE) <- eval(vst_call)
134 |         } else {
135 |             assay(x, "vstresiduals", FALSE) <- suppressMessages(eval(vst_call))
136 |         }
137 |     }
138 |     
139 |     # get method function & construct correct call
140 |     if (!is.function(fun <- args$method))
141 |         fun <- get(paste0(".mm_", fun))
142 |     args_use <- names(formals(fun))
143 |     args <- args[names(args) %in% args_use]
144 |     
145 |     if (verbose) pb <- progress_bar$new(total = length(kids))
146 |     res <- lapply(kids, function(k) {
147 |         y <- x[, cells_by_k[[k]]]
148 |         y <- y[rowSums(counts(y) >= min_count[k]) > min_cells, ]
149 |         if (verbose)
150 |             message("Testing ", nrow(y), " genes across ",
151 |                 ncol(y), " cells in cluster ", dQuote(k), "...")
152 |         
153 |         # call to .mm_dream/.mm_vst
154 |         args$x <- y
155 |         z <- do.call(fun, args)
156 |         data.frame(
157 |             gene = rownames(y),
158 |             cluster_id = k, z, 
159 |             row.names = NULL,
160 |             stringsAsFactors = FALSE)
161 |     })
162 |     if (verbose) pb$terminate()
163 |     
164 |     # assemble results from all cluster
165 |     res <- bind_rows(res)
166 |     # global p-value adjustment
167 |     p_adj.glb <- p.adjust(res$p_val, method = "BH")
168 |     i <- which(colnames(res) == "p_adj.loc")
169 |     res[["p_adj.glb"]] <- p_adj.glb
170 |     res <- res[, c(seq_len(i), ncol(res), seq_len(ncol(res)-1)[-seq_len(i)])]
171 |     # re-split by cluster
172 |     split(res, res$cluster_id)
173 | }
174 | 


--------------------------------------------------------------------------------
/R/pbDS.R:
--------------------------------------------------------------------------------
  1 | #' @rdname pbDS
  2 | #' @title pseudobulk DS analysis
  3 | #'
  4 | #' @description \code{pbDS} tests for DS after aggregating single-cell 
  5 | #'   measurements to pseudobulk data, by applying bulk RNA-seq DE methods, 
  6 | #'   such as \code{edgeR}, \code{DESeq2} and \code{limma}.
  7 | #'
  8 | #' @param pb a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
  9 | #'   containing pseudobulks as returned by \code{\link{aggregateData}}.
 10 | #' @param method a character string.
 11 | #' @param design For methods \code{"edegR"} and \code{"limma"}, a design matrix 
 12 | #'   with row & column names(!) created with \code{\link[stats]{model.matrix}}; 
 13 | #'   For \code{"DESeq2"}, a formula with variables in \code{colData(pb)}.
 14 | #'   Defaults to \code{~ group_id} or the corresponding \code{model.matrix}.
 15 | #' @param contrast a matrix of contrasts to test for
 16 | #'   created with \code{\link[limma]{makeContrasts}}.
 17 | #' @param coef passed to \code{\link[edgeR]{glmQLFTest}},
 18 | #'   \code{\link[limma]{contrasts.fit}}, \code{\link[DESeq2]{results}}
 19 | #'   for \code{method = "edgeR", "limma-x", "DESeq2"}, respectively.
 20 | #'   Can be a list for multiple, independent comparisons.
 21 | #' @param min_cells a numeric. Specifies the minimum number of cells in a given 
 22 | #'   cluster-sample required to consider the sample for differential testing.
 23 | #' @param filter character string specifying whether
 24 | #'   to filter on genes, samples, both or neither.
 25 | #' @param treat logical specifying whether empirical Bayes moderated-t 
 26 | #'   p-values should be computed relative to a minimum fold-change threshold. 
 27 | #'   Only applicable for methods \code{"limma-x"} 
 28 | #'   (\code{\link[limma:eBayes]{treat}}) and \code{"edgeR"} 
 29 | #'   (\code{\link[edgeR]{glmTreat}}), and ignored otherwise.
 30 | #' @param BPPARAM a \code{\link[BiocParallel]{BiocParallelParam}}
 31 | #'   object specifying how differential testing should be parallelized.
 32 | #' @param verbose logical. Should information on progress be reported?
 33 | #'
 34 | #' @return a list containing \itemize{
 35 | #' \item a data.frame with differential testing results,
 36 | #' \item a \code{\link[edgeR]{DGEList}} object of length nb.-clusters, and
 37 | #' \item the \code{design} matrix, and \code{contrast} or \code{coef} used.}
 38 | #'
 39 | #' @examples
 40 | #' # simulate 5 clusters, 20% of DE genes
 41 | #' data(example_sce)
 42 | #'     
 43 | #' # compute pseudobulk sum-counts & run DS analysis
 44 | #' pb <- aggregateData(example_sce)
 45 | #' res <- pbDS(pb, method = "limma-trend")
 46 | #'
 47 | #' names(res)
 48 | #' names(res$table)
 49 | #' head(res$table$stim$`B cells`)
 50 | #' 
 51 | #' # count nb. of DE genes by cluster
 52 | #' vapply(res$table$stim, function(u) 
 53 | #'   sum(u$p_adj.loc < 0.05), numeric(1))
 54 | #' 
 55 | #' # get top 5 hits for ea. cluster w/ abs(logFC) > 1
 56 | #' library(dplyr)
 57 | #' lapply(res$table$stim, function(u)
 58 | #'   filter(u, abs(logFC) > 1) %>% 
 59 | #'     arrange(p_adj.loc) %>% 
 60 | #'     slice(seq_len(5)))
 61 | #'
 62 | #' @author Helena L Crowell & Mark D Robinson
 63 | #'
 64 | #' @references 
 65 | #' Crowell, HL, Soneson, C, Germain, P-L, Calini, D, 
 66 | #' Collin, L, Raposo, C, Malhotra, D & Robinson, MD: 
 67 | #' On the discovery of population-specific state transitions from 
 68 | #' multi-sample multi-condition single-cell RNA sequencing data. 
 69 | #' \emph{bioRxiv} \strong{713412} (2018). 
 70 | #' doi: \url{https://doi.org/10.1101/713412}
 71 | #'
 72 | #' @importFrom BiocParallel bplapply SerialParam 
 73 | #' @importFrom edgeR filterByExpr
 74 | #' @importFrom dplyr last rename
 75 | #' @importFrom limma makeContrasts
 76 | #' @importFrom matrixStats colAnys
 77 | #' @importFrom Matrix qr rowSums
 78 | #' @importFrom methods is
 79 | #' @importFrom scater isOutlier
 80 | #' @importFrom stats model.matrix
 81 | #' @importFrom SummarizedExperiment assay colData
 82 | #' @export
 83 | 
 84 | pbDS <- function(pb, 
 85 |     method=c("edgeR", "DESeq2", "limma-trend", "limma-voom", "DD"),
 86 |     design=NULL, coef=NULL, contrast=NULL, min_cells=10, 
 87 |     filter=c("both", "genes", "samples", "none"), treat=FALSE, 
 88 |     verbose=TRUE, BPPARAM=SerialParam(progressbar=verbose)) {
 89 |     
 90 |     # check validity of input arguments
 91 |     args <- as.list(environment())
 92 |     method <- match.arg(method)
 93 |     filter <- match.arg(filter)
 94 |     .check_pbs(pb, check_by=TRUE)
 95 |     .check_args_pbDS(args)
 96 |     stopifnot(is(BPPARAM, "BiocParallelParam"))
 97 |     
 98 |     if (is.null(design)) {
 99 |         formula <- ~ group_id
100 |         cd <- as.data.frame(colData(pb))
101 |         design <- model.matrix(formula, cd)
102 |         colnames(design) <- levels(pb$group_id)
103 |         args$design <- design
104 |     }
105 |     if (is.null(coef) & is.null(contrast)) {
106 |         c <- colnames(design)[ncol(design)]
107 |         contrast <- makeContrasts(contrasts=c, levels=design)
108 |         args$contrast <- contrast
109 |     }
110 | 
111 |     # ct: type of comparison - "contrast" or "coef"
112 |     # cs: named list of 'coef's or 'contrast's
113 |     if (!is.null(contrast)) {
114 |         coef <- NULL
115 |         names(cs) <- cs <- colnames(contrast)
116 |     } else if (!is.null(coef)) {
117 |         if (!is.list(coef)) 
118 |             coef <- list(coef)
119 |         cs <- vapply(coef, function(i)
120 |             paste(colnames(design)[i], collapse="-"),
121 |             character(1))
122 |         names(cs) <- names(coef) <- cs
123 |     }
124 |     ct <- ifelse(is.null(coef), "contrast", "coef")
125 |     
126 |     if (!is.function(method)) {
127 |         fun <- switch(method,
128 |             "DD"=.edgeR_NB,
129 |             "edgeR"=.edgeR, 
130 |             "DESeq2"=.DESeq2,
131 |             "limma-voom"=.limma_voom,
132 |             "limma-trend"=.limma_trend)
133 |     } else {
134 |         fun_call <- 1
135 |     }
136 |     fun_args <- names(as.list(args(fun)))
137 |     fun_args <- fun_args[-length(fun_args)]
138 |     
139 |     # for ea. cluster, run DEA
140 |     n_cells <- .n_cells(pb)
141 |     names(kids) <- kids <- assayNames(pb)
142 |     res <- bplapply(
143 |         BPPARAM=BPPARAM, 
144 |         kids, function (k) {
145 |         rmv <- n_cells[k, ] < min_cells
146 |         d <- design[colnames(y <- pb[ , !rmv]), , drop=FALSE]
147 |         if (filter %in% c("samples", "both")) {
148 |             ls <- colSums(assay(y, k))
149 |             ol <- isOutlier(ls, log=TRUE, type="lower", nmads=3)
150 |             d <- d[colnames(y <- y[, !ol]), , drop=FALSE]
151 |         }
152 |         if (any(tabulate(y$group_id) < 2) 
153 |             || qr(d)$rank== nrow(d) 
154 |             || qr(d)$rank < ncol(d)) 
155 |             return(NULL)
156 |         y <- y[rowSums(assay(y, k)) != 0, , drop=FALSE]
157 |         if (filter %in% c("genes", "both") & max(assay(y, k)) > 100) 
158 |             y <- y[filterByExpr(assay(y, k), d), , drop=FALSE]
159 |         # drop samples without any detected features
160 |         keep <- colAnys(assay(y, k) > 0)
161 |         y <- y[, keep, drop=FALSE]
162 |         d <- d[keep, , drop=FALSE]
163 |         args <- list(
164 |             x=y, k=k, design=d, coef=coef, 
165 |             contrast=contrast, ct=ct, cs=cs,
166 |             treat=treat, nc=n_cells[k, !rmv])
167 |         args <- args[intersect(names(args), fun_args)]
168 |         suppressWarnings(do.call(fun, args))
169 |     })
170 |     
171 |     # remove empty clusters
172 |     rmv <- vapply(res, is.null, logical(1))
173 |     res <- res[!rmv]
174 |     
175 |     if (length(res)== 0) stop(
176 |         "Specified filtering options result in no genes in any clusters ",
177 |         "being tested. To force testing, consider modifying arguments ",
178 |         "'min_cells' and/or 'filter'. See '?pbDS' for details.")
179 |     
180 |     # reorganize & do global p-value adjustment
181 |     names(i) <- i <- c("table", "data", "fit")
182 |     res <- lapply(i, map, .x=res)
183 |     res$table <- .p_adj_global(res$table)
184 |     return(c(res, list(args=args)))
185 | }
186 | 
187 | #' @rdname pbDS
188 | #' @export
189 | pbDD <- function(pb, design=NULL, coef=NULL, contrast=NULL, 
190 |     min_cells=10, filter=c("both", "genes", "samples", "none"), 
191 |     verbose=TRUE, BPPARAM=SerialParam(progressbar=verbose)) 
192 | {
193 |     args <- as.list(environment())
194 |     do.call(pbDS, c(args, list(method="DD")))
195 | }
196 | 


--------------------------------------------------------------------------------
/R/pbFlatten.R:
--------------------------------------------------------------------------------
 1 | #' pbFlatten
 2 | #' Flatten pseudobulk SCE
 3 | #'
 4 | #' Flattens a pseudobulk \code{\link[SingleCellExperiment]{SingleCellExperiment}}
 5 | #' as returned by \code{\link{aggregateData}} such that all cell subpopulations 
 6 | #' are represented as a single assay.
 7 | #'
 8 | #' @param pb a pseudobulk \code{\link[SingleCellExperiment]{SingleCellExperiment}}
 9 | #'   as returned by \code{\link{aggregateData}}, with different subpopulations as assays.
10 | #' @param normalize logical specifying whether to compute a \code{logcpm} assay.
11 | #'
12 | #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
13 | #' 
14 | #' @examples 
15 | #' data(example_sce)
16 | #' library(SingleCellExperiment)
17 | #' pb_stack <- aggregateData(example_sce)
18 | #' (pb_flat <- pbFlatten(pb_stack))
19 | #' ncol(pb_flat) == ncol(pb_stack)*length(assays(pb_stack))
20 | #' 
21 | #' @importFrom methods is
22 | #' @importFrom edgeR cpm calcNormFactors DGEList
23 | #' @importFrom S4Vectors DataFrame metadata as.list
24 | #' @importFrom SingleCellExperiment SingleCellExperiment
25 | #' @importFrom SummarizedExperiment assay assay<- assays colData rowData
26 | #' @export
27 | 
28 | pbFlatten <- function(pb, normalize = TRUE){
29 |     # check validity of input arguments
30 |     stopifnot(is.logical(normalize), length(normalize) == 1,
31 |         is(pb, "SingleCellExperiment"), length(assays(pb)) > 1)
32 |     # concatenate assay data
33 |     as <- assays(pb)
34 |     a <- do.call(cbind, as.list(as))
35 |     sids <- rep(colnames(pb), length(as))
36 |     kids <- rep(names(as), each = ncol(pb))
37 |     colnames(a) <- paste(sids, kids, sep = ".")
38 |     pb$sample_id <- colnames(pb)
39 |     # construct cell metadata
40 |     cd <- lapply(seq_along(as), function(u) colData(pb))
41 |     cd <- do.call(rbind, cd)
42 |     rownames(cd) <- colnames(a)
43 |     cd$cluster_id <- kids
44 |     # construct single-assay SCE
45 |     sce <- SingleCellExperiment(
46 |         assays = list(counts = a),
47 |         colData = cd, 
48 |         rowData = rowData(pb),
49 |         metadata = metadata(pb))
50 |     # (optionally) add number of cells per cluster-sample
51 |     if (!is.null(n_cells <- .n_cells(pb))) {
52 |         n_cells <- tryCatch(mapply(
53 |             function(k, s) n_cells[k, s],
54 |             k = as.character(kids), s = as.character(sids)),
55 |             error = function(e) { warning(e); NULL })
56 |         if (!is.null(n_cells)) 
57 |             sce$n_cells <- as.numeric(n_cells)
58 |     }
59 |     # (optionally) do log-CPM normalization
60 |     if (normalize) {
61 |         # remove empty columns (samples that lack a cluster)
62 |         sce <- sce[, colSums(a != 0) > 0]
63 |         dgl <- DGEList(assay(sce))
64 |         dgl <- calcNormFactors(dgl)
65 |         assay(sce, "logcpm") <- log1p(cpm(dgl))
66 |     }
67 |     return(sce)
68 | }
69 | 


--------------------------------------------------------------------------------
/R/pbHeatmap.R:
--------------------------------------------------------------------------------
  1 | #' @rdname pbHeatmap
  2 | #' @title Heatmap of cluster-sample pseudobulks
  3 | #' 
  4 | #' @description ...
  5 | #' 
  6 | #' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
  7 | #' @param y a list of DS analysis results as returned by 
  8 | #'   \code{\link{pbDS}} or \code{\link{mmDS}}.
  9 | #' @param k character vector; specifies which cluster ID(s) to retain.
 10 | #'   Defaults to \code{levels(x$cluster_id)}.
 11 | #' @param g character vector; specifies which genes to retain.
 12 | #'   Defaults to considering all genes.
 13 | #' @param c character string; specifies which contrast/coefficient to retain.
 14 | #'   Defaults to \code{names(y$table)[1]}.
 15 | #' @param top_n single numeric; number of genes to retain per cluster.
 16 | #' @param fdr,lfc single numeric; FDR and logFC cutoffs to filter results by.
 17 | #'   The specified FDR threshold is applied to \code{p_adj.loc} values.
 18 | #' @param sort_by character string specifying 
 19 | #'   a numeric results table column to sort by;
 20 | #'   \code{"none"} to retain original ordering.
 21 | #' @param decreasing logical; whether to sort 
 22 | #'   in decreasing order of \code{sort_by}.
 23 | #' @param assay character string; specifies which assay to use;
 24 | #'   should be one of \code{assayNames(x)}.
 25 | #' @param fun function to use as summary statistic, 
 26 | #'   e.g., mean, median, sum (depending on the input assay).
 27 | #' @param normalize logical; whether to apply a z-normalization 
 28 | #'   to each row (gene) of the cluster-sample pseudobulk data.
 29 | #' @param col character vector of colors or color mapping function
 30 | #'   generated with \code{\link[circlize]{colorRamp2}}. Passed to 
 31 | #'   argument \code{col} in \code{\link[ComplexHeatmap]{Heatmap}}
 32 | #'   (see \code{?ComplexHeatmap::Heatmap} for details).
 33 | #' @param row_anno,col_anno logical; whether to render
 34 | #'   annotations of cluster and group IDs, respectively.
 35 | #' 
 36 | #' @return a \code{\link{HeatmapList-class}} object.
 37 | #' 
 38 | #' @examples 
 39 | #' # compute pseudobulks & run DS analysis
 40 | #' data(example_sce)
 41 | #' pb <- aggregateData(example_sce)
 42 | #' res <- pbDS(pb)
 43 | #' 
 44 | #' # cluster-sample expression means
 45 | #' pbHeatmap(example_sce, res)
 46 | #' 
 47 | #' # include only a single cluster
 48 | #' pbHeatmap(example_sce, res, k = "B cells")
 49 | #' 
 50 | #' # plot specific gene across all clusters
 51 | #' pbHeatmap(example_sce, res, g = "ISG20")
 52 | #' 
 53 | #' @author Helena L Crowell
 54 | #' 
 55 | #' @importFrom ComplexHeatmap Heatmap columnAnnotation rowAnnotation
 56 | #' @importFrom dplyr bind_rows filter
 57 | #' @importFrom grid gpar
 58 | #' @importFrom purrr map
 59 | #' @importFrom scales hue_pal
 60 | #' @importFrom viridis viridis
 61 | #' @export
 62 | 
 63 | pbHeatmap <- function(x, y, 
 64 |     k = NULL, g = NULL, c = NULL, 
 65 |     top_n = 20, fdr = 0.05, lfc = 1, 
 66 |     sort_by = "p_adj.loc", decreasing = FALSE,
 67 |     assay = "logcounts", fun = mean, normalize = TRUE,
 68 |     col = viridis(10), row_anno = TRUE, col_anno = TRUE) {
 69 |     
 70 |     # check validity of input arguments
 71 |     .check_sce(x, req_group = TRUE)
 72 |     .check_arg_assay(x, assay)
 73 |     .check_args_pbHeatmap(as.list(environment()))
 74 |     
 75 |     # subset specified contrast/coef & cluster(s)
 76 |     if (is.null(k)) k <- levels(x$cluster_id)
 77 |     if (names(y)[1] == "table") {
 78 |         if (is.null(c)) c <- names(y$table)[1]
 79 |         y <- y$table[[c]][k]
 80 |     } else y <- y[k]
 81 |     y <- y[!vapply(y, is.null, logical(1))]
 82 |     
 83 |     # filter results
 84 |     if (!is.null(g)) y <- lapply(y, filter, gene %in% g)
 85 |     y <- lapply(y, filter, p_adj.loc < fdr, abs(logFC) > lfc)
 86 |     
 87 |     # subset 'top_n' results
 88 |     if (is.null(top_n)) {
 89 |         ns <- vapply(y, nrow, numeric(1))
 90 |     } else {
 91 |         ns <- vapply(y, function(u) min(nrow(u), top_n), numeric(1))
 92 |     }
 93 |     
 94 |     # get cluster IDs & nb. of clusters
 95 |     ex <- ns == 0; ns <- ns[!ex]; y <- y[!ex]
 96 |     nk <- length(names(kids) <- kids <- names(y))
 97 |     
 98 |     # re-order results
 99 |     if (sort_by == "none") {
100 |         y <- lapply(kids, function(k) 
101 |             y[[k]][seq_len(ns[k]), ])
102 |     } else {
103 |         vs <- map(y, sort_by)
104 |         os <- lapply(vs, order, decreasing = decreasing)
105 |         y <- lapply(kids, function(k) {
106 |             o <- os[[k]][seq_len(ns[k])]
107 |             y[[k]][o, ]
108 |         })
109 |     }
110 |     y <- bind_rows(y)
111 |     
112 |     # subset 'assay' data
113 |     es <- assays(x)[[assay]]
114 |     es <- es[unlist(y$gene), , drop = FALSE]
115 |     
116 |     # compute cluster-sample summary values (e.g., means)
117 |     cells_by_ks <- .split_cells(x)
118 |     xs <- t(mapply(function(g, k)
119 |         vapply(cells_by_ks[[k]], function(cs)
120 |             fun(es[g, cs]), numeric(1)), 
121 |         g = y$gene, k = y$cluster_id))
122 |     if (normalize) xs <- .z_norm(xs)
123 |     
124 |     # plotting -----------------------------------------------------------------
125 |     # row & column annotation
126 |     lgd_aes <- list(labels_gp = gpar(fontsize = 6),
127 |         title_gp = gpar(fontface = "bold", fontsize = 8))
128 |     
129 |     if (row_anno & nk > 1) {
130 |         cols <- .cluster_colors
131 |         if (nk > length(cols))
132 |             cols <- colorRampPalette(cols)(nk)
133 |         cols <- setNames(cols[seq_len(nk)], kids)
134 |         row_anno <- rowAnnotation(
135 |             df = data.frame(cluster_id = y$cluster_id),
136 |             col = list(cluster_id = cols),
137 |             gp = gpar(col = "white"),
138 |             show_annotation_name = FALSE,
139 |             annotation_legend_param = lgd_aes)
140 |     } else {
141 |         row_anno <- NULL
142 |     }
143 |     
144 |     ei <- metadata(x)$experiment_info
145 |     m <- match(levels(x$sample_id), ei$sample_id)
146 |     if (col_anno) {
147 |         cols <- setNames(hue_pal()(nlevels(x$group_id)), levels(x$group_id))
148 |         col_anno <- columnAnnotation(
149 |             df = data.frame(group_id = ei$group_id[m]),
150 |             col = list(group_id = cols),
151 |             gp = gpar(col = "white"),
152 |             show_annotation_name = FALSE,
153 |             annotation_legend_param = lgd_aes)
154 |     } else {
155 |         col_anno <- NULL
156 |     }
157 |     
158 |     Heatmap(matrix = xs, col = col, 
159 |         name = sprintf("%s%s %s", 
160 |             c("", "z-normalized\n")[normalize + 1], 
161 |             deparse(substitute(fun)), assay),
162 |         row_title = NULL, column_title = NULL,
163 |         cluster_rows = FALSE, cluster_columns = FALSE,
164 |         left_annotation = row_anno, top_annotation = col_anno,
165 |         split = y$cluster_id, column_split = ei$group_id[m],
166 |         heatmap_legend_param = lgd_aes,
167 |         row_names_gp = gpar(fontsize = 6), 
168 |         column_names_gp = gpar(fontsize = 8),
169 |         column_title_gp = gpar(fontface = "bold", fontsize = 10))
170 | }


--------------------------------------------------------------------------------
/R/pbMDS.R:
--------------------------------------------------------------------------------
 1 | #' @rdname pbMDS
 2 | #' @title Pseudobulk-level MDS plot
 3 | #' 
 4 | #' @description Renders a multidimensional scaling (MDS) 
 5 | #'   where each point represents a cluster-sample instance; 
 6 | #'   with points colored by cluster ID and shaped by group ID. 
 7 | #' 
 8 | #' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}} 
 9 | #'   containing cluster-sample pseudobulks as returned by 
10 | #'   \code{\link{aggregateData}} with argument 
11 | #'   \code{by = c("cluster_id", "sample_id")}.
12 | #' 
13 | #' @return a \code{ggplot} object.
14 | #' 
15 | #' @examples 
16 | #' data(example_sce)
17 | #' pb <- aggregateData(example_sce)
18 | #' pbMDS(pb)
19 | #' 
20 | #' @author Helena L Crowell & Mark D Robinson
21 | #' 
22 | #' @importFrom edgeR calcNormFactors cpm DGEList plotMDS.DGEList
23 | #' @importFrom SummarizedExperiment assays
24 | #' @importFrom grDevices colorRampPalette
25 | #' @importFrom S4Vectors metadata
26 | #' @importFrom Matrix rowSums
27 | #' @importFrom rlang .data
28 | #' @import ggplot2
29 | #' @export
30 | 
31 | pbMDS <- function(x) {
32 |     # check validity of input pseudobulk-SCE
33 |     # (cells should have been aggregated by cluster-sample)
34 |     .check_pbs(x, check_by = TRUE)
35 |     
36 |     y <- as.list(assays(x))
37 |     y <- do.call("cbind", y)
38 |     y <- y[, (j <- c(t(.n_cells(x))) != 0)]
39 |     d <- DGEList(unname(y), remove.zeros = TRUE)
40 |     d <- calcNormFactors(d)
41 |     
42 |     mds <- plotMDS.DGEList(d, plot = FALSE)
43 |     nk <- length(kids <- assayNames(x))
44 | 
45 |     ss <- rep(colnames(x), nk)
46 |     ks <- rep(kids, each = ncol(x))
47 |     
48 |     if (any(!j)) {
49 |         txt <- paste(sQuote(ks[!j]), sQuote(ss[!j]), sep = "-")
50 |         message("Removing cluster-sample instance(s) ", 
51 |             paste(txt, collapse = ", "))
52 |     }
53 | 
54 |     df <- data.frame(
55 |         MDS1 = mds$x, MDS2 = mds$y, 
56 |         cluster_id = factor(ks[j], levels = kids), 
57 |         group_id = rep(x$group_id, nk)[j])
58 |     
59 |     cols <- .cluster_colors
60 |     if (nk > length(cols)) 
61 |         cols <- colorRampPalette(cols)(nk)
62 |     
63 |     ggplot(df, aes(.data$MDS1, .data$MDS2, 
64 |         col=.data$cluster_id, shape=.data$group_id)) +
65 |         scale_color_manual(values = cols) +
66 |         geom_point(size = 3, alpha = 0.8) + 
67 |         guides(color = guide_legend(override.aes = list(alpha = 1))) +
68 |         theme_bw() + theme(aspect.ratio = 1,
69 |             axis.text = element_text(color = "black"),
70 |             panel.grid.minor = element_blank(),
71 |             panel.grid.major = element_line(linewidth = 0.2, color = "lightgrey"))
72 | }
73 | 


--------------------------------------------------------------------------------
/R/prepSCE.R:
--------------------------------------------------------------------------------
 1 | #' @rdname prepSCE
 2 | #' @title Prepare SCE for DS analysis
 3 | #' 
 4 | #' @description ...
 5 | #' 
 6 | #' @param x a \linkS4class{SingleCellExperiment}.
 7 | #' @param kid,sid,gid character strings specifying
 8 | #'   the \code{colData(x)} columns containing cluster assignments,
 9 | #'   unique sample identifiers, and group IDs (e.g., treatment).
10 | #' @param drop logical. Specifies whether \code{colData(x)} columns
11 | #'   besides those specified as \code{cluster_id,sample_id,group_id}
12 | #'   should be retained (default \code{drop = FALSE}) 
13 | #'   or removed (\code{drop = TRUE}). 
14 | #' 
15 | #' @examples
16 | #' # generate random counts
17 | #' ng <- 50
18 | #' nc <- 200
19 | #'     
20 | #' # generate some cell metadata
21 | #' gids <- sample(c("groupA", "groupB"), nc, TRUE)   
22 | #' sids <- sample(paste0("sample", seq_len(3)), nc, TRUE) 
23 | #' kids <- sample(paste0("cluster", seq_len(5)), nc, TRUE) 
24 | #' batch <- sample(seq_len(3), nc, TRUE)
25 | #' cd <- data.frame(group = gids, id = sids, cluster = kids, batch)
26 | #' 
27 | #' # construct SCE
28 | #' library(scuttle)
29 | #' sce <- mockSCE(ncells = nc, ngenes = ng)
30 | #' colData(sce) <- cbind(colData(sce), cd)
31 | #' 
32 | #' # prep. for workflow
33 | #' sce <- prepSCE(sce, kid = "cluster", sid = "id", gid = "group")
34 | #' head(colData(sce))
35 | #' metadata(sce)$experiment_info
36 | #' sce
37 | #' 
38 | #' @author Helena L Crowell
39 | #' 
40 | #' @return a \linkS4class{SingleCellExperiment}.
41 | #' 
42 | #' @importFrom dplyr mutate_all
43 | #' @importFrom S4Vectors DataFrame metadata<-
44 | #' @importFrom SingleCellExperiment reducedDims SingleCellExperiment
45 | #' @importFrom SummarizedExperiment assays colData rowData
46 | #' @export
47 | 
48 | prepSCE <- function(x, 
49 |     kid = "cluster_id", 
50 |     sid = "sample_id", 
51 |     gid = "group_id", 
52 |     drop = FALSE) {
53 | 
54 |     stopifnot(is(x, "SingleCellExperiment"))
55 | 
56 |     args <- as.list(environment())
57 |     ids <- args[grep("[a-z]id", names(args))]
58 |     ids <- unlist(ids)
59 |     
60 |     stopifnot(is.character(ids))
61 |     stopifnot(all(ids %in% colnames(colData(x))))
62 |     
63 |     cd0 <- colData(x)
64 |     cd <- data.frame(cd0[ids], check.names = FALSE)
65 |     cd <- mutate_all(cd, as.factor)
66 |     colnames(cd) <- unlist(formals()[names(ids)])
67 |     
68 |     if (!drop)
69 |         cd <- data.frame(cd,
70 |             cd0[setdiff(colnames(cd0), ids)], 
71 |             check.names = FALSE)
72 | 
73 |     # replace colData in SCE
74 |     colData(x) <- DataFrame(cd)
75 | 
76 |     # construct metadata
77 |     ei <- .make_ei(x)
78 |     metadata(x)$experiment_info <- ei
79 |     
80 |     return(x)
81 | }
82 | 


--------------------------------------------------------------------------------
/R/prepSim.R:
--------------------------------------------------------------------------------
  1 | #' @name prepSim
  2 | #' 
  3 | #' @title SCE preparation for \code{\link{simData}}
  4 | #' 
  5 | #' @description \code{prepSim} prepares an input SCE for simulation 
  6 | #'   with \code{muscat}'s \code{\link{simData}} function by 
  7 | #' \enumerate{
  8 | #'   \item{basic filtering of genes and cells}
  9 | #'   \item{(optional) filtering of subpopulation-sample instances}
 10 | #'   \item{estimation of cell (library sizes) and gene parameters 
 11 | #'   (dispersions and sample-specific means), respectively.}
 12 | #' }
 13 | #' 
 14 | #' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
 15 | #' @param min_count,min_cells used for filtering of genes; only genes with 
 16 | #'   a count > \code{min_count} in >= \code{min_cells} will be retained.
 17 | #' @param min_genes used for filtering cells;
 18 | #'   only cells with a count > 0 in >= \code{min_genes} will be retained.
 19 | #' @param min_size used for filtering subpopulation-sample combinations;
 20 | #'   only instances with >= \code{min_size} cells will be retained.
 21 | #'   Specifying \code{min_size = NULL} skips this step.
 22 | #' @param group_keep character string; if \code{nlevels(x$group_id) > 1},
 23 | #'   specifies which group of samples to keep (see details). The default
 24 | #'   NULL retains samples from \code{levels(x$group_id)[1]}; otherwise, 
 25 | #'   if `colData(x)$group_id` is not specified, all samples will be kept.
 26 | #' @param verbose logical; should information on progress be reported?
 27 | #' 
 28 | #' @details For each gene \eqn{g}, \code{prepSim} fits a model to estimate 
 29 | #'   sample-specific means \eqn{\beta_g^s}, for each sample \eqn{s}, 
 30 | #'   and dispersion parameters \eqn{\phi_g} using \code{edgeR}'s 
 31 | #'   \code{\link[edgeR]{estimateDisp}} function with default parameters. 
 32 | #'   Thus, the reference count data is modeled as NB distributed: 
 33 | #'   \deqn{Y_{gc} \sim NB(\mu_{gc}, \phi_g)}
 34 | #'   for gene \eqn{g} and cell \eqn{c}, where the mean 
 35 | #'   \eqn{\mu_{gc} = \exp(\beta_{g}^{s(c)}) \cdot \lambda_c}. Here, 
 36 | #'   \eqn{\beta_{g}^{s(c)}} is the relative abundance of gene \eqn{g} 
 37 | #'   in sample \eqn{s(c)}, \eqn{\lambda_c} is the library size 
 38 | #'   (total number of counts), and \eqn{\phi_g} is the dispersion.
 39 | #'   
 40 | #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}} 
 41 | #'   containing, for each cell, library size (\code{colData(x)$offset})
 42 | #'   and, for each gene, dispersion and sample-specific mean estimates 
 43 | #'   (\code{rowData(x)$dispersion} and \code{$beta.sample_id}, respectively).
 44 | #' 
 45 | #' @examples
 46 | #' # estimate simulation parameters
 47 | #' data(example_sce)
 48 | #' ref <- prepSim(example_sce)
 49 | #' 
 50 | #' # tabulate number of genes/cells before vs. after
 51 | #' ns <- cbind(
 52 | #'   before = dim(example_sce), 
 53 | #'   after = dim(ref)) 
 54 | #' rownames(ns) <- c("#genes", "#cells")
 55 | #' ns
 56 | #' 
 57 | #' library(SingleCellExperiment)
 58 | #' head(rowData(ref)) # gene parameters
 59 | #' head(colData(ref)) # cell parameters
 60 | #' 
 61 | #' @author Helena L Crowell
 62 | #' 
 63 | #' @references 
 64 | #' Crowell, HL, Soneson, C, Germain, P-L, Calini, D, 
 65 | #' Collin, L, Raposo, C, Malhotra, D & Robinson, MD: 
 66 | #' On the discovery of population-specific state transitions from 
 67 | #' multi-sample multi-condition single-cell RNA sequencing data. 
 68 | #' \emph{bioRxiv} \strong{713412} (2018). 
 69 | #' doi: \url{https://doi.org/10.1101/713412}
 70 | #' 
 71 | #' @importFrom edgeR DGEList estimateDisp glmFit
 72 | #' @importFrom Matrix colSums rowSums
 73 | #' @importFrom matrixStats rowAnyNAs
 74 | #' @importFrom SingleCellExperiment SingleCellExperiment counts
 75 | #' @importFrom SummarizedExperiment colData rowData<-
 76 | #' @importFrom stats model.matrix
 77 | #' @importFrom S4Vectors DataFrame
 78 | #' @export
 79 | 
 80 | prepSim <- function(x, 
 81 |     min_count = 1, min_cells = 10, 
 82 |     min_genes = 100, min_size = 100, 
 83 |     group_keep = NULL, verbose = TRUE) {
 84 |     
 85 |     .check_sce(x, req_group = FALSE)
 86 |     stopifnot(is.numeric(min_count), 
 87 |         is.numeric(min_cells), is.numeric(min_genes), 
 88 |         is.null(min_size) || is.numeric(min_size),
 89 |         is.logical(verbose), length(verbose) == 1)
 90 |     
 91 |     # get model variables
 92 |     vars <- c("sample_id", "cluster_id")
 93 |     names(vars) <- vars <- intersect(vars, names(colData(x)))
 94 |     
 95 |     # assure these are factors
 96 |     for (v in vars) {
 97 |         # drop singular variables from model
 98 |         n <- length(unique(x[[v]]))
 99 |         if (n == 1) {
100 |             rmv <- grep(v, vars)
101 |             vars <- vars[-rmv]
102 |         }
103 |         if (!is.factor(x[[v]]))
104 |             x[[v]] <- as.factor(x[[v]])
105 |         x[[v]] <- droplevels(x[[v]])
106 |     }
107 |     
108 |     n_cells0 <- ncol(x)
109 |     x <- .update_sce(x)
110 |     if (is.null(group_keep)) {
111 |         if ("group_id" %in% colnames(colData(x))) {
112 |             group_keep <- levels(x$group_id)[1]
113 |             if (verbose) {
114 |                 fmt <- paste(
115 |                     "Argument `group_keep` unspecified;",
116 |                     "defaulting to retaining %s-group samples.")
117 |                 message(sprintf(fmt, dQuote(group_keep)))
118 |             }
119 |             cells_keep <- x$group_id == group_keep
120 |         } else {
121 |             cells_keep <- seq_len(ncol(x))
122 |         }
123 |     } else {
124 |         stopifnot(is.character(group_keep), 
125 |             group_keep %in% levels(x$group_id))
126 |         cells_keep <- x$group_id %in% group_keep
127 |     }
128 |     x <- x[, cells_keep]
129 |     x <- .update_sce(x)
130 |     
131 |     # keep genes w/ count > `min_count` in at least `min_cells`;
132 |     # keep cells w/ at least `min_genes` detected genes
133 |     if (verbose) message("Filtering...")
134 |     genes_keep <- rowSums(counts(x) > min_count) >= min_cells
135 |     cells_keep <- colSums(counts(x) > 0) >= min_genes
136 |     if (verbose) message(sprintf(
137 |         "- %s/%s genes and %s/%s cells retained.",
138 |         sum(genes_keep), nrow(x), sum(cells_keep), n_cells0))
139 |     x <- x[genes_keep, cells_keep, drop = FALSE]
140 |     
141 |     # keep cluster-samples w/ at least 'min_size' cells
142 |     if (!is.null(min_size)) {
143 |         n_cells <- table(x$cluster_id, x$sample_id)
144 |         n_cells <- .filter_matrix(n_cells, n = min_size)
145 |         if (ncol(n_cells) == 1)
146 |             stop("Current 'min_size' retains only 1 sample,\nbut",
147 |                 " mean-dispersion estimation requires at least 2.")
148 |         if (verbose) message(sprintf(
149 |             "- %s/%s subpopulations and %s/%s samples retained.",
150 |             nrow(n_cells), nlevels(x$cluster_id), 
151 |             ncol(n_cells), nlevels(x$sample_id)))
152 |         x <- .filter_sce(x, rownames(n_cells), colnames(n_cells))
153 |     }
154 |     
155 |     if (is.null(rownames(x))) rownames(x) <- paste0("gene", seq(nrow(x)))
156 |     if (is.null(colnames(x))) colnames(x) <- paste0("cell", seq(ncol(x)))
157 |     
158 |     # construct model formula
159 |     f <- "~ 1"
160 |     for (v in vars)
161 |         f <- paste(f, v, sep = "+")
162 |     cd <- as.data.frame(droplevels(colData(x)))
163 |     mm <- model.matrix(as.formula(f), data = cd)
164 |     
165 |     # fit NB model
166 |     if (verbose) 
167 |         message("Estimating gene and cell parameters...")
168 |     y <- DGEList(counts(x))
169 |     y <- calcNormFactors(y)
170 |     y <- estimateDisp(y, mm)
171 |     y <- glmFit(y, prior.count = 0)
172 |     
173 |     # drop genes for which estimation failed
174 |     cs <- y$coefficients
175 |     i <- !rowAnyNAs(cs)
176 |     x <- x[i, , drop = FALSE]
177 |     cs <- cs[i, , drop = FALSE]
178 |     ds <- y$dispersion[i]
179 |     names(ds) <- rownames(x)
180 |     
181 |     # group betas by variable
182 |     bs <- DataFrame(
183 |         beta0 = cs[, 1],
184 |         row.names = rownames(x))
185 |     for (v in vars) {
186 |         pat <- paste0("^", v)
187 |         i <- grep(pat, colnames(cs))
188 |         df <- DataFrame(cs[, i])
189 |         nms <- colnames(cs)[i]
190 |         names(df) <- gsub(pat, "", nms)
191 |         bs[[v]] <- df
192 |     }
193 |     # store betas & dispersions in rowData
194 |     rowData(x)$beta <- bs
195 |     rowData(x)$disp <- ds
196 |     
197 |     # store offsets in colData
198 |     os <- c(y$offset)
199 |     names(os) <- colnames(x)
200 |     x$offset <- os
201 |     
202 |     # drop singular variables from cell metadata
203 |     for (v in names(colData(x))) {
204 |         n <- length(unique(x[[v]]))
205 |         if (n == 1) x[[v]] <- NULL
206 |     }
207 |     
208 |     # return SCE
209 |     return(x)
210 | }
211 | 


--------------------------------------------------------------------------------
/R/resDS.R:
--------------------------------------------------------------------------------
  1 | #' resDS
  2 | #' Formatting of DS analysis results
  3 | #' 
  4 | #' \code{resDS} provides a simple wrapper to format cluster-level
  5 | #' differential testing results into an easily filterable table, and
  6 | #' to optionally append gene expression frequencies by cluster-sample
  7 | #' & -group, as well as cluster-sample-wise CPM.
  8 | #' 
  9 | #' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
 10 | #' @param y a list of DS testing results as returned 
 11 | #'   by \code{\link{pbDS}} or \code{\link{mmDS}}.
 12 | #' @param bind character string specifying the output format (see details).
 13 | #' @param frq logical or a pre-computed list of expression frequencies 
 14 | #'   as returned by \code{\link{calcExprFreqs}}.
 15 | #' @param cpm logical specifying whether CPM by cluster-sample 
 16 | #'   should be appendeded to the output result table(s).
 17 | #' @param digits integer value specifying the 
 18 | #'   number of significant digits to maintain.
 19 | #' @param sep character string to use as separator 
 20 | #'   when constructing new column names.
 21 | #' @param ... optional arguments passed to 
 22 | #'   \code{\link{calcExprFreqs}} if \code{frq = TRUE}.
 23 | #' 
 24 | #' @details When \code{bind = "col"}, the list of DS testing results at 
 25 | #' \code{y$table} will be merge vertically (by column) into a single table
 26 | #' in tidy format with column \code{contrast/coef} specifying the comparison.
 27 | #'   
 28 | #' Otherwise, when \code{bind = "row"}, an identifier of the respective
 29 | #' contrast or coefficient will be appended to the column names,
 30 | #' and all tables will be merge horizontally (by row).
 31 | #'   
 32 | #' Expression frequencies pre-computed with \code{\link{calcExprFreqs}} 
 33 | #' may be provided with \code{frq}. Alternatively, when \code{frq = TRUE}, 
 34 | #' expression frequencies can be computed directly, and additional arguments 
 35 | #' may be passed to \code{\link{calcExprFreqs}} (see examples below).
 36 | #' 
 37 | #' @return returns a `data.frame`.
 38 | #' 
 39 | #' @examples
 40 | #' # compute pseudobulks (sum of counts)
 41 | #' data(example_sce)
 42 | #' pb <- aggregateData(example_sce, 
 43 | #'   assay = "counts", fun = "sum")
 44 | #' 
 45 | #' # run DS analysis (edgeR on pseudobulks)
 46 | #' res <- pbDS(pb, method = "edgeR")
 47 | #' 
 48 | #' head(resDS(example_sce, res, bind = "row")) # tidy format
 49 | #' head(resDS(example_sce, res, bind = "col", digits = Inf))
 50 | #' 
 51 | #' # append CPMs & expression frequencies
 52 | #' head(resDS(example_sce, res, cpm = TRUE))
 53 | #' head(resDS(example_sce, res, frq = TRUE))
 54 | #' 
 55 | #' # pre-computed expression frequencies & append
 56 | #' frq <- calcExprFreqs(example_sce, assay = "counts", th = 0)
 57 | #' head(resDS(example_sce, res, frq = frq))
 58 | #' 
 59 | #' @author Helena L Crowell & Mark D Robinson
 60 | #' 
 61 | #' @importFrom dplyr %>% bind_rows inner_join full_join mutate mutate_if select all_of
 62 | #' @importFrom edgeR cpm
 63 | #' @importFrom methods is
 64 | #' @importFrom purrr reduce
 65 | #' @importFrom SummarizedExperiment colData
 66 | #' @importFrom S4Vectors metadata
 67 | #' @export  
 68 | 
 69 | resDS <- function(x, y, bind = c("row", "col"),
 70 |     frq = FALSE, cpm = FALSE, digits = 3, sep = "__", ...) {
 71 |     
 72 |     # check validity of input arguments
 73 |     .check_sce(x, req_group = TRUE)
 74 |     #.check_res(x, y)
 75 |     bind <- match.arg(bind)
 76 |     if (!is.logical(frq)) 
 77 |         .check_frq(x, frq)
 78 |     stopifnot(is.infinite(digits) || is.numeric(digits) &
 79 |         digits > 0 & as.integer(digits) == digits)
 80 | 
 81 |     ei <- metadata(x)$experiment_info
 82 |     kids <- levels(x$cluster_id)
 83 |     
 84 |     res <- switch(bind,
 85 |         row = {
 86 |             bind_rows(lapply(y$table, bind_rows))
 87 |         },
 88 |         col = {
 89 |             ct <- ifelse(!is.null(y$args$contrast), "contrast", "coef")
 90 |             cs <- names(y$table)
 91 |             res <- lapply(cs, function(c) {
 92 |                 df <- bind_rows(y$table[[c]])
 93 |                 df <- select(df, !all_of(ct))
 94 |                 i <- !colnames(df) %in% c("gene", "cluster_id")
 95 |                 colnames(df)[i] <- paste(colnames(df)[i], c, sep = sep)
 96 |                 return(df)
 97 |             })
 98 |             reduce(res, full_join, by = c("gene", "cluster_id"))
 99 |         })
100 | 
101 |     .tidy <- function(u, ei, append = "") {
102 |         m1 <- match(ei$sample_id, colnames(u), nomatch = 0)
103 |         m2 <- match(levels(ei$group_id), colnames(u))
104 |         if (all(is.na(m2))) m2 <- 0
105 |         colnames(u)[m1] <- paste0(ei$sample_id, append)[m1 != 0]
106 |         colnames(u)[m2] <- paste0(colnames(u)[m2], append)
107 |         k <- seq_len(ncol(u))[-c(m1, m2)]
108 |         u[, c(k, m1[order(ei$group)], m2)]
109 |     }
110 |     
111 |     # append expression frequencies
112 |     if (is.logical(frq))
113 |         if (frq) frq <- calcExprFreqs(x, ...) else frq <- NULL
114 |     if (!is.null(frq)) {
115 |         frq <- data.frame(
116 |             gene = rep(rownames(x), length(assays(frq))),
117 |             cluster_id = rep(assayNames(frq), each = nrow(x)), 
118 |             do.call("rbind", as.list(assays(frq))),
119 |             row.names = NULL, check.names = FALSE, stringsAsFactors = FALSE)
120 |         frq <- .tidy(frq, ei, append = ".frq")
121 |         res <- inner_join(frq, res, 
122 |             multiple = "all",
123 |             by = c("gene", "cluster_id"))
124 |     }
125 | 
126 |     # append CPMs
127 |     if (cpm) {
128 |         cpm <- lapply(kids, function(k) {
129 |             if (is.null(y$data[[k]])) 
130 |                 return(NULL)
131 |             cpm <- cpm(y$data[[k]])
132 |             data.frame(cpm, 
133 |                 gene = rownames(cpm),
134 |                 cluster_id = k,
135 |                 row.names = NULL, 
136 |                 check.names = FALSE, 
137 |                 stringsAsFactors = FALSE)
138 |         })
139 |         cpm <- bind_rows(cpm)
140 |         cpm <- .tidy(cpm, ei, append = ".cpm")
141 |         res <- inner_join(cpm, res,
142 |             multiple = "all",
143 |             by = c("gene", "cluster_id"))
144 |     }
145 |     mutate_if(res, is.numeric, signif, digits)
146 | }
147 | 


--------------------------------------------------------------------------------
/R/stagewiseDD.R:
--------------------------------------------------------------------------------
  1 | #' @importFrom dplyr bind_rows
  2 | .res_DX <- function(res_DS, res_DD) {
  3 |     # for each contrast...
  4 |     names(cts) <- cts <- names(res_DS$table) 
  5 |     lapply(cts, function(ct) { 
  6 |         # for each cluster...
  7 |         names(kids) <- kids <- names(res_DS$table[[ct]]) 
  8 |         lapply(kids, function(kid) { 
  9 |             # get DS/DD results
 10 |             DS <- res_DS$table[[ct]][[kid]]
 11 |             DD <- res_DD$table[[ct]][[kid]]
 12 |             # add missing genes
 13 |             DS <- bind_rows(DS, data.frame(gene=setdiff(DD$gene, DS$gene)))
 14 |             DD <- bind_rows(DD, data.frame(gene=setdiff(DS$gene, DD$gene)))
 15 |             # reorder & return both
 16 |             DD <- DD[match(DS$gene, DD$gene), ]
 17 |             return(list(DS=DS, DD=DD))
 18 |         })
 19 |     })
 20 | }
 21 | 
 22 | #' @rdname stagewise_DS_DD
 23 | #' @title Perform two-stage testing on DS and DD analysis results
 24 | #'
 25 | #' @param res_DS a list of DS testing results as returned 
 26 | #'   by \code{\link{pbDS}} or \code{\link{mmDS}}.
 27 | #' @param res_DD a list of DD testing results as returned 
 28 | #'   by \code{\link{pbDD}} (or \code{\link{pbDS}} with \code{method="DD"}).
 29 | #' @param sce (optional) \code{SingleCellExperiment} object containing the data 
 30 | #'   that underlies testing, prior to summarization with \code{\link{aggregateData}}. 
 31 | #'   Used for validation of inputs in order to prevent unexpected failure/results.
 32 | #' @param verbose logical. Should information on progress be reported?
 33 | #'
 34 | #' @return 
 35 | #' A list of \code{DFrame}s containing results for each contrast and cluster.
 36 | #' Each table contains DS and DD results for genes shared between analyses,
 37 | #' as well as results from stagewise testing analysis, namely:
 38 | #' \itemize{
 39 | #' \item{\code{p_adj}: FDR adjusted p-values for the
 40 | #'   screening hypothesis that a gene is neither DS nor DD
 41 | #'   (see \code{?stageR::getAdjustedPValues} for details)}
 42 | #' \item{\code{p_val.DS/D}: confirmation stage p-values for DS/D}}
 43 | #'
 44 | #' @examples
 45 | #' data(example_sce)
 46 | #' 
 47 | #' pbs_sum <- aggregateData(example_sce, assay="counts", fun="sum")
 48 | #' pbs_det <- aggregateData(example_sce, assay="counts", fun="num.detected")
 49 | #'   
 50 | #' res_DS <- pbDS(pbs_sum, min_cells=0, filter="none", verbose=FALSE)
 51 | #' res_DD <- pbDD(pbs_det, min_cells=0, filter="none", verbose=FALSE)
 52 | #' 
 53 | #' res <- stagewise_DS_DD(res_DS, res_DD)
 54 | #' head(res[[1]][[1]]) # results for 1st cluster
 55 | #'
 56 | #' @importFrom S4Vectors DataFrame
 57 | #' @importFrom purrr map_depth
 58 | #' @export
 59 | 
 60 | stagewise_DS_DD <- function(res_DS, res_DD, sce=NULL, verbose=FALSE) {
 61 |     if (!requireNamespace("stageR", quietly=TRUE))
 62 |         stop("Install 'stageR' to use this function.")
 63 |         
 64 |     # validity checks
 65 |     # TODO: helper to check validity of 'res_DS/D'
 66 |     # against each other and, optionally, 'sce'
 67 |     stopifnot(
 68 |         # same coefs/constrasts
 69 |         names(x <- res_DS$table) == 
 70 |         names(y <- res_DD$table), 
 71 |         # any shared clusters
 72 |         sum(mapply(\(i, j) 
 73 |             length(intersect(i, j)), 
 74 |             i=lapply(x, names), 
 75 |             j=lapply(y, names))) > 0)
 76 |     if (!is.null(sce)) {
 77 |         .check_sce(sce)
 78 |         . <- map_depth(list(x, y), 3, \(df) df$gene %in% rownames(sce))
 79 |         stopifnot("gene(s) present in 'res_DS/D' not found in 'sce'"=unlist(.))
 80 |         . <- map_depth(list(x, y), 3, \(df) df$cluster_id %in% sce$cluster_id)
 81 |         stopifnot("cluster(s) present in 'res_DS/D' not found in 'sce'"=unlist(.))
 82 |     }
 83 |     
 84 |     # assure that results contain same set of genes, in the same order
 85 |     # (indepedent of different filtering criteria for the two analyses)
 86 |     res_DX <- .res_DX(res_DS=res_DS, res_DD=res_DD)
 87 |     
 88 |     # perform harmonic mean p-value aggregation according to
 89 |     # (https://www.pnas.org/doi/full/10.1073/pnas.1814092116)
 90 |     .mu <- \(x) 1/mean(1/x, na.rm=TRUE)
 91 |     
 92 |     # perform stagewise testing
 93 |     res <- map_depth(res_DX, 2, \(x) {
 94 |         ps <- data.frame(
 95 |             p_val.DS=x$DS$p_val,
 96 |             p_val.DD=x$DD$p_val,
 97 |             row.names=x$DS$gene)
 98 |         qs <- apply(ps, 1, .mu); names(qs) <- x$DS$gene
 99 |         obj <- stageR::stageR(qs, as.matrix(ps), FALSE)
100 |         eva <- expression({
101 |             obj <- stageR::stageWiseAdjustment(obj, 
102 |                 method="none", alpha=0.05, allowNA=TRUE)
103 |             res <- stageR::getAdjustedPValues(obj, 
104 |                 onlySignificantGenes=FALSE, order=FALSE)
105 |         })
106 |         res <- if (verbose) eval(eva) else suppressMessages({eval(eva)})
107 |         # TODO: communicate this better with the user?
108 |         if (is.null(res)) res <- NA
109 |         colnames(res)[1] <- "p_adj"
110 |         return(res)
111 |     })
112 |     names(cs) <- cs <- names(res)
113 |     lapply(cs, \(c) {
114 |         names(ks) <- ks <- names(res[[c]])
115 |         lapply(ks, \(k) {
116 |             gs <- rownames(df <- res[[c]][[k]])
117 |             res_DS <- I((. <- res_DS$table[[c]][[k]])[match(gs, .$gene), ])
118 |             res_DD <- I((. <- res_DD$table[[c]][[k]])[match(gs, .$gene), ])
119 |             DataFrame(gene=gs, df, cluster_id=k, contrast=c, res_DS, res_DD)
120 |         })
121 |     })
122 | }
123 | 


--------------------------------------------------------------------------------
/R/utils-pbDS.R:
--------------------------------------------------------------------------------
  1 | #' @importFrom BiocParallel SerialParam
  2 | #' @importFrom purrr map
  3 | #' @importFrom scuttle summarizeAssayByGroup
  4 | #' @importFrom SummarizedExperiment assay colData
  5 | .pb <- function(x, by, assay, fun, BPPARAM = SerialParam()) {
  6 |   # compute pseudobulks
  7 |   suppressWarnings( 
  8 |     # temporarily suppressing warnings b/c 'median' 
  9 |     # warns about unspecified 'useNames' argument
 10 |   y <- summarizeAssayByGroup(x,
 11 |         assay.type = assay, 
 12 |         ids = (ids <- colData(x)[by]),
 13 |         statistics = fun,
 14 |         BPPARAM = BPPARAM))
 15 |   colnames(y) <- y[[by[length(by)]]]
 16 |   
 17 |   if (length(by) == 1) 
 18 |       return(assay(y))
 19 |   
 20 |   # reformat into one assay per 'by[1]'
 21 |   if (is.factor(ids <- y[[by[1]]]))
 22 |       ids <- droplevels(ids)
 23 |   is <- split(seq_len(ncol(y)), ids)
 24 |   ys <- map(is, ~assay(y)[, ., drop=FALSE])
 25 |   
 26 |   # fill in missing combinations
 27 |   for (i in seq_along(ys)) {
 28 |       fill <- setdiff(
 29 |           unique(y[[by[2]]]), 
 30 |           colnames(ys[[i]]))
 31 |       if (length(fill != 0)) {
 32 |           foo <- matrix(0, nrow(x), length(fill))
 33 |           colnames(foo) <- fill
 34 |           foo <- cbind(ys[[i]], foo)
 35 |           o <- paste(sort(unique(y[[by[2]]])))
 36 |           ys[[i]] <- foo[, o]
 37 |       }
 38 |   }
 39 |   return(ys)
 40 | }
 41 | 
 42 | # extract table of cell counts from 'int_colData'
 43 | # of pseudobulks as returned by 'aggregateData'
 44 | #' @importFrom S4Vectors metadata
 45 | #' @importFrom SingleCellExperiment int_colData
 46 | .n_cells <- function(x) {
 47 |     y <- int_colData(x)$n_cells
 48 |     if (is.null(y)) return(NULL)
 49 |     if (length(metadata(x)$agg_pars$by) == 2)
 50 |         y <- as.matrix(data.frame(y, check.names = FALSE))
 51 |     return(as.table(y))
 52 | }
 53 | 
 54 | # wrapper to create output tables
 55 | #   k:  cluster ID
 56 | #   tt: topTable data.frame
 57 | .res_df <- function(tbl, k, ct, c) {
 58 |     df <- data.frame(
 59 |         gene = rownames(tbl), cluster_id = k, tbl,
 60 |         row.names = NULL, stringsAsFactors = FALSE)
 61 |     df[[ct]] <- c; df
 62 | }
 63 | 
 64 | #' @importFrom DESeq2 DESeq results
 65 | #' @importFrom dplyr rename
 66 | #' @importFrom edgeR calcNormFactors DGEList estimateDisp
 67 | #'   filterByExpr glmQLFit glmQLFTest glmTreat topTags
 68 | #' @importFrom scater isOutlier
 69 | #' @importFrom SummarizedExperiment assay
 70 | #' @importFrom S4Vectors metadata
 71 | .edgeR <- function(x, k, design, coef, contrast, ct, cs, treat) {
 72 |     y <- assay(x, k)
 73 |     y <- suppressMessages(DGEList(y, 
 74 |         group = x$group_id[colnames(y)], 
 75 |         remove.zeros = TRUE))
 76 |     y <- calcNormFactors(y)
 77 |     y <- estimateDisp(y, design)
 78 |     fit <- glmQLFit(y, design)
 79 |     # treat: test for DE relative to logFC threshold
 80 |     # else:  genewise NB GLM with quasi-likelihood test
 81 |     .fun <- ifelse(treat, glmTreat, glmQLFTest)
 82 |     tbl <- lapply(cs, function(c) {
 83 |         fit <- .fun(fit, coef[[c]], contrast[, c])
 84 |         tbl <- topTags(fit, n = Inf, sort.by = "none")
 85 |         # combine tables & reformat
 86 |         tbl <- rename(tbl$table, p_val = "PValue", p_adj.loc = "FDR")
 87 |         tbl <- .res_df(tbl, k, ct, c)
 88 |     })
 89 |     list(table = tbl, data = y, fit = fit)
 90 | }
 91 | 
 92 | #' @importFrom matrixStats rowMedians
 93 | .edgeR_NB <- \(x, k, design, coef, contrast, ct, cs, nc) {
 94 |     y <- assay(x, k)
 95 |     # Gene_level filtering to remove genes detected in
 96 |     # almost all cells of almost all pseudobulk samples
 97 |     med_detection <- rowMedians(sweep(y, 2, nc, "/"))
 98 |     gene_filter <- med_detection < 0.9
 99 |     # Normalization offset to remove systematic differences between pseudobulk 
100 |     # samples that are due to technical or nuisance biological variability. 
101 |     # Idea obtained from cellular detection rate (CDR) normalization from MAST. 
102 |     # Note that this normalization is used instead of 'edgeR::calcNormFactors()'.
103 |     of <- colMeans(sweep(y[gene_filter, ], 2, nc, "/"))
104 |     # construct 'DGEList'
105 |     y <- suppressMessages(DGEList(
106 |         counts = y[gene_filter, ],
107 |         group = x$group_id[colnames(y)],
108 |         remove.zeros = TRUE))
109 |     # add offsets to 'DGEList'
110 |     y$offset <- log(nc * of) 
111 |     # run an 'edgeR' analysis
112 |     y <- estimateDisp(y, design)
113 |     fit <- glmQLFit(y, design, robust = TRUE)
114 |     tbl <- lapply(cs, function(c) {
115 |         fit <- glmQLFTest(fit,
116 |             coef[[c]],
117 |             contrast[, c],
118 |             poisson.bound = FALSE) 
119 |         tbl <- topTags(fit, n = Inf, sort.by = "none")
120 |         tbl <- rename(tbl$table, p_val = "PValue", p_adj.loc = "FDR") 
121 |         tbl <- .res_df(tbl, k, ct, c)
122 |     })
123 |     list(table = tbl, data = y, fit = fit)
124 | }
125 | 
126 | #' @importFrom dplyr rename
127 | #' @importFrom edgeR calcNormFactors DGEList
128 | #' @importFrom limma contrasts.fit eBayes lmFit topTable topTreat voom treat
129 | #' @importFrom SummarizedExperiment assay
130 | #' @importFrom S4Vectors metadata
131 | .limma <- function(x, k, design, coef, contrast, ct, cs, method, treat) {
132 |     y <- assay(x, k)
133 |     trend <- robust <- TRUE
134 |     if (method == "voom") {
135 |         trend <- robust <- FALSE
136 |         y <- suppressMessages(DGEList(y, remove.zeros = TRUE))
137 |         y <- calcNormFactors(y)
138 |         y <- voom(y, design)
139 |     } 
140 |     w <- .n_cells(x)[k, colnames(x)]   
141 |     fit <- lmFit(y, design, weights = w)
142 |     # treat: eBayes moderated-t p-val relative to min logFC threshold
143 |     # else:  eBayes moderated t-stat testing each contrast equal to 0 
144 |     .fun <- ifelse(treat, limma::treat, eBayes)
145 |     .tbl <- ifelse(treat, topTreat, topTable)
146 |     tbl <- lapply(cs, function(c) {
147 |         fit <- contrasts.fit(fit, contrast[, c], coef[[c]])
148 |         fit <- .fun(fit, trend = trend, robust = robust)
149 |         tbl <- .tbl(fit, number = Inf, sort.by = "none")
150 |         tbl <- rename(tbl, p_val = "P.Value", p_adj.loc = "adj.P.Val")
151 |         tbl <- .res_df(tbl, k, ct, c)
152 |     })
153 |     list(table = tbl, data = y, fit = fit)
154 | }
155 | 
156 | .limma_trend <- function(x, k, design, coef, contrast, ct, cs, treat)
157 |     .limma(x, k, design, coef, contrast, ct, cs, method = "trend", treat)
158 | 
159 | .limma_voom <- function(x, k, design, coef, contrast, ct, cs, treat)
160 |     .limma(x, k, design, coef, contrast, ct, cs, method = "voom", treat)
161 |   
162 | #' @importFrom dplyr rename
163 | #' @importFrom DESeq2 DESeq DESeqDataSetFromMatrix results 
164 | #' @importFrom SummarizedExperiment assay colData
165 | .DESeq2 <- function(x, k, design, contrast, ct, cs) {
166 |     cd <- colData(x)
167 |     y <- assay(x, k)
168 |     mode(y) <- "integer"
169 |     y <- DESeqDataSetFromMatrix(y, cd, design)
170 |     y <- suppressMessages(DESeq(y))
171 |     tbl <- lapply(cs, function(c) {
172 |         tbl <- results(y, contrast[, c])
173 |         tbl <- .res_df(tbl, k, ct, c)
174 |         rename(tbl, logFC = "log2FoldChange", 
175 |             p_val = "pvalue", p_adj.loc = "padj")
176 |     })
177 |     list(table = tbl, data = y)
178 | }
179 | 
180 | 


--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
  1 | # ==============================================================================
  2 | # filters rows/columns from input matrix `m` until all entries >= `n`, 
  3 | # such that ea. iteration removes the row/column w/ the smallest summed value.
  4 | # ------------------------------------------------------------------------------
  5 | .filter_matrix <- function(m, n = 100) {
  6 |     while (any(m < n)) {
  7 |         # get candidate rows/cols for removal
  8 |         i <- m < n
  9 |         r <- apply(i, 1, any)
 10 |         c <- apply(i, 2, any)
 11 |         # get smallest row/col
 12 |         rs <- rowSums(m)
 13 |         cs <- colSums(m)
 14 |         r <- which(r)[which.min(rs[r])]
 15 |         c <- which(c)[which.min(cs[c])]
 16 |         # priorities removal of rows over cols
 17 |         if (rs[r] <= cs[c]) {
 18 |             m <- m[-r, , drop = FALSE]
 19 |         } else {
 20 |             m <- m[, -c, drop = FALSE]
 21 |         }
 22 |         if (any(dim(m) == 1)) 
 23 |             break
 24 |     }
 25 |     return(m)
 26 | }
 27 | 
 28 | #' @importFrom dplyr mutate_if 
 29 | #' @importFrom purrr negate
 30 | #' @importFrom S4Vectors DataFrame metadata metadata<-
 31 | #' @importFrom SummarizedExperiment colData colData<-
 32 | .update_sce <- function(sce) {
 33 |     # update colData
 34 |     cd <- as.data.frame(colData(sce))
 35 |     cd <- mutate_if(cd, is.factor, droplevels) 
 36 |     cd <- mutate_if(cd, negate(is.factor), factor) 
 37 |     colData(sce) <- DataFrame(cd, row.names = colnames(sce))
 38 |     # update metadata
 39 |     if(!is.null(ei <- metadata(sce)$experiment_info)){
 40 |         ei <- ei[ei$sample_id %in% levels(sce$sample_id), ]
 41 |         ei <- mutate_if(ei, is.factor, droplevels)
 42 |         metadata(sce)$experiment_info <- ei
 43 |     }
 44 |     return(sce)
 45 | }
 46 | 
 47 | .filter_sce <- function(sce, kids, sids) {
 48 |     cs1 <- sce$cluster_id %in% kids
 49 |     cs2 <- sce$sample_id %in% sids
 50 |     sce <- sce[, cs1 & cs2]
 51 |     sce <- .update_sce(sce)
 52 |     return(sce)
 53 | }
 54 | 
 55 | .cluster_colors <- c(
 56 |     "#DC050C", "#FB8072", "#1965B0", "#7BAFDE", "#882E72",
 57 |     "#B17BA6", "#FF7F00", "#FDB462", "#E7298A", "#E78AC3",
 58 |     "#33A02C", "#B2DF8A", "#55A1B1", "#8DD3C7", "#A6761D",
 59 |     "#E6AB02", "#7570B3", "#BEAED4", "#666666", "#999999",
 60 |     "#aa8282", "#d4b7b7", "#8600bf", "#ba5ce3", "#808000",
 61 |     "#aeae5c", "#1e90ff", "#00bfff", "#56ff0d", "#ffff00")
 62 | 
 63 | # ==============================================================================
 64 | # scale values b/w 0 and 1 using 
 65 | # low (1%) and high (99%) quantiles as boundaries
 66 | # ------------------------------------------------------------------------------
 67 | #' @importFrom matrixStats rowQuantiles
 68 | .scale <- function(x) {
 69 |     qs <- rowQuantiles(as.matrix(x), probs = c(.01, .99), na.rm = TRUE)
 70 |     x <- (x - qs[, 1]) / (qs[, 2] - qs[, 1])
 71 |     x[x < 0] <- 0
 72 |     x[x > 1] <- 1
 73 |     return(x)
 74 | }
 75 | 
 76 | # ==============================================================================
 77 | # wrapper for z-normalization
 78 | # ------------------------------------------------------------------------------
 79 | .z_norm <- function(x, th = 2.5) {
 80 |     x <- as.matrix(x)
 81 |     sds <- rowSds(x, na.rm = TRUE)
 82 |     sds[sds == 0] <- 1
 83 |     x <- t(t(x - rowMeans(x, na.rm = TRUE)) / sds)
 84 |     #x <- (x - rowMeans(x, na.rm = TRUE)) / sds
 85 |     x[x >  th] <-  th
 86 |     x[x < -th] <- -th
 87 |     return(x)
 88 | }
 89 | 
 90 | # ------------------------------------------------------------------------------
 91 | # generate experimental design metadata table 
 92 | # for an input SCE or colData data.frame
 93 | # ------------------------------------------------------------------------------
 94 | #' @importFrom dplyr mutate_at 
 95 | #' @importFrom methods is
 96 | #' @importFrom SummarizedExperiment colData
 97 | .make_ei <- function(x) {
 98 |     if (is(x, "SingleCellExperiment"))
 99 |         x <- colData(x)
100 |     sids <- unique(x$sample_id)
101 |     m <- match(sids, x$sample_id)
102 |     df <- data.frame(
103 |         stringsAsFactors = FALSE,
104 |         sample_id = sids,
105 |         group_id = x$group_id[m],
106 |         n_cells = as.numeric(table(x$sample_id)[sids]))
107 |     for (i in c("sample_id", "group_id"))
108 |         if (is.factor(x[[i]]))
109 |             df <- mutate_at(df, i, factor, levels = levels(x[[i]]))
110 |     return(df)
111 | }
112 | 
113 | # ------------------------------------------------------------------------------
114 | # split cells by cluster-sample
115 | # ------------------------------------------------------------------------------
116 | #   x:  a SingleCellExperiment or colData
117 | #   by: character vector specifying colData column(s) to split by
118 | # > If length(by) == 1, a list of length nlevels(colData$by), else,
119 | #   a nested list with 2nd level of length nlevels(colData$by[2])
120 | # ------------------------------------------------------------------------------
121 | #' @importFrom data.table data.table
122 | #' @importFrom purrr map_depth
123 | .split_cells <- function(x, 
124 |     by = c("cluster_id", "sample_id")) {
125 |     if (is(x, "SingleCellExperiment"))
126 |         x <- colData(x)
127 |     cd <- data.frame(x[by], check.names = FALSE)
128 |     cd <- data.table(cd, cell = rownames(x)) %>% 
129 |         split(by = by, sorted = TRUE, flatten = FALSE)
130 |     map_depth(cd, length(by), "cell")
131 | }
132 | 
133 | # ------------------------------------------------------------------------------
134 | # global p-value adjustment
135 | # ------------------------------------------------------------------------------
136 | #   x: results table; a nested list w/ 
137 | #      1st level = comparisons and 2nd level = clusters
138 | # > adds 'p_adj.glb' column containing globally adjusted p-values
139 | #   to the result table of ea. cluster for each comparison
140 | # ------------------------------------------------------------------------------
141 | #' @importFrom purrr map map_depth
142 | #' @importFrom stats p.adjust
143 | .p_adj_global <- function(x) {
144 |     names(ks) <- ks <- names(x)
145 |     names(cs) <- cs <- names(x[[1]])
146 |     lapply(cs, function(c) {
147 |         # get p-values
148 |         tbl <- map_depth(x, 1, c)
149 |         p_val <- map(tbl, "p_val")
150 |         # adjust for each comparison
151 |         p_adj <- p.adjust(unlist(p_val), method = "BH")
152 |         # re-split by cluster
153 |         ns <- vapply(p_val, length, numeric(1))
154 |         p_adj <- split(p_adj, rep.int(ks, ns))
155 |         # insert into results tables
156 |         lapply(ks, function(k) {
157 |             u <- x[[k]][[c]]
158 |             i <- which(colnames(u) == "p_adj.loc")
159 |             u[["p_adj.glb"]] <- p_adj[[k]]
160 |             u[, c(seq_len(i), ncol(u), seq(i+1, ncol(u)-1))]
161 |         })
162 |     })
163 | }
164 | 
165 | # ------------------------------------------------------------------------------
166 | # toy SCE for unit-testing
167 | # ------------------------------------------------------------------------------
168 | #' @importFrom SingleCellExperiment SingleCellExperiment
169 | .toySCE <- function(dim = c(200, 800)) {
170 |     gs <- paste0("gene", seq_len(ngs <- dim[1]))
171 |     cs <- paste0("cell", seq_len(ncs <- dim[2]))
172 | 
173 |     y <- rnbinom(ngs * ncs, size = 2, mu = 4)
174 |     y <- matrix(y, ngs, ncs, TRUE, list(gs, cs))
175 | 
176 |     cd <- mapply(function(i, n) 
177 |         sample(paste0(i, seq_len(n)), ncs, TRUE),
178 |         i = c("k", "s", "g"), n = c(5, 4, 3))
179 |     
180 |     cd <- data.frame(cd, stringsAsFactors = TRUE)
181 |     cd$s <- factor(paste(cd$s, cd$g, sep = "."))
182 |     colnames(cd) <- paste(c("cluster", "sample", "group"), "id", sep = "_")
183 |     
184 |     SingleCellExperiment(
185 |         assay = list(counts = y), colData = cd, 
186 |         metadata = list(experiment_info = .make_ei(cd)))
187 | }
188 | 


--------------------------------------------------------------------------------
/R/validity-checks.R:
--------------------------------------------------------------------------------
  1 | # validity checks for objects & function arguments
  2 | # ==============================================================================
  3 | 
  4 | # check input SCE
  5 | #' @importFrom methods is
  6 | #' @importFrom SummarizedExperiment colData
  7 | .check_sce <- function(x, req_group = TRUE) {
  8 |     stopifnot(is(x, "SingleCellExperiment"))
  9 |     stopifnot(c("cluster_id", "sample_id") %in% colnames(colData(x)))
 10 |     if (req_group)
 11 |         stopifnot("group_id" %in% colnames(colData(x)))
 12 | }
 13 | 
 14 | # check of 'assay' argument
 15 | #' @importFrom SummarizedExperiment assayNames
 16 | .check_arg_assay <- function(x, y) {
 17 |     stopifnot(is.character(y), length(y) == 1, y %in% assayNames(x))
 18 |     if (sum(assayNames(x) == y) > 1)
 19 |         stop("Argument 'assay' was matched to multiple times.\n ", 
 20 |             " Please assure that the input SCE has unique 'assayNames'.")
 21 | }
 22 | 
 23 | # check pseudo-bulks for DS analysis
 24 | # (must have be aggregated by cluster-sample)
 25 | #   x = SCE used for aggregation
 26 | #   y = SCE containing pseudo-bulks as returned by 
 27 | #`      aggregateData(x, by = c("cluster_id", "sample_id"))
 28 | #' @importFrom methods is
 29 | #' @importFrom S4Vectors metadata
 30 | #' @importFrom SummarizedExperiment assayNames
 31 | .check_pbs <- function(pbs, sce = NULL, check_by = TRUE) {
 32 |     stopifnot(is(pbs, "SingleCellExperiment"),
 33 |         !is.null(ei <- metadata(pbs)$experiment_info),
 34 |         !is.null(agg_pars <- metadata(pbs)$agg_pars),
 35 |         !is.null(n_cells <- .n_cells(pbs)),
 36 |         identical(assayNames(pbs), rownames(n_cells)),
 37 |         identical(colnames(pbs), colnames(n_cells)))
 38 |     if (!is.null(sce)) {
 39 |         stopifnot(identical(ei, metadata(sce)$experiment_info),
 40 |             identical(assayNames(pbs), levels(sce[[agg_pars$by[1]]])),
 41 |             identical(rownames(pbs), rownames(sce)))
 42 |         if (length(agg_pars$by == 2))
 43 |             stopifnot(identical(colnames(pbs), levels(sce[[agg_pars$by[2]]])))
 44 |     }
 45 |     if (check_by)
 46 |         stopifnot(!is.null(pbs[["group_id"]]),
 47 |             identical(agg_pars$by, c("cluster_id", "sample_id")))
 48 | }
 49 | 
 50 | # check validity of runDS() output
 51 | #' @importFrom methods is
 52 | #' @importFrom S4Vectors metadata
 53 | .check_res <- function(x, y) {
 54 |     ei <- metadata(x)$experiment_info
 55 |     nk <- length(kids <- levels(x$cluster_id))
 56 |     nms <- c("table", "data", "method", "design", "contrast", "coef")
 57 |     stopifnot(is(y, "list"), all.equal(names(y), nms))
 58 |     # table
 59 |     stopifnot(is(y$table, "list"),
 60 |         vapply(y$table, is, class = "list", logical(1)),
 61 |         identical(names(y$table), colnames(y$contrast))
 62 |         | identical(names(y$table), names(y$coef)),
 63 |         apply(vapply(y$table, names, character(nk)), 2, identical, kids))
 64 |     # data
 65 |     stopifnot(is(y$data, "list"), names(y$data) %in% kids,
 66 |         vapply(y$data, is, class = "DGEList", logical(1)))
 67 |     # design
 68 |     stopifnot(is(y$design, "matrix"),
 69 |         colnames(y$design) %in% ei$group_id,
 70 |         rownames(y$design) %in% ei$sample_id)
 71 |     # contrast & coef
 72 |     stopifnot(is.null(y$contrast) | is(y$contrast, "matrix"))
 73 |     stopifnot(is.null(y$coef) | is(y$coef, "numeric") | is(y$coef, "list"))
 74 | }
 75 | 
 76 | # check validity of calcExprFreqs() output
 77 | #' @importFrom methods is
 78 | #' @importFrom SummarizedExperiment assays colData
 79 | .check_frq <- function(x, y) {
 80 |     stopifnot(
 81 |         is(x, "SingleCellExperiment"), 
 82 |         is(y, "SingleCellExperiment"))
 83 |     kids <- levels(x$cluster_id)
 84 |     
 85 |     ids <- levels(x$sample_id)
 86 |     if ("group_id" %in% colnames(colData(x)))
 87 |         ids <- c(ids, levels(x$group_id))
 88 |     stopifnot(identical(ids, colnames(y)))
 89 |     
 90 |     vals <- unlist(assays(y))
 91 |     stopifnot(all(vals <= 1), all(vals >= 0))
 92 | }
 93 | 
 94 | .check_args_simData <- function(u) {
 95 |     if (!is.null(u$ns))
 96 |         stopifnot(
 97 |             is.numeric(u$ns), length(u$ns) == 1, 
 98 |             u$ns > 0, as.integer(u$ns) == u$ns)
 99 |     if (!is.null(u$nk)) {
100 |         stopifnot(
101 |             is.numeric(u$nk), length(u$nk) == 1, 
102 |             u$nk > 0, as.integer(u$nk) == u$nk)
103 |     } else u$nk <- nlevels(u$x$cluster_id)
104 |     
105 |     if (!u$force && u$ng != nrow(u$x))
106 |         stop("Number of simulated genes should match with reference,\n", 
107 |             "  but 'ng != nrow(x)'; please specify 'force = TRUE' if\n", 
108 |             "  simulation should be forced regardlessly (see '?simData').")
109 |     if (!is.null(u$phylo_tree) && u$p_type != 0)
110 |         stop("Only one of arguments 'p_type' or 'phylo_tree'\n",
111 |             "  can be specified; see '?simData' for 'Details'.")
112 |     # assure number of simulated clusters matches with specified phylogeny
113 |     if (!is.null(u$phylo_tree)) {
114 |         kids_phylo <- .get_clusters_from_phylo(u$phylo_tree)
115 |         nk_phylo <- length(kids_phylo)
116 |         ns_phylo <- as.numeric(gsub("[a-z]", "", kids_phylo))
117 |         if (!all(sort(ns_phylo) == seq_len(nk_phylo)))
118 |             stop("Some clusters appear to be missing from 'phylo_tree';\n",
119 |                 "  please make sure all clusters up to ", 
120 |                 dQuote(kids_phylo[which.max(ns_phylo)]), " are present.")
121 |         # possibly update number of clusters 'nk'
122 |         if (nk_phylo != u$nk) u$nk <- nk_phylo
123 |     }
124 |     stopifnot(
125 |         is.numeric(u$ng), length(u$ng) == 1, u$ng > 0, as.integer(u$ng) == u$ng,
126 |         is.numeric(u$nc), length(u$nc) == 1, u$nc > 0, as.integer(u$nc) == u$nc,
127 |         is.numeric(u$p_dd), length(u$p_dd) == 6, u$p_dd >= 0, u$p_dd <= 1,
128 |         abs(1-sum(u$p_dd)) < 1e-12, is.logical(u$paired), length(u$paired) == 1,
129 |         is.numeric(u$p_ep), length(u$p_ep) == 1, u$p_ep > 0, u$p_ep < 1,
130 |         is.numeric(u$p_dp), length(u$p_dp) == 1, u$p_dp > 0, u$p_dp < 1,
131 |         is.numeric(u$p_dm), length(u$p_dm) == 1, u$p_dm > 0, u$p_dm < 1,
132 |         is.numeric(u$p_type), length(u$p_type) == 1, u$p_type >= 0, u$p_type <= 1,
133 |         is.numeric(u$lfc), is.numeric(u$lfc), length(u$lfc) == 1, u$lfc >= 1,
134 |         is.logical(u$force), length(u$force) == 1,
135 |         is.numeric(u$phylo_pars), length(u$phylo_pars) == 2, u$phylo_pars >= 0)
136 |     if (!is.null(u$rel_lfc))
137 |         stopifnot(is.numeric(u$rel_lfc), 
138 |             length(u$rel_lfc) == u$nk, u$rel_lfc >= 0)
139 |     return(list(nk = u$nk, ns = u$ns))
140 | }
141 | 
142 | #' @importFrom SummarizedExperiment colData
143 | .check_args_aggData <- function(u) {
144 |     stopifnot(is.character(u$by), length(u$by) <= 2, 
145 |         u$by %in% colnames(colData(u$x)))
146 |     stopifnot(is.logical(u$scale), length(u$scale) == 1)
147 |     if (u$scale & (!u$assay %in% c("cpm", "CPM") | u$fun != "sum"))
148 |         stop("Option 'scale = TRUE' only valid for", 
149 |             " 'assay = \"cpm/CPM\"' and 'fun = \"sum\"'.")
150 | }
151 | 
152 | .check_args_pbDS <- function(u) {
153 |     if (!is.null(u$design))
154 |         stopifnot(is.matrix(u$design),
155 |             !is.null(rownames(u$design)),
156 |             !is.null(colnames(u$design)))
157 |     stopifnot(
158 |         is.null(u$contrast) | is.matrix(u$contrast),
159 |         is.null(u$coef) | is.numeric(unlist(u$coef)),
160 |         is.numeric(u$min_cells), length(u$min_cells) == 1,
161 |         is.logical(u$verbose), length(u$verbose) == 1,
162 |         is.logical(u$treat), length(u$treat) == 1)
163 | }
164 | 
165 | .check_args_mmDS <- function(u) {
166 |     stopifnot(
167 |         is.null(u$covs) || is.character(u$covs) & all(u$covs %in% names(colData(u$x))),
168 |         is.numeric(u$coef) & u$coef %in% seq_len(nlevels(u$x$group_id))
169 |         | is.character(u$coef) & u$coef %in% c("(Intercept)", 
170 |             paste0("group_id", levels(u$x$group_id)[-1])),
171 |         !is.null(metadata(u$x)$experiment_info$group_id) | !is.null(u$x$group_id), 
172 |         is.numeric(u$n_cells), length(u$n_cells) == 1, u$n_cells >= 0,
173 |         is.numeric(u$n_samples), length(u$n_samples) == 1, u$n_samples >= 2,
174 |         is.numeric(u$min_count), length(u$min_count) == 1, u$min_count >= 0,
175 |         is.numeric(u$min_cells), length(u$min_cells) == 1, u$min_cells >= 0,
176 |         is.logical(u$verbose), length(u$verbose) == 1,
177 |         is.logical(u$dup_corr), length(u$dup_corr) == 1,
178 |         is.logical(u$trended), length(u$trended) == 1,
179 |         is.logical(u$bayesian), length(u$bayesian) == 1,
180 |         is.logical(u$blind), length(u$blind) == 1,
181 |         is.logical(u$REML), length(u$REML) == 1)
182 | }
183 | 
184 | .check_args_pbHeatmap <- function(u) {
185 |     if (!is.null(u$k))
186 |         stopifnot(is.character(u$k), u$k %in% levels(u$x$cluster_id))
187 |     if (!is.null(u$g))
188 |         stopifnot(is.character(u$g), u$g %in% rownames(u$x))
189 |     if (!is.null(u$c))
190 |         stopifnot(is.character(u$c), u$c %in% names(u$y$table))
191 |     stopifnot(
192 |         is.numeric(u$top_n), length(u$top_n) == 1, u$top_n > 1,
193 |         is.numeric(u$fdr), length(u$fdr) == 1, u$fdr > 0,
194 |         is.numeric(u$lfc), length(u$lfc) == 1,
195 |         is.character(u$sort_by), length(u$sort_by) == 1,
196 |         u$sort_by == "none" | 
197 |             !is.null(dim(u$y[[1]])) &
198 |             u$sort_by %in% names(u$y[[1]]) &
199 |             is.numeric(u$y[[1]][[u$sort_by]]) |
200 |             u$sort_by %in% names(u$y$table[[1]][[1]]) &
201 |             is.numeric(u$y$table[[1]][[1]][[u$sort_by]]),
202 |         is.function(u$fun),
203 |         is.logical(u$decreasing), length(u$decreasing) == 1,
204 |         is.logical(u$normalize), length(u$normalize) == 1,
205 |         is.logical(u$row_anno), length(u$row_anno) == 1,
206 |         is.logical(u$col_anno), length(u$col_anno) == 1)
207 | }
208 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | <img src="inst/extdata/muscat.png" width="200" align="right"/> 
 2 | 
 3 | **`muscat` (**Mu**lti-sample **mu**lti-group **sc**RNA-seq **a**nalysis **t**ools )**
 4 | 
 5 | ...provides methods for *Differential State* (DS) analyses in scRNA-seq data  
 6 | with multiple samples, groups, and (cell)-subpopulations, as elaborated in:
 7 | 
 8 | > Crowell HL, Soneson C\*, Germain P-L\*,  
 9 | Calini D, Collin L, Raposo C, Malhotra D & Robinson MD:  
10 | "*muscat* detects subpopulation-specific state transitions from  
11 | multi-sample multi-condition single-cell transcriptomics data"  
12 | *Nature Communications* **11**, 6077 (2020)  
13 | [DOI: 10.1038/s41467-020-19894-4](https://doi.org/10.1038/s41467-020-19894-4)
14 | 
15 | *These authors contributed equally.
16 | 
17 | ### installation
18 | 
19 | `muscat` is available through Bioconductor, and
20 | can be installed using the following commands:
21 | 
22 | ```r
23 | if (!requireNamespace("BiocManager", quietly = TRUE))
24 |     install.packages("BiocManager")
25 | BiocManager::install("muscat")
26 | ```
27 | 
28 | ### quick guide
29 | 
30 | Let `sce` be a [`SingleCellExperiment`](https://www.bioconductor.org/packages/SingleCellExperiment.html) object with cell metadata (`colData`) columns
31 | 
32 | 1. `"sample_id"` specifying unique sample identifiers (e.g., PeterPan1, Nautilus7, ...)
33 | 2. `"group_id"` specifying each sample's experimental condition (e.g., reference/stimulated, healthy/diseased, ...)
34 | 3. `"cluster_id"` specifying subpopulation (cluster) assignments (e.g., B cells, dendritic cells, ...)
35 | 
36 | Aggregation-based methods come down to the following simple commands: 
37 | 
38 | ```r
39 | # compute pseudobulks (sum of counts)
40 | pb <- aggregateData(sce, 
41 |     assay = "counts", fun = "sum",
42 |     by = c("cluster_id", "sample_id"))
43 |     
44 | # run pseudobulk (aggregation-based) DS analysis
45 | ds_pb <- pbDS(pb, method = "edgeR")
46 | ```
47 | 
48 | Mixed models can be run directly on cell-level measurements, e.g.:
49 | 
50 | ```r
51 | ds_mm <- mmDS(sce, method = "dream")
52 | ```
53 | 
54 | For details, please see the package vignettes.
55 | 
56 | ### differential detection
57 | 
58 | `muscat` also supports testing for differential detection as proposed in 
59 | 
60 | > Gilis J, Perin L, Malfait M, Van den Berge K,  
61 | Assefa AT, Verbist B, Risso D, and Clement L:  
62 | Differential detection workflows for  
63 | multi-sample single-cell RNA-seq data.  
64 | *bioRxiv* (2023). [DOI: 10.1101/2023.12.17.572043](https://doi.org/10.1101/2023.12.17.572043)
65 | 
66 | Key alterations to the commands above are highlighted below (!!!), 
67 | however, we recommend users consult the corresponding publication 
68 | and package vignette for more details.
69 | 
70 | ```r
71 | # sum binarized counts
72 | pb <- aggregateData(sce, 
73 |     assay = "counts", 
74 |     fun = "num.detected", # !!!
75 |     by = c("cluster_id", "sample_id"))
76 | # test for differential detection
77 | dd <- pbDD(pb) # or..
78 | dd <- pbDS(pb, method = "DD")
79 | ```


--------------------------------------------------------------------------------
/data/example_sce.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelenaLC/muscat/49a3b323133921fdcd9772441df0aa2ee09ab1e1/data/example_sce.rda


--------------------------------------------------------------------------------
/inst/extdata/1a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelenaLC/muscat/49a3b323133921fdcd9772441df0aa2ee09ab1e1/inst/extdata/1a.png


--------------------------------------------------------------------------------
/inst/extdata/1b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelenaLC/muscat/49a3b323133921fdcd9772441df0aa2ee09ab1e1/inst/extdata/1b.png


--------------------------------------------------------------------------------
/inst/extdata/1d.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelenaLC/muscat/49a3b323133921fdcd9772441df0aa2ee09ab1e1/inst/extdata/1d.png


--------------------------------------------------------------------------------
/inst/extdata/muscat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelenaLC/muscat/49a3b323133921fdcd9772441df0aa2ee09ab1e1/inst/extdata/muscat.png


--------------------------------------------------------------------------------
/inst/extdata/refs.bib:
--------------------------------------------------------------------------------
  1 | % Generated by Paperpile. Check out https://paperpile.com for more information.
  2 | % BibTeX export options can be customized via Settings -> BibTeX.
  3 | 
  4 | @ARTICLE{Crowell2020-muscat,
  5 |   title   = "muscat detects subpopulation-specific state transitions from
  6 |              multi-sample multi-condition single-cell transcriptomics data",
  7 |   author  = "Crowell, Helena L and Soneson, Charlotte and Germain, Pierre-Luc
  8 |              and Calini, Daniela and Collin, Ludovic and Raposo, Catarina and
  9 |              Malhotra, Dheeraj and Robinson, Mark D",
 10 |   journal = "Nature Communications",
 11 |   volume  =  11,
 12 |   number  =  1,
 13 |   pages   = "6077",
 14 |   year    =  2020
 15 | }
 16 | 
 17 | @ARTICLE{Zhang2019,
 18 |   title   = "Probabilistic cell type assignment of single-cell transcriptomic
 19 |              data reveals spatiotemporal microenvironment dynamics in human
 20 |              cancers",
 21 |   author  = "Zhang, Allen W and O\textbackslashtextquoterightFlanagan, Ciara
 22 |              and Chavez, Elizabeth and Lim, Jamie L P and McPherson, Andrew and
 23 |              Wiens, Matt and Walters, Pascale and Chan, Tim and Hewitson,
 24 |              Brittany and Lai, Daniel and Mottok, Anja and Sarkozy, Clementine
 25 |              and Chong, Lauren and Aoki, Tomohiro and Wang, Xuehai and Weng,
 26 |              Andrew P and McAlpine, Jessica N and Aparicio, Samuel and Steidl,
 27 |              Christian and Campbell, Kieran R and Shah, Sohrab P",
 28 |   journal = "bioRxiv",
 29 |   volume  =  521914,
 30 |   year    =  2019
 31 | }
 32 | 
 33 | @ARTICLE{Soneson2018-countsimQC,
 34 |   title   = "Towards unified quality verification of synthetic count data with
 35 |              {countsimQC}",
 36 |   author  = "Soneson, Charlotte and Robinson, Mark D",
 37 |   journal = "Bioinformatics",
 38 |   volume  =  34,
 39 |   number  =  4,
 40 |   pages   = "691--692",
 41 |   year    =  2018
 42 | }
 43 | 
 44 | @ARTICLE{Butler2018-Seurat,
 45 |   title   = "Integrating single-cell transcriptomic data across different
 46 |              conditions, technologies, and species",
 47 |   author  = "Butler, Andrew and Hoffman, Paul and Smibert, Peter and Papalexi,
 48 |              Efthymia and Satija, Rahul",
 49 |   journal = "Nature Biotechnology",
 50 |   volume  =  36,
 51 |   number  =  5,
 52 |   pages   = "411--420",
 53 |   year    =  2018
 54 | }
 55 | 
 56 | @ARTICLE{Stuart2019,
 57 |   title   = "Comprehensive {I}ntegration of {{S}ingle-{C}ell} {D}ata",
 58 |   author  = "Stuart, Tim and Butler, Andrew and Hoffman, Paul and Hafemeister,
 59 |              Christoph and Papalexi, Efthymia and Mauck, 3rd, William M and
 60 |              Hao, Yuhan and Stoeckius, Marlon and Smibert, Peter and Satija,
 61 |              Rahul",
 62 |   journal = "Cell",
 63 |   volume  =  177,
 64 |   number  =  7,
 65 |   pages   = "1888--1902.e21",
 66 |   year    =  2019
 67 | }
 68 | 
 69 | @ARTICLE{Stegle2015,
 70 |   title   = "Computational and analytical challenges in single-cell
 71 |              transcriptomics",
 72 |   author  = "Stegle, Oliver and Teichmann, Sarah A and Marioni, John C",
 73 |   journal = "Nature Reviews Genetics",
 74 |   volume  =  16,
 75 |   number  =  3,
 76 |   pages   = "133--145",
 77 |   year    =  2015
 78 | }
 79 | 
 80 | @ARTICLE{Trapnell2015,
 81 |   title   = "Defining cell types and states with single-cell genomics",
 82 |   author  = "Trapnell, Cole",
 83 |   journal = "Genome Research",
 84 |   volume  =  25,
 85 |   number  =  10,
 86 |   pages   = "1491--1498",
 87 |   year    =  2015
 88 | }
 89 | 
 90 | @ARTICLE{Wagner2016,
 91 |   title   = "Revealing the vectors of cellular identity with single-cell
 92 |              genomics",
 93 |   author  = "Wagner, Allon and Regev, Aviv and Yosef, Nir",
 94 |   journal = "Nature Biotechnology",
 95 |   volume  =  34,
 96 |   number  =  11,
 97 |   pages   = "1145--1160",
 98 |   year    =  2016
 99 | }
100 | 
101 | @ARTICLE{McCarthy2017-scater,
102 |   title   = "Scater: pre-processing, quality control, normalization and
103 |              visualization of single-cell {RNA-seq} data in {R}",
104 |   author  = "McCarthy, Davis J and Campbell, Kieran R and Lun, Aaron T L and
105 |              Wills, Quin F",
106 |   journal = "Bioinformatics",
107 |   volume  =  33,
108 |   number  =  8,
109 |   pages   = "1179--1186",
110 |   year    =  2017
111 | }
112 | 
113 | @ARTICLE{Love2014-DESeq2,
114 |   title   = "Moderated estimation of fold change and dispersion for {RNA-seq}
115 |              data with {DESeq2}",
116 |   author  = "Love, Michael I and Huber, Wolfgang and Anders, Simon",
117 |   journal = "Genome Biology",
118 |   volume  =  15,
119 |   number  =  12,
120 |   pages   = "550",
121 |   year    =  2014
122 | }
123 | 
124 | @ARTICLE{Korthauer2016-scDD,
125 |   title   = "A statistical approach for identifying differential distributions
126 |              in single-cell {RNA-seq} experiments",
127 |   author  = "Korthauer, Keegan D and Chu, Li-Fang and Newton, Michael A and Li,
128 |              Yuan and Thomson, James and Stewart, Ron and Kendziorski,
129 |              Christina",
130 |   journal = "Genome Biology",
131 |   volume  =  17,
132 |   number  =  1,
133 |   pages   = "222",
134 |   year    =  2016
135 | }
136 | 
137 | @ARTICLE{Kang2018-demuxlet,
138 |   title   = "Multiplexed droplet single-cell {RNA-sequencing} using natural
139 |              genetic variation",
140 |   author  = "Kang, Hyun Min and Subramaniam, Meena and Targ, Sasha and Nguyen,
141 |              Michelle and Maliskova, Lenka and McCarthy, Elizabeth and Wan,
142 |              Eunice and Wong, Simon and Byrnes, Lauren and Lanata, Cristina M
143 |              and Gate, Rachel E and Mostafavi, Sara and Marson, Alexander and
144 |              Zaitlen, Noah and Criswell, Lindsey A and Ye, Chun Jimmie",
145 |   journal = "Nature Biotechnology",
146 |   volume  =  36,
147 |   number  =  1,
148 |   pages   = "89--94",
149 |   year    =  2018
150 | }
151 | 
152 | @ARTICLE{Soneson2016-iCOBRA,
153 |   title   = "{iCOBRA}: open, reproducible, standardized and live method
154 |              benchmarking",
155 |   author  = "Soneson, Charlotte and Robinson, Mark D",
156 |   journal = "Nature Methods",
157 |   volume  =  13,
158 |   number  =  4,
159 |   pages   = "283",
160 |   year    =  2016
161 | }
162 | 
163 | @ARTICLE{Hafemeister2019-sctransform,
164 |   title   = "Normalization and variance stabilization of single-cell {RNA-seq}
165 |              data using regularized negative binomial regression",
166 |   author  = "Hafemeister, Christoph and Satija, Rahul",
167 |   journal = "bioRxiv",
168 |   volume  =  576827,
169 |   year    =  2019
170 | }
171 | 
172 | @ARTICLE{Freytag2018-clustering,
173 |   title    = "Comparison of clustering tools in {R} for medium-sized 10x
174 |               Genomics single-cell {RNA-sequencing} data",
175 |   author   = "Freytag, Saskia and Tian, Luyi and L{\"o}nnstedt, Ingrid and Ng,
176 |               Milica and Bahlo, Melanie",
177 |   journal  = "F1000Research",
178 |   volume   =  7,
179 |   pages    = "1297",
180 |   year     =  2018
181 | }
182 | 
183 | @ARTICLE{Duo2018,
184 |   title   = "A systematic performance evaluation of clustering methods for
185 |              single-cell {RNA-seq} data",
186 |   author  = "Du{\`o}, Angelo and Robinson, Mark D and Soneson, Charlotte",
187 |   journal = "F1000Research",
188 |   volume  =  7,
189 |   pages   = "1141",
190 |   year    =  2018
191 | }
192 | 
193 | @ARTICLE{Diaz-Mejia2019,
194 |   title   = "Evaluation of methods to assign cell type labels to cell clusters
195 |              from single-cell {RNA-sequencing} data",
196 |   author  = "Diaz-Mejia, J Javier and Javier Diaz-Mejia, J and Meng, Elaine C
197 |              and Pico, Alexander R and MacParland, Sonya A and Ketela, Troy and
198 |              Pugh, Trevor J and Bader, Gary D and Morris, John H",
199 |   journal = "F1000Research",
200 |   volume  =  8,
201 |   pages   = "296",
202 |   year    =  2019
203 | }
204 | 
205 | @ARTICLE{Soneson2018,
206 |   title   = "Bias, robustness and scalability in single-cell differential
207 |              expression analysis",
208 |   author  = "Soneson, Charlotte and Robinson, Mark D",
209 |   journal = "Nature Methods",
210 |   volume  =  15,
211 |   number  =  4,
212 |   pages   = "255--261",
213 |   year    =  2018
214 | }
215 | 
216 | @ARTICLE{Ritchie2015-limma,
217 |   title   = "limma powers differential expression analyses for {RNA-sequencing}
218 |              and microarray studies",
219 |   author  = "Ritchie, Matthew E and Phipson, Belinda and Wu, Di and Hu, Yifang
220 |              and Law, Charity W and Shi, Wei and Smyth, Gordon K",
221 |   journal = "Nucleic Acids Research",
222 |   volume  =  43,
223 |   number  =  7,
224 |   pages   = "e47",
225 |   year    =  2015
226 | }
227 | 
228 | @ARTICLE{Robinson2010-edgeR,
229 |   title   = "edge{R}: a {B}ioconductor package for differential expression
230 |              analysis of digital gene expression data",
231 |   author  = "Robinson, Mark D and McCarthy, Davis J and Smyth, Gordon K",
232 |   journal = "Bioinformatics",
233 |   volume  =  26,
234 |   number  =  1,
235 |   pages   = "139--140",
236 |   year    =  2010
237 | }
238 | 
239 | @ARTICLE{Gilis2023,
240 |   title    = "Differential detection workflows for multi-sample single-cell
241 |               {RNA-seq} data",
242 |   author   = "Gilis, Jeroen and Perin, Laura and Malfait, Milan and Van den
243 |               Berge, Koen and Takele Assefa, Alemu and Verbist, Bie and Risso,
244 |               Davide and Clement, Lieven",
245 |   journal  = "bioRxiv",
246 |   year     =  2023
247 | }
248 | 
249 | @ARTICLE{Bouland2021,
250 |   title    = "Differential analysis of binarized single-cell {RNA} sequencing
251 |               data captures biological variation",
252 |   author   = "Bouland, Gerard A and Mahfouz, Ahmed and Reinders, Marcel J T",
253 |   journal  = "NAR Genomics and Bioinformatics",
254 |   volume   =  3,
255 |   number   =  4,
256 |   pages    = "lqab118",
257 |   year     =  2021
258 | }
259 | 
260 | @ARTICLE{Qiu2020,
261 |   title    = "Embracing the dropouts in single-cell {RNA-seq} analysis",
262 |   author   = "Qiu, Peng",
263 |   journal  = "Nature Communications",
264 |   volume   =  11,
265 |   number   =  1,
266 |   pages    = "1169",
267 |   year     =  2020,
268 |   language = "en"
269 | }
270 | 
271 | @ARTICLE{Vandenberge2017,
272 |   title    = "stageR: a general stage-wise method for controlling the gene-level
273 | 	      false discovery rate in differential expression and differential 
274 | 	      transcript usage",
275 |   author   = "Van den Berge, Koen and Soneson, Charlotte and Robinson, Mark D 
276 | 	      and Clement, Lieven",
277 |   journal  = "Genome Biology",
278 |   volume   =  18,
279 |   number   =  151,
280 |   year     =  2017,
281 |   language = "en"
282 | }


--------------------------------------------------------------------------------
/man/aggregateData.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/aggregateData.R
 3 | \name{aggregateData}
 4 | \alias{aggregateData}
 5 | \title{Aggregation of single-cell to pseudobulk data}
 6 | \usage{
 7 | aggregateData(
 8 |   x,
 9 |   assay = NULL,
10 |   by = c("cluster_id", "sample_id"),
11 |   fun = c("sum", "mean", "median", "prop.detected", "num.detected"),
12 |   scale = FALSE,
13 |   verbose = TRUE,
14 |   BPPARAM = SerialParam(progressbar = verbose)
15 | )
16 | }
17 | \arguments{
18 | \item{x}{a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.}
19 | 
20 | \item{assay}{character string specifying the assay slot to use as 
21 | input data. Defaults to the 1st available (\code{assayNames(x)[1]}).}
22 | 
23 | \item{by}{character vector specifying which 
24 | \code{colData(x)} columns to summarize by (at most 2!).}
25 | 
26 | \item{fun}{a character string.
27 | Specifies the function to use as summary statistic.
28 | Passed to \code{\link[scuttle]{summarizeAssayByGroup}}.}
29 | 
30 | \item{scale}{logical. Should pseudo-bulks be scaled
31 | with the effective library size & multiplied by 1M?}
32 | 
33 | \item{verbose}{logical. Should information on progress be reported?}
34 | 
35 | \item{BPPARAM}{a \code{\link[BiocParallel]{BiocParallelParam}}
36 | object specifying how aggregation should be parallelized.}
37 | }
38 | \value{
39 | a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
40 | \itemize{
41 | \item{If \code{length(by) == 2}, each sheet (\code{assay}) contains 
42 |   pseudobulks for each of \code{by[1]}, e.g., for each cluster when 
43 |   \code{by = "cluster_id"}. Rows correspond to genes, columns to 
44 |   \code{by[2]}, e.g., samples when \code{by = "sample_id"}}.
45 | \item{If \code{length(by) == 1}, the returned SCE will contain only 
46 |   a single \code{assay} with rows = genes and colums = \code{by}.}}
47 |   
48 |   Aggregation parameters (\code{assay, by, fun, scaled}) are stored in 
49 |   \code{metadata()$agg_pars}, and the number of cells that were aggregated 
50 |   are accessible in \code{int_colData()$n_cells}.
51 | }
52 | \description{
53 | ...
54 | }
55 | \examples{
56 | # pseudobulk counts by cluster-sample
57 | data(example_sce)
58 | pb <- aggregateData(example_sce)
59 | 
60 | library(SingleCellExperiment)
61 | assayNames(example_sce)  # one sheet per cluster
62 | head(assay(example_sce)) # n_genes x n_samples
63 | 
64 | # scaled CPM
65 | cpm <- edgeR::cpm(assay(example_sce))
66 | assays(example_sce)$cpm <- cpm
67 | pb <- aggregateData(example_sce, assay = "cpm", scale = TRUE)
68 | head(assay(pb)) 
69 | 
70 | # aggregate by cluster only
71 | pb <- aggregateData(example_sce, by = "cluster_id")
72 | length(assays(pb)) # single assay
73 | head(assay(pb))    # n_genes x n_clusters
74 | 
75 | }
76 | \references{
77 | Crowell, HL, Soneson, C, Germain, P-L, Calini, D, 
78 | Collin, L, Raposo, C, Malhotra, D & Robinson, MD: 
79 | On the discovery of population-specific state transitions from 
80 | multi-sample multi-condition single-cell RNA sequencing data. 
81 | \emph{bioRxiv} \strong{713412} (2018). 
82 | doi: \url{https://doi.org/10.1101/713412}
83 | }
84 | \author{
85 | Helena L Crowell & Mark D Robinson
86 | }
87 | 


--------------------------------------------------------------------------------
/man/calcExprFreqs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/calcExprFreqs.R
 3 | \name{calcExprFreqs}
 4 | \alias{calcExprFreqs}
 5 | \title{calcExprFreqs}
 6 | \usage{
 7 | calcExprFreqs(x, assay = "counts", th = 0)
 8 | }
 9 | \arguments{
10 | \item{x}{a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.}
11 | 
12 | \item{assay}{a character string specifying which assay to use.}
13 | 
14 | \item{th}{numeric threshold value above which
15 | a gene should be considered to be expressed.}
16 | }
17 | \value{
18 | a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
19 |   containing, for each cluster, an assay of dimensions #genes x #samples 
20 |   giving the fraction of cells that express each gene in each sample. 
21 |   If \code{colData(x)} contains a \code{"group_id"} column, the fraction 
22 |   of expressing cells in each each group will be included as well.
23 | }
24 | \description{
25 | Calculates gene expression frequencies
26 | }
27 | \details{
28 | \code{calcExprFreq} computes, for each sample and group (in each cluster),
29 | the fraction of cells that express a given gene. Here, a gene is considered 
30 | to be expressed when the specified measurement value (\code{assay}) 
31 | lies above the specified threshold value (\code{th}).
32 | }
33 | \examples{
34 | data(example_sce)
35 | library(SingleCellExperiment)
36 | 
37 | frq <- calcExprFreqs(example_sce)
38 | 
39 | # one assay per cluster
40 | assayNames(frq)  
41 | 
42 | # expression frequencies by
43 | # sample & group; 1st cluster:
44 | head(assay(frq))
45 | 
46 | }
47 | \author{
48 | Helena L Crowell & Mark D Robinson
49 | }
50 | 


--------------------------------------------------------------------------------
/man/data.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \name{data}
 4 | \alias{data}
 5 | \alias{example_sce}
 6 | \title{Example datasets}
 7 | \value{
 8 | a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
 9 | }
10 | \description{
11 | A \code{\link[SingleCellExperiment]{SingleCellExperiment}} containing 
12 | 10x droplet-based scRNA-seq PBCM data from 8 Lupus patients befor and after 
13 | 6h-treatment with INF-beta (16 samples in total).
14 | 
15 | The original data has been filtered to
16 | \itemize{
17 | \item{remove unassigned cells & cell multiplets}
18 | \item{retain only 4 out of 8 samples per experimental group}
19 | \item{retain only 5 out of 8 subpopulations (clusters)}
20 | \item{retain genes with a count > 1 in > 50 cells}
21 | \item{retain cells with > 200 detected genes}
22 | \item{retain at most 100 cells per cluster-sample instance}
23 | }
24 | 
25 | Assay \code{logcounts} corresponds to log-normalized values 
26 | obtained from \code{\link[scater]{logNormCounts}} with default parameters.
27 |   
28 | The original measurement data, as well as gene and cell metadata 
29 | is available through the NCBI GEO accession number GSE96583;
30 | code to reproduce this example dataset from the original data 
31 | is provided in the examples section.
32 | }
33 | \examples{
34 | \donttest{
35 | # set random seed for cell sampling
36 | set.seed(2929)
37 | 
38 | # load data
39 | library(ExperimentHub)
40 | eh <- ExperimentHub()
41 | sce <- eh[["EH2259"]]
42 | 
43 | # drop unassigned cells & multiplets
44 | sce <- sce[, !is.na(sce$cell)]
45 | sce <- sce[, sce$multiplets == "singlet"]
46 | 
47 | # keep 4 samples per group
48 | sce$id <- paste0(sce$stim, sce$ind)
49 | inds <- sample(sce$ind, 4)
50 | ids <- paste0(levels(sce$stim), rep(inds, each = 2))
51 | sce <- sce[, sce$id \%in\% ids]
52 | 
53 | # keep 5 clusters
54 | kids <- c("B cells", "CD4 T cells", "CD8 T cells",
55 |     "CD14+ Monocytes", "FCGR3A+ Monocytes")
56 | sce <- sce[, sce$cell \%in\% kids]
57 | sce$cell <- droplevels(sce$cell)
58 | 
59 | # basic filtering on  genes & cells
60 | gs <- rowSums(counts(sce) > 1) > 50
61 | cs <- colSums(counts(sce) > 0) > 200
62 | sce <- sce[gs, cs]
63 | 
64 | # sample max. 100 cells per cluster-sample
65 | cs_by_ks <- split(colnames(sce), list(sce$cell, sce$id))
66 | cs <- sapply(cs_by_ks, function(u) 
67 |     sample(u, min(length(u), 100)))
68 | sce <- sce[, unlist(cs)]
69 | 
70 | # compute logcounts
71 | library(scater)
72 | sce <- computeLibraryFactors(sce)
73 | sce <- logNormCounts(sce)
74 | 
75 | # re-format for 'muscat'
76 | sce <- prepSCE(sce, 
77 |     kid = "cell", 
78 |     sid = "id", 
79 |     gid = "stim", 
80 |     drop = TRUE)
81 | } 
82 | 
83 | }
84 | \references{
85 | Kang et al. (2018). Multiplexed droplet single-cell RNA-sequencing 
86 | using natural genetic variation. \emph{Nature Biotechnology},
87 | \bold{36}(1): 89-94. DOI: 10.1038/nbt.4042.
88 | }
89 | \author{
90 | Helena L Crowell
91 | }
92 | 


--------------------------------------------------------------------------------
/man/mmDS.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/mmDS.R, R/utils-mmDS.R
  3 | \name{mmDS}
  4 | \alias{mmDS}
  5 | \alias{.mm_dream}
  6 | \alias{.mm_dream2}
  7 | \alias{.mm_vst}
  8 | \alias{.mm_glmm}
  9 | \title{DS analysis using mixed-models (MM)}
 10 | \usage{
 11 | mmDS(
 12 |   x,
 13 |   coef = NULL,
 14 |   covs = NULL,
 15 |   method = c("dream2", "dream", "vst", "poisson", "nbinom", "hybrid"),
 16 |   n_cells = 10,
 17 |   n_samples = 2,
 18 |   min_count = 1,
 19 |   min_cells = 20,
 20 |   verbose = TRUE,
 21 |   BPPARAM = SerialParam(progressbar = verbose),
 22 |   vst = c("sctransform", "DESeq2"),
 23 |   ddf = c("Satterthwaite", "Kenward-Roger", "lme4"),
 24 |   dup_corr = FALSE,
 25 |   trended = FALSE,
 26 |   bayesian = FALSE,
 27 |   blind = TRUE,
 28 |   REML = TRUE,
 29 |   moderate = FALSE
 30 | )
 31 | 
 32 | .mm_dream(
 33 |   x,
 34 |   coef = NULL,
 35 |   covs = NULL,
 36 |   dup_corr = FALSE,
 37 |   trended = FALSE,
 38 |   ddf = c("Satterthwaite", "Kenward-Roger"),
 39 |   verbose = FALSE,
 40 |   BPPARAM = SerialParam(progressbar = verbose)
 41 | )
 42 | 
 43 | .mm_dream2(
 44 |   x,
 45 |   coef = NULL,
 46 |   covs = NULL,
 47 |   ddf = c("Satterthwaite", "Kenward-Roger"),
 48 |   verbose = FALSE,
 49 |   BPPARAM = SerialParam(progressbar = verbose)
 50 | )
 51 | 
 52 | .mm_vst(
 53 |   x,
 54 |   vst = c("sctransform", "DESeq2"),
 55 |   coef = NULL,
 56 |   covs = NULL,
 57 |   bayesian = FALSE,
 58 |   blind = TRUE,
 59 |   REML = TRUE,
 60 |   ddf = c("Satterthwaite", "Kenward-Roger", "lme4"),
 61 |   verbose = FALSE,
 62 |   BPPARAM = SerialParam(progressbar = verbose)
 63 | )
 64 | 
 65 | .mm_glmm(
 66 |   x,
 67 |   coef = NULL,
 68 |   covs = NULL,
 69 |   family = c("poisson", "nbinom"),
 70 |   moderate = FALSE,
 71 |   verbose = TRUE,
 72 |   BPPARAM = SerialParam(progressbar = verbose)
 73 | )
 74 | }
 75 | \arguments{
 76 | \item{x}{a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.}
 77 | 
 78 | \item{coef}{character specifying the coefficient to test.
 79 | If NULL (default), will test the last level of \code{"group_id"}.}
 80 | 
 81 | \item{covs}{character vector of \code{colData(x)}
 82 | column names to use as covariates.}
 83 | 
 84 | \item{method}{a character string.
 85 | Either \code{"dream2"} (default, lme4 with voom-weights),
 86 | \code{"dream"} (previous implementation of the dream method),
 87 | \code{"vst"} (variance-stabilizing transformation),
 88 | \code{"poisson"} (poisson GLM-MM),
 89 | \code{"nbinom"} (negative binomial GLM-MM),
 90 | \code{"hybrid"} (combination of pseudobulk and poisson methods)
 91 | or a function accepting the same arguments.}
 92 | 
 93 | \item{n_cells}{number of cells per cluster-sample
 94 | required to consider a sample for testing.}
 95 | 
 96 | \item{n_samples}{number of samples per group
 97 | required to consider a cluster for testing.}
 98 | 
 99 | \item{min_count}{numeric. For a gene to be tested in a given cluster,
100 | at least \code{min_cells} must have a count >= \code{min_count}.}
101 | 
102 | \item{min_cells}{number (or fraction, if < 1) of cells with a count >
103 | \code{min_count} required for a gene to be tested in a given cluster.}
104 | 
105 | \item{verbose}{logical specifying whether messages
106 | on progress and a progress bar should be displayed.}
107 | 
108 | \item{BPPARAM}{a \code{\link[BiocParallel]{BiocParallelParam}}
109 | object specifying how differential testing should be parallelized.}
110 | 
111 | \item{vst}{method to use as variance-stabilizing transformations.
112 | \code{"sctransform"} for \code{\link[sctransform]{vst}}; \code{"DESeq2"}
113 | for \code{\link[DESeq2]{varianceStabilizingTransformation}}.}
114 | 
115 | \item{ddf}{character string specifying the method for estimating
116 | the effective degrees of freedom. For \code{method = "dream"},
117 | either \code{"Satterthwaite"} (faster) or \code{"Kenward-Roger"}
118 | (more accurate); see \code{?variancePartition::dream} for details.
119 | For \code{method = "vst"}, method \code{"lme4"} is also valid;
120 | see \code{\link[lmerTest]{contest.lmerModLmerTest}}.}
121 | 
122 | \item{dup_corr}{logical; whether to use
123 | \code{\link[limma:dupcor]{duplicateCorrelation}}.}
124 | 
125 | \item{trended}{logical; whether to use expression-dependent variance priors
126 | in \code{\link[limma]{eBayes}}.}
127 | 
128 | \item{bayesian}{logical; whether to use bayesian mixed models.}
129 | 
130 | \item{blind}{logical; whether to ignore experimental design for the vst.}
131 | 
132 | \item{REML}{logical; whether to maximize REML instead of log-likelihood.}
133 | 
134 | \item{moderate}{logical; whether to perform empirical Bayes moderation.}
135 | 
136 | \item{family}{character string specifying which GLMM to fit:
137 | \code{"poisson"} for \code{\link[blme:blmer]{bglmer}},
138 | \code{"nbinom"} for \code{\link[glmmTMB]{glmmTMB}}.}
139 | }
140 | \value{
141 | a data.frame
142 | }
143 | \description{
144 | Performs cluster-wise DE analysis by fitting cell-level models.
145 | }
146 | \details{
147 | The \code{.mm_*} functions (e.g. \code{.mm_dream}) expect cells from a single
148 |  cluster, and do not perform filtering or handle incorrect parameters well.
149 | Meant to be called by \code{mmDS} with \code{method = c("dream", "vst")} and
150 | \code{vst = c("sctransform", "DESeq2")} to be applied across all clusters.
151 | \describe{
152 | \item{\code{method = "dream2"}}{
153 |   \code{variancePartition}'s (>=1.14.1) voom-lme4-implementation
154 |   of mixed models for RNA-seq data; function \code{dream}.}
155 | \item{\code{method = "dream"}}{
156 |   \code{variancePartition}'s older voom-lme4-implementation
157 |   of mixed models for RNA-seq data; function \code{dream}.}
158 | \item{\code{method = "vst"}}{
159 |   \describe{
160 |   \item{\code{vst = "sctransform"}}{
161 |     \code{lmer} or \code{blmer} mixed models on
162 |     \code{\link[sctransform]{vst}} transformed counts.}
163 |   \item{\code{vst = "DESeq2"}}{
164 |     \code{\link[DESeq2]{varianceStabilizingTransformation}}
165 |     followed by \code{lme4} mixed models.}}}}
166 | }
167 | \section{Functions}{
168 | \itemize{
169 | \item \code{.mm_dream()}: see details.
170 | 
171 | \item \code{.mm_dream2()}: see details.
172 | 
173 | \item \code{.mm_vst()}: see details.
174 | 
175 | \item \code{.mm_glmm()}: see details.
176 | 
177 | }}
178 | \examples{
179 | # subset "B cells" cluster
180 | data(example_sce)
181 | b_cells <- example_sce$cluster_id == "B cells"
182 | sub <- example_sce[, b_cells]
183 | sub$cluster_id <- droplevels(sub$cluster_id)
184 | 
185 | # downsample to 100 genes
186 | gs <- sample(nrow(sub), 100)
187 | sub <- sub[gs, ]
188 | 
189 | # run DS analysis using cell-level mixed-model
190 | res <- mmDS(sub, method = "dream", verbose = FALSE)
191 | head(res$`B cells`)
192 | 
193 | }
194 | \references{
195 | Crowell, HL, Soneson, C, Germain, P-L, Calini, D,
196 | Collin, L, Raposo, C, Malhotra, D & Robinson, MD:
197 | On the discovery of population-specific state transitions from
198 | multi-sample multi-condition single-cell RNA sequencing data.
199 | \emph{bioRxiv} \strong{713412} (2018).
200 | doi: \url{https://doi.org/10.1101/713412}
201 | }
202 | \author{
203 | Pierre-Luc Germain & Helena L Crowell
204 | }
205 | 


--------------------------------------------------------------------------------
/man/pbDS.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/pbDS.R
  3 | \name{pbDS}
  4 | \alias{pbDS}
  5 | \alias{pbDD}
  6 | \title{pseudobulk DS analysis}
  7 | \usage{
  8 | pbDS(
  9 |   pb,
 10 |   method = c("edgeR", "DESeq2", "limma-trend", "limma-voom", "DD"),
 11 |   design = NULL,
 12 |   coef = NULL,
 13 |   contrast = NULL,
 14 |   min_cells = 10,
 15 |   filter = c("both", "genes", "samples", "none"),
 16 |   treat = FALSE,
 17 |   verbose = TRUE,
 18 |   BPPARAM = SerialParam(progressbar = verbose)
 19 | )
 20 | 
 21 | pbDD(
 22 |   pb,
 23 |   design = NULL,
 24 |   coef = NULL,
 25 |   contrast = NULL,
 26 |   min_cells = 10,
 27 |   filter = c("both", "genes", "samples", "none"),
 28 |   verbose = TRUE,
 29 |   BPPARAM = SerialParam(progressbar = verbose)
 30 | )
 31 | }
 32 | \arguments{
 33 | \item{pb}{a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
 34 | containing pseudobulks as returned by \code{\link{aggregateData}}.}
 35 | 
 36 | \item{method}{a character string.}
 37 | 
 38 | \item{design}{For methods \code{"edegR"} and \code{"limma"}, a design matrix 
 39 | with row & column names(!) created with \code{\link[stats]{model.matrix}}; 
 40 | For \code{"DESeq2"}, a formula with variables in \code{colData(pb)}.
 41 | Defaults to \code{~ group_id} or the corresponding \code{model.matrix}.}
 42 | 
 43 | \item{coef}{passed to \code{\link[edgeR]{glmQLFTest}},
 44 | \code{\link[limma]{contrasts.fit}}, \code{\link[DESeq2]{results}}
 45 | for \code{method = "edgeR", "limma-x", "DESeq2"}, respectively.
 46 | Can be a list for multiple, independent comparisons.}
 47 | 
 48 | \item{contrast}{a matrix of contrasts to test for
 49 | created with \code{\link[limma]{makeContrasts}}.}
 50 | 
 51 | \item{min_cells}{a numeric. Specifies the minimum number of cells in a given 
 52 | cluster-sample required to consider the sample for differential testing.}
 53 | 
 54 | \item{filter}{character string specifying whether
 55 | to filter on genes, samples, both or neither.}
 56 | 
 57 | \item{treat}{logical specifying whether empirical Bayes moderated-t 
 58 | p-values should be computed relative to a minimum fold-change threshold. 
 59 | Only applicable for methods \code{"limma-x"} 
 60 | (\code{\link[limma:eBayes]{treat}}) and \code{"edgeR"} 
 61 | (\code{\link[edgeR]{glmTreat}}), and ignored otherwise.}
 62 | 
 63 | \item{verbose}{logical. Should information on progress be reported?}
 64 | 
 65 | \item{BPPARAM}{a \code{\link[BiocParallel]{BiocParallelParam}}
 66 | object specifying how differential testing should be parallelized.}
 67 | }
 68 | \value{
 69 | a list containing \itemize{
 70 | \item a data.frame with differential testing results,
 71 | \item a \code{\link[edgeR]{DGEList}} object of length nb.-clusters, and
 72 | \item the \code{design} matrix, and \code{contrast} or \code{coef} used.}
 73 | }
 74 | \description{
 75 | \code{pbDS} tests for DS after aggregating single-cell 
 76 |   measurements to pseudobulk data, by applying bulk RNA-seq DE methods, 
 77 |   such as \code{edgeR}, \code{DESeq2} and \code{limma}.
 78 | }
 79 | \examples{
 80 | # simulate 5 clusters, 20\% of DE genes
 81 | data(example_sce)
 82 |     
 83 | # compute pseudobulk sum-counts & run DS analysis
 84 | pb <- aggregateData(example_sce)
 85 | res <- pbDS(pb, method = "limma-trend")
 86 | 
 87 | names(res)
 88 | names(res$table)
 89 | head(res$table$stim$`B cells`)
 90 | 
 91 | # count nb. of DE genes by cluster
 92 | vapply(res$table$stim, function(u) 
 93 |   sum(u$p_adj.loc < 0.05), numeric(1))
 94 | 
 95 | # get top 5 hits for ea. cluster w/ abs(logFC) > 1
 96 | library(dplyr)
 97 | lapply(res$table$stim, function(u)
 98 |   filter(u, abs(logFC) > 1) \%>\% 
 99 |     arrange(p_adj.loc) \%>\% 
100 |     slice(seq_len(5)))
101 | 
102 | }
103 | \references{
104 | Crowell, HL, Soneson, C, Germain, P-L, Calini, D, 
105 | Collin, L, Raposo, C, Malhotra, D & Robinson, MD: 
106 | On the discovery of population-specific state transitions from 
107 | multi-sample multi-condition single-cell RNA sequencing data. 
108 | \emph{bioRxiv} \strong{713412} (2018). 
109 | doi: \url{https://doi.org/10.1101/713412}
110 | }
111 | \author{
112 | Helena L Crowell & Mark D Robinson
113 | }
114 | 


--------------------------------------------------------------------------------
/man/pbFlatten.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pbFlatten.R
 3 | \name{pbFlatten}
 4 | \alias{pbFlatten}
 5 | \title{pbFlatten
 6 | Flatten pseudobulk SCE}
 7 | \usage{
 8 | pbFlatten(pb, normalize = TRUE)
 9 | }
10 | \arguments{
11 | \item{pb}{a pseudobulk \code{\link[SingleCellExperiment]{SingleCellExperiment}}
12 | as returned by \code{\link{aggregateData}}, with different subpopulations as assays.}
13 | 
14 | \item{normalize}{logical specifying whether to compute a \code{logcpm} assay.}
15 | }
16 | \value{
17 | a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
18 | }
19 | \description{
20 | Flattens a pseudobulk \code{\link[SingleCellExperiment]{SingleCellExperiment}}
21 | as returned by \code{\link{aggregateData}} such that all cell subpopulations 
22 | are represented as a single assay.
23 | }
24 | \examples{
25 | data(example_sce)
26 | library(SingleCellExperiment)
27 | pb_stack <- aggregateData(example_sce)
28 | (pb_flat <- pbFlatten(pb_stack))
29 | ncol(pb_flat) == ncol(pb_stack)*length(assays(pb_stack))
30 | 
31 | }
32 | 


--------------------------------------------------------------------------------
/man/pbHeatmap.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pbHeatmap.R
 3 | \name{pbHeatmap}
 4 | \alias{pbHeatmap}
 5 | \title{Heatmap of cluster-sample pseudobulks}
 6 | \usage{
 7 | pbHeatmap(
 8 |   x,
 9 |   y,
10 |   k = NULL,
11 |   g = NULL,
12 |   c = NULL,
13 |   top_n = 20,
14 |   fdr = 0.05,
15 |   lfc = 1,
16 |   sort_by = "p_adj.loc",
17 |   decreasing = FALSE,
18 |   assay = "logcounts",
19 |   fun = mean,
20 |   normalize = TRUE,
21 |   col = viridis(10),
22 |   row_anno = TRUE,
23 |   col_anno = TRUE
24 | )
25 | }
26 | \arguments{
27 | \item{x}{a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.}
28 | 
29 | \item{y}{a list of DS analysis results as returned by 
30 | \code{\link{pbDS}} or \code{\link{mmDS}}.}
31 | 
32 | \item{k}{character vector; specifies which cluster ID(s) to retain.
33 | Defaults to \code{levels(x$cluster_id)}.}
34 | 
35 | \item{g}{character vector; specifies which genes to retain.
36 | Defaults to considering all genes.}
37 | 
38 | \item{c}{character string; specifies which contrast/coefficient to retain.
39 | Defaults to \code{names(y$table)[1]}.}
40 | 
41 | \item{top_n}{single numeric; number of genes to retain per cluster.}
42 | 
43 | \item{fdr, lfc}{single numeric; FDR and logFC cutoffs to filter results by.
44 | The specified FDR threshold is applied to \code{p_adj.loc} values.}
45 | 
46 | \item{sort_by}{character string specifying 
47 | a numeric results table column to sort by;
48 | \code{"none"} to retain original ordering.}
49 | 
50 | \item{decreasing}{logical; whether to sort 
51 | in decreasing order of \code{sort_by}.}
52 | 
53 | \item{assay}{character string; specifies which assay to use;
54 | should be one of \code{assayNames(x)}.}
55 | 
56 | \item{fun}{function to use as summary statistic, 
57 | e.g., mean, median, sum (depending on the input assay).}
58 | 
59 | \item{normalize}{logical; whether to apply a z-normalization 
60 | to each row (gene) of the cluster-sample pseudobulk data.}
61 | 
62 | \item{col}{character vector of colors or color mapping function
63 | generated with \code{\link[circlize]{colorRamp2}}. Passed to 
64 | argument \code{col} in \code{\link[ComplexHeatmap]{Heatmap}}
65 | (see \code{?ComplexHeatmap::Heatmap} for details).}
66 | 
67 | \item{row_anno, col_anno}{logical; whether to render
68 | annotations of cluster and group IDs, respectively.}
69 | }
70 | \value{
71 | a \code{\link{HeatmapList-class}} object.
72 | }
73 | \description{
74 | ...
75 | }
76 | \examples{
77 | # compute pseudobulks & run DS analysis
78 | data(example_sce)
79 | pb <- aggregateData(example_sce)
80 | res <- pbDS(pb)
81 | 
82 | # cluster-sample expression means
83 | pbHeatmap(example_sce, res)
84 | 
85 | # include only a single cluster
86 | pbHeatmap(example_sce, res, k = "B cells")
87 | 
88 | # plot specific gene across all clusters
89 | pbHeatmap(example_sce, res, g = "ISG20")
90 | 
91 | }
92 | \author{
93 | Helena L Crowell
94 | }
95 | 


--------------------------------------------------------------------------------
/man/pbMDS.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pbMDS.R
 3 | \name{pbMDS}
 4 | \alias{pbMDS}
 5 | \title{Pseudobulk-level MDS plot}
 6 | \usage{
 7 | pbMDS(x)
 8 | }
 9 | \arguments{
10 | \item{x}{a \code{\link[SingleCellExperiment]{SingleCellExperiment}} 
11 | containing cluster-sample pseudobulks as returned by 
12 | \code{\link{aggregateData}} with argument 
13 | \code{by = c("cluster_id", "sample_id")}.}
14 | }
15 | \value{
16 | a \code{ggplot} object.
17 | }
18 | \description{
19 | Renders a multidimensional scaling (MDS) 
20 |   where each point represents a cluster-sample instance; 
21 |   with points colored by cluster ID and shaped by group ID.
22 | }
23 | \examples{
24 | data(example_sce)
25 | pb <- aggregateData(example_sce)
26 | pbMDS(pb)
27 | 
28 | }
29 | \author{
30 | Helena L Crowell & Mark D Robinson
31 | }
32 | 


--------------------------------------------------------------------------------
/man/prepSCE.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/prepSCE.R
 3 | \name{prepSCE}
 4 | \alias{prepSCE}
 5 | \title{Prepare SCE for DS analysis}
 6 | \usage{
 7 | prepSCE(
 8 |   x,
 9 |   kid = "cluster_id",
10 |   sid = "sample_id",
11 |   gid = "group_id",
12 |   drop = FALSE
13 | )
14 | }
15 | \arguments{
16 | \item{x}{a \linkS4class{SingleCellExperiment}.}
17 | 
18 | \item{kid, sid, gid}{character strings specifying
19 | the \code{colData(x)} columns containing cluster assignments,
20 | unique sample identifiers, and group IDs (e.g., treatment).}
21 | 
22 | \item{drop}{logical. Specifies whether \code{colData(x)} columns
23 | besides those specified as \code{cluster_id,sample_id,group_id}
24 | should be retained (default \code{drop = FALSE}) 
25 | or removed (\code{drop = TRUE}).}
26 | }
27 | \value{
28 | a \linkS4class{SingleCellExperiment}.
29 | }
30 | \description{
31 | ...
32 | }
33 | \examples{
34 | # generate random counts
35 | ng <- 50
36 | nc <- 200
37 |     
38 | # generate some cell metadata
39 | gids <- sample(c("groupA", "groupB"), nc, TRUE)   
40 | sids <- sample(paste0("sample", seq_len(3)), nc, TRUE) 
41 | kids <- sample(paste0("cluster", seq_len(5)), nc, TRUE) 
42 | batch <- sample(seq_len(3), nc, TRUE)
43 | cd <- data.frame(group = gids, id = sids, cluster = kids, batch)
44 | 
45 | # construct SCE
46 | library(scuttle)
47 | sce <- mockSCE(ncells = nc, ngenes = ng)
48 | colData(sce) <- cbind(colData(sce), cd)
49 | 
50 | # prep. for workflow
51 | sce <- prepSCE(sce, kid = "cluster", sid = "id", gid = "group")
52 | head(colData(sce))
53 | metadata(sce)$experiment_info
54 | sce
55 | 
56 | }
57 | \author{
58 | Helena L Crowell
59 | }
60 | 


--------------------------------------------------------------------------------
/man/prepSim.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/prepSim.R
 3 | \name{prepSim}
 4 | \alias{prepSim}
 5 | \title{SCE preparation for \code{\link{simData}}}
 6 | \usage{
 7 | prepSim(
 8 |   x,
 9 |   min_count = 1,
10 |   min_cells = 10,
11 |   min_genes = 100,
12 |   min_size = 100,
13 |   group_keep = NULL,
14 |   verbose = TRUE
15 | )
16 | }
17 | \arguments{
18 | \item{x}{a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.}
19 | 
20 | \item{min_count, min_cells}{used for filtering of genes; only genes with 
21 | a count > \code{min_count} in >= \code{min_cells} will be retained.}
22 | 
23 | \item{min_genes}{used for filtering cells;
24 | only cells with a count > 0 in >= \code{min_genes} will be retained.}
25 | 
26 | \item{min_size}{used for filtering subpopulation-sample combinations;
27 | only instances with >= \code{min_size} cells will be retained.
28 | Specifying \code{min_size = NULL} skips this step.}
29 | 
30 | \item{group_keep}{character string; if \code{nlevels(x$group_id) > 1},
31 | specifies which group of samples to keep (see details). The default
32 | NULL retains samples from \code{levels(x$group_id)[1]}; otherwise, 
33 | if `colData(x)$group_id` is not specified, all samples will be kept.}
34 | 
35 | \item{verbose}{logical; should information on progress be reported?}
36 | }
37 | \value{
38 | a \code{\link[SingleCellExperiment]{SingleCellExperiment}} 
39 |   containing, for each cell, library size (\code{colData(x)$offset})
40 |   and, for each gene, dispersion and sample-specific mean estimates 
41 |   (\code{rowData(x)$dispersion} and \code{$beta.sample_id}, respectively).
42 | }
43 | \description{
44 | \code{prepSim} prepares an input SCE for simulation 
45 |   with \code{muscat}'s \code{\link{simData}} function by 
46 | \enumerate{
47 |   \item{basic filtering of genes and cells}
48 |   \item{(optional) filtering of subpopulation-sample instances}
49 |   \item{estimation of cell (library sizes) and gene parameters 
50 |   (dispersions and sample-specific means), respectively.}
51 | }
52 | }
53 | \details{
54 | For each gene \eqn{g}, \code{prepSim} fits a model to estimate 
55 |   sample-specific means \eqn{\beta_g^s}, for each sample \eqn{s}, 
56 |   and dispersion parameters \eqn{\phi_g} using \code{edgeR}'s 
57 |   \code{\link[edgeR]{estimateDisp}} function with default parameters. 
58 |   Thus, the reference count data is modeled as NB distributed: 
59 |   \deqn{Y_{gc} \sim NB(\mu_{gc}, \phi_g)}
60 |   for gene \eqn{g} and cell \eqn{c}, where the mean 
61 |   \eqn{\mu_{gc} = \exp(\beta_{g}^{s(c)}) \cdot \lambda_c}. Here, 
62 |   \eqn{\beta_{g}^{s(c)}} is the relative abundance of gene \eqn{g} 
63 |   in sample \eqn{s(c)}, \eqn{\lambda_c} is the library size 
64 |   (total number of counts), and \eqn{\phi_g} is the dispersion.
65 | }
66 | \examples{
67 | # estimate simulation parameters
68 | data(example_sce)
69 | ref <- prepSim(example_sce)
70 | 
71 | # tabulate number of genes/cells before vs. after
72 | ns <- cbind(
73 |   before = dim(example_sce), 
74 |   after = dim(ref)) 
75 | rownames(ns) <- c("#genes", "#cells")
76 | ns
77 | 
78 | library(SingleCellExperiment)
79 | head(rowData(ref)) # gene parameters
80 | head(colData(ref)) # cell parameters
81 | 
82 | }
83 | \references{
84 | Crowell, HL, Soneson, C, Germain, P-L, Calini, D, 
85 | Collin, L, Raposo, C, Malhotra, D & Robinson, MD: 
86 | On the discovery of population-specific state transitions from 
87 | multi-sample multi-condition single-cell RNA sequencing data. 
88 | \emph{bioRxiv} \strong{713412} (2018). 
89 | doi: \url{https://doi.org/10.1101/713412}
90 | }
91 | \author{
92 | Helena L Crowell
93 | }
94 | 


--------------------------------------------------------------------------------
/man/resDS.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/resDS.R
 3 | \name{resDS}
 4 | \alias{resDS}
 5 | \title{resDS
 6 | Formatting of DS analysis results}
 7 | \usage{
 8 | resDS(
 9 |   x,
10 |   y,
11 |   bind = c("row", "col"),
12 |   frq = FALSE,
13 |   cpm = FALSE,
14 |   digits = 3,
15 |   sep = "__",
16 |   ...
17 | )
18 | }
19 | \arguments{
20 | \item{x}{a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.}
21 | 
22 | \item{y}{a list of DS testing results as returned 
23 | by \code{\link{pbDS}} or \code{\link{mmDS}}.}
24 | 
25 | \item{bind}{character string specifying the output format (see details).}
26 | 
27 | \item{frq}{logical or a pre-computed list of expression frequencies 
28 | as returned by \code{\link{calcExprFreqs}}.}
29 | 
30 | \item{cpm}{logical specifying whether CPM by cluster-sample 
31 | should be appendeded to the output result table(s).}
32 | 
33 | \item{digits}{integer value specifying the 
34 | number of significant digits to maintain.}
35 | 
36 | \item{sep}{character string to use as separator 
37 | when constructing new column names.}
38 | 
39 | \item{...}{optional arguments passed to 
40 | \code{\link{calcExprFreqs}} if \code{frq = TRUE}.}
41 | }
42 | \value{
43 | returns a `data.frame`.
44 | }
45 | \description{
46 | \code{resDS} provides a simple wrapper to format cluster-level
47 | differential testing results into an easily filterable table, and
48 | to optionally append gene expression frequencies by cluster-sample
49 | & -group, as well as cluster-sample-wise CPM.
50 | }
51 | \details{
52 | When \code{bind = "col"}, the list of DS testing results at 
53 | \code{y$table} will be merge vertically (by column) into a single table
54 | in tidy format with column \code{contrast/coef} specifying the comparison.
55 |   
56 | Otherwise, when \code{bind = "row"}, an identifier of the respective
57 | contrast or coefficient will be appended to the column names,
58 | and all tables will be merge horizontally (by row).
59 |   
60 | Expression frequencies pre-computed with \code{\link{calcExprFreqs}} 
61 | may be provided with \code{frq}. Alternatively, when \code{frq = TRUE}, 
62 | expression frequencies can be computed directly, and additional arguments 
63 | may be passed to \code{\link{calcExprFreqs}} (see examples below).
64 | }
65 | \examples{
66 | # compute pseudobulks (sum of counts)
67 | data(example_sce)
68 | pb <- aggregateData(example_sce, 
69 |   assay = "counts", fun = "sum")
70 | 
71 | # run DS analysis (edgeR on pseudobulks)
72 | res <- pbDS(pb, method = "edgeR")
73 | 
74 | head(resDS(example_sce, res, bind = "row")) # tidy format
75 | head(resDS(example_sce, res, bind = "col", digits = Inf))
76 | 
77 | # append CPMs & expression frequencies
78 | head(resDS(example_sce, res, cpm = TRUE))
79 | head(resDS(example_sce, res, frq = TRUE))
80 | 
81 | # pre-computed expression frequencies & append
82 | frq <- calcExprFreqs(example_sce, assay = "counts", th = 0)
83 | head(resDS(example_sce, res, frq = frq))
84 | 
85 | }
86 | \author{
87 | Helena L Crowell & Mark D Robinson
88 | }
89 | 


--------------------------------------------------------------------------------
/man/simData.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/simData.R
  3 | \name{simData}
  4 | \alias{simData}
  5 | \title{simData}
  6 | \usage{
  7 | simData(
  8 |   x,
  9 |   ng = nrow(x),
 10 |   nc = ncol(x),
 11 |   ns = NULL,
 12 |   nk = NULL,
 13 |   probs = NULL,
 14 |   dd = TRUE,
 15 |   p_dd = diag(6)[1, ],
 16 |   paired = FALSE,
 17 |   p_ep = 0.5,
 18 |   p_dp = 0.3,
 19 |   p_dm = 0.5,
 20 |   p_type = 0,
 21 |   lfc = 2,
 22 |   rel_lfc = NULL,
 23 |   phylo_tree = NULL,
 24 |   phylo_pars = c(ifelse(is.null(phylo_tree), 0, 0.1), 3),
 25 |   force = FALSE
 26 | )
 27 | }
 28 | \arguments{
 29 | \item{x}{a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.}
 30 | 
 31 | \item{ng}{number of genes to simulate. Importantly, for the library sizes 
 32 | computed by \code{\link{prepSim}} (= \code{exp(x$offset)}) to make sense, 
 33 | the number of simulated genes should match with the number of genes 
 34 | in the reference. To simulate a reduced number of genes, e.g. for 
 35 | testing and development purposes, please set \code{force = TRUE}.}
 36 | 
 37 | \item{nc}{number of cells to simulate.}
 38 | 
 39 | \item{ns}{number of samples to simulate; defaults to as many as
 40 | available in the reference to avoid duplicated reference samples. 
 41 | Specifically, the number of samples will be set to 
 42 | \code{n = nlevels(x$sample_id)} when \code{dd = FALSE}, 
 43 | \code{n} per group  when \code{dd, paired = TRUE}, and 
 44 | \code{floor(n/2)} per group when \code{dd = TRUE, paired = FALSE}.
 45 | When a larger number samples should be simulated, set \code{force = TRUE}.}
 46 | 
 47 | \item{nk}{number of clusters to simulate; defaults to the number
 48 | of available reference clusters (\code{nlevels(x$cluster_id)}).}
 49 | 
 50 | \item{probs}{a list of length 3 containing probabilities of a cell belonging
 51 | to each cluster, sample, and group, respectively. List elements must be 
 52 | NULL (equal probabilities) or numeric values in [0, 1] that sum to 1.}
 53 | 
 54 | \item{dd}{whether or not to simulate differential distributions; if TRUE, 
 55 | two groups are simulated and \code{ns} corresponds to the number of 
 56 | samples per group, else one group with \code{ns} samples is simulated.}
 57 | 
 58 | \item{p_dd}{numeric vector of length 6.
 59 | Specifies the probability of a gene being
 60 | EE, EP, DE, DP, DM, or DB, respectively.}
 61 | 
 62 | \item{paired}{logical specifying whether a paired design should 
 63 | be simulated (both groups use the same set of reference samples) 
 64 | or not (reference samples are drawn at random).}
 65 | 
 66 | \item{p_ep, p_dp, p_dm}{numeric specifying the proportion of cells
 67 | to be shifted to a different expression state in one group (see details).}
 68 | 
 69 | \item{p_type}{numeric. Probability of EE/EP gene being a type-gene.
 70 | If a gene is of class "type" in a given cluster, a unique mean 
 71 | will be used for that gene in the respective cluster.}
 72 | 
 73 | \item{lfc}{numeric value to use as mean logFC 
 74 | (logarithm base 2) for DE, DP, DM, and DB type of genes.}
 75 | 
 76 | \item{rel_lfc}{numeric vector of relative logFCs for each cluster. 
 77 | Should be of length \code{nlevels(x$cluster_id)} with 
 78 | \code{levels(x$cluster_id)} as names. 
 79 | Defaults to factor of 1 for all clusters.}
 80 | 
 81 | \item{phylo_tree}{newick tree text representing cluster relations 
 82 | and their relative distance. An explanation of the syntax can be found 
 83 | \href{http://evolution.genetics.washington.edu/phylip/newicktree.html}{here}. 
 84 | The distance between the nodes, except for the original branch, will be 
 85 | translated in the number of shared genes between the clusters belonging to 
 86 | these nodes (this relation is controlled with \code{phylo_pars}). 
 87 | The distance between two clusters is defined as the sum 
 88 | of the branches lengths separating them.}
 89 | 
 90 | \item{phylo_pars}{vector of length 2 providing the parameters that control 
 91 | the number of type genes. Passed to an exponential PDF (see details).}
 92 | 
 93 | \item{force}{logical specifying whether to force simulation 
 94 | when \code{ng} and/or \code{ns} don't match the number of 
 95 | available reference genes and samples, respectively.}
 96 | }
 97 | \value{
 98 | a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
 99 |   containing multiple clusters & samples across 2 groups 
100 |   as well as the following metadata: \describe{
101 |   \item{cell metadata (\code{colData(.)})}{a \code{DataFrame} containing,
102 |   containing, for each cell, it's cluster, sample, and group ID.}
103 |   \item{gene metadata (\code{rowData(.)})}{a \code{DataFrame} containing, 
104 |   for each gene, it's \code{class} (one of "state", "type", "none") and 
105 |   specificity (\code{specs}; NA for genes of type "state", otherwise 
106 |   a character vector of clusters that share the given gene).}
107 |   \item{experiment metadata (\code{metadata(.)})}{
108 |   \describe{
109 |   \item{\code{experiment_info}}{a \code{data.frame} 
110 |   summarizing the experimental design.}
111 |   \item{\code{n_cells}}{the number of cells for each sample.}
112 |   \item{\code{gene_info}}{a \code{data.frame} containing, for each gene
113 |   in each cluster, it's differential distribution \code{category},
114 |   mean \code{logFC} (NA for genes for categories "ee" and "ep"),
115 |   gene used as reference (\code{sim_gene}), dispersion \code{sim_disp},
116 |   and simulation means for each group \code{sim_mean.A/B}.}
117 |   \item{\code{ref_sids/kidskids}}{the sample/cluster IDs used as reference.}
118 |   \item{\code{args}}{a list of the function call's input arguments.}}}}
119 | }
120 | \description{
121 | Simulation of complex scRNA-seq data
122 | }
123 | \details{
124 | \code{simData} simulates multiple clusters and samples 
125 | across 2 experimental conditions from a real scRNA-seq data set.
126 | 
127 | The simulation of type genes can be performed in 2 ways; 
128 |   (1) via \code{p_type} to simulate independent clusters, OR 
129 |   (2) via \code{phylo_tree} to simulate a hierarchical cluster structure.
130 |   
131 |   For (1), a subset of \code{p_type} \% of genes are selected per cluster
132 |   to use a different references genes than the remainder of clusters,
133 |   giving rise to cluster-specific NB means for count sampling.
134 |   
135 |   For (2), the number of shared/type genes at each node 
136 |   are given by \code{a*G*e^(-b*d)}, where \itemize{
137 |   \item{\code{a} -- controls the percentage of shared genes between nodes.
138 |   By default, at most 10\% of the genes are reserved as type genes
139 |   (when \code{b} = 0). However, it is advised to tune this parameter 
140 |   depending on the input \code{prep_sce}.}
141 |   \item{\code{b} -- determines how the number of shared genes 
142 |   decreases with increasing distance d between clusters 
143 |   (defined through \code{phylo_tree}).}}
144 | }
145 | \examples{
146 | data(example_sce)
147 | library(SingleCellExperiment)
148 | 
149 | # prep. SCE for simulation
150 | ref <- prepSim(example_sce)
151 | 
152 | # simulate data
153 | (sim <- simData(ref, nc = 200,
154 |   p_dd = c(0.9, 0, 0.1, 0, 0, 0),
155 |   ng = 100, force = TRUE,
156 |   probs = list(NULL, NULL, c(1, 0))))
157 | 
158 | # simulation metadata
159 | head(gi <- metadata(sim)$gene_info)
160 | 
161 | # should be ~10\% DE  
162 | table(gi$category)
163 | 
164 | # unbalanced sample sizes
165 | sim <- simData(ref, nc = 100, ns = 2,
166 |   probs = list(NULL, c(0.25, 0.75), NULL),
167 |   ng = 10, force = TRUE)
168 | table(sim$sample_id)
169 | 
170 | # one group only
171 | sim <- simData(ref, nc = 100,
172 |   probs = list(NULL, NULL, c(1, 0)),
173 |   ng = 10, force = TRUE)
174 | levels(sim$group_id)
175 | 
176 | # HIERARCHICAL CLUSTER STRUCTURE
177 | # define phylogram specifying cluster relations
178 | phylo_tree <- "(('cluster1':0.1,'cluster2':0.1):0.4,'cluster3':0.5);"
179 | # verify syntax & visualize relations
180 | library(phylogram)
181 | plot(read.dendrogram(text = phylo_tree))
182 | 
183 | # let's use a more complex phylogeny
184 | phylo_tree <- "(('cluster1':0.4,'cluster2':0.4):0.4,('cluster3':
185 |   0.5,('cluster4':0.2,'cluster5':0.2,'cluster6':0.2):0.4):0.4);"
186 | plot(read.dendrogram(text = phylo_tree))
187 | 
188 | # simulate clusters accordingly
189 | sim <- simData(ref, 
190 |   phylo_tree = phylo_tree, 
191 |   phylo_pars = c(0.1, 3), 
192 |   ng = 500, force = TRUE)
193 | # view information about shared 'type' genes
194 | table(rowData(sim)$class)
195 | 
196 | }
197 | \references{
198 | Crowell, HL, Soneson, C, Germain, P-L, Calini, D, 
199 | Collin, L, Raposo, C, Malhotra, D & Robinson, MD: 
200 | On the discovery of population-specific state transitions from 
201 | multi-sample multi-condition single-cell RNA sequencing data. 
202 | \emph{bioRxiv} \strong{713412} (2018). 
203 | doi: \url{https://doi.org/10.1101/713412}
204 | }
205 | \author{
206 | Helena L Crowell & Anthony Sonrel
207 | }
208 | 


--------------------------------------------------------------------------------
/man/stagewise_DS_DD.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/stagewiseDD.R
 3 | \name{stagewise_DS_DD}
 4 | \alias{stagewise_DS_DD}
 5 | \title{Perform two-stage testing on DS and DD analysis results}
 6 | \usage{
 7 | stagewise_DS_DD(res_DS, res_DD, sce = NULL, verbose = FALSE)
 8 | }
 9 | \arguments{
10 | \item{res_DS}{a list of DS testing results as returned 
11 | by \code{\link{pbDS}} or \code{\link{mmDS}}.}
12 | 
13 | \item{res_DD}{a list of DD testing results as returned 
14 | by \code{\link{pbDD}} (or \code{\link{pbDS}} with \code{method="DD"}).}
15 | 
16 | \item{sce}{(optional) \code{SingleCellExperiment} object containing the data 
17 | that underlies testing, prior to summarization with \code{\link{aggregateData}}. 
18 | Used for validation of inputs in order to prevent unexpected failure/results.}
19 | 
20 | \item{verbose}{logical. Should information on progress be reported?}
21 | }
22 | \value{
23 | A list of \code{DFrame}s containing results for each contrast and cluster.
24 | Each table contains DS and DD results for genes shared between analyses,
25 | as well as results from stagewise testing analysis, namely:
26 | \itemize{
27 | \item{\code{p_adj}: FDR adjusted p-values for the
28 |   screening hypothesis that a gene is neither DS nor DD
29 |   (see \code{?stageR::getAdjustedPValues} for details)}
30 | \item{\code{p_val.DS/D}: confirmation stage p-values for DS/D}}
31 | }
32 | \description{
33 | Perform two-stage testing on DS and DD analysis results
34 | }
35 | \examples{
36 | data(example_sce)
37 | 
38 | pbs_sum <- aggregateData(example_sce, assay="counts", fun="sum")
39 | pbs_det <- aggregateData(example_sce, assay="counts", fun="num.detected")
40 |   
41 | res_DS <- pbDS(pbs_sum, min_cells=0, filter="none", verbose=FALSE)
42 | res_DD <- pbDD(pbs_det, min_cells=0, filter="none", verbose=FALSE)
43 | 
44 | res <- stagewise_DS_DD(res_DS, res_DD)
45 | head(res[[1]][[1]]) # results for 1st cluster
46 | 
47 | }
48 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(muscat)
3 | 
4 | test_check("muscat")
5 | 


--------------------------------------------------------------------------------
/tests/testthat/test-aggregateData.R:
--------------------------------------------------------------------------------
 1 | # load packages
 2 | suppressMessages({
 3 |     library(SingleCellExperiment)
 4 | })
 5 | 
 6 | # generate toy dataset
 7 | seed <- as.numeric(format(Sys.Date(), "%s"))
 8 | set.seed(seed)
 9 | sce <- .toySCE()
10 | nk <- length(kids <- levels(sce$cluster_id))
11 | ns <- length(sids <- levels(sce$sample_id))
12 | ng <- length(gids <- levels(sce$group_id))
13 | 
14 | test_that("aggregation across 2 factors", {
15 |     for (fun in c("sum", "mean", "median")) { 
16 |         pb <- aggregateData(sce, by = c("cluster_id", "sample_id"), fun = fun)
17 |         expect_error(aggregateData(sce, assay = "x"))
18 |         expect_error(aggregateData(sce, fun = "x"))
19 |         expect_error(aggregateData(sce, by = "x"))
20 |         
21 |         pars <- metadata(pb)$agg_pars
22 |         expect_identical(pars$by, c("cluster_id", "sample_id"))
23 |         expect_identical(pars$assay, "counts")
24 |         expect_identical(pars$fun, fun)
25 |         expect_false(pars$scale)
26 |         
27 |         expect_is(pb, "SingleCellExperiment")
28 |         expect_identical(assayNames(pb), kids)
29 |         
30 |         expect_identical(nrow(pb), nrow(sce))
31 |         expect_identical(ncol(pb), ns)
32 |         
33 |         expect_identical(rownames(pb), rownames(sce))
34 |         expect_identical(colnames(pb), sids)
35 |         
36 |         expect_equivalent(.n_cells(pb), table(sce$cluster_id, sce$sample_id))
37 |         
38 |         # 10x random spot check
39 |         replicate(10, {
40 |             k <- sample(kids, 1)
41 |             s <- sample(sids, 1)
42 |             g <- sample(rownames(sce), 1)
43 |             i <- sce$sample_id == s & sce$cluster_id == k
44 |             expect_equal(assays(pb)[[k]][g, s], get(fun)(assay(sce)[g, i]))
45 |         })
46 |     }
47 | })
48 | 
49 | test_that("aggregation across 1 factor", {
50 |     for (fun in c("sum", "mean", "median")) {
51 |         pb <- aggregateData(sce, by = "cluster_id", fun = fun)
52 |         expect_is(pb, "SingleCellExperiment")
53 |         expect_identical(nrow(pb), nrow(sce))
54 |         expect_identical(ncol(pb), nk)
55 |         expect_identical(rownames(pb), rownames(sce))
56 |         expect_identical(colnames(pb), kids)
57 |         expect_equivalent(table(sce$cluster_id), .n_cells(pb))
58 |         # random spot check
59 |         k <- sample(kids, 1)
60 |         g <- sample(rownames(sce), 1)
61 |         i <- sce$cluster_id == k
62 |         expect_equal(assay(pb)[g, k], get(fun)(assay(sce)[g, i]))
63 |     }
64 | })
65 | 
66 | test_that("pbFlatten()", {
67 |     x <- aggregateData(sce, by = "cluster_id")
68 |     expect_error(pbFlatten(x))
69 |     x <- aggregateData(sce)
70 |     cd <- colData(y <- pbFlatten(x))
71 |     expect_is(y, "SingleCellExperiment")
72 |     expect_true(length(assays(y)) == 2)
73 |     expect_identical(rownames(y), rownames(x))
74 |     expect_true(ncol(y) == nk*ncol(x))
75 |     a <- do.call(cbind, as.list(assays(x)))
76 |     expect_equivalent(assay(y), a)
77 |     expect_true(all(table(cd$sample_id) == nk))
78 |     expect_true(all(table(cd$cluster_id) == ns))
79 |     expect_true(all(table(cd$group_id) == ns/ng*nk))
80 |     expect_equivalent(y$n_cells, c(table(sce$sample_id, sce$cluster_id)))
81 |     # without normalization
82 |     y <- pbFlatten(x, normalize = FALSE)
83 |     expect_true(assayNames(y) == "counts")
84 | })
85 | 


--------------------------------------------------------------------------------
/tests/testthat/test-calcExprFreqs.R:
--------------------------------------------------------------------------------
 1 | # load packages
 2 | suppressMessages({
 3 |     library(SingleCellExperiment)
 4 | })
 5 | 
 6 | # generate toy dataset
 7 | seed <- as.numeric(format(Sys.Date(), "%s"))
 8 | set.seed(seed)
 9 | sce <- .toySCE()
10 | 
11 | # put in 50% random 0s
12 | n <- length(assay(sce))
13 | i <- sample(n, round(n * 0.5))
14 | assay(sce)[i] <- 0
15 | 
16 | # store nb. / IDs of clusters, samples, groups
17 | nk <- length(kids <- levels(sce$cluster_id))
18 | ns <- length(sids <- levels(sce$sample_id))
19 | ng <- length(gids <- levels(sce$group_id))
20 | 
21 | # calculate expression frequencies
22 | x <- calcExprFreqs(sce, assay = "counts", th = 0)
23 | 
24 | test_that("Output is correctly structured SCE", {
25 |     expect_is(x, "SingleCellExperiment")
26 |     expect_identical(assayNames(x), kids)
27 |     
28 |     expect_identical(nrow(x), nrow(sce))
29 |     expect_identical(ncol(x), ns + ng)
30 | 
31 |     expect_identical(rownames(x), rownames(sce))
32 |     expect_identical(colnames(x), c(sids, gids))
33 | })
34 | 
35 | test_that("Frequencies lie in [0, 1] w/o any NAs", {
36 |     v <- unlist(assays(x))
37 |     expect_true(all(v >= 0))
38 |     expect_true(all(v <= 1))
39 |     expect_true(!any(is.na(v)))
40 | })
41 | 
42 | test_that("10x random spot checks", {
43 |     replicate(10, {
44 |         # sample cluster, sample & group
45 |         k <- sample(kids, 1)
46 |         s <- sample(sids, 1)
47 |         g <- sample(gids, 1)
48 |         ki <- sce$cluster_id == k
49 |         si <- sce$sample_id == s & ki
50 |         gi <- sce$group_id == g & ki
51 |         # sample gene & check frequencies vs. truth
52 |         i <- sample(rownames(sce), 1)
53 |         expect_identical(mean(counts(sce)[i, si] > 0), assays(x)[[k]][i, s])
54 |         expect_identical(mean(counts(sce)[i, gi] > 0), assays(x)[[k]][i, g])
55 |     })
56 | })
57 | 


--------------------------------------------------------------------------------
/tests/testthat/test-mmDS.R:
--------------------------------------------------------------------------------
 1 | # load packages
 2 | suppressPackageStartupMessages({
 3 |     library(dplyr)
 4 |     library(purrr)
 5 |     library(SingleCellExperiment)
 6 | })
 7 | 
 8 | # generate toy dataset
 9 | seed <- as.numeric(format(Sys.time(), "%s"))
10 | set.seed(seed)
11 | x <- .toySCE(c(200, 2e3))
12 | x <- x[, x$group_id != "g3"]
13 | 
14 | nk <- length(kids <- x$cluster_id)
15 | ns <- length(sids <- x$sample_id)
16 | ng <- length(gids <- x$group_id)
17 | 
18 | test_that(".check_args_mmDS()", {
19 |     y <- x; class(y) <- "x"
20 |     expect_error(mmDS(y))
21 |     y <- x; assayNames(y) <- "x"
22 |     expect_error(mmDS(y))
23 |     expect_error(mmDS(x, "x"))
24 |     expect_error(mmDS(x, 100))
25 |     expect_error(mmDS(x, covs = "x"))
26 |     expect_error(mmDS(x, method = "x"))
27 |     expect_error(mmDS(x, method = "vst", vst = "x"))
28 |     for (u in c("verbose", "dup_corr", "trended", "bayesian", "blind", "REML")) 
29 |         for (v in list("x", 1, c(TRUE, TRUE))) {
30 |             w <- list(x = x); w[[u]] <- v
31 |             expect_error(do.call(mmDS, w))
32 |         }
33 | })
34 | 
35 | test_that("mmDS() - filtering", {
36 |     expect_error(mmDS(x, n_cells = Inf))
37 |     ks <- sample(kids, 2)
38 |     ks <- as.character(ks)
39 |     cs <- x$cluster_id %in% ks
40 |     ls <- rowSums(assay(x[, cs]))
41 |     o <- order(ls, decreasing = TRUE)
42 |     gs <- rownames(x)[o][seq_len(5)]
43 |     z <- suppressWarnings(
44 |         mmDS(x[gs, cs], verbose = FALSE))
45 |     expect_setequal(names(z), ks)
46 |     expect_true(all(vapply(map(z, "gene"), 
47 |         function(u) all(u == gs), logical(1))))
48 |     expect_true(all(vapply(ks, function(k) 
49 |         all(z[[k]]$cluster_id == k), logical(1))))
50 |     y <- x[gs, cs]; metadata(y) <- list()
51 |     y$group_id <- NULL; expect_error(mmDS(y))
52 | })
53 | 
54 | # randomly select 'n_de' genes & bump counts for group 2
55 | n_de <- 5; g2 <- gids == "g2"
56 | de_gs <- sample(rownames(x), n_de)
57 | de_es <- rnbinom(n_de * sum(g2), size = 4, mu = 8)
58 | de_es <- matrix(de_es, n_de, sum(g2))
59 | assay(x[de_gs, g2], withDimnames = FALSE) <- de_es
60 | vst <- .vst_sctransform(x, verbose = FALSE)
61 | assay(x, "vstresiduals") <- vst
62 | 
63 | for (fun in paste0(".mm_", c("dream", "dream2", "vst"))) {
64 |     test_that(paste("mmDS-utils;", fun), {
65 |         cs <- x$cluster_id == kids[1]
66 |         gs <- c(de_gs, sample(setdiff(rownames(x), de_gs), 15))
67 |         # currently not passing; likely because 
68 |         # the toy data is too simplistic...
69 |         # "poisson", "hybrid", "nbinom"
70 |         y <- suppressWarnings(
71 |             get(fun)(x[gs, cs], verbose = FALSE))
72 |         expect_is(y, "data.frame")
73 |         expect_identical(rownames(y), gs)
74 |         if (grepl("dream", fun))
75 |             y <- y[y$logFC > 0, ]
76 |         top <- order(y$p_adj.loc)[seq_len(n_de)]
77 |         expect_setequal(rownames(y)[top], de_gs)
78 |     })
79 | }
80 | 


--------------------------------------------------------------------------------
/tests/testthat/test-pbDS.R:
--------------------------------------------------------------------------------
  1 | # load packages
  2 | suppressMessages({
  3 |     library(dplyr)
  4 |     library(purrr)
  5 |     library(SingleCellExperiment)
  6 | })
  7 | 
  8 | # generate toy dataset
  9 | seed <- as.numeric(format(Sys.time(), "%s"))
 10 | set.seed(seed)
 11 | sce <- .toySCE(c(100, 2e3))
 12 | 
 13 | nk <- length(kids <- levels(sce$cluster_id))
 14 | ns <- length(sids <- levels(sce$sample_id))
 15 | ng <- length(gids <- levels(sce$group_id))
 16 | 
 17 | g3 <- sce$group_id == "g3"
 18 | g23 <- sce$group_id %in% c("g2", "g3")
 19 | 
 20 | # sample 'nde' genes & multiply counts by 10 for 'g2'- & 'g3'-cells
 21 | degs <- sample(rownames(sce), (nde <- 5))
 22 | assay(sce[degs, g23]) <- assay(sce[degs, g23]) * 10
 23 | pb <- aggregateData(sce, assay = "counts", fun = "sum")
 24 | 
 25 | # specify design & contrast matrix
 26 | ei <- metadata(sce)$experiment_info
 27 | design <- model.matrix(~ 0 + ei$group_id)
 28 | dimnames(design) <- list(ei$sample_id, levels(ei$group_id))
 29 | contrast <- limma::makeContrasts("g2-g1", "g3-g1", levels = design)
 30 | 
 31 | # default method settings ------------------------------------------------------
 32 | ms <- eval(as.list(args(pbDS))$method)
 33 | ms <- setdiff(ms, "DD")
 34 | for (method in ms) {
 35 |     test_that(paste("defaults - pbDS", method, sep = "."), {
 36 |         res <- pbDS(pb, 
 37 |             method = method, 
 38 |             min_cells = 0, 
 39 |             filter = "none", 
 40 |             verbose = FALSE)
 41 |         tbl <- res$table[[1]]
 42 |         expect_identical(names(tbl), kids)
 43 |         top <- map(tbl, function(u)
 44 |             dplyr::arrange(u, p_adj.loc) %>% 
 45 |                 dplyr::slice(seq_len(5)) %>% 
 46 |                 pull("gene"))
 47 |         expect_true(all(vapply(top, setequal, y = degs, logical(1))))
 48 |     })
 49 | }
 50 | 
 51 | # multiple contrast w/ & w/o 'treat' -------------------------------------------
 52 | for (method in ms) {
 53 |     if (grepl("edgeR|limma", method)) {
 54 |         treat <- c(FALSE, TRUE)
 55 |     } else treat <- FALSE
 56 |     test_that(paste("pbDS", method, sep = "."), {
 57 |         for (t in treat) {
 58 |             res <- pbDS(pb, 
 59 |                 treat = t, filter = "none",
 60 |                 method = method, verbose = FALSE,
 61 |                 design = design, contrast = contrast)
 62 |             
 63 |             expect_identical(length(res$table), ncol(contrast))
 64 |             expect_identical(names(res$table), colnames(contrast))
 65 |             expect_true(all(vapply(map(res$table, names), "==",
 66 |                 levels(kids), FUN.VALUE = logical(nlevels(kids)))))
 67 |             
 68 |             # check that top genes equal 'degs' in ea. comparison & cluster
 69 |             top <- map_depth(res$table, 2, function(u) {
 70 |                 dplyr::arrange(u, p_adj.loc) %>% 
 71 |                     dplyr::slice(seq_len(nde)) %>% 
 72 |                     pull("gene")
 73 |             }) %>% Reduce(f = "c")
 74 |             expect_true(all(unlist(map(top, setequal, degs))))
 75 |         }
 76 |     })
 77 | }
 78 | 
 79 | # global p-value adjustment ----------------------------------------------------
 80 | test_that(".p_adj_global", {
 81 |     names(cs) <- cs <- paste0("c", seq_len(5))
 82 |     names(ks) <- ks <- paste0("k", seq_len(8))
 83 |     ns <- sample(1e3, length(ks))
 84 |     names(ns) <- ks
 85 |     df <- lapply(ks, function(k) lapply(cs, function(c) {
 86 |         df <- data.frame(p_val = rgamma(ns[k], 0.1))
 87 |         df$p_adj.loc <- p.adjust(df$p_val, method = "BH")
 88 |         return(df) 
 89 |     })) %>% .p_adj_global
 90 |     for (c in cs)
 91 |         expect_identical(
 92 |             p.adjust(bind_rows(df[[c]])$p_val, method = "BH"),
 93 |             bind_rows(df[[c]])$p_adj.glb)
 94 | })
 95 | 
 96 | # filtering --------------------------------------------------------------------
 97 | test_that("pbDS() filtering", {
 98 |   sub <- sce[, sample(ncol(sce), 100)]
 99 |   pbs <- aggregateData(sub, assay = "counts", fun = "sum")
100 |   expect_error(pbDS(pbs, verbose = FALSE))
101 |   expect_silent(pbDS(pbs, verbose = FALSE, 
102 |     min_cells = 0, filter = "none"))
103 | })
104 | 
105 | # pbDD -------------------------------------------------------------------------
106 | 
107 | set.seed(1968)
108 | sce <- .toySCE(c(100, 2e3))
109 | cs <- which(
110 |     sce$group_id == (g <- "g3") & 
111 |     sce$cluster_id == (k <- "k1"))
112 | cs <- sample(cs, length(cs))
113 | gs <- sample(rownames(sce), 5)
114 | assay(sce)[gs, cs] <- 0
115 | 
116 | test_that("differential testing", {
117 |     pbs <- aggregateData(sce, fun="num.detected")
118 |     expect_identical(
119 |         tbl <- pbDD(pbs, verbose=FALSE)$table[[g]][[k]], 
120 |         pbDS(pbs, method="DD", verbose=FALSE)$table[[g]][[k]])
121 |     top <- tbl$gene[order(tbl$p_adj.loc)]
122 |     expect_setequal(top[seq_along(gs)], gs)
123 | })
124 | 


--------------------------------------------------------------------------------
/tests/testthat/test-pbHeatmap.R:
--------------------------------------------------------------------------------
 1 | # load packages
 2 | suppressMessages({
 3 |     library(purrr)
 4 |     library(scater)
 5 |     library(SingleCellExperiment)
 6 | })
 7 | 
 8 | # generate toy dataset
 9 | set.seed(as.numeric(format(Sys.time(), "%s")))
10 | sce <- .toySCE()
11 | 
12 | nk <- length(kids <- levels(sce$cluster_id))
13 | ns <- length(sids <- levels(sce$sample_id))
14 | ng <- length(gids <- levels(sce$group_id))
15 | 
16 | g3 <- sce$group_id == "g3"
17 | g23 <- sce$group_id %in% c("g2", "g3")
18 | 
19 | # sample 'nde' genes & multiply counts by 10 for 'g2'- & 'g3'-cells
20 | degs <- sample(rownames(sce), (nde <- 5))
21 | assay(sce[degs, g23]) <- assay(sce[degs, g23]) * 10
22 | sce <- logNormCounts(computeLibraryFactors(sce))
23 | 
24 | # run DS analysis using 'edgeR' on pseudobulks
25 | pb <- aggregateData(sce, assay = "counts", fun = "sum")
26 | res <- pbDS(pb, method = "edgeR", min_cells = 0, filter = "none", verbose = FALSE)
27 | 
28 | # compute pseudobulk mean of logcounts
29 | pb <- aggregateData(sce, assay = "logcounts", fun = "mean")
30 | 
31 | test_that("pbHeatmap() - input arguments", {
32 |     # get list of default parameters
33 |     defs <- as.list(eval(formals(pbHeatmap)))
34 |     defs$x <- sce; defs$y <- res
35 |     expect_is(do.call(pbHeatmap, defs), "Heatmap")
36 |     # check that invalid arguments throw error
37 |     fail <- list(g = "x", k = "x", c = "x",
38 |         sort_by = "x", sort_by = "gene",
39 |         sort_by = c("logFC", "p_val"),
40 |         lfc = "x", lfc = c(1, 2),
41 |         assay = 1, assay = "x",
42 |         decreasing = "x", decreasing = c(TRUE, FALSE),
43 |         row_anno = "x", row_anno = c(TRUE, FALSE), 
44 |         col_anno = "x", col_anno = c(TRUE, FALSE),
45 |         normalize = "x", normalize = c(TRUE, FALSE))
46 |     for (i in seq_along(fail)) {
47 |         args <- defs; args[[names(fail)[i]]] <- fail[[i]]
48 |         expect_error(do.call(pbHeatmap, args))
49 |     }
50 | })
51 |     
52 | test_that("pbHeatmap() - subset of clusters", {
53 |     ks <- sample(kids, (nks <- 3))
54 |     p <- pbHeatmap(sce, res, k = ks, 
55 |         lfc = 0, fdr = Inf, sort_by = "none",
56 |         top_n = (nds <- 10), normalize = FALSE)
57 |     expect_is(p, "Heatmap")
58 |     gs <- rownames(y <- p@matrix)[seq_len(nds)]
59 |     expect_equal(dim(y), c(nks*nds, ns))
60 |     expect_identical(rownames(y), rep(rownames(sce)[seq_len(nds)], nks))
61 |     expect_identical(colnames(y), sids)
62 |     for (i in seq_len(nks)) {
63 |         k <- kids[match(ks, kids)][i]
64 |         expect_equal(y[seq_len(nds)+nds*(i-1), ], assay(pb, k)[gs, ])
65 |     }
66 | })
67 | 
68 | test_that("pbHeatmap() - subset of genes", {
69 |     gs <- sample(rownames(sce), (ngs <- 20))
70 |     p <- pbHeatmap(sce, res, g = gs, 
71 |         lfc = 0, fdr = Inf, sort_by = "none",
72 |         top_n = Inf, normalize = FALSE)
73 |     expect_is(p, "Heatmap")
74 |     gs <- rownames(y <- p@matrix)[seq_len(ngs)]
75 |     expect_equal(dim(y), c(nk*ngs, ns))
76 |     expect_identical(rownames(y), rep(gs, nk))
77 |     expect_identical(colnames(y), sids)
78 | })
79 | 


--------------------------------------------------------------------------------
/tests/testthat/test-pbMDS.R:
--------------------------------------------------------------------------------
 1 | # generate toy dataset
 2 | seed <- as.numeric(format(Sys.time(), "%s"))
 3 | set.seed(seed); x <- .toySCE()
 4 | 
 5 | nk <- length(kids <- levels(x$cluster_id))
 6 | ns <- length(sids <- levels(x$sample_id))
 7 | ng <- length(gids <- levels(x$group_id))
 8 | 
 9 | test_that("pbMDS()", {
10 |     y <- x; class(y) <- "x"
11 |     expect_error(pbMDS(y))
12 |     y <- aggregateData(x)
13 |     expect_is(p <- pbMDS(y), "ggplot")
14 |     expect_true(nrow(p$data) == nk*ns)
15 |     expect_true(all(table(p$data$cluster_id) == ns))
16 |     expect_true(all(table(p$data$group_id) == ns*nk/ng))
17 |     cs1 <- x$group_id != gids[1]   # remove group
18 |     cs2 <- x$sample_id  != sids[1] # remove sample
19 |     cs3 <- x$cluster_id != kids[1] # remove cluster
20 |     cs4 <- cs1 & cs2 # remove sample in single group
21 |     cs5 <- cs1 & cs3 # remove cluster in single group
22 |     cs6 <- cs2 & cs3 # remove cluster-sample instance
23 |     for (cs in paste0("cs", seq_len(6))) 
24 |         expect_silent(pbMDS(aggregateData(x[, get(cs)])))
25 | })
26 | 


--------------------------------------------------------------------------------
/tests/testthat/test-prepSCE.R:
--------------------------------------------------------------------------------
 1 | # generate toy dataset
 2 | seed <- as.numeric(format(Sys.time(), "%s"))
 3 | set.seed(seed); x <- .toySCE()
 4 | 
 5 | nk <- length(kids <- levels(x$cluster_id))
 6 | ns <- length(sids <- levels(x$sample_id))
 7 | ng <- length(gids <- levels(x$group_id))
 8 | 
 9 | test_that("prepSCE()", {
10 |     y <- x; class(y) <- "x"
11 |     expect_error(prepSCE(y))
12 |     expect_is(y <- prepSCE(x), "SingleCellExperiment")
13 |     expect_identical(dim(y), dim(x))
14 |     expect_identical(dimnames(y), dimnames(x))
15 |     ids <- formals("prepSCE")
16 |     ids <- ids[grep("[a-z]id", names(ids))]
17 |     ids <- unname(unlist(ids))
18 |     expect_identical(colnames(colData(y)), ids)
19 |     x$foo <- sample(ncol(x))
20 |     y <- prepSCE(x, drop = TRUE)
21 |     expect_true(is.null(y$foo))
22 |     y <- prepSCE(x, drop = FALSE)
23 |     expect_identical(y$foo, x$foo)
24 |     expect_false(is.null(metadata(y)$experiment_info))
25 | })
26 | 


--------------------------------------------------------------------------------
/tests/testthat/test-prepSim.R:
--------------------------------------------------------------------------------
 1 | # generate toy dataset
 2 | seed <- as.numeric(format(Sys.time(), "%s"))
 3 | set.seed(seed); x <- .toySCE()
 4 | 
 5 | nk <- length(kids <- levels(x$cluster_id))
 6 | ns <- length(sids <- levels(x$sample_id))
 7 | ng <- length(gids <- levels(x$group_id))
 8 | 
 9 | test_that("prepSim()", {
10 |     z <- x; class(z) <- "x"
11 |     expect_error(prepSim(z))
12 |     expect_error(prepSim(x, group_keep = "x"))
13 |     y <- prepSim(x, 0, 0, 0, 0)
14 |     expect_is(y, "SingleCellExperiment")
15 |     expect_true(nrow(y) == nrow(x))
16 |     expect_true(ncol(y) == sum(x$group_id == gids[1]))
17 |     g <- sample(setdiff(gids, gids[1]), 1)
18 |     y <- prepSim(x, 0, 0, 0, 0, group_keep = g)
19 |     expect_true(ncol(y) == sum(x$group_id == g))
20 | })
21 | 
22 | test_that("non-factor ID columns work", {
23 |     y <- x; for (i in names(colData(y))) 
24 |         y[[i]] <- as.character(y[[i]])
25 |     expect_silent(prepSim(y, 0, 0, 0, 0, verbose = FALSE))
26 | })
27 | 
28 | test_that("single cluster", {
29 |     g <- sample(levels(x$group_id), 1)
30 |     k <- sample(levels(x$cluster_id), 1)
31 |     y <- x[, i <- x$cluster_id == k & x$group_id == g]
32 |     z <- prepSim(y, min_size = 0, min_genes = 0, verbose = FALSE)
33 |     expect_true(ncol(z) == sum(i))
34 |     expect_true(!"cluster_id" %in% names(colData(z)))
35 | })
36 | 


--------------------------------------------------------------------------------
/tests/testthat/test-resDS.R:
--------------------------------------------------------------------------------
 1 | # load packages
 2 | suppressMessages({
 3 |     library(dplyr)
 4 |     library(purrr)
 5 |     library(SingleCellExperiment)
 6 | })
 7 | 
 8 | # generate toy dataset
 9 | seed <- as.numeric(format(Sys.time(), "%s"))
10 | set.seed(seed)
11 | x <- .toySCE()
12 | 
13 | nk <- length(kids <- levels(x$cluster_id))
14 | ns <- length(sids <- levels(x$sample_id))
15 | ng <- length(gids <- levels(x$group_id))
16 | 
17 | # sample 'n_de' genes & multiply counts by 10 for 'g2/3'-cells
18 | g23 <- x$group_id != "g1"
19 | de_gs <- sample(rownames(x), (n_de <- 5))
20 | assay(x[de_gs, g23]) <- assay(x[de_gs, g23]) * 10
21 | 
22 | # aggregate & run pseudobulk DS analysis
23 | nc <- length(cs <- list(2, 3))
24 | pbs <- aggregateData(x, assay = "counts", fun = "sum")
25 | y <- pbDS(pbs, 
26 |     coef = cs, verbose = FALSE,
27 |     # assure everything is being tested
28 |     min_cells = 0, filter = "none")
29 |     
30 | test_that("resDS()", {
31 |     v <- list(col = list(nr = nrow(x)*nk, ng = nrow(x), nk = nk))
32 |     v$row <- lapply(v$col, "*", nc)
33 |     v$col$char_cols <- c("gene", "cluster_id")
34 |     v$row$char_cols <- c(v$col$char_cols, "coef")
35 |     for (bind in c("row", "col")) {
36 |         z <- resDS(x, y, bind, frq = FALSE, cpm = FALSE)
37 |         expect_is(z, "data.frame")
38 |         expect_identical(nrow(z), v[[bind]]$nr)
39 |         expect_true(all(table(z$gene) == v[[bind]]$nk))
40 |         expect_true(all(table(z$cluster_id) == v[[bind]]$ng))
41 |         is_char <- colnames(z) %in% v[[bind]]$char_cols
42 |         expect_true(all(apply(z[, !is_char], 2, class) == "numeric"))
43 |         expect_true(all(apply(z[,  is_char], 2, class) == "character"))
44 |     }
45 | })
46 | test_that("resDS() - 'frq = TRUE'", {
47 |     z <- resDS(x, y, frq = TRUE)
48 |     u <- z[, grep("frq", colnames(z))]
49 |     expect_true(ncol(u) == ns + ng)
50 |     expect_true(all(u <= 1 & u >= 0 | is.na(u)))
51 |     # remove single cluster-sample instance
52 |     s <- sample(sids, 1); k <- sample(kids, 1)
53 |     x_ <- x[, !(x$sample_id == s & x$cluster_id == k)]
54 |     y_ <- aggregateData(x_, assay = "counts", fun = "sum")
55 |     y_ <- pbDS(y_, coef = cs, verbose = FALSE)
56 |     z <- resDS(x_, y_, frq = TRUE)
57 |     u <- z[, grep("frq", colnames(z))]
58 |     expect_true(ncol(u) == ns + ng)
59 |     expect_true(all(u <= 1 & u >= 0 | is.na(u)))
60 |     expect_true(all(is.na(z[z$cluster_id == k, paste0(s, ".frq")])))
61 | })
62 | test_that("resDS() - 'cpm = TRUE'", {
63 |     z <- resDS(x, y, cpm = TRUE)
64 |     u <- z[, grep("cpm", colnames(z))]
65 |     expect_true(ncol(u) == ns)
66 |     expect_true(all(u %% 1 == 0 | is.na(u)))
67 | })
68 | 
69 | test_that("missing cluster is handled", {
70 |     k <- sample(kids, 1)
71 |     i <- !(x$cluster_id == k)
72 |     pbs <- aggregateData(x[, i], verbose = FALSE)
73 |     res <- pbDS(pbs, verbose = FALSE)
74 |     tbl <- resDS(x, res, cpm = TRUE)
75 |     expect_true(!k %in% unique(tbl$cluster_id))
76 | })
77 | test_that("missing sample is handled", {
78 |     s <- sample(sids, 1)
79 |     i <- !(x$sample_id == s)
80 |     pbs <- aggregateData(x[, i], verbose = FALSE)
81 |     res <- pbDS(pbs, verbose = FALSE)
82 |     tbl <- resDS(x, res, cpm = TRUE)
83 |     expect_true(sum(grepl(s, names(tbl))) == 0)
84 | })
85 | test_that("missing cluster-sample is handled", {
86 |     k <- sample(kids, 1)
87 |     s <- sample(sids, 1)
88 |     i <- !(x$cluster_id == k & x$sample_id == s)
89 |     pbs <- aggregateData(x[, i], verbose = FALSE)
90 |     res <- pbDS(pbs, verbose = FALSE)
91 |     tbl <- resDS(x, res, cpm = TRUE)
92 |     sub <- tbl[tbl$cluster_id == k, ]
93 |     expect_true(all(is.na(sub[, grep(s, names(sub))])))
94 | })
95 | 


--------------------------------------------------------------------------------
/tests/testthat/test-simData.R:
--------------------------------------------------------------------------------
  1 | suppressMessages({
  2 |     library(dplyr)
  3 |     library(SingleCellExperiment)
  4 | })
  5 | 
  6 | data(example_sce)
  7 | ref <- prepSim(example_sce, verbose = FALSE)
  8 | ng <- 200; nc <- 2e3; ns <- 3; nk <- 2
  9 | 
 10 | test_that("no groups, no samples, no clusters", {
 11 |     s <- sample(levels(ref$sample_id), 1)
 12 |     k <- sample(levels(ref$cluster_id), 1)
 13 |     sub <- ref[, ref$sample_id == s & ref$cluster_id == k]
 14 |     sim <- simData(sub, ng, nc, dd = FALSE, force = TRUE)
 15 |     expect_null(sim$group_id)
 16 |     expect_null(sim$sample_id)
 17 |     expect_null(sim$cluster_id)
 18 | })
 19 | 
 20 | test_that("no groups, yes samples and clusters", {
 21 |     sids <- levels(ref$sample_id)
 22 |     kids <- levels(ref$cluster_id)
 23 |     sim <- simData(ref, ng, nc, dd = FALSE, force = TRUE)
 24 |     expect_null(sim$group_id)
 25 |     expect_true(setequal(metadata(sim)$ref_sids, sids))
 26 |     expect_true(setequal(metadata(sim)$ref_kids, kids))
 27 | })
 28 | 
 29 | test_that("only samples", {
 30 |     k <- sample(levels(ref$cluster_id), 1)
 31 |     sub <- ref[, ref$cluster_id == k]
 32 |     sim <- simData(sub, ng, nc, dd = FALSE, force = TRUE)
 33 |     expect_null(sim$group_id)
 34 |     expect_null(sim$culster_id)
 35 |     expect_setequal(metadata(sim)$ref_sids, levels(sub$sample_id))
 36 | })
 37 | 
 38 | test_that("only clusters", {
 39 |     s <- sample(levels(ref$sample_id), 1)
 40 |     sub <- ref[, ref$sample_id == s]
 41 |     sim <- simData(sub, ng, nc, dd = FALSE, force = TRUE)
 42 |     expect_null(sim$group_id)
 43 |     expect_null(sim$sample_id)
 44 |     expect_setequal(metadata(sim)$ref_kids, levels(sub$cluster_id))
 45 | })
 46 | 
 47 | 
 48 | test_that("pbDS() gets at least 50% right for 10% DE genes", {
 49 |     replicate(3, {
 50 |         sim <- simData(ref, ng, nc, ns, nk, 
 51 |             p_dd = c(0.9,0,0.1,0,0,0), force = TRUE)
 52 |         gi <- metadata(sim)$gene_info
 53 |         pb <- aggregateData(sim)
 54 |         re <- pbDS(pb, verbose = FALSE)
 55 |         re <- bind_rows(re$table[[1]])
 56 |         re <- arrange(re, p_adj.loc)
 57 |         gi$id <- with(gi, paste0(gene, cluster_id))
 58 |         re$id <- with(re, paste0(gene, cluster_id))
 59 |         n_de <- sum(de <- gi$category == "de")
 60 |         expect_gte(sum(re$id[seq_len(n_de)] %in% gi$id[de]), round(n_de/2))
 61 |     })
 62 | })
 63 | 
 64 | test_that("Single group simulation", {
 65 |     gs <- c("A", "B")
 66 |     ps <- list(c(1, 0), c(0, 1))
 67 |     names(ps) <- gs
 68 |     for (g in gs) {
 69 |         x <- simData(ref, 
 70 |             nc, ns, nk, ng = 10, force = TRUE,
 71 |             probs = list(NULL, NULL, ps[[g]]))
 72 |         expect_identical(levels(x$group_id), g)
 73 |         sids <- sprintf("sample%s.%s", seq_len(ns), g)
 74 |         expect_identical(levels(x$sample_id), sids)
 75 |         gi <- metadata(x)$gene_info
 76 |         ms <- paste0("sim_mean.", setdiff(gs, g))
 77 |         expect_true(all(is.na(gi[[ms]])))
 78 |     }
 79 | })
 80 | 
 81 | test_that("Pure simulations give single DD category", {
 82 |     for (c in cats) {
 83 |         sim <- simData(ref, nc, ns, nk, 
 84 |             p_dd = as.numeric(cats == c),
 85 |             ng = ng, force = TRUE)
 86 |         gi <- metadata(sim)$gene_info
 87 |         expect_equal(table(gi$category)[[c]], ng*nk)
 88 |     }
 89 | })
 90 | 
 91 | test_that("simData() - input arguments", {
 92 |     # number of genes mismatch b/w simulation & reference
 93 |     expect_error(simData(ref, ng = 100, force = FALSE))
 94 |     expect_silent(simData(ref, ng = 100, force = TRUE))
 95 |     nk <- length(kids <- levels(ref$cluster_id))
 96 |     # named 'rel_lfc's (mis)match cluster names
 97 |     lfc <- rep(1, nk); names(lfc) <- kids; lfc2 <- lfc; names(lfc2)[1] <- "x"
 98 |     expect_silent(simData(ref, nk = nk, rel_lfc = lfc, ng = 10, force = TRUE))
 99 |     expect_error(simData(ref, nk = nk, rel_lfc = lfc2, ng = 10, force = TRUE))
100 | })
101 | 
102 | test_that("Type genes & cluster phylogeny", {
103 |     ng <- 1e3; pt <- 0.05
104 |     t <- "(('cluster1':0.1,'cluster2':0.1):0.4,'cluster3':0.5);"
105 |     args <- list(
106 |         list(pt = pt, t = NULL), # type genes but no phylogeny
107 |         list(t = t, pt = 0))     # both, type genes and phylogeny
108 |     cs <- c("type", "state")     # possible gene classes
109 |     for (i in seq_along(args)) {
110 |         rd <- rowData(x <- simData(ref, ng = ng, force = TRUE,
111 |             p_type = args[[i]]$pt, phylo_tree = args[[i]]$t))
112 |         expect_is(rd, "DataFrame")
113 |         expect_equal(dim(rd), c(ng, 2))
114 |         expect_equal(colnames(rd), c("class", "specs"))
115 |         ns <- as.list(table(rd$class))
116 |         is_state <- rd$class == "state"
117 |         expect_true(all(is.na(unlist(rd$specs[is_state]))))
118 |         expect_true(all(unlist(rd$specs[!is_state]) %in% levels(x$cluster_id)))
119 |     }
120 |     # missing clusters in input phylogeny
121 |     t <- "(('cluster1':0.1,'cluster2':0.1):0.4,'cluster4':0.5);"
122 |     expect_error(simData(ref, phylo_tree = t))
123 |     # specification of both, 'p_type' and 'phylo_tree'
124 |     expect_error(simData(ref, p_type = pt, phylo_tree = t))
125 | })
126 | 


--------------------------------------------------------------------------------
/tests/testthat/test-utils.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(SingleCellExperiment)
 3 | })
 4 | 
 5 | x <- .toySCE()
 6 | kids <- levels(x$cluster_id)
 7 | sids <- levels(x$sample_id)
 8 | 
 9 | test_that(".filter_matrix()", {
10 |     replicate(5, {
11 |         x <- matrix(sample(seq_len(200)), 10, 20)
12 |         y <- .filter_matrix(x, (n <- sample(200, 1)))
13 |         expect_true(all(y >= n) | any(dim(y) == 1))
14 |     })
15 | })
16 | test_that(".update_sce()", {
17 |     replicate(5, {
18 |         ks <- sample(kids, (nk <- 2))
19 |         ss <- sample(sids, (ns <- 3))
20 |         cs <- x$cluster_id %in% ks & x$sample_id %in% ss
21 |         y <- .update_sce(x[, cs])
22 |         expect_equal(ncol(y), sum(cs))
23 |         expect_true(setequal(levels(y$cluster_id), ks))
24 |         expect_true(setequal(levels(y$sample_id), ss))
25 |     })
26 | })
27 | test_that(".filter_sce()", {
28 |     replicate(5, {
29 |         ks <- sample(kids, (nk <- 2))
30 |         ss <- sample(sids, (ns <- 3))
31 |         y <- .filter_sce(x, ks, ss)
32 |         expect_true(setequal(levels(y$cluster_id), ks))
33 |         expect_true(setequal(levels(y$sample_id), ss))
34 |         cs <- x$cluster_id %in% ks & x$sample_id %in% ss
35 |         expect_equal(ncol(y), sum(cs))
36 |         expect_equal(colnames(y), colnames(x)[cs])
37 |         ei <- metadata(y)$experiment_info
38 |         expect_true(setequal(ei$sample_id, ss))
39 |     })
40 | })
41 | test_that(".scale()", {
42 |     replicate(5, {
43 |         y <- .scale(x <- matrix(runif(200), (n <- 10), (m <- 20)))
44 |         expect_true(all(apply(y, 1, min) == 0))
45 |         expect_true(all(apply(y, 1, max) == 1))
46 |         qs <- quantile(x[1, ], c(0.01, 0.99))
47 |         z <- (x[1, ] - qs[1]) / diff(qs)
48 |         z[z < 0] <- 0; z[z > 1] <- 1
49 |         expect_true(all(y[1, ] == z))
50 |         i <- sample(n, 1); j <- sample(m, 1)
51 |         x[i, j] <- NA; z <- .scale(x)
52 |         expect_true(sum(is.na(z)) == 1)
53 |         os <- lapply(list(y, z), function(u) {
54 |             rngs <- colRanges(u[-i, -j], 
55 |               na.rm = TRUE, useNames = FALSE)
56 |             apply(rngs, 2, order)
57 |         })
58 |         expect_identical(os[[1]], os[[2]])
59 |     })
60 | })
61 | 


--------------------------------------------------------------------------------
/tests/testthat/test-validityChecks.R:
--------------------------------------------------------------------------------
 1 | # load packages
 2 | suppressMessages({
 3 |     library(SingleCellExperiment)
 4 | })
 5 | sce <- .toySCE()
 6 | 
 7 | test_that(".check_sce", {
 8 |     # object is not 'SingleCellExperiment'
 9 |     x <- sce; class(x) <- "x"
10 |     expect_error(.check_sce(x))
11 |     # "group_id" 'colData' column not required
12 |     x <- sce; x$group_id <- NULL
13 |     expect_silent(.check_sce(x, FALSE))
14 |     # missing "x_id" 'colData' column;
15 |     # x = "sample", "cluster", "group"
16 |     for (i in colnames(colData(sce))) {
17 |         x <- sce; x[[i]] <- NULL
18 |         expect_error(.check_sce(x))
19 |     }
20 | })
21 | 
22 | test_that(".check_arg_assay", {
23 |     expect_error(.check_arg_assay(sce, 1))
24 |     expect_error(.check_arg_assay(sce, "x"))
25 |     expect_error(.check_arg_assay(sce, c(assayNames(sce)[1], "x")))
26 |     expect_silent(.check_arg_assay(sce, assayNames(sce)[1]))
27 | })
28 | 
29 | test_that(".check_args_simData", {
30 |     u <- list(x = sce, ng = 10, nc = 100, ns = 3, nk = 2, probs = NULL, 
31 |         p_dd = diag(6)[1, ], p_type = 0.1, lfc = 1, rel_lfc = NULL, 
32 |         p_ep = 0.1, p_dp = 0.1, p_dm = 0.1, paired = FALSE, force = TRUE,
33 |         phylo_tree = NULL, phylo_pars = c(0, 3))
34 |     expect_silent(.check_args_simData(u))
35 |     # 'ng', 'nc', 'ns', 'nk' should be single numerics > 0
36 |     for (arg in c("ng", "nc", "ns", "nk")) {
37 |         v <- u; v[[arg]] <- 1; expect_silent(.check_args_simData(v))
38 |         for (val in list(numeric(2)+1, numeric(1), NA, "a"))
39 |             v[[arg]] <- val; expect_error(.check_args_simData(v))
40 |     }
41 |     # 'p_dd' should be length of DD categories, sum to 1, and be in [0,1]
42 |     v <- u; for (val in list(rep(1,5)/5, rep(1,6)/5, c(-1,2,rep(0,4))))
43 |         v$p_dd <- val; expect_error(.check_args_simData(v))
44 |     # 'paired' should be length-one logical
45 |     v <- u; for (val in list("x", 123, c(TRUE, FALSE)))
46 |         v$paired <- val; expect_error(.check_args_simData(v))
47 |     # 'rel_lfc' should be >0 and be of length 'nk'
48 |     u$rel_lfc <- rep(1, u$nk)
49 |     expect_silent(.check_args_simData(u))
50 |     v <- u; for (val in list(-1, rep(1,u$nk+1)))
51 |         v$rel_lfc <- val; expect_error(.check_args_simData(v))
52 | })
53 | 
54 | test_that(".check_frq()", {
55 |     y <- calcExprFreqs(x <- sce)
56 |     expect_silent(.check_frq(x, y))
57 |     z <- x; class(z) <- "x"
58 |     expect_error(.check_frq(z, y))
59 |     z <- y; class(z) <- "x"
60 |     expect_error(.check_frq(x, z))
61 |     for (v in c(-1, 10)) {
62 |         z <- y; assay(z)[1, 1] <- v
63 |         expect_error(.check_frq(x, z))
64 |     }
65 | })
66 | 


--------------------------------------------------------------------------------
/tests/testthat/test_utils-simData.R:
--------------------------------------------------------------------------------
 1 | nk <- length(k <- paste0("cluster", seq_len(5)))
 2 | ns <- length(s <- paste0("sample", seq_len(4)))
 3 | ng <- length(g <- paste0("group", seq_len(3)))
 4 | 
 5 | test_that(".get_ns", {
 6 |     replicate(10, {
 7 |         # default to using all samples when 
 8 |         # dd = FALSE or dd = TRUE, paired = TRUE
 9 |         ns_ref <- sample(100, 1)
10 |         args <- list(
11 |             list(dd = TRUE, paired = TRUE),
12 |             list(dd = FALSE, paired = TRUE),
13 |             list(dd = FALSE, paired = FALSE))
14 |         for (. in args) {
15 |             .$ns_ref <- ns_ref
16 |             .$force <- FALSE
17 |             . <- c(., list(ns_sim = NULL))
18 |             ns <- do.call(.get_ns, .)
19 |             expect_identical(ns_ref, ns)
20 |         }
21 |         # error when force = FALSE and 
22 |         # desired number of samples to simulate 
23 |         # exceeds available reference samples
24 |         ns_sim <- ns_ref + 1
25 |         args <- list(
26 |             list(dd = TRUE, paired = TRUE),
27 |             list(dd = TRUE, paired = FALSE),
28 |             list(dd = FALSE, paired = TRUE),
29 |             list(dd = FALSE, paired = FALSE))
30 |         for (. in args) {
31 |             .$ns_ref <- ns_ref
32 |             .$ns_sim <- ns_sim
33 |             .$force <- FALSE
34 |             expect_error(do.call(.get_ns, .))
35 |             # pass when force = TRUE
36 |             .$force <- TRUE
37 |             ns <- do.call(.get_ns, .)
38 |             expect_identical(ns_sim, ns)
39 |         }
40 |     })
41 | })
42 | 
43 | test_that(".split_cells", {
44 |     n_cells <- 1e3
45 |     cells <- paste0("cell", seq_len(n_cells))
46 |     x <- matrix(0, 
47 |         nrow = 1, ncol = n_cells, 
48 |         dimnames = list(NULL, cells))
49 |     cd <- data.frame(
50 |         row.names = cells,
51 |         cluster_id = sample(k, n_cells, TRUE),
52 |         sample_id = sample(s, n_cells, TRUE))
53 |     
54 |     cs <- .split_cells(cd, "cluster_id")
55 |     expect_identical(names(cs), k)
56 |     expect_identical(
57 |         as.numeric(vapply(cs, length, numeric(1))),
58 |         as.numeric(table(cd$cluster_id)))
59 |     
60 |     cs <- .split_cells(cd, c("cluster_id", "sample_id"))
61 |     expect_identical(names(cs), k)
62 |     nms_lvl2 <- vapply(cs, names, character(length(s)))
63 |     expect_true(all(apply(nms_lvl2, 2, identical, s)))
64 |     
65 |     cs <- .split_cells(cd, c("sample_id", "cluster_id"))
66 |     expect_identical(names(cs), s)
67 |     nms_lvl2 <- vapply(cs, names, character(length(k)))
68 |     expect_true(all(apply(nms_lvl2, 2, identical, k)))
69 | })
70 | 
71 | test_that(".sample_cell_md", {
72 |     ids <- list(k, s, g)
73 |     md <- .sample_cell_md((n <- 1e3), ids)
74 |     ms <- vapply(apply(md, 2, table), mean, numeric(1))
75 |     expect_true(all(vapply(seq_along(ids), function(i) 
76 |         ms[[i]] == n/length(ids[[i]]), logical(1))))
77 |     set.seed(1); a <- .sample_cell_md(n, ids)
78 |     set.seed(1); b <- .sample_cell_md(n, ids, 
79 |         list(rep(1/nk,nk),rep(1/ns,ns),rep(1/ng,ng)))
80 |     expect_identical(a, b)
81 | })
82 | 


--------------------------------------------------------------------------------
/vignettes/detection.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Differential detection analysis"
  3 | author:
  4 | - name: Jeroen Gilis
  5 |   affiliation: 
  6 |   - &one Applied Mathematics, Computer science and Statistics, Ghent University, Ghent, Belgium
  7 |   - &two Bioinformatics Institute, Ghent University, Ghent, Belgium
  8 |   - Data Mining and Modeling for Biomedicine, VIB Flemish Institute for Biotechnology, Ghent, Belgium
  9 | - name: Helena L Crowell
 10 |   affiliation: 
 11 |   - National Center for Genomic Analysis (CNAG), Barcelona, Spain 
 12 | - name: Davide Risso
 13 |   affiliation: 
 14 |   - Department of Statistical Sciences, University of Padova, Padova, Italy
 15 |   - Padua Center for Network Medicine, University of Padova, Padova, Italy
 16 | - name: Lieven Clement
 17 |   affiliation: 
 18 |   - *one
 19 |   - *two
 20 | package: "`r BiocStyle::pkg_ver('muscat')`"
 21 | date: "`r format(Sys.Date(), '%B %d, %Y')`"
 22 | output: 
 23 |   BiocStyle::html_document
 24 | vignette: >
 25 |   %\VignetteIndexEntry{"3. Differential detection"}
 26 |   %\VignettePackage{muscat}
 27 |   %\VignetteEngine{knitr::rmarkdown}
 28 |   %\VignetteEncoding{UTF-8}
 29 | bibliography: "`r file.path(system.file('extdata', package='muscat'), 'refs.bib')`"
 30 | abstract: > 
 31 |   <p> In this vignette, we display how `muscat` can be used to perform differrential detection (DD) analyses in multi-sample, multi-group, multi-(cell-)subpopulation scRNA-seq data. Furthermore, we show how DD and differential state (DS) analysis results on the same data can be effectively combined. This vignette thus introduces a workflow that allows users to jointly assess two biological hypotheses that often contain orthogonal information, which thus can be expected to improve their understanding of complex biological phenomena, at no extra cost.
 32 | ---
 33 | 
 34 | <style type="text/css">
 35 | .smaller {
 36 |   font-size: 10px
 37 | }
 38 | </style>
 39 | 
 40 | ```{r cache, include=FALSE}
 41 | knitr::opts_chunk$set(cache=TRUE)
 42 | ```
 43 | 
 44 | ```{r echo=FALSE, message=FALSE, warning=FALSE}
 45 | library(BiocStyle)
 46 | ```
 47 | 
 48 | *** 
 49 | 
 50 | Based on @Gilis2023
 51 | 
 52 | > Gilis J, Perin L, Malfait M, Van den Berge K,  
 53 | Assefa AT, Verbist B, Risso D, and Clement L:  
 54 | Differential detection workflows for  
 55 | multi-sample single-cell RNA-seq data.  
 56 | *bioRxiv* (2023). [DOI: 10.1101/2023.12.17.572043](https://doi.org/10.1101/2023.12.17.572043)
 57 | 
 58 | # Load packages {-}
 59 | 
 60 | ```{r load-libs, message=FALSE,  warning=FALSE}
 61 | library(dplyr)
 62 | library(purrr)
 63 | library(tidyr)
 64 | library(scater)
 65 | library(muscat)
 66 | library(ggplot2)
 67 | library(patchwork)
 68 | ```
 69 | 
 70 | # Introduction
 71 | 
 72 | Single-cell RNA-sequencing (scRNA-seq) has improved our understanding of complex biological processes by elucidating cell-level heterogeneity in gene expression. One of the key tasks in the downstream analysis of scRNA-seq data is studying differential gene expression (DE). Most DE analysis methods aim to identify genes for which the *average* expression differs between biological groups of interest, e.g., between cell types or between diseased and healthy cells. As such, most methods allow for assessing only one aspect of the gene expression distribution: the mean. However, in scRNA-seq data, differences in other characteristics between count distributions can commonly be observed.
 73 | 
 74 | One such characteristic is gene detection, i.e., the number of cells in which a gene is (detectably) expressed. Analogous to a DE analysis, a differential detection (DD) analysis aims to identify genes for which the *average fraction of cells in which the gene is detected* changes between groups. In @Gilis2023, we show how DD analysis contain information that is biologically relevant, and that is largely orthogonal to the information obtained from DE analysis on the same data.
 75 | 
 76 | In this vignette, we display how `muscat` can be used to perform DD analyses in multi-sample, multi-group, multi-(cell-)subpopulation scRNA-seq data. Furthermore, we show how DD and DS analysis results on the same data can be effectively combined using a two-stage testing approach. This workflow thus allows users to jointly assess two biological hypotheses containing orthogonal information, which thus can be expected to improve their understanding of complex biological phenomena, at no extra cost.
 77 | 
 78 | # Setup
 79 | 
 80 | We will use the same data as in the differential state (DS) analyses described in `r Biocpkg("muscat", vignette = "analysis.html")`, namely, scRNA-seq data acquired on PBMCs from 8 patients before and after IFN-$\beta$ treatment. For a more detailed description of these data and subsequent preprocessing, we refer to `r Biocpkg("muscat", vignette = "analysis.html")`.
 81 | 
 82 | ```{r load-data, message=FALSE}
 83 | library(ExperimentHub)
 84 | eh <- ExperimentHub()
 85 | query(eh, "Kang")
 86 | (sce <- eh[["EH2259"]])
 87 | ```
 88 | 
 89 | We further apply some minimal filtering to remove low-quality genes and cells, and use `prepSCE()` to standardize cell metadata such that slots specifying cluster (`cell`), sample (`stim`+`ind`), and group (`stim`) identifiers conform with the `muscat` framework:
 90 | 
 91 | ```{r prep-data}
 92 | sce <- sce[rowSums(counts(sce) > 0) > 0, ]
 93 | qc <- perCellQCMetrics(sce)
 94 | sce <- sce[, !isOutlier(qc$detected, nmads=2, log=TRUE)]
 95 | sce <- sce[rowSums(counts(sce) > 1) >= 10, ]
 96 | sce$id <- paste0(sce$stim, sce$ind)
 97 | sce <- prepSCE(sce, "cell", "id", "stim")
 98 | table(sce$cluster_id, sce$group_id)
 99 | table(sce$sample_id)
100 | ```
101 | 
102 | ## Aggregation 
103 | 
104 | In general, `aggregateData()` will aggregate the data by the `colData` variables specified with argument `by`, and return a `SingleCellExperiment` containing pseudobulk data.
105 | 
106 | To perform a pseudobulk-level analysis, measurements must be aggregated at the cluster-sample level (default `by = c("cluster_id", "sample_id"`). In this case, the returned `SingleCellExperiment` will contain one assay per cluster, where rows = genes and columns = samples. Arguments `assay` and `fun` specify the input data and summary statistic, respectively, to use for aggregation.
107 | 
108 | In a differential detection (DD) analysis, the default choice of the summary statistic used for aggregation is `fun = "num.detected"`. This strategy can be thought of as first binarizing the gene expression values (1: expressed, 0: not expressed), and subsequently performing a simple summation of the binarized gene expression counts for cells belonging to the same cluster-sample level. Hence, the resulting pseudobulk-level expression count reflects the total number of cells in a particular cluster-sample level with a non-zero gene expression value.
109 | 
110 | In a differential state (DS) analysis, the default choice for aggregation is `fun = "sum"`, which amounts to the simple summation of the raw gene expression counts of cells belonging to the same cluster-sample level.
111 | 
112 | ```{r pbs-det}
113 | pb_sum <- aggregateData(sce,
114 |     assay="counts", fun="sum",
115 |     by=c("cluster_id", "sample_id"))
116 | pb_det <- aggregateData(sce,
117 |     assay="counts", fun="num.detected",
118 |     by=c("cluster_id", "sample_id"))
119 | t(head(assay(pb_det)))
120 | ```
121 | 
122 | @Qiu2020 demonstrated that binarizing scRNA-seq counts generates expression profiles that still accurately reflect biological variation.
123 | This finding was confirmed by @Bouland2021, who showed that the frequencies of zero counts capture biological variability, and further claimed that a binarized representation of the single-cell expression data allows for a more robust description of the relative abundance of transcripts than counts.
124 | 
125 | ```{r pbs-mds, fig.width=8, fig.height=4, fig.cap="Pseudobulk-level multidimensional scaling (MDS) plot based on (A) sum of counts and (B) sum of binarized counts (i.e., counting the number of detected features) in each cluster-sample."}
126 | pbMDS(pb_sum) + ggtitle("Σ counts") +
127 | pbMDS(pb_det) + ggtitle("# detected") +
128 | plot_layout(guides="collect") +
129 | plot_annotation(tag_levels="A") &
130 | theme(legend.key.size=unit(0.5, "lines"))
131 | ```
132 | 
133 | ## Analysis
134 | 
135 | Once we have assembled the pseudobulk data, we can test for DD using `pbDD()`. By default, a $\sim$`group_id` model is fit, and the last coefficient of the linear model is tested to be equal to zero.
136 | 
137 | ```{r pbDD}
138 | res_DD <- pbDD(pb_det, min_cells=0, filter="none", verbose=FALSE)
139 | ```
140 | 
141 | ## Handling and visualizing results
142 | 
143 | Inspection, manipulation, and visualization of DD analysis results follows the same principles as for a DS analysis. For a detailed description, we refer to the DS analysis vignette`r Biocpkg("muscat", vignette = "analysis.html")`. Below, some basic functionalities are being displayed.
144 | 
145 | ```{r}
146 | tbl <- res_DD$table[[1]]
147 | # one data.frame per cluster
148 | names(tbl)
149 | ```
150 | 
151 | ```{r}
152 | # view results for 1st cluster
153 | k1 <- tbl[[1]]
154 | head(format(k1[, -ncol(k1)], digits = 2))
155 | ```
156 | 
157 | ```{r}
158 | # filter FDR < 5%, |logFC| > 1 & sort by adj. p-value
159 | tbl_fil <- lapply(tbl, \(u)
160 |     filter(u, 
161 |         p_adj.loc < 0.05, 
162 |         abs(logFC) > 1) |>
163 |         arrange(p_adj.loc))
164 | 
165 | # nb. of DS genes & % of total by cluster
166 | n_de <- vapply(tbl_fil, nrow, numeric(1))
167 | p_de <- format(n_de / nrow(sce) * 100, digits = 3)
168 | data.frame("#DD" = n_de, "%DD" = p_de, check.names = FALSE)
169 | ```
170 | 
171 | ```{r}
172 | library(UpSetR)
173 | de_gs_by_k <- map(tbl_fil, "gene")
174 | upset(fromList(de_gs_by_k))
175 | ```
176 | 
177 | # Stagewise anaysis
178 | 
179 | While DD analysis results may contain biologically relevant information in their own right, we show in @Gilis2023 that combing DD and DS analysis results on the same data can further improve our understanding of complex biological phenomena. In the remainder of this vignette, we show how DD and DS analysis results on the same data can be effectively combined.
180 | 
181 | For this, we build on the two-stage testing paradigm proposed by @Vandenberge2017. In the first stage of this testing procedure, we identify differential genes by using an omnibus test for differential detection and differential expression (DE). The null hypothesis for this test is that the gene is neither differentially detected, nor differentially expressed.
182 | 
183 | In the second stage, we perform post-hoc tests on the differential genes from stage one to unravel whether they are DD, DE or both. Compared to the individual DD and DS analysis results, the two-stage approach increases statistical power and provides better type 1 error control.
184 | 
185 | ```{r pbDS}
186 | res_DS <- pbDS(pb_sum, min_cells=0, filter="none", verbose=FALSE)
187 | ```
188 | 
189 | ```{r}
190 | res <- stagewise_DS_DD(res_DS, res_DD, verbose=FALSE)
191 | head(res[[1]][[1]]) # results for 1st cluster
192 | ```
193 | 
194 | ## Comparison
195 | 
196 | ```{r}
197 | # for each approach, get adjusted p-values across clusters
198 | ps <- map_depth(res, 2, \(df) {
199 |     data.frame(
200 |         df[, c("gene", "cluster_id")],
201 |         p_adj.stagewise=df$p_adj,
202 |         p_adj.DS=df$res_DS$p_adj.loc,
203 |         p_adj.DD=df$res_DD$p_adj.loc)
204 | }) |> 
205 |     lapply(do.call, what=rbind) |>
206 |     do.call(what=rbind) |>
207 |     data.frame(row.names=NULL)
208 | head(ps)
209 | ```
210 | 
211 | To get an overview of how different approaches compare, we can count the number of genes found differential in each cluster for a given FDR threshold:
212 | 
213 | ```{r fig.width=12, fig.height=4}
214 | # for each approach & cluster, count number 
215 | # of genes falling below 5% FDR threshold
216 | ns <- lapply(seq(0, 0.2, 0.005), \(th) {
217 |     ps |>
218 |         mutate(th=th) |>
219 |         group_by(cluster_id, th) |>
220 |         summarise(
221 |             .groups="drop",
222 |             across(starts_with("p_"), 
223 |             \(.) sum(. < th, na.rm=TRUE)))
224 | }) |> 
225 |     do.call(what=rbind) |>
226 |     pivot_longer(starts_with("p_"))
227 | ggplot(ns, aes(th, value, col=name)) + 
228 |     geom_line(linewidth=0.8, key_glyph="point") +
229 |     geom_vline(xintercept=0.05, lty=2, linewidth=0.4) +
230 |     guides(col=guide_legend(NULL, override.aes=list(size=3))) +
231 |     labs(x="FDR threshold", y="number of significantly\ndifferential genes") +
232 |     facet_wrap(~cluster_id, scales="free_y", nrow=2) + 
233 |     theme_bw() + theme(
234 |         panel.grid.minor=element_blank(),
235 |         legend.key.size=unit(0.5, "lines"))
236 | ```
237 | 
238 | We can further identify which hits are shared between or unique to a given approach.
239 | In the example below, for instance, the vast majority of hits is common to all approaches, many hits are shared between DD and stagewise testing, and only few genes are specific to any one approach:
240 | 
241 | ```{r upset, fig.width = 5, fig.height = 3, fig.cap = "Upset plot of differential findings (FDR < 0.05) across DS, DD, and stagewise analysis for an exemplary cluster; shown are the 50 most frequent interactions."}
242 | # subset adjuster p-values for cluster of interest
243 | qs <- ps[grep("CD4", ps$cluster_id), grep("p_", names(ps))]
244 | # for each approach, extract genes at 5% FDR threshold
245 | gs <- apply(qs, 2, \(.) ps$gene[. < 0.05])
246 | # visualize set intersections between approaches
247 | UpSetR::upset(UpSetR::fromList(gs), order.by="freq")
248 | ```
249 | 
250 | ```{r}
251 | # extract genes unique to stagewise testing
252 | sw <- grep("stagewise", names(gs))
253 | setdiff(gs[[sw]], unlist(gs[-sw]))
254 | ```
255 | 
256 | # Session info {- .smaller}
257 | 
258 | ```{r session-info}
259 | sessionInfo()
260 | ```
261 | 
262 | # References


--------------------------------------------------------------------------------