├── .Rbuildignore
├── .github
└── workflows
│ └── check-bioc.yml
├── .gitignore
├── DESCRIPTION
├── NAMESPACE
├── NEWS
├── R
├── aggregateData.R
├── calcExprFreqs.R
├── data.R
├── mmDS.R
├── pbDS.R
├── pbFlatten.R
├── pbHeatmap.R
├── pbMDS.R
├── prepSCE.R
├── prepSim.R
├── resDS.R
├── simData.R
├── stagewiseDD.R
├── utils-mmDS.R
├── utils-pbDS.R
├── utils-simData.R
├── utils.R
└── validity-checks.R
├── README.md
├── data
└── example_sce.rda
├── inst
└── extdata
│ ├── 1a.png
│ ├── 1b.png
│ ├── 1d.png
│ ├── muscat.png
│ └── refs.bib
├── man
├── aggregateData.Rd
├── calcExprFreqs.Rd
├── data.Rd
├── mmDS.Rd
├── pbDS.Rd
├── pbFlatten.Rd
├── pbHeatmap.Rd
├── pbMDS.Rd
├── prepSCE.Rd
├── prepSim.Rd
├── resDS.Rd
├── simData.Rd
└── stagewise_DS_DD.Rd
├── tests
├── testthat.R
└── testthat
│ ├── test-aggregateData.R
│ ├── test-calcExprFreqs.R
│ ├── test-mmDS.R
│ ├── test-pbDS.R
│ ├── test-pbHeatmap.R
│ ├── test-pbMDS.R
│ ├── test-prepSCE.R
│ ├── test-prepSim.R
│ ├── test-resDS.R
│ ├── test-simData.R
│ ├── test-utils.R
│ ├── test-validityChecks.R
│ └── test_utils-simData.R
└── vignettes
├── analysis.Rmd
├── detection.Rmd
└── simulation.Rmd
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^\.git$
2 | ^\.github$
3 | ^\.gitignore$
4 |
--------------------------------------------------------------------------------
/.github/workflows/check-bioc.yml:
--------------------------------------------------------------------------------
1 | ## Read more about GitHub actions the features of this GitHub Actions workflow
2 | ## at https://lcolladotor.github.io/biocthis/articles/biocthis.html#use_bioc_github_action
3 | ##
4 | ## For more details, check the biocthis developer notes vignette at
5 | ## https://lcolladotor.github.io/biocthis/articles/biocthis_dev_notes.html
6 | ##
7 | ## You can add this workflow to other packages using:
8 | ## > biocthis::use_bioc_github_action()
9 | ##
10 | ## Using GitHub Actions exposes you to many details about how R packages are
11 | ## compiled and installed in several operating system.s
12 | ### If you need help, please follow the steps listed at
13 | ## https://github.com/r-lib/actions#where-to-find-help
14 | ##
15 | ## If you found an issue specific to biocthis's GHA workflow, please report it
16 | ## with the information that will make it easier for others to help you.
17 | ## Thank you!
18 |
19 | ## Acronyms:
20 | ## * GHA: GitHub Action
21 | ## * OS: operating system
22 |
23 | on:
24 | push:
25 | pull_request:
26 |
27 | name: R-CMD-check-bioc
28 |
29 | ## These environment variables control whether to run GHA code later on that is
30 | ## specific to testthat, covr, and pkgdown.
31 | ##
32 | ## If you need to clear the cache of packages, update the number inside
33 | ## cache-version as discussed at https://github.com/r-lib/actions/issues/86.
34 | ## Note that you can always run a GHA test without the cache by using the word
35 | ## "/nocache" in the commit message.
36 | env:
37 | has_testthat: 'true'
38 | run_covr: 'false'
39 | run_pkgdown: 'false'
40 | has_RUnit: 'false'
41 | cache-version: 'cache-v1'
42 | run_docker: 'false'
43 | bioc_version: 'bioc-release'
44 | ## Valid options are:
45 | ## "bioc-release"
46 | ## "bioc-devel"
47 | ## or a specific number like "3.20"
48 |
49 | jobs:
50 | bioc-config:
51 | runs-on: ubuntu-latest
52 | outputs:
53 | matrix: ${{ steps.set-bioc-matrix.outputs.matrix }}
54 | steps:
55 | ## Adapted from
56 | ## https://runs-on.com/github-actions/the-matrix-strategy/#dynamic-matrix-generation
57 | - id: set-bioc-matrix
58 | run: |
59 | bioc=$(curl -L https://bioconductor.org/config.yaml)
60 | if [[ "$bioc_version" == "bioc-release" ]]; then
61 | echo "Finding the latest BioC release version and the corresponding R version"
62 | biocversion=$(echo "$bioc" | grep "release_version: " | grep -Eo "[0-9]{1}\.[0-9]{2}")
63 | rversion=$(echo "$bioc" | grep "r_version_associated_with_release: " | grep -Eo "[0-9]{1}\.[0-9]{1}")
64 | biocmajor=$(echo "$biocversion" | cut -c 1-1)
65 | biocminor=$(echo "$biocversion" | cut -c 3-4)
66 | bioccont=$(echo "bioconductor/bioconductor_docker:RELEASE_${biocmajor}_${biocminor}")
67 | elif [[ "$bioc_version" == "bioc-devel" ]]; then
68 | echo "Finding the latest BioC devel version and the corresponding R version"
69 | biocversion=$(echo "$bioc" | grep "devel_version: " | grep -Eo "[0-9]{1}\.[0-9]{2}")
70 | rversion_release=$(echo "$bioc" | grep "r_version_associated_with_release: " | grep -Eo "[0-9]{1}\.[0-9]{1}")
71 | rversion_devel=$(echo "$bioc" | grep "r_version_associated_with_devel: " | grep -Eo "[0-9]{1}\.[0-9]{1}")
72 | if [[ "$rversion_release" == "$rversion_devel" ]]; then
73 | rversion=$(echo "$rversion_devel")
74 | else
75 | rversion="devel"
76 | fi
77 | bioccont="bioconductor/bioconductor_docker:devel"
78 | else
79 | echo "Finding the the R version for bioc version ${bioc_version}"
80 | biocversion=$(echo "$bioc_version")
81 | rversion=$(echo "$bioc" | sed -En "/r_ver_for_bioc_ver/,/release_dates/p" | grep "$bioc_version\":" | grep -Eo ": \"[0-9]{1}\.[0-9]{1}" | grep -Eo "[0-9]{1}\.[0-9]{1}")
82 | biocmajor=$(echo "$biocversion" | cut -c 1-1)
83 | biocminor=$(echo "$biocversion" | cut -c 3-4)
84 | bioccont=$(echo "bioconductor/bioconductor_docker:RELEASE_${biocmajor}_${biocminor}")
85 | fi
86 | echo "Found these settings:"
87 | echo "Bioconductor version: $biocversion, R version: $rversion, Bioconductor docker name: $bioccont"
88 | echo "matrix={ \"config\": [{\"os\" : \"ubuntu-latest\", \"r\" : \"${rversion}\", \"bioc\" : \"${biocversion}\", \"cont\" : \"${bioccont}\"} , {\"os\" : \"macOS-latest\", \"r\" : \"${rversion}\", \"bioc\" : \"${biocversion}\"} , {\"os\" : \"windows-latest\", \"r\" : \"${rversion}\", \"bioc\" : \"${biocversion}\" }] }" >> "$GITHUB_OUTPUT"
89 | ## If an OS is failing and you don't want to test it, manually remove it from the 'matrix' JSON entries above
90 |
91 | build-check:
92 | needs: bioc-config
93 | strategy:
94 | fail-fast: false
95 | matrix: ${{fromJson(needs.bioc-config.outputs.matrix)}}
96 | runs-on: ${{ matrix.config.os }}
97 | name: ${{ matrix.config.os }} (${{ matrix.config.r }})
98 | container: ${{ matrix.config.cont }}
99 | env:
100 | R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
101 | NOT_CRAN: true
102 | TZ: UTC
103 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
104 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
105 |
106 | steps:
107 |
108 | ## Most of these steps are the same as the ones in
109 | ## https://github.com/r-lib/actions/blob/master/examples/check-standard.yaml
110 | ## If they update their steps, we will also need to update ours.
111 | - name: Checkout Repository
112 | uses: actions/checkout@v3
113 |
114 | ## R is already included in the Bioconductor docker images
115 | - name: Setup R from r-lib
116 | if: runner.os != 'Linux'
117 | uses: r-lib/actions/setup-r@v2
118 | with:
119 | r-version: ${{ matrix.config.r }}
120 | http-user-agent: ${{ matrix.config.http-user-agent }}
121 |
122 | ## pandoc is already included in the Bioconductor docker images
123 | - name: Setup pandoc from r-lib
124 | if: runner.os != 'Linux'
125 | uses: r-lib/actions/setup-pandoc@v2
126 |
127 | ## Create the path that will be used for caching packages on Linux
128 | - name: Create R_LIBS_USER on Linux
129 | if: runner.os == 'Linux'
130 | run: |
131 | R_LIBS_USER=/__w/_temp/Library
132 | echo "R_LIBS_USER=$R_LIBS_USER" >> "$GITHUB_ENV"
133 | mkdir -p $R_LIBS_USER
134 |
135 | ## Use cached R packages
136 | - name: Restore R package cache
137 | if: "!contains(github.event.head_commit.message, '/nocache')"
138 | uses: actions/cache@v4
139 | with:
140 | path: ${{ env.R_LIBS_USER }}
141 | key: ${{ matrix.config.os }}-${{ matrix.config.r }}-${{ matrix.config.bioc }}-${{ inputs.cache-version }}
142 | restore-keys: ${{ matrix.config.os }}-${{ matrix.config.r }}-${{ matrix.config.bioc }}--${{ inputs.cache-version }}
143 |
144 | ## remotes is needed for isntalling the Linux system dependencies
145 | ## as well as other R packages later on.
146 | - name: Install remotes
147 | run: |
148 | message(paste('****', Sys.time(), 'installing remotes ****'))
149 | install.packages('remotes')
150 | shell: Rscript {0}
151 |
152 | ## This will work again once https://github.com/r-lib/remotes/commit/0e4e23051041d9f1b15a5ab796defec31af6190d
153 | ## makes it to the CRAN version of remotes
154 |
155 | # - name: Install Linux system dependencies
156 | # if: runner.os == 'Linux'
157 | # run: |
158 | # sysreqs=$(Rscript -e 'cat("apt-get update -y && apt-get install -y", paste(gsub("apt-get install -y ", "", remotes::system_requirements("ubuntu", "24.04")), collapse = " "))')
159 | # echo $sysreqs
160 | # sudo -s eval "$sysreqs"
161 |
162 | - name: Install macOS system dependencies
163 | if: matrix.config.os == 'macOS-latest'
164 | run: |
165 | ## Enable installing XML from source if needed
166 | brew install libxml2
167 | echo "XML_CONFIG=/usr/local/opt/libxml2/bin/xml2-config" >> $GITHUB_ENV
168 |
169 | ## Required to install magick as noted at
170 | ## https://github.com/r-lib/usethis/commit/f1f1e0d10c1ebc75fd4c18fa7e2de4551fd9978f#diff-9bfee71065492f63457918efcd912cf2
171 | brew install imagemagick@6
172 |
173 | ## For textshaping, required by ragg, and required by pkgdown
174 | brew install harfbuzz fribidi
175 |
176 | ## For installing usethis's dependency gert
177 | brew install libgit2
178 |
179 | ## Required for tcltk
180 | brew install xquartz --cask
181 |
182 | - name: Install Windows system dependencies
183 | if: runner.os == 'Windows'
184 | run: |
185 | ## Edit below if you have any Windows system dependencies
186 | shell: Rscript {0}
187 |
188 | - name: Install BiocManager
189 | run: |
190 | message(paste('****', Sys.time(), 'installing BiocManager ****'))
191 | remotes::install_cran("BiocManager")
192 | shell: Rscript {0}
193 |
194 | - name: Set BiocVersion
195 | run: |
196 | BiocManager::install(version = "${{ matrix.config.bioc }}", ask = FALSE, force = TRUE)
197 | shell: Rscript {0}
198 |
199 | - name: Install dependencies pass 1
200 | run: |
201 | ## Try installing the package dependencies in steps. First the local
202 | ## dependencies, then any remaining dependencies to avoid the
203 | ## issues described at
204 | ## https://stat.ethz.ch/pipermail/bioc-devel/2020-April/016675.html
205 | ## https://github.com/r-lib/remotes/issues/296
206 | ## Ideally, all dependencies should get installed in the first pass.
207 |
208 | ## For running the checks
209 | message(paste('****', Sys.time(), 'installing rcmdcheck and BiocCheck ****'))
210 | install.packages(c("rcmdcheck", "BiocCheck"), repos = BiocManager::repositories())
211 |
212 | ## Pass #1 at installing dependencies
213 | message(paste('****', Sys.time(), 'pass number 1 at installing dependencies: local dependencies ****'))
214 | remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = FALSE, upgrade = TRUE)
215 | continue-on-error: true
216 | shell: Rscript {0}
217 |
218 | - name: Install dependencies pass 2
219 | run: |
220 | ## Pass #2 at installing dependencies
221 | message(paste('****', Sys.time(), 'pass number 2 at installing dependencies: any remaining dependencies ****'))
222 | remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = TRUE, upgrade = TRUE, force = TRUE)
223 | shell: Rscript {0}
224 |
225 | - name: Install BiocGenerics
226 | if: env.has_RUnit == 'true'
227 | run: |
228 | ## Install BiocGenerics
229 | BiocManager::install("BiocGenerics")
230 | shell: Rscript {0}
231 |
232 | - name: Install covr
233 | if: github.ref == 'refs/heads/devel' && env.run_covr == 'true' && runner.os == 'Linux'
234 | run: |
235 | remotes::install_cran("covr")
236 | shell: Rscript {0}
237 |
238 | - name: Install pkgdown
239 | if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux'
240 | run: |
241 | remotes::install_cran("pkgdown")
242 | shell: Rscript {0}
243 |
244 | - name: Session info
245 | run: |
246 | options(width = 100)
247 | pkgs <- installed.packages()[, "Package"]
248 | sessioninfo::session_info(pkgs, include_base = TRUE)
249 | shell: Rscript {0}
250 |
251 | - name: Run CMD check
252 | env:
253 | _R_CHECK_CRAN_INCOMING_: false
254 | DISPLAY: 99.0
255 | run: |
256 | options(crayon.enabled = TRUE)
257 | rcmdcheck::rcmdcheck(
258 | args = c("--no-manual", "--no-vignettes", "--timings"),
259 | build_args = c("--no-manual", "--keep-empty-dirs", "--no-resave-data"),
260 | error_on = "warning",
261 | check_dir = "check"
262 | )
263 | shell: Rscript {0}
264 |
265 | ## Might need an to add this to the if: && runner.os == 'Linux'
266 | - name: Reveal testthat details
267 | if: env.has_testthat == 'true'
268 | run: find . -name testthat.Rout -exec cat '{}' ';'
269 |
270 | - name: Run RUnit tests
271 | if: env.has_RUnit == 'true'
272 | run: |
273 | BiocGenerics:::testPackage()
274 | shell: Rscript {0}
275 |
276 | - name: Run BiocCheck
277 | env:
278 | DISPLAY: 99.0
279 | run: |
280 | BiocCheck::BiocCheck(
281 | dir('check', 'tar.gz$', full.names = TRUE),
282 | `quit-with-status` = TRUE,
283 | `no-check-R-ver` = TRUE,
284 | `no-check-bioc-help` = TRUE
285 | )
286 | shell: Rscript {0}
287 |
288 | - name: Test coverage
289 | if: github.ref == 'refs/heads/devel' && env.run_covr == 'true' && runner.os == 'Linux'
290 | run: |
291 | covr::codecov(coverage = covr::package_coverage(type = "all"))
292 | shell: Rscript {0}
293 |
294 | - name: Install package
295 | if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux'
296 | run: R CMD INSTALL .
297 |
298 | - name: Build pkgdown site
299 | if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux'
300 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
301 | shell: Rscript {0}
302 | ## Note that you need to run pkgdown::deploy_to_branch(new_process = FALSE)
303 | ## at least one locally before this will work. This creates the gh-pages
304 | ## branch (erasing anything you haven't version controlled!) and
305 | ## makes the git history recognizable by pkgdown.
306 |
307 | - name: Install deploy dependencies
308 | if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux'
309 | run: |
310 | apt-get update && apt-get -y install rsync
311 |
312 | - name: Deploy pkgdown site to GitHub pages 🚀
313 | if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux'
314 | uses: JamesIves/github-pages-deploy-action@releases/v4
315 | with:
316 | clean: false
317 | branch: gh-pages
318 | folder: docs
319 |
320 | - name: Upload check results
321 | if: failure()
322 | uses: actions/upload-artifact@master
323 | with:
324 | name: ${{ runner.os }}-${{ matrix.config.r }}-${{ matrix.config.bioc }}-results
325 | path: check
326 |
327 |
328 | ## Code adapted from
329 | ## https://github.com/waldronlab/cBioPortalData/blob/e0440a4445f0cc731e426363a76faa22ee5e0f9d/.github/workflows/devel_check_dock.yml#L65-L92
330 | docker-build-and-push:
331 | runs-on: ubuntu-latest
332 | needs: build-check
333 | steps:
334 | - name: Checkout Repository
335 | if: "!contains(github.event.head_commit.message, '/nodocker') && env.run_docker == 'true' && github.ref == 'refs/heads/devel'"
336 | uses: actions/checkout@v3
337 |
338 | - name: Register repo name
339 | if: "!contains(github.event.head_commit.message, '/nodocker') && env.run_docker == 'true' && github.ref == 'refs/heads/devel'"
340 | id: reg_repo_name
341 | run: |
342 | echo CONT_IMG_NAME=$(echo ${{ github.event.repository.name }} | tr '[:upper:]' '[:lower:]') >> $GITHUB_ENV
343 |
344 | - name: Set up QEMU
345 | if: "!contains(github.event.head_commit.message, '/nodocker') && env.run_docker == 'true' && github.ref == 'refs/heads/devel'"
346 | uses: docker/setup-qemu-action@v2
347 |
348 | - name: Set up Docker Buildx
349 | if: "!contains(github.event.head_commit.message, '/nodocker') && env.run_docker == 'true' && github.ref == 'refs/heads/devel'"
350 | uses: docker/setup-buildx-action@v2
351 |
352 | - name: Login to Docker Hub
353 | if: "!contains(github.event.head_commit.message, '/nodocker') && env.run_docker == 'true' && github.ref == 'refs/heads/devel'"
354 | uses: docker/login-action@v2
355 | with:
356 | username: ${{ secrets.DOCKERHUB_USERNAME }}
357 | password: ${{ secrets.DOCKERHUB_TOKEN }}
358 | ## Note that DOCKERHUB_TOKEN is really a token for your dockerhub
359 | ## account, not your actual dockerhub account password. You can get it
360 | ## from https://hub.docker.com/settings/security.
361 | ## Check https://github.com/docker/build-push-action/tree/v4.0.0
362 | ## for more details.
363 | ## Alternatively, try checking
364 | ## https://seandavi.github.io/BuildABiocWorkshop/articles/HOWTO_BUILD_WORKSHOP.html.
365 |
366 | - name: Build and Push Docker
367 | if: "!contains(github.event.head_commit.message, '/nodocker') && env.run_docker == 'true' && github.ref == 'refs/heads/devel' && success()"
368 | uses: docker/build-push-action@v4
369 | with:
370 | context: .
371 | push: true
372 | tags: >
373 | ${{ secrets.DOCKERHUB_USERNAME }}/${{ env.CONT_IMG_NAME }}:latest,
374 | ${{ secrets.DOCKERHUB_USERNAME }}/${{ env.CONT_IMG_NAME }}:devel
375 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .RData
2 | .Rhistory
3 | .DS_Store
4 | .Ruserdata
5 | .Rproj.user
6 | vignettes/*/
7 | vignettes/*.html
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: muscat
2 | Title: Multi-sample multi-group scRNA-seq data analysis tools
3 | Description: `muscat` provides various methods and visualization tools
4 | for DS analysis in multi-sample, multi-group, multi-(cell-)subpopulation
5 | scRNA-seq data, including cell-level mixed models and methods based on
6 | aggregated “pseudobulk” data, as well as a flexible simulation platform
7 | that mimics both single and multi-sample scRNA-seq data.
8 | Type: Package
9 | Version: 1.22.0
10 | Depends: R (>= 4.4)
11 | Authors@R: c(
12 | person("Helena L.", "Crowell",
13 | role=c("aut", "cre"),
14 | email="helena@crowell.eu",
15 | comment=c(ORCID="0000-0002-4801-1767")),
16 | person("Pierre-Luc", "Germain", role="aut"),
17 | person("Charlotte", "Soneson", role="aut"),
18 | person("Anthony", "Sonrel", role="aut"),
19 | person("Jeroen", "Gilis", role="aut"),
20 | person("Davide", "Risso", role="aut"),
21 | person("Lieven", "Clement", role="aut"),
22 | person("Mark D.", "Robinson", role=c("aut", "fnd"),
23 | email="mark.robinson@imls.uzh.ch"))
24 | Imports:
25 | BiocParallel, blme,
26 | ComplexHeatmap,
27 | data.table, DESeq2, dplyr,
28 | edgeR,
29 | ggplot2, glmmTMB, grDevices, grid,
30 | limma, lmerTest, lme4,
31 | Matrix, matrixStats, methods,
32 | progress, purrr,
33 | rlang,
34 | S4Vectors, scales, scater, scuttle, sctransform, stats,
35 | SingleCellExperiment, SummarizedExperiment,
36 | variancePartition, viridis
37 | Suggests:
38 | BiocStyle,
39 | countsimQC,
40 | ExperimentHub,
41 | iCOBRA,
42 | knitr,
43 | patchwork,
44 | phylogram,
45 | RColorBrewer,
46 | reshape2,
47 | rmarkdown,
48 | statmod,
49 | stageR,
50 | testthat,
51 | UpSetR
52 | biocViews: ImmunoOncology, DifferentialExpression, Sequencing,
53 | SingleCell, Software, StatisticalMethod, Visualization
54 | License: GPL-3
55 | VignetteBuilder: knitr
56 | RoxygenNote: 7.3.2
57 | Encoding: UTF-8
58 | URL: https://github.com/HelenaLC/muscat
59 | BugReports: https://github.com/HelenaLC/muscat/issues
60 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | export(aggregateData)
4 | export(calcExprFreqs)
5 | export(mmDS)
6 | export(pbDD)
7 | export(pbDS)
8 | export(pbFlatten)
9 | export(pbHeatmap)
10 | export(pbMDS)
11 | export(prepSCE)
12 | export(prepSim)
13 | export(resDS)
14 | export(simData)
15 | export(stagewise_DS_DD)
16 | import(ggplot2)
17 | importFrom(BiocParallel,MulticoreParam)
18 | importFrom(BiocParallel,SerialParam)
19 | importFrom(BiocParallel,bplapply)
20 | importFrom(ComplexHeatmap,Heatmap)
21 | importFrom(ComplexHeatmap,columnAnnotation)
22 | importFrom(ComplexHeatmap,rowAnnotation)
23 | importFrom(DESeq2,DESeq)
24 | importFrom(DESeq2,DESeqDataSetFromMatrix)
25 | importFrom(DESeq2,estimateDispersions)
26 | importFrom(DESeq2,results)
27 | importFrom(DESeq2,sizeFactors)
28 | importFrom(DESeq2,varianceStabilizingTransformation)
29 | importFrom(Matrix,colSums)
30 | importFrom(Matrix,qr)
31 | importFrom(Matrix,rowMeans)
32 | importFrom(Matrix,rowSums)
33 | importFrom(Matrix,t)
34 | importFrom(S4Vectors,"metadata<-")
35 | importFrom(S4Vectors,DataFrame)
36 | importFrom(S4Vectors,as.list)
37 | importFrom(S4Vectors,metadata)
38 | importFrom(S4Vectors,split)
39 | importFrom(S4Vectors,unfactor)
40 | importFrom(SingleCellExperiment,"counts<-")
41 | importFrom(SingleCellExperiment,"int_colData<-")
42 | importFrom(SingleCellExperiment,"sizeFactors<-")
43 | importFrom(SingleCellExperiment,SingleCellExperiment)
44 | importFrom(SingleCellExperiment,colData)
45 | importFrom(SingleCellExperiment,counts)
46 | importFrom(SingleCellExperiment,int_colData)
47 | importFrom(SingleCellExperiment,reducedDims)
48 | importFrom(SingleCellExperiment,sizeFactors)
49 | importFrom(SummarizedExperiment,"assay<-")
50 | importFrom(SummarizedExperiment,"colData<-")
51 | importFrom(SummarizedExperiment,"rowData<-")
52 | importFrom(SummarizedExperiment,SummarizedExperiment)
53 | importFrom(SummarizedExperiment,assay)
54 | importFrom(SummarizedExperiment,assayNames)
55 | importFrom(SummarizedExperiment,assays)
56 | importFrom(SummarizedExperiment,colData)
57 | importFrom(SummarizedExperiment,rowData)
58 | importFrom(blme,bglmer)
59 | importFrom(blme,blmer)
60 | importFrom(data.table,data.table)
61 | importFrom(dplyr,"%>%")
62 | importFrom(dplyr,all_of)
63 | importFrom(dplyr,bind_cols)
64 | importFrom(dplyr,bind_rows)
65 | importFrom(dplyr,filter)
66 | importFrom(dplyr,full_join)
67 | importFrom(dplyr,inner_join)
68 | importFrom(dplyr,last)
69 | importFrom(dplyr,mutate)
70 | importFrom(dplyr,mutate_all)
71 | importFrom(dplyr,mutate_at)
72 | importFrom(dplyr,mutate_if)
73 | importFrom(dplyr,pull)
74 | importFrom(dplyr,rename)
75 | importFrom(dplyr,select)
76 | importFrom(edgeR,DGEList)
77 | importFrom(edgeR,calcNormFactors)
78 | importFrom(edgeR,cpm)
79 | importFrom(edgeR,estimateDisp)
80 | importFrom(edgeR,filterByExpr)
81 | importFrom(edgeR,glmFit)
82 | importFrom(edgeR,glmQLFTest)
83 | importFrom(edgeR,glmQLFit)
84 | importFrom(edgeR,glmTreat)
85 | importFrom(edgeR,plotMDS.DGEList)
86 | importFrom(edgeR,topTags)
87 | importFrom(glmmTMB,glmmTMB)
88 | importFrom(glmmTMB,nbinom1)
89 | importFrom(grDevices,colorRampPalette)
90 | importFrom(grid,gpar)
91 | importFrom(limma,contrasts.fit)
92 | importFrom(limma,duplicateCorrelation)
93 | importFrom(limma,eBayes)
94 | importFrom(limma,lmFit)
95 | importFrom(limma,makeContrasts)
96 | importFrom(limma,topTable)
97 | importFrom(limma,topTreat)
98 | importFrom(limma,treat)
99 | importFrom(limma,voom)
100 | importFrom(lme4,.makeCC)
101 | importFrom(lme4,lmerControl)
102 | importFrom(lmerTest,contest)
103 | importFrom(lmerTest,lmer)
104 | importFrom(matrixStats,colAnys)
105 | importFrom(matrixStats,rowAnyNAs)
106 | importFrom(matrixStats,rowMedians)
107 | importFrom(matrixStats,rowMins)
108 | importFrom(matrixStats,rowQuantiles)
109 | importFrom(matrixStats,rowSds)
110 | importFrom(methods,is)
111 | importFrom(progress,progress_bar)
112 | importFrom(purrr,map)
113 | importFrom(purrr,map_depth)
114 | importFrom(purrr,modify_depth)
115 | importFrom(purrr,negate)
116 | importFrom(purrr,reduce)
117 | importFrom(purrr,set_names)
118 | importFrom(rlang,.data)
119 | importFrom(scales,hue_pal)
120 | importFrom(scater,computeLibraryFactors)
121 | importFrom(scater,isOutlier)
122 | importFrom(sctransform,vst)
123 | importFrom(scuttle,summarizeAssayByGroup)
124 | importFrom(stats,as.formula)
125 | importFrom(stats,coef)
126 | importFrom(stats,df.residual)
127 | importFrom(stats,model.matrix)
128 | importFrom(stats,p.adjust)
129 | importFrom(stats,residuals)
130 | importFrom(stats,rgamma)
131 | importFrom(stats,rnbinom)
132 | importFrom(stats,sd)
133 | importFrom(stats,setNames)
134 | importFrom(variancePartition,dream)
135 | importFrom(variancePartition,getContrast)
136 | importFrom(variancePartition,voomWithDreamWeights)
137 | importFrom(viridis,viridis)
138 |
--------------------------------------------------------------------------------
/NEWS:
--------------------------------------------------------------------------------
1 | Changes in version 1.19.1
2 |
3 | + added J Gilis, D Risso, L Clement as authors
4 | + differential detection with 'pbDS(..., method="DD")' or 'pbDD()'
5 | & stagewise testing [Gilis et al.], plus corresponding vignette
6 | + replace 'aes_string()' in 'ggplot()' by '.data$.' from 'rlang'
7 |
8 | Changes in version 1.15.1
9 |
10 | + bug fix in 'pbDS': too stringent filtering causing no genes in any clusters
11 | to be tested previously resulted in a 'subscript out of bounds' error;
12 | execution is stopped and an informative error thrown instead.
13 |
14 | + bug fix in 'mmDS': 'dream' (new version?) wouldn't recognize
15 | model variables provided as data; fixed via adding 'as.formula()'.
16 |
17 | + "analysis" vignette: replaced suspended 'dplyr' function
18 | 'top_n' with 'slice_min' when filtering for top DS hits;
19 | fixed some typos; updated preprint to journal reference.
20 |
21 | Changes in version 1.15.0
22 |
23 | + Bioconductor release v3.17
24 |
25 | Changes in version 1.12.1
26 |
27 | + fixed various typos in both vignettes
28 | + internal fixes to keep up with 'ggplot2' & 'dplyr' updates
29 | + bug fix in 'simDS' computing means when one group is missing
30 | + bug fix in 'resDS' until testing when 'cpm/frq = TRUE'
31 |
32 | Changes in version 1.11.1
33 |
34 | + bug fix in pbHeatmap(): previously failed for results from 'mmDS()'
35 |
36 | Changes in version 1.9.3
37 |
38 | + bug fix in pbDS(): drop samples w/o any detected features,
39 | otherwise edgeR::calcNormFactors() fails when lib.size 0
40 |
41 | Changes in version 1.8.1
42 |
43 | + bug fix in prepSim(): removal of genes with NA coefficients
44 | was previously not propagated to the dispersion estimates
45 |
46 | + bug fix in test-resDR.R: set 'min_cells = 0' to assure that
47 | everything is being tested, otherwise unit tests could fail
48 |
49 | Changes in version 1.8.0
50 |
51 | + Bioconductor 3.14 release
52 |
53 | Changes in version 1.7.2
54 |
55 | + bug fix in prepSim(): removal of NA coefficients and
56 | subsetting of the input SCE was previously out of synch
57 |
58 | Changes in version 1.5.2
59 |
60 | + added edgeR::calcNormFactors() step in prepSim()
61 |
62 | + added argument 'dd' to simData() specifying
63 | whether or not to simulate 2 groups
64 |
65 | + prepSim() and simData() now support simulation of "singular" design
66 | (no samples, no clusters), as well as only samples/clusters
67 |
68 | + simData() defaults to simulating as many samples as available
69 | in order to avoid re-use (duplication) of reference samples
70 |
71 | Changes in version 1.5.1
72 |
73 | + significant speed-up of aggregateData() by replacing usage
74 | of rowX() over a list with scuttle::summarizeAssayByGroup()
75 |
76 | + added options use "prop.detected" and "num.detected"
77 | as summary statistic (argument 'fun') in aggregateData()
78 |
79 | + added parallelization support in aggregateData() and pbDS() through argument BBPARAM
80 | (passed to scater::sumCountsAcrossCells() and BiocParallel::bplapply, respectively)
81 |
82 | + aggregateData() now stores the number of cells that went into aggregation under
83 | int_colData(.)$n_cells (vs. metadata(.)$n_cells) to support automated subsetting
84 |
85 | + replaced argument n_threads with BPPARAM throughout all
86 | parallelizable functions (aggregateData(), pbDS(), mmDS())
87 |
88 | + bug fix in prepSim(): the function previously failed when
89 | cluster/sample/group_id cell metadata columns were non-factors
90 |
91 | + bug fix in resDS(): cpm = TRUE previously didn't handle
92 | missing cluster-sample combinations correctly
--------------------------------------------------------------------------------
/R/aggregateData.R:
--------------------------------------------------------------------------------
1 | #' @rdname aggregateData
2 | #' @title Aggregation of single-cell to pseudobulk data
3 | #'
4 | #' @description ...
5 | #'
6 | #' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
7 | #' @param assay character string specifying the assay slot to use as
8 | #' input data. Defaults to the 1st available (\code{assayNames(x)[1]}).
9 | #' @param by character vector specifying which
10 | #' \code{colData(x)} columns to summarize by (at most 2!).
11 | #' @param fun a character string.
12 | #' Specifies the function to use as summary statistic.
13 | #' Passed to \code{\link[scuttle]{summarizeAssayByGroup}}.
14 | #' @param scale logical. Should pseudo-bulks be scaled
15 | #' with the effective library size & multiplied by 1M?
16 | #' @param BPPARAM a \code{\link[BiocParallel]{BiocParallelParam}}
17 | #' object specifying how aggregation should be parallelized.
18 | #' @param verbose logical. Should information on progress be reported?
19 | #'
20 | #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
21 | #' \itemize{
22 | #' \item{If \code{length(by) == 2}, each sheet (\code{assay}) contains
23 | #' pseudobulks for each of \code{by[1]}, e.g., for each cluster when
24 | #' \code{by = "cluster_id"}. Rows correspond to genes, columns to
25 | #' \code{by[2]}, e.g., samples when \code{by = "sample_id"}}.
26 | #' \item{If \code{length(by) == 1}, the returned SCE will contain only
27 | #' a single \code{assay} with rows = genes and colums = \code{by}.}}
28 | #'
29 | #' Aggregation parameters (\code{assay, by, fun, scaled}) are stored in
30 | #' \code{metadata()$agg_pars}, and the number of cells that were aggregated
31 | #' are accessible in \code{int_colData()$n_cells}.
32 | #'
33 | #' @examples
34 | #' # pseudobulk counts by cluster-sample
35 | #' data(example_sce)
36 | #' pb <- aggregateData(example_sce)
37 | #'
38 | #' library(SingleCellExperiment)
39 | #' assayNames(example_sce) # one sheet per cluster
40 | #' head(assay(example_sce)) # n_genes x n_samples
41 | #'
42 | #' # scaled CPM
43 | #' cpm <- edgeR::cpm(assay(example_sce))
44 | #' assays(example_sce)$cpm <- cpm
45 | #' pb <- aggregateData(example_sce, assay = "cpm", scale = TRUE)
46 | #' head(assay(pb))
47 | #'
48 | #' # aggregate by cluster only
49 | #' pb <- aggregateData(example_sce, by = "cluster_id")
50 | #' length(assays(pb)) # single assay
51 | #' head(assay(pb)) # n_genes x n_clusters
52 | #'
53 | #' @author Helena L Crowell & Mark D Robinson
54 | #'
55 | #' @references
56 | #' Crowell, HL, Soneson, C, Germain, P-L, Calini, D,
57 | #' Collin, L, Raposo, C, Malhotra, D & Robinson, MD:
58 | #' On the discovery of population-specific state transitions from
59 | #' multi-sample multi-condition single-cell RNA sequencing data.
60 | #' \emph{bioRxiv} \strong{713412} (2018).
61 | #' doi: \url{https://doi.org/10.1101/713412}
62 | #'
63 | #' @importFrom Matrix colSums
64 | #' @importFrom purrr map
65 | #' @importFrom S4Vectors DataFrame metadata
66 | #' @importFrom SingleCellExperiment SingleCellExperiment int_colData<-
67 | #' @importFrom SummarizedExperiment colData colData<-
68 | #' @export
69 |
70 | aggregateData <- function(x,
71 | assay = NULL, by = c("cluster_id", "sample_id"),
72 | fun = c("sum", "mean", "median", "prop.detected", "num.detected"),
73 | scale = FALSE, verbose = TRUE, BPPARAM = SerialParam(progressbar = verbose)) {
74 |
75 | # check validity of input arguments
76 | fun <- match.arg(fun)
77 | if (is.null(assay))
78 | assay <- assayNames(x)[1]
79 | .check_arg_assay(x, assay)
80 | .check_args_aggData(as.list(environment()))
81 | stopifnot(is(BPPARAM, "BiocParallelParam"))
82 |
83 | # assure 'by' colData columns are factors
84 | # so that missing combinations aren't dropped
85 | for (i in by)
86 | if (!is.factor(x[[i]]))
87 | x[[i]] <- factor(x[[i]])
88 |
89 | # compute pseudo-bulks
90 | pb <- .pb(x, by, assay, fun, BPPARAM)
91 | if (scale & length(by) == 2) {
92 | # compute library sizes
93 | cs <- if (assay == "counts" && fun == "sum")
94 | pb else .pb(x, by, "counts", "sum", BPPARAM)
95 | ls <- lapply(cs, colSums)
96 | # scale pseudobulks by CPM
97 | pb <- lapply(seq_along(pb), function(i) pb[[i]] / 1e6 * ls[[i]])
98 | names(pb) <- names(ls)
99 | }
100 |
101 | # construct SCE
102 | md <- metadata(x)
103 | md$agg_pars <- list(assay = assay, by = by, fun = fun, scale = scale)
104 | pb <- SingleCellExperiment(pb, rowData = rowData(x), metadata = md)
105 |
106 | # tabulate number of cells
107 | cd <- data.frame(colData(x)[, by])
108 | for (i in names(cd))
109 | if (is.factor(cd[[i]]))
110 | cd[[i]] <- droplevels(cd[[i]])
111 | ns <- table(cd)
112 | if (length(by) == 2) {
113 | ns <- asplit(ns, 2)
114 | ns <- map(ns, ~c(unclass(.)))
115 | } else ns <- c(unclass(ns))
116 | int_colData(pb)$n_cells <- ns
117 |
118 | # propagate 'colData' columns that are unique across 2nd 'by'
119 | if (length(by) == 2) {
120 | cd <- colData(x)
121 | ids <- colnames(pb)
122 | counts <- vapply(ids, function(u) {
123 | m <- as.logical(match(cd[, by[2]], u, nomatch = 0))
124 | vapply(cd[m, ], function(u) length(unique(u)), numeric(1))
125 | }, numeric(ncol(colData(x))))
126 | cd_keep <- apply(counts, 1, function(u) all(u == 1))
127 | cd_keep <- setdiff(names(which(cd_keep)), by)
128 | if (length(cd_keep) != 0) {
129 | m <- match(ids, cd[, by[2]], nomatch = 0)
130 | cd <- cd[m, cd_keep, drop = FALSE]
131 | rownames(cd) <- ids
132 | colData(pb) <- cd
133 | }
134 | }
135 | return(pb)
136 | }
137 |
--------------------------------------------------------------------------------
/R/calcExprFreqs.R:
--------------------------------------------------------------------------------
1 | #' calcExprFreqs
2 | #'
3 | #' Calculates gene expression frequencies
4 | #'
5 | #' \code{calcExprFreq} computes, for each sample and group (in each cluster),
6 | #' the fraction of cells that express a given gene. Here, a gene is considered
7 | #' to be expressed when the specified measurement value (\code{assay})
8 | #' lies above the specified threshold value (\code{th}).
9 | #'
10 | #' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
11 | #' @param assay a character string specifying which assay to use.
12 | #' @param th numeric threshold value above which
13 | #' a gene should be considered to be expressed.
14 | #'
15 | #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
16 | #' containing, for each cluster, an assay of dimensions #genes x #samples
17 | #' giving the fraction of cells that express each gene in each sample.
18 | #' If \code{colData(x)} contains a \code{"group_id"} column, the fraction
19 | #' of expressing cells in each each group will be included as well.
20 | #'
21 | #' @examples
22 | #' data(example_sce)
23 | #' library(SingleCellExperiment)
24 | #'
25 | #' frq <- calcExprFreqs(example_sce)
26 | #'
27 | #' # one assay per cluster
28 | #' assayNames(frq)
29 | #'
30 | #' # expression frequencies by
31 | #' # sample & group; 1st cluster:
32 | #' head(assay(frq))
33 | #'
34 | #' @author Helena L Crowell & Mark D Robinson
35 | #'
36 | #' @importFrom Matrix rowMeans
37 | #' @importFrom methods is
38 | #' @importFrom purrr set_names
39 | #' @importFrom SummarizedExperiment assays colData SummarizedExperiment
40 | #' @export
41 |
42 | calcExprFreqs <- function(x, assay = "counts", th = 0) {
43 | # check validity of input arguments
44 | .check_sce(x, req_group = FALSE)
45 | .check_arg_assay(x, assay)
46 | stopifnot(is.numeric(th), length(th) == 1)
47 |
48 | # split cells by cluster-sample
49 | cs_by_ks <- .split_cells(x)
50 |
51 | # for each gene, compute fraction of cells
52 | # w/ assay value above threshold in each sample
53 | y <- assays(x)[[assay]]
54 | fq <- lapply(cs_by_ks, vapply, function(i) {
55 | if (is.null(i)) return(rep(0, nrow(x)))
56 | Matrix::rowMeans(y[, i, drop = FALSE] > th)
57 | }, numeric(nrow(x)))
58 |
59 | # same for ea. group (if colData column "group_id" exists)
60 | if ("group_id" %in% colnames(colData(x))) {
61 | kids <- x$cluster_id
62 | nc_by_ks <- table(kids, x$sample_id)
63 | nc_by_kg <- table(kids, x$group_id)
64 |
65 | ei <- metadata(x)$experiment_info
66 | s_by_g <- split(ei$sample_id, ei$group_id)
67 | gids <- names(s_by_g)
68 | kids <- set_names(levels(kids))
69 |
70 | fq <- lapply(kids, function(k) {
71 | nc_by_s <- fq[[k]] * nc_by_ks[k, ][col(fq[[k]])]
72 | fq_by_g <- vapply(gids, function(g) {
73 | nc_g <- nc_by_s[, s_by_g[[g]], drop = FALSE]
74 | rowSums(nc_g) / nc_by_kg[k, g]
75 | }, numeric(nrow(x)))
76 | cbind(fq[[k]], fq_by_g)
77 | })
78 | }
79 | # return SCE
80 | SingleCellExperiment(assays = fq)
81 | }
82 |
--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
1 | #' @rdname data
2 | #' @name data
3 | #' @aliases data example_sce
4 | #'
5 | #' @title Example datasets
6 | #'
7 | #' @description
8 | #' A \code{\link[SingleCellExperiment]{SingleCellExperiment}} containing
9 | #' 10x droplet-based scRNA-seq PBCM data from 8 Lupus patients befor and after
10 | #' 6h-treatment with INF-beta (16 samples in total).
11 | #'
12 | #' The original data has been filtered to
13 | #' \itemize{
14 | #' \item{remove unassigned cells & cell multiplets}
15 | #' \item{retain only 4 out of 8 samples per experimental group}
16 | #' \item{retain only 5 out of 8 subpopulations (clusters)}
17 | #' \item{retain genes with a count > 1 in > 50 cells}
18 | #' \item{retain cells with > 200 detected genes}
19 | #' \item{retain at most 100 cells per cluster-sample instance}
20 | #' }
21 | #'
22 | #' Assay \code{logcounts} corresponds to log-normalized values
23 | #' obtained from \code{\link[scater]{logNormCounts}} with default parameters.
24 | #'
25 | #' The original measurement data, as well as gene and cell metadata
26 | #' is available through the NCBI GEO accession number GSE96583;
27 | #' code to reproduce this example dataset from the original data
28 | #' is provided in the examples section.
29 | #'
30 | #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
31 | #'
32 | #' @examples
33 | #' \donttest{
34 | #' # set random seed for cell sampling
35 | #' set.seed(2929)
36 | #'
37 | #' # load data
38 | #' library(ExperimentHub)
39 | #' eh <- ExperimentHub()
40 | #' sce <- eh[["EH2259"]]
41 | #'
42 | #' # drop unassigned cells & multiplets
43 | #' sce <- sce[, !is.na(sce$cell)]
44 | #' sce <- sce[, sce$multiplets == "singlet"]
45 | #'
46 | #' # keep 4 samples per group
47 | #' sce$id <- paste0(sce$stim, sce$ind)
48 | #' inds <- sample(sce$ind, 4)
49 | #' ids <- paste0(levels(sce$stim), rep(inds, each = 2))
50 | #' sce <- sce[, sce$id %in% ids]
51 | #'
52 | #' # keep 5 clusters
53 | #' kids <- c("B cells", "CD4 T cells", "CD8 T cells",
54 | #' "CD14+ Monocytes", "FCGR3A+ Monocytes")
55 | #' sce <- sce[, sce$cell %in% kids]
56 | #' sce$cell <- droplevels(sce$cell)
57 | #'
58 | #' # basic filtering on genes & cells
59 | #' gs <- rowSums(counts(sce) > 1) > 50
60 | #' cs <- colSums(counts(sce) > 0) > 200
61 | #' sce <- sce[gs, cs]
62 | #'
63 | #' # sample max. 100 cells per cluster-sample
64 | #' cs_by_ks <- split(colnames(sce), list(sce$cell, sce$id))
65 | #' cs <- sapply(cs_by_ks, function(u)
66 | #' sample(u, min(length(u), 100)))
67 | #' sce <- sce[, unlist(cs)]
68 | #'
69 | #' # compute logcounts
70 | #' library(scater)
71 | #' sce <- computeLibraryFactors(sce)
72 | #' sce <- logNormCounts(sce)
73 | #'
74 | #' # re-format for 'muscat'
75 | #' sce <- prepSCE(sce,
76 | #' kid = "cell",
77 | #' sid = "id",
78 | #' gid = "stim",
79 | #' drop = TRUE)
80 | #' }
81 | #'
82 | #' @references
83 | #' Kang et al. (2018). Multiplexed droplet single-cell RNA-sequencing
84 | #' using natural genetic variation. \emph{Nature Biotechnology},
85 | #' \bold{36}(1): 89-94. DOI: 10.1038/nbt.4042.
86 | #'
87 | #' @author Helena L Crowell
88 |
89 | NULL
90 |
--------------------------------------------------------------------------------
/R/mmDS.R:
--------------------------------------------------------------------------------
1 | #' @rdname mmDS
2 | #' @title DS analysis using mixed-models (MM)
3 | #'
4 | #' @description Performs cluster-wise DE analysis by fitting cell-level models.
5 | #'
6 | #' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
7 | #' @param coef character specifying the coefficient to test.
8 | #' If NULL (default), will test the last level of \code{"group_id"}.
9 | #' @param covs character vector of \code{colData(x)}
10 | #' column names to use as covariates.
11 | #' @param method a character string.
12 | #' Either \code{"dream2"} (default, lme4 with voom-weights),
13 | #' \code{"dream"} (previous implementation of the dream method),
14 | #' \code{"vst"} (variance-stabilizing transformation),
15 | #' \code{"poisson"} (poisson GLM-MM),
16 | #' \code{"nbinom"} (negative binomial GLM-MM),
17 | #' \code{"hybrid"} (combination of pseudobulk and poisson methods)
18 | #' or a function accepting the same arguments.
19 | #' @param n_cells number of cells per cluster-sample
20 | #' required to consider a sample for testing.
21 | #' @param n_samples number of samples per group
22 | #' required to consider a cluster for testing.
23 | #' @param min_count numeric. For a gene to be tested in a given cluster,
24 | #' at least \code{min_cells} must have a count >= \code{min_count}.
25 | #' @param min_cells number (or fraction, if < 1) of cells with a count >
26 | #' \code{min_count} required for a gene to be tested in a given cluster.
27 | #' @param BPPARAM a \code{\link[BiocParallel]{BiocParallelParam}}
28 | #' object specifying how differential testing should be parallelized.
29 | #' @param verbose logical specifying whether messages
30 | #' on progress and a progress bar should be displayed.
31 | #'
32 | #' @return a data.frame
33 | #'
34 | #' @examples
35 | #' # subset "B cells" cluster
36 | #' data(example_sce)
37 | #' b_cells <- example_sce$cluster_id == "B cells"
38 | #' sub <- example_sce[, b_cells]
39 | #' sub$cluster_id <- droplevels(sub$cluster_id)
40 | #'
41 | #' # downsample to 100 genes
42 | #' gs <- sample(nrow(sub), 100)
43 | #' sub <- sub[gs, ]
44 | #'
45 | #' # run DS analysis using cell-level mixed-model
46 | #' res <- mmDS(sub, method = "dream", verbose = FALSE)
47 | #' head(res$`B cells`)
48 | #'
49 | #' @author Pierre-Luc Germain & Helena L Crowell
50 | #'
51 | #' @references
52 | #' Crowell, HL, Soneson, C, Germain, P-L, Calini, D,
53 | #' Collin, L, Raposo, C, Malhotra, D & Robinson, MD:
54 | #' On the discovery of population-specific state transitions from
55 | #' multi-sample multi-condition single-cell RNA sequencing data.
56 | #' \emph{bioRxiv} \strong{713412} (2018).
57 | #' doi: \url{https://doi.org/10.1101/713412}
58 | #'
59 | #' @importFrom DESeq2 DESeqDataSetFromMatrix estimateDispersions
60 | #' sizeFactors varianceStabilizingTransformation
61 | #' @importFrom dplyr %>% mutate bind_rows
62 | #' @importFrom matrixStats rowMins
63 | #' @importFrom progress progress_bar
64 | #' @importFrom purrr map_depth
65 | #' @importFrom sctransform vst
66 | #' @importFrom SingleCellExperiment counts counts<-
67 | #' colData sizeFactors sizeFactors<-
68 | #' @importFrom stats p.adjust
69 | #' @export
70 |
71 | mmDS <- function(x, coef = NULL, covs = NULL,
72 | method = c("dream2", "dream", "vst", "poisson", "nbinom", "hybrid"),
73 | n_cells = 10, n_samples = 2, min_count = 1, min_cells = 20,
74 | verbose = TRUE, BPPARAM = SerialParam(progressbar = verbose),
75 | vst = c("sctransform", "DESeq2"),
76 | ddf = c("Satterthwaite", "Kenward-Roger", "lme4"),
77 | dup_corr = FALSE, trended = FALSE, bayesian = FALSE,
78 | blind = TRUE, REML = TRUE, moderate = FALSE) {
79 |
80 | # check validity of input arguments
81 | .check_sce(x, req_group = TRUE)
82 | .check_arg_assay(x, "counts")
83 | .check_args_mmDS(as.list(environment()))
84 | stopifnot(is(BPPARAM, "BiocParallelParam"))
85 |
86 | args <- as.list(environment())
87 | args$method <- match.arg(method)
88 | args$vst <- match.arg(vst)
89 | args$ddf <- match.arg(ddf)
90 |
91 | # counts cells per cluster-sample
92 | n_cells_by_ks <- table(x$cluster_id, x$sample_id)
93 |
94 | # filter clusters w/ >= n_cells in >= n_samples
95 | if (!is.null(metadata(x)$experiment_info$group_id)) {
96 | ei <- metadata(x)$experiment_info
97 | m <- match(levels(x$sample_id), ei$sample_id)
98 | gids <- ei$group_id[m]
99 | } else {
100 | gids <- x$group_id
101 | }
102 | ks_keep <- apply(n_cells_by_ks > n_cells, 1,
103 | function(u) all(tabulate(gids[u]) >= n_samples))
104 | if (sum(ks_keep) == 0)
105 | stop("No cluster has at least ", n_samples,
106 | " samples with at least ", n_cells, " cells.")
107 |
108 | kids <- levels(x$cluster_id)
109 | if (verbose && sum(ks_keep) < length(kids))
110 | message(paste("Skipping cluster(s)",
111 | paste(dQuote(kids[!ks_keep]), collapse = ", "),
112 | "\ndue to an insufficient number of samples",
113 | "with a sufficient number of cells."))
114 | kids <- kids[ks_keep]
115 | names(kids) <- kids
116 |
117 | # split cells by cluster
118 | cells_by_k <- split(colnames(x), x$cluster_id)
119 |
120 | if (min_count < 1) {
121 | min_count <- floor(min_count * rowMins(n_cells_by_ks))
122 | } else {
123 | min_count <- rep(min_count, length(kids))
124 | }
125 | names(min_count) <- kids
126 |
127 | # variance-stabilizing transformation
128 | if (args$method == "vst" && !"vstresiduals" %in% assayNames(x)) {
129 | vst_call <- switch(args$vst,
130 | sctransform = expression(.vst_sctransform(x, verbose)),
131 | DESeq2 = expression(.vst_DESeq2(x, covs, blind)))
132 | if (verbose) {
133 | assay(x, "vstresiduals", FALSE) <- eval(vst_call)
134 | } else {
135 | assay(x, "vstresiduals", FALSE) <- suppressMessages(eval(vst_call))
136 | }
137 | }
138 |
139 | # get method function & construct correct call
140 | if (!is.function(fun <- args$method))
141 | fun <- get(paste0(".mm_", fun))
142 | args_use <- names(formals(fun))
143 | args <- args[names(args) %in% args_use]
144 |
145 | if (verbose) pb <- progress_bar$new(total = length(kids))
146 | res <- lapply(kids, function(k) {
147 | y <- x[, cells_by_k[[k]]]
148 | y <- y[rowSums(counts(y) >= min_count[k]) > min_cells, ]
149 | if (verbose)
150 | message("Testing ", nrow(y), " genes across ",
151 | ncol(y), " cells in cluster ", dQuote(k), "...")
152 |
153 | # call to .mm_dream/.mm_vst
154 | args$x <- y
155 | z <- do.call(fun, args)
156 | data.frame(
157 | gene = rownames(y),
158 | cluster_id = k, z,
159 | row.names = NULL,
160 | stringsAsFactors = FALSE)
161 | })
162 | if (verbose) pb$terminate()
163 |
164 | # assemble results from all cluster
165 | res <- bind_rows(res)
166 | # global p-value adjustment
167 | p_adj.glb <- p.adjust(res$p_val, method = "BH")
168 | i <- which(colnames(res) == "p_adj.loc")
169 | res[["p_adj.glb"]] <- p_adj.glb
170 | res <- res[, c(seq_len(i), ncol(res), seq_len(ncol(res)-1)[-seq_len(i)])]
171 | # re-split by cluster
172 | split(res, res$cluster_id)
173 | }
174 |
--------------------------------------------------------------------------------
/R/pbDS.R:
--------------------------------------------------------------------------------
1 | #' @rdname pbDS
2 | #' @title pseudobulk DS analysis
3 | #'
4 | #' @description \code{pbDS} tests for DS after aggregating single-cell
5 | #' measurements to pseudobulk data, by applying bulk RNA-seq DE methods,
6 | #' such as \code{edgeR}, \code{DESeq2} and \code{limma}.
7 | #'
8 | #' @param pb a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
9 | #' containing pseudobulks as returned by \code{\link{aggregateData}}.
10 | #' @param method a character string.
11 | #' @param design For methods \code{"edegR"} and \code{"limma"}, a design matrix
12 | #' with row & column names(!) created with \code{\link[stats]{model.matrix}};
13 | #' For \code{"DESeq2"}, a formula with variables in \code{colData(pb)}.
14 | #' Defaults to \code{~ group_id} or the corresponding \code{model.matrix}.
15 | #' @param contrast a matrix of contrasts to test for
16 | #' created with \code{\link[limma]{makeContrasts}}.
17 | #' @param coef passed to \code{\link[edgeR]{glmQLFTest}},
18 | #' \code{\link[limma]{contrasts.fit}}, \code{\link[DESeq2]{results}}
19 | #' for \code{method = "edgeR", "limma-x", "DESeq2"}, respectively.
20 | #' Can be a list for multiple, independent comparisons.
21 | #' @param min_cells a numeric. Specifies the minimum number of cells in a given
22 | #' cluster-sample required to consider the sample for differential testing.
23 | #' @param filter character string specifying whether
24 | #' to filter on genes, samples, both or neither.
25 | #' @param treat logical specifying whether empirical Bayes moderated-t
26 | #' p-values should be computed relative to a minimum fold-change threshold.
27 | #' Only applicable for methods \code{"limma-x"}
28 | #' (\code{\link[limma:eBayes]{treat}}) and \code{"edgeR"}
29 | #' (\code{\link[edgeR]{glmTreat}}), and ignored otherwise.
30 | #' @param BPPARAM a \code{\link[BiocParallel]{BiocParallelParam}}
31 | #' object specifying how differential testing should be parallelized.
32 | #' @param verbose logical. Should information on progress be reported?
33 | #'
34 | #' @return a list containing \itemize{
35 | #' \item a data.frame with differential testing results,
36 | #' \item a \code{\link[edgeR]{DGEList}} object of length nb.-clusters, and
37 | #' \item the \code{design} matrix, and \code{contrast} or \code{coef} used.}
38 | #'
39 | #' @examples
40 | #' # simulate 5 clusters, 20% of DE genes
41 | #' data(example_sce)
42 | #'
43 | #' # compute pseudobulk sum-counts & run DS analysis
44 | #' pb <- aggregateData(example_sce)
45 | #' res <- pbDS(pb, method = "limma-trend")
46 | #'
47 | #' names(res)
48 | #' names(res$table)
49 | #' head(res$table$stim$`B cells`)
50 | #'
51 | #' # count nb. of DE genes by cluster
52 | #' vapply(res$table$stim, function(u)
53 | #' sum(u$p_adj.loc < 0.05), numeric(1))
54 | #'
55 | #' # get top 5 hits for ea. cluster w/ abs(logFC) > 1
56 | #' library(dplyr)
57 | #' lapply(res$table$stim, function(u)
58 | #' filter(u, abs(logFC) > 1) %>%
59 | #' arrange(p_adj.loc) %>%
60 | #' slice(seq_len(5)))
61 | #'
62 | #' @author Helena L Crowell & Mark D Robinson
63 | #'
64 | #' @references
65 | #' Crowell, HL, Soneson, C, Germain, P-L, Calini, D,
66 | #' Collin, L, Raposo, C, Malhotra, D & Robinson, MD:
67 | #' On the discovery of population-specific state transitions from
68 | #' multi-sample multi-condition single-cell RNA sequencing data.
69 | #' \emph{bioRxiv} \strong{713412} (2018).
70 | #' doi: \url{https://doi.org/10.1101/713412}
71 | #'
72 | #' @importFrom BiocParallel bplapply SerialParam
73 | #' @importFrom edgeR filterByExpr
74 | #' @importFrom dplyr last rename
75 | #' @importFrom limma makeContrasts
76 | #' @importFrom matrixStats colAnys
77 | #' @importFrom Matrix qr rowSums
78 | #' @importFrom methods is
79 | #' @importFrom scater isOutlier
80 | #' @importFrom stats model.matrix
81 | #' @importFrom SummarizedExperiment assay colData
82 | #' @export
83 |
84 | pbDS <- function(pb,
85 | method=c("edgeR", "DESeq2", "limma-trend", "limma-voom", "DD"),
86 | design=NULL, coef=NULL, contrast=NULL, min_cells=10,
87 | filter=c("both", "genes", "samples", "none"), treat=FALSE,
88 | verbose=TRUE, BPPARAM=SerialParam(progressbar=verbose)) {
89 |
90 | # check validity of input arguments
91 | args <- as.list(environment())
92 | method <- match.arg(method)
93 | filter <- match.arg(filter)
94 | .check_pbs(pb, check_by=TRUE)
95 | .check_args_pbDS(args)
96 | stopifnot(is(BPPARAM, "BiocParallelParam"))
97 |
98 | if (is.null(design)) {
99 | formula <- ~ group_id
100 | cd <- as.data.frame(colData(pb))
101 | design <- model.matrix(formula, cd)
102 | colnames(design) <- levels(pb$group_id)
103 | args$design <- design
104 | }
105 | if (is.null(coef) & is.null(contrast)) {
106 | c <- colnames(design)[ncol(design)]
107 | contrast <- makeContrasts(contrasts=c, levels=design)
108 | args$contrast <- contrast
109 | }
110 |
111 | # ct: type of comparison - "contrast" or "coef"
112 | # cs: named list of 'coef's or 'contrast's
113 | if (!is.null(contrast)) {
114 | coef <- NULL
115 | names(cs) <- cs <- colnames(contrast)
116 | } else if (!is.null(coef)) {
117 | if (!is.list(coef))
118 | coef <- list(coef)
119 | cs <- vapply(coef, function(i)
120 | paste(colnames(design)[i], collapse="-"),
121 | character(1))
122 | names(cs) <- names(coef) <- cs
123 | }
124 | ct <- ifelse(is.null(coef), "contrast", "coef")
125 |
126 | if (!is.function(method)) {
127 | fun <- switch(method,
128 | "DD"=.edgeR_NB,
129 | "edgeR"=.edgeR,
130 | "DESeq2"=.DESeq2,
131 | "limma-voom"=.limma_voom,
132 | "limma-trend"=.limma_trend)
133 | } else {
134 | fun_call <- 1
135 | }
136 | fun_args <- names(as.list(args(fun)))
137 | fun_args <- fun_args[-length(fun_args)]
138 |
139 | # for ea. cluster, run DEA
140 | n_cells <- .n_cells(pb)
141 | names(kids) <- kids <- assayNames(pb)
142 | res <- bplapply(
143 | BPPARAM=BPPARAM,
144 | kids, function (k) {
145 | rmv <- n_cells[k, ] < min_cells
146 | d <- design[colnames(y <- pb[ , !rmv]), , drop=FALSE]
147 | if (filter %in% c("samples", "both")) {
148 | ls <- colSums(assay(y, k))
149 | ol <- isOutlier(ls, log=TRUE, type="lower", nmads=3)
150 | d <- d[colnames(y <- y[, !ol]), , drop=FALSE]
151 | }
152 | if (any(tabulate(y$group_id) < 2)
153 | || qr(d)$rank== nrow(d)
154 | || qr(d)$rank < ncol(d))
155 | return(NULL)
156 | y <- y[rowSums(assay(y, k)) != 0, , drop=FALSE]
157 | if (filter %in% c("genes", "both") & max(assay(y, k)) > 100)
158 | y <- y[filterByExpr(assay(y, k), d), , drop=FALSE]
159 | # drop samples without any detected features
160 | keep <- colAnys(assay(y, k) > 0)
161 | y <- y[, keep, drop=FALSE]
162 | d <- d[keep, , drop=FALSE]
163 | args <- list(
164 | x=y, k=k, design=d, coef=coef,
165 | contrast=contrast, ct=ct, cs=cs,
166 | treat=treat, nc=n_cells[k, !rmv])
167 | args <- args[intersect(names(args), fun_args)]
168 | suppressWarnings(do.call(fun, args))
169 | })
170 |
171 | # remove empty clusters
172 | rmv <- vapply(res, is.null, logical(1))
173 | res <- res[!rmv]
174 |
175 | if (length(res)== 0) stop(
176 | "Specified filtering options result in no genes in any clusters ",
177 | "being tested. To force testing, consider modifying arguments ",
178 | "'min_cells' and/or 'filter'. See '?pbDS' for details.")
179 |
180 | # reorganize & do global p-value adjustment
181 | names(i) <- i <- c("table", "data", "fit")
182 | res <- lapply(i, map, .x=res)
183 | res$table <- .p_adj_global(res$table)
184 | return(c(res, list(args=args)))
185 | }
186 |
187 | #' @rdname pbDS
188 | #' @export
189 | pbDD <- function(pb, design=NULL, coef=NULL, contrast=NULL,
190 | min_cells=10, filter=c("both", "genes", "samples", "none"),
191 | verbose=TRUE, BPPARAM=SerialParam(progressbar=verbose))
192 | {
193 | args <- as.list(environment())
194 | do.call(pbDS, c(args, list(method="DD")))
195 | }
196 |
--------------------------------------------------------------------------------
/R/pbFlatten.R:
--------------------------------------------------------------------------------
1 | #' pbFlatten
2 | #' Flatten pseudobulk SCE
3 | #'
4 | #' Flattens a pseudobulk \code{\link[SingleCellExperiment]{SingleCellExperiment}}
5 | #' as returned by \code{\link{aggregateData}} such that all cell subpopulations
6 | #' are represented as a single assay.
7 | #'
8 | #' @param pb a pseudobulk \code{\link[SingleCellExperiment]{SingleCellExperiment}}
9 | #' as returned by \code{\link{aggregateData}}, with different subpopulations as assays.
10 | #' @param normalize logical specifying whether to compute a \code{logcpm} assay.
11 | #'
12 | #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
13 | #'
14 | #' @examples
15 | #' data(example_sce)
16 | #' library(SingleCellExperiment)
17 | #' pb_stack <- aggregateData(example_sce)
18 | #' (pb_flat <- pbFlatten(pb_stack))
19 | #' ncol(pb_flat) == ncol(pb_stack)*length(assays(pb_stack))
20 | #'
21 | #' @importFrom methods is
22 | #' @importFrom edgeR cpm calcNormFactors DGEList
23 | #' @importFrom S4Vectors DataFrame metadata as.list
24 | #' @importFrom SingleCellExperiment SingleCellExperiment
25 | #' @importFrom SummarizedExperiment assay assay<- assays colData rowData
26 | #' @export
27 |
28 | pbFlatten <- function(pb, normalize = TRUE){
29 | # check validity of input arguments
30 | stopifnot(is.logical(normalize), length(normalize) == 1,
31 | is(pb, "SingleCellExperiment"), length(assays(pb)) > 1)
32 | # concatenate assay data
33 | as <- assays(pb)
34 | a <- do.call(cbind, as.list(as))
35 | sids <- rep(colnames(pb), length(as))
36 | kids <- rep(names(as), each = ncol(pb))
37 | colnames(a) <- paste(sids, kids, sep = ".")
38 | pb$sample_id <- colnames(pb)
39 | # construct cell metadata
40 | cd <- lapply(seq_along(as), function(u) colData(pb))
41 | cd <- do.call(rbind, cd)
42 | rownames(cd) <- colnames(a)
43 | cd$cluster_id <- kids
44 | # construct single-assay SCE
45 | sce <- SingleCellExperiment(
46 | assays = list(counts = a),
47 | colData = cd,
48 | rowData = rowData(pb),
49 | metadata = metadata(pb))
50 | # (optionally) add number of cells per cluster-sample
51 | if (!is.null(n_cells <- .n_cells(pb))) {
52 | n_cells <- tryCatch(mapply(
53 | function(k, s) n_cells[k, s],
54 | k = as.character(kids), s = as.character(sids)),
55 | error = function(e) { warning(e); NULL })
56 | if (!is.null(n_cells))
57 | sce$n_cells <- as.numeric(n_cells)
58 | }
59 | # (optionally) do log-CPM normalization
60 | if (normalize) {
61 | # remove empty columns (samples that lack a cluster)
62 | sce <- sce[, colSums(a != 0) > 0]
63 | dgl <- DGEList(assay(sce))
64 | dgl <- calcNormFactors(dgl)
65 | assay(sce, "logcpm") <- log1p(cpm(dgl))
66 | }
67 | return(sce)
68 | }
69 |
--------------------------------------------------------------------------------
/R/pbHeatmap.R:
--------------------------------------------------------------------------------
1 | #' @rdname pbHeatmap
2 | #' @title Heatmap of cluster-sample pseudobulks
3 | #'
4 | #' @description ...
5 | #'
6 | #' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
7 | #' @param y a list of DS analysis results as returned by
8 | #' \code{\link{pbDS}} or \code{\link{mmDS}}.
9 | #' @param k character vector; specifies which cluster ID(s) to retain.
10 | #' Defaults to \code{levels(x$cluster_id)}.
11 | #' @param g character vector; specifies which genes to retain.
12 | #' Defaults to considering all genes.
13 | #' @param c character string; specifies which contrast/coefficient to retain.
14 | #' Defaults to \code{names(y$table)[1]}.
15 | #' @param top_n single numeric; number of genes to retain per cluster.
16 | #' @param fdr,lfc single numeric; FDR and logFC cutoffs to filter results by.
17 | #' The specified FDR threshold is applied to \code{p_adj.loc} values.
18 | #' @param sort_by character string specifying
19 | #' a numeric results table column to sort by;
20 | #' \code{"none"} to retain original ordering.
21 | #' @param decreasing logical; whether to sort
22 | #' in decreasing order of \code{sort_by}.
23 | #' @param assay character string; specifies which assay to use;
24 | #' should be one of \code{assayNames(x)}.
25 | #' @param fun function to use as summary statistic,
26 | #' e.g., mean, median, sum (depending on the input assay).
27 | #' @param normalize logical; whether to apply a z-normalization
28 | #' to each row (gene) of the cluster-sample pseudobulk data.
29 | #' @param col character vector of colors or color mapping function
30 | #' generated with \code{\link[circlize]{colorRamp2}}. Passed to
31 | #' argument \code{col} in \code{\link[ComplexHeatmap]{Heatmap}}
32 | #' (see \code{?ComplexHeatmap::Heatmap} for details).
33 | #' @param row_anno,col_anno logical; whether to render
34 | #' annotations of cluster and group IDs, respectively.
35 | #'
36 | #' @return a \code{\link{HeatmapList-class}} object.
37 | #'
38 | #' @examples
39 | #' # compute pseudobulks & run DS analysis
40 | #' data(example_sce)
41 | #' pb <- aggregateData(example_sce)
42 | #' res <- pbDS(pb)
43 | #'
44 | #' # cluster-sample expression means
45 | #' pbHeatmap(example_sce, res)
46 | #'
47 | #' # include only a single cluster
48 | #' pbHeatmap(example_sce, res, k = "B cells")
49 | #'
50 | #' # plot specific gene across all clusters
51 | #' pbHeatmap(example_sce, res, g = "ISG20")
52 | #'
53 | #' @author Helena L Crowell
54 | #'
55 | #' @importFrom ComplexHeatmap Heatmap columnAnnotation rowAnnotation
56 | #' @importFrom dplyr bind_rows filter
57 | #' @importFrom grid gpar
58 | #' @importFrom purrr map
59 | #' @importFrom scales hue_pal
60 | #' @importFrom viridis viridis
61 | #' @export
62 |
63 | pbHeatmap <- function(x, y,
64 | k = NULL, g = NULL, c = NULL,
65 | top_n = 20, fdr = 0.05, lfc = 1,
66 | sort_by = "p_adj.loc", decreasing = FALSE,
67 | assay = "logcounts", fun = mean, normalize = TRUE,
68 | col = viridis(10), row_anno = TRUE, col_anno = TRUE) {
69 |
70 | # check validity of input arguments
71 | .check_sce(x, req_group = TRUE)
72 | .check_arg_assay(x, assay)
73 | .check_args_pbHeatmap(as.list(environment()))
74 |
75 | # subset specified contrast/coef & cluster(s)
76 | if (is.null(k)) k <- levels(x$cluster_id)
77 | if (names(y)[1] == "table") {
78 | if (is.null(c)) c <- names(y$table)[1]
79 | y <- y$table[[c]][k]
80 | } else y <- y[k]
81 | y <- y[!vapply(y, is.null, logical(1))]
82 |
83 | # filter results
84 | if (!is.null(g)) y <- lapply(y, filter, gene %in% g)
85 | y <- lapply(y, filter, p_adj.loc < fdr, abs(logFC) > lfc)
86 |
87 | # subset 'top_n' results
88 | if (is.null(top_n)) {
89 | ns <- vapply(y, nrow, numeric(1))
90 | } else {
91 | ns <- vapply(y, function(u) min(nrow(u), top_n), numeric(1))
92 | }
93 |
94 | # get cluster IDs & nb. of clusters
95 | ex <- ns == 0; ns <- ns[!ex]; y <- y[!ex]
96 | nk <- length(names(kids) <- kids <- names(y))
97 |
98 | # re-order results
99 | if (sort_by == "none") {
100 | y <- lapply(kids, function(k)
101 | y[[k]][seq_len(ns[k]), ])
102 | } else {
103 | vs <- map(y, sort_by)
104 | os <- lapply(vs, order, decreasing = decreasing)
105 | y <- lapply(kids, function(k) {
106 | o <- os[[k]][seq_len(ns[k])]
107 | y[[k]][o, ]
108 | })
109 | }
110 | y <- bind_rows(y)
111 |
112 | # subset 'assay' data
113 | es <- assays(x)[[assay]]
114 | es <- es[unlist(y$gene), , drop = FALSE]
115 |
116 | # compute cluster-sample summary values (e.g., means)
117 | cells_by_ks <- .split_cells(x)
118 | xs <- t(mapply(function(g, k)
119 | vapply(cells_by_ks[[k]], function(cs)
120 | fun(es[g, cs]), numeric(1)),
121 | g = y$gene, k = y$cluster_id))
122 | if (normalize) xs <- .z_norm(xs)
123 |
124 | # plotting -----------------------------------------------------------------
125 | # row & column annotation
126 | lgd_aes <- list(labels_gp = gpar(fontsize = 6),
127 | title_gp = gpar(fontface = "bold", fontsize = 8))
128 |
129 | if (row_anno & nk > 1) {
130 | cols <- .cluster_colors
131 | if (nk > length(cols))
132 | cols <- colorRampPalette(cols)(nk)
133 | cols <- setNames(cols[seq_len(nk)], kids)
134 | row_anno <- rowAnnotation(
135 | df = data.frame(cluster_id = y$cluster_id),
136 | col = list(cluster_id = cols),
137 | gp = gpar(col = "white"),
138 | show_annotation_name = FALSE,
139 | annotation_legend_param = lgd_aes)
140 | } else {
141 | row_anno <- NULL
142 | }
143 |
144 | ei <- metadata(x)$experiment_info
145 | m <- match(levels(x$sample_id), ei$sample_id)
146 | if (col_anno) {
147 | cols <- setNames(hue_pal()(nlevels(x$group_id)), levels(x$group_id))
148 | col_anno <- columnAnnotation(
149 | df = data.frame(group_id = ei$group_id[m]),
150 | col = list(group_id = cols),
151 | gp = gpar(col = "white"),
152 | show_annotation_name = FALSE,
153 | annotation_legend_param = lgd_aes)
154 | } else {
155 | col_anno <- NULL
156 | }
157 |
158 | Heatmap(matrix = xs, col = col,
159 | name = sprintf("%s%s %s",
160 | c("", "z-normalized\n")[normalize + 1],
161 | deparse(substitute(fun)), assay),
162 | row_title = NULL, column_title = NULL,
163 | cluster_rows = FALSE, cluster_columns = FALSE,
164 | left_annotation = row_anno, top_annotation = col_anno,
165 | split = y$cluster_id, column_split = ei$group_id[m],
166 | heatmap_legend_param = lgd_aes,
167 | row_names_gp = gpar(fontsize = 6),
168 | column_names_gp = gpar(fontsize = 8),
169 | column_title_gp = gpar(fontface = "bold", fontsize = 10))
170 | }
--------------------------------------------------------------------------------
/R/pbMDS.R:
--------------------------------------------------------------------------------
1 | #' @rdname pbMDS
2 | #' @title Pseudobulk-level MDS plot
3 | #'
4 | #' @description Renders a multidimensional scaling (MDS)
5 | #' where each point represents a cluster-sample instance;
6 | #' with points colored by cluster ID and shaped by group ID.
7 | #'
8 | #' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
9 | #' containing cluster-sample pseudobulks as returned by
10 | #' \code{\link{aggregateData}} with argument
11 | #' \code{by = c("cluster_id", "sample_id")}.
12 | #'
13 | #' @return a \code{ggplot} object.
14 | #'
15 | #' @examples
16 | #' data(example_sce)
17 | #' pb <- aggregateData(example_sce)
18 | #' pbMDS(pb)
19 | #'
20 | #' @author Helena L Crowell & Mark D Robinson
21 | #'
22 | #' @importFrom edgeR calcNormFactors cpm DGEList plotMDS.DGEList
23 | #' @importFrom SummarizedExperiment assays
24 | #' @importFrom grDevices colorRampPalette
25 | #' @importFrom S4Vectors metadata
26 | #' @importFrom Matrix rowSums
27 | #' @importFrom rlang .data
28 | #' @import ggplot2
29 | #' @export
30 |
31 | pbMDS <- function(x) {
32 | # check validity of input pseudobulk-SCE
33 | # (cells should have been aggregated by cluster-sample)
34 | .check_pbs(x, check_by = TRUE)
35 |
36 | y <- as.list(assays(x))
37 | y <- do.call("cbind", y)
38 | y <- y[, (j <- c(t(.n_cells(x))) != 0)]
39 | d <- DGEList(unname(y), remove.zeros = TRUE)
40 | d <- calcNormFactors(d)
41 |
42 | mds <- plotMDS.DGEList(d, plot = FALSE)
43 | nk <- length(kids <- assayNames(x))
44 |
45 | ss <- rep(colnames(x), nk)
46 | ks <- rep(kids, each = ncol(x))
47 |
48 | if (any(!j)) {
49 | txt <- paste(sQuote(ks[!j]), sQuote(ss[!j]), sep = "-")
50 | message("Removing cluster-sample instance(s) ",
51 | paste(txt, collapse = ", "))
52 | }
53 |
54 | df <- data.frame(
55 | MDS1 = mds$x, MDS2 = mds$y,
56 | cluster_id = factor(ks[j], levels = kids),
57 | group_id = rep(x$group_id, nk)[j])
58 |
59 | cols <- .cluster_colors
60 | if (nk > length(cols))
61 | cols <- colorRampPalette(cols)(nk)
62 |
63 | ggplot(df, aes(.data$MDS1, .data$MDS2,
64 | col=.data$cluster_id, shape=.data$group_id)) +
65 | scale_color_manual(values = cols) +
66 | geom_point(size = 3, alpha = 0.8) +
67 | guides(color = guide_legend(override.aes = list(alpha = 1))) +
68 | theme_bw() + theme(aspect.ratio = 1,
69 | axis.text = element_text(color = "black"),
70 | panel.grid.minor = element_blank(),
71 | panel.grid.major = element_line(linewidth = 0.2, color = "lightgrey"))
72 | }
73 |
--------------------------------------------------------------------------------
/R/prepSCE.R:
--------------------------------------------------------------------------------
1 | #' @rdname prepSCE
2 | #' @title Prepare SCE for DS analysis
3 | #'
4 | #' @description ...
5 | #'
6 | #' @param x a \linkS4class{SingleCellExperiment}.
7 | #' @param kid,sid,gid character strings specifying
8 | #' the \code{colData(x)} columns containing cluster assignments,
9 | #' unique sample identifiers, and group IDs (e.g., treatment).
10 | #' @param drop logical. Specifies whether \code{colData(x)} columns
11 | #' besides those specified as \code{cluster_id,sample_id,group_id}
12 | #' should be retained (default \code{drop = FALSE})
13 | #' or removed (\code{drop = TRUE}).
14 | #'
15 | #' @examples
16 | #' # generate random counts
17 | #' ng <- 50
18 | #' nc <- 200
19 | #'
20 | #' # generate some cell metadata
21 | #' gids <- sample(c("groupA", "groupB"), nc, TRUE)
22 | #' sids <- sample(paste0("sample", seq_len(3)), nc, TRUE)
23 | #' kids <- sample(paste0("cluster", seq_len(5)), nc, TRUE)
24 | #' batch <- sample(seq_len(3), nc, TRUE)
25 | #' cd <- data.frame(group = gids, id = sids, cluster = kids, batch)
26 | #'
27 | #' # construct SCE
28 | #' library(scuttle)
29 | #' sce <- mockSCE(ncells = nc, ngenes = ng)
30 | #' colData(sce) <- cbind(colData(sce), cd)
31 | #'
32 | #' # prep. for workflow
33 | #' sce <- prepSCE(sce, kid = "cluster", sid = "id", gid = "group")
34 | #' head(colData(sce))
35 | #' metadata(sce)$experiment_info
36 | #' sce
37 | #'
38 | #' @author Helena L Crowell
39 | #'
40 | #' @return a \linkS4class{SingleCellExperiment}.
41 | #'
42 | #' @importFrom dplyr mutate_all
43 | #' @importFrom S4Vectors DataFrame metadata<-
44 | #' @importFrom SingleCellExperiment reducedDims SingleCellExperiment
45 | #' @importFrom SummarizedExperiment assays colData rowData
46 | #' @export
47 |
48 | prepSCE <- function(x,
49 | kid = "cluster_id",
50 | sid = "sample_id",
51 | gid = "group_id",
52 | drop = FALSE) {
53 |
54 | stopifnot(is(x, "SingleCellExperiment"))
55 |
56 | args <- as.list(environment())
57 | ids <- args[grep("[a-z]id", names(args))]
58 | ids <- unlist(ids)
59 |
60 | stopifnot(is.character(ids))
61 | stopifnot(all(ids %in% colnames(colData(x))))
62 |
63 | cd0 <- colData(x)
64 | cd <- data.frame(cd0[ids], check.names = FALSE)
65 | cd <- mutate_all(cd, as.factor)
66 | colnames(cd) <- unlist(formals()[names(ids)])
67 |
68 | if (!drop)
69 | cd <- data.frame(cd,
70 | cd0[setdiff(colnames(cd0), ids)],
71 | check.names = FALSE)
72 |
73 | # replace colData in SCE
74 | colData(x) <- DataFrame(cd)
75 |
76 | # construct metadata
77 | ei <- .make_ei(x)
78 | metadata(x)$experiment_info <- ei
79 |
80 | return(x)
81 | }
82 |
--------------------------------------------------------------------------------
/R/prepSim.R:
--------------------------------------------------------------------------------
1 | #' @name prepSim
2 | #'
3 | #' @title SCE preparation for \code{\link{simData}}
4 | #'
5 | #' @description \code{prepSim} prepares an input SCE for simulation
6 | #' with \code{muscat}'s \code{\link{simData}} function by
7 | #' \enumerate{
8 | #' \item{basic filtering of genes and cells}
9 | #' \item{(optional) filtering of subpopulation-sample instances}
10 | #' \item{estimation of cell (library sizes) and gene parameters
11 | #' (dispersions and sample-specific means), respectively.}
12 | #' }
13 | #'
14 | #' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
15 | #' @param min_count,min_cells used for filtering of genes; only genes with
16 | #' a count > \code{min_count} in >= \code{min_cells} will be retained.
17 | #' @param min_genes used for filtering cells;
18 | #' only cells with a count > 0 in >= \code{min_genes} will be retained.
19 | #' @param min_size used for filtering subpopulation-sample combinations;
20 | #' only instances with >= \code{min_size} cells will be retained.
21 | #' Specifying \code{min_size = NULL} skips this step.
22 | #' @param group_keep character string; if \code{nlevels(x$group_id) > 1},
23 | #' specifies which group of samples to keep (see details). The default
24 | #' NULL retains samples from \code{levels(x$group_id)[1]}; otherwise,
25 | #' if `colData(x)$group_id` is not specified, all samples will be kept.
26 | #' @param verbose logical; should information on progress be reported?
27 | #'
28 | #' @details For each gene \eqn{g}, \code{prepSim} fits a model to estimate
29 | #' sample-specific means \eqn{\beta_g^s}, for each sample \eqn{s},
30 | #' and dispersion parameters \eqn{\phi_g} using \code{edgeR}'s
31 | #' \code{\link[edgeR]{estimateDisp}} function with default parameters.
32 | #' Thus, the reference count data is modeled as NB distributed:
33 | #' \deqn{Y_{gc} \sim NB(\mu_{gc}, \phi_g)}
34 | #' for gene \eqn{g} and cell \eqn{c}, where the mean
35 | #' \eqn{\mu_{gc} = \exp(\beta_{g}^{s(c)}) \cdot \lambda_c}. Here,
36 | #' \eqn{\beta_{g}^{s(c)}} is the relative abundance of gene \eqn{g}
37 | #' in sample \eqn{s(c)}, \eqn{\lambda_c} is the library size
38 | #' (total number of counts), and \eqn{\phi_g} is the dispersion.
39 | #'
40 | #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
41 | #' containing, for each cell, library size (\code{colData(x)$offset})
42 | #' and, for each gene, dispersion and sample-specific mean estimates
43 | #' (\code{rowData(x)$dispersion} and \code{$beta.sample_id}, respectively).
44 | #'
45 | #' @examples
46 | #' # estimate simulation parameters
47 | #' data(example_sce)
48 | #' ref <- prepSim(example_sce)
49 | #'
50 | #' # tabulate number of genes/cells before vs. after
51 | #' ns <- cbind(
52 | #' before = dim(example_sce),
53 | #' after = dim(ref))
54 | #' rownames(ns) <- c("#genes", "#cells")
55 | #' ns
56 | #'
57 | #' library(SingleCellExperiment)
58 | #' head(rowData(ref)) # gene parameters
59 | #' head(colData(ref)) # cell parameters
60 | #'
61 | #' @author Helena L Crowell
62 | #'
63 | #' @references
64 | #' Crowell, HL, Soneson, C, Germain, P-L, Calini, D,
65 | #' Collin, L, Raposo, C, Malhotra, D & Robinson, MD:
66 | #' On the discovery of population-specific state transitions from
67 | #' multi-sample multi-condition single-cell RNA sequencing data.
68 | #' \emph{bioRxiv} \strong{713412} (2018).
69 | #' doi: \url{https://doi.org/10.1101/713412}
70 | #'
71 | #' @importFrom edgeR DGEList estimateDisp glmFit
72 | #' @importFrom Matrix colSums rowSums
73 | #' @importFrom matrixStats rowAnyNAs
74 | #' @importFrom SingleCellExperiment SingleCellExperiment counts
75 | #' @importFrom SummarizedExperiment colData rowData<-
76 | #' @importFrom stats model.matrix
77 | #' @importFrom S4Vectors DataFrame
78 | #' @export
79 |
80 | prepSim <- function(x,
81 | min_count = 1, min_cells = 10,
82 | min_genes = 100, min_size = 100,
83 | group_keep = NULL, verbose = TRUE) {
84 |
85 | .check_sce(x, req_group = FALSE)
86 | stopifnot(is.numeric(min_count),
87 | is.numeric(min_cells), is.numeric(min_genes),
88 | is.null(min_size) || is.numeric(min_size),
89 | is.logical(verbose), length(verbose) == 1)
90 |
91 | # get model variables
92 | vars <- c("sample_id", "cluster_id")
93 | names(vars) <- vars <- intersect(vars, names(colData(x)))
94 |
95 | # assure these are factors
96 | for (v in vars) {
97 | # drop singular variables from model
98 | n <- length(unique(x[[v]]))
99 | if (n == 1) {
100 | rmv <- grep(v, vars)
101 | vars <- vars[-rmv]
102 | }
103 | if (!is.factor(x[[v]]))
104 | x[[v]] <- as.factor(x[[v]])
105 | x[[v]] <- droplevels(x[[v]])
106 | }
107 |
108 | n_cells0 <- ncol(x)
109 | x <- .update_sce(x)
110 | if (is.null(group_keep)) {
111 | if ("group_id" %in% colnames(colData(x))) {
112 | group_keep <- levels(x$group_id)[1]
113 | if (verbose) {
114 | fmt <- paste(
115 | "Argument `group_keep` unspecified;",
116 | "defaulting to retaining %s-group samples.")
117 | message(sprintf(fmt, dQuote(group_keep)))
118 | }
119 | cells_keep <- x$group_id == group_keep
120 | } else {
121 | cells_keep <- seq_len(ncol(x))
122 | }
123 | } else {
124 | stopifnot(is.character(group_keep),
125 | group_keep %in% levels(x$group_id))
126 | cells_keep <- x$group_id %in% group_keep
127 | }
128 | x <- x[, cells_keep]
129 | x <- .update_sce(x)
130 |
131 | # keep genes w/ count > `min_count` in at least `min_cells`;
132 | # keep cells w/ at least `min_genes` detected genes
133 | if (verbose) message("Filtering...")
134 | genes_keep <- rowSums(counts(x) > min_count) >= min_cells
135 | cells_keep <- colSums(counts(x) > 0) >= min_genes
136 | if (verbose) message(sprintf(
137 | "- %s/%s genes and %s/%s cells retained.",
138 | sum(genes_keep), nrow(x), sum(cells_keep), n_cells0))
139 | x <- x[genes_keep, cells_keep, drop = FALSE]
140 |
141 | # keep cluster-samples w/ at least 'min_size' cells
142 | if (!is.null(min_size)) {
143 | n_cells <- table(x$cluster_id, x$sample_id)
144 | n_cells <- .filter_matrix(n_cells, n = min_size)
145 | if (ncol(n_cells) == 1)
146 | stop("Current 'min_size' retains only 1 sample,\nbut",
147 | " mean-dispersion estimation requires at least 2.")
148 | if (verbose) message(sprintf(
149 | "- %s/%s subpopulations and %s/%s samples retained.",
150 | nrow(n_cells), nlevels(x$cluster_id),
151 | ncol(n_cells), nlevels(x$sample_id)))
152 | x <- .filter_sce(x, rownames(n_cells), colnames(n_cells))
153 | }
154 |
155 | if (is.null(rownames(x))) rownames(x) <- paste0("gene", seq(nrow(x)))
156 | if (is.null(colnames(x))) colnames(x) <- paste0("cell", seq(ncol(x)))
157 |
158 | # construct model formula
159 | f <- "~ 1"
160 | for (v in vars)
161 | f <- paste(f, v, sep = "+")
162 | cd <- as.data.frame(droplevels(colData(x)))
163 | mm <- model.matrix(as.formula(f), data = cd)
164 |
165 | # fit NB model
166 | if (verbose)
167 | message("Estimating gene and cell parameters...")
168 | y <- DGEList(counts(x))
169 | y <- calcNormFactors(y)
170 | y <- estimateDisp(y, mm)
171 | y <- glmFit(y, prior.count = 0)
172 |
173 | # drop genes for which estimation failed
174 | cs <- y$coefficients
175 | i <- !rowAnyNAs(cs)
176 | x <- x[i, , drop = FALSE]
177 | cs <- cs[i, , drop = FALSE]
178 | ds <- y$dispersion[i]
179 | names(ds) <- rownames(x)
180 |
181 | # group betas by variable
182 | bs <- DataFrame(
183 | beta0 = cs[, 1],
184 | row.names = rownames(x))
185 | for (v in vars) {
186 | pat <- paste0("^", v)
187 | i <- grep(pat, colnames(cs))
188 | df <- DataFrame(cs[, i])
189 | nms <- colnames(cs)[i]
190 | names(df) <- gsub(pat, "", nms)
191 | bs[[v]] <- df
192 | }
193 | # store betas & dispersions in rowData
194 | rowData(x)$beta <- bs
195 | rowData(x)$disp <- ds
196 |
197 | # store offsets in colData
198 | os <- c(y$offset)
199 | names(os) <- colnames(x)
200 | x$offset <- os
201 |
202 | # drop singular variables from cell metadata
203 | for (v in names(colData(x))) {
204 | n <- length(unique(x[[v]]))
205 | if (n == 1) x[[v]] <- NULL
206 | }
207 |
208 | # return SCE
209 | return(x)
210 | }
211 |
--------------------------------------------------------------------------------
/R/resDS.R:
--------------------------------------------------------------------------------
1 | #' resDS
2 | #' Formatting of DS analysis results
3 | #'
4 | #' \code{resDS} provides a simple wrapper to format cluster-level
5 | #' differential testing results into an easily filterable table, and
6 | #' to optionally append gene expression frequencies by cluster-sample
7 | #' & -group, as well as cluster-sample-wise CPM.
8 | #'
9 | #' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
10 | #' @param y a list of DS testing results as returned
11 | #' by \code{\link{pbDS}} or \code{\link{mmDS}}.
12 | #' @param bind character string specifying the output format (see details).
13 | #' @param frq logical or a pre-computed list of expression frequencies
14 | #' as returned by \code{\link{calcExprFreqs}}.
15 | #' @param cpm logical specifying whether CPM by cluster-sample
16 | #' should be appendeded to the output result table(s).
17 | #' @param digits integer value specifying the
18 | #' number of significant digits to maintain.
19 | #' @param sep character string to use as separator
20 | #' when constructing new column names.
21 | #' @param ... optional arguments passed to
22 | #' \code{\link{calcExprFreqs}} if \code{frq = TRUE}.
23 | #'
24 | #' @details When \code{bind = "col"}, the list of DS testing results at
25 | #' \code{y$table} will be merge vertically (by column) into a single table
26 | #' in tidy format with column \code{contrast/coef} specifying the comparison.
27 | #'
28 | #' Otherwise, when \code{bind = "row"}, an identifier of the respective
29 | #' contrast or coefficient will be appended to the column names,
30 | #' and all tables will be merge horizontally (by row).
31 | #'
32 | #' Expression frequencies pre-computed with \code{\link{calcExprFreqs}}
33 | #' may be provided with \code{frq}. Alternatively, when \code{frq = TRUE},
34 | #' expression frequencies can be computed directly, and additional arguments
35 | #' may be passed to \code{\link{calcExprFreqs}} (see examples below).
36 | #'
37 | #' @return returns a `data.frame`.
38 | #'
39 | #' @examples
40 | #' # compute pseudobulks (sum of counts)
41 | #' data(example_sce)
42 | #' pb <- aggregateData(example_sce,
43 | #' assay = "counts", fun = "sum")
44 | #'
45 | #' # run DS analysis (edgeR on pseudobulks)
46 | #' res <- pbDS(pb, method = "edgeR")
47 | #'
48 | #' head(resDS(example_sce, res, bind = "row")) # tidy format
49 | #' head(resDS(example_sce, res, bind = "col", digits = Inf))
50 | #'
51 | #' # append CPMs & expression frequencies
52 | #' head(resDS(example_sce, res, cpm = TRUE))
53 | #' head(resDS(example_sce, res, frq = TRUE))
54 | #'
55 | #' # pre-computed expression frequencies & append
56 | #' frq <- calcExprFreqs(example_sce, assay = "counts", th = 0)
57 | #' head(resDS(example_sce, res, frq = frq))
58 | #'
59 | #' @author Helena L Crowell & Mark D Robinson
60 | #'
61 | #' @importFrom dplyr %>% bind_rows inner_join full_join mutate mutate_if select all_of
62 | #' @importFrom edgeR cpm
63 | #' @importFrom methods is
64 | #' @importFrom purrr reduce
65 | #' @importFrom SummarizedExperiment colData
66 | #' @importFrom S4Vectors metadata
67 | #' @export
68 |
69 | resDS <- function(x, y, bind = c("row", "col"),
70 | frq = FALSE, cpm = FALSE, digits = 3, sep = "__", ...) {
71 |
72 | # check validity of input arguments
73 | .check_sce(x, req_group = TRUE)
74 | #.check_res(x, y)
75 | bind <- match.arg(bind)
76 | if (!is.logical(frq))
77 | .check_frq(x, frq)
78 | stopifnot(is.infinite(digits) || is.numeric(digits) &
79 | digits > 0 & as.integer(digits) == digits)
80 |
81 | ei <- metadata(x)$experiment_info
82 | kids <- levels(x$cluster_id)
83 |
84 | res <- switch(bind,
85 | row = {
86 | bind_rows(lapply(y$table, bind_rows))
87 | },
88 | col = {
89 | ct <- ifelse(!is.null(y$args$contrast), "contrast", "coef")
90 | cs <- names(y$table)
91 | res <- lapply(cs, function(c) {
92 | df <- bind_rows(y$table[[c]])
93 | df <- select(df, !all_of(ct))
94 | i <- !colnames(df) %in% c("gene", "cluster_id")
95 | colnames(df)[i] <- paste(colnames(df)[i], c, sep = sep)
96 | return(df)
97 | })
98 | reduce(res, full_join, by = c("gene", "cluster_id"))
99 | })
100 |
101 | .tidy <- function(u, ei, append = "") {
102 | m1 <- match(ei$sample_id, colnames(u), nomatch = 0)
103 | m2 <- match(levels(ei$group_id), colnames(u))
104 | if (all(is.na(m2))) m2 <- 0
105 | colnames(u)[m1] <- paste0(ei$sample_id, append)[m1 != 0]
106 | colnames(u)[m2] <- paste0(colnames(u)[m2], append)
107 | k <- seq_len(ncol(u))[-c(m1, m2)]
108 | u[, c(k, m1[order(ei$group)], m2)]
109 | }
110 |
111 | # append expression frequencies
112 | if (is.logical(frq))
113 | if (frq) frq <- calcExprFreqs(x, ...) else frq <- NULL
114 | if (!is.null(frq)) {
115 | frq <- data.frame(
116 | gene = rep(rownames(x), length(assays(frq))),
117 | cluster_id = rep(assayNames(frq), each = nrow(x)),
118 | do.call("rbind", as.list(assays(frq))),
119 | row.names = NULL, check.names = FALSE, stringsAsFactors = FALSE)
120 | frq <- .tidy(frq, ei, append = ".frq")
121 | res <- inner_join(frq, res,
122 | multiple = "all",
123 | by = c("gene", "cluster_id"))
124 | }
125 |
126 | # append CPMs
127 | if (cpm) {
128 | cpm <- lapply(kids, function(k) {
129 | if (is.null(y$data[[k]]))
130 | return(NULL)
131 | cpm <- cpm(y$data[[k]])
132 | data.frame(cpm,
133 | gene = rownames(cpm),
134 | cluster_id = k,
135 | row.names = NULL,
136 | check.names = FALSE,
137 | stringsAsFactors = FALSE)
138 | })
139 | cpm <- bind_rows(cpm)
140 | cpm <- .tidy(cpm, ei, append = ".cpm")
141 | res <- inner_join(cpm, res,
142 | multiple = "all",
143 | by = c("gene", "cluster_id"))
144 | }
145 | mutate_if(res, is.numeric, signif, digits)
146 | }
147 |
--------------------------------------------------------------------------------
/R/stagewiseDD.R:
--------------------------------------------------------------------------------
1 | #' @importFrom dplyr bind_rows
2 | .res_DX <- function(res_DS, res_DD) {
3 | # for each contrast...
4 | names(cts) <- cts <- names(res_DS$table)
5 | lapply(cts, function(ct) {
6 | # for each cluster...
7 | names(kids) <- kids <- names(res_DS$table[[ct]])
8 | lapply(kids, function(kid) {
9 | # get DS/DD results
10 | DS <- res_DS$table[[ct]][[kid]]
11 | DD <- res_DD$table[[ct]][[kid]]
12 | # add missing genes
13 | DS <- bind_rows(DS, data.frame(gene=setdiff(DD$gene, DS$gene)))
14 | DD <- bind_rows(DD, data.frame(gene=setdiff(DS$gene, DD$gene)))
15 | # reorder & return both
16 | DD <- DD[match(DS$gene, DD$gene), ]
17 | return(list(DS=DS, DD=DD))
18 | })
19 | })
20 | }
21 |
22 | #' @rdname stagewise_DS_DD
23 | #' @title Perform two-stage testing on DS and DD analysis results
24 | #'
25 | #' @param res_DS a list of DS testing results as returned
26 | #' by \code{\link{pbDS}} or \code{\link{mmDS}}.
27 | #' @param res_DD a list of DD testing results as returned
28 | #' by \code{\link{pbDD}} (or \code{\link{pbDS}} with \code{method="DD"}).
29 | #' @param sce (optional) \code{SingleCellExperiment} object containing the data
30 | #' that underlies testing, prior to summarization with \code{\link{aggregateData}}.
31 | #' Used for validation of inputs in order to prevent unexpected failure/results.
32 | #' @param verbose logical. Should information on progress be reported?
33 | #'
34 | #' @return
35 | #' A list of \code{DFrame}s containing results for each contrast and cluster.
36 | #' Each table contains DS and DD results for genes shared between analyses,
37 | #' as well as results from stagewise testing analysis, namely:
38 | #' \itemize{
39 | #' \item{\code{p_adj}: FDR adjusted p-values for the
40 | #' screening hypothesis that a gene is neither DS nor DD
41 | #' (see \code{?stageR::getAdjustedPValues} for details)}
42 | #' \item{\code{p_val.DS/D}: confirmation stage p-values for DS/D}}
43 | #'
44 | #' @examples
45 | #' data(example_sce)
46 | #'
47 | #' pbs_sum <- aggregateData(example_sce, assay="counts", fun="sum")
48 | #' pbs_det <- aggregateData(example_sce, assay="counts", fun="num.detected")
49 | #'
50 | #' res_DS <- pbDS(pbs_sum, min_cells=0, filter="none", verbose=FALSE)
51 | #' res_DD <- pbDD(pbs_det, min_cells=0, filter="none", verbose=FALSE)
52 | #'
53 | #' res <- stagewise_DS_DD(res_DS, res_DD)
54 | #' head(res[[1]][[1]]) # results for 1st cluster
55 | #'
56 | #' @importFrom S4Vectors DataFrame
57 | #' @importFrom purrr map_depth
58 | #' @export
59 |
60 | stagewise_DS_DD <- function(res_DS, res_DD, sce=NULL, verbose=FALSE) {
61 | if (!requireNamespace("stageR", quietly=TRUE))
62 | stop("Install 'stageR' to use this function.")
63 |
64 | # validity checks
65 | # TODO: helper to check validity of 'res_DS/D'
66 | # against each other and, optionally, 'sce'
67 | stopifnot(
68 | # same coefs/constrasts
69 | names(x <- res_DS$table) ==
70 | names(y <- res_DD$table),
71 | # any shared clusters
72 | sum(mapply(\(i, j)
73 | length(intersect(i, j)),
74 | i=lapply(x, names),
75 | j=lapply(y, names))) > 0)
76 | if (!is.null(sce)) {
77 | .check_sce(sce)
78 | . <- map_depth(list(x, y), 3, \(df) df$gene %in% rownames(sce))
79 | stopifnot("gene(s) present in 'res_DS/D' not found in 'sce'"=unlist(.))
80 | . <- map_depth(list(x, y), 3, \(df) df$cluster_id %in% sce$cluster_id)
81 | stopifnot("cluster(s) present in 'res_DS/D' not found in 'sce'"=unlist(.))
82 | }
83 |
84 | # assure that results contain same set of genes, in the same order
85 | # (indepedent of different filtering criteria for the two analyses)
86 | res_DX <- .res_DX(res_DS=res_DS, res_DD=res_DD)
87 |
88 | # perform harmonic mean p-value aggregation according to
89 | # (https://www.pnas.org/doi/full/10.1073/pnas.1814092116)
90 | .mu <- \(x) 1/mean(1/x, na.rm=TRUE)
91 |
92 | # perform stagewise testing
93 | res <- map_depth(res_DX, 2, \(x) {
94 | ps <- data.frame(
95 | p_val.DS=x$DS$p_val,
96 | p_val.DD=x$DD$p_val,
97 | row.names=x$DS$gene)
98 | qs <- apply(ps, 1, .mu); names(qs) <- x$DS$gene
99 | obj <- stageR::stageR(qs, as.matrix(ps), FALSE)
100 | eva <- expression({
101 | obj <- stageR::stageWiseAdjustment(obj,
102 | method="none", alpha=0.05, allowNA=TRUE)
103 | res <- stageR::getAdjustedPValues(obj,
104 | onlySignificantGenes=FALSE, order=FALSE)
105 | })
106 | res <- if (verbose) eval(eva) else suppressMessages({eval(eva)})
107 | # TODO: communicate this better with the user?
108 | if (is.null(res)) res <- NA
109 | colnames(res)[1] <- "p_adj"
110 | return(res)
111 | })
112 | names(cs) <- cs <- names(res)
113 | lapply(cs, \(c) {
114 | names(ks) <- ks <- names(res[[c]])
115 | lapply(ks, \(k) {
116 | gs <- rownames(df <- res[[c]][[k]])
117 | res_DS <- I((. <- res_DS$table[[c]][[k]])[match(gs, .$gene), ])
118 | res_DD <- I((. <- res_DD$table[[c]][[k]])[match(gs, .$gene), ])
119 | DataFrame(gene=gs, df, cluster_id=k, contrast=c, res_DS, res_DD)
120 | })
121 | })
122 | }
123 |
--------------------------------------------------------------------------------
/R/utils-pbDS.R:
--------------------------------------------------------------------------------
1 | #' @importFrom BiocParallel SerialParam
2 | #' @importFrom purrr map
3 | #' @importFrom scuttle summarizeAssayByGroup
4 | #' @importFrom SummarizedExperiment assay colData
5 | .pb <- function(x, by, assay, fun, BPPARAM = SerialParam()) {
6 | # compute pseudobulks
7 | suppressWarnings(
8 | # temporarily suppressing warnings b/c 'median'
9 | # warns about unspecified 'useNames' argument
10 | y <- summarizeAssayByGroup(x,
11 | assay.type = assay,
12 | ids = (ids <- colData(x)[by]),
13 | statistics = fun,
14 | BPPARAM = BPPARAM))
15 | colnames(y) <- y[[by[length(by)]]]
16 |
17 | if (length(by) == 1)
18 | return(assay(y))
19 |
20 | # reformat into one assay per 'by[1]'
21 | if (is.factor(ids <- y[[by[1]]]))
22 | ids <- droplevels(ids)
23 | is <- split(seq_len(ncol(y)), ids)
24 | ys <- map(is, ~assay(y)[, ., drop=FALSE])
25 |
26 | # fill in missing combinations
27 | for (i in seq_along(ys)) {
28 | fill <- setdiff(
29 | unique(y[[by[2]]]),
30 | colnames(ys[[i]]))
31 | if (length(fill != 0)) {
32 | foo <- matrix(0, nrow(x), length(fill))
33 | colnames(foo) <- fill
34 | foo <- cbind(ys[[i]], foo)
35 | o <- paste(sort(unique(y[[by[2]]])))
36 | ys[[i]] <- foo[, o]
37 | }
38 | }
39 | return(ys)
40 | }
41 |
42 | # extract table of cell counts from 'int_colData'
43 | # of pseudobulks as returned by 'aggregateData'
44 | #' @importFrom S4Vectors metadata
45 | #' @importFrom SingleCellExperiment int_colData
46 | .n_cells <- function(x) {
47 | y <- int_colData(x)$n_cells
48 | if (is.null(y)) return(NULL)
49 | if (length(metadata(x)$agg_pars$by) == 2)
50 | y <- as.matrix(data.frame(y, check.names = FALSE))
51 | return(as.table(y))
52 | }
53 |
54 | # wrapper to create output tables
55 | # k: cluster ID
56 | # tt: topTable data.frame
57 | .res_df <- function(tbl, k, ct, c) {
58 | df <- data.frame(
59 | gene = rownames(tbl), cluster_id = k, tbl,
60 | row.names = NULL, stringsAsFactors = FALSE)
61 | df[[ct]] <- c; df
62 | }
63 |
64 | #' @importFrom DESeq2 DESeq results
65 | #' @importFrom dplyr rename
66 | #' @importFrom edgeR calcNormFactors DGEList estimateDisp
67 | #' filterByExpr glmQLFit glmQLFTest glmTreat topTags
68 | #' @importFrom scater isOutlier
69 | #' @importFrom SummarizedExperiment assay
70 | #' @importFrom S4Vectors metadata
71 | .edgeR <- function(x, k, design, coef, contrast, ct, cs, treat) {
72 | y <- assay(x, k)
73 | y <- suppressMessages(DGEList(y,
74 | group = x$group_id[colnames(y)],
75 | remove.zeros = TRUE))
76 | y <- calcNormFactors(y)
77 | y <- estimateDisp(y, design)
78 | fit <- glmQLFit(y, design)
79 | # treat: test for DE relative to logFC threshold
80 | # else: genewise NB GLM with quasi-likelihood test
81 | .fun <- ifelse(treat, glmTreat, glmQLFTest)
82 | tbl <- lapply(cs, function(c) {
83 | fit <- .fun(fit, coef[[c]], contrast[, c])
84 | tbl <- topTags(fit, n = Inf, sort.by = "none")
85 | # combine tables & reformat
86 | tbl <- rename(tbl$table, p_val = "PValue", p_adj.loc = "FDR")
87 | tbl <- .res_df(tbl, k, ct, c)
88 | })
89 | list(table = tbl, data = y, fit = fit)
90 | }
91 |
92 | #' @importFrom matrixStats rowMedians
93 | .edgeR_NB <- \(x, k, design, coef, contrast, ct, cs, nc) {
94 | y <- assay(x, k)
95 | # Gene_level filtering to remove genes detected in
96 | # almost all cells of almost all pseudobulk samples
97 | med_detection <- rowMedians(sweep(y, 2, nc, "/"))
98 | gene_filter <- med_detection < 0.9
99 | # Normalization offset to remove systematic differences between pseudobulk
100 | # samples that are due to technical or nuisance biological variability.
101 | # Idea obtained from cellular detection rate (CDR) normalization from MAST.
102 | # Note that this normalization is used instead of 'edgeR::calcNormFactors()'.
103 | of <- colMeans(sweep(y[gene_filter, ], 2, nc, "/"))
104 | # construct 'DGEList'
105 | y <- suppressMessages(DGEList(
106 | counts = y[gene_filter, ],
107 | group = x$group_id[colnames(y)],
108 | remove.zeros = TRUE))
109 | # add offsets to 'DGEList'
110 | y$offset <- log(nc * of)
111 | # run an 'edgeR' analysis
112 | y <- estimateDisp(y, design)
113 | fit <- glmQLFit(y, design, robust = TRUE)
114 | tbl <- lapply(cs, function(c) {
115 | fit <- glmQLFTest(fit,
116 | coef[[c]],
117 | contrast[, c],
118 | poisson.bound = FALSE)
119 | tbl <- topTags(fit, n = Inf, sort.by = "none")
120 | tbl <- rename(tbl$table, p_val = "PValue", p_adj.loc = "FDR")
121 | tbl <- .res_df(tbl, k, ct, c)
122 | })
123 | list(table = tbl, data = y, fit = fit)
124 | }
125 |
126 | #' @importFrom dplyr rename
127 | #' @importFrom edgeR calcNormFactors DGEList
128 | #' @importFrom limma contrasts.fit eBayes lmFit topTable topTreat voom treat
129 | #' @importFrom SummarizedExperiment assay
130 | #' @importFrom S4Vectors metadata
131 | .limma <- function(x, k, design, coef, contrast, ct, cs, method, treat) {
132 | y <- assay(x, k)
133 | trend <- robust <- TRUE
134 | if (method == "voom") {
135 | trend <- robust <- FALSE
136 | y <- suppressMessages(DGEList(y, remove.zeros = TRUE))
137 | y <- calcNormFactors(y)
138 | y <- voom(y, design)
139 | }
140 | w <- .n_cells(x)[k, colnames(x)]
141 | fit <- lmFit(y, design, weights = w)
142 | # treat: eBayes moderated-t p-val relative to min logFC threshold
143 | # else: eBayes moderated t-stat testing each contrast equal to 0
144 | .fun <- ifelse(treat, limma::treat, eBayes)
145 | .tbl <- ifelse(treat, topTreat, topTable)
146 | tbl <- lapply(cs, function(c) {
147 | fit <- contrasts.fit(fit, contrast[, c], coef[[c]])
148 | fit <- .fun(fit, trend = trend, robust = robust)
149 | tbl <- .tbl(fit, number = Inf, sort.by = "none")
150 | tbl <- rename(tbl, p_val = "P.Value", p_adj.loc = "adj.P.Val")
151 | tbl <- .res_df(tbl, k, ct, c)
152 | })
153 | list(table = tbl, data = y, fit = fit)
154 | }
155 |
156 | .limma_trend <- function(x, k, design, coef, contrast, ct, cs, treat)
157 | .limma(x, k, design, coef, contrast, ct, cs, method = "trend", treat)
158 |
159 | .limma_voom <- function(x, k, design, coef, contrast, ct, cs, treat)
160 | .limma(x, k, design, coef, contrast, ct, cs, method = "voom", treat)
161 |
162 | #' @importFrom dplyr rename
163 | #' @importFrom DESeq2 DESeq DESeqDataSetFromMatrix results
164 | #' @importFrom SummarizedExperiment assay colData
165 | .DESeq2 <- function(x, k, design, contrast, ct, cs) {
166 | cd <- colData(x)
167 | y <- assay(x, k)
168 | mode(y) <- "integer"
169 | y <- DESeqDataSetFromMatrix(y, cd, design)
170 | y <- suppressMessages(DESeq(y))
171 | tbl <- lapply(cs, function(c) {
172 | tbl <- results(y, contrast[, c])
173 | tbl <- .res_df(tbl, k, ct, c)
174 | rename(tbl, logFC = "log2FoldChange",
175 | p_val = "pvalue", p_adj.loc = "padj")
176 | })
177 | list(table = tbl, data = y)
178 | }
179 |
180 |
--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
1 | # ==============================================================================
2 | # filters rows/columns from input matrix `m` until all entries >= `n`,
3 | # such that ea. iteration removes the row/column w/ the smallest summed value.
4 | # ------------------------------------------------------------------------------
5 | .filter_matrix <- function(m, n = 100) {
6 | while (any(m < n)) {
7 | # get candidate rows/cols for removal
8 | i <- m < n
9 | r <- apply(i, 1, any)
10 | c <- apply(i, 2, any)
11 | # get smallest row/col
12 | rs <- rowSums(m)
13 | cs <- colSums(m)
14 | r <- which(r)[which.min(rs[r])]
15 | c <- which(c)[which.min(cs[c])]
16 | # priorities removal of rows over cols
17 | if (rs[r] <= cs[c]) {
18 | m <- m[-r, , drop = FALSE]
19 | } else {
20 | m <- m[, -c, drop = FALSE]
21 | }
22 | if (any(dim(m) == 1))
23 | break
24 | }
25 | return(m)
26 | }
27 |
28 | #' @importFrom dplyr mutate_if
29 | #' @importFrom purrr negate
30 | #' @importFrom S4Vectors DataFrame metadata metadata<-
31 | #' @importFrom SummarizedExperiment colData colData<-
32 | .update_sce <- function(sce) {
33 | # update colData
34 | cd <- as.data.frame(colData(sce))
35 | cd <- mutate_if(cd, is.factor, droplevels)
36 | cd <- mutate_if(cd, negate(is.factor), factor)
37 | colData(sce) <- DataFrame(cd, row.names = colnames(sce))
38 | # update metadata
39 | if(!is.null(ei <- metadata(sce)$experiment_info)){
40 | ei <- ei[ei$sample_id %in% levels(sce$sample_id), ]
41 | ei <- mutate_if(ei, is.factor, droplevels)
42 | metadata(sce)$experiment_info <- ei
43 | }
44 | return(sce)
45 | }
46 |
47 | .filter_sce <- function(sce, kids, sids) {
48 | cs1 <- sce$cluster_id %in% kids
49 | cs2 <- sce$sample_id %in% sids
50 | sce <- sce[, cs1 & cs2]
51 | sce <- .update_sce(sce)
52 | return(sce)
53 | }
54 |
55 | .cluster_colors <- c(
56 | "#DC050C", "#FB8072", "#1965B0", "#7BAFDE", "#882E72",
57 | "#B17BA6", "#FF7F00", "#FDB462", "#E7298A", "#E78AC3",
58 | "#33A02C", "#B2DF8A", "#55A1B1", "#8DD3C7", "#A6761D",
59 | "#E6AB02", "#7570B3", "#BEAED4", "#666666", "#999999",
60 | "#aa8282", "#d4b7b7", "#8600bf", "#ba5ce3", "#808000",
61 | "#aeae5c", "#1e90ff", "#00bfff", "#56ff0d", "#ffff00")
62 |
63 | # ==============================================================================
64 | # scale values b/w 0 and 1 using
65 | # low (1%) and high (99%) quantiles as boundaries
66 | # ------------------------------------------------------------------------------
67 | #' @importFrom matrixStats rowQuantiles
68 | .scale <- function(x) {
69 | qs <- rowQuantiles(as.matrix(x), probs = c(.01, .99), na.rm = TRUE)
70 | x <- (x - qs[, 1]) / (qs[, 2] - qs[, 1])
71 | x[x < 0] <- 0
72 | x[x > 1] <- 1
73 | return(x)
74 | }
75 |
76 | # ==============================================================================
77 | # wrapper for z-normalization
78 | # ------------------------------------------------------------------------------
79 | .z_norm <- function(x, th = 2.5) {
80 | x <- as.matrix(x)
81 | sds <- rowSds(x, na.rm = TRUE)
82 | sds[sds == 0] <- 1
83 | x <- t(t(x - rowMeans(x, na.rm = TRUE)) / sds)
84 | #x <- (x - rowMeans(x, na.rm = TRUE)) / sds
85 | x[x > th] <- th
86 | x[x < -th] <- -th
87 | return(x)
88 | }
89 |
90 | # ------------------------------------------------------------------------------
91 | # generate experimental design metadata table
92 | # for an input SCE or colData data.frame
93 | # ------------------------------------------------------------------------------
94 | #' @importFrom dplyr mutate_at
95 | #' @importFrom methods is
96 | #' @importFrom SummarizedExperiment colData
97 | .make_ei <- function(x) {
98 | if (is(x, "SingleCellExperiment"))
99 | x <- colData(x)
100 | sids <- unique(x$sample_id)
101 | m <- match(sids, x$sample_id)
102 | df <- data.frame(
103 | stringsAsFactors = FALSE,
104 | sample_id = sids,
105 | group_id = x$group_id[m],
106 | n_cells = as.numeric(table(x$sample_id)[sids]))
107 | for (i in c("sample_id", "group_id"))
108 | if (is.factor(x[[i]]))
109 | df <- mutate_at(df, i, factor, levels = levels(x[[i]]))
110 | return(df)
111 | }
112 |
113 | # ------------------------------------------------------------------------------
114 | # split cells by cluster-sample
115 | # ------------------------------------------------------------------------------
116 | # x: a SingleCellExperiment or colData
117 | # by: character vector specifying colData column(s) to split by
118 | # > If length(by) == 1, a list of length nlevels(colData$by), else,
119 | # a nested list with 2nd level of length nlevels(colData$by[2])
120 | # ------------------------------------------------------------------------------
121 | #' @importFrom data.table data.table
122 | #' @importFrom purrr map_depth
123 | .split_cells <- function(x,
124 | by = c("cluster_id", "sample_id")) {
125 | if (is(x, "SingleCellExperiment"))
126 | x <- colData(x)
127 | cd <- data.frame(x[by], check.names = FALSE)
128 | cd <- data.table(cd, cell = rownames(x)) %>%
129 | split(by = by, sorted = TRUE, flatten = FALSE)
130 | map_depth(cd, length(by), "cell")
131 | }
132 |
133 | # ------------------------------------------------------------------------------
134 | # global p-value adjustment
135 | # ------------------------------------------------------------------------------
136 | # x: results table; a nested list w/
137 | # 1st level = comparisons and 2nd level = clusters
138 | # > adds 'p_adj.glb' column containing globally adjusted p-values
139 | # to the result table of ea. cluster for each comparison
140 | # ------------------------------------------------------------------------------
141 | #' @importFrom purrr map map_depth
142 | #' @importFrom stats p.adjust
143 | .p_adj_global <- function(x) {
144 | names(ks) <- ks <- names(x)
145 | names(cs) <- cs <- names(x[[1]])
146 | lapply(cs, function(c) {
147 | # get p-values
148 | tbl <- map_depth(x, 1, c)
149 | p_val <- map(tbl, "p_val")
150 | # adjust for each comparison
151 | p_adj <- p.adjust(unlist(p_val), method = "BH")
152 | # re-split by cluster
153 | ns <- vapply(p_val, length, numeric(1))
154 | p_adj <- split(p_adj, rep.int(ks, ns))
155 | # insert into results tables
156 | lapply(ks, function(k) {
157 | u <- x[[k]][[c]]
158 | i <- which(colnames(u) == "p_adj.loc")
159 | u[["p_adj.glb"]] <- p_adj[[k]]
160 | u[, c(seq_len(i), ncol(u), seq(i+1, ncol(u)-1))]
161 | })
162 | })
163 | }
164 |
165 | # ------------------------------------------------------------------------------
166 | # toy SCE for unit-testing
167 | # ------------------------------------------------------------------------------
168 | #' @importFrom SingleCellExperiment SingleCellExperiment
169 | .toySCE <- function(dim = c(200, 800)) {
170 | gs <- paste0("gene", seq_len(ngs <- dim[1]))
171 | cs <- paste0("cell", seq_len(ncs <- dim[2]))
172 |
173 | y <- rnbinom(ngs * ncs, size = 2, mu = 4)
174 | y <- matrix(y, ngs, ncs, TRUE, list(gs, cs))
175 |
176 | cd <- mapply(function(i, n)
177 | sample(paste0(i, seq_len(n)), ncs, TRUE),
178 | i = c("k", "s", "g"), n = c(5, 4, 3))
179 |
180 | cd <- data.frame(cd, stringsAsFactors = TRUE)
181 | cd$s <- factor(paste(cd$s, cd$g, sep = "."))
182 | colnames(cd) <- paste(c("cluster", "sample", "group"), "id", sep = "_")
183 |
184 | SingleCellExperiment(
185 | assay = list(counts = y), colData = cd,
186 | metadata = list(experiment_info = .make_ei(cd)))
187 | }
188 |
--------------------------------------------------------------------------------
/R/validity-checks.R:
--------------------------------------------------------------------------------
1 | # validity checks for objects & function arguments
2 | # ==============================================================================
3 |
4 | # check input SCE
5 | #' @importFrom methods is
6 | #' @importFrom SummarizedExperiment colData
7 | .check_sce <- function(x, req_group = TRUE) {
8 | stopifnot(is(x, "SingleCellExperiment"))
9 | stopifnot(c("cluster_id", "sample_id") %in% colnames(colData(x)))
10 | if (req_group)
11 | stopifnot("group_id" %in% colnames(colData(x)))
12 | }
13 |
14 | # check of 'assay' argument
15 | #' @importFrom SummarizedExperiment assayNames
16 | .check_arg_assay <- function(x, y) {
17 | stopifnot(is.character(y), length(y) == 1, y %in% assayNames(x))
18 | if (sum(assayNames(x) == y) > 1)
19 | stop("Argument 'assay' was matched to multiple times.\n ",
20 | " Please assure that the input SCE has unique 'assayNames'.")
21 | }
22 |
23 | # check pseudo-bulks for DS analysis
24 | # (must have be aggregated by cluster-sample)
25 | # x = SCE used for aggregation
26 | # y = SCE containing pseudo-bulks as returned by
27 | #` aggregateData(x, by = c("cluster_id", "sample_id"))
28 | #' @importFrom methods is
29 | #' @importFrom S4Vectors metadata
30 | #' @importFrom SummarizedExperiment assayNames
31 | .check_pbs <- function(pbs, sce = NULL, check_by = TRUE) {
32 | stopifnot(is(pbs, "SingleCellExperiment"),
33 | !is.null(ei <- metadata(pbs)$experiment_info),
34 | !is.null(agg_pars <- metadata(pbs)$agg_pars),
35 | !is.null(n_cells <- .n_cells(pbs)),
36 | identical(assayNames(pbs), rownames(n_cells)),
37 | identical(colnames(pbs), colnames(n_cells)))
38 | if (!is.null(sce)) {
39 | stopifnot(identical(ei, metadata(sce)$experiment_info),
40 | identical(assayNames(pbs), levels(sce[[agg_pars$by[1]]])),
41 | identical(rownames(pbs), rownames(sce)))
42 | if (length(agg_pars$by == 2))
43 | stopifnot(identical(colnames(pbs), levels(sce[[agg_pars$by[2]]])))
44 | }
45 | if (check_by)
46 | stopifnot(!is.null(pbs[["group_id"]]),
47 | identical(agg_pars$by, c("cluster_id", "sample_id")))
48 | }
49 |
50 | # check validity of runDS() output
51 | #' @importFrom methods is
52 | #' @importFrom S4Vectors metadata
53 | .check_res <- function(x, y) {
54 | ei <- metadata(x)$experiment_info
55 | nk <- length(kids <- levels(x$cluster_id))
56 | nms <- c("table", "data", "method", "design", "contrast", "coef")
57 | stopifnot(is(y, "list"), all.equal(names(y), nms))
58 | # table
59 | stopifnot(is(y$table, "list"),
60 | vapply(y$table, is, class = "list", logical(1)),
61 | identical(names(y$table), colnames(y$contrast))
62 | | identical(names(y$table), names(y$coef)),
63 | apply(vapply(y$table, names, character(nk)), 2, identical, kids))
64 | # data
65 | stopifnot(is(y$data, "list"), names(y$data) %in% kids,
66 | vapply(y$data, is, class = "DGEList", logical(1)))
67 | # design
68 | stopifnot(is(y$design, "matrix"),
69 | colnames(y$design) %in% ei$group_id,
70 | rownames(y$design) %in% ei$sample_id)
71 | # contrast & coef
72 | stopifnot(is.null(y$contrast) | is(y$contrast, "matrix"))
73 | stopifnot(is.null(y$coef) | is(y$coef, "numeric") | is(y$coef, "list"))
74 | }
75 |
76 | # check validity of calcExprFreqs() output
77 | #' @importFrom methods is
78 | #' @importFrom SummarizedExperiment assays colData
79 | .check_frq <- function(x, y) {
80 | stopifnot(
81 | is(x, "SingleCellExperiment"),
82 | is(y, "SingleCellExperiment"))
83 | kids <- levels(x$cluster_id)
84 |
85 | ids <- levels(x$sample_id)
86 | if ("group_id" %in% colnames(colData(x)))
87 | ids <- c(ids, levels(x$group_id))
88 | stopifnot(identical(ids, colnames(y)))
89 |
90 | vals <- unlist(assays(y))
91 | stopifnot(all(vals <= 1), all(vals >= 0))
92 | }
93 |
94 | .check_args_simData <- function(u) {
95 | if (!is.null(u$ns))
96 | stopifnot(
97 | is.numeric(u$ns), length(u$ns) == 1,
98 | u$ns > 0, as.integer(u$ns) == u$ns)
99 | if (!is.null(u$nk)) {
100 | stopifnot(
101 | is.numeric(u$nk), length(u$nk) == 1,
102 | u$nk > 0, as.integer(u$nk) == u$nk)
103 | } else u$nk <- nlevels(u$x$cluster_id)
104 |
105 | if (!u$force && u$ng != nrow(u$x))
106 | stop("Number of simulated genes should match with reference,\n",
107 | " but 'ng != nrow(x)'; please specify 'force = TRUE' if\n",
108 | " simulation should be forced regardlessly (see '?simData').")
109 | if (!is.null(u$phylo_tree) && u$p_type != 0)
110 | stop("Only one of arguments 'p_type' or 'phylo_tree'\n",
111 | " can be specified; see '?simData' for 'Details'.")
112 | # assure number of simulated clusters matches with specified phylogeny
113 | if (!is.null(u$phylo_tree)) {
114 | kids_phylo <- .get_clusters_from_phylo(u$phylo_tree)
115 | nk_phylo <- length(kids_phylo)
116 | ns_phylo <- as.numeric(gsub("[a-z]", "", kids_phylo))
117 | if (!all(sort(ns_phylo) == seq_len(nk_phylo)))
118 | stop("Some clusters appear to be missing from 'phylo_tree';\n",
119 | " please make sure all clusters up to ",
120 | dQuote(kids_phylo[which.max(ns_phylo)]), " are present.")
121 | # possibly update number of clusters 'nk'
122 | if (nk_phylo != u$nk) u$nk <- nk_phylo
123 | }
124 | stopifnot(
125 | is.numeric(u$ng), length(u$ng) == 1, u$ng > 0, as.integer(u$ng) == u$ng,
126 | is.numeric(u$nc), length(u$nc) == 1, u$nc > 0, as.integer(u$nc) == u$nc,
127 | is.numeric(u$p_dd), length(u$p_dd) == 6, u$p_dd >= 0, u$p_dd <= 1,
128 | abs(1-sum(u$p_dd)) < 1e-12, is.logical(u$paired), length(u$paired) == 1,
129 | is.numeric(u$p_ep), length(u$p_ep) == 1, u$p_ep > 0, u$p_ep < 1,
130 | is.numeric(u$p_dp), length(u$p_dp) == 1, u$p_dp > 0, u$p_dp < 1,
131 | is.numeric(u$p_dm), length(u$p_dm) == 1, u$p_dm > 0, u$p_dm < 1,
132 | is.numeric(u$p_type), length(u$p_type) == 1, u$p_type >= 0, u$p_type <= 1,
133 | is.numeric(u$lfc), is.numeric(u$lfc), length(u$lfc) == 1, u$lfc >= 1,
134 | is.logical(u$force), length(u$force) == 1,
135 | is.numeric(u$phylo_pars), length(u$phylo_pars) == 2, u$phylo_pars >= 0)
136 | if (!is.null(u$rel_lfc))
137 | stopifnot(is.numeric(u$rel_lfc),
138 | length(u$rel_lfc) == u$nk, u$rel_lfc >= 0)
139 | return(list(nk = u$nk, ns = u$ns))
140 | }
141 |
142 | #' @importFrom SummarizedExperiment colData
143 | .check_args_aggData <- function(u) {
144 | stopifnot(is.character(u$by), length(u$by) <= 2,
145 | u$by %in% colnames(colData(u$x)))
146 | stopifnot(is.logical(u$scale), length(u$scale) == 1)
147 | if (u$scale & (!u$assay %in% c("cpm", "CPM") | u$fun != "sum"))
148 | stop("Option 'scale = TRUE' only valid for",
149 | " 'assay = \"cpm/CPM\"' and 'fun = \"sum\"'.")
150 | }
151 |
152 | .check_args_pbDS <- function(u) {
153 | if (!is.null(u$design))
154 | stopifnot(is.matrix(u$design),
155 | !is.null(rownames(u$design)),
156 | !is.null(colnames(u$design)))
157 | stopifnot(
158 | is.null(u$contrast) | is.matrix(u$contrast),
159 | is.null(u$coef) | is.numeric(unlist(u$coef)),
160 | is.numeric(u$min_cells), length(u$min_cells) == 1,
161 | is.logical(u$verbose), length(u$verbose) == 1,
162 | is.logical(u$treat), length(u$treat) == 1)
163 | }
164 |
165 | .check_args_mmDS <- function(u) {
166 | stopifnot(
167 | is.null(u$covs) || is.character(u$covs) & all(u$covs %in% names(colData(u$x))),
168 | is.numeric(u$coef) & u$coef %in% seq_len(nlevels(u$x$group_id))
169 | | is.character(u$coef) & u$coef %in% c("(Intercept)",
170 | paste0("group_id", levels(u$x$group_id)[-1])),
171 | !is.null(metadata(u$x)$experiment_info$group_id) | !is.null(u$x$group_id),
172 | is.numeric(u$n_cells), length(u$n_cells) == 1, u$n_cells >= 0,
173 | is.numeric(u$n_samples), length(u$n_samples) == 1, u$n_samples >= 2,
174 | is.numeric(u$min_count), length(u$min_count) == 1, u$min_count >= 0,
175 | is.numeric(u$min_cells), length(u$min_cells) == 1, u$min_cells >= 0,
176 | is.logical(u$verbose), length(u$verbose) == 1,
177 | is.logical(u$dup_corr), length(u$dup_corr) == 1,
178 | is.logical(u$trended), length(u$trended) == 1,
179 | is.logical(u$bayesian), length(u$bayesian) == 1,
180 | is.logical(u$blind), length(u$blind) == 1,
181 | is.logical(u$REML), length(u$REML) == 1)
182 | }
183 |
184 | .check_args_pbHeatmap <- function(u) {
185 | if (!is.null(u$k))
186 | stopifnot(is.character(u$k), u$k %in% levels(u$x$cluster_id))
187 | if (!is.null(u$g))
188 | stopifnot(is.character(u$g), u$g %in% rownames(u$x))
189 | if (!is.null(u$c))
190 | stopifnot(is.character(u$c), u$c %in% names(u$y$table))
191 | stopifnot(
192 | is.numeric(u$top_n), length(u$top_n) == 1, u$top_n > 1,
193 | is.numeric(u$fdr), length(u$fdr) == 1, u$fdr > 0,
194 | is.numeric(u$lfc), length(u$lfc) == 1,
195 | is.character(u$sort_by), length(u$sort_by) == 1,
196 | u$sort_by == "none" |
197 | !is.null(dim(u$y[[1]])) &
198 | u$sort_by %in% names(u$y[[1]]) &
199 | is.numeric(u$y[[1]][[u$sort_by]]) |
200 | u$sort_by %in% names(u$y$table[[1]][[1]]) &
201 | is.numeric(u$y$table[[1]][[1]][[u$sort_by]]),
202 | is.function(u$fun),
203 | is.logical(u$decreasing), length(u$decreasing) == 1,
204 | is.logical(u$normalize), length(u$normalize) == 1,
205 | is.logical(u$row_anno), length(u$row_anno) == 1,
206 | is.logical(u$col_anno), length(u$col_anno) == 1)
207 | }
208 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | **`muscat` (**Mu**lti-sample **mu**lti-group **sc**RNA-seq **a**nalysis **t**ools )**
4 |
5 | ...provides methods for *Differential State* (DS) analyses in scRNA-seq data
6 | with multiple samples, groups, and (cell)-subpopulations, as elaborated in:
7 |
8 | > Crowell HL, Soneson C\*, Germain P-L\*,
9 | Calini D, Collin L, Raposo C, Malhotra D & Robinson MD:
10 | "*muscat* detects subpopulation-specific state transitions from
11 | multi-sample multi-condition single-cell transcriptomics data"
12 | *Nature Communications* **11**, 6077 (2020)
13 | [DOI: 10.1038/s41467-020-19894-4](https://doi.org/10.1038/s41467-020-19894-4)
14 |
15 | *These authors contributed equally.
16 |
17 | ### installation
18 |
19 | `muscat` is available through Bioconductor, and
20 | can be installed using the following commands:
21 |
22 | ```r
23 | if (!requireNamespace("BiocManager", quietly = TRUE))
24 | install.packages("BiocManager")
25 | BiocManager::install("muscat")
26 | ```
27 |
28 | ### quick guide
29 |
30 | Let `sce` be a [`SingleCellExperiment`](https://www.bioconductor.org/packages/SingleCellExperiment.html) object with cell metadata (`colData`) columns
31 |
32 | 1. `"sample_id"` specifying unique sample identifiers (e.g., PeterPan1, Nautilus7, ...)
33 | 2. `"group_id"` specifying each sample's experimental condition (e.g., reference/stimulated, healthy/diseased, ...)
34 | 3. `"cluster_id"` specifying subpopulation (cluster) assignments (e.g., B cells, dendritic cells, ...)
35 |
36 | Aggregation-based methods come down to the following simple commands:
37 |
38 | ```r
39 | # compute pseudobulks (sum of counts)
40 | pb <- aggregateData(sce,
41 | assay = "counts", fun = "sum",
42 | by = c("cluster_id", "sample_id"))
43 |
44 | # run pseudobulk (aggregation-based) DS analysis
45 | ds_pb <- pbDS(pb, method = "edgeR")
46 | ```
47 |
48 | Mixed models can be run directly on cell-level measurements, e.g.:
49 |
50 | ```r
51 | ds_mm <- mmDS(sce, method = "dream")
52 | ```
53 |
54 | For details, please see the package vignettes.
55 |
56 | ### differential detection
57 |
58 | `muscat` also supports testing for differential detection as proposed in
59 |
60 | > Gilis J, Perin L, Malfait M, Van den Berge K,
61 | Assefa AT, Verbist B, Risso D, and Clement L:
62 | Differential detection workflows for
63 | multi-sample single-cell RNA-seq data.
64 | *bioRxiv* (2023). [DOI: 10.1101/2023.12.17.572043](https://doi.org/10.1101/2023.12.17.572043)
65 |
66 | Key alterations to the commands above are highlighted below (!!!),
67 | however, we recommend users consult the corresponding publication
68 | and package vignette for more details.
69 |
70 | ```r
71 | # sum binarized counts
72 | pb <- aggregateData(sce,
73 | assay = "counts",
74 | fun = "num.detected", # !!!
75 | by = c("cluster_id", "sample_id"))
76 | # test for differential detection
77 | dd <- pbDD(pb) # or..
78 | dd <- pbDS(pb, method = "DD")
79 | ```
--------------------------------------------------------------------------------
/data/example_sce.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelenaLC/muscat/49a3b323133921fdcd9772441df0aa2ee09ab1e1/data/example_sce.rda
--------------------------------------------------------------------------------
/inst/extdata/1a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelenaLC/muscat/49a3b323133921fdcd9772441df0aa2ee09ab1e1/inst/extdata/1a.png
--------------------------------------------------------------------------------
/inst/extdata/1b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelenaLC/muscat/49a3b323133921fdcd9772441df0aa2ee09ab1e1/inst/extdata/1b.png
--------------------------------------------------------------------------------
/inst/extdata/1d.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelenaLC/muscat/49a3b323133921fdcd9772441df0aa2ee09ab1e1/inst/extdata/1d.png
--------------------------------------------------------------------------------
/inst/extdata/muscat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelenaLC/muscat/49a3b323133921fdcd9772441df0aa2ee09ab1e1/inst/extdata/muscat.png
--------------------------------------------------------------------------------
/inst/extdata/refs.bib:
--------------------------------------------------------------------------------
1 | % Generated by Paperpile. Check out https://paperpile.com for more information.
2 | % BibTeX export options can be customized via Settings -> BibTeX.
3 |
4 | @ARTICLE{Crowell2020-muscat,
5 | title = "muscat detects subpopulation-specific state transitions from
6 | multi-sample multi-condition single-cell transcriptomics data",
7 | author = "Crowell, Helena L and Soneson, Charlotte and Germain, Pierre-Luc
8 | and Calini, Daniela and Collin, Ludovic and Raposo, Catarina and
9 | Malhotra, Dheeraj and Robinson, Mark D",
10 | journal = "Nature Communications",
11 | volume = 11,
12 | number = 1,
13 | pages = "6077",
14 | year = 2020
15 | }
16 |
17 | @ARTICLE{Zhang2019,
18 | title = "Probabilistic cell type assignment of single-cell transcriptomic
19 | data reveals spatiotemporal microenvironment dynamics in human
20 | cancers",
21 | author = "Zhang, Allen W and O\textbackslashtextquoterightFlanagan, Ciara
22 | and Chavez, Elizabeth and Lim, Jamie L P and McPherson, Andrew and
23 | Wiens, Matt and Walters, Pascale and Chan, Tim and Hewitson,
24 | Brittany and Lai, Daniel and Mottok, Anja and Sarkozy, Clementine
25 | and Chong, Lauren and Aoki, Tomohiro and Wang, Xuehai and Weng,
26 | Andrew P and McAlpine, Jessica N and Aparicio, Samuel and Steidl,
27 | Christian and Campbell, Kieran R and Shah, Sohrab P",
28 | journal = "bioRxiv",
29 | volume = 521914,
30 | year = 2019
31 | }
32 |
33 | @ARTICLE{Soneson2018-countsimQC,
34 | title = "Towards unified quality verification of synthetic count data with
35 | {countsimQC}",
36 | author = "Soneson, Charlotte and Robinson, Mark D",
37 | journal = "Bioinformatics",
38 | volume = 34,
39 | number = 4,
40 | pages = "691--692",
41 | year = 2018
42 | }
43 |
44 | @ARTICLE{Butler2018-Seurat,
45 | title = "Integrating single-cell transcriptomic data across different
46 | conditions, technologies, and species",
47 | author = "Butler, Andrew and Hoffman, Paul and Smibert, Peter and Papalexi,
48 | Efthymia and Satija, Rahul",
49 | journal = "Nature Biotechnology",
50 | volume = 36,
51 | number = 5,
52 | pages = "411--420",
53 | year = 2018
54 | }
55 |
56 | @ARTICLE{Stuart2019,
57 | title = "Comprehensive {I}ntegration of {{S}ingle-{C}ell} {D}ata",
58 | author = "Stuart, Tim and Butler, Andrew and Hoffman, Paul and Hafemeister,
59 | Christoph and Papalexi, Efthymia and Mauck, 3rd, William M and
60 | Hao, Yuhan and Stoeckius, Marlon and Smibert, Peter and Satija,
61 | Rahul",
62 | journal = "Cell",
63 | volume = 177,
64 | number = 7,
65 | pages = "1888--1902.e21",
66 | year = 2019
67 | }
68 |
69 | @ARTICLE{Stegle2015,
70 | title = "Computational and analytical challenges in single-cell
71 | transcriptomics",
72 | author = "Stegle, Oliver and Teichmann, Sarah A and Marioni, John C",
73 | journal = "Nature Reviews Genetics",
74 | volume = 16,
75 | number = 3,
76 | pages = "133--145",
77 | year = 2015
78 | }
79 |
80 | @ARTICLE{Trapnell2015,
81 | title = "Defining cell types and states with single-cell genomics",
82 | author = "Trapnell, Cole",
83 | journal = "Genome Research",
84 | volume = 25,
85 | number = 10,
86 | pages = "1491--1498",
87 | year = 2015
88 | }
89 |
90 | @ARTICLE{Wagner2016,
91 | title = "Revealing the vectors of cellular identity with single-cell
92 | genomics",
93 | author = "Wagner, Allon and Regev, Aviv and Yosef, Nir",
94 | journal = "Nature Biotechnology",
95 | volume = 34,
96 | number = 11,
97 | pages = "1145--1160",
98 | year = 2016
99 | }
100 |
101 | @ARTICLE{McCarthy2017-scater,
102 | title = "Scater: pre-processing, quality control, normalization and
103 | visualization of single-cell {RNA-seq} data in {R}",
104 | author = "McCarthy, Davis J and Campbell, Kieran R and Lun, Aaron T L and
105 | Wills, Quin F",
106 | journal = "Bioinformatics",
107 | volume = 33,
108 | number = 8,
109 | pages = "1179--1186",
110 | year = 2017
111 | }
112 |
113 | @ARTICLE{Love2014-DESeq2,
114 | title = "Moderated estimation of fold change and dispersion for {RNA-seq}
115 | data with {DESeq2}",
116 | author = "Love, Michael I and Huber, Wolfgang and Anders, Simon",
117 | journal = "Genome Biology",
118 | volume = 15,
119 | number = 12,
120 | pages = "550",
121 | year = 2014
122 | }
123 |
124 | @ARTICLE{Korthauer2016-scDD,
125 | title = "A statistical approach for identifying differential distributions
126 | in single-cell {RNA-seq} experiments",
127 | author = "Korthauer, Keegan D and Chu, Li-Fang and Newton, Michael A and Li,
128 | Yuan and Thomson, James and Stewart, Ron and Kendziorski,
129 | Christina",
130 | journal = "Genome Biology",
131 | volume = 17,
132 | number = 1,
133 | pages = "222",
134 | year = 2016
135 | }
136 |
137 | @ARTICLE{Kang2018-demuxlet,
138 | title = "Multiplexed droplet single-cell {RNA-sequencing} using natural
139 | genetic variation",
140 | author = "Kang, Hyun Min and Subramaniam, Meena and Targ, Sasha and Nguyen,
141 | Michelle and Maliskova, Lenka and McCarthy, Elizabeth and Wan,
142 | Eunice and Wong, Simon and Byrnes, Lauren and Lanata, Cristina M
143 | and Gate, Rachel E and Mostafavi, Sara and Marson, Alexander and
144 | Zaitlen, Noah and Criswell, Lindsey A and Ye, Chun Jimmie",
145 | journal = "Nature Biotechnology",
146 | volume = 36,
147 | number = 1,
148 | pages = "89--94",
149 | year = 2018
150 | }
151 |
152 | @ARTICLE{Soneson2016-iCOBRA,
153 | title = "{iCOBRA}: open, reproducible, standardized and live method
154 | benchmarking",
155 | author = "Soneson, Charlotte and Robinson, Mark D",
156 | journal = "Nature Methods",
157 | volume = 13,
158 | number = 4,
159 | pages = "283",
160 | year = 2016
161 | }
162 |
163 | @ARTICLE{Hafemeister2019-sctransform,
164 | title = "Normalization and variance stabilization of single-cell {RNA-seq}
165 | data using regularized negative binomial regression",
166 | author = "Hafemeister, Christoph and Satija, Rahul",
167 | journal = "bioRxiv",
168 | volume = 576827,
169 | year = 2019
170 | }
171 |
172 | @ARTICLE{Freytag2018-clustering,
173 | title = "Comparison of clustering tools in {R} for medium-sized 10x
174 | Genomics single-cell {RNA-sequencing} data",
175 | author = "Freytag, Saskia and Tian, Luyi and L{\"o}nnstedt, Ingrid and Ng,
176 | Milica and Bahlo, Melanie",
177 | journal = "F1000Research",
178 | volume = 7,
179 | pages = "1297",
180 | year = 2018
181 | }
182 |
183 | @ARTICLE{Duo2018,
184 | title = "A systematic performance evaluation of clustering methods for
185 | single-cell {RNA-seq} data",
186 | author = "Du{\`o}, Angelo and Robinson, Mark D and Soneson, Charlotte",
187 | journal = "F1000Research",
188 | volume = 7,
189 | pages = "1141",
190 | year = 2018
191 | }
192 |
193 | @ARTICLE{Diaz-Mejia2019,
194 | title = "Evaluation of methods to assign cell type labels to cell clusters
195 | from single-cell {RNA-sequencing} data",
196 | author = "Diaz-Mejia, J Javier and Javier Diaz-Mejia, J and Meng, Elaine C
197 | and Pico, Alexander R and MacParland, Sonya A and Ketela, Troy and
198 | Pugh, Trevor J and Bader, Gary D and Morris, John H",
199 | journal = "F1000Research",
200 | volume = 8,
201 | pages = "296",
202 | year = 2019
203 | }
204 |
205 | @ARTICLE{Soneson2018,
206 | title = "Bias, robustness and scalability in single-cell differential
207 | expression analysis",
208 | author = "Soneson, Charlotte and Robinson, Mark D",
209 | journal = "Nature Methods",
210 | volume = 15,
211 | number = 4,
212 | pages = "255--261",
213 | year = 2018
214 | }
215 |
216 | @ARTICLE{Ritchie2015-limma,
217 | title = "limma powers differential expression analyses for {RNA-sequencing}
218 | and microarray studies",
219 | author = "Ritchie, Matthew E and Phipson, Belinda and Wu, Di and Hu, Yifang
220 | and Law, Charity W and Shi, Wei and Smyth, Gordon K",
221 | journal = "Nucleic Acids Research",
222 | volume = 43,
223 | number = 7,
224 | pages = "e47",
225 | year = 2015
226 | }
227 |
228 | @ARTICLE{Robinson2010-edgeR,
229 | title = "edge{R}: a {B}ioconductor package for differential expression
230 | analysis of digital gene expression data",
231 | author = "Robinson, Mark D and McCarthy, Davis J and Smyth, Gordon K",
232 | journal = "Bioinformatics",
233 | volume = 26,
234 | number = 1,
235 | pages = "139--140",
236 | year = 2010
237 | }
238 |
239 | @ARTICLE{Gilis2023,
240 | title = "Differential detection workflows for multi-sample single-cell
241 | {RNA-seq} data",
242 | author = "Gilis, Jeroen and Perin, Laura and Malfait, Milan and Van den
243 | Berge, Koen and Takele Assefa, Alemu and Verbist, Bie and Risso,
244 | Davide and Clement, Lieven",
245 | journal = "bioRxiv",
246 | year = 2023
247 | }
248 |
249 | @ARTICLE{Bouland2021,
250 | title = "Differential analysis of binarized single-cell {RNA} sequencing
251 | data captures biological variation",
252 | author = "Bouland, Gerard A and Mahfouz, Ahmed and Reinders, Marcel J T",
253 | journal = "NAR Genomics and Bioinformatics",
254 | volume = 3,
255 | number = 4,
256 | pages = "lqab118",
257 | year = 2021
258 | }
259 |
260 | @ARTICLE{Qiu2020,
261 | title = "Embracing the dropouts in single-cell {RNA-seq} analysis",
262 | author = "Qiu, Peng",
263 | journal = "Nature Communications",
264 | volume = 11,
265 | number = 1,
266 | pages = "1169",
267 | year = 2020,
268 | language = "en"
269 | }
270 |
271 | @ARTICLE{Vandenberge2017,
272 | title = "stageR: a general stage-wise method for controlling the gene-level
273 | false discovery rate in differential expression and differential
274 | transcript usage",
275 | author = "Van den Berge, Koen and Soneson, Charlotte and Robinson, Mark D
276 | and Clement, Lieven",
277 | journal = "Genome Biology",
278 | volume = 18,
279 | number = 151,
280 | year = 2017,
281 | language = "en"
282 | }
--------------------------------------------------------------------------------
/man/aggregateData.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/aggregateData.R
3 | \name{aggregateData}
4 | \alias{aggregateData}
5 | \title{Aggregation of single-cell to pseudobulk data}
6 | \usage{
7 | aggregateData(
8 | x,
9 | assay = NULL,
10 | by = c("cluster_id", "sample_id"),
11 | fun = c("sum", "mean", "median", "prop.detected", "num.detected"),
12 | scale = FALSE,
13 | verbose = TRUE,
14 | BPPARAM = SerialParam(progressbar = verbose)
15 | )
16 | }
17 | \arguments{
18 | \item{x}{a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.}
19 |
20 | \item{assay}{character string specifying the assay slot to use as
21 | input data. Defaults to the 1st available (\code{assayNames(x)[1]}).}
22 |
23 | \item{by}{character vector specifying which
24 | \code{colData(x)} columns to summarize by (at most 2!).}
25 |
26 | \item{fun}{a character string.
27 | Specifies the function to use as summary statistic.
28 | Passed to \code{\link[scuttle]{summarizeAssayByGroup}}.}
29 |
30 | \item{scale}{logical. Should pseudo-bulks be scaled
31 | with the effective library size & multiplied by 1M?}
32 |
33 | \item{verbose}{logical. Should information on progress be reported?}
34 |
35 | \item{BPPARAM}{a \code{\link[BiocParallel]{BiocParallelParam}}
36 | object specifying how aggregation should be parallelized.}
37 | }
38 | \value{
39 | a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
40 | \itemize{
41 | \item{If \code{length(by) == 2}, each sheet (\code{assay}) contains
42 | pseudobulks for each of \code{by[1]}, e.g., for each cluster when
43 | \code{by = "cluster_id"}. Rows correspond to genes, columns to
44 | \code{by[2]}, e.g., samples when \code{by = "sample_id"}}.
45 | \item{If \code{length(by) == 1}, the returned SCE will contain only
46 | a single \code{assay} with rows = genes and colums = \code{by}.}}
47 |
48 | Aggregation parameters (\code{assay, by, fun, scaled}) are stored in
49 | \code{metadata()$agg_pars}, and the number of cells that were aggregated
50 | are accessible in \code{int_colData()$n_cells}.
51 | }
52 | \description{
53 | ...
54 | }
55 | \examples{
56 | # pseudobulk counts by cluster-sample
57 | data(example_sce)
58 | pb <- aggregateData(example_sce)
59 |
60 | library(SingleCellExperiment)
61 | assayNames(example_sce) # one sheet per cluster
62 | head(assay(example_sce)) # n_genes x n_samples
63 |
64 | # scaled CPM
65 | cpm <- edgeR::cpm(assay(example_sce))
66 | assays(example_sce)$cpm <- cpm
67 | pb <- aggregateData(example_sce, assay = "cpm", scale = TRUE)
68 | head(assay(pb))
69 |
70 | # aggregate by cluster only
71 | pb <- aggregateData(example_sce, by = "cluster_id")
72 | length(assays(pb)) # single assay
73 | head(assay(pb)) # n_genes x n_clusters
74 |
75 | }
76 | \references{
77 | Crowell, HL, Soneson, C, Germain, P-L, Calini, D,
78 | Collin, L, Raposo, C, Malhotra, D & Robinson, MD:
79 | On the discovery of population-specific state transitions from
80 | multi-sample multi-condition single-cell RNA sequencing data.
81 | \emph{bioRxiv} \strong{713412} (2018).
82 | doi: \url{https://doi.org/10.1101/713412}
83 | }
84 | \author{
85 | Helena L Crowell & Mark D Robinson
86 | }
87 |
--------------------------------------------------------------------------------
/man/calcExprFreqs.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/calcExprFreqs.R
3 | \name{calcExprFreqs}
4 | \alias{calcExprFreqs}
5 | \title{calcExprFreqs}
6 | \usage{
7 | calcExprFreqs(x, assay = "counts", th = 0)
8 | }
9 | \arguments{
10 | \item{x}{a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.}
11 |
12 | \item{assay}{a character string specifying which assay to use.}
13 |
14 | \item{th}{numeric threshold value above which
15 | a gene should be considered to be expressed.}
16 | }
17 | \value{
18 | a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
19 | containing, for each cluster, an assay of dimensions #genes x #samples
20 | giving the fraction of cells that express each gene in each sample.
21 | If \code{colData(x)} contains a \code{"group_id"} column, the fraction
22 | of expressing cells in each each group will be included as well.
23 | }
24 | \description{
25 | Calculates gene expression frequencies
26 | }
27 | \details{
28 | \code{calcExprFreq} computes, for each sample and group (in each cluster),
29 | the fraction of cells that express a given gene. Here, a gene is considered
30 | to be expressed when the specified measurement value (\code{assay})
31 | lies above the specified threshold value (\code{th}).
32 | }
33 | \examples{
34 | data(example_sce)
35 | library(SingleCellExperiment)
36 |
37 | frq <- calcExprFreqs(example_sce)
38 |
39 | # one assay per cluster
40 | assayNames(frq)
41 |
42 | # expression frequencies by
43 | # sample & group; 1st cluster:
44 | head(assay(frq))
45 |
46 | }
47 | \author{
48 | Helena L Crowell & Mark D Robinson
49 | }
50 |
--------------------------------------------------------------------------------
/man/data.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data.R
3 | \name{data}
4 | \alias{data}
5 | \alias{example_sce}
6 | \title{Example datasets}
7 | \value{
8 | a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
9 | }
10 | \description{
11 | A \code{\link[SingleCellExperiment]{SingleCellExperiment}} containing
12 | 10x droplet-based scRNA-seq PBCM data from 8 Lupus patients befor and after
13 | 6h-treatment with INF-beta (16 samples in total).
14 |
15 | The original data has been filtered to
16 | \itemize{
17 | \item{remove unassigned cells & cell multiplets}
18 | \item{retain only 4 out of 8 samples per experimental group}
19 | \item{retain only 5 out of 8 subpopulations (clusters)}
20 | \item{retain genes with a count > 1 in > 50 cells}
21 | \item{retain cells with > 200 detected genes}
22 | \item{retain at most 100 cells per cluster-sample instance}
23 | }
24 |
25 | Assay \code{logcounts} corresponds to log-normalized values
26 | obtained from \code{\link[scater]{logNormCounts}} with default parameters.
27 |
28 | The original measurement data, as well as gene and cell metadata
29 | is available through the NCBI GEO accession number GSE96583;
30 | code to reproduce this example dataset from the original data
31 | is provided in the examples section.
32 | }
33 | \examples{
34 | \donttest{
35 | # set random seed for cell sampling
36 | set.seed(2929)
37 |
38 | # load data
39 | library(ExperimentHub)
40 | eh <- ExperimentHub()
41 | sce <- eh[["EH2259"]]
42 |
43 | # drop unassigned cells & multiplets
44 | sce <- sce[, !is.na(sce$cell)]
45 | sce <- sce[, sce$multiplets == "singlet"]
46 |
47 | # keep 4 samples per group
48 | sce$id <- paste0(sce$stim, sce$ind)
49 | inds <- sample(sce$ind, 4)
50 | ids <- paste0(levels(sce$stim), rep(inds, each = 2))
51 | sce <- sce[, sce$id \%in\% ids]
52 |
53 | # keep 5 clusters
54 | kids <- c("B cells", "CD4 T cells", "CD8 T cells",
55 | "CD14+ Monocytes", "FCGR3A+ Monocytes")
56 | sce <- sce[, sce$cell \%in\% kids]
57 | sce$cell <- droplevels(sce$cell)
58 |
59 | # basic filtering on genes & cells
60 | gs <- rowSums(counts(sce) > 1) > 50
61 | cs <- colSums(counts(sce) > 0) > 200
62 | sce <- sce[gs, cs]
63 |
64 | # sample max. 100 cells per cluster-sample
65 | cs_by_ks <- split(colnames(sce), list(sce$cell, sce$id))
66 | cs <- sapply(cs_by_ks, function(u)
67 | sample(u, min(length(u), 100)))
68 | sce <- sce[, unlist(cs)]
69 |
70 | # compute logcounts
71 | library(scater)
72 | sce <- computeLibraryFactors(sce)
73 | sce <- logNormCounts(sce)
74 |
75 | # re-format for 'muscat'
76 | sce <- prepSCE(sce,
77 | kid = "cell",
78 | sid = "id",
79 | gid = "stim",
80 | drop = TRUE)
81 | }
82 |
83 | }
84 | \references{
85 | Kang et al. (2018). Multiplexed droplet single-cell RNA-sequencing
86 | using natural genetic variation. \emph{Nature Biotechnology},
87 | \bold{36}(1): 89-94. DOI: 10.1038/nbt.4042.
88 | }
89 | \author{
90 | Helena L Crowell
91 | }
92 |
--------------------------------------------------------------------------------
/man/mmDS.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/mmDS.R, R/utils-mmDS.R
3 | \name{mmDS}
4 | \alias{mmDS}
5 | \alias{.mm_dream}
6 | \alias{.mm_dream2}
7 | \alias{.mm_vst}
8 | \alias{.mm_glmm}
9 | \title{DS analysis using mixed-models (MM)}
10 | \usage{
11 | mmDS(
12 | x,
13 | coef = NULL,
14 | covs = NULL,
15 | method = c("dream2", "dream", "vst", "poisson", "nbinom", "hybrid"),
16 | n_cells = 10,
17 | n_samples = 2,
18 | min_count = 1,
19 | min_cells = 20,
20 | verbose = TRUE,
21 | BPPARAM = SerialParam(progressbar = verbose),
22 | vst = c("sctransform", "DESeq2"),
23 | ddf = c("Satterthwaite", "Kenward-Roger", "lme4"),
24 | dup_corr = FALSE,
25 | trended = FALSE,
26 | bayesian = FALSE,
27 | blind = TRUE,
28 | REML = TRUE,
29 | moderate = FALSE
30 | )
31 |
32 | .mm_dream(
33 | x,
34 | coef = NULL,
35 | covs = NULL,
36 | dup_corr = FALSE,
37 | trended = FALSE,
38 | ddf = c("Satterthwaite", "Kenward-Roger"),
39 | verbose = FALSE,
40 | BPPARAM = SerialParam(progressbar = verbose)
41 | )
42 |
43 | .mm_dream2(
44 | x,
45 | coef = NULL,
46 | covs = NULL,
47 | ddf = c("Satterthwaite", "Kenward-Roger"),
48 | verbose = FALSE,
49 | BPPARAM = SerialParam(progressbar = verbose)
50 | )
51 |
52 | .mm_vst(
53 | x,
54 | vst = c("sctransform", "DESeq2"),
55 | coef = NULL,
56 | covs = NULL,
57 | bayesian = FALSE,
58 | blind = TRUE,
59 | REML = TRUE,
60 | ddf = c("Satterthwaite", "Kenward-Roger", "lme4"),
61 | verbose = FALSE,
62 | BPPARAM = SerialParam(progressbar = verbose)
63 | )
64 |
65 | .mm_glmm(
66 | x,
67 | coef = NULL,
68 | covs = NULL,
69 | family = c("poisson", "nbinom"),
70 | moderate = FALSE,
71 | verbose = TRUE,
72 | BPPARAM = SerialParam(progressbar = verbose)
73 | )
74 | }
75 | \arguments{
76 | \item{x}{a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.}
77 |
78 | \item{coef}{character specifying the coefficient to test.
79 | If NULL (default), will test the last level of \code{"group_id"}.}
80 |
81 | \item{covs}{character vector of \code{colData(x)}
82 | column names to use as covariates.}
83 |
84 | \item{method}{a character string.
85 | Either \code{"dream2"} (default, lme4 with voom-weights),
86 | \code{"dream"} (previous implementation of the dream method),
87 | \code{"vst"} (variance-stabilizing transformation),
88 | \code{"poisson"} (poisson GLM-MM),
89 | \code{"nbinom"} (negative binomial GLM-MM),
90 | \code{"hybrid"} (combination of pseudobulk and poisson methods)
91 | or a function accepting the same arguments.}
92 |
93 | \item{n_cells}{number of cells per cluster-sample
94 | required to consider a sample for testing.}
95 |
96 | \item{n_samples}{number of samples per group
97 | required to consider a cluster for testing.}
98 |
99 | \item{min_count}{numeric. For a gene to be tested in a given cluster,
100 | at least \code{min_cells} must have a count >= \code{min_count}.}
101 |
102 | \item{min_cells}{number (or fraction, if < 1) of cells with a count >
103 | \code{min_count} required for a gene to be tested in a given cluster.}
104 |
105 | \item{verbose}{logical specifying whether messages
106 | on progress and a progress bar should be displayed.}
107 |
108 | \item{BPPARAM}{a \code{\link[BiocParallel]{BiocParallelParam}}
109 | object specifying how differential testing should be parallelized.}
110 |
111 | \item{vst}{method to use as variance-stabilizing transformations.
112 | \code{"sctransform"} for \code{\link[sctransform]{vst}}; \code{"DESeq2"}
113 | for \code{\link[DESeq2]{varianceStabilizingTransformation}}.}
114 |
115 | \item{ddf}{character string specifying the method for estimating
116 | the effective degrees of freedom. For \code{method = "dream"},
117 | either \code{"Satterthwaite"} (faster) or \code{"Kenward-Roger"}
118 | (more accurate); see \code{?variancePartition::dream} for details.
119 | For \code{method = "vst"}, method \code{"lme4"} is also valid;
120 | see \code{\link[lmerTest]{contest.lmerModLmerTest}}.}
121 |
122 | \item{dup_corr}{logical; whether to use
123 | \code{\link[limma:dupcor]{duplicateCorrelation}}.}
124 |
125 | \item{trended}{logical; whether to use expression-dependent variance priors
126 | in \code{\link[limma]{eBayes}}.}
127 |
128 | \item{bayesian}{logical; whether to use bayesian mixed models.}
129 |
130 | \item{blind}{logical; whether to ignore experimental design for the vst.}
131 |
132 | \item{REML}{logical; whether to maximize REML instead of log-likelihood.}
133 |
134 | \item{moderate}{logical; whether to perform empirical Bayes moderation.}
135 |
136 | \item{family}{character string specifying which GLMM to fit:
137 | \code{"poisson"} for \code{\link[blme:blmer]{bglmer}},
138 | \code{"nbinom"} for \code{\link[glmmTMB]{glmmTMB}}.}
139 | }
140 | \value{
141 | a data.frame
142 | }
143 | \description{
144 | Performs cluster-wise DE analysis by fitting cell-level models.
145 | }
146 | \details{
147 | The \code{.mm_*} functions (e.g. \code{.mm_dream}) expect cells from a single
148 | cluster, and do not perform filtering or handle incorrect parameters well.
149 | Meant to be called by \code{mmDS} with \code{method = c("dream", "vst")} and
150 | \code{vst = c("sctransform", "DESeq2")} to be applied across all clusters.
151 | \describe{
152 | \item{\code{method = "dream2"}}{
153 | \code{variancePartition}'s (>=1.14.1) voom-lme4-implementation
154 | of mixed models for RNA-seq data; function \code{dream}.}
155 | \item{\code{method = "dream"}}{
156 | \code{variancePartition}'s older voom-lme4-implementation
157 | of mixed models for RNA-seq data; function \code{dream}.}
158 | \item{\code{method = "vst"}}{
159 | \describe{
160 | \item{\code{vst = "sctransform"}}{
161 | \code{lmer} or \code{blmer} mixed models on
162 | \code{\link[sctransform]{vst}} transformed counts.}
163 | \item{\code{vst = "DESeq2"}}{
164 | \code{\link[DESeq2]{varianceStabilizingTransformation}}
165 | followed by \code{lme4} mixed models.}}}}
166 | }
167 | \section{Functions}{
168 | \itemize{
169 | \item \code{.mm_dream()}: see details.
170 |
171 | \item \code{.mm_dream2()}: see details.
172 |
173 | \item \code{.mm_vst()}: see details.
174 |
175 | \item \code{.mm_glmm()}: see details.
176 |
177 | }}
178 | \examples{
179 | # subset "B cells" cluster
180 | data(example_sce)
181 | b_cells <- example_sce$cluster_id == "B cells"
182 | sub <- example_sce[, b_cells]
183 | sub$cluster_id <- droplevels(sub$cluster_id)
184 |
185 | # downsample to 100 genes
186 | gs <- sample(nrow(sub), 100)
187 | sub <- sub[gs, ]
188 |
189 | # run DS analysis using cell-level mixed-model
190 | res <- mmDS(sub, method = "dream", verbose = FALSE)
191 | head(res$`B cells`)
192 |
193 | }
194 | \references{
195 | Crowell, HL, Soneson, C, Germain, P-L, Calini, D,
196 | Collin, L, Raposo, C, Malhotra, D & Robinson, MD:
197 | On the discovery of population-specific state transitions from
198 | multi-sample multi-condition single-cell RNA sequencing data.
199 | \emph{bioRxiv} \strong{713412} (2018).
200 | doi: \url{https://doi.org/10.1101/713412}
201 | }
202 | \author{
203 | Pierre-Luc Germain & Helena L Crowell
204 | }
205 |
--------------------------------------------------------------------------------
/man/pbDS.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/pbDS.R
3 | \name{pbDS}
4 | \alias{pbDS}
5 | \alias{pbDD}
6 | \title{pseudobulk DS analysis}
7 | \usage{
8 | pbDS(
9 | pb,
10 | method = c("edgeR", "DESeq2", "limma-trend", "limma-voom", "DD"),
11 | design = NULL,
12 | coef = NULL,
13 | contrast = NULL,
14 | min_cells = 10,
15 | filter = c("both", "genes", "samples", "none"),
16 | treat = FALSE,
17 | verbose = TRUE,
18 | BPPARAM = SerialParam(progressbar = verbose)
19 | )
20 |
21 | pbDD(
22 | pb,
23 | design = NULL,
24 | coef = NULL,
25 | contrast = NULL,
26 | min_cells = 10,
27 | filter = c("both", "genes", "samples", "none"),
28 | verbose = TRUE,
29 | BPPARAM = SerialParam(progressbar = verbose)
30 | )
31 | }
32 | \arguments{
33 | \item{pb}{a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
34 | containing pseudobulks as returned by \code{\link{aggregateData}}.}
35 |
36 | \item{method}{a character string.}
37 |
38 | \item{design}{For methods \code{"edegR"} and \code{"limma"}, a design matrix
39 | with row & column names(!) created with \code{\link[stats]{model.matrix}};
40 | For \code{"DESeq2"}, a formula with variables in \code{colData(pb)}.
41 | Defaults to \code{~ group_id} or the corresponding \code{model.matrix}.}
42 |
43 | \item{coef}{passed to \code{\link[edgeR]{glmQLFTest}},
44 | \code{\link[limma]{contrasts.fit}}, \code{\link[DESeq2]{results}}
45 | for \code{method = "edgeR", "limma-x", "DESeq2"}, respectively.
46 | Can be a list for multiple, independent comparisons.}
47 |
48 | \item{contrast}{a matrix of contrasts to test for
49 | created with \code{\link[limma]{makeContrasts}}.}
50 |
51 | \item{min_cells}{a numeric. Specifies the minimum number of cells in a given
52 | cluster-sample required to consider the sample for differential testing.}
53 |
54 | \item{filter}{character string specifying whether
55 | to filter on genes, samples, both or neither.}
56 |
57 | \item{treat}{logical specifying whether empirical Bayes moderated-t
58 | p-values should be computed relative to a minimum fold-change threshold.
59 | Only applicable for methods \code{"limma-x"}
60 | (\code{\link[limma:eBayes]{treat}}) and \code{"edgeR"}
61 | (\code{\link[edgeR]{glmTreat}}), and ignored otherwise.}
62 |
63 | \item{verbose}{logical. Should information on progress be reported?}
64 |
65 | \item{BPPARAM}{a \code{\link[BiocParallel]{BiocParallelParam}}
66 | object specifying how differential testing should be parallelized.}
67 | }
68 | \value{
69 | a list containing \itemize{
70 | \item a data.frame with differential testing results,
71 | \item a \code{\link[edgeR]{DGEList}} object of length nb.-clusters, and
72 | \item the \code{design} matrix, and \code{contrast} or \code{coef} used.}
73 | }
74 | \description{
75 | \code{pbDS} tests for DS after aggregating single-cell
76 | measurements to pseudobulk data, by applying bulk RNA-seq DE methods,
77 | such as \code{edgeR}, \code{DESeq2} and \code{limma}.
78 | }
79 | \examples{
80 | # simulate 5 clusters, 20\% of DE genes
81 | data(example_sce)
82 |
83 | # compute pseudobulk sum-counts & run DS analysis
84 | pb <- aggregateData(example_sce)
85 | res <- pbDS(pb, method = "limma-trend")
86 |
87 | names(res)
88 | names(res$table)
89 | head(res$table$stim$`B cells`)
90 |
91 | # count nb. of DE genes by cluster
92 | vapply(res$table$stim, function(u)
93 | sum(u$p_adj.loc < 0.05), numeric(1))
94 |
95 | # get top 5 hits for ea. cluster w/ abs(logFC) > 1
96 | library(dplyr)
97 | lapply(res$table$stim, function(u)
98 | filter(u, abs(logFC) > 1) \%>\%
99 | arrange(p_adj.loc) \%>\%
100 | slice(seq_len(5)))
101 |
102 | }
103 | \references{
104 | Crowell, HL, Soneson, C, Germain, P-L, Calini, D,
105 | Collin, L, Raposo, C, Malhotra, D & Robinson, MD:
106 | On the discovery of population-specific state transitions from
107 | multi-sample multi-condition single-cell RNA sequencing data.
108 | \emph{bioRxiv} \strong{713412} (2018).
109 | doi: \url{https://doi.org/10.1101/713412}
110 | }
111 | \author{
112 | Helena L Crowell & Mark D Robinson
113 | }
114 |
--------------------------------------------------------------------------------
/man/pbFlatten.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/pbFlatten.R
3 | \name{pbFlatten}
4 | \alias{pbFlatten}
5 | \title{pbFlatten
6 | Flatten pseudobulk SCE}
7 | \usage{
8 | pbFlatten(pb, normalize = TRUE)
9 | }
10 | \arguments{
11 | \item{pb}{a pseudobulk \code{\link[SingleCellExperiment]{SingleCellExperiment}}
12 | as returned by \code{\link{aggregateData}}, with different subpopulations as assays.}
13 |
14 | \item{normalize}{logical specifying whether to compute a \code{logcpm} assay.}
15 | }
16 | \value{
17 | a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
18 | }
19 | \description{
20 | Flattens a pseudobulk \code{\link[SingleCellExperiment]{SingleCellExperiment}}
21 | as returned by \code{\link{aggregateData}} such that all cell subpopulations
22 | are represented as a single assay.
23 | }
24 | \examples{
25 | data(example_sce)
26 | library(SingleCellExperiment)
27 | pb_stack <- aggregateData(example_sce)
28 | (pb_flat <- pbFlatten(pb_stack))
29 | ncol(pb_flat) == ncol(pb_stack)*length(assays(pb_stack))
30 |
31 | }
32 |
--------------------------------------------------------------------------------
/man/pbHeatmap.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/pbHeatmap.R
3 | \name{pbHeatmap}
4 | \alias{pbHeatmap}
5 | \title{Heatmap of cluster-sample pseudobulks}
6 | \usage{
7 | pbHeatmap(
8 | x,
9 | y,
10 | k = NULL,
11 | g = NULL,
12 | c = NULL,
13 | top_n = 20,
14 | fdr = 0.05,
15 | lfc = 1,
16 | sort_by = "p_adj.loc",
17 | decreasing = FALSE,
18 | assay = "logcounts",
19 | fun = mean,
20 | normalize = TRUE,
21 | col = viridis(10),
22 | row_anno = TRUE,
23 | col_anno = TRUE
24 | )
25 | }
26 | \arguments{
27 | \item{x}{a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.}
28 |
29 | \item{y}{a list of DS analysis results as returned by
30 | \code{\link{pbDS}} or \code{\link{mmDS}}.}
31 |
32 | \item{k}{character vector; specifies which cluster ID(s) to retain.
33 | Defaults to \code{levels(x$cluster_id)}.}
34 |
35 | \item{g}{character vector; specifies which genes to retain.
36 | Defaults to considering all genes.}
37 |
38 | \item{c}{character string; specifies which contrast/coefficient to retain.
39 | Defaults to \code{names(y$table)[1]}.}
40 |
41 | \item{top_n}{single numeric; number of genes to retain per cluster.}
42 |
43 | \item{fdr, lfc}{single numeric; FDR and logFC cutoffs to filter results by.
44 | The specified FDR threshold is applied to \code{p_adj.loc} values.}
45 |
46 | \item{sort_by}{character string specifying
47 | a numeric results table column to sort by;
48 | \code{"none"} to retain original ordering.}
49 |
50 | \item{decreasing}{logical; whether to sort
51 | in decreasing order of \code{sort_by}.}
52 |
53 | \item{assay}{character string; specifies which assay to use;
54 | should be one of \code{assayNames(x)}.}
55 |
56 | \item{fun}{function to use as summary statistic,
57 | e.g., mean, median, sum (depending on the input assay).}
58 |
59 | \item{normalize}{logical; whether to apply a z-normalization
60 | to each row (gene) of the cluster-sample pseudobulk data.}
61 |
62 | \item{col}{character vector of colors or color mapping function
63 | generated with \code{\link[circlize]{colorRamp2}}. Passed to
64 | argument \code{col} in \code{\link[ComplexHeatmap]{Heatmap}}
65 | (see \code{?ComplexHeatmap::Heatmap} for details).}
66 |
67 | \item{row_anno, col_anno}{logical; whether to render
68 | annotations of cluster and group IDs, respectively.}
69 | }
70 | \value{
71 | a \code{\link{HeatmapList-class}} object.
72 | }
73 | \description{
74 | ...
75 | }
76 | \examples{
77 | # compute pseudobulks & run DS analysis
78 | data(example_sce)
79 | pb <- aggregateData(example_sce)
80 | res <- pbDS(pb)
81 |
82 | # cluster-sample expression means
83 | pbHeatmap(example_sce, res)
84 |
85 | # include only a single cluster
86 | pbHeatmap(example_sce, res, k = "B cells")
87 |
88 | # plot specific gene across all clusters
89 | pbHeatmap(example_sce, res, g = "ISG20")
90 |
91 | }
92 | \author{
93 | Helena L Crowell
94 | }
95 |
--------------------------------------------------------------------------------
/man/pbMDS.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/pbMDS.R
3 | \name{pbMDS}
4 | \alias{pbMDS}
5 | \title{Pseudobulk-level MDS plot}
6 | \usage{
7 | pbMDS(x)
8 | }
9 | \arguments{
10 | \item{x}{a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
11 | containing cluster-sample pseudobulks as returned by
12 | \code{\link{aggregateData}} with argument
13 | \code{by = c("cluster_id", "sample_id")}.}
14 | }
15 | \value{
16 | a \code{ggplot} object.
17 | }
18 | \description{
19 | Renders a multidimensional scaling (MDS)
20 | where each point represents a cluster-sample instance;
21 | with points colored by cluster ID and shaped by group ID.
22 | }
23 | \examples{
24 | data(example_sce)
25 | pb <- aggregateData(example_sce)
26 | pbMDS(pb)
27 |
28 | }
29 | \author{
30 | Helena L Crowell & Mark D Robinson
31 | }
32 |
--------------------------------------------------------------------------------
/man/prepSCE.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/prepSCE.R
3 | \name{prepSCE}
4 | \alias{prepSCE}
5 | \title{Prepare SCE for DS analysis}
6 | \usage{
7 | prepSCE(
8 | x,
9 | kid = "cluster_id",
10 | sid = "sample_id",
11 | gid = "group_id",
12 | drop = FALSE
13 | )
14 | }
15 | \arguments{
16 | \item{x}{a \linkS4class{SingleCellExperiment}.}
17 |
18 | \item{kid, sid, gid}{character strings specifying
19 | the \code{colData(x)} columns containing cluster assignments,
20 | unique sample identifiers, and group IDs (e.g., treatment).}
21 |
22 | \item{drop}{logical. Specifies whether \code{colData(x)} columns
23 | besides those specified as \code{cluster_id,sample_id,group_id}
24 | should be retained (default \code{drop = FALSE})
25 | or removed (\code{drop = TRUE}).}
26 | }
27 | \value{
28 | a \linkS4class{SingleCellExperiment}.
29 | }
30 | \description{
31 | ...
32 | }
33 | \examples{
34 | # generate random counts
35 | ng <- 50
36 | nc <- 200
37 |
38 | # generate some cell metadata
39 | gids <- sample(c("groupA", "groupB"), nc, TRUE)
40 | sids <- sample(paste0("sample", seq_len(3)), nc, TRUE)
41 | kids <- sample(paste0("cluster", seq_len(5)), nc, TRUE)
42 | batch <- sample(seq_len(3), nc, TRUE)
43 | cd <- data.frame(group = gids, id = sids, cluster = kids, batch)
44 |
45 | # construct SCE
46 | library(scuttle)
47 | sce <- mockSCE(ncells = nc, ngenes = ng)
48 | colData(sce) <- cbind(colData(sce), cd)
49 |
50 | # prep. for workflow
51 | sce <- prepSCE(sce, kid = "cluster", sid = "id", gid = "group")
52 | head(colData(sce))
53 | metadata(sce)$experiment_info
54 | sce
55 |
56 | }
57 | \author{
58 | Helena L Crowell
59 | }
60 |
--------------------------------------------------------------------------------
/man/prepSim.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/prepSim.R
3 | \name{prepSim}
4 | \alias{prepSim}
5 | \title{SCE preparation for \code{\link{simData}}}
6 | \usage{
7 | prepSim(
8 | x,
9 | min_count = 1,
10 | min_cells = 10,
11 | min_genes = 100,
12 | min_size = 100,
13 | group_keep = NULL,
14 | verbose = TRUE
15 | )
16 | }
17 | \arguments{
18 | \item{x}{a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.}
19 |
20 | \item{min_count, min_cells}{used for filtering of genes; only genes with
21 | a count > \code{min_count} in >= \code{min_cells} will be retained.}
22 |
23 | \item{min_genes}{used for filtering cells;
24 | only cells with a count > 0 in >= \code{min_genes} will be retained.}
25 |
26 | \item{min_size}{used for filtering subpopulation-sample combinations;
27 | only instances with >= \code{min_size} cells will be retained.
28 | Specifying \code{min_size = NULL} skips this step.}
29 |
30 | \item{group_keep}{character string; if \code{nlevels(x$group_id) > 1},
31 | specifies which group of samples to keep (see details). The default
32 | NULL retains samples from \code{levels(x$group_id)[1]}; otherwise,
33 | if `colData(x)$group_id` is not specified, all samples will be kept.}
34 |
35 | \item{verbose}{logical; should information on progress be reported?}
36 | }
37 | \value{
38 | a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
39 | containing, for each cell, library size (\code{colData(x)$offset})
40 | and, for each gene, dispersion and sample-specific mean estimates
41 | (\code{rowData(x)$dispersion} and \code{$beta.sample_id}, respectively).
42 | }
43 | \description{
44 | \code{prepSim} prepares an input SCE for simulation
45 | with \code{muscat}'s \code{\link{simData}} function by
46 | \enumerate{
47 | \item{basic filtering of genes and cells}
48 | \item{(optional) filtering of subpopulation-sample instances}
49 | \item{estimation of cell (library sizes) and gene parameters
50 | (dispersions and sample-specific means), respectively.}
51 | }
52 | }
53 | \details{
54 | For each gene \eqn{g}, \code{prepSim} fits a model to estimate
55 | sample-specific means \eqn{\beta_g^s}, for each sample \eqn{s},
56 | and dispersion parameters \eqn{\phi_g} using \code{edgeR}'s
57 | \code{\link[edgeR]{estimateDisp}} function with default parameters.
58 | Thus, the reference count data is modeled as NB distributed:
59 | \deqn{Y_{gc} \sim NB(\mu_{gc}, \phi_g)}
60 | for gene \eqn{g} and cell \eqn{c}, where the mean
61 | \eqn{\mu_{gc} = \exp(\beta_{g}^{s(c)}) \cdot \lambda_c}. Here,
62 | \eqn{\beta_{g}^{s(c)}} is the relative abundance of gene \eqn{g}
63 | in sample \eqn{s(c)}, \eqn{\lambda_c} is the library size
64 | (total number of counts), and \eqn{\phi_g} is the dispersion.
65 | }
66 | \examples{
67 | # estimate simulation parameters
68 | data(example_sce)
69 | ref <- prepSim(example_sce)
70 |
71 | # tabulate number of genes/cells before vs. after
72 | ns <- cbind(
73 | before = dim(example_sce),
74 | after = dim(ref))
75 | rownames(ns) <- c("#genes", "#cells")
76 | ns
77 |
78 | library(SingleCellExperiment)
79 | head(rowData(ref)) # gene parameters
80 | head(colData(ref)) # cell parameters
81 |
82 | }
83 | \references{
84 | Crowell, HL, Soneson, C, Germain, P-L, Calini, D,
85 | Collin, L, Raposo, C, Malhotra, D & Robinson, MD:
86 | On the discovery of population-specific state transitions from
87 | multi-sample multi-condition single-cell RNA sequencing data.
88 | \emph{bioRxiv} \strong{713412} (2018).
89 | doi: \url{https://doi.org/10.1101/713412}
90 | }
91 | \author{
92 | Helena L Crowell
93 | }
94 |
--------------------------------------------------------------------------------
/man/resDS.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/resDS.R
3 | \name{resDS}
4 | \alias{resDS}
5 | \title{resDS
6 | Formatting of DS analysis results}
7 | \usage{
8 | resDS(
9 | x,
10 | y,
11 | bind = c("row", "col"),
12 | frq = FALSE,
13 | cpm = FALSE,
14 | digits = 3,
15 | sep = "__",
16 | ...
17 | )
18 | }
19 | \arguments{
20 | \item{x}{a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.}
21 |
22 | \item{y}{a list of DS testing results as returned
23 | by \code{\link{pbDS}} or \code{\link{mmDS}}.}
24 |
25 | \item{bind}{character string specifying the output format (see details).}
26 |
27 | \item{frq}{logical or a pre-computed list of expression frequencies
28 | as returned by \code{\link{calcExprFreqs}}.}
29 |
30 | \item{cpm}{logical specifying whether CPM by cluster-sample
31 | should be appendeded to the output result table(s).}
32 |
33 | \item{digits}{integer value specifying the
34 | number of significant digits to maintain.}
35 |
36 | \item{sep}{character string to use as separator
37 | when constructing new column names.}
38 |
39 | \item{...}{optional arguments passed to
40 | \code{\link{calcExprFreqs}} if \code{frq = TRUE}.}
41 | }
42 | \value{
43 | returns a `data.frame`.
44 | }
45 | \description{
46 | \code{resDS} provides a simple wrapper to format cluster-level
47 | differential testing results into an easily filterable table, and
48 | to optionally append gene expression frequencies by cluster-sample
49 | & -group, as well as cluster-sample-wise CPM.
50 | }
51 | \details{
52 | When \code{bind = "col"}, the list of DS testing results at
53 | \code{y$table} will be merge vertically (by column) into a single table
54 | in tidy format with column \code{contrast/coef} specifying the comparison.
55 |
56 | Otherwise, when \code{bind = "row"}, an identifier of the respective
57 | contrast or coefficient will be appended to the column names,
58 | and all tables will be merge horizontally (by row).
59 |
60 | Expression frequencies pre-computed with \code{\link{calcExprFreqs}}
61 | may be provided with \code{frq}. Alternatively, when \code{frq = TRUE},
62 | expression frequencies can be computed directly, and additional arguments
63 | may be passed to \code{\link{calcExprFreqs}} (see examples below).
64 | }
65 | \examples{
66 | # compute pseudobulks (sum of counts)
67 | data(example_sce)
68 | pb <- aggregateData(example_sce,
69 | assay = "counts", fun = "sum")
70 |
71 | # run DS analysis (edgeR on pseudobulks)
72 | res <- pbDS(pb, method = "edgeR")
73 |
74 | head(resDS(example_sce, res, bind = "row")) # tidy format
75 | head(resDS(example_sce, res, bind = "col", digits = Inf))
76 |
77 | # append CPMs & expression frequencies
78 | head(resDS(example_sce, res, cpm = TRUE))
79 | head(resDS(example_sce, res, frq = TRUE))
80 |
81 | # pre-computed expression frequencies & append
82 | frq <- calcExprFreqs(example_sce, assay = "counts", th = 0)
83 | head(resDS(example_sce, res, frq = frq))
84 |
85 | }
86 | \author{
87 | Helena L Crowell & Mark D Robinson
88 | }
89 |
--------------------------------------------------------------------------------
/man/simData.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/simData.R
3 | \name{simData}
4 | \alias{simData}
5 | \title{simData}
6 | \usage{
7 | simData(
8 | x,
9 | ng = nrow(x),
10 | nc = ncol(x),
11 | ns = NULL,
12 | nk = NULL,
13 | probs = NULL,
14 | dd = TRUE,
15 | p_dd = diag(6)[1, ],
16 | paired = FALSE,
17 | p_ep = 0.5,
18 | p_dp = 0.3,
19 | p_dm = 0.5,
20 | p_type = 0,
21 | lfc = 2,
22 | rel_lfc = NULL,
23 | phylo_tree = NULL,
24 | phylo_pars = c(ifelse(is.null(phylo_tree), 0, 0.1), 3),
25 | force = FALSE
26 | )
27 | }
28 | \arguments{
29 | \item{x}{a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.}
30 |
31 | \item{ng}{number of genes to simulate. Importantly, for the library sizes
32 | computed by \code{\link{prepSim}} (= \code{exp(x$offset)}) to make sense,
33 | the number of simulated genes should match with the number of genes
34 | in the reference. To simulate a reduced number of genes, e.g. for
35 | testing and development purposes, please set \code{force = TRUE}.}
36 |
37 | \item{nc}{number of cells to simulate.}
38 |
39 | \item{ns}{number of samples to simulate; defaults to as many as
40 | available in the reference to avoid duplicated reference samples.
41 | Specifically, the number of samples will be set to
42 | \code{n = nlevels(x$sample_id)} when \code{dd = FALSE},
43 | \code{n} per group when \code{dd, paired = TRUE}, and
44 | \code{floor(n/2)} per group when \code{dd = TRUE, paired = FALSE}.
45 | When a larger number samples should be simulated, set \code{force = TRUE}.}
46 |
47 | \item{nk}{number of clusters to simulate; defaults to the number
48 | of available reference clusters (\code{nlevels(x$cluster_id)}).}
49 |
50 | \item{probs}{a list of length 3 containing probabilities of a cell belonging
51 | to each cluster, sample, and group, respectively. List elements must be
52 | NULL (equal probabilities) or numeric values in [0, 1] that sum to 1.}
53 |
54 | \item{dd}{whether or not to simulate differential distributions; if TRUE,
55 | two groups are simulated and \code{ns} corresponds to the number of
56 | samples per group, else one group with \code{ns} samples is simulated.}
57 |
58 | \item{p_dd}{numeric vector of length 6.
59 | Specifies the probability of a gene being
60 | EE, EP, DE, DP, DM, or DB, respectively.}
61 |
62 | \item{paired}{logical specifying whether a paired design should
63 | be simulated (both groups use the same set of reference samples)
64 | or not (reference samples are drawn at random).}
65 |
66 | \item{p_ep, p_dp, p_dm}{numeric specifying the proportion of cells
67 | to be shifted to a different expression state in one group (see details).}
68 |
69 | \item{p_type}{numeric. Probability of EE/EP gene being a type-gene.
70 | If a gene is of class "type" in a given cluster, a unique mean
71 | will be used for that gene in the respective cluster.}
72 |
73 | \item{lfc}{numeric value to use as mean logFC
74 | (logarithm base 2) for DE, DP, DM, and DB type of genes.}
75 |
76 | \item{rel_lfc}{numeric vector of relative logFCs for each cluster.
77 | Should be of length \code{nlevels(x$cluster_id)} with
78 | \code{levels(x$cluster_id)} as names.
79 | Defaults to factor of 1 for all clusters.}
80 |
81 | \item{phylo_tree}{newick tree text representing cluster relations
82 | and their relative distance. An explanation of the syntax can be found
83 | \href{http://evolution.genetics.washington.edu/phylip/newicktree.html}{here}.
84 | The distance between the nodes, except for the original branch, will be
85 | translated in the number of shared genes between the clusters belonging to
86 | these nodes (this relation is controlled with \code{phylo_pars}).
87 | The distance between two clusters is defined as the sum
88 | of the branches lengths separating them.}
89 |
90 | \item{phylo_pars}{vector of length 2 providing the parameters that control
91 | the number of type genes. Passed to an exponential PDF (see details).}
92 |
93 | \item{force}{logical specifying whether to force simulation
94 | when \code{ng} and/or \code{ns} don't match the number of
95 | available reference genes and samples, respectively.}
96 | }
97 | \value{
98 | a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
99 | containing multiple clusters & samples across 2 groups
100 | as well as the following metadata: \describe{
101 | \item{cell metadata (\code{colData(.)})}{a \code{DataFrame} containing,
102 | containing, for each cell, it's cluster, sample, and group ID.}
103 | \item{gene metadata (\code{rowData(.)})}{a \code{DataFrame} containing,
104 | for each gene, it's \code{class} (one of "state", "type", "none") and
105 | specificity (\code{specs}; NA for genes of type "state", otherwise
106 | a character vector of clusters that share the given gene).}
107 | \item{experiment metadata (\code{metadata(.)})}{
108 | \describe{
109 | \item{\code{experiment_info}}{a \code{data.frame}
110 | summarizing the experimental design.}
111 | \item{\code{n_cells}}{the number of cells for each sample.}
112 | \item{\code{gene_info}}{a \code{data.frame} containing, for each gene
113 | in each cluster, it's differential distribution \code{category},
114 | mean \code{logFC} (NA for genes for categories "ee" and "ep"),
115 | gene used as reference (\code{sim_gene}), dispersion \code{sim_disp},
116 | and simulation means for each group \code{sim_mean.A/B}.}
117 | \item{\code{ref_sids/kidskids}}{the sample/cluster IDs used as reference.}
118 | \item{\code{args}}{a list of the function call's input arguments.}}}}
119 | }
120 | \description{
121 | Simulation of complex scRNA-seq data
122 | }
123 | \details{
124 | \code{simData} simulates multiple clusters and samples
125 | across 2 experimental conditions from a real scRNA-seq data set.
126 |
127 | The simulation of type genes can be performed in 2 ways;
128 | (1) via \code{p_type} to simulate independent clusters, OR
129 | (2) via \code{phylo_tree} to simulate a hierarchical cluster structure.
130 |
131 | For (1), a subset of \code{p_type} \% of genes are selected per cluster
132 | to use a different references genes than the remainder of clusters,
133 | giving rise to cluster-specific NB means for count sampling.
134 |
135 | For (2), the number of shared/type genes at each node
136 | are given by \code{a*G*e^(-b*d)}, where \itemize{
137 | \item{\code{a} -- controls the percentage of shared genes between nodes.
138 | By default, at most 10\% of the genes are reserved as type genes
139 | (when \code{b} = 0). However, it is advised to tune this parameter
140 | depending on the input \code{prep_sce}.}
141 | \item{\code{b} -- determines how the number of shared genes
142 | decreases with increasing distance d between clusters
143 | (defined through \code{phylo_tree}).}}
144 | }
145 | \examples{
146 | data(example_sce)
147 | library(SingleCellExperiment)
148 |
149 | # prep. SCE for simulation
150 | ref <- prepSim(example_sce)
151 |
152 | # simulate data
153 | (sim <- simData(ref, nc = 200,
154 | p_dd = c(0.9, 0, 0.1, 0, 0, 0),
155 | ng = 100, force = TRUE,
156 | probs = list(NULL, NULL, c(1, 0))))
157 |
158 | # simulation metadata
159 | head(gi <- metadata(sim)$gene_info)
160 |
161 | # should be ~10\% DE
162 | table(gi$category)
163 |
164 | # unbalanced sample sizes
165 | sim <- simData(ref, nc = 100, ns = 2,
166 | probs = list(NULL, c(0.25, 0.75), NULL),
167 | ng = 10, force = TRUE)
168 | table(sim$sample_id)
169 |
170 | # one group only
171 | sim <- simData(ref, nc = 100,
172 | probs = list(NULL, NULL, c(1, 0)),
173 | ng = 10, force = TRUE)
174 | levels(sim$group_id)
175 |
176 | # HIERARCHICAL CLUSTER STRUCTURE
177 | # define phylogram specifying cluster relations
178 | phylo_tree <- "(('cluster1':0.1,'cluster2':0.1):0.4,'cluster3':0.5);"
179 | # verify syntax & visualize relations
180 | library(phylogram)
181 | plot(read.dendrogram(text = phylo_tree))
182 |
183 | # let's use a more complex phylogeny
184 | phylo_tree <- "(('cluster1':0.4,'cluster2':0.4):0.4,('cluster3':
185 | 0.5,('cluster4':0.2,'cluster5':0.2,'cluster6':0.2):0.4):0.4);"
186 | plot(read.dendrogram(text = phylo_tree))
187 |
188 | # simulate clusters accordingly
189 | sim <- simData(ref,
190 | phylo_tree = phylo_tree,
191 | phylo_pars = c(0.1, 3),
192 | ng = 500, force = TRUE)
193 | # view information about shared 'type' genes
194 | table(rowData(sim)$class)
195 |
196 | }
197 | \references{
198 | Crowell, HL, Soneson, C, Germain, P-L, Calini, D,
199 | Collin, L, Raposo, C, Malhotra, D & Robinson, MD:
200 | On the discovery of population-specific state transitions from
201 | multi-sample multi-condition single-cell RNA sequencing data.
202 | \emph{bioRxiv} \strong{713412} (2018).
203 | doi: \url{https://doi.org/10.1101/713412}
204 | }
205 | \author{
206 | Helena L Crowell & Anthony Sonrel
207 | }
208 |
--------------------------------------------------------------------------------
/man/stagewise_DS_DD.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/stagewiseDD.R
3 | \name{stagewise_DS_DD}
4 | \alias{stagewise_DS_DD}
5 | \title{Perform two-stage testing on DS and DD analysis results}
6 | \usage{
7 | stagewise_DS_DD(res_DS, res_DD, sce = NULL, verbose = FALSE)
8 | }
9 | \arguments{
10 | \item{res_DS}{a list of DS testing results as returned
11 | by \code{\link{pbDS}} or \code{\link{mmDS}}.}
12 |
13 | \item{res_DD}{a list of DD testing results as returned
14 | by \code{\link{pbDD}} (or \code{\link{pbDS}} with \code{method="DD"}).}
15 |
16 | \item{sce}{(optional) \code{SingleCellExperiment} object containing the data
17 | that underlies testing, prior to summarization with \code{\link{aggregateData}}.
18 | Used for validation of inputs in order to prevent unexpected failure/results.}
19 |
20 | \item{verbose}{logical. Should information on progress be reported?}
21 | }
22 | \value{
23 | A list of \code{DFrame}s containing results for each contrast and cluster.
24 | Each table contains DS and DD results for genes shared between analyses,
25 | as well as results from stagewise testing analysis, namely:
26 | \itemize{
27 | \item{\code{p_adj}: FDR adjusted p-values for the
28 | screening hypothesis that a gene is neither DS nor DD
29 | (see \code{?stageR::getAdjustedPValues} for details)}
30 | \item{\code{p_val.DS/D}: confirmation stage p-values for DS/D}}
31 | }
32 | \description{
33 | Perform two-stage testing on DS and DD analysis results
34 | }
35 | \examples{
36 | data(example_sce)
37 |
38 | pbs_sum <- aggregateData(example_sce, assay="counts", fun="sum")
39 | pbs_det <- aggregateData(example_sce, assay="counts", fun="num.detected")
40 |
41 | res_DS <- pbDS(pbs_sum, min_cells=0, filter="none", verbose=FALSE)
42 | res_DD <- pbDD(pbs_det, min_cells=0, filter="none", verbose=FALSE)
43 |
44 | res <- stagewise_DS_DD(res_DS, res_DD)
45 | head(res[[1]][[1]]) # results for 1st cluster
46 |
47 | }
48 |
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(muscat)
3 |
4 | test_check("muscat")
5 |
--------------------------------------------------------------------------------
/tests/testthat/test-aggregateData.R:
--------------------------------------------------------------------------------
1 | # load packages
2 | suppressMessages({
3 | library(SingleCellExperiment)
4 | })
5 |
6 | # generate toy dataset
7 | seed <- as.numeric(format(Sys.Date(), "%s"))
8 | set.seed(seed)
9 | sce <- .toySCE()
10 | nk <- length(kids <- levels(sce$cluster_id))
11 | ns <- length(sids <- levels(sce$sample_id))
12 | ng <- length(gids <- levels(sce$group_id))
13 |
14 | test_that("aggregation across 2 factors", {
15 | for (fun in c("sum", "mean", "median")) {
16 | pb <- aggregateData(sce, by = c("cluster_id", "sample_id"), fun = fun)
17 | expect_error(aggregateData(sce, assay = "x"))
18 | expect_error(aggregateData(sce, fun = "x"))
19 | expect_error(aggregateData(sce, by = "x"))
20 |
21 | pars <- metadata(pb)$agg_pars
22 | expect_identical(pars$by, c("cluster_id", "sample_id"))
23 | expect_identical(pars$assay, "counts")
24 | expect_identical(pars$fun, fun)
25 | expect_false(pars$scale)
26 |
27 | expect_is(pb, "SingleCellExperiment")
28 | expect_identical(assayNames(pb), kids)
29 |
30 | expect_identical(nrow(pb), nrow(sce))
31 | expect_identical(ncol(pb), ns)
32 |
33 | expect_identical(rownames(pb), rownames(sce))
34 | expect_identical(colnames(pb), sids)
35 |
36 | expect_equivalent(.n_cells(pb), table(sce$cluster_id, sce$sample_id))
37 |
38 | # 10x random spot check
39 | replicate(10, {
40 | k <- sample(kids, 1)
41 | s <- sample(sids, 1)
42 | g <- sample(rownames(sce), 1)
43 | i <- sce$sample_id == s & sce$cluster_id == k
44 | expect_equal(assays(pb)[[k]][g, s], get(fun)(assay(sce)[g, i]))
45 | })
46 | }
47 | })
48 |
49 | test_that("aggregation across 1 factor", {
50 | for (fun in c("sum", "mean", "median")) {
51 | pb <- aggregateData(sce, by = "cluster_id", fun = fun)
52 | expect_is(pb, "SingleCellExperiment")
53 | expect_identical(nrow(pb), nrow(sce))
54 | expect_identical(ncol(pb), nk)
55 | expect_identical(rownames(pb), rownames(sce))
56 | expect_identical(colnames(pb), kids)
57 | expect_equivalent(table(sce$cluster_id), .n_cells(pb))
58 | # random spot check
59 | k <- sample(kids, 1)
60 | g <- sample(rownames(sce), 1)
61 | i <- sce$cluster_id == k
62 | expect_equal(assay(pb)[g, k], get(fun)(assay(sce)[g, i]))
63 | }
64 | })
65 |
66 | test_that("pbFlatten()", {
67 | x <- aggregateData(sce, by = "cluster_id")
68 | expect_error(pbFlatten(x))
69 | x <- aggregateData(sce)
70 | cd <- colData(y <- pbFlatten(x))
71 | expect_is(y, "SingleCellExperiment")
72 | expect_true(length(assays(y)) == 2)
73 | expect_identical(rownames(y), rownames(x))
74 | expect_true(ncol(y) == nk*ncol(x))
75 | a <- do.call(cbind, as.list(assays(x)))
76 | expect_equivalent(assay(y), a)
77 | expect_true(all(table(cd$sample_id) == nk))
78 | expect_true(all(table(cd$cluster_id) == ns))
79 | expect_true(all(table(cd$group_id) == ns/ng*nk))
80 | expect_equivalent(y$n_cells, c(table(sce$sample_id, sce$cluster_id)))
81 | # without normalization
82 | y <- pbFlatten(x, normalize = FALSE)
83 | expect_true(assayNames(y) == "counts")
84 | })
85 |
--------------------------------------------------------------------------------
/tests/testthat/test-calcExprFreqs.R:
--------------------------------------------------------------------------------
1 | # load packages
2 | suppressMessages({
3 | library(SingleCellExperiment)
4 | })
5 |
6 | # generate toy dataset
7 | seed <- as.numeric(format(Sys.Date(), "%s"))
8 | set.seed(seed)
9 | sce <- .toySCE()
10 |
11 | # put in 50% random 0s
12 | n <- length(assay(sce))
13 | i <- sample(n, round(n * 0.5))
14 | assay(sce)[i] <- 0
15 |
16 | # store nb. / IDs of clusters, samples, groups
17 | nk <- length(kids <- levels(sce$cluster_id))
18 | ns <- length(sids <- levels(sce$sample_id))
19 | ng <- length(gids <- levels(sce$group_id))
20 |
21 | # calculate expression frequencies
22 | x <- calcExprFreqs(sce, assay = "counts", th = 0)
23 |
24 | test_that("Output is correctly structured SCE", {
25 | expect_is(x, "SingleCellExperiment")
26 | expect_identical(assayNames(x), kids)
27 |
28 | expect_identical(nrow(x), nrow(sce))
29 | expect_identical(ncol(x), ns + ng)
30 |
31 | expect_identical(rownames(x), rownames(sce))
32 | expect_identical(colnames(x), c(sids, gids))
33 | })
34 |
35 | test_that("Frequencies lie in [0, 1] w/o any NAs", {
36 | v <- unlist(assays(x))
37 | expect_true(all(v >= 0))
38 | expect_true(all(v <= 1))
39 | expect_true(!any(is.na(v)))
40 | })
41 |
42 | test_that("10x random spot checks", {
43 | replicate(10, {
44 | # sample cluster, sample & group
45 | k <- sample(kids, 1)
46 | s <- sample(sids, 1)
47 | g <- sample(gids, 1)
48 | ki <- sce$cluster_id == k
49 | si <- sce$sample_id == s & ki
50 | gi <- sce$group_id == g & ki
51 | # sample gene & check frequencies vs. truth
52 | i <- sample(rownames(sce), 1)
53 | expect_identical(mean(counts(sce)[i, si] > 0), assays(x)[[k]][i, s])
54 | expect_identical(mean(counts(sce)[i, gi] > 0), assays(x)[[k]][i, g])
55 | })
56 | })
57 |
--------------------------------------------------------------------------------
/tests/testthat/test-mmDS.R:
--------------------------------------------------------------------------------
1 | # load packages
2 | suppressPackageStartupMessages({
3 | library(dplyr)
4 | library(purrr)
5 | library(SingleCellExperiment)
6 | })
7 |
8 | # generate toy dataset
9 | seed <- as.numeric(format(Sys.time(), "%s"))
10 | set.seed(seed)
11 | x <- .toySCE(c(200, 2e3))
12 | x <- x[, x$group_id != "g3"]
13 |
14 | nk <- length(kids <- x$cluster_id)
15 | ns <- length(sids <- x$sample_id)
16 | ng <- length(gids <- x$group_id)
17 |
18 | test_that(".check_args_mmDS()", {
19 | y <- x; class(y) <- "x"
20 | expect_error(mmDS(y))
21 | y <- x; assayNames(y) <- "x"
22 | expect_error(mmDS(y))
23 | expect_error(mmDS(x, "x"))
24 | expect_error(mmDS(x, 100))
25 | expect_error(mmDS(x, covs = "x"))
26 | expect_error(mmDS(x, method = "x"))
27 | expect_error(mmDS(x, method = "vst", vst = "x"))
28 | for (u in c("verbose", "dup_corr", "trended", "bayesian", "blind", "REML"))
29 | for (v in list("x", 1, c(TRUE, TRUE))) {
30 | w <- list(x = x); w[[u]] <- v
31 | expect_error(do.call(mmDS, w))
32 | }
33 | })
34 |
35 | test_that("mmDS() - filtering", {
36 | expect_error(mmDS(x, n_cells = Inf))
37 | ks <- sample(kids, 2)
38 | ks <- as.character(ks)
39 | cs <- x$cluster_id %in% ks
40 | ls <- rowSums(assay(x[, cs]))
41 | o <- order(ls, decreasing = TRUE)
42 | gs <- rownames(x)[o][seq_len(5)]
43 | z <- suppressWarnings(
44 | mmDS(x[gs, cs], verbose = FALSE))
45 | expect_setequal(names(z), ks)
46 | expect_true(all(vapply(map(z, "gene"),
47 | function(u) all(u == gs), logical(1))))
48 | expect_true(all(vapply(ks, function(k)
49 | all(z[[k]]$cluster_id == k), logical(1))))
50 | y <- x[gs, cs]; metadata(y) <- list()
51 | y$group_id <- NULL; expect_error(mmDS(y))
52 | })
53 |
54 | # randomly select 'n_de' genes & bump counts for group 2
55 | n_de <- 5; g2 <- gids == "g2"
56 | de_gs <- sample(rownames(x), n_de)
57 | de_es <- rnbinom(n_de * sum(g2), size = 4, mu = 8)
58 | de_es <- matrix(de_es, n_de, sum(g2))
59 | assay(x[de_gs, g2], withDimnames = FALSE) <- de_es
60 | vst <- .vst_sctransform(x, verbose = FALSE)
61 | assay(x, "vstresiduals") <- vst
62 |
63 | for (fun in paste0(".mm_", c("dream", "dream2", "vst"))) {
64 | test_that(paste("mmDS-utils;", fun), {
65 | cs <- x$cluster_id == kids[1]
66 | gs <- c(de_gs, sample(setdiff(rownames(x), de_gs), 15))
67 | # currently not passing; likely because
68 | # the toy data is too simplistic...
69 | # "poisson", "hybrid", "nbinom"
70 | y <- suppressWarnings(
71 | get(fun)(x[gs, cs], verbose = FALSE))
72 | expect_is(y, "data.frame")
73 | expect_identical(rownames(y), gs)
74 | if (grepl("dream", fun))
75 | y <- y[y$logFC > 0, ]
76 | top <- order(y$p_adj.loc)[seq_len(n_de)]
77 | expect_setequal(rownames(y)[top], de_gs)
78 | })
79 | }
80 |
--------------------------------------------------------------------------------
/tests/testthat/test-pbDS.R:
--------------------------------------------------------------------------------
1 | # load packages
2 | suppressMessages({
3 | library(dplyr)
4 | library(purrr)
5 | library(SingleCellExperiment)
6 | })
7 |
8 | # generate toy dataset
9 | seed <- as.numeric(format(Sys.time(), "%s"))
10 | set.seed(seed)
11 | sce <- .toySCE(c(100, 2e3))
12 |
13 | nk <- length(kids <- levels(sce$cluster_id))
14 | ns <- length(sids <- levels(sce$sample_id))
15 | ng <- length(gids <- levels(sce$group_id))
16 |
17 | g3 <- sce$group_id == "g3"
18 | g23 <- sce$group_id %in% c("g2", "g3")
19 |
20 | # sample 'nde' genes & multiply counts by 10 for 'g2'- & 'g3'-cells
21 | degs <- sample(rownames(sce), (nde <- 5))
22 | assay(sce[degs, g23]) <- assay(sce[degs, g23]) * 10
23 | pb <- aggregateData(sce, assay = "counts", fun = "sum")
24 |
25 | # specify design & contrast matrix
26 | ei <- metadata(sce)$experiment_info
27 | design <- model.matrix(~ 0 + ei$group_id)
28 | dimnames(design) <- list(ei$sample_id, levels(ei$group_id))
29 | contrast <- limma::makeContrasts("g2-g1", "g3-g1", levels = design)
30 |
31 | # default method settings ------------------------------------------------------
32 | ms <- eval(as.list(args(pbDS))$method)
33 | ms <- setdiff(ms, "DD")
34 | for (method in ms) {
35 | test_that(paste("defaults - pbDS", method, sep = "."), {
36 | res <- pbDS(pb,
37 | method = method,
38 | min_cells = 0,
39 | filter = "none",
40 | verbose = FALSE)
41 | tbl <- res$table[[1]]
42 | expect_identical(names(tbl), kids)
43 | top <- map(tbl, function(u)
44 | dplyr::arrange(u, p_adj.loc) %>%
45 | dplyr::slice(seq_len(5)) %>%
46 | pull("gene"))
47 | expect_true(all(vapply(top, setequal, y = degs, logical(1))))
48 | })
49 | }
50 |
51 | # multiple contrast w/ & w/o 'treat' -------------------------------------------
52 | for (method in ms) {
53 | if (grepl("edgeR|limma", method)) {
54 | treat <- c(FALSE, TRUE)
55 | } else treat <- FALSE
56 | test_that(paste("pbDS", method, sep = "."), {
57 | for (t in treat) {
58 | res <- pbDS(pb,
59 | treat = t, filter = "none",
60 | method = method, verbose = FALSE,
61 | design = design, contrast = contrast)
62 |
63 | expect_identical(length(res$table), ncol(contrast))
64 | expect_identical(names(res$table), colnames(contrast))
65 | expect_true(all(vapply(map(res$table, names), "==",
66 | levels(kids), FUN.VALUE = logical(nlevels(kids)))))
67 |
68 | # check that top genes equal 'degs' in ea. comparison & cluster
69 | top <- map_depth(res$table, 2, function(u) {
70 | dplyr::arrange(u, p_adj.loc) %>%
71 | dplyr::slice(seq_len(nde)) %>%
72 | pull("gene")
73 | }) %>% Reduce(f = "c")
74 | expect_true(all(unlist(map(top, setequal, degs))))
75 | }
76 | })
77 | }
78 |
79 | # global p-value adjustment ----------------------------------------------------
80 | test_that(".p_adj_global", {
81 | names(cs) <- cs <- paste0("c", seq_len(5))
82 | names(ks) <- ks <- paste0("k", seq_len(8))
83 | ns <- sample(1e3, length(ks))
84 | names(ns) <- ks
85 | df <- lapply(ks, function(k) lapply(cs, function(c) {
86 | df <- data.frame(p_val = rgamma(ns[k], 0.1))
87 | df$p_adj.loc <- p.adjust(df$p_val, method = "BH")
88 | return(df)
89 | })) %>% .p_adj_global
90 | for (c in cs)
91 | expect_identical(
92 | p.adjust(bind_rows(df[[c]])$p_val, method = "BH"),
93 | bind_rows(df[[c]])$p_adj.glb)
94 | })
95 |
96 | # filtering --------------------------------------------------------------------
97 | test_that("pbDS() filtering", {
98 | sub <- sce[, sample(ncol(sce), 100)]
99 | pbs <- aggregateData(sub, assay = "counts", fun = "sum")
100 | expect_error(pbDS(pbs, verbose = FALSE))
101 | expect_silent(pbDS(pbs, verbose = FALSE,
102 | min_cells = 0, filter = "none"))
103 | })
104 |
105 | # pbDD -------------------------------------------------------------------------
106 |
107 | set.seed(1968)
108 | sce <- .toySCE(c(100, 2e3))
109 | cs <- which(
110 | sce$group_id == (g <- "g3") &
111 | sce$cluster_id == (k <- "k1"))
112 | cs <- sample(cs, length(cs))
113 | gs <- sample(rownames(sce), 5)
114 | assay(sce)[gs, cs] <- 0
115 |
116 | test_that("differential testing", {
117 | pbs <- aggregateData(sce, fun="num.detected")
118 | expect_identical(
119 | tbl <- pbDD(pbs, verbose=FALSE)$table[[g]][[k]],
120 | pbDS(pbs, method="DD", verbose=FALSE)$table[[g]][[k]])
121 | top <- tbl$gene[order(tbl$p_adj.loc)]
122 | expect_setequal(top[seq_along(gs)], gs)
123 | })
124 |
--------------------------------------------------------------------------------
/tests/testthat/test-pbHeatmap.R:
--------------------------------------------------------------------------------
1 | # load packages
2 | suppressMessages({
3 | library(purrr)
4 | library(scater)
5 | library(SingleCellExperiment)
6 | })
7 |
8 | # generate toy dataset
9 | set.seed(as.numeric(format(Sys.time(), "%s")))
10 | sce <- .toySCE()
11 |
12 | nk <- length(kids <- levels(sce$cluster_id))
13 | ns <- length(sids <- levels(sce$sample_id))
14 | ng <- length(gids <- levels(sce$group_id))
15 |
16 | g3 <- sce$group_id == "g3"
17 | g23 <- sce$group_id %in% c("g2", "g3")
18 |
19 | # sample 'nde' genes & multiply counts by 10 for 'g2'- & 'g3'-cells
20 | degs <- sample(rownames(sce), (nde <- 5))
21 | assay(sce[degs, g23]) <- assay(sce[degs, g23]) * 10
22 | sce <- logNormCounts(computeLibraryFactors(sce))
23 |
24 | # run DS analysis using 'edgeR' on pseudobulks
25 | pb <- aggregateData(sce, assay = "counts", fun = "sum")
26 | res <- pbDS(pb, method = "edgeR", min_cells = 0, filter = "none", verbose = FALSE)
27 |
28 | # compute pseudobulk mean of logcounts
29 | pb <- aggregateData(sce, assay = "logcounts", fun = "mean")
30 |
31 | test_that("pbHeatmap() - input arguments", {
32 | # get list of default parameters
33 | defs <- as.list(eval(formals(pbHeatmap)))
34 | defs$x <- sce; defs$y <- res
35 | expect_is(do.call(pbHeatmap, defs), "Heatmap")
36 | # check that invalid arguments throw error
37 | fail <- list(g = "x", k = "x", c = "x",
38 | sort_by = "x", sort_by = "gene",
39 | sort_by = c("logFC", "p_val"),
40 | lfc = "x", lfc = c(1, 2),
41 | assay = 1, assay = "x",
42 | decreasing = "x", decreasing = c(TRUE, FALSE),
43 | row_anno = "x", row_anno = c(TRUE, FALSE),
44 | col_anno = "x", col_anno = c(TRUE, FALSE),
45 | normalize = "x", normalize = c(TRUE, FALSE))
46 | for (i in seq_along(fail)) {
47 | args <- defs; args[[names(fail)[i]]] <- fail[[i]]
48 | expect_error(do.call(pbHeatmap, args))
49 | }
50 | })
51 |
52 | test_that("pbHeatmap() - subset of clusters", {
53 | ks <- sample(kids, (nks <- 3))
54 | p <- pbHeatmap(sce, res, k = ks,
55 | lfc = 0, fdr = Inf, sort_by = "none",
56 | top_n = (nds <- 10), normalize = FALSE)
57 | expect_is(p, "Heatmap")
58 | gs <- rownames(y <- p@matrix)[seq_len(nds)]
59 | expect_equal(dim(y), c(nks*nds, ns))
60 | expect_identical(rownames(y), rep(rownames(sce)[seq_len(nds)], nks))
61 | expect_identical(colnames(y), sids)
62 | for (i in seq_len(nks)) {
63 | k <- kids[match(ks, kids)][i]
64 | expect_equal(y[seq_len(nds)+nds*(i-1), ], assay(pb, k)[gs, ])
65 | }
66 | })
67 |
68 | test_that("pbHeatmap() - subset of genes", {
69 | gs <- sample(rownames(sce), (ngs <- 20))
70 | p <- pbHeatmap(sce, res, g = gs,
71 | lfc = 0, fdr = Inf, sort_by = "none",
72 | top_n = Inf, normalize = FALSE)
73 | expect_is(p, "Heatmap")
74 | gs <- rownames(y <- p@matrix)[seq_len(ngs)]
75 | expect_equal(dim(y), c(nk*ngs, ns))
76 | expect_identical(rownames(y), rep(gs, nk))
77 | expect_identical(colnames(y), sids)
78 | })
79 |
--------------------------------------------------------------------------------
/tests/testthat/test-pbMDS.R:
--------------------------------------------------------------------------------
1 | # generate toy dataset
2 | seed <- as.numeric(format(Sys.time(), "%s"))
3 | set.seed(seed); x <- .toySCE()
4 |
5 | nk <- length(kids <- levels(x$cluster_id))
6 | ns <- length(sids <- levels(x$sample_id))
7 | ng <- length(gids <- levels(x$group_id))
8 |
9 | test_that("pbMDS()", {
10 | y <- x; class(y) <- "x"
11 | expect_error(pbMDS(y))
12 | y <- aggregateData(x)
13 | expect_is(p <- pbMDS(y), "ggplot")
14 | expect_true(nrow(p$data) == nk*ns)
15 | expect_true(all(table(p$data$cluster_id) == ns))
16 | expect_true(all(table(p$data$group_id) == ns*nk/ng))
17 | cs1 <- x$group_id != gids[1] # remove group
18 | cs2 <- x$sample_id != sids[1] # remove sample
19 | cs3 <- x$cluster_id != kids[1] # remove cluster
20 | cs4 <- cs1 & cs2 # remove sample in single group
21 | cs5 <- cs1 & cs3 # remove cluster in single group
22 | cs6 <- cs2 & cs3 # remove cluster-sample instance
23 | for (cs in paste0("cs", seq_len(6)))
24 | expect_silent(pbMDS(aggregateData(x[, get(cs)])))
25 | })
26 |
--------------------------------------------------------------------------------
/tests/testthat/test-prepSCE.R:
--------------------------------------------------------------------------------
1 | # generate toy dataset
2 | seed <- as.numeric(format(Sys.time(), "%s"))
3 | set.seed(seed); x <- .toySCE()
4 |
5 | nk <- length(kids <- levels(x$cluster_id))
6 | ns <- length(sids <- levels(x$sample_id))
7 | ng <- length(gids <- levels(x$group_id))
8 |
9 | test_that("prepSCE()", {
10 | y <- x; class(y) <- "x"
11 | expect_error(prepSCE(y))
12 | expect_is(y <- prepSCE(x), "SingleCellExperiment")
13 | expect_identical(dim(y), dim(x))
14 | expect_identical(dimnames(y), dimnames(x))
15 | ids <- formals("prepSCE")
16 | ids <- ids[grep("[a-z]id", names(ids))]
17 | ids <- unname(unlist(ids))
18 | expect_identical(colnames(colData(y)), ids)
19 | x$foo <- sample(ncol(x))
20 | y <- prepSCE(x, drop = TRUE)
21 | expect_true(is.null(y$foo))
22 | y <- prepSCE(x, drop = FALSE)
23 | expect_identical(y$foo, x$foo)
24 | expect_false(is.null(metadata(y)$experiment_info))
25 | })
26 |
--------------------------------------------------------------------------------
/tests/testthat/test-prepSim.R:
--------------------------------------------------------------------------------
1 | # generate toy dataset
2 | seed <- as.numeric(format(Sys.time(), "%s"))
3 | set.seed(seed); x <- .toySCE()
4 |
5 | nk <- length(kids <- levels(x$cluster_id))
6 | ns <- length(sids <- levels(x$sample_id))
7 | ng <- length(gids <- levels(x$group_id))
8 |
9 | test_that("prepSim()", {
10 | z <- x; class(z) <- "x"
11 | expect_error(prepSim(z))
12 | expect_error(prepSim(x, group_keep = "x"))
13 | y <- prepSim(x, 0, 0, 0, 0)
14 | expect_is(y, "SingleCellExperiment")
15 | expect_true(nrow(y) == nrow(x))
16 | expect_true(ncol(y) == sum(x$group_id == gids[1]))
17 | g <- sample(setdiff(gids, gids[1]), 1)
18 | y <- prepSim(x, 0, 0, 0, 0, group_keep = g)
19 | expect_true(ncol(y) == sum(x$group_id == g))
20 | })
21 |
22 | test_that("non-factor ID columns work", {
23 | y <- x; for (i in names(colData(y)))
24 | y[[i]] <- as.character(y[[i]])
25 | expect_silent(prepSim(y, 0, 0, 0, 0, verbose = FALSE))
26 | })
27 |
28 | test_that("single cluster", {
29 | g <- sample(levels(x$group_id), 1)
30 | k <- sample(levels(x$cluster_id), 1)
31 | y <- x[, i <- x$cluster_id == k & x$group_id == g]
32 | z <- prepSim(y, min_size = 0, min_genes = 0, verbose = FALSE)
33 | expect_true(ncol(z) == sum(i))
34 | expect_true(!"cluster_id" %in% names(colData(z)))
35 | })
36 |
--------------------------------------------------------------------------------
/tests/testthat/test-resDS.R:
--------------------------------------------------------------------------------
1 | # load packages
2 | suppressMessages({
3 | library(dplyr)
4 | library(purrr)
5 | library(SingleCellExperiment)
6 | })
7 |
8 | # generate toy dataset
9 | seed <- as.numeric(format(Sys.time(), "%s"))
10 | set.seed(seed)
11 | x <- .toySCE()
12 |
13 | nk <- length(kids <- levels(x$cluster_id))
14 | ns <- length(sids <- levels(x$sample_id))
15 | ng <- length(gids <- levels(x$group_id))
16 |
17 | # sample 'n_de' genes & multiply counts by 10 for 'g2/3'-cells
18 | g23 <- x$group_id != "g1"
19 | de_gs <- sample(rownames(x), (n_de <- 5))
20 | assay(x[de_gs, g23]) <- assay(x[de_gs, g23]) * 10
21 |
22 | # aggregate & run pseudobulk DS analysis
23 | nc <- length(cs <- list(2, 3))
24 | pbs <- aggregateData(x, assay = "counts", fun = "sum")
25 | y <- pbDS(pbs,
26 | coef = cs, verbose = FALSE,
27 | # assure everything is being tested
28 | min_cells = 0, filter = "none")
29 |
30 | test_that("resDS()", {
31 | v <- list(col = list(nr = nrow(x)*nk, ng = nrow(x), nk = nk))
32 | v$row <- lapply(v$col, "*", nc)
33 | v$col$char_cols <- c("gene", "cluster_id")
34 | v$row$char_cols <- c(v$col$char_cols, "coef")
35 | for (bind in c("row", "col")) {
36 | z <- resDS(x, y, bind, frq = FALSE, cpm = FALSE)
37 | expect_is(z, "data.frame")
38 | expect_identical(nrow(z), v[[bind]]$nr)
39 | expect_true(all(table(z$gene) == v[[bind]]$nk))
40 | expect_true(all(table(z$cluster_id) == v[[bind]]$ng))
41 | is_char <- colnames(z) %in% v[[bind]]$char_cols
42 | expect_true(all(apply(z[, !is_char], 2, class) == "numeric"))
43 | expect_true(all(apply(z[, is_char], 2, class) == "character"))
44 | }
45 | })
46 | test_that("resDS() - 'frq = TRUE'", {
47 | z <- resDS(x, y, frq = TRUE)
48 | u <- z[, grep("frq", colnames(z))]
49 | expect_true(ncol(u) == ns + ng)
50 | expect_true(all(u <= 1 & u >= 0 | is.na(u)))
51 | # remove single cluster-sample instance
52 | s <- sample(sids, 1); k <- sample(kids, 1)
53 | x_ <- x[, !(x$sample_id == s & x$cluster_id == k)]
54 | y_ <- aggregateData(x_, assay = "counts", fun = "sum")
55 | y_ <- pbDS(y_, coef = cs, verbose = FALSE)
56 | z <- resDS(x_, y_, frq = TRUE)
57 | u <- z[, grep("frq", colnames(z))]
58 | expect_true(ncol(u) == ns + ng)
59 | expect_true(all(u <= 1 & u >= 0 | is.na(u)))
60 | expect_true(all(is.na(z[z$cluster_id == k, paste0(s, ".frq")])))
61 | })
62 | test_that("resDS() - 'cpm = TRUE'", {
63 | z <- resDS(x, y, cpm = TRUE)
64 | u <- z[, grep("cpm", colnames(z))]
65 | expect_true(ncol(u) == ns)
66 | expect_true(all(u %% 1 == 0 | is.na(u)))
67 | })
68 |
69 | test_that("missing cluster is handled", {
70 | k <- sample(kids, 1)
71 | i <- !(x$cluster_id == k)
72 | pbs <- aggregateData(x[, i], verbose = FALSE)
73 | res <- pbDS(pbs, verbose = FALSE)
74 | tbl <- resDS(x, res, cpm = TRUE)
75 | expect_true(!k %in% unique(tbl$cluster_id))
76 | })
77 | test_that("missing sample is handled", {
78 | s <- sample(sids, 1)
79 | i <- !(x$sample_id == s)
80 | pbs <- aggregateData(x[, i], verbose = FALSE)
81 | res <- pbDS(pbs, verbose = FALSE)
82 | tbl <- resDS(x, res, cpm = TRUE)
83 | expect_true(sum(grepl(s, names(tbl))) == 0)
84 | })
85 | test_that("missing cluster-sample is handled", {
86 | k <- sample(kids, 1)
87 | s <- sample(sids, 1)
88 | i <- !(x$cluster_id == k & x$sample_id == s)
89 | pbs <- aggregateData(x[, i], verbose = FALSE)
90 | res <- pbDS(pbs, verbose = FALSE)
91 | tbl <- resDS(x, res, cpm = TRUE)
92 | sub <- tbl[tbl$cluster_id == k, ]
93 | expect_true(all(is.na(sub[, grep(s, names(sub))])))
94 | })
95 |
--------------------------------------------------------------------------------
/tests/testthat/test-simData.R:
--------------------------------------------------------------------------------
1 | suppressMessages({
2 | library(dplyr)
3 | library(SingleCellExperiment)
4 | })
5 |
6 | data(example_sce)
7 | ref <- prepSim(example_sce, verbose = FALSE)
8 | ng <- 200; nc <- 2e3; ns <- 3; nk <- 2
9 |
10 | test_that("no groups, no samples, no clusters", {
11 | s <- sample(levels(ref$sample_id), 1)
12 | k <- sample(levels(ref$cluster_id), 1)
13 | sub <- ref[, ref$sample_id == s & ref$cluster_id == k]
14 | sim <- simData(sub, ng, nc, dd = FALSE, force = TRUE)
15 | expect_null(sim$group_id)
16 | expect_null(sim$sample_id)
17 | expect_null(sim$cluster_id)
18 | })
19 |
20 | test_that("no groups, yes samples and clusters", {
21 | sids <- levels(ref$sample_id)
22 | kids <- levels(ref$cluster_id)
23 | sim <- simData(ref, ng, nc, dd = FALSE, force = TRUE)
24 | expect_null(sim$group_id)
25 | expect_true(setequal(metadata(sim)$ref_sids, sids))
26 | expect_true(setequal(metadata(sim)$ref_kids, kids))
27 | })
28 |
29 | test_that("only samples", {
30 | k <- sample(levels(ref$cluster_id), 1)
31 | sub <- ref[, ref$cluster_id == k]
32 | sim <- simData(sub, ng, nc, dd = FALSE, force = TRUE)
33 | expect_null(sim$group_id)
34 | expect_null(sim$culster_id)
35 | expect_setequal(metadata(sim)$ref_sids, levels(sub$sample_id))
36 | })
37 |
38 | test_that("only clusters", {
39 | s <- sample(levels(ref$sample_id), 1)
40 | sub <- ref[, ref$sample_id == s]
41 | sim <- simData(sub, ng, nc, dd = FALSE, force = TRUE)
42 | expect_null(sim$group_id)
43 | expect_null(sim$sample_id)
44 | expect_setequal(metadata(sim)$ref_kids, levels(sub$cluster_id))
45 | })
46 |
47 |
48 | test_that("pbDS() gets at least 50% right for 10% DE genes", {
49 | replicate(3, {
50 | sim <- simData(ref, ng, nc, ns, nk,
51 | p_dd = c(0.9,0,0.1,0,0,0), force = TRUE)
52 | gi <- metadata(sim)$gene_info
53 | pb <- aggregateData(sim)
54 | re <- pbDS(pb, verbose = FALSE)
55 | re <- bind_rows(re$table[[1]])
56 | re <- arrange(re, p_adj.loc)
57 | gi$id <- with(gi, paste0(gene, cluster_id))
58 | re$id <- with(re, paste0(gene, cluster_id))
59 | n_de <- sum(de <- gi$category == "de")
60 | expect_gte(sum(re$id[seq_len(n_de)] %in% gi$id[de]), round(n_de/2))
61 | })
62 | })
63 |
64 | test_that("Single group simulation", {
65 | gs <- c("A", "B")
66 | ps <- list(c(1, 0), c(0, 1))
67 | names(ps) <- gs
68 | for (g in gs) {
69 | x <- simData(ref,
70 | nc, ns, nk, ng = 10, force = TRUE,
71 | probs = list(NULL, NULL, ps[[g]]))
72 | expect_identical(levels(x$group_id), g)
73 | sids <- sprintf("sample%s.%s", seq_len(ns), g)
74 | expect_identical(levels(x$sample_id), sids)
75 | gi <- metadata(x)$gene_info
76 | ms <- paste0("sim_mean.", setdiff(gs, g))
77 | expect_true(all(is.na(gi[[ms]])))
78 | }
79 | })
80 |
81 | test_that("Pure simulations give single DD category", {
82 | for (c in cats) {
83 | sim <- simData(ref, nc, ns, nk,
84 | p_dd = as.numeric(cats == c),
85 | ng = ng, force = TRUE)
86 | gi <- metadata(sim)$gene_info
87 | expect_equal(table(gi$category)[[c]], ng*nk)
88 | }
89 | })
90 |
91 | test_that("simData() - input arguments", {
92 | # number of genes mismatch b/w simulation & reference
93 | expect_error(simData(ref, ng = 100, force = FALSE))
94 | expect_silent(simData(ref, ng = 100, force = TRUE))
95 | nk <- length(kids <- levels(ref$cluster_id))
96 | # named 'rel_lfc's (mis)match cluster names
97 | lfc <- rep(1, nk); names(lfc) <- kids; lfc2 <- lfc; names(lfc2)[1] <- "x"
98 | expect_silent(simData(ref, nk = nk, rel_lfc = lfc, ng = 10, force = TRUE))
99 | expect_error(simData(ref, nk = nk, rel_lfc = lfc2, ng = 10, force = TRUE))
100 | })
101 |
102 | test_that("Type genes & cluster phylogeny", {
103 | ng <- 1e3; pt <- 0.05
104 | t <- "(('cluster1':0.1,'cluster2':0.1):0.4,'cluster3':0.5);"
105 | args <- list(
106 | list(pt = pt, t = NULL), # type genes but no phylogeny
107 | list(t = t, pt = 0)) # both, type genes and phylogeny
108 | cs <- c("type", "state") # possible gene classes
109 | for (i in seq_along(args)) {
110 | rd <- rowData(x <- simData(ref, ng = ng, force = TRUE,
111 | p_type = args[[i]]$pt, phylo_tree = args[[i]]$t))
112 | expect_is(rd, "DataFrame")
113 | expect_equal(dim(rd), c(ng, 2))
114 | expect_equal(colnames(rd), c("class", "specs"))
115 | ns <- as.list(table(rd$class))
116 | is_state <- rd$class == "state"
117 | expect_true(all(is.na(unlist(rd$specs[is_state]))))
118 | expect_true(all(unlist(rd$specs[!is_state]) %in% levels(x$cluster_id)))
119 | }
120 | # missing clusters in input phylogeny
121 | t <- "(('cluster1':0.1,'cluster2':0.1):0.4,'cluster4':0.5);"
122 | expect_error(simData(ref, phylo_tree = t))
123 | # specification of both, 'p_type' and 'phylo_tree'
124 | expect_error(simData(ref, p_type = pt, phylo_tree = t))
125 | })
126 |
--------------------------------------------------------------------------------
/tests/testthat/test-utils.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(SingleCellExperiment)
3 | })
4 |
5 | x <- .toySCE()
6 | kids <- levels(x$cluster_id)
7 | sids <- levels(x$sample_id)
8 |
9 | test_that(".filter_matrix()", {
10 | replicate(5, {
11 | x <- matrix(sample(seq_len(200)), 10, 20)
12 | y <- .filter_matrix(x, (n <- sample(200, 1)))
13 | expect_true(all(y >= n) | any(dim(y) == 1))
14 | })
15 | })
16 | test_that(".update_sce()", {
17 | replicate(5, {
18 | ks <- sample(kids, (nk <- 2))
19 | ss <- sample(sids, (ns <- 3))
20 | cs <- x$cluster_id %in% ks & x$sample_id %in% ss
21 | y <- .update_sce(x[, cs])
22 | expect_equal(ncol(y), sum(cs))
23 | expect_true(setequal(levels(y$cluster_id), ks))
24 | expect_true(setequal(levels(y$sample_id), ss))
25 | })
26 | })
27 | test_that(".filter_sce()", {
28 | replicate(5, {
29 | ks <- sample(kids, (nk <- 2))
30 | ss <- sample(sids, (ns <- 3))
31 | y <- .filter_sce(x, ks, ss)
32 | expect_true(setequal(levels(y$cluster_id), ks))
33 | expect_true(setequal(levels(y$sample_id), ss))
34 | cs <- x$cluster_id %in% ks & x$sample_id %in% ss
35 | expect_equal(ncol(y), sum(cs))
36 | expect_equal(colnames(y), colnames(x)[cs])
37 | ei <- metadata(y)$experiment_info
38 | expect_true(setequal(ei$sample_id, ss))
39 | })
40 | })
41 | test_that(".scale()", {
42 | replicate(5, {
43 | y <- .scale(x <- matrix(runif(200), (n <- 10), (m <- 20)))
44 | expect_true(all(apply(y, 1, min) == 0))
45 | expect_true(all(apply(y, 1, max) == 1))
46 | qs <- quantile(x[1, ], c(0.01, 0.99))
47 | z <- (x[1, ] - qs[1]) / diff(qs)
48 | z[z < 0] <- 0; z[z > 1] <- 1
49 | expect_true(all(y[1, ] == z))
50 | i <- sample(n, 1); j <- sample(m, 1)
51 | x[i, j] <- NA; z <- .scale(x)
52 | expect_true(sum(is.na(z)) == 1)
53 | os <- lapply(list(y, z), function(u) {
54 | rngs <- colRanges(u[-i, -j],
55 | na.rm = TRUE, useNames = FALSE)
56 | apply(rngs, 2, order)
57 | })
58 | expect_identical(os[[1]], os[[2]])
59 | })
60 | })
61 |
--------------------------------------------------------------------------------
/tests/testthat/test-validityChecks.R:
--------------------------------------------------------------------------------
1 | # load packages
2 | suppressMessages({
3 | library(SingleCellExperiment)
4 | })
5 | sce <- .toySCE()
6 |
7 | test_that(".check_sce", {
8 | # object is not 'SingleCellExperiment'
9 | x <- sce; class(x) <- "x"
10 | expect_error(.check_sce(x))
11 | # "group_id" 'colData' column not required
12 | x <- sce; x$group_id <- NULL
13 | expect_silent(.check_sce(x, FALSE))
14 | # missing "x_id" 'colData' column;
15 | # x = "sample", "cluster", "group"
16 | for (i in colnames(colData(sce))) {
17 | x <- sce; x[[i]] <- NULL
18 | expect_error(.check_sce(x))
19 | }
20 | })
21 |
22 | test_that(".check_arg_assay", {
23 | expect_error(.check_arg_assay(sce, 1))
24 | expect_error(.check_arg_assay(sce, "x"))
25 | expect_error(.check_arg_assay(sce, c(assayNames(sce)[1], "x")))
26 | expect_silent(.check_arg_assay(sce, assayNames(sce)[1]))
27 | })
28 |
29 | test_that(".check_args_simData", {
30 | u <- list(x = sce, ng = 10, nc = 100, ns = 3, nk = 2, probs = NULL,
31 | p_dd = diag(6)[1, ], p_type = 0.1, lfc = 1, rel_lfc = NULL,
32 | p_ep = 0.1, p_dp = 0.1, p_dm = 0.1, paired = FALSE, force = TRUE,
33 | phylo_tree = NULL, phylo_pars = c(0, 3))
34 | expect_silent(.check_args_simData(u))
35 | # 'ng', 'nc', 'ns', 'nk' should be single numerics > 0
36 | for (arg in c("ng", "nc", "ns", "nk")) {
37 | v <- u; v[[arg]] <- 1; expect_silent(.check_args_simData(v))
38 | for (val in list(numeric(2)+1, numeric(1), NA, "a"))
39 | v[[arg]] <- val; expect_error(.check_args_simData(v))
40 | }
41 | # 'p_dd' should be length of DD categories, sum to 1, and be in [0,1]
42 | v <- u; for (val in list(rep(1,5)/5, rep(1,6)/5, c(-1,2,rep(0,4))))
43 | v$p_dd <- val; expect_error(.check_args_simData(v))
44 | # 'paired' should be length-one logical
45 | v <- u; for (val in list("x", 123, c(TRUE, FALSE)))
46 | v$paired <- val; expect_error(.check_args_simData(v))
47 | # 'rel_lfc' should be >0 and be of length 'nk'
48 | u$rel_lfc <- rep(1, u$nk)
49 | expect_silent(.check_args_simData(u))
50 | v <- u; for (val in list(-1, rep(1,u$nk+1)))
51 | v$rel_lfc <- val; expect_error(.check_args_simData(v))
52 | })
53 |
54 | test_that(".check_frq()", {
55 | y <- calcExprFreqs(x <- sce)
56 | expect_silent(.check_frq(x, y))
57 | z <- x; class(z) <- "x"
58 | expect_error(.check_frq(z, y))
59 | z <- y; class(z) <- "x"
60 | expect_error(.check_frq(x, z))
61 | for (v in c(-1, 10)) {
62 | z <- y; assay(z)[1, 1] <- v
63 | expect_error(.check_frq(x, z))
64 | }
65 | })
66 |
--------------------------------------------------------------------------------
/tests/testthat/test_utils-simData.R:
--------------------------------------------------------------------------------
1 | nk <- length(k <- paste0("cluster", seq_len(5)))
2 | ns <- length(s <- paste0("sample", seq_len(4)))
3 | ng <- length(g <- paste0("group", seq_len(3)))
4 |
5 | test_that(".get_ns", {
6 | replicate(10, {
7 | # default to using all samples when
8 | # dd = FALSE or dd = TRUE, paired = TRUE
9 | ns_ref <- sample(100, 1)
10 | args <- list(
11 | list(dd = TRUE, paired = TRUE),
12 | list(dd = FALSE, paired = TRUE),
13 | list(dd = FALSE, paired = FALSE))
14 | for (. in args) {
15 | .$ns_ref <- ns_ref
16 | .$force <- FALSE
17 | . <- c(., list(ns_sim = NULL))
18 | ns <- do.call(.get_ns, .)
19 | expect_identical(ns_ref, ns)
20 | }
21 | # error when force = FALSE and
22 | # desired number of samples to simulate
23 | # exceeds available reference samples
24 | ns_sim <- ns_ref + 1
25 | args <- list(
26 | list(dd = TRUE, paired = TRUE),
27 | list(dd = TRUE, paired = FALSE),
28 | list(dd = FALSE, paired = TRUE),
29 | list(dd = FALSE, paired = FALSE))
30 | for (. in args) {
31 | .$ns_ref <- ns_ref
32 | .$ns_sim <- ns_sim
33 | .$force <- FALSE
34 | expect_error(do.call(.get_ns, .))
35 | # pass when force = TRUE
36 | .$force <- TRUE
37 | ns <- do.call(.get_ns, .)
38 | expect_identical(ns_sim, ns)
39 | }
40 | })
41 | })
42 |
43 | test_that(".split_cells", {
44 | n_cells <- 1e3
45 | cells <- paste0("cell", seq_len(n_cells))
46 | x <- matrix(0,
47 | nrow = 1, ncol = n_cells,
48 | dimnames = list(NULL, cells))
49 | cd <- data.frame(
50 | row.names = cells,
51 | cluster_id = sample(k, n_cells, TRUE),
52 | sample_id = sample(s, n_cells, TRUE))
53 |
54 | cs <- .split_cells(cd, "cluster_id")
55 | expect_identical(names(cs), k)
56 | expect_identical(
57 | as.numeric(vapply(cs, length, numeric(1))),
58 | as.numeric(table(cd$cluster_id)))
59 |
60 | cs <- .split_cells(cd, c("cluster_id", "sample_id"))
61 | expect_identical(names(cs), k)
62 | nms_lvl2 <- vapply(cs, names, character(length(s)))
63 | expect_true(all(apply(nms_lvl2, 2, identical, s)))
64 |
65 | cs <- .split_cells(cd, c("sample_id", "cluster_id"))
66 | expect_identical(names(cs), s)
67 | nms_lvl2 <- vapply(cs, names, character(length(k)))
68 | expect_true(all(apply(nms_lvl2, 2, identical, k)))
69 | })
70 |
71 | test_that(".sample_cell_md", {
72 | ids <- list(k, s, g)
73 | md <- .sample_cell_md((n <- 1e3), ids)
74 | ms <- vapply(apply(md, 2, table), mean, numeric(1))
75 | expect_true(all(vapply(seq_along(ids), function(i)
76 | ms[[i]] == n/length(ids[[i]]), logical(1))))
77 | set.seed(1); a <- .sample_cell_md(n, ids)
78 | set.seed(1); b <- .sample_cell_md(n, ids,
79 | list(rep(1/nk,nk),rep(1/ns,ns),rep(1/ng,ng)))
80 | expect_identical(a, b)
81 | })
82 |
--------------------------------------------------------------------------------
/vignettes/detection.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Differential detection analysis"
3 | author:
4 | - name: Jeroen Gilis
5 | affiliation:
6 | - &one Applied Mathematics, Computer science and Statistics, Ghent University, Ghent, Belgium
7 | - &two Bioinformatics Institute, Ghent University, Ghent, Belgium
8 | - Data Mining and Modeling for Biomedicine, VIB Flemish Institute for Biotechnology, Ghent, Belgium
9 | - name: Helena L Crowell
10 | affiliation:
11 | - National Center for Genomic Analysis (CNAG), Barcelona, Spain
12 | - name: Davide Risso
13 | affiliation:
14 | - Department of Statistical Sciences, University of Padova, Padova, Italy
15 | - Padua Center for Network Medicine, University of Padova, Padova, Italy
16 | - name: Lieven Clement
17 | affiliation:
18 | - *one
19 | - *two
20 | package: "`r BiocStyle::pkg_ver('muscat')`"
21 | date: "`r format(Sys.Date(), '%B %d, %Y')`"
22 | output:
23 | BiocStyle::html_document
24 | vignette: >
25 | %\VignetteIndexEntry{"3. Differential detection"}
26 | %\VignettePackage{muscat}
27 | %\VignetteEngine{knitr::rmarkdown}
28 | %\VignetteEncoding{UTF-8}
29 | bibliography: "`r file.path(system.file('extdata', package='muscat'), 'refs.bib')`"
30 | abstract: >
31 |
In this vignette, we display how `muscat` can be used to perform differrential detection (DD) analyses in multi-sample, multi-group, multi-(cell-)subpopulation scRNA-seq data. Furthermore, we show how DD and differential state (DS) analysis results on the same data can be effectively combined. This vignette thus introduces a workflow that allows users to jointly assess two biological hypotheses that often contain orthogonal information, which thus can be expected to improve their understanding of complex biological phenomena, at no extra cost. 32 | --- 33 | 34 | 39 | 40 | ```{r cache, include=FALSE} 41 | knitr::opts_chunk$set(cache=TRUE) 42 | ``` 43 | 44 | ```{r echo=FALSE, message=FALSE, warning=FALSE} 45 | library(BiocStyle) 46 | ``` 47 | 48 | *** 49 | 50 | Based on @Gilis2023 51 | 52 | > Gilis J, Perin L, Malfait M, Van den Berge K, 53 | Assefa AT, Verbist B, Risso D, and Clement L: 54 | Differential detection workflows for 55 | multi-sample single-cell RNA-seq data. 56 | *bioRxiv* (2023). [DOI: 10.1101/2023.12.17.572043](https://doi.org/10.1101/2023.12.17.572043) 57 | 58 | # Load packages {-} 59 | 60 | ```{r load-libs, message=FALSE, warning=FALSE} 61 | library(dplyr) 62 | library(purrr) 63 | library(tidyr) 64 | library(scater) 65 | library(muscat) 66 | library(ggplot2) 67 | library(patchwork) 68 | ``` 69 | 70 | # Introduction 71 | 72 | Single-cell RNA-sequencing (scRNA-seq) has improved our understanding of complex biological processes by elucidating cell-level heterogeneity in gene expression. One of the key tasks in the downstream analysis of scRNA-seq data is studying differential gene expression (DE). Most DE analysis methods aim to identify genes for which the *average* expression differs between biological groups of interest, e.g., between cell types or between diseased and healthy cells. As such, most methods allow for assessing only one aspect of the gene expression distribution: the mean. However, in scRNA-seq data, differences in other characteristics between count distributions can commonly be observed. 73 | 74 | One such characteristic is gene detection, i.e., the number of cells in which a gene is (detectably) expressed. Analogous to a DE analysis, a differential detection (DD) analysis aims to identify genes for which the *average fraction of cells in which the gene is detected* changes between groups. In @Gilis2023, we show how DD analysis contain information that is biologically relevant, and that is largely orthogonal to the information obtained from DE analysis on the same data. 75 | 76 | In this vignette, we display how `muscat` can be used to perform DD analyses in multi-sample, multi-group, multi-(cell-)subpopulation scRNA-seq data. Furthermore, we show how DD and DS analysis results on the same data can be effectively combined using a two-stage testing approach. This workflow thus allows users to jointly assess two biological hypotheses containing orthogonal information, which thus can be expected to improve their understanding of complex biological phenomena, at no extra cost. 77 | 78 | # Setup 79 | 80 | We will use the same data as in the differential state (DS) analyses described in `r Biocpkg("muscat", vignette = "analysis.html")`, namely, scRNA-seq data acquired on PBMCs from 8 patients before and after IFN-$\beta$ treatment. For a more detailed description of these data and subsequent preprocessing, we refer to `r Biocpkg("muscat", vignette = "analysis.html")`. 81 | 82 | ```{r load-data, message=FALSE} 83 | library(ExperimentHub) 84 | eh <- ExperimentHub() 85 | query(eh, "Kang") 86 | (sce <- eh[["EH2259"]]) 87 | ``` 88 | 89 | We further apply some minimal filtering to remove low-quality genes and cells, and use `prepSCE()` to standardize cell metadata such that slots specifying cluster (`cell`), sample (`stim`+`ind`), and group (`stim`) identifiers conform with the `muscat` framework: 90 | 91 | ```{r prep-data} 92 | sce <- sce[rowSums(counts(sce) > 0) > 0, ] 93 | qc <- perCellQCMetrics(sce) 94 | sce <- sce[, !isOutlier(qc$detected, nmads=2, log=TRUE)] 95 | sce <- sce[rowSums(counts(sce) > 1) >= 10, ] 96 | sce$id <- paste0(sce$stim, sce$ind) 97 | sce <- prepSCE(sce, "cell", "id", "stim") 98 | table(sce$cluster_id, sce$group_id) 99 | table(sce$sample_id) 100 | ``` 101 | 102 | ## Aggregation 103 | 104 | In general, `aggregateData()` will aggregate the data by the `colData` variables specified with argument `by`, and return a `SingleCellExperiment` containing pseudobulk data. 105 | 106 | To perform a pseudobulk-level analysis, measurements must be aggregated at the cluster-sample level (default `by = c("cluster_id", "sample_id"`). In this case, the returned `SingleCellExperiment` will contain one assay per cluster, where rows = genes and columns = samples. Arguments `assay` and `fun` specify the input data and summary statistic, respectively, to use for aggregation. 107 | 108 | In a differential detection (DD) analysis, the default choice of the summary statistic used for aggregation is `fun = "num.detected"`. This strategy can be thought of as first binarizing the gene expression values (1: expressed, 0: not expressed), and subsequently performing a simple summation of the binarized gene expression counts for cells belonging to the same cluster-sample level. Hence, the resulting pseudobulk-level expression count reflects the total number of cells in a particular cluster-sample level with a non-zero gene expression value. 109 | 110 | In a differential state (DS) analysis, the default choice for aggregation is `fun = "sum"`, which amounts to the simple summation of the raw gene expression counts of cells belonging to the same cluster-sample level. 111 | 112 | ```{r pbs-det} 113 | pb_sum <- aggregateData(sce, 114 | assay="counts", fun="sum", 115 | by=c("cluster_id", "sample_id")) 116 | pb_det <- aggregateData(sce, 117 | assay="counts", fun="num.detected", 118 | by=c("cluster_id", "sample_id")) 119 | t(head(assay(pb_det))) 120 | ``` 121 | 122 | @Qiu2020 demonstrated that binarizing scRNA-seq counts generates expression profiles that still accurately reflect biological variation. 123 | This finding was confirmed by @Bouland2021, who showed that the frequencies of zero counts capture biological variability, and further claimed that a binarized representation of the single-cell expression data allows for a more robust description of the relative abundance of transcripts than counts. 124 | 125 | ```{r pbs-mds, fig.width=8, fig.height=4, fig.cap="Pseudobulk-level multidimensional scaling (MDS) plot based on (A) sum of counts and (B) sum of binarized counts (i.e., counting the number of detected features) in each cluster-sample."} 126 | pbMDS(pb_sum) + ggtitle("Σ counts") + 127 | pbMDS(pb_det) + ggtitle("# detected") + 128 | plot_layout(guides="collect") + 129 | plot_annotation(tag_levels="A") & 130 | theme(legend.key.size=unit(0.5, "lines")) 131 | ``` 132 | 133 | ## Analysis 134 | 135 | Once we have assembled the pseudobulk data, we can test for DD using `pbDD()`. By default, a $\sim$`group_id` model is fit, and the last coefficient of the linear model is tested to be equal to zero. 136 | 137 | ```{r pbDD} 138 | res_DD <- pbDD(pb_det, min_cells=0, filter="none", verbose=FALSE) 139 | ``` 140 | 141 | ## Handling and visualizing results 142 | 143 | Inspection, manipulation, and visualization of DD analysis results follows the same principles as for a DS analysis. For a detailed description, we refer to the DS analysis vignette`r Biocpkg("muscat", vignette = "analysis.html")`. Below, some basic functionalities are being displayed. 144 | 145 | ```{r} 146 | tbl <- res_DD$table[[1]] 147 | # one data.frame per cluster 148 | names(tbl) 149 | ``` 150 | 151 | ```{r} 152 | # view results for 1st cluster 153 | k1 <- tbl[[1]] 154 | head(format(k1[, -ncol(k1)], digits = 2)) 155 | ``` 156 | 157 | ```{r} 158 | # filter FDR < 5%, |logFC| > 1 & sort by adj. p-value 159 | tbl_fil <- lapply(tbl, \(u) 160 | filter(u, 161 | p_adj.loc < 0.05, 162 | abs(logFC) > 1) |> 163 | arrange(p_adj.loc)) 164 | 165 | # nb. of DS genes & % of total by cluster 166 | n_de <- vapply(tbl_fil, nrow, numeric(1)) 167 | p_de <- format(n_de / nrow(sce) * 100, digits = 3) 168 | data.frame("#DD" = n_de, "%DD" = p_de, check.names = FALSE) 169 | ``` 170 | 171 | ```{r} 172 | library(UpSetR) 173 | de_gs_by_k <- map(tbl_fil, "gene") 174 | upset(fromList(de_gs_by_k)) 175 | ``` 176 | 177 | # Stagewise anaysis 178 | 179 | While DD analysis results may contain biologically relevant information in their own right, we show in @Gilis2023 that combing DD and DS analysis results on the same data can further improve our understanding of complex biological phenomena. In the remainder of this vignette, we show how DD and DS analysis results on the same data can be effectively combined. 180 | 181 | For this, we build on the two-stage testing paradigm proposed by @Vandenberge2017. In the first stage of this testing procedure, we identify differential genes by using an omnibus test for differential detection and differential expression (DE). The null hypothesis for this test is that the gene is neither differentially detected, nor differentially expressed. 182 | 183 | In the second stage, we perform post-hoc tests on the differential genes from stage one to unravel whether they are DD, DE or both. Compared to the individual DD and DS analysis results, the two-stage approach increases statistical power and provides better type 1 error control. 184 | 185 | ```{r pbDS} 186 | res_DS <- pbDS(pb_sum, min_cells=0, filter="none", verbose=FALSE) 187 | ``` 188 | 189 | ```{r} 190 | res <- stagewise_DS_DD(res_DS, res_DD, verbose=FALSE) 191 | head(res[[1]][[1]]) # results for 1st cluster 192 | ``` 193 | 194 | ## Comparison 195 | 196 | ```{r} 197 | # for each approach, get adjusted p-values across clusters 198 | ps <- map_depth(res, 2, \(df) { 199 | data.frame( 200 | df[, c("gene", "cluster_id")], 201 | p_adj.stagewise=df$p_adj, 202 | p_adj.DS=df$res_DS$p_adj.loc, 203 | p_adj.DD=df$res_DD$p_adj.loc) 204 | }) |> 205 | lapply(do.call, what=rbind) |> 206 | do.call(what=rbind) |> 207 | data.frame(row.names=NULL) 208 | head(ps) 209 | ``` 210 | 211 | To get an overview of how different approaches compare, we can count the number of genes found differential in each cluster for a given FDR threshold: 212 | 213 | ```{r fig.width=12, fig.height=4} 214 | # for each approach & cluster, count number 215 | # of genes falling below 5% FDR threshold 216 | ns <- lapply(seq(0, 0.2, 0.005), \(th) { 217 | ps |> 218 | mutate(th=th) |> 219 | group_by(cluster_id, th) |> 220 | summarise( 221 | .groups="drop", 222 | across(starts_with("p_"), 223 | \(.) sum(. < th, na.rm=TRUE))) 224 | }) |> 225 | do.call(what=rbind) |> 226 | pivot_longer(starts_with("p_")) 227 | ggplot(ns, aes(th, value, col=name)) + 228 | geom_line(linewidth=0.8, key_glyph="point") + 229 | geom_vline(xintercept=0.05, lty=2, linewidth=0.4) + 230 | guides(col=guide_legend(NULL, override.aes=list(size=3))) + 231 | labs(x="FDR threshold", y="number of significantly\ndifferential genes") + 232 | facet_wrap(~cluster_id, scales="free_y", nrow=2) + 233 | theme_bw() + theme( 234 | panel.grid.minor=element_blank(), 235 | legend.key.size=unit(0.5, "lines")) 236 | ``` 237 | 238 | We can further identify which hits are shared between or unique to a given approach. 239 | In the example below, for instance, the vast majority of hits is common to all approaches, many hits are shared between DD and stagewise testing, and only few genes are specific to any one approach: 240 | 241 | ```{r upset, fig.width = 5, fig.height = 3, fig.cap = "Upset plot of differential findings (FDR < 0.05) across DS, DD, and stagewise analysis for an exemplary cluster; shown are the 50 most frequent interactions."} 242 | # subset adjuster p-values for cluster of interest 243 | qs <- ps[grep("CD4", ps$cluster_id), grep("p_", names(ps))] 244 | # for each approach, extract genes at 5% FDR threshold 245 | gs <- apply(qs, 2, \(.) ps$gene[. < 0.05]) 246 | # visualize set intersections between approaches 247 | UpSetR::upset(UpSetR::fromList(gs), order.by="freq") 248 | ``` 249 | 250 | ```{r} 251 | # extract genes unique to stagewise testing 252 | sw <- grep("stagewise", names(gs)) 253 | setdiff(gs[[sw]], unlist(gs[-sw])) 254 | ``` 255 | 256 | # Session info {- .smaller} 257 | 258 | ```{r session-info} 259 | sessionInfo() 260 | ``` 261 | 262 | # References --------------------------------------------------------------------------------