├── .Rbuildignore
├── .github
    └── workflows
    │   └── pr_check.yml
├── .gitignore
├── CITATION.cff
├── DESCRIPTION
├── NAMESPACE
├── NEWS.md
├── R
    ├── CITEseq.R
    ├── GTseq.R
    ├── SCoPE2.R
    ├── SingleCellMultiModal-package.R
    ├── SingleCellMultiModal.R
    ├── cache.R
    ├── cellGating.R
    ├── ontomap.R
    ├── scMultiome.R
    ├── scNMT.R
    ├── seqFISH.R
    └── utils.R
├── README.md
├── _pkgdown.yml
├── inst
    ├── CITATION
    ├── REFERENCES.bib
    ├── extdata
    │   ├── docuData
    │   │   ├── singlecellmultimodalv1.csv
    │   │   ├── singlecellmultimodalv2.csv
    │   │   ├── singlecellmultimodalv4.csv
    │   │   ├── singlecellmultimodalv5.csv
    │   │   ├── singlecellmultimodalv6.csv
    │   │   ├── singlecellmultimodalv7.csv
    │   │   ├── singlecellmultimodalv8.csv
    │   │   └── singlecellmultimodalv9.csv
    │   ├── metadata.csv
    │   └── ontomap.tsv
    └── scripts
    │   ├── CITEseq_celltypes.R
    │   ├── Contributing-Guidelines.Rmd
    │   ├── README.Rmd
    │   ├── make-data.R
    │   ├── make-data
    │       ├── CITEseq_filtering.R
    │       ├── make_macrophage.R
    │       ├── scMultiome.R
    │       └── scNMT.R
    │   ├── make-metadata.R
    │   ├── make-upload.R
    │   ├── make_docu.R
    │   ├── ontomap_update.R
    │   └── update_wiki.sh
├── man
    ├── CITEseq.Rd
    ├── GTseq.Rd
    ├── SCoPE2.Rd
    ├── SingleCellMultiModal-package.Rd
    ├── SingleCellMultiModal.Rd
    ├── addCTLabels.Rd
    ├── dot-CITEseqMaeToSce.Rd
    ├── getCellGroups.Rd
    ├── ontomap.Rd
    ├── scMultiome.Rd
    ├── scNMT.Rd
    ├── scmmCache.Rd
    └── seqFISH.Rd
└── vignettes
    ├── CITEseq.Rmd
    ├── ECCITEseq.Rmd
    ├── GTseq.Rmd
    ├── SCoPE2.Rmd
    ├── SingleCellMultiModal.Rmd
    ├── scMultiome.Rmd
    ├── scNMT.Rmd
    └── seqFISH.Rmd


/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | #----------------------------
 2 | # Git
 3 | #----------------------------
 4 | ^\.git$
 5 | ^\.github$
 6 | ^\.gitignore$
 7 | ^\.gitattributes$
 8 | 
 9 | #----------------------------
10 | # RStudio and R
11 | #----------------------------
12 | ^\.Rhistory$
13 | ^.*\.Rproj$
14 | ^\.Rproj\.user$
15 | 
16 | #----------------------------
17 | # Data and files
18 | #----------------------------
19 | ^.*\.rda$
20 | ^.*\.tar\.gz$
21 | ^\.lintr$
22 | ^README\.md$
23 | ^docs$
24 | ^data-raw$
25 | ^.*_cache$
26 | ^CITATION\.cff$
27 | 


--------------------------------------------------------------------------------
/.github/workflows/pr_check.yml:
--------------------------------------------------------------------------------
  1 | name: PR CMD check & build site
  2 | 
  3 | on:
  4 |   pull_request:
  5 |   push:
  6 |     paths:
  7 |       - 'DESCRIPTION'
  8 |       - '**.yml'
  9 |     branches:
 10 |       - devel
 11 |       - RELEASE_3_21
 12 | 
 13 | env:
 14 |   R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
 15 |   GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
 16 |   CRAN: https://p3m.dev/cran/__linux__/noble/latest
 17 |   BIOC_RELEASE: RELEASE_3_21
 18 | 
 19 | jobs:
 20 |   set-matrix:
 21 |     runs-on: ubuntu-24.04
 22 |     outputs:
 23 |       matrix: ${{ steps.set.outputs.matrix }}
 24 |       dockerfile_exists: ${{ steps.dockerfile.outputs.exists }}
 25 |     steps:
 26 |       - name: Set Matrix Bioconductor Version
 27 |         id: set
 28 |         run: |
 29 |           MATRIX="{\"include\":[{\"bioc_version\":\"$GITHUB_REF_NAME\"}]}"
 30 |           echo "matrix=$MATRIX" >> $GITHUB_OUTPUT
 31 |       - name: Check for Dockerfile
 32 |         id: dockerfile
 33 |         run: |
 34 |           echo "exists=$( [ -f ./inst/docker/pkg/Dockerfile ] && echo true || echo false )" >> $GITHUB_OUTPUT
 35 | 
 36 |   check:
 37 |     needs: set-matrix
 38 |     runs-on: ubuntu-latest
 39 |     strategy:
 40 |       matrix: ${{ fromJson(needs.set-matrix.outputs.matrix) }}
 41 |     container: bioconductor/bioconductor_docker:${{ matrix.bioc_version }}
 42 | 
 43 |     steps:
 44 |       - name: Checkout Repository
 45 |         uses: actions/checkout@v4
 46 |         with:
 47 |           ref: ${{ matrix.bioc_version }}
 48 | 
 49 |       - name: Query dependencies
 50 |         run: |
 51 |           BiocManager::install(c("covr", "BiocCheck"))
 52 |           saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
 53 |         shell: Rscript {0}
 54 | 
 55 |       - name: Cache R packages
 56 |         uses: actions/cache@v4
 57 |         with:
 58 |           path: /usr/local/lib/R/site-library
 59 |           key: ${{ runner.os }}-r-${{ matrix.bioc_version }}-${{ hashFiles('.github/depends.Rds') }}
 60 |           restore-keys: ${{ runner.os }}-r-${{ matrix.bioc_version }}-
 61 | 
 62 |       - name: Install GPG
 63 |         if: ${{ github.ref == 'refs/heads/devel' && github.event_name != 'pull_request' }}
 64 |         run: sudo apt-get update && sudo apt-get install -y gpg
 65 | 
 66 |       - name: Install Dependencies
 67 |         run: |
 68 |           remotes::install_deps(dependencies = TRUE, repos = BiocManager::repositories())
 69 |           BiocManager::install(c("rcmdcheck", "BiocCheck"), ask = FALSE, update = TRUE)
 70 |         shell: Rscript {0}
 71 | 
 72 |       - name: Check Package
 73 |         env:
 74 |           _R_CHECK_CRAN_INCOMING_REMOTE_: false
 75 |         run: rcmdcheck::rcmdcheck(args = "--no-manual", error_on = "error", check_dir = "check")
 76 |         shell: Rscript {0}
 77 | 
 78 |       - name: Test coverage
 79 |         if: ${{ success() && github.ref == 'refs/heads/devel' && github.event_name != 'pull_request' }}
 80 |         run: |
 81 |           cov <- covr::package_coverage(
 82 |             quiet = FALSE,
 83 |             clean = FALSE,
 84 |             type = "all",
 85 |             install_path = file.path(
 86 |               normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"),
 87 |               "package"
 88 |             )
 89 |           )
 90 |           covr::to_cobertura(cov)
 91 |         shell: Rscript {0}
 92 | 
 93 |       - name: Upload test results to Codecov
 94 |         if: ${{ success() && github.ref == 'refs/heads/devel' && github.event_name != 'pull_request' }}
 95 |         uses: codecov/codecov-action@v4
 96 |         with:
 97 |           fail_ci_if_error: ${{ github.event_name != 'pull_request' && true || false }}
 98 |           file: ./cobertura.xml
 99 |           plugin: noop
100 |           disable_search: true
101 |           token: ${{ secrets.CODECOV_TOKEN }}
102 | 
103 |       - name: Run BiocCheck
104 |         id: bioccheck
105 |         run: |
106 |           BiocCheck::BiocCheck(
107 |             dir('check', 'tar.gz$', full.names = TRUE),
108 |             `quit-with-status` = TRUE, `no-check-bioc-help` = TRUE
109 |           )
110 |         shell: Rscript {0}
111 | 
112 |       - name: Build pkgdown
113 |         if: ${{ github.ref == format('refs/heads/{0}', env.BIOC_RELEASE) && github.event_name != 'pull_request' }}
114 |         run: |
115 |            PATH=$PATH:$HOME/bin/ Rscript -e 'pkgdown::build_site()'
116 | 
117 |       - name: Upload pkgdown artifact
118 |         if: github.ref == format('refs/heads/{0}', env.BIOC_RELEASE)
119 |         uses: actions/upload-pages-artifact@v3
120 |         with:
121 |           path: docs
122 | 
123 |   dock:
124 |     needs:
125 |       - check
126 |       - set-matrix
127 |     runs-on: ubuntu-24.04
128 |     if: ${{ github.ref == 'refs/heads/devel' && needs.set-matrix.outputs.dockerfile_exists == 'true' }}
129 |     steps:
130 |       - name: Checkout Repository
131 |         if: ${{ success() && github.event_name != 'pull_request' }}
132 |         uses: actions/checkout@v4
133 | 
134 |       - name: Register repo name
135 |         if: ${{ github.event_name != 'pull_request' }}
136 |         id: reg_repo_name
137 |         run: |
138 |           echo CONT_IMG_NAME=$(echo ${{ github.event.repository.name }} | tr '[:upper:]' '[:lower:]') >> $GITHUB_ENV
139 | 
140 |       - name: Login to Docker Hub
141 |         if: ${{ github.event_name != 'pull_request' }}
142 |         uses: docker/login-action@v2
143 |         with:
144 |           username: ${{ secrets.DOCKERHUB_USERNAME }}
145 |           password: ${{ secrets.DOCKERHUB_TOKEN }}
146 | 
147 |       - name: Build and Push Docker
148 |         if: ${{ success() && github.event_name != 'pull_request' }}
149 |         uses: docker/build-push-action@v6
150 |         with:
151 |           context: .
152 |           file: ./inst/docker/pkg/Dockerfile
153 |           push: true
154 |           tags: >
155 |             ${{ secrets.DOCKERHUB_USERNAME }}/${{ env.CONT_IMG_NAME }}:latest,
156 |             ${{ secrets.DOCKERHUB_USERNAME }}/${{ env.CONT_IMG_NAME }}:devel
157 | 
158 |   deploy:
159 |     needs: check
160 |     permissions:
161 |       contents: write
162 |       pages: write
163 |       id-token: write
164 |     runs-on: ubuntu-24.04
165 | 
166 |     steps:
167 |       - name: Deploy to GitHub Pages
168 |         if: ${{ github.ref == format('refs/heads/{0}', env.BIOC_RELEASE) && github.event_name != 'pull_request' }}
169 |         id: deployment
170 |         uses: actions/deploy-pages@v4
171 | 
172 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # RStudio Files
 2 | .Rproj.user
 3 | # Data Files
 4 | .RData
 5 | /vignettes/cache/
 6 | *.[Rr][Dd][SsAa]
 7 | *.txt
 8 | *.html
 9 | .Ruserdata
10 | .Rhistory
11 | *GSE*
12 | *_cache
13 | # Merge residuals
14 | *.orig
15 | # compressed files
16 | *.gz
17 | # databases
18 | *.sqlite
19 | .DS_Store
20 | SingleCellMultiModal.Rproj
21 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Type: Package
 2 | Package: SingleCellMultiModal
 3 | Title: Integrating Multi-modal Single Cell Experiment datasets
 4 | Version: 1.21.2
 5 | Authors@R: c(
 6 |     person("Marcel", "Ramos", , "marcel.ramos@roswellpark.org",
 7 |         c("aut", "cre"), c(ORCID = "0000-0002-3242-0582")
 8 |     ),
 9 |     person("Ricard", "Argelaguet", , "ricard@ebi.ac.uk", "aut"),
10 |     person("Al", "Abadi", , , "ctb"),
11 |     person("Dario", "Righelli", , "dario.righelli@gmail.com", "aut"),
12 |     person("Christophe", "Vanderaa", ,
13 |         "christophe.vanderaa@uclouvain.be", "ctb"),
14 |     person("Kelly", "Eckenrode", , "kelly.eckenrode@sph.cuny.edu", "aut"),
15 |     person("Ludwig", "Geistlinger", ,
16 |         "ludwig_geistlinger@hms.harvard.edu", "aut"),
17 |     person("Levi", "Waldron", , "lwaldron.research@gmail.com", "aut")
18 |     )
19 | Description: SingleCellMultiModal is an ExperimentHub package
20 |     that serves multiple datasets obtained from GEO and other sources and
21 |     represents them as MultiAssayExperiment objects. We provide several
22 |     multi-modal datasets including scNMT, 10X Multiome, seqFISH, CITEseq,
23 |     SCoPE2, and others. The scope of the package is to provide data for
24 |     benchmarking and analysis. To cite, use the 'citation' function and see 
25 |     <https://doi.org/10.1371/journal.pcbi.1011324>.
26 | License: Artistic-2.0
27 | BugReports: https://github.com/waldronlab/SingleCellMultiModal/issues
28 | Depends:
29 |     R (>= 4.2.0),
30 |     MultiAssayExperiment
31 | Imports:
32 |     AnnotationHub,
33 |     BiocBaseUtils,
34 |     BiocFileCache,
35 |     ExperimentHub,
36 |     graphics,
37 |     HDF5Array,
38 |     S4Vectors,
39 |     SingleCellExperiment,
40 |     SpatialExperiment,
41 |     SummarizedExperiment,
42 |     Matrix,
43 |     methods,
44 |     utils
45 | Suggests:
46 |     BiocStyle,
47 |     ggplot2,
48 |     knitr,
49 |     RaggedExperiment,
50 |     rmarkdown,
51 |     scater,
52 |     scran,
53 |     UpSetR,
54 |     uwot
55 | VignetteBuilder:
56 |     knitr
57 | biocViews: ExperimentData, SingleCellData, ReproducibleResearch,
58 |     ExperimentHub, GEO
59 | Encoding: UTF-8
60 | RoxygenNote: 7.3.2
61 | Roxygen: list(markdown = TRUE)
62 | Date: 2025-05-06
63 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(CITEseq)
 4 | export(GTseq)
 5 | export(SCoPE2)
 6 | export(SingleCellMultiModal)
 7 | export(addCTLabels)
 8 | export(getCellGroups)
 9 | export(ontomap)
10 | export(removeCache)
11 | export(scMultiome)
12 | export(scNMT)
13 | export(scmmCache)
14 | export(seqFISH)
15 | export(setCache)
16 | import(MultiAssayExperiment)
17 | importFrom(AnnotationHub,query)
18 | importFrom(ExperimentHub,ExperimentHub)
19 | importFrom(ExperimentHub,loadResources)
20 | importFrom(Matrix,Matrix)
21 | importFrom(MultiAssayExperiment,experiments)
22 | importFrom(S4Vectors,DataFrame)
23 | importFrom(S4Vectors,SimpleList)
24 | importFrom(SingleCellExperiment,"altExp<-")
25 | importFrom(SingleCellExperiment,"altExps<-")
26 | importFrom(SingleCellExperiment,SingleCellExperiment)
27 | importFrom(SingleCellExperiment,altExp)
28 | importFrom(SingleCellExperiment,altExps)
29 | importFrom(SingleCellExperiment,colData)
30 | importFrom(SingleCellExperiment,counts)
31 | importFrom(SpatialExperiment,SpatialExperiment)
32 | importFrom(SummarizedExperiment,"assays<-")
33 | importFrom(SummarizedExperiment,SummarizedExperiment)
34 | importFrom(graphics,abline)
35 | importFrom(graphics,smoothScatter)
36 | importFrom(graphics,text)
37 | importFrom(methods,is)
38 | importFrom(utils,glob2rx)
39 | importFrom(utils,read.csv)
40 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
  1 | ## Changes in version 1.16.0
  2 | 
  3 | ### New features
  4 | 
  5 | * Added citation information to the package; see
  6 | `citation("SingleCellMultiModal")` and the vignette.
  7 | 
  8 | ### Bug fixes and minor improvements
  9 | 
 10 | * Update imports from `SingleCellExperiment`, `S4Vectors`, and
 11 | `SummarizedExperiment`
 12 | * Add package anchors to links in documentation
 13 | * Use markdown in documentation
 14 | 
 15 | ## Changes in version 1.14.0
 16 | 
 17 | ### New features
 18 | 
 19 | * The `ontomap` function provides a reference table of ontology IDs and cell
 20 | names by data type available in the package.
 21 | * `scRNAseq` `colData` added to `cord_blood` and `peripheral_blood` datasets
 22 | provided by the `CITEseq` function. (@drighelli)
 23 | 
 24 | ### Bug fixes and minor improvements
 25 | 
 26 | * When using `HDF5` as `format` input in `scMultiome`, the filtering of file
 27 | paths obtained from `ExperimentHub` has been fixed.
 28 | * Using `BiocBaseUtils` internally to handle assertions and checks.
 29 | 
 30 | ## Changes in version 1.12.0
 31 | 
 32 | ### Bug fixes and minor improvements
 33 | 
 34 | * Added Ludwig Geistlinger as author (@lgeistlinger) for contributing the
 35 | `GTseq` dataset.
 36 | 
 37 | ## Changes in version 1.8.0
 38 | 
 39 | ### Bug fixes and minor improvements
 40 | 
 41 | * Updated the reference in the `SCoPE2` vignette (@cvanderaa).
 42 | 
 43 | ## Changes in version 1.6.0
 44 | 
 45 | ### New features
 46 | 
 47 | * `scMultiome` version `1.0.1` provides the 10X format for RNAseq data.
 48 | 
 49 | ### Bug fixes and minor improvements
 50 | 
 51 | * Updates to `seqFISH` vignette and documentation.
 52 | * Updated to changes in `SummarizedExperiment` where `assayDimnames` are
 53 | checked.
 54 | * `scNMT` defaults to version '1.0.0's QC filtered cells. For unfiltered
 55 | cells see version section in `?scNMT`.
 56 | 
 57 | ## Changes in version 1.4.0
 58 | 
 59 | ### New features
 60 | 
 61 | * `SingleCellMultiModal` function allows the combination of multiple
 62 | multi-modal technologies.
 63 | * `GTseq` data from Macaulay et al. (2015) now available (@lgeistlinger)
 64 | * `SCoPE2` data from Specht et al. now available thanks to @cvanderaa (#26)
 65 | * `scMultiome` provides PBMC from 10X Genomics thanks to @rargelaguet
 66 | 
 67 | ### Bug fixes and minor improvements
 68 | 
 69 | * Metadata information (function call and call to technology map) included in
 70 | `SingleCellMultiModal`
 71 | * `scNMT` includes the original call in the `MultiAssayExperiment` metadata
 72 | * Improved and edited Contributing Guidelines for clarity
 73 | * `seqFISH` uses the `spatialData` argument with `DataFrame` input based on
 74 | changes to `SpatialExperiment` (@drighelli)
 75 | * Removed the extra column in the `sampleMap` in `CITEseq` (@drighelli)
 76 | 
 77 | ## Changes in version 1.2.0
 78 | 
 79 | ### New features
 80 | 
 81 | * `CITEseq` function, vignette, and 'cord_blood' data available
 82 | (@drighelli, #18)
 83 | * Include `seqFISH` function, vignette, and 'mouse_visual_cortex' data
 84 | (v1 and v2 from @drighelli, #14)
 85 | * New 'mouse_gastrulation' dataset released (version "2.0.0").
 86 | * Use `version` argument to indicate the `mouse_gastrulation` data version
 87 | * The data includes **all** cells not only the ones that passed the QC
 88 | of all three 'omics (thanks @rargelaguet, @ajabadi).
 89 | 
 90 | ### Bug fixes and minor improvements
 91 | 
 92 | * Caching mechanism uses `tools::R_user_dir` and not `rappdirs`.
 93 | * Improved display of available data using `ExperimentHub` metadata.
 94 | * Improved documentation explaining versioning differences.
 95 | * Contribution guidelines available at
 96 | https://github.com/waldronlab/SingleCellMultiModal/wiki/Contributing-Guidelines
 97 | * Default `version` argument in `scNMT` function now set to "2.0.0" (version
 98 | "1.0.0" still available)
 99 | 
100 | ## Changes in version 1.0.0
101 | 
102 | ### New features
103 | 
104 | * `scNMT` serves the mouse gastrulation dataset from Argelaguet et al. 2019
105 | * Data set is provided by Argelaguet and colleagues via CloudStor link:
106 | https://cloudstor.aarnet.edu.au/plus/s/Xzf5vCgAEUVgbfQ
107 | * GitHub repository for the dataset by the authors available at:
108 | https://github.com/rargelaguet/scnmt_gastrulation
109 | 
110 | ### Bug fixes and minor improvements
111 | 
112 | * Row names in the scNMT dataset properly show mouse ENSEMBL identifiers
113 | 


--------------------------------------------------------------------------------
/R/CITEseq.R:
--------------------------------------------------------------------------------
  1 | ## Assemble cord_blood data; colData from "coldata" if present, else sampleMap
  2 | .cord_blood <- function(ess_list)
  3 | {
  4 |     exps <- ess_list$experiments
  5 |     idx <- grep(pattern="Counts", names(exps))
  6 |     names(exps) <- gsub("Counts|_Counts", "", names(exps))
  7 |     mae <- MultiAssayExperiment::MultiAssayExperiment(experiments=exps[idx])
  8 |     cdidx <- grep("coldata", names(exps))
  9 |     cd <- if (length(cdidx)) exps[[cdidx[1]]] else NULL  # NULL when absent
 10 |     if (is.null(dim(cd))) {
 11 |         coldat <- sampleMap(mae)[, -c(1:2), drop=FALSE]
 12 |         rownames(coldat) <- coldat[, 1]
 13 |         colnames(coldat) <- c("sampleID")
 14 |         colData(mae) <- coldat
 15 |     } else {
 16 |         colData(mae) <- DataFrame(cd)
 17 |     }
 18 |     return(mae)
 19 | }
 20 | 
 21 | ## Combine the one or two elements of `explist` whose names match `assayId`
 22 | ## into a single matrix and return it.
 23 | ##   explist  - named list of assay matrices
 24 | ##   dimslist - list with dim() of each `explist` element, in the same order
 25 | ##   assayId  - one of "scADT", "scHTO" or "scRNA"
 26 | ## Column names are prefixed with the source element's name minus the assay
 27 | ## tag; row names are taken from the first matching element.
 28 | ## scADT/scHTO results go through Matrix::Matrix(); scRNA input is returned
 29 | ## as is, or column-bound when two elements match.
 30 | .combMatrixForAssay <- function(explist, dimslist,
 31 |                                 assayId=c("scADT", "scHTO", "scRNA"))
 32 | {
 33 |     ## match.arg() only returns the validated choice, so it must be assigned;
 34 |     ## otherwise the default three-element vector reaches grep() and switch()
 35 |     assayId <- match.arg(assayId)
 36 |     assIdx <- grep(assayId, names(explist))
 37 |     paired <- length(assIdx) == 2
 38 |     m1 <- switch(assayId,
 39 |         "scADT"=, "scHTO"={
 40 |             if (paired) {
 41 |                 Matrix::Matrix(unlist(explist[assIdx]),
 42 |                     nrow=dimslist[assIdx][[1]][1],
 43 |                     ncol=(dimslist[assIdx][[1]][2]+dimslist[assIdx][[2]][2]),
 44 |                     sparse=TRUE)
 45 |             } else {
 46 |                 Matrix::Matrix(explist[[assIdx]])
 47 |             }
 48 |         },
 49 |         "scRNA"={
 50 |             ## at most two matrices are expected
 51 |             if (paired) {
 52 |                 cbind(explist[[assIdx[1]]], explist[[assIdx[2]]])
 53 |             } else {
 54 |                 explist[[assIdx]]
 55 |             }
 56 |         },
 57 |         { stop("Unrecognized assayId: ", assayId) }
 58 |     )
 59 |     ## label columns from both sources when paired, else from the first one
 60 |     used <- if (paired) assIdx else assIdx[1]
 61 |     colnames(m1) <- unlist(lapply(used, function(i) {
 62 |         paste0(rep(gsub("scADT|scHTO|scRNA", "", names(explist)[i]),
 63 |                    dimslist[[i]][2]),
 64 |                colnames(explist[[i]]))
 65 |     }))
 66 |     rownames(m1) <- rownames(explist[[assIdx[[1]]]])
 67 |     return(m1)
 68 | }
 69 | 
 70 | ## Two-column DataFrame: the column names of `mat1` and a condition label
 71 | ## obtained by deleting every "_\\w+" match (`assayId` is not used).
 72 | .buildColData <- function(mat1, assayId)
 73 | {
 74 |     ids <- colnames(mat1)
 75 |     conds <- gsub("_\\w+", "", ids)
 76 |     return(DataFrame(colname=ids, condition=conds))
 77 | }
 78 | 
 79 | ## sampleMap-style DataFrame for one assay: every column of `mat1` is its own
 80 | ## primary unit; `condition` is the name with every "_\\w+" match removed.
 81 | .buildMap <- function(mat1, assayId)
 82 | {
 83 |     ids <- colnames(mat1)
 84 |     conds <- gsub("_\\w+", "", ids)
 85 |     smap <- DataFrame(assay=assayId, primary=ids, colname=ids, condition=conds)
 86 |     return(smap)
 87 | }
 88 | 
 89 | #' @importFrom Matrix Matrix
 90 | .peripheral_blood <- function(ess_list)
 91 | {
 92 |     ## Assemble the peripheral_blood MultiAssayExperiment: the scADT, scHTO
 93 |     ## and scRNA resources are each combined into one experiment, "coldata"
 94 |     ## (when retrieved) becomes the colData and "TCR" resources the metadata.
 95 |     ## Without a "coldata" resource the colData is built from the sampleMap.
 96 |     ll <- ess_list$experiments
 97 |     cdidx <- grep("coldata", names(ll))
 98 |     cd <- NULL
 99 |     if (length(cdidx) != 0) {
100 |         cd <- ll[[cdidx]]
101 |         ll <- ll[-cdidx]
102 |     }
103 |     ## order every resource by row name
104 |     ll <- lapply(ll, function(x) x[order(rownames(x)), ])
105 |     dims <- lapply(ll, dim)
106 | 
107 |     exps <- lapply(c("scADT", "scHTO", "scRNA"), function(assayn) {
108 |         if (isEmpty(grep(assayn, names(ll)))) return(NULL)
109 |         assmat <- .combMatrixForAssay(explist=ll, dimslist=dims, assayId=assayn)
110 |         assmap <- .buildMap(assmat, assayId=assayn)
111 |         list("EXP"=assmat, "SAMP"=assmap, "NAME"=assayn)
112 |     })
113 |     ## Drop assays that were not retrieved: assigning the shorter vector of
114 |     ## names to a list that still holds NULL placeholders would shift the
115 |     ## labels onto the wrong elements.
116 |     exps <- Filter(Negate(is.null), exps)
117 |     names(exps) <- vapply(exps, function(e) e$NAME, character(1))
118 |     expslist <- lapply(exps, function(e) e$EXP)
119 |     sampmap <- do.call("rbind", lapply(exps, function(e) e$SAMP))
120 |     if (is.null(cd)) {
121 |         ## no coldata resource: derive it from the sampleMap columns
122 |         coldat <- sampmap[, -c(1:2)]
123 |         colnames(coldat) <- c("sampleID", "condition")
124 |         rownames(coldat) <- coldat$sampleID
125 |         coldat <- unique(coldat)
126 |     } else {
127 |         coldat <- cd
128 |     }
129 |     mae <- MultiAssayExperiment::MultiAssayExperiment(experiments=expslist,
130 |                                                       sampleMap=sampmap,
131 |                                                       colData=coldat)
132 |     tcridx <- grep("TCR", names(ll))
133 |     if (!isEmpty(tcridx)) {
134 |         metadata(mae) <- ll[tcridx]
135 |     }
136 |     return(mae)
137 | }
138 | 
139 | #' CITEseq
140 | #' @description function assembles data on-the-fly from `ExperimentHub` to
141 | #'   provide a
142 | #'   [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
143 | #'   container. The `DataType` argument provides access to the
144 | #'   available datasets associated to the package.
145 | #' @author Dario Righelli
146 | #' @details CITEseq data are a combination of single cell transcriptomics and
147 | #'   about a hundred cell surface proteins.
148 | #'   Available datasets are:
149 | #'   * cord_blood: a dataset of single cells of cord blood as
150 | #'   provided in Stoeckius et al. (2017).
151 | #'      * scRNA_Counts - Stoeckius scRNA-seq gene count matrix
152 | #'      * scADT - Stoeckius antibody-derived tags (ADT) data
153 | #'   * peripheral_blood: a dataset of single cells of peripheral
154 | #'   blood as provided in Mimitou et al. (2019). We provide two different
155 | #'   conditions: controls (CTRL) and Cutaneous T-cell Lymphoma (CTCL). Just
156 | #'   build appropriate `modes` regex for subselecting the dataset modes.
157 | #'      * scRNA - Mimitou scRNA-seq gene count matrix
158 | #'      * scADT - Mimitou antibody-derived tags (ADT) data
159 | #'      * scHTO - Mimitou Hashtag Oligo (HTO) data
160 | #'      * TCRab - Mimitou T-cell Receptors (TCR) alpha and beta
161 | #'      available through the object metadata.
162 | #'      * TCRgd - Mimitou T-cell Receptors (TCR) gamma and delta
163 | #'      available through the object metadata.
164 | #'
165 | #' @param DataType `character(1)` indicating the identifier of the dataset to
166 | #'     retrieve.  (default "cord_blood")
167 | #'
168 | #' @param modes `character()` The assay types or modes of data to obtain; these
169 | #'     include scADT and scRNA-seq data by default.
170 | #'
171 | #' @param version `character(1)` The version of the data to retrieve
172 | #'     (default '1.0.0')
173 | #' @param dry.run `logical(1)` Whether to return the dataset names before actual
174 | #'     download (default `TRUE`)
175 | #' @param filtered `logical(1)` indicating if the returned dataset needs to
176 | #'     have filtered cells.
177 | #'     See Details for additional information about the filtering process.
178 | #'
179 | #' @param verbose `logical(1)` Whether to show the dataset currently being
180 | #'     (down)loaded (default `TRUE`)
181 | #'
182 | #' @param ... Additional arguments passed on to the
183 | #'     \link[ExperimentHub]{ExperimentHub-class} constructor
184 | #'
185 | #' @param DataClass either MultiAssayExperiment or SingleCellExperiment
186 | #' data classes can be returned (default MultiAssayExperiment)
187 | #'
188 | #' @details
189 | #' If `filtered` parameter is `FALSE` (default), the `colData` of the returned
190 | #' object contains multiple columns of `logicals` indicating the cells to be
191 | #' discarded.
192 | #' In case `filtered` is `TRUE`, the `discard` column is used to filter the
193 | #' cells.
194 | #' Column `adt.discard` indicates the cells to be discarded computed on the ADT
195 | #' assay.
196 | #' Column `mito.discard` indicates the cells to be discarded computed on the
197 | #' RNA assay and mitochondrial genes.
198 | #' Column `discard` combines the previous columns with an `OR` operator.
199 | #' Note that for the `peripheral_blood` dataset these three columns are
200 | #' computed and returned separately for the `CTCL` and `CTRL` conditions.
201 | #' In this case the additional `discard` column combines the `discard.CTCL` and
202 | #' `discard.CTRL` columns with an `OR` operator.
203 | #' Cell filtering has been computed for `cord_blood` and `peripheral_blood`
204 | #' datasets following section 12.3 of the Advanced Single-Cell Analysis with
205 | #' Bioconductor book.
206 | #' Executed code can be retrieved in the CITEseq_filtering.R script of this
207 | #' package.
208 | #'
209 | #' @return A single cell multi-modal
210 | #'   [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
211 | #'   or informative `data.frame` when `dry.run` is `TRUE`. When `DataClass` is
212 | #'   `SingleCellExperiment` an object of this class is returned with an RNA
213 | #'   assay as main experiment and other assay(s) as `AltExp(s)`.
214 | #' @references Stoeckius et al. (2017), Mimitou et al. (2019)
215 | #' @export
216 | #'
217 | #' @examples
218 | #'
219 | #' mae <- CITEseq(DataType="cord_blood", dry.run=FALSE)
220 | #' experiments(mae)
221 | CITEseq <- function(DataType=c("cord_blood", "peripheral_blood"), modes="*",
222 |                 version="1.0.0", dry.run=TRUE, filtered=FALSE, verbose=TRUE,
223 |                 DataClass=c("MultiAssayExperiment", "SingleCellExperiment"),
224 |                 ...)
225 | {
226 |     dataType <- match.arg(DataType)
227 |     message("Dataset: ", dataType)
228 |     dataClass <- match.arg(DataClass)
229 |     ess_list <- .getResourcesList(prefix = "citeseq_", datatype = dataType,
230 |                     modes=modes, version=version,
231 |                     dry.run=dry.run, verbose=verbose, ...)
232 |     ## a dry run stops here and hands back what .getResourcesList() returned
233 |     if (dry.run) return(ess_list)
234 |     mae <- switch(
235 |         dataType,
236 |         "cord_blood" = .cord_blood(ess_list=ess_list),
237 |         "peripheral_blood" = .peripheral_blood(ess_list=ess_list),
238 |         ## Add here other CITE-seq datasets based on DataType identifier
239 |         stop("Unrecognized CITE-seq dataset name: ", dataType)
240 |     )
241 |     if (filtered) {
242 |         ## NOTE(review): the per-cell `discard` flags index the per-column
243 |         ## sampleMap rows; confirm both are ordered alike for every assay
244 |         sampleMap(mae) <- sampleMap(mae)[!colData(mae)$discard, ]
245 |     }
246 |     if (dataClass == "SingleCellExperiment") return(.CITEseqMaeToSce(mae))
247 |     return(mae)
248 | }
249 | 
250 | 
251 | #' CITEseqMaeToSce
252 | #' @description Converts a `MultiAssayExperiment` object with CITEseq data
253 | #' into a `SingleCellExperiment` object, so that it can be used with the
254 | #' methods and packages built around that class.
255 | #'
256 | #' Note that, to create the `SingleCellExperiment` object, every assay in the
257 | #' `MultiAssayExperiment` is subset to the cells that are common to all the
258 | #' modalities.
259 | #' The returned object may therefore contain fewer cells than the input.
260 | #'
261 | #'
262 | #' @param mae a non-empty MultiAssayExperiment object holding an scRNA
263 | #' experiment and, optionally, scADT and/or scHTO named experiments.
264 | #'
265 | #' @return a SingleCellExperiment object with the scRNA data as `counts` and
266 | #' the scADT and scHTO data as altExps.
267 | #' An error is raised when no scRNA experiment is found.
268 | #'
269 | #' @importFrom MultiAssayExperiment experiments
270 | #' @importFrom SummarizedExperiment SummarizedExperiment assays<-
271 | #' @importFrom SingleCellExperiment SingleCellExperiment altExp altExp<- altExps
272 | #'   altExps<- colData counts
273 | #' @importFrom methods is
274 | #' @importFrom S4Vectors SimpleList
275 | #' @keywords internal
276 | .CITEseqMaeToSce <- function(mae)
277 | {
278 |     stopifnot(c(is(mae, "MultiAssayExperiment"), !(length(mae)==0)))
279 | 
280 |     ## cells present in every experiment
281 |     cellids <- lapply(seq_along(mae), function(i) colnames(mae[[i]]))
282 |     cs <- Reduce(intersect, cellids)
283 | 
284 |     scelist <- lapply(seq_along(mae), function(i)
285 |     {
286 |         sce <- SingleCellExperiment(list(counts=mae[[i]]))
287 |         sce <- sce[, (colnames(sce) %in% cs)]
288 |         keep <- rownames(colData(mae)) %in% colnames(sce)
289 |         ## drop=FALSE: a single-column colData must remain a DataFrame
290 |         colData(sce) <- colData(mae)[keep, , drop=FALSE]
291 |         return(sce)
292 |     })
293 |     names(scelist) <- names(mae)
294 | 
295 |     idx <- grep("scRNA", names(scelist))
296 |     if (length(idx) == 0) {
297 |         stop("Couldn't find RNA assay in MultiAssayExperiment")
298 |     }
299 |     ## the RNA experiment is the main one, every other becomes an altExp
300 |     sce <- scelist[[idx]]
301 |     altExps(sce) <- scelist[-idx]
302 | 
303 |     ## an "scADT_clr" altExp is folded into the first altExp as its "clr" assay
304 |     idx <- grep("scADT_clr", names(altExps(sce)))
305 |     if (length(idx) != 0) {
306 |         clr <- counts(altExps(sce)[[idx]])
307 |         altExps(sce)[idx] <- NULL
308 |         assays(altExp(sce)) <- SimpleList(counts=counts(altExp(sce)), clr=clr)
309 |     }
310 |     if (!isEmpty(metadata(mae))) {
311 |         metadata(sce) <- metadata(mae)
312 |     }
313 |     return(sce)
314 | }
315 | 
316 | 
317 | 


--------------------------------------------------------------------------------
/R/GTseq.R:
--------------------------------------------------------------------------------
  1 | ############################################################
  2 | #
  3 | # author: Ludwig Geistlinger
  4 | # date: 2021-03-24 18:17:27
  5 | #
  6 | # descr: G&T-seq data retrieval
  7 | #
  8 | ############################################################
  9 | 
 10 | #' Parallel sequencing data of single-cell genomes and transcriptomes
 11 | #'
 12 | #' @description GTseq assembles data on-the-fly from `ExperimentHub` to provide
 13 | #'   a
 14 | #'   [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
 15 | #'   container. The `DataType` argument provides access to the
 16 | #'   `mouse_embryo_8_cell` dataset as obtained from Macaulay et al. (2015).
 17 | #'   Protocol information for this dataset is available from Macaulay et al.
 18 | #'   (2016). See references.
 19 | #'
 20 | #' @details G&T-seq is a combination of Picoplex amplified gDNA sequencing
 21 | #'   (genome) and SMARTSeq2 amplified cDNA sequencing (transcriptome) of the
 22 | #'   same cell. For more information, see Macaulay et al. (2015).
 23 | #'     * mouse_embryo_8_cell:
 24 | #'     this dataset was filtered for bad cells as specified in Macaulay
 25 | #'     et al. (2015).
 26 | #'         * genomic - integer copy numbers as detected from scDNA-seq
 27 | #'         * transcriptomic - raw read counts as quantified from scRNA-seq
 28 | #'
 29 | #' @section metadata:
 30 | #'   The `MultiAssayExperiment` metadata stores the original function call
 31 | #'   (as `call`) along with any metadata returned for the requested data version.
 32 | #'
 33 | #' @param DataType `character(1)` Indicates study that produces this type of
 34 | #'   data (default: 'mouse_embryo_8_cell')
 35 | #'
 36 | #' @param modes `character()` A wildcard / glob pattern of modes, such as
 37 | #'   `"*omic"`. A wildcard of `"*"` will return all modes including
 38 | #'   copy numbers ("genomic") and RNA-seq read counts ("transcriptomic"),
 39 | #'   which is the default.
 40 | #'
 41 | #' @param version `character(1)` Currently, only version '1.0.0'.
 42 | #'
 43 | #' @param dry.run `logical(1)` Whether to return the dataset names before actual
 44 | #'   download (default `TRUE`)
 45 | #'
 46 | #' @param verbose `logical(1)` Whether to show the dataset currently being
 47 | #'   (down)loaded (default `TRUE`)
 48 | #'
 49 | #' @param ... Additional arguments passed on to the
 50 | #'   [ExperimentHub][ExperimentHub::ExperimentHub-class] constructor
 51 | #'
 52 | #' @seealso SingleCellMultiModal-package
 53 | #'
 54 | #' @return A single cell multi-modal
 55 | #'   [MultiAssayExperiment][MultiAssayExperiment::MultiAssayExperiment-class] or
 56 | #'   informative `data.frame` when `dry.run` is `TRUE`
 57 | #'
 58 | #' @source <https://www.ebi.ac.uk/ena/browser/view/PRJEB9051>
 59 | #'
 60 | #' @references
 61 | #'   Macaulay et al. (2015) G&T-seq: parallel sequencing of single-cell
 62 | #'   genomes and transcriptomes. Nat Methods, 12:519–22.
 63 | #'
 64 | #'   Macaulay et al. (2016) Separation and parallel sequencing of the genomes
 65 | #'   and transcriptomes of single cells using G&T-seq. Nat Protoc, 11:2081–103.
 66 | #'
 67 | #' @examples
 68 | #'
 69 | #' GTseq()
 70 | #'
 71 | #' @export GTseq
 72 | GTseq <-
 73 |     function(
 74 |         DataType = "mouse_embryo_8_cell", modes = "*",
 75 |         version = "1.0.0", dry.run = TRUE, verbose = TRUE, ...
 76 |     )
 77 | {
 78 |     stopifnot(.isSingleChar(version), .isSingleChar(DataType))
 79 |     meta <- list(call = match.call())
 80 | 
 81 |     ess_list <- .getResourcesList(
 82 |         prefix = "GTseq_",
 83 |         datatype = DataType,
 84 |         modes = modes,
 85 |         version = version,
 86 |         dry.run = dry.run,
 87 |         verbose = verbose,
 88 |         ...
 89 |     )
 90 | 
 91 |     if (dry.run) { return(ess_list) }
 92 | 
 93 |     cdat <- ess_list[["colData"]]
 94 |     prim.ids <- rep(paste0("cell", seq_len(112)), 2)
 95 |     smap <- S4Vectors::DataFrame(
 96 |         assay = tolower(cdat[,"Comment.LIBRARY_SOURCE."]),
 97 |         primary = prim.ids,
 98 |         colname = cdat[,"Sample.ID"]
 99 |     )
100 | 
101 |     rcols <- c("organism", "sex", "cell.type")
102 |     rcols <- paste0("Characteristics.", rcols, ".")
103 |     cdat <- cdat[seq_len(112), rcols]
104 |     rownames(cdat) <- prim.ids[seq_len(112)]
105 | 
106 |     MultiAssayExperiment(
107 |         experiments = ess_list[["experiments"]],
108 |         colData = cdat,
109 |         sampleMap = smap,
110 |         metadata = c(meta, as.list(ess_list[["metadata"]]))
111 |     )
112 | }
113 | 


--------------------------------------------------------------------------------
/R/SCoPE2.R:
--------------------------------------------------------------------------------
 1 | #' Single-cell RNA sequencing and proteomics
 2 | #'
 3 | #' @description SCoPE2 assembles data on-the-fly from `ExperimentHub` to provide
 4 | #'   a
 5 | #'   [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
 6 | #'   container. The `DataType` argument provides access to the `SCoPE2` dataset
 7 | #'   as provided by Specht et al. (2020; DOI:
 8 | #'   <http://dx.doi.org/10.1101/665307>). The article provides more information
 9 | #'   about the data acquisition and pre-processing.
10 | #'
11 | #' @details The SCoPE2 study combines scRNA-seq (transcriptome) and
12 | #'   single-cell proteomics.
13 | #'
14 | #'   * macrophage_differentiation: the cells are monocytes that undergo
15 | #'   macrophage differentiation. No annotation is available for the
16 | #'   transcriptome data, but batch and cell type annotations are
17 | #'   available for the proteomics data in the `celltype` `colData` column.
18 | #'   The transcriptomics and proteomics data were not measured from the same
19 | #'   cells but from a distinct set of cell cultures.
20 | #'   This dataset provides already filtered bad quality cells.
21 | #'       * scRNAseq1 - single-cell transcriptome (batch 1)
22 | #'       * scRNAseq2 - single-cell transcriptome (batch 2)
23 | #'       * scp - single-cell proteomics
24 | #'
25 | #' @inheritParams scNMT
26 | #'
27 | #' @param DataType `character(1)` Indicates study that produces this type of
28 | #'   data (default: 'macrophage_differentiation')
29 | #'
30 | #' @param modes `character()` A wildcard / glob pattern of modes, such as
31 | #'   `"rna"`. A wildcard of `"*"` will return all modes, that are
32 | #'   transcriptome ("rna") or proteome ("protein") which is the
33 | #'   default.
34 | #'
35 | #' @param version `character(1)`, currently only version '1.0.0' is
36 | #'   available
37 | #'
38 | #' @return A single cell multi-modal
39 | #'   [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
40 | #'   or informative `data.frame` when `dry.run` is `TRUE`
41 | #'
42 | #' @seealso SingleCellMultiModal-package
43 | #'
44 | #' @source All files are linked from the slavovlab website
45 | #'     <https://scope2.slavovlab.net/docs/data>
46 | #'
47 | #' @references
48 | #'   Specht, Harrison, Edward Emmott, Aleksandra A. Petelski, R.
49 | #'   Gray Huffman, David H. Perlman, Marco Serra, Peter Kharchenko,
50 | #'   Antonius Koller, and Nikolai Slavov. 2020. “Single-Cell
51 | #'   Proteomic and Transcriptomic Analysis of Macrophage
52 | #'   Heterogeneity.” bioRxiv. https://doi.org/10.1101/665307.
53 | #'
54 | #' @examples
55 | #'
56 | #' SCoPE2(DataType = "macrophage_differentiation",
57 | #'        modes = "*",
58 | #'        version = "1.0.0",
59 | #'        dry.run = TRUE)
60 | #'
61 | #' @export
SCoPE2 <- function(
    DataType = "macrophage_differentiation",
    modes = "*",
    version = "1.0.0",
    dry.run = TRUE,
    verbose = TRUE,
    ...
) {
    ## Validate the scalar inputs up front, as the other data functions do,
    ## so that a vector-valued 'version' or 'DataType' fails with a clear
    ## assertion instead of an error from inside `if ()`
    stopifnot(.isSingleChar(version), .isSingleChar(DataType))
    ## isTRUE() also rejects an NA 'version' with the intended message
    if (!isTRUE(version == "1.0.0"))
        stop("Only version '1.0.0' is available.")

    ## Retrieve the different resources from ExperimentHub
    ess_list <- .getResourcesList(
        prefix = "macrophage_",
        datatype = DataType,
        modes = modes,
        version = version,
        dry.run = dry.run,
        verbose = verbose,
        ...
    )
    ## If dry.run, return only the information table
    if (dry.run) return(ess_list)
    ## Get the colData by merging the per-experiment annotations
    cd <- .mergeLowColData(ess_list[["experiments"]])
    ## Rename the "Batch" column (if any) to "batch_Chromium"
    colnames(cd)[which(colnames(cd) == "Batch")] <- "batch_Chromium"

    ## Construct and return the MAE object
    MultiAssayExperiment(
        experiments = ess_list[["experiments"]],
        colData = cd
    )
}
95 | 


--------------------------------------------------------------------------------
/R/SingleCellMultiModal-package.R:
--------------------------------------------------------------------------------
 1 | #' @importFrom ExperimentHub loadResources ExperimentHub
 2 | #' @importFrom AnnotationHub query
 3 | #' @importFrom utils glob2rx read.csv
 4 | #' @import MultiAssayExperiment
 5 | NULL
 6 | 
 7 | #' SingleCellMultiModal-package
 8 | #'
 9 | #' @aliases NULL SingleCellMultiModal-package
10 | #'
11 | #' @description
12 | #' The SingleCellMultiModal package provides a convenient and user-friendly
13 | #' representation of multi-modal data from projects such as `scNMT` for mouse
14 | #' gastrulation.
15 | #'
16 | #' @examples
17 | #' help(package = "SingleCellMultiModal")
18 | #'
19 | "_PACKAGE"
20 | 


--------------------------------------------------------------------------------
/R/SingleCellMultiModal.R:
--------------------------------------------------------------------------------
 1 | .internalMap <- S4Vectors::DataFrame(
 2 |     FUN = c("scNMT", "scMultiome", "SCoPE2",
 3 |         "CITEseq", "CITEseq", "seqFISH", "GTseq"),
 4 |     DataType = c("mouse_gastrulation", "pbmc_10x",
 5 |         "macrophage_differentiation", "cord_blood",
 6 |         "peripheral_blood", "mouse_visual_cortex",
 7 |         "mouse_embryo_8_cell"
 8 |     )
 9 | )
10 | 
## Select the rows of `.internalMap` matching `DataTypes` (in the order
## requested) and append `dry.run` and `verbose` columns to them.
## Stops when any requested data type is not listed in `.internalMap`.
.filterMap <- function(DataTypes, dry.run, verbose) {
    hits <- match(DataTypes, .internalMap[["DataType"]])
    unknown <- DataTypes[is.na(hits)]
    if (length(unknown) > 0L)
        stop("'", paste(unknown, collapse = ", "),
            "' is not available, ", "see ?SingleCellMultiModal")
    selected <- .internalMap[hits, , drop = FALSE]
    selected[["dry.run"]] <- dry.run
    selected[["verbose"]] <- verbose
    selected
}
22 | 
23 | #' Combining Modalities into one MultiAssayExperiment
24 | #'
25 | #' Combine multiple single cell modalities into one using the input of the
26 | #' individual functions.
27 | #'
28 | #' @inheritParams scNMT
29 | #'
30 | #' @param DataTypes `character()` A vector of data types as indicated in each
31 | #'     individual function by the `DataType` parameter. These can be any of
32 | #'     the following: "mouse_gastrulation", "pbmc_10x",
33 | #'     "macrophage_differentiation", "cord_blood", "peripheral_blood",
34 | #'     "mouse_visual_cortex", "mouse_embryo_8_cell"
35 | #'
36 | #' @param versions `character()` A vector of versions for each DataType. By
37 | #'     default, version `1.0.0` is obtained for all data types.
38 | #'
39 | #' @param modes list() A list or CharacterList of modes for each data type
40 | #'     where each element corresponds to one data type.
41 | #'
42 | #' @return A multi-modality `MultiAssayExperiment`
43 | #'
44 | #' @section metadata:
45 | #'     The metadata in the `MultiAssayExperiment` contains the original
46 | #'     function call used to generate the object (labeled as `call`),
47 | #'     a `call_map` which provides traceability of technology functions to
48 | #'     `DataType` prefixes, and lastly, R version information as `version`.
49 | #'
50 | #' @examples
51 | #'
52 | #' SingleCellMultiModal(c("mouse_gastrulation", "pbmc_10x"),
53 | #'     modes = list(c("acc*", "met*"), "rna"),
54 | #'     version = c("2.0.0", "1.0.0"), dry.run = TRUE, verbose = TRUE
55 | #' )
56 | #'
57 | #' @export
SingleCellMultiModal <- function(
        DataTypes, modes = "*", versions = "1.0.0",
        dry.run = TRUE, verbose = TRUE, ...
    )
{
    stopifnot(is.character(DataTypes), is.character(versions))

    ## Recycle the default mode / version so there is one per data type
    if (.isSingleChar(modes) && identical(modes, "*"))
        modes <- rep(modes, length(DataTypes))
    if (.isSingleChar(versions) && identical(versions, "1.0.0"))
        versions <- rep(versions, length(DataTypes))

    ## One row per requested data type: FUN, DataType, dry.run, verbose,
    ## version and modes, i.e. the function to call and its arguments
    resmap <- .filterMap(DataTypes, dry.run, verbose)
    modes <- methods::as(modes, "CharacterList")
    resmap <- cbind(resmap, version = versions, modes = modes)
    ## NOTE(review): `version` is not a local variable here (the argument is
    ## `versions`); presumably it resolves to base R's `version` object, which
    ## matches the documented "R version information" -- confirm
    meta <- list(call = match.call(), call_map = resmap, version = version)

    ## Extra arguments were previously accepted but silently dropped; they
    ## are now forwarded to every technology function
    dots <- list(...)
    ess_lists <- apply(resmap, 1L,
        function(resrow) {
            if (verbose)
                message("Running ", resrow[[1]], "...")
            ## first element is the function name, the rest its arguments
            do.call(get(resrow[[1]]), c(resrow[-1], dots))
        }
    )
    names(ess_lists) <- DataTypes

    if (dry.run) { return(ess_lists) }

    ## Prefix experiment names with their data type so that they remain
    ## distinguishable once combined
    new_prefix <- paste0(resmap[["DataType"]], "_")
    ess_lists <- Map(function(x, y) {
        if (is(x, "MultiAssayExperiment"))
            names(x) <- paste0(y, names(x))
        x
    }, x = ess_lists, y = new_prefix)

    result <- Reduce(c, ess_lists)
    metadata(result) <- meta
    result
}
96 | 


--------------------------------------------------------------------------------
/R/cache.R:
--------------------------------------------------------------------------------
 1 | .getCache <- function() {
 2 |     cache <- getOption("scmmCache", setCache(verbose = FALSE))
 3 |     BiocFileCache::BiocFileCache(cache)
 4 | }
 5 | 
 6 | #' @name scmmCache
 7 | #'
 8 | #' @title Manage cache / download directories for study data
 9 | #'
10 | #' @description Managing data downloads is important to save disk space and
11 | #' avoid re-downloading data files. This can be done effortlessly via the
12 | #' integrated `BiocFileCache` system.
13 | #'
14 | #' @section scmmCache:
15 | #' Get the directory location of the cache. It will prompt the user to create
16 | #' a cache if not already created. A specific directory can be used via
17 | #' `setCache`.
18 | #'
19 | #' @section setCache:
20 | #' Specify the directory location of the data cache. By default, it will
21 | #' go into the user's home and package name directory as given by
22 | #' [R_user_dir][tools::R_user_dir] (default: varies by system e.g., for Linux:
23 | #' '$HOME/.cache/R/SingleCellMultiModal').
24 | #'
25 | #' @section removeCache:
26 | #' Some files may become corrupt when downloading, this function allows
27 | #' the user to delete the tarball associated with a study number in the
28 | #' cache.
29 | #'
30 | #' @param directory `character(1)` The file location where the cache is located.
31 | #' Once set, future downloads will go to this folder. See `setCache` section
32 | #' for details.
33 | #'
34 | #' @param verbose Whether to print descriptive messages
35 | #'
36 | #' @param ask `logical(1)` (default TRUE when `interactive()`) Confirm the file
37 | #' location of the cache directory
38 | #'
39 | #' @param accession `character(1)` A single string indicating the accession number
40 | #' of the study
41 | #'
42 | #' @param ... For `scmmCache`, arguments passed to `setCache`
43 | #'
44 | #' @examples
45 | #' getOption("scmmCache")
46 | #' scmmCache()
47 | #'
48 | #' @return The directory / option of the cache location
49 | #'
50 | #' @export
scmmCache <- function(...) {
    ## use the configured location when there is one; otherwise fall back to
    ## `setCache()`, forwarding any arguments to it
    cache_dir <- getOption("scmmCache")
    if (is.null(cache_dir))
        cache_dir <- setCache(..., verbose = FALSE)
    cache_dir
}
54 | 
55 | #' @rdname scmmCache
56 | #' @export
setCache <-
    function(directory = tools::R_user_dir("SingleCellMultiModal", "cache"),
        verbose = TRUE,
        ask = interactive())
{
    stopifnot(
        is.character(directory), length(directory) == 1L, !is.na(directory)
    )

    if (!dir.exists(directory)) {
        if (ask) {
            ## confirm with the user before creating a new directory
            qtxt <- sprintf(
                "Create SingleCellMultiModal cache at \n    %s? [y/n]: ",
                directory
            )
            answer <- .getAnswer(qtxt, allowed = c("y", "Y", "n", "N"))
            ## both "n" and "N" are allowed answers and both mean refusal
            if (identical(tolower(answer), "n"))
                stop("'scmmCache' directory not created. Use 'setCache'")
        }
        dir.create(directory, recursive = TRUE, showWarnings = FALSE)
    }
    ## store the location under the option name that is read back through
    ## getOption("scmmCache") elsewhere in the package
    options("scmmCache" = directory)

    if (verbose)
        message("SingleCellMultiModal cache directory set to:\n    ",
                directory)
    invisible(directory)
}
85 | 
86 | #' @rdname scmmCache
87 | #' @export
removeCache <- function(accession) {
    cache <- .getCache()
    ## look for cache records whose resource name is exactly `accession`
    hits <- BiocFileCache::bfcquery(cache, accession, "rname", exact = TRUE)
    record_ids <- hits$rid
    if (length(record_ids) == 0L) {
        message("No record found: ", accession, ".tar.gz")
    } else {
        BiocFileCache::bfcremove(cache, record_ids)
        message("Cache record: ", accession, ".tar.gz removed")
    }
}
97 | 


--------------------------------------------------------------------------------
/R/cellGating.R:
--------------------------------------------------------------------------------
  1 | 
  2 | #' addCTLabels
  3 | #'
  4 | #' @param cd the `colData` `DataFrame`
  5 | #' @param out list data structure returned by `getCellGroups`
  6 | #' @param outname character indicating the name of the out data structure
  7 | #' @param ct character indicating the celltype to assign in the `ctcol`
  8 | #' @param mkrcol character indicating the cd column to store the markers
  9 | #' indicated by `outname` (default is markers)
 10 | #' @param ctcol character indicating the column in cd to store the cell type
 11 | #' indicated by `ct` (default is celltype)
 12 | #' @param overwrite logical indicating if the cell types have to be overwritten
 13 | #' without checking if detected barcodes were already assigned to other celltypes
 14 | #' @param verbose logical for having informative messages during the execution
 15 | #'
 16 | #' @return an updated version of the cd DataFrame
 17 | #'
 18 | #' @export
 19 | addCTLabels <- function(cd, out, outname, ct, mkrcol="markers", ctcol="celltype",
 20 |                         overwrite=FALSE, verbose=TRUE)
 21 | {
 22 |     ## adds to input cd colData in the mkrcol the markers indicated by outname
 23 |     ## and in the ctcol the celltype indicated in ct
 24 |     ## the positions for the barcodes (rows in the cd) are taken in position
 25 |     ## outname from the out structure given by function getCellGroups
 26 |     stopifnot(any(c(mkrcol, ctcol) %in% colnames(cd)))
 27 |     stopifnot((outname %in% names(out)))
 28 | 
 29 |     cellbc <- out[[outname]]$bc
 30 |     idxc <- which(rownames(cd) %in% cellbc)
 31 |     if (length(idxc) !=0)
 32 |     {
 33 |         if (overwrite)
 34 |         {
 35 |             if(verbose) message("Blindly overwriting cell types assignements")
 36 |             cd[[mkrcol]][idxc] <- outname
 37 |             cd[[ctcol]][idxc] <- ct
 38 |         } else {
 39 |             ## checking if celltypes are already assigned
 40 |             idxnona <- which(!is.na(cd[[mkrcol]][idxc]))
 41 |             # don't get why ifelse doesn't work
 42 |             # idxcnona <- ifelse(length(idxnona)!=0, idxc[-idxnona], idxc)
 43 |             if ( length(idxnona)!=0 ) {
 44 |                 idxcnona <- idxc[-idxnona]
 45 |                 if(verbose) message(length(idxnona), " Barcodes already assigned.\n",
 46 |                                     "Assigning only ", length(idxcnona), " Barcodes...")
 47 |             } else { idxcnona <- idxc }
 48 |             if (length(idxcnona)!=0)
 49 |             {
 50 |                 cd[[mkrcol]][idxcnona] <- outname
 51 |                 cd[[ctcol]][idxcnona] <- ct
 52 |             } else {
 53 |                 if(verbose) message("All selected Barcodes are already assigned\n",
 54 |                                     "Look at the overwrite argument to handle a more ",
 55 |                                     "brutal behaviour")
 56 |             }
 57 |         }
 58 | 
 59 |     } else {
 60 |         warning("No barcodes in cd detected for the selected ", outname,
 61 |                 "\nReturning cd as it is...")
 62 |     }
 63 | 
 64 |     return(cd)
 65 | }
 66 | 
 67 | #' @importFrom graphics abline smoothScatter
 68 | .plotGatingAdt <- function(mat, adt1="CD19", adt2="CD3", th1=0.2, th2=0)
 69 | {
 70 |     plot(x=mat[adt1,], y=mat[adt2,], xlab=adt1, ylab=adt2,
 71 |          main=paste0("Gain plot with x-th: ", th1, " y-th: ", th2))
 72 |     abline(v=th1, col="red", lty=2)
 73 |     abline(h=th2, col="red", lty=2)
 74 |     smoothScatter(x=mat[adt1,], y=mat[adt2,], xlab=adt1, ylab=adt2,
 75 |                   main=paste0("Gain plot with x-th: ", th1, " y-th: ", th2))
 76 | 
 77 |     abline(v=th1, col="red", lty=2)
 78 |     abline(h=th2, col="red", lty=2)
 79 | }
 80 | 
 81 | 
 82 | #' getCellGroups
 83 | #'
 84 | #' @description
 85 | #' Shows the cells/barcodes in two different plots (scatter and density)
 86 | #' divinding the space in four quadrant indicated by the two thresholds given
 87 | #' as input parameters.
 88 | #' The x/y-axis represent respectively the two ADTs given as input.
 89 | #' It returns a list of one element for each quadrant, each with barcodes and
 90 | #' percentage (see Value section for details).
 91 | #'
 92 | #' @param mat matrix of counts or clr transformed counts for ADT data in CITEseq
 93 | #' @param adt1 character indicating the name of the marker to plot on the x-axis
 94 | #' (default is CD19).
 95 | #' @param adt2 character indicating the name of the marker to plot on the y-axis
 96 | #' (default is CD3).
 97 | #' @param th1 numeric indicating the threshold for the marker on the x-axis
 98 | #' (default is 0.2).
 99 | #' @param th2 numeric indicating the threshold for the marker on the y-axis
100 | #' (default is 0).
101 | #'
102 | #' @return a list of four different element, each one indicating the quarter
103 | #' where the thresholds divide the plotting space, in eucledian order I, II,
104 | #' III, IV quadrant, indicating respectively +/+, +/-, -/+, -/- combinations
105 | #' for the couples of selected ADTs.
106 | #' Each element of the list contains two objects, one with the list of detected
107 | #' barcodes and one indicating the percentage of barcodes falling into that
108 | #' quadrant.
109 | #' .
110 | #' @details helps to do manual gating for cell type indentification with CITEseq
111 | #' or similar data, providing cell markers.
112 | #' Once identified two interesting markers for a cell type, the user has to
113 | #' play with the thresholds to identify the cell populations specified by an
114 | #' uptake (+) o downtake (-) of the couple of markers (ADTs) previously selected.
115 | #'
116 | #' @importFrom graphics text
117 | #'
118 | #' @export
119 | getCellGroups <- function(mat, adt1="CD19", adt2="CD3", th1=0.2, th2=0)
120 | {
121 |     stopifnot(any(adt1,adt2) %in% rownames(mat))
122 | 
123 |     plot <- match.arg(plot)
124 |     .plotGatingAdt(mat, adt1, adt2, th1, th2)
125 |     matadt <- mat[c(adt1,adt2),]
126 |     adt1p <- (matadt[adt1,]>th1)
127 |     adt1m <- (matadt[adt1,]<=th1)
128 |     adt2p <- (matadt[adt2,]>th2)
129 |     adt2m <- (matadt[adt2,]<=th2)
130 | 
131 | 
132 |     if (sum(adt1p)+sum(adt1m) != dim(mat)[2]) stop("something went wrong with adt1")
133 |     if (sum(adt2p)+sum(adt2m) != dim(mat)[2]) stop("something went wrong with adt2")
134 | 
135 |     adt12pp <- which(adt1p & adt2p)
136 |     adt12pm <- which(adt1p & adt2m)
137 |     adt12mp <- which(adt1m & adt2p)
138 |     adt12mm <- which(adt1m & adt2m)
139 | 
140 |     l <- list(
141 |         ADT12pp=list(
142 |             bc=colnames(matadt)[adt12pp],
143 |             prc=((length(adt12pp)/dim(matadt)[2])*100)),
144 |         ADT12pm=list(
145 |             bc=colnames(matadt)[adt12pm],
146 |             prc=((length(adt12pm)/dim(matadt)[2])*100)),
147 |         ADT12mp=list(
148 |             bc=colnames(matadt)[adt12mp],
149 |             prc=((length(adt12mp)/dim(matadt)[2])*100)),
150 |         ADT12mm=list(
151 |             bc=colnames(matadt)[adt12mm],
152 |             prc=((length(adt12mm)/dim(matadt)[2])*100))
153 |     )
154 |     names(l) <- c(paste0(adt1,"+/",adt2,"+"),
155 |                   paste0(adt1,"+/",adt2,"-"),
156 |                   paste0(adt1,"-/",adt2,"+"),
157 |                   paste0(adt1,"-/",adt2,"-"))
158 | 
159 | 
160 |     text((min(matadt[adt1,])+0.03), (max(matadt[adt2,])-0.05), paste(round(l[[3]]$prc), "%"))
161 |     text((max(matadt[adt1,])-0.03), (max(matadt[adt2,])-0.05), paste(round(l[[1]]$prc), "%"))
162 |     text((max(matadt[adt1,])-0.03), (min(matadt[adt2,])+0.05), paste(round(l[[2]]$prc), "%"))
163 |     text((min(matadt[adt1,])+0.03), (min(matadt[adt2,])+0.05), paste(round(l[[4]]$prc), "%"))
164 |     return(l)
165 | }
166 | 


--------------------------------------------------------------------------------
/R/ontomap.R:
--------------------------------------------------------------------------------
 1 | #' Obtain a map of cell types for each dataset
 2 | #'
 3 | #' The `ontomap` function provides a mapping of all the cell names across
 4 | #' all the data sets or for a specified data set.
 5 | #'
 6 | #' @param dataset `character()` One of the existing functions within the
 7 | #'   package. If missing, a map of all cell types in each function will
 8 | #'   be provided.
 9 | #'
10 | #' @details
11 | #' Note that `CITEseq` does not have any cell annotations; therefore, no entries
12 | #' are present in the `ontomap`.
13 | #'
14 | #' @return A `data.frame` of metadata with cell types and ontologies
15 | #'
16 | #' @examples
17 | #'
18 | #' ontomap(dataset = "scNMT")
19 | #'
20 | #' @export
ontomap <- function(
    dataset = c("scNMT", "scMultiome", "SCoPE2", "CITEseq", "seqFISH")
) {
    ## with `several.ok`, the default (all choices) selects every data set
    dataset <- match.arg(dataset, several.ok = TRUE)
    map_file <- system.file(
        "extdata", "ontomap.tsv",
        package = "SingleCellMultiModal", mustWork = TRUE
    )
    full_map <- utils::read.delim(map_file)
    ## keep the rows whose `function_name` is among the requested data sets
    keep <- full_map[["function_name"]] %in% dataset
    full_map[keep, ]
}
33 | 


--------------------------------------------------------------------------------
/R/scMultiome.R:
--------------------------------------------------------------------------------
  1 | ## Load HDF5 file with either TENxMatrix or HDF5Array
  2 | .getH5_TENx <- function(filelist, ehub, fn, verbose) {
  3 |     if (verbose)
  4 |         message("Working on: ", paste(fn, collapse = ",\n "))
  5 |     se_h5 <- grep("_se", filelist, value = TRUE)
  6 |     se_obj <- query(ehub, se_h5)[[1L]]
  7 | 
  8 |     hasTENx <- any(grepl("tenx", filelist))
  9 |     patt <- if (hasTENx) "tenx" else "_assay"
 10 | 
 11 |     h5data <- grep(patt, filelist, value = TRUE, ignore.case = TRUE)
 12 |     h5fileloc <- query(ehub, h5data)[[1L]]
 13 | 
 14 |     if (!hasTENx)
 15 |         h5array <- HDF5Array::HDF5Array(h5fileloc, "assay001", as.sparse = TRUE)
 16 |     else
 17 |         h5array <- HDF5Array::TENxMatrix(h5fileloc, "pbmc")
 18 | 
 19 |     SummarizedExperiment::`assays<-`(
 20 |         x = se_obj, withDimnames = FALSE,
 21 |         value = list(counts = h5array)
 22 |     )
 23 | }
 24 | 
 25 | .loadHDF5 <- function(ehub, filepaths, verbose) {
 26 |     matchres <- grepl("\\.[Hh]5|_se\\.[Rr][Dd][Ss]", filepaths)
 27 |     fpaths <- filepaths[matchres]
 28 |     fact <- .removeExt(fpaths)
 29 |     fact <- gsub("_se|_assays|_tenx", "", fact)
 30 |     h5list <- split(fpaths, fact)
 31 |     lapply(h5list,
 32 |         .getH5_TENx,
 33 |         ehub = ehub, fn = names(h5list), verbose = verbose
 34 |     )
 35 | }
 36 | 
 37 | .message <-
 38 |     function(...)
 39 | {
 40 |     message(...)
 41 |     TRUE
 42 | }
 43 | 
 44 | ## @mtmorgan's function from HCAMatrixBrowser
 45 | .read_mtx <-
 46 |     function(path, verbose = FALSE)
 47 | {
 48 |     headers <- readLines(path, 2L)
 49 |     dims <- as.integer(strsplit(headers[2], " ")[[1]][c(1, 2)])
 50 |     !verbose || .message("dim: ", dims[1], " ", dims[2])
 51 |     v <- scan(
 52 |         path, list(integer(), integer(), numeric()), skip = 2,
 53 |         quiet = !verbose
 54 |     )
 55 |     Matrix::sparseMatrix(v[[1]], v[[2]], x = v[[3]], dims = dims)
 56 | }
 57 | 
 58 | .loadMTX <- function(ehub, filepaths, verbose) {
 59 |     matchres <-
 60 |         grepl("\\.[Mm][Tt][Xx]\\.[Gg][Zz]$|_se\\.[Rr][Dd][Ss]$", filepaths)
 61 |     filepaths <- filepaths[matchres]
 62 |     fact <- .removeExt(filepaths)
 63 |     fact <- gsub("_se", "", fact)
 64 |     mtxlist <- split(filepaths, fact)
 65 |     lapply(mtxlist, function(mtxfile, fn) {
 66 |         if (verbose)
 67 |             message("Working on: ", paste(fn, collapse = ",\n "))
 68 |         se_mtx <- grep("_se", mtxfile, value = TRUE)
 69 |         mtxdata <- grep("mtx", mtxfile, value = TRUE, ignore.case = TRUE)
 70 |         se <- query(ehub, se_mtx)[[1L]]
 71 |         mtxfile <- query(ehub, mtxdata)[[1L]]
 72 |         mtxf <- .read_mtx(mtxfile)
 73 | 
 74 |         BiocBaseUtils::setSlots(
 75 |             object = se,
 76 |             assays = SummarizedExperiment::Assays(
 77 |                 S4Vectors::SimpleList(counts = mtxf)
 78 |             )
 79 |         )
 80 |     }, fn = names(mtxlist))
 81 | }
 82 | 
 83 | #' Single-cell Multiome ATAC + Gene Expression
 84 | #'
 85 | #' @description 10x Genomics Multiome technology enables simultaneous profiling
 86 | #' of the transcriptome (using 3’ gene expression) and epigenome
 87 | #' (using ATAC-seq) from single cells to
 88 | #' deepen our understanding of how genes are expressed and regulated across
 89 | #' different cell types. Data prepared by Ricard Argelaguet.
 90 | #'
 91 | #' @details Users are able to choose from either an `MTX` or `HDF5` file format
 92 | #'   as the internal data representation. The `MTX` (Matrix Market) format
 93 | #'   allows users to load a sparse `dgCMatrix` representation. Choosing `HDF5`
 94 | #'   gives users a sparse `HDF5Array` class object.
 95 | #'     * pbmc_10x: 10K Peripheral Blood Mononuclear Cells provided by
 96 | #' [10x Genomics website](https://support.10xgenomics.com/single-cell-multiome-atac-gex/datasets)
 97 | #'     Cell quality control filters are available in the object `colData`
 98 | #'     together with the `celltype` annotation labels.
 99 | #'
100 | #' @inheritParams scNMT
101 | #'
102 | #' @param version `character(1)` Either version '1.0.0' or '1.0.1'
103 | #'   (default '1.0.0'); any other value raises an error
104 | #'
105 | #' @param format `character(1)` Either MTX or HDF5 data format (default MTX)
106 | #'
107 | #' @return A 10X PBMC `MultiAssayExperiment` object, or an informative
108 | #'   `data.frame` of the matching resources when `dry.run` is `TRUE`
109 | #'
110 | #' @examples
111 | #' scMultiome(DataType = "pbmc_10x", modes = "*", dry.run = TRUE)
112 | #'
113 | #' @export
114 | scMultiome <-
115 |     function(
116 |         DataType = "pbmc_10x", modes = "*", version = "1.0.0",
117 |         format = c("MTX", "HDF5"), dry.run = TRUE, verbose = TRUE, ...
118 |     )
119 | {
120 |     stopifnot(.isSingleChar(version), .isSingleChar(DataType))
121 |     format <- match.arg(format)
122 |     ## the call and the version become the metadata of the returned object
123 |     meta <- list(call = match.call(), version = version)
124 |     if (!version %in% c("1.0.0", "1.0.1"))
125 |         stop("Invalid 'version'; see '?scMultiome' for details.")
126 | 
127 |     ess_list <- .getResourcesList(prefix = "pbmc_", datatype = DataType,
128 |         modes = modes, version = version, dry.run = dry.run,
129 |         verbose = verbose, format = format, ...)
130 |     if (dry.run) { return(ess_list) }
131 | 
132 |     MultiAssayExperiment(
133 |         experiments = ess_list[["experiments"]],
134 |         colData = ess_list[["colData"]], sampleMap = ess_list[["sampleMap"]],
135 |         metadata = meta
136 |     )
137 | }
138 | 


--------------------------------------------------------------------------------
/R/scNMT.R:
--------------------------------------------------------------------------------
  1 | #' Single-cell Nucleosome, Methylation and Transcription sequencing
  2 | #'
  3 | #' @description scNMT assembles data on-the-fly from `ExperimentHub` to provide
  4 | #'   a
  5 | #'   [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
  6 | #'   container. The `DataType` argument provides access to the
  7 | #'   `mouse_gastrulation` dataset as obtained from Argelaguet et al. (2019; DOI:
  8 | #'   10.1038/s41586-019-1825-8). Pre-processing code can be seen at
  9 | #'   <https://github.com/rargelaguet/scnmt_gastrulation>. Protocol
 10 | #'   information for this dataset is available at Clark et al. (2018). See the
 11 | #'   vignette for the full citation.
 12 | #'
 13 | #' @details scNMT is a combination of RNA-seq (transcriptome) and an adaptation
 14 | #'   of Nucleosome Occupancy and Methylation sequencing (NOMe-seq, the
 15 | #'   methylome and chromatin accessibility) technologies. For more
 16 | #'   information, see Reik et al. (2018) DOI: 10.1038/s41467-018-03149-4
 17 | #'
 18 | #'  * mouse_gastrulation - this dataset provides cell quality control filters in
 19 | #'  the object `colData` starting from version 2.0.0. Additionally, cell types
 20 | #'  annotations are provided through the `lineage` `colData` column.
 21 | #'      * rna - RNA-seq
 22 | #'      * acc_\* - chromatin accessibility
 23 | #'      * met_\* - DNA methylation
 24 | #'          * cgi - CpG islands
 25 | #'          * CTCF - footprints of CTCF binding
 26 | #'          * DHS - DNase Hypersensitive Sites
 27 | #'          * genebody - gene bodies
 28 | #'          * p300 - p300 binding sites
 29 | #'          * promoter - gene promoters
 30 | #'
 31 | #'   Special thanks to Al J Abadi for preparing the published data in time
 32 | #'   for the 2020 BIRS Workshop, see the link here:
 33 | #'   <https://github.com/BIRSBiointegration/Hackathon/tree/master/scNMT-seq>
 34 | #'
 35 | #' @section versions:
 36 | #'   Version '1.0.0' of the scNMT mouse_gastrulation dataset includes all of
 37 | #'   the above mentioned assay technologies with filtering of cells based on
 38 | #'   quality control metrics. Version '2.0.0' contains all of the cells
 39 | #'   without the QC filter and does not contain CTCF binding footprints or
 40 | #'   p300 binding sites.
 41 | #'
 42 | #' @section metadata:
 43 | #'   The `MultiAssayExperiment` metadata stores the original function call
 44 | #'   and the data version requested.
 45 | #'
 46 | #' @param DataType `character(1)` Indicates study that produces this type of
 47 | #'   data (default: 'mouse_gastrulation')
 48 | #'
 49 | #' @param modes `character()` A wildcard / glob pattern of modes, such as
 50 | #'   `"acc*"`. A wildcard of `"*"` will return all modes including
 51 | #'   Chromatin Accessibility ("acc"), Methylation ("met"), RNA-seq ("rna")
 52 | #'   which is the default.
 53 | #'
 54 | #' @param version `character(1)` Either version '1.0.0' or '2.0.0' depending on
 55 | #'   data version required (default '1.0.0'). See version section.
 56 | #'
 57 | #' @param dry.run `logical(1)` Whether to return the dataset names before actual
 58 | #'   download (default `TRUE`)
 59 | #'
 60 | #' @param verbose `logical(1)` Whether to show the dataset currently being
 61 | #'   (down)loaded (default `TRUE`)
 62 | #'
 63 | #' @param ... Additional arguments passed on to the
 64 | #'   \link[ExperimentHub]{ExperimentHub-class} constructor
 65 | #'
 66 | #' @seealso SingleCellMultiModal-package
 67 | #'
 68 | #' @return A single cell multi-modal
 69 | #'   [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
 70 | #'   or informative `data.frame` when `dry.run` is `TRUE`
 71 | #'
 72 | #' @source <http://ftp.ebi.ac.uk/pub/databases/scnmt_gastrulation/>
 73 | #'
 74 | #' @references
 75 | #'   Argelaguet et al. (2019)
 76 | #'
 77 | #' @examples
 78 | #'
 79 | #' scNMT(DataType = "mouse_gastrulation", modes = "*",
 80 | #'     version = "1.0.0", dry.run = TRUE)
 81 | #'
 82 | #' @export scNMT
 83 | scNMT <-
 84 |     function(
 85 |         DataType = "mouse_gastrulation", modes = "*", version = "1.0.0",
 86 |         dry.run = TRUE, verbose = TRUE, ...
 87 |     )
 88 | {
 89 |     stopifnot(.isSingleChar(version), .isSingleChar(DataType))
 90 |     ## `version` has a default, so validate its value, not how it was passed
 91 |     if (!version %in% c("1.0.0", "2.0.0"))
 92 |         stop("Enter version '1.0.0' or '2.0.0'; see '?scNMT' for details.")
 93 |     meta <- list(call = match.call(), version = version)
 94 | 
 95 |     ess_list <- .getResourcesList(prefix = "scnmt_", datatype = DataType,
 96 |         modes = modes, version = version, dry.run = dry.run,
 97 |         verbose = verbose, ...)
 98 |     ## dry run: hand back the resource table without building the object
 99 |     if (dry.run) { return(ess_list) }
100 | 
101 |     MultiAssayExperiment(
102 |         experiments = ess_list[["experiments"]],
103 |         colData = ess_list[["colData"]],
104 |         sampleMap = ess_list[["sampleMap"]],
105 |         metadata = meta
106 |     )
107 | }
108 | 


--------------------------------------------------------------------------------
/R/seqFISH.R:
--------------------------------------------------------------------------------
  1 | #' Single-cell spatial + Gene Expression
  2 | #'
  3 | #' @description seqFISH function assembles data on-the-fly from `ExperimentHub`
  4 | #'   to provide a
  5 | #'   [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
  6 | #'   container. The `DataType` argument provides access to the
  7 | #'   available datasets associated to the package.
  8 | #'
  9 | #' @details seq-FISH data are a combination of single cell spatial coordinates
 10 | #'   and transcriptomics for a few hundreds of genes. seq-FISH data can be
 11 | #'   combined for example with scRNA-seq data to unveil multiple aspects of
 12 | #'   cellular behaviour based on their spatial organization and transcription.
 13 | #'
 14 | #' Available datasets are:
 15 | #' * mouse_visual_cortex: combination of seq-FISH data as obtained from Zhu
 16 | #' et al. (2018) and scRNA-seq data as obtained from Tasic et al. (2016).
 17 | #' Version 1.0.0 returns the full scRNA-seq data matrix, while version 2.0.0
 18 | #' returns the processed and subsetted scRNA-seq data matrix (produced for
 19 | #' the Mathematical Frameworks for Integrative Analysis of Emerging
 20 | #' Biological Data Types 2020 Workshop). The returned seqFISH data are always
 21 | #' the processed ones for the same workshop. Additionally, cell types
 22 | #' annotations are available in the `colData` through the `class` column in
 23 | #' the seqFISH `assay`.
 24 | #'     * scRNA_Counts - Tasic scRNA-seq gene count matrix
 25 | #'     * scRNA_Labels - Tasic scRNA-seq cell labels
 26 | #'     * seqFISH_Coordinates - Zhu seq-FISH spatial coordinates
 27 | #'     * seqFISH_Counts - Zhu seq-FISH gene counts matrix
 28 | #'     * seqFISH_Labels - Zhu seq-FISH cell labels
 29 | #'
 30 | #' @inheritParams scNMT
 31 | #'
 32 | #' @param DataType `character(1)` indicating the identifier of the dataset to
 33 | #'     retrieve, matched case-insensitively (default "mouse_visual_cortex")
 34 | #'
 35 | #' @param modes `character()` The assay types or modes of data to obtain these
 36 | #'     include seq-FISH and scRNA-seq data by default.
 37 | #'
 38 | #' @param version `character(1)` The data version to retrieve, '1.0.0' or
 39 | #'     '2.0.0'; required because there is no default. See details.
 40 | #'
 41 | #' @return A
 42 | #'   [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
 43 | #'   of seq-FISH data, or an informative `data.frame` when `dry.run` is `TRUE`
 44 | #'
 45 | #' @author Dario Righelli <dario.righelli <at> gmail.com>
 46 | #'
 47 | #' @importFrom SpatialExperiment SpatialExperiment
 48 | #' @importFrom SingleCellExperiment SingleCellExperiment
 49 | #' @importFrom S4Vectors DataFrame
 50 | #'
 51 | #' @examples
 52 | #' seqFISH(DataType = "mouse_visual_cortex", modes = "*", version = "2.0.0",
 53 | #'     dry.run = TRUE)
 54 | #'
 55 | #' @export
 56 | seqFISH <-
 57 |     function(
 58 |         DataType="mouse_visual_cortex", modes="*", version,
 59 |         dry.run=TRUE, verbose=TRUE, ...
 60 |     )
 61 | {
 62 |     ## `version` has no default: fail early with an actionable message
 63 |     if (missing(version))
 64 |         stop("Enter version '1.0.0' or '2.0.0'; see '?seqFISH' for details.")
 65 | 
 66 |     ess_list <- .getResourcesList(prefix = "seqfish_", datatype = DataType,
 67 |         modes = modes, version = version, dry.run = dry.run,
 68 |         verbose = verbose, ...)
 69 | 
 70 |     if (dry.run) { return(ess_list) }
 71 | 
 72 |     ## the resource lookup lower-cases the data type, so dispatch the same way
 73 |     switch(tolower(DataType),
 74 |         "mouse_visual_cortex" = .mouse_visual_cortex(
 75 |             modes_list = ess_list[["experiments"]], version = version
 76 |         ),
 77 |         ## Add here other seqFISH datasets based on DataType identifier
 78 |         stop("Unrecognized seqFISH dataset name")
 79 |     )
 80 | }
 81 | 
 82 | ## Build the mouse visual cortex MultiAssayExperiment from downloaded modes:
 83 | ## a SpatialExperiment (seqFISH) plus a SingleCellExperiment (scRNAseq).
 84 | .mouse_visual_cortex <- function(modes_list, version)
 85 | {
 86 |     ## version "1.0.0" reads the "scRNA_Full_*" entries, others "scRNA_*"
 87 |     res <- paste0("scRNA",
 88 |         if (identical(version, "1.0.0")) "_Full" else "",
 89 |         "_", c("Counts", "Labels")
 90 |     )
 91 | 
 92 |     ## discrepancy between labels in counts and colData: keep shared cells;
 93 |     ## drop = FALSE keeps the matrix / table shape for a single cell or column
 94 |     counts <- as.matrix(modes_list[[res[1]]])
 95 |     coldata <- modes_list[[res[2]]]
 96 |     vIDs <- intersect(rownames(coldata), colnames(counts))
 97 |     sce <- SingleCellExperiment::SingleCellExperiment(
 98 |         colData=coldata[vIDs, , drop = FALSE],
 99 |         assays=S4Vectors::SimpleList(counts=counts[, vIDs, drop = FALSE])
100 |     )
101 | 
102 |     se <- SpatialExperiment::SpatialExperiment(
103 |         rowData=rownames(modes_list$seqFISH_Counts),
104 |         colData=modes_list$seqFISH_Labels,
105 |         assays=S4Vectors::SimpleList(
106 |             counts=as.matrix(modes_list$seqFISH_Counts)),
107 |         spatialData=DataFrame(modes_list$seqFISH_Coordinates),
108 |         spatialCoordsNames=c("x", "y"))
109 | 
110 |     MultiAssayExperiment(
111 |         experiments = list(seqFISH = se, scRNAseq = sce)
112 |     )
113 | }
114 | 


--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
  1 | ## Prompt until the reply is one of `allowed`; returns it in lower case.
  2 | ## Non-interactive sessions never prompt and always answer "n".
  3 | .getAnswer <- function(msg, allowed)
  4 | {
  5 |     if (!interactive())
  6 |         return("n")
  7 | 
  8 |     repeat {
  9 |         cat(msg)
 10 |         reply <- readLines(n = 1)
 11 |         if (reply %in% allowed)
 12 |             return(tolower(reply))
 13 |     }
 14 | }
 15 | 
 16 | ## TRUE only for a character vector of length one that is not NA
 17 | .isSingleChar <- function(x)
 18 |     is.character(x) && length(x) == 1L && !is.na(x)
 19 | 
 20 | ## Base name of each path, cut at its first dot ("a/b.tar.gz" -> "b")
 21 | .removeExt <- function(fnames)
 22 |     sub("\\..*$", "", basename(fnames))
 23 | 
 24 | ## Mode names in `listfiles`: prefix, storage suffix and extension removed,
 25 | ## the metadata / colData / sampleMap entries excluded; sorted and unique.
 26 | .modesAvailable <- function(listfiles, prefix) {
 27 |     stems <- gsub(prefix, "", listfiles, fixed = TRUE)
 28 |     stems <- .removeExt(gsub("_assays|_se|_tenx", "", stems))
 29 |     sort(unique(stems[!stems %in% c("metadata", "colData", "sampleMap")]))
 30 | }
 31 | 
 32 | ## Match glob pattern(s) against `searchFields`, ignoring case. Matches are
 33 | ## de-duplicated so overlapping globs cannot select the same field twice.
 34 | .searchFromInputs <- function(glob, searchFields) {
 35 |     hits <- unique(unlist(lapply(glob2rx(unique(glob)), grep,
 36 |         x = searchFields, ignore.case = TRUE, value = TRUE)))
 37 |     if (!length(hits))
 38 |         stop("No matches found, modify search criteria")
 39 |     hits
 40 | }
 41 | 
 42 | ## TRUE per `testVec` entry that FUN flags for at least one `startVec` value.
 43 | ## Folding with `|` (unlike apply over vapply) also works for one entry.
 44 | .conditionToIndex <- function(startVec, testVec, FUN)
 45 |     Reduce(`|`, lapply(startVec, FUN), logical(length(testVec)))
 46 | 
 47 | ## One hub query per `RDataPath`, named by `Title`; the last match is kept.
 48 | .queryResources <- function(ExperimentHub, resTable, verbose) {
 49 |     paths <- stats::setNames(resTable[["RDataPath"]], resTable[["Title"]])
 50 |     lapply(paths, function(path) {
 51 |         if (verbose)
 52 |             message("Working on: ", gsub("\\.rda", "", basename(path)))
 53 |         utils::tail(query(ExperimentHub, path), 1)
 54 |     })
 55 | }
 56 | 
 57 | ## Load the queried resources. HDF5 / MTX matrices go through their
 58 | ## dedicated loaders; every remaining resource except the "se.rds" shells is
 59 | ## fetched directly. The directly fetched resources come first in the result.
 60 | .getResources <- function(ExperimentHub, resTable, prefix, verbose) {
 61 |     infos <- .queryResources(ExperimentHub, resTable, verbose)
 62 |     rpath <- vapply(infos, function(x) x$rdatapath, character(1L))
 63 | 
 64 |     isH5 <- grepl("\\.[Hh]5$", rpath)
 65 |     isMtx <- grepl("\\.[Mm][Tt][Xx]\\.[Gg][Zz]$", rpath)
 66 |     isShell <- grepl("se\\.[Rr][Dd][Ss]$", rpath)
 67 |     plain <- !(isH5 | isMtx | isShell)
 68 | 
 69 |     ## HDF5 takes precedence when both matrix formats are present
 70 |     matress <-
 71 |         if (any(isH5)) .loadHDF5(ExperimentHub, rpath, verbose)
 72 |         else if (any(isMtx)) .loadMTX(ExperimentHub, rpath, verbose)
 73 |         else list()
 74 | 
 75 |     if (!any(plain))
 76 |         return(matress)
 77 |     ## `[[` with 1L retrieves the first record of each query result
 78 |     c(lapply(infos[plain], `[[`, 1L), matress)
 79 | }
 80 | 
 81 | .getResourceInfo <- function(ExperimentHub, resTable, prefix, verbose) { # summary table of the matching hub resources
 82 |     infos <- .queryResources(ExperimentHub, resTable, verbose)
 83 |     resID <- vapply(infos, names, character(1L))  # one hub ID per resource
 84 |     restab <- AnnotationHub::getInfoOnIds(ExperimentHub, resID)
 85 |     restab <-
 86 |         restab[, !names(restab) %in% c("fetch_id", "status", "biocversion")]
 87 |     sizes <- as.numeric(restab[["file_size"]])
 88 |     class(sizes) <- "object_size"  # lets format() render the sizes in Mb
 89 |     titleidx <- which(names(restab) == "title")
 90 |     restab <- as.data.frame(append(  # insert the new columns after `title`
 91 |         restab,
 92 |         list(mode = gsub(prefix, "", restab[["title"]]),
 93 |             file_size = format(sizes, units = "Mb")),
 94 |         titleidx
 95 |     ))
 96 |     restab[, -c(length(restab), titleidx)]  # drop `title` and the last column (presumably the raw file_size - confirm)
 97 | }
 98 | 
 99 | ## Create an ExperimentHub; if that fails, retry with `localHub = TRUE`
100 | ## (warning first when the error message mentions a timeout).
101 | .test_eh <- function(...) {
102 |     useLocalHub <- function(e) {
103 |         if (grepl("Timeout", conditionMessage(e)))
104 |             warning("[experimenthub.bioconductor.org] timeout, localHub=TRUE",
105 |                 call.=FALSE)
106 |         ExperimentHub(..., localHub = TRUE)
107 |     }
108 |     tryCatch(ExperimentHub(...), error = useLocalHub)
109 | }
110 | 
111 | ## Same test as `.isSingleChar`: a single, non-NA character string
112 | .isSingleCharNA <- function(x)
113 |     length(x) == 1L && is.character(x) && !is.na(x)
114 | 
115 | ## Resolve the ExperimentHub resources for one dataset / version and either
116 | ## describe them (`dry.run = TRUE`) or load them. Loaded results are a list:
117 | ## `experiments` (an ExperimentList) plus the colData / metadata / sampleMap
118 | ## resources that the packaged metadata lists for the dataset.
119 | ##   prefix   resource-name prefix stripped to obtain mode names
120 | ##   modes    glob pattern(s) selecting the modes to return
121 | ##   format   optional "MTX" / "HDF5"; rows of the other format are excluded
122 | ##   ...      passed on to the ExperimentHub constructor
123 | .getResourcesList <-
124 |     function(prefix, datatype, modes, version, format, dry.run, verbose, ...)
125 | {
126 |     modes_file <- system.file("extdata", "metadata.csv",
127 |         package = "SingleCellMultiModal", mustWork = TRUE)
128 | 
129 |     DataType <- tolower(datatype)
130 |     stopifnot(.isSingleCharNA(DataType), .isSingleCharNA(version))
131 | 
132 |     modes_metadat <- read.csv(modes_file, stringsAsFactors = FALSE)
133 |     ## without `format`, use a sentinel that excludes no source type
134 |     notfmt <-
135 |         if (missing(format)) "FakeFormatNoMatch"
136 |         else switch(format, HDF5 = "MTX", MTX = "HDF5", format)
137 |     filt <- modes_metadat[["DataType"]] == DataType &
138 |         modes_metadat[["SourceVersion"]] == version &
139 |         modes_metadat[["SourceType"]] != notfmt
140 | 
141 |     modes_metadat <- modes_metadat[filt, , drop = FALSE]
142 |     eh_assays <- modes_metadat[["ResourceName"]]
143 |     modesAvail <- .modesAvailable(eh_assays, prefix)
144 |     resultModes <- .searchFromInputs(modes, modesAvail)
145 |     fileIdx <- .conditionToIndex(
146 |         resultModes, eh_assays, function(x) grepl(x, eh_assays)
147 |     )
148 |     ## the same resource rows serve the dry-run summary and the download
149 |     modeRes <- modes_metadat[fileIdx, c("Title", "RDataPath")]
150 |     eh <- .test_eh(...)
151 | 
152 |     if (dry.run)
153 |         return(.getResourceInfo(eh, modeRes, prefix, FALSE))
154 | 
155 |     modes_list <- .getResources(eh, modeRes, prefix, verbose)
156 |     names(modes_list) <- gsub(prefix, "", names(modes_list))
157 |     eh_experiments <- ExperimentList(modes_list)[resultModes]
158 | 
159 |     ## load the non-assay pieces: colData, metadata and sampleMap
160 |     ess_names <- c("colData", "metadata", "sampleMap")
161 |     ess_idx <- .conditionToIndex(ess_names, eh_assays,
162 |         function(x) grepl(x, eh_assays))
163 |     ess_list <- .getResources(eh,
164 |         modes_metadat[ess_idx, c("Title", "RDataPath")], prefix, verbose)
165 |     names(ess_list) <- gsub(prefix, "", names(ess_list))
166 | 
167 |     c(list(experiments = eh_experiments), ess_list)
168 | }
169 | 
170 | ## Outer-join the colData of every element of `x` on their row names.
171 | ## Row names are restored after each pairwise merge so that a third (or
172 | ## later) table is still matched on the original identifiers.
173 | .mergeLowColData <- function(x) {
174 |     mergePair <- function(lhs, rhs) {
175 |         both <- S4Vectors::merge(lhs, rhs, by = "row.names", all = TRUE)
176 |         rownames(both) <- both[["Row.names"]]
177 |         ## drop = FALSE: keep a table even if a single column remains
178 |         both[, colnames(both) != "Row.names", drop = FALSE]
179 |     }
180 |     ## a single element is returned unchanged; an empty `x` gives NULL
181 |     Reduce(mergePair, lapply(x, colData))
182 | }
183 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # <a href='https://waldronlab.github.io/SingleCellMultiModal'><img src='https://raw.githubusercontent.com/Bioconductor/BiocStickers/devel/SingleCellMultiModal/SingleCellMultiModal.png' align="right" height="139" /></a>
 3 | 
 4 | # SingleCellMultiModal
 5 | 
 6 | ## Overview
 7 | 
 8 | `SingleCellMultiModal` is an R package that provides a convenient and
 9 | user-friendly representation of multi-modal data using
10 | `MultiAssayExperiment`. This package introduces a suite of single-cell
11 | multimodal landmark datasets for benchmarking and testing multimodal
12 | analysis methods via the `ExperimentHub` Bioconductor package. The scope
13 | of this package is to provide efficient access to a selection of
14 | curated, pre-integrated, publicly available landmark datasets for
15 | methods development and benchmarking.
16 | 
17 | ## Installation
18 | 
19 | ``` r
20 | if (!requireNamespace("BiocManager", quietly = TRUE))
21 |     install.packages("BiocManager")
22 | 
23 | BiocManager::install("SingleCellMultiModal")
24 | ```
25 | 
26 | ## Loading packages
27 | 
28 | ``` r
29 | library(SingleCellMultiModal)
30 | library(MultiAssayExperiment)
31 | ```
32 | 
33 | # Citing SingleCellMultiModal
34 | 
35 | Your citations are crucial in keeping our software free and open source.
36 | To cite our package, see the citation (Eckenrode et al. 2023) in the
37 | References section. You may also browse to the publication at [PLoS
38 | Computational Biology](https://doi.org/10.1371/journal.pcbi.1011324).
39 | 
40 | ## Representation
41 | 
42 | Users can obtain integrative representations of multiple modalities as a
43 | `MultiAssayExperiment`, a common core Bioconductor data structure relied
44 | on by dozens of multimodal data analysis packages.
45 | `MultiAssayExperiment` harmonizes data management of multiple
46 | experimental assays performed on an overlapping set of specimens.
47 | Although originally developed for patient data from multi-omics cancer
48 | studies, the `MultiAssayExperiment` framework naturally applies also to
49 | single cells. A schematic of the data structure can be seen below. In
50 | this context, “patients” are replaced by “cells”. We use
51 | `MultiAssayExperiment` because it provides a familiar user experience by
52 | extending `SummarizedExperiment` concepts and providing open ended
53 | compatibility with standard data classes present in Bioconductor such as
54 | the `SingleCellExperiment`.
55 | 
56 | <img src="https://github.com/waldronlab/MultiAssayExperiment/blob/c3c59a094e5a08111ee98b9f69579db5634d9fd4/vignettes/MultiAssayExperiment.png?raw=true" width="100%" />
57 | 
58 | # Contributions
59 | 
60 | Want to contribute to the `SingleCellMultiModal` package? We welcome
61 | contributions from the community. Please refer to our [Contributing
62 | Guidelines](https://github.com/waldronlab/SingleCellMultiModal/wiki/Contributing-Guidelines)
63 | for more details.
64 | 
65 | ## Further resources
66 | 
67 | For more information on the `MultiAssayExperiment` data structure,
68 | please refer to Ramos et al. (2017) as well as the [MultiAssayExperiment
69 | vignette](https://bioconductor.org/packages/release/bioc/vignettes/MultiAssayExperiment/inst/doc/MultiAssayExperiment.html).
70 | 
71 | # References
72 | 
73 | <div id="refs" class="references csl-bib-body hanging-indent"
74 | entry-spacing="0">
75 | 
76 | <div id="ref-Eckenrode2023-yq" class="csl-entry">
77 | 
78 | Eckenrode, Kelly B, Dario Righelli, Marcel Ramos, Ricard Argelaguet,
79 | Christophe Vanderaa, Ludwig Geistlinger, Aedin C Culhane, et al. 2023.
80 | “Curated Single Cell Multimodal Landmark Datasets for R/Bioconductor.”
81 | *PLoS Comput. Biol.* 19 (8): e1011324.
82 | 
83 | </div>
84 | 
85 | <div id="ref-Ramos2017-tk" class="csl-entry">
86 | 
87 | Ramos, Marcel, Lucas Schiffer, Angela Re, Rimsha Azhar, Azfar Basunia,
88 | Carmen Rodriguez, Tiffany Chan, et al. 2017. “Software for the
89 | Integration of Multiomics Experiments in Bioconductor.” *Cancer Res.* 77
90 | (21): e39–42.
91 | 
92 | </div>
93 | 
94 | </div>
95 | 


--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
1 | title: SingleCellMultiModal
2 | url: https://waldronlab.github.io/SingleCellMultiModal
3 | 
4 | template:
5 |   bootstrap: 5
6 |   params:
7 |     bootswatch: flatly
8 | 


--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
 1 | citHeader("To cite SingleCellMultiModal in publication use:")
 2 | 
 3 | bibentry(
 4 |     bibtype = "Article",
 5 |     title =
 6 |         "Curated single cell multimodal landmark datasets for R/Bioconductor",
 7 |     ## roles are passed by name: the fourth positional argument of person()
 8 |     ## is `email`, so an unnamed "ctb" / "aut" would be stored as an address
 9 |     author = c(
10 |         person("Kelly B", "Eckenrode", role = "ctb"),
11 |         person(
12 |             "Dario", "Righelli", role = "aut"
13 |         ),
14 |         person(
15 |             "Marcel", "Ramos", role = c("aut", "cre"),
16 |             comment = c(ORCID = "0000-0002-3242-0582")
17 |         ),
18 |         person(
19 |             "Ricard", "Argelaguet", role = "ctb"
20 |         ),
21 |         person(
22 |             "Christophe", "Vanderaa", role = "aut"
23 |         ),
24 |         person(
25 |             "Ludwig", "Geistlinger", role = "aut",
26 |             comment = c(ORCID = "0000-0002-2495-5464")
27 |         ),
28 |         person(
29 |             "Aedin C", "Culhane", role = "ctb"
30 |         ),
31 |         person(
32 |             "Laurent", "Gatto", role = "ctb"
33 |         ),
34 |         person(
35 |             "Vincent J", "Carey", role = "ctb",
36 |             comment = c(ORCID = "0000-0003-4046-0063")
37 |         ),
38 |         person(
39 |             "Martin", "Morgan", role = "ctb",
40 |             comment = c(ORCID = "0000-0002-5874-8148")
41 |         ),
42 |         person(
43 |             "Davide", "Risso", role = "ctb"
44 |         ),
45 |         person(
46 |             "Levi", "Waldron", role = "ctb",
47 |             comment = c(ORCID = "0000-0003-2725-0694")
48 |         )
49 |     ),
50 |     journal = "PLoS Comput. Biol.",
51 |     year = "2023",
52 |     volume = "19",
53 |     number = "8",
54 |     doi = "10.1371/journal.pcbi.1011324"
55 | )
56 | 
57 | 


--------------------------------------------------------------------------------
/inst/REFERENCES.bib:
--------------------------------------------------------------------------------
  1 | @ARTICLE{Macaulay2015,
  2 |   title    = "{G\&T-seq}: parallel sequencing of single-cell genomes and
  3 |               transcriptomes",
  4 |   author   = "Macaulay, Iain C and Haerty, Wilfried and Kumar, Parveen and Li,
  5 |               Yang I and Hu, Tim Xiaoming and Teng, Mabel J and Goolam, Mubeen
  6 |               and Saurat, Nathalie and Coupland, Paul and Shirley, Lesley M and
  7 |               Smith, Miriam and Van der Aa, Niels and Banerjee, Ruby and Ellis,
  8 |               Peter D and Quail, Michael A and Swerdlow, Harold P and
  9 |               Zernicka-Goetz, Magdalena and Livesey, Frederick J and Ponting,
 10 |               Chris P and Voet, Thierry",
 11 |   abstract = "G\&T-seq offers robust full-length transcript and whole-genome
 12 |               sequencing simultaneously from a single cell.",
 13 |   journal  = "Nat. Methods",
 14 |   volume   =  12,
 15 |   number   =  6,
 16 |   pages    = "519--522",
 17 |   month    =  jun,
 18 |   year     =  2015
 19 | }
 20 | 
 21 | @ARTICLE{Macaulay2016,
 22 |   title    = "Separation and parallel sequencing of the genomes and
 23 |               transcriptomes of single cells using {G\&T-seq}",
 24 |   author   = "Macaulay, Iain C and Teng, Mabel J and Haerty, Wilfried and
 25 |               Kumar, Parveen and Ponting, Chris P and Voet, Thierry",
 26 |   journal  = "Nat. Protoc.",
 27 |   volume   =  11,
 28 |   number   =  11,
 29 |   pages    = "2081--2103",
 30 |   month    =  nov,
 31 |   year     =  2016,
 32 |   language = "en"
 33 | }
 34 | 
 35 | @ARTICLE{Argelaguet2019-et,
 36 |   title    = "Multi-omics profiling of mouse gastrulation at single-cell
 37 |               resolution",
 38 |   author   = "Argelaguet, Ricard and Clark, Stephen J and Mohammed, Hisham and
 39 |               Stapel, L Carine and Krueger, Christel and Kapourani,
 40 |               Chantriolnt-Andreas and Imaz-Rosshandler, Ivan and Lohoff, Tim
 41 |               and Xiang, Yunlong and Hanna, Courtney W and Smallwood, Sebastien
 42 |               and Ibarra-Soria, Ximena and Buettner, Florian and Sanguinetti,
 43 |               Guido and Xie, Wei and Krueger, Felix and G{\"o}ttgens, Berthold
 44 |               and Rugg-Gunn, Peter J and Kelsey, Gavin and Dean, Wendy and
 45 |               Nichols, Jennifer and Stegle, Oliver and Marioni, John C and
 46 |               Reik, Wolf",
 47 |   journal  = "Nature",
 48 |   volume   =  576,
 49 |   number   =  7787,
 50 |   pages    = "487--491",
 51 |   month    =  dec,
 52 |   year     =  2019,
 53 |   language = "en"
 54 | }
 55 | 
 56 | @ARTICLE{Clark2018-qg,
 57 |   title    = "{scNMT-seq} enables joint profiling of chromatin accessibility
 58 |               {DNA} methylation and transcription in single cells",
 59 |   author   = "Clark, Stephen J and Argelaguet, Ricard and Kapourani,
 60 |               Chantriolnt-Andreas and Stubbs, Thomas M and Lee, Heather J and
 61 |               Alda-Catalinas, Celia and Krueger, Felix and Sanguinetti, Guido
 62 |               and Kelsey, Gavin and Marioni, John C and Stegle, Oliver and
 63 |               Reik, Wolf",
 64 |   journal  = "Nat. Commun.",
 65 |   volume   =  9,
 66 |   number   =  1,
 67 |   pages    = "781",
 68 |   month    =  feb,
 69 |   year     =  2018,
 70 |   language = "en"
 71 | }
 72 | 
 73 | @ARTICLE{Zhu2018identification,
 74 |   title    = "Identification of spatially associated subpopulations by 
 75 |                 combining {scRNAseq} and sequential fluorescence in situ 
 76 |                 hybridization data",
 77 |   author   = "Zhu, Qian and Shah, Sheel and Dries, Ruben and Cai, Long and 
 78 |                 Yuan, Guo-Cheng",
 79 |   journal   = "Nature biotechnology",
 80 |   volume    = 36,
 81 |   number    = 12,
 82 |   pages     = 1183,
 83 |   year      = 2018,
 84 |   language  = "en"
 85 | }
 86 | 
 87 | @ARTICLE{Tasic2016adult,
 88 |   title    = "Adult mouse cortical cell taxonomy revealed by single cell 
 89 |                 transcriptomics",
 90 |   author   = "Tasic, Bosiljka and Menon, Vilas and Nguyen, Thuc Nghi and 
 91 |                 Kim, Tae Kyung and Jarsky, Tim and Yao, Zizhen and 
 92 |                 Levi, Boaz and Gray, Lucas T and Sorensen, Staci A and 
 93 |                 Dolbeare, Tim and others",
 94 |   journal  = "Nature neuroscience",
 95 |   volume   = 19,
 96 |   number   = 2,
 97 |   pages    = 335,
 98 |   year     = 2016,
 99 |   language  = "en"
100 | }
101 | 
102 | @ARTICLE{stoeckius2017simultaneous,
103 |   title = {Simultaneous epitope and transcriptome measurement in single cells},
104 |   author = {Stoeckius, Marlon and Hafemeister, Christoph and 
105 |   			Stephenson, William and Houck-Loomis, Brian and 
106 |   			Chattopadhyay, Pratip K and Swerdlow, Harold and 
107 |   			Satija, Rahul and Smibert, Peter},
108 |   journal = {Nature methods},
109 |   volume = {14},
110 |   number = {9},
111 |   pages = {865},
112 |   year = {2017},
113 |   publisher = {Nature Publishing Group}
114 | }
115 | 
116 | 
117 | @ARTICLE{Specht2021-pm,
118 |   title    = "Single-cell proteomic and transcriptomic analysis of macrophage
119 |               heterogeneity using {SCoPE2}",
120 |   author   = "Specht, Harrison and Emmott, Edward and Petelski, Aleksandra A
121 |               and Huffman, R Gray and Perlman, David H and Serra, Marco and
122 |               Kharchenko, Peter and Koller, Antonius and Slavov, Nikolai",
123 |   journal  = "Genome Biol.",
124 |   volume   =  22,
125 |   number   =  1,
126 |   pages    = "50",
127 |   month    =  jan,
128 |   year     =  2021,
129 |   language = "en"
130 | }
131 | 
132 | @ARTICLE{mimitou2019multiplexed,
133 |   title={Multiplexed detection of proteins, transcriptomes, clonotypes and 
134 |         CRISPR perturbations in single cells},
135 |   author={Mimitou, Eleni P and Cheng, Anthony and Montalbano, Antonino and Hao, 
136 |         Stephanie and Stoeckius, Marlon and Legut, Mateusz and Roush, Timothy 
137 |         and Herrera, Alberto and Papalexi, Efthymia and Ouyang, Zhengqing 
138 |         and others},
139 |   journal={Nature methods},
140 |   volume={16},
141 |   number={5},
142 |   pages={409--412},
143 |   year={2019},
144 |   publisher={Nature Publishing Group}
145 | }
146 | 
147 | @ARTICLE{Ramos2017-tk,
148 |   title    = "Software for the Integration of Multiomics Experiments in
149 |               Bioconductor",
150 |   author   = "Ramos, Marcel and Schiffer, Lucas and Re, Angela and Azhar,
151 |               Rimsha and Basunia, Azfar and Rodriguez, Carmen and Chan, Tiffany
152 |               and Chapman, Phil and Davis, Sean R and Gomez-Cabrero, David and
153 |               Culhane, Aedin C and Haibe-Kains, Benjamin and Hansen, Kasper D
154 |               and Kodali, Hanish and Louis, Marie S and Mer, Arvind S and
155 |               Riester, Markus and Morgan, Martin and Carey, Vince and Waldron,
156 |               Levi",
157 |   journal  = "Cancer Res.",
158 |   volume   =  77,
159 |   number   =  21,
160 |   pages    = "e39--e42",
161 |   month    =  nov,
162 |   year     =  2017,
163 |   language = "en"
164 | }
165 | 
166 | @ARTICLE{Eckenrode2023-yq,
167 |   title    = "Curated single cell multimodal landmark datasets for
168 |               {R/Bioconductor}",
169 |   author   = "Eckenrode, Kelly B and Righelli, Dario and Ramos, Marcel and
170 |               Argelaguet, Ricard and Vanderaa, Christophe and Geistlinger,
171 |               Ludwig and Culhane, Aedin C and Gatto, Laurent and Carey, Vincent
172 |               and Morgan, Martin and Risso, Davide and Waldron, Levi",
173 |   journal  = "PLoS Comput. Biol.",
174 |   volume   =  19,
175 |   number   =  8,
176 |   pages    = "e1011324",
177 |   month    =  aug,
178 |   year     =  2023,
179 |   language = "en"
180 | }
181 | 


--------------------------------------------------------------------------------
/inst/extdata/docuData/singlecellmultimodalv1.csv:
--------------------------------------------------------------------------------
1 | "DataProvider","TaxonomyId","Species","SourceUrl","SourceType","SourceVersion","DataType","Maintainer"
2 | "Dept. of Bioinformatics, The Babraham Institute, United Kingdom","10090","Mus musculus","https://cloudstor.aarnet.edu.au/plus/s/Xzf5vCgAEUVgbfQ","RDS","1.0.0","mouse_gastrulation","Marcel Ramos <marcel.ramos@roswellpark.org>"
3 | 


--------------------------------------------------------------------------------
/inst/extdata/docuData/singlecellmultimodalv2.csv:
--------------------------------------------------------------------------------
1 | "DataProvider","TaxonomyId","Species","SourceUrl","SourceType","SourceVersion","DataType","Maintainer"
2 | "Dept. of Bioinformatics, The Babraham Institute, United Kingdom","10090","Mus musculus","https://cloudstor.aarnet.edu.au/plus/s/Xzf5vCgAEUVgbfQ","RDS","1.0.0","mouse_gastrulation","Marcel Ramos <marcel.ramos@roswellpark.org>"
3 | "Dept. of Bioinformatics, The Babraham Institute, United Kingdom","10090","Mus musculus","https://cloudstor.aarnet.edu.au/plus/s/Xzf5vCgAEUVgbfQ","RDS","2.0.0","mouse_gastrulation","Marcel Ramos <marcel.ramos@roswellpark.org>"
4 | 


--------------------------------------------------------------------------------
/inst/extdata/docuData/singlecellmultimodalv4.csv:
--------------------------------------------------------------------------------
1 | "DataProvider","TaxonomyId","Species","SourceUrl","SourceType","SourceVersion","DataType","Maintainer"
2 | "Dept. of Bioinformatics, The Babraham Institute, United Kingdom","10090","Mus musculus","https://cloudstor.aarnet.edu.au/plus/s/Xzf5vCgAEUVgbfQ","RDS","1.0.0","mouse_gastrulation","Marcel Ramos <marcel.ramos@roswellpark.org>"
3 | "Dept. of Bioinformatics, The Babraham Institute, United Kingdom","10090","Mus musculus","https://cloudstor.aarnet.edu.au/plus/s/Xzf5vCgAEUVgbfQ","RDS","2.0.0","mouse_gastrulation","Marcel Ramos <marcel.ramos@roswellpark.org>"
4 | "Dept. of Molecular Genetics, Allen Institute for Brain Science, United States","10090","Mus musculus","https://www.dropbox.com/sh/avj4nrd4la5i88u/AACafWwBbE-xsLvOGDwRZDpYa?dl=0","TXT","1.0.0","mouse_visual_cortex","Dario Righelli <dario.righelli@gmail.com>"
5 | "Dept. of Molecular Genetics, Allen Institute for Brain Science, United States","10090","Mus musculus","https://www.dropbox.com/sh/avj4nrd4la5i88u/AACafWwBbE-xsLvOGDwRZDpYa?dl=0","TXT","2.0.0","mouse_visual_cortex","Dario Righelli <dario.righelli@gmail.com>"
6 | "Innovation Lab, New York Genome Center, New York, United States","9606","Homo sapiens","https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE100866","TXT","1.0.0","cord_blood","Dario Righelli <dario.righelli@gmail.com>"
7 | 


--------------------------------------------------------------------------------
/inst/extdata/docuData/singlecellmultimodalv5.csv:
--------------------------------------------------------------------------------
1 | "DataProvider","TaxonomyId","Species","SourceUrl","SourceType","SourceVersion","DataType","Maintainer"
2 | "Technology Innovation Lab, New York Genome Center, New York, United States","9606","Homo sapiens","https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE126310","TXT","1.0.0","peripheral_blood","Dario Righelli <dario.righelli@gmail.com>"
3 | 


--------------------------------------------------------------------------------
/inst/extdata/docuData/singlecellmultimodalv6.csv:
--------------------------------------------------------------------------------
1 | "DataProvider","TaxonomyId","Species","SourceUrl","SourceVersion","DataType","Maintainer"
2 | "European Bioinformatics Institute (EMBL-EBI), United Kingdom","9606","Homo sapiens","http://ftp.ebi.ac.uk/pub/databases/mofa/10x_rna_atac_vignette/filtered_feature_bc_matrix/","1.0.0","pbmc_10x","Ricard Argelaguet <ricard@ebi.ac.uk>"


--------------------------------------------------------------------------------
/inst/extdata/docuData/singlecellmultimodalv7.csv:
--------------------------------------------------------------------------------
1 | "DataProvider","TaxonomyId","Species","SourceUrl","SourceType","SourceVersion","DataType","Maintainer"
2 | "Slavov Laboratory and SCP Center at Northeastern University, Boston, United States","9606","Homo sapiens","https://drive.google.com/file/d/1sF5STkofF_f2msnYaaYdWabou84Qf2Xr/view?usp=sharing","CSV","1.0.0","macrophage_differentiation","Christophe Vanderaa <christophe.vanderaa@uclouvain.be>"
3 | "Slavov Laboratory and SCP Center at Northeastern University, Boston, United States","9606","Homo sapiens","https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE142392","CSV","1.0.0","macrophage_differentiation","Christophe Vanderaa <christophe.vanderaa@uclouvain.be>"
4 | "Slavov Laboratory and SCP Center at Northeastern University, Boston, United States","9606","Homo sapiens","https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE142392","CSV","1.0.0","macrophage_differentiation","Christophe Vanderaa <christophe.vanderaa@uclouvain.be>"
5 | 


--------------------------------------------------------------------------------
/inst/extdata/docuData/singlecellmultimodalv8.csv:
--------------------------------------------------------------------------------
1 | "DataProvider","TaxonomyId","Species","SourceUrl","SourceVersion","DataType","Maintainer"
2 | "Wellcome Trust Sanger Institute, Cambridge, United Kingdom","10090","Mus musculus","https://www.ebi.ac.uk/ena/browser/view/PRJEB9051","1.0.0","mouse_embryo_8_cell","Ludwig Geistlinger <ludwig_geistlinger@hms.harvard.edu>"
3 | 


--------------------------------------------------------------------------------
/inst/extdata/docuData/singlecellmultimodalv9.csv:
--------------------------------------------------------------------------------
1 | "DataProvider","TaxonomyId","Species","SourceUrl","SourceVersion","DataType","Maintainer"
2 | "European Bioinformatics Institute (EMBL-EBI), United Kingdom","9606","Homo sapiens","http://ftp.ebi.ac.uk/pub/databases/mofa/10x_rna_atac_vignette/filtered_feature_bc_matrix/","1.0.1","pbmc_10x","Marcel Ramos <marcel.ramos@roswellpark.org>"
3 | 


--------------------------------------------------------------------------------
/inst/extdata/ontomap.tsv:
--------------------------------------------------------------------------------
 1 | DataType	function_name	original_column_name	original_cell_name	ontology_ID	ontology_cell_name
 2 | macrophage_differentiation	SCoPE2	celltype	Macrophage	NCIT:C12558	Macrophage
 3 | macrophage_differentiation	SCoPE2	celltype	Monocyte	NCIT:C12547	Monocyte
 4 | mouse_gastrulation	scNMT	lineage	Epiblast	NCIT:C34164	Epiblast
 5 | mouse_gastrulation	scNMT	lineage	Mesoderm	NCIT:C12750	Mesoderm
 6 | mouse_gastrulation	scNMT	lineage	Primitive_Streak	NCIT:C28402	Primitive_Streak
 7 | mouse_gastrulation	scNMT	lineage	Ectoderm	NCIT:C12703	Ectoderm
 8 | mouse_gastrulation	scNMT	lineage	Endoderm	NCIT:C12706	Endoderm
 9 | mouse_gastrulation	scNMT	lineage	ExE_ectoderm	NCIT:C12703	Ectoderm
10 | mouse_gastrulation	scNMT	lineage	Visceral_endoderm	UBERON:0004877	visceral endoderm
11 | mouse_gastrulation	scNMT	lineage	Primitive_endoderm	BTO:0002123	primitive endoderm
12 | mouse_visual_cortex	seqFISH	class	Glutamatergic Neuron	CL:0000679	glutamatergic neuron
13 | mouse_visual_cortex	seqFISH	class	GABA-ergic Neuron	CL:0000617	GABAergic neuron
14 | mouse_visual_cortex	seqFISH	class	Oligodendrocyte.3	CL:0000128	oligodendrocyte
15 | mouse_visual_cortex	seqFISH	class	Endothelial Cell	NCIT:C12865	Endothelial Cell
16 | mouse_visual_cortex	seqFISH	class	Astrocyte	NCIT:C12477	Astrocyte
17 | mouse_visual_cortex	seqFISH	class	Oligodendrocyte.2	CL:0000128	oligodendrocyte
18 | mouse_visual_cortex	seqFISH	class	Microglia	NCIT:C12616	Microglia
19 | mouse_visual_cortex	seqFISH	class	Oligodendrocyte.1	CL:0000128	oligodendrocyte
20 | pbmc_10x	scMultiome	celltype	naive CD4 T cells	NCIT:C12537	CD4-Positive T-Lymphocyte
21 | pbmc_10x	scMultiome	celltype	memory CD4 T cells	NCIT:C97349	CD4 Positive Memory T-Lymphocyte
22 | pbmc_10x	scMultiome	celltype	non-classical monocytes	CL:0000875	non-classical monocyte
23 | pbmc_10x	scMultiome	celltype	naive CD8 T cells	ASCTB-TEMP_cd8-t-cells	cd8+ t-cells
24 | pbmc_10x	scMultiome	celltype	CD56 (bright) NK cells	CL:0000938	CD16-negative, CD56-bright natural killer cell
25 | pbmc_10x	scMultiome	celltype	classical monocytes	CL:0000860	classic monocytes
26 | pbmc_10x	scMultiome	celltype	effector CD8 T cells	NCIT:C126419	Effector Memory T-Lymphocyte
27 | pbmc_10x	scMultiome	celltype	myeloid DC	CL:0000782	myeloid dendritic cell
28 | pbmc_10x	scMultiome	celltype	intermediate monocytes	CL:0002393	intermediate monocyte
29 | pbmc_10x	scMultiome	celltype	memory B cells	CL:0000787	memory B cell
30 | pbmc_10x	scMultiome	celltype	MAIT T cells	NCIT:C115217	Mucosal-Associated Invariant T-Cell
31 | pbmc_10x	scMultiome	celltype	CD56 (dim) NK cells	CL:0000939	CD16-positive, CD56-dim natural killer cell
32 | pbmc_10x	scMultiome	celltype	naive B cells	CL:0000788	naive B cell
33 | pbmc_10x	scMultiome	celltype	plasmacytoid DC	CL:0000784	plasmacytoid dendritic cell
34 | cord_blood	CITEseq	celltype	Natural Killers	OMIT:0026379	Natural Killer T-Cells
35 | cord_blood	CITEseq	celltype	Precursors	OMIT:0012443	Protein Precursors
36 | cord_blood	CITEseq	celltype	CD4 T-cells	ASCTB-TEMP_cd4-t	cd4+ t
37 | cord_blood	CITEseq	celltype	CD8 T-cells	ASCTB-TEMP_cd8-t-cells	cd8+ t-cells
38 | cord_blood	CITEseq	celltype	B-cells	NCIT:C12474	B-Lymphocyte
39 | cord_blood	CITEseq	celltype	Monocytes CD14+	CL:0001054	CD14-positive monocyte
40 | cord_blood	CITEseq	celltype	T-cells	OMIT:0026379	Natural Killer T-Cells
41 | cord_blood	CITEseq	celltype	Monocytes CD16+	NCIT:C12547	Monocyte
42 | 


--------------------------------------------------------------------------------
/inst/scripts/CITEseq_celltypes.R:
--------------------------------------------------------------------------------
  1 | ## Annotate species and cell types for the CITE-seq cord blood dataset and
  2 | ## rewrite the colData / CLR-transformed ADT data products.
  3 | ## Meant to be run interactively from the package root (uses load_all()).
  4 | library(devtools)
  5 | library(SingleCellExperiment)
  6 | ## runPCA(), runTSNE() and plotReducedDim() are provided by scater
  7 | library(scater)
  8 | load_all()
  9 | 
 10 | ## input / output locations used below
 11 | adt.clr.file <- "~/Downloads/GSE100866_CBMC_8K_13AB_10X-ADT_clr-transformed.csv"
 12 | coldata.file <- "cord_blood/v1.0.0/coldata_scRNAseq.rda"
 13 | adt.clr.out <- "cord_blood/v1.0.0/scADT_clrCounts.rda"
 14 | 
 15 | mae <- CITEseq("cord_blood", dry.run=FALSE)
 16 | 
 17 | ## Detecting MOUSE/HUMAN cells
 18 | rna <- assays(mae)[["scRNAseq"]]
 19 | hrna <- colSums(rna[grep("^HUMAN", rownames(rna)),])
 20 | mrna <- colSums(rna[grep("^MOUSE", rownames(rna)),])
 21 | ## per-cell log total counts: column 1 is human, column 2 is mouse
 22 | mat <- cbind(log(hrna+1), log(mrna+1))
 23 | plot(mat, xlab="hrna", ylab="mrna")
 24 | ## Using kmeans for detecting 2 bigger clusters of human and mouse cells
 25 | set.seed(666)
 26 | km <- kmeans(mat, centers=2)
 27 | plot(mat, xlab="hrna", ylab="mrna", col=km$cluster)
 28 | ## kmeans cluster ids depend on the random start, so identify the human
 29 | ## cluster from the centers instead of hard-coding its id
 30 | human.k <- which.max(km$centers[, 1] - km$centers[, 2])
 31 | mouse.k <- setdiff(seq_len(nrow(km$centers)), human.k)
 32 | 
 33 | ## computing distance+hclust on the human cells cluster for detecting mixed cells
 34 | ishuman <- km$cluster == human.k
 35 | d <- dist(mat[ishuman,])
 36 | hc <- hclust(d, method="single")
 37 | cl <- cutree(hc, k=2)
 38 | plot(mat[ishuman,], col=cl)
 39 | ## NOTE(review): assumes cutree group 1 holds the pure human cells and
 40 | ## group 2 the mixed ones -- confirm on the plot above
 41 | hbc <- names(cl)[cl==1]
 42 | mixbc <- names(cl)[cl==2]
 43 | mousebc <- names(km$cluster)[km$cluster == mouse.k]
 44 | 
 45 | # cd <- colData(mae)
 46 | load(coldata.file)
 47 | cd <- coldata_scRNAseq
 48 | 
 49 | cd$species <- NA
 50 | cd$species[rownames(cd) %in% hbc] <- "HUMAN"
 51 | cd$species[rownames(cd) %in% mixbc] <- "MIXED"
 52 | cd$species[rownames(cd) %in% mousebc] <- "MOUSE"
 53 | table(cd$species)
 54 | 
 55 | ##### Annotating cell types
 56 | adtclrgeo <- as.matrix(read.csv(adt.clr.file, row.names=1))
 57 | ## add this assay to the ADTs
 58 | cd$celltype <- NA
 59 | cd$markers <- NA
 60 | 
 61 | ## keep the non-discarded human cells; %in% drops cells whose species is NA
 62 | ## instead of returning NA rows
 63 | cdct <- cd[!cd$discard,]
 64 | cdct <- cdct[cdct$species %in% "HUMAN",]
 65 | 
 66 | out.cd19.cd3 <- getCellGroups(adtclrgeo, adt1="CD19", adt2="CD3", th1=0.9, th2=0.6)
 67 | 
 68 | cdct <- addCTLabels(cdct, out.cd19.cd3, "CD19-/CD3+", "T-cells")
 69 | cdct <- addCTLabels(cdct, out.cd19.cd3, "CD19+/CD3-", "B-cells")
 70 | 
 71 | out.cd11.cd14 <- getCellGroups(adtclrgeo, adt1="CD11c", adt2="CD14", th1=0.4, th2=0.55)
 72 | cdct <- addCTLabels(cdct, out.cd11.cd14, "CD11c+/CD14+", "Monocytes CD14+")
 73 | table(cdct$celltype)
 74 | 
 75 | out.cd11.cd16 <- getCellGroups(adtclrgeo, adt1="CD11c", adt2="CD16", th1=0.4, th2=0.55)
 76 | cdct <- addCTLabels(cdct, out.cd11.cd16, "CD11c+/CD16+", "Monocytes CD16+")
 77 | table(cdct$celltype)
 78 | 
 79 | out.T.cd8.cd4 <- getCellGroups(adtclrgeo[,out.cd19.cd3$`CD19-/CD3+`$bc], adt1="CD8", adt2="CD4", th1=0.9, th2=0.6)
 80 | ## overwriting because CD4/CD8 T-cells are subgroups of T-cells
 81 | cdct <- addCTLabels(cdct, out.T.cd8.cd4, "CD8-/CD4+", "CD4 T-cells", overwrite=TRUE)
 82 | cdct <- addCTLabels(cdct, out.T.cd8.cd4, "CD8+/CD4-", "CD8 T-cells", overwrite=TRUE)
 83 | # cord_blood_colData_anno <- cdct
 84 | # save(cord_blood_colData_anno, file="cord_blood_colData_anno.rda")
 85 | 
 86 | ## precursors are CD34+ and I took CD56- which seems not expressed from the paper figure
 87 | out.cd56.cd34 <- getCellGroups(adtclrgeo, adt1="CD56", adt2="CD34", th1=0.37, th2=0.9)
 88 | prebc <- out.cd56.cd34$`CD56-/CD34+`$bc
 89 | # idxpre <- which(rownames(cdct) %in% prebc)
 90 | # which(prebc %in% rownames(cdct)[!is.na(cdct$celltype)]) ## showing overlap!!!
 91 | cdct <- addCTLabels(cdct, out.cd56.cd34, "CD56-/CD34+", "Precursors")
 92 | 
 93 | # ## NATURAL KILLERS are CD3-/CD16+ (CD56+ and CD16+)
 94 | out.cd16.cd3 <- getCellGroups(adtclrgeo, adt1="CD16", adt2="CD3", th1=0, th2=0.55)
 95 | nkcellbc16 <- out.cd16.cd3$`CD16+/CD3-`$bc
 96 | idxnk <- which(rownames(cdct) %in% nkcellbc16)
 97 | length(idxnk)
 98 | sum(nkcellbc16 %in% rownames(cdct)[!is.na(cdct$celltype)]) ## showing overlap!!!
 99 | 
100 | cdctnk <- addCTLabels(cdct, out.cd16.cd3, "CD16+/CD3-", "Natural Killers")
101 | 
102 | ## other markers for NK are CD56+ and CD3-
103 | out.cd56.cd3 <- getCellGroups(adtclrgeo, adt1="CD56", adt2="CD3", th1=0, th2=0)
104 | # nkcellbc56 <- out.cd56.cd3$`CD56+/CD3-`$bc
105 | # idxnk <- which(rownames(cdct) %in% nkcellbc56)
106 | # length(idxnk)
107 | # sum(nkcellbc56 %in% rownames(cdct)[!is.na(cdct$celltype)]) ## showing overlap!!!
108 | cdctnk <- addCTLabels(cdctnk, out.cd56.cd3, "CD56+/CD3-", "Natural Killers")
109 | 
110 | coldata_scRNAseq <- cdctnk
111 | save(coldata_scRNAseq, file=coldata.file)
112 | scADT_clrCounts <- adtclrgeo
113 | save(scADT_clrCounts, file=adt.clr.out)
114 | 
115 | 
116 | ## Building tsne
117 | ## select the cells by name so that both assays and the colData share the
118 | ## same cells in the same order
119 | keep <- Reduce(intersect,
120 |     list(rownames(cdctnk), colnames(mae[["scADT"]]), colnames(adtclrgeo)))
121 | cnts <- mae[["scADT"]][, keep]
122 | adtclrgeoss <- adtclrgeo[, keep]
123 | adtsce <- SingleCellExperiment(assays=list(counts=cnts, logcounts=adtclrgeoss),
124 |                                colData=cdctnk[keep,])
125 | ## fixed seed so that the embedding is reproducible between runs
126 | set.seed(666)
127 | adtsce <- runPCA(adtsce)
128 | adtsce <- runTSNE(adtsce, dimred="PCA")
129 | plotReducedDim(adtsce, dimred="TSNE", colour_by="celltype")
130 | 
131 | 


--------------------------------------------------------------------------------
/inst/scripts/Contributing-Guidelines.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | output: github_document
  3 | date: "`r format(Sys.time(), '%B %d, %Y')`"
  4 | ---
  5 | 
  6 | ```{r, include = FALSE}
  7 | knitr::opts_chunk$set(
  8 |   collapse = TRUE,
  9 |   comment = "#>",
 10 |   cache = TRUE,
 11 |   out.width = "100%"
 12 | )
 13 | options(tibble.print_min = 5, tibble.print_max = 5)
 14 | ```
 15 | 
 16 | # Overview
 17 | 
 18 | Thank you for your interest!
 19 | 
 20 | The `SingleCellMultiModal` package aims to provide single cell datasets
 21 | from several different technologies / modalities for benchmarking and analysis.
 22 | We currently provide data from `scNMT`, `scM&T`, `seqFISH`, `CITEseq`, and other
 23 | technologies. Contributions are very much welcome.
 24 | 
 25 | # List of Multi-modal Datasets
 26 | 
 27 | For a full list of available datasets, see here:
 28 | [Google Drive Sheet](https://docs.google.com/spreadsheets/d/14Eq_Bt_3tKx_t1UDwan0qZZGWNyK-d2WLXtmoPGB5To/edit#gid=0)
 29 | 
 30 | # Contributing
 31 | 
 32 | In order to contribute, we generally require data in `Rda` or `Rds` format
 33 | though we also support `HDF5` and `MTX` formats. Aside from the usual required
 34 | `metadata.csv` documentation in the package, contributors are required to add a
 35 | name to the `DataType` column in the metadata table that indicates the name of
 36 | the contributed dataset. To illustrate, here are some `DataType` names already
 37 | in the package:
 38 | 
 39 | * mouse_gastrulation
 40 | * mouse_visual_cortex
 41 | * cord_blood
 42 | * pbmc_10x
 43 | * macrophage_differentiation
 44 | * mouse_embryo_8_cell
 45 | 
 46 | ```{r,include=TRUE,results="hide",message=FALSE,warning=FALSE}
 47 | library(SingleCellMultiModal)
 48 | ```
 49 | 
 50 | ```{r}
 51 | meta <- system.file("extdata", "metadata.csv",
 52 |     package = "SingleCellMultiModal", mustWork = TRUE)
 53 | head(read.csv(meta))
 54 | ```
 55 | 
 56 | # Versioning and folder structure
 57 | 
 58 | We associate a version with all datasets. We start with version `1.0.0` using
 59 | semantic versioning and include data in a corresponding version folder
 60 | (`v1.0.0`). Thus, the recommended folder structure is as follows:
 61 | 
 62 | ```
 63 | ~/data
 64 |   └ scmm/
 65 |     └ mouse_gastrulation/
 66 |       └ v1.0.0/
 67 |         └ scnmt_acc_cgi.rda
 68 |         └ scnmt_met_genebody.rda
 69 |         └ scnmt_met_cgi.rda
 70 |         └ scnmt_rna.rda
 71 |         └ scnmt_colData.rda
 72 |         └ scnmt_sampleMap.rda
 73 | ```
 74 | 
 75 | In the `inst` section, we will discuss how to annotate these data products.
 76 | 
 77 | # Files
 78 | 
 79 | It is customary to include one `Rda` / `Rds` file per assay or per assay and
 80 | region combination of interest (as above). We also highly recommend including
 81 | `sampleMap` and `colData` datasets for the `MultiAssayExperiment` that will
 82 | be built on the fly. In this example, there are three modalities in the `scNMT`
 83 | dataset, `rna` (transcriptome), `acc` (chromatin accessibility), and `met`
 84 | (methylation).
 85 | 
 86 | # vignettes
 87 | 
 88 | Contributors are required to demonstrate user-level functionality via
 89 | examples in a vignette for each contributed dataset.
 90 | 
 91 | # R
 92 | 
 93 | Ideally, the interface for the contributed dataset should be similar to that
 94 | of `scNMT` so that users have a sense of consistency in the usage of the
 95 | package. This means having one main function that returns a
 96 | `MultiAssayExperiment` object and having options that show the user what
 97 | datasets are available for a particular technology. Contributors should use
 98 | `roxygen2` for documenting datasets and using `@inheritParams scNMT` tag
 99 | to avoid copying `@param` documentation.
100 | 
101 | See the current example for implementation details:
102 | 
103 | ```{r}
104 | scNMT(
105 |     DataType = "mouse_gastrulation",
106 |     mode = "*",
107 |     version = "1.0.0",
108 |     dry.run = TRUE
109 | )
110 | ```
111 | 
112 | **Note**. Contributors should ensure that the documentation is complete and the
113 | proper data sources have been attributed.
114 | 
115 | # inst/*
116 | 
117 | ## extdata/
118 | 
119 | In the following section we will describe how to annotate and append to
120 | the `metadata.csv` file. First, we have to ensure that we are accounting for
121 | all of the fields required by `ExperimentHub`. They are listed here:
122 | 
123 | * ResourceName
124 | * Title
125 | * Description
126 | * BiocVersion
127 | * Genome
128 | * SourceType
129 | * SourceUrl
130 | * SourceVersion
131 | * Species
132 | * TaxonomyId
133 | * Coordinate_1_based
134 | * DataProvider
135 | * Maintainer
136 | * RDataPath
137 | * RDataClass
138 | * DispatchClass
139 | * DataType+
140 | 
141 | **Note**. `DataType` is a field we've added to help distinguish multimodal
142 | technologies and is required for `SingleCellMultiModal`. Some of the
143 | `DataType`s already available are `mouse_gastrulation`, `mouse_visual_cortex`,
144 | `cord_blood`, `peripheral_blood`, etc.
145 | 
146 | To make it easy for contributions, we've provided a mechanism for easy
147 | documentation using a file from a `data.frame` we call a `doc_file`.
148 | 
149 | Interested contributors should create a `doc_file` in `inst/extdata/docuData`
150 | folder. Although we do not have a strict naming convention for the `doc_file`,
151 | we usually name the file `singlecellmultimodalvX.csv` where `X` is the *n*th
152 | dataset added to the package.
153 | 
154 | Here is an example of the file from version `v1.0.0` of the `scNMT` dataset:
155 | 
156 | ```{r}
157 | doc_file <- system.file("extdata", "docuData", "singlecellmultimodalv1.csv",
158 |     package = "SingleCellMultiModal", mustWork = TRUE)
159 | read.csv(doc_file, header = TRUE)
160 | ```
161 | 
162 | Contributors will then use their `doc_file` to append to the existing
163 | `metadata.csv`.
164 | 
165 | To create a `doc_file` `data.frame` with the file name
166 | `singlecellmultimodalvX.csv`, first we create a `data.frame` object.
167 | Each general annotation or row in this `data.frame` will be applied to all
168 | files uploaded to `ExperimentHub`. We take advantage of the `data.frame`
169 | function to repeat data and create a uniform `data.frame` with equal values
170 | across the columns.
171 | 
172 | ```{r}
173 | scmeta <- data.frame(
174 |     DataProvider =
175 |         "Dept. of Bioinformatics, The Babraham Institute, United Kingdom",
176 |     TaxonomyId = "10090",
177 |     Species = "Mus musculus",
178 |     SourceUrl = "https://cloudstor.aarnet.edu.au/plus/s/Xzf5vCgAEUVgbfQ",
179 |     SourceType = "RDS",
180 |     SourceVersion = "1.0.0",
181 |     DataType = "mouse_gastrulation",
182 |     Maintainer = "Ricard Argelaguet <ricard@ebi.ac.uk>",
183 |     stringsAsFactors = FALSE
184 | )
185 | scmeta
186 | ```
187 | 
188 | ### Saving the data
189 | 
190 | After creating the documentation `data.frame` (`doc_file`), the contributor can
191 | save that dataset as a `.csv` file using `write.csv`.
192 | 
193 | ```{r,eval=FALSE}
194 | write.csv(
195 |     scmeta,
196 |     file = "inst/extdata/docuData/singlecellmultimodalvX.csv", # X = dataset number
197 |     row.names = FALSE
198 | )
199 | ```
200 | 
201 | ## Documenting diverse data
202 | 
203 | In the case that the contributed data is not uniform, meaning that there are
204 | multiple file types from potentially different specimens, the `data.frame`
205 | will have to account for _all_ contributed data files.
206 | 
207 | For example, if the contributed data has a number of different source types,
208 | the contributor is required to create a `data.frame` with the number of rows
209 | equal to the number of files to be uploaded.
210 | 
211 | In this example, we have **two** data files from different source types and
212 | formats:
213 | 
214 | ```{r}
215 | data.frame(
216 |     DataProvider =
217 |         c("Institute of Population Genetics", "Mouse Science Center"),
218 |     TaxonomyId = c("9606", "10090"),
219 |     Species = c("Homo sapiens", "Mus musculus"),
220 |     SourceUrl = c("https://human.science/org", "https://mouse.science/gov"),
221 |     SourceType = c("RDS", "XML"),
222 |     DataType = c("human_genetics", "mouse_genetics"),
223 |     stringsAsFactors = FALSE
224 | )
225 | ```
226 | 
227 | ## scripts/
228 | 
229 | ### make-data/
230 | 
231 | The individual data products that will eventually come together into
232 | a `MultiAssayExperiment` can be uploaded as serialized `RDA` / `RDS` files,
233 | `HDF5`, and even `MTX` files. For examples on how to save data into
234 | their respective file formats, see the `make-data` folder.
235 | 
236 | ## Generating the metadata.csv
237 | 
238 | ### make-metadata.R
239 | 
240 | Based on the folder structure described previously, the `directory` argument in
241 | `make_metadata` will correspond to the `~/data/scmm` folder. The `dataDirs`
242 | argument will correspond to the `DataType` / technology subfolder (e.g.,
243 | "mouse_gastrulation"). These will be used as inputs to the `make_metadata`
244 | function.
245 | 
246 | Once the data is ready, the user can use the function in `make-metadata.R`
247 | in the `scripts` folder. A typical call to `make_metadata` will either add to
248 | the metadata or replace it entirely. The easiest option for current contributors is to
249 | `append` rows to the metadata file.
250 | 
251 | ```{r,eval=FALSE}
252 | make_metadata(
253 |     directory = "~/data/scmm",
254 |     dataDirs = "mouse_gastrulation", # also the name of the DataType
255 |     ext_pattern = "\\.[Rr][Dd][Aa]$",
256 |     doc_file = "inst/extdata/docuData/singlecellmultimodalv1.csv",
257 |     pkg_name = "SingleCellMultiModal",
258 |     append = TRUE,
259 |     dry.run = TRUE
260 | )
261 | ```
262 | 
263 | Note that the extraction pattern (`ext_pattern`) will allow contributors to
264 | match a specific file extension in that folder and ignore any intermediate
265 | files.
266 | 
267 | The contributor may also wish to run with `dry.run=TRUE` to see the output
268 | `data.frame` to be added to the `metadata.csv` file.
269 | 
270 | _Note_. The `make_metadata` function should be run from the base package
271 | directory from a GitHub / git checkout (`git clone ...`).
272 | 
273 | ## Validation
274 | 
275 | It is recommended to run the metadata validation function from
276 | `AnnotationHubData`:
277 | 
278 | ```{r,eval=FALSE}
279 | AnnotationHubData::makeAnnotationHubMetadata("SingleCellMultiModal")
280 | ```
281 | 
282 | to ensure that some of the metadata fields are properly annotated.
283 | 
284 | 
285 | # NEWS.md
286 | 
287 | Contributors should update the `NEWS.md` file with a mention of the
288 | function and data that are being provided. See the `NEWS.md` for examples.
289 | 
290 | # Next steps
291 | 
292 | The contributor should then create a Pull Request on [GitHub][].
293 | 
294 | [GitHub]: https://github.com/waldronlab/SingleCellMultiModal/pulls
295 | 
296 | If you are interested in contributing, I can help you go over the contribution
297 | and submission. Please contact me either on the [Bioc-community Slack][]
298 | (mramos148) or at marcel {dot} ramos [at] sph (dot) cuny (dot) edu.
299 | If you need to sign up to the community Slack channel, follow this link:
300 | https://bioc-community.herokuapp.com/
301 | 
302 | [Bioc-community Slack]: https://community-bioc.slack.com
303 | 
304 | ## sessionInfo
305 | 
306 | <details>
307 |   <summary> sessionInfo </summary>
308 | 
309 | ```{r,echo=FALSE}
310 | sessionInfo()
311 | ```
312 | 
313 | </details>
314 | 


--------------------------------------------------------------------------------
/inst/scripts/README.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | output: github_document
 3 | knit: (function(inputFile, encoding) {
 4 |   rmarkdown::render(inputFile, encoding = encoding, output_dir = "../../") })
 5 | bibliography: ../REFERENCES.bib
 6 | ---
 7 | 
 8 | ```{r, include = FALSE}
 9 | knitr::opts_chunk$set(
10 |   collapse = TRUE,
11 |   comment = "#>",
12 |   cache = TRUE,
13 |   out.width = "100%"
14 | )
15 | ```
16 | 
17 | ```{r,echo=FALSE,eval=FALSE}
18 | ## Generate the ./README.md (relative to base folder)
19 | rmarkdown::render(input = "inst/scripts/README.Rmd", output_dir = ".")
20 | ```
21 | 
22 | # <a href='https://waldronlab.github.io/SingleCellMultiModal'><img src='https://raw.githubusercontent.com/Bioconductor/BiocStickers/devel/SingleCellMultiModal/SingleCellMultiModal.png' align="right" height="139" /></a>
23 | 
24 | ```{r, child="../../vignettes/SingleCellMultiModal.Rmd"}
25 | 
26 | ```
27 | 


--------------------------------------------------------------------------------
/inst/scripts/make-data.R:
--------------------------------------------------------------------------------
 1 | ##
 2 | ## PLACEHOLDER for make-data/
 3 | ##
 4 | ## see the respective R script for each technology
 5 | ##
 6 | ## for example, make-data/scNMT.R for an example on how
 7 | ## we took a MultiAssayExperiment object and created
 8 | ## Rda files for upload to ExperimentHub
 9 | ##
10 | ## see make-data/scMultiome.R for converting a MultiAssayExperiment
11 | ## into sparseMatrix representations using HDF5 and MTX formats
12 | ##
13 | ## see make-data/make_macrophage.R for taking raw data and
14 | ## creating SingleCellExperiment data products
15 | ##
16 | 


--------------------------------------------------------------------------------
/inst/scripts/make-data/CITEseq_filtering.R:
--------------------------------------------------------------------------------
  1 | library(SingleCellExperiment)
  2 | library(DropletUtils)
  3 | cb <- CITEseq("cord_blood", dry.run=FALSE, DataClass="SingleCellExperiment")
  4 | adt <- SingleCellExperiment(assays=list(counts=assays(altExp(cb))[[1]]))
  5 | top.marker <- rownames(adt)[max.col(t(counts(adt)))]
  6 | total.count <- colSums(counts(adt))
  7 | boxplot(split(log10(total.count), top.marker), ylab="Log-total ADT count", las=2)
  8 | adt.counts <- counts(adt)
  9 | adt.detected <- colSums(adt.counts > 0)
 10 | hist(adt.detected, col='grey', main="", xlab="Number of detected ADTs")
 11 | 
 12 | qc.stats <- cleanTagCounts(adt)#, exclusive=c("CD3", "CD19"))
 13 | summary(qc.stats$high.ambient) # libraries removed with high ambient contamination
 14 | 
 15 | library(scater)
 16 | mito <- grep("mt-", tolower(rownames(adt)))
 17 | df <- perCellQCMetrics(adt, subsets=list(Mito=mito))
 18 | mito.discard <- isOutlier(df$subsets_Mito_percent, type="higher")
 19 | summary(mito.discard)
 20 | 
 21 | discard <- qc.stats$discard | mito.discard
 22 | 
 23 | colData(cb) <- cbind.DataFrame(colData(cb), adt.discard=qc.stats$discard, mito.discard=mito.discard, discard=discard)
 24 | 
 25 | scRNAseq_coldata <- as.data.frame(colData(cb))
 26 | dir.create("cord_blood/v1.0.0/", recursive=TRUE)
 27 | save(scRNAseq_coldata, file="cord_blood/v1.0.0/scRNAseq_coldata.rda")
 28 | 
 29 | ## Alternatively it is possible to indicate two or more ADTs that should 
 30 | ## be expressed alternatively in a cell.
 31 | ## for CD3/CD4/CD8 i'm referring to https://tinyurl.com/ys9aawce
 32 | ## otherwise the OSCA vignette 12.3.3 indicates to use CD3/CD19, but
 33 | ## this article found that CD3 and CD19 could be expressed in a novel cell type
 34 | ## https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8694500/
 35 | # qc.stats <- cleanTagCounts(adtsce1, exclusive=c("CD4", "CD8"))
 36 | # summary(qc.stats$discard) # libraries removed with high ambient contamination
 37 | 
 38 | library(SingleCellExperiment)
 39 | library(DropletUtils)
 40 | mae <- CITEseq("peripheral_blood", dry.run=FALSE)#, DataClass="SingleCellExperiment")
 41 | adt <- SingleCellExperiment(assays=list(counts=mae[["scADT"]]))
 42 | pb <-  SingleCellExperiment(assays=list(counts=mae[["scRNA"]]))
 43 | cn <- colnames(adt)
 44 | condition <- unlist(lapply(strsplit(colnames(adt), "_"), function(x) x[1]))
 45 | bc <- unlist(lapply(strsplit(colnames(adt), "_"), function(x) x[2]))
 46 | colData(adt) <- DataFrame("barcodes"=bc, "condition"=condition)
 47 | colnames(adt) <- cn
 48 | 
 49 | adt.rm <- adt[-c(3,52),]
 50 | 
 51 | adtcr <- adt.rm[, adt.rm$condition=="CTRL"]
 52 | adtcl <- adt.rm[, adt.rm$condition=="CTCL"]
 53 | top.markercr <- rownames(adtcr)[max.col(t(counts(adtcr)))]
 54 | top.markercl <- rownames(adtcl)[max.col(t(counts(adtcl)))]
 55 | total.countcr <- colSums(counts(adtcr))
 56 | total.countcl <- colSums(counts(adtcl))
 57 | 
 58 | boxplot(split(log10(total.countcr), top.markercr), ylab="Log-total ADT CTRL count", las=2) #CD5
 59 | boxplot(split(log10(total.countcl), top.markercl), ylab="Log-total ADT CTCL count", las=2) #CD279
 60 | 
 61 | adt.countscr <- counts(adtcr)
 62 | adt.detectedcr <- colSums(adt.countscr > 0)
 63 | hist(adt.detectedcr, col='grey', main="", xlab="Number of detected ADTs CTRL")
 64 | 
 65 | adt.countscl <- counts(adtcl)
 66 | adt.detectedcl <- colSums(adt.countscl > 0)
 67 | hist(adt.detectedcl, col='grey', main="", xlab="Number of detected ADTs CTCL")
 68 | 
 69 | qc.statscr <- cleanTagCounts(adtcr)#, exclusive=c("CD3", "CD19"))
 70 | summary(qc.statscr$high.ambient) # libraries removed with high ambient contamination
 71 | 
 72 | qc.statscl <- cleanTagCounts(adtcl)#, exclusive=c("CD3", "CD19"))
 73 | summary(qc.statscl$high.ambient) # libraries removed with high am
 74 | 
 75 | library(scater)
 76 | cn <- colnames(pb)
 77 | condition <- unlist(lapply(strsplit(colnames(pb), "_"), function(x) x[1]))
 78 | bc <- unlist(lapply(strsplit(colnames(pb), "_"), function(x) x[2]))
 79 | colData(pb) <- DataFrame("barcodes"=bc, "condition"=condition)
 80 | colnames(pb) <- cn
 81 | pbcr <- pb[,pb$condition=="CTRL"]
 82 | pbcl <- pb[,pb$condition=="CTCL"]
 83 | mito <- grep("mt-", tolower(rownames(pb)))
 84 | dfcr <- perCellQCMetrics(pbcr, subsets=list(Mito=mito))
 85 | dfcl <- perCellQCMetrics(pbcl, subsets=list(Mito=mito))
 86 | mito.discardcr <- isOutlier(dfcr$subsets_Mito_percent, type="higher")
 87 | names(mito.discardcr) <- rownames(dfcr)
 88 | summary(mito.discardcr)
 89 | 
 90 | mito.discardcl <- isOutlier(dfcl$subsets_Mito_percent, type="higher")
 91 | names(mito.discardcl) <- rownames(dfcl)
 92 | summary(mito.discardcl)
 93 | 
 94 | cd <- colData(mae)
 95 | cd
 96 | cd$adt.discard_CTRL <- FALSE
 97 | cd$adt.discard_CTRL[which(rownames(cd) %in% rownames(qc.statscr)[qc.statscr$discard])] <- TRUE
 98 | cd$adt.discard_CTCL <- FALSE
 99 | cd$adt.discard_CTCL[which(rownames(cd) %in% rownames(qc.statscl)[qc.statscl$discard])] <- TRUE
100 | 
101 | cd$mito.discard_CTRL <- FALSE
102 | cd$mito.discard_CTRL[which(rownames(cd) %in% names(mito.discardcr)[mito.discardcr])] <- TRUE
103 | cd$mito.discard_CTCL <- FALSE
104 | cd$mito.discard_CTCL[which(rownames(cd) %in% names(mito.discardcl)[mito.discardcl])] <- TRUE
105 | cd$discard_CTRL <- cd$adt.discard_CTRL | cd$mito.discard_CTRL
106 | cd$discard_CTCL <- cd$adt.discard_CTCL | cd$mito.discard_CTCL
107 | cd$discard <- cd$discard_CTRL | cd$discard_CTCL
108 | 
109 | 
110 | scRNAseq_coldata <- as.data.frame(cd)
111 | dir.create("peripheral_blood/v1.0.0/", recursive=TRUE)
112 | save(scRNAseq_coldata, file="peripheral_blood/v1.0.0/scRNAseq_coldata.Rda")
113 | 
114 | 
115 | 
116 | 
117 | 
118 | 
119 | 


--------------------------------------------------------------------------------
/inst/scripts/make-data/make_macrophage.R:
--------------------------------------------------------------------------------
  1 | 
  2 | ## Make the data to distribute to ExperimentHub
  3 | 
  4 | ## Required packages
  5 | library(HDF5Array)
  6 | library(BiocFileCache)
  7 | library(SingleCellExperiment)
  8 | 
  9 | ## Steps:
 10 | ## 1. Retrieve the scRNASeq matrices (n=2) from NCBI
 11 | ## 2. Read the count matrices
 12 | ## 3. Combine the two RNA count matrices in a SingleCellExperiment
 13 | ## 4. Retrieve the protein matrix and annotation from Google Drive
 14 | ## 5. Combine the protein data in a SingleCellExperiment object
 15 | 
 16 | ## Note that step 3 is optional. I needed to migrate to HDF5 due to 
 17 | ## memory limitations.
 18 | 
 19 | ## -------------------------------------- ##
 20 | ## 1. Retrieve the scRNASeq matrices (n=2) from NCBI
 21 | ## -------------------------------------- ##
 22 | 
 23 | ## See also https://bioconductor.org/packages/devel/bioc/vignettes/MultiAssayExperiment/inst/doc/UsingHDF5Array.html
 24 | bfc <- BiocFileCache()
 25 | url1 <- "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4226nnn/GSM4226877/suppl/GSM4226877_rna_data_Bio_Replicate_1.csv.gz"
 26 | url2 <- "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4226nnn/GSM4226878/suppl/GSM4226878_rna_data_Bio_Replicate_2.csv.gz"
 27 | bfcrpath(bfc, url1)
 28 | bfcrpath(bfc, url2)
 29 | 
 30 | ## ---------------------- ##
 31 | ## 2. Read count matrices
 32 | ## ---------------------- ##
 33 | 
 34 | ## Batch 1
 35 | m1 <- data.table::fread(file = bfcquery(bfc, "GSM4226877")$rpath,
 36 |                         sep = ",", header = TRUE)
 37 | rn <- m1[[1]]
 38 | m1 <- as.matrix(m1[, -1])
 39 | rownames(m1) <- rn
 40 | colnames(m1) <- paste0(colnames(m1), ".1")
 41 | ## Batch 2
 42 | m2 <- data.table::fread(file = bfcquery(bfc, "GSM4226878")$rpath,
 43 |                         sep = ",", header = TRUE)
 44 | rn <- m2[[1]]
 45 | m2 <- as.matrix(m2[, -1])
 46 | colnames(m2) <- paste0(colnames(m2), ".2")
 47 | rownames(m2) <- rn
 48 | 
 49 | ## ------------------------------------------------------- ##
 50 | ## 3. Combine the two RNA count matrices in a SingleCellExperiment
 51 | ## ------------------------------------------------------- ##
 52 | 
 53 | m1 <- DelayedArray(m1)
 54 | m2 <- DelayedArray(m2)
 55 | m3 <- cbind(m1, m2) ## This process is delayed until writing
 56 | batch <- factor(gsub("^.*[.](\\d)$", "\\1", colnames(m3)))
 57 | sce <- SingleCellExperiment(list(counts = m3),
 58 |                             colData = DataFrame(Batch = batch))
 59 | ## The object is rather big and is better stored on disk as an HDF5
 60 | saveHDF5SummarizedExperiment(sce, 
 61 |                              dir = "../.localdata/SingleCellMultiModal/macrophage_differentiation/v1.0.0/",
 62 |                              prefix = "macrophage_rna_", 
 63 |                              as.sparse = TRUE)
 64 | ## Restore some RAM
 65 | rm(m1, m2, m3); gc()
 66 | 
 67 | ## ------------------------------------------------------- ##
 68 | ## 4. Retrieve the protein matrix and annotation from Google Drive
 69 | ## ------------------------------------------------------- ##
 70 | 
 71 | ## Download the protein data provided by the Slavov lab
 72 | ## https://drive.google.com/file/d/1sF5STkofF_f2msnYaaYdWabou84Qf2Xr/view?usp=sharing
 73 | protein_assay <- read.csv("../.localdata/SCP/specht2019/v3/Proteins-processed.csv",
 74 |                           row.names = 1)
 75 | protein_assay <- protein_assay[, colnames(protein_assay) != "protein"]
 76 | protein_assay <- as.matrix(protein_assay)
 77 | 
 78 | ## Download the protein data provided by the Slavov lab
 79 | ## https://drive.google.com/file/d/16vf6rjIsk-oK9naAH6BQnCFrlWnYtJsS/view?usp=sharing
 80 | protein_colData <- read.csv("../.localdata/SCP/specht2019/v3/Cells.csv",
 81 |                             row.names = 1)
 82 | protein_colData <- t(protein_colData)
 83 | protein_colData <- DataFrame(protein_colData)
 84 | ## Replace the cell type annotation by more explicit values
 85 | protein_colData$celltype <- 
 86 |     ifelse(protein_colData$celltype == "sc_m0", "Macrophage", "Monocyte")
 87 | ## Rename the `raw.file` value by `Batch`
 88 | colnames(protein_colData)[5] <- "batch_MS"
 89 | 
 90 | ## ------------------------------------------------------- ##
 91 | ## 6. Combine the data in a SingleCellExperiment object
 92 | ## ------------------------------------------------------- ##
 93 | 
 94 | macrophage_protein <- SingleCellExperiment(assay = list(logexprs = protein_assay),
 95 |                                            colData = protein_colData)
 96 | format(object.size(macrophage_protein), "MB")
 97 | ## Note the protein data can easily fit in memory. We save it as an Rda
 98 | save(macrophage_protein, file = "../.localdata/SingleCellMultiModal/macrophage_differentiation/v1.0.0/macrophage_protein.Rda")
 99 | 
100 | ## ------------------------------------------------------- ##
101 | ## Conclusion
102 | ## ------------------------------------------------------- ##
103 | 
104 | ## These files should be sent to ExperimentHub:
105 | ##  mRNA
106 | ##  - macrophage_rna_assays.h5
107 | ##  - macrophage_rna_se.rds
108 | ##  Protein
109 | ##  - macrophage_protein.Rda
110 | 


--------------------------------------------------------------------------------
/inst/scripts/make-data/scMultiome.R:
--------------------------------------------------------------------------------
  1 | library(MultiAssayExperiment)
  2 | library(rhdf5)
  3 | library(HDF5Array)
  4 | 
  5 | ddir <- "~/data/scmm/pbmc_10x"
  6 | 
  7 | pbmc <- readRDS(
  8 |     file.path(ddir, "mae.rds")
  9 | )
 10 | vdir <- file.path(ddir, paste0("v", "1.0.0"))
 11 | setwd(vdir)
 12 | 
 13 | ## save colData and sampleMap
 14 | pbmc_colData <- colData(pbmc)
 15 | save(pbmc_colData, file = "pbmc_colData.rda")
 16 | 
 17 | pbmc_sampleMap <- sampleMap(pbmc)
 18 | save(pbmc_sampleMap, file = "pbmc_sampleMap.rda")
 19 | 
 20 | Matrix::writeMM(assay(pbmc[[1]]), "pbmc_rna.mtx")
 21 | R.utils::gzip(filename = "pbmc_rna.mtx", destname = "pbmc_rna.mtx.gz")
 22 | file.remove("pbmc_rna.mtx")
 23 | 
 24 | stopifnot(file.exists("pbmc_rna.mtx.gz"))
 25 | 
 26 | rna_mtx <- .read_mtx("pbmc_rna.mtx.gz")
 27 | 
 28 | ## save H5 file and SCE shell
 29 | HDF5Array::saveHDF5SummarizedExperiment(pbmc[[1]], dir = "pbmc_rna",
 30 |     prefix = "pbmc_rna_", as.sparse = TRUE)
 31 | 
 32 | ## load SCE shell
 33 | rna_sce <- readRDS("./pbmc_rna/pbmc_rna_se.rds")
 34 | ## replace assay with MTX assay
 35 | pbmc_rna_mtx_obj <- BiocBaseUtils::setSlots(
 36 |     rna_sce, assays = Assays(SimpleList(counts = rna_mtx))
 37 | )
 38 | 
 39 | pbmc_rna_h5_obj <-
 40 |     HDF5Array::loadHDF5SummarizedExperiment("pbmc_rna", "pbmc_rna_")
 41 | 
 42 | Matrix::writeMM(assay(pbmc[[2]]), "pbmc_atac.mtx")
 43 | R.utils::gzip(filename = "pbmc_atac.mtx", destname = "pbmc_atac.mtx.gz")
 44 | pbmc_atac_mtx <- "pbmc_atac.mtx.gz"
 45 | stopifnot(file.exists(pbmc_atac_mtx))
 46 | atac_mtx <- .read_mtx("pbmc_atac.mtx.gz")
 47 | ## save H5 file and SCE shell
 48 | HDF5Array::saveHDF5SummarizedExperiment(pbmc[[2]], dir = "pbmc_atac",
 49 |     prefix = "pbmc_atac_", as.sparse = TRUE)
 50 | 
 51 | ## load SCE shell
 52 | atac_sce <- readRDS("./pbmc_atac/pbmc_atac_se.rds")
 53 | ## replace assay with MTX assay
 54 | pbmc_atac_mtx_obj <- BiocBaseUtils::setSlots(
 55 |     atac_sce, assays = Assays(SimpleList(counts = atac_mtx))
 56 | )
 57 | 
 58 | ## load H5 object
 59 | pbmc_atac_h5_obj <-
 60 |     HDF5Array::loadHDF5SummarizedExperiment("pbmc_atac", "pbmc_atac_")
 61 | 
 62 | ### Use 10X Dataset
 63 | ## First load the previous version
 64 | rna <- SingleCellMultiModal::scMultiome(
 65 |     "pbmc_10x", "rna", format = "HDF5", version = "1.0.0", dry.run = FALSE
 66 | )
 67 | 
 68 | rnames <- rownames(rna[[1]])
 69 | cnames <- colnames(rna[[1]])
 70 | 
 71 | datafolder <- file.path(Sys.getenv("HOME"), "data/scmm/pbmc_10x")
 72 | 
 73 | v1url <- paste0("https://cf.10xgenomics.com/samples/cell-arc/1.0.0/",
 74 | "pbmc_granulocyte_sorted_10k/pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5"
 75 | )
 76 | 
 77 | h5v1 <- file.path(datafolder, basename(v1url))
 78 | 
 79 | if (!file.exists(h5v1))
 80 |     download.file(
 81 |         url = v1url,
 82 |         destfile = h5v1
 83 |     )
 84 | 
 85 | HDF5Array::h5ls(h5v1, all = TRUE)
 86 | 
 87 | grange <- as.character(h5read(h5v1, "/matrix/features/name"))
 88 | 
 89 | aa <- HDF5Array::TENxMatrix(h5v1, "matrix")
 90 | rownames(aa) <- grange
 91 | upcnames <- gsub("(.*)(-1)$", "\\1", colnames(aa))
 92 | 
 93 | stopifnot(
 94 |     all.equal(
 95 |         nchar(colnames(aa)) - nchar("-1") , nchar(upcnames)
 96 |     )
 97 | )
 98 | 
 99 | colnames(aa) <- upcnames
100 | 
101 | rowlog <- grange %in% rnames
102 | aa <- aa[rowlog,]
103 | 
104 | collog <- colnames(aa) %in% cnames
105 | aa <- aa[, collog]
106 | 
107 | all(rownames(aa) %in% rnames)
108 | ## TRUE
109 | all(colnames(aa) %in% cnames)
110 | ## TRUE
111 | 
112 | ## remove dups HDF5Array
113 | duprows <- duplicated(rownames(aa))
114 | aa <- aa[!duprows, ]
115 | 
116 | ## make sure that previous and new values are the same
117 | ## identical(rowSums(assay(rna[[1]])), rowSums(aa))
118 | 
119 | HDF5Array::writeTENxMatrix(aa, outfile, "pbmc", verbose = TRUE)
120 | # file.remove(outfile)
121 | outfile
122 | 
123 | HDF5Array::h5ls(outfile, all = TRUE)
124 | 
125 | 


--------------------------------------------------------------------------------
/inst/scripts/make-data/scNMT.R:
--------------------------------------------------------------------------------
 1 | # get data from cloudstor
 2 | # https://cloudstor.aarnet.edu.au/plus/s/Xzf5vCgAEUVgbfQ/download?path=%2Foutput&files=scnmtseq_gastrulation_mae_826-cells_orderedFeatures.rds
 3 | ## ./output/scnmtseq_gastrulation_mae_826-cells_orderedFeatures.rds
 4 | library(MultiAssayExperiment)
 5 | 
 6 | ddir <- "~/data/scmm/mouse_gastrulation"
 7 | 
 8 | if (!dir.exists(ddir))
 9 |     dir.create(ddir, recursive = TRUE)
10 | 
11 | #   old
12 | #   "scnmtseq_gastrulation_mae_826-cells_orderedFeatures.rds"
13 | scnmt <- readRDS(
14 |     file.path(ddir, "allcells",
15 |         "scnmtseq_gastrulation_mae_AllCells.rds"
16 |     )
17 | )
18 | 
19 | exportClass(scnmt, ddir, fmt = "csv")
20 | 
# convert .csv files to .rda matrices
#
# Every file matching `pattern` in <directory>/<dataDir>/v<version> is read
# and saved next to the csv file as <name>.rda, holding one object called
# <name> (the file name without its extension).
#
# - "scnmt_colData": kept as a data.frame; the first column is renamed to
#   "cellID" and also used as row names
# - "scnmt_sampleMap": kept as a data.frame without its first column and
#   without row names
# - anything else: converted to a numeric matrix, with the first column used
#   as row names
#
# @param directory character(1) base folder of all datasets
# @param dataDir character(1) dataset folder inside `directory`
# @param version character(1) version; the sub-folder is "v" + version
# @param pattern character(1) regular expression passed to list.files()
# @return invisibly, the list of save() results (called for its side effect)
.convertData <- function(
    directory = "~/data/scmm/",
    dataDir = "mouse_gastrulation",
    version = "1.0.0",
    pattern = ".csv")
{
    location <- file.path(directory, dataDir, paste0("v", version))
    csvs <- list.files(location, pattern = pattern, full.names = TRUE,
        recursive = FALSE)
    invisible(
        lapply(csvs, function(csvfile) {
            ## Derive names from the file name itself. The regex `pattern`
            ## (whose "." matches any character) is not reused, and "csv" is
            ## not replaced anywhere else in the path.
            objname <- tools::file_path_sans_ext(basename(csvfile))
            readin <- as.data.frame(readr::read_csv(csvfile))
            rnames <- readin[[1L]]

            if (!objname %in% c("scnmt_colData", "scnmt_sampleMap"))
                ## drop = FALSE keeps a single data column as a one-column
                ## matrix with its column name
                readin <- data.matrix(readin[, -1, drop = FALSE])
            else if (identical(objname, "scnmt_colData"))
                names(readin)[1] <- "cellID"
            else
                readin <- readin[, -1]

            if (!objname %in% "scnmt_sampleMap")
                rownames(readin) <- rnames

            ## save() looks the object up by name in this function's frame
            assign(objname, readin)
            rdafile <- file.path(dirname(csvfile), paste0(objname, ".rda"))
            save(list = objname, file = rdafile)
        })
    )
}
53 | 
54 | .convertData()
55 | 


--------------------------------------------------------------------------------
/inst/scripts/make-metadata.R:
--------------------------------------------------------------------------------
  1 | setwd("~/gh/SingleCellMultiModal")
  2 | 
  3 | .getSourceType <- function(filepaths) {
  4 |     lfiles <- strsplit(basename(filepaths), "\\.")
  5 |     exts <- vapply(lfiles,
  6 |         function(x) { paste(x[-1], collapse = ".") }, character(1L))
  7 |     uexts <- toupper(exts)
  8 |     uexts <- gsub("[Hh]5", "HDF5", uexts)
  9 |     uexts <- gsub("[Mm][Tt][Xx]\\.[Gg][Zz]", "MTX", uexts)
 10 |     vTypes <- AnnotationHubData::getValidSourceTypes()
 11 |     uTypes <- toupper(vTypes)
 12 |     allvalid <- all(uexts %in% uTypes)
 13 |     if (!allvalid)
 14 |         stop("Source types not supported: ", paste0(exts[!allvalid],
 15 |             collapse = ", "), "\n See 'AnnotationHubData::getValidSources()'",
 16 |             call. = FALSE)
 17 |     res <- vTypes[match(uexts, uTypes)]
 18 |     ## hot fix before AnnotationHubData 1.21.2
 19 |     gsub("MTX", "mtx.gz", res, fixed = TRUE)
 20 | }
 21 | 
 22 | doc_helper <-
 23 |     function(
 24 |         DataProvider, TaxonomyId, Species, SourceUrl, SourceType, DataType, ...
 25 |     )
 26 | {
 27 |     args <- list(...)
 28 |     saf <- args[["stringsAsFactors"]]
 29 |     saf <- if(!is.null(saf)) saf else FALSE
 30 | 
 31 |     input_vals <- list(
 32 |         DataProvider = DataProvider, TaxonomyId = TaxonomyId,
 33 |         Species = Species, SourceUrl = SourceUrl,
 34 |         SourceType = SourceType, DataType = DataType
 35 |     )
 36 |     clens <- lengths(input_vals)
 37 |     zlen <- !clens
 38 |     if (any(zlen))
 39 |         stop(
 40 |             "Provide values for: ",
 41 |             paste(names(input_vals)[zlen], collapse = ", ")
 42 |         )
 43 | 
 44 |     nonstd <- !clens %in% c(max(clens), 1L)
 45 |     if (any(nonstd))
 46 |         stop("Lengths of inputs must either be 1 or the max length")
 47 | 
 48 |     input_vals[clens == 1L] <- lapply(input_vals[clens == 1L],
 49 |         function(x) {
 50 |             rep(x, max(clens))
 51 |         })
 52 | 
 53 |     as.data.frame(input_vals, stringsAsFactors = saf)
 54 | }
 55 | 
 56 | .stdLength <- function(metalist, replength) {
 57 |     lapply(metalist, function(field) {
 58 |         if (length(field) == 1L)
 59 |             rep(field, replength)
 60 |         else
 61 |             field
 62 |     })
 63 | }
 64 | 
 65 | .loadRDS <- function(filepath) {
 66 |     readRDS(filepath)
 67 | }
 68 | 
 69 | .loadRDA <- function(filepath) {
 70 |     basefile <- gsub("\\.[Rr][Dd][Aa]", "", basename(filepath))
 71 |     OBJENV <- new.env(parent = emptyenv())
 72 |     load(filepath, envir = OBJENV)
 73 |     OBJENV[[basefile]]
 74 | }
 75 | 
 76 | .loadH5 <- function(filepath) {
 77 |     if (grepl("tenx", filepath))
 78 |         HDF5Array::TENxMatrix(filepath, "pbmc")
 79 |     else
 80 |         HDF5Array::HDF5Array(filepath, "assay001")
 81 | }
 82 | 
## Loader for `.mtx.gz` files: delegates to `.read_mtx`.
## NOTE(review): `.read_mtx` is not defined in this script; presumably it is
## the internal SingleCellMultiModal helper -- confirm it is in scope.
.loadMTX.GZ <- function(filepath) {
    .read_mtx(filepath)
}
 86 | 
 87 | .loadDataList <- function(filepaths) {
 88 |     recipelist <- list(
 89 |         "\\.[Rr][Dd][Aa]" = .loadRDA,
 90 |         "\\.[Rr][Dd][Ss]" = .loadRDS,
 91 |         "\\.[Hh]5" = .loadH5,
 92 |         "\\.[Mm][Tt][Xx]\\.[Gg][Zz]" = .loadMTX.GZ
 93 |     )
 94 |     hitMatrix <- vapply(names(recipelist),
 95 |         function(pat) grepl(pat, filepaths),
 96 |         logical(length(filepaths))
 97 |     )
 98 |     allrecipes <- recipelist[apply(hitMatrix, 1L, which)]
 99 |     Map(function(x, y) { x(y) }, x = allrecipes, y = filepaths)
100 | }
101 | 
## TRUE when at least one element of `x` is NA, FALSE otherwise (also for
## zero-length input).
any.na <- function(x) {
    missing_flags <- is.na(x)
    any(missing_flags)
}
105 | 
## Standard description text of a resource: the data name followed by the
## upper-cased project (DataType) name.
.get_Description <- function(data_name, DataType) {
    project <- toupper(DataType)
    paste0(data_name, " data specific to the ", project, " project")
}
109 | 
## Class label of every object in `dataList`.
##
## Matrices are reported as "matrix". For any other object the first element
## of class() is used, so objects with several classes still give exactly one
## label.
##
## @param dataList list of loaded data objects
## @return character() with one class label per element, keeping the names
##     of `dataList`
.getRDataClass <- function(dataList) {
    vapply(dataList, function(dataName) {
            if (is.matrix(dataName))
                "matrix"
            else
                class(dataName)[1L]
    }, character(1L))
}
118 | 
119 | 
## Lookup table from file-extension patterns (anchored at the end of the file
## name) to the DispatchClass recorded in the metadata.
.file_pattern_map <- data.frame(
    ext_pattern = c(
        "[Rr][Dd][Aa]$",
        "[Rr][Dd][Ss]$",
        "[Hh]5$",
        "[Mm][Tt][Xx]\\.[Gg][Zz]$"
    ),
    ## currently MTX DispatchClass recipe unavailable
    Dispatch = c(
        "Rda",
        "Rds",
        "H5File",
        "FilePath"
    ),
    stringsAsFactors = FALSE
)
129 | 
## DispatchClass of every resource file, looked up by file extension.
##
## @param resource_files character() file names or paths
## @param ext_map data.frame with columns `ext_pattern` (regular expressions)
##     and `Dispatch` (the class to report for a matching file)
## @return character() of DispatchClass values parallel to `resource_files`
## Stops when a file matches no pattern, or more than one.
.getDispatchClass <- function(resource_files, ext_map = .file_pattern_map) {
    patterns <- ext_map[["ext_pattern"]]
    hits <- vapply(patterns,
        function(pat) grepl(pat, resource_files),
            logical(length(resource_files)))
    ## vapply() returns a plain vector for a single file; force a
    ## files-by-patterns matrix so that one file is handled like many
    hitMatrix <- matrix(
        hits, nrow = length(resource_files), ncol = length(patterns)
    )
    nhits <- rowSums(hitMatrix)
    if (any(nhits != 1L))
        stop(
            "Cannot determine a unique DispatchClass for: ",
            paste(resource_files[nhits != 1L], collapse = ", "),
            call. = FALSE
        )
    ext_map[["Dispatch"]][max.col(hitMatrix * 1, ties.method = "first")]
}
136 | 
137 | ## alist() with formals()<-
138 | ## fancyFUN <- function() {}
139 | ## formals(fancyFUN) <- alist()
140 | 
## Assemble the ExperimentHub metadata table for one or more datasets.
##
## For every dataset folder <base_dir>/<data_dirs>/v<version> the files
## matching `ext_pattern` are listed (recursively), and one metadata
## data.frame is generated per dataset. Fields present in the documentation
## file override the defaults; fields that are still NA afterwards are
## derived from the files (Title, Description, SourceType, RDataClass,
## RDataPath) or from the package (Maintainer).
##
## @param base_dir character(1) base folder of all datasets
## @param data_dirs character() dataset folders inside `base_dir`
## @param ext_pattern character(1) regular expression selecting data files
## @param doc_file character(1) path to the documentation csv file; it is
##     split by its `DataType` and `SourceVersion` columns
## @param version character() versions parallel to `data_dirs`
## @param pkg_name character(1) package name, used for Maintainer and
##     RDataPath
## @return data.frame with the rows of all datasets bound together
##
## NOTE(review): the groups returned by split() are paired with `data_dirs`
## and `version` by position -- confirm that the ordering (and any empty
## DataType/SourceVersion combination) lines up when several datasets are
## processed in one call.
MetaHubCreate <-
    function(base_dir, data_dirs, ext_pattern, doc_file, version, pkg_name)
{
    locations <- file.path(base_dir, data_dirs, paste0("v", version))
    stopifnot(
        dir.exists(base_dir), all(dir.exists(locations)),
        is.character(ext_pattern), !is.na(ext_pattern),
        identical(length(ext_pattern), 1L),
        file.exists(doc_file), is.character(doc_file), !is.na(doc_file),
        identical(length(doc_file), 1L), is.character(version)
    )
    fpathlist <- lapply(locations, function(locs) {
        list.files(
            locs, pattern = ext_pattern, full.names = TRUE, recursive = TRUE
        )
    })
    docFrame <- read.csv(doc_file, header = TRUE)
    docList <- split(docFrame,
        list(docFrame[["DataType"]], docFrame[["SourceVersion"]]))
    versions <- version
    DataTypes <- data_dirs
    replengths <- lengths(fpathlist)
    namelist <- lapply(fpathlist, basename)

    metaList <- Map(
        function(DataType, doc_file, resnames, filepaths, replength, version) {
            message("Working on: ", basename(DataType), " v", version)
            hubmeta <- R6::R6Class("EHubMeta",
                public = list(
                    Title = NA_character_,
                    Description = NA_character_,
                    BiocVersion = as.character(BiocManager::version()),
                    Genome = NA_character_,
                    SourceType = NA_character_,
                    SourceUrl = character(1L),
                    SourceVersion = version,
                    Species = character(1L),
                    TaxonomyId = character(1L),
                    Coordinate_1_based = NA,
                    DataProvider = character(1L),
                    Maintainer = NA_character_,
                    RDataClass = NA_character_,
                    DispatchClass = .getDispatchClass(resnames),
                    Location_Prefix = NA_character_,
                    RDataPath = NA_character_,
                    ResourceName = resnames,
                    DataType = DataType,

                    initialize = function(doc_file)
                    {
                        ## fields documented in the csv override the defaults
                        lapply(names(doc_file), function(i) {
                            assign(i, doc_file[[i]], self)
                        })
                        ## any.na() instead of is.na(): a documented field
                        ## may hold several values, and `if` requires a
                        ## length-one condition
                        if (any.na(self$Title))
                            self$Title <- gsub(ext_pattern, "",
                                basename(filepaths))
                        if (any.na(self$Description))
                            self$Description <- paste(self$Title,
                                "data specific to the", toupper(self$DataType),
                                "project")
                        if (any.na(self$SourceType))
                            self$SourceType <- .getSourceType(filepaths)
                        if (any.na(self$SourceVersion))
                            self$SourceVersion <- "1.0.0"
                        if (any.na(self$Maintainer))
                            self$Maintainer <- utils::maintainer(pkg_name)
                        if (any.na(self$RDataClass)) {
                            dataList <- .loadDataList(filepaths)
                            self$RDataClass <- .getRDataClass(dataList)
                        }
                        ## an undocumented Location_Prefix is set to NULL;
                        ## generate() filters NULL fields out
                        if (any.na(self$Location_Prefix))
                            self$Location_Prefix <- NULL
                        if (any.na(self$RDataPath))
                            self$RDataPath <- file.path(pkg_name,
                                self$DataType, paste0("v", version),
                                self$ResourceName)
                    },
                    generate = function() {
                        ## every public field except the R6 internals and
                        ## the methods
                        lnames <- !names(self) %in%
                            c(".__enclos_env__", "clone", "generate",
                                "initialize")
                        initList <- mget(names(self)[lnames], envir = self)
                        initList <- Filter(function(x) !is.null(x), initList)
                        ## scalar fields are repeated once per file
                        flist <- .stdLength(initList, replength)
                        do.call(data.frame, c(flist, stringsAsFactors = FALSE))
                    }
                ),
                ## fields from the documentation file may be added at run time
                lock_objects = FALSE
            )
            nhub <- hubmeta$new(doc_file)
            nhub$generate()
    }, DataType = DataTypes, doc_file = docList, resnames = namelist,
    filepaths = fpathlist, replength = replengths, version = versions
    )

    do.call(
        function(...) {
            rbind.data.frame(..., make.row.names = FALSE,
                stringsAsFactors = FALSE)
        },
    metaList)
}
243 | 
#' Generate the metadata.csv file from a documentation file
#'
#' This function takes a specific folder structure and generates the
#' metadata.csv file for adding to ExperimentHub. It must be run from the
#' package directory (the base name of the working directory must equal
#' `pkg_name`).
#'
#' @param directory The base folder for _all_ datasets
#'
#' @param dataDirs `character()` A vector of folder names contained in directory
#'     that corresponds to each project. For multiple versions, repeat the
#'     name of the folder.
#'
#' @param version `character()` A vector of versions that is parallel to the
#'     `dataDirs` argument, typically `1.0.0`. The `v` prefix of the
#'     subfolder name (e.g. `v1.0.0`) is added by the function.
#'
#' @param ext_pattern `character(1)` A string that matches files within the
#'     above folders to find the data.
#'
#' @param doc_file `character(1)` A path to the documentation `data.frame` that
#'     tells the function how to fill in the standard columns for data
#'     annotation, for example `DataProvider`, `TaxonomyId`, etc.
#'
#' @param pkg_name `character(1)` The name of the current package
#'
#' @param dry.run `logical(1)` If `TRUE` (default), only return the metadata;
#'     if `FALSE`, also write it to the `metadata.csv` file.
#'
#' @param append `logical(1)` Whether to append to the current `metadata.csv`
#'     file instead of replacing it
#'
#' @return The metadata `data.frame`. When `dry.run = FALSE` it is also saved
#'     under `inst/extdata/metadata.csv`.
#'
#' @examples
#'
#' make_metadata(
#'     directory = "~/data/scmm",
#'     dataDirs = rep("mouse_gastrulation", 2),
#'     version = c("1.0.0", "2.0.0"),
#'     doc_file = "inst/extdata/docuData/singlecellmultimodalv2.csv",
#'     dry.run = FALSE
#' )
#'
#' make_metadata(
#'     directory = "~/data/scmm",
#'     dataDirs = c(rep("mouse_gastrulation", 2),
#'         rep("mouse_visual_cortex", 2)),
#'     version = rep(c("1.0.0", "2.0.0"), 2),
#'     ext_pattern = "\\.[Rr][Dd][Aa]$",
#'     doc_file = "inst/extdata/docuData/singlecellmultimodalv3.csv",
#'     pkg_name = "SingleCellMultiModal",
#'     dry.run = TRUE
#' )
#'
#' make_metadata(
#'     directory = "~/data/scmm",
#'     dataDirs = "pbmc",
#'     version = "1.0.0",
#'     ext_pattern = "\\.[Rr][Dd][AaSs]$|\\.[Mm][Tt][Xx]\\.[Gg][Zz]$",
#'     doc_file = "inst/extdata/docuData/singlecellmultimodalv6.csv",
#'     pkg_name = "SingleCellMultiModal",
#'     dry.run = TRUE
#' )
#'
#' @export
make_metadata <- function(
    directory = "~/data/scmm",
    dataDirs = c(rep("mouse_gastrulation", 2), rep("mouse_visual_cortex", 2), "pbmc"),
    version = c(rep(c("1.0.0", "2.0.0"), 2), "1.0.0"),
    ext_pattern = "\\.[Rr][Dd][AaSs]$|\\.[Mm][Tt][Xx]\\.[Gg][Zz]$|\\.[Hh]5$",
    doc_file,
    pkg_name = "SingleCellMultiModal",
    dry.run = TRUE,
    append = FALSE)
{
    if (!identical(basename(getwd()), pkg_name))
        stop("Run 'make_metadata()' from directory: ", pkg_name)

    ## validate the inputs before touching the file system
    if (missing(doc_file))
        stop("'doc_file' for generating the metadata is missing")

    exdata <- "inst/extdata"

    ## recursive: "inst" itself may not exist yet
    if (!dir.exists(exdata))
        dir.create(exdata, recursive = TRUE)

    metafile <- file.path(exdata, "metadata.csv")

    metadat <- MetaHubCreate(
        base_dir = directory,
        data_dirs = dataDirs,
        ext_pattern = ext_pattern,
        doc_file = doc_file,
        version = version,
        pkg_name = pkg_name
    )

    if (!dry.run) {
        has_metafile <- file.exists(metafile)
        ## only remove what exists; file.remove() warns otherwise
        if (!append && has_metafile)
        {
            file.remove(metafile)
        }
        ## The header is omitted only when rows are appended to an existing
        ## file; appending to a file that does not exist yet would otherwise
        ## produce a csv without column names
        readr::write_csv(metadat, metafile, append = append,
            col_names = !(append && has_metafile), na = "NA")
    }

    metadat
}
349 | 
350 | # make_metadata(
351 | #     dataDirs = "mouse_gastrulation",
352 | #     version = "1.0.0",
353 | #     doc_file = "inst/extdata/docuData/singlecellmultimodalv1.csv",
#     dry.run = FALSE
355 | # )
356 | #
357 | # make_metadata(
358 | #     directory="CITEseq/",
359 | #     dataDirs = "cord_blood",
360 | #     version = "1.0.0",
361 | #     doc_file = "inst/extdata/docuData/singlecellmultimodalv5.csv",
362 | #     dry.run = FALSE,
363 | #     append=TRUE
364 | # )
365 | 
366 | # make_metadata(
367 | #     dataDirs = c(rep("mouse_gastrulation", 2), "mouse_visual_cortex"),
368 | #     version = c("1.0.0", "2.0.0", "1.0.0"),
369 | #     doc_file = "inst/extdata/docuData/singlecellmultimodalv3.csv",
370 | #     dry.run = FALSE
371 | # )
372 | 
373 | # make_metadata(
374 | #     directory = "~/data/scmm",
375 | #     dataDirs = "peripheral_blood",
376 | #     version = "1.0.0",
377 | #     doc_file = "inst/extdata/docuData/singlecellmultimodalv5.csv",
378 | #     dry.run = FALSE,
379 | #     append = TRUE
380 | # )
381 | 
382 | # make_metadata(
383 | #     directory = "~/data/scmm",
384 | #     dataDirs = "pbmc_10x",
385 | #     version = "1.0.0",
386 | #     doc_file = "inst/extdata/docuData/singlecellmultimodalv6.csv",
387 | #     dry.run = FALSE,
388 | #     append = TRUE
389 | # )
390 | 
391 | # make_metadata(
392 | #     directory = "../.localdata/SingleCellMultiModal/",
393 | #     dataDirs = "macrophage_differentiation",
394 | #     version = "1.0.0",
395 | #     doc_file = "inst/extdata/docuData/singlecellmultimodalv7.csv",
396 | #     dry.run = FALSE,
397 | #     append = TRUE
398 | # )
399 | 
400 | ## request to update Maintainer field in older AH resources
401 | # aq <- AnnotationHub::query(eh, "SingleCellMultiModal")
402 | # aq[aq$maintainer == "Marcel Ramos <marcel.ramos@roswellpark.org>" &
403 | #     grepl("v[12]", aq$rdatapath)]
404 | 
405 | # make_metadata(
406 | #     directory = "~/data/scmm",
407 | #     dataDirs = "mouse_embryo_8_cell",
408 | #     version = "1.0.0",
409 | #     doc_file = "inst/extdata/docuData/singlecellmultimodalv8.csv",
410 | #     dry.run = FALSE,
411 | #     append = TRUE
412 | # )
413 | 
414 | make_metadata(
415 |     directory = "~/data/scmm",
416 |     dataDirs = "pbmc_10x",
417 |     version = "1.0.1",
418 |     doc_file = "inst/extdata/docuData/singlecellmultimodalv9.csv",
419 |     dry.run = FALSE,
420 |     append = TRUE
421 | )
422 | 
423 | ## Check metadata.csv file with:
424 | ExperimentHubData::makeExperimentHubMetadata(
425 |     file.path(Sys.getenv("HOME"), "gh/SingleCellMultiModal"), "metadata.csv"
426 | )
427 | 


--------------------------------------------------------------------------------
/inst/scripts/make-upload.R:
--------------------------------------------------------------------------------
 1 | # upload files to AWS S3
 2 | allextpat <- "\\.[Rr][Dd][AaSs]$|\\.[Mm][Tt][Xx]\\.[Gg][Zz]$|\\.[Hh]5$"
 3 | ## Folder name for a data version: the version string prefixed with "v".
 4 | .version_folder <- function(version) {
 5 |     paste("v", version, sep = "")
 6 | }
 7 | 
 8 | .getDataFiles <- function(directory = "~/data/scmm",
 9 |     dataDir = "mouse_gastrulation", pattern = allextpat, version = "1.0.0") {
10 |     ## files live in <directory>/<dataDir>/v<version>; subfolders are ignored
11 |     versioned_dir <- file.path(directory, dataDir, .version_folder(version))
12 |     list.files(
13 |         path = versioned_dir, pattern = pattern,
14 |         full.names = TRUE, recursive = FALSE
15 |     )
16 | }
17 | 
18 | ## check files are listed
19 | .getDataFiles(dataDir = "pbmc_10x", version = "1.0.1")
20 | 
21 | # IMPORTANT!
22 | # Make sure that AWS_DEFAULT_REGION, AWS_ACCESS_KEY_ID, and
23 | # AWS_SECRET_ACCESS_KEY are set in the ~/.Renviron file
24 | 
25 | # source("inst/scripts/make-metadata.R")
26 | 
27 | ## Upload the data files of one data set to S3, or preview the destinations.
28 | ##
29 | ## DataType  - data set folder below `directory` (required)
30 | ## directory - local root folder that holds the data set folders
31 | ## upload    - FALSE (default) only lists the S3 destinations; TRUE uploads
32 | ## fileExt   - regular expression selecting the files to consider
33 | ## version   - data version; files are read from the "v<version>" subfolder
34 | ##
35 | ## Returns the "s3://" destinations (character) when `upload` is FALSE,
36 | ## otherwise the value of AnnotationHubData:::upload_to_S3().
37 | upload_aws <- function(
38 |     DataType, directory = "~/data/scmm", upload = FALSE,
39 |     fileExt = allextpat, version = "1.0.0"
40 | ) {
41 |     if (missing(DataType))
42 |         stop("Enter a 'DataType' folder")
43 |     datafilepaths <- .getDataFiles(
44 |         directory = directory, dataDir = DataType,
45 |         pattern = fileExt, version = version
46 |     )
47 |     vfolder <- .version_folder(version)
48 |     bucketLocation <-
49 |         file.path("experimenthub", "SingleCellMultiModal", DataType, vfolder)
50 |     ## an empty listing usually means a wrong folder, version, or pattern
51 |     if (!length(datafilepaths)) {
52 |         nofiles <- paste0(
53 |             "No files matching 'fileExt' found in ",
54 |             file.path(directory, DataType, vfolder)
55 |         )
56 |         if (upload)
57 |             stop(nofiles)
58 |         warning(nofiles)
59 |     }
60 |     if (!upload) {
61 |         message("Data NOT uploaded")
62 |         ## file.path() supplies the second slash: "s3://experimenthub/..."
63 |         return(file.path("s3:/", bucketLocation, basename(datafilepaths)))
64 |     }
65 |     AnnotationHubData:::upload_to_S3(
66 |         file = datafilepaths,
67 |         remotename = basename(datafilepaths),
68 |         bucket = bucketLocation
69 |     )
70 | }
71 | 
72 | # upload_aws(DataType = "mouse_gastrulation", version = "1.0.0", upload=TRUE)
73 | # upload_aws(DataType = "mouse_gastrulation", version = "2.0.0", upload=TRUE)
74 | # upload_aws(DataType = "cord_blood", directory="CITEseq", version = "1.0.0",
75 | #     upload=TRUE)
76 | # upload_aws(DataType = "mouse_visual_cortex", upload=TRUE)
77 | # upload_aws(DataType = "pbmc_10x", directory = "~/data/scmm",
78 | #     version = "1.0.0", upload = TRUE)
79 | # upload_aws(DataType = "mouse_embryo_8_cell", directory = "~/data/scmm",
80 | #     version = "1.0.0", upload = TRUE)
81 | # upload_aws(DataType = "pbmc_10x", directory = "~/data/scmm",
82 | #     version = "1.0.1", upload = TRUE)
83 | 
84 | 


--------------------------------------------------------------------------------
/inst/scripts/make_docu.R:
--------------------------------------------------------------------------------
  1 | # version 1: mouse_gastrulation only
  2 | scmeta <- data.frame(
  3 |     DataProvider =
  4 |         "Dept. of Bioinformatics, The Babraham Institute, United Kingdom",
  5 |     TaxonomyId = "10090",
  6 |     Species = "Mus musculus",
  7 |     SourceUrl = "https://cloudstor.aarnet.edu.au/plus/s/Xzf5vCgAEUVgbfQ",
  8 |     SourceType = "RDS",
  9 |     SourceVersion = "1.0.0",
 10 |     DataType = "mouse_gastrulation",
 11 |     Maintainer = "Ricard Argelaguet <ricard@ebi.ac.uk>",
 12 |     stringsAsFactors = FALSE
 13 | )
 14 | write.csv(
 15 |     x = scmeta, row.names = FALSE,
 16 |     file = "inst/extdata/docuData/singlecellmultimodalv1.csv"
 17 | )
 18 | 
 19 | # version 2: mouse_gastrulation, data versions 1.0.0 and 2.0.0
 20 | # (length-one columns are recycled by data.frame(): one row per version)
 21 | scmeta <- data.frame(
 22 |     DataProvider =
 23 |         "Dept. of Bioinformatics, The Babraham Institute, United Kingdom",
 24 |     TaxonomyId = "10090",
 25 |     Species = "Mus musculus",
 26 |     SourceUrl = "https://cloudstor.aarnet.edu.au/plus/s/Xzf5vCgAEUVgbfQ",
 27 |     SourceType = "RDS",
 28 |     SourceVersion = c("1.0.0", "2.0.0"),
 29 |     DataType = "mouse_gastrulation",
 30 |     Maintainer = "Ricard Argelaguet <ricard@ebi.ac.uk>",
 31 |     stringsAsFactors = FALSE
 32 | )
 33 | write.csv(
 34 |     x = scmeta, row.names = FALSE,
 35 |     file = "inst/extdata/docuData/singlecellmultimodalv2.csv"
 36 | )
 37 | 
 38 | # version 3 with spatial: adds the two mouse_visual_cortex rows
 39 | scmeta <- data.frame(
 40 |     DataProvider = rep(c(
 41 |         "Dept. of Bioinformatics, The Babraham Institute, United Kingdom",
 42 |         "Dept. of Molecular Genetics, Allen Institute for Brain Science, United States"
 43 |     ), each = 2),
 44 |     TaxonomyId = "10090",
 45 |     Species = "Mus musculus",
 46 |     SourceUrl = c(
 47 |         rep("https://cloudstor.aarnet.edu.au/plus/s/Xzf5vCgAEUVgbfQ", 2),
 48 |         "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE71585",
 49 |         "https://www.dropbox.com/sh/avj4nrd4la5i88u/AACafWwBbE-xsLvOGDwRZDpYa?dl=0"
 50 |     ),
 51 |     SourceType = rep(c("RDS", "TXT"), each = 2),
 52 |     SourceVersion = rep(c("1.0.0", "2.0.0"), times = 2),
 53 |     DataType = rep(c("mouse_gastrulation", "mouse_visual_cortex"), each = 2),
 54 |     Maintainer = rep(c("Ricard Argelaguet <ricard@ebi.ac.uk>",
 55 |         "Dario Righelli <dario.righelli@gmail.com>"), each = 2),
 56 |     stringsAsFactors = FALSE
 57 | )
 58 | # one row per data set / version pair, in the order listed above
 59 | write.csv(
 60 |     x = scmeta, row.names = FALSE,
 61 |     file = "inst/extdata/docuData/singlecellmultimodalv3.csv"
 62 | )
 63 | 
 64 | 
 65 | # version 4 with cord_blood: the four mouse rows plus one human CITE-seq row
 66 | scmeta <- data.frame(
 67 |     DataProvider = c(
 68 |         rep("Dept. of Bioinformatics, The Babraham Institute, United Kingdom", 2),
 69 |         rep("Dept. of Molecular Genetics, Allen Institute for Brain Science, United States", 2),
 70 |         "Innovation Lab, New York Genome Center, New York, United States"
 71 |     ),
 72 |     TaxonomyId = c(rep("10090", 4), "9606"),
 73 |     Species = c(rep("Mus musculus", 4), "Homo sapiens"),
 74 |     SourceUrl = c(
 75 |         rep("https://cloudstor.aarnet.edu.au/plus/s/Xzf5vCgAEUVgbfQ", 2),
 76 |         rep("https://www.dropbox.com/sh/avj4nrd4la5i88u/AACafWwBbE-xsLvOGDwRZDpYa?dl=0", 2),
 77 |         "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE100866"
 78 |     ),
 79 |     SourceType = c(rep("RDS", 2), rep("TXT", 3)),
 80 |     SourceVersion = c("1.0.0", "2.0.0", "1.0.0", "2.0.0", "1.0.0"),
 81 |     DataType = c(rep("mouse_gastrulation", 2), rep("mouse_visual_cortex", 2),
 82 |         "cord_blood"),
 83 |     Maintainer = c(rep("Ricard Argelaguet <ricard@ebi.ac.uk>", 2),
 84 |                    rep("Dario Righelli <dario.righelli@gmail.com>", 3)),
 85 |     stringsAsFactors = FALSE
 86 | )
 87 | # write to the v4 file so that the version 3 table is not overwritten
 88 | write.csv(
 89 |     x = scmeta, row.names = FALSE,
 90 |     file = "inst/extdata/docuData/singlecellmultimodalv4.csv"
 91 | )
 92 | 
 93 | # cord_blood (CITE-seq) on its own, as a single-row table
 94 | citeseqmeta <- data.frame(
 95 |     DataProvider =
 96 |         "Innovation Lab, New York Genome Center, New York, United States",
 97 |     TaxonomyId = "9606",
 98 |     Species = "Homo sapiens",
 99 |     SourceUrl = "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE100866",
100 |     SourceType = "TXT",
101 |     SourceVersion = "1.0.0",
102 |     DataType = "cord_blood",  # same spelling as the CITEseq() DataType
103 |     Maintainer = "Dario Righelli <dario.righelli@gmail.com>",
104 |     stringsAsFactors = FALSE
105 | )
106 | 
107 | write.csv(
108 |     citeseqmeta,
109 |     file = "inst/extdata/docuData/singlecellmultimodalv5.csv",
110 |     row.names = FALSE
111 | )
112 | #
113 | #
114 | # # version 2 with spatial
115 | # scmeta <- data.frame(
116 | #     DataProvider = c(
117 | #         rep("Dept. of Bioinformatics, The Babraham Institute, United Kingdom", 2),
118 | #         rep("Dept. of Molecular Genetics, Allen Institute for Brain Science, United States", 2),
119 | #         "Innovation Lab, New York Genome Center, New York, United States"
120 | #     ),
121 | #     TaxonomyId = c(rep("10090",4), "9606"),
122 | #     Species = c(rep("Mus musculus", 4), "Homo sapiens"),
123 | #     SourceUrl = c(
124 | #         rep("https://cloudstor.aarnet.edu.au/plus/s/Xzf5vCgAEUVgbfQ", 2),
125 | #         rep("https://www.dropbox.com/sh/avj4nrd4la5i88u/AACafWwBbE-xsLvOGDwRZDpYa?dl=0", 2),
126 | #         "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE100866"
127 | #     ),
128 | #     SourceType = c(rep("RDS", 2), rep("TXT",3)),
129 | #     SourceVersion = c("1.0.0", "2.0.0", "1.0.0", "2.0.0", "1.0.0"),
130 | #     DataType = c(rep("mouse_gastrulation", 2), rep("mouse_visual_cortex",2), "coord_blood"),
131 | #     Maintainer = c(rep("Marcel Ramos <marcel.ramos@roswellpark.org>", 2),
132 | #                    rep("Dario Righelli <dario.righelli@gmail.com>",3)),
133 | #     stringsAsFactors = FALSE
134 | # )
135 | # write.csv(
136 | #     scmeta,
137 | #     file = "inst/extdata/docuData/singlecellmultimodalv3.csv",
138 | #     row.names = FALSE
139 | # )
140 | 
141 | # version 6: pbmc_10x (this table has no SourceType column)
142 | scmeta <- data.frame(
143 |     DataProvider =
144 |         "European Bioinformatics Institute (EMBL-EBI), United Kingdom",
145 |     TaxonomyId = "9606",
146 |     Species = "Homo sapiens",
147 |     SourceUrl =
148 |         "http://ftp.ebi.ac.uk/pub/databases/mofa/10x_rna_atac_vignette/filtered_feature_bc_matrix/",
149 |     SourceVersion = "1.0.0",
150 |     DataType = "pbmc_10x",
151 |     Maintainer = "Ricard Argelaguet <ricard@ebi.ac.uk>",
152 |     stringsAsFactors = FALSE
153 | )
154 | write.csv(
155 |     x = scmeta, row.names = FALSE,
156 |     file = "inst/extdata/docuData/singlecellmultimodalv6.csv"
157 | )
158 | 
159 | ## version 7: metadata for the SCoPE2 dataset (macrophage_differentiation)
160 | ## three rows: one Google Drive source followed by two rows for GSE142392
161 | scope2meta <- data.frame(
162 |     DataProvider = paste0(
163 |         "Slavov Laboratory and SCP Center at ",
164 |         "Northeastern University, Boston, United ", "states"),
165 |     TaxonomyId = "9606",
166 |     Species = "Homo sapiens",
167 |     SourceUrl = c(
168 |         "https://drive.google.com/file/d/1sF5STkofF_f2msnYaaYdWabou84Qf2Xr/view?usp=sharing",
169 |         rep("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE142392", 2)
170 |     ),
171 |     SourceType = rep("CSV", 3),
172 |     SourceVersion = "1.0.0",
173 |     DataType = "macrophage_differentiation",
174 |     Maintainer = "Christophe Vanderaa <christophe.vanderaa@uclouvain.be>",
175 |     stringsAsFactors = FALSE
176 | )
177 | write.csv(
178 |     x = scope2meta, row.names = FALSE,
179 |     file = "inst/extdata/docuData/singlecellmultimodalv7.csv"
180 | )
181 | 
182 | # version 8: GTseq dataset (mouse_embryo_8_cell)
183 | # single-row table without a SourceType column
184 | gtseq <- data.frame(
185 |     DataProvider =
186 |         "Wellcome Trust Sanger Institute, Cambridge, United Kingdom",
187 |     TaxonomyId = "10090",
188 |     Species = "Mus musculus",
189 |     SourceUrl = "https://www.ebi.ac.uk/ena/browser/view/PRJEB9051",
190 |     SourceVersion = "1.0.0",
191 |     DataType = "mouse_embryo_8_cell",
192 |     Maintainer = "Ludwig Geistlinger <ludwig_geistlinger@hms.harvard.edu>",
193 |     stringsAsFactors = FALSE
194 | )
195 | write.csv(
196 |     x = gtseq, row.names = FALSE,
197 |     file = "inst/extdata/docuData/singlecellmultimodalv8.csv"
198 | )
199 | 
200 | 
201 | # version 9: pbmc_10x, data version 1.0.1
202 | scmeta9 <- data.frame(
203 |     DataProvider =
204 |         "European Bioinformatics Institute (EMBL-EBI), United Kingdom",
205 |     TaxonomyId = "9606",
206 |     Species = "Homo sapiens",
207 |     SourceUrl = "http://ftp.ebi.ac.uk/pub/databases/mofa/10x_rna_atac_vignette/filtered_feature_bc_matrix/",
208 |     SourceVersion = "1.0.1",
209 |     DataType = "pbmc_10x",
210 |     Maintainer = "Marcel Ramos <marcel.ramos@roswellpark.org>",
211 |     stringsAsFactors = FALSE
212 | )
213 | write.csv(
214 |     x = scmeta9, row.names = FALSE,
215 |     file = "inst/extdata/docuData/singlecellmultimodalv9.csv"
216 | )
217 | 
218 | 


--------------------------------------------------------------------------------
/inst/scripts/ontomap_update.R:
--------------------------------------------------------------------------------
 1 | ## read in the ontology map from inst/extdata
 2 | onto <- readr::read_tsv("inst/extdata/ontomap.tsv")
 3 | 
 4 | ## modification: rename the "_protein" data type to "macrophage_differentiation"
 5 | onto <- as.data.frame(onto)
 6 | onto[onto$DataType == "macrophage_differentiation_protein", "DataType"] <-
 7 |     "macrophage_differentiation"
 8 | 
 9 | ## output checking: exactly four distinct data types must remain
10 | stopifnot(
11 |     identical(length(unique(onto[["DataType"]])), 4L)
12 | )
13 | 
14 | ## writing: overwrite the input file, tab-separated and unquoted
15 | write.table(
16 |     x = onto, file = "inst/extdata/ontomap.tsv",
17 |     quote = FALSE, sep = "\t", row.names = FALSE
18 | )
19 | 
20 | ## read Kelly's cell type ontology sheet (built on an older ontomap version)
21 | cellontokelly <- as.data.frame(
22 |     readr::read_tsv("~/Downloads/Cell type ontology - Sheet2.tsv")
23 | )
24 | onto <- as.data.frame(
25 |     readr::read_tsv("inst/extdata/ontomap.tsv")
26 | )
27 | 
28 | ## drop the mouse_visual_cortex_scRNAseq rows of the sheet, then build join keys;
29 | ## the gsub() renames presumably adapt names to the sheet's dataset_name spelling
30 | cellontokelly <-
31 |     cellontokelly[!cellontokelly$dataset_name=="mouse_visual_cortex_scRNAseq",]
32 | ontokey <- paste0(onto$DataType,"_",onto$function_name)
33 | ontokey <-
34 |     gsub("SCoPE2", "protein_SCoPE2", gsub("scMultiome", "multiome", ontokey))
35 | ontokey <- paste0(ontokey, "_", onto$original_cell_name)
36 | kellykey <-
37 |     paste0(cellontokelly$dataset_name, "_", cellontokelly$original_cell_name)
38 | ## reorder the sheet to the row order of onto; unmatched keys give NA rows
39 | cellontokelly <- cellontokelly[match(ontokey, kellykey),]
40 | 
41 | onto$ontology_ID <- cellontokelly$ontology_ID
42 | onto$ontology_cell_name <- cellontokelly$ontology_cell_name
43 | 
44 | ## writing: overwrite the ontomap file with the added ontology columns
45 | write.table(
46 |     x = onto, file = "inst/extdata/ontomap.tsv",
47 |     quote = FALSE, sep = "\t", row.names = FALSE
48 | )
49 | 
50 | 
51 | ## adding celltypes for cord_blood citeseq
52 | ## NOTE(review): both paths below are relative to the working directory
53 | load("cord_blood/v1.0.0/coldata_scRNAseq.rda")
54 | cd <- coldata_scRNAseq
55 | ct <- unique(cd$celltype)
56 | ct <- ct[!is.na(ct)]  # ct[-which(is.na(ct))] is empty when there is no NA
57 | 
58 | onto <- as.data.frame(
59 |     readr::read_tsv("inst/extdata/ontomap.tsv")
60 | )
61 | ## one new row per cell type; the last two columns are left as NA
62 | cn <- colnames(onto)
63 | ctcb <- data.frame("cord_blood", "CITEseq", "celltype", ct, NA, NA)
64 | colnames(ctcb) <- cn
65 | onton <- rbind.data.frame(onto, ctcb)
66 | write.table(
67 |     x = onton, file = "inst/extdata/ontomap.tsv",
68 |     quote = FALSE, sep = "\t", row.names = FALSE
69 | )
70 | 
71 | 
72 | 
73 | 
74 | 
75 | 


--------------------------------------------------------------------------------
/inst/scripts/update_wiki.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Re-install the package with the R build selected by RVER, render the
 3 | # contributing guidelines into the local wiki checkout, and push the result.
 4 | 
 5 | SCMM="$HOME/gh/SingleCellMultiModal"
 6 | 
 7 | WIKI="$HOME/wiki/SingleCellMultiModal.wiki"
 8 | 
 9 | RVER="devel"
10 | 
11 | # stop if the checkout is missing: the render step below uses a relative path
12 | cd "$SCMM" || exit 1
13 | 
14 | export R_LIBS_USER="/media/$USER/1D24A0EA4286043C1/bioc-$RVER/"
15 | 
16 | # kept as a plain string and expanded unquoted below, so that the options
17 | # are split into separate words
18 | RCMD="$HOME/src/svn/r-$RVER/R/bin/R --no-save --no-restore-data"
19 | 
20 | $RCMD CMD INSTALL "$SCMM"
21 | 
22 | $RCMD -e "rmarkdown::render('inst/scripts/Contributing-Guidelines.Rmd', output_file = '$WIKI/Contributing-Guidelines.md')"
23 | 
24 | # without this guard the git commands below would run in the package checkout
25 | cd "$WIKI" || exit 1
26 | 
27 | git diff
28 | 
29 | git pull origin master
30 | git commit -am "update wiki"
31 | git push origin master
32 | 
33 | 


--------------------------------------------------------------------------------
/man/CITEseq.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/CITEseq.R
  3 | \name{CITEseq}
  4 | \alias{CITEseq}
  5 | \title{CITEseq}
  6 | \usage{
  7 | CITEseq(
  8 |   DataType = c("cord_blood", "peripheral_blood"),
  9 |   modes = "*",
 10 |   version = "1.0.0",
 11 |   dry.run = TRUE,
 12 |   filtered = FALSE,
 13 |   verbose = TRUE,
 14 |   DataClass = c("MultiAssayExperiment", "SingleCellExperiment"),
 15 |   ...
 16 | )
 17 | }
 18 | \arguments{
 19 | \item{DataType}{\code{character(1)} indicating the identifier of the dataset to
 20 | retrieve.  (default "cord_blood")}
 21 | 
 22 | \item{modes}{\code{character()} The assay types or modes of data to obtain these
 23 | include scADT and scRNA-seq data by default.}
 24 | 
 25 | \item{version}{\code{character(1)} The data version to retrieve
 26 | (default '1.0.0').}
 27 | 
 28 | \item{dry.run}{\code{logical(1)} Whether to return the dataset names before actual
 29 | download (default \code{TRUE})}
 30 | 
 31 | \item{filtered}{\code{logical(1)} indicating if the returned dataset needs to
 32 | have filtered cells.
 33 | See Details for additional information about the filtering process.}
 34 | 
 35 | \item{verbose}{\code{logical(1)} Whether to show the dataset currently being
 36 | (down)loaded (default \code{TRUE})}
 37 | 
 38 | \item{DataClass}{either MultiAssayExperiment or SingleCellExperiment
 39 | data classes can be returned (default MultiAssayExperiment)}
 40 | 
 41 | \item{...}{Additional arguments passed on to the
 42 | \link[ExperimentHub]{ExperimentHub-class} constructor}
 43 | }
 44 | \value{
 45 | A single cell multi-modal
 46 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}}
 47 | or informative \code{data.frame} when \code{dry.run} is \code{TRUE}. When \code{DataClass} is
 48 | \code{SingleCellExperiment} an object of this class is returned with an RNA
 49 | assay as main experiment and other assay(s) as \code{AltExp(s)}.
 50 | }
 51 | \description{
 52 | CITEseq assembles data on-the-fly from \code{ExperimentHub} to
 53 | provide a
 54 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}}
 55 | container. The \code{DataType} argument provides access to the
 56 | available datasets associated with the package.
 57 | }
 58 | \details{
 59 | CITEseq data are a combination of single cell transcriptomics and
 60 | about a hundred cell surface proteins.
 61 | Available datasets are:
 62 | \itemize{
 63 | \item cord_blood: a dataset of single cells of cord blood as
 64 | provided in Stoeckius et al. (2017).
 65 | \itemize{
 66 | \item scRNA_Counts - Stoeckius scRNA-seq gene count matrix
 67 | \item scADT - Stoeckius antibody-derived tags (ADT) data
 68 | }
 69 | \item peripheral_blood: a dataset of single cells of peripheral
 70 | blood as provided in Mimitou et al. (2019). We provide two different
 71 | conditions: controls (CTRL) and Cutaneous T-cell Lymphoma (CTCL). Build an
 72 | appropriate \code{modes} regex to subselect the dataset modes.
 73 | \itemize{
 74 | \item scRNA - Mimitou scRNA-seq gene count matrix
 75 | \item scADT - Mimitou antibody-derived tags (ADT) data
 76 | \item scHTO - Mimitou Hashtag Oligo (HTO) data
 77 | \item TCRab - Mimitou T-cell Receptors (TCR) alpha and beta
 78 | available through the object metadata.
 79 | \item TCRgd - Mimitou T-cell Receptors (TCR) gamma and delta
 80 | available through the object metadata.
 81 | }
 82 | }
 83 | 
 84 | If \code{filtered} parameter is \code{FALSE} (default), the \code{colData} of the returned
 85 | object contains multiple columns of \code{logicals} indicating the cells to be
 86 | discarded.
 87 | In case \code{filtered} is \code{TRUE}, the \code{discard} column is used to filter the
 88 | cells.
 89 | Column \code{adt.discard} indicates the cells to be discarded computed on the ADT
 90 | assay.
 91 | Column \code{mito.discard} indicates the cells to be discarded computed on the
 92 | RNA assay and mitochondrial genes.
 93 | Column \code{discard} combines the previous columns with an \code{OR} operator.
 94 | Note that for the \code{peripheral_blood} dataset these three columns are
 95 | computed and returned separately for the \code{CTCL} and \code{CTRL} conditions.
 96 | In this case the additional \code{discard} column combines the \code{discard.CTCL} and
 97 | \code{discard.CTRL} columns with an \code{OR} operator.
 98 | Cell filtering has been computed for \code{cord_blood} and \code{peripheral_blood}
 99 | datasets following section 12.3 of the Advanced Single-Cell Analysis with
100 | Bioconductor book.
101 | Executed code can be retrieved in the CITEseq_filtering.R script of this
102 | package.
103 | }
104 | \examples{
105 | 
106 | mae <- CITEseq(DataType="cord_blood", dry.run=FALSE)
107 | experiments(mae)
108 | }
109 | \references{
110 | Stoeckius et al. (2017), Mimitou et al. (2019)
111 | }
112 | \author{
113 | Dario Righelli
114 | }
115 | 


--------------------------------------------------------------------------------
/man/GTseq.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/GTseq.R
 3 | \name{GTseq}
 4 | \alias{GTseq}
 5 | \title{Parallel sequencing data of single-cell genomes and transcriptomes}
 6 | \source{
 7 | \url{https://www.ebi.ac.uk/ena/browser/view/PRJEB9051}
 8 | }
 9 | \usage{
10 | GTseq(
11 |   DataType = "mouse_embryo_8_cell",
12 |   modes = "*",
13 |   version = "1.0.0",
14 |   dry.run = TRUE,
15 |   verbose = TRUE,
16 |   ...
17 | )
18 | }
19 | \arguments{
20 | \item{DataType}{\code{character(1)} Indicates study that produces this type of
21 | data (default: 'mouse_embryo_8_cell')}
22 | 
23 | \item{modes}{\code{character()} A wildcard / glob pattern of modes, such as
24 | \code{"*omic"}. A wildcard of \code{"*"} will return all modes including
25 | copy numbers ("genomic") and RNA-seq read counts ("transcriptomic"),
26 | which is the default.}
27 | 
28 | \item{version}{\code{character(1)} Currently, only version '1.0.0'.}
29 | 
30 | \item{dry.run}{\code{logical(1)} Whether to return the dataset names before actual
31 | download (default \code{TRUE})}
32 | 
33 | \item{verbose}{\code{logical(1)} Whether to show the dataset currently being
34 | (down)loaded (default \code{TRUE})}
35 | 
36 | \item{...}{Additional arguments passed on to the
37 | \link[ExperimentHub:ExperimentHub-class]{ExperimentHub} constructor}
38 | }
39 | \value{
40 | A single cell multi-modal
41 | \link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment} or
42 | informative \code{data.frame} when \code{dry.run} is \code{TRUE}
43 | }
44 | \description{
45 | GTseq assembles data on-the-fly from \code{ExperimentHub} to provide
46 | a
47 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}}
48 | container. The \code{DataType} argument provides access to the
49 | \code{mouse_embryo_8_cell} dataset as obtained from Macaulay et al. (2015).
50 | Protocol information for this dataset is available from Macaulay et al.
51 | (2016). See references.
52 | }
53 | \details{
54 | G&T-seq is a combination of Picoplex amplified gDNA sequencing
55 | (genome) and SMARTSeq2 amplified cDNA sequencing (transcriptome) of the
56 | same cell. For more information, see Macaulay et al. (2015).
57 | * mouse_embryo_8_cell:
58 | this dataset was filtered for bad cells as specified in Macaulay
59 | et al. (2015).
60 | * genomic - integer copy numbers as detected from scDNA-seq
61 | * transcriptomic - raw read counts as quantified from scRNA-seq
62 | }
63 | \section{metadata}{
64 | 
65 | The \code{MultiAssayExperiment} metadata includes the original function call,
66 | which records the data version requested.
67 | }
68 | 
69 | \examples{
70 | 
71 | GTseq()
72 | 
73 | }
74 | \references{
75 | Macaulay et al. (2015) G&T-seq: parallel sequencing of single-cell
76 | genomes and transcriptomes. Nat Methods, 12:519–22.
77 | 
78 | Macaulay et al. (2016) Separation and parallel sequencing of the genomes
79 | and transcriptomes of single cells using G&T-seq. Nat Protoc, 11:2081–103.
80 | }
81 | \seealso{
82 | SingleCellMultiModal-package
83 | }
84 | 


--------------------------------------------------------------------------------
/man/SCoPE2.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/SCoPE2.R
 3 | \name{SCoPE2}
 4 | \alias{SCoPE2}
 5 | \title{Single-cell RNA sequencing and proteomics}
 6 | \source{
 7 | All files are linked from the slavovlab website
 8 | \url{https://scope2.slavovlab.net/docs/data}
 9 | }
10 | \usage{
11 | SCoPE2(
12 |   DataType = "macrophage_differentiation",
13 |   modes = "*",
14 |   version = "1.0.0",
15 |   dry.run = TRUE,
16 |   verbose = TRUE,
17 |   ...
18 | )
19 | }
20 | \arguments{
21 | \item{DataType}{\code{character(1)} Indicates study that produces this type of
22 | data (default: 'macrophage_differentiation')}
23 | 
24 | \item{modes}{\code{character()} A wildcard / glob pattern of modes, such as
25 | \code{"rna"}. A wildcard of \code{"*"} will return all modes, that are
26 | transcriptome ("rna") or proteome ("protein") which is the
27 | default.}
28 | 
29 | \item{version}{\code{character(1)}, currently only version '1.0.0' is
30 | available}
31 | 
32 | \item{dry.run}{\code{logical(1)} Whether to return the dataset names before actual
33 | download (default \code{TRUE})}
34 | 
35 | \item{verbose}{\code{logical(1)} Whether to show the dataset currently being
36 | (down)loaded (default \code{TRUE})}
37 | 
38 | \item{...}{Additional arguments passed on to the
39 | \link[ExperimentHub]{ExperimentHub-class} constructor}
40 | }
41 | \value{
42 | A single cell multi-modal
43 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}}
44 | or informative \code{data.frame} when \code{dry.run} is \code{TRUE}
45 | }
46 | \description{
47 | SCoPE2 assembles data on-the-fly from \code{ExperimentHub} to provide
48 | a
49 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}}
50 | container. The \code{DataType} argument provides access to the \code{SCoPE2} dataset
51 | as provided by Specht et al. (2020; DOI:
52 | \url{http://dx.doi.org/10.1101/665307}). The article provides more information
53 | about the data acquisition and pre-processing.
54 | }
55 | \details{
56 | The SCoPE2 study combines scRNA-seq (transcriptome) and
57 | single-cell proteomics.
58 | \itemize{
59 | \item macrophage_differentiation: the cells are monocytes that undergo
60 | macrophage differentiation. No annotation is available for the
61 | transcriptome data, but batch and cell type annotations are
62 | available for the proteomics data in the \code{celltype} \code{colData} column.
63 | The transcriptomics and proteomics data were not measured from the same
64 | cells but from a distinct set of cell cultures.
65 | This dataset provides already filtered bad quality cells.
66 | \itemize{
67 | \item scRNAseq1 - single-cell transcriptome (batch 1)
68 | \item scRNAseq2 - single-cell transcriptome (batch 2)
69 | \item scp - single-cell proteomics
70 | }
71 | }
72 | }
73 | \examples{
74 | 
75 | SCoPE2(DataType = "macrophage_differentiation",
76 |        modes = "*",
77 |        version = "1.0.0",
78 |        dry.run = TRUE)
79 | 
80 | }
81 | \references{
82 | Specht, Harrison, Edward Emmott, Aleksandra A. Petelski, R.
83 | Gray Huffman, David H. Perlman, Marco Serra, Peter Kharchenko,
84 | Antonius Koller, and Nikolai Slavov. 2020. “Single-Cell
85 | Proteomic and Transcriptomic Analysis of Macrophage
86 | Heterogeneity.” bioRxiv. https://doi.org/10.1101/665307.
87 | }
88 | \seealso{
89 | SingleCellMultiModal-package
90 | }
91 | 


--------------------------------------------------------------------------------
/man/SingleCellMultiModal-package.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/SingleCellMultiModal-package.R
 3 | \docType{package}
 4 | \name{SingleCellMultiModal-package}
 5 | \alias{SingleCellMultiModal-package}
 6 | \title{SingleCellMultiModal-package}
 7 | \description{
 8 | The SingleCellMultiModal package provides a convenient and user-friendly
 9 | representation of multi-modal data from projects such as \code{scNMT} for mouse
10 | gastrulation.
11 | }
12 | \examples{
13 | help(package = "SingleCellMultiModal")
14 | 
15 | }
16 | \seealso{
17 | Useful links:
18 | \itemize{
19 |   \item Report bugs at \url{https://github.com/waldronlab/SingleCellMultiModal/issues}
20 | }
21 | 
22 | }
23 | \author{
24 | \strong{Maintainer}: Marcel Ramos \email{marcel.ramos@roswellpark.org} (\href{https://orcid.org/0000-0002-3242-0582}{ORCID})
25 | 
26 | Authors:
27 | \itemize{
28 |   \item Ricard Argelaguet \email{ricard@ebi.ac.uk}
29 |   \item Dario Righelli \email{dario.righelli@gmail.com}
30 |   \item Kelly Eckenrode \email{kelly.eckenrode@sph.cuny.edu}
31 |   \item Ludwig Geistlinger \email{ludwig_geistlinger@hms.harvard.edu}
32 |   \item Levi Waldron \email{lwaldron.research@gmail.com}
33 | }
34 | 
35 | Other contributors:
36 | \itemize{
37 |   \item Al Abadi [contributor]
38 |   \item Christophe Vanderaa \email{christophe.vanderaa@uclouvain.be} [contributor]
39 | }
40 | 
41 | }
42 | 


--------------------------------------------------------------------------------
/man/SingleCellMultiModal.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/SingleCellMultiModal.R
 3 | \name{SingleCellMultiModal}
 4 | \alias{SingleCellMultiModal}
 5 | \title{Combining Modalities into one MultiAssayExperiment}
 6 | \usage{
 7 | SingleCellMultiModal(
 8 |   DataTypes,
 9 |   modes = "*",
10 |   versions = "1.0.0",
11 |   dry.run = TRUE,
12 |   verbose = TRUE,
13 |   ...
14 | )
15 | }
16 | \arguments{
17 | \item{DataTypes}{\code{character()} A vector of data types as indicated in each
18 | individual function by the \code{DataType} parameter. These can be any of
19 | the following: "mouse_gastrulation", "pbmc_10x",
20 | "macrophage_differentiation", "cord_blood", "peripheral_blood",
21 | "mouse_visual_cortex", "mouse_embryo_8_cell"}
22 | 
23 | \item{modes}{list() A list or CharacterList of modes for each data type
24 | where each element corresponds to one data type.}
25 | 
26 | \item{versions}{\code{character()} A vector of versions for each DataType. By
27 | default, version \verb{1.0.0} is obtained for all data types.}
28 | 
29 | \item{dry.run}{\code{logical(1)} Whether to return the dataset names before actual
30 | download (default \code{TRUE})}
31 | 
32 | \item{verbose}{\code{logical(1)} Whether to show the dataset currently being
33 | (down)loaded (default \code{TRUE})}
34 | 
35 | \item{...}{Additional arguments passed on to the
36 | \link[ExperimentHub]{ExperimentHub-class} constructor}
37 | }
38 | \value{
39 | A multi-modality \code{MultiAssayExperiment}
40 | }
41 | \description{
42 | Combine multiple single cell modalities into one using the input of the
43 | individual functions.
44 | }
45 | \section{metadata}{
46 | 
47 | The metadata in the \code{MultiAssayExperiment} contains the original
48 | function call used to generate the object (labeled as \code{call}),
49 | a \code{call_map} which provides traceability of technology functions to
50 | \code{DataType} prefixes, and lastly, R version information as \code{version}.
51 | }
52 | 
53 | \examples{
54 | 
55 | SingleCellMultiModal(c("mouse_gastrulation", "pbmc_10x"),
56 |     modes = list(c("acc*", "met*"), "rna"),
57 |     version = c("2.0.0", "1.0.0"), dry.run = TRUE, verbose = TRUE
58 | )
59 | 
60 | }
61 | 


--------------------------------------------------------------------------------
/man/addCTLabels.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/cellGating.R
 3 | \name{addCTLabels}
 4 | \alias{addCTLabels}
 5 | \title{addCTLabels}
 6 | \usage{
 7 | addCTLabels(
 8 |   cd,
 9 |   out,
10 |   outname,
11 |   ct,
12 |   mkrcol = "markers",
13 |   ctcol = "celltype",
14 |   overwrite = FALSE,
15 |   verbose = TRUE
16 | )
17 | }
18 | \arguments{
19 | \item{cd}{the \code{colData} \code{DataFrame}}
20 | 
21 | \item{out}{list data structure returned by \code{getCellGroups}}
22 | 
23 | \item{outname}{character indicating the name of the out data structure}
24 | 
25 | \item{ct}{character indicating the celltype to assign in the \code{ctcol}}
26 | 
27 | \item{mkrcol}{character indicating the cd column to store the markers
28 | indicated by \code{outname} (default is markers)}
29 | 
30 | \item{ctcol}{character indicating the column in cd to store the cell type
31 | indicated by \code{ct} (default is celltype)}
32 | 
33 | \item{overwrite}{logical indicating if the cell types have to be overwritten
34 | without checking if detected barcodes were already assigned to other celltypes}
35 | 
36 | \item{verbose}{logical for having informative messages during the execution}
37 | }
38 | \value{
39 | an updated version of the cd DataFrame
40 | }
41 | \description{
42 | addCTLabels
43 | }
44 | 


--------------------------------------------------------------------------------
/man/dot-CITEseqMaeToSce.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/CITEseq.R
 3 | \name{.CITEseqMaeToSce}
 4 | \alias{.CITEseqMaeToSce}
 5 | \title{CITEseqMaeToSce}
 6 | \usage{
 7 | .CITEseqMaeToSce(mae)
 8 | }
 9 | \arguments{
10 | \item{mae}{a MultiAssayExperiment object with scRNA and/or scADT and/or
11 | scHTO named experiments.}
12 | }
13 | \value{
14 | a SingleCellExperiment object, as widely used, with scRNA data as counts
15 | and scADT, scHTO data as altExps.
16 | If only one modality is present, it is returned as the main assay of the SCE.
17 | }
18 | \description{
19 | converts a \code{MultiAssayExperiment} object with CITEseq data into
20 | a \code{SingleCellExperiment} object to be used with already known methods and
21 | packages in literature.
22 | 
23 | Note that for creating a \code{SingleCellExperiment} object the following function
24 | subsets all the assays present in the \code{MultiAssayExperiment} with only the
25 | common cells across all the modalities.
26 | This could result in a not complete object.
27 | }
28 | \keyword{internal}
29 | 


--------------------------------------------------------------------------------
/man/getCellGroups.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/cellGating.R
 3 | \name{getCellGroups}
 4 | \alias{getCellGroups}
 5 | \title{getCellGroups}
 6 | \usage{
 7 | getCellGroups(mat, adt1 = "CD19", adt2 = "CD3", th1 = 0.2, th2 = 0)
 8 | }
 9 | \arguments{
10 | \item{mat}{matrix of counts or clr transformed counts for ADT data in CITEseq}
11 | 
12 | \item{adt1}{character indicating the name of the marker to plot on the x-axis
13 | (default is CD19).}
14 | 
15 | \item{adt2}{character indicating the name of the marker to plot on the y-axis
16 | (default is CD3).}
17 | 
18 | \item{th1}{numeric indicating the threshold for the marker on the x-axis
19 | (default is 0.2).}
20 | 
21 | \item{th2}{numeric indicating the threshold for the marker on the y-axis
22 | (default is 0).}
23 | }
24 | \value{
25 | a list of four different elements, each one indicating the quadrant
26 | into which the thresholds divide the plotting space, in Euclidean order
27 | (I, II, III, IV quadrant), indicating respectively the +/+, +/-, -/+, -/-
28 | combinations for the couple of selected ADTs.
29 | Each element of the list contains two objects:
30 | one with the list of detected barcodes and
31 | one indicating the percentage of barcodes falling into that
32 | quadrant.
33 | }
34 | \description{
35 | Shows the cells/barcodes in two different plots (scatter and density)
36 | dividing the space into four quadrants indicated by the two thresholds given
37 | as input parameters.
38 | The x/y-axis represent respectively the two ADTs given as input.
39 | It returns a list of one element for each quadrant, each with barcodes and
40 | percentage (see Value section for details).
41 | }
42 | \details{
43 | Helps to do manual gating for cell type identification with CITEseq
44 | or similar data, providing cell markers.
45 | Once two interesting markers for a cell type have been identified, the user has to
46 | play with the thresholds to identify the cell populations specified by an
47 | uptake (+) or downtake (-) of the couple of markers (ADTs) previously selected.
48 | }
49 | 


--------------------------------------------------------------------------------
/man/ontomap.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/ontomap.R
 3 | \name{ontomap}
 4 | \alias{ontomap}
 5 | \title{Obtain a map of cell types for each dataset}
 6 | \usage{
 7 | ontomap(dataset = c("scNMT", "scMultiome", "SCoPE2", "CITEseq", "seqFISH"))
 8 | }
 9 | \arguments{
10 | \item{dataset}{\code{character()} One of the existing functions within the
11 | package. If missing, a map of all cell types in each function will
12 | be provided.}
13 | }
14 | \value{
15 | A \code{data.frame} of metadata with cell types and ontologies
16 | }
17 | \description{
18 | The \code{ontomap} function provides a mapping of all the cell names across
19 | all the data sets or for a specified data set.
20 | }
21 | \details{
22 | Note that \code{CITEseq} does not have any cell annotations; therefore, no entries
23 | are present in the \code{ontomap}.
24 | }
25 | \examples{
26 | 
27 | ontomap(dataset = "scNMT")
28 | 
29 | }
30 | 


--------------------------------------------------------------------------------
/man/scMultiome.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/scMultiome.R
 3 | \name{scMultiome}
 4 | \alias{scMultiome}
 5 | \title{Single-cell Multiome ATAC + Gene Expression}
 6 | \usage{
 7 | scMultiome(
 8 |   DataType = "pbmc_10x",
 9 |   modes = "*",
10 |   version = "1.0.0",
11 |   format = c("MTX", "HDF5"),
12 |   dry.run = TRUE,
13 |   verbose = TRUE,
14 |   ...
15 | )
16 | }
17 | \arguments{
18 | \item{DataType}{\code{character(1)} Indicates study that produces this type of
19 | data (default: 'pbmc_10x')}
20 | 
21 | \item{modes}{\code{character()} A wildcard / glob pattern of modes, such as
22 | \code{"acc*"}. A wildcard of \code{"*"} will return all modes including
23 | Chromatin Accessibility ("acc"), Methylation ("met"), RNA-seq ("rna")
24 | which is the default.}
25 | 
26 | \item{version}{\code{character(1)} Either version '1.0.0' or '2.0.0' depending on
27 | data version required (default '1.0.0'). See version section.}
28 | 
29 | \item{format}{\code{character(1)} Either MTX or HDF5 data format (default MTX)}
30 | 
31 | \item{dry.run}{\code{logical(1)} Whether to return the dataset names before actual
32 | download (default \code{TRUE})}
33 | 
34 | \item{verbose}{\code{logical(1)} Whether to show the dataset currently being
35 | (down)loaded (default \code{TRUE})}
36 | 
37 | \item{...}{Additional arguments passed on to the
38 | \link[ExperimentHub]{ExperimentHub-class} constructor}
39 | }
40 | \value{
41 | A 10X PBMC \code{MultiAssayExperiment} object
42 | }
43 | \description{
44 | 10x Genomics Multiome technology enables simultaneous profiling
45 | of the transcriptome (using 3’ gene expression) and epigenome
46 | (using ATAC-seq) from single cells to
47 | deepen our understanding of how genes are expressed and regulated across
48 | different cell types. Data prepared by Ricard Argelaguet.
49 | }
50 | \details{
51 | Users are able to choose from either an \code{MTX} or \code{HDF5} file format
52 | as the internal data representation. The \code{MTX} (Matrix Market) format
53 | allows users to load a sparse \code{dgCMatrix} representation. Choosing \code{HDF5}
54 | gives users a sparse \code{HDF5Array} class object.
55 | \itemize{\item pbmc_10x: 10K Peripheral Blood Mononuclear Cells provided by
56 | \href{https://support.10xgenomics.com/single-cell-multiome-atac-gex/datasets}{10x Genomics website}.
57 | Cell quality control filters are available in the object \code{colData}
58 | together with the \code{celltype} annotation labels.}
59 | }
60 | \examples{
61 | 
62 | scMultiome(DataType = "pbmc_10x", modes = "*", dry.run = TRUE)
63 | 
64 | }
65 | 


--------------------------------------------------------------------------------
/man/scNMT.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/scNMT.R
  3 | \name{scNMT}
  4 | \alias{scNMT}
  5 | \title{Single-cell Nucleosome, Methylation and Transcription sequencing}
  6 | \source{
  7 | \url{http://ftp.ebi.ac.uk/pub/databases/scnmt_gastrulation/}
  8 | }
  9 | \usage{
 10 | scNMT(
 11 |   DataType = "mouse_gastrulation",
 12 |   modes = "*",
 13 |   version = "1.0.0",
 14 |   dry.run = TRUE,
 15 |   verbose = TRUE,
 16 |   ...
 17 | )
 18 | }
 19 | \arguments{
 20 | \item{DataType}{\code{character(1)} Indicates study that produces this type of
 21 | data (default: 'mouse_gastrulation')}
 22 | 
 23 | \item{modes}{\code{character()} A wildcard / glob pattern of modes, such as
 24 | \code{"acc*"}. A wildcard of \code{"*"} will return all modes including
 25 | Chromatin Accessibility ("acc"), Methylation ("met"), RNA-seq ("rna")
 26 | which is the default.}
 27 | 
 28 | \item{version}{\code{character(1)} Either version '1.0.0' or '2.0.0' depending on
 29 | data version required (default '1.0.0'). See version section.}
 30 | 
 31 | \item{dry.run}{\code{logical(1)} Whether to return the dataset names before actual
 32 | download (default \code{TRUE})}
 33 | 
 34 | \item{verbose}{\code{logical(1)} Whether to show the dataset currently being
 35 | (down)loaded (default \code{TRUE})}
 36 | 
 37 | \item{...}{Additional arguments passed on to the
 38 | \link[ExperimentHub]{ExperimentHub-class} constructor}
 39 | }
 40 | \value{
 41 | A single cell multi-modal
 42 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}}
 43 | or informative \code{data.frame} when \code{dry.run} is \code{TRUE}
 44 | }
 45 | \description{
 46 | scNMT assembles data on-the-fly from \code{ExperimentHub} to provide
 47 | a
 48 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}}
 49 | container. The \code{DataType} argument provides access to the
 50 | \code{mouse_gastrulation} dataset as obtained from Argelaguet et al. (2019; DOI:
 51 | 10.1038/s41586-019-1825-8). Pre-processing code can be seen at
 52 | \url{https://github.com/rargelaguet/scnmt_gastrulation}. Protocol
 53 | information for this dataset is available at Clark et al. (2018). See the
 54 | vignette for the full citation.
 55 | }
 56 | \details{
 57 | scNMT is a combination of RNA-seq (transcriptome) and an adaptation
 58 | of Nucleosome Occupancy and Methylation sequencing (NOMe-seq, the
 59 | methylome and chromatin accessibility) technologies. For more
 60 | information, see Reik et al. (2018) DOI: 10.1038/s41467-018-03149-4
 61 | \itemize{
 62 | \item mouse_gastrulation - this dataset provides cell quality control filters in
 63 | the object \code{colData} starting from version 2.0.0. Additionally, cell type
 64 | annotations are provided through the \code{lineage} \code{colData} column.
 65 | \itemize{
 66 | \item rna - RNA-seq
 67 | \item acc_\* - chromatin accessibility
 68 | \item met_\* - DNA methylation
 69 | \itemize{
 70 | \item cgi - CpG islands
 71 | \item CTCF - footprints of CTCF binding
 72 | \item DHS - DNase Hypersensitive Sites
 73 | \item genebody - gene bodies
 74 | \item p300 - p300 binding sites
 75 | \item promoter - gene promoters
 76 | }
 77 | }
 78 | }
 79 | 
 80 | Special thanks to Al J Abadi for preparing the published data in time
 81 | for the 2020 BIRS Workshop, see the link here:
 82 | \url{https://github.com/BIRSBiointegration/Hackathon/tree/master/scNMT-seq}
 83 | }
 84 | \section{versions}{
 85 | 
 86 | Version '1.0.0' of the scNMT mouse_gastrulation dataset includes all of
 87 | the above mentioned assay technologies with filtering of cells based on
 88 | quality control metrics. Version '2.0.0' contains all of the cells
 89 | without the QC filter and does not contain CTCF binding footprints or
 90 | p300 binding sites.
 91 | }
 92 | 
 93 | \section{metadata}{
 94 | 
 95 | The \code{MultiAssayExperiment} metadata includes the original function call,
 96 | which also records the data version requested.
 97 | }
 98 | 
 99 | \examples{
100 | 
101 | scNMT(DataType = "mouse_gastrulation", modes = "*",
102 |     version = "1.0.0", dry.run = TRUE)
103 | 
104 | }
105 | \references{
106 | Argelaguet et al. (2019)
107 | }
108 | \seealso{
109 | SingleCellMultiModal-package
110 | }
111 | 


--------------------------------------------------------------------------------
/man/scmmCache.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/cache.R
 3 | \name{scmmCache}
 4 | \alias{scmmCache}
 5 | \alias{setCache}
 6 | \alias{removeCache}
 7 | \title{Manage cache / download directories for study data}
 8 | \usage{
 9 | scmmCache(...)
10 | 
11 | setCache(
12 |   directory = tools::R_user_dir("SingleCellMultiModal", "cache"),
13 |   verbose = TRUE,
14 |   ask = interactive()
15 | )
16 | 
17 | removeCache(accession)
18 | }
19 | \arguments{
20 | \item{...}{For \code{scmmCache}, arguments passed to \code{setCache}}
21 | 
22 | \item{directory}{\code{character(1)} The file location where the cache is located.
23 | Once set, future downloads will go to this folder. See \code{setCache} section
24 | for details.}
25 | 
26 | \item{verbose}{Whether to print descriptive messages}
27 | 
28 | \item{ask}{\code{logical(1)} (default TRUE when \code{interactive()}) Confirm the file
29 | location of the cache directory}
30 | 
31 | \item{accession}{\code{character(1)} A single string indicating the accession number
32 | of the study}
33 | }
34 | \value{
35 | The directory / option of the cache location
36 | }
37 | \description{
38 | Managing data downloads is important to save disk space and avoid
39 | re-downloading data files. This can be done effortlessly via the integrated
40 | \code{BiocFileCache} system.
41 | }
42 | \section{scmmCache}{
43 | 
44 | Get the directory location of the cache. It will prompt the user to create
45 | a cache if not already created. A specific directory can be used via
46 | \code{setCache}.
47 | }
48 | 
49 | \section{setCache}{
50 | 
51 | Specify the directory location of the data cache. By default, it will
52 | go into the user's home and package name directory as given by
53 | \link[tools:userdir]{R_user_dir} (default: varies by system e.g., for Linux:
54 | '$HOME/.cache/R/SingleCellMultiModal').
55 | }
56 | 
57 | \section{removeCache}{
58 | 
59 | Some files may become corrupt when downloading, this function allows
60 | the user to delete the tarball associated with a study number in the
61 | cache.
62 | }
63 | 
64 | \examples{
65 | getOption("scmmCache")
66 | scmmCache()
67 | 
68 | }
69 | 


--------------------------------------------------------------------------------
/man/seqFISH.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/seqFISH.R
 3 | \name{seqFISH}
 4 | \alias{seqFISH}
 5 | \title{Single-cell spatial + Gene Expression}
 6 | \usage{
 7 | seqFISH(
 8 |   DataType = "mouse_visual_cortex",
 9 |   modes = "*",
10 |   version,
11 |   dry.run = TRUE,
12 |   verbose = TRUE,
13 |   ...
14 | )
15 | }
16 | \arguments{
17 | \item{DataType}{\code{character(1)} indicating the identifier of the dataset to
18 | retrieve.  (default "mouse_visual_cortex")}
19 | 
20 | \item{modes}{\code{character()} The assay types or modes of data to obtain these
21 | include seq-FISH and scRNA-seq data by default.}
22 | 
23 | \item{version}{\code{character(1)} Either version '1.0.0' or '2.0.0' depending on
24 | data version required (default '1.0.0'). See version section.}
25 | 
26 | \item{dry.run}{\code{logical(1)} Whether to return the dataset names before actual
27 | download (default \code{TRUE})}
28 | 
29 | \item{verbose}{\code{logical(1)} Whether to show the dataset currently being
30 | (down)loaded (default \code{TRUE})}
31 | 
32 | \item{...}{Additional arguments passed on to the
33 | \link[ExperimentHub]{ExperimentHub-class} constructor}
34 | }
35 | \value{
36 | A
37 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}}
38 | of seq-FISH data
39 | }
40 | \description{
41 | seqFISH function assembles data on-the-fly from \code{ExperimentHub}
42 | to provide a
43 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}}
44 | container. Currently, the \code{DataType} argument provides access to the
45 | available datasets associated to the package.
46 | }
47 | \details{
48 | seq-FISH data are a combination of single cell spatial coordinates
49 | and transcriptomics for a few hundred genes. seq-FISH data can be
50 | combined for example with scRNA-seq data to unveil multiple aspects of
51 | cellular behaviour based on their spatial organization and transcription.
52 | 
53 | Available datasets are:
54 | \itemize{
55 | \item mouse_visual_cortex: combination of seq-FISH data as obtained from Zhu
56 | et al. (2018) and scRNA-seq data as obtained from Tasic et al. (2016),
57 | Version 1.0.0 returns the full scRNA-seq data matrix, while version 2.0.0
58 | returns the processed and subsetted scRNA-seq data matrix (produced for
59 | the Mathematical Frameworks for Integrative Analysis of Emerging
60 | Biological Data Types 2020 Workshop) The returned seqFISH data are always
61 | the processed ones for the same workshop. Additionally, cell types
62 | annotations are available in the \code{colData} through the \code{class} column in
63 | the seqFISH \code{assay}.
64 | \itemize{
65 | \item scRNA_Counts - Tasic scRNA-seq gene count matrix
66 | \item scRNA_Labels - Tasic scRNA-seq cell labels
67 | \item seqFISH_Coordinates - Zhu seq-FISH spatial coordinates
68 | \item seqFISH_Counts - Zhu seq-FISH gene counts matrix
69 | \item seqFISH_Labels - Zhu seq-FISH cell labels
70 | }
71 | }
72 | }
73 | \examples{
74 | 
75 | seqFISH(DataType = "mouse_visual_cortex", modes = "*", version = "2.0.0",
76 |     dry.run = TRUE)
77 | 
78 | }
79 | \author{
80 | Dario Righelli <dario.righelli \if{html}{\out{<at>}} gmail.com>
81 | }
82 | 


--------------------------------------------------------------------------------
/vignettes/CITEseq.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "CITEseq Cord Blood"
  3 | author: "Dario Righelli"
  4 | date: "`r format(Sys.time(), '%d %B, %Y')`"
  5 | output:
  6 |     BiocStyle::html_document:
  7 |       toc_float: true
  8 | vignette: >
  9 |     %\VignetteIndexEntry{CITEseq Cord Blood}
 10 |     %\VignetteEncoding{UTF-8}
 11 |     %\VignetteEngine{knitr::rmarkdown}
 12 | package: SingleCellMultiModal
 13 | bibliography: ../inst/REFERENCES.bib
 14 | editor_options:
 15 |   chunk_output_type: console
 16 | ---
 17 | 
 18 | # Installation 
 19 | 
 20 | ```{r,eval=FALSE}
 21 | if (!requireNamespace("BiocManager", quietly = TRUE))
 22 |     install.packages("BiocManager")
 23 | 
 24 | BiocManager::install("SingleCellMultiModal")
 25 | ```
 26 | 
 27 | 
 28 | # Load libraries
 29 | 
 30 | ```{r, include=TRUE, results="hide", message=FALSE, warning=FALSE}
 31 | 
 32 | library(MultiAssayExperiment)
 33 | library(SingleCellMultiModal)
 34 | library(SingleCellExperiment)
 35 | ```
 36 | 
 37 | 
 38 | # CITE-seq dataset
 39 | 
 40 | CITE-seq data are a combination of two data types extracted at the same
 41 | time from the same cell. The first data type is scRNA-seq data, while the second
 42 | one consists of about a hundred antibody-derived tags (ADT).
 43 | In particular this dataset is provided by @stoeckius2017simultaneous.
 44 | 
 45 | ## Downloading datasets
 46 | 
 47 | The user can see the available dataset by using the default options
 48 | 
 49 | ```{r}
 50 | 
 51 | CITEseq(DataType="cord_blood", modes="*", dry.run=TRUE, version="1.0.0")
 52 | 
 53 | ```
 54 | 
 55 | Or simply by setting `dry.run = FALSE` it downloads the data and creates the
 56 | `MultiAssayExperiment` object.
 57 | 
 58 | In this example, we will use one of the two available datasets `scADT_Counts`:
 59 | 
 60 | ```{r,message=FALSE}
 61 | 
 62 | mae <- CITEseq(
 63 |     DataType="cord_blood", modes="*", dry.run=FALSE, version="1.0.0"
 64 | )
 65 | 
 66 | mae
 67 | ```
 68 | 
 69 | Example with actual data:
 70 | 
 71 | ```{r}
 72 | experiments(mae)
 73 | ```
 74 | 
 75 | 
 76 | ## Exploring the data structure
 77 | 
 78 | Check row annotations:
 79 | 
 80 | ```{r}
 81 | rownames(mae)
 82 | ```
 83 | 
 84 | Take a peek at the `sampleMap`:
 85 | 
 86 | ```{r}
 87 | sampleMap(mae)
 88 | ```
 89 | 
 90 | 
 91 | ## scRNA-seq data
 92 | 
 93 | The scRNA-seq data are accessible with the name `scRNAseq`, which returns a
 94 | *matrix* object.
 95 | 
 96 | ```{r}
 97 | head(experiments(mae)$scRNAseq)[, 1:4]
 98 | ```
 99 | 
100 | ## scADT data
101 | 
102 | The scADT data are accessible with the name `scADT`, which returns a
103 | **matrix** object.
104 | 
105 | ```{r}
106 | head(experiments(mae)$scADT)[, 1:4]
107 | ```
108 | 
109 | # SingleCellExperiment object conversion
110 | 
111 | Because some widely used methodologies (such as those
112 | in the [SingleCellExperiment vignette][1] or the [CiteFuse vignette][2]) rely on the
113 | `SingleCellExperiment` object for CITE-seq data,
114 | we provide a function for the conversion of our CITE-seq `MultiAssayExperiment`
115 | object into a `SingleCellExperiment` object with scRNA-seq data as counts and
116 | scADT data as `altExp`s.
117 | 
118 | [1]: https://www.bioconductor.org/packages/release/bioc/vignettes/SingleCellExperiment/inst/doc/intro.html#5_adding_alternative_feature_sets
119 | [2]: http://www.bioconductor.org/packages/release/bioc/vignettes/CiteFuse/inst/doc/CiteFuse.html
120 | 
121 | ```{r message=FALSE}
122 | sce <- CITEseq(DataType="cord_blood", modes="*", dry.run=FALSE, version="1.0.0",
123 |               DataClass="SingleCellExperiment")
124 | sce
125 | ```
126 | 
127 | # Session Info
128 | 
129 | ```{r, tidy=TRUE}
130 | sessionInfo()
131 | ```
132 | 
133 | # References
134 | 
135 | 


--------------------------------------------------------------------------------
/vignettes/ECCITEseq.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "ECCITEseq Peripheral Blood"
  3 | author: "Dario Righelli"
  4 | date: "`r format(Sys.time(), '%d %B, %Y')`"
  5 | output:
  6 |     BiocStyle::html_document:
  7 |       toc_float: true
  8 | vignette: >
  9 |     %\VignetteIndexEntry{ECCITEseq Peripheral Blood}
 10 |     %\VignetteEncoding{UTF-8}
 11 |     %\VignetteEngine{knitr::rmarkdown}
 12 | package: SingleCellMultiModal
 13 | bibliography: ../inst/REFERENCES.bib
 14 | editor_options:
 15 |   chunk_output_type: console
 16 | ---
 17 | 
 18 | # Installation 
 19 | 
 20 | ```{r,eval=FALSE}
 21 | if (!requireNamespace("BiocManager", quietly = TRUE))
 22 |     install.packages("BiocManager")
 23 | 
 24 | BiocManager::install("SingleCellMultiModal")
 25 | ```
 26 | 
 27 | 
 28 | # Load libraries
 29 | 
 30 | ```{r, include=TRUE, results="hide", message=FALSE, warning=FALSE}
 31 | 
 32 | library(MultiAssayExperiment)
 33 | library(SingleCellMultiModal)
 34 | library(SingleCellExperiment)
 35 | 
 36 | ```
 37 | 
 38 | 
 39 | # ECCITE-seq dataset
 40 | 
 41 | ECCITE-seq data are an evolution of the CITE-seq data 
 42 | (see also [CITE-seq vignette](CITEseq.html) for more details)
 43 | by extending the CITE-seq original data types with a third one always extracted
 44 | from the same cell.
 45 | Indeed, in addition to the CITE-seq providing scRNA-seq and antibody-derived tags
 46 | (ADT), it provides around ten Hashtagged Oligo (HTO).
 47 | In particular this dataset is provided by @mimitou2019multiplexed.
 48 | 
 49 | ## Downloading datasets
 50 | 
 51 | The user can see the available dataset by using the default options through the
 52 | CITE-seq function.
 53 | 
 54 | ```{r}
 55 | 
 56 | CITEseq(DataType="peripheral_blood", modes="*", dry.run=TRUE, version="1.0.0")
 57 | 
 58 | ```
 59 | 
 60 | Or simply by setting `dry.run = FALSE` it downloads the data and by default 
 61 | creates the `MultiAssayExperiment` object.
 62 | 
 63 | In this example, we will use one of the two available datasets `scADT_Counts`:
 64 | 
 65 | ```{r message=FALSE}
 66 | 
 67 | mae <- CITEseq(DataType="peripheral_blood", modes="*", dry.run=FALSE, version="1.0.0")
 68 | mae
 69 | ```
 70 | 
 71 | Example with actual data:
 72 | 
 73 | ```{r}
 74 | experiments(mae)
 75 | ```
 76 | 
 77 | Additionally, we stored the sgRNAs CRISPR perturbation data in the object metadata (see the sgRNAs section below).
 78 | 
 79 | ## Exploring the data structure
 80 | 
 81 | Check row annotations:
 82 | 
 83 | ```{r}
 84 | rownames(mae)
 85 | ```
 86 | 
 87 | Take a peek at the `sampleMap`:
 88 | 
 89 | ```{r}
 90 | sampleMap(mae)
 91 | ```
 92 | 
 93 | 
 94 | ## scRNA-seq data
 95 | 
 96 | The scRNA-seq data are accessible with the name `scRNA`, which returns a
 97 | *matrix* object.
 98 | 
 99 | ```{r}
100 | head(experiments(mae)$scRNA)[, 1:4]
101 | ```
102 | 
103 | ## scADT data
104 | 
105 | The scADT data are accessible with the name `scADT`, which returns a
106 | **matrix** object.
107 | 
108 | ```{r}
109 | head(experiments(mae)$scADT)[, 1:4]
110 | ```
111 | 
112 | ## CTCL/CTRL conditions
113 | 
114 | The dataset has two different conditions (CTCL and CTRL), whose samples can be identified with the `colData` accessor.
115 | 
116 | CTCL stands for cutaneous T-cell lymphoma while CTRL for control.
117 | 
118 | For example, if we want only the CTCL samples, we can run:
119 | 
120 | ```{r}
121 | (ctclMae <- mae[,colData(mae)$condition == "CTCL",])
122 | ```
123 | 
124 | And if you're interested in the common samples across all the modalities
125 | you can use the `complete.cases` function.
126 | 
127 | ```{r}
128 | ctclMae[,complete.cases(ctclMae),]
129 | ```
130 | 
131 |  
132 | ## sgRNAs CRISPR perturbation data
133 | 
134 | The CRISPR-perturbed sgRNAs data are stored in a different spot
135 | to keep their original long format.
136 | 
137 | They can be accessed with the `metadata` accessor, which in this case returns a named `list` of `data.frame`s.
138 | 
139 | ```{r}
140 | sgRNAs <- metadata(mae)
141 | names(sgRNAs)
142 | ```
143 | 
144 | There are four different sgRNAs datasets, one for each combination of condition and receptor family.
145 | 
146 | TCR stands for T-Cell Receptor, while a,b,g,d stand for alpha, beta, gamma and delta respectively.
147 | 
148 | To look into the TCRab, simply run:
149 | 
150 | ```{r}
151 | head(sgRNAs$CTCL_TCRab)
152 | ```
153 | 
154 | # SingleCellExperiment object conversion
155 | 
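156 | Because some widely used methodologies (such as those
157 | in the [SingleCellExperiment vignette](https://www.bioconductor.org/packages/release/bioc/vignettes/SingleCellExperiment/inst/doc/intro.html#5_adding_alternative_feature_sets) or the [CiteFuse vignette](http://www.bioconductor.org/packages/release/bioc/vignettes/CiteFuse/inst/doc/CiteFuse.html)) rely on the
158 | `SingleCellExperiment` object for CITE-seq data,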
159 | we provide a function for the conversion of our CITE-seq `MultiAssayExperiment`
160 | object into a `SingleCellExperiment` object with scRNA-seq data as counts and
161 | scADT data as `altExp`s.
162 | 
163 | 
164 | ```{r message=FALSE}
165 | sce <- CITEseq(DataType="peripheral_blood", modes="*", dry.run=FALSE, 
166 |                version="1.0.0", DataClass="SingleCellExperiment")
167 | sce
168 | ```
169 | 
170 | # Session Info
171 | 
172 | ```{r, tidy=TRUE}
173 | sessionInfo()
174 | ```
175 | 
176 | # Additional References
177 | 
178 | - <https://www.bioconductor.org/packages/release/bioc/vignettes/SingleCellExperiment/inst/doc/intro.html#5_adding_alternative_feature_sets>
179 | - <http://www.bioconductor.org/packages/release/bioc/vignettes/CiteFuse/inst/doc/CiteFuse.html>
180 | 
181 | # References
182 | 


--------------------------------------------------------------------------------
/vignettes/GTseq.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "G&T-seq Mouse Embryo (8-cell stage)"
 3 | date: "`r BiocStyle::doc_date()`"
 4 | vignette: |
 5 |   %\VignetteIndexEntry{GT-seq Mouse Embryo}
 6 |   %\VignetteEngine{knitr::rmarkdown}
 7 |   %\VignetteEncoding{UTF-8}
 8 | output:
 9 |     BiocStyle::html_document:
10 |       toc_float: true
11 | package: SingleCellMultiModal
12 | bibliography: ../inst/REFERENCES.bib
13 | ---
14 | 
15 | # Installation
16 | 
17 | ```{r,eval=FALSE}
18 | if (!requireNamespace("BiocManager", quietly = TRUE))
19 |     install.packages("BiocManager")
20 | 
21 | BiocManager::install("SingleCellMultiModal")
22 | ```
23 | 
24 | ## Load
25 | 
26 | ```{r,include=TRUE,results="hide",message=FALSE,warning=FALSE}
27 | library(SingleCellMultiModal)
28 | library(MultiAssayExperiment)
29 | ```
30 | 
31 | # G&T-seq: parallel sequencing data of single-cell genomes and transcriptomes
32 | 
33 | G&T-seq is a combination of Picoplex amplified gDNA sequencing (genome) and
34 | SMARTSeq2 amplified cDNA sequencing (transcriptome) of the same cell.
35 | For more information, see @Macaulay2015.
36 | 
37 | ## Downloading datasets
38 | 
39 | The user can see the available dataset by using the default options
40 | 
41 | ```{r}
42 | GTseq("mouse_embryo_8_cell", mode = "*", dry.run = TRUE)
43 | ```
44 | 
45 | Or by simply running:
46 | 
47 | ```{r}
48 | GTseq()
49 | ```
50 | 
51 | ## Obtaining the data
52 | 
53 | To obtain the actual datasets:
54 | 
55 | ```{r,message=FALSE}
56 | gts <- GTseq(dry.run = FALSE)
57 | gts
58 | ```
59 | 
60 | ## Exploring the data structure
61 | 
62 | Check available metadata for each of the 112 mouse embryo cells assayed by G&T-seq:
63 | 
64 | ```{r}
65 | colData(gts)
66 | ```
67 | 
68 | Take a peek at the `sampleMap`:
69 | 
70 | ```{r}
71 | sampleMap(gts)
72 | ```
73 | 
74 | ## Copy numbers
75 | 
76 | To access the integer copy numbers as detected from scDNA-seq:
77 | 
78 | ```{r}
79 | head(assay(gts, "genomic"))[, 1:4]
80 | ```
81 | 
82 | ## RNA-seq
83 | 
84 | To access raw read counts as quantified from scRNA-seq:
85 | 
86 | ```{r}
87 | head(assay(gts, "transcriptomic"))[, 1:4]
88 | ```
89 | 
90 | For protocol information, see @Macaulay2016.
91 | 
92 | # sessionInfo
93 | 
94 | ```{r}
95 | sessionInfo()
96 | ```
97 | 
98 | # References
99 | 


--------------------------------------------------------------------------------
/vignettes/SCoPE2.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "SCoPE2: macrophage vs monocytes"
  3 | date: "`r BiocStyle::doc_date()`"
  4 | vignette: |
  5 |     %\VignetteIndexEntry{SCoPE2: macrophage vs monocytes}
  6 |     %\VignetteEngine{knitr::rmarkdown}
  7 |     %\VignetteEncoding{UTF-8}
  8 | output:
  9 |     BiocStyle::html_document:
 10 |         toc_float: true
 11 | package: SingleCellMultiModal
 12 | bibliography: ../inst/REFERENCES.bib
 13 | ---
 14 | 
 15 | This vignette will guide you through accessing and manipulating
 16 | the SCoPE2 data sets available from the `SingleCellMultiModal` package.
 17 | 
 18 | # Installation
 19 | 
 20 | ```{r,eval=FALSE}
 21 | if (!requireNamespace("BiocManager", quietly = TRUE))
 22 |     install.packages("BiocManager")
 23 | BiocManager::install("SingleCellMultiModal")
 24 | ```
 25 | 
 26 | ## Load packages
 27 | 
 28 | ```{r,include=TRUE,results="hide",message=FALSE,warning=FALSE}
 29 | library(SingleCellMultiModal)
 30 | library(MultiAssayExperiment)
 31 | ```
 32 | 
 33 | # SCoPE2
 34 | 
 35 | SCoPE2 is a mass spectrometry (MS)-based single-cell proteomics
 36 | protocol to quantify the proteome of single-cells in an untargeted
 37 | fashion. It was initially developed by @Specht2021-pm.
 38 | 
 39 | ## Downloading data sets
 40 | 
 41 | The user can see the available data set by using the default options.
 42 | 
 43 | ```{r}
 44 | SCoPE2("macrophage_differentiation",
 45 |        modes = "*",
 46 |        version = "1.0.0",
 47 |        dry.run = TRUE)  # dry run: only lists the available resources
 48 | ```
 49 | 
 50 | Or by simply running:
 51 | 
 52 | ```{r}
 53 | SCoPE2("macrophage_differentiation")
 54 | ```
 55 | 
 56 | ## Available projects
 57 | 
 58 | Currently, only the `macrophage_differentiation` is available.
 59 | 
 60 | ## Retrieving data
 61 | 
 62 | You can retrieve the actual data from `ExperimentHub` by setting
 63 | `dry.run = FALSE`. This example retrieves the complete data set
 64 | (transcriptome and proteome) for the `macrophage_differentiation`
 65 | project:
 66 | 
 67 | ```{r,message=FALSE}
 68 | scope2 <- SCoPE2("macrophage_differentiation",
 69 |                  modes = "rna|protein",
 70 |                  dry.run = FALSE)
 71 | scope2
 72 | ```
 73 | 
 74 | # The macrophage differentiation project
 75 | 
 76 | This data set has been acquired by the Slavov Lab (@Specht2021-pm).
 77 | It contains single-cell proteomics and single-cell
 78 | RNA sequencing data for macrophages and monocytes. The objective of the
 79 | research that generated the data was to understand whether
 80 | homogeneous monocytes differentiate, in the absence of cytokines, into
 81 | macrophages with homogeneous or heterogeneous profiles. The transcriptomic and
 82 | proteomic acquisitions were conducted on two separate subsets of similar
 83 | cells (same experimental design). The cell types of the samples are known only
 84 | for the **proteomics** data. The proteomics data was retrieved from
 85 | the authors' [website](https://scope2.slavovlab.net/docs/data) and the
 86 | transcriptomic data was retrieved from the GEO database (accession id:
 87 | [GSE142392](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE142392)).
 88 | 
 89 | For more information on the protocol, see @Specht2021-pm.
 90 | 
 91 | ## Data versions
 92 | 
 93 | Only version `1.0.0` is currently available.
 94 | 
 95 | The `macrophage_differentiation` data set in this package contains two
 96 | assays: `rna` and `protein`.
 97 | 
 98 | ### Cell annotation
 99 | 
100 | The single-cell proteomics data contains cell type annotation
101 | (`celltype`), sample preparation batch (`batch_digest` and
102 | `batch_sort`), chromatography batch (`batch_chromatography`), and the
103 | MS acquisition run (`batch_MS`). The single-cell transcriptomics data
104 | was acquired in two batches (`batch_Chromium`). Note that because the
105 | cells that compose the two assays are distinct, there is no common
106 | cell annotation available for both proteomics and transcriptomics. The
107 | annotations were therefore filled with `NA`s accordingly.
108 | 
109 | ```{r}
110 | colData(scope2)
111 | ```
112 | 
113 | ### Transcriptomic data
114 | 
115 | You can extract and check the transcriptomic data through subsetting:
116 | 
117 | ```{r}
118 | scope2[["rna"]]
119 | ```
120 | 
121 | The data is rather large and is therefore stored on-disk using the
122 | HDF5 backend. You can verify this by looking at the assay data matrix.
123 | Note that the counts are UMI counts.
124 | 
125 | ```{r}
126 | assay(scope2[["rna"]])[1:5, 1:5]
127 | ```
128 | 
129 | ### Proteomic data
130 | 
131 | The `protein` assay contains MS-based proteomic data.
132 | The data have passed sample and feature quality control,
133 | normalized, log transformed, imputed and batch corrected. Detailed
134 | information about the data processing is available in
135 | [another vignette](https://uclouvain-cbio.github.io/SCP.replication/articles/SCoPE2.html). You can extract the proteomic data similarly to the
136 | transcriptomic data:
137 | 
138 | ```{r}
139 | scope2[["protein"]]
140 | ```
141 | 
142 | In this case, the protein data have reasonable size and are loaded
143 | directly into memory. The data matrix is stored in `logexprs`. We
144 | decided to not use the traditional `logcounts` because MS proteomics
145 | measures intensities rather than counts as opposed to scRNA-Seq.
146 | 
147 | ```{r}
148 | assay(scope2[["protein"]])[1:5, 1:5]
149 | ```
150 | 
151 | # sessionInfo
152 | 
153 | ```{r}
154 | sessionInfo()
155 | ```
156 | 
157 | # References
158 | 


--------------------------------------------------------------------------------
/vignettes/SingleCellMultiModal.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "SingleCellMultiModal Introduction"
 3 | date: "`r BiocStyle::doc_date()`"
 4 | vignette: |
 5 |   %\VignetteIndexEntry{SingleCellMultiModal Introduction}
 6 |   %\VignetteEngine{knitr::rmarkdown}
 7 |   %\VignetteEncoding{UTF-8}
 8 | output:
 9 |     BiocStyle::html_document:
10 |       toc_float: true
11 | package: SingleCellMultiModal
12 | bibliography: ../inst/REFERENCES.bib
13 | ---
14 | 
15 | # SingleCellMultiModal
16 | 
17 | ## Overview 
18 | 
19 | `SingleCellMultiModal` is an R package that provides a convenient and
20 | user-friendly representation of multi-modal data using `MultiAssayExperiment`.
21 | This package introduces a suite of single-cell multimodal landmark datasets for
22 | benchmarking and testing multimodal analysis methods via the `ExperimentHub`
23 | Bioconductor package. The scope of this package is to provide efficient access
24 | to a selection of curated, pre-integrated, publicly available landmark datasets
25 | for methods development and benchmarking.
26 | 
27 | ## Installation
28 | 
29 | ```{r,eval=FALSE}
30 | if (!requireNamespace("BiocManager", quietly = TRUE))
31 |     install.packages("BiocManager")
32 | 
33 | BiocManager::install("SingleCellMultiModal")
34 | ```
35 | 
36 | ## Loading packages
37 | 
38 | ```{r,include=TRUE,results="hide",message=FALSE,warning=FALSE}
39 | library(SingleCellMultiModal)
40 | library(MultiAssayExperiment)
41 | ```
42 | 
43 | # Citing SingleCellMultiModal
44 | 
45 | Your citations are crucial in keeping our software free and open source. To
46 | cite our package, see the citation (@Eckenrode2023-yq) in the References
47 | section. You may also browse to the publication at
48 | [PLoS Computational Biology][1].
49 | 
50 | [1]: https://doi.org/10.1371/journal.pcbi.1011324
51 | 
52 | ## Representation
53 | 
54 | Users can obtain integrative representations of multiple modalities as a
55 | `MultiAssayExperiment`, a common core Bioconductor data structure relied on by
56 | dozens of multimodal data analysis packages. `MultiAssayExperiment` harmonizes
57 | data management of multiple experimental assays performed on an overlapping set
58 | of specimens. Although originally developed for patient data from multi-omics
59 | cancer studies, the `MultiAssayExperiment` framework naturally applies also to
60 | single cells. A schematic of the data structure can be seen below. In this
61 | context, "patients" are replaced by "cells". We use `MultiAssayExperiment`
62 | because it provides a familiar user experience by extending
63 | `SummarizedExperiment` concepts and providing open ended compatibility with
64 | standard data classes present in Bioconductor such as the
65 | `SingleCellExperiment`.
66 | 
67 | ```{r,echo=FALSE}
68 | imgurl <- paste0(
69 |     "https://github.com/waldronlab/MultiAssayExperiment/blob/",
70 |     "c3c59a094e5a08111ee98b9f69579db5634d9fd4/vignettes/",
71 |     "MultiAssayExperiment.png?raw=true"
72 | )
73 | knitr::include_graphics(
74 |     path = imgurl
75 | )
76 | ```
77 | 
78 | # Contributions
79 | 
80 | Want to contribute to the `SingleCellMultiModal` package? We welcome
81 | contributions from the community. Please refer to our
82 | [Contributing Guidelines][2] for more details.
83 | 
84 | [2]: https://github.com/waldronlab/SingleCellMultiModal/wiki/Contributing-Guidelines
85 | 
86 | 
87 | ## Further resources
88 | 
89 | For more information on the `MultiAssayExperiment` data structure, please refer
90 | to @Ramos2017-tk as well as the [MultiAssayExperiment vignette][3].
91 | 
92 | [3]: https://bioconductor.org/packages/release/bioc/vignettes/MultiAssayExperiment/inst/doc/MultiAssayExperiment.html
93 | 
94 | # References
95 | 


--------------------------------------------------------------------------------
/vignettes/scMultiome.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "PBMCs profiled with the Chromium Single Cell Multiome ATAC + Gene Expression from 10x"
  3 | date: "`r BiocStyle::doc_date()`"
  4 | vignette: |
  5 |   %\VignetteIndexEntry{scMultiome 10x PBMC}
  6 |   %\VignetteEngine{knitr::rmarkdown}
  7 |   %\VignetteEncoding{UTF-8}
  8 | output:
  9 |     BiocStyle::html_document:
 10 |       toc_float: true
 11 | package: SingleCellMultiModal
 12 | ---
 13 | 
 14 | # Installation
 15 | 
 16 | ```{r,eval=FALSE}
 17 | if (!requireNamespace("BiocManager", quietly = TRUE))
 18 |     install.packages("BiocManager")
 19 | BiocManager::install("SingleCellMultiModal")
 20 | ```
 21 | 
 22 | ## Load
 23 | 
 24 | ```{r,include=TRUE, results="hide", message=FALSE, warning=FALSE}
 25 | library(SingleCellMultiModal)
 26 | library(MultiAssayExperiment)
 27 | library(scran)
 28 | library(scater)
 29 | ```
 30 | 
 31 | # Description
 32 | 
 33 | This data set consists of about 10K Peripheral Blood Mononuclear Cells (PBMCs)
 34 | derived from a single healthy donor. It is available
 35 | [from the 10x Genomics website](https://support.10xgenomics.com/single-cell-multiome-atac-gex/datasets).
 36 | 
 37 | Provided are the RNA expression counts quantified at the gene level and the
 38 | chromatin accessibility levels quantified at the peak level. Here we provide
 39 | the default peaks called by the CellRanger software. If you want to explore
 40 | other peak definitions or chromatin accessibility quantifications (at the
 41 | promoter level, etc.), you have to download the `fragments.tsv.gz` file from the
 42 | 10x Genomics website.
 43 | 
 44 | # Downloading datasets
 45 | 
 46 | The user can see the available dataset by using the default options. Here, the data are downloaded directly by setting `dry.run = FALSE`:
 47 | 
 48 | ```{r}
 49 | mae <- scMultiome("pbmc_10x", modes = "*", dry.run = FALSE, format = "MTX")
 50 | ```
 51 | 
 52 | ```{r, echo=FALSE}
 53 | gg_color_hue <- function(n) {  # n colours with evenly spaced hues at fixed chroma/luminance
 54 |   hues <- seq(15, 375, length.out = n + 1)  # n + 1 points: 15 and 375 are the same hue
 55 |   hcl(h = hues, l = 65, c = 100)[seq_len(n)]  # seq_len() returns nothing for n = 0, unlike 1:n
 56 | }
 57 | colors <- gg_color_hue(length(unique(mae$celltype)))
 58 | names(colors) <- unique(mae$celltype)
 59 | ```
 60 | 
 61 | # Exploring the data structure
 62 | 
 63 | There are two assays: `rna` and `atac`, stored as
 64 | [SingleCellExperiment](http://bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html)
 65 | objects
 66 | 
 67 | ```{r}
 68 | mae
 69 | ```
 70 | 
 71 | where the cells are the same in both assays:
 72 | 
 73 | ```{r}
 74 | upsetSamples(mae)
 75 | ```
 76 | 
 77 | ## Cell metadata
 78 | 
 79 | Columns:
 80 | 
 81 | - **nCount_RNA**: number of read counts
 82 | - **nFeature_RNA**: number of genes with at least one read count
 83 | - **nCount_ATAC**: number of ATAC read counts
 84 | - **nFeature_ATAC**: number of ATAC peaks with at least one read count
 85 | - **celltype**: The cell types have been annotated by the 10x Genomics R&D team using gene markers. They provide a rough characterisation of the cell type diversity, but keep in mind that they are not ground truth labels.
 86 | - **broad_celltype**: `Lymphoid` or `Myeloid` origin
 87 | 
 88 | The cells have not been QC-ed; choosing a minimum number of genes/peaks per
 89 | cell is left to you! In addition, there are further quality control
 90 | criteria that you may want to apply, including mitochondrial coverage, fraction
 91 | of reads overlapping ENCODE Blacklisted regions, Transcription start site
 92 | enrichment, etc. See suggestions below for software that can perform a
 93 | semi-automated quality control pipeline
 94 | 
 95 | ```{r}
 96 | head(colData(mae))
 97 | ```
 98 | 
 99 | ## RNA expression
100 | 
101 | The RNA expression consists of 36,549 genes and 10,032 cells, stored using
102 | the `dgCMatrix` sparse matrix format
103 | 
104 | ```{r}
105 | dim(experiments(mae)[["rna"]])
106 | ```
107 | 
108 | ```{r}
109 | names(experiments(mae))
110 | ```
111 | 
112 | Let's make a standard dimensionality reduction plot:
113 | 
114 | ```{r}
115 | sce.rna <- experiments(mae)[["rna"]]
116 | 
117 | # Normalisation
118 | sce.rna <- logNormCounts(sce.rna)
119 | 
120 | # Feature selection
121 | decomp <- modelGeneVar(sce.rna)
122 | hvgs <- rownames(decomp)[decomp$mean>0.01 & decomp$p.value <= 0.05]
123 | sce.rna <- sce.rna[hvgs,]
124 | 
125 | # PCA
126 | sce.rna <- runPCA(sce.rna, ncomponents = 25)
127 | 
128 | # UMAP
129 | set.seed(42)
130 | sce.rna <- runUMAP(sce.rna, dimred="PCA", n_neighbors = 25, min_dist = 0.3)
131 | plotUMAP(sce.rna, colour_by="celltype", point_size=0.5, point_alpha=1)
132 | ```
133 | 
134 | ## Chromatin Accessibility
135 | 
136 | The ATAC expression consists of 108,344 peaks and 10,032 cells:
137 | 
138 | ```{r}
139 | dim(experiments(mae)[["atac"]])
140 | ```
141 | 
142 | Let's make a standard dimensionality reduction plot. Note that scATAC-seq data is sparser than scRNA-seq, almost binary. The log normalisation + PCA approach that `scater` implements for scRNA-seq is not a good strategy for scATAC-seq data. Topic modelling or TF-IDF + SVD are better strategies. Please see the package recommendations below.
143 | 
144 | ```{r}
145 | sce.atac <- experiments(mae)[["atac"]]
146 | 
147 | # Normalisation
148 | sce.atac <- logNormCounts(sce.atac)
149 | 
150 | # Feature selection
151 | decomp <- modelGeneVar(sce.atac)
152 | hvgs <- rownames(decomp)[decomp$mean>0.25]
153 | sce.atac <- sce.atac[hvgs,]
154 | 
155 | # PCA
156 | sce.atac <- runPCA(sce.atac, ncomponents = 25)
157 | 
158 | # UMAP
159 | set.seed(42)
160 | sce.atac <- runUMAP(sce.atac, dimred="PCA", n_neighbors = 25, min_dist = 0.3)
161 | plotUMAP(sce.atac, colour_by="celltype", point_size=0.5, point_alpha=1)
162 | ```
163 | 
164 | # Suggested software for the downstream analysis
165 | 
166 | These are my personal recommendations of R-based analysis software:
167 | 
168 | -   **RNA expression**: [scater](http://bioconductor.org/packages/release/bioc/html/scater.html), [scran](https://bioconductor.org/packages/release/bioc/html/scran.html)
169 | -   **ATAC accessibility**: [archR](https://www.archrproject.com/), [snapATAC](https://github.com/r3fang/SnapATAC), [cisTopic](https://github.com/aertslab/cisTopic), [Signac](https://satijalab.org/signac), [chromVar](https://bioconductor.org/packages/release/bioc/html/chromVAR.html), [Cicero](https://www.bioconductor.org/packages/release/bioc/html/cicero.html)
170 | -   **Integrative analysis**: [MOFA+](https://biofam.github.io/MOFA2), [Seurat](https://satijalab.org/seurat). Note that both methods have released vignettes in their website where they analysed this same data set.
171 | 
172 | # sessionInfo
173 | 
174 | ```{r}
175 | sessionInfo()
176 | ```
177 | 


--------------------------------------------------------------------------------
/vignettes/scNMT.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "scNMT Mouse Gastrulation"
  3 | date: "`r BiocStyle::doc_date()`"
  4 | vignette: |
  5 |   %\VignetteIndexEntry{scNMT Mouse Gastrulation}
  6 |   %\VignetteEngine{knitr::rmarkdown}
  7 |   %\VignetteEncoding{UTF-8}
  8 | output:
  9 |     BiocStyle::html_document:
 10 |       toc_float: true
 11 | package: SingleCellMultiModal
 12 | bibliography: ../inst/REFERENCES.bib
 13 | ---
 14 | 
 15 | # Installation
 16 | 
 17 | ```{r,eval=FALSE}
 18 | if (!requireNamespace("BiocManager", quietly = TRUE))
 19 |     install.packages("BiocManager")
 20 | 
 21 | BiocManager::install("SingleCellMultiModal")
 22 | ```
 23 | 
 24 | ## Load packages
 25 | 
 26 | ```{r,include=TRUE,results="hide",message=FALSE,warning=FALSE}
 27 | library(SingleCellMultiModal)
 28 | library(MultiAssayExperiment)
 29 | ```
 30 | 
 31 | # scNMT: single-cell nucleosome, methylation and transcription sequencing
 32 | 
 33 | The dataset was graciously provided by @Argelaguet2019-et.
 34 | 
 35 | Scripts used to process the raw data were written and maintained by Argelaguet
 36 | and colleagues and reside on GitHub:
 37 | https://github.com/rargelaguet/scnmt_gastrulation
 38 | 
 39 | For more information on the protocol, see @Clark2018-qg.
 40 | 
 41 | ## Dataset lookup
 42 | 
 43 | The user can see the available datasets by using the `dry.run` argument:
 44 | 
 45 | ```{r}
 46 | scNMT("mouse_gastrulation", mode = "*", version = "1.0.0", dry.run = TRUE)
 47 | ```
 48 | 
 49 | Or by simply running the `scNMT` function with defaults:
 50 | 
 51 | ```{r}
 52 | scNMT("mouse_gastrulation", version = "1.0.0")
 53 | ```
 54 | 
 55 | ## Data versions
 56 | 
 57 | A more recent release of the 'mouse_gastrulation' dataset was provided
 58 | by Argelaguet and colleagues. This dataset includes additional cells that
 59 | did not pass the original quality metrics as imposed for the version `1.0.0`
 60 | dataset.
 61 | 
 62 | Use the `version` argument to indicate the newer dataset version
 63 | (i.e., `2.0.0`):
 64 | 
 65 | ```{r}
 66 | scNMT("mouse_gastrulation", version = '2.0.0', dry.run = TRUE)
 67 | ```
 68 | 
 69 | ## Downloading the data
 70 | 
 71 | To obtain the data, we can use the `mode` argument to indicate specific
 72 | datasets using 'glob' patterns that will match the outputs above. For example,
 73 | if we would like to have all 'genebody' datasets for all available assays,
 74 | we would use `*_genebody` as an input to `mode`.
 75 | 
 76 | ```{r,message=FALSE}
 77 | nmt <- scNMT("mouse_gastrulation", mode = c("*_DHS", "*_cgi", "*_genebody"),
 78 |     version = "1.0.0", dry.run = FALSE)
 79 | nmt
 80 | ```
 81 | 
 82 | ## Checking the cell metadata
 83 | 
 84 | Included in the `colData` `DataFrame` within the `MultiAssayExperiment` class
 85 | are the variables `cellID`, `stage`, `lineage10x_2`, and `stage_lineage`.
 86 | To extract this `DataFrame`, one has to use `colData` on the
 87 | `MultiAssayExperiment` object:
 88 | 
 89 | ```{r}
 90 | colData(nmt)
 91 | ```
 92 | 
 93 | ## Exploring the data structure
 94 | 
 95 | Check row annotations:
 96 | 
 97 | ```{r}
 98 | rownames(nmt)
 99 | ```
100 | 
101 | The `sampleMap` is a graph representation of the relationships between cells
102 | and 'assay' datasets:
103 | 
104 | ```{r}
105 | sampleMap(nmt)
106 | ```
107 | 
108 | Take a look at the cell identifiers or barcodes across assays:
109 | 
110 | ```{r}
111 | colnames(nmt)
112 | ```
113 | 
114 | ## Chromatin Accessibility (acc_*)
115 | 
116 | See the accessibility levels (as proportions) for DNase Hypersensitive Sites:
117 | 
118 | ```{r}
119 | head(assay(nmt, "acc_DHS"))[, 1:4]
120 | ```
121 | 
122 | ## DNA Methylation (met_*)
123 | 
124 | See the methylation percentage / proportion:
125 | 
126 | ```{r}
127 | head(assay(nmt, "met_DHS"))[, 1:4]
128 | ```
129 | 
130 | For protocol information, see the references below.
131 | 
132 | # sessionInfo
133 | 
134 | ```{r}
135 | sessionInfo()
136 | ```
137 | 
138 | # References
139 | 


--------------------------------------------------------------------------------
/vignettes/seqFISH.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "seqFISH Mouse Visual Cortex"
  3 | author: "Dario Righelli"
  4 | date: "`r format(Sys.time(), '%d %B, %Y')`"
  5 | output:
  6 |     BiocStyle::html_document:
  7 |       toc_float: true
  8 | vignette: >
  9 |     %\VignetteIndexEntry{seqFISH Mouse Visual Cortex}
 10 |     %\VignetteEncoding{UTF-8}
 11 |     %\VignetteEngine{knitr::rmarkdown}
 12 | package: SingleCellMultiModal
 13 | bibliography: ../inst/REFERENCES.bib
 14 | editor_options:
 15 |   chunk_output_type: console
 16 | ---
 17 | 
 18 | # Installation
 19 | 
 20 | ```{r,eval=FALSE}
 21 | if (!requireNamespace("BiocManager", quietly = TRUE))
 22 |     install.packages("BiocManager")
 23 | BiocManager::install("SingleCellMultiModal")
 24 | ```
 25 | 
 26 | ## Load packages
 27 | 
 28 | ```{r,include=TRUE, results="hide", message=FALSE, warning=FALSE}
 29 | library(MultiAssayExperiment)
 30 | library(SpatialExperiment)
 31 | library(SingleCellMultiModal)
 32 | ```
 33 | 
 34 | 
 35 | # seq-FISH dataset
 36 | 
 37 | The dataset consists of two data types:
 38 | seq-FISH data was provided by @Zhu2018identification, while scRNA-seq data
 39 | was provided by @Tasic2016adult.
 40 | 
 41 | Data have been retrieved as part of the
 42 | [Hackathon](https://github.com/BIRSBiointegration/Hackathon/tree/master/seqFISH)
 43 | in the
 44 | [Mathematical Frameworks for Integrative Analysis of Emerging Biological Data Types](https://www.birs.ca/events/2020/5-day-workshops/20w5197) workshop.
 45 | 
 46 | ## Downloading datasets
 47 | 
 48 | The user can see the available dataset by using the default options
 49 | 
 50 | ```{r}
 51 | seqFISH(
 52 |     DataType="mouse_visual_cortex", modes="*", dry.run=TRUE, version="2.0.0"
 53 | )
 54 | ```
 55 | 
 56 | To download the data, set `dry.run=FALSE`:
 57 | 
 58 | ```{r}
 59 | seqfish <- seqFISH(
 60 |     DataType="mouse_visual_cortex", modes="*", dry.run=FALSE, version="2.0.0"
 61 | )
 62 | seqfish
 63 | ```
 64 | 
 65 | Extract the list of experiments _without_ the associated colData.
 66 | 
 67 | ```{r}
 68 | experiments(seqfish)
 69 | ```
 70 | 
 71 | ## Exploring the data structure
 72 | 
 73 | Check row annotations for all experiments:
 74 | 
 75 | ```{r}
 76 | rownames(seqfish)
 77 | ```
 78 | 
 79 | Take a peek at the `sampleMap` (graph representation of assays, cells, and
 80 | barcodes):
 81 | 
 82 | ```{r}
 83 | sampleMap(seqfish)
 84 | ```
 85 | 
 86 | ## Visualize matching cell identifiers across assays
 87 | 
 88 | ```{r}
 89 | upsetSamples(seqfish)
 90 | ```
 91 | 
 92 | This shows that about 1597 cells match across both modalities / assays.
 93 | 
 94 | ## scRNA-seq data
 95 | 
 96 | The scRNA-seq data are accessible with `[["scRNAseq"]]`, which returns a
 97 | *SingleCellExperiment* class object, with all its associated methods.
 98 | 
 99 | ```{r}
100 | seqfish[["scRNAseq"]]
101 | ```
102 | 
103 | Otherwise, the `assay` function can be used to access the *scRNAseq* assay
104 | stored in the `seqfish` *MultiAssayExperiment* object.
105 | 
106 | ```{r}
107 | head(assay(seqfish, "scRNAseq"))[,1:4]
108 | ```
109 | 
110 | ## seq-FISH data
111 | 
112 | The seq-FISH data are accessible with `[["seqFISH"]]`, which returns a
113 | **SpatialExperiment** class object.
114 | 
115 | ```{r}
116 | seqfish[["seqFISH"]]
117 | ```
118 | 
119 | Otherwise, the `assay` function can be used to access the *seqFISH* assay
120 | stored in the `seqfish` *MultiAssayExperiment* object.
121 | 
122 | ```{r}
123 | head(assay(seqfish, "seqFISH"))[,1:4]
124 | ```
125 | 
126 | Spatial data can be retrieved with the `spatialData` function on the
127 | *SpatialExperiment* object.
128 | 
129 | ```{r}
130 | (sd <- spatialData(seqfish[["seqFISH"]]))
131 | ```
132 | 
133 | Spatial coordinates within the spatial data can be retrieved in matrix form
134 | with the `spatialCoords` function on the *SpatialExperiment* object.
135 | 
136 | ```{r}
137 | head(sc <- spatialCoords(seqfish[["seqFISH"]]))
138 | ```
139 | 
140 | The column names of the spatial coordinates can be accessed directly with the
141 | `spatialCoordsNames` function.
142 | 
143 | ```{r}
144 | spatialCoordsNames(seqfish[["seqFISH"]])
145 | ```
146 | 
147 | 
148 | ## Other data version
149 | 
150 | The provided seqFISH dataset comes in two different versions:
151 | 
152 | * 1.0.0 - provides the same seqFISH data as shown in the rest of this
153 | vignette, but it returns the full normalized scRNA-seq data matrix (with
154 | labels), as released from the original authors on the GEO database.
155 | * 2.0.0 - provides the same seqFISH data as shown in the rest of this
156 | vignette, but it returns a processed subset of the original scRNA-seq data,
157 | providing only the same genes present in the seqFISH data matrix.
158 | 
159 | ### Data version 1.0.0
160 | 
161 | The full scRNA-seq data matrix is 24057 rows x 1809 columns.
162 | 
163 | To access v1.0.0, simply run:
164 | 
165 | ```{r}
166 | seqFISH(
167 |     DataType="mouse_visual_cortex", modes="*", dry.run=FALSE, version="1.0.0"
168 | )
169 | ```
170 | 
171 | # Session Info
172 | 
173 | ```{r, tidy=TRUE}
174 | sessionInfo()
175 | ```
176 | 
177 | 


--------------------------------------------------------------------------------