├── .Rbuildignore ├── .github └── workflows │ └── pr_check.yml ├── .gitignore ├── CITATION.cff ├── DESCRIPTION ├── NAMESPACE ├── NEWS.md ├── R ├── CITEseq.R ├── GTseq.R ├── SCoPE2.R ├── SingleCellMultiModal-package.R ├── SingleCellMultiModal.R ├── cache.R ├── cellGating.R ├── ontomap.R ├── scMultiome.R ├── scNMT.R ├── seqFISH.R └── utils.R ├── README.md ├── _pkgdown.yml ├── inst ├── CITATION ├── REFERENCES.bib ├── extdata │ ├── docuData │ │ ├── singlecellmultimodalv1.csv │ │ ├── singlecellmultimodalv2.csv │ │ ├── singlecellmultimodalv4.csv │ │ ├── singlecellmultimodalv5.csv │ │ ├── singlecellmultimodalv6.csv │ │ ├── singlecellmultimodalv7.csv │ │ ├── singlecellmultimodalv8.csv │ │ └── singlecellmultimodalv9.csv │ ├── metadata.csv │ └── ontomap.tsv └── scripts │ ├── CITEseq_celltypes.R │ ├── Contributing-Guidelines.Rmd │ ├── README.Rmd │ ├── make-data.R │ ├── make-data │ ├── CITEseq_filtering.R │ ├── make_macrophage.R │ ├── scMultiome.R │ └── scNMT.R │ ├── make-metadata.R │ ├── make-upload.R │ ├── make_docu.R │ ├── ontomap_update.R │ └── update_wiki.sh ├── man ├── CITEseq.Rd ├── GTseq.Rd ├── SCoPE2.Rd ├── SingleCellMultiModal-package.Rd ├── SingleCellMultiModal.Rd ├── addCTLabels.Rd ├── dot-CITEseqMaeToSce.Rd ├── getCellGroups.Rd ├── ontomap.Rd ├── scMultiome.Rd ├── scNMT.Rd ├── scmmCache.Rd └── seqFISH.Rd └── vignettes ├── CITEseq.Rmd ├── ECCITEseq.Rmd ├── GTseq.Rmd ├── SCoPE2.Rmd ├── SingleCellMultiModal.Rmd ├── scMultiome.Rmd ├── scNMT.Rmd └── seqFISH.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | #---------------------------- 2 | # Git 3 | #---------------------------- 4 | ^\.git$ 5 | ^\.github$ 6 | ^\.gitignore$ 7 | ^\.gitattributes$ 8 | 9 | #---------------------------- 10 | # RStudio and R 11 | #---------------------------- 12 | ^\.Rhistory$ 13 | ^.*\.Rproj$ 14 | ^\.Rproj\.user$ 15 | 16 | #---------------------------- 17 | # Data and files 18 | #---------------------------- 19 | ^.*\.rda$ 20 | 
^.*\.tar\.gz$ 21 | ^\.lintr$ 22 | ^README\.md$ 23 | ^docs$ 24 | ^data-raw$ 25 | ^.*_cache$ 26 | ^CITATION\.cff$ 27 | -------------------------------------------------------------------------------- /.github/workflows/pr_check.yml: -------------------------------------------------------------------------------- 1 | name: PR CMD check & build site 2 | 3 | on: 4 | pull_request: 5 | push: 6 | paths: 7 | - 'DESCRIPTION' 8 | - '**.yml' 9 | branches: 10 | - devel 11 | - RELEASE_3_21 12 | 13 | env: 14 | R_REMOTES_NO_ERRORS_FROM_WARNINGS: true 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | CRAN: https://p3m.dev/cran/__linux__/noble/latest 17 | BIOC_RELEASE: RELEASE_3_21 18 | 19 | jobs: 20 | set-matrix: 21 | runs-on: ubuntu-24.04 22 | outputs: 23 | matrix: ${{ steps.set.outputs.matrix }} 24 | dockerfile_exists: ${{ steps.dockerfile.outputs.exists }} 25 | steps: 26 | - name: Set Matrix Bioconductor Version 27 | id: set 28 | run: | 29 | MATRIX="{\"include\":[{\"bioc_version\":\"$GITHUB_REF_NAME\"}]}" 30 | echo "matrix=$MATRIX" >> $GITHUB_OUTPUT 31 | - name: Check for Dockerfile 32 | id: dockerfile 33 | run: | 34 | echo "exists=$( [ -f ./inst/docker/pkg/Dockerfile ] && echo true || echo false )" >> $GITHUB_OUTPUT 35 | 36 | check: 37 | needs: set-matrix 38 | runs-on: ubuntu-latest 39 | strategy: 40 | matrix: ${{ fromJson(needs.set-matrix.outputs.matrix) }} 41 | container: bioconductor/bioconductor_docker:${{ matrix.bioc_version }} 42 | 43 | steps: 44 | - name: Checkout Repository 45 | uses: actions/checkout@v4 46 | with: 47 | ref: ${{ matrix.bioc_version }} 48 | 49 | - name: Query dependencies 50 | run: | 51 | BiocManager::install(c("covr", "BiocCheck")) 52 | saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) 53 | shell: Rscript {0} 54 | 55 | - name: Cache R packages 56 | uses: actions/cache@v4 57 | with: 58 | path: /usr/local/lib/R/site-library 59 | key: ${{ runner.os }}-r-${{ matrix.bioc_version }}-${{ 
hashFiles('.github/depends.Rds') }} 60 | restore-keys: ${{ runner.os }}-r-${{ matrix.bioc_version }}- 61 | 62 | - name: Install GPG 63 | if: ${{ github.ref == 'refs/heads/devel' && github.event_name != 'pull_request' }} 64 | run: sudo apt-get update && sudo apt-get install -y gpg 65 | 66 | - name: Install Dependencies 67 | run: | 68 | remotes::install_deps(dependencies = TRUE, repos = BiocManager::repositories()) 69 | BiocManager::install(c("rcmdcheck", "BiocCheck"), ask = FALSE, update = TRUE) 70 | shell: Rscript {0} 71 | 72 | - name: Check Package 73 | env: 74 | _R_CHECK_CRAN_INCOMING_REMOTE_: false 75 | run: rcmdcheck::rcmdcheck(args = "--no-manual", error_on = "error", check_dir = "check") 76 | shell: Rscript {0} 77 | 78 | - name: Test coverage 79 | if: ${{ success() && github.ref == 'refs/heads/devel' && github.event_name != 'pull_request' }} 80 | run: | 81 | cov <- covr::package_coverage( 82 | quiet = FALSE, 83 | clean = FALSE, 84 | type = "all", 85 | install_path = file.path( 86 | normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), 87 | "package" 88 | ) 89 | ) 90 | covr::to_cobertura(cov) 91 | shell: Rscript {0} 92 | 93 | - name: Upload test results to Codecov 94 | if: ${{ success() && github.ref == 'refs/heads/devel' && github.event_name != 'pull_request' }} 95 | uses: codecov/codecov-action@v4 96 | with: 97 | fail_ci_if_error: ${{ github.event_name != 'pull_request' && true || false }} 98 | file: ./cobertura.xml 99 | plugin: noop 100 | disable_search: true 101 | token: ${{ secrets.CODECOV_TOKEN }} 102 | 103 | - name: Run BiocCheck 104 | id: bioccheck 105 | run: | 106 | BiocCheck::BiocCheck( 107 | dir('check', 'tar.gz$', full.names = TRUE), 108 | `quit-with-status` = TRUE, `no-check-bioc-help` = TRUE 109 | ) 110 | shell: Rscript {0} 111 | 112 | - name: Build pkgdown 113 | if: ${{ github.ref == format('refs/heads/{0}', env.BIOC_RELEASE) && github.event_name != 'pull_request' }} 114 | run: | 115 | PATH=$PATH:$HOME/bin/ Rscript -e 'pkgdown::build_site()' 
116 | 117 | - name: Upload pkgdown artifact 118 | if: github.ref == format('refs/heads/{0}', env.BIOC_RELEASE) 119 | uses: actions/upload-pages-artifact@v3 120 | with: 121 | path: docs 122 | 123 | dock: 124 | needs: 125 | - check 126 | - set-matrix 127 | runs-on: ubuntu-24.04 128 | if: ${{ github.ref == 'refs/heads/devel' && needs.set-matrix.outputs.dockerfile_exists == 'true' }} 129 | steps: 130 | - name: Checkout Repository 131 | if: ${{ success() && github.event_name != 'pull_request' }} 132 | uses: actions/checkout@v4 133 | 134 | - name: Register repo name 135 | if: ${{ github.event_name != 'pull_request' }} 136 | id: reg_repo_name 137 | run: | 138 | echo CONT_IMG_NAME=$(echo ${{ github.event.repository.name }} | tr '[:upper:]' '[:lower:]') >> $GITHUB_ENV 139 | 140 | - name: Login to Docker Hub 141 | if: ${{ github.event_name != 'pull_request' }} 142 | uses: docker/login-action@v2 143 | with: 144 | username: ${{ secrets.DOCKERHUB_USERNAME }} 145 | password: ${{ secrets.DOCKERHUB_TOKEN }} 146 | 147 | - name: Build and Push Docker 148 | if: ${{ success() && github.event_name != 'pull_request' }} 149 | uses: docker/build-push-action@v6 150 | with: 151 | context: . 
152 | file: ./inst/docker/pkg/Dockerfile 153 | push: true 154 | tags: > 155 | ${{ secrets.DOCKERHUB_USERNAME }}/${{ env.CONT_IMG_NAME }}:latest, 156 | ${{ secrets.DOCKERHUB_USERNAME }}/${{ env.CONT_IMG_NAME }}:devel 157 | 158 | deploy: 159 | needs: check 160 | permissions: 161 | contents: write 162 | pages: write 163 | id-token: write 164 | runs-on: ubuntu-24.04 165 | 166 | steps: 167 | - name: Deploy to GitHub Pages 168 | if: ${{ github.ref == format('refs/heads/{0}', env.BIOC_RELEASE) && github.event_name != 'pull_request' }} 169 | id: deployment 170 | uses: actions/deploy-pages@v4 171 | 172 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # RStudio Files 2 | .Rproj.user 3 | # Data Files 4 | .RData 5 | /vignettes/cache/ 6 | *.[Rr][Dd][SsAa] 7 | *.txt 8 | *.html 9 | .Ruserdata 10 | .Rhistory 11 | *GSE* 12 | *_cache 13 | # Merge residuals 14 | *.orig 15 | # compressed files 16 | *.gz 17 | # databases 18 | *.sqlite 19 | .DS_Store 20 | SingleCellMultiModal.Rproj 21 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Type: Package 2 | Package: SingleCellMultiModal 3 | Title: Integrating Multi-modal Single Cell Experiment datasets 4 | Version: 1.21.2 5 | Authors@R: c( 6 | person("Marcel", "Ramos", , "marcel.ramos@roswellpark.org", 7 | c("aut", "cre"), c(ORCID = "0000-0002-3242-0582") 8 | ), 9 | person("Ricard", "Argelaguet", , "ricard@ebi.ac.uk", "aut"), 10 | person("Al", "Abadi", , , "ctb"), 11 | person("Dario", "Righelli", , "dario.righelli@gmail.com", "aut"), 12 | person("Christophe", "Vanderaa", , 13 | "christophe.vanderaa@uclouvain.be", "ctb"), 14 | person("Kelly", "Eckenrode", , "kelly.eckenrode@sph.cuny.edu", "aut"), 15 | person("Ludwig", "Geistlinger", , 16 | "ludwig_geistlinger@hms.harvard.edu", "aut"), 
17 | person("Levi", "Waldron", , "lwaldron.research@gmail.com", "aut") 18 | ) 19 | Description: SingleCellMultiModal is an ExperimentHub package 20 | that serves multiple datasets obtained from GEO and other sources and 21 | represents them as MultiAssayExperiment objects. We provide several 22 | multi-modal datasets including scNMT, 10X Multiome, seqFISH, CITEseq, 23 | SCoPE2, and others. The scope of the package is to provide data for 24 | benchmarking and analysis. To cite, use the 'citation' function and see 25 | . 26 | License: Artistic-2.0 27 | BugReports: https://github.com/waldronlab/SingleCellMultiModal/issues 28 | Depends: 29 | R (>= 4.2.0), 30 | MultiAssayExperiment 31 | Imports: 32 | AnnotationHub, 33 | BiocBaseUtils, 34 | BiocFileCache, 35 | ExperimentHub, 36 | graphics, 37 | HDF5Array, 38 | S4Vectors, 39 | SingleCellExperiment, 40 | SpatialExperiment, 41 | SummarizedExperiment, 42 | Matrix, 43 | methods, 44 | utils 45 | Suggests: 46 | BiocStyle, 47 | ggplot2, 48 | knitr, 49 | RaggedExperiment, 50 | rmarkdown, 51 | scater, 52 | scran, 53 | UpSetR, 54 | uwot 55 | VignetteBuilder: 56 | knitr 57 | biocViews: ExperimentData, SingleCellData, ReproducibleResearch, 58 | ExperimentHub, GEO 59 | Encoding: UTF-8 60 | RoxygenNote: 7.3.2 61 | Roxygen: list(markdown = TRUE) 62 | Date: 2025-05-06 63 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(CITEseq) 4 | export(GTseq) 5 | export(SCoPE2) 6 | export(SingleCellMultiModal) 7 | export(addCTLabels) 8 | export(getCellGroups) 9 | export(ontomap) 10 | export(removeCache) 11 | export(scMultiome) 12 | export(scNMT) 13 | export(scmmCache) 14 | export(seqFISH) 15 | export(setCache) 16 | import(MultiAssayExperiment) 17 | importFrom(AnnotationHub,query) 18 | importFrom(ExperimentHub,ExperimentHub) 19 |
importFrom(ExperimentHub,loadResources) 20 | importFrom(Matrix,Matrix) 21 | importFrom(MultiAssayExperiment,experiments) 22 | importFrom(S4Vectors,DataFrame) 23 | importFrom(S4Vectors,SimpleList) 24 | importFrom(SingleCellExperiment,"altExp<-") 25 | importFrom(SingleCellExperiment,"altExps<-") 26 | importFrom(SingleCellExperiment,SingleCellExperiment) 27 | importFrom(SingleCellExperiment,altExp) 28 | importFrom(SingleCellExperiment,altExps) 29 | importFrom(SingleCellExperiment,colData) 30 | importFrom(SingleCellExperiment,counts) 31 | importFrom(SpatialExperiment,SpatialExperiment) 32 | importFrom(SummarizedExperiment,"assays<-") 33 | importFrom(SummarizedExperiment,SummarizedExperiment) 34 | importFrom(graphics,abline) 35 | importFrom(graphics,smoothScatter) 36 | importFrom(graphics,text) 37 | importFrom(methods,is) 38 | importFrom(utils,glob2rx) 39 | importFrom(utils,read.csv) 40 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | ## Changes in version 1.16.0 2 | 3 | ### New features 4 | 5 | * Added citation information to the package; see 6 | `citation("SingleCellMultiModal")` and the vignette. 7 | 8 | ### Bug fixes and minor improvements 9 | 10 | * Update imports from `SingleCellExperiment`, `S4Vectors`, and 11 | `SummarizedExperiment` 12 | * Add package anchors to links in documentation 13 | * Use markdown in documentation 14 | 15 | ## Changes in version 1.14.0 16 | 17 | ### New features 18 | 19 | * The `ontomap` function provides a reference table of ontology IDs and cell 20 | names by data type available in the package. 21 | * `scRNAseq` `colData` added to `cord_blood` and `peripheral_blood` datasets 22 | provided by the `CITEseq` function. 
(@drighelli) 23 | 24 | ### Bug fixes and minor improvements 25 | 26 | * When using `HDF5` as `format` input in `scMultiome`, the filtering of file 27 | paths obtained from `ExperimentHub` has been fixed. 28 | * Using `BiocBaseUtils` internally to handle assertions and checks. 29 | 30 | ## Changes in version 1.12.0 31 | 32 | ### Bug fixes and minor improvements 33 | 34 | * Added Ludwig Geistlinger as author (@lgeistlinger) for contributing the 35 | `GTseq` dataset. 36 | 37 | ## Changes in version 1.8.0 38 | 39 | ### Bug fixes and minor improvements 40 | 41 | * Updated the reference in the `SCoPE2` vignette (@cvanderaa). 42 | 43 | ## Changes in version 1.6.0 44 | 45 | ### New features 46 | 47 | * `scMultiome` version `1.0.1` provides the 10X format for RNAseq data. 48 | 49 | ### Bug fixes and minor improvements 50 | 51 | * Updates to `seqFISH` vignette and documentation. 52 | * Updated to changes in `SummarizedExperiment` where `assayDimnames` are 53 | checked. 54 | * `scNMT` defaults to version '1.0.0's QC filtered cells. For unfiltered 55 | cells see version section in `?scNMT`. 56 | 57 | ## Changes in version 1.4.0 58 | 59 | ### New features 60 | 61 | * `SingleCellMultiModal` function allows the combination of multiple 62 | multi-modal technologies. 63 | * `GTseq` data from Macaulay et al. (2015) now available (@lgeistlinger) 64 | * `SCoPE2` data from Specht et al. 
now available thanks to @cvanderaa (#26) 65 | * `scMultiome` provides PBMC from 10X Genomics thanks to @rargelaguet 66 | 67 | ### Bug fixes and minor improvements 68 | 69 | * Metadata information (function call and call to technology map) included in 70 | `SingleCellMultiModal` 71 | * `scNMT` includes the original call in the `MultiAssayExperiment` metadata 72 | * Improved and edited Contributing Guidelines for clarity 73 | * `seqFISH` uses the `spatialData` argument with `DataFrame` input based on 74 | changes to `SpatialExperiment` (@drighelli) 75 | * Removed the extra column in the `sampleMap` in `CITEseq` (@drighelli) 76 | 77 | ## Changes in version 1.2.0 78 | 79 | ### New features 80 | 81 | * `CITEseq` function, vignette, and 'cord_blood' data available 82 | (@drighelli, #18) 83 | * Include `seqFISH` function, vignette, and 'mouse_visual_cortex' data 84 | (v1 and v2 from @drighelli, #14) 85 | * New 'mouse_gastrulation' dataset released (version "2.0.0"). 86 | * Use `version` argument to indicate the `mouse_gastrulation` data version 87 | * The data includes **all** cells not only the ones that passed the QC 88 | of all three 'omics (thanks @rargelaguet, @ajabadi). 89 | 90 | ### Bug fixes and minor improvements 91 | 92 | * Caching mechanism uses `tools::R_user_dir` and not `rappdirs`. 93 | * Improved display of available data using `ExperimentHub` metadata. 94 | * Improved documentation explaining versioning differences. 95 | * Contribution guidelines available at 96 | https://github.com/waldronlab/SingleCellMultiModal/wiki/Contributing-Guidelines 97 | * Default `version` argument in `scNMT` function now set to "2.0.0" (version 98 | "1.0.0" still available) 99 | 100 | ## Changes in version 1.0.0 101 | 102 | ### New features 103 | 104 | * `scNMT` serves the mouse gastrulation dataset from Argelaguet et al. 
2019 105 | * Data set is provided by Argelaguet and colleagues via CloudStor link: 106 | https://cloudstor.aarnet.edu.au/plus/s/Xzf5vCgAEUVgbfQ 107 | * GitHub repository for the dataset by the authors available at: 108 | https://github.com/rargelaguet/scnmt_gastrulation 109 | 110 | ### Bug fixes and minor improvements 111 | 112 | * Row names in the scNMT dataset properly show mouse ENSEMBL identifiers 113 | -------------------------------------------------------------------------------- /R/CITEseq.R: -------------------------------------------------------------------------------- 1 | .cord_blood <- function(ess_list) 2 | { 3 | idx <- grep(pattern="Counts", names(ess_list$experiments)) 4 | names(ess_list$experiments) <- gsub("Counts|_Counts", "", names(ess_list$experiments)) 5 | mae <- MultiAssayExperiment::MultiAssayExperiment(experiments=(ess_list$experiments[idx])) 6 | coldat <- sampleMap(mae)[,-c(1:2), drop=FALSE] 7 | rownames(coldat) <- coldat[,1] 8 | colnames(coldat) <- c("sampleID") 9 | cd <- ess_list$experiments[grep("coldata", names(ess_list$experiments))][[1]] 10 | ### check add clr counts 11 | if ( !is.null(dim(cd)) ) 12 | { 13 | # colData(mae) <- S4Vectors::cbind.DataFrame(coldat, cd) 14 | colData(mae) <- DataFrame(cd) 15 | } else { 16 | colData(mae) <- coldat 17 | } 18 | return(mae) 19 | } 20 | 21 | .combMatrixForAssay <- function(explist, dimslist, 22 | assayId=c("scADT", "scHTO", "scRNA")) 23 | { 24 | assayId <- match.arg(assayId) 25 | assIdx <- grep(assayId, names(explist)) 26 | switch(assayId, 27 | "scADT"=, "scHTO"={ 28 | if(length(explist[assIdx]) == 2) 29 | { 30 | m1 <- Matrix::Matrix(unlist(explist[assIdx]), 31 | nrow=dimslist[assIdx][[1]][1], 32 | ncol=(dimslist[assIdx][[1]][2]+dimslist[assIdx][[2]][2]), 33 | sparse=TRUE) 34 | } else { 35 | m1 <- Matrix::Matrix(explist[[assIdx]]) 36 | } 37 | }, 38 | "scRNA"={ 39 | if(length(explist[assIdx]) == 2) 40 | { 41 | ## we can have at most 2 matrices per assay; they are bound column-wise 42 | m1 <- cbind(explist[[assIdx[1]]], explist[[assIdx[2]]]) 43
| } else { 44 | m1 <- explist[[assIdx]] 45 | } 46 | }, 47 | { stop("Unrecognized assayId: ", assayId) } 48 | ) 49 | if(length(explist[assIdx]) == 2) 50 | { 51 | colnames(m1) <- c(paste0(rep(gsub("scADT|scHTO|scRNA","", 52 | names(explist)[assIdx[1]]), 53 | dimslist[assIdx][[1]][2]), 54 | colnames(explist[[assIdx[1]]])), 55 | paste0(rep(gsub("scADT|scHTO|scRNA","", 56 | names(explist)[assIdx[2]]), 57 | dimslist[assIdx][[2]][2]), 58 | colnames(explist[[assIdx[2]]]))) 59 | rownames(m1) <- rownames(explist[[assIdx[[1]]]]) 60 | } else { 61 | colnames(m1) <- paste0(rep(gsub("scADT|scHTO|scRNA","", 62 | names(explist)[assIdx[1]]), 63 | dimslist[assIdx][[1]][2]), 64 | colnames(explist[[assIdx[1]]])) 65 | rownames(m1) <- rownames(explist[[assIdx[[1]]]]) 66 | } 67 | return(m1) 68 | } 69 | 70 | .buildColData <- function(mat1, assayId) 71 | { 72 | cd <- DataFrame( 73 | colname=colnames(mat1), 74 | condition=gsub("_\\w+", "", colnames(mat1)) 75 | ) 76 | return(cd) 77 | } 78 | 79 | .buildMap <- function(mat1, assayId) 80 | { 81 | map <- DataFrame(assay=assayId, 82 | #primary=gsub("_\\w+", "", colnames(mat1)), 83 | primary=colnames(mat1), 84 | colname=colnames(mat1), 85 | condition=gsub("_\\w+", "", colnames(mat1))) 86 | return(map) 87 | } 88 | 89 | #' @importFrom Matrix Matrix 90 | .peripheral_blood <- function(ess_list) 91 | { 92 | ll <- ess_list$experiments 93 | cdidx <- grep("coldata", names(ll)) 94 | cd <- NULL 95 | if (length(cdidx)!=0) 96 | { 97 | cd <- ll[[cdidx]] 98 | ll <- ess_list$experiments[-cdidx] 99 | } 100 | ll <- lapply(ll, function(x) 101 | { 102 | x <- x[order(rownames(x)),] 103 | }) 104 | 105 | dims <- lapply(ll, dim) 106 | ## build one combined matrix and one sample map per assay found in 'll'; 107 | ## assays absent from the requested modes yield NULL and are dropped by Filter() 108 | exps <- Filter(Negate(is.null), lapply(c("scADT", "scHTO", "scRNA"), function(assayn) 109 | { 110 | if ( !isEmpty(grep(assayn, names(ll))) ) 111 | { 112 | assmat <- .combMatrixForAssay(explist=ll, dimslist=dims, assayId=assayn) 113 | assmap <- .buildMap(assmat, assayId=assayn) 114 |
return(list("EXP"=assmat, "SAMP"=assmap, "NAME"=assayn)) 115 | } 116 | })) 117 | names(exps) <- unlist(lapply(exps, function(e){e$NAME})) 118 | expslist <- lapply(exps, function(e){e$EXP}) 119 | sampmap <- do.call("rbind", lapply(exps, function(e){e$SAMP})) 120 | if (is.null(cd)) { 121 | ## no colData resource retrieved: derive it from the sample map columns 122 | coldat <- sampmap[,-c(1:2)] 123 | colnames(coldat) <- c("sampleID", "condition") 124 | rownames(coldat) <- coldat$sampleID 125 | coldat <- unique(coldat) 126 | } else { 127 | coldat <- cd 128 | } 129 | mae <- MultiAssayExperiment::MultiAssayExperiment(experiments=expslist, 130 | sampleMap=sampmap, 131 | colData=coldat) 132 | if(!isEmpty(grep("TCR", names(ll)))) 133 | { 134 | metadata(mae) <- ll[grep("TCR", names(ll))] 135 | } 136 | return(mae) 137 | } 138 | 139 | #' CITEseq 140 | #' @description function assembles data on-the-fly from `ExperimentHub` to 141 | #' provide a 142 | #' [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class] 143 | #' container. The `DataType` argument provides access to the 144 | #' available datasets associated to the package. 145 | #' @author Dario Righelli 146 | #' @details CITEseq data are a combination of single cell transcriptomics and 147 | #' about a hundred cell surface proteins. 148 | #' Available datasets are: 149 | #' * cord_blood: a dataset of single cells of cord blood as 150 | #' provided in Stoeckius et al. (2017). 151 | #' * scRNA_Counts - Stoeckius scRNA-seq gene count matrix 152 | #' * scADT - Stoeckius antibody-derived tags (ADT) data 153 | #' * peripheral_blood: a dataset of single cells of peripheral 154 | #' blood as provided in Mimitou et al. (2019). We provide two different 155 | #' conditions: controls (CTRL) and Cutaneous T-cell Lymphoma (CTCL). Just build 156 | #' appropriate `modes` regex for subselecting the dataset modes.
157 | #' * scRNA - Mimitou scRNA-seq gene count matrix 158 | #' * scADT - Mimitou antibody-derived tags (ADT) data 159 | #' * scHTO - Mimitou Hashtag Oligo (HTO) data 160 | #' * TCRab - Mimitou T-cell Receptors (TCR) alpha and beta 161 | #' available through the object metadata. 162 | #' * TCRgd - Mimitou T-cell Receptors (TCR) gamma and delta 163 | #' available through the object metadata. 164 | #' 165 | #' @param DataType `character(1)` indicating the identifier of the dataset to 166 | #' retrieve. (default "cord_blood") 167 | #' 168 | #' @param modes `character()` The assay types or modes of data to obtain; these 169 | #' include scADT and scRNA-seq data by default. 170 | #' 171 | #' @param version `character(1)` The data version to retrieve 172 | #' (default '1.0.0'). 173 | #' @param dry.run `logical(1)` Whether to return the dataset names before actual 174 | #' download (default `TRUE`) 175 | #' @param filtered `logical(1)` indicating if the returned dataset needs to 176 | #' have filtered cells. 177 | #' See Details for additional information about the filtering process. 178 | #' 179 | #' @param verbose `logical(1)` Whether to show the dataset currently being 180 | #' (down)loaded (default `TRUE`) 181 | #' 182 | #' @param ... Additional arguments passed on to the 183 | #' \link[ExperimentHub]{ExperimentHub-class} constructor 184 | #' 185 | #' @param DataClass either MultiAssayExperiment or SingleCellExperiment 186 | #' data classes can be returned (default MultiAssayExperiment) 187 | #' 188 | #' @details 189 | #' If `filtered` parameter is `FALSE` (default), the `colData` of the returned 190 | #' object contains multiple columns of `logicals` indicating the cells to be 191 | #' discarded. 192 | #' In case `filtered` is `TRUE`, the `discard` column is used to filter the 193 | #' cells. 194 | #' Column `adt.discard` indicates the cells to be discarded computed on the ADT 195 | #' assay.
196 | #' Column `mito.discard` indicates the cells to be discarded computed on the 197 | #' RNA assay and mitochondrial genes. 198 | #' Column `discard` combines the previous columns with an `OR` operator. 199 | #' Note that for the `peripheral_blood` dataset these three columns are 200 | #' computed and returned separately for the `CTCL` and `CTRL` conditions. 201 | #' In this case the additional `discard` column combines the `discard.CTCL` and 202 | #' `discard.CTRL` columns with an `OR` operator. 203 | #' Cell filtering has been computed for `cord_blood` and `peripheral_blood` 204 | #' datasets following section 12.3 of the Advanced Single-Cell Analysis with 205 | #' Bioconductor book. 206 | #' Executed code can be retrieved in the CITEseq_filtering.R script of this 207 | #' package. 208 | #' 209 | #' @return A single cell multi-modal 210 | #' [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class] 211 | #' or informative `data.frame` when `dry.run` is `TRUE`. When `DataClass` is 212 | #' `SingleCellExperiment` an object of this class is returned with an RNA 213 | #' assay as main experiment and other assay(s) as `AltExp(s)`. 214 | #' @references Stoeckius et al. (2017), Mimitou et al. (2019) 215 | #' @export 216 | #' 217 | #' @examples 218 | #' 219 | #' mae <- CITEseq(DataType="cord_blood", dry.run=FALSE) 220 | #' experiments(mae) 221 | CITEseq <- function(DataType=c("cord_blood", "peripheral_blood"), modes="*", 222 | version="1.0.0", dry.run=TRUE, filtered=FALSE, verbose=TRUE, 223 | DataClass=c("MultiAssayExperiment", "SingleCellExperiment"), 224 | ...) 225 | { 226 | dataType <- match.arg(DataType) 227 | message("Dataset: ", dataType) 228 | dataClass <- match.arg(DataClass) 229 | ess_list <- .getResourcesList(prefix = "citeseq_", datatype = dataType, 230 | modes=modes, version=version, 231 | dry.run=dry.run, verbose=verbose, ...)
232 | if (!dry.run) { 233 | mae <- switch( 234 | dataType, 235 | "cord_blood" = { .cord_blood(ess_list=ess_list) }, 236 | "peripheral_blood" = { .peripheral_blood(ess_list=ess_list) }, 237 | ## Add here other CITE-seq datasets based on DataType identifier 238 | { stop("Unrecognized CITE-seq dataset name: ", dataType) } 239 | ) 240 | if (filtered) { ## sampleMap has one row per assay column while colData has one row per cell, so kept cells are matched on the primary id instead of by position 241 | sampleMap(mae) <- sampleMap(mae)[sampleMap(mae)$primary %in% rownames(colData(mae))[!colData(mae)$discard], ] 242 | } 243 | if(dataClass=="SingleCellExperiment") return(.CITEseqMaeToSce(mae)) 244 | return(mae) 245 | } else { 246 | return(ess_list) 247 | } 248 | } 249 | 250 | 251 | #' CITEseqMaeToSce 252 | #' @description converts a `MultiAssayExperiment` object with CITEseq data into 253 | #' a `SingleCellExperiment` object to be used with already known methods and 254 | #' packages in literature. 255 | #' 256 | #' Note that for creating a `SingleCellExperiment` object the following function 257 | #' subsets all the assays present in the `MultiAssayExperiment` with only the 258 | #' common cells across all the modalities. 259 | #' This could result in an incomplete object. 260 | #' 261 | #' 262 | #' @param mae a MultiAssayExperiment object with scRNA and/or scADT and/or 263 | #' scHTO named experiments. 264 | #' 265 | #' @return a SingleCellExperiment object with scRNA data as counts 266 | #' and scADT, scHTO data as altExps. 267 | #' If only one modality is present, it is returned as main assay of the SCE.
268 | #' 269 | #' @importFrom MultiAssayExperiment experiments 270 | #' @importFrom SummarizedExperiment SummarizedExperiment assays<- 271 | #' @importFrom SingleCellExperiment SingleCellExperiment altExp altExp<- altExps 272 | #' altExps<- colData counts 273 | #' @importFrom methods is 274 | #' @importFrom S4Vectors SimpleList 275 | #' @keywords internal 276 | .CITEseqMaeToSce <- function(mae) 277 | { 278 | stopifnot(c(is(mae, "MultiAssayExperiment"), !(length(mae)==0))) 279 | ## cs: column (cell) names shared by every experiment in 'mae' 280 | cs <- colnames(mae[[1]]) 281 | for ( i in seq_along(mae)[-1]) { cs <- intersect(cs, colnames(mae[[i]])) } 282 | 283 | scelist <- lapply(seq_along(mae), function(i) 284 | { 285 | sce <- SingleCellExperiment(list(counts=mae[[i]])) 286 | sce <- sce[, (colnames(sce) %in% cs)] 287 | cd <- colData(mae)[colnames(sce), , drop=FALSE] ## index by name so the annotation rows follow the column order of this assay 288 | colData(sce) <- cd 289 | return(sce) 290 | }) 291 | names(scelist) <- names(mae) 292 | ## the scRNA experiment becomes the main SCE, all other experiments its altExps 293 | idx <- grep("scRNA", names(scelist)) 294 | if (length(idx) != 0 ) 295 | { 296 | altExps(scelist[[idx]]) <- scelist[-idx] 297 | sce <- scelist[[idx]] 298 | } else { 299 | stop("Couldn't find RNA assay in MultiAssayExperiment") 300 | } 301 | idx <- grep("scADT_clr", names(altExps(sce))) 302 | if( length(idx) != 0 ) 303 | { 304 | clr <- counts(altExps(sce)[[idx]]) 305 | altExps(sce)[idx] <- NULL 306 | assays(altExp(sce)) <- SimpleList(counts=counts(altExp(sce)), clr=clr) 307 | } 308 | 309 | if ( !isEmpty(metadata(mae))) { 310 | metadata(sce) <- metadata(mae) 311 | } 312 | 313 | return(sce) 314 | } 315 | 316 | 317 | -------------------------------------------------------------------------------- /R/GTseq.R: -------------------------------------------------------------------------------- 1 | ############################################################ 2 | # 3 | # author: Ludwig Geistlinger 4 | # date: 2021-03-24 18:17:27 5 | # 6 | # descr: G&T-seq data retrieval 7 | # 8 | ############################################################ 9 | 10 | #' Parallel
sequencing data of single-cell genomes and transcriptomes 11 | #' 12 | #' @description GTseq assembles data on-the-fly from `ExperimentHub` to provide 13 | #' a 14 | #' [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class] 15 | #' container. The `DataType` argument provides access to the 16 | #' `mouse_embryo_8_cell` dataset as obtained from Macaulay et al. (2015). 17 | #' Protocol information for this dataset is available from Macaulay et al. 18 | #' (2016). See references. 19 | #' 20 | #' @details G&T-seq is a combination of Picoplex amplified gDNA sequencing 21 | #' (genome) and SMARTSeq2 amplified cDNA sequencing (transcriptome) of the 22 | #' same cell. For more information, see Macaulay et al. (2015). 23 | #' * mouse_embryo_8_cell: 24 | #' this dataset was filtered for bad cells as specified in Macaulay 25 | #' et al. (2015). 26 | #' * genomic - integer copy numbers as detected from scDNA-seq 27 | #' * transcriptomic - raw read counts as quantified from scRNA-seq 28 | #' 29 | #' @section metadata: 30 | #' The `MultiAssayExperiment` metadata includes the original function call 31 | #' (element `call`) followed by the metadata returned with the resources. 32 | #' 33 | #' @param DataType `character(1)` Indicates study that produces this type of 34 | #' data (default: 'mouse_embryo_8_cell') 35 | #' 36 | #' @param modes `character()` A wildcard / glob pattern of modes, such as 37 | #' `"*omic"`. A wildcard of `"*"` will return all modes including 38 | #' copy numbers ("genomic") and RNA-seq read counts ("transcriptomic"), 39 | #' which is the default. 40 | #' 41 | #' @param version `character(1)` Currently, only version '1.0.0'. 42 | #' 43 | #' @param dry.run `logical(1)` Whether to return the dataset names before actual 44 | #' download (default `TRUE`) 45 | #' 46 | #' @param verbose `logical(1)` Whether to show the dataset currently being 47 | #' (down)loaded (default `TRUE`) 48 | #' 49 | #' @param ...
Additional arguments passed on to the 50 | #' [ExperimentHub][ExperimentHub::ExperimentHub-class] constructor 51 | #' 52 | #' @seealso SingleCellMultiModal-package 53 | #' 54 | #' @return A single cell multi-modal 55 | #' [MultiAssayExperiment][MultiAssayExperiment::MultiAssayExperiment-class] or 56 | #' informative `data.frame` when `dry.run` is `TRUE` 57 | #' 58 | #' @source 59 | #' 60 | #' @references 61 | #' Macaulay et al. (2015) G&T-seq: parallel sequencing of single-cell 62 | #' genomes and transcriptomes. Nat Methods, 12:519–22. 63 | #' 64 | #' Macaulay et al. (2016) Separation and parallel sequencing of the genomes 65 | #' and transcriptomes of single cells using G&T-seq. Nat Protoc, 11:2081–103. 66 | #' 67 | #' @examples 68 | #' 69 | #' GTseq() 70 | #' 71 | #' @export GTseq 72 | GTseq <- 73 | function( 74 | DataType = "mouse_embryo_8_cell", modes = "*", 75 | version = "1.0.0", dry.run = TRUE, verbose = TRUE, ... 76 | ) 77 | { 78 | stopifnot(.isSingleChar(version), .isSingleChar(DataType)) 79 | meta <- list(call = match.call()) # keep the user's call so it can be stored in the MAE metadata 80 | 81 | ess_list <- .getResourcesList( 82 | prefix = "GTseq_", 83 | datatype = DataType, 84 | modes = modes, 85 | version = version, 86 | dry.run = dry.run, 87 | verbose = verbose, 88 | ...
89 | ) 90 | 91 | if (dry.run) { return(ess_list) } # dry run: hand back the resource listing without building the MAE 92 | 93 | cdat <- ess_list[["colData"]] 94 | prim.ids <- rep(paste0("cell", seq_len(112)), 2) # NOTE(review): 112 appears to be the hard-coded cell count, repeated once per assay - confirm against the colData resource 95 | smap <- S4Vectors::DataFrame( 96 | assay = tolower(cdat[,"Comment.LIBRARY_SOURCE."]), 97 | primary = prim.ids, 98 | colname = cdat[,"Sample.ID"] 99 | ) 100 | 101 | rcols <- c("organism", "sex", "cell.type") 102 | rcols <- paste0("Characteristics.", rcols, ".") 103 | cdat <- cdat[seq_len(112), rcols] # keep the first 112 rows and the three characteristics columns; presumably one row per cell - verify row order 104 | rownames(cdat) <- prim.ids[seq_len(112)] 105 | 106 | MultiAssayExperiment( 107 | experiments = ess_list[["experiments"]], 108 | colData = cdat, 109 | sampleMap = smap, 110 | metadata = c(meta, as.list(ess_list[["metadata"]])) 111 | ) 112 | } 113 | -------------------------------------------------------------------------------- /R/SCoPE2.R: -------------------------------------------------------------------------------- 1 | #' Single-cell RNA sequencing and proteomics 2 | #' 3 | #' @description SCoPE2 assembles data on-the-fly from `ExperimentHub` to provide 4 | #' a 5 | #' [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class] 6 | #' container. The `DataType` argument provides access to the `SCoPE2` dataset 7 | #' as provided by Specht et al. (2020; DOI: 8 | #' <https://doi.org/10.1101/665307>). The article provides more information 9 | #' about the data acquisition and pre-processing. 10 | #' 11 | #' @details The SCoPE2 study combines scRNA-seq (transcriptome) and 12 | #' single-cell proteomics. 13 | #' 14 | #' * macrophage_differentiation: the cells are monocytes that undergo 15 | #' macrophage differentiation. No annotation is available for the 16 | #' transcriptome data, but batch and cell type annotations are 17 | #' available for the proteomics data in the `celltype` `colData` column. 18 | #' The transcriptomics and proteomics data were not measured from the same 19 | #' cells but from a distinct set of cell cultures. 20 | #' This dataset provides already filtered bad quality cells.
#' * scRNAseq1 - single-cell transcriptome (batch 1)
#' * scRNAseq2 - single-cell transcriptome (batch 2)
#' * scp - single-cell proteomics
#'
#' @inheritParams scNMT
#'
#' @param DataType `character(1)` Indicates study that produces this type of
#' data (default: 'macrophage_differentiation')
#'
#' @param modes `character()` A wildcard / glob pattern of modes, such as
#' `"rna"`. A wildcard of `"*"` will return all modes, that are
#' transcriptome ("rna") or proteome ("protein") which is the
#' default.
#'
#' @param version `character(1)`, currently only version '1.0.0' is
#' available
#'
#' @return A single cell multi-modal
#' [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
#' or informative `data.frame` when `dry.run` is `TRUE`
#'
#' @seealso SingleCellMultiModal-package
#'
#' @source All files are linked from the slavovlab website
#'
#'
#' @references
#' Specht, Harrison, Edward Emmott, Aleksandra A. Petelski, R.
#' Gray Huffman, David H. Perlman, Marco Serra, Peter Kharchenko,
#' Antonius Koller, and Nikolai Slavov. 2020. “Single-Cell
#' Proteomic and Transcriptomic Analysis of Macrophage
#' Heterogeneity.” bioRxiv. https://doi.org/10.1101/665307.
#'
#' @examples
#'
#' SCoPE2(DataType = "macrophage_differentiation",
#' modes = "*",
#' version = "1.0.0",
#' dry.run = TRUE)
#'
#' @export
SCoPE2 <- function(
    DataType = "macrophage_differentiation",
    modes = "*",
    version = "1.0.0",
    dry.run = TRUE,
    verbose = TRUE,
    ...
) {
    ## Validate scalar inputs first (same contract as the sibling functions),
    ## so that a vector or NA 'version' fails with a clear message instead of
    ## an obscure error from `if`
    stopifnot(.isSingleChar(version), .isSingleChar(DataType))
    if (!identical(version, "1.0.0"))
        stop("Only version '1.0.0' is available.")

    ## Retrieve the different resources from ExperimentHub
    ess_list <- .getResourcesList(
        prefix = "macrophage_",
        datatype = DataType,
        modes = modes,
        version = version,
        dry.run = dry.run,
        verbose = verbose,
        ...
    )
    ## If dry.run, return only the information table
    if (dry.run)
        return(ess_list)

    ## Build the colData from the per-experiment colData and rename the
    ## 'Batch' column (if present) to 'batch_Chromium'
    cd <- .mergeLowColData(ess_list[["experiments"]])
    colnames(cd)[which(colnames(cd) == "Batch")] <- "batch_Chromium"

    ## Construct and return the MAE object
    MultiAssayExperiment(
        experiments = ess_list[["experiments"]],
        colData = cd
    )
}

## ---- /R/SingleCellMultiModal-package.R ----

#' @importFrom ExperimentHub loadResources ExperimentHub
#' @importFrom AnnotationHub query
#' @importFrom utils glob2rx read.csv
#' @import MultiAssayExperiment
NULL

#' SingleCellMultiModal-package
#'
#' @aliases NULL SingleCellMultiModal-package
#'
#' @description
#' The SingleCellMultiModal package provides a convenient and user-friendly
#' representation of multi-modal data from projects such as `scNMT` for mouse
#' gastrulation.
#'
#' @examples
#' help(package = "SingleCellMultiModal")
#'
"_PACKAGE"

## ---- /R/SingleCellMultiModal.R ----

## Lookup table linking each supported `DataType` to the exported function
## that builds it (CITEseq serves two data types)
.internalMap <- S4Vectors::DataFrame(
    FUN = c(
        "scNMT", "scMultiome", "SCoPE2",
        "CITEseq", "CITEseq", "seqFISH", "GTseq"
    ),
    DataType = c(
        "mouse_gastrulation", "pbmc_10x",
        "macrophage_differentiation", "cord_blood",
        "peripheral_blood", "mouse_visual_cortex",
        "mouse_embryo_8_cell"
    )
)

## Subset `.internalMap` to the requested data types (in the order given) and
## attach the `dry.run` / `verbose` settings as extra columns; stops when any
## requested data type is unknown
.filterMap <- function(DataTypes, dry.run, verbose) {
    rows <- match(DataTypes, .internalMap[["DataType"]])
    unknown <- DataTypes[is.na(rows)]
    if (length(unknown))
        stop("'", paste(unknown, collapse = ", "),
            "' is not available, ", "see ?SingleCellMultiModal")
    selected <- .internalMap[rows, , drop = FALSE]
    selected[["dry.run"]] <- dry.run
    selected[["verbose"]] <- verbose
    selected
}

#' Combining Modalities into one MultiAssayExperiment
#'
#' Combine multiple single cell modalities into one using the input of the
#' individual functions.
#'
#' @inheritParams scNMT
#'
#' @param DataTypes `character()` A vector of data types as indicated in each
#' individual function by the `DataType` parameter. These can be any of
#' the following: "mouse_gastrulation", "pbmc_10x",
#' "macrophage_differentiation", "cord_blood", "peripheral_blood",
#' "mouse_visual_cortex", "mouse_embryo_8_cell"
#'
#' @param versions `character()` A vector of versions for each DataType. By
#' default, version `1.0.0` is obtained for all data types.
#'
#' @param modes list() A list or CharacterList of modes for each data type
#' where each element corresponds to one data type.
#'
#' @return A multi-modality `MultiAssayExperiment`
#'
#' @section metadata:
#' The metadata in the `MultiAssayExperiment` contains the original
#' function call used to generate the object (labeled as `call`),
#' a `call_map` which provides traceability of technology functions to
#' `DataType` prefixes, and lastly, R version information as `version`.
#'
#' @examples
#'
#' SingleCellMultiModal(c("mouse_gastrulation", "pbmc_10x"),
#'     modes = list(c("acc*", "met*"), "rna"),
#'     versions = c("2.0.0", "1.0.0"), dry.run = TRUE, verbose = TRUE
#' )
#'
#' @export
SingleCellMultiModal <- function(
    DataTypes, modes = "*", versions = "1.0.0",
    dry.run = TRUE, verbose = TRUE, ...
)
{
    stopifnot(is.character(DataTypes), is.character(versions))
    ndt <- length(DataTypes)

    ## a single mode pattern / version applies to every requested data type
    if (.isSingleChar(modes))
        modes <- rep(modes, ndt)
    if (.isSingleChar(versions))
        versions <- rep(versions, ndt)

    ## stops on unknown data types
    resmap <- .filterMap(DataTypes, dry.run, verbose)
    modes <- methods::as(modes, "CharacterList")
    if (length(modes) != ndt || length(versions) != ndt)
        stop("'modes' and 'versions' must provide one element per entry ",
            "in 'DataTypes'")
    resmap <- cbind(resmap, version = versions, modes = modes)
    ## NOTE: there is no local variable called 'version'; the symbol resolves
    ## to base R's `version` object, i.e. the R version information that the
    ## metadata section of the documentation refers to
    meta <- list(call = match.call(), call_map = resmap, version = version)

    ## Call each technology function with explicitly named, type-preserving
    ## arguments; additional arguments in '...' are forwarded as documented
    dots <- list(...)
    ess_lists <- lapply(seq_len(ndt), function(i) {
        fname <- resmap[["FUN"]][[i]]
        if (verbose)
            message("Running ", fname, "...")
        args <- c(
            list(
                DataType = DataTypes[[i]], modes = modes[[i]],
                version = versions[[i]], dry.run = dry.run,
                verbose = verbose
            ),
            dots
        )
        do.call(get(fname, mode = "function"), args)
    })
    names(ess_lists) <- DataTypes

    if (dry.run) { return(ess_lists) }

    ## prefix experiment names with their data type so they stay unique
    ## after the individual objects are concatenated
    new_prefix <- paste0(resmap[["DataType"]], "_")
    ess_lists <- Map(function(x, y) {
        if (is(x, "MultiAssayExperiment"))
            names(x) <- paste0(y, names(x))
        x
    }, x = ess_lists, y = new_prefix)

    result <- Reduce(c, ess_lists)
    metadata(result) <- meta
    result
}
## ---- /R/cache.R ----

## Open the BiocFileCache located at the directory stored in the "scmmCache"
## option; when the option is unset, `setCache()` supplies the directory
.getCache <- function() {
    BiocFileCache::BiocFileCache(
        getOption("scmmCache", setCache(verbose = FALSE))
    )
}

#' @name scmmCache
#'
#' @title Manage cache / download directories for study data
#'
#' @description Managing data downloads is important to save disk space and
#' re-downloading data files. This can be done effortlessly via the integrated
#' `BiocFileCache` system.
#'
#' @section scmmCache:
#' Get the directory location of the cache. It will prompt the user to create
#' a cache if not already created. A specific directory can be used via
#' `setCache`.
#'
#' @section setCache:
#' Specify the directory location of the data cache. By default, it will
#' go into the user's home and package name directory as given by
#' [R_user_dir][tools::R_user_dir] (default: varies by system e.g., for Linux:
#' '$HOME/.cache/R/SingleCellMultiModal').
#'
#' @section removeCache:
#' Some files may become corrupt when downloading, this function allows
#' the user to delete the tarball associated with a study number in the
#' cache.
#'
#' @param directory `character(1)` The file location where the cache is located.
#' Once set, future downloads will go to this folder. See `setCache` section
#' for details.
#'
#' @param verbose Whether to print descriptive messages
#'
#' @param ask `logical(1)` (default TRUE when `interactive()`) Confirm the file
#' location of the cache directory
#'
#' @param accession `character(1)` A single string indicating the accession number
#' of the study
#'
#' @param ...
#'   For `scmmCache`, arguments passed to `setCache`
#'
#' @examples
#' getOption("scmmCache")
#' scmmCache()
#'
#' @return The directory / option of the cache location
#'
#' @export
scmmCache <- function(...) {
    ## the default is only evaluated (and the cache set up) when the
    ## "scmmCache" option has not been set yet
    getOption("scmmCache", setCache(..., verbose = FALSE))
}

#' @rdname scmmCache
#' @export
setCache <-
    function(directory = tools::R_user_dir("SingleCellMultiModal", "cache"),
        verbose = TRUE,
        ask = interactive())
{
    stopifnot(
        is.character(directory), length(directory) == 1L, !is.na(directory)
    )

    if (!dir.exists(directory)) {
        if (ask) {
            qtxt <- sprintf(
                "Create SingleCellMultiModal cache at \n    %s? [y/n]: ",
                directory
            )
            answer <- .getAnswer(qtxt, allowed = c("y", "Y", "n", "N"))
            if ("n" == answer)
                stop("'scmmCache' directory not created. Use 'setCache'")
        }
        dir.create(directory, recursive = TRUE, showWarnings = FALSE)
    }
    ## Store the location under the option that `scmmCache()` and the
    ## internal cache accessor read; previously it was stored under
    ## "cbioCache", so a custom directory was never picked up
    options("scmmCache" = directory)

    if (verbose)
        message("SingleCellMultiModal cache directory set to:\n    ",
            directory)
    invisible(directory)
}

#' @rdname scmmCache
#' @export
removeCache <- function(accession) {
    bfc <- .getCache()
    ## exact match of the accession against the cache resource names
    rid <- BiocFileCache::bfcquery(bfc, accession, "rname", exact = TRUE)$rid
    if (length(rid)) {
        BiocFileCache::bfcremove(bfc, rid)
        message("Cache record: ", accession, ".tar.gz removed")
    } else
        message("No record found: ", accession, ".tar.gz")
}

## ---- /R/cellGating.R ----

#' addCTLabels
#'
#' @param cd the `colData` `DataFrame`
#' @param out list data structure returned by `getCellGroups`
#' @param outname character indicating the name of the out data structure
#' @param ct character indicating
#'   the celltype to assign in the `ctcol`
#' @param mkrcol character indicating the cd column to store the markers
#' indicated by `outname` (default is markers)
#' @param ctcol character indicating the column in cd to store the cell type
#' indicated by `ct` (default is celltype)
#' @param overwrite logical indicating if the cell types have to be overwritten
#' without checking if detected barcodes were already assigned to other celltypes
#' @param verbose logical for having informative messages during the execution
#'
#' @return an updated version of the cd DataFrame
#'
#' @export
addCTLabels <- function(cd, out, outname, ct, mkrcol="markers", ctcol="celltype",
                        overwrite=FALSE, verbose=TRUE)
{
    ## adds to input cd colData in the mkrcol the markers indicated by outname
    ## and in the ctcol the celltype indicated in ct
    ## the positions for the barcodes (rows in the cd) are taken in position
    ## outname from the out structure given by function getCellGroups
    stopifnot(any(c(mkrcol, ctcol) %in% colnames(cd)))
    stopifnot((outname %in% names(out)))

    ## The guard above only requires one of the two columns; create the other
    ## one filled with NA so that the subset assignments below always operate
    ## on a full-length column
    for (col in c(mkrcol, ctcol)) {
        if (!col %in% colnames(cd))
            cd[[col]] <- rep(NA_character_, nrow(cd))
    }

    cellbc <- out[[outname]]$bc
    idxc <- which(rownames(cd) %in% cellbc)
    if (length(idxc) == 0L) {
        warning("No barcodes in cd detected for the selected ", outname,
                "\nReturning cd as it is...")
        return(cd)
    }

    if (overwrite) {
        if (verbose) message("Blindly overwriting cell types assignements")
        cd[[mkrcol]][idxc] <- outname
        cd[[ctcol]][idxc] <- ct
        return(cd)
    }

    ## only label the barcodes that have no marker assigned yet
    assigned <- !is.na(cd[[mkrcol]][idxc])
    idxfree <- idxc[!assigned]
    if (any(assigned) && verbose)
        message(sum(assigned), " Barcodes already assigned.\n",
                "Assigning only ", length(idxfree), " Barcodes...")

    if (length(idxfree) != 0L) {
        cd[[mkrcol]][idxfree] <- outname
        cd[[ctcol]][idxfree] <- ct
    } else if (verbose) {
        message("All selected Barcodes are already assigned\n",
                "Look at the overwrite argument to handle a more ",
                "brutal behaviour")
    }

    return(cd)
}

## Draws a scatter plot and then a smoothed density scatter plot of the two
## ADTs, each with the two thresholds shown as dashed red lines
#' @importFrom graphics abline smoothScatter
.plotGatingAdt <- function(mat, adt1="CD19", adt2="CD3", th1=0.2, th2=0)
{
    plot(x=mat[adt1,], y=mat[adt2,], xlab=adt1, ylab=adt2,
        main=paste0("Gain plot with x-th: ", th1, " y-th: ", th2))
    abline(v=th1, col="red", lty=2)
    abline(h=th2, col="red", lty=2)
    smoothScatter(x=mat[adt1,], y=mat[adt2,], xlab=adt1, ylab=adt2,
        main=paste0("Gain plot with x-th: ", th1, " y-th: ", th2))

    abline(v=th1, col="red", lty=2)
    abline(h=th2, col="red", lty=2)
}


#' getCellGroups
#'
#' @description
#' Shows the cells/barcodes in two different plots (scatter and density)
#' dividing the space in four quadrants indicated by the two thresholds given
#' as input parameters.
#' The x/y-axis represent respectively the two ADTs given as input.
#' It returns a list of one element for each quadrant, each with barcodes and
#' percentage (see Value section for details).
#'
#' @param mat matrix of counts or clr transformed counts for ADT data in CITEseq
#' @param adt1 character indicating the name of the marker to plot on the x-axis
#' (default is CD19).
#' @param adt2 character indicating the name of the marker to plot on the y-axis
#' (default is CD3).
#' @param th1 numeric indicating the threshold for the marker on the x-axis
#' (default is 0.2).
#' @param th2 numeric indicating the threshold for the marker on the y-axis
#' (default is 0).
#'
#' @return a list of four different elements, each one indicating the quarter
#' where the thresholds divide the plotting space, in Euclidean order I, II,
#' III, IV quadrant, indicating respectively +/+, +/-, -/+, -/- combinations
#' for the couples of selected ADTs.
#' Each element of the list contains two objects, one with the list of detected
#' barcodes and one indicating the percentage of barcodes falling into that
#' quadrant.
#'
#' @details helps to do manual gating for cell type identification with CITEseq
#' or similar data, providing cell markers.
#' Once identified two interesting markers for a cell type, the user has to
#' play with the thresholds to identify the cell populations specified by an
#' uptake (+) or downtake (-) of the couple of markers (ADTs) previously selected.
#'
#' @importFrom graphics text
#'
#' @export
getCellGroups <- function(mat, adt1="CD19", adt2="CD3", th1=0.2, th2=0)
{
    ## both markers must be rows of the matrix
    stopifnot(all(c(adt1, adt2) %in% rownames(mat)))

    ## draw the scatter / density plots with the threshold lines
    .plotGatingAdt(mat, adt1, adt2, th1, th2)
    matadt <- mat[c(adt1,adt2),]
    ## positive = strictly above the threshold, negative = at or below it
    adt1p <- (matadt[adt1,]>th1)
    adt1m <- (matadt[adt1,]<=th1)
    adt2p <- (matadt[adt2,]>th2)
    adt2m <- (matadt[adt2,]<=th2)


    ## every barcode must fall on exactly one side of each threshold
    if (sum(adt1p)+sum(adt1m) != dim(mat)[2]) stop("something went wrong with adt1")
    if (sum(adt2p)+sum(adt2m) != dim(mat)[2]) stop("something went wrong with adt2")

    adt12pp <- which(adt1p & adt2p)
    adt12pm <- which(adt1p & adt2m)
    adt12mp <- which(adt1m & adt2p)
    adt12mm <- which(adt1m & adt2m)

    ## barcodes and percentage of all barcodes for each quadrant
    l <- list(
        ADT12pp=list(
            bc=colnames(matadt)[adt12pp],
            prc=((length(adt12pp)/dim(matadt)[2])*100)),
        ADT12pm=list(
            bc=colnames(matadt)[adt12pm],
            prc=((length(adt12pm)/dim(matadt)[2])*100)),
        ADT12mp=list(
            bc=colnames(matadt)[adt12mp],
            prc=((length(adt12mp)/dim(matadt)[2])*100)),
        ADT12mm=list(
            bc=colnames(matadt)[adt12mm],
            prc=((length(adt12mm)/dim(matadt)[2])*100))
    )
    names(l) <- c(paste0(adt1,"+/",adt2,"+"),
                    paste0(adt1,"+/",adt2,"-"),
                    paste0(adt1,"-/",adt2,"+"),
                    paste0(adt1,"-/",adt2,"-"))


    ## annotate the last drawn plot with the percentage of each quadrant,
    ## placed near the corresponding corner
    text((min(matadt[adt1,])+0.03), (max(matadt[adt2,])-0.05), paste(round(l[[3]]$prc), "%"))
    text((max(matadt[adt1,])-0.03), (max(matadt[adt2,])-0.05), paste(round(l[[1]]$prc), "%"))
    text((max(matadt[adt1,])-0.03), (min(matadt[adt2,])+0.05), paste(round(l[[2]]$prc), "%"))
    text((min(matadt[adt1,])+0.03), (min(matadt[adt2,])+0.05), paste(round(l[[4]]$prc), "%"))
    return(l)
}

## ---- /R/ontomap.R ----

#' Obtain a map of cell types for each dataset
#'
#' The `ontomap` function provides a mapping of all the cell names across
#' all the data sets or for a specified data set.
#'
#' @param dataset `character()` One of the existing functions within the
#' package. If missing, a map of all cell types in each function will
#' be provided.
#'
#' @details
#' Note that `CITEseq` does not have any cell annotations; therefore, no entries
#' are present in the `ontomap`.
#'
#' @return A `data.frame` of metadata with cell types and ontologies
#'
#' @examples
#'
#' ontomap(dataset = "scNMT")
#'
#' @export
ontomap <- function(
    dataset = c("scNMT", "scMultiome", "SCoPE2", "CITEseq", "seqFISH")
) {
    dataset <- match.arg(dataset, several.ok = TRUE)
    omap <- system.file(
        "extdata", "ontomap.tsv",
        package = "SingleCellMultiModal", mustWork = TRUE
    )
    map <- utils::read.delim(omap)
    dnames <- map[["function_name"]]
    ## keep the rows belonging to the requested function name(s)
    map[dnames %in% dataset, ]
}

## ---- /R/scMultiome.R ----

## Load HDF5 file with either TENxMatrix or HDF5Array
.getH5_TENx <- function(filelist, ehub, fn, verbose) {
    if (verbose)
        message("Working on: ", paste(fn, collapse = ",\n "))
    ## the '_se' resource is the experiment shell that receives the assay
    se_h5 <- grep("_se", filelist, value = TRUE)
    se_obj <- query(ehub, se_h5)[[1L]]

    hasTENx <- any(grepl("tenx", filelist))
    patt <- if (hasTENx) "tenx" else "_assay"

    h5data <- grep(patt, filelist, value = TRUE, ignore.case = TRUE)
    h5fileloc <- query(ehub, h5data)[[1L]]

    if (!hasTENx)
        h5array <- HDF5Array::HDF5Array(h5fileloc, "assay001", as.sparse = TRUE)
    else
        h5array <- HDF5Array::TENxMatrix(h5fileloc, "pbmc")

    SummarizedExperiment::`assays<-`(
        x = se_obj, withDimnames = FALSE,
        value = list(counts = h5array)
    )
}

## Group the HDF5 / '_se.rds' resources by mode and build one experiment
## per group
.loadHDF5 <- function(ehub, filepaths, verbose) {
    matchres <- grepl("\\.[Hh]5|_se\\.[Rr][Dd][Ss]", filepaths)
    fpaths <- filepaths[matchres]
    fact <- .removeExt(fpaths)
    fact <- gsub("_se|_assays|_tenx", "", fact)
    h5list <- split(fpaths, fact)
    lapply(h5list,
        .getH5_TENx,
        ehub = ehub, fn = names(h5list), verbose = verbose
    )
}

## message() that returns TRUE so it can be used in `!verbose || .message()`
.message <-
    function(...)
{
    message(...)
    TRUE
}

## @mtmorgan's function from HCAMatrixBrowser
## Reads a Matrix Market coordinate file into a sparse matrix. The header may
## contain any number of leading '%' lines (banner plus optional comments);
## the first non-'%' line holds the dimensions.
.read_mtx <-
    function(path, verbose = FALSE)
{
    ## file() transparently handles gzip-compressed input
    con <- file(path, open = "r")
    on.exit(close(con))
    nskip <- 0L
    repeat {
        sizeline <- readLines(con, n = 1L)
        if (!length(sizeline))
            stop("no size line found in Matrix Market file: ", path)
        nskip <- nskip + 1L
        if (!startsWith(sizeline, "%"))
            break
    }
    dims <- as.integer(
        strsplit(trimws(sizeline), "[[:space:]]+")[[1]][c(1, 2)]
    )
    !verbose || .message("dim: ", dims[1], " ", dims[2])
    ## entries (row, column, value) follow the header and size lines
    v <- scan(
        path, list(integer(), integer(), numeric()), skip = nskip,
        quiet = !verbose
    )
    Matrix::sparseMatrix(v[[1]], v[[2]], x = v[[3]], dims = dims)
}

## Group the '.mtx.gz' / '_se.rds' resources by mode and build one experiment
## per group, with the sparse matrix as the `counts` assay
.loadMTX <- function(ehub, filepaths, verbose) {
    matchres <-
        grepl("\\.[Mm][Tt][Xx]\\.[Gg][Zz]$|_se\\.[Rr][Dd][Ss]$", filepaths)
    filepaths <- filepaths[matchres]
    fact <- .removeExt(filepaths)
    fact <- gsub("_se", "", fact)
    mtxlist <- split(filepaths, fact)
    lapply(mtxlist, function(mtxfile, fn) {
        if (verbose)
            message("Working on: ", paste(fn, collapse = ",\n "))
        se_mtx <- grep("_se", mtxfile, value = TRUE)
        mtxdata <- grep("mtx", mtxfile, value = TRUE, ignore.case = TRUE)
        se <- query(ehub, se_mtx)[[1L]]
        mtxfile <- query(ehub, mtxdata)[[1L]]
        mtxf <- .read_mtx(mtxfile)

        BiocBaseUtils::setSlots(
            object = se,
            assays = SummarizedExperiment::Assays(
                S4Vectors::SimpleList(counts = mtxf)
            )
        )
    }, fn = names(mtxlist))
}

#' Single-cell Multiome ATAC + Gene Expression
#'
#' @description 10x Genomics Multiome technology enables simultaneous profiling
#' of the transcriptome (using 3’ gene expression) and epigenome
#' (using ATAC-seq) from single cells to
#' deepen our understanding of how genes are expressed and regulated across
#' different cell types. Data prepared by Ricard Argelaguet.
#'
#' @details Users are able to choose from either an `MTX` or `HDF5` file format
#' as the internal data representation. The `MTX` (Matrix Market) format
#' allows users to load a sparse `dgCMatrix` representation.
#' Choosing `HDF5`
#' gives users a sparse `HDF5Array` class object.
#' * pbmc_10x: 10K Peripheral Blood Mononuclear Cells provided by
#' [10x Genomics website](https://support.10xgenomics.com/single-cell-multiome-atac-gex/datasets)
#' Cell quality control filters are available in the object `colData`
#' together with the `celltype` annotation labels.
#'
#' @inheritParams scNMT
#'
#' @param format `character(1)` Either MTX or HDF5 data format (default MTX)
#'
#' @return A 10X PBMC `MultiAssayExperiment` object
#'
#' @examples
#'
#' scMultiome(DataType = "pbmc_10x", modes = "*", dry.run = TRUE)
#'
#' @export
scMultiome <-
    function(
        DataType = "pbmc_10x", modes = "*", version = "1.0.0",
        format = c("MTX", "HDF5"), dry.run = TRUE, verbose = TRUE, ...
    )
{
    stopifnot(.isSingleChar(version), .isSingleChar(DataType))

    ## resolve the file format before recording the call metadata
    format <- match.arg(format)
    meta <- list(call = match.call(), version = version)

    known_versions <- c("1.0.0", "1.0.1")
    if (is.na(match(version, known_versions)))
        stop("Invalid 'version'; see '?scMultiome' for details.")

    resources <- .getResourcesList(
        prefix = "pbmc_", datatype = DataType, modes = modes,
        version = version, dry.run = dry.run, verbose = verbose,
        format = format, ...
    )

    ## a dry run only reports the available resources
    if (dry.run)
        return(resources)

    MultiAssayExperiment(
        experiments = resources[["experiments"]],
        colData = resources[["colData"]],
        sampleMap = resources[["sampleMap"]],
        metadata = meta
    )
}

## ---- /R/scNMT.R ----

#' Single-cell Nucleosome, Methylation and Transcription sequencing
#'
#' @description scNMT assembles data on-the-fly from `ExperimentHub` to provide
#' a
#' [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
#' container. The `DataType` argument provides access to the
#' `mouse_gastrulation` dataset as obtained from Argelaguet et al. (2019; DOI:
#' 10.1038/s41586-019-1825-8). Pre-processing code can be seen at
#' . Protocol
#' information for this dataset is available at Clark et al. (2018). See the
#' vignette for the full citation.
#'
#' @details scNMT is a combination of RNA-seq (transcriptome) and an adaptation
#' of Nucleosome Occupancy and Methylation sequencing (NOMe-seq, the
#' methylome and chromatin accessibility) technologies. For more
#' information, see Reik et al. (2018) DOI: 10.1038/s41467-018-03149-4
#'
#' * mouse_gastrulation - this dataset provides cell quality control filters in
#' the object `colData` starting from version 2.0.0. Additionally, cell types
#' annotations are provided through the `lineage` `colData` column.
21 | #' * rna - RNA-seq 22 | #' * acc_\* - chromatin accessibility 23 | #' * met_\* - DNA methylation 24 | #' * cgi - CpG islands 25 | #' * CTCF - footprints of CTCF binding 26 | #' * DHS - DNase Hypersensitive Sites 27 | #' * genebody - gene bodies 28 | #' * p300 - p300 binding sites 29 | #' * promoter - gene promoters 30 | #' 31 | #' Special thanks to Al J Abadi for preparing the published data in time 32 | #' for the 2020 BIRS Workshop, see the link here: 33 | #' 34 | #' 35 | #' @section versions: 36 | #' Version '1.0.0' of the scNMT mouse_gastrulation dataset includes all of 37 | #' the above mentioned assay technologies with filtering of cells based on 38 | #' quality control metrics. Version '2.0.0' contains all of the cells 39 | #' without the QC filter and does not contain CTCF binding footprints or 40 | #' p300 binding sites. 41 | #' 42 | #' @section metadata: 43 | #' The `MultiAssayExperiment` metadata includes the original function call 44 | #' that saves the function call and the data version requested. 45 | #' 46 | #' @param DataType `character(1)` Indicates study that produces this type of 47 | #' data (default: 'mouse_gastrulation') 48 | #' 49 | #' @param modes `character()` A wildcard / glob pattern of modes, such as 50 | #' `"acc*"`. A wildcard of `"*"` will return all modes including 51 | #' Chromatin Accessibilty ("acc"), Methylation ("met"), RNA-seq ("rna") 52 | #' which is the default. 53 | #' 54 | #' @param version `character(1)` Either version '1.0.0' or '2.0.0' depending on 55 | #' data version required (default '1.0.0'). See version section. 56 | #' 57 | #' @param dry.run `logical(1)` Whether to return the dataset names before actual 58 | #' download (default `TRUE`) 59 | #' 60 | #' @param verbose `logical(1)` Whether to show the dataset currently being 61 | #' (down)loaded (default `TRUE`) 62 | #' 63 | #' @param ... 
#'   Additional arguments passed on to the
#'   \link[ExperimentHub]{ExperimentHub-class} constructor
#'
#' @seealso SingleCellMultiModal-package
#'
#' @return A single cell multi-modal
#' [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
#' or informative `data.frame` when `dry.run` is `TRUE`
#'
#' @source
#'
#' @references
#' Argelaguet et al. (2019)
#'
#' @examples
#'
#' scNMT(DataType = "mouse_gastrulation", modes = "*",
#'     version = "1.0.0", dry.run = TRUE)
#'
#' @export scNMT
scNMT <-
    function(
        DataType = "mouse_gastrulation", modes = "*", version = "1.0.0",
        dry.run = TRUE, verbose = TRUE, ...
    )
{
    stopifnot(.isSingleChar(version), .isSingleChar(DataType))

    ## Only the value is validated: `missing(version)` is TRUE whenever the
    ## documented default is used, so testing it would reject `scNMT()`
    if (!version %in% c("1.0.0", "2.0.0"))
        stop("Enter version '1.0.0' or '2.0.0'; see '?scNMT' for details.")

    ## the call and the requested data version are stored in the metadata
    meta <- list(call = match.call(), version = version)

    ess_list <- .getResourcesList(prefix = "scnmt_", datatype = DataType,
        modes = modes, version = version, dry.run = dry.run,
        verbose = verbose, ...)

    ## a dry run only reports the available resources
    if (dry.run) { return(ess_list) }

    MultiAssayExperiment(
        experiments = ess_list[["experiments"]],
        colData = ess_list[["colData"]],
        sampleMap = ess_list[["sampleMap"]],
        metadata = meta
    )
}

## ---- /R/seqFISH.R ----

#' Single-cell spatial + Gene Expression
#'
#' @description seqFISH function assembles data on-the-fly from `ExperimentHub`
#' to provide a
#' [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
#' container. Actually the `DataType` argument provides access to the
#' available datasets associated to the package.
8 | #' 9 | #' @details seq FISH data are a combination of single cell spatial coordinates 10 | #' and transcriptomics for a few hundreds of genes. seq-FISH data can be 11 | #' combined for example with scRNA-seq data to unveil multiple aspects of 12 | #' cellular behaviour based on their spatial organization and transcription. 13 | #' 14 | #' Available datasets are: 15 | #' * mouse_visual_cortex: combination of seq-FISH data as obtained from Zhu 16 | #' et al. (2018) and scRNA-seq data as obtained from Tasic et al. (2016), 17 | #' Version 1.0.0 returns the full scRNA-seq data matrix, while version 2.0.0 18 | #' returns the processed and subsetted scRNA-seq data matrix (produced for 19 | #' the Mathematical Frameworks for Integrative Analysis of Emerging 20 | #' Biological Data Types 2020 Workshop) The returned seqFISH data are always 21 | #' the processed ones for the same workshop. Additionally, cell types 22 | #' annotations are available in the `colData` through the `class` column in 23 | #' the seqFISH `assay`. 24 | #' * scRNA_Counts - Tasic scRNA-seq gene count matrix 25 | #' * scRNA_Labels - Tasic scRNA-seq cell labels 26 | #' * seqFISH_Coordinates - Zhu seq-FISH spatial coordinates 27 | #' * seqFISH_Counts - Zhu seq-FISH gene counts matrix 28 | #' * seqFISH_Labels - Zhu seq-FISH cell labels 29 | #' 30 | #' @inheritParams scNMT 31 | #' 32 | #' @param DataType `character(1)` indicating the identifier of the dataset to 33 | #' retrieve. (default "mouse_visual_cortex") 34 | #' 35 | #' @param modes `character()` The assay types or modes of data to obtain these 36 | #' include seq-FISH and scRNA-seq data by default. 
#'
#' @return A
#'   [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
#'   of seq-FISH data
#'
#' @author Dario Righelli gmail.com>
#'
#' @importFrom SpatialExperiment SpatialExperiment
#' @importFrom SingleCellExperiment SingleCellExperiment
#' @importFrom S4Vectors DataFrame
#'
#' @examples
#'
#' seqFISH(DataType = "mouse_visual_cortex", modes = "*", version = "2.0.0",
#'     dry.run = TRUE)
#'
#' @export
seqFISH <-
    function(
        DataType="mouse_visual_cortex", modes="*", version,
        dry.run=TRUE, verbose=TRUE, ...
    )
{
    ess_list <- .getResourcesList(prefix = "seqfish_", datatype = DataType,
        modes = modes, version = version, dry.run = dry.run,
        verbose = verbose, ...)

    ## dry.run: hand back the resource information table untouched
    if (dry.run) { return(ess_list) }

    modes_list <- ess_list[["experiments"]]

    ## .getResourcesList() lower-cases the dataset identifier before looking
    ## it up, so dispatch on the lower-cased name as well; otherwise a
    ## mixed-case identifier loads its resources and then fails here.
    switch(tolower(DataType),
        "mouse_visual_cortex" = {
            mae <- .mouse_visual_cortex(modes_list=modes_list,
                version=version)
        },
        ## Add here other seqFISH datasets based on DataType identifier
        {
            stop("Unrecognized seqFISH dataset name")
        }
    )

    return(mae)
}

## Assemble the mouse visual cortex MultiAssayExperiment from the loaded
## resources: a SingleCellExperiment ("scRNAseq") and a SpatialExperiment
## ("seqFISH"). Version "1.0.0" reads the "scRNA_Full_*" resources, any other
## version the "scRNA_*" ones.
.mouse_visual_cortex <- function(modes_list, version)
{
    res <- paste0("scRNA",
        if (identical(version, "1.0.0")) "_Full" else "",
        "_", c("Counts", "Labels")
    )

    ## discrepancy between labels in counts and colData
    counts <- as.matrix(modes_list[[res[1]]])
    ## rowData is duplicate of rownames [removed]
    coldata <- modes_list[[res[2]]]
    vIDs <- intersect(rownames(coldata), colnames(counts))
    ## drop = FALSE keeps the matrix / table shape when only one ID is shared
    counts <- counts[, vIDs, drop = FALSE]
    coldata <- coldata[vIDs, , drop = FALSE]

    sce <- SingleCellExperiment::SingleCellExperiment(
        colData=coldata,
        assays=S4Vectors::SimpleList(counts=counts)
    )

    se <- SpatialExperiment::SpatialExperiment(
        rowData=rownames(modes_list$seqFISH_Counts),
        colData=modes_list$seqFISH_Labels,
        assays=S4Vectors::SimpleList(
            counts=as.matrix(modes_list$seqFISH_Counts)),
        spatialData=DataFrame(modes_list$seqFISH_Coordinates),
        spatialCoordsNames=c("x", "y"))

    MultiAssayExperiment(
        experiments = list(seqFISH = se, scRNAseq = sce)
    )
}

--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
## Prompt with `msg` until the reply is one of `allowed` and return the reply
## lower-cased. In a non-interactive session no prompt is shown and "n" is
## returned.
.getAnswer <- function(msg, allowed)
{
    if (interactive()) {
        repeat {
            cat(msg)
            answer <- readLines(n = 1)
            if (answer %in% allowed)
                break
        }
        tolower(answer)
    } else {
        "n"
    }
}

## TRUE only for a length-one, non-NA character vector.
.isSingleChar <- function(x) {
    length(x) == 1L && is.character(x) && !is.na(x)
}

## Base name of each path with everything from the first "." removed.
.removeExt <- function(fnames) {
    gsub("\\..*$", "", basename(fnames))
}

## Derive the sorted, unique mode names from resource file names: strip
## `prefix`, the "_assays" / "_se" / "_tenx" tags and the extension, then
## drop the non-assay slots (metadata, colData, sampleMap).
.modesAvailable <- function(listfiles, prefix) {
    slots <- c("metadata", "colData", "sampleMap")
    modes <- gsub(prefix, "", listfiles, fixed = TRUE)
    modes <- gsub("_assays|_se|_tenx", "", modes)
    modes <- .removeExt(modes)
    unique(sort(modes[!modes %in% slots]))
}

## Match glob patterns (e.g. "*", "rna*") case-insensitively against
## `searchFields` and return the matching values; errors when nothing matches.
.searchFromInputs <- function(glob, searchFields) {
    regGlob <- glob2rx(unique(glob))
    res <- unlist(lapply(regGlob, function(x) {
        grep(x, searchFields, ignore.case = TRUE, value = TRUE)
    }))
    if (!length(res))
        stop("No matches found, modify search criteria")
    res
}

## For every element of `testVec`, report whether `FUN` is TRUE for at least
## one element of `startVec`. `FUN` takes one element of `startVec` and must
## return a logical vector of length(testVec).
.conditionToIndex <- function(startVec, testVec, FUN) {
    logmat <- vapply(startVec, FUN, logical(length(testVec)))
    ## vapply() simplifies to a plain vector when `testVec` has length one;
    ## restore the testVec-by-startVec layout so the row-wise reduction works
    if (!is.matrix(logmat))
        logmat <- matrix(logmat, nrow = length(testVec))
    apply(logmat, 1L, any)
}

## Query the hub once per row of `resTable` (by "RDataPath"); the result is a
## list named by "Title".
.queryResources <- function(ExperimentHub, resTable, verbose) {
    fileNames <- stats::setNames(resTable[["RDataPath"]], resTable[["Title"]])
    lapply(fileNames, function(res) {
        if (verbose)
            message("Working on: ", gsub("\\.rda", "", basename(res)))
        # only take the last one for multiple matches
        utils::tail(query(ExperimentHub, res), 1)
    })
}

## Load the resources listed in `resTable`. HDF5 (".h5") and MTX (".mtx.gz")
## resources are delegated to .loadHDF5() / .loadMTX(); paths ending in
## "se.rds" are not loaded directly (presumably handled by those loaders --
## they are defined outside this file); everything else is fetched with `[[`.
.getResources <- function(ExperimentHub, resTable, prefix, verbose) {
    infos <- .queryResources(ExperimentHub, resTable, verbose)
    rpath <- vapply(infos, function(x) `$`(x, "rdatapath"), character(1L))

    h5resources <- grepl("\\.[Hh]5$", rpath)
    mtxresources <- grepl("\\.[Mm][Tt][Xx]\\.[Gg][Zz]$", rpath)
    shells <- grepl("se\\.[Rr][Dd][Ss]$", rpath)
    otherres <- !((h5resources | mtxresources) | shells)

    if (any(h5resources))
        matress <- .loadHDF5(ExperimentHub, rpath, verbose)
    else if (any(mtxresources))
        matress <- .loadMTX(ExperimentHub, rpath, verbose)
    else
        matress <- list()

    if (any(otherres)) {
        rest <- lapply(infos[otherres], `[[`, 1L)
        c(rest, matress)
    } else {
        matress
    }
}

## Build the table returned for dry runs: hub information for each resource
## with a `mode` column (title without `prefix`) and a formatted `file_size`
## inserted after `title`; the original title and the trailing raw file-size
## column are dropped.
.getResourceInfo <- function(ExperimentHub, resTable, prefix, verbose) {
    infos <- .queryResources(ExperimentHub, resTable, verbose)
    resID <- vapply(infos, names, character(1L))
    restab <- AnnotationHub::getInfoOnIds(ExperimentHub, resID)
    restab <-
        restab[, !names(restab) %in% c("fetch_id", "status", "biocversion")]
    sizes <- as.numeric(restab[["file_size"]])
    class(sizes) <- "object_size"
    titleidx <- which(names(restab) == "title")
    restab <- as.data.frame(append(
        restab,
        list(mode = gsub(prefix, "", restab[["title"]]),
            file_size = format(sizes, units = "Mb")),
        titleidx
    ))
    restab[, -c(length(restab), titleidx)]
}

## Create an ExperimentHub; on any error retry with localHub = TRUE. A warning
## is raised only when the error message mentions a timeout.
.test_eh <- function(...) {
    tryCatch({
        ExperimentHub(...)
    }, error = function(e) {
        emsg <- conditionMessage(e)
        if (grepl("Timeout", emsg))
            warning("[experimenthub.bioconductor.org] timeout, localHub=TRUE",
                call.=FALSE)
        ExperimentHub(..., localHub = TRUE)
    })
}

## Same predicate as .isSingleChar(); kept under this name for its callers.
.isSingleCharNA <- function(x) {
    .isSingleChar(x)
}

## Resolve and (unless dry.run) load the ExperimentHub resources for one
## dataset.
##   prefix    resource-name prefix, e.g. "scnmt_" or "seqfish_"
##   datatype  dataset identifier; matched lower-cased against metadata.csv
##   modes     glob pattern(s) selecting the assays
##   version   dataset version, matched against "SourceVersion"
##   format    optional "HDF5" or "MTX"; resources of the other format are
##             excluded
##   ...       passed on to ExperimentHub()
## Returns the resource information table when dry.run is TRUE, otherwise a
## list with "experiments" (an ExperimentList) followed by whichever
## colData / metadata / sampleMap resources exist for the dataset.
.getResourcesList <-
    function(prefix, datatype, modes, version, format, dry.run, verbose, ...)
{
    modes_file <- system.file("extdata", "metadata.csv",
        package = "SingleCellMultiModal", mustWork = TRUE)

    DataType <- tolower(datatype)
    stopifnot(
        .isSingleCharNA(DataType), .isSingleCharNA(version)
    )

    modes_metadat <- read.csv(modes_file, stringsAsFactors = FALSE)
    ## `notfmt` is the SourceType to exclude; the placeholder matches nothing
    if (missing(format))
        notfmt <- "FakeFormatNoMatch"
    else
        notfmt <- switch(format, HDF5 = "MTX", MTX = "HDF5", format)
    filt <- modes_metadat[["DataType"]] == DataType &
        modes_metadat[["SourceVersion"]] == version &
        modes_metadat[["SourceType"]] != notfmt

    modes_metadat <- modes_metadat[filt, , drop = FALSE]
    eh_assays <- modes_metadat[["ResourceName"]]
    modesAvail <- .modesAvailable(eh_assays, prefix)
    resultModes <- .searchFromInputs(modes, modesAvail)
    fileIdx <- .conditionToIndex(
        resultModes, eh_assays, function(x) grepl(x, eh_assays)
    )
    eh <- .test_eh(...)

    if (dry.run) {
        return(.getResourceInfo(
            eh, modes_metadat[fileIdx, c("Title", "RDataPath")], prefix, FALSE
        ))
    }
    modes_list <- .getResources(
        eh, modes_metadat[fileIdx, c("Title", "RDataPath")], prefix, verbose
    )
    names(modes_list) <- gsub(prefix, "", names(modes_list))

    eh_experiments <- ExperimentList(modes_list)[resultModes]

    ess_names <- c("colData", "metadata", "sampleMap")

    ess_idx <- .conditionToIndex(ess_names, eh_assays,
        function(x) grepl(x, eh_assays))

    ess_list <- .getResources(eh,
        modes_metadat[ess_idx, c("Title", "RDataPath")], prefix, verbose)
    names(ess_list) <- gsub(prefix, "", names(ess_list))

    c(list(experiments = eh_experiments), ess_list)
}

## Outer-merge the colData of every experiment in `x` by row names. When more
## than one experiment is merged, the "Row.names" helper column is moved back
## into the row names.
.mergeLowColData <- function(x) {
    newcoldata <- Reduce(
        function(x, y) {
            S4Vectors::merge(x, y, by = "row.names", all = TRUE)
        },
        lapply(x, colData)
    )
    if (length(x) > 1L) {
        rownames(newcoldata) <- newcoldata[["Row.names"]]
        newcoldata <- newcoldata[, -which(colnames(newcoldata) == "Row.names")]
    }
    newcoldata
}

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

#

# SingleCellMultiModal

## Overview

`SingleCellMultiModal` is an R package that provides a convenient and
user-friendly representation of multi-modal data using
`MultiAssayExperiment`. This package introduces a suite of single-cell
multimodal landmark datasets for benchmarking and testing multimodal
analysis methods via the `ExperimentHub` Bioconductor package. The scope
of this package is to provide efficient access to a selection of
curated, pre-integrated, publicly available landmark datasets for
methods development and benchmarking.
16 | 17 | ## Installation 18 | 19 | ``` r 20 | if (!requireNamespace("BiocManager", quietly = TRUE)) 21 | install.packages("BiocManager") 22 | 23 | BiocManager::install("SingleCellMultiModal") 24 | ``` 25 | 26 | ## Loading packages 27 | 28 | ``` r 29 | library(SingleCellMultiModal) 30 | library(MultiAssayExperiment) 31 | ``` 32 | 33 | # Citing SingleCellMultiModal 34 | 35 | Your citations are crucial in keeping our software free and open source. 36 | To cite our package see the citation (Eckenrode et al. (2023)) in the 37 | Reference section. You may also browse to the publication at [PLoS 38 | Computational Biology](https://doi.org/10.1371/journal.pcbi.1011324). 39 | 40 | ## Representation 41 | 42 | Users can obtain integrative representations of multiple modalities as a 43 | `MultiAssayExperiment`, a common core Bioconductor data structure relied 44 | on by dozens of multimodal data analysis packages. 45 | `MultiAssayExperiment` harmonizes data management of multiple 46 | experimental assays performed on an overlapping set of specimens. 47 | Although originally developed for patient data from multi-omics cancer 48 | studies, the `MultiAssayExperiment` framework naturally applies also to 49 | single cells. A schematic of the data structure can be seen below. In 50 | this context, “patients” are replaced by “cells”. We use 51 | `MultiAssayExperiment` because it provides a familiar user experience by 52 | extending `SummarizedExperiment` concepts and providing open ended 53 | compatibility with standard data classes present in Bioconductor such as 54 | the `SingleCellExperiment`. 55 | 56 | 57 | 58 | # Contributions 59 | 60 | Want to contribute to the `SingleCellMultiModal` package? We welcome 61 | contributions from the community. Please refer to our [Contributing 62 | Guidelines](https://github.com/waldronlab/SingleCellMultiModal/wiki/Contributing-Guidelines) 63 | for more details. 
64 | 65 | ## Further resources 66 | 67 | For more information on the `MultiAssayExperiment` data structure, 68 | please refer to Ramos et al. (2017) as well as the [MultiAssayExperiment 69 | vignette](https://bioconductor.org/packages/release/bioc/vignettes/MultiAssayExperiment/inst/doc/MultiAssayExperiment.html). 70 | 71 | # References 72 | 73 |
75 | 76 |
77 | 78 | Eckenrode, Kelly B, Dario Righelli, Marcel Ramos, Ricard Argelaguet, 79 | Christophe Vanderaa, Ludwig Geistlinger, Aedin C Culhane, et al. 2023. 80 | “Curated Single Cell Multimodal Landmark Datasets for R/Bioconductor.” 81 | *PLoS Comput. Biol.* 19 (8): e1011324. 82 | 83 |
84 | 85 |
86 | 87 | Ramos, Marcel, Lucas Schiffer, Angela Re, Rimsha Azhar, Azfar Basunia, 88 | Carmen Rodriguez, Tiffany Chan, et al. 2017. “Software for the 89 | Integration of Multiomics Experiments in Bioconductor.” *Cancer Res.* 77 90 | (21): e39–42. 91 | 92 |
93 | 94 |
95 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | title: SingleCellMultiModal 2 | url: https://waldronlab.github.io/SingleCellMultiModal 3 | 4 | template: 5 | bootstrap: 5 6 | params: 7 | bootswatch: flatly 8 | -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | citHeader("To cite SingleCellMultiModal in publication use:") 2 | 3 | bibentry( 4 | bibtype = "Article", 5 | title = 6 | "Curated single cell multimodal landmark datasets for R/Bioconductor", 7 | author = c( 8 | person( 9 | "Kelly B", "Eckenrode", , "ctb", 10 | ), 11 | person( 12 | "Dario", "Righelli", , "aut", 13 | ), 14 | person( 15 | "Marcel", "Ramos", , c("aut", "cre"), 16 | comment = c(ORCID = "0000-0002-3242-0582") 17 | ), 18 | person( 19 | "Ricard", "Argelaguet", , "ctb" 20 | ), 21 | person( 22 | "Christophe", "Vanderaa", , "aut" 23 | ), 24 | person( 25 | "Ludwig", "Geistlinger", , "aut", 26 | comment = c(ORCID = "0000-0002-2495-5464") 27 | ), 28 | person( 29 | "Aedin C", "Culhane", , "ctb" 30 | ), 31 | person( 32 | "Laurent", "Gatto", , "ctb" 33 | ), 34 | person( 35 | "Vincent J", "Carey", , "ctb", 36 | comment = c(ORCID = "0000-0003-4046-0063") 37 | ), 38 | person( 39 | "Martin", "Morgan", , "ctb", 40 | comment = c(ORCID = "0000-0002-5874-8148") 41 | ), 42 | person( 43 | "Davide", "Risso", , "ctb" 44 | ), 45 | person( 46 | "Levi", "Waldron", , "ctb", 47 | comment = c(ORCID = "0000-0003-2725-0694") 48 | ) 49 | ), 50 | journal = "PLoS Comput. 
Biol.", 51 | year = "2023", 52 | volume = "19", 53 | number = "8", 54 | doi = "10.1371/journal.pcbi.1011324" 55 | ) 56 | 57 | -------------------------------------------------------------------------------- /inst/REFERENCES.bib: -------------------------------------------------------------------------------- 1 | @ARTICLE{Macaulay2015, 2 | title = "{G\&T-seq}: parallel sequencing of single-cell genomes and 3 | transcriptomes", 4 | author = "Macaulay, Iain C and Haerty, Wilfried and Kumar, Parveen and Li, 5 | Yang I and Hu, Tim Xiaoming and Teng, Mabel J and Goolam, Mubeen 6 | and Saurat, Nathalie and Coupland, Paul and Shirley, Lesley M and 7 | Smith, Miriam and Van der Aa, Niels and Banerjee, Ruby and Ellis, 8 | Peter D and Quail, Michael A and Swerdlow, Harold P and 9 | Zernicka-Goetz, Magdalena and Livesey, Frederick J and Ponting, 10 | Chris P and Voet, Thierry", 11 | abstract = "G\&T-seq offers robust full-length transcript and whole-genome 12 | sequencing simultaneously from a single cell.", 13 | journal = "Nat. Methods", 14 | volume = 12, 15 | number = 6, 16 | pages = "519--522", 17 | month = jun, 18 | year = 2015 19 | } 20 | 21 | @ARTICLE{Macaulay2016, 22 | title = "Separation and parallel sequencing of the genomes and 23 | transcriptomes of single cells using {G\&T-seq}", 24 | author = "Macaulay, Iain C and Teng, Mabel J and Haerty, Wilfried and 25 | Kumar, Parveen and Ponting, Chris P and Voet, Thierry", 26 | journal = "Nat. 
Protoc.", 27 | volume = 11, 28 | number = 11, 29 | pages = "2081--2103", 30 | month = nov, 31 | year = 2016, 32 | language = "en" 33 | } 34 | 35 | @ARTICLE{Argelaguet2019-et, 36 | title = "Multi-omics profiling of mouse gastrulation at single-cell 37 | resolution", 38 | author = "Argelaguet, Ricard and Clark, Stephen J and Mohammed, Hisham and 39 | Stapel, L Carine and Krueger, Christel and Kapourani, 40 | Chantriolnt-Andreas and Imaz-Rosshandler, Ivan and Lohoff, Tim 41 | and Xiang, Yunlong and Hanna, Courtney W and Smallwood, Sebastien 42 | and Ibarra-Soria, Ximena and Buettner, Florian and Sanguinetti, 43 | Guido and Xie, Wei and Krueger, Felix and G{\"o}ttgens, Berthold 44 | and Rugg-Gunn, Peter J and Kelsey, Gavin and Dean, Wendy and 45 | Nichols, Jennifer and Stegle, Oliver and Marioni, John C and 46 | Reik, Wolf", 47 | journal = "Nature", 48 | volume = 576, 49 | number = 7787, 50 | pages = "487--491", 51 | month = dec, 52 | year = 2019, 53 | language = "en" 54 | } 55 | 56 | @ARTICLE{Clark2018-qg, 57 | title = "{scNMT-seq} enables joint profiling of chromatin accessibility 58 | {DNA} methylation and transcription in single cells", 59 | author = "Clark, Stephen J and Argelaguet, Ricard and Kapourani, 60 | Chantriolnt-Andreas and Stubbs, Thomas M and Lee, Heather J and 61 | Alda-Catalinas, Celia and Krueger, Felix and Sanguinetti, Guido 62 | and Kelsey, Gavin and Marioni, John C and Stegle, Oliver and 63 | Reik, Wolf", 64 | journal = "Nat. 
Commun.", 65 | volume = 9, 66 | number = 1, 67 | pages = "781", 68 | month = feb, 69 | year = 2018, 70 | language = "en" 71 | } 72 | 73 | @ARTICLE{Zhu2018identification, 74 | title = "Identification of spatially associated subpopulations by 75 | combining {scRNAseq} and sequential fluorescence in situ 76 | hybridization data", 77 | author = "Zhu, Qian and Shah, Sheel and Dries, Ruben and Cai, Long and 78 | Yuan, Guo-Cheng", 79 | journal = "Nature biotechnology", 80 | volume = 36, 81 | number = 12, 82 | pages = 1183, 83 | year = 2018, 84 | language = "en" 85 | } 86 | 87 | @ARTICLE{Tasic2016adult, 88 | title = "Adult mouse cortical cell taxonomy revealed by single cell 89 | transcriptomics", 90 | author = "Tasic, Bosiljka and Menon, Vilas and Nguyen, Thuc Nghi and 91 | Kim, Tae Kyung and Jarsky, Tim and Yao, Zizhen and 92 | Levi, Boaz and Gray, Lucas T and Sorensen, Staci A and 93 | Dolbeare, Tim and others", 94 | journal = "Nature neuroscience", 95 | volume = 19, 96 | number = 2, 97 | pages = 335, 98 | year = 2016, 99 | language = "en" 100 | } 101 | 102 | @ARTICLE{stoeckius2017simultaneous, 103 | title = {Simultaneous epitope and transcriptome measurement in single cells}, 104 | author = {Stoeckius, Marlon and Hafemeister, Christoph and 105 | Stephenson, William and Houck-Loomis, Brian and 106 | Chattopadhyay, Pratip K and Swerdlow, Harold and 107 | Satija, Rahul and Smibert, Peter}, 108 | journal = {Nature methods}, 109 | volume = {14}, 110 | number = {9}, 111 | pages = {865}, 112 | year = {2017}, 113 | publisher = {Nature Publishing Group} 114 | } 115 | 116 | 117 | @ARTICLE{Specht2021-pm, 118 | title = "Single-cell proteomic and transcriptomic analysis of macrophage 119 | heterogeneity using {SCoPE2}", 120 | author = "Specht, Harrison and Emmott, Edward and Petelski, Aleksandra A 121 | and Huffman, R Gray and Perlman, David H and Serra, Marco and 122 | Kharchenko, Peter and Koller, Antonius and Slavov, Nikolai", 123 | journal = "Genome Biol.", 124 | volume = 22, 
125 | number = 1, 126 | pages = "50", 127 | month = jan, 128 | year = 2021, 129 | language = "en" 130 | } 131 | 132 | @ARTICLE{mimitou2019multiplexed, 133 | title={Multiplexed detection of proteins, transcriptomes, clonotypes and 134 | CRISPR perturbations in single cells}, 135 | author={Mimitou, Eleni P and Cheng, Anthony and Montalbano, Antonino and Hao, 136 | Stephanie and Stoeckius, Marlon and Legut, Mateusz and Roush, Timothy 137 | and Herrera, Alberto and Papalexi, Efthymia and Ouyang, Zhengqing 138 | and others}, 139 | journal={Nature methods}, 140 | volume={16}, 141 | number={5}, 142 | pages={409--412}, 143 | year={2019}, 144 | publisher={Nature Publishing Group} 145 | } 146 | 147 | @ARTICLE{Ramos2017-tk, 148 | title = "Software for the Integration of Multiomics Experiments in 149 | Bioconductor", 150 | author = "Ramos, Marcel and Schiffer, Lucas and Re, Angela and Azhar, 151 | Rimsha and Basunia, Azfar and Rodriguez, Carmen and Chan, Tiffany 152 | and Chapman, Phil and Davis, Sean R and Gomez-Cabrero, David and 153 | Culhane, Aedin C and Haibe-Kains, Benjamin and Hansen, Kasper D 154 | and Kodali, Hanish and Louis, Marie S and Mer, Arvind S and 155 | Riester, Markus and Morgan, Martin and Carey, Vince and Waldron, 156 | Levi", 157 | journal = "Cancer Res.", 158 | volume = 77, 159 | number = 21, 160 | pages = "e39--e42", 161 | month = nov, 162 | year = 2017, 163 | language = "en" 164 | } 165 | 166 | @ARTICLE{Eckenrode2023-yq, 167 | title = "Curated single cell multimodal landmark datasets for 168 | {R/Bioconductor}", 169 | author = "Eckenrode, Kelly B and Righelli, Dario and Ramos, Marcel and 170 | Argelaguet, Ricard and Vanderaa, Christophe and Geistlinger, 171 | Ludwig and Culhane, Aedin C and Gatto, Laurent and Carey, Vincent 172 | and Morgan, Martin and Risso, Davide and Waldron, Levi", 173 | journal = "PLoS Comput. 
Biol.", 174 | volume = 19, 175 | number = 8, 176 | pages = "e1011324", 177 | month = aug, 178 | year = 2023, 179 | language = "en" 180 | } 181 | -------------------------------------------------------------------------------- /inst/extdata/docuData/singlecellmultimodalv1.csv: -------------------------------------------------------------------------------- 1 | "DataProvider","TaxonomyId","Species","SourceUrl","SourceType","SourceVersion","DataType","Maintainer" 2 | "Dept. of Bioinformatics, The Babraham Institute, United Kingdom","10090","Mus musculus","https://cloudstor.aarnet.edu.au/plus/s/Xzf5vCgAEUVgbfQ","RDS","1.0.0","mouse_gastrulation","Marcel Ramos " 3 | -------------------------------------------------------------------------------- /inst/extdata/docuData/singlecellmultimodalv2.csv: -------------------------------------------------------------------------------- 1 | "DataProvider","TaxonomyId","Species","SourceUrl","SourceType","SourceVersion","DataType","Maintainer" 2 | "Dept. of Bioinformatics, The Babraham Institute, United Kingdom","10090","Mus musculus","https://cloudstor.aarnet.edu.au/plus/s/Xzf5vCgAEUVgbfQ","RDS","1.0.0","mouse_gastrulation","Marcel Ramos " 3 | "Dept. of Bioinformatics, The Babraham Institute, United Kingdom","10090","Mus musculus","https://cloudstor.aarnet.edu.au/plus/s/Xzf5vCgAEUVgbfQ","RDS","2.0.0","mouse_gastrulation","Marcel Ramos " 4 | -------------------------------------------------------------------------------- /inst/extdata/docuData/singlecellmultimodalv4.csv: -------------------------------------------------------------------------------- 1 | "DataProvider","TaxonomyId","Species","SourceUrl","SourceType","SourceVersion","DataType","Maintainer" 2 | "Dept. of Bioinformatics, The Babraham Institute, United Kingdom","10090","Mus musculus","https://cloudstor.aarnet.edu.au/plus/s/Xzf5vCgAEUVgbfQ","RDS","1.0.0","mouse_gastrulation","Marcel Ramos " 3 | "Dept. 
of Bioinformatics, The Babraham Institute, United Kingdom","10090","Mus musculus","https://cloudstor.aarnet.edu.au/plus/s/Xzf5vCgAEUVgbfQ","RDS","2.0.0","mouse_gastrulation","Marcel Ramos " 4 | "Dept. of Molecular Genetics, Allen Institute for Brain Science, United States","10090","Mus musculus","https://www.dropbox.com/sh/avj4nrd4la5i88u/AACafWwBbE-xsLvOGDwRZDpYa?dl=0","TXT","1.0.0","mouse_visual_cortex","Dario Righelli " 5 | "Dept. of Molecular Genetics, Allen Institute for Brain Science, United States","10090","Mus musculus","https://www.dropbox.com/sh/avj4nrd4la5i88u/AACafWwBbE-xsLvOGDwRZDpYa?dl=0","TXT","2.0.0","mouse_visual_cortex","Dario Righelli " 6 | "Innovation Lab, New York Genome Center, New York, United States","9606","Homo Sapiens","https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE100866","TXT","1.0.0","coord_blood","Dario Righelli " 7 | -------------------------------------------------------------------------------- /inst/extdata/docuData/singlecellmultimodalv5.csv: -------------------------------------------------------------------------------- 1 | "DataProvider","TaxonomyId","Species","SourceUrl","SourceType","SourceVersion","DataType","Maintainer" 2 | "Technology Innovation Lab, New York Genome Center, New York, United States","9606","Homo Sapiens","https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE126310","TXT","1.0.0","peripheral_blood","Dario Righelli " 3 | -------------------------------------------------------------------------------- /inst/extdata/docuData/singlecellmultimodalv6.csv: -------------------------------------------------------------------------------- 1 | "DataProvider","TaxonomyId","Species","SourceUrl","SourceVersion","DataType","Maintainer" 2 | "European Bioinformatics Institute (EMBL-EBI), United Kingdom","9606","Homo sapiens","http://ftp.ebi.ac.uk/pub/databases/mofa/10x_rna_atac_vignette/filtered_feature_bc_matrix/","1.0.0","pbmc_10x","Ricard Argelaguet " 
-------------------------------------------------------------------------------- /inst/extdata/docuData/singlecellmultimodalv7.csv: -------------------------------------------------------------------------------- 1 | "DataProvider","TaxonomyId","Species","SourceUrl","SourceType","SourceVersion","DataType","Maintainer" 2 | "Slavov Laboratory and SCP Center at Northeastern University, Boston, United states","9606","Homo sapiens","https://drive.google.com/file/d/1sF5STkofF_f2msnYaaYdWabou84Qf2Xr/view?usp=sharing","CSV","1.0.0","macrophage_differentiation","Christophe Vanderaa " 3 | "Slavov Laboratory and SCP Center at Northeastern University, Boston, United states","9606","Homo sapiens","https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE142392","CSV","1.0.0","macrophage_differentiation","Christophe Vanderaa " 4 | "Slavov Laboratory and SCP Center at Northeastern University, Boston, United states","9606","Homo sapiens","https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE142392","CSV","1.0.0","macrophage_differentiation","Christophe Vanderaa " 5 | -------------------------------------------------------------------------------- /inst/extdata/docuData/singlecellmultimodalv8.csv: -------------------------------------------------------------------------------- 1 | "DataProvider","TaxonomyId","Species","SourceUrl","SourceVersion","DataType","Maintainer" 2 | "Wellcome Trust Sanger Institute, Cambridge, United Kingdom","10090","Mus musculus","https://www.ebi.ac.uk/ena/browser/view/PRJEB9051","1.0.0","mouse_embryo_8_cell","Ludwig Geistlinger " 3 | -------------------------------------------------------------------------------- /inst/extdata/docuData/singlecellmultimodalv9.csv: -------------------------------------------------------------------------------- 1 | "DataProvider","TaxonomyId","Species","SourceUrl","SourceVersion","DataType","Maintainer" 2 | "European Bioinformatics Institute (EMBL-EBI), United Kingdom","9606","Homo 
sapiens","http://ftp.ebi.ac.uk/pub/databases/mofa/10x_rna_atac_vignette/filtered_feature_bc_matrix/","1.0.1","pbmc_10x","Marcel Ramos " 3 | -------------------------------------------------------------------------------- /inst/extdata/ontomap.tsv: -------------------------------------------------------------------------------- 1 | DataType function_name original_column_name original_cell_name ontology_ID ontology_cell_name 2 | macrophage_differentiation SCoPE2 celltype Macrophage NCIT:C12558 Macrophage 3 | macrophage_differentiation SCoPE2 celltype Monocyte NCIT:C12547 Monocyte 4 | mouse_gastrulation scNMT lineage Epiblast NCIT:C34164 Epiblast 5 | mouse_gastrulation scNMT lineage Mesoderm NCIT:C12750 Mesoderm 6 | mouse_gastrulation scNMT lineage Primitive_Streak NCIT:C28402 Primitive_Streak 7 | mouse_gastrulation scNMT lineage Ectoderm NCIT:C12703 Ectoderm 8 | mouse_gastrulation scNMT lineage Endoderm NCIT:C12706 Endoderm 9 | mouse_gastrulation scNMT lineage ExE_ectoderm NCIT:C12703 Ectoderm 10 | mouse_gastrulation scNMT lineage Visceral_endoderm UBERON:0004877 visceral endoderm 11 | mouse_gastrulation scNMT lineage Primitive_endoderm BTO:0002123 primitive endoderm 12 | mouse_visual_cortex seqFISH class Glutamatergic Neuron CL:0000679 glutamatergic neuron 13 | mouse_visual_cortex seqFISH class GABA-ergic Neuron CL:0000617 GABAergic neuron 14 | mouse_visual_cortex seqFISH class Oligodendrocyte.3 CL:0000128 oligodendrocyte 15 | mouse_visual_cortex seqFISH class Endothelial Cell NCIT:C12865 Endothelial Cell 16 | mouse_visual_cortex seqFISH class Astrocyte NCIT:C12477 Astrocyte 17 | mouse_visual_cortex seqFISH class Oligodendrocyte.2 CL:0000128 oligodendrocyte 18 | mouse_visual_cortex seqFISH class Microglia NCIT:C12616 Microglia 19 | mouse_visual_cortex seqFISH class Oligodendrocyte.1 CL:0000128 oligodendrocyte 20 | pbmc_10x scMultiome celltype naive CD4 T cells NCIT:C12537 CD4-Positive T-Lymphocyte 21 | pbmc_10x scMultiome celltype memory CD4 T cells NCIT:C97349 CD4 
Positive Memory T-Lymphocyte
pbmc_10x scMultiome celltype non-classical monocytes CL:0000875 non-classical monocyte
pbmc_10x scMultiome celltype naive CD8 T cells ASCTB-TEMP_cd8-t-cells cd8+ t-cells
pbmc_10x scMultiome celltype CD56 (bright) NK cells CL:0000938 CD16-negative, CD56-bright natural killer cell
pbmc_10x scMultiome celltype classical monocytes CL:0000860 classic monocytes
pbmc_10x scMultiome celltype effector CD8 T cells NCIT:C126419 Effector Memory T-Lymphocyte
pbmc_10x scMultiome celltype myeloid DC CL:0000782 myeloid dendritic cell
pbmc_10x scMultiome celltype intermediate monocytes CL:0002393 intermediate monocyte
pbmc_10x scMultiome celltype memory B cells CL:0000787 memory B cell
pbmc_10x scMultiome celltype MAIT T cells NCIT:C115217 Mucosal-Associated Invariant T-Cell
pbmc_10x scMultiome celltype CD56 (dim) NK cells CL:0000939 CD16-positive, CD56-dim natural killer cell
pbmc_10x scMultiome celltype naive B cells CL:0000788 naive B cell
pbmc_10x scMultiome celltype plasmacytoid DC CL:0000784 plasmacytoid dendritic cell
cord_blood CITEseq celltype Natural Killers OMIT:0026379 Natural Killer T-Cells
cord_blood CITEseq celltype Precursors OMIT:0012443 Protein Precursors
cord_blood CITEseq celltype CD4 T-cells ASCTB-TEMP_cd4-t cd4+ t
cord_blood CITEseq celltype CD8 T-cells ASCTB-TEMP_cd8-t-cells cd8+ t-cells
cord_blood CITEseq celltype B-cells NCIT:C12474 B-Lymphocyte
cord_blood CITEseq celltype Monocytes CD14+ CL:0001054 CD14-positive monocyte
cord_blood CITEseq celltype T-cells OMIT:0026379 Natural Killer T-Cells
cord_blood CITEseq celltype Monocytes CD16+ NCIT:C12547 Monocyte
--------------------------------------------------------------------------------
/inst/scripts/CITEseq_celltypes.R:
--------------------------------------------------------------------------------
## Exploratory script: labels the cord_blood CITE-seq cells by species
## (k-means + single-linkage clustering on log human/mouse RNA totals), then
## assigns cell types from thresholded ADT marker pairs with getCellGroups() /
## addCTLabels(), saves the annotated colData and the clr-transformed ADT
## matrix, and draws a t-SNE of the annotated cells.
## It reads local files ("cord_blood/v1.0.0/...", "~/Downloads/...") and is
## meant to be run from the package source directory (it calls load_all()).
library(devtools)
load_all()
mae <- CITEseq("cord_blood", dry.run=FALSE)

## Detecting MOUSE/HUMAN cells
rna <- assays(mae)[["scRNAseq"]]
hrna <- colSums(rna[grep("^HUMAN", rownames(rna)),])
mrna <- colSums(rna[grep("^MOUSE", rownames(rna)),])
mate <- cbind(hrna, mrna)
## log1p(x) == log(x + 1): same scale as the k-means input below
plot(log1p(hrna), log1p(mrna), xlab="hrna", ylab="mrna")
## Using kmeans for detecting 2 bigger clusters of human and mouse cells
set.seed(666)
km <- kmeans(cbind(log(hrna+1), log(mrna+1)), centers=2)
plot(log(hrna+1), log(mrna+1), xlab="hrna", ylab="mrna", col=km$cluster)

## computing distance+hclust on human/mouse cells cluster for detecting mixed cells
mat <- cbind(log(hrna+1), log(mrna+1))

## human cells
## NOTE(review): k-means cluster 2 is treated as the human cluster and
## cluster 1 as mouse; this depends on the seed set above -- confirm on the
## plot before reusing.
d <- dist(mat[km$cluster==2,])
hc <- hclust(d, method="single")
cl <- cutree(hc, k=2)
plot(mat[km$cluster==2,], col=cl)
hbc <- names(cl)[cl==1]

# cd <- colData(mae)
load("cord_blood/v1.0.0/coldata_scRNAseq.rda")
cd <- coldata_scRNAseq

cd$species <- NA
cd$species[which(rownames(cd) %in% hbc)] <- "HUMAN"
cd$species[which(rownames(cd) %in% names(cl)[cl==2])] <- "MIXED"
cd$species[which(rownames(cd) %in% names(km$cluster)[km$cluster==1])] <- "MOUSE"
## full column name: do not rely on `$` partial matching
table(cd$species)

##### Annotating cell types
adtclrgeo <- as.matrix(read.csv("~/Downloads/GSE100866_CBMC_8K_13AB_10X-ADT_clr-transformed.csv", row.names=1))
## add this assay to the ADTs
cd$celltype <- NA
cd$markers <- NA

cdct <- cd[!cd$discard,]
cdct <- cdct[cdct$species=="HUMAN",]

out.cd19.cd3 <- getCellGroups(adtclrgeo, adt1="CD19", adt2="CD3", th1=0.9, th2=0.6)


cdct <- addCTLabels(cdct, out.cd19.cd3, "CD19-/CD3+", "T-cells")
cdct <- addCTLabels(cdct, out.cd19.cd3, "CD19+/CD3-", "B-cells")

out.cd11.cd14 <- getCellGroups(adtclrgeo, adt1="CD11c", adt2="CD14", th1=0.4, th2=0.55)
cdct <- addCTLabels(cdct, out.cd11.cd14, "CD11c+/CD14+", "Monocytes CD14+")
table(cdct$celltype)

out.cd11.cd16 <- getCellGroups(adtclrgeo, adt1="CD11c", adt2="CD16", th1=0.4, th2=0.55)
cdct <- addCTLabels(cdct, out.cd11.cd16, "CD11c+/CD16+", "Monocytes CD16+")
table(cdct$celltype)

out.T.cd8.cd4 <- getCellGroups(adtclrgeo[,out.cd19.cd3$`CD19-/CD3+`$bc], adt1="CD8", adt2="CD4", th1=0.9, th2=0.6)
## overwriting because CD4/CD8 T-cells are subgroups of T-cells
cdct <- addCTLabels(cdct, out.T.cd8.cd4, "CD8-/CD4+", "CD4 T-cells", overwrite=TRUE)
cdct <- addCTLabels(cdct, out.T.cd8.cd4, "CD8+/CD4-", "CD8 T-cells", overwrite=TRUE)
# cord_blood_colData_anno <- cdct
# save(cord_blood_colData_anno, file="cord_blood_colData_anno.rda")

## precursors are CD34+ and I took CD56- which seems not expressed from the paper figure
out.cd56.cd34 <- getCellGroups(adtclrgeo, adt1="CD56", adt2="CD34", th1=0.37, th2=0.9)
prebc <- out.cd56.cd34$`CD56-/CD34+`$bc
# idxpre <- which(rownames(cdct) %in% prebc)
# which(prebc %in% rownames(cdct)[!is.na(cdct$celltype)]) ## showing overlap!!!
cdct <- addCTLabels(cdct, out.cd56.cd34, "CD56-/CD34+", "Precursors")

# ## NATURAL KILLERS are CD3-/CD16+ (CD56+ and CD16+)
out.cd16.cd3 <- getCellGroups(adtclrgeo, adt1="CD16", adt2="CD3", th1=0, th2=0.55)
nkcellbc16 <- out.cd16.cd3$`CD16+/CD3-`$bc
idxnk <- which(rownames(cdct) %in% nkcellbc16)
length(idxnk)
sum(nkcellbc16 %in% rownames(cdct)[!is.na(cdct$celltype)]) ## showing overlap!!!

cdctnk <- addCTLabels(cdct, out.cd16.cd3, "CD16+/CD3-", "Natural Killers")

## other markers for NK are CD56+ and CD3-
out.cd56.cd3 <- getCellGroups(adtclrgeo, adt1="CD56", adt2="CD3", th1=0, th2=0)
# nkcellbc56 <- out.cd56.cd3$`CD56+/CD3-`$bc
# idxnk <- which(rownames(cdct) %in% nkcellbc56)
# length(idxnk)
# sum(nkcellbc56 %in% rownames(cdct)[!is.na(cdct$celltype)]) ## showing overlap!!!
cdctnk <- addCTLabels(cdctnk, out.cd56.cd3, "CD56+/CD3-", "Natural Killers")

coldata_scRNAseq <- cdctnk
save(coldata_scRNAseq, file="cord_blood/v1.0.0/coldata_scRNAseq.rda")
scADT_clrCounts <- adtclrgeo
save(scADT_clrCounts, file="cord_blood/v1.0.0/scADT_clrCounts.rda")


## Building tsne
cnts <- (mae[["scADT"]][, which(colnames(mae[["scADT"]]) %in% rownames(cdctnk))])
adtclrgeoss <- adtclrgeo[,which(colnames(adtclrgeo) %in% rownames(cdctnk))]
adtsce <- SingleCellExperiment(assays=list(counts=cnts, logcounts=adtclrgeoss),
    colData=cdctnk)
library(scran)
adtsce <- runPCA(adtsce)
adtsce <- runTSNE(adtsce, dimred="PCA")
plotReducedDim(adtsce, dimred="TSNE", colour_by="celltype")

--------------------------------------------------------------------------------
/inst/scripts/Contributing-Guidelines.Rmd:
--------------------------------------------------------------------------------
---
output: github_document
date: "`r format(Sys.time(), '%B %d, %Y')`"
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  cache = TRUE,
  out.width = "100%"
)
options(tibble.print_min = 5, tibble.print_max = 5)
```

# Overview

Thank you for your interest!

The `SingleCellMultiModal` package aims to provide single cell datasets
from several different technologies / modalities for benchmarking and analysis.
We currently provide from `scNMT`, `scM&T`, `seqFISH`, `CITEseq`, and other
technologies. Contributions are very much welcome.
24 | 25 | # List of Multi-modal Datasets 26 | 27 | For a full list of available datasets, see here: 28 | [Google Drive Sheet](https://docs.google.com/spreadsheets/d/14Eq_Bt_3tKx_t1UDwan0qZZGWNyK-d2WLXtmoPGB5To/edit#gid=0) 29 | 30 | # Contributing 31 | 32 | In order to contribute, we generally require data in `Rda` or `Rds` format 33 | though we also support `HDF5` and `MTX` formats. Aside from the usual required 34 | `metadata.csv` documentation in the package, contributors are required to add a 35 | name to the `DataType` column in the metadata table that indicates the name of 36 | the contributed dataset. To illustrate, here are some `DataType` names already 37 | in the package: 38 | 39 | * mouse_gastrulation 40 | * mouse_visual_cortex 41 | * cord_blood 42 | * pbmc_10x 43 | * macrophage_differentiation 44 | * mouse_embryo_8_cell 45 | 46 | ```{r,include=TRUE,results="hide",message=FALSE,warning=FALSE} 47 | library(SingleCellMultiModal) 48 | ``` 49 | 50 | ```{r} 51 | meta <- system.file("extdata", "metadata.csv", 52 | package = "SingleCellMultiModal", mustWork = TRUE) 53 | head(read.csv(meta)) 54 | ``` 55 | 56 | # Versioning and folder structure 57 | 58 | We associate a version with all datasets. We start with version `1.0.0` using 59 | semantic versioning and include data in a corresponding version folder 60 | (`v1.0.0`). Thus, the recommended folder structure is as follows: 61 | 62 | ``` 63 | ~/data 64 | └ scmm/ 65 | └ mouse_gastrulation/ 66 | └ v1.0.0/ 67 | └ scnmt_acc_cgi.rda 68 | └ scnmt_met_genebody.rda 69 | └ scnmt_met_cgi.rda 70 | └ scnmt_rna.rda 71 | └ scnmt_colData.rda 72 | └ scnmt_sampleMap.rda 73 | ``` 74 | 75 | In the `inst` section, we will discuss how to annotate these data products. 76 | 77 | # Files 78 | 79 | It is customary to include one `Rda` / `Rds` file per assay or per assay and 80 | region combination of interest (as above). 
We also highly recommend including 81 | `sampleMap` and `colData` datasets for the `MultiAssayExperiment` that will 82 | be built on the fly. In this example, there are three modalities in the `scNMT` 83 | dataset, `rna` (transcriptome), `acc` (chromatin accessibility), and `met` 84 | (methylation). 85 | 86 | # vignettes 87 | 88 | Contributors are required to demonstrate user-level functionality via 89 | examples in a vignette for each contributed dataset. 90 | 91 | # R 92 | 93 | Ideally, the interface for the contributed dataset should be similar to that 94 | of `scNMT` so that users have a sense of consistency in the usage of the 95 | package. This means having one main function that returns a 96 | `MultiAssayExperiment` object and having options that show the user what 97 | datasets are available for a particular technology. Contributors should use 98 | `roxygen2` for documenting datasets and using `@inheritParams scNMT` tag 99 | to avoid copying `@param` documentation. 100 | 101 | See the current example for implementation details: 102 | 103 | ```{r} 104 | scNMT( 105 | DataType = "mouse_gastrulation", 106 | mode = "*", 107 | version = "1.0.0", 108 | dry.run = TRUE 109 | ) 110 | ``` 111 | 112 | **Note**. Contributors should ensure that the documentation is complete and the 113 | proper data sources have been attributed. 114 | 115 | # inst/* 116 | 117 | ## extdata/ 118 | 119 | In the following section we will describe how to annotate and append to 120 | the `metadata.csv` file. First, we have to ensure that we are accounting for 121 | all of the fields required by `ExperimentHub`. They are listed here: 122 | 123 | * ResourceName 124 | * Title 125 | * Description 126 | * BiocVersion 127 | * Genome 128 | * SourceType 129 | * SourceUrl 130 | * SourceVersion 131 | * Species 132 | * TaxonomyId 133 | * Coordinate_1_based 134 | * DataProvider 135 | * Maintainer 136 | * RDataPath 137 | * RDataClass 138 | * DispatchClass 139 | * DataType+ 140 | 141 | **Note**. 
`DataType` is a field we've added to help distinguish multimodal 142 | technologies and is required for `SingleCellMultiModal`. Some of the 143 | `DataType`s already available are `mouse_gastrulation`, `mouse_visual_cortex`, 144 | `cord_blood`, `peripheral_blood`, etc. 145 | 146 | To make it easy for contributions, we've provided a mechanism for easy 147 | documentation using a file from a `data.frame` we call a `doc_file`. 148 | 149 | Interested contributors should create a `doc_file` in `inst/extdata/docuData` 150 | folder. Although we do not have a strict naming convention for the `doc_file`, 151 | we usually name the file `singlecellmultimodalvX.csv` where `X` is the *n*th 152 | dataset added to the package. 153 | 154 | Here is an example of the file from version `v1.0.0` of the `scNMT` dataset: 155 | 156 | ```{r} 157 | doc_file <- system.file("extdata", "docuData", "singlecellmultimodalv1.csv", 158 | package = "SingleCellMultiModal", mustWork = TRUE) 159 | read.csv(doc_file, header = TRUE) 160 | ``` 161 | 162 | Contributors will then use their `doc_file` to append to the existing 163 | `metadata.csv`. 164 | 165 | To create a `doc_file` `data.frame` with the file name 166 | `singlecellmultimodalvX.csv`, first we create a `data.frame` object. 167 | Each general annotation or row in this `data.frame` will be applied to all 168 | files uploaded to `ExperimentHub`. We take advantage of the `data.frame` 169 | function to repeat data and create a uniform `data.frame` with equal values 170 | across the columns. 171 | 172 | ```{r} 173 | scmeta <- data.frame( 174 | DataProvider = 175 | "Dept. 
of Bioinformatics, The Babraham Institute, United Kingdom", 176 | TaxonomyId = "10090", 177 | Species = "Mus musculus", 178 | SourceUrl = "https://cloudstor.aarnet.edu.au/plus/s/Xzf5vCgAEUVgbfQ", 179 | SourceType = "RDS", 180 | SourceVersion = "1.0.0", 181 | DataType = "mouse_gastrulation", 182 | Maintainer = "Ricard Argelaguet ", 183 | stringsAsFactors = FALSE 184 | ) 185 | scmeta 186 | ``` 187 | 188 | ### Saving the data 189 | 190 | After creating the documentation `data.frame` (`doc_file`), the contributor can 191 | save that dataset as a `.csv` file using `write.csv`. 192 | 193 | ```{r,eval=FALSE} 194 | write.csv( 195 | scmeta, 196 | file = "inst/extdata/docuData/singlecellmultimodal.csv", 197 | row.names = FALSE 198 | ) 199 | ``` 200 | 201 | ## Documenting diverse data 202 | 203 | In the case that the contributed data is not uniform, meaning that there are 204 | multiple file types from potentially different specimens, the `data.frame` 205 | will have to account for _all_ contributed data files. 206 | 207 | For example, if the contributed data has a number of different source types, 208 | the contributor is required to create a `data.frame` with the number of rows 209 | equal to the number of files to be uploaded. 
210 | 211 | In this example, we have **two** data files from different source types and 212 | formats: 213 | 214 | ```{r} 215 | data.frame( 216 | DataProvider = 217 | c("Institute of Population Genetics", "Mouse Science Center"), 218 | TaxonomyId = c("9606", "10090"), 219 | Species = c("Homo sapiens", "Mus musculus"), 220 | SourceUrl = c("https://human.science/org", "https://mouse.science/gov"), 221 | SourceType = c("RDS", "XML"), 222 | DataType = c("human_genetics", "mouse_genetics"), 223 | stringsAsFactors = FALSE 224 | ) 225 | ``` 226 | 227 | ## scripts/ 228 | 229 | ### make-data/ 230 | 231 | The individual data products that will eventually come together into 232 | a `MultiAssayExperiment` can be uploaded as serialized `RDA` / `RDS` files, 233 | `HDF5`, and even `MTX` files. For examples on how to save data into 234 | their respective file formats, see the `make-data` folder. 235 | 236 | ## Generating the metadata.csv 237 | 238 | ### make-metadata.R 239 | 240 | Based on the folder structure described previously, the `directory` argument in 241 | `make_metadata` will correspond to the `~/data/scmm` folder. The `dataDir` 242 | folder will correspond to the `DataType` / technology subfolder (e.g., 243 | "mouse_gastrulation"). These will be used as inputs to the `make_metadata` 244 | function. 245 | 246 | Once the data is ready, the user can use the function in `make-metadata.R` 247 | in the `scripts` folder. A typical call to `make_metadata` will either add to 248 | the metadata or replace it entirely. The easiest for current contributors is to 249 | `append` rows to the metadata file. 
250 | 251 | ```{r,eval=FALSE} 252 | make_metadata( 253 | directory = "~/data/scmm", 254 | dataDirs = "mouse_gastrulation", # also the name of the DataType 255 | ext_pattern = "\\.[Rr][Dd][Aa]$", 256 | doc_file = "inst/extdata/docuData/singlecellmultimodalv1.csv", 257 | pkg_name = "SingleCellMultiModal", 258 | append = TRUE, 259 | dry.run = TRUE 260 | ) 261 | ``` 262 | 263 | Note that the extraction pattern (`ext_pattern`) will allow contributors to 264 | match a specific file extension in that folder and ignore any intermediate 265 | files. 266 | 267 | The contributor may also wish to run `dry.run=TRUE` to see the output 268 | `data.frame` to be added to the `metadata.csv` file. 269 | 270 | _Note_. The `make_metadata` function should be run from the base package 271 | directory from a GitHub / git checkout (`git clone ...`). 272 | 273 | ## Validation 274 | 275 | It is recommended to run the metadata validation function from 276 | `AnnotationHubData`: 277 | 278 | ```{r,eval=FALSE} 279 | AnnotationHubData::makeAnnotationHubMetadata("SingleCellMultiModal") 280 | ``` 281 | 282 | to ensure that some of the metadata fields are properly annotated. 283 | 284 | 285 | # NEWS.md 286 | 287 | Contributors should update the `NEWS.md` file with a mention of the 288 | function and data that are being provided. See the `NEWS.md` for examples. 289 | 290 | # Next steps 291 | 292 | The contributor should then create a Pull Request on [GitHub][]. 293 | 294 | [GitHub]: https://github.com/waldronlab/SingleCellMultiModal/pulls 295 | 296 | If you are interested in contributing, I can help you go over the contribution 297 | and submission. Please contact me either on the [Bioc-community Slack][] 298 | (mramos148) or at marcel {dot} ramos [at] sph (dot) cuny (dot) edu. 
299 | If you need to sign up to the community Slack channel, follow this link: 300 | https://bioc-community.herokuapp.com/ 301 | 302 | [Bioc-community Slack]: https://community-bioc.slack.com 303 | 304 | ## sessionInfo 305 | 306 |
307 | sessionInfo 308 | 309 | ```{r,echo=FALSE} 310 | sessionInfo() 311 | ``` 312 | 313 |
314 | -------------------------------------------------------------------------------- /inst/scripts/README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | knit: (function(inputFile, encoding) { 4 | rmarkdown::render(inputFile, encoding = encoding, output_dir = "../../") }) 5 | bibliography: ../REFERENCES.bib 6 | --- 7 | 8 | ```{r, include = FALSE} 9 | knitr::opts_chunk$set( 10 | collapse = TRUE, 11 | comment = "#>", 12 | cache = TRUE, 13 | out.width = "100%" 14 | ) 15 | ``` 16 | 17 | ```{r,echo=FALSE,eval=FALSE} 18 | ## Generate the ./README.md (relative to base folder) 19 | rmarkdown::render(input = "inst/scripts/README.Rmd", output_dir = ".") 20 | ``` 21 | 22 | # 23 | 24 | ```{r, child="../../vignettes/SingleCellMultiModal.Rmd"} 25 | 26 | ``` 27 | -------------------------------------------------------------------------------- /inst/scripts/make-data.R: -------------------------------------------------------------------------------- 1 | ## 2 | ## PLACEHOLDER for make-data/ 3 | ## 4 | ## see the respective R script for each technology 5 | ## 6 | ## for example, make-data/scNMT.R for an example on how 7 | ## we took a MultiAssayExperiment object and created 8 | ## Rda files for upload to ExperimentHub 9 | ## 10 | ## see make-data/scMultiome.R for converting a MultiAssayExperiment 11 | ## into sparseMatrix representations using HDF5 and MTX formats 12 | ## 13 | ## see make-data/make_macrophage.R for taking raw data and 14 | ## creating SingleCellExperiment data products 15 | ## 16 | -------------------------------------------------------------------------------- /inst/scripts/make-data/CITEseq_filtering.R: -------------------------------------------------------------------------------- 1 | library(SingleCellExperiment) 2 | library(DropletUtils) 3 | cb <- CITEseq("cord_blood", dry.run=FALSE, DataClass="SingleCellExperiment") 4 | adt <- 
SingleCellExperiment(assays=list(counts=assays(altExp(cb))[[1]])) 5 | top.marker <- rownames(adt)[max.col(t(counts(adt)))] 6 | total.count <- colSums(counts(adt)) 7 | boxplot(split(log10(total.count), top.marker), ylab="Log-total ADT count", las=2) 8 | adt.counts <- counts(adt) 9 | adt.detected <- colSums(adt.counts > 0) 10 | hist(adt.detected, col='grey', main="", xlab="Number of detected ADTs") 11 | 12 | qc.stats <- cleanTagCounts(adt)#, exclusive=c("CD3", "CD19")) 13 | summary(qc.stats$high.ambient) # libraries removed with high ambient contamination 14 | 15 | library(scater) 16 | mito <- grep("mt-", tolower(rownames(adt))) 17 | df <- perCellQCMetrics(adt, subsets=list(Mito=mito)) 18 | mito.discard <- isOutlier(df$subsets_Mito_percent, type="higher") 19 | summary(mito.discard) 20 | 21 | discard <- qc.stats$discard | mito.discard 22 | 23 | colData(cb) <- cbind.DataFrame(colData(cb), adt.discard=qc.stats$discard, mito.discard=mito.discard, discard=discard) 24 | 25 | scRNAseq_coldata <- as.data.frame(colData(cb)) 26 | dir.create("cord_blood/v1.0.0/", recursive=TRUE) 27 | save(scRNAseq_coldata, file="cord_blood/v1.0.0/scRNAseq_coldata.rda") 28 | 29 | ## Alternatively it is possible to indicate two or more ADTs that should 30 | ## be expressed alternatively in a cell. 
31 | ## for CD3/CD4/CD8 i'm referring to https://tinyurl.com/ys9aawce 32 | ## otherwise the OSCA vignette 12.3.3 indicates to use CD3/CD19, but 33 | ## this article found that CD3and CD19 could be expressed in a novel cell type 34 | ## https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8694500/ 35 | # qc.stats <- cleanTagCounts(adtsce1, exclusive=c("CD4", "CD8")) 36 | # summary(qc.stats$discard) # libraries removed with high ambient contamination 37 | 38 | library(SingleCellExperiment) 39 | library(DropletUtils) 40 | mae <- CITEseq("peripheral_blood", dry.run=FALSE)#, DataClass="SingleCellExperiment") 41 | adt <- SingleCellExperiment(assays=list(counts=mae[["scADT"]])) 42 | pb <- SingleCellExperiment(assays=list(counts=mae[["scRNA"]])) 43 | cn <- colnames(adt) 44 | condition <- unlist(lapply(strsplit(colnames(adt), "_"), function(x) x[1])) 45 | bc <- unlist(lapply(strsplit(colnames(adt), "_"), function(x) x[2])) 46 | colData(adt) <- DataFrame("barcodes"=bc, "condition"=condition) 47 | colnames(adt) <- cn 48 | 49 | adt.rm <- adt[-c(3,52),] 50 | 51 | adtcr <- adt.rm[, adt.rm$condition=="CTRL"] 52 | adtcl <- adt.rm[, adt.rm$condition=="CTCL"] 53 | top.markercr <- rownames(adtcr)[max.col(t(counts(adtcr)))] 54 | top.markercl <- rownames(adtcl)[max.col(t(counts(adtcl)))] 55 | total.countcr <- colSums(counts(adtcr)) 56 | total.countcl <- colSums(counts(adtcl)) 57 | 58 | boxplot(split(log10(total.countcr), top.markercr), ylab="Log-total ADT CTRL count", las=2) #CD5 59 | boxplot(split(log10(total.countcl), top.markercl), ylab="Log-total ADT CTCL count", las=2) #CD279 60 | 61 | adt.countscr <- counts(adtcr) 62 | adt.detectedcr <- colSums(adt.countscr > 0) 63 | hist(adt.detectedcr, col='grey', main="", xlab="Number of detected ADTs CTRL") 64 | 65 | adt.countscl <- counts(adtcl) 66 | adt.detectedcl <- colSums(adt.countscl > 0) 67 | hist(adt.detectedcl, col='grey', main="", xlab="Number of detected ADTs CTCL") 68 | 69 | qc.statscr <- cleanTagCounts(adtcr)#, exclusive=c("CD3", 
"CD19")) 70 | summary(qc.statscr$high.ambient) # libraries removed with high ambient contamination 71 | 72 | qc.statscl <- cleanTagCounts(adtcl)#, exclusive=c("CD3", "CD19")) 73 | summary(qc.statscl$high.ambient) # libraries removed with high am 74 | 75 | library(scater) 76 | cn <- colnames(pb) 77 | condition <- unlist(lapply(strsplit(colnames(pb), "_"), function(x) x[1])) 78 | bc <- unlist(lapply(strsplit(colnames(pb), "_"), function(x) x[2])) 79 | colData(pb) <- DataFrame("barcodes"=bc, "condition"=condition) 80 | colnames(pb) <- cn 81 | pbcr <- pb[,pb$condition=="CTRL"] 82 | pbcl <- pb[,pb$condition=="CTCL"] 83 | mito <- grep("mt-", tolower(rownames(pb))) 84 | dfcr <- perCellQCMetrics(pbcr, subsets=list(Mito=mito)) 85 | dfcl <- perCellQCMetrics(pbcl, subsets=list(Mito=mito)) 86 | mito.discardcr <- isOutlier(dfcr$subsets_Mito_percent, type="higher") 87 | names(mito.discardcr) <- rownames(dfcr) 88 | summary(mito.discardcr) 89 | 90 | mito.discardcl <- isOutlier(dfcl$subsets_Mito_percent, type="higher") 91 | names(mito.discardcl) <- rownames(dfcl) 92 | summary(mito.discardcl) 93 | 94 | cd <- colData(mae) 95 | cd 96 | cd$adt.discard_CTRL <- FALSE 97 | cd$adt.discard_CTRL[which(rownames(cd) %in% rownames(qc.statscr)[qc.statscr$discard])] <- TRUE 98 | cd$adt.discard_CTCL <- FALSE 99 | cd$adt.discard_CTCL[which(rownames(cd) %in% rownames(qc.statscl)[qc.statscl$discard])] <- TRUE 100 | 101 | cd$mito.discard_CTRL <- FALSE 102 | cd$mito.discard_CTRL[which(rownames(cd) %in% names(mito.discardcr)[mito.discardcr])] <- TRUE 103 | cd$mito.discard_CTCL <- FALSE 104 | cd$mito.discard_CTCL[which(rownames(cd) %in% names(mito.discardcl)[mito.discardcl])] <- TRUE 105 | cd$discard_CTRL <- cd$adt.discard_CTRL | cd$mito.discard_CTRL 106 | cd$discard_CTCL <- cd$adt.discard_CTCL | cd$mito.discard_CTCL 107 | cd$discard <- cd$discard_CTRL | cd$discard_CTCL 108 | 109 | 110 | scRNAseq_coldata <- as.data.frame(cd) 111 | dir.create("peripheral_blood/v1.0.0/", recursive=TRUE) 112 | 
save(scRNAseq_coldata, file="peripheral_blood/v1.0.0/scRNAseq_coldata.Rda") 113 | 114 | 115 | 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /inst/scripts/make-data/make_macrophage.R: -------------------------------------------------------------------------------- 1 | 2 | ## Make the data to distribute to ExperimentHub 3 | 4 | ## Required packages 5 | library(HDF5Array) 6 | library(BiocFileCache) 7 | library(SingleCellExperiment) 8 | 9 | ## Steps: 10 | ## 1. Retrieve the scRNASeq matrices (n=2) from NCBI 11 | ## 2. Read the count matrices 12 | ## 3. Combine the two RNA count matrices in a SingleCellExperiment 13 | ## 4. Retrieve the protein matrix and annotation from Google Drive 14 | ## 5. Combine the protein data in a SingleCellExperiment object 15 | 16 | ## Note that step 3 is optional. I needed to migrate to HDF5 due to 17 | ## memory limitations. 18 | 19 | ## -------------------------------------- ## 20 | ## 1. Retrieve the scRNASeq matrices (n=2) from NCBI 21 | ## -------------------------------------- ## 22 | 23 | ## See also https://bioconductor.org/packages/devel/bioc/vignettes/MultiAssayExperiment/inst/doc/UsingHDF5Array.html 24 | bfc <- BiocFileCache() 25 | url1 <- "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4226nnn/GSM4226877/suppl/GSM4226877_rna_data_Bio_Replicate_1.csv.gz" 26 | url2 <- "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4226nnn/GSM4226878/suppl/GSM4226878_rna_data_Bio_Replicate_2.csv.gz" 27 | bfcrpath(bfc, url1) 28 | bfcrpath(bfc, url2) 29 | 30 | ## ---------------------- ## 31 | ## 2. 
Read count matrices 32 | ## ---------------------- ## 33 | 34 | ## Batch 1 35 | m1 <- data.table::fread(file = bfcquery(bfc, "GSM4226877")$rpath, 36 | sep = ",", header = TRUE) 37 | rn <- m1[[1]] 38 | m1 <- as.matrix(m1[, -1]) 39 | rownames(m1) <- rn 40 | colnames(m1) <- paste0(colnames(m1), ".1") 41 | ## Batch 2 42 | m2 <- data.table::fread(file = bfcquery(bfc, "GSM4226878")$rpath, 43 | sep = ",", header = TRUE) 44 | rn <- m2[[1]] 45 | m2 <- as.matrix(m2[, -1]) 46 | colnames(m2) <- paste0(colnames(m2), ".2") 47 | rownames(m2) <- rn 48 | 49 | ## ------------------------------------------------------- ## 50 | ## 3. Combine the two RNA count matrices in a SingleCellExperiment 51 | ## ------------------------------------------------------- ## 52 | 53 | m1 <- DelayedArray(m1) 54 | m2 <- DelayedArray(m2) 55 | m3 <- cbind(m1, m2) ## This process is delayed until writing 56 | batch <- factor(gsub("^.*[.](\\d)$", "\\1", colnames(m3))) 57 | sce <- SingleCellExperiment(list(counts = m3), 58 | colData = DataFrame(Batch = batch)) 59 | ## The object is rather big and is better stored on disk as an HDF5 60 | saveHDF5SummarizedExperiment(sce, 61 | dir = "../.localdata/SingleCellMultiModal/macrophage_differentiation/v1.0.0/", 62 | prefix = "macrophage_rna_", 63 | as.sparse = TRUE) 64 | ## Restore some RAM 65 | rm(m1, m2, m3); gc() 66 | 67 | ## ------------------------------------------------------- ## 68 | ## 4. 
Retrieve the protein matrix and annotation from Google Drive 69 | ## ------------------------------------------------------- ## 70 | 71 | ## Download the protein data provided by the Slavov lab 72 | ## https://drive.google.com/file/d/1sF5STkofF_f2msnYaaYdWabou84Qf2Xr/view?usp=sharing 73 | protein_assay <- read.csv("../.localdata/SCP/specht2019/v3/Proteins-processed.csv", 74 | row.names = 1) 75 | protein_assay <- protein_assay[, colnames(protein_assay) != "protein"] 76 | protein_assay <- as.matrix(protein_assay) 77 | 78 | ## Download the cell annotation (Cells.csv) provided by the Slavov lab 79 | ## https://drive.google.com/file/d/16vf6rjIsk-oK9naAH6BQnCFrlWnYtJsS/view?usp=sharing 80 | protein_colData <- read.csv("../.localdata/SCP/specht2019/v3/Cells.csv", 81 | row.names = 1) 82 | protein_colData <- t(protein_colData) 83 | protein_colData <- DataFrame(protein_colData) 84 | ## Replace the cell type annotation by more explicit values 85 | protein_colData$celltype <- 86 | ifelse(protein_colData$celltype == "sc_m0", "Macrophage", "Monocyte") 87 | ## Rename the 5th column to `batch_MS` (presumably the `raw.file` column -- TODO confirm) 88 | colnames(protein_colData)[5] <- "batch_MS" 89 | 90 | ## ------------------------------------------------------- ## 91 | ## 5. Combine the protein data in a SingleCellExperiment object 92 | ## ------------------------------------------------------- ## 93 | 94 | macrophage_protein <- SingleCellExperiment(assay = list(logexprs = protein_assay), 95 | colData = protein_colData) 96 | format(object.size(macrophage_protein), "MB") 97 | ## Note the protein data can easily fit in memory. 
We save it as an Rda 98 | save(macrophage_protein, file = "../.localdata/SingleCellMultiModal/macrophage_differentiation/v1.0.0/macrophage_protein.Rda") 99 | 100 | ## ------------------------------------------------------- ## 101 | ## Conclusion 102 | ## ------------------------------------------------------- ## 103 | 104 | ## These files should be sent to ExperimentHub: 105 | ## mRNA 106 | ## - macrophage_rna_assays.h5 107 | ## - macrophage_rna_se.rds 108 | ## Protein 109 | ## - macrophage_protein.Rda 110 | -------------------------------------------------------------------------------- /inst/scripts/make-data/scMultiome.R: -------------------------------------------------------------------------------- 1 | library(MultiAssayExperiment) 2 | library(rhdf5) 3 | library(HDF5Array) 4 | 5 | ddir <- "~/data/scmm/pbmc_10x" 6 | 7 | pbmc <- readRDS( 8 | file.path(ddir, "mae.rds") 9 | ) 10 | vdir <- file.path(ddir, paste0("v", "1.0.0")) 11 | setwd(vdir) 12 | 13 | ## save colData and sampleMap 14 | pbmc_colData <- colData(pbmc) 15 | save(pbmc_colData, file = "pbmc_colData.rda") 16 | 17 | pbmc_sampleMap <- sampleMap(pbmc) 18 | save(pbmc_sampleMap, file = "pbmc_sampleMap.rda") 19 | 20 | Matrix::writeMM(assay(pbmc[[1]]), "pbmc_rna.mtx") 21 | R.utils::gzip(filename = "pbmc_rna.mtx", destname = "pbmc_rna.mtx.gz") 22 | file.remove("pbmc_rna.mtx") 23 | 24 | stopifnot(file.exists("pbmc_rna.mtx.gz")) 25 | 26 | rna_mtx <- .read_mtx("pbmc_rna.mtx.gz") 27 | 28 | ## save H5 file and SCE shell 29 | HDF5Array::saveHDF5SummarizedExperiment(pbmc[[1]], dir = "pbmc_rna", 30 | prefix = "pbmc_rna_", as.sparse = TRUE) 31 | 32 | ## load SCE shell 33 | rna_sce <- readRDS("./pbmc_rna/pbmc_rna_se.rds") 34 | ## replace assay with MTX assay 35 | pbmc_rna_mtx_obj <- BiocBaseUtils::setSlots( 36 | rna_sce, assays = Assays(SimpleList(counts = rna_mtx)) 37 | ) 38 | 39 | pbmc_rna_h5_obj <- 40 | HDF5Array::loadHDF5SummarizedExperiment("pbmc_rna", "pbmc_rna_") 41 | 42 | Matrix::writeMM(assay(pbmc[[2]]), 
"pbmc_atac.mtx") 43 | R.utils::gzip(filename = "pbmc_atac.mtx", destname = "pbmc_atac.mtx.gz") 44 | pbmc_atac_mtx <- "pbmc_atac.mtx.gz" 45 | stopifnot(file.exists(pbmc_atac_mtx)) 46 | atac_mtx <- .read_mtx("pbmc_atac.mtx.gz") 47 | ## save H5 file and SCE shell 48 | HDF5Array::saveHDF5SummarizedExperiment(pbmc[[2]], dir = "pbmc_atac", 49 | prefix = "pbmc_atac_", as.sparse = TRUE) 50 | 51 | ## load SCE shell 52 | atac_sce <- readRDS("./pbmc_atac/pbmc_atac_se.rds") 53 | ## replace assay with MTX assay 54 | pbmc_atac_mtx_obj <- BiocBaseUtils::setSlots( 55 | atac_sce, assays = Assays(SimpleList(counts = atac_mtx)) 56 | ) 57 | 58 | ## load H5 object 59 | pbmc_atac_h5_obj <- 60 | HDF5Array::loadHDF5SummarizedExperiment("pbmc_atac", "pbmc_atac_") 61 | 62 | ### Use 10X Dataset 63 | ## First load the previous version 64 | rna <- SingleCellMultiModal::scMultiome( 65 | "pbmc_10x", "rna", format = "HDF5", version = "1.0.0", dry.run = FALSE 66 | ) 67 | 68 | rnames <- rownames(rna[[1]]) 69 | cnames <- colnames(rna[[1]]) 70 | 71 | datafolder <- file.path(Sys.getenv("HOME"), "data/scmm/pbmc_10x") 72 | 73 | v1url <- paste0("https://cf.10xgenomics.com/samples/cell-arc/1.0.0/", 74 | "pbmc_granulocyte_sorted_10k/pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5" 75 | ) 76 | 77 | h5v1 <- file.path(datafolder, basename(v1url)) 78 | 79 | if (!file.exists(h5v1)) 80 | download.file( 81 | url = v1url, 82 | destfile = h5v1 83 | ) 84 | 85 | HDF5Array::h5ls(h5v1, all = TRUE) 86 | 87 | grange <- as.character(h5read(h5v1, "/matrix/features/name")) 88 | 89 | aa <- HDF5Array::TENxMatrix(h5v1, "matrix") 90 | rownames(aa) <- grange 91 | upcnames <- gsub("(.*)(-1)$", "\\1", colnames(aa)) 92 | 93 | stopifnot( 94 | all.equal( 95 | nchar(colnames(aa)) - nchar("-1") , nchar(upcnames) 96 | ) 97 | ) 98 | 99 | colnames(aa) <- upcnames 100 | 101 | rowlog <- grange %in% rnames 102 | aa <- aa[rowlog,] 103 | 104 | collog <- colnames(aa) %in% cnames 105 | aa <- aa[, collog] 106 | 107 | all(rownames(aa) 
%in% rnames) 108 | ## TRUE 109 | all(colnames(aa) %in% cnames) 110 | ## TRUE 111 | 112 | ## remove dups HDF5Array 113 | duprows <- duplicated(rownames(aa)) 114 | aa <- aa[!duprows, ] 115 | 116 | ## make sure that previous and new values are the same 117 | ## identical(rowSums(assay(rna[[1]])), rowSums(aa)) 118 | ## NOTE(review): `outfile` is never assigned in this script; set the target .h5 path before running the lines below 119 | HDF5Array::writeTENxMatrix(aa, outfile, "pbmc", verbose = TRUE) 120 | # file.remove(outfile) 121 | outfile 122 | 123 | HDF5Array::h5ls(outfile, all = TRUE) 124 | 125 | -------------------------------------------------------------------------------- /inst/scripts/make-data/scNMT.R: -------------------------------------------------------------------------------- 1 | # get data from cloudstor 2 | # https://cloudstor.aarnet.edu.au/plus/s/Xzf5vCgAEUVgbfQ/download?path=%2Foutput&files=scnmtseq_gastrulation_mae_826-cells_orderedFeatures.rds 3 | ## ./output/scnmtseq_gastrulation_mae_826-cells_orderedFeatures.rds 4 | library(MultiAssayExperiment) 5 | 6 | ddir <- "~/data/scmm/mouse_gastrulation" 7 | 8 | if (!dir.exists(ddir)) 9 | dir.create(ddir, recursive = TRUE) 10 | 11 | # old 12 | # "scnmtseq_gastrulation_mae_826-cells_orderedFeatures.rds" 13 | scnmt <- readRDS( 14 | file.path(ddir, "allcells", 15 | "scnmtseq_gastrulation_mae_AllCells.rds" 16 | ) 17 | ) 18 | 19 | exportClass(scnmt, ddir, fmt = "csv") 20 | 21 | # convert .csv files to .rda matrices 22 | .convertData <- function( 23 | directory = "~/data/scmm/", 24 | dataDir = "mouse_gastrulation", 25 | version = "1.0.0", 26 | pattern = ".csv") 27 | { 28 | location <- file.path(directory, dataDir, paste0("v", version)) 29 | csvs <- list.files(location, pattern = pattern, full.names = TRUE, 30 | recursive = FALSE) 31 | invisible( 32 | lapply(csvs, function(csvfile) { 33 | objname <- gsub(pattern, "", basename(csvfile)) 34 | readin <- as.data.frame(readr::read_csv(csvfile)) 35 | rnames <- readin[[1L]] 36 | 37 | if (!objname %in% c("scnmt_colData", "scnmt_sampleMap")) 38 | readin <- data.matrix(readin[, -1]) 39 
| else if (identical(objname, "scnmt_colData")) 40 | names(readin)[1] <- "cellID" 41 | else 42 | readin <- readin[, -1] 43 | 44 | if (!objname %in% "scnmt_sampleMap") 45 | rownames(readin) <- rnames 46 | 47 | assign(objname, readin) 48 | rdafile <- gsub("csv", "rda", csvfile) 49 | save(list = objname, file = rdafile) 50 | }) 51 | ) 52 | } 53 | 54 | .convertData() 55 | -------------------------------------------------------------------------------- /inst/scripts/make-metadata.R: -------------------------------------------------------------------------------- 1 | setwd("~/gh/SingleCellMultiModal") 2 | ## Map each file's extension to an AnnotationHubData source type; stop on unsupported extensions 3 | .getSourceType <- function(filepaths) { 4 | lfiles <- strsplit(basename(filepaths), "\\.") 5 | exts <- vapply(lfiles, 6 | function(x) { paste(x[-1], collapse = ".") }, character(1L)) 7 | uexts <- toupper(exts) 8 | uexts <- gsub("[Hh]5", "HDF5", uexts) 9 | uexts <- gsub("[Mm][Tt][Xx]\\.[Gg][Zz]", "MTX", uexts) 10 | vTypes <- AnnotationHubData::getValidSourceTypes() 11 | uTypes <- toupper(vTypes) 12 | valid <- uexts %in% uTypes ## per-file validity so the error names only the offending extensions 13 | if (!all(valid)) 14 | stop("Source types not supported: ", paste0(exts[!valid], 15 | collapse = ", "), "\n See 'AnnotationHubData::getValidSourceTypes()'", 16 | call. = FALSE) 17 | res <- vTypes[match(uexts, uTypes)] 18 | ## hot fix before AnnotationHubData 1.21.2 19 | gsub("MTX", "mtx.gz", res, fixed = TRUE) 20 | } 21 | 22 | doc_helper <- 23 | function( 24 | DataProvider, TaxonomyId, Species, SourceUrl, SourceType, DataType, ... 25 | ) 26 | { 27 | args <- list(...) 
28 | saf <- args[["stringsAsFactors"]] 29 | saf <- if(!is.null(saf)) saf else FALSE 30 | 31 | input_vals <- list( 32 | DataProvider = DataProvider, TaxonomyId = TaxonomyId, 33 | Species = Species, SourceUrl = SourceUrl, 34 | SourceType = SourceType, DataType = DataType 35 | ) 36 | clens <- lengths(input_vals) 37 | zlen <- !clens 38 | if (any(zlen)) 39 | stop( 40 | "Provide values for: ", 41 | paste(names(input_vals)[zlen], collapse = ", ") 42 | ) 43 | 44 | nonstd <- !clens %in% c(max(clens), 1L) 45 | if (any(nonstd)) 46 | stop("Lengths of inputs must either be 1 or the max length") 47 | 48 | input_vals[clens == 1L] <- lapply(input_vals[clens == 1L], 49 | function(x) { 50 | rep(x, max(clens)) 51 | }) 52 | 53 | as.data.frame(input_vals, stringsAsFactors = saf) 54 | } 55 | 56 | .stdLength <- function(metalist, replength) { 57 | lapply(metalist, function(field) { 58 | if (length(field) == 1L) 59 | rep(field, replength) 60 | else 61 | field 62 | }) 63 | } 64 | 65 | .loadRDS <- function(filepath) { 66 | readRDS(filepath) 67 | } 68 | 69 | .loadRDA <- function(filepath) { 70 | basefile <- gsub("\\.[Rr][Dd][Aa]", "", basename(filepath)) 71 | OBJENV <- new.env(parent = emptyenv()) 72 | load(filepath, envir = OBJENV) 73 | OBJENV[[basefile]] 74 | } 75 | 76 | .loadH5 <- function(filepath) { 77 | if (grepl("tenx", filepath)) 78 | HDF5Array::TENxMatrix(filepath, "pbmc") 79 | else 80 | HDF5Array::HDF5Array(filepath, "assay001") 81 | } 82 | 83 | .loadMTX.GZ <- function(filepath) { 84 | .read_mtx(filepath) 85 | } 86 | ## Load each file with the reader whose extension pattern matches it; hitMatrix is forced to one row per file so a single path also works 87 | .loadDataList <- function(filepaths) { 88 | recipelist <- list( 89 | "\\.[Rr][Dd][Aa]" = .loadRDA, 90 | "\\.[Rr][Dd][Ss]" = .loadRDS, 91 | "\\.[Hh]5" = .loadH5, 92 | "\\.[Mm][Tt][Xx]\\.[Gg][Zz]" = .loadMTX.GZ 93 | ) 94 | hitMatrix <- matrix(vapply(names(recipelist), 95 | function(pat) grepl(pat, filepaths), 96 | logical(length(filepaths)) 97 | ), nrow = length(filepaths)) 98 | allrecipes <- recipelist[apply(hitMatrix, 1L, which)] 99 | Map(function(x, y) { x(y) }, x = allrecipes, y = filepaths) 100 
| } 101 | 102 | any.na <- function(x) { 103 | any(is.na(x)) 104 | } 105 | 106 | .get_Description <- function(data_name, DataType) { 107 | paste(data_name, "data specific to the", toupper(DataType), "project") 108 | } 109 | 110 | .getRDataClass <- function(dataList) { 111 | vapply(dataList, function(dataName) { 112 | if (is.matrix(dataName)) 113 | "matrix" 114 | else 115 | class(dataName) 116 | }, character(1L)) 117 | } 118 | 119 | 120 | .file_pattern_map <- data.frame( 121 | ext_pattern = paste0( 122 | c("[Rr][Dd][Aa]", "[Rr][Dd][Ss]", "[Hh]5", "[Mm][Tt][Xx]\\.[Gg][Zz]"), 123 | "$" 124 | ), 125 | ## currently MTX DispatchClass recipe unavailable 126 | Dispatch = c("Rda", "Rds", "H5File", "FilePath"), 127 | stringsAsFactors = FALSE 128 | ) 129 | ## Return the Dispatch value of ext_map for each file name; hitMatrix is forced to one row per file so a single name also works 130 | .getDispatchClass <- function(resource_files, ext_map = .file_pattern_map) { 131 | hitMatrix <- matrix(vapply(ext_map[["ext_pattern"]], 132 | function(pat) grepl(pat, resource_files), 133 | logical(length(resource_files))), nrow = length(resource_files)) 134 | ext_map[["Dispatch"]][apply(hitMatrix, 1L, which)] 135 | } 136 | 137 | ## alist() with formals()<- 138 | ## fancyFUN <- function() {} 139 | ## formals(fancyFUN) <- alist() 140 | 141 | MetaHubCreate <- 142 | function(base_dir, data_dirs, ext_pattern, doc_file, version, pkg_name) 143 | { 144 | locations <- file.path(base_dir, data_dirs, paste0("v", version)) 145 | stopifnot( 146 | dir.exists(base_dir), all(dir.exists(locations)), 147 | is.character(ext_pattern), !is.na(ext_pattern), 148 | identical(length(ext_pattern), 1L), 149 | file.exists(doc_file), is.character(doc_file), !is.na(doc_file), 150 | identical(length(doc_file), 1L), is.character(version) 151 | ) 152 | fpathlist <- lapply(locations, function(locs) { 153 | list.files( 154 | locs, pattern = ext_pattern, full.names = TRUE, recursive = TRUE 155 | ) 156 | }) 157 | docFrame <- read.csv(doc_file, header = TRUE) 158 | docList <- split(docFrame, 159 | list(docFrame[["DataType"]], docFrame[["SourceVersion"]])) 160 | versions <- version 161 | 
DataTypes <- data_dirs 162 | replengths <- lengths(fpathlist) 163 | namelist <- lapply(fpathlist, basename) 164 | 165 | metaList <- Map( 166 | function(DataType, doc_file, resnames, filepaths, replength, version) { 167 | message("Working on: ", basename(DataType), " v", version) 168 | hubmeta <- R6::R6Class("EHubMeta", 169 | public = list( 170 | Title = NA_character_, 171 | Description = NA_character_, 172 | BiocVersion = as.character(BiocManager::version()), 173 | Genome = NA_character_, 174 | SourceType = NA_character_, 175 | SourceUrl = character(1L), 176 | SourceVersion = version, 177 | Species = character(1L), 178 | TaxonomyId = character(1L), 179 | Coordinate_1_based = NA, 180 | DataProvider = character(1L), 181 | Maintainer = NA_character_, 182 | RDataClass = NA_character_, 183 | DispatchClass = .getDispatchClass(resnames), 184 | Location_Prefix = NA_character_, 185 | RDataPath = NA_character_, 186 | ResourceName = resnames, 187 | DataType = DataType, 188 | 189 | initialize = function(doc_file) 190 | { 191 | lapply(names(doc_file), function(i) { 192 | assign(i, doc_file[[i]], self) 193 | }) 194 | if (is.na(self$Title)) 195 | self$Title <- gsub(ext_pattern, "", 196 | basename(filepaths)) 197 | if (is.na(self$Description)) 198 | self$Description <- paste(self$Title, 199 | "data specific to the", toupper(self$DataType), 200 | "project") 201 | if (any.na(self$SourceType)) 202 | self$SourceType <- .getSourceType(filepaths) 203 | if (any.na(self$SourceVersion)) 204 | self$SourceVersion <- "1.0.0" 205 | if (any.na(self$Maintainer)) 206 | self$Maintainer <- utils::maintainer(pkg_name) 207 | if (any.na(self$RDataClass)) { 208 | dataList <- .loadDataList(filepaths) 209 | self$RDataClass <- .getRDataClass(dataList) 210 | } 211 | if (is.na(self$Location_Prefix)) 212 | self$Location_Prefix <- NULL 213 | if (is.na(self$RDataPath)) 214 | self$RDataPath <- file.path(pkg_name, 215 | self$DataType, paste0("v", version), 216 | self$ResourceName) 217 | }, 218 | generate = 
function() { 219 | lnames <- !names(self) %in% 220 | c(".__enclos_env__", "clone", "generate", 221 | "initialize") 222 | initList <- mget(names(self)[lnames], envir = self) 223 | initList <- Filter(function(x) !is.null(x), initList) 224 | flist <- .stdLength(initList, replength) 225 | do.call(data.frame, c(flist, stringsAsFactors = FALSE)) 226 | } 227 | ), 228 | lock_objects = FALSE 229 | ) 230 | nhub <- hubmeta$new(doc_file) 231 | nhub$generate() 232 | }, DataType = DataTypes, doc_file = docList, resnames = namelist, 233 | filepaths = fpathlist, replength = replengths, version = versions 234 | ) 235 | 236 | do.call( 237 | function(...) { 238 | rbind.data.frame(..., make.row.names = FALSE, 239 | stringsAsFactors = FALSE) 240 | }, 241 | metaList) 242 | } 243 | 244 | #' Generate the metadata.csv file from a documentation file 245 | #' 246 | #' This function takes a specific folder structure and generates the 247 | #' metadata.csv file for adding to ExperimentHub. 248 | #' 249 | #' @param directory The base folder for _all_ datasets 250 | #' 251 | #' @param dataDirs `character()` A vector of folder names contained in directory 252 | #' that corresponds to each project. For multiple versions, repeat the 253 | #' name of the folder. 254 | #' 255 | #' @param version `character()` A vector of subfolder versions that is parallel 256 | #' to `dataDirs` argument, typically `v1.0.0`. 257 | #' 258 | #' @param ext_pattern `character(1)` A string that matches files within the 259 | #' above folders to find the data. 260 | #' 261 | #' @param doc_file `character(1)` A path to the documentation `data.frame` that 262 | #' tells the function how to fill in the standard columns for data 263 | #' annotation, for example `DataProvider`, `TaxonomyId`, etc. 264 | #' 265 | #' @param pkg_name `character(1)` The name of the current package 266 | #' 267 | #' @param dry.run `logical(1)` Whether to (over)write the `metadata.csv` file or 268 | #' return as output. 
269 | #' 270 | #' @param append `logical(1)` Whether to append to the current `metadata.csv` 271 | #' file 272 | #' 273 | #' @return Saves a file under `/inst/extdata/metadata.csv` 274 | #' 275 | #' @examples 276 | #' 277 | #' make_metadata( 278 | #' directory = "~/data/scmm", 279 | #' dataDirs = "mouse_gastrulation", 280 | #' version = c("1.0.0", "2.0.0"), 281 | #' doc_file = "inst/extdata/docuData/singlecellmultimodalv2.csv", 282 | #' dry.run = FALSE 283 | #' ) 284 | #' 285 | #' make_metadata( 286 | #' directory = "~/data/scmm", 287 | #' dataDirs = c(rep("mouse_gastrulation", 2), 288 | #' rep("mouse_visual_cortex", 2)), 289 | #' version = rep(c("1.0.0", "2.0.0"), 2), 290 | #' ext_pattern = "\\.[Rr][Dd][Aa]$", 291 | #' doc_file = "inst/extdata/docuData/singlecellmultimodalv3.csv", 292 | #' pkg_name = "SingleCellMultiModal", 293 | #' dry.run = TRUE, 294 | #' ) 295 | #' 296 | #' make_metadata( 297 | #' directory = "~/data/scmm", 298 | #' dataDirs = "pbmc", 299 | #' version = "1.0.0", 300 | #' ext_pattern = "\\.[Rr][Dd][AaSs]$|\\.[Mm][Tt][Xx]\\.[Gg][Zz]$", 301 | #' doc_file = "inst/extdata/docuData/singlecellmultimodalv6.csv", 302 | #' pkg_name = "SingleCellMultiModal", 303 | #' dry.run = TRUE, 304 | #' ) 305 | #' 306 | #' @export 307 | make_metadata <- function( 308 | directory = "~/data/scmm", 309 | dataDirs = c(rep("mouse_gastrulation", 2), rep("mouse_visual_cortex", 2), "pbmc"), 310 | version = c(rep(c("1.0.0", "2.0.0"), 2), "1.0.0"), 311 | ext_pattern = "\\.[Rr][Dd][AaSs]$|\\.[Mm][Tt][Xx]\\.[Gg][Zz]$|\\.[Hh]5$", 312 | doc_file, 313 | pkg_name = "SingleCellMultiModal", 314 | dry.run = TRUE, 315 | append = FALSE) 316 | { 317 | if (!identical(basename(getwd()), pkg_name)) 318 | stop("Run 'make_metadata()' from directory: ", pkg_name) 319 | 320 | exdata <- "inst/extdata" 321 | 322 | if (!dir.exists(exdata)) 323 | dir.create(exdata) 324 | 325 | if (missing(doc_file)) 326 | stop("'doc_file' for generating the metadata is missing") 327 | 328 | metafile <- 
file.path(exdata, "metadata.csv") 329 | 330 | metadat <- MetaHubCreate( 331 | base_dir = directory, 332 | data_dirs = dataDirs, 333 | ext_pattern = ext_pattern, 334 | doc_file = doc_file, 335 | version = version, 336 | pkg_name = pkg_name 337 | ) 338 | 339 | if (!dry.run) { 340 | if(!append) 341 | { 342 | file.remove(metafile) 343 | } 344 | readr::write_csv(metadat, metafile, append = append, na="NA") 345 | } 346 | 347 | metadat 348 | } 349 | 350 | # make_metadata( 351 | # dataDirs = "mouse_gastrulation", 352 | # version = "1.0.0", 353 | # doc_file = "inst/extdata/docuData/singlecellmultimodalv1.csv", 354 | # dry_run = FALSE 355 | # ) 356 | # 357 | # make_metadata( 358 | # directory="CITEseq/", 359 | # dataDirs = "cord_blood", 360 | # version = "1.0.0", 361 | # doc_file = "inst/extdata/docuData/singlecellmultimodalv5.csv", 362 | # dry.run = FALSE, 363 | # append=TRUE 364 | # ) 365 | 366 | # make_metadata( 367 | # dataDirs = c(rep("mouse_gastrulation", 2), "mouse_visual_cortex"), 368 | # version = c("1.0.0", "2.0.0", "1.0.0"), 369 | # doc_file = "inst/extdata/docuData/singlecellmultimodalv3.csv", 370 | # dry.run = FALSE 371 | # ) 372 | 373 | # make_metadata( 374 | # directory = "~/data/scmm", 375 | # dataDirs = "peripheral_blood", 376 | # version = "1.0.0", 377 | # doc_file = "inst/extdata/docuData/singlecellmultimodalv5.csv", 378 | # dry.run = FALSE, 379 | # append = TRUE 380 | # ) 381 | 382 | # make_metadata( 383 | # directory = "~/data/scmm", 384 | # dataDirs = "pbmc_10x", 385 | # version = "1.0.0", 386 | # doc_file = "inst/extdata/docuData/singlecellmultimodalv6.csv", 387 | # dry.run = FALSE, 388 | # append = TRUE 389 | # ) 390 | 391 | # make_metadata( 392 | # directory = "../.localdata/SingleCellMultiModal/", 393 | # dataDirs = "macrophage_differentiation", 394 | # version = "1.0.0", 395 | # doc_file = "inst/extdata/docuData/singlecellmultimodalv7.csv", 396 | # dry.run = FALSE, 397 | # append = TRUE 398 | # ) 399 | 400 | ## request to update Maintainer field in 
older AH resources 401 | # aq <- AnnotationHub::query(eh, "SingleCellMultiModal") 402 | # aq[aq$maintainer == "Marcel Ramos " & 403 | # grepl("v[12]", aq$rdatapath)] 404 | 405 | # make_metadata( 406 | # directory = "~/data/scmm", 407 | # dataDirs = "mouse_embryo_8_cell", 408 | # version = "1.0.0", 409 | # doc_file = "inst/extdata/docuData/singlecellmultimodalv8.csv", 410 | # dry.run = FALSE, 411 | # append = TRUE 412 | # ) 413 | 414 | make_metadata( 415 | directory = "~/data/scmm", 416 | dataDirs = "pbmc_10x", 417 | version = "1.0.1", 418 | doc_file = "inst/extdata/docuData/singlecellmultimodalv9.csv", 419 | dry.run = FALSE, 420 | append = TRUE 421 | ) 422 | 423 | ## Check metadata.csv file with: 424 | ExperimentHubData::makeExperimentHubMetadata( 425 | file.path(Sys.getenv("HOME"), "gh/SingleCellMultiModal"), "metadata.csv" 426 | ) 427 | -------------------------------------------------------------------------------- /inst/scripts/make-upload.R: -------------------------------------------------------------------------------- 1 | # upload files to AWS S3 2 | allextpat <- "\\.[Rr][Dd][AaSs]$|\\.[Mm][Tt][Xx]\\.[Gg][Zz]$|\\.[Hh]5$" 3 | 4 | .version_folder <- function(version) { 5 | paste0("v", version) 6 | } 7 | 8 | .getDataFiles <- function(directory = "~/data/scmm", 9 | dataDir = "mouse_gastrulation", pattern = allextpat, version = "1.0.0" 10 | ) { 11 | vfolder <- .version_folder(version) 12 | location <- file.path(directory, dataDir, vfolder) 13 | list.files( 14 | location, pattern = pattern, full.names = TRUE, recursive = FALSE 15 | ) 16 | } 17 | 18 | ## check files are listed 19 | .getDataFiles(dataDir = "pbmc_10x", version = "1.0.1") 20 | 21 | # IMPORTANT! 
22 | # Make sure that AWS_DEFAULT_REGION, AWS_ACCESS_KEY_ID, and 23 | # AWS_SECRET_ACCESS_KEY are set in the ~/.Renviron file 24 | 25 | # source("inst/scripts/make-metadata.R") 26 | 27 | upload_aws <- function( 28 | DataType, directory = "~/data/scmm", upload = FALSE, 29 | fileExt = allextpat, version = "1.0.0" 30 | ) { 31 | if (missing(DataType)) 32 | stop("Enter a 'DataType' folder") 33 | datafilepaths <- .getDataFiles( 34 | directory = directory, dataDir = DataType, 35 | pattern = fileExt, version = version 36 | ) 37 | vfolder <- .version_folder(version) 38 | bucketLocation <- 39 | file.path("experimenthub", "SingleCellMultiModal", DataType, vfolder) 40 | if (!upload) 41 | message("Data NOT uploaded") 42 | if (upload) 43 | AnnotationHubData:::upload_to_S3(file = datafilepaths, 44 | remotename = basename(datafilepaths), 45 | bucket = bucketLocation) 46 | else 47 | file.path("s3:/", bucketLocation, basename(datafilepaths)) 48 | } 49 | 50 | # upload_aws(DataType = "mouse_gastrulation", version = "1.0.0", upload=TRUE) 51 | # upload_aws(DataType = "mouse_gastrulation", version = "2.0.0", upload=TRUE) 52 | # upload_aws(DataType = "cord_blood", directory="CITEseq", version = "1.0.0", 53 | # upload=TRUE) 54 | # upload_aws(DataType = "mouse_visual_cortex", upload=TRUE) 55 | # upload_aws(DataType = "pbmc_10x", directory = "~/data/scmm", 56 | # version = "1.0.0", upload = TRUE) 57 | # upload_aws(DataType = "mouse_embryo_8_cell", directory = "~/data/scmm", 58 | # version = "1.0.0", upload = TRUE) 59 | # upload_aws(DataType = "pbmc_10x", directory = "~/data/scmm", 60 | # version = "1.0.1", upload = TRUE) 61 | 62 | -------------------------------------------------------------------------------- /inst/scripts/make_docu.R: -------------------------------------------------------------------------------- 1 | # version 1 2 | scmeta <- data.frame( 3 | DataProvider = "Dept. 
of Bioinformatics, The Babraham Institute, United Kingdom", 4 | TaxonomyId = "10090", 5 | Species = "Mus musculus", 6 | SourceUrl = "https://cloudstor.aarnet.edu.au/plus/s/Xzf5vCgAEUVgbfQ", 7 | SourceType = "RDS", 8 | SourceVersion = "1.0.0", 9 | DataType = "mouse_gastrulation", 10 | Maintainer = "Ricard Argelaguet ", 11 | stringsAsFactors = FALSE 12 | ) 13 | write.csv( 14 | scmeta, 15 | file = "inst/extdata/docuData/singlecellmultimodalv1.csv", 16 | row.names = FALSE 17 | ) 18 | 19 | # version 2 20 | scmeta <- data.frame( 21 | DataProvider = 22 | "Dept. of Bioinformatics, The Babraham Institute, United Kingdom", 23 | TaxonomyId = "10090", 24 | Species = "Mus musculus", 25 | SourceUrl = "https://cloudstor.aarnet.edu.au/plus/s/Xzf5vCgAEUVgbfQ", 26 | SourceType = "RDS", 27 | SourceVersion = c("1.0.0", "2.0.0"), 28 | DataType = "mouse_gastrulation", 29 | Maintainer = "Ricard Argelaguet ", 30 | stringsAsFactors = FALSE 31 | ) 32 | write.csv( 33 | scmeta, 34 | file = "inst/extdata/docuData/singlecellmultimodalv2.csv", 35 | row.names = FALSE 36 | ) 37 | 38 | # version 3 with spatial 39 | scmeta <- data.frame( 40 | DataProvider = c( 41 | rep("Dept. of Bioinformatics, The Babraham Institute, United Kingdom", 2), 42 | rep("Dept. 
of Molecular Genetics, Allen Institute for Brain Science, United States", 2) 43 | ), 44 | TaxonomyId = "10090", 45 | Species = "Mus musculus", 46 | SourceUrl = c( 47 | rep("https://cloudstor.aarnet.edu.au/plus/s/Xzf5vCgAEUVgbfQ", 2), 48 | "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE71585", 49 | "https://www.dropbox.com/sh/avj4nrd4la5i88u/AACafWwBbE-xsLvOGDwRZDpYa?dl=0" 50 | ), 51 | SourceType = c("RDS", "RDS", "TXT", "TXT"), 52 | SourceVersion = c("1.0.0", "2.0.0", "1.0.0", "2.0.0"), 53 | DataType = c(rep("mouse_gastrulation", 2), rep("mouse_visual_cortex", 2)), 54 | Maintainer = c(rep("Ricard Argelaguet ", 2), 55 | rep("Dario Righelli ", 2)), 56 | stringsAsFactors = FALSE 57 | ) 58 | write.csv( 59 | scmeta, 60 | file = "inst/extdata/docuData/singlecellmultimodalv3.csv", 61 | row.names = FALSE 62 | ) 63 | 64 | 65 | # version 4 with cord_blood 66 | scmeta <- data.frame( 67 | DataProvider = c( 68 | rep("Dept. of Bioinformatics, The Babraham Institute, United Kingdom", 2), 69 | rep("Dept. 
of Molecular Genetics, Allen Institute for Brain Science, United States", 2), 70 | "Innovation Lab, New York Genome Center, New York, United States" 71 | ), 72 | TaxonomyId = c(rep("10090",4), "9606"), 73 | Species = c(rep("Mus musculus", 4), "Homo sapiens"), 74 | SourceUrl = c( 75 | rep("https://cloudstor.aarnet.edu.au/plus/s/Xzf5vCgAEUVgbfQ", 2), 76 | rep("https://www.dropbox.com/sh/avj4nrd4la5i88u/AACafWwBbE-xsLvOGDwRZDpYa?dl=0", 2), 77 | "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE100866" 78 | ), 79 | SourceType = c(rep("RDS", 2), rep("TXT",3)), 80 | SourceVersion = c("1.0.0", "2.0.0", "1.0.0", "2.0.0", "1.0.0"), 81 | DataType = c(rep("mouse_gastrulation", 2), rep("mouse_visual_cortex",2), "cord_blood"), 82 | Maintainer = c(rep("Ricard Argelaguet ", 2), 83 | rep("Dario Righelli ",3)), 84 | stringsAsFactors = FALSE 85 | ) 86 | ## written to the v4 file so the version 3 table produced earlier in this script is not overwritten 87 | write.csv( 88 | scmeta, 89 | file = "inst/extdata/docuData/singlecellmultimodalv4.csv", 90 | row.names = FALSE 91 | ) 92 | 93 | # indv cord_blood 94 | citeseqmeta <- data.frame( 95 | DataProvider = 96 | "Innovation Lab, New York Genome Center, New York, United States", 97 | TaxonomyId = "9606", 98 | Species = "Homo sapiens", 99 | SourceUrl = "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE100866", 100 | SourceType = "TXT", 101 | SourceVersion = "1.0.0", 102 | DataType = "cord_blood", 103 | Maintainer = "Dario Righelli ", 104 | stringsAsFactors = FALSE 105 | ) 106 | 107 | write.csv( 108 | citeseqmeta, 109 | file = "inst/extdata/docuData/singlecellmultimodalv5.csv", 110 | row.names = FALSE 111 | ) 112 | # 113 | # 114 | # # version 2 with spatial 115 | # scmeta <- data.frame( 116 | # DataProvider = c( 117 | # rep("Dept. of Bioinformatics, The Babraham Institute, United Kingdom", 2), 118 | # rep("Dept. 
of Molecular Genetics, Allen Institute for Brain Science, United States", 2), 119 | # "Innovation Lab, New York Genome Center, New York, United States" 120 | # ), 121 | # TaxonomyId = c(rep("10090",4), "9606"), 122 | # Species = c(rep("Mus musculus", 4), "Homo sapiens"), 123 | # SourceUrl = c( 124 | # rep("https://cloudstor.aarnet.edu.au/plus/s/Xzf5vCgAEUVgbfQ", 2), 125 | # rep("https://www.dropbox.com/sh/avj4nrd4la5i88u/AACafWwBbE-xsLvOGDwRZDpYa?dl=0", 2), 126 | # "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE100866" 127 | # ), 128 | # SourceType = c(rep("RDS", 2), rep("TXT",3)), 129 | # SourceVersion = c("1.0.0", "2.0.0", "1.0.0", "2.0.0", "1.0.0"), 130 | # DataType = c(rep("mouse_gastrulation", 2), rep("mouse_visual_cortex",2), "coord_blood"), 131 | # Maintainer = c(rep("Marcel Ramos ", 2), 132 | # rep("Dario Righelli ",3)), 133 | # stringsAsFactors = FALSE 134 | # ) 135 | # write.csv( 136 | # scmeta, 137 | # file = "inst/extdata/docuData/singlecellmultimodalv3.csv", 138 | # row.names = FALSE 139 | # ) 140 | 141 | # version 5 pbmc 142 | scmeta <- data.frame( 143 | DataProvider = "European Bioinformatics Institute (EMBL-EBI), United Kingdom", 144 | TaxonomyId = "9606", 145 | Species = "Homo sapiens", 146 | SourceUrl = "http://ftp.ebi.ac.uk/pub/databases/mofa/10x_rna_atac_vignette/filtered_feature_bc_matrix/", 147 | SourceVersion = "1.0.0", 148 | DataType = "pbmc_10x", 149 | Maintainer = "Ricard Argelaguet ", 150 | stringsAsFactors = FALSE 151 | ) 152 | 153 | write.csv( 154 | scmeta, 155 | file = "inst/extdata/docuData/singlecellmultimodalv6.csv", 156 | row.names = FALSE 157 | ) 158 | 159 | ## version 7: creating metadata for the SCoPE2 dataset 160 | scope2meta <- data.frame( 161 | DataProvider = paste0("Slavov Laboratory and SCP Center at ", 162 | "Northeastern University, Boston, United ", 163 | "states"), 164 | TaxonomyId = "9606", 165 | Species = "Homo sapiens", 166 | SourceUrl = 
c("https://drive.google.com/file/d/1sF5STkofF_f2msnYaaYdWabou84Qf2Xr/view?usp=sharing", 167 | "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE142392", 168 | "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE142392"), 169 | SourceType = c("CSV", "CSV", "CSV"), 170 | SourceVersion = "1.0.0", 171 | DataType = "macrophage_differentiation", 172 | Maintainer = "Christophe Vanderaa ", 173 | stringsAsFactors = FALSE 174 | ) 175 | 176 | write.csv( 177 | scope2meta, 178 | file = "inst/extdata/docuData/singlecellmultimodalv7.csv", 179 | row.names = FALSE 180 | ) 181 | 182 | # version 8: GTseq dataset 183 | gtseq <- data.frame( 184 | DataProvider = "Wellcome Trust Sanger Institute, Cambridge, United Kingdom", 185 | TaxonomyId = "10090", 186 | Species = "Mus musculus", 187 | SourceUrl = "https://www.ebi.ac.uk/ena/browser/view/PRJEB9051", 188 | SourceVersion = "1.0.0", 189 | DataType = "mouse_embryo_8_cell", 190 | Maintainer = "Ludwig Geistlinger ", 191 | stringsAsFactors = FALSE 192 | ) 193 | 194 | write.csv( 195 | gtseq, 196 | file = "inst/extdata/docuData/singlecellmultimodalv8.csv", 197 | row.names = FALSE 198 | ) 199 | 200 | 201 | scmeta9 <- data.frame( 202 | DataProvider = "European Bioinformatics Institute (EMBL-EBI), United Kingdom", 203 | TaxonomyId = "9606", 204 | Species = "Homo sapiens", 205 | SourceUrl = "http://ftp.ebi.ac.uk/pub/databases/mofa/10x_rna_atac_vignette/filtered_feature_bc_matrix/", 206 | SourceVersion = "1.0.1", 207 | DataType = "pbmc_10x", 208 | Maintainer = "Marcel Ramos ", 209 | stringsAsFactors = FALSE 210 | ) 211 | 212 | write.csv( 213 | scmeta9, 214 | file = "inst/extdata/docuData/singlecellmultimodalv9.csv", 215 | row.names = FALSE 216 | ) 217 | 218 | -------------------------------------------------------------------------------- /inst/scripts/ontomap_update.R: -------------------------------------------------------------------------------- 1 | ## read in 2 | onto <- readr::read_tsv("inst/extdata/ontomap.tsv") 3 | 4 | ## 
modification 5 | onto <- as.data.frame(onto) 6 | onto[onto$DataType == "macrophage_differentiation_protein", "DataType"] <- 7 | "macrophage_differentiation" 8 | 9 | ## output checking 10 | stopifnot( 11 | identical(length(unique(onto[["DataType"]])), 4L) 12 | ) 13 | 14 | ## writing 15 | write.table( 16 | x = onto, file = "inst/extdata/ontomap.tsv", 17 | quote = FALSE, sep = "\t", row.names = FALSE 18 | ) 19 | 20 | ## reading ontology terms from kelly ontomap based on an ontomap old version 21 | cellontokelly <- as.data.frame( 22 | readr::read_tsv("~/Downloads/Cell type ontology - Sheet2.tsv") 23 | ) 24 | onto <- as.data.frame( 25 | readr::read_tsv("inst/extdata/ontomap.tsv") 26 | ) 27 | 28 | ## removing repetitive rows for seqFISH/scRNAseq celltypes 29 | ## aligning with newer version of ontomap 30 | cellontokelly <- 31 | cellontokelly[!cellontokelly$dataset_name=="mouse_visual_cortex_scRNAseq",] 32 | ontokey <- paste0(onto$DataType,"_",onto$function_name) 33 | ontokey <- 34 | gsub("SCoPE2", "protein_SCoPE2", gsub("scMultiome", "multiome", ontokey)) 35 | ontokey <- paste0(ontokey, "_", onto$original_cell_name) 36 | kellykey <- 37 | paste0(cellontokelly$dataset_name, "_", cellontokelly$original_cell_name) 38 | ## reordering 39 | cellontokelly <- cellontokelly[match(ontokey, kellykey),] 40 | 41 | onto$ontology_ID <- cellontokelly$ontology_ID 42 | onto$ontology_cell_name <- cellontokelly$ontology_cell_name 43 | 44 | ## writing 45 | write.table( 46 | x = onto, file = "inst/extdata/ontomap.tsv", 47 | quote = FALSE, sep = "\t", row.names = FALSE 48 | ) 49 | 50 | 51 | ## adding celltypes for cord_blood citeseq 52 | ## 53 | load("cord_blood/v1.0.0/coldata_scRNAseq.rda") 54 | cd <- coldata_scRNAseq 55 | ct <- unique(cd$celltype) 56 | ct <- ct[!is.na(ct)] 57 | ## logical indexing on purpose: ct[-which(is.na(ct))] drops every value when no NA is present 58 | onto <- as.data.frame( 59 | readr::read_tsv("inst/extdata/ontomap.tsv") 60 | ) 61 | 62 | cn <- colnames(onto) 63 | ctcb <- data.frame("cord_blood", "CITEseq", "celltype", ct, NA, NA) 64 | colnames(ctcb) <- 
cn 65 | onton <- rbind.data.frame(onto, ctcb) 66 | write.table( 67 | x = onton, file = "inst/extdata/ontomap.tsv", 68 | quote = FALSE, sep = "\t", row.names = FALSE 69 | ) 70 | 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /inst/scripts/update_wiki.sh: -------------------------------------------------------------------------------- 1 | SCMM="$HOME/gh/SingleCellMultiModal" 2 | 3 | WIKI="$HOME/wiki/SingleCellMultiModal.wiki" 4 | 5 | RVER="devel" 6 | 7 | cd "$SCMM" || exit 1 8 | 9 | export R_LIBS_USER="/media/$USER/1D24A0EA4286043C1/bioc-$RVER/" 10 | 11 | RCMD="$HOME/src/svn/r-$RVER/R/bin/R --no-save --no-restore-data" 12 | 13 | $RCMD CMD INSTALL "$SCMM" 14 | 15 | $RCMD -e "rmarkdown::render('inst/scripts/Contributing-Guidelines.Rmd', output_file = '$WIKI/Contributing-Guidelines.md')" 16 | # stop if the wiki checkout is missing: the git commands below must never run inside the package repository 17 | cd "$WIKI" || exit 1 18 | 19 | git diff 20 | 21 | git pull origin master 22 | git commit -am "update wiki" 23 | git push origin master 24 | 25 | -------------------------------------------------------------------------------- /man/CITEseq.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/CITEseq.R 3 | \name{CITEseq} 4 | \alias{CITEseq} 5 | \title{CITEseq} 6 | \usage{ 7 | CITEseq( 8 | DataType = c("cord_blood", "peripheral_blood"), 9 | modes = "*", 10 | version = "1.0.0", 11 | dry.run = TRUE, 12 | filtered = FALSE, 13 | verbose = TRUE, 14 | DataClass = c("MultiAssayExperiment", "SingleCellExperiment"), 15 | ... 16 | ) 17 | } 18 | \arguments{ 19 | \item{DataType}{\code{character(1)} indicating the identifier of the dataset to 20 | retrieve. 
(default "cord_blood")} 21 | 22 | \item{modes}{\code{character()} The assay types or modes of data to obtain; these 23 | include scADT and scRNA-seq data by default.} 24 | 25 | \item{version}{\code{character(1)} Either version '1.0.0' depending on 26 | data version required.} 27 | 28 | \item{dry.run}{\code{logical(1)} Whether to return the dataset names before actual 29 | download (default \code{TRUE})} 30 | 31 | \item{filtered}{\code{logical(1)} indicating if the returned dataset needs to 32 | have filtered cells. 33 | See Details for additional information about the filtering process.} 34 | 35 | \item{verbose}{\code{logical(1)} Whether to show the dataset currently being 36 | (down)loaded (default \code{TRUE})} 37 | 38 | \item{DataClass}{either MultiAssayExperiment or SingleCellExperiment 39 | data classes can be returned (default MultiAssayExperiment)} 40 | 41 | \item{...}{Additional arguments passed on to the 42 | \link[ExperimentHub]{ExperimentHub-class} constructor} 43 | } 44 | \value{ 45 | A single cell multi-modal 46 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}} 47 | or informative \code{data.frame} when \code{dry.run} is \code{TRUE}. When \code{DataClass} is 48 | \code{SingleCellExperiment} an object of this class is returned with an RNA 49 | assay as main experiment and other assay(s) as \code{AltExp(s)}. 50 | } 51 | \description{ 52 | This function assembles data on-the-fly from \code{ExperimentHub} to 53 | provide a 54 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}} 55 | container. The \code{DataType} argument provides access to the 56 | available datasets associated with the package. 57 | } 58 | \details{ 59 | CITEseq data are a combination of single cell transcriptomics and 60 | about a hundred cell surface proteins. 61 | Available datasets are: 62 | \itemize{ 63 | \item cord_blood: a dataset of single cells of cord blood as 64 | provided in Stoeckius et al. (2017). 
65 | \itemize{ 66 | \item scRNA_Counts - Stoeckius scRNA-seq gene count matrix 67 | \item scADT - Stoeckius antibody-derived tags (ADT) data 68 | } 69 | \item peripheral_blood: a dataset of single cells of peripheral 70 | blood as provided in Mimitou et al. (2019). We provide two different 71 | conditions: controls (CTRL) and Cutaneous T-cell Lymphoma (CTCL). Just build 72 | appropriate \code{modes} regex for subselecting the dataset modes. 73 | \itemize{ 74 | \item scRNA - Mimitou scRNA-seq gene count matrix 75 | \item scADT - Mimitou antibody-derived tags (ADT) data 76 | \item scHTO - Mimitou Hashtag Oligo (HTO) data 77 | \item TCRab - Mimitou T-cell Receptors (TCR) alpha and beta 78 | available through the object metadata. 79 | \item TCRgd - Mimitou T-cell Receptors (TCR) gamma and delta 80 | available through the object metadata. 81 | } 82 | } 83 | 84 | If the \code{filtered} parameter is \code{FALSE} (default), the \code{colData} of the returned 85 | object contains multiple columns of \code{logicals} indicating the cells to be 86 | discarded. 87 | In case \code{filtered} is \code{TRUE}, the \code{discard} column is used to filter the 88 | cells. 89 | Column \code{adt.discard} indicates the cells to be discarded computed on the ADT 90 | assay. 91 | Column \code{mito.discard} indicates the cells to be discarded computed on the 92 | RNA assay and mitochondrial genes. 93 | Column \code{discard} combines the previous columns with an \code{OR} operator. 94 | Note that for the \code{peripheral_blood} dataset these three columns are 95 | computed and returned separately for the \code{CTCL} and \code{CTRL} conditions. 96 | In this case the additional \code{discard} column combines the \code{discard.CTCL} and 97 | \code{discard.CTRL} columns with an \code{OR} operator. 98 | Cell filtering has been computed for \code{cord_blood} and \code{peripheral_blood} 99 | datasets following section 12.3 of the Advanced Single-Cell Analysis with 100 | Bioconductor book. 
101 | Executed code can be retrieved in the CITEseq_filtering.R script of this 102 | package. 103 | } 104 | \examples{ 105 | 106 | mae <- CITEseq(DataType="cord_blood", dry.run=FALSE) 107 | experiments(mae) 108 | } 109 | \references{ 110 | Stoeckius et al. (2017), Mimitou et al. (2019) 111 | } 112 | \author{ 113 | Dario Righelli 114 | } 115 | -------------------------------------------------------------------------------- /man/GTseq.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/GTseq.R 3 | \name{GTseq} 4 | \alias{GTseq} 5 | \title{Parallel sequencing data of single-cell genomes and transcriptomes} 6 | \source{ 7 | \url{https://www.ebi.ac.uk/ena/browser/view/PRJEB9051} 8 | } 9 | \usage{ 10 | GTseq( 11 | DataType = "mouse_embryo_8_cell", 12 | modes = "*", 13 | version = "1.0.0", 14 | dry.run = TRUE, 15 | verbose = TRUE, 16 | ... 17 | ) 18 | } 19 | \arguments{ 20 | \item{DataType}{\code{character(1)} Indicates study that produces this type of 21 | data (default: 'mouse_embryo_8_cell')} 22 | 23 | \item{modes}{\code{character()} A wildcard / glob pattern of modes, such as 24 | \code{"*omic"}. 
A wildcard of \code{"*"} will return all modes including 25 | copy numbers ("genomic") and RNA-seq read counts ("transcriptomic"), 26 | which is the default.} 27 | 28 | \item{version}{\code{character(1)} Currently, only version '1.0.0'.} 29 | 30 | \item{dry.run}{\code{logical(1)} Whether to return the dataset names before actual 31 | download (default \code{TRUE})} 32 | 33 | \item{verbose}{\code{logical(1)} Whether to show the dataset currently being 34 | (down)loaded (default \code{TRUE})} 35 | 36 | \item{...}{Additional arguments passed on to the 37 | \link[ExperimentHub:ExperimentHub-class]{ExperimentHub} constructor} 38 | } 39 | \value{ 40 | A single cell multi-modal 41 | \link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment} or 42 | informative \code{data.frame} when \code{dry.run} is \code{TRUE} 43 | } 44 | \description{ 45 | GTseq assembles data on-the-fly from \code{ExperimentHub} to provide 46 | a 47 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}} 48 | container. The \code{DataType} argument provides access to the 49 | \code{mouse_embryo_8_cell} dataset as obtained from Macaulay et al. (2015). 50 | Protocol information for this dataset is available from Macaulay et al. 51 | (2016). See references. 52 | } 53 | \details{ 54 | G&T-seq is a combination of Picoplex amplified gDNA sequencing 55 | (genome) and SMARTSeq2 amplified cDNA sequencing (transcriptome) of the 56 | same cell. For more information, see Macaulay et al. (2015). 57 | * mouse_embryo_8_cell: 58 | this dataset was filtered for bad cells as specified in Macaulay 59 | et al. (2015). 60 | * genomic - integer copy numbers as detected from scDNA-seq 61 | * transcriptomic - raw read counts as quantified from scRNA-seq 62 | } 63 | \section{metadata}{ 64 | 65 | The \code{MultiAssayExperiment} metadata includes the original function call 66 | that saves the function call and the data version requested. 
67 | } 68 | 69 | \examples{ 70 | 71 | GTseq() 72 | 73 | } 74 | \references{ 75 | Macaulay et al. (2015) G&T-seq: parallel sequencing of single-cell 76 | genomes and transcriptomes. Nat Methods, 12:519–22. 77 | 78 | Macaulay et al. (2016) Separation and parallel sequencing of the genomes 79 | and transcriptomes of single cells using G&T-seq. Nat Protoc, 11:2081–103. 80 | } 81 | \seealso{ 82 | SingleCellMultiModal-package 83 | } 84 | -------------------------------------------------------------------------------- /man/SCoPE2.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/SCoPE2.R 3 | \name{SCoPE2} 4 | \alias{SCoPE2} 5 | \title{Single-cell RNA sequencing and proteomics} 6 | \source{ 7 | All files are linked from the slavovlab website 8 | \url{https://scope2.slavovlab.net/docs/data} 9 | } 10 | \usage{ 11 | SCoPE2( 12 | DataType = "macrophage_differentiation", 13 | modes = "*", 14 | version = "1.0.0", 15 | dry.run = TRUE, 16 | verbose = TRUE, 17 | ... 18 | ) 19 | } 20 | \arguments{ 21 | \item{DataType}{\code{character(1)} Indicates study that produces this type of 22 | data (default: 'macrophage_differentiation')} 23 | 24 | \item{modes}{\code{character()} A wildcard / glob pattern of modes, such as 25 | \code{"rna"}. 
A wildcard of \code{"*"} will return all modes, that are 26 | transcriptome ("rna") or proteome ("protein") which is the 27 | default.} 28 | 29 | \item{version}{\code{character(1)}, currently only version '1.0.0' is 30 | available} 31 | 32 | \item{dry.run}{\code{logical(1)} Whether to return the dataset names before actual 33 | download (default \code{TRUE})} 34 | 35 | \item{verbose}{\code{logical(1)} Whether to show the dataset currently being 36 | (down)loaded (default \code{TRUE})} 37 | 38 | \item{...}{Additional arguments passed on to the 39 | \link[ExperimentHub]{ExperimentHub-class} constructor} 40 | } 41 | \value{ 42 | A single cell multi-modal 43 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}} 44 | or informative \code{data.frame} when \code{dry.run} is \code{TRUE} 45 | } 46 | \description{ 47 | SCoPE2 assembles data on-the-fly from \code{ExperimentHub} to provide 48 | a 49 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}} 50 | container. The \code{DataType} argument provides access to the \code{SCoPE2} dataset 51 | as provided by Specht et al. (2020; DOI: 52 | \url{http://dx.doi.org/10.1101/665307}). The article provides more information 53 | about the data acquisition and pre-processing. 54 | } 55 | \details{ 56 | The SCoPE2 study combines scRNA-seq (transcriptome) and 57 | single-cell proteomics. 58 | \itemize{ 59 | \item macrophage_differentiation: the cells are monocytes that undergo 60 | macrophage differentiation. No annotation is available for the 61 | transcriptome data, but batch and cell type annotations are 62 | available for the proteomics data in the \code{celltype} \code{colData} column. 63 | The transcriptomics and proteomics data were not measured from the same 64 | cells but from a distinct set of cell cultures. 65 | This dataset provides already filtered bad quality cells. 
66 | \itemize{ 67 | \item scRNAseq1 - single-cell transcriptome (batch 1) 68 | \item scRNAseq2 - single-cell transcriptome (batch 2) 69 | \item scp - single-cell proteomics 70 | } 71 | } 72 | } 73 | \examples{ 74 | 75 | SCoPE2(DataType = "macrophage_differentiation", 76 | modes = "*", 77 | version = "1.0.0", 78 | dry.run = TRUE) 79 | 80 | } 81 | \references{ 82 | Specht, Harrison, Edward Emmott, Aleksandra A. Petelski, R. 83 | Gray Huffman, David H. Perlman, Marco Serra, Peter Kharchenko, 84 | Antonius Koller, and Nikolai Slavov. 2020. “Single-Cell 85 | Proteomic and Transcriptomic Analysis of Macrophage 86 | Heterogeneity.” bioRxiv. https://doi.org/10.1101/665307. 87 | } 88 | \seealso{ 89 | SingleCellMultiModal-package 90 | } 91 | -------------------------------------------------------------------------------- /man/SingleCellMultiModal-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/SingleCellMultiModal-package.R 3 | \docType{package} 4 | \name{SingleCellMultiModal-package} 5 | \alias{SingleCellMultiModal-package} 6 | \title{SingleCellMultiModal-package} 7 | \description{ 8 | The SingleCellMultiModal package provides a convenient and user-friendly 9 | representation of multi-modal data from projects such as \code{scNMT} for mouse 10 | gastrulation. 
11 | } 12 | \examples{ 13 | help(package = "SingleCellMultiModal") 14 | 15 | } 16 | \seealso{ 17 | Useful links: 18 | \itemize{ 19 | \item Report bugs at \url{https://github.com/waldronlab/SingleCellMultiModal/issues} 20 | } 21 | 22 | } 23 | \author{ 24 | \strong{Maintainer}: Marcel Ramos \email{marcel.ramos@roswellpark.org} (\href{https://orcid.org/0000-0002-3242-0582}{ORCID}) 25 | 26 | Authors: 27 | \itemize{ 28 | \item Ricard Argelaguet \email{ricard@ebi.ac.uk} 29 | \item Dario Righelli \email{dario.righelli@gmail.com} 30 | \item Kelly Eckenrode \email{kelly.eckenrode@sph.cuny.edu} 31 | \item Ludwig Geistlinger \email{ludwig_geistlinger@hms.harvard.edu} 32 | \item Levi Waldron \email{lwaldron.research@gmail.com} 33 | } 34 | 35 | Other contributors: 36 | \itemize{ 37 | \item Al Abadi [contributor] 38 | \item Christophe Vanderaa \email{christophe.vanderaa@uclouvain.be} [contributor] 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /man/SingleCellMultiModal.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/SingleCellMultiModal.R 3 | \name{SingleCellMultiModal} 4 | \alias{SingleCellMultiModal} 5 | \title{Combining Modalities into one MultiAssayExperiment} 6 | \usage{ 7 | SingleCellMultiModal( 8 | DataTypes, 9 | modes = "*", 10 | versions = "1.0.0", 11 | dry.run = TRUE, 12 | verbose = TRUE, 13 | ... 14 | ) 15 | } 16 | \arguments{ 17 | \item{DataTypes}{\code{character()} A vector of data types as indicated in each 18 | individual function by the \code{DataType} parameter. 
These can be any of 19 | the following: "mouse_gastrulation", "pbmc_10x", 20 | "macrophage_differentiation", "cord_blood", "peripheral_blood", 21 | "mouse_visual_cortex", "mouse_embryo_8_cell"} 22 | 23 | \item{modes}{list() A list or CharacterList of modes for each data type 24 | where each element corresponds to one data type.} 25 | 26 | \item{versions}{\code{character()} A vector of versions for each DataType. By 27 | default, version \verb{1.0.0} is obtained for all data types.} 28 | 29 | \item{dry.run}{\code{logical(1)} Whether to return the dataset names before actual 30 | download (default \code{TRUE})} 31 | 32 | \item{verbose}{\code{logical(1)} Whether to show the dataset currently being 33 | (down)loaded (default \code{TRUE})} 34 | 35 | \item{...}{Additional arguments passed on to the 36 | \link[ExperimentHub]{ExperimentHub-class} constructor} 37 | } 38 | \value{ 39 | A multi-modality \code{MultiAssayExperiment} 40 | } 41 | \description{ 42 | Combine multiple single cell modalities into one using the input of the 43 | individual functions. 44 | } 45 | \section{metadata}{ 46 | 47 | The metadata in the \code{MultiAssayExperiment} contains the original 48 | function call used to generate the object (labeled as \code{call}), 49 | a \code{call_map} which provides traceability of technology functions to 50 | \code{DataType} prefixes, and lastly, R version information as \code{version}. 
51 | } 52 | 53 | \examples{ 54 | 55 | SingleCellMultiModal(c("mouse_gastrulation", "pbmc_10x"), 56 | modes = list(c("acc*", "met*"), "rna"), 57 | version = c("2.0.0", "1.0.0"), dry.run = TRUE, verbose = TRUE 58 | ) 59 | 60 | } 61 | -------------------------------------------------------------------------------- /man/addCTLabels.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cellGating.R 3 | \name{addCTLabels} 4 | \alias{addCTLabels} 5 | \title{addCTLabels} 6 | \usage{ 7 | addCTLabels( 8 | cd, 9 | out, 10 | outname, 11 | ct, 12 | mkrcol = "markers", 13 | ctcol = "celltype", 14 | overwrite = FALSE, 15 | verbose = TRUE 16 | ) 17 | } 18 | \arguments{ 19 | \item{cd}{the \code{colData} \code{DataFrame}} 20 | 21 | \item{out}{list data structure returned by \code{getCellGroups}} 22 | 23 | \item{outname}{character indicating the name of the out data structure} 24 | 25 | \item{ct}{character indicating the celltype to assign in the \code{ctcol}} 26 | 27 | \item{mkrcol}{character indicating the cd column to store the markers 28 | indicated by \code{outname} (default is markers)} 29 | 30 | \item{ctcol}{character indicating the column in cd to store the cell type 31 | indicated by \code{ct} (default is celltype)} 32 | 33 | \item{overwrite}{logical indicating if the cell types have to be overwritten 34 | without checking if detected barcodes were already assigned to other celltypes} 35 | 36 | \item{verbose}{logical for having informative messages during the execution} 37 | } 38 | \value{ 39 | an updated version of the cd DataFrame 40 | } 41 | \description{ 42 | addCTLabels 43 | } 44 | -------------------------------------------------------------------------------- /man/dot-CITEseqMaeToSce.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in 
R/CITEseq.R 3 | \name{.CITEseqMaeToSce} 4 | \alias{.CITEseqMaeToSce} 5 | \title{CITEseqMaeToSce} 6 | \usage{ 7 | .CITEseqMaeToSce(mae) 8 | } 9 | \arguments{ 10 | \item{mae}{a MultiAssayExperiment object with scRNA and/or scADT and/or 11 | scHTO named experiments.} 12 | } 13 | \value{ 14 | a SingleCellExperiment object, as widely used, with scRNA data as counts 15 | and scADT, scHTO data as altExps. 16 | If only one modality is present, it is returned as the main assay of the SCE. 17 | } 18 | \description{ 19 | converts a \code{MultiAssayExperiment} object with CITEseq data into 20 | a \code{SingleCellExperiment} object to be used with already known methods and 21 | packages in literature. 22 | 23 | Note that for creating a \code{SingleCellExperiment} object this function 24 | subsets all the assays present in the \code{MultiAssayExperiment} with only the 25 | common cells across all the modalities. 26 | This could result in an incomplete object. 27 | } 28 | \keyword{internal} 29 | -------------------------------------------------------------------------------- /man/getCellGroups.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cellGating.R 3 | \name{getCellGroups} 4 | \alias{getCellGroups} 5 | \title{getCellGroups} 6 | \usage{ 7 | getCellGroups(mat, adt1 = "CD19", adt2 = "CD3", th1 = 0.2, th2 = 0) 8 | } 9 | \arguments{ 10 | \item{mat}{matrix of counts or clr transformed counts for ADT data in CITEseq} 11 | 12 | \item{adt1}{character indicating the name of the marker to plot on the x-axis 13 | (default is CD19).} 14 | 15 | \item{adt2}{character indicating the name of the marker to plot on the y-axis 16 | (default is CD3).} 17 | 18 | \item{th1}{numeric indicating the threshold for the marker on the x-axis 19 | (default is 0.2).} 20 | 21 | \item{th2}{numeric indicating the threshold for the marker on the y-axis 22 | (default is 0).} 23 | } 24 | 
\value{ 25 | a list of four different elements, each one indicating the quadrant 26 | where the thresholds divide the plotting space, in Euclidean order I, II, 27 | III, IV quadrant, indicating respectively +/+, +/-, -/+, -/- combinations 28 | for the couples of selected ADTs. 29 | Each element of the list contains two objects, one with the list of detected 30 | barcodes and one indicating the percentage of barcodes falling into that 31 | quadrant. 32 | . 33 | } 34 | \description{ 35 | Shows the cells/barcodes in two different plots (scatter and density) 36 | dividing the space into four quadrants indicated by the two thresholds given 37 | as input parameters. 38 | The x/y-axis represent respectively the two ADTs given as input. 39 | It returns a list of one element for each quadrant, each with barcodes and 40 | percentage (see Value section for details). 41 | } 42 | \details{ 43 | helps to do manual gating for cell type identification with CITEseq 44 | or similar data, providing cell markers. 45 | Once two interesting markers for a cell type have been identified, the user has to 46 | play with the thresholds to identify the cell populations specified by an 47 | uptake (+) or downtake (-) of the couple of markers (ADTs) previously selected. 48 | } 49 | -------------------------------------------------------------------------------- /man/ontomap.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ontomap.R 3 | \name{ontomap} 4 | \alias{ontomap} 5 | \title{Obtain a map of cell types for each dataset} 6 | \usage{ 7 | ontomap(dataset = c("scNMT", "scMultiome", "SCoPE2", "CITEseq", "seqFISH")) 8 | } 9 | \arguments{ 10 | \item{dataset}{\code{character()} One of the existing functions within the 11 | package. 
If missing, a map of all cell types in each function will 12 | be provided.} 13 | } 14 | \value{ 15 | A \code{data.frame} of metadata with cell types and ontologies 16 | } 17 | \description{ 18 | The \code{ontomap} function provides a mapping of all the cell names across 19 | all the data sets or for a specified data set. 20 | } 21 | \details{ 22 | Note that \code{CITEseq} does not have any cell annotations; therefore, no entries 23 | are present in the \code{ontomap}. 24 | } 25 | \examples{ 26 | 27 | ontomap(dataset = "scNMT") 28 | 29 | } 30 | -------------------------------------------------------------------------------- /man/scMultiome.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scMultiome.R 3 | \name{scMultiome} 4 | \alias{scMultiome} 5 | \title{Single-cell Multiome ATAC + Gene Expression} 6 | \usage{ 7 | scMultiome( 8 | DataType = "pbmc_10x", 9 | modes = "*", 10 | version = "1.0.0", 11 | format = c("MTX", "HDF5"), 12 | dry.run = TRUE, 13 | verbose = TRUE, 14 | ... 15 | ) 16 | } 17 | \arguments{ 18 | \item{DataType}{\code{character(1)} Indicates study that produces this type of 19 | data (default: 'pbmc_10x')} 20 | 21 | \item{modes}{\code{character()} A wildcard / glob pattern of modes, such as 22 | \code{"acc*"}. A wildcard of \code{"*"} will return all modes including 23 | Chromatin Accessibility ("acc"), Methylation ("met"), RNA-seq ("rna") 24 | which is the default.} 25 | 26 | \item{version}{\code{character(1)} Either version '1.0.0' or '2.0.0' depending on 27 | data version required (default '1.0.0'). 
See version section.} 28 | 29 | \item{format}{\code{character(1)} Either MTX or HDF5 data format (default MTX)} 30 | 31 | \item{dry.run}{\code{logical(1)} Whether to return the dataset names before actual 32 | download (default \code{TRUE})} 33 | 34 | \item{verbose}{\code{logical(1)} Whether to show the dataset currently being 35 | (down)loaded (default \code{TRUE})} 36 | 37 | \item{...}{Additional arguments passed on to the 38 | \link[ExperimentHub]{ExperimentHub-class} constructor} 39 | } 40 | \value{ 41 | A 10X PBMC \code{MultiAssayExperiment} object 42 | } 43 | \description{ 44 | 10x Genomics Multiome technology enables simultaneous profiling 45 | of the transcriptome (using 3’ gene expression) and epigenome 46 | (using ATAC-seq) from single cells to 47 | deepen our understanding of how genes are expressed and regulated across 48 | different cell types. Data prepared by Ricard Argelaguet. 49 | } 50 | \details{ 51 | Users are able to choose from either an \code{MTX} or \code{HDF5} file format 52 | as the internal data representation. The \code{MTX} (Matrix Market) format 53 | allows users to load a sparse \code{dgCMatrix} representation. Choosing \code{HDF5} 54 | gives users a sparse \code{HDF5Array} class object. 55 | * pbmc_10x: 10K Peripheral Blood Mononuclear Cells provided by 56 | \href{https://support.10xgenomics.com/single-cell-multiome-atac-gex/datasets}{10x Genomics website} 57 | Cell quality control filters are available in the object \code{colData} 58 | together with the \code{celltype} annotation labels. 
59 | } 60 | \examples{ 61 | 62 | scMultiome(DataType = "pbmc_10x", modes = "*", dry.run = TRUE) 63 | 64 | } 65 | -------------------------------------------------------------------------------- /man/scNMT.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scNMT.R 3 | \name{scNMT} 4 | \alias{scNMT} 5 | \title{Single-cell Nucleosome, Methylation and Transcription sequencing} 6 | \source{ 7 | \url{http://ftp.ebi.ac.uk/pub/databases/scnmt_gastrulation/} 8 | } 9 | \usage{ 10 | scNMT( 11 | DataType = "mouse_gastrulation", 12 | modes = "*", 13 | version = "1.0.0", 14 | dry.run = TRUE, 15 | verbose = TRUE, 16 | ... 17 | ) 18 | } 19 | \arguments{ 20 | \item{DataType}{\code{character(1)} Indicates study that produces this type of 21 | data (default: 'mouse_gastrulation')} 22 | 23 | \item{modes}{\code{character()} A wildcard / glob pattern of modes, such as 24 | \code{"acc*"}. A wildcard of \code{"*"} will return all modes including 25 | Chromatin Accessibility ("acc"), Methylation ("met"), RNA-seq ("rna") 26 | which is the default.} 27 | 28 | \item{version}{\code{character(1)} Either version '1.0.0' or '2.0.0' depending on 29 | data version required (default '1.0.0'). 
See version section.} 30 | 31 | \item{dry.run}{\code{logical(1)} Whether to return the dataset names before actual 32 | download (default \code{TRUE})} 33 | 34 | \item{verbose}{\code{logical(1)} Whether to show the dataset currently being 35 | (down)loaded (default \code{TRUE})} 36 | 37 | \item{...}{Additional arguments passed on to the 38 | \link[ExperimentHub]{ExperimentHub-class} constructor} 39 | } 40 | \value{ 41 | A single cell multi-modal 42 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}} 43 | or informative \code{data.frame} when \code{dry.run} is \code{TRUE} 44 | } 45 | \description{ 46 | scNMT assembles data on-the-fly from \code{ExperimentHub} to provide 47 | a 48 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}} 49 | container. The \code{DataType} argument provides access to the 50 | \code{mouse_gastrulation} dataset as obtained from Argelaguet et al. (2019; DOI: 51 | 10.1038/s41586-019-1825-8). Pre-processing code can be seen at 52 | \url{https://github.com/rargelaguet/scnmt_gastrulation}. Protocol 53 | information for this dataset is available at Clark et al. (2018). See the 54 | vignette for the full citation. 55 | } 56 | \details{ 57 | scNMT is a combination of RNA-seq (transcriptome) and an adaptation 58 | of Nucleosome Occupancy and Methylation sequencing (NOMe-seq, the 59 | methylome and chromatin accessibility) technologies. For more 60 | information, see Reik et al. (2018) DOI: 10.1038/s41467-018-03149-4 61 | \itemize{ 62 | \item mouse_gastrulation - this dataset provides cell quality control filters in 63 | the object \code{colData} starting from version 2.0.0. Additionally, cell types 64 | annotations are provided through the \code{lineage} \code{colData} column. 
65 | \itemize{ 66 | \item rna - RNA-seq 67 | \item acc_\* - chromatin accessibility 68 | \item met_\* - DNA methylation 69 | \itemize{ 70 | \item cgi - CpG islands 71 | \item CTCF - footprints of CTCF binding 72 | \item DHS - DNase Hypersensitive Sites 73 | \item genebody - gene bodies 74 | \item p300 - p300 binding sites 75 | \item promoter - gene promoters 76 | } 77 | } 78 | } 79 | 80 | Special thanks to Al J Abadi for preparing the published data in time 81 | for the 2020 BIRS Workshop, see the link here: 82 | \url{https://github.com/BIRSBiointegration/Hackathon/tree/master/scNMT-seq} 83 | } 84 | \section{versions}{ 85 | 86 | Version '1.0.0' of the scNMT mouse_gastrulation dataset includes all of 87 | the above mentioned assay technologies with filtering of cells based on 88 | quality control metrics. Version '2.0.0' contains all of the cells 89 | without the QC filter and does not contain CTCF binding footprints or 90 | p300 binding sites. 91 | } 92 | 93 | \section{metadata}{ 94 | 95 | The \code{MultiAssayExperiment} metadata includes the original function call 96 | that saves the function call and the data version requested. 97 | } 98 | 99 | \examples{ 100 | 101 | scNMT(DataType = "mouse_gastrulation", modes = "*", 102 | version = "1.0.0", dry.run = TRUE) 103 | 104 | } 105 | \references{ 106 | Argelaguet et al. (2019) 107 | } 108 | \seealso{ 109 | SingleCellMultiModal-package 110 | } 111 | -------------------------------------------------------------------------------- /man/scmmCache.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cache.R 3 | \name{scmmCache} 4 | \alias{scmmCache} 5 | \alias{setCache} 6 | \alias{removeCache} 7 | \title{Manage cache / download directories for study data} 8 | \usage{ 9 | scmmCache(...) 
10 | 11 | setCache( 12 | directory = tools::R_user_dir("SingleCellMultiModal", "cache"), 13 | verbose = TRUE, 14 | ask = interactive() 15 | ) 16 | 17 | removeCache(accession) 18 | } 19 | \arguments{ 20 | \item{...}{For \code{scmmCache}, arguments passed to \code{setCache}} 21 | 22 | \item{directory}{\code{character(1)} The file location where the cache is located. 23 | Once set, future downloads will go to this folder. See \code{setCache} section 24 | for details.} 25 | 26 | \item{verbose}{Whether to print descriptive messages} 27 | 28 | \item{ask}{\code{logical(1)} (default TRUE when \code{interactive()}) Confirm the file 29 | location of the cache directory} 30 | 31 | \item{accession}{\code{character(1)} A single string indicating the accession number 32 | of the study} 33 | } 34 | \value{ 35 | The directory / option of the cache location 36 | } 37 | \description{ 38 | Managing data downloads is important to save disk space and 39 | avoid re-downloading data files. This can be done effortlessly via the integrated 40 | \code{BiocFileCache} system. 41 | } 42 | \section{scmmCache}{ 43 | 44 | Get the directory location of the cache. It will prompt the user to create 45 | a cache if not already created. A specific directory can be used via 46 | \code{setCache}. 47 | } 48 | 49 | \section{setCache}{ 50 | 51 | Specify the directory location of the data cache. By default, it will 52 | go into the user's home and package name directory as given by 53 | \link[tools:userdir]{R_user_dir} (default: varies by system e.g., for Linux: 54 | '$HOME/.cache/R/SingleCellMultiModal'). 55 | } 56 | 57 | \section{removeCache}{ 58 | 59 | Some files may become corrupt when downloading; this function allows 60 | the user to delete the tarball associated with a study number in the 61 | cache. 
62 | } 63 | 64 | \examples{ 65 | getOption("scmmCache") 66 | scmmCache() 67 | 68 | } 69 | -------------------------------------------------------------------------------- /man/seqFISH.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/seqFISH.R 3 | \name{seqFISH} 4 | \alias{seqFISH} 5 | \title{Single-cell spatial + Gene Expression} 6 | \usage{ 7 | seqFISH( 8 | DataType = "mouse_visual_cortex", 9 | modes = "*", 10 | version, 11 | dry.run = TRUE, 12 | verbose = TRUE, 13 | ... 14 | ) 15 | } 16 | \arguments{ 17 | \item{DataType}{\code{character(1)} indicating the identifier of the dataset to 18 | retrieve. (default "mouse_visual_cortex")} 19 | 20 | \item{modes}{\code{character()} The assay types or modes of data to obtain these 21 | include seq-FISH and scRNA-seq data by default.} 22 | 23 | \item{version}{\code{character(1)} Either version '1.0.0' or '2.0.0' depending on 24 | data version required (default '1.0.0'). See version section.} 25 | 26 | \item{dry.run}{\code{logical(1)} Whether to return the dataset names before actual 27 | download (default \code{TRUE})} 28 | 29 | \item{verbose}{\code{logical(1)} Whether to show the dataset currently being 30 | (down)loaded (default \code{TRUE})} 31 | 32 | \item{...}{Additional arguments passed on to the 33 | \link[ExperimentHub]{ExperimentHub-class} constructor} 34 | } 35 | \value{ 36 | A 37 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}} 38 | of seq-FISH data 39 | } 40 | \description{ 41 | seqFISH function assembles data on-the-fly from \code{ExperimentHub} 42 | to provide a 43 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}} 44 | container. Actually the \code{DataType} argument provides access to the 45 | available datasets associated to the package. 
46 | } 47 | \details{ 48 | seq FISH data are a combination of single cell spatial coordinates 49 | and transcriptomics for a few hundreds of genes. seq-FISH data can be 50 | combined for example with scRNA-seq data to unveil multiple aspects of 51 | cellular behaviour based on their spatial organization and transcription. 52 | 53 | Available datasets are: 54 | \itemize{ 55 | \item mouse_visual_cortex: combination of seq-FISH data as obtained from Zhu 56 | et al. (2018) and scRNA-seq data as obtained from Tasic et al. (2016), 57 | Version 1.0.0 returns the full scRNA-seq data matrix, while version 2.0.0 58 | returns the processed and subsetted scRNA-seq data matrix (produced for 59 | the Mathematical Frameworks for Integrative Analysis of Emerging 60 | Biological Data Types 2020 Workshop) The returned seqFISH data are always 61 | the processed ones for the same workshop. Additionally, cell types 62 | annotations are available in the \code{colData} through the \code{class} column in 63 | the seqFISH \code{assay}. 
64 | \itemize{ 65 | \item scRNA_Counts - Tasic scRNA-seq gene count matrix 66 | \item scRNA_Labels - Tasic scRNA-seq cell labels 67 | \item seqFISH_Coordinates - Zhu seq-FISH spatial coordinates 68 | \item seqFISH_Counts - Zhu seq-FISH gene counts matrix 69 | \item seqFISH_Labels - Zhu seq-FISH cell labels 70 | } 71 | } 72 | } 73 | \examples{ 74 | 75 | seqFISH(DataType = "mouse_visual_cortex", modes = "*", version = "2.0.0", 76 | dry.run = TRUE) 77 | 78 | } 79 | \author{ 80 | Dario Righelli \email{dario.righelli@gmail.com} 81 | } 82 | -------------------------------------------------------------------------------- /vignettes/CITEseq.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "CITEseq Cord Blood" 3 | author: "Dario Righelli" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | BiocStyle::html_document: 7 | toc_float: true 8 | vignette: > 9 | %\VignetteIndexEntry{CITEseq Cord Blood} 10 | %\VignetteEncoding{UTF-8} 11 | %\VignetteEngine{knitr::rmarkdown} 12 | package: SingleCellMultiModal 13 | bibliography: ../inst/REFERENCES.bib 14 | editor_options: 15 | chunk_output_type: console 16 | --- 17 | 18 | # Installation 19 | 20 | ```{r,eval=FALSE} 21 | if (!requireNamespace("BiocManager", quietly = TRUE)) 22 | install.packages("BiocManager") 23 | 24 | BiocManager::install("SingleCellMultiModal") 25 | ``` 26 | 27 | 28 | # Load libraries 29 | 30 | ```{r, include=TRUE, results="hide", message=FALSE, warning=FALSE} 31 | 32 | library(MultiAssayExperiment) 33 | library(SingleCellMultiModal) 34 | library(SingleCellExperiment) 35 | ``` 36 | 37 | 38 | # CITE-seq dataset 39 | 40 | CITE-seq data are a combination of two data types extracted at the same 41 | time from the same cell. The first data type is scRNA-seq data, while the second 42 | one consists of about a hundred antibody-derived tags (ADT). 43 | In particular this dataset is provided by @stoeckius2017simultaneous. 
44 | 45 | ## Downloading datasets 46 | 47 | The user can see the available datasets by using the default options: 48 | 49 | ```{r} 50 | 51 | CITEseq(DataType="cord_blood", modes="*", dry.run=TRUE, version="1.0.0") 52 | 53 | ``` 54 | 55 | Alternatively, setting `dry.run = FALSE` downloads the data and creates the 56 | `MultiAssayExperiment` object. 57 | 58 | In this example, we will use one of the two available datasets `scADT_Counts`: 59 | 60 | ```{r,message=FALSE} 61 | 62 | mae <- CITEseq( 63 | DataType="cord_blood", modes="*", dry.run=FALSE, version="1.0.0" 64 | ) 65 | 66 | mae 67 | ``` 68 | 69 | Example with actual data: 70 | 71 | ```{r} 72 | experiments(mae) 73 | ``` 74 | 75 | 76 | ## Exploring the data structure 77 | 78 | Check row annotations: 79 | 80 | ```{r} 81 | rownames(mae) 82 | ``` 83 | 84 | Take a peek at the `sampleMap`: 85 | 86 | ```{r} 87 | sampleMap(mae) 88 | ``` 89 | 90 | 91 | ## scRNA-seq data 92 | 93 | The scRNA-seq data are accessible with the name `scRNAseq`, which returns a 94 | *matrix* object. 95 | 96 | ```{r} 97 | head(experiments(mae)$scRNAseq)[, 1:4] 98 | ``` 99 | 100 | ## scADT data 101 | 102 | The scADT data are accessible with the name `scADT`, which returns a 103 | **matrix** object. 104 | 105 | ```{r} 106 | head(experiments(mae)$scADT)[, 1:4] 107 | ``` 108 | 109 | # SingleCellExperiment object conversion 110 | 111 | Because some methodologies are already widely used (such as 112 | in the [SingleCellExperiment vignette][1] or [CiteFuse Vignette][2], where the 113 | `SingleCellExperiment` object is used for CITE-seq data), 114 | we provide a function for the conversion of our CITE-seq `MultiAssayExperiment` 115 | object into a `SingleCellExperiment` object with scRNA-seq data as counts and 116 | scADT data as `altExp`s. 
117 | 118 | [1]: https://www.bioconductor.org/packages/release/bioc/vignettes/SingleCellExperiment/inst/doc/intro.html#5_adding_alternative_feature_sets 119 | [2]: http://www.bioconductor.org/packages/release/bioc/vignettes/CiteFuse/inst/doc/CiteFuse.html 120 | 121 | ```{r message=FALSE} 122 | sce <- CITEseq(DataType="cord_blood", modes="*", dry.run=FALSE, version="1.0.0", 123 | DataClass="SingleCellExperiment") 124 | sce 125 | ``` 126 | 127 | # Session Info 128 | 129 | ```{r, tidy=TRUE} 130 | sessionInfo() 131 | ``` 132 | 133 | # References 134 | 135 | -------------------------------------------------------------------------------- /vignettes/ECCITEseq.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "ECCITEseq Peripheral Blood" 3 | author: "Dario Righelli" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | BiocStyle::html_document: 7 | toc_float: true 8 | vignette: > 9 | %\VignetteIndexEntry{ECCITEseq Peripheral Blood} 10 | %\VignetteEncoding{UTF-8} 11 | %\VignetteEngine{knitr::rmarkdown} 12 | package: SingleCellMultiModal 13 | bibliography: ../inst/REFERENCES.bib 14 | editor_options: 15 | chunk_output_type: console 16 | --- 17 | 18 | # Installation 19 | 20 | ```{r,eval=FALSE} 21 | if (!requireNamespace("BiocManager", quietly = TRUE)) 22 | install.packages("BiocManager") 23 | 24 | BiocManager::install("SingleCellMultiModal") 25 | ``` 26 | 27 | 28 | # Load libraries 29 | 30 | ```{r, include=TRUE, results="hide", message=FALSE, warning=FALSE} 31 | 32 | library(MultiAssayExperiment) 33 | library(SingleCellMultiModal) 34 | library(SingleCellExperiment) 35 | 36 | ``` 37 | 38 | 39 | # ECCITE-seq dataset 40 | 41 | ECCITE-seq data are an evolution of the CITE-seq data 42 | (see also [CITE-seq vignette](CITEseq.html) for more details) 43 | by extending the CITE-seq original data types with a third one always extracted 44 | from the same cell. 
45 | Indeed, in addition to the CITE-seq providing scRNA-seq and antibody-derived tags 46 | (ADT), it provides around ten Hashtagged Oligo (HTO). 47 | In particular this dataset is provided by @mimitou2019multiplexed. 48 | 49 | ## Downloading datasets 50 | 51 | The user can see the available dataset by using the default options through the 52 | CITE-seq function. 53 | 54 | ```{r} 55 | 56 | CITEseq(DataType="peripheral_blood", modes="*", dry.run=TRUE, version="1.0.0") 57 | 58 | ``` 59 | 60 | Or simply by setting `dry.run = FALSE` it downloads the data and by default 61 | creates the `MultiAssayExperiment` object. 62 | 63 | In this example, we will use one of the two available datasets `scADT_Counts`: 64 | 65 | ```{r message=FALSE} 66 | 67 | mae <- CITEseq(DataType="peripheral_blood", modes="*", dry.run=FALSE, version="1.0.0") 68 | mae 69 | ``` 70 | 71 | Example with actual data: 72 | 73 | ```{r} 74 | experiments(mae) 75 | ``` 76 | 77 | Additionally, we stored into the object metadata 78 | 79 | ## Exploring the data structure 80 | 81 | Check row annotations: 82 | 83 | ```{r} 84 | rownames(mae) 85 | ``` 86 | 87 | Take a peek at the `sampleMap`: 88 | 89 | ```{r} 90 | sampleMap(mae) 91 | ``` 92 | 93 | 94 | ## scRNA-seq data 95 | 96 | The scRNA-seq data are accessible with the name `scRNAseq`, which returns a 97 | *matrix* object. 98 | 99 | ```{r} 100 | head(experiments(mae)$scRNA)[, 1:4] 101 | ``` 102 | 103 | ## scADT data 104 | 105 | The scADT data are accessible with the name `scADT`, which returns a 106 | **matrix** object. 107 | 108 | ```{r} 109 | head(experiments(mae)$scADT)[, 1:4] 110 | ``` 111 | 112 | ## CTCL/CTRL conditions 113 | 114 | The dataset has two different conditions (CTCL and CTRL) which samples can be identified with the `colData` accessor. 115 | 116 | CTCL stands for cutaneous T-cell lymphoma while CTRL for control. 
117 | 118 | For example, if we want only the CTCL samples, we can run: 119 | 120 | ```{r} 121 | (ctclMae <- mae[,colData(mae)$condition == "CTCL",]) 122 | ``` 123 | 124 | And if you're interested in the common samples across all the modalities 125 | you can use the `complete.cases` function. 126 | 127 | ```{r} 128 | ctclMae[,complete.cases(ctclMae),] 129 | ``` 130 | 131 | 132 | ## sgRNAs CRISPR perturbation data 133 | 134 | The CRISPR perturbed scRNAs data are stored in a different spot 135 | to keep their original long format. 136 | 137 | They can be accessed with the `metadata` accessors which, in this case returns a named `list` of `data.frame`s. 138 | 139 | ```{r} 140 | sgRNAs <- metadata(mae) 141 | names(sgRNAs) 142 | ``` 143 | 144 | There are four different sgRNAs datasets, one per each condition and family receptors combination. 145 | 146 | TCR stands for T-Cell Receptor, while a,b,g,d stand for alpha, beta, gamma and delta respectively. 147 | 148 | To look into the TCRab, simply run: 149 | 150 | ```{r} 151 | head(sgRNAs$CTCL_TCRab) 152 | ``` 153 | 154 | # SingleCellExperiment object conversion 155 | 156 | Because of already large use of some methodologies (such as 157 | in the [SingleCellExperiment vignette][1] or [CiteFuse Vignette][2] where the 158 | `SingleCellExperiment` object is used for CITE-seq data), 159 | we provide a function for the conversion of our CITE-seq `MultiAssayExperiment` 160 | object into a `SingleCellExperiment` object with scRNA-seq data as counts and 161 | scADT data as `altExp`s. 
162 | 163 | 164 | ```{r message=FALSE} 165 | sce <- CITEseq(DataType="peripheral_blood", modes="*", dry.run=FALSE, 166 | version="1.0.0", DataClass="SingleCellExperiment") 167 | sce 168 | ``` 169 | 170 | # Session Info 171 | 172 | ```{r, tidy=TRUE} 173 | sessionInfo() 174 | ``` 175 | 176 | # Additional References 177 | 178 | https://www.bioconductor.org/packages/release/bioc/vignettes/SingleCellExperiment/inst/doc/intro.html#5_adding_alternative_feature_sets 179 | http://www.bioconductor.org/packages/release/bioc/vignettes/CiteFuse/inst/doc/CiteFuse.html 180 | 181 | # References 182 | -------------------------------------------------------------------------------- /vignettes/GTseq.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "G&T-seq Mouse Embryo (8-cell stage)" 3 | date: "`r BiocStyle::doc_date()`" 4 | vignette: | 5 | %\VignetteIndexEntry{GT-seq Mouse Embryo} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | output: 9 | BiocStyle::html_document: 10 | toc_float: true 11 | package: SingleCellMultiModal 12 | bibliography: ../inst/REFERENCES.bib 13 | --- 14 | 15 | # Installation 16 | 17 | ```{r,eval=FALSE} 18 | if (!requireNamespace("BiocManager", quietly = TRUE)) 19 | install.packages("BiocManager") 20 | 21 | BiocManager::install("SingleCellMultiModal") 22 | ``` 23 | 24 | ## Load 25 | 26 | ```{r,include=TRUE,results="hide",message=FALSE,warning=FALSE} 27 | library(SingleCellMultiModal) 28 | library(MultiAssayExperiment) 29 | ``` 30 | 31 | # G&T-seq: parallel sequencing data of single-cell genomes and transcriptomes 32 | 33 | G&T-seq is a combination of Picoplex amplified gDNA sequencing (genome) and 34 | SMARTSeq2 amplified cDNA sequencing (transcriptome) of the same cell. 35 | For more information, see @Macaulay2015. 
36 | 37 | ## Downloading datasets 38 | 39 | The user can see the available dataset by using the default options 40 | 41 | ```{r} 42 | GTseq("mouse_embryo_8_cell", mode = "*", dry.run = TRUE) 43 | ``` 44 | 45 | Or by simply running: 46 | 47 | ```{r} 48 | GTseq() 49 | ``` 50 | 51 | ## Obtaining the data 52 | 53 | To obtain the actual datasets: 54 | 55 | ```{r,message=FALSE} 56 | gts <- GTseq(dry.run = FALSE) 57 | gts 58 | ``` 59 | 60 | ## Exploring the data structure 61 | 62 | Check available metadata for each of the 112 mouse embryo cells assayed by G&T-seq: 63 | 64 | ```{r} 65 | colData(gts) 66 | ``` 67 | 68 | Take a peek at the `sampleMap`: 69 | 70 | ```{r} 71 | sampleMap(gts) 72 | ``` 73 | 74 | ## Copy numbers 75 | 76 | To access the integer copy numbers as detected from scDNA-seq: 77 | 78 | ```{r} 79 | head(assay(gts, "genomic"))[, 1:4] 80 | ``` 81 | 82 | ## RNA-seq 83 | 84 | To access raw read counts as quantified from scRNA-seq: 85 | 86 | ```{r} 87 | head(assay(gts, "transcriptomic"))[, 1:4] 88 | ``` 89 | 90 | For protocol information, see @Macaulay2016. 91 | 92 | # sessionInfo 93 | 94 | ```{r} 95 | sessionInfo() 96 | ``` 97 | 98 | # References 99 | -------------------------------------------------------------------------------- /vignettes/SCoPE2.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "SCoPE2: macrophage vs monocytes" 3 | date: "`r BiocStyle::doc_date()`" 4 | vignette: | 5 | %\VignetteIndexEntry{SCoPE2: macrophage vs monocytes} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | output: 9 | BiocStyle::html_document: 10 | toc_float: true 11 | package: SingleCellMultiModal 12 | bibliography: ../inst/REFERENCES.bib 13 | --- 14 | 15 | This vignette will guide you through accessing and manipulating 16 | the SCoPE2 data sets available from the `SingleCellMultiModal` package. 
17 | 18 | # Installation 19 | 20 | ```{r,eval=FALSE} 21 | if (!requireNamespace("BiocManager", quietly = TRUE)) 22 | install.packages("BiocManager") 23 | BiocManager::install("SingleCellMultiModal") 24 | ``` 25 | 26 | ## Load packages 27 | 28 | ```{r,include=TRUE,results="hide",message=FALSE,warning=FALSE} 29 | library(SingleCellMultiModal) 30 | library(MultiAssayExperiment) 31 | ``` 32 | 33 | # SCoPE2 34 | 35 | SCoPE2 is a mass spectrometry (MS)-based single-cell proteomics 36 | protocol to quantify the proteome of single-cells in an untargeted 37 | fashion. It was initially developed by @Specht2021-pm. 38 | 39 | ## Downloading data sets 40 | 41 | The user can see the available data set by using the default options. 42 | 43 | ```{r} 44 | SCoPE2("macrophage_differentiation", 45 | mode = "*", 46 | version = "1.0.0", 47 | dry.run = TRUE) 48 | ``` 49 | 50 | Or by simply running: 51 | 52 | ```{r} 53 | SCoPE2("macrophage_differentiation") 54 | ``` 55 | 56 | ## Available projects 57 | 58 | Currently, only the `macrophage_differentiation` is available. 59 | 60 | ## Retrieving data 61 | 62 | You can retrieve the actual data from `ExperimentHub` by setting 63 | `dry.run = FALSE`. This example retrieves the complete data set 64 | (transcriptome and proteome) for the `macrophage_differentiation` 65 | project: 66 | 67 | ```{r,message=FALSE} 68 | scope2 <- SCoPE2("macrophage_differentiation", 69 | modes = "rna|protein", 70 | dry.run = FALSE) 71 | scope2 72 | ``` 73 | 74 | # The macrophage differentiation project 75 | 76 | This data set has been acquired by the Slavov Lab (@Specht2021-pm). 77 | It contains single-cell proteomics and single-cell 78 | RNA sequencing data for macrophages and monocytes. The objective of the 79 | research that led to generate the data is to understand whether 80 | homogeneous monocytes differentiate in the absence of cytokines to 81 | macrophages with homogeneous or heterogeneous profiles. 
The transcriptomic and 82 | proteomic acquisitions are conducted on two separate subsets of similar 83 | cells (same experimental design). The cell types of the samples are known only 84 | for the **proteomics** data. The proteomics data was retrieved from 85 | the authors' [website](https://scope2.slavovlab.net/docs/data) and the 86 | transcriptomic data was retrieved from the GEO database (accession id: 87 | [GSE142392](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE142392)). 88 | 89 | For more information on the protocol, see @Specht2021-pm. 90 | 91 | ## Data versions 92 | 93 | Only version `1.0.0` is currently available. 94 | 95 | The `macrophage_differentiation` data set in this package contains two 96 | assays: `rna` and `protein`. 97 | 98 | ### Cell annotation 99 | 100 | The single-cell proteomics data contains cell type annotation 101 | (`celltype`), sample preparation batch (`batch_digest` and 102 | `batch_sort`), chromatography batch (`batch_chromatography`), and the 103 | MS acquisition run (`batch_MS`). The single-cell transcriptomics data 104 | was acquired in two batches (`batch_Chromium`). Note that because the 105 | cells that compose the two assays are distinct, there is no common 106 | cell annotation available for both proteomics and transcriptomics. The 107 | annotations were therefore filled with `NA`s accordingly. 108 | 109 | ```{r} 110 | colData(scope2) 111 | ``` 112 | 113 | ### Transcriptomic data 114 | 115 | You can extract and check the transcriptomic data through subsetting: 116 | 117 | ```{r} 118 | scope2[["rna"]] 119 | ``` 120 | 121 | The data is rather large and is therefore stored on-disk using the 122 | HDF5 backend. You can verify this by looking at the assay data matrix. 123 | Note that the counts are UMI counts. 124 | 125 | ```{r} 126 | assay(scope2[["rna"]])[1:5, 1:5] 127 | ``` 128 | 129 | ### Proteomic data 130 | 131 | The `protein` assay contains MS-based proteomic data. 
132 | The data have passed sample and feature quality control, 133 | normalized, log transformed, imputed and batch corrected. Detailed 134 | information about the data processing is available in 135 | [another vignette](https://uclouvain-cbio.github.io/SCP.replication/articles/SCoPE2.html). You can extract the proteomic data similarly to the 136 | transcriptomic data: 137 | 138 | ```{r} 139 | scope2[["protein"]] 140 | ``` 141 | 142 | In this case, the protein data have reasonable size and are loaded 143 | directly into memory. The data matrix is stored in `logexprs`. We 144 | decided to not use the traditional `logcounts` because MS proteomics 145 | measures intensities rather than counts as opposed to scRNA-Seq. 146 | 147 | ```{r} 148 | assay(scope2[["protein"]])[1:5, 1:5] 149 | ``` 150 | 151 | # sessionInfo 152 | 153 | ```{r} 154 | sessionInfo() 155 | ``` 156 | 157 | # References 158 | -------------------------------------------------------------------------------- /vignettes/SingleCellMultiModal.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "SingleCellMultiModal Introduction" 3 | date: "`r BiocStyle::doc_date()`" 4 | vignette: | 5 | %\VignetteIndexEntry{SingleCellMultiModal Introduction} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | output: 9 | BiocStyle::html_document: 10 | toc_float: true 11 | package: SingleCellMultiModal 12 | bibliography: ../inst/REFERENCES.bib 13 | --- 14 | 15 | # SingleCellMultiModal 16 | 17 | ## Overview 18 | 19 | `SingleCellMultiModal` is an R package that provides a convenient and 20 | user-friendly representation of multi-modal data using `MultiAssayExperiment`. 21 | This package introduces a suite of single-cell multimodal landmark datasets for 22 | benchmarking and testing multimodal analysis methods via the `ExperimentHub` 23 | Bioconductor package. 
The scope of this package is to provide efficient access 24 | to a selection of curated, pre-integrated, publicly available landmark datasets 25 | for methods development and benchmarking. 26 | 27 | ## Installation 28 | 29 | ```{r,eval=FALSE} 30 | if (!requireNamespace("BiocManager", quietly = TRUE)) 31 | install.packages("BiocManager") 32 | 33 | BiocManager::install("SingleCellMultiModal") 34 | ``` 35 | 36 | ## Loading packages 37 | 38 | ```{r,include=TRUE,results="hide",message=FALSE,warning=FALSE} 39 | library(SingleCellMultiModal) 40 | library(MultiAssayExperiment) 41 | ``` 42 | 43 | # Citing SingleCellMultiModal 44 | 45 | Your citations are crucial in keeping our software free and open source. To 46 | cite our package see the citation (@Eckenrode2023-yq) in the Reference 47 | section. You may also browse to the publication at 48 | [PLoS Computational Biology][1]. 49 | 50 | [1]: https://doi.org/10.1371/journal.pcbi.1011324 51 | 52 | ## Representation 53 | 54 | Users can obtain integrative representations of multiple modalities as a 55 | `MultiAssayExperiment`, a common core Bioconductor data structure relied on by 56 | dozens of multimodal data analysis packages. `MultiAssayExperiment` harmonizes 57 | data management of multiple experimental assays performed on an overlapping set 58 | of specimens. Although originally developed for patient data from multi-omics 59 | cancer studies, the `MultiAssayExperiment` framework naturally applies also to 60 | single cells. A schematic of the data structure can be seen below. In this 61 | context, "patients" are replaced by "cells". We use `MultiAssayExperiment` 62 | because it provides a familiar user experience by extending 63 | `SummarizedExperiment` concepts and providing open ended compatibility with 64 | standard data classes present in Bioconductor such as the 65 | `SingleCellExperiment`. 
66 | 67 | ```{r,echo=FALSE} 68 | imgurl <- paste0( 69 | "https://github.com/waldronlab/MultiAssayExperiment/blob/", 70 | "c3c59a094e5a08111ee98b9f69579db5634d9fd4/vignettes/", 71 | "MultiAssayExperiment.png?raw=true" 72 | ) 73 | knitr::include_graphics( 74 | path = imgurl 75 | ) 76 | ``` 77 | 78 | # Contributions 79 | 80 | Want to contribute to the `SingleCellMultiModal` package? We welcome 81 | contributions from the community. Please refer to our 82 | [Contributing Guidelines][2] for more details. 83 | 84 | [2]: https://github.com/waldronlab/SingleCellMultiModal/wiki/Contributing-Guidelines 85 | 86 | 87 | ## Further resources 88 | 89 | For more information on the `MultiAssayExperiment` data structure, please refer 90 | to @Ramos2017-tk as well as the [MultiAssayExperiment vignette][3]. 91 | 92 | [3]: https://bioconductor.org/packages/release/bioc/vignettes/MultiAssayExperiment/inst/doc/MultiAssayExperiment.html 93 | 94 | # References 95 | -------------------------------------------------------------------------------- /vignettes/scMultiome.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "PBMCs profiled with the Chromium Single Cell Multiome ATAC + Gene Expression from 10x" 3 | date: "`r BiocStyle::doc_date()`" 4 | vignette: | 5 | %\VignetteIndexEntry{scMultiome 10x PBMC} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | output: 9 | BiocStyle::html_document: 10 | toc_float: true 11 | package: SingleCellMultiModal 12 | --- 13 | 14 | # Installation 15 | 16 | ```{r,eval=FALSE} 17 | if (!requireNamespace("BiocManager", quietly = TRUE)) 18 | install.packages("BiocManager") 19 | BiocManager::install("SingleCellMultiModal") 20 | ``` 21 | 22 | ## Load 23 | 24 | ```{r,include=TRUE, results="hide", message=FALSE, warning=FALSE} 25 | library(SingleCellMultiModal) 26 | library(MultiAssayExperiment) 27 | library(scran) 28 | library(scater) 29 | ``` 30 | 31 | # Description 32 | 33 | This data set 
consists of about 10K Peripheral Blood Mononuclear Cells (PBMCs) 34 | derived from a single healthy donor. It is available 35 | [from the 10x Genomics website](https://support.10xgenomics.com/single-cell-multiome-atac-gex/datasets). 36 | 37 | Provided are the RNA expression counts quantified at the gene level and the 38 | chromatin accessibility levels quantified at the peak level. Here we provide 39 | the default peaks called by the CellRanger software. If you want to explore 40 | other peak definitions or chromatin accessibility quantifications (at the 41 | promoter level, etc.), you have to download the `fragments.tsv.gz` file from the 42 | 10x Genomics website. 43 | 44 | # Downloading datasets 45 | 46 | The user can see the available dataset by using the default options 47 | 48 | ```{r} 49 | mae <- scMultiome("pbmc_10x", modes = "*", dry.run = FALSE, format = "MTX") 50 | ``` 51 | 52 | ```{r, echo=FALSE} 53 | gg_color_hue <- function(n) { 54 | hues = seq(15, 375, length = n + 1) 55 | hcl(h = hues, l = 65, c = 100)[1:n] 56 | } 57 | colors <- gg_color_hue(length(unique(mae$celltype))) 58 | names(colors) <- unique(mae$celltype) 59 | ``` 60 | 61 | # Exploring the data structure 62 | 63 | There are two assays: `rna` and `atac`, stored as 64 | [SingleCellExperiment](http://bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html) 65 | objects 66 | 67 | ```{r} 68 | mae 69 | ``` 70 | 71 | where the cells are the same in both assays: 72 | 73 | ```{r} 74 | upsetSamples(mae) 75 | ``` 76 | 77 | ## Cell metadata 78 | 79 | Columns: 80 | 81 | - **nCount_RNA**: number of read counts 82 | - **nFeature_RNA**: number of genes with at least one read count 83 | - **nCount_ATAC**: number of ATAC read counts 84 | - **nFeature_ATAC**: number of ATAC peaks with at least one read count 85 | - **celltype**: The cell types have been annotated by the 10x Genomics R&D team using gene markers. 
They provide a rough characterisation of the cell type diversity, but keep in mind that they are not ground truth labels. 86 | - **broad_celltype**: `Lymphoid` or `Myeloid` origin 87 | 88 | The cells have not been QC-ed, choosing a minimum number of genes/peaks per 89 | cell is left to you! In addition, there are further quality control 90 | criteria that you may want to apply, including mitochondrial coverage, fraction 91 | of reads overlapping ENCODE Blacklisted regions, Transcription start site 92 | enrichment, etc. See suggestions below for software that can perform a 93 | semi-automated quality control pipeline. 94 | 95 | ```{r} 96 | head(colData(mae)) 97 | ``` 98 | 99 | ## RNA expression 100 | 101 | The RNA expression consists of 36,549 genes and 10,032 cells, stored using 102 | the `dgCMatrix` sparse matrix format 103 | 104 | ```{r} 105 | dim(experiments(mae)[["rna"]]) 106 | ``` 107 | 108 | ```{r} 109 | names(experiments(mae)) 110 | ``` 111 | 112 | Let's do some standard dimensionality reduction plot: 113 | 114 | ```{r} 115 | sce.rna <- experiments(mae)[["rna"]] 116 | 117 | # Normalisation 118 | sce.rna <- logNormCounts(sce.rna) 119 | 120 | # Feature selection 121 | decomp <- modelGeneVar(sce.rna) 122 | hvgs <- rownames(decomp)[decomp$mean>0.01 & decomp$p.value <= 0.05] 123 | sce.rna <- sce.rna[hvgs,] 124 | 125 | # PCA 126 | sce.rna <- runPCA(sce.rna, ncomponents = 25) 127 | 128 | # UMAP 129 | set.seed(42) 130 | sce.rna <- runUMAP(sce.rna, dimred="PCA", n_neighbors = 25, min_dist = 0.3) 131 | plotUMAP(sce.rna, colour_by="celltype", point_size=0.5, point_alpha=1) 132 | ``` 133 | 134 | ## Chromatin Accessibility 135 | 136 | The ATAC expression consists of 108,344 peaks and 10,032 cells: 137 | 138 | ```{r} 139 | dim(experiments(mae)[["atac"]]) 140 | ``` 141 | 142 | Let's do some standard dimensionality reduction plot. Note that scATAC-seq data is sparser than scRNA-seq, almost binary. 
The log normalisation + PCA approach that `scater` implements for scRNA-seq is not a good strategy for scATAC-seq data. Topic modelling or TFIDF+SVD are a better strategy. Please see the package recommendations below. 143 | 144 | ```{r} 145 | sce.atac <- experiments(mae)[["atac"]] 146 | 147 | # Normalisation 148 | sce.atac <- logNormCounts(sce.atac) 149 | 150 | # Feature selection 151 | decomp <- modelGeneVar(sce.atac) 152 | hvgs <- rownames(decomp)[decomp$mean>0.25] 153 | sce.atac <- sce.atac[hvgs,] 154 | 155 | # PCA 156 | sce.atac <- runPCA(sce.atac, ncomponents = 25) 157 | 158 | # UMAP 159 | set.seed(42) 160 | sce.atac <- runUMAP(sce.atac, dimred="PCA", n_neighbors = 25, min_dist = 0.3) 161 | plotUMAP(sce.atac, colour_by="celltype", point_size=0.5, point_alpha=1) 162 | ``` 163 | 164 | # Suggested software for the downstream analysis 165 | 166 | These are my personal recommendations of R-based analysis software: 167 | 168 | - **RNA expression**: [scater](http://bioconductor.org/packages/release/bioc/html/scater.html), [scran](https://bioconductor.org/packages/release/bioc/html/scran.html) 169 | - **ATAC accessibility**: [archR](https://www.archrproject.com/), [snapATAC](https://github.com/r3fang/SnapATAC), [cisTopic](https://github.com/aertslab/cisTopic), [Signac](https://satijalab.org/signac), [chromVar](https://bioconductor.org/packages/release/bioc/html/chromVAR.html), [Cicero](https://www.bioconductor.org/packages/release/bioc/html/cicero.html) 170 | - **Integrative analysis**: [MOFA+](https://biofam.github.io/MOFA2), [Seurat](https://satijalab.org/seurat). Note that both methods have released vignettes in their website where they analysed this same data set. 
171 | 172 | # sessionInfo 173 | 174 | ```{r} 175 | sessionInfo() 176 | ``` 177 | -------------------------------------------------------------------------------- /vignettes/scNMT.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "scNMT Mouse Gastrulation" 3 | date: "`r BiocStyle::doc_date()`" 4 | vignette: | 5 | %\VignetteIndexEntry{scNMT Mouse Gastrulation} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | output: 9 | BiocStyle::html_document: 10 | toc_float: true 11 | package: SingleCellMultiModal 12 | bibliography: ../inst/REFERENCES.bib 13 | --- 14 | 15 | # Installation 16 | 17 | ```{r,eval=FALSE} 18 | if (!requireNamespace("BiocManager", quietly = TRUE)) 19 | install.packages("BiocManager") 20 | 21 | BiocManager::install("SingleCellMultiModal") 22 | ``` 23 | 24 | ## Load packages 25 | 26 | ```{r,include=TRUE,results="hide",message=FALSE,warning=FALSE} 27 | library(SingleCellMultiModal) 28 | library(MultiAssayExperiment) 29 | ``` 30 | 31 | # scNMT: single-cell nucleosome, methylation and transcription sequencing 32 | 33 | The dataset was graciously provided by @Argelaguet2019-et. 34 | 35 | Scripts used to process the raw data were written and maintained by Argelaguet 36 | and colleagues and reside on GitHub: 37 | https://github.com/rargelaguet/scnmt_gastrulation 38 | 39 | For more information on the protocol, see @Clark2018-qg. 40 | 41 | ## Dataset lookup 42 | 43 | The user can see the available datasets by using the `dry.run` argument: 44 | 45 | ```{r} 46 | scNMT("mouse_gastrulation", mode = "*", version = "1.0.0", dry.run = TRUE) 47 | ``` 48 | 49 | Or by simply running the `scNMT` function with defaults: 50 | 51 | ```{r} 52 | scNMT("mouse_gastrulation", version = "1.0.0") 53 | ``` 54 | 55 | ## Data versions 56 | 57 | A more recent release of the 'mouse_gastrulation' dataset was provided 58 | by Argelaguet and colleagues. 
This dataset includes additional cells that 59 | did not pass the original quality metrics as imposed for the version `1.0.0` 60 | dataset. 61 | 62 | Use the `version` argument to indicate the newer dataset version 63 | (i.e., `2.0.0`): 64 | 65 | ```{r} 66 | scNMT("mouse_gastrulation", version = '2.0.0', dry.run = TRUE) 67 | ``` 68 | 69 | ## Downloading the data 70 | 71 | To obtain the data, we can use the `mode` argument to indicate specific 72 | datasets using 'glob' patterns that will match the outputs above. For example, 73 | if we would like to have all 'genebody' datasets for all available assays, 74 | we would use `*_genebody` as an input to `mode`. 75 | 76 | ```{r,message=FALSE} 77 | nmt <- scNMT("mouse_gastrulation", mode = c("*_DHS", "*_cgi", "*_genebody"), 78 | version = "1.0.0", dry.run = FALSE) 79 | nmt 80 | ``` 81 | 82 | ## Checking the cell metadata 83 | 84 | Included in the `colData` `DataFrame` within the `MultiAssayExperiment` class 85 | are the variables `cellID`, `stage`, `lineage10x_2`, and `stage_lineage`. 
86 | To extract this `DataFrame`, one has to use `colData` on the 87 | `MultiAssayExperiment` object: 88 | 89 | ```{r} 90 | colData(nmt) 91 | ``` 92 | 93 | ## Exploring the data structure 94 | 95 | Check row annotations: 96 | 97 | ```{r} 98 | rownames(nmt) 99 | ``` 100 | 101 | The `sampleMap` is a graph representation of the relationships between cells 102 | and 'assay' datasets: 103 | 104 | ```{r} 105 | sampleMap(nmt) 106 | ``` 107 | 108 | Take a look at the cell identifiers or barcodes across assays: 109 | 110 | ```{r} 111 | colnames(nmt) 112 | ``` 113 | 114 | ## Chromatin Accessibility (acc_*) 115 | 116 | See the accessibility levels (as proportions) for DNase Hypersensitive Sites: 117 | 118 | ```{r} 119 | head(assay(nmt, "acc_DHS"))[, 1:4] 120 | ``` 121 | 122 | ## DNA Methylation (met_*) 123 | 124 | See the methylation percentage / proportion: 125 | 126 | ```{r} 127 | head(assay(nmt, "met_DHS"))[, 1:4] 128 | ``` 129 | 130 | For protocol information, see the references below. 131 | 132 | # sessionInfo 133 | 134 | ```{r} 135 | sessionInfo() 136 | ``` 137 | 138 | # References 139 | -------------------------------------------------------------------------------- /vignettes/seqFISH.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "seqFISH Mouse Visual Cortex" 3 | author: "Dario Righelli" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | BiocStyle::html_document: 7 | toc_float: true 8 | vignette: > 9 | %\VignetteIndexEntry{seqFISH Mouse Visual Cortex} 10 | %\VignetteEncoding{UTF-8} 11 | %\VignetteEngine{knitr::rmarkdown} 12 | package: SingleCellMultiModal 13 | bibliography: ../inst/REFERENCES.bib 14 | editor_options: 15 | chunk_output_type: console 16 | --- 17 | 18 | # Installation 19 | 20 | ```{r,eval=FALSE} 21 | if (!requireNamespace("BiocManager", quietly = TRUE)) 22 | install.packages("BiocManager") 23 | BiocManager::install("SingleCellMultiModal") 24 | ``` 25 | 26 | ## Load packages 27 | 28 | 
```{r,include=TRUE, results="hide", message=FALSE, warning=FALSE} 29 | library(MultiAssayExperiment) 30 | library(SpatialExperiment) 31 | library(SingleCellMultiModal) 32 | ``` 33 | 34 | 35 | # seq-FISH dataset 36 | 37 | The dataset consists of two data types, 38 | seq-FISH data was provided by @Zhu2018identification, while scRNA-seq data 39 | was provided by @Tasic2016adult. 40 | 41 | Data have been retrieved as part of the 42 | [Hackathon](https://github.com/BIRSBiointegration/Hackathon/tree/master/seqFISH) 43 | in the 44 | [Mathematical Frameworks for Integrative Analysis of Emerging Biological DataTypes](https://www.birs.ca/events/2020/5-day-workshops/20w5197) workshop. 45 | 46 | ## Downloading datasets 47 | 48 | The user can see the available dataset by using the default options 49 | 50 | ```{r} 51 | seqFISH( 52 | DataType="mouse_visual_cortex", modes="*", dry.run=TRUE, version="2.0.0" 53 | ) 54 | ``` 55 | 56 | Or simply by running: 57 | 58 | ```{r} 59 | seqfish <- seqFISH( 60 | DataType="mouse_visual_cortex", modes="*", dry.run=FALSE, version="2.0.0" 61 | ) 62 | seqfish 63 | ``` 64 | 65 | Extract the list of experiments _without_ the associated colData. 66 | 67 | ```{r} 68 | experiments(seqfish) 69 | ``` 70 | 71 | ## Exploring the data structure 72 | 73 | Check row annotations for all experiments: 74 | 75 | ```{r} 76 | rownames(seqfish) 77 | ``` 78 | 79 | Take a peek at the `sampleMap` (graph representation of assays, cells, and 80 | barcodes): 81 | 82 | ```{r} 83 | sampleMap(seqfish) 84 | ``` 85 | 86 | ## Visualize matching cell identifiers across assays 87 | 88 | ```{r} 89 | upsetSamples(seqfish) 90 | ``` 91 | 92 | This shows that about 1597 cells match across both modalities / assays. 93 | 94 | ## scRNA-seq data 95 | 96 | The scRNA-seq data are accessible with `$scRNAseq`, which returns a 97 | *SingleCellExperiment* class object, with all its associated methods. 
98 | 99 | ```{r} 100 | seqfish[["scRNAseq"]] 101 | ``` 102 | 103 | Otherwise the `assay` function can be used to access the *scRNAseq* assay 104 | stored in the `seqfish` *MultiAssayExperiment* object. 105 | 106 | ```{r} 107 | head(assay(seqfish, "scRNAseq"))[,1:4] 108 | ``` 109 | 110 | ## seq-FISH data 111 | 112 | The seq-FISH data are accessible with `$seqFISH`, which returns a 113 | **SpatialExperiment** class object. 114 | 115 | ```{r} 116 | seqfish[["seqFISH"]] 117 | ``` 118 | 119 | Otherwise the `assay` function can be used to access the *seqFISH* assay 120 | stored in the `seqfish` *MultiAssayExperiment* object. 121 | 122 | ```{r} 123 | head(assay(seqfish, "seqFISH"))[,1:4] 124 | ``` 125 | 126 | Spatial data can be retrieved with `spatialData` function on the 127 | *SpatialExperiment* object. 128 | 129 | ```{r} 130 | (sd <- spatialData(seqfish[["seqFISH"]])) 131 | ``` 132 | 133 | Spatial coordinates within the spatial data can be retrieved in matrix form 134 | with `spatialCoords` function on the *SpatialExperiment* object. 135 | 136 | ```{r} 137 | head(sc <- spatialCoords(seqfish[["seqFISH"]])) 138 | ``` 139 | 140 | Direct access to the colnames of the spatial coordinates with 141 | `spatialCoordsNames` function. 142 | 143 | ```{r} 144 | spatialCoordsNames(seqfish[["seqFISH"]]) 145 | ``` 146 | 147 | 148 | ## Other data version 149 | 150 | The provided seqFISH dataset comes out in two different versions: 151 | 152 | * 1.0.0 - provides the same seqFISH data as shown in the rest of this 153 | vignette, but it returns the full normalized scRNA-seq data matrix (with 154 | labels), as released from the original authors on the GEO database. 155 | * 2.0.0 - provides the same seqFISH data as shown in the rest of this 156 | vignette, but it returns a processed subset of the original scRNA-seq data, 157 | providing only the same genes present in the seqFISH data matrix. 
158 | 159 | ### Data version 1.0.0 160 | 161 | The full scRNA-seq data matrix is 24057 rows x 1809 columns. 162 | 163 | To access the v1.0.0 simply run 164 | 165 | ```{r} 166 | seqFISH( 167 | DataType="mouse_visual_cortex", modes="*", dry.run=FALSE, version="1.0.0" 168 | ) 169 | ``` 170 | 171 | # Session Info 172 | 173 | ```{r, tidy=TRUE} 174 | sessionInfo() 175 | ``` 176 | 177 | --------------------------------------------------------------------------------