├── .Rbuildignore
├── .github
    ├── .gitignore
    └── workflows
    │   └── test-coverage.yaml
├── .gitignore
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── R
    ├── constants.R
    └── loader.R
├── README.md
├── _pkgdown.yml
├── inst
    └── CITATION
├── man
    └── getCompendium.Rd
├── tests
    ├── testthat.R
    └── testthat
    │   ├── setup.R
    │   ├── test-test_get_coldata.R
    │   └── test-test_get_compendium.R
└── vignettes
    ├── .gitignore
    └── overview.Rmd


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^_pkgdown\.yml$
4 | ^docs$
5 | ^pkgdown$
6 | ^LICENSE\.md$
7 | ^\.github$
8 | 


--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | 


--------------------------------------------------------------------------------
/.github/workflows/test-coverage.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | on:
 4 |   push:
 5 |     branches: [main, master]
 6 |   pull_request:
 7 |     branches: [main, master]
 8 | 
 9 | name: test-coverage
10 | 
11 | jobs:
12 |   test-coverage:
13 |     runs-on: ubuntu-latest
14 |     env:
15 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
16 | 
17 |     steps:
18 |       - uses: actions/checkout@v3
19 | 
20 |       - uses: r-lib/actions/setup-r@v2
21 |         with:
22 |           use-public-rspm: true
23 | 
24 |       - uses: r-lib/actions/setup-r-dependencies@v2
25 |         with:
26 |           extra-packages: any::covr
27 |           needs: coverage
28 | 
29 |       - name: Test coverage
30 |         run: |
31 |           covr::codecov(
32 |             quiet = FALSE,
33 |             clean = FALSE,
34 |             install_path = file.path(Sys.getenv("RUNNER_TEMP"), "package")
35 |           )
36 |         shell: Rscript {0}
37 | 
38 |       - name: Show testthat output
39 |         if: always()
40 |         run: |
41 |           ## --------------------------------------------------------------------
42 |           find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true
43 |         shell: bash
44 | 
45 |       - name: Upload test results
46 |         if: failure()
47 |         uses: actions/upload-artifact@v3
48 |         with:
49 |           name: coverage-test-failures
50 |           path: ${{ runner.temp }}/package
51 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | docs
6 | inst/doc
7 | *.Rproj


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: MicroBioMap
 2 | Type: Package
 3 | Title: Access the microbiome compendium from R
 4 | Version: 0.99.13
 5 | Description: The MicroBioMap offers access to a dataset including more than
 6 |     168,000 samples of publicly available
 7 |     16S rRNA amplicon sequencing data, all processed using the same pipeline
 8 |     and reference database. The main function of the package is to fetch
 9 |     the data and parse it into a TreeSummarizedExperiment. Basic documentation
10 |     includes use cases and examples.
11 | License: MIT + file LICENSE
12 | Depends:
13 |     TreeSummarizedExperiment,
14 |     ape
15 | Imports:
16 |     Matrix,
17 |     data.table,
18 |     BiocFileCache,
19 |     R.utils
20 | Encoding: UTF-8
21 | Authors@R:
22 |     c(
23 |       person("Richard", "Abdill",
24 |       role = c("aut", "cre"),
25 |       email='rabdill@uchicago.edu',
26 |       comment=c(ORCID='0000-0001-9565-5832')),
27 |       person("Sean", "Davis",
28 |       role = c("aut"),
29 |       email='seandavi@gmail.com',
30 |       comment=c(ORCID='0000-0002-8991-6458')),
 31 |       person("Ran", "Blekhman",
32 |       role=c('aut'),
33 |       comment=c(ORCID='0000-0003-3218-613X')),
34 |       person("Samantha", "Graham", role=c('aut')),
35 |       person("Casey", "Greene", role=c("aut"))
36 |       )
37 | URL: https://blekhmanlab.github.io/MicroBioMap/
38 | biocViews: Microbiome, Metagenomics, Sequencing, DataImport
39 | RoxygenNote: 7.3.2
40 | Suggests:
41 |     knitr,
42 |     rmarkdown,
43 |     testthat (>= 3.0.0),
44 |     ggplot2,
45 |     countrycode
46 | Config/testthat/edition: 3
47 | VignetteBuilder: knitr
48 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2023
2 | COPYRIGHT HOLDER: MicroBioMap authors
3 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | Copyright (c) 2023 MicroBioMap authors
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(getCompendium)
 4 | import(R.utils)
 5 | import(TreeSummarizedExperiment)
 6 | import(ape)
 7 | importClassesFrom(Matrix,TsparseMatrix)
 8 | importFrom(BiocFileCache,BiocFileCache)
 9 | importFrom(BiocFileCache,bfcnew)
10 | importFrom(BiocFileCache,bfcquery)
11 | importFrom(BiocFileCache,bfcrpath)
12 | importFrom(data.table,fread)
13 | importFrom(data.table,setkey)
14 | 


--------------------------------------------------------------------------------
/R/constants.R:
--------------------------------------------------------------------------------
# DOI that .getVersions() in loader.R resolves over HTTP to locate the version
# manifest; per the notes in loader.R it always resolves to the most recent
# release of the compendium.
canonical_doi <- 'https://doi.org/10.5281/zenodo.8186993'
2 | 


--------------------------------------------------------------------------------
/R/loader.R:
--------------------------------------------------------------------------------
  1 | .getVersions <- function(bfc, verbose=FALSE) {
  2 |     # Determines the most recent version of the compendium
  3 |     # and retrieves the manifest that describes all available releases.
  4 |     # Returns a data.table listing all versions and the necessary URLs
  5 |     # This requires the canonical_doi configuration value stored in
  6 |     # constants.R, which always resolves to the most recent version.
  7 | 
  8 |     # Check if we've already cached the manifest.
  9 |     # If not, make an HTTP call to get the URL we need
 10 |     rpath <- BiocFileCache::bfcquery(bfc, 'manifest')$rpath
 11 |     if(length(rpath) == 0) {
 12 |       if(verbose) {
 13 |         print('Retrieving version information.')
 14 |       }
 15 |       resolve <- curl::curl_fetch_memory(canonical_doi)
 16 |       if(resolve$status_code != 200) {
 17 |         stop(paste0(
 18 |           'Could not resolve canonical DOI. Status code: ',
 19 |           resolve$status_code
 20 |         ))
 21 |       }
 22 | 
 23 |       if(verbose) {
 24 |         print('Determined data address:')
 25 |         print(resolve$url)
 26 |       }
 27 |       manifest <- paste0(resolve$url, '/files/manifest.csv')
 28 | 
 29 |       rpath <- tryCatch(
 30 |         {
 31 |           bfcrpath(bfc, manifest)
 32 |         },
 33 |         error = function(msg){
 34 |           print('Could not retrieve manifest file. Falling back to manifest as of v1.1.0')
 35 |           towrite <- data.table::data.table(
 36 |             version = c('1.1.0', '1.0.1'),
 37 |             zenodo_id = c('13733642', '10452633'),
 38 |             default = c(TRUE, FALSE)
 39 |           )
 40 |           # we save this to the cache so the app remembers not to keep looking online
 41 |           # for a manifest every time the version information is needed
 42 |           savepath <- BiocFileCache::bfcnew(bfc, 'manifest', ext='.csv')
 43 |           data.table::fwrite(towrite, file=savepath)
 44 |           savepath
 45 |         }
 46 |       )
 47 |     }
 48 |     else {
 49 |       if(verbose) {
 50 |         print('Cached version information found.')
 51 |       }
 52 |     }
 53 |     results <- data.table::fread(rpath)
 54 | 
 55 |     colnames(results) <- c('version','zenodo_id','default')
 56 |     results$data_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/taxonomic_table.csv.gz')
 57 |     results$coldata_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/sample_metadata.tsv')
 58 |     data.table::setkey(results, version)
 59 |     results
 60 | }
 61 | 
 62 | .getCompendiumData <- function(version, bfc) {
 63 |     versions <- .getVersions(bfc)
 64 |     rpath <- bfcrpath(bfc, versions[version]$data_url)
 65 |     data.table::fread(rpath)
 66 | }
 67 | 
 68 | .getCompendiumColdata <- function(version, bfc) {
 69 |     versions <- .getVersions(bfc)
 70 |     rpath <- bfcrpath(bfc, versions[version]$coldata_url)
 71 |     sampdat <- as.data.frame(data.table::fread(rpath))
 72 |     rownames(sampdat) <- paste(sampdat[[2]], sampdat[[3]], sep = "_")
 73 |     sampdat
 74 | }
 75 | 
 76 | #' load all compendium data into a TreeSummarizedExperiment
 77 | #'
 78 | #' @param bfc BiocFileCache object to use
 79 | #'
 80 | #' @returns a `TreeSummarizedExperiment`
 81 | #'
 82 | #' @importFrom data.table fread setkey
 83 | #' @importClassesFrom Matrix TsparseMatrix
 84 | #' @import TreeSummarizedExperiment
 85 | #' @import R.utils
 86 | #' @import ape
 87 | #' @importFrom BiocFileCache BiocFileCache bfcrpath bfcquery bfcnew
 88 | #'
 89 | #' @export
 90 | #'
 91 | #' @examples
 92 | #' cpd <- getCompendium()
 93 | #'
 94 | #' dim(cpd)
 95 | #' cpd
 96 | #' assayNames(cpd)
 97 | #' head(colData(cpd))
 98 | #'
 99 | 
100 | getCompendium <- function(version=NA, bfc = BiocFileCache::BiocFileCache()) {
101 |     versions <- .getVersions(bfc)
102 | 
103 |     if(is.na(version)) {
104 |         # If the user has not specified a version, grab whichever
105 |         # is indicated in the manifest as the default (i.e. most recent)
106 |         version <- versions[versions$default,]$version[1]
107 |     }
108 |     print(paste('Retrieving compendium version',version))
109 |     dat <-.getCompendiumData(version, bfc)
110 |     coldat <- .getCompendiumColdata(version, bfc)
111 | 
112 |     sampnames <- dat[[2]]
113 | 
114 |     coldat <- coldat[match(sampnames, rownames(coldat)), ]
115 | 
116 |     taxa <- colnames(dat)[3:ncol(dat)]
117 |     requireNamespace("Matrix")
118 |     # mat = as(as.matrix(dat[,3:ncol(dat)]), 'TsparseMatrix')
119 |     mat <- as.matrix(dat[, 3:ncol(dat)])
120 |     rownames(mat) <- sampnames
121 |     colnames(mat) <- taxa
122 |     sampinfo <- do.call(rbind, strsplit(sampnames, "_"))
123 |     colnames(sampinfo) <- c("project", "sample")
124 |     coldata <- data.frame(sampinfo)
125 |     rownames(coldata) <- sampnames
126 |     splittaxa <- do.call(rbind, lapply(
127 |         strsplit(taxa, "\\."),
128 |         function(x) {
129 |             c(x, rep(NA, 8 - length(x)))
130 |         }
131 |     ))
132 |     colnames(splittaxa) <- c(
133 |         "kingdom",
134 |         "phylum",
135 |         "class",
136 |         "order",
137 |         "family",
138 |         "genus",
139 |         "species",
140 |         "strain"
141 |     )
142 |     rowdata <- data.frame(splittaxa)
143 |     rownames(rowdata) <- taxa
144 |     td <- TreeSummarizedExperiment::TreeSummarizedExperiment(
145 |         colData = coldat,
146 |         rowData = rowdata,
147 |         assays = list(counts = t(mat))
148 |     )
149 |     td
150 | }
151 | 
taxonname2edgelist <- function(taxon) {
    # Turns one dot-separated taxon name into parent -> child pairs.
    #
    # taxon: a single taxon name such as "Bacteria.Firmicutes.Clostridia".
    #
    # Returns a data.frame with columns `from` and `to`, one row per pair of
    # adjacent ranks, or an empty data.frame() when fewer than two ranks
    # remain after dropping the literal "NA" entries.
    ranks <- strsplit(taxon, "\\.")[[1]]
    ranks <- ranks[ranks != "NA"]
    n_ranks <- length(ranks)
    if (n_ranks <= 1) {
        return(data.frame())
    }
    # Pair every rank except the last with the rank that follows it.
    data.frame(from = ranks[-n_ranks], to = ranks[-1])
}
163 | 
taxa2edgelist <- function(taxa) {
    # Collects the distinct parent -> child pairs of a set of taxon names.
    #
    # taxa: character vector of dot-separated taxon names.
    #
    # Returns a data.frame with columns `from` and `to` and no duplicated
    # rows. When no taxon yields an edge, an empty data.frame with those two
    # columns is returned.
    taxa_edgelist <- lapply(taxa, taxonname2edgelist)
    df <- do.call(rbind, taxa_edgelist)
    # rbind() gives NULL for an empty list and a zero-column frame when every
    # taxon had fewer than two ranks; normalise both to a typed empty result.
    if (is.null(df) || ncol(df) == 0) {
        return(data.frame(from = character(0), to = character(0)))
    }
    unique(df)
}
178 | 
taxa2phylo <- function(taxa) {
    # Builds an object of class "phylo" from dot-separated taxon names.
    #
    # taxa: character vector of taxon names.
    #
    # Returns a list with elements `edge` (two-column matrix of parent and
    # child ids), `tip.label` and `Nnode`, with class "phylo". Stops with an
    # error when no parent/child edge can be derived from `taxa`.
    #
    # NOTE(review): no check is made that the edges form a single-rooted tree
    # or that labels are unique across ranks -- confirm before relying on the
    # result with tree-handling functions.
    edgelist <- taxa2edgelist(taxa)
    if (is.null(edgelist) || nrow(edgelist) == 0 || ncol(edgelist) < 2) {
        stop("No parent/child edges could be derived from `taxa`.",
             call. = FALSE)
    }
    edgelist <- as.matrix(edgelist)

    # drop = FALSE keeps the matrix shape when only a single edge remains.
    complete <- !is.na(edgelist[, 1]) & !is.na(edgelist[, 2])
    edgelist <- edgelist[complete, , drop = FALSE]
    if (nrow(edgelist) == 0) {
        stop("No parent/child edges could be derived from `taxa`.",
             call. = FALSE)
    }

    from <- edgelist[, 1]
    to <- edgelist[, 2]
    ids <- unique(c(from, to))

    # Tips are the labels that never act as a parent.
    tip.label <- setdiff(ids, from)
    node.label <- unique(from)

    # make a map from taxonomy ID to internal 1:n ids
    # (tips are numbered first, internal nodes after them)
    idmap <- seq_along(c(tip.label, node.label))
    names(idmap) <- c(tip.label, node.label)

    # make a phylo object
    tree <- list(
        edge       = matrix(
            c(idmap[as.character(from)], idmap[as.character(to)]),
            ncol = 2
        ),
        tip.label  = unname(tip.label),
        # node.label = unname(node.label),
        Nnode      = length(node.label)
    )
    class(tree) <- "phylo"

    tree
}
207 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Microbiome Compendium
 2 | 
 3 | 
 4 | Our dataset includes more than 168,000 samples of publicly available 16S rRNA
 5 | amplicon sequencing data, all processed using the same pipeline and reference
 6 | database.
 7 | 
 8 | The goal of the MicroBioMap package is simply to expose these data
 9 | to the broad community of Bioconductor and R users with
10 | the smallest fuss possible.
11 | 
12 | See the [vignette](https://blekhmanlab.github.io/MicroBioMap/articles/overview.html) for more details.
13 | 
14 | ## Installation
15 | 
16 | For the smoothest installation experience, use the [`BiocManager` Bioconductor
17 | package](https://bioconductor.org/packages/BiocManager).
18 | 
19 | ```
20 | BiocManager::install('blekhmanlab/MicroBioMap')
21 | ```
22 | 
23 | ## Usage
24 | 
25 | Load the compendium using:
26 | 
27 | ```
28 | library(MicroBioMap)
29 | cpd <- getCompendium()
30 | ```
31 | 
32 | The resulting object is a `TreeSummarizedExperiment` object. Currently, the
33 | "tree" part of the TreeSummarizedExperiment is not populated, but that is
34 | on the roadmap.
35 | 
36 | After loading the compendium, you will have immediate access to nearly
37 | 170,000 microbiome samples.
38 | 
39 | The `getCompendium` function retrieves [data stored by Zenodo](https://doi.org/10.5281/zenodo.8186993) and accepts an optional parameter indicating which version to download—for example, `getCompendium('1.0.1')`. Version 1.1.0 is retrieved by default.
40 | 


--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
 1 | url: https://blekhmanlab.github.io/MicroBioMap/
 2 | template:
 3 |   bootstrap: 5
 4 |   bootswatch: spacelab
 5 | 
 6 | navbar:
 7 |   structure:
 8 |     right: [github]
 9 |   components:
10 |     github:
11 |       icon: fa-github
12 |       href: http://github.com/blekhmanlab/MicroBioMap
13 | 


--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
 1 | citHeader("If MicroBioMap or the Human Microbiome Compendium has played a role in your publication, please cite our paper:")
 2 | 
 3 | bibentry(
 4 |   bibtype = "Article",
 5 |   title = 'Integration of 168,000 samples reveals global patterns of the human gut microbiome',
 6 |   journal = 'Cell',
 7 |   author = as.person('R J Abdill, S P Graham, V Rubinetti, M Ahmadian, P Hicks, A Chetty, D McDonald, P Ferretti, E Gibbons, M Rossi, A Krishan, F W Albert, C S Greene, S Davis, R Blekhman'),
 8 |   year = 2025,
 9 |   doi = '10.1016/j.cell.2024.12.017',
10 |   volume='188',
11 |   issue='4'
12 | )


--------------------------------------------------------------------------------
/man/getCompendium.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/loader.R
 3 | \name{getCompendium}
 4 | \alias{getCompendium}
 5 | \title{load all compendium data into a TreeSummarizedExperiment}
 6 | \usage{
 7 | getCompendium(version = NA, bfc = BiocFileCache::BiocFileCache())
 8 | }
 9 | \arguments{
10 | \item{bfc}{BiocFileCache object to use}
11 | }
12 | \value{
13 | a `TreeSummarizedExperiment`
14 | }
15 | \description{
16 | load all compendium data into a TreeSummarizedExperiment
17 | }
18 | \examples{
19 | cpd <- getCompendium()
20 | 
21 | dim(cpd)
22 | cpd
23 | assayNames(cpd)
24 | head(colData(cpd))
25 | 
26 | }
27 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
 1 | # This file is part of the standard setup for testthat.
 2 | # It is recommended that you do not modify it.
 3 | #
 4 | # Where should you do additional test configuration?
 5 | # Learn more about the roles of various files in:
 6 | # * https://r-pkgs.org/testing-design.html#sec-tests-files-overview
 7 | # * https://testthat.r-lib.org/articles/special-files.html
 8 | 
 9 | library(testthat)
10 | library(MicroBioMap)
11 | 
12 | test_check("MicroBioMap")
13 | 


--------------------------------------------------------------------------------
/tests/testthat/setup.R:
--------------------------------------------------------------------------------
# Run before any test
# Shared cache object used by the tests; it is created inside the test
# directory returned by testthat::test_path().
testbfc <- BiocFileCache::BiocFileCache(cache = testthat::test_path())

# Remove the cache created during tests
# The cleanup is deferred to testthat's teardown environment, so it runs once
# after the tests have finished; ask=FALSE makes it run without prompting.
withr::defer(
  BiocFileCache::cleanbfc(
    BiocFileCache::BiocFileCache(cache = testthat::test_path()),
    days=-Inf, ask=FALSE
  ),
  envir=testthat::teardown_env(), priority='last'
)
12 | 


--------------------------------------------------------------------------------
/tests/testthat/test-test_get_coldata.R:
--------------------------------------------------------------------------------
# Fetches the sample metadata of release 1.1.0 through the shared test cache
# (`testbfc`, created in setup.R) and checks its column layout.
test_that("sample metadata download works", {
    coldat <- .getCompendiumColdata('1.1.0', testbfc)
    # 11 columns are expected in total; the ten named below must be present.
    expect_equal(ncol(coldat), 11)
    expect_contains(colnames(coldat), c(
        "srs", "project", "srr", "library_strategy",
        "library_source", "pubdate", "total_bases",
        "instrument", "geo_loc_name",
        "region"
    ))
})
11 | 


--------------------------------------------------------------------------------
/tests/testthat/test-test_get_compendium.R:
--------------------------------------------------------------------------------
# Loads the default release through the shared test cache (`testbfc`, created
# in setup.R) and checks the class, size and a known count of the result.
# Skipped on CI.
test_that("getting compendium with defaults works as expected", {
    skip_on_ci()
    cpd <- getCompendium(bfc = testbfc)
    expect_s4_class(cpd, "TreeSummarizedExperiment")
    # rows are taxa, columns are samples
    expect_gt(nrow(cpd), 1000)
    expect_gt(ncol(cpd), 168000)
    expect_contains(assayNames(cpd), "counts")
    # v1.1.0 changed the name of this taxon:
    expect_equal(max(counts(cpd)['Bacteria.Bacillota.Clostridia.Eubacteriales.Alkalibacteraceae.Alkalibaculum',]), 64)
    # the name used for this taxon in the older release must no longer exist
    expect_error(max(counts(cpd)['Bacteria.Firmicutes.Clostridia.Eubacteriales.(unclassified).Alkalibaculum',]))
})
12 | 
# Loads release 1.0.1 explicitly through the shared test cache (`testbfc`,
# created in setup.R) and checks the class, size and a known count of the
# result. Skipped on CI.
test_that("getting compendium with specified version works as expected", {
    skip_on_ci()
    cpd <- getCompendium('1.0.1', bfc = testbfc)
    expect_s4_class(cpd, "TreeSummarizedExperiment")
    # rows are taxa, columns are samples
    expect_gt(nrow(cpd), 1000)
    expect_gt(ncol(cpd), 1000)
    expect_contains(assayNames(cpd), "counts")
    # this release names the taxon with an "(unclassified)" family level
    expect_equal(max(counts(cpd)['Bacteria.Firmicutes.Clostridia.Eubacteriales.(unclassified).Alkalibaculum',]), 16)
    # a row name that is not in this release must raise an error
    expect_error(max(counts(cpd)['Bacteria.Firmicutes.Clostridia.Eubacteriales.Alkalibaculum.NA',]))
})
23 | 


--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 | 


--------------------------------------------------------------------------------
/vignettes/overview.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Using the MicroBioMap package"
  3 | output: rmarkdown::html_vignette
  4 | vignette: >
  5 |   %\VignetteIndexEntry{Using the MicroBioMap package}
  6 |   %\VignetteEngine{knitr::rmarkdown}
  7 |   %\VignetteEncoding{UTF-8}
  8 | ---
  9 | 
 10 | ## Overview
 11 | 
 12 | The human microbiome, particularly in the large intestine, is gaining attention
 13 | for its role in health and disease, including conditions like colorectal cancer
 14 | and inflammatory bowel disease. Various factors, including genetics and
 15 | ethnicity, influence the human microbiome, often tied to geographic regions. As
 16 | a result, individuals from different global regions tend to have distinct gut
 17 | microbiomes. While research has shown significant variation in the gut
 18 | microbiome across countries and social groups, much of the microbiome literature
 19 | has focused on a limited range of subjects, raising questions about the
 20 | generalizability of these findings.
 21 | 
 22 | To address these limitations, researchers have turned to publicly available
 23 | microbiome data, revealing an overrepresentation of high-income countries like
 24 | the United States and Finland in major microbiome research repositories.
 25 | However, significant variations from the Western baseline are observed when
 26 | other communities are studied. This suggests that the current understanding of
 27 | the microbiome may be limited due to geographic gaps in research.
 28 | 
 29 | To bridge this gap and enhance the field's bioinformatic capabilities, the Human
 30 | Microbiome Compendium is introduced. It comprises over 168,000 human gut
 31 | microbiome samples from 68 countries, processed using advanced tools and made
 32 | accessible in various formats, including the MicroBioMap R package and a
 33 | dedicated website. This dataset is used to analyze global patterns in microbiome
 34 | composition and identify areas where our knowledge of the human gut microbiome
 35 | is lacking.
 36 | 
 37 | ### Primary data processing
 38 | 
 39 | Amplicon processing was conducted according to the following criteria: If the
 40 | quantity of forward read files corresponded with the number of reverse read
 41 | files, we proceeded with paired-end sequencing analysis. However, in cases of a
 42 | mismatch or the absence of reverse reads, the project was treated as
 43 | single-ended data. In both scenarios, we employed DADA2 version 1.14.0 for data
 44 | processing (B. J. Callahan et al. 2016). We utilized broad parameter settings
 45 | intended to maximize the inclusion of samples while excluding low-quality data:
 46 | Specifically, we did not apply fixed trimming from either end or impose a
 47 | maximum read length. Reads shorter than 20 nucleotides, those containing
 48 | ambiguous ("N") base calls, and reads aligning to the phiX genome (typically
 49 | present as a control in Illumina sequencing runs) were eliminated. Additionally,
 50 | quality-based truncation of reads was disabled. Paired-end reads were merged
 51 | with a minimum overlap of 20 bases. In certain instances, the merging process
 52 | failed, resulting in minimal or no merging of forward and reverse reads, often
 53 | associated with sequencing strategies featuring non-overlapping reads. For
 54 | studies where less than 50 percent of forward reads merged successfully, we
 55 | opted to discard the reverse reads rather than concatenate them. This decision
 56 | aimed to prevent failures in merging due to low-quality calls or discrepancies
 57 | between forward and reverse read files. In such cases, the reverse reads were
 58 | excluded, and the projects were reprocessed as single-ended data. When the
 59 | number of forward reads did not align with the number of reverse reads within a
 60 | sample, an attempt was made to employ DADA2 to identify the sequence identifier
 61 | field in the FASTQ file, facilitating the alignment of salvageable samples. In
 62 | cases where this attempt proved unsuccessful, the reverse reads were removed,
 63 | and the data was reprocessed as single-ended. Taxonomic assignment was carried
 64 | out by DADA2, utilizing the SILVA database release 138.1 as a reference (Quast
 65 | et al. 2013; McLaren and Callahan 2021).
 66 | 
 67 | ## Getting started
 68 | 
 69 | ### Installation
 70 | 
 71 | ```{r eval=FALSE}
 72 | library(BiocManager)
 73 | BiocManager::install("blekhmanlab/MicroBioMap")
 74 | ```
 75 | 
 76 | ## Basic usage
 77 | 
 78 | ```{r message=FALSE}
 79 | library(MicroBioMap)
 80 | # this operation requires about 4GB of RAM
 81 | cpd <- getCompendium()
 82 | ```
 83 | 
 84 | ```{r}
 85 | cpd
 86 | ```
 87 | 
 88 | ### Sample metadata
 89 | 
 90 | 
 91 | The `colData` slot of the `cpd` object contains the information about samples.
 92 | We can examine the names of the available metadata:
 93 | 
 94 | ```{r}
 95 | names(colData(cpd))
 96 | ```
 97 | 
 98 | We can look at the samples by available regions:
 99 | 
100 | * Region
101 |     ```{r}
102 | table(colData(cpd)$region)
103 |     ```
104 | 
105 | * Library strategy
106 |     ```{r}
107 | table(colData(cpd)$library_strategy)
108 |     ```
109 | 
110 | * Instrument
111 |     ```{r}
112 | table(colData(cpd)$instrument)
113 |     ```
114 | 
115 | * Country
116 |     ```{r}
117 | head(sort(table(colData(cpd)$geo_loc_name), decreasing = TRUE))
118 |     ```
119 | 
120 | 
121 | ### Microbe (row) metadata
122 | 
123 | Each row in `cpd` represents one microbe. The `rowData` contains a
124 | data.frame-like set of metadata with columns `kingdom`, `phylum`, etc.
125 | 
126 | ```{r}
127 | head(rowData(cpd))
128 | ```
129 | 
130 | ## Use cases
131 | 
132 | ### Samples in specific Bioprojects
133 | 
134 | To select samples that belong to specific Bioprojects, we can filter using
135 | the "project" metadata column in the `colData`.
136 | 
137 | ```{r}
138 | head(unique(colData(cpd)$project))
139 | ```
140 | 
141 | For example, create a subset of data belonging to Bioprojects:
142 | 
143 | - PRJDB10485
144 | - PRJDB10527
145 | - PRJDB10528
146 | - PRJDB10612
147 | 
148 | ```{r}
149 | project_ids <- c("PRJDB10485", "PRJDB10527", "PRJDB10528", "PRJDB10612")
150 | cpd_sub_by_project_ids <- cpd[, colData(cpd)$project %in% project_ids]
151 | cpd_sub_by_project_ids
152 | ```
153 | 
154 | ### Samples from a specific world region
155 | 
156 | To select a subset of samples from a specific world region, subset by the
157 | "columns" of the dataset. The following code summarizes the regions that are
158 | available and the number of samples in each region.
159 | 
160 | ```{r}
161 | sort(table(colData(cpd)$region), decreasing = TRUE)
162 | ```
163 | Now, select only those samples from, for example, "Sub-Saharan Africa".
164 | 
165 | ```{r}
166 | cpd_africa <- cpd[, colData(cpd)$region == "Sub-Saharan Africa"]
167 | cpd_africa
168 | ```
169 | 
170 | ### Samples from a specific country
171 | 
172 | To select a subset of samples from a specific country, subset by the
173 | "columns" of the dataset. The following code summarizes the countries that are
174 | available and the number of samples in each country.
175 | 
176 | ```{r}
177 | library(forcats)
178 | library(ggplot2)
179 | ggplot(as.data.frame(colData(cpd)),
180 |     aes(x = fct_infreq(iso))) +
181 |     geom_bar(stat='count') +
182 |     theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
183 |     labs(x = "Country", y = "Number of samples") +
184 |     scale_y_log10()
185 | ```
186 | 
187 | ### Enrich data with country names
188 | 
189 | Using the `countrycode` package, we can convert the iso codes to country names.
190 | 
191 | ```{r}
192 | library(countrycode)
193 | colData(cpd)$country <- countrycode(colData(cpd)$iso, "iso2c", "country.name")
194 | # or add the iso3c code
195 | colData(cpd)$iso3c <- countrycode(colData(cpd)$iso, "iso2c", "iso3c")
196 | ```
197 | 
198 | ### Histogram of total bases per sample
199 | 
200 | ```{r warning=FALSE, message=FALSE}
201 | library(ggplot2)
202 | ggplot(as.data.frame(colData(cpd)), aes(x = log10(total_bases))) +
203 |     geom_histogram()
204 | ```
205 | 
206 | ### Samples filtered by presence of microbe
207 | 
208 | There are over 4000 microbes represented in the compendium. Some are fairly rare
209 | and we may want to subset the compendium to only those samples that contain a
210 | specific microbe. Here, we pick an arbitrary exemplar to show just the
211 | mechanics.
212 | 
213 | ```{r}
214 | shig_cpd_counts = counts(cpd)['Bacteria.Bacillota.Clostridia.Eubacteriales.Alkalibacteraceae.Alkalibaculum',]
215 | ```
216 | 
217 | Examine the distribution of abundance across all samples in the compendium.
218 | 
219 | ```{r}
220 | hist(log10(shig_cpd_counts))
221 | ```
222 | Picking an arbitrary threshold (after all, this is only an example), we can
223 | limit samples to those with plenty of our bacteria of interest present.
224 | 
225 | ```{r}
226 | shig_cpd <- cpd[, log10(shig_cpd_counts)>2]
227 | dim(shig_cpd)
228 | ```
229 | 
230 | ## sessionInfo
231 | 
232 | ```{r}
233 | sessionInfo()
234 | ```
235 | 


--------------------------------------------------------------------------------