├── _pkgdown.yml
├── tests
│   ├── testthat.R
│   └── testthat
│       ├── test-getMetadata.R
│       ├── test-searchAnalysis.R
│       ├── test-getData.R
│       ├── test-doQuery.R
│       ├── test-MgnifyClient.R
│       ├── test-getResult.R
│       └── test-getFile.R
├── man
│   ├── figures
│   │   ├── mgnify_logo.jpg
│   │   ├── mgnifyr_logo.png
│   │   └── findingpheno_logo.png
│   ├── MGnifyR-package.Rd
│   ├── getMetadata.Rd
│   ├── searchAnalysis.Rd
│   ├── MgnifyClient-accessors.Rd
│   ├── getData.Rd
│   ├── deprecate.Rd
│   ├── MgnifyClient.Rd
│   ├── getFile.Rd
│   ├── doQuery.Rd
│   └── getResult.Rd
├── inst
│   ├── extdata
│   │   └── vignette_MGnifyR.rds
│   └── extras
│       ├── TODO
│       └── demo_code.R
├── .Rbuildignore
├── .gitignore
├── NEWS
├── R
│   ├── MGnifyR.R
│   ├── AllClasses.R
│   ├── AllGenerics.R
│   ├── AllAccessors.R
│   ├── deprecate.R
│   ├── getMetadata.R
│   ├── MgnifyClient.R
│   ├── getData.R
│   ├── searchAnalysis.R
│   ├── doQuery.R
│   ├── getFile.R
│   └── utils.R
├── .github
│   └── workflows
│       └── rworkflows.yml
├── vignettes
│   ├── references.bib
│   ├── MGnifyR.Rmd
│   ├── MGnify_course.Rmd
│   └── MGnifyR_long.Rmd
├── DESCRIPTION
├── NAMESPACE
├── README.md
└── LICENSE
/_pkgdown.yml:
--------------------------------------------------------------------------------
1 | url: ~
2 | template:
3 | bootstrap: 5
4 |
5 |
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(MGnifyR)
3 |
4 | test_check("MGnifyR")
5 |
--------------------------------------------------------------------------------
/man/figures/mgnify_logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EBI-Metagenomics/MGnifyR/HEAD/man/figures/mgnify_logo.jpg
--------------------------------------------------------------------------------
/man/figures/mgnifyr_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EBI-Metagenomics/MGnifyR/HEAD/man/figures/mgnifyr_logo.png
--------------------------------------------------------------------------------
/inst/extdata/vignette_MGnifyR.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EBI-Metagenomics/MGnifyR/HEAD/inst/extdata/vignette_MGnifyR.rds
--------------------------------------------------------------------------------
/man/figures/findingpheno_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EBI-Metagenomics/MGnifyR/HEAD/man/figures/findingpheno_logo.png
--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | .github
4 | MGnifyR/vignettes/MGnifyR_cache
5 | ^_pkgdown\.yml$
6 | ^docs$
7 | ^pkgdown$
8 | ^doc$
9 | ^Meta$
10 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .*_cache
2 | inst/extras/
3 | *~
4 | .Rproj.user
5 | .Rhistory
6 | .RData
7 | .Ruserdata
8 | *.Rcheck
9 | .MGnify_cache
10 | inst/doc
11 | NOTES
12 | *.RDS
13 | Testing
14 | .gitignore
15 | tmp
16 | .RDataTmp
17 | *.Rproj
18 | **/NA/
19 | vignettes/*.html
20 | vignettes/*.R
21 | vignettes/*.pdf
22 | vignettes/MGnifyR_cache
23 | docs
24 | /doc/
25 | /Meta/
26 |
--------------------------------------------------------------------------------
/inst/extras/TODO:
--------------------------------------------------------------------------------
1 | In no particular order:
2 |
3 | Do documentation. DONE(ish)
4 | Convert to single monolithic mgnify_query function DONE
5 | Add caching functionality. DONE
6 | Add warnings for multiple runs etc.
7 | Split phyloseq grab into 2 functions
8 | - get metadata DF/list for analysis runs
9 | - Do actual grab of biome data
10 | phyloseq conversion needs to be able to handle a list
11 |
12 | Fix query to check if lists of filters ~can~ be lists (i.e. supply multiple elements as filters)
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/NEWS:
--------------------------------------------------------------------------------
1 | Version: 0.99.23
2 | Date: 2024-03-04
3 | + getResult fix: failed to construct MAE if samples in experiments did not match
4 |
5 | Version: 0.99.20
6 | Date: 2024-02-26
7 | + searchAnalysis now returns a named vector whose names are the accession IDs that were fed as input
8 |
9 | Version: 0.99.19
10 | Date: 2024-02-15
11 | + Fix deprecated mgnify_client function
12 |
13 | Version: 0.99.18
14 | Date: 2024-02-12
15 | + Last modifications for Bioconductor submission
16 |
17 | Changes in version 0.99.17
18 | + Added getData function for fetching raw data from the database
19 |
20 | Version 0.99.0
21 | + Support for TreeSummarizedExperiment and MultiAssayExperiment
22 | + Submitted to Bioconductor
23 |
--------------------------------------------------------------------------------
/R/MGnifyR.R:
--------------------------------------------------------------------------------
1 | #' \code{MGnifyR} Package.
2 | #'
3 | #' \code{MGnifyR} implements an interface to the EBI MGnify database.
4 | #' See the vignette for a general introduction to this package,
5 | #' \href{https://www.ebi.ac.uk/metagenomics/about}{About MGnify} for general MGnify
6 | #' information, and
7 | #' \href{https://emg-docs.readthedocs.io/en/latest/api.html}{API documentation} for
8 | #' details about the JSONAPI implementation.
9 | #' @name MGnifyR-package
10 | #' @aliases MGnifyR
11 | #' @docType package
12 | #' @seealso \link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment} class
13 | NULL
14 |
15 | #' @import mia
16 | #' @import TreeSummarizedExperiment
17 | #' @import MultiAssayExperiment
18 | NULL
19 |
--------------------------------------------------------------------------------
/man/MGnifyR-package.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/MGnifyR.R
3 | \docType{package}
4 | \name{MGnifyR-package}
5 | \alias{MGnifyR-package}
6 | \alias{MGnifyR}
7 | \title{\code{MGnifyR} Package.}
8 | \description{
9 | \code{MGnifyR} implements an interface to the EBI MGnify database.
10 | See the vignette for a general introduction to this package,
11 | \href{https://www.ebi.ac.uk/metagenomics/about}{About MGnify} for general MGnify
12 | information, and
13 | \href{https://emg-docs.readthedocs.io/en/latest/api.html}{API documentation} for
14 | details about the JSONAPI implementation.
15 | }
16 | \seealso{
17 | \link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment} class
18 | }
19 | \author{
20 | \strong{Maintainer}: Tuomas Borman \email{tuomas.v.borman@utu.fi} (\href{https://orcid.org/0000-0002-8563-8884}{ORCID})
21 |
22 | Authors:
23 | \itemize{
24 | \item Ben Allen \email{ben.allen@ncl.ac.uk}
25 | \item Leo Lahti \email{leo.lahti@iki.fi} (\href{https://orcid.org/0000-0001-5537-637X}{ORCID})
26 | }
27 |
28 | }
29 |
--------------------------------------------------------------------------------
/man/getMetadata.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/AllGenerics.R, R/getMetadata.R
3 | \name{getMetadata}
4 | \alias{getMetadata}
5 | \alias{getMetadata,MgnifyClient-method}
6 | \title{Get all study, sample and analysis metadata for the supplied analysis
7 | accessions}
8 | \usage{
9 | getMetadata(x, ...)
10 |
11 | \S4method{getMetadata}{MgnifyClient}(x, accession, ...)
12 | }
13 | \arguments{
14 | \item{x}{A \code{MgnifyClient} object.}
15 |
16 | \item{...}{Optional arguments; not currently used.}
17 |
18 | \item{accession}{A single character value or a vector of analysis accession
19 | IDs specifying accessions to retrieve data for.}
20 | }
21 | \value{
22 | A \code{data.frame} containing metadata for each analysis in the
23 | \code{accession} list. Each row represents a single analysis.
24 | }
25 | \description{
26 | Get all study, sample and analysis metadata for the supplied analysis
27 | accessions
28 | }
29 | \details{
30 | The function retrieves all study, sample and analysis metadata associated
31 | with provided analysis accessions.
32 | }
33 | \examples{
34 | # Create a client object
35 | mg <- MgnifyClient(useCache = FALSE)
36 |
37 | # Download all associated study/sample and analysis metadata
38 | accession_list <- c("MGYA00377505")
39 | meta_dataframe <- getMetadata(mg, accession_list)
40 |
41 | }
42 |
--------------------------------------------------------------------------------
/.github/workflows/rworkflows.yml:
--------------------------------------------------------------------------------
1 | name: rworkflows
2 | 'on':
3 | push:
4 | branches:
5 | - master
6 | - main
7 | - devel
8 | - RELEASE_**
9 | pull_request:
10 | branches:
11 | - master
12 | - main
13 | - devel
14 | - RELEASE_**
15 | jobs:
16 | rworkflows:
17 | permissions: write-all
18 | runs-on: ${{ matrix.config.os }}
19 | name: ${{ matrix.config.os }} (${{ matrix.config.r }})
20 | container: ${{ matrix.config.cont }}
21 | strategy:
22 | fail-fast: ${{ false }}
23 | matrix:
24 | config:
25 | - os: ubuntu-latest
26 | bioc: devel
27 | r: auto
28 | cont: ghcr.io/bioconductor/bioconductor_docker:devel
29 | rspm: ~
30 | - os: macOS-latest
31 | bioc: devel
32 | r: auto
33 | cont: ~
34 | rspm: ~
35 | - os: windows-latest
36 | bioc: devel
37 | r: auto
38 | cont: ~
39 | rspm: ~
40 | steps:
41 | - uses: neurogenomics/rworkflows@master
42 | with:
43 | run_bioccheck: ${{ false }}
44 | run_rcmdcheck: ${{ true }}
45 | as_cran: ${{ true }}
46 | run_vignettes: ${{ true }}
47 | has_testthat: ${{ true }}
48 | run_covr: ${{ true }}
49 | run_pkgdown: ${{ true }}
50 | has_runit: ${{ false }}
51 | has_latex: ${{ false }}
52 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
53 | run_docker: ${{ false }}
54 | DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
55 | runner_os: ${{ runner.os }}
56 | cache_version: cache-v1
57 | docker_registry: ghcr.io
58 |
--------------------------------------------------------------------------------
/R/AllClasses.R:
--------------------------------------------------------------------------------
1 | # MgnifyClient class and its accessors
2 |
3 | #' A MgnifyClient object
4 | #'
5 | #' @details An object that is required by functions of the MGnifyR package.
6 | #'
7 | #' @slot databaseUrl A single character value specifying the URL address of
8 | #' the database.
9 | #'
10 | #' @slot authTok A single character value specifying authentication token.
11 | #'
12 | #' @slot useCache A single boolean value specifying whether to use cache.
13 | #'
14 | #' @slot cacheDir A single character value specifying cache directory.
15 | #'
16 | #' @slot showWarnings A single boolean value specifying whether to show
17 | #' warnings.
18 | #'
19 | #' @slot clearCache A single boolean value specifying whether to clear cache.
20 | #'
21 | #' @slot verbose A single boolean value specifying whether to show messages.
22 | #'
23 | #' @section Constructor:
24 | #' See \code{\link{MgnifyClient}} for constructor.
25 | #'
26 | #' @section Accessor:
27 | #' See \code{\link{MgnifyClient-accessors}} for accessor functions.
28 | #'
29 | #' @name MgnifyClient
30 | NULL
31 |
32 | #' @rdname MgnifyClient
33 | #' @importFrom httr POST
34 | #' @importFrom httr content
35 | #' @exportClass MgnifyClient
36 | setClass(
37 | "MgnifyClient", representation(
38 | databaseUrl = "character",
39 | authTok = "character",
40 | useCache = "logical",
41 | cacheDir = "character",
42 | showWarnings = "logical",
43 | clearCache = "logical",
44 | verbose = "logical"),
45 | prototype = list(
46 | databaseUrl = "https://www.ebi.ac.uk/metagenomics/api/v1",
47 | authTok = NULL,
48 | useCache = FALSE,
49 | cacheDir = NULL,
50 | clearCache = FALSE,
51 | verbose = TRUE))
52 |
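53 | # Illustrative sketch (kept as comments so nothing is evaluated at build
54 | # time): the slots above are not meant to be accessed directly. A client is
55 | # created with the exported constructor, and slots are read or updated via
56 | # the accessors documented in ?`MgnifyClient-accessors`, e.g.
57 | #
58 | #   mg <- MgnifyClient(useCache = FALSE)
59 | #   databaseUrl(mg)        # "https://www.ebi.ac.uk/metagenomics/api/v1"
60 | #   useCache(mg) <- TRUE   # enable on-disk caching for this client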
--------------------------------------------------------------------------------
/tests/testthat/test-getMetadata.R:
--------------------------------------------------------------------------------
1 | context("getMetadata")
2 | test_that("getMetadata", {
3 |     # Test that input check catches wrong arguments.
4 | mg <- MgnifyClient(useCache = FALSE)
5 |
6 | expect_error(getMetadata(1))
7 | expect_error(getMetadata("test"))
8 | expect_error(getMetadata(TRUE))
9 |
10 | expect_error(getMetadata(mg, accession = TRUE))
11 | expect_error(getMetadata(mg, accession = 1))
12 | expect_error(getMetadata(mg, accession = c(1, 2)))
13 |
14 | expect_error(getMetadata(mg, accession = "test", use.cache = NULL))
15 | expect_error(getMetadata(mg, accession = "test", use.cache = 1))
16 | expect_error(getMetadata(mg, accession = "test", use.cache = c(TRUE, FALSE)))
17 |
18 | expect_error(getMetadata(mg, accession = "test", show.messages = NULL))
19 | expect_error(getMetadata(mg, accession = "test", show.messages = 1))
20 | expect_error(getMetadata(mg, accession = "test", show.messages = c(TRUE, FALSE)))
21 |
22 | # Require internet access
23 | skip_if(httr::http_error("https://www.ebi.ac.uk/metagenomics/api/v1"))
24 |
25 | # Test that correct metadata is fetched based on certain accession ID.
26 | res <- getMetadata(mg, "MGYA00097621", show.messages = FALSE)
27 | expect_equal(nrow(res), 1)
28 | expect_true(ncol(res) > 1)
29 | expect_equal(rownames(res)[1] , "MGYA00097621")
30 | expect_equal(res$run_accession, "ERR1811651")
31 |
32 | # # To reduce the time used to build the package, these tests are commented
33 | # # When metadata is not found, should give a warning and the result should
34 | # # be empty tibble
35 | # expect_warning(res <- getMetadata(mg, "MGYS00005292", show.messages = FALSE))
36 | # expect_true(ncol(res) == 0 && nrow(res) == 0)
37 | })
38 |
--------------------------------------------------------------------------------
/vignettes/references.bib:
--------------------------------------------------------------------------------
1 | @Manual{SE,
2 | title = {SummarizedExperiment: SummarizedExperiment container},
3 | author = {Martin Morgan and Valerie Obenchain and Jim Hester and Hervé Pagès},
4 | year = {2020},
5 | note = {R package version 1.20.0},
6 | url = {https://bioconductor.org/packages/SummarizedExperiment},
7 | }
8 |
9 | @Article{SCE,
10 | title = {Orchestrating single-cell analysis with Bioconductor},
11 | author = {Robert Amezquita and Aaron Lun and Etienne Becht and Vince Carey and Lindsay Carpp and Ludwig Geistlinger and Federico Marini and Kevin Rue-Albrecht and Davide Risso and Charlotte Soneson and Levi Waldron and Herve Pages and Mike Smith and Wolfgang Huber and Martin Morgan and Raphael Gottardo and Stephanie Hicks},
12 | year = {2020},
13 | volume = {17},
14 | pages = {137--145},
15 | journal = {Nature Methods},
16 | url = {https://www.nature.com/articles/s41592-019-0654-x},
17 | }
18 |
19 | @Manual{TSE,
20 | title = {TreeSummarizedExperiment: TreeSummarizedExperiment: a S4 Class for Data with Tree
21 | Structures},
22 | author = {Ruizhu Huang},
23 | year = {2021},
24 | note = {R package version 1.99.9},
25 | }
26 |
27 | @Article{dada2,
28 | title = {DADA2: High-resolution sample inference from Illumina amplicon data},
29 | author = {Benjamin J Callahan and Paul J McMurdie and Michael J Rosen and Andrew W Han and Amy Jo A Johnson and Susan P Holmes},
30 | journal = {Nature Methods},
31 | volume = {13},
32 | pages = {581-583},
33 | year = {2016},
34 | doi = {10.1038/nmeth.3869},
35 | }
36 |
37 | @Manual{DMM,
38 | title = {DirichletMultinomial: Dirichlet-Multinomial Mixture Model Machine Learning for
39 | Microbiome Data},
40 | author = {Martin Morgan},
41 | year = {2020},
42 | note = {R package version 1.32.0},
43 | }
44 |
--------------------------------------------------------------------------------
/man/searchAnalysis.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/AllGenerics.R, R/searchAnalysis.R
3 | \name{searchAnalysis}
4 | \alias{searchAnalysis}
5 | \alias{searchAnalysis,MgnifyClient-method}
6 | \title{Look up analysis accession IDs for one or more study or sample accessions}
7 | \usage{
8 | searchAnalysis(x, ...)
9 |
10 | \S4method{searchAnalysis}{MgnifyClient}(x, type, accession, ...)
11 | }
12 | \arguments{
13 | \item{x}{A \code{MgnifyClient} object.}
14 |
15 | \item{...}{Optional arguments; not currently used.}
16 |
17 | \item{type}{A single character value specifying a type of
18 | accession IDs specified by \code{accession}. Must be "studies" or "samples".}
19 |
20 | \item{accession}{A single character value or a vector of character values
21 | specifying study or sample accession IDs that are used to retrieve analyses
22 | IDs.}
23 | }
24 | \value{
25 | A named vector of analysis accession IDs, where names are the query accession IDs.
26 | }
27 | \description{
28 | Look up analysis accession IDs for one or more study or sample accessions
29 | }
30 | \details{
31 | Retrieve analysis accession IDs associated with the supplied study or
32 | sample accession. In MGnify, an analysis accession refers to a certain
33 | pipeline analysis, such as specific 16S rRNA or shotgun metagenomic mapping.
34 | Studies can include multiple samples, and each sample can undergo multiple
35 | analyses using these pipelines. Each analysis is identified by a unique
36 | accession ID, allowing precise tracking and retrieval of analysis results
37 | within the MGnify database.
38 | }
39 | \examples{
40 | # Create a client object
41 | mg <- MgnifyClient(useCache = FALSE)
42 |
43 | # Retrieve analysis ids from study MGYS00005058
44 | result <- searchAnalysis(mg, "studies", c("MGYS00005058"))
45 |
46 | \dontrun{
47 | # Retrieve all analysis ids from samples
48 | result <- searchAnalysis(
49 | mg, "samples", c("SRS4392730", "SRS4392743"))
50 | }
51 |
52 | }
53 |
--------------------------------------------------------------------------------
/tests/testthat/test-searchAnalysis.R:
--------------------------------------------------------------------------------
1 | context("searchAnalysis")
2 | test_that("searchAnalysis", {
3 |     # Test that input check catches wrong arguments.
4 | mg <- MgnifyClient(useCache = FALSE)
5 |
6 | expect_error(searchAnalysis(TRUE))
7 | expect_error(searchAnalysis("test"))
8 | expect_error(searchAnalysis(NULL))
9 | expect_error(searchAnalysis(1))
10 |
11 | expect_error(searchAnalysis(mg, type = TRUE, accession = "test"))
12 | expect_error(searchAnalysis(mg, type = "test", accession = "test"))
13 | expect_error(searchAnalysis(mg, type = NULL, accession = "test"))
14 | expect_error(
15 |         searchAnalysis(mg, type = c("studies", "samples"), accession = "test"))
16 |
17 | expect_error(searchAnalysis(mg, type = "studies", accession = TRUE))
18 | expect_error(searchAnalysis(mg, type = "studies", accession = NULL))
19 | expect_error(searchAnalysis(mg, type = "studies", accession = 1))
20 | expect_error(
21 | searchAnalysis(mg, type = "studies", accession = c(TRUE, FALSE)))
22 |
23 | # Require internet access
24 | skip_if(httr::http_error("https://www.ebi.ac.uk/metagenomics/api/v1"))
25 |
26 | # Test that correct analysis IDs are found based on study ID.
27 | expect_warning(res <- searchAnalysis(
28 | mg, "studies", "MGYA00097621", verbose = FALSE))
29 | expect_true(is.null(res))
30 | res <- searchAnalysis(mg, "studies", "MGYS00005058", verbose = FALSE)
31 | expect_true(length(res) > 0)
32 | expect_true("MGYA00377528" %in% res)
33 |
34 | # # To reduce the time used to build the package, these tests are commented
35 | # # Test that correct analysis IDs are found based on sample ID.
36 | # expect_warning(
37 | # res <- searchAnalysis(mg, "samples", "MGYA00097621", verbose = FALSE))
38 | # expect_true(is.null(res))
39 | # res <- searchAnalysis(mg, "samples", "ERS2161777", verbose = FALSE)
40 | # expect_true(length(res) > 0)
41 | # expect_true("MGYA00293854" %in% res)
42 | })
43 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: MGnifyR
2 | Type: Package
3 | Version: 1.5.1
4 | Authors@R:
5 | c(person(given = "Tuomas", family = "Borman", role = c("aut", "cre"),
6 | email = "tuomas.v.borman@utu.fi",
7 | comment = c(ORCID = "0000-0002-8563-8884")),
8 | person(given = "Ben", family = "Allen", role = c("aut"),
9 | email = "ben.allen@ncl.ac.uk"),
10 | person(given = "Leo", family = "Lahti", role = c("aut"),
11 | email = "leo.lahti@iki.fi",
12 | comment = c(ORCID = "0000-0001-5537-637X")))
13 | Title: R interface to EBI MGnify metagenomics resource
14 | Description:
15 | Utility package to facilitate integration and analysis of EBI MGnify data
16 |     in R. The package can be used to import microbial data, for instance, into
17 |     TreeSummarizedExperiment (TreeSE) objects. In TreeSE format, the data is
18 |     directly compatible with the miaverse framework.
19 | biocViews: Infrastructure, DataImport, Metagenomics, Microbiome, MicrobiomeData
20 | License: Artistic-2.0 | file LICENSE
21 | Encoding: UTF-8
22 | Depends:
23 | R(>= 4.4.0),
24 | MultiAssayExperiment,
25 | TreeSummarizedExperiment,
26 | SummarizedExperiment,
27 | BiocGenerics
28 | Imports:
29 | mia,
30 | ape,
31 | dplyr,
32 | httr,
33 | methods,
34 | plyr,
35 | reshape2,
36 | S4Vectors,
37 | urltools,
38 | utils,
39 | tidyjson
40 | Suggests:
41 | biomformat,
42 | broom,
43 | ggplot2,
44 | knitr,
45 | rmarkdown,
46 | testthat,
47 | xml2,
48 | BiocStyle,
49 | miaViz,
50 | vegan,
51 | scater,
52 | phyloseq,
53 | magick
54 | URL: https://github.com/EBI-Metagenomics/MGnifyR
55 | BugReports: https://github.com/EBI-Metagenomics/MGnifyR/issues
56 | VignetteBuilder: knitr
57 | RoxygenNote: 7.3.1
58 | Collate:
59 | 'utils.R'
60 | 'MgnifyClient.R'
61 | 'AllGenerics.R'
62 | 'AllClasses.R'
63 | 'AllAccessors.R'
64 | 'MGnifyR.R'
65 | 'deprecate.R'
66 | 'doQuery.R'
67 | 'getData.R'
68 | 'getFile.R'
69 | 'getMetadata.R'
70 | 'getResult.R'
71 | 'searchAnalysis.R'
72 |
--------------------------------------------------------------------------------
/tests/testthat/test-getData.R:
--------------------------------------------------------------------------------
1 | context("getData")
2 | test_that("getData", {
3 |     # Test that input check catches wrong arguments.
4 | mg <- MgnifyClient(useCache = FALSE)
5 |
6 | expect_error(getData(1))
7 | expect_error(getData("test"))
8 | expect_error(getData(TRUE))
9 |
10 | expect_error(getData(mg, type = 1))
11 | expect_error(getData(mg, type = TRUE))
12 | expect_error(getData(mg, type = NULL))
13 | expect_error(getData(mg, type = c("type", "type")))
14 |
15 | expect_error(getData(
16 |         mg, type = "kegg-modules", accession.type = "analyses", accession = 1))
17 | expect_error(getData(
18 | mg, type = "kegg-modules", accession.type = "analyses",
19 |         accession = TRUE))
20 | expect_error(getData(
21 | mg, type = "kegg-modules", accession.type = "analyses",
22 |         accession = NULL))
23 |
24 | expect_error(getData(
25 | mg, type = "kegg-modules", accession = c("MGYA00642773"),
26 |         accession.type = 1))
27 | expect_error(getData(
28 | mg, type = "kegg-modules",
29 |         accession = c("MGYA00642773", "MGYA00642774"), accession.type = TRUE))
30 | expect_error(getData(
31 | mg, type = "kegg-modules",
32 |         accession = c("MGYA00642773", "MGYA00642774"), accession.type = NULL))
33 | expect_error(getData(
34 | mg, type = "kegg-modules",
35 |         accession = c("MGYA00642773", "MGYA00642774"), accession.type = c("type", "type")))
36 |
37 | expect_error(getData(
38 | mg, type = "kegg-modules", accession = c("MGYA00642773"),
39 |         accession.type = c("type"), as.df = "test"))
40 | expect_error(getData(
41 | mg, type = "kegg-modules", accession = c("MGYA00642773"),
42 |         accession.type = c("type"), as.df = 1))
43 | expect_error(getData(
44 | mg, type = "kegg-modules", accession = c("MGYA00642773"),
45 |         accession.type = c("type"), as.df = c(TRUE, TRUE)))
46 | expect_error(getData(
47 | mg, type = "kegg-modules", accession = c("MGYA00642773"),
48 |         accession.type = c("type"), as.df = NULL))
49 |
50 | # Require internet access
51 | skip_if(httr::http_error("https://www.ebi.ac.uk/metagenomics/api/v1"))
52 |
53 | type <- "kegg-modules"
54 | res <- getData(
55 | mg, type = type, accession = "MGYA00642773", accession.type = "analyses")
56 | expect_is(res, "data.frame")
57 | expect_true( all(res[["type"]] == type) )
58 | })
59 |
--------------------------------------------------------------------------------
/tests/testthat/test-doQuery.R:
--------------------------------------------------------------------------------
1 | context("doQuery")
2 | test_that("doQuery", {
3 |     # Test that input check catches wrong arguments.
4 | mg <- MgnifyClient(useCache = FALSE)
5 |
6 | # Expect errors when input is wrong
7 | expect_error(doQuery("test"))
8 | expect_error(doQuery(TRUE))
9 | expect_error(doQuery(1))
10 |
11 | expect_error(doQuery(mg, type = 1))
12 | expect_error(doQuery(mg, type = "test"))
13 | expect_error(doQuery(mg, type = TRUE))
14 | expect_error(doQuery(mg, type = c("studies", "samples")))
15 |
16 | expect_error(doQuery(mg, type = "studies", accession = 1))
17 | expect_error(doQuery(mg, type = "studies", accession = TRUE))
18 | expect_error(doQuery(mg, type = "studies", accession = c(1, 2)))
19 |
20 | expect_error(doQuery(mg, type = "studies", accession = "test", as.df = NULL))
21 | expect_error(doQuery(mg, type = "studies", accession = "test", as.df = 1))
22 | expect_error(doQuery(mg, type = "studies", accession = "test", as.df = c(TRUE, FALSE)))
23 |
24 | expect_error(doQuery(mg, type = "studies", accession = "test", max.hits = TRUE))
25 | expect_error(doQuery(mg, type = "studies", accession = "test", max.hits = -100))
26 | expect_error(doQuery(mg, type = "studies", accession = "test", max.hits = c(1, 2)))
27 | expect_error(doQuery(mg, type = "studies", accession = "test", max.hits = 1.5))
28 |
29 | expect_error(doQuery(mg, type = "studies", accession = "test", use.cache = NULL))
30 | expect_error(doQuery(mg, type = "studies", accession = "test", use.cache = 1))
31 | expect_error(doQuery(mg, type = "studies", accession = "test", use.cache = c(TRUE, FALSE)))
32 |
33 | # Require internet access
34 | skip_if(httr::http_error("https://www.ebi.ac.uk/metagenomics/api/v1"))
35 |
36 | # Test that studies are searched based on certain accession ID, get result
37 | # as list, choose max hits
38 | query <- doQuery(mg, "studies", "MGYS00005292", max.hits = 1, as.df = FALSE)
39 | expect_true(is.list(query))
40 | expect_true(names(query) %in% "MGYS00005292")
41 | expect_true(query$MGYS00005292$type == "studies")
42 |
43 | # # To reduce the time used to build the package, these tests are commented
44 | # # Test that runs are searched, get result as df, choose max hits
45 | # query2 <- doQuery(mg, "studies", "MGYS00005292", max.hits = 1)
46 | # expect_true(is.data.frame(query2))
47 | # expect_equal(query2$bioproject,
48 | # query$MGYS00005292$attributes$bioproject)
49 | })
50 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | export("authTok<-")
4 | export("cacheDir<-")
5 | export("clearCache<-")
6 | export("databaseUrl<-")
7 | export("showWarnings<-")
8 | export("useCache<-")
9 | export("verbose<-")
10 | export(MgnifyClient)
11 | export(authTok)
12 | export(cacheDir)
13 | export(clearCache)
14 | export(databaseUrl)
15 | export(doQuery)
16 | export(getData)
17 | export(getFile)
18 | export(getMetadata)
19 | export(getResult)
20 | export(mgnify_analyses_from_samples)
21 | export(mgnify_analyses_from_studies)
22 | export(mgnify_client)
23 | export(mgnify_download)
24 | export(mgnify_get_analyses_metadata)
25 | export(mgnify_get_analyses_phyloseq)
26 | export(mgnify_get_analyses_results)
27 | export(mgnify_get_download_urls)
28 | export(mgnify_query)
29 | export(mgnify_retrieve_json)
30 | export(searchAnalysis)
31 | export(searchFile)
32 | export(showWarnings)
33 | export(useCache)
34 | export(verbose)
35 | exportClasses(MgnifyClient)
36 | exportMethods("authTok<-")
37 | exportMethods("cacheDir<-")
38 | exportMethods("clearCache<-")
39 | exportMethods("databaseUrl<-")
40 | exportMethods("showWarnings<-")
41 | exportMethods("useCache<-")
42 | exportMethods("verbose<-")
43 | exportMethods(authTok)
44 | exportMethods(cacheDir)
45 | exportMethods(clearCache)
46 | exportMethods(databaseUrl)
47 | exportMethods(doQuery)
48 | exportMethods(getData)
49 | exportMethods(getFile)
50 | exportMethods(getMetadata)
51 | exportMethods(getResult)
52 | exportMethods(searchAnalysis)
53 | exportMethods(searchFile)
54 | exportMethods(showWarnings)
55 | exportMethods(useCache)
56 | exportMethods(verbose)
57 | import(MultiAssayExperiment)
58 | import(TreeSummarizedExperiment)
59 | import(mia)
60 | importFrom(S4Vectors,SimpleList)
61 | importFrom(SummarizedExperiment,"rowData<-")
62 | importFrom(SummarizedExperiment,rowData)
63 | importFrom(TreeSummarizedExperiment,rowTree)
64 | importFrom(ape,read.tree)
65 | importFrom(dplyr,"%>%")
66 | importFrom(dplyr,bind_rows)
67 | importFrom(dplyr,mutate_all)
68 | importFrom(dplyr,na_if)
69 | importFrom(httr,GET)
70 | importFrom(httr,POST)
71 | importFrom(httr,add_headers)
72 | importFrom(httr,config)
73 | importFrom(httr,content)
74 | importFrom(httr,timeout)
75 | importFrom(httr,write_disk)
76 | importFrom(methods,is)
77 | importFrom(methods,new)
78 | importFrom(mia,checkTaxonomy)
79 | importFrom(mia,importBIOM)
80 | importFrom(plyr,llply)
81 | importFrom(plyr,rbind.fill)
82 | importFrom(reshape2,dcast)
83 | importFrom(tidyjson,spread_all)
84 | importFrom(urltools,"parameters<-")
85 | importFrom(urltools,parameters)
86 | importFrom(utils,read.csv2)
87 |
--------------------------------------------------------------------------------
/tests/testthat/test-MgnifyClient.R:
--------------------------------------------------------------------------------
1 | context("MgnifyClient")
2 | test_that("MgnifyClient", {
3 |     # Test that input check catches wrong arguments.
4 | mg <- MgnifyClient()
5 |
6 | # Expect errors when input is wrong
7 | expect_error(MgnifyClient(useCache = 1))
8 | expect_error(MgnifyClient(useCache = "TRUE"))
9 | expect_error(MgnifyClient(useCache = c(TRUE, TRUE)))
10 |
11 | expect_error(MgnifyClient(verbose = 1))
12 | expect_error(MgnifyClient(verbose = "TRUE"))
13 | expect_error(MgnifyClient(verbose = c(TRUE, TRUE)))
14 |
15 | expect_error(MgnifyClient(showWarnings = 1))
16 | expect_error(MgnifyClient(showWarnings = "TRUE"))
17 | expect_error(MgnifyClient(showWarnings = c(TRUE, TRUE)))
18 |
19 | expect_error(MgnifyClient(clearCache = 1))
20 | expect_error(MgnifyClient(clearCache = "TRUE"))
21 | expect_error(MgnifyClient(clearCache = c(TRUE, TRUE)))
22 |
23 | expect_error(MgnifyClient(showWarnings = 1))
24 | expect_error(MgnifyClient(showWarnings = "TRUE"))
25 | expect_error(MgnifyClient(showWarnings = c(TRUE, TRUE)))
26 |
27 | expect_error(MgnifyClient(url = 1))
28 | expect_error(MgnifyClient(url = TRUE))
29 | expect_error(MgnifyClient(url = c("url", "url")))
30 |
31 | expect_error(MgnifyClient(username = 1))
32 | expect_error(MgnifyClient(username = TRUE))
33 | expect_error(MgnifyClient(username = c("url", "url")))
34 |
35 | expect_error(MgnifyClient(password = 1))
36 | expect_error(MgnifyClient(password = TRUE))
37 | expect_error(MgnifyClient(password = c("url", "url")))
38 |
39 |     # Test that slots are updated. Change arguments --> check that slot
40 |     # values correspond to the arguments.
41 | mg <- MgnifyClient(
42 | useCache = TRUE,
43 | cacheDir = "test",
44 | showWarnings = FALSE,
45 | url = "test"
46 | )
47 | expect_equal(cacheDir(mg), "test/.MGnifyR_cache")
48 | expect_equal(showWarnings(mg), FALSE)
49 | expect_equal(databaseUrl(mg), "test")
50 | mg <- MgnifyClient(
51 | useCache = FALSE,
52 | cacheDir = "test",
53 | showWarnings = TRUE
54 | )
55 | expect_true(!is.na(cacheDir(mg)))
56 | expect_equal(showWarnings(mg), TRUE)
57 | # Require internet access
58 | skip_if(httr::http_error("https://www.ebi.ac.uk/metagenomics/api/v1"))
59 | # Test that error occurs when wrong username/password is used in
60 | # authentication
61 | expect_error(MgnifyClient(username = "not_work", password = "not_work"))
62 | expect_error(
63 | MgnifyClient(
64 | username = "not_work", password = "not_work", url = "not_work"))
65 | })
66 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MGnifyR
2 |
3 | An R package for searching and retrieving data from the
4 | [EBI Metagenomics resource](https://www.ebi.ac.uk/metagenomics).
5 | In most cases, MGnifyR interacts directly with the JSONAPI rather than relying
6 | on downloading analysis outputs as TSV files. This makes it more general,
7 | allowing, for example, multiple studies and analyses to be combined intuitively
8 | into a single workflow, although it can be slower than direct TSV access in
9 | some cases. Local caching of results on disk is implemented to help offset
10 | some of the overhead, but data downloads can still be slow, particularly for
11 | functional annotation retrieval.
12 |
13 | The MGnifyR package is part of the [miaverse](https://microbiome.github.io/)
14 | microbiome analysis ecosystem, enabling the use of
15 | [mia](https://bioconductor.org/packages/release/bioc/html/mia.html)
16 | and other miaverse packages.
17 |
18 |
19 |
20 | **This research has received funding from the Horizon 2020 Programme of the
21 | European Union within the FindingPheno project under grant agreement No
22 | 952914.** FindingPheno, an EU-funded project, is dedicated to developing
23 | computational tools and methodologies for the integration and analysis of
24 | multi-omics data. Its primary objective is to deepen our understanding of the
25 | interactions between hosts and their microbiomes. You can find more information
26 | on [FindingPheno website](https://findingpheno.eu/).
27 |
28 | ## Installation
29 |
30 | ### Bioc-release
31 |
32 | ```
33 | if (!requireNamespace("BiocManager", quietly = TRUE))
34 | install.packages("BiocManager")
35 |
36 | BiocManager::install("MGnifyR")
37 | ```
38 |
39 | ### Bioc-devel
40 |
41 | ```
42 | if (!requireNamespace("BiocManager", quietly = TRUE))
43 | install.packages("BiocManager")
44 |
45 | # The following initializes usage of Bioc devel
46 | BiocManager::install(version='devel')
47 |
48 | BiocManager::install("MGnifyR")
49 | ```
50 |
51 | ### GitHub
52 |
53 | ```
54 | remotes::install_github("EBI-Metagenomics/MGnifyR")
55 | ```
56 |
57 | ## Basic usage
58 | For more detailed instructions, read the associated function help and the vignette (`vignette("MGnifyR")`).
59 |
60 | ```
61 | library(MGnifyR)
62 |
63 | # Set up the MGnify client instance
64 | mg <- MgnifyClient(useCache = TRUE, cacheDir = '/tmp/MGnify_cache')
65 |
66 | # Retrieve the list of analyses associated with a study
67 | accession_list <- searchAnalysis(mg, "studies", "MGYS00005058")
68 |
69 | # Download all associated study/sample and analysis metadata
70 | meta_dataframe <- getMetadata(mg, accession_list)
71 |
72 | # Convert analyses outputs to a single `MultiAssayExperiment` object
73 | mae <- getResult(mg, meta_dataframe$analysis_accession)
74 | mae
75 | ```
76 |
77 |
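78 | The `MultiAssayExperiment` returned by `getResult()` bundles the taxonomic
79 | profile together with any functional annotation tables. A minimal sketch of a
80 | typical next step is shown below; it assumes the study above yields an
81 | experiment named "microbiota" and that the usual taxonomic rank columns are
82 | present in its `rowData`.
83 | 
84 | ```
85 | # Pick the taxonomic profiling data (a TreeSummarizedExperiment) from the MAE
86 | tse <- mae[["microbiota"]]
87 | 
88 | # Agglomerate the counts to Phylum level with mia for a coarser overview
89 | tse_phylum <- mia::agglomerateByRank(tse, rank = "Phylum")
90 | tse_phylum
91 | ```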
--------------------------------------------------------------------------------
/man/MgnifyClient-accessors.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/AllGenerics.R, R/AllAccessors.R
3 | \name{databaseUrl}
4 | \alias{databaseUrl}
5 | \alias{authTok}
6 | \alias{useCache}
7 | \alias{cacheDir}
8 | \alias{showWarnings}
9 | \alias{clearCache}
10 | \alias{verbose}
11 | \alias{databaseUrl<-}
12 | \alias{authTok<-}
13 | \alias{useCache<-}
14 | \alias{cacheDir<-}
15 | \alias{showWarnings<-}
16 | \alias{clearCache<-}
17 | \alias{verbose<-}
18 | \alias{MgnifyClient-accessors}
19 | \alias{databaseUrl,MgnifyClient-method}
20 | \alias{authTok,MgnifyClient-method}
21 | \alias{useCache,MgnifyClient-method}
22 | \alias{cacheDir,MgnifyClient-method}
23 | \alias{showWarnings,MgnifyClient-method}
24 | \alias{clearCache,MgnifyClient-method}
25 | \alias{verbose,MgnifyClient-method}
26 | \alias{databaseUrl<-,MgnifyClient-method}
27 | \alias{authTok<-,MgnifyClient-method}
28 | \alias{useCache<-,MgnifyClient-method}
29 | \alias{cacheDir<-,MgnifyClient-method}
30 | \alias{showWarnings<-,MgnifyClient-method}
31 | \alias{clearCache<-,MgnifyClient-method}
32 | \alias{verbose<-,MgnifyClient-method}
33 | \title{MgnifyClient accessors and mutators}
34 | \usage{
35 | databaseUrl(x)
36 |
37 | authTok(x)
38 |
39 | useCache(x)
40 |
41 | cacheDir(x)
42 |
43 | showWarnings(x)
44 |
45 | clearCache(x)
46 |
47 | verbose(x)
48 |
49 | databaseUrl(x) <- value
50 |
51 | authTok(x) <- value
52 |
53 | useCache(x) <- value
54 |
55 | cacheDir(x) <- value
56 |
57 | showWarnings(x) <- value
58 |
59 | clearCache(x) <- value
60 |
61 | verbose(x) <- value
62 |
63 | \S4method{databaseUrl}{MgnifyClient}(x)
64 |
65 | \S4method{authTok}{MgnifyClient}(x)
66 |
67 | \S4method{useCache}{MgnifyClient}(x)
68 |
69 | \S4method{cacheDir}{MgnifyClient}(x)
70 |
71 | \S4method{showWarnings}{MgnifyClient}(x)
72 |
73 | \S4method{clearCache}{MgnifyClient}(x)
74 |
75 | \S4method{verbose}{MgnifyClient}(x)
76 |
77 | \S4method{databaseUrl}{MgnifyClient}(x) <- value
78 |
79 | \S4method{authTok}{MgnifyClient}(x) <- value
80 |
81 | \S4method{useCache}{MgnifyClient}(x) <- value
82 |
83 | \S4method{cacheDir}{MgnifyClient}(x) <- value
84 |
85 | \S4method{showWarnings}{MgnifyClient}(x) <- value
86 |
87 | \S4method{clearCache}{MgnifyClient}(x) <- value
88 |
89 | \S4method{verbose}{MgnifyClient}(x) <- value
90 | }
91 | \arguments{
92 | \item{x}{A \code{MgnifyClient} object.}
93 |
94 | \item{value}{A value to be added to a certain slot.}
95 | }
96 | \value{
97 | The value of the queried slot, or a \code{MgnifyClient} object with the slot updated.
98 | }
99 | \description{
100 | MgnifyClient accessors and mutators
101 | }
102 | \details{
103 | These functions are for fetching and mutating slots of
104 | \code{MgnifyClient} object.
105 | }
106 | \examples{
107 | mg <- MgnifyClient()
108 |
109 | databaseUrl(mg)
110 | showWarnings(mg) <- FALSE
111 |
112 | }
113 |
--------------------------------------------------------------------------------
/inst/extras/demo_code.R:
--------------------------------------------------------------------------------
1 | library(vegan)
2 | library(ggplot2)
3 | library(phyloseq)
4 |
5 | library(MGnifyR)
6 |
7 | mg <- mgnify_client(usecache = T, cache_dir = '/tmp/MGnify_demo')
8 |
9 |
10 | ####### Queries:
11 | mgnify_query(mg, "studies", biome_name="Wastewater", usecache = T)
12 | mgnify_query(mg, "samples", latitude_gte=60.0, experiment_type="metagenomic", usecache = T)
13 | m <- mgnify_query(mg, "samples", study_accession = "MGYS00003725", usecache=T)
14 | acc_list <- mgnify_analyses_from_samples(mg, m$accession)
15 | df <- mgnify_get_analyses_metadata(mg, acc_list)
16 | df
17 |
18 |
19 | ##### Single study retrieval
20 | #Amplicon: Oral health of young adults: Amplicon study
21 | om_analyses <- mgnify_analyses_from_studies(mg, "MGYS00002277")
22 | om_metadata_df <- mgnify_get_analyses_metadata(mg, om_analyses)
23 | t(head(om_metadata_df))
24 |
25 | om_ps <- mgnify_get_analyses_phyloseq(mg, om_analyses, tax_SU = "SSU")
26 |
27 | om_ps_sub <- subset_samples(om_ps, sample_sums(om_ps) > 10000)
28 |
29 | omps <- rarefy_even_depth(om_ps_sub)
30 | omps
31 |
32 | #plt1 <- plot_bar(omps, fill="Class", facet_grid = "sample_sample.desc") + theme(legend.position = "none")
33 |
34 | alpha_div <- estimate_richness(omps)
35 |
36 | adf <- cbind.data.frame(sample_data(omps)$`sample_sample.desc`, alpha_div$InvSimpson)
37 | colnames(adf) <- c("factor","value")
38 |
39 | ggplot(adf, aes(x=factor, y=value)) + geom_boxplot(width=0.1) + geom_jitter(width=0.1) + theme_bw()
40 |
41 | omps_ord <- ordinate(omps, method = "PCoA" , distance = "bray")
42 | plot_ordination(omps, omps_ord, color = "sample_sample.desc") + theme_bw()
43 |
44 |
45 |
46 | ##### Multi-biome metagenome analysis
47 | set.seed(11)
48 | mg <- mgnify_client(usecache = T, cache_dir = "/tmp/mgnify_cache")
49 |
50 | #Study accessions
51 |
52 | #Saltmarsh metagenomes HiSeq4000 : MGYS00001447 - 48 samples
53 | #Healthy human gut metagenomes : MGYS00001442 30 odd samples
54 | #Marine Subseafloor microbes at Mid-Cayman Rise: MGYS00001282
55 |
56 |
57 | soil <- mgnify_analyses_from_studies(mg, "MGYS00001447")
58 | human <- mgnify_analyses_from_studies(mg, "MGYS00001442")
59 | seafloor <- mgnify_analyses_from_studies(mg, "MGYS00001282")
60 | seafloor <- sample(seafloor, 40)
61 |
62 | accessions <- c(soil,human,seafloor)
63 |
64 | metadata <- mgnify_get_analyses_metadata(mg, accessions)
65 | head(metadata)
66 |
67 | goterms <- mgnify_get_analyses_results(mg, accessions, retrievelist = "go-slim")$`go-slim`
68 |
69 |
70 | m <- goterms[,c(-1,-2,-3)]
71 |
72 | normed_m <- apply(m, 2, function(x) x/sum(x))
73 |
74 | nmds <- vegan::metaMDS(t(normed_m))
75 |
76 | pltdat <- as.data.frame(scores(nmds)) # Using the scores function from vegan to extract the site scores and convert to a data.frame
77 | pltdat$grp <- metadata[rownames(pltdat),"study_accession"] # add the grp variable created earlier
78 |
79 | ggplot() + geom_point(data=pltdat,aes(x=NMDS1,y=NMDS2,colour=grp),size=3) + theme_bw()
80 |
81 |
82 |
83 |
84 |
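85 | # Note: the mgnify_* calls above use the deprecated interface (see
86 | # ?MGnifyR::deprecate). A rough, untested sketch of the same workflow with the
87 | # current API is kept here as comments; the argument choices are illustrative.
88 | #
89 | #   mg <- MgnifyClient(useCache = TRUE, cacheDir = "/tmp/MGnify_demo")
90 | #   analyses <- searchAnalysis(mg, "studies", "MGYS00002277")
91 | #   metadata <- getMetadata(mg, analyses)
92 | #   mae <- getResult(mg, analyses, get.taxa = TRUE, get.func = TRUE)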
--------------------------------------------------------------------------------
/man/getData.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/AllGenerics.R, R/getData.R
3 | \name{getData}
4 | \alias{getData}
5 | \alias{getData,MgnifyClient-method}
6 | \title{Versatile function to retrieve raw results}
7 | \usage{
8 | getData(x, ...)
9 |
10 | \S4method{getData}{MgnifyClient}(x, type, accession.type = NULL, accession = NULL, as.df = TRUE, ...)
11 | }
12 | \arguments{
13 | \item{x}{A \code{MgnifyClient} object.}
14 |
15 | \item{...}{optional arguments fed to internal functions.}
16 |
17 | \item{type}{A single character value specifying the type of data to retrieve.
18 | Must be one of the following options: \code{studies}, \code{samples},
19 | \code{runs}, \code{analyses}, \code{biomes}, \code{assemblies},
20 | \code{super-studies}, \code{experiment-types}, \code{pipelines},
21 | \code{pipeline-tools}, \code{publications}, \code{genomes},
22 | \code{genome-search}, \code{genome-search/gather}, \code{genome-catalogues},
23 | \code{genomeset}, \code{cogs}, \code{kegg-modules}, \code{kegg-classes},
24 | \code{antismash-geneclusters}, \code{annotations/go-terms},
25 | \code{annotations/interpro-identifiers}, \code{annotations/kegg-modules},
26 | \code{annotations/pfam-entries}, \code{annotations/kegg-orthologs},
27 | \code{annotations/genome-properties},
28 | \code{annotations/antismash-gene-clusters}, \code{annotations/organisms}, or
29 | \code{mydata}.}
30 |
31 | \item{accession.type}{A single character value specifying type of accession
32 | IDs (\code{accession}). Must be specified when \code{accession} is specified.
33 | (By default: \code{accession.type = NULL})}
34 |
35 | \item{accession}{A single character value or a vector of character values
36 | specifying accession IDs to return results for.
37 | (By default: \code{accession = NULL})}
38 |
39 | \item{as.df}{A single boolean value specifying whether to return the
40 | results as a data.frame or leave as a nested list.
41 | (By default: \code{as.df = TRUE})}
42 | }
43 | \value{
44 | \code{data.frame} or \code{list}
45 | }
46 | \description{
47 | Versatile function to retrieve raw results
48 | }
49 | \details{
50 | This function returns data from the MGnify database. Compared to
51 | \code{getResult}, it provides a more flexible framework for fetching
52 | the data. The drawback is that, for count data, \code{getResult}
53 | returns an optimally structured data container that is easier to use in
54 | downstream analysis, whereas \code{getData} returns the raw data from the
55 | database. However, for data on pipelines or publications, for instance,
56 | \code{getResult} is not suitable, and \code{getData} can be used
57 | instead.
58 | }
59 | \examples{
60 | # Create a client object
61 | mg <- MgnifyClient(useCache = FALSE)
62 |
63 | # Find kegg modules for certain analysis
64 | df <- getData(
65 | mg, type = "kegg-modules",
66 | accession = "MGYA00642773", accession.type = "analyses")
67 |
68 | }
69 | \seealso{
70 | \code{\link[MGnifyR:getResult]{getResult}}
71 | }
72 |
--------------------------------------------------------------------------------
/tests/testthat/test-getResult.R:
--------------------------------------------------------------------------------
1 | context("getResult")
2 | test_that("getResult", {
3 |     # Test that input check catches wrong arguments.
4 | mg <- MgnifyClient(useCache = FALSE)
5 |
6 | expect_error(getResult(1))
7 | expect_error(getResult("test"))
8 | expect_error(getResult(TRUE))
9 |
10 |     expect_error(getResult(mg, accession = 1))
11 |     expect_error(getResult(mg, accession = TRUE))
12 |     expect_error(getResult(mg, accession = NULL))
13 |
14 | expect_error(getResult(mg, accession = "test", output = "test"))
15 | expect_error(getResult(mg, accession = "test", output = TRUE))
16 | expect_error(getResult(mg, accession = "test", output = 1))
17 | expect_error(getResult(mg, accession = "test", output = c("TreeSE", "phyloseq")))
18 | expect_error(getResult(mg, accession = "test", output = NULL))
19 |
20 | expect_error(getResult(mg, accession = "test", get.taxa = NULL))
21 | expect_error(getResult(mg, accession = "test", get.taxa = 1))
22 | expect_error(getResult(mg, accession = "test", get.taxa = c(TRUE, TRUE)))
23 | expect_error(getResult(mg, accession = "test", get.taxa = "test"))
24 |
25 | expect_error(getResult(mg, accession = "test", get.func = NULL))
26 | expect_error(getResult(mg, accession = "test", get.func = 1))
27 | expect_error(getResult(mg, accession = "test", get.func = c(TRUE, TRUE)))
28 | expect_error(getResult(mg, accession = "test", get.func = "test"))
29 |
30 | # Require internet access
31 | skip_if(httr::http_error("https://www.ebi.ac.uk/metagenomics/api/v1"))
32 |
33 | # # To reduce the time used to build the package, these tests are commented
34 | # # Test that only functional data is fetched based on certain accession ID.
35 | # # Get data as list of data.frames
36 | # res <- getResult(
37 | # mg, "MGYA00097621", get.taxa = FALSE, output = "list",
38 | # get.func = TRUE, verbose = FALSE)
39 | # expect_true(is.list(res))
40 | # expect_true("go-terms" %in% names(res))
41 | # expect_true(is.character(res$`interpro-identifiers`$analysis) &&
42 | # is.character(res$`interpro-identifiers`$description) &&
43 | # is.numeric(res$`interpro-identifiers`$count))
44 |
45 | # Test that microbial profiling data and functional data is fetched. Get
46 |     # data as MAE. Fetch also trees. Check that all data is in the correct place
47 | # and is correct.
48 | res <- getResult(mg, "MGYA00097621", get.func = TRUE, verbose = FALSE)
49 | expect_is(res, "MultiAssayExperiment")
50 | expect_is(res[[1]], "TreeSummarizedExperiment")
51 | expect_true(!is.null(rowTree(res[["microbiota"]])))
52 | expect_true(is.matrix(assay(res[[1]])))
53 | expect_true("microbiota" %in% names(res) &&
54 | "go-terms" %in% names(res))
55 | expect_true(is.matrix(assay(res[[2]])))
56 | expect_true(is.matrix(assay(res[[3]])))
57 | expect_equal(assay(res[["go-slim"]])["GO:1990204", 1][[1]], 929)
58 | expect_equal(colnames(res[[1]]), colnames(res[[2]]))
59 | expect_equal(colnames(res[[3]]), colnames(res[[2]]))
60 | })
61 |
--------------------------------------------------------------------------------
/tests/testthat/test-getFile.R:
--------------------------------------------------------------------------------
1 | context("getFile")
2 | test_that("getFile", {
3 |     # Test that input check catches wrong arguments.
4 | mg <- MgnifyClient(useCache = FALSE)
5 |
6 | expect_error(getFile(10))
7 | expect_error(getFile(TRUE))
8 | expect_error(getFile(NULL))
9 |
10 | expect_error(getFile(mg, url = 10))
11 | expect_error(getFile(mg, url = TRUE))
12 | expect_error(getFile(mg, url = c("test", "test")))
13 |
14 | expect_error(getFile(mg, url = "test", read.func = 10))
15 | expect_error(getFile(mg, url = "test", read.func = TRUE))
16 |
17 | expect_error(getFile(mg, url = "test", use.cache = 10))
18 | expect_error(getFile(mg, url = "test", use.cache = TRUE))
19 | expect_error(getFile(mg, url = "test", use.cache = c("test", "test")))
20 |
21 | expect_error(getFile(mg, url = "taxonomy--ssu", use.cache = 10))
22 |     expect_error(getFile(mg, url = "test", use.cache = "test"))
23 |
24 | expect_error(searchFile(10))
25 | expect_error(searchFile(TRUE))
26 | expect_error(searchFile(NULL))
27 |
28 | expect_error(searchFile(mg, accession = TRUE))
29 | expect_error(searchFile(mg, accession = 1))
30 | expect_error(searchFile(mg, accession = NULL))
31 |
32 | expect_error(searchFile(mg, accession = "test", type = 1))
33 | expect_error(searchFile(mg, accession = "test", type = TRUE))
34 |     expect_error(searchFile(mg, accession = "test", type = c("samples", "analyses")))
35 |
36 | expect_error(searchFile(mg, accession = "test", type = "samples", use.cache = NULL))
37 | expect_error(searchFile(mg, accession = "test", type = "samples", use.cache = 1))
38 | expect_error(searchFile( mg, accession = "test", type = "samples", use.cache = c(TRUE, FALSE)))
39 |
40 | expect_error(searchFile(mg, accession = "test", type = "samples", show.messages = NULL))
41 | expect_error(searchFile(mg, accession = "test", type = "samples", show.messages = 1))
42 | expect_error(searchFile( mg, accession = "test", type = "samples", show.messages = c(TRUE, FALSE)))
43 |
44 | # Require internet access
45 | skip_if(httr::http_error("https://www.ebi.ac.uk/metagenomics/api/v1"))
46 |
47 | # Expect error because url is incorrect
48 | expect_error(getFile(mg, url = "test"))
49 |
50 | # Test that df is returned even if accession ID is not correct
51 | expect_warning(
52 | res <- searchFile(mg, type = "assemblies", accession = "random")
53 | )
54 | expect_true(is.data.frame(res))
55 |
56 | # Test that file search is done correctly based on accession ID.
57 | # Use studies as type
58 | res <- searchFile(mg, type = "studies", accession = "MGYS00005292", show.messages = FALSE)
59 | expect_true(all(res$type == "studies"))
60 | expect_true(is.data.frame(res))
61 | expect_true(grepl("https", res$download_url[1]))
62 |
63 | # # To reduce the time used to build the package, these tests are commented
64 | # # Test that correct file is fetched based on provided url.
65 | # res <- getFile(mg, res$download_url[1])
66 | # # Result is stored in a path which is returned
67 | # expect_true(file.exists(res))
68 | })
69 |
--------------------------------------------------------------------------------
/man/deprecate.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/deprecate.R
3 | \name{deprecate}
4 | \alias{deprecate}
5 | \alias{mgnify_client}
6 | \alias{mgnify_query}
7 | \alias{mgnify_analyses_from_samples}
8 | \alias{mgnify_analyses_from_studies}
9 | \alias{mgnify_get_download_urls}
10 | \alias{mgnify_download}
11 | \alias{mgnify_get_analyses_results}
12 | \alias{mgnify_get_analyses_phyloseq}
13 | \alias{mgnify_get_analyses_metadata}
14 | \alias{mgnify_retrieve_json}
15 | \title{These functions will be deprecated. Please use other functions instead.}
16 | \usage{
17 | mgnify_client(
18 | username = NULL,
19 | password = NULL,
20 | usecache = FALSE,
21 | cache_dir = NULL,
22 | warnings = FALSE,
23 | use_memcache = FALSE,
24 | ...
25 | )
26 |
27 | mgnify_query(
28 | client,
29 | qtype = "samples",
30 | accession = NULL,
31 | asDataFrame = TRUE,
32 | maxhits = 200,
33 | usecache = FALSE,
34 | ...
35 | )
36 |
37 | mgnify_analyses_from_samples(client, accession, usecache = TRUE, ...)
38 |
39 | mgnify_analyses_from_studies(client, accession, usecache = TRUE, ...)
40 |
41 | mgnify_get_download_urls(
42 | client,
43 | accessions,
44 | accession_type,
45 | usecache = TRUE,
46 | ...
47 | )
48 |
49 | mgnify_download(
50 | client,
51 | url,
52 | file = NULL,
53 | read_func = NULL,
54 | usecache = TRUE,
55 | Debug = FALSE,
56 | ...
57 | )
58 |
59 | mgnify_get_analyses_results(
60 | client = NULL,
61 | accessions,
62 | retrievelist = c(),
63 | compact_results = TRUE,
64 | usecache = TRUE,
65 | bulk_dl = FALSE,
66 | ...
67 | )
68 |
69 | mgnify_get_analyses_phyloseq(
70 | client = NULL,
71 | accessions,
72 | usecache = TRUE,
73 | returnLists = FALSE,
74 | tax_SU = "SSU",
75 | get_tree = FALSE,
76 | ...
77 | )
78 |
79 | mgnify_get_analyses_metadata(client, accessions, usecache = TRUE, ...)
80 |
81 | mgnify_retrieve_json(
82 | client,
83 | path = "biomes",
84 | complete_url = NULL,
85 | qopts = NULL,
86 | maxhits = 200,
87 | usecache = FALSE,
88 | Debug = FALSE,
89 | ...
90 | )
91 | }
92 | \arguments{
93 | \item{username}{-}
94 |
95 | \item{password}{-}
96 |
97 | \item{usecache}{-}
98 |
99 | \item{cache_dir}{-}
100 |
101 | \item{warnings}{-}
102 |
103 | \item{use_memcache}{-}
104 |
105 | \item{...}{-}
106 |
107 | \item{client}{-}
108 |
109 | \item{qtype}{-}
110 |
111 | \item{accession}{-}
112 |
113 | \item{asDataFrame}{-}
114 |
115 | \item{maxhits}{-}
116 |
117 | \item{accessions}{-}
118 |
119 | \item{accession_type}{-}
120 |
121 | \item{url}{-}
122 |
123 | \item{file}{-}
124 |
125 | \item{read_func}{-}
126 |
127 | \item{Debug}{-}
128 |
129 | \item{retrievelist}{-}
130 |
131 | \item{compact_results}{-}
132 |
133 | \item{bulk_dl}{-}
134 |
135 | \item{returnLists}{-}
136 |
137 | \item{tax_SU}{-}
138 |
139 | \item{get_tree}{-}
140 |
141 | \item{path}{-}
142 |
143 | \item{complete_url}{-}
144 |
145 | \item{qopts}{-}
146 | }
147 | \value{
148 | -
149 | }
150 | \description{
151 | These functions will be deprecated. Please use other functions instead.
152 | }
153 |
--------------------------------------------------------------------------------
/R/AllGenerics.R:
--------------------------------------------------------------------------------
1 | # All generic methods are listed here
2 |
3 | #' @rdname MgnifyClient-accessors
4 | #' @export
5 | setGeneric(
6 | "databaseUrl", signature = c("x"), function(x)
7 | standardGeneric("databaseUrl"))
8 |
9 | #' @rdname MgnifyClient-accessors
10 | #' @export
11 | setGeneric(
12 | "authTok", signature = c("x"), function(x) standardGeneric("authTok"))
13 |
14 | #' @rdname MgnifyClient-accessors
15 | #' @export
16 | setGeneric(
17 | "useCache", signature = c("x"), function(x) standardGeneric("useCache"))
18 |
19 | #' @rdname MgnifyClient-accessors
20 | #' @export
21 | setGeneric(
22 | "cacheDir", signature = c("x"), function(x) standardGeneric("cacheDir"))
23 |
24 | #' @rdname MgnifyClient-accessors
25 | #' @export
26 | setGeneric(
27 | "showWarnings", signature = c("x"), function(x)
28 | standardGeneric("showWarnings"))
29 |
30 | #' @rdname MgnifyClient-accessors
31 | #' @export
32 | setGeneric(
33 | "clearCache", signature = c("x"), function(x) standardGeneric("clearCache"))
34 |
35 | #' @rdname MgnifyClient-accessors
36 | #' @export
37 | setGeneric(
38 | "verbose", signature = c("x"), function(x) standardGeneric("verbose"))
39 |
40 | #' @rdname MgnifyClient-accessors
41 | #' @export
42 | setGeneric(
43 | "databaseUrl<-", signature = c("x"), function(x, value)
44 | standardGeneric("databaseUrl<-"))
45 |
46 | #' @rdname MgnifyClient-accessors
47 | #' @export
48 | setGeneric(
49 | "authTok<-", signature = c("x"), function(x, value)
50 | standardGeneric("authTok<-"))
51 |
52 | #' @rdname MgnifyClient-accessors
53 | #' @export
54 | setGeneric(
55 | "useCache<-", signature = c("x"), function(x, value)
56 | standardGeneric("useCache<-"))
57 |
58 | #' @rdname MgnifyClient-accessors
59 | #' @export
60 | setGeneric(
61 | "cacheDir<-", signature = c("x"), function(x, value)
62 | standardGeneric("cacheDir<-"))
63 |
64 | #' @rdname MgnifyClient-accessors
65 | #' @export
66 | setGeneric(
67 | "showWarnings<-", signature = c("x"), function(x, value)
68 | standardGeneric("showWarnings<-"))
69 |
70 | #' @rdname MgnifyClient-accessors
71 | #' @export
72 | setGeneric(
73 | "clearCache<-", signature = c("x"), function(x, value)
74 | standardGeneric("clearCache<-"))
75 |
76 | #' @rdname MgnifyClient-accessors
77 | #' @export
78 | setGeneric(
79 | "verbose<-", signature = c("x"), function(x, value)
80 | standardGeneric("verbose<-"))
81 |
82 | #' @rdname doQuery
83 | #' @export
84 | setGeneric(
85 | "doQuery", signature = c("x"), function(x, ...) standardGeneric("doQuery"))
86 |
87 | #' @rdname getFile
88 | #' @export
89 | setGeneric(
90 | "getFile", signature = c("x"), function(x, ...) standardGeneric("getFile"))
91 |
92 | #' @rdname getFile
93 | #' @export
94 | setGeneric(
95 | "searchFile", signature = c("x"), function(x, ...)
96 | standardGeneric("searchFile"))
97 |
98 | #' @rdname getMetadata
99 | #' @export
100 | setGeneric(
101 | "getMetadata", signature = c("x"), function(x, ...)
102 | standardGeneric("getMetadata"))
103 |
104 | #' @rdname getResult
105 | #' @export
106 | setGeneric(
107 | "getResult", signature = c("x"), function(x, ...)
108 | standardGeneric("getResult"))
109 |
110 | #' @rdname getData
111 | #' @export
112 | setGeneric(
113 | "getData", signature = c("x"), function(x, ...)
114 | standardGeneric("getData"))
115 |
116 | #' @rdname searchAnalysis
117 | #' @export
118 | setGeneric(
119 | "searchAnalysis", signature = c("x"), function(x, ...)
120 | standardGeneric("searchAnalysis"))
121 |
--------------------------------------------------------------------------------
/man/MgnifyClient.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/MgnifyClient.R, R/AllClasses.R
3 | \docType{class}
4 | \name{MgnifyClient}
5 | \alias{MgnifyClient}
6 | \alias{MgnifyClient-class}
7 | \title{Constructor for creating a MgnifyClient object to allow access to the
8 | MGnify database.}
9 | \usage{
10 | MgnifyClient(
11 | username = NULL,
12 | password = NULL,
13 | useCache = FALSE,
14 | cacheDir = tempdir(),
15 | showWarnings = FALSE,
16 | verbose = TRUE,
17 | clearCache = FALSE,
18 | ...
19 | )
20 | }
21 | \arguments{
22 | \item{username}{A single character value specifying an optional username for
23 | authentication. (By default: \code{username = NULL})}
24 |
25 | \item{password}{A single character value specifying an optional password for
26 | authentication. (By default: \code{password = NULL})}
27 |
28 | \item{useCache}{A single boolean value specifying whether to enable on-disk
29 | caching of results during this session. In most use cases should be TRUE.
30 | (By default: \code{useCache = FALSE})}
31 |
32 | \item{cacheDir}{A single character value specifying a folder to contain the
33 | local cache. Note that cached files are persistent, so the cache directory
34 | may be reused between sessions, taking advantage of previously downloaded
35 | results. The directory will be created if it doesn't exist already.
36 | (By default: \code{cacheDir = tempdir()})}
37 |
38 | \item{showWarnings}{A single boolean value specifying whether to print
39 | warnings during invocation of some MGnifyR functions.
40 | (By default: \code{showWarnings = FALSE})}
41 |
42 | \item{verbose}{A single boolean value specifying whether to print extra
43 | output during invocation of some MGnifyR functions.
44 | (By default: \code{verbose = TRUE})}
45 |
46 | \item{clearCache}{A single boolean value specifying whether to clear the
47 | cache. (By default: \code{clearCache = FALSE})}
48 |
49 | \item{...}{optional arguments:
50 | \itemize{
51 | \item \strong{url} A single character value specifying an url address of
52 | the database. (By default:
53 | \code{url = "https://www.ebi.ac.uk/metagenomics/api/v1"})
54 | }}
55 | }
56 | \value{
57 | A MgnifyClient object.
58 | }
59 | \description{
60 | Constructor for creating a MgnifyClient object to allow access to the
61 | MGnify database.
62 |
63 | A MgnifyClient object represents the connection to the MGnify database.
64 | }
65 | \details{
66 | All functions in the MGnifyR package take a \code{MgnifyClient} object as
67 | their first argument. The object allows the simple handling of both user
68 | authentication and access to private data, and manages general options for
69 | querying the MGnify database.
70 |
71 | An object that is required by functions of the MGnifyR package.
72 | }
73 | \section{Slots}{
74 |
75 | \describe{
76 | \item{\code{databaseUrl}}{A single character value specifying the URL address of
77 | the database.}
78 |
79 | \item{\code{authTok}}{A single character value specifying authentication token.}
80 |
81 | \item{\code{useCache}}{A single boolean value specifying whether to use cache.}
82 |
83 | \item{\code{cacheDir}}{A single character value specifying cache directory.}
84 |
85 | \item{\code{showWarnings}}{A single boolean value specifying whether to show
86 | warnings.}
87 |
88 | \item{\code{clearCache}}{A single boolean value specifying whether to clear cache.}
89 |
90 | \item{\code{verbose}}{A single boolean value specifying whether to show messages.}
91 | }}
92 |
93 | \section{Constructor}{
94 |
95 | See \code{\link{MgnifyClient}} for constructor.
96 | }
97 |
98 | \section{Accessor}{
99 |
100 | See \code{\link{MgnifyClient-accessors}} for accessor functions.
101 | }
102 |
103 | \examples{
104 | my_client <- MgnifyClient(
105 | useCache = TRUE, cacheDir = "/scratch/MGnify_cache_location"
106 | )
107 |
108 | \dontrun{
109 | # Use username and password to get access to non-public data
110 | my_client <- MgnifyClient(
111 | username = "Webin-1122334", password = "SecretPassword",
112 | useCache = TRUE, cacheDir = "/scratch/MGnify_cache_location"
113 | )
114 | }
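115 |
116 | # Accessor functions (see the MgnifyClient-accessors help page) can be used
117 | # to inspect or adjust the client afterwards; a minimal sketch:
118 | databaseUrl(my_client)
119 | verbose(my_client) <- FALSE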
115 |
116 | }
117 |
--------------------------------------------------------------------------------
/R/AllAccessors.R:
--------------------------------------------------------------------------------
1 | #' MgnifyClient accessors and mutators
2 | #'
3 | #' @details
4 | #' These functions are for fetching and mutating slots of
5 | #' \code{MgnifyClient} object.
6 | #'
7 | #' @param x A \code{MgnifyClient} object.
8 | #'
9 | #' @param value A value to be added to a certain slot.
10 | #'
11 | #' @return A slot value or a \code{MgnifyClient} object with a modified slot.
12 | #'
13 | #' @examples
14 | #' mg <- MgnifyClient()
15 | #'
16 | #' databaseUrl(mg)
17 | #' showWarnings(mg) <- FALSE
18 | #'
19 | #' @name MgnifyClient-accessors
20 | NULL
21 |
22 | #' @rdname MgnifyClient-accessors
23 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R
24 | #' @export
25 | setMethod(
26 | "databaseUrl", signature = c(x = "MgnifyClient"),
27 | function(x){ x@databaseUrl })
28 |
29 | #' @rdname MgnifyClient-accessors
30 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R
31 | #' @export
32 | setMethod(
33 | "authTok", signature = c(x = "MgnifyClient"), function(x){ x@authTok })
34 |
35 | #' @rdname MgnifyClient-accessors
36 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R
37 | #' @export
38 | setMethod(
39 | "useCache", signature = c(x = "MgnifyClient"),
40 | function(x){ x@useCache })
41 |
42 | #' @rdname MgnifyClient-accessors
43 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R
44 | #' @export
45 | setMethod(
46 | "cacheDir", signature = c(x = "MgnifyClient"), function(x){ x@cacheDir })
47 |
48 | #' @rdname MgnifyClient-accessors
49 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R
50 | #' @export
51 | setMethod(
52 | "showWarnings", signature = c(x = "MgnifyClient"),
53 | function(x){ x@showWarnings })
54 |
55 | #' @rdname MgnifyClient-accessors
56 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R
57 | #' @export
58 | setMethod(
59 | "clearCache", signature = c(x = "MgnifyClient"),
60 | function(x){ x@clearCache })
61 |
62 | #' @rdname MgnifyClient-accessors
63 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R
64 | #' @export
65 | setMethod(
66 | "verbose", signature = c(x = "MgnifyClient"),
67 | function(x){ x@verbose })
68 |
69 | #' @rdname MgnifyClient-accessors
70 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R
71 | #' @export
72 | setMethod(
73 | "databaseUrl<-", signature = c(x = "MgnifyClient"),
74 | function(x, value){ BiocGenerics:::replaceSlots(x, databaseUrl = value) })
75 |
76 | #' @rdname MgnifyClient-accessors
77 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R
78 | #' @export
79 | setMethod(
80 | "authTok<-", signature = c(x = "MgnifyClient"),
81 | function(x, value){ BiocGenerics:::replaceSlots(x, authTok = value) })
82 |
83 | #' @rdname MgnifyClient-accessors
84 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R
85 | #' @export
86 | setMethod(
87 | "useCache<-", signature = c(x = "MgnifyClient"),
88 | function(x, value){ BiocGenerics:::replaceSlots(x, useCache = value) })
89 |
90 | #' @rdname MgnifyClient-accessors
91 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R
92 | #' @export
93 | setMethod(
94 | "cacheDir<-", signature = c(x = "MgnifyClient"),
95 | function(x, value){ BiocGenerics:::replaceSlots(x, cacheDir = value) })
96 |
97 | #' @rdname MgnifyClient-accessors
98 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R
99 | #' @export
100 | setMethod(
101 | "showWarnings<-", signature = c(x = "MgnifyClient"),
102 | function(x, value){ BiocGenerics:::replaceSlots(x, showWarnings = value) })
103 |
104 | #' @rdname MgnifyClient-accessors
105 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R
106 | #' @export
107 | setMethod(
108 | "clearCache<-", signature = c(x = "MgnifyClient"),
109 | function(x, value){ BiocGenerics:::replaceSlots(x, clearCache = value) })
110 |
111 | #' @rdname MgnifyClient-accessors
112 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R
113 | #' @export
114 | setMethod(
115 | "verbose<-", signature = c(x = "MgnifyClient"),
116 | function(x, value){ BiocGenerics:::replaceSlots(x, verbose = value) })
117 |
--------------------------------------------------------------------------------
/man/getFile.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/AllGenerics.R, R/getFile.R
3 | \name{getFile}
4 | \alias{getFile}
5 | \alias{searchFile}
6 | \alias{getFile,MgnifyClient-method}
7 | \alias{searchFile,MgnifyClient-method}
8 | \title{Download any MGnify files, including processed reads and
9 | identified protein sequences}
10 | \usage{
11 | getFile(x, ...)
12 |
13 | searchFile(x, ...)
14 |
15 | \S4method{getFile}{MgnifyClient}(x, url, file = NULL, read.func = NULL, ...)
16 |
17 | \S4method{searchFile}{MgnifyClient}(
18 | x,
19 | accession,
20 | type = c("studies", "samples", "analyses", "assemblies", "genomes", "run"),
21 | ...
22 | )
23 | }
24 | \arguments{
25 | \item{x}{A \code{MgnifyClient} object.}
26 |
27 | \item{...}{Additional arguments; not used currently.}
28 |
29 | \item{url}{A single character value specifying the url address of the file
30 | we wish to download.}
31 |
32 | \item{file}{A single character value or NULL specifying an
33 | optional local filename to use for saving the file. If \code{NULL},
34 | MGnify local cache settings will be used. If the file is intended to be
35 | processed in a separate program, it may be sensible to provide a
36 | meaningful \code{file}, rather than having to hunt through the
37 | cache folders. If \code{file} is \code{NULL} and \code{useCache(client)}
38 | is \code{FALSE}, the \code{read.func} parameter must be supplied or the
39 | file will be downloaded and then deleted.
40 | (By default: \code{file = NULL})}
41 |
42 | \item{read.func}{A function specifying an optional function to process the
43 | downloaded file and return the results, rather than relying on post
44 | processing. The primary use-case for this parameter is when local disk
45 | space is limited and downloaded files can be quickly processed and
46 | discarded. The function should take a single parameter, the downloaded
47 | filename, and may return any valid R object.
48 | (By default: \code{read.func = NULL})}
49 |
50 | \item{accession}{A single character value or a vector of character values
51 | specifying accession IDs to return results for.}
52 |
53 | \item{type}{A single character value specifying the type of objects to
54 | query. Must be one of the following options: \code{studies}, \code{samples},
55 | \code{analyses}, \code{assemblies}, \code{genomes} or \code{run}.
56 | (By default: \code{type = "studies"})}
57 | }
58 | \value{
59 | For \code{getFile()}, the local filename of the downloaded file, which is
60 | either the location in the MGnifyR cache or the supplied \code{file}. If
61 | \code{read.func} is used, its result will be returned.
62 |
63 | For \code{searchFile()}, a \code{data.frame} containing all discovered
64 | downloads. If multiple \code{accession} values are queried, the accession
65 | column may be used to filter the results; rownames are not set, since each
66 | query can return multiple items.
67 | }
68 | \description{
69 | Download any MGnify files, including processed reads and
70 | identified protein sequences
71 |
72 | Listing files available for download
73 | }
74 | \details{
75 | \code{getFile()} is a convenient wrapper around the generic URL
76 | downloading functionality in R, taking care of things like local
77 | caching and authentication.
78 |
79 | \code{searchFile()} is a wrapper function allowing easy enumeration of
80 | the downloads available for given accession IDs. It
81 | returns a single data.frame containing all available downloads and associated
82 | metadata, including the URL location and description. This can then be
83 | filtered to extract the URLs of interest, before actually
84 | retrieving the files using \code{getFile()}.
85 | }
86 | \examples{
87 | # Make a client object
88 | mg <- MgnifyClient(useCache = FALSE)
89 |
90 | # Create a vector of accession ids - these happen to be \code{analysis}
91 | # accessions
92 | accession_vect <- c("MGYA00563876", "MGYA00563877")
93 | downloads <- searchFile(mg, accession_vect, "analyses")
94 |
95 | # Filter to find the urls of 16S encoding sequences
96 | url_list <- downloads[
97 | downloads$attributes.description.label == "Contigs encoding SSU rRNA",
98 | "download_url"]
99 |
100 | # Example 1:
101 | # Download the first file
102 | supplied_filename <- getFile(
103 | mg, url_list[[1]], file="SSU_file.fasta.gz")
104 |
105 | \dontrun{
106 | # Example 2:
107 | # Just use local caching
108 | cached_filename <- getFile(mg, url_list[[2]])
109 |
110 | # Example 3:
111 | # Using read.func to open the reads with readDNAStringSet from
112 | # \code{biostrings}. Without retaining on disk
113 | dna_seqs <- getFile(
114 | mg, url_list[[3]], read.func = readDNAStringSet)
115 | }
116 |
117 | # Make a client object
118 | mg <- MgnifyClient(useCache = TRUE)
119 | # Create a vector of accession ids - these happen to be \code{analysis}
120 | # accessions
121 | accession_vect <- c(
122 | "MGYA00563876", "MGYA00563877", "MGYA00563878",
123 | "MGYA00563879", "MGYA00563880" )
124 | downloads <- searchFile(mg, accession_vect, "analyses")
125 |
126 | }
127 |
--------------------------------------------------------------------------------
/R/deprecate.R:
--------------------------------------------------------------------------------
1 | #' These functions will be deprecated. Please use other functions instead.
2 | #'
3 | #' @param url -
4 | #'
5 | #' @param username -
6 | #'
7 | #' @param password -
8 | #'
9 | #' @param usecache -
10 | #'
11 | #' @param cache_dir -
12 | #'
13 | #' @param warnings -
14 | #'
15 | #' @param use_memcache -
16 | #'
17 | #' @param client -
18 | #'
19 | #' @param qtype -
20 | #'
21 | #' @param accession -
22 | #'
23 | #' @param asDataFrame -
24 | #'
25 | #' @param maxhits -
26 | #'
27 | #' @param ... -
28 | #'
29 | #' @param accessions -
30 | #'
31 | #' @param accession_type -
32 | #'
33 | #' @param file -
34 | #'
35 | #' @param read_func -
36 | #'
37 | #' @param Debug -
38 | #'
39 | #' @param retrievelist -
40 | #'
41 | #' @param compact_results -
42 | #'
43 | #' @param bulk_dl -
44 | #'
45 | #' @param returnLists -
46 | #'
47 | #' @param tax_SU -
48 | #'
49 | #' @param get_tree -
50 | #'
51 | #' @param path -
52 | #'
53 | #' @param complete_url -
54 | #'
55 | #' @param qopts -
56 | #'
57 | #' @return -
58 | #'
59 | #' @name deprecate
60 | NULL
61 |
62 | #' @rdname deprecate
63 | #' @export
64 | mgnify_client <- function(
65 | username = NULL, password = NULL, usecache = FALSE,
66 | cache_dir = NULL, warnings = FALSE, use_memcache = FALSE, ...){
67 | .Deprecated("MgnifyClient")
 68 |     MgnifyClient(
 69 |         username = username, password = password, useCache = usecache,
 70 |         cacheDir = if( is.null(cache_dir) ) tempdir() else cache_dir,
 71 |         showWarnings = warnings, ...)
72 | }
73 |
74 | #' @rdname deprecate
75 | #' @export
76 | mgnify_query <- function(
77 | client, qtype = "samples", accession = NULL, asDataFrame = TRUE,
78 | maxhits = 200, usecache = FALSE, ...){
79 | .Deprecated("doQuery")
80 | doQuery(
81 | x = client, type = qtype, accession = accession,
 82 |         as.df = asDataFrame, max.hits = maxhits, use.cache = usecache, ...)
83 | }
84 |
85 | #' @rdname deprecate
86 | #' @export
87 | mgnify_analyses_from_samples <- function(
88 | client, accession, usecache = TRUE, ...){
89 | .Deprecated("searchAnalysis")
90 | searchAnalysis(
91 | x = client, type = "samples", accession = accession,
92 | use.cache = usecache, ...)
93 | }
94 |
95 | #' @rdname deprecate
96 | #' @export
97 | mgnify_analyses_from_studies <- function(
98 | client, accession, usecache = TRUE, ...){
99 | .Deprecated("searchAnalysis")
100 | searchAnalysis(
101 | x = client, type = "studies", accession = accession,
102 | use.cache = usecache, ...)
103 | }
104 |
105 | #' @rdname deprecate
106 | #' @export
107 | mgnify_get_download_urls <- function(
108 | client, accessions, accession_type, usecache = TRUE, ...){
109 | .Deprecated("searchFile")
110 | searchFile(
111 | x = client, accession = accessions, type = accession_type,
112 | use.cache = usecache, ...)
113 | }
114 |
115 | #' @rdname deprecate
116 | #' @export
117 | mgnify_download <- function(
118 | client, url, file = NULL, read_func = NULL, usecache = TRUE,
119 | Debug = FALSE, ...){
120 | .Deprecated("getFile")
121 | getFile(
122 | x = client, url = url, file = file,
123 | read.func = read_func, use.cache = usecache, ...)
124 | }
125 |
126 | #' @rdname deprecate
127 | #' @export
128 | mgnify_get_analyses_results <- function(
129 | client = NULL, accessions, retrievelist = c(), compact_results = TRUE,
130 | usecache = TRUE, bulk_dl = FALSE, ...){
131 | .Deprecated("getResult")
132 | if( length(retrievelist) == 0 ){
133 | retrievelist <- FALSE
134 | }
135 | getResult(
136 | x = client, accession = accessions, get.taxa = FALSE,
137 |         get.func = retrievelist, output = "list", use.cache = usecache,
138 | as.df = compact_results, ...)
139 | }
140 |
141 | #' @rdname deprecate
142 | #' @export
143 | mgnify_get_analyses_phyloseq <- function(
144 | client = NULL, accessions, usecache = TRUE, returnLists = FALSE,
145 | tax_SU = "SSU", get_tree = FALSE, ...){
146 | .Deprecated("getResult")
147 | output <- ifelse(returnLists, "list", "phyloseq")
148 | getResult(
149 | x = client, accession = accessions, get.taxa = TRUE, get.func = FALSE,
150 | output = output, use.cache = usecache, tax.su = tax_SU,
151 | get.tree = get_tree, ...
152 | )
153 | }
154 |
155 | #' @rdname deprecate
156 | #' @export
157 | mgnify_get_analyses_metadata <- function(
158 | client, accessions, usecache = TRUE, ...){
159 | .Deprecated("getMetadata")
160 |     getMetadata(x = client, accession = accessions, use.cache = usecache, ...)
161 | }
162 |
163 | #' @rdname deprecate
164 | #' @export
165 | mgnify_retrieve_json <- function(
166 | client, path = "biomes", complete_url = NULL, qopts = NULL,
167 | maxhits = 200, usecache = FALSE, Debug = FALSE, ...){
168 |     .Deprecated(msg = paste0(
169 |         "'mgnify_retrieve_json' is deprecated.\n",
170 |         "See other functions and use them instead.\nSee help('Deprecated')"))
171 | return(NULL)
172 | }
173 |
--------------------------------------------------------------------------------
/man/doQuery.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/AllGenerics.R, R/doQuery.R
3 | \name{doQuery}
4 | \alias{doQuery}
5 | \alias{doQuery,MgnifyClient-method}
6 | \title{Search MGnify database for studies, samples, runs, analyses, biomes,
7 | assemblies, and genomes.}
8 | \usage{
9 | doQuery(x, ...)
10 |
11 | \S4method{doQuery}{MgnifyClient}(
12 | x,
13 | type = "studies",
14 | accession = NULL,
15 | as.df = TRUE,
16 | max.hits = 200,
17 | ...
18 | )
19 | }
20 | \arguments{
21 | \item{x}{A \code{MgnifyClient} object.}
22 |
23 | \item{...}{Remaining parameter key/value pairs may be supplied to filter
24 | the returned values. Available options differ between \code{types}.
25 | See the Details section for more information.}
26 |
27 | \item{type}{A single character value specifying the type of objects to
28 | query. Must be one of the following options: \code{studies}, \code{samples},
29 | \code{runs}, \code{analyses}, \code{biomes}, \code{assemblies},
30 | \code{super-studies}, \code{experiment-types}, \code{pipelines},
31 | \code{pipeline-tools}, \code{publications}, \code{genomes},
32 | \code{genome-search}, \code{genome-search/gather}, \code{genome-catalogues},
33 | \code{genomeset}, \code{cogs}, \code{kegg-modules}, \code{kegg-classes},
34 | \code{antismash-geneclusters}, \code{annotations/go-terms},
35 | \code{annotations/interpro-identifiers}, \code{annotations/kegg-modules},
36 | \code{annotations/pfam-entries}, \code{annotations/kegg-orthologs},
37 | \code{annotations/genome-properties},
38 | \code{annotations/antismash-gene-clusters}, \code{annotations/organisms}, or
39 | \code{mydata}.
40 | (By default: \code{type = "studies"})}
41 |
42 | \item{accession}{A single character value or a vector of character values
43 | specifying MGnify accession identifiers (of type \code{type}) or NULL. When
44 | NULL, all results defined by other parameters are retrieved.
45 | (By default: \code{accession = NULL})}
46 |
47 | \item{as.df}{A single boolean value specifying whether to return the
48 | results as a data.frame or leave as a nested list. In most cases,
49 | \code{as.df = TRUE} will make the most sense.
50 | (By default: \code{as.df = TRUE})}
51 |
52 | \item{max.hits}{A single integer value specifying the maximum number of
53 | results to return, or NULL. The actual number of results may actually be
54 | higher than \code{max.hits}, as clipping only occurs on pagination page
55 | boundaries. To disable the limit, set \code{max.hits = NULL}.
56 | (By default: \code{max.hits = 200})}
57 | }
58 | \value{
59 | A nested list or data.frame containing the results of the query.
60 | }
61 | \description{
62 | Search MGnify database for studies, samples, runs, analyses, biomes,
63 | assemblies, and genomes.
64 | }
65 | \details{
66 | \code{doQuery} is a flexible query function, harnessing the "full"
67 | power of the JSONAPI MGnify search filters. Search results may be filtered
68 | by metadata value, associated study/sample/analysis, etc.
69 |
70 | See \href{https://www.ebi.ac.uk/metagenomics/api/v1/}{Api browser} for
71 | information on MGnify database filters.
72 | You can find help on customizing queries from
73 | \href{https://emg-docs.readthedocs.io/en/latest/api.html#customising-queries}{here}.
74 |
75 | For example the following filters are available:
76 | \itemize{
77 | \item{\strong{studies}: accession, biome_name, lineage, centre_name,
78 | include}
79 | \item{\strong{samples}: accession, experiment_type, biome_name,
80 | lineage, geo_loc_name, latitude_gte, latitude_lte,
81 | longitude_gte, longitude_lte, species, instrument_model,
82 | instrument_platform, metadata_key, metadata_value_gte,
83 | metadata_value_lte, metadata_value, environment_material,
84 | environment_feature, study_accession, include}
85 | \item{\strong{runs}: accession, experiment_type, biome_name, lineage,
86 |     species, instrument_platform, instrument_model, metadata_key,
87 | metadata_value_gte, metadata_value_lte, metadata_value, sample_accession,
88 | study_accession, include}
89 | \item{\strong{analyses}: biome_name, lineage, experiment_type, species,
90 | sample_accession, pipeline_version}
91 | \item{\strong{biomes}: depth_gte, depth_lte}
92 | \item{\strong{assemblies}: depth_gte, depth_lte}
93 | }
94 | Unfortunately it appears that in some cases some of these filters don't work
95 | as expected, so it is important to check that the returned results match
96 | what is expected. Even more unfortunately, if there is an error in the
97 | parameter specification, the query will run as if no filter parameters were
98 | present at all. Thus the result will appear superficially correct but will in
99 | fact correspond to something completely different. This behaviour will
100 | hopefully be fixed in future versions of MGnifyR or the JSONAPI, but for now
101 | users should double-check the returned values.
102 |
103 | It is currently not possible to combine queries of the same type in a single
104 | call (for example to search for samples \emph{between} two latitudes). However,
105 | it is possible to run multiple queries and combine the results using set
106 | operations in R to get the desired behaviour (see the example below).
107 | }
108 | \examples{
109 | mg <- MgnifyClient(useCache = FALSE)
110 |
111 | # Get a list of studies from the Agricultural wastewater biome:
112 | agwaste_studies <- doQuery(
113 | mg, "studies", biome_name="Agricultural wastewater"
114 | )
115 |
116 | \dontrun{
117 | # Get all samples from a particular study
118 | samps <- doQuery(mg, "samples", accession="MGYS00004521")
119 |
120 | # Search polar samples
121 | samps_np <- doQuery(mg, "samples", latitude_gte=66, max.hits=10)
122 | samps_sp <- doQuery(mg, "samples", latitude_lte=-66, max.hits=10)
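123 |
124 | # A single call cannot filter samples between two latitudes (see Details);
125 | # run separate queries and combine them with set operations instead.
126 | # A sketch, assuming the accession IDs are available as row names:
127 | polar_ids <- union(rownames(samps_np), rownames(samps_sp))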
123 |
124 | # Search studies that have studied drinking water
125 | tbl <- doQuery(
126 | mg,
127 | type = "studies",
128 | biome_name = "root:Environmental:Aquatic:Freshwater:Drinking water",
129 | max.hits = 10)
130 | }
131 |
132 | }
133 |
--------------------------------------------------------------------------------
/man/getResult.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/AllGenerics.R, R/getResult.R
3 | \name{getResult}
4 | \alias{getResult}
5 | \alias{getResult,MgnifyClient-method}
6 | \title{Get microbial and/or functional profiling data for a list of accessions}
7 | \usage{
8 | getResult(x, ...)
9 |
10 | \S4method{getResult}{MgnifyClient}(
11 | x,
12 | accession,
13 | get.taxa = TRUE,
14 | get.func = TRUE,
15 | output = "TreeSE",
16 | ...
17 | )
18 | }
19 | \arguments{
20 | \item{x}{A \code{MgnifyClient} object.}
21 |
22 | \item{...}{optional arguments:
23 | \itemize{
24 |
25 | \item \strong{taxa.su} A single character value specifying which taxa
26 | subunit results should be selected. Currently, taxonomy assignments in the
27 | MGnify pipelines rely on rRNA matches to existing databases
28 | (GreenGenes and SILVA), with later pipelines checking both the SSU and
29 | LSU portions of the rRNA sequence. \code{taxa.su} then allows the selection
30 | of either the Small subunit (\code{"SSU"}) or Large subunit (\code{"LSU"})
31 | results in the final \code{TreeSummarizedExperiment} object. Older pipeline
32 | versions do not report results for both subunits, and thus for some
33 | accessions this value will have no effect.
34 |
35 | \item \strong{get.tree} A single boolean value specifying whether to
36 | include available phylogenetic trees in the \code{TreeSummarizedExperiment}
37 | object. Available when \code{get.taxa = TRUE}.
38 | (By default: \code{get.tree = TRUE})
39 |
40 | \item \strong{as.df} A single boolean value enabled when
41 | \code{output = "list"}. The argument specifies whether to return functional
42 | data as a named list (one entry per element in the output list) of
43 | data.frames, with each data.frame containing results for all requested
44 | accessions. If \code{FALSE}, the function returns a list of lists, each
45 | element consisting of results for a single accession. (By default:
46 | \code{as.df = TRUE})
47 |
48 | \item \strong{bulk.dl} A single boolean value specifying whether
49 | MGnifyR should attempt to speed things up by downloading
50 | the relevant studies' TSV results and only extracting the required columns,
51 | rather than using the JSONAPI interface. When getting results where
52 | multiple accessions share the same study, this option may result in
53 | significantly faster processing. However, there appear to be (quite a few)
54 | cases in the database where the TSV result columns do NOT match the
55 | expected accession names. This will hopefully be fixed in the future,
56 | but for now \code{bulk.dl} defaults to TRUE. When it does work, it can
57 | be orders of magnitude more efficient.
58 | (By default: \code{bulk.dl = TRUE})
59 |
60 | }}
61 |
62 | \item{accession}{A single character value or a vector of character values
63 | specifying accession IDs to return results for.}
64 |
65 | \item{get.taxa}{A boolean value specifying whether to retrieve taxonomy
66 | data (OTU table). See \code{taxa.su} for specifying taxonomy type. The
67 | data is retrieved as BIOM files which are subsequently parsed.
68 | (By default: \code{get.taxa = TRUE})}
69 |
70 | \item{get.func}{A boolean value or a single character value or a vector
71 | character values specifying functional analysis types to retrieve. If
72 | \code{get.func = TRUE}, all available functional datatypes are retrieved,
73 | and if \code{FALSE}, functional data is not retrieved. The current list of
74 | available types is \code{"antismash-gene-clusters"}, \code{"go-slim"},
75 | \code{"go-terms"}, \code{"interpro-identifiers"}, \code{"taxonomy"},
76 | \code{"taxonomy-itsonedb"}, \code{"taxonomy-itsunite"}, \code{"taxonomy-lsu"},
77 | and \code{"taxonomy-ssu"}. Note that depending on the particular analysis
78 | type, pipeline version etc., not all functional results will be available.
79 | Furthermore, taxonomy is also available via \code{get.func}, and loading
80 | the data might be considerably faster if \code{bulk.dl = TRUE}. However,
81 | phylogeny is available only via \code{get.taxa}.
82 | (By default: \code{get.func = TRUE})}
83 |
84 | \item{output}{A single character value specifying the format of an output.
85 | Must be one of the following options: \code{"TreeSE"}, \code{"list"}, or
86 | \code{"phyloseq"}. (By default: \code{output = "TreeSE"})}
87 | }
88 | \value{
89 | If only taxonomy data is retrieved, the result is returned as a
90 | \code{TreeSummarizedExperiment} object by default. The result can also be
91 | returned as a \code{phyloseq} object or as a list of \code{data.frames}.
92 | Note that a \code{phyloseq} object can include only one phylogenetic tree,
93 | meaning that some taxa might be lost when the data is subsetted based on the tree.
94 |
95 | When functional data is retrieved in addition to taxonomy data, the result
96 | is returned as a \code{MultiAssayExperiment} object. Other options are a list
97 | containing \code{phyloseq} object and \code{data.frames} or just
98 | \code{data.frames}.
99 |
100 | Functional data can be returned as a \code{MultiAssayExperiment} object or
101 | as a list of \code{data.frames}.
102 | }
103 | \description{
104 | Get microbial and/or functional profiling data for a list of accessions
105 | }
106 | \details{
107 | Given a set of analysis accessions and a collection of annotation types,
108 | the function queries the MGnify API and returns the results. This function
109 | is convenient for retrieving highly structured (analysis vs counts) data;
110 | for example, BIOM files are downloaded and parsed automatically.
111 | If you just want to retrieve raw data from the database, see \code{getData}.
112 | }
113 | \examples{
114 | # Create a client object
115 | mg <- MgnifyClient(useCache = FALSE)
116 |
117 | # Get OTU tables as TreeSE
118 | accession_list <- c("MGYA00377505")
119 | tse <- getResult(mg, accession_list, get.func=FALSE, get.taxa=TRUE)
120 |
121 | \dontrun{
122 | # Get functional data along with OTU tables as MAE
123 | mae <- getResult(mg, accession_list, get.func=TRUE, get.taxa=TRUE)
124 |
125 | # Get same data as list
126 | list <- getResult(
127 | mg, accession_list, get.func=TRUE, get.taxa=TRUE, output = "list",
128 | as.df = TRUE, use.cache = TRUE)
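129 |
130 | # Optional arguments listed above are passed through '...'; a sketch that
131 | # selects the SSU taxonomy and skips the tree and the bulk TSV download:
132 | tse_ssu <- getResult(
133 |     mg, accession_list, get.taxa = TRUE, get.func = FALSE,
134 |     taxa.su = "SSU", get.tree = FALSE, bulk.dl = FALSE)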
129 | }
130 |
131 | }
132 | \seealso{
133 | \code{\link[MGnifyR:getData]{getData}}
134 | }
135 |
--------------------------------------------------------------------------------
/R/getMetadata.R:
--------------------------------------------------------------------------------
1 | #' Get all study, sample and analysis metadata for the supplied analysis
2 | #' accessions
3 | #'
4 | #' @details
5 | #' The function retrieves all study, sample and analysis metadata associated
6 | #' with provided analysis accessions.
7 | #'
8 | #' @param x A \code{MgnifyClient} object.
9 | #'
10 | #' @param accession A single character value or a vector of analysis accession
11 | #' IDs specifying accessions to retrieve data for.
12 | #'
13 | #' @param ... Optional arguments; not currently used.
14 | #'
15 | #' @return A \code{data.frame} containing metadata for each analysis in the
16 | #' \code{accession} list. Each row represents a single analysis.
17 | #'
18 | #' @examples
19 | #' # Create a client object
20 | #' mg <- MgnifyClient(useCache = FALSE)
21 | #'
22 | #' # Download all associated study/sample and analysis metadata
23 | #' accession_list <- c("MGYA00377505")
24 | #' meta_dataframe <- getMetadata(mg, accession_list)
25 | #'
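26 | #' \dontrun{
27 | #' # Analysis accessions are typically obtained with searchAnalysis(); a
28 | #' # sketch chaining the two steps for a single study:
29 | #' analyses <- searchAnalysis(mg, "studies", "MGYS00005058")
30 | #' meta <- getMetadata(mg, analyses)
31 | #' }
32 | #'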
26 | #' @name getMetadata
27 | NULL
28 |
29 | #' @rdname getMetadata
30 | #' @importFrom plyr llply
31 | #' @importFrom dplyr bind_rows
32 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R
33 | #' @export
34 | setMethod("getMetadata", signature = c(x = "MgnifyClient"), function(
35 | x, accession, ...){
36 | ############################### INPUT CHECK ################################
37 | if( !is.character(accession) ){
38 | stop(
39 |             "'accession' must be a single character value or a vector of ",
40 |             "character values.", call. = FALSE)
41 | }
42 | ############################# INPUT CHECK END ##############################
43 | # Get metadata
44 | result <- .mgnify_get_analyses_metadata(
45 | client = x, accession = accession, ...)
46 | return(result)
47 | })
48 |
49 | ################################ HELP FUNCTIONS ################################
50 |
51 | # Fetch metadata based on analysis accessions.
52 | .mgnify_get_analyses_metadata <- function(
53 | client, accession, use.cache = useCache(client),
54 | show.messages = verbose(client), ...){
55 | # Input check
56 | if( !.is_a_bool(use.cache) ){
57 | stop(
58 | "'use.cache' must be a single boolean value specifying whether to ",
59 |             "use the cache.", call. = FALSE)
60 | }
61 | if( !.is_a_bool(show.messages) ){
62 | stop(
63 | "'show.messages' must be a single boolean value.", call. = FALSE)
64 | }
65 | show.messages <- ifelse(show.messages, "text", "none")
66 | #
67 | # Give message about progress
68 | if( show.messages == "text" ){
69 | message("Fetching metadata...")
70 | }
71 | # Loop through analysis accessions and find metadata
72 | reslist <- llply(as.list(accession), function(x){
73 | .mgnify_get_single_analysis_metadata(
74 | client, x, use.cache = use.cache, ...)
75 | }, .progress = show.messages)
76 | # Combine all metadata to single df
77 | df <- do.call(bind_rows, reslist)
78 | return(df)
79 | }
80 |
81 | # Retrieves combined study/sample/analysis metadata - not exported
82 | .mgnify_get_single_analysis_metadata <- function(
83 | client, accession, use.cache = useCache(client), max.hits = NULL, ...){
84 | # Input check
85 | if( !.is_a_bool(use.cache) ){
86 | stop(
87 | "'use.cache' must be a single boolean value specifying whether to ",
88 |             "use the cache.", call. = FALSE)
89 | }
90 | #
91 | # Get data in json format
92 | dat <- .mgnify_retrieve_json(
93 | client, paste("analyses", accession, sep="/"), use.cache = use.cache,
94 | max.hits = max.hits, ...)
95 | # If metadata was not found, return the NULL value
96 | if(is.null(dat)){
97 | warning(
98 |             "\nFailed to find analysis metadata for ", accession, call. = FALSE)
99 | return(dat)
100 | }
101 |
102 | # There should be just a single result
103 | top_data <- dat[[1]]
104 | # Convert hit result to df
105 | analysis_df <- .mgnify_attr_list_to_df_row(
106 | top_data, metadata_key = "analysis-summary")
107 |
108 | # Build up the metadata dataframe from the analyses_metadata_headers vector:
109 | sample_met <- .mgnify_retrieve_json(
110 | client, complete_url = top_data$relationships$sample$links$related,
111 | use.cache = use.cache, ...)
112 | study_met <- .mgnify_retrieve_json(
113 | client, complete_url = top_data$relationships$study$links$related,
114 | use.cache = use.cache, ...)
115 | # Again, convert to df
116 | if(!is.null(sample_met)){
117 | sample_df <- .mgnify_attr_list_to_df_row(
118 | sample_met[[1]], metadata_key = "sample-metadata")
119 | } else{
120 | warning(
121 | "\nFailed to find sample metadata for ", accession, call. = FALSE)
122 | sample_df <- data.frame(accession=NA)
123 | }
124 | # It turns out that a sample might not be part of a study - if it's been
125 | # harvested...
126 | if(!is.null(study_met)){
127 | study_df <- .mgnify_attr_list_to_df_row(study_met[[1]])
128 | } else{
129 | warning(
130 | "\nFailed to find study metadata for ", accession, call. = FALSE)
131 | study_df <- data.frame(accession=NA)
132 | }
133 | # Add colnames to sample, study and analysis tables
134 | colnames(sample_df) <- paste("sample", colnames(sample_df), sep="_")
135 | colnames(study_df) <- paste("study", colnames(study_df), sep="_")
136 | colnames(analysis_df) <- paste("analysis", colnames(analysis_df), sep="_")
137 | # Add what analysis corresponds what sample and study
138 | rownames(sample_df) <- rownames(analysis_df)
139 | rownames(study_df) <- rownames(analysis_df)
140 | # Combine sample and study result
141 | full_df <- cbind(analysis_df, study_df, sample_df)
142 |
143 | # Extras - include some more metadata from various places
144 | # Assembly accession
145 | if("id" %in% names(top_data$relationships$assembly$data)){
146 | full_df$assembly_accession <- top_data$relationships$assembly$data$id
147 | }
148 | # Run accession
149 | if("id" %in% names(top_data$relationships$run$data)){
150 | full_df$run_accession <- top_data$relationships$run$data$id
151 | }
152 | # biom (from the sample metadata)
153 | if( !is.null(sample_met[[1]]$relationships$biome$data$id) ){
154 | full_df$biome_string <- sample_met[[1]]$relationships$biome$data$id
155 | } else {
156 |         warning("\nFailed to find biome entry for ", accession, call. = FALSE)
157 | }
158 | return(full_df)
159 | }
160 |
--------------------------------------------------------------------------------
/R/MgnifyClient.R:
--------------------------------------------------------------------------------
1 | #' Constructor for creating a MgnifyClient object to allow access to the
2 | #' MGnify database.
3 | #'
4 | #' @details
5 | #' All functions in the MGnifyR package take a \code{MgnifyClient} object as
6 | #' their first argument. The object allows the simple handling of both user
7 | #' authentication and access to private data, and manages general options for
8 | #' querying the MGnify database.
9 | #'
10 | #' @param username A single character value specifying an optional username for
11 | #' authentication. (By default: \code{username = NULL})
12 | #'
13 | #' @param password A single character value specifying an optional password for
14 | #' authentication. (By default: \code{password = NULL})
15 | #'
16 | #' @param useCache A single boolean value specifying whether to enable on-disk
17 | #' caching of results during this session. In most use cases should be TRUE.
18 | #' (By default: \code{useCache = FALSE})
19 | #'
20 | #' @param cacheDir A single character value specifying a folder to contain the
21 | #' local cache. Note that cached files are persistent, so the cache directory
22 | #' may be reused between sessions, taking advantage of previously downloaded
23 | #' results. The directory will be created if it doesn't exist already.
24 | #' (By default: \code{cacheDir = tempdir()})
25 | #'
26 | #' @param showWarnings A single boolean value specifying whether to print
27 | #' warnings during invocation of some MGnifyR functions.
28 | #' (By default: \code{showWarnings = FALSE})
29 | #'
30 | #' @param verbose A single boolean value specifying whether to print extra
31 | #' output during invocation of some MGnifyR functions.
32 | #' (By default: \code{verbose = TRUE})
33 | #'
34 | #' @param clearCache A single boolean value specifying whether to clear the
35 | #' cache. (By default: \code{clearCache = FALSE})
36 | #'
37 | #' @param ... optional arguments:
38 | #' \itemize{
39 | #' \item \strong{url} A single character value specifying an url address of
40 | #' the database. (By default:
41 | #' \code{url = "https://www.ebi.ac.uk/metagenomics/api/v1"})
42 | #' }
43 | #'
44 | #' @return A MgnifyClient object.
45 | #'
46 | #' @examples
47 | #' my_client <- MgnifyClient(
48 | #' useCache = TRUE, cacheDir = "/scratch/MGnify_cache_location"
49 | #' )
50 | #'
51 | #' \dontrun{
52 | #' # Use username and password to get access to non-public data
53 | #' my_client <- MgnifyClient(
54 | #' username = "Webin-1122334", password = "SecretPassword",
55 | #' useCache = TRUE, cacheDir = "/scratch/MGnify_cache_location"
56 | #' )
57 | #'}
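58 | #'
59 | #' # Accessor functions (see help("MgnifyClient-accessors")) can be used to
60 | #' # inspect or adjust the client afterwards; a minimal sketch:
61 | #' databaseUrl(my_client)
62 | #' verbose(my_client) <- FALSE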
58 | #'
59 | #' @name MgnifyClient
60 | NULL
61 |
62 | #' @rdname MgnifyClient
63 | #' @importFrom methods new
64 | #' @export
65 | MgnifyClient <- function(
66 | username = NULL, password = NULL, useCache = FALSE,
67 | cacheDir = tempdir(), showWarnings = FALSE, verbose = TRUE,
68 | clearCache = FALSE, ...){
69 | ############################### INPUT CHECK ################################
70 | if( !(is.null(username) || .is_non_empty_string(username)) ){
71 | stop(
72 | "'username' must be NULL or single character value specifying ",
73 | "the username.", call. = FALSE)
74 | }
75 | if( !(is.null(password) || .is_non_empty_string(password)) ){
76 | stop(
77 | "'password' must be NULL or single character value specifying ",
78 | "the password.", call. = FALSE)
79 | }
80 | if( !.is_a_bool(useCache) ){
81 | stop(
82 | "'useCache' must be a boolean value specifying whether to use ",
83 | "on-disk caching.", call. = FALSE)
84 | }
85 | if( !.is_non_empty_string(cacheDir) ){
86 | stop(
87 | "'cacheDir' must be single character value specifying ",
 88 |             "the directory for the cache.", call. = FALSE)
89 | }
90 | if( !.is_a_bool(showWarnings) ){
91 | stop(
92 | "'showWarnings' must be a boolean value specifying whether print ",
93 | "warnings during invocation of MGnifyR functions.",
94 | call. = FALSE)
95 | }
96 | if( !.is_a_bool(verbose) ){
97 | stop(
98 | "'verbose' must be a boolean value specifying whether print ",
99 | "extra output during invocation of MGnifyR functions.",
100 | call. = FALSE)
101 | }
102 | if( !.is_a_bool(clearCache) ){
103 | stop(
104 | "'clearCache' must be a boolean value specifying whether to ",
105 | "clear the cache.", call. = FALSE)
106 | }
107 | ############################# INPUT CHECK END ##############################
108 | # Get the url address
109 | url <- .get_url_address(...)
110 | # Authentication token is NA as default
111 | authtok <- NA_character_
112 | # Check to see if we're going to try and get an authentication token:
113 | if (!is.null(username) && !is.null(password)){
114 | # Fetch username vs password data from database
115 | r <- POST(
116 | paste(url, "utils/token/obtain", sep = "/"),
117 | body = list(username = username, password = password),
118 | encode = "json")
119 |         # If the authentication was not successful, the returned value does
120 |         # not include data
121 | cont <- content(r, ...)
122 | if ("data" %in% names(cont)){
123 | authtok <- cont$data$token
124 | } else{
125 | stop("Failed to authenticate.", call. = FALSE)
126 | }
127 | }
128 | # Get the directory where cache will be stored.
129 | # If user has specified the subdirectory, ensure that it works in any
130 | # system by adding correct "/".
131 | cacheDir <- as.list(strsplit(cacheDir, "[/\\\\]")[[1]])
132 | cacheDir <- do.call(file.path, cacheDir)
133 |     # Add a subdirectory. If the user specified e.g. the working directory,
134 |     # it would otherwise get cluttered with cache files.
135 | cacheDir <- file.path(cacheDir, ".MGnifyR_cache")
136 | # Make it if needed - assume the user is sensible and the path will
137 | # work...
138 | if( useCache ){
139 | dir.create(cacheDir, recursive = TRUE, showWarnings = FALSE)
140 | }
141 | # Return the final object
142 | obj <- new(
143 | "MgnifyClient",
144 | databaseUrl = url,
145 | authTok = authtok,
146 | useCache = useCache,
147 | cacheDir = cacheDir,
148 | showWarnings = showWarnings,
149 | clearCache = clearCache,
150 | verbose = verbose
151 | )
152 | return(obj)
153 | }
154 |
155 | ################################ HELP FUNCTIONS ################################
156 |
157 | # This function is just to remove url from main function's arguments.
158 | .get_url_address <- function(
159 | url = "https://www.ebi.ac.uk/metagenomics/api/v1", ...){
160 | ############################### INPUT CHECK ################################
161 | if( !(.is_non_empty_string(url)) ){
162 | stop(
163 | "'url' must be a single character value specifying ",
164 | "the URL address.", call. = FALSE)
165 | }
166 | ############################# INPUT CHECK END ##############################
167 | return(url)
168 | }
169 |
170 |
--------------------------------------------------------------------------------
/R/getData.R:
--------------------------------------------------------------------------------
1 | #' Versatile function to retrieve raw results
2 | #'
3 | #' @details
 4 | #' This function returns data from the MGnify database. Compared to
 5 | #' \code{getResult}, this function provides a more flexible framework for
 6 | #' fetching the data. However, there are drawbacks: for counts data,
 7 | #' \code{getResult} returns an optimally structured data container which is
 8 | #' easier to use in downstream analysis, while \code{getData} returns raw data
 9 | #' from the database. On the other hand, if you want to retrieve data on
10 | #' pipelines or publications, for instance, \code{getResult} is not suitable,
11 | #' and \code{getData} can be utilized instead.
12 | #'
13 | #' @param x A \code{MgnifyClient} object.
14 | #'
15 | #' @param type A single character value specifying the type of data to retrieve.
16 | #' Must be one of the following options: \code{studies}, \code{samples},
17 | #' \code{runs}, \code{analyses}, \code{biomes}, \code{assemblies},
18 | #' \code{super-studies}, \code{experiment-types}, \code{pipelines},
19 | #' \code{pipeline-tools}, \code{publications}, \code{genomes},
20 | #' \code{genome-search}, \code{genome-search/gather}, \code{genome-catalogues},
21 | #' \code{genomeset}, \code{cogs}, \code{kegg-modules}, \code{kegg-classes},
22 | #' \code{antismash-geneclusters}, \code{annotations/go-terms},
23 | #' \code{annotations/interpro-identifiers}, \code{annotations/kegg-modules},
24 | #' \code{annotations/pfam-entries}, \code{annotations/kegg-orthologs},
25 | #' \code{annotations/genome-properties},
26 | #' \code{annotations/antismash-gene-clusters}, \code{annotations/organisms}, or
27 | #' \code{mydata}.
28 | #'
29 | #' @param accession A single character value or a vector of character values
30 | #' specifying accession IDs to return results for.
31 | #' (By default: \code{accession = NULL})
32 | #'
33 | #' @param accession.type A single character value specifying type of accession
34 | #' IDs (\code{accession}). Must be specified when \code{accession} is specified.
35 | #' (By default: \code{accession.type = NULL})
36 | #'
37 | #' @param as.df A single boolean value specifying whether to return the
38 | #' results as a data.frame or leave as a nested list.
39 | #' (By default: \code{as.df = TRUE})
40 | #'
41 | #' @param ... optional arguments fed to internal functions.
42 | #'
43 | #' @return
44 | #' \code{data.frame} or \code{list}
45 | #'
46 | #' @examples
47 | #' # Create a client object
48 | #' mg <- MgnifyClient(useCache = FALSE)
49 | #'
50 | #' # Find kegg modules for certain analysis
51 | #' df <- getData(
52 | #' mg, type = "kegg-modules",
53 | #' accession = "MGYA00642773", accession.type = "analyses")
54 | #'
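55 | #' \dontrun{
56 | #' # As noted in the details, record types such as publications can be
57 | #' # listed without an accession; a sketch:
58 | #' publ <- getData(mg, type = "publications")
59 | #' }
60 | #'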
55 | #' @seealso
56 | #' \code{\link[MGnifyR:getResult]{getResult}}
57 | #'
58 | #' @name getData
59 | NULL
60 |
61 | #' @rdname getData
62 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R
63 | #' @export
64 | setMethod(
65 | "getData", signature = c(x = "MgnifyClient"), function(
66 | x, type, accession.type = NULL, accession = NULL, as.df = TRUE, ...){
67 | ############################### INPUT CHECK ################################
68 | available_types <- c(
69 | "studies", "samples", "runs", "analyses", "biomes", "assemblies",
70 | "super-studies", "experiment-types", "pipelines", "pipeline-tools",
71 | "publications", "genomes", "genome-search", "genome-search/gather",
72 | "genome-catalogues", "genomeset", "cogs", "kegg-modules",
73 | "kegg-classes", "antismash-geneclusters", "annotations/go-terms",
74 | "annotations/interpro-identifiers", "annotations/kegg-modules",
75 | "annotations/pfam-entries", "annotations/kegg-orthologs",
76 | "annotations/genome-properties", "annotations/antismash-gene-clusters",
77 | "annotations/organisms", "mydata")
78 | if( !(.is_non_empty_string(type) && type %in% available_types) ){
79 | stop(
80 | "'type' must be a single character value specifying ",
81 | "the type of instance to query. The value must be one of the ",
82 | "following options: ",
83 | paste0("'", paste(available_types, collapse = "', '"), "'"),
84 | call. = FALSE)
85 | }
86 | if( !(.is_non_empty_character(accession) || is.null(accession)) ){
87 | stop(
88 | "'accession' must be a single character value or vector of ",
89 | "character values specifying the MGnify accession identifier.",
90 | call. = FALSE)
91 | }
92 | if( !(.is_non_empty_character(accession.type) || is.null(accession.type)) ){
93 | stop(
94 | "'accession.type' must be a single character value or vector of ",
95 | "character values specifying the type of MGnify accession ",
96 | "identifier.", call. = FALSE)
97 | }
98 | if(
99 | (is.null(accession) && !is.null(accession.type)) ||
100 | (is.null(accession.type) && !is.null(accession)) ){
101 | stop(
102 | "Both 'accession' and 'accession.type' must be specified or they ",
103 | "must be NULL.", call. = FALSE)
104 | }
105 | if( !.is_a_bool(as.df) ){
106 | stop(
107 |             "'as.df' must be a single boolean value specifying whether ",
108 |             "to return a list or a data.frame.", call. = FALSE)
109 | }
110 | ############################# INPUT CHECK END ##############################
111 | # Retrieve results
112 | result <- .get_results_as_json_list(x, type, accession.type, accession, ...)
113 | # Convert to df
114 | if( as.df ){
115 | result <- .convert_json_list_to_df(result)
116 | } else if( length(result) == 1 ){
117 | result <- result[[1]]
118 | }
119 | return(result)
120 | })
121 |
122 | ################################ HELP FUNCTIONS ################################
123 |
124 | #' @importFrom plyr llply
125 | .get_results_as_json_list <- function(mg, type, accession.type, accession, ...){
126 | # Create a path. If multiple accession IDs, path is vector of multiple
127 | # paths. Otherwise the path specifies only the type
128 | if( !is.null(accession.type) && !is.null(accession) ){
129 | path <- paste0(accession.type, "/", accession, "/", type)
130 | names(path) <- accession
131 | } else{
132 | path <- type
133 | }
134 |     # Find results by looping through the paths
135 | res <- llply(path, function(x){
136 | .mgnify_retrieve_json(mg, path = x, ...)
137 | })
138 | return(res)
139 | }
140 |
141 | #' @importFrom tidyjson spread_all
142 | #' @importFrom dplyr bind_rows
143 | .convert_json_list_to_df <- function(result){
144 | # Create data.frames from individual search results
145 | res <- lapply(result, function(x){
146 | if( !is.null(x) ){
147 | x <- as.data.frame(spread_all(x))
148 | }
149 | return(x)
150 | })
151 | # Merge individual data.frames to one
152 | res <- bind_rows(res)
153 | # Add names if there were accession IDs provided as input
154 | if( !is.null(names(result)) ){
155 | # Assign to "accession" column name if there is no column with that name
156 | # already
157 | col_name <- "accession"
158 | col_name <- c(colnames(res), col_name)
159 | col_name <- make.unique(col_name)[[ length(col_name) ]]
160 | # Add to result df
161 |         nams <- rep(names(result), times = lengths(result))
162 | res[[ col_name ]] <- nams
163 | }
164 | return(res)
165 | }
166 |
--------------------------------------------------------------------------------
/R/searchAnalysis.R:
--------------------------------------------------------------------------------
1 | #' Look up analysis accession IDs for one or more study or sample accessions
2 | #'
3 | #' @details
4 | #' Retrieve analysis accession IDs associated with the supplied study or
5 | #' sample accession. In MGnify, an analysis accession refers to a certain
6 | #' pipeline analysis, such as specific 16S rRNA or shotgun metagenomic mapping.
7 | #' Studies can include multiple samples, and each sample can undergo multiple
8 | #' analyses using these pipelines. Each analysis is identified by a unique
9 | #' accession ID, allowing precise tracking and retrieval of analysis results
10 | #' within the MGnify database.
11 | #'
12 | #' @param x A \code{MgnifyClient} object.
13 | #'
14 | #' @param type A single character value specifying a type of
15 | #' accession IDs specified by \code{accession}. Must be "studies" or "samples".
16 | #'
17 | #' @param accession A single character value or a vector of character values
18 | #' specifying study or sample accession IDs that are used to retrieve analyses
19 | #' IDs.
20 | #'
21 | #' @param ... Optional arguments; not currently used.
22 | #'
23 | #' @return Vector of analysis accession IDs.
24 | #'
25 | #' @examples
26 | #' # Create a client object
27 | #' mg <- MgnifyClient(useCache = FALSE)
28 | #'
29 | #' # Retrieve analysis ids from study MGYS00005058
30 | #' result <- searchAnalysis(mg, "studies", c("MGYS00005058"))
31 | #'
32 | #' \dontrun{
33 | #' # Retrieve all analysis ids from samples
34 | #' result <- searchAnalysis(
35 | #' mg, "samples", c("SRS4392730", "SRS4392743"))
36 | #' }
37 | #'
38 | #' @name searchAnalysis
39 | NULL
40 |
41 | #' @rdname searchAnalysis
42 | #' @importFrom plyr llply
43 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R
44 | #' @export
45 | setMethod("searchAnalysis", signature = c(x = "MgnifyClient"), function(
46 | x, type, accession, ...){
47 | ############################### INPUT CHECK ################################
48 | if( !(length(type) == 1 && type %in% c("samples", "studies")) ){
49 | stop(
50 | "'type' must be 'samples' or 'studies'.", call. = FALSE)
51 | }
52 | if( !(.is_non_empty_character(accession)) ){
53 | stop(
54 | "'accession' must be a single character value or vector of ",
55 | "character values specifying the MGnify accession identifier.",
56 | call. = FALSE)
57 | }
58 | ############################# INPUT CHECK END ##############################
59 | # Get analysis accession IDs based on sample or study accessions
60 | result <- .mgnify_analyses_from_studies_and_samples(
61 | client = x, accession = accession, type = type, ...)
62 | return(result)
63 | })
64 |
65 | ################################ HELP FUNCTIONS ################################
66 | # Get analysis accessions based on studies or samples. The result is a vector
67 | # of analyses IDs.
68 | .mgnify_analyses_from_studies_and_samples <- function(
69 | client, accession, type, show.messages = verbose(client), ...){
70 | # Input check
71 | if( !.is_a_bool(show.messages) ){
72 | stop(
73 | "'show.messages' must be a single boolean value.", call. = FALSE)
74 | }
75 | show.messages <- ifelse(show.messages, "text", "none")
76 | #
77 | # Give message about progress
78 | if( show.messages == "text" ){
79 | message("Fetching analyses...")
80 | }
81 | # Search analyses IDs
82 | analysis_ids <- .get_all_analyses_ids(
83 | client, accession, type, "analyses", show.messages = show.messages, ...)
84 | # Check which study/sample ID resulted to found analysis ID
85 | not_found <- accession[ !accession %in% names(analysis_ids) ]
86 | # If user is searching analyses based on samples, we can still try another
87 | # approach. Sometimes, those "sample" IDs refer to runs instead.
88 | if( length(not_found) > 0 && type == "samples" ){
89 | # Finds runs based on samples
90 | temp <- .get_all_analyses_ids(
91 | client, accession, "samples", "runs",
92 | show.messages = show.messages, ...)
93 |         # Create a data.frame that holds all the IDs, to keep track of
94 |         # matches between IDs.
95 | id_df <- data.frame(sample = names(temp), run = temp)
96 | # Based on those runs, search analyses
97 | temp <- .get_all_analyses_ids(
98 | client, id_df[["run"]], "runs", "analyses",
99 | show.messages = show.messages, ...)
100 | # Add found analysis IDs to data.frame
101 | temp_df <- id_df[match(names(temp), id_df[["run"]]), ]
102 | temp_df[["analyses"]] <- temp
103 | id_df <- merge(id_df, temp_df, all = TRUE)
104 |
105 |         # If there are still samples that were not found, we can try to get
106 |         # analyses from assemblies. To do so, we first fetch assemblies
107 |         # based on runs.
108 | temp <- .get_all_analyses_ids(
109 | client, id_df[is.na(id_df[["analyses"]]), "run"], "runs",
110 | "assemblies", show.messages = show.messages, ...)
111 | # Add found analysis IDs to data.frame
112 | temp_df <- id_df[match(names(temp), id_df[["run"]]), ]
113 | temp_df[["assemblies"]] <- temp
114 | id_df <- merge(id_df, temp_df, all = TRUE)
115 | # Then based on assemblies, we can finally try to find analyses.
116 | temp <- .get_all_analyses_ids(
117 | client, id_df[is.na(id_df[["analyses"]]), "assemblies"],
118 | "assemblies", "analyses", show.messages = show.messages, ...)
119 | # Add found analysis IDs to data.frame
120 | temp_df <- id_df[match(names(temp), id_df[["assemblies"]]), ]
121 | temp_df[["analyses"]] <- temp
122 | id_df <- merge(id_df, temp_df, all = TRUE)
123 | # Now we should have a table that contains all the analyses that were
124 | # possible to find. Add these analyses to the original result list.
125 | temp <- id_df[["analyses"]]
126 | names(temp) <- id_df[["sample"]]
127 | temp <- temp[ !is.na(temp) ]
128 | analysis_ids <- c(analysis_ids, temp)
129 | # Update the "not found samples" vector
130 | not_found <- accession[ !accession %in% names(analysis_ids) ]
131 | }
132 | # If the data was not found for specified ID, give warning
133 | if( length(not_found) > 0 ){
134 | warning(
135 | "\nAnalyses not found for the following ", type, ": '",
136 | paste(not_found, collapse = "', '"), "'", call. = FALSE)
137 | }
138 | return(analysis_ids)
139 | }
140 |
141 | # This function takes IDs of type "type_from" as input and tries to fetch the
142 | # corresponding IDs of type "type_to", for example analyses based on studies
143 | # or samples.
144 | .get_all_analyses_ids <- function(
145 | client, ids, type_from, type_to, show.messages,
146 | use.cache = useCache(client), ...){
147 | #
148 | if( !.is_a_bool(use.cache) ){
149 | stop(
150 | "'use.cache' must be a single boolean value", call. = FALSE)
151 | }
152 | #
153 | # Get only unique IDs
154 | ids <- unique(ids)
155 | # Loop through accessions
156 | analysis_ids <- llply(ids, function(id){
157 | # Get URL address of results that were found. For instance, URL address
158 | # of analyses based on study ID/accession
159 | url <- .mgnify_get_x_for_y(
160 | client, id, type_from, type_to, use.cache = use.cache,
161 | ...)
162 | # Check whether results were found or not
163 | res <- NULL
164 | if( !is.null(url) ){
165 | # Get data
166 | json <- .mgnify_retrieve_json(
167 | client, complete_url = url, use.cache = use.cache,
168 | max.hits = NULL, ...)
169 | # We need just the accession ID
170 | res <- lapply(json, function(x) x$id) |> unlist()
171 | # Add accession as name. There might be multiple analyses for each
172 | # accession. This helps to determine which analyses belong to which
173 | # study.
174 | if( length(res) > 0 ){
175 | names(res) <- rep(id, length(res))
176 | }
177 | }
178 | return(res)
179 | }, .progress = show.messages)
180 | # Create a vector from results
181 | analysis_ids <- analysis_ids |> unlist()
182 | return(analysis_ids)
183 | }
184 |
--------------------------------------------------------------------------------
/vignettes/MGnifyR.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "MGnifyR"
3 | date: "`r Sys.Date()`"
4 | package: MGnifyR
5 | output:
6 | BiocStyle::html_document:
7 | fig_height: 7
8 | fig_width: 10
9 | toc: yes
10 | toc_depth: 2
11 | number_sections: true
12 | vignette: >
13 | %\VignetteIndexEntry{MGnifyR}
14 | %\VignetteEngine{knitr::rmarkdown}
15 | %\VignetteEncoding{UTF-8}
16 | bibliography: references.bib
17 | ---
18 |
19 | ```{r, include = FALSE}
20 | library(knitr)
21 | knitr::opts_chunk$set(
22 | collapse = TRUE,
23 | comment = "#>",
24 | cache = TRUE
25 | )
26 |
27 | # Get already loaded results
28 | path <- system.file("extdata", "vignette_MGnifyR.rds", package = "MGnifyR")
29 | vignette_MGnifyR <- readRDS(path)
30 | ```
31 |
32 | # Introduction
33 |
34 | `MGnifyR` is a package designed to ease access to the EBI's
35 | [MGnify](https://www.ebi.ac.uk/metagenomics) resource, allowing searching and
36 | retrieval of multiple datasets for downstream analysis.
37 |
38 | The latest version of MGnifyR seamlessly integrates with the
39 | [miaverse framework](https://microbiome.github.io/) providing access to
40 | cutting-edge tools in microbiome down-stream analytics.
41 |
42 | # Installation
43 |
44 | `MGnifyR` is hosted on Bioconductor, and can be installed via
45 | `BiocManager`.
46 |
47 | ```{r install, eval=FALSE}
48 | BiocManager::install("MGnifyR")
49 | ```
50 |
51 | # Load `MGnifyR` package
52 |
53 | Once installed, `MGnifyR` is made available in the usual way.
54 |
55 | ```{r load_package}
56 | library(MGnifyR)
57 | ```
58 |
59 | # Create a client
60 |
61 | All functions in `MGnifyR` make use of a `MgnifyClient` object to keep track
62 | of the JSONAPI url, disk cache location and user access tokens. Thus the first
63 | thing to do when starting any analysis is to instantiate this object. The
64 | following snippet creates one.
65 |
66 | ```{r create_client, message = FALSE}
67 | mg <- MgnifyClient(useCache = TRUE)
68 | mg
69 | ```
70 |
71 | The `MgnifyClient` object contains slots for each of the previously mentioned
72 | settings.
73 |
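The slot values can be inspected with the package's accessor functions. Below is
a minimal sketch; the accessor names (`useCache()`, `cacheDir()`) are assumed
from the package's accessor documentation and are not evaluated here, so check
`?MgnifyClient` if they differ in your installed version.

```{r client_accessors, eval=FALSE}
# Inspect selected client settings via the exported accessor functions
useCache(mg)
cacheDir(mg)
```
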
74 | # Functions for fetching the data
75 |
76 | ## Search data
77 |
78 | The `doQuery()` function can be used to search for results such as samples and
79 | studies in the MGnify database. Below, we fetch information on drinking water
80 | samples.
81 |
82 | ```{r search_studies1, eval=FALSE}
83 | # Fetch samples
84 | samples <- doQuery(
85 | mg,
86 | type = "samples",
87 | biome_name = "root:Environmental:Aquatic:Freshwater:Drinking water",
88 | max.hits = 10)
89 | ```
90 |
91 | ```{r search_studies2, eval=TRUE, include=FALSE}
92 | samples <- vignette_MGnifyR[["samples"]]
93 | ```
94 |
95 | The result is a table containing accession IDs and descriptions -- in this
96 | case, of samples.
97 |
98 | ```{r search_studies3}
99 | colnames(samples) |> head()
100 | ```
101 |
102 | ## Find relevant **analyses** accessions
103 |
104 | Now we want to find analysis accessions. Each sample might have multiple
105 | analyses. Each analysis ID corresponds to a single run of a particular pipeline
106 | on a single sample in a single study.
107 |
108 | ```{r convert_to_analyses1, eval=FALSE}
109 | analyses_accessions <- searchAnalysis(mg, "samples", samples$accession)
110 | ```
111 |
112 | ```{r convert_to_analyses2, eval=TRUE, include=FALSE}
113 | analyses_accessions <- vignette_MGnifyR[["analyses_accessions"]]
114 | ```
115 |
116 | By running the `searchAnalysis()` function, we get a vector of analysis IDs for
117 | the samples that we supplied as input.
118 |
119 | ```{r convert_to_analyses3}
120 | analyses_accessions |> head()
121 | ```
122 |
123 |
124 | ## Fetch metadata
125 |
126 | We can now check the metadata to get a hint of what kind of data we have. We
127 | use the `getMetadata()` function to fetch data based on analysis IDs.
128 |
129 | ```{r get_metadata1, eval=FALSE}
130 | analyses_metadata <- getMetadata(mg, analyses_accessions)
131 | ```
132 |
133 | ```{r get_metadata2, eval=TRUE, include=FALSE}
134 | analyses_metadata <- vignette_MGnifyR[["analyses_metadata"]]
135 | ```
136 |
137 | The returned value is a `data.frame` that includes metadata, for example on how
138 | the analysis was conducted and what kind of samples were analyzed.
139 |
140 | ```{r get_metadata3}
141 | colnames(analyses_metadata) |> head()
142 | ```
143 |
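If the metadata shows, for instance, that several pipeline versions are present,
the table can be subset with ordinary `data.frame` operations before fetching
the actual data. The sketch below is not evaluated here; the column name
`analysis_pipeline-version` is the one used elsewhere in this package's
vignettes.

```{r filter_pipeline, eval=FALSE}
# Keep only analyses produced with pipeline version 5.0
analyses_metadata <- analyses_metadata[
    analyses_metadata[["analysis_pipeline-version"]] == "5.0", ]
```
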
144 | ## Fetch microbiome data
145 |
146 | After we have selected the data to fetch, we can use `getResult()` to retrieve it.
147 |
148 | The output is `r BiocStyle::Biocpkg("TreeSummarizedExperiment")` (`TreeSE`) or
149 | `r BiocStyle::Biocpkg("MultiAssayExperiment")` (`MAE`) depending on the dataset.
150 | If the dataset includes only taxonomic profiling data, the output is a single
151 | `TreeSE`. If the dataset also includes functional data, the output is multiple
152 | `TreeSE` objects that are linked together in an `MAE`.
153 |
154 | ```{r get_mae1, eval=FALSE}
155 | mae <- getResult(mg, accession = analyses_accessions)
156 | ```
157 |
158 | ```{r get_mae2, eval=TRUE, include=FALSE}
159 | mae <- vignette_MGnifyR[["mae"]]
160 | ```
161 |
162 | ```{r get_mae3}
163 | mae
164 | ```
165 |
166 | You can access an individual `TreeSE` object in the `MAE` by specifying its
167 | index or name.
168 |
169 | ```{r mae_access}
170 | mae[[1]]
171 | ```
172 |
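To see which names are available for name-based access, you can list the
experiment names stored in the `MAE`; this is a small sketch and the names
depend on the dataset that was fetched.

```{r mae_names, eval=FALSE}
# List the experiments stored in the MultiAssayExperiment
names(mae)
```
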
173 | The `TreeSE` object is uniquely positioned to support `SummarizedExperiment`-based
174 | microbiome data manipulation and visualization. Moreover, it enables access
175 | to `miaverse` tools. For example, we can estimate the diversity of samples...
176 |
177 | ```{r calculate_diversity, fig.width=9}
178 | library(mia)
179 |
180 | mae[[1]] <- estimateDiversity(mae[[1]], index = "shannon")
181 |
182 | library(scater)
183 |
184 | plotColData(mae[[1]], "shannon", x = "sample_environment..biome.")
185 | ```
186 |
187 | ... and plot the abundances of the most abundant phyla.
188 |
189 | ```{r plot_abundance}
190 | # Agglomerate data
191 | altExps(mae[[1]]) <- splitByRanks(mae[[1]])
192 |
193 | library(miaViz)
194 |
195 | # Plot top taxa
196 | top_taxa <- getTopFeatures(altExp(mae[[1]], "Phylum"), 10)
197 | plotAbundance(
198 | altExp(mae[[1]], "Phylum")[top_taxa, ],
199 | rank = "Phylum",
200 | as.relative = TRUE
201 | )
202 | ```
203 |
204 | We can also perform other analyses, such as principal coordinate analysis, on
205 | the microbial profiling data by utilizing miaverse tools.
206 |
207 | ```{r pcoa}
208 | # Apply relative transformation
209 | mae[[1]] <- transformAssay(mae[[1]], method = "relabundance")
210 | # Perform PCoA
211 | mae[[1]] <- runMDS(
212 | mae[[1]], assay.type = "relabundance",
213 | FUN = vegan::vegdist, method = "bray")
214 | # Plot
215 | plotReducedDim(
216 | mae[[1]], "MDS", colour_by = "sample_environment..biome.")
217 | ```
218 |
219 | ## Fetch raw files
220 |
221 | While `getResult()` can be utilized to retrieve microbial profiling data,
222 | `getData()` can be used more flexibly to retrieve any kind of data from the
223 | database. It returns the data in a simple data.frame or list format.
224 |
225 | ```{r fetch_data1, eval=FALSE}
226 | publications <- getData(mg, type = "publications")
227 | ```
228 |
229 | ```{r fetch_data2, eval=TRUE, include=FALSE}
230 | publications <- vignette_MGnifyR[["publications"]]
231 | ```
232 |
233 | ```{r fetch_data3}
234 | colnames(publications) |> head()
235 | ```
236 |
237 | The result is a `data.frame` by default. In this case, it includes information
238 | on publications fetched from the data portal.
239 |
240 | ## Fetch sequence files
241 |
242 | Finally, we can use `searchFile()` and `getFile()` to retrieve other MGnify
243 | pipeline outputs such as merged sequence reads, assembled contigs, and details
244 | of the functional analyses.
245 |
246 | With `searchFile()`, we can search files from the database.
247 |
248 | ```{r get_download_urls1, eval=FALSE}
249 | dl_urls <- searchFile(mg, analyses_accessions, type = "analyses")
250 | ```
251 |
252 | ```{r get_download_urls2, eval=TRUE, include=FALSE}
253 | dl_urls <- vignette_MGnifyR[["dl_urls"]]
254 | ```
255 |
256 | The returned table contains search results for the analyses that we supplied as
257 | input. The table contains information on each file, as well as the URL address
258 | from which the file can be downloaded.
259 |
260 | ```{r get_download_urls3}
261 | target_urls <- dl_urls[
262 | dl_urls$attributes.description.label == "Predicted alpha tmRNA", ]
263 |
264 | colnames(target_urls) |> head()
265 | ```
266 |
267 | Finally, we can download the files with `getFile()`.
268 |
269 | ```{r download_file1, eval=FALSE}
270 | # Just select a single file from the target_urls list for demonstration.
271 | file_url <- target_urls$download_url[[1]]
272 | cached_location <- getFile(mg, file_url)
273 | ```
274 |
275 | ```{r download_file2, eval=TRUE, include=FALSE}
276 | cached_location <- vignette_MGnifyR[["cached_location"]]
277 | ```
278 |
279 | The function returns a path where the file is stored.
280 |
281 | ```{r download_file3}
282 | # Where are the files?
283 | cached_location
284 | ```
285 |
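Depending on the file type, the downloaded file can then be read with standard
R tools. As a sketch (not evaluated here, since the exact format depends on the
file that was selected), the first lines of a plain-text or gzip-compressed
file could be inspected like this:

```{r read_download, eval=FALSE}
# Peek at the beginning of the downloaded file; gzfile() also reads
# uncompressed text transparently
readLines(gzfile(cached_location), n = 5)
```
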
286 | ```{r session_info}
287 | sessionInfo()
288 | ```
289 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Artistic License 2.0
2 |
3 | Copyright (c) Ben Allen, Leo Lahti, Kévin Gatel, 2020-2022.
4 |
5 | Everyone is permitted to copy and distribute verbatim copies of this
6 | license document, but changing it is not allowed.
7 |
8 | Preamble
9 | ********
10 |
11 | This license establishes the terms under which a given free software
12 | Package may be copied, modified, distributed, and/or redistributed. The
13 | intent is that the Copyright Holder maintains some artistic control over
14 | the development of that Package while still keeping the Package
15 | available as open source and free software.
16 |
17 | You are always permitted to make arrangements wholly outside of this
18 | license directly with the Copyright Holder of a given Package. If the
19 | terms of this license do not permit the full use that you propose to
20 | make of the Package, you should contact the Copyright Holder and seek a
21 | different licensing arrangement.
22 |
23 | Definitions
24 | ***********
25 |
26 | "Copyright Holder" means the individual(s) or organization(s) named in
27 | the copyright notice for the entire Package.
28 |
29 | "Contributor" means any party that has contributed code or other
30 | material to the Package, in accordance with the Copyright Holder's
31 | procedures.
32 |
33 | "You" and "your" means any person who would like to copy, distribute, or
34 | modify the Package.
35 |
36 | "Package" means the collection of files distributed by the Copyright
37 | Holder, and derivatives of that collection and/or of those files. A
38 | given Package may consist of either the Standard Version, or a Modified
39 | Version.
40 |
41 | "Distribute" means providing a copy of the Package or making it
42 | accessible to anyone else, or in the case of a company or organization,
43 | to others outside of your company or organization.
44 |
45 | "Distributor Fee" means any fee that you charge for Distributing this
46 | Package or providing support for this Package to another party. It does
47 | not mean licensing fees.
48 |
49 | "Standard Version" refers to the Package if it has not been modified, or
50 | has been modified only in ways explicitly requested by the Copyright
51 | Holder.
52 |
53 | "Modified Version" means the Package, if it has been changed, and such
54 | changes were not explicitly requested by the Copyright Holder.
55 |
56 | "Original License" means this Artistic License as Distributed with the
57 | Standard Version of the Package, in its current version or as it may be
58 | modified by The Perl Foundation in the future.
59 |
60 | "Source" form means the source code, documentation source, and
61 | configuration files for the Package.
62 |
63 | "Compiled" form means the compiled bytecode, object code, binary, or any
64 | other form resulting from mechanical transformation or translation of
65 | the Source form.
66 |
67 | Permission for Use and Modification Without Distribution
68 | ********************************************************
69 |
70 | (1) You are permitted to use the Standard Version and create and use
71 | Modified Versions for any purpose without restriction, provided that you
72 | do not Distribute the Modified Version.
73 |
74 | Permissions for Redistribution of the Standard Version
75 | ******************************************************
76 |
77 | (2) You may Distribute verbatim copies of the Source form of the
78 | Standard Version of this Package in any medium without restriction,
79 | either gratis or for a Distributor Fee, provided that you duplicate all
80 | of the original copyright notices and associated disclaimers. At your
81 | discretion, such verbatim copies may or may not include a Compiled form
82 | of the Package.
83 |
84 | (3) You may apply any bug fixes, portability changes, and other
85 | modifications made available from the Copyright Holder. The resulting
86 | Package will still be considered the Standard Version, and as such will
87 | be subject to the Original License.
88 |
89 | Distribution of Modified Versions of the Package as Source
90 | **********************************************************
91 |
92 | (4) You may Distribute your Modified Version as Source (either gratis or
93 | for a Distributor Fee, and with or without a Compiled form of the
94 | Modified Version) provided that you clearly document how it differs from
95 | the Standard Version, including, but not limited to, documenting any
96 | non-standard features, executables, or modules, and provided that you do
97 | at least ONE of the following:
98 |
99 | (a) make the Modified Version available to the Copyright Holder of the
100 | Standard Version, under the Original License, so that the Copyright
101 | Holder may include your modifications in the Standard Version.
102 |
103 | (b) ensure that installation of your Modified Version does not prevent
104 | the user installing or running the Standard Version. In addition, the
105 | Modified Version must bear a name that is different from the name of the
106 | Standard Version.
107 |
108 | (c) allow anyone who receives a copy of the Modified Version to make the
109 | Source form of the Modified Version available to others under
110 |
111 | (i) the Original License or
112 |
113 | (ii) a license that permits the licensee to freely copy, modify and
114 | redistribute the Modified Version using the same licensing terms that
115 | apply to the copy that the licensee received, and requires that the
116 | Source form of the Modified Version, and of any works derived from it,
117 | be made freely available in that license fees are prohibited but
118 | Distributor Fees are allowed.
119 |
120 | Distribution of Compiled Forms of the Standard Version or Modified
121 | ******************************************************************
122 | Versions without the Source
123 | ***************************
124 |
125 | (5) You may Distribute Compiled forms of the Standard Version without
126 | the Source, provided that you include complete instructions on how to
127 | get the Source of the Standard Version. Such instructions must be valid
128 | at the time of your distribution. If these instructions, at any time
129 | while you are carrying out such distribution, become invalid, you must
130 | provide new instructions on demand or cease further distribution. If
131 | you provide valid instructions or cease distribution within thirty days
132 | after you become aware that the instructions are invalid, then you do
133 | not forfeit any of your rights under this license.
134 |
135 | (6) You may Distribute a Modified Version in Compiled form without the
136 | Source, provided that you comply with Section 4 with respect to the
137 | Source of the Modified Version.
138 |
139 | Aggregating or Linking the Package
140 | **********************************
141 |
142 | (7) You may aggregate the Package (either the Standard Version or
143 | Modified Version) with other packages and Distribute the resulting
144 | aggregation provided that you do not charge a licensing fee for the
145 | Package. Distributor Fees are permitted, and licensing fees for other
146 | components in the aggregation are permitted. The terms of this license
147 | apply to the use and Distribution of the Standard or Modified Versions
148 | as included in the aggregation.
149 |
150 | (8) You are permitted to link Modified and Standard Versions with other
151 | works, to embed the Package in a larger work of your own, or to build
152 | stand-alone binary or bytecode versions of applications that include the
153 | Package, and Distribute the result without restriction, provided the
154 | result does not expose a direct interface to the Package.
155 |
156 | Items That are Not Considered Part of a Modified Version
157 | ********************************************************
158 |
159 | (9) Works (including, but not limited to, modules and scripts) that
160 | merely extend or make use of the Package, do not, by themselves, cause
161 | the Package to be a Modified Version. In addition, such works are not
162 | considered parts of the Package itself, and are not subject to the terms
163 | of this license.
164 |
165 | General Provisions
166 | ******************
167 |
168 | (10) Any use, modification, and distribution of the Standard or Modified
169 | Versions is governed by this Artistic License. By using, modifying or
170 | distributing the Package, you accept this license. Do not use, modify,
171 | or distribute the Package, if you do not accept this license.
172 |
173 | (11) If your Modified Version has been derived from a Modified Version
174 | made by someone other than you, you are nevertheless required to ensure
175 | that your Modified Version complies with the requirements of this
176 | license.
177 |
178 | (12) This license does not grant you the right to use any trademark,
179 | service mark, tradename, or logo of the Copyright Holder.
180 |
181 | (13) This license includes the non-exclusive, worldwide, free-of-charge
182 | patent license to make, have made, use, offer to sell, sell, import and
183 | otherwise transfer the Package with respect to any patent claims
184 | licensable by the Copyright Holder that are necessarily infringed by the
185 | Package. If you institute patent litigation (including a cross-claim or
186 | counterclaim) against any party alleging that the Package constitutes
187 | direct or contributory patent infringement, then this Artistic License
188 | to you shall terminate on the date that such litigation is filed.
189 |
190 | (14) Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT
191 | HOLDER AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED
192 | WARRANTIES. THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
193 | PARTICULAR PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT
194 | PERMITTED BY YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT
195 | HOLDER OR CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT,
196 | INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE
197 | OF THE PACKAGE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
198 |
--------------------------------------------------------------------------------
/vignettes/MGnify_course.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Metagenomics bioinformatics at MGnify"
3 | date: "`r Sys.Date()`"
4 | package: MGnifyR
5 | output:
6 | BiocStyle::html_document:
7 | fig_height: 7
8 | fig_width: 10
9 | toc: yes
10 | toc_depth: 2
11 | number_sections: true
12 | vignette: >
13 | %\VignetteIndexEntry{MGnifyR, extended vignette}
14 | %\VignetteEngine{knitr::rmarkdown}
15 | %\VignetteEncoding{UTF-8}
16 | bibliography: references.bib
17 | ---
18 |
19 | ```{r setup, include=FALSE}
20 | knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, eval = FALSE)
21 | ```
22 |
23 | ## Introduction
24 |
25 | In this notebook we aim to demonstrate how the `MGnifyR` tool can be used to
26 | fetch data from the MGnify microbiome data resource. We then showcase how to
27 | analyze the data using advanced microbiome data science tools, including
28 | estimating alpha and beta diversity, as well as performing differential abundance analysis.
29 |
30 | [`MGnifyR`](https://www.bioconductor.org/packages/release/bioc/html/MGnifyR.html)
31 | is an R/Bioconductor package that provides a set of tools for easily accessing
32 | and processing MGnify data in R, making queries to MGnify databases through the
33 | [MGnify API](https://www.ebi.ac.uk/metagenomics/api/v1/).
34 |
35 | The benefit of `MGnifyR` is that it streamlines data access, allowing users to
36 | fetch data either in its "raw" format or directly as a
37 | [`TreeSummarizedExperiment` (`TreeSE`)](https://microbiome.github.io/OMA/docs/devel/pages/containers.html)
38 | object. This enables seamless integration with custom workflows for analysis.
39 |
40 | Utilizing `TreeSE` provides access to a wide range of tools within
41 | Bioconductor's `SummarizedExperiment` (`SE`) ecosystem. It also integrates
42 | with the
43 | [`mia` package](https://microbiome.github.io/mia/), which offers
44 | microbiome-specific methods within the `SE` framework.
45 |
46 | For more information
47 | on microbiome data science in Bioconductor, refer to
48 | [Orchestrating Microbiome Analysis (OMA) online book](https://microbiome.github.io/OMA/docs/devel/).
49 |
50 | ## Load packages
51 |
52 | ```{r install}
53 | # List of packages that we need
54 | packages <- c("ANCOMBC", "MGnifyR", "mia", "miaViz", "scater")
55 |
56 | # Get packages that are already installed
57 | packages_already_installed <- packages[ packages %in% installed.packages() ]
58 | # Get packages that need to be installed
59 | packages_need_to_install <- setdiff( packages, packages_already_installed )
60 | # Loads BiocManager into the session. Install it if it is not already installed.
61 | if( !require("BiocManager") ){
62 | install.packages("BiocManager")
63 | library("BiocManager")
64 | }
65 | # If there are packages that need to be installed, installs them with BiocManager
66 | # Updates old packages.
67 | if( length(packages_need_to_install) > 0 ) {
68 | install(packages_need_to_install, ask = FALSE)
69 | }
70 |
71 | # Load all packages into session. Stop if there are packages that were not
72 | # successfully loaded
73 | pkgs_not_loaded <- !sapply(packages, require, character.only = TRUE)
74 | pkgs_not_loaded <- names(pkgs_not_loaded)[ pkgs_not_loaded ]
75 | if( length(pkgs_not_loaded) > 0 ){
76 | stop(
77 | "Error in loading the following packages into the session: '",
78 | paste0(pkgs_not_loaded, collapse = "', '"), "'")
79 | }
80 | ```
81 |
82 | ## Data import
83 |
84 | To interact with the MGnify database, we need to create a `MgnifyClient` object.
85 | This object allows us to store options for data fetching. For instance, we can
86 | configure it to use a cache for improved efficiency.
87 |
88 | ```{r create_mgnify_obj}
89 | #| output: false
90 |
91 | # Create the MgnifyClient object with caching enabled
92 | mg <- MgnifyClient(
93 | useCache = TRUE,
94 | cacheDir = "/home/training" # Set this to your desired cache directory
95 | )
96 | ```
97 |
98 | In this workflow, we will fetch taxonomy annotations and metadata from
99 | the study
100 | ["MGYS00005154"](https://www.ebi.ac.uk/metagenomics/studies/MGYS00005154).
101 | The dataset focuses on the human gut microbiome, analyzed
102 | across different geographic regions.
103 |
104 | We can now search for all analyses associated with a certain study.
105 | An analysis refers to a metagenomic run performed on a sample. Each
106 | sample can have multiple runs, which is why we work with analyses and not
107 | with samples; an analysis identifier points to a single entity.
108 |
109 | ```{r search_analysis}
110 | #| output: false
111 |
112 | study_id <- "MGYS00005154"
113 | analysis_id <- searchAnalysis(mg, "studies", study_id)
114 | ```
115 |
116 | Now we are ready to load the metadata on the analyses to get an idea of what
117 | kind of data we are dealing with.
118 |
119 | There are currently (17 Sep 2024) almost 1,000 analyses available. Downloading
120 | the whole dataset will take some time, which is why we use the cache.
121 |
122 | ```{r load_meta}
123 | metadata <- getMetadata(mg, accession = analysis_id)
124 | ```
125 |
126 | We can see that the analyses were performed with different pipeline versions.
127 | Let's keep only those analyses that were generated with pipeline version 5.0.
128 |
129 | ```{r subset_meta}
130 | metadata <- metadata[metadata[["analysis_pipeline-version"]] == "5.0", ]
131 | ```
132 |
133 | We now have analyses that each point to a unique sample. The final step is
134 | to fetch the abundance tables in `TreeSummarizedExperiment` (`TreeSE`) format.
135 |
136 | ```{r import_treese}
137 | tse <- getResult(
138 | mg,
139 | accession = metadata[["analysis_accession"]],
140 | get.func = FALSE
141 | )
142 | tse
143 | ```
144 |
145 | The fetched data is a `TreeSE` object that includes taxonomy annotations. See
146 | [OMA online book](https://microbiome.github.io/OMA/docs/devel/pages/containers.html)
147 | on how to handle the data in this format.
148 |
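As a quick sanity check we can peek at the taxonomy annotations and the sample
metadata stored in the object. This sketch uses only standard
`SummarizedExperiment` accessors (`rowData()`, `colData()`); no
MGnifyR-specific functions are assumed.

```{r peek_tse}
# Taxonomy annotations are stored in rowData, sample metadata in colData
head(rowData(tse))
head(colData(tse))
```
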
149 | ## Preprocessing
150 |
151 | Below, we agglomerate the data to the Order level, meaning we summarize the
152 | abundances at this specific taxonomic rank. The OMA provides a detailed
153 | [chapter](https://microbiome.github.io/OMA/docs/devel/pages/agglomeration.html)
154 | explaining agglomeration in more depth.
155 |
156 | ```{r agg}
157 | tse_order <- agglomerateByRank(tse, rank = "Order")
158 | ```
159 |
160 | Because of the unique properties of microbiome data, we have to apply
161 | transformations. Here, we perform a relative abundance transformation. You can
162 | find more information on transformations in
163 | [OMA](https://microbiome.github.io/OMA/docs/devel/pages/transformation.html).
164 |
165 | ```{r preprocess}
166 | # Transform the main TreeSE
167 | tse <- transformAssay(tse, method = "relabundance")
168 | # Transform the agglomerated TreeSE
169 | tse_order <- transformAssay(tse_order, method = "relabundance")
170 | ```
171 |
172 | ## Alpha diversity
173 |
174 | Alpha diversity measures community diversity within a sample. Learn more about
175 | community diversity
176 | [here](https://microbiome.github.io/OMA/docs/devel/pages/alpha_diversity.html).
177 |
178 | ```{r alpha}
179 | # Calculate alpha diversity
180 | tse <- addAlpha(tse)
181 |
182 | # Create a plot
183 | p <- plotColData(
184 | tse,
185 | y = "shannon_diversity",
186 | x = "sample_geographic.location..country.and.or.sea.region.",
187 | show_boxplot = TRUE
188 | )
189 | p
190 | ```
191 |
192 | We can test whether the diversity differences are statistically significant.
193 | We use the Mann-Whitney U test (also known as the Wilcoxon rank-sum test).
194 |
195 | ```{r}
196 | pairwise.wilcox.test(
197 | tse[["shannon_diversity"]],
198 | tse[["sample_geographic.location..country.and.or.sea.region."]],
199 | p.adjust.method = "fdr"
200 | )
201 | ```
202 |
203 | To add p-values to the plot, see
204 | [OMA](https://microbiome.github.io/OMA/docs/devel/pages/alpha_diversity.html#visualizing-significance-in-group-wise-comparisons).
205 |
206 | ## Beta diversity
207 |
208 | We can assess the differences in microbial compositions between samples, aiming
209 | to identify patterns in the data that are associated with covariates.
210 |
211 | To achieve this, we perform Principal Coordinate Analysis (PCoA) using
212 | Bray-Curtis dissimilarity.
213 |
214 | ```{r pcoa}
215 | # Perform PCoA
216 | tse <- runMDS(
217 | tse,
218 | FUN = getDissimilarity,
219 | method = "bray",
220 | assay.type = "relabundance"
221 | )
222 | # Visualize PCoA
223 | p <- plotReducedDim(
224 | tse,
225 | dimred = "MDS",
226 | colour_by = "sample_geographic.location..country.and.or.sea.region."
227 | )
228 | p
229 | ```
230 |
231 | See [community similarity chapter](https://microbiome.github.io/OMA/docs/devel/pages/beta_diversity.html)
232 | from OMA for more information.
233 |
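To assess whether the visual grouping is statistically supported, one option is
PERMANOVA. The sketch below assumes that the `vegan` package is available and
reuses the same grouping variable as above; it is an optional addition rather
than part of the core workflow.

```{r permanova}
library(vegan)

# PERMANOVA: does community composition differ between geographic regions?
adonis2(
    t(assay(tse, "relabundance")) ~
        sample_geographic.location..country.and.or.sea.region.,
    data = as.data.frame(colData(tse)),
    method = "bray"
)
```
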
234 | ## Differential abundance analysis (DAA)
235 |
236 | In DAA, we analyze whether the abundances of certain features differ between
237 | study groups. Again, OMA has a dedicated chapter on this
238 | [topic](https://microbiome.github.io/OMA/docs/devel/pages/differential_abundance.html).
239 |
240 | ```{r daa1}
241 | # Perform DAA
242 | res <- ancombc2(
243 | data = tse_order,
244 | assay.type = "counts",
245 | fix_formula = "sample_geographic.location..country.and.or.sea.region.",
246 | p_adj_method = "fdr",
247 | )
248 | ```
249 |
250 | Next, we visualize the features that have the lowest adjusted p-values.
251 |
252 | ```{r daa2}
253 | # Get the most significant features
254 | n_top <- 4
255 | res_tab <- res[["res"]]
256 | res_tab <- res_tab[order(res_tab[["q_(Intercept)"]]), ]
257 | top_feat <- res_tab[seq_len(n_top), "taxon"]
258 |
259 | # Create a plot
260 | p <- plotExpression(
261 | tse_order,
262 | features = top_feat,
263 | assay.type = "relabundance",
264 | x = "sample_geographic.location..country.and.or.sea.region.",
265 | show_boxplot = TRUE, show_violin = FALSE, point_shape = NA
266 | ) +
267 | scale_y_log10()
268 | p
269 | ```
270 |
271 | ## Session info
272 |
273 | ```{r session_info}
274 | sessionInfo()
275 | ```
276 |
--------------------------------------------------------------------------------
/R/doQuery.R:
--------------------------------------------------------------------------------
1 | #' Search MGnify database for studies, samples, runs, analyses, biomes,
2 | #' assemblies, and genomes.
3 | #'
4 | #' @details
5 | #' \code{doQuery} is a flexible query function, harnessing the "full"
6 | #' power of the JSONAPI MGnify search filters. Search results may be filtered
7 | #' by metadata value, associated study/sample/analysis, etc.
8 | #'
9 | #' See \href{https://www.ebi.ac.uk/metagenomics/api/v1/}{Api browser} for
10 | #' information on MGnify database filters.
11 | #' You can find help on customizing queries from
12 | #' \href{https://emg-docs.readthedocs.io/en/latest/api.html#customising-queries}{here}.
13 | #'
14 | #' For example the following filters are available:
15 | #' \itemize{
16 | #' \item{\strong{studies}: accession, biome_name, lineage, centre_name,
17 | #' include}
18 | #' \item{\strong{samples}: accession, experiment_type, biome_name,
19 | #' lineage, geo_loc_name, latitude_gte, latitude_lte,
20 | #' longitude_gte, longitude_lte, species, instrument_model,
21 | #' instrument_platform, metadata_key, metadata_value_gte,
22 | #' metadata_value_lte, metadata_value, environment_material,
23 | #' environment_feature, study_accession, include}
24 | #' \item{\strong{runs}: accession, experiment_type, biome_name, lineage,
25 | #'   species, instrument_platform, instrument_model, metadata_key,
26 | #' metadata_value_gte, metadata_value_lte, metadata_value, sample_accession,
27 | #' study_accession, include}
28 | #' \item{\strong{analyses}: biome_name, lineage, experiment_type, species,
29 | #' sample_accession, pipeline_version}
30 | #' \item{\strong{biomes}: depth_gte, depth_lte}
31 | #' \item{\strong{assemblies}: depth_gte, depth_lte}
32 | #' }
33 | #' Unfortunately it appears that in some cases some of these filters don't work
34 | #' as expected, so it is important to check that the returned results match up
35 | #' with what's expected. Even more unfortunately, if there's an error in the
36 | #' parameter specification, the query will run as if no filter parameters were
37 | #' present at all. Thus the result will appear superficially correct but will in
38 | #' fact correspond to something completely different. This behaviour will
39 | #' hopefully be fixed in future versions of MGnifyR or the JSONAPI, but for now
40 | #' users should double check returned values.
41 | #'
42 | #' It is currently not possible to combine queries of the same type in a single
43 | #' call (for example, to search for samples \emph{between} two latitudes).
44 | #' However, it is possible to run multiple queries and combine the results
45 | #' using set operations in R to get the desired behaviour (see the examples).
46 | #'
47 | #' @param x A \code{MgnifyClient} object.
48 | #'
49 | #' @param type A single character value specifying the type of objects to
50 | #' query. Must be one of the following options: \code{studies}, \code{samples},
51 | #' \code{runs}, \code{analyses}, \code{biomes}, \code{assemblies},
52 | #' \code{super-studies}, \code{experiment-types}, \code{pipelines},
53 | #' \code{pipeline-tools}, \code{publications}, \code{genomes},
54 | #' \code{genome-search}, \code{genome-search/gather}, \code{genome-catalogues},
55 | #' \code{genomeset}, \code{cogs}, \code{kegg-modules}, \code{kegg-classes},
56 | #' \code{antismash-geneclusters}, \code{annotations/go-terms},
57 | #' \code{annotations/interpro-identifiers}, \code{annotations/kegg-modules},
58 | #' \code{annotations/pfam-entries}, \code{annotations/kegg-orthologs},
59 | #' \code{annotations/genome-properties},
60 | #' \code{annotations/antismash-gene-clusters}, \code{annotations/organisms}, or
61 | #' \code{mydata}.
62 | #' (By default: \code{type = "studies"})
63 | #'
64 | #' @param accession A single character value or a vector of character values
65 | #' specifying MGnify accession identifiers (of type \code{type}) or NULL. When
66 | #' NULL, all results defined by other parameters are retrieved.
67 | #' (By default: \code{accession = NULL})
68 | #'
69 | #' @param as.df A single boolean value specifying whether to return the
70 | #' results as a data.frame or leave as a nested list. In most cases,
71 | #' \code{as.df = TRUE} will make the most sense.
72 | #' (By default: \code{as.df = TRUE})
73 | #'
74 | #' @param max.hits A single integer value specifying the maximum number of
75 | #' results to return, or NULL. The actual number of results returned may be
76 | #' higher than \code{max.hits}, as clipping only occurs on pagination page
77 | #' boundaries. To disable the limit, set \code{max.hits = NULL}.
78 | #' (By default: \code{max.hits = 200})
79 | #'
80 | #' @param ... Remaining parameter key/value pairs may be supplied to filter
81 | #' the returned values. Available options differ between \code{types}.
82 | #' See the Details section for more information.
83 | #'
84 | #' @return A nested list or data.frame containing the results of the query.
85 | #'
86 | #' @examples
87 | #' mg <- MgnifyClient(useCache = FALSE)
88 | #'
89 | #' # Get a list of studies from the Agricultural wastewater biome:
90 | #' agwaste_studies <- doQuery(
91 | #' mg, "studies", biome_name="Agricultural wastewater"
92 | #' )
93 | #'
94 | #' \dontrun{
95 | #' # Get all samples from a particular study
96 | #' samps <- doQuery(mg, "samples", accession="MGYS00004521")
97 | #'
98 | #' # Search polar samples
99 | #' samps_np <- doQuery(mg, "samples", latitude_gte=66, max.hits=10)
100 | #' samps_sp <- doQuery(mg, "samples", latitude_lte=-66, max.hits=10)
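#'
#' # Combine the two polar queries into one table; a sketch using
#' # dplyr::bind_rows() (dplyr is already a dependency of this package)
#' samps_polar <- dplyr::bind_rows(samps_np, samps_sp)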
101 | #'
102 | #' # Search studies that have studied drinking water
103 | #' tbl <- doQuery(
104 | #' mg,
105 | #' type = "studies",
106 | #' biome_name = "root:Environmental:Aquatic:Freshwater:Drinking water",
107 | #' max.hits = 10)
108 | #' }
109 | #'
110 | #' @name doQuery
111 | NULL
112 |
113 | #' @rdname doQuery
114 | #' @importFrom dplyr bind_rows
115 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R
116 | #' @export
117 | setMethod("doQuery", signature = c(x = "MgnifyClient"), function(
118 | x, type = "studies", accession = NULL, as.df = TRUE, max.hits = 200,
119 | ...){
120 | ############################### INPUT CHECK ################################
121 | available_types <- c(
122 | "studies", "samples", "runs", "analyses", "biomes", "assemblies",
123 | "super-studies", "experiment-types", "pipelines", "pipeline-tools",
124 | "publications", "genomes", "genome-search", "genome-search/gather",
125 | "genome-catalogues", "genomeset", "cogs", "kegg-modules",
126 | "kegg-classes", "antismash-geneclusters", "annotations/go-terms",
127 | "annotations/interpro-identifiers", "annotations/kegg-modules",
128 | "annotations/pfam-entries", "annotations/kegg-orthologs",
129 | "annotations/genome-properties", "annotations/antismash-gene-clusters",
130 | "annotations/organisms", "mydata")
131 | if( !(.is_non_empty_string(type) && type %in% available_types) ){
132 | stop(
133 | "'type' must be a single character value specifying ",
134 | "the type of instance to query. The value must be one of the ",
135 | "following options: ",
136 | paste0("'", paste(available_types, collapse = "', '"), "'"),
137 | call. = FALSE)
138 | }
139 | if( !(.is_non_empty_character(accession) || is.null(accession)) ){
140 | stop(
141 | "'accession' must be a single character value or vector of ",
142 | "character values specifying the MGnify accession identifier ",
143 | "or NULL.",
144 | call. = FALSE)
145 | }
146 | if( !.is_a_bool(as.df) ){
147 | stop(
148 |             "'as.df' must be a single boolean value specifying whether ",
149 | "to return list or data.frame.", call. = FALSE)
150 | }
151 | if( !((.is_an_integer(max.hits) && (max.hits > 0 || max.hits == -1) ) ||
152 | is.null(max.hits) ) ){
153 | stop(
154 | "'max.hits' must be a single integer value specifying the ",
155 | "maximum number of results to return or NULL.", call. = FALSE)
156 | }
157 | ############################# INPUT CHECK END ##############################
158 | # Perform query
159 | result <- .perform_query(
160 | client = x, type = type, accession = accession, max.hits = max.hits,
161 | ...)
162 | # Convert list to data.frame if specified
163 | if( as.df && length(result) > 0 ){
164 | # Turn lists to dfs
165 | result <- lapply(result, .list_to_dataframe)
166 | # Combine dfs
167 | result <- bind_rows(result)
168 | }
169 | return(result)
170 | })
171 |
172 | ################################ HELP FUNCTIONS ################################
173 |
174 | .perform_query <- function(
175 | client, type, accession, max.hits, use.cache = useCache(client),
176 | show.messages = verbose(client), ...){
177 | # Input check
178 | if( !.is_a_bool(use.cache) ){
179 | stop(
180 | "'use.cache' must be a single boolean value.", call. = FALSE)
181 | }
182 | #
183 | # Get parameters that are passed to do the query from database
184 | query_params <- list(...)
185 | query_params[["accession"]] <- accession
186 | # Get results from the database
187 | result <- .mgnify_retrieve_json(
188 | client = client,
189 | path = type,
190 | max.hits = max.hits,
191 | use.cache = use.cache,
192 | qopts = query_params
193 | )
194 | # Rename entries by accession
195 | id_list <- lapply(result, function(res) res$id)
196 | if( !is.null(result) ){
197 | names(result) <- id_list
198 | }
199 | return(result)
200 | }
201 |
202 | .list_to_dataframe <- function(result){
203 | # Get attributes
204 | df <- .mgnify_attr_list_to_df_row(
205 | json = result, metadata_key = "sample-metadata")
206 |
207 | # Loop through relationships, i.e., this data might be related to specific
208 | # samples, analyses... --> get that info
209 | relationships <- result[["relationships"]]
210 | for( i in seq_len(length(relationships)) ){
211 | # Get specific relationship, e.g., this data vs related runs
212 | relationship_type <- names(result$relationships)[[i]]
213 | relationship <- result$relationships[[i]]
214 | # Get only data (temp is list of lists and only data element contains
215 | # relevant info)
216 | rel_data <- relationship[["data"]]
217 | # If there is data, include it
218 | if( !is.null(rel_data) && length(rel_data) > 0 ){
219 | # Take all "id" values. Some data can also include list of
220 | # lists. --> unlist and take "id" values. Based on this ID (such
221 | # as "runs" ID) user can fetch specific relationship
222 | rel_data <- unlist(rel_data)
223 | rel_data <- rel_data[names(rel_data) %in% "id"]
224 | temp_names <- rep(relationship_type, length(rel_data))
225 | # Get all column names and make them unique
226 | colnames <- append(colnames(df), temp_names)
227 | colnames <- make.unique(colnames)
228 | # Get only column values that are being added
229 | temp_names <- colnames[
230 | (length(colnames)-length(temp_names)+1):length(colnames)]
231 | # Add new data to dataset
232 | df[temp_names] <- rel_data
233 | }
234 | }
235 | # Add type of data that is being queried and accession code
236 | df[["type"]] <- result[["type"]]
237 | rownames(df) <- df[["accession"]]
238 | return(df)
239 | }
240 |
--------------------------------------------------------------------------------
/R/getFile.R:
--------------------------------------------------------------------------------
1 | #' Download any MGnify files, also including processed reads and
2 | #' identified protein sequences
3 | #'
4 | #' @details
5 | #' \code{getFile} is a convenient wrapper around the generic URL
6 | #' downloading functionality in R, taking care of things like local
7 | #' caching and authentication.
8 | #'
9 | #' @param x A \code{MgnifyClient} object.
10 | #'
11 | #' @param url A single character value specifying the url address of the file
12 | #' we wish to download.
13 | #'
14 | #' @param file A single character value or NULL specifying an
15 | #' optional local filename to use for saving the file. If \code{NULL},
16 | #' MGnify local cache settings will be used. If the file is intended to be
17 | #' processed in a separate program, it may be sensible to provide a
18 | #' meaningful \code{file}, rather than having to hunt through the
19 | #' cache folders. If \code{file} is \code{NULL} and \code{useCache(client)}
20 | #' is \code{FALSE}, the \code{read.func} parameter must be supplied or the
21 | #' file will be downloaded and then deleted.
22 | #' (By default: \code{file = NULL})
23 | #'
24 | #' @param read.func A function specifying an optional function to process the
25 | #' downloaded file and return the results, rather than relying on post
26 | #' processing. The primary use-case for this parameter is when local disk
27 | #' space is limited and downloaded files can be quickly processed and
28 | #' discarded. The function should take a single parameter, the downloaded
29 | #' filename, and may return any valid R object.
30 | #' (By default: \code{read.func = NULL})
31 | #'
32 | #' @param ... Additional arguments; not used currently.
33 | #'
34 | #' @return For \code{getFile()}, the local path of the downloaded file, either
35 | #' the location in the MGnifyR cache or the supplied \code{file}. If
36 | #' \code{read.func} is used, its result will be returned.
37 | #'
38 | #' @examples
39 | #' # Make a client object
40 | #' mg <- MgnifyClient(useCache = FALSE)
41 | #'
42 | #' # Create a vector of accession ids - these happen to be \code{analysis}
43 | #' # accessions
44 | #' accession_vect <- c("MGYA00563876", "MGYA00563877")
45 | #' downloads <- searchFile(mg, accession_vect, "analyses")
46 | #'
47 | #' # Filter to find the urls of 16S encoding sequences
48 | #' url_list <- downloads[
49 | #' downloads$attributes.description.label == "Contigs encoding SSU rRNA",
50 | #' "download_url"]
51 | #'
52 | #' # Example 1:
53 | #' # Download the first file
54 | #' supplied_filename <- getFile(
55 | #' mg, url_list[[1]], file="SSU_file.fasta.gz")
56 | #'
57 | #' \dontrun{
58 | #' # Example 2:
59 | #' # Just use local caching
60 | #' cached_filename <- getFile(mg, url_list[[2]])
61 | #'
62 | #' # Example 3:
63 | #' # Using read.func to open the reads with readDNAStringSet from
64 | #' # \code{Biostrings}, without retaining the file on disk
65 | #' dna_seqs <- getFile(
66 | #' mg, url_list[[3]], read.func = readDNAStringSet)
67 | #' }
68 | #'
69 | #' @name getFile
70 | NULL
71 |
72 | #' @rdname getFile
73 | #' @importFrom httr GET add_headers content write_disk
74 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R
75 | #' @export
76 | setMethod("getFile", signature = c(x = "MgnifyClient"), function(
77 | x, url, file = NULL, read.func = NULL, ...){
78 | ############################### INPUT CHECK ################################
79 | if( !.is_non_empty_string(url) ){
80 | stop(
81 | "'url' must be a single character value specifying ",
82 | "the url of the file.", call. = FALSE)
83 | }
84 | if( !(.is_non_empty_string(file) || is.null(file)) ){
85 | stop(
86 | "'file' must be NULL or a single character value ",
87 | "specifying the name of file being saved.", call. = FALSE)
88 | }
89 | if( !(is.function(read.func) || is.null(read.func)) ){
90 | stop(
91 | "'read.func' must be a function that is used to process the file ",
92 | "or NULL.", call. = FALSE)
93 | }
94 | ############################# INPUT CHECK END ##############################
95 | # Get file
96 | result <- .mgnify_download(
97 | client = x, url = url, file = file,
98 | read.func = read.func, ...)
99 | return(result)
100 | })
101 |
102 | #' Listing files available for download
103 | #'
104 | #' @details
105 | #' The \code{searchFile()} function is a wrapper allowing easy enumeration of
106 | #' the downloads available for given accession IDs.
107 | #' It returns a single data.frame containing all available downloads and
108 | #' associated metadata, including the url location and description. This can
109 | #' then be filtered to extract the urls of interest, before actually
110 | #' retrieving the files using \code{getFile()}.
111 | #'
112 | #' @param accession A single character value or a vector of character values
113 | #' specifying accession IDs to return results for.
114 | #'
115 | #' @param type A single character value specifying the type of objects to
116 | #' query. Must be one of the following options: \code{studies}, \code{samples},
117 | #' \code{analyses}, \code{assemblies}, \code{genomes} or \code{run}.
118 | #' (By default: \code{type = "studies"})
119 | #'
120 | #' @return For \code{searchFile()}, a \code{data.frame} containing all
121 | #' discovered downloads. If multiple \code{accessions} are queried, the
122 | #' \code{accession} column may be used to filter the results, since rownames
123 | #' are not set (and would not make sense, as each query returns multiple items).
124 | #'
125 | #' @examples
126 | #' # Make a client object
127 | #' mg <- MgnifyClient(useCache = TRUE)
128 | #' # Create a vector of accession ids - these happen to be \code{analysis}
129 | #' # accessions
130 | #' accession_vect <- c(
131 | #' "MGYA00563876", "MGYA00563877", "MGYA00563878",
132 | #' "MGYA00563879", "MGYA00563880" )
133 | #' downloads <- searchFile(mg, accession_vect, "analyses")
134 | #'
135 | #' @name getFile
136 | NULL
137 |
138 | #' @rdname getFile
139 | #' @importFrom plyr llply rbind.fill
140 | #' @importFrom urltools parameters parameters<-
141 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R
142 | #' @export
143 | setMethod("searchFile", signature = c(x = "MgnifyClient"), function(
144 | x, accession, type = c(
145 | "studies", "samples", "analyses", "assemblies", "genomes", "run"),
146 | ...
147 | ){
148 | ############################### INPUT CHECK ################################
149 | if( !.is_non_empty_character(accession) ){
150 | stop(
151 | "'accession' must be a list of character values specifying ",
152 | "the MGnify accession identifiers.",
153 | call. = FALSE)
154 | }
155 | if( !(.is_non_empty_string(type)) ){
156 | stop(
157 | "'type' must be a single character value specifying ",
158 | "the type of instance to query.", call. = FALSE)
159 | }
160 | type <- match.arg(type, several.ok = FALSE)
161 | ############################# INPUT CHECK END ##############################
162 | # Get file urls
163 | result <- .mgnify_get_download_urls(
164 | client = x, accession = accession, type = type, ...)
165 | return(result)
166 | })
167 |
168 | ################################ HELP FUNCTIONS ################################
169 |
170 | # Download the specified files from the database
171 | .mgnify_download <- function(
172 | client, url, file, read.func, use.cache = useCache(client),
173 | url.address = databaseUrl(client), cache.dir = cacheDir(client),
174 | show.warnings = showWarnings(client), clear.cache = clearCache(client),
175 | auth.tok = authTok(client), ...){
176 | # Input check
177 | if( !.is_non_empty_string(url.address) ){
178 | stop(
179 | "'url.address' must be a string.", call. = FALSE)
180 | }
181 | if( !.is_a_bool(use.cache) ){
182 | stop(
183 | "'use.cache' must be a single boolean value.", call. = FALSE)
184 | }
185 | if( !.is_non_empty_string(cache.dir) ){
186 | stop(
187 | "'cache.dir' must be a string.", call. = FALSE)
188 | }
189 | if( !.is_a_bool(show.warnings) ){
190 | stop(
191 | "'show.warnings' must be a single boolean value.", call. = FALSE)
192 | }
193 | if( !.is_a_bool(clear.cache) ){
194 | stop(
195 | "'clear.cache' must be a single boolean value.", call. = FALSE)
196 | }
197 | if( !(.is_non_empty_string(auth.tok) || is.null(auth.tok)) ){
198 | stop(
199 | "'auth.tok' must be a string or NULL.", call. = FALSE)
200 | }
201 | #
202 | # Set up filenames for storing the data
203 | if ( !is.null(file) ){
204 | file_path <- file
205 | }else if(use.cache){
206 | # Build a filename out of the url, including the full paths. Annoying,
207 | # but some downloads (e.g. genome results) are just names like
208 | # core_genes.fa , which would break the caching.
209 | cachetgt <- gsub(paste(url.address, "/", sep = ""), "", url)
210 |
211 | # Make sure the directory exists
212 | cache_full_name <- file.path(cache.dir, cachetgt)
213 | dir.create(
214 | dirname(cache_full_name), recursive = TRUE,
215 | showWarnings = show.warnings)
216 | file_path <- cache_full_name
217 | } else{
218 | file_path <- tempfile()[[1]]
219 | }
220 |
221 | # Clear cache if specified
222 | if( use.cache && clear.cache && file.exists(file_path) ){
223 | message("clear_cache is TRUE: deleting ", file_path)
224 | unlink(file_path)
225 | }
226 |
227 | # Only get the data if it's not already on disk
228 | if( !file.exists(file_path) || (use.cache && file.exists(file_path)) ){
229 |         # Add authentication details to the request. Note that the header
230 |         # must be passed to GET(), otherwise it has no effect. If there's an
231 |         # error we need to make sure the cache file isn't written - by
232 |         # default it seems it is.
233 |         auth_conf <- list()
234 |         if( !is.null(auth.tok) ){
235 |             auth_conf <- add_headers(
236 |                 Authorization = paste("Bearer", auth.tok, sep = " "))
237 |         }
238 |         res <- GET(
239 |             url, config = auth_conf,
240 |             write_disk(file_path, overwrite = TRUE))
237 | # If the file was not successfully downloaded
238 | if( res$status_code != 200 ){
239 | # Remove the downloaded file
240 | unlink(file_path)
241 | stop(
242 | url, ": ", content(res, ...)$errors[[1]]$detail,
243 | " Could not load the file from database.",
244 | call. = FALSE)
245 | }
246 | }
247 | # Whether to use user-specified read function
248 | if( is.null(read.func) ){
249 | result <- file_path
250 | } else{
251 | result <- read.func(file_path)
252 | }
253 | return(result)
254 | }
255 |
256 | # Get URL addresses of downloadable files that are related to certain accession
257 | # ID.
258 | .mgnify_get_download_urls <- function(
259 | client, accession, type, use.cache = useCache(client),
260 | show.messages = verbose(client), ...){
261 | # Input check
262 | if( !.is_a_bool(use.cache) ){
263 | stop(
264 | "'use.cache' must be a single boolean value.", call. = FALSE)
265 | }
266 | if( !.is_a_bool(show.messages) ){
267 | stop(
268 | "'show.messages' must be a single boolean value.", call. = FALSE)
269 | }
270 | show.messages <- ifelse(show.messages, "text", "none")
271 | #
272 | # Give message about progress
273 | if( show.messages == "text" ){
274 | message("Searching files...")
275 | }
276 |     # Loop through accession IDs and find the info
278 | results <- llply(accession, function(x){
279 | # Get the data as nested json list
280 | download_list <- .mgnify_retrieve_json(
281 | client, paste(type, x, "downloads", sep = "/"),
282 | use.cache = use.cache, ...)
283 | # Convert to df
284 | df <- do.call(rbind.fill, lapply(download_list, function(x){
285 | as.data.frame(x, stringsAsFactors = FALSE)}
286 | ))
287 | # Add info to df
288 | df$accession <- x
289 | df$type <- type
290 | # If no match, df is a list --> convert to data.frame
291 | if( !is.data.frame(df) ){
292 | df <- as.data.frame(df)
293 | } else {
294 | # If search result was found, modify
295 | # For convenience, rename the "self" column to "download_url" -
296 | # which is what it actually is...
297 | colnames(df)[colnames(df) == "self"] <- "download_url"
298 | # Finally, strip off any options from the url - they sometimes seem
299 | # to get format=json stuck on the end
300 | urls <- df$download_url
301 | parameters(urls) <- NULL
302 | df$download_url <- urls
303 | }
304 | return(df)
305 | }, .progress = show.messages)
306 |     # Combine results of multiple accession IDs
307 | results <- do.call(rbind.fill, results)
308 | return(results)
309 | }
310 |
--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # integration with other packages
3 |
4 | .require_package <- function(pkg){
5 | if(!requireNamespace(pkg, quietly = TRUE)){
6 | stop(
7 | "'", pkg,"' package not found. Please install the '", pkg,
8 | "' package to use this function.", call. = FALSE)
9 | }
10 | }
11 |
12 | ################################### TESTING ###################################
13 | # Methods for testing
14 |
15 | .is_a_bool <- function(x){
16 | is.logical(x) && length(x) == 1L && !is.na(x)
17 | }
18 |
19 | .is_non_empty_character <- function(x){
20 | is.character(x) && all(nzchar(x))
21 | }
22 |
23 | .is_non_empty_string <- function(x){
24 |     .is_non_empty_character(x) && length(x) == 1L
25 | }
26 |
27 | .is_an_integer <- function(x){
28 | is.numeric(x) && length(x) == 1L && x%%1==0
29 | }
30 |
31 | ################################ HELP FUNCTIONS ################################
32 | # Help functions that are utilized by multiple methods
33 |
34 | ########################## .mgnify_attr_list_to_df_row #########################
35 | # Not exporting this - if people want to they can use the
36 | # rjsonapi functionality. Internally, it takes the "attributes" list
37 | # and converts it into a single row data.frame. For some entries, there is a
38 | # sublist of key/value pairs. metadata_key allows these to be included as
39 | # columns in the result.
40 | .mgnify_attr_list_to_df_row <- function (json, metadata_key = NULL){
41 | # Get what kind of metadata the data includes
42 | attrlist <- names(json$attributes)
43 | # If the type of metadata is specified
44 | if (!is.null(metadata_key)){
45 | # Get metadata related to specific key
46 | metaattrlist <- json$attributes[[metadata_key]]
47 | metlist <- lapply(metaattrlist, function(x) x$value)
48 | metlist <- unlist(metlist)
49 | names_metlist <- lapply(metaattrlist, function(x) x$key)
50 | names(metlist) <- unlist(names_metlist)
51 | # Get metadata without the key
52 | baseattrlist <- attrlist[!(attrlist %in% c(metadata_key))]
53 | # Combine metadata
54 | df <- as.data.frame(t(unlist(c(
55 | json$attributes[baseattrlist], metlist))),
56 | stringsAsFactors = FALSE)
57 | }else{
58 | # Get all the metadata without key extraction
59 | df <- as.data.frame(t(unlist(json["attributes"])),
60 | stringsAsFactors = FALSE)
61 | }
62 | # Add accession code and type of data
63 | df$accession <- json$id
64 | df$acc_type <- json$type
65 | # Add accession code also to rownames
66 | rownames(df) <- df$accession
67 | return(df)
68 | }
69 |
70 | ############################## .mgnify_get_x_for_y #############################
71 | # Helper function for getting relative paths in the API
72 | # Not everything is implemented here - just what we
73 | # need to get to the download or run areas
74 | # Given an accession x, we want to get the link to get the url for the
75 | # corresponding typeY JSONAPI path for child elements
76 | #
77 | # .mgnify_get_x_for_y determines the location of typeY child objects of x
78 | # (typeX)
79 | #
80 | # This helper function, principally intended to be used internally,
81 | # is used to match up related objects within the path. The inherently
82 | # non-hierarchical nature of the MGnify API makes it a bit inconsistent. This
83 | # function acts as a quick way to determine how to get from one type to
84 | # another, without having to special-case within the code.
85 | #
86 | # Parameters:
87 | # client MGnifyR client API object
88 | # x Accession ID \code{char} of parent object
89 | # typeX Type of accession \code{x}
90 | # typeY Type of child object to return
91 | # use.cache Whether to use on-disk cache
92 | #
93 | # Return:
94 | # char complete url to access the result. Note this query is not run from here -
95 | # just the URL is returned
96 | #
97 | # Examples:
98 | # cl <- new("MgnifyClient")
99 | # .mgnify_get_x_for_y(cl, "MGYS00005126", "studies", "samples")
100 | .mgnify_get_x_for_y <- function(
101 | client, x, typeX, typeY, use.cache, ...){
102 | # Fetch the data on samples/analyses as a json list
103 | res <- .mgnify_retrieve_json(
104 | client,
105 | paste(typeX, x, sep = "/"),
106 | use.cache = use.cache,
107 | ...)
108 |     # If a result was found, extract the link to the related typeY objects
109 |     # (e.g. analyses for a sample, and vice versa).
110 | if( !is.null(res) ){
111 | res <- res[[1]]$relationships[[typeY]]$links$related
112 | }
113 | return(res)
114 | }
115 |
116 | ############################ .mgnify_retrieve_json ############################
117 | # Internal function to actually perform the HTTP request. Builds up the URL,
118 | # then issues a GET, parsing the returned JSON into a nested list (via
119 | # httr::content()). Previously cached results may be retrieved from disk
120 | # without resorting to calling the MGnify server.
121 |
122 | # Low level MGnify API handler
123 | #
124 | # .mgnify_retrieve_json handles the actual HTTP GET calls for the
125 | # MGnifyR package, dealing with API pagination, local result caching, and
126 | # authentication tokens for access to restricted or pre-release datasets.
127 | # Although principally intended for internal MGnifyR use, it may be invoked
128 | # directly. Generally though, it's not recommended for use by end users.
129 | #
130 | # Parameters:
131 | # client MGnifyR client
132 | # path top level search point for the query. One of biomes, samples, runs etc.
133 | # Basically includes all parts of the URL between the base API url and the
134 | # parameter specifications
135 | # complete_url complete url to search, usually retrieved from previous query in
136 | # the "related" section.
137 | # qopts named list or vector containing options/filters to be URL encoded and
138 | # appended to query as key/value pairs
139 | # max.hits Maximum number of data entries to return. The actual number of hits
140 | # returned may be higher, as this parameter only clamps after each full page is
141 | # processed. Set to NULL or -1 to disable - i.e. retrieve all items.
142 | # use.cache Should successful queries be cached on disk locally? There are
143 | # unresolved questions about whether this is a sensible thing to do, but it
144 | # remains as an option. It probably makes sense for single accession grabs,
145 | # but not for (filtered) queries - which are liable to change as new data is
146 | # added to MGnify. The fully paginated result is what gets cached.
147 | # Debug Should verbose information be printed while performing the requests?
148 | # timeout How long (in seconds) to wait for the server to respond?
149 | #
150 | # Return:
151 | # list of results after pagination is dealt with.
152 |
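# Examples (illustrative sketch only, not run; the accession is the same one
# used in the .mgnify_get_x_for_y example above):
# cl <- MgnifyClient()
# res <- .mgnify_retrieve_json(cl, path = "studies/MGYS00005126", max.hits = 10)
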
153 | #' @importFrom urltools parameters parameters<-
154 | #' @importFrom httr add_headers
155 | #' @importFrom httr GET
156 | #' @importFrom httr config
157 | #' @importFrom httr content
158 | #' @importFrom httr timeout
159 | .mgnify_retrieve_json <- function(
160 | client, path = "biomes", complete_url = NULL, qopts = NULL,
161 | max.hits = 200, timeout = 5*60, Debug = FALSE,
162 | use.cache = useCache(client), show.warnings = showWarnings(client),
163 | clear.cache = clearCache(client), url.address = databaseUrl(client),
164 | auth.tok = authTok(client), cache.dir = cacheDir(client), ...){
165 | # Input check
166 | if( !.is_an_integer(timeout) ){
167 | stop(
168 | "'timeout' must be a single integer value.", call. = FALSE)
169 | }
170 | if( !.is_a_bool(Debug) ){
171 | stop(
172 | "'Debug' must be a single boolean value.", call. = FALSE)
173 | }
174 | if( !.is_a_bool(use.cache) ){
175 | stop(
176 | "'use.cache' must be a single boolean value specifying whether ",
177 | "to use on-disk caching.", call. = FALSE)
178 | }
179 | if( !.is_a_bool(show.warnings) ){
180 | stop(
181 | "'show.warnings' must be a single boolean value.", call. = FALSE)
182 | }
183 | if( !.is_a_bool(clear.cache) ){
184 | stop(
185 | "'clear.cache' must be a single boolean value.", call. = FALSE)
186 | }
187 | if( !.is_non_empty_string(url.address) ){
188 | stop(
189 | "'url.address' must be a string.", call. = FALSE)
190 | }
191 | if( !(.is_non_empty_string(auth.tok) || is.null(auth.tok)) ){
192 | stop(
193 | "'auth.tok' must be a string or NULL.", call. = FALSE)
194 | }
195 | #
196 | if( !.is_non_empty_string(cache.dir) ){
197 | stop(
198 | "'cache.dir' must be a string.", call. = FALSE)
199 | }
200 | #
201 | # Warning message if data is not found
202 | warning_msg <- paste0(path, ": No data found.")
203 |     # showWarnings(client) turns on debugging too:
204 | if( show.warnings ){
205 | Debug <- TRUE
206 | }
207 | # Set up the base url
208 | # Are we using internal paths?
209 | if( is.null(complete_url) ){
210 | fullurl <- paste(url.address, path, sep = "/")
211 | } else{
212 | # Or direct links from e.g. a "related" section
213 | # Set the full url, but clean off any existing parameters
214 | # (page, format etc) as they'll be added back later:
215 | fullurl <- complete_url
216 | parameters(fullurl) <- NULL
217 | path <- substr(fullurl, nchar(url.address) + 2, nchar(fullurl))
218 | }
219 | # Spaces are not allowed in url address. Convert spaces to %20.
220 | fullurl <- gsub(" ", "%20", fullurl)
221 |
222 |     # Convert list filters to comma-separated strings.
223 |     # This doesn't check if they can be searched for in the API,
224 |     # which is an issue since no error is returned by the API if the search
225 |     # is invalid - we only get a result as if no query was present...
226 | tmpqopts <- lapply(qopts, function(x) paste(x, collapse = ","))
227 |
228 | # Include the json and page position options
229 | # full_qopts <- as.list(c(format="json", tmpqopts, page=1))
230 | full_qopts <- as.list(c(format="json", tmpqopts))
231 |
232 | # Build up the cache name anyway - even if it's not ultimately used:
233 | fname_list <- c(path, names(unlist(full_qopts)), unlist(full_qopts))
234 | cache_fname <- paste(fname_list, collapse = "_")
235 |     # Because query options are collapsed into the file name, they might
236 |     # include colons that are not supported in file names. Replace them with
237 |     # underscores.
238 | cache_fname <- gsub(":", "_", cache_fname)
239 | cache_full_fname <- file.path(cache.dir, paste0(cache_fname, ".RDS"))
240 |
241 | # Quick check to see if we should clear the disk cache for this
242 | # specific call - used for debugging and when MGnify breaks
243 | if( use.cache && clear.cache ){
244 | if( file.exists(cache_full_fname) ){
245 | message("clearCache is TRUE: deleting ", cache_full_fname)
246 | unlink(cache_full_fname)
247 | }
248 | }
249 |
250 | # Do we want to try and use a cache to speed things up?
251 | if( use.cache && file.exists(cache_full_fname) ){
252 | final_data <- readRDS(cache_full_fname)
253 | } else{
254 |         # Authorization: Bearer. The header must be passed to GET();
255 |         # calling add_headers() on its own has no effect.
256 |         auth_config <- if( !is.null(auth.tok) ){
257 |             add_headers(.headers = c(Authorization = paste(
258 |                 "Bearer", auth.tok, sep = " ")))
259 |         } else{ config() }
260 |         res <- GET(
261 |             url = fullurl, config(verbose = Debug), query = full_qopts,
262 |             timeout(timeout), auth_config)
263 | # Get the data
264 | data <- content(res, ...)
265 |
266 | # Check if the search was successful and data can be found
267 | not_found <- (res$status_code != 200) || (
268 | is.null(data$data) || length(data$data) == 0)
269 | # If data is found
270 | if( !not_found ){
271 | # Fetch all the data
272 | final_data <- .retrieve_json_data(
273 | client, data, fullurl, full_qopts, max.hits, Debug
274 | )
275 | } else{
276 | final_data <- NULL
277 | if( res$status_code != 200 ){
278 | warning_msg <- paste0(
279 | path, " (", res$status_code, " error): ",
280 | data$errors[[1]]$detail)
281 | }
282 | }
283 | # Save the result to file if specified
284 | if( use.cache && !file.exists(cache_full_fname) ){
285 | # Make sure the directory is created...
286 | dir.create(
287 | dirname(cache_full_fname), recursive = TRUE,
288 | showWarnings = show.warnings)
289 | saveRDS(final_data, file = cache_full_fname)
290 | }
291 | }
292 | # Give warning if data is not found.
293 | if( is.null(final_data) ){
294 | warning("\n", warning_msg, call. = FALSE)
295 | }
296 | return(final_data)
297 | }
298 |
299 | # This retrieves all the data related to an accession. For example, it loops
300 | # over multiple pages.
301 | .retrieve_json_data <- function(
302 | client, data, fullurl, full_qopts, max.hits, Debug,
303 | auth.tok = authTok(client), ...){
304 | # Input check
305 | if( !(.is_non_empty_string(auth.tok) || is.null(auth.tok)) ){
306 | stop(
307 | "'auth.tok' must be a string or NULL.", call. = FALSE)
308 | }
309 | #
310 |     # At this point, data$data is either a list of lists or a single named
311 |     # list. If it's a single entry, it needs embedding in a list for
312 |     # consistency downstream. datlist is built up as a list of pages, where
313 |     # each entry must itself be a list, so a single entry is wrapped before
314 |     # being stored as the first page.
315 | datlist <- list()
316 | if( !is.null(names(data$data)) ){
317 | # Create something to store the returned data
318 | datlist[[1]] <- list(data$data)
319 | }else{
320 | datlist[[1]] <- data$data
321 | }
322 | # Check to see if there's pagination required
323 | if( "meta" %in% names(data) ){
324 | # Yes, paginate
325 | pstart <- as.numeric(data$meta$pagination$page)
326 | pend <- as.numeric(data$meta$pagination$pages)
327 | # We've already got the first one
328 | if( pend > 1 ){
329 | # Loop over pages and save their result to list
330 | for( p in seq(pstart+1,pend) ){
331 | full_qopts$page <- p
332 |                 # The Authorization header must be passed to GET() itself
333 |                 auth_config <- if( !is.null(auth.tok) ){
334 |                     add_headers(.headers = c(Authorization = paste(
335 |                         "Bearer", auth.tok, sep = " ")))
336 |                 } else{ config() }
337 |                 curd <- content(
338 |                     GET(fullurl, config(verbose = Debug), query = full_qopts,
339 |                         auth_config),
340 |                     ...)
341 | datlist[[p]] <- curd$data
342 | # Check to see if we've pulled enough entries.
343 | # With NULL and -1, disable max.hits
344 | curlen <- sum(unlist(lapply(datlist, length)))
345 | if( !is.null(max.hits) && curlen >= max.hits &&
346 | max.hits != -1 ){
347 | break
348 | }
349 | }
350 | }
351 | }
352 | # Combine results from different pages
353 | final_data <- unlist(datlist, recursive = FALSE)
354 | return(final_data)
355 | }
356 |
357 | # Internal functions to parse the attributes/hierarchy list into a data.frame
358 | .mgnify_parse_tax <- function(json){
359 | df <- as.data.frame(
360 | c(json$attributes["count"], unlist(json$attributes$hierarchy)),
361 | stringsAsFactors = FALSE)
362 | df$index_id <- json$attributes$lineage
363 | df
364 |
365 | }
366 | .mgnify_parse_func <- function(json){
367 | df <- as.data.frame(json$attributes, stringsAsFactors = FALSE)
368 | df$index_id <- json$attributes$accession
369 | df
370 | }
371 |
--------------------------------------------------------------------------------
/vignettes/MGnifyR_long.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "MGnifyR, extended vignette"
3 | date: "`r Sys.Date()`"
4 | package: MGnifyR
5 | output:
6 | BiocStyle::html_document:
7 | fig_height: 7
8 | fig_width: 10
9 | toc: yes
10 | toc_depth: 2
11 | number_sections: true
12 | vignette: >
13 | %\VignetteIndexEntry{MGnifyR, extended vignette}
14 | %\VignetteEngine{knitr::rmarkdown}
15 | %\VignetteEncoding{UTF-8}
16 | bibliography: references.bib
17 | ---
18 |
19 | ```{r include = FALSE}
20 | library(knitr)
21 | knitr::opts_chunk$set(
22 | collapse = TRUE,
23 | comment = "#>",
24 | eval = FALSE,
25 | cache = TRUE
26 | )
27 | ```
28 |
29 | [MGnifyR homepage](http://github.com/EBI-Metagenomics/MGnifyR)
30 |
31 | # Introduction
32 |
33 | `MGnifyR` is a package designed to ease access to the EBI's
34 | [MGnify](https://www.ebi.ac.uk/metagenomics) resource, allowing searching and
35 | retrieval of multiple datasets for downstream analysis. While MGnify pipelines
36 | are undoubtedly useful, as currently implemented they produce results on a
37 | strictly per-sample basis. While some whole study results are available,
38 | comparisons across studies are difficult. The `MGnifyR` package is designed to
39 | facilitate cross-study analyses by handling all the per-sample data retrieval
40 | and merging details internally, leaving the user free to perform the analysis
41 | as they see fit.
42 |
43 | The latest version of MGnifyR seamlessly integrates with the
44 | [miaverse framework](https://microbiome.github.io/), providing access to
45 | tools for downstream microbiome analytics. This integration
46 | enables users to leverage optimized and standardized methods for analyzing
47 | the microbiome. Additionally, users can benefit from a comprehensive tutorial
48 | book that offers valuable guidance and support.
49 |
50 | # Installation
51 |
52 | `MGnifyR` is hosted on GitHub and distributed through Bioconductor; it can
53 | be installed with `BiocManager` using the following snippet.
54 |
55 | ```{r install, eval=FALSE}
56 | BiocManager::install("MGnifyR")
57 | ```
58 |
59 | # Load `MGnifyR` package
60 |
61 | Once installed, `MGnifyR` is made available in the usual way.
62 |
63 | ```{r load_package}
64 | library(MGnifyR)
65 | ```
66 |
67 | # Create a client
68 |
69 | All functions in `MGnifyR` make use of a `MgnifyClient` object to keep track
70 | of the JSONAPI url, disk cache location and user access tokens. Thus the first
71 | thing to do when starting any analysis is to instantiate this object. The
72 | following snippet creates this.
73 |
74 | ```{r create_client}
75 | mg <- MgnifyClient()
76 | mg
77 | ```
78 |
79 | It's recommended that local caching is enabled with `useCache = TRUE`. Queries
80 | to the MGnify API can be quite slow, particularly when retrieving multipage
81 | results for many analyses (such as many `Interpro` results). Using a local
82 | disk cache can significantly speed up subsequent work, bypassing the need to
83 | re-query the API. Use of the cache should be entirely transparent, as the
84 | caching occurs at the raw data level. The cache can persist across `MGnifyR`
85 | sessions, and can even be used for multiple sessions simultaneously - provided
86 | that different sets of accessions are queried at once.
87 |
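As a minimal sketch (assuming the `useCache` and `cacheDir` arguments of
`MgnifyClient()` behave as described above; the directory shown is only an
example), a cache-enabled client might be created like this:

```{r create_client_cache, eval=FALSE}
# Illustrative only: any writable directory can serve as the cache location
mg_cached <- MgnifyClient(useCache = TRUE, cacheDir = "~/.MGnify_cache")
mg_cached
```
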
88 | Optionally, a username and password may be specified during client creation,
89 | causing `MGnifyR` to attempt retrieval of an authentication token from the API.
90 | Doing so gives access to non-public results, such as those currently under an
91 | author-imposed embargo period.
92 |
93 | ```{r create_client_passwd, eval=FALSE}
94 | mg <- MgnifyClient(
95 | username = "Webin-username", password = "your-password", useCache = TRUE)
96 | ```
97 |
98 | # Functions for fetching the data
99 |
100 | ## Search data
101 |
102 | `MGnifyR` gives users access to the complete range of search functionality
103 | implemented in the MGnify JSON API. A single function, `doQuery()`, is used to
104 | perform this searching, allowing studies, samples, runs, analyses and
105 | assemblies to be interrogated from a common interface. As with all MGnifyR
106 | functions, the first argument `client` must be a valid `MgnifyClient` instance.
107 | The only remaining **required** parameter is `qtype`, specifying the type of
108 | data to be queried, which may be one of `studies`, `samples`, `runs`,
109 | `analyses` or `assemblies`. Other general parameters include `max.hits`.
110 |
111 | Unlike most other `MGnifyR` high level functions, caching is turned off by
112 | default for `doQuery()`. New data and analyses are being added to MGnify all the
113 | time, so enabling caching by default may lead to out-of-date search results for
114 | long-lived sessions. However, it's easy to switch back on, and may be useful in
115 | many cases. Also, given the huge and ever increasing number of datasets
116 | available in MGnify, a limit to the number of results returned may be set
117 | using `max.hits`. By default this is set to 200, which for most exploratory
118 | queries should be sufficient. It may be increased or decreased by directly
119 | specifying `max.hits`, and disabled completely (no limit) by setting
120 | `max.hits=NULL`.
121 |
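As an illustrative sketch only (the `biome_name` filter value is arbitrary and
the full result set may be large), the limit could be removed like this:

```{r query_no_limit, eval=FALSE}
# Retrieve every matching study - potentially slow for broad queries
all_soil_studies <- doQuery(mg, "studies", biome_name = "Soil", max.hits = NULL)
```
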
122 | In most cases we will want to be more specific about the search, and will
123 | also use either an `accession` parameter or one of the many filter options
124 | available through the API, discussed below. Specifying an `accession` id,
125 | which in the case of `samples`, `runs` and `assemblies` may be a vector of
126 | ids, returns a data.frame of metadata with one row per matching accession.
127 |
128 | If `accession` is `NULL` (the default) then remaining parameters define the
129 | filters applied by the API to the search result. Details of these parameters
130 | are given in `help(doQuery)`. By way of example though, supposing we are
131 | interested in amplicon Illumina soil samples from the Arctic, we might try the
132 | following query:
133 |
134 | ```{r search_studies}
135 | northpolar <- doQuery(
136 | mg, "samples", latitude_gte=60.0, experiment_type="amplicon",
137 | biome_name="Soil", instrument_platform = "Illumina", max.hits = 10)
138 |
139 | head(northpolar)
140 | ```
141 |
142 | Specifying an `accession` parameter will restrict results to just those matching
143 | that particular entry, be it a study, sample or run. For example, to retrieve
144 | information for study "MGYS00002891":
145 |
146 | ```{r search_studies_accession}
147 | study_samples <- doQuery(mg, "studies", accession="MGYS00002891")
148 |
149 | head(study_samples)
150 | ```
151 |
152 | ## Find relevant **analyses** accessions
153 |
154 | Having obtained a particular set of search hits, it's now time to retrieve the
155 | associated results. General automated analysis is complicated by the MGnify
156 | database design, wherein for example samples may be shared between multiple
157 | studies, or studies analysed multiple times using different versions of the
158 | pipeline. Navigating these "many-to-one" relationships can be tricky, so
159 | `MGnifyR` resorts to using `analyses` accessions as its canonical identifier.
160 | Each analysis corresponds to a single run of a particular pipeline on a single
161 | sample in a single study. The downside of this approach is that queries
162 | returning `studies`, `samples` (or anything other than `analyses`) accessions
163 | need converting to the corresponding `analyses`.
164 |
165 | `MGnifyR` therefore provides a helper function to handle this conversion -
166 | `searchAnalysis()`. Following on from our previous search, we have a
167 | list of `study` accessions, so to convert to corresponding `analyses` we use:
168 |
169 | ```{r convert_to_analyses}
170 | analyses_accessions <- searchAnalysis(
171 | mg, type="studies", accession = study_samples$accession)
172 |
173 | head(analyses_accessions)
174 | ```
175 |
176 | A useful side effect of the above call is that some attribute metadata for
177 | each sample has now been retrieved and stored in the local cache. Thus
178 | subsequent API calls for these samples (which will occur multiple times in
179 | later steps) will be significantly faster.
180 |
181 | It's important to be aware that the results of a `searchAnalysis()` command will
182 | not necessarily be a one-to-one match with the input accessions. `MGnify`
183 | analysis runs are sometimes performed multiple times, perhaps using different
184 | versions of the pipeline. Thus further filtering of the result list may be
185 | required, but is easily performed and is illustrated in the next section.
186 |
187 | ## Fetch metadata
188 |
189 | At this point we have a long list of analysis instances (with potential
190 | duplicates) corresponding to the samples previously found. We use the
191 | `getMetadata` function to download and combine all associated `sample`, `run`
192 | and `study` metadata, which we then filter as required to include only the
193 | rows we want.
194 |
195 | ```{r get_metadata}
196 | analyses_metadata <- getMetadata(mg, analyses_accessions)
197 |
198 | head(analyses_metadata)
199 | ```
200 |
201 | The resulting data.frame has columns with names prefixed with their source
202 | type. For example, "sample_xxx" columns correspond to metadata gleaned from
203 | querying an accession's `sample` entry. MGnify allows quite flexible
204 | specification of arbitrary metadata at submission time, in many cases leading
205 | to quite sparse `data.frame` results if accession queries are sourced from more
206 | than one study. For instance, if only one sample contains an entry for
207 | "sample_soil_PH", entries for other rows will be filled with `NA`. `MGnifyR`
208 | does not automatically clean these missing values - instead opting to allow
209 | the user to choose the correct action. The particular study we're looking at
210 | is from the marine biome; suppose we are interested in only those samples or
211 | analyses for which the sampling depth was known. The following snippet filters
212 | the full `data.frame` selecting only entries which contain a valid
213 | `sample_depth`. It's worth noting the `as.numeric` call to ensure the column
214 | is converted to `numeric` type before it is checked. *All* sample data from
215 | MGnifyR is initially retrieved as type `character`, and it's up to the user to
216 | make sure ostensibly numeric entries are converted properly.
217 |
218 | ```{r filter_metadata}
219 | known_depths <- analyses_metadata[
220 | !is.na(as.numeric(analyses_metadata$sample_depth)), ]
221 | # How many are left?
222 | dim(known_depths)
223 | ```
224 |
225 | ## Fetch microbiome data
226 |
227 | Having selected the analyses we wish to examine further, `getResult()` is used
228 | both to download associated OTU tables and taxonomy, and to join all results
229 | into a single `r BiocStyle::Biocpkg("TreeSummarizedExperiment")` (`TreeSE`)
230 | object. TreeSE is becoming a de facto standard for taxonomic abundance *munging*
231 | in R. `TreeSE` objects integrate abundance, taxonomic, phylogenetic, sample and
232 | sequence data into a single object, with powerful facilities for filtering,
233 | processing and plotting the results. Compared to the
234 | `r BiocStyle::Biocpkg("phyloseq")` object, `TreeSE` is more scalable and better
235 | suited to efficient data analysis.
236 |
237 | The `miaverse` framework is developed around the `TreeSE` data container. It
238 | provides tools for analysis and visualization. Moreover, it includes a
239 | comprehensive tutorial book called [OMA](https://microbiome.github.io/OMA/).
240 |
241 | ### Amplicon sequencing
242 |
243 | When the dataset includes amplicon sequencing data, i.e., the dataset does not
244 | include functional predictions, the `getResult()` method returns the dataset as
245 | a `TreeSE` by default. See the function documentation for other output types.
246 |
247 | ```{r get_treese}
248 | tse <- getResult(mg, accession = analyses_accessions, get.func = FALSE)
249 |
250 | tse
251 | ```
252 |
253 | The `TreeSE` object is uniquely positioned to support
254 | `r BiocStyle::Biocpkg("SummarizedExperiment")`-based
255 | microbiome data manipulation and visualization. Moreover, it enables access
256 | to `miaverse` tools. For example, we can estimate the diversity of samples.
257 |
258 | ```{r calculate_diversity}
259 | library(mia)
260 |
261 | tse <- estimateDiversity(tse, index = "shannon")
262 |
263 | library(scater)
264 |
265 | plotColData(tse, "shannon", x = "sample_geo.loc.name")
266 | ```
267 |
268 | ```{r plot_abundance}
269 | library(miaViz)
270 |
271 | plotAbundance(
272 | tse[!is.na( rowData(tse)[["Kingdom"]] ), ],
273 | rank = "Kingdom",
274 | as.relative = TRUE
275 | )
276 | ```
277 |
278 | If needed, `TreeSE` can be converted to `phyloseq`.
279 |
280 | ```{r to_phyloseq}
281 | pseq <- makePhyloseqFromTreeSE(tse)
282 | pseq
283 | ```
284 |
285 | ### Metagenomics
286 |
287 | Although the previous queries have been based on the results from `doQuery()`,
288 | from now on we will concentrate on combining and comparing results from
289 | specific studies. Since newly performed analyses are retrieved first in the
290 | `doQuery()` call, it's likely that by the time this vignette is read, the query
291 | results will be different. This is principally due to the rapid increase in
292 | MGnify submissions, leading to a potential lack of consistency between even
293 | closely spaced queries. As mentioned previously, it may be best to use
294 | `useCache=FALSE` in the `MgnifyClient` object for `doQuery()` calls, to ensure
295 | queries are actually returning the latest data.
296 |
297 | For the remainder of this vignette however, we'll be comparing 3 ostensibly
298 | different studies: a study of saltmarsh soils from York University, human
299 | faecal samples from a survey of healthy Sardinians, and a set of samples from
300 | hydrothermal vents in the Mid-Cayman Rise in the Caribbean Sea. To simplify
301 | things, only the first 20 samples from each study will be used. Furthermore,
302 | the intention is only to demonstrate the functionality of the MGnifyR package,
303 | rather than produce scientifically rigorous results.
304 |
305 | ```{r get_analyses}
306 | soil <- searchAnalysis(mg, "studies", "MGYS00001447")
307 | human <- searchAnalysis(mg, "studies", "MGYS00001442")
308 | marine <- searchAnalysis(mg, "studies", "MGYS00001282")
309 |
310 | # Combine analyses
311 | all_accessions <- c(soil, human, marine)
312 |
313 | head(all_accessions)
314 | ```
315 |
316 | The first step with this new accession list is, as previously, to retrieve the
317 | associated metadata using `getMetadata()`, and as seen with the
318 | `doQuery()` results, the returned `data.frame` contains a large number of
319 | columns. Being autogenerated and flexible, the column names can be a little
320 | difficult to predict, but examining `colnames(full_metadata)` should make
321 | things clearer.
322 |
323 | ```{r get_new_metadata}
324 | full_metadata <- getMetadata(mg, all_accessions)
325 |
326 | colnames(full_metadata)
327 | head(full_metadata)
328 | ```
329 |
330 | From `full_metadata` we get an idea of the type of data we're dealing with,
331 | and can extract useful information such as sequencing platform, source biome,
332 | etc. The next code snippet tallies a few of these columns to give an idea about
333 | what's available. The boxplot also indicates that while within-study read
334 | counts are similar, we probably need to use some sort of normalization
335 | procedure when comparing across samples. We might also want to drop
336 | particularly low read coverage samples from further analysis (sketched below).
337 |
338 | ```{r full_metatdata_explore}
339 | # Load ggplot2
340 | library(ggplot2)
341 |
342 | # Distribution of sample source material:
343 | table(full_metadata$`sample_environment-material`)
344 |
345 | # What sequencing machine(s) were used?
346 | table(full_metadata$`sample_instrument model`)
347 |
348 | # Boxplot of raw read counts:
349 | ggplot(
350 | full_metadata, aes(x=study_accession, y=log(
351 | as.numeric(`analysis_Submitted nucleotide sequences`)))) +
352 | geom_boxplot(aes(group=study_accession)) +
353 | theme_bw() +
354 | ylab("log(submitted reads)")
355 | ```
356 |
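As an illustrative sketch only (the read-count threshold is arbitrary, and the
column name is the one used in the plot above), low-coverage analyses could be
dropped from the metadata before fetching results:

```{r drop_low_coverage, eval=FALSE}
# Keep only analyses with at least 100,000 submitted reads (arbitrary cut-off)
reads <- as.numeric(full_metadata$`analysis_Submitted nucleotide sequences`)
full_metadata <- full_metadata[!is.na(reads) & reads >= 1e5, ]
```
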
357 | Again, we can fetch the data by calling `getResult()`. `bulk.dl=TRUE` has the
358 | potential to significantly speed up data retrieval. MGnify makes its
359 | functional results available in two separate ways: either on a per-analysis
360 | basis through the web API, or at the whole-study level as large tab-separated
361 | (TSV) files, with columns representing the results for each
362 | analysis. When `bulk.dl` is `FALSE`, `MGnifyR` queries the web API to get
363 | results which (given some functional analyses results may consist of
364 | thousands of entries) may take significant time. Setting `bulk.dl` to
365 | `TRUE` causes `MGnifyR` to determine the source study associated with a
366 | particular `analysis` and to instead download and parse its corresponding
367 | results file. Since this result file contains entries for all analyses
368 | associated with the study, by taking advantage of `MGnifyR`'s local caching
369 | this single download provides results for many future analyses. In some cases
370 | this affords several orders of magnitude speedup over the api query case.
371 |
372 | Unfortunately, column entries in the per-study results files do not always
373 | directly correspond to those from a particular analysis run, causing the
374 | retrieval to fail. The principal cause of this is believed to be the running
375 | of multiple analyses jobs on the same sample. Thus for reliability, `bulk.dl`
376 | is `FALSE` by default. As a general recommendation though, you should try
377 | setting it to `TRUE` the first time `getResult()` is used on a
378 | set of accessions. If this fails, setting `bulk.dl` to `FALSE` will enable the
379 | more robust approach, allowing the analysis to continue. It might take a while
380 | though. Hopefully in the future the sample/analysis correspondence mismatches
381 | will be fixed and the default `bulk.dl` will be switched to `TRUE`.
382 |
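As a minimal, illustrative sketch of that try-fast-then-fall-back pattern (not
part of the package itself), the call could be wrapped in `tryCatch()`:

```{r get_result_fallback, eval=FALSE}
# Try the fast bulk download first; fall back to the slower but more robust
# per-analysis API queries if it fails.
mae <- tryCatch(
    getResult(mg, all_accessions, bulk.dl = TRUE),
    error = function(e) getResult(mg, all_accessions, bulk.dl = FALSE)
)
```
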
383 | ```{r get_mae}
384 | mae <- getResult(mg, all_accessions, bulk.dl = TRUE)
385 |
386 | mae
387 | ```
388 |
389 | For metagenomic samples, the result is a
390 | `r BiocStyle::Biocpkg("MultiAssayExperiment")` (`MAE`), which
391 | links multiple `TreeSE` objects into one dataset. These `TreeSE` objects hold
392 | the taxonomic profiling data and the functional data as separate experiments.
393 | The experiments are linked to each other by their sample names. You can access
394 | an individual object or experiment by specifying its index or name.
395 |
396 | ```{r mae_access}
397 | mae[[2]]
398 | ```
399 |
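To see which experiments are available and access one by name instead (an
illustrative sketch; the experiment names depend on the retrieved data):

```{r mae_names, eval=FALSE}
# List the experiment names, then access the first experiment by name
names(mae)
mae[[ names(mae)[[1]] ]]
```
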
400 | We can perform principal coordinate analysis (PCoA) on the microbial profiling
401 | data by utilizing miaverse tools.
402 |
403 | ```{r pcoa}
404 | # Apply relative transformation
405 | mae[[1]] <- transformAssay(mae[[1]], method = "relabundance")
406 | # Perform PCoA
407 | mae[[1]] <- runMDS(
408 | mae[[1]], assay.type = "relabundance",
409 | FUN = vegan::vegdist, method = "bray")
410 | # Plot
411 | plotReducedDim(mae[[1]], "MDS", colour_by = "sample_environment.feature")
412 | ```
413 |
414 | ## Fetch raw files
415 |
416 | While `getResult()` can be utilized to retrieve microbial profiling data,
417 | `getData()` can be used more flexibly to retrieve any kind of data from the
418 | database. It returns the data in a simple data.frame or list format.
419 |
420 | ```{r fetch_data}
421 | kegg <- getData(
422 | mg, type = "kegg-modules", accession = "MGYA00642773",
423 | accession.type = "analyses")
424 |
425 | head(kegg)
426 | ```
427 |
428 | ## Fetch sequence files
429 |
430 | Finally, we can use `searchFile()` and `getFile()` to retrieve other MGnify
431 | pipeline outputs such as merged sequence reads, assembled contigs, and details
432 | of the functional analyses. `searchFile()` is a simple wrapper function
433 | which, when supplied a list of accessions, finds the urls of the files we're
434 | after. In most cases we'll want to filter the returned list down to only the
435 | files of interest, which is easily done on the resulting data.frame object.
436 | In addition to the actual download location (the `download_url` column),
437 | extra columns include file type, contents and compression. It's recommended
438 | that the `colnames` of the `data.frame` be examined to get a grasp on the
439 | available metadata. To demonstrate the process, the code below retrieves
440 | a data.frame containing all available downloads for each accession we've been
441 | examining previously. It then filters this to retain only the files
442 | corresponding to the annotated amino acid sequences.
443 |
444 | ```{r get_download_urls}
445 | # Find list of available downloads
446 | dl_urls <- searchFile(
447 | mg, full_metadata$analysis_accession, type = "analyses")
448 |
449 | # Filter table
450 | target_urls <- dl_urls[
451 | dl_urls$attributes.description.label == "Predicted CDS with annotation", ]
452 |
453 | head(target_urls)
454 | ```
455 |
456 | To list the types of available files, and guide the filtering, something like
457 | the following might be useful.
458 |
459 | ```{r list_descriptions}
460 | table(dl_urls$attributes.description.label)
461 | ```
462 |
463 | Unlike other `MGnifyR` functions, `searchFile()` is not limited to
464 | `analyses`, and by specifying `type` other result types may be
465 | found. For instance, while general `genome` functionality is not yet
466 | integrated into `MGnifyR`, we can retrieve associated files for a particular
467 | `genome` accession with the following:
468 |
469 | ```{r get_genome_urls}
470 | genome_urls <- searchFile(mg, "MGYG000433953", type = "genomes")
471 |
472 | genome_urls[ , c("id", "attributes.file.format.name", "download_url")]
473 | ```
474 |
475 | Having found a set of target urls, the final step is to use
476 | `getFile()` to actually retrieve the file. Unlike other functions, this only
477 | works with a single url location at once, so each entry in `target_urls` from
478 | above must be downloaded individually - easily done by either looping or
479 | `apply`ing over the list.
480 |
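For example, a minimal sketch of looping over all of the filtered URLs (every
file is written to the cache, so this may download a large amount of data):

```{r get_files_loop, eval=FALSE}
# Illustrative only: download every filtered file and collect the local paths
local_paths <- vapply(
    target_urls$download_url, function(url) getFile(mg, url),
    FUN.VALUE = character(1))
```
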
481 | If the files are intended to be used with external programs, it might be
482 | easiest to provide a `file` parameter to the function call, which specifies
483 | a local filename for writing the file. By default `MGnifyR` will use the local
484 | cache, which can make getting to the file afterwards more awkward. Regardless,
485 | the default behaviour of `getFile()` is to retrieve the file specified in the
486 | parameter `url`, save it to disk, and return the filepath it was saved to.
487 |
488 | ```{r get_files}
489 | # Just select a single file from the target_urls list for demonstration.
490 |
491 | # Default behavior - use local cache.
492 | cached_location1 <- getFile(mg, target_urls$download_url[[1]])
493 |
494 | # Specifying a local filename via the `file` parameter described above
495 | cached_location2 <- getFile(
496 |     mg, target_urls$download_url[[1]], file = "annotated_cds.fasta.gz")
497 |
498 | cached_location <- c(cached_location1, cached_location2)
499 |
500 | # Where are the files?
501 | cached_location
502 | ```
503 |
504 | A second download option is available, which allows built-in parsing of the
505 | file. If we know ahead of time what processing will be performed, it may be
506 | possible to integrate it into a function and pass this function to
507 | `getFile()` as the `read.func` argument. The function in question should
508 | take a single argument (the complete path name of the locally downloaded file)
509 | and the result of the call will be returned in place of the usual output
510 | file name.
511 |
512 | Alternatively the files could first be downloaded in the standard way, and
513 | then processed using this same function in a loop. Therefore in many cases
514 | the `read.func` parameter is redundant. However, many of the outputs from
515 | MGnify can be quite large, meaning local storage of many files may become an
516 | issue. By providing a `read.func` parameter (and necessarily setting
517 | `useCache=FALSE` in the `MgnifyClient` object), analysis of a large number of
518 | datasets may be possible with minimal storage requirements.
519 |
520 | To illustrate, suppose we were interested in retrieving all detected sequences
521 | matching a particular PFAM motif in a set of analyses. The simple function
522 | below uses the `Biostrings` package to read an amino acid fasta file, searches
523 | for a matching PFAM tag in the sequence name, and then tallies up the unique
524 | sequences into a single data.frame row. In this case the PFAM motif identifies
525 | sequences coding for the amoC gene, found in both ammonia- and
526 | methane-oxidizing organisms, but any other filtering method could be used.
527 |
528 | ```{r simple_parse_function}
529 | library(Biostrings)
530 |
531 | # Simple function to count unique sequences matching the PFAM amoC/mmoC motif
532 | getAmoCseqs <- function(fname){
533 | sequences <- readAAStringSet(fname)
534 | tgtvec <- grepl("PF04896", names(sequences))
535 | as.data.frame(as.list(table(as.character(sequences[tgtvec]))))
536 | }
537 | ```
538 |
539 | Having defined the function, it just remains to include it in the call to
540 | `getFile()`.
541 |
542 | ```{r download_with_read}
543 | # Just download a single accession for demonstration, specifying a read function
544 | amoC_seq_counts <- getFile(
545 |     mg, target_urls$download_url[[1]], read.func = getAmoCseqs)
546 |
547 | amoC_seq_counts
548 | ```
549 |
550 | ```{r session_info}
551 | sessionInfo()
552 | ```
553 |
--------------------------------------------------------------------------------