├── _pkgdown.yml ├── tests ├── testthat.R └── testthat │ ├── test-getMetadata.R │ ├── test-searchAnalysis.R │ ├── test-getData.R │ ├── test-doQuery.R │ ├── test-MgnifyClient.R │ ├── test-getResult.R │ └── test-getFile.R ├── man ├── figures │ ├── mgnify_logo.jpg │ ├── mgnifyr_logo.png │ └── findingpheno_logo.png ├── MGnifyR-package.Rd ├── getMetadata.Rd ├── searchAnalysis.Rd ├── MgnifyClient-accessors.Rd ├── getData.Rd ├── deprecate.Rd ├── MgnifyClient.Rd ├── getFile.Rd ├── doQuery.Rd └── getResult.Rd ├── inst ├── extdata │ └── vignette_MGnifyR.rds └── extras │ ├── TODO │ └── demo_code.R ├── .Rbuildignore ├── .gitignore ├── NEWS ├── R ├── MGnifyR.R ├── AllClasses.R ├── AllGenerics.R ├── AllAccessors.R ├── deprecate.R ├── getMetadata.R ├── MgnifyClient.R ├── getData.R ├── searchAnalysis.R ├── doQuery.R ├── getFile.R └── utils.R ├── .github └── workflows │ └── rworkflows.yml ├── vignettes ├── references.bib ├── MGnifyR.Rmd ├── MGnify_course.Rmd └── MGnifyR_long.Rmd ├── DESCRIPTION ├── NAMESPACE ├── README.md └── LICENSE /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: ~ 2 | template: 3 | bootstrap: 5 4 | 5 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(MGnifyR) 3 | 4 | test_check("MGnifyR") 5 | -------------------------------------------------------------------------------- /man/figures/mgnify_logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EBI-Metagenomics/MGnifyR/HEAD/man/figures/mgnify_logo.jpg -------------------------------------------------------------------------------- /man/figures/mgnifyr_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EBI-Metagenomics/MGnifyR/HEAD/man/figures/mgnifyr_logo.png -------------------------------------------------------------------------------- /inst/extdata/vignette_MGnifyR.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EBI-Metagenomics/MGnifyR/HEAD/inst/extdata/vignette_MGnifyR.rds -------------------------------------------------------------------------------- /man/figures/findingpheno_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EBI-Metagenomics/MGnifyR/HEAD/man/figures/findingpheno_logo.png -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | .github 4 | MGnifyR/vignettes/MGnifyR_cache 5 | ^_pkgdown\.yml$ 6 | ^docs$ 7 | ^pkgdown$ 8 | ^doc$ 9 | ^Meta$ 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .*_cache 2 | inst/extras/ 3 | *~ 4 | .Rproj.user 5 | .Rhistory 6 | .RData 7 | .Ruserdata 8 | ..Rcheck 9 | .MGnify_cache 10 | inst/doc 11 | NOTES 12 | *.RDS 13 | Testing 14 | .gitignore 15 | tmp 16 | .RDataTmp 17 | *.Rproj 18 | **/NA/ 19 | vignettes/*.html 20 | vignettes/*.R 21 | vignettes/*.pdf 22 | vignettes/MGnifyR_cache 23 | docs 24 | /doc/ 25 | /Meta/ 26 | -------------------------------------------------------------------------------- 
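The tests/testthat.R harness shown above is the entry point that R CMD check uses to run the package's test suite. As a minimal sketch of running the same suite interactively from a development checkout (this assumes the devtools package is available; it is not a dependency declared by this repository):

```
# Run the full testthat suite from the package root (loads the package first)
devtools::test()

# Or, with the package installed and attached, run the suite directly
library(MGnifyR)
testthat::test_dir("tests/testthat")
```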
/inst/extras/TODO: -------------------------------------------------------------------------------- 1 | In no particular order: 2 | 3 | Do documentation. DONE(ish) 4 | Convert to single monolithic mgnify_query function DONE 5 | Add caching functionality. DONE 6 | Add warnings for multiple runs etc. 7 | Split phyloseq grab into 2 functions 8 | - get metadata DF/list for analysis runs 9 | - Do actual grab of biome data 10 | phyloseq conversion needs to be able to handle a list 11 | 12 | Fix query to check if lists of filters ~can~ be lists (i.e. supply multiple elements as filters) 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | Version: 0.99.23 2 | Date: 2024-03-04 3 | + getResult fix: failed to construct MAE if samples in experiments did not match 4 | 5 | Version: 0.99.20 6 | Date: 2024-02-26 7 | + searchAnalysis now returns a named vector whose names are the accession IDs that were fed as input 8 | 9 | Version: 0.99.19 10 | Date: 2024-02-15 11 | + Fix deprecated mgnify_client function 12 | 13 | Version: 0.99.18 14 | Date: 2024-02-12 15 | + Last modifications for Bioconductor submission 16 | 17 | Changes in version 0.99.17 18 | + Added getData function for fetching raw data from the database 19 | 20 | Version 0.99.0 21 | + Support for TreeSummarizedExperiment and MultiAssayExperiment 22 | + Submitted to Bioconductor 23 | -------------------------------------------------------------------------------- /R/MGnifyR.R: -------------------------------------------------------------------------------- 1 | #' \code{MGnifyR} Package. 2 | #' 3 | #' \code{MGnifyR} implements an interface to the EBI MGnify database. 4 | #' See the vignette for a general introduction to this package, 5 | #' \href{https://www.ebi.ac.uk/metagenomics/about}{About MGnify} for general MGnify 6 | #' information, and 7 | #' \href{https://emg-docs.readthedocs.io/en/latest/api.html}{API documentation} for 8 | #' details about the JSONAPI implementation. 9 | #' @name MGnifyR-package 10 | #' @aliases MGnifyR 11 | #' @docType package 12 | #' @seealso \link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment} class 13 | NULL 14 | 15 | #' @import mia 16 | #' @import TreeSummarizedExperiment 17 | #' @import MultiAssayExperiment 18 | NULL 19 | -------------------------------------------------------------------------------- /man/MGnifyR-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/MGnifyR.R 3 | \docType{package} 4 | \name{MGnifyR-package} 5 | \alias{MGnifyR-package} 6 | \alias{MGnifyR} 7 | \title{\code{MGnifyR} Package.} 8 | \description{ 9 | \code{MGnifyR} implements an interface to the EBI MGnify database. 10 | See the vignette for a general introduction to this package, 11 | \href{https://www.ebi.ac.uk/metagenomics/about}{About MGnify} for general MGnify 12 | information, and 13 | \href{https://emg-docs.readthedocs.io/en/latest/api.html}{API documentation} for 14 | details about the JSONAPI implementation. 
15 | } 16 | \seealso{ 17 | \link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment} class 18 | } 19 | \author{ 20 | \strong{Maintainer}: Tuomas Borman \email{tuomas.v.borman@utu.fi} (\href{https://orcid.org/0000-0002-8563-8884}{ORCID}) 21 | 22 | Authors: 23 | \itemize{ 24 | \item Ben Allen \email{ben.allen@ncl.ac.uk} 25 | \item Leo Lahti \email{leo.lahti@iki.fi} (\href{https://orcid.org/0000-0001-5537-637X}{ORCID}) 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /man/getMetadata.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllGenerics.R, R/getMetadata.R 3 | \name{getMetadata} 4 | \alias{getMetadata} 5 | \alias{getMetadata,MgnifyClient-method} 6 | \title{Get all study, sample and analysis metadata for the supplied analysis 7 | accessions} 8 | \usage{ 9 | getMetadata(x, ...) 10 | 11 | \S4method{getMetadata}{MgnifyClient}(x, accession, ...) 12 | } 13 | \arguments{ 14 | \item{x}{A \code{MgnifyClient} object.} 15 | 16 | \item{...}{Optional arguments; not currently used.} 17 | 18 | \item{accession}{A single character value or a vector of analysis accession 19 | IDs specifying accessions to retrieve data for.} 20 | } 21 | \value{ 22 | A \code{data.frame} containing metadata for each analysis in the 23 | \code{accession} list. Each row represents a single analysis. 24 | } 25 | \description{ 26 | Get all study, sample and analysis metadata for the supplied analysis 27 | accessions 28 | } 29 | \details{ 30 | The function retrieves all study, sample and analysis metadata associated 31 | with provided analysis accessions. 32 | } 33 | \examples{ 34 | # Create a client object 35 | mg <- MgnifyClient(useCache = FALSE) 36 | 37 | # Download all associated study/sample and analysis metadata 38 | accession_list <- c("MGYA00377505") 39 | meta_dataframe <- getMetadata(mg, accession_list) 40 | 41 | } 42 | -------------------------------------------------------------------------------- /.github/workflows/rworkflows.yml: -------------------------------------------------------------------------------- 1 | name: rworkflows 2 | 'on': 3 | push: 4 | branches: 5 | - master 6 | - main 7 | - devel 8 | - RELEASE_** 9 | pull_request: 10 | branches: 11 | - master 12 | - main 13 | - devel 14 | - RELEASE_** 15 | jobs: 16 | rworkflows: 17 | permissions: write-all 18 | runs-on: ${{ matrix.config.os }} 19 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 20 | container: ${{ matrix.config.cont }} 21 | strategy: 22 | fail-fast: ${{ false }} 23 | matrix: 24 | config: 25 | - os: ubuntu-latest 26 | bioc: devel 27 | r: auto 28 | cont: ghcr.io/bioconductor/bioconductor_docker:devel 29 | rspm: ~ 30 | - os: macOS-latest 31 | bioc: devel 32 | r: auto 33 | cont: ~ 34 | rspm: ~ 35 | - os: windows-latest 36 | bioc: devel 37 | r: auto 38 | cont: ~ 39 | rspm: ~ 40 | steps: 41 | - uses: neurogenomics/rworkflows@master 42 | with: 43 | run_bioccheck: ${{ false }} 44 | run_rcmdcheck: ${{ true }} 45 | as_cran: ${{ true }} 46 | run_vignettes: ${{ true }} 47 | has_testthat: ${{ true }} 48 | run_covr: ${{ true }} 49 | run_pkgdown: ${{ true }} 50 | has_runit: ${{ false }} 51 | has_latex: ${{ false }} 52 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 53 | run_docker: ${{ false }} 54 | DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }} 55 | runner_os: ${{ runner.os }} 56 | cache_version: cache-v1 57 | docker_registry: ghcr.io 58 | 
-------------------------------------------------------------------------------- /R/AllClasses.R: -------------------------------------------------------------------------------- 1 | # MgnifyClient class and its accessors 2 | 3 | #' A MgnifyClient object 4 | #' 5 | #' @details An object that is required by functions of the MGnifyR package. 6 | #' 7 | #' @slot databaseUrl A single character value specifying the URL address of 8 | #' the database. 9 | #' 10 | #' @slot authTok A single character value specifying the authentication token. 11 | #' 12 | #' @slot useCache A single boolean value specifying whether to use cache. 13 | #' 14 | #' @slot cacheDir A single character value specifying cache directory. 15 | #' 16 | #' @slot showWarnings A single boolean value specifying whether to show 17 | #' warnings. 18 | #' 19 | #' @slot clearCache A single boolean value specifying whether to clear cache. 20 | #' 21 | #' @slot verbose A single boolean value specifying whether to show messages. 22 | #' 23 | #' @section Constructor: 24 | #' See \code{\link{MgnifyClient}} for constructor. 25 | #' 26 | #' @section Accessor: 27 | #' See \code{\link{MgnifyClient-accessors}} for accessor functions. 28 | #' 29 | #' @name MgnifyClient 30 | NULL 31 | 32 | #' @rdname MgnifyClient 33 | #' @importFrom httr POST 34 | #' @importFrom httr content 35 | #' @exportClass MgnifyClient 36 | setClass( 37 | "MgnifyClient", representation( 38 | databaseUrl = "character", 39 | authTok = "character", 40 | useCache = "logical", 41 | cacheDir = "character", 42 | showWarnings = "logical", 43 | clearCache = "logical", 44 | verbose = "logical"), 45 | prototype = list( 46 | databaseUrl = "https://www.ebi.ac.uk/metagenomics/api/v1", 47 | authTok = NULL, 48 | useCache = FALSE, 49 | cacheDir = NULL, 50 | clearCache = FALSE, 51 | verbose = TRUE)) 52 | -------------------------------------------------------------------------------- /tests/testthat/test-getMetadata.R: -------------------------------------------------------------------------------- 1 | context("getMetadata") 2 | test_that("getMetadata", { 3 | # Test that input check catches wrong arguments. 4 | mg <- MgnifyClient(useCache = FALSE) 5 | 6 | expect_error(getMetadata(1)) 7 | expect_error(getMetadata("test")) 8 | expect_error(getMetadata(TRUE)) 9 | 10 | expect_error(getMetadata(mg, accession = TRUE)) 11 | expect_error(getMetadata(mg, accession = 1)) 12 | expect_error(getMetadata(mg, accession = c(1, 2))) 13 | 14 | expect_error(getMetadata(mg, accession = "test", use.cache = NULL)) 15 | expect_error(getMetadata(mg, accession = "test", use.cache = 1)) 16 | expect_error(getMetadata(mg, accession = "test", use.cache = c(TRUE, FALSE))) 17 | 18 | expect_error(getMetadata(mg, accession = "test", show.messages = NULL)) 19 | expect_error(getMetadata(mg, accession = "test", show.messages = 1)) 20 | expect_error(getMetadata(mg, accession = "test", show.messages = c(TRUE, FALSE))) 21 | 22 | # Require internet access 23 | skip_if(httr::http_error("https://www.ebi.ac.uk/metagenomics/api/v1")) 24 | 25 | # Test that correct metadata is fetched based on a certain accession ID. 
26 | res <- getMetadata(mg, "MGYA00097621", show.messages = FALSE) 27 | expect_equal(nrow(res), 1) 28 | expect_true(ncol(res) > 1) 29 | expect_equal(rownames(res)[1] , "MGYA00097621") 30 | expect_equal(res$run_accession, "ERR1811651") 31 | 32 | # # To reduce the time used to build the package, these tests are commented 33 | # # When metadata is not found, should give a warning and the result should 34 | # # be empty tibble 35 | # expect_warning(res <- getMetadata(mg, "MGYS00005292", show.messages = FALSE)) 36 | # expect_true(ncol(res) == 0 && nrow(res) == 0) 37 | }) 38 | -------------------------------------------------------------------------------- /vignettes/references.bib: -------------------------------------------------------------------------------- 1 | @Manual{SE, 2 | title = {SummarizedExperiment: SummarizedExperiment container}, 3 | author = {Martin Morgan and Valerie Obenchain and Jim Hester and Hervé Pagès}, 4 | year = {2020}, 5 | note = {R package version 1.20.0}, 6 | url = {https://bioconductor.org/packages/SummarizedExperiment}, 7 | } 8 | 9 | @Article{SCE, 10 | title = {Orchestrating single-cell analysis with Bioconductor}, 11 | author = {Robert Amezquita and Aaron Lun and Etienne Becht and Vince Carey and Lindsay Carpp and Ludwig Geistlinger and Federico Marini and Kevin Rue-Albrecht and Davide Risso and Charlotte Soneson and Levi Waldron and Herve Pages and Mike Smith and Wolfgang Huber and Martin Morgan and Raphael Gottardo and Stephanie Hicks}, 12 | year = {2020}, 13 | volume = {17}, 14 | pages = {137--145}, 15 | journal = {Nature Methods}, 16 | url = {https://www.nature.com/articles/s41592-019-0654-x}, 17 | } 18 | 19 | @Manual{TSE, 20 | title = {TreeSummarizedExperiment: TreeSummarizedExperiment: a S4 Class for Data with Tree 21 | Structures}, 22 | author = {Ruizhu Huang}, 23 | year = {2021}, 24 | note = {R package version 1.99.9}, 25 | } 26 | 27 | @Article{dada2, 28 | title = {DADA2: High-resolution sample inference from Illumina amplicon data}, 29 | author = {Benjamin J Callahan and Paul J McMurdie and Michael J Rosen and Andrew W Han and Amy Jo A Johnson and Susan P Holmes}, 30 | journal = {Nature Methods}, 31 | volume = {13}, 32 | pages = {581-583}, 33 | year = {2016}, 34 | doi = {10.1038/nmeth.3869}, 35 | } 36 | 37 | @Manual{DMM, 38 | title = {DirichletMultinomial: Dirichlet-Multinomial Mixture Model Machine Learning for 39 | Microbiome Data}, 40 | author = {Martin Morgan}, 41 | year = {2020}, 42 | note = {R package version 1.32.0}, 43 | } 44 | -------------------------------------------------------------------------------- /man/searchAnalysis.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllGenerics.R, R/searchAnalysis.R 3 | \name{searchAnalysis} 4 | \alias{searchAnalysis} 5 | \alias{searchAnalysis,MgnifyClient-method} 6 | \title{Look up analysis accession IDs for one or more study or sample accessions} 7 | \usage{ 8 | searchAnalysis(x, ...) 9 | 10 | \S4method{searchAnalysis}{MgnifyClient}(x, type, accession, ...) 11 | } 12 | \arguments{ 13 | \item{x}{A \code{MgnifyClient} object.} 14 | 15 | \item{...}{Optional arguments; not currently used.} 16 | 17 | \item{type}{A single character value specifying a type of 18 | accession IDs specified by \code{accession}. 
Must be "studies" or "samples".} 19 | 20 | \item{accession}{A single character value or a vector of character values 21 | specifying study or sample accession IDs that are used to retrieve analyses 22 | IDs.} 23 | } 24 | \value{ 25 | Vector of analysis accession IDs. 26 | } 27 | \description{ 28 | Look up analysis accession IDs for one or more study or sample accessions 29 | } 30 | \details{ 31 | Retrieve analysis accession IDs associated with the supplied study or 32 | sample accession. In MGnify, an analysis accession refers to a certain 33 | pipeline analysis, such as specific 16S rRNA or shotgun metagenomic mapping. 34 | Studies can include multiple samples, and each sample can undergo multiple 35 | analyses using these pipelines. Each analysis is identified by a unique 36 | accession ID, allowing precise tracking and retrieval of analysis results 37 | within the MGnify database. 38 | } 39 | \examples{ 40 | # Create a client object 41 | mg <- MgnifyClient(useCache = FALSE) 42 | 43 | # Retrieve analysis ids from study MGYS00005058 44 | result <- searchAnalysis(mg, "studies", c("MGYS00005058")) 45 | 46 | \dontrun{ 47 | # Retrieve all analysis ids from samples 48 | result <- searchAnalysis( 49 | mg, "samples", c("SRS4392730", "SRS4392743")) 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /tests/testthat/test-searchAnalysis.R: -------------------------------------------------------------------------------- 1 | context("searchAnalysis") 2 | test_that("searchAnalysis", { 3 | # Test that input check caches wrong arguments. 4 | mg <- MgnifyClient(useCache = FALSE) 5 | 6 | expect_error(searchAnalysis(TRUE)) 7 | expect_error(searchAnalysis("test")) 8 | expect_error(searchAnalysis(NULL)) 9 | expect_error(searchAnalysis(1)) 10 | 11 | expect_error(searchAnalysis(mg, type = TRUE, accession = "test")) 12 | expect_error(searchAnalysis(mg, type = "test", accession = "test")) 13 | expect_error(searchAnalysis(mg, type = NULL, accession = "test")) 14 | expect_error( 15 | searchAnalysis(mg, type = c("studies", "samples", accession = "test"))) 16 | 17 | expect_error(searchAnalysis(mg, type = "studies", accession = TRUE)) 18 | expect_error(searchAnalysis(mg, type = "studies", accession = NULL)) 19 | expect_error(searchAnalysis(mg, type = "studies", accession = 1)) 20 | expect_error( 21 | searchAnalysis(mg, type = "studies", accession = c(TRUE, FALSE))) 22 | 23 | # Require internet access 24 | skip_if(httr::http_error("https://www.ebi.ac.uk/metagenomics/api/v1")) 25 | 26 | # Test that correct analysis IDs are found based on study ID. 27 | expect_warning(res <- searchAnalysis( 28 | mg, "studies", "MGYA00097621", verbose = FALSE)) 29 | expect_true(is.null(res)) 30 | res <- searchAnalysis(mg, "studies", "MGYS00005058", verbose = FALSE) 31 | expect_true(length(res) > 0) 32 | expect_true("MGYA00377528" %in% res) 33 | 34 | # # To reduce the time used to build the package, these tests are commented 35 | # # Test that correct analysis IDs are found based on sample ID. 
36 | # expect_warning( 37 | # res <- searchAnalysis(mg, "samples", "MGYA00097621", verbose = FALSE)) 38 | # expect_true(is.null(res)) 39 | # res <- searchAnalysis(mg, "samples", "ERS2161777", verbose = FALSE) 40 | # expect_true(length(res) > 0) 41 | # expect_true("MGYA00293854" %in% res) 42 | }) 43 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: MGnifyR 2 | Type: Package 3 | Version: 1.5.1 4 | Authors@R: 5 | c(person(given = "Tuomas", family = "Borman", role = c("aut", "cre"), 6 | email = "tuomas.v.borman@utu.fi", 7 | comment = c(ORCID = "0000-0002-8563-8884")), 8 | person(given = "Ben", family = "Allen", role = c("aut"), 9 | email = "ben.allen@ncl.ac.uk"), 10 | person(given = "Leo", family = "Lahti", role = c("aut"), 11 | email = "leo.lahti@iki.fi", 12 | comment = c(ORCID = "0000-0001-5537-637X"))) 13 | Title: R interface to EBI MGnify metagenomics resource 14 | Description: 15 | Utility package to facilitate integration and analysis of EBI MGnify data 16 | in R. The package can be used to import microbial data for instance into 17 | TreeSummarizedExperiment (TreeSE). In TreeSE format, the data is directly 18 | compatible with miaverse framework. 19 | biocViews: Infrastructure, DataImport, Metagenomics, Microbiome, MicrobiomeData 20 | License: Artistic-2.0 | file LICENSE 21 | Encoding: UTF-8 22 | Depends: 23 | R(>= 4.4.0), 24 | MultiAssayExperiment, 25 | TreeSummarizedExperiment, 26 | SummarizedExperiment, 27 | BiocGenerics 28 | Imports: 29 | mia, 30 | ape, 31 | dplyr, 32 | httr, 33 | methods, 34 | plyr, 35 | reshape2, 36 | S4Vectors, 37 | urltools, 38 | utils, 39 | tidyjson 40 | Suggests: 41 | biomformat, 42 | broom, 43 | ggplot2, 44 | knitr, 45 | rmarkdown, 46 | testthat, 47 | xml2, 48 | BiocStyle, 49 | miaViz, 50 | vegan, 51 | scater, 52 | phyloseq, 53 | magick 54 | URL: https://github.com/EBI-Metagenomics/MGnifyR 55 | BugReports: https://github.com/EBI-Metagenomics/MGnifyR/issues 56 | VignetteBuilder: knitr 57 | RoxygenNote: 7.3.1 58 | Collate: 59 | 'utils.R' 60 | 'MgnifyClient.R' 61 | 'AllGenerics.R' 62 | 'AllClasses.R' 63 | 'AllAccessors.R' 64 | 'MGnifyR.R' 65 | 'deprecate.R' 66 | 'doQuery.R' 67 | 'getData.R' 68 | 'getFile.R' 69 | 'getMetadata.R' 70 | 'getResult.R' 71 | 'searchAnalysis.R' 72 | -------------------------------------------------------------------------------- /tests/testthat/test-getData.R: -------------------------------------------------------------------------------- 1 | context("getData") 2 | test_that("getData", { 3 | # Test that input check caches wrong arguments. 
4 | mg <- MgnifyClient(useCache = FALSE) 5 | 6 | expect_error(getData(1)) 7 | expect_error(getData("test")) 8 | expect_error(getData(TRUE)) 9 | 10 | expect_error(getData(mg, type = 1)) 11 | expect_error(getData(mg, type = TRUE)) 12 | expect_error(getData(mg, type = NULL)) 13 | expect_error(getData(mg, type = c("type", "type"))) 14 | 15 | expect_error(getData( 16 | mg, type = "kegg-modules", accession.type = "analyses", accesion = 1)) 17 | expect_error(getData( 18 | mg, type = "kegg-modules", accession.type = "analyses", 19 | accesion = TRUE)) 20 | expect_error(getData( 21 | mg, type = "kegg-modules", accession.type = "analyses", 22 | accesion = NULL)) 23 | 24 | expect_error(getData( 25 | mg, type = "kegg-modules", accession = c("MGYA00642773"), 26 | accesion.type = 1)) 27 | expect_error(getData( 28 | mg, type = "kegg-modules", 29 | accession = c("MGYA00642773", "MGYA00642774"), accesion.type = TRUE)) 30 | expect_error(getData( 31 | mg, type = "kegg-modules", 32 | accession = c("MGYA00642773", "MGYA00642774"), accesion.type = NULL)) 33 | expect_error(getData( 34 | mg, type = "kegg-modules", 35 | accession = c("MGYA00642773", "MGYA00642774"), accesion.type = c("type", "type"))) 36 | 37 | expect_error(getData( 38 | mg, type = "kegg-modules", accession = c("MGYA00642773"), 39 | accesion.type = c("type"), as.df = "test")) 40 | expect_error(getData( 41 | mg, type = "kegg-modules", accession = c("MGYA00642773"), 42 | accesion.type = c("type"), as.df = 1)) 43 | expect_error(getData( 44 | mg, type = "kegg-modules", accession = c("MGYA00642773"), 45 | accesion.type = c("type"), as.df = c(TRUE, TRUE))) 46 | expect_error(getData( 47 | mg, type = "kegg-modules", accession = c("MGYA00642773"), 48 | accesion.type = c("type"), as.df = NULL)) 49 | 50 | # Require internet access 51 | skip_if(httr::http_error("https://www.ebi.ac.uk/metagenomics/api/v1")) 52 | 53 | type <- "kegg-modules" 54 | res <- getData( 55 | mg, type = type, accession = "MGYA00642773", accession.type = "analyses") 56 | expect_is(res, "data.frame") 57 | expect_true( all(res[["type"]] == type) ) 58 | }) 59 | -------------------------------------------------------------------------------- /tests/testthat/test-doQuery.R: -------------------------------------------------------------------------------- 1 | context("doQuery") 2 | test_that("doQuery", { 3 | # Test that input check caches wrong arguments. 
4 | mg <- MgnifyClient(useCache = FALSE) 5 | 6 | # Expect errors when input is wrong 7 | expect_error(doQuery("test")) 8 | expect_error(doQuery(TRUE)) 9 | expect_error(doQuery(1)) 10 | 11 | expect_error(doQuery(mg, type = 1)) 12 | expect_error(doQuery(mg, type = "test")) 13 | expect_error(doQuery(mg, type = TRUE)) 14 | expect_error(doQuery(mg, type = c("studies", "samples"))) 15 | 16 | expect_error(doQuery(mg, type = "studies", accession = 1)) 17 | expect_error(doQuery(mg, type = "studies", accession = TRUE)) 18 | expect_error(doQuery(mg, type = "studies", accession = c(1, 2))) 19 | 20 | expect_error(doQuery(mg, type = "studies", accession = "test", as.df = NULL)) 21 | expect_error(doQuery(mg, type = "studies", accession = "test", as.df = 1)) 22 | expect_error(doQuery(mg, type = "studies", accession = "test", as.df = c(TRUE, FALSE))) 23 | 24 | expect_error(doQuery(mg, type = "studies", accession = "test", max.hits = TRUE)) 25 | expect_error(doQuery(mg, type = "studies", accession = "test", max.hits = -100)) 26 | expect_error(doQuery(mg, type = "studies", accession = "test", max.hits = c(1, 2))) 27 | expect_error(doQuery(mg, type = "studies", accession = "test", max.hits = 1.5)) 28 | 29 | expect_error(doQuery(mg, type = "studies", accession = "test", use.cache = NULL)) 30 | expect_error(doQuery(mg, type = "studies", accession = "test", use.cache = 1)) 31 | expect_error(doQuery(mg, type = "studies", accession = "test", use.cache = c(TRUE, FALSE))) 32 | 33 | # Require internet access 34 | skip_if(httr::http_error("https://www.ebi.ac.uk/metagenomics/api/v1")) 35 | 36 | # Test that studies are searched based on certain accession ID, get result 37 | # as list, choose max hits 38 | query <- doQuery(mg, "studies", "MGYS00005292", max.hits = 1, as.df = FALSE) 39 | expect_true(is.list(query)) 40 | expect_true(names(query) %in% "MGYS00005292") 41 | expect_true(query$MGYS00005292$type == "studies") 42 | 43 | # # To reduce the time used to build the package, these tests are commented 44 | # # Test that runs are searched, get result as df, choose max hits 45 | # query2 <- doQuery(mg, "studies", "MGYS00005292", max.hits = 1) 46 | # expect_true(is.data.frame(query2)) 47 | # expect_equal(query2$bioproject, 48 | # query$MGYS00005292$attributes$bioproject) 49 | }) 50 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export("authTok<-") 4 | export("cacheDir<-") 5 | export("clearCache<-") 6 | export("databaseUrl<-") 7 | export("showWarnings<-") 8 | export("useCache<-") 9 | export("verbose<-") 10 | export(MgnifyClient) 11 | export(authTok) 12 | export(cacheDir) 13 | export(clearCache) 14 | export(databaseUrl) 15 | export(doQuery) 16 | export(getData) 17 | export(getFile) 18 | export(getMetadata) 19 | export(getResult) 20 | export(mgnify_analyses_from_samples) 21 | export(mgnify_analyses_from_studies) 22 | export(mgnify_client) 23 | export(mgnify_download) 24 | export(mgnify_get_analyses_metadata) 25 | export(mgnify_get_analyses_phyloseq) 26 | export(mgnify_get_analyses_results) 27 | export(mgnify_get_download_urls) 28 | export(mgnify_query) 29 | export(mgnify_retrieve_json) 30 | export(searchAnalysis) 31 | export(searchFile) 32 | export(showWarnings) 33 | export(useCache) 34 | export(verbose) 35 | exportClasses(MgnifyClient) 36 | exportMethods("authTok<-") 37 | exportMethods("cacheDir<-") 38 | exportMethods("clearCache<-") 39 | 
exportMethods("databaseUrl<-") 40 | exportMethods("showWarnings<-") 41 | exportMethods("useCache<-") 42 | exportMethods("verbose<-") 43 | exportMethods(authTok) 44 | exportMethods(cacheDir) 45 | exportMethods(clearCache) 46 | exportMethods(databaseUrl) 47 | exportMethods(doQuery) 48 | exportMethods(getData) 49 | exportMethods(getFile) 50 | exportMethods(getMetadata) 51 | exportMethods(getResult) 52 | exportMethods(searchAnalysis) 53 | exportMethods(searchFile) 54 | exportMethods(showWarnings) 55 | exportMethods(useCache) 56 | exportMethods(verbose) 57 | import(MultiAssayExperiment) 58 | import(TreeSummarizedExperiment) 59 | import(mia) 60 | importFrom(S4Vectors,SimpleList) 61 | importFrom(SummarizedExperiment,"rowData<-") 62 | importFrom(SummarizedExperiment,rowData) 63 | importFrom(TreeSummarizedExperiment,rowTree) 64 | importFrom(ape,read.tree) 65 | importFrom(dplyr,"%>%") 66 | importFrom(dplyr,bind_rows) 67 | importFrom(dplyr,mutate_all) 68 | importFrom(dplyr,na_if) 69 | importFrom(httr,GET) 70 | importFrom(httr,POST) 71 | importFrom(httr,add_headers) 72 | importFrom(httr,config) 73 | importFrom(httr,content) 74 | importFrom(httr,timeout) 75 | importFrom(httr,write_disk) 76 | importFrom(methods,is) 77 | importFrom(methods,new) 78 | importFrom(mia,checkTaxonomy) 79 | importFrom(mia,importBIOM) 80 | importFrom(plyr,llply) 81 | importFrom(plyr,rbind.fill) 82 | importFrom(reshape2,dcast) 83 | importFrom(tidyjson,spread_all) 84 | importFrom(urltools,"parameters<-") 85 | importFrom(urltools,parameters) 86 | importFrom(utils,read.csv2) 87 | -------------------------------------------------------------------------------- /tests/testthat/test-MgnifyClient.R: -------------------------------------------------------------------------------- 1 | context("MgnifyClient") 2 | test_that("MgnifyClient", { 3 | # Test that input check caches wrong arguments. 4 | mg <- MgnifyClient() 5 | 6 | # Expect errors when input is wrong 7 | expect_error(MgnifyClient(useCache = 1)) 8 | expect_error(MgnifyClient(useCache = "TRUE")) 9 | expect_error(MgnifyClient(useCache = c(TRUE, TRUE))) 10 | 11 | expect_error(MgnifyClient(verbose = 1)) 12 | expect_error(MgnifyClient(verbose = "TRUE")) 13 | expect_error(MgnifyClient(verbose = c(TRUE, TRUE))) 14 | 15 | expect_error(MgnifyClient(showWarnings = 1)) 16 | expect_error(MgnifyClient(showWarnings = "TRUE")) 17 | expect_error(MgnifyClient(showWarnings = c(TRUE, TRUE))) 18 | 19 | expect_error(MgnifyClient(clearCache = 1)) 20 | expect_error(MgnifyClient(clearCache = "TRUE")) 21 | expect_error(MgnifyClient(clearCache = c(TRUE, TRUE))) 22 | 23 | expect_error(MgnifyClient(showWarnings = 1)) 24 | expect_error(MgnifyClient(showWarnings = "TRUE")) 25 | expect_error(MgnifyClient(showWarnings = c(TRUE, TRUE))) 26 | 27 | expect_error(MgnifyClient(url = 1)) 28 | expect_error(MgnifyClient(url = TRUE)) 29 | expect_error(MgnifyClient(url = c("url", "url"))) 30 | 31 | expect_error(MgnifyClient(username = 1)) 32 | expect_error(MgnifyClient(username = TRUE)) 33 | expect_error(MgnifyClient(username = c("url", "url"))) 34 | 35 | expect_error(MgnifyClient(password = 1)) 36 | expect_error(MgnifyClient(password = TRUE)) 37 | expect_error(MgnifyClient(password = c("url", "url"))) 38 | 39 | # Test that slots are updated. Change arguments --> check that values 40 | # of slots correspond argument. 
41 | mg <- MgnifyClient( 42 | useCache = TRUE, 43 | cacheDir = "test", 44 | showWarnings = FALSE, 45 | url = "test" 46 | ) 47 | expect_equal(cacheDir(mg), "test/.MGnifyR_cache") 48 | expect_equal(showWarnings(mg), FALSE) 49 | expect_equal(databaseUrl(mg), "test") 50 | mg <- MgnifyClient( 51 | useCache = FALSE, 52 | cacheDir = "test", 53 | showWarnings = TRUE 54 | ) 55 | expect_true(!is.na(cacheDir(mg))) 56 | expect_equal(showWarnings(mg), TRUE) 57 | # Require internet access 58 | skip_if(httr::http_error("https://www.ebi.ac.uk/metagenomics/api/v1")) 59 | # Test that error occurs when wrong username/password is used in 60 | # authentication 61 | expect_error(MgnifyClient(username = "not_work", password = "not_work")) 62 | expect_error( 63 | MgnifyClient( 64 | username = "not_work", password = "not_work", url = "not_work")) 65 | }) 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MGnifyR 2 | 3 | An R package for searching and retrieving data from the 4 | [EBI Metagenomics resource](https://www.ebi.ac.uk/metagenomics). 5 | In most cases, MGnifyR interacts directly with the JSONAPI, rather than relying 6 | on downloading analysis outputs as TSV files. This makes it more general - allowing, 7 | for example, multiple studies and analyses to be combined intuitively 8 | into a single workflow - but in some cases slower than the aforementioned 9 | direct access. Local caching of results on disk is implemented to help counter 10 | some of the overheads, but data downloads can be slow - particularly for 11 | functional annotation retrieval. 12 | 13 | The MGnifyR package is part of the [miaverse](https://microbiome.github.io/) 14 | microbiome analysis ecosystem, enabling the use of 15 | [mia](https://bioconductor.org/packages/release/bioc/html/mia.html) 16 | and other miaverse packages. 17 | 18 | 19 | 20 | **This research has received funding from the Horizon 2020 Programme of the 21 | European Union within the FindingPheno project under grant agreement No 22 | 952914.** FindingPheno, an EU-funded project, is dedicated to developing 23 | computational tools and methodologies for the integration and analysis of 24 | multi-omics data. Its primary objective is to deepen our understanding of the 25 | interactions between hosts and their microbiomes. You can find more information 26 | on the [FindingPheno website](https://findingpheno.eu/). 
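The miaverse integration mentioned above means that objects imported by MGnifyR drop straight into mia workflows. A minimal sketch (see the Basic usage section below for how `mae` is created; this assumes the `mia` package is installed and that the taxonomic profiles are stored in an experiment named `"microbiota"` with standard rank labels, as in the package's own tests):

```
library(mia)

# Taxonomic profiles retrieved with getResult() arrive as a
# TreeSummarizedExperiment inside the MultiAssayExperiment
tse <- mae[["microbiota"]]

# Standard miaverse operations then apply directly, for example
# agglomerating the counts to phylum level
tse_phylum <- agglomerateByRank(tse, rank = "Phylum")
tse_phylum
```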
27 | 28 | ## Installation 29 | 30 | ### Bioc-release 31 | 32 | ``` 33 | if (!requireNamespace("BiocManager", quietly = TRUE)) 34 | install.packages("BiocManager") 35 | 36 | BiocManager::install("MGnifyR") 37 | ``` 38 | 39 | ### Bioc-devel 40 | 41 | ``` 42 | if (!requireNamespace("BiocManager", quietly = TRUE)) 43 | install.packages("BiocManager") 44 | 45 | # The following initializes usage of Bioc devel 46 | BiocManager::install(version='devel') 47 | 48 | BiocManager::install("MGnifyR") 49 | ``` 50 | 51 | ### GitHub 52 | 53 | ``` 54 | remotes::install_github("EBI-Metagenomics/MGnifyR") 55 | ``` 56 | 57 | ## Basic usage 58 | For more detailed instructions read the associated function help and vignette (`vignette("MGNifyR")`) 59 | 60 | ``` 61 | library(MGnifyR) 62 | 63 | # Set up the MGnify client instance 64 | mg <- MgnifyClient(useCache = TRUE, cacheDir = '/tmp/MGnify_cache') 65 | 66 | # Retrieve the list of analyses associated with a study 67 | accession_list <- searchAnalysis(mg, "studies", "MGYS00005058") 68 | 69 | # Download all associated study/sample and analysis metadata 70 | meta_dataframe <- getMetadata(mg, accession_list) 71 | 72 | # Convert analyses outputs to a single `MultiAssayExperiment` object 73 | mae <- getResult(mg, meta_dataframe$analysis_accession) 74 | mae 75 | ``` 76 | 77 | -------------------------------------------------------------------------------- /man/MgnifyClient-accessors.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllGenerics.R, R/AllAccessors.R 3 | \name{databaseUrl} 4 | \alias{databaseUrl} 5 | \alias{authTok} 6 | \alias{useCache} 7 | \alias{cacheDir} 8 | \alias{showWarnings} 9 | \alias{clearCache} 10 | \alias{verbose} 11 | \alias{databaseUrl<-} 12 | \alias{authTok<-} 13 | \alias{useCache<-} 14 | \alias{cacheDir<-} 15 | \alias{showWarnings<-} 16 | \alias{clearCache<-} 17 | \alias{verbose<-} 18 | \alias{MgnifyClient-accessors} 19 | \alias{databaseUrl,MgnifyClient-method} 20 | \alias{authTok,MgnifyClient-method} 21 | \alias{useCache,MgnifyClient-method} 22 | \alias{cacheDir,MgnifyClient-method} 23 | \alias{showWarnings,MgnifyClient-method} 24 | \alias{clearCache,MgnifyClient-method} 25 | \alias{verbose,MgnifyClient-method} 26 | \alias{databaseUrl<-,MgnifyClient-method} 27 | \alias{authTok<-,MgnifyClient-method} 28 | \alias{useCache<-,MgnifyClient-method} 29 | \alias{cacheDir<-,MgnifyClient-method} 30 | \alias{showWarnings<-,MgnifyClient-method} 31 | \alias{clearCache<-,MgnifyClient-method} 32 | \alias{verbose<-,MgnifyClient-method} 33 | \title{MgnifyClient accessors and mutators} 34 | \usage{ 35 | databaseUrl(x) 36 | 37 | authTok(x) 38 | 39 | useCache(x) 40 | 41 | cacheDir(x) 42 | 43 | showWarnings(x) 44 | 45 | clearCache(x) 46 | 47 | verbose(x) 48 | 49 | databaseUrl(x) <- value 50 | 51 | authTok(x) <- value 52 | 53 | useCache(x) <- value 54 | 55 | cacheDir(x) <- value 56 | 57 | showWarnings(x) <- value 58 | 59 | clearCache(x) <- value 60 | 61 | verbose(x) <- value 62 | 63 | \S4method{databaseUrl}{MgnifyClient}(x) 64 | 65 | \S4method{authTok}{MgnifyClient}(x) 66 | 67 | \S4method{useCache}{MgnifyClient}(x) 68 | 69 | \S4method{cacheDir}{MgnifyClient}(x) 70 | 71 | \S4method{showWarnings}{MgnifyClient}(x) 72 | 73 | \S4method{clearCache}{MgnifyClient}(x) 74 | 75 | \S4method{verbose}{MgnifyClient}(x) 76 | 77 | \S4method{databaseUrl}{MgnifyClient}(x) <- value 78 | 79 | \S4method{authTok}{MgnifyClient}(x) <- value 80 | 81 | 
\S4method{useCache}{MgnifyClient}(x) <- value 82 | 83 | \S4method{cacheDir}{MgnifyClient}(x) <- value 84 | 85 | \S4method{showWarnings}{MgnifyClient}(x) <- value 86 | 87 | \S4method{clearCache}{MgnifyClient}(x) <- value 88 | 89 | \S4method{verbose}{MgnifyClient}(x) <- value 90 | } 91 | \arguments{ 92 | \item{x}{A \code{MgnifyClient} object.} 93 | 94 | \item{value}{A value to be added to a certain slot.} 95 | } 96 | \value{ 97 | A value of MgnifyClient object or nothing. 98 | } 99 | \description{ 100 | MgnifyClient accessors and mutators 101 | } 102 | \details{ 103 | These functions are for fetching and mutating slots of 104 | \code{MgnifyClient} object. 105 | } 106 | \examples{ 107 | mg <- MgnifyClient() 108 | 109 | databaseUrl(mg) 110 | showWarnings(mg) <- FALSE 111 | 112 | } 113 | -------------------------------------------------------------------------------- /inst/extras/demo_code.R: -------------------------------------------------------------------------------- 1 | library(vegan) 2 | library(ggplot2) 3 | library(phyloseq) 4 | 5 | library(MGnifyR) 6 | 7 | mg <- mgnify_client(usecache = T, cache_dir = '/tmp/MGnify_demo') 8 | 9 | 10 | ####### Queries: 11 | mgnify_query(mg, "studies", biome_name="Wastewater", usecache = T) 12 | mgnify_query(mg, "samples", latitude_gte=60.0, experiment_type="metagenomic", usecache = T) 13 | m <- mgnify_query(mg, "samples", study_accession = "MGYS00003725", usecache=T) 14 | acc_list <- mgnify_analyses_from_samples(mg, m$accession) 15 | df <- mgnify_get_analyses_metadata(mg, acc_list) 16 | df 17 | 18 | 19 | ##### Single study retrieval 20 | #Amplicon: Oral health of young adults: Amplicon study 21 | om_analyses <- mgnify_analyses_from_studies(mg, "MGYS00002277") 22 | om_metadata_df <- mgnify_get_analyses_metadata(mg, om_analyses) 23 | t(head(om_metadata_df)) 24 | 25 | om_ps <- mgnify_get_analyses_phyloseq(mg, om_analyses, tax_SU = "SSU") 26 | 27 | om_ps_sub <- subset_samples(om_ps, sample_sums(om_ps) > 10000) 28 | 29 | omps <- rarefy_even_depth(om_ps_sub) 30 | omps 31 | 32 | #plt1 <- plot_bar(omps, fill="Class", facet_grid = "sample_sample.desc") + theme(legend.position = "none") 33 | 34 | alpha_div <- estimate_richness(omps) 35 | 36 | adf <- cbind.data.frame(sample_data(omps)$`sample_sample.desc`, alpha_div$InvSimpson) 37 | colnames(adf) <- c("factor","value") 38 | 39 | ggplot(adf, aes(x=factor, y=value)) + geom_boxplot(width=0.1) + geom_jitter(width=0.1) + theme_bw() 40 | 41 | omps_ord <- ordinate(omps, method = "PCoA" , distance = "bray") 42 | plot_ordination(omps, omps_ord, color = "sample_sample.desc") + theme_bw() 43 | 44 | 45 | 46 | ##### Multi-biome metagenome analysis 47 | set.seed(11) 48 | mg <- mgnify_client(usecache = T, cache_dir = "/tmp/mgnify_cache") 49 | 50 | #Study accessions 51 | 52 | #Saltmarsh metagenomes HiSeq4000 : MGYS00001447 - 48 samples 53 | #Healthy human gut metagenomes : MGYS00001442 30 odd samples 54 | #Marine Subseafloor microbes at Mid-Cayman Rise: MGYS00001282 55 | 56 | 57 | soil <- mgnify_analyses_from_studies(mg, "MGYS00001447") 58 | human <- mgnify_analyses_from_studies(mg, "MGYS00001442") 59 | seafloor <- mgnify_analyses_from_studies(mg, "MGYS00001282") 60 | seafloor <- sample(seafloor, 40) 61 | 62 | accessions <- c(soil,human,seafloor) 63 | 64 | metadata <- mgnify_get_analyses_metadata(mg, accessions) 65 | head(metadata) 66 | 67 | goterms <- mgnify_get_analyses_results(mg, accessions, retrievelist = "go-slim")$`go-slim` 68 | 69 | 70 | m <- goterms[,c(-1,-2,-3)] 71 | 72 | normed_m <- apply(m, 2, function(x) x/sum(x)) 73 
| 74 | nmds <- vegan::metaMDS(t(normed_m)) 75 | 76 | pltdat <- as.data.frame(scores(nmds)) #Using the scores function from vegan to extract the site scores and convert to a data.frame 77 | pltdat$grp <- metadata[rownames(pltdat),"study_accession"] # add the grp variable created earlier 78 | 79 | ggplot() + geom_point(data=pltdat,aes(x=NMDS1,y=NMDS2,colour=grp),size=3) + theme_bw() 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /man/getData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllGenerics.R, R/getData.R 3 | \name{getData} 4 | \alias{getData} 5 | \alias{getData,MgnifyClient-method} 6 | \title{Versatile function to retrieve raw results} 7 | \usage{ 8 | getData(x, ...) 9 | 10 | \S4method{getData}{MgnifyClient}(x, type, accession.type = NULL, accession = NULL, as.df = TRUE, ...) 11 | } 12 | \arguments{ 13 | \item{x}{A \code{MgnifyClient} object.} 14 | 15 | \item{...}{Optional arguments passed to internal functions.} 16 | 17 | \item{type}{A single character value specifying the type of data to retrieve. 18 | Must be one of the following options: \code{studies}, \code{samples}, 19 | \code{runs}, \code{analyses}, \code{biomes}, \code{assemblies}, 20 | \code{super-studies}, \code{experiment-types}, \code{pipelines}, 21 | \code{pipeline-tools}, \code{publications}, \code{genomes}, 22 | \code{genome-search}, \code{genome-search/gather}, \code{genome-catalogues}, 23 | \code{genomeset}, \code{cogs}, \code{kegg-modules}, \code{kegg-classes}, 24 | \code{antismash-geneclusters}, \code{annotations/go-terms}, 25 | \code{annotations/interpro-identifiers}, \code{annotations/kegg-modules}, 26 | \code{annotations/pfam-entries}, \code{annotations/kegg-orthologs}, 27 | \code{annotations/genome-properties}, 28 | \code{annotations/antismash-gene-clusters}, \code{annotations/organisms}, or 29 | \code{mydata}.} 30 | 31 | \item{accession.type}{A single character value specifying the type of accession 32 | IDs (\code{accession}). Must be specified when \code{accession} is specified. 33 | (By default: \code{accession.type = NULL})} 34 | 35 | \item{accession}{A single character value or a vector of character values 36 | specifying accession IDs to return results for. 37 | (By default: \code{accession = NULL})} 38 | 39 | \item{as.df}{A single boolean value specifying whether to return the 40 | results as a data.frame or leave them as a nested list. 41 | (By default: \code{as.df = TRUE})} 42 | } 43 | \value{ 44 | \code{data.frame} or \code{list} 45 | } 46 | \description{ 47 | Versatile function to retrieve raw results 48 | } 49 | \details{ 50 | This function returns data from the MGnify database. Compared to 51 | \code{getResult}, this function provides a more flexible framework for fetching 52 | the data. However, there are drawbacks: for counts data, \code{getResult} 53 | returns an optimally structured data container which is easier to use in downstream 54 | analysis, while \code{getData} returns raw data from the database. However, if 55 | you want to retrieve data on pipelines or publications, for instance, 56 | \code{getResult} is not suitable, and \code{getData} can be used 57 | instead. 
58 | } 59 | \examples{ 60 | # Create a client object 61 | mg <- MgnifyClient(useCache = FALSE) 62 | 63 | # Find kegg modules for certain analysis 64 | df <- getData( 65 | mg, type = "kegg-modules", 66 | accession = "MGYA00642773", accession.type = "analyses") 67 | 68 | } 69 | \seealso{ 70 | \code{\link[MGnifyR:getResult]{getResult}} 71 | } 72 | -------------------------------------------------------------------------------- /tests/testthat/test-getResult.R: -------------------------------------------------------------------------------- 1 | context("getResult") 2 | test_that("getResult", { 3 | # Test that input check caches wrong arguments. 4 | mg <- MgnifyClient(useCache = FALSE) 5 | 6 | expect_error(getResult(1)) 7 | expect_error(getResult("test")) 8 | expect_error(getResult(TRUE)) 9 | 10 | expect_error(getResult(mg, accesion = 1)) 11 | expect_error(getResult(mg, accesion = TRUE)) 12 | expect_error(getResult(mg, accesion = NULL)) 13 | 14 | expect_error(getResult(mg, accession = "test", output = "test")) 15 | expect_error(getResult(mg, accession = "test", output = TRUE)) 16 | expect_error(getResult(mg, accession = "test", output = 1)) 17 | expect_error(getResult(mg, accession = "test", output = c("TreeSE", "phyloseq"))) 18 | expect_error(getResult(mg, accession = "test", output = NULL)) 19 | 20 | expect_error(getResult(mg, accession = "test", get.taxa = NULL)) 21 | expect_error(getResult(mg, accession = "test", get.taxa = 1)) 22 | expect_error(getResult(mg, accession = "test", get.taxa = c(TRUE, TRUE))) 23 | expect_error(getResult(mg, accession = "test", get.taxa = "test")) 24 | 25 | expect_error(getResult(mg, accession = "test", get.func = NULL)) 26 | expect_error(getResult(mg, accession = "test", get.func = 1)) 27 | expect_error(getResult(mg, accession = "test", get.func = c(TRUE, TRUE))) 28 | expect_error(getResult(mg, accession = "test", get.func = "test")) 29 | 30 | # Require internet access 31 | skip_if(httr::http_error("https://www.ebi.ac.uk/metagenomics/api/v1")) 32 | 33 | # # To reduce the time used to build the package, these tests are commented 34 | # # Test that only functional data is fetched based on certain accession ID. 35 | # # Get data as list of data.frames 36 | # res <- getResult( 37 | # mg, "MGYA00097621", get.taxa = FALSE, output = "list", 38 | # get.func = TRUE, verbose = FALSE) 39 | # expect_true(is.list(res)) 40 | # expect_true("go-terms" %in% names(res)) 41 | # expect_true(is.character(res$`interpro-identifiers`$analysis) && 42 | # is.character(res$`interpro-identifiers`$description) && 43 | # is.numeric(res$`interpro-identifiers`$count)) 44 | 45 | # Test that microbial profiling data and functional data is fetched. Get 46 | # data as MAE. Fetch also trees. Check that all data is is in correct place 47 | # and is correct. 
48 | res <- getResult(mg, "MGYA00097621", get.func = TRUE, verbose = FALSE) 49 | expect_is(res, "MultiAssayExperiment") 50 | expect_is(res[[1]], "TreeSummarizedExperiment") 51 | expect_true(!is.null(rowTree(res[["microbiota"]]))) 52 | expect_true(is.matrix(assay(res[[1]]))) 53 | expect_true("microbiota" %in% names(res) && 54 | "go-terms" %in% names(res)) 55 | expect_true(is.matrix(assay(res[[2]]))) 56 | expect_true(is.matrix(assay(res[[3]]))) 57 | expect_equal(assay(res[["go-slim"]])["GO:1990204", 1][[1]], 929) 58 | expect_equal(colnames(res[[1]]), colnames(res[[2]])) 59 | expect_equal(colnames(res[[3]]), colnames(res[[2]])) 60 | }) 61 | -------------------------------------------------------------------------------- /tests/testthat/test-getFile.R: -------------------------------------------------------------------------------- 1 | context("getFile") 2 | test_that("getFile", { 3 | # Test that input check caches wrong arguments. 4 | mg <- MgnifyClient(useCache = FALSE) 5 | 6 | expect_error(getFile(10)) 7 | expect_error(getFile(TRUE)) 8 | expect_error(getFile(NULL)) 9 | 10 | expect_error(getFile(mg, url = 10)) 11 | expect_error(getFile(mg, url = TRUE)) 12 | expect_error(getFile(mg, url = c("test", "test"))) 13 | 14 | expect_error(getFile(mg, url = "test", read.func = 10)) 15 | expect_error(getFile(mg, url = "test", read.func = TRUE)) 16 | 17 | expect_error(getFile(mg, url = "test", use.cache = 10)) 18 | expect_error(getFile(mg, url = "test", use.cache = TRUE)) 19 | expect_error(getFile(mg, url = "test", use.cache = c("test", "test"))) 20 | 21 | expect_error(getFile(mg, url = "taxonomy--ssu", use.cache = 10)) 22 | expect_error(getFile(mg, url = "test", use.cache = test)) 23 | 24 | expect_error(searchFile(10)) 25 | expect_error(searchFile(TRUE)) 26 | expect_error(searchFile(NULL)) 27 | 28 | expect_error(searchFile(mg, accession = TRUE)) 29 | expect_error(searchFile(mg, accession = 1)) 30 | expect_error(searchFile(mg, accession = NULL)) 31 | 32 | expect_error(searchFile(mg, accession = "test", type = 1)) 33 | expect_error(searchFile(mg, accession = "test", type = TRUE)) 34 | expect_error(searchFile(mg, accession = "test", c("samples", "analyses"))) 35 | 36 | expect_error(searchFile(mg, accession = "test", type = "samples", use.cache = NULL)) 37 | expect_error(searchFile(mg, accession = "test", type = "samples", use.cache = 1)) 38 | expect_error(searchFile( mg, accession = "test", type = "samples", use.cache = c(TRUE, FALSE))) 39 | 40 | expect_error(searchFile(mg, accession = "test", type = "samples", show.messages = NULL)) 41 | expect_error(searchFile(mg, accession = "test", type = "samples", show.messages = 1)) 42 | expect_error(searchFile( mg, accession = "test", type = "samples", show.messages = c(TRUE, FALSE))) 43 | 44 | # Require internet access 45 | skip_if(httr::http_error("https://www.ebi.ac.uk/metagenomics/api/v1")) 46 | 47 | # Expect error because url is incorrect 48 | expect_error(getFile(mg, url = "test")) 49 | 50 | # Test that df is returned even if accession ID is not correct 51 | expect_warning( 52 | res <- searchFile(mg, type = "assemblies", accession = "random") 53 | ) 54 | expect_true(is.data.frame(res)) 55 | 56 | # Test that file search is done correctly based on accession ID. 
57 | # Use studies as type 58 | res <- searchFile(mg, type = "studies", accession = "MGYS00005292", show.messages = FALSE) 59 | expect_true(all(res$type == "studies")) 60 | expect_true(is.data.frame(res)) 61 | expect_true(grepl("https", res$download_url[1])) 62 | 63 | # # To reduce the time used to build the package, these tests are commented 64 | # # Test that correct file is fetched based on provided url. 65 | # res <- getFile(mg, res$download_url[1]) 66 | # # Result is stored in a path which is returned 67 | # expect_true(file.exists(res)) 68 | }) 69 | -------------------------------------------------------------------------------- /man/deprecate.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deprecate.R 3 | \name{deprecate} 4 | \alias{deprecate} 5 | \alias{mgnify_client} 6 | \alias{mgnify_query} 7 | \alias{mgnify_analyses_from_samples} 8 | \alias{mgnify_analyses_from_studies} 9 | \alias{mgnify_get_download_urls} 10 | \alias{mgnify_download} 11 | \alias{mgnify_get_analyses_results} 12 | \alias{mgnify_get_analyses_phyloseq} 13 | \alias{mgnify_get_analyses_metadata} 14 | \alias{mgnify_retrieve_json} 15 | \title{These functions will be deprecated. Please use other functions instead.} 16 | \usage{ 17 | mgnify_client( 18 | username = NULL, 19 | password = NULL, 20 | usecache = FALSE, 21 | cache_dir = NULL, 22 | warnings = FALSE, 23 | use_memcache = FALSE, 24 | ... 25 | ) 26 | 27 | mgnify_query( 28 | client, 29 | qtype = "samples", 30 | accession = NULL, 31 | asDataFrame = TRUE, 32 | maxhits = 200, 33 | usecache = FALSE, 34 | ... 35 | ) 36 | 37 | mgnify_analyses_from_samples(client, accession, usecache = TRUE, ...) 38 | 39 | mgnify_analyses_from_studies(client, accession, usecache = TRUE, ...) 40 | 41 | mgnify_get_download_urls( 42 | client, 43 | accessions, 44 | accession_type, 45 | usecache = TRUE, 46 | ... 47 | ) 48 | 49 | mgnify_download( 50 | client, 51 | url, 52 | file = NULL, 53 | read_func = NULL, 54 | usecache = TRUE, 55 | Debug = FALSE, 56 | ... 57 | ) 58 | 59 | mgnify_get_analyses_results( 60 | client = NULL, 61 | accessions, 62 | retrievelist = c(), 63 | compact_results = TRUE, 64 | usecache = TRUE, 65 | bulk_dl = FALSE, 66 | ... 67 | ) 68 | 69 | mgnify_get_analyses_phyloseq( 70 | client = NULL, 71 | accessions, 72 | usecache = TRUE, 73 | returnLists = FALSE, 74 | tax_SU = "SSU", 75 | get_tree = FALSE, 76 | ... 77 | ) 78 | 79 | mgnify_get_analyses_metadata(client, accessions, usecache = TRUE, ...) 80 | 81 | mgnify_retrieve_json( 82 | client, 83 | path = "biomes", 84 | complete_url = NULL, 85 | qopts = NULL, 86 | maxhits = 200, 87 | usecache = FALSE, 88 | Debug = FALSE, 89 | ... 
90 | ) 91 | } 92 | \arguments{ 93 | \item{username}{-} 94 | 95 | \item{password}{-} 96 | 97 | \item{usecache}{-} 98 | 99 | \item{cache_dir}{-} 100 | 101 | \item{warnings}{-} 102 | 103 | \item{use_memcache}{-} 104 | 105 | \item{...}{-} 106 | 107 | \item{client}{-} 108 | 109 | \item{qtype}{-} 110 | 111 | \item{accession}{-} 112 | 113 | \item{asDataFrame}{-} 114 | 115 | \item{maxhits}{-} 116 | 117 | \item{accessions}{-} 118 | 119 | \item{accession_type}{-} 120 | 121 | \item{url}{-} 122 | 123 | \item{file}{-} 124 | 125 | \item{read_func}{-} 126 | 127 | \item{Debug}{-} 128 | 129 | \item{retrievelist}{-} 130 | 131 | \item{compact_results}{-} 132 | 133 | \item{bulk_dl}{-} 134 | 135 | \item{returnLists}{-} 136 | 137 | \item{tax_SU}{-} 138 | 139 | \item{get_tree}{-} 140 | 141 | \item{path}{-} 142 | 143 | \item{complete_url}{-} 144 | 145 | \item{qopts}{-} 146 | } 147 | \value{ 148 | - 149 | } 150 | \description{ 151 | These functions will be deprecated. Please use other functions instead. 152 | } 153 | -------------------------------------------------------------------------------- /R/AllGenerics.R: -------------------------------------------------------------------------------- 1 | # All generic methods are listed here 2 | 3 | #' @rdname MgnifyClient-accessors 4 | #' @export 5 | setGeneric( 6 | "databaseUrl", signature = c("x"), function(x) 7 | standardGeneric("databaseUrl")) 8 | 9 | #' @rdname MgnifyClient-accessors 10 | #' @export 11 | setGeneric( 12 | "authTok", signature = c("x"), function(x) standardGeneric("authTok")) 13 | 14 | #' @rdname MgnifyClient-accessors 15 | #' @export 16 | setGeneric( 17 | "useCache", signature = c("x"), function(x) standardGeneric("useCache")) 18 | 19 | #' @rdname MgnifyClient-accessors 20 | #' @export 21 | setGeneric( 22 | "cacheDir", signature = c("x"), function(x) standardGeneric("cacheDir")) 23 | 24 | #' @rdname MgnifyClient-accessors 25 | #' @export 26 | setGeneric( 27 | "showWarnings", signature = c("x"), function(x) 28 | standardGeneric("showWarnings")) 29 | 30 | #' @rdname MgnifyClient-accessors 31 | #' @export 32 | setGeneric( 33 | "clearCache", signature = c("x"), function(x) standardGeneric("clearCache")) 34 | 35 | #' @rdname MgnifyClient-accessors 36 | #' @export 37 | setGeneric( 38 | "verbose", signature = c("x"), function(x) standardGeneric("verbose")) 39 | 40 | #' @rdname MgnifyClient-accessors 41 | #' @export 42 | setGeneric( 43 | "databaseUrl<-", signature = c("x"), function(x, value) 44 | standardGeneric("databaseUrl<-")) 45 | 46 | #' @rdname MgnifyClient-accessors 47 | #' @export 48 | setGeneric( 49 | "authTok<-", signature = c("x"), function(x, value) 50 | standardGeneric("authTok<-")) 51 | 52 | #' @rdname MgnifyClient-accessors 53 | #' @export 54 | setGeneric( 55 | "useCache<-", signature = c("x"), function(x, value) 56 | standardGeneric("useCache<-")) 57 | 58 | #' @rdname MgnifyClient-accessors 59 | #' @export 60 | setGeneric( 61 | "cacheDir<-", signature = c("x"), function(x, value) 62 | standardGeneric("cacheDir<-")) 63 | 64 | #' @rdname MgnifyClient-accessors 65 | #' @export 66 | setGeneric( 67 | "showWarnings<-", signature = c("x"), function(x, value) 68 | standardGeneric("showWarnings<-")) 69 | 70 | #' @rdname MgnifyClient-accessors 71 | #' @export 72 | setGeneric( 73 | "clearCache<-", signature = c("x"), function(x, value) 74 | standardGeneric("clearCache<-")) 75 | 76 | #' @rdname MgnifyClient-accessors 77 | #' @export 78 | setGeneric( 79 | "verbose<-", signature = c("x"), function(x, value) 80 | standardGeneric("verbose<-")) 81 | 82 | #' 
@rdname doQuery 83 | #' @export 84 | setGeneric( 85 | "doQuery", signature = c("x"), function(x, ...) standardGeneric("doQuery")) 86 | 87 | #' @rdname getFile 88 | #' @export 89 | setGeneric( 90 | "getFile", signature = c("x"), function(x, ...) standardGeneric("getFile")) 91 | 92 | #' @rdname getFile 93 | #' @export 94 | setGeneric( 95 | "searchFile", signature = c("x"), function(x, ...) 96 | standardGeneric("searchFile")) 97 | 98 | #' @rdname getMetadata 99 | #' @export 100 | setGeneric( 101 | "getMetadata", signature = c("x"), function(x, ...) 102 | standardGeneric("getMetadata")) 103 | 104 | #' @rdname getResult 105 | #' @export 106 | setGeneric( 107 | "getResult", signature = c("x"), function(x, ...) 108 | standardGeneric("getResult")) 109 | 110 | #' @rdname getData 111 | #' @export 112 | setGeneric( 113 | "getData", signature = c("x"), function(x, ...) 114 | standardGeneric("getData")) 115 | 116 | #' @rdname searchAnalysis 117 | #' @export 118 | setGeneric( 119 | "searchAnalysis", signature = c("x"), function(x, ...) 120 | standardGeneric("searchAnalysis")) 121 | -------------------------------------------------------------------------------- /man/MgnifyClient.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/MgnifyClient.R, R/AllClasses.R 3 | \docType{class} 4 | \name{MgnifyClient} 5 | \alias{MgnifyClient} 6 | \alias{MgnifyClient-class} 7 | \title{Constructor for creating a MgnifyClient object to allow the access to 8 | MGnify database.} 9 | \usage{ 10 | MgnifyClient( 11 | username = NULL, 12 | password = NULL, 13 | useCache = FALSE, 14 | cacheDir = tempdir(), 15 | showWarnings = FALSE, 16 | verbose = TRUE, 17 | clearCache = FALSE, 18 | ... 19 | ) 20 | } 21 | \arguments{ 22 | \item{username}{A single character value specifying an optional username for 23 | authentication. (By default: \code{username = NULL})} 24 | 25 | \item{password}{A single character value specifying an optional password for 26 | authentication. (By default: \code{password = NULL})} 27 | 28 | \item{useCache}{A single boolean value specifying whether to enable on-disk 29 | caching of results during this session. In most use cases should be TRUE. 30 | (By default: \code{useCache = FALSE})} 31 | 32 | \item{cacheDir}{A single character value specifying a folder to contain the 33 | local cache. Note that cached files are persistent, so the cache directory 34 | may be reused between sessions, taking advantage of previously downloaded 35 | results. The directory will be created if it doesn't exist already. 36 | (By default: \code{cacheDir = tempdir()})} 37 | 38 | \item{showWarnings}{A single boolean value specifying whether to print 39 | warnings during invocation of some MGnifyR functions. 40 | (By default: \code{showWarnings = FALSE})} 41 | 42 | \item{verbose}{A single boolean value specifying whether to print extra 43 | output during invocation of some MGnifyR functions. 44 | (By default: \code{verbose = FALSE})} 45 | 46 | \item{clearCache}{A single boolean value specifying whether to clear the 47 | cache. (By default: \code{clearCache = FALSE})} 48 | 49 | \item{...}{optional arguments: 50 | \itemize{ 51 | \item \strong{url} A single character value specifying an url address of 52 | the database. (By default: 53 | \code{url = "https://www.ebi.ac.uk/metagenomics/api/v1"}) 54 | }} 55 | } 56 | \value{ 57 | A MgnifyClient object. 
58 | } 59 | \description{ 60 | Constructor for creating a MgnifyClient object to allow the access to 61 | MGnify database. 62 | 63 | A MgnifyClient object 64 | } 65 | \details{ 66 | All functions in the MGnifyR package take a \code{MgnifyClient} object as 67 | their first argument. The object allows the simple handling of both user 68 | authentication and access to private data, and manages general options for 69 | querying the MGnify database. 70 | 71 | An object that are required by functions of MGnifyR package. 72 | } 73 | \section{Slots}{ 74 | 75 | \describe{ 76 | \item{\code{databaseUrl}}{A single character value specifying an URL address of 77 | database.} 78 | 79 | \item{\code{authTok}}{A single character value specifying authentication token.} 80 | 81 | \item{\code{useCache}}{A single boolean value specifying whether to use cache.} 82 | 83 | \item{\code{cacheDir}}{A single character value specifying cache directory.} 84 | 85 | \item{\code{showWarnings}}{A single boolean value specifying whether to show 86 | warnings.} 87 | 88 | \item{\code{clearCache}}{A single boolean value specifying whether to clear cache.} 89 | 90 | \item{\code{verbose}}{A single boolean value specifying whether to show messages.} 91 | }} 92 | 93 | \section{Constructor}{ 94 | 95 | See \code{\link{MgnifyClient}} for constructor. 96 | } 97 | 98 | \section{Accessor}{ 99 | 100 | See \code{\link{MgnifyClient-accessors}} for accessor functions. 101 | } 102 | 103 | \examples{ 104 | my_client <- MgnifyClient( 105 | useCache = TRUE, cacheDir = "/scratch/MGnify_cache_location" 106 | ) 107 | 108 | \dontrun{ 109 | # Use username and password to get access to non-public data 110 | my_client <- MgnifyClient( 111 | username = "Webin-1122334", password = "SecretPassword", 112 | useCache = TRUE, cacheDir = "/scratch/MGnify_cache_location" 113 | ) 114 | } 115 | 116 | } 117 | -------------------------------------------------------------------------------- /R/AllAccessors.R: -------------------------------------------------------------------------------- 1 | #' MgnifyClient accessors and mutators 2 | #' 3 | #' @details 4 | #' These functions are for fetching and mutating slots of 5 | #' \code{MgnifyClient} object. 6 | #' 7 | #' @param x A \code{MgnifyClient} object. 8 | #' 9 | #' @param value A value to be added to a certain slot. 10 | #' 11 | #' @return A value of MgnifyClient object or nothing. 
12 | #' 13 | #' @examples 14 | #' mg <- MgnifyClient() 15 | #' 16 | #' databaseUrl(mg) 17 | #' showWarnings(mg) <- FALSE 18 | #' 19 | #' @name MgnifyClient-accessors 20 | NULL 21 | 22 | #' @rdname MgnifyClient-accessors 23 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R 24 | #' @export 25 | setMethod( 26 | "databaseUrl", signature = c(x = "MgnifyClient"), 27 | function(x){ x@databaseUrl }) 28 | 29 | #' @rdname MgnifyClient-accessors 30 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R 31 | #' @export 32 | setMethod( 33 | "authTok", signature = c(x = "MgnifyClient"), function(x){ x@authTok }) 34 | 35 | #' @rdname MgnifyClient-accessors 36 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R 37 | #' @export 38 | setMethod( 39 | "useCache", signature = c(x = "MgnifyClient"), 40 | function(x){ x@useCache }) 41 | 42 | #' @rdname MgnifyClient-accessors 43 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R 44 | #' @export 45 | setMethod( 46 | "cacheDir", signature = c(x = "MgnifyClient"), function(x){ x@cacheDir }) 47 | 48 | #' @rdname MgnifyClient-accessors 49 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R 50 | #' @export 51 | setMethod( 52 | "showWarnings", signature = c(x = "MgnifyClient"), 53 | function(x){ x@showWarnings }) 54 | 55 | #' @rdname MgnifyClient-accessors 56 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R 57 | #' @export 58 | setMethod( 59 | "clearCache", signature = c(x = "MgnifyClient"), 60 | function(x){ x@clearCache }) 61 | 62 | #' @rdname MgnifyClient-accessors 63 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R 64 | #' @export 65 | setMethod( 66 | "verbose", signature = c(x = "MgnifyClient"), 67 | function(x){ x@verbose }) 68 | 69 | #' @rdname MgnifyClient-accessors 70 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R 71 | #' @export 72 | setMethod( 73 | "databaseUrl<-", signature = c(x = "MgnifyClient"), 74 | function(x, value){ BiocGenerics:::replaceSlots(x, databaseUrl = value) }) 75 | 76 | #' @rdname MgnifyClient-accessors 77 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R 78 | #' @export 79 | setMethod( 80 | "authTok<-", signature = c(x = "MgnifyClient"), 81 | function(x, value){ BiocGenerics:::replaceSlots(x, authTok = value) }) 82 | 83 | #' @rdname MgnifyClient-accessors 84 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R 85 | #' @export 86 | setMethod( 87 | "useCache<-", signature = c(x = "MgnifyClient"), 88 | function(x, value){ BiocGenerics:::replaceSlots(x, useCache = value) }) 89 | 90 | #' @rdname MgnifyClient-accessors 91 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R 92 | #' @export 93 | setMethod( 94 | "cacheDir<-", signature = c(x = "MgnifyClient"), 95 | function(x, value){ BiocGenerics:::replaceSlots(x, cacheDir = value) }) 96 | 97 | #' @rdname MgnifyClient-accessors 98 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R 99 | #' @export 100 | setMethod( 101 | "showWarnings<-", signature = c(x = "MgnifyClient"), 102 | function(x, value){ BiocGenerics:::replaceSlots(x, showWarnings = value) }) 103 | 104 | #' @rdname MgnifyClient-accessors 105 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R 106 | #' @export 107 | setMethod( 108 | "clearCache<-", signature = c(x = "MgnifyClient"), 109 | function(x, value){ BiocGenerics:::replaceSlots(x, clearCache = value) }) 110 | 111 | #' @rdname MgnifyClient-accessors 112 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R 
113 | #' @export 114 | setMethod( 115 | "verbose<-", signature = c(x = "MgnifyClient"), 116 | function(x, value){ BiocGenerics:::replaceSlots(x, verbose = value) }) 117 | -------------------------------------------------------------------------------- /man/getFile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllGenerics.R, R/getFile.R 3 | \name{getFile} 4 | \alias{getFile} 5 | \alias{searchFile} 6 | \alias{getFile,MgnifyClient-method} 7 | \alias{searchFile,MgnifyClient-method} 8 | \title{Download any MGnify files, also including processed reads and 9 | identified protein sequences} 10 | \usage{ 11 | getFile(x, ...) 12 | 13 | searchFile(x, ...) 14 | 15 | \S4method{getFile}{MgnifyClient}(x, url, file = NULL, read.func = NULL, ...) 16 | 17 | \S4method{searchFile}{MgnifyClient}( 18 | x, 19 | accession, 20 | type = c("studies", "samples", "analyses", "assemblies", "genomes", "run"), 21 | ... 22 | ) 23 | } 24 | \arguments{ 25 | \item{x}{A \code{MgnifyClient} object.} 26 | 27 | \item{...}{Additional arguments; not used currently.} 28 | 29 | \item{url}{A single character value specifying the url address of the file 30 | we wish to download.} 31 | 32 | \item{file}{A single character value or NULL specifying an 33 | optional local filename to use for saving the file. If \code{NULL}, 34 | MGNify local cache settings will be used. If the file is intended to be 35 | processed in a separate program, it may be sensible to provide a 36 | meaningful \code{file}, rather than having to hunt through the 37 | cache folders. If \code{file} is \code{NULL} and \code{useCache(client)} 38 | is \code{FALSE}, the \code{read.func} parameter must be supplied or the 39 | file will be downloaded and then deleted. 40 | (By default: \code{file = NULL})} 41 | 42 | \item{read.func}{A function specifying an optional function to process the 43 | downloaded file and return the results, rather than relying on post 44 | processing. The primary use-case for this parameter is when local disk 45 | space is limited and downloaded files can be quickly processed and 46 | discarded. The function should take a single parameter, the downloaded 47 | filename, and may return any valid R object. 48 | (By default: \code{read.func = NULL})} 49 | 50 | \item{accession}{A single character value or a vector of character values 51 | specifying accession IDs to return results for.} 52 | 53 | \item{type}{A single character value specifying the type of objects to 54 | query. Must be one of the following options: \code{analysis}, \code{samples}, 55 | \code{studies}, \code{assembly}, \code{genome} or \code{run}. 56 | (By default: \code{type = "samples"})} 57 | } 58 | \value{ 59 | For \code{getFile()}, either the local filename of the downloaded 60 | file, be it either the location in the MGNifyR cache or file. If 61 | \code{read.func} is used, its result will be returned. 62 | 63 | For \code{searchFile()} \code{data.frame} containing all discovered 64 | downloads. 
If multiple \code{accessions} are queried, the \code{accessions}
65 | column may be used to filter the results, since rownames are not set (and
66 | would not make sense, as each query can return multiple items).
67 | }
68 | \description{
69 | Download any MGnify files, also including processed reads and
70 | identified protein sequences
71 |
72 | Listing files available for download
73 | }
74 | \details{
75 | \code{getFile} is a convenient wrapper around the generic URL
76 | downloading functionality in R, taking care of things like local
77 | caching and authentication.
78 |
79 | \code{searchFile()} is a wrapper function allowing easy
80 | enumeration of the downloads available for given accession IDs.
81 | It returns a single data.frame containing all available downloads and
82 | associated metadata, including the URL location and description. This can
83 | then be filtered to extract the URLs of interest, before actually
84 | retrieving the files using \code{getFile()}.
85 | }
86 | \examples{
87 | # Make a client object
88 | mg <- MgnifyClient(useCache = FALSE)
89 |
90 | # Create a vector of accession ids - these happen to be \code{analysis}
91 | # accessions
92 | accession_vect <- c("MGYA00563876", "MGYA00563877")
93 | downloads <- searchFile(mg, accession_vect, "analyses")
94 |
95 | # Filter to find the urls of 16S encoding sequences
96 | url_list <- downloads[
97 |     downloads$attributes.description.label == "Contigs encoding SSU rRNA",
98 |     "download_url"]
99 |
100 | # Example 1:
101 | # Download the first file
102 | supplied_filename <- getFile(
103 |     mg, url_list[[1]], file="SSU_file.fasta.gz")
104 |
105 | \dontrun{
106 | # Example 2:
107 | # Just use local caching
108 | cached_filename <- getFile(mg, url_list[[2]])
109 |
110 | # Example 3:
111 | # Using read.func to open the reads with readDNAStringSet from
112 | # \code{Biostrings}, without retaining the file on disk
113 | dna_seqs <- getFile(
114 |     mg, url_list[[3]], read.func = readDNAStringSet)
115 | }
116 |
117 | # Make a client object
118 | mg <- MgnifyClient(useCache = TRUE)
119 | # Create a vector of accession ids - these happen to be \code{analysis}
120 | # accessions
121 | accession_vect <- c(
122 |     "MGYA00563876", "MGYA00563877", "MGYA00563878",
123 |     "MGYA00563879", "MGYA00563880" )
124 | downloads <- searchFile(mg, accession_vect, "analyses")
125 |
126 | }
127 |
--------------------------------------------------------------------------------
/R/deprecate.R:
--------------------------------------------------------------------------------
1 | #' These functions will be deprecated. Please use other functions instead.
2 | #'
3 | #' @param url -
4 | #'
5 | #' @param username -
6 | #'
7 | #' @param password -
8 | #'
9 | #' @param usecache -
10 | #'
11 | #' @param cache_dir -
12 | #'
13 | #' @param warnings -
14 | #'
15 | #' @param use_memcache -
16 | #'
17 | #' @param client -
18 | #'
19 | #' @param qtype -
20 | #'
21 | #' @param accession -
22 | #'
23 | #' @param asDataFrame -
24 | #'
25 | #' @param maxhits -
26 | #'
27 | #' @param ...
- 28 | #' 29 | #' @param accessions - 30 | #' 31 | #' @param accession_type - 32 | #' 33 | #' @param file - 34 | #' 35 | #' @param read_func - 36 | #' 37 | #' @param Debug - 38 | #' 39 | #' @param retrievelist - 40 | #' 41 | #' @param compact_results - 42 | #' 43 | #' @param bulk_dl - 44 | #' 45 | #' @param returnLists - 46 | #' 47 | #' @param tax_SU - 48 | #' 49 | #' @param get_tree - 50 | #' 51 | #' @param path - 52 | #' 53 | #' @param complete_url - 54 | #' 55 | #' @param qopts - 56 | #' 57 | #' @return - 58 | #' 59 | #' @name deprecate 60 | NULL 61 | 62 | #' @rdname deprecate 63 | #' @export 64 | mgnify_client <- function( 65 | username = NULL, password = NULL, usecache = FALSE, 66 | cache_dir = NULL, warnings = FALSE, use_memcache = FALSE, ...){ 67 | .Deprecated("MgnifyClient") 68 | MgnifyClient( 69 | username = username, password = password, 70 | useCache = usecache, cacheDir = cache_dir, warnings = warnings, 71 | use.mem.cache = use_memcache, ...) 72 | } 73 | 74 | #' @rdname deprecate 75 | #' @export 76 | mgnify_query <- function( 77 | client, qtype = "samples", accession = NULL, asDataFrame = TRUE, 78 | maxhits = 200, usecache = FALSE, ...){ 79 | .Deprecated("doQuery") 80 | doQuery( 81 | x = client, type = qtype, accession = accession, 82 | as.df = asDataFrame, max.hits = maxhits, usecache = usecache, ...) 83 | } 84 | 85 | #' @rdname deprecate 86 | #' @export 87 | mgnify_analyses_from_samples <- function( 88 | client, accession, usecache = TRUE, ...){ 89 | .Deprecated("searchAnalysis") 90 | searchAnalysis( 91 | x = client, type = "samples", accession = accession, 92 | use.cache = usecache, ...) 93 | } 94 | 95 | #' @rdname deprecate 96 | #' @export 97 | mgnify_analyses_from_studies <- function( 98 | client, accession, usecache = TRUE, ...){ 99 | .Deprecated("searchAnalysis") 100 | searchAnalysis( 101 | x = client, type = "studies", accession = accession, 102 | use.cache = usecache, ...) 103 | } 104 | 105 | #' @rdname deprecate 106 | #' @export 107 | mgnify_get_download_urls <- function( 108 | client, accessions, accession_type, usecache = TRUE, ...){ 109 | .Deprecated("searchFile") 110 | searchFile( 111 | x = client, accession = accessions, type = accession_type, 112 | use.cache = usecache, ...) 113 | } 114 | 115 | #' @rdname deprecate 116 | #' @export 117 | mgnify_download <- function( 118 | client, url, file = NULL, read_func = NULL, usecache = TRUE, 119 | Debug = FALSE, ...){ 120 | .Deprecated("getFile") 121 | getFile( 122 | x = client, url = url, file = file, 123 | read.func = read_func, use.cache = usecache, ...) 124 | } 125 | 126 | #' @rdname deprecate 127 | #' @export 128 | mgnify_get_analyses_results <- function( 129 | client = NULL, accessions, retrievelist = c(), compact_results = TRUE, 130 | usecache = TRUE, bulk_dl = FALSE, ...){ 131 | .Deprecated("getResult") 132 | if( length(retrievelist) == 0 ){ 133 | retrievelist <- FALSE 134 | } 135 | getResult( 136 | x = client, accession = accessions, get.taxa = FALSE, 137 | get.func = retrievelist, output = "list", usecache = TRUE, 138 | as.df = compact_results, ...) 
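    # Migration sketch (illustrative only; `mg` and `accs` are placeholders
    # for a MgnifyClient object and a vector of analysis accessions, and the
    # argument mapping follows the wrapper body above):
    #   old: mgnify_get_analyses_results(mg, accs, retrievelist = c("go-slim"))
    #   new: getResult(mg, accs, get.func = c("go-slim"), get.taxa = FALSE,
    #                  output = "list")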
139 | } 140 | 141 | #' @rdname deprecate 142 | #' @export 143 | mgnify_get_analyses_phyloseq <- function( 144 | client = NULL, accessions, usecache = TRUE, returnLists = FALSE, 145 | tax_SU = "SSU", get_tree = FALSE, ...){ 146 | .Deprecated("getResult") 147 | output <- ifelse(returnLists, "list", "phyloseq") 148 | getResult( 149 | x = client, accession = accessions, get.taxa = TRUE, get.func = FALSE, 150 | output = output, use.cache = usecache, tax.su = tax_SU, 151 | get.tree = get_tree, ... 152 | ) 153 | } 154 | 155 | #' @rdname deprecate 156 | #' @export 157 | mgnify_get_analyses_metadata <- function( 158 | client, accessions, usecache = TRUE, ...){ 159 | .Deprecated("getMetadata") 160 | getMetadata(x = client, accession = accessions, usecache = usecache, ...) 161 | } 162 | 163 | #' @rdname deprecate 164 | #' @export 165 | mgnify_retrieve_json <- function( 166 | client, path = "biomes", complete_url = NULL, qopts = NULL, 167 | maxhits = 200, usecache = FALSE, Debug = FALSE, ...){ 168 | .Deprecated(msg = "'mgnify_retrieve_json' is deprecated.\n", 169 | "See other functions and use them instead.\n", 170 | "See help('Deprecated')") 171 | return(NULL) 172 | } 173 | -------------------------------------------------------------------------------- /man/doQuery.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllGenerics.R, R/doQuery.R 3 | \name{doQuery} 4 | \alias{doQuery} 5 | \alias{doQuery,MgnifyClient-method} 6 | \title{Search MGnify database for studies, samples, runs, analyses, biomes, 7 | assemblies, and genomes.} 8 | \usage{ 9 | doQuery(x, ...) 10 | 11 | \S4method{doQuery}{MgnifyClient}( 12 | x, 13 | type = "studies", 14 | accession = NULL, 15 | as.df = TRUE, 16 | max.hits = 200, 17 | ... 18 | ) 19 | } 20 | \arguments{ 21 | \item{x}{A \code{MgnifyClient} object.} 22 | 23 | \item{...}{Remaining parameter key/value pairs may be supplied to filter 24 | the returned values. Available options differ between \code{types}. 25 | See discussion Details section for details.} 26 | 27 | \item{type}{A single character value specifying the type of objects to 28 | query. Must be one of the following options: \code{studies}, \code{samples}, 29 | \code{runs}, \code{analyses}, \code{biomes}, \code{assemblies}, 30 | \code{super-studies}, \code{experiment-types}, \code{pipelines}, 31 | \code{pipeline-tools}, \code{publications}, \code{genomes}, 32 | \code{genome-search}, \code{genome-search/gather}, \code{genome-catalogues}, 33 | \code{genomeset}, \code{cogs}, \code{kegg-modules}, \code{kegg-classes}, 34 | \code{antismash-geneclusters}, \code{annotations/go-terms}, 35 | \code{annotations/interpro-identifiers}, \code{annotations/kegg-modules}, 36 | \code{annotations/pfam-entries}, \code{annotations/kegg-orthologs}, 37 | \code{annotations/genome-properties}, 38 | \code{annotations/antismash-gene-clusters}, \code{annotations/organisms}, or 39 | \code{mydata}. 40 | (By default: \code{type = "studies"})} 41 | 42 | \item{accession}{A single character value or a vector of character values 43 | specifying MGnify accession identifiers (of type \code{type}) or NULL. When 44 | NULL, all results defined by other parameters are retrieved. 45 | (By default: \code{accession = NULL})} 46 | 47 | \item{as.df}{A single boolean value specifying whether to return the 48 | results as a data.frame or leave as a nested list. In most cases, 49 | \code{as.df = TRUE} will make the most sense. 
50 | (By default: \code{as.df = TRUE})}
51 |
52 | \item{max.hits}{A single integer value specifying the maximum number of
53 | results to return, or FALSE. The actual number of returned results may be
54 | higher than \code{max.hits}, as clipping only occurs on pagination page
55 | boundaries. To disable the limit, set \code{max.hits = NULL}.
56 | (By default: \code{max.hits = 200})}
57 | }
58 | \value{
59 | A nested list or data.frame containing the results of the query.
60 | }
61 | \description{
62 | Search MGnify database for studies, samples, runs, analyses, biomes,
63 | assemblies, and genomes.
64 | }
65 | \details{
66 | \code{doQuery} is a flexible query function, harnessing the "full"
67 | power of the JSONAPI MGnify search filters. Search results may be filtered
68 | by metadata value, associated study/sample/analysis, etc.
69 |
70 | See the \href{https://www.ebi.ac.uk/metagenomics/api/v1/}{API browser} for
71 | information on MGnify database filters.
72 | You can find help on customizing queries
73 | \href{https://emg-docs.readthedocs.io/en/latest/api.html#customising-queries}{here}.
74 |
75 | For example, the following filters are available:
76 | \itemize{
77 | \item{\strong{studies}: accession, biome_name, lineage, centre_name,
78 | include}
79 | \item{\strong{samples}: accession, experiment_type, biome_name,
80 | lineage, geo_loc_name, latitude_gte, latitude_lte,
81 | longitude_gte, longitude_lte, species, instrument_model,
82 | instrument_platform, metadata_key, metadata_value_gte,
83 | metadata_value_lte, metadata_value, environment_material,
84 | environment_feature, study_accession, include}
85 | \item{\strong{runs}: accession, experiment_type, biome_name, lineage,
86 | species, instrument_platform, instrument_model, metadata_key,
87 | metadata_value_gte, metadata_value_lte, metadata_value, sample_accession,
88 | study_accession, include}
89 | \item{\strong{analyses}: biome_name, lineage, experiment_type, species,
90 | sample_accession, pipeline_version}
91 | \item{\strong{biomes}: depth_gte, depth_lte}
92 | \item{\strong{assemblies}: depth_gte, depth_lte}
93 | }
94 | Unfortunately, some of these filters do not appear to work as expected in
95 | all cases, so it is important to check that the returned results match
96 | what is expected. Worse, if there is an error in the parameter
97 | specification, the query will run as if no filter parameters were present
98 | at all. The result will then appear superficially correct but will in fact
99 | correspond to something completely different. This behaviour will hopefully
100 | be fixed in future versions of MGnifyR or the JSON API, but for now users
101 | should double-check the returned values.
102 |
103 | It is currently not possible to combine queries of the same type in a single
104 | call (for example, to search for samples \emph{between} two latitudes).
105 | However, it is possible to run multiple queries and combine the results
106 | using set operations in R to get the desired behaviour.
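As a minimal sketch of that approach (assuming, as in the examples below, that
the returned \code{data.frame} exposes the accession IDs in an
\code{accession} column):
\preformatted{
mg <- MgnifyClient(useCache = FALSE)
# Two one-sided latitude queries...
lat_min <- doQuery(mg, "samples", latitude_gte = 30, max.hits = 200)
lat_max <- doQuery(mg, "samples", latitude_lte = 60, max.hits = 200)
# ...combined with a set operation to keep samples between latitudes 30 and 60
mid_lat <- intersect(lat_min$accession, lat_max$accession)
}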
107 | } 108 | \examples{ 109 | mg <- MgnifyClient(useCache = FALSE) 110 | 111 | # Get a list of studies from the Agricultural Wastewater : 112 | agwaste_studies <- doQuery( 113 | mg, "studies", biome_name="Agricultural wastewater" 114 | ) 115 | 116 | \dontrun{ 117 | # Get all samples from a particular study 118 | samps <- doQuery(mg, "samples", accession="MGYS00004521") 119 | 120 | # Search polar samples 121 | samps_np <- doQuery(mg, "samples", latitude_gte=66, max.hits=10) 122 | samps_sp <- doQuery(mg, "samples", latitude_lte=-66, max.hits=10) 123 | 124 | # Search studies that have studied drinking water 125 | tbl <- doQuery( 126 | mg, 127 | type = "studies", 128 | biome_name = "root:Environmental:Aquatic:Freshwater:Drinking water", 129 | max.hits = 10) 130 | } 131 | 132 | } 133 | -------------------------------------------------------------------------------- /man/getResult.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllGenerics.R, R/getResult.R 3 | \name{getResult} 4 | \alias{getResult} 5 | \alias{getResult,MgnifyClient-method} 6 | \title{Get microbial and/or functional profiling data for a list of accessions} 7 | \usage{ 8 | getResult(x, ...) 9 | 10 | \S4method{getResult}{MgnifyClient}( 11 | x, 12 | accession, 13 | get.taxa = TRUE, 14 | get.func = TRUE, 15 | output = "TreeSE", 16 | ... 17 | ) 18 | } 19 | \arguments{ 20 | \item{x}{A \code{MgnifyClient} object.} 21 | 22 | \item{...}{optional arguments: 23 | \itemize{ 24 | 25 | \item \strong{taxa.su} A single character value specifying which taxa 26 | subunit results should be selected. Currently, taxonomy assignments in the 27 | MGnify pipelines rely on rRNA matches to existing databases 28 | (GreenGenes and SILVA), with later pipelines checking both the SSU and 29 | LSU portions of the rRNA sequence. \code{taxa.su} allows then selection 30 | of either the Small subunit (\code{"SSU"}) or Large subunit (\code{"LSU"}) 31 | results in the final \code{TreeSummarizedExperiment} object. Older pipeline 32 | versions do not report results for both subunits, and thus for some 33 | accessions this value will have no effect. 34 | 35 | \item \strong{get.tree} A single boolean value specifying whether to 36 | include available phylogenetic trees in the \code{TreeSummarizedExperiment} 37 | object. Available when \code{get.taxa = TRUE}. 38 | (By default: \code{get.tree = TRUE}) 39 | 40 | \item \strong{as.df} A single boolean value enabled when 41 | \code{output = "list"}. The argument specifies whether return functional 42 | data as a named list (one entry per element in the output list) of 43 | data.frames, with each data.frame containing results for all requested 44 | accessions. If \code{FALSE}, the function returns a list of lists, each 45 | element consisting of results for a single accession. (By default: 46 | \code{as.df = TRUE}) 47 | 48 | \item \strong{bulk.dl} A single boolean value specifying should 49 | MGnifyR attempt to speed things up by downloading 50 | relevant studies TSV results and only extracting the required columns, 51 | rather than using the JSONAPI interface. When getting results where 52 | multiple accessions share the same study, this option may result in 53 | significantly faster processing. However, there appear to be (quite a few) 54 | cases in the database where the TSV result columns do NOT match the 55 | expected accession names. 
This will hopefully be fixed in the future,
56 | but for now \code{bulk.dl} defaults to TRUE. When it does work, it can
57 | be orders of magnitude more efficient.
58 | (By default: \code{bulk.dl = TRUE})
59 |
60 | }}
61 |
62 | \item{accession}{A single character value or a vector of character values
63 | specifying accession IDs to return results for.}
64 |
65 | \item{get.taxa}{A boolean value specifying whether to retrieve taxonomy
66 | data (OTU table). See \code{taxa.su} for specifying the taxonomy type. The
67 | data is retrieved as BIOM files which are subsequently parsed.
68 | (By default: \code{get.taxa = TRUE})}
69 |
70 | \item{get.func}{A boolean value, a single character value, or a vector of
71 | character values specifying functional analysis types to retrieve. If
72 | \code{get.func = TRUE}, all available functional datatypes are retrieved,
73 | and if \code{FALSE}, functional data is not retrieved. The current list of
74 | available types is \code{"antismash-gene-clusters"}, \code{"go-slim"},
75 | \code{"go-terms"}, \code{"interpro-identifiers"}, \code{"taxonomy"},
76 | \code{"taxonomy-itsonedb"}, \code{"taxonomy-itsunite"}, \code{"taxonomy-lsu"},
77 | and \code{"taxonomy-ssu"}. Note that depending on the particular analysis
78 | type, pipeline version etc., not all functional results will be available.
79 | Furthermore, taxonomy is also available via \code{get.func}, and loading
80 | the data might be considerably faster if \code{bulk.dl = TRUE}. However,
81 | phylogeny is available only via \code{get.taxa}.
82 | (By default: \code{get.func = TRUE})}
83 |
84 | \item{output}{A single character value specifying the format of the output.
85 | Must be one of the following options: \code{"TreeSE"}, \code{"list"}, or
86 | \code{"phyloseq"}. (By default: \code{output = "TreeSE"})}
87 | }
88 | \value{
89 | If only taxonomy data is retrieved, the result is returned as a
90 | \code{TreeSummarizedExperiment} object by default. The result can also be
91 | returned as a \code{phyloseq} object or as a list of \code{data.frames}.
92 | Note that a \code{phyloseq} object can include only one phylogenetic tree,
93 | meaning that some taxa might be lost when the data is subsetted by the tree.
94 |
95 | When functional data is retrieved in addition to taxonomy data, the result
96 | is returned as a \code{MultiAssayExperiment} object. Other options are a list
97 | containing a \code{phyloseq} object and \code{data.frames}, or just
98 | \code{data.frames}.
99 |
100 | Functional data can be returned as a \code{MultiAssayExperiment} object or
101 | as a list of \code{data.frames}.
102 | }
103 | \description{
104 | Get microbial and/or functional profiling data for a list of accessions
105 | }
106 | \details{
107 | Given a set of analysis accessions and a collection of annotation types,
108 | the function queries the MGnify API and returns the results. This function
109 | is convenient for retrieving highly structured (analysis vs counts) data in
110 | certain cases. For example, BIOM files are downloaded automatically.
111 | If you just want to retrieve raw data from the database, see \code{getData}.
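As a rough sketch of that division of labour (the accession IDs are taken
from the package examples):
\preformatted{
mg <- MgnifyClient(useCache = FALSE)
# Structured microbial profiles, returned as a TreeSummarizedExperiment
tse <- getResult(mg, "MGYA00377505", get.taxa = TRUE, get.func = FALSE)
# Raw, loosely structured records fetched directly from the API
kegg <- getData(
    mg, type = "kegg-modules",
    accession = "MGYA00642773", accession.type = "analyses")
}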
112 | } 113 | \examples{ 114 | # Create a client object 115 | mg <- MgnifyClient(useCache = FALSE) 116 | 117 | # Get OTU tables as TreeSE 118 | accession_list <- c("MGYA00377505") 119 | tse <- getResult(mg, accession_list, get.func=FALSE, get.taxa=TRUE) 120 | 121 | \dontrun{ 122 | # Get functional data along with OTU tables as MAE 123 | mae <- getResult(mg, accession_list, get.func=TRUE, get.taxa=TRUE) 124 | 125 | # Get same data as list 126 | list <- getResult( 127 | mg, accession_list, get.func=TRUE, get.taxa=TRUE, output = "list", 128 | as.df = TRUE, use.cache = TRUE) 129 | } 130 | 131 | } 132 | \seealso{ 133 | \code{\link[MGnifyR:getData]{getData}} 134 | } 135 | -------------------------------------------------------------------------------- /R/getMetadata.R: -------------------------------------------------------------------------------- 1 | #' Get all study, sample and analysis metadata for the supplied analysis 2 | #' accessions 3 | #' 4 | #' @details 5 | #' The function retrieves all study, sample and analysis metadata associated 6 | #' with provided analysis accessions. 7 | #' 8 | #' @param x A \code{MgnifyClient} object. 9 | #' 10 | #' @param accession A single character value or a vector of analysis accession 11 | #' IDs specifying accessions to retrieve data for. 12 | #' 13 | #' @param ... Optional arguments; not currently used. 14 | #' 15 | #' @return A \code{data.frame} containing metadata for each analysis in the 16 | #' \code{accession} list. Each row represents a single analysis. 17 | #' 18 | #' @examples 19 | #' # Create a client object 20 | #' mg <- MgnifyClient(useCache = FALSE) 21 | #' 22 | #' # Download all associated study/sample and analysis metadata 23 | #' accession_list <- c("MGYA00377505") 24 | #' meta_dataframe <- getMetadata(mg, accession_list) 25 | #' 26 | #' @name getMetadata 27 | NULL 28 | 29 | #' @rdname getMetadata 30 | #' @importFrom plyr llply 31 | #' @importFrom dplyr bind_rows 32 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R 33 | #' @export 34 | setMethod("getMetadata", signature = c(x = "MgnifyClient"), function( 35 | x, accession, ...){ 36 | ############################### INPUT CHECK ################################ 37 | if( !is.character(accession) ){ 38 | stop( 39 | "'accession' must be a single character or a list of character ", 40 | "values.", call. = FALSE) 41 | } 42 | ############################# INPUT CHECK END ############################## 43 | # Get metadata 44 | result <- .mgnify_get_analyses_metadata( 45 | client = x, accession = accession, ...) 46 | return(result) 47 | }) 48 | 49 | ################################ HELP FUNCTIONS ################################ 50 | 51 | # Fetch metadata based on analysis accessions. 52 | .mgnify_get_analyses_metadata <- function( 53 | client, accession, use.cache = useCache(client), 54 | show.messages = verbose(client), ...){ 55 | # Input check 56 | if( !.is_a_bool(use.cache) ){ 57 | stop( 58 | "'use.cache' must be a single boolean value specifying whether to ", 59 | "show progress.", call. = FALSE) 60 | } 61 | if( !.is_a_bool(show.messages) ){ 62 | stop( 63 | "'show.messages' must be a single boolean value.", call. 
= FALSE) 64 | } 65 | show.messages <- ifelse(show.messages, "text", "none") 66 | # 67 | # Give message about progress 68 | if( show.messages == "text" ){ 69 | message("Fetching metadata...") 70 | } 71 | # Loop through analysis accessions and find metadata 72 | reslist <- llply(as.list(accession), function(x){ 73 | .mgnify_get_single_analysis_metadata( 74 | client, x, use.cache = use.cache, ...) 75 | }, .progress = show.messages) 76 | # Combine all metadata to single df 77 | df <- do.call(bind_rows, reslist) 78 | return(df) 79 | } 80 | 81 | # Retrieves combined study/sample/analysis metadata - not exported 82 | .mgnify_get_single_analysis_metadata <- function( 83 | client, accession, use.cache = useCache(client), max.hits = NULL, ...){ 84 | # Input check 85 | if( !.is_a_bool(use.cache) ){ 86 | stop( 87 | "'use.cache' must be a single boolean value specifying whether to ", 88 | "show progress.", call. = FALSE) 89 | } 90 | # 91 | # Get data in json format 92 | dat <- .mgnify_retrieve_json( 93 | client, paste("analyses", accession, sep="/"), use.cache = use.cache, 94 | max.hits = max.hits, ...) 95 | # If metadata was not found, return the NULL value 96 | if(is.null(dat)){ 97 | warning( 98 | "\nFailed to find study metadata for ", accession, call. = FALSE) 99 | return(dat) 100 | } 101 | 102 | # There should be just a single result 103 | top_data <- dat[[1]] 104 | # Convert hit result to df 105 | analysis_df <- .mgnify_attr_list_to_df_row( 106 | top_data, metadata_key = "analysis-summary") 107 | 108 | # Build up the metadata dataframe from the analyses_metadata_headers vector: 109 | sample_met <- .mgnify_retrieve_json( 110 | client, complete_url = top_data$relationships$sample$links$related, 111 | use.cache = use.cache, ...) 112 | study_met <- .mgnify_retrieve_json( 113 | client, complete_url = top_data$relationships$study$links$related, 114 | use.cache = use.cache, ...) 115 | # Again, convert to df 116 | if(!is.null(sample_met)){ 117 | sample_df <- .mgnify_attr_list_to_df_row( 118 | sample_met[[1]], metadata_key = "sample-metadata") 119 | } else{ 120 | warning( 121 | "\nFailed to find sample metadata for ", accession, call. = FALSE) 122 | sample_df <- data.frame(accession=NA) 123 | } 124 | # It turns out that a sample might not be part of a study - if it's been 125 | # harvested... 126 | if(!is.null(study_met)){ 127 | study_df <- .mgnify_attr_list_to_df_row(study_met[[1]]) 128 | } else{ 129 | warning( 130 | "\nFailed to find study metadata for ", accession, call. 
= FALSE) 131 | study_df <- data.frame(accession=NA) 132 | } 133 | # Add colnames to sample, study and analysis tables 134 | colnames(sample_df) <- paste("sample", colnames(sample_df), sep="_") 135 | colnames(study_df) <- paste("study", colnames(study_df), sep="_") 136 | colnames(analysis_df) <- paste("analysis", colnames(analysis_df), sep="_") 137 | # Add what analysis corresponds what sample and study 138 | rownames(sample_df) <- rownames(analysis_df) 139 | rownames(study_df) <- rownames(analysis_df) 140 | # Combine sample and study result 141 | full_df <- cbind(analysis_df, study_df, sample_df) 142 | 143 | # Extras - include some more metadata from various places 144 | # Assembly accession 145 | if("id" %in% names(top_data$relationships$assembly$data)){ 146 | full_df$assembly_accession <- top_data$relationships$assembly$data$id 147 | } 148 | # Run accession 149 | if("id" %in% names(top_data$relationships$run$data)){ 150 | full_df$run_accession <- top_data$relationships$run$data$id 151 | } 152 | # biom (from the sample metadata) 153 | if( !is.null(sample_met[[1]]$relationships$biome$data$id) ){ 154 | full_df$biome_string <- sample_met[[1]]$relationships$biome$data$id 155 | } else { 156 | warning("\nFailed to find biome entry for ", accession, call = FALSE) 157 | } 158 | return(full_df) 159 | } 160 | -------------------------------------------------------------------------------- /R/MgnifyClient.R: -------------------------------------------------------------------------------- 1 | #' Constructor for creating a MgnifyClient object to allow the access to 2 | #' MGnify database. 3 | #' 4 | #' @details 5 | #' All functions in the MGnifyR package take a \code{MgnifyClient} object as 6 | #' their first argument. The object allows the simple handling of both user 7 | #' authentication and access to private data, and manages general options for 8 | #' querying the MGnify database. 9 | #' 10 | #' @param username A single character value specifying an optional username for 11 | #' authentication. (By default: \code{username = NULL}) 12 | #' 13 | #' @param password A single character value specifying an optional password for 14 | #' authentication. (By default: \code{password = NULL}) 15 | #' 16 | #' @param useCache A single boolean value specifying whether to enable on-disk 17 | #' caching of results during this session. In most use cases should be TRUE. 18 | #' (By default: \code{useCache = FALSE}) 19 | #' 20 | #' @param cacheDir A single character value specifying a folder to contain the 21 | #' local cache. Note that cached files are persistent, so the cache directory 22 | #' may be reused between sessions, taking advantage of previously downloaded 23 | #' results. The directory will be created if it doesn't exist already. 24 | #' (By default: \code{cacheDir = tempdir()}) 25 | #' 26 | #' @param showWarnings A single boolean value specifying whether to print 27 | #' warnings during invocation of some MGnifyR functions. 28 | #' (By default: \code{showWarnings = FALSE}) 29 | #' 30 | #' @param verbose A single boolean value specifying whether to print extra 31 | #' output during invocation of some MGnifyR functions. 32 | #' (By default: \code{verbose = FALSE}) 33 | #' 34 | #' @param clearCache A single boolean value specifying whether to clear the 35 | #' cache. (By default: \code{clearCache = FALSE}) 36 | #' 37 | #' @param ... optional arguments: 38 | #' \itemize{ 39 | #' \item \strong{url} A single character value specifying an url address of 40 | #' the database. 
(By default: 41 | #' \code{url = "https://www.ebi.ac.uk/metagenomics/api/v1"}) 42 | #' } 43 | #' 44 | #' @return A MgnifyClient object. 45 | #' 46 | #' @examples 47 | #' my_client <- MgnifyClient( 48 | #' useCache = TRUE, cacheDir = "/scratch/MGnify_cache_location" 49 | #' ) 50 | #' 51 | #' \dontrun{ 52 | #' # Use username and password to get access to non-public data 53 | #' my_client <- MgnifyClient( 54 | #' username = "Webin-1122334", password = "SecretPassword", 55 | #' useCache = TRUE, cacheDir = "/scratch/MGnify_cache_location" 56 | #' ) 57 | #'} 58 | #' 59 | #' @name MgnifyClient 60 | NULL 61 | 62 | #' @rdname MgnifyClient 63 | #' @importFrom methods new 64 | #' @export 65 | MgnifyClient <- function( 66 | username = NULL, password = NULL, useCache = FALSE, 67 | cacheDir = tempdir(), showWarnings = FALSE, verbose = TRUE, 68 | clearCache = FALSE, ...){ 69 | ############################### INPUT CHECK ################################ 70 | if( !(is.null(username) || .is_non_empty_string(username)) ){ 71 | stop( 72 | "'username' must be NULL or single character value specifying ", 73 | "the username.", call. = FALSE) 74 | } 75 | if( !(is.null(password) || .is_non_empty_string(password)) ){ 76 | stop( 77 | "'password' must be NULL or single character value specifying ", 78 | "the password.", call. = FALSE) 79 | } 80 | if( !.is_a_bool(useCache) ){ 81 | stop( 82 | "'useCache' must be a boolean value specifying whether to use ", 83 | "on-disk caching.", call. = FALSE) 84 | } 85 | if( !.is_non_empty_string(cacheDir) ){ 86 | stop( 87 | "'cacheDir' must be single character value specifying ", 88 | "the the directory for cache.", call. = FALSE) 89 | } 90 | if( !.is_a_bool(showWarnings) ){ 91 | stop( 92 | "'showWarnings' must be a boolean value specifying whether print ", 93 | "warnings during invocation of MGnifyR functions.", 94 | call. = FALSE) 95 | } 96 | if( !.is_a_bool(verbose) ){ 97 | stop( 98 | "'verbose' must be a boolean value specifying whether print ", 99 | "extra output during invocation of MGnifyR functions.", 100 | call. = FALSE) 101 | } 102 | if( !.is_a_bool(clearCache) ){ 103 | stop( 104 | "'clearCache' must be a boolean value specifying whether to ", 105 | "clear the cache.", call. = FALSE) 106 | } 107 | ############################# INPUT CHECK END ############################## 108 | # Get the url address 109 | url <- .get_url_address(...) 110 | # Authentication token is NA as default 111 | authtok <- NA_character_ 112 | # Check to see if we're going to try and get an authentication token: 113 | if (!is.null(username) && !is.null(password)){ 114 | # Fetch username vs password data from database 115 | r <- POST( 116 | paste(url, "utils/token/obtain", sep = "/"), 117 | body = list(username = username, password = password), 118 | encode = "json") 119 | # If the authentication was not successful, returned value do not 120 | # include data 121 | cont <- content(r, ...) 122 | if ("data" %in% names(cont)){ 123 | authtok <- cont$data$token 124 | } else{ 125 | stop("Failed to authenticate.", call. = FALSE) 126 | } 127 | } 128 | # Get the directory where cache will be stored. 129 | # If user has specified the subdirectory, ensure that it works in any 130 | # system by adding correct "/". 131 | cacheDir <- as.list(strsplit(cacheDir, "[/\\\\]")[[1]]) 132 | cacheDir <- do.call(file.path, cacheDir) 133 | # Add subdirectory. If user has specified for example working directory, 134 | # the directory would be full of files. This is unintentional. 
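    # For example (illustrative path only): cacheDir = "/scratch/MGnify_cache"
    # ends up as "/scratch/MGnify_cache/.MGnifyR_cache" after the line below.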
135 | cacheDir <- file.path(cacheDir, ".MGnifyR_cache") 136 | # Make it if needed - assume the user is sensible and the path will 137 | # work... 138 | if( useCache ){ 139 | dir.create(cacheDir, recursive = TRUE, showWarnings = FALSE) 140 | } 141 | # Return the final object 142 | obj <- new( 143 | "MgnifyClient", 144 | databaseUrl = url, 145 | authTok = authtok, 146 | useCache = useCache, 147 | cacheDir = cacheDir, 148 | showWarnings = showWarnings, 149 | clearCache = clearCache, 150 | verbose = verbose 151 | ) 152 | return(obj) 153 | } 154 | 155 | ################################ HELP FUNCTIONS ################################ 156 | 157 | # This function is just to remove url from main function's arguments. 158 | .get_url_address <- function( 159 | url = "https://www.ebi.ac.uk/metagenomics/api/v1", ...){ 160 | ############################### INPUT CHECK ################################ 161 | if( !(.is_non_empty_string(url)) ){ 162 | stop( 163 | "'url' must be a single character value specifying ", 164 | "the URL address.", call. = FALSE) 165 | } 166 | ############################# INPUT CHECK END ############################## 167 | return(url) 168 | } 169 | 170 | -------------------------------------------------------------------------------- /R/getData.R: -------------------------------------------------------------------------------- 1 | #' Versatile function to retrieve raw results 2 | #' 3 | #' @details 4 | #' This function returns data from MGnify database. Compared to 5 | #' \code{getResult}, this function allows more flexible framework for fetching 6 | #' the data. However, there are drawbacks: for counts data, \code{getResult} 7 | #' returns optimally structured data container which is easier for downstream 8 | #' analysis. \code{getData} returns raw data from the database. However, if 9 | #' you want to retrieve data on pipelines or publications, for instance, 10 | #' \code{getResult} is not suitable for it, and \code{getData} can be utilized 11 | #' instead. 12 | #' 13 | #' @param x A \code{MgnifyClient} object. 14 | #' 15 | #' @param type A single character value specifying the type of data retrieve. 16 | #' Must be one of the following options: \code{studies}, \code{samples}, 17 | #' \code{runs}, \code{analyses}, \code{biomes}, \code{assemblies}, 18 | #' \code{super-studies}, \code{experiment-types}, \code{pipelines}, 19 | #' \code{pipeline-tools}, \code{publications}, \code{genomes}, 20 | #' \code{genome-search}, \code{genome-search/gather}, \code{genome-catalogues}, 21 | #' \code{genomeset}, \code{cogs}, \code{kegg-modules}, \code{kegg-classes}, 22 | #' \code{antismash-geneclusters}, \code{annotations/go-terms}, 23 | #' \code{annotations/interpro-identifiers}, \code{annotations/kegg-modules}, 24 | #' \code{annotations/pfam-entries}, \code{annotations/kegg-orthologs}, 25 | #' \code{annotations/genome-properties}, 26 | #' \code{annotations/antismash-gene-clusters}, \code{annotations/organisms}, or 27 | #' \code{mydata}. 28 | #' 29 | #' @param accession A single character value or a vector of character values 30 | #' specifying accession IDs to return results for. 31 | #' (By default: \code{accession = NULL}) 32 | #' 33 | #' @param accession.type A single character value specifying type of accession 34 | #' IDs (\code{accession}). Must be specified when \code{accession} is specified. 35 | #' (By default: \code{accession.type = NULL}) 36 | #' 37 | #' @param as.df A single boolean value specifying whether to return the 38 | #' results as a data.frame or leave as a nested list. 
39 | #' (By default: \code{as.df = TRUE}) 40 | #' 41 | #' @param ... optional arguments fed to internal functions. 42 | #' 43 | #' @return 44 | #' \code{data.frame} or \code{list} 45 | #' 46 | #' @examples 47 | #' # Create a client object 48 | #' mg <- MgnifyClient(useCache = FALSE) 49 | #' 50 | #' # Find kegg modules for certain analysis 51 | #' df <- getData( 52 | #' mg, type = "kegg-modules", 53 | #' accession = "MGYA00642773", accession.type = "analyses") 54 | #' 55 | #' @seealso 56 | #' \code{\link[MGnifyR:getResult]{getResult}} 57 | #' 58 | #' @name getData 59 | NULL 60 | 61 | #' @rdname getData 62 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R 63 | #' @export 64 | setMethod( 65 | "getData", signature = c(x = "MgnifyClient"), function( 66 | x, type, accession.type = NULL, accession = NULL, as.df = TRUE, ...){ 67 | ############################### INPUT CHECK ################################ 68 | available_types <- c( 69 | "studies", "samples", "runs", "analyses", "biomes", "assemblies", 70 | "super-studies", "experiment-types", "pipelines", "pipeline-tools", 71 | "publications", "genomes", "genome-search", "genome-search/gather", 72 | "genome-catalogues", "genomeset", "cogs", "kegg-modules", 73 | "kegg-classes", "antismash-geneclusters", "annotations/go-terms", 74 | "annotations/interpro-identifiers", "annotations/kegg-modules", 75 | "annotations/pfam-entries", "annotations/kegg-orthologs", 76 | "annotations/genome-properties", "annotations/antismash-gene-clusters", 77 | "annotations/organisms", "mydata") 78 | if( !(.is_non_empty_string(type) && type %in% available_types) ){ 79 | stop( 80 | "'type' must be a single character value specifying ", 81 | "the type of instance to query. The value must be one of the ", 82 | "following options: ", 83 | paste0("'", paste(available_types, collapse = "', '"), "'"), 84 | call. = FALSE) 85 | } 86 | if( !(.is_non_empty_character(accession) || is.null(accession)) ){ 87 | stop( 88 | "'accession' must be a single character value or vector of ", 89 | "character values specifying the MGnify accession identifier.", 90 | call. = FALSE) 91 | } 92 | if( !(.is_non_empty_character(accession.type) || is.null(accession.type)) ){ 93 | stop( 94 | "'accession.type' must be a single character value or vector of ", 95 | "character values specifying the type of MGnify accession ", 96 | "identifier.", call. = FALSE) 97 | } 98 | if( 99 | (is.null(accession) && !is.null(accession.type)) || 100 | (is.null(accession.type) && !is.null(accession)) ){ 101 | stop( 102 | "Both 'accession' and 'accession.type' must be specified or they ", 103 | "must be NULL.", call. = FALSE) 104 | } 105 | if( !.is_a_bool(as.df) ){ 106 | stop( 107 | "'as.df' must be a single boolean value specifying whether", 108 | "to return list or data.frame.", call. = FALSE) 109 | } 110 | ############################# INPUT CHECK END ############################## 111 | # Retrieve results 112 | result <- .get_results_as_json_list(x, type, accession.type, accession, ...) 113 | # Convert to df 114 | if( as.df ){ 115 | result <- .convert_json_list_to_df(result) 116 | } else if( length(result) == 1 ){ 117 | result <- result[[1]] 118 | } 119 | return(result) 120 | }) 121 | 122 | ################################ HELP FUNCTIONS ################################ 123 | 124 | #' @importFrom plyr llply 125 | .get_results_as_json_list <- function(mg, type, accession.type, accession, ...){ 126 | # Create a path. If multiple accession IDs, path is vector of multiple 127 | # paths. 
Otherwise the path specifies only the type 128 | if( !is.null(accession.type) && !is.null(accession) ){ 129 | path <- paste0(accession.type, "/", accession, "/", type) 130 | names(path) <- accession 131 | } else{ 132 | path <- type 133 | } 134 | # Find results by loping through paths 135 | res <- llply(path, function(x){ 136 | .mgnify_retrieve_json(mg, path = x, ...) 137 | }) 138 | return(res) 139 | } 140 | 141 | #' @importFrom tidyjson spread_all 142 | #' @importFrom dplyr bind_rows 143 | .convert_json_list_to_df <- function(result){ 144 | # Create data.frames from individual search results 145 | res <- lapply(result, function(x){ 146 | if( !is.null(x) ){ 147 | x <- as.data.frame(spread_all(x)) 148 | } 149 | return(x) 150 | }) 151 | # Merge individual data.frames to one 152 | res <- bind_rows(res) 153 | # Add names if there were accession IDs provided as input 154 | if( !is.null(names(result)) ){ 155 | # Assign to "accession" column name if there is no column with that name 156 | # already 157 | col_name <- "accession" 158 | col_name <- c(colnames(res), col_name) 159 | col_name <- make.unique(col_name)[[ length(col_name) ]] 160 | # Add to result df 161 | nams <- rep( names(result), each = lengths(result)) 162 | res[[ col_name ]] <- nams 163 | } 164 | return(res) 165 | } 166 | -------------------------------------------------------------------------------- /R/searchAnalysis.R: -------------------------------------------------------------------------------- 1 | #' Look up analysis accession IDs for one or more study or sample accessions 2 | #' 3 | #' @details 4 | #' Retrieve analysis accession IDs associated with the supplied study or 5 | #' sample accession. In MGnify, an analysis accession refers to a certain 6 | #' pipeline analysis, such as specific 16S rRNA or shotgun metagenomic mapping. 7 | #' Studies can include multiple samples, and each sample can undergo multiple 8 | #' analyses using these pipelines. Each analysis is identified by a unique 9 | #' accession ID, allowing precise tracking and retrieval of analysis results 10 | #' within the MGnify database. 11 | #' 12 | #' @param x A \code{MgnifyClient} object. 13 | #' 14 | #' @param type A single character value specifying a type of 15 | #' accession IDs specified by \code{accession}. Must be "studies" or "samples". 16 | #' 17 | #' @param accession A single character value or a vector of character values 18 | #' specifying study or sample accession IDs that are used to retrieve analyses 19 | #' IDs. 20 | #' 21 | #' @param ... Optional arguments; not currently used. 22 | #' 23 | #' @return Vector of analysis accession IDs. 
24 | #' 25 | #' @examples 26 | #' # Create a client object 27 | #' mg <- MgnifyClient(useCache = FALSE) 28 | #' 29 | #' # Retrieve analysis ids from study MGYS00005058 30 | #' result <- searchAnalysis(mg, "studies", c("MGYS00005058")) 31 | #' 32 | #' \dontrun{ 33 | #' # Retrieve all analysis ids from samples 34 | #' result <- searchAnalysis( 35 | #' mg, "samples", c("SRS4392730", "SRS4392743")) 36 | #' } 37 | #' 38 | #' @name searchAnalysis 39 | NULL 40 | 41 | #' @rdname searchAnalysis 42 | #' @importFrom plyr llply 43 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R 44 | #' @export 45 | setMethod("searchAnalysis", signature = c(x = "MgnifyClient"), function( 46 | x, type, accession, ...){ 47 | ############################### INPUT CHECK ################################ 48 | if( !(length(type) == 1 && type %in% c("samples", "studies")) ){ 49 | stop( 50 | "'type' must be 'samples' or 'studies'.", call. = FALSE) 51 | } 52 | if( !(.is_non_empty_character(accession)) ){ 53 | stop( 54 | "'accession' must be a single character value or vector of ", 55 | "character values specifying the MGnify accession identifier.", 56 | call. = FALSE) 57 | } 58 | ############################# INPUT CHECK END ############################## 59 | # Get analysis accession IDs based on sample or study accessions 60 | result <- .mgnify_analyses_from_studies_and_samples( 61 | client = x, accession = accession, type = type, ...) 62 | return(result) 63 | }) 64 | 65 | ################################ HELP FUNCTIONS ################################ 66 | # Get analysis accessions based on studies or samples. The result is a vector 67 | # of analyses IDs. 68 | .mgnify_analyses_from_studies_and_samples <- function( 69 | client, accession, type, show.messages = verbose(client), ...){ 70 | # Input check 71 | if( !.is_a_bool(show.messages) ){ 72 | stop( 73 | "'show.messages' must be a single boolean value.", call. = FALSE) 74 | } 75 | show.messages <- ifelse(show.messages, "text", "none") 76 | # 77 | # Give message about progress 78 | if( show.messages == "text" ){ 79 | message("Fetching analyses...") 80 | } 81 | # Search analyses IDs 82 | analysis_ids <- .get_all_analyses_ids( 83 | client, accession, type, "analyses", show.messages = show.messages, ...) 84 | # Check which study/sample ID resulted to found analysis ID 85 | not_found <- accession[ !accession %in% names(analysis_ids) ] 86 | # If user is searching analyses based on samples, we can still try another 87 | # approach. Sometimes, those "sample" IDs refer to runs instead. 88 | if( length(not_found) > 0 && type == "samples" ){ 89 | # Finds runs based on samples 90 | temp <- .get_all_analyses_ids( 91 | client, accession, "samples", "runs", 92 | show.messages = show.messages, ...) 93 | # Create a data.frame that holds all the IDs to book keep matches 94 | # between IDs. 95 | id_df <- data.frame(sample = names(temp), run = temp) 96 | # Based on those runs, search analyses 97 | temp <- .get_all_analyses_ids( 98 | client, id_df[["run"]], "runs", "analyses", 99 | show.messages = show.messages, ...) 100 | # Add found analysis IDs to data.frame 101 | temp_df <- id_df[match(names(temp), id_df[["run"]]), ] 102 | temp_df[["analyses"]] <- temp 103 | id_df <- merge(id_df, temp_df, all = TRUE) 104 | 105 | # If there still are samples that were not found, we can try to get 106 | # analyses from assemblies. That is why we try to first fetch assemblies 107 | # based on runs. 
108 | temp <- .get_all_analyses_ids( 109 | client, id_df[is.na(id_df[["analyses"]]), "run"], "runs", 110 | "assemblies", show.messages = show.messages, ...) 111 | # Add found analysis IDs to data.frame 112 | temp_df <- id_df[match(names(temp), id_df[["run"]]), ] 113 | temp_df[["assemblies"]] <- temp 114 | id_df <- merge(id_df, temp_df, all = TRUE) 115 | # Then based on assemblies, we can finally try to find analyses. 116 | temp <- .get_all_analyses_ids( 117 | client, id_df[is.na(id_df[["analyses"]]), "assemblies"], 118 | "assemblies", "analyses", show.messages = show.messages, ...) 119 | # Add found analysis IDs to data.frame 120 | temp_df <- id_df[match(names(temp), id_df[["assemblies"]]), ] 121 | temp_df[["analyses"]] <- temp 122 | id_df <- merge(id_df, temp_df, all = TRUE) 123 | # Now we should have a table that contains all the analyses that were 124 | # possible to find. Add these analyses to the original result list. 125 | temp <- id_df[["analyses"]] 126 | names(temp) <- id_df[["sample"]] 127 | temp <- temp[ !is.na(temp) ] 128 | analysis_ids <- c(analysis_ids, temp) 129 | # Update the "not found samples" vector 130 | not_found <- accession[ !accession %in% names(analysis_ids) ] 131 | } 132 | # If the data was not found for specified ID, give warning 133 | if( length(not_found) > 0 ){ 134 | warning( 135 | "\nAnalyses not found for the following ", type, ": '", 136 | paste(not_found, collapse = "', '"), "'", call. = FALSE) 137 | } 138 | return(analysis_ids) 139 | } 140 | 141 | # This function gets IDs type "type_from" as input and tries to fetch 142 | # corresponding IDs type "type_to". 143 | # based on those studies or samples. 144 | .get_all_analyses_ids <- function( 145 | client, ids, type_from, type_to, show.messages, 146 | use.cache = useCache(client), ...){ 147 | # 148 | if( !.is_a_bool(use.cache) ){ 149 | stop( 150 | "'use.cache' must be a single boolean value", call. = FALSE) 151 | } 152 | # 153 | # Get only unique IDs 154 | ids <- unique(ids) 155 | # Loop through accessions 156 | analysis_ids <- llply(ids, function(id){ 157 | # Get URL address of results that were found. For instance, URL address 158 | # of analyses based on study ID/accession 159 | url <- .mgnify_get_x_for_y( 160 | client, id, type_from, type_to, use.cache = use.cache, 161 | ...) 162 | # Check whether results were found or not 163 | res <- NULL 164 | if( !is.null(url) ){ 165 | # Get data 166 | json <- .mgnify_retrieve_json( 167 | client, complete_url = url, use.cache = use.cache, 168 | max.hits = NULL, ...) 169 | # We need just the accession ID 170 | res <- lapply(json, function(x) x$id) |> unlist() 171 | # Add accession as name. There might be multiple analyses for each 172 | # accession. This helps to determine which analyses belong to which 173 | # study. 
174 | if( length(res) > 0 ){ 175 | names(res) <- rep(id, length(res)) 176 | } 177 | } 178 | return(res) 179 | }, .progress = show.messages) 180 | # Create a vector from results 181 | analysis_ids <- analysis_ids |> unlist() 182 | return(analysis_ids) 183 | } 184 | -------------------------------------------------------------------------------- /vignettes/MGnifyR.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "MGnifyR" 3 | date: "`r Sys.Date()`" 4 | package: MGnifyR 5 | output: 6 | BiocStyle::html_document: 7 | fig_height: 7 8 | fig_width: 10 9 | toc: yes 10 | toc_depth: 2 11 | number_sections: true 12 | vignette: > 13 | %\VignetteIndexEntry{MGnifyR} 14 | %\VignetteEngine{knitr::rmarkdown} 15 | %\VignetteEncoding{UTF-8} 16 | bibliography: references.bib 17 | --- 18 | 19 | ```{r, include = FALSE} 20 | library(knitr) 21 | knitr::opts_chunk$set( 22 | collapse = TRUE, 23 | comment = "#>", 24 | cache = TRUE 25 | ) 26 | 27 | # Get already loaded results 28 | path <- system.file("extdata", "vignette_MGnifyR.rds", package = "MGnifyR") 29 | vignette_MGnifyR <- readRDS(path) 30 | ``` 31 | 32 | # Introduction 33 | 34 | `MGnifyR` is a package designed to ease access to the EBI's 35 | [MGnify](https://www.ebi.ac.uk/metagenomics) resource, allowing searching and 36 | retrieval of multiple datasets for downstream analysis. 37 | 38 | The latest version of MGnifyR seamlessly integrates with the 39 | [miaverse framework](https://microbiome.github.io/), providing access to 40 | cutting-edge tools for downstream microbiome analytics. 41 | 42 | # Installation 43 | 44 | `MGnifyR` is hosted on Bioconductor, and can be installed via 45 | `BiocManager`. 46 | 47 | ```{r install, eval=FALSE} 48 | BiocManager::install("MGnifyR") 49 | ``` 50 | 51 | # Load `MGnifyR` package 52 | 53 | Once installed, `MGnifyR` is made available in the usual way. 54 | 55 | ```{r load_package} 56 | library(MGnifyR) 57 | ``` 58 | 59 | # Create a client 60 | 61 | All functions in `MGnifyR` make use of a `MgnifyClient` object to keep track 62 | of the JSONAPI url, disk cache location and user access tokens. Thus the first 63 | thing to do when starting any analysis is to instantiate this object. The 64 | following snippet creates one. 65 | 66 | ```{r create_client, message = FALSE} 67 | mg <- MgnifyClient(useCache = TRUE) 68 | mg 69 | ``` 70 | 71 | The `MgnifyClient` object contains slots for each of the previously mentioned 72 | settings. 73 | 74 | # Functions for fetching the data 75 | 76 | ## Search data 77 | 78 | The `doQuery()` function can be used to search the MGnify database for records 79 | such as samples and studies. Below, we fetch information on drinking water 80 | samples. 81 | 82 | ```{r search_studies1, eval=FALSE} 83 | # Fetch samples 84 | samples <- doQuery( 85 | mg, 86 | type = "samples", 87 | biome_name = "root:Environmental:Aquatic:Freshwater:Drinking water", 88 | max.hits = 10) 89 | ``` 90 | 91 | ```{r search_studies2, eval=TRUE, include=FALSE} 92 | samples <- vignette_MGnifyR[["samples"]] 93 | ``` 94 | 95 | The result is a table containing accession IDs and descriptions -- in this 96 | case, of samples. 97 | 98 | ```{r search_studies3} 99 | colnames(samples) |> head() 100 | ``` 101 | 102 | ## Find relevant **analyses** accessions 103 | 104 | Now we want to find analysis accessions. Each sample might have multiple 105 | analyses. Each analysis ID corresponds to a single run of a particular pipeline 106 | on a single sample in a single study. 
107 | 108 | ```{r convert_to_analyses1, eval=FALSE} 109 | analyses_accessions <- searchAnalysis(mg, "samples", samples$accession) 110 | ``` 111 | 112 | ```{r convert_to_analyses2, eval=TRUE, include=FALSE} 113 | analyses_accessions <- vignette_MGnifyR[["analyses_accessions"]] 114 | ``` 115 | 116 | By running the `searchAnalysis()` function, we get a vector of analysis IDs for 117 | the samples that we supplied as input. 118 | 119 | ```{r convert_to_analyses3} 120 | analyses_accessions |> head() 121 | ``` 122 | 123 | 124 | ## Fetch metadata 125 | 126 | We can now check the metadata to get an idea of what kind of data we have. We 127 | use the `getMetadata()` function to fetch data based on analysis IDs. 128 | 129 | ```{r get_metadata1, eval=FALSE} 130 | analyses_metadata <- getMetadata(mg, analyses_accessions) 131 | ``` 132 | 133 | ```{r get_metadata2, eval=TRUE, include=FALSE} 134 | analyses_metadata <- vignette_MGnifyR[["analyses_metadata"]] 135 | ``` 136 | 137 | The returned value is a `data.frame` that includes metadata, for example, on 138 | how the analysis was conducted and what kind of samples were analyzed. 139 | 140 | ```{r get_metadata3} 141 | colnames(analyses_metadata) |> head() 142 | ``` 143 | 144 | ## Fetch microbiome data 145 | 146 | After we have selected the data to fetch, we can use `getResult()` to retrieve it. 147 | 148 | The output is `r BiocStyle::Biocpkg("TreeSummarizedExperiment")` (`TreeSE`) or 149 | `r BiocStyle::Biocpkg("MultiAssayExperiment")` (`MAE`) depending on the dataset. 150 | If the dataset includes only taxonomic profiling data, the output is a single 151 | `TreeSE`. If the dataset also includes functional data, the output is multiple 152 | `TreeSE` objects linked together in an `MAE`. 153 | 154 | ```{r get_mae1, eval=FALSE} 155 | mae <- getResult(mg, accession = analyses_accessions) 156 | ``` 157 | 158 | ```{r get_mae2, eval=TRUE, include=FALSE} 159 | mae <- vignette_MGnifyR[["mae"]] 160 | ``` 161 | 162 | ```{r get_mae3} 163 | mae 164 | ``` 165 | 166 | You can access an individual `TreeSE` object in the `MAE` by specifying its 167 | index or name. 168 | 169 | ```{r mae_access} 170 | mae[[1]] 171 | ``` 172 | 173 | The `TreeSE` object is uniquely positioned to support `SummarizedExperiment`-based 174 | microbiome data manipulation and visualization. Moreover, it enables access 175 | to `miaverse` tools. For example, we can estimate the diversity of samples... 176 | 177 | ```{r calculate_diversity, fig.width=9} 178 | library(mia) 179 | 180 | mae[[1]] <- estimateDiversity(mae[[1]], index = "shannon") 181 | 182 | library(scater) 183 | 184 | plotColData(mae[[1]], "shannon", x = "sample_environment..biome.") 185 | ``` 186 | 187 | ... and plot the abundances of the most abundant phyla. 188 | 189 | ```{r plot_abundance} 190 | # Agglomerate data 191 | altExps(mae[[1]]) <- splitByRanks(mae[[1]]) 192 | 193 | library(miaViz) 194 | 195 | # Plot top taxa 196 | top_taxa <- getTopFeatures(altExp(mae[[1]], "Phylum"), 10) 197 | plotAbundance( 198 | altExp(mae[[1]], "Phylum")[top_taxa, ], 199 | rank = "Phylum", 200 | as.relative = TRUE 201 | ) 202 | ``` 203 | 204 | We can also perform other analyses, such as principal coordinate analysis 205 | (PCoA), on the microbial profiling data by utilizing miaverse tools. 
206 | 207 | ```{r pcoa} 208 | # Apply relative transformation 209 | mae[[1]] <- transformAssay(mae[[1]], method = "relabundance") 210 | # Perform PCoA 211 | mae[[1]] <- runMDS( 212 | mae[[1]], assay.type = "relabundance", 213 | FUN = vegan::vegdist, method = "bray") 214 | # Plot 215 | plotReducedDim( 216 | mae[[1]], "MDS", colour_by = "sample_environment..biome.") 217 | ``` 218 | 219 | ## Fetch raw files 220 | 221 | While `getResult()` can be utilized to retrieve microbial profiling data, 222 | `getData()` can be used more flexibly to retrieve any kind of data from the 223 | database. It returns data as simple data.frame or list format. 224 | 225 | ```{r fetch_data1, eval=FALSE} 226 | publications <- getData(mg, type = "publications") 227 | ``` 228 | 229 | ```{r fetch_data2, eval=TRUE, include=FALSE} 230 | publications <- vignette_MGnifyR[["publications"]] 231 | ``` 232 | 233 | ```{r fetch_data3} 234 | colnames(publications) |> head() 235 | ``` 236 | 237 | The result is a `data.frame` by default. In this case, it includes information 238 | on publications fetched from the data portal. 239 | 240 | ## Fetch sequence files 241 | 242 | Finally, we can use `searchFile()` and `getFile()` to retrieve other MGnify 243 | pipeline outputs such as merged sequence reads, assembled contigs, and details 244 | of the functional analyses. 245 | 246 | With `searchFile()`, we can search files from the database. 247 | 248 | ```{r get_download_urls1, eval=FALSE} 249 | dl_urls <- searchFile(mg, analyses_accessions, type = "analyses") 250 | ``` 251 | 252 | ```{r get_download_urls2, eval=TRUE, include=FALSE} 253 | dl_urls <- vignette_MGnifyR[["dl_urls"]] 254 | ``` 255 | 256 | The returned table contains search results related to analyses that we fed as 257 | an input. The table contains information on file and also URL address from 258 | where the file can be loaded. 259 | 260 | ```{r get_download_urls3} 261 | target_urls <- dl_urls[ 262 | dl_urls$attributes.description.label == "Predicted alpha tmRNA", ] 263 | 264 | colnames(target_urls) |> head() 265 | ``` 266 | 267 | Finally, we can download the files with `getFile()`. 268 | 269 | ```{r download_file1, eval=FALSE} 270 | # Just select a single file from the target_urls list for demonstration. 271 | file_url <- target_urls$download_url[[1]] 272 | cached_location <- getFile(mg, file_url) 273 | ``` 274 | 275 | ```{r download_file2, eval=TRUE, include=FALSE} 276 | cached_location <- vignette_MGnifyR[["cached_location"]] 277 | ``` 278 | 279 | The function returns a path where the file is stored. 280 | 281 | ```{r download_file3} 282 | # Where are the files? 283 | cached_location 284 | ``` 285 | 286 | ```{r session_info} 287 | sessionInfo() 288 | ``` 289 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Artistic License 2.0 2 | 3 | Copyright (c) Ben Allen, Leo Lahti, Kévin Gatel, 2020-2022. 4 | 5 | Everyone is permitted to copy and distribute verbatim copies of this 6 | license document, but changing it is not allowed. 7 | 8 | Preamble 9 | ******** 10 | 11 | This license establishes the terms under which a given free software 12 | Package may be copied, modified, distributed, and/or redistributed. The 13 | intent is that the Copyright Holder maintains some artistic control over 14 | the development of that Package while still keeping the Package 15 | available as open source and free software. 
16 | 17 | You are always permitted to make arrangements wholly outside of this 18 | license directly with the Copyright Holder of a given Package. If the 19 | terms of this license do not permit the full use that you propose to 20 | make of the Package, you should contact the Copyright Holder and seek a 21 | different licensing arrangement. 22 | 23 | Definitions 24 | *********** 25 | 26 | "Copyright Holder" means the individual(s) or organization(s) named in 27 | the copyright notice for the entire Package. 28 | 29 | "Contributor" means any party that has contributed code or other 30 | material to the Package, in accordance with the Copyright Holder's 31 | procedures. 32 | 33 | "You" and "your" means any person who would like to copy, distribute, or 34 | modify the Package. 35 | 36 | "Package" means the collection of files distributed by the Copyright 37 | Holder, and derivatives of that collection and/or of those files. A 38 | given Package may consist of either the Standard Version, or a Modified 39 | Version. 40 | 41 | "Distribute" means providing a copy of the Package or making it 42 | accessible to anyone else, or in the case of a company or organization, 43 | to others outside of your company or organization. 44 | 45 | "Distributor Fee" means any fee that you charge for Distributing this 46 | Package or providing support for this Package to another party. It does 47 | not mean licensing fees. 48 | 49 | "Standard Version" refers to the Package if it has not been modified, or 50 | has been modified only in ways explicitly requested by the Copyright 51 | Holder. 52 | 53 | "Modified Version" means the Package, if it has been changed, and such 54 | changes were not explicitly requested by the Copyright Holder. 55 | 56 | "Original License" means this Artistic License as Distributed with the 57 | Standard Version of the Package, in its current version or as it may be 58 | modified by The Perl Foundation in the future. 59 | 60 | "Source" form means the source code, documentation source, and 61 | configuration files for the Package. 62 | 63 | "Compiled" form means the compiled bytecode, object code, binary, or any 64 | other form resulting from mechanical transformation or translation of 65 | the Source form. 66 | 67 | Permission for Use and Modification Without Distribution 68 | ******************************************************** 69 | 70 | (1) You are permitted to use the Standard Version and create and use 71 | Modified Versions for any purpose without restriction, provided that you 72 | do not Distribute the Modified Version. 73 | 74 | Permissions for Redistribution of the Standard Version 75 | ****************************************************** 76 | 77 | (2) You may Distribute verbatim copies of the Source form of the 78 | Standard Version of this Package in any medium without restriction, 79 | either gratis or for a Distributor Fee, provided that you duplicate all 80 | of the original copyright notices and associated disclaimers. At your 81 | discretion, such verbatim copies may or may not include a Compiled form 82 | of the Package. 83 | 84 | (3) You may apply any bug fixes, portability changes, and other 85 | modifications made available from the Copyright Holder. The resulting 86 | Package will still be considered the Standard Version, and as such will 87 | be subject to the Original License. 
88 | 89 | Distribution of Modified Versions of the Package as Source 90 | ********************************************************** 91 | 92 | (4) You may Distribute your Modified Version as Source (either gratis or 93 | for a Distributor Fee, and with or without a Compiled form of the 94 | Modified Version) provided that you clearly document how it differs from 95 | the Standard Version, including, but not limited to, documenting any 96 | non-standard features, executables, or modules, and provided that you do 97 | at least ONE of the following: 98 | 99 | (a) make the Modified Version available to the Copyright Holder of the 100 | Standard Version, under the Original License, so that the Copyright 101 | Holder may include your modifications in the Standard Version. 102 | 103 | (b) ensure that installation of your Modified Version does not prevent 104 | the user installing or running the Standard Version. In addition, the 105 | Modified Version must bear a name that is different from the name of the 106 | Standard Version. 107 | 108 | (c) allow anyone who receives a copy of the Modified Version to make the 109 | Source form of the Modified Version available to others under 110 | 111 | (i) the Original License or 112 | 113 | (ii) a license that permits the licensee to freely copy, modify and 114 | redistribute the Modified Version using the same licensing terms that 115 | apply to the copy that the licensee received, and requires that the 116 | Source form of the Modified Version, and of any works derived from it, 117 | be made freely available in that license fees are prohibited but 118 | Distributor Fees are allowed. 119 | 120 | Distribution of Compiled Forms of the Standard Version or Modified 121 | ****************************************************************** 122 | Versions without the Source 123 | *************************** 124 | 125 | (5) You may Distribute Compiled forms of the Standard Version without 126 | the Source, provided that you include complete instructions on how to 127 | get the Source of the Standard Version. Such instructions must be valid 128 | at the time of your distribution. If these instructions, at any time 129 | while you are carrying out such distribution, become invalid, you must 130 | provide new instructions on demand or cease further distribution. If 131 | you provide valid instructions or cease distribution within thirty days 132 | after you become aware that the instructions are invalid, then you do 133 | not forfeit any of your rights under this license. 134 | 135 | (6) You may Distribute a Modified Version in Compiled form without the 136 | Source, provided that you comply with Section 4 with respect to the 137 | Source of the Modified Version. 138 | 139 | Aggregating or Linking the Package 140 | ********************************** 141 | 142 | (7) You may aggregate the Package (either the Standard Version or 143 | Modified Version) with other packages and Distribute the resulting 144 | aggregation provided that you do not charge a licensing fee for the 145 | Package. Distributor Fees are permitted, and licensing fees for other 146 | components in the aggregation are permitted. The terms of this license 147 | apply to the use and Distribution of the Standard or Modified Versions 148 | as included in the aggregation. 
149 | 150 | (8) You are permitted to link Modified and Standard Versions with other 151 | works, to embed the Package in a larger work of your own, or to build 152 | stand-alone binary or bytecode versions of applications that include the 153 | Package, and Distribute the result without restriction, provided the 154 | result does not expose a direct interface to the Package. 155 | 156 | Items That are Not Considered Part of a Modified Version 157 | ******************************************************** 158 | 159 | (9) Works (including, but not limited to, modules and scripts) that 160 | merely extend or make use of the Package, do not, by themselves, cause 161 | the Package to be a Modified Version. In addition, such works are not 162 | considered parts of the Package itself, and are not subject to the terms 163 | of this license. 164 | 165 | General Provisions 166 | ****************** 167 | 168 | (10) Any use, modification, and distribution of the Standard or Modified 169 | Versions is governed by this Artistic License. By using, modifying or 170 | distributing the Package, you accept this license. Do not use, modify, 171 | or distribute the Package, if you do not accept this license. 172 | 173 | (11) If your Modified Version has been derived from a Modified Version 174 | made by someone other than you, you are nevertheless required to ensure 175 | that your Modified Version complies with the requirements of this 176 | license. 177 | 178 | (12) This license does not grant you the right to use any trademark, 179 | service mark, tradename, or logo of the Copyright Holder. 180 | 181 | (13) This license includes the non-exclusive, worldwide, free-of-charge 182 | patent license to make, have made, use, offer to sell, sell, import and 183 | otherwise transfer the Package with respect to any patent claims 184 | licensable by the Copyright Holder that are necessarily infringed by the 185 | Package. If you institute patent litigation (including a cross-claim or 186 | counterclaim) against any party alleging that the Package constitutes 187 | direct or contributory patent infringement, then this Artistic License 188 | to you shall terminate on the date that such litigation is filed. 189 | 190 | (14) Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT 191 | HOLDER AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED 192 | WARRANTIES. THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 193 | PARTICULAR PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT 194 | PERMITTED BY YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT 195 | HOLDER OR CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, 196 | INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE 197 | OF THE PACKAGE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
198 | -------------------------------------------------------------------------------- /vignettes/MGnify_course.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Metagenomics bioinformatics at MGnify" 3 | date: "`r Sys.Date()`" 4 | package: MGnifyR 5 | output: 6 | BiocStyle::html_document: 7 | fig_height: 7 8 | fig_width: 10 9 | toc: yes 10 | toc_depth: 2 11 | number_sections: true 12 | vignette: > 13 | %\VignetteIndexEntry{MGnifyR, extended vignette} 14 | %\VignetteEngine{knitr::rmarkdown} 15 | %\VignetteEncoding{UTF-8} 16 | bibliography: references.bib 17 | --- 18 | 19 | ```{r setup, include=FALSE} 20 | knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, eval = FALSE) 21 | ``` 22 | 23 | ## Introduction 24 | 25 | In this notebook we aim to demonstrate how the `MGnifyR` tool can be used to 26 | fetch data from the MGnify microbiome data resource. We then showcase how to 27 | analyze the data using advanced microbiome data science tools, including 28 | estimating alpha and beta diversity, as well as performing differential abundance analysis. 29 | 30 | [`MGnifyR`](https://www.bioconductor.org/packages/release/bioc/html/MGnifyR.html) 31 | is an R/Bioconductor package that provides a set of tools for easily accessing 32 | and processing MGnify data in R, making queries to MGnify databases through the 33 | [MGnify API](https://www.ebi.ac.uk/metagenomics/api/v1/). 34 | 35 | The benefit of `MGnifyR` is that it streamlines data access, allowing users to 36 | fetch data either in its "raw" format or directly as a 37 | [`TreeSummarizedExperiment` (`TreeSE`)](https://microbiome.github.io/OMA/docs/devel/pages/containers.html) 38 | object. This enables seamless integration with custom workflows for analysis. 39 | 40 | Utilizing `TreeSE` provides access to a wide range of tools within 41 | Bioconductor's `SummarizedExperiment` (`SE`) ecosystem. It also integrates 42 | with the 43 | [`mia` package](https://microbiome.github.io/mia/), which offers 44 | microbiome-specific methods within the `SE` framework. 45 | 46 | For more information 47 | on microbiome data science in Bioconductor, refer to the 48 | [Orchestrating Microbiome Analysis (OMA) online book](https://microbiome.github.io/OMA/docs/devel/). 49 | 50 | ## Load packages 51 | 52 | ```{r install} 53 | # List of packages that we need 54 | packages <- c("ANCOMBC", "MGnifyR", "mia", "miaViz", "scater") 55 | 56 | # Get packages that are already installed 57 | packages_already_installed <- packages[ packages %in% installed.packages() ] 58 | # Get packages that need to be installed 59 | packages_need_to_install <- setdiff( packages, packages_already_installed ) 60 | # Loads BiocManager into the session. Install it if it is not already installed. 61 | if( !require("BiocManager") ){ 62 | install.packages("BiocManager") 63 | library("BiocManager") 64 | } 65 | # If there are packages that need to be installed, installs them with BiocManager 66 | # Updates old packages. 67 | if( length(packages_need_to_install) > 0 ) { 68 | install(packages_need_to_install, ask = FALSE) 69 | } 70 | 71 | # Load all packages into session. 
Stop if there are packages that were not 72 | # successfully loaded 73 | pkgs_not_loaded <- !sapply(packages, require, character.only = TRUE) 74 | pkgs_not_loaded <- names(pkgs_not_loaded)[ pkgs_not_loaded ] 75 | if( length(pkgs_not_loaded) > 0 ){ 76 | stop( 77 | "Error in loading the following packages into the session: '", 78 | paste0(pkgs_not_loaded, collapse = "', '"), "'") 79 | } 80 | ``` 81 | 82 | ## Data import 83 | 84 | To interact with the MGnify database, we need to create a `MgnifyClient` object. 85 | This object allows us to store options for data fetching. For instance, we can 86 | configure it to use a cache for improved efficiency. 87 | 88 | ```{r create_mgnify_obj} 89 | #| output: false 90 | 91 | # Create the MgnifyClient object with caching enabled 92 | mg <- MgnifyClient( 93 | useCache = TRUE, 94 | cacheDir = "/home/training" # Set this to your desired cache directory 95 | ) 96 | ``` 97 | 98 | In this workflow, we will fetch taxonomy annotations and metadata from 99 | the study 100 | ["MGYS00005154"](https://www.ebi.ac.uk/metagenomics/studies/MGYS00005154). 101 | The dataset focuses on the human gut microbiome, analyzed 102 | across different geographic regions. 103 | 104 | We can now search for all analyses associated with a certain study. 105 | An analysis refers to a metagenomic run performed on a sample. Each 106 | sample can have multiple runs, which is why we work with analyses and not 107 | with samples; an analysis identifier points to a single entity. 108 | 109 | ```{r search_analysis} 110 | #| output: false 111 | 112 | study_id <- "MGYS00005154" 113 | analysis_id <- searchAnalysis(mg, "studies", study_id) 114 | ``` 115 | 116 | Now we are ready to load the metadata on the analyses to get an idea of what 117 | kind of data we are dealing with. 118 | 119 | There are currently (17 Sep 2024) almost 1,000 analyses available. Downloading 120 | the whole dataset will take some time, which is why we use the cache. 121 | 122 | ```{r load_meta} 123 | metadata <- getMetadata(mg, accession = analysis_id) 124 | ``` 125 | 126 | We can see that there are analyses that were performed with different pipelines. 127 | Let's take only those analyses that were generated with pipeline version 5.0. 128 | 129 | ```{r subset_meta} 130 | metadata <- metadata[metadata[["analysis_pipeline-version"]] == "5.0", ] 131 | ``` 132 | 133 | We now have analyses that each point to a unique sample. The final step is 134 | to fetch abundance tables in `TreeSummarizedExperiment` (`TreeSE`) format. 135 | 136 | ```{r import_treese} 137 | tse <- getResult( 138 | mg, 139 | accession = metadata[["analysis_accession"]], 140 | get.func = FALSE 141 | ) 142 | tse 143 | ``` 144 | 145 | The fetched data is a `TreeSE` object, including taxonomy annotations. See the 146 | [OMA online book](https://microbiome.github.io/OMA/docs/devel/pages/containers.html) 147 | for how to handle the data in this format. 148 | 149 | ## Preprocessing 150 | 151 | Below, we agglomerate the data to the Order level, meaning we summarize the 152 | abundances at this specific taxonomic rank. The OMA provides a detailed 153 | [chapter](https://microbiome.github.io/OMA/docs/devel/pages/agglomeration.html) 154 | explaining agglomeration in more depth. 155 | 156 | ```{r agg} 157 | tse_order <- agglomerateByRank(tse, rank = "Order") 158 | ``` 159 | 160 | Because of the unique properties of microbiome data, we have to apply 161 | transformations. Here, we apply a relative transformation. 
You can find 162 | more information on transformations in 163 | [OMA](https://microbiome.github.io/OMA/docs/devel/pages/transformation.html). 164 | 165 | ```{r preprocess} 166 | # Transform the main TreeSE 167 | tse <- transformAssay(tse, method = "relabundance") 168 | # Transform the agglomerated TreeSE 169 | tse_order <- transformAssay(tse_order, method = "relabundance") 170 | ``` 171 | 172 | ## Alpha diversity 173 | 174 | Alpha diversity measures community diversity within a sample. Learn more about 175 | community diversity 176 | [here](https://microbiome.github.io/OMA/docs/devel/pages/alpha_diversity.html). 177 | 178 | ```{r alpha} 179 | # Calculate alpha diversity 180 | tse <- addAlpha(tse) 181 | 182 | # Create a plot 183 | p <- plotColData( 184 | tse, 185 | y = "shannon_diversity", 186 | x = "sample_geographic.location..country.and.or.sea.region.", 187 | show_boxplot = TRUE 188 | ) 189 | p 190 | ``` 191 | 192 | We can test whether the diversity differences are statistically significant. 193 | We use the Mann-Whitney U test (also known as the Wilcoxon rank-sum test). 194 | 195 | ```{r} 196 | pairwise.wilcox.test( 197 | tse[["shannon_diversity"]], 198 | tse[["sample_geographic.location..country.and.or.sea.region."]], 199 | p.adjust.method = "fdr" 200 | ) 201 | ``` 202 | 203 | To add p-values to the plot, see 204 | [OMA](https://microbiome.github.io/OMA/docs/devel/pages/alpha_diversity.html#visualizing-significance-in-group-wise-comparisons). 205 | 206 | ## Beta diversity 207 | 208 | We can assess the differences in microbial compositions between samples, aiming 209 | to identify patterns in the data that are associated with covariates. 210 | 211 | To achieve this, we perform Principal Coordinate Analysis (PCoA) using 212 | Bray-Curtis dissimilarity. 213 | 214 | ```{r pcoa} 215 | # Perform PCoA 216 | tse <- runMDS( 217 | tse, 218 | FUN = getDissimilarity, 219 | method = "bray", 220 | assay.type = "relabundance" 221 | ) 222 | # Visualize PCoA 223 | p <- plotReducedDim( 224 | tse, 225 | dimred = "MDS", 226 | colour_by = "sample_geographic.location..country.and.or.sea.region." 227 | ) 228 | p 229 | ``` 230 | 231 | See [community similarity chapter](https://microbiome.github.io/OMA/docs/devel/pages/beta_diversity.html) 232 | from OMA for more information. 233 | 234 | ## Differential abundance analysis (DAA) 235 | 236 | In DAA, we analyze whether the abundances of certain features vary between study 237 | groups. Again, OMA has a dedicated chapter on this 238 | [topic](https://microbiome.github.io/OMA/docs/devel/pages/differential_abundance.html). 239 | 240 | ```{r daa1} 241 | # Perform DAA 242 | res <- ancombc2( 243 | data = tse_order, 244 | assay.type = "counts", 245 | fix_formula = "sample_geographic.location..country.and.or.sea.region.", 246 | p_adj_method = "fdr" 247 | ) 248 | ``` 249 | 250 | Next, we visualize the features that have the lowest p-values. 
251 | 252 | ```{r daa2} 253 | # Get the most significant features 254 | n_top <- 4 255 | res_tab <- res[["res"]] 256 | res_tab <- res_tab[order(res_tab[["q_(Intercept)"]]), ] 257 | top_feat <- res_tab[seq_len(n_top), "taxon"] 258 | 259 | # Create a plot 260 | p <- plotExpression( 261 | tse_order, 262 | features = top_feat, 263 | assay.type = "relabundance", 264 | x = "sample_geographic.location..country.and.or.sea.region.", 265 | show_boxplot = TRUE, show_violin = FALSE, point_shape = NA 266 | ) + 267 | scale_y_log10() 268 | p 269 | ``` 270 | 271 | ## Session info 272 | 273 | ```{r session_info} 274 | sessionInfo() 275 | ``` 276 | -------------------------------------------------------------------------------- /R/doQuery.R: -------------------------------------------------------------------------------- 1 | #' Search MGnify database for studies, samples, runs, analyses, biomes, 2 | #' assemblies, and genomes. 3 | #' 4 | #' @details 5 | #' \code{doQuery} is a flexible query function, harnessing the "full" 6 | #' power of the JSONAPI MGnify search filters. Search results may be filtered 7 | #' by metadata value, associated study/sample/analysis etc. 8 | #' 9 | #' See \href{https://www.ebi.ac.uk/metagenomics/api/v1/}{API browser} for 10 | #' information on MGnify database filters. 11 | #' You can find help on customizing queries 12 | #' \href{https://emg-docs.readthedocs.io/en/latest/api.html#customising-queries}{here}. 13 | #' 14 | #' For example, the following filters are available: 15 | #' \itemize{ 16 | #' \item{\strong{studies}: accession, biome_name, lineage, centre_name, 17 | #' include} 18 | #' \item{\strong{samples}: accession, experiment_type, biome_name, 19 | #' lineage, geo_loc_name, latitude_gte, latitude_lte, 20 | #' longitude_gte, longitude_lte, species, instrument_model, 21 | #' instrument_platform, metadata_key, metadata_value_gte, 22 | #' metadata_value_lte, metadata_value, environment_material, 23 | #' environment_feature, study_accession, include} 24 | #' \item{\strong{runs}: accession, experiment_type, biome_name, lineage, 25 | #' species, instrument_platform, instrument_model, metadata_key, 26 | #' metadata_value_gte, metadata_value_lte, metadata_value, sample_accession, 27 | #' study_accession, include} 28 | #' \item{\strong{analyses}: biome_name, lineage, experiment_type, species, 29 | #' sample_accession, pipeline_version} 30 | #' \item{\strong{biomes}: depth_gte, depth_lte} 31 | #' \item{\strong{assemblies}: depth_gte, depth_lte} 32 | #' } 33 | #' Unfortunately, it appears that in some cases some of these filters don't work 34 | #' as expected, so it is important to check that the returned results match up with 35 | #' what's expected. Even more unfortunately, if there's an error in the parameter 36 | #' specification, the query will run as if no filter parameters were present 37 | #' at all. Thus the result will appear superficially correct but will in fact 38 | #' correspond to something completely different. This behaviour will hopefully 39 | #' be fixed in future incarnations of MGnifyR or the JSONAPI, but for now users 40 | #' should double-check returned values. 41 | #' 42 | #' It is currently not possible to combine queries of the same type in a single 43 | #' call (for example, to search for samples \emph{between} latitudes). However, 44 | #' it is possible to run multiple queries and combine the results using set 45 | #' operations in R to get the desired behaviour. 46 | #' 47 | #' @param x A \code{MgnifyClient} object. 
48 | #' 49 | #' @param type A single character value specifying the type of objects to 50 | #' query. Must be one of the following options: \code{studies}, \code{samples}, 51 | #' \code{runs}, \code{analyses}, \code{biomes}, \code{assemblies}, 52 | #' \code{super-studies}, \code{experiment-types}, \code{pipelines}, 53 | #' \code{pipeline-tools}, \code{publications}, \code{genomes}, 54 | #' \code{genome-search}, \code{genome-search/gather}, \code{genome-catalogues}, 55 | #' \code{genomeset}, \code{cogs}, \code{kegg-modules}, \code{kegg-classes}, 56 | #' \code{antismash-geneclusters}, \code{annotations/go-terms}, 57 | #' \code{annotations/interpro-identifiers}, \code{annotations/kegg-modules}, 58 | #' \code{annotations/pfam-entries}, \code{annotations/kegg-orthologs}, 59 | #' \code{annotations/genome-properties}, 60 | #' \code{annotations/antismash-gene-clusters}, \code{annotations/organisms}, or 61 | #' \code{mydata}. 62 | #' (By default: \code{type = "studies"}) 63 | #' 64 | #' @param accession A single character value or a vector of character values 65 | #' specifying MGnify accession identifiers (of type \code{type}) or NULL. When 66 | #' NULL, all results defined by other parameters are retrieved. 67 | #' (By default: \code{accession = NULL}) 68 | #' 69 | #' @param as.df A single boolean value specifying whether to return the 70 | #' results as a data.frame or leave as a nested list. In most cases, 71 | #' \code{as.df = TRUE} will make the most sense. 72 | #' (By default: \code{as.df = TRUE}) 73 | #' 74 | #' @param max.hits A single integer value specifying the maximum number of 75 | #' results to return or FALSE. The actual number of results will actually be 76 | #' higher than \code{max.hits}, as clipping only occurs on pagination page 77 | #' boundaries. To disable the limit, set \code{max.hits = NULL}. 78 | #' (By default: \code{max.hits = 200}) 79 | #' 80 | #' @param ... Remaining parameter key/value pairs may be supplied to filter 81 | #' the returned values. Available options differ between \code{types}. 82 | #' See discussion Details section for details. 83 | #' 84 | #' @return A nested list or data.frame containing the results of the query. 
85 | #' 86 | #' @examples 87 | #' mg <- MgnifyClient(useCache = FALSE) 88 | #' 89 | #' # Get a list of studies from the Agricultural Wastewater : 90 | #' agwaste_studies <- doQuery( 91 | #' mg, "studies", biome_name="Agricultural wastewater" 92 | #' ) 93 | #' 94 | #' \dontrun{ 95 | #' # Get all samples from a particular study 96 | #' samps <- doQuery(mg, "samples", accession="MGYS00004521") 97 | #' 98 | #' # Search polar samples 99 | #' samps_np <- doQuery(mg, "samples", latitude_gte=66, max.hits=10) 100 | #' samps_sp <- doQuery(mg, "samples", latitude_lte=-66, max.hits=10) 101 | #' 102 | #' # Search studies that have studied drinking water 103 | #' tbl <- doQuery( 104 | #' mg, 105 | #' type = "studies", 106 | #' biome_name = "root:Environmental:Aquatic:Freshwater:Drinking water", 107 | #' max.hits = 10) 108 | #' } 109 | #' 110 | #' @name doQuery 111 | NULL 112 | 113 | #' @rdname doQuery 114 | #' @importFrom dplyr bind_rows 115 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R 116 | #' @export 117 | setMethod("doQuery", signature = c(x = "MgnifyClient"), function( 118 | x, type = "studies", accession = NULL, as.df = TRUE, max.hits = 200, 119 | ...){ 120 | ############################### INPUT CHECK ################################ 121 | available_types <- c( 122 | "studies", "samples", "runs", "analyses", "biomes", "assemblies", 123 | "super-studies", "experiment-types", "pipelines", "pipeline-tools", 124 | "publications", "genomes", "genome-search", "genome-search/gather", 125 | "genome-catalogues", "genomeset", "cogs", "kegg-modules", 126 | "kegg-classes", "antismash-geneclusters", "annotations/go-terms", 127 | "annotations/interpro-identifiers", "annotations/kegg-modules", 128 | "annotations/pfam-entries", "annotations/kegg-orthologs", 129 | "annotations/genome-properties", "annotations/antismash-gene-clusters", 130 | "annotations/organisms", "mydata") 131 | if( !(.is_non_empty_string(type) && type %in% available_types) ){ 132 | stop( 133 | "'type' must be a single character value specifying ", 134 | "the type of instance to query. The value must be one of the ", 135 | "following options: ", 136 | paste0("'", paste(available_types, collapse = "', '"), "'"), 137 | call. = FALSE) 138 | } 139 | if( !(.is_non_empty_character(accession) || is.null(accession)) ){ 140 | stop( 141 | "'accession' must be a single character value or vector of ", 142 | "character values specifying the MGnify accession identifier ", 143 | "or NULL.", 144 | call. = FALSE) 145 | } 146 | if( !.is_a_bool(as.df) ){ 147 | stop( 148 | "'as.df' must be a single boolean value specifying whether", 149 | "to return list or data.frame.", call. = FALSE) 150 | } 151 | if( !((.is_an_integer(max.hits) && (max.hits > 0 || max.hits == -1) ) || 152 | is.null(max.hits) ) ){ 153 | stop( 154 | "'max.hits' must be a single integer value specifying the ", 155 | "maximum number of results to return or NULL.", call. = FALSE) 156 | } 157 | ############################# INPUT CHECK END ############################## 158 | # Perform query 159 | result <- .perform_query( 160 | client = x, type = type, accession = accession, max.hits = max.hits, 161 | ...) 
162 | # Convert list to data.frame if specified 163 | if( as.df && length(result) > 0 ){ 164 | # Turn lists to dfs 165 | result <- lapply(result, .list_to_dataframe) 166 | # Combine dfs 167 | result <- bind_rows(result) 168 | } 169 | return(result) 170 | }) 171 | 172 | ################################ HELP FUNCTIONS ################################ 173 | 174 | .perform_query <- function( 175 | client, type, accession, max.hits, use.cache = useCache(client), 176 | show.messages = verbose(client), ...){ 177 | # Input check 178 | if( !.is_a_bool(use.cache) ){ 179 | stop( 180 | "'use.cache' must be a single boolean value.", call. = FALSE) 181 | } 182 | # 183 | # Get parameters that are passed to do the query from database 184 | query_params <- list(...) 185 | query_params[["accession"]] <- accession 186 | # Get results from the database 187 | result <- .mgnify_retrieve_json( 188 | client = client, 189 | path = type, 190 | max.hits = max.hits, 191 | use.cache = use.cache, 192 | qopts = query_params 193 | ) 194 | # Rename entries by accession 195 | id_list <- lapply(result, function(res) res$id) 196 | if( !is.null(result) ){ 197 | names(result) <- id_list 198 | } 199 | return(result) 200 | } 201 | 202 | .list_to_dataframe <- function(result){ 203 | # Get attributes 204 | df <- .mgnify_attr_list_to_df_row( 205 | json = result, metadata_key = "sample-metadata") 206 | 207 | # Loop through relationships, i.e., this data might be related to specific 208 | # samples, analyses... --> get that info 209 | relationships <- result[["relationships"]] 210 | for( i in seq_len(length(relationships)) ){ 211 | # Get specific relationship, e.g., this data vs related runs 212 | relationship_type <- names(result$relationships)[[i]] 213 | relationship <- result$relationships[[i]] 214 | # Get only data (temp is list of lists and only data element contains 215 | # relevant info) 216 | rel_data <- relationship[["data"]] 217 | # If there is data, include it 218 | if( !is.null(rel_data) && length(rel_data) > 0 ){ 219 | # Take all "id" values. Some data can also include list of 220 | # lists. --> unlist and take "id" values. Based on this ID (such 221 | # as "runs" ID) user can fetch specific relationship 222 | rel_data <- unlist(rel_data) 223 | rel_data <- rel_data[names(rel_data) %in% "id"] 224 | temp_names <- rep(relationship_type, length(rel_data)) 225 | # Get all column names and make them unique 226 | colnames <- append(colnames(df), temp_names) 227 | colnames <- make.unique(colnames) 228 | # Get only column values that are being added 229 | temp_names <- colnames[ 230 | (length(colnames)-length(temp_names)+1):length(colnames)] 231 | # Add new data to dataset 232 | df[temp_names] <- rel_data 233 | } 234 | } 235 | # Add type of data that is being queried and accession code 236 | df[["type"]] <- result[["type"]] 237 | rownames(df) <- df[["accession"]] 238 | return(df) 239 | } 240 | -------------------------------------------------------------------------------- /R/getFile.R: -------------------------------------------------------------------------------- 1 | #' Download any MGnify files, also including processed reads and 2 | #' identified protein sequences 3 | #' 4 | #' @details 5 | #' \code{getFile} is a convenient wrapper round generic the URL 6 | #' downloading functionality in R, taking care of things like local 7 | #' caching and authentication. 8 | #' 9 | #' @param x A \code{MgnifyClient} object. 
10 | #' 11 | #' @param url A single character value specifying the url address of the file 12 | #' we wish to download. 13 | #' 14 | #' @param file A single character value or NULL specifying an 15 | #' optional local filename to use for saving the file. If \code{NULL}, 16 | #' MGNify local cache settings will be used. If the file is intended to be 17 | #' processed in a separate program, it may be sensible to provide a 18 | #' meaningful \code{file}, rather than having to hunt through the 19 | #' cache folders. If \code{file} is \code{NULL} and \code{useCache(client)} 20 | #' is \code{FALSE}, the \code{read.func} parameter must be supplied or the 21 | #' file will be downloaded and then deleted. 22 | #' (By default: \code{file = NULL}) 23 | #' 24 | #' @param read.func A function specifying an optional function to process the 25 | #' downloaded file and return the results, rather than relying on post 26 | #' processing. The primary use-case for this parameter is when local disk 27 | #' space is limited and downloaded files can be quickly processed and 28 | #' discarded. The function should take a single parameter, the downloaded 29 | #' filename, and may return any valid R object. 30 | #' (By default: \code{read.func = NULL}) 31 | #' 32 | #' @param ... Additional arguments; not used currently. 33 | #' 34 | #' @return For \code{getFile()}, either the local filename of the downloaded 35 | #' file, be it either the location in the MGNifyR cache or file. If 36 | #' \code{read.func} is used, its result will be returned. 37 | #' 38 | #' @examples 39 | #' # Make a client object 40 | #' mg <- MgnifyClient(useCache = FALSE) 41 | #' 42 | #' # Create a vector of accession ids - these happen to be \code{analysis} 43 | #' # accessions 44 | #' accession_vect <- c("MGYA00563876", "MGYA00563877") 45 | #' downloads <- searchFile(mg, accession_vect, "analyses") 46 | #' 47 | #' # Filter to find the urls of 16S encoding sequences 48 | #' url_list <- downloads[ 49 | #' downloads$attributes.description.label == "Contigs encoding SSU rRNA", 50 | #' "download_url"] 51 | #' 52 | #' # Example 1: 53 | #' # Download the first file 54 | #' supplied_filename <- getFile( 55 | #' mg, url_list[[1]], file="SSU_file.fasta.gz") 56 | #' 57 | #' \dontrun{ 58 | #' # Example 2: 59 | #' # Just use local caching 60 | #' cached_filename <- getFile(mg, url_list[[2]]) 61 | #' 62 | #' # Example 3: 63 | #' # Using read.func to open the reads with readDNAStringSet from 64 | #' # \code{biostrings}. Without retaining on disk 65 | #' dna_seqs <- getFile( 66 | #' mg, url_list[[3]], read.func = readDNAStringSet) 67 | #' } 68 | #' 69 | #' @name getFile 70 | NULL 71 | 72 | #' @rdname getFile 73 | #' @importFrom httr add_headers content write_disk 74 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R 75 | #' @export 76 | setMethod("getFile", signature = c(x = "MgnifyClient"), function( 77 | x, url, file = NULL, read.func = NULL, ...){ 78 | ############################### INPUT CHECK ################################ 79 | if( !.is_non_empty_string(url) ){ 80 | stop( 81 | "'url' must be a single character value specifying ", 82 | "the url of the file.", call. = FALSE) 83 | } 84 | if( !(.is_non_empty_string(file) || is.null(file)) ){ 85 | stop( 86 | "'file' must be NULL or a single character value ", 87 | "specifying the name of file being saved.", call. = FALSE) 88 | } 89 | if( !(is.function(read.func) || is.null(read.func)) ){ 90 | stop( 91 | "'read.func' must be a function that is used to process the file ", 92 | "or NULL.", call. 
= FALSE) 93 | } 94 | ############################# INPUT CHECK END ############################## 95 | # Get file 96 | result <- .mgnify_download( 97 | client = x, url = url, file = file, 98 | read.func = read.func, ...) 99 | return(result) 100 | }) 101 | 102 | #' Listing files available for download 103 | #' 104 | #' @details 105 | #' The \code{searchFile()} function is a wrapper allowing easy 106 | #' enumeration of downloads available for the given accession IDs. 107 | #' It returns a single data.frame containing all available downloads and associated 108 | #' metadata, including the URL location and description. This can then be 109 | #' filtered to extract the URLs of interest, before actually 110 | #' retrieving the files using \code{getFile()}. 111 | #' 112 | #' @param accession A single character value or a vector of character values 113 | #' specifying accession IDs to return results for. 114 | #' 115 | #' @param type A single character value specifying the type of objects to 116 | #' query. Must be one of the following options: \code{studies}, \code{samples}, 117 | #' \code{analyses}, \code{assemblies}, \code{genomes} or \code{run}. 118 | #' (By default: \code{type = "studies"}) 119 | #' 120 | #' @return For \code{searchFile()}, a \code{data.frame} containing all discovered 121 | #' downloads. If multiple accessions are queried, the \code{accession} 122 | #' column may be used to filter the results, since rownames are not set (and 123 | #' would not make sense, as each query can return multiple items). 124 | #' 125 | #' @examples 126 | #' # Make a client object 127 | #' mg <- MgnifyClient(useCache = TRUE) 128 | #' # Create a vector of accession ids - these happen to be \code{analysis} 129 | #' # accessions 130 | #' accession_vect <- c( 131 | #' "MGYA00563876", "MGYA00563877", "MGYA00563878", 132 | #' "MGYA00563879", "MGYA00563880" ) 133 | #' downloads <- searchFile(mg, accession_vect, "analyses") 134 | #' 135 | #' @name getFile 136 | NULL 137 | 138 | #' @rdname getFile 139 | #' @importFrom plyr llply rbind.fill 140 | #' @importFrom urltools parameters parameters<- 141 | #' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R 142 | #' @export 143 | setMethod("searchFile", signature = c(x = "MgnifyClient"), function( 144 | x, accession, type = c( 145 | "studies", "samples", "analyses", "assemblies", "genomes", "run"), 146 | ... 147 | ){ 148 | ############################### INPUT CHECK ################################ 149 | if( !.is_non_empty_character(accession) ){ 150 | stop( 151 | "'accession' must be a vector of character values specifying ", 152 | "the MGnify accession identifiers.", 153 | call. = FALSE) 154 | } 155 | if( !(.is_non_empty_string(type)) ){ 156 | stop( 157 | "'type' must be a single character value specifying ", 158 | "the type of instance to query.", call. = FALSE) 159 | } 160 | type <- match.arg(type, several.ok = FALSE) 161 | ############################# INPUT CHECK END ############################## 162 | # Get file urls 163 | result <- .mgnify_get_download_urls( 164 | client = x, accession = accession, type = type, ...) 
165 | return(result) 166 | }) 167 | 168 | ################################ HELP FUNCTIONS ################################ 169 | 170 | # Download the specified files from the database 171 | .mgnify_download <- function( 172 | client, url, file, read.func, use.cache = useCache(client), 173 | url.address = databaseUrl(client), cache.dir = cacheDir(client), 174 | show.warnings = showWarnings(client), clear.cache = clearCache(client), 175 | auth.tok = authTok(client), ...){ 176 | # Input check 177 | if( !.is_non_empty_string(url.address) ){ 178 | stop( 179 | "'url.address' must be a string.", call. = FALSE) 180 | } 181 | if( !.is_a_bool(use.cache) ){ 182 | stop( 183 | "'use.cache' must be a single boolean value.", call. = FALSE) 184 | } 185 | if( !.is_non_empty_string(cache.dir) ){ 186 | stop( 187 | "'cache.dir' must be a string.", call. = FALSE) 188 | } 189 | if( !.is_a_bool(show.warnings) ){ 190 | stop( 191 | "'show.warnings' must be a single boolean value.", call. = FALSE) 192 | } 193 | if( !.is_a_bool(clear.cache) ){ 194 | stop( 195 | "'clear.cache' must be a single boolean value.", call. = FALSE) 196 | } 197 | if( !(.is_non_empty_string(auth.tok) || is.null(auth.tok)) ){ 198 | stop( 199 | "'auth.tok' must be a string or NULL.", call. = FALSE) 200 | } 201 | # 202 | # Set up filenames for storing the data 203 | if ( !is.null(file) ){ 204 | file_path <- file 205 | }else if(use.cache){ 206 | # Build a filename out of the url, including the full paths. Annoying, 207 | # but some downloads (e.g. genome results) are just names like 208 | # core_genes.fa , which would break the caching. 209 | cachetgt <- gsub(paste(url.address, "/", sep = ""), "", url) 210 | 211 | # Make sure the directory exists 212 | cache_full_name <- file.path(cache.dir, cachetgt) 213 | dir.create( 214 | dirname(cache_full_name), recursive = TRUE, 215 | showWarnings = show.warnings) 216 | file_path <- cache_full_name 217 | } else{ 218 | file_path <- tempfile()[[1]] 219 | } 220 | 221 | # Clear cache if specified 222 | if( use.cache && clear.cache && file.exists(file_path) ){ 223 | message("clear_cache is TRUE: deleting ", file_path) 224 | unlink(file_path) 225 | } 226 | 227 | # Only get the data if it's not already on disk 228 | if( !file.exists(file_path) || (use.cache && file.exists(file_path)) ){ 229 | # Add authentication details to query options 230 | if( !is.null(auth.tok) ){ 231 | add_headers(.headers = c( 232 | Authorization = paste("Bearer", auth.tok, sep = " "))) 233 | } 234 | # If there's an error we need to make sure the cache file isn't written 235 | # - by default it seems it is. 236 | res <- GET(url, write_disk(file_path, overwrite = TRUE)) 237 | # If the file was not successfully downloaded 238 | if( res$status_code != 200 ){ 239 | # Remove the downloaded file 240 | unlink(file_path) 241 | stop( 242 | url, ": ", content(res, ...)$errors[[1]]$detail, 243 | " Could not load the file from database.", 244 | call. = FALSE) 245 | } 246 | } 247 | # Whether to use user-specified read function 248 | if( is.null(read.func) ){ 249 | result <- file_path 250 | } else{ 251 | result <- read.func(file_path) 252 | } 253 | return(result) 254 | } 255 | 256 | # Get URL addresses of downloadable files that are related to certain accession 257 | # ID. 258 | .mgnify_get_download_urls <- function( 259 | client, accession, type, use.cache = useCache(client), 260 | show.messages = verbose(client), ...){ 261 | # Input check 262 | if( !.is_a_bool(use.cache) ){ 263 | stop( 264 | "'use.cache' must be a single boolean value.", call. 
= FALSE) 265 | } 266 | if( !.is_a_bool(show.messages) ){ 267 | stop( 268 | "'show.messages' must be a single boolean value.", call. = FALSE) 269 | } 270 | show.messages <- ifelse(show.messages, "text", "none") 271 | # 272 | # Give message about progress 273 | if( show.messages == "text" ){ 274 | message("Searching files...") 275 | } 276 | # L 277 | # Loop though accession IDs and find the info 278 | results <- llply(accession, function(x){ 279 | # Get the data as nested json list 280 | download_list <- .mgnify_retrieve_json( 281 | client, paste(type, x, "downloads", sep = "/"), 282 | use.cache = use.cache, ...) 283 | # Convert to df 284 | df <- do.call(rbind.fill, lapply(download_list, function(x){ 285 | as.data.frame(x, stringsAsFactors = FALSE)} 286 | )) 287 | # Add info to df 288 | df$accession <- x 289 | df$type <- type 290 | # If no match, df is a list --> convert to data.frame 291 | if( !is.data.frame(df) ){ 292 | df <- as.data.frame(df) 293 | } else { 294 | # If search result was found, modify 295 | # For convenience, rename the "self" column to "download_url" - 296 | # which is what it actually is... 297 | colnames(df)[colnames(df) == "self"] <- "download_url" 298 | # Finally, strip off any options from the url - they sometimes seem 299 | # to get format=json stuck on the end 300 | urls <- df$download_url 301 | parameters(urls) <- NULL 302 | df$download_url <- urls 303 | } 304 | return(df) 305 | }, .progress = show.messages) 306 | # Combine results of multiple accessions IDs 307 | results <- do.call(rbind.fill, results) 308 | return(results) 309 | } 310 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # integration with other packages 3 | 4 | .require_package <- function(pkg){ 5 | if(!requireNamespace(pkg, quietly = TRUE)){ 6 | stop( 7 | "'", pkg,"' package not found. Please install the '", pkg, 8 | "' package to use this function.", call. = FALSE) 9 | } 10 | } 11 | 12 | ################################### TESTING ################################### 13 | # Methods for testing 14 | 15 | .is_a_bool <- function(x){ 16 | is.logical(x) && length(x) == 1L && !is.na(x) 17 | } 18 | 19 | .is_non_empty_character <- function(x){ 20 | is.character(x) && all(nzchar(x)) 21 | } 22 | 23 | .is_non_empty_string <- function(x){ 24 | is.character(x) && length(x) == 1L 25 | } 26 | 27 | .is_an_integer <- function(x){ 28 | is.numeric(x) && length(x) == 1L && x%%1==0 29 | } 30 | 31 | ################################ HELP FUNCTIONS ################################ 32 | # Help functions that are utilized by multiple methods 33 | 34 | ########################## .mgnify_attr_list_to_df_row ######################### 35 | # Not exporting this - if people want to they can use the 36 | # rjsonapi functionality. Internally, it takes the "attributes" list 37 | # and converts it into a single row data.frame. For some entries, there is a 38 | # sublist of key/value pairs. metadata_key allows these to be included as 39 | # columns in the result. 
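# A minimal illustrative sketch of the expected input/output (the field names
# and values below are hypothetical, not taken from a real API response):
#   json <- list(
#       id = "SRS0000001", type = "samples",
#       attributes = list(
#           "sample-name" = "example sample",
#           "sample-metadata" = list(
#               list(key = "temperature", value = "12"),
#               list(key = "pH", value = "7.2"))))
#   .mgnify_attr_list_to_df_row(json, metadata_key = "sample-metadata")
#   # -> a one-row data.frame with columns "sample-name", "temperature" and
#   #    "pH", plus "accession" ("SRS0000001") and "acc_type" ("samples")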
40 | .mgnify_attr_list_to_df_row <- function (json, metadata_key = NULL){ 41 | # Get what kind of metadata the data includes 42 | attrlist <- names(json$attributes) 43 | # If the type of metadata is specified 44 | if (!is.null(metadata_key)){ 45 | # Get metadata related to specific key 46 | metaattrlist <- json$attributes[[metadata_key]] 47 | metlist <- lapply(metaattrlist, function(x) x$value) 48 | metlist <- unlist(metlist) 49 | names_metlist <- lapply(metaattrlist, function(x) x$key) 50 | names(metlist) <- unlist(names_metlist) 51 | # Get metadata without the key 52 | baseattrlist <- attrlist[!(attrlist %in% c(metadata_key))] 53 | # Combine metadata 54 | df <- as.data.frame(t(unlist(c( 55 | json$attributes[baseattrlist], metlist))), 56 | stringsAsFactors = FALSE) 57 | }else{ 58 | # Get all the metadata without key extraction 59 | df <- as.data.frame(t(unlist(json["attributes"])), 60 | stringsAsFactors = FALSE) 61 | } 62 | # Add accession code and type of data 63 | df$accession <- json$id 64 | df$acc_type <- json$type 65 | # Add accession code also to rownames 66 | rownames(df) <- df$accession 67 | return(df) 68 | } 69 | 70 | ############################## .mgnify_get_x_for_y ############################# 71 | # Helper function for getting relative paths in the API 72 | # Not everything is implemented here - just what we 73 | # need to get to the download or run areas 74 | # Given an accession x, we want to get the link to get the url for the 75 | # corresponding typeY JSONAPI path for child elements 76 | # 77 | # .mgnify_get_x_for_y determines the location of typeY child objects of x 78 | # (typeX) 79 | # 80 | # This helper function, principally intended to be used internally, 81 | # is used to match up related objects within the path. The inherently 82 | # unhierarchical nature of the MGnify API makes it a bit inconsistent. This 83 | # function acts as a quick way to determine how to get from one type to another, 84 | # without having to special case within the code. 85 | # 86 | # Parameters: 87 | # client MGnifyR client API object 88 | # x Accession ID \code{char} of parent object 89 | # typeX Type of accession \code{x} 90 | # typeY Type of child object to return 91 | # use.cache Whether to use on-disk cache 92 | # 93 | # Return: 94 | # char complete url to access the result. Note this query is not run from here - 95 | # just the URL is returned 96 | # 97 | # Examples: 98 | # cl <- new("MgnifyClient") 99 | # .mgnify_get_x_for_y(cl, "MGYS00005126", "studies", "samples") 100 | .mgnify_get_x_for_y <- function( 101 | client, x, typeX, typeY, use.cache, ...){ 102 | # Fetch the data on samples/analyses as a json list 103 | res <- .mgnify_retrieve_json( 104 | client, 105 | paste(typeX, x, sep = "/"), 106 | use.cache = use.cache, 107 | ...) 108 | # Get related analyses when samples were found and vice versa if result was 109 | # found. 110 | if( !is.null(res) ){ 111 | res <- res[[1]]$relationships[[typeY]]$links$related 112 | } 113 | return(res) 114 | } 115 | 116 | ############################## .mgnify_get_x_for_y ############################# 117 | # Internal function to actually perform the http request. Build up the URL then 118 | # issues a GET, parsing the returned JSON into a nested list (uses jsonlite 119 | # internally?) Previously cached results may be retrieved from disk without 120 | # resorting to calling the MGnify server. 
121 | 122 | # Low level MGnify API handler 123 | # 124 | # .mgnify_retrieve_json deals with handles the actual HTTP GET calls for the 125 | # MGnifyR package, handling API pagination, local result caching, and 126 | # authentication cookies for access to restricted or pre-release datasets. 127 | # Although principally intended for internal MGnifyR use, it's exported for 128 | # direct invocation. Generally though it's not recommended for use by users. 129 | # 130 | # Parameters: 131 | # client MGnifyR client 132 | # path top level search point for the query. One of biomes, samples, runs etc. 133 | # Basically includes all parts of the URL between the base API url and the 134 | # parameter specifications 135 | # complete_url complete url to search, usually retrieved from previous query in 136 | # the "related" section. 137 | # qopts named list or vector containing options/filters to be URL encoded and 138 | # appended to query as key/value pairs 139 | # max.hits Maximum number of data entries to return. The actual number of hits 140 | # returned may be higher than this value, as this parameter only clamps after 141 | # each full page is processed. Set to <=0 to disable - i.e. retrieve all items. 142 | # use.cache Should successful queries be cached on disk locally? There are 143 | # unresolved questions about whether this is a sensible thing to do, but it 144 | # remains as an option. It probably makes sense for single accession grabs, 145 | # but not for (filtered) queries - which are liable to change as new data is 146 | # added to MGnify. Also caching only works for the first page. 147 | # Debug Should we print out lots of information while doing the grabbing? 148 | # timeout How long should be waited server to respond? 149 | # 150 | # Return: 151 | # list of results after pagination is dealt with. 152 | 153 | #' @importFrom urltools parameters parameters<- 154 | #' @importFrom httr add_headers 155 | #' @importFrom httr GET 156 | #' @importFrom httr config 157 | #' @importFrom httr content 158 | #' @importFrom httr timeout 159 | .mgnify_retrieve_json <- function( 160 | client, path = "biomes", complete_url = NULL, qopts = NULL, 161 | max.hits = 200, timeout = 5*60, Debug = FALSE, 162 | use.cache = useCache(client), show.warnings = showWarnings(client), 163 | clear.cache = clearCache(client), url.address = databaseUrl(client), 164 | auth.tok = authTok(client), cache.dir = cacheDir(client), ...){ 165 | # Input check 166 | if( !.is_an_integer(timeout) ){ 167 | stop( 168 | "'timeout' must be a single integer value.", call. = FALSE) 169 | } 170 | if( !.is_a_bool(Debug) ){ 171 | stop( 172 | "'Debug' must be a single boolean value.", call. = FALSE) 173 | } 174 | if( !.is_a_bool(use.cache) ){ 175 | stop( 176 | "'use.cache' must be a single boolean value specifying whether ", 177 | "to use on-disk caching.", call. = FALSE) 178 | } 179 | if( !.is_a_bool(show.warnings) ){ 180 | stop( 181 | "'show.warnings' must be a single boolean value.", call. = FALSE) 182 | } 183 | if( !.is_a_bool(clear.cache) ){ 184 | stop( 185 | "'clear.cache' must be a single boolean value.", call. = FALSE) 186 | } 187 | if( !.is_non_empty_string(url.address) ){ 188 | stop( 189 | "'url.address' must be a string.", call. = FALSE) 190 | } 191 | if( !(.is_non_empty_string(auth.tok) || is.null(auth.tok)) ){ 192 | stop( 193 | "'auth.tok' must be a string or NULL.", call. = FALSE) 194 | } 195 | # 196 | if( !.is_non_empty_string(cache.dir) ){ 197 | stop( 198 | "'cache.dir' must be a string.", call. 
= FALSE) 199 | } 200 | # 201 | # Warning message if data is not found 202 | warning_msg <- paste0(path, ": No data found.") 203 | # warnings(client) turns on debugging too: 204 | if( show.warnings ){ 205 | Debug <- TRUE 206 | } 207 | # Set up the base url 208 | # Are we using internal paths? 209 | if( is.null(complete_url) ){ 210 | fullurl <- paste(url.address, path, sep = "/") 211 | } else{ 212 | # Or direct links from e.g. a "related" section 213 | # Set the full url, but clean off any existing parameters 214 | # (page, format etc) as they'll be added back later: 215 | fullurl <- complete_url 216 | parameters(fullurl) <- NULL 217 | path <- substr(fullurl, nchar(url.address) + 2, nchar(fullurl)) 218 | } 219 | # Spaces are not allowed in url address. Convert spaces to %20. 220 | fullurl <- gsub(" ", "%20", fullurl) 221 | 222 | # Convert to csv if filters are lists. 223 | # This doesn't check if they can be searched for in the API, 224 | # which is an issue since no error is returned by the JSON if the search 225 | # is invalid - we only get a result as if no query was present... 226 | tmpqopts <- lapply(qopts, function(x) paste(x, collapse = ",")) 227 | 228 | # Include the json and page position options 229 | # full_qopts <- as.list(c(format="json", tmpqopts, page=1)) 230 | full_qopts <- as.list(c(format="json", tmpqopts)) 231 | 232 | # Build up the cache name anyway - even if it's not ultimately used: 233 | fname_list <- c(path, names(unlist(full_qopts)), unlist(full_qopts)) 234 | cache_fname <- paste(fname_list, collapse = "_") 235 | # Because query options are collapsed to file name , they might include 236 | # colons that are not supported in file names. Replace them with 237 | # underscores. 238 | cache_fname <- gsub(":", "_", cache_fname) 239 | cache_full_fname <- file.path(cache.dir, paste0(cache_fname, ".RDS")) 240 | 241 | # Quick check to see if we should clear the disk cache for this 242 | # specific call - used for debugging and when MGnify breaks 243 | if( use.cache && clear.cache ){ 244 | if( file.exists(cache_full_fname) ){ 245 | message("clearCache is TRUE: deleting ", cache_full_fname) 246 | unlink(cache_full_fname) 247 | } 248 | } 249 | 250 | # Do we want to try and use a cache to speed things up? 251 | if( use.cache && file.exists(cache_full_fname) ){ 252 | final_data <- readRDS(cache_full_fname) 253 | } else{ 254 | # Authorization: Bearer 255 | if( !is.null(auth.tok) ){ 256 | add_headers( 257 | .headers = c(Authorization = paste( 258 | "Bearer", authTok(client), sep = " "))) 259 | } 260 | res <- GET( 261 | url = fullurl, config(verbose = Debug), query = full_qopts, 262 | timeout(timeout)) 263 | # Get the data 264 | data <- content(res, ...) 265 | 266 | # Check if the search was successful and data can be found 267 | not_found <- (res$status_code != 200) || ( 268 | is.null(data$data) || length(data$data) == 0) 269 | # If data is found 270 | if( !not_found ){ 271 | # Fetch all the data 272 | final_data <- .retrieve_json_data( 273 | client, data, fullurl, full_qopts, max.hits, Debug 274 | ) 275 | } else{ 276 | final_data <- NULL 277 | if( res$status_code != 200 ){ 278 | warning_msg <- paste0( 279 | path, " (", res$status_code, " error): ", 280 | data$errors[[1]]$detail) 281 | } 282 | } 283 | # Save the result to file if specified 284 | if( use.cache && !file.exists(cache_full_fname) ){ 285 | # Make sure the directory is created... 
286 | dir.create(
287 | dirname(cache_full_fname), recursive = TRUE,
288 | showWarnings = show.warnings)
289 | saveRDS(final_data, file = cache_full_fname)
290 | }
291 | }
292 | # Give warning if data is not found.
293 | if( is.null(final_data) ){
294 | warning("\n", warning_msg, call. = FALSE)
295 | }
296 | return(final_data)
297 | }
298 |
299 | # This retrieves all the data related to an accession. For example, it loops
300 | # over multiple pages.
301 | .retrieve_json_data <- function(
302 | client, data, fullurl, full_qopts, max.hits, Debug,
303 | auth.tok = authTok(client), ...){
304 | # Input check
305 | if( !(.is_non_empty_string(auth.tok) || is.null(auth.tok)) ){
306 | stop(
307 | "'auth.tok' must be a string or NULL.", call. = FALSE)
308 | }
309 | #
310 | # At this point, data$data is either a list of lists or a single named
311 | # list. If it's a single entry, it needs embedding in a list for
312 | # consistency downstream. datlist is built up as a list of pages, where
313 | # each entry must be another list. Thus, on the first page, a single entry
314 | # is wrapped in a list of length one.
315 | datlist <- list()
316 | if( !is.null(names(data$data)) ){
317 | # Create something to store the returned data
318 | datlist[[1]] <- list(data$data)
319 | }else{
320 | datlist[[1]] <- data$data
321 | }
322 | # Check to see if there's pagination required
323 | if( "meta" %in% names(data) ){
324 | # Yes, paginate
325 | pstart <- as.numeric(data$meta$pagination$page)
326 | pend <- as.numeric(data$meta$pagination$pages)
327 | # We've already got the first one
328 | if( pend > 1 ){
329 | # Loop over pages and save their result to list
330 | for( p in seq(pstart+1,pend) ){
331 | full_qopts$page <- p
332 | if( !is.null(auth.tok) ){
333 | add_headers(
334 | .headers = c(
335 | Authorization = paste(
336 | "Bearer", auth.tok, sep = " ")))
337 | }
338 | curd <- content(
339 | GET(fullurl, config(verbose = Debug), query = full_qopts ),
340 | ...)
341 | datlist[[p]] <- curd$data
342 | # Check to see if we've pulled enough entries.
343 | # With NULL and -1, disable max.hits
344 | curlen <- sum(unlist(lapply(datlist, length)))
345 | if( !is.null(max.hits) && curlen >= max.hits &&
346 | max.hits != -1 ){
347 | break
348 | }
349 | }
350 | }
351 | }
352 | # Combine results from different pages
353 | final_data <- unlist(datlist, recursive = FALSE)
354 | return(final_data)
355 | }
356 |
357 | # Internal functions to parse the attributes/hierarchy list into a data.frame
358 | .mgnify_parse_tax <- function(json){
359 | df <- as.data.frame(
360 | c(json$attributes["count"], unlist(json$attributes$hierarchy)),
361 | stringsAsFactors = FALSE)
362 | df$index_id <- json$attributes$lineage
363 | df
364 |
365 | }
366 | .mgnify_parse_func <- function(json){
367 | df <- as.data.frame(json$attributes, stringsAsFactors = FALSE)
368 | df$index_id <- json$attributes$accession
369 | df
370 | }
371 |
--------------------------------------------------------------------------------
/vignettes/MGnifyR_long.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "MGnifyR, extended vignette"
3 | date: "`r Sys.Date()`"
4 | package: MGnifyR
5 | output:
6 | BiocStyle::html_document:
7 | fig_height: 7
8 | fig_width: 10
9 | toc: yes
10 | toc_depth: 2
11 | number_sections: true
12 | vignette: >
13 | %\VignetteIndexEntry{MGnifyR, extended vignette}
14 | %\VignetteEngine{knitr::rmarkdown}
15 | %\VignetteEncoding{UTF-8}
16 | bibliography: references.bib
17 | ---
18 |
19 | ```{r include = FALSE}
20 | library(knitr)
21 | knitr::opts_chunk$set(
22 | collapse = TRUE,
23 | comment = "#>",
24 | eval = FALSE,
25 | cache = TRUE
26 | )
27 | ```
28 |
29 | [MGnifyR homepage](http://github.com/EBI-Metagenomics/MGnifyR)
30 |
31 | # Introduction
32 |
33 | `MGnifyR` is a package designed to ease access to the EBI's
34 | [MGnify](https://www.ebi.ac.uk/metagenomics) resource, allowing searching and
35 | retrieval of multiple datasets for downstream analysis. While MGnify pipelines
36 | are undoubtedly useful, as currently implemented they produce results on a
37 | strictly per-sample basis. While some whole-study results are available,
38 | comparisons across studies are difficult. The `MGnifyR` package is designed to
39 | facilitate cross-study analyses by handling all the per-sample data retrieval
40 | and merging details internally, leaving the user free to perform the analysis
41 | as they see fit.
42 |
43 | The latest version of MGnifyR seamlessly integrates with the
44 | [miaverse framework](https://microbiome.github.io/), providing access to
45 | tools for downstream microbiome analysis. This integration
46 | enables users to leverage optimized and standardized methods for analyzing
47 | the microbiome. Additionally, users can benefit from a comprehensive tutorial
48 | book that offers valuable guidance and support.
49 |
50 | # Installation
51 |
52 | `MGnifyR` is hosted on GitHub and can be installed from there via `devtools`;
53 | the Bioconductor release version should be installed using the following snippet.
54 |
55 | ```{r install, eval=FALSE}
56 | BiocManager::install("MGnifyR")
57 | ```
58 |
59 | # Load `MGnifyR` package
60 |
61 | Once installed, `MGnifyR` is made available in the usual way.
62 |
63 | ```{r load_package}
64 | library(MGnifyR)
65 | ```
66 |
67 | # Create a client
68 |
69 | All functions in `MGnifyR` make use of a `MgnifyClient` object to keep track
70 | of the JSONAPI url, disk cache location and user access tokens. Thus the first
71 | thing to do when starting any analysis is to instantiate this object. The
72 | following snippet creates this.
73 |
74 | ```{r create_client}
75 | mg <- MgnifyClient()
76 | mg
77 | ```
78 |
79 | It's recommended that local caching is enabled with `useCache = TRUE`. Queries
80 | to the MGnify API can be quite slow, particularly when retrieving multipage
81 | results for many analyses (such as many `Interpro` results). Using a local
82 | disk cache can significantly speed up subsequent work, bypassing the need to
83 | re-query the API. Use of the cache should be entirely transparent, as the
84 | caching occurs at the raw data level. The cache can persist across `MGnifyR`
85 | sessions, and can even be used for multiple sessions simultaneously - provided
86 | that different sets of accessions are queried at once.
87 |
88 | Optionally, a username and password may be specified during client creation,
89 | causing `MGnifyR` to attempt retrieval of an authentication token from the API.
90 | Doing so gives access to non-public results, such as those currently under an
91 | author-imposed embargo period.
92 |
93 | ```{r create_client_passwd, eval=FALSE}
94 | mg <- MgnifyClient(
95 | username = "Webin-username", password = "your-password", useCache = TRUE)
96 | ```
97 |
98 | # Functions for fetching the data
99 |
100 | ## Search data
101 |
102 | `MGnifyR` gives users access to the complete range of search functionality
103 | implemented in the MGnify JSON API. A single function `doQuery()` is used to
104 | perform this searching, allowing Studies, Samples, Runs and Accessions to be
105 | interrogated from a common interface. As with all MGnifyR functions the first
106 | argument `client` must be a valid `MgnifyClient` instance. The only remaining
107 | **required** parameter is `type`, specifying the type of data to be queried,
108 | and may be one of `studies`, `samples`, `runs`, `analyses` or `assemblies`.
109 | Other general parameters include `max.hits`.
110 |
111 | Unlike most other `MGnifyR` high level functions, caching is turned off by
112 | default for `doQuery()`. New data and analyses are being added to MGnify all the
113 | time, so enabling caching by default may lead to out-of-date search results for
114 | long-lived sessions. However, it's easy to switch back on, and may be useful in
115 | many cases. Also, given the huge and ever increasing number of datasets
116 | available in MGnify, a limit to the number of results returned may be set
117 | using `max.hits`. By default this is set to 200, which for most exploratory
118 | queries should be sufficient. It may be increased or decreased by directly
119 | specifying `max.hits`, and disabled completely (no limit) by setting
120 | `max.hits=NULL`.
121 |
122 | In most cases we will want to be more specific about the search, and will
123 | also use either an `accession` parameter, or the many filter options available
124 | through the API, and discussed below. Specifying an `accession` id, which in
125 | the case of `samples`, `runs` and `assemblies` may be a vector of ids, returns
126 | a data.frame of metadata with one row per matching accession.
127 |
128 | If `accession` is `NULL` (the default) then remaining parameters define the
129 | filters applied by the API to the search result. Details of these parameters
130 | are given in `help(doQuery)`. By way of example though, supposing we are
131 | interested in amplicon Illumina samples from the arctic, we might try the
132 | following query:
133 |
134 | ```{r search_studies}
135 | northpolar <- doQuery(
136 | mg, "samples", latitude_gte=60.0, experiment_type="amplicon",
137 | biome_name="Soil", instrument_platform = "Illumina", max.hits = 10)
138 |
139 | head(northpolar)
140 | ```
141 |
142 | Specifying an `accession` parameter will restrict results to just those matching
143 | that particular entry, be it a study, sample or run. For example, to retrieve
144 | information for study "MGYS00002891":
145 |
146 | ```{r search_studies_accession}
147 | study_samples <- doQuery(mg, "studies", accession="MGYS00002891")
148 |
149 | head(study_samples)
150 | ```
151 |
152 | ## Find relevant **analyses** accessions
153 |
154 | Having obtained a particular set of search hits, it's now time to retrieve the
155 | associated results. General automated analysis is complicated by the MGnify
156 | database design, wherein for example samples may be shared between multiple
157 | studies, or studies analysed multiple times using different versions of the
158 | pipeline. Navigating these "many-to-one" relationships can be tricky, so
159 | `MGnifyR` resorts to using `analyses` accessions as its canonical identifier.
160 | Each analysis corresponds to a single run of a particular pipeline on a single
161 | sample in a single study. The downside of this approach is that queries
162 | returning `studies`, `samples` (or anything other than `analyses`) accessions
163 | need converting to the corresponding `analyses`.
164 |
165 | `MGnifyR` therefore provides a helper function to handle this conversion -
166 | `searchAnalysis()`. Following on from our previous search, we have a
167 | list of `study` accessions, so to convert to corresponding `analyses` we use:
168 |
169 | ```{r convert_to_analyses}
170 | analyses_accessions <- searchAnalysis(
171 | mg, type="studies", accession = study_samples$accession)
172 |
173 | head(analyses_accessions)
174 | ```
175 |
176 | A useful side effect of the above call is that some attribute metadata for
177 | each sample has now been retrieved and stored in the local cache. Thus
178 | subsequent API calls for these samples (which will occur multiple times in
179 | later steps) will be significantly faster.
180 |
181 | It's important to be aware that the results of a `searchAnalysis()` command will
182 | not necessarily be a one-to-one match with the input accessions. `MGnify`
183 | analysis runs are sometimes performed multiple times, perhaps using different
184 | versions of the pipeline. Thus further filtering of the result list may be
185 | required, but is easily performed and is illustrated in the next section.
186 |
187 | ## Fetch metadata
188 |
189 | At this point we have a long list of analysis instances (with potential
190 | duplicates) corresponding to the samples previously found. We use the
191 | `getMetadata` function to download and combine all associated `sample`, `run`
192 | and `study` metadata, which we then filter as required to include only the
193 | rows we want.
194 |
195 | ```{r get_metadata}
196 | analyses_metadata <- getMetadata(mg, analyses_accessions)
197 |
198 | head(analyses_metadata)
199 | ```
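As noted above, the same sample may have been analysed more than once, for
example with different pipeline versions. If only one analysis per sample is
wanted, one possible filter is sketched below; the `sample_accession` column
name is an assumption, so check `colnames(analyses_metadata)` for the columns
actually returned before using it.

```{r deduplicate_analyses, eval=FALSE}
# Keep only the first analysis encountered for each sample.
# "sample_accession" is an assumed column name - adjust as required.
deduplicated_metadata <- analyses_metadata[
    !duplicated(analyses_metadata$sample_accession), ]
dim(deduplicated_metadata)
```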
200 |
201 | The resulting data.frame has columns with names prefixed with their source
202 | type. For example, "sample_xxx" columns correspond to metadata gleaned from
203 | querying an accession's `sample` entry. MGnify allows quite flexible
204 | specification of arbitrary metadata at submission time, in many cases leading
205 | to quite sparse `data.frame` results if accession queries are sourced from more
206 | than one study. For instance, if only one sample contains an entry for
207 | "sample_soil_PH", entries for other rows will be filled with `NA`. `MGnifyR`
208 | does not automatically clean these missing values - instead opting to allow
209 | the user to choose the correct action. The particular study we're looking at
210 | is from the marine biome; suppose we were interested in only those samples or
211 | analyses for which the sampling depth was known. The following snippet filters
212 | the full `data.frame` selecting only entries which contain a valid
213 | `sample_depth`. It's worth noting the `as.numeric` call to ensure the column
214 | is converted to `numeric` type before it is checked. *All* sample data from
215 | MGnifyR is initially retrieved as type `character`, and it's up to the user to
216 | make sure ostensibly numeric entries are converted properly.
217 |
218 | ```{r filter_metadata}
219 | known_depths <- analyses_metadata[
220 | !is.na(as.numeric(analyses_metadata$sample_depth)), ]
221 | # How many are left?
222 | dim(known_depths)
223 | ```
224 |
225 | ## Fetch microbiome data
226 |
227 | Having selected the analyses we wish to examine further, `getResult()` is used
228 | to both download associated OTU tables and taxonomy, and join all results
229 | into a single `r BiocStyle::Biocpkg("TreeSummarizedExperiment")` (`TreeSE`)
230 | object. `TreeSE` is becoming a de facto standard for taxonomic abundance *munging*
231 | in R. `TreeSE` objects integrate abundance, taxonomic, phylogenetic, sample and
232 | sequence data into a single object, with powerful facilities for filtering,
233 | processing and plotting the results. Compared to a
234 | `r BiocStyle::Biocpkg("phyloseq")` object, `TreeSE` is more scalable and better
235 | suited to efficient data analysis.
236 |
237 | The `miaverse` framework is developed around the `TreeSE` data container. It provides
238 | tools for analysis and visualization. Moreover, it includes a comprehensive
239 | tutorial book called [OMA](https://microbiome.github.io/OMA/).
240 |
241 | ### Amplicon sequencing
242 |
243 | When the dataset includes amplicon sequencing data, i.e., the dataset does not
244 | include function predictions, the `getResult()` method returns the dataset as a
245 | `TreeSE` by default. See the function documentation for other output types.
246 |
247 | ```{r get_treese}
248 | tse <- getResult(mg, accession = analyses_accessions, get.func = FALSE)
249 |
250 | tse
251 | ```
252 |
253 | The `TreeSE` object is uniquely positioned to support
254 | `r BiocStyle::Biocpkg("SummarizedExperiment")`-based
255 | microbiome data manipulation and visualization. Moreover, it enables access
256 | to `miaverse` tools. For example, we can estimate the diversity of samples.
257 |
258 | ```{r calculate_diversity}
259 | library(mia)
260 |
261 | tse <- estimateDiversity(tse, index = "shannon")
262 |
263 | library(scater)
264 |
265 | plotColData(tse, "shannon", x = "sample_geo.loc.name")
266 | ```
267 |
268 | ```{r plot_abundance}
269 | library(miaViz)
270 |
271 | plotAbundance(
272 | tse[!is.na( rowData(tse)[["Kingdom"]] ), ],
273 | rank = "Kingdom",
274 | as.relative = TRUE
275 | )
276 | ```
277 |
278 | If needed, `TreeSE` can be converted to a `phyloseq` object.
279 |
280 | ```{r to_phyloseq}
281 | pseq <- makePhyloseqFromTreeSE(tse)
282 | pseq
283 | ```
284 |
285 | ### Metagenomics
286 |
287 | Although the previous queries have been based on the results from `doQuery()`,
288 | from now on we will concentrate on combining and comparing results from
289 | specific studies. Since newly performed analyses are retrieved first in the
290 | `doQuery()` call, it's likely that by the time this vignette is read, the query
291 | results will be different. This is principally due to the rapid increase in
292 | MGnify submissions, leading to a potential lack of consistency between even
293 | closely spaced queries. As mentioned previously, it may be best to use
294 | `useCache=FALSE` in the `MgnifyClient` object for `doQuery()` calls, to ensure
295 | queries are actually returning the latest data.
296 |
297 | For the remainder of this vignette, however, we'll be comparing three ostensibly
298 | different studies: a study of saltmarsh soils from York University, human
299 | faecal samples from a survey of healthy Sardinians, and a set of samples from
300 | hydrothermal vents in the Mid-Cayman rise in the Caribbean Sea. To simplify
301 | things, only the first 20 samples from each study will be used. Furthermore,
302 | the intention is only to demonstrate the functionality of the MGnifyR package,
303 | rather than produce scientifically rigorous results.
304 |
305 | ```{r get_analyses}
306 | soil <- searchAnalysis(mg, "studies", "MGYS00001447")
307 | human <- searchAnalysis(mg, "studies", "MGYS00001442")
308 | marine <- searchAnalysis(mg, "studies", "MGYS00001282")
309 |
310 | # Combine analyses
311 | all_accessions <- c(soil, human, marine)
312 |
313 | head(all_accessions)
314 | ```
315 |
316 | The first step with this new accession list is, as previously, to retrieve the
317 | associated metadata using `getMetadata()`, and as seen with the
318 | `doQuery()` results, the returned `data.frame` contains a large number of
319 | columns. Being autogenerated and flexible, the column names can be a little
320 | difficult to predict, but examining `colnames(full_metadata)` should make
321 | things clearer.
322 |
323 | ```{r get_new_metadata}
324 | full_metadata <- getMetadata(mg, all_accessions)
325 |
326 | colnames(full_metadata)
327 | head(full_metadata)
328 | ```
329 |
330 | From `full_metadata` we get an idea of the type of data we're dealing with,
331 | and can extract useful information such as sequencing platform, source biome,
332 | etc. The next code snippet tallies a few of these columns to give an idea about
333 | what's available. The boxplot also indicates that while within-study read
334 | counts are similar, we probably need to use some sort of normalization
335 | procedure when comparing across samples. We might also want to drop
336 | particularly low read coverage samples from further analysis.
337 |
338 | ```{r full_metadata_explore}
339 | # Load ggplot2
340 | library(ggplot2)
341 |
342 | # Distribution of sample source material:
343 | table(full_metadata$`sample_environment-material`)
344 |
345 | # What sequencing machine(s) were used?
346 | table(full_metadata$`sample_instrument model`)
347 |
348 | # Boxplot of raw read counts:
349 | ggplot(
350 | full_metadata, aes(x=study_accession, y=log(
351 | as.numeric(`analysis_Submitted nucleotide sequences`)))) +
352 | geom_boxplot(aes(group=study_accession)) +
353 | theme_bw() +
354 | ylab("log(submitted reads)")
355 | ```
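If we did want to drop low-coverage samples at this point, a minimal sketch is
shown below. The 100,000-read threshold is arbitrary, and the filtered table is
not used in the rest of this vignette.

```{r filter_low_coverage, eval=FALSE}
# Read counts are stored as character, so convert to numeric first,
# then keep only analyses with at least 100,000 submitted reads.
read_counts <- as.numeric(
    full_metadata$`analysis_Submitted nucleotide sequences`)
high_coverage_metadata <- full_metadata[
    !is.na(read_counts) & read_counts >= 1e5, ]
dim(high_coverage_metadata)
```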
356 |
357 | Again, we can fetch the data by calling `getResult()`. `bulk.dl=TRUE` has the
358 | potential to significantly speed up data retrieval. MGnify makes its
359 | functional results available in two separate ways, either on a per-analysis
360 | basis through the web api, or at the whole-study level as large
361 | tab-separated (TSV) files, with columns representing the results for each
362 | analysis. When `bulk.dl` is `FALSE`, `MGnifyR` queries the web api to get
363 | results which (given some functional analyses results may consist of
364 | thousands of entries) may take significant time. Setting `bulk.dl` to
365 | `TRUE` causes `MGnifyR` to determine the source study associated with a
366 | particular `analysis` and to instead download and parse its corresponding
367 | results file. Since this result file contains entries for all analyses
368 | associated with the study, by taking advantage of `MGnifyR`'s local caching
369 | this single download provides results for many future analyses. In some cases
370 | this affords several orders of magnitude speedup over the api query case.
371 |
372 | Unfortunately, column entries in the per-study results files do not always
373 | directly correspond to those from a particular analysis run, causing the
374 | retrieval to fail. The principal cause of this is believed to be the running
375 | of multiple analysis jobs on the same sample. Thus for reliability, `bulk.dl`
376 | is `FALSE` by default. As a general recommendation though, you should try
377 | setting it to `TRUE` the first time `getResult()` is used on a
378 | set of accessions. If this fails, setting `bulk.dl` to `FALSE` will enable the
379 | more robust approach, allowing the analysis to continue. It might take a while
380 | though. Hopefully in the future the sample/analysis correspondence mismatches
381 | will be fixed and the default `bulk.dl` will be switched to `TRUE`.
382 |
383 | ```{r get_mae}
384 | mae <- getResult(mg, all_accessions, bulk.dl = TRUE)
385 |
386 | mae
387 | ```
388 |
389 | For metagenomic samples, the result is a
390 | `r BiocStyle::Biocpkg("MultiAssayExperiment")` (`MAE`), which
391 | links multiple `TreeSE` objects into one dataset. These `TreeSE` objects hold
392 | the taxonomic profiling data and the functional data in separate experiments,
393 | linked to each other by their sample names. You can access an individual
394 | object or experiment by specifying its index or name.
395 |
396 | ```{r mae_access}
397 | mae[[2]]
398 | ```
399 |
400 | We can perform principal coordinate analysis (PCoA) on the microbial profiling
401 | data by utilizing miaverse tools.
402 |
403 | ```{r pcoa}
404 | # Apply relative transformation
405 | mae[[1]] <- transformAssay(mae[[1]], method = "relabundance")
406 | # Perform PCoA
407 | mae[[1]] <- runMDS(
408 | mae[[1]], assay.type = "relabundance",
409 | FUN = vegan::vegdist, method = "bray")
410 | # Plot
411 | plotReducedDim(mae[[1]], "MDS", colour_by = "sample_environment.feature")
412 | ```
413 |
414 | ## Fetch raw files
415 |
416 | While `getResult()` can be utilized to retrieve microbial profiling data,
417 | `getData()` can be used more flexibly to retrieve any kind of data from the
418 | database. It returns data in a simple data.frame or list format.
419 |
420 | ```{r fetch_data}
421 | kegg <- getData(
422 | mg, type = "kegg-modules", accession = "MGYA00642773",
423 | accession.type = "analyses")
424 |
425 | head(kegg)
426 | ```
427 |
428 | ## Fetch sequence files
429 |
430 | Finally, we can use `searchFile()` and `getFile()` to retrieve other MGnify
431 | pipeline outputs such as merged sequence reads, assembled contigs, and details
432 | of the functional analyses. `searchFile()` is a simple wrapper function
433 | which, when supplied a list of accessions, finds the urls of the files we're
434 | after. In most cases we'll want to filter the returned list down to only the
435 | files of interest, which is easily done on the resulting data.frame object.
436 | In addition to the actual download location (the `download_url` column),
437 | extra columns include file type, contents and compression. It's recommended
438 | that the `colnames` of the `data.frame` be examined to get a grasp on the
439 | available metadata. To demonstrate the process, the code below retrieves
440 | a data.frame containing all available downloads for each accession we've been
441 | examining previously. It then filters this to retain only those files
442 | corresponding to the annotated amino acid sequences.
443 |
444 | ```{r get_download_urls}
445 | # Find list of available downloads
446 | dl_urls <- searchFile(
447 | mg, full_metadata$analysis_accession, type = "analyses")
448 |
449 | # Filter table
450 | target_urls <- dl_urls[
451 | dl_urls$attributes.description.label == "Predicted CDS with annotation", ]
452 |
453 | head(target_urls)
454 | ```
455 |
456 | To list the types of available files, and guide the filtering, something like
457 | the following might be useful.
458 |
459 | ```{r list_descriptions}
460 | table(dl_urls$attributes.description.label)
461 | ```
462 |
463 | Unlike other `MGnifyR` functions, `searchFile()` is not limited to
464 | `analyses`, and by specifying `type` other result types may be
465 | found. For instance, while general `genome` functionality is not yet
466 | integrated into `MGnifyR`, we can retrieve associated files for a particular
467 | `genome` accession with the following:
468 |
469 | ```{r get_genome_urls}
470 | genome_urls <- searchFile(mg, "MGYG000433953", type = "genomes")
471 |
472 | genome_urls[ , c("id", "attributes.file.format.name", "download_url")]
473 | ```
474 |
475 | Having found a set of target urls, the final step is to use
476 | `getFile()` to actually retrieve the file. Unlike other functions, this only
477 | works with a single url location at once, so each entry in `target_urls` from
478 | above must be downloaded individually - easily done by either looping or
479 | `apply`ing over the list.
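For example, a minimal sketch of retrieving every file in `target_urls` in one
go (not run here, and potentially a large download) could be:

```{r download_all_files, eval=FALSE}
# Loop over every target url; getFile() returns the local path of each
# downloaded file, so the result is a character vector of file locations.
all_files <- vapply(
    target_urls$download_url,
    function(url) getFile(mg, url),
    character(1))
```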
480 |
481 | If the files are intended to be used with external programs, it might be
482 | easiest to provide a `file` parameter to the function call, which specifies
483 | a local filename for writing the file. By default `MGnifyR` will use the local
484 | cache, which can make getting to the file afterwards more awkward. Regardless,
485 | the default behaviour of `getFile()` is to retrieve the file specified in the
486 | parameter `url`, save it to disk, and return the filepath it was saved to.
487 |
488 | ```{r get_files}
489 | # Just select a single file from the target_urls list for demonstration.
490 |
491 | # Default behavior - use local cache.
492 | cached_location1 <- getFile(mg, target_urls$download_url[[1]])
493 |
494 | # Specifying a local file name to save to
495 | cached_location2 <- getFile(
496 | mg, target_urls$download_url[[1]], file = "predicted_cds.faa.gz")
497 |
498 | cached_location <- c(cached_location1, cached_location2)
499 |
500 | # Where are the files?
501 | cached_location
502 | ```
503 |
504 | A second download option is available, which allows built-in parsing of the
505 | file. If we know ahead of time what processing will be performed, it may be
506 | possible to integrate it into a function and pass this function to
507 | `getFile()` as the `read.func` argument. The function in question should
508 | take a single argument (the complete path name of the locally downloaded file)
509 | and the result of the call will be returned in place of the usual output
510 | file name.
511 |
512 | Alternatively the files could first be downloaded in the standard way, and
513 | then processed using this same function in a loop. Therefore in many cases
514 | the `read.func` parameter is redundant. However, many of the outputs from
515 | MGnify can be quite large, meaning local storage of many files may become an
516 | issue. By providing a `read.func` parameter (and setting `useCache=FALSE` in
517 | the `MgnifyClient` object), analysis of a large number of datasets
518 | may be possible with minimal storage requirements.
519 |
520 | To illustrate, suppose we were interested in retrieving all detected sequences
521 | matching a particular PFAM motif in a set of analyses. The simple function
522 | below uses the `Biostrings` package to read an amino acid fasta file, searches
523 | for a matching PFAM tag in the sequence name, and then tallies up the unique
524 | sequences into a single data.frame row. In this case the PFAM motif identifies
525 | sequences coding for the amoC gene, found in both ammonia and methane
526 | oxidizing organisms, but any other filtering method could be used.
527 |
528 | ```{r simple_parse_function}
529 | library(Biostrings)
530 |
531 | # Simple function to count the unique sequences matching the PFAM amoC/mmoC motif
532 | getAmoCseqs <- function(fname){
533 | sequences <- readAAStringSet(fname)
534 | tgtvec <- grepl("PF04896", names(sequences))
535 | as.data.frame(as.list(table(as.character(sequences[tgtvec]))))
536 | }
537 | ```
538 |
539 | Having defined the function, it just remains to include it in the call to
540 | `getFile()`.
541 |
542 | ```{r download_with_read}
543 | # Just download a single accession for demonstration, specifying a read function
544 | amoC_seq_counts <- getFile(
545 | mg, target_urls$download_url[[1]], read.func = getAmoCseqs)
546 |
547 | amoC_seq_counts
548 | ```
549 |
550 | ```{r session_info}
551 | sessionInfo()
552 | ```
553 |
--------------------------------------------------------------------------------