├── R
    ├── sysdata.rda
    ├── status.R
    ├── GenomicDataCommons-package.R
    ├── utilities.R
    ├── readDNAcopy.R
    ├── entity_name.R
    ├── facets.R
    ├── readHTSeqFile.R
    ├── gdc_token.R
    ├── mapping.R
    ├── expand.R
    ├── ids.R
    ├── constants.R
    ├── query.R
    ├── caching.R
    ├── slicing.R
    ├── manifest.R
    ├── clinical.R
    ├── REST.R
    ├── gdcdata.R
    ├── fields.R
    ├── bulk_transfer.R
    ├── filters.R
    └── response.R
├── _pkgdown.yml
├── tests
    ├── testthat.R
    ├── testthat
    │   ├── test_readHTSeqFile.R
    │   ├── test_cache.R
    │   ├── test_clinical.R
    │   ├── test_data.R
    │   └── test_api.R
    └── README.md
├── inst
    ├── extdata
    │   ├── dnacopy.tsv.gz
    │   └── example.htseq.counts.gz
    └── script
    │   ├── make_sysdata.R
    │   └── README.Rmd
├── vignettes
    ├── all_nodes_040318.png
    ├── questions-and-answers.Rmd
    ├── somatic_mutations.Rmd
    └── overview.Rmd
├── .gitignore
├── .Rbuildignore
├── man
    ├── status.Rd
    ├── constants.Rd
    ├── readDNAcopy.Rd
    ├── available_values.Rd
    ├── results.Rd
    ├── results_all.Rd
    ├── grep_fields.Rd
    ├── aggregations.Rd
    ├── select.Rd
    ├── expand.Rd
    ├── count.Rd
    ├── default_fields.Rd
    ├── available_fields.Rd
    ├── entity_name.Rd
    ├── write_manifest.Rd
    ├── mapping.Rd
    ├── available_expand.Rd
    ├── ids.Rd
    ├── make_filter.Rd
    ├── id_field.Rd
    ├── gdc_client.Rd
    ├── readHTSeqFile.Rd
    ├── faceting.Rd
    ├── response.Rd
    ├── field_description.Rd
    ├── GenomicDataCommons-package.Rd
    ├── manifest.Rd
    ├── gdc_token.Rd
    ├── gdc_clinical.Rd
    ├── gdc_cache.Rd
    ├── gdcdata.Rd
    ├── transfer.Rd
    ├── slicing.Rd
    ├── filtering.Rd
    └── query.Rd
├── GenomicDataCommons.Rproj
├── DESCRIPTION
├── NEWS.md
├── .github
    └── workflows
    │   └── basic_checks.yml
├── NAMESPACE
└── README.md


/R/sysdata.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/GenomicDataCommons/HEAD/R/sysdata.rda


--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
1 | url: http://bioconductor.github.io/GenomicDataCommons/
2 | template:
3 |   bootstrap: 5
4 | 
5 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(GenomicDataCommons)
3 | 
4 | test_check("GenomicDataCommons")
5 | 


--------------------------------------------------------------------------------
/inst/extdata/dnacopy.tsv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/GenomicDataCommons/HEAD/inst/extdata/dnacopy.tsv.gz


--------------------------------------------------------------------------------
/vignettes/all_nodes_040318.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/GenomicDataCommons/HEAD/vignettes/all_nodes_040318.png


--------------------------------------------------------------------------------
/inst/extdata/example.htseq.counts.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/GenomicDataCommons/HEAD/inst/extdata/example.htseq.counts.gz


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .Rproj.user
 2 | .Rhistory
 3 | .RData
 4 | .Ruserdata
 5 | *.html
 6 | *#
 7 | scratch
 8 | inst/doc/*
 9 | *_cache/
10 | docs
11 | .httr-oauth
12 | doc
13 | Meta
14 | 


--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^_pkgdown\.yml$
 2 | ^.*\.Rproj$
 3 | ^\.Rproj\.user$
 4 | ^\.travis\.yml$
 5 | ^appveyor\.yml$
 6 | .*cache$
 7 | .httr-oauth
 8 | ^DevNotes\.md$
 9 | scratch/*
10 | ^docs$
11 | _pkgdown.yml
12 | ^\.httr-oauth$
13 | ^doc$
14 | ^Meta$
15 | ^\.github$
16 | ^pkgdown$
17 | 


--------------------------------------------------------------------------------
/tests/testthat/test_readHTSeqFile.R:
--------------------------------------------------------------------------------
 1 | library(GenomicDataCommons)
 2 | context('readHTSeqFile')
 3 | 
 4 | # The bundled example counts file should load with 50 rows and 2 columns.
 5 | test_that("readHTSeqFile works on example data", {
 6 |     counts_path <- system.file('extdata/example.htseq.counts.gz',
 7 |                                package="GenomicDataCommons")
 8 |     counts <- readHTSeqFile(counts_path)
 9 |     expect_equal(nrow(counts), 50)
10 |     expect_equal(ncol(counts), 2)
11 | })
12 | 


--------------------------------------------------------------------------------
/R/status.R:
--------------------------------------------------------------------------------
 1 | #' Query the GDC for current status
 2 | #'
 3 | #' @param version (optional) character(1) version of GDC
 4 | #'
 5 | #' @return List describing current status.
 6 | #' 
 7 | #' @importFrom httr content
 8 | #'
 9 | #' @examples
10 | #' status()
11 | #' 
12 | #' @export
13 | status <- function(version=NULL) {
14 |     # Query the status endpoint, then parse the reply body as JSON.
15 |     path <- paste(version, "status", sep="/")
16 |     response <- .gdc_get(path, archive='default')
17 |     content(response, type="application/json")
18 | }
19 | 


--------------------------------------------------------------------------------
/R/GenomicDataCommons-package.R:
--------------------------------------------------------------------------------
 1 | #' GenomicDataCommons: A package for interfacing with the NCI GDC
 2 | #' 
 3 | #' @section finding data:
 4 | #' 
 5 | #' \itemize{
 6 | #' \item{\code{\link{query}}}
 7 | #' \item{\code{\link{cases}}}
 8 | #' \item{\code{\link{projects}}}
 9 | #' \item{\code{\link{files}}}
10 | #' \item{\code{\link{annotations}}}
11 | #' \item{\code{\link{mapping}}}
12 | #' }
13 | #'
14 | #' @section downloading data:
15 | #' data
16 | #'
17 | "_PACKAGE"
18 | 


--------------------------------------------------------------------------------
/man/status.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/status.R
 3 | \name{status}
 4 | \alias{status}
 5 | \title{Query the GDC for current status}
 6 | \usage{
 7 | status(version = NULL)
 8 | }
 9 | \arguments{
10 | \item{version}{(optional) character(1) version of GDC}
11 | }
12 | \value{
13 | List describing current status.
14 | }
15 | \description{
16 | Query the GDC for current status
17 | }
18 | \examples{
19 | status()
20 | 
21 | }
22 | 


--------------------------------------------------------------------------------
/GenomicDataCommons.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | ProjectId: c8e45341-d01a-414d-a98a-fb488c8baf55
 3 | 
 4 | RestoreWorkspace: Default
 5 | SaveWorkspace: Default
 6 | AlwaysSaveHistory: Default
 7 | 
 8 | EnableCodeIndexing: Yes
 9 | UseSpacesForTab: Yes
10 | NumSpacesForTab: 4
11 | Encoding: UTF-8
12 | 
13 | RnwWeave: knitr
14 | LaTeX: pdfLaTeX
15 | 
16 | BuildType: Package
17 | PackageUseDevtools: Yes
18 | PackageInstallArgs: --no-multiarch --with-keep.source --no-test-load
19 | PackageBuildArgs: --no-build-vignettes
20 | PackageCheckArgs: --no-vignettes
21 | PackageRoxygenize: rd,collate,namespace
22 | 


--------------------------------------------------------------------------------
/tests/testthat/test_cache.R:
--------------------------------------------------------------------------------
 1 | library(GenomicDataCommons)
 2 | context('cache_control')
 3 | 
 4 | # Remember the cache location in effect before these tests so that it can be
 5 | # put back at the end of the file.
 6 | cache <- gdc_cache()
 7 | 
 8 | test_that("getting cache returns length 1 char vector", {
 9 |     expect_length(gdc_cache(),1)
10 |     expect_true(is.character(gdc_cache()))
11 | })
12 | 
13 | test_that("setting cache works", {
14 |     # Use the session temporary directory rather than a hard-coded '/tmp',
15 |     # which is not a valid location on every platform (e.g. Windows).
16 |     cache_dir <- tempdir()
17 |     expect_equal(gdc_set_cache(cache_dir), cache_dir)
18 |     expect_equal(gdc_cache(), cache_dir)
19 | })
20 | 
21 | test_that("setting cache error checking works", {
22 |     # A non-character value and a length-2 vector are both expected to fail.
23 |     expect_error(gdc_set_cache(1))
24 |     expect_error(gdc_set_cache(c('a','b')))
25 | })
26 | 
27 | # Put the original cache location back.
28 | gdc_set_cache(cache)
29 | 
30 | 


--------------------------------------------------------------------------------
/man/constants.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/constants.R
 3 | \name{endpoints}
 4 | \alias{endpoints}
 5 | \alias{parameters}
 6 | \title{Endpoints and Parameters}
 7 | \usage{
 8 | endpoints()
 9 | 
10 | parameters()
11 | }
12 | \value{
13 | \code{endpoints()} returns a character vector of possible
14 |     endpoints.
15 | 
16 | \code{parameters()} returns a list of possible parameters
17 |     and their default values.
18 | }
19 | \description{
20 | \code{endpoints()} returns available endpoints.
21 | }
22 | \examples{
23 | endpoints()
24 | parameters()
25 | }
26 | \keyword{internal}
27 | 


--------------------------------------------------------------------------------
/man/readDNAcopy.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/readDNAcopy.R
 3 | \name{readDNAcopy}
 4 | \alias{readDNAcopy}
 5 | \title{Read DNAcopy results into GRanges object}
 6 | \usage{
 7 | readDNAcopy(fname, ...)
 8 | }
 9 | \arguments{
10 | \item{fname}{The path to a DNAcopy-like file.}
11 | 
12 | \item{...}{passed to \code{\link[readr]{read_tsv}}}
13 | }
14 | \value{
15 | a \code{\link[GenomicRanges]{GRanges}} object
16 | }
17 | \description{
18 | Read DNAcopy results into GRanges object
19 | }
20 | \examples{
21 | fname = system.file(package='GenomicDataCommons',
22 |                     'extdata/dnacopy.tsv.gz')
23 | dnac = readDNAcopy(fname)
24 | class(dnac)
25 | length(dnac)
26 | 
27 | }
28 | 


--------------------------------------------------------------------------------
/man/available_values.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/fields.R
 3 | \name{available_values}
 4 | \alias{available_values}
 5 | \title{Find common values for a GDC field}
 6 | \usage{
 7 | available_values(entity, field)
 8 | }
 9 | \arguments{
10 | \item{entity}{character(1), a GDC entity ("cases", "files", "annotations", "projects")}
11 | 
12 | \item{field}{character(1), a field that is present in the entity record}
13 | }
14 | \value{
15 | character vector of the top 100 (or fewer) most frequent
16 |     values for the given field
17 | }
18 | \description{
19 | Find common values for a GDC field
20 | }
21 | \examples{
22 | available_values('files','cases.project.project_id')[1:5]
23 | 
24 | }
25 | 


--------------------------------------------------------------------------------
/inst/script/make_sysdata.R:
--------------------------------------------------------------------------------
 1 | # Developer script: download the GDC "Available Fields" appendix page and save
 2 | # the field names extracted from it to R/sysdata.rda.
 3 | library(httr)
 4 | library(xml2)
 5 | 
 6 | # Package checkout to write into. Defaults to the original hard-coded path;
 7 | # set the GDC_PKGHOME environment variable to use a different checkout.
 8 | pkghome <- Sys.getenv("GDC_PKGHOME", unset = "~/a/GenomicDataCommons")
 9 | 
10 | url <-
11 |     "https://gdc-docs.nci.nih.gov/API/Users_Guide/Appendix_A_Available_Fields/"
12 | response <- GET(url)
13 | # Stop on an HTTP error rather than parsing an error page and overwriting
14 | # sysdata.rda with whatever (possibly empty) fields it yields.
15 | stop_for_status(response)
16 | xml <- content(response)
17 | 
18 | # Text of every node matching 'xpath', whitespace-trimmed, empty strings
19 | # dropped.
20 | .get_field <- function(xml, xpath) {
21 |     fields <- as.character(xml_find_all(xml, xpath))
22 |     Filter(nzchar, trimws(fields))
23 | }
24 | 
25 | # Field names are the first-cell text of each row in the tables selected by
26 | # //table[1] .. //table[4], assigned to projects, files, cases and
27 | # annotations respectively.
28 | .project_fields <- .get_field(xml, "//table[1]//tr/td[1]/text()")
29 | .file_fields <- .get_field(xml, "//table[2]//tr/td[1]/text()")
30 | .case_fields <- .get_field(xml, "//table[3]//tr/td[1]/text()")
31 | .annotation_fields <- .get_field(xml, "//table[4]//tr/td[1]/text()")
32 | 
33 | save(.project_fields, .file_fields, .case_fields, .annotation_fields,
34 |      file=file.path(pkghome, "R", "sysdata.rda"))
35 | 


--------------------------------------------------------------------------------
/man/results.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/response.R
 3 | \name{results}
 4 | \alias{results}
 5 | \alias{results.GDCQuery}
 6 | \alias{results.GDCResponse}
 7 | \title{results}
 8 | \usage{
 9 | results(x, ...)
10 | 
11 | \method{results}{GDCQuery}(x, ...)
12 | 
13 | \method{results}{GDCResponse}(x, ...)
14 | }
15 | \arguments{
16 | \item{x}{a \code{\link{GDCQuery}} object}
17 | 
18 | \item{...}{passed on to \code{\link{response}}}
19 | }
20 | \value{
21 | A (typically nested) \code{list} of GDC records
22 | }
23 | \description{
24 | results
25 | }
26 | \section{Methods (by class)}{
27 | \itemize{
28 | \item \code{results(GDCQuery)}: 
29 | 
30 | \item \code{results(GDCResponse)}: 
31 | 
32 | }}
33 | \examples{
34 | qcases = cases() |> results()
35 | length(qcases)
36 | 
37 | }
38 | 


--------------------------------------------------------------------------------
/R/utilities.R:
--------------------------------------------------------------------------------
 1 | # cat() with an empty separator; any 'sep' supplied by the caller is ignored.
 2 | .cat0 <- function(..., sep=NULL) {
 3 |     cat(..., sep="")
 4 | }
 5 | 
 6 | # Join 'x' into one comma-separated string, wrap it with a four-space indent
 7 | # on every line, and glue the wrapped lines together with newlines.
 8 | .wrapstr <- function(x) {
 9 |     joined <- paste(x, collapse=", ")
10 |     wrapped <- strwrap(joined, indent=4, exdent=4)
11 |     paste(wrapped, collapse="\n")
12 | }
13 | 
14 | # Make sure 'destination_dir' (a single non-empty string) is a directory,
15 | # creating it recursively when nothing exists at that path. Errors when the
16 | # path exists but is not a directory.
17 | .dir_validate_or_create <- function(destination_dir) {
18 |     stopifnot(is.character(destination_dir), length(destination_dir) == 1L,
19 |               nzchar(destination_dir))
20 |     # nothing to do when the directory is already there
21 |     if (dir.exists(destination_dir))
22 |         return(invisible(NULL))
23 |     if (file.exists(destination_dir))
24 |         stop("'destination_dir' exists but is not a directory")
25 |     dir.create(destination_dir, recursive = TRUE)
26 | }
27 | 
28 | # (internal) return character(0) instead of NULL
29 | #
30 | # Any non-NULL input is handed back unchanged.
31 | .ifNullCharacterZero <- function(x) {
32 |     if (is.null(x)) character(0) else x
33 | }
34 | 


--------------------------------------------------------------------------------
/man/results_all.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/response.R
 3 | \name{results_all}
 4 | \alias{results_all}
 5 | \alias{results_all.GDCQuery}
 6 | \alias{results_all.GDCResponse}
 7 | \title{results_all}
 8 | \usage{
 9 | results_all(x)
10 | 
11 | \method{results_all}{GDCQuery}(x)
12 | 
13 | \method{results_all}{GDCResponse}(x)
14 | }
15 | \arguments{
16 | \item{x}{a \code{\link{GDCQuery}} object}
17 | }
18 | \value{
19 | A (typically nested) \code{list} of GDC records
20 | }
21 | \description{
22 | results_all
23 | }
24 | \section{Methods (by class)}{
25 | \itemize{
26 | \item \code{results_all(GDCQuery)}: 
27 | 
28 | \item \code{results_all(GDCResponse)}: 
29 | 
30 | }}
31 | \examples{
32 | # details of all available projects
33 | projResults = projects() |> results_all()
34 | length(projResults)
35 | count(projects())
36 | 
37 | 
38 | }
39 | 


--------------------------------------------------------------------------------
/tests/testthat/test_clinical.R:
--------------------------------------------------------------------------------
 1 | # gdc_clinical() should give a GDCClinicalList holding the expected tables;
 2 | # every table except "exposures" must be a data frame with at least `sizen`
 3 | # rows.
 4 | test_that("clinical data is structured properly", {
 5 |     sizen <- 3
 6 |     case_ids <- ids(results(cases(), size=sizen))
 7 |     clinical_data <- gdc_clinical(case_ids)
 8 | 
 9 |     # overview of clinical results
10 |     expect_true(is(clinical_data, "GDCClinicalList"))
11 |     expected_tables <- c("demographic", "diagnoses", "exposures",
12 |                          "follow_ups", "main")
13 |     expect_true(all(expected_tables %in% names(clinical_data)))
14 | 
15 |     ## exposures has no rows
16 |     not_exposures <- names(clinical_data) != "exposures"
17 |     clinical_data <- clinical_data[not_exposures]
18 |     row_counts <- vapply(clinical_data, nrow, integer(1L))
19 |     expect_true(all(row_counts >= sizen))
20 |     table_is_df <- vapply(clinical_data, is.data.frame, logical(1L))
21 |     expect_true(all(table_is_df))
22 | })
23 | 


--------------------------------------------------------------------------------
/man/grep_fields.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/fields.R
 3 | \name{grep_fields}
 4 | \alias{grep_fields}
 5 | \title{Find matching field names}
 6 | \usage{
 7 | grep_fields(entity, pattern, ..., value = TRUE)
 8 | }
 9 | \arguments{
10 | \item{entity}{one of the available gdc entities ('files','cases',...)
11 | against which to gather available fields for matching}
12 | 
13 | \item{pattern}{A regular expression that will be used
14 | in a call to \code{\link{grep}}}
15 | 
16 | \item{...}{passed on to grep}
17 | 
18 | \item{value}{logical(1) whether to return values as opposed
19 | to indices (passed along to grep)}
20 | }
21 | \value{
22 | character() vector of field names matching
23 |     \code{pattern}
24 | }
25 | \description{
26 | This utility function allows quick text-based search of available
27 | fields for using \code{\link{grep}}
28 | }
29 | \examples{
30 | grep_fields('files','analysis')
31 | 
32 | }
33 | 


--------------------------------------------------------------------------------
/man/aggregations.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/response.R
 3 | \name{aggregations}
 4 | \alias{aggregations}
 5 | \alias{aggregations.GDCQuery}
 6 | \alias{aggregations.GDCResponse}
 7 | \title{aggregations}
 8 | \usage{
 9 | aggregations(x)
10 | 
11 | \method{aggregations}{GDCQuery}(x)
12 | 
13 | \method{aggregations}{GDCResponse}(x)
14 | }
15 | \arguments{
16 | \item{x}{a \code{\link{GDCQuery}} object}
17 | }
18 | \value{
19 | a \code{list} of \code{data.frame} with one
20 | member for each requested facet. The data frames
21 | each have two columns, key and doc_count.
22 | }
23 | \description{
24 | aggregations
25 | }
26 | \section{Methods (by class)}{
27 | \itemize{
28 | \item \code{aggregations(GDCQuery)}: 
29 | 
30 | \item \code{aggregations(GDCResponse)}: 
31 | 
32 | }}
33 | \examples{
34 | # Number of each file type
35 | res = files() |> facet(c('type','data_type')) |> aggregations()
36 | res$type
37 | 
38 | }
39 | 


--------------------------------------------------------------------------------
/man/select.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/fields.R
 3 | \name{select}
 4 | \alias{select}
 5 | \alias{select.GDCQuery}
 6 | \title{S3 generic to set GDCQuery fields}
 7 | \usage{
 8 | select(x, fields)
 9 | 
10 | \method{select}{GDCQuery}(x, fields)
11 | }
12 | \arguments{
13 | \item{x}{the objects on which to set fields}
14 | 
15 | \item{fields}{a character vector specifying the fields}
16 | }
17 | \value{
18 | A \code{\link{GDCQuery}} object, with the fields
19 | member altered.
20 | }
21 | \description{
22 | S3 generic to set GDCQuery fields
23 | }
24 | \section{Methods (by class)}{
25 | \itemize{
26 | \item \code{select(GDCQuery)}: set fields on a GDCQuery object
27 | 
28 | }}
29 | \examples{
30 | gProj = projects()
31 | gProj$fields
32 | head(available_fields(gProj))
33 | default_fields(gProj)
34 | 
35 | gProj |>
36 |   select(default_fields(gProj)[1:2]) |>
37 |   response() |>
38 |   str(max_level=2)
39 | 
40 | }
41 | 


--------------------------------------------------------------------------------
/R/readDNAcopy.R:
--------------------------------------------------------------------------------
 1 | #' Read DNAcopy results into GRanges object
 2 | #'
 3 | #' @param fname The path to a DNAcopy-like file.
 4 | #' @param ... passed to \code{\link[readr]{read_tsv}}
 5 | #' @return a \code{\link[GenomicRanges]{GRanges}} object
 6 | #' 
 7 | #' @importFrom readr read_tsv
 8 | #' @import GenomicRanges
 9 | #' @importFrom IRanges IRanges
10 | #'
11 | #' @examples
12 | #' fname = system.file(package='GenomicDataCommons',
13 | #'                     'extdata/dnacopy.tsv.gz')
14 | #' dnac = readDNAcopy(fname)
15 | #' class(dnac)
16 | #' length(dnac)
17 | #'
18 | #' @export
19 | readDNAcopy <- function(fname,...) {
20 |     stopifnot(file.exists(fname))
21 |     res <- read_tsv(fname, ...)
22 |     stopifnot(ncol(res)==6)
23 |     # columns by position: 1 sampleName, 2 seqnames, 3 start, 4 end,
24 |     # 5 Num_Probes, 6 value
25 |     segment_ranges <- IRanges(start=res[[3]], end=res[[4]])
26 |     GRanges(seqnames=res[[2]], ranges=segment_ranges, sampleName=res[[1]],
27 |             Num_Probes=res[[5]], value=res[[6]])
28 | }
29 | 


--------------------------------------------------------------------------------
/man/expand.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/expand.R
 3 | \name{expand}
 4 | \alias{expand}
 5 | \alias{expand.GDCQuery}
 6 | \title{Set the \code{expand} parameter}
 7 | \usage{
 8 | expand(x, expand)
 9 | 
10 | \method{expand}{GDCQuery}(x, expand)
11 | }
12 | \arguments{
13 | \item{x}{the objects on which to set fields}
14 | 
15 | \item{expand}{a character vector specifying the fields}
16 | }
17 | \value{
18 | A \code{\link{GDCQuery}} object, with the \code{expand}
19 | member altered.
20 | }
21 | \description{
22 | S3 generic to set GDCQuery expand parameter
23 | }
24 | \section{Methods (by class)}{
25 | \itemize{
26 | \item \code{expand(GDCQuery)}: set expand fields on a GDCQuery object
27 | 
28 | }}
29 | \examples{
30 | gProj = projects()
31 | gProj$fields
32 | head(available_fields(gProj))
33 | default_fields(gProj)
34 | 
35 | gProj |>
36 |   select(default_fields(gProj)[1:2]) |>
37 |   response() |>
38 |   str(max_level=2)
39 | 
40 | }
41 | 


--------------------------------------------------------------------------------
/man/count.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/response.R
 3 | \name{count}
 4 | \alias{count}
 5 | \alias{count.GDCQuery}
 6 | \alias{count.GDCResponse}
 7 | \title{provide count of records in a \code{\link{GDCQuery}}}
 8 | \usage{
 9 | count(x, ...)
10 | 
11 | \method{count}{GDCQuery}(x, ...)
12 | 
13 | \method{count}{GDCResponse}(x, ...)
14 | }
15 | \arguments{
16 | \item{x}{a \code{\link{GDCQuery}} object}
17 | 
18 | \item{...}{passed to httr (good for passing config info, etc.)}
19 | }
20 | \value{
21 | integer(1) representing the count of records that will
22 |  be returned by the current query
23 | }
24 | \description{
25 | provide count of records in a \code{\link{GDCQuery}}
26 | }
27 | \section{Methods (by class)}{
28 | \itemize{
29 | \item \code{count(GDCQuery)}: 
30 | 
31 | \item \code{count(GDCResponse)}: 
32 | 
33 | }}
34 | \examples{
35 | # total number of projects
36 | projects() |> count()
37 | 
38 | # total number of cases
39 | cases() |> count()
40 | 
41 | }
42 | 


--------------------------------------------------------------------------------
/man/default_fields.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/fields.R
 3 | \name{default_fields}
 4 | \alias{default_fields}
 5 | \alias{default_fields.character}
 6 | \alias{default_fields.GDCQuery}
 7 | \title{S3 Generic to return default GDC fields}
 8 | \usage{
 9 | default_fields(x)
10 | 
11 | \method{default_fields}{character}(x)
12 | 
13 | \method{default_fields}{GDCQuery}(x)
14 | }
15 | \arguments{
16 | \item{x}{A character string ('cases','files','projects',
17 | 'annotations') or a subclass of \code{\link{GDCQuery}}.}
18 | }
19 | \value{
20 | a character vector of the default fields
21 | }
22 | \description{
23 | S3 Generic to return default GDC fields
24 | }
25 | \section{Methods (by class)}{
26 | \itemize{
27 | \item \code{default_fields(character)}: character method
28 | 
29 | \item \code{default_fields(GDCQuery)}: GDCQuery method
30 | 
31 | }}
32 | \examples{
33 | default_fields('projects')
34 | projQuery = query('projects')
35 | default_fields(projQuery)
36 | 
37 | }
38 | 


--------------------------------------------------------------------------------
/tests/testthat/test_data.R:
--------------------------------------------------------------------------------
 1 | library(GenomicDataCommons)
 2 | context('data handling')
 3 | 
 4 | case_ids <- ids(results(cases(), size=10))
 5 | 
 6 | test_that("manifest files", {
 7 |     file_manifest <- files() |> manifest(size = 10)
 8 |     expect_identical(nrow(file_manifest), 10L)
 9 |     expect_true(ncol(file_manifest) > 5)
10 | })
11 | 
12 | test_that("write_manifest", {
13 |     manifest_df <- manifest(files(), size=10)
14 |     out_path <- tempfile()
15 |     write_manifest(manifest_df, out_path)
16 |     expect_true(file.exists(out_path))
17 |     unlink(out_path)
18 | })
19 | 
20 | test_that("gdcdata", {
21 |     d <- tempfile()
22 |     if (!dir.exists(d))
23 |         dir.create(d)
24 |     # Download into a throwaway cache directory. The previous cache location
25 |     # is restored (and the directory removed) even when the download errors,
26 |     # so the deleted directory is never left configured as the cache.
27 |     previous_cache <- gdc_cache()
28 |     gdc_set_cache(d)
29 |     tryCatch({
30 |         few_file_ids <- files() |>
31 |             filter( ~ cases.project.project_id == 'TCGA-SARC' &
32 |                 data_type == 'Copy Number Segment' &
33 |                 analysis.workflow_type == 'DNAcopy') |>
34 |             results(size=2) |>
35 |             ids()
36 | 
37 |         res <- gdcdata(few_file_ids)
38 |         expect_length(res, 2)
39 |         expect_named(res)
40 |     }, finally = {
41 |         gdc_set_cache(previous_cache)
42 |         unlink(d, recursive = TRUE)
43 |     })
44 | })
45 | 


--------------------------------------------------------------------------------
/man/available_fields.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/fields.R
 3 | \name{available_fields}
 4 | \alias{available_fields}
 5 | \alias{available_fields.GDCQuery}
 6 | \alias{available_fields.character}
 7 | \title{S3 Generic to return all GDC fields}
 8 | \usage{
 9 | available_fields(x)
10 | 
11 | \method{available_fields}{GDCQuery}(x)
12 | 
13 | \method{available_fields}{character}(x)
14 | }
15 | \arguments{
16 | \item{x}{A character(1) string ('cases','files','projects',
17 | 'annotations') or a subclass of \code{\link{GDCQuery}}.}
18 | }
19 | \value{
20 | a character vector of all available fields
21 | }
22 | \description{
23 | S3 Generic to return all GDC fields
24 | }
25 | \section{Methods (by class)}{
26 | \itemize{
27 | \item \code{available_fields(GDCQuery)}: GDCQuery method
28 | 
29 | \item \code{available_fields(character)}: character method
30 | 
31 | }}
32 | \examples{
33 | available_fields('projects')
34 | projQuery = query('projects')
35 | available_fields(projQuery)
36 | 
37 | }
38 | 


--------------------------------------------------------------------------------
/man/entity_name.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/entity_name.R
 3 | \name{entity_name}
 4 | \alias{entity_name}
 5 | \alias{entity_name.GDCQuery}
 6 | \alias{entity_name.GDCResults}
 7 | \title{Get the entity name from a GDCQuery object}
 8 | \usage{
 9 | entity_name(x)
10 | 
11 | \method{entity_name}{GDCQuery}(x)
12 | 
13 | \method{entity_name}{GDCResults}(x)
14 | }
15 | \arguments{
16 | \item{x}{a \code{\link{GDCQuery}} object}
17 | }
18 | \value{
19 | character(1) name of an associated entity; one of
20 | "cases", "files", "projects", "annotations".
21 | }
22 | \description{
23 | An "entity" is simply one of the four metadata endpoints.
24 | \itemize{
25 | \item{cases}
26 | \item{projects}
27 | \item{files}
28 | \item{annotations}
29 | }
30 | All \code{\link{GDCQuery}} objects will have an entity name. This S3 method
31 | is simply a utility accessor for those names.
32 | }
33 | \examples{
34 | qcases = cases()
35 | qprojects = projects()
36 | 
37 | entity_name(qcases)
38 | entity_name(qprojects)
39 | 
40 | }
41 | 


--------------------------------------------------------------------------------
/man/write_manifest.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/manifest.R
 3 | \name{write_manifest}
 4 | \alias{write_manifest}
 5 | \title{write a manifest data.frame to disk}
 6 | \usage{
 7 | write_manifest(manifest, destfile = tempfile())
 8 | }
 9 | \arguments{
10 | \item{manifest}{A data.frame with five columns, typically
11 | created by a call to \code{\link{manifest}}}
12 | 
13 | \item{destfile}{The filename for saving the manifest.}
14 | }
15 | \value{
16 | character(1) the destination file name.
17 | }
18 | \description{
19 | The \code{\link{manifest}} method creates a data.frame
20 | that represents the data for a manifest file needed
21 | by the GDC Data Transfer Tool. While the file format
22 | is nothing special, this is a simple helper function
23 | to write a manifest data.frame to disk. It returns
24 | the path to which the file is written, so it can
25 | be used "in-line" in a call to \code{\link{transfer}}.
26 | }
27 | \examples{
28 | mf = files() |> manifest(size=10)
29 | write_manifest(mf)
30 | 
31 | }
32 | 


--------------------------------------------------------------------------------
/man/mapping.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/mapping.R
 3 | \name{mapping}
 4 | \alias{mapping}
 5 | \title{Query GDC for available endpoint fields}
 6 | \usage{
 7 | mapping(endpoint)
 8 | }
 9 | \arguments{
10 | \item{endpoint}{character(1) corresponding to endpoints for which
11 | users may specify additional or alternative fields. Endpoints
12 | include \dQuote{projects}, \dQuote{cases}, \dQuote{files}, and
13 | \dQuote{annotations}.}
14 | }
15 | \value{
16 | A data frame describing the field (field name), full (full
17 |     data model name), type (data type), and four additional columns
18 |     describing the "set" to which the fields belong--\dQuote{default},
19 |     \dQuote{expand}, \dQuote{multi}, and \dQuote{nested}.
20 | }
21 | \description{
22 | Query GDC for available endpoint fields
23 | }
24 | \examples{
25 | map <- mapping("projects")
26 | head(map)
27 | # get only the "default" fields
28 | subset(map,defaults)
29 | # And get just the text names of the "default" fields
30 | subset(map,defaults)$field
31 | 
32 | }
33 | 


--------------------------------------------------------------------------------
/man/available_expand.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/expand.R
 3 | \name{available_expand}
 4 | \alias{available_expand}
 5 | \alias{available_expand.character}
 6 | \alias{available_expand.GDCQuery}
 7 | \title{Return valid values for "expand"}
 8 | \usage{
 9 | available_expand(entity)
10 | 
11 | \method{available_expand}{character}(entity)
12 | 
13 | \method{available_expand}{GDCQuery}(entity)
14 | }
15 | \arguments{
16 | \item{entity}{Either a \code{\link{GDCQuery}} object
17 | or a character(1) specifying a GDC entity ('cases', 'files',
18 | 'annotations', 'projects')}
19 | }
20 | \value{
21 | A character vector
22 | }
23 | \description{
24 | The GDC allows a shorthand for specifying groups
25 | of fields to be returned by the metadata queries.
26 | These can be specified in a \code{\link{select}}
27 | method call to easily supply groups of fields.
28 | }
29 | \examples{
30 | head(available_expand('files'))
31 | 
32 | }
33 | \seealso{
34 | See \url{https://docs.gdc.cancer.gov/API/Users_Guide/Search_and_Retrieval/#expand}
35 | for details
36 | }
37 | 


--------------------------------------------------------------------------------
/man/ids.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/ids.R
 3 | \name{ids}
 4 | \alias{ids}
 5 | \alias{ids.GDCManifest}
 6 | \alias{ids.GDCQuery}
 7 | \alias{ids.GDCResults}
 8 | \alias{ids.GDCResponse}
 9 | \title{Get the ids associated with a GDC query or response}
10 | \usage{
11 | ids(x)
12 | 
13 | \method{ids}{GDCManifest}(x)
14 | 
15 | \method{ids}{GDCQuery}(x)
16 | 
17 | \method{ids}{GDCResults}(x)
18 | 
19 | \method{ids}{GDCResponse}(x)
20 | }
21 | \arguments{
22 | \item{x}{A \code{\link{GDCQuery}}, \code{\link{GDCResponse}}, \code{GDCResults}, or \code{GDCManifest} object}
23 | }
24 | \value{
25 | a character vector of all the entity ids
26 | }
27 | \description{
28 | The GDC assigns ids (in the form of uuids) to objects in its database. Those
29 | ids can be used for relationships, searching on the website, and as
30 | unique ids.
31 | }
32 | \examples{
33 | # use with a GDC query, in this case for "cases"
34 | ids(cases() |> filter(~ project.project_id == "TCGA-CHOL"))
35 | # also works for responses
36 | ids(response(files()))
37 | # and results
38 | ids(results(cases()))
39 | 
40 | 
41 | }
42 | 


--------------------------------------------------------------------------------
/man/make_filter.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/filters.R
 3 | \name{make_filter}
 4 | \alias{make_filter}
 5 | \title{Create NCI GDC filters for limiting GDC query results}
 6 | \usage{
 7 | make_filter(expr, available_fields)
 8 | }
 9 | \arguments{
10 | \item{expr}{a lazy-wrapped expression or a formula RHS equivalent}
11 | 
12 | \item{available_fields}{a character vector of the
13 | additional names that will be injected into the
14 | filter evaluation environment}
15 | }
16 | \value{
17 | a \code{list} that represents an R version
18 | of the JSON that will ultimately be used in an
19 | NCI GDC search or other query.
20 | }
21 | \description{
22 | Searching the NCI GDC allows for complex filtering based
23 | on logical operations and simple comparisons.  This function
24 | facilitates writing such filter expressions in R-like syntax
25 | with R code evaluation.
26 | }
27 | \details{
28 | If used with available_fields, "bare" fields that are
29 | named in the available_fields character vector can be used
30 | in the filter expression without quotes.
31 | }
32 | 


--------------------------------------------------------------------------------
/man/id_field.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/ids.R
 3 | \name{id_field}
 4 | \alias{id_field}
 5 | \alias{id_field.GDCQuery}
 6 | \alias{id_field.GDCResults}
 7 | \title{get the name of the id field}
 8 | \usage{
 9 | id_field(x)
10 | 
11 | \method{id_field}{GDCQuery}(x)
12 | 
13 | \method{id_field}{GDCResults}(x)
14 | }
15 | \arguments{
16 | \item{x}{An object representing the query or results 
17 | of an entity from the GDC ("cases", "files", "annotations", "projects")}
18 | }
19 | \value{
20 | character(1) such as "case_id", "file_id", etc.
21 | }
22 | \description{
23 | In many places in the GenomicDataCommons package,
24 | the entity ids are stored in a column or a vector
25 | with a specific name that corresponds to the field name 
26 | at the GDC. The format is the entity name (singular) followed by "_id".
27 | This generic simply returns that name from a given object.
28 | }
29 | \section{Methods (by class)}{
30 | \itemize{
31 | \item \code{id_field(GDCQuery)}: GDCQuery method
32 | 
33 | \item \code{id_field(GDCResults)}: GDCResults method
34 | 
35 | }}
36 | \examples{
37 | id_field(cases())
38 | 
39 | }
40 | 


--------------------------------------------------------------------------------
/R/entity_name.R:
--------------------------------------------------------------------------------
 1 | #' Get the entity name from a GDCQuery object
 2 | #'
 3 | #' An "entity" is simply one of the four metadata endpoints.
 4 | #' \itemize{
 5 | #' \item{cases}
 6 | #' \item{projects}
 7 | #' \item{files}
 8 | #' \item{annotations}
 9 | #' }
10 | #' All \code{\link{GDCQuery}} objects will have an entity name. This S3 method
11 | #' is simply a utility accessor for those names.
12 | #' 
13 | #' @param x a \code{\link{GDCQuery}} object
14 | #'
15 | #' @return character(1) name of an associated entity; one of
16 | #' "cases", "files", "projects", "annotations".
17 | #' 
18 | #' @examples
19 | #' qcases = cases()
20 | #' qprojects = projects()
21 | #' 
22 | #' entity_name(qcases)
23 | #' entity_name(qprojects)
24 | #' 
25 | #' @export
26 | entity_name <- function(x) {
27 |     UseMethod("entity_name")  # S3 dispatch on the class of `x`
28 | }
29 | 
30 | 
31 | #' @rdname entity_name
32 | #' @export
33 | entity_name.GDCQuery <- function(x) {
34 |     primary_class <- class(x)[[1L]]  # first S3 class, e.g. "gdc_cases"
35 |     substring(primary_class, 5L)     # everything after the first four characters
36 | }
37 | 
38 | #' @rdname entity_name
39 | #' @export
40 | entity_name.GDCResults <- function(x) {
41 |     cls_name <- class(x)[[1L]]  # NOTE(review): -8 fits a "Response" suffix; "Results" is 7 chars -- confirm
42 |     substr(cls_name, 4L, nchar(cls_name) - 8L)  # characters 4..(n - 8) of the first class name
43 | }
44 | 
45 | 
46 | 
47 | 


--------------------------------------------------------------------------------
/man/gdc_client.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/bulk_transfer.R
 3 | \name{gdc_client}
 4 | \alias{gdc_client}
 5 | \title{return gdc-client executable path}
 6 | \usage{
 7 | gdc_client()
 8 | }
 9 | \value{
10 | character(1) the path to the gdc-client executable.
11 | }
12 | \description{
13 | This function is a convenience function to 
14 | find and return the path to the GDC Data Transfer
15 | Tool executable assumed to be named 'gdc-client'. 
16 | The assumption is that the appropriate version of the
17 | GDC Data Transfer Tool is a separate download available
18 | from \href{https://gdc.cancer.gov/access-data/gdc-data-transfer-tool}{the GDC website}
19 | and, as a backup, \href{https://github.com/NCI-GDC/gdc-client}{on GitHub}.
20 | }
21 | \details{
22 | The path is checked in the following order:
23 | \enumerate{
24 | \item an R option("gdc_client")
25 | \item an environment variable GDC_CLIENT
26 | \item from the search PATH
27 | \item in the current working directory
28 | }
29 | }
30 | \examples{
31 | # this cannot run without first
32 | # downloading the GDC Data Transfer Tool
33 | gdc_client = try(gdc_client(),silent=TRUE)
34 | 
35 | }
36 | 


--------------------------------------------------------------------------------
/man/readHTSeqFile.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/readHTSeqFile.R
 3 | \name{readHTSeqFile}
 4 | \alias{readHTSeqFile}
 5 | \title{Read a single htseq-counts result file.}
 6 | \usage{
 7 | readHTSeqFile(fname, samplename = "sample", ...)
 8 | }
 9 | \arguments{
10 | \item{fname}{character(1), the path of the htseq-count file.}
11 | 
12 | \item{samplename}{character(1), the name of the sample. This will
13 | become the name of the second column on the resulting
14 | \code{data.frame}, making for easier merging if necessary.}
15 | 
16 | \item{...}{passed to \code{\link[readr]{read_tsv}}}
17 | }
18 | \value{
19 | a two-column data frame
20 | }
21 | \description{
22 | The htseq package is used extensively to count reads
23 | relative to regions (see 
24 | \url{http://www-huber.embl.de/HTSeq/doc/counting.html}).
25 | The output of htseq-count is a simple two-column table
26 | that includes features in column 1 and counts in column 2.
27 | This function simply reads in the data from one such file
28 | and assigns column names.
29 | }
30 | \examples{
31 | fname = system.file(package='GenomicDataCommons',
32 |                     'extdata/example.htseq.counts.gz')
33 | dat = readHTSeqFile(fname)
34 | head(dat)
35 | 
36 | }
37 | 


--------------------------------------------------------------------------------
/man/faceting.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/facets.R
 3 | \name{facet}
 4 | \alias{facet}
 5 | \alias{get_facets}
 6 | \alias{get_facets.GDCQuery}
 7 | \title{Set facets for a \code{\link{GDCQuery}}}
 8 | \usage{
 9 | facet(x, facets)
10 | 
11 | get_facets(x)
12 | 
13 | \method{get_facets}{GDCQuery}(x)
14 | }
15 | \arguments{
16 | \item{x}{a \code{\link{GDCQuery}} object}
17 | 
18 | \item{facets}{a character vector of fields that
19 | will be used for forming aggregations (facets).
20 | Default is to set facets for all default fields.
21 | See \code{\link{default_fields}} for details}
22 | }
23 | \value{
24 | returns a \code{\link{GDCQuery}} object,
25 | with facets field updated.
26 | }
27 | \description{
28 | Set facets for a \code{\link{GDCQuery}}
29 | 
30 | Get facets for a \code{\link{GDCQuery}}
31 | }
32 | \examples{
33 | # create a new GDCQuery against the projects endpoint
34 | gProj = projects()
35 | 
36 | # default facets are NULL
37 | get_facets(gProj)
38 | 
39 | # set facets and save result
40 | gProjFacet = facet(gProj)
41 | 
42 | # check facets
43 | get_facets(gProjFacet)
44 | 
45 | # and get a response, noting that
46 | # the aggregations list member contains
47 | # tibbles for each facet
48 | str(response(gProjFacet,size=2),max.level=2)
49 | 
50 | }
51 | 


--------------------------------------------------------------------------------
/man/response.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/response.R
 3 | \name{response}
 4 | \alias{response}
 5 | \alias{GDCResponse}
 6 | \alias{response.GDCQuery}
 7 | \alias{response_all}
 8 | \title{Fetch \code{\link{GDCQuery}} metadata from GDC}
 9 | \usage{
10 | response(x, ...)
11 | 
12 | \method{response}{GDCQuery}(x, from = 0, size = 10, ..., response_handler = jsonlite::fromJSON)
13 | 
14 | response_all(x, ...)
15 | }
16 | \arguments{
17 | \item{x}{a \code{\link{GDCQuery}} object}
18 | 
19 | \item{...}{passed to httr (good for passing config info, etc.)}
20 | 
21 | \item{from}{integer index from which to start returning data}
22 | 
23 | \item{size}{number of records to return}
24 | 
25 | \item{response_handler}{a function that processes JSON (as text)
26 | and returns an R object.  Default is \code{\link[jsonlite]{fromJSON}}.}
27 | }
28 | \value{
29 | A \code{GDCResponse} object which is a list with the following
30 | members:
31 | \itemize{
32 | \item{results}
33 | \item{query}
34 | \item{aggregations}
35 | \item{pages}
36 | }
37 | }
38 | \description{
39 | Fetch \code{\link{GDCQuery}} metadata from GDC
40 | }
41 | \examples{
42 | 
43 | # basic class stuff
44 | gCases = cases()
45 | resp = response(gCases)
46 | class(resp)
47 | names(resp)
48 | 
49 | # And results from query
50 | resp$results[[1]]
51 | 
52 | }
53 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: GenomicDataCommons
 2 | Type: Package
 3 | Title: NIH / NCI Genomic Data Commons Access
 4 | Description: Programmatically access the NIH / NCI Genomic Data Commons
 5 |         RESTful service.
 6 | Version: 1.35.1
 7 | Date: 2025-05-12
 8 | Authors@R: c( person("Martin", "Morgan",
 9 |         email="martin.morgan@roswellpark.org", role=c("aut")),
10 |         person("Sean", "Davis", email="seandavi@gmail.com",
11 |         role=c("aut", "cre")),
12 |         person("Marcel", "Ramos",
13 |         email = "marcel.ramos@sph.cuny.edu", role = "ctb"))
14 | License: Artistic-2.0
15 | Depends: R (>= 4.1.0)
16 | Imports: stats, httr, xml2, jsonlite, utils, rlang, readr,
17 |         GenomicRanges, IRanges, dplyr, rappdirs, tibble, tidyr
18 | Suggests: BiocStyle, knitr, rmarkdown, DT, testthat, listviewer,
19 |         ggplot2, GenomicAlignments, Rsamtools, BiocParallel,
20 |         TxDb.Hsapiens.UCSC.hg38.knownGene,
21 |         VariantAnnotation, maftools, R.utils, data.table
22 | biocViews: DataImport, Sequencing
23 | URL: https://bioconductor.org/packages/GenomicDataCommons,
24 |     http://github.com/Bioconductor/GenomicDataCommons,
25 |     http://bioconductor.github.io/GenomicDataCommons/
26 | BugReports:
27 |         https://github.com/Bioconductor/GenomicDataCommons/issues/new
28 | Encoding: UTF-8
29 | VignetteBuilder: knitr
30 | RoxygenNote: 7.3.2
31 | 


--------------------------------------------------------------------------------
/man/field_description.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/fields.R
 3 | \name{field_description}
 4 | \alias{field_description}
 5 | \alias{field_description.GDCQuery}
 6 | \alias{field_description.character}
 7 | \title{S3 Generic that returns the field description text, if available}
 8 | \usage{
 9 | field_description(entity, field)
10 | 
11 | \method{field_description}{GDCQuery}(entity, field)
12 | 
13 | \method{field_description}{character}(entity, field)
14 | }
15 | \arguments{
16 | \item{entity}{character(1) string ('cases','files','projects',
17 | 'annotations', etc.) or a subclass of \code{\link{GDCQuery}}.}
18 | 
19 | \item{field}{character(1), the name of the field that will be used to look
20 | up the description.}
21 | }
22 | \value{
23 | character(1) descriptive text or character(0) if no description
24 | is available.
25 | }
26 | \description{
27 | S3 Generic that returns the field description text, if available
28 | }
29 | \section{Methods (by class)}{
30 | \itemize{
31 | \item \code{field_description(GDCQuery)}: GDCQuery method
32 | 
33 | \item \code{field_description(character)}: character method
34 | 
35 | }}
36 | \examples{
37 | field_description('cases', 'annotations.category')
38 | casesQuery = query('cases')
39 | field_description(casesQuery, 'annotations.category')
40 | field_description(cases(), 'annotations.category')
41 | 
42 | }
43 | 


--------------------------------------------------------------------------------
/man/GenomicDataCommons-package.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/GenomicDataCommons-package.R
 3 | \docType{package}
 4 | \name{GenomicDataCommons-package}
 5 | \alias{GenomicDataCommons}
 6 | \alias{GenomicDataCommons-package}
 7 | \title{GenomicDataCommons: A package for interfacing with the NCI GDC}
 8 | \description{
 9 | Programmatically access the NIH / NCI Genomic Data Commons RESTful service.
10 | }
11 | \section{finding data}{
12 | 
13 | 
14 | \itemize{
15 | \item{\code{\link{query}}}
16 | \item{\code{\link{cases}}}
17 | \item{\code{\link{projects}}}
18 | \item{\code{\link{files}}}
19 | \item{\code{\link{annotations}}}
20 | \item{\code{\link{mapping}}}
21 | }
22 | }
23 | 
24 | \section{downloading data}{
25 | 
26 | data
27 | }
28 | 
29 | \seealso{
30 | Useful links:
31 | \itemize{
32 |   \item \url{https://bioconductor.org/packages/GenomicDataCommons}
33 |   \item \url{http://github.com/Bioconductor/GenomicDataCommons}
34 |   \item \url{http://bioconductor.github.io/GenomicDataCommons/}
35 |   \item Report bugs at \url{https://github.com/Bioconductor/GenomicDataCommons/issues/new}
36 | }
37 | 
38 | }
39 | \author{
40 | \strong{Maintainer}: Sean Davis \email{seandavi@gmail.com}
41 | 
42 | Authors:
43 | \itemize{
44 |   \item Martin Morgan \email{martin.morgan@roswellpark.org}
45 | }
46 | 
47 | Other contributors:
48 | \itemize{
49 |   \item Marcel Ramos \email{marcel.ramos@sph.cuny.edu} [contributor]
50 | }
51 | 
52 | }
53 | 


--------------------------------------------------------------------------------
/R/facets.R:
--------------------------------------------------------------------------------
 1 | #' Set facets for a \code{\link{GDCQuery}}
 2 | #'
 3 | #' @param x a \code{\link{GDCQuery}} object
 4 | #' @param facets a character vector of fields that
 5 | #' will be used for forming aggregations (facets).
 6 | #' Default is to set facets for all default fields.
 7 | #' See \code{\link{default_fields}} for details
 8 | #'
 9 | #' @return returns a \code{\link{GDCQuery}} object,
10 | #' with facets field updated.
11 | #' 
12 | #' @rdname faceting
13 | #'
14 | #' @examples
15 | #' # create a new GDCQuery against the projects endpoint
16 | #' gProj = projects()
17 | #'
18 | #' # default facets are NULL
19 | #' get_facets(gProj)
20 | #'
21 | #' # set facets and save result
22 | #' gProjFacet = facet(gProj)
23 | #'
24 | #' # check facets
25 | #' get_facets(gProjFacet)
26 | #' 
27 | #' # and get a response, noting that
28 | #' # the aggregations list member contains
29 | #' # tibbles for each facet
30 | #' str(response(gProjFacet,size=2),max.level=2)
31 | #' 
32 | #' @export
33 | facet <- function(x, facets) {
34 |     UseMethod("facet")  # S3 dispatch on the class of `x`
35 | }
36 | 
37 | 
38 | #' @export
39 | facet.GDCQuery <- function(x, facets = default_fields(x)) {
40 |     x$facets <- facets  # record the requested facet fields on the query
41 |     x                   # hand back the updated query object
42 | }
43 | 
44 | #' Get facets for a \code{\link{GDCQuery}}
45 | #'
46 | #' @rdname faceting
47 | #' 
48 | #' @export
49 | get_facets <- function(x) {
50 |     UseMethod("get_facets")  # S3 dispatch on the class of `x`
51 | }
52 | 
53 | #' @rdname faceting
54 | #'
55 | #' @export
56 | get_facets.GDCQuery <- function(x) {
57 |     x$facets  # NULL when no facets have been set on the query
58 | }
59 | 


--------------------------------------------------------------------------------
/R/readHTSeqFile.R:
--------------------------------------------------------------------------------
 1 | #' Read a single htseq-counts result file.
 2 | #'
 3 | #' The htseq package is used extensively to count reads
 4 | #' relative to regions (see
 5 | #' \url{http://www-huber.embl.de/HTSeq/doc/counting.html}).
 6 | #' The output of htseq-count is a simple two-column table
 7 | #' that includes features in column 1 and counts in column 2.
 8 | #' This function simply reads in the data from one such file
 9 | #' and assigns column names.
10 | #'
11 | #' @param fname character(1), the path of the htseq-count file.
12 | #' @param samplename character(1), the name of the sample. This will
13 | #'     become the name of the second column on the resulting
14 | #'     \code{data.frame}, making for easier merging if necessary.
15 | #' @param ... passed to \code{\link[readr]{read_tsv}}
16 | #' @return a two-column data frame. An error is raised if \code{fname} is not
17 | #'     character(1), does not exist, or does not contain exactly two columns.
18 | #' @examples
19 | #' fname = system.file(package='GenomicDataCommons',
20 | #'                     'extdata/example.htseq.counts.gz')
21 | #' dat = readHTSeqFile(fname)
22 | #' head(dat)
23 | #'
24 | #' @export
25 | readHTSeqFile <- function(fname, samplename = 'sample', ...) {
26 |     if(!(is.character(fname) && length(fname) == 1))  # validate type before touching the file system
27 |         stop('fname must be of type character(1)')
28 |     if(!file.exists(fname))
29 |         stop(sprintf('The specified file, %s, does not exist',fname))
30 |     tmp = read_tsv(fname, col_names = FALSE, ...)  # forward `...` as documented
31 |     if(ncol(tmp) != 2)
32 |         stop(sprintf('%s had %d columns, expected 2 columns',fname, ncol(tmp)))
33 |     colnames(tmp) = c('feature',samplename)
34 |     tmp
35 | }
36 | 
37 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
 1 | ## Changes in version 1.32.0
 2 | 
 3 | ### Bug fixes and minor improvements
 4 | 
 5 | * Minor updates to unit tests and GitHub Actions
 6 | 
 7 | ## Changes in version 1.30.0
 8 | 
 9 | ### New features
10 | 
11 | * `gdc_clinical` includes clinical data from the
12 | `cases.follow_ups.other_clinical_attributes` entity (@LiNk-NY).
13 | 
14 | ### Bug fixes and minor improvements
15 | 
16 | * Removed legacy function, methods, endpoints, and arguments (@LiNk-NY)
17 | * Use native pipe `|>` instead of `magrittr::%>%` (@LiNk-NY)
18 | 
19 | ## Changes in version 1.28.0
20 | 
21 | ### Bug fixes and minor improvements
22 | 
23 | * Defunct legacy function, methods, endpoints, and arguments (@LiNk-NY)
24 | 
25 | ## Changes in version 1.26.0
26 | 
27 | ### New features
28 | 
29 | * The GDC API has deprecated the legacy endpoint (#110, @LiNk-NY) 
30 | 
31 | ## Changes in version 1.24.0
32 | 
33 | ### Bug fixes and minor improvements
34 | 
35 | * `gdc_clinical` handles `NULL` responses when diagnoses are not available for
36 | all IDs queried (#109, @zx8754).
37 | * Minor updates to somatic mutations vignette and unit tests.
38 | 
39 | ## Changes in version 1.20.0
40 | 
41 | ### New features
42 | 
43 | * `gdcdata` has an ellipsis (`...`) argument to download data from the legacy archive,
44 |   e.g., `legacy = TRUE` (#84, @LiNk-NY)
45 | * `missing` (`is MISSING`) and `!missing` (`NOT MISSING`) operations implemented
46 | for filtering queries, see vignette (#96, @LiNk-NY)
47 | * `gdc-client` version can be validated against last known good version based on
48 | data release (#99, @LiNk-NY)
49 | 
50 | ### Bug fixes and minor improvements
51 | 
52 | * `gdc_clinical` uses `readr::type_convert` to handle columns with inconsistent
53 |   types from the API.
54 | * update examples in documentation and vignette based on new data release
55 | 


--------------------------------------------------------------------------------
/man/manifest.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/manifest.R
 3 | \name{manifest}
 4 | \alias{manifest}
 5 | \alias{manifest.gdc_files}
 6 | \alias{manifest.GDCfilesResponse}
 7 | \alias{manifest.GDCcasesResponse}
 8 | \title{Prepare GDC manifest file for bulk download}
 9 | \usage{
10 | manifest(x, from = 0, size = count(x), ...)
11 | 
12 | \method{manifest}{gdc_files}(x, from = 0, size = count(x), ...)
13 | 
14 | \method{manifest}{GDCfilesResponse}(x, from = 0, size = count(x), ...)
15 | 
16 | \method{manifest}{GDCcasesResponse}(x, from = 0, size = count(x), ...)
17 | }
18 | \arguments{
19 | \item{x}{A \code{\link{GDCQuery}} object of subclass "gdc_files" or "gdc_cases".}
20 | 
21 | \item{from}{Record number from which to start when returning the manifest.}
22 | 
23 | \item{size}{The total number of records to return.  Default 
24 | will return the usually desirable full set of records.}
25 | 
26 | \item{...}{passed to \code{\link[httr]{PUT}}.}
27 | }
28 | \value{
29 | A \code{\link[tibble]{tibble}}, also of type "gdc_manifest", with five columns:
30 | \itemize{
31 | \item{id}
32 | \item{filename}
33 | \item{md5}
34 | \item{size}
35 | \item{state}
36 | }
37 | }
38 | \description{
39 | The \code{manifest} function/method creates a manifest of files to be downloaded
40 | using the GDC Data Transfer Tool. There are methods for
41 | creating manifest data frames from \code{\link{GDCQuery}} objects
42 | that contain file information ("cases" and "files" queries).
43 | }
44 | \section{Methods (by class)}{
45 | \itemize{
46 | \item \code{manifest(gdc_files)}: 
47 | 
48 | \item \code{manifest(GDCfilesResponse)}: 
49 | 
50 | \item \code{manifest(GDCcasesResponse)}: 
51 | 
52 | }}
53 | \examples{
54 | gFiles = files()
55 | shortManifest = gFiles |> manifest(size=10)
56 | head(shortManifest,n=3)
57 | 
58 | 
59 | }
60 | 


--------------------------------------------------------------------------------
/tests/README.md:
--------------------------------------------------------------------------------
 1 | ## Running tests
 2 | 
 3 | ```{r}
 4 | devtools::test()
 5 | ```
 6 | 
 7 | Should also run under `R CMD BiocCheck/check`.
 8 | 
 9 | ## Tests
10 | 
11 | A test file lives in tests/testthat/. Its name must start with test. Here’s an example of a test file from the stringr package:
12 | 
13 | ```{r}
14 | library(stringr)
15 | context("String length")
16 | 
17 | test_that("str_length is number of characters", {
18 |   expect_equal(str_length("a"), 1)
19 |   expect_equal(str_length("ab"), 2)
20 |   expect_equal(str_length("abc"), 3)
21 | })
22 | 
23 | test_that("str_length of factor is length of level", {
24 |   expect_equal(str_length(factor("a")), 1)
25 |   expect_equal(str_length(factor("ab")), 2)
26 |   expect_equal(str_length(factor("abc")), 3)
27 | })
28 | 
29 | test_that("str_length of missing is missing", {
30 |   expect_equal(str_length(NA), NA_integer_)
31 |   expect_equal(str_length(c(NA, 1)), c(NA, 1))
32 |   expect_equal(str_length("NA"), 2)
33 | })
34 | ```
35 | 
36 | Tests are organised hierarchically: expectations are grouped into tests which are organised in files:
37 | 
38 | An expectation is the atom of testing. It describes the expected result of a computation: Does it have the right value and right class? Does it produce error messages when it should? An expectation automates visual checking of results in the console. Expectations are functions that start with expect_.
39 | 
40 | A test groups together multiple expectations to test the output from a simple function, a range of possibilities for a single parameter from a more complicated function, or tightly related functionality from across multiple functions. This is why they are sometimes called unit as they test one unit of functionality. A test is created with test_that() .
41 | 
42 | A file groups together multiple related tests. Files are given a human readable name with context().
43 | 


--------------------------------------------------------------------------------
/man/gdc_token.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/gdc_token.R
 3 | \name{gdc_token}
 4 | \alias{gdc_token}
 5 | \title{return a gdc token from file or environment}
 6 | \usage{
 7 | gdc_token()
 8 | }
 9 | \value{
10 | character(1) (invisibly, to protect against inadvertently printing) the GDC token.
11 | }
12 | \description{
13 | The GDC requires an auth token for downloading
14 | data that are "controlled access". For example, 
15 | BAM files for human datasets, germline variant calls,
16 | and SNP array raw data all are protected as "controlled
17 | access". For these files, a GDC access token is required.
18 | See the \href{https://docs.gdc.cancer.gov/Data_Portal/Users_Guide/Authentication/#gdc-authentication-tokens}{details on the GDC authentication and token information}.
19 | Note that this function simply returns a string value. 
20 | It is possible to keep the GDC token in a variable in R
21 | or to pass a string directly to the appropriate parameter.
22 | This function is simply a convenience function for alternative 
23 | approaches to get a token from an environment variable
24 | or a file.
25 | }
26 | \details{
27 | This function will resolve locations of the GDC token in the 
28 | following order:
29 | \itemize{
30 | \item{from the environment variable, \code{GDC_TOKEN}, expected to 
31 | contain the token downloaded from the GDC as a string}
32 | \item{using \code{readLines} to read a file named in the environment
33 | variable, \code{GDC_TOKEN_FILE}}
34 | \item{using \code{readLines} to read from a file called \code{.gdc_token} in the user's
35 | home directory}
36 | }
37 | If all of these fail, this function will return an error.
38 | }
39 | \examples{
40 | # This will not run before a GDC token
41 | # is in place.  
42 | token = try(gdc_token(),silent=TRUE)
43 | 
44 | 
45 | }
46 | \references{
47 | \url{https://docs.gdc.cancer.gov/Data_Portal/Users_Guide/Cart/#gdc-authentication-tokens}
48 | }
49 | 


--------------------------------------------------------------------------------
/man/gdc_clinical.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/clinical.R
 3 | \name{gdc_clinical}
 4 | \alias{gdc_clinical}
 5 | \title{Get clinical information from GDC}
 6 | \usage{
 7 | gdc_clinical(case_ids, include_list_cols = FALSE)
 8 | }
 9 | \arguments{
10 | \item{case_ids}{a character() vector of case_ids, typically from
11 | "cases" query.}
12 | 
13 | \item{include_list_cols}{logical(1), whether to include list
14 | columns in the "main" data.frame. These list columns have
15 | values for aliquots, samples, etc. While these may be useful
16 | for some situations, they are generally not that useful as
17 | clinical annotations.}
18 | }
19 | \value{
20 | A list of four data.frames:
21 | \enumerate{
22 | \item main, representing basic case identification and metadata
23 |     (update date, etc.)
24 | \item diagnoses
25 | \item exposures
26 | \item demographic
27 | }
28 | }
29 | \description{
30 | The NCI GDC has a complex data model that allows various studies to
31 | supply numerous clinical and demographic data elements. However,
32 | across all projects that enter the GDC, there are
33 | similarities. This function returns four data.frames associated
34 | with case_ids from the GDC.
35 | }
36 | \details{
37 | Note that these data.frames can, in general, have different numbers
38 | of rows (or even no rows at all). If one wishes to combine to
39 | produce a single data.frame, using the approach of left joining to
40 | the "main" data.frame will yield a useful combined data.frame. We
41 | do not do that directly given the potential for 1:many
42 | relationships. It is up to the user to determine what the best
43 | approach is for any given dataset.
44 | }
45 | \examples{
46 | case_ids = cases() |> results(size=10) |> ids()
47 | clinical_data = gdc_clinical(case_ids)
48 | 
49 | # overview of clinical results
50 | class(clinical_data)
51 | names(clinical_data)
52 | sapply(clinical_data, class)
53 | sapply(clinical_data, nrow)
54 | 
55 | # available data
56 | head(clinical_data$main)
57 | head(clinical_data$demographic)
58 | head(clinical_data$diagnoses)
59 | head(clinical_data$exposures)
60 | 
61 | }
62 | 


--------------------------------------------------------------------------------
/man/gdc_cache.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/caching.R
 3 | \name{gdc_cache}
 4 | \alias{gdc_cache}
 5 | \alias{gdc_set_cache}
 6 | \title{Work with gdc cache directory}
 7 | \usage{
 8 | gdc_cache()
 9 | 
10 | gdc_set_cache(
11 |   directory = rappdirs::app_dir(appname = "GenomicDataCommons")$cache(),
12 |   verbose = TRUE,
13 |   create_without_asking = !interactive()
14 | )
15 | }
16 | \arguments{
17 | \item{directory}{character(1) directory path, will be created
18 | recursively if not present.}
19 | 
20 | \item{verbose}{logical(1) whether or not to message the location of
21 | the cache directory after creation.}
22 | 
23 | \item{create_without_asking}{logical(1) specifying whether to allow
24 | the function to create the cache directory without asking the
25 | user first. In an interactive session, if the cache directory
26 | does not exist, the user will be prompted before creation.}
27 | }
28 | \value{
29 | character(1) directory path that serves as
30 | the base directory for GenomicDataCommons downloads.
31 | 
32 | the created directory (invisibly)
33 | }
34 | \description{
35 | The GenomicDataCommons package will cache downloaded
36 | files to minimize network traffic and allow for
37 | offline work. These functions are used to create a cache directory
38 | if one does not exist, set a global option, and query that
39 | option. The cache directory will default to the user "cache"
40 | directory according to specifications in
41 | \code{\link[rappdirs]{app_dir}}. However, the user may want to set
42 | this to another directory with more or higher performance
43 | storage.
44 | }
45 | \details{
46 | The cache structure is currently just a directory with each file
47 |     being represented by a path constructed as:
48 |     CACHEDIR/UUID/FILENAME. The cached files can be manipulated
49 |     using standard file system commands (removing, finding,
50 |     etc.). In this sense, the cache system is minimalist in design.
51 | }
52 | \section{Functions}{
53 | \itemize{
54 | \item \code{gdc_set_cache()}: (Re)set the GenomicDataCommons cache
55 | directory
56 | 
57 | }}
58 | \examples{
59 | gdc_cache()
60 | \dontrun{
61 | gdc_set_cache(getwd())
62 | }
63 | 
64 | }
65 | 


--------------------------------------------------------------------------------
/R/gdc_token.R:
--------------------------------------------------------------------------------
 1 | #' return a gdc token from file or environment
 2 | #'
 3 | #' The GDC requires an auth token for downloading
 4 | #' data that are "controlled access". For example,
 5 | #' BAM files for human datasets, germline variant calls,
 6 | #' and SNP array raw data all are protected as "controlled
 7 | #' access". For these files, a GDC access token is required.
 8 | #' See the \href{https://docs.gdc.cancer.gov/Data_Portal/Users_Guide/Authentication/#gdc-authentication-tokens}{details on the GDC authentication and token information}.
 9 | #' Note that this function simply returns a string value.
10 | #' It is possible to keep the GDC token in a variable in R
11 | #' or to pass a string directly to the appropriate parameter.
12 | #' This function is simply a convenience function for alternative
13 | #' approaches to get a token from an environment variable or a file.
14 | #'
15 | #' @details
16 | #' This function will resolve locations of the GDC token in the
17 | #' following order:
18 | #' \itemize{
19 | #' \item{from the environment variable, \code{GDC_TOKEN}, expected to
20 | #' contain the token downloaded from the GDC as a string}
21 | #' \item{using \code{readLines} to read a file named in the environment
22 | #' variable, \code{GDC_TOKEN_FILE}}
23 | #' \item{using \code{readLines} to read from a file called \code{.gdc_token} in the user's
24 | #' home directory}
25 | #' }
26 | #' If all of these fail, this function will signal an error.
27 | #'
28 | #' @return character(1) (invisibly, to protect against inadvertently printing) the GDC token.
29 | #'
30 | #' @references \url{https://docs.gdc.cancer.gov/Data_Portal/Users_Guide/Cart/#gdc-authentication-tokens}
31 | #'
32 | #' @examples
33 | #' # This will not run before a GDC token is in place.
34 | #' token = try(gdc_token(),silent=TRUE)
35 | #'
36 | #' @export
37 | gdc_token <- function() {
38 |   # 1. token given directly in the environment; invisible, as documented
39 |   token <- Sys.getenv('GDC_TOKEN')
40 |   if (nzchar(token)) return(invisible(token))
41 |   # 2. file named by GDC_TOKEN_FILE, otherwise 3. ~/.gdc_token
42 |   token_file <- "~/.gdc_token"
43 |   if (nzchar(Sys.getenv('GDC_TOKEN_FILE')))
44 |     token_file <- trimws(Sys.getenv('GDC_TOKEN_FILE'))
45 |   if (!file.exists(token_file))
46 |     stop("GDC token not found: set GDC_TOKEN or GDC_TOKEN_FILE, or create ~/.gdc_token", call. = FALSE)
47 |   invisible(suppressWarnings(readLines(token_file, n = 1)))
48 | }
49 | 


--------------------------------------------------------------------------------
/R/mapping.R:
--------------------------------------------------------------------------------
 1 | # (internal) unlist every element of `json`, then tag the list with the
 2 | # "mapping_list"/"gdc_list" S3 classes
 3 | .response_mapping_as_list <- function(json) {
 4 |     structure(lapply(json, unlist), class=c("mapping_list", "gdc_list", "list"))
 5 | }
 6 | 
 7 | #" (internal) fetch and parse the "<endpoint>/_mapping" JSON document
 8 | #' @importFrom httr content
 9 | .get_mapping_json <- function(endpoint) {
10 |     valid <- .gdc_entities
11 |     # accept exactly one entity name, and only a known one
12 |     stopifnot(is.character(endpoint), length(endpoint) == 1L,
13 |               endpoint %in% valid)
14 |     mapping_path <- sprintf("%s/%s", endpoint, "_mapping")
15 |     reply <- .gdc_get(mapping_path)
16 |     content(reply, type="application/json")
17 | }
18 | 
19 | 
20 | #' Query GDC for available endpoint fields
21 | #'
22 | #' @param endpoint character(1) corresponding to endpoints for which
23 | #'     users may specify additional or alternative fields. Endpoints
24 | #'     include \dQuote{projects}, \dQuote{cases}, \dQuote{files}, and
25 | #'     \dQuote{annotations}.
26 | #'
27 | #' @return A data frame with one row per available field. The columns
28 | #'     come from the GDC \code{_mapping} response (currently
29 | #'     \code{field}, \code{description}, \code{doc_type}, \code{full}
30 | #'     and \code{type}), followed by a logical column \code{defaults}
31 | #'     that is \code{TRUE} for fields returned by default and
32 | #'     \code{FALSE} otherwise.
33 | #'
34 | #' @examples
35 | #' map <- mapping("projects")
36 | #' head(map)
37 | #' # get only the "default" fields
38 | #' subset(map,defaults)
39 | #' # And get just the text names of the "default" fields
40 | #' subset(map,defaults)$field
41 | #'
42 | #' @importFrom httr content
43 | #' @export
44 | mapping <- function(endpoint) {
45 |     json <- .get_mapping_json(endpoint)
46 |     # every field name the endpoint advertises
47 |     fields <- data.frame(field=unlist(json[['fields']]))
48 |     mapdat <- json[['_mapping']]
49 |     # one character column per attribute name found in the first mapping entry
50 |     maplist <- list()
51 |     for(cname in names(mapdat[[1]])) {
52 |         maplist[[cname]] <- as.character(sapply(mapdat,'[[',cname))
53 |     }
54 |     df <- data.frame(do.call(cbind,maplist),stringsAsFactors=FALSE)
55 |     # left join: keep advertised fields even if they lack a mapping entry
56 |     df <- merge(fields,df,by='field',all.x=TRUE)
57 |     df$field <- as.character(df$field)
58 |     # flag defaults after the merge so unmatched rows get FALSE, not NA
59 |     df$defaults <- df$field %in% unlist(json[['defaults']])
60 |     df
61 | }
62 | 
63 | 


--------------------------------------------------------------------------------
/.github/workflows/basic_checks.yml:
--------------------------------------------------------------------------------
 1 | name: R CMD check
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - devel
 7 |     paths:
 8 |       - 'DESCRIPTION'
 9 |       - '**basic_checks.yml'
10 |   workflow_dispatch:
11 |   pull_request:
12 |     branches:
13 |       - devel
14 | 
15 | env:
16 |   cache-version: v1
17 | 
18 | jobs:
19 |   r-build-and-check:
20 |     runs-on: ubuntu-latest
21 |     container: bioconductor/bioconductor_docker:devel
22 | 
23 |     env:
24 |       R_REMOTES_NO_ERRORS_FROM_WARNINGS: TRUE
25 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
26 | 
27 |     steps:
28 |       - name: Get Ubuntu Codename and Set CRAN URL
29 |         run: |
30 |           CODENAME=$(lsb_release -cs)
31 |           echo "CRAN=https://packagemanager.posit.co/cran/__linux__/${CODENAME}/latest" >> "$GITHUB_ENV"
32 | 
33 |       - name: Checkout Repository
34 |         uses: actions/checkout@v4
35 | 
36 |       - name: Query dependencies and update old packages
37 |         run: |
38 |           BiocManager::install(ask=FALSE)
39 |           saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
40 |         shell: Rscript {0}
41 | 
42 |       - name: Cache R packages
43 |         if: runner.os != 'Windows'
44 |         uses: actions/cache@v4
45 |         with:
46 |           path: /usr/local/lib/R/site-library
47 |           key: ${{ env.cache-version }}-${{ runner.os }}-r-${{ hashFiles('.github/depends.Rds') }}
48 |           restore-keys: ${{ env.cache-version }}-${{ runner.os }}-r-
49 | 
50 |       - name: Install dependencies
51 |         run: |
52 |           remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories())
53 |           remotes::install_cran(c("rcmdcheck", "covr"))
54 |           BiocManager::install("BiocCheck")
55 |         shell: Rscript {0}
56 | 
57 |       - name: Run rcmdcheck
58 |         env:
59 |           _R_CHECK_CRAN_INCOMING_REMOTE_: false
60 |         run: rcmdcheck::rcmdcheck(args = "--no-manual", error_on = "warning", check_dir = "check")
61 |         shell: Rscript {0}
62 | 
63 |       - name: Run BiocCheck
64 |         env:
65 |           DISPLAY: ':99.0'
66 |         run: |
67 |           BiocCheck::BiocCheck(
68 |               dir('check', 'tar\\.gz$', full.names = TRUE),
69 |               `quit-with-status` = FALSE,
70 |               `no-check-R-ver` = TRUE,
71 |               `no-check-bioc-help` = TRUE
72 |           )
73 |         shell: Rscript {0}
74 | 


--------------------------------------------------------------------------------
/tests/testthat/test_api.R:
--------------------------------------------------------------------------------
 1 | context('API')
 2 | 
 3 | test_that("status returns correctly", {
 4 |     # metadata entries expected from the status endpoint, in order
 5 |     expected_names <- c("commit", "data_release", "data_release_version",
 6 |                         "status", "tag", "version")
 7 |     gdc_status <- status()
 8 |     expect_identical(names(gdc_status), expected_names)
 9 |     # and the reported status value itself must be "OK"
10 |     expect_identical(gdc_status$status, "OK")
11 | })
12 | 
13 | test_that('query', {
14 |     # Each query object is expected to carry, in order, the classes
15 |     # gdc_<entity>, GDCQuery and list in its first three class slots.
16 |     expect_query_classes <- function(entity) {
17 |         query_classes <- class(query(entity))
18 |         expect_equal(query_classes[1], paste0('gdc_', entity))
19 |         expect_equal(query_classes[2], 'GDCQuery')
20 |         expect_equal(query_classes[3], 'list')
21 |     }
22 | 
23 |     # same entities, in the same order, as checked before
24 |     expect_query_classes('cases')
25 |     expect_query_classes('files')
26 |     expect_query_classes('projects')
27 | 
28 |     # annotations endpoint
29 |     expect_query_classes('annotations')
30 | })
31 | 
32 | test_that("cases", {
33 |     idfield = "case_id"  # id field of the cases endpoint
34 |     q = cases()
35 |     resp = response(q)   # must succeed; the value itself is not inspected
36 |     expect_gte(count(q), 1000)
37 |     expect_equal(select(q, idfield)$fields, idfield)
38 |     expect_equal(facet(q, idfield)$facets, idfield)
39 | })
40 | 
41 | test_that("files", {
42 |     idfield = "file_id"  # id field of the files endpoint
43 |     q = files()
44 |     resp = response(q)   # must succeed; the value itself is not inspected
45 |     expect_gte(count(q), 1000)
46 |     expect_equal(select(q, idfield)$fields, idfield)
47 |     expect_equal(facet(q, idfield)$facets, idfield)
48 | })
49 | 
50 | test_that("annotations", {
51 |     idfield = "annotation_id"  # id field of the annotations endpoint
52 |     q = annotations()
53 |     resp = response(q)         # must succeed; the value itself is not inspected
54 |     expect_gte(count(q), 1000)
55 |     expect_equal(select(q, idfield)$fields, idfield)
56 |     expect_equal(facet(q, idfield)$facets, idfield)
57 | })
58 | 
59 | test_that("mapping", {
60 |     res = mapping('files')
61 |     # inherits()-based check: still passes if mapping() returns a data.frame subclass
62 |     expect_s3_class(res, 'data.frame')
63 |     expect_named(res, c('field','description','doc_type','full','type','defaults'))
64 | })
65 | 
66 | test_that("projects", {
67 |     idfield = "project_id"  # id field of the projects endpoint
68 |     q = projects()
69 |     resp = response(q)      # must succeed; the value itself is not inspected
70 |     expect_gte(count(q), 35)
71 |     expect_equal(select(q, idfield)$fields, idfield)
72 |     expect_equal(facet(q, idfield)$facets, idfield)
73 | })
74 | 
75 | 


--------------------------------------------------------------------------------
/R/expand.R:
--------------------------------------------------------------------------------
 1 | #' Return valid values for "expand"
 2 | #'
 3 | #' The GDC allows a shorthand for specifying groups
 4 | #' of fields to be returned by the metadata queries.
 5 | #' These can be specified in a \code{\link{select}}
 6 | #' method call to easily supply groups of fields.
 7 | #'
 8 | #' @param entity Either a \code{\link{GDCQuery}} object
 9 | #' or a character(1) specifying a GDC entity ('cases', 'files',
10 | #' 'annotations', 'projects')
11 | #'
12 | #' @return A character vector
13 | #' 
14 | #' @seealso See \url{https://docs.gdc.cancer.gov/API/Users_Guide/Search_and_Retrieval/#expand}
15 | #' for details
16 | #'
17 | #' @examples
18 | #' head(available_expand('files'))
19 | #'
20 | #' @export
21 | available_expand <- function(entity) {
22 |     UseMethod("available_expand")  # S3 dispatch on the class of `entity`
23 | }
24 | 
25 | #' @rdname available_expand
26 | #' @export
27 | available_expand.character <- function(entity) {
28 |     # flatten the "expand" entry of the entity's _mapping document
29 |     mapping_json <- .get_mapping_json(entity)
30 |     unlist(mapping_json[['expand']])
31 | }
32 | 
33 | #' @rdname available_expand
34 | #' @export
35 | available_expand.GDCQuery <- function(entity) {
36 |     # resolve the query to its entity name and dispatch again on that
37 |     available_expand(entity_name(entity))
38 | }
39 | 
40 | #" (internal) check expand values: error on unknown ones, otherwise TRUE
41 | .gdcCheckExpands <- function(entity,expand) {
42 |     if(is.null(expand)) return(TRUE)
43 |     stopifnot(entity %in% .gdc_entities)
44 |     mismatches = expand[!(expand %in% available_expand(entity))]
45 |     # collapse first: sprintf() is vectorised and would build one message per mismatch
46 |     if(length(mismatches)>0)
47 |         stop(sprintf('expand specified included expands not available in %s including (%s)',entity,paste(mismatches,collapse=', ')))
48 |     TRUE
49 | }
50 | 
51 | #' Set the \code{expand} parameter
52 | #'
53 | #' S3 generic to set GDCQuery expand parameter
54 | #'
55 | #' @param x the object on which to set the \code{expand} parameter
56 | #' @param expand a character vector of values from \code{\link{available_expand}}
57 | #' 
58 | #'
59 | #' @return A \code{\link{GDCQuery}} object, with the \code{expand}
60 | #' member altered.
61 | #' 
62 | #' @examples
63 | #' gProj = projects()
64 | #' gProj$fields
65 | #' head(available_fields(gProj))
66 | #' default_fields(gProj)
67 | #'
68 | #' gProj |>
69 | #'   select(default_fields(gProj)[1:2]) |>
70 | #'   response() |>
71 | #'   str(max_level=2)
72 | #' 
73 | #' @export
74 | expand <- function(x,expand) {
75 |     UseMethod('expand')  # S3 dispatch on the class of `x`
76 | }
77 | 
78 | #' @describeIn expand set expand fields on a GDCQuery object
79 | #' @export
80 | expand.GDCQuery <- function(x,expand) {
81 |     .gdcCheckExpands(entity_name(x),expand)  # stops on unknown expand values
82 |     x$expand <- expand
83 |     x
84 | }
85 | 


--------------------------------------------------------------------------------
/man/gdcdata.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/gdcdata.R
 3 | \name{gdcdata}
 4 | \alias{gdcdata}
 5 | \title{Download GDC files}
 6 | \usage{
 7 | gdcdata(
 8 |   uuids,
 9 |   use_cached = TRUE,
10 |   progress = interactive(),
11 |   token = NULL,
12 |   access_method = "api",
13 |   transfer_args = character(),
14 |   ...
15 | )
16 | }
17 | \arguments{
18 | \item{uuids}{character() of GDC file UUIDs.}
19 | 
20 | \item{use_cached}{logical(1) default TRUE indicating that,
21 | if found in the cache, the file will not be downloaded
22 | again. If FALSE, all supplied uuids will be re-downloaded.}
23 | 
24 | \item{progress}{logical(1) default TRUE in interactive sessions,
25 | FALSE otherwise indicating whether a progress bar should be
26 | produced for each file download.}
27 | 
28 | \item{token}{(optional) character(1) security token allowing access
29 | to restricted data. See
30 | \url{https://gdc-docs.nci.nih.gov/API/Users_Guide/Authentication_and_Authorization/}.}
31 | 
32 | \item{access_method}{character(1), either 'api' or 'client'. See details.}
33 | 
34 | \item{transfer_args}{character(1), additional arguments to pass to
35 | the gdc-client command line. See \code{\link{gdc_client}} and
36 | \code{\link{transfer_help}} for details.}
37 | 
38 | \item{...}{further arguments passed to files}
39 | }
40 | \value{
41 | a named vector with file uuids as the names and paths as
42 | the value
43 | }
44 | \description{
45 | Download one or more files from GDC. Files are downloaded using the
46 | UUID and renamed to the file name on the remote system. By default,
47 | neither the uuid nor the file name on the remote system can exist.
48 | }
49 | \details{
50 | This function is appropriate for one or several files; for large
51 | downloads use \code{\link{manifest}} to create a manifest for use with
52 | the GDC Data Transfer Tool.
53 | 
54 | When access_method is "api", the GDC "data" endpoint is the
55 |     transfer mechanism used. The alternative access_method, "client", will
56 |     utilize the \code{gdc-client} transfer tool, which must be
57 |     downloaded separately and available. See
58 |     \code{\link{gdc_client}} for details on specifying the location
59 |     of the gdc-client executable.
60 | }
61 | \examples{
62 | # get some example file uuids
63 | uuids <- files() |>
64 |     filter(~ access == 'open' & file_size < 100000) |>
65 |     results(size = 3) |>
66 |     ids()
67 | 
68 | # and get the data, placing it into the gdc_cache() directory
69 | gdcdata(uuids, use_cached=TRUE)
70 | 
71 | }
72 | \seealso{
73 | \code{\link{manifest}} for downloading large data.
74 | }
75 | 


--------------------------------------------------------------------------------
/R/ids.R:
--------------------------------------------------------------------------------
 1 | #' Get the ids associated with a GDC query or response
 2 | #'
 3 | #' The GDC assigns ids (in the form of uuids) to objects in its database. Those
 4 | #' ids can be used for relationships, searching on the website, and as
 5 | #' unique ids.
 6 | #'
 7 | #' @param x A \code{\link{GDCQuery}} or \code{\link{GDCResponse}} object
 8 | #'
 9 | #' @return a character vector of all the entity ids
10 | #'
11 | #' @examples
12 | #' # use with a GDC query, in this case for "cases"
13 | #' ids(cases() |> filter(~ project.project_id == "TCGA-CHOL"))
14 | #' # also works for responses
15 | #' ids(response(files()))
16 | #' # and results
17 | #' ids(results(cases()))
18 | #'
19 | #' 
20 | #' @export
21 | ids <- function(x) {
22 |     UseMethod('ids')  # S3 dispatch on the class of `x`
23 | }
24 | 
25 | #' @rdname ids
26 | #' @export
27 | ids.GDCManifest <- function(x) {
28 |     x[['id']]  # the ids are whatever is stored under "id"
29 | }
30 | 
31 | 
32 | #' @rdname ids
33 | #' @export
34 | ids.GDCQuery <- function(x) {
35 |     fieldname <- .id_field(x)
36 |     # restrict the query to the id field, then fetch all results
37 |     only_ids <- GenomicDataCommons::select(x, fieldname)
38 |     .ifNullCharacterZero(results_all(only_ids)[[fieldname]])
39 | }
40 | 
41 | 
42 | #' @rdname ids
43 | #' @export
44 | ids.GDCResults <- function(x) {
45 |     # look the ids up under the "<entity>_id" name computed by .id_field()
46 |     id_values <- x[[.id_field(x)]]
47 |     .ifNullCharacterZero(id_values)
48 | }
49 | 
50 | #' @rdname ids
51 | #' @export
52 | ids.GDCResponse <- function(x) {
53 |     # same "<entity>_id" rule as the other methods, via the shared helper
54 |     fieldname <- .id_field(x$query)
55 |     .ifNullCharacterZero(results(x)[[fieldname]])
56 | }
57 | 
58 | .id_field <- function(x) {
59 |     paste0(sub('s$','',entity_name(x)),"_id")  # e.g. "cases" -> "case_id"
60 | }
61 | 
62 | #' get the name of the id field
63 | #' 
64 | #' In many places in the GenomicDataCommons package,
65 | #' the entity ids are stored in a column or a vector
66 | #' with a specific name that corresponds to the field name 
67 | #' at the GDC. The format is the entity name (singular) "_id".
68 | #' This generic simply returns that name from a given object.
69 | #' 
70 | #' @param x An object representing the query or results 
71 | #'     of an entity from the GDC ("cases", "files", "annotations", "projects")
72 | #' 
73 | #' @return character(1) such as "case_id", "file_id", etc.
74 | #' 
75 | #' @examples 
76 | #' id_field(cases())
77 | #' 
78 | #' @export
79 | id_field <- function(x) {
80 |     UseMethod('id_field')  # S3 dispatch on the class of `x`
81 | }
82 | 
83 | #' @describeIn id_field GDCQuery method
84 | #' @export
85 | id_field.GDCQuery <- function(x) {
86 |     .id_field(x)  # delegate to the shared "<entity>_id" helper
87 | }
88 | 
89 | #' @describeIn id_field GDCResults method
90 | #' @export
91 | id_field.GDCResults <- function(x) {
92 |     .id_field(x)  # delegate to the shared "<entity>_id" helper
93 | }
94 | 
95 | 
96 | 


--------------------------------------------------------------------------------
/man/transfer.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/bulk_transfer.R
 3 | \name{transfer}
 4 | \alias{transfer}
 5 | \alias{gdc_client_version_validate}
 6 | \alias{transfer_help}
 7 | \title{Bulk data download}
 8 | \usage{
 9 | transfer(uuids, args = character(), token = NULL, overwrite = FALSE)
10 | 
11 | gdc_client_version_validate(valid_version = .GDC_COMPATIBLE_VERSION)
12 | 
13 | transfer_help()
14 | }
15 | \arguments{
16 | \item{uuids}{character() vector of GDC file UUIDs}
17 | 
18 | \item{args}{character() vector specifying command-line arguments to
19 | be passed to \code{gdc-client}. See \code{\link{transfer_help}} for
20 | possible values. The arguments \code{--manifest}, \code{--dir},
21 | and \code{--token-file} are determined by \code{manifest},
22 | \code{destination_dir}, and \code{token}, respectively, and
23 | should NOT be provided as elements of \code{args}.}
24 | 
25 | \item{token}{character(1) containing security
26 | token allowing access to restricted data. See
27 | \url{https://gdc-docs.nci.nih.gov/API/Users_Guide/Authentication_and_Authorization/}.
28 | Note that the GDC transfer tool requires a file for data
29 | transfer. Therefore, this token will be written to a temporary
30 | file (with appropriate permissions set).}
31 | 
32 | \item{overwrite}{logical(1) default FALSE indicating whether
33 | existing files with identical name should be over-written.}
34 | 
35 | \item{valid_version}{character(1) The last known version that works for the
36 | current data release for which to validate against, not typically changed
37 | by the end-user.}
38 | }
39 | \value{
40 | character(1) directory path to which the files were
41 |     downloaded.
42 | }
43 | \description{
44 | The GDC maintains a special tool,
45 | \href{https://docs.gdc.cancer.gov/Data_Transfer_Tool/Users_Guide/Getting_Started/}{the GDC Data Transfer Tool},
46 | that enables high-performance, potentially parallel, and
47 | resumable downloads. The Data Transfer Tool is an external
48 | program that requires separate download. Due to recent changes in the
49 | GDC API, the transfer function now validates the version of the `gdc-client`
50 | to ensure reliable downloads.
51 | }
52 | \section{Functions}{
53 | \itemize{
54 | \item \code{gdc_client_version_validate()}: If you are using the 'client' option, your `gdc-client` should be
55 | up-to-date (>= 1.3.0).
56 | 
57 | \item \code{transfer_help()}: 
58 | 
59 | }}
60 | \examples{
61 | \dontrun{
62 | uuids = files() |> 
63 |   filter(access == "open") |> 
64 |   results() |>
65 |   ids()
66 | file_paths <- transfer(uuids)
67 | file_paths
68 | names(file_paths)
69 | # and with authentication
70 | # REQUIRES gdc_token 
71 | # destination <- transfer(uuids,token=gdc_token())
72 | }
73 | 
74 | }
75 | 


--------------------------------------------------------------------------------
/R/constants.R:
--------------------------------------------------------------------------------
 1 | # base URL of the GDC API
 2 | .gdc_base <- "https://api.gdc.cancer.gov"
 3 | 
 4 | # endpoint names handed out by endpoints()
 5 | .gdc_endpoint <- structure(
 6 |     c("status", "projects", "cases", "files", "annotations", "data",
 7 |       "manifest", "slicing"), ##, submission
 8 |     class="gdc_endpoints")
 9 | 
10 | # query parameters with their default values, handed out by parameters()
11 | .gdc_parameters <- structure(
12 |     list(format="JSON", pretty=FALSE, fields=NULL, size=10L, from=0L,
13 |          sort=NULL, filters=NULL, facets=NULL),
14 |     class="gdc_parameters")
15 | 
16 | # parameter names tagged as "flat" (their consumer is not in this file)
17 | .gdc_flat_parameters <- structure(c('fields','facets'), class = "gdc_flat_params")
18 | 
19 | # entity names accepted when validating endpoint/entity arguments
20 | .gdc_entities <- structure(
21 |     c('projects','cases',"files","annotations",
22 |       "ssms", "cnvs", "ssm_occurrences", "cnv_occurrences",
23 |       "genes"),
24 |     class = "gdc_entities")
25 | 
26 | # manifest column names (going by the variable name; usage not shown here)
27 | .gdc_manifest_colnames <- structure(
28 |     c("id", "file_name", "md5sum", "file_size", "state"),
29 |     class = 'gdc_manifest_colnames')
30 | 
31 | 
32 | 
33 | #' Endpoints and Parameters
34 | #'
35 | #' \code{endpoints()} returns available endpoints.
36 | #'
37 | #' @return \code{endpoints()} returns a character vector of possible
38 | #'     endpoints.
39 | #'
40 | #' @rdname constants
41 | #' @examples
42 | #' endpoints()
43 | #' @export
44 | endpoints <- function() .gdc_endpoint
45 | 
46 | #' @export
47 | print.gdc_endpoints <- function(x, ...) {
48 |     .cat0("available endpoints:\n", .wrapstr(x), "\n")
49 |     invisible(x)  # print methods conventionally return their argument
50 | }
51 | #' \code{parameters()} include format (internal use only), pretty
52 | #' (internal use only), fields, size (number of results returned),
53 | #' from (index of first result), sort, filters, and facets. See
54 | #' \url{https://gdc-docs.nci.nih.gov/API/Users_Guide/Search_and_Retrieval/#query-parameters}
55 | #'
56 | #' @return \code{parameters()} returns a list of possible parameters
57 | #'     and their default values.
58 | #' @keywords internal
59 | #'
60 | #' @rdname constants
61 | #' @examples
62 | #' parameters()
63 | #' @export
64 | parameters <- function() {
65 |     .gdc_parameters
66 | }
67 | #' @export
68 | print.gdc_parameters <- function(x, ...) {
69 |     cat("available parameters:\n")
70 |     for (nm in names(x))
71 |         .cat0("    ", nm, ": ",
72 |               if (is.null(x[[nm]])) "NULL" else x[[nm]], "\n")
73 |     invisible(x)  # print methods conventionally return their argument
74 | }
75 | #" (internal) build the "?name=value&..." query string from a parameter list
76 | .parameter_string <- function(parameters) {
77 |     if (is.null(parameters))
78 |         return("")
79 |     stopifnot(is.list(parameters),
80 |               all(names(parameters) %in% names(.gdc_parameters)))
81 |     # overlay the caller's values on the defaults, then drop the NULL entries
82 |     merged <- .gdc_parameters
83 |     merged[names(parameters)] <- parameters
84 |     merged <- Filter(Negate(is.null), merged)
85 |     pairs <- paste(names(merged), unname(merged), sep="=", collapse="&")
86 |     sprintf("?%s", pairs)
87 | }
88 | 
89 | 


--------------------------------------------------------------------------------
/man/slicing.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/slicing.R
 3 | \name{slicing}
 4 | \alias{slicing}
 5 | \title{Query GDC for data slices}
 6 | \usage{
 7 | slicing(
 8 |   uuid,
 9 |   regions,
10 |   symbols,
11 |   destination = file.path(tempdir(), paste0(uuid, ".bam")),
12 |   overwrite = FALSE,
13 |   progress = interactive(),
14 |   token = gdc_token()
15 | )
16 | }
17 | \arguments{
18 | \item{uuid}{character(1) identifying the BAM file resource}
19 | 
20 | \item{regions}{character() vector describing chromosomal regions,
21 | e.g., \code{c("chr1", "chr2:10000", "chr3:10000-20000")} (all
22 | of chromosome 1, chromosome 2 from position 10000 to the end,
23 | chromosome 3 from 10000 to 20000).}
24 | 
25 | \item{symbols}{character() vector of gencode gene symbols, e.g.,
26 | \code{c("BRCA1", "PTEN")}}
27 | 
28 | \item{destination}{character(1) file path for the BAM file slice;
29 | defaults to \code{<uuid>.bam} inside \code{tempdir()}}
30 | 
31 | \item{overwrite}{logical(1) default FALSE can destination be
32 | overwritten?}
33 | 
34 | \item{progress}{logical(1) default \code{interactive()} should a
35 | progress bar be used?}
36 | 
37 | \item{token}{character(1) security token allowing access to
38 | restricted data. Almost all BAM data is restricted, so a token is
39 | usually required. See
40 | \url{https://docs.gdc.cancer.gov/Data/Data_Security/Data_Security/#authentication-tokens}.}
41 | }
42 | \value{
43 | character(1) destination to the downloaded BAM file
44 | }
45 | \description{
46 | This function returns a BAM file representing reads overlapping
47 | regions specified either as chromosomal regions or as gencode gene
48 | symbols.
49 | }
50 | \details{
51 | This function uses the Genomic Data Commons "slicing" API
52 |     to get portions of a BAM file specified either using "regions"
53 |     or using HGNC gene symbols.
54 | }
55 | \examples{
56 | \dontrun{
57 |  slicing("df80679e-c4d3-487b-934c-fcc782e5d46e",
58 |         regions="chr17:75000000-76000000",
59 |         token=gdc_token())
60 | 
61 | # Get 10 BAM files.
62 | bamfiles = files() |> 
63 |            filter(data_format=='BAM') |>
64 |            results(size=10) |> ids()
65 | 
66 | # Current alignments at the GDC are to GRCh38
67 | library('TxDb.Hsapiens.UCSC.hg38.knownGene')
68 | all_genes = genes(TxDb.Hsapiens.UCSC.hg38.knownGene)
69 | 
70 | first3genes = all_genes[1:3]
71 | # remove strand info
72 | strand(first3genes) = '*'
73 | 
74 | # We can get our regions easily now
75 | as.character(first3genes)
76 | 
77 | # Use parallel downloads to speed processing
78 | library(BiocParallel)
79 | register(MulticoreParam())
80 | 
81 | fnames = bplapply(bamfiles, slicing, overwrite = TRUE,
82 |                 regions=as.character(first3genes))
83 | 
84 | # 10 BAM files
85 | fnames
86 | 
87 | library(GenomicAlignments)
88 | lapply(unlist(fnames), readGAlignments)
89 | 
90 | }
91 | }
92 | 


--------------------------------------------------------------------------------
/man/filtering.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/filters.R
  3 | \name{filtering}
  4 | \alias{filtering}
  5 | \alias{filter}
  6 | \alias{filter.GDCQuery}
  7 | \alias{get_filter}
  8 | \alias{get_filter.GDCQuery}
  9 | \title{Manipulating GDCQuery filters}
 10 | \usage{
 11 | filter(x, expr)
 12 | 
 13 | \method{filter}{GDCQuery}(x, expr)
 14 | 
 15 | get_filter(x)
 16 | 
 17 | \method{get_filter}{GDCQuery}(x)
 18 | }
 19 | \arguments{
 20 | \item{x}{the object on which to set the filter list
 21 | member}
 22 | 
 23 | \item{expr}{a filter expression in the form of
 24 | the right hand side of a formula, where bare names
 25 | (without quotes) are allowed if they are available
 26 | fields associated with the GDCQuery object, \code{x}}
 27 | }
 28 | \value{
 29 | A \code{\link{GDCQuery}} object with the filter
 30 | field replaced by specified filter expression
 31 | }
 32 | \description{
 33 | Manipulating GDCQuery filters
 34 | 
 35 | The \code{filter} is simply a safe accessor for
 36 | the filter element in \code{\link{GDCQuery}} objects.
 37 | 
 38 | The \code{get_filter} is simply a safe accessor for
 39 | the filter element in \code{\link{GDCQuery}} objects.
 40 | }
 41 | \examples{
 42 | # make a GDCQuery object to start
 43 | #
 44 | # Projects
 45 | #
 46 | pQuery = projects()
 47 | 
 48 | # check for the default fields
 49 | # so that we can use one of them to build a filter
 50 | default_fields(pQuery)
 51 | pQuery = filter(pQuery,~ project_id == 'TCGA-LUAC')
 52 | get_filter(pQuery)
 53 | 
 54 | #
 55 | # Files
 56 | #
 57 | fQuery = files()
 58 | default_fields(fQuery)
 59 | 
 60 | fQuery = filter(fQuery,~ data_format == 'VCF')
 61 | # OR
 62 | # with recent GenomicDataCommons versions:
 63 | #   no "~" needed
 64 | fQuery = filter(fQuery, data_format == 'VCF')
 65 | 
 66 | get_filter(fQuery)
 67 | 
 68 | fQuery = filter(fQuery,~ data_format == 'VCF'
 69 |                 & experimental_strategy == 'WXS'
 70 |                 & type == 'simple_somatic_mutation')
 71 | 
 72 | files() |> filter(~ data_format == 'VCF'
 73 |                    & experimental_strategy=='WXS'
 74 |                    & type == 'simple_somatic_mutation') |> count()
 75 |                    
 76 |                    
 77 | files() |> filter( data_format == 'VCF'
 78 |                    & experimental_strategy=='WXS'
 79 |                    & type == 'simple_somatic_mutation') |> count()
 80 | 
 81 | # Filters may be chained for the 
 82 | # equivalent query
 83 | # 
 84 | # When chained, filters are combined with logical AND
 85 | 
 86 | files() |>
 87 |   filter(~ data_format == 'VCF') |>
 88 |   filter(~ experimental_strategy == 'WXS') |>
 89 |   filter(~ type == 'simple_somatic_mutation') |>
 90 |   count()
 91 | 
 92 | # OR
 93 | 
 94 | files() |>
 95 |   filter( data_format == 'VCF') |>
 96 |   filter( experimental_strategy == 'WXS') |>
 97 |   filter( type == 'simple_somatic_mutation') |>
 98 |   count()
 99 | 
100 | # Use str() to get a cleaner picture
101 | str(get_filter(fQuery))
102 | }
103 | 


--------------------------------------------------------------------------------
/man/query.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/query.R
  3 | \name{query}
  4 | \alias{query}
  5 | \alias{GDCQuery}
  6 | \alias{cases}
  7 | \alias{files}
  8 | \alias{projects}
  9 | \alias{annotations}
 10 | \alias{ssms}
 11 | \alias{ssm_occurrences}
 12 | \alias{cnvs}
 13 | \alias{cnv_occurrences}
 14 | \alias{genes}
 15 | \title{Start a query of GDC metadata}
 16 | \usage{
 17 | query(
 18 |   entity,
 19 |   filters = NULL,
 20 |   facets = NULL,
 21 |   expand = NULL,
 22 |   fields = default_fields(entity),
 23 |   ...
 24 | )
 25 | 
 26 | cases(...)
 27 | 
 28 | files(...)
 29 | 
 30 | projects(...)
 31 | 
 32 | annotations(...)
 33 | 
 34 | ssms(...)
 35 | 
 36 | ssm_occurrences(...)
 37 | 
 38 | cnvs(...)
 39 | 
 40 | cnv_occurrences(...)
 41 | 
 42 | genes(...)
 43 | }
 44 | \arguments{
 45 | \item{entity}{character vector, including one of the entities in .gdc_entities}
 46 | 
 47 | \item{filters}{a filter list, typically created using \code{\link{make_filter}}, or added
 48 | to an existing \code{GDCQuery} object using \code{\link{filter}}.}
 49 | 
 50 | \item{facets}{a character vector of facets for counting common values. 
 51 | See \code{\link{available_fields}}. In general, one will not specify this parameter
 52 | but will use \code{\link{facet}} instead.}
 53 | 
 54 | \item{expand}{a character vector of "expands" to include in returned data. See 
 55 | \code{\link{available_expand}}}
 56 | 
 57 | \item{fields}{a character vector of fields to return. See \code{\link{available_fields}}.
 58 | In general, one will not specify fields directly, but instead use \code{\link{select}}}
 59 | 
 60 | \item{...}{passed through to \code{\link{query}}}
 61 | }
 62 | \value{
 63 | An S3 object, the GDCQuery object. This is a list
 64 | with the following members.
 65 | \itemize{
 66 | \item{filters}
 67 | \item{facets}
 68 | \item{fields}
 69 | \item{expand}
 70 | \item{archive}
 71 | \item{token}
 72 | }
 73 | }
 74 | \description{
 75 | The basis for all functionality in this package
 76 | starts with constructing a query in R. The GDCQuery
 77 | object contains the filters, facets, and other
 78 | parameters that define the returned results. A token
 79 | is required for accessing certain datasets.
 80 | }
 81 | \section{Functions}{
 82 | \itemize{
 83 | \item \code{cases()}: convenience constructor for a GDCQuery for cases
 84 | 
 85 | \item \code{files()}: convenience constructor for a GDCQuery for files
 86 | 
 87 | \item \code{projects()}: convenience constructor for a GDCQuery for projects
 88 | 
 89 | \item \code{annotations()}: convenience constructor for a GDCQuery for annotations
 90 | 
 91 | \item \code{ssms()}: convenience constructor for a GDCQuery for ssms
 92 | 
 93 | \item \code{ssm_occurrences()}: convenience constructor for a GDCQuery for ssm_occurrences
 94 | 
 95 | \item \code{cnvs()}: convenience constructor for a GDCQuery for cnvs
 96 | 
 97 | \item \code{cnv_occurrences()}: convenience constructor for a GDCQuery for cnv_occurrences
 98 | 
 99 | \item \code{genes()}: convenience constructor for a GDCQuery for genes
100 | 
101 | }}
102 | \examples{
103 | qcases = query('cases')
104 | # equivalent to:
105 | qcases = cases()
106 | 
107 | }
108 | 


--------------------------------------------------------------------------------
/vignettes/questions-and-answers.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Questions and answers from over the years"
  3 | author: "Sean Davis"
  4 | date: "`r format(Sys.Date(), '%A, %B %d, %Y')`"
  5 | always_allow_html: yes
  6 | output:
  7 |   BiocStyle::html_document:
  8 |     df_print: paged
  9 |     toc_float: true
 10 |     keep_md: true
 11 | abstract: >
 12 | 
 13 | vignette: >
 14 |   %\VignetteIndexEntry{Questions and answers from over the years}
 15 |   %\VignetteEngine{knitr::rmarkdown}
 16 |   %\VignetteEncoding{UTF-8}
 17 | ---
 18 | 
 19 | # How could I generate a manifest file with filtering of Race and Ethnicity?
 20 | 
 21 | From https://support.bioconductor.org/p/9138939/.
 22 | 
 23 | ```{r}
 24 | library(GenomicDataCommons,quietly = TRUE)
 25 | ```
 26 | 
 27 | I made a small change to the filtering expression approach based on 
 28 | changes to lazy evaluation best practices. There is now no need to 
 29 | include the `~` in the filter expression. So:
 30 | 
 31 | ```{r}
 32 | q = files() |>
 33 |   GenomicDataCommons::filter(
 34 |     cases.project.project_id == 'TCGA-COAD' &
 35 |       data_type == 'Aligned Reads' &
 36 |       experimental_strategy == 'RNA-Seq' &
 37 |       data_format == 'BAM')
 38 | ```
 39 | And get a count of the results:
 40 | 
 41 | ```{r}
 42 | count(q)
 43 | ```
 44 | 
 45 | And the manifest.
 46 | 
 47 | ```{r}
 48 | manifest(q)
 49 | ```
 50 | 
 51 | Your question about race and ethnicity is a good one. 
 52 | 
 53 | ```{r}
 54 | all_fields = available_fields(files())
 55 | ```
 56 | 
 57 | And we can grep for `race` or `ethnic` to get potential matching fields
 58 | to look at.
 59 | 
 60 | ```{r}
 61 | grep('race|ethnic',all_fields,value=TRUE)
 62 | ```
 63 | 
 64 | Now, we can check available values for each field to determine how to complete
 65 | our filter expressions.
 66 | 
 67 | ```{r}
 68 | available_values('files',"cases.demographic.ethnicity")
 69 | available_values('files',"cases.demographic.race")
 70 | ```
 71 | 
 72 | We can complete our filter expression now to limit to `white` race only.
 73 | 
 74 | ```{r}
 75 | q_white_only = q |>
 76 |   GenomicDataCommons::filter(cases.demographic.race=='white')
 77 | count(q_white_only)
 78 | manifest(q_white_only)
 79 | ```
 80 | 
 81 | # How can I get the number of cases with RNA-Seq data added by date to TCGA project with `GenomicDataCommons`?
 82 | 
 83 | - From https://support.bioconductor.org/p/9135791/
 84 | 
 85 | I would like to get the number of cases added (created, any logical datetime would suffice here) to the TCGA project by experiment type. I attempted to get this data via the GenomicDataCommons package, but I believe it is giving me the number of files for a given experiment type rather than the number of cases. How can I get the number of cases for which there is RNA-Seq data?
 86 | 
 87 | ```{r}
 88 | library(tibble)
 89 | library(dplyr)
 90 | library(GenomicDataCommons)
 91 | 
 92 | cases() |> 
 93 |   GenomicDataCommons::filter(
 94 |     ~ project.program.name=='TCGA' & files.experimental_strategy=='RNA-Seq'
 95 |   ) |> 
 96 |   facet(c("files.created_datetime")) |> 
 97 |   aggregations() |> 
 98 |   unname() |>
 99 |   unlist(recursive = FALSE) |> 
100 |   as_tibble() |>
101 |   dplyr::arrange(dplyr::desc(key))
102 | ```
103 | 


--------------------------------------------------------------------------------
/vignettes/somatic_mutations.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Working with simple somatic mutations"
  3 | author: "Sean Davis"
  4 | date: "`r format(Sys.Date(), '%A, %B %d, %Y')`"
  5 | always_allow_html: yes
  6 | output:
  7 |   BiocStyle::html_document:
  8 |     df_print: paged
  9 |     toc_float: true
 10 | abstract: >
 11 | 
 12 | vignette: >
 13 |   %\VignetteIndexEntry{Somatic Mutation Data}
 14 |   %\VignetteEngine{knitr::rmarkdown}
 15 |   %\VignetteEncoding{UTF-8}
 16 | ---
 17 | 
 18 | # Background
 19 | 
 20 | 
 21 | 
 22 | # Workflow
 23 | 
 24 | ```{r warning=FALSE,message=FALSE}
 25 | library(GenomicDataCommons)
 26 | library(tibble)
 27 | ```
 28 | 
 29 | ## Genes and gene details
 30 | 
 31 | ```{r}
 32 | grep_fields('genes', 'symbol')
 33 | ```
 34 | ```{r}
 35 | head(available_values('genes','symbol'))
 36 | ```
 37 | 
 38 | 
 39 | 
 40 | ```{r}
 41 | tp53 = genes() |> 
 42 |   GenomicDataCommons::filter(symbol=='TP53') |> 
 43 |   results(size=10000) |> 
 44 |   as_tibble()
 45 | ```
 46 | 
 47 | 
 48 | ## ssms
 49 | 
 50 | ```{r}
 51 | ssms() |> 
 52 |     GenomicDataCommons::filter(
 53 |       chromosome==paste0('chr',tp53$gene_chromosome[1]) &
 54 |         start_position > tp53$gene_start[1] & 
 55 |         end_position < tp53$gene_end[1]) |> 
 56 |     GenomicDataCommons::count()
 57 | ```
 58 | 
 59 | ```{r}
 60 | ssms() |> 
 61 |     GenomicDataCommons::filter(
 62 |       consequence.transcript.gene.symbol %in% c('TP53')) |> 
 63 |     GenomicDataCommons::count()
 64 | ```
 65 | 
 66 | ## convert to VRanges
 67 | 
 68 | ```{r warning=FALSE,message=FALSE}
 69 | library(VariantAnnotation)
 70 | vars = ssms() |> 
 71 |     GenomicDataCommons::filter(
 72 |       consequence.transcript.gene.symbol %in% c('TP53')) |> 
 73 |     GenomicDataCommons::results_all() |>
 74 |     as_tibble()
 75 | ```
 76 | 
 77 | ```{r}
 78 | vr = VRanges(seqnames = vars$chromosome,
 79 |              ranges = IRanges(start=vars$start_position, width=1),
 80 |              ref = vars$reference_allele,
 81 |              alt = vars$tumor_allele)
 82 | ```
 83 | 
 84 | ```{r}
 85 | ssm_occurrences() |> 
 86 |     GenomicDataCommons::filter(
 87 |       ssm.consequence.transcript.gene.symbol %in% c('TP53')) |>
 88 |     GenomicDataCommons::count()
 89 | ```
 90 | 
 91 | ```{r}
 92 | var_samples = ssm_occurrences() |> 
 93 |     GenomicDataCommons::filter(
 94 |       ssm.consequence.transcript.gene.symbol %in% c('TP53')) |> 
 95 |     GenomicDataCommons::expand(c('case', 'ssm', 'case.project')) |>
 96 |     GenomicDataCommons::results_all() |> 
 97 |     as_tibble()
 98 | ```
 99 | 
100 | ```{r}
101 | table(var_samples$case$disease_type)
102 | ```
103 | 
104 | ## OncoPrint
105 | 
106 | ```{r}
107 | fnames <- files() |>
108 |   GenomicDataCommons::filter(
109 |     cases.project.project_id=='TCGA-SKCM' &
110 |       data_format=='maf' &
111 |       data_type=='Masked Somatic Mutation' &
112 |       analysis.workflow_type ==
113 |         'Aliquot Ensemble Somatic Variant Merging and Masking'
114 |   ) |>
115 |   results(size = 1) |>
116 |     ids() |>
117 |       gdcdata()
118 | ```
119 | 
120 | ```{r cache=TRUE}
121 | library(maftools)
122 | melanoma = read.maf(maf = fnames)
123 | ```
124 | 
125 | ```{r}
126 | maftools::oncoplot(melanoma)
127 | ```
128 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
  1 | # Generated by roxygen2: do not edit by hand
  2 | 
  3 | S3method(aggregations,GDCQuery)
  4 | S3method(aggregations,GDCResponse)
  5 | S3method(available_expand,GDCQuery)
  6 | S3method(available_expand,character)
  7 | S3method(available_fields,GDCQuery)
  8 | S3method(available_fields,character)
  9 | S3method(count,GDCQuery)
 10 | S3method(count,GDCResponse)
 11 | S3method(default_fields,GDCQuery)
 12 | S3method(default_fields,character)
 13 | S3method(entity_name,GDCQuery)
 14 | S3method(entity_name,GDCResults)
 15 | S3method(expand,GDCQuery)
 16 | S3method(facet,GDCQuery)
 17 | S3method(field_description,GDCQuery)
 18 | S3method(field_description,character)
 19 | S3method(filter,GDCQuery)
 20 | S3method(get_facets,GDCQuery)
 21 | S3method(get_filter,GDCQuery)
 22 | S3method(id_field,GDCQuery)
 23 | S3method(id_field,GDCResults)
 24 | S3method(ids,GDCManifest)
 25 | S3method(ids,GDCQuery)
 26 | S3method(ids,GDCResponse)
 27 | S3method(ids,GDCResults)
 28 | S3method(manifest,GDCcasesResponse)
 29 | S3method(manifest,GDCfilesResponse)
 30 | S3method(manifest,gdc_files)
 31 | S3method(print,gdc_endpoints)
 32 | S3method(print,gdc_parameters)
 33 | S3method(response,GDCQuery)
 34 | S3method(results,GDCQuery)
 35 | S3method(results,GDCResponse)
 36 | S3method(results_all,GDCQuery)
 37 | S3method(results_all,GDCResponse)
 38 | S3method(select,GDCQuery)
 39 | export(aggregations)
 40 | export(annotations)
 41 | export(available_expand)
 42 | export(available_fields)
 43 | export(available_values)
 44 | export(cases)
 45 | export(cnv_occurrences)
 46 | export(cnvs)
 47 | export(count)
 48 | export(default_fields)
 49 | export(endpoints)
 50 | export(entity_name)
 51 | export(expand)
 52 | export(facet)
 53 | export(field_description)
 54 | export(files)
 55 | export(filter)
 56 | export(gdc_cache)
 57 | export(gdc_client)
 58 | export(gdc_client_version_validate)
 59 | export(gdc_clinical)
 60 | export(gdc_set_cache)
 61 | export(gdc_token)
 62 | export(gdcdata)
 63 | export(genes)
 64 | export(get_facets)
 65 | export(get_filter)
 66 | export(grep_fields)
 67 | export(id_field)
 68 | export(ids)
 69 | export(make_filter)
 70 | export(manifest)
 71 | export(mapping)
 72 | export(parameters)
 73 | export(projects)
 74 | export(query)
 75 | export(readDNAcopy)
 76 | export(readHTSeqFile)
 77 | export(response)
 78 | export(response_all)
 79 | export(results)
 80 | export(results_all)
 81 | export(select)
 82 | export(slicing)
 83 | export(ssm_occurrences)
 84 | export(ssms)
 85 | export(status)
 86 | export(transfer)
 87 | export(transfer_help)
 88 | export(write_manifest)
 89 | import(GenomicRanges)
 90 | importFrom(IRanges,IRanges)
 91 | importFrom(dplyr,bind_rows)
 92 | importFrom(httr,GET)
 93 | importFrom(httr,POST)
 94 | importFrom(httr,add_headers)
 95 | importFrom(httr,content)
 96 | importFrom(httr,headers)
 97 | importFrom(httr,progress)
 98 | importFrom(httr,stop_for_status)
 99 | importFrom(httr,write_disk)
100 | importFrom(jsonlite,fromJSON)
101 | importFrom(jsonlite,toJSON)
102 | importFrom(jsonlite,unbox)
103 | importFrom(rappdirs,app_dir)
104 | importFrom(readr,read_tsv)
105 | importFrom(rlang,enquo)
106 | importFrom(rlang,eval_tidy)
107 | importFrom(rlang,f_env)
108 | importFrom(rlang,f_rhs)
109 | importFrom(rlang,is_formula)
110 | importFrom(stats,setNames)
111 | importFrom(tibble,as_tibble)
112 | importFrom(utils,menu)
113 | importFrom(utils,read.table)
114 | importFrom(utils,write.table)
115 | importFrom(xml2,xml_find_all)
116 | importFrom(xml2,xml_text)
117 | 


--------------------------------------------------------------------------------
/R/query.R:
--------------------------------------------------------------------------------
 1 | #' Start a query of GDC metadata
 2 | #'
 3 | #' The basis for all functionality in this package
 4 | #' starts with constructing a query in R. The GDCQuery
 5 | #' object contains the filters, facets, and other
 6 | #' parameters that define the returned results. A token
 7 | #' is required for accessing certain datasets.
 8 | #'
 9 | #' @aliases GDCQuery
10 | #' 
11 | #' @param entity character vector, including one of the entities in .gdc_entities
12 | #' @param filters a filter list, typically created using \code{\link{make_filter}}, or added
13 | #'     to an existing \code{GDCQuery} object using \code{\link{filter}}.
14 | #' @param facets a character vector of facets for counting common values. 
15 | #'     See \code{\link{available_fields}}. In general, one will not specify this parameter
16 | #'     but will use \code{\link{facet}} instead.
17 | #' @param fields a character vector of fields to return. See \code{\link{available_fields}}.
18 | #'     In general, one will not specify fields directly, but instead use \code{\link{select}}
19 | #' @param expand a character vector of "expands" to include in returned data. See 
20 | #'     \code{\link{available_expand}}
21 | #' 
22 | #' @return An S3 object, the GDCQuery object. This is a list
23 | #' with the following members.
24 | #' \itemize{
25 | #' \item{fields}
26 | #' \item{filters}
27 | #' \item{facets}
28 | #' \item{expand}
29 | #' }
30 | #' Its class vector also includes an entity-specific class,
31 | #' e.g. \code{gdc_files} for a "files" query.
32 | #'
33 | #' @examples
34 | #' qcases = query('cases')
35 | #' # equivalent to:
36 | #' qcases = cases()
37 | #' 
38 | #' @export
39 | query <- function(entity,
40 |                   filters = NULL,
41 |                   facets = NULL,
42 |                   expand = NULL,
43 |                   fields = default_fields(entity),
44 |                   ...)
45 | {
46 |     # Reject entity names that are not registered in .gdc_entities.
47 |     stopifnot(entity %in% .gdc_entities)
48 |     # Note: `...` is accepted but not used when building the query.
49 |     members <- list(fields  = fields,
50 |                     filters = filters,
51 |                     facets  = facets,
52 |                     expand  = expand)
53 |     # Prepend the entity-specific class ahead of the generic GDCQuery.
54 |     class(members) <- c(paste0('gdc_', entity), 'GDCQuery', 'list')
55 |     members
56 | }
57 | 
58 | 
59 | #' @describeIn query convenience constructor for a GDCQuery for cases
60 | #'
61 | #' @param ... passed through to \code{\link{query}}
62 | #' 
63 | #' @export
64 | cases <- function(...) query('cases', ...)
65 | 
66 | #' @describeIn query convenience constructor for a GDCQuery for files
67 | #' @export
68 | files <- function(...) query('files', ...)
69 | 
70 | #' @describeIn query convenience constructor for a GDCQuery for projects
71 | #' @export
72 | projects <- function(...) query('projects', ...)
73 | 
74 | #' @describeIn query convenience constructor for a GDCQuery for annotations
75 | #' @export
76 | annotations <- function(...) query('annotations', ...)
77 | 
78 | #' @describeIn query convenience constructor for a GDCQuery for ssms
79 | #' @export
80 | ssms <- function(...) query("ssms", ...)
81 | 
82 | #' @describeIn query convenience constructor for a GDCQuery for ssm_occurrences
83 | #' @export
84 | ssm_occurrences <- function(...) query("ssm_occurrences", ...)
85 | 
86 | #' @describeIn query convenience constructor for a GDCQuery for cnvs
87 | #' @export
88 | cnvs <- function(...) query("cnvs", ...)
89 | 
90 | #' @describeIn query convenience constructor for a GDCQuery for cnv_occurrences
91 | #' @export
92 | cnv_occurrences <- function(...) query("cnv_occurrences", ...)
93 | 
94 | #' @describeIn query convenience constructor for a GDCQuery for genes
95 | #' @export
96 | genes <- function(...) query("genes", ...)
97 | 


--------------------------------------------------------------------------------
/R/caching.R:
--------------------------------------------------------------------------------
 1 | #' Work with gdc cache directory
 2 | #'
 3 | #' The GenomicDataCommons package will cache downloaded
 4 | #' files to minimize network traffic and allow for
 5 | #' offline work. These functions are used to create a cache directory
 6 | #' if one does not exist, set a global option, and query that
 7 | #' option. The cache directory will default to the user "cache"
 8 | #' directory according to specifications in
 9 | #' \code{\link[rappdirs]{app_dir}}. However, the user may want to set
10 | #' this to another directory with more or higher performance
11 | #' storage. 
12 | #'
13 | #' @return character(1) directory path that serves as
14 | #' the base directory for GenomicDataCommons downloads.
15 | #'
16 | #' @details
17 | #' The cache structure is currently just a directory with each file
18 | #'     being represented by a path constructed as:
19 | #'     CACHEDIR/UUID/FILENAME. The cached files can be manipulated
20 | #'     using standard file system commands (removing, finding,
21 | #'     etc.). In this sense, the cache system is minimalist in design.
22 | #' 
23 | #' @examples
24 | #' gdc_cache()
25 | #' \dontrun{
26 | #' gdc_set_cache(getwd())
27 | #' }
28 | #' 
29 | #' @export
30 | gdc_cache <- function()
31 | {
32 |     # Fall back to gdc_set_cache() when the 'gdc_cache' option is unset.
33 |     cache_dir <- getOption('gdc_cache', gdc_set_cache(verbose = FALSE))
34 |     # The option may name a directory that does not exist (yet).
35 |     if (!dir.exists(cache_dir)) gdc_set_cache(cache_dir)
36 |     cache_dir
37 | }
38 | 
39 | #' @describeIn gdc_cache (Re)set the GenomicDataCommons cache
40 | #'     directory
41 | #' 
42 | #' @importFrom utils menu
43 | #' @importFrom rappdirs app_dir
44 | #'
45 | #'
46 | #' @param create_without_asking logical(1) specifying whether to allow
47 | #'     the function to create the cache directory without asking the
48 | #'     user first. In an interactive session, if the cache directory
49 | #'     does not exist, the user will be prompted before creation.
50 | #'
51 | #' @param verbose logical(1) whether or not to message the location of
52 | #'     the cache directory after creation.
53 | #'
54 | #' @param directory character(1) directory path, will be created
55 | #'     recursively if not present.
56 | #'
57 | #'
58 | #' @return the created directory (invisibly)
59 | #' 
60 | #' @export
61 | gdc_set_cache <- function(directory = rappdirs::app_dir(appname =
62 |                                                             "GenomicDataCommons")$cache(),
63 |                           verbose = TRUE,
64 |                           create_without_asking = !interactive())
65 | {
66 |     # Validate up front. `&&` keeps the test scalar and short-circuits
67 |     # (the elementwise `&` is meant for vector masks), and an NA path is
68 |     # rejected here rather than failing later inside dir.create().
69 |     is_path <- is.character(directory) && length(directory) == 1L &&
70 |         !is.na(directory)
71 |     if (!is_path) {
72 |         stop("directory should be a character(1)")
73 |     }
74 | 
75 |     # An existing directory is simply adopted; otherwise create it.
76 |     if (!dir.exists(directory)) {
77 |         if (!create_without_asking) {
78 |             # Ask for explicit agreement before touching the file
79 |             # system; any answer other than "Yes" aborts.
80 |             answer <- menu(c("Yes", "No"),
81 |                            title=sprintf("Would you like to create a GDC Cache directory at %s", directory))
82 |             if (answer != 1) {
83 |                 stop("GDC Cache directory cannot be created without user agreement")
84 |             }
85 |         }
86 |         # Best effort, as before: creation problems are not reported.
87 |         dir.create(directory, recursive = TRUE, showWarnings = FALSE)
88 |     }
89 | 
90 |     # Record the location in the 'gdc_cache' option, which gdc_cache()
91 |     # reads back.
92 |     options('gdc_cache' = directory)
93 |     if (verbose) {
94 |         message("GDC Cache directory set to: ", directory)
95 |     }
96 |     # Returned invisibly so the call can be used for its side effect.
97 |     invisible(directory)
98 | }
99 | 


--------------------------------------------------------------------------------
/R/slicing.R:
--------------------------------------------------------------------------------
  1 | #' Query GDC for data slices
  2 | #'
  3 | #' This function returns a BAM file representing reads overlapping
  4 | #' regions specified either as chromosomal regions or as gencode gene
  5 | #' symbols.
  6 | #'
  7 | #' @param uuid character(1) identifying the BAM file resource
  8 | #'
  9 | #' @param regions character() vector describing chromosomal regions,
 10 | #'     e.g., \code{c("chr1", "chr2:10000", "chr3:10000-20000")} (all
 11 | #'     of chromosome 1, chromosome 2 from position 10000 to the end,
 12 | #'     chromosome 3 from 10000 to 20000).
 13 | #'
 14 | #' @param symbols character() vector of gencode gene symbols, e.g.,
 15 | #'     \code{c("BRCA1", "PTEN")}
 16 | #'
 17 | #' @param destination character(1) file path for the BAM file slice; defaults
 18 | #'     to \code{file.path(tempdir(), paste0(uuid, '.bam'))}
 19 | #'
 20 | #' @param overwrite logical(1) default FALSE can destination be
 21 | #'     overwritten?
 22 | #'
 23 | #' @param progress logical(1) default \code{interactive()} should a
 24 | #'     progress bar be used?
 25 | #'
 26 | #' @param token character(1) security token allowing access to
 27 | #'     restricted data. Almost all BAM data is restricted, so a token is
 28 | #'     usually required. See
 29 | #'     \url{https://docs.gdc.cancer.gov/Data/Data_Security/Data_Security/#authentication-tokens}.
 30 | #'
 31 | #' @details This function uses the Genomic Data Commons "slicing" API
 32 | #'     to get portions of a BAM file specified either using "regions"
 33 | #'     or using HGNC gene symbols. 
 34 | #' 
 35 | #' @return character(1) destination to the downloaded BAM file
 36 | #'
 37 | #' @importFrom httr progress
 38 | #' @importFrom jsonlite toJSON
 39 | #' 
 40 | #' @examples
 41 | #' \dontrun{
 42 | #'  slicing("df80679e-c4d3-487b-934c-fcc782e5d46e",
 43 | #'         regions="chr17:75000000-76000000",
 44 | #'         token=gdc_token())
 45 | #' 
 46 | #' # Get 10 BAM files.
 47 | #' bamfiles = files() |> 
 48 | #'            filter(data_format=='BAM') |>
 49 | #'            results(size=10) |> ids()
 50 | #' 
 51 | #' # Current alignments at the GDC are to GRCh38
 52 | #' library('TxDb.Hsapiens.UCSC.hg38.knownGene')
 53 | #' all_genes = genes(TxDb.Hsapiens.UCSC.hg38.knownGene)
 54 | #' 
 55 | #' first3genes = all_genes[1:3]
 56 | #' # remove strand info
 57 | #' strand(first3genes) = '*'
 58 | #' 
 59 | #' # We can get our regions easily now
 60 | #' as.character(first3genes)
 61 | #' 
 62 | #' # Use parallel downloads to speed processing
 63 | #' library(BiocParallel)
 64 | #' register(MulticoreParam())
 65 | #' 
 66 | #' fnames = bplapply(bamfiles, slicing, overwrite = TRUE,
 67 | #'                 regions=as.character(first3genes))
 68 | #' 
 69 | #' # 10 BAM files
 70 | #' fnames
 71 | #' 
 72 | #' library(GenomicAlignments)
 73 | #' lapply(unlist(fnames), readGAlignments)
 74 | #' 
 75 | #' }
 76 | #' @export
 77 | slicing <- function(uuid, regions, symbols,
 78 |                     destination=file.path(tempdir(), paste0(uuid, '.bam')),
 79 |                     overwrite=FALSE, progress=interactive(), token=gdc_token())
 80 | {
 81 |     stopifnot(is.character(uuid), length(uuid) == 1L)
 82 |     # Exactly one of `regions` / `symbols` has to be supplied.
 83 |     stopifnot(missing(regions) || missing(symbols),
 84 |               !(missing(regions) && missing(symbols)))
 85 |     # An existing destination is acceptable only when overwriting.
 86 |     stopifnot(is.character(destination), length(destination) == 1L,
 87 |               (overwrite && file.exists(destination)) || !file.exists(destination))
 88 | 
 89 |     # Request payload: gene symbols go under "gencode", wrapped in I();
 90 |     # regions are sent as given (FIXME: validate regions).
 91 |     payload <- if (missing(symbols)) list(regions=regions) else list(gencode=I(symbols))
 92 | 
 93 |     # write_disk() directs the response body to `destination`.
 94 |     .gdc_post(endpoint=sprintf("slicing/view/%s", uuid),
 95 |               add_headers('Content-type'='application/json'),
 96 |               write_disk(destination, overwrite),
 97 |               if (progress) progress() else NULL,
 98 |               body=toJSON(payload), token=token)
 99 |     # Emit a newline after the progress output.
100 |     if (progress) cat("\n")
101 |     destination
102 | }
103 | 


--------------------------------------------------------------------------------
/R/manifest.R:
--------------------------------------------------------------------------------
  1 | #' Prepare GDC manifest file for bulk download
  2 | #'
  3 | #' The \code{manifest} function/method creates a manifest of files to be downloaded
  4 | #' using the GDC Data Transfer Tool. There are methods for
  5 | #' creating manifest data frames from \code{\link{GDCQuery}} objects
  6 | #' that contain file information ("cases" and "files" queries).
  7 | #' 
  8 | #' @param x An \code{\link{GDCQuery}} object of subclass "gdc_files" or "gdc_cases".
  9 | #'
 10 | #' @param size The total number of records to return.  Default 
 11 | #' will return the usually desirable full set of records.
 12 | #'
 13 | #' @param from Record number from which to start when returning the manifest.
 14 | #'
 15 | #' @param ... passed on to the internal \code{.gdc_post} request helper.
 16 | #'
 17 | #' @return A \code{\link[tibble]{tibble}}, also of class "GDCManifest", with five columns:
 18 | #' \itemize{
 19 | #' \item{id}
 20 | #' \item{filename}
 21 | #' \item{md5}
 22 | #' \item{size}
 23 | #' \item{state}
 24 | #'}
 25 | #' 
 26 | #' @examples
 27 | #' gFiles = files()
 28 | #' shortManifest = gFiles |> manifest(size=10)
 29 | #' head(shortManifest,n=3)
 30 | #'
 31 | #' 
 32 | #' @export
 33 | manifest <- function(x, from = 0, size = count(x), ...) {
 34 |     UseMethod("manifest")  # dispatches on the class of `x`
 35 | }
 36 | 
 37 | #' @describeIn manifest method for "files" queries (class \code{gdc_files})
 38 | #'
 39 | #' @export
 40 | manifest.gdc_files <- function(x, from = 0, size = count(x), ...) {
 41 |     .manifestCall(x, from, size, ...)
 42 | }
 43 | 
 44 | #' @describeIn manifest method for a files response; uses its \code{query} element
 45 | #'
 46 | #' @export
 47 | manifest.GDCfilesResponse <- function(x, from = 0, size = count(x), ...) {
 48 |     .manifestCall(x$query, from, size, ...)
 49 | }
 50 | 
 51 | #' @describeIn manifest method for a cases response; re-dispatches on its \code{query} element
 52 | #'
 53 | #' @export
 54 | manifest.GDCcasesResponse <- function(x, from = 0, size = count(x), ...) {
 55 |     manifest(x$query, from, size, ...)
 56 | }
 57 | 
 58 | 
 59 | 
 60 | #' @importFrom readr read_tsv 
 61 | .manifestCall <- function(x,from=0,size=count(x),...) {
 62 |     # Request body: the non-NULL query members, minus facets, with the
 63 |     # entity's default fields and the requested paging window.
 64 |     body <- Filter(function(z) !is.null(z), x)
 65 |     body[['facets']] <- NULL
 66 |     body[['fields']] <- paste0(default_fields(x), collapse=",")
 67 |     body[['from']] <- from
 68 |     body[['size']] <- size
 69 |     # return_type = 'manifest' is not requested for now.
 70 |     txt <- httr::content(
 71 |         .gdc_post(entity_name(x), body=body, token=NULL, ...),
 72 |         as = "text", encoding = "UTF-8")
 73 |     hits <- jsonlite::fromJSON(txt)[["data"]][["hits"]]
 74 |     if ("acl" %in% names(hits))
 75 |         hits <- tidyr::unnest_wider(data = hits, col = "acl", names_sep = "_")
 76 |     # No hits parse to an empty list, for which ncol() is NULL; test for a
 77 |     # data.frame first so that case yields an empty manifest, not an error.
 78 |     if (!is.data.frame(hits) || ncol(hits) < 5) hits <- data.frame()
 79 |     class(hits) <- c('GDCManifest', class(hits))
 80 |     hits
 81 | }
 82 | 
 83 | #' write a manifest data.frame to disk
 84 | #' 
 85 | #' The \code{\link{manifest}} method creates a data.frame
 86 | #' that represents the data for a manifest file needed
 87 | #' by the GDC Data Transfer Tool. While the file format
 88 | #' is nothing special, this is a simple helper function
 89 | #' to write a manifest data.frame to disk. It returns
 90 | #' the path to which the file is written, so it can
 91 | #' be used "in-line" in a call to \code{\link{transfer}}.
 92 | #' 
 93 | #' @param manifest A data.frame holding the GDC manifest columns, typically
 94 | #'     created by a call to \code{\link{manifest}}
 95 | #' 
 96 | #' @param destfile The filename for saving the manifest.
 97 | #'          
 98 | #' @return character(1) the destination file name.
 99 | #'
100 | #' @importFrom utils write.table
101 | #' 
102 | #' @examples
103 | #' mf = files() |> manifest(size=10)
104 | #' write_manifest(mf)
105 | #' 
106 | #' @export
107 | write_manifest <- function(manifest,destfile=tempfile()) {
108 |     # Refuse input lacking the required manifest columns.
109 |     stopifnot(
110 |         all(.gdc_manifest_colnames %in% colnames(manifest)),
111 |         ncol(manifest) > 5
112 |     )
113 |     write.table(manifest, destfile, quote=FALSE, sep="\t", row.names=FALSE, col.names=TRUE)
114 |     destfile  # returned so the call can be nested in transfer()
115 | }
116 | 
117 | 


--------------------------------------------------------------------------------
/R/clinical.R:
--------------------------------------------------------------------------------
  1 | #' Get clinical information from GDC
  2 | #'
  3 | #' The NCI GDC has a complex data model that allows various studies to
  4 | #' supply numerous clinical and demographic data elements. However,
  5 | #' across all projects that enter the GDC, there are
  6 | #' similarities. This function returns four data.frames associated
  7 | #' with case_ids from the GDC.
  8 | #'
  9 | #' @param case_ids a character() vector of case_ids, typically from
 10 | #'     "cases" query.
 11 | #'
 12 | #' @param include_list_cols logical(1), whether to include list
 13 | #'     columns in the "main" data.frame. These list columns have
 14 | #'     values for aliquots, samples, etc. While these may be useful
 15 | #'     for some situations, they are generally not that useful as
 16 | #'     clinical annotations.
 17 | #' 
 18 | #' @importFrom jsonlite fromJSON
 19 | #' @importFrom dplyr bind_rows
 20 | #' @importFrom tibble as_tibble
 21 | #'
 22 | #' @details
 23 | #' Note that these data.frames can, in general, have different numbers
 24 | #' of rows (or even no rows at all). If one wishes to combine to
 25 | #' produce a single data.frame, using the approach of left joining to
 26 | #' the "main" data.frame will yield a useful combined data.frame. We
 27 | #' do not do that directly given the potential for 1:many
 28 | #' relationships. It is up to the user to determine what the best
 29 | #' approach is for any given dataset.
 30 | #'
 31 | #'
 32 | #' @return
 33 | #' A list of four data.frames:
 34 | #' \enumerate{
 35 | #' \item main, representing basic case identification and metadata
 36 | #'     (update date, etc.)
 37 | #' \item diagnoses
 38 | #' \item esposures
 39 | #' \item demographic
 40 | #' }
 41 | #'
 42 | #'
 43 | #' @examples
 44 | #' case_ids = cases() |> results(size=10) |> ids()
 45 | #' clinical_data = gdc_clinical(case_ids)
 46 | #'
 47 | #' # overview of clinical results
 48 | #' class(clinical_data)
 49 | #' names(clinical_data)
 50 | #' sapply(clinical_data, class)
 51 | #' sapply(clinical_data, nrow)
 52 | #'
 53 | #' # available data
 54 | #' head(clinical_data$main)
 55 | #' head(clinical_data$demographic)
 56 | #' head(clinical_data$diagnoses)
 57 | #' head(clinical_data$exposures)
 58 | #' 
 59 | #' @export
 60 | gdc_clinical = function(case_ids, include_list_cols = FALSE) {
 61 |     stopifnot(is.character(case_ids))
 62 |     stopifnot(is.logical(include_list_cols) & length(include_list_cols)==1)
 63 |     resp = cases() |>
 64 |         filter( ~ case_id %in% case_ids) |>
 65 |         expand(c("diagnoses",
 66 |                  "demographic",
 67 |                  "exposures",
 68 |                  "follow_ups.other_clinical_attributes")) |>
 69 |         response_all(response_handler = function(x) jsonlite::fromJSON(x, simplifyDataFrame = TRUE))
 70 |     demographic = resp$results$demographic
 71 |     demographic$case_id = rownames(demographic)
 72 | 
 73 |     nodx <- vapply(resp$results$diagnoses, is.null, logical(1L))
 74 |     if (any(nodx))
 75 |         resp$results$diagnoses[nodx] <- list(data.frame())
 76 |     
 77 |     diagnoses <- suppressMessages({
 78 |         bind_rows(
 79 |             lapply(resp$results$diagnoses, readr::type_convert),
 80 |             .id = "case_id"
 81 |         )
 82 |     })
 83 | 
 84 |     exposures = bind_rows(resp$results$exposures, .id = "case_id")
 85 |     follow_ups = bind_rows(resp$results$follow_ups, .id = "case_id")
 86 | 
 87 |     # set up main table by removing data.frame columns
 88 |     cnames = setdiff(colnames(resp$results), c('exposures', 'follow_ups', 'diagnoses', 'demographic'))
 89 |     main = resp$results[, cnames]
 90 | 
 91 |     if(!include_list_cols) {
 92 |         non_list_cols = names(Filter(function(cname) cname!='list', sapply(main, class)))
 93 |         main = main[, non_list_cols]
 94 |     }
 95 |     
 96 |     y = list(demographic = as_tibble(demographic),
 97 |              diagnoses = as_tibble(diagnoses),
 98 |              exposures = as_tibble(exposures),
 99 |              follow_ups = as_tibble(follow_ups),
100 |              main = as_tibble(main))
101 |     class(y) = c('GDCClinicalList', class(y))
102 |     return(y)
103 | }
104 | 


--------------------------------------------------------------------------------
/R/REST.R:
--------------------------------------------------------------------------------
  1 | #" (internal) Extract header field element from httr response
  2 | #' @importFrom httr headers
  3 | .gdc_header_elt <- function(response, field, element) {
  4 |     value <- headers(response)[[field]]
  5 |     if (is.null(value))
  6 |         stop("response header does not contain field '", field, "'")
  7 | 
  8 |     value <- strsplit(strsplit(value, "; *")[[1L]], "= *")
  9 |     key <- vapply(value, `[[`, character(1), 1L)
 10 |     idx <- element == key
 11 |     if (sum(idx) != 1L)
 12 |         stop("response header field '", field,
 13 |              "' does not contain unique element '", element, "'")
 14 | 
 15 |     value[[which(idx)]][[2]]
 16 | }
 17 |     
 18 | #" (internal) Rename a file 'from' to 'to'
 19 | .gdc_file_rename <- function(from, to, overwrite) {
 20 |     if (overwrite && file.exists(to))
 21 |         unlink(to)
 22 | 
 23 |     reason <- NULL
 24 |     status <- withCallingHandlers({
 25 |         file.rename(from, to)
 26 |     }, warning=function(w) {
 27 |         reason <<- conditionMessage(w)
 28 |         invokeRestart("muffleWarning")
 29 |     })
 30 |     unlink(from)
 31 |     if (!status)
 32 |         stop("failed to rename downloaded file:\n",
 33 |              "\n  from: '", from, "'",
 34 |              "\n  to: '", to, "'",
 35 |              "\n  reason:",
 36 |              "\n", .wrapstr(reason))
 37 |     else if (!is.null(reason))
 38 |         warning(reason)        # forward non-fatal file rename warning
 39 | 
 40 |     to
 41 | }
 42 | 
 43 | #" (internal) GET endpoint / uri
 44 | #' @importFrom httr GET add_headers stop_for_status
 45 | .gdc_get <-
 46 |     function(endpoint, parameters=list(), token=NULL, ..., base=.gdc_base)
 47 | {
 48 |     stopifnot(is.character(endpoint), length(endpoint) == 1L)
 49 |     uri <- sprintf("%s/%s", base, endpoint)
 50 |     uri <- sprintf("%s%s", uri, .parameter_string(parameters))
 51 |     if(getOption('gdc.verbose',FALSE)) {
 52 |       message("GET request uri:\n",uri)
 53 |     }
 54 |     response <- GET(uri, add_headers(`X-Auth-Token`=token),
 55 |                     #config = httr::config(ssl_verifypeer = FALSE),
 56 |                     ...)
 57 |     stop_for_status(response)
 58 |     response
 59 | }
 60 | 
 61 | #" (internal) POST endpoint / uri
 62 | #' @importFrom httr POST add_headers write_disk stop_for_status
 63 | .gdc_post <-
 64 |     function(endpoint, body, token=NULL, ..., base=.gdc_base)
 65 | {
 66 |     stopifnot(is.character(endpoint), length(endpoint) == 1L)
 67 |     uri <- sprintf("%s/%s", base, endpoint)
 68 |     if(getOption('gdc.verbose',FALSE)) {
 69 |       message("POST request uri:\n",uri)
 70 |       message("POST body: ",jsonlite::toJSON(body))
 71 |     }
 72 |     if('fields' %in% names(body)) 
 73 |         body[['fields']] = paste0(body[['fields']],collapse=',')
 74 |     response <- POST(
 75 |         uri, add_headers(
 76 |             `X-Auth-Token` = token,
 77 |             Accept = "application/json",
 78 |             `Content-Type` = "application/json"
 79 |         ),
 80 |         ...,
 81 |         #config = httr::config(ssl_verifypeer = FALSE),
 82 |         body=body, encode="json")
 83 |     stop_for_status(response)
 84 | }
 85 | 
 86 | #" (internal) Download one file from GDC, renaming to remote filename
 87 | #' @importFrom httr GET write_disk add_headers stop_for_status
 88 | .gdc_download_one <-
 89 |     function(uri, destination, overwrite, progress, token=NULL, base=.gdc_base)
 90 | {
 91 |     uri = sprintf('%s/%s',base,uri)
 92 |     if(getOption('gdc.verbose',FALSE)) {
 93 |       message("GET request uri:\n",uri)
 94 |     }
 95 | 
 96 |     if(!dir.exists(destination)) {
 97 |         dir.create(destination)
 98 |     }
 99 |     destfile = file.path(destination, '.partial_download')
100 |     
101 |     response <- GET(uri, write_disk(destfile, overwrite = TRUE),
102 |                     if (progress) progress() else NULL,
103 |                     add_headers(`X-Auth-Token`=token))
104 |     stop_for_status(response)
105 |     if (progress) cat("\n")
106 |     
107 |     filename <- .gdc_header_elt(response, "content-disposition", "filename")
108 |     to <- file.path(destination, filename)
109 |     .gdc_file_rename(destfile, to, overwrite)
110 | }
111 | 


--------------------------------------------------------------------------------
/R/gdcdata.R:
--------------------------------------------------------------------------------
  1 | #' Download GDC files
  2 | #'
  3 | #' Download one or more files from GDC. Files are downloaded using the
  4 | #' UUID and renamed to the file name on the remote system. By default,
  5 | #' neither the uuid nor the file name on the remote system can exist.
  6 | #'
  7 | #' This function is appropriate for one or several files; for large
  8 | #' downloads use \code{\link{manifest}} to create a manifest for and
  9 | #' the GDC Data Transfer Tool.
 10 | #'
 11 | #' @param uuids character() of GDC file UUIDs.
 12 | #'
 13 | #' @param use_cached logical(1) default TRUE indicating that,
 14 | #'     if found in the cache, the file will not be downloaded
 15 | #'     again. If FALSE, all supplied uuids will be re-downloaded.
 16 | #'
 17 | #' @param progress logical(1) default TRUE in interactive sessions,
 18 | #'     FALSE otherwise indicating whether a progress par should be
 19 | #'     produced for each file download.
 20 | #'
 21 | #' @param access_method character(1), either 'api' or 'client'. See details.
 22 | #'
 23 | #' @param transfer_args character(1), additional arguments to pass to
 24 | #'     the gdc-client command line. See \code{\link{gdc_client}} and
 25 | #'     \code{\link{transfer_help}} for details.
 26 | #' 
 27 | #' @param token (optional) character(1) security token allowing access
 28 | #'     to restricted data. See
 29 | #'     \url{https://gdc-docs.nci.nih.gov/API/Users_Guide/Authentication_and_Authorization/}.
 30 | #'
 31 | #' @param ... further arguments passed to files
 32 | #'
 33 | #' @seealso \code{\link{manifest}} for downloading large data.
 34 | #'
 35 | #' @return a named vector with file uuids as the names and paths as
 36 | #' the value
 37 | #'
 38 | #' @details When access_method is "api", the GDC "data" endpoint is the
 39 | #'     transfer mechanism used. The alternative access_method, "client", will
 40 | #'     utilize the \code{gdc-client} transfer tool, which must be
 41 | #'     downloaded separately and available. See
 42 | #'     \code{\link{gdc_client}} for details on specifying the location
 43 | #'     of the gdc-client executable.
 44 | #'
 45 | #' 
 46 | #' @examples
 47 | #' # get some example file uuids
 48 | #' uuids <- files() |>
 49 | #'     filter(~ access == 'open' & file_size < 100000) |>
 50 | #'     results(size = 3) |>
 51 | #'     ids()
 52 | #'
 53 | #' # and get the data, placing it into the gdc_cache() directory
 54 | #' gdcdata(uuids, use_cached=TRUE)
 55 | #'
 56 | #' @export
 57 | gdcdata <-
 58 |     function(uuids, use_cached=TRUE,
 59 |              progress=interactive(), token=NULL, access_method='api',
 60 |              transfer_args = character(), ...)
 61 | {
 62 |     stopifnot(is.character(uuids))
 63 | 
 64 |     uuids = trimws(uuids)
 65 |     manifest = files(...) |>
 66 |             GenomicDataCommons::filter( ~ file_id %in% uuids ) |>
 67 |             GenomicDataCommons::manifest()
 68 |     # files from previous downloads should have the following
 69 |     # path and filenames
 70 |     fs = file.path(gdc_cache(), manifest[["id"]], manifest[["file_name"]])
 71 | 
 72 |     # Restrict new manifest to those that we need to download,
 73 |     to_do_manifest = manifest[!file.exists(fs),]
 74 | 
 75 |     # These are the uuids of the cache misses
 76 |     missing_uuids = to_do_manifest[["id"]]
 77 | 
 78 |     # And these are the cache hits
 79 |     names(fs) = manifest[["id"]]
 80 | 
 81 |     # Using API download to fetch missing uuids
 82 |     endpoint <- "data"
 83 |     cache_dir <-  gdc_cache()
 84 | 
 85 |     destinations <- file.path(cache_dir, missing_uuids)
 86 |     if(access_method == 'api') {
 87 |         uris <- sprintf("%s/%s", endpoint, missing_uuids)
 88 |         value <- mapply(.gdc_download_one, uris, destinations,
 89 |                         MoreArgs=list(overwrite=!use_cached, progress=progress,
 90 |                                       token=token),
 91 |                         SIMPLIFY=TRUE, USE.NAMES=FALSE)
 92 |         names(value) <- missing_uuids
 93 |     } else {
 94 |         ## in the future, may want to transition to
 95 |         ## passing the actual manifest, since we
 96 |         ## are going to regenerate it, anyway.
 97 |         value = NULL
 98 |         if(length(missing_uuids)>0) 
 99 |             value = transfer(missing_uuids, token = token, args = transfer_args)
100 |     }
101 | 
102 |     # combine cache hits with cache misses
103 |     #
104 |     # Return vector of file file path, name=uuid
105 |     fs
106 | }
107 | 


--------------------------------------------------------------------------------
/R/fields.R:
--------------------------------------------------------------------------------
  1 | #' S3 Generic to return all GDC fields
  2 | #'
  3 | #' @param x A character(1) string ('cases','files','projects',
  4 | #' 'annotations') or an subclass of \code{\link{GDCQuery}}.
  5 | #' @return a character vector of the default fields
  6 | #'
  7 | #' @examples
  8 | #' available_fields('projects')
  9 | #' projQuery = query('projects')
 10 | #' available_fields(projQuery)
 11 | #' 
 12 | #' @export
 13 | available_fields = function(x) {
 14 |     UseMethod('available_fields',x)
 15 | }
 16 | 
 17 | #' @describeIn available_fields GDCQuery method
 18 | #' @export
 19 | available_fields.GDCQuery = function(x) {
 20 |     return(mapping(entity_name(x))$field)
 21 | }
 22 | 
 23 | #' @describeIn available_fields character method
 24 | #' @export
 25 | available_fields.character = function(x) {
 26 |     stopifnot(length(x)==1,x %in% .gdc_entities)
 27 |     return(mapping(x)$field)
 28 | }
 29 | 
 30 | 
 31 | #' S3 Generic to return default GDC fields
 32 | #'
 33 | #' @param x A character string ('cases','files','projects',
 34 | #' 'annotations') or an subclass of \code{\link{GDCQuery}}.
 35 | #' @return a character vector of the default fields
 36 | #'
 37 | #' @examples
 38 | #' default_fields('projects')
 39 | #' projQuery = query('projects')
 40 | #' default_fields(projQuery)
 41 | #' 
 42 | #' @export
 43 | default_fields = function(x) {
 44 |     UseMethod('default_fields',x)
 45 | }
 46 | 
 47 | #' @describeIn default_fields character method
 48 | #' @export
 49 | default_fields.character = function(x) {
 50 |     defaults=NA # just to avoid no visible binding note
 51 |     stopifnot(length(x)==1,x %in% .gdc_entities)
 52 |     return(subset(mapping(x),defaults)$field)
 53 | }
 54 | 
 55 | #' @describeIn default_fields GDCQuery method
 56 | #' @export
 57 | default_fields.GDCQuery = function(x) {
 58 |     return(default_fields(entity_name(x)))
 59 | }
 60 | 
 61 | #' S3 generic to set GDCQuery fields
 62 | #'
 63 | #' @param x the objects on which to set fields
 64 | #' @param fields a character vector specifying the fields
 65 | #' 
 66 | #'
 67 | #' @return A \code{\link{GDCQuery}} object, with the fields
 68 | #' member altered.
 69 | #' 
 70 | #' @examples
 71 | #' gProj = projects()
 72 | #' gProj$fields
 73 | #' head(available_fields(gProj))
 74 | #' default_fields(gProj)
 75 | #'
 76 | #' gProj |>
 77 | #'   select(default_fields(gProj)[1:2]) |>
 78 | #'   response() |>
 79 | #'   str(max_level=2)
 80 | #' 
 81 | #' @export
 82 | select <- function(x,fields) {
 83 |     UseMethod('select',x)
 84 | }
 85 | 
 86 | 
 87 | 
 88 | #" (internal) rectify specified fields with available fields
 89 | .gdcRectifyFieldsForEntity <- function(entity,fields) {
 90 |     af = available_fields(entity)
 91 |     mismatches = fields[!(fields %in% af)]
 92 |     if(length(mismatches)>0)
 93 |         stop(sprintf('fields specified included fields not available in %s including (%s)',entity,mismatches))
 94 |     fields = union(paste0(sub('s$','',entity),"_id"),fields)
 95 |     return(fields)
 96 | }
 97 | 
 98 | #' @describeIn select set fields on a GDCQuery object
 99 | #' @export
100 | select.GDCQuery <- function(x,fields) {
101 |     x$fields = .gdcRectifyFieldsForEntity(entity_name(x),fields)
102 |     return(x)
103 | }
104 | 
#' Find matching field names
#' 
#' This utility function allows quick text-based search of available
#' fields using \code{\link{grep}}
#' 
#' @param entity one of the available gdc entities ('files','cases',...)
#'     against which to gather available fields for matching
#' 
#' @param pattern A regular expression that will be used
#' in a call to \code{\link{grep}}
#' 
#' @param ... passed on to grep
#' 
#' @param value logical(1) whether to return values as opposed
#' to indices (passed along to grep)
#'
#' @return character() vector of field names matching
#'     \code{pattern} or, when \code{value=FALSE}, the integer
#'     indices of the matches within the available fields
#' 
#' @examples 
#' grep_fields('files','analysis')
#' 
#' @export
grep_fields <- function(entity,pattern,...,value=TRUE) {
  stopifnot(entity %in% .gdc_entities)
  # `value` is forwarded as documented (it used to be fixed to TRUE)
  grep(pattern = pattern,
       x = available_fields(entity),
       value = value, ...)
}
134 | 
#' Find common values for a GDC field
#' 
#' @param entity character(1), a GDC entity ("cases", "files", "annotations", "projects")
#' @param field character(1), a field that is present in the entity record
#'
#' @return character vector of the top 100 (or fewer) most frequent
#'     values for the given field
#' 
#' @examples 
#' available_values('files','cases.project.project_id')[1:5]
#' 
#' @export
available_values <- function(entity,field) {
    stopifnot(entity %in% .gdc_entities)
    # facet the entity query on `field` and read the aggregation keys
    faceted <- facet(query(entity), field)
    buckets <- aggregations(faceted)
    buckets[[field]]$key
}
152 | 
#' S3 Generic that returns the field description text, if available
#'
#' @param entity character(1) string ('cases','files','projects',
#' 'annotations', etc.) or a subclass of \code{\link{GDCQuery}}.
#'
#' @param field character(1), the name of the field that will be used to look
#' up the description.
#' 
#' @return character(1) descriptive text or character(0) if no description
#' is available.
#'
#' @examples
#' field_description('cases', 'annotations.category')
#' casesQuery = query('cases')
#' field_description(casesQuery, 'annotations.category')
#' field_description(cases(), 'annotations.category')
#' 
#' @export
field_description <- function(entity, field) {
    # dispatch on the class of `entity`
    UseMethod("field_description")
}
174 | 
#' @describeIn field_description GDCQuery method
#' @export
field_description.GDCQuery <- function(entity, field) {
    stopifnot(length(field) == 1)
    field_map <- mapping(entity_name(entity))
    # description(s) of the rows whose field name equals `field`
    is_match <- field_map$field == field
    field_map$description[is_match]
}
182 | 
#' @describeIn field_description character method
#' @export
field_description.character <- function(entity, field) {
    # one known entity name and one field name are required
    stopifnot(length(entity) == 1, entity %in% .gdc_entities)
    stopifnot(length(field) == 1)
    field_map <- mapping(entity)
    is_match <- field_map$field == field
    field_map$description[is_match]
}
191 | 


--------------------------------------------------------------------------------
/R/bulk_transfer.R:
--------------------------------------------------------------------------------
  1 | #' Bulk data download
  2 | #'
  3 | #' The GDC maintains a special tool,
  4 | #' \href{the GDC Data Transfer Tool}{https://docs.gdc.cancer.gov/Data_Transfer_Tool/Users_Guide/Getting_Started/},
  5 | #' that enables high-performance, potentially parallel, and
  6 | #' resumable downloads. The Data Transfer Tool is an external
  7 | #' program that requires separate download. Due to recent changes in the
  8 | #' GDC API, the transfer function now validates the version of the `gdc-client`
  9 | #' to ensure reliable downloads.
 10 | #'
 11 | #' @param uuids character() vector of GDC file UUIDs
 12 | #'
 13 | #' @param args character() vector specifying command-line arguments to
 14 | #'     be passed to \code{gdc-client}. See \code{\link{transfer_help}} for
 15 | #'     possible values. The arguments \code{--manifest}, \code{--dir},
 16 | #'     and \code{--token-file} are determined by \code{manifest},
 17 | #'     \code{destination_dir}, and \code{token}, respectively, and
 18 | #'     should NOT be provided as elements of \code{args}.
 19 | #'
 20 | #' @param token character(1) containing security
 21 | #'     token allowing access to restricted data. See
 22 | #'     \url{https://gdc-docs.nci.nih.gov/API/Users_Guide/Authentication_and_Authorization/}.
 23 | #'     Note that the GDC transfer tool requires a file for data
 24 | #'     transfer. Therefore, this token will be written to a temporary
 25 | #'     file (with appropriate permissions set).
 26 | #'
 27 | #' @param overwrite logical(1) default FALSE indicating whether
 28 | #'     existing files with identical name should be over-written.
 29 | #'
 30 | #' @return character(1) directory path to which the files were
 31 | #'     downloaded.
 32 | #'
 33 | #' @examples
 34 | #' \dontrun{
 35 | #' uuids = files() |> 
 36 | #'   filter(access == "open") |> 
 37 | #'   results() |>
 38 | #'   ids()
 39 | #' file_paths <- transfer(uuids)
 40 | #' file_paths
 41 | #' names(file_paths)
 42 | #' # and with authenication
 43 | #' # REQUIRES gdc_token 
 44 | #' # destination <- transfer(uuids,token=gdc_token())
 45 | #' }
 46 | #'
 47 | #' @importFrom utils read.table
 48 | #' @export
 49 | transfer <-
 50 |     function(uuids, args=character(), token=NULL, overwrite=FALSE)
 51 |     {
 52 |         stopifnot(is.character(uuids))
 53 |         destination_dir <- gdc_cache()
 54 | 
 55 |         manifest = files() |>
 56 |             GenomicDataCommons::filter( ~ file_id %in% uuids ) |>
 57 |             GenomicDataCommons::manifest()
 58 |         manifest_file = write_manifest(manifest)
 59 |         
 60 | 
 61 |         dir_arg <- sprintf("--dir %s", destination_dir)
 62 |         manifest_arg <- sprintf("--manifest %s", manifest_file)
 63 |         token_file = tempfile()
 64 |         if (!is.null(token)) {
 65 |             writeLines(token,con=token_file)
 66 |             stopifnot(file.exists(token_file))
 67 |             Sys.chmod(token_file,mode="600")
 68 |             token <- sprintf("--token-file %s", token_file)
 69 |         }
 70 |         gdc_client_version_validate()
 71 |         args <- paste(c("download", dir_arg, manifest_arg, args, token), collapse=" ")
 72 |         system2(gdc_client(), args)
 73 | 
 74 |         if(!is.null(token))
 75 |             unlink(token_file)
 76 | 
 77 |         filepaths <- file.path(gdc_cache(), uuids,
 78 |                                as.character(manifest[[2]]))
 79 |         names(filepaths) = uuids
 80 |         return(filepaths)
 81 |     }
 82 | 
 83 | #' return gdc-client executable path
 84 | #' 
 85 | #' This function is a convenience function to 
 86 | #' find and return the path to the GDC Data Transfer
 87 | #' Tool executable assumed to be named 'gdc-client'. 
 88 | #' The assumption is that the appropriate version of the
 89 | #' GDC Data Transfer Tool is a separate download available
 90 | #' from \href{the GDC website}{https://gdc.cancer.gov/access-data/gdc-data-transfer-tool}
 91 | #' and as a backup from \href{on github}{https://github.com/NCI-GDC/gdc-client}.
 92 | #'
 93 | #' @details
 94 | #' The path is checked in the following order:
 95 | #' \enumerate{
 96 | #' \item an R option("gdc_client")
 97 | #' \item an environment variable GDC_CLIENT
 98 | #' \item from the search PATH
 99 | #' \item in the current working directory
100 | #' }
101 | #'
102 | #' @return character(1) the path to the gdc-client executable.
103 | #'
104 | #' @examples
105 | #' # this cannot run without first
106 | #' # downloading the GDC Data Transfer Tool
107 | #' gdc_client = try(gdc_client(),silent=TRUE)
108 | #' 
109 | #' @export
110 | gdc_client = function() {
111 |     if(!is.null(getOption('gdc_client')))
112 |         if(file.exists(getOption('gdc_client')))
113 |             return(getOption('gdc_client'))
114 |     if(file.exists(Sys.getenv("GDC_CLIENT")))
115 |         return(Sys.getenv("GDC_CLIENT"))
116 |     if(!(Sys.which("gdc-client")==''))
117 |         return(Sys.which("gdc-client"))
118 |     client=dir('.',pattern='^gdc-client$',full.names=TRUE)
119 |     if(length(client)==1)
120 |         if(client=='./gdc-client')
121 |             return(client)
122 |     stop('gdc_client not found. Be sure to install the command \nline GDC client available from the GDC website.')
123 | }
124 | 
# (internal) Version of the located gdc-client executable.
# Runs `<gdc-client> --version`, capturing stdout and stderr, strips a
# leading "v" and converts the result with package_version().
gdc_client_version <- function() {
    reported <- system2(gdc_client(), "--version", stdout = TRUE, stderr = TRUE)
    package_version(sub("^v", "", reported))
}
131 | 
# Oldest gdc-client version accepted by gdc_client_version_validate()
.GDC_COMPATIBLE_VERSION <- "1.3.0"

#' @describeIn transfer
#'
#' If you are using the 'client' option, your `gdc-client` should be
#' up-to-date (>= 1.3.0).
#'
#' @param valid_version character(1) The last known version that works for the
#'     current data release for which to validate against, not typically changed
#'     by the end-user.
#'
#' @export
gdc_client_version_validate <-
    function(valid_version = .GDC_COMPATIBLE_VERSION)
{
    installed <- gdc_client_version()
    required <- package_version(valid_version)
    # an outdated client is an error; otherwise nothing is returned
    if (installed < required) {
        stop("Update the 'gdc_client' to a version >= ", valid_version)
    }
}
151 | 
#' \code{transfer_help()} queries the command line GDC Data
#' Transfer Tool, \code{gdc-client}, for available options to be used
#' in the \code{\link{transfer}} command.
#'
#' @describeIn transfer
#'
#' @export
transfer_help <- function() {
    # runs "<gdc-client> download -h"
    client <- gdc_client()
    system2(client, "download -h")
}
162 | 


--------------------------------------------------------------------------------
/R/filters.R:
--------------------------------------------------------------------------------
  1 | #.unary_op <- function(left) {
  2 | #  force(left)
  3 | #  function(e1) {
  4 | #    force(e1)
  5 | #    list(op=e1,content=c(field=left,value=c(right)))
  6 | #  }
  7 | #}
  8 | 
  9 | #' @importFrom jsonlite unbox
 10 | .binary_op <- function(sep) {
 11 |   force(sep)
 12 |   function(e1, e2) {
 13 |       force(e1)
 14 |       force(e2)
 15 |       list(op=unbox(sep),content=list(field=e1,value=e2))
 16 |   }
 17 | }
 18 | 
 19 | .missing_op <- function(sep) {
 20 |   force(sep)
 21 |   function(e1) {
 22 |       force(e1)
 23 |       list(op=unbox(sep), content = list(field = e1, value = "MISSING"))
 24 |   }
 25 | }
 26 | 
 27 | .negate_op <- function(sep) {
 28 |     force(sep)
 29 |     function(op) {
 30 |       force(op)
 31 |       list(op = unbox(sep),
 32 |         content = list(
 33 |           field = op$content$field,
 34 |           value = op$content$value)
 35 |       )
 36 |     }
 37 | }
 38 | 
 39 | #' @importFrom jsonlite unbox
 40 | .combine_op <- function(sep) {
 41 |   force(sep)
 42 |   function(e1, e2) {
 43 |       force(e1)
 44 |       force(e2)
 45 |     return(list(op=unbox(sep),content=list(e1,e2)))
 46 |   }
 47 | }
 48 | 
 49 | #.f_env = new.env(parent=emptyenv())
 50 | .f_env = list()
 51 | .f_env$`==` = .binary_op('=')
 52 | .f_env$`!=` = .binary_op('!=')
 53 | .f_env$`<` = .binary_op('<')
 54 | .f_env$'>' = .binary_op('>')
 55 | .f_env$'&' = .combine_op('and')
 56 | .f_env$'|' = .combine_op('or')
 57 | .f_env$`<=` = .binary_op('<=')
 58 | .f_env$'>=' = .binary_op('>=')
 59 | .f_env$'%in%' = .binary_op('in')
 60 | .f_env$'%exclude%' = .binary_op('exclude')
 61 | .f_env$`missing` = .missing_op("is")
 62 | .f_env$'!' = .negate_op("NOT")
 63 | 
 64 | 
 65 | #' Create NCI GDC filters for limiting GDC query results
 66 | #'
 67 | #' Searching the NCI GDC allows for complex filtering based
 68 | #' on logical operations and simple comparisons.  This function
 69 | #' facilitates writing such filter expressions in R-like syntax
 70 | #' with R code evaluation.
 71 | #'
 72 | #' If used with available_fields, "bare" fields that are
 73 | #' named in the available_fields character vector can be used
 74 | #' in the filter expression without quotes.
 75 | #'
 76 | #' @param expr a lazy-wrapped expression or a formula RHS equivalent
 77 | #'
 78 | #' @param available_fields a character vector of the
 79 | #' additional names that will be injected into the
 80 | #' filter evaluation environment
 81 | #'
 82 | #' @return a \code{list} that represents an R version
 83 | #' of the JSON that will ultimately be used in an
 84 | #' NCI GDC search or other query.
 85 | #' 
 86 | #' @importFrom rlang eval_tidy f_rhs f_env
 87 | #' 
 88 | #' @export
 89 | make_filter = function(expr,available_fields) {
 90 |     available_fields=as.list(available_fields)
 91 |     names(available_fields)=available_fields
 92 |     filt_env = c(as.list(.f_env),available_fields)
 93 |     if(is_formula(expr)) {
 94 |         return(rlang::eval_tidy(rlang::f_rhs(expr), data=filt_env, env = rlang::f_env(expr)))
 95 |     } else {
 96 |         return(rlang::eval_tidy(expr,data=filt_env))
 97 |     }
 98 | }
 99 | 
100 | 
101 | 
102 | #' Manipulating GDCQuery filters
103 | #'
104 | #' @name filtering
105 | #' 
106 | #' @return A \code{\link{GDCQuery}} object with the filter
107 | #' field replaced by specified filter expression
108 | #' 
109 | #' @examples
110 | #' # make a GDCQuery object to start
111 | #' #
112 | #' # Projects
113 | #' #
114 | #' pQuery = projects()
115 | #'
116 | #' # check for the default fields
117 | #' # so that we can use one of them to build a filter
118 | #' default_fields(pQuery)
119 | #' pQuery = filter(pQuery,~ project_id == 'TCGA-LUAC')
120 | #' get_filter(pQuery)
121 | #'
122 | #' #
123 | #' # Files
124 | #' #
125 | #' fQuery = files()
126 | #' default_fields(fQuery)
127 | #'
128 | #' fQuery = filter(fQuery,~ data_format == 'VCF')
129 | #' # OR
130 | #' # with recent GenomicDataCommons versions:
131 | #' #   no "~" needed
132 | #' fQuery = filter(fQuery, data_format == 'VCF')
133 | #' 
134 | #' get_filter(fQuery)
135 | #'
136 | #' fQuery = filter(fQuery,~ data_format == 'VCF'
137 | #'                 & experimental_strategy == 'WXS'
138 | #'                 & type == 'simple_somatic_mutation')
139 | #'
140 | #' files() |> filter(~ data_format == 'VCF'
141 | #'                    & experimental_strategy=='WXS'
142 | #'                    & type == 'simple_somatic_mutation') |> count()
143 | #'                    
144 | #'                    
145 | #' files() |> filter( data_format == 'VCF'
146 | #'                    & experimental_strategy=='WXS'
147 | #'                    & type == 'simple_somatic_mutation') |> count()
148 | #'
149 | #' # Filters may be chained for the 
150 | #' # equivalent query
151 | #' # 
152 | #' # When chained, filters are combined with logical AND
153 | #' 
154 | #' files() |>
155 | #'   filter(~ data_format == 'VCF') |>
156 | #'   filter(~ experimental_strategy == 'WXS') |>
157 | #'   filter(~ type == 'simple_somatic_mutation') |>
158 | #'   count()
159 | #' 
160 | #' # OR
161 | #' 
162 | #' files() |>
163 | #'   filter( data_format == 'VCF') |>
164 | #'   filter( experimental_strategy == 'WXS') |>
165 | #'   filter( type == 'simple_somatic_mutation') |>
166 | #'   count()
167 | #' 
168 | #' # Use str() to get a cleaner picture
169 | #' str(get_filter(fQuery))
170 | NULL
171 | 
#' The \code{filter} generic sets (or extends) the
#' filter element in \code{\link{GDCQuery}} objects.
#'
#' @param x the object on which to set the filter list
#' member
#' @param expr a filter expression in the form of
#' the right hand side of a formula, where bare names
#' (without quotes) are allowed if they are available
#' fields associated with the GDCQuery object, \code{x}
#' 
#' @rdname filtering
#' 
#' @export
filter <- function(x,expr) {
    # dispatch on the class of `x`
    UseMethod("filter")
}
188 | 
#' @rdname filtering
#'
#' @importFrom rlang enquo is_formula
#'
#' @export
filter.GDCQuery <- function(x, expr) {
    # Capture the expression as written *before* anything evaluates
    # `expr`, so the bare-expression fallback below always sees the
    # user's original code rather than an already-computed value.
    quo <- enquo(expr)

    # First attempt: `expr` is (or evaluates to) a formula. Evaluating a
    # bare expression such as `data_format == 'VCF'` normally fails
    # because it names GDC fields rather than R objects; that failure is
    # caught and handled by the fallback.
    filt <- try({
        if (rlang::is_formula(expr))
            make_filter(expr, available_fields(x))
    }, silent = TRUE)

    # Fallback: treat `expr` as a bare expression. A NULL result means
    # `expr` evaluated without error to something that is not a formula
    # (for example when a same-named variable exists in the caller);
    # previously that case silently produced no filter at all.
    if (is.null(filt) || inherits(filt, "try-error"))
        filt <- make_filter(quo, available_fields(x))

    # Chained filters are combined with logical AND.
    if (is.null(x$filters)) {
        x$filters <- filt
    } else {
        x$filters <- list(op = "and", content = list(x$filters, filt))
    }
    x
}
207 | 
#' \code{get_filter} is the generic used to read back the filter
#' element stored in \code{\link{GDCQuery}} objects.
#'
#' @rdname filtering
#'
#'
#' @export
get_filter <- function(x) {
    # S3 dispatch on the class of `x`.
    UseMethod("get_filter", x)
}
218 | 
#' @rdname filtering
#'
#' @export
get_filter.GDCQuery <- function(x) {
    # The filter lives in the `filters` member; NULL when none is set.
    x$filters
}
225 | 
226 | 
227 | 
228 | 


--------------------------------------------------------------------------------
/inst/script/README.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | output: github_document
  3 | knit: (function(inputFile, encoding) {
  4 |   rmarkdown::render(inputFile, encoding = encoding, output_dir = "../../") })
  5 | ---
  6 | 
  7 | ```{r, include = FALSE}
  8 | knitr::opts_chunk$set(
  9 |   collapse = TRUE,
 10 |   comment = "#>",
 11 |   cache = TRUE,
 12 |   out.width = "100%"
 13 | )
 14 | ```
 15 | 
 16 | ```{r,echo=FALSE,include=FALSE,eval=FALSE}
 17 | rmarkdown::render("inst/script/README.Rmd", output_dir = ".")
 18 | ```
 19 | 
 20 | 
 21 | # GenomicDataCommons
 22 | 
 23 | <!-- badges: start -->
 24 | [![R-CMD-check](https://github.com/Bioconductor/GenomicDataCommons/workflows/R-CMD-check/badge.svg)](https://github.com/Bioconductor/GenomicDataCommons/actions)
 25 | <!-- badges: end -->
 26 | 
 27 | # What is the GDC?
 28 | 
 29 | From the [Genomic Data Commons (GDC) website](https://gdc.nci.nih.gov/about-gdc):
 30 | 
 31 | The National Cancer Institute's (NCI's) Genomic Data Commons (GDC) is
 32 | a data sharing platform that promotes precision medicine in
 33 | oncology. It is not just a database or a tool; it is an expandable
 34 | knowledge network supporting the import and standardization of genomic
 35 | and clinical data from cancer research programs.
 36 | 
 37 | The GDC contains NCI-generated data from some of the largest and most
 38 | comprehensive cancer genomic datasets, including The Cancer Genome
 39 | Atlas (TCGA) and Therapeutically Applicable Research to Generate
 40 | Effective Therapies (TARGET). For the first time, these datasets have
 41 | been harmonized using a common set of bioinformatics pipelines, so
 42 | that the data can be directly compared.
 43 | 
 44 | As a growing knowledge system for cancer, the GDC also enables
 45 | researchers to submit data, and harmonizes these data for import into
 46 | the GDC. As more researchers add clinical and genomic data to the GDC,
 47 | it will become an even more powerful tool for making discoveries about
 48 | the molecular basis of cancer that may lead to better care for
 49 | patients.
 50 | 
 51 | The
 52 | [data model for the GDC is complex](https://gdc.cancer.gov/developers/gdc-data-model/gdc-data-model-components),
 53 | but it is worth a quick overview. The data model is encoded as a
 54 | so-called property graph. Nodes represent entities such as Projects,
 55 | Cases, Diagnoses, Files (various kinds), and Annotations. The
 56 | relationships between these entities are maintained as edges.  Both
 57 | nodes and edges may have Properties that supply instance details.  The
 58 | GDC API exposes these nodes and edges in a somewhat simplified set
 59 | of
 60 | [RESTful](https://en.wikipedia.org/wiki/Representational_state_transfer)
 61 | endpoints.
 62 | 
 63 | # Quickstart
 64 | 
 65 | This software is available at Bioconductor.org and can be downloaded via
 66 | `BiocManager::install`.
 67 | 
 68 | To report bugs or problems, either
 69 | [submit a new issue](https://github.com/Bioconductor/GenomicDataCommons/issues)
 70 | or submit a `bug.report(package='GenomicDataCommons')` from within R (which
 71 | will redirect you to the new issue on GitHub).
 72 | 
 73 | ## Installation
 74 | 
 75 | Installation can be achieved via Bioconductor's `BiocManager` package.
 76 | 
 77 | ```{r,eval=FALSE}
 78 | if (!require("BiocManager"))
 79 |     install.packages("BiocManager")
 80 | 
 81 | BiocManager::install('GenomicDataCommons')
 82 | ```
 83 | 
 84 | ```{r,include=TRUE,results="hide",message=FALSE,warning=FALSE}
 85 | library(GenomicDataCommons)
 86 | ```
 87 | 
 88 | ## Check basic functionality
 89 | 
 90 | ```{r}
 91 | status()
 92 | ```
 93 | 
 94 | ## Find data
 95 | 
 96 | The following code builds a `manifest` that can be used to guide the
 97 | download of raw data. Here, filtering finds gene expression files
 98 | quantified as raw counts using `STAR` from ovarian cancer patients.
 99 | 
100 | ```{r}
101 | ge_manifest <- files() |>
102 |     filter( cases.project.project_id == 'TCGA-OV') |>
103 |     filter( type == 'gene_expression' ) |>
104 |     filter( analysis.workflow_type == 'STAR - Counts') |>
105 |     manifest(size = 5)
106 | ge_manifest
107 | ```
108 | 
109 | ## Download data
110 | 
111 | This code block downloads the `r nrow(ge_manifest)` gene expression files
112 | specified in the query above. Using multiple processes to do the download very
113 | significantly speeds up the transfer in many cases.  The following completes in
114 | about 15 seconds.
115 | 
116 | ```{r,eval=FALSE}
library(BiocParallel)
# Register a multi-process backend; bplapply() uses the registered
# backend by default.
register(MulticoreParam())
destdir <- tempdir()
# bplapply() (rather than lapply()) is what distributes the downloads
# across the registered workers.
fnames <- bplapply(ge_manifest$id, gdcdata)
121 | ```
122 | 
123 | If the download had included controlled-access data, the download above would
124 | have needed to include a `token`.  Details are available in
125 | [the authentication section below](#authentication).
126 | 
127 | ## Metadata queries
128 | 
129 | Here we use a couple of ad-hoc helper functions to handle the output of the
130 | query. See the `inst/script/README.Rmd` file for the source.
131 | 
132 | ```{r,echo=FALSE}
# Drop every column of `df` that contains only missing values.
# Always returns a data.frame, even when a single column remains.
filterAllNA <- function(df) {
    # TRUE for columns holding at least one non-missing value.
    notallna <- vapply(df, function(x) !all(is.na(x)), logical(1L))
    # drop = FALSE: without it a single surviving column would be
    # collapsed to a bare vector, breaking later `[ , 1:4]` style indexing.
    df[, notallna, drop = FALSE]
}
137 | 
# Row-bind a named list of per-record results into one data.frame,
# label the rows with the list names, and drop all-NA columns.
bindrowname <- function(resultList) {
    # Guard: a data.frame means the result is already tabular.
    if (is.data.frame(resultList)) {
        stop("Only run this on the list type of outputs")
    }
    combined <- dplyr::bind_rows(resultList)
    rownames(combined) <- names(resultList)
    filterAllNA(combined)
}
145 | ```
146 | 
147 | First, create a `data.frame` from the clinical data:
148 | 
149 | ```{r}
150 | expands <- c("diagnoses","annotations",
151 |              "demographic","exposures")
152 | clinResults <- cases() |>
153 |     GenomicDataCommons::select(NULL) |>
154 |     GenomicDataCommons::expand(expands) |>
155 |     results(size=6)
156 | demoDF <- filterAllNA(clinResults$demographic)
157 | exposuresDF <- bindrowname(clinResults$exposures)
158 | ```
159 | 
160 | ```{r}
161 | demoDF[, 1:4]
162 | ```
163 | 
164 | ```{r}
165 | exposuresDF[, 1:4]
166 | ```
167 | 
168 | Note that the diagnoses data has multiple lines per patient:
169 | 
170 | ```{r}
171 | diagDF <- bindrowname(clinResults$diagnoses)
172 | diagDF[, 1:4]
173 | ```
174 | 
175 | # Basic design
176 | 
177 | This package design is meant to have some similarities to the "tidyverse"
178 | approach of dplyr. Roughly, the functionality for finding and accessing files
179 | and metadata can be divided into:
180 | 
181 | 1. Simple query constructors based on GDC API endpoints.
182 | 2. A set of verbs that when applied, adjust filtering, field selection, and
183 | faceting (fields for aggregation) and result in a new query object (an
184 | endomorphism)
185 | 3. A set of verbs that take a query and return results from the GDC
186 | 
187 | In addition, there are auxiliary functions for asking the GDC API for
188 | information about available and default fields, slicing BAM files, and
189 | downloading actual data files.  Here is an overview of functionality[^1].
190 | 
191 | 
192 | - Creating a query
193 |     - `projects()`
194 |     - `cases()`
195 |     - `files()`
196 |     - `annotations()`
197 | - Manipulating a query
198 |     - `filter()`
199 |     - `facet()`
200 |     - `select()`
201 | - Introspection on the GDC API fields
202 |     - `mapping()`
203 |     - `available_fields()`
204 |     - `default_fields()`
205 |     - `grep_fields()`
206 |     - `available_values()`
207 |     - `available_expand()`
208 | - Executing an API call to retrieve query results
209 |     - `results()`
210 |     - `count()`
211 |     - `response()`
212 | - Raw data file downloads
213 |     - `gdcdata()`
214 |     - `transfer()`
215 |     - `gdc_client()`
216 | - Summarizing and aggregating field values (faceting)
217 |     - `aggregations()`
218 | - Authentication
219 |     - `gdc_token()`
220 | - BAM file slicing
221 |     - `slicing()`
222 | 
223 | [^1]: See individual function and methods documentation for specific details.


--------------------------------------------------------------------------------
/R/response.R:
--------------------------------------------------------------------------------
  1 | #' Fetch \code{\link{GDCQuery}} metadata from GDC
  2 | #'
  3 | #' @aliases GDCResponse
  4 | #' 
  5 | #' @param x a \code{\link{GDCQuery}} object
  6 | #' @param from integer index from which to start returning data
  7 | #' @param size number of records to return
  8 | #' @param ... passed to httr (good for passing config info, etc.)
  9 | #' @param response_handler a function that processes JSON (as text)
 10 | #' and returns an R object.  Default is \code{\link[jsonlite]{fromJSON}}.
 11 | #' 
 12 | #' @rdname response
 13 | #'
 14 | #' @return A \code{GDCResponse} object which is a list with the following
 15 | #' members:
 16 | #' \itemize{
 17 | #' \item{results}
 18 | #' \item{query}
 19 | #' \item{aggregations}
 20 | #' \item{pages}
 21 | #' }
 22 | #' 
 23 | #' 
 24 | #' @examples
 25 | #'
 26 | #' # basic class stuff
 27 | #' gCases = cases()
 28 | #' resp = response(gCases)
 29 | #' class(resp)
 30 | #' names(resp)
 31 | #'
 32 | #' # And results from query
 33 | #' resp$results[[1]]
 34 | #' 
 35 | #' @export
 36 | response = function(x,...) {
 37 |     UseMethod('response',x)
 38 | }
 39 | 
 40 | #' provide count of records in a \code{\link{GDCQuery}}
 41 | #'
 42 | #' @param x a \code{\link{GDCQuery}} object
 43 | #' @param ... passed to httr (good for passing config info, etc.)
 44 | #'
 45 | #' @return integer(1) representing the count of records that will
 46 | #'  be returned by the current query
 47 | #' 
 48 | #' @examples
 49 | #' # total number of projects
 50 | #' projects() |> count()
 51 | #'
 52 | #' # total number of cases
 53 | #' cases() |> count()
 54 | #' 
 55 | #' @export
 56 | count = function(x,...) {
 57 |     UseMethod('count',x)
 58 | }
 59 | 
 60 | #' @describeIn count
 61 | #'
 62 | #' @export
 63 | count.GDCQuery = function(x,...) {
 64 |     resp = x |> response(size=1)
 65 |     return(resp$pages$total)
 66 | }    
 67 | 
 68 | #' @describeIn count
 69 | #'
 70 | #' @export
 71 | count.GDCResponse = function(x,...) {
 72 |     x$pages$total
 73 | }
 74 | 
 75 | 
 76 | #" (internal) prepare "results" for return
 77 | #"
 78 | #" In particular, this function sets
 79 | #" entity_ids for every element so that
 80 | #" one does not loose track of the relationships
 81 | #" given the nested nature of GDC returns
 82 | .prepareResults <- function(res,idfield) {
 83 |     for(i in names(res)) {
 84 |         if(inherits(res[[i]],'data.frame'))
 85 |             rownames(res[[i]]) = res[[idfield]]
 86 |         else
 87 |             names(res[[i]]) = res[[idfield]]}
 88 |     return(res)
 89 | }
 90 | 
 91 | #' @rdname response
 92 | #' 
 93 | #' @importFrom jsonlite fromJSON
 94 | #' 
 95 | #' @export
 96 | response.GDCQuery = function(x, from = 0, size = 10, ...,
 97 |                              response_handler = jsonlite::fromJSON) {
 98 |     body = Filter(function(z) !is.null(z),x)
 99 |     body[['facets']]=paste0(body[['facets']],collapse=",")
100 |     body[['fields']]=paste0(body[['fields']],collapse=",")
101 |     body[['expand']]=paste0(body[['expand']],collapse=",")
102 |     body[['from']]=from
103 |     body[['size']]=size
104 |     body[['format']]='JSON'
105 |     body[['pretty']]='FALSE'
106 |     tmp = response_handler(httr::content(
107 |       .gdc_post(entity_name(x),body=body, token=NULL,...),
108 |                                          as="text", encoding = "UTF-8"))
109 |     res = tmp$data$hits
110 |     idfield = paste0(sub('s$','',entity_name(x)),'_id')
111 |     ## the following code just sets names on the 
112 |     structure(
113 |         list(results = .prepareResults(res,idfield),
114 |              query   = x,
115 |              pages   = tmp$data$pagination,
116 |              aggregations = lapply(tmp$data$aggregations,function(x) {x$buckets})),
117 |         class = c(paste0('GDC',entity_name(x),'Response'),'GDCResponse','list')
118 |     )
119 | }
120 | 
#' @rdname response
#'
#' @export
response_all <- function(x, ...) {
    # Ask for the total first, then request that many records from the
    # start in a single response.
    n_total <- count(x)
    response(x = x, size = n_total, from = 0, ...)
}
128 | 
129 | 
#' aggregations
#'
#' @param x a \code{\link{GDCQuery}} object
#'
#' @return a \code{list} of \code{data.frame} with one
#' member for each requested facet. The data frames
#' each have two columns, key and doc_count.
#'
#' @examples
#' # Number of each file type
#' res = files() |> facet(c('type','data_type')) |> aggregations()
#' res$type
#'
#' @export
aggregations <- function(x) {
    # S3 dispatch on the class of `x`.
    UseMethod("aggregations", x)
}
147 | 
148 | 
#' @describeIn aggregations fetch a response for the query and return
#'  its aggregations; when the query has no facets set, \code{facet()}
#'  is applied with its defaults first
#'
#'
#' @export
aggregations.GDCQuery <- function(x) {
    query <- if (is.null(x$facets)) facet(x) else x
    response(query)$aggregations
}
158 | 
#' @describeIn aggregations return the aggregations already stored in
#'  a response
#'
#'
#' @export
aggregations.GDCResponse <- function(x) {
    x[["aggregations", exact = FALSE]]
}
166 | 
167 | 
#' results
#'
#' @param x a \code{\link{GDCQuery}} object
#' @param ... passed on to \code{\link{response}}
#'
#' @return A (typically nested) \code{list} of GDC records
#'
#' @examples
#' qcases = cases() |> results()
#' length(qcases)
#'
#' @export
results <- function(x, ...) {
    # S3 dispatch on the class of `x`.
    UseMethod("results", x)
}
183 | 
#' results_all
#'
#' @param x a \code{\link{GDCQuery}} object
#'
#' @return A (typically nested) \code{list} of GDC records
#'
#' @examples
#' # details of all available projects
#' projResults = projects() |> results_all()
#' length(projResults)
#' count(projects())
#'
#'
#' @export
results_all <- function(x) {
    # S3 dispatch on the class of `x`.
    UseMethod("results_all", x)
}
201 | 
202 | 
#' @describeIn results fetch a response for the query, then extract
#'  its results
#'
#'
#' @export
results.GDCQuery <- function(x, ...) {
    resp <- response(x, ...)
    results(resp)
}
210 | 
#' @describeIn results_all fetch a response via \code{response_all},
#'  then extract its results
#'
#'
#' @export
results_all.GDCQuery <- function(x) {
    full_resp <- response_all(x)
    results(full_resp)
}
218 | 
#' @describeIn results extract the results stored in a response; the
#'  returned object's classes are those of the response with
#'  "Response" replaced by "Results"
#'
#'
#' @export
results.GDCResponse <- function(x, ...) {
    result_classes <- sub('Response', 'Results', class(x))
    structure(x$results, class = result_classes)
}
229 | 
#' @describeIn results_all extract the results stored in a response;
#'  the returned object's classes are those of the response with
#'  "Response" replaced by "Results"
#'
#'
#' @export
results_all.GDCResponse <- function(x) {
    result_classes <- sub('Response', 'Results', class(x))
    structure(x$results, class = result_classes)
}
240 | 
241 | 
242 | 
243 | 
# (internal) Re-signal warnings returned by an endpoint as a single R
# warning. `warnings` is coerced element-wise to character; `endpoint`
# is only used to label the message. Always returns NULL.
#' @importFrom xml2 xml_find_all
.response_warnings <- function(warnings, endpoint)
{
    warnings <- vapply(warnings, as.character, character(1))
    # `&&` requires a length-one condition, so the per-message nzchar()
    # check is reduced with any(); otherwise more than one warning makes
    # the condition itself fail on current R versions.
    if (length(warnings) && any(nzchar(warnings)))
        warning("'", endpoint, "' query warnings:\n", .wrapstr(warnings))
    NULL
}
252 | 
# (internal) Turn the hits of a parsed JSON response into a classed,
# named list. Each hit is named by its "<type>_id" value (where <type>
# is `endpoint` minus its last character), that id member is removed
# from the hit, and every remaining member is flattened with unlist().
.response_json_as_list <- function(json, endpoint)
{
    # Entity type: the endpoint name with its final character dropped.
    type <- substr(endpoint, 1, nchar(endpoint) - 1L)
    id_key <- sprintf("%s_id", type)
    list_class <- sprintf("%ss_list", type)

    records <- json[["data"]][["hits"]]
    names(records) <- vapply(records, function(hit) hit[[id_key]],
                             character(1))
    records <- lapply(records, function(hit) {
        hit[[id_key]] <- NULL          # the id now lives in the names
        lapply(hit, unlist)            # collapse field elt 'list'
    })
    class(records) <- c(list_class, "gdc_list", "list")
    records
}
266 | 
# (internal) Build a data.frame with one character column per requested
# field from an XML response. Values are read from the nodes matching
# "/response/data/hits/item/<field>". Fields without any matching node
# are dropped with a warning; NULL is returned (with a warning) when no
# field matched at all; an error is raised when the remaining fields do
# not all have the same number of values.
#' @importFrom stats setNames
#' @importFrom xml2 xml_find_all xml_text
.response_xml_as_data_frame <- function(xml, fields)
{
    xpaths <- setNames(sprintf("/response/data/hits/item/%s", fields), fields)

    columns <- lapply(xpaths, function(xpath, xml) {
        nodes <- xml_find_all(xml, xpath)
        vapply(nodes, xml_text, character(1))
    }, xml=xml)
    # Keep only the fields for which at least one node was found.
    columns <- Filter(length, columns)

    dropped <- fields[!fields %in% names(columns)]
    if (length(dropped))
        warning("fields not available:\n", .wrapstr(dropped))
    if (length(columns)==0) {
      warning("No records found. Check on filter criteria to ensure they do what you expect. ")
      return(NULL)
    }
    # All columns must have the same length. The previous test,
    # `!length(unique(...))`, could never be TRUE for non-empty columns,
    # so ragged columns reached as.data.frame(), which either recycles
    # shorter columns silently or fails with a less helpful message.
    if (length(unique(lengths(columns))) != 1L) {
        lens <- paste(sprintf("%s = %d", names(columns), lengths(columns)),
                      collapse=", ")
        stop("fields are different lengths:\n", .wrapstr(lens))
    }

    as.data.frame(columns, stringsAsFactors=FALSE)
}
294 | 
295 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | # GenomicDataCommons
  3 | 
  4 | <!-- badges: start -->
  5 | 
  6 | [![R-CMD-check](https://github.com/Bioconductor/GenomicDataCommons/workflows/R-CMD-check/badge.svg)](https://github.com/Bioconductor/GenomicDataCommons/actions)
  7 | <!-- badges: end -->
  8 | 
  9 | # What is the GDC?
 10 | 
 11 | From the [Genomic Data Commons (GDC)
 12 | website](https://gdc.nci.nih.gov/about-gdc):
 13 | 
 14 | The National Cancer Institute’s (NCI’s) Genomic Data Commons (GDC) is a
 15 | data sharing platform that promotes precision medicine in oncology. It
 16 | is not just a database or a tool; it is an expandable knowledge network
 17 | supporting the import and standardization of genomic and clinical data
 18 | from cancer research programs.
 19 | 
 20 | The GDC contains NCI-generated data from some of the largest and most
 21 | comprehensive cancer genomic datasets, including The Cancer Genome Atlas
 22 | (TCGA) and Therapeutically Applicable Research to Generate Effective
 23 | Therapies (TARGET). For the first time, these datasets have been
 24 | harmonized using a common set of bioinformatics pipelines, so that the
 25 | data can be directly compared.
 26 | 
 27 | As a growing knowledge system for cancer, the GDC also enables
 28 | researchers to submit data, and harmonizes these data for import into
 29 | the GDC. As more researchers add clinical and genomic data to the GDC,
 30 | it will become an even more powerful tool for making discoveries about
 31 | the molecular basis of cancer that may lead to better care for patients.
 32 | 
 33 | The [data model for the GDC is
 34 | complex](https://gdc.cancer.gov/developers/gdc-data-model/gdc-data-model-components),
 35 | but it is worth a quick overview. The data model is encoded as a so-called
 36 | property graph. Nodes represent entities such as Projects, Cases,
 37 | Diagnoses, Files (various kinds), and Annotations. The relationships
 38 | between these entities are maintained as edges. Both nodes and edges may
 39 | have Properties that supply instance details. The GDC API exposes these
 40 | nodes and edges in a somewhat simplified set of
 41 | [RESTful](https://en.wikipedia.org/wiki/Representational_state_transfer)
 42 | endpoints.
 43 | 
 44 | # Quickstart
 45 | 
 46 | This software is available at Bioconductor.org and can be downloaded via
 47 | `BiocManager::install`.
 48 | 
 49 | To report bugs or problems, either [submit a new
 50 | issue](https://github.com/Bioconductor/GenomicDataCommons/issues) or
 51 | submit a `bug.report(package='GenomicDataCommons')` from within R (which
 52 | will redirect you to the new issue on GitHub).
 53 | 
 54 | ## Installation
 55 | 
 56 | Installation can be achieved via Bioconductor’s `BiocManager` package.
 57 | 
 58 | ``` r
 59 | if (!require("BiocManager"))
 60 |     install.packages("BiocManager")
 61 | 
 62 | BiocManager::install('GenomicDataCommons')
 63 | ```
 64 | 
 65 | ``` r
 66 | library(GenomicDataCommons)
 67 | ```
 68 | 
 69 | ## Check basic functionality
 70 | 
 71 | ``` r
 72 | status()
 73 | #> $commit
 74 | #> [1] "4dd3680528a19ed33cfc83c7d049426c97bb903b"
 75 | #> 
 76 | #> $data_release
 77 | #> [1] "Data Release 34.0 - July 27, 2022"
 78 | #> 
 79 | #> $status
 80 | #> [1] "OK"
 81 | #> 
 82 | #> $tag
 83 | #> [1] "3.0.0"
 84 | #> 
 85 | #> $version
 86 | #> [1] 1
 87 | ```
 88 | 
 89 | ## Find data
 90 | 
 91 | The following code builds a `manifest` that can be used to guide the
 92 | download of raw data. Here, filtering finds gene expression files
 93 | quantified as raw counts using `STAR` from ovarian cancer patients.
 94 | 
 95 | ``` r
 96 | ge_manifest <- files() |>
 97 |     filter( cases.project.project_id == 'TCGA-OV') |>
 98 |     filter( type == 'gene_expression' ) |>
 99 |     filter( analysis.workflow_type == 'STAR - Counts') |>
100 |     manifest(size = 5)
101 | ge_manifest
102 | #>                                     id data_format     access                                                                   file_name
103 | #> 1 7c69529f-2273-4dc4-b213-e84924d78bea         TSV       open d6472bd0-b4e2-4ed1-a892-e1702c195dc7.rna_seq.augmented_star_gene_counts.tsv
104 | #> 2 0eff4634-f8c4-4db9-8a7c-331b21689bae         TSV       open 42165baf-b32c-4fc4-8b04-29c5b4e76de0.rna_seq.augmented_star_gene_counts.tsv
105 | #> 3 7d74b4c5-6391-4b3e-95a3-020ea0869e86         TSV controlled   accf08d4-a784-4908-831a-7a08d4c5f0f5.rna_seq.star_splice_junctions.tsv.gz
106 | #> 4 dc2aeea4-3cd0-4623-92f4-bbbc962851cc         TSV controlled   8ab508b9-2993-4e66-b8f9-81e32e936d4a.rna_seq.star_splice_junctions.tsv.gz
107 | #> 5 0cf852be-d2e3-4fde-bba8-c93efae2961a         TSV       open 93831282-1dd1-49a3-acd7-dae2a49ca62e.rna_seq.augmented_star_gene_counts.tsv
108 | #>                           submitter_id           data_category       acl            type file_size                 created_datetime                           md5sum
109 | #> 1 7085a70b-2f63-4402-9e53-70f091f26fcb Transcriptome Profiling      open gene_expression   4254435 2021-12-13T20:53:42.329364-06:00 19d5596bba8949f4c138793608497d56
110 | #> 2 f0d44930-b1ad-447a-86b9-27d0285954b9 Transcriptome Profiling      open gene_expression   4257461 2021-12-13T20:47:24.326497-06:00 d89d71b7c028c1643d7a3ee7857d8e01
111 | #> 3 e6473134-6d65-414c-9f52-2c25057fac7d Transcriptome Profiling phs000178 gene_expression   3109435 2021-12-13T21:03:56.008440-06:00 fb8332d6413c44a9de02a1cbe6b018aa
112 | #> 4 f99b93a9-70cb-44f8-bd1f-4edeee4425a4 Transcriptome Profiling phs000178 gene_expression   4607701 2021-12-13T21:02:23.944851-06:00 26231bed1ef67c093d3ce2b39def81cd
113 | #> 5 fb4d7abe-b61a-4f35-9700-605f1bc1512f Transcriptome Profiling      open gene_expression   4265694 2021-12-13T20:50:55.234254-06:00 050763aabd36509f954137fbdc4eeb00
114 | #>                   updated_datetime                              file_id                      data_type    state experimental_strategy
115 | #> 1 2022-01-19T14:47:28.965154-06:00 7c69529f-2273-4dc4-b213-e84924d78bea Gene Expression Quantification released               RNA-Seq
116 | #> 2 2022-01-19T14:47:07.478144-06:00 0eff4634-f8c4-4db9-8a7c-331b21689bae Gene Expression Quantification released               RNA-Seq
117 | #> 3 2022-01-19T14:01:15.621847-06:00 7d74b4c5-6391-4b3e-95a3-020ea0869e86 Splice Junction Quantification released               RNA-Seq
118 | #> 4 2022-01-19T14:01:15.621847-06:00 dc2aeea4-3cd0-4623-92f4-bbbc962851cc Splice Junction Quantification released               RNA-Seq
119 | #> 5 2022-01-19T14:47:07.036781-06:00 0cf852be-d2e3-4fde-bba8-c93efae2961a Gene Expression Quantification released               RNA-Seq
120 | ```
121 | 
122 | ## Download data
123 | 
124 | This code block downloads the 5 gene expression files specified in the
125 | query above. Using multiple processes to do the download very
126 | significantly speeds up the transfer in many cases. The following
127 | completes in about 15 seconds.
128 | 
129 | ``` r
130 | library(BiocParallel)
131 | register(MulticoreParam())
132 | destdir <- tempdir()
133 | fnames <- lapply(ge_manifest$id,gdcdata)
134 | ```
135 | 
136 | If the download had included controlled-access data, the download above
137 | would have needed to include a `token`. Details are available in [the
138 | authentication section below](#authentication).
139 | 
140 | ## Metadata queries
141 | 
142 | Here we use a couple of ad-hoc helper functions to handle the output of
143 | the query. See the `inst/script/README.Rmd` file for the source.
144 | 
145 | First, create a `data.frame` from the clinical data:
146 | 
147 | ``` r
148 | expands <- c("diagnoses","annotations",
149 |              "demographic","exposures")
150 | clinResults <- cases() |>
151 |     GenomicDataCommons::select(NULL) |>
152 |     GenomicDataCommons::expand(expands) |>
153 |     results(size=6)
154 | demoDF <- filterAllNA(clinResults$demographic)
155 | exposuresDF <- bindrowname(clinResults$exposures)
156 | ```
157 | 
158 | ``` r
159 | demoDF[, 1:4]
160 | #>                                      cause_of_death         race gender              ethnicity
161 | #> 2525bfef-6962-4b7f-8e80-6186400ce624           <NA> not reported female           not reported
162 | #> 126507c3-c0d7-41fb-9093-7deed5baf431 Cancer Related not reported female           not reported
163 | #> c43ac461-9f03-44bc-be7d-3d867eb708a0           <NA> not reported female           not reported
164 | #> a59a90d9-f1b0-49dd-9c97-bcaa6ba55d44 Cancer Related not reported   male           not reported
165 | #> 59122a43-606a-4669-806b-6747e0ac9985           <NA>        white   male not hispanic or latino
166 | #> 4447a969-e5c8-4291-b83c-53a0f7e77cbc Cancer Related        white female not hispanic or latino
167 | ```
168 | 
169 | ``` r
170 | exposuresDF[, 1:4]
171 | #>                                       submitter_id                 created_datetime    alcohol_intensity pack_years_smoked
172 | #> 2525bfef-6962-4b7f-8e80-6186400ce624 C3N-03839-EXP 2019-12-30T10:23:07.190853-06:00 Lifelong Non-Drinker                NA
173 | #> 126507c3-c0d7-41fb-9093-7deed5baf431 C3N-01518-EXP 2018-06-21T14:27:48.817254-05:00 Lifelong Non-Drinker                NA
174 | #> c43ac461-9f03-44bc-be7d-3d867eb708a0 C3N-03933-EXP 2019-03-14T08:23:14.054975-05:00 Lifelong Non-Drinker                NA
175 | #> a59a90d9-f1b0-49dd-9c97-bcaa6ba55d44 C3N-02695-EXP 2019-03-14T08:23:14.054975-05:00   Occasional Drinker              16.8
176 | #> 59122a43-606a-4669-806b-6747e0ac9985 C3L-03642-EXP 2019-06-24T07:53:15.534197-05:00 Lifelong Non-Drinker              39.0
177 | #> 4447a969-e5c8-4291-b83c-53a0f7e77cbc C3L-03728-EXP 2019-06-24T07:53:15.534197-05:00 Lifelong Non-Drinker                NA
178 | ```
179 | 
180 | Note that the diagnoses data has multiple lines per patient:
181 | 
182 | ``` r
183 | diagDF <- bindrowname(clinResults$diagnoses)
184 | diagDF[, 1:4]
185 | #>                                      ajcc_pathologic_stage                 created_datetime tissue_or_organ_of_origin age_at_diagnosis
186 | #> 2525bfef-6962-4b7f-8e80-6186400ce624             Stage IIB 2019-07-22T06:40:02.183501-05:00          Head of pancreas            19956
187 | #> 126507c3-c0d7-41fb-9093-7deed5baf431          Not Reported 2018-12-03T12:05:16.846188-06:00             Temporal lobe            26312
188 | #> c43ac461-9f03-44bc-be7d-3d867eb708a0             Stage III 2019-03-14T10:37:34.405260-05:00       Floor of mouth, NOS            25635
189 | #> a59a90d9-f1b0-49dd-9c97-bcaa6ba55d44          Not Reported 2019-03-14T10:37:34.405260-05:00       Floor of mouth, NOS            16652
190 | #> 59122a43-606a-4669-806b-6747e0ac9985          Not Reported 2019-07-22T06:40:02.183501-05:00          Upper lobe, lung            23384
191 | #> 4447a969-e5c8-4291-b83c-53a0f7e77cbc          Not Reported 2019-05-07T07:41:33.411909-05:00              Frontal lobe            29326
192 | ```
193 | 
194 | # Basic design
195 | 
196 | This package design is meant to have some similarities to the
197 | “tidyverse” approach of dplyr. Roughly, the functionality for finding
198 | and accessing files and metadata can be divided into:
199 | 
200 | 1.  Simple query constructors based on GDC API endpoints.
201 | 2.  A set of verbs that when applied, adjust filtering, field selection,
202 |     and faceting (fields for aggregation) and result in a new query
203 |     object (an endomorphism)
204 | 3.  A set of verbs that take a query and return results from the GDC
205 | 
206 | In addition, there are auxiliary functions for asking the GDC API for
207 | information about available and default fields, slicing BAM files, and
208 | downloading actual data files. Here is an overview of functionality[^1].
209 | 
210 | -   Creating a query
211 |     -   `projects()`
212 |     -   `cases()`
213 |     -   `files()`
214 |     -   `annotations()`
215 | -   Manipulating a query
216 |     -   `filter()`
217 |     -   `facet()`
218 |     -   `select()`
219 | -   Introspection on the GDC API fields
220 |     -   `mapping()`
221 |     -   `available_fields()`
222 |     -   `default_fields()`
223 |     -   `grep_fields()`
224 |     -   `available_values()`
225 |     -   `available_expand()`
226 | -   Executing an API call to retrieve query results
227 |     -   `results()`
228 |     -   `count()`
229 |     -   `response()`
230 | -   Raw data file downloads
231 |     -   `gdcdata()`
232 |     -   `transfer()`
233 |     -   `gdc_client()`
234 | -   Summarizing and aggregating field values (faceting)
235 |     -   `aggregations()`
236 | -   Authentication
237 |     -   `gdc_token()`
238 | -   BAM file slicing
239 |     -   `slicing()`
240 | 
241 | [^1]: See individual function and methods documentation for specific
242 |     details.
243 | 


--------------------------------------------------------------------------------
/vignettes/overview.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "The GenomicDataCommons Package"
  3 | author: "Sean Davis & Martin Morgan"
  4 | date: "`r format(Sys.Date(), '%A, %B %d, %Y')`"
  5 | always_allow_html: yes
  6 | output:
  7 |   BiocStyle::html_document:
  8 |     df_print: paged
  9 |     toc_float: true
 10 | abstract: >
 11 |   The National Cancer Institute (NCI) has established
 12 |   the [Genomic Data Commons](https://gdc.nci.nih.gov/) (GDC). The GDC
 13 |   provides the cancer research community with an open and unified
 14 |   repository for sharing and accessing data across numerous cancer
 15 |   studies and projects via a high-performance data transfer and query
 16 |   infrastructure.  The *GenomicDataCommons* Bioconductor package
 17 |   provides basic infrastructure for querying, accessing, and mining
 18 |   genomic datasets available from the GDC. We expect that the
 19 |   Bioconductor developer and the larger bioinformatics communities will
 20 |   build on the *GenomicDataCommons* package to add higher-level
 21 |   functionality and expose cancer genomics data to the plethora of
 22 |   state-of-the-art bioinformatics methods available in Bioconductor.
 23 | 
 24 | vignette: >
 25 |   %\VignetteIndexEntry{Introduction to Accessing the NCI Genomic Data Commons}
 26 |   %\VignetteEngine{knitr::rmarkdown}
 27 |   %\VignetteEncoding{UTF-8}
 28 | ---
 29 | 
 30 | ```{r init, results='hide', echo=FALSE, warning=FALSE, message=FALSE}
 31 | library(knitr)
 32 | opts_chunk$set(warning=FALSE, message=FALSE)
 33 | BiocStyle::markdown()
 34 | ```
 35 | 
 36 | 
 37 | # What is the GDC?
 38 | 
 39 | From the [Genomic Data Commons (GDC) website](https://gdc.cancer.gov/about-gdc):
 40 | 
 41 | > The National Cancer Institute's (NCI's) Genomic Data Commons (GDC) is
 42 | a data sharing platform that promotes precision medicine in
 43 | oncology. It is not just a database or a tool; it is an expandable
 44 | knowledge network supporting the import and standardization of genomic
 45 | and clinical data from cancer research programs.
 46 | > The GDC contains NCI-generated data from some of the largest and most
 47 | comprehensive cancer genomic datasets, including The Cancer Genome
 48 | Atlas (TCGA) and Therapeutically Applicable Research to Generate
 49 | Effective Therapies (TARGET). For the first time, these datasets have
 50 | been harmonized using a common set of bioinformatics pipelines, so
 51 | that the data can be directly compared.
 52 | > As a growing knowledge system for cancer, the GDC also enables
 53 | researchers to submit data, and harmonizes these data for import into
 54 | the GDC. As more researchers add clinical and genomic data to the GDC,
 55 | it will become an even more powerful tool for making discoveries about
 56 | the molecular basis of cancer that may lead to better care for
 57 | patients.
 58 | 
 59 | The
 60 | [data model for the GDC is complex](https://gdc.cancer.gov/developers/gdc-data-model/gdc-data-model-components),
 61 | but it is worth a quick overview, and a graphical representation is included here.
 62 | 
 63 | ![The data model is encoded as a
 64 | so-called property graph. Nodes represent entities such as Projects,
 65 | Cases, Diagnoses, Files (various kinds), and Annotations. The
 66 | relationships between these entities are maintained as edges.  Both
 67 | nodes and edges may have Properties that supply instance details. ](all_nodes_040318.png)
 68 | 
 69 |  The
 70 | GDC API exposes these nodes and edges in a somewhat simplified set
 71 | of
 72 | [RESTful](https://en.wikipedia.org/wiki/Representational_state_transfer) endpoints.
 73 | 
 74 | # Quickstart
 75 | 
 76 | This quickstart section is just meant to show basic
 77 | functionality. More details of functionality are included further on
 78 | in this vignette and in function-specific help.
 79 | 
 80 | This software is available at Bioconductor.org and can be downloaded via
 81 | `BiocManager::install`.
 82 | 
 83 | To report bugs or problems, either
 84 | [submit a new issue](https://github.com/Bioconductor/GenomicDataCommons/issues)
 85 | or submit a `bug.report(package='GenomicDataCommons')` from within R (which
 86 | will redirect you to the new issue on GitHub).
 87 | 
 88 | ## Installation
 89 | 
 90 | Installation can be achieved via Bioconductor's `BiocManager` package.
 91 | 
 92 | ```{r install_bioc, eval=FALSE}
 93 | if (!require("BiocManager"))
 94 |     install.packages("BiocManager")
 95 | BiocManager::install('GenomicDataCommons')
 96 | ```
 97 | 
 98 | ```{r libraries, message=FALSE}
 99 | library(GenomicDataCommons)
100 | ```
101 | 
102 | ## Check connectivity and status
103 | 
104 | The `r Biocpkg("GenomicDataCommons")` package relies on having network
105 | connectivity. In addition, the NCI GDC API must also be operational
106 | and not under maintenance. Checking `status` can be used to check this
107 | connectivity and functionality.
108 | 
109 | ```{r statusQS}
110 | GenomicDataCommons::status()
111 | ```
112 | 
113 | And to check the status in code:
114 | 
115 | ```{r statusCheck}
116 | stopifnot(GenomicDataCommons::status()$status=="OK")
117 | ```
118 | 
119 | 
120 | ## Find data
121 | 
122 | The following code builds a `manifest` that can be used to guide the
123 | download of raw data. Here, filtering finds gene expression files
124 | quantified as raw counts using `STAR` from ovarian cancer patients.
125 | 
126 | ```{r manifest}
127 | ge_manifest <- files() |>
128 |     filter( cases.project.project_id == 'TCGA-OV') |> 
129 |     filter( type == 'gene_expression' ) |>
130 |     filter( analysis.workflow_type == 'STAR - Counts')  |>
131 |     manifest()
132 | head(ge_manifest)
133 | ```
134 | 
135 | ## Download data
136 | 
137 | The code below downloads the first 20 of the `r nrow(ge_manifest)` gene
138 | expression files specified in the query above. Using multiple processes to do
139 | the download very significantly speeds up the transfer in many cases.  On a standard 1Gb
140 | connection, the following completes in about 30 seconds. The first time the 
141 | data are downloaded, R will ask to create a cache directory (see `?gdc_cache`
142 | for details of setting and interacting with the cache). Resulting
143 | downloaded files will be stored in the cache directory. Future access to 
144 | the same files will be directly from the cache, alleviating multiple downloads.
145 | 
146 | ```{r downloadQS, eval=FALSE}
147 | fnames <- lapply(ge_manifest$id[1:20], gdcdata)
148 | ```
149 | 
150 | If the download had included controlled-access data, the download above would
151 | have needed to include a `token`.  Details are available in
152 | [the authentication section below](#authentication).
153 | 
154 | ## Metadata queries
155 | 
156 | ### Clinical data
157 | 
158 | Accessing clinical data is a very common task. Given a set of `case_ids`,
159 | the `gdc_clinical()` function will return a list of four `tibble`s. 
160 | 
161 | - demographic
162 | - diagnoses
163 | - exposures
164 | - main
165 | 
166 | ```{r gdc_clinical}
167 | case_ids = cases() |> results(size=10) |> ids()
168 | clindat = gdc_clinical(case_ids)
169 | names(clindat)
170 | ```
171 | 
172 | ```{r clinData}
173 | head(clindat[["main"]])
174 | head(clindat[["diagnoses"]])
175 | ```
176 | 
177 | ### General metadata queries
178 | 
179 | The `r Biocpkg("GenomicDataCommons")` package can access the significant 
180 | clinical, demographic, biospecimen, and annotation information 
181 | contained in the NCI GDC. The `gdc_clinical()` function will often
182 | be all that is needed, but the API and `r Biocpkg("GenomicDataCommons")` package
183 | offer a great deal of flexibility if fine-tuning is required.
184 | 
185 | ```{r metadataQS}
186 | expands = c("diagnoses","annotations",
187 |              "demographic","exposures")
188 | clinResults = cases() |>
189 |     GenomicDataCommons::select(NULL) |>
190 |     GenomicDataCommons::expand(expands) |>
191 |     results(size=50)
192 | str(clinResults[[1]],list.len=6)
193 | # or listviewer::jsonedit(clinResults)
194 | ```
195 | 
196 | # Basic design
197 | 
198 | This package design is meant to have some similarities to the "hadleyverse"
199 | approach of dplyr. Roughly, the functionality for finding and accessing files
200 | and metadata can be divided into:
201 | 
202 | 1. Simple query constructors based on GDC API endpoints.
203 | 2. A set of verbs that when applied, adjust filtering, field selection, and
204 | faceting (fields for aggregation) and result in a new query object (an
205 | endomorphism)
206 | 3. A set of verbs that take a query and return results from the GDC
207 | 
208 | In addition, there are auxiliary functions for asking the GDC API for
209 | information about available and default fields, slicing BAM files, and
210 | downloading actual data files.  Here is an overview of functionality[^1].
211 | 
212 | 
213 | - Creating a query
214 |     - `projects()`
215 |     - `cases()`
216 |     - `files()`
217 |     - `annotations()`
218 | - Manipulating a query
219 |     - `filter()`
220 |     - `facet()`
221 |     - `select()`
222 | - Introspection on the GDC API fields
223 |     - `mapping()`
224 |     - `available_fields()`
225 |     - `default_fields()`
226 |     - `grep_fields()`
227 |     - `available_values()`
228 |     - `available_expand()`
229 | - Executing an API call to retrieve query results
230 |     - `results()`
231 |     - `count()`
232 |     - `response()`
233 | - Raw data file downloads
234 |     - `gdcdata()`
235 |     - `transfer()`
236 |     - `gdc_client()`
237 | - Summarizing and aggregating field values (faceting)
238 |     - `aggregations()`
239 | - Authentication
240 |     - `gdc_token()`
241 | - BAM file slicing
242 |     - `slicing()`
243 | 
244 | [^1]: See individual function and methods documentation for specific details.
245 | 
246 | 
247 | # Usage
248 | 
249 | There are two main classes of operations when working with the NCI GDC.
250 | 
251 | 1. [Querying metadata and finding data files](#querying-metadata) (e.g., finding
252 | all gene expression quantifications data files for all colon cancer patients).
253 | 2. [Transferring raw or processed data](#datafile-access-and-download) from the
254 | GDC to another computer (e.g., downloading raw or processed data)
255 | 
256 | Both classes of operation are reviewed in detail in the following sections.
257 | 
258 | ## Querying metadata
259 | 
260 | Vast amounts of metadata about cases (patients, basically), files, projects, and
261 | so-called annotations are available via the NCI GDC API. Typically, one will
262 | want to query metadata to either focus in on a set of files for download or
263 | transfer *or* to perform so-called aggregations (pivot-tables, facets, similar
264 | to the R `table()` functionality).
265 | 
266 | Querying metadata starts with [creating a "blank" query](#creating-a-query). One
267 | will often then want to [`filter`](#filtering) the query to limit results prior
268 | to [retrieving results](#retrieving-results). The GenomicDataCommons package has
269 | [helper functions for listing fields](#fields-and-values) that are available for
270 | filtering.
271 | 
272 | In addition to fetching results, the GDC API allows
273 | [faceting, or aggregating,](#facets-and-aggregation), useful for compiling
274 | reports, generating dashboards, or building user interfaces to GDC data (see GDC
275 | web query interface for a non-R-based example).
276 | 
277 | ### Creating a query
278 | 
279 | A query of the GDC starts its life in R. Queries follow the four metadata
280 | endpoints available at the GDC.  In particular, there are four convenience
281 | functions that each create `GDCQuery` objects (actually, specific subclasses of
282 | `GDCQuery`):
283 | 
284 | - `projects()`
285 | - `cases()`
286 | - `files()`
287 | - `annotations()`
288 | 
289 | ```{r projectquery}
290 | pquery = projects()
291 | ```
292 | 
293 | The `pquery` object is now an object of (S3) class, `GDCQuery` (and
294 | `gdc_projects` and `list`). The object contains the following elements:
295 | 
296 | - fields: This is a character vector of the fields that will be returned when we
297 | [retrieve data](#retrieving-results). If no fields are specified to, for
298 | example, the `projects()` function, the default fields from the GDC are used
299 | (see `default_fields()`)
300 | - filters: This will contain results after calling the
301 | [`filter()` method](#filtering) and will be used to filter results on
302 | [retrieval](#retrieving-results).
303 | - facets: A character vector of field names that will be used for
304 | [aggregating data](#facets-and-aggregation) in a call to `aggregations()`.
305 | - token: A character(1) token from the GDC. See
306 | [the authentication section](#authentication) for details, but note that, in
307 | general, the token is not necessary for metadata query and retrieval, only for
308 | actual data download.
309 | 
310 | Looking at the actual object (get used to using `str()`!), note that the query
311 | contains no results.
312 | 
313 | ```{r pquery}
314 | str(pquery)
315 | ```
316 | ### Retrieving results
317 | 
318 | [[ GDC pagination documentation ]](https://docs.gdc.cancer.gov/API/Users_Guide/Search_and_Retrieval/#size-and-from)
319 | 
320 | [[ GDC sorting documentation ]](https://docs.gdc.cancer.gov/API/Users_Guide/Search_and_Retrieval/#sort)
321 | 
322 | With a query object available, the next step is to retrieve results from the
323 | GDC using the GenomicDataCommons package.  The most basic type of results we can get
324 | is a simple `count()` of records available that satisfy the filter criteria.
325 | Note that we have not set any filters, so a `count()` here will represent all
326 | the project records publicly available at the GDC in the "default" archive.
327 | 
328 | ```{r pquerycount}
329 | pcount = count(pquery)
330 | # or
331 | pcount = pquery |> count()
332 | pcount
333 | ```
334 | 
335 | The `results()` method will fetch actual results.
336 | 
337 | ```{r pqueryresults}
338 | presults = pquery |> results()
339 | ```
340 | These results are
341 | returned from the GDC in [JSON](http://www.json.org/) format and
342 | converted into a (potentially nested) list in R. The `str()` method is useful
343 | for taking a quick glimpse of the data.
344 | 
345 | ```{r presultsstr}
346 | str(presults)
347 | ```
348 | 
349 | By default, only 10 records are returned. We can use the `size` and `from`
350 | arguments to `results()` to either page through results or to change the number
351 | of results. Finally, there is a convenience method, `results_all()` that will
352 | simply fetch all the available results given a query. Note that `results_all()`
353 | may take a long time and return HUGE result sets if not used carefully. Use of a
354 | combination of `count()` and `results()` to get a sense of the expected data
355 | size is probably warranted before calling `results_all()`.
356 | 
357 | ```{r presultsall}
358 | length(ids(presults))
359 | presults = pquery |> results_all()
360 | length(ids(presults))
361 | # includes all records
362 | length(ids(presults)) == count(pquery)
363 | ```
364 | 
365 | Extracting subsets of
366 | results or manipulating the results into a more conventional R data
367 | structure is not easily generalizable.  However,
368 | the
369 | [purrr](https://github.com/hadley/purrr),
370 | [rlist](https://renkun.me/rlist/),
371 | and [data.tree](https://cran.r-project.org/web/packages/data.tree/vignettes/data.tree.html) packages
372 | are all potentially of interest for manipulating complex, nested list
373 | structures. For viewing the results in an interactive viewer, consider the
374 | [listviewer](https://github.com/timelyportfolio/listviewer) package.
375 | 
376 | 
377 | ### Fields and Values
378 | 
379 | [[ GDC `fields` documentation ]](https://docs.gdc.cancer.gov/API/Users_Guide/Search_and_Retrieval/#fields)
380 | 
381 | Central to querying and retrieving data from the GDC is the ability to specify
382 | which fields to return, filtering by fields and values, and faceting or
383 | aggregating. The GenomicDataCommons package includes two simple functions,
384 | `available_fields()` and `default_fields()`. Each can operate on a character(1)
385 | endpoint name ("cases", "files", "annotations", or "projects") or a `GDCQuery`
386 | object.
387 | 
388 | ```{r defaultfields}
389 | default_fields('files')
390 | # The number of fields available for files endpoint
391 | length(available_fields('files'))
392 | # The first few fields available for files endpoint
393 | head(available_fields('files'))
394 | ```
395 | 
396 | The fields to be returned by a query can be specified following a similar
397 | paradigm to that of the dplyr package. The `select()` function is a verb that
398 | resets the fields slot of a `GDCQuery`; note that this is not quite analogous to
399 | the dplyr `select()` verb that limits from already-present fields. We
400 | *completely replace* the fields when using `select()` on a `GDCQuery`.
401 | 
402 | ```{r selectexample}
403 | # Default fields here
404 | qcases = cases()
405 | qcases$fields
406 | # set up query to use ALL available fields
407 | # Note that checking of fields is done by select()
408 | qcases = cases() |> GenomicDataCommons::select(available_fields('cases'))
409 | head(qcases$fields)
410 | ```
411 | 
412 | Finding fields of interest is such a common operation that the
413 | GenomicDataCommons includes the `grep_fields()` function.
414 | See the appropriate help pages for details.
415 | 
416 | ### Facets and aggregation
417 | 
418 | [[ GDC `facet` documentation ]](https://docs.gdc.cancer.gov/API/Users_Guide/Search_and_Retrieval/#facets)
419 | 
420 | The GDC API offers a feature known as aggregation or faceting. By
421 | specifying one or more fields (of appropriate type), the GDC can
422 | return to us a count of the number of records matching each potential
423 | value. This is similar to the R `table` method. Multiple fields can be
424 | returned at once, but the GDC API does not have a cross-tabulation
425 | feature; all aggregations are only on one field at a time. Results of
426 | `aggregations()` calls come back as a list of data.frames (actually,
427 | tibbles).
428 | 
429 | ```{r aggexample}
430 | # total number of files of a specific type
431 | res = files() |> facet(c('type','data_type')) |> aggregations()
432 | res$type
433 | ```
434 | 
435 | Using `aggregations()` is also an easy way to learn the contents of individual
436 | fields and forms the basis for faceted search pages.
437 | 
438 | ### Filtering
439 | 
440 | [[ GDC `filtering` documentation ]](https://docs.gdc.cancer.gov/API/Users_Guide/Search_and_Retrieval/#filters-specifying-the-query)
441 | 
442 | The GenomicDataCommons package uses a form of non-standard evaluation to specify
443 | R-like queries that are then translated into an R list. That R list is, upon
444 | calling a method that fetches results from the GDC API, translated into the
445 | appropriate JSON string. The R expression uses the formula interface as
446 | suggested by Hadley Wickham in his [vignette on non-standard evaluation](https://cran.r-project.org/web/packages/dplyr/vignettes/nse.html)
447 | 
448 | > It’s best to use a formula because a formula captures both the expression to
449 | evaluate and the environment where the evaluation occurs. This is important if
450 | the expression is a mixture of variables in a data frame and objects in the
451 | local environment [for example].
452 | 
453 | For the user, these details will not be too important except to note that a filter
454 | expression may begin with a "~"; the examples below show both forms, with and without it.
455 | 
456 | ```{r allfilesunfiltered}
457 | qfiles = files()
458 | qfiles |> count() # all files
459 | ```
460 | To limit the file type, we can refer back to the
461 | [section on faceting](#facets-and-aggregation) to see the possible values for
462 | the file field "type". For example, to filter file results to only
463 | "gene_expression" files, we simply specify a filter.
464 | 
465 | ```{r onlyGeneExpression}
466 | qfiles = files() |> filter( type == 'gene_expression')
467 | # here is what the filter looks like after translation
468 | str(get_filter(qfiles))
469 | ```
470 | 
471 | What if we want to create a filter based on the project ('TCGA-OVCA', for
472 | example)? Well, we have a couple of possible ways to discover available fields.
473 | The first is based on base R functionality and some intuition.
474 | 
475 | ```{r filtAvailFields}
476 | grep('pro',available_fields('files'),value=TRUE) |> 
477 |     head()
478 | ```
479 | 
480 | Interestingly, the project information is "nested" inside the case. We don't
481 | need to know that detail other than to know that we now have a few potential
482 | guesses for where our information might be in the files records.  We need to
483 | know where because we need to construct the appropriate filter.
484 | 
485 | ```{r filtProgramID}
486 | files() |> 
487 |     facet('cases.project.project_id') |> 
488 |     aggregations() |> 
489 |     head()
490 | ```
491 | 
492 | We note that `cases.project.project_id` looks like it is a good fit. We also
493 | note that `TCGA-OV` is the correct project_id, not `TCGA-OVCA`. Several
494 | conditions can be combined in a single `filter()` call with `&`, as shown
495 | here; chained `filter()` calls, shown further below, give the same result.
496 | 
497 | ```{r filtfinal}
498 | qfiles = files() |>
499 |     filter( cases.project.project_id == 'TCGA-OV' & type == 'gene_expression')
500 | str(get_filter(qfiles))
501 | qfiles |> count()
502 | ```
503 | 
504 | Asking for a `count()` of results given these new filter criteria gives
505 | `r qfiles |> count()` results. Filters can be chained (or nested) to
506 | accomplish the same effect as multiple `&` conditionals. The `count()`
507 | below is equivalent to the `&` filtering done above.
508 | 
509 | ```{r filtChain}
510 | qfiles2 = files() |>
511 |     filter( cases.project.project_id == 'TCGA-OV') |> 
512 |     filter( type == 'gene_expression') 
513 | qfiles2 |> count()
514 | (qfiles |> count()) == (qfiles2 |> count()) #TRUE
515 | ```
516 | 
517 | 
518 | 
519 | Generating a manifest for bulk downloads is as
520 | simple as asking for the manifest from the current query.
521 | 
522 | ```{r filtAndManifest}
523 | manifest_df = qfiles |> manifest()
524 | head(manifest_df)
525 | ```
526 | 
527 | Note that we might still not be quite there. Looking at filenames, there are
528 | suspiciously named files that might include "FPKM", "FPKM-UQ", or "counts".
529 | Another round of `grep` and `available_fields`, looking for "type" turned up
530 | that the field "analysis.workflow_type" has the appropriate filter criteria.
531 | 
532 | 
533 | ```{r filterForSTARCounts}
534 | qfiles = files() |> filter( ~ cases.project.project_id == 'TCGA-OV' &
535 |                             type == 'gene_expression' &
536 |                             access == "open" &
537 |                             analysis.workflow_type == 'STAR - Counts')
538 | manifest_df = qfiles |> manifest()
539 | nrow(manifest_df)
540 | ```
541 | 
542 | The GDC Data Transfer Tool can be used (from R, `transfer()` or from the
543 | command-line) to orchestrate high-performance, restartable transfers of all the
544 | files in the manifest. See [the bulk downloads section](#bulk-downloads) for
545 | details.
546 | 
547 | 
548 | ## Authentication
549 | 
550 | [[ GDC authentication documentation ]](https://docs.gdc.cancer.gov/API/Users_Guide/Getting_Started/#authentication)
551 | 
552 | The GDC offers both "controlled-access" and "open" data. As of this
553 | writing, only data stored as files is "controlled-access"; that is,
554 | metadata accessible via the GDC is all "open" data and some files are
555 | "open" and some are "controlled-access". Controlled-access data are
556 | only available
557 | after
558 | [going through the process of obtaining access.](https://gdc.cancer.gov/access-data/obtaining-access-controlled-data)
559 | 
560 | After controlled-access to one or more datasets has been granted,
561 | logging into the GDC web portal will allow you
562 | to
563 | [access a GDC authentication token](https://docs.gdc.cancer.gov/Data_Portal/Users_Guide/Authentication/#gdc-authentication-tokens),
564 | which can be downloaded and then used to access available
565 | controlled-access data via the GenomicDataCommons package.
566 | 
567 | The GenomicDataCommons uses authentication tokens only for downloading
568 | data (see `transfer` and `gdcdata` documentation). The package
569 | includes a helper function, `gdc_token`, that looks for the token to
570 | be stored in one of three ways (resolved in this order):
571 | 
572 | 1. As a string stored in the environment variable, `GDC_TOKEN`
573 | 2. As a file, stored in the file named by the environment variable,
574 |    `GDC_TOKEN_FILE`
575 | 3. In a file in the user home directory, called `.gdc_token`
576 | 
577 | As a concrete example:
578 | 
579 | ```{r authenNoRun, eval=FALSE}
580 | token = gdc_token()
581 | transfer(...,token=token)
582 | # or
583 | transfer(...,token=get_token())
584 | ```
585 | 
586 | 
587 | ## Datafile access and download
588 | 
589 | ### Data downloads via the GDC API
590 | 
591 | The `gdcdata` function takes a character vector of one or more file
592 | ids. A simple way of producing such a vector is to produce a
593 | `manifest` data frame and then pass in the first column, which will
594 | contain file ids.
595 | 
596 | ```{r singlefileDL}
597 | fnames = gdcdata(manifest_df$id[1:2],progress=FALSE)
598 | 
599 | ```
600 | 
601 | Note that for controlled-access data, a
602 | GDC [authentication token](#authentication) is required. Using the
603 | `BiocParallel` package may be useful for downloading in parallel,
604 | particularly for large numbers of smallish files.
605 | 
606 | ### Bulk downloads
607 | 
608 | The bulk download functionality is only efficient (as of v1.2.0 of the
609 | GDC Data Transfer Tool) for relatively large files, so use this
610 | approach only when transferring BAM files or larger VCF files, for
611 | example. Otherwise, consider using the approach shown above, perhaps
612 | in parallel.
613 | 
614 | ```{r bulkDL, eval=FALSE}
615 | # Requires the gdc-client command-line utility to be installed
616 | # separately. 
617 | fnames = gdcdata(manifest_df$id[3:10], access_method = 'client')
618 | ```
619 | 
620 | 
621 | ### BAM slicing
622 | 
623 | # Use Cases
624 | 
625 | ## Cases
626 | 
627 | ### How many cases are there per project_id?
628 | 
629 | ```{r casesPerProject}
630 | res = cases() |> facet("project.project_id") |> aggregations()
631 | head(res)
632 | library(ggplot2)
633 | ggplot(res$project.project_id,aes(x = key, y = doc_count)) +
634 |     geom_bar(stat='identity') +
635 |     theme(axis.text.x = element_text(angle = 45, hjust = 1))
636 | ```
637 | 
638 | ### How many cases are included in all TARGET projects?
639 | 
640 | ```{r casesInTCGA}
641 | cases() |> filter(~ project.program.name=='TARGET') |> count()
642 | ```
643 | 
644 | ### How many cases are included in all TCGA projects?
645 | 
646 | ```{r casesInTARGET}
647 | cases() |> filter(~ project.program.name=='TCGA') |> count()
648 | ```
649 | 
650 | ### What is the breakdown of sample types in TCGA-BRCA?
651 | 
652 | ```{r casesTCGABRCASampleTypes}
653 | # The need to do the "&" here is a requirement of the
654 | # current version of the GDC API. I have filed a feature
655 | # request to remove this requirement.
656 | resp = cases() |> filter(~ project.project_id=='TCGA-BRCA' &
657 |                               project.project_id=='TCGA-BRCA' ) |>
658 |     facet('samples.sample_type') |> aggregations()
659 | resp$samples.sample_type
660 | ```
661 | 
662 | ### Fetch all samples in TCGA-BRCA that use "Solid Tissue" as a normal.
663 | 
664 | ```{r casesTCGABRCASolidNormal}
665 | # The need to do the "&" here is a requirement of the
666 | # current version of the GDC API. I have filed a feature
667 | # request to remove this requirement.
668 | resp = cases() |> filter(~ project.project_id=='TCGA-BRCA' &
669 |                               samples.sample_type=='Solid Tissue Normal') |>
670 |     GenomicDataCommons::select(c(default_fields(cases()),'samples.sample_type')) |>
671 |     response_all()
672 | count(resp)
673 | res = resp |> results()
674 | str(res[1],list.len=6)
675 | head(ids(resp))
676 | ```
677 | 
678 | ### Get all TCGA case ids that are female
679 | 
680 | ```{r casesFemaleTCGA}
681 | cases() |>
682 |   GenomicDataCommons::filter(~ project.program.name == 'TCGA' &
683 |     "cases.demographic.gender" %in% "female") |>
684 |       GenomicDataCommons::results(size = 4) |>
685 |         ids()
686 | ```
687 | 
688 | ### Get all TCGA-COAD case ids that are NOT female
689 | 
690 | ```{r notFemaleTCGACOAD}
691 | cases() |>
692 |   GenomicDataCommons::filter(~ project.project_id == 'TCGA-COAD' &
693 |     "cases.demographic.gender" %exclude% "female") |>
694 |       GenomicDataCommons::results(size = 4) |>
695 |         ids()
696 | ```
697 | 
698 | ### Get all TCGA cases that are missing gender
699 | 
700 | ```{r missingGenderTCGA}
701 | cases() |>
702 |   GenomicDataCommons::filter(~ project.program.name == 'TCGA' &
703 |     missing("cases.demographic.gender")) |>
704 |       GenomicDataCommons::results(size = 4) |>
705 |         ids()
706 | ```
707 | 
708 | ### Get all TCGA cases that are NOT missing gender
709 | 
710 | ```{r notMissingGenderTCGA}
711 | cases() |>
712 |   GenomicDataCommons::filter(~ project.program.name == 'TCGA' &
713 |     !missing("cases.demographic.gender")) |>
714 |       GenomicDataCommons::results(size = 4) |>
715 |         ids()
716 | ```
717 | 
718 | 
719 | ## Files
720 | 
721 | ### How many of each type of file are available?
722 | 
723 | ```{r filesVCFCount}
724 | res = files() |> facet('type') |> aggregations()
725 | res$type
726 | ggplot(res$type,aes(x = key,y = doc_count)) + geom_bar(stat='identity') +
727 |     theme(axis.text.x = element_text(angle = 45, hjust = 1))
728 | ```
729 | 
730 | ### Find gene-level RNA-seq quantification files for GBM
731 | 
732 | ```{r filesRNAseqGeneGBM}
733 | q = files() |>
734 |     GenomicDataCommons::select(available_fields('files')) |>
735 |     filter(~ cases.project.project_id=='TCGA-GBM' &
736 |                data_type=='Gene Expression Quantification')
737 | q |> facet('analysis.workflow_type') |> aggregations()
738 | # so need to add another filter
739 | file_ids = q |> filter(~ cases.project.project_id=='TCGA-GBM' &
740 |                             data_type=='Gene Expression Quantification' &
741 |                             analysis.workflow_type == 'STAR - Counts') |>
742 |     GenomicDataCommons::select('file_id') |>
743 |     response_all() |>
744 |     ids()
745 | ```
746 | 
747 | ## Slicing
748 | 
749 | ### Get all BAM file ids from TCGA-GBM
750 | 
751 | **I need to figure out how to do slicing reproducibly in a testing environment
752 | and for vignette building**.
753 | 
754 | ```{r filesRNAseqGeneGBMforBAM}
755 | q = files() |>
756 |     GenomicDataCommons::select(available_fields('files')) |>
757 |     filter(~ cases.project.project_id == 'TCGA-GBM' &
758 |                data_type == 'Aligned Reads' &
759 |                experimental_strategy == 'RNA-Seq' &
760 |                data_format == 'BAM')
761 | file_ids = q |> response_all() |> ids()
762 | ```
763 | 
764 | 
765 | ```{r slicing10, eval=FALSE}
766 | bamfile = slicing(file_ids[1],regions="chr12:6534405-6538375",token=gdc_token())
767 | library(GenomicAlignments)
768 | aligns = readGAlignments(bamfile)
769 | ```
770 | 
771 | # Troubleshooting
772 | 
773 | ## SSL connection errors
774 | 
775 | * Symptom: Trying to connect to the API results in:
776 | ```
777 | Error in curl::curl_fetch_memory(url, handle = handle) :
778 | SSL connect error
779 | ```
780 | * Possible solutions: The [issue
781 | is that the GDC supports only recent versions of Transport Layer Security (TLS)](http://stackoverflow.com/a/42599546/459633),
782 | so the only known fix is to upgrade the system `openssl` to version
783 | 1.0.1 or later.
784 |     * [[Mac OS]](https://github.com/Bioconductor/GenomicDataCommons/issues/35#issuecomment-284233510),
785 |     * [[Ubuntu]](http://askubuntu.com/a/434245)
786 |     * [[Centos/RHEL]](https://www.liquidweb.com/kb/update-and-patch-openssl-for-the-ccs-injection-vulnerability/).
787 |     After upgrading `openssl`, reinstall the R `curl` and `httr` packages.
788 | 
789 | 
790 | # sessionInfo()
791 | 
792 | ```{r sessionInfo}
793 | sessionInfo()
794 | ```
795 | 
796 | # Developer notes
797 | 
798 | - The `S3` object-oriented programming paradigm is used.
799 | - We have adopted a functional programming style with functions and methods that
800 | often take an "object" as the first argument. This style lends itself to
801 | pipeline-style programming.
802 | - The GenomicDataCommons package uses the
803 | [alternative request format (POST)](https://docs.gdc.cancer.gov/API/Users_Guide/Search_and_Retrieval/#alternative-request-format)
804 | to allow very large request bodies.
805 | 
806 | 


--------------------------------------------------------------------------------