├── R ├── sysdata.rda ├── status.R ├── GenomicDataCommons-package.R ├── utilities.R ├── readDNAcopy.R ├── entity_name.R ├── facets.R ├── readHTSeqFile.R ├── gdc_token.R ├── mapping.R ├── expand.R ├── ids.R ├── constants.R ├── query.R ├── caching.R ├── slicing.R ├── manifest.R ├── clinical.R ├── REST.R ├── gdcdata.R ├── fields.R ├── bulk_transfer.R ├── filters.R └── response.R ├── _pkgdown.yml ├── tests ├── testthat.R ├── testthat │ ├── test_readHTSeqFile.R │ ├── test_cache.R │ ├── test_clinical.R │ ├── test_data.R │ └── test_api.R └── README.md ├── inst ├── extdata │ ├── dnacopy.tsv.gz │ └── example.htseq.counts.gz └── script │ ├── make_sysdata.R │ └── README.Rmd ├── vignettes ├── all_nodes_040318.png ├── questions-and-answers.Rmd ├── somatic_mutations.Rmd └── overview.Rmd ├── .gitignore ├── .Rbuildignore ├── man ├── status.Rd ├── constants.Rd ├── readDNAcopy.Rd ├── available_values.Rd ├── results.Rd ├── results_all.Rd ├── grep_fields.Rd ├── aggregations.Rd ├── select.Rd ├── expand.Rd ├── count.Rd ├── default_fields.Rd ├── available_fields.Rd ├── entity_name.Rd ├── write_manifest.Rd ├── mapping.Rd ├── available_expand.Rd ├── ids.Rd ├── make_filter.Rd ├── id_field.Rd ├── gdc_client.Rd ├── readHTSeqFile.Rd ├── faceting.Rd ├── response.Rd ├── field_description.Rd ├── GenomicDataCommons-package.Rd ├── manifest.Rd ├── gdc_token.Rd ├── gdc_clinical.Rd ├── gdc_cache.Rd ├── gdcdata.Rd ├── transfer.Rd ├── slicing.Rd ├── filtering.Rd └── query.Rd ├── GenomicDataCommons.Rproj ├── DESCRIPTION ├── NEWS.md ├── .github └── workflows │ └── basic_checks.yml ├── NAMESPACE └── README.md /R/sysdata.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bioconductor/GenomicDataCommons/HEAD/R/sysdata.rda -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: 
http://bioconductor.github.io/GenomicDataCommons/ 2 | template: 3 | bootstrap: 5 4 | 5 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(GenomicDataCommons) 3 | 4 | test_check("GenomicDataCommons") 5 | -------------------------------------------------------------------------------- /inst/extdata/dnacopy.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bioconductor/GenomicDataCommons/HEAD/inst/extdata/dnacopy.tsv.gz -------------------------------------------------------------------------------- /vignettes/all_nodes_040318.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bioconductor/GenomicDataCommons/HEAD/vignettes/all_nodes_040318.png -------------------------------------------------------------------------------- /inst/extdata/example.htseq.counts.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bioconductor/GenomicDataCommons/HEAD/inst/extdata/example.htseq.counts.gz -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | *.html 6 | *# 7 | scratch 8 | inst/doc/* 9 | *_cache/ 10 | docs 11 | .httr-oauth 12 | doc 13 | Meta 14 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^_pkgdown\.yml$ 2 | ^.*\.Rproj$ 3 | ^\.Rproj\.user$ 4 | ^\.travis\.yml$ 5 | ^appveyor\.yml$ 6 | .*cache$ 7 | .httr-oauth 8 | ^DevNotes\.md$ 9 | scratch/* 10 | ^docs$ 11 | _pkgdown.yml 12 | ^\.httr-oauth$ 13 | 
^doc$ 14 | ^Meta$ 15 | ^\.github$ 16 | ^pkgdown$ 17 | -------------------------------------------------------------------------------- /tests/testthat/test_readHTSeqFile.R: -------------------------------------------------------------------------------- 1 | library(GenomicDataCommons) 2 | context('readHTSeqFile') 3 | 4 | test_that("readHTSeqFile works on example data", { 5 | dat = readHTSeqFile(system.file(package="GenomicDataCommons", 6 | 'extdata/example.htseq.counts.gz')) 7 | expect_equal(nrow(dat),50) 8 | expect_equal(ncol(dat),2) 9 | }) 10 | -------------------------------------------------------------------------------- /R/status.R: -------------------------------------------------------------------------------- 1 | #' Query the GDC for current status 2 | #' 3 | #' @param version (optional) character(1) version of GDC 4 | #' 5 | #' @return List describing current status. 6 | #' 7 | #' @importFrom httr content 8 | #' 9 | #' @examples 10 | #' status() 11 | #' 12 | #' @export 13 | status <- function(version=NULL) { 14 | response <- .gdc_get(paste(version, "status", sep="/"),archive='default') 15 | content(response, type="application/json") 16 | } 17 | -------------------------------------------------------------------------------- /R/GenomicDataCommons-package.R: -------------------------------------------------------------------------------- 1 | #' GenomicDataCommons: A package for interfacing with the NCI GDC 2 | #' 3 | #' @section finding data: 4 | #' 5 | #' \itemize{ 6 | #' \item{\code{\link{query}}} 7 | #' \item{\code{\link{cases}}} 8 | #' \item{\code{\link{projects}}} 9 | #' \item{\code{\link{files}}} 10 | #' \item{\code{\link{annotations}}} 11 | #' \item{\code{\link{mapping}}} 12 | #' } 13 | #' 14 | #' @section downloading data: 15 | #' data 16 | #' 17 | "_PACKAGE" 18 | -------------------------------------------------------------------------------- /man/status.Rd: -------------------------------------------------------------------------------- 1 | % 
Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/status.R 3 | \name{status} 4 | \alias{status} 5 | \title{Query the GDC for current status} 6 | \usage{ 7 | status(version = NULL) 8 | } 9 | \arguments{ 10 | \item{version}{(optional) character(1) version of GDC} 11 | } 12 | \value{ 13 | List describing current status. 14 | } 15 | \description{ 16 | Query the GDC for current status 17 | } 18 | \examples{ 19 | status() 20 | 21 | } 22 | -------------------------------------------------------------------------------- /GenomicDataCommons.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | ProjectId: c8e45341-d01a-414d-a98a-fb488c8baf55 3 | 4 | RestoreWorkspace: Default 5 | SaveWorkspace: Default 6 | AlwaysSaveHistory: Default 7 | 8 | EnableCodeIndexing: Yes 9 | UseSpacesForTab: Yes 10 | NumSpacesForTab: 4 11 | Encoding: UTF-8 12 | 13 | RnwWeave: knitr 14 | LaTeX: pdfLaTeX 15 | 16 | BuildType: Package 17 | PackageUseDevtools: Yes 18 | PackageInstallArgs: --no-multiarch --with-keep.source --no-test-load 19 | PackageBuildArgs: --no-build-vignettes 20 | PackageCheckArgs: --no-vignettes 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /tests/testthat/test_cache.R: -------------------------------------------------------------------------------- 1 | library(GenomicDataCommons) 2 | context('cache_control') 3 | 4 | cache = gdc_cache() 5 | 6 | test_that("getting cache returns length 1 char vector", { 7 | expect_length(gdc_cache(),1) 8 | expect_true(is.character(gdc_cache())) 9 | }) 10 | 11 | test_that("setting cache works", { 12 | expect_equal(gdc_set_cache('/tmp'),'/tmp') 13 | expect_equal(gdc_cache(),'/tmp') 14 | }) 15 | 16 | test_that("setting cache error checking works", { 17 | expect_error(gdc_set_cache(1)) 18 | expect_error(gdc_set_cache(c('a','b'))) 19 | }) 20 | 21 | gdc_set_cache(cache) 22 | 23 | 
-------------------------------------------------------------------------------- /man/constants.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/constants.R 3 | \name{endpoints} 4 | \alias{endpoints} 5 | \alias{parameters} 6 | \title{Endpoints and Parameters} 7 | \usage{ 8 | endpoints() 9 | 10 | parameters() 11 | } 12 | \value{ 13 | \code{endpoints()} returns a character vector of possible 14 | endpoints. 15 | 16 | \code{parameters()} returns a list of possible parameters 17 | and their default values. 18 | } 19 | \description{ 20 | \code{endpoints()} returns available endpoints. 21 | } 22 | \examples{ 23 | endpoints() 24 | parameters() 25 | } 26 | \keyword{internal} 27 | -------------------------------------------------------------------------------- /man/readDNAcopy.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/readDNAcopy.R 3 | \name{readDNAcopy} 4 | \alias{readDNAcopy} 5 | \title{Read DNAcopy results into GRanges object} 6 | \usage{ 7 | readDNAcopy(fname, ...) 
8 | } 9 | \arguments{ 10 | \item{fname}{The path to a DNAcopy-like file.} 11 | 12 | \item{...}{passed to \code{\link[readr]{read_tsv}}} 13 | } 14 | \value{ 15 | a \code{\link[GenomicRanges]{GRanges}} object 16 | } 17 | \description{ 18 | Read DNAcopy results into GRanges object 19 | } 20 | \examples{ 21 | fname = system.file(package='GenomicDataCommons', 22 | 'extdata/dnacopy.tsv.gz') 23 | dnac = readDNAcopy(fname) 24 | class(dnac) 25 | length(dnac) 26 | 27 | } 28 | -------------------------------------------------------------------------------- /man/available_values.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fields.R 3 | \name{available_values} 4 | \alias{available_values} 5 | \title{Find common values for a GDC field} 6 | \usage{ 7 | available_values(entity, field) 8 | } 9 | \arguments{ 10 | \item{entity}{character(1), a GDC entity ("cases", "files", "annotations", "projects")} 11 | 12 | \item{field}{character(1), a field that is present in the entity record} 13 | } 14 | \value{ 15 | character vector of the top 100 (or fewer) most frequent 16 | values for the given field 17 | } 18 | \description{ 19 | Find common values for a GDC field 20 | } 21 | \examples{ 22 | available_values('files','cases.project.project_id')[1:5] 23 | 24 | } 25 | -------------------------------------------------------------------------------- /inst/script/make_sysdata.R: -------------------------------------------------------------------------------- 1 | library(httr) 2 | library(xml2) 3 | 4 | pkghome <- "~/a/GenomicDataCommons" 5 | 6 | url <- 7 | "https://gdc-docs.nci.nih.gov/API/Users_Guide/Appendix_A_Available_Fields/" 8 | xml = content(GET(url)) 9 | 10 | .get_field <- function(xml, xpath) { 11 | fields <- as.character(xml_find_all(xml, xpath)) 12 | Filter(nzchar, trimws(fields)) 13 | } 14 | 15 | .project_fields <- .get_field(xml, 
"//table[1]//tr/td[1]/text()") 16 | .file_fields <- .get_field(xml, "//table[2]//tr/td[1]/text()") 17 | .case_fields <- .get_field(xml, "//table[3]//tr/td[1]/text()") 18 | .annotation_fields <- .get_field(xml, "//table[4]//tr/td[1]/text()") 19 | 20 | save(.project_fields, .file_fields, .case_fields, .annotation_fields, 21 | file=file.path(pkghome, "R", "sysdata.rda")) 22 | -------------------------------------------------------------------------------- /man/results.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/response.R 3 | \name{results} 4 | \alias{results} 5 | \alias{results.GDCQuery} 6 | \alias{results.GDCResponse} 7 | \title{results} 8 | \usage{ 9 | results(x, ...) 10 | 11 | \method{results}{GDCQuery}(x, ...) 12 | 13 | \method{results}{GDCResponse}(x, ...) 14 | } 15 | \arguments{ 16 | \item{x}{a \code{\link{GDCQuery}} object} 17 | 18 | \item{...}{passed on to \code{\link{response}}} 19 | } 20 | \value{ 21 | A (typically nested) \code{list} of GDC records 22 | } 23 | \description{ 24 | results 25 | } 26 | \section{Methods (by class)}{ 27 | \itemize{ 28 | \item \code{results(GDCQuery)}: 29 | 30 | \item \code{results(GDCResponse)}: 31 | 32 | }} 33 | \examples{ 34 | qcases = cases() |> results() 35 | length(qcases) 36 | 37 | } 38 | -------------------------------------------------------------------------------- /R/utilities.R: -------------------------------------------------------------------------------- 1 | .cat0 <- function(..., sep=NULL) 2 | cat(..., sep="") 3 | 4 | .wrapstr <- function(x) 5 | paste(strwrap(paste(x, collapse=", "), indent=4, exdent=4), collapse="\n") 6 | 7 | .dir_validate_or_create <- function(destination_dir) { 8 | stopifnot(is.character(destination_dir), length(destination_dir) == 1L, 9 | nzchar(destination_dir)) 10 | if (!dir.exists(destination_dir)) { 11 | if (!file.exists(destination_dir)) 12 | 
dir.create(destination_dir, recursive = TRUE) 13 | else 14 | stop("'destination_dir' exists but is not a directory") 15 | } 16 | } 17 | 18 | #" (internal) return character(0) instead of NULL 19 | #" 20 | #" Always return a vector and not NULL. 21 | .ifNullCharacterZero <- function(x) { 22 | if(is.null(x)) 23 | return(character(0)) 24 | return(x) 25 | } 26 | -------------------------------------------------------------------------------- /man/results_all.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/response.R 3 | \name{results_all} 4 | \alias{results_all} 5 | \alias{results_all.GDCQuery} 6 | \alias{results_all.GDCResponse} 7 | \title{results_all} 8 | \usage{ 9 | results_all(x) 10 | 11 | \method{results_all}{GDCQuery}(x) 12 | 13 | \method{results_all}{GDCResponse}(x) 14 | } 15 | \arguments{ 16 | \item{x}{a \code{\link{GDCQuery}} object} 17 | } 18 | \value{ 19 | A (typically nested) \code{list} of GDC records 20 | } 21 | \description{ 22 | results_all 23 | } 24 | \section{Methods (by class)}{ 25 | \itemize{ 26 | \item \code{results_all(GDCQuery)}: 27 | 28 | \item \code{results_all(GDCResponse)}: 29 | 30 | }} 31 | \examples{ 32 | # details of all available projects 33 | projResults = projects() |> results_all() 34 | length(projResults) 35 | count(projects()) 36 | 37 | 38 | } 39 | -------------------------------------------------------------------------------- /tests/testthat/test_clinical.R: -------------------------------------------------------------------------------- 1 | test_that("clinical data is structured properly", { 2 | sizen <- 3 3 | case_ids <- cases() |> results(size=sizen) |> ids() 4 | clinical_data <- gdc_clinical(case_ids) 5 | # overview of clinical results 6 | expect_true( 7 | is(clinical_data, "GDCClinicalList") 8 | ) 9 | expect_true( 10 | all( 11 | c("demographic", "diagnoses", "exposures", "follow_ups", "main") 12 | %in% 13 | 
names(clinical_data) 14 | ) 15 | ) 16 | 17 | ## exposures has no rows 18 | clinical_data <- clinical_data[names(clinical_data) != "exposures"] 19 | expect_true( 20 | all( 21 | vapply(clinical_data, nrow, integer(1L)) >= sizen 22 | ) 23 | ) 24 | expect_true( 25 | all( 26 | vapply(clinical_data, is.data.frame, logical(1L)) 27 | ) 28 | ) 29 | }) 30 | -------------------------------------------------------------------------------- /man/grep_fields.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fields.R 3 | \name{grep_fields} 4 | \alias{grep_fields} 5 | \title{Find matching field names} 6 | \usage{ 7 | grep_fields(entity, pattern, ..., value = TRUE) 8 | } 9 | \arguments{ 10 | \item{entity}{one of the available gdc entities ('files','cases',...) 11 | against which to gather available fields for matching} 12 | 13 | \item{pattern}{A regular expression that will be used 14 | in a call to \code{\link{grep}}} 15 | 16 | \item{...}{passed on to grep} 17 | 18 | \item{value}{logical(1) whether to return values as opposed 19 | to indices (passed along to grep)} 20 | } 21 | \value{ 22 | character() vector of field names matching 23 | \code{pattern} 24 | } 25 | \description{ 26 | This utility function allows quick text-based search of available 27 | fields for using \code{\link{grep}} 28 | } 29 | \examples{ 30 | grep_fields('files','analysis') 31 | 32 | } 33 | -------------------------------------------------------------------------------- /man/aggregations.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/response.R 3 | \name{aggregations} 4 | \alias{aggregations} 5 | \alias{aggregations.GDCQuery} 6 | \alias{aggregations.GDCResponse} 7 | \title{aggregations} 8 | \usage{ 9 | aggregations(x) 10 | 11 | \method{aggregations}{GDCQuery}(x) 12 | 13 
| \method{aggregations}{GDCResponse}(x) 14 | } 15 | \arguments{ 16 | \item{x}{a \code{\link{GDCQuery}} object} 17 | } 18 | \value{ 19 | a \code{list} of \code{data.frame} with one 20 | member for each requested facet. The data frames 21 | each have two columns, key and doc_count. 22 | } 23 | \description{ 24 | aggregations 25 | } 26 | \section{Methods (by class)}{ 27 | \itemize{ 28 | \item \code{aggregations(GDCQuery)}: 29 | 30 | \item \code{aggregations(GDCResponse)}: 31 | 32 | }} 33 | \examples{ 34 | # Number of each file type 35 | res = files() |> facet(c('type','data_type')) |> aggregations() 36 | res$type 37 | 38 | } 39 | -------------------------------------------------------------------------------- /man/select.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fields.R 3 | \name{select} 4 | \alias{select} 5 | \alias{select.GDCQuery} 6 | \title{S3 generic to set GDCQuery fields} 7 | \usage{ 8 | select(x, fields) 9 | 10 | \method{select}{GDCQuery}(x, fields) 11 | } 12 | \arguments{ 13 | \item{x}{the objects on which to set fields} 14 | 15 | \item{fields}{a character vector specifying the fields} 16 | } 17 | \value{ 18 | A \code{\link{GDCQuery}} object, with the fields 19 | member altered. 
20 | } 21 | \description{ 22 | S3 generic to set GDCQuery fields 23 | } 24 | \section{Methods (by class)}{ 25 | \itemize{ 26 | \item \code{select(GDCQuery)}: set fields on a GDCQuery object 27 | 28 | }} 29 | \examples{ 30 | gProj = projects() 31 | gProj$fields 32 | head(available_fields(gProj)) 33 | default_fields(gProj) 34 | 35 | gProj |> 36 | select(default_fields(gProj)[1:2]) |> 37 | response() |> 38 | str(max_level=2) 39 | 40 | } 41 | -------------------------------------------------------------------------------- /R/readDNAcopy.R: -------------------------------------------------------------------------------- 1 | #' Read DNAcopy results into GRanges object 2 | #' 3 | #' @param fname The path to a DNAcopy-like file. 4 | #' @param ... passed to \code{\link[readr]{read_tsv}} 5 | #' @return a \code{\link[GenomicRanges]{GRanges}} object 6 | #' 7 | #' @importFrom readr read_tsv 8 | #' @import GenomicRanges 9 | #' @importFrom IRanges IRanges 10 | #' 11 | #' @examples 12 | #' fname = system.file(package='GenomicDataCommons', 13 | #' 'extdata/dnacopy.tsv.gz') 14 | #' dnac = readDNAcopy(fname) 15 | #' class(dnac) 16 | #' length(dnac) 17 | #' 18 | #' @export 19 | readDNAcopy <- function(fname,...) { 20 | stopifnot(file.exists(fname)) 21 | res = read_tsv(fname,...) 
22 | stopifnot(ncol(res)==6) 23 | return(GRanges(seqnames=res[[2]], 24 | ranges=IRanges(start=res[[3]],end=res[[4]]), 25 | sampleName = res[[1]], 26 | Num_Probes = res[[5]], 27 | value = res[[6]])) 28 | } 29 | -------------------------------------------------------------------------------- /man/expand.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/expand.R 3 | \name{expand} 4 | \alias{expand} 5 | \alias{expand.GDCQuery} 6 | \title{Set the \code{expand} parameter} 7 | \usage{ 8 | expand(x, expand) 9 | 10 | \method{expand}{GDCQuery}(x, expand) 11 | } 12 | \arguments{ 13 | \item{x}{the objects on which to set fields} 14 | 15 | \item{expand}{a character vector specifying the fields} 16 | } 17 | \value{ 18 | A \code{\link{GDCQuery}} object, with the \code{expand} 19 | member altered. 20 | } 21 | \description{ 22 | S3 generic to set GDCQuery expand parameter 23 | } 24 | \section{Methods (by class)}{ 25 | \itemize{ 26 | \item \code{expand(GDCQuery)}: set expand fields on a GDCQuery object 27 | 28 | }} 29 | \examples{ 30 | gProj = projects() 31 | gProj$fields 32 | head(available_fields(gProj)) 33 | default_fields(gProj) 34 | 35 | gProj |> 36 | select(default_fields(gProj)[1:2]) |> 37 | response() |> 38 | str(max_level=2) 39 | 40 | } 41 | -------------------------------------------------------------------------------- /man/count.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/response.R 3 | \name{count} 4 | \alias{count} 5 | \alias{count.GDCQuery} 6 | \alias{count.GDCResponse} 7 | \title{provide count of records in a \code{\link{GDCQuery}}} 8 | \usage{ 9 | count(x, ...) 10 | 11 | \method{count}{GDCQuery}(x, ...) 12 | 13 | \method{count}{GDCResponse}(x, ...) 
14 | } 15 | \arguments{ 16 | \item{x}{a \code{\link{GDCQuery}} object} 17 | 18 | \item{...}{passed to httr (good for passing config info, etc.)} 19 | } 20 | \value{ 21 | integer(1) representing the count of records that will 22 | be returned by the current query 23 | } 24 | \description{ 25 | provide count of records in a \code{\link{GDCQuery}} 26 | } 27 | \section{Methods (by class)}{ 28 | \itemize{ 29 | \item \code{count(GDCQuery)}: 30 | 31 | \item \code{count(GDCResponse)}: 32 | 33 | }} 34 | \examples{ 35 | # total number of projects 36 | projects() |> count() 37 | 38 | # total number of cases 39 | cases() |> count() 40 | 41 | } 42 | -------------------------------------------------------------------------------- /man/default_fields.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fields.R 3 | \name{default_fields} 4 | \alias{default_fields} 5 | \alias{default_fields.character} 6 | \alias{default_fields.GDCQuery} 7 | \title{S3 Generic to return default GDC fields} 8 | \usage{ 9 | default_fields(x) 10 | 11 | \method{default_fields}{character}(x) 12 | 13 | \method{default_fields}{GDCQuery}(x) 14 | } 15 | \arguments{ 16 | \item{x}{A character string ('cases','files','projects', 17 | 'annotations') or a subclass of \code{\link{GDCQuery}}.} 18 | } 19 | \value{ 20 | a character vector of the default fields 21 | } 22 | \description{ 23 | S3 Generic to return default GDC fields 24 | } 25 | \section{Methods (by class)}{ 26 | \itemize{ 27 | \item \code{default_fields(character)}: character method 28 | 29 | \item \code{default_fields(GDCQuery)}: GDCQuery method 30 | 31 | }} 32 | \examples{ 33 | default_fields('projects') 34 | projQuery = query('projects') 35 | default_fields(projQuery) 36 | 37 | } 38 | -------------------------------------------------------------------------------- /tests/testthat/test_data.R: 
-------------------------------------------------------------------------------- 1 | library(GenomicDataCommons) 2 | context('data handling') 3 | 4 | case_ids <- cases() |> results(size=10) |> ids() 5 | 6 | test_that("manifest files", { 7 | m <- manifest(files(), size = 10) 8 | expect_identical(nrow(m), 10L) 9 | expect_true(ncol(m) > 5) 10 | }) 11 | 12 | test_that("write_manifest", { 13 | m = files() |> manifest(size=10) 14 | tf = tempfile() 15 | write_manifest(m, tf) 16 | expect_true(file.exists(tf)) 17 | unlink(tf) 18 | }) 19 | 20 | test_that("gdcdata", { 21 | d = tempfile() 22 | if (!dir.exists(d)) 23 | dir.create(d) 24 | gdc_set_cache(d) 25 | 26 | few_file_ids = files() |> 27 | filter( ~ cases.project.project_id == 'TCGA-SARC' & 28 | data_type == 'Copy Number Segment' & 29 | analysis.workflow_type == 'DNAcopy') |> results(size=2) |> ids() 30 | 31 | res = gdcdata(few_file_ids) 32 | expect_length(res, 2) 33 | expect_named(res) 34 | unlink(d, recursive = TRUE) 35 | }) 36 | -------------------------------------------------------------------------------- /man/available_fields.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fields.R 3 | \name{available_fields} 4 | \alias{available_fields} 5 | \alias{available_fields.GDCQuery} 6 | \alias{available_fields.character} 7 | \title{S3 Generic to return all GDC fields} 8 | \usage{ 9 | available_fields(x) 10 | 11 | \method{available_fields}{GDCQuery}(x) 12 | 13 | \method{available_fields}{character}(x) 14 | } 15 | \arguments{ 16 | \item{x}{A character(1) string ('cases','files','projects', 17 | 'annotations') or a subclass of \code{\link{GDCQuery}}.} 18 | } 19 | \value{ 20 | a character vector of the available fields 21 | } 22 | \description{ 23 | S3 Generic to return all GDC fields 24 | } 25 | \section{Methods (by class)}{ 26 | \itemize{ 27 | \item \code{available_fields(GDCQuery)}: GDCQuery method 28 | 
29 | \item \code{available_fields(character)}: character method 30 | 31 | }} 32 | \examples{ 33 | available_fields('projects') 34 | projQuery = query('projects') 35 | available_fields(projQuery) 36 | 37 | } 38 | -------------------------------------------------------------------------------- /man/entity_name.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/entity_name.R 3 | \name{entity_name} 4 | \alias{entity_name} 5 | \alias{entity_name.GDCQuery} 6 | \alias{entity_name.GDCResults} 7 | \title{Get the entity name from a GDCQuery object} 8 | \usage{ 9 | entity_name(x) 10 | 11 | \method{entity_name}{GDCQuery}(x) 12 | 13 | \method{entity_name}{GDCResults}(x) 14 | } 15 | \arguments{ 16 | \item{x}{a \code{\link{GDCQuery}} object} 17 | } 18 | \value{ 19 | character(1) name of an associated entity; one of 20 | "cases", "files", "projects", "annotations". 21 | } 22 | \description{ 23 | An "entity" is simply one of the four metadata endpoints. 24 | \itemize{ 25 | \item{cases} 26 | \item{projects} 27 | \item{files} 28 | \item{annotations} 29 | } 30 | All \code{\link{GDCQuery}} objects will have an entity name. This S3 method 31 | is simply a utility accessor for those names. 
32 | } 33 | \examples{ 34 | qcases = cases() 35 | qprojects = projects() 36 | 37 | entity_name(qcases) 38 | entity_name(qprojects) 39 | 40 | } 41 | -------------------------------------------------------------------------------- /man/write_manifest.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/manifest.R 3 | \name{write_manifest} 4 | \alias{write_manifest} 5 | \title{write a manifest data.frame to disk} 6 | \usage{ 7 | write_manifest(manifest, destfile = tempfile()) 8 | } 9 | \arguments{ 10 | \item{manifest}{A data.frame with five columns, typically 11 | created by a call to \code{\link{manifest}}} 12 | 13 | \item{destfile}{The filename for saving the manifest.} 14 | } 15 | \value{ 16 | character(1) the destination file name. 17 | } 18 | \description{ 19 | The \code{\link{manifest}} method creates a data.frame 20 | that represents the data for a manifest file needed 21 | by the GDC Data Transfer Tool. While the file format 22 | is nothing special, this is a simple helper function 23 | to write a manifest data.frame to disk. It returns 24 | the path to which the file is written, so it can 25 | be used "in-line" in a call to \code{\link{transfer}}. 26 | } 27 | \examples{ 28 | mf = files() |> manifest(size=10) 29 | write_manifest(mf) 30 | 31 | } 32 | -------------------------------------------------------------------------------- /man/mapping.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mapping.R 3 | \name{mapping} 4 | \alias{mapping} 5 | \title{Query GDC for available endpoint fields} 6 | \usage{ 7 | mapping(endpoint) 8 | } 9 | \arguments{ 10 | \item{endpoint}{character(1) corresponding to endpoints for which 11 | users may specify additional or alternative fields. 
Endpoints 12 | include \dQuote{projects}, \dQuote{cases}, \dQuote{files}, and 13 | \dQuote{annotations}.} 14 | } 15 | \value{ 16 | A data frame describing the field (field name), full (full 17 | data model name), type (data type), and four additional columns 18 | describing the "set" to which the fields belong--\dQuote{default}, 19 | \dQuote{expand}, \dQuote{multi}, and \dQuote{nested}. 20 | } 21 | \description{ 22 | Query GDC for available endpoint fields 23 | } 24 | \examples{ 25 | map <- mapping("projects") 26 | head(map) 27 | # get only the "default" fields 28 | subset(map,defaults) 29 | # And get just the text names of the "default" fields 30 | subset(map,defaults)$field 31 | 32 | } 33 | -------------------------------------------------------------------------------- /man/available_expand.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/expand.R 3 | \name{available_expand} 4 | \alias{available_expand} 5 | \alias{available_expand.character} 6 | \alias{available_expand.GDCQuery} 7 | \title{Return valid values for "expand"} 8 | \usage{ 9 | available_expand(entity) 10 | 11 | \method{available_expand}{character}(entity) 12 | 13 | \method{available_expand}{GDCQuery}(entity) 14 | } 15 | \arguments{ 16 | \item{entity}{Either a \code{\link{GDCQuery}} object 17 | or a character(1) specifying a GDC entity ('cases', 'files', 18 | 'annotations', 'projects')} 19 | } 20 | \value{ 21 | A character vector 22 | } 23 | \description{ 24 | The GDC allows a shorthand for specifying groups 25 | of fields to be returned by the metadata queries. 26 | These can be specified in a \code{\link{select}} 27 | method call to easily supply groups of fields. 
28 | } 29 | \examples{ 30 | head(available_expand('files')) 31 | 32 | } 33 | \seealso{ 34 | See \url{https://docs.gdc.cancer.gov/API/Users_Guide/Search_and_Retrieval/#expand} 35 | for details 36 | } 37 | -------------------------------------------------------------------------------- /man/ids.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ids.R 3 | \name{ids} 4 | \alias{ids} 5 | \alias{ids.GDCManifest} 6 | \alias{ids.GDCQuery} 7 | \alias{ids.GDCResults} 8 | \alias{ids.GDCResponse} 9 | \title{Get the ids associated with a GDC query or response} 10 | \usage{ 11 | ids(x) 12 | 13 | \method{ids}{GDCManifest}(x) 14 | 15 | \method{ids}{GDCQuery}(x) 16 | 17 | \method{ids}{GDCResults}(x) 18 | 19 | \method{ids}{GDCResponse}(x) 20 | } 21 | \arguments{ 22 | \item{x}{A \code{\link{GDCQuery}} or \code{\link{GDCResponse}} object} 23 | } 24 | \value{ 25 | a character vector of all the entity ids 26 | } 27 | \description{ 28 | The GDC assigns ids (in the form of uuids) to objects in its database. Those 29 | ids can be used for relationships, searching on the website, and as 30 | unique ids. 
All 31 | } 32 | \examples{ 33 | # use with a GDC query, in this case for "cases" 34 | ids(cases() |> filter(~ project.project_id == "TCGA-CHOL")) 35 | # also works for responses 36 | ids(response(files())) 37 | # and results 38 | ids(results(cases())) 39 | 40 | 41 | } 42 | -------------------------------------------------------------------------------- /man/make_filter.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/filters.R 3 | \name{make_filter} 4 | \alias{make_filter} 5 | \title{Create NCI GDC filters for limiting GDC query results} 6 | \usage{ 7 | make_filter(expr, available_fields) 8 | } 9 | \arguments{ 10 | \item{expr}{a lazy-wrapped expression or a formula RHS equivalent} 11 | 12 | \item{available_fields}{a character vector of the 13 | additional names that will be injected into the 14 | filter evaluation environment} 15 | } 16 | \value{ 17 | a \code{list} that represents an R version 18 | of the JSON that will ultimately be used in an 19 | NCI GDC search or other query. 20 | } 21 | \description{ 22 | Searching the NCI GDC allows for complex filtering based 23 | on logical operations and simple comparisons. This function 24 | facilitates writing such filter expressions in R-like syntax 25 | with R code evaluation. 26 | } 27 | \details{ 28 | If used with available_fields, "bare" fields that are 29 | named in the available_fields character vector can be used 30 | in the filter expression without quotes. 
31 | } 32 | -------------------------------------------------------------------------------- /man/id_field.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ids.R 3 | \name{id_field} 4 | \alias{id_field} 5 | \alias{id_field.GDCQuery} 6 | \alias{id_field.GDCResults} 7 | \title{get the name of the id field} 8 | \usage{ 9 | id_field(x) 10 | 11 | \method{id_field}{GDCQuery}(x) 12 | 13 | \method{id_field}{GDCResults}(x) 14 | } 15 | \arguments{ 16 | \item{x}{An object representing the query or results 17 | of an entity from the GDC ("cases", "files", "annotations", "projects")} 18 | } 19 | \value{ 20 | character(1) such as "case_id", "file_id", etc. 21 | } 22 | \description{ 23 | In many places in the GenomicDataCommons package, 24 | the entity ids are stored in a column or a vector 25 | with a specific name that corresponds to the field name 26 | at the GDC. The format is the entity name (singular) "_id". 27 | This generic simply returns that name from a given object. 28 | } 29 | \section{Methods (by class)}{ 30 | \itemize{ 31 | \item \code{id_field(GDCQuery)}: GDCQuery method 32 | 33 | \item \code{id_field(GDCResults)}: GDCResults method 34 | 35 | }} 36 | \examples{ 37 | id_field(cases()) 38 | 39 | } 40 | -------------------------------------------------------------------------------- /R/entity_name.R: -------------------------------------------------------------------------------- 1 | #' Get the entity name from a GDCQuery object 2 | #' 3 | #' An "entity" is simply one of the four metadata endpoints. 4 | #' \itemize{ 5 | #' \item{cases} 6 | #' \item{projects} 7 | #' \item{files} 8 | #' \item{annotations} 9 | #' } 10 | #' All \code{\link{GDCQuery}} objects will have an entity name. This S3 method 11 | #' is simply a utility accessor for those names.
12 | #' 13 | #' @param x a \code{\link{GDCQuery}} or \code{GDCResults} object 14 | #' 15 | #' @return character(1) name of an associated entity; one of 16 | #' "cases", "files", "projects", "annotations". 17 | #' 18 | #' @examples 19 | #' qcases = cases() 20 | #' qprojects = projects() 21 | #' 22 | #' entity_name(qcases) 23 | #' entity_name(qprojects) 24 | #' 25 | #' @export 26 | entity_name = function(x) { 27 | UseMethod('entity_name',x) # S3 generic: dispatch on the class of x 28 | } 29 | 30 | 31 | #' @rdname entity_name 32 | #' @export 33 | entity_name.GDCQuery = function(x) { 34 | cls = class(x)[1] # the first class entry carries the entity name 35 | return(substr(cls,5,nchar(cls))) # drop the first four characters, e.g. "gdc_cases" -> "cases" 36 | } 37 | 38 | #' @rdname entity_name 39 | #' @export 40 | entity_name.GDCResults = function(x) { 41 | cls = class(x)[1] # the first class entry carries the entity name 42 | return(substr(cls,4,nchar(cls)-8)) # keeps characters 4..(n-8); NOTE(review): presumably strips a "GDC" prefix and an 8-character suffix -- confirm against the class names assigned where results objects are built 43 | } 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /man/gdc_client.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bulk_transfer.R 3 | \name{gdc_client} 4 | \alias{gdc_client} 5 | \title{return gdc-client executable path} 6 | \usage{ 7 | gdc_client() 8 | } 9 | \value{ 10 | character(1) the path to the gdc-client executable. 11 | } 12 | \description{ 13 | This function is a convenience function to 14 | find and return the path to the GDC Data Transfer 15 | Tool executable assumed to be named 'gdc-client'. 16 | The assumption is that the appropriate version of the 17 | GDC Data Transfer Tool is a separate download available 18 | from \href{the GDC website}{https://gdc.cancer.gov/access-data/gdc-data-transfer-tool} 19 | and as a backup from \href{on github}{https://github.com/NCI-GDC/gdc-client}.
20 | } 21 | \details{ 22 | The path is checked in the following order: 23 | \enumerate{ 24 | \item an R option("gdc_client") 25 | \item an environment variable GDC_CLIENT 26 | \item from the search PATH 27 | \item in the current working directory 28 | } 29 | } 30 | \examples{ 31 | # this cannot run without first 32 | # downloading the GDC Data Transfer Tool 33 | gdc_client = try(gdc_client(),silent=TRUE) 34 | 35 | } 36 | -------------------------------------------------------------------------------- /man/readHTSeqFile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/readHTSeqFile.R 3 | \name{readHTSeqFile} 4 | \alias{readHTSeqFile} 5 | \title{Read a single htseq-counts result file.} 6 | \usage{ 7 | readHTSeqFile(fname, samplename = "sample", ...) 8 | } 9 | \arguments{ 10 | \item{fname}{character(1), the path of the htseq-count file.} 11 | 12 | \item{samplename}{character(1), the name of the sample. This will 13 | become the name of the second column on the resulting 14 | \code{data.frame}, making for easier merging if necessary.} 15 | 16 | \item{...}{passed to \code{\link[readr]{read_tsv})}} 17 | } 18 | \value{ 19 | a two-column data frame 20 | } 21 | \description{ 22 | The htseq package is used extensively to count reads 23 | relative to regions (see 24 | \url{http://www-huber.embl.de/HTSeq/doc/counting.html}). 25 | The output of htseq-count is a simple two-column table 26 | that includes features in column 1 and counts in column 2. 27 | This function simply reads in the data from one such file 28 | and assigns column names. 
29 | } 30 | \examples{ 31 | fname = system.file(package='GenomicDataCommons', 32 | 'extdata/example.htseq.counts.gz') 33 | dat = readHTSeqFile(fname) 34 | head(dat) 35 | 36 | } 37 | -------------------------------------------------------------------------------- /man/faceting.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/facets.R 3 | \name{facet} 4 | \alias{facet} 5 | \alias{get_facets} 6 | \alias{get_facets.GDCQuery} 7 | \title{Set facets for a \code{\link{GDCQuery}}} 8 | \usage{ 9 | facet(x, facets) 10 | 11 | get_facets(x) 12 | 13 | \method{get_facets}{GDCQuery}(x) 14 | } 15 | \arguments{ 16 | \item{x}{a \code{\link{GDCQuery}} object} 17 | 18 | \item{facets}{a character vector of fields that 19 | will be used for forming aggregations (facets). 20 | Default is to set facets for all default fields. 21 | See \code{\link{default_fields}} for details} 22 | } 23 | \value{ 24 | returns a \code{\link{GDCQuery}} object, 25 | with facets field updated. 
26 | } 27 | \description{ 28 | Set facets for a \code{\link{GDCQuery}} 29 | 30 | Get facets for a \code{\link{GDCQuery}} 31 | } 32 | \examples{ 33 | # create a new GDCQuery against the projects endpoint 34 | gProj = projects() 35 | 36 | # default facets are NULL 37 | get_facets(gProj) 38 | 39 | # set facets and save result 40 | gProjFacet = facet(gProj) 41 | 42 | # check facets 43 | get_facets(gProjFacet) 44 | 45 | # and get a response, noting that 46 | # the aggregations list member contains 47 | # tibbles for each facet 48 | str(response(gProjFacet,size=2),max.level=2) 49 | 50 | } 51 | -------------------------------------------------------------------------------- /man/response.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/response.R 3 | \name{response} 4 | \alias{response} 5 | \alias{GDCResponse} 6 | \alias{response.GDCQuery} 7 | \alias{response_all} 8 | \title{Fetch \code{\link{GDCQuery}} metadata from GDC} 9 | \usage{ 10 | response(x, ...) 11 | 12 | \method{response}{GDCQuery}(x, from = 0, size = 10, ..., response_handler = jsonlite::fromJSON) 13 | 14 | response_all(x, ...) 15 | } 16 | \arguments{ 17 | \item{x}{a \code{\link{GDCQuery}} object} 18 | 19 | \item{...}{passed to httr (good for passing config info, etc.)} 20 | 21 | \item{from}{integer index from which to start returning data} 22 | 23 | \item{size}{number of records to return} 24 | 25 | \item{response_handler}{a function that processes JSON (as text) 26 | and returns an R object. 
Default is \code{\link[jsonlite]{fromJSON}}.} 27 | } 28 | \value{ 29 | A \code{GDCResponse} object which is a list with the following 30 | members: 31 | \itemize{ 32 | \item{results} 33 | \item{query} 34 | \item{aggregations} 35 | \item{pages} 36 | } 37 | } 38 | \description{ 39 | Fetch \code{\link{GDCQuery}} metadata from GDC 40 | } 41 | \examples{ 42 | 43 | # basic class stuff 44 | gCases = cases() 45 | resp = response(gCases) 46 | class(resp) 47 | names(resp) 48 | 49 | # And results from query 50 | resp$results[[1]] 51 | 52 | } 53 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: GenomicDataCommons 2 | Type: Package 3 | Title: NIH / NCI Genomic Data Commons Access 4 | Description: Programmatically access the NIH / NCI Genomic Data Commons 5 | RESTful service. 6 | Version: 1.35.1 7 | Date: 2025-05-12 8 | Authors@R: c( person("Martin", "Morgan", 9 | email="martin.morgan@roswellpark.org", role=c("aut")), 10 | person("Sean", "Davis", email="seandavi@gmail.com", 11 | role=c("aut", "cre")), 12 | person("Marcel", "Ramos", 13 | email = "marcel.ramos@sph.cuny.edu", role = "ctb")) 14 | License: Artistic-2.0 15 | Depends: R (>= 4.1.0) 16 | Imports: stats, httr, xml2, jsonlite, utils, rlang, readr, 17 | GenomicRanges, IRanges, dplyr, rappdirs, tibble, tidyr 18 | Suggests: BiocStyle, knitr, rmarkdown, DT, testthat, listviewer, 19 | ggplot2, GenomicAlignments, Rsamtools, BiocParallel, 20 | TxDb.Hsapiens.UCSC.hg38.knownGene, 21 | VariantAnnotation, maftools, R.utils, data.table 22 | biocViews: DataImport, Sequencing 23 | URL: https://bioconductor.org/packages/GenomicDataCommons, 24 | http://github.com/Bioconductor/GenomicDataCommons, 25 | http://bioconductor.github.io/GenomicDataCommons/ 26 | BugReports: 27 | https://github.com/Bioconductor/GenomicDataCommons/issues/new 28 | Encoding: UTF-8 29 | VignetteBuilder: knitr 30 | RoxygenNote: 
7.3.2 31 | -------------------------------------------------------------------------------- /man/field_description.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fields.R 3 | \name{field_description} 4 | \alias{field_description} 5 | \alias{field_description.GDCQuery} 6 | \alias{field_description.character} 7 | \title{S3 Generic that returns the field description text, if available} 8 | \usage{ 9 | field_description(entity, field) 10 | 11 | \method{field_description}{GDCQuery}(entity, field) 12 | 13 | \method{field_description}{character}(entity, field) 14 | } 15 | \arguments{ 16 | \item{entity}{character(1) string ('cases','files','projects', 17 | 'annotations', etc.) or an subclass of \code{\link{GDCQuery}}.} 18 | 19 | \item{field}{character(1), the name of the field that will be used to look 20 | up the description.} 21 | } 22 | \value{ 23 | character(1) descriptive text or character(0) if no description 24 | is available. 
25 | } 26 | \description{ 27 | S3 Generic that returns the field description text, if available 28 | } 29 | \section{Methods (by class)}{ 30 | \itemize{ 31 | \item \code{field_description(GDCQuery)}: GDCQuery method 32 | 33 | \item \code{field_description(character)}: character method 34 | 35 | }} 36 | \examples{ 37 | field_description('cases', 'annotations.category') 38 | casesQuery = query('cases') 39 | field_description(casesQuery, 'annotations.category') 40 | field_description(cases(), 'annotations.category') 41 | 42 | } 43 | -------------------------------------------------------------------------------- /man/GenomicDataCommons-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/GenomicDataCommons-package.R 3 | \docType{package} 4 | \name{GenomicDataCommons-package} 5 | \alias{GenomicDataCommons} 6 | \alias{GenomicDataCommons-package} 7 | \title{GenomicDataCommons: A package for interfacing with the NCI GDC} 8 | \description{ 9 | Programmatically access the NIH / NCI Genomic Data Commons RESTful service. 
10 | } 11 | \section{finding data}{ 12 | 13 | 14 | \itemize{ 15 | \item{\code{\link{query}}} 16 | \item{\code{\link{cases}}} 17 | \item{\code{\link{projects}}} 18 | \item{\code{\link{files}}} 19 | \item{\code{\link{annotations}}} 20 | \item{\code{\link{mapping}}} 21 | } 22 | } 23 | 24 | \section{downloading data}{ 25 | 26 | data 27 | } 28 | 29 | \seealso{ 30 | Useful links: 31 | \itemize{ 32 | \item \url{https://bioconductor.org/packages/GenomicDataCommons} 33 | \item \url{http://github.com/Bioconductor/GenomicDataCommons} 34 | \item \url{http://bioconductor.github.io/GenomicDataCommons/} 35 | \item Report bugs at \url{https://github.com/Bioconductor/GenomicDataCommons/issues/new} 36 | } 37 | 38 | } 39 | \author{ 40 | \strong{Maintainer}: Sean Davis \email{seandavi@gmail.com} 41 | 42 | Authors: 43 | \itemize{ 44 | \item Martin Morgan \email{martin.morgan@roswellpark.org} 45 | } 46 | 47 | Other contributors: 48 | \itemize{ 49 | \item Marcel Ramos \email{marcel.ramos@sph.cuny.edu} [contributor] 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /R/facets.R: -------------------------------------------------------------------------------- 1 | #' Set facets for a \code{\link{GDCQuery}} 2 | #' 3 | #' @param x a \code{\link{GDCQuery}} object 4 | #' @param facets a character vector of fields that 5 | #' will be used for forming aggregations (facets). 6 | #' Default is to set facets for all default fields. 7 | #' See \code{\link{default_fields}} for details 8 | #' 9 | #' @return returns a \code{\link{GDCQuery}} object, 10 | #' with facets field updated. 
11 | #' 12 | #' @rdname faceting 13 | #' 14 | #' @examples 15 | #' # create a new GDCQuery against the projects endpoint 16 | #' gProj = projects() 17 | #' 18 | #' # default facets are NULL 19 | #' get_facets(gProj) 20 | #' 21 | #' # set facets and save result 22 | #' gProjFacet = facet(gProj) 23 | #' 24 | #' # check facets 25 | #' get_facets(gProjFacet) 26 | #' 27 | #' # and get a response, noting that 28 | #' # the aggregations list member contains 29 | #' # tibbles for each facet 30 | #' str(response(gProjFacet,size=2),max.level=2) 31 | #' 32 | #' @export 33 | facet = function(x,facets) { 34 | UseMethod('facet',x) # S3 generic: dispatch on the class of x 35 | } 36 | 37 | 38 | #' @export 39 | facet.GDCQuery = function(x,facets=default_fields(x)) { 40 | x$facets = facets # replaces whatever was stored in the facets element before 41 | return(x) # the modified copy is returned; the caller's object is not changed in place 42 | } 43 | 44 | #' Get facets for a \code{\link{GDCQuery}} 45 | #' 46 | #' @rdname faceting 47 | #' 48 | #' @export 49 | get_facets = function(x) { 50 | UseMethod('get_facets',x) # S3 generic: dispatch on the class of x 51 | } 52 | 53 | #' @rdname faceting 54 | #' 55 | #' @export 56 | get_facets.GDCQuery = function(x) { 57 | return(x$facets) # the facets element as stored on the query object 58 | } 59 | -------------------------------------------------------------------------------- /R/readHTSeqFile.R: -------------------------------------------------------------------------------- 1 | #' Read a single htseq-counts result file. 2 | #' 3 | #' The htseq package is used extensively to count reads 4 | #' relative to regions (see 5 | #' \url{http://www-huber.embl.de/HTSeq/doc/counting.html}). 6 | #' The output of htseq-count is a simple two-column table 7 | #' that includes features in column 1 and counts in column 2. 8 | #' This function simply reads in the data from one such file 9 | #' and assigns column names. 10 | #' 11 | #' @param fname character(1), the path of the htseq-count file. 12 | #' @param samplename character(1), the name of the sample. This will 13 | #' become the name of the second column on the resulting 14 | #' \code{data.frame}, making for easier merging if necessary. 15 | #' @param ...
passed to \code{\link[readr]{read_tsv}} 16 | #' @return a two-column data frame 17 | #' 18 | #' @examples 19 | #' fname = system.file(package='GenomicDataCommons', 20 | #' 'extdata/example.htseq.counts.gz') 21 | #' dat = readHTSeqFile(fname) 22 | #' head(dat) 23 | #' 24 | #' @export 25 | readHTSeqFile <- function(fname, samplename = 'sample', ...) { 26 | if(!(is.character(fname) && length(fname) == 1)) # validate type and length before touching the file system 27 | stop('fname must be of type character(1)') 28 | if(!file.exists(fname)) 29 | stop(sprintf('The specified file, %s, does not exist',fname)) 30 | tmp = read_tsv(fname,col_names = FALSE,...) # forward ... to read_tsv, as documented for the ... parameter 31 | if(ncol(tmp) != 2) 32 | stop(sprintf('%s had %d columns, expected 2 columns',fname, ncol(tmp))) 33 | colnames(tmp) = c('feature',samplename) 34 | tmp 35 | } 36 | 37 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | ## Changes in version 1.32.0 2 | 3 | ### Bug fixes and minor improvements 4 | 5 | * Minor updates to unit tests and GitHub Actions 6 | 7 | ## Changes in version 1.30.0 8 | 9 | ### New features 10 | 11 | * `gdc_clinical` includes clinical data from the 12 | `cases.follow_ups.other_clinical_attributes` entity (@LiNk-NY). 13 | 14 | ### Bug fixes and minor improvements 15 | 16 | * Removed legacy function, methods, endpoints, and arguments (@LiNk-NY) 17 | * Use native pipe `|>` instead of `magrittr::%>%` (@LiNk-NY) 18 | 19 | ## Changes in version 1.28.0 20 | 21 | ### Bug fixes and minor improvements 22 | 23 | * Defunct legacy function, methods, endpoints, and arguments (@LiNk-NY) 24 | 25 | ## Changes in version 1.26.0 26 | 27 | ### New features 28 | 29 | * The GDC API has deprecated the legacy endpoint (#110, @LiNk-NY) 30 | 31 | ## Changes in version 1.24.0 32 | 33 | ### Bug fixes and minor improvements 34 | 35 | * `gdc_clinical` handles `NULL` responses when diagnoses are not available for 36 | all IDs queried (#109, @zx8754).
37 | * Minor updates to somatic mutations vignette and unit tests. 38 | 39 | ## Changes in version 1.20.0 40 | 41 | ### New features 42 | 43 | * `gdcdata` has an ellipses argument to download data from the legacy archive, 44 | e.g., `legacy = TRUE` (#84, @LiNk-NY) 45 | * `missing` (`is MISSING`) and `!missing` (`NOT MISSING`) operations implemented 46 | for filtering queries, see vignette (#96, @LiNk-NY) 47 | * `gdc-client` version can be validated against last known good version based on 48 | data release (#99, @LiNk-NY) 49 | 50 | ### Bug fixes and minor improvements 51 | 52 | * `gdc_clinical` uses `readr::type_convert` to handle columns with inconsistent 53 | types from the API. 54 | * update examples in documentation and vignette based on new data release 55 | -------------------------------------------------------------------------------- /man/manifest.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/manifest.R 3 | \name{manifest} 4 | \alias{manifest} 5 | \alias{manifest.gdc_files} 6 | \alias{manifest.GDCfilesResponse} 7 | \alias{manifest.GDCcasesResponse} 8 | \title{Prepare GDC manifest file for bulk download} 9 | \usage{ 10 | manifest(x, from = 0, size = count(x), ...) 11 | 12 | \method{manifest}{gdc_files}(x, from = 0, size = count(x), ...) 13 | 14 | \method{manifest}{GDCfilesResponse}(x, from = 0, size = count(x), ...) 15 | 16 | \method{manifest}{GDCcasesResponse}(x, from = 0, size = count(x), ...) 17 | } 18 | \arguments{ 19 | \item{x}{An \code{\link{GDCQuery}} object of subclass "gdc_files" or "gdc_cases".} 20 | 21 | \item{from}{Record number from which to start when returning the manifest.} 22 | 23 | \item{size}{The total number of records to return. 
Default 24 | will return the usually desirable full set of records.} 25 | 26 | \item{...}{passed to \code{\link[httr]{PUT}}.} 27 | } 28 | \value{ 29 | A \code{\link[tibble]{tibble}}, also of type "gdc_manifest", with five columns: 30 | \itemize{ 31 | \item{id} 32 | \item{filename} 33 | \item{md5} 34 | \item{size} 35 | \item{state} 36 | } 37 | } 38 | \description{ 39 | The \code{manifest} function/method creates a manifest of files to be downloaded 40 | using the GDC Data Transfer Tool. There are methods for 41 | creating manifest data frames from \code{\link{GDCQuery}} objects 42 | that contain file information ("cases" and "files" queries). 43 | } 44 | \section{Methods (by class)}{ 45 | \itemize{ 46 | \item \code{manifest(gdc_files)}: 47 | 48 | \item \code{manifest(GDCfilesResponse)}: 49 | 50 | \item \code{manifest(GDCcasesResponse)}: 51 | 52 | }} 53 | \examples{ 54 | gFiles = files() 55 | shortManifest = gFiles |> manifest(size=10) 56 | head(shortManifest,n=3) 57 | 58 | 59 | } 60 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | ## Running tests 2 | 3 | ```{r} 4 | devtools::test() 5 | ``` 6 | 7 | Should also run under `R CMD BiocCheck/check`. 8 | 9 | ## Tests 10 | 11 | A test file lives in tests/testthat/. Its name must start with test. 
Here’s an example of a test file from the stringr package: 12 | 13 | ```{r} 14 | library(stringr) 15 | context("String length") 16 | 17 | test_that("str_length is number of characters", { 18 | expect_equal(str_length("a"), 1) 19 | expect_equal(str_length("ab"), 2) 20 | expect_equal(str_length("abc"), 3) 21 | }) 22 | 23 | test_that("str_length of factor is length of level", { 24 | expect_equal(str_length(factor("a")), 1) 25 | expect_equal(str_length(factor("ab")), 2) 26 | expect_equal(str_length(factor("abc")), 3) 27 | }) 28 | 29 | test_that("str_length of missing is missing", { 30 | expect_equal(str_length(NA), NA_integer_) 31 | expect_equal(str_length(c(NA, 1)), c(NA, 1)) 32 | expect_equal(str_length("NA"), 2) 33 | }) 34 | ``` 35 | 36 | Tests are organised hierarchically: expectations are grouped into tests which are organised in files: 37 | 38 | An expectation is the atom of testing. It describes the expected result of a computation: Does it have the right value and right class? Does it produce error messages when it should? An expectation automates visual checking of results in the console. Expectations are functions that start with expect_. 39 | 40 | A test groups together multiple expectations to test the output from a simple function, a range of possibilities for a single parameter from a more complicated function, or tightly related functionality from across multiple functions. This is why they are sometimes called unit as they test one unit of functionality. A test is created with test_that() . 41 | 42 | A file groups together multiple related tests. Files are given a human readable name with context(). 
43 | -------------------------------------------------------------------------------- /man/gdc_token.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gdc_token.R 3 | \name{gdc_token} 4 | \alias{gdc_token} 5 | \title{return a gdc token from file or environment} 6 | \usage{ 7 | gdc_token() 8 | } 9 | \value{ 10 | character(1) (invisibly, to protect against inadvertently printing) the GDC token. 11 | } 12 | \description{ 13 | The GDC requires an auth token for downloading 14 | data that are "controlled access". For example, 15 | BAM files for human datasets, germline variant calls, 16 | and SNP array raw data all are protected as "controlled 17 | access". For these files, a GDC access token is required. 18 | See the \href{details on the GDC authentication and token information}{https://docs.gdc.cancer.gov/Data_Portal/Users_Guide/Authentication/#gdc-authentication-tokens}. 19 | Note that this function simply returns a string value. 20 | It is possible to keep the GDC token in a variable in R 21 | or to pass a string directly to the appropriate parameter. 22 | This function is simply a convenience function for alternative 23 | approaches to get a token from an environment variable 24 | or a file. 25 | } 26 | \details{ 27 | This function will resolve locations of the GDC token in the 28 | following order: 29 | \itemize{ 30 | \item{from the environment variable, \code{GDC_TOKEN}, expected to 31 | contain the token downloaded from the GDC as a string} 32 | \item{using \code{readLines} to read a file named in the environment 33 | variable, \code{GDC_TOKEN_FILE}} 34 | \item{using \code{readLines} to read from a file called \code{.gdc_token} in the user's 35 | home directory} 36 | } 37 | If all of these fail, this function will return an error. 38 | } 39 | \examples{ 40 | # This will not run before a GDC token 41 | # is in place. 
42 | token = try(gdc_token(),silent=TRUE) 43 | 44 | 45 | } 46 | \references{ 47 | \url{https://docs.gdc.cancer.gov/Data_Portal/Users_Guide/Cart/#gdc-authentication-tokens} 48 | } 49 | -------------------------------------------------------------------------------- /man/gdc_clinical.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/clinical.R 3 | \name{gdc_clinical} 4 | \alias{gdc_clinical} 5 | \title{Get clinical information from GDC} 6 | \usage{ 7 | gdc_clinical(case_ids, include_list_cols = FALSE) 8 | } 9 | \arguments{ 10 | \item{case_ids}{a character() vector of case_ids, typically from 11 | "cases" query.} 12 | 13 | \item{include_list_cols}{logical(1), whether to include list 14 | columns in the "main" data.frame. These list columns have 15 | values for aliquots, samples, etc. While these may be useful 16 | for some situations, they are generally not that useful as 17 | clinical annotations.} 18 | } 19 | \value{ 20 | A list of four data.frames: 21 | \enumerate{ 22 | \item main, representing basic case identification and metadata 23 | (update date, etc.) 24 | \item diagnoses 25 | \item esposures 26 | \item demographic 27 | } 28 | } 29 | \description{ 30 | The NCI GDC has a complex data model that allows various studies to 31 | supply numerous clinical and demographic data elements. However, 32 | across all projects that enter the GDC, there are 33 | similarities. This function returns four data.frames associated 34 | with case_ids from the GDC. 35 | } 36 | \details{ 37 | Note that these data.frames can, in general, have different numbers 38 | of rows (or even no rows at all). If one wishes to combine to 39 | produce a single data.frame, using the approach of left joining to 40 | the "main" data.frame will yield a useful combined data.frame. We 41 | do not do that directly given the potential for 1:many 42 | relationships. 
It is up to the user to determine what the best 43 | approach is for any given dataset. 44 | } 45 | \examples{ 46 | case_ids = cases() |> results(size=10) |> ids() 47 | clinical_data = gdc_clinical(case_ids) 48 | 49 | # overview of clinical results 50 | class(clinical_data) 51 | names(clinical_data) 52 | sapply(clinical_data, class) 53 | sapply(clinical_data, nrow) 54 | 55 | # available data 56 | head(clinical_data$main) 57 | head(clinical_data$demographic) 58 | head(clinical_data$diagnoses) 59 | head(clinical_data$exposures) 60 | 61 | } 62 | -------------------------------------------------------------------------------- /man/gdc_cache.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/caching.R 3 | \name{gdc_cache} 4 | \alias{gdc_cache} 5 | \alias{gdc_set_cache} 6 | \title{Work with gdc cache directory} 7 | \usage{ 8 | gdc_cache() 9 | 10 | gdc_set_cache( 11 | directory = rappdirs::app_dir(appname = "GenomicDataCommons")$cache(), 12 | verbose = TRUE, 13 | create_without_asking = !interactive() 14 | ) 15 | } 16 | \arguments{ 17 | \item{directory}{character(1) directory path, will be created 18 | recursively if not present.} 19 | 20 | \item{verbose}{logical(1) whether or not to message the location of 21 | the cache directory after creation.} 22 | 23 | \item{create_without_asking}{logical(1) specifying whether to allow 24 | the function to create the cache directory without asking the 25 | user first. In an interactive session, if the cache directory 26 | does not exist, the user will be prompted before creation.} 27 | } 28 | \value{ 29 | character(1) directory path that serves as 30 | the base directory for GenomicDataCommons downloads. 31 | 32 | the created directory (invisibly) 33 | } 34 | \description{ 35 | The GenomicDataCommons package will cache downloaded 36 | files to minimize network and allow for 37 | offline work. 
These functions are used to create a cache directory 38 | if one does not exist, set a global option, and query that 39 | option. The cache directory will default to the user "cache" 40 | directory according to specifications in 41 | \code{\link[rappdirs]{app_dir}}. However, the user may want to set 42 | this to another directory with more or higher performance 43 | storage. 44 | } 45 | \details{ 46 | The cache structure is currently just a directory with each file 47 | being represented by a path constructed as: 48 | CACHEDIR/UUID/FILENAME. The cached files can be manipulated 49 | using standard file system commands (removing, finding, 50 | etc.). In this sense, the cache system is minimalist in design. 51 | } 52 | \section{Functions}{ 53 | \itemize{ 54 | \item \code{gdc_set_cache()}: (Re)set the GenomicDataCommons cache 55 | directory 56 | 57 | }} 58 | \examples{ 59 | gdc_cache() 60 | \dontrun{ 61 | gdc_set_cache(getwd()) 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /R/gdc_token.R: -------------------------------------------------------------------------------- 1 | #' return a gdc token from file or environment 2 | #' 3 | #' The GDC requires an auth token for downloading 4 | #' data that are "controlled access". For example, 5 | #' BAM files for human datasets, germline variant calls, 6 | #' and SNP array raw data all are protected as "controlled 7 | #' access". For these files, a GDC access token is required. 8 | #' See the \href{https://docs.gdc.cancer.gov/Data_Portal/Users_Guide/Authentication/#gdc-authentication-tokens}{details on the GDC authentication and token information}. 9 | #' Note that this function simply returns a string value. 10 | #' It is possible to keep the GDC token in a variable in R 11 | #' or to pass a string directly to the appropriate parameter.
12 | #' This function is simply a convenience function for alternative 13 | #' approaches to get a token from an environment variable 14 | #' or a file. 15 | #' 16 | #' 17 | #' @details 18 | #' This function will resolve locations of the GDC token in the 19 | #' following order: 20 | #' \itemize{ 21 | #' \item{from the environment variable, \code{GDC_TOKEN}, expected to 22 | #' contain the token downloaded from the GDC as a string} 23 | #' \item{using \code{readLines} to read a file named in the environment 24 | #' variable, \code{GDC_TOKEN_FILE}} 25 | #' \item{using \code{readLines} to read from a file called \code{.gdc_token} in the user's 26 | #' home directory} 27 | #' } 28 | #' If all of these fail, this function will return an error. 29 | #' 30 | #' @return character(1) (invisibly, to protect against inadvertently printing) the GDC token. 31 | #' 32 | #' @references \url{https://docs.gdc.cancer.gov/Data_Portal/Users_Guide/Cart/#gdc-authentication-tokens} 33 | #' 34 | #' @examples 35 | #' # This will not run before a GDC token 36 | #' # is in place. 
37 | #' token = try(gdc_token(),silent=TRUE) 38 | #' 39 | #' 40 | #' @export 41 | gdc_token <- function() { 42 | if(Sys.getenv('GDC_TOKEN')!='') return(invisible(Sys.getenv('GDC_TOKEN'))) # invisible, like the file-based branch, so the token is not auto-printed 43 | token_file = "~/.gdc_token" # fallback location when GDC_TOKEN_FILE is not set 44 | if(Sys.getenv('GDC_TOKEN_FILE')!='') 45 | token_file = trimws(Sys.getenv('GDC_TOKEN_FILE')) 46 | stopifnot(file.exists(token_file)) # error out when no token source is available 47 | invisible(suppressWarnings(readLines(token_file,n=1))) # only the first line of the token file is used 48 | } 49 | -------------------------------------------------------------------------------- /R/mapping.R: -------------------------------------------------------------------------------- 1 | .response_mapping_as_list <- function(json) { 2 | json <- lapply(json, unlist) 3 | structure(json, class=c("mapping_list", "gdc_list", "list")) 4 | } 5 | 6 | 7 | #" (internal) utility for returning _mapping json 8 | #' @importFrom httr content 9 | .get_mapping_json <- function(endpoint) { 10 | valid <- .gdc_entities 11 | stopifnot(is.character(endpoint), length(endpoint) == 1L, 12 | endpoint %in% valid) 13 | response <- .gdc_get( 14 | sprintf("%s/%s", endpoint, "_mapping") 15 | ) 16 | content(response, type="application/json") 17 | } 18 | 19 | 20 | #' Query GDC for available endpoint fields 21 | #' 22 | #' @param endpoint character(1) corresponding to endpoints for which 23 | #' users may specify additional or alternative fields. Endpoints 24 | #' include \dQuote{projects}, \dQuote{cases}, \dQuote{files}, and 25 | #' \dQuote{annotations}. 26 | #' 27 | #' @return A data frame describing the field (field name), full (full 28 | #' data model name), type (data type), and four additional columns 29 | #' describing the "set" to which the fields belong--\dQuote{default}, 30 | #' \dQuote{expand}, \dQuote{multi}, and \dQuote{nested}.
31 | #' 32 | #' @examples 33 | #' map <- mapping("projects") 34 | #' head(map) 35 | #' # get only the "default" fields 36 | #' subset(map,defaults) 37 | #' # And get just the text names of the "default" fields 38 | #' subset(map,defaults)$field 39 | #' 40 | #' @importFrom httr content 41 | #' @export 42 | mapping <- function(endpoint) { 43 | json = .get_mapping_json(endpoint) 44 | maplist = list() 45 | fields = data.frame(field=unlist(json[['fields']])) 46 | mapdat = json[['_mapping']] 47 | for(cname in names(mapdat[[1]])) { 48 | maplist[[cname]] = as.character(sapply(mapdat,'[[',cname)) 49 | } 50 | df = do.call(cbind,maplist) 51 | tmpdf = as.data.frame(matrix(FALSE, ncol = 1, nrow = nrow(df)),stringsAsFactors = FALSE) 52 | fieldtypes = c('defaults') 53 | colnames(tmpdf) = fieldtypes 54 | df = cbind(data.frame(df,stringsAsFactors=FALSE),tmpdf) 55 | df = as.data.frame(merge(fields,df,by.x='field',by.y='field',all.x=TRUE),stringsAsFactors = FALSE) 56 | df$field = as.character(df$field) 57 | for(i in fieldtypes) { 58 | df[df$field %in% json[[i]],i] = TRUE 59 | } 60 | return(df) 61 | } 62 | 63 | -------------------------------------------------------------------------------- /.github/workflows/basic_checks.yml: -------------------------------------------------------------------------------- 1 | name: R CMD check 2 | 3 | on: 4 | push: 5 | branches: 6 | - devel 7 | paths: 8 | - 'DESCRIPTION' 9 | - '**basic_checks.yml' 10 | workflow_dispatch: 11 | pull_request: 12 | branches: 13 | - devel 14 | 15 | env: 16 | cache-version: v1 17 | 18 | jobs: 19 | r-build-and-check: 20 | runs-on: ubuntu-latest 21 | container: bioconductor/bioconductor_docker:devel 22 | 23 | env: 24 | R_REMOTES_NO_ERRORS_FROM_WARNINGS: TRUE 25 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 26 | 27 | steps: 28 | - name: Get Ubuntu Codename and Set CRAN URL 29 | run: | 30 | CODENAME=$(lsb_release -cs) 31 | echo "CRAN=https://packagemanager.posit.co/cran/__linux__/${CODENAME}/latest" >> "$GITHUB_ENV" 32 | 33 | - 
name: Checkout Repository 34 | uses: actions/checkout@v4 35 | 36 | - name: Query dependencies and update old packages 37 | run: | 38 | BiocManager::install(ask=FALSE) 39 | saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) 40 | shell: Rscript {0} 41 | 42 | - name: Cache R packages 43 | if: runner.os != 'Windows' 44 | uses: actions/cache@v4 45 | with: 46 | path: /usr/local/lib/R/site-library 47 | key: ${{ env.cache-version }}-${{ runner.os }}-r-${{ hashFiles('.github/depends.Rds') }} 48 | restore-keys: ${{ env.cache-version }}-${{ runner.os }}-r- 49 | 50 | - name: Install dependencies 51 | run: | 52 | remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories()) 53 | remotes::install_cran(c("rcmdcheck", "covr")) 54 | BiocManager::install("BiocCheck") 55 | shell: Rscript {0} 56 | 57 | - name: Run rcmdcheck 58 | env: 59 | _R_CHECK_CRAN_INCOMING_REMOTE_: false 60 | run: rcmdcheck::rcmdcheck(args = "--no-manual", error_on = "warning", check_dir = "check") 61 | shell: Rscript {0} 62 | 63 | - name: Run BiocCheck 64 | env: 65 | DISPLAY: ':99.0' 66 | run: | 67 | BiocCheck::BiocCheck( 68 | dir('check', 'tar\\.gz$', full.names = TRUE), 69 | `quit-with-status` = FALSE, 70 | `no-check-R-ver` = TRUE, 71 | `no-check-bioc-help` = TRUE 72 | ) 73 | shell: Rscript {0} 74 | -------------------------------------------------------------------------------- /tests/testthat/test_api.R: -------------------------------------------------------------------------------- 1 | context('API') 2 | 3 | test_that("status returns correctly", { 4 | res <- status() 5 | metadata_nms <- c( 6 | "commit", "data_release", "data_release_version", 7 | "status", "tag", "version" 8 | ) 9 | expect_identical(names(res), metadata_nms) 10 | expect_identical(res$status, "OK") 11 | }) 12 | 13 | test_that('query', { 14 | gCases = query('cases') 15 | expect_equal(class(gCases)[1],'gdc_cases') 16 | expect_equal(class(gCases)[2],'GDCQuery') 17 | 
expect_equal(class(gCases)[3],'list') 18 | gFiles = query('files') 19 | expect_equal(class(gFiles)[1],'gdc_files') 20 | expect_equal(class(gFiles)[2],'GDCQuery') 21 | expect_equal(class(gFiles)[3],'list') 22 | gProjects = query('projects') 23 | expect_equal(class(gProjects)[1],'gdc_projects') 24 | expect_equal(class(gProjects)[2],'GDCQuery') 25 | expect_equal(class(gProjects)[3],'list') 26 | gAnnotations = query('annotations') 27 | expect_equal(class(gAnnotations)[1],'gdc_annotations') 28 | expect_equal(class(gAnnotations)[2],'GDCQuery') 29 | expect_equal(class(gAnnotations)[3],'list') 30 | }) 31 | 32 | test_that("cases", { 33 | idfield = "case_id" 34 | q = cases() 35 | resp = q |> response() 36 | expect_gte(q |> count(),1000) 37 | expect_equal(select(q,idfield)$fields,idfield) 38 | expect_equal(facet(q,idfield)$facets,idfield) 39 | }) 40 | 41 | test_that("files", { 42 | q = files() 43 | idfield = "file_id" 44 | resp = q |> response() 45 | expect_gte(q |> count(),1000) 46 | expect_equal(select(q,idfield)$fields,idfield) 47 | expect_equal(facet(q,idfield)$facets,idfield) 48 | }) 49 | 50 | test_that("annotations", { 51 | q = annotations() 52 | idfield = "annotation_id" 53 | resp = q |> response() 54 | expect_gte(q |> count(),1000) 55 | expect_equal(select(q,idfield)$fields,idfield) 56 | expect_equal(facet(q,idfield)$facets,idfield) 57 | }) 58 | 59 | test_that("mapping", { 60 | res = mapping('files') 61 | expect_equal(class(res),'data.frame') 62 | expect_equal(ncol(res), 6) 63 | expect_equal(colnames(res),c('field','description','doc_type','full','type','defaults')) 64 | }) 65 | 66 | test_that("projects", { 67 | q = projects() 68 | idfield = "project_id" 69 | resp = q |> response() 70 | expect_gte(q |> count(),35) 71 | expect_equal(select(q,idfield)$fields,idfield) 72 | expect_equal(facet(q,idfield)$facets,idfield) 73 | }) 74 | 75 | -------------------------------------------------------------------------------- /R/expand.R: 
-------------------------------------------------------------------------------- 1 | #' Return valid values for "expand" 2 | #' 3 | #' The GDC allows a shorthand for specifying groups 4 | #' of fields to be returned by the metadata queries. 5 | #' These can be specified in a \code{\link{select}} 6 | #' method call to easily supply groups of fields. 7 | #' 8 | #' @param entity Either a \code{\link{GDCQuery}} object 9 | #' or a character(1) specifying a GDC entity ('cases', 'files', 10 | #' 'annotations', 'projects') 11 | #' 12 | #' @return A character vector 13 | #' 14 | #' @seealso See \url{https://docs.gdc.cancer.gov/API/Users_Guide/Search_and_Retrieval/#expand} 15 | #' for details 16 | #' 17 | #' @examples 18 | #' head(available_expand('files')) 19 | #' 20 | #' @export 21 | available_expand <- function(entity) { 22 | UseMethod("available_expand",entity) 23 | } 24 | 25 | #' @rdname available_expand 26 | #' 27 | #' @export 28 | available_expand.character <- function(entity) { 29 | json = .get_mapping_json(entity) 30 | return(unlist(json[['expand']])) 31 | } 32 | 33 | #' @rdname available_expand 34 | #' 35 | #' @export 36 | available_expand.GDCQuery <- function(entity) { 37 | return(available_expand(entity_name(entity))) 38 | } 39 | 40 | #" (internal) check expand values 41 | .gdcCheckExpands <- function(entity,expand) { 42 | if(is.null(expand)) return(TRUE) 43 | stopifnot(entity %in% .gdc_entities) 44 | ae = available_expand(entity) 45 | mismatches = expand[!(expand %in% ae)] 46 | if(length(mismatches)>0) 47 | stop(sprintf('expand specified included expands not available in %s including (%s)',entity,mismatches)) 48 | return(TRUE) 49 | } 50 | 51 | #' Set the \code{expand} parameter 52 | #' 53 | #' S3 generic to set GDCQuery expand parameter 54 | #' 55 | #' @param x the objects on which to set fields 56 | #' @param expand a character vector specifying the fields 57 | #' 58 | #' 59 | #' @return A \code{\link{GDCQuery}} object, with the \code{expand} 60 | #' member 
altered. 61 | #' 62 | #' @examples 63 | #' gProj = projects() 64 | #' gProj$fields 65 | #' head(available_fields(gProj)) 66 | #' default_fields(gProj) 67 | #' 68 | #' gProj |> 69 | #' select(default_fields(gProj)[1:2]) |> 70 | #' response() |> 71 | #' str(max_level=2) 72 | #' 73 | #' @export 74 | expand <- function(x,expand) { 75 | UseMethod('expand',x) 76 | } 77 | 78 | #' @describeIn expand set expand fields on a GDCQuery object 79 | #' @export 80 | expand.GDCQuery <- function(x,expand) { 81 | .gdcCheckExpands(entity_name(x),expand) 82 | x$expand = expand 83 | return(x) 84 | } 85 | -------------------------------------------------------------------------------- /man/gdcdata.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gdcdata.R 3 | \name{gdcdata} 4 | \alias{gdcdata} 5 | \title{Download GDC files} 6 | \usage{ 7 | gdcdata( 8 | uuids, 9 | use_cached = TRUE, 10 | progress = interactive(), 11 | token = NULL, 12 | access_method = "api", 13 | transfer_args = character(), 14 | ... 15 | ) 16 | } 17 | \arguments{ 18 | \item{uuids}{character() of GDC file UUIDs.} 19 | 20 | \item{use_cached}{logical(1) default TRUE indicating that, 21 | if found in the cache, the file will not be downloaded 22 | again. If FALSE, all supplied uuids will be re-downloaded.} 23 | 24 | \item{progress}{logical(1) default TRUE in interactive sessions, 25 | FALSE otherwise indicating whether a progress bar should be 26 | produced for each file download.} 27 | 28 | \item{token}{(optional) character(1) security token allowing access 29 | to restricted data. See 30 | \url{https://gdc-docs.nci.nih.gov/API/Users_Guide/Authentication_and_Authorization/}.} 31 | 32 | \item{access_method}{character(1), either 'api' or 'client'. See details.} 33 | 34 | \item{transfer_args}{character(1), additional arguments to pass to 35 | the gdc-client command line. 
See \code{\link{gdc_client}} and 36 | \code{\link{transfer_help}} for details.} 37 | 38 | \item{...}{further arguments passed to files} 39 | } 40 | \value{ 41 | a named vector with file uuids as the names and paths as 42 | the value 43 | } 44 | \description{ 45 | Download one or more files from GDC. Files are downloaded using the 46 | UUID and renamed to the file name on the remote system. By default, 47 | neither the uuid nor the file name on the remote system can exist. 48 | } 49 | \details{ 50 | This function is appropriate for one or several files; for large 51 | downloads use \code{\link{manifest}} to create a manifest for and 52 | the GDC Data Transfer Tool. 53 | 54 | When access_method is "api", the GDC "data" endpoint is the 55 | transfer mechanism used. The alternative access_method, "client", will 56 | utilize the \code{gdc-client} transfer tool, which must be 57 | downloaded separately and available. See 58 | \code{\link{gdc_client}} for details on specifying the location 59 | of the gdc-client executable. 60 | } 61 | \examples{ 62 | # get some example file uuids 63 | uuids <- files() |> 64 | filter(~ access == 'open' & file_size < 100000) |> 65 | results(size = 3) |> 66 | ids() 67 | 68 | # and get the data, placing it into the gdc_cache() directory 69 | gdcdata(uuids, use_cached=TRUE) 70 | 71 | } 72 | \seealso{ 73 | \code{\link{manifest}} for downloading large data. 74 | } 75 | -------------------------------------------------------------------------------- /R/ids.R: -------------------------------------------------------------------------------- 1 | #' Get the ids associated with a GDC query or response 2 | #' 3 | #' The GDC assigns ids (in the form of uuids) to objects in its database. Those 4 | #' ids can be used for relationships, searching on the website, and as 5 | #' unique ids. 
All 6 | #' 7 | #' @param x A \code{\link{GDCQuery}} or \code{\link{GDCResponse}} object 8 | #' 9 | #' @return a character vector of all the entity ids 10 | #' 11 | #' @examples 12 | #' # use with a GDC query, in this case for "cases" 13 | #' ids(cases() |> filter(~ project.project_id == "TCGA-CHOL")) 14 | #' # also works for responses 15 | #' ids(response(files())) 16 | #' # and results 17 | #' ids(results(cases())) 18 | #' 19 | #' 20 | #' @export 21 | ids = function(x) { 22 | UseMethod('ids',x) 23 | } 24 | 25 | #' @rdname ids 26 | #' @export 27 | ids.GDCManifest = function(x) { 28 | return(x[['id']]) 29 | } 30 | 31 | 32 | #' @rdname ids 33 | #' @export 34 | ids.GDCQuery = function(x) { 35 | fieldname = .id_field(x) 36 | res = x |> GenomicDataCommons::select(fieldname) |> 37 | results_all() 38 | return(.ifNullCharacterZero(res[[fieldname]])) 39 | } 40 | 41 | 42 | #' @rdname ids 43 | #' @export 44 | ids.GDCResults = function(x) { 45 | fieldname = .id_field(x) 46 | res = x[[fieldname]] 47 | return(.ifNullCharacterZero(res)) 48 | } 49 | 50 | #' @rdname ids 51 | #' @export 52 | ids.GDCResponse = function(x) { 53 | fieldname = paste0(sub('s$','',entity_name(x$query)),'_id') 54 | res = results(x)[[fieldname]] 55 | return(.ifNullCharacterZero(res)) 56 | } 57 | 58 | .id_field = function(x) { 59 | return(paste0(sub('s$','',entity_name(x)),"_id")) 60 | } 61 | 62 | #' get the name of the id field 63 | #' 64 | #' In many places in the GenomicDataCommons package, 65 | #' the entity ids are stored in a column or a vector 66 | #' with a specific name that corresponds to the field name 67 | #' at the GDC. The format is the entity name (singular) "_id". 68 | #' This generic simply returns that name from a given object. 69 | #' 70 | #' @param x An object representing the query or results 71 | #' of an entity from the GDC ("cases", "files", "annotations", "projects") 72 | #' 73 | #' @return character(1) such as "case_id", "file_id", etc. 
74 | #' 75 | #' @examples 76 | #' id_field(cases()) 77 | #' 78 | #' @export 79 | id_field = function(x) { 80 | UseMethod('id_field',x) 81 | } 82 | 83 | #' @describeIn id_field GDCQuery method 84 | #' @export 85 | id_field.GDCQuery = function(x) { 86 | return(.id_field(x)) 87 | } 88 | 89 | #' @describeIn id_field GDCResults method 90 | #' @export 91 | id_field.GDCResults = function(x) { 92 | return(.id_field(x)) 93 | } 94 | 95 | 96 | -------------------------------------------------------------------------------- /man/transfer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bulk_transfer.R 3 | \name{transfer} 4 | \alias{transfer} 5 | \alias{gdc_client_version_validate} 6 | \alias{transfer_help} 7 | \title{Bulk data download} 8 | \usage{ 9 | transfer(uuids, args = character(), token = NULL, overwrite = FALSE) 10 | 11 | gdc_client_version_validate(valid_version = .GDC_COMPATIBLE_VERSION) 12 | 13 | transfer_help() 14 | } 15 | \arguments{ 16 | \item{uuids}{character() vector of GDC file UUIDs} 17 | 18 | \item{args}{character() vector specifying command-line arguments to 19 | be passed to \code{gdc-client}. See \code{\link{transfer_help}} for 20 | possible values. The arguments \code{--manifest}, \code{--dir}, 21 | and \code{--token-file} are determined by \code{manifest}, 22 | \code{destination_dir}, and \code{token}, respectively, and 23 | should NOT be provided as elements of \code{args}.} 24 | 25 | \item{token}{character(1) containing security 26 | token allowing access to restricted data. See 27 | \url{https://gdc-docs.nci.nih.gov/API/Users_Guide/Authentication_and_Authorization/}. 28 | Note that the GDC transfer tool requires a file for data 29 | transfer. 
Therefore, this token will be written to a temporary 30 | file (with appropriate permissions set).} 31 | 32 | \item{overwrite}{logical(1) default FALSE indicating whether 33 | existing files with identical name should be over-written.} 34 | 35 | \item{valid_version}{character(1) The last known version that works for the 36 | current data release for which to validate against, not typically changed 37 | by the end-user.} 38 | } 39 | \value{ 40 | character(1) directory path to which the files were 41 | downloaded. 42 | } 43 | \description{ 44 | The GDC maintains a special tool, 45 | \href{https://docs.gdc.cancer.gov/Data_Transfer_Tool/Users_Guide/Getting_Started/}{the GDC Data Transfer Tool}, 46 | that enables high-performance, potentially parallel, and 47 | resumable downloads. The Data Transfer Tool is an external 48 | program that requires separate download. Due to recent changes in the 49 | GDC API, the transfer function now validates the version of the `gdc-client` 50 | to ensure reliable downloads. 51 | } 52 | \section{Functions}{ 53 | \itemize{ 54 | \item \code{gdc_client_version_validate()}: If you are using the 'client' option, your `gdc-client` should be 55 | up-to-date (>= 1.3.0). 
56 | 57 | \item \code{transfer_help()}: 58 | 59 | }} 60 | \examples{ 61 | \dontrun{ 62 | uuids = files() |> 63 | filter(access == "open") |> 64 | results() |> 65 | ids() 66 | file_paths <- transfer(uuids) 67 | file_paths 68 | names(file_paths) 69 | # and with authentication 70 | # REQUIRES gdc_token 71 | # destination <- transfer(uuids,token=gdc_token()) 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- /R/constants.R: -------------------------------------------------------------------------------- 1 | .gdc_base <- "https://api.gdc.cancer.gov" 2 | .gdc_endpoint <- 3 | structure( 4 | c("status", "projects", "cases", "files", "annotations", "data", 5 | "manifest", "slicing"), ##, submission 6 | class="gdc_endpoints") 7 | 8 | .gdc_parameters <- 9 | structure( 10 | list(format="JSON", pretty=FALSE, fields=NULL, size=10L, from=0L, 11 | sort=NULL, filters=NULL, facets=NULL), 12 | class="gdc_parameters") 13 | 14 | .gdc_flat_parameters <- 15 | structure( 16 | c('fields','facets'), 17 | class = "gdc_flat_params") 18 | 19 | .gdc_entities = 20 | structure( 21 | c('projects','cases',"files","annotations", 22 | "ssms", "cnvs", "ssm_occurrences", "cnv_occurrences", 23 | "genes"), 24 | class = "gdc_entities") 25 | 26 | .gdc_manifest_colnames = 27 | structure( 28 | c("id", "file_name", "md5sum", "file_size", "state"), 29 | class = 'gdc_manifest_colnames' 30 | ) 31 | 32 | 33 | #' Endpoints and Parameters 34 | #' 35 | #' \code{endpoints()} returns available endpoints. 36 | #' 37 | #' @return \code{endpoints()} returns a character vector of possible 38 | #' endpoints. 39 | #' 40 | #' @rdname constants 41 | #' @examples 42 | #' endpoints() 43 | #' @export 44 | endpoints <- function() 45 | .gdc_endpoint 46 | 47 | #' @export 48 | print.gdc_endpoints <- function(x, ...) 
49 | .cat0("available endpoints:\n", .wrapstr(x), "\n") 50 | 51 | #' \code{parameters()} include format (internal use only), pretty 52 | #' (internal use only), fields, size (number of results returned), 53 | #' from (index of first result), sort, filters, and facets. See 54 | #' \url{https://gdc-docs.nci.nih.gov/API/Users_Guide/Search_and_Retrieval/#query-parameters} 55 | #' 56 | #' @return \code{parameters()} returns a list of possible parameters 57 | #' and their default values. 58 | #' @keywords internal 59 | #' 60 | #' @rdname constants 61 | #' @examples 62 | #' parameters() 63 | #' @export 64 | parameters <- function() 65 | .gdc_parameters 66 | 67 | #' @export 68 | print.gdc_parameters <- function(x, ...) { 69 | cat("available parameters:\n") 70 | for (nm in names(x)) 71 | .cat0(" ", nm, ": ", 72 | if (is.null(x[[nm]])) "NULL" else x[[nm]], "\n") 73 | } 74 | 75 | #" (internal) 76 | .parameter_string <- function(parameters) { 77 | if (is.null(parameters)) 78 | return("") 79 | stopifnot(is.list(parameters), 80 | all(names(parameters) %in% names(.gdc_parameters))) 81 | 82 | default <- .gdc_parameters 83 | default[names(parameters)] <- parameters 84 | default <- Filter(Negate(is.null), default) 85 | string <- paste(names(default), unname(default), sep="=", collapse="&") 86 | sprintf("?%s", string) 87 | } 88 | 89 | -------------------------------------------------------------------------------- /man/slicing.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/slicing.R 3 | \name{slicing} 4 | \alias{slicing} 5 | \title{Query GDC for data slices} 6 | \usage{ 7 | slicing( 8 | uuid, 9 | regions, 10 | symbols, 11 | destination = file.path(tempdir(), paste0(uuid, ".bam")), 12 | overwrite = FALSE, 13 | progress = interactive(), 14 | token = gdc_token() 15 | ) 16 | } 17 | \arguments{ 18 | \item{uuid}{character(1) identifying the BAM file resource} 19 | 20 | 
\item{regions}{character() vector describing chromosomal regions, 21 | e.g., \code{c("chr1", "chr2:10000", "chr3:10000-20000")} (all 22 | of chromosome 1, chromosome 2 from position 10000 to the end, 23 | chromosome 3 from 10000 to 20000).} 24 | 25 | \item{symbols}{character() vector of gencode gene symbols, e.g., 26 | \code{c("BRCA1", "PTEN")}} 27 | 28 | \item{destination}{character(1) default \code{tempfile()} file path 29 | for BAM file slice} 30 | 31 | \item{overwrite}{logical(1) default FALSE can destination be 32 | overwritten?} 33 | 34 | \item{progress}{logical(1) default \code{interactive()} should a 35 | progress bar be used?} 36 | 37 | \item{token}{character(1) security token allowing access to 38 | restricted data. Almost all BAM data is restricted, so a token is 39 | usually required. See 40 | \url{https://docs.gdc.cancer.gov/Data/Data_Security/Data_Security/#authentication-tokens}.} 41 | } 42 | \value{ 43 | character(1) destination to the downloaded BAM file 44 | } 45 | \description{ 46 | This function returns a BAM file representing reads overlapping 47 | regions specified either as chromosomal regions or as gencode gene 48 | symbols. 49 | } 50 | \details{ 51 | This function uses the Genomic Data Commons "slicing" API 52 | to get portions of a BAM file specified either using "regions" 53 | or using HGNC gene symbols. 54 | } 55 | \examples{ 56 | \dontrun{ 57 | slicing("df80679e-c4d3-487b-934c-fcc782e5d46e", 58 | regions="chr17:75000000-76000000", 59 | token=gdc_token()) 60 | 61 | # Get 10 BAM files. 
62 | bamfiles = files() |> 63 | filter(data_format=='BAM') |> 64 | results(size=10) |> ids() 65 | 66 | # Current alignments at the GDC are to GRCh38 67 | library('TxDb.Hsapiens.UCSC.hg38.knownGene') 68 | all_genes = genes(TxDb.Hsapiens.UCSC.hg38.knownGene) 69 | 70 | first3genes = all_genes[1:3] 71 | # remove strand info 72 | strand(first3genes) = '*' 73 | 74 | # We can get our regions easily now 75 | as.character(first3genes) 76 | 77 | # Use parallel downloads to speed processing 78 | library(BiocParallel) 79 | register(MulticoreParam()) 80 | 81 | fnames = bplapply(bamfiles, slicing, overwrite = TRUE, 82 | regions=as.character(first3genes)) 83 | 84 | # 10 BAM files 85 | fnames 86 | 87 | library(GenomicAlignments) 88 | lapply(unlist(fnames), readGAlignments) 89 | 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /man/filtering.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/filters.R 3 | \name{filtering} 4 | \alias{filtering} 5 | \alias{filter} 6 | \alias{filter.GDCQuery} 7 | \alias{get_filter} 8 | \alias{get_filter.GDCQuery} 9 | \title{Manipulating GDCQuery filters} 10 | \usage{ 11 | filter(x, expr) 12 | 13 | \method{filter}{GDCQuery}(x, expr) 14 | 15 | get_filter(x) 16 | 17 | \method{get_filter}{GDCQuery}(x) 18 | } 19 | \arguments{ 20 | \item{x}{the object on which to set the filter list 21 | member} 22 | 23 | \item{expr}{a filter expression in the form of 24 | the right hand side of a formula, where bare names 25 | (without quotes) are allowed if they are available 26 | fields associated with the GDCQuery object, \code{x}} 27 | } 28 | \value{ 29 | A \code{\link{GDCQuery}} object with the filter 30 | field replaced by specified filter expression 31 | } 32 | \description{ 33 | Manipulating GDCQuery filters 34 | 35 | The \code{filter} is simply a safe accessor for 36 | the filter element in 
\code{\link{GDCQuery}} objects. 37 | 38 | The \code{get_filter} is simply a safe accessor for 39 | the filter element in \code{\link{GDCQuery}} objects. 40 | } 41 | \examples{ 42 | # make a GDCQuery object to start 43 | # 44 | # Projects 45 | # 46 | pQuery = projects() 47 | 48 | # check for the default fields 49 | # so that we can use one of them to build a filter 50 | default_fields(pQuery) 51 | pQuery = filter(pQuery,~ project_id == 'TCGA-LUAC') 52 | get_filter(pQuery) 53 | 54 | # 55 | # Files 56 | # 57 | fQuery = files() 58 | default_fields(fQuery) 59 | 60 | fQuery = filter(fQuery,~ data_format == 'VCF') 61 | # OR 62 | # with recent GenomicDataCommons versions: 63 | # no "~" needed 64 | fQuery = filter(fQuery, data_format == 'VCF') 65 | 66 | get_filter(fQuery) 67 | 68 | fQuery = filter(fQuery,~ data_format == 'VCF' 69 | & experimental_strategy == 'WXS' 70 | & type == 'simple_somatic_mutation') 71 | 72 | files() |> filter(~ data_format == 'VCF' 73 | & experimental_strategy=='WXS' 74 | & type == 'simple_somatic_mutation') |> count() 75 | 76 | 77 | files() |> filter( data_format == 'VCF' 78 | & experimental_strategy=='WXS' 79 | & type == 'simple_somatic_mutation') |> count() 80 | 81 | # Filters may be chained for the 82 | # equivalent query 83 | # 84 | # When chained, filters are combined with logical AND 85 | 86 | files() |> 87 | filter(~ data_format == 'VCF') |> 88 | filter(~ experimental_strategy == 'WXS') |> 89 | filter(~ type == 'simple_somatic_mutation') |> 90 | count() 91 | 92 | # OR 93 | 94 | files() |> 95 | filter( data_format == 'VCF') |> 96 | filter( experimental_strategy == 'WXS') |> 97 | filter( type == 'simple_somatic_mutation') |> 98 | count() 99 | 100 | # Use str() to get a cleaner picture 101 | str(get_filter(fQuery)) 102 | } 103 | -------------------------------------------------------------------------------- /man/query.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by 
hand 2 | % Please edit documentation in R/query.R 3 | \name{query} 4 | \alias{query} 5 | \alias{GDCQuery} 6 | \alias{cases} 7 | \alias{files} 8 | \alias{projects} 9 | \alias{annotations} 10 | \alias{ssms} 11 | \alias{ssm_occurrences} 12 | \alias{cnvs} 13 | \alias{cnv_occurrences} 14 | \alias{genes} 15 | \title{Start a query of GDC metadata} 16 | \usage{ 17 | query( 18 | entity, 19 | filters = NULL, 20 | facets = NULL, 21 | expand = NULL, 22 | fields = default_fields(entity), 23 | ... 24 | ) 25 | 26 | cases(...) 27 | 28 | files(...) 29 | 30 | projects(...) 31 | 32 | annotations(...) 33 | 34 | ssms(...) 35 | 36 | ssm_occurrences(...) 37 | 38 | cnvs(...) 39 | 40 | cnv_occurrences(...) 41 | 42 | genes(...) 43 | } 44 | \arguments{ 45 | \item{entity}{character vector, including one of the entities in .gdc_entities} 46 | 47 | \item{filters}{a filter list, typically created using \code{\link{make_filter}}, or added 48 | to an existing \code{GDCQuery} object using \code{\link{filter}}.} 49 | 50 | \item{facets}{a character vector of facets for counting common values. 51 | See \code{\link{available_fields}}. In general, one will not specify this parameter 52 | but will use \code{\link{facet}} instead.} 53 | 54 | \item{expand}{a character vector of "expands" to include in returned data. See 55 | \code{\link{available_expand}}} 56 | 57 | \item{fields}{a character vector of fields to return. See \code{\link{available_fields}}. 58 | In general, one will not specify fields directly, but instead use \code{\link{select}}} 59 | 60 | \item{...}{passed through to \code{\link{query}}} 61 | } 62 | \value{ 63 | An S3 object, the GDCQuery object. This is a list 64 | with the following members. 65 | \itemize{ 66 | \item{filters} 67 | \item{facets} 68 | \item{fields} 69 | \item{expand} 70 | \item{archive} 71 | \item{token} 72 | } 73 | } 74 | \description{ 75 | The basis for all functionality in this package 76 | starts with constructing a query in R. 
The GDCQuery 77 | object contains the filters, facets, and other 78 | parameters that define the returned results. A token 79 | is required for accessing certain datasets. 80 | } 81 | \section{Functions}{ 82 | \itemize{ 83 | \item \code{cases()}: convenience constructor for a GDCQuery for cases 84 | 85 | \item \code{files()}: convenience constructor for a GDCQuery for files 86 | 87 | \item \code{projects()}: convenience constructor for a GDCQuery for projects 88 | 89 | \item \code{annotations()}: convenience constructor for a GDCQuery for annotations 90 | 91 | \item \code{ssms()}: convenience constructor for a GDCQuery for ssms 92 | 93 | \item \code{ssm_occurrences()}: convenience constructor for a GDCQuery for ssm_occurrences 94 | 95 | \item \code{cnvs()}: convenience constructor for a GDCQuery for cnvs 96 | 97 | \item \code{cnv_occurrences()}: convenience constructor for a GDCQuery for cnv_occurrences 98 | 99 | \item \code{genes()}: convenience constructor for a GDCQuery for genes 100 | 101 | }} 102 | \examples{ 103 | qcases = query('cases') 104 | # equivalent to: 105 | qcases = cases() 106 | 107 | } 108 | -------------------------------------------------------------------------------- /vignettes/questions-and-answers.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Questions and answers from over the years" 3 | author: "Sean Davis" 4 | date: "`r format(Sys.Date(), '%A, %B %d, %Y')`" 5 | always_allow_html: yes 6 | output: 7 | BiocStyle::html_document: 8 | df_print: paged 9 | toc_float: true 10 | keep_md: true 11 | abstract: > 12 | 13 | vignette: > 14 | %\VignetteIndexEntry{Questions and answers from over the years} 15 | %\VignetteEngine{knitr::rmarkdown} 16 | %\VignetteEncoding{UTF-8} 17 | --- 18 | 19 | # How could I generate a manifest file with filtering of Race and Ethnicity? 20 | 21 | From https://support.bioconductor.org/p/9138939/. 
22 | 23 | ```{r} 24 | library(GenomicDataCommons,quietly = TRUE) 25 | ``` 26 | 27 | I made a small change to the filtering expression approach based on 28 | changes to lazy evaluation best practices. There is now no need to 29 | include the `~` in the filter expression. So: 30 | 31 | ```{r} 32 | q = files() |> 33 | GenomicDataCommons::filter( 34 | cases.project.project_id == 'TCGA-COAD' & 35 | data_type == 'Aligned Reads' & 36 | experimental_strategy == 'RNA-Seq' & 37 | data_format == 'BAM') 38 | ``` 39 | And get a count of the results: 40 | 41 | ```{r} 42 | count(q) 43 | ``` 44 | 45 | And the manifest. 46 | 47 | ```{r} 48 | manifest(q) 49 | ``` 50 | 51 | Your question about race and ethnicity is a good one. 52 | 53 | ```{r} 54 | all_fields = available_fields(files()) 55 | ``` 56 | 57 | And we can grep for `race` or `ethnic` to get potential matching fields 58 | to look at. 59 | 60 | ```{r} 61 | grep('race|ethnic',all_fields,value=TRUE) 62 | ``` 63 | 64 | Now, we can check available values for each field to determine how to complete 65 | our filter expressions. 66 | 67 | ```{r} 68 | available_values('files',"cases.demographic.ethnicity") 69 | available_values('files',"cases.demographic.race") 70 | ``` 71 | 72 | We can complete our filter expression now to limit to `white` race only. 73 | 74 | ```{r} 75 | q_white_only = q |> 76 | GenomicDataCommons::filter(cases.demographic.race=='white') 77 | count(q_white_only) 78 | manifest(q_white_only) 79 | ``` 80 | 81 | # How can I get the number of cases with RNA-Seq data added by date to TCGA project with `GenomicDataCommons`? 82 | 83 | - From https://support.bioconductor.org/p/9135791/ 84 | 85 | I would like to get the number of cases added (created, any logical datetime would suffice here) to the TCGA project by experiment type. I attempted to get this data via GenomicDataCommons package, but it is giving me I believe the number of files for a given experiment type rather than number cases. 
How can I get the number of cases for which there is RNA-Seq data? 86 | 87 | ```{r} 88 | library(tibble) 89 | library(dplyr) 90 | library(GenomicDataCommons) 91 | 92 | cases() |> 93 | GenomicDataCommons::filter( 94 | ~ project.program.name=='TCGA' & files.experimental_strategy=='RNA-Seq' 95 | ) |> 96 | facet(c("files.created_datetime")) |> 97 | aggregations() |> 98 | unname() |> 99 | unlist(recursive = FALSE) |> 100 | as_tibble() |> 101 | dplyr::arrange(dplyr::desc(key)) 102 | ``` 103 | -------------------------------------------------------------------------------- /vignettes/somatic_mutations.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Working with simple somatic mutations" 3 | author: "Sean Davis" 4 | date: "`r format(Sys.Date(), '%A, %B %d, %Y')`" 5 | always_allow_html: yes 6 | output: 7 | BiocStyle::html_document: 8 | df_print: paged 9 | toc_float: true 10 | abstract: > 11 | 12 | vignette: > 13 | %\VignetteIndexEntry{Somatic Mutation Data} 14 | %\VignetteEngine{knitr::rmarkdown} 15 | %\VignetteEncoding{UTF-8} 16 | --- 17 | 18 | # Background 19 | 20 | 21 | 22 | # Workflow 23 | 24 | ```{r warning=FALSE,message=FALSE} 25 | library(GenomicDataCommons) 26 | library(tibble) 27 | ``` 28 | 29 | ## Genes and gene details 30 | 31 | ```{r} 32 | grep_fields('genes', 'symbol') 33 | ``` 34 | ```{r} 35 | head(available_values('genes','symbol')) 36 | ``` 37 | 38 | 39 | 40 | ```{r} 41 | tp53 = genes() |> 42 | GenomicDataCommons::filter(symbol=='TP53') |> 43 | results(size=10000) |> 44 | as_tibble() 45 | ``` 46 | 47 | 48 | ## ssms 49 | 50 | ```{r} 51 | ssms() |> 52 | GenomicDataCommons::filter( 53 | chromosome==paste0('chr',tp53$gene_chromosome[1]) & 54 | start_position > tp53$gene_start[1] & 55 | end_position < tp53$gene_end[1]) |> 56 | GenomicDataCommons::count() 57 | ``` 58 | 59 | ```{r} 60 | ssms() |> 61 | GenomicDataCommons::filter( 62 | consequence.transcript.gene.symbol %in% c('TP53')) |> 63 | 
GenomicDataCommons::count() 64 | ``` 65 | 66 | ## convert to VRanges 67 | 68 | ```{r warning=FALSE,message=FALSE} 69 | library(VariantAnnotation) 70 | vars = ssms() |> 71 | GenomicDataCommons::filter( 72 | consequence.transcript.gene.symbol %in% c('TP53')) |> 73 | GenomicDataCommons::results_all() |> 74 | as_tibble() 75 | ``` 76 | 77 | ```{r} 78 | vr = VRanges(seqnames = vars$chromosome, 79 | ranges = IRanges(start=vars$start_position, width=1), 80 | ref = vars$reference_allele, 81 | alt = vars$tumor_allele) 82 | ``` 83 | 84 | ```{r} 85 | ssm_occurrences() |> 86 | GenomicDataCommons::filter( 87 | ssm.consequence.transcript.gene.symbol %in% c('TP53')) |> 88 | GenomicDataCommons::count() 89 | ``` 90 | 91 | ```{r} 92 | var_samples = ssm_occurrences() |> 93 | GenomicDataCommons::filter( 94 | ssm.consequence.transcript.gene.symbol %in% c('TP53')) |> 95 | GenomicDataCommons::expand(c('case', 'ssm', 'case.project')) |> 96 | GenomicDataCommons::results_all() |> 97 | as_tibble() 98 | ``` 99 | 100 | ```{r} 101 | table(var_samples$case$disease_type) 102 | ``` 103 | 104 | ## OncoPrint 105 | 106 | ```{r} 107 | fnames <- files() |> 108 | GenomicDataCommons::filter( 109 | cases.project.project_id=='TCGA-SKCM' & 110 | data_format=='maf' & 111 | data_type=='Masked Somatic Mutation' & 112 | analysis.workflow_type == 113 | 'Aliquot Ensemble Somatic Variant Merging and Masking' 114 | ) |> 115 | results(size = 1) |> 116 | ids() |> 117 | gdcdata() 118 | ``` 119 | 120 | ```{r cache=TRUE} 121 | library(maftools) 122 | melanoma = read.maf(maf = fnames) 123 | ``` 124 | 125 | ```{r} 126 | maftools::oncoplot(melanoma) 127 | ``` 128 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(aggregations,GDCQuery) 4 | S3method(aggregations,GDCResponse) 5 | S3method(available_expand,GDCQuery) 6 | 
S3method(available_expand,character) 7 | S3method(available_fields,GDCQuery) 8 | S3method(available_fields,character) 9 | S3method(count,GDCQuery) 10 | S3method(count,GDCResponse) 11 | S3method(default_fields,GDCQuery) 12 | S3method(default_fields,character) 13 | S3method(entity_name,GDCQuery) 14 | S3method(entity_name,GDCResults) 15 | S3method(expand,GDCQuery) 16 | S3method(facet,GDCQuery) 17 | S3method(field_description,GDCQuery) 18 | S3method(field_description,character) 19 | S3method(filter,GDCQuery) 20 | S3method(get_facets,GDCQuery) 21 | S3method(get_filter,GDCQuery) 22 | S3method(id_field,GDCQuery) 23 | S3method(id_field,GDCResults) 24 | S3method(ids,GDCManifest) 25 | S3method(ids,GDCQuery) 26 | S3method(ids,GDCResponse) 27 | S3method(ids,GDCResults) 28 | S3method(manifest,GDCcasesResponse) 29 | S3method(manifest,GDCfilesResponse) 30 | S3method(manifest,gdc_files) 31 | S3method(print,gdc_endpoints) 32 | S3method(print,gdc_parameters) 33 | S3method(response,GDCQuery) 34 | S3method(results,GDCQuery) 35 | S3method(results,GDCResponse) 36 | S3method(results_all,GDCQuery) 37 | S3method(results_all,GDCResponse) 38 | S3method(select,GDCQuery) 39 | export(aggregations) 40 | export(annotations) 41 | export(available_expand) 42 | export(available_fields) 43 | export(available_values) 44 | export(cases) 45 | export(cnv_occurrences) 46 | export(cnvs) 47 | export(count) 48 | export(default_fields) 49 | export(endpoints) 50 | export(entity_name) 51 | export(expand) 52 | export(facet) 53 | export(field_description) 54 | export(files) 55 | export(filter) 56 | export(gdc_cache) 57 | export(gdc_client) 58 | export(gdc_client_version_validate) 59 | export(gdc_clinical) 60 | export(gdc_set_cache) 61 | export(gdc_token) 62 | export(gdcdata) 63 | export(genes) 64 | export(get_facets) 65 | export(get_filter) 66 | export(grep_fields) 67 | export(id_field) 68 | export(ids) 69 | export(make_filter) 70 | export(manifest) 71 | export(mapping) 72 | export(parameters) 73 | 
export(projects) 74 | export(query) 75 | export(readDNAcopy) 76 | export(readHTSeqFile) 77 | export(response) 78 | export(response_all) 79 | export(results) 80 | export(results_all) 81 | export(select) 82 | export(slicing) 83 | export(ssm_occurrences) 84 | export(ssms) 85 | export(status) 86 | export(transfer) 87 | export(transfer_help) 88 | export(write_manifest) 89 | import(GenomicRanges) 90 | importFrom(IRanges,IRanges) 91 | importFrom(dplyr,bind_rows) 92 | importFrom(httr,GET) 93 | importFrom(httr,POST) 94 | importFrom(httr,add_headers) 95 | importFrom(httr,content) 96 | importFrom(httr,headers) 97 | importFrom(httr,progress) 98 | importFrom(httr,stop_for_status) 99 | importFrom(httr,write_disk) 100 | importFrom(jsonlite,fromJSON) 101 | importFrom(jsonlite,toJSON) 102 | importFrom(jsonlite,unbox) 103 | importFrom(rappdirs,app_dir) 104 | importFrom(readr,read_tsv) 105 | importFrom(rlang,enquo) 106 | importFrom(rlang,eval_tidy) 107 | importFrom(rlang,f_env) 108 | importFrom(rlang,f_rhs) 109 | importFrom(rlang,is_formula) 110 | importFrom(stats,setNames) 111 | importFrom(tibble,as_tibble) 112 | importFrom(utils,menu) 113 | importFrom(utils,read.table) 114 | importFrom(utils,write.table) 115 | importFrom(xml2,xml_find_all) 116 | importFrom(xml2,xml_text) 117 | -------------------------------------------------------------------------------- /R/query.R: -------------------------------------------------------------------------------- 1 | #' Start a query of GDC metadata 2 | #' 3 | #' The basis for all functionality in this package 4 | #' starts with constructing a query in R. The GDCQuery 5 | #' object contains the filters, facets, and other 6 | #' parameters that define the returned results. A token 7 | #' is required for accessing certain datasets. 
8 | #' 9 | #' @aliases GDCQuery 10 | #' 11 | #' @param entity character vector, including one of the entities in .gdc_entities 12 | #' @param filters a filter list, typically created using \code{\link{make_filter}}, or added 13 | #' to an existing \code{GDCQuery} object using \code{\link{filter}}. 14 | #' @param facets a character vector of facets for counting common values. 15 | #' See \code{\link{available_fields}}. In general, one will not specify this parameter 16 | #' but will use \code{\link{facet}} instead. 17 | #' @param fields a character vector of fields to return. See \code{\link{available_fields}}. 18 | #' In general, one will not specify fields directly, but instead use \code{\link{select}} 19 | #' @param expand a character vector of "expands" to include in returned data. See 20 | #' \code{\link{available_expand}} 21 | #' 22 | #' @return An S3 object of class \code{GDCQuery} (a list) 23 | #' with the following members. 24 | #' \itemize{ 25 | #' \item{fields} 26 | #' \item{filters} 27 | #' \item{facets} 28 | #' \item{expand} 29 | #' } 30 | #' The object also carries the entity-specific class 31 | #' \code{gdc_<entity>}, e.g. \code{gdc_cases}. 32 | #' 33 | #' @examples 34 | #' qcases = query('cases') 35 | #' # equivalent to: 36 | #' qcases = cases() 37 | #' 38 | #' @export 39 | query = function(entity, 40 | filters=NULL, 41 | facets=NULL, 42 | expand = NULL, 43 | fields=default_fields(entity), 44 | ...) 45 | { 46 | stopifnot(entity %in% .gdc_entities) 47 | ret = structure( 48 | list( 49 | fields = fields, 50 | filters = filters, 51 | facets = facets, 52 | expand = expand), 53 | class = c(paste0('gdc_',entity),'GDCQuery','list') 54 | ) 55 | return(ret) 56 | } 57 | 58 | 59 | #' @describeIn query convenience constructor for a GDCQuery for cases 60 | #' 61 | #' @param ... passed through to \code{\link{query}} 62 | #' 63 | #' @export 64 | cases = function(...) {return(query('cases',...))} 65 | 66 | #' @describeIn query convenience constructor for a GDCQuery for files 67 | #' @export 68 | files = function(...)
{return(query('files',...))} 69 | 70 | #' @describeIn query convenience constructor for a GDCQuery for projects 71 | #' @export 72 | projects = function(...) {return(query('projects',...))} 73 | 74 | #' @describeIn query convenience constructor for a GDCQuery for annotations 75 | #' @export 76 | annotations = function(...) {return(query('annotations',...))} 77 | 78 | #' @describeIn query convenience constructor for a GDCQuery for ssms 79 | #' @export 80 | ssms = function(...) {return(query("ssms", ...))} 81 | 82 | #' @describeIn query convenience constructor for a GDCQuery for ssm_occurrences 83 | #' @export 84 | ssm_occurrences = function(...) {return(query("ssm_occurrences", ...))} 85 | 86 | #' @describeIn query convenience constructor for a GDCQuery for cnvs 87 | #' @export 88 | cnvs = function(...) {return(query("cnvs", ...))} 89 | 90 | #' @describeIn query convenience constructor for a GDCQuery for cnv_occurrences 91 | #' @export 92 | cnv_occurrences = function(...) {return(query("cnv_occurrences", ...))} 93 | 94 | #' @describeIn query convenience constructor for a GDCQuery for genes 95 | #' @export 96 | genes = function(...) {return(query("genes", ...))} 97 | -------------------------------------------------------------------------------- /R/caching.R: -------------------------------------------------------------------------------- 1 | #' Work with gdc cache directory 2 | #' 3 | #' The GenomicDataCommons package will cache downloaded 4 | #' files to minimize network traffic and allow for 5 | #' offline work. These functions are used to create a cache directory 6 | #' if one does not exist, set a global option, and query that 7 | #' option. The cache directory will default to the user "cache" 8 | #' directory according to specifications in 9 | #' \code{\link[rappdirs]{app_dir}}. However, the user may want to set 10 | #' this to another directory with more or higher performance 11 | #' storage.
12 | #' 13 | #' @return character(1) directory path that serves as 14 | #' the base directory for GenomicDataCommons downloads. 15 | #' 16 | #' @details 17 | #' The cache structure is currently just a directory with each file 18 | #' being represented by a path constructed as: 19 | #' CACHEDIR/UUID/FILENAME. The cached files can be manipulated 20 | #' using standard file system commands (removing, finding, 21 | #' etc.). In this sense, the cache system is minimalist in design. 22 | #' 23 | #' @examples 24 | #' gdc_cache() 25 | #' \dontrun{ 26 | #' gdc_set_cache(getwd()) 27 | #' } 28 | #' 29 | #' @export 30 | gdc_cache = function() 31 | { 32 | cache_dir = getOption('gdc_cache',gdc_set_cache(verbose=FALSE)) 33 | if(!dir.exists(cache_dir)) { 34 | gdc_set_cache(cache_dir) 35 | } 36 | return(cache_dir) 37 | } 38 | 39 | #' @describeIn gdc_cache (Re)set the GenomicDataCommons cache 40 | #' directory 41 | #' 42 | #' @importFrom utils menu 43 | #' @importFrom rappdirs app_dir 44 | #' 45 | #' 46 | #' @param create_without_asking logical(1) specifying whether to allow 47 | #' the function to create the cache directory without asking the 48 | #' user first. In an interactive session, if the cache directory 49 | #' does not exist, the user will be prompted before creation. 50 | #' 51 | #' @param verbose logical(1) whether or not to message the location of 52 | #' the cache directory after creation. 53 | #' 54 | #' @param directory character(1) directory path, will be created 55 | #' recursively if not present.
56 | #' 57 | #' 58 | #' @return the cache directory path (invisibly); the directory is created first if it does not already exist 59 | #' 60 | #' @export 61 | gdc_set_cache = function(directory = rappdirs::app_dir(appname = 62 | "GenomicDataCommons")$cache(), 63 | verbose = TRUE, 64 | create_without_asking = !interactive()) 65 | { 66 | 67 | create_path = function(directory) { 68 | dir.create(directory, recursive = TRUE, showWarnings = 69 | FALSE) 70 | } 71 | 72 | if(is.character(directory) && length(directory)==1) { 73 | # if directory exists, move on 74 | if(!dir.exists(directory)) { 75 | # if not in an interactive session, go 76 | # ahead and create directory without user 77 | # input. 78 | if(create_without_asking) { 79 | create_path(directory) 80 | } else { 81 | # If in an interactive environment, 82 | # go ahead and ask user for agreement. 83 | response = menu(c("Yes", "No"), 84 | title=sprintf("Would you like to create a GDC Cache directory at %s", directory)) 85 | if(response == 1) { 86 | create_path(directory) 87 | } else { 88 | stop("GDC Cache directory cannot be created without user agreement") 89 | } 90 | } 91 | } 92 | options('gdc_cache' = directory) 93 | } else { 94 | stop("directory should be a character(1)") 95 | } 96 | if(verbose) message("GDC Cache directory set to: ", directory) 97 | invisible(directory) 98 | } 99 | -------------------------------------------------------------------------------- /R/slicing.R: -------------------------------------------------------------------------------- 1 | #' Query GDC for data slices 2 | #' 3 | #' This function returns a BAM file representing reads overlapping 4 | #' regions specified either as chromosomal regions or as gencode gene 5 | #' symbols.
6 | #' 7 | #' @param uuid character(1) identifying the BAM file resource 8 | #' 9 | #' @param regions character() vector describing chromosomal regions, 10 | #' e.g., \code{c("chr1", "chr2:10000", "chr3:10000-20000")} (all 11 | #' of chromosome 1, chromosome 2 from position 10000 to the end, 12 | #' chromosome 3 from 10000 to 20000). 13 | #' 14 | #' @param symbols character() vector of gencode gene symbols, e.g., 15 | #' \code{c("BRCA1", "PTEN")}. Exactly one of \code{regions} and \code{symbols} must be supplied. 16 | #' 17 | #' @param destination character(1) file path for the BAM file slice; 18 | #' defaults to \code{<uuid>.bam} inside \code{tempdir()} 19 | #' 20 | #' @param overwrite logical(1) default FALSE can destination be 21 | #' overwritten? 22 | #' 23 | #' @param progress logical(1) default \code{interactive()} should a 24 | #' progress bar be used? 25 | #' 26 | #' @param token character(1) security token allowing access to 27 | #' restricted data. Almost all BAM data is restricted, so a token is 28 | #' usually required. See 29 | #' \url{https://docs.gdc.cancer.gov/Data/Data_Security/Data_Security/#authentication-tokens}. 30 | #' 31 | #' @details This function uses the Genomic Data Commons "slicing" API 32 | #' to get portions of a BAM file specified either using "regions" 33 | #' or using HGNC gene symbols. 34 | #' 35 | #' @return character(1) destination to the downloaded BAM file 36 | #' 37 | #' @importFrom httr progress 38 | #' @importFrom jsonlite toJSON 39 | #' 40 | #' @examples 41 | #' \dontrun{ 42 | #' slicing("df80679e-c4d3-487b-934c-fcc782e5d46e", 43 | #' regions="chr17:75000000-76000000", 44 | #' token=gdc_token()) 45 | #' 46 | #' # Get 10 BAM files.
47 | #' bamfiles = files() |> 48 | #' filter(data_format=='BAM') |> 49 | #' results(size=10) |> ids() 50 | #' 51 | #' # Current alignments at the GDC are to GRCh38 52 | #' library('TxDb.Hsapiens.UCSC.hg38.knownGene') 53 | #' all_genes = genes(TxDb.Hsapiens.UCSC.hg38.knownGene) 54 | #' 55 | #' first3genes = all_genes[1:3] 56 | #' # remove strand info 57 | #' strand(first3genes) = '*' 58 | #' 59 | #' # We can get our regions easily now 60 | #' as.character(first3genes) 61 | #' 62 | #' # Use parallel downloads to speed processing 63 | #' library(BiocParallel) 64 | #' register(MulticoreParam()) 65 | #' 66 | #' fnames = bplapply(bamfiles, slicing, overwrite = TRUE, 67 | #' regions=as.character(first3genes)) 68 | #' 69 | #' # 10 BAM files 70 | #' fnames 71 | #' 72 | #' library(GenomicAlignments) 73 | #' lapply(unlist(fnames), readGAlignments) 74 | #' 75 | #' } 76 | #' @export 77 | slicing <- function(uuid, regions, symbols, destination=file.path(tempdir(), paste0(uuid, '.bam')), 78 | overwrite=FALSE, progress=interactive(), token=gdc_token()) 79 | { 80 | stopifnot(is.character(uuid), length(uuid) == 1L) 81 | stopifnot(missing(regions) || missing(symbols), 82 | !(missing(regions) && missing(symbols))) 83 | stopifnot(is.character(destination), length(destination) == 1L, 84 | (overwrite && file.exists(destination)) || !file.exists(destination)) 85 | 86 | if (!missing(symbols)) 87 | body <- list(gencode=I(symbols)) 88 | else 89 | ## FIXME: validate regions 90 | body <- list(regions=regions) 91 | 92 | response <- .gdc_post( 93 | endpoint=sprintf("slicing/view/%s", uuid), 94 | add_headers('Content-type'='application/json'), 95 | write_disk(destination, overwrite), 96 | if (progress) progress() else NULL, 97 | body=toJSON(body), token=token) 98 | if (progress) 99 | cat("\n") 100 | 101 | destination 102 | } 103 | -------------------------------------------------------------------------------- /R/manifest.R: 
-------------------------------------------------------------------------------- 1 | #' Prepare GDC manifest file for bulk download 2 | #' 3 | #' The \code{manifest} function/method creates a manifest of files to be downloaded 4 | #' using the GDC Data Transfer Tool. There are methods for 5 | #' creating manifest data frames from \code{\link{GDCQuery}} objects 6 | #' that contain file information ("cases" and "files" queries). 7 | #' 8 | #' @param x A \code{\link{GDCQuery}} object of subclass "gdc_files", or a "GDCfilesResponse" / "GDCcasesResponse" object. 9 | #' 10 | #' @param size The total number of records to return. Default 11 | #' will return the usually desirable full set of records. 12 | #' 13 | #' @param from Record number from which to start when returning the manifest. 14 | #' 15 | #' @param ... passed to \code{\link[httr]{POST}}. 16 | #' 17 | #' @return A \code{\link[tibble]{tibble}}, also of class "GDCManifest", with five columns: 18 | #' \itemize{ 19 | #' \item{id} 20 | #' \item{filename} 21 | #' \item{md5} 22 | #' \item{size} 23 | #' \item{state} 24 | #'} 25 | #' 26 | #' @examples 27 | #' gFiles = files() 28 | #' shortManifest = gFiles |> manifest(size=10) 29 | #' head(shortManifest,n=3) 30 | #' 31 | #' 32 | #' @export 33 | manifest <- function(x,from=0,size=count(x),...) { 34 | UseMethod('manifest',x) 35 | } 36 | 37 | #' @describeIn manifest method for "gdc_files" queries 38 | #' 39 | #' @export 40 | manifest.gdc_files <- function(x,from=0,size=count(x),...) { 41 | .manifestCall(x=x,from=from,size=size,...) 42 | } 43 | 44 | #' @describeIn manifest method for "GDCfilesResponse" objects; uses the query stored in the response 45 | #' 46 | #' @export 47 | manifest.GDCfilesResponse <- function(x,from=0,size=count(x),...) { 48 | .manifestCall(x=x$query,from=from,size=size,...) 49 | } 50 | 51 | #' @describeIn manifest method for "GDCcasesResponse" objects; re-dispatches on the query stored in the response 52 | #' 53 | #' @export 54 | manifest.GDCcasesResponse <- function(x,from=0,size=count(x),...) { 55 | manifest(x=x$query,from=from,size=size,...) 56 | } 57 | 58 | 59 | 60 | #' @importFrom readr read_tsv 61 | .manifestCall <- function(x,from=0,size=count(x),...)
{ 62 | body = Filter(function(z) !is.null(z),x) 63 | body[['facets']]=NULL 64 | body[['fields']]=paste0(default_fields(x),collapse=",") 65 | body[['from']]=from 66 | body[['size']]=size 67 | # remove return_type for now 68 | # body[['return_type']]='manifest' 69 | tmp <- httr::content( 70 | .gdc_post(entity_name(x), body=body, token=NULL, ...), 71 | as = "text", encoding = "UTF-8" 72 | ) 73 | tmp <- jsonlite::fromJSON(tmp)[["data"]][["hits"]] 74 | if ("acl" %in% names(tmp)) 75 | tmp <- tidyr::unnest_wider(data = tmp, col = "acl", names_sep = "_") 76 | if(is.null(ncol(tmp)) || ncol(tmp)<5) { # zero hits parse to an empty list, for which ncol() is NULL 77 | tmp=data.frame() 78 | } 79 | class(tmp) <- c('GDCManifest',class(tmp)) 80 | return(tmp) 81 | } 82 | 83 | #' write a manifest data.frame to disk 84 | #' 85 | #' The \code{\link{manifest}} method creates a data.frame 86 | #' that represents the data for a manifest file needed 87 | #' by the GDC Data Transfer Tool. While the file format 88 | #' is nothing special, this is a simple helper function 89 | #' to write a manifest data.frame to disk. It returns 90 | #' the path to which the file is written, so it can 91 | #' be used "in-line" in a call to \code{\link{transfer}}. 92 | #' 93 | #' @param manifest A data.frame with five columns, typically 94 | #' created by a call to \code{\link{manifest}} 95 | #' 96 | #' @param destfile The filename for saving the manifest. 97 | #' 98 | #' @return character(1) the destination file name.
99 | #' 100 | #' @importFrom utils write.table 101 | #' 102 | #' @examples 103 | #' mf = files() |> manifest(size=10) 104 | #' write_manifest(mf) 105 | #' 106 | #' @export 107 | write_manifest <- function(manifest,destfile=tempfile()) { 108 | stopifnot( 109 | all(.gdc_manifest_colnames %in% colnames(manifest)), 110 | ncol(manifest) > 5 111 | ) 112 | write.table(manifest,file=destfile,sep="\t", 113 | col.names=TRUE,row.names=FALSE,quote=FALSE) 114 | destfile 115 | } 116 | 117 | -------------------------------------------------------------------------------- /R/clinical.R: -------------------------------------------------------------------------------- 1 | #' Get clinical information from GDC 2 | #' 3 | #' The NCI GDC has a complex data model that allows various studies to 4 | #' supply numerous clinical and demographic data elements. However, 5 | #' across all projects that enter the GDC, there are 6 | #' similarities. This function returns four data.frames associated 7 | #' with case_ids from the GDC. 8 | #' 9 | #' @param case_ids a character() vector of case_ids, typically from 10 | #' "cases" query. 11 | #' 12 | #' @param include_list_cols logical(1), whether to include list 13 | #' columns in the "main" data.frame. These list columns have 14 | #' values for aliquots, samples, etc. While these may be useful 15 | #' for some situations, they are generally not that useful as 16 | #' clinical annotations. 17 | #' 18 | #' @importFrom jsonlite fromJSON 19 | #' @importFrom dplyr bind_rows 20 | #' @importFrom tibble as_tibble 21 | #' 22 | #' @details 23 | #' Note that these data.frames can, in general, have different numbers 24 | #' of rows (or even no rows at all). If one wishes to combine to 25 | #' produce a single data.frame, using the approach of left joining to 26 | #' the "main" data.frame will yield a useful combined data.frame. We 27 | #' do not do that directly given the potential for 1:many 28 | #' relationships. 
It is up to the user to determine what the best 29 | #' approach is for any given dataset. 30 | #' 31 | #' 32 | #' @return 33 | #' A list (of class "GDCClinicalList") of five tibbles: 34 | #' \enumerate{ 35 | #' \item demographic 36 | #' \item diagnoses 37 | #' \item exposures 38 | #' \item follow_ups 39 | #' \item main, representing basic case identification and metadata (update date, etc.) 40 | #' } 41 | #' 42 | #' 43 | #' @examples 44 | #' case_ids = cases() |> results(size=10) |> ids() 45 | #' clinical_data = gdc_clinical(case_ids) 46 | #' 47 | #' # overview of clinical results 48 | #' class(clinical_data) 49 | #' names(clinical_data) 50 | #' sapply(clinical_data, class) 51 | #' sapply(clinical_data, nrow) 52 | #' 53 | #' # available data 54 | #' head(clinical_data$main) 55 | #' head(clinical_data$demographic) 56 | #' head(clinical_data$diagnoses) 57 | #' head(clinical_data$exposures) 58 | #' 59 | #' @export 60 | gdc_clinical = function(case_ids, include_list_cols = FALSE) { 61 | stopifnot(is.character(case_ids)) 62 | stopifnot(is.logical(include_list_cols), length(include_list_cols)==1) 63 | resp = cases() |> 64 | filter( ~ case_id %in% case_ids) |> 65 | expand(c("diagnoses", 66 | "demographic", 67 | "exposures", 68 | "follow_ups.other_clinical_attributes")) |> 69 | response_all(response_handler = function(x) jsonlite::fromJSON(x, simplifyDataFrame = TRUE)) 70 | demographic = resp$results$demographic 71 | demographic$case_id = rownames(demographic) 72 | 73 | nodx <- vapply(resp$results$diagnoses, is.null, logical(1L)) 74 | if (any(nodx)) 75 | resp$results$diagnoses[nodx] <- list(data.frame()) 76 | 77 | diagnoses <- suppressMessages({ 78 | bind_rows( 79 | lapply(resp$results$diagnoses, readr::type_convert), 80 | .id = "case_id" 81 | ) 82 | }) 83 | 84 | exposures = bind_rows(resp$results$exposures, .id = "case_id") 85 | follow_ups = bind_rows(resp$results$follow_ups, .id = "case_id") 86 | 87 | # set up main table by removing data.frame columns 88 | cnames = setdiff(colnames(resp$results),
c('exposures', 'follow_ups', 'diagnoses', 'demographic')) 89 | main = resp$results[, cnames, drop = FALSE] 90 | 91 | if(!include_list_cols) { 92 | non_list_cols = names(main)[!vapply(main, function(col) identical(class(col), 'list'), logical(1))] 93 | main = main[, non_list_cols, drop = FALSE] 94 | } 95 | 96 | y = list(demographic = as_tibble(demographic), 97 | diagnoses = as_tibble(diagnoses), 98 | exposures = as_tibble(exposures), 99 | follow_ups = as_tibble(follow_ups), 100 | main = as_tibble(main)) 101 | class(y) = c('GDCClinicalList', class(y)) 102 | return(y) 103 | } 104 | -------------------------------------------------------------------------------- /R/REST.R: -------------------------------------------------------------------------------- 1 | #" (internal) Extract header field element from httr response 2 | #' @importFrom httr headers 3 | .gdc_header_elt <- function(response, field, element) { 4 | value <- headers(response)[[field]] 5 | if (is.null(value)) 6 | stop("response header does not contain field '", field, "'") 7 | 8 | value <- strsplit(strsplit(value, "; *")[[1L]], "= *") 9 | key <- vapply(value, `[[`, character(1), 1L) 10 | idx <- element == key 11 | if (sum(idx) != 1L) 12 | stop("response header field '", field, 13 | "' does not contain unique element '", element, "'") 14 | 15 | value[[which(idx)]][[2]] 16 | } 17 | 18 | #" (internal) Rename a file 'from' to 'to' 19 | .gdc_file_rename <- function(from, to, overwrite) { 20 | if (overwrite && file.exists(to)) 21 | unlink(to) 22 | 23 | reason <- NULL 24 | status <- withCallingHandlers({ 25 | file.rename(from, to) 26 | }, warning=function(w) { 27 | reason <<- conditionMessage(w) 28 | invokeRestart("muffleWarning") 29 | }) 30 | unlink(from) 31 | if (!status) 32 | stop("failed to rename downloaded file:\n", 33 | "\n from: '", from, "'", 34 | "\n to: '", to, "'", 35 | "\n reason:", 36 | "\n", .wrapstr(reason)) 37 | else if (!is.null(reason)) 38 | warning(reason) # forward non-fatal file rename warning 39 | 40 | to 41 | } 42 | 43 | #" (internal)
GET endpoint / uri 44 | #' @importFrom httr GET add_headers stop_for_status 45 | .gdc_get <- 46 | function(endpoint, parameters=list(), token=NULL, ..., base=.gdc_base) 47 | { 48 | stopifnot(is.character(endpoint), length(endpoint) == 1L) 49 | uri <- sprintf("%s/%s", base, endpoint) 50 | uri <- sprintf("%s%s", uri, .parameter_string(parameters)) 51 | if(getOption('gdc.verbose',FALSE)) { 52 | message("GET request uri:\n",uri) 53 | } 54 | response <- GET(uri, add_headers(`X-Auth-Token`=token), 55 | #config = httr::config(ssl_verifypeer = FALSE), 56 | ...) 57 | stop_for_status(response) 58 | response 59 | } 60 | 61 | #" (internal) POST endpoint / uri 62 | #' @importFrom httr POST add_headers write_disk stop_for_status 63 | .gdc_post <- 64 | function(endpoint, body, token=NULL, ..., base=.gdc_base) 65 | { 66 | stopifnot(is.character(endpoint), length(endpoint) == 1L) 67 | uri <- sprintf("%s/%s", base, endpoint) 68 | if(getOption('gdc.verbose',FALSE)) { 69 | message("POST request uri:\n",uri) 70 | message("POST body: ",jsonlite::toJSON(body)) 71 | } 72 | if('fields' %in% names(body)) 73 | body[['fields']] = paste0(body[['fields']],collapse=',') 74 | response <- POST( 75 | uri, add_headers( 76 | `X-Auth-Token` = token, 77 | Accept = "application/json", 78 | `Content-Type` = "application/json" 79 | ), 80 | ..., 81 | #config = httr::config(ssl_verifypeer = FALSE), 82 | body=body, encode="json") 83 | stop_for_status(response) 84 | } 85 | 86 | #" (internal) Download one file from GDC, renaming to remote filename 87 | #' @importFrom httr GET write_disk add_headers stop_for_status 88 | .gdc_download_one <- 89 | function(uri, destination, overwrite, progress, token=NULL, base=.gdc_base) 90 | { 91 | uri = sprintf('%s/%s',base,uri) 92 | if(getOption('gdc.verbose',FALSE)) { 93 | message("GET request uri:\n",uri) 94 | } 95 | 96 | if(!dir.exists(destination)) { 97 | dir.create(destination) 98 | } 99 | destfile = file.path(destination, '.partial_download') 100 | 101 | response <- 
GET(uri, write_disk(destfile, overwrite = TRUE), 102 | if (progress) progress() else NULL, 103 | add_headers(`X-Auth-Token`=token)) 104 | stop_for_status(response) 105 | if (progress) cat("\n") 106 | 107 | filename <- .gdc_header_elt(response, "content-disposition", "filename") 108 | to <- file.path(destination, filename) 109 | .gdc_file_rename(destfile, to, overwrite) 110 | } 111 | -------------------------------------------------------------------------------- /R/gdcdata.R: -------------------------------------------------------------------------------- 1 | #' Download GDC files 2 | #' 3 | #' Download one or more files from GDC. Files are downloaded using the 4 | #' UUID and renamed to the file name on the remote system. By default, 5 | #' neither the uuid nor the file name on the remote system can exist. 6 | #' 7 | #' This function is appropriate for one or several files; for large 8 | #' downloads use \code{\link{manifest}} to create a manifest for and 9 | #' the GDC Data Transfer Tool. 10 | #' 11 | #' @param uuids character() of GDC file UUIDs. 12 | #' 13 | #' @param use_cached logical(1) default TRUE indicating that, 14 | #' if found in the cache, the file will not be downloaded 15 | #' again. If FALSE, all supplied uuids will be re-downloaded. 16 | #' 17 | #' @param progress logical(1) default TRUE in interactive sessions, 18 | #' FALSE otherwise indicating whether a progress par should be 19 | #' produced for each file download. 20 | #' 21 | #' @param access_method character(1), either 'api' or 'client'. See details. 22 | #' 23 | #' @param transfer_args character(1), additional arguments to pass to 24 | #' the gdc-client command line. See \code{\link{gdc_client}} and 25 | #' \code{\link{transfer_help}} for details. 26 | #' 27 | #' @param token (optional) character(1) security token allowing access 28 | #' to restricted data. See 29 | #' \url{https://gdc-docs.nci.nih.gov/API/Users_Guide/Authentication_and_Authorization/}. 30 | #' 31 | #' @param ... 
further arguments passed to files 32 | #' 33 | #' @seealso \code{\link{manifest}} for downloading large data. 34 | #' 35 | #' @return a named vector with file uuids as the names and paths as 36 | #' the value 37 | #' 38 | #' @details When access_method is "api", the GDC "data" endpoint is the 39 | #' transfer mechanism used. The alternative access_method, "client", will 40 | #' utilize the \code{gdc-client} transfer tool, which must be 41 | #' downloaded separately and available. See 42 | #' \code{\link{gdc_client}} for details on specifying the location 43 | #' of the gdc-client executable. 44 | #' 45 | #' 46 | #' @examples 47 | #' # get some example file uuids 48 | #' uuids <- files() |> 49 | #' filter(~ access == 'open' & file_size < 100000) |> 50 | #' results(size = 3) |> 51 | #' ids() 52 | #' 53 | #' # and get the data, placing it into the gdc_cache() directory 54 | #' gdcdata(uuids, use_cached=TRUE) 55 | #' 56 | #' @export 57 | gdcdata <- 58 | function(uuids, use_cached=TRUE, 59 | progress=interactive(), token=NULL, access_method='api', 60 | transfer_args = character(), ...) 61 | { 62 | stopifnot(is.character(uuids)) 63 | 64 | uuids = trimws(uuids) 65 | manifest = files(...) 
|> 66 | GenomicDataCommons::filter( ~ file_id %in% uuids ) |> 67 | GenomicDataCommons::manifest() 68 | # files from previous downloads should have the following 69 | # path and filenames 70 | fs = file.path(gdc_cache(), manifest[["id"]], manifest[["file_name"]]) 71 | 72 | # Restrict new manifest to those that we need to download, 73 | to_do_manifest = manifest[!file.exists(fs),] 74 | 75 | # These are the uuids of the cache misses 76 | missing_uuids = to_do_manifest[["id"]] 77 | 78 | # And these are the cache hits 79 | names(fs) = manifest[["id"]] 80 | 81 | # Using API download to fetch missing uuids 82 | endpoint <- "data" 83 | cache_dir <- gdc_cache() 84 | 85 | destinations <- file.path(cache_dir, missing_uuids) 86 | if(access_method == 'api') { 87 | uris <- sprintf("%s/%s", endpoint, missing_uuids) 88 | value <- mapply(.gdc_download_one, uris, destinations, 89 | MoreArgs=list(overwrite=!use_cached, progress=progress, 90 | token=token), 91 | SIMPLIFY=TRUE, USE.NAMES=FALSE) 92 | names(value) <- missing_uuids 93 | } else { 94 | ## in the future, may want to transition to 95 | ## passing the actual manifest, since we 96 | ## are going to regenerate it, anyway. 97 | value = NULL 98 | if(length(missing_uuids)>0) 99 | value = transfer(missing_uuids, token = token, args = transfer_args) 100 | } 101 | 102 | # combine cache hits with cache misses 103 | # 104 | # Return vector of file file path, name=uuid 105 | fs 106 | } 107 | -------------------------------------------------------------------------------- /R/fields.R: -------------------------------------------------------------------------------- 1 | #' S3 Generic to return all GDC fields 2 | #' 3 | #' @param x A character(1) string ('cases','files','projects', 4 | #' 'annotations') or an subclass of \code{\link{GDCQuery}}. 
#' @return a character vector of all fields available for the entity
#'
#' @examples
#' available_fields('projects')
#' projQuery = query('projects')
#' available_fields(projQuery)
#'
#' @export
available_fields <- function(x) {
    UseMethod("available_fields", x)
}

#' @describeIn available_fields GDCQuery method
#' @export
available_fields.GDCQuery <- function(x) {
    # Resolve the query to its entity name, then read the `field` column
    # of that entity's mapping.
    mapping(entity_name(x))[["field"]]
}

#' @describeIn available_fields character method
#' @export
available_fields.character <- function(x) {
    # Exactly one entity name is accepted, and it must be a known entity.
    stopifnot(length(x) == 1, x %in% .gdc_entities)
    mapping(x)[["field"]]
}


#' S3 Generic to return default GDC fields
#'
#' @param x A character string ('cases','files','projects',
#' 'annotations') or a subclass of \code{\link{GDCQuery}}.
#' @return a character vector of the default fields
#'
#' @examples
#' default_fields('projects')
#' projQuery = query('projects')
#' default_fields(projQuery)
#'
#' @export
default_fields <- function(x) {
    UseMethod("default_fields", x)
}

#' @describeIn default_fields character method
#' @export
default_fields.character <- function(x) {
    stopifnot(length(x) == 1, x %in% .gdc_entities)
    field_map <- mapping(x)
    # Explicit indexing instead of subset(): no non-standard evaluation and
    # no dummy binding needed to silence R CMD check. which() drops NA
    # flags, which is what subset() did as well.
    field_map[["field"]][which(field_map[["defaults"]])]
}

#' @describeIn default_fields GDCQuery method
#' @export
default_fields.GDCQuery <- function(x) {
    # Delegate to the character method via the query's entity name.
    default_fields(entity_name(x))
}

#' S3 generic to set GDCQuery fields
#'
#' @param x the objects on which to set fields
#' @param fields a character vector specifying the fields
#'
#'
#' @return A \code{\link{GDCQuery}} object, with the fields
#' member altered.
#'
#' @examples
#' gProj = projects()
#' gProj$fields
#' head(available_fields(gProj))
#' default_fields(gProj)
#'
#' gProj |>
#'   select(default_fields(gProj)[1:2]) |>
#'   response() |>
#'   str(max.level=2)
#'
#' @export
select <- function(x, fields) {
    UseMethod("select", x)
}



# (internal) rectify specified fields with available fields
#
# entity: character(1) entity name, e.g. 'cases' or 'files'.
# fields: character vector (or NULL) of requested field names.
#
# Stops with a single error message naming every requested field that is
# not among available_fields(entity). Otherwise returns `fields` with the
# entity's own id field ('<singular entity>_id') placed first.
.gdcRectifyFieldsForEntity <- function(entity, fields) {
    known <- available_fields(entity)
    mismatches <- fields[!(fields %in% known)]
    if (length(mismatches) > 0) {
        # sprintf() is vectorised: collapse the offenders first so that
        # several bad fields yield one message instead of one per field.
        stop(sprintf(
            'fields specified included fields not available in %s including (%s)',
            entity, paste(mismatches, collapse = ", ")))
    }
    # Always request the id field; union() keeps it first and de-duplicates.
    union(paste0(sub('s$', '', entity), "_id"), fields)
}

#' @describeIn select set fields on a GDCQuery object
#' @export
select.GDCQuery <- function(x, fields) {
    x$fields <- .gdcRectifyFieldsForEntity(entity_name(x), fields)
    x
}

#' Find matching field names
#'
#' This utility function allows quick text-based search of available
#' fields for using \code{\link{grep}}
#'
#' @param entity one of the available gdc entities ('files','cases',...)
#' against which to gather available fields for matching
#'
#' @param pattern A regular expression that will be used
#' in a call to \code{\link{grep}}
#'
#' @param ...
#' passed on to grep
#'
#' @param value logical(1) whether to return values as opposed
#' to indices (passed along to grep)
#'
#' @return character() vector of field names matching
#' \code{pattern} (or their integer indices when \code{value=FALSE})
#'
#' @examples
#' grep_fields('files','analysis')
#'
#' @export
grep_fields <- function(entity, pattern, ..., value=TRUE) {
    stopifnot(entity %in% .gdc_entities)
    # Forward the caller's `value`; it used to be hard-coded to TRUE, which
    # made the documented argument a no-op.
    grep(pattern = pattern,
         x = available_fields(entity),
         value = value, ...)
}

#' Find common values for a GDC field
#'
#' @param entity character(1), a GDC entity ("cases", "files", "annotations", "projects")
#' @param field character(1), a field that is present in the entity record
#'
#' @return character vector of the top 100 (or fewer) most frequent
#' values for the given field
#'
#' @examples
#' available_values('files','cases.project.project_id')[1:5]
#'
#' @export
available_values <- function(entity, field) {
    stopifnot(entity %in% .gdc_entities)
    # Facet the entity on `field` and return the bucket keys of the
    # resulting aggregation.
    agg <- query(entity) |> facet(field) |> aggregations()
    agg[[field]]$key
}

#' S3 Generic that returns the field description text, if available
#'
#' @param entity character(1) string ('cases','files','projects',
#' 'annotations', etc.) or a subclass of \code{\link{GDCQuery}}.
#'
#' @param field character(1), the name of the field that will be used to look
#' up the description.
#'
#' @return character(1) descriptive text or character(0) if no description
#' is available.
163 | #' 164 | #' @examples 165 | #' field_description('cases', 'annotations.category') 166 | #' casesQuery = query('cases') 167 | #' field_description(casesQuery, 'annotations.category') 168 | #' field_description(cases(), 'annotations.category') 169 | #' 170 | #' @export 171 | field_description = function(entity, field) { 172 | UseMethod('field_description',entity) 173 | } 174 | 175 | #' @describeIn field_description GDCQuery method 176 | #' @export 177 | field_description.GDCQuery = function(entity, field) { 178 | stopifnot(length(field)==1) 179 | m = mapping(entity_name(entity)) 180 | return(m$description[m$field==field]) 181 | } 182 | 183 | #' @describeIn field_description character method 184 | #' @export 185 | field_description.character = function(entity, field) { 186 | stopifnot(length(entity)==1,entity %in% .gdc_entities) 187 | stopifnot(length(field)==1) 188 | m = mapping(entity) 189 | return(m$description[m$field==field]) 190 | } 191 | -------------------------------------------------------------------------------- /R/bulk_transfer.R: -------------------------------------------------------------------------------- 1 | #' Bulk data download 2 | #' 3 | #' The GDC maintains a special tool, 4 | #' \href{https://docs.gdc.cancer.gov/Data_Transfer_Tool/Users_Guide/Getting_Started/}{the GDC Data Transfer Tool}, 5 | #' that enables high-performance, potentially parallel, and 6 | #' resumable downloads. The Data Transfer Tool is an external 7 | #' program that requires separate download. Due to recent changes in the 8 | #' GDC API, the transfer function now validates the version of the `gdc-client` 9 | #' to ensure reliable downloads. 10 | #' 11 | #' @param uuids character() vector of GDC file UUIDs 12 | #' 13 | #' @param args character() vector specifying command-line arguments to 14 | #' be passed to \code{gdc-client}. See \code{\link{transfer_help}} for 15 | #' possible values.
#' The arguments \code{--manifest}, \code{--dir},
#' and \code{--token-file} are determined by \code{manifest},
#' \code{destination_dir}, and \code{token}, respectively, and
#' should NOT be provided as elements of \code{args}.
#'
#' @param token character(1) containing security
#' token allowing access to restricted data. See
#' \url{https://gdc-docs.nci.nih.gov/API/Users_Guide/Authentication_and_Authorization/}.
#' Note that the GDC transfer tool requires a file for data
#' transfer. Therefore, this token will be written to a temporary
#' file (with appropriate permissions set), which is removed again
#' when \code{transfer()} exits.
#'
#' @param overwrite logical(1) default FALSE indicating whether
#' existing files with identical name should be over-written.
#' (This argument is accepted but not currently consulted by the
#' function body.)
#'
#' @return a named character vector of file paths below
#' \code{gdc_cache()}, one element per entry of \code{uuids} and named
#' by UUID. Elements are \code{NA} for UUIDs that do not appear in the
#' generated manifest.
#'
#' @examples
#' \dontrun{
#' uuids = files() |>
#'   filter(access == "open") |>
#'   results() |>
#'   ids()
#' file_paths <- transfer(uuids)
#' file_paths
#' names(file_paths)
#' # and with authentication
#' # REQUIRES gdc_token
#' # destination <- transfer(uuids,token=gdc_token())
#' }
#'
#' @importFrom utils read.table
#' @export
transfer <-
    function(uuids, args=character(), token=NULL, overwrite=FALSE)
{
    stopifnot(is.character(uuids))
    destination_dir <- gdc_cache()

    manifest <- files() |>
        GenomicDataCommons::filter( ~ file_id %in% uuids ) |>
        GenomicDataCommons::manifest()
    manifest_file <- write_manifest(manifest)

    # system2() does not quote `args`, so quote every path ourselves;
    # otherwise a cache or temp directory containing spaces breaks the
    # command line.
    dir_arg <- sprintf("--dir %s", shQuote(destination_dir))
    manifest_arg <- sprintf("--manifest %s", shQuote(manifest_file))

    token_arg <- NULL
    if (!is.null(token)) {
        token_file <- tempfile()
        # Registered before the secret is written so that the file is
        # removed even if the version check or the client lookup below
        # raises an error.
        on.exit(unlink(token_file), add = TRUE)
        # Create the file and restrict its permissions first, then write
        # the token, so the secret never sits in a more permissive file.
        stopifnot(file.create(token_file))
        Sys.chmod(token_file, mode = "600")
        writeLines(token, con = token_file)
        token_arg <- sprintf("--token-file %s", shQuote(token_file))
    }

    gdc_client_version_validate()
    cmd_args <- paste(c("download", dir_arg, manifest_arg, args, token_arg),
                      collapse = " ")
    system2(gdc_client(), cmd_args)

    # Look file names up by id rather than pairing `uuids` positionally
    # with the second manifest column: the manifest rows need not come
    # back in the order (or number) of the requested uuids.
    file_names <- as.character(manifest[["file_name"]])[
        match(uuids, manifest[["id"]])]
    filepaths <- file.path(destination_dir, uuids, file_names)
    filepaths[is.na(file_names)] <- NA_character_
    names(filepaths) <- uuids
    filepaths
}

#' return gdc-client executable path
#'
#' This function is a convenience function to
#' find and return the path to the GDC Data Transfer
#' Tool executable assumed to be named 'gdc-client'.
#' The assumption is that the appropriate version of the
#' GDC Data Transfer Tool is a separate download available
#' from \href{https://gdc.cancer.gov/access-data/gdc-data-transfer-tool}{the GDC website}
#' and as a backup from \href{https://github.com/NCI-GDC/gdc-client}{on github}.
#'
#' @details
#' The path is checked in the following order:
#' \enumerate{
#' \item an R option("gdc_client")
#' \item an environment variable GDC_CLIENT
#' \item from the search PATH
#' \item in the current working directory
#' }
#'
#' @return character(1) the path to the gdc-client executable.
#'
#' @examples
#' # this cannot run without first
#' # downloading the GDC Data Transfer Tool
#' gdc_client = try(gdc_client(),silent=TRUE)
#'
#' @export
gdc_client <- function() {
    # 1. explicit R option
    opt_path <- getOption("gdc_client")
    if (!is.null(opt_path) && file.exists(opt_path))
        return(opt_path)
    # 2. environment variable (file.exists("") is FALSE when unset)
    env_path <- Sys.getenv("GDC_CLIENT")
    if (file.exists(env_path))
        return(env_path)
    # 3. search PATH; Sys.which() yields "" when nothing is found
    path_hit <- Sys.which("gdc-client")
    if (!(path_hit == ''))
        return(path_hit)
    # 4. current working directory
    client <- dir('.', pattern = '^gdc-client$', full.names = TRUE)
    if (length(client) == 1 && client == './gdc-client')
        return(client)
    stop('gdc_client not found. Be sure to install the command \nline GDC client available from the GDC website.')
}

# (internal) version of the gdc-client found by gdc_client()
#
# Runs `gdc-client --version`, capturing stdout and stderr, and returns
# the first version-like token (digits separated by '.' or '-') as a
# package_version. Scanning for the token, rather than parsing the whole
# output, keeps this working when the client prints additional lines.
gdc_client_version <- function() {
    gc_loc <- gdc_client()
    out <- system2(gc_loc, "--version", stdout = TRUE, stderr = TRUE)
    hits <- regmatches(out, regexpr("[0-9]+([.-][0-9]+)+", out))
    if (length(hits) == 0L)
        stop("could not determine the 'gdc-client' version from: ",
             paste(out, collapse = " "))
    package_version(hits[[1L]])
}

.GDC_COMPATIBLE_VERSION <- "1.3.0"

#' @describeIn transfer
#'
#' If you are using the 'client' option, your `gdc-client` should be
#' up-to-date (>= 1.3.0).
#'
#' @param valid_version character(1) The last known version that works for the
#' current data release for which to validate against, not typically changed
#' by the end-user.
#'
#' @export
gdc_client_version_validate <-
    function(valid_version = .GDC_COMPATIBLE_VERSION)
{
    client_ver <- gdc_client_version()
    if (client_ver < package_version(valid_version))
        stop("Update the 'gdc_client' to a version >= ", valid_version)
}

#' \code{transfer_help()} queries the command line GDC Data
#' Transfer Tool, \code{gdc-client}, for available options to be used
#' in the \code{\link{transfer}} command.
155 | #' 156 | #' @describeIn transfer 157 | #' 158 | #' @export 159 | transfer_help <- function() { 160 | system2(gdc_client(), "download -h") 161 | } 162 | -------------------------------------------------------------------------------- /R/filters.R: -------------------------------------------------------------------------------- 1 | #.unary_op <- function(left) { 2 | # force(left) 3 | # function(e1) { 4 | # force(e1) 5 | # list(op=e1,content=c(field=left,value=c(right))) 6 | # } 7 | #} 8 | 9 | #' @importFrom jsonlite unbox 10 | .binary_op <- function(sep) { 11 | force(sep) 12 | function(e1, e2) { 13 | force(e1) 14 | force(e2) 15 | list(op=unbox(sep),content=list(field=e1,value=e2)) 16 | } 17 | } 18 | 19 | .missing_op <- function(sep) { 20 | force(sep) 21 | function(e1) { 22 | force(e1) 23 | list(op=unbox(sep), content = list(field = e1, value = "MISSING")) 24 | } 25 | } 26 | 27 | .negate_op <- function(sep) { 28 | force(sep) 29 | function(op) { 30 | force(op) 31 | list(op = unbox(sep), 32 | content = list( 33 | field = op$content$field, 34 | value = op$content$value) 35 | ) 36 | } 37 | } 38 | 39 | #' @importFrom jsonlite unbox 40 | .combine_op <- function(sep) { 41 | force(sep) 42 | function(e1, e2) { 43 | force(e1) 44 | force(e2) 45 | return(list(op=unbox(sep),content=list(e1,e2))) 46 | } 47 | } 48 | 49 | #.f_env = new.env(parent=emptyenv()) 50 | .f_env = list() 51 | .f_env$`==` = .binary_op('=') 52 | .f_env$`!=` = .binary_op('!=') 53 | .f_env$`<` = .binary_op('<') 54 | .f_env$'>' = .binary_op('>') 55 | .f_env$'&' = .combine_op('and') 56 | .f_env$'|' = .combine_op('or') 57 | .f_env$`<=` = .binary_op('<=') 58 | .f_env$'>=' = .binary_op('>=') 59 | .f_env$'%in%' = .binary_op('in') 60 | .f_env$'%exclude%' = .binary_op('exclude') 61 | .f_env$`missing` = .missing_op("is") 62 | .f_env$'!' 
= .negate_op("NOT") 63 | 64 | 65 | #' Create NCI GDC filters for limiting GDC query results 66 | #' 67 | #' Searching the NCI GDC allows for complex filtering based 68 | #' on logical operations and simple comparisons. This function 69 | #' facilitates writing such filter expressions in R-like syntax 70 | #' with R code evaluation. 71 | #' 72 | #' If used with available_fields, "bare" fields that are 73 | #' named in the available_fields character vector can be used 74 | #' in the filter expression without quotes. 75 | #' 76 | #' @param expr a lazy-wrapped expression or a formula RHS equivalent 77 | #' 78 | #' @param available_fields a character vector of the 79 | #' additional names that will be injected into the 80 | #' filter evaluation environment 81 | #' 82 | #' @return a \code{list} that represents an R version 83 | #' of the JSON that will ultimately be used in an 84 | #' NCI GDC search or other query. 85 | #' 86 | #' @importFrom rlang eval_tidy f_rhs f_env 87 | #' 88 | #' @export 89 | make_filter = function(expr,available_fields) { 90 | available_fields=as.list(available_fields) 91 | names(available_fields)=available_fields 92 | filt_env = c(as.list(.f_env),available_fields) 93 | if(is_formula(expr)) { 94 | return(rlang::eval_tidy(rlang::f_rhs(expr), data=filt_env, env = rlang::f_env(expr))) 95 | } else { 96 | return(rlang::eval_tidy(expr,data=filt_env)) 97 | } 98 | } 99 | 100 | 101 | 102 | #' Manipulating GDCQuery filters 103 | #' 104 | #' @name filtering 105 | #' 106 | #' @return A \code{\link{GDCQuery}} object with the filter 107 | #' field replaced by specified filter expression 108 | #' 109 | #' @examples 110 | #' # make a GDCQuery object to start 111 | #' # 112 | #' # Projects 113 | #' # 114 | #' pQuery = projects() 115 | #' 116 | #' # check for the default fields 117 | #' # so that we can use one of them to build a filter 118 | #' default_fields(pQuery) 119 | #' pQuery = filter(pQuery,~ project_id == 'TCGA-LUAC') 120 | #' get_filter(pQuery) 121 | #' 
122 | #' # 123 | #' # Files 124 | #' # 125 | #' fQuery = files() 126 | #' default_fields(fQuery) 127 | #' 128 | #' fQuery = filter(fQuery,~ data_format == 'VCF') 129 | #' # OR 130 | #' # with recent GenomicDataCommons versions: 131 | #' # no "~" needed 132 | #' fQuery = filter(fQuery, data_format == 'VCF') 133 | #' 134 | #' get_filter(fQuery) 135 | #' 136 | #' fQuery = filter(fQuery,~ data_format == 'VCF' 137 | #' & experimental_strategy == 'WXS' 138 | #' & type == 'simple_somatic_mutation') 139 | #' 140 | #' files() |> filter(~ data_format == 'VCF' 141 | #' & experimental_strategy=='WXS' 142 | #' & type == 'simple_somatic_mutation') |> count() 143 | #' 144 | #' 145 | #' files() |> filter( data_format == 'VCF' 146 | #' & experimental_strategy=='WXS' 147 | #' & type == 'simple_somatic_mutation') |> count() 148 | #' 149 | #' # Filters may be chained for the 150 | #' # equivalent query 151 | #' # 152 | #' # When chained, filters are combined with logical AND 153 | #' 154 | #' files() |> 155 | #' filter(~ data_format == 'VCF') |> 156 | #' filter(~ experimental_strategy == 'WXS') |> 157 | #' filter(~ type == 'simple_somatic_mutation') |> 158 | #' count() 159 | #' 160 | #' # OR 161 | #' 162 | #' files() |> 163 | #' filter( data_format == 'VCF') |> 164 | #' filter( experimental_strategy == 'WXS') |> 165 | #' filter( type == 'simple_somatic_mutation') |> 166 | #' count() 167 | #' 168 | #' # Use str() to get a cleaner picture 169 | #' str(get_filter(fQuery)) 170 | NULL 171 | 172 | #' The \code{filter} is simply a safe accessor for 173 | #' the filter element in \code{\link{GDCQuery}} objects. 
#'
#' @param x the object on which to set the filter list
#' member
#' @param expr a filter expression in the form of
#' the right hand side of a formula, where bare names
#' (without quotes) are allowed if they are available
#' fields associated with the GDCQuery object, \code{x}
#'
#' @rdname filtering
#'
#' @export
filter <- function(x, expr) {
    UseMethod("filter", x)
}

#' @rdname filtering
#'
#' @importFrom rlang enquo is_formula
#'
#' @export
filter.GDCQuery <- function(x, expr) {
    fields <- available_fields(x)

    # Capture the argument as a quosure BEFORE anything forces it, so the
    # unevaluated expression is always available.
    quo <- enquo(expr)

    # `expr` is either a formula (written inline as `~ ...`, or held in a
    # variable) or a bare filter expression. Evaluating the quosure tells
    # the two apart: a bare expression usually fails here because field
    # names are not R objects, but it may also happen to evaluate to some
    # non-formula value when a same-named object exists in the caller's
    # scope. Previously that last case left the filter as NULL and the
    # condition was silently dropped; now every non-formula outcome is
    # routed through the quosure.
    value <- try(rlang::eval_tidy(quo), silent = TRUE)
    if (!inherits(value, "try-error") && rlang::is_formula(value)) {
        filt <- make_filter(value, fields)
    } else {
        filt <- make_filter(quo, fields)
    }

    # Chained filters are combined with a logical AND.
    if (!is.null(x$filters)) {
        x$filters <- list(op = "and", content = list(x$filters, filt))
    } else {
        x$filters <- filt
    }
    x
}

#' The \code{get_filter} is simply a safe accessor for
#' the filter element in \code{\link{GDCQuery}} objects.
210 | #' 211 | #' @rdname filtering 212 | #' 213 | #' 214 | #' @export 215 | get_filter = function(x) { 216 | UseMethod('get_filter',x) 217 | } 218 | 219 | #' @rdname filtering 220 | #' 221 | #' @export 222 | get_filter.GDCQuery = function(x) { 223 | return(x$filters) 224 | } 225 | 226 | 227 | 228 | -------------------------------------------------------------------------------- /inst/script/README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | knit: (function(inputFile, encoding) { 4 | rmarkdown::render(inputFile, encoding = encoding, output_dir = "../../") }) 5 | --- 6 | 7 | ```{r, include = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | cache = TRUE, 12 | out.width = "100%" 13 | ) 14 | ``` 15 | 16 | ```{r,echo=FALSE,include=FALSE,eval=FALSE} 17 | rmarkdown::render("inst/script/README.Rmd", output_dir = ".") 18 | ``` 19 | 20 | 21 | # GenomicDataCommons 22 | 23 | 24 | [![R-CMD-check](https://github.com/Bioconductor/GenomicDataCommons/workflows/R-CMD-check/badge.svg)](https://github.com/Bioconductor/GenomicDataCommons/actions) 25 | 26 | 27 | # What is the GDC? 28 | 29 | From the [Genomic Data Commons (GDC) website](https://gdc.nci.nih.gov/about-gdc): 30 | 31 | The National Cancer Institute's (NCI's) Genomic Data Commons (GDC) is 32 | a data sharing platform that promotes precision medicine in 33 | oncology. It is not just a database or a tool; it is an expandable 34 | knowledge network supporting the import and standardization of genomic 35 | and clinical data from cancer research programs. 36 | 37 | The GDC contains NCI-generated data from some of the largest and most 38 | comprehensive cancer genomic datasets, including The Cancer Genome 39 | Atlas (TCGA) and Therapeutically Applicable Research to Generate 40 | Effective Therapies (TARGET). 
For the first time, these datasets have 41 | been harmonized using a common set of bioinformatics pipelines, so 42 | that the data can be directly compared. 43 | 44 | As a growing knowledge system for cancer, the GDC also enables 45 | researchers to submit data, and harmonizes these data for import into 46 | the GDC. As more researchers add clinical and genomic data to the GDC, 47 | it will become an even more powerful tool for making discoveries about 48 | the molecular basis of cancer that may lead to better care for 49 | patients. 50 | 51 | The 52 | [data model for the GDC is complex](https://gdc.cancer.gov/developers/gdc-data-model/gdc-data-model-components), 53 | but it is worth a quick overview. The data model is encoded as a 54 | so-called property graph. Nodes represent entities such as Projects, 55 | Cases, Diagnoses, Files (various kinds), and Annotations. The 56 | relationships between these entities are maintained as edges. Both 57 | nodes and edges may have Properties that supply instance details. The 58 | GDC API exposes these nodes and edges in a somewhat simplified set 59 | of 60 | [RESTful](https://en.wikipedia.org/wiki/Representational_state_transfer) 61 | endpoints. 62 | 63 | # Quickstart 64 | 65 | This software is available at Bioconductor.org and can be downloaded via 66 | `BiocManager::install`. 67 | 68 | To report bugs or problems, either 69 | [submit a new issue](https://github.com/Bioconductor/GenomicDataCommons/issues) 70 | or submit a `bug.report(package='GenomicDataCommons')` from within R (which 71 | will redirect you to the new issue on GitHub). 72 | 73 | ## Installation 74 | 75 | Installation can be achieved via Bioconductor's `BiocManager` package.
76 | 77 | ```{r,eval=FALSE} 78 | if (!require("BiocManager")) 79 | install.packages("BiocManager") 80 | 81 | BiocManager::install('GenomicDataCommons') 82 | ``` 83 | 84 | ```{r,include=TRUE,results="hide",message=FALSE,warning=FALSE} 85 | library(GenomicDataCommons) 86 | ``` 87 | 88 | ## Check basic functionality 89 | 90 | ```{r} 91 | status() 92 | ``` 93 | 94 | ## Find data 95 | 96 | The following code builds a `manifest` that can be used to guide the 97 | download of raw data. Here, filtering finds gene expression files 98 | quantified as raw counts using `STAR` from ovarian cancer patients. 99 | 100 | ```{r} 101 | ge_manifest <- files() |> 102 | filter( cases.project.project_id == 'TCGA-OV') |> 103 | filter( type == 'gene_expression' ) |> 104 | filter( analysis.workflow_type == 'STAR - Counts') |> 105 | manifest(size = 5) 106 | ge_manifest 107 | ``` 108 | 109 | ## Download data 110 | 111 | This code block downloads the `r nrow(ge_manifest)` gene expression files 112 | specified in the query above. Using multiple processes to do the download very 113 | significantly speeds up the transfer in many cases. The following completes in 114 | about 15 seconds. 115 | 116 | ```{r,eval=FALSE} 117 | library(BiocParallel) 118 | register(MulticoreParam()) 119 | destdir <- tempdir() 120 | fnames <- lapply(ge_manifest$id,gdcdata) 121 | ``` 122 | 123 | If the download had included controlled-access data, the download above would 124 | have needed to include a `token`. Details are available in 125 | [the authentication section below](#authentication). 126 | 127 | ## Metadata queries 128 | 129 | Here we use a couple of ad-hoc helper functions to handle the output of the 130 | query. See the `inst/script/README.Rmd` file for the source.
131 | 132 | ```{r,echo=FALSE} 133 | filterAllNA <- function(df) { 134 | notallna <- vapply(df, function(x) !all(is.na(x)), logical(1L)) 135 | df[, notallna] 136 | } 137 | 138 | bindrowname <- function(resultList) { 139 | if (is.data.frame(resultList)) 140 | stop("Only run this on the list type of outputs") 141 | datadf <- dplyr::bind_rows(resultList) 142 | rownames(datadf) <- names(resultList) 143 | filterAllNA(datadf) 144 | } 145 | ``` 146 | 147 | First, create a `data.frame` from the clinical data: 148 | 149 | ```{r} 150 | expands <- c("diagnoses","annotations", 151 | "demographic","exposures") 152 | clinResults <- cases() |> 153 | GenomicDataCommons::select(NULL) |> 154 | GenomicDataCommons::expand(expands) |> 155 | results(size=6) 156 | demoDF <- filterAllNA(clinResults$demographic) 157 | exposuresDF <- bindrowname(clinResults$exposures) 158 | ``` 159 | 160 | ```{r} 161 | demoDF[, 1:4] 162 | ``` 163 | 164 | ```{r} 165 | exposuresDF[, 1:4] 166 | ``` 167 | 168 | Note that the diagnoses data has multiple lines per patient: 169 | 170 | ```{r} 171 | diagDF <- bindrowname(clinResults$diagnoses) 172 | diagDF[, 1:4] 173 | ``` 174 | 175 | # Basic design 176 | 177 | This package design is meant to have some similarities to the "tidyverse" 178 | approach of dplyr. Roughly, the functionality for finding and accessing files 179 | and metadata can be divided into: 180 | 181 | 1. Simple query constructors based on GDC API endpoints. 182 | 2. A set of verbs that when applied, adjust filtering, field selection, and 183 | faceting (fields for aggregation) and result in a new query object (an 184 | endomorphism) 185 | 3. A set of verbs that take a query and return results from the GDC 186 | 187 | In addition, there are auxiliary functions for asking the GDC API for 188 | information about available and default fields, slicing BAM files, and 189 | downloading actual data files. Here is an overview of functionality[^1]. 
190 | 191 | 192 | - Creating a query 193 | - `projects()` 194 | - `cases()` 195 | - `files()` 196 | - `annotations()` 197 | - Manipulating a query 198 | - `filter()` 199 | - `facet()` 200 | - `select()` 201 | - Introspection on the GDC API fields 202 | - `mapping()` 203 | - `available_fields()` 204 | - `default_fields()` 205 | - `grep_fields()` 206 | - `available_values()` 207 | - `available_expand()` 208 | - Executing an API call to retrieve query results 209 | - `results()` 210 | - `count()` 211 | - `response()` 212 | - Raw data file downloads 213 | - `gdcdata()` 214 | - `transfer()` 215 | - `gdc_client()` 216 | - Summarizing and aggregating field values (faceting) 217 | - `aggregations()` 218 | - Authentication 219 | - `gdc_token()` 220 | - BAM file slicing 221 | - `slicing()` 222 | 223 | [^1]: See individual function and methods documentation for specific details. -------------------------------------------------------------------------------- /R/response.R: -------------------------------------------------------------------------------- 1 | #' Fetch \code{\link{GDCQuery}} metadata from GDC 2 | #' 3 | #' @aliases GDCResponse 4 | #' 5 | #' @param x a \code{\link{GDCQuery}} object 6 | #' @param from integer index from which to start returning data 7 | #' @param size number of records to return 8 | #' @param ... passed to httr (good for passing config info, etc.) 9 | #' @param response_handler a function that processes JSON (as text) 10 | #' and returns an R object. Default is \code{\link[jsonlite]{fromJSON}}. 
11 | #' 12 | #' @rdname response 13 | #' 14 | #' @return A \code{GDCResponse} object which is a list with the following 15 | #' members: 16 | #' \itemize{ 17 | #' \item{results} 18 | #' \item{query} 19 | #' \item{aggregations} 20 | #' \item{pages} 21 | #' } 22 | #' 23 | #' 24 | #' @examples 25 | #' 26 | #' # basic class stuff 27 | #' gCases = cases() 28 | #' resp = response(gCases) 29 | #' class(resp) 30 | #' names(resp) 31 | #' 32 | #' # And results from query 33 | #' resp$results[[1]] 34 | #' 35 | #' @export 36 | response = function(x,...) { 37 | UseMethod('response',x) 38 | } 39 | 40 | #' provide count of records in a \code{\link{GDCQuery}} 41 | #' 42 | #' @param x a \code{\link{GDCQuery}} object 43 | #' @param ... passed to httr (good for passing config info, etc.) 44 | #' 45 | #' @return integer(1) representing the count of records that will 46 | #' be returned by the current query 47 | #' 48 | #' @examples 49 | #' # total number of projects 50 | #' projects() |> count() 51 | #' 52 | #' # total number of cases 53 | #' cases() |> count() 54 | #' 55 | #' @export 56 | count = function(x,...) { 57 | UseMethod('count',x) 58 | } 59 | 60 | #' @describeIn count 61 | #' 62 | #' @export 63 | count.GDCQuery = function(x,...) { 64 | resp = x |> response(size=1) 65 | return(resp$pages$total) 66 | } 67 | 68 | #' @describeIn count 69 | #' 70 | #' @export 71 | count.GDCResponse = function(x,...) 
{ 72 | x$pages$total 73 | } 74 | 75 | 76 | #" (internal) prepare "results" for return 77 | #" 78 | #" In particular, this function sets 79 | #" entity_ids for every element so that 80 | #" one does not loose track of the relationships 81 | #" given the nested nature of GDC returns 82 | .prepareResults <- function(res,idfield) { 83 | for(i in names(res)) { 84 | if(inherits(res[[i]],'data.frame')) 85 | rownames(res[[i]]) = res[[idfield]] 86 | else 87 | names(res[[i]]) = res[[idfield]]} 88 | return(res) 89 | } 90 | 91 | #' @rdname response 92 | #' 93 | #' @importFrom jsonlite fromJSON 94 | #' 95 | #' @export 96 | response.GDCQuery = function(x, from = 0, size = 10, ..., 97 | response_handler = jsonlite::fromJSON) { 98 | body = Filter(function(z) !is.null(z),x) 99 | body[['facets']]=paste0(body[['facets']],collapse=",") 100 | body[['fields']]=paste0(body[['fields']],collapse=",") 101 | body[['expand']]=paste0(body[['expand']],collapse=",") 102 | body[['from']]=from 103 | body[['size']]=size 104 | body[['format']]='JSON' 105 | body[['pretty']]='FALSE' 106 | tmp = response_handler(httr::content( 107 | .gdc_post(entity_name(x),body=body, token=NULL,...), 108 | as="text", encoding = "UTF-8")) 109 | res = tmp$data$hits 110 | idfield = paste0(sub('s$','',entity_name(x)),'_id') 111 | ## the following code just sets names on the 112 | structure( 113 | list(results = .prepareResults(res,idfield), 114 | query = x, 115 | pages = tmp$data$pagination, 116 | aggregations = lapply(tmp$data$aggregations,function(x) {x$buckets})), 117 | class = c(paste0('GDC',entity_name(x),'Response'),'GDCResponse','list') 118 | ) 119 | } 120 | 121 | #' @rdname response 122 | #' 123 | #' @export 124 | response_all = function(x,...) 
{ 125 | count = count(x) 126 | return(response(x=x,size=count,from=0,...)) 127 | } 128 | 129 | 130 | #' aggregations 131 | #' 132 | #' @param x a \code{\link{GDCQuery}} object 133 | #' 134 | #' @return a \code{list} of \code{data.frame} with one 135 | #' member for each requested facet. The data frames 136 | #' each have two columns, key and doc_count. 137 | #' 138 | #' @examples 139 | #' # Number of each file type 140 | #' res = files() |> facet(c('type','data_type')) |> aggregations() 141 | #' res$type 142 | #' 143 | #' @export 144 | aggregations = function(x) { 145 | UseMethod('aggregations',x) 146 | } 147 | 148 | 149 | #' @describeIn aggregations 150 | #' 151 | #' 152 | #' @export 153 | aggregations.GDCQuery = function(x) { 154 | if(is.null(x$facets)) 155 | x = x |> facet() 156 | return(response(x)$aggregations) 157 | } 158 | 159 | #' @describeIn aggregations 160 | #' 161 | #' 162 | #' @export 163 | aggregations.GDCResponse = function(x) { 164 | x$aggregations 165 | } 166 | 167 | 168 | #' results 169 | #' 170 | #' @param x a \code{\link{GDCQuery}} object 171 | #' @param ... passed on to \code{\link{response}} 172 | #' 173 | #' @return A (typically nested) \code{list} of GDC records 174 | #' 175 | #' @examples 176 | #' qcases = cases() |> results() 177 | #' length(qcases) 178 | #' 179 | #' @export 180 | results = function(x,...) { 181 | UseMethod('results',x) 182 | } 183 | 184 | #' results_all 185 | #' 186 | #' @param x a \code{\link{GDCQuery}} object 187 | #' 188 | #' @return A (typically nested) \code{list} of GDC records 189 | #' 190 | #' @examples 191 | #' # details of all available projects 192 | #' projResults = projects() |> results_all() 193 | #' length(projResults) 194 | #' count(projects()) 195 | #' 196 | #' 197 | #' @export 198 | results_all = function(x) { 199 | UseMethod('results_all',x) 200 | } 201 | 202 | 203 | #' @describeIn results 204 | #' 205 | #' 206 | #' @export 207 | results.GDCQuery = function(x,...) 
{ 208 | results(response(x,...)) 209 | } 210 | 211 | #' @describeIn results_all request a response covering every record of the query and return its results 212 | #' 213 | #' 214 | #' @export 215 | results_all.GDCQuery = function(x) { 216 | results(response_all(x)) 217 | } 218 | 219 | #' @describeIn results return the results stored in an existing response 220 | #' 221 | #' 222 | #' @export 223 | results.GDCResponse = function(x,...) { 224 | structure( 225 | x$results, 226 | class=c(sub('Response','Results',class(x))) 227 | ) 228 | } 229 | 230 | #' @describeIn results_all return the results stored in an existing response 231 | #' 232 | #' 233 | #' @export 234 | results_all.GDCResponse = function(x) { 235 | structure( 236 | x$results, 237 | class=c(sub('Response','Results',class(x))) 238 | ) 239 | } 240 | 241 | 242 | 243 | 244 | #' @importFrom xml2 xml_find_all 245 | .response_warnings <- function(warnings, endpoint) 246 | { 247 | warnings <- vapply(warnings, as.character, character(1)) 248 | if (length(warnings) && any(nzchar(warnings))) # any(): nzchar() is vectorised, '&&' needs a single TRUE/FALSE (error in R >= 4.3 otherwise) 249 | warning("'", endpoint, "' query warnings:\n", .wrapstr(warnings)) 250 | NULL 251 | } 252 | 253 | .response_json_as_list <- function(json, endpoint) 254 | { 255 | type <- substr(endpoint, 1, nchar(endpoint) - 1L) # endpoint name without its last character 256 | type_id <- sprintf("%s_id", type) 257 | type_list <- sprintf("%ss_list", type) 258 | 259 | hits <- json[["data"]][["hits"]] 260 | names(hits) <- vapply(hits, "[[", character(1), type_id) 261 | hits <- lapply(hits, "[[<-", type_id, NULL) # the id now lives in names(hits); drop it from each hit 262 | hits <- lapply(hits, lapply, unlist) # collapse field elt 'list' 263 | class(hits) <- c(type_list, "gdc_list", "list") 264 | hits 265 | } 266 | 267 | #' @importFrom stats setNames 268 | #' @importFrom xml2 xml_find_all xml_text 269 | .response_xml_as_data_frame <- function(xml, fields) 270 | { 271 | xpaths <- setNames(sprintf("/response/data/hits/item/%s", fields), fields) 272 | 273 | columns <- lapply(xpaths, function(xpath, xml) { 274 | nodes <- xml_find_all(xml, xpath) 275 | vapply(nodes, xml_text, character(1)) 276 | }, xml=xml) 277 | columns <- Filter(length, columns) 278 | 279 | dropped <- fields[!fields %in% names(columns)] 280 | if 
(length(dropped)) 281 | warning("fields not available:\n", .wrapstr(dropped)) 282 | if (length(columns)==0) { 283 | warning("No records found. Check on filter criteria to ensure they do what you expect. ") 284 | return(NULL) 285 | } 286 | if (length(unique(lengths(columns))) != 1L) { # columns is non-empty here, so more than one distinct length means ragged fields 287 | lens <- paste(sprintf("%s = %d", names(columns), lengths(columns)), 288 | collapse=", ") 289 | stop("fields are different lengths:\n", .wrapstr(lens)) 290 | } 291 | 292 | as.data.frame(columns, stringsAsFactors=FALSE) 293 | } 294 | 295 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # GenomicDataCommons 3 | 4 | 5 | 6 | [![R-CMD-check](https://github.com/Bioconductor/GenomicDataCommons/workflows/R-CMD-check/badge.svg)](https://github.com/Bioconductor/GenomicDataCommons/actions) 7 | 8 | 9 | # What is the GDC? 10 | 11 | From the [Genomic Data Commons (GDC) 12 | website](https://gdc.nci.nih.gov/about-gdc): 13 | 14 | The National Cancer Institute’s (NCI’s) Genomic Data Commons (GDC) is a 15 | data sharing platform that promotes precision medicine in oncology. It 16 | is not just a database or a tool; it is an expandable knowledge network 17 | supporting the import and standardization of genomic and clinical data 18 | from cancer research programs. 19 | 20 | The GDC contains NCI-generated data from some of the largest and most 21 | comprehensive cancer genomic datasets, including The Cancer Genome Atlas 22 | (TCGA) and Therapeutically Applicable Research to Generate Effective 23 | Therapies (TARGET). For the first time, these datasets have been 24 | harmonized using a common set of bioinformatics pipelines, so that the 25 | data can be directly compared. 26 | 27 | As a growing knowledge system for cancer, the GDC also enables 28 | researchers to submit data, and harmonizes these data for import into 29 | the GDC. 
As more researchers add clinical and genomic data to the GDC, 30 | it will become an even more powerful tool for making discoveries about 31 | the molecular basis of cancer that may lead to better care for patients. 32 | 33 | The [data model for the GDC is 34 | complex](https://gdc.cancer.gov/developers/gdc-data-model/gdc-data-model-components), 35 | but it is worth a quick overview. The data model is encoded as a so-called 36 | property graph. Nodes represent entities such as Projects, Cases, 37 | Diagnoses, Files (various kinds), and Annotations. The relationships 38 | between these entities are maintained as edges. Both nodes and edges may 39 | have Properties that supply instance details. The GDC API exposes these 40 | nodes and edges in a somewhat simplified set of 41 | [RESTful](https://en.wikipedia.org/wiki/Representational_state_transfer) 42 | endpoints. 43 | 44 | # Quickstart 45 | 46 | This software is available at Bioconductor.org and can be downloaded via 47 | `BiocManager::install`. 48 | 49 | To report bugs or problems, either [submit a new 50 | issue](https://github.com/Bioconductor/GenomicDataCommons/issues) or 51 | submit a `bug.report(package='GenomicDataCommons')` from within R (which 52 | will redirect you to the new issue on GitHub). 53 | 54 | ## Installation 55 | 56 | Installation can be achieved via Bioconductor’s `BiocManager` package. 
57 | 58 | ``` r 59 | if (!require("BiocManager")) 60 | install.packages("BiocManager") 61 | 62 | BiocManager::install('GenomicDataCommons') 63 | ``` 64 | 65 | ``` r 66 | library(GenomicDataCommons) 67 | ``` 68 | 69 | ## Check basic functionality 70 | 71 | ``` r 72 | status() 73 | #> $commit 74 | #> [1] "4dd3680528a19ed33cfc83c7d049426c97bb903b" 75 | #> 76 | #> $data_release 77 | #> [1] "Data Release 34.0 - July 27, 2022" 78 | #> 79 | #> $status 80 | #> [1] "OK" 81 | #> 82 | #> $tag 83 | #> [1] "3.0.0" 84 | #> 85 | #> $version 86 | #> [1] 1 87 | ``` 88 | 89 | ## Find data 90 | 91 | The following code builds a `manifest` that can be used to guide the 92 | download of raw data. Here, filtering finds gene expression files 93 | quantified as raw counts using `STAR` from ovarian cancer patients. 94 | 95 | ``` r 96 | ge_manifest <- files() |> 97 | filter( cases.project.project_id == 'TCGA-OV') |> 98 | filter( type == 'gene_expression' ) |> 99 | filter( analysis.workflow_type == 'STAR - Counts') |> 100 | manifest(size = 5) 101 | ge_manifest 102 | #> id data_format access file_name 103 | #> 1 7c69529f-2273-4dc4-b213-e84924d78bea TSV open d6472bd0-b4e2-4ed1-a892-e1702c195dc7.rna_seq.augmented_star_gene_counts.tsv 104 | #> 2 0eff4634-f8c4-4db9-8a7c-331b21689bae TSV open 42165baf-b32c-4fc4-8b04-29c5b4e76de0.rna_seq.augmented_star_gene_counts.tsv 105 | #> 3 7d74b4c5-6391-4b3e-95a3-020ea0869e86 TSV controlled accf08d4-a784-4908-831a-7a08d4c5f0f5.rna_seq.star_splice_junctions.tsv.gz 106 | #> 4 dc2aeea4-3cd0-4623-92f4-bbbc962851cc TSV controlled 8ab508b9-2993-4e66-b8f9-81e32e936d4a.rna_seq.star_splice_junctions.tsv.gz 107 | #> 5 0cf852be-d2e3-4fde-bba8-c93efae2961a TSV open 93831282-1dd1-49a3-acd7-dae2a49ca62e.rna_seq.augmented_star_gene_counts.tsv 108 | #> submitter_id data_category acl type file_size created_datetime md5sum 109 | #> 1 7085a70b-2f63-4402-9e53-70f091f26fcb Transcriptome Profiling open gene_expression 4254435 2021-12-13T20:53:42.329364-06:00 
19d5596bba8949f4c138793608497d56 110 | #> 2 f0d44930-b1ad-447a-86b9-27d0285954b9 Transcriptome Profiling open gene_expression 4257461 2021-12-13T20:47:24.326497-06:00 d89d71b7c028c1643d7a3ee7857d8e01 111 | #> 3 e6473134-6d65-414c-9f52-2c25057fac7d Transcriptome Profiling phs000178 gene_expression 3109435 2021-12-13T21:03:56.008440-06:00 fb8332d6413c44a9de02a1cbe6b018aa 112 | #> 4 f99b93a9-70cb-44f8-bd1f-4edeee4425a4 Transcriptome Profiling phs000178 gene_expression 4607701 2021-12-13T21:02:23.944851-06:00 26231bed1ef67c093d3ce2b39def81cd 113 | #> 5 fb4d7abe-b61a-4f35-9700-605f1bc1512f Transcriptome Profiling open gene_expression 4265694 2021-12-13T20:50:55.234254-06:00 050763aabd36509f954137fbdc4eeb00 114 | #> updated_datetime file_id data_type state experimental_strategy 115 | #> 1 2022-01-19T14:47:28.965154-06:00 7c69529f-2273-4dc4-b213-e84924d78bea Gene Expression Quantification released RNA-Seq 116 | #> 2 2022-01-19T14:47:07.478144-06:00 0eff4634-f8c4-4db9-8a7c-331b21689bae Gene Expression Quantification released RNA-Seq 117 | #> 3 2022-01-19T14:01:15.621847-06:00 7d74b4c5-6391-4b3e-95a3-020ea0869e86 Splice Junction Quantification released RNA-Seq 118 | #> 4 2022-01-19T14:01:15.621847-06:00 dc2aeea4-3cd0-4623-92f4-bbbc962851cc Splice Junction Quantification released RNA-Seq 119 | #> 5 2022-01-19T14:47:07.036781-06:00 0cf852be-d2e3-4fde-bba8-c93efae2961a Gene Expression Quantification released RNA-Seq 120 | ``` 121 | 122 | ## Download data 123 | 124 | This code block downloads the 5 gene expression files specified in the 125 | query above. Using multiple processes to do the download very 126 | significantly speeds up the transfer in many cases. The following 127 | completes in about 15 seconds. 
128 | 129 | ``` r 130 | library(BiocParallel) 131 | register(MulticoreParam()) 132 | destdir <- tempdir() 133 | fnames <- lapply(ge_manifest$id,gdcdata) 134 | ``` 135 | 136 | If the download had included controlled-access data, the download above 137 | would have needed to include a `token`. Details are available in [the 138 | authentication section below](#authentication). 139 | 140 | ## Metadata queries 141 | 142 | Here we use a couple of ad-hoc helper functions to handle the output of 143 | the query. See the `inst/script/README.Rmd` folder for the source. 144 | 145 | First, create a `data.frame` from the clinical data: 146 | 147 | ``` r 148 | expands <- c("diagnoses","annotations", 149 | "demographic","exposures") 150 | clinResults <- cases() |> 151 | GenomicDataCommons::select(NULL) |> 152 | GenomicDataCommons::expand(expands) |> 153 | results(size=6) 154 | demoDF <- filterAllNA(clinResults$demographic) 155 | exposuresDF <- bindrowname(clinResults$exposures) 156 | ``` 157 | 158 | ``` r 159 | demoDF[, 1:4] 160 | #> cause_of_death race gender ethnicity 161 | #> 2525bfef-6962-4b7f-8e80-6186400ce624 not reported female not reported 162 | #> 126507c3-c0d7-41fb-9093-7deed5baf431 Cancer Related not reported female not reported 163 | #> c43ac461-9f03-44bc-be7d-3d867eb708a0 not reported female not reported 164 | #> a59a90d9-f1b0-49dd-9c97-bcaa6ba55d44 Cancer Related not reported male not reported 165 | #> 59122a43-606a-4669-806b-6747e0ac9985 white male not hispanic or latino 166 | #> 4447a969-e5c8-4291-b83c-53a0f7e77cbc Cancer Related white female not hispanic or latino 167 | ``` 168 | 169 | ``` r 170 | exposuresDF[, 1:4] 171 | #> submitter_id created_datetime alcohol_intensity pack_years_smoked 172 | #> 2525bfef-6962-4b7f-8e80-6186400ce624 C3N-03839-EXP 2019-12-30T10:23:07.190853-06:00 Lifelong Non-Drinker NA 173 | #> 126507c3-c0d7-41fb-9093-7deed5baf431 C3N-01518-EXP 2018-06-21T14:27:48.817254-05:00 Lifelong Non-Drinker NA 174 | #> 
c43ac461-9f03-44bc-be7d-3d867eb708a0 C3N-03933-EXP 2019-03-14T08:23:14.054975-05:00 Lifelong Non-Drinker NA 175 | #> a59a90d9-f1b0-49dd-9c97-bcaa6ba55d44 C3N-02695-EXP 2019-03-14T08:23:14.054975-05:00 Occasional Drinker 16.8 176 | #> 59122a43-606a-4669-806b-6747e0ac9985 C3L-03642-EXP 2019-06-24T07:53:15.534197-05:00 Lifelong Non-Drinker 39.0 177 | #> 4447a969-e5c8-4291-b83c-53a0f7e77cbc C3L-03728-EXP 2019-06-24T07:53:15.534197-05:00 Lifelong Non-Drinker NA 178 | ``` 179 | 180 | Note that the diagnoses data has multiple lines per patient: 181 | 182 | ``` r 183 | diagDF <- bindrowname(clinResults$diagnoses) 184 | diagDF[, 1:4] 185 | #> ajcc_pathologic_stage created_datetime tissue_or_organ_of_origin age_at_diagnosis 186 | #> 2525bfef-6962-4b7f-8e80-6186400ce624 Stage IIB 2019-07-22T06:40:02.183501-05:00 Head of pancreas 19956 187 | #> 126507c3-c0d7-41fb-9093-7deed5baf431 Not Reported 2018-12-03T12:05:16.846188-06:00 Temporal lobe 26312 188 | #> c43ac461-9f03-44bc-be7d-3d867eb708a0 Stage III 2019-03-14T10:37:34.405260-05:00 Floor of mouth, NOS 25635 189 | #> a59a90d9-f1b0-49dd-9c97-bcaa6ba55d44 Not Reported 2019-03-14T10:37:34.405260-05:00 Floor of mouth, NOS 16652 190 | #> 59122a43-606a-4669-806b-6747e0ac9985 Not Reported 2019-07-22T06:40:02.183501-05:00 Upper lobe, lung 23384 191 | #> 4447a969-e5c8-4291-b83c-53a0f7e77cbc Not Reported 2019-05-07T07:41:33.411909-05:00 Frontal lobe 29326 192 | ``` 193 | 194 | # Basic design 195 | 196 | This package design is meant to have some similarities to the 197 | “tidyverse” approach of dplyr. Roughly, the functionality for finding 198 | and accessing files and metadata can be divided into: 199 | 200 | 1. Simple query constructors based on GDC API endpoints. 201 | 2. A set of verbs that when applied, adjust filtering, field selection, 202 | and faceting (fields for aggregation) and result in a new query 203 | object (an endomorphism) 204 | 3. 
A set of verbs that take a query and return results from the GDC 205 | 206 | In addition, there are auxiliary functions for asking the GDC API for 207 | information about available and default fields, slicing BAM files, and 208 | downloading actual data files. Here is an overview of functionality[^1]. 209 | 210 | - Creating a query 211 | - `projects()` 212 | - `cases()` 213 | - `files()` 214 | - `annotations()` 215 | - Manipulating a query 216 | - `filter()` 217 | - `facet()` 218 | - `select()` 219 | - Introspection on the GDC API fields 220 | - `mapping()` 221 | - `available_fields()` 222 | - `default_fields()` 223 | - `grep_fields()` 224 | - `available_values()` 225 | - `available_expand()` 226 | - Executing an API call to retrieve query results 227 | - `results()` 228 | - `count()` 229 | - `response()` 230 | - Raw data file downloads 231 | - `gdcdata()` 232 | - `transfer()` 233 | - `gdc_client()` 234 | - Summarizing and aggregating field values (faceting) 235 | - `aggregations()` 236 | - Authentication 237 | - `gdc_token()` 238 | - BAM file slicing 239 | - `slicing()` 240 | 241 | [^1]: See individual function and methods documentation for specific 242 | details. 243 | -------------------------------------------------------------------------------- /vignettes/overview.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "The GenomicDataCommons Package" 3 | author: "Sean Davis & Martin Morgan" 4 | date: "`r format(Sys.Date(), '%A, %B %d, %Y')`" 5 | always_allow_html: yes 6 | output: 7 | BiocStyle::html_document: 8 | df_print: paged 9 | toc_float: true 10 | abstract: > 11 | The National Cancer Institute (NCI) has established 12 | the [Genomic Data Commons](https://gdc.nci.nih.gov/) (GDC). 
The GDC 13 | provides the cancer research community with an open and unified 14 | repository for sharing and accessing data across numerous cancer 15 | studies and projects via a high-performance data transfer and query 16 | infrastructure. The *GenomicDataCommons* Bioconductor package 17 | provides basic infrastructure for querying, accessing, and mining 18 | genomic datasets available from the GDC. We expect that the 19 | Bioconductor developer and the larger bioinformatics communities will 20 | build on the *GenomicDataCommons* package to add higher-level 21 | functionality and expose cancer genomics data to the plethora of 22 | state-of-the-art bioinformatics methods available in Bioconductor. 23 | 24 | vignette: > 25 | %\VignetteIndexEntry{Introduction to Accessing the NCI Genomic Data Commons} 26 | %\VignetteEngine{knitr::rmarkdown} 27 | %\VignetteEncoding{UTF-8} 28 | --- 29 | 30 | ```{r init, results='hide', echo=FALSE, warning=FALSE, message=FALSE} 31 | library(knitr) 32 | opts_chunk$set(warning=FALSE, message=FALSE) 33 | BiocStyle::markdown() 34 | ``` 35 | 36 | 37 | # What is the GDC? 38 | 39 | From the [Genomic Data Commons (GDC) website](https://gdc.cancer.gov/about-gdc): 40 | 41 | > The National Cancer Institute's (NCI's) Genomic Data Commons (GDC) is 42 | a data sharing platform that promotes precision medicine in 43 | oncology. It is not just a database or a tool; it is an expandable 44 | knowledge network supporting the import and standardization of genomic 45 | and clinical data from cancer research programs. 46 | > The GDC contains NCI-generated data from some of the largest and most 47 | comprehensive cancer genomic datasets, including The Cancer Genome 48 | Atlas (TCGA) and Therapeutically Applicable Research to Generate 49 | Effective Therapies (TARGET). For the first time, these datasets have 50 | been harmonized using a common set of bioinformatics pipelines, so 51 | that the data can be directly compared. 
52 | > As a growing knowledge system for cancer, the GDC also enables 53 | researchers to submit data, and harmonizes these data for import into 54 | the GDC. As more researchers add clinical and genomic data to the GDC, 55 | it will become an even more powerful tool for making discoveries about 56 | the molecular basis of cancer that may lead to better care for 57 | patients. 58 | 59 | The 60 | [data model for the GDC is complex](https://gdc.cancer.gov/developers/gdc-data-model/gdc-data-model-components), 61 | but it is worth a quick overview, and a graphical representation is included here. 62 | 63 | ![The data model is encoded as a 64 | so-called property graph. Nodes represent entities such as Projects, 65 | Cases, Diagnoses, Files (various kinds), and Annotations. The 66 | relationships between these entities are maintained as edges. Both 67 | nodes and edges may have Properties that supply instance details. ](all_nodes_040318.png) 68 | 69 | The 70 | GDC API exposes these nodes and edges in a somewhat simplified set 71 | of 72 | [RESTful](https://en.wikipedia.org/wiki/Representational_state_transfer) endpoints. 73 | 74 | # Quickstart 75 | 76 | This quickstart section is just meant to show basic 77 | functionality. More details of functionality are included further on 78 | in this vignette and in function-specific help. 79 | 80 | This software is available at Bioconductor.org and can be downloaded via 81 | `BiocManager::install`. 82 | 83 | To report bugs or problems, either 84 | [submit a new issue](https://github.com/Bioconductor/GenomicDataCommons/issues) 85 | or submit a `bug.report(package='GenomicDataCommons')` from within R (which 86 | will redirect you to the new issue on GitHub). 87 | 88 | ## Installation 89 | 90 | Installation can be achieved via Bioconductor's `BiocManager` package. 
91 | 92 | ```{r install_bioc, eval=FALSE} 93 | if (!require("BiocManager")) 94 | install.packages("BiocManager") 95 | BiocManager::install('GenomicDataCommons') 96 | ``` 97 | 98 | ```{r libraries, message=FALSE} 99 | library(GenomicDataCommons) 100 | ``` 101 | 102 | ## Check connectivity and status 103 | 104 | The `r Biocpkg("GenomicDataCommons")` package relies on having network 105 | connectivity. In addition, the NCI GDC API must also be operational 106 | and not under maintenance. Checking `status` can be used to check this 107 | connectivity and functionality. 108 | 109 | ```{r statusQS} 110 | GenomicDataCommons::status() 111 | ``` 112 | 113 | And to check the status in code: 114 | 115 | ```{r statusCheck} 116 | stopifnot(GenomicDataCommons::status()$status=="OK") 117 | ``` 118 | 119 | 120 | ## Find data 121 | 122 | The following code builds a `manifest` that can be used to guide the 123 | download of raw data. Here, filtering finds gene expression files 124 | quantified as raw counts using `STAR` from ovarian cancer patients. 125 | 126 | ```{r manifest} 127 | ge_manifest <- files() |> 128 | filter( cases.project.project_id == 'TCGA-OV') |> 129 | filter( type == 'gene_expression' ) |> 130 | filter( analysis.workflow_type == 'STAR - Counts') |> 131 | manifest() 132 | head(ge_manifest) 133 | ``` 134 | 135 | ## Download data 136 | 137 | The code below downloads the first 20 of the `r nrow(ge_manifest)` gene expression files 138 | specified in the query above. Using multiple processes to do the download very 139 | significantly speeds up the transfer in many cases. On a standard 1Gb 140 | connection, the following completes in about 30 seconds. The first time the 141 | data are downloaded, R will ask to create a cache directory (see `?gdc_cache` 142 | for details of setting and interacting with the cache). Resulting 143 | downloaded files will be stored in the cache directory. Future access to 144 | the same files will be directly from the cache, alleviating multiple downloads. 
145 | 146 | ```{r downloadQS, eval=FALSE} 147 | fnames <- lapply(ge_manifest$id[1:20], gdcdata) 148 | ``` 149 | 150 | If the download had included controlled-access data, the download above would 151 | have needed to include a `token`. Details are available in 152 | [the authentication section below](#authentication). 153 | 154 | ## Metadata queries 155 | 156 | ### Clinical data 157 | 158 | Accessing clinical data is a very common task. Given a set of `case_ids`, 159 | the `gdc_clinical()` function will return a list of four `tibble`s. 160 | 161 | - demographic 162 | - diagnoses 163 | - exposures 164 | - main 165 | 166 | ```{r gdc_clinical} 167 | case_ids = cases() |> results(size=10) |> ids() 168 | clindat = gdc_clinical(case_ids) 169 | names(clindat) 170 | ``` 171 | 172 | ```{r clinData} 173 | head(clindat[["main"]]) 174 | head(clindat[["diagnoses"]]) 175 | ``` 176 | 177 | ### General metadata queries 178 | 179 | The `r Biocpkg("GenomicDataCommons")` package can access the significant 180 | clinical, demographic, biospecimen, and annotation information 181 | contained in the NCI GDC. The `gdc_clinical()` function will often 182 | be all that is needed, but the API and `r Biocpkg("GenomicDataCommons")` package 183 | offer considerable flexibility if fine-tuning is required. 184 | 185 | ```{r metadataQS} 186 | expands = c("diagnoses","annotations", 187 | "demographic","exposures") 188 | clinResults = cases() |> 189 | GenomicDataCommons::select(NULL) |> 190 | GenomicDataCommons::expand(expands) |> 191 | results(size=50) 192 | str(clinResults[[1]],list.len=6) 193 | # or listviewer::jsonedit(clinResults) 194 | ``` 195 | 196 | # Basic design 197 | 198 | This package design is meant to have some similarities to the "tidyverse" 199 | approach of dplyr. Roughly, the functionality for finding and accessing files 200 | and metadata can be divided into: 201 | 202 | 1. Simple query constructors based on GDC API endpoints. 203 | 2. 
A set of verbs that when applied, adjust filtering, field selection, and 204 | faceting (fields for aggregation) and result in a new query object (an 205 | endomorphism) 206 | 3. A set of verbs that take a query and return results from the GDC 207 | 208 | In addition, there are auxiliary functions for asking the GDC API for 209 | information about available and default fields, slicing BAM files, and 210 | downloading actual data files. Here is an overview of functionality[^1]. 211 | 212 | 213 | - Creating a query 214 | - `projects()` 215 | - `cases()` 216 | - `files()` 217 | - `annotations()` 218 | - Manipulating a query 219 | - `filter()` 220 | - `facet()` 221 | - `select()` 222 | - Introspection on the GDC API fields 223 | - `mapping()` 224 | - `available_fields()` 225 | - `default_fields()` 226 | - `grep_fields()` 227 | - `available_values()` 228 | - `available_expand()` 229 | - Executing an API call to retrieve query results 230 | - `results()` 231 | - `count()` 232 | - `response()` 233 | - Raw data file downloads 234 | - `gdcdata()` 235 | - `transfer()` 236 | - `gdc_client()` 237 | - Summarizing and aggregating field values (faceting) 238 | - `aggregations()` 239 | - Authentication 240 | - `gdc_token()` 241 | - BAM file slicing 242 | - `slicing()` 243 | 244 | [^1]: See individual function and methods documentation for specific details. 245 | 246 | 247 | # Usage 248 | 249 | There are two main classes of operations when working with the NCI GDC. 250 | 251 | 1. [Querying metadata and finding data files](#querying-metadata) (e.g., finding 252 | all gene expression quantifications data files for all colon cancer patients). 253 | 2. [Transferring raw or processed data](#datafile-access-and-download) from the 254 | GDC to another computer (e.g., downloading raw or processed data) 255 | 256 | Both classes of operation are reviewed in detail in the following sections. 
257 | 258 | ## Querying metadata 259 | 260 | Vast amounts of metadata about cases (patients, basically), files, projects, and 261 | so-called annotations are available via the NCI GDC API. Typically, one will 262 | want to query metadata to either focus in on a set of files for download or 263 | transfer *or* to perform so-called aggregations (pivot-tables, facets, similar 264 | to the R `table()` functionality). 265 | 266 | Querying metadata starts with [creating a "blank" query](#creating-a-query). One 267 | will often then want to [`filter`](#filtering) the query to limit results prior 268 | to [retrieving results](#retrieving-results). The GenomicDataCommons package has 269 | [helper functions for listing fields](#fields-and-values) that are available for 270 | filtering. 271 | 272 | In addition to fetching results, the GDC API allows 273 | [faceting, or aggregating,](#facets-and-aggregation), useful for compiling 274 | reports, generating dashboards, or building user interfaces to GDC data (see GDC 275 | web query interface for a non-R-based example). 276 | 277 | ### Creating a query 278 | 279 | A query of the GDC starts its life in R. Queries follow the four metadata 280 | endpoints available at the GDC. In particular, there are four convenience 281 | functions that each create `GDCQuery` objects (actually, specific subclasses of 282 | `GDCQuery`): 283 | 284 | - `projects()` 285 | - `cases()` 286 | - `files()` 287 | - `annotations()` 288 | 289 | ```{r projectquery} 290 | pquery = projects() 291 | ``` 292 | 293 | The `pquery` object is now an object of (S3) class, `GDCQuery` (and 294 | `gdc_projects` and `list`). The object contains the following elements: 295 | 296 | - fields: This is a character vector of the fields that will be returned when we 297 | [retrieve data](#retrieving-results). 
If no fields are specified to, for 298 | example, the `projects()` function, the default fields from the GDC are used 299 | (see `default_fields()`) 300 | - filters: This will contain results after calling the 301 | [`filter()` method](#filtering) and will be used to filter results on 302 | [retrieval](#retrieving-results). 303 | - facets: A character vector of field names that will be used for 304 | [aggregating data](#facets-and-aggregation) in a call to `aggregations()`. 305 | - token: A character(1) token from the GDC. See 306 | [the authentication section](#authentication) for details, but note that, in 307 | general, the token is not necessary for metadata query and retrieval, only for 308 | actual data download. 309 | 310 | Looking at the actual object (get used to using `str()`!), note that the query 311 | contains no results. 312 | 313 | ```{r pquery} 314 | str(pquery) 315 | ``` 316 | ### Retrieving results 317 | 318 | [[ GDC pagination documentation ]](https://docs.gdc.cancer.gov/API/Users_Guide/Search_and_Retrieval/#size-and-from) 319 | 320 | [[ GDC sorting documentation ]](https://docs.gdc.cancer.gov/API/Users_Guide/Search_and_Retrieval/#sort) 321 | 322 | With a query object available, the next step is to retrieve results from the 323 | GDC. The most basic type of results the GenomicDataCommons package can return 324 | is a simple `count()` of records available that satisfy the filter criteria. 325 | Note that we have not set any filters, so a `count()` here will represent all 326 | the project records publicly available at the GDC in the "default" archive. 327 | 328 | ```{r pquerycount} 329 | pcount = count(pquery) 330 | # or 331 | pcount = pquery |> count() 332 | pcount 333 | ``` 334 | 335 | The `results()` method will fetch actual results. 
336 | 337 | ```{r pqueryresults} 338 | presults = pquery |> results() 339 | ``` 340 | These results are 341 | returned from the GDC in [JSON](http://www.json.org/) format and 342 | converted into a (potentially nested) list in R. The `str()` method is useful 343 | for taking a quick glimpse of the data. 344 | 345 | ```{r presultsstr} 346 | str(presults) 347 | ``` 348 | 349 | A default of only 10 records are returned. We can use the `size` and `from` 350 | arguments to `results()` to either page through results or to change the number 351 | of results. Finally, there is a convenience method, `results_all()` that will 352 | simply fetch all the available results given a query. Note that `results_all()` 353 | may take a long time and return HUGE result sets if not used carefully. Use of a 354 | combination of `count()` and `results()` to get a sense of the expected data 355 | size is probably warranted before calling `results_all()` 356 | 357 | ```{r presultsall} 358 | length(ids(presults)) 359 | presults = pquery |> results_all() 360 | length(ids(presults)) 361 | # includes all records 362 | length(ids(presults)) == count(pquery) 363 | ``` 364 | 365 | Extracting subsets of 366 | results or manipulating the results into a more conventional R data 367 | structure is not easily generalizable. However, 368 | the 369 | [purrr](https://github.com/hadley/purrr), 370 | [rlist](https://renkun.me/rlist/), 371 | and [data.tree](https://cran.r-project.org/web/packages/data.tree/vignettes/data.tree.html) packages 372 | are all potentially of interest for manipulating complex, nested list 373 | structures. For viewing the results in an interactive viewer, consider the 374 | [listviewer](https://github.com/timelyportfolio/listviewer) package. 
375 | 376 | 377 | ### Fields and Values 378 | 379 | [[ GDC `fields` documentation ]](https://docs.gdc.cancer.gov/API/Users_Guide/Search_and_Retrieval/#fields) 380 | 381 | Central to querying and retrieving data from the GDC is the ability to specify 382 | which fields to return, filtering by fields and values, and faceting or 383 | aggregating. The GenomicDataCommons package includes two simple functions, 384 | `available_fields()` and `default_fields()`. Each can operate on a character(1) 385 | endpoint name ("cases", "files", "annotations", or "projects") or a `GDCQuery` 386 | object. 387 | 388 | ```{r defaultfields} 389 | default_fields('files') 390 | # The number of fields available for files endpoint 391 | length(available_fields('files')) 392 | # The first few fields available for files endpoint 393 | head(available_fields('files')) 394 | ``` 395 | 396 | The fields to be returned by a query can be specified following a similar 397 | paradigm to that of the dplyr package. The `select()` function is a verb that 398 | resets the fields slot of a `GDCQuery`; note that this is not quite analogous to 399 | the dplyr `select()` verb that limits from already-present fields. We 400 | *completely replace* the fields when using `select()` on a `GDCQuery`. 401 | 402 | ```{r selectexample} 403 | # Default fields here 404 | qcases = cases() 405 | qcases$fields 406 | # set up query to use ALL available fields 407 | # Note that checking of fields is done by select() 408 | qcases = cases() |> GenomicDataCommons::select(available_fields('cases')) 409 | head(qcases$fields) 410 | ``` 411 | 412 | Finding fields of interest is such a common operation that the 413 | GenomicDataCommons includes the `grep_fields()` function. 414 | See the appropriate help pages for details. 
415 | 416 | ### Facets and aggregation 417 | 418 | [[ GDC `facet` documentation ]](https://docs.gdc.cancer.gov/API/Users_Guide/Search_and_Retrieval/#facets) 419 | 420 | The GDC API offers a feature known as aggregation or faceting. By 421 | specifying one or more fields (of appropriate type), the GDC can 422 | return to us a count of the number of records matching each potential 423 | value. This is similar to the R `table` method. Multiple fields can be 424 | returned at once, but the GDC API does not have a cross-tabulation 425 | feature; all aggregations are only on one field at a time. Results of 426 | `aggregations()` calls come back as a list of data.frames (actually, 427 | tibbles). 428 | 429 | ```{r aggexample} 430 | # total number of files of a specific type 431 | res = files() |> facet(c('type','data_type')) |> aggregations() 432 | res$type 433 | ``` 434 | 435 | Using `aggregations()` is also an easy way to learn the contents of individual 436 | fields and forms the basis for faceted search pages. 437 | 438 | ### Filtering 439 | 440 | [[ GDC `filtering` documentation ]](https://docs.gdc.cancer.gov/API/Users_Guide/Search_and_Retrieval/#filters-specifying-the-query) 441 | 442 | The GenomicDataCommons package uses a form of non-standard evaluation to specify 443 | R-like queries that are then translated into an R list. That R list is, upon 444 | calling a method that fetches results from the GDC API, translated into the 445 | appropriate JSON string. The R expression uses the formula interface as 446 | suggested by Hadley Wickham in his [vignette on non-standard evaluation](https://cran.r-project.org/web/packages/dplyr/vignettes/nse.html): 447 | 448 | > It’s best to use a formula because a formula captures both the expression to 449 | evaluate and the environment where the evaluation occurs. This is important if 450 | the expression is a mixture of variables in a data frame and objects in the 451 | local environment [for example]. 
452 | 453 | For the user, these details will not be too important except to note that a 454 | filter expression must begin with a "~". 455 | 456 | ```{r allfilesunfiltered} 457 | qfiles = files() 458 | qfiles |> count() # all files 459 | ``` 460 | To limit the file type, we can refer back to the 461 | [section on faceting](#facets-and-aggregation) to see the possible values for 462 | the file field "type". For example, to filter file results to only 463 | "gene_expression" files, we simply specify a filter. 464 | 465 | ```{r onlyGeneExpression} 466 | qfiles = files() |> filter( type == 'gene_expression') 467 | # here is what the filter looks like after translation 468 | str(get_filter(qfiles)) 469 | ``` 470 | 471 | What if we want to create a filter based on the project ('TCGA-OVCA', for 472 | example)? Well, we have a couple of possible ways to discover available fields. 473 | The first is based on base R functionality and some intuition. 474 | 475 | ```{r filtAvailFields} 476 | grep('pro',available_fields('files'),value=TRUE) |> 477 | head() 478 | ``` 479 | 480 | Interestingly, the project information is "nested" inside the case. We don't 481 | need to know that detail other than to know that we now have a few potential 482 | guesses for where our information might be in the files records. We need to 483 | know where because we need to construct the appropriate filter. 484 | 485 | ```{r filtProgramID} 486 | files() |> 487 | facet('cases.project.project_id') |> 488 | aggregations() |> 489 | head() 490 | ``` 491 | 492 | We note that `cases.project.project_id` looks like it is a good fit. We also 493 | note that `TCGA-OV` is the correct project_id, not `TCGA-OVCA`. Note that 494 | *unlike with dplyr and friends, the `filter()` method here **replaces** the 495 | filter and does not build on any previous filters*. 
496 | 497 | ```{r filtfinal} 498 | qfiles = files() |> 499 | filter( cases.project.project_id == 'TCGA-OV' & type == 'gene_expression') 500 | str(get_filter(qfiles)) 501 | qfiles |> count() 502 | ``` 503 | 504 | Asking for a `count()` of results given these new filter criteria gives `r 505 | qfiles |> count()` results. Filters can be chained (or nested) to 506 | accomplish the same effect as multiple `&` conditionals. The `count()` 507 | below is equivalent to the `&` filtering done above. 508 | 509 | ```{r filtChain} 510 | qfiles2 = files() |> 511 | filter( cases.project.project_id == 'TCGA-OV') |> 512 | filter( type == 'gene_expression') 513 | qfiles2 |> count() 514 | (qfiles |> count()) == (qfiles2 |> count()) #TRUE 515 | ``` 516 | 517 | 518 | 519 | Generating a manifest for bulk downloads is as 520 | simple as asking for the manifest from the current query. 521 | 522 | ```{r filtAndManifest} 523 | manifest_df = qfiles |> manifest() 524 | head(manifest_df) 525 | ``` 526 | 527 | Note that we might still not be quite there. Looking at filenames, there are 528 | suspiciously named files that might include "FPKM", "FPKM-UQ", or "counts". 529 | Another round of `grep` and `available_fields`, looking for "type" turned up 530 | that the field "analysis.workflow_type" has the appropriate filter criteria. 531 | 532 | 533 | ```{r filterForSTARCounts} 534 | qfiles = files() |> filter( ~ cases.project.project_id == 'TCGA-OV' & 535 | type == 'gene_expression' & 536 | access == "open" & 537 | analysis.workflow_type == 'STAR - Counts') 538 | manifest_df = qfiles |> manifest() 539 | nrow(manifest_df) 540 | ``` 541 | 542 | The GDC Data Transfer Tool can be used (from R, `transfer()` or from the 543 | command-line) to orchestrate high-performance, restartable transfers of all the 544 | files in the manifest. See [the bulk downloads section](#bulk-downloads) for 545 | details. 
546 | 547 | 548 | ## Authentication 549 | 550 | [[ GDC authentication documentation ]](https://docs.gdc.cancer.gov/API/Users_Guide/Getting_Started/#authentication) 551 | 552 | The GDC offers both "controlled-access" and "open" data. As of this 553 | writing, only data stored as files is "controlled-access"; that is, 554 | metadata accessible via the GDC is all "open" data and some files are 555 | "open" and some are "controlled-access". Controlled-access data are 556 | only available 557 | after 558 | [going through the process of obtaining access.](https://gdc.cancer.gov/access-data/obtaining-access-controlled-data) 559 | 560 | After controlled-access to one or more datasets has been granted, 561 | logging into the GDC web portal will allow you 562 | to 563 | [access a GDC authentication token](https://docs.gdc.cancer.gov/Data_Portal/Users_Guide/Authentication/#gdc-authentication-tokens), 564 | which can be downloaded and then used to access available 565 | controlled-access data via the GenomicDataCommons package. 566 | 567 | The GenomicDataCommons uses authentication tokens only for downloading 568 | data (see `transfer` and `gdcdata` documentation). The package 569 | includes a helper function, `gdc_token`, that looks for the token to 570 | be stored in one of three ways (resolved in this order): 571 | 572 | 1. As a string stored in the environment variable, `GDC_TOKEN` 573 | 2. As a file, stored in the file named by the environment variable, 574 | `GDC_TOKEN_FILE` 575 | 3. In a file in the user home directory, called `.gdc_token` 576 | 577 | As a concrete example: 578 | 579 | ```{r authenNoRun, eval=FALSE} 580 | token = gdc_token() 581 | transfer(...,token=token) 582 | # or 583 | transfer(...,token=gdc_token()) 584 | ``` 585 | 586 | 587 | ## Datafile access and download 588 | 589 | ### Data downloads via the GDC API 590 | 591 | The `gdcdata` function takes a character vector of one or more file 592 | ids. 
A simple way of producing such a vector is to produce a 593 | `manifest` data frame and then pass in the first column, which will 594 | contain file ids. 595 | 596 | ```{r singlefileDL} 597 | fnames = gdcdata(manifest_df$id[1:2],progress=FALSE) 598 | 599 | ``` 600 | 601 | Note that for controlled-access data, a 602 | GDC [authentication token](#authentication) is required. Using the 603 | `BiocParallel` package may be useful for downloading in parallel, 604 | particularly for large numbers of smallish files. 605 | 606 | ### Bulk downloads 607 | 608 | The bulk download functionality is only efficient (as of v1.2.0 of the 609 | GDC Data Transfer Tool) for relatively large files, so use this 610 | approach only when transferring BAM files or larger VCF files, for 611 | example. Otherwise, consider using the approach shown above, perhaps 612 | in parallel. 613 | 614 | ```{r bulkDL, eval=FALSE} 615 | # Requires the gdc-client command-line utility to be installed 616 | # separately. 617 | fnames = gdcdata(manifest_df$id[3:10], access_method = 'client') 618 | ``` 619 | 620 | 621 | ### BAM slicing 622 | 623 | # Use Cases 624 | 625 | ## Cases 626 | 627 | ### How many cases are there per project_id? 628 | 629 | ```{r casesPerProject} 630 | res = cases() |> facet("project.project_id") |> aggregations() 631 | head(res) 632 | library(ggplot2) 633 | ggplot(res$project.project_id,aes(x = key, y = doc_count)) + 634 | geom_bar(stat='identity') + 635 | theme(axis.text.x = element_text(angle = 45, hjust = 1)) 636 | ``` 637 | 638 | ### How many cases are included in all TARGET projects? 639 | 640 | ```{r casesInTARGET} 641 | cases() |> filter(~ project.program.name=='TARGET') |> count() 642 | ``` 643 | 644 | ### How many cases are included in all TCGA projects? 645 | 646 | ```{r casesInTCGA} 647 | cases() |> filter(~ project.program.name=='TCGA') |> count() 648 | ``` 649 | 650 | ### What is the breakdown of sample types in TCGA-BRCA? 
651 | 652 | ```{r casesTCGABRCASampleTypes} 653 | # The need to do the "&" here is a requirement of the 654 | # current version of the GDC API. I have filed a feature 655 | # request to remove this requirement. 656 | resp = cases() |> filter(~ project.project_id=='TCGA-BRCA' & 657 | project.project_id=='TCGA-BRCA' ) |> 658 | facet('samples.sample_type') |> aggregations() 659 | resp$samples.sample_type 660 | ``` 661 | 662 | ### Fetch all samples in TCGA-BRCA that use "Solid Tissue" as a normal. 663 | 664 | ```{r casesTCGABRCASolidNormal} 665 | # The need to do the "&" here is a requirement of the 666 | # current version of the GDC API. I have filed a feature 667 | # request to remove this requirement. 668 | resp = cases() |> filter(~ project.project_id=='TCGA-BRCA' & 669 | samples.sample_type=='Solid Tissue Normal') |> 670 | GenomicDataCommons::select(c(default_fields(cases()),'samples.sample_type')) |> 671 | response_all() 672 | count(resp) 673 | res = resp |> results() 674 | str(res[1],list.len=6) 675 | head(ids(resp)) 676 | ``` 677 | 678 | ### Get all TCGA case ids that are female 679 | 680 | ```{r casesFemaleTCGA} 681 | cases() |> 682 | GenomicDataCommons::filter(~ project.program.name == 'TCGA' & 683 | "cases.demographic.gender" %in% "female") |> 684 | GenomicDataCommons::results(size = 4) |> 685 | ids() 686 | ``` 687 | 688 | ### Get all TCGA-COAD case ids that are NOT female 689 | 690 | ```{r notFemaleTCGACOAD} 691 | cases() |> 692 | GenomicDataCommons::filter(~ project.project_id == 'TCGA-COAD' & 693 | "cases.demographic.gender" %exclude% "female") |> 694 | GenomicDataCommons::results(size = 4) |> 695 | ids() 696 | ``` 697 | 698 | ### Get all TCGA cases that are missing gender 699 | 700 | ```{r missingGenderTCGA} 701 | cases() |> 702 | GenomicDataCommons::filter(~ project.program.name == 'TCGA' & 703 | missing("cases.demographic.gender")) |> 704 | GenomicDataCommons::results(size = 4) |> 705 | ids() 706 | ``` 707 | 708 | ### Get all TCGA cases that are NOT 
missing gender 709 | 710 | ```{r notMissingGenderTCGA} 711 | cases() |> 712 | GenomicDataCommons::filter(~ project.program.name == 'TCGA' & 713 | !missing("cases.demographic.gender")) |> 714 | GenomicDataCommons::results(size = 4) |> 715 | ids() 716 | ``` 717 | 718 | 719 | ## Files 720 | 721 | ### How many of each type of file are available? 722 | 723 | ```{r filesVCFCount} 724 | res = files() |> facet('type') |> aggregations() 725 | res$type 726 | ggplot(res$type,aes(x = key,y = doc_count)) + geom_bar(stat='identity') + 727 | theme(axis.text.x = element_text(angle = 45, hjust = 1)) 728 | ``` 729 | 730 | ### Find gene-level RNA-seq quantification files for GBM 731 | 732 | ```{r filesRNAseqGeneGBM} 733 | q = files() |> 734 | GenomicDataCommons::select(available_fields('files')) |> 735 | filter(~ cases.project.project_id=='TCGA-GBM' & 736 | data_type=='Gene Expression Quantification') 737 | q |> facet('analysis.workflow_type') |> aggregations() 738 | # so need to add another filter 739 | file_ids = q |> filter(~ cases.project.project_id=='TCGA-GBM' & 740 | data_type=='Gene Expression Quantification' & 741 | analysis.workflow_type == 'STAR - Counts') |> 742 | GenomicDataCommons::select('file_id') |> 743 | response_all() |> 744 | ids() 745 | ``` 746 | 747 | ## Slicing 748 | 749 | ### Get all BAM file ids from TCGA-GBM 750 | 751 | **I need to figure out how to do slicing reproducibly in a testing environment 752 | and for vignette building**. 
753 | 754 | ```{r filesRNAseqGeneGBMforBAM} 755 | q = files() |> 756 | GenomicDataCommons::select(available_fields('files')) |> 757 | filter(~ cases.project.project_id == 'TCGA-GBM' & 758 | data_type == 'Aligned Reads' & 759 | experimental_strategy == 'RNA-Seq' & 760 | data_format == 'BAM') 761 | file_ids = q |> response_all() |> ids() 762 | ``` 763 | 764 | 765 | ```{r slicing10, eval=FALSE} 766 | bamfile = slicing(file_ids[1],regions="chr12:6534405-6538375",token=gdc_token()) 767 | library(GenomicAlignments) 768 | aligns = readGAlignments(bamfile) 769 | ``` 770 | 771 | # Troubleshooting 772 | 773 | ## SSL connection errors 774 | 775 | * Symptom: Trying to connect to the API results in: 776 | ``` 777 | Error in curl::curl_fetch_memory(url, handle = handle) : 778 | SSL connect error 779 | ``` 780 | * Possible solutions: The [issue 781 | is that the GDC supports only recent versions of the Transport Layer Security (TLS) protocol](http://stackoverflow.com/a/42599546/459633), 782 | so the only known fix is to upgrade the system `openssl` to version 783 | 1.0.1 or later. 784 | * [[Mac OS]](https://github.com/Bioconductor/GenomicDataCommons/issues/35#issuecomment-284233510), 785 | * [[Ubuntu]](http://askubuntu.com/a/434245) 786 | * [[Centos/RHEL]](https://www.liquidweb.com/kb/update-and-patch-openssl-for-the-ccs-injection-vulnerability/). 787 | After upgrading `openssl`, reinstall the R `curl` and `httr` packages. 788 | 789 | 790 | # sessionInfo() 791 | 792 | ```{r sessionInfo} 793 | sessionInfo() 794 | ``` 795 | 796 | # Developer notes 797 | 798 | - The `S3` object-oriented programming paradigm is used. 799 | - We have adopted a functional programming style with functions and methods that 800 | often take an "object" as the first argument. This style lends itself to 801 | pipeline-style programming. 
802 | - The GenomicDataCommons package uses the 803 | [alternative request format (POST)](https://docs.gdc.cancer.gov/API/Users_Guide/Search_and_Retrieval/#alternative-request-format) 804 | to allow very large request bodies. 805 | 806 | --------------------------------------------------------------------------------