├── .Rbuildignore
├── .github
    └── workflows
    │   └── pr_check.yml
├── .gitignore
├── DESCRIPTION
├── NAMESPACE
├── NEWS.md
├── R
    ├── ID-translation.R
    ├── TCGAbarcode.R
    ├── TCGAbiospec.R
    ├── TCGAprimaryTumors.R
    ├── TCGAsampleSelect.R
    ├── TCGAutils-pkg.R
    ├── builds.R
    ├── curatedTCGAData-helpers.R
    ├── data.R
    ├── findGRangesCols.R
    ├── generateMap.R
    ├── getFileName.R
    ├── imputeAssay.R
    ├── makeGRangesListFromCopyNumber.R
    ├── makeGRangesListFromExonFiles.R
    ├── oncoPrintTCGA.R
    ├── simplifyColData.R
    ├── simplifyTCGA.R
    └── utils.R
├── README.md
├── _pkgdown.yml
├── data
    ├── clinicalNames.rda
    ├── diseaseCodes.rda
    └── sampleTypes.rda
├── inst
    ├── extdata
    │   ├── blca_cnaseq.R
    │   ├── blca_cnaseq.txt
    │   ├── bt.exon_quant.R
    │   └── bt.exon_quantification.txt
    └── scripts
    │   ├── clinicalNames.R
    │   ├── diseaseCodes.R
    │   └── sampleTypes.R
├── man
    ├── ID-translation.Rd
    ├── TCGAbarcode.Rd
    ├── TCGAbiospec.Rd
    ├── TCGAprimaryTumors.Rd
    ├── TCGAsampleSelect.Rd
    ├── TCGAutils-package.Rd
    ├── builds.Rd
    ├── clinicalNames.Rd
    ├── curatedTCGAData-helpers.Rd
    ├── diseaseCodes.Rd
    ├── findGRangesCols.Rd
    ├── generateMap.Rd
    ├── getFileName.Rd
    ├── hidden-helpers.Rd
    ├── imputeAssay.Rd
    ├── makeGRangesListFromCopyNumber.Rd
    ├── makeGRangesListFromExonFiles.Rd
    ├── mergeColData.Rd
    ├── oncoPrintTCGA.Rd
    ├── sampleTypes.Rd
    ├── simplifyTCGA-defunct.Rd
    ├── simplifyTCGA.Rd
    └── trimColData.Rd
├── tests
    ├── testthat.R
    └── testthat
    │   ├── test-ID-translation.R
    │   ├── test-builds.R
    │   └── test-identifiers.R
└── vignettes
    └── TCGAutils.Rmd


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^\.github$
2 | ^.*\.Rproj$
3 | ^\.Rproj\.user$
4 | ^data-raw$
5 | 


--------------------------------------------------------------------------------
/.github/workflows/pr_check.yml:
--------------------------------------------------------------------------------
  1 | name: PR CMD check & build site
  2 | 
  3 | on:
  4 |   pull_request:
  5 |   push:  # only pushes to the branches below that touch one of the listed paths
  6 |     paths:
  7 |       - 'DESCRIPTION'
  8 |       - '**.yml'
  9 |     branches:
 10 |       - devel
 11 |       - RELEASE_3_21
 12 | 
 13 | env:  # workflow-wide environment, visible to every job and step
 14 |   R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
 15 |   GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
 16 |   CRAN: https://p3m.dev/cran/__linux__/noble/latest
 17 |   BIOC_RELEASE: RELEASE_3_21  # release branch name; the pkgdown build/upload/deploy steps are gated on it
 18 | 
 19 | jobs:
 20 |   set-matrix:  # publishes the one-entry build matrix and whether a package Dockerfile exists
 21 |     runs-on: ubuntu-24.04
 22 |     outputs:
 23 |       matrix: ${{ steps.set.outputs.matrix }}
 24 |       dockerfile_exists: ${{ steps.dockerfile.outputs.exists }}
 25 |     steps:
 26 |       - uses: actions/checkout@v4  # without a checkout the workspace is empty and the Dockerfile test is always false
 27 |       - name: Set Matrix Bioconductor Version
 28 |         id: set  # NOTE(review): the ref name doubles as the container image tag; confirm this is valid on pull_request events
 29 |         run: |
 30 |           MATRIX="{\"include\":[{\"bioc_version\":\"$GITHUB_REF_NAME\"}]}"
 31 |           echo "matrix=$MATRIX" >> $GITHUB_OUTPUT
 32 |       - name: Check for Dockerfile
 33 |         id: dockerfile
 34 |         run: 'echo "exists=$( [ -f ./inst/docker/pkg/Dockerfile ] && echo true || echo false )" >> $GITHUB_OUTPUT'
 35 | 
 36 |   check:  # R CMD check, coverage, BiocCheck and pkgdown build inside the Bioconductor container
 37 |     needs: set-matrix
 38 |     runs-on: ubuntu-latest
 39 |     strategy:
 40 |       matrix: ${{ fromJson(needs.set-matrix.outputs.matrix) }}
 41 |     container: bioconductor/bioconductor_docker:${{ matrix.bioc_version }}  # image tag is the matrix entry
 42 | 
 43 |     steps:
 44 |       - name: Checkout Repository
 45 |         uses: actions/checkout@v4
 46 |         with:
 47 |           ref: ${{ matrix.bioc_version }}
 48 | 
 49 |       - name: Query dependencies  # writes the dependency snapshot that is hashed into the cache key below
 50 |         run: |
 51 |           BiocManager::install(c("covr", "BiocCheck"))
 52 |           saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
 53 |         shell: Rscript {0}
 54 | 
 55 |       - name: Cache R packages
 56 |         uses: actions/cache@v4
 57 |         with:
 58 |           path: /usr/local/lib/R/site-library
 59 |           key: ${{ runner.os }}-r-${{ matrix.bioc_version }}-${{ hashFiles('.github/depends.Rds') }}
 60 |           restore-keys: ${{ runner.os }}-r-${{ matrix.bioc_version }}-
 61 | 
 62 |       - name: Install GPG  # devel pushes only
 63 |         if: ${{ github.ref == 'refs/heads/devel' && github.event_name != 'pull_request' }}
 64 |         run: sudo apt-get update && sudo apt-get install -y gpg
 65 | 
 66 |       - name: Install Dependencies
 67 |         run: |
 68 |           remotes::install_deps(dependencies = TRUE, repos = BiocManager::repositories())
 69 |           BiocManager::install(c("rcmdcheck", "BiocCheck"), ask = FALSE, update = TRUE)
 70 |         shell: Rscript {0}
 71 | 
 72 |       - name: Check Package  # results go to ./check, which the BiocCheck step reads the tarball from
 73 |         env:
 74 |           _R_CHECK_CRAN_INCOMING_REMOTE_: false
 75 |         run: rcmdcheck::rcmdcheck(args = "--no-manual", error_on = "error", check_dir = "check")
 76 |         shell: Rscript {0}
 77 | 
 78 |       - name: Test coverage  # devel pushes only
 79 |         if: ${{ success() && github.ref == 'refs/heads/devel' && github.event_name != 'pull_request' }}
 80 |         run: |
 81 |           cov <- covr::package_coverage(
 82 |             quiet = FALSE,
 83 |             clean = FALSE,
 84 |             type = "all",
 85 |             install_path = file.path(
 86 |               normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"),
 87 |               "package"
 88 |             )
 89 |           )
 90 |           covr::to_cobertura(cov)
 91 |         shell: Rscript {0}
 92 | 
 93 |       - name: Upload test results to Codecov  # same condition as the coverage step
 94 |         if: ${{ success() && github.ref == 'refs/heads/devel' && github.event_name != 'pull_request' }}
 95 |         uses: codecov/codecov-action@v4
 96 |         with:
 97 |           fail_ci_if_error: ${{ github.event_name != 'pull_request' && true || false }}
 98 |           file: ./cobertura.xml
 99 |           plugin: noop
100 |           disable_search: true
101 |           token: ${{ secrets.CODECOV_TOKEN }}
102 | 
103 |       - name: Run BiocCheck
104 |         id: bioccheck
105 |         run: |
106 |           BiocCheck::BiocCheck(
107 |             dir('check', 'tar.gz$', full.names = TRUE),
108 |             `quit-with-status` = TRUE, `no-check-bioc-help` = TRUE
109 |           )
110 |         shell: Rscript {0}
111 | 
112 |       - name: Build pkgdown  # pushes to the BIOC_RELEASE branch only
113 |         if: ${{ github.ref == format('refs/heads/{0}', env.BIOC_RELEASE) && github.event_name != 'pull_request' }}
114 |         run: |
115 |            PATH=$PATH:$HOME/bin/ Rscript -e 'pkgdown::build_site()'
116 | 
117 |       - name: Upload pkgdown artifact  # uploads ./docs as the Pages artifact
118 |         if: github.ref == format('refs/heads/{0}', env.BIOC_RELEASE)
119 |         uses: actions/upload-pages-artifact@v3
120 |         with:
121 |           path: docs
122 | 
123 |   dock:  # builds and pushes the package image; runs only on devel when set-matrix reported a Dockerfile
124 |     needs:
125 |       - check
126 |       - set-matrix
127 |     runs-on: ubuntu-24.04
128 |     if: ${{ github.ref == 'refs/heads/devel' && needs.set-matrix.outputs.dockerfile_exists == 'true' }}
129 |     steps:
130 |       - name: Checkout Repository
131 |         if: ${{ success() && github.event_name != 'pull_request' }}
132 |         uses: actions/checkout@v4
133 | 
134 |       - name: Register repo name  # exports the lower-cased repository name as CONT_IMG_NAME
135 |         if: ${{ github.event_name != 'pull_request' }}
136 |         id: reg_repo_name
137 |         run: |
138 |           echo CONT_IMG_NAME=$(echo ${{ github.event.repository.name }} | tr '[:upper:]' '[:lower:]') >> $GITHUB_ENV
139 | 
140 |       - name: Login to Docker Hub
141 |         if: ${{ github.event_name != 'pull_request' }}
142 |         uses: docker/login-action@v2
143 |         with:
144 |           username: ${{ secrets.DOCKERHUB_USERNAME }}
145 |           password: ${{ secrets.DOCKERHUB_TOKEN }}
146 | 
147 |       - name: Build and Push Docker  # pushes the image under both the latest and devel tags
148 |         if: ${{ success() && github.event_name != 'pull_request' }}
149 |         uses: docker/build-push-action@v6
150 |         with:
151 |           context: .
152 |           file: ./inst/docker/pkg/Dockerfile
153 |           push: true
154 |           tags: >
155 |             ${{ secrets.DOCKERHUB_USERNAME }}/${{ env.CONT_IMG_NAME }}:latest,
156 |             ${{ secrets.DOCKERHUB_USERNAME }}/${{ env.CONT_IMG_NAME }}:devel
157 | 
158 |   deploy:  # publishes the GitHub Pages site; the check job uploads the pages artifact
159 |     needs: check
160 |     permissions:
161 |       contents: write
162 |       pages: write
163 |       id-token: write
164 |     runs-on: ubuntu-24.04
165 | 
166 |     steps:
167 |       - name: Deploy to GitHub Pages  # the job always starts, but this step only runs for BIOC_RELEASE pushes
168 |         if: ${{ github.ref == format('refs/heads/{0}', env.BIOC_RELEASE) && github.event_name != 'pull_request' }}
169 |         id: deployment
170 |         uses: actions/deploy-pages@v4
171 | 
172 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # History files
 2 | .Rhistory
 3 | .Rapp.history
 4 | 
 5 | # Session Data files
 6 | .RData
 7 | *.txt
 8 | *.tar.gz
 9 | !blca_cnaseq.txt
10 | 
11 | # RStudio files
12 | .Rproj.user
13 | *.Rproj
14 | 
15 | # produced vignettes
16 | vignettes/*.html
17 | vignettes/*.pdf
18 | 
19 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
20 | .httr-oauth
21 | 
22 | # Versioned files
23 | *.orig
24 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: TCGAutils
 2 | Title: TCGA utility functions for data management
 3 | Version: 1.29.3
 4 | Description: A suite of helper functions for checking and manipulating TCGA
 5 |   data including data obtained from the curatedTCGAData experiment package.
 6 |   These functions aim to simplify and make working with TCGA data more
 7 |   manageable. Exported functions include those that import data from flat files
  8 |   into Bioconductor objects, convert row annotations, and translate
  9 |   identifiers via the GDC API.
10 | Authors@R: c(
11 |     person("Marcel", "Ramos", email = "marcel.ramos@sph.cuny.edu",
12 |         role = c("aut", "cre"), comment = c(ORCID = "0000-0002-3242-0582")),
13 |     person("Lucas", "Schiffer", role = "aut"),
14 |     person("Sean", "Davis", role = "ctb"),
15 |     person("Levi", "Waldron", role = "aut")
16 |     )
17 | Depends:
18 |     R (>= 4.5.0)
19 | Imports:
20 |     AnnotationDbi,
21 |     BiocGenerics,
22 |     BiocBaseUtils,
23 |     GenomeInfoDb,
24 |     GenomicFeatures,
25 |     GenomicRanges,
26 |     GenomicDataCommons,
27 |     IRanges,
28 |     methods,
29 |     MultiAssayExperiment,
30 |     RaggedExperiment,
31 |     rvest,
32 |     S4Vectors,
33 |     stats,
34 |     stringr,
35 |     SummarizedExperiment,
36 |     utils,
37 |     xml2
38 | Suggests:
39 |     AnnotationHub,
40 |     BiocStyle,
41 |     curatedTCGAData,
42 |     ComplexHeatmap,
43 |     devtools,
44 |     dplyr,
45 |     httr,
46 |     IlluminaHumanMethylation450kanno.ilmn12.hg19,
47 |     impute,
48 |     knitr,
49 |     magrittr,
50 |     org.Hs.eg.db,
51 |     RColorBrewer,
52 |     readr,
53 |     rmarkdown,
54 |     RTCGAToolbox,
55 |     rtracklayer,
56 |     R.utils,
57 |     testthat,
58 |     TxDb.Hsapiens.UCSC.hg18.knownGene,
59 |     TxDb.Hsapiens.UCSC.hg19.knownGene
60 | License: Artistic-2.0
61 | Roxygen: list(markdown = TRUE)
62 | Encoding: UTF-8
63 | BugReports: https://github.com/waldronlab/TCGAutils/issues
64 | biocViews: Software, WorkflowStep, Preprocessing, DataImport
65 | VignetteBuilder: knitr
66 | RoxygenNote: 7.3.2
67 | Date: 2025-06-09
68 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(CpGtoRanges)
 4 | export(TCGAbarcode)
 5 | export(TCGAbiospec)
 6 | export(TCGAprimaryTumors)
 7 | export(TCGAsampleSelect)
 8 | export(TCGAsplitAssays)
 9 | export(UUIDhistory)
10 | export(UUIDtoBarcode)
11 | export(UUIDtoUUID)
12 | export(barcodeToUUID)
13 | export(correctBuild)
14 | export(extractBuild)
15 | export(filenameToBarcode)
16 | export(findGRangesCols)
17 | export(generateMap)
18 | export(getClinicalNames)
19 | export(getFileName)
20 | export(getSubtypeMap)
21 | export(imputeAssay)
22 | export(isCorrect)
23 | export(makeGRangesListFromCopyNumber)
24 | export(makeGRangesListFromExonFiles)
25 | export(mergeColData)
26 | export(mirToRanges)
27 | export(oncoPrintTCGA)
28 | export(qreduceTCGA)
29 | export(sampleTables)
30 | export(simplifyTCGA)
31 | export(symbolsToRanges)
32 | export(translateBuild)
33 | export(trimColData)
34 | export(uniformBuilds)
35 | import(methods)
36 | importFrom(BiocBaseUtils,checkInstalled)
37 | importFrom(BiocBaseUtils,isScalarCharacter)
38 | importFrom(BiocBaseUtils,isScalarNumber)
39 | importFrom(BiocBaseUtils,lifeCycle)
40 | importFrom(BiocBaseUtils,selectSome)
41 | importFrom(BiocBaseUtils,setSlots)
42 | importFrom(GenomeInfoDb,"genome<-")
43 | importFrom(GenomeInfoDb,"seqlevelsStyle<-")
44 | importFrom(GenomeInfoDb,genome)
45 | importFrom(GenomeInfoDb,keepStandardChromosomes)
46 | importFrom(GenomeInfoDb,seqlevelsStyle)
47 | importFrom(GenomicDataCommons,cases)
48 | importFrom(GenomicDataCommons,expand)
49 | importFrom(GenomicDataCommons,files)
50 | importFrom(GenomicDataCommons,filter)
51 | importFrom(GenomicDataCommons,ids)
52 | importFrom(GenomicDataCommons,results_all)
53 | importFrom(GenomicDataCommons,select)
54 | importFrom(GenomicFeatures,genes)
55 | importFrom(GenomicFeatures,microRNAs)
56 | importFrom(GenomicRanges,GRanges)
57 | importFrom(GenomicRanges,GRangesList)
58 | importFrom(GenomicRanges,granges)
59 | importFrom(GenomicRanges,makeGRangesListFromDataFrame)
60 | importFrom(MultiAssayExperiment,"colData<-")
61 | importFrom(MultiAssayExperiment,ExperimentList)
62 | importFrom(MultiAssayExperiment,colData)
63 | importFrom(MultiAssayExperiment,experiments)
64 | importFrom(MultiAssayExperiment,metadata)
65 | importFrom(MultiAssayExperiment,subsetByColumn)
66 | importFrom(S4Vectors,DataFrame)
67 | importFrom(S4Vectors,isSingleInteger)
68 | importFrom(S4Vectors,isSingleNumber)
69 | importFrom(S4Vectors,isSingleString)
70 | importFrom(SummarizedExperiment,"mcols<-")
71 | importFrom(SummarizedExperiment,"rowData<-")
72 | importFrom(SummarizedExperiment,SummarizedExperiment)
73 | importFrom(SummarizedExperiment,mcols)
74 | importFrom(SummarizedExperiment,rowData)
75 | importFrom(rvest,html_attr)
76 | importFrom(rvest,html_nodes)
77 | importFrom(stats,as.formula)
78 | importFrom(stats,na.omit)
79 | importFrom(stats,setNames)
80 | importFrom(stringr,str_extract)
81 | importFrom(utils,data)
82 | importFrom(utils,head)
83 | importFrom(utils,read.delim)
84 | importFrom(xml2,read_html)
85 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
  1 | ## Changes in version 1.30.0
  2 | 
  3 | ### Significant User-visible changes
  4 | 
  5 | * The deprecation of the `mirbase.db` package affects the `mirToRanges` function.
  6 | 
  7 | ### Bug fixes and minor improvements
  8 | 
  9 | * Use `BiocBaseUtils::checkInstalled` to check for suggested packages.
 10 | 
 11 | ## Changes in version 1.24.0
 12 | 
 13 | ### Significant User-visible changes
 14 | 
 15 | * The `legacy` argument in ID translation functions (`UUIDtoBarcode`,
 16 | `UUIDtoUUID`, `barcodeToUUID`, and `filenameToBarcode`) is now defunct and has
 17 | been removed.
 18 | 
 19 | ### Bug fixes and minor improvements
 20 | 
 21 | * `UUIDtoBarcode` ensures that results are ordered based on the input UUIDs.
 22 | * Include informative error message regarding translation of UUIDs from legacy
 23 | files.
 24 | 
 25 | ## Changes in version 1.22.0
 26 | 
 27 | ### Bug fixes and minor improvements
 28 | 
 29 | * `UUIDtoBarcode` returns barcodes consistent with Genomic Data Commons API
 30 | update
 31 | 
 32 | ## Changes in version 1.20.0
 33 | 
 34 | ### New features
 35 | 
 36 | * `makeSummarizedExperimentFromGISTIC` and `splitAssays` are now defunct.
 37 | 
 38 | ### Minor changes and bug fixes
 39 | 
 40 | * `makeGRangesListFromExonFiles` example removed from the vignette, the GDC
 41 | `legacy` endpoint has been deprecated. For more information see the GDC API
 42 | release notes version v3.28.0.
 43 | 
 44 | ## Changes in version 1.18.0
 45 | 
 46 | ### Minor changes and bug fixes
 47 | 
 48 | * Use https instead of http in `getFileName` helper.
 49 | * Warn when column names in assays are not mappable and subsequently dropped in
 50 | `generateMap`.
 51 | * Updated `qreduceTCGA` documentation for clarity.
 52 | 
 53 | ## Changes in version 1.16.0
 54 | 
 55 | ### New features
 56 | 
 57 | * The `UUIDhistory` function allows users to map old UUIDs to new UUIDs
 58 | according to the latest data release for UUIDs that were affected and no longer
 59 | query-able.
 60 | * The `slides` argument has been added to the `filenameToBarcode` function for
 61 | translating slide file names into barcodes. Currently, the API returns all
 62 | barcodes of the associated case ID.
 63 | * Add sections in the vignette regarding GDC Data Updates and UUID history
 64 | lookup
 65 | 
 66 | ### Minor changes and bug fixes
 67 | 
 68 | * Update examples in package to new GDC Data Release, see vignette.
 69 | * Use `AnnotationHub` to download chain file in main vignette.
 70 | * Slide file names now resolve to a single TCGA barcode in `filenameToBarcode`
 71 | (Thanks @hermidalc)
 72 | * Improved error messages and documentation for `makeGRangesListFromExonFiles`
 73 | 
 74 | ## Changes in version 1.14.0
 75 | 
 76 | ### Minor changes and bug fixes
 77 | 
 78 | * `UUIDtoBarcode` with the `from_type = "file_id"` argument now returns the IDs
 79 | in the proper order when more than one `UUID` is input.
 80 | * Update `makeGRangesListFromCopyNumber` examples with new names from API e.g.,
 81 | 'associated_entities.entity_submitter_id'
 82 | 
 83 | ## Changes in version 1.12.0
 84 | 
 85 | ### New features
 86 | 
 87 | * `makeSummarizedExperimentFromGISTIC` has been moved to `RTCGAToolbox`.
 88 | * `splitAssays` now deprecated for `TCGAsplitAssays` to avoid conflict with
 89 | `MultiAssayExperiment::splitAssays`
 90 | 
 91 | ### Minor changes and bug fixes
 92 | 
 93 | * Properly identifies genome annotation (`hg*`) in `oncoPrintTCGA`
 94 | * `qreduceTCGA` now works with updates to `seqlevelsStyle` where genome
 95 | annotations include patch versions when available
 96 | 
 97 | ## Changes in version 1.10.0
 98 | 
 99 | ### New features
100 | 
101 | * `correctBuild` attempts to provide the official name of a particular human
102 | genome build to agree with changes in `GenomeInfoDb`
103 | * `isCorrect` checks that the build name matches the official name
104 | 
105 | ### Minor changes and bug fixes
106 | 
107 | * Documentation improvements to `simplifyTCGA`
108 | * Improvements to `findGRangesCols` to locate ranged columns in a `DataFrame`
109 | * Fixed a bug in `UUIDtoBarcode` where only the first record was returned
110 | (#26, @DarioS)
111 | * Fixed a bug in `filenameToBarcode` when multiple inputs were used (#22,
112 | @DarioS)
113 | 
114 | ## Changes in version 1.8.0
115 | 
116 | ### New features
117 | 
118 | * `README.md` now includes a cheat sheet for reference
119 | * `mergeColData` and `oncoPrintTCGA` sections updated/included in the vignette
120 | 
121 | ### Minor changes and bug fixes
122 | 
123 | * `translateBuild` more robust to inconsistent inputs
124 | * `translateBuild` returns vector output instead of single string as before
125 | * `makeSummarizedExperimentFromGISTIC` now has a more open interface with
126 | `...` input to `RTCGAToolbox::getGISTICPeaks`
127 | * `oncoPrintTCGA` now uses `seqlevels` from input throughout
128 | 
129 | ## Changes in version 1.6.0
130 | 
131 | ### New features
132 | 
133 | * `oncoPrintTCGA`: Create an `oncoPrint` visualization for mutation data
134 | * Support `aliquot_ids` as input to `UUIDtoBarcode` function
135 | * Additional sections in the vignette: `CpGtoRanges`, `UUIDtoBarcode` for
136 | `aliquot_ids`
137 | * `TCGAprimaryTumors` allows users to select all primary tumors for a given
138 | `curatedTCGAData` `MultiAssayExperiment` object (suggested by @vjcitn)
139 | 
140 | ### Minor changes and bug fixes
141 | 
142 | * Now merging clinical data using both rows and columns in `mergeColData`
143 | * Added informative error when query results are empty in `UUIDtoBarcode`
144 | * Updates to `makeGRangesListFromExonFiles` to use `S4Vectors::splitAsList`
145 | (@hpages)
146 | 
147 | ## Changes in version 1.4.0
148 | 
149 | ### New features
150 | 
151 | * `trimColData` added to remove any extra columns from the `colData` slot
152 | (thanks to @vjcitn)
153 | * `CpGtoRanges` translates CpG islands to genomic positions using an annotation
154 | package and `minfi`
155 | * Overhaul of the barcode translation services allows accurate translation
156 | of identifiers
157 | * `splitAssays` now separates all assays by sample codes contained therein
158 | by default, previous behavior had default values
159 | * Documentation for `simplifyTCGA` was modified to include similar operations,
160 | such as, `symbolsToRanges`, `mirToRanges`, `CpGtoRanges`, etc.
161 | * Vignette includes comprehensive examples of new functionality
162 | 
163 | ### Minor changes and bug fixes
164 | 
165 | * `getFileNames` renamed to `getFileName`
166 | * `TCGAsampleSelect` now allows multiple sample type inputs as the
167 | `sampleCodes` argument
168 | * `getSubtypeMap` updates column names to accurately represent patient
169 | identifiers
170 | * More robust checks were added to `splitAssays` to ensure valid sample codes
171 | in the input and provided as arguments
172 | * `makeGRangesListFromExonFiles` is optimized to use `dplyr` when available
173 | and fast operations from `IRanges`
174 | * Various enhancements to `*toRanges` functions, including re-using underlying
175 | common helper function
176 | * The internal `weightedmean` function in `qreduceTCGA` has been updated for
177 | correctness
178 | * The `keep` argument in `qreduceTCGA` and related functions was changed
179 | to `keep.assay`
180 | 
181 | ## Changes in version 1.2.0
182 | 
183 | ### New features
184 | 
185 | * `imputeAssay` added to impute data for MultiAssayExperiment assays
186 | * `UUIDtoUUID` translation available to translate from file to case IDs
187 | * A suite of functions is available to enhance existing MultiAssayExperiment
188 | datasets: `qreduceTCGA`, `mirToRanges`, `symbolsToRanges`. Thanks to @lwaldron
189 | 
190 | ### Minor changes and bug fixes
191 | 
192 | * Various changes to examples for compatibility with RaggedExperiment
193 | * Bug fix to internal functions for finding GRanges columns
194 | 
195 | ## Changes in version 1.1.5
196 | 
197 | * `uniformBuilds` cleans up a vector of inconsistently labelled builds by
198 | changing the build with the lowest frequency
199 | 
200 | ## Changes in version 1.1.4
201 | 
202 | ### New features
203 | 
204 | * The `UUIDtoUUID` function can translate from case to file UUIDs and vice
205 | versa
206 | * `imputeAssay` allows missing data imputation using KNN for
207 | `MultiAssayExperiment` assays
208 | 
209 | ## Changes in version 1.1.1
210 | 
211 | ### New features
212 | 
213 | * exported the internal helper, `filenameToBarcode`. See examples
214 | 
215 | ## Changes in version 0.99.68
216 | 
217 | ### Minor changes and bug fixes
218 | 
219 | * Minor changes in response to review, avoid switching from logical to numeric
220 | index, updated vignette introduction
221 | * Fix examples to updated `GenomicDataCommons` interface
222 | * Move `RTCGAToolbox` to `Suggests` field in DESCRIPTION
223 | * Removed `BiocFileCache` from `Imports` field
224 | 
225 | ## Changes in version 0.99.51
226 | 
227 | ### New features
228 | 
229 | * Added a group of ID translation helper functions (see ?ID-translation)
230 | * Added a group of helper functions that work with `curatedTCGAData`
231 | * `UUIDtoBarcode` function added thanks to @seandavi
232 | * Renamed `makeGRangesListFromTCGA` to `makeGRangesListFromCopyNumber`
233 | * `makeSummarizedExperimentFromGISTIC` is now available to convert
234 | `RTCGAToolbox`'s `FirehoseGISTIC` data class to `SummarizedExperiment`
235 | * Created a function to merge external `colData` to a `MultiAssayExperiment`
236 | `colData` slot
237 | * Revamped vignette documentation
238 | 
239 | ### Minor changes and bug fixes
240 | 
241 | * Improvements to `TCGAbiospec` and `TCGAbarcode`
242 | * Updated `sampleTypes` and `clinicalNames` datasets
243 | * Updated DESCRIPTION file with appropriate imports and exports
244 | * Various improvements to `findGRangesCols`
245 | * `generateMap` is now updated to the recent `MultiAssayExperiment` API with
246 | improved example
247 | * Updated `getFileNames` to most recent `RTCGAToolbox` API
248 | * Various updates to data generating scripts in `data-raw` folder
249 | * Format updates to NEWS file
250 | * Added tests
251 | 
252 | ## Changes in version 0.2.0
253 | 
254 | ### New features
255 | 
256 | * Package renamed to `TCGAutils` for working with TCGA data
257 | 
258 | ## Changes in version 0.1.0
259 | 
260 | ### New features
261 | 
262 | * `TCGAtranslateID` now works with GDC API
263 | 
264 | ### Minor changes and bug fixes
265 | 
266 | * Code cleaned up
267 | * Added proper import directives
268 | 
269 | ## Changes in version 0.0.70
270 | 
271 | ### New features
272 | 
273 | * `makeGRangesListFromDataFrame` now moved to `GenomicRanges`
274 | * `makeSummarizedExperimentFromDataFrame` now moved to `SummarizedExperiment`
275 | * `getFileNames` function will obtain filenames used in `RTCGAToolbox`
276 | * Improved `getFileNames` with `xml2` and `rvest` dependencies and removes the
277 | `XML` dependency
278 | 
279 | ### Minor changes and bug fixes
280 | 
281 | * `TCGAextract` now uses the `findGRangesCols` to automatically detect ranged
282 | data columns
283 | * Arguments in functions `TCGA*` now renamed to match `MultiAssayExperiment`
284 | conventions
285 | * Informative errors in `TCGAextract`
286 | 
287 | ## Changes in version 0.0.60
288 | 
289 | * `makeGRangesListFromTCGA` data builds on `makeGRangesListFromDataFrame`
290 | * `makeGRangesListFromDataFrame` and
291 | `makeRangedSummarizedExperimentFromDataFrame` will be moving to standard
292 | Bioconductor packages soon.
293 | * `tcga` and `ccle` functions soon to be deprecated.
294 | * Upcoming: `TCGAbarcode` will be modified for efficiency
295 | 
296 | ## Changes in version 0.0.50
297 | 
298 | * Add your own identifier parsing function for generating a `sampleMap` in
299 | `generateMap`!
300 | * Add proper genome build to ranged based objects.
301 | * Return `SummarizedExperiment` class for certain data types.
302 | * Fix genome build bugs
303 | 
304 | ## Changes in version 0.0.46
305 | 
306 | * `makeRSE` function for creating a `RangedSummarizedExperiment` object from a
307 | data frame.
308 | * Bug fixes to `getRangeNames` including the option to enter a regular
309 | expression vector for finding ranged column names.
310 | * `matchClinical` renamed to `TCGAmatchClinical`
311 | 
312 | ## Changes in version 0.0.44
313 | 
314 | * `getRangedNames` function will try to extract minimum necessary names for
315 | creating ranges (works on a vector of names)
316 | * minor bug fixes to `TCGAbiospec`, `TCGAextract`, `makeGRangesList`
317 | 
318 | ## Changes in version 0.0.40
319 | 
320 | * Package renamed to `BiocInterfaces`!
321 | * `TCGA` specific functions now start with the letters `TCGA`
322 | * Included: more examples of use of the `TCGAbarcode` function
323 | * Updated `makeGRangesList` function to work with `tcga` and `ccle` data
324 |     parameter functions
325 | 
326 | ## Changes in version 0.0.2
327 | 
328 | * Added a `NEWS.md` file to track changes to the package.
329 | * TCGAmisc now a standalone package! (previously in `RTCGAToolbox`)
330 | * Provides helper functions for converting raw data into S4 objects (e.g.,
331 | `GRangesList`)
332 | * Provides functions for creating a MultiAssayExperiment object such as:
333 |     * `generateTCGAmap`
334 |     * `cleanExpList`
335 | 


--------------------------------------------------------------------------------
/R/ID-translation.R:
--------------------------------------------------------------------------------
  1 | ## Build the dotted field path from "cases" down to the 'startPoint' entity
  2 | .barcode_files <- function(startPoint = "cases", submitter_id = TRUE) {
  3 |     entities <- c("cases", "samples", "portions", "analytes", "aliquots")
  4 |     depth <- match(match.arg(startPoint, entities), entities)
  5 |     path <- entities[seq_len(depth)]
  6 |     if (submitter_id) path <- c(path, "submitter_id") # optional leaf field
  7 |     paste(path, collapse = ".")
  8 | }
  9 | 
 10 | .subword_id <- function(keyword) { # e.g. "sample" -> c(sample_ids = "submitter_sample_ids")
 11 |     field <- paste0(keyword, "_ids")
 12 |     structure(paste0("submitter_", field), names = field)
 13 | }
 14 | 
 15 | .barcode_cases <- function(bcodeType = "case") {
 16 |     ## every level other than "case" uses the 'submitter_<level>_ids' form
 17 |     if (!identical(bcodeType, "case"))
 18 |         return(.subword_id(bcodeType))
 19 |     c(case_id = "submitter_id")
 20 | }
 21 | 
 22 | ## Infer the entity level of TCGA barcodes from their delimited fields
 23 | .findBarcodeLimit <- function(barcode) {
 24 |     .checkBarcodes(barcode)
 25 |     filler <- .uniqueDelim(barcode)
 26 |     splitCodes <- strsplit(barcode, filler)
 27 |     obsIdx <- unique(lengths(splitCodes))
 28 |     if (length(obsIdx) != 1L) stop("Inconsistent number of barcode fields")
 29 |     if (obsIdx < 3L)
 30 |         stop("Minimum barcode fields required: ", 3L,
 31 |             "; first three are 'project-TSS-participant'")
 32 |     key <- c(rep("case", 3L), "sample", "analyte", "aliquot", "aliquot")[obsIdx]
 33 |     if (identical(key, "analyte")) {
 34 |         analyte_chars <- unique(
 35 |             vapply(splitCodes, function(x) nchar(x[[obsIdx]]), integer(1L))
 36 |         )
 37 |         if (!S4Vectors::isSingleInteger(analyte_chars))
 38 |             stop("Inconsistent '", key, "' barcodes")
 39 |         if (analyte_chars < 3)
 40 |             key <- "portion" # fifth field shorter than three characters
 41 |     } else if (identical(obsIdx, 6L)) {
 42 |         ## vapply() yields a 2 x n matrix here, so identical() against a plain
 43 |         ## vector never matched; compare widths element-wise (down columns)
 44 |         ali_chars <- vapply(splitCodes, function(x)
 45 |             nchar(x[c(obsIdx-1L, obsIdx)]), integer(2L))
 46 |         if (all(ali_chars == c(2L, 3L)))
 47 |             key <- "slide"
 48 |     }
 49 |     key
 50 | }
 51 | 
 52 | ## Pair each identifier in 'info' with every barcode listed for it in 'id_list'
 53 | .buildIDframe <- function(info, id_list) {
 54 |     uuids <- ids(info)
 55 |     ## without identifiers there is nothing to unlist; keep a character column
 56 |     barcodes <- if (length(uuids)) unlist(id_list) else character(0L)
 57 |     data.frame(
 58 |         id = rep(uuids, lengths(id_list)), barcode = barcodes,
 59 |         row.names = NULL, stringsAsFactors = FALSE
 60 |     )
 61 | }
 62 | 
 63 | .cleanExpand <- function(result, ids) { # tabulate the expanded 'samples' fields, one row per entry of 'ids'
 64 |     samps <- result[["samples"]]
 65 |     usamps <- unlist(samps) # flatten to a single named vector
 66 |     splitsamps <- split(unname(usamps), gsub("[0-9]*$", "", names(usamps))) # group values by name, ignoring trailing digits
 67 |     splits <- strsplit(names(splitsamps), "\\.")
 68 |     cnames <- unique(vapply(splits, function(x) {
 69 |         paste0(x[-1], collapse = ".") }, character(1))) # column names: group names minus their first dot-separated part
 70 |     first <- unlist(splitsamps[c(TRUE, FALSE)]) # odd-numbered groups
 71 |     second <- unlist(splitsamps[c(FALSE, TRUE)]) # even-numbered groups
 72 |     pos <- match(ids, first) # rows follow the order of 'ids'; unmatched ids give NA
 73 |     resframe <- cbind.data.frame(first[pos], second[pos], row.names = NULL,
 74 |         stringsAsFactors = FALSE)
 75 |     names(resframe) <- cnames
 76 |     resframe
 77 | }
 78 | 
 79 | .orderedDF <- function(..., orderBy) { # data.frame of '...' with rows following 'orderBy'
 80 |     tbl <- data.frame(..., stringsAsFactors = FALSE)
 81 |     ## look up each requested id in the 'info..from_type..' column
 82 |     sorted <- tbl[match(orderBy, tbl[["info..from_type.."]]), ]
 83 |     rownames(sorted) <- NULL # discard the shuffled row labels
 84 |     sorted
 85 | }
 86 | 
 87 | .nestedlisttodf <- function(x, orderBy) {
 88 |     .check_ids_found(names(x), orderBy)
 89 |     x <- Filter(length, x[orderBy])
 90 |     data.frame(
 91 |         rep(names(x), vapply(x, nrow, integer(1))),
 92 |         unlist(x, use.names = FALSE),
 93 |         stringsAsFactors = FALSE
 94 |     )
 95 | }
 96 | 
 97 | #' @importFrom BiocBaseUtils selectSome
 98 | .check_ids_found <- function(resnames, id_vector) {
 99 |     idin <- id_vector %in% resnames
100 |     if (!all(idin)) {
101 |         mids <- paste(
102 |             selectSome(id_vector[!idin], 4), collapse = ", "
103 |         )
104 |         warning("Identifiers not found: ", mids, call. = FALSE)
105 |     }
106 | }
107 | 
108 | #' @name ID-translation
109 | #'
110 | #' @title Translate study identifiers from barcode to UUID and vice versa
111 | #'
112 | #' @description These functions allow the user to enter a character vector of
113 | #' identifiers and use the GDC API to translate from TCGA barcodes to
114 | #' Universally Unique Identifiers (UUID) and vice versa. These relationships
115 | #' are not one-to-one. Therefore, a `data.frame` is returned for all
116 | #' inputs. The UUID to TCGA barcode translation only applies to file and case
117 | #' UUIDs. Two-way UUID translation is available from 'file_id' to 'case_id'
118 | #' and vice versa. Please double check any results before using these
119 | #' features for analysis. Case / submitter identifiers are translated by
120 | #' default, see the `from_type` argument for details. All identifiers are
121 | #' converted to lower case.
122 | #'
123 | #' @details
124 | #' Based on the file UUID supplied, the appropriate entity_id (TCGA barcode) is
125 | #' returned. In previous versions of the package, the 'end_point' parameter
126 | #' would require the user to specify what type of barcode needed. This is no
127 | #' longer supported as `entity_id` returns the appropriate one.
128 | #'
129 | #' @param id_vector character() A vector of UUIDs corresponding to
130 | #'     either files or cases (default assumes case_ids)
131 | #'
#' @param from_type character(1) One of `case_id`, `file_id`, or `aliquot_ids`
#'     indicating the type of `id_vector` entered (default `"case_id"`)
134 | #'
135 | #' @return Generally, a `data.frame` of identifier mappings
136 | #'
137 | #' @md
138 | #'
139 | #' @examples
140 | #' ## Translate UUIDs >> TCGA Barcode
141 | #'
142 | #' uuids <- c("b4bce3ff-7fdc-4849-880b-56f2b348ceac",
143 | #' "5ca9fa79-53bc-4e91-82cd-5715038ee23e",
144 | #' "b7c3e5ad-4ffc-4fc4-acbf-1dfcbd2e5382")
145 | #'
146 | #' UUIDtoBarcode(uuids, from_type = "file_id")
147 | #'
148 | #' UUIDtoBarcode("ae55b2d3-62a1-419e-9f9a-5ddfac356db4", from_type = "case_id")
149 | #'
150 | #' UUIDtoBarcode("d85d8a17-8aea-49d3-8a03-8f13141c163b", "aliquot_ids")
151 | #'
152 | #' @author Sean Davis, M. Ramos
153 | #'
154 | #' @export UUIDtoBarcode
UUIDtoBarcode <-  function(
    id_vector, from_type = c("case_id", "file_id", "aliquot_ids")
) {
    from_type <- match.arg(from_type)
    # Identifiers are documented as being converted to lower case (as
    # UUIDtoUUID does), so the query and the reordering of the results
    # against 'id_vector' both use the same case
    id_vector <- tolower(id_vector)
    # field requested from the API and the result element that holds it
    targetElement <- APIendpoint <- "submitter_id"
    if (identical(from_type, "file_id")) {
        APIendpoint <- "associated_entities.entity_submitter_id"
        targetElement <- "associated_entities"
    } else if (identical(from_type, "aliquot_ids")) {
        APIendpoint <- "samples.portions.analytes.aliquots.submitter_id"
        targetElement <- "samples"
    }
    # case queries are used as-is; the other types select specific fields
    selector <- switch(from_type,
        case_id = identity,
        aliquot_ids =
            function(x)
                select(
                    x = x,
                    fields = c(
                        APIendpoint,
                        "samples.portions.analytes.aliquots.aliquot_id"
                    )
                ),
        function(x) select(x = x, fields = APIendpoint)
    )

    funcRes <- switch(from_type,
        file_id = files(),
        case_id = cases(),
        aliquot_ids = cases())
    info <- results_all(
        selector(
            GenomicDataCommons::filter(funcRes, as.formula(
                paste("~ ", from_type, "%in% id_vector")
            ))
        )
    )
    if (!length(info))
        stop(
            paste(strwrap(
                "No barcodes were found. Note that legacy files were removed
                as of GDC Data Portal version 1.30.4; see
                https://docs.gdc.cancer.gov/. Only case, file, and aliquot
                UUIDs are supported.",
                exdent = 2
            ), collapse = "\n"),
            call. = FALSE
        )

    rframe <-
        if (identical(from_type, "case_id"))
            .orderedDF(
                info[[from_type]], info[[targetElement]], orderBy = id_vector
            )
        else if (identical(from_type, "file_id"))
            .nestedlisttodf(info[[targetElement]], id_vector)
        else
            # aliquot results already carry their own column names
            return(.cleanExpand(info, id_vector))

    names(rframe) <- c(from_type, APIendpoint)
    rframe
}
217 | 
218 | #' @rdname ID-translation
219 | #'
220 | #' @param to_type character(1) The desired UUID type to obtain, can either be
221 | #'     `"case_id"` (default) or `"file_id"`
222 | #'
223 | #' @examples
224 | #' ## Translate file UUIDs >> case UUIDs
225 | #'
226 | #' uuids <- c("b4bce3ff-7fdc-4849-880b-56f2b348ceac",
227 | #' "5ca9fa79-53bc-4e91-82cd-5715038ee23e",
228 | #' "b7c3e5ad-4ffc-4fc4-acbf-1dfcbd2e5382")
229 | #'
230 | #' UUIDtoUUID(uuids)
231 | #'
232 | #' @export UUIDtoUUID
UUIDtoUUID <- function(
    id_vector, to_type = c("case_id", "file_id")
) {
    id_vector <- tolower(id_vector)
    to_type <- match.arg(to_type)
    # the input identifiers are of whichever type was not requested;
    # match.arg() guarantees 'to_type' is valid, so this is always one value
    from_type <- setdiff(c("case_id", "file_id"), to_type)

    endpoint <- switch(to_type,
        case_id = "cases.case_id",
        file_id = "files.file_id")
    # query the entity type of the input identifiers
    apifun <- switch(to_type,
        file_id = cases(),
        case_id = files())
    # namespace-qualified, as in the other translation functions, so that
    # 'filter' and 'select' from other attached packages cannot be picked up
    info <- results_all(
        GenomicDataCommons::select(
            GenomicDataCommons::filter(apifun, as.formula(
                paste("~ ", from_type, "%in% id_vector")
            )),
            endpoint
        )
    )
    # the result element is named after the first word of the endpoint
    targetElement <- gsub("(\\w+).*", "\\1", endpoint)
    id_list <- lapply(info[[targetElement]], function(x) {x[[1]]})

    rframe <- .buildIDframe(info, id_list)
    names(rframe) <- c(from_type, endpoint)
    rframe
}
262 | 
263 | #' @rdname ID-translation
264 | #'
265 | #' @param barcodes character() A vector of TCGA barcodes
266 | #'
267 | #' @examples
268 | #' ## Translate TCGA Barcode >> UUIDs
269 | #'
270 | #' fullBarcodes <- c("TCGA-B0-5117-11A-01D-1421-08",
271 | #' "TCGA-B0-5094-11A-01D-1421-08",
272 | #' "TCGA-E9-A295-10A-01D-A16D-09")
273 | #'
274 | #' sample_ids <- TCGAbarcode(fullBarcodes, sample = TRUE)
275 | #'
276 | #' barcodeToUUID(sample_ids)
277 | #'
278 | #' participant_ids <- c("TCGA-CK-4948", "TCGA-D1-A17N",
279 | #' "TCGA-4V-A9QX", "TCGA-4V-A9QM")
280 | #'
281 | #' barcodeToUUID(participant_ids)
282 | #'
283 | #' @export barcodeToUUID
barcodeToUUID <-
    function(barcodes)
{
    .checkBarcodes(barcodes)
    bend <- .findBarcodeLimit(barcodes)
    endtargets <- .barcode_cases(bend)
    expander <- gsub("cases\\.", "", .barcode_files(bend, FALSE))

    # queries at the "cases" level are used as-is; others are expanded
    pand <- switch(expander,
        cases = identity,
        function(x) expand(x = x, expand = expander)
    )
    query <- filter(cases(), as.formula(
        paste("~ ", endtargets, "%in% barcodes")
    ))
    info <- results_all(pand(x = query))

    if (identical(expander, "cases")) {
        rframe <- as.data.frame(
            info[c(endtargets, names(endtargets))], stringsAsFactors = FALSE
        )
    } else {
        idnames <- lapply(ids(info), function(ident) {
            info[["samples"]][[ident]]
        })
        if (!identical(expander, "samples")) {
            # gather the "analytes" entries of every portion in one list
            analytesOf <- function(rec)
                unlist(
                    lapply(rec[["portions"]], `[[`, "analytes"),
                    recursive = FALSE
                )
            # pick the extractor that matches the depth of the expansion
            exFUN <- switch(expander,
                samples.portions =
                    function(x, i) x[["portions"]],
                samples.portions.analytes =
                    function(x, i) analytesOf(x),
                samples.portions.analytes.aliquots =
                    function(x, i) unlist(
                        lapply(analytesOf(x), `[[`, "aliquots"),
                        recursive = FALSE
                    )
            )
            nested <- lapply(seq_along(idnames), function(i)
                exFUN(x = idnames[[i]], i = i)
            )
            idnames <- unlist(nested, recursive = FALSE)
            # keep only entries with at least two elements
            idnames <- Filter(function(g) length(g) >= 2L, idnames)
        }
        wanted <- c("submitter_id", gsub("s$", "", names(endtargets)))
        rescols <- lapply(idnames, function(rec) rec[wanted])
        rframe <- do.call(rbind, c(rescols, stringsAsFactors = FALSE))
        names(rframe) <- c(endtargets, names(endtargets))
    }
    # return rows in the order of the input barcodes, skipping unmatched ones
    hit <- na.omit(match(barcodes, rframe[[endtargets]]))
    rframe[hit, , drop = FALSE]
}
332 | 
# Indices into `major` that arrange its matched elements in the order in
# which they appear in `minor`.
.matchSort <- function(major, minor) {
    hits <- S4Vectors::findMatches(major, minor)
    # order() gives positions within the hits, not within 'major'; map them
    # back through queryHits so that unmatched or repeatedly matched
    # elements cannot shift the returned indices
    S4Vectors::queryHits(hits)[order(S4Vectors::subjectHits(hits))]
}
337 | 
# Flatten each element of `dlist` to a vector, optionally keep only the
# fields named in `cols` (any "cases." prefix in `cols` is removed first),
# and row-bind the vectors into a data.frame.
.unnest_df <- function(dlist, cols) {
    rows <- lapply(unname(dlist), unlist)
    if (!missing(cols)) {
        keep <- gsub("cases\\.", "", cols)
        rows <- lapply(rows, function(r) r[names(r) %in% keep])
    }
    as.data.frame(do.call(rbind, rows))
}
347 | 
348 | #' @rdname ID-translation
349 | #'
350 | #' @param filenames `character()` A vector of file names usually obtained
351 | #'     from a `GenomicDataCommons` query
352 | #'
353 | #' @param slides `logical(1L)` **DEPRECATED**: Whether the provided file names
354 | #'   correspond to slides typically with an `.svs` extension. **Note** The
355 | #'   barcodes returned correspond 1:1 with the `filename` inputs. Always triple
356 | #'   check the output against the Genomic Data Commons Data Portal by searching
357 | #'   the file name and comparing associated "Entity ID" with the `submitter_id`
358 | #'   given by the function.
359 | #'
360 | #' @details When providing slide file names, the function will only work if
361 | #'   **all** the provided files are slide files with an `.svs` extension.
362 | #'
363 | #' @examples
364 | #' library(GenomicDataCommons)
365 | #'
366 | #' ### Query CNV data and get file names
367 | #'
368 | #' cnv <- files() |>
369 | #'     filter(
370 | #'         ~ cases.project.project_id == "TCGA-COAD" &
371 | #'         data_category == "Copy Number Variation" &
372 | #'         data_type == "Copy Number Segment"
373 | #'     ) |>
374 | #'     results(size = 6)
375 | #'
376 | #' filenameToBarcode(cnv$file_name)
377 | #'
378 | #' ### Query slides data and get file names
379 | #'
380 | #' slides <- files() |>
381 | #'     filter(
382 | #'         ~ cases.project.project_id == "TCGA-BRCA" &
383 | #'         cases.samples.sample_type == "Primary Tumor" &
384 | #'         data_type == "Slide Image" &
385 | #'         experimental_strategy == "Diagnostic Slide"
386 | #'     ) |>
387 | #'     results(size = 3)
388 | #'
389 | #' filenameToBarcode(slides$file_name, slides = TRUE)
390 | #'
391 | #' @export filenameToBarcode
filenameToBarcode <- function(filenames, slides = FALSE) {
    endwithsvs <- endsWith(filenames, "svs")
    allsvs <- all(endwithsvs)
    # a mix of slide and non-slide file names cannot be served by one query
    if (!allsvs && any(endwithsvs))
        stop("Not all file names have an 'svs' extension.")
    if (!missing(slides)) {
        .Deprecated(
            msg = "The 'slides' argument is deprecated.", package = "TCGAutils"
        )
        if (allsvs && !slides)
            warning(
                "All files have an 'svs' extension. Setting 'slides' to TRUE."
            )
    }
    # Slide handling is derived from the file extension whether or not the
    # deprecated argument was supplied; previously a default call with
    # '.svs' files kept 'slides = FALSE' and used the aliquot query
    slides <- allsvs
    filesres <- files()
    endpoint <- "cases.samples.portions.analytes.aliquots.submitter_id"
    reselem <- "cases"
    if (slides) {
        cases_fields <- c(
            "cases.project.project_id",
            "cases.samples.tissue_type",
            "cases.samples.tumor_descriptor"
        )
        endpoint <- c(
            "cases.samples.portions.slides.submitter_id",
            "associated_entities.entity_id",
            "associated_entities.entity_submitter_id",
            "associated_entities.entity_type",
            "associated_entities.case_id",
            cases_fields
        )
        reselem <- "associated_entities"
    }

    info <- GenomicDataCommons::filter(filesres, ~ file_name %in% filenames) |>
        GenomicDataCommons::select(c("file_name", endpoint)) |>
        results_all()

    if (!length(info))
        stop("Query did not return any results. Check 'filenames' input.")

    # repeat each file's name and id once per row of its nested result
    reps <- unlist(lapply(info[[reselem]], nrow))
    res <- data.frame(
        file_name = rep(info[["file_name"]], reps),
        file_id = rep(info[["file_id"]], reps),
        row.names = NULL,
        stringsAsFactors = FALSE
    )
    res <- cbind(res, .unnest_df(info[[reselem]]))
    if (slides) {
        slidedf <- .unnest_df(info[["cases"]], cols = cases_fields)
        res <- cbind.data.frame(res, slidedf)
    }
    # return the rows in the order of the 'filenames' input
    idx <- .matchSort(res[["file_name"]], filenames)
    res[idx, ]
}
449 | 
.HISTORY_ENDPOINT <- "https://api.gdc.cancer.gov/history"

#' @rdname ID-translation
#'
#' @param id character(1) A UUID whose history of versions is sought
#'
#' @param endpoint character(1) Generally a constant pertaining to the location
#'     of the history api endpoint. This argument rarely needs to change.
#'
#' @return UUIDhistory: A `data.frame` containing a list of associated UUIDs
#'     for the given input along with `file_change` status, `data_release`
#'     versions, etc.
#'
#' @examples
#' ## Get the version history of a BAM file in TCGA-KIRC
#' UUIDhistory("0001801b-54b0-4551-8d7a-d66fb59429bf")
#'
#' @export
UUIDhistory <- function(id, endpoint = .HISTORY_ENDPOINT) {
    if (!requireNamespace("httr", quietly = TRUE))
        stop("Install 'httr' to check UUID status")
    # a single request is made, so exactly one identifier is accepted
    if (!is.character(id) || length(id) != 1L || is.na(id))
        stop("'id' must be a single UUID character string")
    qurl <- paste(endpoint, id, sep = "/")
    resp <- httr::GET(qurl)
    # fail on HTTP errors rather than binding an error payload into a table
    httr::stop_for_status(resp)
    do.call(rbind.data.frame, httr::content(resp))
}
475 | 


--------------------------------------------------------------------------------
/R/TCGAbarcode.R:
--------------------------------------------------------------------------------
 1 | .uniqueDelim <- function(ids) {
 2 |     nonnum <- gsub("[a-zA-Z0-9]", "", ids)
 3 |     dels <- unique(unlist(
 4 |         strsplit(nonnum, "")
 5 |     ))
 6 |     if (!length(dels))
 7 |         dels <- ""
 8 |     dels
 9 | }
10 | 
# Validate TCGA barcodes: all must start with 'TCGA' (case-insensitive) and
# share one delimiter. Differing lengths only produce a warning. With
# `check.sample = TRUE`, barcodes shorter than 15 characters are rejected.
.checkBarcodes <- function(barcodes, check.sample = FALSE) {
    hasPrefix <- startsWith(toupper(barcodes), "TCGA")
    if (!all(hasPrefix))
        stop("Barcodes must start with 'TCGA'")
    if (length(.uniqueDelim(barcodes)) != 1L)
        stop("Barcode delimiters not consistent")
    bcodelens <- unique(nchar(barcodes))
    if (length(bcodelens) > 1L)
        warning(
            "Inconsistent barcode lengths: ",
            paste(bcodelens, collapse = ", ")
        )
    if (check.sample) {
        if (any(bcodelens < 15L))
            stop(
                "'barcodes' should be at least 15 characters ",
                "with sample information"
            )
    }
}
27 | 
28 | #' Parse data from TCGA barcode
29 | #'
30 | #' This function returns the specified snippet of information obtained from
31 | #' the TCGA barcode.
32 | #'
33 | #' @param barcodes A character vector of TCGA barcodes
34 | #' @param participant Logical (default TRUE) participant identifier chunk
35 | #' @param sample Logical (default FALSE) includes the numeric sample code of
36 | #' the barcode and the vial letter
37 | #' @param portion Logical (default FALSE) includes the portion and analyte
38 | #' codes of the barcode
39 | #' @param plate Logical (default FALSE) returns the plate value
40 | #' @param center Logical (default FALSE) returns a matrix with the plate and
41 | #' center codes
42 | #' @param index An optional numeric vector indicating barcode positions when
43 | #'   split by the delimiter (i.e., hyphen '-'). For example, an index of
44 | #'   `c(1, 2)` corresponds to 'TCGA-ZZ' in `TCGA-ZZ-A1A1`.
45 | #'
46 | #' @return A character vector or data matrix of TCGA barcode information
47 | #'
48 | #' @author M. Ramos
49 | #'
50 | #' @examples
51 | #' barcodes <- c("TCGA-B0-5117-11A-01D-1421-08",
52 | #' "TCGA-B0-5094-11A-01D-1421-08",
53 | #' "TCGA-E9-A295-10A-01D-A16D-09")
54 | #'
55 | #' ## Patient identifiers
56 | #' TCGAbarcode(barcodes)
57 | #'
58 | #' ## Sample identifiers
59 | #' TCGAbarcode(barcodes, sample = TRUE)
60 | #'
61 | #' @export TCGAbarcode
TCGAbarcode <- function(barcodes, participant = TRUE, sample = FALSE,
    portion = FALSE, plate = FALSE, center = FALSE, index = NULL)
{
    .checkBarcodes(barcodes)
    filler <- .uniqueDelim(barcodes)
    stopifnot(is.null(index) || is.numeric(index))
    # without an explicit index, derive the positions from the logical flags;
    # the participant identifier spans the first three fields
    if (is.null(index))
        index <- which(
            c(rep(participant, 3), sample, portion, plate, center)
        )
    # Split on the literal delimiter: interpreted as a regular expression, a
    # delimiter such as '.' would match every character of the barcode
    barcodeMat <- do.call(rbind, strsplit(barcodes, filler, fixed = TRUE))
    apply(barcodeMat[, index, drop = FALSE], 1L, paste, collapse = filler)
}
75 | 
76 | 


--------------------------------------------------------------------------------
/R/TCGAbiospec.R:
--------------------------------------------------------------------------------
 1 | .strsep <- function(text, pos) {
 2 |     stopifnot(length(unique(nchar(text))) == 1L)
 3 |     lengthText <- unique(nchar(text))
 4 |     allIndx <- seq_len(lengthText)
 5 |     stopifnot(pos %in% allIndx)
 6 |     fgroup <- seq_len(pos)
 7 |     sgroup <- allIndx[!allIndx %in% fgroup]
 8 |     list(
 9 |         substr(text, min(fgroup), max(fgroup)),
10 |         substr(text, min(sgroup), max(sgroup))
11 |     )
12 | }
13 | 
14 | #' Extract biospecimen data from the TCGA barcode
15 | #'
16 | #' This function uses the full TCGA barcode to return a data frame of the
17 | #' data pertinent to laboratory variables such as vials, portions, analytes,
18 | #' plates and the center.
19 | #'
20 | #' @param barcodes A character vector of TCGA barcodes
#' @return A `data.frame` with sample type, sample code, portion, plate,
#' and center columns.
23 | #'
24 | #' @author M. Ramos
25 | #'
26 | #' @examples
27 | #' example("TCGAbarcode")
28 | #' TCGAbiospec(barcodes)
29 | #'
30 | #' @export TCGAbiospec
TCGAbiospec <- function(barcodes) {
    .checkBarcodes(barcodes)
    filler <- .uniqueDelim(barcodes)
    # split on the literal delimiter rather than treating it as a regex
    maxIndx <- unique(lengths(strsplit(barcodes, filler, fixed = TRUE)))
    # the comparisons below need a single field count for all barcodes
    if (length(maxIndx) != 1L)
        stop("'barcodes' must all have the same number of fields")
    if (maxIndx < 4L)
        stop("Provide a longer barcode")

    # load the sample type lookup table without touching the global workspace
    local_data_store <- new.env(parent = emptyenv())
    data("sampleTypes", envir = local_data_store, package = "TCGAutils")
    sampleTypes <- local_data_store[["sampleTypes"]]
    sampCode <- TCGAbarcode(barcodes, FALSE, TRUE)
    # first two characters are the sample code, the remainder is the vial
    sampVial <- .strsep(sampCode, 2L)
    names(sampVial) <- c("sample", "vial")
    sample_definition <- sampleTypes[["Definition"]][
        match(sampVial[["sample"]], sampleTypes[["Code"]])]
    biospec <-
        data.frame(
            submitter_id = TCGAbarcode(barcodes),
            sample_definition,
            as.data.frame(sampVial, stringsAsFactors = FALSE),
            stringsAsFactors = FALSE
        )
    # nothing beyond the sample field to parse
    if (identical(maxIndx, 4L))
        return(biospec)
    splitDex <- seq(5L, maxIndx)

    tailBarcode <- strsplit(
        TCGAbarcode(barcodes, index = splitDex), filler, fixed = TRUE
    )
    splitCol <- splitDex == 5L
    # field five is split after its second character (portion / analyte);
    # any later fields are kept whole
    tailBarcode <- lapply(tailBarcode, function(x)
        c(unlist(.strsep(x[[1L]], 2L)), x[!splitCol]))

    portPlateCent <- do.call(rbind.data.frame,
        args = c(tailBarcode, list(stringsAsFactors = FALSE)))
    names(portPlateCent) <-
        c("portion", "analyte", "plate", "center")[seq_along(c(splitDex, 1L))]

    cbind.data.frame(biospec, portPlateCent, stringsAsFactors = FALSE)
}
70 | 


--------------------------------------------------------------------------------
/R/TCGAprimaryTumors.R:
--------------------------------------------------------------------------------
 1 | #' Select primary tumors from TCGA datasets
 2 | #'
 3 | #' Tumor selection is decided using the `sampleTypes` data. For 'LAML' datasets,
 4 | #' the primary tumor code used is "03" otherwise, "01" is used.
 5 | #'
 6 | #' @param multiassayexperiment A
 7 | #'   [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
 8 | #'   with TCGA data as obtained from [curatedTCGAData::curatedTCGAData()]
 9 | #'
10 | #' @return A `MultiAssayExperiment` containing only primary tumor samples
11 | #'
12 | #' @examples
13 | #'
14 | #' example(getSubtypeMap)
15 | #'
16 | #' TCGAprimaryTumors(gbm)
17 | #'
18 | #' @export TCGAprimaryTumors
TCGAprimaryTumors <- function(multiassayexperiment) {
    if (!is(multiassayexperiment, "MultiAssayExperiment"))
        stop("Provide a 'MultiAssayExperiment' object as input")

    # the disease code is the part of each experiment name before the
    # first underscore
    dcodes <- vapply(
        strsplit(names(experiments(multiassayexperiment)), "_"),
        function(parts) parts[[1L]],
        character(1L)
    )

    # 'LAML' experiments use sample code "03"; every other code uses "01"
    primaries <- setNames(ifelse(dcodes == "LAML", "03", "01"), dcodes)

    # one logical selection per experiment, used to subset the columns
    logisub <- Map(
        function(bcodes, code) TCGAsampleSelect(bcodes, code),
        colnames(multiassayexperiment),
        primaries
    )

    multiassayexperiment[, logisub, ]
}
35 | 


--------------------------------------------------------------------------------
/R/TCGAsampleSelect.R:
--------------------------------------------------------------------------------
 1 | #' Select samples from barcodes from lookup table
 2 | #'
 3 | #' The TCGA barcode contains several pieces of information which can
 4 | #' be parsed by the [TCGAbarcode] function. To select a specific type of
 5 | #' sample, enter the appropriate sampleCode argument from the lookup table.
 6 | #' See lookup table in `data("sampleTypes")`. Barcode inputs can be a
 7 | #' character vector or a [CharacterList][IRanges::CharacterList-class] object.
 8 | #'
 9 | #' @param barcodes Either a TCGA barcode vector or
10 | #'   [CharacterList][IRanges::CharacterList-class] containing patient
11 | #'   identifiers, sample, portion, plate, and center codes.
12 | #'
#' @param sampleCodes Either a character or numeric vector of TCGA sample codes.
#'   See the `sampleTypes` dataset.
15 | #'
16 | #' @return A logical vector or [LogicalList][IRanges::LogicalList-class] of the
17 | #'   same length as 'barcodes' indicating sample type matches
18 | #'
19 | #' @examples
20 | #'
21 | #' example("TCGAbarcode")
22 | #' TCGAsampleSelect(barcodes, c(11, 01))
23 | #'
24 | #' @export TCGAsampleSelect
TCGAsampleSelect <- function(barcodes, sampleCodes) {
    stopifnot(
        is.character(sampleCodes) || is.numeric(sampleCodes),
        !is.na(sampleCodes), !is.logical(sampleCodes)
    )
    # list input is flattened for processing and restored before returning
    clist <- is(barcodes, "CharacterList")
    if (clist) {
        listShape <- barcodes
        barcodes <- unlist(barcodes, use.names = FALSE)
    }

    .checkBarcodes(barcodes, check.sample = TRUE)

    sampleCodes <- .addLeadingZero(sampleCodes)
    .checkSampleCodes(sampleCodes, strict = TRUE)

    # the sample code is the first two characters of the sample field
    sampleField <- TCGAbarcode(barcodes, sample = TRUE, participant = FALSE)
    codes <- substr(sampleField, 1L, 2L)
    barc <- setNames(codes %in% sampleCodes, codes)
    if (clist)
        barc <- BiocGenerics::relist(barc, listShape)
    barc
}
47 | 


--------------------------------------------------------------------------------
/R/TCGAutils-pkg.R:
--------------------------------------------------------------------------------
 1 | #' TCGAutils: Helper functions for working with TCGA and MultiAssayExperiment
 2 | #' data
 3 | #'
 4 | #' TCGAutils is a toolbox to work with TCGA specific datasets. It allows the
 5 | #' user to manipulate and translate TCGA barcodes, conveniently convert a list
 6 | #' of data files to [GRangesList][GenomicRanges::GRangesList-class]. Take
 7 | #' datasets from GISTIC and return a
 8 | #' [SummarizedExperiment][SummarizedExperiment::SummarizedExperiment-class]
 9 | #' class object. The package also provides functions for working with data from
10 | #' the `curatedTCGAData`
#' experiment data package. It provides convenience functions for extracting
#' subtype metadata and adding clinical data to existing
13 | #' [MultiAssayExperiment][MultiAssayExperiment::MultiAssayExperiment-class]
14 | #' objects.
15 | "_PACKAGE"
16 | 


--------------------------------------------------------------------------------
/R/builds.R:
--------------------------------------------------------------------------------
  1 | human_builds <- function() {
  2 |     S4Vectors::DataFrame(
  3 |         Date = c("July 2004", "May 2004", "March 2006", "February 2009",
  4 |             "December 2013"),
  5 |         NCBI_PRE = c("NCBI", "NCBI", "NCBI", "GRCh", "GRCh"),
  6 |         NCBI_NO = c("34", "35", "36", "37", "38"),
  7 |         NCBI =  c("NCBI34", "NCBI35", "NCBI36", "GRCh37", "GRCh38"),
  8 |         UCSC_PRE = c("hg", "hg", "hg", "hg", "hg"),
  9 |         UCSC_NO = c("16", "17", "18", "19", "38"),
 10 |         UCSC = c("hg16", "hg17", "hg18", "hg19", "hg38")
 11 |     )
 12 | }
 13 | 
 14 | #' @name builds
 15 | #'
 16 | #' @title Utilities for working with *HUMAN* genome builds
 17 | #'
 18 | #' @description A few functions are available to search for build versions,
 19 | #' either from NCBI or UCSC.
 20 | #'
 21 | #' \itemize{
 22 | #'   \item `translateBuild`: translates between UCSC and NCBI build
 23 | #'   versions
 24 | #'   \item `extractBuild`: use grep patterns to find the first build
 25 | #'   within the string input
#'   \item `uniformBuilds`: replace build occurrences below a threshold
#'   level of occurrence with the alternative build
 28 | #'   \item `correctBuild`: Ensure that the build annotation is correct
 29 | #'   based on the NCBI/UCSC website. If not, use `translateBuild` with
 30 | #'   the indicated 'style' input
 31 | #'   \item `isCorrect`: Check to see if the build is exactly as annotated
 32 | #' }
 33 | #'
 34 | #' @details The `correctBuild` function takes the input and ensures that
 35 | #' the style specified matches the input. Otherwise, it will
 36 | #' return the correct style for use with  `seqlevelsStyle`.
 37 | #' Currently, the function does not support patched builds
 38 | #' (e.g., 'GRCh38.p13') Build names are taken from the website:
 39 | #' \url{https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.26/}
 40 | #'
#' @param from character() A vector of build versions typically from `genome()`
#'     (e.g., "37"). The build vector must be homogeneous (i.e.,
#'     `length(unique(x)) == 1L`).
 44 | #'
 45 | #' @param to character(1) The name of the desired build version (either "UCSC"
 46 | #'     or "NCBI"; default: "UCSC")
 47 | #'
 48 | #' @param build character(1) A string providing the genome build
 49 | #'
 50 | #' @param style character(1) The annotation style, either 'UCSC' or 'NCBI'
 51 | #'
 52 | #' @examples
 53 | #'
 54 | #' translateBuild("GRCh35", "UCSC")
 55 | #'
 56 | #' @return
 57 | #'     translateBuild: A character vector of translated genome builds
 58 | #'
 59 | #'     extractBuild: A character string of the build information available
 60 | #'
 61 | #'     uniformBuilds: A character vector of builds where all builds are
 62 | #'         identical `identical(length(unique(build)), 1L)`
 63 | #'
 64 | #'     correctBuild: A character string of the 'corrected' build name
 65 | #'
 66 | #'     isCorrect: A logical indicating if the build is exactly as annotated
 67 | #'
 68 | #' @export
 69 | translateBuild <- function(from, to = c("UCSC", "NCBI")) {
 70 |     lfro <- length(from)
 71 |     from <- unique(from)
 72 |     if (!.isSingleValue(from))
 73 |         stop("Enter a consistent vector of genomic builds")
 74 | 
 75 |     to <- match.arg(to)
 76 |     buildDF <- human_builds()
 77 | 
 78 |     bnames <- c("UCSC", "NCBI")
 79 |     from_build <- bnames[bnames != to]
 80 | 
 81 |     bfrom <- correctBuild(from, from_build)
 82 | 
 83 |     buildIndex <- match(bfrom, buildDF[[from_build]])
 84 |     rep(buildDF[[to]][buildIndex], lfro)
 85 | }
 86 | 
 87 | #' @rdname builds
 88 | #'
 89 | #' @param build character(1) A string providing the genome build
 90 | #'
 91 | #' @param style character(1) The annotation style, either 'UCSC' or 'NCBI'
 92 | #'
 93 | #' @examples
 94 | #'
 95 | #' correctBuild("grch38", "NCBI")
 96 | #' correctBuild("hg19", "NCBI")
 97 | #'
 98 | #' @export
 99 | correctBuild <- function(build, style = c("UCSC", "NCBI")) {
100 |     build.df <- human_builds()
101 |     pre <- paste0(style, "_PRE")
102 |     digits <- as.character(gsub(".*([[:digit:]]{2})", "\\1", build))
103 |     pref <- gsub("(.*)([[:digit:]]{2})", "\\1", build)
104 |     if (identical(tolower(pref), "hg") && identical(style, "NCBI"))
105 |         return(translateBuild(build, style))
106 |     if (
107 |         tolower(pref) %in% tolower(build.df[["NCBI_PRE"]]) &&
108 |         identical(style, "UCSC")
109 |     )
110 |         return(translateBuild(build, style))
111 |     idx <- match(digits, build.df[[paste0(style, "_NO")]])
112 |     if (is.na(idx))
113 |         return(NA_character_)
114 |     num <- build.df[[paste0(style, "_NO")]][idx]
115 |     pref <- build.df[[pre]][idx]
116 |     paste0(pref, num)
117 | }
118 | 
119 | #' @rdname builds
120 | #'
121 | #' @examples
122 | #'
123 | #' isCorrect("GRCh38", "NCBI")
124 | #'
125 | #' isCorrect("hg19", "UCSC")
126 | #'
127 | #' @export
isCorrect <- function(build, style = c("UCSC", "NCBI")) {
    # settle on one style so the default argument yields a usable lookup
    style <- match.arg(style)
    # correct only when the canonical name equals the input exactly
    identical(
        correctBuild(build, style),
        build
    )
}
134 | 
135 | #' @rdname builds
136 | #'
137 | #' @param string A single character string
138 | #' @param build A vector of build version names (default UCSC, NCBI)
139 | #'
140 | #' @examples
141 | #'
142 | #' extractBuild(
143 | #' "SCENA_p_TCGAb29and30_SNP_N_GenomeWideSNP_6_G05_569110.nocnv_grch38.seg.txt"
144 | #' )
145 | #'
146 | #' @export
extractBuild <- function(string, build = c("UCSC", "NCBI")) {
    if (!S4Vectors::isSingleString(string))
        stop("Provide a single string for build search")
    # search the string once per style; the result is named by style
    builds <- vapply(
        build,
        function(i) {
            pattrn <- switch(i,
                UCSC = "[Hh][Gg][0-9]{2}",
                NCBI = "[Gg][Rr][Cc][Hh][0-9]{2}"
            )
            stringr::str_extract(string, pattrn)
        },
        character(1L)
    )
    # keep the styles that produced a match and report the first of them
    builds <- builds[!is.na(builds)]
    if (length(builds))
        builds[1L]
    else
        NA_character_
}
163 | 
## TRUE when 'charvec' contains exactly one distinct value
.isSingleValue <- function(charvec) {
    length(unique(charvec)) == 1L
}
167 | 
## TRUE when all entries end in the same two-digit build number; entries
## without two trailing digits are compared as they are
.consistentNumbers <- function(charvec) {
    buildNumbers <- gsub("(.*)([0-9]{2})", "\\2", charvec)
    .isSingleValue(buildNumbers)
}
172 | 
173 | 
## Overwrite every entry that differs from the most frequent value with
## that value; more than two distinct values is an internal error
.replaceHighProp <- function(charvec) {
    counts <- table(charvec)
    if (length(counts) > 2L)
        stop("<internal> Table has more than 2 values")

    ## name of the value with the largest share (first one on ties)
    dominant <- names(which.max(prop.table(counts)))
    charvec[charvec != dominant] <- dominant
    charvec
}
185 | 
186 | #' @rdname builds
187 | #'
188 | #' @param builds A character vector of builds
189 | #'
190 | #' @param cutoff numeric(1L) An inclusive threshold tolerance value for missing
191 | #'     values and translating builds that are below the threshold
192 | #'
193 | #' @param na character() The values to be considered as missing (default:
194 | #'     c("", "NA"))
195 | #'
196 | #' @examples
197 | #'
198 | #' buildvec <- rep(c("GRCh37", "hg19"), times = c(5, 1))
199 | #' uniformBuilds(buildvec)
200 | #'
201 | #' navec <- c(rep(c("GRCh37", "hg19"), times = c(5, 1)), "NA")
202 | #' uniformBuilds(navec)
203 | #'
204 | #' @export uniformBuilds
uniformBuilds <- function(builds, cutoff = 0.2, na = c("", "NA")) {
    ## All entries share the same trailing build number: either there is
    ## a single label already, or the minority label is overwritten.
    if (.consistentNumbers(builds)) {
        if (identical(length(table(builds)), 1L))
            return(builds)
        else
            builds <- .replaceHighProp(builds)
    }

    ## Upper-cased working copy. 'na' is upper-cased as well, otherwise a
    ## lower-case marker could never match the case-folded values.
    wbuilds <- toupper(builds)
    nabuilds <- wbuilds %in% toupper(na) | is.na(wbuilds)
    wbuilds[nabuilds] <- NA_character_

    ## share of each label, with the missing values counted as a category
    tt <- table(wbuilds, useNA = "always")
    proptt <- prop.table(tt)

    uvals <- names(proptt)
    nanames <- is.na(uvals)
    propna <- proptt[nanames]

    if (propna >= cutoff)
        stop("Frequency of NA values higher than the cutoff tolerance")

    ubuilds <- uvals[!nanames]

    if (.isSingleValue(ubuilds)) {
        ## Fill the gaps with the label as spelled in the data rather
        ## than the upper-cased working label (e.g. "GRCh37", not
        ## "GRCH37").
        builds[nabuilds] <- builds[match(ubuilds, wbuilds)]
        return(builds)
    } else if (sum(!nanames) > 2)
        stop("Only two build types at a time can be used")

    props <- proptt[!nanames]

    ## the label at or below the cutoff is converted to the other one
    offbuild <- names(props[props <= cutoff])
    mainbuild <- names(props[props > cutoff])
    ## recover the original spelling of the main label
    mainbuild <- builds[match(mainbuild, toupper(builds))]
    if (any(nabuilds))
        builds[nabuilds] <- mainbuild

    samebuilds <- .consistentNumbers(builds)
    if (samebuilds) {
        builds[wbuilds == offbuild] <- mainbuild
    } else {
        ## translate into the style that the off label does not match
        pattrn <- vapply(
            c(UCSC = "[Hh][Gg][0-9]{2}", NCBI = "[Gg][Rr][Cc][Hh][0-9]{2}"),
            grepl, logical(1L), offbuild)
        toconv <- names(pattrn)[!pattrn]
        results <- translateBuild(offbuild, toconv)
        builds[wbuilds == offbuild] <- results
    }
    builds
}
257 | 
258 | 


--------------------------------------------------------------------------------
/R/curatedTCGAData-helpers.R:
--------------------------------------------------------------------------------
  1 | #' @import methods
  2 | #' @importFrom xml2 read_html
  3 | #' @importFrom rvest html_nodes html_attr
  4 | #' @importFrom GenomicRanges GRanges GRangesList makeGRangesListFromDataFrame
  5 | #' granges
  6 | #' @importFrom GenomeInfoDb genome genome<-
  7 | #' @importFrom MultiAssayExperiment ExperimentList colData colData<- metadata
  8 | #' subsetByColumn experiments
  9 | #' @importFrom utils data head read.delim
 10 | #' @importFrom stats as.formula na.omit setNames
 11 | #' @importFrom stringr str_extract
 12 | #' @importFrom SummarizedExperiment SummarizedExperiment mcols mcols<- rowData
 13 | #'   rowData<-
 14 | #' @importFrom GenomicDataCommons files results_all select filter ids cases
 15 | #'   expand
 16 | #' @importFrom S4Vectors isSingleNumber isSingleInteger isSingleString
 17 | #' DataFrame
 18 | NULL
 19 | 
 20 | ## Helpers for downloaded objects
 21 | 
 22 | #' @name curatedTCGAData-helpers
 23 | #'
 24 | #' @title Helper functions for managing MultiAssayExperiment from
 25 | #' curatedTCGAData
 26 | #'
 27 | #' @aliases getSubtypeMap
 28 | #'
 29 | #' @description
 30 | #' Additional helper functions for cleaning and uncovering metadata
 31 | #' within a downloaded `MultiAssayExperiment` from `curatedTCGAData`.
 32 | #'
 33 | #' @details Note that for `getSubtypeMap`, the column of in-data variable names
 34 | #'   may need to go through `make.names` to be found in the `colData` of the
 35 | #'   `MultiAssayExperiment`.
 36 | #'
 37 | #' @section getSubtypeMap: provides a two column `data.frame` with
 38 | #'   interpreted names and in-data variable names. 'Name' usually refers to the
 39 | #'   `colData` row names a.k.a. the `patientID`.
 40 | #'
 41 | #' @section getClinicalNames: provides a vector of common variable names that
 42 | #'   exist in the `colData` `DataFrame` of a `curatedTCGAData`
 43 | #'   `MultiAssayExperiment` object. These variables are directly obtained
 44 | #'   from the BroadFirehose clinical data (downloaded with
 45 | #'   \link[RTCGAToolbox]{getFirehoseData}) and tend to be present across cancer
 46 | #'   disease codes.
 47 | #'
 48 | #' @param multiassayexperiment A
 49 | #'   [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
 50 | #'   object
 51 | #'
 52 | #' @examples
 53 | #'
 54 | #' library(curatedTCGAData)
 55 | #'
 56 | #' gbm <- curatedTCGAData("GBM", c("RPPA*", "CNA*"), version = "2.0.1", FALSE)
 57 | #'
 58 | #' getSubtypeMap(gbm)
 59 | #'
 60 | #' sampleTables(gbm)
 61 | #'
 62 | #' TCGAsplitAssays(gbm, c("01", "10"))
 63 | #'
 64 | #' @return \itemize{
 65 | #'     \item{getSubtypeMap}: A `data.frame` with explanatory names
 66 | #'     and their in-data variable names. They may not be present for all
 67 | #'     cancer types.
 68 | #'     \item{getClinicalNames}: A `vector` of common variable names that
 69 | #'     may be found across several cancer disease codes.
 70 | #' }
 71 | #'
 72 | #' @export
 73 | getSubtypeMap <- function(multiassayexperiment) {
 74 | 
 75 |     if (!is(multiassayexperiment, "MultiAssayExperiment"))
 76 |         stop("Provide a 'MultiAssayExperiment' object")
 77 | 
 78 |     frameMap <- metadata(colData(multiassayexperiment))[["subtypes"]]
 79 |     frameMap[] <- lapply(frameMap, as.character)
 80 | 
 81 |     if (is.null(frameMap))
 82 |         return(message("No subtype data available"))
 83 | 
 84 |     subColIdx <- grep("subtype", names(frameMap))
 85 | 
 86 |     pats <-
 87 |         frameMap[[subColIdx]] %in% c("patient", "SAMPLE", "Complete TCGA ID")
 88 | 
 89 |     frameMap[pats, subColIdx] <- "patientID"
 90 |     frameMap
 91 | }
 92 | 
 93 | #' @rdname curatedTCGAData-helpers
 94 | #'
 95 | #' @param diseaseCode A TCGA cancer code (e.g., "BRCA")
 96 | #' @examples
 97 | #' getClinicalNames("COAD")
 98 | #'
 99 | #' @export
getClinicalNames <- function(diseaseCode) {
    stopifnot(S4Vectors::isSingleString(diseaseCode))
    ## Load the dataset into a private environment. Naming the package
    ## makes the lookup independent of whether TCGAutils is attached.
    env <- new.env(parent = emptyenv())
    data("clinicalNames", envir = env, package = "TCGAutils")
    clinNames <- env[["clinicalNames"]]
    clinNames[[diseaseCode]]
}
107 | 
## The names of each table returned by sampleTables(), as a CharacterList
.samplesInData <- function(mae) {
    codeTables <- sampleTables(mae)
    IRanges::CharacterList(lapply(codeTables, names))
}
111 | 
## Stop when 'sampleCodes' are not found in the 'Code' column of the
## 'sampleTypes' dataset: any unknown code when 'strict', otherwise only
## when all codes are unknown. 'type' names the offending input.
.checkSampleCodes <-
    function(sampleCodes, type = "'sampleCodes'", strict = FALSE) {
    dataEnv <- new.env(parent = emptyenv())
    data("sampleTypes", envir = dataEnv, package = "TCGAutils")
    knownCodes <- dataEnv[["sampleTypes"]][["Code"]]
    unknown <- !(sampleCodes %in% knownCodes)
    failed <- if (strict) any(unknown) else all(unknown)
    if (failed)
        stop("Provide valid TCGA 'sampleCodes' in ", type)
}
121 | 
## Compare the requested 'sampleCodes' with the codes in each assay
## ('samplist'): error when no requested code is found in any assay, warn
## when some assay lacks some requested code.
.checkCodesAgainstData <- function(samplist, sampleCodes) {
    ## per assay, TRUE for each requested code that is absent from it
    invalidCodes <- IRanges::LogicalList(lapply(samplist,
        function(acode) !sampleCodes %in% acode))

    if (all(all(invalidCodes) & lengths(invalidCodes)))
        stop("'sampleCodes' not found in assay data, check 'sampleTables()'",
            "\n    and see the 'data(\"sampleTypes\")' table", call. = FALSE)

    if (any(any(invalidCodes)))
        warning("Some 'sampleCodes' not found in assays", call. = FALSE)
}
137 | 
## Convert to character and prefix entries shorter than two characters
## with "0"
.addLeadingZero <- function(vect) {
    codes <- as.character(vect)
    tooShort <- nchar(codes) < 2L
    if (any(tooShort))
        codes[tooShort] <- paste0("0", codes[tooShort])
    codes
}
145 | 
146 | #' @rdname curatedTCGAData-helpers
147 | #'
148 | #' @param sampleCodes character (default NULL) A string of sample type codes
149 | #' (refer to `data(sampleTypes)`; `TCGAsplitAssays` section)
150 | #' @param exclusive logical (default FALSE) Whether to return only assays that
151 | #' contain all codes in `sampleCodes`
152 | #'
153 | #' @section TCGAsplitAssays:
154 | #'     Separates samples by indicated sample codes into different assays
155 | #'     in a `MultiAssayExperiment`. Refer to the `sampleTypes`
156 | #'     data object for a list of available codes. This operation generates
157 | #'     \strong{n} times the number of assays based on the number of sample codes
158 | #'     entered. By default, all assays will be split by samples present in
159 | #'     the data.
160 | #'
161 | #' @importFrom BiocBaseUtils setSlots
162 | #'
163 | #' @export
TCGAsplitAssays <- function(multiassayexperiment, sampleCodes = NULL,
    exclusive = FALSE) {
    if (!is(multiassayexperiment, "MultiAssayExperiment"))
        stop("Provide a 'MultiAssayExperiment' object")

    ## sample codes found in each assay, validated against 'sampleTypes'
    codesByAssay <- .samplesInData(multiassayexperiment)
    .checkSampleCodes(
        unique(unlist(codesByAssay)), "colnames(MultiAssayExperiment)"
    )

    if (!is.null(sampleCodes)) {
        sampleCodes <- .addLeadingZero(sampleCodes)
        .checkSampleCodes(sampleCodes)
        .checkCodesAgainstData(codesByAssay, sampleCodes)
        if (exclusive) {
            ## keep only the assays that contain every requested code
            hasCode <- S4Vectors::`%in%`(
                IRanges::CharacterList(sampleCodes), codesByAssay
            )
            codesByAssay <- codesByAssay[all(hasCode)]
        }
        if (!length(codesByAssay))
            stop("Not all 'sampleCodes' were found in data")
        ## within each assay, keep only the requested codes
        requested <- S4Vectors::`%in%`(codesByAssay, sampleCodes)
        codesByAssay <- codesByAssay[requested]
    }

    ## assays left without any code are dropped
    keptCodes <- Filter(length, codesByAssay)
    allExps <- experiments(multiassayexperiment)
    keptExps <- allExps[names(allExps) %in% names(keptCodes)]

    ## one sub-experiment per code, named "<code>_<assay name>"
    splitByCode <- function(exprmt, sampcodes, ename) {
        labelled <- setNames(sampcodes, paste0(sampcodes, "_", ename))
        lapply(labelled, function(code) {
            exprmt[, TCGAsampleSelect(colnames(exprmt), code), drop = FALSE]
        })
    }
    nested <- Map(
        splitByCode,
        exprmt = keptExps, sampcodes = keptCodes, ename = names(keptCodes),
        USE.NAMES = FALSE
    )
    egroups <- unlist(nested, recursive = FALSE)

    sampmap <- generateMap(
        experiments = egroups,
        colData = colData(multiassayexperiment),
        idConverter = TCGAbarcode
    )

    setSlots(
        object = multiassayexperiment,
        ExperimentList = ExperimentList(egroups),
        sampleMap = sampmap
    )
}
213 | 
214 | #' @rdname curatedTCGAData-helpers
215 | #' @param vial (logical default FALSE) whether to display vials in the
216 | #' table output
217 | #'
218 | #' @section sampleTables:
219 | #'     Display all the available samples in each of the assays
220 | #' @export
sampleTables <- function(multiassayexperiment, vial = FALSE) {
    ## tabulate the sample portion of the barcodes of one assay
    tabulateCodes <- function(barcodes) {
        codes <- TCGAbarcode(barcodes, participant = FALSE, sample = TRUE)
        ## without vials only the first two characters are kept
        if (!vial)
            codes <- substr(codes, 1L, 2L)
        table(unname(codes))
    }
    lapply(colnames(multiassayexperiment), tabulateCodes)
}
229 | 


--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
 1 | #' TCGA Cancer Disease Codes Table
 2 | #'
 3 | #' A dataset for obtaining the cancer codes in TCGA for about 13 different
 4 | #' types of cancers.
 5 | #'
 6 | #' @format A data frame with 37 rows and 4 variables:
 7 | #'   * Study.Abbreviation: Disease Code used in TCGA
 8 | #'   * Available: Cancer datasets available via curatedTCGAData
 9 | #'   * SubtypeData: Subtype curation data available via curatedTCGAData
10 | #'   * Study.Name: The full length study name (i.e., type of cancer)
11 | #' @return The TCGA `diseaseCodes` table
12 | #'
13 | #' @usage data("diseaseCodes")
14 | #'
15 | #' @source <https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations>
16 | "diseaseCodes"
17 | 
18 | #' Barcode Sample Type Table
19 | #'
20 | #' A dataset that contains the mappings for sample codes in the TCGA
21 | #' barcodes.
22 | #' @format A data frame with 19 rows and 3 variables:
23 | #'   * Code: Two digit code number found in the barcode
24 | #'   * Definition: Long name for the sample type
25 | #'   * Short.Letter.Code: Letter code for the sample type
26 | #'
27 | #' @return The TCGA `sampleTypes` table
28 | #'
29 | #' @usage data("sampleTypes")
30 | #'
31 | #' @source <https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/sample-type-codes>
32 | "sampleTypes"
33 | 
34 | #' Clinical dataset names in TCGA
35 | #'
36 | #' A dataset of names for each of the TCGA cancer codes available.
37 | #' These names were obtained by the clinical datasets from
38 | #' [getFirehoseData][RTCGAToolbox::getFirehoseData]. They serve to subset the
39 | #' current datasets provided by `curatedTCGAData`.
40 | #'
41 | #' @format A [CharacterList][IRanges::CharacterList-class] of names for 33
42 | #'   cancer codes
43 | #'
44 | #' @return The clinical dataset column names in TCGA as provided by the
45 | #' `RTCGAToolbox`
46 | #'
47 | #' @usage data("clinicalNames")
48 | "clinicalNames"
49 | 


--------------------------------------------------------------------------------
/R/findGRangesCols.R:
--------------------------------------------------------------------------------
  1 | .find_with_xfix <- function(df_colnames, xfix1, xfix2,
  2 |         start.field, end.field, xfixType = "pre") {
  3 |     fixint <- intersect(xfix1, xfix2)
  4 |     fixint <- fixint[fixint != ""]
  5 |     if (length(fixint) > 1L) {
  6 |         kword <- "region"
  7 |         warning(" Multiple ", xfixType, "fixes found, using keyword '", kword,
  8 |                 "' or taking first one")
  9 |         ## keywords to keep, else take first one
 10 |         gfix <- grep(kword, fixint, value = TRUE)
 11 |         if (length(gfix) && isSingleString(gfix))
 12 |             fixint <- gfix
 13 |         fixint <- fixint[[1L]]
 14 |     }
 15 |     if (!isSingleString(fixint))
 16 |         stop("'start.field' and 'end.field' ", xfixType, "fixes do not match")
 17 |     names(fixint) <- xfixType
 18 | 
 19 |     fixFUN <- switch(xfixType, pre = I, suf = rev)
 20 |     start.field <- paste(fixFUN(c(fixint, start.field)), collapse = "")
 21 |     validEnd <- vapply(end.field, function(efield)
 22 |         paste(fixFUN(c(fixint, efield)), collapse = "") %in% df_colnames,
 23 |         logical(1L))
 24 |     stopifnot(sum(validEnd) == 1L)
 25 |     end.field <- paste(fixFUN(c(fixint, end.field[validEnd])), collapse = "")
 26 |     if (!length(start.field) && !length(end.field))
 27 |         list(c(start.field = "", end.field = ""), "")
 28 |     else
 29 |         list(c(start.field = start.field, end.field = end.field), fixint)
 30 | }
 31 | 
 32 | .tallySameLength <- function(fix1, fix2) {
 33 |     if (!length(fix1) && !length(fix2)) {
 34 |         0L
 35 |     } else {
 36 |         hasPos <- sum(vapply(c(fix1, fix2),
 37 |             function(x) grepl("pos", x, ignore.case = TRUE),
 38 |             logical(1L)
 39 |         ))
 40 |         sum(
 41 |             identical(fix1, fix2),
 42 |             identical(length(fix1), length(fix2)),
 43 |             hasPos
 44 |         )
 45 |     }
 46 | }
 47 | 
 48 | .strMatch <- function(strings, table) {
 49 |     unlist(lapply(strings, function(x)
 50 |         grep(x, table, ignore.case = TRUE)
 51 |     ))
 52 | }
 53 | 
 54 | ## Helper functions
 55 | .find_start_end_cols <- function (df_colnames, start.field, end.field) {
 56 |     idx1 <- which(df_colnames %in% start.field)
 57 |     idx2 <- which(df_colnames %in% end.field)
 58 |     if (length(idx1) == 1L && length(idx2) == 1L) {
 59 |         return(list(c(start = idx1, end = idx2), list(c(none = ""))))
 60 |     }
 61 |     idx1 <- .strMatch(start.field, df_colnames)
 62 |     idx2 <- .strMatch(end.field, df_colnames)
 63 |     if (length(idx1) == 1L && length(idx2) == 1L) {
 64 |         return(list(c(start = idx1, end = idx2), list(c(none = ""))))
 65 |     }
 66 |     prefixes1 <- .collect_prefixes(df_colnames, start.field)
 67 |     prefixes2 <- .collect_prefixes(df_colnames, end.field)
 68 |     suffixes1 <- .collect_suffixes(df_colnames, start.field)
 69 |     suffixes2 <- .collect_suffixes(df_colnames, end.field)
 70 |     tallypre <- .tallySameLength(prefixes1, prefixes2)
 71 |     tallysuff <- .tallySameLength(suffixes1, suffixes2)
 72 |     tally <- sort(c(prefixes = tallypre, suffixes = tallysuff))[2]
 73 |     reslist <- list(
 74 |         c(start = NA_integer_, end = NA_integer_), list(c(none = ""))
 75 |     )
 76 |     if (!tally) return(reslist)
 77 |     fix <- names(tally)
 78 |     startend.fields <- .find_with_xfix(
 79 |         df_colnames, get(paste0(fix, 1)), get(paste0(fix, 2)),
 80 |         start.field, end.field, substr(fix, 1, 3)
 81 |     )
 82 |     idx1 <- which(df_colnames %in% startend.fields[[1L]][["start.field"]])
 83 |     idx2 <- which(df_colnames %in% startend.fields[[1L]][["end.field"]])
 84 |     if (length(idx1) == 1L && length(idx2) == 1L) {
 85 |         reslist[[1L]] <- c(start = idx1, end = idx2)
 86 |         reslist[[2L]][[1L]] <- startend.fields[[2L]]
 87 |     }
 88 |     reslist
 89 | }
 90 | 
 91 | .collect_prefixes <- function (df_colnames, field) {
 92 |     df_colnames_nc <- nchar(df_colnames)
 93 |     prefixes <- lapply(field, function(suf) {
 94 |         pref_nc <- df_colnames_nc - nchar(suf)
 95 |         idx <- which(substr(df_colnames, pref_nc + 1L, df_colnames_nc) == suf)
 96 |         substr(df_colnames[idx], 1L, pref_nc[idx])
 97 |     })
 98 |     pref <- unique(unlist(prefixes))
 99 |     pref[pref != ""]
100 | }
101 | 
## Collect the distinct, non-empty suffixes that follow any of the
## candidate names in 'field' at the start of a column name.
.collect_suffixes <- function(df_colnames, field) {
    suffixes <- lapply(field, function(pre) {
        hits <- df_colnames[which(startsWith(df_colnames, pre))]
        ## Skip the width of this candidate only. Using the widths of all
        ## candidates shifts the offset when 'field' has several entries
        ## of different lengths, e.g. c("end", "stop").
        substr(hits, nchar(pre) + 1L, nchar(hits))
    })
    suff <- unique(unlist(suffixes))
    ## an empty suffix means the name is the bare field itself
    suff[suff != ""]
}
111 | 
.find_strands_col <- function(df_colnames, strand.field, xfix) {
    ## put the fix in front of (pre / none) or behind (suf) the field
    arrange <- switch(names(xfix[[1]]), pre = I, suf = rev, none = I)
    fixedName <- paste(arrange(c(xfix, strand.field)), collapse = "")
    hits <- which(df_colnames %in% fixedName)
    ## fall back on the bare field names
    if (!length(hits))
        hits <- which(df_colnames %in% strand.field)
    if (!length(hits))
        return(NA_integer_)
    if (length(hits) >= 2L) {
        warning("Multiple strand measurements detected, taking first one")
        hits <- hits[[1L]]
    }
    hits
}
126 | 
## Locate the seqnames (chromosome) column.
##   xfix: list holding one named string; the name ("pre", "suf" or
##         "none") decides where the fix goes relative to the field
## Returns the column index, the first index (with a warning) when several
## columns match, or NA_integer_ when none does.
.find_seqnames_col <- function (df_colnames, seqnames.field, xfix) {
    fixFUN <- switch(names(xfix[[1]]), pre = I, suf = rev, none = I)
    idx <- which(df_colnames %in%
        paste(fixFUN(c(xfix, seqnames.field)), collapse = ""))
    ## fall back on the bare field names
    if (length(idx) == 0L)
        idx <- which(df_colnames %in% seqnames.field)
    if (length(idx) == 0L)
        return(NA_integer_)
    ## braces keep the early return inside the ambiguous case, as in
    ## .find_width_col
    if (length(idx) >= 2L) {
        warning("cannnot determine seqnames column unambiguously")
        return(idx[[1L]])
    }
    idx
}
140 | 
.find_width_col <- function (df_colnames, width.field, xfix) {
    ## put the fix in front of (pre / none) or behind (suf) the field
    arrange <- switch(names(xfix[[1]]), pre = I, suf = rev, none = I)
    fixedName <- paste(arrange(c(xfix, width.field)), collapse = "")
    hits <- which(df_colnames %in% fixedName)
    ## fall back on the bare field names
    if (!length(hits))
        hits <- which(df_colnames %in% width.field)
    if (!length(hits))
        return(NA_integer_)
    if (length(hits) < 2L)
        return(hits)
    warning("cannnot determine width column unambiguously")
    hits[[1L]]
}
155 | 
156 | #' Obtain minimum necessary names for the creation of a GRangesList object
157 | #'
158 | #' This function attempts to match chromosome, start position, end position and
159 | #' strand names in the given character vector. Modified helper from the
160 | #' `GenomicRanges` package.
161 | #'
162 | #' @param df_colnames A `character` vector of names in a dataset
163 | #' @param seqnames.field A `character` vector of the chromosome name
164 | #' @param start.field A `character` vector that indicates the column name
165 | #' of the start positions of ranged data
166 | #' @param end.field A `character` vector that indicates the end position
167 | #' of ranged data
168 | #' @param strand.field A `character` vector of the column name that
169 | #' indicates the strand type
170 | #' @param ignore.strand logical (default FALSE) whether to ignore the strand
171 | #' field in the data
172 | #' @return Index positions vector indicating columns with appropriate names
173 | #'
174 | #' @examples
175 | #' myDataColNames <- c("Start_position", "End_position", "strand",
176 | #'                  "chromosome", "num_probes", "segment_mean")
177 | #' findGRangesCols(myDataColNames)
178 | #'
179 | #' @export findGRangesCols
findGRangesCols <- function (df_colnames,
    seqnames.field = c("seqnames", "seqname", "chromosome",
        "chrom", "chr", "chromosome_name", "seqid", "om"),
    start.field = "start",
    end.field = c("end", "stop"),
    strand.field = "strand",
    ignore.strand = FALSE) {

    ## the searches below run on lower-cased column names
    lowerNames <- tolower(df_colnames)
    normField <- GenomicRanges:::.normarg_field
    seqnamesArg <- normField(seqnames.field, "seqnames")
    startArg <- normField(start.field, "start")
    endArg <- normField(end.field, "end")

    ## start / end come first: the pre-/suffix found for them is reused
    ## when looking for the remaining columns
    startEnd <- .find_start_end_cols(lowerNames, startArg, endArg)
    sharedFix <- startEnd[[2L]]
    widthIdx <- .find_width_col(lowerNames, "width", sharedFix)
    seqnamesIdx <- .find_seqnames_col(lowerNames, seqnamesArg, sharedFix)
    strandIdx <-
        if (ignore.strand) {
            NA_integer_
        } else {
            .find_strands_col(
                lowerNames, normField(strand.field, "strand"), sharedFix
            )
        }
    c(seqnames = seqnamesIdx, startEnd[[1L]], width = widthIdx,
        strand = strandIdx)
}
207 | 


--------------------------------------------------------------------------------
/R/generateMap.R:
--------------------------------------------------------------------------------
  1 | #' Create a sampleMap from an experiment list and phenoData dataframe
  2 | #'
  3 | #' This function helps create a sampleMap in preparation of a
  4 | #' `MultiAssayExperiment` object. This is especially useful when the
  5 | #' sample identifiers are not very different, as in the case of TCGA barcodes.
  6 | #' An `idConverter` function can be provided to truncate such sample
  7 | #' identifiers and obtain patient identifiers.
  8 | #'
  9 | #' @param experiments A named `list` of experiments compatible with the
 10 | #' `MultiAssayExperiment` API
 11 | #' @param colData A `data.frame` of clinical data with patient identifiers
 12 | #' as rownames
 13 | #' @param idConverter A function to be used against the sample or specimen
 14 | #' identifiers to match those in the rownames of the `colData`
 15 | #' (default `identity`)
 16 | #' @param sampleCol A single string indicating the sample identifiers
 17 | #' column in the colData dataset
 18 | #' @param patientCol A single string indicating the patient identifiers
 19 | #' in colData, "row.names" extracts the colData row names
 20 | #' @param ... Additional arguments to pass to the 'idConverter' function.
 21 | #'
 22 | #' @return A `DataFrame` class object of mapped samples and patient
 23 | #' identifiers including assays
 24 | #'
 25 | #' @author M. Ramos, M. Morgan, L. Schiffer
 26 | #'
 27 | #' @examples
 28 | #' ## Minimal example
 29 | #' expList <- list(assay1 = matrix(1:6, ncol = 2L,
 30 | #'         dimnames = list(paste0("feature", 1:3), c("A-J", "B-J"))),
 31 | #'     assay2 = matrix(1:4, ncol = 2,
 32 | #'         dimnames = list(paste0("gene", 1:2), c("A-L", "B-L"))))
 33 | #'
 34 | #' ## Mock colData
 35 | #' myPheno <- data.frame(var1 = c("Yes", "No"), var2 = c("High", "Low"),
 36 | #'     row.names = c("a", "b"))
 37 | #'
 38 | #' ## A look at the identifiers
 39 | #' vapply(expList, colnames, character(2L))
 40 | #' rownames(myPheno)
 41 | #'
 42 | #' ## Use 'idConverter' to correspond sample names to patient identifiers
 43 | #' generateMap(expList, myPheno,
 44 | #'     idConverter = function(x) substr(tolower(x), 1L, 1L))
 45 | #'
 46 | #' @export generateMap
 47 | generateMap <- function(experiments, colData, idConverter = identity,
 48 |     sampleCol, patientCol, ...) {
 49 |     if (!is(experiments, "ExperimentList"))
 50 |         experiments <- ExperimentList(experiments)
 51 |     samps <- colnames(experiments)
 52 |     expnames <- names(samps)
 53 |     assay <- factor(rep(expnames, lengths(samps)), levels=expnames)
 54 |     colname <- unlist(samps, use.names=FALSE)
 55 |     if (!missing(sampleCol) && !missing(patientCol)) {
 56 |         if (!S4Vectors::isSingleString(sampleCol) ||
 57 |             !S4Vectors::isSingleString(patientCol))
 58 |             stop("Provide character names in colData for mapping")
 59 |         if (identical(patientCol, "row.names"))
 60 |             pts <- rownames(colData)
 61 |         else
 62 |             pts <- colData[[patientCol]]
 63 |         samples <- colData[[sampleCol]]
 64 |         autoMap <- cbind.data.frame(assay = NA_character_, primary = pts,
 65 |             colname = samples, stringsAsFactors = FALSE)
 66 |         autoMap <- Map(function(cnames, i) {
 67 |             submap <- autoMap[autoMap[["colname"]] %in% cnames, ]
 68 |             if (nrow(submap)) {
 69 |                 submap[["assay"]] <- i
 70 |             } else {
 71 |                 warning(
 72 |                     "'", i, "' assay dropped; 'colnames' not mappable",
 73 |                     call. = FALSE
 74 |                 )
 75 |             }
 76 |             submap
 77 |         }, cnames = samps, i = names(samps))
 78 |         autoMap <- do.call(function(...) {
 79 |             rbind(..., make.row.names = FALSE)
 80 |         }, autoMap)
 81 |         autoMap[["assay"]] <- factor(autoMap[["assay"]])
 82 |     } else {
 83 |         matches <- match(idConverter(colname, ...), rownames(colData))
 84 |         if (length(matches) && all(is.na(matches)))
 85 |             stop("no way to map colData to ExperimentList")
 86 |         primary <- rownames(colData)[matches]
 87 |         autoMap <- S4Vectors::DataFrame(assay=assay,
 88 |             primary=primary, colname=colname)
 89 |     }
 90 |     missingPrimary <- is.na(autoMap[["primary"]])
 91 |     if (nrow(autoMap) && any(missingPrimary)) {
 92 |         notFound <- autoMap[missingPrimary, ]
 93 |         warning("Data from rows:",
 94 |             sprintf("\n %s - %s", notFound[, "primary"],
 95 |                 notFound[, "colname"]),
 96 |                 "\ndropped due to missing phenotype data")
 97 |         autoMap <- autoMap[!is.na(autoMap[["primary"]]), ]
 98 |     }
 99 |     autoMap
100 | }
101 | 
102 | 


--------------------------------------------------------------------------------
/R/getFileName.R:
--------------------------------------------------------------------------------
 1 | .getLinks <- function(keyWord1, keyWord2, datasetLink = NULL, doc) {
 2 |     # Function from RTCGAToolbox
 3 |     keyWord <- keyWord1
 4 |     keyWord <- paste0("//a[contains(@href, '",keyWord,"')]")
 5 |     plinks <- rvest::html_nodes(doc, xpath = keyWord)
 6 |     plinks <- rvest::html_attr(plinks, "href")
 7 | 
 8 |     if (is.null(datasetLink))
 9 |         plinks[grepl(keyWord2,plinks)]
10 |     else
11 |         plinks[grepl(paste0("*.",datasetLink,keyWord2),plinks)]
12 | }
13 | 
14 | #' Find the file names used in RTCGAToolbox
15 | #'
16 | #' Part of this function is from the RTCGAToolbox. It aims to extract the file
17 | #' name used inside of the \link[RTCGAToolbox]{getFirehoseData} function.
18 | #' The arguments of the function parallel those in the
19 | #' \link[RTCGAToolbox]{getFirehoseData} function. It is only available for
20 | #' select data types.
21 | #'
22 | #' @param disease The TCGA cancer disease code, e.g., "COAD"
23 | #' @param runDate The single `string` used in the `getFirehoseData`
24 | #' function (default "20160128")
25 | #' @param dataType A single character vector (default "CNASNP") indicating the
26 | #' data type for which to get the source file name
27 | #'
28 | #' @return A single `character` file name
29 | #'
30 | #' @examples
31 | #'
32 | #' getFileName("COAD", dataType = "CNASNP")
33 | #'
34 | #' @export getFileName
getFileName <- function(disease, runDate = "20160128",
    dataType = c("CNASNP", "CNVSNP", "CNAseq", "CNACGH", "Mutation")) {

    dataType <- match.arg(dataType,
        c("CNASNP", "CNVSNP", "CNAseq", "CNACGH", "Mutation"))

    ## listing page of the run:
    ## <base>stddata__YYYY_MM_DD/data/<disease>/<runDate>/
    fh_url <- "https://gdac.broadinstitute.org/runs/stddata__"
    fh_url <- paste0(fh_url, substr(runDate,1,4), "_",
        substr(runDate,5,6), "_", substr(runDate,7,8), "/data/")
    fh_url <- paste0(fh_url, disease, "/", runDate, "/")
    doc <- xml2::read_html(fh_url)

    ## Each arm gives the href keyword and the file name pattern for one
    ## data type. Arm labels have to equal the 'dataType' choices exactly,
    ## since switch() matches them case-sensitively and returns NULL
    ## invisibly when nothing matches.
    switch(dataType,
        CNASNP = .getLinks(
            "Level_3__segmented_scna_hg19__seg.Level_3",
            paste0("[.]Merge_snp__.*.__Level_3__segmented",
                "_scna_hg19__seg.Level_3.*.tar[.]gz$"),
            disease, doc),
        CNVSNP = .getLinks(
            "Level_3__segmented_scna_minus_germline_cnv_hg19__seg.Level_3",
            paste0("[.]Merge_snp__.*.__Level_3__segmented_scna_",
                "minus_germline_cnv_hg19__seg.Level_3.*.tar[.]gz$"),
            disease, doc),
        CNAseq = .getLinks("__Level_3__segmentation__seg.Level_3",
            paste0("[.]Merge_cna__.*.dnaseq.*.__Level_3__",
                "segmentation__seg.Level_3.*.tar[.]gz$"),
            disease, doc),
        CNACGH = .getLinks("__Level_3__segmentation__seg.Level_3",
            paste0("[.]Merge_cna__.*.cgh.*.__Level_3__",
                "segmentation__seg.Level_3.*.tar[.]gz$"),
            disease, doc),
        Mutation = .getLinks("Mutation_Packager_Calls",
            "[.]Mutation_Packager_Calls[.]Level_3[.].*.tar[.]gz$",
            disease, doc)
    )
}
71 | 


--------------------------------------------------------------------------------
/R/imputeAssay.R:
--------------------------------------------------------------------------------
 1 | #' @name imputeAssay
 2 | #'
#' @title Impute missing assay values inside a
#' `MultiAssayExperiment`
#'
#' @description This function allows the user to enter a
#' `MultiAssayExperiment` and impute all the NA values inside its assays.
 8 | #'
 9 | #' @param multiassayexperiment A `MultiAssayExperiment` with genes in the
10 | #' rows, samples in the columns
11 | #' @param i A numeric, logical, or character `vector` indicating the
12 | #' assays to perform imputation on (default 1L)
13 | #' @inheritDotParams impute::impute.knn
14 | #'
15 | #' @return A `MultiAssayExperiment` with imputed assays values
16 | #'
17 | #' @examples
18 | #'
19 | #' example(getSubtypeMap)
20 | #'
21 | #' ## convert data to matrix and add as experiment
22 | #' gbm <-
23 | #'   c(gbm, RPPA_matrix = data.matrix(assay(gbm[["GBM_RPPAArray-20160128"]])))
24 | #'
25 | #' imputeAssay(gbm, i = "RPPA_matrix")
26 | #'
27 | #' @export
imputeAssay <- function(multiassayexperiment, i = 1, ...) {
    if (!requireNamespace("impute", quietly = TRUE))
        stop("Install the 'impute' package to run 'imputeAssay'")

    if (!is(multiassayexperiment, "MultiAssayExperiment"))
        stop("Input has to be a MultiAssayExperiment")
    if (!any(is.character(i), is.numeric(i), is.logical(i)))
        stop("'i' has to be character or numeric or logical")

    ## Restrict to the requested assays; all of them must be matrices
    selected <- assays(multiassayexperiment[, , i])
    if (!all(vapply(selected, is.matrix, logical(1L))))
        stop("Only matrix assay(s) can be imputed")

    ## Extra arguments in '...' are forwarded to impute::impute.knn
    imputed <- lapply(selected, function(mat) {
        impute::impute.knn(mat, ...)$data
    })

    ## Write the results back by assay name rather than by the original 'i'.
    ## 'imputed' only holds the selected assays, so a numeric 'i' such as 2
    ## (or a logical 'i') does not index into it correctly.
    for (assayname in names(imputed)) {
        multiassayexperiment[[assayname]] <- imputed[[assayname]]
    }

    return(multiassayexperiment)
}
52 | 


--------------------------------------------------------------------------------
/R/makeGRangesListFromCopyNumber.R:
--------------------------------------------------------------------------------
 1 | #' Make a GRangesList from TCGA Copy Number data
 2 | #'
 3 | #' `makeGRangesListFromCopyNumber` allows the user to convert objects of
 4 | #' class `data.frame` or [S4Vectors::DataFrame] to a
 5 | #' [GRangesList][GenomicRanges::GRangesList-class]. It includes additional
 6 | #' features specific to TCGA data such as, hugo symbols, probe numbers, segment
 7 | #' means, and ucsc build (if available).
 8 | #'
 9 | #' @param df A `data.frame` or `DataFrame` class object. `list`
10 | #' class objects are coerced to `data.frame` or `DataFrame`.
11 | #' @param split.field A `character` vector of length one indicating
12 | #' the column to be used as sample identifiers
13 | #' @param names.field A `character` vector of length one indicating the
14 | #' column to be used as names for each of the ranges in the data
15 | #' @param ... Additional arguments to pass on to
16 | #' [GenomicRanges::makeGRangesListFromDataFrame]
17 | #'
18 | #' @return A [GRangesList][GenomicRanges::GRangesList-class] class object
19 | #'
20 | #' @examples
21 | #' library(GenomicDataCommons)
22 | #'
23 | #' manif <- files() |>
24 | #'     filter(~ cases.project.project_id == "TCGA-COAD" &
25 | #'         data_type == "Copy Number Segment") |>
26 | #'     manifest(size = 1)
27 | #'
28 | #' fname <- gdcdata(manif$id)
29 | #'
30 | #' barcode <- UUIDtoBarcode(names(fname), from_type = "file_id")
31 | #' barcode <- barcode[["associated_entities.entity_submitter_id"]]
32 | #'
33 | #' cndata <- read.delim(fname[[1L]], nrows = 10L)
34 | #'
35 | #' cngrl <- makeGRangesListFromCopyNumber(cndata, split.field = "GDC_Aliquot",
36 | #'     keep.extra.columns = TRUE)
37 | #'
38 | #' names(cngrl) <- barcode
39 | #' GenomeInfoDb::genome(cngrl) <- extractBuild(fname[[1L]])
40 | #' cngrl
41 | #'
42 | #' @export makeGRangesListFromCopyNumber
makeGRangesListFromCopyNumber <-
    function(df, split.field, names.field = "Hugo_Symbol", ...) {
        ## A plain list (not a data.frame) is row-bound into one table first
        if (is.list(df) && !inherits(df, "data.frame"))
            df <- do.call(rbind, df)

        if (!S4Vectors::isSingleString(names.field))
            stop("'names.field' must be a single string")
        if (!S4Vectors::isSingleString(split.field))
            stop("'split.field' must be a single string")

        ## All column look-ups below are case-insensitive
        lownames <- tolower(names(df))
        twoMeta <- all(c("num_probes", "segment_mean") %in% lownames)
        rnames <- lownames %in% tolower(names.field)
        ncbi <- lownames %in% "ncbi_build"

        ## Use 'names.field' for range names only when it identifies exactly
        ## one column; otherwise build the ranges without names
        if (sum(rnames) == 1L) {
            setrname <- names(df)[rnames]
            grl <- makeGRangesListFromDataFrame(df = df,
                split.field = split.field, names.field = setrname, ...)
        } else {
            grl <- makeGRangesListFromDataFrame(df = df, split.field =
                split.field, ...)
        }

        if (twoMeta) {
            ## NOTE(review): this records the *names* of the probe-count and
            ## segment-mean columns (as found in 'df') in the list-level
            ## metadata, not their values -- confirm this is intended.
            numProb <- names(df)[match("num_probes", lownames)]
            segMean <- names(df)[match("segment_mean", lownames)]
            mcols(grl) <- cbind(mcols(grl), DataFrame(num_probes = numProb,
                segment_mean = segMean))
        }
        ## Set the genome only when a single, consistent build is present
        if (sum(ncbi) == 1L) {
            ncbi_build <- names(df)[ncbi]
            build_name <- unique(df[[ncbi_build]])
            if (length(build_name) != 1L) {
                warning("inconsistent ncbi_build values in data")
            } else {
                ucscBuild <- translateBuild(build_name, "UCSC")
                GenomeInfoDb::genome(grl) <- ucscBuild
            }
        }
        grl
    }
84 | 


--------------------------------------------------------------------------------
/R/makeGRangesListFromExonFiles.R:
--------------------------------------------------------------------------------
 1 | #' Read exon-level expression files and create a `GRangesList`
 2 | #'
 3 | #' This function serves to read exon-level expression data. It works for exon
 4 | #' quantification (raw counts and RPKM) and junction quantification
 5 | #' (raw counts) file paths and represents such data as a
 6 | #' [GRangesList][GenomicRanges::GRangesList-class]. The data files can be
 7 | #' downloaded via the Genomic Data Commons (GDC) Legacy Archive.
 8 | #'
 9 | #' @details The `rangesColumn` name in the GDC data files is usually "exon"
10 | #'   but can be changed with the `rangesColumn` argument, if different.
11 | #'   To avoid programmatically obtaining TCGA barcodes from the GDC
12 | #'   API, set the `getBarcodes` to `FALSE`. When `getBarcodes` is set to
13 | #'   `FALSE`, the file names are used to name the elements of the `GRangesList`
14 | #'   output.
15 | #'
16 | #' @param filepaths character() vector of file paths containing TCGA exon
17 | #'     data usually obtained from the GDC
18 | #'
19 | #' @param sampleNames character() vector of TCGA barcodes to be used as
20 | #'     names for the `GRangesList` output (default NULL)
21 | #'
22 | #' @param fileNames character() vector of file names as downloaded from
23 | #'     the Genomic Data Commons Legacy archive (default `basename(filepaths)`)
24 | #'
25 | #' @param getBarcodes logical(1). Whether to query the GDC API with the
26 | #'     `filenameToBarcode` and obtain the TCGA barcodes from the file names
27 | #'     (default TRUE); see details.
28 | #'
29 | #' @param rangesColumn character(1). The name of the column in the data
30 | #'     containing the ranges information (default "exon"); see details.
31 | #'
32 | #' @param nrows numeric(1). The number of rows to return from each of the files
33 | #'     read in (all rows by default; default Inf)
34 | #'
35 | #' @md
36 | #'
37 | #' @return A [GRangesList][GenomicRanges::GRangesList-class] object
38 | #'
39 | #' @author M. Ramos
40 | #'
41 | #' @examples
42 | #'
43 | #' ## Load example file found in package
44 | #' pkgDir <- system.file("extdata", package = "TCGAutils", mustWork = TRUE)
45 | #' exonFile <- list.files(pkgDir, pattern = "cation\\.txt$", full.names = TRUE)
46 | #'
47 | #' filePrefix <- "unc.edu.32741f9a-9fec-441f-96b4-e504e62c5362.1755371."
48 | #'
49 | #' ## Add actual file name manually (due to Windows OS restriction)
50 | #' makeGRangesListFromExonFiles(exonFile,
51 | #'     fileNames = paste0(filePrefix, basename(exonFile)),
52 | #'     sampleNames = "TCGA-AA-3678-01A-01R-0905-07")
53 | #'
54 | #' @export makeGRangesListFromExonFiles
makeGRangesListFromExonFiles <- function(filepaths, sampleNames = NULL,
    fileNames = basename(filepaths), getBarcodes = TRUE, rangesColumn = "exon",
    nrows = Inf)
{
    ## Without explicit 'sampleNames', either look the barcodes up from the
    ## file names or fall back to the file names themselves
    if (is.null(sampleNames) && getBarcodes) {
        sampleNames <-
            filenameToBarcode(filenames = fileNames)[[
                "cases.samples.portions.analytes.aliquots.submitter_id"
            ]]
    } else if (is.null(sampleNames)) {
        sampleNames <- fileNames
    }

    ## The comparison is against 'filepaths', so the message names it
    if (!identical(length(filepaths), length(sampleNames)))
        stop("'sampleNames' length is inconsistent with 'filepaths'")

    ## Decide once which reader to use instead of once per file
    hasReadr <- requireNamespace("readr", quietly = TRUE)
    btData <- lapply(filepaths, function(file) {
        if (hasReadr) {
            readr::local_edition(1)
            readr::read_delim(file, delim = "\t", n_max = nrows)
        } else
            read.delim(file, sep = "\t",
                nrows = if (is.infinite(nrows)) -1 else nrows)
    })

    names(btData) <- sampleNames

    allrowdata <-
        if (requireNamespace("dplyr", quietly = TRUE))
            dplyr::bind_rows(btData)
        else
            do.call(rbind, btData)

    newGRanges <- GenomicRanges::GRanges(allrowdata[[rangesColumn]])
    ## drop = FALSE keeps a table even when only one non-range column is
    ## left (a base data.frame would otherwise collapse to a vector)
    mcols(newGRanges) <-
        allrowdata[, names(allrowdata) != rangesColumn, drop = FALSE]

    ## One group label per row, repeated by the row count of each file
    splitIndx <- rep(names(btData), vapply(btData, nrow, integer(1L)))
    S4Vectors::splitAsList(newGRanges, splitIndx)
}
94 | 


--------------------------------------------------------------------------------
/R/oncoPrintTCGA.R:
--------------------------------------------------------------------------------
  1 | #' OncoPrint for TCGA Mutation Assays
  2 | #'
  3 | #' @param multiassayexperiment A `MultiAssayExperiment`, usually from
  4 | #'    `curatedTCGAData`
  5 | #'
  6 | #' @param matchassay character(1) The name of the assay containing mutation
  7 | #'     data, this can be a pattern (e.g., "*_Mutation-*", the default)
  8 | #'
  9 | #' @param variantCol character(1) The name of the metadata column containing
 10 | #'     the mutation categories, usually "Variant_Classification" in TCGA
 11 | #'
 12 | #' @param brewerPal character(1) The name of the `RColorBrewer::brewer.pal`
 13 | #'     palette, (default: "Set3")
 14 | #'
 15 | #' @param ntop integer(1) The number of the top N genes for displaying based
 16 | #'     on per-sample mutation frequency
 17 | #'
 18 | #' @param incl.thresh double(1) The inclusion threshold for empirical mutations,
 19 | #'     mutations less frequent than this value will not be included
 20 | #'
#' @param rowcol character(1) The name of the column in the metadata used to
#'     name (annotate) the rows (default: "Hugo_Symbol")
 23 | #'
 24 | #' @importFrom BiocBaseUtils isScalarCharacter isScalarNumber checkInstalled
 25 | #'
 26 | #' @return An oncoPrint plot of mutations
 27 | #'
 28 | #' @examples
 29 | #'
 30 | #' library(curatedTCGAData)
 31 | #'
 32 | #' acc <- curatedTCGAData("ACC", "Mutation", version = "1.1.38", FALSE)
 33 | #'
 34 | #' oncoPrintTCGA(acc)
 35 | #'
 36 | #' @export
 37 | oncoPrintTCGA <-
 38 |     function(multiassayexperiment, matchassay = "*_Mutation-*",
 39 |         variantCol = "Variant_Classification", brewerPal = "Set3", ntop = 25,
 40 |         incl.thresh = 0.01, rowcol = "Hugo_Symbol")
 41 | {
 42 |     stopifnot(
 43 |         isScalarCharacter(matchassay), isScalarCharacter(variantCol),
 44 |         isScalarCharacter(brewerPal), isScalarNumber(ntop),
 45 |         is(multiassayexperiment, "MultiAssayExperiment"),
 46 |         isScalarNumber(incl.thresh), isScalarCharacter(rowcol)
 47 |     )
 48 | 
 49 |     checkInstalled(c("org.Hs.eg.db", "ComplexHeatmap", "RColorBrewer"))
 50 | 
 51 |     mutname <- grep(utils::glob2rx(matchassay),
 52 |         names(multiassayexperiment), value = TRUE)
 53 | 
 54 |     if (length(mutname) > 1)
 55 |         stop("Only one mutation assay supported at this time")
 56 | 
 57 |     ragex <- multiassayexperiment[[mutname]]
 58 |     stopifnot(is(ragex, "RaggedExperiment"))
 59 | 
 60 |     rownames(ragex) <- mcols(ragex)[[rowcol]]
 61 |     somaticnonsilent <- mcols(ragex)[["Mutation_Status"]] == "Somatic" &
 62 |         mcols(ragex)[[variantCol]] != "Silent"
 63 |     ragex <- ragex[somaticnonsilent, ]
 64 | 
 65 |     Variants <- mcols(ragex)[[variantCol]]
 66 |     Variants <- gsub("_", " ", Variants)
 67 |     mcols(ragex)[[variantCol]] <- Variants
 68 | 
 69 |     types <- table(Variants)
 70 |     tottypes <- sum(types)
 71 |     incl <- (types/tottypes) > incl.thresh
 72 |     types <- types[incl]
 73 |     validvariants <- setNames(names(types), names(types))
 74 | 
 75 |     ragex <- ragex[mcols(ragex)[[variantCol]] %in% validvariants, ]
 76 |     rr <- BiocGenerics::unstrand(RaggedExperiment::rowRanges(ragex))
 77 |     ragex <- RaggedExperiment::`rowRanges<-`(ragex, value = rr)
 78 | 
 79 |     gen <- GenomeInfoDb::genome(ragex)
 80 |     genomeannot <- unique(gen)
 81 |     genomelen <- length(gen)
 82 | 
 83 |     if (length(genomeannot) > 1)
 84 |         stop("'genome' annotation is not consistent")
 85 | 
 86 |     if (!grepl("^[Hh][Gg]", genomeannot)) {
 87 |         cbuild <- correctBuild(genomeannot, "NCBI")
 88 |         ragex <- GenomeInfoDb::`genome<-`(ragex, cbuild)
 89 |         ragex <- GenomeInfoDb::`seqlevelsStyle<-`(ragex, "UCSC")
 90 |         genomeannot <- translateBuild(genomeannot)
 91 |     }
 92 | 
 93 |     checkInstalled(paste0("TxDb.Hsapiens.UCSC.", genomeannot, ".knownGene"))
 94 | 
 95 |     gn <- sort(.getGN(genomeannot))
 96 |     gn <- BiocGenerics::unstrand(gn)
 97 |     gn <- gn[!is.na(names(gn))]
 98 |     sqls <- seqlevelsStyle(ragex)
 99 |     seqlevelsStyle(gn) <- sqls
100 | 
101 |     simplify_fun <- function(scores, ranges, qranges)
102 |         { any(scores != "Silent") }
103 | 
104 |     res <- RaggedExperiment::qreduceAssay(
105 |         ragex, gn, simplify_fun, "Variant_Classification", background = FALSE
106 |     )
107 |     rownames(res) <- names(gn)
108 | 
109 |     topgenes <- head(sort(rowSums(res), decreasing = TRUE), ntop)
110 |     gn2 <- gn[match(names(topgenes), names(gn))]
111 | 
112 |     qualcolors <-
113 |         RColorBrewer::brewer.pal(n = length(validvariants), brewerPal)
114 |     colors <- setNames(qualcolors, validvariants)
115 | 
116 |     colfuns <- lapply(colors, function(couleur) {
117 |         args <- alist(x =, y =, w =, h =)
118 |         args <- as.pairlist(args)
119 |         body <- substitute({
120 |             grid::grid.rect(x, y, w, h, gp = grid::gpar(fill = z, col = NA))
121 |         }, list(z = couleur))
122 |         eval(call("function", args, body))
123 |     })
124 | 
125 |     background <- function(x, y, w, h)
126 |         grid::grid.rect(x, y, w, h,
127 |             gp = grid::gpar(fill = "#FFFFFF", col = "#FFFFFF"))
128 |     mutfuns <- c(background = background, colfuns)
129 | 
130 |     simplify_funs <- lapply(validvariants,
131 |         function(variant) {
132 |             args <- alist(scores =, ranges =, qranges =)
133 |             args <- as.pairlist(args)
134 |             body <- substitute({
135 |                 as.numeric(any(S4Vectors::`%in%`(scores, z)))
136 |             }, list(z = variant))
137 |             eval(call("function", args, body))
138 |         }
139 |     )
140 | 
141 |     list_mats <- lapply(simplify_funs, function(variant_fun) {
142 |         res <- RaggedExperiment::qreduceAssay(ragex, gn2, variant_fun,
143 |             "Variant_Classification", background = 0)
144 |         rownames(res) <- names(gn2)
145 |         res
146 |     })
147 | 
148 |     return(
149 |         ComplexHeatmap::oncoPrint(
150 |             list_mats, alter_fun = mutfuns, col = colors, show_pct = FALSE
151 |         )
152 |     )
153 | }
154 | 
155 | 


--------------------------------------------------------------------------------
/R/simplifyColData.R:
--------------------------------------------------------------------------------
 1 | #' Take a MultiAssayExperiment and include curated variables
 2 | #'
 3 | #' This function works on the `colData` of a
 4 | #' [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
 5 | #' object to merge curated variable columns or other clinical variables that
 6 | #' would like to be added. It is recommended that the user run the scripts in
 7 | #' the `MultiAssayExperiment.TCGA` repository that build the "enhanced" type of
 8 | #' data but not necessary if using different clinical data. Please see the
 9 | #' repository's README for more information.
10 | #'
11 | #' @param MultiAssayExperiment A
12 | #'   [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
13 | #'   object
14 | #' @param colData A `DataFrame` or `data.frame` to merge with
15 | #' clinical data in the `MultiAssayExperiment` object
16 | #'
17 | #' @return A
18 | #'   [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
19 | #'   object
20 | #'
21 | #' @examples
22 | #'
23 | #' library(MultiAssayExperiment)
24 | #'
25 | #' mergeColData(MultiAssayExperiment(), S4Vectors::DataFrame())
26 | #'
27 | #' @export mergeColData
mergeColData <- function(MultiAssayExperiment, colData) {
    if (!is(MultiAssayExperiment, "MultiAssayExperiment"))
        stop("Provide a valid MultiAssayExperiment object")
    ## Accept either flavour of rectangular table
    isTabular <- is(colData, "DataFrame") || is.data.frame(colData)
    if (!isTabular)
        stop("'colData' must be 'DataFrame' or 'data.frame'")
    if (is.null(rownames(colData)) && length(colData))
        stop("'colData' data must have rownames")

    ## Full outer join on row names plus any columns the two tables share
    current <- colData(MultiAssayExperiment)
    sharedcols <- intersect(names(current), names(colData))
    combined <- merge(
        current, colData,
        by = c("row.names", sharedcols),
        all = TRUE, sort = FALSE, stringsAsFactors = FALSE
    )

    ## merge() returns the keys in a 'Row.names' column; restore them as
    ## row names and remove the helper column
    rownames(combined) <- combined[["Row.names"]]
    keepcols <- names(combined) != "Row.names"
    combined <- combined[, keepcols, drop = FALSE]

    colData(MultiAssayExperiment) <- as(combined, "DataFrame")
    MultiAssayExperiment
}
46 | 
47 | #' Minimize the number of variables in colData
48 | #'
#' This function removes variables that have a high proportion of missing
#' data or whose names contain any of the given keywords.
51 | #'
52 | #' @param multiassayexperiment A
53 | #'   [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
54 | #'   object with `colData`
55 | #'
56 | #' @param maxNAfrac (numeric default 0.2) A decimal between 0 and 1 to indicate
57 | #'   the amount of NA values allowed per column
58 | #'
59 | #' @param keystring (character) A vector of keywords to match and remove
60 | #'   variables
61 | #'
62 | #' @return A
63 | #'   [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
64 | #'   object
65 | #'
66 | #' @examples
67 | #'
68 | #' example(getSubtypeMap)
69 | #'
70 | #' (gbm_trimmed <- trimColData(gbm))
71 | #'
72 | #' head(colData(gbm_trimmed))[1:5]
73 | #'
74 | #' @export trimColData
trimColData <- function(multiassayexperiment, maxNAfrac = 0.2,
    keystring = c("portion", "analyte")) {
    if (!is(multiassayexperiment, "MultiAssayExperiment"))
        stop("Provide a 'MultiAssayExperiment' input")
    DF <- colData(multiassayexperiment)
    keystring <- na.omit(keystring)

    ## Columns whose fraction of NA values is at or above 'maxNAfrac'
    NAabove <- vapply(DF, function(x) mean(is.na(x)) >= maxNAfrac, logical(1L))

    ## Columns whose name matches any keyword. Folding the per-keyword
    ## matches with `|` keeps the result a vector of length(DF) for any
    ## number of columns or keywords (apply() on the vapply() result failed
    ## for a single-column 'colData').
    keymatch <- Reduce(
        `|`,
        lapply(keystring, grepl, x = names(DF)),
        logical(length(DF))
    )

    todrop <- NAabove | keymatch
    ## drop = FALSE: keep a DataFrame even if only one column survives
    colData(multiassayexperiment) <- DF[, !todrop, drop = FALSE]

    multiassayexperiment
}
93 | 


--------------------------------------------------------------------------------
/R/simplifyTCGA.R:
--------------------------------------------------------------------------------
  1 | #' @importFrom GenomicFeatures genes microRNAs
  2 | #' @importFrom GenomeInfoDb keepStandardChromosomes seqlevelsStyle
  3 | #' seqlevelsStyle<-
  4 | NULL
  5 | 
  6 | .checkHas <-
  7 |     function(x, pattern = c("^hsa", "^cg", "symbols"), threshold = 0.9) {
  8 |     if (identical(pattern, "symbols"))
  9 |         pattern <- "^[A-Z0-9]{1,6}|^C[0-9]orf[0-9]{1,4}"
 10 |     mean(c(FALSE, grepl(pattern, rownames(x))), na.rm = TRUE) > 0.9
 11 | }
 12 | 
 13 | .isSummarizedExperiment <- function(x) {
 14 |     is(x, "SummarizedExperiment") & !is(x, "RangedSummarizedExperiment")
 15 | }
 16 | 
 17 | .convertTo <- function(x, which, FUN, keep, unmap) {
 18 |     for (i in which(which)) {
 19 |         lookup <- FUN(rownames(x[[i]]))
 20 |         ranges <- lookup[["mapped"]]
 21 |         rse <- x[[i]][names(ranges), ]
 22 |         # rowData not merged with mcols of RHS in `rowRanges<-` method
 23 |         mcols(ranges) <-
 24 |             S4Vectors::DataFrame(rowData(rse), S4Vectors::mcols(ranges))
 25 |         SummarizedExperiment::rowRanges(rse) <- ranges
 26 |         x <- c(x, setNames(S4Vectors::List(rse),
 27 |             paste0(names(x)[i], "_ranged")))
 28 |         if (length(lookup[["unmapped"]]) && unmap) {
 29 |             se <- x[[i]][lookup[["unmapped"]], ]
 30 |             x <- c(x, setNames(S4Vectors::List(se),
 31 |                 paste0(names(x)[i], "_unranged")))
 32 |         }
 33 |     }
 34 |     if (!keep & any(which))
 35 |         x <- x[, , -match(names(which(which)), names(x))]
 36 |     x
 37 | }
 38 | 
 39 | #' @name hidden-helpers
 40 | #' @title A small document for helper functions
 41 | #' @param x A character vector
 42 | #' @param gn A GRanges object with some of its names found in x
 43 | #' @return A list of length 2: unmapped (character vector) and mapped (GRanges)
 44 | #' @keywords internal
 45 | .makeListRanges <- function(x, gn) {
 46 |     res <- list(unmapped = x[!x %in% names(gn)])
 47 |     x <- x[x %in% names(gn)]
 48 |     gn <- gn[match(x, names(gn))]
 49 |     res[["mapped"]] <- gn
 50 |     return(res)
 51 | }
 52 | 
 53 | #' @importFrom BiocBaseUtils isScalarCharacter
 54 | .getGN <- function(gen) {
 55 |     stopifnot(isScalarCharacter(gen))
 56 | 
 57 |     txdb <- if (identical(gen, "hg18"))
 58 |         TxDb.Hsapiens.UCSC.hg18.knownGene::TxDb.Hsapiens.UCSC.hg18.knownGene
 59 |     else if (identical(gen, "hg19"))
 60 |         TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene
 61 | 
 62 |     gn <- keepStandardChromosomes(
 63 |         GenomicFeatures::genes(txdb), pruning.mode = "coarse"
 64 |     )
 65 |     seqlevelsStyle(gn) <- "NCBI"
 66 | 
 67 |     names(gn) <- AnnotationDbi::mapIds(
 68 |         org.Hs.eg.db::org.Hs.eg.db,
 69 |         names(gn),
 70 |         keytype = "ENTREZID",
 71 |         column = "SYMBOL"
 72 |     )
 73 | 
 74 |     gn
 75 | }
 76 | 
 77 | #' @rdname hidden-helpers
 78 | #' @return list of length 2: "unmapped" is a character vector providing
 79 | #' unmapped symbols, "mapped" is a GRanges object with ranges of mapped symbols
 80 | #' @keywords internal
 81 | .getRangesOfSYMBOLS <- function(x) {
 82 |     gn <- .getGN("hg19")
 83 |     .makeListRanges(x, gn)
 84 | }
 85 | 
 86 | .getRangesOfCpG <- function(x) {
 87 |     local_data_store <- new.env(parent = emptyenv())
 88 |     data(
 89 |         "Locations",
 90 |         envir = local_data_store,
 91 |         package = "IlluminaHumanMethylation450kanno.ilmn12.hg19"
 92 |     )
 93 |     Locations <- local_data_store[["Locations"]]
 94 | 
 95 |     clist <- list(seqnames = "chr", pos = "pos", strand = "strand")
 96 |     gps <- do.call(
 97 |         GenomicRanges::GPos,
 98 |         lapply(clist, function(x) Locations[, x])
 99 |     )
100 |     names(gps) <- rownames(Locations)
101 |     seqlevelsStyle(gps) <- "NCBI"
102 | 
103 |     .makeListRanges(x, gps)
104 | }
105 | 
106 | #' @rdname simplifyTCGA
107 | #'
#' @title Functions to convert row annotations to ranges and RaggedExperiment
#' to RangedSummarizedExperiment
110 | #'
111 | #' @description This group of functions will convert row annotations as
112 | #' either gene symbols or miRNA symbols to row ranges based on database
113 | #' resources 'TxDB' and 'org.Hs' packages. It will also simplify the
114 | #' representation of
115 | #' [RaggedExperiment][RaggedExperiment::RaggedExperiment-class] objects to
116 | #' [RangedSummarizedExperiment][SummarizedExperiment::RangedSummarizedExperiment-class].
117 | #'
118 | #' @details The original `SummarizedExperiment` containing either gene symbol
119 | #'   or miR annotations is replaced or supplemented by a
120 | #'   [RangedSummarizedExperiment][SummarizedExperiment::RangedSummarizedExperiment-class]
121 | #'   for those that could be mapped to
122 | #'   [GRanges][GenomicRanges::GRanges-class], and optionally another
123 | #'   [SummarizedExperiment][SummarizedExperiment::SummarizedExperiment-class]
124 | #'   for annotations that could not be mapped to
125 | #'   [GRanges][GenomicRanges::GRanges-class].
126 | #'
127 | #' @section qreduceTCGA:
128 | #'
129 | #' Using `TxDb.Hsapiens.UCSC.hg19.knownGene` as the reference, `qreduceTCGA`
130 | #' reduces the data by applying either the `weightedmean` or `nonsilent`
131 | #' function (see below) to non-mutation or mutation data, respectively.
132 | #' Internally, it uses [RaggedExperiment::qreduceAssay()] to reduce the ranges
133 | #' to the gene-level.
134 | #'
135 | #' `qreduceTCGA` will update `genome(x)` based on the NCBI reference annotation
136 | #' which includes the patch number, e.g., GRCh37.p14, as provided by the
137 | #' `seqlevelsStyle` setter, `seqlevelsStyle(gn) <- "NCBI"`. `qreduceTCGA`
138 | #' uses the NCBI genome annotation as the default reference.
139 | #'
140 | #'     nonsilent <- function(scores, ranges, qranges)
141 | #'         any(scores != "Silent")
142 | #'
143 | #' `RaggedExperiment` mutation objects become a genes by patients
144 | #' `RangedSummarizedExperiment` object containing '1' if there is a non-silent
145 | #' mutation somewhere in the gene, and '0' otherwise as obtained from the
146 | #' `Variant_Classification` column in the data.
147 | #'
148 | #'     weightedmean <- function(scores, ranges, qranges) {
149 | #'         isects <- GenomicRanges::pintersect(ranges, qranges)
150 | #'         sum(scores * BiocGenerics::width(isects)) /
151 | #'             sum(BiocGenerics::width(isects))
152 | #'     }
153 | #'
154 | #' "CNA" and "CNV" segmented copy number are reduced using a weighted mean in
155 | #' the rare cases of overlapping (non-disjoint) copy number regions.
156 | #'
157 | #' These functions rely on `TxDb.Hsapiens.UCSC.hg19.knownGene` and
158 | #' `org.Hs.eg.db` to map to the 'hg19' NCBI build. Use the `liftOver` procedure
159 | #' for datasets that are provided against a different reference genome (usually
160 | #' 'hg18'). See an example in the vignette.
161 | #'
162 | #' @param obj A `MultiAssayExperiment` object obtained from `curatedTCGAData`
163 | #'
164 | #' @param keep.assay logical (default FALSE) Whether to keep the
165 | #'   `SummarizedExperiment` assays that have been converted to
166 | #'   `RangedSummarizedExperiment`
167 | #'
168 | #' @param unmapped logical (default TRUE) Include an assay of data that was
169 | #'   not able to be mapped in reference database
170 | #'
171 | #' @param suffix character (default "_simplified") A character string to append
172 | #'   to the newly modified assay for `qreduceTCGA`.
173 | #'
174 | #' @return A
175 | #'   [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
176 | #'   with any gene expression, miRNA, copy number, and mutations converted to
177 | #'   [`RangedSummarizedExperiment`][SummarizedExperiment::RangedSummarizedExperiment-class]
178 | #'   objects
179 | #'
180 | #' @author L. Waldron
181 | #'
182 | #' @md
183 | #'
184 | #' @examples
185 | #'
186 | #' library(curatedTCGAData)
187 | #' library(GenomeInfoDb)
188 | #'
189 | #' accmae <-
190 | #'     curatedTCGAData(diseaseCode = "ACC",
191 | #'     assays = c("CNASNP", "Mutation", "miRNASeqGene", "GISTICT"),
192 | #'     version = "1.1.38",
193 | #'     dry.run = FALSE)
194 | #'
195 | #' ## update genome annotation
196 | #' rex <- accmae[["ACC_Mutation-20160128"]]
197 | #'
198 | #' ## Translate build to "hg19"
199 | #' tgenome <- vapply(genome(rex), translateBuild, character(1L))
200 | #' genome(rex) <- tgenome
201 | #'
202 | #' accmae[["ACC_Mutation-20160128"]] <- rex
203 | #'
204 | #' simplifyTCGA(accmae)
205 | #'
206 | #' @export
simplifyTCGA <- function(obj, keep.assay = FALSE, unmapped = TRUE) {
    ## first reduce with qreduceTCGA, then convert symbol rows to ranges
    reduced <- qreduceTCGA(obj, keep.assay = keep.assay)
    symbolsToRanges(reduced, keep.assay = keep.assay, unmapped = unmapped)
}
211 | 
212 | #' @name simplifyTCGA
213 | #' @aliases symbolsToRanges
214 | #' @importFrom BiocBaseUtils checkInstalled
215 | #' @export
216 | symbolsToRanges <- function(obj, keep.assay = FALSE, unmapped = TRUE) {
217 |     can.fix <- vapply(
218 |         experiments(obj),
219 |         function(y) {
220 |             .checkHas(y, "symbols") & .isSummarizedExperiment(y)
221 |         },
222 |         logical(1L)
223 |     )
224 | 
225 |     checkInstalled(c("TxDb.Hsapiens.UCSC.hg19.knownGene", "org.Hs.eg.db"))
226 |     .convertTo(
227 |         x = obj,
228 |         which = can.fix,
229 |         FUN = .getRangesOfSYMBOLS,
230 |         keep = keep.assay,
231 |         unmap = unmapped
232 |     )
233 | }
234 | 
235 | #' @name simplifyTCGA-defunct
236 | #'
237 | #' @title Defunct TCGAutils functions
238 | #'
239 | #' @inheritParams simplifyTCGA
240 | #'
241 | #' @description `mirToRanges` is defunct and will be removed in the next
242 | #' release. The `mirbase.db` package is currently deprecated in `RELEASE_3_21`.
243 | #'
244 | #' @aliases mirToRanges
245 | #'
246 | #' @importFrom BiocBaseUtils lifeCycle
247 | #'
248 | #' @export
249 | mirToRanges <- function(obj, keep.assay = FALSE, unmapped = TRUE) {
250 |     lifeCycle(cycle = "defunct", title = "simplifyTCGA")
251 | }
252 | 
253 | #' @name simplifyTCGA
254 | #' @aliases CpGtoRanges
255 | #' @export
CpGtoRanges <- function(obj, keep.assay = FALSE, unmapped = TRUE) {
    ## An experiment is selected for conversion when .checkHas() matches
    ## the "^cg" pattern and .isSummarizedExperiment() holds for it
    hasCpGs <- function(experiment) {
        .checkHas(experiment, "^cg") & .isSummarizedExperiment(experiment)
    }
    convertible <- vapply(
        experiments(obj), hasCpGs, FUN.VALUE = logical(1L)
    )

    ## Annotation package handed to checkInstalled() (presumably the one
    ## .getRangesOfCpG relies on -- confirm in that helper)
    methylationPkg <- "IlluminaHumanMethylation450kanno.ilmn12.hg19"
    checkInstalled(methylationPkg)

    .convertTo(
        x = obj, which = convertible, FUN = .getRangesOfCpG,
        keep = keep.assay, unmap = unmapped
    )
}
275 | 
276 | #' @name simplifyTCGA
277 | #' @aliases qreduceTCGA
278 | #' @export
qreduceTCGA <- function(obj, keep.assay = FALSE, suffix = "_simplified") {
    ## Reduce the ragged assays of `obj` to gene-level SummarizedExperiment
    ## objects and append each one to `obj` under the name
    ## paste0(<source experiment name>, suffix).
    ##
    ## * Experiments whose name contains "Mutation" are reduced with
    ##   `nonsilent` over "Variant_Classification".
    ## * Every other RaggedExperiment is reduced with `weightedmean` over
    ##   "Segment_Mean".
    ##
    ## obj:        queried with experiments(), `[[` and three-index `[`;
    ##             presumably a MultiAssayExperiment -- confirm with callers.
    ## keep.assay: when FALSE the RaggedExperiment inputs are removed from
    ##             the returned object.
    ## suffix:     appended to the source name of every new experiment.
    ##
    ## Returns `obj` with the reduced experiments added.
    checkInstalled(c("TxDb.Hsapiens.UCSC.hg19.knownGene", "org.Hs.eg.db"))

    ## Gene ranges restricted to the standard chromosomes
    gn <- genes(
        TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene
    )
    gn <- keepStandardChromosomes(
        GenomicRanges::granges(gn),
        pruning.mode = "coarse"
    )
    seqlevelsStyle(gn) <- "NCBI"
    ## The current names are used as ENTREZID keys and replaced by SYMBOLs;
    ## ranges that end up with an NA name are filtered out further below
    names(gn) <- AnnotationDbi::mapIds(
        org.Hs.eg.db::org.Hs.eg.db,
        names(gn),
        keytype = "ENTREZID",
        column = "SYMBOL"
    )

    ## Mean of `scores` weighted by the width of each range's intersection
    ## with the query range
    weightedmean <- function(scores, ranges, qranges) {
        isects <- GenomicRanges::pintersect(ranges, qranges)
        sum(scores * BiocGenerics::width(isects)) /
            sum(BiocGenerics::width(isects))
    }

    ## TRUE when at least one score differs from "Silent"
    nonsilent <- function(scores, ranges, qranges)
        any(scores != "Silent")

    isRE <-
        function(x) vapply(experiments(x), is, logical(1L), "RaggedExperiment")

    isMut <- function(x) grepl("Mutation", names(x))

    for (i in which(isMut(obj))) {
        sqls <- seqlevelsStyle(obj[[i]])
        seqlevelsStyle(gn) <- sqls
        ## remove patch release info
        ## NOTE(review): this modifies `gn` itself, so the stripped genome
        ## is also what later iterations and the second loop see -- confirm
        ## that this is intended.
        gname <- genome(gn)
        genome(gn) <- gsub("\\.p[0-9]{1,2}$", "", genome(gn))
        mutations <- RaggedExperiment::qreduceAssay(
            obj[[i]],
            gn,
            nonsilent,
            "Variant_Classification"
        )
        rownames(mutations) <- names(gn)
        ## NA entries are recoded as 0
        mutations[is.na(mutations)] <- 0
        remove.rows <- is.na(rownames(mutations))
        mut_ranges <- gn[!remove.rows]
        ## replace patch release info
        genome(mut_ranges) <- gname
        ## drop = FALSE keeps the matrix shape when a single sample (column)
        ## or a single gene (row) remains
        mutations <- SummarizedExperiment(
            mutations[!remove.rows, , drop = FALSE], rowRanges = mut_ranges
        )
        el <- ExperimentList(x = mutations)
        names(el) <- paste0(names(obj)[i], suffix)
        obj <- c(obj, el)
    }
    for (i in which(isRE(obj) & !isMut(obj))) {
        sqls <- seqlevelsStyle(obj[[i]])
        seqlevelsStyle(gn) <- sqls
        suppressWarnings(
            cn <- RaggedExperiment::qreduceAssay(
                obj[[i]],
                gn,
                weightedmean,
                "Segment_Mean"
            )
        )
        rownames(cn) <- names(gn)
        remove.rows <- is.na(rownames(cn))
        ## drop = FALSE: see the mutation loop above
        cn <- SummarizedExperiment(
            cn[!remove.rows, , drop = FALSE], rowRanges = gn[!remove.rows]
        )
        el <- ExperimentList(x = cn)
        names(el) <- paste0(names(obj)[i], suffix)
        obj <- c(obj, el)
    }
    if (!keep.assay) {
        ## keep only the experiments that are not RaggedExperiment
        obj <- obj[, , !isRE(obj)]
    }
    return(obj)
}
360 | 


--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
 1 | ## Helper for finding barcode column
 2 | ## **Takes the first result!**
 3 | .findBarcodeCol <- function(DF) {
 4 |     cnames <- names(DF)
 5 |     containsBC <- vapply(head(DF), function(column) {
 6 |         all(startsWith(column, "TCGA"))
 7 |     }, logical(1L))
 8 |     names(containsBC) <- cnames
 9 |     bcIdx <- which(containsBC)
10 |     stopifnot(S4Vectors::isSingleInteger(which(containsBC)))
11 |     names(containsBC)[bcIdx]
12 | }
13 | 
## Standardize barcode format
## When every non-missing barcode contains a ".", all dots are replaced by
## "-"; the barcodes are returned in upper case either way.
.standardBarcodes <- function(sampleBarcode) {
    if (!length(sampleBarcode)) {
        stop("<internal> Barcode must be of positive length")
    }
    ## Inspect all non-missing barcodes instead of a random sample so that
    ## the result is deterministic and the caller's RNG state is untouched
    observed <- sampleBarcode[!is.na(sampleBarcode)]
    if (all(grepl(".", observed, fixed = TRUE)))
        sampleBarcode <- gsub(".", "-", sampleBarcode, fixed = TRUE)
    toupper(sampleBarcode)
}
25 | 
## Flag the columns that consist of missing values only
## Returns one logical per column of `dataset`
.findNAColumns <- function(dataset) {
    allMissing <- function(values) all(is.na(values))
    apply(dataset, MARGIN = 2L, FUN = allMissing)
}
32 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # TCGAutils <a href='https://waldronlab.io/'><img src='https://raw.githubusercontent.com/Bioconductor/BiocStickers/master/TCGAutils/TCGAutils.png' align="right" height="139" /></a>
 2 | 
 3 | The `TCGAutils` package provides a suite of helper functions that aid in
 4 | the management and cleaning of data from The Cancer Genome Atlas (TCGA).
 5 | 
 6 | Many of the functions contained herein work on raw and derived data objects
 7 | from The Cancer Genome Atlas (TCGA), the `RTCGAToolbox` package, and the
 8 | `curatedTCGAData` experiment data package.
 9 | 
10 | Please make sure to download the latest version of `RTCGAToolbox`
11 | from Bioconductor.
12 | 
13 | ## Installation
14 | 
15 | ```
16 | if (!require("BiocManager"))
17 |     install.packages("BiocManager")
18 | 
19 | library(BiocManager)
20 | 
21 | install("TCGAutils")
22 | ```
23 | 
24 | ## Cheatsheet
25 | 
26 | <a href="https://github.com/waldronlab/cheatsheets/blob/main/TCGAutils_cheatsheet.pdf"><img src="https://raw.githubusercontent.com/waldronlab/cheatsheets/main/pngs/TCGAutils_cheatsheet.png" width="989" height="1091"/></a>
27 | 
28 | Please report bugs, along with a minimal reproducible example, at our [github issue page][].
29 | 
30 | [github issue page]: https://github.com/waldronlab/TCGAutils/issues
31 | 
32 | 


--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
# pkgdown site configuration
title: TCGAutils
url: https://waldronlab.github.io/TCGAutils

template:
  bootstrap: 5
  # With Bootstrap 5 the Bootswatch theme is a direct `template` field;
  # `params: bootswatch` is the older location.
  bootswatch: flatly
8 | 


--------------------------------------------------------------------------------
/data/clinicalNames.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/waldronlab/TCGAutils/9a877b11f48cd6e72b7748bf3b455067178a28ed/data/clinicalNames.rda


--------------------------------------------------------------------------------
/data/diseaseCodes.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/waldronlab/TCGAutils/9a877b11f48cd6e72b7748bf3b455067178a28ed/data/diseaseCodes.rda


--------------------------------------------------------------------------------
/data/sampleTypes.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/waldronlab/TCGAutils/9a877b11f48cd6e72b7748bf3b455067178a28ed/data/sampleTypes.rda


--------------------------------------------------------------------------------
/inst/extdata/blca_cnaseq.R:
--------------------------------------------------------------------------------
 1 | ## Generate blca_cnaseq data
 2 | if (!requireNamespace("RTCGAToolbox"))
 3 |     stop("Download package 'RTCGAToolbox' to regenerate data")
 4 | 
 5 | library(RTCGAToolbox)
 6 | 
 7 | blca <- getFirehoseData("BLCA", clinical = FALSE,
 8 |     CNASeq = TRUE, destdir = tempdir())
 9 | bl <- getData(blca, "CNASeq")
10 | 
11 | set.seed(777)
12 | blsplit <- lapply(split(bl, bl[["Sample"]]), function(x)
13 |     x[sample(seq_len(nrow(x)), 2L), ])
14 | 
15 | blframe <- dplyr::bind_rows(blsplit)
16 | blca_cnaseq <- blframe[c(TRUE, TRUE, FALSE, FALSE), ]
17 | 
18 | write.table(blca_cnaseq, file = "inst/extdata/blca_cnaseq.txt")
19 | 


--------------------------------------------------------------------------------
/inst/extdata/blca_cnaseq.txt:
--------------------------------------------------------------------------------
  1 | "Sample" "Chromosome" "Start" "End" "Num_Probes" "Segment_Mean"
  2 | "1" "TCGA-BL-A0C8-01A-11D-A10R-02" 14 70362113 73912204 NA -0.182879930738387
  3 | "2" "TCGA-BL-A0C8-01A-11D-A10R-02" 9 115609546 131133898 NA 0.0396751622235396
  4 | "5" "TCGA-BL-A13I-01A-11D-A13U-02" 13 19020028 49129100 NA 0.00208555197637913
  5 | "6" "TCGA-BL-A13I-01A-11D-A13U-02" 1 10208 246409808 NA -0.0142247519016688
  6 | "9" "TCGA-BL-A13J-01A-11D-A10R-02" 23 3119586 5636448 NA 0.877072555244314
  7 | "10" "TCGA-BL-A13J-01A-11D-A10R-02" 7 10127 35776912 NA 0.113873871106118
  8 | "13" "TCGA-BL-A13J-11A-13D-A10R-02" 13 27638070 28786211 NA 0.135933760992049
  9 | "14" "TCGA-BL-A13J-11A-13D-A10R-02" 13 31546838 31837888 NA 0.831083871851924
 10 | "17" "TCGA-BL-A3JM-11A-31D-A21C-26" 6 63814 171050932 NA -0.222089556733618
 11 | "18" "TCGA-BL-A3JM-11A-31D-A21C-26" 19 70880 12247668 NA 0.212002033643438
 12 | "21" "TCGA-BT-A0S7-10A-01D-A10R-02" 8 18542416 43427219 NA -0.109425720019835
 13 | "22" "TCGA-BT-A0S7-10A-01D-A10R-02" 12 129318803 129977811 NA -0.750308530005678
 14 | "25" "TCGA-BT-A0YX-10A-01D-A10R-02" 1 100222446 100622130 NA 0.260037976378174
 15 | "26" "TCGA-BT-A0YX-10A-01D-A10R-02" 16 19103580 90294729 NA -0.0243020219348178
 16 | "29" "TCGA-BT-A20N-11A-11D-A14U-02" 23 144220182 151288847 NA -0.218732574411559
 17 | "30" "TCGA-BT-A20N-11A-11D-A14U-02" 11 126948609 128472558 NA 0.147130578528114
 18 | "33" "TCGA-BT-A20O-11A-11D-A14U-02" 19 27741477 47326510 NA 0.218065351393204
 19 | "34" "TCGA-BT-A20O-11A-11D-A14U-02" 7 38394968 142360699 NA 0.00783656168862701
 20 | "37" "TCGA-BT-A20P-11A-11D-A14U-02" 19 70880 59118869 NA -0.0585214412378456
 21 | "38" "TCGA-BT-A20P-11A-11D-A14U-02" 5 141615867 162228885 NA -0.0573410519735708
 22 | "41" "TCGA-BT-A20Q-11A-11D-A14U-02" 1 65116406 105786887 NA -0.0137743863413686
 23 | "42" "TCGA-BT-A20Q-11A-11D-A14U-02" 6 63814 143557977 NA -0.0263569581064626
 24 | "45" "TCGA-BT-A20R-11A-11D-A16N-02" 4 159329222 184058329 NA -0.285766564893208
 25 | "46" "TCGA-BT-A20R-11A-11D-A16N-02" 23 2699503 154930285 NA -0.0507782046619671
 26 | "49" "TCGA-BT-A20T-11A-11D-A14U-02" 21 9411344 48119869 NA -0.00912023316078493
 27 | "50" "TCGA-BT-A20T-11A-11D-A14U-02" 2 143209111 153872733 NA -0.28106978566767
 28 | "53" "TCGA-BT-A20U-11A-11D-A14U-02" 2 10001 243189359 NA -0.0085274424194125
 29 | "54" "TCGA-BT-A20U-11A-11D-A14U-02" 5 11769 180905246 NA -0.00773472005569888
 30 | "57" "TCGA-BT-A20W-11A-11D-A14U-02" 1 187465256 201152272 NA -0.161874068627793
 31 | "58" "TCGA-BT-A20W-11A-11D-A14U-02" 22 16051206 22110145 NA 0.212805544089224
 32 | "61" "TCGA-BT-A20X-11A-12D-A16N-02" 17 1 21505693 NA -0.341408571718122
 33 | "62" "TCGA-BT-A20X-11A-12D-A16N-02" 4 68818 41597686 NA -0.129306283769583
 34 | "65" "TCGA-BT-A2LA-11A-11D-A18D-02" 20 56213353 56783730 NA 0.285173876119266
 35 | "66" "TCGA-BT-A2LA-11A-11D-A18D-02" 2 133819934 243189359 NA -0.635651500181676
 36 | "69" "TCGA-BT-A2LB-10A-01D-A18D-02" 23 58318413 62045719 NA 0.521363874910562
 37 | "70" "TCGA-BT-A2LB-10A-01D-A18D-02" 12 60730 34560974 NA 0.179375958983952
 38 | "73" "TCGA-BT-A2LD-01A-12D-A210-26" 22 40591555 51244552 NA -0.115569189056885
 39 | "74" "TCGA-BT-A2LD-01A-12D-A210-26" 8 72056431 86554763 NA -0.254362985423551
 40 | "77" "TCGA-BT-A3PH-01A-11D-A221-26" 5 49559700 180905246 NA -0.251117604050586
 41 | "78" "TCGA-BT-A3PH-01A-11D-A221-26" 12 38440830 133841505 NA 0.0199294370182946
 42 | "81" "TCGA-BT-A3PJ-01A-21D-A221-26" 7 70562186 71417321 NA 0.376852503791604
 43 | "82" "TCGA-BT-A3PJ-01A-21D-A221-26" 8 125999777 146303846 NA -0.503864205599995
 44 | "85" "TCGA-BT-A3PK-01A-21D-A221-26" 4 73332096 78163988 NA 0.941466762482077
 45 | "86" "TCGA-BT-A3PK-01A-21D-A221-26" 10 61818 78260947 NA 0.0574869749635298
 46 | "89" "TCGA-C4-A0F0-01A-12D-A10R-02" 12 38009613 82438515 NA -0.368132483970951
 47 | "90" "TCGA-C4-A0F0-01A-12D-A10R-02" 16 32562492 33969362 NA -0.303646313844007
 48 | "93" "TCGA-C4-A0F6-01A-11D-A10R-02" 6 63814 2319181 NA -0.121211466525656
 49 | "94" "TCGA-C4-A0F6-01A-11D-A10R-02" 23 80225640 82442603 NA 0.150857739370916
 50 | "97" "TCGA-CF-A1HR-01A-11D-A13U-02" 7 856949 17999178 NA 0.0848758525580564
 51 | "98" "TCGA-CF-A1HR-01A-11D-A13U-02" 4 121888639 123874924 NA -0.424295799956765
 52 | "101" "TCGA-CF-A1HS-01A-11D-A13U-02" 21 9422166 10021274 NA 0.832013365283215
 53 | "102" "TCGA-CF-A1HS-01A-11D-A13U-02" 10 100081385 135524732 NA -0.080306080581503
 54 | "105" "TCGA-CF-A27C-01A-11D-A16N-02" 24 2649474 28783838 NA 0.100350641151954
 55 | "106" "TCGA-CF-A27C-01A-11D-A16N-02" 13 72125634 115109864 NA 0.0476642681135593
 56 | "109" "TCGA-CF-A3MF-01A-12D-A21C-26" 13 19020028 115109864 NA -0.0962439892452074
 57 | "110" "TCGA-CF-A3MF-01A-12D-A21C-26" 14 19141347 107289526 NA -0.126569530580431
 58 | "113" "TCGA-CF-A3MG-01A-11D-A210-26" 9 1461804 2510267 NA -0.311690759521937
 59 | "114" "TCGA-CF-A3MG-01A-11D-A210-26" 9 80421293 141153413 NA -0.585006712032954
 60 | "117" "TCGA-CF-A3MH-01A-11D-A210-26" 1 10208 249240606 NA 0.0143519914953139
 61 | "118" "TCGA-CF-A3MH-01A-11D-A210-26" 16 60001 90294729 NA 0.00941993371591381
 62 | "121" "TCGA-CF-A3MI-01A-11D-A210-26" 12 42897585 43131785 NA 1.42565689631929
 63 | "122" "TCGA-CF-A3MI-01A-11D-A210-26" 8 20441971 21087370 NA 0.0549983590579611
 64 | "125" "TCGA-CU-A0YN-01A-21D-A10R-02" 8 23379 39801453 NA -0.310204325025844
 65 | "126" "TCGA-CU-A0YN-01A-21D-A10R-02" 15 74139211 74241947 NA 0.642523331815196
 66 | "129" "TCGA-CU-A0YO-01A-11D-A10R-02" 20 60001 62965506 NA 0.198193566579685
 67 | "130" "TCGA-CU-A0YO-01A-11D-A10R-02" 12 169006 2861395 NA 0.207262247259086
 68 | "133" "TCGA-CU-A0YR-01A-12D-A10R-02" 7 79955020 104117560 NA -0.377495201658043
 69 | "134" "TCGA-CU-A0YR-01A-12D-A10R-02" 11 48389139 50402704 NA 0.334858333412045
 70 | "137" "TCGA-CU-A0YR-11A-13D-A10R-02" 8 23379 17786984 NA -0.755070670539717
 71 | "138" "TCGA-CU-A0YR-11A-13D-A10R-02" 8 123319655 125120728 NA 1.3772493355834
 72 | "141" "TCGA-CU-A3KJ-10A-01D-A21C-26" 16 31390499 32511983 NA -0.521629286457645
 73 | "142" "TCGA-CU-A3KJ-10A-01D-A21C-26" 16 16731215 18769004 NA -1.2494171348026
 74 | "145" "TCGA-CU-A3QU-10B-01D-A233-26" 2 208994397 243189359 NA -1.52996811774475
 75 | "146" "TCGA-CU-A3QU-10B-01D-A233-26" 1 10208 49541384 NA 0.460765725702518
 76 | "149" "TCGA-CU-A3YL-10A-01D-A233-26" 2 239629318 243189359 NA -0.701322622714515
 77 | "150" "TCGA-CU-A3YL-10A-01D-A233-26" 3 71088642 71453135 NA -1.86922073857518
 78 | "153" "TCGA-DK-A1A3-10A-01D-A13U-02" 6 16613611 19925191 NA -0.363448575873029
 79 | "154" "TCGA-DK-A1A3-10A-01D-A13U-02" 19 28118887 28731289 NA 0.376931649965926
 80 | "157" "TCGA-DK-A1A5-10A-01D-A13U-02" 3 71815322 130634593 NA -0.031816052401891
 81 | "158" "TCGA-DK-A1A5-10A-01D-A13U-02" 2 72065672 72886364 NA -0.310663081153216
 82 | "161" "TCGA-DK-A1A6-10A-01D-A13U-02" 17 28962289 32213654 NA 0.62211927661298
 83 | "162" "TCGA-DK-A1A6-10A-01D-A13U-02" 5 14580750 15469477 NA -1.24169638329031
 84 | "165" "TCGA-DK-A1A7-10A-01D-A13U-02" 18 66468600 78017233 NA -0.535997156763619
 85 | "166" "TCGA-DK-A1A7-10A-01D-A13U-02" 9 10001 23301195 NA -0.125621658141106
 86 | "169" "TCGA-DK-A1AA-10A-01D-A13U-02" 7 105930243 159128640 NA -0.00385890425740978
 87 | "170" "TCGA-DK-A1AA-10A-01D-A13U-02" 6 475695 93983427 NA -0.00845643217356545
 88 | "173" "TCGA-DK-A1AB-10A-01D-A13U-02" 6 63814 171050932 NA -0.373717242247084
 89 | "174" "TCGA-DK-A1AB-10A-01D-A13U-02" 21 9422166 48119869 NA 0.0682825854749836
 90 | "177" "TCGA-DK-A1AC-10A-01D-A13U-02" 16 57189726 79008568 NA -0.525391161119694
 91 | "178" "TCGA-DK-A1AC-10A-01D-A13U-02" 1 10208 15244115 NA 0.520556297903463
 92 | "181" "TCGA-DK-A1AD-10A-01D-A13U-02" 1 159676413 161070225 NA 1.86289656474361
 93 | "182" "TCGA-DK-A1AD-10A-01D-A13U-02" 13 19020028 20299513 NA 0.127958459808601
 94 | "185" "TCGA-DK-A1AE-10A-01D-A13U-02" 16 63147135 63177728 NA -1.63212120821474
 95 | "186" "TCGA-DK-A1AE-10A-01D-A13U-02" 10 117242386 135524732 NA 0.0308715661449148
 96 | "189" "TCGA-DK-A1AG-10A-01D-A13U-02" 20 60001 4556887 NA -0.00203031673746495
 97 | "190" "TCGA-DK-A1AG-10A-01D-A13U-02" 15 20000001 102521366 NA 0.0182477385688014
 98 | "193" "TCGA-DK-A2HX-10A-01D-A18D-02" 7 8883170 9814709 NA 0.0273305076343656
 99 | "194" "TCGA-DK-A2HX-10A-01D-A18D-02" 6 66048503 66741301 NA 0.0979709656531481
100 | "197" "TCGA-DK-A2I1-10A-01D-A17R-02" 4 175233634 178082237 NA -0.0101897136371403
101 | "198" "TCGA-DK-A2I1-10A-01D-A17R-02" 1 104056927 104090604 NA 0.77892793161476
102 | "201" "TCGA-DK-A2I2-10A-01D-A17R-02" 18 10064 18510945 NA 0.036105889090774
103 | "202" "TCGA-DK-A2I2-10A-01D-A17R-02" 4 71725144 74690959 NA 0.161070154821952
104 | "205" "TCGA-DK-A2I6-10A-01D-A18D-02" 12 64764820 66505825 NA -0.290763281991839
105 | "206" "TCGA-DK-A2I6-10A-01D-A18D-02" 1 235594502 249240606 NA -0.18624845048347
106 | "209" "TCGA-DK-A3IK-10A-01D-A21C-26" 20 39116327 40164928 NA 0.524610262981439
107 | "210" "TCGA-DK-A3IK-10A-01D-A21C-26" 16 5352600 7575607 NA -0.004329967791247
108 | "213" "TCGA-DK-A3IL-10A-01D-A210-26" 11 87224559 99891593 NA 0.595682235146572
109 | "214" "TCGA-DK-A3IL-10A-01D-A210-26" 16 35155273 46459008 NA 1.75030804297734
110 | "217" "TCGA-DK-A3IN-10A-01D-A210-26" 3 31451568 37904707 NA 0.122265816847628
111 | "218" "TCGA-DK-A3IN-10A-01D-A210-26" 4 115198041 191014415 NA -0.197608781117688
112 | "221" "TCGA-DK-A3IQ-10A-01D-A210-26" 20 60001 18388033 NA 0.115949737357392
113 | "222" "TCGA-DK-A3IQ-10A-01D-A210-26" 9 3251728 4485530 NA -0.250696214306595
114 | "225" "TCGA-DK-A3IT-10A-01D-A210-26" 12 26283974 26505903 NA 0.73808981078023
115 | "226" "TCGA-DK-A3IT-10A-01D-A210-26" 1 10208 4234646 NA -0.116563794362821
116 | "229" "TCGA-DK-A3IU-10A-01D-A210-26" 12 37991253 38053526 NA 1.87478993183408
117 | "230" "TCGA-DK-A3IU-10A-01D-A210-26" 17 25301752 81195162 NA -0.137542875623982
118 | "233" "TCGA-DK-A3IV-10A-01D-A21C-26" 8 65187706 65225497 NA 1.28329726536649
119 | "234" "TCGA-DK-A3IV-10A-01D-A21C-26" 22 17156707 48633608 NA -0.494888197759455
120 | "237" "TCGA-DK-A3WX-10A-01D-A233-26" 15 20000001 102521366 NA 0.079847704874919
121 | "238" "TCGA-DK-A3WX-10A-01D-A233-26" 6 63814 171050932 NA 0.0487661224113239
122 | "241" "TCGA-DK-A3WY-10A-01D-A233-26" 14 106177004 107289526 NA -0.280540869169745
123 | "242" "TCGA-DK-A3WY-10A-01D-A233-26" 11 195901 134946455 NA -0.00181591314713781
124 | "245" "TCGA-DK-A3X1-10A-01D-A233-26" 3 10988589 11874432 NA 0.741599099191288
125 | "246" "TCGA-DK-A3X1-10A-01D-A233-26" 19 13471712 24513884 NA -0.103991232721606
126 | "249" "TCGA-DK-A3X2-10A-01D-A233-26" 4 58691318 59886313 NA -0.769692286553588
127 | "250" "TCGA-DK-A3X2-10A-01D-A233-26" 14 42263552 86829169 NA 0.232667267761356
128 | "253" "TCGA-E5-A2PC-10B-01D-A204-02" 10 61818 5025173 NA -0.130858593157441
129 | "254" "TCGA-E5-A2PC-10B-01D-A204-02" 4 145764691 151076668 NA -0.637674731273411
130 | "257" "TCGA-E7-A3X6-10A-01D-A233-26" 23 45054413 102902405 NA 0.113174219539276
131 | "258" "TCGA-E7-A3X6-10A-01D-A233-26" 9 10001 21965768 NA -0.526924136501349
132 | "261" "TCGA-E7-A3Y1-10A-01D-A233-26" 6 63814 171050932 NA -0.0392434632506376
133 | "262" "TCGA-E7-A3Y1-10A-01D-A233-26" 1 142825886 249240606 NA 0.47588858781785
134 | "265" "TCGA-FD-A3B3-10A-01D-A204-02" 17 57789929 81195162 NA -0.0358779386299791
135 | "266" "TCGA-FD-A3B3-10A-01D-A204-02" 2 78689086 87740684 NA -0.225400054573825
136 | "269" "TCGA-FD-A3B4-10A-01D-A204-02" 7 41117608 90058582 NA -0.0454754926077756
137 | "270" "TCGA-FD-A3B4-10A-01D-A204-02" 20 60001 62965506 NA -0.0293578026480229
138 | "273" "TCGA-FD-A3B5-10A-01D-A210-26" 23 7785329 154930285 NA 0.31861949381711
139 | "274" "TCGA-FD-A3B5-10A-01D-A210-26" 1 144531908 249240606 NA 0.195129750882562
140 | "277" "TCGA-FD-A3B6-10A-01D-A210-26" 24 2649474 28783838 NA -0.0816521284861819
141 | "278" "TCGA-FD-A3B6-10A-01D-A210-26" 10 65768355 67062950 NA 0.708790577327999
142 | "281" "TCGA-FD-A3B7-10A-01D-A210-26" 4 148104418 152605747 NA 0.440942926444969
143 | "282" "TCGA-FD-A3B7-10A-01D-A210-26" 6 63814 5660280 NA -0.156037696605089
144 | "285" "TCGA-FD-A3B8-10A-01D-A210-26" 16 60001 90294729 NA -0.0273358821397191
145 | "286" "TCGA-FD-A3B8-10A-01D-A210-26" 23 2699503 154930285 NA 0.0327613575222251
146 | "289" "TCGA-FD-A3N5-10A-01D-A21C-26" 6 44317165 171050932 NA -0.0955242025861752
147 | "290" "TCGA-FD-A3N5-10A-01D-A21C-26" 15 20000001 32010494 NA -0.545547383873378
148 | "293" "TCGA-FD-A3N6-10A-01D-A21C-26" 3 60362957 60478746 NA -2.61639502153454
149 | "294" "TCGA-FD-A3N6-10A-01D-A21C-26" 20 34183971 62965506 NA -0.0596705976054726
150 | "297" "TCGA-FD-A3NA-10A-01D-A21C-26" 5 49551633 150135519 NA -0.0942552581652406
151 | "298" "TCGA-FD-A3NA-10A-01D-A21C-26" 4 126241915 129194848 NA -0.433257831498876
152 | "301" "TCGA-FD-A3SJ-10A-01D-A233-26" 9 27489874 29746925 NA 0.169912427768146
153 | "302" "TCGA-FD-A3SJ-10A-01D-A233-26" 1 173792657 175670239 NA 1.24471530069067
154 | "305" "TCGA-FD-A3SL-10A-01D-A233-26" 12 188219 133841505 NA -0.182399287422105
155 | "306" "TCGA-FD-A3SL-10A-01D-A233-26" 4 189693990 191014415 NA -0.427894104774698
156 | "309" "TCGA-FD-A3SM-10A-01D-A233-26" 4 68818 191014415 NA 0.0146938161241947
157 | "310" "TCGA-FD-A3SM-10A-01D-A233-26" 8 37588240 38564934 NA 1.22494145713849
158 | "313" "TCGA-FD-A3SN-10A-01D-A233-26" 5 58768314 59103278 NA 0.679421511035303
159 | "314" "TCGA-FD-A3SN-10A-01D-A233-26" 13 27050102 87833143 NA -0.324319276758869
160 | "317" "TCGA-FD-A3SO-10A-01D-A233-26" 8 23379 48419845 NA -0.434005574572034
161 | "318" "TCGA-FD-A3SO-10A-01D-A233-26" 9 10001 141153413 NA 0.058427449099493
162 | "321" "TCGA-FD-A3SP-10A-01D-A233-26" 8 43092750 43097417 NA 3.04336277414551
163 | "322" "TCGA-FD-A3SP-10A-01D-A233-26" 18 10064 19584050 NA 0.0715882620781257
164 | "325" "TCGA-FD-A3SQ-10A-01D-A233-26" 1 174801569 178932598 NA 0.250005637997436
165 | "326" "TCGA-FD-A3SQ-10A-01D-A233-26" 9 93619293 94000052 NA -0.873309344786371
166 | "329" "TCGA-FD-A3SR-10A-01D-A233-26" 18 9394393 10349109 NA 0.530418349466572
167 | "330" "TCGA-FD-A3SR-10A-01D-A233-26" 1 145367168 147730635 NA 1.23563497416252
168 | "333" "TCGA-FD-A3SS-10A-01D-A233-26" 5 46389437 175639955 NA -0.538298365219699
169 | "334" "TCGA-FD-A3SS-10A-01D-A233-26" 6 121407729 122145390 NA 0.645298912027161
170 | "337" "TCGA-FT-A3EE-10A-01D-A204-02" 4 33708672 34503662 NA 0.105441838837718
171 | "338" "TCGA-FT-A3EE-10A-01D-A204-02" 22 16051206 51244552 NA -0.261788604248097
172 | "341" "TCGA-G2-A2EC-10A-01D-A17R-02" 24 2649474 58844021 NA 0.0913301706855381
173 | "342" "TCGA-G2-A2EC-10A-01D-A17R-02" 5 155105650 179550250 NA -0.44866303953823
174 | "345" "TCGA-G2-A2EF-10A-01D-A18D-02" 8 48180587 90329793 NA -0.0446394091409028
175 | "346" "TCGA-G2-A2EF-10A-01D-A18D-02" 11 66880142 134946455 NA -0.0250633239777246
176 | "349" "TCGA-G2-A2EJ-10A-01D-A17R-02" 2 32222860 32415046 NA -0.617629467331965
177 | "350" "TCGA-G2-A2EJ-10A-01D-A17R-02" 11 78758057 81264883 NA -0.576249020003528
178 | "353" "TCGA-G2-A2EK-10A-01D-A18D-02" 3 148076113 149533806 NA -0.436122467986259
179 | "354" "TCGA-G2-A2EK-10A-01D-A18D-02" 1 65888453 144005542 NA -0.0245890959067118
180 | "357" "TCGA-G2-A2EL-10A-01D-A18D-02" 3 60174 25828606 NA 0.136988592875077
181 | "358" "TCGA-G2-A2EL-10A-01D-A18D-02" 5 11769 31466816 NA 0.068512529538564
182 | "361" "TCGA-G2-A2ES-11A-31D-A17R-02" 5 45901673 49555334 NA 0.45574195724214
183 | "362" "TCGA-G2-A2ES-11A-31D-A17R-02" 11 26207298 30218540 NA -0.000160033704997661
184 | "365" "TCGA-G2-A3IB-10A-01D-A210-26" 16 60001 223583 NA 0.561251033599773
185 | "366" "TCGA-G2-A3IB-10A-01D-A210-26" 8 43428314 46921966 NA 0.475207847898585
186 | "369" "TCGA-G2-A3IE-10A-01D-A210-26" 3 39612761 60264981 NA 0.495559264888242
187 | "370" "TCGA-G2-A3IE-10A-01D-A210-26" 4 93594714 173425586 NA -0.293160662895868
188 | "373" "TCGA-G2-A3VY-10A-01D-A233-26" 9 109973701 132903224 NA -0.386039301234847
189 | "374" "TCGA-G2-A3VY-10A-01D-A233-26" 12 188219 1880680 NA -0.0591011865665831
190 | "377" "TCGA-GC-A3BM-10A-01D-A23Q-26" 18 39622999 40749298 NA -0.444429050209849
191 | "378" "TCGA-GC-A3BM-10A-01D-A23Q-26" 11 81013505 81387487 NA 1.47813863932257
192 | "381" "TCGA-GC-A3I6-01A-11D-A210-26" 14 20424940 107289526 NA 0.131404191409226
193 | "382" "TCGA-GC-A3I6-01A-11D-A210-26" 12 188219 34560974 NA 0.142121978063855
194 | "385" "TCGA-GC-A3OO-01A-11D-A233-26" 1 10208 249240606 NA 0.0527960405418375
195 | "386" "TCGA-GC-A3OO-01A-11D-A233-26" 20 60001 62965506 NA 0.212520376173027
196 | "389" "TCGA-GC-A3RD-01A-12D-A233-26" 12 69198316 69372087 NA 4.35545916730313
197 | "390" "TCGA-GC-A3RD-01A-12D-A233-26" 7 32425044 41415395 NA 0.852549524688694
198 | "393" "TCGA-GC-A3WC-01A-31D-A233-26" 8 23379 3275420 NA -0.337312402464662
199 | "394" "TCGA-GC-A3WC-01A-31D-A233-26" 21 37514100 38576723 NA 0.697730704202874
200 | "397" "TCGA-GC-A3WC-11A-11D-A233-26" 4 66456541 67063379 NA 0.415428401579314
201 | "398" "TCGA-GC-A3WC-11A-11D-A233-26" 8 43786144 146303846 NA 0.228660175848778
202 | "401" "TCGA-GD-A2C5-10A-01D-A17R-02" 11 31360720 31984769 NA -0.0806177664337259
203 | "402" "TCGA-GD-A2C5-10A-01D-A17R-02" 16 12144408 17594332 NA -0.449976633028893
204 | "405" "TCGA-GD-A3OP-01A-21D-A221-26" 11 195901 134946455 NA -0.103725067423542
205 | "406" "TCGA-GD-A3OP-01A-21D-A221-26" 5 46389437 180905246 NA -0.0997474434064362
206 | "409" "TCGA-GD-A3OP-11A-11D-A221-26" 6 63814 42759519 NA -0.103720485767818
207 | "410" "TCGA-GD-A3OP-11A-11D-A221-26" 16 34196882 90294729 NA -0.220063223344308
208 | "413" "TCGA-GD-A3OQ-10A-01D-A221-26" 17 43673826 44212734 NA -0.721342656508171
209 | "414" "TCGA-GD-A3OQ-10A-01D-A221-26" 16 60001 90294729 NA -0.0279590914074535
210 | "417" "TCGA-GD-A3OS-01A-12D-A221-26" 10 24549423 25199875 NA 0.0797066752625248
211 | "418" "TCGA-GD-A3OS-01A-12D-A221-26" 16 33241258 34459387 NA -0.232184795634823
212 | "421" "TCGA-GV-A3JV-01A-11D-A221-26" 14 71221735 71931850 NA 1.32760392205331
213 | "422" "TCGA-GV-A3JV-01A-11D-A221-26" 12 60730 19244878 NA -0.162214454113409
214 | "425" "TCGA-GV-A3JW-01A-11D-A210-26" 23 128050183 154930285 NA 0.283849828370155
215 | "426" "TCGA-GV-A3JW-01A-11D-A210-26" 16 25856506 26583505 NA 0.777314229943503
216 | "429" "TCGA-GV-A3JX-01A-11D-A210-26" 22 16051206 51244552 NA -0.756450574913272
217 | "430" "TCGA-GV-A3JX-01A-11D-A210-26" 15 22697825 102521366 NA -0.791298494359646
218 | "433" "TCGA-GV-A3JZ-01A-11D-A21C-26" 17 12150816 12759218 NA 0.509791521326194
219 | "434" "TCGA-GV-A3JZ-01A-11D-A21C-26" 5 23123410 27184334 NA -0.163013211330329
220 | "437" "TCGA-GV-A3QF-01A-31D-A233-26" 1 26509374 59339528 NA -0.0952285398758193
221 | "438" "TCGA-GV-A3QF-01A-31D-A233-26" 3 60744709 93533249 NA 0.151052036455775
222 | "441" "TCGA-GV-A3QG-01A-11D-A221-26" 5 11769 53060372 NA -0.0423279562882088
223 | "442" "TCGA-GV-A3QG-01A-11D-A221-26" 2 14483090 243189359 NA -0.031985504966719
224 | "445" "TCGA-GV-A3QH-01A-11D-A221-26" 16 62764166 63596223 NA -0.391701952787151
225 | "446" "TCGA-GV-A3QH-01A-11D-A221-26" 4 129845569 130791851 NA 0.544607886025542
226 | "449" "TCGA-H4-A2HQ-01A-11D-A17R-02" 16 3887120 8290659 NA -0.212174077026258
227 | "450" "TCGA-H4-A2HQ-01A-11D-A17R-02" 1 62972796 64923432 NA 1.05481458964742
228 | "453" "TCGA-HQ-A2OE-01A-11D-A204-02" 10 104951493 105299191 NA -0.271937682930961
229 | "454" "TCGA-HQ-A2OE-01A-11D-A204-02" 5 52184783 52518798 NA -0.230021318992407
230 | "457" "TCGA-K4-A3WS-01A-11D-A23Q-26" 20 26213766 46534498 NA 0.374258403148417
231 | "458" "TCGA-K4-A3WS-01A-11D-A23Q-26" 7 7788999 38637598 NA -0.0778093714176013
232 | "461" "TCGA-K4-A3WV-01A-11D-A23Q-26" 10 76962861 77154224 NA 0.176828116408713
233 | "462" "TCGA-K4-A3WV-01A-11D-A23Q-26" 6 6016367 6589128 NA -0.46322899113314
234 | 


--------------------------------------------------------------------------------
/inst/extdata/bt.exon_quant.R:
--------------------------------------------------------------------------------
 1 | ## Download example dataset from legacy archive
 2 | if (!requireNamespace("GenomicDataCommons"))
 3 |     stop("Please download 'GenomicDataCommons' to update file")
 4 | 
 5 | library(GenomicDataCommons)
 6 | 
 7 | manifile <- files() |>
 8 |     filter(~ file_id == "d56a5dec-cb55-457f-8d93-dd1f3911ae9f") |>
 9 |         manifest()
10 | 
11 | gdcdata(manifile[["id"]], use_cached = TRUE)
12 | 
13 | flist <- list.files(gdc_cache(), pattern = "cation.txt$", recursive = TRUE,
14 |     full.names = TRUE)
15 | flist <- flist[grepl("^unc", basename(flist))]
16 | 
17 | exonFile <- "bt.exon_quantification.txt"
18 | file.rename(flist, exonFile)
19 | 
20 | exonEx <- read.delim(exonFile, nrows = 100)
21 | 
22 | write.table(exonEx, file.path("inst", "extdata", basename(exonFile)),
23 |     sep = "\t", row.names = FALSE)
24 | 


--------------------------------------------------------------------------------
/inst/extdata/bt.exon_quantification.txt:
--------------------------------------------------------------------------------
  1 | "exon"	"raw_counts"	"median_length_normalized"	"RPKM"
  2 | "chr1:11874-12227:+"	4	0.4929178	0.322476823123937
  3 | "chr1:12595-12721:+"	2	0.3412699	0.449436202306589
  4 | "chr1:12613-12721:+"	2	0.3981481	0.523655024705842
  5 | "chr1:12646-12697:+"	2	0.372549	1.09766149409494
  6 | "chr1:13221-14409:+"	39	0.6329966	0.936104924316458
  7 | "chr1:13403-14409:+"	36	0.6192843	1.02026927355796
  8 | "chr1:14363-16765:-"	1033	0.9941715	12.2684113226808
  9 | "chr1:16854-17055:-"	249	1	35.1795074889635
 10 | "chr1:17233-18061:-"	503	0.9649758	17.3163052108246
 11 | "chr1:18268-18379:-"	132	1	33.6354843547663
 12 | "chr1:18497-18554:-"	27	1	13.2854891181836
 13 | "chr1:18913-19759:-"	277	1	9.33336255073406
 14 | "chr1:24738-24901:-"	65	0.9570552	11.3112678354905
 15 | "chr1:29321-29370:-"	8	0.5714286	4.56627181543495
 16 | "chr1:29824-29961:-"	1	0.5547445	0.206805788742525
 17 | "chr1:34612-35174:-"	1	0.1352313	0.0506912945763205
 18 | "chr1:35277-35481:-"	0	0	0
 19 | "chr1:35721-36081:-"	0	0	0
 20 | "chr1:69091-70008:+"	0	0	0
 21 | "chr1:89295-90404:-"	37	0.7321911	0.951306628215614
 22 | "chr1:137839-139228:-"	419	0.9971202	8.60282324940307
 23 | "chr1:236615-237877:-"	14	0.5380349	0.31634899750638
 24 | "chr1:321084-321114:+"	45	1	41.4278692932606
 25 | "chr1:321146-321223:+"	41	1	15.0013737526308
 26 | "chr1:322037-322228:+"	29	1	4.310608159102
 27 | "chr1:323892-324060:+"	22	1	3.71516198001364
 28 | "chr1:324288-324345:+"	3	0.7368421	1.47616545757595
 29 | "chr1:324439-328580:+"	1539	0.9951702	10.6040142502933
 30 | "chr1:367659-368595:+"	0	0	0
 31 | "chr1:420206-420296:+"	0	0	0
 32 | "chr1:420992-421258:+"	0	0	0
 33 | "chr1:421396-421839:+"	0	0	0
 34 | "chr1:566462-568045:+"	149133	1	2686.95476109241
 35 | "chr1:568149-568842:+"	24079	1	990.19505623071
 36 | "chr1:568844-568912:+"	1157	1	478.548595150202
 37 | "chr1:569327-570349:+"	97615	0.9589041	2723.21983909874
 38 | "chr1:621098-622034:-"	0	0	0
 39 | "chr1:661140-665184:-"	1518	0.9967853	10.7101369218638
 40 | "chr1:665278-665335:-"	13	1	6.39671698282913
 41 | "chr1:665563-665731:-"	19	1	3.20854898273905
 42 | "chr1:667397-667587:-"	38	1	5.67795579144398
 43 | "chr1:668402-668479:-"	47	1	17.1966967408207
 44 | "chr1:668511-668541:-"	47	1	43.2691079285166
 45 | "chr1:668687-668744:-"	13	1	6.39671698282913
 46 | "chr1:670803-670994:-"	31	0.9947644	4.60789148041938
 47 | "chr1:671808-671885:-"	41	1	15.0013737526308
 48 | "chr1:671917-671947:-"	11	1	10.1268124939081
 49 | "chr1:674240-674404:-"	38	1	6.57266397676242
 50 | "chr1:675183-675415:-"	92	1	11.2686965402365
 51 | "chr1:675509-675566:-"	7	1	3.44438606767722
 52 | "chr1:678666-678730:-"	7	1	3.07345218346583
 53 | "chr1:679575-679736:-"	14	1	2.46635051759604
 54 | "chr1:700237-700627:-"	20	0.9461538	1.45980556759429
 55 | "chr1:701709-701767:-"	2	1	0.967430469371811
 56 | "chr1:703928-703993:-"	3	0.7230769	1.29723631120311
 57 | "chr1:704877-705092:-"	13	1	1.71763696761152
 58 | "chr1:708356-708487:-"	13	1	2.8106786742734
 59 | "chr1:709551-709660:-"	14	1	3.63226167136871
 60 | "chr1:713664-714006:-"	13	0.7982456	1.08166059767956
 61 | "chr1:761587-762902:-"	29	0.7315589	0.628903318045277
 62 | "chr1:763064-763155:+"	8	1	2.4816694649103
 63 | "chr1:764383-764484:+"	8	1	2.23836853697791
 64 | "chr1:783034-783186:+"	3	0.4736842	0.559592134244479
 65 | "chr1:787307-787490:+"	16	1	2.4816694649103
 66 | "chr1:788051-788146:+"	11	1	3.27011653449117
 67 | "chr1:788771-789740:+"	91	0.9649123	2.67738875776147
 68 | "chr1:791898-794579:+"	31	0.460276	0.32987142589132
 69 | "chr1:803453-804055:-"	8	0.3056479	0.378629503767409
 70 | "chr1:809492-810535:-"	1	0.0728667	0.0273363973625176
 71 | "chr1:812126-812182:-"	0	0	0
 72 | "chr1:846815-846853:+"	0	0	0
 73 | "chr1:847325-850328:+"	8	0.1714952	0.0760031926670264
 74 | "chr1:852953-853100:-"	3	0.8639456	0.5784972739149
 75 | "chr1:853402-853555:-"	3	0.6732026	0.555958419087047
 76 | "chr1:854205-854295:-"	3	1	0.940852709224233
 77 | "chr1:854715-854817:-"	3	0.9509804	0.831238801353449
 78 | "chr1:860530-860569:+"	0	0	0
 79 | "chr1:861121-861180:+"	0	0	0
 80 | "chr1:861302-861393:+"	4	1	1.24083473245515
 81 | "chr1:865535-865716:+"	6	1	0.940852709224233
 82 | "chr1:866419-866469:+"	2	0.74	1.11918426848896
 83 | "chr1:871152-871276:+"	2	0.6129032	0.456627181543495
 84 | "chr1:874420-874509:+"	2	0.7303371	0.634204418810409
 85 | "chr1:874655-874840:+"	3	0.9081081	0.460309658814007
 86 | "chr1:876524-876686:+"	7	0.962963	1.22560976641275
 87 | "chr1:877516-877631:+"	12	1	2.9523309151519
 88 | "chr1:877790-877868:+"	10	1	3.61255681600866
 89 | "chr1:877939-878438:+"	23	0.9338678	1.31280314693755
 90 | "chr1:878633-878757:+"	2	0.6451613	0.456627181543495
 91 | "chr1:879078-879188:+"	8	0.8	2.05687919614187
 92 | "chr1:879288-879961:+"	319	1	13.5074249733285
 93 | "chr1:879584-880180:-"	446	1	21.3207415167921
 94 | "chr1:880422-880526:-"	148	1	40.2266802788317
 95 | "chr1:880898-881033:-"	174	1	36.5133867594522
 96 | "chr1:881553-881666:-"	181	1	45.3122367650069
 97 | "chr1:881782-881925:-"	179	1	35.4758096772073
 98 | "chr1:883511-883612:-"	151	1	42.2492061354581
 99 | "chr1:883870-883983:-"	155	1	38.8032966772158
100 | "chr1:886507-886618:-"	144	1	36.6932556597451
101 | "chr1:887380-887519:-"	158	1	32.2085244124429
102 | 


--------------------------------------------------------------------------------
/inst/scripts/clinicalNames.R:
--------------------------------------------------------------------------------
 1 | # Locate Clinical datasets for each cancer
 2 | # Script used with https://github.com/waldronlab/MultiAssayExperiment-TCGA
 3 | #
 4 | # Saves the clinical table of each included cancer code as a CSV file in a
 5 | # temporary folder and records the names of the columns that are not entirely
 6 | # missing in the 'clinicalNames' dataset.
 7 | 
 8 | if (!requireNamespace("RTCGAToolbox"))
 9 |     stop("Install `RTCGAToolbox` to generate 'clinicalNames' data")
10 | 
11 | TCGAcodes <- RTCGAToolbox::getFirehoseDatasets()
12 | 
13 | excludedCodes <- c("COADREAD", "GBMLGG", "KIPAN", "STES", "FPPP", "CNTL",
14 |     "LCML", "MISC")
15 | # Logical indexing: `x[-which(cond)]` returns nothing when no element matches
16 | TCGAcodes <- TCGAcodes[!TCGAcodes %in% excludedCodes]
17 | 
18 | myDataDir <- tempdir()
19 | 
20 | # Run for its side effects only (one 'clinical.csv' per cancer code)
21 | invisible(lapply(TCGAcodes, function(cancer) {
22 |     clinFile <- file.path(myDataDir, cancer, "clinical.csv")
23 |     if (!file.exists(clinFile)) {
24 |         clinDat <- RTCGAToolbox::getFirehoseData(dataset = cancer,
25 |             destdir = myDataDir)
26 |         clinFrame <- RTCGAToolbox::getData(clinDat, "clinical")
27 |         rownames(clinFrame) <-
28 |             TCGAutils:::.standardBarcodes(rownames(clinFrame))
29 | 
30 |         # Do not warn when the folder is already there
31 |         dir.create(dirname(clinFile), showWarnings = FALSE)
32 | 
33 |         write.csv(clinFrame, clinFile)
34 |         message(cancer, " clinical data saved.")
35 |     } else {
36 |         message(cancer, " clinical data already exists!")
37 |     }
38 | }))
39 | 
40 | names(TCGAcodes) <- TCGAcodes
41 | 
42 | clinicalNames <- IRanges::CharacterList(lapply(TCGAcodes, function(cancer) {
43 |     clinDat <- read.csv(file.path(myDataDir, cancer, "clinical.csv"),
44 |         row.names = 1L)
45 |     allNA <- vapply(clinDat, function(col) all(is.na(col)), logical(1L))
46 |     # drop = FALSE keeps a data.frame when only one column is left
47 |     clinDat <- clinDat[, !allNA, drop = FALSE]
48 |     names(clinDat)[names(clinDat) != "Composite.Element.REF"]
49 | }))
50 | 
51 | # devtools::use_data() was moved out of devtools (to usethis); write the
52 | # dataset with save(), as the other scripts in this folder do
53 | save(clinicalNames, file = file.path("data", "clinicalNames.rda"),
54 |     compress = "bzip2")
55 | 


--------------------------------------------------------------------------------
/inst/scripts/diseaseCodes.R:
--------------------------------------------------------------------------------
 1 | ## BiocFileCache provides the download cache used by update_data_file()
 2 | if (!requireNamespace("BiocFileCache"))
 3 |     stop("Please install 'BiocFileCache' to manage and generate data")
 4 | ## Scrape the TCGA study abbreviations table and save it as 'diseaseCodes'
 5 | .parseDiseaseCodes <- function(from, to) {
 6 |     ## 'from': page to read; 'to': path of the file written by save()
 7 |     page <- xml2::read_html(from)
 8 |     diseaseCodes <- rvest::html_table(page, fill = TRUE)[[2L]]
 9 |     names(diseaseCodes) <- make.names(colnames(diseaseCodes))
10 |     studyCodes <- diseaseCodes[["Study.Abbreviation"]]
11 | 
12 |     ## Logical flags are stored as Yes / No factors
13 |     yesNo <- function(flag)
14 |         factor(flag, levels = c("TRUE", "FALSE"), labels = c("Yes", "No"))
15 | 
16 |     ## Study codes marked as not available
17 |     excludedCodes <- c("COADREAD", "GBMLGG", "KIPAN", "STES", "FPPP", "CNTL",
18 |         "LCML", "MISC")
19 |     diseaseCodes[["Available"]] <- yesNo(!studyCodes %in% excludedCodes)
20 | 
21 |     ## Study codes marked as having subtype data
22 |     subtypeCodes <- c("ACC", "BLCA", "BRCA", "COAD", "GBM", "HNSC", "KICH",
23 |         "KIRC", "KIRP", "LAML", "LGG", "LUAD", "LUSC", "OV", "PRAD", "SKCM",
24 |         "STAD", "THCA", "UCEC")
25 |     diseaseCodes[["SubtypeData"]] <- yesNo(studyCodes %in% subtypeCodes)
26 | 
27 |     ## Sort by study code, rearrange the columns and reset the row names
28 |     columnOrder <-
29 |         c("Study.Abbreviation", "Available", "SubtypeData", "Study.Name")
30 |     diseaseCodes <- diseaseCodes[order(studyCodes), ]
31 |     diseaseCodes <- diseaseCodes[, columnOrder]
32 |     rownames(diseaseCodes) <- NULL
33 |     ## Standard data.frame (no tibble required); for easy subsetting use:
34 |     ## diseaseCodes[["Study.Abbreviation"]][diseaseCodes$Available == "Yes"]
35 |     diseaseCodes <- as(diseaseCodes, "data.frame")
36 |     save(diseaseCodes, file = to, compress = "bzip2")
37 |     TRUE
38 | }
39 | 
40 | .get_cache <- function() {
41 |     ## BiocFileCache located in the per-user cache directory of 'TCGAutils'
42 |     BiocFileCache::BiocFileCache(rappdirs::user_cache_dir("TCGAutils"))
43 | }
44 | 
45 | ## Register 'fileURL' in the BiocFileCache from .get_cache() and, when the
46 | ## cached copy needs an update, rebuild it with 'FUN' and copy the result to
47 | ## data/<resource><ext>. Returns the result of bfcrpath() for the resource.
48 | update_data_file <-
49 |     function(fileURL, verbose = FALSE, resource, ext = ".rda", FUN) {
50 |     bfc <- .get_cache()
51 |     rid <- BiocFileCache::bfcquery(bfc, fileURL, "rname")$rid
52 |     if (!length(rid)) {
53 |         if (verbose) message("Downloading ", resource, " file")
54 |         rid <- names(BiocFileCache::bfcadd(bfc, fileURL, download = FALSE,
55 |             ext = ext))
56 |     }
57 |     if (!isFALSE(BiocFileCache::bfcneedsupdate(bfc, rid))) {
58 |         rpath <- BiocFileCache::bfcdownload(bfc, rid, ask = FALSE,
59 |             FUN = FUN, ext = ext)
60 |         file.copy(rpath, file.path("data", paste0(resource, ext)),
61 |             overwrite = TRUE)
62 |     }
63 |     if (verbose) message(resource, " update complete")
64 |     ## qualified call: this script loads BiocFileCache but never attaches it
65 |     BiocFileCache::bfcrpath(bfc, rids = rid)
66 | }
67 | 
68 | ## Refresh data/diseaseCodes.rda from the GDC study abbreviations table
69 | url1 <- paste0("https://gdc.cancer.gov/resources-tcga-users/",
70 |     "tcga-code-tables/tcga-study-abbreviations")
71 | update_data_file(url1, verbose = FALSE,
72 |     resource = "diseaseCodes", FUN = .parseDiseaseCodes)


--------------------------------------------------------------------------------
/inst/scripts/sampleTypes.R:
--------------------------------------------------------------------------------
 1 | ## Extract sample types table from TCGA website
 2 | ## ('from': page to read, 'to': path of the file written by save())
 3 | .parseSampleTypes <- function(from, to) {
 4 |     stcc <- xml2::read_html(from)
 5 | 
 6 |     sampleTypes <- rvest::html_table(stcc, fill = TRUE)[[2L]]
 7 | 
 8 |     ## convert code column to zero-padded two-character strings; sprintf()
 9 |     ## pads each value exactly once, even if the codes were parsed as text
10 |     sampleTypes[["Code"]] <-
11 |         sprintf("%02d", as.integer(sampleTypes[["Code"]]))
12 | 
13 |     names(sampleTypes) <- make.names(colnames(sampleTypes))
14 | 
15 |     ## Coerce to standard data.frame (no tibble required)
16 |     sampleTypes <- as(sampleTypes, "data.frame")
17 | 
18 |     ## Save dataset for exported use
19 |     save(sampleTypes, file = to, compress = "bzip2")
20 |     TRUE
21 | }
22 | 
23 | url2 <-
24 | "https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/sample-type-codes"
25 | ## update_data_file() is defined in inst/scripts/diseaseCodes.R
26 | update_data_file(url2, verbose = FALSE, resource = "sampleTypes",
27 |     FUN = .parseSampleTypes)
28 | 


--------------------------------------------------------------------------------
/man/ID-translation.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/ID-translation.R
  3 | \name{ID-translation}
  4 | \alias{ID-translation}
  5 | \alias{UUIDtoBarcode}
  6 | \alias{UUIDtoUUID}
  7 | \alias{barcodeToUUID}
  8 | \alias{filenameToBarcode}
  9 | \alias{UUIDhistory}
 10 | \title{Translate study identifiers from barcode to UUID and vice versa}
 11 | \usage{
 12 | UUIDtoBarcode(id_vector, from_type = c("case_id", "file_id", "aliquot_ids"))
 13 | 
 14 | UUIDtoUUID(id_vector, to_type = c("case_id", "file_id"))
 15 | 
 16 | barcodeToUUID(barcodes)
 17 | 
 18 | filenameToBarcode(filenames, slides = FALSE)
 19 | 
 20 | UUIDhistory(id, endpoint = .HISTORY_ENDPOINT)
 21 | }
 22 | \arguments{
 23 | \item{id_vector}{character() A vector of UUIDs corresponding to
 24 | either files or cases (default assumes case_ids)}
 25 | 
 26 | \item{from_type}{character(1) One of \code{case_id}, \code{file_id}, or \code{aliquot_ids}
 27 | indicating the type of \code{id_vector} entered (default \code{"case_id"})}
 28 | 
 29 | \item{to_type}{character(1) The desired UUID type to obtain, can either be
 30 | \code{"case_id"} (default) or \code{"file_id"}}
 31 | 
 32 | \item{barcodes}{character() A vector of TCGA barcodes}
 33 | 
 34 | \item{filenames}{\code{character()} A vector of file names usually obtained
 35 | from a \code{GenomicDataCommons} query}
 36 | 
 37 | \item{slides}{\code{logical(1L)} \strong{DEPRECATED}: Whether the provided file names
 38 | correspond to slides typically with an \code{.svs} extension. \strong{Note} The
 39 | barcodes returned correspond 1:1 with the \code{filename} inputs. Always triple
 40 | check the output against the Genomic Data Commons Data Portal by searching
 41 | the file name and comparing associated "Entity ID" with the \code{submitter_id}
 42 | given by the function.}
 43 | 
 44 | \item{id}{character(1) A UUID whose history of versions is sought}
 45 | 
 46 | \item{endpoint}{character(1) Generally a constant pertaining to the location
 47 | of the history api endpoint. This argument rarely needs to change.}
 48 | }
 49 | \value{
 50 | Generally, a \code{data.frame} of identifier mappings
 51 | 
 52 | UUIDhistory: A \code{data.frame} containing a list of associated UUIDs
 53 | for the given input along with \code{file_change} status, \code{data_release}
 54 | versions, etc.
 55 | }
 56 | \description{
 57 | These functions allow the user to enter a character vector of
 58 | identifiers and use the GDC API to translate from TCGA barcodes to
 59 | Universally Unique Identifiers (UUID) and vice versa. These relationships
 60 | are not one-to-one. Therefore, a \code{data.frame} is returned for all
 61 | inputs. The UUID to TCGA barcode translation only applies to file and case
 62 | UUIDs. Two-way UUID translation is available from 'file_id' to 'case_id'
 63 | and vice versa. Please double check any results before using these
 64 | features for analysis. Case / submitter identifiers are translated by
 65 | default, see the \code{from_type} argument for details. All identifiers are
 66 | converted to lower case.
 67 | }
 68 | \details{
 69 | Based on the file UUID supplied, the appropriate entity_id (TCGA barcode) is
 70 | returned. In previous versions of the package, the 'end_point' parameter
 71 | would require the user to specify what type of barcode needed. This is no
 72 | longer supported as \code{entity_id} returns the appropriate one.
 73 | 
 74 | When providing slide file names, the function will only work if
 75 | \strong{all} the provided files are slide files with an \code{.svs} extension.
 76 | }
 77 | \examples{
 78 | ## Translate UUIDs >> TCGA Barcode
 79 | 
 80 | uuids <- c("b4bce3ff-7fdc-4849-880b-56f2b348ceac",
 81 | "5ca9fa79-53bc-4e91-82cd-5715038ee23e",
 82 | "b7c3e5ad-4ffc-4fc4-acbf-1dfcbd2e5382")
 83 | 
 84 | UUIDtoBarcode(uuids, from_type = "file_id")
 85 | 
 86 | UUIDtoBarcode("ae55b2d3-62a1-419e-9f9a-5ddfac356db4", from_type = "case_id")
 87 | 
 88 | UUIDtoBarcode("d85d8a17-8aea-49d3-8a03-8f13141c163b", "aliquot_ids")
 89 | 
 90 | ## Translate file UUIDs >> case UUIDs
 91 | 
 92 | uuids <- c("b4bce3ff-7fdc-4849-880b-56f2b348ceac",
 93 | "5ca9fa79-53bc-4e91-82cd-5715038ee23e",
 94 | "b7c3e5ad-4ffc-4fc4-acbf-1dfcbd2e5382")
 95 | 
 96 | UUIDtoUUID(uuids)
 97 | 
 98 | ## Translate TCGA Barcode >> UUIDs
 99 | 
100 | fullBarcodes <- c("TCGA-B0-5117-11A-01D-1421-08",
101 | "TCGA-B0-5094-11A-01D-1421-08",
102 | "TCGA-E9-A295-10A-01D-A16D-09")
103 | 
104 | sample_ids <- TCGAbarcode(fullBarcodes, sample = TRUE)
105 | 
106 | barcodeToUUID(sample_ids)
107 | 
108 | participant_ids <- c("TCGA-CK-4948", "TCGA-D1-A17N",
109 | "TCGA-4V-A9QX", "TCGA-4V-A9QM")
110 | 
111 | barcodeToUUID(participant_ids)
112 | 
113 | library(GenomicDataCommons)
114 | 
115 | ### Query CNV data and get file names
116 | 
117 | cnv <- files() |>
118 |     filter(
119 |         ~ cases.project.project_id == "TCGA-COAD" &
120 |         data_category == "Copy Number Variation" &
121 |         data_type == "Copy Number Segment"
122 |     ) |>
123 |     results(size = 6)
124 | 
125 | filenameToBarcode(cnv$file_name)
126 | 
127 | ### Query slides data and get file names
128 | 
129 | slides <- files() |>
130 |     filter(
131 |         ~ cases.project.project_id == "TCGA-BRCA" &
132 |         cases.samples.sample_type == "Primary Tumor" &
133 |         data_type == "Slide Image" &
134 |         experimental_strategy == "Diagnostic Slide"
135 |     ) |>
136 |     results(size = 3)
137 | 
138 | filenameToBarcode(slides$file_name, slides = TRUE)
139 | 
140 | ## Get the version history of a BAM file in TCGA-KIRC
141 | UUIDhistory("0001801b-54b0-4551-8d7a-d66fb59429bf")
142 | 
143 | }
144 | \author{
145 | Sean Davis, M. Ramos
146 | }
147 | 


--------------------------------------------------------------------------------
/man/TCGAbarcode.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/TCGAbarcode.R
 3 | \name{TCGAbarcode}
 4 | \alias{TCGAbarcode}
 5 | \title{Parse data from TCGA barcode}
 6 | \usage{
 7 | TCGAbarcode(
 8 |   barcodes,
 9 |   participant = TRUE,
10 |   sample = FALSE,
11 |   portion = FALSE,
12 |   plate = FALSE,
13 |   center = FALSE,
14 |   index = NULL
15 | )
16 | }
17 | \arguments{
18 | \item{barcodes}{A character vector of TCGA barcodes}
19 | 
20 | \item{participant}{Logical (default TRUE) participant identifier chunk}
21 | 
22 | \item{sample}{Logical (default FALSE) includes the numeric sample code of
23 | the barcode and the vial letter}
24 | 
25 | \item{portion}{Logical (default FALSE) includes the portion and analyte
26 | codes of the barcode}
27 | 
28 | \item{plate}{Logical (default FALSE) returns the plate value}
29 | 
30 | \item{center}{Logical (default FALSE) returns a matrix with the plate and
31 | center codes}
32 | 
33 | \item{index}{An optional numeric vector indicating barcode positions when
34 | split by the delimiter (i.e., hyphen '-'). For example, an index of
35 | \code{c(1, 2)} corresponds to 'TCGA-ZZ' in \code{TCGA-ZZ-A1A1}.}
36 | }
37 | \value{
38 | A character vector or data matrix of TCGA barcode information
39 | }
40 | \description{
41 | This function returns the specified snippet of information obtained from
42 | the TCGA barcode.
43 | }
44 | \examples{
45 | barcodes <- c("TCGA-B0-5117-11A-01D-1421-08",
46 | "TCGA-B0-5094-11A-01D-1421-08",
47 | "TCGA-E9-A295-10A-01D-A16D-09")
48 | 
49 | ## Patient identifiers
50 | TCGAbarcode(barcodes)
51 | 
52 | ## Sample identifiers
53 | TCGAbarcode(barcodes, sample = TRUE)
54 | 
55 | }
56 | \author{
57 | M. Ramos
58 | }
59 | 


--------------------------------------------------------------------------------
/man/TCGAbiospec.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/TCGAbiospec.R
 3 | \name{TCGAbiospec}
 4 | \alias{TCGAbiospec}
 5 | \title{Extract biospecimen data from the TCGA barcode}
 6 | \usage{
 7 | TCGAbiospec(barcodes)
 8 | }
 9 | \arguments{
10 | \item{barcodes}{A character vector of TCGA barcodes}
11 | }
12 | \value{
13 | A \code{data.frame} with sample type, sample code, portion, plate,
14 | and center columns.
15 | }
16 | \description{
17 | This function uses the full TCGA barcode to return a data frame of the
18 | data pertinent to laboratory variables such as vials, portions, analytes,
19 | plates and the center.
20 | }
21 | \examples{
22 | example("TCGAbarcode")
23 | TCGAbiospec(barcodes)
24 | 
25 | }
26 | \author{
27 | M. Ramos
28 | }
29 | 


--------------------------------------------------------------------------------
/man/TCGAprimaryTumors.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/TCGAprimaryTumors.R
 3 | \name{TCGAprimaryTumors}
 4 | \alias{TCGAprimaryTumors}
 5 | \title{Select primary tumors from TCGA datasets}
 6 | \usage{
 7 | TCGAprimaryTumors(multiassayexperiment)
 8 | }
 9 | \arguments{
10 | \item{multiassayexperiment}{A
11 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}}
12 | with TCGA data as obtained from \code{\link[curatedTCGAData:curatedTCGAData]{curatedTCGAData::curatedTCGAData()}}}
13 | }
14 | \value{
15 | A \code{MultiAssayExperiment} containing only primary tumor samples
16 | }
17 | \description{
18 | Tumor selection is decided using the \code{sampleTypes} data. For 'LAML' datasets,
19 | the primary tumor code used is "03"; otherwise, "01" is used.
20 | }
21 | \examples{
22 | 
23 | example(getSubtypeMap)
24 | 
25 | TCGAprimaryTumors(gbm)
26 | 
27 | }
28 | 


--------------------------------------------------------------------------------
/man/TCGAsampleSelect.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/TCGAsampleSelect.R
 3 | \name{TCGAsampleSelect}
 4 | \alias{TCGAsampleSelect}
 5 | \title{Select samples from barcodes from lookup table}
 6 | \usage{
 7 | TCGAsampleSelect(barcodes, sampleCodes)
 8 | }
 9 | \arguments{
10 | \item{barcodes}{Either a TCGA barcode vector or
11 | \link[IRanges:AtomicList-class]{CharacterList} containing patient
12 | identifiers, sample, portion, plate, and center codes.}
13 | 
14 | \item{sampleCodes}{Either a character or numeric vector of TCGA sample codes.
15 | See the \code{sampleTypes} dataset.}
16 | }
17 | \value{
18 | A logical vector or \link[IRanges:AtomicList-class]{LogicalList} of the
19 | same length as 'barcodes' indicating sample type matches
20 | }
21 | \description{
22 | The TCGA barcode contains several pieces of information which can
23 | be parsed by the \link{TCGAbarcode} function. To select a specific type of
24 | sample, enter the appropriate sampleCode argument from the lookup table.
25 | See lookup table in \code{data("sampleTypes")}. Barcode inputs can be a
26 | character vector or a \link[IRanges:AtomicList-class]{CharacterList} object.
27 | }
28 | \examples{
29 | 
30 | example("TCGAbarcode")
31 | TCGAsampleSelect(barcodes, c(11, 01))
32 | 
33 | }
34 | 


--------------------------------------------------------------------------------
/man/TCGAutils-package.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/TCGAutils-pkg.R
 3 | \docType{package}
 4 | \name{TCGAutils-package}
 5 | \alias{TCGAutils}
 6 | \alias{TCGAutils-package}
 7 | \title{TCGAutils: Helper functions for working with TCGA and MultiAssayExperiment
 8 | data}
 9 | \description{
10 | TCGAutils is a toolbox to work with TCGA specific datasets. It allows the
11 | user to manipulate and translate TCGA barcodes, conveniently convert a list
12 | of data files to \link[GenomicRanges:GRangesList-class]{GRangesList}. Take
13 | datasets from GISTIC and return a
14 | \link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}
15 | class object. The package also provides functions for working with data from
16 | the \code{curatedTCGAData}
17 | experiment data package. It provides convenience functions for extracting
18 | subtype metadata and adding clinical data to existing
19 | \link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}
20 | objects.
21 | }
22 | \seealso{
23 | Useful links:
24 | \itemize{
25 |   \item Report bugs at \url{https://github.com/waldronlab/TCGAutils/issues}
26 | }
27 | 
28 | }
29 | \author{
30 | \strong{Maintainer}: Marcel Ramos \email{marcel.ramos@sph.cuny.edu} (\href{https://orcid.org/0000-0002-3242-0582}{ORCID})
31 | 
32 | Authors:
33 | \itemize{
34 |   \item Lucas Schiffer
35 |   \item Levi Waldron
36 | }
37 | 
38 | Other contributors:
39 | \itemize{
40 |   \item Sean Davis [contributor]
41 | }
42 | 
43 | }
44 | 


--------------------------------------------------------------------------------
/man/builds.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/builds.R
  3 | \name{builds}
  4 | \alias{builds}
  5 | \alias{translateBuild}
  6 | \alias{correctBuild}
  7 | \alias{isCorrect}
  8 | \alias{extractBuild}
  9 | \alias{uniformBuilds}
 10 | \title{Utilities for working with \emph{HUMAN} genome builds}
 11 | \usage{
 12 | translateBuild(from, to = c("UCSC", "NCBI"))
 13 | 
 14 | correctBuild(build, style = c("UCSC", "NCBI"))
 15 | 
 16 | isCorrect(build, style = c("UCSC", "NCBI"))
 17 | 
 18 | extractBuild(string, build = c("UCSC", "NCBI"))
 19 | 
 20 | uniformBuilds(builds, cutoff = 0.2, na = c("", "NA"))
 21 | }
 22 | \arguments{
 23 | \item{from}{character() A vector of build versions typically from \code{genome()}
 24 | (e.g., "37"). The build vector must be homogeneous (i.e.,
 25 | \code{length(unique(x)) == 1L}).}
 26 | 
 27 | \item{to}{character(1) The name of the desired build version (either "UCSC"
 28 | or "NCBI"; default: "UCSC")}
 29 | 
 30 | \item{build}{A vector of build version names (default UCSC, NCBI)}
 31 | 
 32 | \item{style}{character(1) The annotation style, either 'UCSC' or 'NCBI'}
 33 | 
 34 | \item{string}{A single character string}
 35 | 
 36 | \item{builds}{A character vector of builds}
 37 | 
 38 | \item{cutoff}{numeric(1L) An inclusive threshold tolerance value for missing
 39 | values and translating builds that are below the threshold}
 40 | 
 41 | \item{na}{character() The values to be considered as missing (default:
 42 | c("", "NA"))}
 43 | }
 44 | \value{
 45 | \if{html}{\out{<div class="sourceCode">}}\preformatted{translateBuild: A character vector of translated genome builds
 46 | 
 47 | extractBuild: A character string of the build information available
 48 | 
 49 | uniformBuilds: A character vector of builds where all builds are
 50 |     identical `identical(length(unique(build)), 1L)`
 51 | 
 52 | correctBuild: A character string of the 'corrected' build name
 53 | 
 54 | isCorrect: A logical indicating if the build is exactly as annotated
 55 | }\if{html}{\out{</div>}}
 56 | }
 57 | \description{
 58 | A few functions are available to search for build versions,
 59 | either from NCBI or UCSC.
 60 | 
 61 | \itemize{
 62 | \item \code{translateBuild}: translates between UCSC and NCBI build
 63 | versions
 64 | \item \code{extractBuild}: use grep patterns to find the first build
 65 | within the string input
 66 | \item \code{uniformBuilds}: replace build occurrences below a threshold
 67 | level of occurrence with the alternative build
 68 | \item \code{correctBuild}: Ensure that the build annotation is correct
 69 | based on the NCBI/UCSC website. If not, use \code{translateBuild} with
 70 | the indicated 'style' input
 71 | \item \code{isCorrect}: Check to see if the build is exactly as annotated
 72 | }
 73 | }
 74 | \details{
 75 | The \code{correctBuild} function takes the input and ensures that
 76 | the style specified matches the input. Otherwise, it will
 77 | return the correct style for use with  \code{seqlevelsStyle}.
 78 | Currently, the function does not support patched builds
 79 | (e.g., 'GRCh38.p13'). Build names are taken from the website:
 80 | \url{https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.26/}
 81 | }
 82 | \examples{
 83 | 
 84 | translateBuild("GRCh35", "UCSC")
 85 | 
 86 | 
 87 | correctBuild("grch38", "NCBI")
 88 | correctBuild("hg19", "NCBI")
 89 | 
 90 | 
 91 | isCorrect("GRCh38", "NCBI")
 92 | 
 93 | isCorrect("hg19", "UCSC")
 94 | 
 95 | 
 96 | extractBuild(
 97 | "SCENA_p_TCGAb29and30_SNP_N_GenomeWideSNP_6_G05_569110.nocnv_grch38.seg.txt"
 98 | )
 99 | 
100 | 
101 | buildvec <- rep(c("GRCh37", "hg19"), times = c(5, 1))
102 | uniformBuilds(buildvec)
103 | 
104 | navec <- c(rep(c("GRCh37", "hg19"), times = c(5, 1)), "NA")
105 | uniformBuilds(navec)
106 | 
107 | }
108 | 


--------------------------------------------------------------------------------
/man/clinicalNames.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{clinicalNames}
 5 | \alias{clinicalNames}
 6 | \title{Clinical dataset names in TCGA}
 7 | \format{
 8 | A \link[IRanges:AtomicList-class]{CharacterList} of names for 33
 9 | cancer codes
10 | }
11 | \usage{
12 | data("clinicalNames")
13 | }
14 | \value{
15 | The clinical dataset column names in TCGA as provided by the
16 | \code{RTCGAToolbox}
17 | }
18 | \description{
19 | A dataset of names for each of the TCGA cancer codes available.
20 | These names were obtained by the clinical datasets from
21 | \link[RTCGAToolbox:getFirehoseData]{getFirehoseData}. They serve to subset the
22 | current datasets provided by \code{curatedTCGAData}.
23 | }
24 | \keyword{datasets}
25 | 


--------------------------------------------------------------------------------
/man/curatedTCGAData-helpers.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/curatedTCGAData-helpers.R
 3 | \name{curatedTCGAData-helpers}
 4 | \alias{curatedTCGAData-helpers}
 5 | \alias{getSubtypeMap}
 6 | \alias{getClinicalNames}
 7 | \alias{TCGAsplitAssays}
 8 | \alias{sampleTables}
 9 | \title{Helper functions for managing MultiAssayExperiment from
10 | curatedTCGAData}
11 | \usage{
12 | getSubtypeMap(multiassayexperiment)
13 | 
14 | getClinicalNames(diseaseCode)
15 | 
16 | TCGAsplitAssays(multiassayexperiment, sampleCodes = NULL, exclusive = FALSE)
17 | 
18 | sampleTables(multiassayexperiment, vial = FALSE)
19 | }
20 | \arguments{
21 | \item{multiassayexperiment}{A
22 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}}
23 | object}
24 | 
25 | \item{diseaseCode}{A TCGA cancer code (e.g., "BRCA")}
26 | 
27 | \item{sampleCodes}{character (default NULL) A string of sample type codes
28 | (refer to \code{data(sampleTypes)}; \code{TCGAsplitAssays} section)}
29 | 
30 | \item{exclusive}{logical (default FALSE) Whether to return only assays that
31 | contain all codes in \code{sampleCodes}}
32 | 
33 | \item{vial}{(logical default FALSE) whether to display vials in the
34 | table output}
35 | }
36 | \value{
37 | \itemize{
38 | \item{getSubtypeMap}: A \code{data.frame} with explanatory names
39 | and their in-data variable names. They may not be present for all
40 | cancer types.
41 | \item{getClinicalNames}: A \code{vector} of common variable names that
42 | may be found across several cancer disease codes.
43 | }
44 | }
45 | \description{
46 | Additional helper functions for cleaning and uncovering metadata
47 | within a downloaded \code{MultiAssayExperiment} from \code{curatedTCGAData}.
48 | }
49 | \details{
50 | Note that for \code{getSubtypeMap}, the column of in-data variable names
51 | may need to go through \code{make.names} to be found in the \code{colData} of the
52 | \code{MultiAssayExperiment}.
53 | }
54 | \section{getSubtypeMap}{
55 |  provides a two column \code{data.frame} with
56 | interpreted names and in-data variable names. 'Name' usually refers to the
57 | \code{colData} row names a.k.a. the \code{patientID}.
58 | }
59 | 
60 | \section{getClinicalNames}{
61 |  provides a vector of common variable names that
62 | exist in the \code{colData} \code{DataFrame} of a \code{curatedTCGAData}
63 | \code{MultiAssayExperiment} object. These variables are directly obtained
64 | from the BroadFirehose clinical data (downloaded with
65 | \link[RTCGAToolbox]{getFirehoseData}) and tend to be present across cancer
66 | disease codes.
67 | }
68 | 
69 | \section{TCGAsplitAssays}{
70 | 
71 | Separates samples by indicated sample codes into different assays
72 | in a \code{MultiAssayExperiment}. Refer to the \code{sampleTypes}
73 | data object for a list of available codes. This operation generates
74 | \strong{n} times the number of assays based on the number of sample codes
75 | entered. By default, all assays will be split by samples present in
76 | the data.
77 | }
78 | 
79 | \section{sampleTables}{
80 | 
81 | Display all the available samples in each of the assays
82 | }
83 | 
84 | \examples{
85 | 
86 | library(curatedTCGAData)
87 | 
88 | gbm <- curatedTCGAData("GBM", c("RPPA*", "CNA*"), version = "2.0.1", FALSE)
89 | 
90 | getSubtypeMap(gbm)
91 | 
92 | sampleTables(gbm)
93 | 
94 | TCGAsplitAssays(gbm, c("01", "10"))
95 | 
96 | getClinicalNames("COAD")
97 | 
98 | }
99 | 


--------------------------------------------------------------------------------
/man/diseaseCodes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{diseaseCodes}
 5 | \alias{diseaseCodes}
 6 | \title{TCGA Cancer Disease Codes Table}
 7 | \format{
 8 | A data frame with 37 rows and 4 variables:
 9 | \itemize{
10 | \item Study.Abbreviation: Disease Code used in TCGA
11 | \item Available: Cancer datasets available via curatedTCGAData
12 | \item SubtypeData: Subtype curation data available via curatedTCGAData
13 | \item Study.Name: The full length study name (i.e., type of cancer)
14 | }
15 | }
16 | \source{
17 | \url{https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations}
18 | }
19 | \usage{
20 | data("diseaseCodes")
21 | }
22 | \value{
23 | The TCGA \code{diseaseCodes} table
24 | }
25 | \description{
26 | A dataset for obtaining the cancer codes in TCGA for about 13 different
27 | types of cancers.
28 | }
29 | \keyword{datasets}
30 | 


--------------------------------------------------------------------------------
/man/findGRangesCols.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/findGRangesCols.R
 3 | \name{findGRangesCols}
 4 | \alias{findGRangesCols}
 5 | \title{Obtain minimum necessary names for the creation of a GRangesList object}
 6 | \usage{
 7 | findGRangesCols(
 8 |   df_colnames,
 9 |   seqnames.field = c("seqnames", "seqname", "chromosome", "chrom", "chr",
10 |     "chromosome_name", "seqid", "om"),
11 |   start.field = "start",
12 |   end.field = c("end", "stop"),
13 |   strand.field = "strand",
14 |   ignore.strand = FALSE
15 | )
16 | }
17 | \arguments{
18 | \item{df_colnames}{A \code{character} vector of names in a dataset}
19 | 
20 | \item{seqnames.field}{A \code{character} vector of the chromosome name}
21 | 
22 | \item{start.field}{A \code{character} vector that indicates the column name
23 | of the start positions of ranged data}
24 | 
25 | \item{end.field}{A \code{character} vector that indicates the end position
26 | of ranged data}
27 | 
28 | \item{strand.field}{A \code{character} vector of the column name that
29 | indicates the strand type}
30 | 
31 | \item{ignore.strand}{logical (default FALSE) whether to ignore the strand
32 | field in the data}
33 | }
34 | \value{
35 | Index positions vector indicating columns with appropriate names
36 | }
37 | \description{
38 | This function attempts to match chromosome, start position, end position and
39 | strand names in the given character vector. Modified helper from the
40 | \code{GenomicRanges} package.
41 | }
42 | \examples{
43 | myDataColNames <- c("Start_position", "End_position", "strand",
44 |                  "chromosome", "num_probes", "segment_mean")
45 | findGRangesCols(myDataColNames)
46 | 
47 | }
48 | 


--------------------------------------------------------------------------------
/man/generateMap.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/generateMap.R
 3 | \name{generateMap}
 4 | \alias{generateMap}
 5 | \title{Create a sampleMap from an experiment list and phenoData dataframe}
 6 | \usage{
 7 | generateMap(
 8 |   experiments,
 9 |   colData,
10 |   idConverter = identity,
11 |   sampleCol,
12 |   patientCol,
13 |   ...
14 | )
15 | }
16 | \arguments{
17 | \item{experiments}{A named \code{list} of experiments compatible with the
18 | \code{MultiAssayExperiment} API}
19 | 
20 | \item{colData}{A \code{data.frame} of clinical data with patient identifiers
21 | as rownames}
22 | 
23 | \item{idConverter}{A function to be used against the sample or specimen
24 | identifiers to match those in the rownames of the \code{colData}
25 | (default \code{identity})}
26 | 
27 | \item{sampleCol}{A single string indicating the sample identifiers
28 | column in the colData dataset}
29 | 
30 | \item{patientCol}{A single string indicating the patient identifiers
31 | in colData, "row.names" extracts the colData row names}
32 | 
33 | \item{...}{Additional arguments to pass to the 'idConverter' function.}
34 | }
35 | \value{
36 | A \code{DataFrame} class object of mapped samples and patient
37 | identifiers including assays
38 | }
39 | \description{
40 | This function helps create a sampleMap in preparation of a
41 | \code{MultiAssayExperiment} object. This is especially useful when the
42 | sample identifiers are not very different, as in the case of TCGA barcodes.
43 | An \code{idConverter} function can be provided to truncate such sample
44 | identifiers and obtain patient identifiers.
45 | }
46 | \examples{
47 | ## Minimal example
48 | expList <- list(assay1 = matrix(1:6, ncol = 2L,
49 |         dimnames = list(paste0("feature", 1:3), c("A-J", "B-J"))),
50 |     assay2 = matrix(1:4, ncol = 2,
51 |         dimnames = list(paste0("gene", 1:2), c("A-L", "B-L"))))
52 | 
53 | ## Mock colData
54 | myPheno <- data.frame(var1 = c("Yes", "No"), var2 = c("High", "Low"),
55 |     row.names = c("a", "b"))
56 | 
57 | ## A look at the identifiers
58 | vapply(expList, colnames, character(2L))
59 | rownames(myPheno)
60 | 
61 | ## Use 'idConverter' to correspond sample names to patient identifiers
62 | generateMap(expList, myPheno,
63 |     idConverter = function(x) substr(tolower(x), 1L, 1L))
64 | 
65 | }
66 | \author{
67 | M. Ramos, M. Morgan, L. Schiffer
68 | }
69 | 


--------------------------------------------------------------------------------
/man/getFileName.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/getFileName.R
 3 | \name{getFileName}
 4 | \alias{getFileName}
 5 | \title{Find the file names used in RTCGAToolbox}
 6 | \usage{
 7 | getFileName(
 8 |   disease,
 9 |   runDate = "20160128",
10 |   dataType = c("CNASNP", "CNVSNP", "CNAseq", "CNACGH", "Mutation")
11 | )
12 | }
13 | \arguments{
14 | \item{disease}{The TCGA cancer disease code, e.g., "COAD"}
15 | 
16 | \item{runDate}{The single \code{string} used in the \code{getFirehoseData}
17 | function (default "20160128")}
18 | 
19 | \item{dataType}{A single character vector (default "CNASNP") indicating the
20 | data type for which to get the source file name}
21 | }
22 | \value{
23 | A single \code{character} file name
24 | }
25 | \description{
26 | Part of this function is from the RTCGAToolbox. It aims to extract the file
27 | name used inside of the \link[RTCGAToolbox]{getFirehoseData} function.
28 | The arguments of the function parallel those in the
29 | \link[RTCGAToolbox]{getFirehoseData} function. It is only available for
30 | select data types.
31 | }
32 | \examples{
33 | 
34 | getFileName("COAD", dataType = "CNASNP")
35 | 
36 | }
37 | 


--------------------------------------------------------------------------------
/man/hidden-helpers.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/simplifyTCGA.R
 3 | \name{hidden-helpers}
 4 | \alias{hidden-helpers}
 5 | \alias{.makeListRanges}
 6 | \alias{.getRangesOfSYMBOLS}
 7 | \title{A small document for helper functions}
 8 | \usage{
 9 | .makeListRanges(x, gn)
10 | 
11 | .getRangesOfSYMBOLS(x)
12 | }
13 | \arguments{
14 | \item{x}{A character vector}
15 | 
16 | \item{gn}{A GRanges object with some of its names found in x}
17 | }
18 | \value{
19 | A list of length 2: unmapped (character vector) and mapped (GRanges)
20 | 
21 | list of length 2: "unmapped" is a character vector providing
22 | unmapped symbols, "mapped" is a GRanges object with ranges of mapped symbols
23 | }
24 | \description{
25 | A small document for helper functions
26 | }
27 | \keyword{internal}
28 | 


--------------------------------------------------------------------------------
/man/imputeAssay.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/imputeAssay.R
 3 | \name{imputeAssay}
 4 | \alias{imputeAssay}
 5 | \title{This function imputes assay values inside a
 6 | \code{MultiAssayExperiment}}
 7 | \usage{
 8 | imputeAssay(multiassayexperiment, i = 1, ...)
 9 | }
10 | \arguments{
11 | \item{multiassayexperiment}{A \code{MultiAssayExperiment} with genes in the
12 | rows, samples in the columns}
13 | 
14 | \item{i}{A numeric, logical, or character \code{vector} indicating the
15 | assays to perform imputation on (default 1L)}
16 | 
17 | \item{...}{
18 |   Arguments passed on to \code{\link[impute:impute.knn]{impute::impute.knn}}
19 |   \describe{
20 |     \item{\code{data}}{An expression matrix with genes in the rows, samples in the columns}
21 |     \item{\code{k}}{Number of neighbors to be used in the
22 |     imputation (default=10)}
23 |     \item{\code{rowmax}}{The maximum percent missing data allowed in any row
24 |     (default 50\%). For any rows with more than \code{rowmax}\% missing
25 |     are imputed using the overall mean per sample.}
26 |     \item{\code{colmax}}{The maximum percent missing data allowed in any column
27 |     (default 80\%). If any column has more than \code{colmax}\% missing data,
28 |     the program halts and reports an error.}
29 |     \item{\code{maxp}}{The largest block of genes imputed using the knn
30 |     algorithm inside \code{impute.knn} (default
31 |     1500); larger blocks are divided by two-means clustering
32 |     (recursively) prior to imputation. If \code{maxp=p}, only knn
33 |     imputation is done.}
34 |     \item{\code{rng.seed}}{The seed used for the random number generator (default
35 |     362436069) for reproducibility.}
36 |   }}
37 | }
38 | \value{
39 | A \code{MultiAssayExperiment} with imputed assay values
40 | }
41 | \description{
42 | This function allows the user to enter a
43 | \code{MultiAssayExperiment} and impute all the NA values inside assays.
44 | }
45 | \examples{
46 | 
47 | example(getSubtypeMap)
48 | 
49 | ## convert data to matrix and add as experiment
50 | gbm <-
51 |   c(gbm, RPPA_matrix = data.matrix(assay(gbm[["GBM_RPPAArray-20160128"]])))
52 | 
53 | imputeAssay(gbm, i = "RPPA_matrix")
54 | 
55 | }
56 | 


--------------------------------------------------------------------------------
/man/makeGRangesListFromCopyNumber.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/makeGRangesListFromCopyNumber.R
 3 | \name{makeGRangesListFromCopyNumber}
 4 | \alias{makeGRangesListFromCopyNumber}
 5 | \title{Make a GRangesList from TCGA Copy Number data}
 6 | \usage{
 7 | makeGRangesListFromCopyNumber(
 8 |   df,
 9 |   split.field,
10 |   names.field = "Hugo_Symbol",
11 |   ...
12 | )
13 | }
14 | \arguments{
15 | \item{df}{A \code{data.frame} or \code{DataFrame} class object. \code{list}
16 | class objects are coerced to \code{data.frame} or \code{DataFrame}.}
17 | 
18 | \item{split.field}{A \code{character} vector of length one indicating
19 | the column to be used as sample identifiers}
20 | 
21 | \item{names.field}{A \code{character} vector of length one indicating the
22 | column to be used as names for each of the ranges in the data}
23 | 
24 | \item{...}{Additional arguments to pass on to
25 | \link[GenomicRanges:makeGRangesListFromDataFrame]{GenomicRanges::makeGRangesListFromDataFrame}}
26 | }
27 | \value{
28 | A \link[GenomicRanges:GRangesList-class]{GRangesList} class object
29 | }
30 | \description{
31 | \code{makeGRangesListFromCopyNumber} allows the user to convert objects of
32 | class \code{data.frame} or \link[S4Vectors:DataFrame-class]{S4Vectors::DataFrame} to a
33 | \link[GenomicRanges:GRangesList-class]{GRangesList}. It includes additional
34 | features specific to TCGA data such as, hugo symbols, probe numbers, segment
35 | means, and ucsc build (if available).
36 | }
37 | \examples{
38 | library(GenomicDataCommons)
39 | 
40 | manif <- files() |>
41 |     filter(~ cases.project.project_id == "TCGA-COAD" &
42 |         data_type == "Copy Number Segment") |>
43 |     manifest(size = 1)
44 | 
45 | fname <- gdcdata(manif$id)
46 | 
47 | barcode <- UUIDtoBarcode(names(fname), from_type = "file_id")
48 | barcode <- barcode[["associated_entities.entity_submitter_id"]]
49 | 
50 | cndata <- read.delim(fname[[1L]], nrows = 10L)
51 | 
52 | cngrl <- makeGRangesListFromCopyNumber(cndata, split.field = "GDC_Aliquot",
53 |     keep.extra.columns = TRUE)
54 | 
55 | names(cngrl) <- barcode
56 | GenomeInfoDb::genome(cngrl) <- extractBuild(fname[[1L]])
57 | cngrl
58 | 
59 | }
60 | 


--------------------------------------------------------------------------------
/man/makeGRangesListFromExonFiles.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/makeGRangesListFromExonFiles.R
 3 | \name{makeGRangesListFromExonFiles}
 4 | \alias{makeGRangesListFromExonFiles}
 5 | \title{Read exon-level expression files and create a \code{GRangesList}}
 6 | \usage{
 7 | makeGRangesListFromExonFiles(
 8 |   filepaths,
 9 |   sampleNames = NULL,
10 |   fileNames = basename(filepaths),
11 |   getBarcodes = TRUE,
12 |   rangesColumn = "exon",
13 |   nrows = Inf
14 | )
15 | }
16 | \arguments{
17 | \item{filepaths}{character() vector of file paths containing TCGA exon
18 | data usually obtained from the GDC}
19 | 
20 | \item{sampleNames}{character() vector of TCGA barcodes to be used as
21 | names for the \code{GRangesList} output (default NULL)}
22 | 
23 | \item{fileNames}{character() vector of file names as downloaded from
24 | the Genomic Data Commons Legacy archive (default \code{basename(filepaths)})}
25 | 
26 | \item{getBarcodes}{logical(1). Whether to query the GDC API with the
27 | \code{filenameToBarcode} and obtain the TCGA barcodes from the file names
28 | (default TRUE); see details.}
29 | 
30 | \item{rangesColumn}{character(1). The name of the column in the data
31 | containing the ranges information (default "exon"); see details.}
32 | 
33 | \item{nrows}{numeric(1). The number of rows to return from each of the files
34 | read in (all rows by default; default Inf)}
35 | }
36 | \value{
37 | A \link[GenomicRanges:GRangesList-class]{GRangesList} object
38 | }
39 | \description{
40 | This function serves to read exon-level expression data. It works for exon
41 | quantification (raw counts and RPKM) and junction quantification
42 | (raw counts) file paths and represents such data as a
43 | \link[GenomicRanges:GRangesList-class]{GRangesList}. The data files can be
44 | downloaded via the Genomic Data Commons (GDC) Legacy Archive.
45 | }
46 | \details{
47 | The \code{rangesColumn} name in the GDC data files is usually "exon"
48 | but can be changed with the \code{rangesColumn} argument, if different.
49 | To avoid programmatically obtaining TCGA barcodes from the GDC
50 | API, set the \code{getBarcodes} to \code{FALSE}. When \code{getBarcodes} is set to
51 | \code{FALSE}, the file names are used to name the elements of the \code{GRangesList}
52 | output.
53 | }
54 | \examples{
55 | 
56 | ## Load example file found in package
57 | pkgDir <- system.file("extdata", package = "TCGAutils", mustWork = TRUE)
58 | exonFile <- list.files(pkgDir, pattern = "cation\\\\.txt$", full.names = TRUE)
59 | 
60 | filePrefix <- "unc.edu.32741f9a-9fec-441f-96b4-e504e62c5362.1755371."
61 | 
62 | ## Add actual file name manually (due to Windows OS restriction)
63 | makeGRangesListFromExonFiles(exonFile,
64 |     fileNames = paste0(filePrefix, basename(exonFile)),
65 |     sampleNames = "TCGA-AA-3678-01A-01R-0905-07")
66 | 
67 | }
68 | \author{
69 | M. Ramos
70 | }
71 | 


--------------------------------------------------------------------------------
/man/mergeColData.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/simplifyColData.R
 3 | \name{mergeColData}
 4 | \alias{mergeColData}
 5 | \title{Take a MultiAssayExperiment and include curated variables}
 6 | \usage{
 7 | mergeColData(MultiAssayExperiment, colData)
 8 | }
 9 | \arguments{
10 | \item{MultiAssayExperiment}{A
11 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}}
12 | object}
13 | 
14 | \item{colData}{A \code{DataFrame} or \code{data.frame} to merge with
15 | clinical data in the \code{MultiAssayExperiment} object}
16 | }
17 | \value{
18 | A
19 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}}
20 | object
21 | }
22 | \description{
23 | This function works on the \code{colData} of a
24 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}}
25 | object to merge curated variable columns or other clinical variables that
26 | the user would like to add. It is recommended that the user run the scripts in
27 | the \code{MultiAssayExperiment.TCGA} repository that build the "enhanced" type of
28 | data but not necessary if using different clinical data. Please see the
29 | repository's README for more information.
30 | }
31 | \examples{
32 | 
33 | library(MultiAssayExperiment)
34 | 
35 | mergeColData(MultiAssayExperiment(), S4Vectors::DataFrame())
36 | 
37 | }
38 | 


--------------------------------------------------------------------------------
/man/oncoPrintTCGA.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/oncoPrintTCGA.R
 3 | \name{oncoPrintTCGA}
 4 | \alias{oncoPrintTCGA}
 5 | \title{OncoPrint for TCGA Mutation Assays}
 6 | \usage{
 7 | oncoPrintTCGA(
 8 |   multiassayexperiment,
 9 |   matchassay = "*_Mutation-*",
10 |   variantCol = "Variant_Classification",
11 |   brewerPal = "Set3",
12 |   ntop = 25,
13 |   incl.thresh = 0.01,
14 |   rowcol = "Hugo_Symbol"
15 | )
16 | }
17 | \arguments{
18 | \item{multiassayexperiment}{A \code{MultiAssayExperiment}, usually from
19 | \code{curatedTCGAData}}
20 | 
21 | \item{matchassay}{character(1) The name of the assay containing mutation
22 | data, this can be a pattern (e.g., "*_Mutation-*", the default)}
23 | 
24 | \item{variantCol}{character(1) The name of the metadata column containing
25 | the mutation categories, usually "Variant_Classification" in TCGA}
26 | 
27 | \item{brewerPal}{character(1) The name of the \code{RColorBrewer::brewer.pal}
28 | palette, (default: "Set3")}
29 | 
30 | \item{ntop}{integer(1) The number of the top N genes for displaying based
31 | on per-sample mutation frequency}
32 | 
33 | \item{incl.thresh}{double(1) The inclusion threshold for empirical mutations,
34 | mutations less frequent than this value will not be included}
35 | 
36 | \item{rowcol}{character(1) The name of the column in the metadata to annotate
37 | the rows with, e.g., "Hugo_Symbol" (default)}
38 | }
39 | \value{
40 | An oncoPrint plot of mutations
41 | }
42 | \description{
43 | OncoPrint for TCGA Mutation Assays
44 | }
45 | \examples{
46 | 
47 | library(curatedTCGAData)
48 | 
49 | acc <- curatedTCGAData("ACC", "Mutation", version = "1.1.38", FALSE)
50 | 
51 | oncoPrintTCGA(acc)
52 | 
53 | }
54 | 


--------------------------------------------------------------------------------
/man/sampleTypes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{sampleTypes}
 5 | \alias{sampleTypes}
 6 | \title{Barcode Sample Type Table}
 7 | \format{
 8 | A data frame with 19 rows and 3 variables:
 9 | \itemize{
10 | \item Code: Two digit code number found in the barcode
11 | \item Definition: Long name for the sample type
12 | \item Short.Letter.Code: Letter code for the sample type
13 | }
14 | }
15 | \source{
16 | \url{https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/sample-type-codes}
17 | }
18 | \usage{
19 | data("sampleTypes")
20 | }
21 | \value{
22 | The TCGA \code{sampleTypes} table
23 | }
24 | \description{
25 | A dataset that contains the mappings for sample codes in the TCGA
26 | barcodes.
27 | }
28 | \keyword{datasets}
29 | 


--------------------------------------------------------------------------------
/man/simplifyTCGA-defunct.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/simplifyTCGA.R
 3 | \name{simplifyTCGA-defunct}
 4 | \alias{simplifyTCGA-defunct}
 5 | \alias{mirToRanges}
 6 | \title{Defunct TCGAutils functions}
 7 | \usage{
 8 | mirToRanges(obj, keep.assay = FALSE, unmapped = TRUE)
 9 | }
10 | \arguments{
11 | \item{obj}{A \code{MultiAssayExperiment} object obtained from \code{curatedTCGAData}}
12 | 
13 | \item{keep.assay}{logical (default FALSE) Whether to keep the
14 | \code{SummarizedExperiment} assays that have been converted to
15 | \code{RangedSummarizedExperiment}}
16 | 
17 | \item{unmapped}{logical (default TRUE) Include an assay of data that was
18 | not able to be mapped in reference database}
19 | }
20 | \description{
21 | \code{mirToRanges} is defunct and will be removed in the next
22 | release. The \code{mirbase.db} package is currently deprecated in \code{RELEASE_3_21}.
23 | }
24 | 


--------------------------------------------------------------------------------
/man/simplifyTCGA.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/simplifyTCGA.R
  3 | \name{simplifyTCGA}
  4 | \alias{simplifyTCGA}
  5 | \alias{symbolsToRanges}
  6 | \alias{CpGtoRanges}
  7 | \alias{qreduceTCGA}
  8 | \title{Functions to convert rows annotations to ranges and RaggedExperiment
  9 | to RangedSummarizedExperiment}
 10 | \usage{
 11 | simplifyTCGA(obj, keep.assay = FALSE, unmapped = TRUE)
 12 | 
 13 | symbolsToRanges(obj, keep.assay = FALSE, unmapped = TRUE)
 14 | 
 15 | CpGtoRanges(obj, keep.assay = FALSE, unmapped = TRUE)
 16 | 
 17 | qreduceTCGA(obj, keep.assay = FALSE, suffix = "_simplified")
 18 | }
 19 | \arguments{
 20 | \item{obj}{A \code{MultiAssayExperiment} object obtained from \code{curatedTCGAData}}
 21 | 
 22 | \item{keep.assay}{logical (default FALSE) Whether to keep the
 23 | \code{SummarizedExperiment} assays that have been converted to
 24 | \code{RangedSummarizedExperiment}}
 25 | 
 26 | \item{unmapped}{logical (default TRUE) Include an assay of data that was
 27 | not able to be mapped in reference database}
 28 | 
 29 | \item{suffix}{character (default "_simplified") A character string to append
 30 | to the newly modified assay for \code{qreduceTCGA}.}
 31 | }
 32 | \value{
 33 | A
 34 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}}
 35 | with any gene expression, miRNA, copy number, and mutations converted to
 36 | \code{\link[SummarizedExperiment:RangedSummarizedExperiment-class]{RangedSummarizedExperiment}}
 37 | objects
 38 | }
 39 | \description{
 40 | This group of functions will convert row annotations as
 41 | either gene symbols or miRNA symbols to row ranges based on database
 42 | resources 'TxDB' and 'org.Hs' packages. It will also simplify the
 43 | representation of
 44 | \link[RaggedExperiment:RaggedExperiment-class]{RaggedExperiment} objects to
 45 | \link[SummarizedExperiment:RangedSummarizedExperiment-class]{RangedSummarizedExperiment}.
 46 | }
 47 | \details{
 48 | The original \code{SummarizedExperiment} containing either gene symbol
 49 | or miR annotations is replaced or supplemented by a
 50 | \link[SummarizedExperiment:RangedSummarizedExperiment-class]{RangedSummarizedExperiment}
 51 | for those that could be mapped to
 52 | \link[GenomicRanges:GRanges-class]{GRanges}, and optionally another
 53 | \link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}
 54 | for annotations that could not be mapped to
 55 | \link[GenomicRanges:GRanges-class]{GRanges}.
 56 | }
 57 | \section{qreduceTCGA}{
 58 | 
 59 | 
 60 | Using \code{TxDb.Hsapiens.UCSC.hg19.knownGene} as the reference, \code{qreduceTCGA}
 61 | reduces the data by applying either the \code{weightedmean} or \code{nonsilent}
 62 | function (see below) to non-mutation or mutation data, respectively.
 63 | Internally, it uses \code{\link[RaggedExperiment:assay-functions]{RaggedExperiment::qreduceAssay()}} to reduce the ranges
 64 | to the gene-level.
 65 | 
 66 | \code{qreduceTCGA} will update \code{genome(x)} based on the NCBI reference annotation
 67 | which includes the patch number, e.g., GRCh37.p14, as provided by the
 68 | \code{seqlevelsStyle} setter, \code{seqlevelsStyle(gn) <- "NCBI"}. \code{qreduceTCGA}
 69 | uses the NCBI genome annotation as the default reference.
 70 | 
 71 | \if{html}{\out{<div class="sourceCode">}}\preformatted{nonsilent <- function(scores, ranges, qranges)
 72 |     any(scores != "Silent")
 73 | }\if{html}{\out{</div>}}
 74 | 
 75 | \code{RaggedExperiment} mutation objects become a genes by patients
 76 | \code{RangedSummarizedExperiment} object containing '1' if there is a non-silent
 77 | mutation somewhere in the gene, and '0' otherwise as obtained from the
 78 | \code{Variant_Classification} column in the data.
 79 | 
 80 | \if{html}{\out{<div class="sourceCode">}}\preformatted{weightedmean <- function(scores, ranges, qranges) \{
 81 |     isects <- GenomicRanges::pintersect(ranges, qranges)
 82 |     sum(scores * BiocGenerics::width(isects)) /
 83 |         sum(BiocGenerics::width(isects))
 84 | \}
 85 | }\if{html}{\out{</div>}}
 86 | 
 87 | "CNA" and "CNV" segmented copy number are reduced using a weighted mean in
 88 | the rare cases of overlapping (non-disjoint) copy number regions.
 89 | 
 90 | These functions rely on \code{TxDb.Hsapiens.UCSC.hg19.knownGene} and
 91 | \code{org.Hs.eg.db} to map to the 'hg19' NCBI build. Use the \code{liftOver} procedure
 92 | for datasets that are provided against a different reference genome (usually
 93 | 'hg18'). See an example in the vignette.
 94 | }
 95 | 
 96 | \examples{
 97 | 
 98 | library(curatedTCGAData)
 99 | library(GenomeInfoDb)
100 | 
101 | accmae <-
102 |     curatedTCGAData(diseaseCode = "ACC",
103 |     assays = c("CNASNP", "Mutation", "miRNASeqGene", "GISTICT"),
104 |     version = "1.1.38",
105 |     dry.run = FALSE)
106 | 
107 | ## update genome annotation
108 | rex <- accmae[["ACC_Mutation-20160128"]]
109 | 
110 | ## Translate build to "hg19"
111 | tgenome <- vapply(genome(rex), translateBuild, character(1L))
112 | genome(rex) <- tgenome
113 | 
114 | accmae[["ACC_Mutation-20160128"]] <- rex
115 | 
116 | simplifyTCGA(accmae)
117 | 
118 | }
119 | \author{
120 | L. Waldron
121 | }
122 | 


--------------------------------------------------------------------------------
/man/trimColData.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/simplifyColData.R
 3 | \name{trimColData}
 4 | \alias{trimColData}
 5 | \title{Minimize the number of variables in colData}
 6 | \usage{
 7 | trimColData(
 8 |   multiassayexperiment,
 9 |   maxNAfrac = 0.2,
10 |   keystring = c("portion", "analyte")
11 | )
12 | }
13 | \arguments{
14 | \item{multiassayexperiment}{A
15 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}}
16 | object with \code{colData}}
17 | 
18 | \item{maxNAfrac}{(numeric default 0.2) A decimal between 0 and 1 to indicate
19 | the amount of NA values allowed per column}
20 | 
21 | \item{keystring}{(character) A vector of keywords to match and remove
22 | variables}
23 | }
24 | \value{
25 | A
26 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}}
27 | object
28 | }
29 | \description{
30 | This function removes variables that have a high proportion of missing values
31 | and contain keywords.
32 | }
33 | \examples{
34 | 
35 | example(getSubtypeMap)
36 | 
37 | (gbm_trimmed <- trimColData(gbm))
38 | 
39 | head(colData(gbm_trimmed))[1:5]
40 | 
41 | }
42 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | 
3 | library(TCGAutils)
4 | 
5 | test_check("TCGAutils")
6 | 


--------------------------------------------------------------------------------
/tests/testthat/test-ID-translation.R:
--------------------------------------------------------------------------------
  1 | context("ID translation testing")
  2 | 
  3 | test_that("barcodeToUUID translates correctly", { # Five barcode groups; each must yield its own ID column holding the pinned UUIDs.
  4 |     pts <- c("TCGA-06-6391", "TCGA-06-6700") # group 1: result is expected to carry a "case_id" column
  5 |     case_id <- barcodeToUUID(pts)
  6 |     expect_true("case_id" %in% names(case_id))
  7 |     expect_equal( # element-wise comparison, so the UUIDs must come back in input order
  8 |         case_id[["case_id"]],
  9 |         c("7a4c0a14-ac97-4c2b-a9cc-68cb561b2494",
 10 |         "3dddfc44-7bb1-4974-8a65-a84fd4bac484")
 11 |     )
 12 |     samps <- c("TCGA-06-6700-01A", "TCGA-AD-6888-01A") # group 2: expected column is "sample_ids"
 13 |     samp_id <- barcodeToUUID(samps)
 14 |     expect_true("sample_ids" %in% names(samp_id))
 15 |     expect_equal(
 16 |         samp_id[["sample_ids"]],
 17 |         c("8d35786c-5edb-4a84-b3e5-c401b8c73bd6",
 18 |         "ecf0f65b-bf3c-4d0e-899a-f209247cbe97")
 19 |     )
 20 |     analytes <- c("TCGA-AA-A00L-10A-01X", "TCGA-AA-A00L-10A-01D", # group 3: expected column is "analyte_ids"
 21 |         "TCGA-12-0653-10A-01D")
 22 |     analyte_ids <- barcodeToUUID(analytes)
 23 |     expect_true("analyte_ids" %in% names(analyte_ids))
 24 |     expect_equal(
 25 |         analyte_ids[["analyte_ids"]],
 26 |         c("4b6a77dc-7a2a-459e-a7a0-253f950f1c8c",
 27 |         "1c429d23-89eb-4c35-bef3-9eff2508d9d5",
 28 |         "63645523-bb46-40b3-899b-c3fa5fefd121")
 29 |     )
 30 |     portions <- c("TCGA-AA-A00L-10A-01", "TCGA-AA-A00L-01A-31", # group 4: expected column is "portion_ids"
 31 |         "TCGA-12-0653-10A-01")
 32 |     portion_ids <- barcodeToUUID(portions)
 33 |     expect_true("portion_ids" %in% names(portion_ids))
 34 |     expect_equal(
 35 |         portion_ids[["portion_ids"]],
 36 |         c("c72ff462-a355-49fa-8275-c34ef5dd91c9",
 37 |         "7d25aecc-9068-463b-adc5-71ec2f4ba7aa",
 38 |         "03209a36-67a0-48df-a9f7-a0cedd0db82f")
 39 |     )
 40 |     aliquots <- c("TCGA-12-0653-10A-01D-0333-01", # group 5: expected column is "aliquot_ids"
 41 |         "TCGA-12-0653-10A-01D-0334-04", "TCGA-AA-3556-01A-01D-1953-10")
 42 |     aliquot_ids <- barcodeToUUID(aliquots)
 43 |     expect_true("aliquot_ids" %in% names(aliquot_ids))
 44 |     expect_equal(aliquot_ids[["aliquot_ids"]],
 45 |         c("51ddbc44-1cae-454f-bc67-5c5cc3d9e853",
 46 |         "2f0fe3f0-6a24-47ee-acba-df9c04d89532",
 47 |         "2303247f-9691-4b38-bac2-8a30d6e08cc9")
 48 |     )
 49 | }) # NOTE(review): the expected UUIDs are hard-coded; presumably they come from a live lookup service - confirm before editing them
 50 | 
 51 | 
 52 | test_that("UUIDtoBarcode translates correctly", { # Checks the exact data.frame returned for file, aliquot and case UUID inputs.
 53 |     file_id <- c(
 54 |         "6b7d7a7f-f16d-472d-9b7b-3482c434cc99",
 55 |         "2ea70743-f3c6-4b01-8e20-9c8957a71229"
 56 |     )
 57 |     `associated_entities.entity_submitter_id` <- c( # variable names double as the expected column names in data.frame() below; do not rename
 58 |         "TCGA-NA-A4QY-01A-11D-A28Q-01", "TCGA-NA-A4QY-01A-11D-A28S-05"
 59 |     )
 60 |     resframe <- UUIDtoBarcode(file_id, from_type = "file_id")
 61 |     expect_identical( # strict comparison: column names, types and row order must all match
 62 |         resframe,
 63 |         data.frame(
 64 |             file_id,
 65 |             `associated_entities.entity_submitter_id`
 66 |         )
 67 |     )
 68 | 
 69 |     `portions.analytes.aliquots.aliquot_id` <- c( # input UUIDs for the from_type = "aliquot_ids" case
 70 |         "f8c7d038-1182-42d0-8787-b84b5ca57eaf",
 71 |         "b37ea112-340e-4613-8514-d8a8bd47410f",
 72 |         "4a9967bf-444c-4573-a082-121a30be7f3b"
 73 |     )
 74 |     `portions.analytes.aliquots.submitter_id` <- c( # expected barcodes, listed in the same order as the UUIDs above
 75 |         "TCGA-UF-A71A-06A-11D-A390-01",
 76 |         "TCGA-BB-4224-01A-01D-1432-01",
 77 |         "TCGA-CN-4735-01A-01D-1432-01"
 78 |     )
 79 |     resframe <- UUIDtoBarcode(
 80 |         `portions.analytes.aliquots.aliquot_id`, from_type = "aliquot_ids"
 81 |     )
 82 |     expect_identical(
 83 |         resframe,
 84 |         data.frame(
 85 |             `portions.analytes.aliquots.aliquot_id`,
 86 |             `portions.analytes.aliquots.submitter_id`
 87 |         )
 88 |     )
 89 | 
 90 |     case_id <- c( # input UUIDs for the from_type = "case_id" case
 91 |         "ce2b2c41-7d28-4d8b-a037-af842a8fe20f",
 92 |         "58574e35-8a30-4207-b127-59fff7c87a43"
 93 |     )
 94 |     submitter_id <- c("TCGA-NA-A4QY", "TCGA-BB-4224") # expected second column, named "submitter_id" via the variable name
 95 |     resframe <- UUIDtoBarcode(case_id, from_type = "case_id")
 96 |     expect_identical(
 97 |         resframe,
 98 |         data.frame(
 99 |             case_id,
100 |             submitter_id
101 |         )
102 |     )
103 | })
104 | 
105 | 
106 | test_that("UUIDtoBarcode shows multiple entries per file_id", {
107 |     uuids <- c(
108 |         "f9f06937-ac64-4660-baf3-0174736d25b2",
109 |         "5dec335c-83c3-4a4a-80f5-9ec1d1847960",
110 |         "514bc5eb-006d-423b-8432-8fbe7795a312"
111 |     )
112 | 
113 |     ## one lookup per UUID, row-bound, must equal a single combined lookup
114 |     per_id <- lapply(uuids, function(id) UUIDtoBarcode(id, "file_id"))
115 |     expect_identical(do.call(rbind, per_id), UUIDtoBarcode(uuids, "file_id"))
116 | 
117 |     ## reverse the characters of the second UUID; a warning is then expected
118 |     second_chars <- strsplit(uuids[2], "")[[1L]]
119 |     uuids[2] <- paste(rev(second_chars), collapse = "")
120 | 
121 |     expect_warning(UUIDtoBarcode(uuids, "file_id"))
122 | })
123 | 
124 | test_that("UUIDhistory correctly returns the appropriate identifiers", {
125 |     old_uuids <- c(
126 |         "0001801b-54b0-4551-8d7a-d66fb59429bf",
127 |         "002c67f2-ff52-4246-9d65-a3f69df6789e",
128 |         "003143c8-bbbf-46b9-a96f-f58530f4bb82"
129 |     )
130 |     ## Updated IDs taken from the GDC Data Portal
131 |     new_uuids <- c(
132 |         "b4bce3ff-7fdc-4849-880b-56f2b348ceac",
133 |         "5ca9fa79-53bc-4e91-82cd-5715038ee23e",
134 |         "b7c3e5ad-4ffc-4fc4-acbf-1dfcbd2e5382"
135 |     )
136 | 
137 |     ## "uuid" of the history row marked released in data release 32.0
138 |     released_uuid <- function(old_id) {
139 |         history <- UUIDhistory(old_id)
140 |         is_released <- history[["file_change"]] == "released"
141 |         in_release <- history[["data_release"]] == "32.0"
142 |         history[is_released & in_release, "uuid"]
143 |     }
144 |     updated_ids <- vapply(
145 |         stats::setNames(nm = old_uuids), released_uuid, character(1L)
146 |     )
147 | 
148 |     expect_identical(updated_ids, stats::setNames(new_uuids, old_uuids))
149 | })
150 | 


--------------------------------------------------------------------------------
/tests/testthat/test-builds.R:
--------------------------------------------------------------------------------
 1 | context("Build information testing")
 2 | 
 3 | test_that("translateBuild works correctly", {
 4 |     build_table <- human_builds()
 5 | 
 6 |     ## NCBI build numbers (34 to 38) must translate to the UCSC column of
 7 |     ## the table returned by human_builds()
 8 |     ncbi_numbers <- as.character(34:38)
 9 |     expect_identical(
10 |         unname(vapply(ncbi_numbers, translateBuild, character(1L))),
11 |         build_table[["UCSC"]]
12 |     )
13 | 
14 |     ## UCSC names must translate to the NCBI column of the same table
15 |     ucsc_names <- paste0("hg", c(16:19, 38))
16 |     expect_identical(
17 |         unname(vapply(ucsc_names, translateBuild, character(1L), "NCBI")),
18 |         build_table[["NCBI"]]
19 |     )
20 | 
21 |     ## UCSC (default 'to'): capitalisation of the input must not matter
22 |     for (ncbi_name in c("Grch37", "GrCh37", "grch37")) {
23 |         expect_identical(translateBuild(ncbi_name), "hg19")
24 |     }
25 | 
26 |     ## same capitalisation check in the UCSC -> NCBI direction
27 |     for (ucsc_name in c("hg19", "HG19", "hG19")) {
28 |         expect_identical(translateBuild(ucsc_name, to = "NCBI"), "GRCh37")
29 |     }
30 | 
31 |     ## a missing input and the build number "33" must both give NA
32 |     na_input <- translateBuild(NA_character_)
33 |     expect_true(is.na(na_input))
34 | 
35 |     unknown_build <- translateBuild("33")
36 |     expect_true(is.na(unknown_build))
37 | })
38 | 
39 | test_that("correctBuild returns an appropriate build name", {
40 |     ## the object under test goes first and the reference value second, so
41 |     ## testthat failure reports label actual and expected correctly
42 |     expect_identical(correctBuild("grch38", "NCBI"), "GRCh38")
43 |     expect_identical(correctBuild("HG19", "UCSC"), "hg19")
44 |     ## lower- and upper-case input must both agree with translateBuild()
45 |     hg19_ncbi <- translateBuild("hg19", "NCBI")
46 |     expect_identical(correctBuild("hg19", "NCBI"), hg19_ncbi)
47 |     expect_identical(correctBuild("HG19", "NCBI"), hg19_ncbi)
48 | })
49 | 
50 | test_that("uniformBuilds is returning the appropriate output", {
51 |     ## helper: `n37` "GRCh37" entries followed by `nalt` copies of `alt`
52 |     builds <- function(n37, nalt, alt = "hg19") {
53 |         rep(c("GRCh37", alt), times = c(n37, nalt))
54 |     }
55 | 
56 |     ## the tested object goes first in every expectation so that testthat
57 |     ## failure reports label the actual and expected values correctly
58 |     expect_length(unique(uniformBuilds(builds(5, 1))), 1L)
59 | 
60 |     ## NA imputed to rest of builds
61 |     expect_length(unique(uniformBuilds(c(builds(5, 1), "NA"))), 1L)
62 | 
63 |     # one "NA" string out of four entries
64 |     expect_error(uniformBuilds(c(builds(2, 1), "NA"), cutoff = 0.2))
65 | 
66 |     # two "NA" strings out of ten entries
67 |     expect_error(uniformBuilds(c(builds(7, 1), "NA", "NA"), cutoff = 0.2))
68 | 
69 |     # NA converted to main build annotation
70 |     rebuild <- uniformBuilds(c(builds(7, 2), NA_character_), cutoff = 0.2)
71 |     expect_length(unique(rebuild), 1L)
72 | 
73 |     # if build numbers identical then replace with high prop
74 |     rebuild <- uniformBuilds(builds(7, 2, alt = "37"), cutoff = 0.2)
75 |     expect_identical(rebuild, rep("GRCh37", length(rebuild)))
76 | 
77 |     with_na <- c(builds(7, 2, alt = "37"), NA_character_)
78 |     rebuild <- uniformBuilds(with_na, cutoff = 0.2)
79 |     expect_identical(rebuild, rep("GRCh37", length(rebuild)))
80 | 
81 |     too_many_na <- c(builds(7, 2, alt = "37"), rep(NA_character_, 3))
82 |     expect_error(uniformBuilds(too_many_na, cutoff = 0.2))
83 | })
84 | 
85 | 


--------------------------------------------------------------------------------
/tests/testthat/test-identifiers.R:
--------------------------------------------------------------------------------
 1 | context("Identifier tests")
 2 | 
 3 | ## distinct piece counts after splitting `bcode` on .uniqueDelim(bcode)
 4 | .sectionNums <- function(bcode) {
 5 |     unique(lengths(strsplit(bcode, split = .uniqueDelim(bcode))))
 6 | }
 7 | 
 8 | test_that("TCGAbarcode works", {
 9 |     bcodes <- c(
10 |         "TCGA-B0-5117-11A-01D-1421-08",
11 |         "TCGA-B0-5094-11A-01D-1421-08",
12 |         "TCGA-E9-A295-10A-01D-A16D-09"
13 |     )
14 | 
15 |     ## section count of the barcodes returned for a given set of flags
16 |     sections <- function(...) {
17 |         .sectionNums(TCGAbarcode(bcodes, ...))
18 |     }
19 | 
20 |     ## every additional flag is expected to add exactly one section
21 |     expect_identical(sections(), 3L)
22 |     expect_identical(sections(sample = TRUE), 4L)
23 |     expect_identical(sections(sample = TRUE, portion = TRUE), 5L)
24 |     expect_identical(sections(sample = TRUE, portion = TRUE, plate = TRUE), 6L)
25 |     expect_identical(
26 |         sections(sample = TRUE, portion = TRUE, plate = TRUE, center = TRUE),
27 |         7L
28 |     )
29 | })
30 | 
31 | test_that("TCGAbiospec works", {
32 |     bcodes <- c("TCGA-B0-5117-11A-01D-1421-08",
33 |         "TCGA-B0-5094-11A-01D-1421-08",
34 |         "TCGA-E9-A295-10A-01D-A16D-09")
35 |     ## barcodes trimmed with the TCGAbarcode() defaults are expected to fail
36 |     expect_error(TCGAbiospec(TCGAbarcode(bcodes)))
37 | 
38 |     ## dim must be: number of barcodes x (section count + `extra`)
39 |     expect_dims <- function(extra, ...) {
40 |         trimmed <- TCGAbarcode(bcodes, sample = TRUE, ...)
41 |         expected <- c(length(trimmed), .sectionNums(trimmed) + extra)
42 |         expect_identical(dim(TCGAbiospec(trimmed)), expected)
43 |     }
44 |     expect_dims(0L)
45 |     expect_dims(1L, portion = TRUE)
46 |     expect_dims(1L, portion = TRUE, plate = TRUE)
47 |     expect_dims(1L, portion = TRUE, plate = TRUE, center = TRUE)
48 | 
49 |     biospec_names <- c("submitter_id", "sample_definition", "sample", "vial",
50 |         "portion", "analyte", "plate", "center")
51 |     expect_identical(names(TCGAbiospec(bcodes)), biospec_names)
52 | })
53 | 


--------------------------------------------------------------------------------