├── .Rbuildignore ├── .github └── workflows │ └── pr_check.yml ├── .gitignore ├── DESCRIPTION ├── NAMESPACE ├── NEWS.md ├── R ├── ID-translation.R ├── TCGAbarcode.R ├── TCGAbiospec.R ├── TCGAprimaryTumors.R ├── TCGAsampleSelect.R ├── TCGAutils-pkg.R ├── builds.R ├── curatedTCGAData-helpers.R ├── data.R ├── findGRangesCols.R ├── generateMap.R ├── getFileName.R ├── imputeAssay.R ├── makeGRangesListFromCopyNumber.R ├── makeGRangesListFromExonFiles.R ├── oncoPrintTCGA.R ├── simplifyColData.R ├── simplifyTCGA.R └── utils.R ├── README.md ├── _pkgdown.yml ├── data ├── clinicalNames.rda ├── diseaseCodes.rda └── sampleTypes.rda ├── inst ├── extdata │ ├── blca_cnaseq.R │ ├── blca_cnaseq.txt │ ├── bt.exon_quant.R │ └── bt.exon_quantification.txt └── scripts │ ├── clinicalNames.R │ ├── diseaseCodes.R │ └── sampleTypes.R ├── man ├── ID-translation.Rd ├── TCGAbarcode.Rd ├── TCGAbiospec.Rd ├── TCGAprimaryTumors.Rd ├── TCGAsampleSelect.Rd ├── TCGAutils-package.Rd ├── builds.Rd ├── clinicalNames.Rd ├── curatedTCGAData-helpers.Rd ├── diseaseCodes.Rd ├── findGRangesCols.Rd ├── generateMap.Rd ├── getFileName.Rd ├── hidden-helpers.Rd ├── imputeAssay.Rd ├── makeGRangesListFromCopyNumber.Rd ├── makeGRangesListFromExonFiles.Rd ├── mergeColData.Rd ├── oncoPrintTCGA.Rd ├── sampleTypes.Rd ├── simplifyTCGA-defunct.Rd ├── simplifyTCGA.Rd └── trimColData.Rd ├── tests ├── testthat.R └── testthat │ ├── test-ID-translation.R │ ├── test-builds.R │ └── test-identifiers.R └── vignettes └── TCGAutils.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^\.github$ 2 | ^.*\.Rproj$ 3 | ^\.Rproj\.user$ 4 | ^data-raw$ 5 | -------------------------------------------------------------------------------- /.github/workflows/pr_check.yml: -------------------------------------------------------------------------------- 1 | name: PR CMD check & build site 2 | 3 | on: 4 | pull_request: 5 | push: 6 | paths: 7 | - 'DESCRIPTION' 8 | - '**.yml' 9 | branches: 10 | - 
devel 11 | - RELEASE_3_21 12 | 13 | env: 14 | R_REMOTES_NO_ERRORS_FROM_WARNINGS: true 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | CRAN: https://p3m.dev/cran/__linux__/noble/latest 17 | BIOC_RELEASE: RELEASE_3_21 18 | 19 | jobs: 20 | set-matrix: 21 | runs-on: ubuntu-24.04 22 | outputs: 23 | matrix: ${{ steps.set.outputs.matrix }} 24 | dockerfile_exists: ${{ steps.dockerfile.outputs.exists }} 25 | steps: 26 | - name: Set Matrix Bioconductor Version 27 | id: set 28 | run: | 29 | MATRIX="{\"include\":[{\"bioc_version\":\"$GITHUB_REF_NAME\"}]}" 30 | echo "matrix=$MATRIX" >> $GITHUB_OUTPUT 31 | - name: Check for Dockerfile 32 | id: dockerfile 33 | run: | 34 | echo "exists=$( [ -f ./inst/docker/pkg/Dockerfile ] && echo true || echo false )" >> $GITHUB_OUTPUT 35 | 36 | check: 37 | needs: set-matrix 38 | runs-on: ubuntu-latest 39 | strategy: 40 | matrix: ${{ fromJson(needs.set-matrix.outputs.matrix) }} 41 | container: bioconductor/bioconductor_docker:${{ matrix.bioc_version }} 42 | 43 | steps: 44 | - name: Checkout Repository 45 | uses: actions/checkout@v4 46 | with: 47 | ref: ${{ matrix.bioc_version }} 48 | 49 | - name: Query dependencies 50 | run: | 51 | BiocManager::install(c("covr", "BiocCheck")) 52 | saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) 53 | shell: Rscript {0} 54 | 55 | - name: Cache R packages 56 | uses: actions/cache@v4 57 | with: 58 | path: /usr/local/lib/R/site-library 59 | key: ${{ runner.os }}-r-${{ matrix.bioc_version }}-${{ hashFiles('.github/depends.Rds') }} 60 | restore-keys: ${{ runner.os }}-r-${{ matrix.bioc_version }}- 61 | 62 | - name: Install GPG 63 | if: ${{ github.ref == 'refs/heads/devel' && github.event_name != 'pull_request' }} 64 | run: sudo apt-get update && sudo apt-get install -y gpg 65 | 66 | - name: Install Dependencies 67 | run: | 68 | remotes::install_deps(dependencies = TRUE, repos = BiocManager::repositories()) 69 | BiocManager::install(c("rcmdcheck", "BiocCheck"), ask = FALSE, 
update = TRUE) 70 | shell: Rscript {0} 71 | 72 | - name: Check Package 73 | env: 74 | _R_CHECK_CRAN_INCOMING_REMOTE_: false 75 | run: rcmdcheck::rcmdcheck(args = "--no-manual", error_on = "error", check_dir = "check") 76 | shell: Rscript {0} 77 | 78 | - name: Test coverage 79 | if: ${{ success() && github.ref == 'refs/heads/devel' && github.event_name != 'pull_request' }} 80 | run: | 81 | cov <- covr::package_coverage( 82 | quiet = FALSE, 83 | clean = FALSE, 84 | type = "all", 85 | install_path = file.path( 86 | normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), 87 | "package" 88 | ) 89 | ) 90 | covr::to_cobertura(cov) 91 | shell: Rscript {0} 92 | 93 | - name: Upload test results to Codecov 94 | if: ${{ success() && github.ref == 'refs/heads/devel' && github.event_name != 'pull_request' }} 95 | uses: codecov/codecov-action@v4 96 | with: 97 | fail_ci_if_error: ${{ github.event_name != 'pull_request' && true || false }} 98 | file: ./cobertura.xml 99 | plugin: noop 100 | disable_search: true 101 | token: ${{ secrets.CODECOV_TOKEN }} 102 | 103 | - name: Run BiocCheck 104 | id: bioccheck 105 | run: | 106 | BiocCheck::BiocCheck( 107 | dir('check', 'tar.gz$', full.names = TRUE), 108 | `quit-with-status` = TRUE, `no-check-bioc-help` = TRUE 109 | ) 110 | shell: Rscript {0} 111 | 112 | - name: Build pkgdown 113 | if: ${{ github.ref == format('refs/heads/{0}', env.BIOC_RELEASE) && github.event_name != 'pull_request' }} 114 | run: | 115 | PATH=$PATH:$HOME/bin/ Rscript -e 'pkgdown::build_site()' 116 | 117 | - name: Upload pkgdown artifact 118 | if: github.ref == format('refs/heads/{0}', env.BIOC_RELEASE) 119 | uses: actions/upload-pages-artifact@v3 120 | with: 121 | path: docs 122 | 123 | dock: 124 | needs: 125 | - check 126 | - set-matrix 127 | runs-on: ubuntu-24.04 128 | if: ${{ github.ref == 'refs/heads/devel' && needs.set-matrix.outputs.dockerfile_exists == 'true' }} 129 | steps: 130 | - name: Checkout Repository 131 | if: ${{ success() && github.event_name != 
'pull_request' }} 132 | uses: actions/checkout@v4 133 | 134 | - name: Register repo name 135 | if: ${{ github.event_name != 'pull_request' }} 136 | id: reg_repo_name 137 | run: | 138 | echo CONT_IMG_NAME=$(echo ${{ github.event.repository.name }} | tr '[:upper:]' '[:lower:]') >> $GITHUB_ENV 139 | 140 | - name: Login to Docker Hub 141 | if: ${{ github.event_name != 'pull_request' }} 142 | uses: docker/login-action@v2 143 | with: 144 | username: ${{ secrets.DOCKERHUB_USERNAME }} 145 | password: ${{ secrets.DOCKERHUB_TOKEN }} 146 | 147 | - name: Build and Push Docker 148 | if: ${{ success() && github.event_name != 'pull_request' }} 149 | uses: docker/build-push-action@v6 150 | with: 151 | context: . 152 | file: ./inst/docker/pkg/Dockerfile 153 | push: true 154 | tags: > 155 | ${{ secrets.DOCKERHUB_USERNAME }}/${{ env.CONT_IMG_NAME }}:latest, 156 | ${{ secrets.DOCKERHUB_USERNAME }}/${{ env.CONT_IMG_NAME }}:devel 157 | 158 | deploy: 159 | needs: check 160 | permissions: 161 | contents: write 162 | pages: write 163 | id-token: write 164 | runs-on: ubuntu-24.04 165 | 166 | steps: 167 | - name: Deploy to GitHub Pages 168 | if: ${{ github.ref == format('refs/heads/{0}', env.BIOC_RELEASE) && github.event_name != 'pull_request' }} 169 | id: deployment 170 | uses: actions/deploy-pages@v4 171 | 172 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | *.txt 8 | *.tar.gz 9 | !blca_cnaseq.txt 10 | 11 | # RStudio files 12 | .Rproj.user 13 | *.Rproj 14 | 15 | # produced vignettes 16 | vignettes/*.html 17 | vignettes/*.pdf 18 | 19 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 20 | .httr-oauth 21 | 22 | # Versioned files 23 | *.orig 24 | -------------------------------------------------------------------------------- /DESCRIPTION: 
-------------------------------------------------------------------------------- 1 | Package: TCGAutils 2 | Title: TCGA utility functions for data management 3 | Version: 1.29.3 4 | Description: A suite of helper functions for checking and manipulating TCGA 5 | data including data obtained from the curatedTCGAData experiment package. 6 | These functions aim to simplify and make working with TCGA data more 7 | manageable. Exported functions include those that import data from flat files 8 | into Bioconductor objects, convert row annotations, and identifier 9 | translation via the GDC API. 10 | Authors@R: c( 11 | person("Marcel", "Ramos", email = "marcel.ramos@sph.cuny.edu", 12 | role = c("aut", "cre"), comment = c(ORCID = "0000-0002-3242-0582")), 13 | person("Lucas", "Schiffer", role = "aut"), 14 | person("Sean", "Davis", role = "ctb"), 15 | person("Levi", "Waldron", role = "aut") 16 | ) 17 | Depends: 18 | R (>= 4.5.0) 19 | Imports: 20 | AnnotationDbi, 21 | BiocGenerics, 22 | BiocBaseUtils, 23 | GenomeInfoDb, 24 | GenomicFeatures, 25 | GenomicRanges, 26 | GenomicDataCommons, 27 | IRanges, 28 | methods, 29 | MultiAssayExperiment, 30 | RaggedExperiment, 31 | rvest, 32 | S4Vectors, 33 | stats, 34 | stringr, 35 | SummarizedExperiment, 36 | utils, 37 | xml2 38 | Suggests: 39 | AnnotationHub, 40 | BiocStyle, 41 | curatedTCGAData, 42 | ComplexHeatmap, 43 | devtools, 44 | dplyr, 45 | httr, 46 | IlluminaHumanMethylation450kanno.ilmn12.hg19, 47 | impute, 48 | knitr, 49 | magrittr, 50 | org.Hs.eg.db, 51 | RColorBrewer, 52 | readr, 53 | rmarkdown, 54 | RTCGAToolbox, 55 | rtracklayer, 56 | R.utils, 57 | testthat, 58 | TxDb.Hsapiens.UCSC.hg18.knownGene, 59 | TxDb.Hsapiens.UCSC.hg19.knownGene 60 | License: Artistic-2.0 61 | Roxygen: list(markdown = TRUE) 62 | Encoding: UTF-8 63 | BugReports: https://github.com/waldronlab/TCGAutils/issues 64 | biocViews: Software, WorkflowStep, Preprocessing, DataImport 65 | VignetteBuilder: knitr 66 | RoxygenNote: 7.3.2 67 | Date: 2025-06-09 68 | 
-------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(CpGtoRanges) 4 | export(TCGAbarcode) 5 | export(TCGAbiospec) 6 | export(TCGAprimaryTumors) 7 | export(TCGAsampleSelect) 8 | export(TCGAsplitAssays) 9 | export(UUIDhistory) 10 | export(UUIDtoBarcode) 11 | export(UUIDtoUUID) 12 | export(barcodeToUUID) 13 | export(correctBuild) 14 | export(extractBuild) 15 | export(filenameToBarcode) 16 | export(findGRangesCols) 17 | export(generateMap) 18 | export(getClinicalNames) 19 | export(getFileName) 20 | export(getSubtypeMap) 21 | export(imputeAssay) 22 | export(isCorrect) 23 | export(makeGRangesListFromCopyNumber) 24 | export(makeGRangesListFromExonFiles) 25 | export(mergeColData) 26 | export(mirToRanges) 27 | export(oncoPrintTCGA) 28 | export(qreduceTCGA) 29 | export(sampleTables) 30 | export(simplifyTCGA) 31 | export(symbolsToRanges) 32 | export(translateBuild) 33 | export(trimColData) 34 | export(uniformBuilds) 35 | import(methods) 36 | importFrom(BiocBaseUtils,checkInstalled) 37 | importFrom(BiocBaseUtils,isScalarCharacter) 38 | importFrom(BiocBaseUtils,isScalarNumber) 39 | importFrom(BiocBaseUtils,lifeCycle) 40 | importFrom(BiocBaseUtils,selectSome) 41 | importFrom(BiocBaseUtils,setSlots) 42 | importFrom(GenomeInfoDb,"genome<-") 43 | importFrom(GenomeInfoDb,"seqlevelsStyle<-") 44 | importFrom(GenomeInfoDb,genome) 45 | importFrom(GenomeInfoDb,keepStandardChromosomes) 46 | importFrom(GenomeInfoDb,seqlevelsStyle) 47 | importFrom(GenomicDataCommons,cases) 48 | importFrom(GenomicDataCommons,expand) 49 | importFrom(GenomicDataCommons,files) 50 | importFrom(GenomicDataCommons,filter) 51 | importFrom(GenomicDataCommons,ids) 52 | importFrom(GenomicDataCommons,results_all) 53 | importFrom(GenomicDataCommons,select) 54 | importFrom(GenomicFeatures,genes) 55 | importFrom(GenomicFeatures,microRNAs) 
56 | importFrom(GenomicRanges,GRanges) 57 | importFrom(GenomicRanges,GRangesList) 58 | importFrom(GenomicRanges,granges) 59 | importFrom(GenomicRanges,makeGRangesListFromDataFrame) 60 | importFrom(MultiAssayExperiment,"colData<-") 61 | importFrom(MultiAssayExperiment,ExperimentList) 62 | importFrom(MultiAssayExperiment,colData) 63 | importFrom(MultiAssayExperiment,experiments) 64 | importFrom(MultiAssayExperiment,metadata) 65 | importFrom(MultiAssayExperiment,subsetByColumn) 66 | importFrom(S4Vectors,DataFrame) 67 | importFrom(S4Vectors,isSingleInteger) 68 | importFrom(S4Vectors,isSingleNumber) 69 | importFrom(S4Vectors,isSingleString) 70 | importFrom(SummarizedExperiment,"mcols<-") 71 | importFrom(SummarizedExperiment,"rowData<-") 72 | importFrom(SummarizedExperiment,SummarizedExperiment) 73 | importFrom(SummarizedExperiment,mcols) 74 | importFrom(SummarizedExperiment,rowData) 75 | importFrom(rvest,html_attr) 76 | importFrom(rvest,html_nodes) 77 | importFrom(stats,as.formula) 78 | importFrom(stats,na.omit) 79 | importFrom(stats,setNames) 80 | importFrom(stringr,str_extract) 81 | importFrom(utils,data) 82 | importFrom(utils,head) 83 | importFrom(utils,read.delim) 84 | importFrom(xml2,read_html) 85 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | ## Changes in version 1.30.0 2 | 3 | ### Significant User-visible changes 4 | 5 | * Deprecated `mirbase.db` package affects `mirToRanges` function. 6 | 7 | ### Bug fixes and minor improvements 8 | 9 | * Use `BiocBaseUtils::checkInstalled` to check for suggested packages. 10 | 11 | ## Changes in version 1.24.0 12 | 13 | ### Significant User-visible changes 14 | 15 | * The `legacy` argument in ID translation functions (`UUIDtoBarcode`, 16 | `UUIDtoUUID`, `barcodeToUUID`, and `filenameToBarcode`) has been defunct and 17 | removed. 
18 | 19 | ### Bug fixes and minor improvements 20 | 21 | * `UUIDtoBarcode` ensures that results are ordered based on the input UUIDs. 22 | * Include informative error message regarding translation of UUIDs from legacy 23 | files. 24 | 25 | ## Changes in version 1.22.0 26 | 27 | ### Bug fixes and minor improvements 28 | 29 | * `UUIDtoBarcode` returns barcodes consistent with Genomic Data Commons API 30 | update 31 | 32 | ## Changes in version 1.20.0 33 | 34 | ### New features 35 | 36 | * `makeSummarizedExperimentFromGISTIC` and `splitAssays` are now defunct. 37 | 38 | ### Minor changes and bug fixes 39 | 40 | * `makeGRangesListFromExonFiles` example removed from the vignette, the GDC 41 | `legacy` endpoint has been deprecated. For more information see the GDC API 42 | release notes version v3.28.0. 43 | 44 | ## Changes in version 1.18.0 45 | 46 | ### Minor changes and bug fixes 47 | 48 | * Use https instead of http in `getFileName` helper. 49 | * Warn when column names in assays are not mappable and subsequently dropped in 50 | `generateMap`. 51 | * Updated `qreduceTCGA` documentation for clarity. 52 | 53 | ## Changes in version 1.16.0 54 | 55 | ### New features 56 | 57 | * The `UUIDhistory` function allows users to map old UUIDs to new UUIDs 58 | according to the latest data release for UUIDs that were affected and no longer 59 | query-able. 60 | * The `slides` argument has been added to the `filenameToBarcode` function for 61 | translating slide file names into barcodes. Currently, the API returns all 62 | barcodes of the associated case ID. 63 | * Add sections in the vignette regarding GDC Data Updates and UUID history 64 | lookup 65 | 66 | ### Minor changes and bug fixes 67 | 68 | * Update examples in package to new GDC Data Release, see vignette. 69 | * Use `AnnotationHub` to download chain file in main vignette. 
70 | * Slide file names now resolve to a single TCGA barcode in `filenameToBarcode` 71 | (Thanks @hermidalc) 72 | * Improved error messages and documentation for `makeGRangesListFromExonFiles` 73 | 74 | ## Changes in version 1.14.0 75 | 76 | ### Minor changes and bug fixes 77 | 78 | * `UUIDtoBarcode` with the `from_type = "file_id"` argument now returns the IDs 79 | in the proper order when more than one `UUID` is input. 80 | * Update `makeGRangesListFromCopyNumber` examples with new names from API e.g., 81 | 'associated_entities.entity_submitter_id' 82 | 83 | ## Changes in version 1.12.0 84 | 85 | ### New features 86 | 87 | * `makeSummarizedExperimentFromGISTIC` has been moved to `RTCGAToolbox`. 88 | * `splitAssays` now deprecated for `TCGAsplitAssays` to avoid conflict with 89 | `MultiAssayExperiment::splitAssays` 90 | 91 | ### Minor changes and bug fixes 92 | 93 | * Properly identifies genome annotation (`hg*`) in `oncoPrintTCGA` 94 | * `qreduceTCGA` now works with updates to `seqlevelsStyle` where genome 95 | annotation include patch versions when available 96 | 97 | ## Changes in version 1.10.0 98 | 99 | ### New features 100 | 101 | * `correctBuild` attempts to provide the official name of a particular human 102 | genome build to agree with changes in `GenomeInfoDb` 103 | * `isCorrect` checks that the build name matches the official name 104 | 105 | ### Minor changes and bug fixes 106 | 107 | * Documentation improvements to `simplifyTCGA` 108 | * Improvements to `findGRangesCols` to locate ranged columns in a `DataFrame` 109 | * Fixed a bug in `UUIDtoBarcode` where only the first record was returned 110 | (#26, @DarioS) 111 | * Fixed a bug in `filenameToBarcode` when multiple inputs were used (#22, 112 | @DarioS) 113 | 114 | ## Changes in version 1.8.0 115 | 116 | ### New features 117 | 118 | * `README.md` now includes a cheat sheet for reference 119 | * `mergeColData` and `oncoPrintTCGA` sections updated/included in the vignette 120 | 121 | ### Minor changes 
and bug fixes 122 | 123 | * `translateBuild` more robust to consistent inputs 124 | * `translateBuild` returns vector output instead of single string as before 125 | * `makeSummarizedExperimentFromGISTIC` now has a more open interface with 126 | `...` input to `RTCGAToolbox::getGISTICPeaks` 127 | * `oncoPrintTCGA` now uses `seqlevels` from input throughout 128 | 129 | ## Changes in version 1.6.0 130 | 131 | ### New features 132 | 133 | * `oncoPrintTCGA`: Create an `oncoPrint` visualization for mutation data 134 | * Support `aliquot_ids` as input to `UUIDtoBarcode` function 135 | * Additional sections in the vignette: `CpGtoRanges`, `UUIDtoBarcode` for 136 | `aliquot_ids` 137 | * `TCGAprimaryTumors` allows users to select all primary tumors for a given 138 | `curatedTCGAData` `MultiAssayExperiment` object (suggested by @vjcitn) 139 | 140 | ### Minor changes and bug fixes 141 | 142 | * Now merging clinical data using both rows and columns in `mergeColData` 143 | * Added informative error when query results are empty in `UUIDtoBarcode` 144 | * Updates to `makeGRangesListFromExonFiles` to use `S4Vectors::splitAsList` 145 | (@hpages) 146 | 147 | ## Changes in version 1.4.0 148 | 149 | ### New features 150 | 151 | * `trimColData` added to remove any extra columns from the `colData` slot 152 | (thanks to @vjcitn) 153 | * `CpGtoRanges` translates CpG islands to genomic positions using an annotation 154 | package and `minfi` 155 | * Overhaul of the barcode translation services allows accurate translation 156 | of identifiers 157 | * `splitAssays` now separates all assays by sample codes contained therein 158 | by default, previous behavior had default values 159 | * Documentation for `simplifyTCGA` was modified to include similar operations, 160 | such as, `symbolsToRanges`, `mirToRanges`, `CpGtoRanges`, etc. 
161 | * Vignette includes comprehensive examples of new functionality 162 | 163 | ### Minor changes and bug fixes 164 | 165 | * `getFileNames` renamed to `getFileName` 166 | * `TCGAsampleSelect` now allows multiple sample type inputs as the 167 | `sampleCodes` argument 168 | * `getSubtypeMap` updates column names to accurately represent patient 169 | identifiers 170 | * More robust checks were added to `splitAssays` to ensure valid sample codes 171 | in the input and provided as arguments 172 | * `makeGRangesListFromExonFiles` is optimized to use `dplyr` when available 173 | and fast operations from `IRanges` 174 | * Various enhancements to `*toRanges` functions, including re-using underlying 175 | common helper function 176 | * The internal `weightedmean` function in `qreduceTCGA` has been updated for 177 | correctness 178 | * The `keep` argument in `qreduceTCGA` and related functions was changed 179 | to `keep.assay` 180 | 181 | ## Changes in version 1.2.0 182 | 183 | ### New features 184 | 185 | * `imputeAssay` added to impute data for MultiAssayExperiment assays 186 | * `UUIDtoUUID` translation available to translate from file to case IDs 187 | * A suite of functions is available to enhance existing MultiAssayExperiment 188 | datasets: `qreduceTCGA`, `mirToRanges`, `symbolsToRanges`. 
Thanks to @lwaldron 189 | 190 | ### Minor changes and bug fixes 191 | 192 | * Various changes to examples for compatibility with RaggedExperiment 193 | * Bug fix to internal functions for finding GRanges columns 194 | 195 | ## Changes in version 1.1.5 196 | 197 | * `uniformBuilds` cleans up a vector of inconsistently labelled builds by 198 | changing the build with the lowest frequency 199 | 200 | ## Changes in version 1.1.4 201 | 202 | ### New features 203 | 204 | * The `UUIDtoUUID` function can translate from case to file UUIDs and vice 205 | versa 206 | * `imputeAssay` allows missing data imputation using KNN for 207 | `MultiAssayExperiment` assays 208 | 209 | ## Changes in version 1.1.1 210 | 211 | ### New features 212 | 213 | * exported the internal helper, `filenameToBarcode`. See examples 214 | 215 | ## Changes in version 0.99.68 216 | 217 | ### Minor changes and bug fixes 218 | 219 | * Minor changes in response to review, avoid switching from logical to numeric 220 | index, updated vignette introduction 221 | * Fix examples to updated `GenomicDataCommons` interface 222 | * Move `RTCGAToolbox` to `Suggests` field in DESCRIPTION 223 | * Removed `BiocFileCache` from `Imports` field 224 | 225 | ## Changes in version 0.99.51 226 | 227 | ### New features 228 | 229 | * Added a group of ID translation helper functions (see ?ID-translation) 230 | * Added a group of helper functions that work with `curatedTCGAData` 231 | * `UUIDtoBarcode` function added thanks to @seandavi 232 | * Renamed `makeGRangesListFromTCGA` to `makeGRangesListFromCopyNumber` 233 | * `makeSummarizedExperimentFromGISTIC` is now available to convert 234 | `RTCGAToolbox`'s `FirehoseGISTIC` data class to `SummarizedExperiment` 235 | * Created a function to merge external `colData` to a `MultiAssayExperiment` 236 | `colData` slot 237 | * Revamped vignette documentation 238 | 239 | ### Minor changes and bug fixes 240 | 241 | * Improvements to `TCGAbiospec` and `TCGAbarcode` 242 | * Updated 
`sampleTypes` and `clinicalNames` datasets 243 | * Updated DESCRIPTION file with appropriate imports and exports 244 | * Various improvements to `findGRangesCols` 245 | * `generateMap` is now updated to the recent `MultiAssayExperiment` API with 246 | improved example 247 | * Updated `getFileNames` to most recent `RTCGAToolbox` API 248 | * Various updates to data generating scripts in `data-raw` folder 249 | * Format updates to NEWS file 250 | * Added tests 251 | 252 | ## Changes in version 0.2.0 253 | 254 | ### New features 255 | 256 | * Package renamed to `TCGAutils` for working with TCGA data 257 | 258 | ## Changes in version 0.1.0 259 | 260 | ### New features 261 | 262 | * `TCGAtranslateID` now works with GDC API 263 | 264 | ### Minor changes and bug fixes 265 | 266 | * Code cleaned up 267 | * Added proper import directives 268 | 269 | ## Changes in version 0.0.70 270 | 271 | ### New features 272 | 273 | * `makeGRangesListFromDataFrame` now moved to `GenomicRanges` 274 | * `makeSummarizedExperimentFromDataFrame` now moved to `SummarizedExperiment` 275 | * `getFileNames` function will obtain filenames used in `RTCGAToolbox` 276 | * Improved `getFileNames` with `xml2` and `rvest` dependencies and removes the 277 | `XML` dependency 278 | 279 | ### Minor changes and bug fixes 280 | 281 | * `TCGAextract` now uses the `findGRangesCols` to automatically detect ranged 282 | data columns 283 | * Arguments in functions `TCGA*` now renamed to match `MultiAssayExperiment` 284 | conventions 285 | * Informative errors in `TCGAextract` 286 | 287 | ## Changes in version 0.0.60 288 | 289 | * `makeGRangesListFromTCGA` data builds on `makeGRangesListFromDataFrame` 290 | * `makeGRangesListFromDataFrame` and 291 | `makeRangedSummarizedExperimentFromDataFrame` will be moving to standard 292 | Bioconductor packages soon. 293 | * `tcga` and `ccle` functions soon to be deprecated. 
294 | * Upcoming: `TCGAbarcode` will be modified for efficiency 295 | 296 | ## Changes in version 0.0.50 297 | 298 | * Add your own identifier parsing function for generating a `sampleMap` in 299 | `generateMap`! 300 | * Add proper genome build to ranged based objects. 301 | * Return `SummarizedExperiment` class for certain data types. 302 | * Fix genome build bugs 303 | 304 | ## Changes in version 0.0.46 305 | 306 | * `makeRSE` function for creating a `RangedSummarizedExperiment` object from a 307 | data frame. 308 | * Bug fixes to `getRangeNames` including the option to enter a regular 309 | expression vector for finding ranged column names. 310 | * `matchClinical` renamed to `TCGAmatchClinical` 311 | 312 | ## Changes in version 0.0.44 313 | 314 | * `getRangedNames` function will try to extract minimum necessary names for 315 | creating ranges (works on a vector of names) 316 | * minor bug fixes to `TCGAbiospec`, `TCGAextract`, `makeGRangesList` 317 | 318 | ## Changes in version 0.0.40 319 | 320 | * Package renamed to `BiocInterfaces`! 321 | * `TCGA` specific functions now start with the letters `TCGA` 322 | * Included: more examples of use of the `TCGAbarcode` function 323 | * Updated `makeGRangesList` function to work with `tcga` and `ccle` data 324 | parameter functions 325 | 326 | ## Changes in version 0.0.2 327 | 328 | * Added a `NEWS.md` file to track changes to the package. 329 | * TCGAmisc now a standalone package! 
## function to figure out exact endpoint based on TCGA barcode
##
## Builds a dot-separated field path that walks the hierarchy
## cases > samples > portions > analytes > aliquots up to (and including)
## the level named by `startPoint`. `startPoint` is resolved with
## `match.arg()`, so a unique prefix such as "sample" selects "samples".
## When `submitter_id` is TRUE, "submitter_id" is appended as last component.
.barcode_files <- function(startPoint = "cases", submitter_id = TRUE) {
    keywords <- c("cases", "samples", "portions", "analytes", "aliquots")
    last <- match.arg(startPoint, keywords)
    indx <- seq_len(which(keywords == last))
    sub_id <- if (submitter_id) "submitter_id" else NULL
    paste(c(keywords[indx], sub_id), collapse = ".")
}

## Returns "submitter_<keyword>_ids" named by "<keyword>_ids"
.subword_id <- function(keyword) {
    ret <- paste0(keyword, "_ids")
    setNames(paste0("submitter_", ret), ret)
}

## Returns the named field used to filter by barcode: for "case" the value
## is "submitter_id" (named "case_id"); any other type goes through
## .subword_id()
.barcode_cases <- function(bcodeType = "case") {
    if (identical(bcodeType, "case"))
        setNames("submitter_id", "case_id")
    else
        .subword_id(bcodeType)
}

## Determine the barcode level ("case", "sample", "portion", "analyte" or
## "aliquot") from the number of delimiter-separated fields in `barcode`.
## All barcodes must have the same number of fields (at least three).
.findBarcodeLimit <- function(barcode) {
    .checkBarcodes(barcode)
    filler <- .uniqueDelim(barcode)
    splitCodes <- strsplit(barcode, filler)
    obsIdx <- unique(lengths(splitCodes))

    ## `obsIdx` is used as a scalar condition and index below; empty input
    ## or mixed field counts would otherwise fail with an uninformative
    ## 'argument is of length zero' / 'condition has length > 1' error
    if (length(obsIdx) != 1L)
        stop("Barcodes must be non-empty and all have the same number of ",
            "fields")

    if (obsIdx < 3L)
        stop("Minimum barcode fields required: ", 3L,
            "; first three are 'project-TSS-participant'")

    key <- c(rep("case", 3L), "sample", "analyte", "aliquot", "aliquot")[obsIdx]
    if (identical(key, "analyte")) {
        ## a last field shorter than three characters is treated as a
        ## portion rather than an analyte
        analyte_chars <- unique(
            vapply(splitCodes, function(x) nchar(x[[obsIdx]]), integer(1L))
        )
        if (!S4Vectors::isSingleInteger(analyte_chars))
            stop("Inconsistent '", key, "' barcodes")
        if (analyte_chars < 3)
            key <- "portion"
    } else if (identical(key, "aliquot")) {
        if (identical(obsIdx, 6L)) {
            ## NOTE(review): vapply() with a length-2 FUN.VALUE returns a
            ## matrix, so this identical() against a plain vector looks like
            ## it can never be TRUE and "slide" is never returned. Left
            ## as-is because "slide" is not a level .barcode_files() knows
            ## about -- confirm the intended behavior before changing.
            ali_chars <- vapply(splitCodes, function(x)
                nchar(x[c(obsIdx-1L, obsIdx)]), integer(2L))
            if (identical(ali_chars, c(2L, 3L)))
                key <- "slide"
        }
    }
    key
}

## Build a two-column data.frame (`id`, `barcode`) where each identifier
## from `ids(info)` is repeated once per element of the matching `id_list`
## entry; yields zero-length `barcode` when `info` has no ids.
.buildIDframe <- function(info, id_list) {
    barcodes_per_file <- lengths(id_list)
    # And build the data.frame
    data.frame(
        id = rep(ids(info), barcodes_per_file),
        barcode = if (!length(ids(info))) character(0L) else unlist(id_list),
        row.names = NULL,
        stringsAsFactors = FALSE
    )
}

## Flatten the nested "samples" element of `result` into a two-column
## data.frame ordered by `ids`.
## NOTE(review): assumes the flattened names alternate between exactly two
## fields (odd groups / even groups after stripping trailing digits) and
## that `ids` are found in the first of them -- confirm against the
## 'aliquot_ids' query in UUIDtoBarcode.
.cleanExpand <- function(result, ids) {
    samps <- result[["samples"]]
    usamps <- unlist(samps)
    ## group values by their flattened name without the numeric suffix that
    ## unlist() appends to repeated names
    splitsamps <- split(unname(usamps), gsub("[0-9]*$", "", names(usamps)))
    splits <- strsplit(names(splitsamps), "\\.")
    ## column names are the flattened names minus their first component
    cnames <- unique(vapply(splits, function(x) {
        paste0(x[-1], collapse = ".") }, character(1)))
    first <- unlist(splitsamps[c(TRUE, FALSE)])
    second <- unlist(splitsamps[c(FALSE, TRUE)])
    pos <- match(ids, first)
    resframe <- cbind.data.frame(first[pos], second[pos], row.names = NULL,
        stringsAsFactors = FALSE)
    names(resframe) <- cnames
    resframe
}

## Combine the vectors in `...` into a data.frame and reorder its rows so
## that the first column follows `orderBy`. Values of `orderBy` without a
## match produce rows of NA.
.orderedDF <- function(..., orderBy) {
    df <- data.frame(..., stringsAsFactors = FALSE)
    ## match on the first column by position; looking it up by the mangled
    ## name "info..from_type.." only worked when the caller's first
    ## argument was spelled exactly `info[[from_type]]`
    orderIdx <- match(orderBy, df[[1L]])
    res <- df[orderIdx, ]
    rownames(res) <- NULL
    res
}

## Convert a named list `x` to a two-column data.frame: the element name
## (repeated nrow() times per element) and the unlisted element contents,
## in the order given by `orderBy`. Identifiers in `orderBy` missing from
## `x` trigger a warning and are dropped.
.nestedlisttodf <- function(x, orderBy) {
    .check_ids_found(names(x), orderBy)
    x <- Filter(length, x[orderBy])
    data.frame(
        rep(names(x), vapply(x, nrow, integer(1))),
        unlist(x, use.names = FALSE),
        stringsAsFactors = FALSE
    )
}

## Warn (without failing) about requested identifiers absent from the
## result names; only a selection of the missing identifiers is shown.
#' @importFrom BiocBaseUtils selectSome
.check_ids_found <- function(resnames, id_vector) {
    idin <- id_vector %in% resnames
    if (!all(idin)) {
        mids <- paste(
            selectSome(id_vector[!idin], 4), collapse = ", "
        )
        warning("Identifiers not found: ", mids, call. = FALSE)
    }
}

#' @name ID-translation
#'
#' @title Translate study identifiers from barcode to UUID and vice versa
#'
#' @description These functions allow the user to enter a character vector of
#' identifiers and use the GDC API to translate from TCGA barcodes to
#' Universally Unique Identifiers (UUID) and vice versa. These relationships
#' are not one-to-one. Therefore, a `data.frame` is returned for all
#' inputs. The UUID to TCGA barcode translation only applies to file and case
#' UUIDs. Two-way UUID translation is available from 'file_id' to 'case_id'
#' and vice versa. Please double check any results before using these
#' features for analysis. Case / submitter identifiers are translated by
#' default, see the `from_type` argument for details. All identifiers are
#' converted to lower case.
#'
#' @details
#' Based on the file UUID supplied, the appropriate entity_id (TCGA barcode) is
#' returned. In previous versions of the package, the 'end_point' parameter
#' would require the user to specify what type of barcode needed. This is no
#' longer supported as `entity_id` returns the appropriate one.
#'
#' @param id_vector character() A vector of UUIDs corresponding to
#'   files, cases, or aliquots (default assumes case_ids)
#'
#' @param from_type character(1) One of `case_id`, `file_id`, or
#'   `aliquot_ids` indicating the type of `id_vector` entered
#'   (default `"case_id"`)
#'
#' @return Generally, a `data.frame` of identifier mappings
#'
#' @md
#'
#' @examples
#' ## Translate UUIDs >> TCGA Barcode
#'
#' uuids <- c("b4bce3ff-7fdc-4849-880b-56f2b348ceac",
#' "5ca9fa79-53bc-4e91-82cd-5715038ee23e",
#' "b7c3e5ad-4ffc-4fc4-acbf-1dfcbd2e5382")
#'
#' UUIDtoBarcode(uuids, from_type = "file_id")
#'
#' UUIDtoBarcode("ae55b2d3-62a1-419e-9f9a-5ddfac356db4", from_type = "case_id")
#'
#' UUIDtoBarcode("d85d8a17-8aea-49d3-8a03-8f13141c163b", "aliquot_ids")
#'
#' @author Sean Davis, M. Ramos
#'
#' @export UUIDtoBarcode
UUIDtoBarcode <- function(
    id_vector, from_type = c("case_id", "file_id", "aliquot_ids")
) {
    ## The documentation promises lower-cased identifiers; do it here as
    ## UUIDtoUUID does, so the filter and the re-ordering helpers see the
    ## same spelling of each UUID.
    id_vector <- tolower(id_vector)
    from_type <- match.arg(from_type)
    targetElement <- APIendpoint <- "submitter_id"
    if (identical(from_type, "file_id")) {
        APIendpoint <- "associated_entities.entity_submitter_id"
        targetElement <- "associated_entities"
    } else if (identical(from_type, "aliquot_ids")) {
        APIendpoint <- "samples.portions.analytes.aliquots.submitter_id"
        targetElement <- "samples"
    }
    ## Field selection depends on the input type: cases use the default
    ## fields, aliquots additionally request the aliquot UUID.
    selector <- switch(from_type,
        case_id = identity,
        aliquot_ids =
            function(x)
                select(
                    x = x,
                    fields = c(
                        APIendpoint,
                        "samples.portions.analytes.aliquots.aliquot_id"
                    )
                ),
        function(x) select(x = x, fields = APIendpoint)
    )

    funcRes <- switch(from_type,
        file_id = files(),
        case_id = cases(),
        aliquot_ids = cases())
    ## 'id_vector' inside the formula is resolved in this function's frame
    info <- results_all(
        selector(
            GenomicDataCommons::filter(funcRes, as.formula(
                paste("~ ", from_type, "%in% id_vector")
            ))
        )
    )
    if (!length(info))
        stop(
            paste(strwrap(
                "No barcodes were found. Note that legacy files were removed
                as of GDC Data Portal version 1.30.4; see
                https://docs.gdc.cancer.gov/. Only case, file, and aliquot
                UUIDs are supported.",
                exdent = 2
            ), collapse = "\n"),
            call. = FALSE
        )

    ## Aliquot results are returned as produced by the expansion helper,
    ## without the column renaming applied below.
    if (identical(from_type, "aliquot_ids"))
        return(.cleanExpand(info, id_vector))

    rframe <-
        if (identical(from_type, "case_id"))
            .orderedDF(
                info[[from_type]], info[[targetElement]], orderBy = id_vector
            )
        else
            .nestedlisttodf(info[[targetElement]], id_vector)

    names(rframe) <- c(from_type, APIendpoint)
    rframe
}

#' @rdname ID-translation
#'
#' @param to_type character(1) The desired UUID type to obtain, can either be
#'   `"case_id"` (default) or `"file_id"`
#'
#' @examples
#' ## Translate file UUIDs >> case UUIDs
#'
#' uuids <- c("b4bce3ff-7fdc-4849-880b-56f2b348ceac",
#' "5ca9fa79-53bc-4e91-82cd-5715038ee23e",
#' "b7c3e5ad-4ffc-4fc4-acbf-1dfcbd2e5382")
#'
#' UUIDtoUUID(uuids)
#'
#' @export UUIDtoUUID
UUIDtoUUID <- function(
    id_vector, to_type = c("case_id", "file_id")
) {
    id_vector <- tolower(id_vector)
    type_ops <- c("case_id", "file_id")
    to_type <- match.arg(to_type)
    ## the input type is whichever of the two options was not requested
    from_type <- type_ops[!type_ops %in% to_type]
    if (!length(from_type))
        stop("Provide a valid UUID type")

    endpoint <- switch(to_type,
        case_id = "cases.case_id",
        file_id = "files.file_id")
    ## query the endpoint of the *input* type and select the target field
    apifun <- switch(to_type,
        file_id = cases(),
        case_id = files())
    info <- results_all(
        select(filter(apifun, as.formula(
            paste("~ ", from_type, "%in% id_vector")
        )),
        endpoint)
    )
    ## first component of the endpoint, e.g., "cases" in "cases.case_id"
    targetElement <- gsub("(\\w+).*", "\\1", endpoint)
    id_list <- lapply(info[[targetElement]], function(x) {x[[1]]})

    rframe <- .buildIDframe(info, id_list)
    names(rframe) <- c(from_type, endpoint)
    rframe
}

#' @rdname ID-translation
#'
#' @param barcodes character() A vector of TCGA barcodes
#'
#' @examples
#' ## Translate TCGA Barcode >> UUIDs
#'
#' fullBarcodes <- c("TCGA-B0-5117-11A-01D-1421-08",
#' "TCGA-B0-5094-11A-01D-1421-08",
#' "TCGA-E9-A295-10A-01D-A16D-09")
#'
#' sample_ids <- TCGAbarcode(fullBarcodes, sample = TRUE)
#'
#' barcodeToUUID(sample_ids)
#'
#' participant_ids <- c("TCGA-CK-4948", "TCGA-D1-A17N",
#' "TCGA-4V-A9QX", "TCGA-4V-A9QM")
#'
#' barcodeToUUID(participant_ids)
#'
#' @export barcodeToUUID
barcodeToUUID <-
    function(barcodes)
{
    .checkBarcodes(barcodes)
    bend <- .findBarcodeLimit(barcodes)
    endtargets <- .barcode_cases(bend)
    expander <- gsub("cases\\.", "", .barcode_files(bend, FALSE))

    ## only expand the query when something below the case level is needed
    pand <- switch(expander, cases = identity,
        function(x) expand(x = x, expand = expander))
    info <- results_all(
        pand(x = filter(cases(), as.formula(
            paste("~ ", endtargets, "%in% barcodes")
        )))
    )
    if (identical(expander, "cases")) {
        rframe <- as.data.frame(info[c(endtargets, names(endtargets))],
            stringsAsFactors = FALSE)
    } else {
        idnames <- lapply(ids(info), function(ident) {
            info[["samples"]][[ident]]
        })
        if (!identical(expander, "samples")) {
            ## walk down the nested samples > portions > analytes > aliquots
            ## structure to the level indicated by 'expander'
            exFUN <- switch(expander,
                samples.portions =
                    function(x, i) x[["portions"]],
                samples.portions.analytes =
                    function(x, i) unlist(lapply(
                        x[["portions"]], `[[`, "analytes"), recursive = FALSE),
                samples.portions.analytes.aliquots =
                    function(x, i) unlist(lapply(
                        unlist(
                            lapply(x[["portions"]], `[[`, "analytes"),
                            recursive = FALSE), `[[`, "aliquots"),
                        recursive = FALSE)
            )
            idnames <- unlist(lapply(seq_along(idnames), function(i)
                exFUN(x = idnames[[i]], i = i)
            ), recursive = FALSE)
            idnames <- Filter(function(g) length(g) >= 2L, idnames)
        }
        rescols <- lapply(idnames, `[`,
            c("submitter_id", gsub("s$", "", names(endtargets))))
        rframe <- do.call(rbind, c(rescols, stringsAsFactors = FALSE))
        names(rframe) <- c(endtargets, names(endtargets))
    }
    ## return rows in the order of the input, dropping barcodes not found
    rframe[na.omit(match(barcodes, rframe[[endtargets]])), , drop = FALSE]
}

## Ordering index derived from the matches between 'major' and 'minor',
## sorted by the hit positions in 'minor'
.matchSort <- function(major, minor) {
    hits <- S4Vectors::findMatches(major, minor)
    order(S4Vectors::subjectHits(hits))
}

## Flatten a list of nested records into a data.frame, one row per record;
## when 'cols' is given, keep only those fields (a "cases." prefix is ignored)
.unnest_df <- function(dlist, cols) {
    dlist <- lapply(unname(dlist), unlist)
    if (!missing(cols)) {
        cols <- gsub("cases\\.", "", cols)
        dlist <- lapply(dlist, function(d) d[names(d) %in% cols])
    }
    do.call(rbind, dlist) |>
        as.data.frame()
}

#' @rdname ID-translation
#'
#' @param filenames `character()` A vector of file names usually obtained
#'   from a `GenomicDataCommons` query
#'
#' @param slides `logical(1L)` **DEPRECATED**: Whether the provided file names
#'   correspond to slides typically with an `.svs` extension. **Note** The
#'   barcodes returned correspond 1:1 with the `filename` inputs. Always triple
#'   check the output against the Genomic Data Commons Data Portal by searching
#'   the file name and comparing associated "Entity ID" with the `submitter_id`
#'   given by the function.
#'
#' @details When providing slide file names, the function will only work if
#'   **all** the provided files are slide files with an `.svs` extension.
#'
#' @examples
#' library(GenomicDataCommons)
#'
#' ### Query CNV data and get file names
#'
#' cnv <- files() |>
#'     filter(
#'         ~ cases.project.project_id == "TCGA-COAD" &
#'         data_category == "Copy Number Variation" &
#'         data_type == "Copy Number Segment"
#'     ) |>
#'     results(size = 6)
#'
#' filenameToBarcode(cnv$file_name)
#'
#' ### Query slides data and get file names
#'
#' slides <- files() |>
#'     filter(
#'         ~ cases.project.project_id == "TCGA-BRCA" &
#'         cases.samples.sample_type == "Primary Tumor" &
#'         data_type == "Slide Image" &
#'         experimental_strategy == "Diagnostic Slide"
#'     ) |>
#'     results(size = 3)
#'
#' ## slide files are detected from the 'svs' extension
#' filenameToBarcode(slides$file_name)
#'
#' @export filenameToBarcode
filenameToBarcode <- function(filenames, slides = FALSE) {
    endwithsvs <- endsWith(filenames, "svs")
    allsvs <- all(endwithsvs)
    ## a mix of slide and non-slide files cannot be served by one query
    if (!allsvs && any(endwithsvs))
        stop("Not all file names have an 'svs' extension.")
    if (!missing(slides)) {
        .Deprecated(
            msg = "The 'slides' argument is deprecated.", package = "TCGAutils"
        )
        if (allsvs && !slides)
            warning(
                "All files have an 'svs' extension. Setting 'slides' to TRUE."
            )
    }
    ## Slide handling is decided by the file extension whether or not the
    ## deprecated argument was supplied; previously this only happened when
    ## 'slides' was given, so omitting it sent slide files down the
    ## non-slide query.
    slides <- allsvs
    filesres <- files()
    endpoint <- "cases.samples.portions.analytes.aliquots.submitter_id"
    reselem <- "cases"
    if (slides) {
        cases_fields <- c(
            "cases.project.project_id",
            "cases.samples.tissue_type",
            "cases.samples.tumor_descriptor"
        )
        endpoint <- c(
            "cases.samples.portions.slides.submitter_id",
            "associated_entities.entity_id",
            "associated_entities.entity_submitter_id",
            "associated_entities.entity_type",
            "associated_entities.case_id",
            cases_fields
        )
        reselem <- "associated_entities"
    }

    info <- GenomicDataCommons::filter(filesres, ~ file_name %in% filenames) |>
        GenomicDataCommons::select(c("file_name", endpoint)) |>
        results_all()

    if (!length(info))
        stop("Query did not return any results. Check 'filenames' input.")

    ## a file can map to several entities; repeat its name and id per entity
    reps <- unlist(lapply(info[[reselem]], nrow))
    res <- data.frame(
        file_name = rep(info[["file_name"]], reps),
        file_id = rep(info[["file_id"]], reps),
        row.names = NULL,
        stringsAsFactors = FALSE
    )
    res <- cbind(res, .unnest_df(info[[reselem]]))
    if (slides) {
        slidedf <- .unnest_df(info[["cases"]], cols = cases_fields)
        res <- cbind.data.frame(res, slidedf)
    }
    ## order the rows using the matches against the 'filenames' input
    idx <- .matchSort(res[["file_name"]], filenames)
    res[idx, ]
}

.HISTORY_ENDPOINT <- "https://api.gdc.cancer.gov/history"

#' @rdname ID-translation
#'
#' @param id character(1) A UUID whose history of versions is sought
#'
#' @param endpoint character(1) Generally a constant pertaining to the location
#'   of the history api endpoint. This argument rarely needs to change.
#'
#' @return UUIDhistory: A `data.frame` containing a list of associated UUIDs
#'   for the given input along with `file_change` status, `data_release`
#'   versions, etc.
#'
#' @examples
#' ## Get the version history of a BAM file in TCGA-KIRC
#' UUIDhistory("0001801b-54b0-4551-8d7a-d66fb59429bf")
#'
#' @export
UUIDhistory <- function(id, endpoint = .HISTORY_ENDPOINT) {
    ## 'httr' is only needed for this function, so it is checked at call time
    if (!requireNamespace("httr", quietly = TRUE))
        stop("Install 'httr' to check UUID status")
    qurl <- paste(endpoint, id, sep = "/")
    resp <- httr::GET(qurl)
    ## each element of the parsed response becomes one row
    do.call(rbind.data.frame, httr::content(resp))
}

# --------------------------------------------------------------------------------
# /R/TCGAbarcode.R:
# --------------------------------------------------------------------------------
## Unique non-alphanumeric characters found in 'ids'; "" when there are none
.uniqueDelim <- function(ids) {
    nonnum <- gsub("[a-zA-Z0-9]", "", ids)
    dels <- unique(unlist(
        strsplit(nonnum, "")
    ))
    if (!length(dels))
        dels <- ""
    dels
}

## Validate barcode input: "TCGA" prefix, a single consistent delimiter, and
## (optionally) enough characters to carry sample information
.checkBarcodes <- function(barcodes, check.sample = FALSE) {
    if (!all(startsWith(toupper(barcodes), "TCGA")))
        stop("Barcodes must start with 'TCGA'")
    filler <- .uniqueDelim(barcodes)
    if (length(filler) != 1L)
        stop("Barcode delimiters not consistent")
    bcodelens <- unique(nchar(barcodes))
    if (length(bcodelens) > 1L)
        warning("Inconsistent barcode lengths: ",
            paste(bcodelens, collapse = ", "))
    if (check.sample) {
        if (any(bcodelens < 15L))
            stop("'barcodes' should be at least 15 characters ",
                "with sample information")
    }
}

#' Parse data from TCGA barcode
#'
#' This function returns the specified snippet of information obtained from
#' the TCGA barcode.
#'
#' @param barcodes A character vector of TCGA barcodes
#' @param participant Logical (default TRUE) participant identifier chunk
#' @param sample Logical (default FALSE) includes the numeric sample code of
#'   the barcode and the vial letter
#' @param portion Logical (default FALSE) includes the portion and analyte
#'   codes of the barcode
#' @param plate Logical (default FALSE) returns the plate value
#' @param center Logical (default FALSE) returns a matrix with the plate and
#'   center codes
#' @param index An optional numeric vector indicating barcode positions when
#'   split by the delimiter (i.e., hyphen '-'). For example, an index of
#'   `c(1, 2)` corresponds to 'TCGA-ZZ' in `TCGA-ZZ-A1A1`.
#'
#' @return A character vector or data matrix of TCGA barcode information
#'
#' @author M. Ramos
#'
#' @examples
#' barcodes <- c("TCGA-B0-5117-11A-01D-1421-08",
#' "TCGA-B0-5094-11A-01D-1421-08",
#' "TCGA-E9-A295-10A-01D-A16D-09")
#'
#' ## Patient identifiers
#' TCGAbarcode(barcodes)
#'
#' ## Sample identifiers
#' TCGAbarcode(barcodes, sample = TRUE)
#'
#' @export TCGAbarcode
TCGAbarcode <- function(barcodes, participant = TRUE, sample = FALSE,
    portion = FALSE, plate = FALSE, center = FALSE, index = NULL)
{
    .checkBarcodes(barcodes)
    filler <- .uniqueDelim(barcodes)
    stopifnot(is.null(index) || is.numeric(index))
    if (is.null(index))
        index <- which(
            c(rep(participant, 3), sample, portion, plate, center)
        )
    ## The delimiter is a literal character, not a pattern: with a regular
    ## expression split, "." delimited barcodes (e.g., after make.names)
    ## would be split at every character.
    barcodeMat <- do.call(rbind, strsplit(barcodes, filler, fixed = TRUE))
    apply(barcodeMat[, index, drop = FALSE], 1L, paste, collapse = filler)
}

# --------------------------------------------------------------------------------
# /R/TCGAbiospec.R:
# --------------------------------------------------------------------------------
## Split equal-width strings in two at character position 'pos'
.strsep <- function(text, pos) {
    stopifnot(length(unique(nchar(text))) == 1L)
    lengthText <- unique(nchar(text))
    allIndx <- seq_len(lengthText)
    stopifnot(pos %in% allIndx)
    fgroup <- seq_len(pos)
    sgroup <- allIndx[!allIndx %in% fgroup]
    list(
        substr(text, min(fgroup), max(fgroup)),
        substr(text, min(sgroup), max(sgroup))
    )
}

#' Extract biospecimen data from the TCGA barcode
#'
#' This function uses the full TCGA barcode to return a data frame of the
#' data pertinent to laboratory variables such as vials, portions, analytes,
#' plates and the center.
#'
#' @param barcodes A character vector of TCGA barcodes
#' @return A `dataframe` with sample type, sample code, portion, plate,
#'   and center columns.
#'
#' @author M. Ramos
#'
#' @examples
#' example("TCGAbarcode")
#' TCGAbiospec(barcodes)
#'
#' @export TCGAbiospec
TCGAbiospec <- function(barcodes) {
    .checkBarcodes(barcodes)
    filler <- .uniqueDelim(barcodes)
    ## literal split, see TCGAbarcode
    maxIndx <- unique(lengths(strsplit(barcodes, filler, fixed = TRUE)))
    ## the comparisons below need a single field count
    if (length(maxIndx) != 1L)
        stop("Barcodes must all have the same number of fields")
    if (maxIndx < 4L)
        stop("Provide a longer barcode")

    local_data_store <- new.env(parent = emptyenv())
    data("sampleTypes", envir = local_data_store, package = "TCGAutils")
    sampleTypes <- local_data_store[["sampleTypes"]]
    sampCode <- TCGAbarcode(barcodes, FALSE, TRUE)
    ## fourth field: two-digit sample code followed by the vial letter
    sampVial <- .strsep(sampCode, 2L)
    names(sampVial) <- c("sample", "vial")
    sample_definition <- sampleTypes[["Definition"]][
        match(sampVial[["sample"]], sampleTypes[["Code"]])]
    biospec <-
        data.frame(
            submitter_id = TCGAbarcode(barcodes),
            sample_definition,
            as.data.frame(sampVial, stringsAsFactors = FALSE),
            stringsAsFactors = FALSE
        )
    ## nothing beyond the sample field to parse
    if (identical(maxIndx, 4L))
        return(biospec)

    splitDex <- seq(5L, maxIndx)

    tailBarcode <- strsplit(
        TCGAbarcode(barcodes, index = splitDex), filler, fixed = TRUE
    )
    splitCol <- splitDex == 5L
    ## fifth field: two-digit portion followed by the analyte letter
    tailBarcode <- lapply(tailBarcode, function(x)
        c(unlist(.strsep(x[[1L]], 2L)), x[!splitCol]))

    portPlateCent <- do.call(rbind.data.frame,
        args = c(tailBarcode, list(stringsAsFactors = FALSE)))
    names(portPlateCent) <-
        c("portion", "analyte", "plate", "center")[seq_along(c(splitDex, 1L))]

    cbind.data.frame(biospec, portPlateCent, stringsAsFactors = FALSE)
}

# --------------------------------------------------------------------------------
# /R/TCGAprimaryTumors.R:
# --------------------------------------------------------------------------------
#' Select primary tumors from TCGA datasets
#'
#' Tumor selection is decided using the `sampleTypes` data. For 'LAML' datasets,
#' the primary tumor code used is "03" otherwise, "01" is used.
#'
#' @param multiassayexperiment A
#'   [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
#'   with TCGA data as obtained from [curatedTCGAData::curatedTCGAData()]
#'
#' @return A `MultiAssayExperiment` containing only primary tumor samples
#'
#' @examples
#'
#' example(getSubtypeMap)
#'
#' TCGAprimaryTumors(gbm)
#'
#' @export TCGAprimaryTumors
TCGAprimaryTumors <- function(multiassayexperiment) {
    if (!is(multiassayexperiment, "MultiAssayExperiment"))
        stop("Provide a 'MultiAssayExperiment' object as input")

    ## the disease code is the part of the experiment name before the
    ## first underscore
    exptnames <- names(experiments(multiassayexperiment))
    dcodes <- vapply(strsplit(exptnames, "_"), `[[`, character(1L), 1L)

    primaries <- ifelse(dcodes == "LAML", "03", "01")
    primaries <- setNames(primaries, dcodes)

    ## one logical selection per experiment
    logisub <- Map(function(barcodes, tumorcode) {
        TCGAsampleSelect(barcodes, tumorcode)
    }, colnames(multiassayexperiment), primaries)

    multiassayexperiment[, logisub, ]
}

# --------------------------------------------------------------------------------
# /R/TCGAsampleSelect.R:
# --------------------------------------------------------------------------------
#' Select samples from barcodes from lookup table
#'
#' TCGA barcodes encode several pieces of information that can be extracted
#' with the [TCGAbarcode] function. To pick out samples of a given type,
#' supply the corresponding codes from the lookup table available via
#' `data("sampleTypes")`. Barcodes may be given as a character vector or as a
#' [CharacterList][IRanges::CharacterList-class] object.
#'
#' @param barcodes Either a TCGA barcode vector or
#'   [CharacterList][IRanges::CharacterList-class] containing patient
#'   identifiers, sample, portion, plate, and center codes.
#'
#' @param sampleCodes Either a character or numeric vector of TCGA sample codes.
#'   See the `sampleTypes` dataset.
#'
#' @return A logical vector or [LogicalList][IRanges::LogicalList-class] of the
#'   same length as 'barcodes' indicating sample type matches
#'
#' @examples
#'
#' example("TCGAbarcode")
#' TCGAsampleSelect(barcodes, c(11, 01))
#'
#' @export TCGAsampleSelect
TCGAsampleSelect <- function(barcodes, sampleCodes) {
    stopifnot(
        is.character(sampleCodes) || is.numeric(sampleCodes),
        !is.na(sampleCodes), !is.logical(sampleCodes)
    )

    ## remember the list structure so the result can be folded back into it
    isCharList <- is(barcodes, "CharacterList")
    if (isCharList) {
        skeleton <- barcodes
        barcodes <- unlist(barcodes, use.names = FALSE)
    }

    .checkBarcodes(barcodes, check.sample = TRUE)

    sampleCodes <- .addLeadingZero(sampleCodes)
    .checkSampleCodes(sampleCodes, strict = TRUE)

    ## the first two characters of the sample field hold the sample type code
    typeCodes <- substr(
        TCGAbarcode(barcodes, sample = TRUE, participant = FALSE), 1L, 2L
    )
    hits <- typeCodes %in% sampleCodes
    names(hits) <- typeCodes

    if (isCharList)
        hits <- BiocGenerics::relist(hits, skeleton)
    hits
}
# --------------------------------------------------------------------------------
# /R/TCGAutils-pkg.R:
# --------------------------------------------------------------------------------
#' TCGAutils: Helper functions for working with TCGA and MultiAssayExperiment
#' data
#'
#' TCGAutils is a toolbox to work with TCGA specific datasets. It allows the
#' user to manipulate and translate TCGA barcodes, conveniently convert a list
#' of data files to [GRangesList][GenomicRanges::GRangesList-class], and take
#' datasets from GISTIC and return a
#' [SummarizedExperiment][SummarizedExperiment::SummarizedExperiment-class]
#' class object. The package also provides functions for working with data from
#' the `curatedTCGAData`
#' experiment data package. It provides convenience functions for extracting
#' subtype metadata and adding clinical data to existing
#' [MultiAssayExperiment][MultiAssayExperiment::MultiAssayExperiment-class]
#' objects.
"_PACKAGE"

# --------------------------------------------------------------------------------
# /R/builds.R:
# --------------------------------------------------------------------------------
## Lookup table of human genome builds: one row per build with its release
## date and the NCBI / UCSC names split into prefix and number
human_builds <- function() {
    S4Vectors::DataFrame(
        ## NCBI34 / hg16 was released in July 2003 (not 2004), which also
        ## keeps the rows in chronological order
        Date = c("July 2003", "May 2004", "March 2006", "February 2009",
            "December 2013"),
        NCBI_PRE = c("NCBI", "NCBI", "NCBI", "GRCh", "GRCh"),
        NCBI_NO = c("34", "35", "36", "37", "38"),
        NCBI = c("NCBI34", "NCBI35", "NCBI36", "GRCh37", "GRCh38"),
        UCSC_PRE = c("hg", "hg", "hg", "hg", "hg"),
        UCSC_NO = c("16", "17", "18", "19", "38"),
        UCSC = c("hg16", "hg17", "hg18", "hg19", "hg38")
    )
}

#' @name builds
#'
#' @title Utilities for working with *HUMAN* genome builds
#'
#' @description A few functions are available to search for build versions,
#'   either from NCBI or UCSC.
#'
#' \itemize{
#'   \item `translateBuild`: translates between UCSC and NCBI build
#'   versions
#'   \item `extractBuild`: use grep patterns to find the first build
#'   within the string input
#'   \item `uniformBuilds`: replace build occurrences below a threshold
#'   level of occurrence with the alternative build
#'   \item `correctBuild`: Ensure that the build annotation is correct
#'   based on the NCBI/UCSC website. If not, use `translateBuild` with
#'   the indicated 'style' input
#'   \item `isCorrect`: Check to see if the build is exactly as annotated
#' }
#'
#' @details The `correctBuild` function takes the input and ensures that
#'   the style specified matches the input. Otherwise, it will
#'   return the correct style for use with `seqlevelsStyle`.
#'   Currently, the function does not support patched builds
#'   (e.g., 'GRCh38.p13'). Build names are taken from the website:
#'   \url{https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.26/}
#'
#' @param from character() A vector of build versions typically from `genome()`
#'   (e.g., "37"). The build vector must be homogeneous (i.e.,
#'   `length(unique(x)) == 1L`).
#'
#' @param to character(1) The name of the desired build version (either "UCSC"
#'   or "NCBI"; default: "UCSC")
#'
#' @param build character(1) A string providing the genome build. For
#'   `extractBuild`, a character vector of annotation styles to search for
#'   (default: UCSC, NCBI)
#'
#' @param style character(1) The annotation style, either 'UCSC' or 'NCBI'
#'
#' @examples
#'
#' translateBuild("GRCh35", "UCSC")
#'
#' @return
#'   translateBuild: A character vector of translated genome builds
#'
#'   extractBuild: A character string of the build information available
#'
#'   uniformBuilds: A character vector of builds where all builds are
#'       identical `identical(length(unique(build)), 1L)`
#'
#'   correctBuild: A character string of the 'corrected' build name
#'
#'   isCorrect: A logical indicating if the build is exactly as annotated
#'
#' @export
translateBuild <- function(from, to = c("UCSC", "NCBI")) {
    ## keep the input length so the translation can be repeated to match
    lfro <- length(from)
    from <- unique(from)
    if (!.isSingleValue(from))
        stop("Enter a consistent vector of genomic builds")

    to <- match.arg(to)
    buildDF <- human_builds()

    ## the source style is whichever of the two was not requested
    bnames <- c("UCSC", "NCBI")
    from_build <- bnames[bnames != to]

    bfrom <- correctBuild(from, from_build)

    buildIndex <- match(bfrom, buildDF[[from_build]])
    rep(buildDF[[to]][buildIndex], lfro)
}

#' @rdname builds
#'
#' @examples
#'
#' correctBuild("grch38", "NCBI")
#' correctBuild("hg19", "NCBI")
#'
#' @export
correctBuild <- function(build, style = c("UCSC", "NCBI")) {
    ## Resolve 'style' to a single value; without this the default
    ## two-element vector reached the column lookups below and failed.
    style <- match.arg(style)
    build.df <- human_builds()
    pre <- paste0(style, "_PRE")
    ## trailing two digits and the prefix in front of them
    digits <- as.character(gsub(".*([[:digit:]]{2})", "\\1", build))
    pref <- gsub("(.*)([[:digit:]]{2})", "\\1", build)
    ## a build given in the other style is translated instead of corrected
    if (identical(tolower(pref), "hg") && identical(style, "NCBI"))
        return(translateBuild(build, style))
    if (
        tolower(pref) %in% tolower(build.df[["NCBI_PRE"]]) &&
        identical(style, "UCSC")
    )
        return(translateBuild(build, style))
    idx <- match(digits, build.df[[paste0(style, "_NO")]])
    if (is.na(idx))
        return(NA_character_)
    num <- build.df[[paste0(style, "_NO")]][idx]
    pref <- build.df[[pre]][idx]
    paste0(pref, num)
}

#' @rdname builds
#'
#' @examples
#'
#' isCorrect("GRCh38", "NCBI")
#'
#' isCorrect("hg19", "UCSC")
#'
#' @export
isCorrect <- function(build, style = c("UCSC", "NCBI")) {
    identical(
        correctBuild(build, style),
        build
    )
}

#' @rdname builds
#'
#' @param string A single character string
#'
#' @examples
#'
#' extractBuild(
#' "SCENA_p_TCGAb29and30_SNP_N_GenomeWideSNP_6_G05_569110.nocnv_grch38.seg.txt"
#' )
#'
#' @export
extractBuild <- function(string, build = c("UCSC", "NCBI")) {
    if (!S4Vectors::isSingleString(string))
        stop("Provide a single string for build search")
    builds <- vector(mode = "character", length(build))
    names(builds) <- build
    for (i in build) {
        pattrn <- switch(i, UCSC = "[Hh][Gg][0-9]{2}",
            NCBI = "[Gg][Rr][Cc][Hh][0-9]{2}")
        builds[[i]] <- stringr::str_extract(string, pattrn)
    }
    ## keep the styles that matched; report the first one, in 'build' order
    builds <- Filter(function(x) !is.na(x), builds)
    if (!length(builds))
        NA_character_
    else
        builds[1L]
}

## TRUE when the vector holds exactly one distinct value
.isSingleValue <- function(charvec) {
    identical(length(unique(charvec)), 1L)
}

## TRUE when all entries share the same trailing two-digit build number
.consistentNumbers <- function(charvec) {
    bnos <- gsub("(.*)([0-9]{2})", "\\2", charvec)
    .isSingleValue(bnos)
}


## Replace the less frequent of (at most) two values with the more frequent
.replaceHighProp <- function(charvec) {
    tt <- table(charvec)
    if (length(tt) > 2L)
        stop(" Table has more than 2 values")

    proptt <- prop.table(tt)

    highprop <- names(which.max(proptt))
    charvec[charvec != highprop] <- highprop
    charvec
}

#' @rdname builds
#'
#' @param builds A character vector of builds
#'
#' @param cutoff numeric(1L) An inclusive threshold tolerance value for missing
#'   values and translating builds that are below the threshold
#'
#' @param na character() The values to be considered as missing (default:
#'   c("", "NA"))
#'
#' @examples
#'
#' buildvec <- rep(c("GRCh37", "hg19"), times = c(5, 1))
#' uniformBuilds(buildvec)
#'
#' navec <- c(rep(c("GRCh37", "hg19"), times = c(5, 1)), "NA")
#' uniformBuilds(navec)
#'
#' @export uniformBuilds
uniformBuilds <- function(builds, cutoff = 0.2, na = c("", "NA")) {
    tbuild <- table(builds)
    ## same build number throughout: at most a spelling difference to fix
    if (.consistentNumbers(builds)) {
        if (identical(length(tbuild), 1L))
            return(builds)
        else
            builds <- .replaceHighProp(builds)
    }

    ## compare case-insensitively and mark the missing entries
    wbuilds <- toupper(builds)
    nabuilds <- wbuilds %in% na | is.na(wbuilds)
    wbuilds[nabuilds] <- NA_character_

    tt <- table(wbuilds, useNA = "always")
    proptt <- prop.table(tt)

    uvals <- names(proptt)
    nanames <- is.na(uvals)
    propna <- proptt[nanames]

    if (propna >= cutoff)
        stop("Frequency of NA values higher than the cutoff tolerance")

    ubuilds <- uvals[!nanames]

    if (.isSingleValue(ubuilds)) {
        ## 'ubuilds' is upper-cased; fill the gaps with the build as it is
        ## spelled in the input (e.g., "GRCh37", not "GRCH37")
        builds[nabuilds] <- builds[match(ubuilds, toupper(builds))]
        return(builds)
    } else if (sum(!nanames) > 2)
        stop("Only two build types at a time can be used")

    props <- proptt[!nanames]

    offbuild <- names(props[props <= cutoff])
    mainbuild <- names(props[props > cutoff])
    ## recover the original spelling of the dominant build
    mainbuild <- builds[match(mainbuild, toupper(builds))]
    if (any(nabuilds))
        builds[nabuilds] <- mainbuild

    samebuilds <- .consistentNumbers(builds)
    if (samebuilds) {
        builds[wbuilds == offbuild] <- mainbuild
    } else {
        ## find the style the minority build is written in and translate it
        ## to the other one
        pattrn <- vapply(
            c(UCSC = "[Hh][Gg][0-9]{2}", NCBI = "[Gg][Rr][Cc][Hh][0-9]{2}"),
            grepl, logical(1L), offbuild)
        toconv <- names(pattrn)[!pattrn]
        results <- translateBuild(offbuild, toconv)
        builds[wbuilds == offbuild] <- results
    }
    builds
}

# --------------------------------------------------------------------------------
# /R/curatedTCGAData-helpers.R:
# --------------------------------------------------------------------------------
#' @import methods
#' @importFrom xml2 read_html
#' @importFrom rvest html_nodes html_attr
#' @importFrom GenomicRanges GRanges GRangesList makeGRangesListFromDataFrame
#'   granges
#' @importFrom GenomeInfoDb genome genome<-
#' @importFrom MultiAssayExperiment ExperimentList colData colData<- metadata
#'   subsetByColumn experiments
#' @importFrom utils data head read.delim
#' @importFrom stats as.formula na.omit setNames
#' @importFrom stringr str_extract
#' @importFrom SummarizedExperiment SummarizedExperiment mcols mcols<- rowData
#'   rowData<-
#' @importFrom GenomicDataCommons files results_all select filter ids cases
#'   expand
#' @importFrom S4Vectors isSingleNumber isSingleInteger isSingleString
#'   DataFrame
NULL

## Helpers for downloaded objects

#' @name curatedTCGAData-helpers
#'
#' @title Helper functions for managing MultiAssayExperiment from
#'   curatedTCGAData
#'
#' @aliases getSubtypeMap
#'
#' @description
#' Additional helper functions for cleaning and uncovering metadata
#' within a downloaded `MultiAssayExperiment` from `curatedTCGAData`.
32 | #' 33 | #' @details Note that for `getSubtypeMap`, the column of in-data variable names 34 | #' may need to go through `make.names` to be found in the `colData` of the 35 | #' `MultiAssayExperiment`. 36 | #' 37 | #' @section getSubtypeMap: provides a two column `data.frame` with 38 | #' interpreted names and in-data variable names. 'Name' usually refers to the 39 | #' `colData` row names a.k.a. the `patientID`. 40 | #' 41 | #' @section getClinicalNames: provides a vector of common variable names that 42 | #' exist in the `colData` `DataFrame` of a `curatedTCGAData` 43 | #' `MultiAssayExperiment` object. These variables are directly obtained 44 | #' from the BroadFirehose clinical data (downloaded with 45 | #' \link[RTCGAToolbox]{getFirehoseData}) and tend to be present across cancer 46 | #' disease codes. 47 | #' 48 | #' @param multiassayexperiment A 49 | #' [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class] 50 | #' object 51 | #' 52 | #' @examples 53 | #' 54 | #' library(curatedTCGAData) 55 | #' 56 | #' gbm <- curatedTCGAData("GBM", c("RPPA*", "CNA*"), version = "2.0.1", FALSE) 57 | #' 58 | #' getSubtypeMap(gbm) 59 | #' 60 | #' sampleTables(gbm) 61 | #' 62 | #' TCGAsplitAssays(gbm, c("01", "10")) 63 | #' 64 | #' @return \itemize{ 65 | #' \item{getSubtypeMap}: A `data.frame` with explanatory names 66 | #' and their in-data variable names. They may not be present for all 67 | #' cancer types. 68 | #' \item{getClinicalNames}: A `vector` of common variable names that 69 | #' may be found across several cancer disease codes. 
#' }
#'
#' @export
getSubtypeMap <- function(multiassayexperiment) {

    if (!is(multiassayexperiment, "MultiAssayExperiment"))
        stop("Provide a 'MultiAssayExperiment' object")

    frameMap <- metadata(colData(multiassayexperiment))[["subtypes"]]

    ## Check for missing subtype data before touching 'frameMap'; the check
    ## previously came after the conversion below and could not guard it.
    if (is.null(frameMap))
        return(message("No subtype data available"))

    frameMap[] <- lapply(frameMap, as.character)

    subColIdx <- grep("subtype", names(frameMap))

    ## label the identifier rows uniformly as "patientID"
    pats <-
        frameMap[[subColIdx]] %in% c("patient", "SAMPLE", "Complete TCGA ID")

    frameMap[pats, subColIdx] <- "patientID"
    frameMap
}

#' @rdname curatedTCGAData-helpers
#'
#' @param diseaseCode A TCGA cancer code (e.g., "BRCA")
#' @examples
#' getClinicalNames("COAD")
#'
#' @export
getClinicalNames <- function(diseaseCode) {
    stopifnot(S4Vectors::isSingleString(diseaseCode))
    env <- new.env(parent = emptyenv())
    ## name the package explicitly (as .checkSampleCodes does) so the
    ## dataset is found even when TCGAutils is not attached
    data("clinicalNames", envir = env, package = "TCGAutils")
    clinNames <- env[["clinicalNames"]]
    clinNames[[diseaseCode]]
}

## Names of each element of 'sampleTables(mae)' as a CharacterList
.samplesInData <- function(mae) {
    IRanges::CharacterList(lapply(sampleTables(mae), names))
}

## Validate codes against the 'sampleTypes' table: with 'strict' any unknown
## code is an error, otherwise only when all of them are unknown
.checkSampleCodes <-
    function(sampleCodes, type = "'sampleCodes'", strict = FALSE) {
    FUN <- if (strict) any else all
    env <- new.env(parent = emptyenv())
    data("sampleTypes", envir = env, package = "TCGAutils")
    sampleTypes <- env[["sampleTypes"]]
    if (FUN(!sampleCodes %in% sampleTypes[["Code"]]))
        stop("Provide valid TCGA 'sampleCodes' in ", type)
}

## Error when none of the requested codes occur in any assay; warn when
## some of them are absent from some assays
.checkCodesAgainstData <- function(samplist, sampleCodes) {
    invalidCodes <- IRanges::LogicalList(lapply(samplist,
        function(acode) !sampleCodes %in% acode))

    if (all(all(invalidCodes) & lengths(invalidCodes)))
        stop("'sampleCodes' not found in assay data, check 'sampleTables()'",
            "\n and see the 'data(\"sampleTypes\")' table", call. = FALSE)

    if (any(any(invalidCodes)))
        warning("Some 'sampleCodes' not found in assays", call. = FALSE)
}

## Coerce to character and left-pad single-character codes with "0"
.addLeadingZero <- function(vect) {
    vect <- as.character(vect)
    singleDigits <- nchar(vect) < 2L
    if (any(singleDigits))
        vect <- replace(vect, singleDigits, paste0("0", vect[singleDigits]))
    vect
}

#' @rdname curatedTCGAData-helpers
#'
#' @param sampleCodes character (default NULL) A string of sample type codes
#'   (refer to `data(sampleTypes)`; `TCGAsplitAssays` section)
#' @param exclusive logical (default FALSE) Whether to return only assays that
#'   contain all codes in `sampleCodes`
#'
#' @section TCGAsplitAssays:
#'   Separates samples by indicated sample codes into different assays
#'   in a `MultiAssayExperiment`. Refer to the `sampleTypes`
#'   data object for a list of available codes. This operation generates
#'   \strong{n} times the number of assays based on the number of sample codes
#'   entered. By default, all assays will be split by samples present in
#'   the data.
#'
#' @importFrom BiocBaseUtils setSlots
#'
#' @export
TCGAsplitAssays <- function(multiassayexperiment, sampleCodes = NULL,
    exclusive = FALSE) {
    if (!is(multiassayexperiment, "MultiAssayExperiment"))
        stop("Provide a 'MultiAssayExperiment' object")

    ## sample codes present in each assay (one list element per assay)
    sampList <- .samplesInData(multiassayexperiment)
    ## the codes found in the data must themselves be known sample codes
    .checkSampleCodes(unique(unlist(sampList)),
        "colnames(MultiAssayExperiment)")

    if (!is.null(sampleCodes)) {
        ## allow inputs such as 1 or "1" in place of "01"
        sampleCodes <- .addLeadingZero(sampleCodes)
        .checkSampleCodes(sampleCodes)
        .checkCodesAgainstData(sampList, sampleCodes)
        if (exclusive) {
            ## keep only the assays that contain every requested code
            inCodes <-
                S4Vectors::`%in%`(IRanges::CharacterList(sampleCodes), sampList)
            sampList <- sampList[all(inCodes)]
        }
        if (!length(sampList))
            stop("Not all 'sampleCodes' were found in data")
        ## within each remaining assay, keep only the requested codes
        subCodes <- S4Vectors::`%in%`(sampList, sampleCodes)
        sampList <- sampList[subCodes]
    }

    ## drop assays that are left without any sample code
    validExp <- Filter(length, sampList)
    exps <- experiments(multiassayexperiment)
    exps <- exps[names(exps) %in% names(validExp)]

    ## one new experiment per (sample code, assay) pair, named "<code>_<assay>"
    egroups <- unlist(Map(function(exprmt, sampcodes, ename) {
        expnames <- setNames(sampcodes, paste0(sampcodes, "_", ename))
        lapply(expnames, function(code) {
            logitype <- TCGAsampleSelect(colnames(exprmt), code)
            exprmt[, logitype, drop = FALSE]
        })
    }, exprmt = exps, sampcodes = validExp, ename = names(validExp),
        USE.NAMES = FALSE), recursive = FALSE)

    ## rebuild the sample map for the split experiments
    sampmap <- generateMap(
        experiments = egroups,
        colData = colData(multiassayexperiment),
        idConverter = TCGAbarcode
    )

    ## replace the experiments and the sample map in the input object
    setSlots(
        object = multiassayexperiment,
        ExperimentList = ExperimentList(egroups),
        sampleMap = sampmap
    )
}

#' @rdname curatedTCGAData-helpers
#' @param vial (logical default FALSE) whether to display vials in the
#'     table output
#'
#'
#' @section sampleTables:
#'     Display all the available samples in each of the assays
#' @export
sampleTables <- function(multiassayexperiment, vial = FALSE) {
    ## tabulate the sample portion of the barcodes of a single assay;
    ## without 'vial' only the first two characters of that portion are kept
    tabulateCodes <- function(barcodes) {
        codes <- TCGAbarcode(barcodes, participant = FALSE, sample = TRUE)
        shown <- if (vial) codes else substr(codes, 1L, 2L)
        table(unname(shown))
    }
    lapply(colnames(multiassayexperiment), tabulateCodes)
}

--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
#' TCGA Cancer Disease Codes Table
#'
#' A dataset for obtaining the cancer codes in TCGA for about 13 different
#' types of cancers.
#'
#' @format A data frame with 37 rows and 4 variables:
#' * Study.Abbreviation: Disease Code used in TCGA
#' * Available: Cancer datasets available via curatedTCGAData
#' * SubtypeData: Subtype curation data available via curatedTCGAData
#' * Study.Name: The full length study name (i.e., type of cancer)
#' @return The TCGA `diseaseCodes` table
#'
#' @usage data("diseaseCodes")
#'
#' @source
"diseaseCodes"

#' Barcode Sample Type Table
#'
#' A dataset that contains the mappings for sample codes in the TCGA
#' barcodes.
#' @format A data frame with 19 rows and 3 variables:
#' * Code: Two digit code number found in the barcode
#' * Definition: Long name for the sample type
#' * Short.Letter.Code: Letter code for the sample type
#'
#' @return The TCGA `sampleTypes` table
#'
#' @usage data("sampleTypes")
#'
#' @source
"sampleTypes"

#' Clinical dataset names in TCGA
#'
#' A dataset of names for each of the TCGA cancer codes available.
#' These names were obtained by the clinical datasets from
#' [getFirehoseData][RTCGAToolbox::getFirehoseData].
#' They serve to subset the
#' current datasets provided by `curatedTCGAData`.
#'
#' @format A [CharacterList][IRanges::CharacterList-class] of names for 33
#'     cancer codes
#'
#' @return The clinical dataset column names in TCGA as provided by the
#'     `RTCGAToolbox`
#'
#' @usage data("clinicalNames")
"clinicalNames"

--------------------------------------------------------------------------------
/R/findGRangesCols.R:
--------------------------------------------------------------------------------
## Build the full start and end column names from a prefix or suffix
## ('xfixType' is "pre" or "suf") shared by the start and end fields.
## Returns list(c(start.field =, end.field =), <the chosen pre/suffix>)
.find_with_xfix <- function(df_colnames, xfix1, xfix2,
    start.field, end.field, xfixType = "pre") {
    fixint <- intersect(xfix1, xfix2)
    fixint <- fixint[fixint != ""]
    if (length(fixint) > 1L) {
        kword <- "region"
        warning(" Multiple ", xfixType, "fixes found, using keyword '", kword,
            "' or taking first one")
        ## keywords to keep, else take first one
        gfix <- grep(kword, fixint, value = TRUE)
        if (length(gfix) && isSingleString(gfix))
            fixint <- gfix
        fixint <- fixint[[1L]]
    }
    if (!isSingleString(fixint))
        stop("'start.field' and 'end.field' ", xfixType, "fixes do not match")
    names(fixint) <- xfixType

    ## prefix: <fix><field>; suffix: <field><fix>
    fixFUN <- switch(xfixType, pre = I, suf = rev)
    start.field <- paste(fixFUN(c(fixint, start.field)), collapse = "")
    ## exactly one of the candidate end fields must exist in the data
    validEnd <- vapply(end.field, function(efield)
        paste(fixFUN(c(fixint, efield)), collapse = "") %in% df_colnames,
        logical(1L))
    stopifnot(sum(validEnd) == 1L)
    end.field <- paste(fixFUN(c(fixint, end.field[validEnd])), collapse = "")
    if (!length(start.field) && !length(end.field))
        list(c(start.field = "", end.field = ""), "")
    else
        list(c(start.field = start.field, end.field = end.field), fixint)
}

## Score how well two sets of prefixes (or suffixes) agree: one point for
## identical sets, one for equal lengths, plus one per entry containing "pos"
.tallySameLength <- function(fix1, fix2) {
    if (!length(fix1) && !length(fix2)) {
        0L
    } else {
        hasPos <- sum(vapply(c(fix1, fix2),
            function(x) grepl("pos", x, ignore.case = TRUE),
            logical(1L)
        ))
        sum(
            identical(fix1, fix2),
            identical(length(fix1), length(fix2)),
            hasPos
        )
    }
}

## Positions in 'table' matching any of 'strings' (case-insensitive regex)
.strMatch <- function(strings, table) {
    unlist(lapply(strings, function(x)
        grep(x, table, ignore.case = TRUE)
    ))
}

## Helper functions

## Locate the start and end columns: first by exact name, then by pattern
## match and finally via a prefix / suffix shared by both fields.
## Returns list(c(start =, end =), list(<named pre/suffix>))
.find_start_end_cols <- function (df_colnames, start.field, end.field) {
    idx1 <- which(df_colnames %in% start.field)
    idx2 <- which(df_colnames %in% end.field)
    if (length(idx1) == 1L && length(idx2) == 1L) {
        return(list(c(start = idx1, end = idx2), list(c(none = ""))))
    }
    idx1 <- .strMatch(start.field, df_colnames)
    idx2 <- .strMatch(end.field, df_colnames)
    if (length(idx1) == 1L && length(idx2) == 1L) {
        return(list(c(start = idx1, end = idx2), list(c(none = ""))))
    }
    prefixes1 <- .collect_prefixes(df_colnames, start.field)
    prefixes2 <- .collect_prefixes(df_colnames, end.field)
    suffixes1 <- .collect_suffixes(df_colnames, start.field)
    suffixes2 <- .collect_suffixes(df_colnames, end.field)
    tallypre <- .tallySameLength(prefixes1, prefixes2)
    tallysuff <- .tallySameLength(suffixes1, suffixes2)
    ## keep the better scoring of the two (named "prefixes" or "suffixes")
    tally <- sort(c(prefixes = tallypre, suffixes = tallysuff))[2]
    reslist <- list(
        c(start = NA_integer_, end = NA_integer_), list(c(none = ""))
    )
    if (!tally) return(reslist)
    fix <- names(tally)
    ## get() fetches the local 'prefixes1/2' or 'suffixes1/2' vectors
    startend.fields <- .find_with_xfix(
        df_colnames, get(paste0(fix, 1)), get(paste0(fix, 2)),
        start.field, end.field, substr(fix, 1, 3)
    )
    idx1 <- which(df_colnames %in% startend.fields[[1L]][["start.field"]])
    idx2 <- which(df_colnames %in% startend.fields[[1L]][["end.field"]])
    if (length(idx1) == 1L && length(idx2) == 1L) {
        reslist[[1L]] <- c(start = idx1, end = idx2)
        reslist[[2L]][[1L]] <- startend.fields[[2L]]
    }
    reslist
}

## Unique non-empty prefixes of the column names that end with any of 'field'
.collect_prefixes <- function (df_colnames, field) {
    df_colnames_nc <- nchar(df_colnames)
    prefixes <- lapply(field, function(suf) {
        pref_nc <- df_colnames_nc - nchar(suf)
        idx <- which(substr(df_colnames, pref_nc + 1L, df_colnames_nc) == suf)
        substr(df_colnames[idx], 1L, pref_nc[idx])
    })
    pref <- unique(unlist(prefixes))
    pref[pref != ""]
}

## Unique non-empty suffixes of the column names that start with any of 'field'
.collect_suffixes <- function(df_colnames, field) {
    suffixes <- lapply(field, function(pre) {
        idx <- which(startsWith(df_colnames, pre))
        ## the suffix starts right after the matched field 'pre'; using the
        ## whole 'field' vector here would recycle the wrong offsets
        substr(df_colnames[idx], nchar(pre) + 1L,
            nchar(df_colnames[idx]))
    })
    suff <- unique(unlist(suffixes))
    suff[suff != ""]
}

## Index of the strand column (first one, with a warning, when there are
## several); NA_integer_ when there is none
.find_strands_col <- function(df_colnames, strand.field, xfix) {
    fixFUN <- switch(names(xfix[[1]]), pre = I, suf = rev, none = I)
    idx <- which(df_colnames %in%
        paste(fixFUN(c(xfix, strand.field)), collapse = ""))
    if (length(idx) == 0L)
        idx <- which(df_colnames %in% strand.field)
    if (length(idx) == 0L)
        return(NA_integer_)
    if (length(idx) >= 2L) {
        warning("Multiple strand measurements detected, taking first one")
        idx <- idx[[1L]]
    }
    idx
}

## Index of the seqnames column (first one, with a warning, when there are
## several); NA_integer_ when there is none
.find_seqnames_col <- function (df_colnames, seqnames.field, xfix) {
    fixFUN <- switch(names(xfix[[1]]), pre = I, suf = rev, none = I)
    ## NOTE(review): with several candidate names in 'seqnames.field',
    ## collapse = "" glues them all into one string, so this pre/suffixed
    ## lookup presumably only matches for a single candidate -- confirm
    idx <- which(df_colnames %in%
        paste(fixFUN(c(xfix, seqnames.field)), collapse = ""))
    if (length(idx) == 0L)
        idx <- which(df_colnames %in% seqnames.field)
    if (length(idx) == 0L)
        return(NA_integer_)
    if (length(idx) >= 2L) {
        warning("cannnot determine seqnames column unambiguously")
        return(idx[[1L]])
    }
    idx
}

## Index of the width column (first one, with a warning, when there are
## several); NA_integer_ when there is none
.find_width_col <- function (df_colnames, width.field, xfix) {
    fixFUN <- switch(names(xfix[[1]]), pre = I, suf = rev, none = I)
    idx <- which(df_colnames %in%
        paste(fixFUN(c(xfix, width.field)), collapse = ""))
    if (length(idx) == 0L)
        idx <- which(df_colnames %in% width.field)
    if (length(idx) == 0L)
        return(NA_integer_)
    if (length(idx) >= 2L) {
        warning("cannnot determine width column unambiguously")
        return(idx[[1L]])
    }
    idx
}

#' Obtain minimum necessary names for the creation of a GRangesList object
#'
#' This function attempts to match chromosome, start position, end position and
#' strand names in the given character vector. Modified helper from the
#' `GenomicRanges` package.
#'
#' @param df_colnames A `character` vector of names in a dataset
#' @param seqnames.field A `character` vector of the chromosome name
#' @param start.field A `character` vector that indicates the column name
#'     of the start positions of ranged data
#' @param end.field A `character` vector that indicates the end position
#'     of ranged data
#' @param strand.field A `character` vector of the column name that
#'     indicates the strand type
#' @param ignore.strand logical (default FALSE) whether to ignore the strand
#'     field in the data
#' @return Index positions vector indicating columns with appropriate names
#'
#' @examples
#' myDataColNames <- c("Start_position", "End_position", "strand",
#'     "chromosome", "num_probes", "segment_mean")
#' findGRangesCols(myDataColNames)
#'
#' @export findGRangesCols
findGRangesCols <- function (df_colnames,
    seqnames.field = c("seqnames", "seqname", "chromosome",
        "chrom", "chr", "chromosome_name", "seqid", "om"),
    start.field = "start",
    end.field = c("end", "stop"),
    strand.field = "strand",
    ignore.strand = FALSE) {

    ## all matching is done on lower-cased column names
    df_colnames0 <- tolower(df_colnames)
    seqnames.field0 <-
        GenomicRanges:::.normarg_field(seqnames.field, "seqnames")
    start.field0 <- GenomicRanges:::.normarg_field(start.field, "start")
    end.field0 <- GenomicRanges:::.normarg_field(end.field, "end")
    start_end_cols <- .find_start_end_cols(df_colnames0, start.field0,
        end.field0)
    ## pre/suffix found for start and end, reused for the other columns
    xfix <- start_end_cols[[2L]]
    width_col <- .find_width_col(df_colnames0, "width", xfix)
    seqnames_col <- .find_seqnames_col(df_colnames0, seqnames.field0, xfix)
    if (ignore.strand) {
        strand_col <- NA_integer_
    } else {
        strand.field0 <- GenomicRanges:::.normarg_field(strand.field, "strand")
        strand_col <- .find_strands_col(df_colnames0, strand.field0, xfix)
    }
    c(seqnames = seqnames_col, start_end_cols[[1L]], width = width_col,
        strand = strand_col)
}
--------------------------------------------------------------------------------
/R/generateMap.R:
--------------------------------------------------------------------------------
#' Create a sampleMap from an experiment list and phenoData dataframe
#'
#' This function helps create a sampleMap in preparation of a
#' `MultiAssayExperiment` object. This is especially useful when the
#' sample identifiers are not very different, as in the case of TCGA barcodes.
#' An `idConverter` function can be provided to truncate such sample
#' identifiers and obtain patient identifiers.
#'
#' @param experiments A named `list` of experiments compatible with the
#'     `MultiAssayExperiment` API
#' @param colData A `data.frame` of clinical data with patient identifiers
#'     as rownames
#' @param idConverter A function to be used against the sample or specimen
#'     identifiers to match those in the rownames of the `colData`
#'     (default `identity`)
#' @param sampleCol A single string indicating the sample identifiers
#'     column in the colData dataset
#' @param patientCol A single string indicating the patient identifiers
#'     in colData, "row.names" extracts the colData row names
#' @param ... Additional arguments to pass to the 'idConverter' function.
#'
#' @return A `DataFrame` class object of mapped samples and patient
#'     identifiers including assays
#'
#' @author M. Ramos, M. Morgan, L. Schiffer
#'
#' @examples
#' ## Minimal example
#' expList <- list(assay1 = matrix(1:6, ncol = 2L,
#'         dimnames = list(paste0("feature", 1:3), c("A-J", "B-J"))),
#'     assay2 = matrix(1:4, ncol = 2,
#'         dimnames = list(paste0("gene", 1:2), c("A-L", "B-L"))))
#'
#' ## Mock colData
#' myPheno <- data.frame(var1 = c("Yes", "No"), var2 = c("High", "Low"),
#'     row.names = c("a", "b"))
#'
#' ## A look at the identifiers
#' vapply(expList, colnames, character(2L))
#' rownames(myPheno)
#'
#' ## Use 'idConverter' to correspond sample names to patient identifiers
#' generateMap(expList, myPheno,
#'     idConverter = function(x) substr(tolower(x), 1L, 1L))
#'
#' @export generateMap
generateMap <- function(experiments, colData, idConverter = identity,
    sampleCol, patientCol, ...) {
    if (!is(experiments, "ExperimentList"))
        experiments <- ExperimentList(experiments)
    ## column (sample) names per experiment
    samps <- colnames(experiments)
    expnames <- names(samps)
    ## one 'assay' entry per column name, in experiment order
    assay <- factor(rep(expnames, lengths(samps)), levels=expnames)
    colname <- unlist(samps, use.names=FALSE)
    ## the column-based mapping is only used when *both* 'sampleCol' and
    ## 'patientCol' are supplied; otherwise 'idConverter' is used
    if (!missing(sampleCol) && !missing(patientCol)) {
        if (!S4Vectors::isSingleString(sampleCol) ||
            !S4Vectors::isSingleString(patientCol))
            stop("Provide character names in colData for mapping")
        if (identical(patientCol, "row.names"))
            pts <- rownames(colData)
        else
            pts <- colData[[patientCol]]
        samples <- colData[[sampleCol]]
        ## template map without assay information
        autoMap <- cbind.data.frame(assay = NA_character_, primary = pts,
            colname = samples, stringsAsFactors = FALSE)
        ## per experiment, keep the template rows whose sample is a column
        ## name of that experiment
        autoMap <- Map(function(cnames, i) {
            submap <- autoMap[autoMap[["colname"]] %in% cnames, ]
            if (nrow(submap)) {
                submap[["assay"]] <- i
            } else {
                warning(
                    "'", i, "' assay dropped; 'colnames' not mappable",
                    call. = FALSE
                )
            }
            submap
        }, cnames = samps, i = names(samps))
        ## stack the per-experiment maps without carrying over row names
        autoMap <- do.call(function(...) {
            rbind(..., make.row.names = FALSE)
        }, autoMap)
        autoMap[["assay"]] <- factor(autoMap[["assay"]])
    } else {
        ## convert the column names and look them up in the colData row names
        matches <- match(idConverter(colname, ...), rownames(colData))
        if (length(matches) && all(is.na(matches)))
            stop("no way to map colData to ExperimentList")
        primary <- rownames(colData)[matches]
        autoMap <- S4Vectors::DataFrame(assay=assay,
            primary=primary, colname=colname)
    }
    ## warn about and drop the rows without a matching patient identifier
    missingPrimary <- is.na(autoMap[["primary"]])
    if (nrow(autoMap) && any(missingPrimary)) {
        notFound <- autoMap[missingPrimary, ]
        warning("Data from rows:",
            sprintf("\n %s - %s", notFound[, "primary"],
                notFound[, "colname"]),
            "\ndropped due to missing phenotype data")
        autoMap <- autoMap[!is.na(autoMap[["primary"]]), ]
    }
    autoMap
}


--------------------------------------------------------------------------------
/R/getFileName.R:
--------------------------------------------------------------------------------
## Collect the 'href' values of the links in 'doc' that contain 'keyWord1'
## and keep those matching the regular expression 'keyWord2' (preceded by
## 'datasetLink' when given)
.getLinks <- function(keyWord1, keyWord2, datasetLink = NULL, doc) {
    # Function from RTCGAToolbox
    keyWord <- keyWord1
    ## XPath selecting the anchors whose href contains the keyword
    keyWord <- paste0("//a[contains(@href, '",keyWord,"')]")
    plinks <- rvest::html_nodes(doc, xpath = keyWord)
    plinks <- rvest::html_attr(plinks, "href")

    if (is.null(datasetLink))
        plinks[grepl(keyWord2,plinks)]
    else
        plinks[grepl(paste0("*.",datasetLink,keyWord2),plinks)]
}

#' Find the file names used in RTCGAToolbox
#'
#' Part of this function is from the RTCGAToolbox. It aims to extract the file
#' name used inside of the \link[RTCGAToolbox]{getFirehoseData} function.
#' The arguments of the function parallel those in the
#' \link[RTCGAToolbox]{getFirehoseData} function. It is only available for
#' select data types.
#'
#' @param disease The TCGA cancer disease code, e.g., "COAD"
#' @param runDate The single `string` used in the `getFirehoseData`
#'     function (default "20160128")
#' @param dataType A single character vector (default "CNASNP") indicating the
#'     data type for which to get the source file name
#'
#' @return A single `character` file name
#'
#' @examples
#'
#' getFileName("COAD", dataType = "CNASNP")
#'
#' @export getFileName
getFileName <- function(disease, runDate = "20160128",
    dataType = c("CNASNP", "CNVSNP", "CNAseq", "CNACGH", "Mutation")) {

    dataType <- match.arg(dataType,
        c("CNASNP", "CNVSNP", "CNAseq", "CNACGH", "Mutation"))

    ## the run date (YYYYMMDD) appears in the URL as YYYY_MM_DD and as is
    fh_url <- "https://gdac.broadinstitute.org/runs/stddata__"
    fh_url <- paste0(fh_url, substr(runDate,1,4), "_",
        substr(runDate,5,6), "_", substr(runDate,7,8), "/data/")
    fh_url <- paste0(fh_url, disease, "/", runDate, "/")
    doc <- xml2::read_html(fh_url)

    ## the switch labels must match the 'dataType' choices exactly; the label
    ## for "CNAseq" was previously spelled "CNASeq" and therefore never used
    switch(dataType,
        CNASNP = .getLinks(
            "Level_3__segmented_scna_hg19__seg.Level_3",
            paste0("[.]Merge_snp__.*.__Level_3__segmented",
                "_scna_hg19__seg.Level_3.*.tar[.]gz$"),
            disease, doc),
        CNVSNP = .getLinks(
            "Level_3__segmented_scna_minus_germline_cnv_hg19__seg.Level_3",
            paste0("[.]Merge_snp__.*.__Level_3__segmented_scna_",
                "minus_germline_cnv_hg19__seg.Level_3.*.tar[.]gz$"),
            disease, doc),
        CNAseq = .getLinks("__Level_3__segmentation__seg.Level_3",
            paste0("[.]Merge_cna__.*.dnaseq.*.__Level_3__",
                "segmentation__seg.Level_3.*.tar[.]gz$"),
            disease, doc),
        CNACGH = .getLinks("__Level_3__segmentation__seg.Level_3",
            paste0("[.]Merge_cna__.*.cgh.*.__Level_3__",
                "segmentation__seg.Level_3.*.tar[.]gz$"),
            disease, doc),
        Mutation = .getLinks("Mutation_Packager_Calls",
            "[.]Mutation_Packager_Calls[.]Level_3[.].*.tar[.]gz$",
            disease, doc)
    )
}

--------------------------------------------------------------------------------
/R/imputeAssay.R:
--------------------------------------------------------------------------------
#' @name imputeAssay
#'
#' @title This function imputes assay values inside a
#' `MultiAssayExperiment`
#'
#' @description This function allows the user to enter a
#' `MultiAssayExperiment` and impute all the NA values inside assays.
#'
#' @param multiassayexperiment A `MultiAssayExperiment` with genes in the
#'     rows, samples in the columns
#' @param i A numeric, logical, or character `vector` indicating the
#'     assays to perform imputation on (default 1L)
#' @inheritDotParams impute::impute.knn
#'
#' @return A `MultiAssayExperiment` with imputed assays values
#'
#' @examples
#'
#' example(getSubtypeMap)
#'
#' ## convert data to matrix and add as experiment
#' gbm <-
#'   c(gbm, RPPA_matrix = data.matrix(assay(gbm[["GBM_RPPAArray-20160128"]])))
#'
#' imputeAssay(gbm, i = "RPPA_matrix")
#'
#' @export
imputeAssay <- function(multiassayexperiment, i = 1, ...) {
    if (!requireNamespace("impute", quietly = TRUE))
        stop("Install the 'impute' package to run 'imputeAssay'")

    if (!is(multiassayexperiment, "MultiAssayExperiment"))
        stop("Input has to be a MultiAssayExperiment")
    if (!any(is.character(i), is.numeric(i), is.logical(i)))
        stop("'i' has to be character or numeric or logical")

    ## restrict to the selected assays; all of them must be matrices
    sub.multiassayexperiment <- multiassayexperiment[, , i]
    assayList <- assays(sub.multiassayexperiment)
    isMatrix <- vapply(assayList, is.matrix, logical(1L))
    if (!all(isMatrix))
        stop("Only matrix assay(s) can be imputed")
    data.imputed <- lapply(assayList, function(mat) {
        impute::impute.knn(mat, ...)$data
    })

    ## 'data.imputed' only holds the selected assays, so write them back by
    ## assay name; indexing it with the original 'i' fails for numeric
    ## positions other than the first ones and for logical 'i'
    for (assayName in names(data.imputed)) {
        multiassayexperiment[[assayName]] <- data.imputed[[assayName]]
    }

    return(multiassayexperiment)
}

--------------------------------------------------------------------------------
/R/makeGRangesListFromCopyNumber.R:
--------------------------------------------------------------------------------
#' Make a GRangesList from TCGA Copy Number data
#'
#' `makeGRangesListFromCopyNumber` allows the user to convert objects of
#' class `data.frame` or [S4Vectors::DataFrame] to a
#' [GRangesList][GenomicRanges::GRangesList-class]. It includes additional
#' features specific to TCGA data such as, hugo symbols, probe numbers, segment
#' means, and ucsc build (if available).
#'
#' @param df A `data.frame` or `DataFrame` class object. `list`
#'     class objects are coerced to `data.frame` or `DataFrame`.
#' @param split.field A `character` vector of length one indicating
#'     the column to be used as sample identifiers
#' @param names.field A `character` vector of length one indicating the
#'     column to be used as names for each of the ranges in the data
#' @param ...
#'     Additional arguments to pass on to
#'     [GenomicRanges::makeGRangesListFromDataFrame]
#'
#' @return A [GRangesList][GenomicRanges::GRangesList-class] class object
#'
#' @examples
#' library(GenomicDataCommons)
#'
#' manif <- files() |>
#'     filter(~ cases.project.project_id == "TCGA-COAD" &
#'         data_type == "Copy Number Segment") |>
#'     manifest(size = 1)
#'
#' fname <- gdcdata(manif$id)
#'
#' barcode <- UUIDtoBarcode(names(fname), from_type = "file_id")
#' barcode <- barcode[["associated_entities.entity_submitter_id"]]
#'
#' cndata <- read.delim(fname[[1L]], nrows = 10L)
#'
#' cngrl <- makeGRangesListFromCopyNumber(cndata, split.field = "GDC_Aliquot",
#'     keep.extra.columns = TRUE)
#'
#' names(cngrl) <- barcode
#' GenomeInfoDb::genome(cngrl) <- extractBuild(fname[[1L]])
#' cngrl
#'
#' @export makeGRangesListFromCopyNumber
makeGRangesListFromCopyNumber <-
    function(df, split.field, names.field = "Hugo_Symbol", ...) {
    ## a plain list of tables is stacked into a single table first
    if (is.list(df) && !inherits(df, "data.frame"))
        df <- do.call(rbind, df)

    if (!S4Vectors::isSingleString(names.field))
        stop("'names.field' must be a single string")
    if (!S4Vectors::isSingleString(split.field))
        stop("'split.field' must be a single string")

    ## column detection is case-insensitive
    lowerNames <- tolower(names(df))
    twoMeta <- all(c("num_probes", "segment_mean") %in% lowerNames)
    rnames <- lowerNames %in% tolower(names.field)
    ncbi <- lowerNames %in% "ncbi_build"

    ## only use 'names.field' when it identifies exactly one column
    if (sum(rnames) == 1L) {
        setrname <- names(df)[rnames]
        grl <- makeGRangesListFromDataFrame(df = df,
            split.field = split.field, names.field = setrname, ...)
    } else {
        grl <- makeGRangesListFromDataFrame(df = df, split.field =
            split.field, ...)
    }

    if (twoMeta) {
        ## NOTE(review): this records the matched column *names* (not the
        ## values) in the list-level metadata columns -- confirm intent
        numProb <- names(df)[match("num_probes", lowerNames)]
        segMean <- names(df)[match("segment_mean", lowerNames)]
        mcols(grl) <- cbind(mcols(grl), DataFrame(num_probes = numProb,
            segment_mean = segMean))
    }
    ## annotate the genome build when a single consistent value is available
    if (sum(ncbi) == 1L) {
        ncbi_build <- names(df)[ncbi]
        build_name <- unique(df[[ncbi_build]])
        if (length(build_name) != 1L) {
            warning("inconsistent ncbi_build values in data")
        } else {
            ucscBuild <- translateBuild(build_name, "UCSC")
            GenomeInfoDb::genome(grl) <- ucscBuild
        }
    }
    grl
}

--------------------------------------------------------------------------------
/R/makeGRangesListFromExonFiles.R:
--------------------------------------------------------------------------------
#' Read exon-level expression files and create a `GRangesList`
#'
#' This function serves to read exon-level expression data. It works for exon
#' quantification (raw counts and RPKM) and junction quantification
#' (raw counts) file paths and represents such data as a
#' [GRangesList][GenomicRanges::GRangesList-class]. The data files can be
#' downloaded via the Genomic Data Commons (GDC) Legacy Archive.
#'
#' @details The `rangesColumn` name in the GDC data files is usually "exon"
#' but can be changed with the `rangesColumn` argument, if different.
#' To avoid programmatically obtaining TCGA barcodes from the GDC
#' API, set the `getBarcodes` to `FALSE`. When `getBarcodes` is set to
#' `FALSE`, the file names are used to name the elements of the `GRangesList`
#' output.
#'
#' @param filepaths character() vector of file paths containing TCGA exon
#'     data usually obtained from the GDC
#'
#' @param sampleNames character() vector of TCGA barcodes to be used as
#'     names for the `GRangesList` output (default NULL)
#'
#' @param fileNames character() vector of file names as downloaded from
#'     the Genomic Data Commons Legacy archive (default `basename(filepaths)`)
#'
#' @param getBarcodes logical(1). Whether to query the GDC API with the
#'     `filenameToBarcode` and obtain the TCGA barcodes from the file names
#'     (default TRUE); see details.
#'
#' @param rangesColumn character(1). The name of the column in the data
#'     containing the ranges information (default "exon"); see details.
#'
#' @param nrows numeric(1). The number of rows to return from each of the files
#'     read in (all rows by default; default Inf)
#'
#' @md
#'
#' @return A [GRangesList][GenomicRanges::GRangesList-class] object
#'
#' @author M. Ramos
#'
#' @examples
#'
#' ## Load example file found in package
#' pkgDir <- system.file("extdata", package = "TCGAutils", mustWork = TRUE)
#' exonFile <- list.files(pkgDir, pattern = "cation\\.txt$", full.names = TRUE)
#'
#' filePrefix <- "unc.edu.32741f9a-9fec-441f-96b4-e504e62c5362.1755371."
#'
#' ## Add actual file name manually (due to Windows OS restriction)
#' makeGRangesListFromExonFiles(exonFile,
#'     fileNames = paste0(filePrefix, basename(exonFile)),
#'     sampleNames = "TCGA-AA-3678-01A-01R-0905-07")
#'
#' @export makeGRangesListFromExonFiles
makeGRangesListFromExonFiles <- function(filepaths, sampleNames = NULL,
    fileNames = basename(filepaths), getBarcodes = TRUE, rangesColumn = "exon",
    nrows = Inf)
{
    ## name the output by barcode (GDC query) or, failing that, by file name
    if (is.null(sampleNames) && getBarcodes) {
        sampleNames <-
            filenameToBarcode(filenames = fileNames)[[
                "cases.samples.portions.analytes.aliquots.submitter_id"
            ]]
    } else if (is.null(sampleNames)) {
        sampleNames <- fileNames
    }

    if (!identical(length(filepaths), length(sampleNames)))
        stop("'sampleNames' length is inconsistent with 'fileNames'")

    ## read with 'readr' when available, otherwise fall back to base R
    btData <- lapply(filepaths, function(file) {
        if (requireNamespace("readr", quietly = TRUE)) {
            readr::local_edition(1)
            readr::read_delim(file, delim = "\t", n_max = nrows)
        } else
            read.delim(file, sep = "\t",
                nrows = if (is.infinite(nrows)) -1 else nrows)
    })

    names(btData) <- sampleNames

    allrowdata <-
        if (requireNamespace("dplyr", quietly = TRUE))
            dplyr::bind_rows(btData)
        else
            do.call(rbind, btData)

    newGRanges <- GenomicRanges::GRanges(allrowdata[[rangesColumn]])
    ## drop = FALSE keeps a table even when only one metadata column is left
    ## (a base data.frame would otherwise collapse to a plain vector)
    mcols(newGRanges) <-
        allrowdata[, names(allrowdata) != rangesColumn, drop = FALSE]

    ## split the combined ranges back into one element per input file
    splitIndx <- rep(names(btData), vapply(btData, nrow, integer(1L)))
    S4Vectors::splitAsList(newGRanges, splitIndx)
}

--------------------------------------------------------------------------------
/R/oncoPrintTCGA.R:
--------------------------------------------------------------------------------
#' OncoPrint for TCGA Mutation Assays
#'
#' @param multiassayexperiment A `MultiAssayExperiment`, usually from
#'     `curatedTCGAData`
#'
#' @param
#'     matchassay character(1) The name of the assay containing mutation
#'     data, this can be a pattern (e.g., "*_Mutation-*", the default)
#'
#' @param variantCol character(1) The name of the metadata column containing
#'     the mutation categories, usually "Variant_Classification" in TCGA
#'
#' @param brewerPal character(1) The name of the `RColorBrewer::brewer.pal`
#'     palette, (default: "Set3")
#'
#' @param ntop integer(1) The number of the top N genes for displaying based
#'     on per-sample mutation frequency
#'
#' @param incl.thresh double(1) The inclusion threshold for empirical mutations,
#'     mutations less frequent than this value will not be included
#'
#' @param rowcol character(1) The name of the column in the metadata to annotate
#'     the rows with (default "Hugo_Symbol")
#'
#' @importFrom BiocBaseUtils isScalarCharacter isScalarNumber checkInstalled
#'
#' @return An oncoPrint plot of mutations
#'
#' @examples
#'
#' library(curatedTCGAData)
#'
#' acc <- curatedTCGAData("ACC", "Mutation", version = "1.1.38", FALSE)
#'
#' oncoPrintTCGA(acc)
#'
#' @export
oncoPrintTCGA <-
    function(multiassayexperiment, matchassay = "*_Mutation-*",
        variantCol = "Variant_Classification", brewerPal = "Set3", ntop = 25,
        incl.thresh = 0.01, rowcol = "Hugo_Symbol")
{
    stopifnot(
        isScalarCharacter(matchassay), isScalarCharacter(variantCol),
        isScalarCharacter(brewerPal), isScalarNumber(ntop),
        is(multiassayexperiment, "MultiAssayExperiment"),
        isScalarNumber(incl.thresh), isScalarCharacter(rowcol)
    )

    checkInstalled(c("org.Hs.eg.db", "ComplexHeatmap", "RColorBrewer"))

    ## 'matchassay' is a glob pattern matched against the assay names
    mutname <- grep(utils::glob2rx(matchassay),
        names(multiassayexperiment), value = TRUE)

    ## fail early with a clear message when the pattern matches nothing
    if (!length(mutname))
        stop("No assay names matched 'matchassay': ", matchassay)
    if (length(mutname) > 1)
        stop("Only one mutation assay supported at this time")

    ragex <- multiassayexperiment[[mutname]]
    stopifnot(is(ragex, "RaggedExperiment"))

    ## keep somatic, non-silent mutations only
    rownames(ragex) <- mcols(ragex)[[rowcol]]
    somaticnonsilent <- mcols(ragex)[["Mutation_Status"]] == "Somatic" &
        mcols(ragex)[[variantCol]] != "Silent"
    ragex <- ragex[somaticnonsilent, ]

    ## display labels: underscores replaced by spaces
    Variants <- mcols(ragex)[[variantCol]]
    Variants <- gsub("_", " ", Variants)
    mcols(ragex)[[variantCol]] <- Variants

    ## keep the variant types whose relative frequency exceeds 'incl.thresh'
    types <- table(Variants)
    tottypes <- sum(types)
    incl <- (types/tottypes) > incl.thresh
    types <- types[incl]
    validvariants <- setNames(names(types), names(types))

    ragex <- ragex[mcols(ragex)[[variantCol]] %in% validvariants, ]
    rr <- BiocGenerics::unstrand(RaggedExperiment::rowRanges(ragex))
    ragex <- RaggedExperiment::`rowRanges<-`(ragex, value = rr)

    gen <- GenomeInfoDb::genome(ragex)
    genomeannot <- unique(gen)

    if (length(genomeannot) > 1)
        stop("'genome' annotation is not consistent")

    ## translate non-UCSC ("hg*") build names and sequence level styles
    if (!grepl("^[Hh][Gg]", genomeannot)) {
        cbuild <- correctBuild(genomeannot, "NCBI")
        ragex <- GenomeInfoDb::`genome<-`(ragex, cbuild)
        ragex <- GenomeInfoDb::`seqlevelsStyle<-`(ragex, "UCSC")
        genomeannot <- translateBuild(genomeannot)
    }

    checkInstalled(paste0("TxDb.Hsapiens.UCSC.", genomeannot, ".knownGene"))

    ## gene ranges used to summarize the mutations per gene
    gn <- sort(.getGN(genomeannot))
    gn <- BiocGenerics::unstrand(gn)
    gn <- gn[!is.na(names(gn))]
    sqls <- seqlevelsStyle(ragex)
    seqlevelsStyle(gn) <- sqls

    simplify_fun <- function(scores, ranges, qranges)
    { any(scores != "Silent") }

    ## use the assay named by 'variantCol' (not a hard-coded column name) so
    ## that a non-default 'variantCol' is honored throughout
    res <- RaggedExperiment::qreduceAssay(
        ragex, gn, simplify_fun, variantCol, background = FALSE
    )
    rownames(res) <- names(gn)

    ## the 'ntop' genes mutated in the largest number of samples
    topgenes <- head(sort(rowSums(res), decreasing = TRUE), ntop)
    gn2 <- gn[match(names(topgenes), names(gn))]

    qualcolors <-
        RColorBrewer::brewer.pal(n = length(validvariants), brewerPal)
    colors <- setNames(qualcolors, validvariants)

    ## one drawing function per variant type, with its color baked in
    colfuns <- lapply(colors, function(couleur) {
        args <- alist(x =, y =, w =, h =)
        args <- as.pairlist(args)
        body <- substitute({
            grid::grid.rect(x, y, w, h, gp = grid::gpar(fill = z, col = NA))
        }, list(z = couleur))
        eval(call("function", args, body))
    })

    background <- function(x, y, w, h)
        grid::grid.rect(x, y, w, h,
            gp = grid::gpar(fill = "#FFFFFF", col = "#FFFFFF"))
    mutfuns <- c(background = background, colfuns)

    ## one reducer per variant type: 1 when the type occurs within the gene
    simplify_funs <- lapply(validvariants,
        function(variant) {
            args <- alist(scores =, ranges =, qranges =)
            args <- as.pairlist(args)
            body <- substitute({
                as.numeric(any(S4Vectors::`%in%`(scores, z)))
            }, list(z = variant))
            eval(call("function", args, body))
        }
    )

    ## gene x sample indicator matrix for each variant type
    list_mats <- lapply(simplify_funs, function(variant_fun) {
        res <- RaggedExperiment::qreduceAssay(ragex, gn2, variant_fun,
            variantCol, background = 0)
        rownames(res) <- names(gn2)
        res
    })

    return(
        ComplexHeatmap::oncoPrint(
            list_mats, alter_fun = mutfuns, col = colors, show_pct = FALSE
        )
    )
}


--------------------------------------------------------------------------------
/R/simplifyColData.R:
--------------------------------------------------------------------------------
#' Take a MultiAssayExperiment and include curated variables
#'
#' This function works on the `colData` of a
#' [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
#' object to merge curated variable columns or other clinical variables that
#' would like to be added.
#' It is recommended that the user run the scripts in
#' the `MultiAssayExperiment.TCGA` repository that build the "enhanced" type of
#' data but not necessary if using different clinical data. Please see the
#' repository's README for more information.
#'
#' @param MultiAssayExperiment A
#'     [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
#'     object
#' @param colData A `DataFrame` or `data.frame` to merge with
#'     clinical data in the `MultiAssayExperiment` object
#'
#' @return A
#'     [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
#'     object
#'
#' @examples
#'
#' library(MultiAssayExperiment)
#'
#' mergeColData(MultiAssayExperiment(), S4Vectors::DataFrame())
#'
#' @export mergeColData
mergeColData <- function(MultiAssayExperiment, colData) {
    ## input validation
    if (!is(MultiAssayExperiment, "MultiAssayExperiment")) {
        stop("Provide a valid MultiAssayExperiment object")
    }
    isTabular <- is(colData, "DataFrame") || is.data.frame(colData)
    if (!isTabular) {
        stop("'colData' must be 'DataFrame' or 'data.frame'")
    }
    if (is.null(rownames(colData)) && length(colData) > 0L) {
        stop("'colData' data must have rownames")
    }

    ## full outer merge on row names plus any columns shared by both tables
    currentClin <- colData(MultiAssayExperiment)
    sharedVars <- intersect(names(currentClin), names(colData))
    combined <- merge(
        currentClin, colData,
        by = c("row.names", sharedVars),
        all = TRUE, sort = FALSE, stringsAsFactors = FALSE
    )

    ## merge() moves the row names to a 'Row.names' column; restore them
    rownames(combined) <- combined[["Row.names"]]
    keepCols <- names(combined) != "Row.names"
    combined <- combined[, keepCols, drop = FALSE]

    colData(MultiAssayExperiment) <- as(combined, "DataFrame")
    MultiAssayExperiment
}

#' Minimize the number of variables in colData
#'
#' This function removes variables that have a high number of missing data
#' and contain keywords.
#'
#' @param multiassayexperiment A
#'     [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
#'     object with `colData`
#'
#' @param maxNAfrac (numeric default 0.2) A decimal between 0 and 1; columns
#'     whose fraction of NA values is at or above this value are removed
#'
#' @param keystring (character) A vector of keywords to match and remove
#'     variables
#'
#' @return A
#'     [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
#'     object
#'
#' @examples
#'
#' example(getSubtypeMap)
#'
#' (gbm_trimmed <- trimColData(gbm))
#'
#' head(colData(gbm_trimmed))[1:5]
#'
#' @export trimColData
trimColData <- function(multiassayexperiment, maxNAfrac = 0.2,
    keystring = c("portion", "analyte")) {
    if (!is(multiassayexperiment, "MultiAssayExperiment"))
        stop("Provide a 'MultiAssayExperiment' input")
    DF <- colData(multiassayexperiment)
    keystring <- na.omit(keystring)

    ## columns whose NA fraction reaches the threshold
    NAabove <- vapply(DF, function(x) mean(is.na(x)) >= maxNAfrac, logical(1L))

    ## columns whose name matches any keyword; folding with `|` also works
    ## when 'DF' has a single column or 'keystring' is empty
    keymatch <- Reduce(
        `|`,
        lapply(keystring, grepl, x = names(DF)),
        rep(FALSE, length(DF))
    )

    todrop <- NAabove | keymatch
    ## drop = FALSE keeps a DataFrame even when only one column survives
    colData(multiassayexperiment) <- DF[, !todrop, drop = FALSE]

    multiassayexperiment
}

## --------------------------------------------------------------------------------
## /R/simplifyTCGA.R:
## --------------------------------------------------------------------------------
#' @importFrom GenomicFeatures genes microRNAs
#' @importFrom GenomeInfoDb keepStandardChromosomes seqlevelsStyle
#'   seqlevelsStyle<-
NULL

## Check whether the share of row names in 'x' matching 'pattern' exceeds
## 'threshold'. The keyword "symbols" is expanded to a gene-symbol regular
## expression. Returns a single logical value.
.checkHas <-
    function(x, pattern = c("^hsa", "^cg", "symbols"), threshold = 0.9) {
        ## grepl() takes a single pattern; use the first one explicitly
        pattern <- pattern[[1L]]
        if (identical(pattern, "symbols"))
            pattern <- "^[A-Z0-9]{1,6}|^C[0-9]orf[0-9]{1,4}"
        ## the leading FALSE yields 0 (not NaN) when there are no row names
        mean(c(FALSE, grepl(pattern, rownames(x))), na.rm = TRUE) > threshold
    }

## TRUE for a SummarizedExperiment that does not yet carry row ranges
.isSummarizedExperiment <- function(x) {
    is(x, "SummarizedExperiment") && !is(x, "RangedSummarizedExperiment")
}

## For every experiment flagged in 'which', look up ranges for its row names
## with 'FUN' and append a "<name>_ranged" experiment (plus "<name>_unranged"
## for unmapped rows when 'unmap' is TRUE). The originals are removed unless
## 'keep' is TRUE.
.convertTo <- function(x, which, FUN, keep, unmap) {
    for (i in which(which)) {
        lookup <- FUN(rownames(x[[i]]))
        ranges <- lookup[["mapped"]]
        rse <- x[[i]][names(ranges), ]
        # rowData not merged with mcols of RHS in `rowRanges<-` method
        mcols(ranges) <-
            S4Vectors::DataFrame(rowData(rse), S4Vectors::mcols(ranges))
        SummarizedExperiment::rowRanges(rse) <- ranges
        x <- c(x, setNames(S4Vectors::List(rse),
            paste0(names(x)[i], "_ranged")))
        if (length(lookup[["unmapped"]]) && unmap) {
            se <- x[[i]][lookup[["unmapped"]], ]
            x <- c(x, setNames(S4Vectors::List(se),
                paste0(names(x)[i], "_unranged")))
        }
    }
    if (!keep && any(which))
        x <- x[, , -match(names(which(which)), names(x))]
    x
}

#' @name hidden-helpers
#' @title A small document for helper functions
#' @param x A character vector
#' @param gn A GRanges object with some of its names found in x
#' @return A list of length 2: unmapped (character vector) and mapped (GRanges)
#' @keywords internal
.makeListRanges <- function(x, gn) {
    res <- list(unmapped = x[!x %in% names(gn)])
    x <- x[x %in% names(gn)]
    ## reorder the ranges to follow the order of the matched names in 'x'
    gn <- gn[match(x, names(gn))]
    res[["mapped"]] <- gn
    return(res)
}

## Gene ranges for a UCSC build ("hg18" or "hg19"), restricted to standard
## chromosomes, switched to NCBI seqlevels, and named by gene symbol
#' @importFrom BiocBaseUtils isScalarCharacter
.getGN <- function(gen) {
    stopifnot(isScalarCharacter(gen))

    txdb <- if (identical(gen, "hg18"))
        TxDb.Hsapiens.UCSC.hg18.knownGene::TxDb.Hsapiens.UCSC.hg18.knownGene
    else if (identical(gen, "hg19"))
        TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene
    else
        ## without this branch 'txdb' would silently be NULL
        stop("Unsupported genome build '", gen, "'; use 'hg18' or 'hg19'")

    gn <- keepStandardChromosomes(
        GenomicFeatures::genes(txdb), pruning.mode = "coarse"
    )
    seqlevelsStyle(gn) <- "NCBI"

    ## translate Entrez gene identifiers to gene symbols
    names(gn) <- AnnotationDbi::mapIds(
        org.Hs.eg.db::org.Hs.eg.db,
        names(gn),
        keytype = "ENTREZID",
        column = "SYMBOL"
    )

    gn
}

#' @rdname hidden-helpers
#' @return list of length 2: "unmapped" is a character vector providing
#' unmapped symbols, "mapped" is a GRanges object with ranges of mapped symbols
#' @keywords internal
.getRangesOfSYMBOLS <- function(x) {
    gn <- .getGN("hg19")
    .makeListRanges(x, gn)
}

## Map CpG probe identifiers in 'x' to genomic positions using the
## 'Locations' dataset of IlluminaHumanMethylation450kanno.ilmn12.hg19
.getRangesOfCpG <- function(x) {
    ## load the dataset into a private environment, not the workspace
    local_data_store <- new.env(parent = emptyenv())
    data(
        "Locations",
        envir = local_data_store,
        package = "IlluminaHumanMethylation450kanno.ilmn12.hg19"
    )
    Locations <- local_data_store[["Locations"]]

    clist <- list(seqnames = "chr", pos = "pos", strand = "strand")
    gps <- do.call(
        GenomicRanges::GPos,
        lapply(clist, function(column) Locations[, column])
    )
    names(gps) <- rownames(Locations)
    seqlevelsStyle(gps) <- "NCBI"

    .makeListRanges(x, gps)
}

#' @rdname simplifyTCGA
#'
#' @title Functions to convert rows annotations to ranges and RaggedExperiment
#' to RangedSummarizedExperiment
#'
#' @description This group of functions will convert row annotations as
#' either gene symbols or miRNA symbols to row ranges based on database
#' resources 'TxDB' and 'org.Hs' packages. It will also simplify the
#' representation of
#' [RaggedExperiment][RaggedExperiment::RaggedExperiment-class] objects to
#' [RangedSummarizedExperiment][SummarizedExperiment::RangedSummarizedExperiment-class].
117 | #' 118 | #' @details The original `SummarizedExperiment` containing either gene symbol 119 | #' or miR annotations is replaced or supplemented by a 120 | #' [RangedSummarizedExperiment][SummarizedExperiment::RangedSummarizedExperiment-class] 121 | #' for those that could be mapped to 122 | #' [GRanges][GenomicRanges::GRanges-class], and optionally another 123 | #' [SummarizedExperiment][SummarizedExperiment::SummarizedExperiment-class] 124 | #' for annotations that could not be mapped to 125 | #' [GRanges][GenomicRanges::GRanges-class]. 126 | #' 127 | #' @section qreduceTCGA: 128 | #' 129 | #' Using `TxDb.Hsapiens.UCSC.hg19.knownGene` as the reference, `qreduceTCGA` 130 | #' reduces the data by applying either the `weightedmean` or `nonsilent` 131 | #' function (see below) to non-mutation or mutation data, respectively. 132 | #' Internally, it uses [RaggedExperiment::qreduceAssay()] to reduce the ranges 133 | #' to the gene-level. 134 | #' 135 | #' `qreduceTCGA` will update `genome(x)` based on the NCBI reference annotation 136 | #' which includes the patch number, e.g., GRCh37.p14, as provided by the 137 | #' `seqlevelsStyle` setter, `seqlevelsStyle(gn) <- "NCBI"`. `qreduceTCGA` 138 | #' uses the NCBI genome annotation as the default reference. 139 | #' 140 | #' nonsilent <- function(scores, ranges, qranges) 141 | #' any(scores != "Silent") 142 | #' 143 | #' `RaggedExperiment` mutation objects become a genes by patients 144 | #' `RangedSummarizedExperiment` object containing '1' if there is a non-silent 145 | #' mutation somewhere in the gene, and '0' otherwise as obtained from the 146 | #' `Variant_Classification` column in the data. 
#'
#'     weightedmean <- function(scores, ranges, qranges) {
#'         isects <- GenomicRanges::pintersect(ranges, qranges)
#'         sum(scores * BiocGenerics::width(isects)) /
#'             sum(BiocGenerics::width(isects))
#'     }
#'
#' "CNA" and "CNV" segmented copy number are reduced using a weighted mean in
#' the rare cases of overlapping (non-disjoint) copy number regions.
#'
#' These functions rely on `TxDb.Hsapiens.UCSC.hg19.knownGene` and
#' `org.Hs.eg.db` to map to the 'hg19' NCBI build. Use the `liftOver` procedure
#' for datasets that are provided against a different reference genome (usually
#' 'hg18'). See an example in the vignette.
#'
#' @param obj A `MultiAssayExperiment` object obtained from `curatedTCGAData`
#'
#' @param keep.assay logical (default FALSE) Whether to keep the
#'     `SummarizedExperiment` assays that have been converted to
#'     `RangedSummarizedExperiment`
#'
#' @param unmapped logical (default TRUE) Include an assay of data that was
#'     not able to be mapped in reference database
#'
#' @param suffix character (default "_simplified") A character string to append
#'     to the newly modified assay for `qreduceTCGA`.
#'
#' @return A
#'     [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
#'     with any gene expression, miRNA, copy number, and mutations converted to
#'     [`RangedSummarizedExperiment`][SummarizedExperiment::RangedSummarizedExperiment-class]
#'     objects
#'
#' @author L. Waldron
#'
#' @md
#'
#' @examples
#'
#' library(curatedTCGAData)
#' library(GenomeInfoDb)
#'
#' accmae <-
#'     curatedTCGAData(diseaseCode = "ACC",
#'     assays = c("CNASNP", "Mutation", "miRNASeqGene", "GISTICT"),
#'     version = "1.1.38",
#'     dry.run = FALSE)
#'
#' ## update genome annotation
#' rex <- accmae[["ACC_Mutation-20160128"]]
#'
#' ## Translate build to "hg19"
#' tgenome <- vapply(genome(rex), translateBuild, character(1L))
#' genome(rex) <- tgenome
#'
#' accmae[["ACC_Mutation-20160128"]] <- rex
#'
#' simplifyTCGA(accmae)
#'
#' @export
simplifyTCGA <- function(obj, keep.assay = FALSE, unmapped = TRUE) {
    ## reduce ragged assays first, then attach ranges to symbol-based assays
    reduced <- qreduceTCGA(obj, keep.assay)
    symbolsToRanges(reduced, keep.assay, unmapped)
}

#' @name simplifyTCGA
#' @aliases symbolsToRanges
#' @importFrom BiocBaseUtils checkInstalled
#' @export
symbolsToRanges <- function(obj, keep.assay = FALSE, unmapped = TRUE) {
    ## flag plain SummarizedExperiments whose row names look like gene symbols
    hasSymbolRows <- function(expt) {
        .checkHas(expt, "symbols") & .isSummarizedExperiment(expt)
    }
    can.fix <- vapply(experiments(obj), hasSymbolRows, logical(1L))

    checkInstalled(c("TxDb.Hsapiens.UCSC.hg19.knownGene", "org.Hs.eg.db"))
    .convertTo(
        x = obj, which = can.fix, FUN = .getRangesOfSYMBOLS,
        keep = keep.assay, unmap = unmapped
    )
}

#' @name simplifyTCGA-defunct
#'
#' @title Defunct TCGAutils functions
#'
#' @inheritParams simplifyTCGA
#'
#' @description `mirToRanges` is defunct and will be removed in the next
#' release. The `mirbase.db` package is currently deprecated in `RELEASE_3_21`.
#'
#' @aliases mirToRanges
#'
#' @importFrom BiocBaseUtils lifeCycle
#'
#' @export
mirToRanges <- function(obj, keep.assay = FALSE, unmapped = TRUE) {
    lifeCycle(cycle = "defunct", title = "simplifyTCGA")
}

#' @name simplifyTCGA
#' @aliases CpGtoRanges
#' @export
CpGtoRanges <- function(obj, keep.assay = FALSE, unmapped = TRUE) {
    ## flag plain SummarizedExperiments whose row names look like CpG probes
    can.fix <- vapply(
        experiments(obj),
        function(y) {
            .checkHas(y, "^cg") & .isSummarizedExperiment(y)
        },
        logical(1L)
    )

    checkInstalled("IlluminaHumanMethylation450kanno.ilmn12.hg19")

    .convertTo(
        x = obj,
        which = can.fix,
        FUN = .getRangesOfCpG,
        keep = keep.assay,
        unmap = unmapped
    )
}

#' @name simplifyTCGA
#' @aliases qreduceTCGA
#' @export
qreduceTCGA <- function(obj, keep.assay = FALSE, suffix = "_simplified") {
    checkInstalled(c("TxDb.Hsapiens.UCSC.hg19.knownGene", "org.Hs.eg.db"))
    ## hg19 gene ranges on standard chromosomes, named by gene symbol
    gn <- genes(
        TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene
    )
    gn <- keepStandardChromosomes(
        GenomicRanges::granges(gn),
        pruning.mode = "coarse"
    )
    seqlevelsStyle(gn) <- "NCBI"
    names(gn) <- AnnotationDbi::mapIds(
        org.Hs.eg.db::org.Hs.eg.db,
        names(gn),
        keytype = "ENTREZID",
        column = "SYMBOL"
    )

    weightedmean <- function(scores, ranges, qranges) {
        isects <- GenomicRanges::pintersect(ranges, qranges)
        sum(scores * BiocGenerics::width(isects)) /
            sum(BiocGenerics::width(isects))
    }

    nonsilent <- function(scores, ranges, qranges)
        any(scores != "Silent")

    isRE <-
        function(x) vapply(experiments(x), is, logical(1L), "RaggedExperiment")

    isMut <- function(x) grepl("Mutation", names(x))

    ## mutation assays: flag genes with any non-silent mutation
    for (i in which(isMut(obj))) {
        sqls <- seqlevelsStyle(obj[[i]])
        seqlevelsStyle(gn) <- sqls
        ## remove patch release info
        gname <- genome(gn)
        genome(gn) <- gsub("\\.p[0-9]{1,2}$", "", genome(gn))
        mutations <- RaggedExperiment::qreduceAssay(
            obj[[i]],
            gn,
            nonsilent,
            "Variant_Classification"
        )
        rownames(mutations) <- names(gn)
        mutations[is.na(mutations)] <- 0
        remove.rows <- is.na(rownames(mutations))
        mut_ranges <- gn[!remove.rows]
        ## replace patch release info
        genome(mut_ranges) <- gname
        ## drop = FALSE keeps a matrix for single-sample or single-gene data
        mutations <- SummarizedExperiment(
            mutations[!remove.rows, , drop = FALSE], rowRanges = mut_ranges
        )
        el <- ExperimentList(x = mutations)
        names(el) <- paste0(names(obj)[i], suffix)
        obj <- c(obj, el)
    }
    ## remaining ragged assays: width-weighted mean of 'Segment_Mean' per gene
    for (i in which(isRE(obj) & !isMut(obj))) {
        sqls <- seqlevelsStyle(obj[[i]])
        seqlevelsStyle(gn) <- sqls
        cn <- suppressWarnings(
            RaggedExperiment::qreduceAssay(
                obj[[i]],
                gn,
                weightedmean,
                "Segment_Mean"
            )
        )
        rownames(cn) <- names(gn)
        remove.rows <- is.na(rownames(cn))
        cn <- SummarizedExperiment(
            cn[!remove.rows, , drop = FALSE], rowRanges = gn[!remove.rows]
        )
        el <- ExperimentList(x = cn)
        names(el) <- paste0(names(obj)[i], suffix)
        obj <- c(obj, el)
    }
    if (!keep.assay) {
        obj <- obj[, , !isRE(obj)]
    }
    return(obj)
}

## --------------------------------------------------------------------------------
## /R/utils.R:
## --------------------------------------------------------------------------------
## Helper for finding barcode column
## **Requires exactly one column whose leading values all start with "TCGA"**
.findBarcodeCol <- function(DF) {
    cnames <- names(DF)
    ## only the first rows are inspected; as.character() lets non-character
    ## columns (numeric, factor) be tested instead of erroring in startsWith()
    containsBC <- vapply(head(DF), function(column) {
        all(startsWith(as.character(column), "TCGA"))
    }, logical(1L))
    names(containsBC) <- cnames
    bcIdx <- which(containsBC)
    if (length(bcIdx) != 1L)
        stop("Expected exactly one barcode column, found ", length(bcIdx))
    names(containsBC)[bcIdx]
}

##
## Standardize barcode format
## Barcodes written with "." separators are converted to "-" separators when
## every barcode contains a "."; the result is always upper case.
.standardBarcodes <- function(sampleBarcode) {
    if (!length(sampleBarcode)) {
        stop(" Barcode must be of positive length")
    }
    ## inspect all barcodes rather than a random sample: the result is
    ## deterministic and the caller's random number stream is left untouched
    usesDots <- all(grepl(".", sampleBarcode, fixed = TRUE))
    if (usesDots)
        sampleBarcode <- gsub(".", "-", sampleBarcode, fixed = TRUE)
    toupper(sampleBarcode)
}

## Find columns that are all NA
## Returns one logical value per column of 'dataset'
.findNAColumns <- function(dataset) {
    apply(dataset, 2L, function(column) {
        all(is.na(column))
    })
}

## --------------------------------------------------------------------------------
## /README.md:
## --------------------------------------------------------------------------------
# TCGAutils

The `TCGAutils` package provides a suite of helper functions that aid in
the management and cleaning of data from The Cancer Genome Atlas (TCGA).

Many of the functions contained herein work on raw and derived data objects
from The Cancer Genome Atlas (TCGA), the `RTCGAToolbox` package and
`curatedTCGAData` experiment data package.

Please make sure to download the latest version of `RTCGAToolbox`
from Bioconductor.
12 | 13 | ## Installation 14 | 15 | ``` 16 | if (!require("BiocManager")) 17 | install.packages("BiocManager") 18 | 19 | library(BiocManager) 20 | 21 | install("TCGAutils") 22 | ``` 23 | 24 | ## Cheatsheet 25 | 26 | 27 | 28 | Please report minimally reproducible bugs at our [github issue page][] 29 | 30 | [github issue page]: https://github.com/waldronlab/TCGAutils/issues 31 | 32 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | title: TCGAutils 2 | url: https://waldronlab.github.io/TCGAutils 3 | 4 | template: 5 | bootstrap: 5 6 | params: 7 | bootswatch: flatly 8 | -------------------------------------------------------------------------------- /data/clinicalNames.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/waldronlab/TCGAutils/9a877b11f48cd6e72b7748bf3b455067178a28ed/data/clinicalNames.rda -------------------------------------------------------------------------------- /data/diseaseCodes.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/waldronlab/TCGAutils/9a877b11f48cd6e72b7748bf3b455067178a28ed/data/diseaseCodes.rda -------------------------------------------------------------------------------- /data/sampleTypes.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/waldronlab/TCGAutils/9a877b11f48cd6e72b7748bf3b455067178a28ed/data/sampleTypes.rda -------------------------------------------------------------------------------- /inst/extdata/blca_cnaseq.R: -------------------------------------------------------------------------------- 1 | ## Generate blca_cnaseq data 2 | if (!requireNamespace("RTCGAToolbox")) 3 | stop("Download package 'RTCGAToolbox' to regenerate data") 4 | 5 | library(RTCGAToolbox) 6 | 7 | blca <- 
getFirehoseData("BLCA", clinical = FALSE, 8 | CNASeq = TRUE, destdir = tempdir()) 9 | bl <- getData(blca, "CNASeq") 10 | 11 | set.seed(777) 12 | blsplit <- lapply(split(bl, bl[["Sample"]]), function(x) 13 | x[sample(seq_len(nrow(x)), 2L), ]) 14 | 15 | blframe <- dplyr::bind_rows(blsplit) 16 | blca_cnaseq <- blframe[c(TRUE, TRUE, FALSE, FALSE), ] 17 | 18 | write.table(blca_cnaseq, file = "inst/extdata/blca_cnaseq.txt") 19 | -------------------------------------------------------------------------------- /inst/extdata/blca_cnaseq.txt: -------------------------------------------------------------------------------- 1 | "Sample" "Chromosome" "Start" "End" "Num_Probes" "Segment_Mean" 2 | "1" "TCGA-BL-A0C8-01A-11D-A10R-02" 14 70362113 73912204 NA -0.182879930738387 3 | "2" "TCGA-BL-A0C8-01A-11D-A10R-02" 9 115609546 131133898 NA 0.0396751622235396 4 | "5" "TCGA-BL-A13I-01A-11D-A13U-02" 13 19020028 49129100 NA 0.00208555197637913 5 | "6" "TCGA-BL-A13I-01A-11D-A13U-02" 1 10208 246409808 NA -0.0142247519016688 6 | "9" "TCGA-BL-A13J-01A-11D-A10R-02" 23 3119586 5636448 NA 0.877072555244314 7 | "10" "TCGA-BL-A13J-01A-11D-A10R-02" 7 10127 35776912 NA 0.113873871106118 8 | "13" "TCGA-BL-A13J-11A-13D-A10R-02" 13 27638070 28786211 NA 0.135933760992049 9 | "14" "TCGA-BL-A13J-11A-13D-A10R-02" 13 31546838 31837888 NA 0.831083871851924 10 | "17" "TCGA-BL-A3JM-11A-31D-A21C-26" 6 63814 171050932 NA -0.222089556733618 11 | "18" "TCGA-BL-A3JM-11A-31D-A21C-26" 19 70880 12247668 NA 0.212002033643438 12 | "21" "TCGA-BT-A0S7-10A-01D-A10R-02" 8 18542416 43427219 NA -0.109425720019835 13 | "22" "TCGA-BT-A0S7-10A-01D-A10R-02" 12 129318803 129977811 NA -0.750308530005678 14 | "25" "TCGA-BT-A0YX-10A-01D-A10R-02" 1 100222446 100622130 NA 0.260037976378174 15 | "26" "TCGA-BT-A0YX-10A-01D-A10R-02" 16 19103580 90294729 NA -0.0243020219348178 16 | "29" "TCGA-BT-A20N-11A-11D-A14U-02" 23 144220182 151288847 NA -0.218732574411559 17 | "30" "TCGA-BT-A20N-11A-11D-A14U-02" 11 126948609 128472558 NA 
0.147130578528114 18 | "33" "TCGA-BT-A20O-11A-11D-A14U-02" 19 27741477 47326510 NA 0.218065351393204 19 | "34" "TCGA-BT-A20O-11A-11D-A14U-02" 7 38394968 142360699 NA 0.00783656168862701 20 | "37" "TCGA-BT-A20P-11A-11D-A14U-02" 19 70880 59118869 NA -0.0585214412378456 21 | "38" "TCGA-BT-A20P-11A-11D-A14U-02" 5 141615867 162228885 NA -0.0573410519735708 22 | "41" "TCGA-BT-A20Q-11A-11D-A14U-02" 1 65116406 105786887 NA -0.0137743863413686 23 | "42" "TCGA-BT-A20Q-11A-11D-A14U-02" 6 63814 143557977 NA -0.0263569581064626 24 | "45" "TCGA-BT-A20R-11A-11D-A16N-02" 4 159329222 184058329 NA -0.285766564893208 25 | "46" "TCGA-BT-A20R-11A-11D-A16N-02" 23 2699503 154930285 NA -0.0507782046619671 26 | "49" "TCGA-BT-A20T-11A-11D-A14U-02" 21 9411344 48119869 NA -0.00912023316078493 27 | "50" "TCGA-BT-A20T-11A-11D-A14U-02" 2 143209111 153872733 NA -0.28106978566767 28 | "53" "TCGA-BT-A20U-11A-11D-A14U-02" 2 10001 243189359 NA -0.0085274424194125 29 | "54" "TCGA-BT-A20U-11A-11D-A14U-02" 5 11769 180905246 NA -0.00773472005569888 30 | "57" "TCGA-BT-A20W-11A-11D-A14U-02" 1 187465256 201152272 NA -0.161874068627793 31 | "58" "TCGA-BT-A20W-11A-11D-A14U-02" 22 16051206 22110145 NA 0.212805544089224 32 | "61" "TCGA-BT-A20X-11A-12D-A16N-02" 17 1 21505693 NA -0.341408571718122 33 | "62" "TCGA-BT-A20X-11A-12D-A16N-02" 4 68818 41597686 NA -0.129306283769583 34 | "65" "TCGA-BT-A2LA-11A-11D-A18D-02" 20 56213353 56783730 NA 0.285173876119266 35 | "66" "TCGA-BT-A2LA-11A-11D-A18D-02" 2 133819934 243189359 NA -0.635651500181676 36 | "69" "TCGA-BT-A2LB-10A-01D-A18D-02" 23 58318413 62045719 NA 0.521363874910562 37 | "70" "TCGA-BT-A2LB-10A-01D-A18D-02" 12 60730 34560974 NA 0.179375958983952 38 | "73" "TCGA-BT-A2LD-01A-12D-A210-26" 22 40591555 51244552 NA -0.115569189056885 39 | "74" "TCGA-BT-A2LD-01A-12D-A210-26" 8 72056431 86554763 NA -0.254362985423551 40 | "77" "TCGA-BT-A3PH-01A-11D-A221-26" 5 49559700 180905246 NA -0.251117604050586 41 | "78" "TCGA-BT-A3PH-01A-11D-A221-26" 12 38440830 133841505 NA 
0.0199294370182946 42 | "81" "TCGA-BT-A3PJ-01A-21D-A221-26" 7 70562186 71417321 NA 0.376852503791604 43 | "82" "TCGA-BT-A3PJ-01A-21D-A221-26" 8 125999777 146303846 NA -0.503864205599995 44 | "85" "TCGA-BT-A3PK-01A-21D-A221-26" 4 73332096 78163988 NA 0.941466762482077 45 | "86" "TCGA-BT-A3PK-01A-21D-A221-26" 10 61818 78260947 NA 0.0574869749635298 46 | "89" "TCGA-C4-A0F0-01A-12D-A10R-02" 12 38009613 82438515 NA -0.368132483970951 47 | "90" "TCGA-C4-A0F0-01A-12D-A10R-02" 16 32562492 33969362 NA -0.303646313844007 48 | "93" "TCGA-C4-A0F6-01A-11D-A10R-02" 6 63814 2319181 NA -0.121211466525656 49 | "94" "TCGA-C4-A0F6-01A-11D-A10R-02" 23 80225640 82442603 NA 0.150857739370916 50 | "97" "TCGA-CF-A1HR-01A-11D-A13U-02" 7 856949 17999178 NA 0.0848758525580564 51 | "98" "TCGA-CF-A1HR-01A-11D-A13U-02" 4 121888639 123874924 NA -0.424295799956765 52 | "101" "TCGA-CF-A1HS-01A-11D-A13U-02" 21 9422166 10021274 NA 0.832013365283215 53 | "102" "TCGA-CF-A1HS-01A-11D-A13U-02" 10 100081385 135524732 NA -0.080306080581503 54 | "105" "TCGA-CF-A27C-01A-11D-A16N-02" 24 2649474 28783838 NA 0.100350641151954 55 | "106" "TCGA-CF-A27C-01A-11D-A16N-02" 13 72125634 115109864 NA 0.0476642681135593 56 | "109" "TCGA-CF-A3MF-01A-12D-A21C-26" 13 19020028 115109864 NA -0.0962439892452074 57 | "110" "TCGA-CF-A3MF-01A-12D-A21C-26" 14 19141347 107289526 NA -0.126569530580431 58 | "113" "TCGA-CF-A3MG-01A-11D-A210-26" 9 1461804 2510267 NA -0.311690759521937 59 | "114" "TCGA-CF-A3MG-01A-11D-A210-26" 9 80421293 141153413 NA -0.585006712032954 60 | "117" "TCGA-CF-A3MH-01A-11D-A210-26" 1 10208 249240606 NA 0.0143519914953139 61 | "118" "TCGA-CF-A3MH-01A-11D-A210-26" 16 60001 90294729 NA 0.00941993371591381 62 | "121" "TCGA-CF-A3MI-01A-11D-A210-26" 12 42897585 43131785 NA 1.42565689631929 63 | "122" "TCGA-CF-A3MI-01A-11D-A210-26" 8 20441971 21087370 NA 0.0549983590579611 64 | "125" "TCGA-CU-A0YN-01A-21D-A10R-02" 8 23379 39801453 NA -0.310204325025844 65 | "126" "TCGA-CU-A0YN-01A-21D-A10R-02" 15 74139211 74241947 
NA 0.642523331815196 66 | "129" "TCGA-CU-A0YO-01A-11D-A10R-02" 20 60001 62965506 NA 0.198193566579685 67 | "130" "TCGA-CU-A0YO-01A-11D-A10R-02" 12 169006 2861395 NA 0.207262247259086 68 | "133" "TCGA-CU-A0YR-01A-12D-A10R-02" 7 79955020 104117560 NA -0.377495201658043 69 | "134" "TCGA-CU-A0YR-01A-12D-A10R-02" 11 48389139 50402704 NA 0.334858333412045 70 | "137" "TCGA-CU-A0YR-11A-13D-A10R-02" 8 23379 17786984 NA -0.755070670539717 71 | "138" "TCGA-CU-A0YR-11A-13D-A10R-02" 8 123319655 125120728 NA 1.3772493355834 72 | "141" "TCGA-CU-A3KJ-10A-01D-A21C-26" 16 31390499 32511983 NA -0.521629286457645 73 | "142" "TCGA-CU-A3KJ-10A-01D-A21C-26" 16 16731215 18769004 NA -1.2494171348026 74 | "145" "TCGA-CU-A3QU-10B-01D-A233-26" 2 208994397 243189359 NA -1.52996811774475 75 | "146" "TCGA-CU-A3QU-10B-01D-A233-26" 1 10208 49541384 NA 0.460765725702518 76 | "149" "TCGA-CU-A3YL-10A-01D-A233-26" 2 239629318 243189359 NA -0.701322622714515 77 | "150" "TCGA-CU-A3YL-10A-01D-A233-26" 3 71088642 71453135 NA -1.86922073857518 78 | "153" "TCGA-DK-A1A3-10A-01D-A13U-02" 6 16613611 19925191 NA -0.363448575873029 79 | "154" "TCGA-DK-A1A3-10A-01D-A13U-02" 19 28118887 28731289 NA 0.376931649965926 80 | "157" "TCGA-DK-A1A5-10A-01D-A13U-02" 3 71815322 130634593 NA -0.031816052401891 81 | "158" "TCGA-DK-A1A5-10A-01D-A13U-02" 2 72065672 72886364 NA -0.310663081153216 82 | "161" "TCGA-DK-A1A6-10A-01D-A13U-02" 17 28962289 32213654 NA 0.62211927661298 83 | "162" "TCGA-DK-A1A6-10A-01D-A13U-02" 5 14580750 15469477 NA -1.24169638329031 84 | "165" "TCGA-DK-A1A7-10A-01D-A13U-02" 18 66468600 78017233 NA -0.535997156763619 85 | "166" "TCGA-DK-A1A7-10A-01D-A13U-02" 9 10001 23301195 NA -0.125621658141106 86 | "169" "TCGA-DK-A1AA-10A-01D-A13U-02" 7 105930243 159128640 NA -0.00385890425740978 87 | "170" "TCGA-DK-A1AA-10A-01D-A13U-02" 6 475695 93983427 NA -0.00845643217356545 88 | "173" "TCGA-DK-A1AB-10A-01D-A13U-02" 6 63814 171050932 NA -0.373717242247084 89 | "174" "TCGA-DK-A1AB-10A-01D-A13U-02" 21 9422166 
48119869 NA 0.0682825854749836 90 | "177" "TCGA-DK-A1AC-10A-01D-A13U-02" 16 57189726 79008568 NA -0.525391161119694 91 | "178" "TCGA-DK-A1AC-10A-01D-A13U-02" 1 10208 15244115 NA 0.520556297903463 92 | "181" "TCGA-DK-A1AD-10A-01D-A13U-02" 1 159676413 161070225 NA 1.86289656474361 93 | "182" "TCGA-DK-A1AD-10A-01D-A13U-02" 13 19020028 20299513 NA 0.127958459808601 94 | "185" "TCGA-DK-A1AE-10A-01D-A13U-02" 16 63147135 63177728 NA -1.63212120821474 95 | "186" "TCGA-DK-A1AE-10A-01D-A13U-02" 10 117242386 135524732 NA 0.0308715661449148 96 | "189" "TCGA-DK-A1AG-10A-01D-A13U-02" 20 60001 4556887 NA -0.00203031673746495 97 | "190" "TCGA-DK-A1AG-10A-01D-A13U-02" 15 20000001 102521366 NA 0.0182477385688014 98 | "193" "TCGA-DK-A2HX-10A-01D-A18D-02" 7 8883170 9814709 NA 0.0273305076343656 99 | "194" "TCGA-DK-A2HX-10A-01D-A18D-02" 6 66048503 66741301 NA 0.0979709656531481 100 | "197" "TCGA-DK-A2I1-10A-01D-A17R-02" 4 175233634 178082237 NA -0.0101897136371403 101 | "198" "TCGA-DK-A2I1-10A-01D-A17R-02" 1 104056927 104090604 NA 0.77892793161476 102 | "201" "TCGA-DK-A2I2-10A-01D-A17R-02" 18 10064 18510945 NA 0.036105889090774 103 | "202" "TCGA-DK-A2I2-10A-01D-A17R-02" 4 71725144 74690959 NA 0.161070154821952 104 | "205" "TCGA-DK-A2I6-10A-01D-A18D-02" 12 64764820 66505825 NA -0.290763281991839 105 | "206" "TCGA-DK-A2I6-10A-01D-A18D-02" 1 235594502 249240606 NA -0.18624845048347 106 | "209" "TCGA-DK-A3IK-10A-01D-A21C-26" 20 39116327 40164928 NA 0.524610262981439 107 | "210" "TCGA-DK-A3IK-10A-01D-A21C-26" 16 5352600 7575607 NA -0.004329967791247 108 | "213" "TCGA-DK-A3IL-10A-01D-A210-26" 11 87224559 99891593 NA 0.595682235146572 109 | "214" "TCGA-DK-A3IL-10A-01D-A210-26" 16 35155273 46459008 NA 1.75030804297734 110 | "217" "TCGA-DK-A3IN-10A-01D-A210-26" 3 31451568 37904707 NA 0.122265816847628 111 | "218" "TCGA-DK-A3IN-10A-01D-A210-26" 4 115198041 191014415 NA -0.197608781117688 112 | "221" "TCGA-DK-A3IQ-10A-01D-A210-26" 20 60001 18388033 NA 0.115949737357392 113 | "222" 
"TCGA-DK-A3IQ-10A-01D-A210-26" 9 3251728 4485530 NA -0.250696214306595 114 | "225" "TCGA-DK-A3IT-10A-01D-A210-26" 12 26283974 26505903 NA 0.73808981078023 115 | "226" "TCGA-DK-A3IT-10A-01D-A210-26" 1 10208 4234646 NA -0.116563794362821 116 | "229" "TCGA-DK-A3IU-10A-01D-A210-26" 12 37991253 38053526 NA 1.87478993183408 117 | "230" "TCGA-DK-A3IU-10A-01D-A210-26" 17 25301752 81195162 NA -0.137542875623982 118 | "233" "TCGA-DK-A3IV-10A-01D-A21C-26" 8 65187706 65225497 NA 1.28329726536649 119 | "234" "TCGA-DK-A3IV-10A-01D-A21C-26" 22 17156707 48633608 NA -0.494888197759455 120 | "237" "TCGA-DK-A3WX-10A-01D-A233-26" 15 20000001 102521366 NA 0.079847704874919 121 | "238" "TCGA-DK-A3WX-10A-01D-A233-26" 6 63814 171050932 NA 0.0487661224113239 122 | "241" "TCGA-DK-A3WY-10A-01D-A233-26" 14 106177004 107289526 NA -0.280540869169745 123 | "242" "TCGA-DK-A3WY-10A-01D-A233-26" 11 195901 134946455 NA -0.00181591314713781 124 | "245" "TCGA-DK-A3X1-10A-01D-A233-26" 3 10988589 11874432 NA 0.741599099191288 125 | "246" "TCGA-DK-A3X1-10A-01D-A233-26" 19 13471712 24513884 NA -0.103991232721606 126 | "249" "TCGA-DK-A3X2-10A-01D-A233-26" 4 58691318 59886313 NA -0.769692286553588 127 | "250" "TCGA-DK-A3X2-10A-01D-A233-26" 14 42263552 86829169 NA 0.232667267761356 128 | "253" "TCGA-E5-A2PC-10B-01D-A204-02" 10 61818 5025173 NA -0.130858593157441 129 | "254" "TCGA-E5-A2PC-10B-01D-A204-02" 4 145764691 151076668 NA -0.637674731273411 130 | "257" "TCGA-E7-A3X6-10A-01D-A233-26" 23 45054413 102902405 NA 0.113174219539276 131 | "258" "TCGA-E7-A3X6-10A-01D-A233-26" 9 10001 21965768 NA -0.526924136501349 132 | "261" "TCGA-E7-A3Y1-10A-01D-A233-26" 6 63814 171050932 NA -0.0392434632506376 133 | "262" "TCGA-E7-A3Y1-10A-01D-A233-26" 1 142825886 249240606 NA 0.47588858781785 134 | "265" "TCGA-FD-A3B3-10A-01D-A204-02" 17 57789929 81195162 NA -0.0358779386299791 135 | "266" "TCGA-FD-A3B3-10A-01D-A204-02" 2 78689086 87740684 NA -0.225400054573825 136 | "269" "TCGA-FD-A3B4-10A-01D-A204-02" 7 41117608 90058582 
NA -0.0454754926077756 137 | "270" "TCGA-FD-A3B4-10A-01D-A204-02" 20 60001 62965506 NA -0.0293578026480229 138 | "273" "TCGA-FD-A3B5-10A-01D-A210-26" 23 7785329 154930285 NA 0.31861949381711 139 | "274" "TCGA-FD-A3B5-10A-01D-A210-26" 1 144531908 249240606 NA 0.195129750882562 140 | "277" "TCGA-FD-A3B6-10A-01D-A210-26" 24 2649474 28783838 NA -0.0816521284861819 141 | "278" "TCGA-FD-A3B6-10A-01D-A210-26" 10 65768355 67062950 NA 0.708790577327999 142 | "281" "TCGA-FD-A3B7-10A-01D-A210-26" 4 148104418 152605747 NA 0.440942926444969 143 | "282" "TCGA-FD-A3B7-10A-01D-A210-26" 6 63814 5660280 NA -0.156037696605089 144 | "285" "TCGA-FD-A3B8-10A-01D-A210-26" 16 60001 90294729 NA -0.0273358821397191 145 | "286" "TCGA-FD-A3B8-10A-01D-A210-26" 23 2699503 154930285 NA 0.0327613575222251 146 | "289" "TCGA-FD-A3N5-10A-01D-A21C-26" 6 44317165 171050932 NA -0.0955242025861752 147 | "290" "TCGA-FD-A3N5-10A-01D-A21C-26" 15 20000001 32010494 NA -0.545547383873378 148 | "293" "TCGA-FD-A3N6-10A-01D-A21C-26" 3 60362957 60478746 NA -2.61639502153454 149 | "294" "TCGA-FD-A3N6-10A-01D-A21C-26" 20 34183971 62965506 NA -0.0596705976054726 150 | "297" "TCGA-FD-A3NA-10A-01D-A21C-26" 5 49551633 150135519 NA -0.0942552581652406 151 | "298" "TCGA-FD-A3NA-10A-01D-A21C-26" 4 126241915 129194848 NA -0.433257831498876 152 | "301" "TCGA-FD-A3SJ-10A-01D-A233-26" 9 27489874 29746925 NA 0.169912427768146 153 | "302" "TCGA-FD-A3SJ-10A-01D-A233-26" 1 173792657 175670239 NA 1.24471530069067 154 | "305" "TCGA-FD-A3SL-10A-01D-A233-26" 12 188219 133841505 NA -0.182399287422105 155 | "306" "TCGA-FD-A3SL-10A-01D-A233-26" 4 189693990 191014415 NA -0.427894104774698 156 | "309" "TCGA-FD-A3SM-10A-01D-A233-26" 4 68818 191014415 NA 0.0146938161241947 157 | "310" "TCGA-FD-A3SM-10A-01D-A233-26" 8 37588240 38564934 NA 1.22494145713849 158 | "313" "TCGA-FD-A3SN-10A-01D-A233-26" 5 58768314 59103278 NA 0.679421511035303 159 | "314" "TCGA-FD-A3SN-10A-01D-A233-26" 13 27050102 87833143 NA -0.324319276758869 160 | "317" 
"TCGA-FD-A3SO-10A-01D-A233-26" 8 23379 48419845 NA -0.434005574572034 161 | "318" "TCGA-FD-A3SO-10A-01D-A233-26" 9 10001 141153413 NA 0.058427449099493 162 | "321" "TCGA-FD-A3SP-10A-01D-A233-26" 8 43092750 43097417 NA 3.04336277414551 163 | "322" "TCGA-FD-A3SP-10A-01D-A233-26" 18 10064 19584050 NA 0.0715882620781257 164 | "325" "TCGA-FD-A3SQ-10A-01D-A233-26" 1 174801569 178932598 NA 0.250005637997436 165 | "326" "TCGA-FD-A3SQ-10A-01D-A233-26" 9 93619293 94000052 NA -0.873309344786371 166 | "329" "TCGA-FD-A3SR-10A-01D-A233-26" 18 9394393 10349109 NA 0.530418349466572 167 | "330" "TCGA-FD-A3SR-10A-01D-A233-26" 1 145367168 147730635 NA 1.23563497416252 168 | "333" "TCGA-FD-A3SS-10A-01D-A233-26" 5 46389437 175639955 NA -0.538298365219699 169 | "334" "TCGA-FD-A3SS-10A-01D-A233-26" 6 121407729 122145390 NA 0.645298912027161 170 | "337" "TCGA-FT-A3EE-10A-01D-A204-02" 4 33708672 34503662 NA 0.105441838837718 171 | "338" "TCGA-FT-A3EE-10A-01D-A204-02" 22 16051206 51244552 NA -0.261788604248097 172 | "341" "TCGA-G2-A2EC-10A-01D-A17R-02" 24 2649474 58844021 NA 0.0913301706855381 173 | "342" "TCGA-G2-A2EC-10A-01D-A17R-02" 5 155105650 179550250 NA -0.44866303953823 174 | "345" "TCGA-G2-A2EF-10A-01D-A18D-02" 8 48180587 90329793 NA -0.0446394091409028 175 | "346" "TCGA-G2-A2EF-10A-01D-A18D-02" 11 66880142 134946455 NA -0.0250633239777246 176 | "349" "TCGA-G2-A2EJ-10A-01D-A17R-02" 2 32222860 32415046 NA -0.617629467331965 177 | "350" "TCGA-G2-A2EJ-10A-01D-A17R-02" 11 78758057 81264883 NA -0.576249020003528 178 | "353" "TCGA-G2-A2EK-10A-01D-A18D-02" 3 148076113 149533806 NA -0.436122467986259 179 | "354" "TCGA-G2-A2EK-10A-01D-A18D-02" 1 65888453 144005542 NA -0.0245890959067118 180 | "357" "TCGA-G2-A2EL-10A-01D-A18D-02" 3 60174 25828606 NA 0.136988592875077 181 | "358" "TCGA-G2-A2EL-10A-01D-A18D-02" 5 11769 31466816 NA 0.068512529538564 182 | "361" "TCGA-G2-A2ES-11A-31D-A17R-02" 5 45901673 49555334 NA 0.45574195724214 183 | "362" "TCGA-G2-A2ES-11A-31D-A17R-02" 11 26207298 30218540 
NA -0.000160033704997661 184 | "365" "TCGA-G2-A3IB-10A-01D-A210-26" 16 60001 223583 NA 0.561251033599773 185 | "366" "TCGA-G2-A3IB-10A-01D-A210-26" 8 43428314 46921966 NA 0.475207847898585 186 | "369" "TCGA-G2-A3IE-10A-01D-A210-26" 3 39612761 60264981 NA 0.495559264888242 187 | "370" "TCGA-G2-A3IE-10A-01D-A210-26" 4 93594714 173425586 NA -0.293160662895868 188 | "373" "TCGA-G2-A3VY-10A-01D-A233-26" 9 109973701 132903224 NA -0.386039301234847 189 | "374" "TCGA-G2-A3VY-10A-01D-A233-26" 12 188219 1880680 NA -0.0591011865665831 190 | "377" "TCGA-GC-A3BM-10A-01D-A23Q-26" 18 39622999 40749298 NA -0.444429050209849 191 | "378" "TCGA-GC-A3BM-10A-01D-A23Q-26" 11 81013505 81387487 NA 1.47813863932257 192 | "381" "TCGA-GC-A3I6-01A-11D-A210-26" 14 20424940 107289526 NA 0.131404191409226 193 | "382" "TCGA-GC-A3I6-01A-11D-A210-26" 12 188219 34560974 NA 0.142121978063855 194 | "385" "TCGA-GC-A3OO-01A-11D-A233-26" 1 10208 249240606 NA 0.0527960405418375 195 | "386" "TCGA-GC-A3OO-01A-11D-A233-26" 20 60001 62965506 NA 0.212520376173027 196 | "389" "TCGA-GC-A3RD-01A-12D-A233-26" 12 69198316 69372087 NA 4.35545916730313 197 | "390" "TCGA-GC-A3RD-01A-12D-A233-26" 7 32425044 41415395 NA 0.852549524688694 198 | "393" "TCGA-GC-A3WC-01A-31D-A233-26" 8 23379 3275420 NA -0.337312402464662 199 | "394" "TCGA-GC-A3WC-01A-31D-A233-26" 21 37514100 38576723 NA 0.697730704202874 200 | "397" "TCGA-GC-A3WC-11A-11D-A233-26" 4 66456541 67063379 NA 0.415428401579314 201 | "398" "TCGA-GC-A3WC-11A-11D-A233-26" 8 43786144 146303846 NA 0.228660175848778 202 | "401" "TCGA-GD-A2C5-10A-01D-A17R-02" 11 31360720 31984769 NA -0.0806177664337259 203 | "402" "TCGA-GD-A2C5-10A-01D-A17R-02" 16 12144408 17594332 NA -0.449976633028893 204 | "405" "TCGA-GD-A3OP-01A-21D-A221-26" 11 195901 134946455 NA -0.103725067423542 205 | "406" "TCGA-GD-A3OP-01A-21D-A221-26" 5 46389437 180905246 NA -0.0997474434064362 206 | "409" "TCGA-GD-A3OP-11A-11D-A221-26" 6 63814 42759519 NA -0.103720485767818 207 | "410" 
"TCGA-GD-A3OP-11A-11D-A221-26" 16 34196882 90294729 NA -0.220063223344308 208 | "413" "TCGA-GD-A3OQ-10A-01D-A221-26" 17 43673826 44212734 NA -0.721342656508171 209 | "414" "TCGA-GD-A3OQ-10A-01D-A221-26" 16 60001 90294729 NA -0.0279590914074535 210 | "417" "TCGA-GD-A3OS-01A-12D-A221-26" 10 24549423 25199875 NA 0.0797066752625248 211 | "418" "TCGA-GD-A3OS-01A-12D-A221-26" 16 33241258 34459387 NA -0.232184795634823 212 | "421" "TCGA-GV-A3JV-01A-11D-A221-26" 14 71221735 71931850 NA 1.32760392205331 213 | "422" "TCGA-GV-A3JV-01A-11D-A221-26" 12 60730 19244878 NA -0.162214454113409 214 | "425" "TCGA-GV-A3JW-01A-11D-A210-26" 23 128050183 154930285 NA 0.283849828370155 215 | "426" "TCGA-GV-A3JW-01A-11D-A210-26" 16 25856506 26583505 NA 0.777314229943503 216 | "429" "TCGA-GV-A3JX-01A-11D-A210-26" 22 16051206 51244552 NA -0.756450574913272 217 | "430" "TCGA-GV-A3JX-01A-11D-A210-26" 15 22697825 102521366 NA -0.791298494359646 218 | "433" "TCGA-GV-A3JZ-01A-11D-A21C-26" 17 12150816 12759218 NA 0.509791521326194 219 | "434" "TCGA-GV-A3JZ-01A-11D-A21C-26" 5 23123410 27184334 NA -0.163013211330329 220 | "437" "TCGA-GV-A3QF-01A-31D-A233-26" 1 26509374 59339528 NA -0.0952285398758193 221 | "438" "TCGA-GV-A3QF-01A-31D-A233-26" 3 60744709 93533249 NA 0.151052036455775 222 | "441" "TCGA-GV-A3QG-01A-11D-A221-26" 5 11769 53060372 NA -0.0423279562882088 223 | "442" "TCGA-GV-A3QG-01A-11D-A221-26" 2 14483090 243189359 NA -0.031985504966719 224 | "445" "TCGA-GV-A3QH-01A-11D-A221-26" 16 62764166 63596223 NA -0.391701952787151 225 | "446" "TCGA-GV-A3QH-01A-11D-A221-26" 4 129845569 130791851 NA 0.544607886025542 226 | "449" "TCGA-H4-A2HQ-01A-11D-A17R-02" 16 3887120 8290659 NA -0.212174077026258 227 | "450" "TCGA-H4-A2HQ-01A-11D-A17R-02" 1 62972796 64923432 NA 1.05481458964742 228 | "453" "TCGA-HQ-A2OE-01A-11D-A204-02" 10 104951493 105299191 NA -0.271937682930961 229 | "454" "TCGA-HQ-A2OE-01A-11D-A204-02" 5 52184783 52518798 NA -0.230021318992407 230 | "457" "TCGA-K4-A3WS-01A-11D-A23Q-26" 20 
26213766 46534498 NA 0.374258403148417 231 | "458" "TCGA-K4-A3WS-01A-11D-A23Q-26" 7 7788999 38637598 NA -0.0778093714176013 232 | "461" "TCGA-K4-A3WV-01A-11D-A23Q-26" 10 76962861 77154224 NA 0.176828116408713 233 | "462" "TCGA-K4-A3WV-01A-11D-A23Q-26" 6 6016367 6589128 NA -0.46322899113314 234 | -------------------------------------------------------------------------------- /inst/extdata/bt.exon_quant.R: -------------------------------------------------------------------------------- 1 | ## Download example dataset from legacy archive 2 | if (!requireNamespace("GenomicDataCommons")) 3 | stop("Please download 'GenomicDataCommons' to update file") 4 | 5 | library(GenomicDataCommons) 6 | 7 | manifile <- files() |> 8 | filter(~ file_id == "d56a5dec-cb55-457f-8d93-dd1f3911ae9f") |> 9 | manifest() 10 | 11 | gdcdata(manifile[["id"]], use_cached = TRUE) 12 | 13 | flist <- list.files(gdc_cache(), pattern = "cation.txt$", recursive = TRUE, 14 | full.names = TRUE) 15 | flist <- flist[grepl("^unc", basename(flist))] 16 | 17 | exonFile <- "bt.exon_quantification.txt" 18 | file.rename(flist, exonFile) 19 | 20 | exonEx <- read.delim(exonFile, nrows = 100) 21 | 22 | write.table(exonEx, file.path("inst", "extdata", basename(exonFile)), 23 | sep = "\t", row.names = FALSE) 24 | -------------------------------------------------------------------------------- /inst/extdata/bt.exon_quantification.txt: -------------------------------------------------------------------------------- 1 | "exon" "raw_counts" "median_length_normalized" "RPKM" 2 | "chr1:11874-12227:+" 4 0.4929178 0.322476823123937 3 | "chr1:12595-12721:+" 2 0.3412699 0.449436202306589 4 | "chr1:12613-12721:+" 2 0.3981481 0.523655024705842 5 | "chr1:12646-12697:+" 2 0.372549 1.09766149409494 6 | "chr1:13221-14409:+" 39 0.6329966 0.936104924316458 7 | "chr1:13403-14409:+" 36 0.6192843 1.02026927355796 8 | "chr1:14363-16765:-" 1033 0.9941715 12.2684113226808 9 | "chr1:16854-17055:-" 249 1 35.1795074889635 10 | 
"chr1:17233-18061:-" 503 0.9649758 17.3163052108246 11 | "chr1:18268-18379:-" 132 1 33.6354843547663 12 | "chr1:18497-18554:-" 27 1 13.2854891181836 13 | "chr1:18913-19759:-" 277 1 9.33336255073406 14 | "chr1:24738-24901:-" 65 0.9570552 11.3112678354905 15 | "chr1:29321-29370:-" 8 0.5714286 4.56627181543495 16 | "chr1:29824-29961:-" 1 0.5547445 0.206805788742525 17 | "chr1:34612-35174:-" 1 0.1352313 0.0506912945763205 18 | "chr1:35277-35481:-" 0 0 0 19 | "chr1:35721-36081:-" 0 0 0 20 | "chr1:69091-70008:+" 0 0 0 21 | "chr1:89295-90404:-" 37 0.7321911 0.951306628215614 22 | "chr1:137839-139228:-" 419 0.9971202 8.60282324940307 23 | "chr1:236615-237877:-" 14 0.5380349 0.31634899750638 24 | "chr1:321084-321114:+" 45 1 41.4278692932606 25 | "chr1:321146-321223:+" 41 1 15.0013737526308 26 | "chr1:322037-322228:+" 29 1 4.310608159102 27 | "chr1:323892-324060:+" 22 1 3.71516198001364 28 | "chr1:324288-324345:+" 3 0.7368421 1.47616545757595 29 | "chr1:324439-328580:+" 1539 0.9951702 10.6040142502933 30 | "chr1:367659-368595:+" 0 0 0 31 | "chr1:420206-420296:+" 0 0 0 32 | "chr1:420992-421258:+" 0 0 0 33 | "chr1:421396-421839:+" 0 0 0 34 | "chr1:566462-568045:+" 149133 1 2686.95476109241 35 | "chr1:568149-568842:+" 24079 1 990.19505623071 36 | "chr1:568844-568912:+" 1157 1 478.548595150202 37 | "chr1:569327-570349:+" 97615 0.9589041 2723.21983909874 38 | "chr1:621098-622034:-" 0 0 0 39 | "chr1:661140-665184:-" 1518 0.9967853 10.7101369218638 40 | "chr1:665278-665335:-" 13 1 6.39671698282913 41 | "chr1:665563-665731:-" 19 1 3.20854898273905 42 | "chr1:667397-667587:-" 38 1 5.67795579144398 43 | "chr1:668402-668479:-" 47 1 17.1966967408207 44 | "chr1:668511-668541:-" 47 1 43.2691079285166 45 | "chr1:668687-668744:-" 13 1 6.39671698282913 46 | "chr1:670803-670994:-" 31 0.9947644 4.60789148041938 47 | "chr1:671808-671885:-" 41 1 15.0013737526308 48 | "chr1:671917-671947:-" 11 1 10.1268124939081 49 | "chr1:674240-674404:-" 38 1 6.57266397676242 50 | "chr1:675183-675415:-" 92 1 
11.2686965402365 51 | "chr1:675509-675566:-" 7 1 3.44438606767722 52 | "chr1:678666-678730:-" 7 1 3.07345218346583 53 | "chr1:679575-679736:-" 14 1 2.46635051759604 54 | "chr1:700237-700627:-" 20 0.9461538 1.45980556759429 55 | "chr1:701709-701767:-" 2 1 0.967430469371811 56 | "chr1:703928-703993:-" 3 0.7230769 1.29723631120311 57 | "chr1:704877-705092:-" 13 1 1.71763696761152 58 | "chr1:708356-708487:-" 13 1 2.8106786742734 59 | "chr1:709551-709660:-" 14 1 3.63226167136871 60 | "chr1:713664-714006:-" 13 0.7982456 1.08166059767956 61 | "chr1:761587-762902:-" 29 0.7315589 0.628903318045277 62 | "chr1:763064-763155:+" 8 1 2.4816694649103 63 | "chr1:764383-764484:+" 8 1 2.23836853697791 64 | "chr1:783034-783186:+" 3 0.4736842 0.559592134244479 65 | "chr1:787307-787490:+" 16 1 2.4816694649103 66 | "chr1:788051-788146:+" 11 1 3.27011653449117 67 | "chr1:788771-789740:+" 91 0.9649123 2.67738875776147 68 | "chr1:791898-794579:+" 31 0.460276 0.32987142589132 69 | "chr1:803453-804055:-" 8 0.3056479 0.378629503767409 70 | "chr1:809492-810535:-" 1 0.0728667 0.0273363973625176 71 | "chr1:812126-812182:-" 0 0 0 72 | "chr1:846815-846853:+" 0 0 0 73 | "chr1:847325-850328:+" 8 0.1714952 0.0760031926670264 74 | "chr1:852953-853100:-" 3 0.8639456 0.5784972739149 75 | "chr1:853402-853555:-" 3 0.6732026 0.555958419087047 76 | "chr1:854205-854295:-" 3 1 0.940852709224233 77 | "chr1:854715-854817:-" 3 0.9509804 0.831238801353449 78 | "chr1:860530-860569:+" 0 0 0 79 | "chr1:861121-861180:+" 0 0 0 80 | "chr1:861302-861393:+" 4 1 1.24083473245515 81 | "chr1:865535-865716:+" 6 1 0.940852709224233 82 | "chr1:866419-866469:+" 2 0.74 1.11918426848896 83 | "chr1:871152-871276:+" 2 0.6129032 0.456627181543495 84 | "chr1:874420-874509:+" 2 0.7303371 0.634204418810409 85 | "chr1:874655-874840:+" 3 0.9081081 0.460309658814007 86 | "chr1:876524-876686:+" 7 0.962963 1.22560976641275 87 | "chr1:877516-877631:+" 12 1 2.9523309151519 88 | "chr1:877790-877868:+" 10 1 3.61255681600866 89 | 
"chr1:877939-878438:+" 23 0.9338678 1.31280314693755 90 | "chr1:878633-878757:+" 2 0.6451613 0.456627181543495 91 | "chr1:879078-879188:+" 8 0.8 2.05687919614187 92 | "chr1:879288-879961:+" 319 1 13.5074249733285 93 | "chr1:879584-880180:-" 446 1 21.3207415167921 94 | "chr1:880422-880526:-" 148 1 40.2266802788317 95 | "chr1:880898-881033:-" 174 1 36.5133867594522 96 | "chr1:881553-881666:-" 181 1 45.3122367650069 97 | "chr1:881782-881925:-" 179 1 35.4758096772073 98 | "chr1:883511-883612:-" 151 1 42.2492061354581 99 | "chr1:883870-883983:-" 155 1 38.8032966772158 100 | "chr1:886507-886618:-" 144 1 36.6932556597451 101 | "chr1:887380-887519:-" 158 1 32.2085244124429 102 | -------------------------------------------------------------------------------- /inst/scripts/clinicalNames.R: -------------------------------------------------------------------------------- 1 | # Locate Clinical datasets for each cancer 2 | # Script used with https://github.com/waldronlab/MultiAssayExperiment-TCGA 3 | 4 | if (!requireNamespace("RTCGAToolbox")) 5 | stop("Install `RTCGAToolbox` to generate 'clinicalNames' data") 6 | 7 | TCGAcodes <- RTCGAToolbox::getFirehoseDatasets() 8 | 9 | excludedCodes <- c("COADREAD", "GBMLGG", "KIPAN", "STES", "FPPP", "CNTL", 10 | "LCML", "MISC") 11 | TCGAcodes <- TCGAcodes[-which(TCGAcodes %in% excludedCodes)] 12 | 13 | myDataDir <- tempdir() 14 | 15 | lapply(TCGAcodes, function(cancer) { 16 | if (!file.exists(file.path(myDataDir, cancer, "clinical.csv"))) { 17 | clinDat <- RTCGAToolbox::getFirehoseData(dataset = cancer, 18 | destdir = myDataDir) 19 | clinFrame <- RTCGAToolbox::getData(clinDat, "clinical") 20 | rownames(clinFrame) <- 21 | TCGAutils:::.standardBarcodes(rownames(clinFrame)) 22 | 23 | dir.create(file.path(myDataDir, cancer)) 24 | 25 | write.csv(clinFrame, file.path(myDataDir, cancer, "clinical.csv")) 26 | message(cancer, " clinical data saved.") 27 | } else { 28 | message(cancer, " clinical data already exists!") 29 | } 30 | }) 31 | 32 | 
names(TCGAcodes) <- TCGAcodes 33 | 34 | clinicalNames <- IRanges::CharacterList(lapply(TCGAcodes, function(cancer) { 35 | clinDat <- read.csv(file.path(myDataDir, cancer, "clinical.csv"), 36 | row.names = 1L) 37 | allNA <- vapply(clinDat, function(col) all(is.na(col)), logical(1L)) 38 | clinDat <- clinDat[, !allNA, drop = FALSE] 39 | names(clinDat)[names(clinDat) != "Composite.Element.REF"] 40 | })) 41 | ## use_data() moved from devtools to usethis; save with base R as the other data scripts do 42 | save(clinicalNames, file = "data/clinicalNames.rda", compress = "bzip2") 43 | -------------------------------------------------------------------------------- /inst/scripts/diseaseCodes.R: -------------------------------------------------------------------------------- 1 | if (!requireNamespace("BiocFileCache")) 2 | stop("Please install 'BiocFileCache' to manage and generate data") 3 | 4 | ## Extract cancer codes from TCGA project 5 | .parseDiseaseCodes <- function(from, to) { 6 | htcc <- xml2::read_html(from) 7 | diseaseCodes <- rvest::html_table(htcc, fill = TRUE)[[2L]] 8 | names(diseaseCodes) <- make.names(colnames(diseaseCodes)) 9 | 10 | excludedCodes <- c("COADREAD", "GBMLGG", "KIPAN", "STES", "FPPP", "CNTL", 11 | "LCML", "MISC") 12 | available <- !diseaseCodes[["Study.Abbreviation"]] %in% excludedCodes 13 | diseaseCodes[["Available"]] <- factor(available, levels = c("TRUE", "FALSE"), 14 | labels = c("Yes", "No")) 15 | 16 | subtypeCodes <- c("ACC", "BLCA", "BRCA", "COAD", "GBM", "HNSC", "KICH", 17 | "KIRC", "KIRP", "LAML", "LGG", "LUAD", "LUSC", "OV", "PRAD", "SKCM", 18 | "STAD", "THCA", "UCEC") 19 | diseaseCodes[["SubtypeData"]] <- factor( 20 | diseaseCodes[["Study.Abbreviation"]] %in% subtypeCodes, 21 | levels = c("TRUE", "FALSE"), labels = c("Yes", "No")) 22 | 23 | diseaseCodes <- diseaseCodes[order(diseaseCodes[["Study.Abbreviation"]]), ] 24 | ## Rearrange column order 25 | diseaseCodes <- diseaseCodes[, 26 | c("Study.Abbreviation", "Available", "SubtypeData", "Study.Name")] 27 | rownames(diseaseCodes) <- NULL 28 | 29 | ## Coerce to standard data.frame (no tibble required) 30 | 
diseaseCodes <- as(diseaseCodes, "data.frame") 31 | 32 | ## For easy subsetting use: 33 | ## diseaseCodes[["Study.Abbreviation"]][diseaseCodes$Available == "Yes"] 34 | 35 | ## Save dataset for exported use 36 | save(diseaseCodes, file = to, compress = "bzip2") 37 | TRUE 38 | } 39 | 40 | .get_cache <- function() { 41 | cache <- rappdirs::user_cache_dir("TCGAutils") 42 | BiocFileCache::BiocFileCache(cache) 43 | } 44 | ## Cache 'fileURL', rebuild it via FUN unless the cache reports it current, copy to data/, return the cache path 45 | update_data_file <- 46 | function(fileURL, verbose = FALSE, resource, ext = ".rda", FUN) { 47 | bfc <- .get_cache() 48 | rid <- BiocFileCache::bfcquery(bfc, fileURL, "rname")$rid 49 | if (!length(rid)) { 50 | if (verbose) 51 | message( "Downloading ", resource, " file" ) 52 | rid <- names(BiocFileCache::bfcadd(bfc, fileURL, download = FALSE, 53 | ext = ext)) 54 | } 55 | if (!isFALSE(BiocFileCache::bfcneedsupdate(bfc, rid))) { 56 | rpath <- BiocFileCache::bfcdownload(bfc, rid, ask = FALSE, 57 | FUN = FUN, ext = ext) 58 | ## copy to data dir after updating 59 | file.copy(rpath, file.path("data", paste0(resource, ext)), 60 | overwrite = TRUE) 61 | } 62 | if (verbose) 63 | message(resource, " update complete") 64 | 65 | BiocFileCache::bfcrpath(bfc, rids = rid) 66 | } 67 | 68 | url1 <- paste0("https://gdc.cancer.gov/resources-tcga-users/", 69 | "tcga-code-tables/tcga-study-abbreviations") 70 | update_data_file(url1, verbose = FALSE, 71 | resource = "diseaseCodes", FUN = .parseDiseaseCodes) 72 | -------------------------------------------------------------------------------- /inst/scripts/sampleTypes.R: -------------------------------------------------------------------------------- 1 | ## Extract sample types table from TCGA website 2 | .parseSampleTypes <- function(from, to) { 3 | stcc <- xml2::read_html(from) 4 | 5 | sampleTypes <- rvest::html_table(stcc, fill = TRUE)[[2L]] 6 | 7 | ## convert code column to character 8 | codeCol <- sampleTypes[["Code"]] 9 | singleDigit <- codeCol < 10L 10 | sampleTypes[["Code"]][singleDigit] <- 11 | paste0("0", 
sampleTypes[["Code"]][singleDigit]) 12 | 13 | names(sampleTypes) <- make.names(colnames(sampleTypes)) 14 | 15 | ## Coerce to standard data.frame (no tibble required) 16 | sampleTypes <- as(sampleTypes, "data.frame") 17 | 18 | ## Save dataset for exported use 19 | save(sampleTypes, file = to, compress = "bzip2") 20 | TRUE 21 | } 22 | 23 | url2 <- 24 | "https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/sample-type-codes" 25 | ## update_data_file is defined in inst/scripts/diseaseCodes.R 26 | update_data_file(url2, verbose = FALSE, resource = "sampleTypes", 27 | FUN = .parseSampleTypes) 28 | -------------------------------------------------------------------------------- /man/ID-translation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ID-translation.R 3 | \name{ID-translation} 4 | \alias{ID-translation} 5 | \alias{UUIDtoBarcode} 6 | \alias{UUIDtoUUID} 7 | \alias{barcodeToUUID} 8 | \alias{filenameToBarcode} 9 | \alias{UUIDhistory} 10 | \title{Translate study identifiers from barcode to UUID and vice versa} 11 | \usage{ 12 | UUIDtoBarcode(id_vector, from_type = c("case_id", "file_id", "aliquot_ids")) 13 | 14 | UUIDtoUUID(id_vector, to_type = c("case_id", "file_id")) 15 | 16 | barcodeToUUID(barcodes) 17 | 18 | filenameToBarcode(filenames, slides = FALSE) 19 | 20 | UUIDhistory(id, endpoint = .HISTORY_ENDPOINT) 21 | } 22 | \arguments{ 23 | \item{id_vector}{character() A vector of UUIDs corresponding to 24 | either files or cases (default assumes case_ids)} 25 | 26 | \item{from_type}{character(1) One of \code{case_id}, \code{file_id}, or \code{aliquot_ids} indicating the 27 | type of \code{id_vector} entered (default \code{"case_id"})} 28 | 29 | \item{to_type}{character(1) The desired UUID type to obtain, can either be 30 | \code{"case_id"} (default) or \code{"file_id"}} 31 | 32 | \item{barcodes}{character() A vector of TCGA barcodes} 33 | 34 | 
\item{filenames}{\code{character()} A vector of file names usually obtained 35 | from a \code{GenomicDataCommons} query} 36 | 37 | \item{slides}{\code{logical(1L)} \strong{DEPRECATED}: Whether the provided file names 38 | correspond to slides typically with an \code{.svs} extension. \strong{Note} The 39 | barcodes returned correspond 1:1 with the \code{filenames} inputs. Always triple 40 | check the output against the Genomic Data Commons Data Portal by searching 41 | the file name and comparing associated "Entity ID" with the \code{submitter_id} 42 | given by the function.} 43 | 44 | \item{id}{character(1) A UUID whose history of versions is sought} 45 | 46 | \item{endpoint}{character(1) Generally a constant pertaining to the location 47 | of the history API endpoint. This argument rarely needs to change.} 48 | } 49 | \value{ 50 | Generally, a \code{data.frame} of identifier mappings 51 | 52 | UUIDhistory: A \code{data.frame} containing a list of associated UUIDs 53 | for the given input along with \code{file_change} status, \code{data_release} 54 | versions, etc. 55 | } 56 | \description{ 57 | These functions allow the user to enter a character vector of 58 | identifiers and use the GDC API to translate from TCGA barcodes to 59 | Universally Unique Identifiers (UUID) and vice versa. These relationships 60 | are not one-to-one. Therefore, a \code{data.frame} is returned for all 61 | inputs. The UUID to TCGA barcode translation only applies to file and case 62 | UUIDs. Two-way UUID translation is available from 'file_id' to 'case_id' 63 | and vice versa. Please double check any results before using these 64 | features for analysis. Case / submitter identifiers are translated by 65 | default, see the \code{from_type} argument for details. All identifiers are 66 | converted to lower case. 67 | } 68 | \details{ 69 | Based on the file UUID supplied, the appropriate entity_id (TCGA barcode) is 70 | returned. 
In previous versions of the package, the 'end_point' parameter 71 | would require the user to specify what type of barcode needed. This is no 72 | longer supported as \code{entity_id} returns the appropriate one. 73 | 74 | When providing slide file names, the function will only work if 75 | \strong{all} the provided files are slide files with an \code{.svs} extension. 76 | } 77 | \examples{ 78 | ## Translate UUIDs >> TCGA Barcode 79 | 80 | uuids <- c("b4bce3ff-7fdc-4849-880b-56f2b348ceac", 81 | "5ca9fa79-53bc-4e91-82cd-5715038ee23e", 82 | "b7c3e5ad-4ffc-4fc4-acbf-1dfcbd2e5382") 83 | 84 | UUIDtoBarcode(uuids, from_type = "file_id") 85 | 86 | UUIDtoBarcode("ae55b2d3-62a1-419e-9f9a-5ddfac356db4", from_type = "case_id") 87 | 88 | UUIDtoBarcode("d85d8a17-8aea-49d3-8a03-8f13141c163b", "aliquot_ids") 89 | 90 | ## Translate file UUIDs >> case UUIDs 91 | 92 | uuids <- c("b4bce3ff-7fdc-4849-880b-56f2b348ceac", 93 | "5ca9fa79-53bc-4e91-82cd-5715038ee23e", 94 | "b7c3e5ad-4ffc-4fc4-acbf-1dfcbd2e5382") 95 | 96 | UUIDtoUUID(uuids) 97 | 98 | ## Translate TCGA Barcode >> UUIDs 99 | 100 | fullBarcodes <- c("TCGA-B0-5117-11A-01D-1421-08", 101 | "TCGA-B0-5094-11A-01D-1421-08", 102 | "TCGA-E9-A295-10A-01D-A16D-09") 103 | 104 | sample_ids <- TCGAbarcode(fullBarcodes, sample = TRUE) 105 | 106 | barcodeToUUID(sample_ids) 107 | 108 | participant_ids <- c("TCGA-CK-4948", "TCGA-D1-A17N", 109 | "TCGA-4V-A9QX", "TCGA-4V-A9QM") 110 | 111 | barcodeToUUID(participant_ids) 112 | 113 | library(GenomicDataCommons) 114 | 115 | ### Query CNV data and get file names 116 | 117 | cnv <- files() |> 118 | filter( 119 | ~ cases.project.project_id == "TCGA-COAD" & 120 | data_category == "Copy Number Variation" & 121 | data_type == "Copy Number Segment" 122 | ) |> 123 | results(size = 6) 124 | 125 | filenameToBarcode(cnv$file_name) 126 | 127 | ### Query slides data and get file names 128 | 129 | slides <- files() |> 130 | filter( 131 | ~ cases.project.project_id == "TCGA-BRCA" & 132 | 
cases.samples.sample_type == "Primary Tumor" & 133 | data_type == "Slide Image" & 134 | experimental_strategy == "Diagnostic Slide" 135 | ) |> 136 | results(size = 3) 137 | 138 | filenameToBarcode(slides$file_name, slides = TRUE) 139 | 140 | ## Get the version history of a BAM file in TCGA-KIRC 141 | UUIDhistory("0001801b-54b0-4551-8d7a-d66fb59429bf") 142 | 143 | } 144 | \author{ 145 | Sean Davis, M. Ramos 146 | } 147 | -------------------------------------------------------------------------------- /man/TCGAbarcode.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/TCGAbarcode.R 3 | \name{TCGAbarcode} 4 | \alias{TCGAbarcode} 5 | \title{Parse data from TCGA barcode} 6 | \usage{ 7 | TCGAbarcode( 8 | barcodes, 9 | participant = TRUE, 10 | sample = FALSE, 11 | portion = FALSE, 12 | plate = FALSE, 13 | center = FALSE, 14 | index = NULL 15 | ) 16 | } 17 | \arguments{ 18 | \item{barcodes}{A character vector of TCGA barcodes} 19 | 20 | \item{participant}{Logical (default TRUE) participant identifier chunk} 21 | 22 | \item{sample}{Logical (default FALSE) includes the numeric sample code of 23 | the barcode and the vial letter} 24 | 25 | \item{portion}{Logical (default FALSE) includes the portion and analyte 26 | codes of the barcode} 27 | 28 | \item{plate}{Logical (default FALSE) returns the plate value} 29 | 30 | \item{center}{Logical (default FALSE) returns a matrix with the plate and 31 | center codes} 32 | 33 | \item{index}{An optional numeric vector indicating barcode positions when 34 | split by the delimiter (i.e., hyphen '-'). For example, an index of 35 | \code{c(1, 2)} corresponds to 'TCGA-ZZ' in \code{TCGA-ZZ-A1A1}.} 36 | } 37 | \value{ 38 | A character vector or data matrix of TCGA barcode information 39 | } 40 | \description{ 41 | This function returns the specified snippet of information obtained from 42 | the TCGA barcode. 
43 | } 44 | \examples{ 45 | barcodes <- c("TCGA-B0-5117-11A-01D-1421-08", 46 | "TCGA-B0-5094-11A-01D-1421-08", 47 | "TCGA-E9-A295-10A-01D-A16D-09") 48 | 49 | ## Patient identifiers 50 | TCGAbarcode(barcodes) 51 | 52 | ## Sample identifiers 53 | TCGAbarcode(barcodes, sample = TRUE) 54 | 55 | } 56 | \author{ 57 | M. Ramos 58 | } 59 | -------------------------------------------------------------------------------- /man/TCGAbiospec.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/TCGAbiospec.R 3 | \name{TCGAbiospec} 4 | \alias{TCGAbiospec} 5 | \title{Extract biospecimen data from the TCGA barcode} 6 | \usage{ 7 | TCGAbiospec(barcodes) 8 | } 9 | \arguments{ 10 | \item{barcodes}{A character vector of TCGA barcodes} 11 | } 12 | \value{ 13 | A \code{dataframe} with sample type, sample code, portion, plate, 14 | and center columns. 15 | } 16 | \description{ 17 | This function uses the full TCGA barcode to return a data frame of the 18 | data pertinent to laboratory variables such as vials, portions, analytes, 19 | plates and the center. 20 | } 21 | \examples{ 22 | example("TCGAbarcode") 23 | TCGAbiospec(barcodes) 24 | 25 | } 26 | \author{ 27 | M. 
Ramos 28 | } 29 | -------------------------------------------------------------------------------- /man/TCGAprimaryTumors.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/TCGAprimaryTumors.R 3 | \name{TCGAprimaryTumors} 4 | \alias{TCGAprimaryTumors} 5 | \title{Select primary tumors from TCGA datasets} 6 | \usage{ 7 | TCGAprimaryTumors(multiassayexperiment) 8 | } 9 | \arguments{ 10 | \item{multiassayexperiment}{A 11 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}} 12 | with TCGA data as obtained from \code{\link[curatedTCGAData:curatedTCGAData]{curatedTCGAData::curatedTCGAData()}}} 13 | } 14 | \value{ 15 | A \code{MultiAssayExperiment} containing only primary tumor samples 16 | } 17 | \description{ 18 | Tumor selection is decided using the \code{sampleTypes} data. For 'LAML' datasets, 19 | the primary tumor code used is "03" otherwise, "01" is used. 20 | } 21 | \examples{ 22 | 23 | example(getSubtypeMap) 24 | 25 | TCGAprimaryTumors(gbm) 26 | 27 | } 28 | -------------------------------------------------------------------------------- /man/TCGAsampleSelect.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/TCGAsampleSelect.R 3 | \name{TCGAsampleSelect} 4 | \alias{TCGAsampleSelect} 5 | \title{Select samples from barcodes from lookup table} 6 | \usage{ 7 | TCGAsampleSelect(barcodes, sampleCodes) 8 | } 9 | \arguments{ 10 | \item{barcodes}{Either a TCGA barcode vector or 11 | \link[IRanges:AtomicList-class]{CharacterList} containing patient 12 | identifiers, sample, portion, plate, and center codes.} 13 | 14 | \item{sampleCodes}{Either a character or numeric vector of TCGA sample codes. 
15 | See the \code{sampleTypes} dataset.} 16 | } 17 | \value{ 18 | A logical vector or \link[IRanges:AtomicList-class]{LogicalList} of the 19 | same length as 'barcodes' indicating sample type matches 20 | } 21 | \description{ 22 | The TCGA barcode contains several pieces of information which can 23 | be parsed by the \link{TCGAbarcode} function. To select a specific type of 24 | sample, enter the appropriate \code{sampleCodes} argument from the lookup table. 25 | See lookup table in \code{data("sampleTypes")}. Barcode inputs can be a 26 | character vector or a \link[IRanges:AtomicList-class]{CharacterList} object. 27 | } 28 | \examples{ 29 | 30 | example("TCGAbarcode") 31 | TCGAsampleSelect(barcodes, c(11, 01)) 32 | 33 | } 34 | -------------------------------------------------------------------------------- /man/TCGAutils-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/TCGAutils-pkg.R 3 | \docType{package} 4 | \name{TCGAutils-package} 5 | \alias{TCGAutils} 6 | \alias{TCGAutils-package} 7 | \title{TCGAutils: Helper functions for working with TCGA and MultiAssayExperiment 8 | data} 9 | \description{ 10 | TCGAutils is a toolbox to work with TCGA specific datasets. It allows the 11 | user to manipulate and translate TCGA barcodes, conveniently convert a list 12 | of data files to \link[GenomicRanges:GRangesList-class]{GRangesList}, and take 13 | datasets from GISTIC and return a 14 | \link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment} 15 | class object. The package also provides functions for working with data from 16 | the \code{curatedTCGAData} 17 | experiment data package. It provides convenience functions for extracting 18 | subtype metadata and adding clinical data to existing 19 | \link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment} 20 | objects. 
21 | } 22 | \seealso{ 23 | Useful links: 24 | \itemize{ 25 | \item Report bugs at \url{https://github.com/waldronlab/TCGAutils/issues} 26 | } 27 | 28 | } 29 | \author{ 30 | \strong{Maintainer}: Marcel Ramos \email{marcel.ramos@sph.cuny.edu} (\href{https://orcid.org/0000-0002-3242-0582}{ORCID}) 31 | 32 | Authors: 33 | \itemize{ 34 | \item Lucas Schiffer 35 | \item Levi Waldron 36 | } 37 | 38 | Other contributors: 39 | \itemize{ 40 | \item Sean Davis [contributor] 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /man/builds.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/builds.R 3 | \name{builds} 4 | \alias{builds} 5 | \alias{translateBuild} 6 | \alias{correctBuild} 7 | \alias{isCorrect} 8 | \alias{extractBuild} 9 | \alias{uniformBuilds} 10 | \title{Utilities for working with \emph{HUMAN} genome builds} 11 | \usage{ 12 | translateBuild(from, to = c("UCSC", "NCBI")) 13 | 14 | correctBuild(build, style = c("UCSC", "NCBI")) 15 | 16 | isCorrect(build, style = c("UCSC", "NCBI")) 17 | 18 | extractBuild(string, build = c("UCSC", "NCBI")) 19 | 20 | uniformBuilds(builds, cutoff = 0.2, na = c("", "NA")) 21 | } 22 | \arguments{ 23 | \item{from}{character() A vector of build versions typically from \code{genome()} 24 | (e.g., "37"). 
The build vector must be homogeneous (i.e., 25 | \code{length(unique(x)) == 1L}).} 26 | 27 | \item{to}{character(1) The name of the desired build version (either "UCSC" 28 | or "NCBI"; default: "UCSC")} 29 | 30 | \item{build}{A vector of build version names (default UCSC, NCBI)} 31 | 32 | \item{style}{character(1) The annotation style, either 'UCSC' or 'NCBI'} 33 | 34 | \item{string}{A single character string} 35 | 36 | \item{builds}{A character vector of builds} 37 | 38 | \item{cutoff}{numeric(1L) An inclusive threshold tolerance value for missing 39 | values and translating builds that are below the threshold} 40 | 41 | \item{na}{character() The values to be considered as missing (default: 42 | c("", "NA"))} 43 | } 44 | \value{ 45 | \if{html}{\out{
}}\preformatted{translateBuild: A character vector of translated genome builds 46 | 47 | extractBuild: A character string of the build information available 48 | 49 | uniformBuilds: A character vector of builds where all builds are 50 | identical `identical(length(unique(build)), 1L)` 51 | 52 | correctBuild: A character string of the 'corrected' build name 53 | 54 | isCorrect: A logical indicating if the build is exactly as annotated 55 | }\if{html}{\out{
}} 56 | } 57 | \description{ 58 | A few functions are available to search for build versions, 59 | either from NCBI or UCSC. 60 | 61 | \itemize{ 62 | \item \code{translateBuild}: translates between UCSC and NCBI build 63 | versions 64 | \item \code{extractBuild}: use grep patterns to find the first build 65 | within the string input 66 | \item \code{uniformBuilds}: replace build occurrences below a threshold 67 | level of occurrence with the alternative build 68 | \item \code{correctBuild}: Ensure that the build annotation is correct 69 | based on the NCBI/UCSC website. If not, use \code{translateBuild} with 70 | the indicated 'style' input 71 | \item \code{isCorrect}: Check to see if the build is exactly as annotated 72 | } 73 | } 74 | \details{ 75 | The \code{correctBuild} function takes the input and ensures that 76 | the style specified matches the input. Otherwise, it will 77 | return the correct style for use with \code{seqlevelsStyle}. 78 | Currently, the function does not support patched builds 79 | (e.g., 'GRCh38.p13'). Build names are taken from the website: 80 | \url{https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.26/} 81 | } 82 | \examples{ 83 | 84 | translateBuild("GRCh35", "UCSC") 85 | 86 | 87 | correctBuild("grch38", "NCBI") 88 | correctBuild("hg19", "NCBI") 89 | 90 | 91 | isCorrect("GRCh38", "NCBI") 92 | 93 | isCorrect("hg19", "UCSC") 94 | 95 | 96 | extractBuild( 97 | "SCENA_p_TCGAb29and30_SNP_N_GenomeWideSNP_6_G05_569110.nocnv_grch38.seg.txt" 98 | ) 99 | 100 | 101 | buildvec <- rep(c("GRCh37", "hg19"), times = c(5, 1)) 102 | uniformBuilds(buildvec) 103 | 104 | navec <- c(rep(c("GRCh37", "hg19"), times = c(5, 1)), "NA") 105 | uniformBuilds(navec) 106 | 107 | } 108 | -------------------------------------------------------------------------------- /man/clinicalNames.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | 
\docType{data} 4 | \name{clinicalNames} 5 | \alias{clinicalNames} 6 | \title{Clinical dataset names in TCGA} 7 | \format{ 8 | A \link[IRanges:AtomicList-class]{CharacterList} of names for 33 9 | cancer codes 10 | } 11 | \usage{ 12 | data("clinicalNames") 13 | } 14 | \value{ 15 | The clinical dataset column names in TCGA as provided by the 16 | \code{RTCGAToolbox} 17 | } 18 | \description{ 19 | A dataset of names for each of the TCGA cancer codes available. 20 | These names were obtained by the clinical datasets from 21 | \link[RTCGAToolbox:getFirehoseData]{getFirehoseData}. They serve to subset the 22 | current datasets provided by \code{curatedTCGAData}. 23 | } 24 | \keyword{datasets} 25 | -------------------------------------------------------------------------------- /man/curatedTCGAData-helpers.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/curatedTCGAData-helpers.R 3 | \name{curatedTCGAData-helpers} 4 | \alias{curatedTCGAData-helpers} 5 | \alias{getSubtypeMap} 6 | \alias{getClinicalNames} 7 | \alias{TCGAsplitAssays} 8 | \alias{sampleTables} 9 | \title{Helper functions for managing MultiAssayExperiment from 10 | curatedTCGAData} 11 | \usage{ 12 | getSubtypeMap(multiassayexperiment) 13 | 14 | getClinicalNames(diseaseCode) 15 | 16 | TCGAsplitAssays(multiassayexperiment, sampleCodes = NULL, exclusive = FALSE) 17 | 18 | sampleTables(multiassayexperiment, vial = FALSE) 19 | } 20 | \arguments{ 21 | \item{multiassayexperiment}{A 22 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}} 23 | object} 24 | 25 | \item{diseaseCode}{A TCGA cancer code (e.g., "BRCA")} 26 | 27 | \item{sampleCodes}{character (default NULL) A string of sample type codes 28 | (refer to \code{data(sampleTypes)}; \code{TCGAsplitAssays} section)} 29 | 30 | \item{exclusive}{logical (default FALSE) Whether to return only assays that 31 | contain 
all codes in \code{sampleCodes}} 32 | 33 | \item{vial}{(logical default FALSE) whether to display vials in the 34 | table output} 35 | } 36 | \value{ 37 | \itemize{ 38 | \item{getSubtypeMap}: A \code{data.frame} with explanatory names 39 | and their in-data variable names. They may not be present for all 40 | cancer types. 41 | \item{getClinicalNames}: A \code{vector} of common variable names that 42 | may be found across several cancer disease codes. 43 | } 44 | } 45 | \description{ 46 | Additional helper functions for cleaning and uncovering metadata 47 | within a downloaded \code{MultiAssayExperiment} from \code{curatedTCGAData}. 48 | } 49 | \details{ 50 | Note that for \code{getSubtypeMap}, the column of in-data variable names 51 | may need to go through \code{make.names} to be found in the \code{colData} of the 52 | \code{MultiAssayExperiment}. 53 | } 54 | \section{getSubtypeMap}{ 55 | provides a two column \code{data.frame} with 56 | interpreted names and in-data variable names. 'Name' usually refers to the 57 | \code{colData} row names a.k.a. the \code{patientID}. 58 | } 59 | 60 | \section{getClinicalNames}{ 61 | provides a vector of common variable names that 62 | exist in the \code{colData} \code{DataFrame} of a \code{curatedTCGAData} 63 | \code{MultiAssayExperiment} object. These variables are directly obtained 64 | from the BroadFirehose clinical data (downloaded with 65 | \link[RTCGAToolbox]{getFirehoseData}) and tend to be present across cancer 66 | disease codes. 67 | } 68 | 69 | \section{TCGAsplitAssays}{ 70 | 71 | Separates samples by indicated sample codes into different assays 72 | in a \code{MultiAssayExperiment}. Refer to the \code{sampleTypes} 73 | data object for a list of available codes. This operation generates 74 | \strong{n} times the number of assays based on the number of sample codes 75 | entered. By default, all assays will be split by samples present in 76 | the data. 
77 | } 78 | 79 | \section{sampleTables}{ 80 | 81 | Display all the available samples in each of the assays 82 | } 83 | 84 | \examples{ 85 | 86 | library(curatedTCGAData) 87 | 88 | gbm <- curatedTCGAData("GBM", c("RPPA*", "CNA*"), version = "2.0.1", FALSE) 89 | 90 | getSubtypeMap(gbm) 91 | 92 | sampleTables(gbm) 93 | 94 | TCGAsplitAssays(gbm, c("01", "10")) 95 | 96 | getClinicalNames("COAD") 97 | 98 | } 99 | -------------------------------------------------------------------------------- /man/diseaseCodes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{diseaseCodes} 5 | \alias{diseaseCodes} 6 | \title{TCGA Cancer Disease Codes Table} 7 | \format{ 8 | A data frame with 37 rows and 4 variables: 9 | \itemize{ 10 | \item Study.Abbreviation: Disease Code used in TCGA 11 | \item Available: Cancer datasets available via curatedTCGAData 12 | \item SubtypeData: Subtype curation data available via curatedTCGAData 13 | \item Study.Name: The full length study name (i.e., type of cancer) 14 | } 15 | } 16 | \source{ 17 | \url{https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations} 18 | } 19 | \usage{ 20 | data("diseaseCodes") 21 | } 22 | \value{ 23 | The TCGA \code{diseaseCodes} table 24 | } 25 | \description{ 26 | A dataset for obtaining the cancer codes in TCGA for about 13 different 27 | types of cancers. 
28 | } 29 | \keyword{datasets} 30 | -------------------------------------------------------------------------------- /man/findGRangesCols.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/findGRangesCols.R 3 | \name{findGRangesCols} 4 | \alias{findGRangesCols} 5 | \title{Obtain minimum necessary names for the creation of a GRangesList object} 6 | \usage{ 7 | findGRangesCols( 8 | df_colnames, 9 | seqnames.field = c("seqnames", "seqname", "chromosome", "chrom", "chr", 10 | "chromosome_name", "seqid", "om"), 11 | start.field = "start", 12 | end.field = c("end", "stop"), 13 | strand.field = "strand", 14 | ignore.strand = FALSE 15 | ) 16 | } 17 | \arguments{ 18 | \item{df_colnames}{A \code{character} vector of names in a dataset} 19 | 20 | \item{seqnames.field}{A \code{character} vector of the chromosome name} 21 | 22 | \item{start.field}{A \code{character} vector that indicates the column name 23 | of the start positions of ranged data} 24 | 25 | \item{end.field}{A \code{character} vector that indicates the end position 26 | of ranged data} 27 | 28 | \item{strand.field}{A \code{character} vector of the column name that 29 | indicates the strand type} 30 | 31 | \item{ignore.strand}{logical (default FALSE) whether to ignore the strand 32 | field in the data} 33 | } 34 | \value{ 35 | Index positions vector indicating columns with appropriate names 36 | } 37 | \description{ 38 | This function attempts to match chromosome, start position, end position and 39 | strand names in the given character vector. Modified helper from the 40 | \code{GenomicRanges} package. 
41 | } 42 | \examples{ 43 | myDataColNames <- c("Start_position", "End_position", "strand", 44 | "chromosome", "num_probes", "segment_mean") 45 | findGRangesCols(myDataColNames) 46 | 47 | } 48 | -------------------------------------------------------------------------------- /man/generateMap.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/generateMap.R 3 | \name{generateMap} 4 | \alias{generateMap} 5 | \title{Create a sampleMap from an experiment list and phenoData dataframe} 6 | \usage{ 7 | generateMap( 8 | experiments, 9 | colData, 10 | idConverter = identity, 11 | sampleCol, 12 | patientCol, 13 | ... 14 | ) 15 | } 16 | \arguments{ 17 | \item{experiments}{A named \code{list} of experiments compatible with the 18 | \code{MultiAssayExperiment} API} 19 | 20 | \item{colData}{A \code{data.frame} of clinical data with patient identifiers 21 | as rownames} 22 | 23 | \item{idConverter}{A function to be used against the sample or specimen 24 | identifiers to match those in the rownames of the \code{colData} 25 | (default \code{identity})} 26 | 27 | \item{sampleCol}{A single string indicating the sample identifiers 28 | column in the colData dataset} 29 | 30 | \item{patientCol}{A single string indicating the patient identifiers 31 | in colData, "row.names" extracts the colData row names} 32 | 33 | \item{...}{Additional arguments to pass to the 'idConverter' function.} 34 | } 35 | \value{ 36 | A \code{DataFrame} class object of mapped samples and patient 37 | identifiers including assays 38 | } 39 | \description{ 40 | This function helps create a sampleMap in preparation of a 41 | \code{MultiAssayExperiment} object. This is especially useful when the 42 | sample identifiers are not very different, as in the case of TCGA barcodes. 43 | An \code{idConverter} function can be provided to truncate such sample 44 | identifiers and obtain patient identifiers. 
45 | } 46 | \examples{ 47 | ## Minimal example 48 | expList <- list(assay1 = matrix(1:6, ncol = 2L, 49 | dimnames = list(paste0("feature", 1:3), c("A-J", "B-J"))), 50 | assay2 = matrix(1:4, ncol = 2, 51 | dimnames = list(paste0("gene", 1:2), c("A-L", "B-L")))) 52 | 53 | ## Mock colData 54 | myPheno <- data.frame(var1 = c("Yes", "No"), var2 = c("High", "Low"), 55 | row.names = c("a", "b")) 56 | 57 | ## A look at the identifiers 58 | vapply(expList, colnames, character(2L)) 59 | rownames(myPheno) 60 | 61 | ## Use 'idConverter' to correspond sample names to patient identifiers 62 | generateMap(expList, myPheno, 63 | idConverter = function(x) substr(tolower(x), 1L, 1L)) 64 | 65 | } 66 | \author{ 67 | M. Ramos, M. Morgan, L. Schiffer 68 | } 69 | -------------------------------------------------------------------------------- /man/getFileName.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/getFileName.R 3 | \name{getFileName} 4 | \alias{getFileName} 5 | \title{Find the file names used in RTCGAToolbox} 6 | \usage{ 7 | getFileName( 8 | disease, 9 | runDate = "20160128", 10 | dataType = c("CNASNP", "CNVSNP", "CNAseq", "CNACGH", "Mutation") 11 | ) 12 | } 13 | \arguments{ 14 | \item{disease}{The TCGA cancer disease code, e.g., "COAD"} 15 | 16 | \item{runDate}{The single \code{string} used in the \code{getFirehoseData} 17 | function (default "20160128")} 18 | 19 | \item{dataType}{A single character vector (default "CNASNP") indicating the 20 | data type for which to get the source file name} 21 | } 22 | \value{ 23 | A single \code{character} file name 24 | } 25 | \description{ 26 | Part of this function is from the RTCGAToolbox. It aims to extract the file 27 | name used inside of the \link[RTCGAToolbox]{getFirehoseData} function. 28 | The arguments of the function parallel those in the 29 | \link[RTCGAToolbox]{getFirehoseData} function. 
It is only available for 30 | select data types. 31 | } 32 | \examples{ 33 | 34 | getFileName("COAD", dataType = "CNASNP") 35 | 36 | } 37 | -------------------------------------------------------------------------------- /man/hidden-helpers.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/simplifyTCGA.R 3 | \name{hidden-helpers} 4 | \alias{hidden-helpers} 5 | \alias{.makeListRanges} 6 | \alias{.getRangesOfSYMBOLS} 7 | \title{A small document for helper functions} 8 | \usage{ 9 | .makeListRanges(x, gn) 10 | 11 | .getRangesOfSYMBOLS(x) 12 | } 13 | \arguments{ 14 | \item{x}{A character vector} 15 | 16 | \item{gn}{A GRanges object with some of its names found in x} 17 | } 18 | \value{ 19 | A list of length 2: unmapped (character vector) and mapped (GRanges) 20 | 21 | list of length 2: "unmapped" is a character vector providing 22 | unmapped symbols, "mapped" is a GRanges object with ranges of mapped symbols 23 | } 24 | \description{ 25 | A small document for helper functions 26 | } 27 | \keyword{internal} 28 | -------------------------------------------------------------------------------- /man/imputeAssay.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/imputeAssay.R 3 | \name{imputeAssay} 4 | \alias{imputeAssay} 5 | \title{This function imputes assays values inside a 6 | \code{MultiAssayExperiment}} 7 | \usage{ 8 | imputeAssay(multiassayexperiment, i = 1, ...) 
9 | } 10 | \arguments{ 11 | \item{multiassayexperiment}{A \code{MultiAssayExperiment} with genes in the 12 | rows, samples in the columns} 13 | 14 | \item{i}{A numeric, logical, or character \code{vector} indicating the 15 | assays to perform imputation on (default 1L)} 16 | 17 | \item{...}{ 18 | Arguments passed on to \code{\link[impute:impute.knn]{impute::impute.knn}} 19 | \describe{ 20 | \item{\code{data}}{An expression matrix with genes in the rows, samples in the columns} 21 | \item{\code{k}}{Number of neighbors to be used in the 22 | imputation (default=10)} 23 | \item{\code{rowmax}}{The maximum percent missing data allowed in any row 24 | (default 50\%). For any rows with more than \code{rowmax}\% missing 25 | are imputed using the overall mean per sample.} 26 | \item{\code{colmax}}{The maximum percent missing data allowed in any column 27 | (default 80\%). If any column has more than \code{colmax}\% missing data, 28 | the program halts and reports an error.} 29 | \item{\code{maxp}}{The largest block of genes imputed using the knn 30 | algorithm inside \code{impute.knn} (default 31 | 1500); larger blocks are divided by two-means clustering 32 | (recursively) prior to imputation. If \code{maxp=p}, only knn 33 | imputation is done.} 34 | \item{\code{rng.seed}}{The seed used for the random number generator (default 35 | 362436069) for reproducibility.} 36 | }} 37 | } 38 | \value{ 39 | A \code{MultiAssayExperiment} with imputed assay values 40 | } 41 | \description{ 42 | This function allows the user to enter a 43 | \code{MultiAssayExperiment} and impute all the NA values inside assays. 
44 | } 45 | \examples{ 46 | 47 | example(getSubtypeMap) 48 | 49 | ## convert data to matrix and add as experiment 50 | gbm <- 51 | c(gbm, RPPA_matrix = data.matrix(assay(gbm[["GBM_RPPAArray-20160128"]]))) 52 | 53 | imputeAssay(gbm, i = "RPPA_matrix") 54 | 55 | } 56 | -------------------------------------------------------------------------------- /man/makeGRangesListFromCopyNumber.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/makeGRangesListFromCopyNumber.R 3 | \name{makeGRangesListFromCopyNumber} 4 | \alias{makeGRangesListFromCopyNumber} 5 | \title{Make a GRangesList from TCGA Copy Number data} 6 | \usage{ 7 | makeGRangesListFromCopyNumber( 8 | df, 9 | split.field, 10 | names.field = "Hugo_Symbol", 11 | ... 12 | ) 13 | } 14 | \arguments{ 15 | \item{df}{A \code{data.frame} or \code{DataFrame} class object. \code{list} 16 | class objects are coerced to \code{data.frame} or \code{DataFrame}.} 17 | 18 | \item{split.field}{A \code{character} vector of length one indicating 19 | the column to be used as sample identifiers} 20 | 21 | \item{names.field}{A \code{character} vector of length one indicating the 22 | column to be used as names for each of the ranges in the data} 23 | 24 | \item{...}{Additional arguments to pass on to 25 | \link[GenomicRanges:makeGRangesListFromDataFrame]{GenomicRanges::makeGRangesListFromDataFrame}} 26 | } 27 | \value{ 28 | A \link[GenomicRanges:GRangesList-class]{GRangesList} class object 29 | } 30 | \description{ 31 | \code{makeGRangesListFromCopyNumber} allows the user to convert objects of 32 | class \code{data.frame} or \link[S4Vectors:DataFrame-class]{S4Vectors::DataFrame} to a 33 | \link[GenomicRanges:GRangesList-class]{GRangesList}. It includes additional 34 | features specific to TCGA data such as, hugo symbols, probe numbers, segment 35 | means, and ucsc build (if available). 
36 | } 37 | \examples{ 38 | library(GenomicDataCommons) 39 | 40 | manif <- files() |> 41 | filter(~ cases.project.project_id == "TCGA-COAD" & 42 | data_type == "Copy Number Segment") |> 43 | manifest(size = 1) 44 | 45 | fname <- gdcdata(manif$id) 46 | 47 | barcode <- UUIDtoBarcode(names(fname), from_type = "file_id") 48 | barcode <- barcode[["associated_entities.entity_submitter_id"]] 49 | 50 | cndata <- read.delim(fname[[1L]], nrows = 10L) 51 | 52 | cngrl <- makeGRangesListFromCopyNumber(cndata, split.field = "GDC_Aliquot", 53 | keep.extra.columns = TRUE) 54 | 55 | names(cngrl) <- barcode 56 | GenomeInfoDb::genome(cngrl) <- extractBuild(fname[[1L]]) 57 | cngrl 58 | 59 | } 60 | -------------------------------------------------------------------------------- /man/makeGRangesListFromExonFiles.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/makeGRangesListFromExonFiles.R 3 | \name{makeGRangesListFromExonFiles} 4 | \alias{makeGRangesListFromExonFiles} 5 | \title{Read exon-level expression files and create a \code{GRangesList}} 6 | \usage{ 7 | makeGRangesListFromExonFiles( 8 | filepaths, 9 | sampleNames = NULL, 10 | fileNames = basename(filepaths), 11 | getBarcodes = TRUE, 12 | rangesColumn = "exon", 13 | nrows = Inf 14 | ) 15 | } 16 | \arguments{ 17 | \item{filepaths}{character() vector of file paths containing TCGA exon 18 | data usually obtained from the GDC} 19 | 20 | \item{sampleNames}{character() vector of TCGA barcodes to be used as 21 | names for the \code{GRangesList} output (default NULL)} 22 | 23 | \item{fileNames}{character() vector of file names as downloaded from 24 | the Genomic Data Commons Legacy archive (default \code{basename(filepaths)})} 25 | 26 | \item{getBarcodes}{logical(1). 
Whether to query the GDC API with the 27 | \code{filenameToBarcode} and obtain the TCGA barcodes from the file names 28 | (default TRUE); see details.} 29 | 30 | \item{rangesColumn}{character(1). The name of the column in the data 31 | containing the ranges information (default "exon"); see details.} 32 | 33 | \item{nrows}{numeric(1). The number of rows to return from each of the files 34 | read in (all rows by default; default Inf)} 35 | } 36 | \value{ 37 | A \link[GenomicRanges:GRangesList-class]{GRangesList} object 38 | } 39 | \description{ 40 | This function serves to read exon-level expression data. It works for exon 41 | quantification (raw counts and RPKM) and junction quantification 42 | (raw counts) file paths and represents such data as a 43 | \link[GenomicRanges:GRangesList-class]{GRangesList}. The data files can be 44 | downloaded via the Genomic Data Commons (GDC) Legacy Archive. 45 | } 46 | \details{ 47 | The \code{rangesColumn} name in the GDC data files is usually "exon" 48 | but can be changed with the \code{rangesColumn} argument, if different. 49 | To avoid programmatically obtaining TCGA barcodes from the GDC 50 | API, set the \code{getBarcodes} to \code{FALSE}. When \code{getBarcodes} is set to 51 | \code{FALSE}, the file names are used to name the elements of the \code{GRangesList} 52 | output. 53 | } 54 | \examples{ 55 | 56 | ## Load example file found in package 57 | pkgDir <- system.file("extdata", package = "TCGAutils", mustWork = TRUE) 58 | exonFile <- list.files(pkgDir, pattern = "cation\\\\.txt$", full.names = TRUE) 59 | 60 | filePrefix <- "unc.edu.32741f9a-9fec-441f-96b4-e504e62c5362.1755371." 61 | 62 | ## Add actual file name manually (due to Windows OS restriction) 63 | makeGRangesListFromExonFiles(exonFile, 64 | fileNames = paste0(filePrefix, basename(exonFile)), 65 | sampleNames = "TCGA-AA-3678-01A-01R-0905-07") 66 | 67 | } 68 | \author{ 69 | M. 
Ramos 70 | } 71 | -------------------------------------------------------------------------------- /man/mergeColData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/simplifyColData.R 3 | \name{mergeColData} 4 | \alias{mergeColData} 5 | \title{Take a MultiAssayExperiment and include curated variables} 6 | \usage{ 7 | mergeColData(MultiAssayExperiment, colData) 8 | } 9 | \arguments{ 10 | \item{MultiAssayExperiment}{A 11 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}} 12 | object} 13 | 14 | \item{colData}{A \code{DataFrame} or \code{data.frame} to merge with 15 | clinical data in the \code{MultiAssayExperiment} object} 16 | } 17 | \value{ 18 | A 19 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}} 20 | object 21 | } 22 | \description{ 23 | This function works on the \code{colData} of a 24 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}} 25 | object to merge curated variable columns or other clinical variables that 26 | would like to be added. It is recommended that the user run the scripts in 27 | the \code{MultiAssayExperiment.TCGA} repository that build the "enhanced" type of 28 | data but not necessary if using different clinical data. Please see the 29 | repository's README for more information. 
30 | } 31 | \examples{ 32 | 33 | library(MultiAssayExperiment) 34 | 35 | mergeColData(MultiAssayExperiment(), S4Vectors::DataFrame()) 36 | 37 | } 38 | -------------------------------------------------------------------------------- /man/oncoPrintTCGA.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/oncoPrintTCGA.R 3 | \name{oncoPrintTCGA} 4 | \alias{oncoPrintTCGA} 5 | \title{OncoPrint for TCGA Mutation Assays} 6 | \usage{ 7 | oncoPrintTCGA( 8 | multiassayexperiment, 9 | matchassay = "*_Mutation-*", 10 | variantCol = "Variant_Classification", 11 | brewerPal = "Set3", 12 | ntop = 25, 13 | incl.thresh = 0.01, 14 | rowcol = "Hugo_Symbol" 15 | ) 16 | } 17 | \arguments{ 18 | \item{multiassayexperiment}{A \code{MultiAssayExperiment}, usually from 19 | \code{curatedTCGAData}} 20 | 21 | \item{matchassay}{character(1) The name of the assay containing mutation 22 | data, this can be a pattern (e.g., "\emph{_Mutation-}", the default)} 23 | 24 | \item{variantCol}{character(1) The name of the metadata column containing 25 | the mutation categories, usually "Variant_Classification" in TCGA} 26 | 27 | \item{brewerPal}{character(1) The name of the \code{RColorBrewer::brewer.pal} 28 | palette, (default: "Set3")} 29 | 30 | \item{ntop}{integer(1) The number of the top N genes for displaying based 31 | on per-sample mutation frequency} 32 | 33 | \item{incl.thresh}{double(1) The inclusion threshold for empirical mutations, 34 | mutations less frequent than this value will not be included} 35 | 36 | \item{rowcol}{character(1) The name of the column in the metadata to annotate 37 | the rows with either "Hugo_Symbol" (default) or} 38 | } 39 | \value{ 40 | An oncoPrint plot of mutations 41 | } 42 | \description{ 43 | OncoPrint for TCGA Mutation Assays 44 | } 45 | \examples{ 46 | 47 | library(curatedTCGAData) 48 | 49 | acc <- curatedTCGAData("ACC", "Mutation", version = 
"1.1.38", FALSE) 50 | 51 | oncoPrintTCGA(acc) 52 | 53 | } 54 | -------------------------------------------------------------------------------- /man/sampleTypes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{sampleTypes} 5 | \alias{sampleTypes} 6 | \title{Barcode Sample Type Table} 7 | \format{ 8 | A data frame with 19 rows and 3 variables: 9 | \itemize{ 10 | \item Code: Two digit code number found in the barcode 11 | \item Definition: Long name for the sample type 12 | \item Short.Letter.Code: Letter code for the sample type 13 | } 14 | } 15 | \source{ 16 | \url{https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/sample-type-codes} 17 | } 18 | \usage{ 19 | data("sampleTypes") 20 | } 21 | \value{ 22 | The TCGA \code{sampleTypes} table 23 | } 24 | \description{ 25 | A dataset that contains the mappings for sample codes in the TCGA 26 | barcodes. 
27 | } 28 | \keyword{datasets} 29 | -------------------------------------------------------------------------------- /man/simplifyTCGA-defunct.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/simplifyTCGA.R 3 | \name{simplifyTCGA-defunct} 4 | \alias{simplifyTCGA-defunct} 5 | \alias{mirToRanges} 6 | \title{Defunct TCGAutils functions} 7 | \usage{ 8 | mirToRanges(obj, keep.assay = FALSE, unmapped = TRUE) 9 | } 10 | \arguments{ 11 | \item{obj}{A \code{MultiAssayExperiment} object obtained from \code{curatedTCGAData}} 12 | 13 | \item{keep.assay}{logical (default FALSE) Whether to keep the 14 | \code{SummarizedExperiment} assays that have been converted to 15 | \code{RangedSummarizedExperiment}} 16 | 17 | \item{unmapped}{logical (default TRUE) Include an assay of data that was 18 | not able to be mapped in reference database} 19 | } 20 | \description{ 21 | \code{mirToRanges} is defunct and will be removed in the next 22 | release. The \code{mirbase.db} package is currently deprecated in \code{RELEASE_3_21}. 
23 | } 24 | -------------------------------------------------------------------------------- /man/simplifyTCGA.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/simplifyTCGA.R 3 | \name{simplifyTCGA} 4 | \alias{simplifyTCGA} 5 | \alias{symbolsToRanges} 6 | \alias{CpGtoRanges} 7 | \alias{qreduceTCGA} 8 | \title{Functions to convert row annotations to ranges and RaggedExperiment 9 | to RangedSummarizedExperiment} 10 | \usage{ 11 | simplifyTCGA(obj, keep.assay = FALSE, unmapped = TRUE) 12 | 13 | symbolsToRanges(obj, keep.assay = FALSE, unmapped = TRUE) 14 | 15 | CpGtoRanges(obj, keep.assay = FALSE, unmapped = TRUE) 16 | 17 | qreduceTCGA(obj, keep.assay = FALSE, suffix = "_simplified") 18 | } 19 | \arguments{ 20 | \item{obj}{A \code{MultiAssayExperiment} object obtained from \code{curatedTCGAData}} 21 | 22 | \item{keep.assay}{logical (default FALSE) Whether to keep the 23 | \code{SummarizedExperiment} assays that have been converted to 24 | \code{RangedSummarizedExperiment}} 25 | 26 | \item{unmapped}{logical (default TRUE) Include an assay of data that was 27 | not able to be mapped in reference database} 28 | 29 | \item{suffix}{character (default "_simplified") A character string to append 30 | to the newly modified assay for \code{qreduceTCGA}.} 31 | } 32 | \value{ 33 | A 34 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}} 35 | with any gene expression, miRNA, copy number, and mutations converted to 36 | \code{\link[SummarizedExperiment:RangedSummarizedExperiment-class]{RangedSummarizedExperiment}} 37 | objects 38 | } 39 | \description{ 40 | This group of functions will convert row annotations as 41 | either gene symbols or miRNA symbols to row ranges based on database 42 | resources 'TxDb' and 'org.Hs' packages.
It will also simplify the 43 | representation of 44 | \link[RaggedExperiment:RaggedExperiment-class]{RaggedExperiment} objects to 45 | \link[SummarizedExperiment:RangedSummarizedExperiment-class]{RangedSummarizedExperiment}. 46 | } 47 | \details{ 48 | The original \code{SummarizedExperiment} containing either gene symbol 49 | or miR annotations is replaced or supplemented by a 50 | \link[SummarizedExperiment:RangedSummarizedExperiment-class]{RangedSummarizedExperiment} 51 | for those that could be mapped to 52 | \link[GenomicRanges:GRanges-class]{GRanges}, and optionally another 53 | \link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment} 54 | for annotations that could not be mapped to 55 | \link[GenomicRanges:GRanges-class]{GRanges}. 56 | } 57 | \section{qreduceTCGA}{ 58 | 59 | 60 | Using \code{TxDb.Hsapiens.UCSC.hg19.knownGene} as the reference, \code{qreduceTCGA} 61 | reduces the data by applying either the \code{weightedmean} or \code{nonsilent} 62 | function (see below) to non-mutation or mutation data, respectively. 63 | Internally, it uses \code{\link[RaggedExperiment:assay-functions]{RaggedExperiment::qreduceAssay()}} to reduce the ranges 64 | to the gene-level. 65 | 66 | \code{qreduceTCGA} will update \code{genome(x)} based on the NCBI reference annotation 67 | which includes the patch number, e.g., GRCh37.p14, as provided by the 68 | \code{seqlevelsStyle} setter, \code{seqlevelsStyle(gn) <- "NCBI"}. \code{qreduceTCGA} 69 | uses the NCBI genome annotation as the default reference. 70 | 71 | \if{html}{\out{
<div class="sourceCode">}}\preformatted{nonsilent <- function(scores, ranges, qranges) 72 | any(scores != "Silent") 73 | }\if{html}{\out{</div>
}} 74 | 75 | \code{RaggedExperiment} mutation objects become a genes by patients 76 | \code{RangedSummarizedExperiment} object containing '1' if there is a non-silent 77 | mutation somewhere in the gene, and '0' otherwise as obtained from the 78 | \code{Variant_Classification} column in the data. 79 | 80 | \if{html}{\out{
<div class="sourceCode">}}\preformatted{weightedmean <- function(scores, ranges, qranges) \{ 81 | isects <- GenomicRanges::pintersect(ranges, qranges) 82 | sum(scores * BiocGenerics::width(isects)) / 83 | sum(BiocGenerics::width(isects)) 84 | \} 85 | }\if{html}{\out{</div>
}} 86 | 87 | "CNA" and "CNV" segmented copy number are reduced using a weighted mean in 88 | the rare cases of overlapping (non-disjoint) copy number regions. 89 | 90 | These functions rely on \code{TxDb.Hsapiens.UCSC.hg19.knownGene} and 91 | \code{org.Hs.eg.db} to map to the UCSC 'hg19' build (NCBI 'GRCh37'). Use the \code{liftOver} procedure 92 | for datasets that are provided against a different reference genome (usually 93 | 'hg18'). See an example in the vignette. 94 | } 95 | 96 | \examples{ 97 | 98 | library(curatedTCGAData) 99 | library(GenomeInfoDb) 100 | 101 | accmae <- 102 | curatedTCGAData(diseaseCode = "ACC", 103 | assays = c("CNASNP", "Mutation", "miRNASeqGene", "GISTICT"), 104 | version = "1.1.38", 105 | dry.run = FALSE) 106 | 107 | ## update genome annotation 108 | rex <- accmae[["ACC_Mutation-20160128"]] 109 | 110 | ## Translate build to "hg19" 111 | tgenome <- vapply(genome(rex), translateBuild, character(1L)) 112 | genome(rex) <- tgenome 113 | 114 | accmae[["ACC_Mutation-20160128"]] <- rex 115 | 116 | simplifyTCGA(accmae) 117 | 118 | } 119 | \author{ 120 | L.
Waldron 121 | } 122 | -------------------------------------------------------------------------------- /man/trimColData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/simplifyColData.R 3 | \name{trimColData} 4 | \alias{trimColData} 5 | \title{Minimize the number of variables in colData} 6 | \usage{ 7 | trimColData( 8 | multiassayexperiment, 9 | maxNAfrac = 0.2, 10 | keystring = c("portion", "analyte") 11 | ) 12 | } 13 | \arguments{ 14 | \item{multiassayexperiment}{A 15 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}} 16 | object with \code{colData}} 17 | 18 | \item{maxNAfrac}{(numeric default 0.2) A decimal between 0 and 1 to indicate 19 | the maximum fraction of NA values allowed per column} 20 | 21 | \item{keystring}{(character) A vector of keywords to match and remove 22 | variables} 23 | } 24 | \value{ 25 | A 26 | \code{\link[MultiAssayExperiment:MultiAssayExperiment-class]{MultiAssayExperiment}} 27 | object 28 | } 29 | \description{ 30 | This function removes variables that have a high proportion of missing values 31 | and variables that match the given keywords.
32 | } 33 | \examples{ 34 | 35 | example(getSubtypeMap) 36 | 37 | (gbm_trimmed <- trimColData(gbm)) 38 | 39 | head(colData(gbm_trimmed))[1:5] 40 | 41 | } 42 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | 3 | library(TCGAutils) 4 | 5 | test_check("TCGAutils") 6 | -------------------------------------------------------------------------------- /tests/testthat/test-ID-translation.R: -------------------------------------------------------------------------------- 1 | context("ID translation testing") 2 | 3 | test_that("barcodeToUUID translates correctly", { 4 | pts <- c("TCGA-06-6391", "TCGA-06-6700") 5 | case_id <- barcodeToUUID(pts) 6 | expect_true("case_id" %in% names(case_id)) 7 | expect_equal( 8 | case_id[["case_id"]], 9 | c("7a4c0a14-ac97-4c2b-a9cc-68cb561b2494", 10 | "3dddfc44-7bb1-4974-8a65-a84fd4bac484") 11 | ) 12 | samps <- c("TCGA-06-6700-01A", "TCGA-AD-6888-01A") 13 | samp_id <- barcodeToUUID(samps) 14 | expect_true("sample_ids" %in% names(samp_id)) 15 | expect_equal( 16 | samp_id[["sample_ids"]], 17 | c("8d35786c-5edb-4a84-b3e5-c401b8c73bd6", 18 | "ecf0f65b-bf3c-4d0e-899a-f209247cbe97") 19 | ) 20 | analytes <- c("TCGA-AA-A00L-10A-01X", "TCGA-AA-A00L-10A-01D", 21 | "TCGA-12-0653-10A-01D") 22 | analyte_ids <- barcodeToUUID(analytes) 23 | expect_true("analyte_ids" %in% names(analyte_ids)) 24 | expect_equal( 25 | analyte_ids[["analyte_ids"]], 26 | c("4b6a77dc-7a2a-459e-a7a0-253f950f1c8c", 27 | "1c429d23-89eb-4c35-bef3-9eff2508d9d5", 28 | "63645523-bb46-40b3-899b-c3fa5fefd121") 29 | ) 30 | portions <- c("TCGA-AA-A00L-10A-01", "TCGA-AA-A00L-01A-31", 31 | "TCGA-12-0653-10A-01") 32 | portion_ids <- barcodeToUUID(portions) 33 | expect_true("portion_ids" %in% names(portion_ids)) 34 | expect_equal( 35 | portion_ids[["portion_ids"]], 36 | c("c72ff462-a355-49fa-8275-c34ef5dd91c9", 37 | 
"7d25aecc-9068-463b-adc5-71ec2f4ba7aa", 38 | "03209a36-67a0-48df-a9f7-a0cedd0db82f") 39 | ) 40 | aliquots <- c("TCGA-12-0653-10A-01D-0333-01", 41 | "TCGA-12-0653-10A-01D-0334-04", "TCGA-AA-3556-01A-01D-1953-10") 42 | aliquot_ids <- barcodeToUUID(aliquots) 43 | expect_true("aliquot_ids" %in% names(aliquot_ids)) 44 | expect_equal(aliquot_ids[["aliquot_ids"]], 45 | c("51ddbc44-1cae-454f-bc67-5c5cc3d9e853", 46 | "2f0fe3f0-6a24-47ee-acba-df9c04d89532", 47 | "2303247f-9691-4b38-bac2-8a30d6e08cc9") 48 | ) 49 | }) 50 | 51 | 52 | test_that("UUIDtoBarcode translates correctly", { 53 | file_id <- c( 54 | "6b7d7a7f-f16d-472d-9b7b-3482c434cc99", 55 | "2ea70743-f3c6-4b01-8e20-9c8957a71229" 56 | ) 57 | `associated_entities.entity_submitter_id` <- c( 58 | "TCGA-NA-A4QY-01A-11D-A28Q-01", "TCGA-NA-A4QY-01A-11D-A28S-05" 59 | ) 60 | resframe <- UUIDtoBarcode(file_id, from_type = "file_id") 61 | expect_identical( 62 | resframe, 63 | data.frame( 64 | file_id, 65 | `associated_entities.entity_submitter_id` 66 | ) 67 | ) 68 | 69 | `portions.analytes.aliquots.aliquot_id` <- c( 70 | "f8c7d038-1182-42d0-8787-b84b5ca57eaf", 71 | "b37ea112-340e-4613-8514-d8a8bd47410f", 72 | "4a9967bf-444c-4573-a082-121a30be7f3b" 73 | ) 74 | `portions.analytes.aliquots.submitter_id` <- c( 75 | "TCGA-UF-A71A-06A-11D-A390-01", 76 | "TCGA-BB-4224-01A-01D-1432-01", 77 | "TCGA-CN-4735-01A-01D-1432-01" 78 | ) 79 | resframe <- UUIDtoBarcode( 80 | `portions.analytes.aliquots.aliquot_id`, from_type = "aliquot_ids" 81 | ) 82 | expect_identical( 83 | resframe, 84 | data.frame( 85 | `portions.analytes.aliquots.aliquot_id`, 86 | `portions.analytes.aliquots.submitter_id` 87 | ) 88 | ) 89 | 90 | case_id <- c( 91 | "ce2b2c41-7d28-4d8b-a037-af842a8fe20f", 92 | "58574e35-8a30-4207-b127-59fff7c87a43" 93 | ) 94 | submitter_id <- c("TCGA-NA-A4QY", "TCGA-BB-4224") 95 | resframe <- UUIDtoBarcode(case_id, from_type = "case_id") 96 | expect_identical( 97 | resframe, 98 | data.frame( 99 | case_id, 100 | submitter_id 101 | ) 102 | ) 103
| }) 104 | 105 | 106 | test_that("UUIDtoBarcode shows multiple entries per file_id", { 107 | 108 | file_ids <- c( 109 | "f9f06937-ac64-4660-baf3-0174736d25b2", 110 | "5dec335c-83c3-4a4a-80f5-9ec1d1847960", 111 | "514bc5eb-006d-423b-8432-8fbe7795a312" 112 | ) 113 | 114 | restabs <- lapply(file_ids, UUIDtoBarcode, "file_id") 115 | results <- do.call(rbind, restabs) 116 | 117 | expect_identical(results, UUIDtoBarcode(file_ids, "file_id")) 118 | 119 | file_ids[2] <- paste(rev(unlist(strsplit(file_ids[2], ""))), collapse = "") 120 | 121 | expect_warning(UUIDtoBarcode(file_ids, "file_id")) 122 | }) 123 | 124 | test_that("UUIDhistory correctly returns the appropriate identifiers", { 125 | 126 | old_uuids <- c("0001801b-54b0-4551-8d7a-d66fb59429bf", 127 | "002c67f2-ff52-4246-9d65-a3f69df6789e", 128 | "003143c8-bbbf-46b9-a96f-f58530f4bb82") 129 | 130 | updated_ids <- vapply( 131 | stats::setNames(nm = old_uuids), 132 | function(x) { 133 | hist <- UUIDhistory(x) 134 | ## keep only the history row marked "released" in data release 32.0 135 | cond <- hist[["file_change"]] == "released" & 136 | hist[["data_release"]] == "32.0" 137 | hist[cond, "uuid"] 138 | }, 139 | character(1L) 140 | ) 141 | ## character(1L) above means each old UUID must yield exactly one such row 142 | ## Updated IDs taken from the GDC Data Portal 143 | new_uuids <- c("b4bce3ff-7fdc-4849-880b-56f2b348ceac", 144 | "5ca9fa79-53bc-4e91-82cd-5715038ee23e", 145 | "b7c3e5ad-4ffc-4fc4-acbf-1dfcbd2e5382") 146 | 147 | expect_identical(updated_ids, setNames(new_uuids, old_uuids)) 148 | 149 | }) 150 | -------------------------------------------------------------------------------- /tests/testthat/test-builds.R: -------------------------------------------------------------------------------- 1 | context("Build information testing") 2 | ## translateBuild: NCBI build numbers and UCSC names are expected to map onto the columns of human_builds() 3 | test_that("translateBuild works correctly", { 4 | buildDF <- human_builds() 5 | ncbinos <- as.character(34:38) 6 | resbuilds <- vapply(ncbinos, translateBuild, character(1L)) 7 | 8 | expect_identical(unname(resbuilds), buildDF[["UCSC"]]) 9 | 10 | ucscnos <- paste0("hg", c(16:19, 38)) 11
| resbuilds <- vapply(ucscnos, translateBuild, character(1L), "NCBI") 12 | expect_identical(unname(resbuilds), buildDF[["NCBI"]]) 13 | 14 | ## 'to' defaults to UCSC; mixed-case NCBI names are still expected to translate 15 | expect_identical(translateBuild("Grch37"), "hg19") 16 | expect_identical(translateBuild("GrCh37"), "hg19") 17 | expect_identical(translateBuild("grch37"), "hg19") 18 | ## the reverse direction is likewise expected to ignore the case of the UCSC name 19 | expect_identical( 20 | translateBuild("hg19", to = "NCBI"), 21 | "GRCh37" 22 | ) 23 | expect_identical( 24 | translateBuild("HG19", to = "NCBI"), 25 | "GRCh37" 26 | ) 27 | expect_identical( 28 | translateBuild("hG19", to = "NCBI"), 29 | "GRCh37" 30 | ) 31 | expect_true( 32 | is.na(translateBuild(NA_character_)) 33 | ) 34 | expect_true( 35 | is.na(translateBuild("33")) 36 | ) 37 | }) 38 | ## correctBuild: expected to return the build name spelled in the requested style ("NCBI" or "UCSC"), whatever the input case 39 | test_that("correctBuild returns an appropriate build name", { 40 | build <- correctBuild("grch38", "NCBI") 41 | expect_identical("GRCh38", build) 42 | build <- correctBuild("hg19", "NCBI") 43 | expect_identical(translateBuild("hg19", "NCBI"), build) 44 | build <- correctBuild("HG19", "NCBI") 45 | expect_identical(translateBuild("hg19", "NCBI"), build) 46 | build <- correctBuild("HG19", "UCSC") 47 | expect_identical("hg19", build) 48 | }) 49 | ## uniformBuilds: expected to return a single build name throughout, or to error when too many entries are missing 50 | test_that("uniformBuilds is returning the appropriate output", { 51 | build <- rep(c("GRCh37", "hg19"), times = c(5, 1)) 52 | rebuild <- uniformBuilds(build) 53 | expect_identical(1L, length(unique(rebuild))) 54 | 55 | ## the string "NA" is expected to be imputed to the majority build 56 | build <- c(rep(c("GRCh37", "hg19"), times = c(5, 1)), "NA") 57 | rebuild <- uniformBuilds(build) 58 | expect_identical(1L, length(unique(rebuild))) 59 | ## here 1 of 4 entries is "NA", above cutoff = 0.2 60 | build <- c(rep(c("GRCh37", "hg19"), times = c(2, 1)), "NA") 61 | expect_error(uniformBuilds(build, cutoff = 0.2)) 62 | 63 | # 2 of 10 entries are "NA", i.e. exactly 0.2 -- NOTE(review): the expected error suggests the cutoff is inclusive; confirm in uniformBuilds 64 | build <- c(rep(c("GRCh37", "hg19"), times = c(7, 1)), "NA", "NA") 65 | expect_error(uniformBuilds(build, cutoff = 0.2)) 66 | 67 | # a single NA_character_ (1 of 10) is converted to the main build annotation 68 | build <- c(rep(c("GRCh37", "hg19"), times = c(7, 2)), NA_character_) 69 | 
rebuild <- uniformBuilds(build, cutoff = 0.2) 70 | expect_identical(1L , length(unique(rebuild))) 71 | 72 | # same build number in two notations ("GRCh37" vs "37"): expect the more frequent spelling everywhere 73 | build <- rep(c("GRCh37", "37"), times = c(7, 2)) 74 | rebuild <- uniformBuilds(build, cutoff = 0.2) 75 | expect_identical(rebuild, rep("GRCh37", length(rebuild))) 76 | # one extra NA_character_ is expected to be filled in with "GRCh37" as well 77 | build <- c(rep(c("GRCh37", "37"), times = c(7, 2)), NA_character_) 78 | rebuild <- uniformBuilds(build, cutoff = 0.2) 79 | expect_identical(rebuild, rep("GRCh37", length(rebuild))) 80 | # 3 of 12 entries NA (0.25) with cutoff = 0.2: expect an error 81 | build <- c(rep(c("GRCh37", "37"), times = c(7, 2)), rep(NA_character_, 3)) 82 | expect_error(uniformBuilds(build, cutoff = 0.2)) 83 | }) 84 | 85 | -------------------------------------------------------------------------------- /tests/testthat/test-identifiers.R: -------------------------------------------------------------------------------- 1 | context("Identifier tests") 2 | ## helper: distinct counts of delimiter-separated sections across barcodes (delimiter supplied by .uniqueDelim) 3 | .sectionNums <- function(bcode) { 4 | filler <- .uniqueDelim(bcode) 5 | unique(lengths(strsplit(bcode, filler))) 6 | } 7 | ## each extra flag (sample, portion, plate, center) is expected to add one section to the default three 8 | test_that("TCGAbarcode works", { 9 | barcodes <- c("TCGA-B0-5117-11A-01D-1421-08", 10 | "TCGA-B0-5094-11A-01D-1421-08", 11 | "TCGA-E9-A295-10A-01D-A16D-09") 12 | expect_identical(.sectionNums(TCGAbarcode(barcodes)), 3L) 13 | 14 | expect_identical(.sectionNums(TCGAbarcode(barcodes, sample = TRUE)), 4L) 15 | 16 | expect_identical( 17 | .sectionNums( 18 | TCGAbarcode(barcodes, sample = TRUE, portion = TRUE)), 5L) 19 | 20 | expect_identical( 21 | .sectionNums( 22 | TCGAbarcode(barcodes, sample = TRUE, portion = TRUE, plate = TRUE)), 23 | 6L) 24 | expect_identical( 25 | .sectionNums( 26 | TCGAbarcode(barcodes, sample = TRUE, portion = TRUE, 27 | plate = TRUE, center = TRUE)), 28 | 7L) 29 | }) 30 | ## TCGAbiospec: expected to reject the default 3-section barcodes and otherwise return one row per barcode 31 | test_that("TCGAbiospec works", { 32 | barcodes <- c("TCGA-B0-5117-11A-01D-1421-08", 33 | "TCGA-B0-5094-11A-01D-1421-08", 34 | "TCGA-E9-A295-10A-01D-A16D-09") 35 | bc0 <- TCGAbarcode(barcodes) 36 | expect_error(TCGAbiospec(bc0)) 37 | bc1 <- 
TCGAbarcode(barcodes, sample = TRUE) 38 | expect_identical(dim(TCGAbiospec(bc1)), c(length(bc1), .sectionNums(bc1))) 39 | bc2 <- TCGAbarcode(barcodes, sample = TRUE, portion = TRUE) 40 | expect_identical(dim(TCGAbiospec(bc2)), c(length(bc2), 41 | .sectionNums(bc2)+1L)) 42 | bc3 <- TCGAbarcode(barcodes, sample = TRUE, portion = TRUE, plate = TRUE) 43 | expect_identical(dim(TCGAbiospec(bc3)), c(length(bc3), 44 | .sectionNums(bc3)+1L)) 45 | bc4 <- TCGAbarcode(barcodes, sample = TRUE, portion = TRUE, 46 | plate = TRUE, center = TRUE) 47 | expect_identical(dim(TCGAbiospec(bc4)), c(length(bc4), 48 | .sectionNums(bc4)+1L)) 49 | expect_identical(names(TCGAbiospec(barcodes)), c("submitter_id", 50 | "sample_definition", "sample", "vial", "portion", "analyte", "plate", 51 | "center")) 52 | }) 53 | --------------------------------------------------------------------------------