├── .github └── PULL_REQUEST_TEMPLATE.md ├── .gitignore ├── ChangeLog.txt ├── LICENSE.txt ├── MANIFEST.in ├── Makefile ├── Makefile.inc ├── README.md ├── VERSION ├── __init__.py ├── docs └── GDCtools-overview.pdf ├── gdctools ├── GDCcore.py ├── GDCtool.py ├── Makefile ├── __init__.py ├── config │ ├── cptac3.cfg │ ├── google.cfg │ └── tcga.cfg ├── default.cfg ├── gdc_dice.py ├── gdc_list.py ├── gdc_loadfile.py ├── gdc_mirror.py ├── gdc_report.py ├── lib │ ├── GDCSampleReport.R │ ├── __init__.py │ ├── annotations_table.tsv │ ├── api.py │ ├── clinxml.py │ ├── common.py │ ├── convert │ │ ├── __init__.py │ │ ├── copy.py │ │ ├── maf.py │ │ ├── py_clinical.py │ │ ├── seg.py │ │ ├── tsv2idtsv.py │ │ └── tsv2magetab.py │ ├── heatmap.py │ └── meta.py ├── reference │ ├── centerCode.txt │ ├── diseaseStudy.txt │ ├── platformCode.txt │ └── sampleType.txt └── tool_template.py ├── generate.py ├── requirements.txt ├── setup.py ├── tests ├── Makefile ├── baselines │ ├── TCGA-ACCSKCM.Participant.loadfile.txt │ ├── TCGA.filtered_samples.txt │ ├── dice-files.txt │ ├── dice-md5sums.txt │ ├── legacy-files.txt │ ├── legacy-md5sums.txt │ ├── load-files.txt │ ├── load-md5sums-google.txt │ ├── load-md5sums.txt │ ├── mirror-files.txt │ ├── mirror-md5sums.txt │ ├── misctests.txt │ ├── onlycases-files.txt │ └── report-files.txt ├── config │ └── blacklist.tsv ├── legacy.cfg ├── misctests.py ├── onlycases.cfg ├── tcgaSmoketest.cfg ├── test_lock_context.py └── testchoose.py └── util ├── bdiff.py ├── checkError.sh ├── checkMD5.sh ├── findPython.sh └── runpy.ac /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Pull Request Template 5 | Describe your changes here. 6 | 7 | 8 | ## Style Checklist 9 | Please ensure that your pull request meets the following standards for quality. 10 | Code should not be merged into the master branch until all of these criteria have been satisfied. 11 | 12 | ### Comments 13 | - [ ] Each source file includes comments at the top describing its purpose 14 | - [ ] Each function includes a comment/docstring describing inputs and outputs, and any assumptions it makes 15 | - [ ] Variable and function names have semantic meaning, and are not reused with a different meaning within the same scope 16 | - [ ] “Magic” numbers, such index of a particular column name, have a comment describing their value, or are declared as a global constant with a semantic name (e.g. TCGA_ID_COL = 16) 17 | - [ ] Commented-out code is removed 18 | 19 | ### Style/Execution 20 | - [ ] Code contains no hard-coded paths 21 | - [ ] Code contains appropriate logging & or debugging 22 | - [ ] If possible, input data is validated early in the execution. If not, errors are sufficiently detailed to aid debugging. 23 | - [ ] Code uses a library (e.g. 
optparse, argparse) for command-line parsing 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | /.project 3 | /.pydevproject 4 | /gdc2fh.py 5 | /.settings/ 6 | /GDCversion.py 7 | /tests/sandbox/ 8 | /tests/GDCtool/ 9 | /tests/*.log 10 | /tests/gdctools_tmp/ 11 | /tests/legacy/ 12 | /tests/onlycases/ 13 | # Distribution / packaging 14 | .Python 15 | env/ 16 | build/ 17 | tests/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *,cover 50 | .hypothesis/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Sphinx documentation 57 | docs/_build/ 58 | 59 | # pyenv 60 | .python-version 61 | 62 | # dotenv 63 | .env 64 | 65 | # virtualenv 66 | .venv 67 | venv/ 68 | ENV/ 69 | 70 | -------------------------------------------------------------------------------- /ChangeLog.txt: -------------------------------------------------------------------------------- 1 | Version 0.2.13: (not released yet) 2 | . All tools now offer --show flag, to display key=value configuration state 3 | . CPTAC clinical files now processed with tsv2magetab in dicer to add sample ID 4 | . Added CPTAC3 disease study abbreviations 5 | Version 0.2.12: 6 | . gdc_report now gracefully tolerates absence of a sample filter list 7 | Version 0.2.11: 8 | . Updated CPTAC3 to re-enable WXS-based segfile converter isolated during merge 9 | Version 0.2.10: 10 | . Initial support for CPTAC3 11 | . Updated regression test baselines to reflect latest GDC data 12 | . The --help output for gdc_list now gives several usage examples 13 | Version 0.2.9: 14 | . Support new V2 seg files from GDC; update regression tests to latest 15 | GDC data release, and ensure they are downloaded outside of repo directory 16 | tree so as to not slow down builds/installs 17 | Version 0.2.8: 18 | . Install .cfg files to bin dir 19 | . Avoid mirroring pathology slides for now, as they are extremely large; issue 20 | #73 created so that this avoidance can happen via config files, instead 21 | Version 0.2.6: 22 | . More spit-n-polish on the regression tests 23 | . Better segregation of FFPE samples by gdc_loadfile when creating sample sets 24 | Version 0.2.5: 25 | . gdc_mirror no longer exposes --date CLI arg; it was never relevant because 26 | the mirror creates datestamps 27 | . Rest of tools more intelligently infer the correct datestamp to use 28 | . gdc_loadfile now exposes --format arg, to choose between FireCloud/Firehose 29 | and generates case (participant/donor/individual) loadfiles in firecloud mode 30 | Version 0.2.4: 31 | . Corrected file paths in files generated by gdc_loadfile 32 | Version 0.2.3: 33 | . Misc tweaks arising from OPS testing 34 | Version 0.2.2: 35 | . Major step forward in rationalizing interplay of CLI args and config files 36 | . Including definition of scopes, specificity & precedence in GDCtool.py 37 | . 
Tweaked some of the tests to demonstrate & test this interplay 38 | . Updated a few more baseline test datafiles that had lingering V6 content 39 | Version 0.2.1: 40 | . Changed paginated queries to begin at page=0 instead of page=1; 41 | . apparently in v1.9.0 the GDC API changed from 1-based to 0-based indexing? 42 | Version 0.2.0: 43 | . Re-establish usability with BOTH Python 2 and 3 44 | . Motivated by contributions from https://github.com/aswanipranjal 45 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | # Copyright (c) 2016-2017, Broad Institute, Inc. {{{ 3 | # All rights reserved. 4 | # 5 | # This file is part of gdctools: Python and UNIX CLI utilities to simplify 6 | # interaction with the NIH/NCI Genomics Data Commons. 7 | # 8 | # The gdctools package is distributed under the following BSD-style license: 9 | # 10 | # Copyright (c) 2016-2017 Broad Institute, Inc. All rights reserved. 11 | # Corresponding Author: Michael S. Noble (mnoble@broadinstitute.org) 12 | # Contributing Authors: Timothy DeFreitas (timdef@broadinstitute.org) 13 | # David Heiman (dheiman@broadinstitute.org) 14 | # 15 | # Redistribution and use in source and binary forms, with or without 16 | # modification, are permitted provided that the following conditions are met: 17 | # 18 | # 1. Redistributions of source code must retain the above copyright notice, this 19 | # list of conditions and the following disclaimer. 20 | # 2. Redistributions in binary form must reproduce the above copyright notice, 21 | # this list of conditions and the following disclaimer in the documentation 22 | # and/or other materials provided with the distribution. 23 | # 3. Neither the name of the Broad Institute, Inc. nor the names of its 24 | # contributors may be used to endorse or promote products derived from this 25 | # software without specific prior written permission. 26 | # 27 | # THIS SOFTWARE IS PROVIDED "AS IS." BROAD MAKES NO EXPRESS OR IMPLIED 28 | # REPRESENTATIONS OR WARRANTIES OF ANY KIND REGARDING THE SOFTWARE AND 29 | # COPYRIGHT, INCLUDING, BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, 30 | # FITNESS FOR A PARTICULAR PURPOSE, CONFORMITY WITH ANY DOCUMENTATION, 31 | # NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT 32 | # DISCOVERABLE. IN NO EVENT SHALL BROAD, THE COPYRIGHT HOLDERS, OR CONTRIBUTORS 33 | # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 34 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO PROCUREMENT OF 35 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 36 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 37 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 38 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF, 39 | # HAVE REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF SUCH DAMAGE. 40 | # 41 | # If, by operation of law or otherwise, any of the aforementioned warranty 42 | # disclaimers are determined inapplicable, your sole remedy, regardless of the 43 | # form of action, including, but not limited to, negligence and strict 44 | # liability, shall be replacement of the software with an updated version if 45 | # one exists. 
46 | # 47 | # In addition, gdctools is distributed, in part, under and subject to the 48 | # provisions of licenses for: 49 | # 50 | # Python requests library 51 | # (http://docs.python-requests.org/en/latest/user/intro), 52 | # Copyright (c) 2015 Kenneth Reitz (all rights reserved); and 53 | # 54 | # Python 2.7.9 / Python 3 55 | # (https://docs.python.org/3/license.html), 56 | # Copyright (c) 2001-2015 Python Software Foundation (all rights reserved) 57 | # 58 | # Fasteners locking library 59 | # https://github.com/harlowja/fasteners 60 | # Copyright (c) 2015-2016 Joshua Harlow (all rights reserved) 61 | # 62 | # Development of gdctools has been funded in whole or in part with federal funds 63 | # from the National Institutes of Health, Department of Health and Human 64 | # Services. 65 | # 66 | # }}} 67 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include VERSION 3 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | include Makefile.inc 3 | 4 | help: 5 | @echo 6 | @echo "Build, test and install GDCtools. Requires GNUmake 3.81 or later" 7 | @echo 8 | @echo "Targets:" 9 | @echo 10 | @echo "1. test | test3 Exercise tests for this package" 11 | @echo "2. install Install locally, using pip" 12 | @echo "3. uninstall Remove local install, using pip" 13 | @echo "4. publish Submit to PyPI" 14 | @echo 15 | 16 | install: 17 | $(PIP) install --upgrade . 18 | 19 | reinstall: 20 | $(MAKE) uninstall 21 | $(MAKE) install 22 | 23 | uninstall: 24 | $(PIP) uninstall -y gdctools 25 | 26 | publish: 27 | $(PYTHON) setup.py sdist upload && \ 28 | rm -rf build dist *.egg-info 29 | 30 | clean: 31 | rm -rf build dist *.egg-info *~ 32 | 33 | rclean: clean 34 | (cd tests && $(MAKE) rclean) 35 | (cd gdctools && $(MAKE) rclean) 36 | 37 | test: 38 | cd tests && $(MAKE) test 39 | 40 | test3: 41 | cd tests && $(MAKE) -e PYTHON_VER=3 test 42 | 43 | .PHONY: help test install release publish clean 44 | -------------------------------------------------------------------------------- /Makefile.inc: -------------------------------------------------------------------------------- 1 | 2 | # Makefile.inc: common definitions for use throughout the set of Makefiles in 3 | # the GDCtools build system. GNU make 3.81 or later is required by GDCtools. 4 | 5 | SHELL=/bin/bash 6 | __FILE__=$(lastword $(MAKEFILE_LIST)) 7 | __PATH__=$(abspath $(dir $(__FILE__))) 8 | ROOT=$(__PATH__) 9 | EMAKE=$(MAKE) -e 10 | TIMESTAMP = $(shell date +"%F %T") 11 | REPO_HASH=$(shell $(GIT) log -n 1 --pretty=%H | cut -c 1-24) 12 | VERSION = $(shell cat $(ROOT)/VERSION) 13 | LONGVERSION=$(VERSION) ($(TIMESTAMP) $(REPO_HASH)) 14 | 15 | # Root dir for test data is NOT within dev tree, so as to save time & disk space 16 | # during pip install (because pip copies entire dev tree during packaging) 17 | TEST_CONFIG_FILE=$(ROOT)/tests/tcgaSmoketest.cfg 18 | TEST_ROOT=$(abspath $(ROOT)/tests/$(shell grep ROOT_DIR: $(TEST_CONFIG_FILE) | awk '{print $$NF}')) 19 | 20 | # Simple way to use Python3, e.g. 
for install, tests etc: 21 | # make -e PYTHON_VER=3 install test 22 | # This will automatically adjust pip to Python3, too 23 | 24 | PYTHON_VER= 25 | PYTHON_EXE=python$(PYTHON_VER) 26 | PYTHON_HOME=$(shell $(ROOT)/util/findPython.sh $(PYTHON_EXE)) 27 | ifeq ($(PYTHON_HOME),) 28 | $(error Unable to continue, no $(PYTHON_EXE) found) 29 | endif 30 | MD5=$(ROOT)/util/checkMD5.sh 31 | DEST=$(PYTHON_HOME) 32 | BIN_DIR=$(DEST)/bin # Python virtual environment here 33 | PYTHON=$(DEST)/bin/$(PYTHON_EXE) 34 | PIP=$(DEST)/bin/pip$(PYTHON_VER) 35 | 36 | ABORT_ON_ERROR=$(ROOT)/util/checkError.sh 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # gdctools 2 | Python and UNIX CLI utilities to simplify search and retrieval of open-access data from the [NIH/NCI Genomics Data Commons](https://gdc.cancer.gov/), and automate tasks that are common to most data-driven science projects. For more information and examples see the [pictorial overview](https://docs.google.com/viewer?url=https://github.com/broadinstitute/gdctools/files/825892/GDCtools-overview.pdf), [Wiki pages](https://github.com/broadinstitute/gdctools/wiki) or [tests/Makefile](tests/Makefile). To get started from a Unix command line, simply `pip install gdctools` or clone the repo and install: 3 | ``` 4 | % git clone https://github.com/broadinstitute/gdctools 5 | % cd gdctools 6 | % make install 7 | ``` 8 | This should take only a minute or two, and may install [requests](http://docs.python-requests.org/en/master/), [fasteners](https://github.com/harlowja/fasteners) or [matplotlib](http://matplotlib.org/) dependencies. *Note that if you are installing to a protected location you may need to preface the `make install` command with `sudo`.* After this you should be able to easily [mirror](https://github.com/broadinstitute/gdctools/wiki/GDC-Mirror) either [harmonized](https://gdc.cancer.gov/about-data/gdc-data-harmonization) or [legacy](https://gdc-portal.nci.nih.gov/legacy-archive) data directly from the command line 9 | ``` 10 | gdc_mirror --config tests/tcgaSmoketest.cfg 11 | ``` 12 | (this is what the `make test` target does), even for a single patient case 13 | ``` 14 | gdc_mirror --cases TCGA-EE-A3J8 15 | ``` 16 | or just one category of data for that patient 17 | ``` 18 | gdc_mirror --cases TCGA-EE-A3J8 --categories "Copy Number Variation" 19 | ``` 20 | or perform other operations such as seeing which NIH/NCI programs have exposed data for download 21 | ``` 22 | % gdc_list programs 23 | [ 24 | "TCGA", 25 | "TARGET" 26 | ] 27 | ``` 28 | or what programs have submitted data (that may not be exposed yet) 29 | ``` 30 | % gdc_list submitted 31 | [ 32 | "CCLE", 33 | "REBC", 34 | "TCGA", 35 | "TARGET", 36 | "CGCI", 37 | "CDDP", 38 | "ALCHEMIST", 39 | "GDC", 40 | "Exceptional_Responders", 41 | "UAT08", 42 | "TRIO", 43 | "CPTAC" 44 | ] 45 | ``` 46 | After mirroring you may run [gdc_dice](https://github.com/broadinstitute/gdctools/wiki/GDC-Dicer) on the mirror tree, followed by [gdc_loadfile](https://github.com/broadinstitute/gdctools/wiki/Create-Loadfile) to generate a sample "freeze" list which identifies the data for loading into pipeline execution systems like Firehose or FireCloud. 
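For example, a minimal dice-then-loadfile sequence using the same smoke-test configuration might look like the following sketch (the `--config` flag is common to all GDCtools utilities; see [tests/Makefile](tests/Makefile) for the canonical invocations and any additional flags):
```
gdc_dice --config tests/tcgaSmoketest.cfg
gdc_loadfile --config tests/tcgaSmoketest.cfg
```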
Finally, if you have *matplotlib* and *R* installed you may also run the [gdc_report](https://github.com/broadinstitute/gdctools/wiki/Sample-reports) tool to generate an HTML samples report ([similar to this](http://gdac.broadinstitute.org/runs/sampleReports/latest/)) that provides an annotated description of the processed data; note that this tool will attempt to automatically install [Nozzle](https://confluence.broadinstitute.org/display/GDAC/Nozzle) if it is not detected within the R installation. As noted earlier, the [tests/Makefile](tests/Makefile) provides examples of using the dice, loadfile and report tools. GDCtools has been verified to function properly with multiple Python 2 and Python 3 versions, and we are [grateful for the community contributions](https://github.com/broadinstitute/gdctools/commit/53be8ee4d720b502c2dbb1e110e7c20754331e3e) in support of this goal. 47 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 0.2.13 2 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docs/GDCtools-overview.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/gdctools/ce388330c73152a5fa6e1e005c5b50997f29e5f6/docs/GDCtools-overview.pdf -------------------------------------------------------------------------------- /gdctools/GDCcore.py: -------------------------------------------------------------------------------- 1 | 2 | # Copyright (c) 2016, Broad Institute, Inc. {{{ 3 | # All rights reserved. 4 | # 5 | # This file is part of GDCtools: Python and UNIX command line wrappers 6 | # for the Genomics Data Commons API. 7 | # 8 | # Redistribution and use in source and binary forms, with or without 9 | # modification, are permitted provided that the following conditions are met: 10 | # 11 | # * Redistributions of source code must retain the above copyright notice, 12 | # this list of conditions and the following disclaimer. 13 | # 14 | # * Redistributions in binary form must reproduce the above copyright notice, 15 | # this list of conditions and the following disclaimer in the documentation 16 | # and/or other materials provided with the distribution. 17 | # 18 | # * Neither the name Broad Institute, Inc. nor the names of its 19 | # contributors may be used to endorse or promote products derived 20 | # from this software without specific prior written permission. 21 | # 22 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 23 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 25 | # DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 26 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 28 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 29 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE 32 | # }}} 33 | 34 | from __future__ import print_function 35 | import os 36 | import re 37 | import sys 38 | import inspect 39 | import requests 40 | import builtins 41 | import logging 42 | from pkg_resources import get_distribution, DistributionNotFound 43 | 44 | __interactive__ = os.isatty(sys.stdout.fileno()) 45 | 46 | # Silence SSL warnings on older systems: should check Unix kernel 47 | requests.packages.urllib3.disable_warnings() 48 | 49 | GDC_ROOT_URI = "https://gdc-api.nci.nih.gov" 50 | try: 51 | GDCT_VERSION = get_distribution('gdctools').version 52 | except DistributionNotFound as dnf: 53 | GDCT_VERSION = 'TESTING' 54 | 55 | def eprint(*args, **kwargs): 56 | # If not interactive (e.g. writing to log), show user from whence msg came 57 | if not __interactive__: 58 | print('gdctools Error: ', file=sys.stderr, end='') 59 | print(*args, file=sys.stderr, **kwargs) 60 | 61 | def gprint(*args, **kwargs): 62 | # If not interactive (e.g. writing to log), show user from whence msg came 63 | if not __interactive__: 64 | print('gdctools: ', file=sys.stdout, end='') 65 | print(*args, file=sys.stdout, **kwargs) 66 | 67 | def gabort(errCode, *args, **kwargs): 68 | gprint(*args, **kwargs) 69 | # The purpose of this method is to abort with a short, easily comprehended 70 | # message; so, disable logging to stop it from printing exception stacktrace 71 | logging.disable(logging.CRITICAL) 72 | sys.exit(errCode) 73 | 74 | class attrdict(dict): 75 | """ dict whose members can be accessed as attributes, and default value is 76 | transparently returned for undefined keys; this yields more natural syntax 77 | dict[key]/dict.key for all use cases, instead of dict.get(key, default) 78 | """ 79 | 80 | def __init__(self, srcdict=None, default=None): 81 | if srcdict is None: 82 | srcdict = {} 83 | dict.__init__(self, srcdict) 84 | self.__dict__["__default__"] = default 85 | 86 | def __getitem__(self, item): 87 | try: 88 | return dict.__getitem__(self, item) 89 | except KeyError: 90 | return self.__dict__["__default__"] 91 | 92 | def __getattr__(self, item): 93 | return self.__getitem__(item) 94 | 95 | def __setattr__(self, item, value): 96 | if item in self.__dict__: 97 | dict.__setattr__(self, item, value) 98 | else: 99 | self.__setitem__(item, value) 100 | -------------------------------------------------------------------------------- /gdctools/GDCtool.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | # Front Matter {{{ 5 | ''' 6 | Copyright (c) 2016-2018 The Broad Institute, Inc. All rights are reserved. 7 | 8 | GDCtool.py: this file is part of gdctools. See the /COPYRIGHT 9 | file for the SOFTWARE COPYRIGHT and WARRANTY NOTICE. 10 | 11 | @author: Michael S. Noble, Timothy DeFreitas, David I. 
Heiman 12 | @date: 2018-10-04 13 | ''' 14 | 15 | # }}} 16 | 17 | import sys 18 | import os 19 | import configparser 20 | import time 21 | import logging 22 | from pkg_resources import resource_filename 23 | from gdctools.GDCcore import * 24 | from gdctools.lib import common 25 | from gdctools.lib import api 26 | from signal import signal, SIGPIPE, SIG_DFL 27 | import argparse 28 | 29 | # Stop Python from complaining when I/O pipes are closed 30 | signal(SIGPIPE, SIG_DFL) 31 | 32 | class GDCtool(object): 33 | ''' Base class for each tool in the GDCtools suite ''' 34 | def __init__(self, version="", description=None, datestamp_required=True): 35 | self.version = version + " (GDCtools: " + GDCT_VERSION + ")" 36 | self.cli = argparse.ArgumentParser( description=description, 37 | formatter_class=argparse.RawDescriptionHelpFormatter) 38 | 39 | self.datestamp_required = datestamp_required 40 | self.config_add_args() 41 | self.cli.add_argument('--version', action='version', version=self.version) 42 | self.cli.add_argument('-V', '--verbose', dest='verbose', 43 | action='count', help=\ 44 | 'Each time specified, increment verbosity level [%(default)s]') 45 | 46 | # Derived classes should add custom options & behavior in their 47 | # respective __init__/config_customize/execute implementations 48 | 49 | def execute(self): 50 | self.options = self.cli.parse_args() 51 | api.set_verbosity(self.options.verbose) 52 | 53 | if not self.config_supported(): 54 | return 55 | 56 | self.config_initialize() 57 | self.config_customize() 58 | self.config_finalize() 59 | 60 | if self.options.show: 61 | for key,val in self.config.items(): 62 | if isinstance(val, dict): 63 | key += ':' 64 | val = ''.join(['\n\t%s=%s' % (k,v) for k,v in val.items()]) 65 | else: 66 | val = '=' + str(val) 67 | print('%s%s' % (key,val)) 68 | sys.exit(0) 69 | 70 | if self.datestamp_required: 71 | datestamp = self.options.datestamp 72 | if not datestamp: 73 | datestamp = 'latest' 74 | 75 | existing_dates = self.datestamps() # ascending sort order 76 | if len(existing_dates) == 0: 77 | raise ValueError("No datestamps found, use upstream tool first") 78 | 79 | if datestamp == 'latest': 80 | datestamp = existing_dates[-1] 81 | elif datestamp not in existing_dates: 82 | raise ValueError("Requested datestamp not present in " 83 | + self.config.datestamps + "\n" 84 | + "Existing datestamps: " + repr(existing_dates)) 85 | else: 86 | datestamp = time.strftime('%Y_%m_%d', time.localtime()) 87 | 88 | self.datestamp = datestamp 89 | self.init_logging() 90 | 91 | def get_values_as_list(self, values): 92 | if values: 93 | if type(values) is list: 94 | return values 95 | return [ v.strip() for v in values.split(',') ] 96 | else: 97 | return [ ] 98 | 99 | def config_supported(self): 100 | return True 101 | 102 | def config_add_args(self): 103 | ''' If tool supports config file (i.e. a named [TOOL] section), then 104 | reflect config file vars that are common across all tools as CLI args, 105 | too. 
Note that args with nargs=+ will be instantiated as lists.''' 106 | if not self.config_supported(): 107 | return 108 | cli = self.cli 109 | cli.add_argument('--config', nargs='+', type=argparse.FileType('r'), 110 | help='One or more configuration files') 111 | cli.add_argument('--show', default=False, action='store_true', 112 | help='Show key=value configuration state, then exit') 113 | 114 | if self.datestamp_required: 115 | cli.add_argument('--date', nargs='?', dest='datestamp', 116 | help='Use data from a given dated version (snapshot) of ' 117 | 'GDC data, specified in YYYY_MM_DD form. If omitted, ' 118 | 'the latest available snapshot will be used.') 119 | cli.add_argument('--cases', nargs='+', metavar='case_id', 120 | help='Process data only from these GDC cases') 121 | cli.add_argument('--categories',nargs='+',metavar='category', 122 | help='Mirror data only from these GDC data categories. ' 123 | 'Note that many category names contain spaces, so use ' 124 | 'quotes to delimit (e.g. \'Copy Number Variation\')') 125 | cli.add_argument('-L', '--log-dir', 126 | help='Directory where logfiles will be written') 127 | cli.add_argument('--programs', nargs='+', metavar='program', 128 | help='Process data only from these GDC programs') 129 | cli.add_argument('--projects', nargs='+', metavar='project', 130 | help='Process data only from these GDC projects') 131 | cli.add_argument('--workflow', 132 | help='Process data only from this GDC workflow type') 133 | 134 | def config_initialize(self): 135 | ''' 136 | Read initial configuration state from one or more config files; store 137 | this state within .config member, a nested dict whose keys may also be 138 | referenced as attributes (safely, with a default value of None if unset) 139 | ''' 140 | 141 | self.config = attrdict(default=attrdict()) # initially empty 142 | if not self.options.config: # list of file objects 143 | # No config file specified, use default 144 | cfg_default = resource_filename(__name__, "default.cfg") 145 | self.options.config = [open(cfg_default,"r")] 146 | 147 | cfgparser = configparser.SafeConfigParser() 148 | # The argparse module turns CLI config file args into file handles, 149 | # but config parser expects file names, so convert them here 150 | cfgparser.read([f.name for f in self.options.config]) 151 | config = self.config 152 | 153 | # [DEFAULT] defines common variables for interpolation/substitution in 154 | # other sections, and are stored at the root level of the config object 155 | for keyval in cfgparser.items('DEFAULT'): 156 | config[keyval[0]] = keyval[1] 157 | 158 | for section in cfgparser.sections(): 159 | # Note that tool-specific sections should be named to match the 160 | # tool name, i.e. 
[toolname] for each gdc_ tool 161 | config[section] = attrdict() 162 | for option in cfgparser.options(section): 163 | # DEFAULT vars ALSO behave as though they were defined in every 164 | # section, but we purposely skip them here so that each section 165 | # reflects only the options explicitly defined in that section 166 | if not config[option]: 167 | config[section][option] = cfgparser.get(section, option) 168 | 169 | self.validate_config(["root_dir"], UnsetValue={}) 170 | if not config.datestamps: 171 | config.datestamps = os.path.join(config.root_dir, "datestamps.txt") 172 | 173 | if not config.missing_file_value: 174 | config.missing_file_value = "__DELETE__" 175 | 176 | # Ensure that aggregate cohort names (if present) are in uppercase 177 | # (necessary because ConfigParser returns option names in lowercase) 178 | # If no aggregates are defined, change None obj to empty dict, for 179 | # cleaner "if X in config.aggregates:" queries that will always work 180 | if config.aggregates: 181 | for key in list(config.aggregates): # copy keys: dict is mutated below 182 | config.aggregates[key.upper()] = config.aggregates.pop(key) 183 | else: 184 | config.aggregates = {} 185 | 186 | for var in ["cases", "categories", "projects", "programs"]: 187 | config[var] = self.get_values_as_list(config[var]) 188 | 189 | def config_customize(self): 190 | pass 191 | 192 | def config_finalize(self): 193 | # Here we define & enforce precedence in runtime/configuration state: 194 | # 1) amongst the SCOPES (sources) from which that state is gathered 195 | # 2) then using intersection to decide what needs to be processed 196 | # 197 | # The runtime configuration of each GDCtool comes from several SCOPES 198 | # - built in defaults 199 | # - configuration files 200 | # - command line flags 201 | # in order of increasing precedence (CLI flags highest). For example, 202 | # setting --cases at the command line will override a 203 | # CASES: entry in a configuration file. Note we need to enforce this 204 | # precedence explicitly here because of an unavoidable chicken/egg 205 | # problem: namely that precedence can't be enforced simply by waiting 206 | # to parse the CLI args UNTIL AFTER reading the config file, because 207 | # --config is ALSO a CLI arg. So we have to give the --config CLI flag 208 | # a chance to be used in config_initialize(), then override the config 209 | # file variables (as given in named sections) here if they were ALSO 210 | # set as CLI flags. 211 | # 212 | # After scoping the values of each configuration variable, we then 213 | # intersect the cases/projects/programs values to ultimately decide 214 | # what to process (i.e. what samples to mirror/download, dice, etc) 215 | 216 | def enforce_scope(From, To): 217 | if From.log_dir: To.log_dir = From.log_dir 218 | if From.workflow: To.workflow = From.workflow 219 | if From.categories: To.categories = From.categories 220 | if From.programs: To.programs = From.programs 221 | if From.projects: To.projects = From.projects 222 | if From.cases: To.cases = From.cases 223 | 224 | config = self.config 225 | toolname = self.__class__.__name__.split('gdc_')[-1] 226 | enforce_scope(config[toolname], config) 227 | enforce_scope(self.options, config) 228 | 229 | # Determine what to ultimately process, noting that 230 | # 231 | # - explicitly specified cases implicitly constrain projects & programs 232 | # - explicitly specified projects implicitly constrain programs 233 | # 234 | # and using intersection here to adjudicate. 
Also note that specifying 235 | # 236 | # - only a program selects all projects & cases within that program 237 | # - only a project selects all cases for that project (w/in 1 program) 238 | # 239 | # but effect of the latter two is achieved downstream (not here), when 240 | # the invoked tool is performing its function. Finally, nearly all 241 | # GDCtools utilities need at least 1 case, project or program to be 242 | # specified as a prerequisite to nominal operation. 243 | 244 | projs = set(api.get_project_from_cases(config.cases)) 245 | if not projs: 246 | projs = set(config.projects) 247 | elif config.projects: 248 | projs &= set(config.projects) 249 | # If this results in an empty set of projects, reset to CLI value 250 | # as warning sign (by trying to induce downstream failure/exception) 251 | if not projs: 252 | projs = config.projects 253 | 254 | progs = set(api.get_programs(projs)) 255 | if not progs: 256 | progs = set(config.programs) 257 | elif config.programs: 258 | progs &= set(config.programs) 259 | # If this results in an empty set of programs, reset to CLI value 260 | # as warning sign (by trying to induce downstream failure/exception) 261 | if not progs: 262 | progs = config.programs 263 | 264 | config.projects = list(projs) 265 | config.programs = list(progs) 266 | for var in ["cases", "categories", "projects", "programs"]: 267 | config[var] = self.get_values_as_list(config[var]) 268 | 269 | def validate_config(self, vars_to_examine, UnsetValue=None): 270 | ''' 271 | Ensure that sufficient configuration state has been defined for tool to 272 | initiate its work; should only be called after CLI flags are parsed, 273 | because CLI has the highest precedence in setting configuration state 274 | ''' 275 | for v in vars_to_examine: 276 | result = eval("self.config." 
+ v) 277 | if result == UnsetValue: 278 | gabort(100, "Required config variable is unset: %s" % v) 279 | 280 | def datestamps(self): 281 | """ Returns a list of valid datestamps by reading the datestamps file """ 282 | if not os.path.isfile(self.config.datestamps): 283 | return [] 284 | else: 285 | raw = open(self.config.datestamps).read().strip() 286 | if not raw: 287 | return [] # Empty file 288 | else: 289 | # stamps are listed one per line, sorting is a sanity check 290 | return sorted(raw.split('\n')) 291 | 292 | def init_logging(self): 293 | 294 | if not self.config: 295 | return 296 | 297 | log_dir = self.config.log_dir 298 | datestamp = self.datestamp 299 | tool_name = self.__class__.__name__ 300 | root_logger = logging.getLogger() 301 | root_logger.setLevel(logging.DEBUG) 302 | log_formatter = logging.Formatter('%(asctime)s[%(levelname)s]: %(message)s') 303 | 304 | # Write logging data to file 305 | if log_dir and datestamp is not None: 306 | log_dir = os.path.join(log_dir, tool_name) 307 | if not os.path.isdir(log_dir): 308 | try: 309 | os.makedirs(log_dir) 310 | except: 311 | logging.info(" could not create logging dir: " + log_dir) 312 | return 313 | 314 | logfile = os.path.join(log_dir, ".".join([tool_name, datestamp, "log"])) 315 | logfile = common.increment_file(logfile) 316 | 317 | file_handler = logging.FileHandler(logfile) 318 | file_handler.setLevel(logging.DEBUG) 319 | file_handler.setFormatter(log_formatter) 320 | root_logger.addHandler(file_handler) 321 | 322 | logging.info("Logfile:" + logfile) 323 | # For easier eyeballing & CLI tab-completion, symlink to latest.log 324 | latest = os.path.join(log_dir, "latest.log") 325 | common.silent_rm(latest) 326 | os.symlink(os.path.abspath(logfile), latest) 327 | 328 | # Send to console, too, if running at valid TTY (e.g. not cron job) 329 | if os.isatty(sys.stdin.fileno()): 330 | console_handler = logging.StreamHandler() 331 | console_handler.setLevel(logging.INFO) 332 | console_handler.setFormatter(log_formatter) 333 | root_logger.addHandler(console_handler) 334 | 335 | def status(self): 336 | # Emit system info (as header comments suitable for TSV, etc) ... 337 | gprint('#') # @UndefinedVariable 338 | gprint('# %-22s = %s' % (self.__class__.__name__ + ' version ', # @UndefinedVariable 339 | self.version)) 340 | gprint('#') # @UndefinedVariable 341 | 342 | if __name__ == "__main__": 343 | 344 | class ToolExample(GDCtool): 345 | def config_supported (self): 346 | return False 347 | 348 | tool = ToolExample() 349 | tool.execute() 350 | tool.status() 351 | -------------------------------------------------------------------------------- /gdctools/Makefile: -------------------------------------------------------------------------------- 1 | 2 | include ../Makefile.inc 3 | 4 | default: 5 | @: 6 | 7 | clean: 8 | rm -rf build dist *.egg-info *~ *.pyc lib/*~ lib/*.pyc gdctools_tmp 9 | 10 | rclean: clean 11 | 12 | .PHONY: default clean 13 | 14 | -------------------------------------------------------------------------------- /gdctools/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /gdctools/config/cptac3.cfg: -------------------------------------------------------------------------------- 1 | 2 | # This file contains the GDCtools configuration for processing CPTAC3 genomic 3 | # data with GDCtools. 
As of 2018_08_31, the only CPTAC3 data available at 4 | # the Genomic Data Commons (https://gdc.cancer.gov) is controlled access 5 | # primary FASTQ/BAM level data; no genomic variant calls derived from these 6 | # BAMs are yet available for CPTAC3. For this reason the typical "first step" 7 | # of using gdc_mirror to obtain genomic variant calls & metadata is NOT BEING 8 | # PERFORMED (yet) for CPTAC3. Instead, an analogous process to mimic such 9 | # mirroring is performed by the Broad GDAC, whereby data is obtained from the 10 | # CPTAC DCC then "minimally re-mapped" to make it appear as though the mapped 11 | # data were obtained from GDC by GDCtools; this approach allows us to leverage 12 | # the remainder of the GDAC toolchain and run infrastructure "as is," namely 13 | # gdc_dice, gdc_loadfile, gdc_report, stddata_new, analyses_new, etc. When 14 | # the GDC begins serving open-access CPTAC3 variant calls then the mapping 15 | # procedure should be deprecated in favor of using gdc_mirror as originally 16 | # intended from GDCtools. 17 | # 18 | # For more details on the CPTAC3 data mapping procedure, see the code at 19 | # https://github.com/broadinstitute/gdac_experiment/cptac 20 | # 21 | # For more information on the sections of this config file and how they're 22 | # used, see the additional config files in the GDCtools distribution, e.g. 23 | # tcga.cfg and google.cfg, as well as the various cfg files used in the 24 | # ./tests directory. 25 | # 26 | # Finally, note that to ensure gdc_mirror is not accidentally used for CPTAC3 27 | # the tool will simply exit (with status 0) if/when PROGRAM=CPTAC3 is given. 28 | 29 | [DEFAULT] 30 | ROOT_DIR: /xchip/gdac_data/gdc 31 | LOG_DIR: %(ROOT_DIR)s/logs 32 | REFERENCE_DIR: %(ROOT_DIR)s/reference 33 | PROGRAMS: CPTAC3 34 | CPTAC3_CUSTOM_ROOT: /xchip/gdac_data/cptac3 35 | DATESTAMPS: %(CPTAC3_CUSTOM_ROOT)s/datestamps.txt 36 | 37 | [mirror] 38 | DIR: %(CPTAC3_CUSTOM_ROOT)s/mirror 39 | 40 | [dice] 41 | DIR: %(ROOT_DIR)s/dice 42 | 43 | [loadfile] 44 | DIR: %(ROOT_DIR)s/loadfiles 45 | FILTERED_SAMPLES: %(ROOT_DIR)s/loadfiles/filtered_samples.txt 46 | 47 | [report] 48 | # Temporarily work around fact that reports are not segregated by data PROGRAM 49 | DIR: %(ROOT_DIR)s/cptac3/reports 50 | HEATMAPS_DIR: %(ROOT_DIR)s/cptac3/heatmaps 51 | REDACTIONS_DIR: %(ROOT_DIR)s/cptac3/redactions 52 | BLACKLIST: /dev/null 53 | -------------------------------------------------------------------------------- /gdctools/config/google.cfg: -------------------------------------------------------------------------------- 1 | 2 | # This config file is meant to be used in conjunction with other, program- 3 | # specific config files (like tcga.cfg); augmenting them so that, instead 4 | # of generating loadfiles that refer to local storage they can instead 5 | # refer to buckets in Google cloud storage. It is presumed that the act 6 | # of populating the buckets (referenced in these Google loadfiles) happens 7 | # outside of GDCtools proper. 8 | # 9 | # This configuration extends a core configuration by specifying an alternative 10 | # 11 | # 1) loadfile dir, to cleanly separate google-ified loadfiles from those 12 | # generated with references to local (on-premises) storage 13 | # 2) FILE_PREFIX, so that each file referenced within the loadfile begins 14 | # with a Google bucket URI instead of path to local storage. 
Usually the 15 | # FILE_PREFIX variable need not be mentioned in a [loadfile] section, as 16 | # it will default to the value of ROOT_DIR in the [dice] config section. 17 | # 3) MISSING_FILE_VALUE (which is analogous to the NA value in R, but for files) 18 | 19 | [DEFAULT] 20 | MISSING_FILE_VALUE: gs://broad-institute-gdac/GDAC_FC_NULL 21 | 22 | [loadfile] 23 | DIR: %(ROOT_DIR)s/loadfiles/google 24 | FILE_PREFIX: gs://broad-institute-gdac/gdc/dice 25 | -------------------------------------------------------------------------------- /gdctools/config/tcga.cfg: -------------------------------------------------------------------------------- 1 | # This file contains the GDCtools configuration for processing TCGA data 2 | # from the Genomic Data Commons at https://gdc.cancer.gov. That processing 3 | # currently involves several steps: 4 | # 5 | # 1) Mirroring data & metadata from the GDC to local storage 6 | # 2) Dicing (interpreting) that data to understand its content, enforce 7 | # file format standards, count samples, etc 8 | # 3) Generate sample loadfiles (akin to freeze lists), used to load the 9 | # diced data into pipeline execution systems like Firehose 10 | # 4) Easily generate aggregate cohorts, which combine two or more 11 | # singleton cohorts into a larger cohort (e.g. combining the kidney 12 | # cohorts from TCGA into a pan-kidney cohort, aka KIPAN) 13 | # 14 | # Each of these steps has a respective section in this config file. 15 | # 16 | # On 2016_07_01 the Broad Institute proposed to GDC and NIH that some, and 17 | # perhaps most/all, of these steps might be performed more quickly and less 18 | # expensively, if instead of __only__ exposing data through an API for 19 | # downloading to local storage the GDC __also__ exposed data as pointers to 20 | # cloud buckets/storage; this would obviate the need to mirror large files 21 | # (like BAMs) to local storage, and make data loading into cloud-based analysis 22 | # systems (such as FireCloud) as easy as a shallow-copy of the bucket URI. 23 | # 24 | # Config file version: 2016_09_11 25 | 26 | [DEFAULT] 27 | ROOT_DIR: /xchip/gdac_data/gdc 28 | LOG_DIR: %(ROOT_DIR)s/logs 29 | REFERENCE_DIR: %(ROOT_DIR)s/reference 30 | PROGRAMS: TCGA 31 | 32 | [mirror] 33 | DIR: %(ROOT_DIR)s/mirror 34 | 35 | [dice] 36 | DIR: %(ROOT_DIR)s/dice 37 | 38 | [loadfile] 39 | DIR: %(ROOT_DIR)s/loadfiles 40 | FILTERED_SAMPLES: %(ROOT_DIR)s/loadfiles/filtered_samples.txt 41 | 42 | [report] 43 | DIR: %(ROOT_DIR)s/sample_reports 44 | HEATMAPS_DIR: %(ROOT_DIR)s/heatmaps 45 | REDACTIONS_DIR: %(ROOT_DIR)s/redactions 46 | BLACKLIST: %(ROOT_DIR)s/config/blacklist.tsv 47 | 48 | [aggregates] 49 | TCGA-COADREAD: TCGA-COAD,TCGA-READ 50 | TCGA-GBMLGG: TCGA-GBM,TCGA-LGG 51 | TCGA-KIPAN: TCGA-KICH,TCGA-KIRC,TCGA-KIRP 52 | TCGA-STES: TCGA-STAD,TCGA-ESCA 53 | TCGA-PANGI: TCGA-COAD,TCGA-READ,TCGA-STAD,TCGA-ESCA 54 | -------------------------------------------------------------------------------- /gdctools/default.cfg: -------------------------------------------------------------------------------- 1 | # This is the default configuration file for GDCtools: it will be used by 2 | # individual tools when no --config option is specified, and as such it 3 | # is expressly intended to be incomplete so as to avoid having the tools 4 | # perform expensive & time-consuming interaction with the GDC (e.g. by 5 | # downloading a great deal of data). Look at tcga.cfg for a full-featured 6 | # config file used by the Broad Institute GDAC to obtain and process TCGA 7 | # data from the GDC. 
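# For example, because PROGRAMS is intentionally left blank below, a mirror
# run that relies on this default configuration must name at least one program
# or project on the command line; the project shown here is purely illustrative:
#
#   gdc_mirror --projects TCGA-ACC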
8 | # 9 | # Config file version: 2017_03_05.1 10 | 11 | [DEFAULT] 12 | ROOT_DIR: ./gdctools_tmp 13 | # Logging to files is turned off by default 14 | #LOG_DIR: %(ROOT_DIR)s/logs 15 | PROGRAMS: 16 | 17 | [mirror] 18 | DIR: %(ROOT_DIR)s/mirror 19 | 20 | [dice] 21 | DIR: %(ROOT_DIR)s/dice 22 | 23 | [loadfile] 24 | DIR: %(ROOT_DIR)s/loadfiles 25 | FILTERED_SAMPLES: %(ROOT_DIR)s/loadfiles/filtered_samples.txt 26 | 27 | [report] 28 | DIR: %(ROOT_DIR)s/sample_reports 29 | HEATMAPS_DIR: %(ROOT_DIR)s/heatmaps 30 | REDACTIONS_DIR: %(ROOT_DIR)s/redactions 31 | BLACKLIST: %(ROOT_DIR)s/config/blacklist.tsv 32 | 33 | [aggregates] 34 | -------------------------------------------------------------------------------- /gdctools/gdc_list.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | # Front Matter {{{ 5 | ''' 6 | Copyright (c) 2016 The Broad Institute, Inc. All rights are reserved. 7 | 8 | gdc_list: List/query operational features of GDCtools, EITHER from 9 | a local instance or the remote GDC server. 10 | 11 | @author: Michael S. Noble, Timothy DeFreitas 12 | @date: 2017_05_08 13 | ''' 14 | 15 | from __future__ import print_function 16 | import json 17 | import types 18 | from collections import defaultdict 19 | from gdctools.GDCtool import GDCtool 20 | from gdctools.GDCcore import * 21 | from gdctools.lib.api import GDCQuery, _eq_filter as eq_filter, _and_filter as and_filter 22 | 23 | # }}} 24 | 25 | features = defaultdict(lambda:None) 26 | 27 | def features_identify(): 28 | for name,sym in globals().items(): 29 | if name.startswith("feature_") and isinstance(sym, types.FunctionType): 30 | features[name[8:]] = sym 31 | 32 | def feature_what(args): 33 | ''' Display entire set of GDCtools features that may be queried''' 34 | for name in sorted(features.keys()): 35 | print("%-12s %s" % (name, features[name].__doc__)) 36 | 37 | def feature_submitted(args): 38 | ' List the names of programs which have submitted data to the GDC.\n'\ 39 | '\t The difference between this and \'programs\' is that the latter\n'\ 40 | '\t is a subset of \'submitted,\' indicating which submissions have\n'\ 41 | '\t been processed by GDC and exposed for public download.' 
42 | call_gdc_api("submission", args) 43 | 44 | def feature_annotations(args): 45 | ''' List annotations attached to patient cases at the GDC''' 46 | call_gdc_api("annotations", args) 47 | 48 | def feature_cases(args): 49 | ''' List the patient cases across all projects/programs at the GDC''' 50 | def walk(results_list): 51 | print("feature_cases: length(results) = %d" % len(results_list)) 52 | print("Submitter_Case_ID\tDisease_Type\tSubmitter_Sample_ID\tGDC_Sample_ID") 53 | for case in sorted(results_list, key=lambda case: case["disease_type"]): 54 | case = attrdict(case) 55 | if not case.sample_ids: 56 | case.sample_ids = case.submitter_sample_ids = [None] 57 | for i in range(len(case.sample_ids)): 58 | print("%s\t%s\t%s\t%s" % ( case.submitter_id, case.disease_type, 59 | case.submitter_sample_ids[i], case.sample_ids[i])) 60 | call_gdc_api("cases", args, walk) 61 | 62 | def feature_files(args): 63 | ''' List the files in all projects/programs at the GDC''' 64 | call_gdc_api("files", args) 65 | 66 | def feature_projects(args): 67 | ''' Give name/disease/site (in TSV form) of projects stored at GDC''' 68 | 69 | def walk(results_list): 70 | print("Project_ID\tDisease\tPrimary_Site") 71 | for p in sorted(results_list, key=lambda proj: proj["project_id"]): 72 | p = attrdict(p) 73 | print("%s\t%s\t%s" % (p.project_id, p.name, p.primary_site[0])) 74 | 75 | call_gdc_api("projects", args, walk) 76 | 77 | def feature_programs(args): 78 | ''' List the names of all programs (data sets) warehoused at the GDC''' 79 | call_gdc_api("programs", args) 80 | 81 | def call_gdc_api(feature, args, callback=None): 82 | ''' Issue GDC API call, first parsing filters/fields/expand args from CLI''' 83 | 84 | query = GDCQuery(feature) 85 | 86 | # Ask that result set be pruned, by applying KEY=VALUE filters 87 | for filt in args.filters: 88 | key, value = filt.split("=", 1) # split on first '=' only 89 | query.add_eq_filter(key, value) 90 | 91 | # Additional fields to include in each item of result set 92 | #if args.fields: 93 | # query.add_fields(*(tuple(args.fields))) 94 | 95 | # See GDC docs for meaning of expand 96 | #if args.expand: 97 | # query.add_expansions(*(tuple(args.expand))) 98 | 99 | results = query.get() 100 | if not callback or args.raw: 101 | print(json.dumps(results, indent=2)) 102 | else: 103 | callback(results) 104 | 105 | class gdc_list(GDCtool): 106 | 107 | def __init__(self): 108 | 109 | description = 'Quicklook tool for examining datasets available at the '\ 110 | 'GDC, following the\nsyntax of portal.gdc.cancer.gov/query. 
'\ 111 | 'Examples:\n\n'\ 112 | ' # Show datasets exposed via the GDC public API\n' \ 113 | ' gdc_list programs\n\n' \ 114 | ' # Show all datasets submitted to GDC, including non-public\n' \ 115 | ' gdc_list submitted\n\n' \ 116 | ' # Show patient cases in the TCGA adrenocortical cohort\n'\ 117 | ' gdc_list cases project.project_id=TCGA-ACC\n\n' \ 118 | ' # Show metadata of all files in TCGA uveal melanoma cohort\n'\ 119 | ' gdc_list files cases.project.project_id=TCGA-UVM\n\n' \ 120 | ' # Show what queries may be performed, in summary form\n'\ 121 | ' gdc_list what\n\n' \ 122 | ' # Show open-access clinical data in TCGA ovarian cohort\n'\ 123 | ' gdc_list files cases.project.project_id=TCGA-OV files.data_category=Clinical files.access=open' 124 | 125 | super(gdc_list, self).__init__("0.1.2", description) 126 | cli = self.cli 127 | #cli.add_argument('-e', '--expand', nargs='+', 128 | # help='Expand these nested fields') 129 | #cli.add_argument('-f', '--fields', nargs='+') 130 | cli.add_argument('-n', '--num-results', default=-1, type=int, 131 | help='return at most this many results') 132 | cli.add_argument('-r', '--raw', action='store_true', 133 | help='Some features process the payload returned by the GDC API '\ 134 | 'to simplify interpretation; this flag turns that off, '\ 135 | 'permitting direct inspection of the raw payload') 136 | cli.add_argument('-s', '--page-size', default=500, type=int, 137 | help='Server page size') 138 | cli.add_argument('feature', 139 | help='Which feature to query/list (case insensitive). The special '\ 140 | 'feature of \'what\' may be given here, to display the names '\ 141 | 'of all features that may be queried') 142 | cli.add_argument('filters', nargs='*', metavar='filter', 143 | help="Prune with key=value filters, e.g. program.name=TCGA") 144 | 145 | features_identify() 146 | 147 | def execute(self): 148 | super(gdc_list, self).execute() 149 | args = self.options 150 | feature = features[args.feature] 151 | if feature: 152 | feature(args) 153 | else: 154 | gabort(1, "Unsupported feature: " + args.feature) 155 | 156 | def config_supported(self): 157 | return False 158 | 159 | def main(): 160 | gdc_list().execute() 161 | 162 | if __name__ == "__main__": 163 | main() 164 | -------------------------------------------------------------------------------- /gdctools/gdc_mirror.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | # Front Matter {{{ 5 | ''' 6 | Copyright (c) 2016 The Broad Institute, Inc. All rights are reserved. 7 | 8 | gdc_mirror: this file is part of gdctools. See the /COPYRIGHT 9 | file for the SOFTWARE COPYRIGHT and WARRANTY NOTICE. 10 | 11 | @author: Michael S. Noble, Timothy DeFreitas 12 | @date: 2016_05_18 13 | ''' 14 | 15 | # }}} 16 | 17 | import sys 18 | import os 19 | import logging 20 | import time 21 | import json 22 | 23 | from gdctools.GDCcore import * 24 | from gdctools.GDCtool import GDCtool 25 | import gdctools.lib.api as api 26 | import gdctools.lib.meta as meta 27 | import gdctools.lib.common as common 28 | 29 | class gdc_mirror(GDCtool): 30 | 31 | def __init__(self): 32 | 33 | description = 'Create local mirror of data from arbitrary set of '\ 34 | 'programs/projects warehoused\nat the Genomic Data Commons (GDC)' 35 | 36 | # Note that gdc_mirror is the only GDCtool which does not require 37 | # datestamp (i.e. 
data version) to exist apriori, b/c it creates them 38 | super(gdc_mirror, self).__init__("0.9.2", description, False) 39 | cli = self.cli 40 | cli.add_argument('-m', '--mirror-dir', 41 | help='Root of mirrored data folder tree') 42 | cli.add_argument('-l', '--legacy', default=False, action='store_true', 43 | help='Retrieve legacy data (e.g. TCGA HG19), instead of ' 44 | 'data harmonized at the GDC (the default)') 45 | cli.add_argument('-f', '--force-download', action='store_true', 46 | help='Download files even if already mirrored locally.'+ 47 | ' (DO NOT use during incremental mirroring)') 48 | 49 | # Detect if we have curl installed 50 | self.has_cURL = api.curl_exists() 51 | 52 | def config_customize(self): 53 | opts = self.options 54 | config = self.config 55 | if opts.mirror_dir: config.mirror.dir = opts.mirror_dir 56 | self.force_download = opts.force_download 57 | self.workflow = opts.workflow 58 | 59 | if config.mirror.legacy: 60 | # Legacy mode has been requested in config file, coerce to boolean 61 | value = config.mirror.legacy.lower() 62 | config.mirror.legacy = (value in ["1", "true", "on", "yes"]) 63 | 64 | # Allow command line flag to override config file 65 | if opts.legacy: 66 | config.mirror.legacy = opts.legacy 67 | 68 | # Legacy mode has several effects: 69 | # 1) Ensures that api requests are routed to the GDC legacy API 70 | # 2) Ensuring that legacy program data are returned (e.g. TCGA HG19) 71 | # 3) Turns OFF strict file processing: returned data files are thus 72 | # mirrored "as is," i.e. verbatim, regardless of file type (or 73 | # extension), with no UUID inserted into names of mirrored files 74 | # 4) Prohibits subsequent processing, e.g. dicing: the GDCtools suite 75 | # ONLY supports MIRRORING of legacy, nothing else 76 | api.set_legacy(config.mirror.legacy) 77 | 78 | def mirror(self): 79 | 80 | config = self.config 81 | if not os.path.isdir(config.mirror.dir): 82 | os.makedirs(config.mirror.dir) 83 | 84 | # Validate program and project names, if specified 85 | projects = [] 86 | programs = [] 87 | if config.projects: 88 | all_projects = api.get_projects() 89 | for proj in config.projects: 90 | if proj not in all_projects: 91 | gprint("Project " + proj + " not found in GDC, ignoring") 92 | else: 93 | projects.append(proj) 94 | 95 | if config.programs: 96 | all_programs = api.get_programs() 97 | for prog in config.programs: 98 | if prog not in all_programs: 99 | # Special handling for certain data not yet exposed by GDC: 100 | # exit with zero status code, so that downstream tools 101 | # (e.g. gdc_dice) run in a shell script or Makefile can 102 | # still be executed, if desired, upon mocked-up data 103 | if prog in ["CPTAC3",]: 104 | gabort(0, "Nothing to mirror yet for %s, exiting gracefully" % prog) 105 | gprint("Program %s not found in GDC, ignoring" % prog) 106 | else: 107 | programs.append(prog) 108 | 109 | # Avoid accidental attempts to download entire GDC. 
Also note that 110 | # other tools do not need to be this stringent, because they can 111 | # infer programs/projects/cases from mirror or derivatives of it 112 | if not (programs or projects): 113 | gabort(1, "No valid programs or projects specified in config "+ 114 | "file or command line flags") 115 | 116 | logging.info("GDC Mirror Version: %s", self.version) 117 | logging.info("Command: " + " ".join(sys.argv)) 118 | 119 | if not projects: 120 | logging.info("No projects specified, inferring from programs") 121 | projects = [] 122 | for prgm in programs: 123 | projects_for_this_program = api.get_projects(program=prgm) 124 | logging.info("%d project(s) found for %s: %s" % \ 125 | (len(projects_for_this_program), 126 | prgm, ",".join(projects_for_this_program))) 127 | projects.extend(projects_for_this_program) 128 | 129 | # Make list of which projects belong to each program 130 | program_projects = dict() 131 | for project in projects: 132 | prgm = api.get_program(project) 133 | if prgm not in program_projects: program_projects[prgm] = [] 134 | program_projects[prgm].append(project) 135 | 136 | # Now loop over each program, acquiring lock 137 | for prgm in program_projects: 138 | projects = program_projects[prgm] 139 | prgm_root = os.path.abspath(os.path.join(config.mirror.dir, prgm)) 140 | 141 | with common.lock_context(prgm_root, "mirror"): 142 | for project in sorted(projects): 143 | self.mirror_project(prgm, project) 144 | 145 | # Update the datestamps file with this version of the mirror 146 | self.update_datestamps_file() 147 | logging.info("Mirror completed successfully.") 148 | 149 | def __mirror_file(self, file_d, proj_root, n, total, retries=3): 150 | '''Mirror a file into //. 151 | 152 | Files are uniquely identified by uuid. 153 | ''' 154 | strict = not self.config.mirror.legacy 155 | savepath = meta.mirror_path(proj_root, file_d, strict=strict) 156 | dirname, basename = os.path.split(savepath) 157 | logging.info("Mirroring file {0} | {1} of {2}".format(basename, n, total)) 158 | 159 | #Ensure /// exists 160 | if not os.path.isdir(dirname): 161 | os.makedirs(dirname) 162 | 163 | md5path = savepath + ".md5" 164 | 165 | # Download if force is enabled or if the file is not on disk 166 | if (self.force_download or not meta.md5_matches(file_d, md5path, strict) 167 | or not os.path.isfile(savepath)): 168 | 169 | # New file, mirror to this folder 170 | time = 180 171 | retry = 0 172 | while retry <= retries: 173 | try: 174 | #Download file 175 | uuid = file_d['file_id'] 176 | if self.has_cURL: 177 | api.curl_download_file(uuid, savepath, max_time=time) 178 | else: 179 | api.py_download_file(uuid, savepath) 180 | break 181 | except Exception as e: 182 | logging.warning("Download failed: " + str(e) + '\nRetrying...') 183 | retry += 1 184 | # Give some more time, in case the file is large... 185 | # TODO: is this worth it? 
186 | time += 180 187 | 188 | if retry > retries: 189 | # A partially downloaded file will interfere with subsequent 190 | # mirrors 191 | common.silent_rm(savepath) 192 | logging.error("Error downloading file {0}, too many retries ({1})".format(savepath, retries)) 193 | else: 194 | #Save md5 checksum on success 195 | md5sum = file_d['md5sum'] 196 | md5path = savepath + ".md5" 197 | with open(md5path, 'w') as mf: 198 | mf.write(md5sum + " " + basename) 199 | 200 | def mirror_project(self, program, project): 201 | '''Mirror one project folder''' 202 | 203 | datestamp = self.datestamp 204 | config = self.config 205 | logging.info("Mirroring started for {0} ({1})".format(project, program)) 206 | 207 | categories = config.categories 208 | if not categories: 209 | logging.info("No categories specified, using GDC API to " + \ 210 | "discover ALL available categories") 211 | categories = api.get_categories(project) 212 | 213 | logging.info("Using %d data categories: %s" % \ 214 | (len(categories), ",".join(categories))) 215 | proj_dir = os.path.join(config.mirror.dir, program, project) 216 | logging.info("Mirroring data to " + proj_dir) 217 | 218 | # Read the previous metadata, if present 219 | prev_datestamp = meta.latest_datestamp(proj_dir, None) 220 | prev_metadata = [] 221 | if prev_datestamp is not None: 222 | prev_stamp_dir = os.path.join(proj_dir, "metadata", prev_datestamp) 223 | prev_metadata = meta.latest_metadata(prev_stamp_dir) 224 | 225 | # Mirror each category separately, recording metadata (file dicts) 226 | file_metadata = [] 227 | for cat in sorted(categories): 228 | cat_data = self.mirror_category(program, project, cat, 229 | self.workflow, 230 | prev_metadata) 231 | file_metadata.extend(cat_data) 232 | 233 | # Record project-level metadata 234 | # file dicts, counts, redactions, blacklist, etc. 235 | meta_folder = os.path.join(proj_dir,"metadata") 236 | stamp_folder = os.path.join(meta_folder, datestamp) 237 | if not os.path.isdir(stamp_folder): 238 | os.makedirs(stamp_folder) 239 | 240 | # Write file metadata 241 | meta_json = ".".join(["metadata", project, datestamp, "json" ]) 242 | meta_json = os.path.join(stamp_folder, meta_json) 243 | with open(meta_json, 'w') as jf: 244 | json.dump(file_metadata, jf, indent=2) 245 | 246 | def mirror_category(self, program, project, category, 247 | workflow, prev_metadata): 248 | '''Mirror one category of data in a particular project. 249 | Return the mirrored file metadata. 250 | ''' 251 | proj_dir = os.path.join(self.config.mirror.dir, program, project) 252 | cat_dir = os.path.join(proj_dir, category.replace(' ', '_')) 253 | strict = not self.config.mirror.legacy 254 | 255 | # Create data folder 256 | if not os.path.isdir(cat_dir): 257 | logging.info("Creating folder: " + cat_dir) 258 | os.makedirs(cat_dir) 259 | 260 | # If cases is a list, only files from these cases will be returned, 261 | # otherwise all files from the category will be 262 | cases = self.config.cases 263 | file_metadata = api.get_project_files(project, category, 264 | workflow, cases=cases) 265 | 266 | # Filter out extraneous cases from multi-case (e.g. 
MAF) file metadata 267 | # if cases have been specified 268 | if cases: 269 | for idx, file_dict in enumerate(file_metadata): 270 | if len(file_dict.get("cases", [])) > 1: 271 | file_metadata[idx]["cases"] = \ 272 | [case for case in file_metadata[idx]["cases"] \ 273 | if case["submitter_id"] in cases] 274 | 275 | new_metadata = file_metadata 276 | 277 | # If we aren't forcing a full mirror, check the existing metadata 278 | # to see what files are new 279 | if not self.force_download: 280 | new_metadata = meta.files_diff(proj_dir, file_metadata, 281 | prev_metadata, strict) 282 | 283 | num_files = len(new_metadata) 284 | logging.info("{0} new {1} files".format(num_files, category)) 285 | 286 | for n, file_d in enumerate(new_metadata): 287 | self.__mirror_file(file_d, proj_dir, n+1, num_files) 288 | 289 | return file_metadata 290 | 291 | def execute(self): 292 | super(gdc_mirror, self).execute() 293 | try: 294 | self.mirror() 295 | except Exception: 296 | logging.exception("Mirroring FAILED:") 297 | sys.exit(1) 298 | 299 | def update_datestamps_file(self): 300 | """ Update the datestamps file with this mirror """ 301 | datestamps_file = self.config.datestamps 302 | 303 | logging.info("Updating datestamps in " + datestamps_file) 304 | 305 | # if it doesn't exist, create a blank one 306 | if not os.path.isfile(datestamps_file): 307 | open(datestamps_file, 'w') 308 | 309 | # Now read the file 310 | datestamps_file = open(datestamps_file, 'r+') 311 | stamps = datestamps_file.read().strip().split('\n') 312 | if stamps[-1] != self.datestamp: 313 | datestamps_file.write(self.datestamp + '\n') 314 | 315 | def main(): 316 | gdc_mirror().execute() 317 | 318 | if __name__ == "__main__": 319 | main() 320 | -------------------------------------------------------------------------------- /gdctools/gdc_report.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | # Front Matter {{{ 5 | ''' 6 | Copyright (c) 2016 The Broad Institute, Inc. All rights are reserved. 7 | 8 | gdc_report: wrapper around SampleSummaryReport.R for GDC-derived data 9 | See the /COPYRIGHT file for the SOFTWARE COPYRIGHT and WARRANTY NOTICE. 10 | 11 | @author: Timothy DeFreitas, Michael S. 
Noble 12 | @date: 2016_09_11 13 | ''' 14 | 15 | # }}} 16 | 17 | from __future__ import print_function 18 | import subprocess 19 | import logging 20 | import os 21 | import sys 22 | from pkg_resources import resource_filename 23 | from glob import iglob 24 | 25 | from gdctools.lib.heatmap import draw_heatmaps 26 | from gdctools.lib.meta import extract_case_data 27 | from gdctools.lib.common import silent_rm 28 | from gdctools.GDCtool import GDCtool 29 | 30 | class gdc_report(GDCtool): 31 | 32 | def __init__(self): 33 | description = 'Generate a sample report for a snapshot of data ' + \ 34 | 'mirrored & diced\nfrom the Genomic Data Commons (GDC)' 35 | super(gdc_report, self).__init__("0.3.4", description) 36 | 37 | # FIXME: add options for each config setting 38 | 39 | def config_customize(self): 40 | # Ensure tool has sufficient configuration info to run 41 | mandatory_config = ["dice.dir", "loadfile.dir", "reference_dir"] 42 | mandatory_config += ["report.dir", "report.blacklist"] 43 | self.validate_config(mandatory_config) 44 | 45 | def generate_report(self, program): 46 | config = self.config 47 | diced_prog_root = os.path.join(config.dice.dir, program) 48 | datestamp = self.datestamp 49 | logging.info("Generating report for snapshot date " + str(datestamp)) 50 | latest = os.path.join(config.report.dir, 'latest') 51 | config.report.dir = os.path.join(config.report.dir, 'report_'+datestamp) 52 | if not os.path.isdir(config.report.dir): 53 | os.makedirs(config.report.dir) 54 | silent_rm(latest) 55 | os.symlink(os.path.abspath(config.report.dir), latest) 56 | 57 | # Now infer remaining values from the diced data directory 58 | logging.info("Obtaining diced metadata...") 59 | get_diced_metadata(diced_prog_root, config.report.dir, datestamp) 60 | link_loadfile_metadata(config.loadfile.dir, program, config.report.dir, datestamp) 61 | 62 | if config.aggregates: 63 | logging.info("Writing aggregate cohort definitions to report dir...") 64 | self.write_aggregate_definitions() 65 | 66 | logging.info("Linking combined sample counts ...") 67 | all_counts_file = '.'.join(['sample_counts', datestamp, 'tsv']) 68 | link_metadata_file(os.path.join(diced_prog_root, 'metadata'), 69 | self.config.report.dir, all_counts_file) 70 | 71 | # Command line arguments for report generation 72 | self.cmdArgs = ["Rscript", "--vanilla"] 73 | report_script = resource_filename(__name__, "lib/GDCSampleReport.R") 74 | self.cmdArgs.extend([ report_script, # From gdctools pkg 75 | datestamp, # Specified from cli 76 | config.report.dir, 77 | config.reference_dir, 78 | config.report.blacklist 79 | ]) 80 | 81 | logging.info("Running GDCSampleReport.R ") 82 | logging.info("CMD Args: " + " ".join(self.cmdArgs)) 83 | p = subprocess.Popen(self.cmdArgs, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 84 | for line in p.stdout: 85 | logging.info(line.rstrip()) 86 | p.stdout.flush() 87 | 88 | def execute(self): 89 | super(gdc_report, self).execute() 90 | try: 91 | for program in self.config.programs: 92 | self.generate_report(program) 93 | except: 94 | logging.exception("Sample report generation FAILED:") 95 | sys.exit(1) 96 | 97 | def write_aggregate_definitions(self): 98 | '''Creates an aggregates.txt file in the reports directory. aggregates 99 | information is read from the [aggregates] section of the config file. 
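        Illustration (hypothetical cohort values; assumes the usual
        "key = comma-separated values" syntax of the .cfg files): a section
        such as

            [aggregates]
            TCGA-COADREAD = TCGA-COAD,TCGA-READ

        produces an aggregates.txt with the header row
        "Aggregate Name<TAB>Tumor Types" followed by one tab-separated line
        per aggregate, e.g. "TCGA-COADREAD<TAB>TCGA-COAD,TCGA-READ".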
100 | ''' 101 | aggregates = self.config.aggregates 102 | ag_file = os.path.join(self.config.report.dir, 'aggregates.txt') 103 | with open(ag_file, 'w') as f: 104 | f.write('Aggregate Name\tTumor Types\n') 105 | for agg in sorted(aggregates.keys()): 106 | f.write(agg + '\t' + aggregates[agg] + '\n') 107 | 108 | def get_diced_metadata(diced_prog_root, report_dir, datestamp): 109 | ''' 110 | Create heatmaps and symlinks to dicing metadata in 111 | /report_. 112 | ''' 113 | for meta_dir in iglob(os.path.join(diced_prog_root, '*', 'metadata', 114 | datestamp)): 115 | project = meta_dir.split(os.path.sep)[-3] 116 | 117 | #Link project-level sample counts 118 | samp_counts = '.'.join([project, datestamp, 'sample_counts', 'tsv']) 119 | link_metadata_file(meta_dir, report_dir, samp_counts) 120 | 121 | # Link the diced metadata TSV 122 | diced_meta = '.'.join([project, datestamp, 'diced_metadata', 'tsv']) 123 | link_metadata_file(meta_dir, report_dir, diced_meta) 124 | 125 | # Create high and low res heatmaps in the report dir 126 | logging.info("Generating heatmaps for " + project) 127 | case_data = extract_case_data(os.path.join(meta_dir, diced_meta)) 128 | draw_heatmaps(case_data, project, datestamp, report_dir) 129 | 130 | def link_metadata_file(from_dir, report_dir, filename): 131 | """ Ensures symlink report_dir/filename -> from_dir/filename exists""" 132 | from_path = os.path.join(from_dir, filename) 133 | from_path = os.path.abspath(from_path) 134 | rpt_path = os.path.join(report_dir, filename) 135 | rpt_path = os.path.abspath(rpt_path) 136 | if os.path.isfile(from_path) and not os.path.isfile(rpt_path): 137 | os.symlink(from_path, rpt_path) 138 | 139 | def link_loadfile_metadata(loadfiles_dir, program, report_dir, datestamp): 140 | """Symlink loadfile and filtered samples into report directory""" 141 | from_dir = os.path.join(loadfiles_dir, program, datestamp) 142 | loadfile = program + ".Sample.loadfile.txt" 143 | link_metadata_file(from_dir, report_dir, loadfile) 144 | filtered = program + ".filtered_samples.txt" 145 | link_metadata_file(from_dir, report_dir, filtered) 146 | 147 | def main(): 148 | gdc_report().execute() 149 | 150 | if __name__ == "__main__": 151 | main() 152 | -------------------------------------------------------------------------------- /gdctools/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/gdctools/ce388330c73152a5fa6e1e005c5b50997f29e5f6/gdctools/lib/__init__.py -------------------------------------------------------------------------------- /gdctools/lib/annotations_table.tsv: -------------------------------------------------------------------------------- 1 | Firehose_annotation converter data_category data_type experimental_strategy platform center_namespace tags workflow_type 2 | clinical__biospecimen clinical Biospecimen Biospecimen Supplement 3 | clinical__primary clinical Clinical Clinical Supplement 4 | clinical__primary clin2magetab Clinical Clinical Supplement CPTAC Clinical 5 | CNV__unfiltered__snp6 segfile_snp6 Copy Number Variation Copy Number Segment Genotyping Array Affymetrix SNP 6.0 DNAcopy 6 | CNV__snp6 segfile_snp6 Copy Number Variation Masked Copy Number Segment Genotyping Array Affymetrix SNP 6.0 DNAcopy 7 | CNV__wxs seg_wxs_washu Copy Number Variation Masked Copy Number Segment DNASeqC WashU_CNV_WXS DNAcopy 8 | methylation__HM27 usc_meth2magetab DNA Methylation Methylation Beta Value Methylation Array Illumina Human Methylation 27 Liftover 9 | 
methylation__HM450 usc_meth2magetab DNA Methylation Methylation Beta Value Methylation Array Illumina Human Methylation 450 Liftover 10 | methylation__EPIC washu_meth2magetab DNA Methylation Methylation Beta Value Methylation Array Illumina Methylation EPIC Bioconductor - methylationArrayAnalysis 11 | miR__geneExp tsv2magetab Transcriptome Profiling miRNA Expression Quantification miRNA-Seq BCGSC miRNA Profiling 12 | miR__isoformExp tsv2idtsv Transcriptome Profiling Isoform Expression Quantification miRNA-Seq BCGSC miRNA Profiling 13 | mRNA__geneExp__FPKM unzip_fpkm2magetab Transcriptome Profiling Gene Expression Quantification RNA-Seq HTSeq - FPKM 14 | mRNA__geneExpNormed__FPKM unzip_fpkm2magetab Transcriptome Profiling Gene Expression Quantification RNA-Seq HTSeq - FPKM-UQ 15 | mRNA__counts__FPKM unzip_fpkm2magetab Transcriptome Profiling Gene Expression Quantification RNA-Seq HTSeq - Counts 16 | mRNA__geneExp__FPKM fpkm2magetab Transcriptome Profiling Gene Expression Quantification RNA-Seq TopHat2 - FPKM 17 | mRNA__geneExpNormed__FPKM fpkm2magetab Transcriptome Profiling Gene Expression Quantification RNA-Seq TopHat2 - FPKM-UQ 18 | mRNA__counts__FPKM fpkm2magetab Transcriptome Profiling Gene Expression Quantification RNA-Seq TopHat2 - Counts 19 | mRNA__geneExp__FPKM fpkm2magetab Transcriptome Profiling Gene Expression Quantification RNA-Seq Cufflinks - FPKM 20 | mRNA__geneExpNormed__FPKM fpkm2magetab Transcriptome Profiling Gene Expression Quantification RNA-Seq Cufflinks - FPKM-UQ 21 | SNV__mutect maf Simple Nucleotide Variation Masked Somatic Mutation WXS MuTect2 Variant Aggregation and Masking 22 | SNV__varscan maf_uncompressed Simple Nucleotide Variation Masked Somatic Mutation WXS WashU_Somatic_WXS VarScan2 Variant Aggregation and Masking 23 | -------------------------------------------------------------------------------- /gdctools/lib/api.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | # Front Matter {{{ 5 | ''' 6 | Copyright (c) 2016 The Broad Institute, Inc. All rights are reserved. 7 | GDCQuery class and high-level API functions 8 | 9 | @author: Timothy DeFreitas, David Heiman, Michael S. 
Noble 10 | @date: 2016_11_10 11 | ''' 12 | 13 | # }}} 14 | 15 | import requests 16 | import json 17 | import logging 18 | import subprocess 19 | import os 20 | 21 | __legacy = False 22 | __verbosity = 0 23 | logging.getLogger("requests").setLevel(logging.WARNING) 24 | 25 | class GDCQuery(object): 26 | # Class variables 27 | ENDPOINTS = ('cases', 'files', 'programs', 'projects', 'submission') 28 | GDC_ROOT = 'https://api.gdc.cancer.gov/' 29 | 30 | # Queries returning more than this many results will log a warning 31 | WARN_RESULT_CT = 5000 32 | 33 | def __init__(self, endpoint, fields=None, expand=None, filters=None): 34 | self._endpoint = endpoint.lower() # normalize to lowercase 35 | assert(endpoint in GDCQuery.ENDPOINTS) 36 | # Make copies of all mutable 37 | self._fields = fields if fields else [] 38 | self._expand = expand if expand else [] 39 | self._filters = filters if filters else [] 40 | 41 | def add_eq_filter(self, field, value): 42 | self._filters.append(_eq_filter(field, value)) 43 | return self 44 | 45 | def add_neq_filter(self, field, value): 46 | self._filters.append(_neq_filter(field, value)) 47 | return self 48 | 49 | def add_in_filter(self, field, values): 50 | self._filters.append(_in_filter(field,values)) 51 | 52 | def filters(self): 53 | return self._filters 54 | 55 | def add_fields(self, *fields): 56 | self._fields.extend(fields) 57 | return self 58 | 59 | def add_expansions(self, *fields): 60 | self._expand.extend(fields) 61 | return self 62 | 63 | def url(self): 64 | '''For debugging purposes, build a PreparedRequest to show the url.''' 65 | req = requests.Request('GET', self._base_url(), params=self._params()) 66 | r = req.prepare() 67 | return r.url 68 | 69 | def _base_url(self): 70 | url = GDCQuery.GDC_ROOT 71 | if get_legacy(): url += 'legacy/' 72 | url += self._endpoint 73 | return url 74 | 75 | def _params(self): 76 | params = dict() 77 | if self._fields: 78 | params['fields'] = ','.join(self._fields) 79 | if self._expand: 80 | params['expand'] = ','.join(self._expand) 81 | if self._filters: 82 | if len(self._filters) == 1: 83 | params['filters'] = json.dumps(self._filters[0]) 84 | else: 85 | params['filters'] = json.dumps(_and_filter(self._filters)) 86 | return params 87 | 88 | def _query_paginator(self, page_size=500, from_idx=0, to_idx=-1): 89 | '''Returns list of hits, iterating over server paging''' 90 | endpoint = self._base_url() 91 | p = self._params() 92 | p['from'] = from_idx 93 | p['size'] = page_size 94 | 95 | # For pagination to work, the records must specify a sort order. This 96 | # lookup tells the right field to use based on the endpoint 97 | sort_lookup = { 'files' : 'file_id', 98 | 'cases' : 'case_id', 99 | 'projects' : 'project_id', 100 | 'submission': 'links'} 101 | 102 | endpoint_name = endpoint.rstrip('/').split('/')[-1] 103 | p['sort'] = sort_lookup.get(endpoint_name, "") 104 | 105 | # Make initial call 106 | r = requests.get(endpoint, params=p) 107 | if get_verbosity(): 108 | print("\nGDC query: %s\n" % r.url) 109 | r_json = _decode_json(r) 110 | 111 | # Log any warnings in response, but don't raise an error yet 112 | _log_warnings(r_json, r.url) 113 | 114 | # GDC 'submission' endpoint is inconsistent (does not yet return results 115 | # within a {"data": {"hits": … } } JSON block--so we work around here. 
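        # Roughly, the two response shapes handled here look like this
        # (field values illustrative only):
        #   most endpoints: {"data": {"hits": [...], "pagination": {"total": N, ...}}}
        #   'submission':   {"links": ["/v0/submission/TCGA", ...]}
        # which is why program names are recovered by splitting each link on '/'.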
116 | if endpoint_name == 'submission': 117 | results = r_json['links'] 118 | results = [ program.split('/')[-1] for program in results ] 119 | self.hits = results 120 | return results 121 | 122 | # The 'programs' endpoint does not actually exist in GDC api (but has 123 | # been requested by Broad). Until then we fake it for convenience. 124 | if endpoint_name == 'programs': 125 | self.hits = get_programs() 126 | return self.hits 127 | 128 | # Get first page of hits, and pagination data 129 | data = r_json['data'] 130 | all_hits = data['hits'] 131 | pagination = data['pagination'] 132 | total = pagination['total'] if to_idx == -1 else to_idx 133 | 134 | # Some queries can return a large number of results, warn here 135 | if total > GDCQuery.WARN_RESULT_CT: 136 | logging.warning(str(total) + " files match this query, paging " 137 | + "through all results may take some time") 138 | 139 | for from_idx in range(page_size+1, total, page_size): 140 | #Iterate over pages to get the remaning hits 141 | p['from'] = from_idx 142 | r = requests.get(endpoint, params=p) 143 | 144 | hits = _decode_json(r)['data']['hits'] 145 | all_hits.extend(hits) 146 | 147 | self.hits = all_hits 148 | return all_hits # Chop off hits on the last page if they exceed to_idx 149 | 150 | def get(self, page_size=500): 151 | return self._query_paginator(page_size=page_size) 152 | 153 | def get_projects(program=None): 154 | query = GDCQuery('projects') 155 | if program: 156 | query.add_eq_filter('program.name', program) 157 | query.add_fields('project_id') 158 | projects = [d['project_id'] for d in query.get()] 159 | return sorted(projects) 160 | 161 | def get_project_from_cases(cases, program=None): 162 | if not cases: return [] 163 | query = GDCQuery('cases') 164 | query.add_in_filter('submitter_id', cases) 165 | query.add_fields('project.project_id') 166 | projects = [p['project']['project_id'] for p in query.get()] 167 | return sorted(set(projects)) 168 | 169 | def get_categories(project): 170 | query = GDCQuery('projects') 171 | query.add_eq_filter('project_id', project) 172 | query.add_fields('summary.data_categories.data_category') 173 | projects = query.get() 174 | 175 | #Sanity check 176 | if len(projects) > 1: 177 | raise ValueError("More than one project matched '" + project + "'") 178 | 179 | proj = projects[0] 180 | if 'summary' in proj: 181 | return [d['data_category'] for d in proj['summary']['data_categories']] 182 | else: 183 | return [] # Needed to protect against projects with no data 184 | 185 | def get_project_files(project_id, data_category, workflow_type=None, cases=None, 186 | page_size=500): 187 | query = GDCQuery('files') 188 | query.add_eq_filter("cases.project.project_id", project_id) 189 | query.add_eq_filter("files.data_category", data_category) 190 | query.add_eq_filter("access", "open") 191 | 192 | if not __legacy: 193 | if workflow_type: 194 | query.add_eq_filter('analysis.workflow_type', workflow_type) 195 | query.add_fields('analysis.workflow_type') 196 | 197 | if cases: 198 | query.add_in_filter('cases.submitter_id', cases) 199 | 200 | query.add_fields('file_id', 'file_name', 'cases.samples.sample_id', 201 | 'data_type', 'data_category', 'data_format', 202 | 'experimental_strategy', 'md5sum','platform','tags', 203 | 'center.namespace', 'cases.submitter_id', 204 | 'cases.project.project_id', 205 | # For protein expression data 206 | 'cases.samples.portions.submitter_id', 207 | # For aliquot-level data 208 | 'cases.samples.portions.analytes.aliquots.submitter_id') 209 | 210 | # Prune 
Clinical/Biospecimen data, by avoiding download/mirror of 211 | # - path reports/images (Data Type: Slide Image), as they can be huge 212 | # - Biotab files, as redundant with BCR XML (and incomplete from TCGA) 213 | # But note that this is better done with a config file (see issue 73) 214 | if data_category == "Biospecimen": 215 | query.add_eq_filter("data_type", "Biospecimen Supplement") 216 | query.add_neq_filter("data_format", "BCR Biotab") 217 | elif data_category == "Clinical": 218 | query.add_neq_filter("data_format", "BCR Biotab") 219 | 220 | query.add_expansions('cases', 'annotations', 'cases.samples') 221 | return query.get(page_size=page_size) 222 | 223 | def curl_exists(): 224 | """ Return true if curl can be executed on this system """ 225 | try: 226 | DEV_NULL = open(os.devnull, 'w') 227 | subprocess.check_call(['curl', '-V'], 228 | stdout=DEV_NULL, stderr=subprocess.STDOUT) 229 | return True 230 | except (OSError, subprocess.CalledProcessError): 231 | return False 232 | 233 | def py_download_file(uuid, file_name, chunk_size=4096): 234 | """Download a single file from GDC.""" 235 | url = GDCQuery.GDC_ROOT 236 | if __legacy: url += 'legacy/' 237 | url += 'data/' + uuid 238 | r = requests.get(url, stream=True) 239 | # TODO: Optimize chunk size 240 | # Larger chunk size == more memory, but fewer packets 241 | with open(file_name, 'wb') as f: 242 | for chunk in r.iter_content(chunk_size=chunk_size): 243 | if chunk: 244 | f.write(chunk) 245 | 246 | # Return the response, which includes status_code, http headers, etc. 247 | return r 248 | 249 | def curl_download_file(uuid, file_name, max_time=180): 250 | """Download a single file from the GDC, using cURL""" 251 | url = GDCQuery.GDC_ROOT 252 | if __legacy: url += 'legacy/' 253 | url += 'data/' + uuid 254 | curl_args = ['curl', '--max-time', str(max_time), '--fail', '-o', file_name, url] 255 | return subprocess.check_call(curl_args) 256 | 257 | def get_program(project): 258 | '''Return the program name of a project.''' 259 | query = GDCQuery('projects') 260 | query.add_eq_filter('project_id', project) 261 | query.add_fields('program.name') 262 | projects = query.get() 263 | 264 | # Sanity check 265 | if len(projects) > 1: 266 | raise ValueError("More than one project matched '" + project + "'") 267 | elif len(projects) == 0: 268 | raise ValueError("No project matched '" + project + "'") 269 | 270 | return projects[0]['program']['name'] 271 | 272 | def get_programs(projects=None): 273 | '''Return list of programs that have data EXPOSED in GDC. Note that this 274 | may be different from the set of programs that have SUBMITTED data to 275 | the GDC, because (a) it takes time to validate submissions before GDC 276 | will make them public, and (b) GDC does only periodic data releases. 
277 | An optional projects parameter can be passed to prune the list of 278 | programs to those that have submitted data to the specified project(s) 279 | ''' 280 | 281 | if projects: 282 | projects = list(set(projects) & set(get_projects())) 283 | else: 284 | projects = get_projects() 285 | programs = [ proj.split('-')[0] for proj in projects] 286 | return list(set(programs)) 287 | 288 | # Module helpers 289 | def _log_warnings(r_json, r_url): 290 | '''Check for warnings in a server response''' 291 | warnings = r_json.get('warnings', None) 292 | if warnings: 293 | warnmsg = "GDC query produced a warning:\n" 294 | warnmsg += json.dumps(warnings, indent=2) 295 | warnmsg += "\nRequest URL: " + r_url 296 | logging.warning(warnmsg) 297 | 298 | def _eq_filter(field, value): 299 | return {"op" : "=", "content" : {"field" : field, "value" : [value]}} 300 | 301 | def _neq_filter(field, value): 302 | return {"op" : "!=", "content" : {"field" : field, "value" : [value]}} 303 | 304 | def _and_filter(filters): 305 | return {"op" : "and", "content" : filters} 306 | 307 | def _in_filter(field, values): 308 | return {"op" : "in", "content" : {"field": field, "value": values} } 309 | 310 | def _decode_json(request): 311 | """ Attempt to decode response from request using the .json() method. 312 | 313 | If one cannot be decoded, raise a more useful error than the default by 314 | printing the text content, rather than just raising a ValueError""" 315 | try: 316 | return request.json() 317 | except ValueError: 318 | emsg = "No JSON object could be decoded from response. Content:\n" 319 | emsg += request.text 320 | raise ValueError(emsg) 321 | 322 | def set_legacy(legacy=False): 323 | global __legacy 324 | previous_value = __legacy 325 | __legacy = True if legacy else False 326 | return previous_value 327 | 328 | def get_legacy(): 329 | return __legacy 330 | 331 | def set_verbosity(verbosity): 332 | global __verbosity 333 | previous_value = __verbosity 334 | try: 335 | __verbosity = int(verbosity) 336 | except Exception: 337 | pass # simply keep previous value 338 | return previous_value 339 | 340 | def get_verbosity(): 341 | return __verbosity 342 | -------------------------------------------------------------------------------- /gdctools/lib/clinxml.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import xml.etree.ElementTree as ET 4 | import sys 5 | import codecs 6 | 7 | def path_iter(elem, prefix=""): 8 | '''Iterates over nodes in an Element Tree with full node paths''' 9 | 10 | #Some fields may be repeated by specifying a sequence number 11 | # Use this dictionary to keep track of the last sequence number assigned to each unique path 12 | path_sequences = dict() 13 | 14 | for child in elem: 15 | tag = child.tag.split("}")[1] # Split off the xmlns info --> ${xmlns:abc}tag_name 16 | child_path = (prefix + "." + tag) if prefix != "" else tag 17 | 18 | #If this is a member of a sequence, append the sequence number. 
"1" is omitted 19 | if child_path in path_sequences: 20 | seq = str(path_sequences[child_path] + 1) 21 | path_sequences[child_path] += 1 22 | else: 23 | seq = child.attrib.get("sequence", "1") 24 | path_sequences[child_path] = int(seq) 25 | 26 | if seq != "1": child_path += "-" + seq 27 | 28 | if len(list(child)) == 0: 29 | yield child_path.lower(), parse_element_value(child.text) 30 | for nested_nodes in path_iter(child, child_path): 31 | yield nested_nodes 32 | 33 | def parse_element_value(s): 34 | '''Converts a element node's text value into valid data. 35 | None/whitespace --> NA 36 | String --> stripped string, in lowercase 37 | ''' 38 | return "NA" if s == None or s.strip() == '' else s.lower() 39 | 40 | def parse_clinical_xml(xmlfile, outtsv): 41 | """Parses the clinical xml file and outputs node values in two column tsv file.""" 42 | tree = ET.parse(xmlfile) 43 | root = tree.getroot() 44 | with codecs.open(outtsv, 'w', 'utf-8') as f: 45 | f.write("node_name\tnode_value\n") 46 | for tup in path_iter(root): 47 | f.write("\t".join(tup) + "\n") 48 | 49 | if __name__ == '__main__': 50 | if len(sys.argv) != 3: 51 | print("Usage: clin_xml_parser ") 52 | sys.exit(1) 53 | parse_clinical_xml(sys.argv[1], sys.argv[2]) 54 | 55 | -------------------------------------------------------------------------------- /gdctools/lib/common.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import time 4 | import os 5 | import fnmatch 6 | import csv 7 | import errno 8 | import logging 9 | import re 10 | import sys 11 | import contextlib 12 | from argparse import RawDescriptionHelpFormatter, SUPPRESS, OPTIONAL, ZERO_OR_MORE 13 | from fasteners import InterProcessLock 14 | 15 | # Helpful constants 16 | DATESTAMP_REGEX = re.compile("^\d{4}_[01]\d_[0-3]\d$") 17 | 18 | #TODO: Configurable? 19 | REPORT_DATA_TYPES = ('BCR', 'Clinical', 'CN', 'mRNA', 'miR', 'MAF', 'Methylation') 20 | 21 | 22 | ANNOT_TO_DATATYPE = { 23 | 'clinical__primary' : 'Clinical', 24 | 'clinical__biospecimen' : 'BCR', 25 | 'CNV__unfiltered__snp6' : 'CN', 26 | 'CNV__snp6' : 'CN', 27 | 'CNV__wxs' : 'CN', 28 | 'methylation__HM27' : 'Methylation', 29 | 'methylation__HM450' : 'Methylation', 30 | 'methylation__EPIC' : 'Methylation', 31 | 'miR__geneExp' : 'miR', 32 | 'miR__isoformExp' : 'miR', 33 | 'mRNA__geneExp__FPKM' : 'mRNA', 34 | 'mRNA__geneExpNormed__FPKM' : 'mRNA', 35 | 'mRNA__counts__FPKM' : 'mRNA', 36 | 'SNV__mutect' : 'MAF', 37 | 'SNV__varscan' : 'MAF' 38 | } 39 | 40 | __PY3__ = sys.version_info > (3,) 41 | if __PY3__: 42 | def safe_open(file, *args, **kwargs): 43 | # Used to interpret newlines correctly: since CSV etc modules etc do 44 | # their own/universal newline handling, it's safe to specify newline='' 45 | kwargs['newline'] = '' 46 | return open(file, *args, **kwargs) 47 | else: 48 | safe_open = open 49 | 50 | def silent_rm(filename): 51 | try: 52 | os.remove(filename) 53 | except OSError as e: 54 | #ENOENT means file doesn't exist, ignore 55 | if e.errno != errno.ENOENT: 56 | raise 57 | 58 | def datestamp(timetuple=time.localtime()): 59 | '''Takes a time-tuple and converts it to the standard GDAC datestamp 60 | (YYYY_MM_DD). No argument will generate current date''' 61 | return time.strftime('%Y_%m_%d', timetuple) 62 | 63 | def increment_file(filepath): 64 | '''Returns filepath if filepath doesn't exist. Otherwise returns 65 | .. e.g. 
if only one file matches filepath*, 66 | filepath.2 is returned; two files: filepath.3, etc.''' 67 | if os.path.exists(filepath): 68 | dirname, filename = os.path.split(filepath) 69 | count = sum((1 for _ in fnmatch.filter(os.listdir(dirname), filename + '*')), 1) 70 | filepath = '.'.join((filepath, str(count))) 71 | return filepath 72 | 73 | def immediate_subdirs(path): 74 | subdirs = [d for d in os.listdir(path) 75 | if os.path.isdir(os.path.join(path, d))] 76 | return sorted(subdirs) 77 | 78 | def safeMakeDirs(dir_name, permissions=None): 79 | """ 80 | Makes directory structure, or ends gracefully if directory already exists. 81 | If permissions passed, then honor them, however os.makedirs ignores the 82 | sticky bit. Use changeMod if this matters. 83 | """ 84 | try: 85 | if permissions is None: 86 | os.makedirs(dir_name) 87 | else: 88 | # Current process umask affects mode (mode & ~umask & 0777) so set to 0 89 | curUmask = os.umask(0) 90 | os.makedirs(dir_name, permissions) 91 | os.umask(curUmask) 92 | except OSError as value: 93 | error_num = value.errno 94 | # what is 183? don't know... came from legacy code. 95 | if error_num==errno.EEXIST or error_num==183 or error_num==17: 96 | pass # Directory already existed 97 | else: 98 | raise # Reraise other errors 99 | 100 | def safe_make_hardlink(input_file_path,output_file_path): 101 | output_file_dir = os.path.dirname(output_file_path) 102 | # Verify the input file is actually there 103 | if not os.path.exists(input_file_path): 104 | raise Exception("can't find file %s"%input_file_path) 105 | safeMakeDirs(output_file_dir) 106 | try: 107 | os.link(input_file_path,output_file_path) 108 | except OSError as err: 109 | if err.errno == errno.EEXIST: 110 | # link already exists, check that it is identical to the one we are trying to put down 111 | if not os.path.samefile(input_file_path,output_file_path): 112 | raise Exception('Existing file %s is different than the new hardlink %s' % (input_file_path, output_file_path)) 113 | else: 114 | msg = '%s\n' % err 115 | msg += 'Input file: %s\n' % input_file_path 116 | msg += 'Output file: %s\n' % output_file_path 117 | raise Exception(msg) 118 | 119 | def getTabFileHeader(filepath): 120 | '''Return the column names of a tsv as a list''' 121 | with open(filepath) as f: 122 | header = f.readline() 123 | if header: 124 | header = header.strip().split('\t') 125 | return header 126 | 127 | def map_blank_to_na(csvfile): 128 | """ 129 | Convert all blank csv fields to 'NA'. 130 | 131 | Yield the csv header, 132 | and then yield each csv row with 133 | all blank fields replaced by NAs. 134 | """ 135 | yield next(csvfile) 136 | for row in csvfile: 137 | yield map(lambda f: f if f != '' else 'NA', row) 138 | 139 | def rearrange_columns(csvfile, col_order): 140 | """ 141 | csvfile : iterable of list 142 | col_order : list of int 143 | E.g.: col_order = [0, 2, 3, 1] will cause column 1 of the input to be column 3 of the output. 144 | """ 145 | min_expected = max(col_order) + 1 146 | for row in csvfile: 147 | if len(row) < min_expected: 148 | raise ValueError('Unexpected number of columns, expected at least %d but found %d' % (min_expected, len(row))) 149 | new_row = [row[i] for i in col_order] 150 | yield new_row 151 | 152 | def writeCsvFile(filename, data): 153 | """ 154 | Write a row iterator's data to a csv file. 
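    A minimal, hypothetical example:

        writeCsvFile('counts.tsv', [['gene', 'count'], ['TP53', '42']])

    writes two tab-delimited rows using the 'excel-tab' dialect.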
155 | """ 156 | with safe_open(filename, "w") as f: 157 | csvfile = csv.writer(f, dialect='excel-tab', lineterminator='\n') 158 | csvfile.writerows(data) 159 | 160 | @contextlib.contextmanager 161 | def lock_context(path, name="gdctool"): 162 | '''Process level lock context, to prevent access to path by other processes 163 | 164 | Sample Usage: 165 | with lock_context(dice_root, "dicer"): 166 | dice() 167 | 168 | ''' 169 | lockname = os.path.join(path, ".".join(["", name, "lock"])) 170 | lock = InterProcessLock(lockname) 171 | logging.info("Attempting to acquire lock: " + lockname) 172 | with lock: 173 | logging.info("Lock acquired.") 174 | yield 175 | logging.info("Releasing lock: " + lockname) 176 | -------------------------------------------------------------------------------- /gdctools/lib/convert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/gdctools/ce388330c73152a5fa6e1e005c5b50997f29e5f6/gdctools/lib/convert/__init__.py -------------------------------------------------------------------------------- /gdctools/lib/convert/copy.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | 3 | from ..meta import tcga_id, diced_file_paths 4 | from ..common import safeMakeDirs 5 | # Copy from mirror to dice dir 6 | 7 | def process(file_dict, mirror_path, dice_path): 8 | _tcga_id = tcga_id(file_dict) 9 | filepath = diced_file_paths(dice_path, file_dict)[0] 10 | safeMakeDirs(dice_path) 11 | 12 | # copy to new name in 13 | shutil.copy(mirror_path, filepath) 14 | -------------------------------------------------------------------------------- /gdctools/lib/convert/maf.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import logging 3 | import gzip 4 | import os 5 | import sys 6 | 7 | from .. import meta 8 | from ..common import safeMakeDirs, safe_open 9 | 10 | _TUMOR_SAMPLE_COLNAME_LC = 'Tumor_Sample_Barcode' 11 | _TUMOR_SAMPLE_COLNAME_UC = 'TUMOR_SAMPLE_ID' 12 | _DEFAULT_SAMPLE_INDEX = 15 13 | 14 | # Sample barcode pattern to handle various forms found in MAFs (i.e. LUAD-35-5375-Tumor, 15 | # LUAD-35-3615-D-Tumor, LUAD-44-2656_DN-Tumor, TCGA-E2-A154-01A-11D-A10Y-09) There are 16 | # two capture groups: TSS ([0-9A-Za-z]{2}) and the Participant ([0-9A-Za-z]{4}). These are 17 | # used to form the standard TCGA individual id (TCGA---01) we place in 18 | # our SDRFs and processed MAFs. 19 | # _INVALID_PATTERN = re.compile("^[A-Z]+-([0-9A-Za-z]{2})-([0-9A-Za-z]{4})(-|_).+$") 20 | 21 | # Remove these columns from the MAF if found 22 | # _COLUMNS_TO_REMOVE = ['patient_name', 'patient'] 23 | 24 | def process(file_dict, mafFile, outdir, is_compressed=True): 25 | safeMakeDirs(outdir) 26 | logging.info("Processing MAF %s...", mafFile) 27 | # First unzip the maf File to the outdir 28 | if is_compressed: 29 | tmpMAF = file_dict['file_id'] + ".maf.txt" 30 | tmpMAF = os.path.join(outdir, tmpMAF) 31 | with safe_open(tmpMAF, 'w') as mafout, gzip.open(mafFile, 'rt') as cmaf: 32 | mafout.write(cmaf.read()) 33 | mafFile = tmpMAF 34 | tumor_samples = meta.samples(file_dict, tumor_only=True) 35 | 36 | # Get all aliquot ids 37 | sample_ids = meta.aliquot_ids(tumor_samples) 38 | 39 | tcgaSampleIdToMafLinesMap = map_sample_ids_to_MAF_lines(mafFile, sample_ids) 40 | 41 | maf_uuid = file_dict['file_id'] 42 | 43 | for sample_id in tcgaSampleIdToMafLinesMap: 44 | # TODO: Insert maf center into filename? 
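        # For reference (hypothetical IDs): an aliquot barcode such as
        # TCGA-E2-A154-01A-11D-A10Y-09, taken from a MAF whose uuid is <uuid>,
        # is written out as TCGA-E2-A154-01A-11D-A10Y-09.<uuid>.maf.txt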
45 | sample_maf_filename = ".".join([sample_id, maf_uuid, "maf.txt"]) 46 | logging.info("Writing sample MAF: " + sample_maf_filename) 47 | sample_maf_filename = os.path.join(outdir, sample_maf_filename) 48 | with safe_open(sample_maf_filename, 'w') as smf: 49 | outwriter = csv.writer(smf, delimiter='\t') 50 | outwriter.writerows(tcgaSampleIdToMafLinesMap[sample_id]) 51 | 52 | 53 | #=============================================================================== 54 | # Extract unique samples from MAF and reformat 55 | #=============================================================================== 56 | # def processMAF(mafFilename, extension, hyb2tcga, outdir, ref_dir, test_short, tmp_dir_root, debug_max_count): 57 | # logging.info("Processing MAF %s...", mafFilename) 58 | # 59 | # # Map each MAF line to an updated sample barcode 60 | # logging.info("generate new sample barcode to maf lines map") 61 | # tcgaSampleIdToMafLinesMap = generateTcgaSampleIdToMafLinesMap(mafFilename) 62 | # logging.info("Done generating new sample barcode to maf lines map") 63 | # 64 | # # Create new MAF files (one sample per MAF) in temp space 65 | # mafTmpPaths = [] 66 | # tmpdir = tempfile.mkdtemp(prefix='split_maf_',dir=tmp_dir_root) 67 | # try: 68 | # for index, tcgaSampleId in enumerate(tcgaSampleIdToMafLinesMap.keys()): 69 | # if test_short and index >= debug_max_count: 70 | # break 71 | # mafTmpFilename = '.'.join([tcgaSampleId,extension,'txt']) 72 | # mafTmpPath = os.path.join(tmpdir,mafTmpFilename) 73 | # mafTmpPaths.append(mafTmpPath) 74 | # 75 | # outfid = open(mafTmpPath,'w') 76 | # outwriter = csv.writer(outfid, dialect='excel-tab',lineterminator='\n') 77 | # outwriter.writerows(tcgaSampleIdToMafLinesMap[tcgaSampleId]) 78 | # outfid.close() 79 | # 80 | # # Copy MAF files created in temp space to final output directory 81 | # ioUtilities.safeMakeDirs(outdir) 82 | # for index, mafTmpPath in enumerate(mafTmpPaths): 83 | # if test_short and index >= debug_max_count: 84 | # break 85 | # mafFilename = os.path.basename(mafTmpPath) 86 | # mafPath = os.path.join(outdir,mafFilename) 87 | # shutil.copy(mafTmpPath, mafPath) 88 | # 89 | # # Create empty (except for header) MAFs for those samples with no mutations. 90 | # # TODO DICARA hyb2tcga may contain malformatted TCGA ids 91 | # # PERHAPS RUN THROUGH ALL HYB2TCGA VALUES AND IF DON"T MATCH PATTERN THEN THROW OUT 92 | # tcgaIdSet = set(tcgaSampleIdToMafLinesMap.keys()) 93 | # allSamples = set(hyb2tcga.values()) 94 | # samplesWithNoMutations = allSamples.difference(tcgaIdSet) 95 | # missingSamples = tcgaIdSet.difference(allSamples) 96 | # 97 | # for sampleBarcode in missingSamples: 98 | # logging.warning("Skipping - processed sample not found in hyb2tcga map: %s", sampleBarcode) 99 | # 100 | # blank_maf_filepath = os.path.join(ref_dir,'maf_blank_header.txt') 101 | # for sampleBarcode in samplesWithNoMutations: 102 | # logging.warning("Mutations missing for sample - creating blank maf for: %s", sampleBarcode) 103 | # filename = '.'.join([sampleBarcode,extension,'txt']) 104 | # filepath = os.path.join(outdir,filename) 105 | # shutil.copy(blank_maf_filepath,filepath) 106 | # 107 | # logging.info("Finished processing MAF %s...", mafFilename) 108 | # finally: 109 | # # Clean up 110 | # shutil.rmtree(tmpdir) 111 | 112 | #=============================================================================== 113 | # Return a map of tcga sample id to all corresponding MAF lines. 114 | # If the TCGA barcode is valid, leave it alone. Otherwise, reformat the barcode 115 | # (i.e. 
LUAD-44-2657-Tumor) to a standard TCGA sample id 116 | # (i.e. TCGA-44-2657-01) and replace all occurrences of TSS and 117 | # Participant (i.e. -44-2657) in all fields with this reformatted barcode. 118 | #=============================================================================== 119 | def map_sample_ids_to_MAF_lines(mafFilename, sample_ids): 120 | ''' Return a dictionary whose keys are TCGA sample ids, and whose 121 | values are the lines in the MAF for that sample. Also reformats the barcode 122 | if necessary to match a common format 123 | ''' 124 | 125 | # Prevent choking on abberrant files with enormous (and likely wrong) mutations 126 | original_field_size_limit = csv.field_size_limit(sys.maxsize) 127 | 128 | # Open MAF file for reading 129 | mafFile = open(mafFilename) 130 | mafReader = csv.reader(mafFile,dialect='excel-tab') 131 | header = next(mafReader) 132 | 133 | # Ignore leading comments (i.e. #version 2.2) in MAF file 134 | while header[0].startswith('#'): 135 | header = next(mafReader) 136 | 137 | ### TODO: reintroduce column removal later... 138 | # # Determine indices of unwanted columns for removal 139 | # columnIndicesToRemove = list() 140 | # for columnName in _COLUMNS_TO_REMOVE: 141 | # if columnName in header: 142 | # columnIndicesToRemove.append(header.index(columnName)) 143 | # 144 | # # Remove in-place, so need to pop off elements from last to first 145 | # columnIndicesToRemove.sort(reverse=True) 146 | # 147 | # # Remove unwanted columns from header 148 | # _removeColumns(header, columnIndicesToRemove) 149 | 150 | # Determine index of tumor sample barcode column 151 | sampleIndex = _DEFAULT_SAMPLE_INDEX 152 | if _TUMOR_SAMPLE_COLNAME_LC in header: 153 | sampleIndex = header.index(_TUMOR_SAMPLE_COLNAME_LC) 154 | elif _TUMOR_SAMPLE_COLNAME_UC in header: 155 | sampleIndex = header.index(_TUMOR_SAMPLE_COLNAME_UC) 156 | 157 | unmatched_sample_barcodes = set() 158 | # Initialize entry for each possible sample id. This list of sample ids 159 | # comes from the GDC metadata, so every row should map to one of these ids 160 | # Also include header here, since it is easier than adding it later, plus 161 | # gives the benefit of requiring each sample to have a non-zero number of 162 | # lines 163 | tcgaSampleIdToMafLinesMap = {s:[header] for s in sample_ids} 164 | 165 | lineno = 0 166 | for line in mafReader: 167 | lineno += 1 168 | # Skip blank and commented out lines 169 | if line == [] or line[0].startswith('#'): 170 | continue 171 | 172 | # Filter abberrant genes/lines with enormous (and likely wrong) mutations 173 | sequence_length = len(line[10]) 174 | if sequence_length >= original_field_size_limit: 175 | logging.warning('Omitting gene %s mutation (line %d): nucleotide ' \ 176 | 'sequence is very long (%d) and probably incorrectly called' % \ 177 | (line[0], lineno, sequence_length)) 178 | continue 179 | 180 | # tcgaSampleId is the 15 digit barcode containing the sample type 181 | # (i.e. TCGA-44-2657-01). The sampleBarcode is a generic name for 182 | # whatever barcode is in the MAF - it may range from 12 to 28 183 | # characters and may even be invalid. That is what we're 184 | # attempting to standardize here. 
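        # (Note: in the current flow the candidate IDs come from GDC metadata
        #  as full aliquot barcodes, e.g. TCGA-E2-A154-01A-11D-A10Y-09, and a
        #  MAF row is kept only on an exact barcode match; the reformatting
        #  logic below is retained, commented out, for reference only.)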
185 | # tcgaSampleId = None 186 | # participant = None 187 | # valid = None 188 | sampleBarcode = line[sampleIndex] 189 | if sampleBarcode not in tcgaSampleIdToMafLinesMap: 190 | # Not good, the GDC metadata does not match the sample id 191 | unmatched_sample_barcodes.add(sampleBarcode) 192 | continue 193 | else: 194 | # Good the line matches a sample, add the line to the map 195 | tcgaSampleIdToMafLinesMap[sampleBarcode].append(line) 196 | # elif sampleBarcode in sampleBarcodeToSampleInfoMap: 197 | # tcgaSampleId = sampleBarcodeToSampleInfoMap[sampleBarcode][0] 198 | # participant = sampleBarcodeToSampleInfoMap[sampleBarcode][1] 199 | # valid = sampleBarcodeToSampleInfoMap[sampleBarcode][2] 200 | # else: 201 | # # Try and match TCGA barcode with a valid format 202 | # for pattern in VALID_BARCODE_PATTERNS: 203 | # matchObject = pattern.match(sampleBarcode) 204 | # if matchObject is not None: 205 | # participant = "-%s-%s" % (matchObject.group(1), matchObject.group(2)) 206 | # # If the id contains the sample type, use it. Otherwise, assume it's a primary tumor (-01). 207 | # if matchObject.lastindex > 2: 208 | # tcgaSampleId = sampleBarcode[:15] 209 | # else: 210 | # tcgaSampleId = 'TCGA%s-01' % (participant) 211 | # valid = True 212 | # sampleBarcodeToSampleInfoMap[sampleBarcode] = [tcgaSampleId, participant, valid] 213 | # break 214 | # 215 | # # Special case to handle invalid, yet recognisable TCGA barcodes 216 | # if tcgaSampleId is None: 217 | # # Sample barcode pattern to handle various forms found in MAFs (i.e. LUAD-35-5375-Tumor, 218 | # # LUAD-35-3615-D-Tumor, LUAD-44-2656_DN-Tumor, TCGA-E2-A154-01A-11D-A10Y-09) There are 219 | # # two capture groups: TSS ([0-9A-Za-z]{2}) and the Participant ([0-9A-Za-z]{4}). These are 220 | # # used to form the standard TCGA individual id (TCGA---01) we place in 221 | # # our SDRFs and processed MAFs. 222 | # matchObject = _INVALID_PATTERN.match(sampleBarcode) 223 | # if matchObject is not None: 224 | # participant = "-%s-%s" % (matchObject.group(1), matchObject.group(2)) 225 | # tcgaSampleId = 'TCGA%s-01' % (participant) 226 | # valid = False 227 | # sampleBarcodeToSampleInfoMap[sampleBarcode] = [tcgaSampleId, participant, valid] 228 | # 229 | # if tcgaSampleId: 230 | # # _removeColumns(line, columnIndicesToRemove) 231 | # if len(header) != len(line): 232 | # raise Exception("uneven number of fields in each line of maf file") 233 | # 234 | # # If TCGA barcode is invalid, then fix all instances similarly in the given row. 235 | # if not valid: 236 | # _updateSampleBarcodes(line, participant, tcgaSampleId) 237 | # 238 | # # First line in individual MAF should be a header. 
239 | # if tcgaSampleId not in tcgaSampleIdToMafLinesMap: 240 | # tcgaSampleIdToMafLinesMap[tcgaSampleId]=[header] 241 | # 242 | # # Add mutation to individual's MAF 243 | # tcgaSampleIdToMafLinesMap[tcgaSampleId].append(line) 244 | # else: 245 | # unmatchedSamples.add(sampleBarcode) 246 | # logging.warning("Skipping unmatched sample: %s.", sampleBarcode) 247 | if len(unmatched_sample_barcodes) > 0: 248 | logging.warning("Unmatched sample barcodes found in MAF:\n" 249 | + "\n".join(sorted(unmatched_sample_barcodes))) 250 | 251 | # Reset CSV reader buffer size back to original value 252 | csv.field_size_limit(original_field_size_limit) 253 | 254 | mafFile.close() 255 | return tcgaSampleIdToMafLinesMap 256 | 257 | #=============================================================================== 258 | # Remove columns from line - columnIndicesToRemove must be sorted in reverse order 259 | #=============================================================================== 260 | # def _removeColumns(line, columnIndicesToRemove): 261 | # for index in columnIndicesToRemove: 262 | # line.pop(index) 263 | 264 | #=============================================================================== 265 | # Checks every field in a MAF row to see if it contains the given participant 266 | # (e.g. TSS and Participant portion of barcode: -01-2345). If it does, replace 267 | # field with the new TCGA individual id (eg TCGA-01-2345-01). 268 | #=============================================================================== 269 | # def _updateSampleBarcodes(line, participant, tcgaSampleId): 270 | # indicesToUpdate = list() 271 | # for index, field in enumerate(line): 272 | # # If field contains the participant id (i.e. -01-2345), then replace with 273 | # # new TCGA individual id 274 | # if participant in field: 275 | # indicesToUpdate.append(index) 276 | # for index in indicesToUpdate: 277 | # line[index] = tcgaSampleId 278 | 279 | #=============================================================================== 280 | # Run Main 281 | #=============================================================================== 282 | # if __name__=='__main__': 283 | # main(sys.argv[1:]) 284 | -------------------------------------------------------------------------------- /gdctools/lib/convert/py_clinical.py: -------------------------------------------------------------------------------- 1 | from ..meta import diced_file_paths 2 | from ..clinxml import parse_clinical_xml 3 | from ..common import safeMakeDirs 4 | 5 | def process(file_dict, infile, outdir): 6 | # should only produce one file 7 | filepath = diced_file_paths(outdir, file_dict)[0] 8 | safeMakeDirs(outdir) 9 | parse_clinical_xml(infile, filepath) 10 | return filepath 11 | -------------------------------------------------------------------------------- /gdctools/lib/convert/seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import csv 4 | 5 | from ..common import safeMakeDirs, writeCsvFile 6 | from .. 
import meta 7 | 8 | 9 | def process(file_dict, infile, outdir, platform=None): 10 | # Should only produce one outfile 11 | outfile = meta.diced_file_paths(outdir, file_dict)[0] 12 | hyb_id = file_dict['file_name'].split('.',1)[0] 13 | tcga_id = meta.aliquot_id(file_dict) 14 | 15 | rawfile = open(infile, 'r') 16 | csvfile = csv.DictReader(rawfile, dialect='excel-tab') 17 | converter = find_converter(csvfile, platform) 18 | 19 | seg_file_data = generate_seg_file(csvfile, converter, tcga_id, hyb_id) 20 | 21 | safeMakeDirs(outdir) 22 | writeCsvFile(outfile, seg_file_data) 23 | 24 | rawfile.close() 25 | return outfile 26 | 27 | def find_converter(segfile, platform=None): 28 | """ 29 | Determine what to invoke for reading seg file, either by: 30 | - looking for function with given platform name 31 | - OR inspecting header of open seg file 32 | """ 33 | 34 | if platform: 35 | global_vars = globals() 36 | if platform in global_vars: 37 | return global_vars[platform] 38 | else: 39 | raise Exception('Unsupported seg file platform: %s' % platform) 40 | elif segfile.fieldnames[0] == 'GDC_Aliquot': 41 | return seg_gdc 42 | else: 43 | return seg_broad 44 | 45 | def generate_seg_file(csvdict, converter, tcga_id, hyb_id): 46 | yield ['Sample', 'Chromosome', 'Start', 'End', 'Num_Probes', 'Segment_Mean'] 47 | for row in csvdict: 48 | new_row = converter(row, tcga_id, hyb_id) 49 | if new_row: 50 | yield new_row 51 | 52 | def seg_hudsonalpha(row, tcga_id, hyb_id): 53 | if row['Normalization Name'] != hyb_id: 54 | return None 55 | 56 | Sample = tcga_id 57 | Chromosome = fix_chromosome(row['chrom']) 58 | Start = row['loc.start'] 59 | End = row['loc.end'] 60 | Num_Probes = 'NA' 61 | Segment_Mean = row['mean'] 62 | 63 | return [Sample, Chromosome, Start, End, Num_Probes, Segment_Mean] 64 | 65 | def seg_mskcc(row, tcga_id, hyb_id): 66 | if row['sample'] != hyb_id: 67 | return None 68 | 69 | Sample = tcga_id 70 | Chromosome = fix_chromosome(row['chrom']) 71 | Start = row['loc.start'] 72 | End = row['loc.end'] 73 | Num_Probes = row['num.mark'] 74 | Segment_Mean = row['seg.mean'] 75 | 76 | return [Sample, Chromosome, Start, End, Num_Probes, Segment_Mean] 77 | 78 | def seg_mskcc2(row, tcga_id, hyb_id): 79 | Sample = tcga_id 80 | Chromosome = fix_chromosome(row['chrom']) 81 | Start = row['loc.start'] 82 | End = row['loc.end'] 83 | Num_Probes = row['num.mark'] 84 | Segment_Mean = row['seg.mean'] 85 | 86 | return [Sample, Chromosome, Start, End, Num_Probes, Segment_Mean] 87 | 88 | def seg_broad(row, tcga_id, hyb_id): 89 | if row['Sample'] != hyb_id: 90 | raise Exception('unexpected hybridization id mismatch... 
expected %s, found %s in file' % (hyb_id, row['Sample'])) 91 | 92 | Sample = tcga_id 93 | Chromosome = fix_chromosome(row['Chromosome']) 94 | Start = row['Start'] 95 | End = row['End'] 96 | Num_Probes = row['Num_Probes'] 97 | Segment_Mean = row['Segment_Mean'] 98 | 99 | return [Sample, Chromosome, Start, End, Num_Probes, Segment_Mean] 100 | 101 | def seg_gdc(row, tcga_id, hyb_id): 102 | Sample = tcga_id 103 | Chromosome = fix_chromosome(row['Chromosome']) 104 | Start = row['Start'] 105 | End = row['End'] 106 | Num_Probes = row['Num_Probes'] 107 | Segment_Mean = row['Segment_Mean'] 108 | 109 | return [Sample, Chromosome, Start, End, Num_Probes, Segment_Mean] 110 | 111 | def seg_harvard(row, tcga_id, hyb_id): 112 | Sample = tcga_id 113 | Chromosome = fix_chromosome(row['Chromosome']) 114 | Start = row['Start'] 115 | End = row['End'] 116 | Num_Probes = row['Probe_Number'] 117 | Segment_Mean = row['Segment_Mean'] 118 | 119 | return [Sample, Chromosome, Start, End, Num_Probes, Segment_Mean] 120 | 121 | def seg_harvardlowpass(row, tcga_id, hyb_id): 122 | Sample = tcga_id 123 | Chromosome = fix_chromosome(row['Chromosome']) 124 | Start = row['Start'] 125 | End = row['End'] 126 | Num_Probes = 'NA' 127 | Segment_Mean = row['Segment_Mean'] 128 | 129 | return [Sample, Chromosome, Start, End, Num_Probes, Segment_Mean] 130 | 131 | def seg_wxs_washu(row, tcga_id, hyb_id): 132 | Sample = tcga_id 133 | Chromosome = fix_chromosome(row['Chromosome']) 134 | Start = row['Start'] 135 | End = row['End'] 136 | Num_Probes = row['Num_Probes'] 137 | Segment_Mean = row['Segment_Mean'] 138 | 139 | return [Sample, Chromosome, Start, End, Num_Probes, Segment_Mean] 140 | 141 | def fix_chromosome(chrom): 142 | chrom = chrom.lower() 143 | chrom = chrom.lstrip("chr") 144 | 145 | if chrom.isdigit(): 146 | chrom_out = chrom 147 | elif chrom == 'x': 148 | chrom_out = '23' 149 | elif chrom == 'y': 150 | chrom_out = '24' 151 | elif chrom == 'm' or chrom == 'mt': 152 | chrom_out = '25' 153 | elif chrom == 'xy': 154 | chrom_out = '26' 155 | else: 156 | raise Exception('unexpected chromosome value: %s' % chrom) 157 | 158 | return chrom_out 159 | -------------------------------------------------------------------------------- /gdctools/lib/convert/tsv2idtsv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import csv 4 | from ..common import safeMakeDirs, map_blank_to_na, writeCsvFile 5 | from ..meta import tcga_id, diced_file_paths 6 | 7 | def process(file_dict, infile, outdir): 8 | # Should only produce one file 9 | filepath = diced_file_paths(outdir, file_dict)[0] 10 | _tcga_id = tcga_id(file_dict) 11 | rawfile = open(infile, 'r') 12 | csvfile = csv.reader(rawfile, dialect='excel-tab') 13 | 14 | csvfile_with_ids = tsv2idtsv(csvfile, _tcga_id) 15 | csvfile_with_NAs = map_blank_to_na(csvfile_with_ids) 16 | 17 | safeMakeDirs(outdir) 18 | writeCsvFile(filepath, csvfile_with_NAs) 19 | 20 | rawfile.close() 21 | 22 | def tsv2idtsv(csvfile, sampleName): 23 | header = next(csvfile) 24 | yield ['SampleId'] + header 25 | 26 | for row in csvfile: 27 | yield [sampleName] + row 28 | -------------------------------------------------------------------------------- /gdctools/lib/convert/tsv2magetab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import csv 4 | from os.path import basename 5 | 6 | from ..common import safeMakeDirs, getTabFileHeader, map_blank_to_na, writeCsvFile, rearrange_columns 7 | from 
..meta import tcga_id, diced_file_paths 8 | 9 | 10 | def process(file_dict, infile, outdir, fpkm=False, col_order=None, data_cols=None, id_func=tcga_id): 11 | ''' 12 | col_order : list of int 13 | E.g.: col_order = [0, 2, 3, 1] will cause column 1 of the input to be column 3 of the output. 14 | col_order = None leaves column order unchanged. 15 | data_cols : list of int 16 | Columns listed in data_cols get a sample name in the top header row, other 17 | columns are treated as header columns and do not get a sample name. 18 | If data_cols is None, treat column 0 as a header and all others as data columns. 19 | ''' 20 | 21 | 22 | filepath = diced_file_paths(outdir, file_dict)[0] 23 | safeMakeDirs(outdir) 24 | _tcga_id = id_func(file_dict) 25 | 26 | hdr1, hdr2 = generate_headers(infile, _tcga_id, fpkm, data_cols) 27 | 28 | 29 | rawfile = open(infile, 'r') 30 | csvfile = csv.reader(fpkm_reader(rawfile) if fpkm else rawfile, 31 | dialect='excel-tab') 32 | 33 | csvfile_with_hdr = change_header__generator(csvfile, hdr1, hdr2) 34 | csvfile_with_NAs = map_blank_to_na(csvfile_with_hdr) 35 | if col_order is not None: 36 | csvfile_with_new_column_order = rearrange_columns(csvfile_with_NAs, col_order) 37 | else: 38 | csvfile_with_new_column_order = csvfile_with_NAs 39 | 40 | safeMakeDirs(outdir) 41 | writeCsvFile(filepath, csvfile_with_new_column_order) 42 | 43 | rawfile.close() 44 | 45 | def generate_headers(infile, tcga_id, fpkm, data_cols): 46 | old_hdr = fpkm_header(infile).split() if fpkm else getTabFileHeader(infile) 47 | new_hdr = ['Hybridization REF'] 48 | for i in range(1, len(old_hdr)): 49 | if data_cols is None or i in data_cols: 50 | new_hdr += [tcga_id] 51 | else: 52 | new_hdr += [''] 53 | 54 | return new_hdr, old_hdr 55 | 56 | def fpkm_header(filename): 57 | return "gene_id\t" + ("raw_count" if "htseq.counts" in basename(filename) 58 | else "FPKM") + "\n" 59 | 60 | def fpkm_reader(rawfile): 61 | yield fpkm_header(rawfile.name) 62 | for line in rawfile: 63 | yield line 64 | 65 | def change_header__generator(csvfile, header1, header2=None): 66 | """ 67 | Replace a csv header row with a new one (or two). 68 | 69 | Skip the first row of the input csv file; 70 | in its place, yield one (or two) new header(s), 71 | and then yield the remainder of the input file. 72 | """ 73 | yield header1 74 | if header2: 75 | yield header2 76 | 77 | next(csvfile) 78 | for row in csvfile: 79 | yield row 80 | -------------------------------------------------------------------------------- /gdctools/lib/heatmap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | # Front Matter {{{ 5 | ''' 6 | Copyright (c) 2016 The Broad Institute, Inc. All rights are reserved. 
7 | 8 | report.py: Functions for creating reports of Mirrored/Diced Data 9 | 10 | @author: Timothy DeFreitas 11 | @date: 2016_06_06 12 | ''' 13 | 14 | # }}} 15 | import os 16 | import numpy 17 | import subprocess 18 | import logging 19 | 20 | from matplotlib.figure import Figure, Rectangle 21 | from matplotlib.colors import ListedColormap, NoNorm 22 | from matplotlib.backends.backend_agg import FigureCanvasAgg 23 | from matplotlib import font_manager 24 | 25 | from gdctools.lib.common import REPORT_DATA_TYPES 26 | 27 | 28 | def draw_heatmaps(case_data, project, timestamp, diced_meta_dir): 29 | rownames, matrix = _build_heatmap_matrix(case_data) 30 | # def draw_heatmaps(rownames, matrix, cohort, timestamp, outputDir): 31 | if not len(matrix) > 0: 32 | raise ValueError('input matrix must have nonzero length') 33 | if not len(matrix) == len(rownames): 34 | raise ValueError('Number of row names does not match input matrix') 35 | 36 | #Sort heatmaps rows by row count 37 | sorted_rownames, sorted_matrix = __sort_rows(rownames, matrix) 38 | 39 | green = '#338855' 40 | white = '#FFFFFF' 41 | cmap = ListedColormap([white, green]) 42 | fig = Figure(figsize=(24,12)) 43 | ax = fig.add_subplot(111) 44 | ax.set_title("%s: Data Type Breakdown by Participant" % project, weight="black") 45 | ax.set_ylabel("Data Type (Total Sample Count)", weight="black") 46 | ax.set_xlabel("Participant", weight="black") 47 | ax.set_xlim(0, len(sorted_matrix[0])) 48 | ax.set_yticks([0.5 + x for x in range(len(sorted_matrix))]) 49 | 50 | counts = [sum(row) for row in sorted_matrix] 51 | ax.set_yticklabels(["%s (%s)" % (data_type, count) for data_type, count in zip(sorted_rownames, counts)]) 52 | ax.pcolor(numpy.array(sorted_matrix), cmap=cmap, norm=NoNorm(), edgecolor="k") 53 | missing = Rectangle((0, 0), 1, 1, fc=white) 54 | present = Rectangle((0, 0), 1, 1, fc=green) 55 | ax.legend([present, missing], ["Present", "Absent"], loc=1) 56 | 57 | fig.set_size_inches(24,12) 58 | ax.title.set_size("xx-large") 59 | ax.xaxis.label.set_size("xx-large") 60 | ax.yaxis.label.set_size("xx-large") 61 | ax.tick_params(axis="both", labelsize="x-large") 62 | canvas = FigureCanvasAgg(fig) 63 | high_res_filepath = os.path.join(diced_meta_dir, ".".join([project, timestamp,"high_res.heatmap.png"])) 64 | fig.tight_layout() 65 | canvas.print_figure(high_res_filepath) 66 | 67 | fig.set_size_inches(12,6) 68 | ax.title.set_size("medium") 69 | ax.xaxis.label.set_size("small") 70 | ax.yaxis.label.set_size("small") 71 | ax.tick_params(axis="both", labelsize="x-small") 72 | fontProp = font_manager.FontProperties(size=9) 73 | ax.legend([present, missing], ["Present", "Absent"], loc=1, prop=fontProp) 74 | canvas = FigureCanvasAgg(fig) 75 | low_res_filepath = os.path.join(diced_meta_dir, ".".join([project, timestamp, "low_res.heatmap.png"])) 76 | fig.tight_layout() 77 | canvas.print_figure(low_res_filepath) 78 | 79 | def _build_heatmap_matrix(case_data): 80 | '''Build a 2d matrix and rownames from annotations and load dict''' 81 | rownames = REPORT_DATA_TYPES 82 | annot_sample_data = dict() 83 | for case in case_data: 84 | c_dict = case_data[case].case_data 85 | # Flatten case_data[case_id][sample_type] = set(Data types) 86 | # into annot_sample_data[case_id] = set(Data types) 87 | # for simpler heatmap 88 | data_types = {dt for st in c_dict for dt in c_dict[st]} 89 | annot_sample_data[case] = data_types 90 | 91 | matrix = [[] for _ in rownames] 92 | # Now iterate over samples, inserting a 1 if data is presente 93 | for r in range(len(rownames)): 94 | for 
cid in sorted(annot_sample_data.keys()): 95 | # append 1 if data is present, else 0 96 | matrix[r].append( 1 if rownames[r] in annot_sample_data[cid] else 0) 97 | 98 | return rownames, matrix 99 | 100 | def __sort_rows(rownames, matrix): 101 | '''Sort the rows in matrix by the number of values in the row, in ascending order''' 102 | 103 | row_dict = {rownames[i]:matrix[i] for i in range(len(rownames))} 104 | sorted_rows = sorted(row_dict.keys(), key=lambda k: sum(row_dict[k])) 105 | 106 | sorted_mat = [row_dict[row] for row in sorted_rows] 107 | 108 | return sorted_rows, sorted_mat 109 | -------------------------------------------------------------------------------- /gdctools/reference/centerCode.txt: -------------------------------------------------------------------------------- 1 | Code Center Name Center Type Display Name Short Name 2 | 01 broad.mit.edu CGCC Broad Institute of MIT and Harvard BI 3 | 02 hms.harvard.edu CGCC Harvard Medical School HMS 4 | 03 lbl.gov CGCC Lawrence Berkeley National Laboratory LBL 5 | 04 mskcc.org CGCC Memorial Sloan-Kettering Cancer Center MSKCC 6 | 05 jhu-usc.edu CGCC Johns Hopkins / University of Southern California JHU_USC 7 | 06 hudsonalpha.org CGCC HudsonAlpha Institute for Biotechnology HAIB 8 | 07 unc.edu CGCC University of North Carolina UNC 9 | 08 broad.mit.edu GSC Broad Institute of MIT and Harvard BI 10 | 09 genome.wustl.edu GSC Washington University School of Medicine WUSM 11 | 10 hgsc.bcm.edu GSC Baylor College of Medicine BCM 12 | 11 rubicongenomics.com COM Rubicon Genomics RG 13 | 12 hgsc.bcm.edu CGCC Baylor College of Medicine BCM 14 | 13 bcgsc.ca CGCC Canada's Michael Smith Genome Sciences Centre BCGSC 15 | 14 broadinstitute.org GDAC Broad Institute of MIT and Harvard BI 16 | 15 systemsbiology.org GDAC Institute for Systems Biology ISB 17 | 16 lbl.gov GDAC Lawrence Berkely National Laboratory LBL 18 | 17 mskcc.org GDAC Memorial Sloan-Kettering Cancer Center MSKCC 19 | 18 ucsc.edu GDAC University of California, Santa Cruz UCSC 20 | 19 mdanderson.org GDAC MD Anderson MDA 21 | 20 mdanderson.org CGCC MD Anderson - RPPA Core Facility (Proteomics) MDA 22 | 21 genome.wustl.edu CGCC Washington University School of Medicine WUSM 23 | 22 intgen.org CGCC IGC IGC 24 | 23 nationwidechildrens.org CGCC NCH BCR NCH 25 | 24 mdanderson.org CGCC MD Anderson - Pathology/Lab Medicine Hamilton MDA 26 | 25 ucsc.edu GSC University of California, Santa Cruz UCSC 27 | 26 mdanderson.org CGCC MD Anderson - Institute for Applied Cancer Science MDA 28 | 27 vanderbilt.edu CGCC Vanderbilt University Proteomics VUMC 29 | 28 jhu.edu CGCC The Johns Hopkins University Proteomics JHU 30 | 29 pnl.gov CGCC Pacific Northwest National Lab PNNL 31 | 30 genome.wustl.edu CGCC Washington University School of Medicine Proteomics WUSM 32 | 31 bcgsc.ca CGCC Canada's Michael Smith Genome Sciences Centre BCGSC 33 | 32 sanger.ac.uk GSC Wellcome Trust Sanger Institute SANGER 34 | -------------------------------------------------------------------------------- /gdctools/reference/diseaseStudy.txt: -------------------------------------------------------------------------------- 1 | Study Abbreviation Study Name 2 | TCGA-ACC Adrenocortical carcinoma 3 | TCGA-BLCA Bladder Urothelial Carcinoma 4 | TCGA-BRCA Breast invasive carcinoma 5 | TCGA-CESC Cervical squamous cell carcinoma and endocervical adenocarcinoma 6 | TCGA-CHOL Cholangiocarcinoma 7 | TCGA-COAD Colon adenocarcinoma 8 | TCGA-COADREAD Colorectal adenocarcinoma 9 | TCGA-DLBC Lymphoid Neoplasm Diffuse Large B-cell Lymphoma 10 | 
TCGA-ESCA Esophageal carcinoma 11 | TCGA-FPPP FFPE Pilot Phase II 12 | TCGA-GBM Glioblastoma multiforme 13 | TCGA-GBMLGG Glioma 14 | TCGA-HNSC Head and Neck squamous cell carcinoma 15 | TCGA-KICH Kidney Chromophobe 16 | TCGA-KIPAN Pan-kidney cohort (KICH+KIRC+KIRP) 17 | TCGA-KIRC Kidney renal clear cell carcinoma 18 | TCGA-KIRP Kidney renal papillary cell carcinoma 19 | TCGA-LAML Acute Myeloid Leukemia 20 | TCGA-LCML Chronic Myelogenous Leukemia 21 | TCGA-LGG Brain Lower Grade Glioma 22 | TCGA-LIHC Liver hepatocellular carcinoma 23 | TCGA-LUAD Lung adenocarcinoma 24 | TCGA-LUSC Lung squamous cell carcinoma 25 | TCGA-MESO Mesothelioma 26 | TCGA-OV Ovarian serous cystadenocarcinoma 27 | TCGA-PAAD Pancreatic adenocarcinoma 28 | TCGA-PANCAN12 PANCANCER cohort with 12 disease types 29 | TCGA-PANCAN18 PANCANCER cohort with 18 disease types 30 | TCGA-PANCAN8 PANCANCER cohort with 8 initial disease types 31 | TCGA-PANCANCER Complete PANCANCER set 32 | TCGA-PANGI Gastric carcinoma (COADREAD+STES) 33 | TCGA-PCPG Pheochromocytoma and Paraganglioma 34 | TCGA-PRAD Prostate adenocarcinoma 35 | TCGA-READ Rectum adenocarcinoma 36 | TCGA-SARC Sarcoma 37 | TCGA-SKCM Skin Cutaneous Melanoma 38 | TCGA-STAD Stomach adenocarcinoma 39 | TCGA-STES Stomach and Esophageal carcinoma 40 | TCGA-TGCT Testicular Germ Cell Tumors 41 | TCGA-THCA Thyroid carcinoma 42 | TCGA-THYM Thymoma 43 | TCGA-UCEC Uterine Corpus Endometrial Carcinoma 44 | TCGA-UCS Uterine Carcinosarcoma 45 | TCGA-UVM Uveal Melanoma 46 | CPTAC3-SAR Sarcoma 47 | CPTAC3-PDA Pancreatic Ductal Adenocarcinoma 48 | CPTAC3-HNSCC Head and Neck Squamous Cell Carcinoma 49 | CPTAC3-CM Cutaneous Melanoma 50 | CPTAC3-LSCC Lung Squamous Cell Carcinoma 51 | CPTAC3-LUAD Lung Adenocarcinoma 52 | CPTAC3-UCEC Uterine Corpus Endometrial Carcinoma 53 | CPTAC3-CCRCC Clear Cell Renal Cell Carcinoma 54 | CPTAC3-AML Acute Myeloid Leukemia 55 | CPTAC3-GBM Glioblastoma Multiforme 56 | -------------------------------------------------------------------------------- /gdctools/reference/platformCode.txt: -------------------------------------------------------------------------------- 1 | Platform Code Platform Alias Platform Name Available 2 | HT_HG-U133A HT_HG-U133A Affymetrix HT Human Genome U133 Array Plate Set Yes 3 | HuEx-1_0-st-v2 HuEx-1_0-st-v2 Affymetrix Human Exon 1.0 ST Array Yes 4 | Genome_Wide_SNP_6 Genome_Wide_SNP_6 Affymetrix Genome-Wide Human SNP Array 6.0 Yes 5 | HG-CGH-415K_G4124A HG-CGH-415K_G4124A Agilent Human Genome CGH Custom Microarray 2x415K Yes 6 | WHG-CGH_4x44B WHG-CGH_4x44B Agilent Human Genome CGH Microarray 44K No 7 | HG-CGH-244A HG-CGH-244A Agilent Human Genome CGH Microarray 244A Yes 8 | WHG-1x44K_G4112A 1 x 44K Agilent Whole Human Genome No 9 | WHG-4x44K_G4112F 4 x 44K Agilent Whole Human Genome Microarray Kit No 10 | AgilentG4502A_07_1 AgilentG4502A_07 Agilent 244K Custom Gene Expression G4502A-07-1 Yes 11 | H-miRNA_G4470A H-miRNA_G4470A Agilent Human miRNA Microarray No 12 | AgilentG4502A_07_2 AgilentG4502A_07 Agilent 244K Custom Gene Expression G4502A-07-2 Yes 13 | H-miRNA_8x15Kv2 H-miRNA_8x15K Agilent Human miRNA Microarray Rel12.0 Yes 14 | AgilentG4502A_07_3 AgilentG4502A_07 Agilent 244K Custom Gene Expression G4502A-07-3 Yes 15 | H-miRNA_8x15K H-miRNA_8x15K Agilent 8 x 15K Human miRNA-specific microarray Yes 16 | CGH-1x1M_G4447A CGH-1x1M_G4447A Agilent SurePrint G3 Human CGH Microarray Kit 1x1M Yes 17 | H-miRNA_EarlyAccess H-miRNA_EarlyAccess Agilent Human miRNA Early Access Array No 18 | IlluminaGG IlluminaGG Illumina GoldenGate No 19 | 
HumanMethylation27 HumanMethylation27 Illumina Infinium Human DNA Methylation 27 Yes 20 | IlluminaDNAMethylation_OMA003_CPI IlluminaDNAMethylation Illumina DNA Methylation OMA003 Cancer Panel I Yes 21 | Human1MDuo Human1MDuo Illumina Human1M-Duo BeadChip Yes 22 | IlluminaDNAMethylation_OMA002_CPI IlluminaDNAMethylation Illumina DNA Methylation OMA002 Cancer Panel I Yes 23 | HumanHap550 HumanHap550 Illumina 550K Infinium HumanHap550 SNP Chip Yes 24 | AgilentG4502A_07 AgilentG4502A_07 Agilent 244K Custom Gene Expression G4502A-07 Yes 25 | bio bio Biospecimen Metadata - Complete Set No 26 | biotab biotab Biospecimen Metadata - Complete Set - All Samples - Tab-delimited No 27 | minbio minbio Biospecimen Metadata - Minimal Set No 28 | minbiotab minbiotab Biospecimen Metadata - Minimal Set - All Samples - Tab-delimited No 29 | ABI ABI Applied Biosystems Sequence data Yes 30 | IlluminaHiSeq_DNASeq Mutation Calling Illumina HiSeq 2000 DNA Sequencing Yes 31 | SOLiD_DNASeq Mutation Calling ABI SOLiD DNA Sequencing Yes 32 | IlluminaGA_DNASeq Mutation Calling Illumina Genome Analyzer DNA Sequencing Yes 33 | IlluminaGA_mRNA_DGE IlluminaGA_mRNA_DGE Illumina Genome Analyzer mRNA Digital Gene Expression Yes 34 | 454 454 454 Life Sciences Genome Sequence data No 35 | HG-U133A_2 HG-U133A_2 Affymetrix Human Genome U133A 2.0 Array No 36 | HG-U133_Plus_2 HG-U133_Plus_2 Affymetrix Human Genome U133 Plus 2.0 Array Yes 37 | Mapping250K_Nsp Mapping250K_Nsp Affymetrix Human Mapping 250K Nsp Array No 38 | Mapping250K_Sty Mapping250K_Sty Affymetrix Human Mapping 250K Sty Array No 39 | GenomeWideSNP_5 GenomeWideSNP_5 Affymetrix Genome-Wide Human SNP Array 5.0 No 40 | tissue_images tissue_images Tissue Images No 41 | IlluminaGA_RNASeq IlluminaGA_RNASeq Illumina Genome Analyzer RNA Sequencing Yes 42 | IlluminaGA_miRNASeq IlluminaGA_miRNASeq Illumina Genome Analyzer miRNA Sequencing Yes 43 | diagnostic_images diagnostic_images Diagnostic Images Yes 44 | pathology_reports pathology_reports Pathology Reports Yes 45 | MDA_RPPA_Core MDA_RPPA_Core M.D. 
Anderson Reverse Phase Protein Array Core Yes 46 | microsat_i microsat_i Microsatellite Instability Analysis Yes 47 | HumanMethylation450 HumanMethylation450 Illumina Infinium Human DNA Methylation 450 Yes 48 | IlluminaHiSeq_mRNA_DGE IlluminaHiSeq_mRNA_DGE Illumina HiSeq 2000 mRNA Digital Gene Expression Yes 49 | IlluminaHiSeq_miRNASeq IlluminaHiSeq_miRNASeq Illumina HiSeq 2000 miRNA Sequencing Yes 50 | IlluminaHiSeq_RNASeq IlluminaHiSeq_RNASeq Illumina HiSeq 2000 RNA Sequencing Yes 51 | IlluminaHiSeq_DNASeqC IlluminaHiSeq_DNASeqC Illumina HiSeq for Copy Number Variation Yes 52 | fh_analyses fh_analyses Firehose Analyses Yes 53 | fh_stddata fh_stddata Firehose Standardized Data Yes 54 | fh_reports fh_reports Firehose Reports Yes 55 | IlluminaGA_RNASeqV2 IlluminaGA_RNASeqV2 Illumina Genome Analyzer RNA Sequencing Version 2 analysis Yes 56 | IlluminaHiSeq_RNASeqV2 IlluminaHiSeq_RNASeqV2 Illumina HiSeq 2000 RNA Sequencing Version 2 analysis Yes 57 | IlluminaHiSeq_DNASeq_Cont Mutation Calling Illumina HiSeq 2000 DNA Sequencing - Controlled Yes 58 | IlluminaGA_DNASeq_Cont Mutation Calling Illumina Genome Analyzer DNA Sequencing - Controlled Yes 59 | SOLiD_DNASeq_Cont Mutation Calling ABI SOLiD DNA Sequencing - Controlled Yes 60 | IlluminaHiSeq_TotalRNASeqV2 IlluminaHiSeq_TotalRNASeqV2 Illumina HiSeq 2000 Total RNA Sequencing Version 2 analysis Yes 61 | Mixed_DNASeq Mutation Calling Mixed DNA Sequencing Yes 62 | Mixed_DNASeq_Cont Mutation Calling Mixed DNA Sequencing - Controlled Yes 63 | IlluminaHiSeq_WGBS IlluminaHiSeq_WGBS Illumina HiSeq 2000 Bisulfite-converted DNA Sequencing Yes 64 | IlluminaHiSeq_DNASeq_automated Automated Mutation Calling IlluminaHiSeq automated DNA sequencing Yes 65 | IlluminaHiSeq_DNASeq_curated Curated Mutation Calling IlluminaHiSeq curated DNA sequencing Yes 66 | IlluminaGA_DNASeq_automated Automated Mutation Calling IlluminaGA automated DNA sequencing Yes 67 | IlluminaGA_DNASeq_curated Curated Mutation Calling IlluminaGA curated DNA sequencing Yes 68 | SOLiD_DNASeq_automated Automated Mutation Calling SOLiD automated DNA sequencing Yes 69 | SOLiD_DNASeq_curated Curated Mutation Calling SOLiD curated DNA sequencing Yes 70 | Mixed_DNASeq_automated Automated Mutation Calling Mixed automated DNA sequencing Yes 71 | Mixed_DNASeq_curated Curated Mutation Calling Mixed curated DNA sequencing Yes 72 | IlluminaHiSeq_DNASeq_Cont_automated Automated Mutation Calling IlluminaHiSeq automated DNA sequencing - controlled Yes 73 | IlluminaHiSeq_DNASeq_Cont_curated Curated Mutation Calling IlluminaHiSeq curated DNA sequencing - controlled Yes 74 | IlluminaGA_DNASeq_Cont_automated Automated Mutation Calling IlluminaGA automated DNA sequencing - controlled Yes 75 | IlluminaGA_DNASeq_Cont_curated Curated Mutation Calling IlluminaGA curated DNA sequencing - controlled Yes 76 | SOLiD_DNASeq_Cont_automated Automated Mutation Calling SOLiD automated DNA sequencing - controlled Yes 77 | SOLiD_DNASeq_Cont_curated Curated Mutation Calling SOLiD curated DNA sequencing - controlled Yes 78 | Mixed_DNASeq_Cont_automated Automated Mutation Calling Mixed automated DNA sequencing - controlled Yes 79 | Mixed_DNASeq_Cont_curated Curated Mutation Calling Mixed curated DNA sequencing - controlled Yes 80 | -------------------------------------------------------------------------------- /gdctools/reference/sampleType.txt: -------------------------------------------------------------------------------- 1 | Code Definition Short Letter Code 2 | 01 Primary Tumor TP 3 | 02 Recurrent Solid Tumor TR 4 | 03 
Primary Blood Derived Cancer - Peripheral Blood TB 5 | 04 Recurrent Blood Derived Cancer - Bone Marrow TRBM 6 | 05 Additional - New Primary TAP 7 | 06 Metastatic TM 8 | 07 Additional Metastatic TAM 9 | 08 Human Tumor Original Cells THOC 10 | 09 Primary Blood Derived Cancer - Bone Marrow TBM 11 | 10 Blood Derived Normal NB 12 | 11 Solid Tissue Normal NT 13 | 12 Buccal Cell Normal NBC 14 | 13 EBV Immortalized Normal NEBV 15 | 14 Bone Marrow Normal NBM 16 | 20 Control Analyte CELLC 17 | 40 Recurrent Blood Derived Cancer - Peripheral Blood TRB 18 | 50 Cell Lines CELL 19 | 60 Primary Xenograft Tissue XP 20 | 61 Cell Line Derived Xenograft Tissue XCL 21 | -------------------------------------------------------------------------------- /gdctools/tool_template.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | # Template file for gdctools: to use this, simply copy to .py 5 | # and edit as follows: 6 | # global search/replace TOOLNAME with name of GDC tool (e.g. gdcls) 7 | # edit the field in the header 8 | # customize __init__() to add/change: version, flags and description 9 | # write custom content,functions etc to perform the work of the tool 10 | # customize execute() as needed to reflect those custom functions etc 11 | # then remove this entire comment section 12 | 13 | # Front Matter {{{ 14 | ''' 15 | Copyright (c) 2017 The Broad Institute, Inc. All rights are reserved. 16 | 17 | TOOLNAME: this file is part of gdctools. See the /COPYRIGHT 18 | file for the SOFTWARE COPYRIGHT and WARRANTY NOTICE. 19 | 20 | @author: Michael S. Noble 21 | @date: 22 | ''' 23 | 24 | # }}} 25 | 26 | from GDCtool import GDCtool 27 | 28 | class TOOLNAME(GDCtool): 29 | 30 | def __init__(self): 31 | super(TOOLNAME, self).__init__(version="0.2.0") 32 | 33 | #desc = 'TOOLNAME description \n\n' 34 | #desc += 'MORE TOOLNAME description ...\n' 35 | #opts.description = desc 36 | 37 | # Optional arguments (if any) 38 | #opts = self.options 39 | #opts.add_argument('-w', '--what', default='all', 40 | 41 | # Positional (required) arguments (if any) 42 | #opts.add_argument('-w', '--what', default='all', 43 | 44 | def execute(self): 45 | super(TOOLNAME, self).execute() 46 | 47 | if __name__ == "__main__": 48 | tool = TOOLNAME() 49 | tool.execute() 50 | -------------------------------------------------------------------------------- /generate.py: -------------------------------------------------------------------------------- 1 | 2 | # Experimental code, to explore how to best automate wrapper generation 3 | 4 | import GDCcore as core 5 | from pprint import pprint 6 | 7 | core.set_codec(core.CODEC_DJSON) 8 | Endpoints=["projects", "cases", "files", "annotations"] 9 | #Endpoints = ["files"] 10 | 11 | fields = {} 12 | 13 | for ep in Endpoints: 14 | mapping = ep + "/_mapping" 15 | core.set_debug(True) 16 | mapping = core.get(mapping) 17 | continue 18 | print("\n\n%s/_mapping contains: %s" % (ep, str(mapping.keys()))) 19 | print(ep + " endpoint supports the query fields: ") 20 | for field in mapping["fields"]: 21 | print("\t" + field) 22 | terms = field.split(".") 23 | terms.reverse() 24 | fields.setdefault(terms[0], []).extend(terms[1:]) 25 | 26 | pprint(fields) 27 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | fasteners 3 | matplotlib 4 | 
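The header comment in tool_template.py (above) describes how to turn the template into a new tool: copy it, replace TOOLNAME, set the version and description, and fill in execute(). As a rough illustration only, a minimal tool built that way might look like the sketch below; the gdc_hello name, the version string, and the --what flag are invented here, and the behavior of self.options (description, add_argument) is assumed to follow the commented stubs in the template rather than verified against GDCtool.py.

#!/usr/bin/env python
# gdc_hello.py -- hypothetical sketch, not part of gdctools

from GDCtool import GDCtool

class gdc_hello(GDCtool):

    def __init__(self):
        super(gdc_hello, self).__init__(version="0.1.0")
        opts = self.options
        opts.description = 'gdc_hello: trivial example tool, prints a greeting\n'
        # Optional flag, patterned on the commented-out stub in the template
        opts.add_argument('-w', '--what', default='GDC')

    def execute(self):
        super(gdc_hello, self).execute()
        # The tool's real work would go here; this sketch just prints
        print("Hello from gdc_hello")

if __name__ == "__main__":
    tool = gdc_hello()
    tool.execute()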
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #=============================================================================== 2 | # The Broad Institute 3 | # SOFTWARE COPYRIGHT NOTICE AGREEMENT 4 | # This software and its documentation are copyright 2016-2017 by the 5 | # Broad Institute/Massachusetts Institute of Technology. All rights reserved. 6 | # 7 | # This software is supplied without any warranty or guaranteed support whatsoever. 8 | # Neither the Broad Institute nor MIT can be responsible for its use, misuse, or 9 | # functionality. 10 | #=============================================================================== 11 | 12 | import os 13 | from setuptools import setup, find_packages 14 | 15 | #=============================================================================== 16 | # Setup 17 | #=============================================================================== 18 | 19 | README = open('README.md').read() 20 | README = README.replace(" ","") 21 | README = README.replace("**","") 22 | version = open('VERSION').read().strip() 23 | 24 | setup( 25 | name = 'gdctools', 26 | version = version, 27 | author = 'Michael S. Noble, Timothy DeFreitas, David Heiman', 28 | author_email = 'gdac@broadinstitute.org', 29 | url = 'https://github.com/broadinstitute/gdctools', 30 | packages = find_packages(), 31 | description = ( 32 | "GDCtools: Python and UNIX CLI utils to simplify interaction with the NIH/NCI Genomics Data Commons." 33 | ), 34 | long_description = README, 35 | entry_points = { 36 | 'console_scripts': [ 37 | # FIXME: this list s/b generated from $(TOOLS) macro in Makefile 38 | 'gdc_dice = gdctools.gdc_dice:main', 39 | 'gdc_list = gdctools.gdc_list:main', 40 | 'gdc_mirror = gdctools.gdc_mirror:main', 41 | 'gdc_loadfile = gdctools.gdc_loadfile:main', 42 | 'gdc_report = gdctools.gdc_report:main' 43 | ] 44 | }, 45 | # Put cfg files in bin, but better may be to look in pkg_data config subdir 46 | data_files = [('bin', [ 47 | 'gdctools/config/tcga.cfg', 48 | 'gdctools/config/cptac3.cfg', 49 | 'gdctools/config/google.cfg' 50 | ])], 51 | package_data = {'gdctools': [ 52 | 'config/*.cfg', 53 | 'lib/annot*.tsv', 54 | 'lib/GDCSampleReport.R', 55 | 'reference/*', 56 | 'default.cfg' 57 | ], 58 | }, 59 | test_suite = 'nose.collector', 60 | install_requires = [ 61 | 'requests', 62 | 'fasteners', 63 | 'matplotlib==2.1.1', # v2.1.1 avoids hardcoded dependency on bz2 module 64 | 'future', 65 | 'configparser', 66 | ], 67 | ) 68 | -------------------------------------------------------------------------------- /tests/Makefile: -------------------------------------------------------------------------------- 1 | 2 | include ../Makefile.inc 3 | 4 | # ---------------------------- Macro definitions --------------------------- 5 | 6 | # Recall that TEST_ROOT and TEST_CONFIG_FILE are defined in Makefile.inc 7 | 8 | SRC=../gdctools 9 | PYTHON := set -o pipefail && env PYTHONPATH=.. 
$(PYTHON) 10 | SORT=env LC_COLLATE=C sort 11 | #VERBOSITY=-V 12 | CONFIG=--config $(TEST_CONFIG_FILE) $(VERBOSITY) 13 | PROJECT_SUBSET=--projects TCGA-ACC TCGA-SKCM TCGA-ESCA TCGA-BLCA 14 | 15 | # To improve readibility of targets definitions below, we define commands to 16 | # encapsulate mirroring/dicing/file finding/loadfile parsing, etc operations 17 | PERFORM_MIRROR=$(PYTHON) $(SRC)/gdc_mirror.py $(CONFIG) 18 | FIND_MIRROR_FILES=cd $(TEST_ROOT)/mirror/TCGA && find TCGA-* \ 19 | -name '*.gz' -o -name '*.xml' -o -name '*.txt' 20 | PERFORM_DICE=$(PYTHON) $(SRC)/gdc_dice.py $(CONFIG) $(PROJECT_SUBSET) 21 | FIND_DICE_FILES=cd $(TEST_ROOT)/dice/TCGA && find TCGA-* -name '*.txt' 22 | FIND_LEGACY_FILES=cd legacy/mirror/TCGA && find TCGA-* -name '*.*' -a ! -name '*.json' 23 | PERFORM_REPORT=$(PYTHON) $(SRC)/gdc_report.py $(CONFIG) $(PROJECT_SUBSET) 24 | PERFORM_LOADF=$(PYTHON) $(SRC)/gdc_loadfile.py $(LOADFILE_FORMAT) $(CONFIG) $(PROJECT_SUBSET) 25 | ifeq ($(LOADFILE_DESTINATION),google) 26 | # We generate Google-bucketized loadfiles with an additional config file 27 | CONFIG := --config $(TEST_CONFIG_FILE) ../gdctools/config/google.cfg $(VERBOSITY) 28 | LOADFILE_SUBDIR=google/ 29 | LOADFILE_MD5=load-md5sums-google.txt 30 | FILE_PREFIX=$(shell grep FILE_PREFIX: ../gdctools/config/google.cfg | awk '{print $$NF}') 31 | # In this mode we also generate FireCloud-style loadfiles, including cases/participants 32 | CHECK_CASES_LOADFILE=diff -b baselines/TCGA-ACCSKCM.Participant.loadfile.txt $(LOADFILES_DIR)/. 33 | else 34 | LOADFILE_SUBDIR= 35 | LOADFILE_MD5=load-md5sums.txt 36 | FILE_PREFIX=$(TEST_ROOT) 37 | LOADFILE_FORMAT=--format=firehose 38 | endif 39 | LOADFILES_DIR=$(TEST_ROOT)/loadfiles/$(LOADFILE_SUBDIR)TCGA/latest 40 | # Note: Participant loadfiles are excluded b/c they are NOT generated for legacy/firehose format 41 | FIND_LOAD_FILES=cd $(LOADFILES_DIR) && find . -name 'TCGA-*loadfile.txt' ! -name '*Participant*' 42 | FILE_PREFIX_LENGTH=$(shell printf $(FILE_PREFIX) | wc -c | awk '{print $$NF}') 43 | # The goal of VALIDATE_FILE_PREFIX is to ensure that data files listed within 44 | # the loadfile have correct paths, either to local filesystem or cloud buckets 45 | VALIDATE_FILE_PREFIX=cd $(LOADFILES_DIR) && \ 46 | FILE_PATH=`sed -n '2,2p' TCGA-ACC.Sample.loadfile.txt | cut -f6` && \ 47 | ACTUAL_PREFIX=`echo $$FILE_PATH | cut -c-$(FILE_PREFIX_LENGTH)` && \ 48 | if [ $$ACTUAL_PREFIX != $(FILE_PREFIX) ] ; then \ 49 | echo "File paths in loadfiles must begin with $(FILE_PREFIX)" ; \ 50 | false ; \ 51 | fi 52 | ENSURE_FAILURE_EXIT_CODE=\ 53 | if (($$Result)) ; then \ 54 | echo "Pass: aborted with exit code $$Result" ; \ 55 | else \ 56 | echo "Fail: returned 0 exit code , but should've returned non-zero" ; \ 57 | false ; \ 58 | fi 59 | 60 | # --------------------------- Target definitions --------------------------- 61 | 62 | help: 63 | @echo 64 | @echo "Run various GDCtools tests. Requires GNUmake 3.81 or later" 65 | @echo 66 | @echo "Targets:" 67 | @echo 68 | @echo "1. test Exercise tests for this package" 69 | @echo "2. install Install locally, using pip" 70 | @echo "3. uninstall Remove local install, using pip" 71 | @echo "4. 
publish Submit to PyPI" 72 | @echo 73 | 74 | test: setup test_smoke test_dice test_loadfiles test_legacy test_report echo_success 75 | test_smoke: setup echo_ver test_invoke test_mirror test_redo_mirror test_badcfg \ 76 | test_cases test_choose 77 | 78 | setup: 79 | mkdir -p $(TEST_ROOT) 80 | 81 | test_invoke: setup 82 | @echo 83 | @echo Test runnability: invoke some tools to show nothing thrown to stderr 84 | @$(PYTHON) $(SRC)/GDCcore.py >/dev/null 85 | @$(PYTHON) $(SRC)/GDCtool.py >/dev/null 86 | @$(PYTHON) $(SRC)/gdc_mirror.py --help >/dev/null 87 | @$(PYTHON) $(SRC)/gdc_list.py --help >/dev/null 88 | @echo Test assorted features or API calls directly, instead of via tools 89 | @$(PYTHON) misctests.py > $(TEST_ROOT)/misctests.txt 90 | diff $(TEST_ROOT)/misctests.txt baselines/misctests.txt 91 | 92 | test_mirror: 93 | @echo 94 | @echo "Test mirror: download small set of data, compare to baselines" 95 | @$(PERFORM_MIRROR) 2>&1 | tee $@.log | egrep "GDC|Mirroring data|Mirroring start" 96 | $(ABORT_ON_ERROR) $@.log 97 | 98 | @# Now see that the named set of mirrored files matches what we expect 99 | $(FIND_MIRROR_FILES) | $(SORT) > $(TEST_ROOT)/mirror-files.txt 100 | diff -b baselines/mirror-files.txt $(TEST_ROOT)/. 101 | 102 | @# Verify integrity (but using our stored MD5s, not those just downloaded) 103 | $(FIND_MIRROR_FILES) -exec $(MD5) {} \; | $(SORT) > $(TEST_ROOT)/mirror-md5sums.txt 104 | diff -b baselines/mirror-md5sums.txt $(TEST_ROOT)/. 105 | 106 | test_redo_mirror: 107 | @echo 108 | @echo Test retry of mirror: nothing should be re-downloaded 109 | @$(PERFORM_MIRROR) 2>&1 | grep -w new | grep -vl " [^0][0-9]* new " >/dev/null 110 | 111 | test_badcfg: 112 | @echo 113 | @echo Test that attempting to use bad config file quickly aborts with error 114 | @touch bad.cfg 115 | @$(PYTHON) ../gdctools/gdc_mirror.py --config bad.cfg || Result=$$? ; \ 116 | $(ENSURE_FAILURE_EXIT_CODE) 117 | @echo 118 | @echo Now test that given bad program/project names also quickly aborts 119 | @$(PYTHON) ../gdctools/gdc_mirror.py --programs DUMMY || Result=$$? ; \ 120 | $(ENSURE_FAILURE_EXIT_CODE) 121 | @echo 122 | @$(PYTHON) ../gdctools/gdc_mirror.py --projects DUMMY || Result=$$? ; \ 123 | $(ENSURE_FAILURE_EXIT_CODE) 124 | @echo 125 | @$(PERFORM_DICE) --programs DUMMY 2>&1 | grep -i error || Result=$$? ; \ 126 | $(ENSURE_FAILURE_EXIT_CODE) 127 | @echo 128 | @$(PERFORM_LOADF) --projects DUMMY 2>&1 | grep -i error || Result=$$? ; \ 129 | $(ENSURE_FAILURE_EXIT_CODE) 130 | @echo 131 | @echo Induce failure in gdc_list, by giving bad input 132 | @$(PYTHON) $(SRC)/gdc_list.py DUMMY 2>&1 || Result=$$? ; \ 133 | $(ENSURE_FAILURE_EXIT_CODE) 134 | 135 | test_choose: 136 | @echo 137 | @echo Test that replicate filter is choosing the appropriate aliquots 138 | @$(PYTHON) testchoose.py 139 | 140 | test_dice: 141 | @echo 142 | @echo Test dice: on subset of cohorts, to show CLI args override config file 143 | $(PERFORM_DICE) 2>&1 | tee $@.log | egrep "Dicing TCGA|date" 144 | $(ABORT_ON_ERROR) $@.log 145 | $(FIND_DICE_FILES) | $(SORT) > $(TEST_ROOT)/dice-files.txt 146 | diff -b baselines/dice-files.txt $(TEST_ROOT)/. 147 | 148 | @# Verify integrity (but using our stored MD5s, not those just downloaded) 149 | $(FIND_DICE_FILES) -exec $(MD5) {} \; | $(SORT) > $(TEST_ROOT)/dice-md5sums.txt 150 | diff -b baselines/dice-md5sums.txt $(TEST_ROOT)/. 
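# (Annotation, not part of the original Makefile.) Because PROJECT_SUBSET and
# VERBOSITY above are ordinary make macros, a single check such as test_dice
# can be re-run against a smaller cohort subset, or with verbose tool output,
# by overriding them on the command line -- e.g., assuming mirrored data for
# the chosen project already exists:
#
#   make test_dice PROJECT_SUBSET="--projects TCGA-ACC" VERBOSITY=-V
#
# The baseline diffs above expect the default four-project subset, so such
# overrides are useful for debugging rather than for the canonical test run.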
151 | 152 | test_loadfiles: 153 | # By default GDCtools generates firecloud-style loadfiles, so exercise other 154 | @$(EMAKE) test_loadfile 155 | @echo 156 | @$(EMAKE) test_loadfile LOADFILE_DESTINATION=google 157 | 158 | test_loadfile: 159 | @echo 160 | @echo "Test $(LOADFILE_DESTINATION) loadfile generation ..." 161 | $(PERFORM_LOADF) 2>&1 | tee $@.log | egrep "Generating|date" 162 | $(ABORT_ON_ERROR) $@.log 163 | # First, ensure that data files are prefixed with appropriate path 164 | $(VALIDATE_FILE_PREFIX) 165 | # Now compare names of files 166 | $(FIND_LOAD_FILES) | $(SORT) > $(TEST_ROOT)/load-files.txt 167 | diff -b baselines/load-files.txt $(TEST_ROOT)/. 168 | # Now ensure that replicate etc filter worked 169 | diff -b baselines/TCGA.filtered_samples.txt $(LOADFILES_DIR)/. 170 | # And check that case/participants loadfile is OK, if relevant 171 | $(CHECK_CASES_LOADFILE) 172 | # Lastly, MD5 compare content after removing local path prefix to files 173 | $(FIND_LOAD_FILES) -exec sed 's|$(FILE_PREFIX)/||g' {} \; \ 174 | | $(SORT) > $(TEST_ROOT)/load-munged.txt 175 | $(MD5) $(TEST_ROOT)/load-munged.txt | sed 's|$(TEST_ROOT)/||g' \ 176 | > $(TEST_ROOT)/$(LOADFILE_MD5) 177 | diff -b baselines/$(LOADFILE_MD5) $(TEST_ROOT)/. 178 | 179 | Rscript=$(shell type -P Rscript 2>/dev/null) 180 | test_report: 181 | @echo 182 | @if [ -z "$(Rscript)" ] ; then \ 183 | echo "R / Rscript not installed on your system, skipping test_report" ;\ 184 | else \ 185 | echo "Test sample report generation ..." ; \ 186 | $(PERFORM_REPORT) 2>&1 | tee $@.log | egrep "Generating|date" ; \ 187 | $(ABORT_ON_ERROR) $@.log ; \ 188 | (cd $(TEST_ROOT)/reports/latest && find . -name '*.html' | \ 189 | $(SORT) > $(TEST_ROOT)/report-files.txt) ; \ 190 | diff -b baselines/report-files.txt $(TEST_ROOT)/. ; \ 191 | fi 192 | 193 | test_cases: 194 | @echo 195 | @echo "Test fine-grained retrieval of 1 case (and 1 data category)" 196 | @$(PYTHON) $(SRC)/gdc_mirror.py --config onlycases.cfg 2>&1 \ 197 | --categories Biospecimen | tee $@.log | \ 198 | egrep "GDC|Mirroring data|Mirroring start|categorie" 199 | $(ABORT_ON_ERROR) $@.log 200 | @cd onlycases/mirror/TCGA && find TCGA-* \ 201 | -name '*.*' -a ! -name '*.json' | \ 202 | $(SORT) > ../../onlycases-files.txt 203 | diff -b baselines/onlycases-files.txt onlycases/. 204 | 205 | test_legacy: 206 | @echo 207 | @echo "Test legacy data download (for 1 case and 3 data categories)" 208 | @$(PYTHON) ../gdctools/gdc_mirror.py --config legacy.cfg 2>&1 | \ 209 | tee $@.log | \ 210 | egrep "GDC|Mirroring data|Mirroring start" 211 | $(ABORT_ON_ERROR) $@.log 212 | $(FIND_LEGACY_FILES) | $(SORT) > ../../legacy-files.txt 213 | diff -b baselines/legacy-files.txt legacy/. 214 | $(FIND_LEGACY_FILES) -exec $(MD5) {} \; | $(SORT) > ../../legacy-md5sums.txt 215 | diff -b baselines/legacy-md5sums.txt legacy/. 216 | 217 | test3: 218 | $(MAKE) -e PYTHON_VER=3 test 219 | 220 | VERTEST="import gdctools as g; print('Version: ' + g.GDCcore.GDCT_VERSION)" 221 | testl: 222 | @# Test the package locally, as if it were installed 223 | @$(PYTHON) -c $(VERTEST) 224 | 225 | testi: 226 | @# Test the installed package 227 | @(cd /tmp ; $(PYTHON) -c $(VERTEST)) 228 | 229 | .PHONY: test clean echo_success test_mirror test_dice test_redo_mirror 230 | 231 | echo_success: 232 | @echo 233 | @echo Success! 234 | 235 | echo_ver: 236 | @echo Using $(PYTHON_HOME)/bin/$(PYTHON_EXE) ... 
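# (Annotation, not part of the original Makefile.) The test_loadfiles target
# above exercises both loadfile flavors by re-invoking make on test_loadfile;
# the Google-bucket variant can also be run on its own, e.g.
#
#   make test_loadfile LOADFILE_DESTINATION=google
#
# which, via the ifeq block near the top of this file, swaps in google.cfg,
# the google/ loadfile subdirectory, and the google MD5 baseline. This assumes
# the mirror/dice outputs already exist and that EMAKE (defined in
# ../Makefile.inc, not shown here) performs a recursive make invocation.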
237 | 238 | clean: 239 | rm -rf build dist *.egg-info *~ test_*.log bad.cfg 240 | 241 | rclean: clean 242 | \rm -rf $(TEST_ROOT) GDCtool gdctools_tmp onlycases legacy 243 | -------------------------------------------------------------------------------- /tests/baselines/TCGA-ACCSKCM.Participant.loadfile.txt: -------------------------------------------------------------------------------- 1 | entity:participant_id 2 | TCGA-ACC-OR-A5K2 3 | TCGA-ACC-OR-A5L1 4 | TCGA-SKCM-D3-A3C7 5 | TCGA-SKCM-EE-A3J8 6 | -------------------------------------------------------------------------------- /tests/baselines/TCGA.filtered_samples.txt: -------------------------------------------------------------------------------- 1 | Participant Id Cohort Annotation Filter Reason Removed Samples Chosen Sample 2 | TCGA-BL-A0C8 TCGA-BLCA CNV__snp6 Analyte Replicate Filter TCGA-BL-A0C8-10A-01D-A10T-01 TCGA-BL-A0C8-10A-01D-A273-01 3 | TCGA-BL-A0C8 TCGA-BLCA CNV__unfiltered__snp6 Analyte Replicate Filter TCGA-BL-A0C8-10A-01D-A10T-01 TCGA-BL-A0C8-10A-01D-A273-01 4 | TCGA-BL-A0C8 TCGA-BLCA CNV__snp6 Analyte Replicate Filter TCGA-BL-A0C8-01A-11D-A10T-01 TCGA-BL-A0C8-01A-11D-A273-01 5 | TCGA-BL-A0C8 TCGA-BLCA CNV__unfiltered__snp6 Analyte Replicate Filter TCGA-BL-A0C8-01A-11D-A10T-01 TCGA-BL-A0C8-01A-11D-A273-01 6 | TCGA-BL-A0C8 TCGA-BLCA SNV__mutect Analyte Replicate Filter TCGA-BL-A0C8-01A-11D-A10S-08 TCGA-BL-A0C8-01A-11D-A271-08 7 | TCGA-BL-A0C8 TCGA-BLCA mRNA__counts__FPKM Analyte Replicate Filter TCGA-BL-A0C8-01A-11R-A10U-07 TCGA-BL-A0C8-01A-11R-A277-07 8 | TCGA-BL-A0C8 TCGA-BLCA mRNA__geneExpNormed__FPKM Analyte Replicate Filter TCGA-BL-A0C8-01A-11R-A10U-07 TCGA-BL-A0C8-01A-11R-A277-07 9 | TCGA-BL-A0C8 TCGA-BLCA mRNA__geneExp__FPKM Analyte Replicate Filter TCGA-BL-A0C8-01A-11R-A10U-07 TCGA-BL-A0C8-01A-11R-A277-07 10 | TCGA-BL-A0C8 TCGA-BLCA methylation__HM450 Analyte Replicate Filter TCGA-BL-A0C8-01A-11D-A10W-05 TCGA-BL-A0C8-01A-11D-A276-05 11 | TCGA-BL-A0C8 TCGA-BLCA miR__geneExp Analyte Replicate Filter TCGA-BL-A0C8-01A-11R-A10V-13 TCGA-BL-A0C8-01A-11R-A27D-13 12 | TCGA-BL-A0C8 TCGA-BLCA miR__isoformExp Analyte Replicate Filter TCGA-BL-A0C8-01A-11R-A10V-13 TCGA-BL-A0C8-01A-11R-A27D-13 13 | TCGA-BL-A13I TCGA-BLCA SNV__mutect Analyte Replicate Filter TCGA-BL-A13I-01A-11D-A13W-08 TCGA-BL-A13I-01A-11D-A271-08 14 | TCGA-BL-A13I TCGA-BLCA mRNA__counts__FPKM Analyte Replicate Filter TCGA-BL-A13I-01A-11R-A13Y-07 TCGA-BL-A13I-01A-11R-A277-07 15 | TCGA-BL-A13I TCGA-BLCA mRNA__geneExpNormed__FPKM Analyte Replicate Filter TCGA-BL-A13I-01A-11R-A13Y-07 TCGA-BL-A13I-01A-11R-A277-07 16 | TCGA-BL-A13I TCGA-BLCA mRNA__geneExp__FPKM Analyte Replicate Filter TCGA-BL-A13I-01A-11R-A13Y-07 TCGA-BL-A13I-01A-11R-A277-07 17 | TCGA-BL-A13I TCGA-BLCA methylation__HM450 Analyte Replicate Filter TCGA-BL-A13I-01A-11D-A13Z-05 TCGA-BL-A13I-01A-11D-A276-05 18 | TCGA-BL-A13I TCGA-BLCA miR__geneExp Analyte Replicate Filter TCGA-BL-A13I-01A-11R-A13X-13 TCGA-BL-A13I-01A-11R-A27D-13 19 | TCGA-BL-A13I TCGA-BLCA miR__isoformExp Analyte Replicate Filter TCGA-BL-A13I-01A-11R-A13X-13 TCGA-BL-A13I-01A-11R-A27D-13 20 | TCGA-BL-A13J TCGA-BLCA CNV__snp6 Analyte Replicate Filter TCGA-BL-A13J-10A-01D-A10T-01 TCGA-BL-A13J-10A-01D-A273-01 21 | TCGA-BL-A13J TCGA-BLCA CNV__unfiltered__snp6 Analyte Replicate Filter TCGA-BL-A13J-10A-01D-A10T-01 TCGA-BL-A13J-10A-01D-A273-01 22 | TCGA-BL-A13J TCGA-BLCA CNV__snp6 Analyte Replicate Filter TCGA-BL-A13J-01A-11D-A10T-01 TCGA-BL-A13J-01A-11D-A273-01 23 | TCGA-BL-A13J TCGA-BLCA CNV__unfiltered__snp6 Analyte Replicate Filter 
TCGA-BL-A13J-01A-11D-A10T-01 TCGA-BL-A13J-01A-11D-A273-01 24 | TCGA-BL-A13J TCGA-BLCA SNV__mutect Analyte Replicate Filter TCGA-BL-A13J-01A-11D-A10S-08 TCGA-BL-A13J-01A-11D-A271-08 25 | TCGA-BL-A13J TCGA-BLCA mRNA__counts__FPKM Analyte Replicate Filter TCGA-BL-A13J-01A-11R-A10U-07 TCGA-BL-A13J-01A-11R-A277-07 26 | TCGA-BL-A13J TCGA-BLCA mRNA__geneExpNormed__FPKM Analyte Replicate Filter TCGA-BL-A13J-01A-11R-A10U-07 TCGA-BL-A13J-01A-11R-A277-07 27 | TCGA-BL-A13J TCGA-BLCA mRNA__geneExp__FPKM Analyte Replicate Filter TCGA-BL-A13J-01A-11R-A10U-07 TCGA-BL-A13J-01A-11R-A277-07 28 | TCGA-BL-A13J TCGA-BLCA methylation__HM450 Analyte Replicate Filter TCGA-BL-A13J-01A-11D-A10W-05 TCGA-BL-A13J-01A-11D-A276-05 29 | TCGA-BL-A13J TCGA-BLCA miR__geneExp Analyte Replicate Filter TCGA-BL-A13J-01A-11R-A10V-13 TCGA-BL-A13J-01A-11R-A27D-13 30 | TCGA-BL-A13J TCGA-BLCA miR__isoformExp Analyte Replicate Filter TCGA-BL-A13J-01A-11R-A10V-13 TCGA-BL-A13J-01A-11R-A27D-13 31 | TCGA-IG-A3YB TCGA-ESCA miR__geneExp Analyte Replicate Filter TCGA-IG-A3YB-01A-11R-A24L-13 TCGA-IG-A3YB-01A-11R-A360-13 32 | TCGA-IG-A3YB TCGA-ESCA miR__isoformExp Analyte Replicate Filter TCGA-IG-A3YB-01A-11R-A24L-13 TCGA-IG-A3YB-01A-11R-A360-13 33 | TCGA-L5-A4OI TCGA-ESCA miR__geneExp Analyte Replicate Filter TCGA-L5-A4OI-01A-11R-A261-13 TCGA-L5-A4OI-01A-11R-A360-13 34 | TCGA-L5-A4OI TCGA-ESCA miR__isoformExp Analyte Replicate Filter TCGA-L5-A4OI-01A-11R-A261-13 TCGA-L5-A4OI-01A-11R-A360-13 35 | -------------------------------------------------------------------------------- /tests/baselines/dice-files.txt: -------------------------------------------------------------------------------- 1 | TCGA-ACC/CNV__snp6/TCGA-OR-A5K2-01A-11D-A29H-01.2e839865-a79e-463a-b954-ec02b78d5f7d.txt 2 | TCGA-ACC/CNV__snp6/TCGA-OR-A5K2-10B-01D-A29K-01.d7d61592-8a7e-482d-969b-cda38f4f2404.txt 3 | TCGA-ACC/CNV__snp6/TCGA-OR-A5L1-01A-11D-A309-01.5c98cfc0-7cd0-4c0b-9fca-9aeb60bb57c4.txt 4 | TCGA-ACC/CNV__snp6/TCGA-OR-A5L1-10A-01D-A309-01.a7dd6050-3ae7-4684-82ec-e5cc74624d1d.txt 5 | TCGA-ACC/CNV__unfiltered__snp6/TCGA-OR-A5K2-01A-11D-A29H-01.08c898e1-be4a-4d68-852e-6a665d25bd8a.txt 6 | TCGA-ACC/CNV__unfiltered__snp6/TCGA-OR-A5K2-10B-01D-A29K-01.029517fe-0995-43d5-b61e-cd968073f39c.txt 7 | TCGA-ACC/CNV__unfiltered__snp6/TCGA-OR-A5L1-01A-11D-A309-01.5dfae645-caa6-4021-8f29-54e3cf043d33.txt 8 | TCGA-ACC/CNV__unfiltered__snp6/TCGA-OR-A5L1-10A-01D-A309-01.b54b7ebe-5c61-4fdd-b74d-afae3bc68986.txt 9 | TCGA-ACC/SNV__mutect/81ac2c46-37db-4dcd-923a-061a7ae626a3.maf.txt 10 | TCGA-ACC/SNV__mutect/TCGA-OR-A5K2-01A-11D-A29I-10.81ac2c46-37db-4dcd-923a-061a7ae626a3.maf.txt 11 | TCGA-ACC/SNV__mutect/TCGA-OR-A5L1-01A-11D-A30A-10.81ac2c46-37db-4dcd-923a-061a7ae626a3.maf.txt 12 | TCGA-ACC/clinical__biospecimen/TCGA-OR-A5K2.a2e4dcd8-9cd0-4b3e-a551-4267a9da7248.txt 13 | TCGA-ACC/clinical__biospecimen/TCGA-OR-A5L1.f7b24eea-4a80-4304-827f-abe143000ada.txt 14 | TCGA-ACC/clinical__primary/TCGA-OR-A5K2.a04bd70d-dc86-48c9-aeee-1336d881453e.txt 15 | TCGA-ACC/clinical__primary/TCGA-OR-A5L1.b0294729-31e1-4ea2-9bb3-971267c788ef.txt 16 | TCGA-ACC/mRNA__counts__FPKM/TCGA-OR-A5K2-01A-11R-A29S-07.76bfb639-1e16-4cf0-b280-a17305da2e13.txt 17 | TCGA-ACC/mRNA__geneExpNormed__FPKM/TCGA-OR-A5K2-01A-11R-A29S-07.1cf80958-0f98-445b-b91c-eae101b8fb38.txt 18 | TCGA-ACC/mRNA__geneExp__FPKM/TCGA-OR-A5K2-01A-11R-A29S-07.cc530ddb-02c2-47aa-9f3e-a5c39af69c5c.txt 19 | TCGA-ACC/methylation__HM450/TCGA-OR-A5K2-01A-11D-A29J-05.c5a36e2a-33cd-484a-91db-4f030a5f5415.data.txt 20 | 
TCGA-ACC/miR__geneExp/TCGA-OR-A5K2-01A-11R-A29W-13.db7a6124-4fda-4e1c-aa66-98b9a47444f1.txt 21 | TCGA-ACC/miR__isoformExp/TCGA-OR-A5K2-01A-11R-A29W-13.9532d362-fa8d-47a2-851e-4f2a80a26469.txt 22 | TCGA-BLCA/CNV__snp6/TCGA-BL-A0C8-01A-11D-A10T-01.9f8b064a-9b90-4111-a461-de7e5bc7d32b.txt 23 | TCGA-BLCA/CNV__snp6/TCGA-BL-A0C8-01A-11D-A273-01.85e23066-0321-4961-b2ee-95f66dafc1a7.txt 24 | TCGA-BLCA/CNV__snp6/TCGA-BL-A0C8-01B-04D-A273-01.d976c433-7c55-4de6-a246-8f1b1d87eb98.txt 25 | TCGA-BLCA/CNV__snp6/TCGA-BL-A0C8-10A-01D-A10T-01.fb9ec334-9b9b-43e9-bdc4-64c76bef0221.txt 26 | TCGA-BLCA/CNV__snp6/TCGA-BL-A0C8-10A-01D-A273-01.cd1c7b7c-9482-4923-9e8d-4a8ca363aa5b.txt 27 | TCGA-BLCA/CNV__snp6/TCGA-BL-A13I-01A-11D-A13V-01.09bf33a8-358e-4351-8268-51705fd5036b.txt 28 | TCGA-BLCA/CNV__snp6/TCGA-BL-A13I-11A-11D-A13V-01.d99f0b97-ded8-44f4-aeb1-6f977303fea0.txt 29 | TCGA-BLCA/CNV__snp6/TCGA-BL-A13J-01A-11D-A10T-01.53d146f3-e074-46dc-832c-f1414af17385.txt 30 | TCGA-BLCA/CNV__snp6/TCGA-BL-A13J-01A-11D-A273-01.e8cb7bee-0401-4b5c-9bb1-8f308199f83d.txt 31 | TCGA-BLCA/CNV__snp6/TCGA-BL-A13J-01B-04D-A273-01.617526f7-2d64-4da1-95a2-636fa43d5bf1.txt 32 | TCGA-BLCA/CNV__snp6/TCGA-BL-A13J-10A-01D-A10T-01.b2cec796-a785-43eb-ac53-04e39b2df68e.txt 33 | TCGA-BLCA/CNV__snp6/TCGA-BL-A13J-10A-01D-A273-01.ce39f6ea-a97a-4f48-b36e-a0b87b95ee17.txt 34 | TCGA-BLCA/CNV__unfiltered__snp6/TCGA-BL-A0C8-01A-11D-A10T-01.1658654c-631f-4b89-8a87-d7231824464e.txt 35 | TCGA-BLCA/CNV__unfiltered__snp6/TCGA-BL-A0C8-01A-11D-A273-01.12d4de16-e001-470f-a1f5-d1af66ff7133.txt 36 | TCGA-BLCA/CNV__unfiltered__snp6/TCGA-BL-A0C8-01B-04D-A273-01.51112455-61ee-4bec-9f52-0ae1484a085d.txt 37 | TCGA-BLCA/CNV__unfiltered__snp6/TCGA-BL-A0C8-10A-01D-A10T-01.947fde1c-31c2-40b4-a998-adae2ff8a02b.txt 38 | TCGA-BLCA/CNV__unfiltered__snp6/TCGA-BL-A0C8-10A-01D-A273-01.ae7b541e-5384-4207-9d63-663c205bc6d2.txt 39 | TCGA-BLCA/CNV__unfiltered__snp6/TCGA-BL-A13I-01A-11D-A13V-01.3a86f64f-90ed-45e9-b74d-2213c2e03c19.txt 40 | TCGA-BLCA/CNV__unfiltered__snp6/TCGA-BL-A13I-11A-11D-A13V-01.ceec0a6e-945a-4ab4-8ceb-d9cc0f6e5907.txt 41 | TCGA-BLCA/CNV__unfiltered__snp6/TCGA-BL-A13J-01A-11D-A10T-01.53c4d4c7-79a2-434e-8b98-2eb98c908b0d.txt 42 | TCGA-BLCA/CNV__unfiltered__snp6/TCGA-BL-A13J-01A-11D-A273-01.3f936034-9769-468e-ac62-929773268e70.txt 43 | TCGA-BLCA/CNV__unfiltered__snp6/TCGA-BL-A13J-01B-04D-A273-01.86c7478b-e89b-4529-a5fd-f41c808a1049.txt 44 | TCGA-BLCA/CNV__unfiltered__snp6/TCGA-BL-A13J-10A-01D-A10T-01.7323d5c7-6db3-4e2d-a8eb-f206149e46cd.txt 45 | TCGA-BLCA/CNV__unfiltered__snp6/TCGA-BL-A13J-10A-01D-A273-01.3d05493f-064d-4208-8afb-3cea28515d58.txt 46 | TCGA-BLCA/SNV__mutect/0e239d8f-47b0-4e47-9716-e9ecc87605b9.maf.txt 47 | TCGA-BLCA/SNV__mutect/TCGA-BL-A0C8-01A-11D-A10S-08.0e239d8f-47b0-4e47-9716-e9ecc87605b9.maf.txt 48 | TCGA-BLCA/SNV__mutect/TCGA-BL-A0C8-01A-11D-A271-08.0e239d8f-47b0-4e47-9716-e9ecc87605b9.maf.txt 49 | TCGA-BLCA/SNV__mutect/TCGA-BL-A0C8-01B-04D-A271-08.0e239d8f-47b0-4e47-9716-e9ecc87605b9.maf.txt 50 | TCGA-BLCA/SNV__mutect/TCGA-BL-A13I-01A-11D-A13W-08.0e239d8f-47b0-4e47-9716-e9ecc87605b9.maf.txt 51 | TCGA-BLCA/SNV__mutect/TCGA-BL-A13I-01A-11D-A271-08.0e239d8f-47b0-4e47-9716-e9ecc87605b9.maf.txt 52 | TCGA-BLCA/SNV__mutect/TCGA-BL-A13I-01B-04D-A271-08.0e239d8f-47b0-4e47-9716-e9ecc87605b9.maf.txt 53 | TCGA-BLCA/SNV__mutect/TCGA-BL-A13J-01A-11D-A10S-08.0e239d8f-47b0-4e47-9716-e9ecc87605b9.maf.txt 54 | TCGA-BLCA/SNV__mutect/TCGA-BL-A13J-01A-11D-A271-08.0e239d8f-47b0-4e47-9716-e9ecc87605b9.maf.txt 55 | 
TCGA-BLCA/SNV__mutect/TCGA-BL-A13J-01B-04D-A271-08.0e239d8f-47b0-4e47-9716-e9ecc87605b9.maf.txt 56 | TCGA-BLCA/clinical__biospecimen/TCGA-BL-A0C8.57af4f89-af41-46ea-9fb0-e66809f13fa4.txt 57 | TCGA-BLCA/clinical__biospecimen/TCGA-BL-A13I.d14af2a9-9a93-4a41-938f-1e6fc5c07cfa.txt 58 | TCGA-BLCA/clinical__biospecimen/TCGA-BL-A13J.9ab80028-9fbc-43e4-9473-3800dd30a1ac.txt 59 | TCGA-BLCA/clinical__primary/TCGA-BL-A0C8.199989a2-01a1-43bf-97db-f02b704bf617.txt 60 | TCGA-BLCA/clinical__primary/TCGA-BL-A13I.833baeaa-e9f9-4cf9-81c5-471090457680.txt 61 | TCGA-BLCA/clinical__primary/TCGA-BL-A13J.c187fe3e-62a2-4a25-a123-b77a998788ff.txt 62 | TCGA-BLCA/mRNA__counts__FPKM/TCGA-BL-A0C8-01A-11R-A10U-07.20976445-b2a0-45ad-a89a-9273938727c6.txt 63 | TCGA-BLCA/mRNA__counts__FPKM/TCGA-BL-A0C8-01A-11R-A277-07.f832dfd0-f52e-4835-8ae1-144a171e922f.txt 64 | TCGA-BLCA/mRNA__counts__FPKM/TCGA-BL-A0C8-01B-04R-A277-07.46c1de22-9e01-4ce1-88fc-aee293424302.txt 65 | TCGA-BLCA/mRNA__counts__FPKM/TCGA-BL-A13I-01A-11R-A13Y-07.8c6ffe1d-a2c4-4c94-a8b9-b8c5936223e7.txt 66 | TCGA-BLCA/mRNA__counts__FPKM/TCGA-BL-A13I-01A-11R-A277-07.62981652-048b-4a40-8687-736024c7c0a1.txt 67 | TCGA-BLCA/mRNA__counts__FPKM/TCGA-BL-A13I-01B-04R-A277-07.d4ce3f7a-633a-4d24-95b5-953b2bfd1e9f.txt 68 | TCGA-BLCA/mRNA__counts__FPKM/TCGA-BL-A13J-01A-11R-A10U-07.76530c44-7357-4193-bada-a4eea20e79fe.txt 69 | TCGA-BLCA/mRNA__counts__FPKM/TCGA-BL-A13J-01A-11R-A277-07.e5b4565c-9e79-4226-b106-29cf7f8f4a94.txt 70 | TCGA-BLCA/mRNA__counts__FPKM/TCGA-BL-A13J-01B-04R-A277-07.96ee0893-21d0-439f-af4a-9f40102483d6.txt 71 | TCGA-BLCA/mRNA__counts__FPKM/TCGA-BL-A13J-11A-13R-A10U-07.48566693-0845-44b2-8612-4f713fee1bd7.txt 72 | TCGA-BLCA/mRNA__geneExpNormed__FPKM/TCGA-BL-A0C8-01A-11R-A10U-07.3d2db206-336e-4f2b-9109-f0b14460f6f3.txt 73 | TCGA-BLCA/mRNA__geneExpNormed__FPKM/TCGA-BL-A0C8-01A-11R-A277-07.a69d254e-5222-4863-bbc2-5614e47504e8.txt 74 | TCGA-BLCA/mRNA__geneExpNormed__FPKM/TCGA-BL-A0C8-01B-04R-A277-07.8290f9c4-6cb2-4e6a-bee3-2e0d26818ee2.txt 75 | TCGA-BLCA/mRNA__geneExpNormed__FPKM/TCGA-BL-A13I-01A-11R-A13Y-07.c257ac08-6ed2-4e7f-8733-d946be1c6d5c.txt 76 | TCGA-BLCA/mRNA__geneExpNormed__FPKM/TCGA-BL-A13I-01A-11R-A277-07.901fa73e-da21-4f2c-9289-6d1240eefc9d.txt 77 | TCGA-BLCA/mRNA__geneExpNormed__FPKM/TCGA-BL-A13I-01B-04R-A277-07.b1896668-90fc-407f-a53c-56c6510ad16e.txt 78 | TCGA-BLCA/mRNA__geneExpNormed__FPKM/TCGA-BL-A13J-01A-11R-A10U-07.827bacb7-49c2-4ce9-9cf5-6d21d290aec8.txt 79 | TCGA-BLCA/mRNA__geneExpNormed__FPKM/TCGA-BL-A13J-01A-11R-A277-07.1342191b-e6b6-4a1e-b189-05b1d4e79474.txt 80 | TCGA-BLCA/mRNA__geneExpNormed__FPKM/TCGA-BL-A13J-01B-04R-A277-07.48cbddd1-461d-4d07-bfea-237fb7c295ff.txt 81 | TCGA-BLCA/mRNA__geneExpNormed__FPKM/TCGA-BL-A13J-11A-13R-A10U-07.3a460d44-54ce-49dc-8cb7-39071f42e696.txt 82 | TCGA-BLCA/mRNA__geneExp__FPKM/TCGA-BL-A0C8-01A-11R-A10U-07.0a360b17-8286-46d5-9379-211a1a90c546.txt 83 | TCGA-BLCA/mRNA__geneExp__FPKM/TCGA-BL-A0C8-01A-11R-A277-07.d638b334-c28d-47c8-8c55-a7d0d00918b7.txt 84 | TCGA-BLCA/mRNA__geneExp__FPKM/TCGA-BL-A0C8-01B-04R-A277-07.936c0db4-af45-4a2b-891a-dcde4a3897ed.txt 85 | TCGA-BLCA/mRNA__geneExp__FPKM/TCGA-BL-A13I-01A-11R-A13Y-07.a5134a99-7364-4164-bfaa-4867a9cb54df.txt 86 | TCGA-BLCA/mRNA__geneExp__FPKM/TCGA-BL-A13I-01A-11R-A277-07.1edc7409-2d42-4d3e-89c8-3499bc364323.txt 87 | TCGA-BLCA/mRNA__geneExp__FPKM/TCGA-BL-A13I-01B-04R-A277-07.6e6131cf-fdea-452b-9b65-49d334a097e8.txt 88 | TCGA-BLCA/mRNA__geneExp__FPKM/TCGA-BL-A13J-01A-11R-A10U-07.5b902084-dcfc-4b37-a411-4a1db76ed61d.txt 89 | 
TCGA-BLCA/mRNA__geneExp__FPKM/TCGA-BL-A13J-01A-11R-A277-07.30f7c07d-32f3-4148-9309-21a193bae631.txt 90 | TCGA-BLCA/mRNA__geneExp__FPKM/TCGA-BL-A13J-01B-04R-A277-07.e8e1abd1-9fa2-4d5b-899c-0fba5ef14ef1.txt 91 | TCGA-BLCA/mRNA__geneExp__FPKM/TCGA-BL-A13J-11A-13R-A10U-07.277e49dd-ae87-4b6a-91f2-121d47c6f330.txt 92 | TCGA-BLCA/methylation__HM450/TCGA-BL-A0C8-01A-11D-A10W-05.f0aa1e07-3c84-4ace-a078-e398c21ab1fc.data.txt 93 | TCGA-BLCA/methylation__HM450/TCGA-BL-A0C8-01A-11D-A276-05.f26962a4-1d81-4674-9d74-c256a125aa46.data.txt 94 | TCGA-BLCA/methylation__HM450/TCGA-BL-A0C8-01B-04D-A276-05.4121d775-5546-4493-9b26-4fe1fd420469.data.txt 95 | TCGA-BLCA/methylation__HM450/TCGA-BL-A13I-01A-11D-A13Z-05.47c9b2e8-50fe-4433-8c04-fb0461db2509.data.txt 96 | TCGA-BLCA/methylation__HM450/TCGA-BL-A13I-01A-11D-A276-05.2e6821ed-8e54-4b8f-8a3e-581bb8eae81a.data.txt 97 | TCGA-BLCA/methylation__HM450/TCGA-BL-A13I-01B-04D-A276-05.79e66bef-80a0-42c8-8e35-00687018b4b0.data.txt 98 | TCGA-BLCA/methylation__HM450/TCGA-BL-A13J-01A-11D-A10W-05.cf119c4a-09a7-43ba-8264-7ff34bda6287.data.txt 99 | TCGA-BLCA/methylation__HM450/TCGA-BL-A13J-01A-11D-A276-05.ec0334f7-076e-4382-ba6f-d5860388d634.data.txt 100 | TCGA-BLCA/methylation__HM450/TCGA-BL-A13J-01B-04D-A276-05.8e2df3f9-f647-4438-b4a9-ac044f961d8c.data.txt 101 | TCGA-BLCA/methylation__HM450/TCGA-BL-A13J-11A-13D-A10W-05.416a338a-4d07-4134-be96-a660483f4d05.data.txt 102 | TCGA-BLCA/miR__geneExp/TCGA-BL-A0C8-01A-11R-A10V-13.a54c3dd8-04b8-492d-806a-0ec3f5e29f40.txt 103 | TCGA-BLCA/miR__geneExp/TCGA-BL-A0C8-01A-11R-A27D-13.878636a2-91d6-4a73-8339-a78e44f22899.txt 104 | TCGA-BLCA/miR__geneExp/TCGA-BL-A0C8-01B-04R-A27D-13.63cc8fcd-32cd-44ac-bca6-6eae811fe7a4.txt 105 | TCGA-BLCA/miR__geneExp/TCGA-BL-A13I-01A-11R-A13X-13.a00d9ac6-ecd9-4b35-b171-e1363643a025.txt 106 | TCGA-BLCA/miR__geneExp/TCGA-BL-A13I-01A-11R-A27D-13.806aa36e-dcc8-4e50-a38d-1f6319e857a0.txt 107 | TCGA-BLCA/miR__geneExp/TCGA-BL-A13I-01B-04R-A27D-13.8fe18179-5eab-499d-ad4e-5a7cda8238fa.txt 108 | TCGA-BLCA/miR__geneExp/TCGA-BL-A13J-01A-11R-A10V-13.219cb237-96f7-44f3-a936-58f0a66a6a91.txt 109 | TCGA-BLCA/miR__geneExp/TCGA-BL-A13J-01A-11R-A27D-13.695eebf7-47f4-40e0-9aec-1050079c7bc3.txt 110 | TCGA-BLCA/miR__geneExp/TCGA-BL-A13J-01B-04R-A27D-13.77e34cab-d40b-4323-af1e-c866daae99da.txt 111 | TCGA-BLCA/miR__geneExp/TCGA-BL-A13J-11A-13R-A10V-13.d82c671a-6279-4826-a95e-727436e9a5e4.txt 112 | TCGA-BLCA/miR__isoformExp/TCGA-BL-A0C8-01A-11R-A10V-13.3659cb55-02ac-4883-82d2-54b3ed956645.txt 113 | TCGA-BLCA/miR__isoformExp/TCGA-BL-A0C8-01A-11R-A27D-13.3b7dbb8a-3d1e-4bea-9500-8c4fcd5c24ad.txt 114 | TCGA-BLCA/miR__isoformExp/TCGA-BL-A0C8-01B-04R-A27D-13.2cdd1cda-0fd1-47a6-8637-80efc3289948.txt 115 | TCGA-BLCA/miR__isoformExp/TCGA-BL-A13I-01A-11R-A13X-13.581f5126-21c0-4855-98b2-4e20c2830489.txt 116 | TCGA-BLCA/miR__isoformExp/TCGA-BL-A13I-01A-11R-A27D-13.401ead2e-9ce8-4d30-a136-40ea67058940.txt 117 | TCGA-BLCA/miR__isoformExp/TCGA-BL-A13I-01B-04R-A27D-13.7991ab78-8494-4dc1-b78f-9e31da0c26c3.txt 118 | TCGA-BLCA/miR__isoformExp/TCGA-BL-A13J-01A-11R-A10V-13.240ba7e4-909b-4da0-8ba1-9eb67361b0a2.txt 119 | TCGA-BLCA/miR__isoformExp/TCGA-BL-A13J-01A-11R-A27D-13.a9422c90-70bd-4a3e-9fce-7c2160781f00.txt 120 | TCGA-BLCA/miR__isoformExp/TCGA-BL-A13J-01B-04R-A27D-13.dfe88d23-d80a-4a6b-98d6-05ea35670319.txt 121 | TCGA-BLCA/miR__isoformExp/TCGA-BL-A13J-11A-13R-A10V-13.db772d0f-a3c5-47b1-8c37-6fc2d1c5c22b.txt 122 | TCGA-ESCA/CNV__snp6/TCGA-IG-A3YB-01A-11D-A246-01.4d453f15-45ab-4582-a869-84089e5269cf.txt 123 | 
TCGA-ESCA/CNV__snp6/TCGA-IG-A3YB-10A-01D-A246-01.1d89739d-6db8-42f1-8d2d-62a8d765e1ca.txt 124 | TCGA-ESCA/CNV__snp6/TCGA-L5-A4OI-01A-11D-A25X-01.e02da93d-46b8-4687-98c0-fd0e795366bb.txt 125 | TCGA-ESCA/CNV__snp6/TCGA-L5-A4OI-11A-11D-A25X-01.4e9b66b7-0cf9-46cb-846e-6b98c3ee0403.txt 126 | TCGA-ESCA/CNV__unfiltered__snp6/TCGA-IG-A3YB-01A-11D-A246-01.20fc1b49-3b90-4164-a324-db059017fe98.txt 127 | TCGA-ESCA/CNV__unfiltered__snp6/TCGA-IG-A3YB-10A-01D-A246-01.6914e1c3-4616-4c24-acec-de2f7fadaf9f.txt 128 | TCGA-ESCA/CNV__unfiltered__snp6/TCGA-L5-A4OI-01A-11D-A25X-01.0b9a8417-a829-4748-8ad7-e724e5cd376b.txt 129 | TCGA-ESCA/CNV__unfiltered__snp6/TCGA-L5-A4OI-11A-11D-A25X-01.d1de8016-b50c-4244-a573-577c83bb4125.txt 130 | TCGA-ESCA/SNV__mutect/7f8e1e7c-621c-4dfd-8fad-af07c739dbfc.maf.txt 131 | TCGA-ESCA/SNV__mutect/TCGA-IG-A3YB-01A-11D-A247-09.7f8e1e7c-621c-4dfd-8fad-af07c739dbfc.maf.txt 132 | TCGA-ESCA/SNV__mutect/TCGA-L5-A4OI-01A-11D-A27G-09.7f8e1e7c-621c-4dfd-8fad-af07c739dbfc.maf.txt 133 | TCGA-ESCA/clinical__biospecimen/TCGA-IG-A3YB.d9d75eab-ec8d-460f-b30a-0d5e84dc53ec.txt 134 | TCGA-ESCA/clinical__biospecimen/TCGA-L5-A4OI.e7e9477d-2116-484b-8e56-4b72253c8ae6.txt 135 | TCGA-ESCA/clinical__primary/TCGA-IG-A3YB.a2b91d5f-e33c-493e-ad90-e5d84c991135.txt 136 | TCGA-ESCA/clinical__primary/TCGA-L5-A4OI.a9ff011e-2de5-4da0-899f-252bba71a3c9.txt 137 | TCGA-ESCA/mRNA__counts__FPKM/TCGA-IG-A3YB-01A-11R-A36D-31.6001e694-b12d-4338-84dd-7180cac79e15.txt 138 | TCGA-ESCA/mRNA__counts__FPKM/TCGA-L5-A4OI-01A-11R-A36D-31.f1de94f8-4a68-4edd-b4f5-c3548aee6d0f.txt 139 | TCGA-ESCA/mRNA__geneExpNormed__FPKM/TCGA-IG-A3YB-01A-11R-A36D-31.3cf7418d-39a9-4e53-9136-1dbe8df9f1a6.txt 140 | TCGA-ESCA/mRNA__geneExpNormed__FPKM/TCGA-L5-A4OI-01A-11R-A36D-31.86602ac3-747a-477f-86cf-e24a314e6b35.txt 141 | TCGA-ESCA/mRNA__geneExp__FPKM/TCGA-IG-A3YB-01A-11R-A36D-31.e0ed1b91-be78-4e80-9a2a-8faf99ffd78c.txt 142 | TCGA-ESCA/mRNA__geneExp__FPKM/TCGA-L5-A4OI-01A-11R-A36D-31.ca91e70a-07b9-4a00-829a-3f90da996128.txt 143 | TCGA-ESCA/methylation__HM450/TCGA-IG-A3YB-01A-11D-A249-05.c7d718c2-750a-4f65-b3c6-aa3d23ff3947.data.txt 144 | TCGA-ESCA/methylation__HM450/TCGA-L5-A4OI-01A-11D-A265-05.09f09483-ca1b-4cc6-9d4f-784d22471df7.data.txt 145 | TCGA-ESCA/methylation__HM450/TCGA-L5-A4OI-11A-11D-A265-05.20048f6e-2f05-4029-af70-a664411f0fe7.data.txt 146 | TCGA-ESCA/miR__geneExp/TCGA-IG-A3YB-01A-11R-A24L-13.fd1fdfa1-6d90-4981-9c77-90b26b5a7a58.txt 147 | TCGA-ESCA/miR__geneExp/TCGA-IG-A3YB-01A-11R-A360-13.2b314047-9d5a-4c2c-96ef-b075625e160a.txt 148 | TCGA-ESCA/miR__geneExp/TCGA-L5-A4OI-01A-11R-A261-13.c0ae25e4-d0ff-415e-b021-814caed76ac9.txt 149 | TCGA-ESCA/miR__geneExp/TCGA-L5-A4OI-01A-11R-A360-13.e9da7437-c125-4b6e-af43-399e7cd3f170.txt 150 | TCGA-ESCA/miR__isoformExp/TCGA-IG-A3YB-01A-11R-A24L-13.eb8449f5-c2cb-4fe6-a09f-539a916aefc1.txt 151 | TCGA-ESCA/miR__isoformExp/TCGA-IG-A3YB-01A-11R-A360-13.50f5bb90-2804-444a-bdc0-e3a16a189266.txt 152 | TCGA-ESCA/miR__isoformExp/TCGA-L5-A4OI-01A-11R-A261-13.58321d07-1feb-46d3-bc3e-80c15dd99fb7.txt 153 | TCGA-ESCA/miR__isoformExp/TCGA-L5-A4OI-01A-11R-A360-13.1f560941-230e-4624-9c94-f0bba86c4f02.txt 154 | TCGA-SKCM/CNV__snp6/TCGA-D3-A3C7-06A-11D-A194-01.4499085e-e847-4ded-8745-c691a9146ec6.txt 155 | TCGA-SKCM/CNV__snp6/TCGA-D3-A3C7-10A-01D-A195-01.4de3740f-dbc8-48fd-9a65-b828d3ac8a0f.txt 156 | TCGA-SKCM/CNV__snp6/TCGA-EE-A3J8-06A-11D-A20B-01.5ce8aeb4-13cf-4452-92ae-39a8933084cc.txt 157 | TCGA-SKCM/CNV__snp6/TCGA-EE-A3J8-10A-01D-A20B-01.341c6fe3-e313-434e-91cc-dfd85db0ea99.txt 158 | 
TCGA-SKCM/CNV__unfiltered__snp6/TCGA-D3-A3C7-06A-11D-A194-01.95ed7b1f-9290-4a49-be3c-b8b662dcbc61.txt 159 | TCGA-SKCM/CNV__unfiltered__snp6/TCGA-D3-A3C7-10A-01D-A195-01.3962f65c-80d7-42f8-8d4a-73c2933e0182.txt 160 | TCGA-SKCM/CNV__unfiltered__snp6/TCGA-EE-A3J8-06A-11D-A20B-01.9dc96942-3de0-4912-932c-794ef79b5559.txt 161 | TCGA-SKCM/CNV__unfiltered__snp6/TCGA-EE-A3J8-10A-01D-A20B-01.8fabadc4-f5ba-4ec7-bbdf-ce2fedcd7ebd.txt 162 | TCGA-SKCM/SNV__mutect/4b7a5729-b83e-4837-9b61-a6002dce1c0a.maf.txt 163 | TCGA-SKCM/SNV__mutect/TCGA-D3-A3C7-06A-11D-A196-08.4b7a5729-b83e-4837-9b61-a6002dce1c0a.maf.txt 164 | TCGA-SKCM/SNV__mutect/TCGA-EE-A3J8-06A-11D-A20D-08.4b7a5729-b83e-4837-9b61-a6002dce1c0a.maf.txt 165 | TCGA-SKCM/clinical__biospecimen/TCGA-D3-A3C7.737c7d98-0b0b-4213-9048-edaec65bc68a.txt 166 | TCGA-SKCM/clinical__biospecimen/TCGA-EE-A3J8.93ea3c92-a345-4022-8aef-414f9ed4cd86.txt 167 | TCGA-SKCM/clinical__primary/TCGA-D3-A3C7.e19cfe7e-d15f-404c-9fee-517401d0690b.txt 168 | TCGA-SKCM/clinical__primary/TCGA-EE-A3J8.e7bf7546-91bc-4a3e-bfe6-b75d4cff924f.txt 169 | TCGA-SKCM/mRNA__counts__FPKM/TCGA-D3-A3C7-06A-11R-A18U-07.778eed3d-02ad-43c3-8308-0c4823922c39.txt 170 | TCGA-SKCM/mRNA__counts__FPKM/TCGA-EE-A3J8-06A-11R-A20F-07.9c488510-a229-45e9-b708-0b666c257dbc.txt 171 | TCGA-SKCM/mRNA__geneExpNormed__FPKM/TCGA-D3-A3C7-06A-11R-A18U-07.c7992525-cdee-49fe-9b24-78db03a6c58d.txt 172 | TCGA-SKCM/mRNA__geneExpNormed__FPKM/TCGA-EE-A3J8-06A-11R-A20F-07.abc1a5a6-cd0b-450c-a650-dfe14fdb356b.txt 173 | TCGA-SKCM/mRNA__geneExp__FPKM/TCGA-D3-A3C7-06A-11R-A18U-07.28cec425-f067-4008-9aa0-1a5cd689ff4f.txt 174 | TCGA-SKCM/mRNA__geneExp__FPKM/TCGA-EE-A3J8-06A-11R-A20F-07.662e5a74-9217-413e-8321-07951d802b8a.txt 175 | TCGA-SKCM/methylation__HM450/TCGA-D3-A3C7-06A-11D-A19B-05.2db5c7cc-25f8-4d93-991f-173b8704cb14.data.txt 176 | TCGA-SKCM/methylation__HM450/TCGA-EE-A3J8-06A-11D-A211-05.342836f4-b506-4bf8-a1f6-949ce9cb17dc.data.txt 177 | TCGA-SKCM/miR__geneExp/TCGA-D3-A3C7-06A-11R-A18X-13.d01dd285-b52e-4d28-a9d5-b771794f52ee.txt 178 | TCGA-SKCM/miR__geneExp/TCGA-EE-A3J8-06A-11R-A20E-13.abd6fb9f-b2d2-4391-8ae3-3bf2b67b83be.txt 179 | TCGA-SKCM/miR__isoformExp/TCGA-D3-A3C7-06A-11R-A18X-13.a2913b33-56c9-423f-aaf7-67f660826c98.txt 180 | TCGA-SKCM/miR__isoformExp/TCGA-EE-A3J8-06A-11R-A20E-13.0147e886-71af-4abe-a2cd-fac1bcf67aab.txt 181 | -------------------------------------------------------------------------------- /tests/baselines/legacy-files.txt: -------------------------------------------------------------------------------- 1 | TCGA-SKCM/Clinical/Clinical_Supplement/nationwidechildrens.org_clinical.TCGA-D3-A3C7.xml 2 | TCGA-SKCM/Clinical/Clinical_Supplement/nationwidechildrens.org_clinical.TCGA-D3-A3C7.xml.md5 3 | TCGA-SKCM/Clinical/Clinical_data/nationwidechildrens.org_clinical_drug_skcm.txt 4 | TCGA-SKCM/Clinical/Clinical_data/nationwidechildrens.org_clinical_drug_skcm.txt.md5 5 | TCGA-SKCM/Clinical/Clinical_data/nationwidechildrens.org_clinical_follow_up_v2.0_skcm.txt 6 | TCGA-SKCM/Clinical/Clinical_data/nationwidechildrens.org_clinical_follow_up_v2.0_skcm.txt.md5 7 | TCGA-SKCM/Clinical/Clinical_data/nationwidechildrens.org_clinical_patient_skcm.txt 8 | TCGA-SKCM/Clinical/Clinical_data/nationwidechildrens.org_clinical_patient_skcm.txt.md5 9 | TCGA-SKCM/Clinical/Pathology_report/TCGA-D3-A3C7.9FABC766-7052-4352-8DC7-5160C9D768AB.pdf 10 | TCGA-SKCM/Clinical/Pathology_report/TCGA-D3-A3C7.9FABC766-7052-4352-8DC7-5160C9D768AB.pdf.md5 11 | 
TCGA-SKCM/Clinical/Tissue_slide_image/TCGA-D3-A3C7-06A-01-TSA.BD173A32-627A-4926-B505-16D1BFD0233D.svs 12 | TCGA-SKCM/Clinical/Tissue_slide_image/TCGA-D3-A3C7-06A-01-TSA.BD173A32-627A-4926-B505-16D1BFD0233D.svs.md5 13 | TCGA-SKCM/Gene_expression/Exon_junction_quantification/unc.edu.75838292-c5fe-45f1-8fa5-94403cfc3bdf.1234280.junction_quantification.txt 14 | TCGA-SKCM/Gene_expression/Exon_junction_quantification/unc.edu.75838292-c5fe-45f1-8fa5-94403cfc3bdf.1234280.junction_quantification.txt.md5 15 | TCGA-SKCM/Gene_expression/Exon_quantification/unc.edu.75838292-c5fe-45f1-8fa5-94403cfc3bdf.1758520.bt.exon_quantification.txt 16 | TCGA-SKCM/Gene_expression/Exon_quantification/unc.edu.75838292-c5fe-45f1-8fa5-94403cfc3bdf.1758520.bt.exon_quantification.txt.md5 17 | TCGA-SKCM/Gene_expression/Gene_expression_quantification/unc.edu.75838292-c5fe-45f1-8fa5-94403cfc3bdf.1242913.rsem.genes.results 18 | TCGA-SKCM/Gene_expression/Gene_expression_quantification/unc.edu.75838292-c5fe-45f1-8fa5-94403cfc3bdf.1242913.rsem.genes.results.md5 19 | TCGA-SKCM/Gene_expression/Gene_expression_quantification/unc.edu.75838292-c5fe-45f1-8fa5-94403cfc3bdf.1243035.rsem.genes.normalized_results 20 | TCGA-SKCM/Gene_expression/Gene_expression_quantification/unc.edu.75838292-c5fe-45f1-8fa5-94403cfc3bdf.1243035.rsem.genes.normalized_results.md5 21 | TCGA-SKCM/Gene_expression/Isoform_expression_quantification/unc.edu.75838292-c5fe-45f1-8fa5-94403cfc3bdf.1242915.rsem.isoforms.results 22 | TCGA-SKCM/Gene_expression/Isoform_expression_quantification/unc.edu.75838292-c5fe-45f1-8fa5-94403cfc3bdf.1242915.rsem.isoforms.results.md5 23 | TCGA-SKCM/Gene_expression/Isoform_expression_quantification/unc.edu.75838292-c5fe-45f1-8fa5-94403cfc3bdf.1243040.rsem.isoforms.normalized_results 24 | TCGA-SKCM/Gene_expression/Isoform_expression_quantification/unc.edu.75838292-c5fe-45f1-8fa5-94403cfc3bdf.1243040.rsem.isoforms.normalized_results.md5 25 | TCGA-SKCM/Gene_expression/miRNA_gene_quantification/TCGA-D3-A3C7-06A-11R-A18X-13.hg19.mirbase20.mirna.quantification.txt 26 | TCGA-SKCM/Gene_expression/miRNA_gene_quantification/TCGA-D3-A3C7-06A-11R-A18X-13.hg19.mirbase20.mirna.quantification.txt.md5 27 | TCGA-SKCM/Gene_expression/miRNA_gene_quantification/TCGA-D3-A3C7-06A-11R-A18X-13.mirna.quantification.txt 28 | TCGA-SKCM/Gene_expression/miRNA_gene_quantification/TCGA-D3-A3C7-06A-11R-A18X-13.mirna.quantification.txt.md5 29 | TCGA-SKCM/Gene_expression/miRNA_isoform_quantification/TCGA-D3-A3C7-06A-11R-A18X-13.hg19.mirbase20.isoform.quantification.txt 30 | TCGA-SKCM/Gene_expression/miRNA_isoform_quantification/TCGA-D3-A3C7-06A-11R-A18X-13.hg19.mirbase20.isoform.quantification.txt.md5 31 | TCGA-SKCM/Gene_expression/miRNA_isoform_quantification/TCGA-D3-A3C7-06A-11R-A18X-13.isoform.quantification.txt 32 | TCGA-SKCM/Gene_expression/miRNA_isoform_quantification/TCGA-D3-A3C7-06A-11R-A18X-13.isoform.quantification.txt.md5 33 | TCGA-SKCM/Simple_nucleotide_variation/Simple_somatic_mutation/SKCM_pairs.aggregated.capture.tcga.uuid.automated.somatic.maf 34 | TCGA-SKCM/Simple_nucleotide_variation/Simple_somatic_mutation/SKCM_pairs.aggregated.capture.tcga.uuid.automated.somatic.maf.md5 35 | -------------------------------------------------------------------------------- /tests/baselines/legacy-md5sums.txt: -------------------------------------------------------------------------------- 1 | 0109bfa5f3c27c24bf2f3a387a6fe559 TCGA-SKCM/Gene_expression/Exon_junction_quantification/unc.edu.75838292-c5fe-45f1-8fa5-94403cfc3bdf.1234280.junction_quantification.txt 2 | 
056c45416811d5a55553f0fa506419c0 TCGA-SKCM/Gene_expression/Isoform_expression_quantification/unc.edu.75838292-c5fe-45f1-8fa5-94403cfc3bdf.1242915.rsem.isoforms.results.md5 3 | 07b816ef44b1633b82c5f592e5fde186 TCGA-SKCM/Clinical/Clinical_data/nationwidechildrens.org_clinical_follow_up_v2.0_skcm.txt 4 | 141681f0dd93e3cbdb8d23e8a98db52f TCGA-SKCM/Clinical/Tissue_slide_image/TCGA-D3-A3C7-06A-01-TSA.BD173A32-627A-4926-B505-16D1BFD0233D.svs.md5 5 | 1baaf02497a68d9285bde98e02c212bd TCGA-SKCM/Clinical/Clinical_data/nationwidechildrens.org_clinical_patient_skcm.txt 6 | 1de1dba64b2b7b1910dafeefa3930ead TCGA-SKCM/Gene_expression/Isoform_expression_quantification/unc.edu.75838292-c5fe-45f1-8fa5-94403cfc3bdf.1242915.rsem.isoforms.results 7 | 2588198e8ab9b8933752278e8e8c20b2 TCGA-SKCM/Gene_expression/miRNA_gene_quantification/TCGA-D3-A3C7-06A-11R-A18X-13.hg19.mirbase20.mirna.quantification.txt.md5 8 | 31a35dc11fc65469077f1a5bae690ae9 TCGA-SKCM/Gene_expression/Gene_expression_quantification/unc.edu.75838292-c5fe-45f1-8fa5-94403cfc3bdf.1243035.rsem.genes.normalized_results 9 | 4a71658a4d93e9d1ca6b25325051836c TCGA-SKCM/Clinical/Clinical_data/nationwidechildrens.org_clinical_drug_skcm.txt 10 | 507b75d7deba9604f895732dbd9b7b9b TCGA-SKCM/Gene_expression/miRNA_isoform_quantification/TCGA-D3-A3C7-06A-11R-A18X-13.hg19.mirbase20.isoform.quantification.txt.md5 11 | 57ae0657ad582786972aaab02517a229 TCGA-SKCM/Gene_expression/Isoform_expression_quantification/unc.edu.75838292-c5fe-45f1-8fa5-94403cfc3bdf.1243040.rsem.isoforms.normalized_results 12 | 65536eb820cc5fc67060ab8d4dc11621 TCGA-SKCM/Gene_expression/miRNA_gene_quantification/TCGA-D3-A3C7-06A-11R-A18X-13.mirna.quantification.txt 13 | 6e0de69e1a9aa65e3aa69c41cac045cd TCGA-SKCM/Gene_expression/Exon_quantification/unc.edu.75838292-c5fe-45f1-8fa5-94403cfc3bdf.1758520.bt.exon_quantification.txt.md5 14 | 72c0514fb9e89f27ad363ae72a25708d TCGA-SKCM/Simple_nucleotide_variation/Simple_somatic_mutation/SKCM_pairs.aggregated.capture.tcga.uuid.automated.somatic.maf 15 | 76f1c75823836d5e79371150a8414d2a TCGA-SKCM/Clinical/Clinical_data/nationwidechildrens.org_clinical_follow_up_v2.0_skcm.txt.md5 16 | 8b2ce5ab8ffd433c2adcea378fd36aa2 TCGA-SKCM/Gene_expression/miRNA_isoform_quantification/TCGA-D3-A3C7-06A-11R-A18X-13.isoform.quantification.txt.md5 17 | 97e71534ba4b2606166564dd7ee4c99a TCGA-SKCM/Gene_expression/Gene_expression_quantification/unc.edu.75838292-c5fe-45f1-8fa5-94403cfc3bdf.1242913.rsem.genes.results.md5 18 | 9ac580afea66c360752cf583fd0874e0 TCGA-SKCM/Gene_expression/Gene_expression_quantification/unc.edu.75838292-c5fe-45f1-8fa5-94403cfc3bdf.1243035.rsem.genes.normalized_results.md5 19 | 9de175ab22e0dc06beb4a02d04778e78 TCGA-SKCM/Gene_expression/miRNA_gene_quantification/TCGA-D3-A3C7-06A-11R-A18X-13.hg19.mirbase20.mirna.quantification.txt 20 | 9ecb50eab7d3555bd646ecfcbd5bb4d0 TCGA-SKCM/Clinical/Clinical_data/nationwidechildrens.org_clinical_drug_skcm.txt.md5 21 | aa34383d363e933ee6dbe67e1bb3ce2f TCGA-SKCM/Gene_expression/Exon_junction_quantification/unc.edu.75838292-c5fe-45f1-8fa5-94403cfc3bdf.1234280.junction_quantification.txt.md5 22 | b08d6709566f593b7e805f6ba4c405eb TCGA-SKCM/Clinical/Clinical_Supplement/nationwidechildrens.org_clinical.TCGA-D3-A3C7.xml.md5 23 | b277e504d71628bbb341bca637e22309 TCGA-SKCM/Gene_expression/Gene_expression_quantification/unc.edu.75838292-c5fe-45f1-8fa5-94403cfc3bdf.1242913.rsem.genes.results 24 | be90b7675bceb0f8e37362b415814482 TCGA-SKCM/Clinical/Pathology_report/TCGA-D3-A3C7.9FABC766-7052-4352-8DC7-5160C9D768AB.pdf 25 | 
c504dc4aef1e9f643a22dee5a216172f TCGA-SKCM/Gene_expression/miRNA_isoform_quantification/TCGA-D3-A3C7-06A-11R-A18X-13.isoform.quantification.txt 26 | cc69384e7e7b953a7c7b7b2b4d76541d TCGA-SKCM/Clinical/Clinical_data/nationwidechildrens.org_clinical_patient_skcm.txt.md5 27 | ccd08985c07006bbc4e1640de71aa170 TCGA-SKCM/Simple_nucleotide_variation/Simple_somatic_mutation/SKCM_pairs.aggregated.capture.tcga.uuid.automated.somatic.maf.md5 28 | ccefa01e70ddbc438abe85349f7ae97f TCGA-SKCM/Clinical/Tissue_slide_image/TCGA-D3-A3C7-06A-01-TSA.BD173A32-627A-4926-B505-16D1BFD0233D.svs 29 | d6d0b253d43ae48e0abb7f210dc9d325 TCGA-SKCM/Gene_expression/Isoform_expression_quantification/unc.edu.75838292-c5fe-45f1-8fa5-94403cfc3bdf.1243040.rsem.isoforms.normalized_results.md5 30 | dcd4761c5fbc6f60c0c2beba6b9074c2 TCGA-SKCM/Gene_expression/miRNA_isoform_quantification/TCGA-D3-A3C7-06A-11R-A18X-13.hg19.mirbase20.isoform.quantification.txt 31 | dcd586096e82eb95f15dce56030d8d02 TCGA-SKCM/Gene_expression/miRNA_gene_quantification/TCGA-D3-A3C7-06A-11R-A18X-13.mirna.quantification.txt.md5 32 | df17980f7da9b5060c8f25d6a5bbe353 TCGA-SKCM/Clinical/Clinical_Supplement/nationwidechildrens.org_clinical.TCGA-D3-A3C7.xml 33 | df9f4a42cb42ef1ee0fc8935150e724f TCGA-SKCM/Gene_expression/Exon_quantification/unc.edu.75838292-c5fe-45f1-8fa5-94403cfc3bdf.1758520.bt.exon_quantification.txt 34 | f5c3ebcce3c205dbb68f2d923252e5b4 TCGA-SKCM/Clinical/Pathology_report/TCGA-D3-A3C7.9FABC766-7052-4352-8DC7-5160C9D768AB.pdf.md5 35 | -------------------------------------------------------------------------------- /tests/baselines/load-files.txt: -------------------------------------------------------------------------------- 1 | ./TCGA-ACC.Sample.loadfile.txt 2 | ./TCGA-ACC.Sample_Set.loadfile.txt 3 | ./TCGA-ACCSKCM.Sample.loadfile.txt 4 | ./TCGA-ACCSKCM.Sample_Set.loadfile.txt 5 | ./TCGA-BLCA.Sample.loadfile.txt 6 | ./TCGA-BLCA.Sample_Set.loadfile.txt 7 | ./TCGA-ESCA.Sample.loadfile.txt 8 | ./TCGA-ESCA.Sample_Set.loadfile.txt 9 | ./TCGA-SKCM.Sample.loadfile.txt 10 | ./TCGA-SKCM.Sample_Set.loadfile.txt 11 | -------------------------------------------------------------------------------- /tests/baselines/load-md5sums-google.txt: -------------------------------------------------------------------------------- 1 | 99d22b3a0fd36610537150c5ca6dc4e3 load-munged.txt 2 | -------------------------------------------------------------------------------- /tests/baselines/load-md5sums.txt: -------------------------------------------------------------------------------- 1 | 7ff91dba7d89b5150a9fea2ad26967b9 load-munged.txt 2 | -------------------------------------------------------------------------------- /tests/baselines/misctests.txt: -------------------------------------------------------------------------------- 1 | All projects in TCGA program:['"TCGA-ACC"', '"TCGA-BLCA"', '"TCGA-BRCA"', '"TCGA-CESC"', '"TCGA-CHOL"', '"TCGA-COAD"', '"TCGA-DLBC"', '"TCGA-ESCA"', '"TCGA-GBM"', '"TCGA-HNSC"', '"TCGA-KICH"', '"TCGA-KIRC"', '"TCGA-KIRP"', '"TCGA-LAML"', '"TCGA-LGG"', '"TCGA-LIHC"', '"TCGA-LUAD"', '"TCGA-LUSC"', '"TCGA-MESO"', '"TCGA-OV"', '"TCGA-PAAD"', '"TCGA-PCPG"', '"TCGA-PRAD"', '"TCGA-READ"', '"TCGA-SARC"', '"TCGA-SKCM"', '"TCGA-STAD"', '"TCGA-TGCT"', '"TCGA-THCA"', '"TCGA-THYM"', '"TCGA-UCEC"', '"TCGA-UCS"', '"TCGA-UVM"'] 2 | -------------------------------------------------------------------------------- /tests/baselines/onlycases-files.txt: -------------------------------------------------------------------------------- 1 | 
TCGA-SKCM/Biospecimen/Biospecimen_Supplement/nationwidechildrens.org_biospecimen.TCGA-EE-A3J8.0dc0eb62-2491-492b-9542-13fbe90e4d99.xml 2 | TCGA-SKCM/Biospecimen/Biospecimen_Supplement/nationwidechildrens.org_biospecimen.TCGA-EE-A3J8.0dc0eb62-2491-492b-9542-13fbe90e4d99.xml.md5 3 | TCGA-SKCM/Biospecimen/Biospecimen_Supplement/nationwidechildrens.org_ssf.TCGA-EE-A3J8.93ea3c92-a345-4022-8aef-414f9ed4cd86.xml 4 | TCGA-SKCM/Biospecimen/Biospecimen_Supplement/nationwidechildrens.org_ssf.TCGA-EE-A3J8.93ea3c92-a345-4022-8aef-414f9ed4cd86.xml.md5 5 | -------------------------------------------------------------------------------- /tests/baselines/report-files.txt: -------------------------------------------------------------------------------- 1 | ./Blacklisted_Samples.html 2 | ./Redactions.html 3 | ./Replicate_Samples.html 4 | ./TCGA-ACC.html 5 | ./TCGA-ACCSKCM.html 6 | ./TCGA-ACCSKCM_Blacklisted_Samples.html 7 | ./TCGA-ACCSKCM_Redactions.html 8 | ./TCGA-ACCSKCM_Replicate_Samples.html 9 | ./TCGA-ACC_Blacklisted_Samples.html 10 | ./TCGA-ACC_Redactions.html 11 | ./TCGA-ACC_Replicate_Samples.html 12 | ./TCGA-BLCA.html 13 | ./TCGA-BLCA_Blacklisted_Samples.html 14 | ./TCGA-BLCA_Redactions.html 15 | ./TCGA-BLCA_Replicate_Samples.html 16 | ./TCGA-ESCA.html 17 | ./TCGA-ESCA_Blacklisted_Samples.html 18 | ./TCGA-ESCA_Redactions.html 19 | ./TCGA-ESCA_Replicate_Samples.html 20 | ./TCGA-SKCM.html 21 | ./TCGA-SKCM_Blacklisted_Samples.html 22 | ./TCGA-SKCM_Redactions.html 23 | ./TCGA-SKCM_Replicate_Samples.html 24 | ./index.html 25 | -------------------------------------------------------------------------------- /tests/config/blacklist.tsv: -------------------------------------------------------------------------------- 1 | TCGA ID Tumor Type Blacklist Reason 2 | -------------------------------------------------------------------------------- /tests/legacy.cfg: -------------------------------------------------------------------------------- 1 | 2 | # Note that the LEGACY value in the [mirror] section is a boolean, and will 3 | # evaluate to true when set (case-insensitively) to any of: 1, yes, true, on 4 | # If --LEGACY CLI flag is specified, however, it will override config file 5 | 6 | [DEFAULT] 7 | ROOT_DIR: ./legacy 8 | LOG_DIR: %(ROOT_DIR)s/logs 9 | REFERENCE_DIR: ./reference 10 | DATESTAMPS: %(ROOT_DIR)s/datestamps.txt 11 | CASES: TCGA-D3-A3C7 12 | 13 | [mirror] 14 | DIR: %(ROOT_DIR)s/mirror 15 | LEGACY: yes 16 | CATEGORIES: Clinical, Simple nucleotide variation, Gene expression 17 | 18 | # The dice and loadfiles sections are here to enable the regression 19 | # tests to prove that ONLY mirroring of legacy data is supported 20 | [dice] 21 | DIR: %(ROOT_DIR)s/dice 22 | 23 | [loadfile] 24 | DIR: %(ROOT_DIR)s/loadfiles 25 | -------------------------------------------------------------------------------- /tests/misctests.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import print_function 3 | import gdctools.lib.api as api 4 | import json 5 | 6 | projects = [ json.dumps(s) for s in api.get_projects('TCGA')] 7 | print('All projects in TCGA program:{}'.format(projects)) 8 | -------------------------------------------------------------------------------- /tests/onlycases.cfg: -------------------------------------------------------------------------------- 1 | 2 | # This config file is used by the GDCtools test_cases regression test. 
It limits 3 | # the set of data downloaded/mirrored to a single category and case, and shows 4 | # how sections within a config file can be mixed & matched: here we show how the 5 | # CASES config variable is inherited by the mirror tool, while the mirror tool 6 | # ALSO sets the CATEGORIES variable explicitly. Such variables can also be 7 | # overridden from the command line, which has the highest precedence of all. 8 | # [see GDCtool::config_finalize() for an exhaustive discussion of precedence] 9 | # That fact is demonstrated here: the --categories Biospecimen flag given in the 10 | # Makefile takes precedence over the DNA Methylation category defined below. 11 | 12 | [DEFAULT] 13 | ROOT_DIR: ./onlycases 14 | LOG_DIR: %(ROOT_DIR)s/logs 15 | REFERENCE_DIR: ./reference 16 | DATESTAMPS: %(ROOT_DIR)s/datestamps.txt 17 | CASES: TCGA-EE-A3J8 18 | 19 | [mirror] 20 | DIR: %(ROOT_DIR)s/mirror 21 | CATEGORIES: DNA Methylation 22 | -------------------------------------------------------------------------------- /tests/tcgaSmoketest.cfg: -------------------------------------------------------------------------------- 1 | # NOTE: this .cfg is intended for use as a local smoke test, and is NOT intended 2 | # for production use. See config/tcga.cfg for a production config file (and 3 | # more description of each config file section). Finally, note that to simplify 4 | # and hasten the tests, we choose only a handful of cases from each of the ACC, 5 | # ESCA, LAML, SKCM, and BLCA cohorts, spanning blood tumor (TB), metastatic (TM) 6 | # and primary tumor (TP) samples; the case ids below are grouped by cohort. 7 | 8 | [DEFAULT] 9 | ROOT_DIR: ../../gdctools-test-sandbox 10 | LOG_DIR: %(ROOT_DIR)s/logs 11 | REFERENCE_DIR: ../gdctools/reference 12 | PROGRAMS: TCGA 13 | DATESTAMPS: %(ROOT_DIR)s/datestamps.txt 14 | PROJECTS: TCGA-ACC, TCGA-LAML, TCGA-SKCM, TCGA-ESCA, TCGA-BLCA 15 | CASES: TCGA-OR-A5L1,TCGA-OR-A5K2,TCGA-IG-A3YB,TCGA-L5-A4OI,TCGA-AB-2959,TCGA-AB-2850,TCGA-D3-A3C7,TCGA-EE-A3J8,TCGA-BL-A0C8,TCGA-BL-A13I,TCGA-BL-A13J 16 | 17 | [mirror] 18 | DIR: %(ROOT_DIR)s/mirror 19 | 20 | [dice] 21 | DIR: %(ROOT_DIR)s/dice 22 | 23 | [loadfile] 24 | DIR: %(ROOT_DIR)s/loadfiles 25 | FILTERED_SAMPLES: %(ROOT_DIR)s/loadfiles/filtered_samples.txt 26 | 27 | [report] 28 | DIR: %(ROOT_DIR)s/reports 29 | HEATMAPS_DIR: %(ROOT_DIR)s/heatmaps 30 | REDACTIONS_DIR: %(ROOT_DIR)s/redactions 31 | BLACKLIST: %(ROOT_DIR)s/config/blacklist.tsv 32 | 33 | [aggregates] 34 | TCGA-ACCSKCM: TCGA-ACC,TCGA-SKCM 35 | -------------------------------------------------------------------------------- /tests/test_lock_context.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from gdctools.lib.common import lock_context, init_logging 3 | import time 4 | import logging 5 | 6 | init_logging() 7 | 8 | with lock_context("gdc_mirror_root", "mirror"): 9 |     logging.info("Doing work..") 10 |     time.sleep(5) -------------------------------------------------------------------------------- /tests/testchoose.py: -------------------------------------------------------------------------------- 1 | 2 | # Regression test to ensure that our aliquot selector algorithm is working 3 | # properly; it is needed when a case contains multiple aliquots for a given 4 | # datatype, and presently operates only for TCGA data. 
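# For readers unfamiliar with TCGA aliquot barcodes: each ID below splits into
# dash-delimited fields, e.g.
#   'TCGA-A6-5656-01A-21R-2338-13'.split('-')
#       -> ['TCGA', 'A6', '5656', '01A', '21R', '2338', '13']
# i.e. program, tissue source site, participant, sample+vial (01A), portion plus
# a one-letter analyte code (21R), plate (2338), and center (13). The per-group
# comments below ("H > R", "D > W", "higher plate") refer to the analyte letter
# and the plate number, which are the fields choose_file() weighs when picking
# a single aliquot per sample.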
5 | 6 | import sys 7 | from gdctools.gdc_loadfile import choose_file 8 | 9 | dummy_files = [ 10 |     # Should choose file2 (H > R) 11 |     ['TCGA-A6-5656-01A-21R-2338-13', 'TCGA-A6-5656-01A-21H-1838-13'], 12 |     # Should choose file2 (higher plate) 13 |     ['TCGA-A6-5656-01A-21H-1838-13', 'TCGA-A6-5656-01A-21H-1900-13'], 14 |     # Should choose file1 (H > T) 15 |     ['TCGA-A6-5656-01A-21H-1838-13', 'TCGA-A6-5656-01A-21T-3900-13'], 16 |     # Should choose file1 (D > W, unless W has higher plate) 17 |     ['TCGA-43-2581-01A-01D-1522-08', 'TCGA-43-2581-01A-01W-0877-08'], 18 |     # Should choose file1 (D > G, unless G has higher plate) 19 |     ['TCGA-06-0122-10A-01D-0914-01', 'TCGA-06-0122-10A-01G-0289-01'], 20 |     # Should choose file2 (D > X, unless X has higher plate) 21 |     ['TCGA-06-0122-10A-01X-0289-01', 'TCGA-06-0122-10A-01D-0914-01'], 22 |     # Should choose file2 (D > W, unless W has higher plate) 23 |     ['TCGA-43-2581-01A-01D-0000-08', 'TCGA-43-2581-01A-01W-0877-08'], 24 |     # Should choose file2 (D > G, unless G has higher plate) 25 |     ['TCGA-06-0122-10A-01D-0000-01', 'TCGA-06-0122-10A-01G-0289-01'], 26 |     # Should choose file1 (D > X, unless X has higher plate) 27 |     ['TCGA-06-0122-10A-01X-0289-01', 'TCGA-06-0122-10A-01D-0000-01'], 28 |     # Should choose file2 (same analyte, but higher plate) 29 |     ['TCGA-37-4130-01A-01D-1097-01', 'TCGA-37-4130-01A-01D-1969-01'], 30 | 31 | ] 32 | choices = [] 33 | for group in dummy_files: 34 |     chosen, ignored = choose_file(group) 35 |     choices.append(chosen) 36 | 37 | correct = [ 38 |     'TCGA-A6-5656-01A-21H-1838-13', 39 |     'TCGA-A6-5656-01A-21H-1900-13', 40 |     'TCGA-A6-5656-01A-21H-1838-13', 41 |     'TCGA-43-2581-01A-01D-1522-08', 42 |     'TCGA-06-0122-10A-01D-0914-01', 43 |     'TCGA-06-0122-10A-01D-0914-01', 44 |     'TCGA-43-2581-01A-01W-0877-08', 45 |     'TCGA-06-0122-10A-01G-0289-01', 46 |     'TCGA-06-0122-10A-01X-0289-01', 47 |     'TCGA-37-4130-01A-01D-1969-01', 48 | ] 49 | 50 | if not choices == correct: 51 |     print("ERROR: replicate filter did not choose proper aliquots\n") 52 |     sys.exit(1) 53 | else: 54 |     print("GOOD: replicate filter chose aliquots properly\n") 55 |     sys.exit(0) 56 | -------------------------------------------------------------------------------- /util/bdiff.py: -------------------------------------------------------------------------------- 1 | # Script to compare TCGA barcodes between GDC & DCC loadfiles 2 | import csv 3 | import os 4 | import json 5 | 6 | GDC_LOADFILE = "TCGA.2016_08_23__14_36_30.Sample.loadfile.txt" 7 | GDC_FILTERED_SAMPLES = "TCGA.2016_08_23__14_36_30.filtered_samples.txt" 8 | DCC_LOADFILE = "normalized.tcga_all_samples.2016_07_15__00_00_14.Sample.loadfile.txt" 9 | DCC_FILTERED_SAMPLES = "filteredSamples.2016_07_15__00_00_14.txt" 10 | GDC_RELEASE_NOTES = "release_notes.txt" 11 | 12 | GDC_MISSING_FILE = "sample_based_GDC_missing.tsv" 13 | GDC_CHANGED_FILE = "sample_based_GDC_changed.tsv" 14 | GDC_NEW_FILE = "sample_based_GDC_new.tsv" 15 | 16 | DCC_IGNORE_PLATFORMS = { 17 |     #'genome_wide_snp_6', # GDC has these now 18 |     'human1mduo', # CN 19 |     'humanhap550', 20 |     'cgh_1x1m_g4447a', 'hg_cgh_244a', # CNA 21 |     'hg_cgh_415k_g4124a', 'illuminahiseq_dnaseqc', 22 |     'illuminahiseq_dnaseqc', # LowP 23 |     'humanmethylation27', 'humanmethylation450', # Methylation 24 |     'illuminadnamethylation_oma002_cpi', 25 |     'illuminadnamethylation_oma003_cpi', 26 |     'h_mirna_8x15k', 'h_mirna_8x15kv2', # miR (array) 27 |     'mda_rppa_core', # RPPA 28 |     'agilentg4502a_07_1', 'agilentg4502a_07_2', # mRNA (array) 29 |     'agilentg4502a_07_3', 'ht_hg_u133a', 30 |     'illuminaga_dnaseq', 
'illuminaga_dnaseq_automated', # MAF 31 | 'solid_dna', 'solid_dna_automated', 32 | 'huex_1_0_st_v2', # Exon 33 | } 34 | GDC_IGNORE_ANNOTS = { 35 | 'SNV__mutect' # MAF 36 | } 37 | 38 | DCC_IGNORE_ANNOTS = { 39 | 'snp__genome_wide_snp_6__broad_mit_edu__Level_2__birdseed_genotype__birdseed', 40 | 'snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg18__seg', 41 | 'snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_hg18__seg' 42 | } 43 | 44 | 45 | 46 | #'CNV__snp6', 'CNV_no_germline__snp6'} # GDC now has CN 47 | 48 | DCC_TO_GDC = { 49 | 'snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_hg19__seg':['CNV__snp6'], 50 | 'snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg':['CNV_no_germline__snp6'], 51 | 'clin__bio__genome_wustl_edu__Level_1__biospecimen__clin':['clinical__biospecimen'], 52 | 'clin__bio__genome_wustl_edu__Level_1__clinical__clin':['clinical__primary'], 53 | 'clin__bio__nationwidechildrens_org__Level_1__biospecimen__clin':['clinical__biospecimen'], 54 | 'clin__bio__intgen_org__Level_1__biospecimen__clin':['clinical__biospecimen'], 55 | 'clin__bio__nationwidechildrens_org__Level_1__clinical__clin':['clinical__primary'], 56 | 'clin__bio__intgen_org__Level_1__clinical__clin':['clinical__primary'], 57 | 'rnaseq__illuminaga_rnaseq__bcgsc_ca__Level_3__gene_expression__data': ['mRNA__counts__FPKM', 'mRNA__geneExpNormed__FPKM', 'mRNA__geneExp__FPKM'], 58 | 'rnaseq__illuminahiseq_rnaseq__bcgsc_ca__Level_3__gene_expression__data': ['mRNA__counts__FPKM', 'mRNA__geneExpNormed__FPKM', 'mRNA__geneExp__FPKM'], 59 | 'rnaseqv2__illuminaga_rnaseqv2__unc_edu__Level_3__RSEM_genes__data' : ['mRNA__counts__FPKM', 'mRNA__geneExpNormed__FPKM', 'mRNA__geneExp__FPKM'], 60 | 'rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data': ['mRNA__counts__FPKM', 'mRNA__geneExpNormed__FPKM', 'mRNA__geneExp__FPKM'], 61 | 'rnaseq__illuminaga_rnaseq__unc_edu__Level_3__exon_expression__data': ['mRNA__counts__FPKM', 'mRNA__geneExpNormed__FPKM', 'mRNA__geneExp__FPKM'], 62 | 'rnaseq__illuminaga_rnaseq__unc_edu__Level_3__gene_expression__data': ['mRNA__counts__FPKM', 'mRNA__geneExpNormed__FPKM', 'mRNA__geneExp__FPKM'], 63 | 'rnaseq__illuminaga_rnaseq__unc_edu__Level_3__splice_junction_expression__data': ['mRNA__counts__FPKM', 'mRNA__geneExpNormed__FPKM', 'mRNA__geneExp__FPKM'], 64 | 'rnaseq__illuminahiseq_rnaseq__unc_edu__Level_3__exon_expression__data': ['mRNA__counts__FPKM', 'mRNA__geneExpNormed__FPKM', 'mRNA__geneExp__FPKM'], 65 | 'rnaseq__illuminahiseq_rnaseq__unc_edu__Level_3__gene_expression__data': ['mRNA__counts__FPKM', 'mRNA__geneExpNormed__FPKM', 'mRNA__geneExp__FPKM'], 66 | 'rnaseq__illuminahiseq_rnaseq__unc_edu__Level_3__splice_junction_expression__data': ['mRNA__counts__FPKM', 'mRNA__geneExpNormed__FPKM', 'mRNA__geneExp__FPKM'], 67 | 'rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data': ['mRNA__counts__FPKM', 'mRNA__geneExpNormed__FPKM', 'mRNA__geneExp__FPKM'], 68 | 'rnaseqv2__illuminaga_rnaseqv2__unc_edu__Level_3__RSEM_genes__data': ['mRNA__counts__FPKM', 'mRNA__geneExpNormed__FPKM', 'mRNA__geneExp__FPKM'], 69 | 'rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_isoforms__data': ['mRNA__counts__FPKM', 'mRNA__geneExpNormed__FPKM', 'mRNA__geneExp__FPKM'], 70 | 'rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_isoforms_normalized__data': ['mRNA__counts__FPKM', 'mRNA__geneExpNormed__FPKM', 'mRNA__geneExp__FPKM'], 71 | 
'rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__exon_quantification__data': ['mRNA__counts__FPKM', 'mRNA__geneExpNormed__FPKM', 'mRNA__geneExp__FPKM'], 72 | 'rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__junction_quantification__data': ['mRNA__counts__FPKM', 'mRNA__geneExpNormed__FPKM', 'mRNA__geneExp__FPKM'], 73 | 'rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data': ['mRNA__counts__FPKM', 'mRNA__geneExpNormed__FPKM', 'mRNA__geneExp__FPKM'], 74 | 'rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data': ['mRNA__counts__FPKM', 'mRNA__geneExpNormed__FPKM', 'mRNA__geneExp__FPKM'], 75 | 'rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_isoforms__data': ['mRNA__counts__FPKM', 'mRNA__geneExpNormed__FPKM', 'mRNA__geneExp__FPKM'], 76 | 'rnaseq__illuminahiseq_rnaseq__bcgsc_ca__Level_3__exon_expression__data': ['mRNA__counts__FPKM', 'mRNA__geneExpNormed__FPKM', 'mRNA__geneExp__FPKM'], 77 | 'rnaseq__illuminaga_rnaseq__bcgsc_ca__Level_3__splice_junction_expression__data': ['mRNA__counts__FPKM', 'mRNA__geneExpNormed__FPKM', 'mRNA__geneExp__FPKM'], 78 | 'rnaseq__illuminahiseq_rnaseq__bcgsc_ca__Level_3__splice_junction_expression__data': ['mRNA__counts__FPKM', 'mRNA__geneExpNormed__FPKM', 'mRNA__geneExp__FPKM'], 79 | 'rnaseqv2__illuminaga_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data': ['mRNA__counts__FPKM', 'mRNA__geneExpNormed__FPKM', 'mRNA__geneExp__FPKM'], 80 | 'rnaseqv2__illuminaga_rnaseqv2__unc_edu__Level_3__RSEM_isoforms__data': ['mRNA__counts__FPKM', 'mRNA__geneExpNormed__FPKM', 'mRNA__geneExp__FPKM'], 81 | 'rnaseqv2__illuminaga_rnaseqv2__unc_edu__Level_3__RSEM_isoforms_normalized__data': ['mRNA__counts__FPKM', 'mRNA__geneExpNormed__FPKM', 'mRNA__geneExp__FPKM'], 82 | 'rnaseqv2__illuminaga_rnaseqv2__unc_edu__Level_3__exon_quantification__data': ['mRNA__counts__FPKM', 'mRNA__geneExpNormed__FPKM', 'mRNA__geneExp__FPKM'], 83 | 'rnaseqv2__illuminaga_rnaseqv2__unc_edu__Level_3__junction_quantification__data': ['mRNA__counts__FPKM', 'mRNA__geneExpNormed__FPKM', 'mRNA__geneExp__FPKM'], 84 | 'rnaseq__illuminaga_rnaseq__bcgsc_ca__Level_3__exon_expression__data': ['mRNA__counts__FPKM', 'mRNA__geneExpNormed__FPKM', 'mRNA__geneExp__FPKM'], 85 | 'rnaseq__illuminaga_rnaseq__unc_edu__Level_3__coverage__data': ['mRNA__counts__FPKM', 'mRNA__geneExpNormed__FPKM', 'mRNA__geneExp__FPKM'], 86 | 'mirnaseq__illuminahiseq_mirnaseq__bcgsc_ca__Level_3__miR_gene_expression__data': ['miR__geneExp'], 87 | 'mirnaseq__illuminaga_mirnaseq__bcgsc_ca__Level_3__miR_gene_expression__data': ['miR__geneExp'], 88 | 'mirnaseq__illuminahiseq_mirnaseq__bcgsc_ca__Level_3__miR_gene_expression__data': ['miR__geneExp'], 89 | 'mirnaseq__illuminaga_mirnaseq__bcgsc_ca__Level_3__miR_isoform_expression__data': ['miR__isoformExp'], 90 | 'mirnaseq__illuminahiseq_mirnaseq__bcgsc_ca__Level_3__miR_isoform_expression__data': ['miR__isoformExp'], 91 | 'mirnaseq__illuminahiseq_mirnaseq__bcgsc_ca__Level_3__miR_isoform_expression__data': ['miR__isoformExp'], 92 | 'mirnaseq__illuminaga_mirnaseq__bcgsc_ca__Level_3__miR_isoform_expression__data': ['miR__isoformExp'], 93 | } 94 | 95 | 96 | # Build dictionary lookup for each loadfile 97 | 98 | def get_failed_aliquots(manifest_file): 99 | """Build a set of aliquots that failed harmonization""" 100 | failed_aliquots = set() 101 | with open(manifest_file) as f: 102 | reader = csv.DictReader(f, delimiter='\t') 103 | for row in reader: 104 | failed_wxs = row['WXS (aliquots that failed harmonization, QC, or pending analysis)'] 105 | if 
failed_wxs: 106 |                 failed_aliquots.update(failed_wxs.strip('"').split(',')) 107 |             failed_rna = row['RNA-Seq (aliquot that failed harmonization, QC, or pending analysis)'] 108 |             if failed_rna: 109 |                 failed_aliquots.update(failed_rna.strip('"').split(',')) 110 |     return failed_aliquots 111 | 112 | def should_use_dcc(a): 113 |     """Return true if this DCC annotation should be used in the diff""" 114 |     if a in DCC_IGNORE_ANNOTS: 115 |         return False 116 |     fields = a.split('__') 117 |     platform = fields[1] 118 |     #Ignore certain platforms 119 |     if platform in DCC_IGNORE_PLATFORMS: 120 |         return False 121 |     # Also ignore wustl clinical 122 |     center = fields[2] 123 |     type = fields[0] 124 |     if type == 'clin' and platform == 'bio': 125 |         #if center == 'genome_wustl_edu': 126 |         #    return False 127 |         # And ignore clinical ssf 128 |         t = fields[4] 129 |         if t == 'ssf' or t == 'auxiliary' or t == 'omf': 130 |             return False 131 |     return True 132 | 133 | # TODO: break down barcodes by sample type 134 | def compare_barcodes(gdc_barcodes, gdc_cohort_lookup, dcc_barcodes, dcc_cohort_lookup, failed_aliquots): 135 |     new_gdc = [] 136 |     changed_gdc = [] 137 |     missing_gdc = [] 138 | 139 |     # Loop through DCC annotations, looking for missing or changed 140 |     # entries in gdc_barcodes 141 |     for tcga_id, dcc_annots in dcc_barcodes.items(): 142 |         cohort = dcc_cohort_lookup[tcga_id] 143 |         for dcc_annot in dcc_annots: 144 |             # If the annotation is in the DCC loadfile, 145 |             # but not in the GDC loadfile, then these files 146 |             # count as missing 147 |             barcode_list = dcc_barcodes[tcga_id][dcc_annot] 148 |             if tcga_id not in gdc_barcodes or dcc_annot not in gdc_barcodes[tcga_id]: 149 |                 barcode_str = ",".join(sorted(barcode_list)) 150 |                 missing_entry = [tcga_id, cohort, dcc_annot, barcode_str, ""] 151 |                 missing_gdc.append(missing_entry) 152 |             else: 153 |                 # Otherwise, we have to compare the two sets of barcodes, to see if there was a change 154 |                 dcc_barcode_set = dcc_barcodes[tcga_id][dcc_annot] 155 |                 gdc_barcode_set = gdc_barcodes[tcga_id][dcc_annot] 156 |                 if dcc_barcode_set != gdc_barcode_set: 157 |                     # The set of barcodes has changed 158 |                     dcc_barcode_str = ",".join(sorted(dcc_barcode_set)) 159 |                     gdc_barcode_str = ",".join(sorted(gdc_barcode_set)) 160 |                     change_entry = [tcga_id, cohort, dcc_annot, dcc_barcode_str, gdc_barcode_str] 161 |                     changed_gdc.append(change_entry) 162 | 163 |     # Now do the same for GDC annotations 164 |     for tcga_id, gdc_annots in gdc_barcodes.items(): 165 |         for gdc_annot in gdc_annots: 166 |             # If the annotation is in the GDC loadfile, 167 |             # but not in the DCC loadfile, then these files 168 |             # count as new 169 |             cohort = gdc_cohort_lookup[tcga_id] 170 |             barcode_list = gdc_barcodes[tcga_id][gdc_annot] 171 |             if tcga_id not in dcc_barcodes or gdc_annot not in dcc_barcodes[tcga_id]: 172 |                 barcode_str = ",".join(barcode_list) 173 |                 new_entry = [tcga_id, cohort, gdc_annot, "", barcode_str] 174 |                 new_gdc.append(new_entry) 175 |             else: 176 |                 # Otherwise, we have to compare the two sets of barcodes, to see if there was a change 177 |                 dcc_barcode_set = set(dcc_barcodes[tcga_id][gdc_annot]) 178 |                 gdc_barcode_set = set(gdc_barcodes[tcga_id][gdc_annot]) 179 |                 if dcc_barcode_set != gdc_barcode_set: 180 |                     # The set of barcodes has changed 181 |                     dcc_barcode_str = ",".join(sorted(dcc_barcode_set)) 182 |                     gdc_barcode_str = ",".join(sorted(gdc_barcode_set)) 183 |                     change_entry = [tcga_id, cohort, gdc_annot, dcc_barcode_str, gdc_barcode_str] 184 |                     changed_gdc.append(change_entry) 185 | 186 |     # Now filter out any rows in missing 
where the aliquot failed harmonization 187 | missing_gdc = [l for l in missing_gdc if l[3] not in failed_aliquots] 188 | 189 | return new_gdc, changed_gdc, missing_gdc 190 | 191 | def loadfile_iter(loadfile): 192 | with open(loadfile) as lf: 193 | rdr = csv.DictReader(lf, delimiter='\t') 194 | annots = rdr.fieldnames[4:] 195 | for row in rdr: 196 | tcga_samp_id = row['tcga_sample_id'] 197 | cohort = row['sample_id'].split('-')[0] 198 | for a in annots: 199 | filename = row[a] 200 | if filename != '__DELETE__': 201 | barcode = os.path.basename(filename).split('.')[0] 202 | yield tcga_samp_id, cohort, a, barcode 203 | 204 | def barcode_lookup(loadfile, isGDC): 205 | '''Build a nested dictionary where lookup[tcga_sample_id][annotation] = barcode''' 206 | sample_lookup = dict() 207 | sample_cohort_lookup = dict() 208 | for tcga_id, cohort, annot, barcode in loadfile_iter(loadfile): 209 | 210 | if len(barcode) == 12: 211 | # Case level, use the barcode as the tcga_id 212 | tcga_id = barcode 213 | 214 | if cohort == "FPPP" or cohort == "FPPPFFPE": 215 | continue # Skip this, not a real cohort 216 | sample_lookup[tcga_id] = sample_lookup.get(tcga_id, dict()) 217 | sample_cohort_lookup[tcga_id] = cohort 218 | if isGDC: 219 | if annot not in GDC_IGNORE_ANNOTS: 220 | if 'clinical' in annot: 221 | # only one barcode possible for clinical types 222 | sample_lookup[tcga_id][annot] = set([barcode]) 223 | else: 224 | sample_lookup[tcga_id][annot] = sample_lookup[tcga_id].get(annot, set()) 225 | sample_lookup[tcga_id][annot].add(barcode) 226 | elif should_use_dcc(annot): 227 | # Replace the annot with the DCC equivalent 228 | new_annots = DCC_TO_GDC[annot] 229 | for a in new_annots: 230 | if 'clinical' in a: 231 | tcga_id = tcga_id[:12] # Case level tcga id 232 | # only one barcode possible for clinical types 233 | sample_lookup[tcga_id][a] = set([barcode]) 234 | else: 235 | sample_lookup[tcga_id][a] = sample_lookup[tcga_id].get(a, set()) 236 | sample_lookup[tcga_id][a].add(barcode) 237 | return sample_lookup, sample_cohort_lookup 238 | 239 | def write_changes(change_list, to_file): 240 | with open(to_file, 'w') as f: 241 | f.write("tcga_sample_id\tcohort\tannot\tDCC Barcode(s)\tGDC Barcode(s)\n") 242 | # Sort change list by combination of cohort + tcga_sample_id 243 | change_list = sorted(change_list, key=lambda l: l[1]+l[0]) 244 | for change in change_list: 245 | line = "\t".join(change) + "\n" 246 | f.write(line) 247 | 248 | # Main starts here 249 | 250 | gdc_barcodes, gdc_cohort_lookup = barcode_lookup(GDC_LOADFILE, True) 251 | dcc_barcodes, dcc_cohort_lookup = barcode_lookup(DCC_LOADFILE, False) 252 | 253 | 254 | FAILED_ALIQUOTS = get_failed_aliquots(GDC_RELEASE_NOTES) 255 | NEW_GDC, CHANGED_GDC, MISSING_GDC = compare_barcodes(gdc_barcodes, gdc_cohort_lookup, dcc_barcodes, dcc_cohort_lookup, FAILED_ALIQUOTS) 256 | write_changes(NEW_GDC, GDC_NEW_FILE) 257 | write_changes(CHANGED_GDC, GDC_CHANGED_FILE) 258 | write_changes(MISSING_GDC, GDC_MISSING_FILE) 259 | 260 | -------------------------------------------------------------------------------- /util/checkError.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | Errors=`egrep "Error|File|Traceback" $1` 4 | if [ -n "$Errors" ] ; then 5 | echo "$Errors" 6 | exit 1 7 | fi 8 | exit 0 9 | -------------------------------------------------------------------------------- /util/checkMD5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # checkMD5: 
output MD5 sum of file(s), in a consistent manner across platforms 4 | 5 | md5_func() 6 | { 7 |     # Mac OS/X 8 |     value=`md5 -q $1` 9 |     echo "$value $1" 10 | } 11 | 12 | md5sum_func() 13 | { 14 |     # Linux 15 |     md5sum $1 16 | } 17 | 18 | 19 | M5list="md5sum md5" 20 | M5util= 21 | M5func= 22 | for util in $M5list ; do 23 |     M5util=`type -P $util` 24 |     if [ -n "$M5util" ] ; then 25 |         M5func=${util}_func 26 |     fi 27 | done 28 | 29 | if [ -z "$M5func" ] ; then 30 |     echo "Error: could not find an MD5 utility in your \$PATH" >&2 31 |     echo "       (looked for: $M5list)" 32 |     exit 1 33 | fi 34 | 35 | for file in $@ ; do 36 |     $M5func $file 37 | done 38 | -------------------------------------------------------------------------------- /util/findPython.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # findPython.sh: Help determine the installation location for the gdctools package 4 | # by identifying existing Python installations as candidates. 5 | 6 | InstallDir= 7 | 8 | if [ -n "$1" ] ; then 9 |     PythonExe=$1 10 |     shift 11 | else 12 |     PythonExe=python 13 | fi 14 | 15 | # If there is an active python virtual environment, give it precedence 16 | if [ -n "$VIRTUAL_ENV" ] ; then 17 |     InstallDir=$VIRTUAL_ENV 18 | fi 19 | 20 | # For convenience, 2nd precedence given to well-known directories @ Broad 21 | if [ -z "$InstallDir" ] ; then 22 |     BroadDirs="/local/firebrowse/latest /xchip/tcga/Tools/gdac/latest" 23 |     for dir in $BroadDirs ; do 24 |         if [ -d $dir ] ; then 25 |             InstallDir=$dir 26 |             break 27 |         fi 28 |     done 29 | fi 30 | 31 | # Finally, check existing user $PATH 32 | if [ -z "$InstallDir" ] ; then 33 |     Python=`type -P $PythonExe` 34 |     if [ -n "$Python" ]; then 35 |         InstallDir=`dirname $Python` 36 |         InstallDir=`dirname $InstallDir` 37 |     fi 38 | fi 39 | 40 | if [ -z "$InstallDir" ] ; then 41 |     echo "Error: could not find a $PythonExe installation to use" >&2 42 |     exit 1 43 | fi 44 | 45 | echo "$InstallDir" 46 | exit 0 47 | -------------------------------------------------------------------------------- /util/runpy.ac: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | Python=%PYTHON% 4 | This=`basename $0` 5 | exec $Python ${This}.py "$@" 6 | --------------------------------------------------------------------------------
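Note on util/runpy.ac: it is a wrapper template rather than a directly runnable script. The %PYTHON% token is presumably substituted at install time (e.g. by the Makefiles, using the interpreter prefix reported by util/findPython.sh), so that an installed wrapper named after a tool simply execs the matching .py file with the chosen interpreter. A minimal sketch of that substitution; the install prefix and the gdc_mirror wrapper name are illustrative assumptions, not the project's actual install recipe:

    InstallDir=$(util/findPython.sh)            # e.g. a virtualenv or other Python prefix
    sed -e "s|%PYTHON%|$InstallDir/bin/python|" util/runpy.ac > gdc_mirror
    chmod +x gdc_mirror                         # wrapper will now exec gdc_mirror.py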