├── .editorconfig ├── .github ├── release-drafter.yml └── workflows │ ├── after-master-commit.yml │ ├── compare-annotation.yml │ ├── compare-genomic-change-annotation.yml │ ├── pytest.yml │ └── release-management.yml ├── .gitignore ├── .version-level ├── AnnotatorCore.py ├── ClinicalDataAnnotator.py ├── CnaAnnotator.py ├── FusionAnnotator.py ├── GenerateReadMe.py ├── LICENSE ├── MafAnnotator.py ├── OncoKBPlots.py ├── README.md ├── StructuralVariantAnnotator.py ├── actionability_functions_msi_tmb_manuscript_R.r ├── data ├── example_atypical_alterations.txt ├── example_clinical.txt ├── example_cna.txt ├── example_fusions.txt ├── example_individual_cna.txt ├── example_maf.txt ├── example_maf_grch38.txt └── example_sv.txt ├── example.sh ├── flake8.ini ├── requirements ├── common.txt ├── pip2.7.txt └── pip3.txt ├── test_Annotation.py └── test_AnnotatorCore.py /.editorconfig: -------------------------------------------------------------------------------- 1 | # The EditorConfig project consists of a file format for defining coding styles 2 | # and a collection of text editor plugins that enable editors to read the file format 3 | # and adhere to defined styles. 4 | 5 | # EditorConfig files are read top to bottom and the closest EditorConfig files are read last. 6 | # Properties from matching EditorConfig sections are applied in the order they were read, 7 | # so properties in closer files take precedence. 8 | 9 | # Please only specify the formats you want to apply through out the project in this file. 10 | # Otherwise, please create new config file in your directory where you want to apply these styles. 
11 | 12 | # More details about EditorConfig: http://EditorConfig.org 13 | 14 | # top-most EditorConfig file 15 | root = true 16 | 17 | [*] 18 | # Unix-style newlines with a newline ending every file 19 | insert_final_newline = false 20 | trim_trailing_whitespace = false 21 | 22 | 23 | -------------------------------------------------------------------------------- /.github/release-drafter.yml: -------------------------------------------------------------------------------- 1 | name-template: 'v$NEXT_PATCH_VERSION' 2 | tag-template: 'v$NEXT_PATCH_VERSION' 3 | categories: 4 | - title: '🧬 Features' 5 | labels: 6 | - 'feature' 7 | - title: '🐛 Bug Fixes' 8 | labels: 9 | - 'fix' 10 | - title: '🏎 Performance Tweaks' 11 | labels: 12 | - 'performance' 13 | - title: '🎨 Style Tweaks' 14 | labels: 15 | - 'style tweak' 16 | - title: '📘 Documentation' 17 | labels: 18 | - 'documentation' 19 | - title: '🧹 Cleanup' 20 | labels: 21 | - 'cleanup' 22 | - title: '👷‍♀️ Testing, Configuration & Deployment' 23 | labels: 24 | - 'devops' 25 | - title: '🧰 Maintenance' 26 | labels: 27 | - 'chore' 28 | - 'dependencies' 29 | change-template: '- $TITLE @$AUTHOR (#$NUMBER)' 30 | template: | 31 | ## Changes 32 | $CHANGES 33 | ## 🕵️‍♀️ Full commit logs 34 | - https://github.com/oncokb/oncokb-annotator/compare/$PREVIOUS_TAG...v$NEXT_PATCH_VERSION 35 | -------------------------------------------------------------------------------- /.github/workflows/after-master-commit.yml: -------------------------------------------------------------------------------- 1 | name: After master commit 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | check-version-level-and-update: 10 | if: github.repository == 'oncokb/oncokb-annotator' 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | with: 15 | fetch-depth: 0 16 | - name: 'Update Version Level' 17 | run: | 18 | git pull 19 | VERSION_LEVEL=$(cat .version-level | tr "[:upper:]" "[:lower:]") 20 | 21 | 
RELEASE_DRAFTER_MINOR='NEXT_MINOR_VERSION' 22 | RELEASE_DRAFTER_PATCH='NEXT_PATCH_VERSION' 23 | 24 | if [[ $VERSION_LEVEL == 'minor' ]]; then 25 | sed -i "s/$RELEASE_DRAFTER_PATCH/$RELEASE_DRAFTER_MINOR/gi" .github/release-drafter.yml 26 | fi 27 | 28 | if [[ $VERSION_LEVEL == 'patch' ]]; then 29 | sed -i "s/$RELEASE_DRAFTER_MINOR/$RELEASE_DRAFTER_PATCH/gi" .github/release-drafter.yml 30 | fi 31 | 32 | CHANGED=$(git diff --name-only HEAD --) 33 | if [ -n "$CHANGED" ] 34 | then 35 | git config user.name oncokb-bot 36 | git config user.email dev.oncokb@gmail.com 37 | git add . 38 | git commit -m "Update action files to align the version level to $VERSION_LEVEL" 39 | git push 40 | fi 41 | -------------------------------------------------------------------------------- /.github/workflows/compare-annotation.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run annotation against the master annotation 2 | 3 | name: Compare Annotation 4 | 5 | on: 6 | push: 7 | branches: 8 | - master 9 | - next-minor-release 10 | pull_request: 11 | branches: 12 | - master 13 | - next-minor-release 14 | jobs: 15 | build: 16 | if: github.repository == 'oncokb/oncokb-annotator' 17 | runs-on: macos-latest 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python 3.8 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: 3.8 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install flake8 28 | pip install -r requirements/common.txt -r requirements/pip3.txt 29 | - name: Lint with flake8 30 | run: | 31 | # stop the build if there are Python syntax errors or undefined names 32 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 33 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 34 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 35 | - name: Annotate 36 | id: annotate 37 | env: 38 | ONCOKB_API_TOKEN: ${{ secrets.ONCOKB_BOT_API_TOKEN }} 39 | ONCOKB_OAUTH_TOKEN: ${{ secrets.ONCOKB_OAUTH_TOKEN }} 40 | run: | 41 | git checkout -b compare 42 | 43 | MUTATION_DATA_NAME=data_mutations_mskcc.txt 44 | CLINICAL_DATA_NAME=data_clinical_sample.txt 45 | FUSION_DATA_NAME=data_fusions.txt 46 | INDIVIDUAL_CNA_DATA_NAME=data_individual_CNA.txt 47 | 48 | cd data || exit 49 | curl -s -H "Authorization: token ${ONCOKB_OAUTH_TOKEN}" https://api.github.com/repos/knowledgesystems/oncokb-data/contents/annotation/annotator-test/data | jq -r '.[] | .download_url + " " + .name' | while IFS=' ' read -r downloadurl name; do 50 | if [[ "$name" == "$FIEL_NAME_PREFIX"* ]]; then 51 | curl -s "$downloadurl" -o $name 52 | fi 53 | done 54 | cd .. 55 | 56 | # create compare folder to add all annotated files 57 | mkdir compare 58 | 59 | PREFIX=oncokb 60 | IMAF=data/"$MUTATION_DATA_NAME" 61 | OMAF=compare/"$PREFIX"_"$MUTATION_DATA_NAME" 62 | 63 | IC=data/"$CLINICAL_DATA_NAME" 64 | OC=compare/"$PREFIX"_"$CLINICAL_DATA_NAME" 65 | 66 | IF=data/"$FUSION_DATA_NAME" 67 | OF=compare/"$PREFIX"_"$FUSION_DATA_NAME" 68 | 69 | IICNA=data/"$INDIVIDUAL_CNA_DATA_NAME" 70 | OICNA=compare/"$PREFIX"_"$INDIVIDUAL_CNA_DATA_NAME" 71 | 72 | python MafAnnotator.py -i "$IMAF" -o "$OMAF" -c "$IC" -b "$ONCOKB_API_TOKEN" 73 | python FusionAnnotator.py -i "$IF" -o "$OF" -c "$IC" -b "$ONCOKB_API_TOKEN" 74 | python CnaAnnotator.py -i "$IICNA" -o "$OICNA" -c "$IC" -b "$ONCOKB_API_TOKEN" -f "individual" 75 | python ClinicalDataAnnotator.py -i "$IC" -o "$OC" -a "$OMAF,$OICNA,$OF" 76 | 77 | git config user.name oncokb-bot 78 | git config user.email dev.oncokb@gmail.com 79 | 80 | git add . 
81 | git commit -m 'add analysis' 82 | 83 | - name: Compare annotation result with the ones from master 84 | id: compare 85 | env: 86 | ONCOKB_OAUTH_TOKEN: ${{ secrets.ONCOKB_OAUTH_TOKEN }} 87 | FIEL_NAME_PREFIX: 'oncokb_data' 88 | run: | 89 | # remove everything under compare folder and replace with the ones from oncokb-data 90 | rm -f compare/*.txt 91 | 92 | cd compare || exit 93 | curl -s -H "Authorization: token ${ONCOKB_OAUTH_TOKEN}" https://api.github.com/repos/knowledgesystems/oncokb-data/contents/annotation/annotator-test/annotation | jq -r '.[] | .download_url + " " + .name' | while IFS=' ' read -r downloadurl name; do 94 | if [[ "$name" == "$FIEL_NAME_PREFIX"* ]]; then 95 | curl -s "$downloadurl" -o $name 96 | fi 97 | done 98 | cd .. 99 | 100 | # compare 101 | CHANGED=$(git diff --name-only HEAD --) 102 | 103 | if [ -n "$CHANGED" ] 104 | then 105 | git diff 106 | exit 1 107 | fi 108 | 109 | -------------------------------------------------------------------------------- /.github/workflows/compare-genomic-change-annotation.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run annotation against the master annotation for a particular study 2 | 3 | name: Compare Genomic Change Annotation 4 | 5 | on: 6 | push: 7 | branches: 8 | - master 9 | - next-minor-release 10 | pull_request: 11 | branches: 12 | - master 13 | - next-minor-release 14 | jobs: 15 | build: 16 | if: github.repository == 'oncokb/oncokb-annotator' 17 | runs-on: macos-latest 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python 3.8 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: 3.8 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install flake8 28 | pip install -r requirements/common.txt -r requirements/pip3.txt 29 | - name: Lint with flake8 30 | run: | 31 | # stop the build if there are Python syntax errors or undefined 
names 32 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 33 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 34 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 35 | - name: Annotate 36 | id: annotate 37 | env: 38 | ONCOKB_API_TOKEN: ${{ secrets.ONCOKB_BOT_API_TOKEN }} 39 | ONCOKB_OAUTH_TOKEN: ${{ secrets.ONCOKB_OAUTH_TOKEN }} 40 | run: | 41 | git checkout -b compare 42 | 43 | MUTATION_DATA_NAME=data_mutations_mskcc.txt 44 | CLINICAL_DATA_NAME=data_clinical_sample.txt 45 | 46 | cd data 47 | curl -s -H "Authorization: token ${ONCOKB_OAUTH_TOKEN}" https://api.github.com/repos/knowledgesystems/oncokb-data/contents/annotation/annotator-test/data | jq -r '.[] | .download_url + " " + .name' | while IFS=' ' read -r downloadurl name; do 48 | if [[ "$name" == "$MUTATION_DATA_NAME" || "$name" == "$CLINICAL_DATA_NAME" ]]; then 49 | curl -s "$downloadurl" -o $name 50 | fi 51 | done 52 | cd .. 53 | 54 | # create compare folder to add all annotated files 55 | mkdir compare 56 | 57 | OGCMAF=oncokb_genomic_change_$MUTATION_DATA_NAME 58 | 59 | python MafAnnotator.py -i data/$MUTATION_DATA_NAME -o compare/$OGCMAF -c data/$CLINICAL_DATA_NAME -b $ONCOKB_API_TOKEN -q Genomic_Change 60 | 61 | git config user.name oncokb-bot 62 | git config user.email dev.oncokb@gmail.com 63 | 64 | git add . 
65 | git commit -m 'add analysis' 66 | 67 | echo "::set-output name=FILE_NAME::$OGCMAF" 68 | 69 | - name: Compare annotation result with the ones from master 70 | id: compare 71 | env: 72 | FILE_NAME: ${{steps.annotate.outputs.FILE_NAME}} 73 | ONCOKB_OAUTH_TOKEN: ${{ secrets.ONCOKB_OAUTH_TOKEN }} 74 | run: | 75 | # remove everything under compare folder and replace with the ones from oncokb-data 76 | rm -f compare/*.txt 77 | 78 | cd compare 79 | curl -s -H "Authorization: token ${ONCOKB_OAUTH_TOKEN}" https://api.github.com/repos/knowledgesystems/oncokb-data/contents/annotation/annotator-test/annotation | jq -r '.[] | .download_url + " " + .name' | while IFS=' ' read -r downloadurl name; do 80 | if [[ "$name" == "$FILE_NAME" ]]; then 81 | curl -s "$downloadurl" -o $name 82 | fi 83 | done 84 | cd .. 85 | 86 | # compare 87 | CHANGED=$(git diff --name-only HEAD --) 88 | 89 | if [ -n "$CHANGED" ] 90 | then 91 | git diff 92 | exit 1 93 | fi 94 | -------------------------------------------------------------------------------- /.github/workflows/pytest.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Run all python tests 5 | 6 | on: 7 | push: 8 | branches: [ master, next-minor-release ] 9 | pull_request: 10 | branches: [ master, next-minor-release ] 11 | 12 | jobs: 13 | lint: 14 | name: Linting using flake8 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v2 18 | - uses: actions/setup-python@v2 19 | with: 20 | python-version: "3.9" 21 | - name: Run flake8 22 | uses: julianwachholz/flake8-action@v2 23 | with: 24 | checkName: "Python Lint" 25 | path: . 
26 | config: flake8.ini 27 | env: 28 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 29 | pytest: 30 | needs: lint 31 | runs-on: ${{ matrix.os }} 32 | strategy: 33 | matrix: 34 | os: [ ubuntu-latest, macos-latest ] 35 | python-version: [ '3.8','3.9','3.10','3.11' ] 36 | steps: 37 | - uses: actions/checkout@v2 38 | - name: Set up Python ${{ matrix.python-version }} 39 | uses: actions/setup-python@v4 40 | with: 41 | python-version: ${{ matrix.python-version }} 42 | - name: Install dependencies 43 | env: 44 | PYTHON_VERSION: ${{ matrix.python-version }} 45 | run: | 46 | python -m pip install --upgrade pip 47 | pip install pytest 48 | if [[ $PYTHON_VERSION =~ ^2\.[0-9]+$ ]]; then pip install -r requirements/common.txt -r requirements/pip2.7.txt; fi 49 | if [[ $PYTHON_VERSION =~ ^3\.[0-9]+$ ]]; then pip install -r requirements/common.txt -r requirements/pip3.txt; fi 50 | - name: Test with pytest 51 | env: 52 | ONCOKB_API_TOKEN: ${{ secrets.ONCOKB_BOT_API_TOKEN }} 53 | run: | 54 | pytest 55 | 56 | build-in-windows: 57 | needs: lint 58 | runs-on: windows-latest 59 | strategy: 60 | matrix: 61 | python-version: [ '3.8','3.9','3.10','3.11' ] 62 | steps: 63 | - uses: actions/checkout@v2 64 | - name: Set up Python ${{ matrix.python-version }} 65 | uses: actions/setup-python@v4 66 | with: 67 | python-version: ${{ matrix.python-version }} 68 | - name: Install dependencies 69 | env: 70 | PYTHON_VERSION: ${{ matrix.python-version }} 71 | run: | 72 | python -m pip install --upgrade pip 73 | pip install pytest 74 | if ( $env:PYTHON_VERSION -match '^2\.[0-9]+$' ) 75 | { 76 | pip install -r requirements/common.txt -r requirements/pip2.7.txt 77 | } 78 | if ( $env:PYTHON_VERSION -match '^3\.[0-9]+$' ) 79 | { 80 | pip install -r requirements/common.txt -r requirements/pip3.txt 81 | } 82 | - name: Test with pytest 83 | env: 84 | ONCOKB_API_TOKEN: ${{ secrets.ONCOKB_BOT_API_TOKEN }} 85 | run: | 86 | pytest 87 | -------------------------------------------------------------------------------- 
/.github/workflows/release-management.yml: -------------------------------------------------------------------------------- 1 | name: Release Management 2 | 3 | on: 4 | push: 5 | # branches to consider in the event; optional, defaults to all 6 | branches: 7 | - master 8 | 9 | jobs: 10 | update_draft_release: 11 | runs-on: ubuntu-latest 12 | steps: 13 | # Drafts your next Release notes as Pull Requests are merged into "master" 14 | - uses: release-drafter/release-drafter@v5 15 | env: 16 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # oncokb output data 2 | data/*.oncokb.* 3 | data/example_README.txt 4 | process 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | env/ 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *,cover 51 | .hypothesis/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # IPython Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # dotenv 84 | .env 85 | 86 | # virtualenv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | 93 | # Rope project settings 94 | .ropeproject 95 | 96 | # PyCharm 97 | .idea/ 98 | 99 | # MAC OS 100 | .DS_Store 101 | -------------------------------------------------------------------------------- /.version-level: -------------------------------------------------------------------------------- 1 | patch 2 | -------------------------------------------------------------------------------- /ClinicalDataAnnotator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import re 5 | import argparse 6 | import logging 7 | 8 | from AnnotatorCore import setsampleidsfileterfile 9 | from AnnotatorCore import process_clinical_data 10 | 11 | logging.basicConfig(level=logging.INFO) 12 | log = logging.getLogger('ClinicalDataAnnotator') 13 | 14 | 15 | def main(argv): 16 | if argv.help: 17 | log.info( 18 | '\n' 19 | 'ClinicalDataAnnotator.py -i -o -a [-s sample list filter]\n' 20 | ' Essential clinical columns:\n' 21 | ' SAMPLE_ID: sample ID' 22 | ) 23 | sys.exit() 24 | if argv.sample_ids_filter: 25 | setsampleidsfileterfile(argv.sample_ids_filter) 
26 | 27 | annotated_alteration_files = re.split(',|, ', argv.annotated_alteration_files) 28 | if argv.input_file == '' or argv.output_file == '' or len(annotated_alteration_files) == 0: 29 | required_params = [] 30 | if argv.input_file == '': 31 | required_params.append('-i') 32 | if argv.output_file == '': 33 | required_params.append('-o') 34 | if len(annotated_alteration_files) == 0: 35 | required_params.append('-a') 36 | log.error('The parameter(s) ' + ', '.join(required_params) + ' can not be empty') 37 | log.info('for help: python ClinicalDataAnnotator.py -h') 38 | sys.exit(2) 39 | 40 | log.info('annotating %s ...' % argv.input_file) 41 | process_clinical_data(annotated_alteration_files, argv.input_file, argv.output_file) 42 | 43 | log.info('done!') 44 | 45 | 46 | if __name__ == "__main__": 47 | parser = argparse.ArgumentParser(add_help=False) 48 | parser.add_argument('-h', dest='help', action="store_true", default=False) 49 | parser.add_argument('-i', dest='input_file', default='', type=str) 50 | parser.add_argument('-o', dest='output_file', default='', type=str) 51 | parser.add_argument('-s', dest='sample_ids_filter', default='', type=str) 52 | parser.add_argument('-a', dest='annotated_alteration_files', default='', type=str) 53 | parser.set_defaults(func=main) 54 | 55 | args = parser.parse_args() 56 | args.func(args) 57 | -------------------------------------------------------------------------------- /CnaAnnotator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import argparse 5 | import logging 6 | 7 | from AnnotatorCore import setsampleidsfileterfile 8 | from AnnotatorCore import setoncokbbaseurl 9 | from AnnotatorCore import setoncokbapitoken 10 | from AnnotatorCore import readCancerTypes 11 | from AnnotatorCore import validate_oncokb_token 12 | from AnnotatorCore import process_cna_data 13 | from AnnotatorCore import CNA_FILE_FORMAT_GISTIC 14 | 15 | 
logging.basicConfig(level=logging.INFO) 16 | log = logging.getLogger('CnaAnnotator') 17 | 18 | 19 | def main(argv): 20 | if argv.help: 21 | log.info( 22 | '\n' 23 | 'CnaAnnotator.py -i -o [-p previous results] [-c ] ' 24 | '[-s sample list filter] [-t ] [-u oncokb-base-url] [-b oncokb_api_bear_token] ' 25 | '[-z annotate_gain_loss] [-f CNA file formt, gistic or individual] [-d include descriptions]\n' 26 | ' Input CNA file uses GISTIC output by default (https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#data-file-1). You can also list copy number alteration individually by specifying -f=individual\n' 27 | ' Essential clinical columns:\n' 28 | ' SAMPLE_ID: sample ID\n' 29 | ' Cancer type will be assigned based on the following priority:\n' 30 | ' 1) ONCOTREE_CODE in clinical data file\n' 31 | ' 2) ONCOTREE_CODE exist in MAF\n' 32 | ' 3) default tumor type (-t)\n' 33 | ' We do not annotate Gain and Loss by default, add -z to include the analysis. See https://github.com/oncokb/oncokb-annotator/issues/51 for more information.\n' 34 | ' Default OncoKB base url is https://www.oncokb.org' 35 | ) 36 | sys.exit() 37 | if argv.input_file == '' or argv.output_file == '' or argv.oncokb_api_bearer_token == '': 38 | required_params = [] 39 | if argv.input_file == '': 40 | required_params.append('-i') 41 | if argv.output_file == '': 42 | required_params.append('-o') 43 | if argv.oncokb_api_bearer_token == '': 44 | required_params.append('-b') 45 | 46 | log.error('The parameter(s) ' + ', '.join(required_params) + ' can not be empty') 47 | log.info('for help: python CnaAnnotator.py -h') 48 | sys.exit(2) 49 | if argv.sample_ids_filter: 50 | setsampleidsfileterfile(argv.sample_ids_filter) 51 | if argv.oncokb_api_url: 52 | setoncokbbaseurl(argv.oncokb_api_url) 53 | setoncokbapitoken(argv.oncokb_api_bearer_token) 54 | 55 | cancertypemap = {} 56 | if argv.input_clinical_file: 57 | readCancerTypes(argv.input_clinical_file, cancertypemap) 58 | 59 | 
validate_oncokb_token() 60 | 61 | log.info('annotating %s ...' % argv.input_file) 62 | process_cna_data(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type, cancertypemap, argv.include_descriptions, argv.annotate_gain_loss, argv.cna_file_format.lower()) 63 | 64 | log.info('done!') 65 | 66 | 67 | if __name__ == "__main__": 68 | parser = argparse.ArgumentParser(add_help=False) 69 | parser.add_argument('-h', dest='help', action="store_true", default=False) 70 | parser.add_argument('-i', dest='input_file', default='', type=str) 71 | parser.add_argument('-o', dest='output_file', default='', type=str) 72 | parser.add_argument('-p', dest='previous_result_file', default='', type=str) 73 | parser.add_argument('-c', dest='input_clinical_file', default='', type=str) 74 | parser.add_argument('-s', dest='sample_ids_filter', default='', type=str) 75 | parser.add_argument('-t', dest='default_cancer_type', default='', type=str) 76 | parser.add_argument('-u', dest='oncokb_api_url', default='', type=str) 77 | parser.add_argument('-b', dest='oncokb_api_bearer_token', default='', type=str) 78 | parser.add_argument('-z', dest='annotate_gain_loss', action="store_true", default=False) 79 | parser.add_argument('-f', dest='cna_file_format', default=CNA_FILE_FORMAT_GISTIC) 80 | parser.add_argument('-d', dest='include_descriptions', action="store_true", default=False) 81 | parser.set_defaults(func=main) 82 | 83 | args = parser.parse_args() 84 | args.func(args) 85 | -------------------------------------------------------------------------------- /FusionAnnotator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import argparse 5 | import logging 6 | 7 | from AnnotatorCore import setsampleidsfileterfile 8 | from AnnotatorCore import setcancerhotspotsbaseurl 9 | from AnnotatorCore import setoncokbbaseurl 10 | from AnnotatorCore import setoncokbapitoken 11 | from AnnotatorCore 
import readCancerTypes 12 | from AnnotatorCore import validate_oncokb_token 13 | from AnnotatorCore import process_fusion 14 | 15 | logging.basicConfig(level=logging.INFO) 16 | log = logging.getLogger('FusionAnnotator') 17 | 18 | 19 | def main(argv): 20 | if argv.help: 21 | log.info( 22 | '\n' 23 | "FusionAnnotator.py -i -o [-p previous results] " 24 | "[-c ] [-s sample list filter] [-t ] [-u ] " 25 | "[-b ] [-r ] " 26 | "[-d include descriptions]\n" 27 | ' Essential Fusion columns (case insensitive):\n' 28 | ' HUGO_SYMBOL: Hugo gene symbol\n' 29 | ' VARIANT_CLASSIFICATION: Translational effect of variant allele\n' 30 | ' TUMOR_SAMPLE_BARCODE: sample ID\n' 31 | ' FUSION: amino acid change, e.g. "TMPRSS2-ERG"\n' 32 | ' Essential clinical columns:\n' 33 | ' SAMPLE_ID: sample ID\n' 34 | ' ONCOTREE_CODE: tumor type code from oncotree (oncotree.mskcc.org)\n' 35 | ' Cancer type will be assigned based on the following priority:\n' 36 | ' 1) ONCOTREE_CODE in clinical data file\n' 37 | ' 2) ONCOTREE_CODE exist in Fusion\n' 38 | ' 3) default tumor type (-t)\n' 39 | ' Default OncoKB base url is https://www.oncokb.org' 40 | ) 41 | sys.exit() 42 | if argv.input_file == '' or argv.output_file == '' or argv.oncokb_api_bearer_token == '': 43 | required_params = [] 44 | if argv.input_file == '': 45 | required_params.append('-i') 46 | if argv.output_file == '': 47 | required_params.append('-o') 48 | if argv.oncokb_api_bearer_token == '': 49 | required_params.append('-b') 50 | 51 | log.error('The parameter(s) ' + ', '.join(required_params) + ' can not be empty') 52 | log.info('for help: python FusionAnnotator.py -h') 53 | sys.exit(2) 54 | if argv.sample_ids_filter: 55 | setsampleidsfileterfile(argv.sample_ids_filter) 56 | if argv.cancer_hotspots_base_url: 57 | setcancerhotspotsbaseurl(argv.cancer_hotspots_base_url) 58 | if argv.oncokb_api_url: 59 | setoncokbbaseurl(argv.oncokb_api_url) 60 | setoncokbapitoken(argv.oncokb_api_bearer_token) 61 | 62 | cancertypemap = {} 63 | if 
argv.input_clinical_file: 64 | readCancerTypes(argv.input_clinical_file, cancertypemap) 65 | 66 | validate_oncokb_token() 67 | 68 | log.info('annotating %s ...' % argv.input_file) 69 | process_fusion(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type, cancertypemap, argv.structural_variant_name_format, argv.include_descriptions) 70 | 71 | log.info('done!') 72 | 73 | 74 | if __name__ == "__main__": 75 | parser = argparse.ArgumentParser(add_help=False) 76 | # ArgumentParser doesn't accept "store_true" and "type=" at the same time. 77 | parser.add_argument('-h', dest='help', action="store_true", default=False) 78 | parser.add_argument('-i', dest='input_file', default='', type=str) 79 | parser.add_argument('-o', dest='output_file', default='', type=str) 80 | parser.add_argument('-p', dest='previous_result_file', default='', type=str) 81 | parser.add_argument('-c', dest='input_clinical_file', default='', type=str) 82 | parser.add_argument('-s', dest='sample_ids_filter', default=None, type=str) 83 | parser.add_argument('-t', dest='default_cancer_type', default='', type=str) 84 | parser.add_argument('-u', dest='oncokb_api_url', default='', type=str) 85 | parser.add_argument('-v', dest='cancer_hotspots_base_url', default='', type=str) 86 | parser.add_argument('-b', dest='oncokb_api_bearer_token', default='', type=str) 87 | parser.add_argument('-r', dest='structural_variant_name_format', default=None, type=str) 88 | parser.add_argument('-d', dest='include_descriptions', action="store_true", default=False) 89 | parser.set_defaults(func=main) 90 | 91 | args = parser.parse_args() 92 | args.func(args) 93 | -------------------------------------------------------------------------------- /GenerateReadMe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import argparse 5 | import logging 6 | 7 | from AnnotatorCore import setoncokbbaseurl 8 | from AnnotatorCore 
import generateReadme 9 | 10 | logging.basicConfig(level=logging.INFO) 11 | log = logging.getLogger('GenerateReadMe') 12 | 13 | 14 | def main(argv): 15 | if argv.help: 16 | log.info('\nGenerateReadMe.py -o [-u oncokb-base-url]\n' 17 | ' Default OncoKB base url is https://www.oncokb.org') 18 | sys.exit() 19 | if argv.output_file == '': 20 | log.error('The parameter -o can not be empty') 21 | log.info('for help: python GenerateReadMe.py -h') 22 | sys.exit(2) 23 | if argv.oncokb_api_url: 24 | setoncokbbaseurl(argv.oncokb_api_url) 25 | 26 | generateReadme(argv.output_file) 27 | log.info('done!') 28 | 29 | 30 | if __name__ == "__main__": 31 | parser = argparse.ArgumentParser(add_help=False) 32 | # ArgumentParser doesn't accept "store_true" and "type=" at the same time. 33 | parser.add_argument('-h', dest='help', action="store_true", default=False) 34 | parser.add_argument('-o', dest='output_file', default='', type=str) 35 | parser.add_argument('-u', dest='oncokb_api_url', default='', type=str) 36 | parser.set_defaults(func=main) 37 | 38 | args = parser.parse_args() 39 | args.func(args) 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU AFFERO GENERAL PUBLIC LICENSE 2 | Version 3, 19 November 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU Affero General Public License is a free, copyleft license for 11 | software and other kinds of works, specifically designed to ensure 12 | cooperation with the community in the case of network server software. 13 | 14 | The licenses for most software and other practical works are designed 15 | to take away your freedom to share and change the works. 
By contrast, 16 | our General Public Licenses are intended to guarantee your freedom to 17 | share and change all versions of a program--to make sure it remains free 18 | software for all its users. 19 | 20 | When we speak of free software, we are referring to freedom, not 21 | price. Our General Public Licenses are designed to make sure that you 22 | have the freedom to distribute copies of free software (and charge for 23 | them if you wish), that you receive source code or can get it if you 24 | want it, that you can change the software or use pieces of it in new 25 | free programs, and that you know you can do these things. 26 | 27 | Developers that use our General Public Licenses protect your rights 28 | with two steps: (1) assert copyright on the software, and (2) offer 29 | you this License which gives you legal permission to copy, distribute 30 | and/or modify the software. 31 | 32 | A secondary benefit of defending all users' freedom is that 33 | improvements made in alternate versions of the program, if they 34 | receive widespread use, become available for other developers to 35 | incorporate. Many developers of free software are heartened and 36 | encouraged by the resulting cooperation. However, in the case of 37 | software used on network servers, this result may fail to come about. 38 | The GNU General Public License permits making a modified version and 39 | letting the public access it on a server without ever releasing its 40 | source code to the public. 41 | 42 | The GNU Affero General Public License is designed specifically to 43 | ensure that, in such cases, the modified source code becomes available 44 | to the community. It requires the operator of a network server to 45 | provide the source code of the modified version running there to the 46 | users of that server. Therefore, public use of a modified version, on 47 | a publicly accessible server, gives the public access to the source 48 | code of the modified version. 
49 | 50 | An older license, called the Affero General Public License and 51 | published by Affero, was designed to accomplish similar goals. This is 52 | a different license, not a version of the Affero GPL, but Affero has 53 | released a new version of the Affero GPL which permits relicensing under 54 | this license. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | TERMS AND CONDITIONS 60 | 61 | 0. Definitions. 62 | 63 | "This License" refers to version 3 of the GNU Affero General Public License. 64 | 65 | "Copyright" also means copyright-like laws that apply to other kinds of 66 | works, such as semiconductor masks. 67 | 68 | "The Program" refers to any copyrightable work licensed under this 69 | License. Each licensee is addressed as "you". "Licensees" and 70 | "recipients" may be individuals or organizations. 71 | 72 | To "modify" a work means to copy from or adapt all or part of the work 73 | in a fashion requiring copyright permission, other than the making of an 74 | exact copy. The resulting work is called a "modified version" of the 75 | earlier work or a work "based on" the earlier work. 76 | 77 | A "covered work" means either the unmodified Program or a work based 78 | on the Program. 79 | 80 | To "propagate" a work means to do anything with it that, without 81 | permission, would make you directly or secondarily liable for 82 | infringement under applicable copyright law, except executing it on a 83 | computer or modifying a private copy. Propagation includes copying, 84 | distribution (with or without modification), making available to the 85 | public, and in some countries other activities as well. 86 | 87 | To "convey" a work means any kind of propagation that enables other 88 | parties to make or receive copies. Mere interaction with a user through 89 | a computer network, with no transfer of a copy, is not conveying. 
90 | 91 | An interactive user interface displays "Appropriate Legal Notices" 92 | to the extent that it includes a convenient and prominently visible 93 | feature that (1) displays an appropriate copyright notice, and (2) 94 | tells the user that there is no warranty for the work (except to the 95 | extent that warranties are provided), that licensees may convey the 96 | work under this License, and how to view a copy of this License. If 97 | the interface presents a list of user commands or options, such as a 98 | menu, a prominent item in the list meets this criterion. 99 | 100 | 1. Source Code. 101 | 102 | The "source code" for a work means the preferred form of the work 103 | for making modifications to it. "Object code" means any non-source 104 | form of a work. 105 | 106 | A "Standard Interface" means an interface that either is an official 107 | standard defined by a recognized standards body, or, in the case of 108 | interfaces specified for a particular programming language, one that 109 | is widely used among developers working in that language. 110 | 111 | The "System Libraries" of an executable work include anything, other 112 | than the work as a whole, that (a) is included in the normal form of 113 | packaging a Major Component, but which is not part of that Major 114 | Component, and (b) serves only to enable use of the work with that 115 | Major Component, or to implement a Standard Interface for which an 116 | implementation is available to the public in source code form. A 117 | "Major Component", in this context, means a major essential component 118 | (kernel, window system, and so on) of the specific operating system 119 | (if any) on which the executable work runs, or a compiler used to 120 | produce the work, or an object code interpreter used to run it. 
121 | 122 | The "Corresponding Source" for a work in object code form means all 123 | the source code needed to generate, install, and (for an executable 124 | work) run the object code and to modify the work, including scripts to 125 | control those activities. However, it does not include the work's 126 | System Libraries, or general-purpose tools or generally available free 127 | programs which are used unmodified in performing those activities but 128 | which are not part of the work. For example, Corresponding Source 129 | includes interface definition files associated with source files for 130 | the work, and the source code for shared libraries and dynamically 131 | linked subprograms that the work is specifically designed to require, 132 | such as by intimate data communication or control flow between those 133 | subprograms and other parts of the work. 134 | 135 | The Corresponding Source need not include anything that users 136 | can regenerate automatically from other parts of the Corresponding 137 | Source. 138 | 139 | The Corresponding Source for a work in source code form is that 140 | same work. 141 | 142 | 2. Basic Permissions. 143 | 144 | All rights granted under this License are granted for the term of 145 | copyright on the Program, and are irrevocable provided the stated 146 | conditions are met. This License explicitly affirms your unlimited 147 | permission to run the unmodified Program. The output from running a 148 | covered work is covered by this License only if the output, given its 149 | content, constitutes a covered work. This License acknowledges your 150 | rights of fair use or other equivalent, as provided by copyright law. 151 | 152 | You may make, run and propagate covered works that you do not 153 | convey, without conditions so long as your license otherwise remains 154 | in force. 
You may convey covered works to others for the sole purpose 155 | of having them make modifications exclusively for you, or provide you 156 | with facilities for running those works, provided that you comply with 157 | the terms of this License in conveying all material for which you do 158 | not control copyright. Those thus making or running the covered works 159 | for you must do so exclusively on your behalf, under your direction 160 | and control, on terms that prohibit them from making any copies of 161 | your copyrighted material outside their relationship with you. 162 | 163 | Conveying under any other circumstances is permitted solely under 164 | the conditions stated below. Sublicensing is not allowed; section 10 165 | makes it unnecessary. 166 | 167 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 168 | 169 | No covered work shall be deemed part of an effective technological 170 | measure under any applicable law fulfilling obligations under article 171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 172 | similar laws prohibiting or restricting circumvention of such 173 | measures. 174 | 175 | When you convey a covered work, you waive any legal power to forbid 176 | circumvention of technological measures to the extent such circumvention 177 | is effected by exercising rights under this License with respect to 178 | the covered work, and you disclaim any intention to limit operation or 179 | modification of the work as a means of enforcing, against the work's 180 | users, your or third parties' legal rights to forbid circumvention of 181 | technological measures. 182 | 183 | 4. Conveying Verbatim Copies. 
184 | 185 | You may convey verbatim copies of the Program's source code as you 186 | receive it, in any medium, provided that you conspicuously and 187 | appropriately publish on each copy an appropriate copyright notice; 188 | keep intact all notices stating that this License and any 189 | non-permissive terms added in accord with section 7 apply to the code; 190 | keep intact all notices of the absence of any warranty; and give all 191 | recipients a copy of this License along with the Program. 192 | 193 | You may charge any price or no price for each copy that you convey, 194 | and you may offer support or warranty protection for a fee. 195 | 196 | 5. Conveying Modified Source Versions. 197 | 198 | You may convey a work based on the Program, or the modifications to 199 | produce it from the Program, in the form of source code under the 200 | terms of section 4, provided that you also meet all of these conditions: 201 | 202 | a) The work must carry prominent notices stating that you modified 203 | it, and giving a relevant date. 204 | 205 | b) The work must carry prominent notices stating that it is 206 | released under this License and any conditions added under section 207 | 7. This requirement modifies the requirement in section 4 to 208 | "keep intact all notices". 209 | 210 | c) You must license the entire work, as a whole, under this 211 | License to anyone who comes into possession of a copy. This 212 | License will therefore apply, along with any applicable section 7 213 | additional terms, to the whole of the work, and all its parts, 214 | regardless of how they are packaged. This License gives no 215 | permission to license the work in any other way, but it does not 216 | invalidate such permission if you have separately received it. 
217 | 218 | d) If the work has interactive user interfaces, each must display 219 | Appropriate Legal Notices; however, if the Program has interactive 220 | interfaces that do not display Appropriate Legal Notices, your 221 | work need not make them do so. 222 | 223 | A compilation of a covered work with other separate and independent 224 | works, which are not by their nature extensions of the covered work, 225 | and which are not combined with it such as to form a larger program, 226 | in or on a volume of a storage or distribution medium, is called an 227 | "aggregate" if the compilation and its resulting copyright are not 228 | used to limit the access or legal rights of the compilation's users 229 | beyond what the individual works permit. Inclusion of a covered work 230 | in an aggregate does not cause this License to apply to the other 231 | parts of the aggregate. 232 | 233 | 6. Conveying Non-Source Forms. 234 | 235 | You may convey a covered work in object code form under the terms 236 | of sections 4 and 5, provided that you also convey the 237 | machine-readable Corresponding Source under the terms of this License, 238 | in one of these ways: 239 | 240 | a) Convey the object code in, or embodied in, a physical product 241 | (including a physical distribution medium), accompanied by the 242 | Corresponding Source fixed on a durable physical medium 243 | customarily used for software interchange. 
244 | 245 | b) Convey the object code in, or embodied in, a physical product 246 | (including a physical distribution medium), accompanied by a 247 | written offer, valid for at least three years and valid for as 248 | long as you offer spare parts or customer support for that product 249 | model, to give anyone who possesses the object code either (1) a 250 | copy of the Corresponding Source for all the software in the 251 | product that is covered by this License, on a durable physical 252 | medium customarily used for software interchange, for a price no 253 | more than your reasonable cost of physically performing this 254 | conveying of source, or (2) access to copy the 255 | Corresponding Source from a network server at no charge. 256 | 257 | c) Convey individual copies of the object code with a copy of the 258 | written offer to provide the Corresponding Source. This 259 | alternative is allowed only occasionally and noncommercially, and 260 | only if you received the object code with such an offer, in accord 261 | with subsection 6b. 262 | 263 | d) Convey the object code by offering access from a designated 264 | place (gratis or for a charge), and offer equivalent access to the 265 | Corresponding Source in the same way through the same place at no 266 | further charge. You need not require recipients to copy the 267 | Corresponding Source along with the object code. If the place to 268 | copy the object code is a network server, the Corresponding Source 269 | may be on a different server (operated by you or a third party) 270 | that supports equivalent copying facilities, provided you maintain 271 | clear directions next to the object code saying where to find the 272 | Corresponding Source. Regardless of what server hosts the 273 | Corresponding Source, you remain obligated to ensure that it is 274 | available for as long as needed to satisfy these requirements. 
275 | 276 | e) Convey the object code using peer-to-peer transmission, provided 277 | you inform other peers where the object code and Corresponding 278 | Source of the work are being offered to the general public at no 279 | charge under subsection 6d. 280 | 281 | A separable portion of the object code, whose source code is excluded 282 | from the Corresponding Source as a System Library, need not be 283 | included in conveying the object code work. 284 | 285 | A "User Product" is either (1) a "consumer product", which means any 286 | tangible personal property which is normally used for personal, family, 287 | or household purposes, or (2) anything designed or sold for incorporation 288 | into a dwelling. In determining whether a product is a consumer product, 289 | doubtful cases shall be resolved in favor of coverage. For a particular 290 | product received by a particular user, "normally used" refers to a 291 | typical or common use of that class of product, regardless of the status 292 | of the particular user or of the way in which the particular user 293 | actually uses, or expects or is expected to use, the product. A product 294 | is a consumer product regardless of whether the product has substantial 295 | commercial, industrial or non-consumer uses, unless such uses represent 296 | the only significant mode of use of the product. 297 | 298 | "Installation Information" for a User Product means any methods, 299 | procedures, authorization keys, or other information required to install 300 | and execute modified versions of a covered work in that User Product from 301 | a modified version of its Corresponding Source. The information must 302 | suffice to ensure that the continued functioning of the modified object 303 | code is in no case prevented or interfered with solely because 304 | modification has been made. 
305 | 306 | If you convey an object code work under this section in, or with, or 307 | specifically for use in, a User Product, and the conveying occurs as 308 | part of a transaction in which the right of possession and use of the 309 | User Product is transferred to the recipient in perpetuity or for a 310 | fixed term (regardless of how the transaction is characterized), the 311 | Corresponding Source conveyed under this section must be accompanied 312 | by the Installation Information. But this requirement does not apply 313 | if neither you nor any third party retains the ability to install 314 | modified object code on the User Product (for example, the work has 315 | been installed in ROM). 316 | 317 | The requirement to provide Installation Information does not include a 318 | requirement to continue to provide support service, warranty, or updates 319 | for a work that has been modified or installed by the recipient, or for 320 | the User Product in which it has been modified or installed. Access to a 321 | network may be denied when the modification itself materially and 322 | adversely affects the operation of the network or violates the rules and 323 | protocols for communication across the network. 324 | 325 | Corresponding Source conveyed, and Installation Information provided, 326 | in accord with this section must be in a format that is publicly 327 | documented (and with an implementation available to the public in 328 | source code form), and must require no special password or key for 329 | unpacking, reading or copying. 330 | 331 | 7. Additional Terms. 332 | 333 | "Additional permissions" are terms that supplement the terms of this 334 | License by making exceptions from one or more of its conditions. 335 | Additional permissions that are applicable to the entire Program shall 336 | be treated as though they were included in this License, to the extent 337 | that they are valid under applicable law. 
If additional permissions 338 | apply only to part of the Program, that part may be used separately 339 | under those permissions, but the entire Program remains governed by 340 | this License without regard to the additional permissions. 341 | 342 | When you convey a copy of a covered work, you may at your option 343 | remove any additional permissions from that copy, or from any part of 344 | it. (Additional permissions may be written to require their own 345 | removal in certain cases when you modify the work.) You may place 346 | additional permissions on material, added by you to a covered work, 347 | for which you have or can give appropriate copyright permission. 348 | 349 | Notwithstanding any other provision of this License, for material you 350 | add to a covered work, you may (if authorized by the copyright holders of 351 | that material) supplement the terms of this License with terms: 352 | 353 | a) Disclaiming warranty or limiting liability differently from the 354 | terms of sections 15 and 16 of this License; or 355 | 356 | b) Requiring preservation of specified reasonable legal notices or 357 | author attributions in that material or in the Appropriate Legal 358 | Notices displayed by works containing it; or 359 | 360 | c) Prohibiting misrepresentation of the origin of that material, or 361 | requiring that modified versions of such material be marked in 362 | reasonable ways as different from the original version; or 363 | 364 | d) Limiting the use for publicity purposes of names of licensors or 365 | authors of the material; or 366 | 367 | e) Declining to grant rights under trademark law for use of some 368 | trade names, trademarks, or service marks; or 369 | 370 | f) Requiring indemnification of licensors and authors of that 371 | material by anyone who conveys the material (or modified versions of 372 | it) with contractual assumptions of liability to the recipient, for 373 | any liability that these contractual assumptions directly impose on 
374 | those licensors and authors. 375 | 376 | All other non-permissive additional terms are considered "further 377 | restrictions" within the meaning of section 10. If the Program as you 378 | received it, or any part of it, contains a notice stating that it is 379 | governed by this License along with a term that is a further 380 | restriction, you may remove that term. If a license document contains 381 | a further restriction but permits relicensing or conveying under this 382 | License, you may add to a covered work material governed by the terms 383 | of that license document, provided that the further restriction does 384 | not survive such relicensing or conveying. 385 | 386 | If you add terms to a covered work in accord with this section, you 387 | must place, in the relevant source files, a statement of the 388 | additional terms that apply to those files, or a notice indicating 389 | where to find the applicable terms. 390 | 391 | Additional terms, permissive or non-permissive, may be stated in the 392 | form of a separately written license, or stated as exceptions; 393 | the above requirements apply either way. 394 | 395 | 8. Termination. 396 | 397 | You may not propagate or modify a covered work except as expressly 398 | provided under this License. Any attempt otherwise to propagate or 399 | modify it is void, and will automatically terminate your rights under 400 | this License (including any patent licenses granted under the third 401 | paragraph of section 11). 402 | 403 | However, if you cease all violation of this License, then your 404 | license from a particular copyright holder is reinstated (a) 405 | provisionally, unless and until the copyright holder explicitly and 406 | finally terminates your license, and (b) permanently, if the copyright 407 | holder fails to notify you of the violation by some reasonable means 408 | prior to 60 days after the cessation. 
409 | 410 | Moreover, your license from a particular copyright holder is 411 | reinstated permanently if the copyright holder notifies you of the 412 | violation by some reasonable means, this is the first time you have 413 | received notice of violation of this License (for any work) from that 414 | copyright holder, and you cure the violation prior to 30 days after 415 | your receipt of the notice. 416 | 417 | Termination of your rights under this section does not terminate the 418 | licenses of parties who have received copies or rights from you under 419 | this License. If your rights have been terminated and not permanently 420 | reinstated, you do not qualify to receive new licenses for the same 421 | material under section 10. 422 | 423 | 9. Acceptance Not Required for Having Copies. 424 | 425 | You are not required to accept this License in order to receive or 426 | run a copy of the Program. Ancillary propagation of a covered work 427 | occurring solely as a consequence of using peer-to-peer transmission 428 | to receive a copy likewise does not require acceptance. However, 429 | nothing other than this License grants you permission to propagate or 430 | modify any covered work. These actions infringe copyright if you do 431 | not accept this License. Therefore, by modifying or propagating a 432 | covered work, you indicate your acceptance of this License to do so. 433 | 434 | 10. Automatic Licensing of Downstream Recipients. 435 | 436 | Each time you convey a covered work, the recipient automatically 437 | receives a license from the original licensors, to run, modify and 438 | propagate that work, subject to this License. You are not responsible 439 | for enforcing compliance by third parties with this License. 440 | 441 | An "entity transaction" is a transaction transferring control of an 442 | organization, or substantially all assets of one, or subdividing an 443 | organization, or merging organizations. 
If propagation of a covered 444 | work results from an entity transaction, each party to that 445 | transaction who receives a copy of the work also receives whatever 446 | licenses to the work the party's predecessor in interest had or could 447 | give under the previous paragraph, plus a right to possession of the 448 | Corresponding Source of the work from the predecessor in interest, if 449 | the predecessor has it or can get it with reasonable efforts. 450 | 451 | You may not impose any further restrictions on the exercise of the 452 | rights granted or affirmed under this License. For example, you may 453 | not impose a license fee, royalty, or other charge for exercise of 454 | rights granted under this License, and you may not initiate litigation 455 | (including a cross-claim or counterclaim in a lawsuit) alleging that 456 | any patent claim is infringed by making, using, selling, offering for 457 | sale, or importing the Program or any portion of it. 458 | 459 | 11. Patents. 460 | 461 | A "contributor" is a copyright holder who authorizes use under this 462 | License of the Program or a work on which the Program is based. The 463 | work thus licensed is called the contributor's "contributor version". 464 | 465 | A contributor's "essential patent claims" are all patent claims 466 | owned or controlled by the contributor, whether already acquired or 467 | hereafter acquired, that would be infringed by some manner, permitted 468 | by this License, of making, using, or selling its contributor version, 469 | but do not include claims that would be infringed only as a 470 | consequence of further modification of the contributor version. For 471 | purposes of this definition, "control" includes the right to grant 472 | patent sublicenses in a manner consistent with the requirements of 473 | this License. 
474 | 475 | Each contributor grants you a non-exclusive, worldwide, royalty-free 476 | patent license under the contributor's essential patent claims, to 477 | make, use, sell, offer for sale, import and otherwise run, modify and 478 | propagate the contents of its contributor version. 479 | 480 | In the following three paragraphs, a "patent license" is any express 481 | agreement or commitment, however denominated, not to enforce a patent 482 | (such as an express permission to practice a patent or covenant not to 483 | sue for patent infringement). To "grant" such a patent license to a 484 | party means to make such an agreement or commitment not to enforce a 485 | patent against the party. 486 | 487 | If you convey a covered work, knowingly relying on a patent license, 488 | and the Corresponding Source of the work is not available for anyone 489 | to copy, free of charge and under the terms of this License, through a 490 | publicly available network server or other readily accessible means, 491 | then you must either (1) cause the Corresponding Source to be so 492 | available, or (2) arrange to deprive yourself of the benefit of the 493 | patent license for this particular work, or (3) arrange, in a manner 494 | consistent with the requirements of this License, to extend the patent 495 | license to downstream recipients. "Knowingly relying" means you have 496 | actual knowledge that, but for the patent license, your conveying the 497 | covered work in a country, or your recipient's use of the covered work 498 | in a country, would infringe one or more identifiable patents in that 499 | country that you have reason to believe are valid. 
500 | 501 | If, pursuant to or in connection with a single transaction or 502 | arrangement, you convey, or propagate by procuring conveyance of, a 503 | covered work, and grant a patent license to some of the parties 504 | receiving the covered work authorizing them to use, propagate, modify 505 | or convey a specific copy of the covered work, then the patent license 506 | you grant is automatically extended to all recipients of the covered 507 | work and works based on it. 508 | 509 | A patent license is "discriminatory" if it does not include within 510 | the scope of its coverage, prohibits the exercise of, or is 511 | conditioned on the non-exercise of one or more of the rights that are 512 | specifically granted under this License. You may not convey a covered 513 | work if you are a party to an arrangement with a third party that is 514 | in the business of distributing software, under which you make payment 515 | to the third party based on the extent of your activity of conveying 516 | the work, and under which the third party grants, to any of the 517 | parties who would receive the covered work from you, a discriminatory 518 | patent license (a) in connection with copies of the covered work 519 | conveyed by you (or copies made from those copies), or (b) primarily 520 | for and in connection with specific products or compilations that 521 | contain the covered work, unless you entered into that arrangement, 522 | or that patent license was granted, prior to 28 March 2007. 523 | 524 | Nothing in this License shall be construed as excluding or limiting 525 | any implied license or other defenses to infringement that may 526 | otherwise be available to you under applicable patent law. 527 | 528 | 12. No Surrender of Others' Freedom. 529 | 530 | If conditions are imposed on you (whether by court order, agreement or 531 | otherwise) that contradict the conditions of this License, they do not 532 | excuse you from the conditions of this License. 
If you cannot convey a 533 | covered work so as to satisfy simultaneously your obligations under this 534 | License and any other pertinent obligations, then as a consequence you may 535 | not convey it at all. For example, if you agree to terms that obligate you 536 | to collect a royalty for further conveying from those to whom you convey 537 | the Program, the only way you could satisfy both those terms and this 538 | License would be to refrain entirely from conveying the Program. 539 | 540 | 13. Remote Network Interaction; Use with the GNU General Public License. 541 | 542 | Notwithstanding any other provision of this License, if you modify the 543 | Program, your modified version must prominently offer all users 544 | interacting with it remotely through a computer network (if your version 545 | supports such interaction) an opportunity to receive the Corresponding 546 | Source of your version by providing access to the Corresponding Source 547 | from a network server at no charge, through some standard or customary 548 | means of facilitating copying of software. This Corresponding Source 549 | shall include the Corresponding Source for any work covered by version 3 550 | of the GNU General Public License that is incorporated pursuant to the 551 | following paragraph. 552 | 553 | Notwithstanding any other provision of this License, you have 554 | permission to link or combine any covered work with a work licensed 555 | under version 3 of the GNU General Public License into a single 556 | combined work, and to convey the resulting work. The terms of this 557 | License will continue to apply to the part which is the covered work, 558 | but the work with which it is combined will remain governed by version 559 | 3 of the GNU General Public License. 560 | 561 | 14. Revised Versions of this License. 562 | 563 | The Free Software Foundation may publish revised and/or new versions of 564 | the GNU Affero General Public License from time to time. 
Such new versions 565 | will be similar in spirit to the present version, but may differ in detail to 566 | address new problems or concerns. 567 | 568 | Each version is given a distinguishing version number. If the 569 | Program specifies that a certain numbered version of the GNU Affero General 570 | Public License "or any later version" applies to it, you have the 571 | option of following the terms and conditions either of that numbered 572 | version or of any later version published by the Free Software 573 | Foundation. If the Program does not specify a version number of the 574 | GNU Affero General Public License, you may choose any version ever published 575 | by the Free Software Foundation. 576 | 577 | If the Program specifies that a proxy can decide which future 578 | versions of the GNU Affero General Public License can be used, that proxy's 579 | public statement of acceptance of a version permanently authorizes you 580 | to choose that version for the Program. 581 | 582 | Later license versions may give you additional or different 583 | permissions. However, no additional obligations are imposed on any 584 | author or copyright holder as a result of your choosing to follow a 585 | later version. 586 | 587 | 15. Disclaimer of Warranty. 588 | 589 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 590 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 594 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 595 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 597 | 598 | 16. Limitation of Liability. 
599 | 600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 608 | SUCH DAMAGES. 609 | 610 | 17. Interpretation of Sections 15 and 16. 611 | 612 | If the disclaimer of warranty and limitation of liability provided 613 | above cannot be given local legal effect according to their terms, 614 | reviewing courts shall apply local law that most closely approximates 615 | an absolute waiver of all civil liability in connection with the 616 | Program, unless a warranty or assumption of liability accompanies a 617 | copy of the Program in return for a fee. 618 | 619 | END OF TERMS AND CONDITIONS 620 | 621 | How to Apply These Terms to Your New Programs 622 | 623 | If you develop a new program, and you want it to be of the greatest 624 | possible use to the public, the best way to achieve this is to make it 625 | free software which everyone can redistribute and change under these terms. 626 | 627 | To do so, attach the following notices to the program. It is safest 628 | to attach them to the start of each source file to most effectively 629 | state the exclusion of warranty; and each file should have at least 630 | the "copyright" line and a pointer to where the full notice is found. 
631 | 632 | <one line to give the program's name and a brief idea of what it does.> 633 | Copyright (C) <year> <name of author> 634 | 635 | This program is free software: you can redistribute it and/or modify 636 | it under the terms of the GNU Affero General Public License as published 637 | by the Free Software Foundation, either version 3 of the License, or 638 | (at your option) any later version. 639 | 640 | This program is distributed in the hope that it will be useful, 641 | but WITHOUT ANY WARRANTY; without even the implied warranty of 642 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 643 | GNU Affero General Public License for more details. 644 | 645 | You should have received a copy of the GNU Affero General Public License 646 | along with this program. If not, see <https://www.gnu.org/licenses/>. 647 | 648 | Also add information on how to contact you by electronic and paper mail. 649 | 650 | If your software can interact with users remotely through a computer 651 | network, you should also make sure that it provides a way for users to 652 | get its source. For example, if your program is a web application, its 653 | interface could display a "Source" link that leads users to an archive 654 | of the code. There are many ways you could offer source, and different 655 | solutions will be better for different programs; see section 13 for the 656 | specific requirements. 657 | 658 | You should also get your employer (if you work as a programmer) or school, 659 | if any, to sign a "copyright disclaimer" for the program, if necessary. 660 | For more information on this, and how to apply and follow the GNU AGPL, see 661 | <https://www.gnu.org/licenses/>.
662 | -------------------------------------------------------------------------------- /MafAnnotator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import argparse 5 | import logging 6 | 7 | from AnnotatorCore import setsampleidsfileterfile 8 | from AnnotatorCore import setcancerhotspotsbaseurl 9 | from AnnotatorCore import setoncokbbaseurl 10 | from AnnotatorCore import setoncokbapitoken 11 | from AnnotatorCore import readCancerTypes 12 | from AnnotatorCore import validate_oncokb_token 13 | from AnnotatorCore import processalterationevents 14 | from AnnotatorCore import QueryType 15 | from AnnotatorCore import ReferenceGenome 16 | 17 | logging.basicConfig(level=logging.INFO) 18 | log = logging.getLogger('MafAnnotator') 19 | 20 | 21 | def main(argv): 22 | if argv.help: 23 | log.info( 24 | '\n' 25 | 'MafAnnotator.py -i -o [-p previous results] [-c ] ' 26 | '[-s sample list filter] [-t ] [-u oncokb-base-url] [-b oncokb api bear token] [-a] ' 27 | '[-q query type] [-r default reference genome] [-d include descriptions]\n' 28 | 'For definitions of the MAF format, please see https://docs.gdc.cancer.gov/Data/File_Formats/MAF_Format/\n\n' 29 | 'Essential MAF columns for querying HGVSp_Short and HGVSp(case insensitive):\n' 30 | ' Hugo_Symbol: Hugo gene symbol\n' 31 | ' Tumor_Sample_Barcode: sample ID\n' 32 | ' HGVSp(query type: HGVSp): protein change in HGVSp format\n' 33 | ' HGVSp_Short(query type: HGVSp_Short): protein change in HGVSp format using 1-letter amino-acid codes\n' 34 | 'Essential MAF columns for querying HGVSg(case insensitive):\n' 35 | ' Tumor_Sample_Barcode: sample ID\n' 36 | ' HGVSg: Genomic change in HGVSg format\n' 37 | 'Essential MAF columns for querying genomic change(case insensitive):\n' 38 | ' Tumor_Sample_Barcode: sample ID\n' 39 | ' Chromosome: Chromosome number\n' 40 | ' Start_Position: Mutation start coordinate\n' 41 | ' End_Position: Mutation end 
coordinate\n' 42 | ' Reference_Allele: The plus strand reference allele at this position\n' 43 | ' Tumor_Seq_Allele1: Primary data genotype for tumor sequencing (discovery) allele\n' 44 | ' Tumor_Seq_Allele2: Tumor sequencing (discovery) allele 2\n' 45 | 'Essential clinical columns:\n' 46 | ' SAMPLE_ID: sample ID\n' 47 | ' ONCOTREE_CODE: tumor type code from oncotree (http://oncotree.mskcc.org)\n' 48 | 'Cancer type will be assigned based on the following priority:\n' 49 | ' 1) ONCOTREE_CODE in clinical data file\n' 50 | ' 2) ONCOTREE_CODE exist in MAF\n' 51 | ' 3) default tumor type (-t)\n' 52 | 'Query type only allows the following values (case-insensitive):\n' 53 | ' - HGVSp_Short\n' 54 | ' It reads from column HGVSp_Short or Alteration\n' 55 | ' - HGVSp\n' 56 | ' It reads from column HGVSp or Alteration\n' 57 | ' - HGVSg\n' 58 | ' It reads from column HGVSg or Alteration\n' 59 | ' - Genomic_Change\n' 60 | ' It reads from columns Chromosome, Start_Position, End_Position, Reference_Allele, Tumor_Seq_Allele1 and Tumor_Seq_Allele2 \n' 61 | 'Reference Genome only allows the following values(case-insensitive):\n' 62 | ' - GRCh37\n' 63 | ' GRCh38\n' 64 | 'Default OncoKB base url is https://www.oncokb.org.\n' 65 | ) 66 | sys.exit() 67 | if argv.input_file == '' or argv.output_file == '' or argv.oncokb_api_bearer_token == '': 68 | required_params = [] 69 | if argv.input_file == '': 70 | required_params.append('-i') 71 | if argv.output_file == '': 72 | required_params.append('-o') 73 | if argv.oncokb_api_bearer_token == '': 74 | required_params.append('-b') 75 | 76 | log.error('The parameter(s) ' + ', '.join(required_params) + ' can not be empty') 77 | log.info('For help: python MafAnnotator.py -h') 78 | sys.exit(2) 79 | 80 | if argv.sample_ids_filter: 81 | setsampleidsfileterfile(argv.sample_ids_filter) 82 | if argv.cancer_hotspots_base_url: 83 | setcancerhotspotsbaseurl(argv.cancer_hotspots_base_url) 84 | if argv.oncokb_api_url: 85 | 
setoncokbbaseurl(argv.oncokb_api_url) 86 | setoncokbapitoken(argv.oncokb_api_bearer_token) 87 | 88 | cancertypemap = {} 89 | if argv.input_clinical_file: 90 | readCancerTypes(argv.input_clinical_file, cancertypemap) 91 | 92 | log.info('annotating %s ...' % argv.input_file) 93 | 94 | user_input_query_type = None 95 | if argv.query_type is not None: 96 | try: 97 | user_input_query_type = QueryType[argv.query_type.upper()] 98 | except KeyError: 99 | log.error( 100 | 'Query type is not acceptable. Only the following allows(case insensitive): HGVSp_Short, HGVSp, HGVSg, Genomic_Change') 101 | raise 102 | 103 | default_reference_genome = None 104 | if argv.default_reference_genome is not None: 105 | try: 106 | default_reference_genome = ReferenceGenome[argv.default_reference_genome.upper()] 107 | except KeyError: 108 | log.error( 109 | 'Reference genome is not acceptable. Only the following allows(case insensitive): GRCh37, GRCh38') 110 | raise 111 | 112 | validate_oncokb_token() 113 | 114 | processalterationevents(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type, 115 | cancertypemap, argv.annotate_hotspots, user_input_query_type, default_reference_genome, 116 | argv.include_descriptions) 117 | 118 | log.info('done!') 119 | 120 | 121 | if __name__ == "__main__": 122 | parser = argparse.ArgumentParser(add_help=False) 123 | parser.add_argument('-h', dest='help', action="store_true", default=False) 124 | parser.add_argument('-i', dest='input_file', default='', type=str) 125 | parser.add_argument('-o', dest='output_file', default='', type=str) 126 | parser.add_argument('-p', dest='previous_result_file', default='', type=str) 127 | parser.add_argument('-c', dest='input_clinical_file', default='', type=str) 128 | parser.add_argument('-s', dest='sample_ids_filter', default='', type=str) 129 | parser.add_argument('-t', dest='default_cancer_type', default='', type=str) 130 | parser.add_argument('-u', dest='oncokb_api_url', default='', 
type=str) 131 | parser.add_argument('-a', dest='annotate_hotspots', action="store_true", default=False) 132 | parser.add_argument('-v', dest='cancer_hotspots_base_url', default='', type=str) 133 | parser.add_argument('-b', dest='oncokb_api_bearer_token', default='', type=str) 134 | parser.add_argument('-q', dest='query_type', default=None, type=str) 135 | parser.add_argument('-r', dest='default_reference_genome', default=None, type=str) 136 | parser.add_argument('-d', dest='include_descriptions', action="store_true", default=False) 137 | parser.set_defaults(func=main) 138 | 139 | args = parser.parse_args() 140 | args.func(args) 141 | -------------------------------------------------------------------------------- /OncoKBPlots.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import re 5 | import argparse 6 | import logging 7 | import os 8 | import csv 9 | import matplotlib.pyplot as plt 10 | 11 | from AnnotatorCore import setsampleidsfileterfile 12 | from AnnotatorCore import readheaders 13 | from AnnotatorCore import geIndexOfHeader 14 | from AnnotatorCore import sampleidsfilter 15 | from AnnotatorCore import levels 16 | from AnnotatorCore import dxLevels 17 | from AnnotatorCore import pxLevels 18 | from AnnotatorCore import SAMPLE_HEADERS 19 | 20 | logging.basicConfig(level=logging.INFO) 21 | log = logging.getLogger('OncoKBPlots') 22 | 23 | 24 | def plotclinicalactionability(ax, annotatedclinicalfile, outfile, parameters): 25 | if os.path.isfile(outfile): 26 | os.remove(outfile) 27 | 28 | extlevels = levels + ["ONCOGENIC", "VUS"] 29 | if "levels" in parameters: 30 | extlevels = parameters["levels"] 31 | 32 | with open(annotatedclinicalfile, 'rU') as clinfile: 33 | reader = csv.reader(clinfile, delimiter='\t') 34 | headers = readheaders(reader) 35 | isample = geIndexOfHeader(headers, SAMPLE_HEADERS) 36 | ilevel = headers['HIGHEST_LEVEL'] 37 | ioncogenic = 
headers['ONCOGENIC_MUTATIONS'] 38 | icat = headers[parameters["catogerycolumn"].upper()] # e.g. "CANCER_TYPE" 39 | 40 | catsamplecount = {} 41 | catactionablesamplecount = {} 42 | oncogenicsamplecount = {} 43 | levelcatsamplecount = {} 44 | 45 | for row in reader: 46 | sample = row[isample] 47 | if sampleidsfilter and sample not in sampleidsfilter: 48 | continue 49 | 50 | cat = row[icat] 51 | if cat not in catsamplecount: 52 | catsamplecount[cat] = 0 53 | catsamplecount[cat] += 1 54 | 55 | if cat not in catactionablesamplecount: 56 | catactionablesamplecount[cat] = 0 57 | oncogenicsamplecount[cat] = 0 58 | 59 | level = row[ilevel] 60 | oncogenic = row[ioncogenic] 61 | 62 | exlevel = level 63 | 64 | if level in extlevels: 65 | catactionablesamplecount[cat] += 1 66 | oncogenicsamplecount[cat] += 1 67 | elif len(oncogenic.strip()) > 0: 68 | oncogenicsamplecount[cat] += 1 69 | exlevel = "ONCOGENIC" 70 | else: 71 | exlevel = "VUS" 72 | 73 | if exlevel not in levelcatsamplecount: 74 | levelcatsamplecount[exlevel] = {} 75 | if cat not in levelcatsamplecount[exlevel]: 76 | levelcatsamplecount[exlevel][cat] = 0 77 | levelcatsamplecount[exlevel][cat] += 1 78 | 79 | # plot 80 | catarray = [] # cancer types 81 | catactionabilityarray = [] # actionabiligy percentages per cancer type 82 | catoncogenicarray = [] # actionabiligy percentages per cancer type 83 | for cat in catsamplecount: 84 | if catsamplecount[cat] >= parameters["thresholdcat"]: 85 | catarray.append(cat) 86 | catactionabilityarray.append(catactionablesamplecount[cat] * 100.0 / catsamplecount[cat]) 87 | catoncogenicarray.append(oncogenicsamplecount[cat] * 100.0 / catsamplecount[cat]) 88 | 89 | ncat = len(catarray) 90 | order = reversed(sorted(range(ncat), key=lambda x: (catactionabilityarray[x], catoncogenicarray[x]))) 91 | drawplot(ax, 'OncoKB Actionability', extlevels, levelcatsamplecount, catarray, catsamplecount, order, 92 | parameters["thresholdcat"]) 93 | 94 | 95 | def plotimplications(ax, header, title, 
levels, annotatedclinicalfile, outfile, parameters): 96 | if os.path.isfile(outfile): 97 | os.remove(outfile) 98 | 99 | extlevels = levels 100 | if "levels" in parameters: 101 | extlevels = parameters["levels"] 102 | 103 | with open(annotatedclinicalfile, 'rU') as clinfile: 104 | reader = csv.reader(clinfile, delimiter='\t') 105 | headers = readheaders(reader) 106 | isample = headers['SAMPLE_ID'] 107 | ilevel = headers[header] 108 | icat = headers[parameters["catogerycolumn"].upper()] 109 | 110 | catsamplecount = {} 111 | catactionablesamplecount = {} 112 | levelcatsamplecount = {} 113 | 114 | for row in reader: 115 | sample = row[isample] 116 | if sampleidsfilter and sample not in sampleidsfilter: 117 | continue 118 | 119 | cat = row[icat] 120 | if cat not in catsamplecount: 121 | catsamplecount[cat] = 0 122 | catsamplecount[cat] += 1 123 | 124 | if cat not in catactionablesamplecount: 125 | catactionablesamplecount[cat] = 0 126 | 127 | level = row[ilevel] 128 | 129 | exlevel = level 130 | 131 | if level in extlevels: 132 | catactionablesamplecount[cat] += 1 133 | else: 134 | exlevel = "Other" 135 | 136 | if exlevel not in levelcatsamplecount: 137 | levelcatsamplecount[exlevel] = {} 138 | if cat not in levelcatsamplecount[exlevel]: 139 | levelcatsamplecount[exlevel][cat] = 0 140 | levelcatsamplecount[exlevel][cat] += 1 141 | 142 | # plot 143 | catarray = [] # cancer types 144 | catactionabilityarray = [] # actionabiligy percentages per cancer type 145 | for cat in catsamplecount: 146 | if catsamplecount[cat] >= parameters["thresholdcat"]: 147 | catarray.append(cat) 148 | catactionabilityarray.append(catactionablesamplecount[cat] * 100.0 / catsamplecount[cat]) 149 | 150 | ncat = len(catarray) 151 | order = reversed(sorted(range(ncat), key=lambda x: (catactionabilityarray[x]))) 152 | drawplot(ax, title, extlevels, levelcatsamplecount, catarray, catsamplecount, order, parameters["thresholdcat"]) 153 | 154 | 155 | def drawplot(ax, title, extlevels, 
levelcatsamplecount, catarray, catsamplecount, order, thresholdcat): 156 | # level colors 157 | levelcolors = { 158 | 'LEVEL_1': '#33A02C', 159 | 'LEVEL_2': '#1F78B4', 160 | 'LEVEL_3A': '#984EA3', 161 | 'LEVEL_3B': '#BE98CE', 162 | 'LEVEL_4': '#a8a8a8', 163 | 'LEVEL_R1': '#EE3424', 164 | 'LEVEL_R2': '#F79A92', 165 | 166 | 'LEVEL_Dx1': '#33A02C', 167 | 'LEVEL_Dx2': '#1F78B4', 168 | 'LEVEL_Dx3': '#984EA3', 169 | 170 | 'LEVEL_Px1': '#33A02C', 171 | 'LEVEL_Px2': '#1F78B4', 172 | 'LEVEL_Px3': '#984EA3', 173 | 174 | 'ONCOGENIC': '#ffdab9', 175 | 'VUS': '#d1d1d1', 176 | 'Other': 'grey' 177 | } 178 | 179 | # level legend 180 | levellegend = { 181 | 'LEVEL_1': 'Level 1', 182 | 'LEVEL_2': 'Level 2', 183 | 'LEVEL_3A': 'Level 3A', 184 | 'LEVEL_3B': 'Level 3B', 185 | 'LEVEL_4': 'Level 4', 186 | 'LEVEL_R1': 'Level R1', 187 | 'LEVEL_R2': 'Level R2', 188 | 189 | 'LEVEL_Dx1': 'Level Dx1', 190 | 'LEVEL_Dx2': 'Level Dx2', 191 | 'LEVEL_Dx3': 'Level Dx3', 192 | 193 | 'LEVEL_Px1': 'Level Px1', 194 | 'LEVEL_Px2': 'Level Px2', 195 | 'LEVEL_Px3': 'Level Px3', 196 | 197 | 'ONCOGENIC': 'Oncogenic, no level', 198 | 'VUS': 'VUS', 199 | 'Other': 'Other' 200 | } 201 | 202 | ncat = len(catarray) 203 | if ncat > 0: 204 | catarray = [catarray[i] for i in order] 205 | 206 | ind = range(ncat) 207 | 208 | legends = [] 209 | plts = [] 210 | accumlevelcancerperc = [0] * ncat 211 | for level in extlevels: 212 | if level not in levelcatsamplecount: 213 | continue 214 | 215 | levelcancerperc = [0] * ncat 216 | for k in ind: 217 | cat = catarray[k] 218 | if catsamplecount[cat] < thresholdcat: 219 | continue 220 | if cat in levelcatsamplecount[level]: 221 | levelcancerperc[k] = levelcatsamplecount[level][cat] * 100.0 / catsamplecount[cat] 222 | 223 | width = 0.75 224 | plts = [ax.bar(ind, levelcancerperc, width, color=levelcolors[level], bottom=accumlevelcancerperc)] + plts 225 | legends = [levellegend[level]] + legends 226 | accumlevelcancerperc = list(map(sum, zip(accumlevelcancerperc, levelcancerperc))) 
227 | 228 | ax = plt.gca() 229 | ax.set_axisbelow(True) 230 | ax.set_aspect(0.1) 231 | 232 | ax.tick_params(axis='y', which='major', labelsize=6) 233 | ax.set_ylabel('% of samples', fontsize=6) 234 | ax.set_title(title, fontsize=8) 235 | ax.set_xticks([i + 0.5 for i in ind]) 236 | ax.set_xticklabels(catarray, rotation=60, ha="right", fontsize=4) 237 | # plt.yticks(np.arange(0, 81, 10)) 238 | ax.legend(plts, legends, fontsize=6, bbox_to_anchor=(1.01, 1), loc="upper left") 239 | 240 | 241 | def main(argv): 242 | params = { 243 | "catogerycolumn": argv.catogery_column, # -c 244 | "thresholdcat": argv.threshold_cat, # -n 245 | } 246 | if argv.help: 247 | log.info( 248 | '\n' 249 | 'OncoKBPlots.py -i -o [-c ] [-s sample list filter] [-n threshold of # samples in a category] [-l comma separated levels to include]\n' 251 | ' Essential clinical columns:\n' 252 | ' SAMPLE_ID: sample ID\n' 253 | ' HIGHEST_LEVEL: Highest OncoKB levels\n' 254 | ' Supported levels (-l): \n' 255 | ' LEVEL_1,LEVEL_2,LEVEL_3A,LEVEL_3B,LEVEL_4,ONCOGENIC,VUS' 256 | ) 257 | sys.exit() 258 | if argv.input_file == '' or argv.output_file == '': 259 | required_params = [] 260 | if argv.input_file == '': 261 | required_params.append('-i') 262 | if argv.output_file == '': 263 | required_params.append('-o') 264 | 265 | log.error('The parameter(s) ' + ', '.join(required_params) + ' can not be empty') 266 | log.info('for help: python OncoKBPlots.py -h') 267 | sys.exit(2) 268 | if argv.sample_ids_filter: 269 | setsampleidsfileterfile(argv.sample_ids_filter) 270 | if argv.levels: 271 | params["levels"] = re.split(',', argv.levels) 272 | 273 | log.info('annotating %s ...' 
% argv.input_file) 274 | fig, (ax1, ax2, ax3) = plt.subplots(3, 1) 275 | 276 | plotclinicalactionability(ax1, argv.input_file, argv.output_file, params) 277 | 278 | # ax.yaxis.grid(linestyle="dotted", color="lightgray") # horizontal lines 279 | # plt.margins(0.01) 280 | 281 | plotclinicalactionability(ax1, args.input_file, args.output_file, params) 282 | plotimplications(ax2, 'HIGHEST_DX_LEVEL', 'OncoKB Diagnostic Implications', dxLevels, args.input_file, 283 | argv.output_file, params) 284 | plotimplications(ax3, 'HIGHEST_PX_LEVEL', 'OncoKB Prognostic Implications', pxLevels, args.input_file, 285 | argv.output_file, params) 286 | 287 | plt.subplots_adjust(left=0.2, bottom=0.3) 288 | plt.gcf().text(0.90, 0.1, "Generated by OncoKB\n[Chakravarty et al., JCO PO 2017]", fontsize=6, 289 | horizontalalignment='right', verticalalignment='bottom') 290 | fig.tight_layout() 291 | fig.savefig(argv.output_file, bbox_inches='tight') 292 | 293 | log.info('done!') 294 | 295 | 296 | if __name__ == "__main__": 297 | parser = argparse.ArgumentParser(add_help=False) 298 | parser.add_argument('-h', dest='help', action="store_true", default=False) 299 | parser.add_argument('-i', dest='input_file', default='', type=str) 300 | parser.add_argument('-o', dest='output_file', default='', type=str) 301 | parser.add_argument('-c', dest='catogery_column', default='CANCER_TYPE', type=str) 302 | parser.add_argument('-s', dest='sample_ids_filter', default='', type=str) 303 | parser.add_argument('-n', dest='threshold_cat', default=0, type=int) 304 | parser.add_argument('-l', dest='levels', default='', type=str) 305 | parser.set_defaults(func=main) 306 | 307 | args = parser.parse_args() 308 | args.func(args) 309 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ## UPDATE: 3 | - v3.4 allows you to include descriptions into the annotated files with `-d` parameter. 
4 | - When annotating genomic change, HGVSg, three additional columns will be added. `ONCOKB_HUGO_SYMBOL`, `ONCOKB_PROTEIN_CHANGE` and `ONCOKB_CONSEQUENCE` 5 | - See [Columns added section](#columns-added) for more details 6 | 7 | # oncokb-annotator 8 | API token required, please see [OncoKB™ API section](#oncokb-api) for more information 9 | 10 | ## Status 11 | 12 | [![Run all python tests](https://github.com/oncokb/oncokb-annotator/workflows/Run%20all%20python%20tests/badge.svg)](https://github.com/oncokb/oncokb-annotator/actions?query=workflow%3A%22Run+all+python+tests%22) [![Compare Annotation](https://github.com/oncokb/oncokb-annotator/workflows/Compare%20Annotation/badge.svg)](https://github.com/oncokb/oncokb-annotator/actions?query=workflow%3A%22Compare+Annotation%22) 13 | 14 | ## Install dependencies 15 | For python 3 16 | ``` 17 | pip install -r requirements/common.txt -r requirements/pip3.txt 18 | ``` 19 | 20 | For python 2.7 21 | ``` 22 | pip install -r requirements/common.txt -r requirements/pip2.7.txt 23 | ``` 24 | 25 | 26 | ## Usage 27 | Example input files are under [data](data). An example script is here: [example.sh](example.sh) 28 | 29 | ### MAF 30 | Annotates variants in MAF(https://docs.gdc.cancer.gov/Data/File_Formats/MAF_Format/) with OncoKB™ annotation. Supports both python2 and python3. 31 | Get more details on the command line using `python MafAnnotator.py -h`. 32 | 33 | Since OncoKB Annotator only supports MAF files, one option is to use [vcf2maf](https://github.com/mskcc/vcf2maf/) for conversion before using the `MafAnnotator` script. 34 | Note that OncoKB’s canonical transcripts may differ from Ensembl’s, so it’s important to use the `--custom-enst` option with vcf2maf. You can download the latest transcript IDs from OncoKB’s [Cancer Gene List page](https://www.oncokb.org/cancer-genes), but be sure to preprocess the list to make it compatible with vcf2maf. 
35 | 36 | #### Atypical Alteration 37 | You can still use MAF format to annotate atypical alterations, such as MSI-H, TMB-H, EGFR vIII. Please see more examples [HERE](data/example_atypical_alterations.txt). 38 | 39 | ### Copy Number Alteration 40 | #### Use GISTIC data format 41 | We use GISTIC 2.0 format by default. For more information, please see https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#discrete-copy-number-data. Examples are available [HERE](data/example_cna.txt). 42 | Columns `Locus ID` and `Cytoband` are not required. 43 | #### Individual CNA 44 | You can also list copy number alterations individually by specifying `-f individual`; please see examples [HERE](data/example_individual_cna.txt). 45 | 46 | Get more details on the command line using `python CnaAnnotator.py -h`. 47 | 48 | ### Fusion 49 | OncoKB™ offers to annotate functional fusions. 50 | The fusion format for intragenic deletion is `GENE-intragenic` or `GENE-GENE`. 51 | For other fusions, please use `GENEA-GENEB` or `GENEA-GENEB Fusion`. 52 | 53 | Get more details on the command line using `python FusionAnnotator.py -h`. 54 | 55 | ### Structural Variant 56 | OncoKB™ offers to annotate structural variants. 57 | The types supported are DELETION, TRANSLOCATION, DUPLICATION, INSERTION, INVERSION, FUSION, UNKNOWN. 58 | All other types will be converted to UNKNOWN. 59 | 60 | All structural variants with two different gene partners will be considered functional fusions. 61 | 62 | Get more details on the command line using `python StructuralVariantAnnotator.py -h`. 63 | 64 | ### Clinical Data (Combine MAF+CNA+Fusion) 65 | You can combine all annotations at the sample/patient level using the clinical data annotator. 66 | 67 | Get more details on the command line using `python ClinicalDataAnnotator.py -h`.
68 | 69 | ### Annotate with HGVSp_Short, HGVSp, HGVSg or Genomic Change 70 | OncoKB™ MafAnnotator supports annotating the alteration with HGVSp, HGVSp_Short, HGVSg or Genomic Change format. Please specify the query type with the -q parameter. 71 | The acceptable values are HGVSp_Short, HGVSp, HGVSg and Genomic_Change (case-insensitive). Please see [example.sh](example.sh) for examples. 72 | If you do not specify a query type, the MafAnnotator will try to figure out the query type based on the headers. 73 | 74 | #### For HGVSp_Short 75 | The annotator takes alteration from the column HGVSp_Short or Alteration 76 | 77 | #### For HGVSp 78 | The annotator takes alteration from the column HGVSp or Alteration 79 | 80 | #### For HGVSg 81 | The annotator takes alteration from the column HGVSg or Alteration 82 | 83 | #### For Genomic_Change 84 | The annotator takes genomic change from columns Chromosome, Start_Position, End_Position, Reference_Allele, Tumor_Seq_Allele1 (optional) and Tumor_Seq_Allele2. 85 | Typically Tumor_Seq_Allele1 is the reference allele, Tumor_Seq_Allele2 is the variant allele. This is why Tumor_Seq_Allele1 is optional. 86 | The annotator uses both if the value is different from Reference_Allele. Tumor_Seq_Allele2 has higher priority than Tumor_Seq_Allele1. 87 | 88 | Annotation with Genomic_Change is relatively slow. We need to annotate the variant first with GenomeNexus (https://www.genomenexus.org/) and then get annotations one by one. There is a plan to improve this method. If you are annotating a lot of data, please prioritize using other query types if applicable. 89 | 90 | 91 | ### Annotate with different reference genomes (GRCh37, GRCh38) 92 | OncoKB™ MafAnnotator supports annotating the alteration with reference genome GRCh37 and GRCh38. 93 | 94 | The annotator will get the reference genome from MAF file column NCBI_Build or Reference_Genome. 95 | If there is no reference genome specified in the file, we will use the default reference genome given through the -r parameter.
96 | 97 | You can specify the default reference genome using the -r parameter (this is only applicable to MafAnnotator.py). 98 | The acceptable values are GRCh37, GRCh38 (case-insensitive). 99 | 100 | If neither value is specified, the annotator will use the OncoKB™ default reference genome, which is GRCh37. 101 | 102 | 103 | ## Levels of Evidence 104 | Introducing [Simplified OncoKB™ Levels of Evidence](https://www.oncokb.org/levels): 105 | - New Level 2, defined as “Standard care biomarker recommended by the NCCN or other expert panels predictive of response to an FDA-approved drug in this indication” (formerly Level 2A). 106 | - Unified Level 3B, defined as “Standard care or investigational biomarker predictive of response to an FDA-approved or investigational drug in another indication” (combination of previous Levels 2B and 3B). 107 | 108 | We have implemented these changes for 2 reasons: 109 | - To be consistent with the [Joint Consensus Recommendation by AMP, ASCO and CAP](https://www.sciencedirect.com/science/article/pii/S1525157816302239?via%3Dihub) and the [ESMO Scale for Clinical Actionability of molecular Targets (ESCAT)](https://academic.oup.com/annonc/article/29/9/1895/5076792?searchresult=1) 110 | - To reflect the clinical data that demonstrates patients with investigational predictive biomarkers for a specific tumor type based on compelling clinical evidence (currently Level 3A) are more likely to experience clinical benefit compared to patients with predictive biomarkers that are considered standard care in a different tumor type (previously Level 2B, now combined into Level 3B). 111 | 112 | 113 | ## OncoKB™ API 114 | When you run `MafAnnotator.py`, `FusionAnnotator.py` and `CnaAnnotator.py`, you need a token before accessing the OncoKB™ data via its web API. Please visit [OncoKB™ Data Access Page](https://www.oncokb.org/dataAccess) for more information about how to register an account and get an OncoKB™ API token.
115 | With the token listed under [OncoKB™ Account Settings Page](https://www.oncokb.org/account/settings), you could use it in the following format. 116 | ``` 117 | python ${FILE_NAME.py} -i ${INPUT_FILE} -o ${OUTPUT_FILE} -b ${ONCOKB_API_TOKEN} 118 | ``` 119 | 120 | 121 | ## Columns added 122 | ### MafAnnotator/CnaAnnotator/StructuralVariantAnnotator/FusionAnnotator 123 | | Column | Conditions | Possible Values | Description | 124 | |-----------------------------|----------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 125 | | ANNOTATED | | True, False | Whether the variant is annotated by OncoKB successfully. | 126 | | ONCOKB_HUGO_SYMBOL | Only added when annotating genomic change or HGVSg | | When annotating genomic change, we obtained gene hugo symbol from GenomeNexus. This can be cross-referenced with your own gene name. | 127 | | ONCOKB_PROTEIN_CHANGE | Only added when annotating genomic change or HGVSg | | When annotating genomic change, we obtained alteration protein change from GenomeNexus. This can be cross-referenced with your own protein change. | 128 | | ONCOKB_CONSEQUENCE | Only added when annotating genomic change or HGVSg | | When annotating genomic change, we obtained alteration consequence from GenomeNexus. This can be cross-referenced with your own consequence/Variant Class. | 129 | | GENE_IN_ONCOKB | | True, False | Whether the gene has been curated by the OncoKB Team. | 130 | | VARIANT_IN_ONCOKB | | True, False | Whether the variant has been curated by the OncoKB Team. Note: when a variant does not exist, it may still have annotations. 
| 131 | | MUTATION_EFFECT | | Gain-of-function, Likely Gain-of-function, Loss-of-function, Likely Loss-of-function, Switch-of-function, Likely Switch-of-function, Neutral, Likely Neutral, Inconclusive, Unknown | The biological effect of a mutation/alteration on the protein function that gives rise to changes in the biological properties of cells expressing the mutant/altered protein compared to cells expressing the wildtype protein. | 132 | | MUTATION_EFFECT_CITATIONS | | PMID, Abstract, Website link | All citations related to the biological effect. | 133 | | ONCOGENIC | | Oncogenic, Likely Oncogenic, Likely Neutral, Inconclusive, Unknown, Resistance | In OncoKB™, “oncogenic” is defined as “referring to the ability to induce or cause cancer” as described in the second edition of The Biology of Cancer by Robert Weinberg (2014). | 134 | | LEVEL_* | | Therapeutic implications | The leveled therapeutic implications. | 135 | | HIGHEST_LEVEL | | LEVEL_1, LEVEL_2, LEVEL_3A, LEVEL_3B, LEVEL_4, LEVEL_R1, LEVEL_R2 | The highest level of evidence for therapeutic implications. Order: LEVEL_R1 > LEVEL_1 > LEVEL_2 > LEVEL_3A > LEVEL_3B > LEVEL_4 > LEVEL_R2 | 136 | | HIGHEST_SENSITIVE_LEVEL | | LEVEL_1, LEVEL_2, LEVEL_3A, LEVEL_3B, LEVEL_4 | The highest sensitive level of evidence for therapeutic implications. Order: LEVEL_1 > LEVEL_2 > LEVEL_3A > LEVEL_3B > LEVEL_4 | 137 | | HIGHEST_RESISTANCE_LEVEL | | LEVEL_R1, LEVEL_R2 | The highest resistance level of evidence for therapeutic implications. Order: LEVEL_R1 > LEVEL_R2 | 138 | | TX_CITATIONS | | PMID, Abstract, Website link | All citations related to therapeutic implications. | 139 | | LEVEL_Dx* | | Tumor type the level of evidence is assigned to | The leveled diagnostic implications. | 140 | | HIGHEST_DX_LEVEL | | LEVEL_Dx1, LEVEL_Dx2, LEVEL_Dx3 | The highest level of evidence for diagnostic implications. | 141 | | DX_CITATIONS | | PMID, Abstract, Website link | All citations related to diagnostic implications. 
| 142 | LEVEL_Px* | | Tumor type the level of evidence is assigned to | The leveled prognostic implications. | 143 | | HIGHEST_PX_LEVEL | | LEVEL_Px1, LEVEL_Px2, LEVEL_Px3 | The highest level of evidence for prognostic implications. | 144 | | PX_CITATIONS | | PMID, Abstract, Website link | All citations related to prognostic implications. | 145 | | GENE_SUMMARY | Only when parameter -d is specified | | Brief overview of the gene and its role in cancer | 146 | | VARIANT_SUMMARY | Only when parameter -d is specified | | Variant summary describes the variant oncogenicity, last review if it is VUS | 147 | | TUMOR_TYPE_SUMMARY | Only when parameter -d is specified | | Tumor type summary describes the therapeutic implication that applies to the indication | 148 | | DIAGNOSTIC_SUMMARY | Only when parameter -d is specified | | Diagnostic summary that applies to the indication, for hematologic malignancies only | 149 | | PROGNOSTIC_SUMMARY | Only when parameter -d is specified | | Prognostic summary that applies to the indication, for hematologic malignancies only | 150 | | MUTATION_EFFECT_DESCRIPTION | Only when parameter -d is specified | | The mutation effect description provides a brief overview of the biological and oncogenic effect of the VPS and includes appropriate references to peer-reviewed literature. | 151 | 152 | ### ClinicalDataAnnotator 153 | Please see description above for columns LEVEL_*, HIGHEST_LEVEL, HIGHEST_SENSITIVE_LEVEL, HIGHEST_RESISTANCE_LEVEL, LEVEL_Dx*, HIGHEST_DX_LEVEL, LEVEL_Px*, HIGHEST_PX_LEVEL. 154 | Besides these columns, the following columns will also be added. 155 | 156 | | Column | Description | 157 | |-----------------------------------------------------|-----------------------------------------------------------------------------| 158 | | ONCOGENIC_MUTATIONS | The list of mutations that are Oncogenic or Likely Oncogenic. | 159 | | #ONCOGENIC_MUTATIONS | Number of oncogenic mutations. 
| 160 | | RESISTANCE_MUTATIONS | The list of resistance mutations. | 161 | | #RESISTANCE_MUTATIONS | Number of resistance mutations. | 162 | | #MUTATIONS_WITH_SENSITIVE_THERAPEUTIC_IMPLICATIONS | Number of mutations in the sample with sensitive therapeutic implications. | 163 | | #MUTATIONS_WITH_RESISTANCE_THERAPEUTIC_IMPLICATIONS | Number of mutations in the sample with resistance therapeutic implications. | 164 | | #MUTATIONS_WITH_DIAGNOSTIC_IMPLICATIONS | Number of mutations in the sample with diagnostic implications. | 165 | | #MUTATIONS_WITH_PROGNOSTIC_IMPLICATIONS | Number of mutations in the sample with prognostic implications. | 166 | | #MUTATIONS | Number of mutations in the sample. | 167 | ## Questions? 168 | The best way is to email contact@oncokb.org, so all our team members can help. 169 | -------------------------------------------------------------------------------- /StructuralVariantAnnotator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import argparse 5 | import logging 6 | 7 | from AnnotatorCore import setsampleidsfileterfile 8 | from AnnotatorCore import setcancerhotspotsbaseurl 9 | from AnnotatorCore import setoncokbbaseurl 10 | from AnnotatorCore import setoncokbapitoken 11 | from AnnotatorCore import readCancerTypes 12 | from AnnotatorCore import validate_oncokb_token 13 | from AnnotatorCore import process_sv 14 | 15 | logging.basicConfig(level=logging.INFO) 16 | log = logging.getLogger('StructuralVariantAnnotator') 17 | 18 | 19 | def main(argv): 20 | if argv.help: 21 | log.info( 22 | '\n' 23 | 'StructuralVariantAnnotator.py -i -o ' 24 | '[-p previous results] [-c ] [-s sample list filter] [-t ] ' 25 | '[-u ] [-b ] [-d include descriptions]\n' 26 | ' Essential structural variant columns (case insensitive):\n' 27 | ' GENEA: Hugo gene symbol for gene A\n' 28 | ' GENEB: Hugo gene symbol for gene B\n' 29 | ' SV_TYPE: Structural variant type. 
Available values: DELETION, TRANSLOCATION, DUPLICATION, INSERTION, INVERSION, FUSION, UNKNOWN. Other type will be converted to UNKNOWN\n' 30 | ' TUMOR_SAMPLE_BARCODE: sample ID\n' 31 | ' Essential clinical columns:\n' 32 | ' SAMPLE_ID: sample ID\n' 33 | ' ONCOTREE_CODE: tumor type code from oncotree (oncotree.mskcc.org)\n' 34 | ' Cancer type will be assigned based on the following priority:\n' 35 | ' 1) ONCOTREE_CODE in clinical data file\n' 36 | ' 2) ONCOTREE_CODE exist in structural variant\n' 37 | ' 3) default tumor type (-t)\n' 38 | ' Default OncoKB base url is https://www.oncokb.org' 39 | ) 40 | sys.exit() 41 | if argv.input_file == '' or argv.output_file == '' or argv.oncokb_api_bearer_token == '': 42 | required_params = [] 43 | if argv.input_file == '': 44 | required_params.append('-i') 45 | if argv.output_file == '': 46 | required_params.append('-o') 47 | if argv.oncokb_api_bearer_token == '': 48 | required_params.append('-b') 49 | 50 | log.error('The parameter(s) ' + ', '.join(required_params) + ' can not be empty') 51 | log.info('for help: python StructuralVariantAnnotator.py -h') 52 | sys.exit(2) 53 | if argv.sample_ids_filter: 54 | setsampleidsfileterfile(argv.sample_ids_filter) 55 | if argv.cancer_hotspots_base_url: 56 | setcancerhotspotsbaseurl(argv.cancer_hotspots_base_url) 57 | if argv.oncokb_api_url: 58 | setoncokbbaseurl(argv.oncokb_api_url) 59 | setoncokbapitoken(argv.oncokb_api_bearer_token) 60 | 61 | cancertypemap = {} 62 | if argv.input_clinical_file: 63 | readCancerTypes(argv.input_clinical_file, cancertypemap) 64 | 65 | validate_oncokb_token() 66 | 67 | log.info('annotating %s ...' 
% argv.input_file) 68 | process_sv(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type, cancertypemap, 69 | argv.include_descriptions) 70 | 71 | log.info('done!') 72 | 73 | 74 | if __name__ == "__main__": 75 | parser = argparse.ArgumentParser(add_help=False) 76 | # ArgumentParser doesn't accept "store_true" and "type=" at the same time. 77 | parser.add_argument('-h', dest='help', action="store_true", default=False) 78 | parser.add_argument('-i', dest='input_file', default='', type=str) 79 | parser.add_argument('-o', dest='output_file', default='', type=str) 80 | parser.add_argument('-p', dest='previous_result_file', default='', type=str) 81 | parser.add_argument('-c', dest='input_clinical_file', default='', type=str) 82 | parser.add_argument('-s', dest='sample_ids_filter', default=None, type=str) 83 | parser.add_argument('-t', dest='default_cancer_type', default='', type=str) 84 | parser.add_argument('-u', dest='oncokb_api_url', default='', type=str) 85 | parser.add_argument('-v', dest='cancer_hotspots_base_url', default='', type=str) 86 | parser.add_argument('-b', dest='oncokb_api_bearer_token', default='', type=str) 87 | parser.add_argument('-d', dest='include_descriptions', action="store_true", default=False) 88 | parser.set_defaults(func=main) 89 | 90 | args = parser.parse_args() 91 | args.func(args) 92 | -------------------------------------------------------------------------------- /actionability_functions_msi_tmb_manuscript_R.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ### Annotate IMPACT files using oncokb-annotator ### 4 | 5 | ### Chakravarty D, Gao J, Phillips SM, et al. OncoKB: A Precision Oncology Knowledge Base. JCO Precis Oncol. 2017;2017:PO.17.00011. 
doi:10.1200/PO.17.00011 ### 6 | 7 | ### Actionability Functions ### 8 | # Collection of functions used to clean, process, and analyze actionability data 9 | 10 | ### Input parameters 11 | 12 | # cna_df: OncoKB annotated IMPACT CNA data 13 | # mut_df: OncoKB annotated IMPACT mutation data 14 | # fus_df: OncoKB annotated IMPACT fusion data 15 | # clin_df: OncoKB annotated IMPACT clinical sample data 16 | # data_freeze: Sample data, must include *SAMPLE_ID*, group_col, and consent_col 17 | # group_col: Column name for the groups (cancer types) 18 | # consent_col: Column name for 12-245 Part C consent status (YES/NO/NA) 19 | # path_df: Pathway data, must include gene and corresponding pathway columns (in that order) 20 | # tsg_list: List of tumor suppressor genes (no header) 21 | # fusion_list: List of genes to isolate from fusion partners (i.e. NTRK1-LMNA fusion becomes NTRK1 fusion) 22 | # prop_level_df: Output from action_levels_barplot_fun actionability_levels_barplot_table.txt 23 | # alt_final_df: Output from action_alterations_barplot_fun actionability_master_alterations_table.txt 24 | # alt_min: Minimum alteration percentage required in one cancer type to visualize alteration on main plot (default 1) 25 | # status: Include only somatic mutations, only germline mutations, or both (options: somatic, germline, both) 26 | # gene_order: List of genes for gene order, genes not included will be ordered by pathway following this list (no header) 27 | # only_highest_level: TRUE/FALSE, If true only visualize the highest level of evidence genes in main plot 28 | # msi_tmb_status: TRUE/FALSE, If true include Level 1 MSI/TMB status in actionability barplot, removes MSI/TMB in all other plots 29 | # msi_tmb_df: MSI/TMB annotated file (atypical alterations), visualizes MSI/TMB level 1 for actionability barplot, 30 | # removes all samples in file for all other plots 31 | 32 | 33 | ### 34 | 35 | 36 | # Load libraries 37 | if (!require('tidyverse')) install.packages('tidyverse'); 
library(tidyverse) 38 | if (!require('cowplot')) install.packages('cowplot'); library(cowplot) 39 | if (!require('reshape2')) install.packages('reshape2'); library(reshape2) 40 | 41 | # Collapse oncogenic alterations: keep rows whose `oncogenic` value contains "Oncogenic" and join them per sample into one "<alteration_type>_oncogenic" string column 42 | collapse_oncogenic <- function(data_frame, sample_column, alteration_type){ 43 | data_frame[, ] <- lapply(data_frame[, ], as.character) 44 | #data_frame_samp <- data_frame %>% dplyr::filter(oncogenic == "Oncogenic") #### TESTING 45 | data_frame_samp <- data_frame[grepl("Oncogenic", data_frame$oncogenic),] 46 | colnames(data_frame_samp)[which(names(data_frame_samp) == sample_column)] <- "SAMPLE_ID" 47 | data_frame_samp <- aggregate(oncogenic ~ SAMPLE_ID, data = data_frame_samp, FUN = toString) # no extra args: anything else is forwarded through toString() to paste() and appended to every value; NA rows are already dropped by aggregate's default na.action 48 | colnames(data_frame_samp)[2] <- paste0(alteration_type, "_oncogenic") 49 | return(data_frame_samp) 50 | } 51 | 52 | # Create frequency data frame by group for subgroup: returns one row per (split_group, percentage_group) pair with count n and freq = n / sum(n) within split_group 53 | freq_dataframe <- function(data_frame, split_group, percentage_group){ 54 | # Split group is the column to group by 55 | # Percentage group is the column to calculate the percentage for, by group 56 | df <- data_frame %>% 57 | dplyr::select(percentage_group, split_group) %>% 58 | group_by_(split_group, percentage_group, .drop = F) %>% 59 | dplyr::summarise(n = n()) %>% 60 | dplyr::mutate(freq = n / sum(n)) %>% 61 | ungroup() 62 | return(df) 63 | } 64 | 65 | # Create actionability level barplot 66 | action_levels_barplot_fun <- function(cna_df, mut_df, fus_df, clin_df, data_freeze, 67 | status = c("somatic", "germline", "both"), 68 | group_col, 69 | consent_col, 70 | msi_tmb_status, 71 | msi_tmb_df){ 72 | # Read in data 73 | cna_df <- read.delim(cna_df) 74 | fus_df <- read.delim(fus_df) 75 | mut_df <- read.delim(mut_df) 76 | clin_df <- read.delim(clin_df) 77 | data_freeze <- read.delim(data_freeze) 78 | data_freeze$SAMPLE_ID <- as.character(data_freeze$SAMPLE_ID) 79 | 80 | ###### 81 | 82 | # Optional MSI/TMB addition 83 | if (msi_tmb_status == TRUE){ 84 | msi_tmb_df <- 
read.delim(msi_tmb_df) # read the MSI/TMB file from the msi_tmb_df argument (msi_tmb_path was never defined in this function) 85 | msi_tmb_df <- msi_tmb_df %>% 86 | dplyr::select(SAMPLE_ID) %>% 87 | mutate_if(is.factor, as.character) %>% 88 | mutate(Highest_level = "LEVEL_1_MSI-H_TMB-H") %>% 89 | distinct() 90 | } else { 91 | msi_tmb_df <- data.frame(SAMPLE_ID = character(), Highest_level = character()) 92 | } 93 | 94 | ##### 95 | 96 | # Clean & filter clinical data 97 | # Add group column 98 | clin_df <- clin_df %>% 99 | mutate(SAMPLE_ID = as.character(SAMPLE_ID), 100 | HIGHEST_LEVEL = as.character(HIGHEST_LEVEL)) %>% 101 | filter(SAMPLE_ID %in% data_freeze$SAMPLE_ID) %>% 102 | left_join(data_freeze[,c("SAMPLE_ID", group_col, consent_col)], by = c("SAMPLE_ID")) %>% 103 | mutate(HIGHEST_LEVEL = ifelse(SAMPLE_ID %in% msi_tmb_df$SAMPLE_ID, "LEVEL_1_MSI-H_TMB-H", HIGHEST_LEVEL)) 104 | group_col_dup <- paste0(group_col, ".y") 105 | colnames(clin_df)[which(names(clin_df) == group_col_dup)] <- group_col 106 | colnames(clin_df)[which(names(clin_df) == consent_col)] <- "consent" 107 | 108 | # Clean, filter, rename genomic data 109 | # Fix column names if upper 110 | # Filter for columns of interest 111 | col_list <- c("SAMPLE_ID", "oncogenic", "LEVEL_1", "LEVEL_2", "LEVEL_3A", "LEVEL_3B", "LEVEL_4", "Highest_level") 112 | 113 | # Fusions 114 | fus_df <- fus_df %>% 115 | dplyr::rename_all(recode, 116 | Tumor_Sample_Barcode = "SAMPLE_ID", 117 | HIGHEST_LEVEL = "Highest_level", 118 | ONCOGENIC = "oncogenic") %>% 119 | mutate(SAMPLE_ID = as.character(SAMPLE_ID), 120 | Highest_level = as.character(Highest_level)) %>% 121 | filter(SAMPLE_ID %in% data_freeze$SAMPLE_ID) %>% 122 | mutate(Highest_level = ifelse(SAMPLE_ID %in% msi_tmb_df$SAMPLE_ID, "LEVEL_1_MSI-H_TMB-H", Highest_level)) %>% 123 | dplyr::select(col_list) 124 | 125 | # CNA 126 | cna_df <- cna_df %>% 127 | dplyr::rename_all(recode, 128 | Tumor_Sample_Barcode = "SAMPLE_ID", 129 | HIGHEST_LEVEL = "Highest_level", 130 | ONCOGENIC = "oncogenic") %>% 131 | mutate(SAMPLE_ID = as.character(SAMPLE_ID), 132 | Highest_level = 
as.character(Highest_level)) %>% 133 | filter(SAMPLE_ID %in% data_freeze$SAMPLE_ID) %>% 134 | mutate(Highest_level = ifelse(SAMPLE_ID %in% msi_tmb_df$SAMPLE_ID, "LEVEL_1_MSI-H_TMB-H", Highest_level)) %>% 135 | dplyr::select(col_list) 136 | 137 | # Mutations 138 | mut_df <- mut_df %>% 139 | dplyr::rename_all(recode, 140 | Tumor_Sample_Barcode = "SAMPLE_ID", 141 | HIGHEST_LEVEL = "Highest_level", 142 | ONCOGENIC = "oncogenic") %>% 143 | mutate(SAMPLE_ID = as.character(SAMPLE_ID), 144 | Highest_level = as.character(Highest_level)) %>% 145 | mutate(Highest_level = ifelse(SAMPLE_ID %in% msi_tmb_df$SAMPLE_ID, "LEVEL_1_MSI-H_TMB-H", Highest_level)) %>% 146 | filter(SAMPLE_ID %in% data_freeze$SAMPLE_ID) 147 | 148 | # Filter for status 149 | if (status == "somatic") { 150 | mut_somatic_df <- filter(mut_df, Mutation_Status != "GERMLINE" | is.na(Mutation_Status) == T) 151 | mut_germ_df <- filter(mut_df, Mutation_Status == "GERMLINE") 152 | mut_df <- mut_somatic_df[col_list] 153 | } else if (status == "germline") { 154 | clin_df <- filter(clin_df, consent == "YES") 155 | mut_germ_df <- filter(mut_df, Mutation_Status == "GERMLINE") 156 | mut_germ_df <- mut_germ_df[mut_germ_df$SAMPLE_ID %in% clin_df$SAMPLE_ID,] 157 | mut_df <- mut_germ_df[col_list] 158 | } else { 159 | mut_df <- mut_df[col_list] 160 | clin_germ_df <- filter(clin_df, consent == "YES" | is.na(consent) == T) 161 | } 162 | 163 | # Create master levels data frame for somatic: one row per sample, keeping the first Highest_level in ascending string order 164 | master_df <- rbind(cna_df, fus_df) 165 | master_df <- rbind(master_df, mut_df) 166 | master_df <- master_df %>% 167 | dplyr::select(SAMPLE_ID, Highest_level) %>% 168 | filter(Highest_level != "") %>% 169 | mutate_if(is.factor, as.character) %>% 170 | group_by(SAMPLE_ID) %>% 171 | dplyr::arrange(Highest_level) %>% 172 | dplyr::slice(1) %>% 173 | ungroup() 174 | 175 | # Collapse oncogenic alterations 176 | cna_df <- collapse_oncogenic(cna_df, "SAMPLE_ID", "cna") 177 | fus_df <- collapse_oncogenic(fus_df, "SAMPLE_ID", "fus") 178 | mut_df <- 
collapse_oncogenic(mut_df, "SAMPLE_ID", "mut") 179 | 180 | # Filter if germline 181 | if (status == "germline") { 182 | clin_df <- left_join(clin_df, mut_germ_df[,c("SAMPLE_ID", "Highest_level")]) 183 | clin_df <- clin_df %>% mutate_if(is.factor, as.character) 184 | clin_df$HIGHEST_LEVEL <- ifelse(clin_df$SAMPLE_ID %in% mut_germ_df$SAMPLE_ID, clin_df$Highest_level, "NO_LEVEL") 185 | # Get list of sample with oncogenic alteration 186 | onco_samp_list <- mut_df$SAMPLE_ID 187 | } else if (status == "somatic") { 188 | clin_df <- left_join(clin_df, master_df, by = "SAMPLE_ID") 189 | clin_df <- clin_df %>% mutate_if(is.factor, as.character) 190 | clin_df$HIGHEST_LEVEL <- ifelse(clin_df$SAMPLE_ID %in% mut_germ_df$SAMPLE_ID, 191 | clin_df$Highest_level, clin_df$HIGHEST_LEVEL) 192 | # Merge to make master oncogenic list of samples 193 | all_df <- full_join(cna_df, fus_df, by = "SAMPLE_ID") 194 | all_df <- full_join(all_df, mut_df, by = "SAMPLE_ID") 195 | # Get list of sample with oncogenic alteration 196 | onco_samp_list <- all_df$SAMPLE_ID 197 | } else { 198 | # Merge to make master oncogenic list of samples 199 | all_df <- full_join(cna_df, fus_df, by = "SAMPLE_ID") 200 | all_df <- full_join(all_df, mut_df, by = "SAMPLE_ID") 201 | # Get list of sample with oncogenic alteration 202 | onco_samp_list <- all_df$SAMPLE_ID 203 | } 204 | 205 | # Fill in the highest level blanks: 206 | clin_df$HIGHEST_LEVEL <- as.character(clin_df$HIGHEST_LEVEL) 207 | clin_df$HIGHEST_LEVEL[clin_df$HIGHEST_LEVEL == "" | is.na(clin_df$HIGHEST_LEVEL) == T] <- "NO_LEVEL" 208 | clin_df$HIGHEST_LEVEL[(clin_df$SAMPLE_ID %in% onco_samp_list) & (clin_df$HIGHEST_LEVEL == "NO_LEVEL") ] <- "ONCOGENIC" 209 | 210 | # For highest level of actionability, calculate the percentage of each level by subtype 211 | prop_level_df <- freq_dataframe(clin_df, group_col, "HIGHEST_LEVEL") 212 | 213 | # Set level order 214 | level_order <- c("LEVEL_1_MSI-H_TMB-H","LEVEL_1", "LEVEL_2", "LEVEL_3A", "LEVEL_3B", "LEVEL_4", 
"ONCOGENIC", "NO_LEVEL") 215 | prop_level_df$HIGHEST_LEVEL <- factor(prop_level_df$HIGHEST_LEVEL, levels = level_order) 216 | 217 | # Add counts for labels 218 | # Check the number of oncotree codes and their frequency 219 | data_freeze <- data_freeze[data_freeze$SAMPLE_ID %in% clin_df$SAMPLE_ID,] 220 | clin_oncotree_freq <- as.data.frame(table(data_freeze[,group_col])) 221 | clin_oncotree_freq <- clin_oncotree_freq[order(clin_oncotree_freq$Freq, decreasing = T),] 222 | colnames(clin_oncotree_freq)[1] <- group_col 223 | prop_level_df <- left_join(prop_level_df, clin_oncotree_freq, by = group_col) 224 | 225 | if (status == "both") { 226 | data_freeze_2 <- data_freeze[data_freeze$SAMPLE_ID %in% clin_germ_df$SAMPLE_ID,] 227 | clin_oncotree_freq_germ <- as.data.frame(table(data_freeze_2[,group_col])) 228 | colnames(clin_oncotree_freq_germ)[1] <- group_col 229 | prop_level_df <- left_join(prop_level_df, clin_oncotree_freq_germ, by = group_col) 230 | prop_level_df$label <- apply(prop_level_df[ ,c(group_col, "Freq.x")], 1, paste0, collapse = " n=" ) 231 | prop_level_df$label <- apply(prop_level_df[ ,c("label", "Freq.y")], 1, paste0, collapse = ":" ) 232 | } else { 233 | prop_level_df$label <- apply(prop_level_df[ ,c(group_col, "Freq")], 1, paste0, collapse = " n=" ) 234 | } 235 | 236 | # # Arrange by frequency of actionability 237 | # prop_level_df <- prop_level_df %>% 238 | # arrange(HIGHEST_LEVEL, desc(freq)) 239 | 240 | # Arrange by frequency of combined top 4 levels of actionability 241 | # NOTE(review): the CANCER_TYPE column name is hard-coded below rather than taken from group_col - confirm callers always use group_col = "CANCER_TYPE" 241 | prop_level_df_order <- prop_level_df %>% 242 | filter(HIGHEST_LEVEL %in% c("LEVEL_1_MSI-H_TMB-H", "LEVEL_1", "LEVEL_2", "LEVEL_3A")) %>% 243 | group_by(CANCER_TYPE) %>% 244 | dplyr::mutate(sum_freq = sum(freq)) %>% 245 | right_join(prop_level_df) %>% 246 | dplyr::arrange(desc(sum_freq), HIGHEST_LEVEL, desc(freq)) %>% 247 | dplyr::rename(total_count = Freq) %>% 248 | mutate(CANCER_TYPE = factor(CANCER_TYPE, levels = unique(CANCER_TYPE))) 249 | 250 | # Save 251 | 
write.table(prop_level_df_order, "./actionability_levels_barplot_table.txt", sep = "\t", row.names = F, quote = F) 252 | 253 | # Set orders 254 | cancer_order <- unique(prop_level_df_order$label) 255 | 256 | # Plot breakdown of levels of evidence as a percentage by sarcoma subtype 257 | percent_bar_plot <- ggplot(prop_level_df_order, aes(y = freq, x = label, fill = HIGHEST_LEVEL)) + 258 | geom_col(position = position_stack(reverse = TRUE)) + 259 | theme(axis.text.x = element_text(angle = 45, hjust = 0, size = 6), 260 | axis.text.y = element_text(size = 6), 261 | axis.ticks.x = element_blank(), 262 | panel.border = element_rect(colour = "black", fill=NA, size=1), 263 | panel.background = element_blank(), 264 | panel.grid.major = element_blank(), 265 | panel.grid.minor = element_blank(), 266 | axis.line = element_line(colour = "black"), 267 | legend.title = element_text(size = 7), 268 | legend.text = element_text(size = 6), 269 | legend.key.size = unit(0.4, "cm"), 270 | axis.title.x = element_blank(), 271 | plot.margin = unit(c(0.05, 0.05, 0.1, 0.05), "cm"), 272 | legend.justification="left", 273 | legend.margin=margin(0,0,0,0), 274 | legend.box.margin=margin(-10,0,-10,-5)) + 275 | scale_y_continuous(limits=c(0, 1.00), expand = c(0, 0)) + 276 | scale_fill_manual(values = c("#88E281","#33A02C", "#1F78B4", "#984EA3", "#BE98CE", "#a8a8a8", "#ffdab9", "gray90"), 277 | limits = c("LEVEL_1_MSI-H_TMB-H","LEVEL_1", "LEVEL_2", "LEVEL_3A", "LEVEL_3B", "LEVEL_4", "ONCOGENIC", "NO_LEVEL"), 278 | labels = c("LEVEL 1 MSI/TMB-H","LEVEL 1", "LEVEL 2", "LEVEL 3A", "LEVEL 3B", "LEVEL 4", "ONCOGENIC", "NO LEVEL")) + 279 | scale_x_discrete(position = "top", 280 | limits = cancer_order) + 281 | ylab("Frequency") + 282 | labs(fill = "Highest Level of Evidence") 283 | 284 | return(percent_bar_plot) 285 | 286 | } 287 | 288 | # Create actionability alteration barplot 289 | action_alterations_barplot_fun <- function(cna_df, mut_df, fus_df, clin_df, data_freeze, 290 | status = c("somatic", 
"germline", "both"), 291 | group_col, consent_col, 292 | prop_level_df = "./actionability_levels_barplot_table.txt", 293 | only_highest_level = F, 294 | msi_tmb_status, 295 | msi_tmb_df){ 296 | # Read in data 297 | cna_df <- read.delim(cna_df) 298 | fus_df <- read.delim(fus_df) 299 | mut_df <- read.delim(mut_df) 300 | clin_df <- read.delim(clin_df) 301 | data_freeze <- read.delim(data_freeze) 302 | prop_level_df <- read.delim(prop_level_df) 303 | 304 | # Set order 305 | cancer_order_other <- as.character(unique(prop_level_df[,c(group_col)])) 306 | 307 | 308 | ###### 309 | 310 | # Optional MSI/TMB addition: samples listed in the MSI/TMB file are removed from data_freeze 311 | if (msi_tmb_status == TRUE){ 312 | msi_tmb_df <- read.delim(msi_tmb_df) # read the MSI/TMB file from the msi_tmb_df argument (msi_tmb_path was never defined in this function) 313 | msi_tmb_df <- msi_tmb_df %>% 314 | dplyr::select(SAMPLE_ID) %>% 315 | mutate_if(is.factor, as.character) %>% 316 | mutate(Highest_level = "LEVEL_1_MSI-H_TMB-H") %>% 317 | distinct() 318 | data_freeze <- filter(data_freeze, !SAMPLE_ID %in% msi_tmb_df$SAMPLE_ID) 319 | } 320 | 321 | ##### 322 | 323 | # Clean & filter clinical data 324 | # Add group column 325 | clin_df <- clin_df %>% 326 | mutate(SAMPLE_ID = as.character(SAMPLE_ID)) %>% 327 | filter(SAMPLE_ID %in% data_freeze$SAMPLE_ID) %>% 328 | left_join(data_freeze[,c("SAMPLE_ID", group_col, consent_col)], by = c("SAMPLE_ID")) 329 | group_col_dup <- paste0(group_col, ".y") 330 | colnames(clin_df)[which(names(clin_df) == group_col_dup)] <- group_col 331 | colnames(clin_df)[which(names(clin_df) == consent_col)] <- "consent" 332 | 333 | # Clean, filter, rename genomic data 334 | # Fix column names if upper 335 | fus_df <- fus_df %>% 336 | dplyr::rename_all(recode, 337 | Tumor_Sample_Barcode = "SAMPLE_ID", 338 | HIGHEST_LEVEL = "Highest_level", 339 | ONCOGENIC = "oncogenic") %>% 340 | mutate(SAMPLE_ID = as.character(SAMPLE_ID)) %>% 341 | filter(SAMPLE_ID %in% data_freeze$SAMPLE_ID) %>% 342 | mutate(Fusion = gsub(" fusion", "", Fusion)) %>% 343 | mutate(Fusion = gsub(" - Archer", "", Fusion)) %>% 344 | dplyr::select(SAMPLE_ID, 
oncogenic, Highest_level, Fusion) %>% 345 | rowwise() %>% 346 | mutate(Fusion = ifelse(grepl("intragenic", Fusion), Fusion, 347 | paste(sort(unlist(strsplit(Fusion, "-", fixed = TRUE))), collapse = "-"))) %>% 348 | ungroup() %>% 349 | distinct() %>% 350 | dplyr::select(-Fusion) %>% 351 | mutate(ALTERATION = "Fusion") %>% 352 | filter(grepl("Oncogenic", oncogenic) == T, is.na(Highest_level) == F & Highest_level != "") %>% 353 | dplyr::select(SAMPLE_ID, ALTERATION, oncogenic, Highest_level) 354 | 355 | # CNA 356 | cna_df <- cna_df %>% 357 | dplyr::rename_all(recode, 358 | Tumor_Sample_Barcode = "SAMPLE_ID", 359 | HIGHEST_LEVEL = "Highest_level", 360 | ONCOGENIC = "oncogenic") %>% 361 | mutate(SAMPLE_ID = as.character(SAMPLE_ID)) %>% 362 | filter(SAMPLE_ID %in% data_freeze$SAMPLE_ID) %>% 363 | dplyr::select(SAMPLE_ID, ALTERATION, oncogenic, Highest_level) %>% 364 | filter(grepl("Oncogenic", oncogenic) == T, is.na(Highest_level) == F & Highest_level != "") %>% 365 | dplyr::select(SAMPLE_ID, ALTERATION, oncogenic, Highest_level) 366 | 367 | # Mutations 368 | # Filter for status 369 | if (status == "somatic") { 370 | mut_df <- filter(mut_df, Mutation_Status != "GERMLINE" | is.na(Mutation_Status) == T) 371 | } else if (status == "germline") { 372 | clin_df <- filter(clin_df, consent == "YES") 373 | mut_df <- filter(mut_df, Mutation_Status == "GERMLINE") 374 | mut_df <- mut_df[mut_df$SAMPLE_ID %in% clin_df$SAMPLE_ID,] 375 | } 376 | # Clean & Filter 377 | mut_df <- mut_df %>% 378 | dplyr::rename_all(recode, 379 | Tumor_Sample_Barcode = "SAMPLE_ID", 380 | HIGHEST_LEVEL = "Highest_level", 381 | ONCOGENIC = "oncogenic") %>% 382 | mutate(SAMPLE_ID = as.character(SAMPLE_ID)) %>% 383 | filter(SAMPLE_ID %in% data_freeze$SAMPLE_ID) %>% 384 | dplyr::select(SAMPLE_ID, oncogenic, Highest_level) %>% 385 | filter(grepl("Oncogenic", oncogenic) == T, is.na(Highest_level) == F & Highest_level != "") %>% 386 | mutate(ALTERATION = "Mutation") %>% 387 | dplyr::select(SAMPLE_ID, ALTERATION, 
oncogenic, Highest_level) 388 | 389 | 390 | # rbind to create master alterations data frame 391 | # Filter for status for mutation data frame 392 | if (status == "somatic" | status == "both") { 393 | alt_final <- rbind(cna_df, fus_df) 394 | alt_final <- rbind(alt_final, mut_df) 395 | } else if (status == "germline") { 396 | alt_final <- mut_df 397 | } 398 | alt_final <- left_join(alt_final, data_freeze[,c("SAMPLE_ID", group_col)], by = "SAMPLE_ID") 399 | 400 | # Save 401 | write.table(alt_final, "actionability_master_alterations_table.txt", sep = "\t", row.names = F, quote = F) 402 | 403 | ########## optional select only the highest level ########## 404 | 405 | if (only_highest_level == T){ 406 | alt_final <- alt_final %>% 407 | left_join(dplyr::select(clin_df, SAMPLE_ID, HIGHEST_LEVEL), by = "SAMPLE_ID") %>% 408 | mutate_if(is.factor, as.character) %>% 409 | filter(HIGHEST_LEVEL == Highest_level) 410 | } 411 | 412 | ########### 413 | 414 | # Save 415 | write.table(alt_final, "actionability_master_alterations_highest_level_table.txt", sep = "\t", row.names = F, quote = F) 416 | 417 | # Calculate the percentage of each alteration by subtype 418 | prop_alteration_df <- as.data.frame(freq_dataframe(alt_final, group_col, "ALTERATION")) 419 | prop_alteration_df$freq[is.na(prop_alteration_df$freq)] <- 0 420 | prop_alteration_df$ALTERATION <- factor(prop_alteration_df$ALTERATION, 421 | levels = c("Amplification", "Deletion", "Fusion", "Mutation")) 422 | prop_alteration_df$group <- factor(prop_alteration_df[,group_col], 423 | levels = cancer_order_other) 424 | 425 | # Save 426 | write.table(prop_alteration_df, "actionability_alterations_barplot_table.txt", sep = "\t", row.names = F, quote = F) 427 | 428 | # Plot for ACTIONABLE ALTERATIONS 429 | alt_freq_bar_plot <- ggplot(prop_alteration_df, aes(y = freq, x = group, fill = ALTERATION)) + 430 | geom_col(position = position_stack(reverse = FALSE)) + 431 | ylab("Frequency") + 432 | labs(fill = "Actionable Alteration") + 433 | 
theme(axis.text.x = element_blank(), 434 | axis.ticks.x = element_blank(), 435 | axis.text.y = element_text(size = 6), 436 | panel.border = element_rect(colour = "black", fill=NA, size=1), 437 | panel.background = element_blank(), 438 | panel.grid.major = element_blank(), 439 | panel.grid.minor = element_blank(), 440 | axis.line = element_line(colour = "black"), 441 | legend.title = element_text(size = 8), 442 | legend.text = element_text(size = 6), 443 | legend.key.size = unit(0.4, "cm"), 444 | axis.title.x = element_blank(), 445 | plot.margin = unit(c(0.05, 0.05, 0.05, 0.05), "cm"), 446 | legend.justification="left", 447 | legend.margin=margin(0,0,0,0), 448 | legend.box.margin=margin(-10,0,-10,-5)) + 449 | scale_y_continuous(limits=c(0, 1.00), expand = c(0, 0)) + 450 | ylab("Frequency") + 451 | scale_fill_manual(values = c("#A11111", "#02488E", "#660066", "#037903"), 452 | limits = c("Amplification", "Deletion", "Fusion", "Mutation"), 453 | labels = c("Amplification", "Deletion", "Fusion", "Mutation")) + 454 | scale_x_discrete(limits = cancer_order_other) 455 | alt_freq_bar_plot 456 | 457 | return(alt_freq_bar_plot) 458 | 459 | } 460 | 461 | # Create actionability count barplot 462 | action_count_barplot_fun <- function(clin_df, data_freeze, group_col, 463 | prop_level_df = "./actionability_levels_barplot_table.txt", 464 | status = c("somatic", "germline", "both"), 465 | consent_col, 466 | alt_final_df = "./actionability_master_alterations_table.txt", 467 | msi_tmb_status, 468 | msi_tmb_df){ 469 | 470 | # Read in files 471 | prop_level_df <- read.delim(prop_level_df) 472 | alt_final <- read.delim(alt_final_df) 473 | clin_df <- read.delim(clin_df) 474 | data_freeze <- read.delim(data_freeze) 475 | 476 | # Filter for samples in data freeze and clean consent column 477 | data_freeze$SAMPLE_ID <- as.character(data_freeze$SAMPLE_ID) 478 | 479 | ###### 480 | 481 | # Optional MSI/TMB addition 482 | if (msi_tmb_status == TRUE){ 483 | msi_tmb_df <- 
read.delim(msi_tmb_path) 484 | msi_tmb_df <- msi_tmb_df %>% 485 | dplyr::select(SAMPLE_ID) %>% 486 | mutate_if(is.factor, as.character) %>% 487 | mutate(Highest_level = "LEVEL_1_MSI-H_TMB-H") %>% 488 | distinct() 489 | data_freeze <- filter(data_freeze, !SAMPLE_ID %in% msi_tmb_df$SAMPLE_ID) 490 | } 491 | 492 | ##### 493 | 494 | # Clean 495 | clin_df <- clin_df[as.character(clin_df$SAMPLE_ID) %in% data_freeze$SAMPLE_ID,] 496 | colnames(data_freeze)[which(names(data_freeze) == consent_col)] <- "consent" 497 | 498 | # Set order 499 | cancer_order_other <- as.character(unique(prop_level_df[,c(group_col)])) 500 | 501 | # Create data frame that counts the number of actionable oncogenic alterations 502 | alt_final$alt_count <- 1 503 | alt_final <- dplyr::select(alt_final, SAMPLE_ID, alt_count) 504 | 505 | # Filter for status 506 | if (status == "germline") { 507 | clin_df <- clin_df[clin_df$SAMPLE_ID %in% as.character(filter(data_freeze, consent == "YES")$SAMPLE_ID),] 508 | } 509 | 510 | # Add in samples that don't have an actionable alteration 511 | alt_final_none <- as.data.frame(clin_df[,c("SAMPLE_ID")]) 512 | colnames(alt_final_none)[1] <- "SAMPLE_ID" 513 | alt_final_none$alt_count <- 0 514 | alt_final <- rbind(alt_final, alt_final_none) 515 | alt_final <- aggregate(alt_count ~ SAMPLE_ID, alt_final, sum) 516 | 517 | # Add cancer subtypes to clinical data frame and create labels 518 | alt_final <- left_join(alt_final, data_freeze[,c("SAMPLE_ID", group_col)], by = "SAMPLE_ID") 519 | alt_final$label <- ifelse(alt_final$alt_count >= 3, "3+", alt_final$alt_count) 520 | 521 | # Calculate the percentage of each count by subtype 522 | prop_alt_count_df <- as.data.frame(freq_dataframe(alt_final, group_col, "label")) 523 | prop_alt_count_df$freq[is.na(prop_alt_count_df$freq)] <- 0 524 | 525 | # Set order 526 | prop_alt_count_df$label <- factor(prop_alt_count_df$label, levels = c("0", "1", "2", "3+")) 527 | prop_alt_count_df$group <- factor(prop_alt_count_df[,group_col], 528 | 
# Helper for action_main_fun: harmonise annotator column names
# (Tumor_Sample_Barcode / HIGHEST_LEVEL / ONCOGENIC), coerce the ID and level
# columns to character, and keep only rows for samples in `keep_ids` whose
# oncogenic annotation contains "Oncogenic".
.action_main_clean_annotations <- function(df, keep_ids) {
  df %>%
    dplyr::rename_all(recode,
                      Tumor_Sample_Barcode = "SAMPLE_ID",
                      HIGHEST_LEVEL = "Highest_level",
                      ONCOGENIC = "oncogenic") %>%
    mutate(SAMPLE_ID = as.character(SAMPLE_ID),
           Highest_level = as.character(Highest_level)) %>%
    filter(SAMPLE_ID %in% keep_ids,
           grepl("Oncogenic", oncogenic))
}

# Create actionability alterations main plot
#
# Builds the gene-by-group tile plot: each tile is coloured by the level of
# evidence of the alteration and labelled with the percentage of samples in
# the group carrying it.
#
# Arguments (all file arguments are paths read with read.delim):
#   cna_df, mut_df, fus_df  Paths to the annotated CNA, mutation and fusion tables.
#   clin_df                 Path to the annotated clinical table.
#   data_freeze             Path to the data-freeze table (SAMPLE_ID,
#                           `group_col`, `consent_col`).
#   path_df                 Optional path to a two-column gene/pathway table.
#                           When missing, an ordering is derived from the data.
#   tsg_list                Path to a header-less list of tumor suppressor genes.
#   fusion_list             Optional path to a header-less list of fusion
#                           partners to collapse fusions onto.
#   prop_level_df           Path to the levels barplot table (sets x-axis order).
#   group_col, consent_col  Column names in `data_freeze`.
#   alt_min                 Genes whose maximum percentage across groups is
#                           below this value are dropped.
#   status                  One of "somatic", "germline", "both".
#   gene_order              Optional path to a header-less gene order list.
#   only_highest_level      TRUE to keep only alterations matching the sample's
#                           HIGHEST_LEVEL in the clinical table.
#   msi_tmb_status          TRUE to drop samples listed in the MSI/TMB table.
#   msi_tmb_df              Path to the MSI/TMB table (read only when
#                           msi_tmb_status is TRUE).
#   include_oncogenic       TRUE to add an "ONCOGENIC" pseudo-level.
#
# Side effects: writes "actionability_main_plot_data.txt" to the working directory.
# Returns: the ggplot object.
action_main_fun <- function(cna_df, mut_df, fus_df, clin_df, data_freeze,
                            path_df,
                            tsg_list, fusion_list,
                            prop_level_df = "./actionability_levels_barplot_table.txt",
                            group_col,
                            consent_col,
                            alt_min = 1,
                            status = c("somatic", "germline", "both"),
                            gene_order,
                            only_highest_level = F,
                            msi_tmb_status,
                            msi_tmb_df,
                            include_oncogenic = F){

  # Collapse the default choice vector to a single validated value; without
  # this every `if (status == ...)` below receives a length-3 condition.
  status <- match.arg(status)

  # Read in data
  cna_df <- read.delim(cna_df)
  fus_df <- read.delim(fus_df)
  mut_df <- read.delim(mut_df)
  clin_df <- read.delim(clin_df)
  data_freeze <- read.delim(data_freeze)
  tsg_df <- read.delim(tsg_list, header = FALSE)
  prop_level_df <- read.delim(prop_level_df)

  # Set order
  cancer_order_other <- as.character(unique(prop_level_df[,c(group_col)]))

  # Clean and filter data
  # Data freeze: give the grouping and consent columns fixed names
  colnames(data_freeze)[which(names(data_freeze) == group_col)] <- "cancer_type"
  colnames(data_freeze)[which(names(data_freeze) == consent_col)] <- "consent"
  data_freeze <- data_freeze %>%
    mutate_if(is.factor, as.character)

  # Optional MSI/TMB addition: samples listed in the MSI/TMB table are removed
  # from the data freeze
  if (msi_tmb_status == TRUE){
    # Fixed: the path is supplied through the `msi_tmb_df` argument; the
    # previous code read an undefined `msi_tmb_path` variable.
    msi_tmb_df <- read.delim(msi_tmb_df)
    msi_tmb_df <- msi_tmb_df %>%
      dplyr::select(SAMPLE_ID) %>%
      mutate_if(is.factor, as.character) %>%
      mutate(Highest_level = "LEVEL_1_MSI-H_TMB-H") %>%
      distinct()
    data_freeze <- filter(data_freeze, !SAMPLE_ID %in% msi_tmb_df$SAMPLE_ID)
  }

  # Clinical
  clin_df <- clin_df %>%
    filter(SAMPLE_ID %in% data_freeze$SAMPLE_ID)

  # CNA, fusions and mutations share the same column clean-up and filter
  cna_df <- .action_main_clean_annotations(cna_df, data_freeze$SAMPLE_ID)
  fus_df <- .action_main_clean_annotations(fus_df, data_freeze$SAMPLE_ID)
  mut_df <- .action_main_clean_annotations(mut_df, data_freeze$SAMPLE_ID)

  # Set tumor suppressor list
  tumor_suppressor_list <- as.character(tsg_df$V1)

  # Make count data frame (samples per group) - consider somatic/germline/both
  if (status == "germline") {
    data_freeze <- filter(data_freeze, consent == "YES")
    clin_oncotree_freq <- as.data.frame(table(data_freeze$cancer_type))
    colnames(clin_oncotree_freq)[] <- c("cancer_type", "total_count")
    mut_df <- mut_df[mut_df$SAMPLE_ID %in% data_freeze$SAMPLE_ID,]
  } else if (status == "somatic") {
    clin_oncotree_freq <- as.data.frame(table(data_freeze$cancer_type))
    colnames(clin_oncotree_freq)[] <- c("cancer_type", "total_count")
  } else {
    # "both": keep the overall count and the count of consented samples
    clin_oncotree_freq <- as.data.frame(table(data_freeze$cancer_type))
    data_freeze_1 <- filter(data_freeze, consent == "YES")
    data_freeze_1$SAMPLE_ID <- as.character(data_freeze_1$SAMPLE_ID)
    clin_oncotree_freq_1 <- as.data.frame(table(data_freeze_1$cancer_type))
    clin_oncotree_freq <- left_join(clin_oncotree_freq, clin_oncotree_freq_1, by = "Var1")
    colnames(clin_oncotree_freq)[] <- c("cancer_type", "total_count", "germ_count")
    # Remove samples that have germline alterations but ARE NOT Part C consented
    remove_list <- intersect(filter(mut_df, Mutation_Status == "GERMLINE")$SAMPLE_ID,
                             filter(data_freeze, consent == "NO")$SAMPLE_ID)
    mut_df <- mut_df[!(mut_df$SAMPLE_ID %in% remove_list),]
  }


  # Create CNA data frame, flag tumor suppressors
  cna_df <- cna_df %>%
    inner_join(dplyr::select(data_freeze, SAMPLE_ID, cancer_type), by = "SAMPLE_ID") %>%
    dplyr::select(SAMPLE_ID, HUGO_SYMBOL, ALTERATION, LEVEL_1, LEVEL_2, LEVEL_3A,
                  LEVEL_3B, LEVEL_4, Highest_level, oncogenic, cancer_type) %>%
    distinct() %>%
    filter(!is.na(Highest_level)) %>%
    # Shorten the alteration to its first three characters (e.g. "Amp", "Del")
    mutate(ALTERATION = substring(ALTERATION, 1, 3)) %>%
    dplyr::select(SAMPLE_ID, HUGO_SYMBOL, ALTERATION, Highest_level, oncogenic, cancer_type) %>%
    dplyr::rename(sample_id = SAMPLE_ID,
                  gene_symbol = HUGO_SYMBOL,
                  alteration = ALTERATION,
                  highest_level = Highest_level) %>%
    mutate(onco_type = ifelse(gene_symbol %in% tumor_suppressor_list, "tumor_suppresor", NA))

  # Create fusion data frame
  # Combine fusions where the hugo gene symbol is counted twice (impact and archer):
  # partner names are sorted so "A-B" and "B-A" collapse to the same label
  fus_df <- fus_df %>%
    inner_join(dplyr::select(data_freeze, SAMPLE_ID, cancer_type), by = "SAMPLE_ID") %>%
    dplyr::select(SAMPLE_ID, Hugo_Symbol, Fusion, LEVEL_1, LEVEL_2, LEVEL_3A,
                  LEVEL_3B, LEVEL_4, Highest_level, oncogenic, cancer_type) %>%
    mutate_if(is.factor, as.character) %>%
    mutate(Fusion = gsub(" fusion", "", Fusion)) %>%
    mutate(Fusion = gsub(" - Archer", "", Fusion)) %>%
    rowwise() %>%
    mutate(Fusion = ifelse(grepl("intragenic", Fusion), Fusion,
                           paste(sort(unlist(strsplit(Fusion, "-", fixed = TRUE))), collapse = "-"))) %>%
    ungroup() %>%
    distinct()

  # If fusion list is provided, select the gene partner of interest based on the list
  if (!missing(fusion_list)) {
    # Read in fusion list
    fusion_list <- read.delim(fusion_list, header = FALSE)
    fusion_list <- as.character(fusion_list$V1)
    fusion_list_collapse <- paste0("\\b", paste(fusion_list , collapse="\\b|\\b"), "\\b")
    # Rows whose Hugo_Symbol is in the list are relabelled with that gene;
    # rows whose fusion name mentions a listed gene but whose Hugo_Symbol is
    # the other partner are dropped; everything else keeps its full name
    fus_df <- fus_df %>%
      mutate_if(is.factor, as.character) %>%
      mutate(Fusion = ifelse(Hugo_Symbol %in% fusion_list, Hugo_Symbol,
                             ifelse(!grepl(fusion_list_collapse, Fusion), Fusion, "REMOVE"))) %>%
      filter(Fusion != "REMOVE") %>%
      mutate(Fusion = gsub("-intragenic", "", Fusion))
  }

  # Clean, add tumor suppressor columns
  fus_df <- fus_df %>%
    filter(Highest_level != "") %>%
    mutate(Alteration = "Fus") %>%
    dplyr::select(SAMPLE_ID, Fusion, Alteration, Highest_level, oncogenic, cancer_type) %>%
    dplyr::rename(sample_id = SAMPLE_ID,
                  gene_symbol = Fusion,
                  alteration = Alteration,
                  highest_level = Highest_level) %>%
    mutate(onco_type = ifelse(gene_symbol %in% tumor_suppressor_list, "tumor_suppresor", NA)) %>%
    distinct()

  # Collapse NTRK fusions
  # Other fusions can be added to this list moving forward
  fus_df <- fus_df %>%
    mutate_if(is.factor, as.character) %>%
    mutate(gene_symbol = ifelse(gene_symbol %in% c("NTRK1", "NTRK2", "NTRK3"), "NTRK1/2/3", gene_symbol)) %>%
    distinct()

  # Filter for mutation status
  if (status == "somatic") {
    mut_df <- filter(mut_df, Mutation_Status != "GERMLINE" | is.na(Mutation_Status))
  } else if (status == "germline") {
    mut_df <- filter(mut_df, Mutation_Status == "GERMLINE")
  }

  # Mutation
  mut_df <- mut_df %>%
    inner_join(dplyr::select(data_freeze, SAMPLE_ID, cancer_type), by = "SAMPLE_ID") %>%
    dplyr::select(SAMPLE_ID, Hugo_Symbol, Variant_Type, LEVEL_1, LEVEL_2, LEVEL_3A,
                  LEVEL_3B, LEVEL_4, Highest_level, oncogenic, cancer_type, HGVSp_Short, Mutation_Status)
  # Add in oncogenic here if included: the extra column becomes one more
  # "level" after melting
  if (include_oncogenic == TRUE) {
    mut_df <- mut_df %>%
      mutate(ONCOGENIC = "ONCOGENIC")
  }
  # Long format: one row per (mutation, level column)
  mut_df <- melt(mut_df, id.vars = c("SAMPLE_ID", "Hugo_Symbol", "Variant_Type", "Highest_level",
                                     "oncogenic", "cancer_type", "HGVSp_Short", "Mutation_Status"))

  # Aggregate by everything but strip for the highest level
  # This is just in case there is a gene alteration that has more than one level
  # Add tumor suppressor column
  mut_df <- mut_df %>%
    mutate_if(is.factor, as.character) %>%
    filter(value != "") %>%
    dplyr::select(-value, -Highest_level) %>%
    dplyr::rename(highest_level = variable) %>%
    filter(!is.na(highest_level)) %>%
    # Fixed: `=` (assignment) instead of `==`, which created a stray logical
    # column instead of coercing highest_level
    mutate(highest_level = as.character(highest_level),
           Mutation_Status = ifelse(is.na(Mutation_Status), "", Mutation_Status)) %>%
    group_by(SAMPLE_ID, Hugo_Symbol, Variant_Type, oncogenic, cancer_type, HGVSp_Short, Mutation_Status) %>%
    dplyr::summarise(highest_level = toString(highest_level)) %>%
    ungroup() %>%
    # Keep only the first level of the comma-separated list
    mutate(highest_level = gsub(",.*", "", highest_level),
           alteration = "Mut") %>%
    mutate(onco_type = ifelse(Hugo_Symbol %in% tumor_suppressor_list, "tumor_suppresor", NA)) %>%
    ###
    ### work in progress: split BRAF into V600E and other
    mutate(Hugo_Symbol = ifelse(Hugo_Symbol == "BRAF" & HGVSp_Short == "p.V600E", "BRAF_V600E", Hugo_Symbol)) %>%
    mutate(Hugo_Symbol = ifelse(Hugo_Symbol == "BRAF" & HGVSp_Short != "p.V600E", "BRAF_Other", Hugo_Symbol)) %>%
    ###
    ###
    distinct() %>%
    # Qualified like the other calls in this file so a later-attached
    # package (e.g. plyr) cannot mask it
    dplyr::rename(gene_symbol = Hugo_Symbol) %>%
    mutate(gene_symbol = as.character(gene_symbol)) %>%
    dplyr::select(SAMPLE_ID, gene_symbol, alteration, highest_level, oncogenic, cancer_type, Mutation_Status, onco_type) %>%
    dplyr::rename(sample_id = SAMPLE_ID) %>%
    # Germline mutations get their own onco_type and a "*" suffix on the gene
    mutate(onco_type = ifelse(Mutation_Status == "GERMLINE", "germline", onco_type),
           gene_symbol = ifelse(Mutation_Status == "GERMLINE", paste0(gene_symbol, "*"), gene_symbol)) %>%
    dplyr::select(-Mutation_Status) %>%
    group_by(sample_id, gene_symbol, alteration, highest_level, oncogenic, cancer_type, onco_type) %>%
    dplyr::slice(1) %>%
    ungroup()

  # Filter for status
  # Combine CNA, FUS, and MUT - create final df
  if (status == "somatic" | status == "both") {
    gene_final_df <- rbind(cna_df, fus_df)
    gene_final_df <- rbind(gene_final_df, mut_df)
  } else if (status == "germline") {
    gene_final_df <- mut_df
  }

  # Optional include oncogenic alterations in plot
  if (include_oncogenic == TRUE){
    gene_final_df <- gene_final_df %>%
      mutate(highest_level = ifelse((is.na(highest_level) | highest_level == "") &
                                      grepl("Oncogenic", oncogenic), "ONCOGENIC", highest_level))
  }

  # Combine all tumor suppressor alterations (del, mut, fus)
  # If the alteration is on a tumor suppressor, ignore alteration label
  # Clean up gene symbol, remove everything after the comma
  # Remove mutation label to clean up y axis
  gene_final_df <- gene_final_df %>%
    filter(!is.na(highest_level) & highest_level != "") %>%
    mutate_if(is.factor, as.character) %>%
    mutate(onco_type = ifelse(is.na(onco_type), "oncogene", onco_type)) %>%
    group_by(sample_id, gene_symbol, highest_level, cancer_type, onco_type) %>%
    dplyr::summarise(alteration = toString(alteration)) %>%
    ungroup() %>%
    mutate(alteration = as.character(alteration)) %>%
    mutate(alteration = ifelse(onco_type == "tumor_suppresor", "Del", alteration)) %>%
    mutate(alteration = gsub(",.*", "", alteration)) %>%
    mutate(gene_symbol_label = gsub(" Mut", "", paste0(gene_symbol, " ", alteration)))

  # Optional select only the highest level
  if (only_highest_level == TRUE){
    colnames(clin_df)[which(names(clin_df) == "SAMPLE_ID")] <- "sample_id"
    gene_final_df <- gene_final_df %>%
      left_join(dplyr::select(clin_df, sample_id, HIGHEST_LEVEL), by = "sample_id") %>%
      mutate_if(is.factor, as.character) %>%
      filter(HIGHEST_LEVEL == highest_level)
  }

  # Manual alterations
  ###
  ### work in progress: merge paralog pairs into one row
  gene_final_df <- gene_final_df %>%
    mutate_if(is.factor, as.character) %>%
    mutate(gene_symbol = ifelse(gene_symbol %in% c("BRCA1", "BRCA2"), "BRCA1/2", gene_symbol),
           gene_symbol_label = ifelse(gene_symbol == "BRCA1/2", "BRCA1/2 Del", gene_symbol_label)) %>%
    mutate(gene_symbol = ifelse(gene_symbol %in% c("CHEK1", "CHEK2"), "CHEK1/2", gene_symbol),
           gene_symbol_label = ifelse(gene_symbol == "CHEK1/2", "CHEK1/2 Del", gene_symbol_label)) %>%
    mutate(gene_symbol = ifelse(gene_symbol %in% c("TSC1", "TSC2"), "TSC1/2", gene_symbol),
           gene_symbol_label = ifelse(gene_symbol == "TSC1/2", "TSC1/2 Del", gene_symbol_label)) %>%
    distinct()
  ###
  ###

  # Calculate the percentage of each count by subtype
  # Only select the highest level (first row after sorting by level name)
  prop_main_plot_df <- gene_final_df %>%
    group_by(cancer_type, gene_symbol_label, highest_level) %>%
    dplyr::summarise(n = n()) %>%
    ungroup() %>%
    left_join(clin_oncotree_freq, by = "cancer_type") %>%
    dplyr::mutate(freq = n /total_count) %>%
    group_by(cancer_type, gene_symbol_label) %>%
    arrange(highest_level) %>%
    dplyr::slice(1) %>%
    ungroup() %>%
    # Percentages between 0 and 1 get a blank label instead of "0"
    dplyr::mutate(percentage = 100*freq,
                  label_text = round(percentage, 0),
                  label_text = ifelse(percentage > 0 & percentage < 1, " ", label_text))

  # Optional add pathway if provided, if not use it to set gene list
  if (missing(path_df)) {
    # Derive an ordering: by level, then number of groups, then percentage
    path_df <- gene_final_df %>%
      left_join(prop_main_plot_df) %>%
      dplyr::select(gene_symbol_label, highest_level, cancer_type, percentage) %>%
      distinct() %>%
      group_by(gene_symbol_label, highest_level) %>%
      mutate(count = n()) %>%
      ungroup() %>%
      arrange(highest_level, desc(count), desc(percentage), gene_symbol_label) %>%
      group_by(gene_symbol_label) %>%
      dplyr::slice(1) %>%
      ungroup() %>%
      arrange(highest_level, desc(count), desc(percentage), gene_symbol_label) %>%
      mutate(pathway = row_number()) %>%
      dplyr::select(gene_symbol_label, pathway)
    gene_final_df <- gene_final_df %>% left_join(path_df)
  } else {
    path_df <- read.delim(path_df)
    colnames(path_df)[] <- c("gene_symbol", "pathway")
    gene_final_df <- gene_final_df %>% left_join(path_df)
  }

  # Germline rows (label contains "*") use the consented-sample denominator
  # when the figure includes both somatic and germline
  # NOTE(review): only `freq` is recomputed here; `percentage` and
  # `label_text` (what the tiles display) still use total_count - confirm
  # whether that is intended.
  if (status == "both") {
    prop_main_plot_df$freq <- ifelse(grepl("\\*",prop_main_plot_df$gene_symbol_label) == TRUE,
                                     prop_main_plot_df$n/prop_main_plot_df$germ_count,
                                     prop_main_plot_df$freq)
  }

  # Add pathways
  prop_main_plot_df <- prop_main_plot_df %>%
    left_join(dplyr::select(gene_final_df, gene_symbol_label, gene_symbol), by = "gene_symbol_label") %>%
    left_join(dplyr::select(gene_final_df, gene_symbol, cancer_type, highest_level, pathway, onco_type),
              by = c("gene_symbol", "cancer_type", "highest_level")) %>%
    group_by(gene_symbol, gene_symbol_label, cancer_type, pathway, onco_type, n, total_count, percentage, freq, label_text) %>%
    dplyr::summarise(highest_level = toString(highest_level)) %>%
    ungroup() %>%
    mutate(highest_level_label = gsub(",.*", "", highest_level)) %>%
    dplyr::select(-highest_level) %>%
    dplyr::arrange(pathway, gene_symbol, highest_level_label, cancer_type)

  # Only keep rows where at least one subtype meets the percentage threshold (alt_min)
  prop_main_plot_df_filter <- prop_main_plot_df %>%
    dplyr::select(gene_symbol_label, percentage) %>%
    group_by(gene_symbol_label) %>%
    filter(percentage == max(percentage)) %>%
    filter(percentage < alt_min)
  prop_main_plot_df <- prop_main_plot_df %>%
    filter(!gene_symbol_label %in%prop_main_plot_df_filter$gene_symbol_label) %>%
    mutate(cancer_type = factor(cancer_type, levels = cancer_order_other))

  # Set gene order manually
  if (!missing(gene_order)) {
    gene_order <- read.delim(gene_order, header = FALSE)
    # Duplicate every gene so the second copy can carry the germline "*" suffix
    gene_order <- as.data.frame(gene_order[rep(seq_len(nrow(gene_order)), each = 2), ])
    colnames(gene_order)[] <- c("gene_symbol")
    gene_order <- gene_order %>%
      mutate_if(is.factor, as.character) %>%
      mutate(order = seq(1:nrow(gene_order)),
             gene_symbol = ifelse(order %% 2 == 0, paste0(gene_symbol, "*"), gene_symbol))
    prop_main_plot_df <- prop_main_plot_df %>%
      left_join(gene_order, by = "gene_symbol") %>%
      dplyr::arrange(order)
  }

  # Get text color order
  # (currently only referenced by the commented-out `colour =` option on
  # axis.text.y in the plot below)
  text_tsg_col <- prop_main_plot_df %>%
    dplyr::select(gene_symbol_label, onco_type) %>%
    distinct() %>%
    dplyr::arrange(onco_type) %>%
    group_by(gene_symbol_label) %>%
    dplyr::summarise(onco_type = toString(onco_type)) %>%
    ungroup() %>%
    mutate(col = ifelse(onco_type != "tumor_suppresor", ifelse(onco_type == "oncogene", "#7E1116", "#4F0043"), "#191A57")) %>%
    group_by(gene_symbol_label, col) %>%
    dplyr::slice(1) %>%
    ungroup() %>%
    mutate(gene_symbol_label = factor(gene_symbol_label, levels = unique(prop_main_plot_df$gene_symbol_label))) %>%
    dplyr::arrange(gene_symbol_label)

  # Write out data frame
  write.table(prop_main_plot_df, "actionability_main_plot_data.txt", sep = "\t", quote = F, row.names = F)

  # Create main plot
  action_tile_plot_all <- ggplot(data = prop_main_plot_df, aes(x = cancer_type, y = gene_symbol_label)) +
    geom_tile(aes(fill = highest_level_label)) +
    geom_text(aes(label = label_text), colour = "white", size = 2) +
    theme(panel.grid.major = element_blank(),
          axis.text.x = element_blank(),
          axis.text.y = element_text(size = 6), # colour = rev(text_tsg_col$col)),
          panel.background = element_blank(),
          panel.border = element_rect(colour = "black", fill=NA, size=0.5),
          axis.title.x = element_blank(),
          axis.ticks.x = element_blank(),
          axis.title.y = element_text(size = 8),
          plot.margin = unit(c(0.05, 0.05, 0.05, 0.05), "cm"),
          legend.title = element_text(size = 8),
          legend.text = element_text(size = 6),
          legend.justification="left",
          legend.margin=margin(0,0,0,0),
          legend.box.margin=margin(-10,0,-10,-5)) +
    # Grid lines between tiles
    geom_vline(xintercept=seq(1.5, length(levels(prop_main_plot_df$cancer_type))-0.5, 1),
               lwd=0.25, colour="gray80") +
    geom_hline(yintercept=seq(1.5, length(unique(prop_main_plot_df$gene_symbol_label))-0.5, 1),
               lwd=0.25, colour="gray80") +
    scale_fill_manual(values = c("#88E281","#33A02C", "#1F78B4", "#984EA3", "#BE98CE", "#a8a8a8", "#ffdab9", "gray90"),
                      limits = c("LEVEL_1_MSI-H_TMB-H","LEVEL_1", "LEVEL_2", "LEVEL_3A", "LEVEL_3B", "LEVEL_4", "ONCOGENIC", "NO_LEVEL"),
                      labels = c("LEVEL 1 MSI/TMB-H","LEVEL 1", "LEVEL 2", "LEVEL 3A", "LEVEL 3B", "LEVEL 4", "ONCOGENIC", "NO LEVEL")) +
    scale_y_discrete(limits = rev(unique(prop_main_plot_df$gene_symbol_label)),
                     labels = gsub("_", " ", rev(unique(prop_main_plot_df$gene_symbol_label))),
                     expand = c(0,0)) +
    scale_x_discrete(limits = cancer_order_other) +
    labs(fill = "Highest Level\nof Evidence") +
    guides(fill = guide_legend(override.aes = list(size = 1)))

  return(action_tile_plot_all)
}
# Create actionability TMB-H & MSI-H main plot add-on
#
# Draws a two-row tile strip (TMB-H, MSI-H) with, per CANCER_TYPE, the rounded
# percentage of data-freeze samples carrying each alteration.
#
# Arguments (all file arguments are paths read with read.delim):
#   clin_df        Path to the clinical table (read, not otherwise used here).
#   data_freeze    Path to the data-freeze table; joined to the MSI/TMB table
#                  on their shared columns and must provide CANCER_TYPE.
#   group_col      Column of `prop_level_df` whose unique values order the x axis.
#   prop_level_df  Path to the levels barplot table.
#   msi_tmb_df     Path to the MSI/TMB table (SAMPLE_ID, ALTERATION, HIGHEST_LEVEL).
#
# Returns: the ggplot object.
action_main_msi_tmb_fun <- function(clin_df,
                                    data_freeze,
                                    group_col,
                                    prop_level_df = "./actionability_levels_barplot_table.txt",
                                    msi_tmb_df){
  # Load the input tables
  data_freeze <- read.delim(data_freeze)
  clin_df <- read.delim(clin_df)
  msi_tmb_df <- read.delim(msi_tmb_df)
  prop_level_df <- read.delim(prop_level_df)

  # x-axis order comes from the levels table
  x_order <- as.character(unique(prop_level_df[,c(group_col)]))

  # One row per data-freeze sample; samples without an MSI/TMB call get "NONE"
  per_sample <- msi_tmb_df %>%
    mutate_if(is.factor, as.character) %>%
    dplyr::select(SAMPLE_ID, ALTERATION, HIGHEST_LEVEL) %>%
    right_join(data_freeze) %>%
    dplyr::select(CANCER_TYPE, ALTERATION) %>%
    dplyr::mutate_if(is.factor, as.character) %>%
    dplyr::mutate(ALTERATION = ifelse(is.na(ALTERATION), "NONE", ALTERATION))

  # Denominator: number of rows per CANCER_TYPE
  type_totals <- per_sample %>%
    group_by(CANCER_TYPE) %>%
    dplyr::select(CANCER_TYPE) %>%
    dplyr::count()

  # Frequency of each alteration within each cancer type
  aty_alt_df <- per_sample %>%
    left_join(type_totals) %>%
    dplyr::rename(total_count = n) %>%
    mutate_if(is.character, as.factor) %>%
    group_by(ALTERATION, CANCER_TYPE, total_count) %>%
    dplyr::summarise(n = n()) %>%
    dplyr::mutate(freq = n/total_count,
                  percentage = 100*freq,
                  label_text = as.character(round(percentage, 0)),
                  # percentages between 0 and 1 are shown without a label
                  label_text_final = ifelse(percentage > 0 & percentage < 1, "", label_text)) %>%
    filter(ALTERATION != "NONE") %>%
    dplyr::mutate(ALTERATION = factor(ALTERATION, levels = c("TMB-H", "MSI-H")),
                  label = "MSI-H & TMB-H")

  # Separator positions between tiles
  v_breaks <- seq(1.5, length(levels(aty_alt_df$CANCER_TYPE))-0.5, 1)
  h_breaks <- seq(1.5, length(unique(aty_alt_df$ALTERATION))-0.5, 1)

  # Shared theme for the strip
  strip_theme <- theme(panel.grid.major = element_blank(),
                       axis.text.x = element_blank(),
                       axis.text.y = element_text(size = 6),
                       panel.background = element_blank(),
                       panel.border = element_rect(colour = "black", fill=NA, size=1),
                       axis.title.x = element_blank(),
                       axis.title.y = element_blank(),
                       axis.ticks.x = element_blank(),
                       plot.margin = unit(c(0.05, 0.05, 0.1, 0.05), "cm"),
                       legend.justification="left",
                       legend.margin = margin(0,0,0,0),
                       legend.box.margin = margin(-10,0,-10,-5),
                       legend.title = element_blank(),
                       legend.text = element_text(size = 6),
                       legend.key.size = unit(0.4, "cm"))

  # Plot
  aty_alt_tile_plot_all <- ggplot(data = aty_alt_df, aes(x = CANCER_TYPE, y = ALTERATION)) +
    geom_tile(aes(fill = label)) +
    geom_text(aes(label = label_text_final), colour = "black", size = 2) +
    strip_theme +
    geom_vline(xintercept=v_breaks,
               lwd=0.5, colour="white") +
    geom_hline(yintercept=h_breaks,
               lwd=0.25, colour="white") +
    scale_fill_manual(values = c("#88E281"),
                      limits = c("MSI-H & TMB-H"),
                      labels = c("MSI-H & TMB-H")) +
    scale_y_discrete(limits = levels(aty_alt_df$ALTERATION),
                     expand = c(0,0)) +
    scale_x_discrete(limits = x_order)

  return(aty_alt_tile_plot_all)

}
length(unique(aty_alt_df$ALTERATION))-0.5, 1), 1044 | lwd=0.25, colour="white") + 1045 | scale_fill_manual(values = c("#88E281"), 1046 | limits = c("MSI-H & TMB-H"), 1047 | labels = c("MSI-H & TMB-H")) + 1048 | scale_y_discrete(limits = levels(aty_alt_df$ALTERATION), 1049 | expand = c(0,0)) + 1050 | scale_x_discrete(limits = cancer_order_other) 1051 | 1052 | return(aty_alt_tile_plot_all) 1053 | 1054 | } 1055 | 1056 | 1057 | 1058 | #-------------- -------------------------------------------------------------------------------- /data/example_atypical_alterations.txt: -------------------------------------------------------------------------------- 1 | HUGO_SYMBOL SAMPLE_ID ALTERATION 2 | Other Biomarkers TCGA-A6-2672-01 MSI-H 3 | Other Biomarkers TCGA-A6-2672-01 Microsatellite Instability-High 4 | Other Biomarkers TCGA-AG-A002-01 TMB-H 5 | Other Biomarkers TCGA-AG-A002-01 Tumor Mutational Burden-High 6 | EGFR TCGA-FAKE-01 vIII 7 | EGFR TCGA-FAKE-02 vV 8 | FLT3 TCGA-FAKE-01 Internal tandem duplication 9 | FLT3 TCGA-FAKE-02 ITD 10 | BRAF TCGA-FAKE-01 Kinase Domain Duplication 11 | BRAF TCGA-FAKE-03 KDD 12 | EGFR TCGA-FAKE-01 C-terminal domain 13 | EGFR TCGA-FAKE-02 CTD 14 | -------------------------------------------------------------------------------- /data/example_clinical.txt: -------------------------------------------------------------------------------- 1 | SAMPLE_ID ONCOTREE_CODE 2 | TCGA-05-4417-01 LUAD 3 | TCGA-02-0033-01 BLLETV6RUNX1 4 | TCGA-06-0155-01 GBM 5 | TCGA-AG-A002-01 READ 6 | TCGA-A6-2672-01 COAD 7 | TCGA-FAKE-01 AML 8 | TCGA-FAKE-02 AML 9 | TCGA-FAKE-03 HDCN 10 | TCGA-A6-2672-01A-01W-0833-10 MEL 11 | -------------------------------------------------------------------------------- /data/example_cna.txt: -------------------------------------------------------------------------------- 1 | Gene Symbol Locus ID Cytoband TCGA-05-4417-01 TCGA-02-0033-01 2 | MET 0 0 2 2 3 | ERBB2 0 0 2 1 4 | CDK4 0 0 -2 2 5 | CDK4 0 0 -1 2 6 | 
-------------------------------------------------------------------------------- /data/example_fusions.txt: -------------------------------------------------------------------------------- 1 | Tumor_Sample_Barcode Fusion 2 | TCGA-02-0033-01 MLL2-intragenic 3 | TCGA-05-4417-01 ALK-EML4 4 | TCGA-06-0155-01 EGFR-intragenic 5 | TCGA-06-0155-01 TMPRSS2-ERG 6 | TCGA-05-4417-01 TMPRSS2-ERG 7 | TCGA-06-0155-01 ERBB2-intragenic 8 | TCGA-02-0033-01 ETV6-RUNX1 fusion 9 | -------------------------------------------------------------------------------- /data/example_individual_cna.txt: -------------------------------------------------------------------------------- 1 | Tumor_Sample_Barcode Hugo_Symbol Copy_Number_Alteration 2 | TCGA-05-4417-01 MET 2 3 | TCGA-05-4417-01 ERBB2 2 4 | TCGA-05-4417-01 CDK4 -2 5 | TCGA-05-4417-01 CDK4 -1 6 | TCGA-02-0033-01 MET 2 7 | TCGA-02-0033-01 ERBB2 1 8 | TCGA-02-0033-01 CDK4 2 9 | TCGA-02-0033-01 CDK4 2 10 | TCGA-05-4417-01 MET Amplification 11 | TCGA-05-4417-01 CDK4 Deletion 12 | TCGA-05-4417-01 CDK4 Loss 13 | TCGA-05-4417-01 ERBB2 Gain -------------------------------------------------------------------------------- /data/example_maf.txt: -------------------------------------------------------------------------------- 1 | NCBI_Build Hugo_Symbol Variant_Classification Tumor_Sample_Barcode HGVSp_Short HGVSp HGVSg Chromosome Start_Position End_Position Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 2 | GRCh37 CUL1 Missense_Mutation TCGA-A6-2672-01A-01W-0833-10 p.Y466S Tyr466Ser 3 | GRCh37 AKT3 Nonsense_Mutation TCGA-05-4417-01 p.E182* Glu182* 4 | GRCh37 PIK3CA Missense_Mutation TCGA-02-0033-01 p.E542K Glu542Lys 3:g.178936082G>A 3 178936082 178936082 G A A 5 | GRCh37 FGFR3 Missense_Mutation TCGA-05-4417-01 p.V271M Val271Met 6 | GRCh37 EGFR Missense_Mutation TCGA-06-0155-01 p.H304Y His304Tyr 7:g.55223543C>T 7 55223543 55223543 C T T 7 | GRCh37 PTEN Missense_Mutation TCGA-06-0155-01 p.C136R Cys136Arg 10:g.89692922T>C 10 89692922 89692922 T C 
C 8 | GRCh37 FGFR2 Missense_Mutation TCGA-02-0033-01 p.Q212K Gln121Lys 9 | GRCh37 ATM Missense_Mutation TCGA-05-4417-01 p.L2890R Leu2890Arg 10 | GRCh37 KRAS Missense_Mutation TCGA-05-4417-01 p.G12C Gly12Cys 12:g.25398285C>A 12 25398285 25398285 C A A 11 | GRCh37 KRAS Missense_Mutation TCGA-05-4417-01 p.G12C Gly12Cys 12:g.25398285_25398286delinsAG 12 25398285 25398286 CA AG AG 12 | GRCh37 RB1 Nonsense_Mutation TCGA-02-0033-01 p.Q702* Gln702* 13 | GRCh37 TP53 Missense_Mutation TCGA-02-0033-01 p.R248Q Arg248Gln 17:g.7577538C>T 17 7577538 7577538 C T T 14 | GRCh37 NF1 Splice_Site TCGA-02-0033-01 p.X1445_splice X1445_splice 17:g.29586049G>A 17 29586049 29586049 G A A 15 | GRCh37 STK11 Missense_Mutation TCGA-05-4417-01 p.H168R His168Arg 16 | GRCh37 TERT 5'Flank TCGA-05-4417-01 5:g.1295228G>A 5 1295228 1295228 G A A 17 | GRCh37 MYD88 Missense_Mutation TCGA-05-4417-01 M232T 18 | GRCh37 EGFR Missense_Mutation TCGA-05-4417-01 T790M -------------------------------------------------------------------------------- /data/example_maf_grch38.txt: -------------------------------------------------------------------------------- 1 | NCBI_Build Hugo_Symbol Variant_Classification Tumor_Sample_Barcode HGVSp_Short HGVSp HGVSg Chromosome Start_Position End_Position Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 2 | GRCh38 CUL1 Missense_Mutation TCGA-A6-2672-01A-01W-0833-10 p.Y466S Tyr466Ser 3 | GRCh38 AKT3 Nonsense_Mutation TCGA-05-4417-01 p.E182* Glu182* 4 | GRCh38 PIK3CA Missense_Mutation TCGA-02-0033-01 p.E542K Glu542Lys 3:g.179218294G>A 3 179218294 179218294 G A A 5 | GRCh38 FGFR3 Missense_Mutation TCGA-05-4417-01 p.V271M Val271Met 6 | GRCh38 EGFR Missense_Mutation TCGA-06-0155-01 p.H304Y His304Tyr 7:g.55155850C>T 7 55155850 55155850 C T T 7 | GRCh38 PTEN Missense_Mutation TCGA-06-0155-01 p.C136R Cys136Arg 10:g.87933165T>C 10 87933165 87933165 T C C 8 | GRCh38 FGFR2 Missense_Mutation TCGA-02-0033-01 p.Q212K Gln121Lys 9 | GRCh38 ATM Missense_Mutation TCGA-05-4417-01 p.L2890R 
Leu2890Arg 10 | GRCh38 KRAS Missense_Mutation TCGA-05-4417-01 p.G12C Gly12Cys 12:g.25245351C>A 12 25245351 25245351 C A A 11 | GRCh38 RB1 Nonsense_Mutation TCGA-02-0033-01 p.Q702* Gln702* 12 | GRCh38 TP53 Missense_Mutation TCGA-02-0033-01 p.R248Q Arg248Gln 17:g.7674220C>T 17 7674220 7674220 C T T 13 | GRCh38 NF1 Splice_Site TCGA-02-0033-01 p.X1445_splice X1445_splice 17:g.31259031G>A 17 31259031 31259031 G A A 14 | GRCh38 STK11 Missense_Mutation TCGA-05-4417-01 p.H168R His168Arg 15 | GRCh38 MYD88 Missense_Mutation TCGA-05-4417-01 M219T 16 | -------------------------------------------------------------------------------- /data/example_sv.txt: -------------------------------------------------------------------------------- 1 | Tumor_Sample_Barcode GeneA GeneB Sv_Type 2 | TCGA-02-0033-01 MLL2 MLL2 DELETION 3 | TCGA-05-4417-01 ALK EML4 FUSION 4 | TCGA-06-0155-01 EGFR EGFR DELETION 5 | TCGA-06-0155-01 TMPRSS2 ERG FUSION 6 | TCGA-05-4417-01 TMPRSS2 ERG FUSION 7 | TCGA-06-0155-01 ERBB2 ERBB2 DELETION 8 | TCGA-02-0033-01 ETV6 RUNX1 FUSION 9 | -------------------------------------------------------------------------------- /example.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | IMAF="data/example_maf.txt" 3 | OMAF="data/example_maf.oncokb.txt" 4 | 5 | IMAF38="data/example_maf_grch38.txt" 6 | OMAF38="data/example_maf_grch38.oncokb.txt" 7 | 8 | OMAFHGVSPSHORT="data/example_maf_hgsp_short.oncokb.txt" 9 | OMAFHGVSP="data/example_maf_hgsp.oncokb.txt" 10 | OMAFHGVSG="data/example_maf_hgsg.oncokb.txt" 11 | OMAFGC="data/example_maf_genomic_change.oncokb.txt" 12 | 13 | IATYPICALALT="data/example_atypical_alterations.txt" 14 | OATYPICALALT="data/example_atypical_alterations.oncokb.txt" 15 | 16 | IF="data/example_fusions.txt" 17 | OF="data/example_fusions.oncokb.txt" 18 | 19 | ISV="data/example_sv.txt" 20 | OSV="data/example_sv.oncokb.txt" 21 | 22 | ICNA="data/example_cna.txt" 23 | OCNA="data/example_cna.oncokb.txt" 24 
| 25 | IICNA="data/example_individual_cna.txt" 26 | OICNA="data/example_individual_cna.oncokb.txt" 27 | 28 | IC="data/example_clinical.txt" 29 | OC="data/example_clinical.oncokb.txt" 30 | 31 | TOKEN="" #OncoKB API Token 32 | README="data/example_README.txt" 33 | 34 | python MafAnnotator.py -i "$IMAF" -o "$OMAF" -c "$IC" -b "$TOKEN" 35 | python MafAnnotator.py -i "$IMAF" -o "$OMAFHGVSPSHORT" -c "$IC" -b "$TOKEN" -q hgvsp_short 36 | python MafAnnotator.py -i "$IMAF" -o "$OMAFHGVSP" -c "$IC" -b "$TOKEN" -q hgvsp 37 | python MafAnnotator.py -i "$IMAF" -o "$OMAFHGVSG" -c "$IC" -b "$TOKEN" -q hgvsg 38 | python MafAnnotator.py -i "$IMAF" -o "$OMAFGC" -c "$IC" -b "$TOKEN" -q genomic_change 39 | 40 | python MafAnnotator.py -i "$IMAF38" -o "$OMAF38" -c "$IC" -b "$TOKEN" 41 | 42 | python MafAnnotator.py -i "$IATYPICALALT" -o "$OATYPICALALT" -c "$IC" -b "$TOKEN" 43 | 44 | python FusionAnnotator.py -i "$IF" -o "$OF" -c "$IC" -b "$TOKEN" 45 | python StructuralVariantAnnotator.py -i "$ISV" -o "$OSV" -c "$IC" -b "$TOKEN" 46 | python CnaAnnotator.py -i "$ICNA" -o "$OCNA" -c "$IC" -b "$TOKEN" 47 | python CnaAnnotator.py -i "$IICNA" -o "$OICNA" -c "$IC" -b "$TOKEN" -f "individual" -z 48 | python ClinicalDataAnnotator.py -i "$IC" -o "$OC" -a "$OMAF,$OATYPICALALT,$OCNA,$OF,$OSV" 49 | 50 | python GenerateReadMe.py -o "$README" 51 | -------------------------------------------------------------------------------- /flake8.ini: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E501,W503,E126 -------------------------------------------------------------------------------- /requirements/common.txt: -------------------------------------------------------------------------------- 1 | requests==2.31.0 2 | urllib3==1.26.8 -------------------------------------------------------------------------------- /requirements/pip2.7.txt: -------------------------------------------------------------------------------- 1 | enum34==1.1.10 2 | 
kiwisolver==1.1.0 -------------------------------------------------------------------------------- /requirements/pip3.txt: -------------------------------------------------------------------------------- 1 | kiwisolver==1.2.0 2 | -------------------------------------------------------------------------------- /test_Annotation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import pytest 3 | import os 4 | import logging 5 | 6 | from AnnotatorCore import pull_hgvsg_info, DESCRIPTION_HEADERS, ONCOKB_ANNOTATION_HEADERS_GC 7 | from AnnotatorCore import pull_genomic_change_info 8 | from AnnotatorCore import pull_protein_change_info 9 | from AnnotatorCore import pull_structural_variant_info 10 | from AnnotatorCore import pull_cna_info 11 | from AnnotatorCore import setoncokbapitoken 12 | from AnnotatorCore import ProteinChangeQuery 13 | from AnnotatorCore import GenomicChangeQuery 14 | from AnnotatorCore import StructuralVariantQuery 15 | from AnnotatorCore import CNAQuery 16 | from AnnotatorCore import HGVSgQuery 17 | from AnnotatorCore import ReferenceGenome 18 | 19 | ONCOKB_API_TOKEN = os.environ["ONCOKB_API_TOKEN"] 20 | setoncokbapitoken(ONCOKB_API_TOKEN) 21 | 22 | log = logging.getLogger('test_Annotation') 23 | log.info('test-----------', os.environ["ONCOKB_API_TOKEN"], '------') 24 | 25 | VARIANT_EXISTS_INDEX = 2 26 | MUTATION_EFFECT_INDEX = VARIANT_EXISTS_INDEX + 1 27 | ONCOGENIC_INDEX = MUTATION_EFFECT_INDEX + 2 28 | LEVEL_1_INDEX = ONCOGENIC_INDEX + 1 29 | LEVEL_2_INDEX = LEVEL_1_INDEX + 1 30 | LEVEL_3A_INDEX = LEVEL_1_INDEX + 2 31 | HIGHEST_LEVEL_INDEX = LEVEL_1_INDEX + 7 32 | HIGHEST_DX_LEVEL_INDEX = HIGHEST_LEVEL_INDEX + 7 33 | HIGHEST_PX_LEVEL_INDEX = HIGHEST_DX_LEVEL_INDEX + 5 34 | UNKNOWN = 'Unknown' 35 | NUMBER_OF_ANNOTATION_COLUMNS = 27 36 | NUMBER_OF_DESCRIPTION_COLUMNS = len(DESCRIPTION_HEADERS) 37 | NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS = len(ONCOKB_ANNOTATION_HEADERS_GC) 38 | 
NUMBER_OF_ANNOTATION_COLUMNS_WITH_DESCRIPTIONS = NUMBER_OF_ANNOTATION_COLUMNS + NUMBER_OF_DESCRIPTION_COLUMNS 39 | NUMBER_OF_GC_ANNOTATION_COLUMNS = NUMBER_OF_ANNOTATION_COLUMNS + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS 40 | NUMBER_OF_GC_ANNOTATION_COLUMNS_WITH_DESCRIPTIONS = NUMBER_OF_GC_ANNOTATION_COLUMNS + NUMBER_OF_DESCRIPTION_COLUMNS 41 | 42 | 43 | def fake_gene_one_query_suite(annotations, include_descriptions): 44 | assert len(annotations) == 1 45 | 46 | annotation = annotations[0] 47 | assert len( 48 | annotation) == NUMBER_OF_ANNOTATION_COLUMNS if include_descriptions is False else NUMBER_OF_ANNOTATION_COLUMNS_WITH_DESCRIPTIONS 49 | assert annotation[MUTATION_EFFECT_INDEX] == UNKNOWN 50 | assert annotation[ONCOGENIC_INDEX] == UNKNOWN 51 | assert annotation[HIGHEST_LEVEL_INDEX] == '' 52 | 53 | 54 | @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required") 55 | def test_check_protein_change(): 56 | queries = [ 57 | ProteinChangeQuery('BRAF', 'V600E', 'Colorectal Cancer'), 58 | ProteinChangeQuery('ABL1', 'BCR-ABL1 Fusion', 'Acute Leukemias of Ambiguous Lineage'), 59 | ] 60 | 61 | annotations = pull_protein_change_info(queries, False, False) 62 | assert len(annotations) == 2 63 | 64 | annotation = annotations[0] 65 | assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS 66 | assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function' 67 | assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' 68 | assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1' 69 | 70 | annotation = annotations[1] 71 | assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS 72 | assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function' 73 | assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' 74 | assert annotation[HIGHEST_LEVEL_INDEX] == '' 75 | assert annotation[HIGHEST_DX_LEVEL_INDEX] == 'LEVEL_Dx1' 76 | assert annotation[HIGHEST_PX_LEVEL_INDEX] == 'LEVEL_Px1' 77 | 78 | 79 | @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token 
required") 80 | def test_reference_genome(): 81 | queries = [ 82 | GenomicChangeQuery('7', '140453136', '140453136', 'A', 'T', 'LUAD', ReferenceGenome.GRCH37), 83 | GenomicChangeQuery('7', '140753336', '140753336', 'A', 'T', 'LUAD', ReferenceGenome.GRCH38) 84 | ] 85 | 86 | annotations = pull_genomic_change_info(queries, False, False) 87 | assert len(annotations) == 2 88 | 89 | annotation37 = annotations[0] 90 | annotation38 = annotations[1] 91 | assert annotation37 == annotation38 92 | 93 | queries = [ 94 | ProteinChangeQuery('MYD88', 'M232T', 'Ovarian Cancer', ReferenceGenome.GRCH37), 95 | ProteinChangeQuery('MYD88', 'M219T', 'Ovarian Cancer', ReferenceGenome.GRCH38) 96 | ] 97 | 98 | annotations = pull_protein_change_info(queries, False, False) 99 | assert len(annotations) == 2 100 | 101 | annotation37 = annotations[0] 102 | annotation38 = annotations[1] 103 | assert annotation37 == annotation38 104 | 105 | 106 | @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required") 107 | def test_fake_gene_protein_change(): 108 | queries = [ 109 | ProteinChangeQuery('test1', 'V600E', 'Ovarian Cancer') 110 | ] 111 | 112 | annotations = pull_protein_change_info(queries, False, False) 113 | fake_gene_one_query_suite(annotations, False) 114 | 115 | annotations = pull_protein_change_info(queries, False, False) 116 | fake_gene_one_query_suite(annotations, True) 117 | 118 | 119 | @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required") 120 | def test_check_atypical_alts(): 121 | queries = [ 122 | ProteinChangeQuery('Other Biomarkers', 'MSI-H', 'Colorectal Cancer'), 123 | ProteinChangeQuery('Other Biomarkers', 'MSI-H', 'Leukemia'), 124 | ProteinChangeQuery('TERT', 'Promoter Mutation', 'Bladder Cancer'), 125 | ProteinChangeQuery('TERT', 'Promoter Mutation', 'Bladder Cancer', None, '5\'Flank') 126 | ] 127 | 128 | annotations = pull_protein_change_info(queries, False, False) 129 | assert len(annotations) == 4 130 | 131 | 
annotation = annotations[0] 132 | assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS 133 | assert annotation[MUTATION_EFFECT_INDEX] == UNKNOWN 134 | assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' 135 | assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1' 136 | 137 | annotation = annotations[1] 138 | assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS 139 | assert annotation[MUTATION_EFFECT_INDEX] == UNKNOWN 140 | assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' 141 | assert annotation[HIGHEST_LEVEL_INDEX] == '' 142 | 143 | annotation = annotations[2] 144 | assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS 145 | assert annotation[MUTATION_EFFECT_INDEX] == 'Likely Gain-of-function' 146 | assert annotation[ONCOGENIC_INDEX] == 'Likely Oncogenic' 147 | assert annotation[HIGHEST_LEVEL_INDEX] == '' 148 | 149 | annotation_dup = annotations[3] 150 | assert len(annotation_dup) == NUMBER_OF_ANNOTATION_COLUMNS 151 | assert annotation == annotation_dup 152 | 153 | 154 | @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required") 155 | def test_check_hgvsg(): 156 | queries = [ 157 | # KRAS G12C 158 | HGVSgQuery('12:g.25398285C>A', 'LUAD'), 159 | # KRAS G12C 160 | HGVSgQuery('12:g.25398285_25398286delinsAG', 'LUAD'), 161 | # TERT Promoter 162 | HGVSgQuery('5:g.1295167_1295168delinsAATG', 'LUAD'), 163 | ] 164 | 165 | annotations = pull_hgvsg_info(queries, False, False) 166 | assert len(annotations) == 3 167 | 168 | annotation = annotations[0] 169 | assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS 170 | assert annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Gain-of-function' 171 | assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Oncogenic' 172 | assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'LEVEL_1' 173 | 174 | annotation = annotations[1] 175 | assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS 176 | assert 
annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Gain-of-function' 177 | assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Oncogenic' 178 | assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'LEVEL_1' 179 | 180 | annotation = annotations[2] 181 | assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS 182 | assert annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Likely Gain-of-function' 183 | assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Likely Oncogenic' 184 | assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == '' 185 | 186 | 187 | @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required") 188 | def test_check_genomic_change(): 189 | queries = [ 190 | # KRAS G12C 191 | GenomicChangeQuery('12', '25398285', '25398285', 'C', 'A', 'LUAD'), 192 | # KRAS G12C 193 | GenomicChangeQuery('12', '25398285', '25398286', 'CA', 'AG', 'LUAD'), 194 | # TERT Promoter 195 | GenomicChangeQuery('5', '1295167', '1295168', 'TC', 'AATG', 'LUAD'), 196 | ] 197 | 198 | annotations = pull_genomic_change_info(queries, False, False) 199 | assert len(annotations) == 3 200 | 201 | annotation = annotations[0] 202 | assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS 203 | assert annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Gain-of-function' 204 | assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Oncogenic' 205 | assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'LEVEL_1' 206 | 207 | annotation = annotations[1] 208 | assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS 209 | assert annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Gain-of-function' 210 | assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Oncogenic' 211 | 
assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'LEVEL_1' 212 | 213 | annotation = annotations[2] 214 | assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS 215 | assert annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Likely Gain-of-function' 216 | assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Likely Oncogenic' 217 | assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == '' 218 | 219 | 220 | @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required") 221 | def test_check_structural_variants(): 222 | queries = [ 223 | StructuralVariantQuery('ALK', 'EML4', 'FUSION', 'NSCLC'), 224 | StructuralVariantQuery('ALK', 'EML4', 'FUSION', 'Melanoma'), 225 | StructuralVariantQuery('BCR', 'ABL1', 'FUSION', 'Acute Leukemias of Ambiguous Lineage'), 226 | ] 227 | 228 | annotations = pull_structural_variant_info(queries, False) 229 | assert len(annotations) == 3 230 | 231 | annotation = annotations[0] 232 | assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS 233 | assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function' 234 | assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' 235 | assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1' 236 | 237 | annotation = annotations[1] 238 | assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS 239 | assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function' 240 | assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' 241 | assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_3B' 242 | 243 | annotation = annotations[2] 244 | assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS 245 | assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function' 246 | assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' 247 | assert annotation[HIGHEST_LEVEL_INDEX] == '' 248 | assert annotation[HIGHEST_DX_LEVEL_INDEX] == 'LEVEL_Dx1' 249 | assert annotation[HIGHEST_PX_LEVEL_INDEX] == 'LEVEL_Px1' 250 | 251 
| 252 | @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required") 253 | def test_fake_fusion_gene(): 254 | queries = [ 255 | StructuralVariantQuery('test1', 'test2', 'FUSION', 'NSCLC'), 256 | ] 257 | 258 | annotations = pull_structural_variant_info(queries, False) 259 | fake_gene_one_query_suite(annotations, False) 260 | 261 | annotations = pull_structural_variant_info(queries, False) 262 | fake_gene_one_query_suite(annotations, True) 263 | 264 | 265 | @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required") 266 | def test_cna(): 267 | queries = [ 268 | CNAQuery('BRCA2', 'DELETION', 'Ovarian Cancer'), 269 | CNAQuery('ERBB2', 'Amplification', 'Breast Cancer'), 270 | CNAQuery('ERBB2', 'Amplification', 'Colorectal Cancer'), 271 | CNAQuery('CDKN2A', 'Deletion', 'AML with BCR-ABL1'), 272 | ] 273 | 274 | annotations = pull_cna_info(queries, False) 275 | assert len(annotations) == 4 276 | 277 | annotation = annotations[0] 278 | assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS 279 | assert annotation[MUTATION_EFFECT_INDEX] == 'Loss-of-function' 280 | assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' 281 | assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1' 282 | 283 | annotation = annotations[1] 284 | assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS 285 | assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function' 286 | assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' 287 | assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1' 288 | 289 | annotation = annotations[2] 290 | assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS 291 | assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function' 292 | assert annotation[ONCOGENIC_INDEX] == 'Oncogenic' 293 | assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1' 294 | 295 | annotation = annotations[3] 296 | assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS 297 | assert annotation[MUTATION_EFFECT_INDEX] == 'Loss-of-function' 298 | assert 
annotation[ONCOGENIC_INDEX] == 'Oncogenic' 299 | assert annotation[HIGHEST_LEVEL_INDEX] == '' 300 | assert annotation[HIGHEST_DX_LEVEL_INDEX] == 'LEVEL_Dx2' 301 | assert annotation[HIGHEST_PX_LEVEL_INDEX] == '' 302 | 303 | 304 | @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required") 305 | def test_fake_cna(): 306 | queries = [ 307 | CNAQuery('test1', 'Amplification', 'Breast Cancer'), 308 | ] 309 | 310 | annotations = pull_cna_info(queries, False) 311 | fake_gene_one_query_suite(annotations, False) 312 | 313 | annotations = pull_cna_info(queries, True) 314 | fake_gene_one_query_suite(annotations, True) 315 | 316 | 317 | def check_brca2_s1882_without_cancertype(annotation, genomic_query=False): 318 | assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS if genomic_query else NUMBER_OF_ANNOTATION_COLUMNS 319 | assert annotation[( 320 | NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS + MUTATION_EFFECT_INDEX) if genomic_query else MUTATION_EFFECT_INDEX] == 'Likely Loss-of-function' 321 | assert annotation[( 322 | NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS + ONCOGENIC_INDEX) if genomic_query else ONCOGENIC_INDEX] == 'Likely Oncogenic' 323 | assert annotation[( 324 | NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS + HIGHEST_LEVEL_INDEX) if genomic_query else HIGHEST_LEVEL_INDEX] == 'LEVEL_1' 325 | assert annotation[( 326 | NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS + LEVEL_1_INDEX) if genomic_query else LEVEL_1_INDEX] == 'Olaparib,Olaparib+Bevacizumab,Rucaparib,Olaparib+Abiraterone+Prednisone,Niraparib,Talazoparib+Enzalutamide,Niraparib+Abiraterone Acetate+Prednisone' 327 | assert annotation[( 328 | NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS + LEVEL_2_INDEX) if genomic_query else LEVEL_2_INDEX] == 'Olaparib,Rucaparib,Niraparib' 329 | assert annotation[( 330 | NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS + LEVEL_3A_INDEX) if genomic_query else LEVEL_3A_INDEX] == 'Olaparib,Talazoparib' 331 | 332 | 333 | @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb 
api token required") 334 | def test_duplicated_treatments(): 335 | # there should not be any duplicated treatment listed when cancer type is not specified 336 | 337 | # test protein change query 338 | queries = [ 339 | ProteinChangeQuery('BRCA2', 'S1882*', ''), 340 | ] 341 | annotations = pull_protein_change_info(queries, False, False) 342 | assert len(annotations) == 1 343 | 344 | check_brca2_s1882_without_cancertype(annotations[0]) 345 | 346 | # test genomic change query 347 | queries = [ 348 | GenomicChangeQuery('13', '32914137', '32914137', 'C', 'A', ''), 349 | ] 350 | annotations = pull_genomic_change_info(queries, False, False) 351 | assert len(annotations) == 1 352 | 353 | check_brca2_s1882_without_cancertype(annotations[0], True) 354 | -------------------------------------------------------------------------------- /test_AnnotatorCore.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import pytest 3 | 4 | from AnnotatorCore import getgenesfromfusion 5 | from AnnotatorCore import conversion 6 | from AnnotatorCore import replace_all 7 | from AnnotatorCore import resolve_query_type 8 | from AnnotatorCore import get_highest_tx_level 9 | from AnnotatorCore import get_cna 10 | from AnnotatorCore import QueryType 11 | from AnnotatorCore import ALTERATION_HEADER 12 | from AnnotatorCore import HGVSP_HEADER 13 | from AnnotatorCore import HGVSP_SHORT_HEADER 14 | from AnnotatorCore import HGVSG_HEADER 15 | from AnnotatorCore import GC_REF_ALLELE_HEADER 16 | from AnnotatorCore import GC_CHROMOSOME_HEADER 17 | from AnnotatorCore import GC_START_POSITION_HEADER 18 | from AnnotatorCore import GC_END_POSITION_HEADER 19 | from AnnotatorCore import GC_VAR_ALLELE_1_HEADER 20 | from AnnotatorCore import GC_VAR_ALLELE_2_HEADER 21 | from AnnotatorCore import TX_TYPE_SENSITIVE 22 | from AnnotatorCore import TX_TYPE_RESISTANCE 23 | from AnnotatorCore import CNA_AMPLIFICATION_TXT 24 | from AnnotatorCore import 
CNA_DELETION_TXT 25 | from AnnotatorCore import CNA_GAIN_TXT 26 | from AnnotatorCore import CNA_LOSS_TXT 27 | 28 | 29 | def test_getgenesfromfusion(): 30 | AB_EXAMPLE = ('A', 'B') 31 | assert getgenesfromfusion('A-B') == AB_EXAMPLE 32 | assert getgenesfromfusion('A-B ') == AB_EXAMPLE 33 | assert getgenesfromfusion('a-b') == ('a', 'b') 34 | assert getgenesfromfusion('A') == ('A', 'A') 35 | assert getgenesfromfusion('A1-1B') == ('A1', '1B') 36 | 37 | # Test fusion case insensitive 38 | assert getgenesfromfusion('A-B fusion') == AB_EXAMPLE 39 | assert getgenesfromfusion('A-B Fusion') == AB_EXAMPLE 40 | 41 | # Test unnecessary characters will be trimmed off after fusion 42 | assert getgenesfromfusion('A-B fusion archer') == AB_EXAMPLE 43 | assert getgenesfromfusion('A-B fusion Archer') == AB_EXAMPLE 44 | assert getgenesfromfusion('A-B fusion -Archer') == AB_EXAMPLE 45 | assert getgenesfromfusion('A-B fusion -archer') == AB_EXAMPLE 46 | assert getgenesfromfusion('A-B fusion - archer') == AB_EXAMPLE 47 | assert getgenesfromfusion('A-B fusion - archer ') == AB_EXAMPLE 48 | 49 | assert getgenesfromfusion('A-B fusion test') == AB_EXAMPLE 50 | assert getgenesfromfusion('fusion A-B fusion') == AB_EXAMPLE 51 | 52 | # Test intragenic 53 | assert getgenesfromfusion('MLL2-intragenic') == ('MLL2', 'MLL2') 54 | 55 | 56 | def test_conversion(): 57 | # Test conversion case for case insensitivity 58 | assert conversion('tyr100') == 'Y100' 59 | assert conversion('tYr100') == 'Y100' 60 | assert conversion('Tyr100') == 'Y100' 61 | assert conversion('tyR100') == 'Y100' 62 | assert conversion('TyR100') == 'Y100' 63 | assert conversion('TYR100') == 'Y100' 64 | assert conversion('tYR100') == 'Y100' 65 | assert conversion('sEr100') == 'S100' 66 | 67 | # Test conversion only targets dict() keys 68 | assert conversion('hot100') == 'hot100' 69 | 70 | # Test conversion is not affected by empty string and whitespaces 71 | assert conversion('') == '' 72 | assert conversion(' sEr100') == ' S100' 73 
| 74 | # Test conversion when the string contains three letter but not supposed to be converted 75 | assert conversion('Promoter') == 'Promoter' 76 | 77 | 78 | def test_replace_all(): 79 | # Test replace_all for case insensitivity 80 | assert replace_all('tyr') == 'Y' 81 | assert replace_all('tYr') == 'Y' 82 | assert replace_all('Tyr') == 'Y' 83 | assert replace_all('tyR') == 'Y' 84 | assert replace_all('TyR') == 'Y' 85 | assert replace_all('TYR') == 'Y' 86 | assert replace_all('tYR') == 'Y' 87 | assert replace_all('sEr') == 'S' 88 | 89 | # Test replace_all only targets the dict() keys 90 | assert replace_all('bubblegum juice cup dairy hot pot Tyr melon') == 'bubblegum juice cup dairy hot pot Y melon' 91 | assert replace_all('Ly Lys Pr Pro Gln Glad Ph PH Phe') == 'Ly K Pr P Q Glad Ph PH F' 92 | assert replace_all( 93 | 'nOt can fat Tan Rat cat dog man Men FAn rot taR car fAr map TAP Zip poP') == 'nOt can fat Tan Rat cat dog man Men FAn rot taR car fAr map TAP Zip poP' 94 | 95 | # Test replace_all is not affected by numbers 96 | assert replace_all('Tyr600E Cys56734342342454562456') == 'Y600E C56734342342454562456' 97 | assert replace_all( 98 | '60 045 434 345 4 26 567 254 245 34 67567 8 56 8 364 56 6 345 7567 3455 6 8 99 89 7 3') == '60 045 434 345 4 26 567 254 245 34 67567 8 56 8 364 56 6 345 7567 3455 6 8 99 89 7 3' 99 | 100 | # Test replace_all is not affected by empty string and whitespaces 101 | assert replace_all('') == '' 102 | assert replace_all(' ') == ' ' 103 | assert replace_all('Tyr Asn As n Ile Il e') == 'Y N As n I Il e' 104 | 105 | 106 | def test_resolve_query_type(): 107 | assert resolve_query_type(None, [HGVSG_HEADER]) == QueryType.HGVSG 108 | assert resolve_query_type(None, [HGVSP_HEADER]) == QueryType.HGVSP 109 | assert resolve_query_type(None, [HGVSP_SHORT_HEADER]) == QueryType.HGVSP_SHORT 110 | assert resolve_query_type(None, [HGVSG_HEADER, HGVSP_HEADER, HGVSP_SHORT_HEADER]) == QueryType.HGVSP_SHORT 111 | assert resolve_query_type(None, 
[GC_CHROMOSOME_HEADER, GC_START_POSITION_HEADER, GC_END_POSITION_HEADER, 112 | GC_REF_ALLELE_HEADER, GC_VAR_ALLELE_1_HEADER, 113 | GC_VAR_ALLELE_2_HEADER]) == QueryType.GENOMIC_CHANGE 114 | 115 | assert resolve_query_type(QueryType.HGVSG, [HGVSG_HEADER, HGVSP_HEADER, HGVSP_SHORT_HEADER]) == QueryType.HGVSG 116 | 117 | # Test extreme cases 118 | with pytest.raises(Exception): 119 | assert resolve_query_type(None, []) 120 | assert resolve_query_type(None, [ALTERATION_HEADER]) == QueryType.HGVSP_SHORT 121 | 122 | # Raise exception when the file does not have asked header 123 | with pytest.raises(Exception): 124 | assert resolve_query_type(QueryType.HGVSG, [HGVSP_SHORT_HEADER]) 125 | with pytest.raises(Exception): 126 | assert resolve_query_type(QueryType.GENOMIC_CHANGE, [GC_CHROMOSOME_HEADER, GC_START_POSITION_HEADER]) 127 | 128 | 129 | def test_get_highest_tx_level(): 130 | oncokb_data = {} 131 | assert get_highest_tx_level(oncokb_data) == '' 132 | assert get_highest_tx_level(oncokb_data, 'random') == '' 133 | assert get_highest_tx_level(oncokb_data, TX_TYPE_SENSITIVE) == '' 134 | assert get_highest_tx_level(oncokb_data, TX_TYPE_RESISTANCE) == '' 135 | 136 | oncokb_data = {'LEVEL_1': ['test'], 'LEVEL_R1': ['test'], 'LEVEL_R2': ['test']} 137 | assert get_highest_tx_level(oncokb_data) == 'LEVEL_R1' 138 | assert get_highest_tx_level(oncokb_data, 'random') == 'LEVEL_R1' 139 | assert get_highest_tx_level(oncokb_data, TX_TYPE_SENSITIVE) == 'LEVEL_1' 140 | assert get_highest_tx_level(oncokb_data, TX_TYPE_RESISTANCE) == 'LEVEL_R1' 141 | 142 | oncokb_data = {'LEVEL_1': ['test'], 'LEVEL_R2': ['test']} 143 | assert get_highest_tx_level(oncokb_data) == 'LEVEL_1' 144 | assert get_highest_tx_level(oncokb_data, 'random') == 'LEVEL_1' 145 | assert get_highest_tx_level(oncokb_data, TX_TYPE_SENSITIVE) == 'LEVEL_1' 146 | assert get_highest_tx_level(oncokb_data, TX_TYPE_RESISTANCE) == 'LEVEL_R2' 147 | 148 | 149 | def test_cna(): 150 | assert get_cna(None) is None 151 | assert 
get_cna('') is None 152 | assert get_cna('test') is None 153 | assert get_cna('Amplification') == CNA_AMPLIFICATION_TXT 154 | assert get_cna('Gain') is None 155 | assert get_cna('Deletion') == CNA_DELETION_TXT 156 | assert get_cna('Loss') is None 157 | assert get_cna('2') == CNA_AMPLIFICATION_TXT 158 | assert get_cna('1') is None 159 | assert get_cna('-2') == CNA_DELETION_TXT 160 | assert get_cna('-1.5') == CNA_DELETION_TXT 161 | assert get_cna('-1') is None 162 | assert get_cna('0') is None 163 | 164 | assert get_cna(None, False) is None 165 | assert get_cna('', False) is None 166 | assert get_cna('test', False) is None 167 | assert get_cna('Amplification', False) == CNA_AMPLIFICATION_TXT 168 | assert get_cna('Gain', False) is None 169 | assert get_cna('Deletion', False) == CNA_DELETION_TXT 170 | assert get_cna('Loss', False) is None 171 | 172 | assert get_cna(None, True) is None 173 | assert get_cna('', True) is None 174 | assert get_cna('test', True) is None 175 | assert get_cna('Amplification', True) == CNA_AMPLIFICATION_TXT 176 | assert get_cna('Gain', True) == CNA_GAIN_TXT 177 | assert get_cna('Deletion', True) == CNA_DELETION_TXT 178 | assert get_cna('Loss', True) == CNA_LOSS_TXT 179 | assert get_cna('2', True) == CNA_AMPLIFICATION_TXT 180 | assert get_cna('1', True) == CNA_GAIN_TXT 181 | assert get_cna('-2', True) == CNA_DELETION_TXT 182 | assert get_cna('-1.5', True) == CNA_DELETION_TXT 183 | assert get_cna('-1', True) == CNA_LOSS_TXT 184 | assert get_cna('0', True) is None 185 | --------------------------------------------------------------------------------