├── .editorconfig
├── .github
    ├── release-drafter.yml
    └── workflows
    │   ├── after-master-commit.yml
    │   ├── compare-annotation.yml
    │   ├── compare-genomic-change-annotation.yml
    │   ├── pytest.yml
    │   └── release-management.yml
├── .gitignore
├── .version-level
├── AnnotatorCore.py
├── ClinicalDataAnnotator.py
├── CnaAnnotator.py
├── FusionAnnotator.py
├── GenerateReadMe.py
├── LICENSE
├── MafAnnotator.py
├── OncoKBPlots.py
├── README.md
├── StructuralVariantAnnotator.py
├── actionability_functions_msi_tmb_manuscript_R.r
├── data
    ├── example_atypical_alterations.txt
    ├── example_clinical.txt
    ├── example_cna.txt
    ├── example_fusions.txt
    ├── example_individual_cna.txt
    ├── example_maf.txt
    ├── example_maf_grch38.txt
    └── example_sv.txt
├── example.sh
├── flake8.ini
├── requirements
    ├── common.txt
    ├── pip2.7.txt
    └── pip3.txt
├── test_Annotation.py
└── test_AnnotatorCore.py


/.editorconfig:
--------------------------------------------------------------------------------
 1 | # The EditorConfig project consists of a file format for defining coding styles
 2 | # and a collection of text editor plugins that enable editors to read the file format
 3 | # and adhere to defined styles.
 4 | 
 5 | # EditorConfig files are read top to bottom and the closest EditorConfig files are read last.
 6 | # Properties from matching EditorConfig sections are applied in the order they were read,
 7 | # so properties in closer files take precedence.
 8 | 
 9 | # Please only specify the formats you want to apply throughout the project in this file.
10 | # Otherwise, please create a new config file in the directory where you want to apply these styles.
11 | 
12 | # More details about EditorConfig: http://EditorConfig.org
13 | 
14 | # top-most EditorConfig file
15 | root = true
16 | 
17 | [*]
18 | # Do not force a final newline and do not strip trailing whitespace
19 | insert_final_newline = false
20 | trim_trailing_whitespace = false
21 | 
22 | 
23 | 


--------------------------------------------------------------------------------
/.github/release-drafter.yml:
--------------------------------------------------------------------------------
 1 | name-template: 'v$NEXT_PATCH_VERSION'
 2 | tag-template: 'v$NEXT_PATCH_VERSION'
 3 | categories:
 4 |   - title: '🧬 Features'
 5 |     labels:
 6 |       - 'feature'
 7 |   - title: '🐛 Bug Fixes'
 8 |     labels:
 9 |       - 'fix'
10 |   - title: '🏎 Performance Tweaks'
11 |     labels:
12 |       - 'performance'
13 |   - title: '🎨 Style Tweaks'
14 |     labels:
15 |       - 'style tweak'
16 |   - title: '📘 Documentation'
17 |     labels:
18 |       - 'documentation'
19 |   - title: '🧹 Cleanup'
20 |     labels:
21 |       - 'cleanup'
22 |   - title: '👷‍♀️ Testing, Configuration & Deployment'
23 |     labels:
24 |       - 'devops'
25 |   - title: '🧰 Maintenance'
26 |     labels:
27 |       - 'chore'
28 |       - 'dependencies'
29 | change-template: '- $TITLE @$AUTHOR (#$NUMBER)'
30 | template: |
31 |   ## Changes
32 |   $CHANGES
33 |   ## 🕵️‍♀️ Full commit logs
34 |   - https://github.com/oncokb/oncokb-annotator/compare/$PREVIOUS_TAG...v$NEXT_PATCH_VERSION
35 | 


--------------------------------------------------------------------------------
/.github/workflows/after-master-commit.yml:
--------------------------------------------------------------------------------
 1 | name: After master commit
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - master
 7 | 
 8 | jobs:
 9 |   check-version-level-and-update:
10 |     if: github.repository == 'oncokb/oncokb-annotator'
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - uses: actions/checkout@v2
14 |         with:
15 |           fetch-depth: 0
16 |       - name: 'Update Version Level'
17 |         run: |
18 |           git pull
19 |           VERSION_LEVEL=$(cat .version-level | tr "[:upper:]" "[:lower:]")
20 | 
21 |           RELEASE_DRAFTER_MINOR='NEXT_MINOR_VERSION'
22 |           RELEASE_DRAFTER_PATCH='NEXT_PATCH_VERSION'
23 | 
24 |           if [[ $VERSION_LEVEL == 'minor' ]]; then
25 |           sed -i "s/$RELEASE_DRAFTER_PATCH/$RELEASE_DRAFTER_MINOR/gi" .github/release-drafter.yml
26 |           fi
27 | 
28 |           if [[ $VERSION_LEVEL == 'patch' ]]; then
29 |           sed -i "s/$RELEASE_DRAFTER_MINOR/$RELEASE_DRAFTER_PATCH/gi" .github/release-drafter.yml
30 |           fi
31 | 
32 |           CHANGED=$(git diff --name-only HEAD --)
33 |           if [ -n "$CHANGED" ]
34 |           then
35 |             git config user.name oncokb-bot
36 |             git config user.email dev.oncokb@gmail.com
37 |             git add .
38 |             git commit -m "Update action files to align the version level to $VERSION_LEVEL"
39 |             git push
40 |           fi
41 | 


--------------------------------------------------------------------------------
/.github/workflows/compare-annotation.yml:
--------------------------------------------------------------------------------
  1 | # This workflow will install Python dependencies, annotate the test data and compare the result against the master annotation
  2 | 
  3 | name: Compare Annotation
  4 | 
  5 | on:
  6 |   push:
  7 |     branches:
  8 |       - master
  9 |       - next-minor-release
 10 |   pull_request:
 11 |     branches:
 12 |       - master
 13 |       - next-minor-release
 14 | jobs:
 15 |   # Annotates the shared test data with the checked-out code and fails when the
 16 |   # result differs from the reference annotation kept in oncokb-data.
 17 |   build:
 18 |     if: github.repository == 'oncokb/oncokb-annotator'
 19 |     runs-on: macos-latest
 20 |     steps:
 21 |       - uses: actions/checkout@v2
 22 |       - name: Set up Python 3.8
 23 |         uses: actions/setup-python@v2
 24 |         with:
 25 |           python-version: '3.8'
 26 |       - name: Install dependencies
 27 |         run: |
 28 |           python -m pip install --upgrade pip
 29 |           pip install flake8
 30 |           pip install -r requirements/common.txt -r requirements/pip3.txt
 31 |       - name: Lint with flake8
 32 |         run: |
 33 |           # stop the build if there are Python syntax errors or undefined names
 34 |           flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
 35 |           # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
 36 |           flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
 37 |       - name: Annotate
 38 |         id: annotate
 39 |         env:
 40 |           ONCOKB_API_TOKEN: ${{ secrets.ONCOKB_BOT_API_TOKEN }}
 41 |           ONCOKB_OAUTH_TOKEN: ${{ secrets.ONCOKB_OAUTH_TOKEN }}
 42 |         run: |
 43 |           git checkout -b compare
 44 | 
 45 |           MUTATION_DATA_NAME=data_mutations_mskcc.txt
 46 |           CLINICAL_DATA_NAME=data_clinical_sample.txt
 47 |           FUSION_DATA_NAME=data_fusions.txt
 48 |           INDIVIDUAL_CNA_DATA_NAME=data_individual_CNA.txt
 49 | 
 50 |           # download only the four input files that are annotated below
 51 |           cd data || exit
 52 |           curl -s -H "Authorization: token ${ONCOKB_OAUTH_TOKEN}" https://api.github.com/repos/knowledgesystems/oncokb-data/contents/annotation/annotator-test/data | jq -r '.[] | .download_url + " " + .name' | while IFS=' ' read -r downloadurl name; do
 53 |             case "$name" in
 54 |               "$MUTATION_DATA_NAME" | "$CLINICAL_DATA_NAME" | "$FUSION_DATA_NAME" | "$INDIVIDUAL_CNA_DATA_NAME")
 55 |                 curl -s "$downloadurl" -o "$name"
 56 |                 ;;
 57 |             esac
 58 |           done
 59 |           cd ..
 60 | 
 61 |           # create compare folder to add all annotated files
 62 |           mkdir compare
 63 | 
 64 |           PREFIX=oncokb
 65 |           IMAF=data/"$MUTATION_DATA_NAME"
 66 |           OMAF=compare/"$PREFIX"_"$MUTATION_DATA_NAME"
 67 | 
 68 |           IC=data/"$CLINICAL_DATA_NAME"
 69 |           OC=compare/"$PREFIX"_"$CLINICAL_DATA_NAME"
 70 | 
 71 |           IF=data/"$FUSION_DATA_NAME"
 72 |           OF=compare/"$PREFIX"_"$FUSION_DATA_NAME"
 73 | 
 74 |           IICNA=data/"$INDIVIDUAL_CNA_DATA_NAME"
 75 |           OICNA=compare/"$PREFIX"_"$INDIVIDUAL_CNA_DATA_NAME"
 76 | 
 77 |           python MafAnnotator.py -i "$IMAF" -o "$OMAF" -c "$IC" -b "$ONCOKB_API_TOKEN"
 78 |           python FusionAnnotator.py -i "$IF" -o "$OF" -c "$IC" -b "$ONCOKB_API_TOKEN"
 79 |           python CnaAnnotator.py -i "$IICNA" -o "$OICNA" -c "$IC" -b "$ONCOKB_API_TOKEN" -f "individual"
 80 |           python ClinicalDataAnnotator.py -i "$IC" -o "$OC" -a "$OMAF,$OICNA,$OF"
 81 | 
 82 |           git config user.name oncokb-bot
 83 |           git config user.email dev.oncokb@gmail.com
 84 | 
 85 |           # commit the fresh annotation so the next step can diff against HEAD
 86 |           git add .
 87 |           git commit -m 'add analysis'
 88 | 
 89 |       - name: Compare annotation result with the ones from master
 90 |         id: compare
 91 |         env:
 92 |           ONCOKB_OAUTH_TOKEN: ${{ secrets.ONCOKB_OAUTH_TOKEN }}
 93 |           FILE_NAME_PREFIX: 'oncokb_data'
 94 |         run: |
 95 |           # remove everything under compare folder and replace with the ones from oncokb-data
 96 |           rm -f compare/*.txt
 97 | 
 98 |           cd compare || exit
 99 |           curl -s -H "Authorization: token ${ONCOKB_OAUTH_TOKEN}" https://api.github.com/repos/knowledgesystems/oncokb-data/contents/annotation/annotator-test/annotation | jq -r '.[] | .download_url + " " + .name' | while IFS=' ' read -r downloadurl name; do
100 |             if [[ "$name" == "$FILE_NAME_PREFIX"* ]]; then
101 |               curl -s "$downloadurl" -o "$name"
102 |             fi
103 |           done
104 |           cd ..
105 | 
106 |           # compare: any difference between the downloaded files and the commit above fails the job
107 |           CHANGED=$(git diff --name-only HEAD --)
108 | 
109 |           if [ -n "$CHANGED" ]
110 |           then
111 |             git diff
112 |             exit 1
113 |           fi
114 | 


--------------------------------------------------------------------------------
/.github/workflows/compare-genomic-change-annotation.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will install Python dependencies, run the genomic change annotation for a particular study and compare it against the master annotation
 2 | 
 3 | name: Compare Genomic Change Annotation
 4 | 
 5 | on:
 6 |   push:
 7 |     branches:
 8 |       - master
 9 |       - next-minor-release
10 |   pull_request:
11 |     branches:
12 |       - master
13 |       - next-minor-release
14 | jobs:
15 |   # Annotates the test MAF by genomic change and fails when the result differs
16 |   # from the reference annotation kept in oncokb-data.
17 |   build:
18 |     if: github.repository == 'oncokb/oncokb-annotator'
19 |     runs-on: macos-latest
20 |     steps:
21 |       - uses: actions/checkout@v2
22 |       - name: Set up Python 3.8
23 |         uses: actions/setup-python@v2
24 |         with:
25 |           python-version: '3.8'
26 |       - name: Install dependencies
27 |         run: |
28 |           python -m pip install --upgrade pip
29 |           pip install flake8
30 |           pip install -r requirements/common.txt -r requirements/pip3.txt
31 |       - name: Lint with flake8
32 |         run: |
33 |           # stop the build if there are Python syntax errors or undefined names
34 |           flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
35 |           # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
36 |           flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
37 |       - name: Annotate
38 |         id: annotate
39 |         env:
40 |           ONCOKB_API_TOKEN: ${{ secrets.ONCOKB_BOT_API_TOKEN }}
41 |           ONCOKB_OAUTH_TOKEN: ${{ secrets.ONCOKB_OAUTH_TOKEN }}
42 |         run: |
43 |           git checkout -b compare
44 | 
45 |           MUTATION_DATA_NAME=data_mutations_mskcc.txt
46 |           CLINICAL_DATA_NAME=data_clinical_sample.txt
47 | 
48 |           cd data || exit
49 |           curl -s -H "Authorization: token ${ONCOKB_OAUTH_TOKEN}" https://api.github.com/repos/knowledgesystems/oncokb-data/contents/annotation/annotator-test/data | jq -r '.[] | .download_url + " " + .name' | while IFS=' ' read -r downloadurl name; do
50 |             if [[ "$name" == "$MUTATION_DATA_NAME" || "$name" == "$CLINICAL_DATA_NAME" ]]; then
51 |               curl -s "$downloadurl" -o "$name"
52 |             fi
53 |           done
54 |           cd ..
55 | 
56 |           # create compare folder to add all annotated files
57 |           mkdir compare
58 | 
59 |           OGCMAF="oncokb_genomic_change_$MUTATION_DATA_NAME"
60 | 
61 |           python MafAnnotator.py -i "data/$MUTATION_DATA_NAME" -o "compare/$OGCMAF" -c "data/$CLINICAL_DATA_NAME" -b "$ONCOKB_API_TOKEN" -q Genomic_Change
62 | 
63 |           git config user.name oncokb-bot
64 |           git config user.email dev.oncokb@gmail.com
65 | 
66 |           git add .
67 |           git commit -m 'add analysis'
68 | 
69 |           # expose the output file name to the next step (replaces the deprecated ::set-output command)
70 |           echo "FILE_NAME=$OGCMAF" >> "$GITHUB_OUTPUT"
71 | 
72 |       - name: Compare annotation result with the ones from master
73 |         id: compare
74 |         env:
75 |           FILE_NAME: ${{steps.annotate.outputs.FILE_NAME}}
76 |           ONCOKB_OAUTH_TOKEN: ${{ secrets.ONCOKB_OAUTH_TOKEN }}
77 |         run: |
78 |           # remove everything under compare folder and replace with the ones from oncokb-data
79 |           rm -f compare/*.txt
80 | 
81 |           cd compare || exit
82 |           curl -s -H "Authorization: token ${ONCOKB_OAUTH_TOKEN}" https://api.github.com/repos/knowledgesystems/oncokb-data/contents/annotation/annotator-test/annotation | jq -r '.[] | .download_url + " " + .name' | while IFS=' ' read -r downloadurl name; do
83 |             if [[ "$name" == "$FILE_NAME" ]]; then
84 |               curl -s "$downloadurl" -o "$name"
85 |             fi
86 |           done
87 |           cd ..
88 | 
89 |           # compare: any difference between the downloaded file and the commit above fails the job
90 |           CHANGED=$(git diff --name-only HEAD --)
91 | 
92 |           if [ -n "$CHANGED" ]
93 |           then
94 |             git diff
95 |             exit 1
96 |           fi


--------------------------------------------------------------------------------
/.github/workflows/pytest.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
 3 | 
 4 | name: Run all python tests
 5 | 
 6 | on:
 7 |   push:
 8 |     branches: [ master, next-minor-release ]
 9 |   pull_request:
10 |     branches: [ master, next-minor-release ]
11 | 
12 | jobs:
13 |   lint:
14 |     name: Linting using flake8
15 |     runs-on: ubuntu-latest
16 |     steps:
17 |       - uses: actions/checkout@v2
18 |       - uses: actions/setup-python@v2
19 |         with:
20 |           python-version: "3.9"
21 |       - name: Run flake8
22 |         uses: julianwachholz/flake8-action@v2
23 |         with:
24 |           checkName: "Python Lint"
25 |           path: .
26 |           config: flake8.ini
27 |         env:
28 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
29 |   pytest:
30 |     needs: lint
31 |     runs-on: ${{ matrix.os }}
32 |     strategy:
33 |       matrix:
34 |         os: [ ubuntu-latest, macos-latest ]
35 |         python-version: [ '3.8','3.9','3.10','3.11' ]
36 |     steps:
37 |     - uses: actions/checkout@v2
38 |     - name: Set up Python ${{ matrix.python-version }}
39 |       uses: actions/setup-python@v4
40 |       with:
41 |         python-version: ${{ matrix.python-version }}
42 |     - name: Install dependencies
43 |       env:
44 |         PYTHON_VERSION: ${{ matrix.python-version }}
45 |       run: |
46 |         python -m pip install --upgrade pip
47 |         pip install pytest
48 |         if [[ $PYTHON_VERSION =~ ^2\.[0-9]+$ ]]; then pip install -r requirements/common.txt -r requirements/pip2.7.txt; fi
49 |         if [[ $PYTHON_VERSION =~ ^3\.[0-9]+$ ]]; then pip install -r requirements/common.txt -r requirements/pip3.txt; fi
50 |     - name: Test with pytest
51 |       env:
52 |         ONCOKB_API_TOKEN: ${{ secrets.ONCOKB_BOT_API_TOKEN }}
53 |       run: |
54 |         pytest
55 | 
56 |   build-in-windows:
57 |     needs: lint
58 |     runs-on: windows-latest
59 |     strategy:
60 |       matrix:
61 |         python-version: [ '3.8','3.9','3.10','3.11' ]
62 |     steps:
63 |     - uses: actions/checkout@v2
64 |     - name: Set up Python ${{ matrix.python-version }}
65 |       uses: actions/setup-python@v4
66 |       with:
67 |         python-version: ${{ matrix.python-version }}
68 |     - name: Install dependencies
69 |       env:
70 |         PYTHON_VERSION: ${{ matrix.python-version }}
71 |       run: |
72 |         python -m pip install --upgrade pip
73 |         pip install pytest
74 |         if ( $env:PYTHON_VERSION -match '^2\.[0-9]+$' )
75 |         {
76 |           pip install -r requirements/common.txt -r requirements/pip2.7.txt
77 |         }
78 |         if ( $env:PYTHON_VERSION -match '^3\.[0-9]+$' )
79 |         {
80 |           pip install -r requirements/common.txt -r requirements/pip3.txt
81 |         }
82 |     - name: Test with pytest
83 |       env:
84 |         ONCOKB_API_TOKEN: ${{ secrets.ONCOKB_BOT_API_TOKEN }}
85 |       run: |
86 |         pytest
87 | 


--------------------------------------------------------------------------------
/.github/workflows/release-management.yml:
--------------------------------------------------------------------------------
 1 | name: Release Management
 2 | 
 3 | on:
 4 |   push:
 5 |     # branches to consider in the event; optional, defaults to all
 6 |     branches:
 7 |       - master
 8 | 
 9 | jobs:
10 |   update_draft_release:
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       # Drafts your next Release notes as Pull Requests are merged into "master"
14 |       - uses: release-drafter/release-drafter@v5
15 |         env:
16 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
17 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # oncokb output data
  2 | data/*.oncokb.*
  3 | data/example_README.txt
  4 | process
  5 | 
  6 | # Byte-compiled / optimized / DLL files
  7 | __pycache__/
  8 | *.py[cod]
  9 | *$py.class
 10 | 
 11 | # C extensions
 12 | *.so
 13 | 
 14 | # Distribution / packaging
 15 | .Python
 16 | env/
 17 | build/
 18 | develop-eggs/
 19 | dist/
 20 | downloads/
 21 | eggs/
 22 | .eggs/
 23 | lib/
 24 | lib64/
 25 | parts/
 26 | sdist/
 27 | var/
 28 | *.egg-info/
 29 | .installed.cfg
 30 | *.egg
 31 | 
 32 | # PyInstaller
 33 | #  Usually these files are written by a python script from a template
 34 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 35 | *.manifest
 36 | *.spec
 37 | 
 38 | # Installer logs
 39 | pip-log.txt
 40 | pip-delete-this-directory.txt
 41 | 
 42 | # Unit test / coverage reports
 43 | htmlcov/
 44 | .tox/
 45 | .coverage
 46 | .coverage.*
 47 | .cache
 48 | nosetests.xml
 49 | coverage.xml
 50 | *,cover
 51 | .hypothesis/
 52 | 
 53 | # Translations
 54 | *.mo
 55 | *.pot
 56 | 
 57 | # Django stuff:
 58 | *.log
 59 | local_settings.py
 60 | 
 61 | # Flask stuff:
 62 | instance/
 63 | .webassets-cache
 64 | 
 65 | # Scrapy stuff:
 66 | .scrapy
 67 | 
 68 | # Sphinx documentation
 69 | docs/_build/
 70 | 
 71 | # PyBuilder
 72 | target/
 73 | 
 74 | # IPython Notebook
 75 | .ipynb_checkpoints
 76 | 
 77 | # pyenv
 78 | .python-version
 79 | 
 80 | # celery beat schedule file
 81 | celerybeat-schedule
 82 | 
 83 | # dotenv
 84 | .env
 85 | 
 86 | # virtualenv
 87 | venv/
 88 | ENV/
 89 | 
 90 | # Spyder project settings
 91 | .spyderproject
 92 | 
 93 | # Rope project settings
 94 | .ropeproject
 95 | 
 96 | # PyCharm
 97 | .idea/
 98 | 
 99 | # MAC OS
100 | .DS_Store
101 | 


--------------------------------------------------------------------------------
/.version-level:
--------------------------------------------------------------------------------
1 | patch
2 | 


--------------------------------------------------------------------------------
/ClinicalDataAnnotator.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import sys
 4 | import re
 5 | import argparse
 6 | import logging
 7 | 
 8 | from AnnotatorCore import setsampleidsfileterfile
 9 | from AnnotatorCore import process_clinical_data
10 | 
11 | logging.basicConfig(level=logging.INFO)
12 | log = logging.getLogger('ClinicalDataAnnotator')
13 | 
14 | 
15 | def main(argv):
16 |     if argv.help:
17 |         log.info(
18 |             '\n'
19 |             'ClinicalDataAnnotator.py -i <input clinical file> -o <output clinical file> -a <annotated alteration files, separate by ,> [-s sample list filter]\n'
20 |             '  Essential clinical columns:\n'
21 |             '    SAMPLE_ID: sample ID'
22 |         )
23 |         sys.exit()
24 |     if argv.sample_ids_filter:
25 |         setsampleidsfileterfile(argv.sample_ids_filter)
26 | 
27 |     annotated_alteration_files = re.split(',|, ', argv.annotated_alteration_files)
28 |     if argv.input_file == '' or argv.output_file == '' or len(annotated_alteration_files) == 0:
29 |         required_params = []
30 |         if argv.input_file == '':
31 |             required_params.append('-i')
32 |         if argv.output_file == '':
33 |             required_params.append('-o')
34 |         if len(annotated_alteration_files) == 0:
35 |             required_params.append('-a')
36 |         log.error('The parameter(s) ' + ', '.join(required_params) + ' can not be empty')
37 |         log.info('for help: python ClinicalDataAnnotator.py -h')
38 |         sys.exit(2)
39 | 
40 |     log.info('annotating %s ...' % argv.input_file)
41 |     process_clinical_data(annotated_alteration_files, argv.input_file, argv.output_file)
42 | 
43 |     log.info('done!')
44 | 
45 | 
46 | if __name__ == "__main__":
47 |     parser = argparse.ArgumentParser(add_help=False)
48 |     parser.add_argument('-h', dest='help', action="store_true", default=False)
49 |     parser.add_argument('-i', dest='input_file', default='', type=str)
50 |     parser.add_argument('-o', dest='output_file', default='', type=str)
51 |     parser.add_argument('-s', dest='sample_ids_filter', default='', type=str)
52 |     parser.add_argument('-a', dest='annotated_alteration_files', default='', type=str)
53 |     parser.set_defaults(func=main)
54 | 
55 |     args = parser.parse_args()
56 |     args.func(args)
57 | 


--------------------------------------------------------------------------------
/CnaAnnotator.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import sys
 4 | import argparse
 5 | import logging
 6 | 
 7 | from AnnotatorCore import setsampleidsfileterfile
 8 | from AnnotatorCore import setoncokbbaseurl
 9 | from AnnotatorCore import setoncokbapitoken
10 | from AnnotatorCore import readCancerTypes
11 | from AnnotatorCore import validate_oncokb_token
12 | from AnnotatorCore import process_cna_data
13 | from AnnotatorCore import CNA_FILE_FORMAT_GISTIC
14 | 
15 | logging.basicConfig(level=logging.INFO)
16 | log = logging.getLogger('CnaAnnotator')
17 | 
18 | 
19 | def main(argv):
20 |     if argv.help:
21 |         log.info(
22 |             '\n'
23 |             'CnaAnnotator.py -i <input CNA file> -o <output CNA file> [-p previous results] [-c <input clinical file>] '
24 |             '[-s sample list filter] [-t <default tumor type>] [-u oncokb-base-url] [-b oncokb_api_bear_token] '
25 |             '[-z annotate_gain_loss] [-f CNA file formt, gistic or individual] [-d include descriptions]\n'
26 |             '  Input CNA file uses GISTIC output by default (https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#data-file-1). You can also list copy number alteration individually by specifying -f=individual\n'
27 |             '  Essential clinical columns:\n'
28 |             '    SAMPLE_ID: sample ID\n'
29 |             '  Cancer type will be assigned based on the following priority:\n'
30 |             '     1) ONCOTREE_CODE in clinical data file\n'
31 |             '     2) ONCOTREE_CODE exist in MAF\n'
32 |             '     3) default tumor type (-t)\n'
33 |             '  We do not annotate Gain and Loss by default, add -z to include the analysis. See https://github.com/oncokb/oncokb-annotator/issues/51 for more information.\n'
34 |             '  Default OncoKB base url is https://www.oncokb.org'
35 |         )
36 |         sys.exit()
37 |     if argv.input_file == '' or argv.output_file == '' or argv.oncokb_api_bearer_token == '':
38 |         required_params = []
39 |         if argv.input_file == '':
40 |             required_params.append('-i')
41 |         if argv.output_file == '':
42 |             required_params.append('-o')
43 |         if argv.oncokb_api_bearer_token == '':
44 |             required_params.append('-b')
45 | 
46 |         log.error('The parameter(s) ' + ', '.join(required_params) + ' can not be empty')
47 |         log.info('for help: python CnaAnnotator.py -h')
48 |         sys.exit(2)
49 |     if argv.sample_ids_filter:
50 |         setsampleidsfileterfile(argv.sample_ids_filter)
51 |     if argv.oncokb_api_url:
52 |         setoncokbbaseurl(argv.oncokb_api_url)
53 |     setoncokbapitoken(argv.oncokb_api_bearer_token)
54 | 
55 |     cancertypemap = {}
56 |     if argv.input_clinical_file:
57 |         readCancerTypes(argv.input_clinical_file, cancertypemap)
58 | 
59 |     validate_oncokb_token()
60 | 
61 |     log.info('annotating %s ...' % argv.input_file)
62 |     process_cna_data(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type, cancertypemap, argv.include_descriptions, argv.annotate_gain_loss, argv.cna_file_format.lower())
63 | 
64 |     log.info('done!')
65 | 
66 | 
67 | if __name__ == "__main__":
68 |     parser = argparse.ArgumentParser(add_help=False)
69 |     parser.add_argument('-h', dest='help', action="store_true", default=False)
70 |     parser.add_argument('-i', dest='input_file', default='', type=str)
71 |     parser.add_argument('-o', dest='output_file', default='', type=str)
72 |     parser.add_argument('-p', dest='previous_result_file', default='', type=str)
73 |     parser.add_argument('-c', dest='input_clinical_file', default='', type=str)
74 |     parser.add_argument('-s', dest='sample_ids_filter', default='', type=str)
75 |     parser.add_argument('-t', dest='default_cancer_type', default='', type=str)
76 |     parser.add_argument('-u', dest='oncokb_api_url', default='', type=str)
77 |     parser.add_argument('-b', dest='oncokb_api_bearer_token', default='', type=str)
78 |     parser.add_argument('-z', dest='annotate_gain_loss', action="store_true", default=False)
79 |     parser.add_argument('-f', dest='cna_file_format', default=CNA_FILE_FORMAT_GISTIC)
80 |     parser.add_argument('-d', dest='include_descriptions', action="store_true", default=False)
81 |     parser.set_defaults(func=main)
82 | 
83 |     args = parser.parse_args()
84 |     args.func(args)
85 | 


--------------------------------------------------------------------------------
/FusionAnnotator.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import sys
 4 | import argparse
 5 | import logging
 6 | 
 7 | from AnnotatorCore import setsampleidsfileterfile
 8 | from AnnotatorCore import setcancerhotspotsbaseurl
 9 | from AnnotatorCore import setoncokbbaseurl
10 | from AnnotatorCore import setoncokbapitoken
11 | from AnnotatorCore import readCancerTypes
12 | from AnnotatorCore import validate_oncokb_token
13 | from AnnotatorCore import process_fusion
14 | 
15 | logging.basicConfig(level=logging.INFO)
16 | log = logging.getLogger('FusionAnnotator')
17 | 
18 | 
19 | def main(argv):
20 |     if argv.help:
21 |         log.info(
22 |             '\n'
23 |             "FusionAnnotator.py -i <input Fusion file> -o <output Fusion file> [-p previous results] "
24 |             "[-c <input clinical file>] [-s sample list filter] [-t <default tumor type>] [-u <oncokb api url>] "
25 |             "[-b <oncokb api bear token>] [-r <structural variant name format, default: [A-Za-z\\d]+-[A-Za-z\\d]+>] "
26 |             "[-d include descriptions]\n"
27 |             '  Essential Fusion columns (case insensitive):\n'
28 |             '    HUGO_SYMBOL: Hugo gene symbol\n'
29 |             '    VARIANT_CLASSIFICATION: Translational effect of variant allele\n'
30 |             '    TUMOR_SAMPLE_BARCODE: sample ID\n'
31 |             '    FUSION: amino acid change, e.g. "TMPRSS2-ERG"\n'
32 |             '  Essential clinical columns:\n'
33 |             '    SAMPLE_ID: sample ID\n'
34 |             '    ONCOTREE_CODE: tumor type code from oncotree (oncotree.mskcc.org)\n'
35 |             '  Cancer type will be assigned based on the following priority:\n'
36 |             '     1) ONCOTREE_CODE in clinical data file\n'
37 |             '     2) ONCOTREE_CODE exist in Fusion\n'
38 |             '     3) default tumor type (-t)\n'
39 |             '  Default OncoKB base url is https://www.oncokb.org'
40 |         )
41 |         sys.exit()
42 |     if argv.input_file == '' or argv.output_file == '' or argv.oncokb_api_bearer_token == '':
43 |         required_params = []
44 |         if argv.input_file == '':
45 |             required_params.append('-i')
46 |         if argv.output_file == '':
47 |             required_params.append('-o')
48 |         if argv.oncokb_api_bearer_token == '':
49 |             required_params.append('-b')
50 | 
51 |         log.error('The parameter(s) ' + ', '.join(required_params) + ' can not be empty')
52 |         log.info('for help: python FusionAnnotator.py -h')
53 |         sys.exit(2)
54 |     if argv.sample_ids_filter:
55 |         setsampleidsfileterfile(argv.sample_ids_filter)
56 |     if argv.cancer_hotspots_base_url:
57 |         setcancerhotspotsbaseurl(argv.cancer_hotspots_base_url)
58 |     if argv.oncokb_api_url:
59 |         setoncokbbaseurl(argv.oncokb_api_url)
60 |     setoncokbapitoken(argv.oncokb_api_bearer_token)
61 | 
62 |     cancertypemap = {}
63 |     if argv.input_clinical_file:
64 |         readCancerTypes(argv.input_clinical_file, cancertypemap)
65 | 
66 |     validate_oncokb_token()
67 | 
68 |     log.info('annotating %s ...' % argv.input_file)
69 |     process_fusion(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type, cancertypemap, argv.structural_variant_name_format, argv.include_descriptions)
70 | 
71 |     log.info('done!')
72 | 
73 | 
74 | if __name__ == "__main__":
75 |     parser = argparse.ArgumentParser(add_help=False)
76 |     # ArgumentParser doesn't accept "store_true" and "type=" at the same time.
77 |     parser.add_argument('-h', dest='help', action="store_true", default=False)
78 |     parser.add_argument('-i', dest='input_file', default='', type=str)
79 |     parser.add_argument('-o', dest='output_file', default='', type=str)
80 |     parser.add_argument('-p', dest='previous_result_file', default='', type=str)
81 |     parser.add_argument('-c', dest='input_clinical_file', default='', type=str)
82 |     parser.add_argument('-s', dest='sample_ids_filter', default=None, type=str)
83 |     parser.add_argument('-t', dest='default_cancer_type', default='', type=str)
84 |     parser.add_argument('-u', dest='oncokb_api_url', default='', type=str)
85 |     parser.add_argument('-v', dest='cancer_hotspots_base_url', default='', type=str)
86 |     parser.add_argument('-b', dest='oncokb_api_bearer_token', default='', type=str)
87 |     parser.add_argument('-r', dest='structural_variant_name_format', default=None, type=str)
88 |     parser.add_argument('-d', dest='include_descriptions', action="store_true", default=False)
89 |     parser.set_defaults(func=main)
90 | 
91 |     args = parser.parse_args()
92 |     args.func(args)
93 | 


--------------------------------------------------------------------------------
/GenerateReadMe.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import sys
 4 | import argparse
 5 | import logging
 6 | 
 7 | from AnnotatorCore import setoncokbbaseurl
 8 | from AnnotatorCore import generateReadme
 9 | 
10 | logging.basicConfig(level=logging.INFO)  # configure root logging at INFO level
11 | log = logging.getLogger('GenerateReadMe')  # module-wide logger used by main()
12 | 
13 | 
14 | def main(argv):
15 |     """Generate the README at argv.output_file; -h logs usage and exits, an empty -o exits with status 2."""
16 |     if argv.help:
17 |         log.info('\nGenerateReadMe.py -o <output README file> [-u oncokb-base-url]\n'
18 |                  '  Default OncoKB base url is https://www.oncokb.org')
19 |         sys.exit()
20 |     if argv.output_file == '':
21 |         log.error('The parameter -o can not be empty')
22 |         log.info('for help: python GenerateReadMe.py -h')
23 |         sys.exit(2)
24 |     if argv.oncokb_api_url:  # override the OncoKB base URL only when -u was supplied
25 |         setoncokbbaseurl(argv.oncokb_api_url)
26 |     generateReadme(argv.output_file)
27 |     log.info('done!')
28 | 
29 | 
30 | if __name__ == "__main__":
31 |     parser = argparse.ArgumentParser(add_help=False)
32 |     # ArgumentParser doesn't accept "store_true" and "type=" at the same time.
33 |     parser.add_argument('-h', dest='help', action="store_true", default=False)
34 |     parser.add_argument('-o', dest='output_file', default='', type=str)
35 |     parser.add_argument('-u', dest='oncokb_api_url', default='', type=str)
36 |     parser.set_defaults(func=main)
37 | 
38 |     args = parser.parse_args()
39 |     args.func(args)
40 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                     GNU AFFERO GENERAL PUBLIC LICENSE
  2 |                        Version 3, 19 November 2007
  3 | 
  4 |  Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
  5 |  Everyone is permitted to copy and distribute verbatim copies
  6 |  of this license document, but changing it is not allowed.
  7 | 
  8 |                             Preamble
  9 | 
 10 |   The GNU Affero General Public License is a free, copyleft license for
 11 | software and other kinds of works, specifically designed to ensure
 12 | cooperation with the community in the case of network server software.
 13 | 
 14 |   The licenses for most software and other practical works are designed
 15 | to take away your freedom to share and change the works.  By contrast,
 16 | our General Public Licenses are intended to guarantee your freedom to
 17 | share and change all versions of a program--to make sure it remains free
 18 | software for all its users.
 19 | 
 20 |   When we speak of free software, we are referring to freedom, not
 21 | price.  Our General Public Licenses are designed to make sure that you
 22 | have the freedom to distribute copies of free software (and charge for
 23 | them if you wish), that you receive source code or can get it if you
 24 | want it, that you can change the software or use pieces of it in new
 25 | free programs, and that you know you can do these things.
 26 | 
 27 |   Developers that use our General Public Licenses protect your rights
 28 | with two steps: (1) assert copyright on the software, and (2) offer
 29 | you this License which gives you legal permission to copy, distribute
 30 | and/or modify the software.
 31 | 
 32 |   A secondary benefit of defending all users' freedom is that
 33 | improvements made in alternate versions of the program, if they
 34 | receive widespread use, become available for other developers to
 35 | incorporate.  Many developers of free software are heartened and
 36 | encouraged by the resulting cooperation.  However, in the case of
 37 | software used on network servers, this result may fail to come about.
 38 | The GNU General Public License permits making a modified version and
 39 | letting the public access it on a server without ever releasing its
 40 | source code to the public.
 41 | 
 42 |   The GNU Affero General Public License is designed specifically to
 43 | ensure that, in such cases, the modified source code becomes available
 44 | to the community.  It requires the operator of a network server to
 45 | provide the source code of the modified version running there to the
 46 | users of that server.  Therefore, public use of a modified version, on
 47 | a publicly accessible server, gives the public access to the source
 48 | code of the modified version.
 49 | 
 50 |   An older license, called the Affero General Public License and
 51 | published by Affero, was designed to accomplish similar goals.  This is
 52 | a different license, not a version of the Affero GPL, but Affero has
 53 | released a new version of the Affero GPL which permits relicensing under
 54 | this license.
 55 | 
 56 |   The precise terms and conditions for copying, distribution and
 57 | modification follow.
 58 | 
 59 |                        TERMS AND CONDITIONS
 60 | 
 61 |   0. Definitions.
 62 | 
 63 |   "This License" refers to version 3 of the GNU Affero General Public License.
 64 | 
 65 |   "Copyright" also means copyright-like laws that apply to other kinds of
 66 | works, such as semiconductor masks.
 67 | 
 68 |   "The Program" refers to any copyrightable work licensed under this
 69 | License.  Each licensee is addressed as "you".  "Licensees" and
 70 | "recipients" may be individuals or organizations.
 71 | 
 72 |   To "modify" a work means to copy from or adapt all or part of the work
 73 | in a fashion requiring copyright permission, other than the making of an
 74 | exact copy.  The resulting work is called a "modified version" of the
 75 | earlier work or a work "based on" the earlier work.
 76 | 
 77 |   A "covered work" means either the unmodified Program or a work based
 78 | on the Program.
 79 | 
 80 |   To "propagate" a work means to do anything with it that, without
 81 | permission, would make you directly or secondarily liable for
 82 | infringement under applicable copyright law, except executing it on a
 83 | computer or modifying a private copy.  Propagation includes copying,
 84 | distribution (with or without modification), making available to the
 85 | public, and in some countries other activities as well.
 86 | 
 87 |   To "convey" a work means any kind of propagation that enables other
 88 | parties to make or receive copies.  Mere interaction with a user through
 89 | a computer network, with no transfer of a copy, is not conveying.
 90 | 
 91 |   An interactive user interface displays "Appropriate Legal Notices"
 92 | to the extent that it includes a convenient and prominently visible
 93 | feature that (1) displays an appropriate copyright notice, and (2)
 94 | tells the user that there is no warranty for the work (except to the
 95 | extent that warranties are provided), that licensees may convey the
 96 | work under this License, and how to view a copy of this License.  If
 97 | the interface presents a list of user commands or options, such as a
 98 | menu, a prominent item in the list meets this criterion.
 99 | 
100 |   1. Source Code.
101 | 
102 |   The "source code" for a work means the preferred form of the work
103 | for making modifications to it.  "Object code" means any non-source
104 | form of a work.
105 | 
106 |   A "Standard Interface" means an interface that either is an official
107 | standard defined by a recognized standards body, or, in the case of
108 | interfaces specified for a particular programming language, one that
109 | is widely used among developers working in that language.
110 | 
111 |   The "System Libraries" of an executable work include anything, other
112 | than the work as a whole, that (a) is included in the normal form of
113 | packaging a Major Component, but which is not part of that Major
114 | Component, and (b) serves only to enable use of the work with that
115 | Major Component, or to implement a Standard Interface for which an
116 | implementation is available to the public in source code form.  A
117 | "Major Component", in this context, means a major essential component
118 | (kernel, window system, and so on) of the specific operating system
119 | (if any) on which the executable work runs, or a compiler used to
120 | produce the work, or an object code interpreter used to run it.
121 | 
122 |   The "Corresponding Source" for a work in object code form means all
123 | the source code needed to generate, install, and (for an executable
124 | work) run the object code and to modify the work, including scripts to
125 | control those activities.  However, it does not include the work's
126 | System Libraries, or general-purpose tools or generally available free
127 | programs which are used unmodified in performing those activities but
128 | which are not part of the work.  For example, Corresponding Source
129 | includes interface definition files associated with source files for
130 | the work, and the source code for shared libraries and dynamically
131 | linked subprograms that the work is specifically designed to require,
132 | such as by intimate data communication or control flow between those
133 | subprograms and other parts of the work.
134 | 
135 |   The Corresponding Source need not include anything that users
136 | can regenerate automatically from other parts of the Corresponding
137 | Source.
138 | 
139 |   The Corresponding Source for a work in source code form is that
140 | same work.
141 | 
142 |   2. Basic Permissions.
143 | 
144 |   All rights granted under this License are granted for the term of
145 | copyright on the Program, and are irrevocable provided the stated
146 | conditions are met.  This License explicitly affirms your unlimited
147 | permission to run the unmodified Program.  The output from running a
148 | covered work is covered by this License only if the output, given its
149 | content, constitutes a covered work.  This License acknowledges your
150 | rights of fair use or other equivalent, as provided by copyright law.
151 | 
152 |   You may make, run and propagate covered works that you do not
153 | convey, without conditions so long as your license otherwise remains
154 | in force.  You may convey covered works to others for the sole purpose
155 | of having them make modifications exclusively for you, or provide you
156 | with facilities for running those works, provided that you comply with
157 | the terms of this License in conveying all material for which you do
158 | not control copyright.  Those thus making or running the covered works
159 | for you must do so exclusively on your behalf, under your direction
160 | and control, on terms that prohibit them from making any copies of
161 | your copyrighted material outside their relationship with you.
162 | 
163 |   Conveying under any other circumstances is permitted solely under
164 | the conditions stated below.  Sublicensing is not allowed; section 10
165 | makes it unnecessary.
166 | 
167 |   3. Protecting Users' Legal Rights From Anti-Circumvention Law.
168 | 
169 |   No covered work shall be deemed part of an effective technological
170 | measure under any applicable law fulfilling obligations under article
171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
172 | similar laws prohibiting or restricting circumvention of such
173 | measures.
174 | 
175 |   When you convey a covered work, you waive any legal power to forbid
176 | circumvention of technological measures to the extent such circumvention
177 | is effected by exercising rights under this License with respect to
178 | the covered work, and you disclaim any intention to limit operation or
179 | modification of the work as a means of enforcing, against the work's
180 | users, your or third parties' legal rights to forbid circumvention of
181 | technological measures.
182 | 
183 |   4. Conveying Verbatim Copies.
184 | 
185 |   You may convey verbatim copies of the Program's source code as you
186 | receive it, in any medium, provided that you conspicuously and
187 | appropriately publish on each copy an appropriate copyright notice;
188 | keep intact all notices stating that this License and any
189 | non-permissive terms added in accord with section 7 apply to the code;
190 | keep intact all notices of the absence of any warranty; and give all
191 | recipients a copy of this License along with the Program.
192 | 
193 |   You may charge any price or no price for each copy that you convey,
194 | and you may offer support or warranty protection for a fee.
195 | 
196 |   5. Conveying Modified Source Versions.
197 | 
198 |   You may convey a work based on the Program, or the modifications to
199 | produce it from the Program, in the form of source code under the
200 | terms of section 4, provided that you also meet all of these conditions:
201 | 
202 |     a) The work must carry prominent notices stating that you modified
203 |     it, and giving a relevant date.
204 | 
205 |     b) The work must carry prominent notices stating that it is
206 |     released under this License and any conditions added under section
207 |     7.  This requirement modifies the requirement in section 4 to
208 |     "keep intact all notices".
209 | 
210 |     c) You must license the entire work, as a whole, under this
211 |     License to anyone who comes into possession of a copy.  This
212 |     License will therefore apply, along with any applicable section 7
213 |     additional terms, to the whole of the work, and all its parts,
214 |     regardless of how they are packaged.  This License gives no
215 |     permission to license the work in any other way, but it does not
216 |     invalidate such permission if you have separately received it.
217 | 
218 |     d) If the work has interactive user interfaces, each must display
219 |     Appropriate Legal Notices; however, if the Program has interactive
220 |     interfaces that do not display Appropriate Legal Notices, your
221 |     work need not make them do so.
222 | 
223 |   A compilation of a covered work with other separate and independent
224 | works, which are not by their nature extensions of the covered work,
225 | and which are not combined with it such as to form a larger program,
226 | in or on a volume of a storage or distribution medium, is called an
227 | "aggregate" if the compilation and its resulting copyright are not
228 | used to limit the access or legal rights of the compilation's users
229 | beyond what the individual works permit.  Inclusion of a covered work
230 | in an aggregate does not cause this License to apply to the other
231 | parts of the aggregate.
232 | 
233 |   6. Conveying Non-Source Forms.
234 | 
235 |   You may convey a covered work in object code form under the terms
236 | of sections 4 and 5, provided that you also convey the
237 | machine-readable Corresponding Source under the terms of this License,
238 | in one of these ways:
239 | 
240 |     a) Convey the object code in, or embodied in, a physical product
241 |     (including a physical distribution medium), accompanied by the
242 |     Corresponding Source fixed on a durable physical medium
243 |     customarily used for software interchange.
244 | 
245 |     b) Convey the object code in, or embodied in, a physical product
246 |     (including a physical distribution medium), accompanied by a
247 |     written offer, valid for at least three years and valid for as
248 |     long as you offer spare parts or customer support for that product
249 |     model, to give anyone who possesses the object code either (1) a
250 |     copy of the Corresponding Source for all the software in the
251 |     product that is covered by this License, on a durable physical
252 |     medium customarily used for software interchange, for a price no
253 |     more than your reasonable cost of physically performing this
254 |     conveying of source, or (2) access to copy the
255 |     Corresponding Source from a network server at no charge.
256 | 
257 |     c) Convey individual copies of the object code with a copy of the
258 |     written offer to provide the Corresponding Source.  This
259 |     alternative is allowed only occasionally and noncommercially, and
260 |     only if you received the object code with such an offer, in accord
261 |     with subsection 6b.
262 | 
263 |     d) Convey the object code by offering access from a designated
264 |     place (gratis or for a charge), and offer equivalent access to the
265 |     Corresponding Source in the same way through the same place at no
266 |     further charge.  You need not require recipients to copy the
267 |     Corresponding Source along with the object code.  If the place to
268 |     copy the object code is a network server, the Corresponding Source
269 |     may be on a different server (operated by you or a third party)
270 |     that supports equivalent copying facilities, provided you maintain
271 |     clear directions next to the object code saying where to find the
272 |     Corresponding Source.  Regardless of what server hosts the
273 |     Corresponding Source, you remain obligated to ensure that it is
274 |     available for as long as needed to satisfy these requirements.
275 | 
276 |     e) Convey the object code using peer-to-peer transmission, provided
277 |     you inform other peers where the object code and Corresponding
278 |     Source of the work are being offered to the general public at no
279 |     charge under subsection 6d.
280 | 
281 |   A separable portion of the object code, whose source code is excluded
282 | from the Corresponding Source as a System Library, need not be
283 | included in conveying the object code work.
284 | 
285 |   A "User Product" is either (1) a "consumer product", which means any
286 | tangible personal property which is normally used for personal, family,
287 | or household purposes, or (2) anything designed or sold for incorporation
288 | into a dwelling.  In determining whether a product is a consumer product,
289 | doubtful cases shall be resolved in favor of coverage.  For a particular
290 | product received by a particular user, "normally used" refers to a
291 | typical or common use of that class of product, regardless of the status
292 | of the particular user or of the way in which the particular user
293 | actually uses, or expects or is expected to use, the product.  A product
294 | is a consumer product regardless of whether the product has substantial
295 | commercial, industrial or non-consumer uses, unless such uses represent
296 | the only significant mode of use of the product.
297 | 
298 |   "Installation Information" for a User Product means any methods,
299 | procedures, authorization keys, or other information required to install
300 | and execute modified versions of a covered work in that User Product from
301 | a modified version of its Corresponding Source.  The information must
302 | suffice to ensure that the continued functioning of the modified object
303 | code is in no case prevented or interfered with solely because
304 | modification has been made.
305 | 
306 |   If you convey an object code work under this section in, or with, or
307 | specifically for use in, a User Product, and the conveying occurs as
308 | part of a transaction in which the right of possession and use of the
309 | User Product is transferred to the recipient in perpetuity or for a
310 | fixed term (regardless of how the transaction is characterized), the
311 | Corresponding Source conveyed under this section must be accompanied
312 | by the Installation Information.  But this requirement does not apply
313 | if neither you nor any third party retains the ability to install
314 | modified object code on the User Product (for example, the work has
315 | been installed in ROM).
316 | 
317 |   The requirement to provide Installation Information does not include a
318 | requirement to continue to provide support service, warranty, or updates
319 | for a work that has been modified or installed by the recipient, or for
320 | the User Product in which it has been modified or installed.  Access to a
321 | network may be denied when the modification itself materially and
322 | adversely affects the operation of the network or violates the rules and
323 | protocols for communication across the network.
324 | 
325 |   Corresponding Source conveyed, and Installation Information provided,
326 | in accord with this section must be in a format that is publicly
327 | documented (and with an implementation available to the public in
328 | source code form), and must require no special password or key for
329 | unpacking, reading or copying.
330 | 
331 |   7. Additional Terms.
332 | 
333 |   "Additional permissions" are terms that supplement the terms of this
334 | License by making exceptions from one or more of its conditions.
335 | Additional permissions that are applicable to the entire Program shall
336 | be treated as though they were included in this License, to the extent
337 | that they are valid under applicable law.  If additional permissions
338 | apply only to part of the Program, that part may be used separately
339 | under those permissions, but the entire Program remains governed by
340 | this License without regard to the additional permissions.
341 | 
342 |   When you convey a copy of a covered work, you may at your option
343 | remove any additional permissions from that copy, or from any part of
344 | it.  (Additional permissions may be written to require their own
345 | removal in certain cases when you modify the work.)  You may place
346 | additional permissions on material, added by you to a covered work,
347 | for which you have or can give appropriate copyright permission.
348 | 
349 |   Notwithstanding any other provision of this License, for material you
350 | add to a covered work, you may (if authorized by the copyright holders of
351 | that material) supplement the terms of this License with terms:
352 | 
353 |     a) Disclaiming warranty or limiting liability differently from the
354 |     terms of sections 15 and 16 of this License; or
355 | 
356 |     b) Requiring preservation of specified reasonable legal notices or
357 |     author attributions in that material or in the Appropriate Legal
358 |     Notices displayed by works containing it; or
359 | 
360 |     c) Prohibiting misrepresentation of the origin of that material, or
361 |     requiring that modified versions of such material be marked in
362 |     reasonable ways as different from the original version; or
363 | 
364 |     d) Limiting the use for publicity purposes of names of licensors or
365 |     authors of the material; or
366 | 
367 |     e) Declining to grant rights under trademark law for use of some
368 |     trade names, trademarks, or service marks; or
369 | 
370 |     f) Requiring indemnification of licensors and authors of that
371 |     material by anyone who conveys the material (or modified versions of
372 |     it) with contractual assumptions of liability to the recipient, for
373 |     any liability that these contractual assumptions directly impose on
374 |     those licensors and authors.
375 | 
376 |   All other non-permissive additional terms are considered "further
377 | restrictions" within the meaning of section 10.  If the Program as you
378 | received it, or any part of it, contains a notice stating that it is
379 | governed by this License along with a term that is a further
380 | restriction, you may remove that term.  If a license document contains
381 | a further restriction but permits relicensing or conveying under this
382 | License, you may add to a covered work material governed by the terms
383 | of that license document, provided that the further restriction does
384 | not survive such relicensing or conveying.
385 | 
386 |   If you add terms to a covered work in accord with this section, you
387 | must place, in the relevant source files, a statement of the
388 | additional terms that apply to those files, or a notice indicating
389 | where to find the applicable terms.
390 | 
391 |   Additional terms, permissive or non-permissive, may be stated in the
392 | form of a separately written license, or stated as exceptions;
393 | the above requirements apply either way.
394 | 
395 |   8. Termination.
396 | 
397 |   You may not propagate or modify a covered work except as expressly
398 | provided under this License.  Any attempt otherwise to propagate or
399 | modify it is void, and will automatically terminate your rights under
400 | this License (including any patent licenses granted under the third
401 | paragraph of section 11).
402 | 
403 |   However, if you cease all violation of this License, then your
404 | license from a particular copyright holder is reinstated (a)
405 | provisionally, unless and until the copyright holder explicitly and
406 | finally terminates your license, and (b) permanently, if the copyright
407 | holder fails to notify you of the violation by some reasonable means
408 | prior to 60 days after the cessation.
409 | 
410 |   Moreover, your license from a particular copyright holder is
411 | reinstated permanently if the copyright holder notifies you of the
412 | violation by some reasonable means, this is the first time you have
413 | received notice of violation of this License (for any work) from that
414 | copyright holder, and you cure the violation prior to 30 days after
415 | your receipt of the notice.
416 | 
417 |   Termination of your rights under this section does not terminate the
418 | licenses of parties who have received copies or rights from you under
419 | this License.  If your rights have been terminated and not permanently
420 | reinstated, you do not qualify to receive new licenses for the same
421 | material under section 10.
422 | 
423 |   9. Acceptance Not Required for Having Copies.
424 | 
425 |   You are not required to accept this License in order to receive or
426 | run a copy of the Program.  Ancillary propagation of a covered work
427 | occurring solely as a consequence of using peer-to-peer transmission
428 | to receive a copy likewise does not require acceptance.  However,
429 | nothing other than this License grants you permission to propagate or
430 | modify any covered work.  These actions infringe copyright if you do
431 | not accept this License.  Therefore, by modifying or propagating a
432 | covered work, you indicate your acceptance of this License to do so.
433 | 
434 |   10. Automatic Licensing of Downstream Recipients.
435 | 
436 |   Each time you convey a covered work, the recipient automatically
437 | receives a license from the original licensors, to run, modify and
438 | propagate that work, subject to this License.  You are not responsible
439 | for enforcing compliance by third parties with this License.
440 | 
441 |   An "entity transaction" is a transaction transferring control of an
442 | organization, or substantially all assets of one, or subdividing an
443 | organization, or merging organizations.  If propagation of a covered
444 | work results from an entity transaction, each party to that
445 | transaction who receives a copy of the work also receives whatever
446 | licenses to the work the party's predecessor in interest had or could
447 | give under the previous paragraph, plus a right to possession of the
448 | Corresponding Source of the work from the predecessor in interest, if
449 | the predecessor has it or can get it with reasonable efforts.
450 | 
451 |   You may not impose any further restrictions on the exercise of the
452 | rights granted or affirmed under this License.  For example, you may
453 | not impose a license fee, royalty, or other charge for exercise of
454 | rights granted under this License, and you may not initiate litigation
455 | (including a cross-claim or counterclaim in a lawsuit) alleging that
456 | any patent claim is infringed by making, using, selling, offering for
457 | sale, or importing the Program or any portion of it.
458 | 
459 |   11. Patents.
460 | 
461 |   A "contributor" is a copyright holder who authorizes use under this
462 | License of the Program or a work on which the Program is based.  The
463 | work thus licensed is called the contributor's "contributor version".
464 | 
465 |   A contributor's "essential patent claims" are all patent claims
466 | owned or controlled by the contributor, whether already acquired or
467 | hereafter acquired, that would be infringed by some manner, permitted
468 | by this License, of making, using, or selling its contributor version,
469 | but do not include claims that would be infringed only as a
470 | consequence of further modification of the contributor version.  For
471 | purposes of this definition, "control" includes the right to grant
472 | patent sublicenses in a manner consistent with the requirements of
473 | this License.
474 | 
475 |   Each contributor grants you a non-exclusive, worldwide, royalty-free
476 | patent license under the contributor's essential patent claims, to
477 | make, use, sell, offer for sale, import and otherwise run, modify and
478 | propagate the contents of its contributor version.
479 | 
480 |   In the following three paragraphs, a "patent license" is any express
481 | agreement or commitment, however denominated, not to enforce a patent
482 | (such as an express permission to practice a patent or covenant not to
483 | sue for patent infringement).  To "grant" such a patent license to a
484 | party means to make such an agreement or commitment not to enforce a
485 | patent against the party.
486 | 
487 |   If you convey a covered work, knowingly relying on a patent license,
488 | and the Corresponding Source of the work is not available for anyone
489 | to copy, free of charge and under the terms of this License, through a
490 | publicly available network server or other readily accessible means,
491 | then you must either (1) cause the Corresponding Source to be so
492 | available, or (2) arrange to deprive yourself of the benefit of the
493 | patent license for this particular work, or (3) arrange, in a manner
494 | consistent with the requirements of this License, to extend the patent
495 | license to downstream recipients.  "Knowingly relying" means you have
496 | actual knowledge that, but for the patent license, your conveying the
497 | covered work in a country, or your recipient's use of the covered work
498 | in a country, would infringe one or more identifiable patents in that
499 | country that you have reason to believe are valid.
500 | 
501 |   If, pursuant to or in connection with a single transaction or
502 | arrangement, you convey, or propagate by procuring conveyance of, a
503 | covered work, and grant a patent license to some of the parties
504 | receiving the covered work authorizing them to use, propagate, modify
505 | or convey a specific copy of the covered work, then the patent license
506 | you grant is automatically extended to all recipients of the covered
507 | work and works based on it.
508 | 
509 |   A patent license is "discriminatory" if it does not include within
510 | the scope of its coverage, prohibits the exercise of, or is
511 | conditioned on the non-exercise of one or more of the rights that are
512 | specifically granted under this License.  You may not convey a covered
513 | work if you are a party to an arrangement with a third party that is
514 | in the business of distributing software, under which you make payment
515 | to the third party based on the extent of your activity of conveying
516 | the work, and under which the third party grants, to any of the
517 | parties who would receive the covered work from you, a discriminatory
518 | patent license (a) in connection with copies of the covered work
519 | conveyed by you (or copies made from those copies), or (b) primarily
520 | for and in connection with specific products or compilations that
521 | contain the covered work, unless you entered into that arrangement,
522 | or that patent license was granted, prior to 28 March 2007.
523 | 
524 |   Nothing in this License shall be construed as excluding or limiting
525 | any implied license or other defenses to infringement that may
526 | otherwise be available to you under applicable patent law.
527 | 
528 |   12. No Surrender of Others' Freedom.
529 | 
530 |   If conditions are imposed on you (whether by court order, agreement or
531 | otherwise) that contradict the conditions of this License, they do not
532 | excuse you from the conditions of this License.  If you cannot convey a
533 | covered work so as to satisfy simultaneously your obligations under this
534 | License and any other pertinent obligations, then as a consequence you may
535 | not convey it at all.  For example, if you agree to terms that obligate you
536 | to collect a royalty for further conveying from those to whom you convey
537 | the Program, the only way you could satisfy both those terms and this
538 | License would be to refrain entirely from conveying the Program.
539 | 
540 |   13. Remote Network Interaction; Use with the GNU General Public License.
541 | 
542 |   Notwithstanding any other provision of this License, if you modify the
543 | Program, your modified version must prominently offer all users
544 | interacting with it remotely through a computer network (if your version
545 | supports such interaction) an opportunity to receive the Corresponding
546 | Source of your version by providing access to the Corresponding Source
547 | from a network server at no charge, through some standard or customary
548 | means of facilitating copying of software.  This Corresponding Source
549 | shall include the Corresponding Source for any work covered by version 3
550 | of the GNU General Public License that is incorporated pursuant to the
551 | following paragraph.
552 | 
553 |   Notwithstanding any other provision of this License, you have
554 | permission to link or combine any covered work with a work licensed
555 | under version 3 of the GNU General Public License into a single
556 | combined work, and to convey the resulting work.  The terms of this
557 | License will continue to apply to the part which is the covered work,
558 | but the work with which it is combined will remain governed by version
559 | 3 of the GNU General Public License.
560 | 
561 |   14. Revised Versions of this License.
562 | 
563 |   The Free Software Foundation may publish revised and/or new versions of
564 | the GNU Affero General Public License from time to time.  Such new versions
565 | will be similar in spirit to the present version, but may differ in detail to
566 | address new problems or concerns.
567 | 
568 |   Each version is given a distinguishing version number.  If the
569 | Program specifies that a certain numbered version of the GNU Affero General
570 | Public License "or any later version" applies to it, you have the
571 | option of following the terms and conditions either of that numbered
572 | version or of any later version published by the Free Software
573 | Foundation.  If the Program does not specify a version number of the
574 | GNU Affero General Public License, you may choose any version ever published
575 | by the Free Software Foundation.
576 | 
577 |   If the Program specifies that a proxy can decide which future
578 | versions of the GNU Affero General Public License can be used, that proxy's
579 | public statement of acceptance of a version permanently authorizes you
580 | to choose that version for the Program.
581 | 
582 |   Later license versions may give you additional or different
583 | permissions.  However, no additional obligations are imposed on any
584 | author or copyright holder as a result of your choosing to follow a
585 | later version.
586 | 
587 |   15. Disclaimer of Warranty.
588 | 
589 |   THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
590 | APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
594 | PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
595 | IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
597 | 
598 |   16. Limitation of Liability.
599 | 
600 |   IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
608 | SUCH DAMAGES.
609 | 
610 |   17. Interpretation of Sections 15 and 16.
611 | 
612 |   If the disclaimer of warranty and limitation of liability provided
613 | above cannot be given local legal effect according to their terms,
614 | reviewing courts shall apply local law that most closely approximates
615 | an absolute waiver of all civil liability in connection with the
616 | Program, unless a warranty or assumption of liability accompanies a
617 | copy of the Program in return for a fee.
618 | 
619 |                      END OF TERMS AND CONDITIONS
620 | 
621 |             How to Apply These Terms to Your New Programs
622 | 
623 |   If you develop a new program, and you want it to be of the greatest
624 | possible use to the public, the best way to achieve this is to make it
625 | free software which everyone can redistribute and change under these terms.
626 | 
627 |   To do so, attach the following notices to the program.  It is safest
628 | to attach them to the start of each source file to most effectively
629 | state the exclusion of warranty; and each file should have at least
630 | the "copyright" line and a pointer to where the full notice is found.
631 | 
632 |     <one line to give the program's name and a brief idea of what it does.>
633 |     Copyright (C) <year>  <name of author>
634 | 
635 |     This program is free software: you can redistribute it and/or modify
636 |     it under the terms of the GNU Affero General Public License as published
637 |     by the Free Software Foundation, either version 3 of the License, or
638 |     (at your option) any later version.
639 | 
640 |     This program is distributed in the hope that it will be useful,
641 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
642 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
643 |     GNU Affero General Public License for more details.
644 | 
645 |     You should have received a copy of the GNU Affero General Public License
646 |     along with this program.  If not, see <http://www.gnu.org/licenses/>.
647 | 
648 | Also add information on how to contact you by electronic and paper mail.
649 | 
650 |   If your software can interact with users remotely through a computer
651 | network, you should also make sure that it provides a way for users to
652 | get its source.  For example, if your program is a web application, its
653 | interface could display a "Source" link that leads users to an archive
654 | of the code.  There are many ways you could offer source, and different
655 | solutions will be better for different programs; see section 13 for the
656 | specific requirements.
657 | 
658 |   You should also get your employer (if you work as a programmer) or school,
659 | if any, to sign a "copyright disclaimer" for the program, if necessary.
660 | For more information on this, and how to apply and follow the GNU AGPL, see
661 | <http://www.gnu.org/licenses/>.
662 | 


--------------------------------------------------------------------------------
/MafAnnotator.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import sys
  4 | import argparse
  5 | import logging
  6 | 
  7 | from AnnotatorCore import setsampleidsfileterfile
  8 | from AnnotatorCore import setcancerhotspotsbaseurl
  9 | from AnnotatorCore import setoncokbbaseurl
 10 | from AnnotatorCore import setoncokbapitoken
 11 | from AnnotatorCore import readCancerTypes
 12 | from AnnotatorCore import validate_oncokb_token
 13 | from AnnotatorCore import processalterationevents
 14 | from AnnotatorCore import QueryType
 15 | from AnnotatorCore import ReferenceGenome
 16 | 
 17 | logging.basicConfig(level=logging.INFO)
 18 | log = logging.getLogger('MafAnnotator')
 19 | 
 20 | 
 21 | def main(argv):
 22 |     if argv.help:
 23 |         log.info(
 24 |             '\n'
 25 |             'MafAnnotator.py -i <input MAF file> -o <output MAF file> [-p previous results] [-c <input clinical file>] '
 26 |             '[-s sample list filter] [-t <default tumor type>] [-u oncokb-base-url] [-b oncokb api bear token] [-a] '
 27 |             '[-q query type] [-r default reference genome] [-d include descriptions]\n'
 28 |             'For definitions of the MAF format, please see https://docs.gdc.cancer.gov/Data/File_Formats/MAF_Format/\n\n'
 29 |             'Essential MAF columns for querying HGVSp_Short and HGVSp(case insensitive):\n'
 30 |             '    Hugo_Symbol: Hugo gene symbol\n'
 31 |             '    Tumor_Sample_Barcode: sample ID\n'
 32 |             '    HGVSp(query type: HGVSp): protein change in HGVSp format\n'
 33 |             '    HGVSp_Short(query type: HGVSp_Short): protein change in HGVSp format using 1-letter amino-acid codes\n'
 34 |             'Essential MAF columns for querying HGVSg(case insensitive):\n'
 35 |             '    Tumor_Sample_Barcode: sample ID\n'
 36 |             '    HGVSg: Genomic change in HGVSg format\n'
 37 |             'Essential MAF columns for querying genomic change(case insensitive):\n'
 38 |             '    Tumor_Sample_Barcode: sample ID\n'
 39 |             '    Chromosome: Chromosome number\n'
 40 |             '    Start_Position: Mutation start coordinate\n'
 41 |             '    End_Position: Mutation end coordinate\n'
 42 |             '    Reference_Allele: The plus strand reference allele at this position\n'
 43 |             '    Tumor_Seq_Allele1: Primary data genotype for tumor sequencing (discovery) allele\n'
 44 |             '    Tumor_Seq_Allele2: Tumor sequencing (discovery) allele 2\n'
 45 |             'Essential clinical columns:\n'
 46 |             '    SAMPLE_ID: sample ID\n'
 47 |             '    ONCOTREE_CODE: tumor type code from oncotree (http://oncotree.mskcc.org)\n'
 48 |             'Cancer type will be assigned based on the following priority:\n'
 49 |             '    1) ONCOTREE_CODE in clinical data file\n'
 50 |             '    2) ONCOTREE_CODE exist in MAF\n'
 51 |             '    3) default tumor type (-t)\n'
 52 |             'Query type only allows the following values (case-insensitive):\n'
 53 |             '    - HGVSp_Short\n'
 54 |             '      It reads from column HGVSp_Short or Alteration\n'
 55 |             '    - HGVSp\n'
 56 |             '      It reads from column HGVSp or Alteration\n'
 57 |             '    - HGVSg\n'
 58 |             '      It reads from column HGVSg or Alteration\n'
 59 |             '    - Genomic_Change\n'
 60 |             '      It reads from columns Chromosome, Start_Position, End_Position, Reference_Allele, Tumor_Seq_Allele1 and Tumor_Seq_Allele2  \n'
 61 |             'Reference Genome only allows the following values(case-insensitive):\n'
 62 |             '    - GRCh37\n'
 63 |             '      GRCh38\n'
 64 |             'Default OncoKB base url is https://www.oncokb.org.\n'
 65 |         )
 66 |         sys.exit()
 67 |     if argv.input_file == '' or argv.output_file == '' or argv.oncokb_api_bearer_token == '':
 68 |         required_params = []
 69 |         if argv.input_file == '':
 70 |             required_params.append('-i')
 71 |         if argv.output_file == '':
 72 |             required_params.append('-o')
 73 |         if argv.oncokb_api_bearer_token == '':
 74 |             required_params.append('-b')
 75 | 
 76 |         log.error('The parameter(s) ' + ', '.join(required_params) + ' can not be empty')
 77 |         log.info('For help: python MafAnnotator.py -h')
 78 |         sys.exit(2)
 79 | 
 80 |     if argv.sample_ids_filter:
 81 |         setsampleidsfileterfile(argv.sample_ids_filter)
 82 |     if argv.cancer_hotspots_base_url:
 83 |         setcancerhotspotsbaseurl(argv.cancer_hotspots_base_url)
 84 |     if argv.oncokb_api_url:
 85 |         setoncokbbaseurl(argv.oncokb_api_url)
 86 |     setoncokbapitoken(argv.oncokb_api_bearer_token)
 87 | 
 88 |     cancertypemap = {}
 89 |     if argv.input_clinical_file:
 90 |         readCancerTypes(argv.input_clinical_file, cancertypemap)
 91 | 
 92 |     log.info('annotating %s ...' % argv.input_file)
 93 | 
 94 |     user_input_query_type = None
 95 |     if argv.query_type is not None:
 96 |         try:
 97 |             user_input_query_type = QueryType[argv.query_type.upper()]
 98 |         except KeyError:
 99 |             log.error(
100 |                 'Query type is not acceptable. Only the following allows(case insensitive): HGVSp_Short, HGVSp, HGVSg, Genomic_Change')
101 |             raise
102 | 
103 |     default_reference_genome = None
104 |     if argv.default_reference_genome is not None:
105 |         try:
106 |             default_reference_genome = ReferenceGenome[argv.default_reference_genome.upper()]
107 |         except KeyError:
108 |             log.error(
109 |                 'Reference genome is not acceptable. Only the following allows(case insensitive): GRCh37, GRCh38')
110 |             raise
111 | 
112 |     validate_oncokb_token()
113 | 
114 |     processalterationevents(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type,
115 |                             cancertypemap, argv.annotate_hotspots, user_input_query_type, default_reference_genome,
116 |                             argv.include_descriptions)
117 | 
118 |     log.info('done!')
119 | 
120 | 
if __name__ == "__main__":
    # argparse's built-in help is disabled; -h is declared as an ordinary
    # switch so that main() can print its own usage text.
    parser = argparse.ArgumentParser(add_help=False)

    # (flag, destination attribute, default), in declaration order.
    # A default of False marks a boolean switch; every other option takes a string.
    cli_options = (
        ('-h', 'help', False),
        ('-i', 'input_file', ''),
        ('-o', 'output_file', ''),
        ('-p', 'previous_result_file', ''),
        ('-c', 'input_clinical_file', ''),
        ('-s', 'sample_ids_filter', ''),
        ('-t', 'default_cancer_type', ''),
        ('-u', 'oncokb_api_url', ''),
        ('-a', 'annotate_hotspots', False),
        ('-v', 'cancer_hotspots_base_url', ''),
        ('-b', 'oncokb_api_bearer_token', ''),
        ('-q', 'query_type', None),
        ('-r', 'default_reference_genome', None),
        ('-d', 'include_descriptions', False),
    )
    for flag, dest, default in cli_options:
        if default is False:
            parser.add_argument(flag, dest=dest, action="store_true", default=False)
        else:
            parser.add_argument(flag, dest=dest, default=default, type=str)

    # main() is attached as the handler and receives the parsed namespace.
    parser.set_defaults(func=main)

    args = parser.parse_args()
    args.func(args)
141 | 


--------------------------------------------------------------------------------
/OncoKBPlots.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import sys
  4 | import re
  5 | import argparse
  6 | import logging
  7 | import os
  8 | import csv
  9 | import matplotlib.pyplot as plt
 10 | 
 11 | from AnnotatorCore import setsampleidsfileterfile
 12 | from AnnotatorCore import readheaders
 13 | from AnnotatorCore import geIndexOfHeader
 14 | from AnnotatorCore import sampleidsfilter
 15 | from AnnotatorCore import levels
 16 | from AnnotatorCore import dxLevels
 17 | from AnnotatorCore import pxLevels
 18 | from AnnotatorCore import SAMPLE_HEADERS
 19 | 
 20 | logging.basicConfig(level=logging.INFO)
 21 | log = logging.getLogger('OncoKBPlots')
 22 | 
 23 | 
 24 | def plotclinicalactionability(ax, annotatedclinicalfile, outfile, parameters):
 25 |     if os.path.isfile(outfile):
 26 |         os.remove(outfile)
 27 | 
 28 |     extlevels = levels + ["ONCOGENIC", "VUS"]
 29 |     if "levels" in parameters:
 30 |         extlevels = parameters["levels"]
 31 | 
 32 |     with open(annotatedclinicalfile, 'rU') as clinfile:
 33 |         reader = csv.reader(clinfile, delimiter='\t')
 34 |         headers = readheaders(reader)
 35 |         isample = geIndexOfHeader(headers, SAMPLE_HEADERS)
 36 |         ilevel = headers['HIGHEST_LEVEL']
 37 |         ioncogenic = headers['ONCOGENIC_MUTATIONS']
 38 |         icat = headers[parameters["catogerycolumn"].upper()]  # e.g. "CANCER_TYPE"
 39 | 
 40 |         catsamplecount = {}
 41 |         catactionablesamplecount = {}
 42 |         oncogenicsamplecount = {}
 43 |         levelcatsamplecount = {}
 44 | 
 45 |         for row in reader:
 46 |             sample = row[isample]
 47 |             if sampleidsfilter and sample not in sampleidsfilter:
 48 |                 continue
 49 | 
 50 |             cat = row[icat]
 51 |             if cat not in catsamplecount:
 52 |                 catsamplecount[cat] = 0
 53 |             catsamplecount[cat] += 1
 54 | 
 55 |             if cat not in catactionablesamplecount:
 56 |                 catactionablesamplecount[cat] = 0
 57 |                 oncogenicsamplecount[cat] = 0
 58 | 
 59 |             level = row[ilevel]
 60 |             oncogenic = row[ioncogenic]
 61 | 
 62 |             exlevel = level
 63 | 
 64 |             if level in extlevels:
 65 |                 catactionablesamplecount[cat] += 1
 66 |                 oncogenicsamplecount[cat] += 1
 67 |             elif len(oncogenic.strip()) > 0:
 68 |                 oncogenicsamplecount[cat] += 1
 69 |                 exlevel = "ONCOGENIC"
 70 |             else:
 71 |                 exlevel = "VUS"
 72 | 
 73 |             if exlevel not in levelcatsamplecount:
 74 |                 levelcatsamplecount[exlevel] = {}
 75 |             if cat not in levelcatsamplecount[exlevel]:
 76 |                 levelcatsamplecount[exlevel][cat] = 0
 77 |             levelcatsamplecount[exlevel][cat] += 1
 78 | 
 79 |     # plot
 80 |     catarray = []  # cancer types
 81 |     catactionabilityarray = []  # actionabiligy percentages per cancer type
 82 |     catoncogenicarray = []  # actionabiligy percentages per cancer type
 83 |     for cat in catsamplecount:
 84 |         if catsamplecount[cat] >= parameters["thresholdcat"]:
 85 |             catarray.append(cat)
 86 |             catactionabilityarray.append(catactionablesamplecount[cat] * 100.0 / catsamplecount[cat])
 87 |             catoncogenicarray.append(oncogenicsamplecount[cat] * 100.0 / catsamplecount[cat])
 88 | 
 89 |     ncat = len(catarray)
 90 |     order = reversed(sorted(range(ncat), key=lambda x: (catactionabilityarray[x], catoncogenicarray[x])))
 91 |     drawplot(ax, 'OncoKB Actionability', extlevels, levelcatsamplecount, catarray, catsamplecount, order,
 92 |              parameters["thresholdcat"])
 93 | 
 94 | 
 95 | def plotimplications(ax, header, title, levels, annotatedclinicalfile, outfile, parameters):
 96 |     if os.path.isfile(outfile):
 97 |         os.remove(outfile)
 98 | 
 99 |     extlevels = levels
100 |     if "levels" in parameters:
101 |         extlevels = parameters["levels"]
102 | 
103 |     with open(annotatedclinicalfile, 'rU') as clinfile:
104 |         reader = csv.reader(clinfile, delimiter='\t')
105 |         headers = readheaders(reader)
106 |         isample = headers['SAMPLE_ID']
107 |         ilevel = headers[header]
108 |         icat = headers[parameters["catogerycolumn"].upper()]
109 | 
110 |         catsamplecount = {}
111 |         catactionablesamplecount = {}
112 |         levelcatsamplecount = {}
113 | 
114 |         for row in reader:
115 |             sample = row[isample]
116 |             if sampleidsfilter and sample not in sampleidsfilter:
117 |                 continue
118 | 
119 |             cat = row[icat]
120 |             if cat not in catsamplecount:
121 |                 catsamplecount[cat] = 0
122 |             catsamplecount[cat] += 1
123 | 
124 |             if cat not in catactionablesamplecount:
125 |                 catactionablesamplecount[cat] = 0
126 | 
127 |             level = row[ilevel]
128 | 
129 |             exlevel = level
130 | 
131 |             if level in extlevels:
132 |                 catactionablesamplecount[cat] += 1
133 |             else:
134 |                 exlevel = "Other"
135 | 
136 |             if exlevel not in levelcatsamplecount:
137 |                 levelcatsamplecount[exlevel] = {}
138 |             if cat not in levelcatsamplecount[exlevel]:
139 |                 levelcatsamplecount[exlevel][cat] = 0
140 |             levelcatsamplecount[exlevel][cat] += 1
141 | 
142 |     # plot
143 |     catarray = []  # cancer types
144 |     catactionabilityarray = []  # actionabiligy percentages per cancer type
145 |     for cat in catsamplecount:
146 |         if catsamplecount[cat] >= parameters["thresholdcat"]:
147 |             catarray.append(cat)
148 |             catactionabilityarray.append(catactionablesamplecount[cat] * 100.0 / catsamplecount[cat])
149 | 
150 |     ncat = len(catarray)
151 |     order = reversed(sorted(range(ncat), key=lambda x: (catactionabilityarray[x])))
152 |     drawplot(ax, title, extlevels, levelcatsamplecount, catarray, catsamplecount, order, parameters["thresholdcat"])
153 | 
154 | 
def drawplot(ax, title, extlevels, levelcatsamplecount, catarray, catsamplecount, order, thresholdcat):
    """Draw one stacked bar chart of per-category level percentages on ``ax``.

    Args:
        ax: axes to draw on. All bars, labels, ticks and the legend go to this
            object, so each subplot keeps its own decorations.
        title: title set on ``ax``.
        extlevels: levels to stack, in bottom-to-top order; levels absent from
            ``levelcatsamplecount`` are skipped.
        levelcatsamplecount: ``{level: {category: sample count}}``.
        catarray: categories to plot.
        catsamplecount: ``{category: total sample count}``.
        order: iterable of indexes into ``catarray`` giving the left-to-right
            bar order.
        thresholdcat: categories with fewer samples than this keep a zero-height
            segment.

    Nothing is drawn when ``catarray`` is empty.
    """
    # level colors
    levelcolors = {
        'LEVEL_1': '#33A02C',
        'LEVEL_2': '#1F78B4',
        'LEVEL_3A': '#984EA3',
        'LEVEL_3B': '#BE98CE',
        'LEVEL_4': '#a8a8a8',
        'LEVEL_R1': '#EE3424',
        'LEVEL_R2': '#F79A92',

        'LEVEL_Dx1': '#33A02C',
        'LEVEL_Dx2': '#1F78B4',
        'LEVEL_Dx3': '#984EA3',

        'LEVEL_Px1': '#33A02C',
        'LEVEL_Px2': '#1F78B4',
        'LEVEL_Px3': '#984EA3',

        'ONCOGENIC': '#ffdab9',
        'VUS': '#d1d1d1',
        'Other': 'grey'
    }

    # level legend
    levellegend = {
        'LEVEL_1': 'Level 1',
        'LEVEL_2': 'Level 2',
        'LEVEL_3A': 'Level 3A',
        'LEVEL_3B': 'Level 3B',
        'LEVEL_4': 'Level 4',
        'LEVEL_R1': 'Level R1',
        'LEVEL_R2': 'Level R2',

        'LEVEL_Dx1': 'Level Dx1',
        'LEVEL_Dx2': 'Level Dx2',
        'LEVEL_Dx3': 'Level Dx3',

        'LEVEL_Px1': 'Level Px1',
        'LEVEL_Px2': 'Level Px2',
        'LEVEL_Px3': 'Level Px3',

        'ONCOGENIC': 'Oncogenic, no level',
        'VUS': 'VUS',
        'Other': 'Other'
    }

    ncat = len(catarray)
    if ncat == 0:
        return

    catarray = [catarray[i] for i in order]
    ind = range(ncat)
    width = 0.75

    legends = []
    plts = []
    # Running top of each bar; the next level is stacked from here.
    accumlevelcancerperc = [0] * ncat
    for level in extlevels:
        if level not in levelcatsamplecount:
            continue

        counts = levelcatsamplecount[level]
        levelcancerperc = [0] * ncat
        for k in ind:
            cat = catarray[k]
            if catsamplecount[cat] < thresholdcat:
                continue
            if cat in counts:
                levelcancerperc[k] = counts[cat] * 100.0 / catsamplecount[cat]

        # Prepend, so the legend lists levels top-down like the stack itself.
        plts.insert(0, ax.bar(ind, levelcancerperc, width, color=levelcolors[level], bottom=accumlevelcancerperc))
        legends.insert(0, levellegend[level])
        accumlevelcancerperc = [below + segment for below, segment in zip(accumlevelcancerperc, levelcancerperc)]

    # Decorate the axes that was passed in. Looking up the figure's current
    # axes here instead would put every title and legend on a single subplot.
    ax.set_axisbelow(True)
    ax.set_aspect(0.1)

    ax.tick_params(axis='y', which='major', labelsize=6)
    ax.set_ylabel('% of samples', fontsize=6)
    ax.set_title(title, fontsize=8)
    ax.set_xticks([i + 0.5 for i in ind])
    ax.set_xticklabels(catarray, rotation=60, ha="right", fontsize=4)
    # plt.yticks(np.arange(0, 81, 10))
    ax.legend(plts, legends, fontsize=6, bbox_to_anchor=(1.01, 1), loc="upper left")
239 | 
240 | 
def main(argv):
    """Plot OncoKB actionability, diagnostic and prognostic summaries to one file.

    Args:
        argv: the ``argparse`` namespace built in this script's ``__main__``
            block. The attributes read here are ``help``, ``input_file``,
            ``output_file``, ``catogery_column``, ``threshold_cat``,
            ``sample_ids_filter`` and ``levels``.

    Raises:
        SystemExit: after printing the help text, or with status 2 when -i or
            -o is empty.
    """
    params = {
        "catogerycolumn": argv.catogery_column,  # -c
        "thresholdcat": argv.threshold_cat,  # -n
    }
    if argv.help:
        log.info(
            '\n'
            'OncoKBPlots.py -i <annotated clinical file> -o <output PDF file> [-c <categorization column, '
            'e.g. CANCER_TYPE>] [-s sample list filter] [-n threshold of # samples in a category] [-l comma separated levels to include]\n'
            '  Essential clinical columns:\n'
            '    SAMPLE_ID: sample ID\n'
            '    HIGHEST_LEVEL: Highest OncoKB levels\n'
            '  Supported levels (-l): \n'
            '    LEVEL_1,LEVEL_2,LEVEL_3A,LEVEL_3B,LEVEL_4,ONCOGENIC,VUS'
        )
        sys.exit()

    # Report every required flag that was left at its empty default.
    missing_params = [flag for flag, value in (('-i', argv.input_file), ('-o', argv.output_file)) if value == '']
    if missing_params:
        log.error('The parameter(s) ' + ', '.join(missing_params) + ' can not be empty')
        log.info('for help: python OncoKBPlots.py -h')
        sys.exit(2)

    if argv.sample_ids_filter:
        setsampleidsfileterfile(argv.sample_ids_filter)
    if argv.levels:
        # When given, this list replaces the default levels of all three plots.
        params["levels"] = argv.levels.split(',')

    log.info('annotating %s ...' % argv.input_file)
    fig, (ax1, ax2, ax3) = plt.subplots(3, 1)

    # ax.yaxis.grid(linestyle="dotted", color="lightgray") # horizontal lines
    # plt.margins(0.01)

    # One plot per subplot, each drawn exactly once, and all inputs taken from
    # the ``argv`` parameter so main() also works when called from other code.
    plotclinicalactionability(ax1, argv.input_file, argv.output_file, params)
    plotimplications(ax2, 'HIGHEST_DX_LEVEL', 'OncoKB Diagnostic Implications', dxLevels, argv.input_file,
                     argv.output_file, params)
    plotimplications(ax3, 'HIGHEST_PX_LEVEL', 'OncoKB Prognostic Implications', pxLevels, argv.input_file,
                     argv.output_file, params)

    plt.subplots_adjust(left=0.2, bottom=0.3)
    plt.gcf().text(0.90, 0.1, "Generated by OncoKB\n[Chakravarty et al., JCO PO 2017]", fontsize=6,
                   horizontalalignment='right', verticalalignment='bottom')
    fig.tight_layout()
    fig.savefig(argv.output_file, bbox_inches='tight')

    log.info('done!')
294 | 
295 | 
if __name__ == "__main__":
    # argparse's built-in help is disabled; -h is declared as an ordinary
    # switch so that main() can print its own usage text.
    parser = argparse.ArgumentParser(add_help=False)

    # (flag, keyword arguments for add_argument), in declaration order.
    option_specs = (
        ('-h', {'dest': 'help', 'action': "store_true", 'default': False}),
        ('-i', {'dest': 'input_file', 'default': '', 'type': str}),
        ('-o', {'dest': 'output_file', 'default': '', 'type': str}),
        ('-c', {'dest': 'catogery_column', 'default': 'CANCER_TYPE', 'type': str}),
        ('-s', {'dest': 'sample_ids_filter', 'default': '', 'type': str}),
        ('-n', {'dest': 'threshold_cat', 'default': 0, 'type': int}),
        ('-l', {'dest': 'levels', 'default': '', 'type': str}),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)

    # main() is attached as the handler and receives the parsed namespace.
    parser.set_defaults(func=main)

    args = parser.parse_args()
    args.func(args)
309 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | ## UPDATE: 
  3 | - v3.4 allows you to include descriptions into the annotated files with `-d` parameter.
  4 | - When annotating genomic change, HGVSg, three additional columns will be added. `ONCOKB_HUGO_SYMBOL`, `ONCOKB_PROTEIN_CHANGE` and `ONCOKB_CONSEQUENCE`
  5 | - See [Columns added section](#columns-added) for more details
  6 | 
  7 | # oncokb-annotator <a href="https://ascopubs.org/doi/full/10.1200/PO.17.00011"><img src="https://img.shields.io/badge/DOI-10.1200%2FPO.17.00011-1c75cd" /></a>
  8 | API token required, please see [OncoKB™ API section](#oncokb-api) for more information
  9 | 
 10 | ## Status
 11 | 
 12 | [![Run all python tests](https://github.com/oncokb/oncokb-annotator/workflows/Run%20all%20python%20tests/badge.svg)](https://github.com/oncokb/oncokb-annotator/actions?query=workflow%3A%22Run+all+python+tests%22) [![Compare Annotation](https://github.com/oncokb/oncokb-annotator/workflows/Compare%20Annotation/badge.svg)](https://github.com/oncokb/oncokb-annotator/actions?query=workflow%3A%22Compare+Annotation%22)
 13 | 
 14 | ## Install dependencies
 15 | For python 3
 16 | ```
 17 | pip install -r requirements/common.txt -r requirements/pip3.txt
 18 | ```
 19 | 
 20 | For python 2.7
 21 | ```
 22 | pip install -r requirements/common.txt -r requirements/pip2.7.txt
 23 | ```
 24 | 
 25 | 
 26 | ## Usage
 27 | Example input files are under [data](data). An example script is here: [example.sh](example.sh)
 28 | 
 29 | ### MAF
 30 | Annotates variants in MAF(https://docs.gdc.cancer.gov/Data/File_Formats/MAF_Format/) with OncoKB™ annotation. Supports both python2 and python3.  
 31 | Get more details on the command line using `python MafAnnotator.py -h`.  
 32 | 
 33 | Since OncoKB Annotator only supports MAF files, one option is to use [vcf2maf](https://github.com/mskcc/vcf2maf/) for conversion before using the `MafAnnotator` script.
 34 | Note that OncoKB’s canonical transcripts may differ from Ensembl’s, so it’s important to use the `--custom-enst` option with vcf2maf. You can download the latest transcript IDs from OncoKB’s [Cancer Gene List page](https://www.oncokb.org/cancer-genes), but be sure to preprocess the list to make it compatible with vcf2maf.
 35 | 
 36 | #### Atypical Alteration
 37 | You can still use MAF format to annotate atypical alterations, such as MSI-H, TMB-H, EGFR vIII. Please see more examples [HERE](data/example_atypical_alterations.txt).  
 38 | 
 39 | ### Copy Number Alteration
 40 | #### Use GISTIC data format
 41 | We use GISTIC 2.0 format by default. For more information, please see https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#discrete-copy-number-data. For examples, please see [HERE](data/example_cna.txt).   
 42 | Columns `Locus ID` and `Cytoband` are not required.
 43 | #### Individual CNA
 44 | You can also list copy number alteration individually by specifying `-f individual`, please see examples [HERE](data/example_individual_cna.txt).
 45 | 
 46 | Get more details on the command line using `python CnaAnnotator.py -h`.  
 47 | 
 48 | ### Fusion
 49 | OncoKB™ offers to annotate functional fusions.
 50 | The fusion format for intragenic deletion is `GENE-intragenic` or `GENE-GENE`.
 51 | For other fusions, please use `GENEA-GENEB` or `GENEA-GENEB Fusion`.  
 52 | 
 53 | Get more details on the command line using `python FusionAnnotator.py -h`.  
 54 | 
 55 | ### Structural Variant
 56 | OncoKB™ offers to annotate structural variants.
 57 | The types supported are DELETION, TRANSLOCATION, DUPLICATION, INSERTION, INVERSION, FUSION, UNKNOWN.
 58 | All other types will be converted to UNKNOWN.
 59 | 
 60 | All structural variants with two different gene partners will be considered functional fusions.
 61 | 
 62 | Get more details on the command line using `python StructuralVariantAnnotator.py -h`.
 63 | 
 64 | ### Clinical Data (Combine MAF+CNA+Fusion)
 65 | You can combine all annotation on sample/patient level using the clinical data annotator.  
 66 | 
 67 | Get more details on the command line using `python ClinicalDataAnnotator.py -h`.  
 68 | 
 69 | ### Annotate with HGVSp_Short, HGVSp, HGVSg or Genomic Change
 70 | OncoKB™ MafAnnotator supports annotating the alteration with HGVSp, HGVSp_Short, HGVSg or Genomic Change format. Please specify the query type with -q parameter.
 71 | The acceptable values are HGVSp_Short, HGVSp, HGVSg and Genomic_Change (case-insensitive). Please see [example.sh](example.sh) for examples.  
 72 | If you do not specify query type, the MafAnnotator will try to figure out the query type based on the headers.  
 73 | 
 74 | #### For HGVSp_Short
 75 | The annotator takes alteration from the column HGVSp_Short or Alteration  
 76 | 
 77 | #### For HGVSp
 78 | The annotator takes alteration from the column HGVSp or Alteration  
 79 | 
 80 | #### For HGVSg
 81 | The annotator takes alteration from the column HGVSg or Alteration  
 82 | 
 83 | #### For Genomic_Change
 84 | The annotator takes genomic change from columns Chromosome, Start_Position, End_Position, Reference_Allele, Tumor_Seq_Allele1(Optional) and Tumor_Seq_Allele2.  
 85 | Typically Tumor_Seq_Allele1 is the reference allele, Tumor_Seq_Allele2 is the variant allele. This is why Tumor_Seq_Allele1 is optional.  
 86 | The annotator uses both if the value is different from Reference_Allele. Tumor_Seq_Allele2 has higher priority than Tumor_Seq_Allele1.  
 87 | 
 88 | Annotation with Genomic_Change is relatively slow. We need to annotate the variant first with [GenomeNexus](https://www.genomenexus.org/) and then get the annotations one by one. There is a plan to improve this method. If you are annotating a lot of data, please prioritize using other query types if applicable. 
 89 | 
 90 | 
 91 | ### Annotate with different reference genomes (GRCh37, GRCh38)
 92 | OncoKB™ MafAnnotator supports annotating the alteration with reference genome GRCh37 and GRCh38.  
 93 | 
 94 | The annotator will get the reference genome from MAF file column NCBI_Build or Reference_Genome.  
 95 | If there is no reference genome specified in the file, we will use the default reference genome through -r parameter.  
 96 | 
 97 | You can specify the default reference genome using -r parameter (This is only applicable to MafAnnotator.py).  
 98 | The acceptable values are GRCh37, GRCh38 (case-insensitive).  
 99 | 
100 | If both values are not specified, the annotator will use OncoKB™ default reference genome which is GRCh37.
101 | 
102 | 
103 | ## Levels of Evidence
104 | Introducing [Simplified OncoKB™ Levels of Evidence](https://www.oncokb.org/levels):
105 | - New Level 2, defined as “Standard care biomarker recommended by the NCCN or other expert panels predictive of response to an FDA-approved drug in this indication” (formerly Level 2A).
106 | - Unified Level 3B, defined as “Standard care or investigational biomarker predictive of response to an FDA-approved or investigational drug in another indication” (combination of previous Levels 2B and 3B).
107 | 
108 | We have implemented these changes for 2 reasons:
109 | - To be consistent with the [Joint Consensus Recommendation by AMP, ASCO and CAP](https://www.sciencedirect.com/science/article/pii/S1525157816302239?via%3Dihub) and the [ESMO Scale for Clinical Actionability of molecular Targets (ESCAT)](https://academic.oup.com/annonc/article/29/9/1895/5076792?searchresult=1)
110 | - To reflect the clinical data that demonstrates patients with investigational predictive biomarkers for a specific tumor type based on compelling clinical evidence (currently Level 3A) are more likely to experience clinical benefit compared to patients with predictive biomarkers that are considered standard care in a different tumor type (previously Level 2B, now combined into Level 3B).
111 | 
112 | 
113 | ## OncoKB™ API
114 | When you run `MafAnnotator.py`, `FusionAnnotator.py`, `StructuralVariantAnnotator.py` and `CnaAnnotator.py`, you need a token before accessing the OncoKB™ data via its web API. Please visit [OncoKB™ Data Access Page](https://www.oncokb.org/dataAccess) for more information about how to register an account and get an OncoKB™ API token.  
115 | With the token listed under [OncoKB™ Account Settings Page](https://www.oncokb.org/account/settings), you could use it in the following format.
116 | ```
117 | python ${FILE_NAME.py} -i ${INPUT_FILE} -o ${OUTPUT_FILE} -b ${ONCOKB_API_TOKEN}
118 | ``` 
119 | 
120 | 
121 | ## Columns added
122 | ### MafAnnotator/CnaAnnotator/StructuralVariantAnnotator/FusionAnnotator
123 | | Column                      | Conditions                                         | Possible Values                                                                                                                                                                     | Description                                                                                                                                                                                                                      |
124 | |-----------------------------|----------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
125 | | ANNOTATED                   |                                                    | True, False                                                                                                                                                                         | Whether the variant is annotated by OncoKB successfully.                                                                                                                                                                         |
126 | | ONCOKB_HUGO_SYMBOL          | Only added when annotating genomic change or HGVSg |                                                                                                                                                                                     | When annotating genomic change, we obtained gene hugo symbol from GenomeNexus. This can be cross-referenced with your own gene name.                                                                                             |
127 | | ONCOKB_PROTEIN_CHANGE       | Only added when annotating genomic change or HGVSg |                                                                                                                                                                                     | When annotating genomic change, we obtained alteration protein change from GenomeNexus. This can be cross-referenced with your own protein change.                                                                               |
128 | | ONCOKB_CONSEQUENCE          | Only added when annotating genomic change or HGVSg |                                                                                                                                                                                     | When annotating genomic change, we obtained alteration consequence from GenomeNexus. This can be cross-referenced with your own consequence/Variant Class.                                                                       |
129 | | GENE_IN_ONCOKB              |                                                    | True, False                                                                                                                                                                         | Whether the gene has been curated by the OncoKB Team.                                                                                                                                                                            |
130 | | VARIANT_IN_ONCOKB           |                                                    | True, False                                                                                                                                                                         | Whether the variant has been curated by the OncoKB Team. Note: when a variant does not exist, it may still have annotations.                                                                                                     |
131 | | MUTATION_EFFECT             |                                                    | Gain-of-function, Likely Gain-of-function, Loss-of-function, Likely Loss-of-function, Switch-of-function, Likely Switch-of-function, Neutral, Likely Neutral, Inconclusive, Unknown | The biological effect of a mutation/alteration on the protein function that gives rise to changes in the biological properties of cells expressing the mutant/altered protein compared to cells expressing the wildtype protein. |
132 | | MUTATION_EFFECT_CITATIONS   |                                                    | PMID, Abstract, Website link                                                                                                                                                        | All citations related to the biological effect.                                                                                                                                                                                  |
133 | | ONCOGENIC                   |                                                    | Oncogenic, Likely Oncogenic, Likely Neutral, Inconclusive, Unknown, Resistance                                                                                                      | In OncoKB™, “oncogenic” is defined as “referring to the ability to induce or cause cancer” as described in the second edition of The Biology of Cancer by Robert Weinberg (2014).                                                |
134 | | LEVEL_*                     |                                                    | Therapeutic implications                                                                                                                                                            | The leveled therapeutic implications.                                                                                                                                                                                            |
135 | | HIGHEST_LEVEL               |                                                    | LEVEL_1, LEVEL_2, LEVEL_3A, LEVEL_3B, LEVEL_4, LEVEL_R1, LEVEL_R2                                                                                                                   | The highest level of evidence for therapeutic implications. Order: LEVEL_R1 > LEVEL_1 > LEVEL_2 > LEVEL_3A > LEVEL_3B > LEVEL_4 > LEVEL_R2                                                                                       |
136 | | HIGHEST_SENSITIVE_LEVEL     |                                                    | LEVEL_1, LEVEL_2, LEVEL_3A, LEVEL_3B, LEVEL_4                                                                                                                                       | The highest sensitive level of evidence for therapeutic implications. Order: LEVEL_1 > LEVEL_2 > LEVEL_3A > LEVEL_3B > LEVEL_4                                                                                                   |
137 | | HIGHEST_RESISTANCE_LEVEL    |                                                    | LEVEL_R1, LEVEL_R2                                                                                                                                                                  | The highest resistance level of evidence for therapeutic implications. Order: LEVEL_R1 > LEVEL_R2                                                                                                                                |
138 | | TX_CITATIONS                |                                                    | PMID, Abstract, Website link                                                                                                                                                        | All citations related to therapeutic implications.                                                                                                                                                                               |
139 | | LEVEL_Dx*                   |                                                    | Tumor type the level of evidence is assigned to                                                                                                                                     | The leveled diagnostic implications.                                                                                                                                                                                             |
140 | | HIGHEST_DX_LEVEL            |                                                    | LEVEL_Dx1, LEVEL_Dx2, LEVEL_Dx3                                                                                                                                                     | The highest level of evidence for diagnostic implications.                                                                                                                                                                       |
141 | | DX_CITATIONS                |                                                    | PMID, Abstract, Website link                                                                                                                                                        | All citations related to diagnostic implications.                                                                                                                                                                                |
142 | | LEVEL_Px*                   |                                                    | Tumor type the level of evidence is assigned to                                                                                                                                     | The leveled prognostic implications.                                                                                                                                                                                             |
143 | | HIGHEST_PX_LEVEL            |                                                    | LEVEL_Px1, LEVEL_Px2, LEVEL_Px3                                                                                                                                                     | The highest level of evidence for prognostic implications.                                                                                                                                                                       |
144 | | PX_CITATIONS                |                                                    | PMID, Abstract, Website link                                                                                                                                                        | All citations related to prognostic implications.                                                                                                                                                                                |
145 | | GENE_SUMMARY                | Only when parameter -d is specified                |                                                                                                                                                                                     | Brief overview of the gene and its role in cancer                                                                                                                                                                                |
146 | | VARIANT_SUMMARY             | Only when parameter -d is specified                |                                                                                                                                                                                     | Variant summary describes the variant oncogenicity, last review if it is VUS                                                                                                                                                     |
147 | | TUMOR_TYPE_SUMMARY          | Only when parameter -d is specified                |                                                                                                                                                                                     | Tumor type summary describes the therapeutic implication that applies to the indication                                                                                                                                          |
148 | | DIAGNOSTIC_SUMMARY          | Only when parameter -d is specified                |                                                                                                                                                                                     | Diagnostic summary that applies to the indication, for hematologic malignancies only                                                                                                                                             |
149 | | PROGNOSTIC_SUMMARY          | Only when parameter -d is specified                |                                                                                                                                                                                     | Prognostic summary that applies to the indication, for hematologic malignancies only                                                                                                                                             |
150 | | MUTATION_EFFECT_DESCRIPTION | Only when parameter -d is specified                |                                                                                                                                                                                     | The mutation effect description provides a brief overview of the biological and oncogenic effect of the VPS and includes appropriate references to peer-reviewed literature.                                                     |
151 | 
152 | ### ClinicalDataAnnotator
153 | Please see description above for columns LEVEL_&ast;, HIGHEST_LEVEL, HIGHEST_SENSITIVE_LEVEL, HIGHEST_RESISTANCE_LEVEL, LEVEL_Dx*, HIGHEST_DX_LEVEL, LEVEL_Px*, HIGHEST_PX_LEVEL.   
154 | Besides these columns, the following columns will also be added.
155 | 
156 | | Column                                              | Description                                                                 |
157 | |-----------------------------------------------------|-----------------------------------------------------------------------------|
158 | | ONCOGENIC_MUTATIONS                                 | The list of mutations that are Oncogenic or Likely Oncogenic.               |
159 | | #ONCOGENIC_MUTATIONS                                | Number of oncogenic mutations.                                              |
160 | | RESISTANCE_MUTATIONS                                | The list of resistance mutations.                                           |
161 | | #RESISTANCE_MUTATIONS                               | Number of resistance mutations.                                             |
162 | | #MUTATIONS_WITH_SENSITIVE_THERAPEUTIC_IMPLICATIONS  | Number of mutations in the sample with sensitive therapeutic implications.  |
163 | | #MUTATIONS_WITH_RESISTANCE_THERAPEUTIC_IMPLICATIONS | Number of mutations in the sample with resistance therapeutic implications. |
164 | | #MUTATIONS_WITH_DIAGNOSTIC_IMPLICATIONS             | Number of mutations in the sample with diagnostic implications.             |
165 | | #MUTATIONS_WITH_PROGNOSTIC_IMPLICATIONS             | Number of mutations in the sample with prognostic implications.             |
166 | | #MUTATIONS                                          | Number of mutations in the sample.                                          |
167 | ## Questions?
168 | The best way is to email contact@oncokb.org, so all our team members can help.
169 | 


--------------------------------------------------------------------------------
/StructuralVariantAnnotator.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import sys
 4 | import argparse
 5 | import logging
 6 | 
 7 | from AnnotatorCore import setsampleidsfileterfile
 8 | from AnnotatorCore import setcancerhotspotsbaseurl
 9 | from AnnotatorCore import setoncokbbaseurl
10 | from AnnotatorCore import setoncokbapitoken
11 | from AnnotatorCore import readCancerTypes
12 | from AnnotatorCore import validate_oncokb_token
13 | from AnnotatorCore import process_sv
14 | 
# Configure logging so that messages at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)
# Module-wide logger used by main() for usage, progress and error messages.
log = logging.getLogger('StructuralVariantAnnotator')
17 | 
18 | 
def main(argv):
    """Annotate a structural variant file using the parsed command-line options.

    :param argv: the ``argparse.Namespace`` produced by the parser in the
        ``__main__`` block. The attributes read here are ``help``,
        ``input_file``, ``output_file``, ``oncokb_api_bearer_token``,
        ``sample_ids_filter``, ``cancer_hotspots_base_url``, ``oncokb_api_url``,
        ``input_clinical_file``, ``previous_result_file``,
        ``default_cancer_type`` and ``include_descriptions``.

    Exits the process via ``sys.exit()`` after printing the usage text when
    ``-h`` is given, and via ``sys.exit(2)`` when any of ``-i``, ``-o`` or
    ``-b`` is empty. Otherwise the annotation itself is delegated to
    ``process_sv``.
    """
    if argv.help:
        # -b is listed without brackets because it is mandatory (see the check below).
        log.info(
            '\n'
            'StructuralVariantAnnotator.py -i <input structural variant file> -o <output structural variant file> '
            '-b <oncokb api bearer token> [-p previous results] [-c <input clinical file>] [-s sample list filter] '
            '[-t <default tumor type>] [-u <oncokb api url>] [-v <cancer hotspots base url>] [-d include descriptions]\n'
            '  Essential structural variant columns (case insensitive):\n'
            '    GENEA: Hugo gene symbol for gene A\n'
            '    GENEB: Hugo gene symbol for gene B\n'
            '    SV_TYPE: Structural variant type. Available values: DELETION, TRANSLOCATION, DUPLICATION, INSERTION, INVERSION, FUSION, UNKNOWN. Other type will be converted to UNKNOWN\n'
            '    TUMOR_SAMPLE_BARCODE: sample ID\n'
            '  Essential clinical columns:\n'
            '    SAMPLE_ID: sample ID\n'
            '    ONCOTREE_CODE: tumor type code from oncotree (oncotree.mskcc.org)\n'
            '  Cancer type will be assigned based on the following priority:\n'
            '     1) ONCOTREE_CODE in clinical data file\n'
            '     2) ONCOTREE_CODE exist in structural variant\n'
            '     3) default tumor type (-t)\n'
            '  Default OncoKB base url is https://www.oncokb.org'
        )
        sys.exit()

    # Collect every mandatory option that was left empty so they are all
    # reported in a single error message.
    mandatory_options = (
        ('-i', argv.input_file),
        ('-o', argv.output_file),
        ('-b', argv.oncokb_api_bearer_token),
    )
    required_params = [flag for flag, value in mandatory_options if value == '']
    if required_params:
        log.error('The parameter(s) ' + ', '.join(required_params) + ' can not be empty')
        log.info('for help: python StructuralVariantAnnotator.py -h')
        sys.exit(2)

    # Optional overrides are only applied when a value was supplied.
    if argv.sample_ids_filter:
        setsampleidsfileterfile(argv.sample_ids_filter)
    if argv.cancer_hotspots_base_url:
        setcancerhotspotsbaseurl(argv.cancer_hotspots_base_url)
    if argv.oncokb_api_url:
        setoncokbbaseurl(argv.oncokb_api_url)
    setoncokbapitoken(argv.oncokb_api_bearer_token)

    # Filled by readCancerTypes when a clinical file is given; otherwise passed on empty.
    cancertypemap = {}
    if argv.input_clinical_file:
        readCancerTypes(argv.input_clinical_file, cancertypemap)

    # Validate the token before starting the annotation run.
    validate_oncokb_token()

    log.info('annotating %s ...' % argv.input_file)
    process_sv(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type, cancertypemap,
               argv.include_descriptions)

    log.info('done!')
72 | 
73 | 
if __name__ == "__main__":
    # argparse's built-in help is disabled (add_help=False); -h is registered
    # below as an ordinary flag and handled inside main().
    cli = argparse.ArgumentParser(add_help=False)

    # ArgumentParser doesn't accept "store_true" and "type=" at the same time,
    # so boolean switches and string options use separate keyword sets.
    switch = {'action': "store_true", 'default': False}
    text = {'default': '', 'type': str}

    # (flag, destination attribute, add_argument keywords), in registration order.
    option_table = (
        ('-h', 'help', switch),
        ('-i', 'input_file', text),
        ('-o', 'output_file', text),
        ('-p', 'previous_result_file', text),
        ('-c', 'input_clinical_file', text),
        ('-s', 'sample_ids_filter', {'default': None, 'type': str}),
        ('-t', 'default_cancer_type', text),
        ('-u', 'oncokb_api_url', text),
        ('-v', 'cancer_hotspots_base_url', text),
        ('-b', 'oncokb_api_bearer_token', text),
        ('-d', 'include_descriptions', switch),
    )
    for flag, destination, keywords in option_table:
        cli.add_argument(flag, dest=destination, **keywords)
    cli.set_defaults(func=main)

    parsed = cli.parse_args()
    parsed.func(parsed)
92 | 


--------------------------------------------------------------------------------
/actionability_functions_msi_tmb_manuscript_R.r:
--------------------------------------------------------------------------------
   1 | #!/usr/bin/env Rscript
   2 | 
   3 | ### Annotate IMPACT files using oncokb-annotator ###
   4 | 
   5 | ### Chakravarty D, Gao J, Phillips SM, et al. OncoKB: A Precision Oncology Knowledge Base. JCO Precis Oncol. 2017;2017:PO.17.00011. doi:10.1200/PO.17.00011 ###
   6 | 
   7 | ### Actionability Functions ###
   8 | # Collection of functions used to clean, process, and analyze actionability data
   9 | 
  10 | ### Input parameters
  11 | 
  12 | # cna_df: OncoKB annotated IMPACT CNA data  
  13 | # mut_df: OncoKB annotated IMPACT mutation data  
  14 | # fus_df: OncoKB annotated IMPACT fusion data  
  15 | # clin_df: OncoKB annotated IMPACT clinical sample data  
  16 | # data_freeze: Sample data, must include *SAMPLE_ID*, group_col, and consent_col
  17 | # group_col: Column name for the groups (cancer types)
  18 | # consent_col: Column name for 12-245 Part C consent status (YES/NO/NA)
  19 | # path_df: Pathway data, must include gene and corresponding pathway columns (in that order)  
  20 | # tsg_list: List of tumor suppressor genes (no header)  
  21 | # fusion_list: List of genes to isolate from fusion partners (ie. NTRK1-LMNA fusion becomes NTRK1 fusion)  
  22 | # prop_level_df: Output from action_levels_barplot_fun actionability_levels_barplot_table.txt  
  23 | # alt_final_df: Output from action_alterations_barplot_fun actionability_master_alterations_table.txt  
  24 | # alt_min: Minimum alteration percentage required in one cancer type to visualize alteration on main plot (default 1)  
  25 | # status: Include only somatic mutations, only germline mutations, or both (options: somatic, germline, both)
  26 | # gene_order: List of genes for gene order, genes not included will be ordered by pathway following this list (no header) 
  27 | # only_highest_level: TRUE/FALSE, If true only visualize the highest level of evidence genes in main plot
  28 | # msi_tmb_status: TRUE/FALSE, If true include Level 1 MSI/TMB status in actionability barplot, removes MSI/TMB in all other plots
  29 | # msi_tmb_df: MSI/TMB annotated file (atypical alterations), visualizes MSI/TMB level 1 for actionability barplot, 
  30 |   # removes all samples in file for all other plots
  31 | 
  32 | 
  33 | ###
  34 | 
  35 | 
  36 | # Load libraries
  37 | if (!require('tidyverse')) install.packages('tidyverse'); library(tidyverse)
  38 | if (!require('cowplot')) install.packages('cowplot'); library(cowplot)
  39 | if (!require('reshape2')) install.packages('reshape2'); library(reshape2)
  40 | 
  41 | # Collapse oncogenic alterations
  42 | collapse_oncogenic <- function(data_frame, sample_column, alteration_type){
  43 |   data_frame[, ] <- lapply(data_frame[, ], as.character)
  44 |   #data_frame_samp <- data_frame %>% dplyr::filter(oncogenic == "Oncogenic")  #### TESTING
  45 |   data_frame_samp <- data_frame[grepl("Oncogenic", data_frame$oncogenic),]
  46 |   colnames(data_frame_samp)[which(names(data_frame_samp) == sample_column)] <- "SAMPLE_ID"
  47 |   data_frame_samp <- aggregate(oncogenic ~ SAMPLE_ID, data = data_frame_samp, toString, na.omit = TRUE)
  48 |   colnames(data_frame_samp)[2] <- paste0(alteration_type, "_oncogenic")
  49 |   return(data_frame_samp)
  50 | }
  51 | 
  52 | # Create frequency data frame by group for subgroup
  53 | freq_dataframe <- function(data_frame, split_group, percentage_group){
  54 |   # Split group is the column to group by
  55 |   # Percentage group is the column to calculate the percentage for, by group
  56 |   df <- data_frame %>%
  57 |     dplyr::select(percentage_group, split_group) %>%
  58 |     group_by_(split_group, percentage_group, .drop = F) %>%
  59 |     dplyr::summarise(n = n()) %>%
  60 |     dplyr::mutate(freq = n / sum(n)) %>%
  61 |     ungroup()
  62 |   return(df)
  63 | }
  64 | 
  65 | # Create actionability level barplot
  66 | action_levels_barplot_fun <- function(cna_df, mut_df, fus_df, clin_df, data_freeze,
  67 |                                       status = c("somatic", "germline", "both"),
  68 |                                       group_col,
  69 |                                       consent_col,
  70 |                                       msi_tmb_status,
  71 |                                       msi_tmb_df){
  72 |   # Read in data
  73 |   cna_df <- read.delim(cna_df)
  74 |   fus_df <- read.delim(fus_df)
  75 |   mut_df <- read.delim(mut_df)
  76 |   clin_df <- read.delim(clin_df)
  77 |   data_freeze <- read.delim(data_freeze)
  78 |   data_freeze$SAMPLE_ID <- as.character(data_freeze$SAMPLE_ID)
  79 |   
  80 |   ######
  81 |   
  82 |   # Optional MSI/TMB addition
  83 |   if (msi_tmb_status == TRUE){
  84 |     msi_tmb_df <- read.delim(msi_tmb_path)
  85 |     msi_tmb_df <- msi_tmb_df %>%
  86 |       dplyr::select(SAMPLE_ID) %>% 
  87 |       mutate_if(is.factor, as.character) %>%
  88 |       mutate(Highest_level = "LEVEL_1_MSI-H_TMB-H") %>%
  89 |       distinct()
  90 |   } else {
  91 |     msi_tmb_df <- data.frame(SAMPLE_ID = character(), Highest_level = character())
  92 |   }
  93 |   
  94 |   #####
  95 |   
  96 |   # Clean & filter clinical data
  97 |   # Add group column
  98 |   clin_df <- clin_df %>%
  99 |     mutate(SAMPLE_ID = as.character(SAMPLE_ID),
 100 |            HIGHEST_LEVEL = as.character(HIGHEST_LEVEL)) %>%
 101 |     filter(SAMPLE_ID %in% data_freeze$SAMPLE_ID) %>%
 102 |     left_join(data_freeze[,c("SAMPLE_ID", group_col, consent_col)], by = c("SAMPLE_ID")) %>%
 103 |     mutate(HIGHEST_LEVEL = ifelse(SAMPLE_ID %in% msi_tmb_df$SAMPLE_ID, "LEVEL_1_MSI-H_TMB-H", HIGHEST_LEVEL))
 104 |   group_col_dup <- paste0(group_col, ".y")
 105 |   colnames(clin_df)[which(names(clin_df) == group_col_dup)] <- group_col
 106 |   colnames(clin_df)[which(names(clin_df) == consent_col)] <- "consent"
 107 |   
 108 |   # Clean, filter, rename genomic data 
 109 |   # Fix column names if upper
 110 |   # Filter for columns of interest
 111 |   col_list <- c("SAMPLE_ID", "oncogenic", "LEVEL_1", "LEVEL_2", "LEVEL_3A", "LEVEL_3B", "LEVEL_4", "Highest_level")
 112 |   
 113 |   # Fusions
 114 |   fus_df <- fus_df %>%
 115 |     dplyr::rename_all(recode, 
 116 |                       Tumor_Sample_Barcode = "SAMPLE_ID", 
 117 |                       HIGHEST_LEVEL = "Highest_level",
 118 |                       ONCOGENIC = "oncogenic") %>%
 119 |     mutate(SAMPLE_ID = as.character(SAMPLE_ID),
 120 |            Highest_level = as.character(Highest_level)) %>%
 121 |     filter(SAMPLE_ID %in% data_freeze$SAMPLE_ID) %>%
 122 |     mutate(Highest_level = ifelse(SAMPLE_ID %in% msi_tmb_df$SAMPLE_ID, "LEVEL_1_MSI-H_TMB-H", Highest_level)) %>%
 123 |     dplyr::select(col_list)
 124 |   
 125 |   # CNA
 126 |   cna_df <- cna_df %>%
 127 |     dplyr::rename_all(recode, 
 128 |                       Tumor_Sample_Barcode = "SAMPLE_ID", 
 129 |                       HIGHEST_LEVEL = "Highest_level",
 130 |                       ONCOGENIC = "oncogenic") %>%
 131 |     mutate(SAMPLE_ID = as.character(SAMPLE_ID),
 132 |            Highest_level = as.character(Highest_level)) %>%
 133 |     filter(SAMPLE_ID %in% data_freeze$SAMPLE_ID)  %>%
 134 |     mutate(Highest_level = ifelse(SAMPLE_ID %in% msi_tmb_df$SAMPLE_ID, "LEVEL_1_MSI-H_TMB-H", Highest_level)) %>%
 135 |     dplyr::select(col_list)
 136 |   
 137 |   # Mutations
 138 |   mut_df <- mut_df %>%
 139 |     dplyr::rename_all(recode, 
 140 |                       Tumor_Sample_Barcode = "SAMPLE_ID", 
 141 |                       HIGHEST_LEVEL = "Highest_level",
 142 |                       ONCOGENIC = "oncogenic") %>%
 143 |     mutate(SAMPLE_ID = as.character(SAMPLE_ID),
 144 |            Highest_level = as.character(Highest_level)) %>%
 145 |     mutate(Highest_level = ifelse(SAMPLE_ID %in% msi_tmb_df$SAMPLE_ID, "LEVEL_1_MSI-H_TMB-H", Highest_level)) %>%
 146 |     filter(SAMPLE_ID %in% data_freeze$SAMPLE_ID)
 147 |   
 148 |   # Filter for status
 149 |   if (status == "somatic") {
 150 |     mut_somatic_df <- filter(mut_df, Mutation_Status != "GERMLINE" | is.na(Mutation_Status) == T)
 151 |     mut_germ_df <- filter(mut_df, Mutation_Status == "GERMLINE")
 152 |     mut_df <- mut_somatic_df[col_list]
 153 |   } else if (status == "germline") {
 154 |     clin_df <- filter(clin_df, consent == "YES")
 155 |     mut_germ_df <- filter(mut_df, Mutation_Status == "GERMLINE")
 156 |     mut_germ_df <- mut_germ_df[mut_germ_df$SAMPLE_ID %in% clin_df$SAMPLE_ID,]
 157 |     mut_df <- mut_germ_df[col_list]
 158 |   } else {
 159 |     mut_df <- mut_df[col_list]
 160 |     clin_germ_df <- filter(clin_df, consent == "YES" | is.na(consent) == T)
 161 |   }
 162 |   
 163 |   # Create master levels data frame for somatic 
 164 |   master_df <- rbind(cna_df, fus_df)
 165 |   master_df <- rbind(master_df, mut_df)
 166 |   master_df <- master_df %>%
 167 |     dplyr::select(SAMPLE_ID, Highest_level) %>%
 168 |     filter(Highest_level != "") %>%
 169 |     mutate_if(is.factor, as.character) %>%
 170 |     group_by(SAMPLE_ID) %>%
 171 |     dplyr::arrange(Highest_level) %>%
 172 |     dplyr::slice(1) %>%
 173 |     ungroup()
 174 |   
 175 |   # Collapse oncogenic alterations
 176 |   cna_df <- collapse_oncogenic(cna_df, "SAMPLE_ID", "cna")
 177 |   fus_df <- collapse_oncogenic(fus_df, "SAMPLE_ID", "fus")
 178 |   mut_df <- collapse_oncogenic(mut_df, "SAMPLE_ID", "mut")
 179 |   
 180 |   # Filter if germline
 181 |   if (status == "germline") {
 182 |     clin_df <- left_join(clin_df, mut_germ_df[,c("SAMPLE_ID", "Highest_level")])
 183 |     clin_df <- clin_df %>% mutate_if(is.factor, as.character)
 184 |     clin_df$HIGHEST_LEVEL <- ifelse(clin_df$SAMPLE_ID %in% mut_germ_df$SAMPLE_ID, clin_df$Highest_level, "NO_LEVEL")
 185 |     # Get list of sample with oncogenic alteration
 186 |     onco_samp_list <- mut_df$SAMPLE_ID
 187 |   } else if (status == "somatic") {
 188 |     clin_df <- left_join(clin_df, master_df, by = "SAMPLE_ID")
 189 |     clin_df <- clin_df %>% mutate_if(is.factor, as.character)
 190 |     clin_df$HIGHEST_LEVEL <- ifelse(clin_df$SAMPLE_ID %in% mut_germ_df$SAMPLE_ID,
 191 |                                     clin_df$Highest_level, clin_df$HIGHEST_LEVEL)
 192 |     # Merge to make master oncogenic list of samples
 193 |     all_df <- full_join(cna_df, fus_df, by = "SAMPLE_ID")
 194 |     all_df <- full_join(all_df, mut_df, by = "SAMPLE_ID")
 195 |     # Get list of sample with oncogenic alteration
 196 |     onco_samp_list <- all_df$SAMPLE_ID
 197 |   } else {
 198 |     # Merge to make master oncogenic list of samples
 199 |     all_df <- full_join(cna_df, fus_df, by = "SAMPLE_ID")
 200 |     all_df <- full_join(all_df, mut_df, by = "SAMPLE_ID")
 201 |     # Get list of sample with oncogenic alteration
 202 |     onco_samp_list <- all_df$SAMPLE_ID
 203 |   }
 204 |   
 205 |   # Fill in the highest level blanks:
 206 |   clin_df$HIGHEST_LEVEL <- as.character(clin_df$HIGHEST_LEVEL)
 207 |   clin_df$HIGHEST_LEVEL[clin_df$HIGHEST_LEVEL == "" | is.na(clin_df$HIGHEST_LEVEL) == T] <- "NO_LEVEL"
 208 |   clin_df$HIGHEST_LEVEL[(clin_df$SAMPLE_ID %in% onco_samp_list) & (clin_df$HIGHEST_LEVEL == "NO_LEVEL") ] <- "ONCOGENIC"
 209 |   
 210 |   # For highest level of actionability, calculate the percentage of each level by subtype
 211 |   prop_level_df <- freq_dataframe(clin_df, group_col, "HIGHEST_LEVEL")
 212 |   
 213 |   # Set level order
 214 |   level_order <- c("LEVEL_1_MSI-H_TMB-H","LEVEL_1", "LEVEL_2", "LEVEL_3A", "LEVEL_3B", "LEVEL_4", "ONCOGENIC", "NO_LEVEL")
 215 |   prop_level_df$HIGHEST_LEVEL <- factor(prop_level_df$HIGHEST_LEVEL, levels = level_order)
 216 |   
 217 |   # Add counts for labels
 218 |   # Check the number of oncotree codes and their frequency
 219 |   data_freeze <- data_freeze[data_freeze$SAMPLE_ID %in% clin_df$SAMPLE_ID,]
 220 |   clin_oncotree_freq <- as.data.frame(table(data_freeze[,group_col]))
 221 |   clin_oncotree_freq <- clin_oncotree_freq[order(clin_oncotree_freq$Freq, decreasing = T),]
 222 |   colnames(clin_oncotree_freq)[1] <- group_col
 223 |   prop_level_df <- left_join(prop_level_df, clin_oncotree_freq, by = group_col)
 224 |   
 225 |   if (status == "both") {
 226 |     data_freeze_2 <- data_freeze[data_freeze$SAMPLE_ID %in% clin_germ_df$SAMPLE_ID,]
 227 |     clin_oncotree_freq_germ <- as.data.frame(table(data_freeze_2[,group_col]))
 228 |     colnames(clin_oncotree_freq_germ)[1] <- group_col
 229 |     prop_level_df <- left_join(prop_level_df, clin_oncotree_freq_germ, by = group_col)
 230 |     prop_level_df$label <- apply(prop_level_df[ ,c(group_col, "Freq.x")], 1, paste0, collapse = " n=" )
 231 |     prop_level_df$label <- apply(prop_level_df[ ,c("label", "Freq.y")], 1, paste0, collapse = ":" )
 232 |   } else {
 233 |     prop_level_df$label <- apply(prop_level_df[ ,c(group_col, "Freq")], 1, paste0, collapse = " n=" )
 234 |   }
 235 |   
 236 |   # # Arrange by frequency of actionability
 237 |   # prop_level_df <- prop_level_df %>%
 238 |   #   arrange(HIGHEST_LEVEL, desc(freq))
 239 |   
 240 |   # Arrange by frequency of combined top 4 levels of actionability
 241 |   prop_level_df_order <- prop_level_df %>%
 242 |     filter(HIGHEST_LEVEL %in% c("LEVEL_1_MSI-H_TMB-H", "LEVEL_1", "LEVEL_2", "LEVEL_3A")) %>%
 243 |     group_by(CANCER_TYPE) %>%
 244 |     dplyr::mutate(sum_freq = sum(freq)) %>%
 245 |     right_join(prop_level_df) %>%
 246 |     dplyr::arrange(desc(sum_freq), HIGHEST_LEVEL, desc(freq)) %>%
 247 |     dplyr::rename(total_count = Freq) %>%
 248 |     mutate(CANCER_TYPE = factor(CANCER_TYPE, levels = unique(CANCER_TYPE)))
 249 |   
 250 |   # Save
 251 |   write.table(prop_level_df_order, "./actionability_levels_barplot_table.txt", sep = "\t", row.names = F, quote = F)
 252 |   
 253 |   # Set orders
 254 |   cancer_order <- unique(prop_level_df_order$label)
 255 |   
 256 |   # Plot breakdown of levels of evidence as a percentage by sarcoma subtype
 257 |   percent_bar_plot <- ggplot(prop_level_df_order, aes(y = freq, x = label, fill = HIGHEST_LEVEL)) +
 258 |     geom_col(position = position_stack(reverse = TRUE)) +
 259 |     theme(axis.text.x = element_text(angle = 45, hjust = 0, size = 6),
 260 |           axis.text.y = element_text(size = 6),
 261 |           axis.ticks.x = element_blank(),
 262 |           panel.border = element_rect(colour = "black", fill=NA, size=1),
 263 |           panel.background = element_blank(),
 264 |           panel.grid.major = element_blank(),
 265 |           panel.grid.minor = element_blank(),
 266 |           axis.line = element_line(colour = "black"),
 267 |           legend.title = element_text(size = 7),
 268 |           legend.text = element_text(size = 6),
 269 |           legend.key.size = unit(0.4, "cm"),
 270 |           axis.title.x = element_blank(),
 271 |           plot.margin = unit(c(0.05, 0.05, 0.1, 0.05), "cm"),
 272 |           legend.justification="left",
 273 |           legend.margin=margin(0,0,0,0),
 274 |           legend.box.margin=margin(-10,0,-10,-5)) +
 275 |     scale_y_continuous(limits=c(0, 1.00), expand = c(0, 0)) +
 276 |     scale_fill_manual(values = c("#88E281","#33A02C", "#1F78B4", "#984EA3", "#BE98CE", "#a8a8a8", "#ffdab9", "gray90"),
 277 |                       limits = c("LEVEL_1_MSI-H_TMB-H","LEVEL_1", "LEVEL_2", "LEVEL_3A", "LEVEL_3B", "LEVEL_4", "ONCOGENIC", "NO_LEVEL"),
 278 |                       labels = c("LEVEL 1 MSI/TMB-H","LEVEL 1", "LEVEL 2", "LEVEL 3A", "LEVEL 3B", "LEVEL 4", "ONCOGENIC", "NO LEVEL")) +
 279 |     scale_x_discrete(position = "top",
 280 |                      limits = cancer_order) +
 281 |     ylab("Frequency") +
 282 |     labs(fill = "Highest Level of Evidence")
 283 |   
 284 |   return(percent_bar_plot)
 285 |   
 286 | }
 287 | 
 288 | # Create actionability alteration barplot
 289 | action_alterations_barplot_fun <- function(cna_df, mut_df, fus_df, clin_df, data_freeze,
 290 |                                            status = c("somatic", "germline", "both"),
 291 |                                            group_col, consent_col,
 292 |                                            prop_level_df = "./actionability_levels_barplot_table.txt",
 293 |                                            only_highest_level = F,
 294 |                                            msi_tmb_status,
 295 |                                            msi_tmb_df){
 296 |   # Read in data
 297 |   cna_df <- read.delim(cna_df)
 298 |   fus_df <- read.delim(fus_df)
 299 |   mut_df <- read.delim(mut_df)
 300 |   clin_df <- read.delim(clin_df)
 301 |   data_freeze <- read.delim(data_freeze)
 302 |   prop_level_df <- read.delim(prop_level_df)
 303 |   
 304 |   # Set order
 305 |   cancer_order_other <- as.character(unique(prop_level_df[,c(group_col)]))
 306 |   
 307 |   
 308 |   ######
 309 |   
 310 |   # Optional MSI/TMB addition
 311 |   if (msi_tmb_status == TRUE){
 312 |     msi_tmb_df <- read.delim(msi_tmb_path)
 313 |     msi_tmb_df <- msi_tmb_df %>%
 314 |       dplyr::select(SAMPLE_ID) %>% 
 315 |       mutate_if(is.factor, as.character) %>%
 316 |       mutate(Highest_level = "LEVEL_1_MSI-H_TMB-H") %>%
 317 |       distinct()
 318 |     data_freeze <- filter(data_freeze, !SAMPLE_ID %in% msi_tmb_df$SAMPLE_ID)
 319 |   } 
 320 |   
 321 |   #####
 322 |   
 323 |   # Clean & filter clinical data
 324 |   # Add group column
 325 |   clin_df <- clin_df %>%
 326 |     mutate(SAMPLE_ID = as.character(SAMPLE_ID)) %>%
 327 |     filter(SAMPLE_ID %in% data_freeze$SAMPLE_ID) %>%
 328 |     left_join(data_freeze[,c("SAMPLE_ID", group_col, consent_col)], by = c("SAMPLE_ID"))
 329 |   group_col_dup <- paste0(group_col, ".y")
 330 |   colnames(clin_df)[which(names(clin_df) == group_col_dup)] <- group_col
 331 |   colnames(clin_df)[which(names(clin_df) == consent_col)] <- "consent"
 332 |   
 333 |   # Clean, filter, rename genomic data 
 334 |   # Fix column names if upper
 335 |   fus_df <- fus_df %>%
 336 |     dplyr::rename_all(recode, 
 337 |                       Tumor_Sample_Barcode = "SAMPLE_ID", 
 338 |                       HIGHEST_LEVEL = "Highest_level",
 339 |                       ONCOGENIC = "oncogenic") %>%
 340 |     mutate(SAMPLE_ID = as.character(SAMPLE_ID)) %>%
 341 |     filter(SAMPLE_ID %in% data_freeze$SAMPLE_ID) %>%
 342 |     mutate(Fusion = gsub(" fusion", "", Fusion)) %>%
 343 |     mutate(Fusion = gsub(" - Archer", "", Fusion)) %>%
 344 |     dplyr::select(SAMPLE_ID, oncogenic, Highest_level, Fusion) %>%
 345 |     rowwise() %>%
 346 |     mutate(Fusion = ifelse(grepl("intragenic", Fusion), Fusion, 
 347 |                            paste(sort(unlist(strsplit(Fusion, "-", fixed = TRUE))), collapse = "-"))) %>%
 348 |     ungroup() %>%
 349 |     distinct() %>%
 350 |     dplyr::select(-Fusion) %>%
 351 |     mutate(ALTERATION = "Fusion") %>%
 352 |     filter(grepl("Oncogenic", oncogenic) == T, is.na(Highest_level) == F & Highest_level != "") %>%
 353 |     dplyr::select(SAMPLE_ID, ALTERATION, oncogenic, Highest_level)
 354 |   
 355 |   # CNA
 356 |   cna_df <- cna_df %>%
 357 |     dplyr::rename_all(recode, 
 358 |                       Tumor_Sample_Barcode = "SAMPLE_ID", 
 359 |                       HIGHEST_LEVEL = "Highest_level",
 360 |                       ONCOGENIC = "oncogenic") %>%
 361 |     mutate(SAMPLE_ID = as.character(SAMPLE_ID)) %>%
 362 |     filter(SAMPLE_ID %in% data_freeze$SAMPLE_ID)  %>%
 363 |     dplyr::select(SAMPLE_ID, ALTERATION, oncogenic, Highest_level) %>%
 364 |     filter(grepl("Oncogenic", oncogenic) == T, is.na(Highest_level) == F & Highest_level != "") %>%
 365 |     dplyr::select(SAMPLE_ID, ALTERATION, oncogenic, Highest_level)
 366 |   
 367 |   # Mutations
 368 |   # Filter for status
 369 |   if (status == "somatic") {
 370 |     mut_df <- filter(mut_df, Mutation_Status != "GERMLINE" | is.na(Mutation_Status) == T)
 371 |   } else if (status == "germline") {
 372 |     clin_df <- filter(clin_df, consent == "YES")
 373 |     mut_df <- filter(mut_df, Mutation_Status == "GERMLINE")
 374 |     mut_df <- mut_df[mut_df$SAMPLE_ID %in% clin_df$SAMPLE_ID,]
 375 |   }
 376 |   # Clean & Filter
 377 |   mut_df <- mut_df %>%
 378 |     dplyr::rename_all(recode, 
 379 |                       Tumor_Sample_Barcode = "SAMPLE_ID", 
 380 |                       HIGHEST_LEVEL = "Highest_level",
 381 |                       ONCOGENIC = "oncogenic") %>%
 382 |     mutate(SAMPLE_ID = as.character(SAMPLE_ID)) %>%
 383 |     filter(SAMPLE_ID %in% data_freeze$SAMPLE_ID) %>%
 384 |     dplyr::select(SAMPLE_ID, oncogenic, Highest_level) %>%
 385 |     filter(grepl("Oncogenic", oncogenic) == T, is.na(Highest_level) == F & Highest_level != "") %>%
 386 |     mutate(ALTERATION = "Mutation") %>%
 387 |     dplyr::select(SAMPLE_ID, ALTERATION, oncogenic, Highest_level)
 388 |   
 389 |   
 390 |   # rbind to create master alterations data frame
 391 |   # Filter for status for mutation data frame
 392 |   if (status == "somatic" | status == "both") {
 393 |     alt_final <- rbind(cna_df, fus_df)
 394 |     alt_final <- rbind(alt_final, mut_df)
 395 |   } else if (status == "germline") {
 396 |     alt_final <- mut_df
 397 |   }
 398 |   alt_final <- left_join(alt_final, data_freeze[,c("SAMPLE_ID", group_col)], by = "SAMPLE_ID")
 399 |   
 400 |   # Save
 401 |   write.table(alt_final, "actionability_master_alterations_table.txt", sep = "\t", row.names = F, quote = F)
 402 |   
 403 |   ########## optional select only the highest level ##########
 404 |   
 405 |   if (only_highest_level == T){
 406 |     alt_final <- alt_final %>% 
 407 |       left_join(dplyr::select(clin_df, SAMPLE_ID, HIGHEST_LEVEL), by = "SAMPLE_ID") %>%
 408 |       mutate_if(is.factor, as.character) %>%
 409 |       filter(HIGHEST_LEVEL == Highest_level)
 410 |   }
 411 |   
 412 |   ###########
 413 |   
 414 |   # Save
 415 |   write.table(alt_final, "actionability_master_alterations_highest_level_table.txt", sep = "\t", row.names = F, quote = F)
 416 |   
 417 |   # Calculate the percentage of each alteration by subtype
 418 |   prop_alteration_df <- as.data.frame(freq_dataframe(alt_final, group_col, "ALTERATION"))
 419 |   prop_alteration_df$freq[is.na(prop_alteration_df$freq)] <- 0
 420 |   prop_alteration_df$ALTERATION <- factor(prop_alteration_df$ALTERATION,
 421 |                                           levels = c("Amplification", "Deletion", "Fusion", "Mutation"))
 422 |   prop_alteration_df$group <- factor(prop_alteration_df[,group_col],
 423 |                                      levels = cancer_order_other)
 424 |   
 425 |   # Save
 426 |   write.table(prop_alteration_df, "actionability_alterations_barplot_table.txt", sep = "\t", row.names = F, quote = F)
 427 |   
 428 |   # Plot for ACTIONABLE ALTERATIONS
 429 |   alt_freq_bar_plot <- ggplot(prop_alteration_df, aes(y = freq, x = group, fill = ALTERATION)) +
 430 |     geom_col(position = position_stack(reverse = FALSE)) +
 431 |     ylab("Frequency") +
 432 |     labs(fill = "Actionable Alteration") +
 433 |     theme(axis.text.x = element_blank(),
 434 |           axis.ticks.x = element_blank(),
 435 |           axis.text.y = element_text(size = 6),
 436 |           panel.border = element_rect(colour = "black", fill=NA, size=1),
 437 |           panel.background = element_blank(),
 438 |           panel.grid.major = element_blank(),
 439 |           panel.grid.minor = element_blank(),
 440 |           axis.line = element_line(colour = "black"),
 441 |           legend.title = element_text(size = 8),
 442 |           legend.text = element_text(size = 6),
 443 |           legend.key.size = unit(0.4, "cm"),
 444 |           axis.title.x = element_blank(),
 445 |           plot.margin = unit(c(0.05, 0.05, 0.05, 0.05), "cm"),
 446 |           legend.justification="left",
 447 |           legend.margin=margin(0,0,0,0),
 448 |           legend.box.margin=margin(-10,0,-10,-5)) +
 449 |     scale_y_continuous(limits=c(0, 1.00), expand = c(0, 0)) +
 450 |     ylab("Frequency") +
 451 |     scale_fill_manual(values = c("#A11111",  "#02488E", "#660066", "#037903"),
 452 |                       limits = c("Amplification", "Deletion", "Fusion", "Mutation"),
 453 |                       labels = c("Amplification", "Deletion", "Fusion", "Mutation")) +
 454 |     scale_x_discrete(limits = cancer_order_other)
 455 |   alt_freq_bar_plot
 456 |   
 457 |   return(alt_freq_bar_plot)
 458 |   
 459 | }
 460 | 
 461 | # Create actionability count barplot
 462 | action_count_barplot_fun <- function(clin_df, data_freeze, group_col,
 463 |                                      prop_level_df = "./actionability_levels_barplot_table.txt",
 464 |                                      status = c("somatic", "germline", "both"),
 465 |                                      consent_col,
 466 |                                      alt_final_df = "./actionability_master_alterations_table.txt",
 467 |                                      msi_tmb_status,
 468 |                                      msi_tmb_df){
 469 |   
 470 |   # Read in files
 471 |   prop_level_df <- read.delim(prop_level_df)
 472 |   alt_final <- read.delim(alt_final_df)
 473 |   clin_df <- read.delim(clin_df)
 474 |   data_freeze <- read.delim(data_freeze)
 475 |   
 476 |   # Filter for samples in data freeze and clean consent column
 477 |   data_freeze$SAMPLE_ID <- as.character(data_freeze$SAMPLE_ID)
 478 |   
 479 |   ######
 480 |   
 481 |   # Optional MSI/TMB addition
 482 |   if (msi_tmb_status == TRUE){
 483 |     msi_tmb_df <- read.delim(msi_tmb_path)
 484 |     msi_tmb_df <- msi_tmb_df %>%
 485 |       dplyr::select(SAMPLE_ID) %>% 
 486 |       mutate_if(is.factor, as.character) %>%
 487 |       mutate(Highest_level = "LEVEL_1_MSI-H_TMB-H") %>%
 488 |       distinct()
 489 |     data_freeze <- filter(data_freeze, !SAMPLE_ID %in% msi_tmb_df$SAMPLE_ID)
 490 |   } 
 491 |   
 492 |   #####
 493 |   
 494 |   # Clean
 495 |   clin_df <- clin_df[as.character(clin_df$SAMPLE_ID) %in% data_freeze$SAMPLE_ID,]
 496 |   colnames(data_freeze)[which(names(data_freeze) == consent_col)] <- "consent"
 497 |   
 498 |   # Set order
 499 |   cancer_order_other <- as.character(unique(prop_level_df[,c(group_col)]))
 500 |   
 501 |   # Create data frame that counts the number of actionable oncogenic alterations
 502 |   alt_final$alt_count <- 1
 503 |   alt_final <- dplyr::select(alt_final, SAMPLE_ID, alt_count)
 504 |   
 505 |   # Filter for status
 506 |   if (status == "germline") {
 507 |     clin_df <- clin_df[clin_df$SAMPLE_ID %in% as.character(filter(data_freeze, consent == "YES")$SAMPLE_ID),]
 508 |   }
 509 |   
 510 |   # Add in samples that don't have an actionable alteration
 511 |   alt_final_none <- as.data.frame(clin_df[,c("SAMPLE_ID")])
 512 |   colnames(alt_final_none)[1] <- "SAMPLE_ID"
 513 |   alt_final_none$alt_count <- 0
 514 |   alt_final <- rbind(alt_final, alt_final_none)
 515 |   alt_final <- aggregate(alt_count ~ SAMPLE_ID, alt_final, sum)
 516 |   
 517 |   # Add cancer subtypes to clinical data frame and create labels
 518 |   alt_final <- left_join(alt_final, data_freeze[,c("SAMPLE_ID", group_col)], by = "SAMPLE_ID")
 519 |   alt_final$label <- ifelse(alt_final$alt_count >= 3, "3+", alt_final$alt_count)
 520 |   
 521 |   # Calculate the percentage of each count by subtype
 522 |   prop_alt_count_df <- as.data.frame(freq_dataframe(alt_final, group_col, "label"))
 523 |   prop_alt_count_df$freq[is.na(prop_alt_count_df$freq)] <- 0
 524 |   
 525 |   # Set order
 526 |   prop_alt_count_df$label <- factor(prop_alt_count_df$label,  levels = c("0", "1", "2", "3+"))
 527 |   prop_alt_count_df$group <- factor(prop_alt_count_df[,group_col],
 528 |                                     levels = cancer_order_other)
 529 |   
 530 |   # Save
 531 |   write.table(prop_alt_count_df, "actionability_count_table.txt", sep = "\t", row.names = F, quote = F)
 532 |   
 533 |   # Number of alterations plot
 534 |   alt_per_num_prop_plot <- ggplot(prop_alt_count_df, aes(y = freq, x = group, fill = label)) +
 535 |     geom_col(position = position_stack(reverse = FALSE)) +
 536 |     ylab("Frequency") +
 537 |     labs(fill = "# of Actionable Alterations") +
 538 |     theme(axis.text.x = element_blank(),
 539 |           axis.ticks.x = element_blank(),
 540 |           axis.text.y = element_text(size = 6),
 541 |           panel.border = element_rect(colour = "black", fill=NA, size=1),
 542 |           panel.background = element_blank(),
 543 |           panel.grid.major = element_blank(),
 544 |           panel.grid.minor = element_blank(),
 545 |           axis.line = element_line(colour = "black"),
 546 |           legend.title = element_text(size = 8),
 547 |           legend.text = element_text(size = 6),
 548 |           legend.key.size = unit(0.4, "cm"),
 549 |           axis.title.x = element_blank(),
 550 |           plot.margin = unit(c(0.05, 0.05, 0.05, 0.05), "cm"),
 551 |           legend.justification="left",
 552 |           legend.margin=margin(0,0,0,0),
 553 |           legend.box.margin=margin(-10,0,-10,-5)) +
 554 |     scale_y_continuous(limits=c(0, 1.00), expand = c(0, 0))  +
 555 |     scale_fill_manual(values = c("#F7E690", "#F7AA14", "#E17202" ,"#701C5A"),
 556 |                       limits = c("0", "1", "2", "3+"),
 557 |                       labels = c("0", "1", "2", "3+")) +
 558 |     scale_x_discrete(limits = cancer_order_other)
 559 |   alt_per_num_prop_plot
 560 |   
 561 |   return(alt_per_num_prop_plot)
 562 |   
 563 | }
 564 | 
 565 | # Create actionability alterations main plot
 566 | action_main_fun <- function(cna_df, mut_df, fus_df, clin_df, data_freeze,
 567 |                             path_df,
 568 |                             tsg_list, fusion_list,
 569 |                             prop_level_df = "./actionability_levels_barplot_table.txt",
 570 |                             group_col,
 571 |                             consent_col,
 572 |                             alt_min = 1,
 573 |                             status = c("somatic", "germline", "both"),
 574 |                             gene_order,
 575 |                             only_highest_level = F,
 576 |                             msi_tmb_status,
 577 |                             msi_tmb_df,
 578 |                             include_oncogenic = F){
 579 |   
 580 |   # Read in data
 581 |   cna_df <- read.delim(cna_df)
 582 |   fus_df <- read.delim(fus_df)
 583 |   mut_df <- read.delim(mut_df)
 584 |   clin_df <- read.delim(clin_df)
 585 |   data_freeze <- read.delim(data_freeze)
 586 |   tsg_df <- read.delim(tsg_list, header = F)
 587 |   prop_level_df <- read.delim(prop_level_df)
 588 |   
 589 |   # Set order
 590 |   cancer_order_other <- as.character(unique(prop_level_df[,c(group_col)]))
 591 |   
 592 |   # Clean and filter data
 593 |   # Data freeze
 594 |   colnames(data_freeze)[which(names(data_freeze) == group_col)] <- "cancer_type"
 595 |   colnames(data_freeze)[which(names(data_freeze) == consent_col)] <- "consent"
 596 |   data_freeze <- data_freeze %>%
 597 |     mutate_if(is.factor, as.character)
 598 |   
 599 |   # Optional MSI/TMB addition
 600 |   if (msi_tmb_status == TRUE){
 601 |     msi_tmb_df <- read.delim(msi_tmb_path)
 602 |     msi_tmb_df <- msi_tmb_df %>%
 603 |       dplyr::select(SAMPLE_ID) %>% 
 604 |       mutate_if(is.factor, as.character) %>%
 605 |       mutate(Highest_level = "LEVEL_1_MSI-H_TMB-H") %>%
 606 |       distinct()
 607 |     data_freeze <- filter(data_freeze, !SAMPLE_ID %in% msi_tmb_df$SAMPLE_ID)
 608 |   } 
 609 |   
 610 |   # Clinical
 611 |   clin_df <- clin_df %>%
 612 |     filter(SAMPLE_ID %in% data_freeze$SAMPLE_ID)
 613 |   
 614 |   # CNA
 615 |   cna_df <- cna_df %>%
 616 |     dplyr::rename_all(recode, 
 617 |                       Tumor_Sample_Barcode = "SAMPLE_ID", 
 618 |                       HIGHEST_LEVEL = "Highest_level",
 619 |                       ONCOGENIC = "oncogenic") %>%
 620 |     mutate(SAMPLE_ID = as.character(SAMPLE_ID),
 621 |            Highest_level = as.character(Highest_level)) %>%
 622 |     filter(SAMPLE_ID %in% data_freeze$SAMPLE_ID,
 623 |            grepl("Oncogenic", oncogenic)) 
 624 |   
 625 |   # Fusions
 626 |   fus_df <- fus_df %>%
 627 |     dplyr::rename_all(recode, 
 628 |                       Tumor_Sample_Barcode = "SAMPLE_ID", 
 629 |                       HIGHEST_LEVEL = "Highest_level",
 630 |                       ONCOGENIC = "oncogenic") %>%
 631 |     mutate(SAMPLE_ID = as.character(SAMPLE_ID),
 632 |            Highest_level = as.character(Highest_level)) %>%
 633 |     filter(SAMPLE_ID %in% data_freeze$SAMPLE_ID,
 634 |            grepl("Oncogenic", oncogenic))
 635 |   
 636 |   # Mutations
 637 |   mut_df <- mut_df %>%
 638 |     dplyr::rename_all(recode, 
 639 |                       Tumor_Sample_Barcode = "SAMPLE_ID", 
 640 |                       HIGHEST_LEVEL = "Highest_level",
 641 |                       ONCOGENIC = "oncogenic") %>%
 642 |     mutate(SAMPLE_ID = as.character(SAMPLE_ID),
 643 |            Highest_level = as.character(Highest_level)) %>%
 644 |     filter(SAMPLE_ID %in% data_freeze$SAMPLE_ID,
 645 |            grepl("Oncogenic", oncogenic))
 646 |   
 647 |   # Set tumor suppresor list
 648 |   tumor_suppressor_list <- as.character(tsg_df$V1)
 649 |   
 650 |   # Make count data frame - consider somatic/germline/both
 651 |   if (status == "germline") {
 652 |     data_freeze <- filter(data_freeze, consent == "YES")
 653 |     clin_oncotree_freq <- as.data.frame(table(data_freeze$cancer_type))
 654 |     colnames(clin_oncotree_freq)[] <- c("cancer_type", "total_count")
 655 |     mut_df <- mut_df[mut_df$SAMPLE_ID %in% data_freeze$SAMPLE_ID,]
 656 |   } else if (status == "somatic") {
 657 |     clin_oncotree_freq <- as.data.frame(table(data_freeze$cancer_type))
 658 |     colnames(clin_oncotree_freq)[] <- c("cancer_type", "total_count")
 659 |   } else {
 660 |     clin_oncotree_freq <- as.data.frame(table(data_freeze$cancer_type))
 661 |     data_freeze_1 <- filter(data_freeze, consent == "YES")
 662 |     data_freeze_1$SAMPLE_ID <- as.character(data_freeze_1$SAMPLE_ID)
 663 |     clin_oncotree_freq_1 <- as.data.frame(table(data_freeze_1$cancer_type))
 664 |     clin_oncotree_freq <- left_join(clin_oncotree_freq, clin_oncotree_freq_1, by = "Var1")
 665 |     colnames(clin_oncotree_freq)[] <- c("cancer_type", "total_count", "germ_count")
 666 |     # Remove samples that have germline alterations but ARE NOT Part C consented
 667 |     remove_list <- intersect(filter(mut_df, Mutation_Status == "GERMLINE")$SAMPLE_ID,
 668 |                              filter(data_freeze, consent == "NO")$SAMPLE_ID)
 669 |     mut_df <- mut_df[!(mut_df$SAMPLE_ID %in% remove_list),]
 670 |   }
 671 |   
 672 |   
 673 |   # Create CNA data frame, combine with pathways and tumor suppresor list
 674 |   cna_df <- cna_df %>%
 675 |     inner_join(dplyr::select(data_freeze, SAMPLE_ID, cancer_type), by = "SAMPLE_ID") %>%
 676 |     dplyr::select(SAMPLE_ID, HUGO_SYMBOL, ALTERATION, LEVEL_1, LEVEL_2, LEVEL_3A,
 677 |                   LEVEL_3B, LEVEL_4, Highest_level, oncogenic, cancer_type) %>%
 678 |     distinct() %>%
 679 |     filter(is.na(Highest_level) == F) %>%
 680 |     mutate(ALTERATION = substring(ALTERATION, 1, 3)) %>%
 681 |     dplyr::select(SAMPLE_ID, HUGO_SYMBOL, ALTERATION, Highest_level, oncogenic, cancer_type) %>%
 682 |     dplyr::rename(sample_id = SAMPLE_ID, 
 683 |                   gene_symbol = HUGO_SYMBOL, 
 684 |                   alteration = ALTERATION, 
 685 |                   highest_level = Highest_level) %>%
 686 |     mutate(onco_type = ifelse(gene_symbol %in% tumor_suppressor_list, "tumor_suppresor", NA))
 687 |   
 688 |   # Create fusion data frame
 689 |   # Combine fusions where the hugo gene symbol is counted twice (impact and archer)
 690 |   fus_df <- fus_df %>%
 691 |     inner_join(dplyr::select(data_freeze, SAMPLE_ID, cancer_type), by = "SAMPLE_ID") %>%
 692 |     dplyr::select(SAMPLE_ID, Hugo_Symbol, Fusion, LEVEL_1, LEVEL_2, LEVEL_3A,
 693 |                   LEVEL_3B, LEVEL_4, Highest_level, oncogenic, cancer_type) %>%
 694 |     mutate_if(is.factor, as.character) %>%
 695 |     mutate(Fusion = gsub(" fusion", "", Fusion)) %>%
 696 |     mutate(Fusion = gsub(" - Archer", "", Fusion)) %>%
 697 |     rowwise() %>%
 698 |     mutate(Fusion = ifelse(grepl("intragenic", Fusion), Fusion,
 699 |                            paste(sort(unlist(strsplit(Fusion, "-", fixed = TRUE))), collapse = "-"))) %>%
 700 |     ungroup() %>%
 701 |     distinct()
 702 |   
 703 |   # If fusion list is provided, select the gene partner of interest based on the list
 704 |   if (missing(fusion_list) == FALSE) {
 705 |     # Read in fusion list
 706 |     fusion_list <- read.delim(fusion_list, header = F)
 707 |     fusion_list <- as.character(fusion_list$V1)
 708 |     fusion_list_collapse <- paste0("\\b", paste(fusion_list , collapse="\\b|\\b"), "\\b")
 709 |     # Filter for fusion list or full fusion name
 710 |     fus_df <- fus_df %>% 
 711 |       mutate_if(is.factor, as.character) %>%
 712 |       mutate(Fusion = ifelse(Hugo_Symbol %in% fusion_list, Hugo_Symbol,
 713 |                              ifelse(grepl(fusion_list_collapse, Fusion) == F, Fusion, "REMOVE"))) %>%
 714 |       filter(Fusion != "REMOVE") %>%
 715 |       mutate(Fusion = gsub("-intragenic", "", Fusion))
 716 |   }
 717 |   
 718 |   # Clean, add tumor suppresor columns
 719 |   fus_df <- fus_df %>%
 720 |     filter(Highest_level != "") %>%
 721 |     mutate(Alteration = "Fus") %>%
 722 |     dplyr::select(SAMPLE_ID, Fusion, Alteration, Highest_level, oncogenic, cancer_type) %>%
 723 |     dplyr::rename(sample_id = SAMPLE_ID,
 724 |                   gene_symbol = Fusion,
 725 |                   alteration = Alteration,
 726 |                   highest_level = Highest_level) %>%
 727 |     mutate(onco_type = ifelse(gene_symbol %in% tumor_suppressor_list, "tumor_suppresor", NA)) %>%
 728 |     distinct()
 729 |   
 730 |   # Collapse NTRK fusions
 731 |   # Other fusions can be added to this list moving forward
 732 |   fus_df <- fus_df %>%
 733 |     mutate_if(is.factor, as.character) %>%
 734 |     mutate(gene_symbol = ifelse(gene_symbol %in% c("NTRK1", "NTRK2", "NTRK3"), "NTRK1/2/3", gene_symbol)) %>%
 735 |     distinct()
 736 |   
 737 |   # Filter for mutation status
 738 |   if (status == "somatic") {
 739 |     mut_df <- filter(mut_df, Mutation_Status != "GERMLINE" | is.na(Mutation_Status) == T)
 740 |   } else if (status == "germline") {
 741 |     mut_df <- filter(mut_df, Mutation_Status == "GERMLINE")
 742 |   }
 743 |   
 744 |   # Mutation
 745 |   mut_df <- mut_df %>%
 746 |     inner_join(dplyr::select(data_freeze, SAMPLE_ID, cancer_type), by = "SAMPLE_ID") %>%
 747 |     dplyr::select(SAMPLE_ID, Hugo_Symbol, Variant_Type, LEVEL_1, LEVEL_2, LEVEL_3A,
 748 |                   LEVEL_3B, LEVEL_4, Highest_level, oncogenic, cancer_type, HGVSp_Short, Mutation_Status)
 749 |   # Add in oncogenic here if included
 750 |   if (include_oncogenic == T) {
 751 |     mut_df <- mut_df %>%
 752 |       mutate(ONCOGENIC = "ONCOGENIC")
 753 |   } 
 754 |   mut_df <- melt(mut_df, id.vars = c("SAMPLE_ID", "Hugo_Symbol", "Variant_Type", "Highest_level",
 755 |                                      "oncogenic", "cancer_type", "HGVSp_Short", "Mutation_Status"))
 756 |   
 757 |   # Aggregate by everything but strip for the highest level
 758 |   # This is just in case there is a gene alteration that has more than one level
 759 |   # Add pathways and tumor suppressor column
 760 |   # Remove duplicates if they are in the same pathway (use order of input df)
 761 |   mut_df <- mut_df %>%
 762 |     mutate_if(is.factor, as.character) %>%
 763 |     filter(value != "") %>%
 764 |     dplyr::select(-value, -Highest_level) %>%
 765 |     dplyr::rename(highest_level = variable) %>%
 766 |     filter(is.na(highest_level) == F) %>%
 767 |     mutate(highest_level == as.character(highest_level),
 768 |            Mutation_Status = ifelse(is.na(Mutation_Status) == T, "", Mutation_Status)) %>%
 769 |     group_by(SAMPLE_ID, Hugo_Symbol, Variant_Type, oncogenic, cancer_type, HGVSp_Short, Mutation_Status) %>%
 770 |     dplyr::summarise(highest_level = toString(highest_level)) %>%
 771 |     ungroup() %>%
 772 |     mutate(highest_level = gsub(",.*", "", highest_level),
 773 |            alteration = "Mut") %>%
 774 |     mutate(onco_type = ifelse(Hugo_Symbol %in% tumor_suppressor_list, "tumor_suppresor", NA)) %>%
 775 |     ###
 776 |     ### work in progress
 777 |     mutate(Hugo_Symbol = ifelse(Hugo_Symbol == "BRAF" & HGVSp_Short == "p.V600E", "BRAF_V600E", Hugo_Symbol)) %>%
 778 |     mutate(Hugo_Symbol = ifelse(Hugo_Symbol == "BRAF" & HGVSp_Short != "p.V600E", "BRAF_Other", Hugo_Symbol)) %>%
 779 |     ###
 780 |     ###
 781 |     distinct() %>%
 782 |     rename(gene_symbol = Hugo_Symbol) %>%
 783 |     mutate(gene_symbol = as.character(gene_symbol)) %>%
 784 |     dplyr::select(SAMPLE_ID, gene_symbol, alteration, highest_level, oncogenic, cancer_type, Mutation_Status, onco_type) %>%
 785 |     dplyr::rename(sample_id = SAMPLE_ID) %>%
 786 |     mutate(onco_type = ifelse(Mutation_Status == "GERMLINE", "germline", onco_type),
 787 |            gene_symbol = ifelse(Mutation_Status == "GERMLINE", paste0(gene_symbol, "*"), gene_symbol)) %>%
 788 |     dplyr::select(-Mutation_Status) %>%
 789 |     group_by(sample_id, gene_symbol, alteration, highest_level, oncogenic, cancer_type, onco_type) %>%
 790 |     dplyr::slice(1) %>%
 791 |     ungroup()
 792 |   
 793 |   # Filter for status
 794 |   # Combine CNA, FUS, and MUT - create final df
 795 |   if (status == "somatic" | status == "both") {
 796 |     gene_final_df <- rbind(cna_df, fus_df)
 797 |     gene_final_df <- rbind(gene_final_df, mut_df)
 798 |   } else if (status == "germline") {
 799 |     gene_final_df <- mut_df
 800 |   }
 801 |   
 802 |   # Optional include oncogenic alterations in plot
 803 |   if (include_oncogenic == T){
 804 |     gene_final_df <- gene_final_df %>%
 805 |       mutate(highest_level = ifelse((is.na(highest_level) == T | highest_level == "") &
 806 |                                       grepl("Oncogenic", oncogenic) == T, "ONCOGENIC", highest_level))
 807 |   } 
 808 |   
 809 |   # Combine all tumor suppressor alterations (del, mut, fus)
 810 |   # If the alteration is on a tumor suppresor, ignore alteration label
 811 |   # Clean up gene symbol, remove everything after the comma
 812 |   # Remove mutation label to clean up y axis
 813 |   gene_final_df <- gene_final_df %>%
 814 |     filter(is.na(highest_level) == F & highest_level != "") %>%
 815 |     mutate_if(is.factor, as.character) %>%
 816 |     mutate(onco_type = ifelse(is.na(onco_type) == T, "oncogene", onco_type)) %>%
 817 |     group_by(sample_id, gene_symbol, highest_level, cancer_type, onco_type) %>%
 818 |     dplyr::summarise(alteration = toString(alteration)) %>%
 819 |     ungroup() %>%
 820 |     mutate(alteration = as.character(alteration)) %>%
 821 |     mutate(alteration = ifelse(onco_type == "tumor_suppresor", "Del", alteration)) %>%
 822 |     mutate(alteration = gsub(",.*", "", alteration)) %>%
 823 |     mutate(gene_symbol_label = gsub(" Mut", "", paste0(gene_symbol, " ", alteration)))
 824 | 
 825 |   # Optional select only the highest level
 826 |   if (only_highest_level == T){
 827 |     colnames(clin_df)[which(names(clin_df) == "SAMPLE_ID")] <- "sample_id"
 828 |     gene_final_df <- gene_final_df %>%
 829 |       left_join(dplyr::select(clin_df, sample_id, HIGHEST_LEVEL), by = "sample_id") %>%
 830 |       mutate_if(is.factor, as.character) %>%
 831 |       filter(HIGHEST_LEVEL == highest_level)
 832 |   }
 833 |   
 834 |   # Manual alterations
 835 |   ###
 836 |   ### work in progress
 837 |   gene_final_df <- gene_final_df %>% 
 838 |     mutate_if(is.factor, as.character) %>%
 839 |     mutate(gene_symbol = ifelse(gene_symbol %in% c("BRCA1", "BRCA2"), "BRCA1/2", gene_symbol),
 840 |            gene_symbol_label = ifelse(gene_symbol == "BRCA1/2", "BRCA1/2 Del", gene_symbol_label)) %>%
 841 |     mutate(gene_symbol = ifelse(gene_symbol %in% c("CHEK1", "CHEK2"), "CHEK1/2", gene_symbol),
 842 |            gene_symbol_label = ifelse(gene_symbol == "CHEK1/2", "CHEK1/2 Del", gene_symbol_label)) %>%
 843 |     mutate(gene_symbol = ifelse(gene_symbol %in% c("TSC1", "TSC2"), "TSC1/2", gene_symbol),
 844 |            gene_symbol_label = ifelse(gene_symbol == "TSC1/2", "TSC1/2 Del", gene_symbol_label)) %>% 
 845 |     distinct()
 846 |   ###
 847 |   ###
 848 |   
 849 |   # Calculate the percentage of each count by subtype
 850 |   # Only select the highest level
 851 |   prop_main_plot_df <- gene_final_df %>%
 852 |     group_by(cancer_type, gene_symbol_label, highest_level) %>%
 853 |     dplyr::summarise(n = n()) %>%
 854 |     ungroup() %>%
 855 |     left_join(clin_oncotree_freq, by = "cancer_type") %>%
 856 |     dplyr::mutate(freq = n /total_count) %>%
 857 |     group_by(cancer_type, gene_symbol_label) %>%
 858 |     arrange(highest_level) %>%
 859 |     dplyr::slice(1) %>%
 860 |     ungroup() %>%
 861 |     dplyr::mutate(percentage = 100*freq,
 862 |                   label_text = round(percentage, 0),
 863 |                   label_text = ifelse(percentage > 0 & percentage < 1, " ", label_text))
 864 |   
 865 |   # Optional add pathway if provided, if not use it to set gene list
 866 |   if (missing(path_df) == T) {
 867 |     path_df <- gene_final_df  %>%
 868 |       left_join(prop_main_plot_df) %>%
 869 |       dplyr::select(gene_symbol_label, highest_level, cancer_type, percentage) %>%
 870 |       distinct() %>%
 871 |       group_by(gene_symbol_label, highest_level) %>%
 872 |       mutate(count = n()) %>%
 873 |       ungroup() %>%
 874 |       arrange(highest_level, desc(count), desc(percentage), gene_symbol_label) %>%
 875 |       group_by(gene_symbol_label) %>%
 876 |       dplyr::slice(1) %>%
 877 |       ungroup() %>%
 878 |       arrange(highest_level, desc(count), desc(percentage), gene_symbol_label) %>%
 879 |       mutate(pathway = row_number()) %>%
 880 |       dplyr::select(gene_symbol_label, pathway)
 881 |     gene_final_df <- gene_final_df %>% left_join(path_df)
 882 |   } else {
 883 |     path_df <- read.delim(path_df)
 884 |     colnames(path_df)[] <- c("gene_symbol", "pathway")
 885 |     gene_final_df <- gene_final_df %>% left_join(path_df)
 886 |   }
 887 |   
 888 |   # Add germline label if figure includes both somatic and germline
 889 |   if (status == "both") {
 890 |     prop_main_plot_df$freq <- ifelse(grepl("\\*",prop_main_plot_df$gene_symbol_label) == TRUE,
 891 |                                      prop_main_plot_df$n/prop_main_plot_df$germ_count,
 892 |                                      prop_main_plot_df$freq)
 893 |   }
 894 |   
 895 |   # Add pathways
 896 |   prop_main_plot_df <- prop_main_plot_df %>%
 897 |     left_join(dplyr::select(gene_final_df, gene_symbol_label, gene_symbol), by = "gene_symbol_label") %>%
 898 |     left_join(dplyr::select(gene_final_df, gene_symbol, cancer_type, highest_level, pathway, onco_type),
 899 |               by = c("gene_symbol", "cancer_type", "highest_level")) %>%
 900 |     group_by(gene_symbol, gene_symbol_label, cancer_type, pathway, onco_type, n, total_count, percentage, freq, label_text) %>%
 901 |     dplyr::summarise(highest_level = toString(highest_level)) %>%
 902 |     ungroup() %>%
 903 |     mutate(highest_level_label = gsub(",.*", "", highest_level)) %>%
 904 |     dplyr::select(-highest_level) %>%
 905 |     dplyr::arrange(pathway, gene_symbol, highest_level_label, cancer_type)
 906 |   
 907 |   # Only keep rows where at least one subtype meets the percetage threshold (alt_min)
 908 |   prop_main_plot_df_filter <- prop_main_plot_df %>%
 909 |     dplyr::select(gene_symbol_label, percentage) %>%
 910 |     group_by(gene_symbol_label) %>%
 911 |     filter(percentage == max(percentage)) %>%
 912 |     filter(percentage < alt_min)
 913 |   prop_main_plot_df <- prop_main_plot_df %>%
 914 |     filter(!gene_symbol_label %in%prop_main_plot_df_filter$gene_symbol_label) %>%
 915 |     mutate(cancer_type = factor(cancer_type, levels = cancer_order_other))
 916 |   
 917 |   # Set gene order manually
 918 |   if (missing(gene_order) == F) {
 919 |     gene_order <- read.delim(gene_order, header = F)
 920 |     gene_order <- as.data.frame(gene_order[rep(seq_len(nrow(gene_order)), each = 2), ])
 921 |     colnames(gene_order)[] <- c("gene_symbol")
 922 |     gene_order <- gene_order %>%
 923 |       mutate_if(is.factor, as.character) %>%
 924 |       mutate(order = seq(1:nrow(gene_order)),
 925 |              gene_symbol = ifelse(order %% 2 == 0, paste0(gene_symbol, "*"), gene_symbol))
 926 |     prop_main_plot_df <- prop_main_plot_df %>%
 927 |       left_join(gene_order, by = "gene_symbol") %>%
 928 |       dplyr::arrange(order)
 929 |   }
 930 |   
 931 |   # Get text color order
 932 |   text_tsg_col <- prop_main_plot_df %>%
 933 |     dplyr::select(gene_symbol_label, onco_type) %>%
 934 |     distinct() %>%
 935 |     dplyr::arrange(onco_type) %>%
 936 |     group_by(gene_symbol_label) %>%
 937 |     dplyr::summarise(onco_type = toString(onco_type)) %>%
 938 |     ungroup() %>%
 939 |     mutate(col = ifelse(onco_type != "tumor_suppresor", ifelse(onco_type == "oncogene", "#7E1116", "#4F0043"), "#191A57")) %>%
 940 |     group_by(gene_symbol_label, col) %>%
 941 |     slice(1) %>%
 942 |     ungroup() %>%
 943 |     mutate(gene_symbol_label = factor(gene_symbol_label, levels = unique(prop_main_plot_df$gene_symbol_label))) %>%
 944 |     dplyr::arrange(gene_symbol_label)
 945 |   
 946 |   # Write out data frame
 947 |   write.table(prop_main_plot_df, "actionability_main_plot_data.txt", sep = "\t", quote = F, row.names = F)
 948 |   
 949 |   # Create main plot
 950 |   action_tile_plot_all <- ggplot(data = prop_main_plot_df, aes(x = cancer_type, y = gene_symbol_label)) +
 951 |     geom_tile(aes(fill = highest_level_label)) +
 952 |     geom_text(aes(label = label_text), colour = "white", size = 2) +
 953 |     theme(panel.grid.major = element_blank(),
 954 |           axis.text.x = element_blank(),
 955 |           axis.text.y = element_text(size = 6), # colour = rev(text_tsg_col$col)),
 956 |           panel.background = element_blank(),
 957 |           panel.border = element_rect(colour = "black", fill=NA, size=0.5),
 958 |           axis.title.x = element_blank(),
 959 |           axis.ticks.x = element_blank(),
 960 |           axis.title.y = element_text(size = 8),
 961 |           plot.margin = unit(c(0.05, 0.05, 0.05, 0.05), "cm"),
 962 |           legend.title = element_text(size = 8),
 963 |           legend.text = element_text(size = 6),
 964 |           legend.justification="left",
 965 |           legend.margin=margin(0,0,0,0),
 966 |           legend.box.margin=margin(-10,0,-10,-5)) +
 967 |     geom_vline(xintercept=seq(1.5, length(levels(prop_main_plot_df$cancer_type))-0.5, 1),
 968 |                lwd=0.25, colour="gray80") +
 969 |     geom_hline(yintercept=seq(1.5, length(unique(prop_main_plot_df$gene_symbol_label))-0.5, 1),
 970 |                lwd=0.25, colour="gray80") +
 971 |     scale_fill_manual(values = c("#88E281","#33A02C", "#1F78B4", "#984EA3", "#BE98CE", "#a8a8a8", "#ffdab9", "gray90"),
 972 |                       limits = c("LEVEL_1_MSI-H_TMB-H","LEVEL_1", "LEVEL_2", "LEVEL_3A", "LEVEL_3B", "LEVEL_4", "ONCOGENIC", "NO_LEVEL"),
 973 |                       labels = c("LEVEL 1 MSI/TMB-H","LEVEL 1", "LEVEL 2", "LEVEL 3A", "LEVEL 3B", "LEVEL 4", "ONCOGENIC", "NO LEVEL")) +
 974 |     scale_y_discrete(limits = rev(unique(prop_main_plot_df$gene_symbol_label)),
 975 |                      labels = gsub("_", " ", rev(unique(prop_main_plot_df$gene_symbol_label))),
 976 |                      expand = c(0,0)) +
 977 |     scale_x_discrete(limits = cancer_order_other) +
 978 |     labs(fill = "Highest Level\nof Evidence") +
 979 |     guides(fill = guide_legend(override.aes = list(size = 1)))
 980 |   
 981 |   return(action_tile_plot_all)
 982 | }
 983 | 
 984 | 
 985 | # Create actionability TMB-H & MSI-H main plot add-on
 986 | action_main_msi_tmb_fun <- function(clin_df, 
 987 |                                     data_freeze,
 988 |                                     group_col,
 989 |                                     prop_level_df = "./actionability_levels_barplot_table.txt",
 990 |                                     msi_tmb_df){
 991 |   # Read in data
 992 |   data_freeze <- read.delim(data_freeze)
 993 |   clin_df <- read.delim(clin_df)
 994 |   msi_tmb_df <- read.delim(msi_tmb_df)
 995 |   prop_level_df <-  read.delim(prop_level_df)
 996 |   
 997 |   # Set order
 998 |   cancer_order_other <- as.character(unique(prop_level_df[,c(group_col)]))
 999 |   
1000 |   # Get MSI/TMB frequency
1001 |   aty_alt_df <- msi_tmb_df %>% 
1002 |     mutate_if(is.factor, as.character) %>%
1003 |     dplyr::select(SAMPLE_ID, ALTERATION, HIGHEST_LEVEL) %>%
1004 |     right_join(data_freeze) %>%
1005 |     dplyr::select(CANCER_TYPE, ALTERATION) %>%
1006 |     dplyr::mutate_if(is.factor, as.character) %>%
1007 |     dplyr::mutate(ALTERATION = ifelse(is.na(ALTERATION) == T, "NONE", ALTERATION))
1008 |   aty_alt_df <- aty_alt_df %>%
1009 |     left_join(dplyr::count(dplyr::select(group_by(aty_alt_df, CANCER_TYPE), CANCER_TYPE))) %>%
1010 |     dplyr::rename(total_count = n) %>%
1011 |     mutate_if(is.character, as.factor) %>%
1012 |     group_by(ALTERATION, CANCER_TYPE, total_count) %>%
1013 |     dplyr::summarise(n = n()) %>%
1014 |     dplyr::mutate(freq = n/total_count,
1015 |                   percentage = 100*freq,
1016 |                   label_text = as.character(round(percentage, 0)),
1017 |                   label_text_final = ifelse(percentage > 0 & percentage < 1, "", label_text)) %>%
1018 |     filter(ALTERATION != "NONE") %>%
1019 |     dplyr::mutate(ALTERATION = factor(ALTERATION, levels = c("TMB-H", "MSI-H")),
1020 |                   label = "MSI-H & TMB-H") 
1021 |   
1022 |   # Plot
1023 |   aty_alt_tile_plot_all <- ggplot(data = aty_alt_df, aes(x = CANCER_TYPE, y = ALTERATION)) +
1024 |     geom_tile(aes(fill = label)) +
1025 |     geom_text(aes(label = label_text_final), colour = "black", size = 2) +
1026 |     theme(panel.grid.major = element_blank(),
1027 |           axis.text.x = element_blank(),
1028 |           axis.text.y = element_text(size = 6),
1029 |           panel.background = element_blank(),
1030 |           panel.border = element_rect(colour = "black", fill=NA, size=1),
1031 |           axis.title.x = element_blank(),
1032 |           axis.title.y = element_blank(),
1033 |           axis.ticks.x = element_blank(),
1034 |           plot.margin = unit(c(0.05, 0.05, 0.1, 0.05), "cm"),
1035 |           legend.justification="left",
1036 |           legend.margin = margin(0,0,0,0),
1037 |           legend.box.margin = margin(-10,0,-10,-5),
1038 |           legend.title = element_blank(),
1039 |           legend.text = element_text(size = 6),
1040 |           legend.key.size = unit(0.4, "cm")) +
1041 |     geom_vline(xintercept=seq(1.5, length(levels(aty_alt_df$CANCER_TYPE))-0.5, 1),
1042 |                lwd=0.5, colour="white") +
1043 |     geom_hline(yintercept=seq(1.5, length(unique(aty_alt_df$ALTERATION))-0.5, 1),
1044 |                lwd=0.25, colour="white") +
1045 |     scale_fill_manual(values = c("#88E281"),
1046 |                       limits = c("MSI-H & TMB-H"),
1047 |                       labels = c("MSI-H & TMB-H")) +
1048 |     scale_y_discrete(limits = levels(aty_alt_df$ALTERATION),
1049 |                      expand = c(0,0)) +
1050 |     scale_x_discrete(limits = cancer_order_other)
1051 |   
1052 |   return(aty_alt_tile_plot_all)
1053 |   
1054 | }
1055 | 
1056 | 
1057 | 
1058 | #--------------


--------------------------------------------------------------------------------
/data/example_atypical_alterations.txt:
--------------------------------------------------------------------------------
 1 | HUGO_SYMBOL	SAMPLE_ID	ALTERATION
 2 | Other Biomarkers	TCGA-A6-2672-01	MSI-H
 3 | Other Biomarkers	TCGA-A6-2672-01	Microsatellite Instability-High
 4 | Other Biomarkers	TCGA-AG-A002-01	TMB-H
 5 | Other Biomarkers	TCGA-AG-A002-01	Tumor Mutational Burden-High
 6 | EGFR	TCGA-FAKE-01	vIII
 7 | EGFR	TCGA-FAKE-02	vV
 8 | FLT3	TCGA-FAKE-01	Internal tandem duplication
 9 | FLT3	TCGA-FAKE-02	ITD
10 | BRAF	TCGA-FAKE-01	Kinase Domain Duplication
11 | BRAF	TCGA-FAKE-03	KDD
12 | EGFR	TCGA-FAKE-01	C-terminal domain
13 | EGFR	TCGA-FAKE-02	CTD
14 | 


--------------------------------------------------------------------------------
/data/example_clinical.txt:
--------------------------------------------------------------------------------
 1 | SAMPLE_ID	ONCOTREE_CODE
 2 | TCGA-05-4417-01	LUAD
 3 | TCGA-02-0033-01	BLLETV6RUNX1
 4 | TCGA-06-0155-01	GBM
 5 | TCGA-AG-A002-01	READ
 6 | TCGA-A6-2672-01	COAD
 7 | TCGA-FAKE-01	AML
 8 | TCGA-FAKE-02	AML
 9 | TCGA-FAKE-03	HDCN
10 | TCGA-A6-2672-01A-01W-0833-10	MEL
11 | 


--------------------------------------------------------------------------------
/data/example_cna.txt:
--------------------------------------------------------------------------------
1 | Gene Symbol	Locus ID	Cytoband	TCGA-05-4417-01	TCGA-02-0033-01
2 | MET	0	0	2	2
3 | ERBB2	0	0	2	1
4 | CDK4	0	0	-2	2
5 | CDK4	0	0	-1	2
6 | 


--------------------------------------------------------------------------------
/data/example_fusions.txt:
--------------------------------------------------------------------------------
1 | Tumor_Sample_Barcode	Fusion
2 | TCGA-02-0033-01	MLL2-intragenic
3 | TCGA-05-4417-01	ALK-EML4
4 | TCGA-06-0155-01	EGFR-intragenic
5 | TCGA-06-0155-01	TMPRSS2-ERG
6 | TCGA-05-4417-01	TMPRSS2-ERG
7 | TCGA-06-0155-01	ERBB2-intragenic
8 | TCGA-02-0033-01	ETV6-RUNX1 fusion
9 | 


--------------------------------------------------------------------------------
/data/example_individual_cna.txt:
--------------------------------------------------------------------------------
 1 | Tumor_Sample_Barcode	Hugo_Symbol	Copy_Number_Alteration
 2 | TCGA-05-4417-01	MET	2
 3 | TCGA-05-4417-01	ERBB2	2
 4 | TCGA-05-4417-01	CDK4	-2
 5 | TCGA-05-4417-01	CDK4	-1
 6 | TCGA-02-0033-01	MET	2
 7 | TCGA-02-0033-01	ERBB2	1
 8 | TCGA-02-0033-01	CDK4	2
 9 | TCGA-02-0033-01	CDK4	2
10 | TCGA-05-4417-01	MET	Amplification
11 | TCGA-05-4417-01	CDK4	Deletion
12 | TCGA-05-4417-01	CDK4	Loss
13 | TCGA-05-4417-01	ERBB2	Gain


--------------------------------------------------------------------------------
/data/example_maf.txt:
--------------------------------------------------------------------------------
 1 | NCBI_Build	Hugo_Symbol	Variant_Classification	Tumor_Sample_Barcode	HGVSp_Short	HGVSp	HGVSg	Chromosome	Start_Position	End_Position	Reference_Allele	Tumor_Seq_Allele1	Tumor_Seq_Allele2
 2 | GRCh37	CUL1	Missense_Mutation	TCGA-A6-2672-01A-01W-0833-10	p.Y466S	Tyr466Ser							
 3 | GRCh37	AKT3	Nonsense_Mutation	TCGA-05-4417-01	p.E182*	Glu182*							
 4 | GRCh37	PIK3CA	Missense_Mutation	TCGA-02-0033-01	p.E542K	Glu542Lys	3:g.178936082G>A	3	178936082	178936082	G	A	A
 5 | GRCh37	FGFR3	Missense_Mutation	TCGA-05-4417-01	p.V271M	Val271Met							
 6 | GRCh37	EGFR	Missense_Mutation	TCGA-06-0155-01	p.H304Y	His304Tyr	7:g.55223543C>T	7	55223543	55223543	C	T	T
 7 | GRCh37	PTEN	Missense_Mutation	TCGA-06-0155-01	p.C136R	Cys136Arg	10:g.89692922T>C	10	89692922	89692922	T	C	C
 8 | GRCh37	FGFR2	Missense_Mutation	TCGA-02-0033-01	p.Q212K	Gln121Lys							
 9 | GRCh37	ATM	Missense_Mutation	TCGA-05-4417-01	p.L2890R	Leu2890Arg							
10 | GRCh37	KRAS	Missense_Mutation	TCGA-05-4417-01	p.G12C	Gly12Cys	12:g.25398285C>A	12	25398285	25398285	C	A	A
11 | GRCh37	KRAS	Missense_Mutation	TCGA-05-4417-01	p.G12C	Gly12Cys	12:g.25398285_25398286delinsAG	12	25398285	25398286	CA	AG	AG
12 | GRCh37	RB1	Nonsense_Mutation	TCGA-02-0033-01	p.Q702*	Gln702*							
13 | GRCh37	TP53	Missense_Mutation	TCGA-02-0033-01	p.R248Q	Arg248Gln	17:g.7577538C>T	17	7577538	7577538	C	T	T
14 | GRCh37	NF1	Splice_Site	TCGA-02-0033-01	p.X1445_splice	X1445_splice	17:g.29586049G>A	17	29586049	29586049	G	A	A
15 | GRCh37	STK11	Missense_Mutation	TCGA-05-4417-01	p.H168R	His168Arg							
16 | GRCh37	TERT	5'Flank	TCGA-05-4417-01			5:g.1295228G>A	5	1295228	1295228	G	A	A
17 | GRCh37	MYD88	Missense_Mutation	TCGA-05-4417-01	M232T								
18 | GRCh37	EGFR	Missense_Mutation	TCGA-05-4417-01	T790M								


--------------------------------------------------------------------------------
/data/example_maf_grch38.txt:
--------------------------------------------------------------------------------
 1 | NCBI_Build	Hugo_Symbol	Variant_Classification	Tumor_Sample_Barcode	HGVSp_Short	HGVSp	HGVSg	Chromosome	Start_Position	End_Position	Reference_Allele	Tumor_Seq_Allele1	Tumor_Seq_Allele2
 2 | GRCh38	CUL1	Missense_Mutation	TCGA-A6-2672-01A-01W-0833-10	p.Y466S	Tyr466Ser							
 3 | GRCh38	AKT3	Nonsense_Mutation	TCGA-05-4417-01	p.E182*	Glu182*							
 4 | GRCh38	PIK3CA	Missense_Mutation	TCGA-02-0033-01	p.E542K	Glu542Lys	3:g.179218294G>A	3	179218294	179218294	G	A	A
 5 | GRCh38	FGFR3	Missense_Mutation	TCGA-05-4417-01	p.V271M	Val271Met							
 6 | GRCh38	EGFR	Missense_Mutation	TCGA-06-0155-01	p.H304Y	His304Tyr	7:g.55155850C>T	7	55155850	55155850	C	T	T
 7 | GRCh38	PTEN	Missense_Mutation	TCGA-06-0155-01	p.C136R	Cys136Arg	10:g.87933165T>C	10	87933165	87933165	T	C	C
 8 | GRCh38	FGFR2	Missense_Mutation	TCGA-02-0033-01	p.Q212K	Gln121Lys							
 9 | GRCh38	ATM	Missense_Mutation	TCGA-05-4417-01	p.L2890R	Leu2890Arg							
10 | GRCh38	KRAS	Missense_Mutation	TCGA-05-4417-01	p.G12C	Gly12Cys	12:g.25245351C>A	12	25245351	25245351	C	A	A
11 | GRCh38	RB1	Nonsense_Mutation	TCGA-02-0033-01	p.Q702*	Gln702*							
12 | GRCh38	TP53	Missense_Mutation	TCGA-02-0033-01	p.R248Q	Arg248Gln	17:g.7674220C>T	17	7674220	7674220	C	T	T
13 | GRCh38	NF1	Splice_Site	TCGA-02-0033-01	p.X1445_splice	X1445_splice	17:g.31259031G>A	17	31259031	31259031	G	A	A
14 | GRCh38	STK11	Missense_Mutation	TCGA-05-4417-01	p.H168R	His168Arg							
15 | GRCh38	MYD88	Missense_Mutation	TCGA-05-4417-01	M219T								
16 | 


--------------------------------------------------------------------------------
/data/example_sv.txt:
--------------------------------------------------------------------------------
1 | Tumor_Sample_Barcode	GeneA	GeneB	Sv_Type
2 | TCGA-02-0033-01	MLL2	MLL2	DELETION
3 | TCGA-05-4417-01	ALK	EML4	FUSION
4 | TCGA-06-0155-01	EGFR	EGFR	DELETION
5 | TCGA-06-0155-01	TMPRSS2	ERG	FUSION
6 | TCGA-05-4417-01	TMPRSS2	ERG	FUSION
7 | TCGA-06-0155-01	ERBB2	ERBB2	DELETION
8 | TCGA-02-0033-01	ETV6	RUNX1	FUSION
9 | 


--------------------------------------------------------------------------------
/example.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | IMAF="data/example_maf.txt"
 3 | OMAF="data/example_maf.oncokb.txt"
 4 | 
 5 | IMAF38="data/example_maf_grch38.txt"
 6 | OMAF38="data/example_maf_grch38.oncokb.txt"
 7 | 
 8 | OMAFHGVSPSHORT="data/example_maf_hgsp_short.oncokb.txt"
 9 | OMAFHGVSP="data/example_maf_hgsp.oncokb.txt"
10 | OMAFHGVSG="data/example_maf_hgsg.oncokb.txt"
11 | OMAFGC="data/example_maf_genomic_change.oncokb.txt"
12 | 
13 | IATYPICALALT="data/example_atypical_alterations.txt"
14 | OATYPICALALT="data/example_atypical_alterations.oncokb.txt"
15 | 
16 | IF="data/example_fusions.txt"
17 | OF="data/example_fusions.oncokb.txt"
18 | 
19 | ISV="data/example_sv.txt"
20 | OSV="data/example_sv.oncokb.txt"
21 | 
22 | ICNA="data/example_cna.txt"
23 | OCNA="data/example_cna.oncokb.txt"
24 | 
25 | IICNA="data/example_individual_cna.txt"
26 | OICNA="data/example_individual_cna.oncokb.txt"
27 | 
28 | IC="data/example_clinical.txt"
29 | OC="data/example_clinical.oncokb.txt"
30 | 
31 | TOKEN="" #OncoKB API Token
32 | README="data/example_README.txt"
33 | 
34 | python MafAnnotator.py -i "$IMAF" -o "$OMAF" -c "$IC" -b "$TOKEN"
35 | python MafAnnotator.py -i "$IMAF" -o "$OMAFHGVSPSHORT" -c "$IC" -b "$TOKEN" -q hgvsp_short
36 | python MafAnnotator.py -i "$IMAF" -o "$OMAFHGVSP" -c "$IC" -b "$TOKEN" -q hgvsp
37 | python MafAnnotator.py -i "$IMAF" -o "$OMAFHGVSG" -c "$IC" -b "$TOKEN" -q hgvsg
38 | python MafAnnotator.py -i "$IMAF" -o "$OMAFGC" -c "$IC" -b "$TOKEN" -q genomic_change
39 | 
40 | python MafAnnotator.py -i "$IMAF38" -o "$OMAF38" -c "$IC" -b "$TOKEN"
41 | 
42 | python MafAnnotator.py -i "$IATYPICALALT" -o "$OATYPICALALT" -c "$IC" -b "$TOKEN"
43 | 
44 | python FusionAnnotator.py -i "$IF" -o "$OF" -c "$IC" -b "$TOKEN"
45 | python StructuralVariantAnnotator.py -i "$ISV" -o "$OSV" -c "$IC" -b "$TOKEN"
46 | python CnaAnnotator.py -i "$ICNA" -o "$OCNA" -c "$IC" -b "$TOKEN"
47 | python CnaAnnotator.py -i "$IICNA" -o "$OICNA" -c "$IC" -b "$TOKEN" -f "individual" -z
48 | python ClinicalDataAnnotator.py -i "$IC" -o "$OC" -a "$OMAF,$OATYPICALALT,$OCNA,$OF,$OSV"
49 | 
50 | python GenerateReadMe.py -o "$README"
51 | 


--------------------------------------------------------------------------------
/flake8.ini:
--------------------------------------------------------------------------------
1 | [flake8]
2 | ignore = E501,W503,E126


--------------------------------------------------------------------------------
/requirements/common.txt:
--------------------------------------------------------------------------------
1 | requests==2.31.0
2 | urllib3==1.26.8


--------------------------------------------------------------------------------
/requirements/pip2.7.txt:
--------------------------------------------------------------------------------
1 | enum34==1.1.10
2 | kiwisolver==1.1.0


--------------------------------------------------------------------------------
/requirements/pip3.txt:
--------------------------------------------------------------------------------
1 | kiwisolver==1.2.0
2 | 


--------------------------------------------------------------------------------
/test_Annotation.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import pytest
  3 | import os
  4 | import logging
  5 | 
  6 | from AnnotatorCore import pull_hgvsg_info, DESCRIPTION_HEADERS, ONCOKB_ANNOTATION_HEADERS_GC
  7 | from AnnotatorCore import pull_genomic_change_info
  8 | from AnnotatorCore import pull_protein_change_info
  9 | from AnnotatorCore import pull_structural_variant_info
 10 | from AnnotatorCore import pull_cna_info
 11 | from AnnotatorCore import setoncokbapitoken
 12 | from AnnotatorCore import ProteinChangeQuery
 13 | from AnnotatorCore import GenomicChangeQuery
 14 | from AnnotatorCore import StructuralVariantQuery
 15 | from AnnotatorCore import CNAQuery
 16 | from AnnotatorCore import HGVSgQuery
 17 | from AnnotatorCore import ReferenceGenome
 18 | 
 19 | ONCOKB_API_TOKEN = os.environ["ONCOKB_API_TOKEN"]
 20 | setoncokbapitoken(ONCOKB_API_TOKEN)
 21 | 
 22 | log = logging.getLogger('test_Annotation')
 23 | log.info('test-----------', os.environ["ONCOKB_API_TOKEN"], '------')
 24 | 
 25 | VARIANT_EXISTS_INDEX = 2
 26 | MUTATION_EFFECT_INDEX = VARIANT_EXISTS_INDEX + 1
 27 | ONCOGENIC_INDEX = MUTATION_EFFECT_INDEX + 2
 28 | LEVEL_1_INDEX = ONCOGENIC_INDEX + 1
 29 | LEVEL_2_INDEX = LEVEL_1_INDEX + 1
 30 | LEVEL_3A_INDEX = LEVEL_1_INDEX + 2
 31 | HIGHEST_LEVEL_INDEX = LEVEL_1_INDEX + 7
 32 | HIGHEST_DX_LEVEL_INDEX = HIGHEST_LEVEL_INDEX + 7
 33 | HIGHEST_PX_LEVEL_INDEX = HIGHEST_DX_LEVEL_INDEX + 5
 34 | UNKNOWN = 'Unknown'
 35 | NUMBER_OF_ANNOTATION_COLUMNS = 27
 36 | NUMBER_OF_DESCRIPTION_COLUMNS = len(DESCRIPTION_HEADERS)
 37 | NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS = len(ONCOKB_ANNOTATION_HEADERS_GC)
 38 | NUMBER_OF_ANNOTATION_COLUMNS_WITH_DESCRIPTIONS = NUMBER_OF_ANNOTATION_COLUMNS + NUMBER_OF_DESCRIPTION_COLUMNS
 39 | NUMBER_OF_GC_ANNOTATION_COLUMNS = NUMBER_OF_ANNOTATION_COLUMNS + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS
 40 | NUMBER_OF_GC_ANNOTATION_COLUMNS_WITH_DESCRIPTIONS = NUMBER_OF_GC_ANNOTATION_COLUMNS + NUMBER_OF_DESCRIPTION_COLUMNS
 41 | 
 42 | 
 43 | def fake_gene_one_query_suite(annotations, include_descriptions):
 44 |     assert len(annotations) == 1
 45 | 
 46 |     annotation = annotations[0]
 47 |     assert len(
 48 |         annotation) == NUMBER_OF_ANNOTATION_COLUMNS if include_descriptions is False else NUMBER_OF_ANNOTATION_COLUMNS_WITH_DESCRIPTIONS
 49 |     assert annotation[MUTATION_EFFECT_INDEX] == UNKNOWN
 50 |     assert annotation[ONCOGENIC_INDEX] == UNKNOWN
 51 |     assert annotation[HIGHEST_LEVEL_INDEX] == ''
 52 | 
 53 | 
 54 | @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required")
 55 | def test_check_protein_change():
 56 |     queries = [
 57 |         ProteinChangeQuery('BRAF', 'V600E', 'Colorectal Cancer'),
 58 |         ProteinChangeQuery('ABL1', 'BCR-ABL1 Fusion', 'Acute Leukemias of Ambiguous Lineage'),
 59 |     ]
 60 | 
 61 |     annotations = pull_protein_change_info(queries, False, False)
 62 |     assert len(annotations) == 2
 63 | 
 64 |     annotation = annotations[0]
 65 |     assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS
 66 |     assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function'
 67 |     assert annotation[ONCOGENIC_INDEX] == 'Oncogenic'
 68 |     assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1'
 69 | 
 70 |     annotation = annotations[1]
 71 |     assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS
 72 |     assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function'
 73 |     assert annotation[ONCOGENIC_INDEX] == 'Oncogenic'
 74 |     assert annotation[HIGHEST_LEVEL_INDEX] == ''
 75 |     assert annotation[HIGHEST_DX_LEVEL_INDEX] == 'LEVEL_Dx1'
 76 |     assert annotation[HIGHEST_PX_LEVEL_INDEX] == 'LEVEL_Px1'
 77 | 
 78 | 
 79 | @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required")
 80 | def test_reference_genome():
 81 |     queries = [
 82 |         GenomicChangeQuery('7', '140453136', '140453136', 'A', 'T', 'LUAD', ReferenceGenome.GRCH37),
 83 |         GenomicChangeQuery('7', '140753336', '140753336', 'A', 'T', 'LUAD', ReferenceGenome.GRCH38)
 84 |     ]
 85 | 
 86 |     annotations = pull_genomic_change_info(queries, False, False)
 87 |     assert len(annotations) == 2
 88 | 
 89 |     annotation37 = annotations[0]
 90 |     annotation38 = annotations[1]
 91 |     assert annotation37 == annotation38
 92 | 
 93 |     queries = [
 94 |         ProteinChangeQuery('MYD88', 'M232T', 'Ovarian Cancer', ReferenceGenome.GRCH37),
 95 |         ProteinChangeQuery('MYD88', 'M219T', 'Ovarian Cancer', ReferenceGenome.GRCH38)
 96 |     ]
 97 | 
 98 |     annotations = pull_protein_change_info(queries, False, False)
 99 |     assert len(annotations) == 2
100 | 
101 |     annotation37 = annotations[0]
102 |     annotation38 = annotations[1]
103 |     assert annotation37 == annotation38
104 | 
105 | 
106 | @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required")
107 | def test_fake_gene_protein_change():
108 |     queries = [
109 |         ProteinChangeQuery('test1', 'V600E', 'Ovarian Cancer')
110 |     ]
111 | 
112 |     annotations = pull_protein_change_info(queries, False, False)
113 |     fake_gene_one_query_suite(annotations, False)
114 | 
115 |     annotations = pull_protein_change_info(queries, False, False)
116 |     fake_gene_one_query_suite(annotations, True)
117 | 
118 | 
119 | @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required")
120 | def test_check_atypical_alts():
121 |     queries = [
122 |         ProteinChangeQuery('Other Biomarkers', 'MSI-H', 'Colorectal Cancer'),
123 |         ProteinChangeQuery('Other Biomarkers', 'MSI-H', 'Leukemia'),
124 |         ProteinChangeQuery('TERT', 'Promoter Mutation', 'Bladder Cancer'),
125 |         ProteinChangeQuery('TERT', 'Promoter Mutation', 'Bladder Cancer', None, '5\'Flank')
126 |     ]
127 | 
128 |     annotations = pull_protein_change_info(queries, False, False)
129 |     assert len(annotations) == 4
130 | 
131 |     annotation = annotations[0]
132 |     assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS
133 |     assert annotation[MUTATION_EFFECT_INDEX] == UNKNOWN
134 |     assert annotation[ONCOGENIC_INDEX] == 'Oncogenic'
135 |     assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1'
136 | 
137 |     annotation = annotations[1]
138 |     assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS
139 |     assert annotation[MUTATION_EFFECT_INDEX] == UNKNOWN
140 |     assert annotation[ONCOGENIC_INDEX] == 'Oncogenic'
141 |     assert annotation[HIGHEST_LEVEL_INDEX] == ''
142 | 
143 |     annotation = annotations[2]
144 |     assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS
145 |     assert annotation[MUTATION_EFFECT_INDEX] == 'Likely Gain-of-function'
146 |     assert annotation[ONCOGENIC_INDEX] == 'Likely Oncogenic'
147 |     assert annotation[HIGHEST_LEVEL_INDEX] == ''
148 | 
149 |     annotation_dup = annotations[3]
150 |     assert len(annotation_dup) == NUMBER_OF_ANNOTATION_COLUMNS
151 |     assert annotation == annotation_dup
152 | 
153 | 
154 | @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required")
155 | def test_check_hgvsg():
156 |     queries = [
157 |         # KRAF G12C
158 |         HGVSgQuery('12:g.25398285C>A', 'LUAD'),
159 |         # KRAF G12C
160 |         HGVSgQuery('12:g.25398285_25398286delinsAG', 'LUAD'),
161 |         # TERT Promoter
162 |         HGVSgQuery('5:g.1295167_1295168delinsAATG', 'LUAD'),
163 |     ]
164 | 
165 |     annotations = pull_hgvsg_info(queries, False, False)
166 |     assert len(annotations) == 3
167 | 
168 |     annotation = annotations[0]
169 |     assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS
170 |     assert annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Gain-of-function'
171 |     assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Oncogenic'
172 |     assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'LEVEL_1'
173 | 
174 |     annotation = annotations[1]
175 |     assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS
176 |     assert annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Gain-of-function'
177 |     assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Oncogenic'
178 |     assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'LEVEL_1'
179 | 
180 |     annotation = annotations[2]
181 |     assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS
182 |     assert annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Likely Gain-of-function'
183 |     assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Likely Oncogenic'
184 |     assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == ''
185 | 
186 | 
187 | @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required")
188 | def test_check_genomic_change():
189 |     queries = [
190 |         # KRAF G12C
191 |         GenomicChangeQuery('12', '25398285', '25398285', 'C', 'A', 'LUAD'),
192 |         # KRAF G12C
193 |         GenomicChangeQuery('12', '25398285', '25398286', 'CA', 'AG', 'LUAD'),
194 |         # TERT Promoter
195 |         GenomicChangeQuery('5', '1295167', '1295168', 'TC', 'AATG', 'LUAD'),
196 |     ]
197 | 
198 |     annotations = pull_genomic_change_info(queries, False, False)
199 |     assert len(annotations) == 3
200 | 
201 |     annotation = annotations[0]
202 |     assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS
203 |     assert annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Gain-of-function'
204 |     assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Oncogenic'
205 |     assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'LEVEL_1'
206 | 
207 |     annotation = annotations[1]
208 |     assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS
209 |     assert annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Gain-of-function'
210 |     assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Oncogenic'
211 |     assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'LEVEL_1'
212 | 
213 |     annotation = annotations[2]
214 |     assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS
215 |     assert annotation[MUTATION_EFFECT_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Likely Gain-of-function'
216 |     assert annotation[ONCOGENIC_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == 'Likely Oncogenic'
217 |     assert annotation[HIGHEST_LEVEL_INDEX + NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS] == ''
218 | 
219 | 
220 | @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required")
221 | def test_check_structural_variants():
222 |     queries = [
223 |         StructuralVariantQuery('ALK', 'EML4', 'FUSION', 'NSCLC'),
224 |         StructuralVariantQuery('ALK', 'EML4', 'FUSION', 'Melanoma'),
225 |         StructuralVariantQuery('BCR', 'ABL1', 'FUSION', 'Acute Leukemias of Ambiguous Lineage'),
226 |     ]
227 | 
228 |     annotations = pull_structural_variant_info(queries, False)
229 |     assert len(annotations) == 3
230 | 
231 |     annotation = annotations[0]
232 |     assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS
233 |     assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function'
234 |     assert annotation[ONCOGENIC_INDEX] == 'Oncogenic'
235 |     assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1'
236 | 
237 |     annotation = annotations[1]
238 |     assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS
239 |     assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function'
240 |     assert annotation[ONCOGENIC_INDEX] == 'Oncogenic'
241 |     assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_3B'
242 | 
243 |     annotation = annotations[2]
244 |     assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS
245 |     assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function'
246 |     assert annotation[ONCOGENIC_INDEX] == 'Oncogenic'
247 |     assert annotation[HIGHEST_LEVEL_INDEX] == ''
248 |     assert annotation[HIGHEST_DX_LEVEL_INDEX] == 'LEVEL_Dx1'
249 |     assert annotation[HIGHEST_PX_LEVEL_INDEX] == 'LEVEL_Px1'
250 | 
251 | 
252 | @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required")
253 | def test_fake_fusion_gene():
254 |     queries = [
255 |         StructuralVariantQuery('test1', 'test2', 'FUSION', 'NSCLC'),
256 |     ]
257 | 
258 |     annotations = pull_structural_variant_info(queries, False)
259 |     fake_gene_one_query_suite(annotations, False)
260 | 
261 |     annotations = pull_structural_variant_info(queries, False)
262 |     fake_gene_one_query_suite(annotations, True)
263 | 
264 | 
265 | @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required")
266 | def test_cna():
267 |     queries = [
268 |         CNAQuery('BRCA2', 'DELETION', 'Ovarian Cancer'),
269 |         CNAQuery('ERBB2', 'Amplification', 'Breast Cancer'),
270 |         CNAQuery('ERBB2', 'Amplification', 'Colorectal Cancer'),
271 |         CNAQuery('CDKN2A', 'Deletion', 'AML with BCR-ABL1'),
272 |     ]
273 | 
274 |     annotations = pull_cna_info(queries, False)
275 |     assert len(annotations) == 4
276 | 
277 |     annotation = annotations[0]
278 |     assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS
279 |     assert annotation[MUTATION_EFFECT_INDEX] == 'Loss-of-function'
280 |     assert annotation[ONCOGENIC_INDEX] == 'Oncogenic'
281 |     assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1'
282 | 
283 |     annotation = annotations[1]
284 |     assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS
285 |     assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function'
286 |     assert annotation[ONCOGENIC_INDEX] == 'Oncogenic'
287 |     assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1'
288 | 
289 |     annotation = annotations[2]
290 |     assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS
291 |     assert annotation[MUTATION_EFFECT_INDEX] == 'Gain-of-function'
292 |     assert annotation[ONCOGENIC_INDEX] == 'Oncogenic'
293 |     assert annotation[HIGHEST_LEVEL_INDEX] == 'LEVEL_1'
294 | 
295 |     annotation = annotations[3]
296 |     assert len(annotation) == NUMBER_OF_ANNOTATION_COLUMNS
297 |     assert annotation[MUTATION_EFFECT_INDEX] == 'Loss-of-function'
298 |     assert annotation[ONCOGENIC_INDEX] == 'Oncogenic'
299 |     assert annotation[HIGHEST_LEVEL_INDEX] == ''
300 |     assert annotation[HIGHEST_DX_LEVEL_INDEX] == 'LEVEL_Dx2'
301 |     assert annotation[HIGHEST_PX_LEVEL_INDEX] == ''
302 | 
303 | 
304 | @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required")
305 | def test_fake_cna():
306 |     queries = [
307 |         CNAQuery('test1', 'Amplification', 'Breast Cancer'),
308 |     ]
309 | 
310 |     annotations = pull_cna_info(queries, False)
311 |     fake_gene_one_query_suite(annotations, False)
312 | 
313 |     annotations = pull_cna_info(queries, True)
314 |     fake_gene_one_query_suite(annotations, True)
315 | 
316 | 
317 | def check_brca2_s1882_without_cancertype(annotation, genomic_query=False):
318 |     assert len(annotation) == NUMBER_OF_GC_ANNOTATION_COLUMNS if genomic_query else NUMBER_OF_ANNOTATION_COLUMNS
319 |     assert annotation[(
320 |             NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS + MUTATION_EFFECT_INDEX) if genomic_query else MUTATION_EFFECT_INDEX] == 'Likely Loss-of-function'
321 |     assert annotation[(
322 |             NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS + ONCOGENIC_INDEX) if genomic_query else ONCOGENIC_INDEX] == 'Likely Oncogenic'
323 |     assert annotation[(
324 |             NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS + HIGHEST_LEVEL_INDEX) if genomic_query else HIGHEST_LEVEL_INDEX] == 'LEVEL_1'
325 |     assert annotation[(
326 |             NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS + LEVEL_1_INDEX) if genomic_query else LEVEL_1_INDEX] == 'Olaparib,Olaparib+Bevacizumab,Rucaparib,Olaparib+Abiraterone+Prednisone,Niraparib,Talazoparib+Enzalutamide,Niraparib+Abiraterone Acetate+Prednisone'
327 |     assert annotation[(
328 |             NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS + LEVEL_2_INDEX) if genomic_query else LEVEL_2_INDEX] == 'Olaparib,Rucaparib,Niraparib'
329 |     assert annotation[(
330 |             NUMBER_OF_ONCOKB_ANNOTATION_GC_COLUMNS + LEVEL_3A_INDEX) if genomic_query else LEVEL_3A_INDEX] == 'Olaparib,Talazoparib'
331 | 
332 | 
333 | @pytest.mark.skipif(ONCOKB_API_TOKEN in (None, ''), reason="oncokb api token required")
334 | def test_duplicated_treatments():
335 |     # there should not be any duplicated treatment listed when cancer type is not specified
336 | 
337 |     # test protein change query
338 |     queries = [
339 |         ProteinChangeQuery('BRCA2', 'S1882*', ''),
340 |     ]
341 |     annotations = pull_protein_change_info(queries, False, False)
342 |     assert len(annotations) == 1
343 | 
344 |     check_brca2_s1882_without_cancertype(annotations[0])
345 | 
346 |     # test genomic change query
347 |     queries = [
348 |         GenomicChangeQuery('13', '32914137', '32914137', 'C', 'A', ''),
349 |     ]
350 |     annotations = pull_genomic_change_info(queries, False, False)
351 |     assert len(annotations) == 1
352 | 
353 |     check_brca2_s1882_without_cancertype(annotations[0], True)
354 | 


--------------------------------------------------------------------------------
/test_AnnotatorCore.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import pytest
  3 | 
  4 | from AnnotatorCore import getgenesfromfusion
  5 | from AnnotatorCore import conversion
  6 | from AnnotatorCore import replace_all
  7 | from AnnotatorCore import resolve_query_type
  8 | from AnnotatorCore import get_highest_tx_level
  9 | from AnnotatorCore import get_cna
 10 | from AnnotatorCore import QueryType
 11 | from AnnotatorCore import ALTERATION_HEADER
 12 | from AnnotatorCore import HGVSP_HEADER
 13 | from AnnotatorCore import HGVSP_SHORT_HEADER
 14 | from AnnotatorCore import HGVSG_HEADER
 15 | from AnnotatorCore import GC_REF_ALLELE_HEADER
 16 | from AnnotatorCore import GC_CHROMOSOME_HEADER
 17 | from AnnotatorCore import GC_START_POSITION_HEADER
 18 | from AnnotatorCore import GC_END_POSITION_HEADER
 19 | from AnnotatorCore import GC_VAR_ALLELE_1_HEADER
 20 | from AnnotatorCore import GC_VAR_ALLELE_2_HEADER
 21 | from AnnotatorCore import TX_TYPE_SENSITIVE
 22 | from AnnotatorCore import TX_TYPE_RESISTANCE
 23 | from AnnotatorCore import CNA_AMPLIFICATION_TXT
 24 | from AnnotatorCore import CNA_DELETION_TXT
 25 | from AnnotatorCore import CNA_GAIN_TXT
 26 | from AnnotatorCore import CNA_LOSS_TXT
 27 | 
 28 | 
 29 | def test_getgenesfromfusion():
 30 |     AB_EXAMPLE = ('A', 'B')
 31 |     assert getgenesfromfusion('A-B') == AB_EXAMPLE
 32 |     assert getgenesfromfusion('A-B ') == AB_EXAMPLE
 33 |     assert getgenesfromfusion('a-b') == ('a', 'b')
 34 |     assert getgenesfromfusion('A') == ('A', 'A')
 35 |     assert getgenesfromfusion('A1-1B') == ('A1', '1B')
 36 | 
 37 |     # Test fusion case insensitive
 38 |     assert getgenesfromfusion('A-B fusion') == AB_EXAMPLE
 39 |     assert getgenesfromfusion('A-B Fusion') == AB_EXAMPLE
 40 | 
 41 |     # Test unnecessary characters will be trimmed off after fusion
 42 |     assert getgenesfromfusion('A-B fusion archer') == AB_EXAMPLE
 43 |     assert getgenesfromfusion('A-B fusion Archer') == AB_EXAMPLE
 44 |     assert getgenesfromfusion('A-B fusion -Archer') == AB_EXAMPLE
 45 |     assert getgenesfromfusion('A-B fusion -archer') == AB_EXAMPLE
 46 |     assert getgenesfromfusion('A-B fusion - archer') == AB_EXAMPLE
 47 |     assert getgenesfromfusion('A-B fusion - archer ') == AB_EXAMPLE
 48 | 
 49 |     assert getgenesfromfusion('A-B fusion test') == AB_EXAMPLE
 50 |     assert getgenesfromfusion('fusion A-B fusion') == AB_EXAMPLE
 51 | 
 52 |     # Test intragenic
 53 |     assert getgenesfromfusion('MLL2-intragenic') == ('MLL2', 'MLL2')
 54 | 
 55 | 
 56 | def test_conversion():
 57 |     # Test conversion case for case insensitivity
 58 |     assert conversion('tyr100') == 'Y100'
 59 |     assert conversion('tYr100') == 'Y100'
 60 |     assert conversion('Tyr100') == 'Y100'
 61 |     assert conversion('tyR100') == 'Y100'
 62 |     assert conversion('TyR100') == 'Y100'
 63 |     assert conversion('TYR100') == 'Y100'
 64 |     assert conversion('tYR100') == 'Y100'
 65 |     assert conversion('sEr100') == 'S100'
 66 | 
 67 |     # Test conversion only targets dict() keys
 68 |     assert conversion('hot100') == 'hot100'
 69 | 
 70 |     # Test conversion is not affected by empty string and whitespaces
 71 |     assert conversion('') == ''
 72 |     assert conversion(' sEr100') == ' S100'
 73 | 
 74 |     # Test conversion when the string contains three letter but not supposed to be converted
 75 |     assert conversion('Promoter') == 'Promoter'
 76 | 
 77 | 
 78 | def test_replace_all():
 79 |     # Test replace_all for case insensitivity
 80 |     assert replace_all('tyr') == 'Y'
 81 |     assert replace_all('tYr') == 'Y'
 82 |     assert replace_all('Tyr') == 'Y'
 83 |     assert replace_all('tyR') == 'Y'
 84 |     assert replace_all('TyR') == 'Y'
 85 |     assert replace_all('TYR') == 'Y'
 86 |     assert replace_all('tYR') == 'Y'
 87 |     assert replace_all('sEr') == 'S'
 88 | 
 89 |     # Test replace_all only targets the dict() keys
 90 |     assert replace_all('bubblegum juice cup dairy hot pot Tyr melon') == 'bubblegum juice cup dairy hot pot Y melon'
 91 |     assert replace_all('Ly Lys Pr Pro Gln Glad Ph PH Phe') == 'Ly K Pr P Q Glad Ph PH F'
 92 |     assert replace_all(
 93 |         'nOt can fat Tan Rat cat dog man Men FAn rot taR car fAr map TAP Zip poP') == 'nOt can fat Tan Rat cat dog man Men FAn rot taR car fAr map TAP Zip poP'
 94 | 
 95 |     # Test replace_all is not affected by numbers
 96 |     assert replace_all('Tyr600E Cys56734342342454562456') == 'Y600E C56734342342454562456'
 97 |     assert replace_all(
 98 |         '60 045 434 345 4 26 567 254 245 34 67567 8 56 8 364 56 6 345 7567 3455 6 8 99 89 7 3') == '60 045 434 345 4 26 567 254 245 34 67567 8 56 8 364 56 6 345 7567 3455 6 8 99 89 7 3'
 99 | 
100 |     # Test replace_all is not affected by empty string and whitespaces
101 |     assert replace_all('') == ''
102 |     assert replace_all(' ') == ' '
103 |     assert replace_all('Tyr Asn As n Ile Il e') == 'Y N As n I Il e'
104 | 
105 | 
106 | def test_resolve_query_type():
107 |     assert resolve_query_type(None, [HGVSG_HEADER]) == QueryType.HGVSG
108 |     assert resolve_query_type(None, [HGVSP_HEADER]) == QueryType.HGVSP
109 |     assert resolve_query_type(None, [HGVSP_SHORT_HEADER]) == QueryType.HGVSP_SHORT
110 |     assert resolve_query_type(None, [HGVSG_HEADER, HGVSP_HEADER, HGVSP_SHORT_HEADER]) == QueryType.HGVSP_SHORT
111 |     assert resolve_query_type(None, [GC_CHROMOSOME_HEADER, GC_START_POSITION_HEADER, GC_END_POSITION_HEADER,
112 |                                      GC_REF_ALLELE_HEADER, GC_VAR_ALLELE_1_HEADER,
113 |                                      GC_VAR_ALLELE_2_HEADER]) == QueryType.GENOMIC_CHANGE
114 | 
115 |     assert resolve_query_type(QueryType.HGVSG, [HGVSG_HEADER, HGVSP_HEADER, HGVSP_SHORT_HEADER]) == QueryType.HGVSG
116 | 
117 |     # Test extreme cases
118 |     with pytest.raises(Exception):
119 |         assert resolve_query_type(None, [])
120 |     assert resolve_query_type(None, [ALTERATION_HEADER]) == QueryType.HGVSP_SHORT
121 | 
122 |     # Raise exception when the file does not have asked header
123 |     with pytest.raises(Exception):
124 |         assert resolve_query_type(QueryType.HGVSG, [HGVSP_SHORT_HEADER])
125 |     with pytest.raises(Exception):
126 |         assert resolve_query_type(QueryType.GENOMIC_CHANGE, [GC_CHROMOSOME_HEADER, GC_START_POSITION_HEADER])
127 | 
128 | 
129 | def test_get_highest_tx_level():
130 |     oncokb_data = {}
131 |     assert get_highest_tx_level(oncokb_data) == ''
132 |     assert get_highest_tx_level(oncokb_data, 'random') == ''
133 |     assert get_highest_tx_level(oncokb_data, TX_TYPE_SENSITIVE) == ''
134 |     assert get_highest_tx_level(oncokb_data, TX_TYPE_RESISTANCE) == ''
135 | 
136 |     oncokb_data = {'LEVEL_1': ['test'], 'LEVEL_R1': ['test'], 'LEVEL_R2': ['test']}
137 |     assert get_highest_tx_level(oncokb_data) == 'LEVEL_R1'
138 |     assert get_highest_tx_level(oncokb_data, 'random') == 'LEVEL_R1'
139 |     assert get_highest_tx_level(oncokb_data, TX_TYPE_SENSITIVE) == 'LEVEL_1'
140 |     assert get_highest_tx_level(oncokb_data, TX_TYPE_RESISTANCE) == 'LEVEL_R1'
141 | 
142 |     oncokb_data = {'LEVEL_1': ['test'], 'LEVEL_R2': ['test']}
143 |     assert get_highest_tx_level(oncokb_data) == 'LEVEL_1'
144 |     assert get_highest_tx_level(oncokb_data, 'random') == 'LEVEL_1'
145 |     assert get_highest_tx_level(oncokb_data, TX_TYPE_SENSITIVE) == 'LEVEL_1'
146 |     assert get_highest_tx_level(oncokb_data, TX_TYPE_RESISTANCE) == 'LEVEL_R2'
147 | 
148 | 
149 | def test_cna():
150 |     assert get_cna(None) is None
151 |     assert get_cna('') is None
152 |     assert get_cna('test') is None
153 |     assert get_cna('Amplification') == CNA_AMPLIFICATION_TXT
154 |     assert get_cna('Gain') is None
155 |     assert get_cna('Deletion') == CNA_DELETION_TXT
156 |     assert get_cna('Loss') is None
157 |     assert get_cna('2') == CNA_AMPLIFICATION_TXT
158 |     assert get_cna('1') is None
159 |     assert get_cna('-2') == CNA_DELETION_TXT
160 |     assert get_cna('-1.5') == CNA_DELETION_TXT
161 |     assert get_cna('-1') is None
162 |     assert get_cna('0') is None
163 | 
164 |     assert get_cna(None, False) is None
165 |     assert get_cna('', False) is None
166 |     assert get_cna('test', False) is None
167 |     assert get_cna('Amplification', False) == CNA_AMPLIFICATION_TXT
168 |     assert get_cna('Gain', False) is None
169 |     assert get_cna('Deletion', False) == CNA_DELETION_TXT
170 |     assert get_cna('Loss', False) is None
171 | 
172 |     assert get_cna(None, True) is None
173 |     assert get_cna('', True) is None
174 |     assert get_cna('test', True) is None
175 |     assert get_cna('Amplification', True) == CNA_AMPLIFICATION_TXT
176 |     assert get_cna('Gain', True) == CNA_GAIN_TXT
177 |     assert get_cna('Deletion', True) == CNA_DELETION_TXT
178 |     assert get_cna('Loss', True) == CNA_LOSS_TXT
179 |     assert get_cna('2', True) == CNA_AMPLIFICATION_TXT
180 |     assert get_cna('1', True) == CNA_GAIN_TXT
181 |     assert get_cna('-2', True) == CNA_DELETION_TXT
182 |     assert get_cna('-1.5', True) == CNA_DELETION_TXT
183 |     assert get_cna('-1', True) == CNA_LOSS_TXT
184 |     assert get_cna('0', True) is None
185 | 


--------------------------------------------------------------------------------