├── .github └── workflows │ └── tests.yml ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── RELEASING.md ├── deploy.sh ├── develop.sh ├── lint.sh ├── pylintrc ├── requirements.txt ├── setup.py ├── test.sh ├── test ├── __init__.py ├── data.py ├── data │ ├── B16-StringTie-chr1-subset.gtf │ ├── genes.fpkm_tracking │ ├── isoforms.fpkm_tracking │ └── tiny_test_ligandome_dir │ │ ├── A0201 │ │ └── HLA-B0704 ├── test_args_outputs.py ├── test_cli_protein_changes.py ├── test_contains_mutant_residues.py ├── test_dataframe.py ├── test_effect_expression_filters.py ├── test_epitopes_from_commandline_args.py ├── test_load_cufflinks_fpkm.py ├── test_load_stringtie_gtf_fpkm.py ├── test_mutant_epitope_predictions_class1.py ├── test_mutant_epitope_predictions_class2.py ├── test_padding.py ├── test_peptide_mutation_interval.py ├── test_rna_helpers.py └── test_variant_expression_filters.py └── topiary ├── __init__.py ├── cli ├── __init__.py ├── args.py ├── errors.py ├── filtering.py ├── outputs.py ├── protein_changes.py ├── rna.py ├── script.py └── sequence.py ├── filters.py ├── predictor.py ├── rna ├── __init__.py ├── common.py ├── cufflinks.py └── gtf.py └── sequence_helpers.py /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Tests 5 | on: [push, pull_request] 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | strategy: 11 | fail-fast: true 12 | matrix: 13 | python-version: ["3.10", "3.11", "3.12"] 14 | steps: 15 | - uses: actions/checkout@v3 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v3 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | - name: Checkout private netmhc-bundle repo 21 | uses: 
actions/checkout@v4 22 | with: 23 | repository: openvax/netmhc-bundle 24 | token: ${{ secrets.NETMHC_BUNDLE_ACCESS_TOKEN }} 25 | path: netmhc-bundle 26 | 27 | - name: Install netmhc-bundle dependencies 28 | uses: awalsh128/cache-apt-pkgs-action@latest 29 | with: 30 | packages: tcsh gawk python2-minimal 31 | version: 1.0 32 | - name: Install dependencies 33 | run: | 34 | python -m pip install --upgrade pip 35 | python -m pip install pytest pytest-cov pylint 36 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 37 | - name: Install wkthtmltopdf 38 | run: | 39 | sudo apt-get install -y xfonts-base xfonts-75dpi 40 | wget https://github.com/wkhtmltopdf/packaging/releases/download/0.12.6-1/wkhtmltox_0.12.6-1.bionic_amd64.deb 41 | sudo dpkg -i wkhtmltox_0.12.6-1.bionic_amd64.deb 42 | - name: Lint with PyLint 43 | run: | 44 | ./lint.sh 45 | - name: Download Ensembl data 46 | run: | 47 | echo "Before installing Ensembl releases" && df -h 48 | pyensembl install --release 75 --species human --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCh37.75/ 49 | pyensembl install --release 102 --species mouse --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCm38.102/ 50 | pyensembl install --release 93 --species human --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCh38.93/ 51 | pyensembl install --release 93 --species mouse --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCm38.93/ 52 | - name: Test with pytest 53 | run: | 54 | # configure netmhc-bundle paths 55 | export NETMHC_BUNDLE_HOME=$PWD/netmhc-bundle 56 | echo "NetMHC-bundle dir:" && ls -l $NETMHC_BUNDLE_HOME 57 | mkdir $PWD/netmhc-bundle-tmp 58 | export NETMHC_BUNDLE_TMPDIR=$PWD/netmhc-bundle-tmp 59 | export PATH=$PATH:$NETMHC_BUNDLE_HOME/bin 60 | ./test.sh 61 | - name: Publish coverage to Coveralls 62 | uses: coverallsapp/github-action@v2.2.3 63 | with: 64 | parallel: true 65 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | bin/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # Installer logs 26 | pip-log.txt 27 | pip-delete-this-directory.txt 28 | 29 | # Unit test / coverage reports 30 | htmlcov/ 31 | .tox/ 32 | .coverage 33 | .cache 34 | nosetests.xml 35 | coverage.xml 36 | 37 | # Translations 38 | *.mo 39 | 40 | # Mr Developer 41 | .mr.developer.cfg 42 | .project 43 | .pydevproject 44 | 45 | # Rope 46 | .ropeproject 47 | 48 | # Django stuff: 49 | *.log 50 | *.pot 51 | 52 | # Sphinx documentation 53 | docs/_build/ 54 | 55 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false # Use container-based infrastructure 2 | language: python 3 | dist: trusty 4 | python: 5 | - "2.7" 6 | - "3.6" 7 | git: 8 | # don't need the default depth of 50 9 | # but don't want to use a depth of 1 since that affects 10 | # whether jobs run when you have multiple commits queued 11 | # https://github.com/travis-ci/travis-ci/issues/4575 12 | depth: 10 13 | cache: 14 | pip: true 15 | # cache directory used for Ensembl downloads of GTF and FASTA files 16 | # along with the indexed db of intervals and ID mappings and pickles 17 | # of sequence dictionaries. 
Also, pip 18 | directories: 19 | - $HOME/.cache/pyensembl/ 20 | addons: 21 | apt: 22 | packages: 23 | # Needed for NetMHC 24 | - tcsh 25 | env: 26 | global: 27 | # MHC_BUNDLE_PASS 28 | - secure: "TLTzSIABO/iYke8C66c0PRaWDZ5lx90s8XimSfDONOTXaX74V25O65qxzIWPAihxcdfLYA+bE2YRsjYOtuK+6DB2vjXbmoCQAXIFT/QXz4+iZTxN3g/s5N4hIR8tf9MSQ3KdNHOw7lKzdgAWKsFDQ8vwrqzYUNJGVtvoQSWCmPw=" 29 | before_install: 30 | - | 31 | if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then 32 | wget https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh; 33 | else 34 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 35 | fi 36 | - bash miniconda.sh -b -p $HOME/miniconda 37 | - export PATH="$HOME/miniconda/bin:$PATH" 38 | - hash -r 39 | - conda config --set always_yes yes --set changeps1 no 40 | - conda update -q conda 41 | # Useful for debugging any issues with conda 42 | - conda info -a 43 | - python --version 44 | # install MHC predictors 45 | - git clone https://mhcbundle:$MHC_BUNDLE_PASS@github.com/openvax/netmhc-bundle.git 46 | - export NETMHC_BUNDLE_HOME=$PWD/netmhc-bundle 47 | - mkdir tmp 48 | - export NETMHC_BUNDLE_TMPDIR=$PWD/tmp 49 | - export PATH=$PATH:$NETMHC_BUNDLE_HOME/bin 50 | install: 51 | - > 52 | conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION 53 | numpy nose pandas pandoc 54 | - source activate test-environment 55 | - pip install pypandoc 56 | - pip install -r requirements.txt 57 | - pip install . 
58 | - pip install coveralls 59 | - pyensembl install --release 75 --species human 60 | - pyensembl install --release 87 --species human 61 | script: 62 | - ./lint.sh 63 | - nosetests test --with-coverage --cover-package=topiary 64 | after_success: 65 | coveralls 66 | deploy: 67 | provider: pypi 68 | distributions: sdist 69 | user: openvax 70 | password: # See http://docs.travis-ci.com/user/encryption-keys/ 71 | secure: "S4KWAhJpKYx5F/cBc6cf9GCZ8Hd+WtMA6V6PP25PglLnVaXrxB5QxuAIWGAvr/jGuTHjfCSCNDwTptW3natLjJR9IfJdJPp3gNvM0RDjWY4FsziFz/nG/bZo9qnh4ZCDhK/Po1izxXM0u9z6gUc0U2iKK1ZSdfawyW4nZbAXQUU=" 72 | on: 73 | branch: master 74 | condition: $TRAVIS_PYTHON_VERSION = "2.7" 75 | 76 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Tests](https://github.com/openvax/topiary/actions/workflows/tests.yml/badge.svg)](https://github.com/openvax/topiary/actions/workflows/tests.yml) 2 | 3 | Coverage Status 4 | 5 | 6 | PyPI 7 | 8 | 9 | # Topiary 10 | 11 | Predict mutation-derived cancer T-cell epitopes from (1) somatic variants (2) tumor RNA expression data, and (3) patient HLA type. 
12 | 13 | ## Example 14 | 15 | ```sh 16 | ./topiary \ 17 | --vcf somatic.vcf \ 18 | --mhc-predictor netmhcpan \ 19 | --mhc-alleles HLA-A*02:01,HLA-B*07:02 \ 20 | --ic50-cutoff 500 \ 21 | --percentile-cutoff 2.0 \ 22 | --mhc-epitope-lengths 8-11 \ 23 | --rna-gene-fpkm-tracking-file genes.fpkm_tracking \ 24 | --rna-min-gene-expression 4.0 \ 25 | --rna-transcript-fpkm-tracking-file isoforms.fpkm_tracking \ 26 | --rna-min-transcript-expression 1.5 \ 27 | --output-csv epitopes.csv \ 28 | --output-html epitopes.html 29 | ``` 30 | 31 | ## Installation 32 | 33 | You can install Topiary and all of the libraries it depends on by running: 34 | ``` 35 | pip install topiary 36 | ``` 37 | 38 | You'll need to download the reference genome sequences and annotations for a 39 | recent Ensembl release (e.g. 81) by running: 40 | 41 | ``` 42 | pyensembl install --release 81 --species human 43 | ``` 44 | 45 | If you want to work with variants which were aligned against the older reference 46 | GRCh37, you will need to also download its annotation data, which is contained 47 | in Ensembl release 75: 48 | 49 | ``` 50 | pyensembl install --release 75 --species human 51 | ``` 52 | 53 | 54 | ## Commandline Arguments 55 | 56 | ### Genomic Variants 57 | 58 | Specify some variants by giving at least one of the following options. They can 59 | be used in combination and repeated. 
60 | 61 | * `--vcf VCF_FILENAME`: Load a [VCF](http://www.1000genomes.org/wiki/analysis/variant%20call%20format/vcf-variant-call-format-version-41) file 62 | * `--maf MAF_FILENAME`: Load a TCGA [MAF](https://wiki.nci.nih.gov/display/TCGA/Mutation+Annotation+Format+%28MAF%29+Specification) file 63 | * `--variant CHR POS REF ALT : Specify an individual variant (requires --ensembl-version)` 64 | 65 | ### Output Format 66 | 67 | * `--output-csv OUTPUT_CSV_FILENAME`: Path to an output CSV file 68 | * `--output-html OUTPUT_HTML_FILENAME`: Path to an output HTML file 69 | 70 | ### RNA Expression Filtering 71 | 72 | Optional flags to use Cufflinks expression estimates for dropping epitopes 73 | arising from genes or transcripts that are not highly expressed. 74 | 75 | * `--rna-gene-fpkm-tracking-file RNA_GENE_FPKM_TRACKING_FILE`: Cufflinks FPKM tracking file 76 | containing gene expression estimates. 77 | * `--rna-min-gene-expression RNA_MIN_GENE_EXPRESSION`: Minimum FPKM for genes 78 | * `--rna-transcript-fpkm-tracking-file RNA_TRANSCRIPT_FPKM_TRACKING_FILE`: Cufflinks FPKM tracking 79 | file containing transcript expression estimates. 80 | * `--rna-min-transcript-expression RNA_MIN_TRANSCRIPT_EXPRESSION`: Minimum FPKM 81 | for transcripts 82 | * `--rna-transcript-fpkm-gtf-file RNA_TRANSCRIPT_FPKM_GTF_FILE`: StringTie GTF file 83 | file containing transcript expression estimates. 
84 | 85 | ### Choose an MHC Binding Predictor 86 | 87 | You *must* choose an MHC binding predictor using one of the following values 88 | for the `--mhc-predictor` flag: 89 | 90 | * `netmhc`: Local [NetMHC](http://www.cbs.dtu.dk/cgi-bin/nph-sw_request?netMHC) predictor (Topiary will attempt to automatically detect whether NetMHC 3.x or 4.0 is available) 91 | * `netmhcpan`: Local [NetMHCpan](http://www.cbs.dtu.dk/cgi-bin/nph-sw_request?netMHCpan) predictor 92 | * `netmhciipan`: Local [NetMHCIIpan](http://www.cbs.dtu.dk/cgi-bin/nph-sw_request?netMHCIIpan) predictor 93 | * `netmhccons`: Local [NetMHCcons](http://www.cbs.dtu.dk/cgi-bin/nph-sw_request?netMHCcons) 94 | * `random`: Random IC50 values 95 | * `smm`: Local [SMM](http://www.mhc-pathway.net/smm) predictor 96 | * `smm-pmbec`: Local [SMM-PMBEC](http://www.mhc-pathway.net/smmpmbec) predictor 97 | * `netmhcpan-iedb`: Use NetMHCpan via the IEDB web API 98 | * `netmhccons-iedb`: Use NetMHCcons via the IEDB web API 99 | * `smm-iedb`: Use SMM via the IEDB web API 100 | * `smm-pmbec-iedb`: Use SMM-PMBEC via the IEDB web API 101 | 102 | ### MHC Alleles 103 | You must specify the alleles to perform binding prediction for using one of 104 | the following flags: 105 | 106 | * `--mhc-alleles-file MHC_ALLELES_FILE`: Text file containing one allele name per 107 | line 108 | * `--mhc-alleles MHC_ALLELES`: Comma separated list of allele names, 109 | e.g. "HLA-A02:01,HLA-B07:02" 110 | 111 | ### Peptide Length 112 | 113 | * `--mhc-epitope-lengths MHC_EPITOPE_LENGTHS`: comma separated list of integers 114 | specifying which peptide lengths to use for MHC binding prediction 115 | 116 | ### Binding Prediction Filtering 117 | 118 | * `--only-novel-epitopes`: Topiary will normally keep all predicted epitopes, 119 | even those which occur in a given self-ligandome or don't overlap a mutated region 120 | of a protein. Use this flag to drop any epitopes which don't contain mutations 121 | or that occur elsewhere in the self-ligandome. 
122 | * `--ic50-cutoff IC50_CUTOFF`: Drop peptides with predicted IC50 nM greater 123 | than this value (typical value is 500.0) 124 | * `--percentile-cutoff PERCENTILE_CUTOFF`: Drop peptides with percentile rank 125 | of their predicted IC50 (among predictions for a particular allele) fall below 126 | this threshold (lower values are stricter filters, typical value is 2.0) 127 | 128 | ### Misc 129 | 130 | * `--padding-around-mutation PADDING_AROUND_MUTATION`: Include more unmutated residues 131 | around the mutation (useful when not using `--only-novel-epitopes`) 132 | * `--self-filter-directory SELF_FILTER_DIRECTORY`: Directory of files named by MHC allele 133 | containing a self peptide ligandome (peptides which should be excluded from 134 | results) 135 | * `--skip-variant-errors`: If a particular mutation causes an exception to be raised 136 | during annotation, you can skip it using this flag. 137 | 138 | -------------------------------------------------------------------------------- /RELEASING.md: -------------------------------------------------------------------------------- 1 | # Releasing Topiary 2 | 3 | This document explains what do once your [Pull Request](https://www.atlassian.com/git/tutorials/making-a-pull-request/) has been reviewed and all final changes applied. Now you're ready merge your branch into master and release it to the world: 4 | 5 | 1. Bump the [version](http://semver.org/) on __init__.py, as part of the PR you want to release. 6 | 2. Merge your branch into master. 7 | 3. After the Topiary unit tests complete successfully on Travis then the latest version 8 | of the code (with the version specified above) will be pushed to [PyPI](https://pypi.python.org/pypi) automatically. If you're curious about how automatic deployment is achieved, see our [Travis configuration](https://github.com/hammerlab/topiary/blob/master/.travis.yml#L58). 
9 | -------------------------------------------------------------------------------- /deploy.sh: -------------------------------------------------------------------------------- 1 | ./lint.sh && \ 2 | ./test.sh && \ 3 | python3 -m pip install --upgrade build && \ 4 | python3 -m pip install --upgrade twine && \ 5 | rm -rf dist && \ 6 | python3 -m build && \ 7 | git --version && \ 8 | python3 -m twine upload dist/* && \ 9 | git tag "$(python3 topiary/version.py)" && \ 10 | git push --tags 11 | 12 | -------------------------------------------------------------------------------- /develop.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | uv pip install -e . 4 | -------------------------------------------------------------------------------- /lint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -o errexit 3 | 4 | 5 | # disabling several categories of errors due to false positives in pylint, 6 | # see these issues: 7 | # - https://bitbucket.org/logilab/pylint/issues/701/false-positives-with-not-an-iterable-and 8 | # - https://bitbucket.org/logilab/pylint/issues/58 9 | 10 | find topiary/ -name '*.py' \ 11 | | xargs pylint \ 12 | --errors-only \ 13 | --disable=unsubscriptable-object,not-an-iterable,no-member,invalid-unary-operand-type 14 | 15 | echo 'Passes pylint check' 16 | -------------------------------------------------------------------------------- /pylintrc: -------------------------------------------------------------------------------- 1 | [TYPECHECK] 2 | # Without ignoring this, we get errors like: 3 | # E:249,20: Module 'numpy' has no 'nan' member (no-member) 4 | ignored-modules = numpy -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.7 2 | pandas>=0.13.1 3 | mhctools>=1.3.0 4 | varcode>=0.3.17 5 | 
pylint>=1.4.4 6 | nose>=1.3.6 7 | gtfparse>=0.0.4 8 | mhcnames 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2018. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function 16 | import os 17 | import re 18 | 19 | from setuptools import setup, find_packages 20 | 21 | readme_dir = os.path.dirname(__file__) 22 | readme_path = os.path.join(readme_dir, 'README.md') 23 | 24 | try: 25 | with open(readme_path, 'r') as f: 26 | readme_markdown = f.read() 27 | except: 28 | readme_markdown = "" 29 | 30 | try: 31 | import pypandoc 32 | readme_restructured = pypandoc.convert(readme_markdown, to='rst', format='md') 33 | except: 34 | readme_restructured = readme_markdown 35 | print( 36 | "Conversion of long_description from MD to reStructuredText failed...") 37 | 38 | with open('topiary/__init__.py', 'r') as f: 39 | version = re.search( 40 | r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', 41 | f.read(), 42 | re.MULTILINE).group(1) 43 | 44 | if not version: 45 | raise RuntimeError('Cannot find version information') 46 | 47 | if __name__ == '__main__': 48 | setup( 49 | name='topiary', 50 | version=version, 51 | description="Predict cancer epitopes from cancer sequence data", 52 | author="Alex Rubinsteyn, Tavi Nathanson", 53 | 
author_email="alex.rubinsteyn@gmail.com", 54 | url="https://github.com/hammerlab/topiary", 55 | license="http://www.apache.org/licenses/LICENSE-2.0.html", 56 | classifiers=[ 57 | 'Development Status :: 3 - Alpha', 58 | 'Environment :: Console', 59 | 'Operating System :: OS Independent', 60 | 'Intended Audience :: Science/Research', 61 | 'License :: OSI Approved :: Apache Software License', 62 | 'Programming Language :: Python', 63 | 'Topic :: Scientific/Engineering :: Bio-Informatics', 64 | ], 65 | install_requires=[ 66 | 'numpy >=1.7, <2.0', 67 | 'pandas >=0.13.1', 68 | 'mhctools >= 1.3.0', 69 | 'varcode >=0.3.17', 70 | 'nose >=1.3.6', 71 | 'gtfparse >=0.0.4', 72 | 'mhcnames', 73 | ], 74 | long_description=readme_restructured, 75 | packages=find_packages(exclude="test"), 76 | entry_points={ 77 | 'console_scripts': [ 78 | 'topiary = topiary.cli.script:main' 79 | ] 80 | } 81 | ) 82 | -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | pytest --cov=topiary/ --cov-report=term-missing tests 2 | 3 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/topiary/4ce5ed007a7a19d9666ba9f20cfcf5dfe745a4e3/test/__init__.py -------------------------------------------------------------------------------- /test/data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | Helper functions and shared datasets for tests 17 | """ 18 | 19 | 20 | from __future__ import print_function, division, absolute_import 21 | import os 22 | 23 | from varcode import Variant, VariantCollection 24 | from pyensembl import ensembl_grch38 25 | 26 | def data_path(name): 27 | """ 28 | Return the absolute path to a file in the varcode/test/data directory. 29 | The name specified should be relative to varcode/test/data. 30 | """ 31 | return os.path.join(os.path.dirname(__file__), "data", name) 32 | 33 | # BRAF variant coordinates from COSMIC entry: 34 | # http://cancer.sanger.ac.uk/cosmic/mutation/overview?id=476 35 | braf_V600E_variant = Variant(7, 140753336, "A", "T", ensembl_grch38) 36 | 37 | # TP53 variant coordinates from COSMIC entry: 38 | # http://cancer.sanger.ac.uk/cosmic/mutation/overview?id=10656 39 | tp53_R248W_variant = Variant(17, 7674221, "G", "A", ensembl_grch38) 40 | 41 | cancer_test_variants = VariantCollection([ 42 | braf_V600E_variant, 43 | tp53_R248W_variant 44 | ]) 45 | 46 | cancer_test_variant_gene_ids = { 47 | gene_id 48 | for v in cancer_test_variants 49 | for gene_id in v.gene_ids 50 | } 51 | 52 | cancer_test_variant_transcript_ids = { 53 | transcript_id 54 | for v in cancer_test_variants 55 | for transcript_id in v.transcript_ids 56 | } 57 | -------------------------------------------------------------------------------- /test/data/B16-StringTie-chr1-subset.gtf: -------------------------------------------------------------------------------- 1 | # StringTie version 1.1.2 2 | 
1 StringTie transcript 4492457 4493604 1000 - . gene_id "STRG.2"; transcript_id "STRG.2.2"; reference_id "ENSMUST00000192505"; ref_gene_id "ENSMUSG00000025902"; ref_gene_name "Sox17"; cov "1.168215"; FPKM "0.125126"; TPM "0.255858"; 3 | 1 StringTie exon 4492457 4493604 1000 - . gene_id "STRG.2"; transcript_id "STRG.2.2"; exon_number "1"; reference_id "ENSMUST00000192505"; ref_gene_id "ENSMUSG00000025902"; ref_gene_name "Sox17"; cov "1.168215"; 4 | 1 StringTie transcript 4492465 4493735 1000 - . gene_id "STRG.2"; transcript_id "STRG.2.1"; reference_id "ENSMUST00000191939"; ref_gene_id "ENSMUSG00000025902"; ref_gene_name "Sox17"; cov "6.349273"; FPKM "0.680062"; TPM "1.390592"; 5 | 1 StringTie exon 4492465 4492668 1000 - . gene_id "STRG.2"; transcript_id "STRG.2.1"; exon_number "1"; reference_id "ENSMUST00000191939"; ref_gene_id "ENSMUSG00000025902"; ref_gene_name "Sox17"; cov "20.261032"; 6 | 1 StringTie exon 4493100 4493735 1000 - . gene_id "STRG.2"; transcript_id "STRG.2.1"; exon_number "2"; reference_id "ENSMUST00000191939"; ref_gene_id "ENSMUSG00000025902"; ref_gene_name "Sox17"; cov "1.887011"; 7 | 1 StringTie transcript 4687934 4689403 1000 - . gene_id "STRG.1"; transcript_id "STRG.1.1"; reference_id "ENSMUST00000182774"; ref_gene_id "ENSMUSG00000098104"; ref_gene_name "Gm6085"; cov "0.504422"; FPKM "0.054028"; TPM "0.110476"; 8 | 1 StringTie exon 4687934 4689403 1000 - . 
gene_id "STRG.1"; transcript_id "STRG.1.1"; exon_number "1"; reference_id "ENSMUST00000182774"; ref_gene_id "ENSMUSG00000098104"; ref_gene_name "Gm6085"; cov "0.504422"; -------------------------------------------------------------------------------- /test/data/genes.fpkm_tracking: -------------------------------------------------------------------------------- 1 | tracking_id class_code nearest_ref_id gene_id gene_short_name tss_id locus length coverage FPKM FPKM_conf_lo FPKM_conf_hi FPKM_status 2 | ENSG00000240361 - - ENSG00000240361 OR4G11P - chr1:62947-63887 - - 0 0 0 OK 3 | ENSG00000268020 - - ENSG00000268020 AL627309.1 - chr1:53048-54936 - - 0 0 0 OK 4 | ENSG00000186092 - - ENSG00000186092 OR4F5 - chr1:69090-70008 - - 0 0 0 OK 5 | CUFF.1 - - CUFF.1 FAM138A - chr1:34553-36081 - - 0.0222016 0 0.0614304 OK 6 | CUFF.2 - - CUFF.2 DDX11L1 - chr1:11868-14412 - - 0 0 0.0497629 OK 7 | CUFF.3 - - CUFF.3 MIR1302-10 - chr1:29553-31109 - - 0 0 0.154007 OK 8 | CUFF.4 - - CUFF.4 WASH7P - chr1:14362-29806 - - 10.3844 9.63011 11.2268 OK 9 | ENSG00000269308 - - ENSG00000269308 AL645608.2 - chr1:818042-819983 - - 0 0 0 OK 10 | CUFF.5 - - CUFF.5 - - chr1:841474-842801 - - 0.172295 0.113069 0.231522 OK 11 | CUFF.6 - - CUFF.6 - - chr1:1-20 - - 0.172295 0.113069 0.231522 FAIL 12 | CUFF.7 - - CUFF.7 - - chr2:1-20 - - 0.172295 0.113069 0.231522 LOWDATA 13 | CUFF.8 - - CUFF.8 - - chr3:1-20 - - 0.172295 0.113069 0.231522 HIDATA 14 | -------------------------------------------------------------------------------- /test/data/isoforms.fpkm_tracking: -------------------------------------------------------------------------------- 1 | tracking_id class_code nearest_ref_id gene_id gene_short_name tss_id locus length coverage FPKM FPKM_conf_lo FPKM_conf_hi FPKM_status 2 | ENST00000492842 - - ENSG00000240361 OR4G11P - chr1:62947-63887 940 0 0 0 0 OK 3 | ENST00000594647 - - ENSG00000268020 AL627309.1 - chr1:53048-54936 126 0 0 0 0 OK 4 | ENST00000335137 - - ENSG00000186092 OR4F5 - 
chr1:69090-70008 918 0 0 0 0 OK 5 | ENST00000417324 - - CUFF.1 FAM138A - chr1:34553-36081 1187 0 0 0 0.0120385 OK 6 | ENST00000461467 - - CUFF.1 FAM138A - chr1:35244-36073 590 0.621469 0.0222016 0 0.0484398 OK 7 | ENST00000456328 - - CUFF.2 DDX11L1 - chr1:11868-14409 1657 0 0 0 0.0129358 LOWDATA 8 | ENST00000515242 - - CUFF.2 DDX11L1 - chr1:11871-14412 1653 0 0 0 0.00864472 LOWDATA 9 | ENST00000518655 - - CUFF.2 DDX11L1 - chr1:11873-14409 1483 0 0 0 0.00963569 OK 10 | ENST00000450305 - - CUFF.2 DDX11L1 - chr1:12009-13670 632 0 0 0 0.0226103 LOWDATA 11 | CUFF.7604.1 - - CUFF.7604 - - chr2:45395607-45402815 1004 4.73445 0.194033 0.113862 0.27754 OK 12 | ENST00000496445 - - CUFF.38259 VTI1A - chr10:114207021-114298405 853 0 0 0 0 FAIL -------------------------------------------------------------------------------- /test/data/tiny_test_ligandome_dir/A0201: -------------------------------------------------------------------------------- 1 | SIINFKEL 2 | QQQQQQQQ 3 | -------------------------------------------------------------------------------- /test/data/tiny_test_ligandome_dir/HLA-B0704: -------------------------------------------------------------------------------- 1 | RRRRRRRRR -------------------------------------------------------------------------------- /test/test_args_outputs.py: -------------------------------------------------------------------------------- 1 | from topiary.cli.args import arg_parser 2 | from topiary.cli.outputs import write_outputs 3 | import tempfile 4 | import pandas as pd 5 | from nose.tools import eq_ 6 | 7 | 8 | def test_write_outputs(): 9 | 10 | with tempfile.NamedTemporaryFile(mode="r+", delete=False) as f: 11 | df = pd.DataFrame({ 12 | "x": [1, 2, 3], 13 | "y": [10, 20, 30] 14 | }) 15 | args = arg_parser.parse_args([ 16 | "--output-csv", f.name, 17 | "--subset-output-columns", "x", 18 | "--rename-output-column", "x", "X", 19 | "--mhc-predictor", "random", 20 | "--mhc-alleles", "A0201", 21 | ]) 22 | 23 | write_outputs( 24 | df, 25 | 
args, 26 | print_df_before_filtering=True, 27 | print_df_after_filtering=True) 28 | print("File: %s" % f.name) 29 | df_from_file = pd.read_csv(f.name, index_col="#") 30 | 31 | df_expected = pd.DataFrame({ 32 | "X": [1, 2, 3]}) 33 | print(df_from_file) 34 | eq_(len(df_expected), len(df_from_file)) 35 | assert (df_expected == df_from_file).all().all() 36 | -------------------------------------------------------------------------------- /test/test_cli_protein_changes.py: -------------------------------------------------------------------------------- 1 | from nose.tools import eq_ 2 | from topiary.cli.protein_changes import protein_change_effects_from_args 3 | from topiary.cli.args import create_arg_parser 4 | 5 | arg_parser = create_arg_parser( 6 | mhc=False, 7 | rna=False, 8 | output=False) 9 | 10 | def test_protein_change_effects_from_args_substitutions(): 11 | args = arg_parser.parse_args([ 12 | "--protein-change", "EGFR", "T790M", 13 | "--genome", "grch37", 14 | ]) 15 | 16 | effects = protein_change_effects_from_args(args) 17 | eq_(len(effects), 1) 18 | effect = effects[0] 19 | eq_(effect.aa_ref, "T") 20 | eq_(effect.aa_mutation_start_offset, 789) 21 | eq_(effect.aa_alt, "M") 22 | 23 | transcript = effect.transcript 24 | eq_(transcript.name, "EGFR-001") 25 | 26 | def test_protein_change_effects_from_args_malformed_missing_ref(): 27 | 28 | args = arg_parser.parse_args([ 29 | "--protein-change", "EGFR", "790M", 30 | "--genome", "grch37"]) 31 | 32 | effects = protein_change_effects_from_args(args) 33 | eq_(len(effects), 0) 34 | 35 | def test_protein_change_effects_from_args_malformed_missing_alt(): 36 | args = arg_parser.parse_args([ 37 | "--protein-change", "EGFR", "T790", 38 | "--genome", "grch37"]) 39 | effects = protein_change_effects_from_args(args) 40 | eq_(len(effects), 0) 41 | 42 | def test_protein_change_effects_from_args_multiple_effects(): 43 | args = arg_parser.parse_args([ 44 | "--protein-change", "EGFR", "T790M", 45 | "--protein-change", "KRAS", 
"G10D", 46 | "--genome", "grch37"]) 47 | effects = protein_change_effects_from_args(args) 48 | print(effects) 49 | eq_(len(effects), 2) 50 | -------------------------------------------------------------------------------- /test/test_contains_mutant_residues.py: -------------------------------------------------------------------------------- 1 | from nose.tools import eq_ 2 | from topiary import contains_mutant_residues 3 | 4 | def test_contains_mutant_residues_before(): 5 | eq_( 6 | contains_mutant_residues( 7 | peptide_start_in_protein=10, 8 | peptide_length=9, 9 | mutation_start_in_protein=5, 10 | mutation_end_in_protein=6), 11 | False) 12 | 13 | 14 | def test_contains_mutant_residues_after(): 15 | eq_( 16 | contains_mutant_residues( 17 | peptide_start_in_protein=10, 18 | peptide_length=9, 19 | mutation_start_in_protein=25, 20 | mutation_end_in_protein=26), 21 | False) 22 | 23 | def test_contains_mutant_residues_inside(): 24 | eq_( 25 | contains_mutant_residues( 26 | peptide_start_in_protein=10, 27 | peptide_length=9, 28 | mutation_start_in_protein=12, 29 | mutation_end_in_protein=13), 30 | True) 31 | 32 | def test_contains_mutant_residues_deletion_before_beginning(): 33 | # peptide only contains the residue *after* the mutation 34 | # so it still looks like it's wildtype 35 | eq_( 36 | contains_mutant_residues( 37 | peptide_start_in_protein=10, 38 | peptide_length=9, 39 | mutation_start_in_protein=10, 40 | mutation_end_in_protein=10), 41 | False) 42 | 43 | 44 | def test_contains_mutant_residues_deletion_at_beginning(): 45 | # peptide contains mutation before *and* after mutation so 46 | # it should count as having a mutant juxtaposition of residues 47 | eq_( 48 | contains_mutant_residues( 49 | peptide_start_in_protein=10, 50 | peptide_length=9, 51 | mutation_start_in_protein=11, 52 | mutation_end_in_protein=11), 53 | True) 54 | 55 | def test_contains_mutant_residues_deletion_after_end(): 56 | # peptide only contains the residue *before* the mutation 57 | # so it 
still looks like it's wildtype 58 | eq_( 59 | contains_mutant_residues( 60 | peptide_start_in_protein=10, 61 | peptide_length=9, 62 | mutation_start_in_protein=19, 63 | mutation_end_in_protein=19), 64 | False) 65 | 66 | def test_contains_mutant_residues_deletion_at_end(): 67 | # peptide contains mutation before *and* after mutation so 68 | # it should count as having a mutant juxtaposition of residues 69 | eq_( 70 | contains_mutant_residues( 71 | peptide_start_in_protein=10, 72 | peptide_length=9, 73 | mutation_start_in_protein=18, 74 | mutation_end_in_protein=18), 75 | True) 76 | -------------------------------------------------------------------------------- /test/test_dataframe.py: -------------------------------------------------------------------------------- 1 | 2 | from mhctools import NetMHC 3 | from topiary import TopiaryPredictor 4 | from .data import cancer_test_variants 5 | 6 | alleles = [ 7 | 'A02:01', 8 | 'B*07:02', 9 | 'HLA-C*07:02', 10 | ] 11 | 12 | mhc_model = NetMHC( 13 | alleles=alleles, 14 | default_peptide_lengths=[8, 9, 10]) 15 | 16 | DEFAULT_FPKM = 1.0 17 | 18 | def test_epitopes_to_dataframe_transcript_expression(): 19 | predictor = TopiaryPredictor( 20 | mhc_model=mhc_model, 21 | only_novel_epitopes=False) 22 | df = predictor.predict_from_variants( 23 | variants=cancer_test_variants, 24 | transcript_expression_dict={ 25 | transcript_id: DEFAULT_FPKM 26 | for variant in cancer_test_variants 27 | for transcript_id in variant.transcript_ids 28 | }) 29 | 30 | assert "transcript_expression" in df.columns, \ 31 | "transcript_expression missing from %s" % (df.columns,) 32 | assert(df["transcript_expression"] == DEFAULT_FPKM).all(), \ 33 | "Invalid FPKM values in DataFrame transcript_expression column" 34 | 35 | def test_epitopes_to_dataframe_gene_expression(): 36 | predictor = TopiaryPredictor( 37 | mhc_model=mhc_model, 38 | only_novel_epitopes=False) 39 | 40 | df = predictor.predict_from_variants( 41 | variants=cancer_test_variants, 42 | 
gene_expression_dict={ 43 | gene_id: DEFAULT_FPKM 44 | for variant in cancer_test_variants 45 | for gene_id in variant.gene_ids 46 | }) 47 | 48 | assert "gene_expression" in df.columns, \ 49 | "gene_expression missing from %s" % (df.columns,) 50 | assert(df["gene_expression"] == DEFAULT_FPKM).all(), \ 51 | "Invalid FPKM values in DataFrame gene_expression column" 52 | -------------------------------------------------------------------------------- /test/test_effect_expression_filters.py: -------------------------------------------------------------------------------- 1 | 2 | from .data import ( 3 | cancer_test_variants, 4 | cancer_test_variant_gene_ids, 5 | cancer_test_variant_transcript_ids 6 | ) 7 | from topiary.filters import apply_effect_expression_filters 8 | 9 | cancer_test_effects = cancer_test_variants.effects() 10 | 11 | DEFAULT_FPKM = 1.0 12 | 13 | # associate every gene ID with 1.0 FPKM 14 | gene_expression_dict = { 15 | gene_id: DEFAULT_FPKM 16 | for gene_id in cancer_test_variant_gene_ids 17 | } 18 | 19 | # associate every transcript with 1.0 FPKM 20 | transcript_expression_dict = { 21 | transcript_id: DEFAULT_FPKM 22 | for transcript_id in cancer_test_variant_transcript_ids 23 | } 24 | 25 | 26 | def test_apply_effect_gene_expression_below_threshold(): 27 | filtered = apply_effect_expression_filters( 28 | cancer_test_effects, 29 | gene_expression_dict=gene_expression_dict, 30 | gene_expression_threshold=2 * DEFAULT_FPKM, 31 | transcript_expression_dict=None, 32 | transcript_expression_threshold=None) 33 | assert len(filtered) == 0, \ 34 | "All variants should have been filtered out but got: %s" % (filtered,) 35 | 36 | def test_apply_effect_gene_expression_above_threshold(): 37 | filtered = apply_effect_expression_filters( 38 | cancer_test_effects, 39 | gene_expression_dict=gene_expression_dict, 40 | gene_expression_threshold=0.5 * DEFAULT_FPKM, 41 | transcript_expression_dict=None, 42 | transcript_expression_threshold=None) 43 | assert len(filtered) == 
len(cancer_test_effects), \ 44 | "Expected %s effects but got %s" % (len( 45 | cancer_test_effects), len(filtered)) 46 | 47 | def test_apply_effect_gene_expression_equal_threshold(): 48 | # expect genes with expression at threshold to NOT get filtered 49 | filtered = apply_effect_expression_filters( 50 | cancer_test_effects, 51 | gene_expression_dict=gene_expression_dict, 52 | gene_expression_threshold=DEFAULT_FPKM, 53 | transcript_expression_dict=None, 54 | transcript_expression_threshold=None) 55 | assert len(filtered) == len(cancer_test_effects), \ 56 | "Expected %s effects but got %s" % (len( 57 | cancer_test_effects), len(filtered)) 58 | 59 | def test_apply_effect_transcript_expression_below_threshold(): 60 | filtered = apply_effect_expression_filters( 61 | cancer_test_effects, 62 | gene_expression_dict=None, 63 | gene_expression_threshold=None, 64 | transcript_expression_dict=transcript_expression_dict, 65 | transcript_expression_threshold=2 * DEFAULT_FPKM) 66 | assert len(filtered) == 0, \ 67 | "All effects should have been filtered out but got: %s" % (filtered,) 68 | 69 | def test_apply_effect_transcript_expression_above_threshold(): 70 | filtered = apply_effect_expression_filters( 71 | cancer_test_effects, 72 | gene_expression_dict=None, 73 | gene_expression_threshold=None, 74 | transcript_expression_dict=transcript_expression_dict, 75 | transcript_expression_threshold=0.5 * DEFAULT_FPKM) 76 | assert len(filtered) == len(cancer_test_effects), \ 77 | "Expected %s effects but got %s" % ( 78 | len(cancer_test_effects), len(filtered)) 79 | 80 | def test_apply_effect_transcript_expression_equal_threshold(): 81 | # expect transcripts with expression at threshold to NOT be filtered 82 | filtered = apply_effect_expression_filters( 83 | cancer_test_effects, 84 | gene_expression_dict=None, 85 | gene_expression_threshold=None, 86 | transcript_expression_dict=transcript_expression_dict, 87 | transcript_expression_threshold=DEFAULT_FPKM) 88 | assert len(filtered) == 
len(cancer_test_effects), \ 89 | "Expected %s effects but got %s" % ( 90 | len(cancer_test_effects), len(filtered)) 91 | -------------------------------------------------------------------------------- /test/test_epitopes_from_commandline_args.py: -------------------------------------------------------------------------------- 1 | from nose.tools import eq_ 2 | 3 | from topiary.cli.args import arg_parser, predict_epitopes_from_args 4 | 5 | from .data import cancer_test_variants 6 | 7 | 8 | def test_cancer_epitopes_from_args(): 9 | epitope_lengths = [9, 10] 10 | alleles = ["HLA-A*02:01", "C0701"] 11 | args_list = [ 12 | "--mhc-predictor", "netmhc", 13 | "--mhc-epitope-lengths", ",".join(str(x) for x in epitope_lengths), 14 | "--mhc-alleles", ",".join(alleles), 15 | "--genome", "GRCh38", 16 | "--only-novel-epitopes", 17 | ] 18 | for variant in cancer_test_variants: 19 | args_list.append("--variant") 20 | args_list.append(str(variant.contig)) 21 | args_list.append(str(variant.start)) 22 | args_list.append(variant.ref) 23 | args_list.append(variant.alt) 24 | 25 | parsed_args = arg_parser.parse_args(args_list) 26 | epitope_predictions = predict_epitopes_from_args(parsed_args) 27 | expected_number_of_epitopes = 0 28 | for epitope_length in epitope_lengths: 29 | expected_number_of_epitopes += epitope_length * len(cancer_test_variants) * len(alleles) 30 | eq_(len(epitope_predictions), expected_number_of_epitopes) 31 | -------------------------------------------------------------------------------- /test/test_load_cufflinks_fpkm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | test_cufflinks : Test that we can correctly load Cufflinks tracking files which 17 | contain the estimated expression levels of genes and isoforms (computed from 18 | RNA-Seq reads). 19 | """ 20 | 21 | 22 | from __future__ import print_function, division, absolute_import 23 | 24 | from topiary.rna import load_cufflinks_dataframe 25 | 26 | from nose.tools import eq_ 27 | 28 | from .data import data_path 29 | 30 | def test_load_cufflinks_genes(): 31 | genes_df = load_cufflinks_dataframe( 32 | data_path("genes.fpkm_tracking"), 33 | drop_lowdata=True, 34 | drop_hidata=True, 35 | drop_failed=True, 36 | drop_novel=False) 37 | gene_ids = set(genes_df.id) 38 | expected_gene_ids = { 39 | "ENSG00000240361", 40 | "ENSG00000268020", 41 | "ENSG00000186092", 42 | "ENSG00000269308", 43 | "CUFF.1", 44 | "CUFF.2", 45 | "CUFF.3", 46 | "CUFF.4", 47 | "CUFF.5" 48 | } 49 | eq_(gene_ids, expected_gene_ids) 50 | 51 | def test_load_cufflinks_genes_drop_novel(): 52 | genes_df = load_cufflinks_dataframe( 53 | data_path("genes.fpkm_tracking"), 54 | drop_lowdata=True, 55 | drop_hidata=True, 56 | drop_failed=True, 57 | drop_novel=True) 58 | gene_ids = set(genes_df.id) 59 | expected_gene_ids = { 60 | "ENSG00000240361", 61 | "ENSG00000268020", 62 | "ENSG00000186092", 63 | "ENSG00000269308", 64 | } 65 | eq_(gene_ids, expected_gene_ids) 66 | 67 | 68 | def test_load_cufflinks_isoforms(): 69 | transcripts_df = load_cufflinks_dataframe( 70 | data_path("isoforms.fpkm_tracking"), 71 | drop_lowdata=True, 72 | drop_hidata=True, 73 | 
drop_failed=True, 74 | drop_novel=False) 75 | transcript_ids = set(transcripts_df.id) 76 | expected_transcript_ids = { 77 | "ENST00000492842", 78 | "ENST00000594647", 79 | "ENST00000335137", 80 | "ENST00000417324", 81 | "ENST00000461467", 82 | "ENST00000518655", 83 | "CUFF.7604.1", 84 | } 85 | eq_(transcript_ids, expected_transcript_ids) 86 | 87 | def test_load_cufflinks_isoforms_drop_novel(): 88 | transcripts_df = load_cufflinks_dataframe( 89 | data_path("isoforms.fpkm_tracking"), 90 | drop_lowdata=True, 91 | drop_hidata=True, 92 | drop_failed=True, 93 | drop_novel=True) 94 | transcript_ids = set(transcripts_df.id) 95 | expected_transcript_ids = { 96 | "ENST00000492842", 97 | "ENST00000594647", 98 | "ENST00000335137", 99 | "ENST00000417324", 100 | "ENST00000461467", 101 | "ENST00000518655", 102 | } 103 | eq_(transcript_ids, expected_transcript_ids) 104 | -------------------------------------------------------------------------------- /test/test_load_stringtie_gtf_fpkm.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division, absolute_import 2 | 3 | from topiary.rna import load_transcript_fpkm_dict_from_gtf 4 | 5 | from nose.tools import eq_ 6 | 7 | from .data import data_path 8 | 9 | 10 | def test_load_stringtie_gtf_transcripts(): 11 | transcript_fpkms = load_transcript_fpkm_dict_from_gtf( 12 | data_path("B16-StringTie-chr1-subset.gtf")) 13 | transcript_ids = set(transcript_fpkms.keys()) 14 | expected_fpkms_dict = { 15 | "ENSMUST00000192505": 0.125126, 16 | "ENSMUST00000191939": 0.680062, 17 | "ENSMUST00000182774": 0.054028, 18 | } 19 | expected_transcript_ids = set(expected_fpkms_dict.keys()) 20 | eq_(expected_transcript_ids, transcript_ids) 21 | for transcript_id, fpkm in expected_fpkms_dict.items(): 22 | eq_(fpkm, transcript_fpkms[transcript_id]) 23 | -------------------------------------------------------------------------------- /test/test_mutant_epitope_predictions_class1.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-2017. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from __future__ import print_function, division, absolute_import 17 | 18 | from mhctools import NetMHCpan 19 | from nose.tools import eq_, raises 20 | from pyensembl import ensembl_grch37 21 | from topiary import TopiaryPredictor 22 | from varcode import Variant, VariantCollection 23 | 24 | # TODO: find out about these variants, 25 | # what do we expect from them? Are they SNVs? 
26 | variants = VariantCollection([ 27 | Variant( 28 | contig=10, 29 | start=100018900, 30 | ref='C', 31 | alt='T', 32 | ensembl=ensembl_grch37), 33 | Variant( 34 | contig=11, 35 | start=32861682, 36 | ref='G', 37 | alt='A', 38 | ensembl=ensembl_grch37)]) 39 | 40 | alleles = [ 41 | 'A02:01', 42 | 'a0204', 43 | 'B*07:02', 44 | 'HLA-B14:02', 45 | 'HLA-C*07:02', 46 | 'hla-c07:01' 47 | ] 48 | 49 | mhc_model = NetMHCpan( 50 | alleles=alleles, 51 | default_peptide_lengths=[9]) 52 | 53 | 54 | def test_epitope_prediction_without_padding(): 55 | output_without_padding = TopiaryPredictor( 56 | mhc_model=mhc_model, 57 | only_novel_epitopes=True).predict_from_variants(variants=variants) 58 | # one prediction for each variant * number of alleles 59 | strong_binders = output_without_padding[output_without_padding.affinity <= 500] 60 | eq_(len(strong_binders), 5) 61 | 62 | @raises(ValueError) 63 | def test_epitope_prediction_with_invalid_padding(): 64 | TopiaryPredictor( 65 | mhc_model=mhc_model, 66 | padding_around_mutation=7).predict_from_variants(variants=variants) 67 | 68 | 69 | @raises(ValueError) 70 | def test_epitope_prediction_with_invalid_zero_padding():  # NOTE(review): duplicates the previous test — it also passes padding_around_mutation=7, so the "zero padding" case is never exercised; a literal 0 may be treated as falsy by check_padding_around_mutation and silently replaced by the default padding instead of raising — confirm intent before changing the value 71 | TopiaryPredictor( 72 | mhc_model=mhc_model, 73 | padding_around_mutation=7).predict_from_variants(variants=variants) 74 | 75 | 76 | def test_epitope_prediction_with_valid_padding(): 77 | predictor = TopiaryPredictor( 78 | mhc_model=mhc_model, 79 | padding_around_mutation=8, 80 | only_novel_epitopes=True) 81 | output_with_padding = predictor.predict_from_variants(variants=variants) 82 | # 6 alleles * 2 mutations * 9 distinct windows = 108 83 | eq_(len(output_with_padding), 108) 84 | -------------------------------------------------------------------------------- /test/test_mutant_epitope_predictions_class2.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015.
Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from __future__ import print_function, division, absolute_import 17 | 18 | from mhctools import NetMHCIIpan 19 | from nose.tools import eq_ 20 | from pyensembl import ensembl_grch37 21 | from topiary import TopiaryPredictor 22 | from varcode import Variant, VariantCollection 23 | 24 | # TODO: find out about these variants, 25 | # what do we expect from them? Are they SNVs? 
26 | variants = VariantCollection([ 27 | Variant( 28 | contig=10, 29 | start=100018900, 30 | ref='C', 31 | alt='T', 32 | ensembl=ensembl_grch37), 33 | Variant( 34 | contig=11, 35 | start=32861682, 36 | ref='G', 37 | alt='A', 38 | ensembl=ensembl_grch37)]) 39 | 40 | alleles = [ 41 | "HLA-DPA1*01:05/DPB1*100:01", 42 | "DRB10102" 43 | ] 44 | 45 | mhc_model = NetMHCIIpan( 46 | alleles=alleles, 47 | default_peptide_lengths=[15, 16]) 48 | 49 | def test_netmhcii_pan_epitopes(): 50 | epitope_predictions = TopiaryPredictor( 51 | mhc_model=mhc_model, 52 | only_novel_epitopes=True).predict_from_variants(variants=variants) 53 | 54 | # expect (15 + 16 mutant peptides) * (2 alleles) * 2 variants = 55 | # 124 total epitope predictions 56 | eq_(len(epitope_predictions), 124) 57 | unique_alleles = set(epitope_predictions.allele) 58 | assert len(unique_alleles) == 2, \ 59 | "Expected 2 unique alleles, got %s" % (unique_alleles,) 60 | unique_lengths = set(epitope_predictions.peptide_length) 61 | assert unique_lengths == {15, 16}, \ 62 | "Expected epitopes of length 15 and 16 but got lengths %s" % (unique_lengths,) 63 | -------------------------------------------------------------------------------- /test/test_padding.py: -------------------------------------------------------------------------------- 1 | from nose.tools import eq_, assert_raises 2 | from topiary import check_padding_around_mutation 3 | 4 | def test_default_padding(): 5 | # expect padding to be one less than the largest epitope length 6 | eq_(check_padding_around_mutation(None, [8, 9, 10]), 9) 7 | 8 | def test_invalid_padding(): 9 | # padding is insufficient for the epitope lengths given 10 | with assert_raises(ValueError): 11 | check_padding_around_mutation(2, [9]) 12 | -------------------------------------------------------------------------------- /test/test_peptide_mutation_interval.py: -------------------------------------------------------------------------------- 1 | from nose.tools import eq_, assert_raises 2 
| from topiary import peptide_mutation_interval 3 | 4 | def test_peptide_mutation_interval_middle(): 5 | start, end = peptide_mutation_interval( 6 | peptide_start_in_protein=10, 7 | peptide_length=9, 8 | mutation_start_in_protein=11, 9 | mutation_end_in_protein=12) 10 | eq_(start, 1) 11 | eq_(end, 2) 12 | 13 | 14 | def test_peptide_mutation_interval_start(): 15 | start, end = peptide_mutation_interval( 16 | peptide_start_in_protein=10, 17 | peptide_length=9, 18 | mutation_start_in_protein=7, 19 | mutation_end_in_protein=12) 20 | eq_(start, 0) 21 | eq_(end, 2) 22 | 23 | def test_peptide_mutation_interval_end(): 24 | start, end = peptide_mutation_interval( 25 | peptide_start_in_protein=10, 26 | peptide_length=9, 27 | mutation_start_in_protein=18, 28 | mutation_end_in_protein=20) 29 | eq_(start, 8) 30 | eq_(end, 9) 31 | 32 | def test_peptide_mutation_interval_deletion(): 33 | start, end = peptide_mutation_interval( 34 | peptide_start_in_protein=10, 35 | peptide_length=9, 36 | mutation_start_in_protein=15, 37 | mutation_end_in_protein=15) 38 | eq_(start, 5) 39 | eq_(end, 5) 40 | 41 | 42 | def test_peptide_mutation_interval_no_overlap_before(): 43 | with assert_raises(ValueError): 44 | peptide_mutation_interval( 45 | peptide_start_in_protein=10, 46 | peptide_length=9, 47 | mutation_start_in_protein=5, 48 | mutation_end_in_protein=6) 49 | 50 | def test_peptide_mutation_interval_no_overlap_after(): 51 | with assert_raises(ValueError): 52 | peptide_mutation_interval( 53 | peptide_start_in_protein=10, 54 | peptide_length=9, 55 | mutation_start_in_protein=25, 56 | mutation_end_in_protein=26) 57 | -------------------------------------------------------------------------------- /test/test_rna_helpers.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from topiary.rna.cufflinks import parse_locus_column 3 | from nose.tools import eq_ 4 | 5 | 6 | def test_parse_locus_column_with_chr(): 7 | """ 8 | 
test_parse_locus_column_with_chr: Test that 'chr' prefix from 9 | chromosome names gets correctly dropped 10 | """ 11 | df = pd.DataFrame({"locus": ["chr1:10-20", "chrX:30-40"]}) 12 | loci = df["locus"] 13 | chromosomes, starts, ends = parse_locus_column(loci) 14 | eq_(list(chromosomes), ["1", "X"]) 15 | eq_(list(starts), [10, 30]) 16 | eq_(list(ends), [20, 40]) 17 | 18 | 19 | def test_parse_locus_column_without_chr(): 20 | """ 21 | test_parse_locus_column_without_chr: Test that chromosome names can be 22 | parsed without 'chr' prefix 23 | """ 24 | df = pd.DataFrame({"locus": ["1:10-20", "X:30-40"]}) 25 | loci = df["locus"] 26 | chromosomes, starts, ends = parse_locus_column(loci) 27 | eq_(list(chromosomes), ["1", "X"]) 28 | eq_(list(starts), [10, 30]) 29 | eq_(list(ends), [20, 40]) 30 | -------------------------------------------------------------------------------- /test/test_variant_expression_filters.py: -------------------------------------------------------------------------------- 1 | 2 | from topiary.filters import apply_variant_expression_filters 3 | 4 | from .data import ( 5 | cancer_test_variants, 6 | cancer_test_variant_gene_ids, 7 | cancer_test_variant_transcript_ids, 8 | ) 9 | 10 | DEFAULT_FPKM = 1.0 11 | 12 | # associate every gene ID with 1.0 FPKM 13 | gene_expression_dict = { 14 | gene_id: DEFAULT_FPKM 15 | for gene_id in cancer_test_variant_gene_ids 16 | } 17 | 18 | # associate every transcript with 1.0 FPKM 19 | transcript_expression_dict = { 20 | transcript_id: DEFAULT_FPKM 21 | for transcript_id in cancer_test_variant_transcript_ids 22 | } 23 | 24 | def test_apply_variant_gene_expression_below_threshold(): 25 | filtered = apply_variant_expression_filters( 26 | cancer_test_variants, 27 | gene_expression_dict=gene_expression_dict, 28 | gene_expression_threshold=2 * DEFAULT_FPKM, 29 | transcript_expression_dict=None, 30 | transcript_expression_threshold=None) 31 | assert len(filtered) == 0, \ 32 | "All variants should have been filtered out but 
def test_apply_variant_gene_expression_below_threshold():
    # threshold above every gene's FPKM: nothing should survive
    kept = apply_variant_expression_filters(
        cancer_test_variants,
        gene_expression_dict=gene_expression_dict,
        gene_expression_threshold=2 * DEFAULT_FPKM,
        transcript_expression_dict=None,
        transcript_expression_threshold=None)
    assert len(kept) == 0, \
        "All variants should have been filtered out but got: %s" % (kept,)


def test_apply_variant_gene_expression_above_threshold():
    # threshold below every gene's FPKM: everything should survive
    kept = apply_variant_expression_filters(
        cancer_test_variants,
        gene_expression_dict=gene_expression_dict,
        gene_expression_threshold=0.5 * DEFAULT_FPKM,
        transcript_expression_dict=None,
        transcript_expression_threshold=None)
    n_expected = len(cancer_test_variants)
    n_kept = len(kept)
    assert n_kept == n_expected, \
        "Expected %s variants but got %s" % (n_expected, n_kept)


def test_apply_variant_transcript_expression_below_threshold():
    # threshold above every transcript's FPKM: nothing should survive
    kept = apply_variant_expression_filters(
        cancer_test_variants,
        gene_expression_dict=None,
        gene_expression_threshold=None,
        transcript_expression_dict=transcript_expression_dict,
        transcript_expression_threshold=2 * DEFAULT_FPKM)
    assert len(kept) == 0, \
        "All variants should have been filtered out but got: %s" % (kept,)


def test_apply_variant_transcript_expression_above_threshold():
    # threshold below every transcript's FPKM: everything should survive
    kept = apply_variant_expression_filters(
        cancer_test_variants,
        gene_expression_dict=None,
        gene_expression_threshold=None,
        transcript_expression_dict=transcript_expression_dict,
        transcript_expression_threshold=0.5 * DEFAULT_FPKM)
    n_expected = len(cancer_test_variants)
    n_kept = len(kept)
    assert n_kept == n_expected, \
        "Expected %s variants but got %s" % (n_expected, n_kept)
"peptide_mutation_interval", 16 | "protein_subsequences_around_mutations", 17 | ] 18 | -------------------------------------------------------------------------------- /topiary/cli/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /topiary/cli/args.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """ 16 | Common commandline arguments used by scripts 17 | """ 18 | 19 | from __future__ import print_function, division, absolute_import 20 | 21 | from argparse import ArgumentParser 22 | from mhctools.cli import add_mhc_args, mhc_binding_predictor_from_args 23 | from varcode.cli import add_variant_args, variant_collection_from_args 24 | 25 | from .filtering import add_filter_args 26 | from .rna import ( 27 | add_rna_args, 28 | rna_gene_expression_dict_from_args, 29 | rna_transcript_expression_dict_from_args, 30 | ) 31 | from .sequence import add_sequence_args 32 | from .errors import add_error_args 33 | from .outputs import add_output_args 34 | from .protein_changes import add_protein_change_args 35 | from ..predictor import TopiaryPredictor 36 | 37 | def create_arg_parser( 38 | rna=True, 39 | mhc=True, 40 | variants=True, 41 | protein_changes=True, 42 | filters=True, 43 | sequence_options=True, 44 | error_options=True, 45 | output=True): 46 | arg_parser = ArgumentParser() 47 | if rna: 48 | add_rna_args(arg_parser) 49 | if mhc: 50 | add_mhc_args(arg_parser) 51 | if variants: 52 | add_variant_args(arg_parser) 53 | if protein_changes: 54 | add_protein_change_args(arg_parser) 55 | if filters: 56 | add_filter_args(arg_parser) 57 | if sequence_options: 58 | add_sequence_args(arg_parser) 59 | if error_options: 60 | add_error_args(arg_parser) 61 | if output: 62 | add_output_args(arg_parser) 63 | return arg_parser 64 | 65 | # keeping global instance for backwards compatibility with existing code 66 | arg_parser = create_arg_parser() 67 | 68 | def predict_epitopes_from_args(args): 69 | """ 70 | Returns an epitope collection from the given commandline arguments. 
def predict_epitopes_from_args(args):
    """
    Returns an epitope collection from the given commandline arguments.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed commandline arguments for Topiary
    """
    # build inputs in the same order as before: MHC model, variants,
    # then the (optional) expression dictionaries
    binding_model = mhc_binding_predictor_from_args(args)
    variant_collection = variant_collection_from_args(args)
    gene_fpkm_dict = rna_gene_expression_dict_from_args(args)
    transcript_fpkm_dict = rna_transcript_expression_dict_from_args(args)

    predictor = TopiaryPredictor(
        mhc_model=binding_model,
        padding_around_mutation=args.padding_around_mutation,
        ic50_cutoff=args.ic50_cutoff,
        percentile_cutoff=args.percentile_cutoff,
        min_transcript_expression=args.rna_min_transcript_expression,
        min_gene_expression=args.rna_min_gene_expression,
        only_novel_epitopes=args.only_novel_epitopes,
        raise_on_error=not args.skip_variant_errors)
    return predictor.predict_from_variants(
        variants=variant_collection,
        transcript_expression_dict=transcript_fpkm_dict,
        gene_expression_dict=gene_fpkm_dict)
14 | 15 | """ 16 | Commandline arguments related to error handling 17 | """ 18 | 19 | from __future__ import print_function, division, absolute_import 20 | 21 | def add_error_args(arg_parser): 22 | error_group = arg_parser.add_argument_group( 23 | title="Errors", 24 | description="Options for error handling") 25 | 26 | error_group.add_argument( 27 | "--skip-variant-errors", 28 | default=False, 29 | action="store_true", 30 | help="Skip variants which cause runtime errors of any kind") 31 | 32 | return error_group 33 | -------------------------------------------------------------------------------- /topiary/cli/filtering.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | """ 17 | Commandline arguments related to epitope filtering 18 | """ 19 | 20 | from __future__ import print_function, division, absolute_import 21 | 22 | def add_filter_args(arg_parser): 23 | filter_group = arg_parser.add_argument_group( 24 | title="Filtering Options", 25 | description="Criteria for removing epitopes from results") 26 | 27 | filter_group.add_argument( 28 | "--ic50-cutoff", 29 | help="Drop epitopes with predicted IC50 nM affinity above this value", 30 | default=None, 31 | type=float) 32 | 33 | filter_group.add_argument( 34 | "--percentile-cutoff", 35 | help="Drop epitopes with predicted IC50 percentile rank above this value", 36 | default=None, 37 | type=float) 38 | 39 | filter_group.add_argument( 40 | "--only-novel-epitopes", 41 | help="".join([ 42 | "Drop epitopes which do not contain mutated residues or occur ", 43 | "in the self-ligandome."]), 44 | default=False, 45 | action="store_true") 46 | 47 | filter_group.add_argument( 48 | "--wildtype-ligandome-directory", 49 | help="".join([ 50 | "Directory of 'self' ligand peptide sets, in files named ", 51 | "by allele (e.g. 'A0201'). Any predicted mutant epitope which ", 52 | "is in the files associated with the given alleles is treated as ", 53 | "wildtype (non-mutated)."])) 54 | return filter_group 55 | -------------------------------------------------------------------------------- /topiary/cli/outputs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | Common commandline arguments for output files 17 | """ 18 | 19 | from __future__ import print_function, division, absolute_import 20 | 21 | import logging 22 | 23 | def add_output_args(arg_parser): 24 | output_group = arg_parser.add_argument_group( 25 | title="Output", 26 | description="How and where to write results") 27 | 28 | output_group.add_argument( 29 | "--output-csv", 30 | default=None, 31 | help="Path to output CSV file") 32 | 33 | output_group.add_argument( 34 | "--output-html", 35 | default=None, 36 | help="Path to output HTML file") 37 | 38 | output_group.add_argument( 39 | "--output-csv-sep", 40 | default=",", 41 | help="Separator for CSV file") 42 | 43 | output_group.add_argument( 44 | "--subset-output-columns", 45 | nargs="*") 46 | 47 | output_group.add_argument( 48 | "--rename-output-column", 49 | nargs=2, 50 | action="append", 51 | help=( 52 | "Rename original column (first parameter) to new" 53 | " name (second parameter)")) 54 | 55 | output_group.add_argument( 56 | "--print-columns", 57 | default=False, 58 | action="store_true", 59 | help="Print columns before writing data to file(s)") 60 | 61 | return output_group 62 | 63 | def write_outputs( 64 | df, 65 | args, 66 | print_df_before_filtering=False, 67 | print_df_after_filtering=False): 68 | if print_df_before_filtering: 69 | print(df) 70 | 71 | if args.subset_output_columns: 72 | subset_columns = [] 73 | for column in args.subset_output_columns: 74 | if column not in df.columns: 75 | logging.warn( 76 | "Invalid column name '%s', 
def write_outputs(
        df,
        args,
        print_df_before_filtering=False,
        print_df_after_filtering=False):
    """
    Write a DataFrame of epitope predictions to the requested output
    file(s), after applying the column subset/rename options.

    Parameters
    ----------
    df : pandas.DataFrame
        Epitope predictions; not modified by this function.

    args : argparse.Namespace
        Parsed arguments containing the options added by add_output_args.

    print_df_before_filtering : bool
        Print the DataFrame before column subsetting/renaming.

    print_df_after_filtering : bool
        Print the DataFrame after column subsetting/renaming.
    """
    if print_df_before_filtering:
        print(df)

    if args.subset_output_columns:
        subset_columns = []
        for column in args.subset_output_columns:
            if column not in df.columns:
                # logging.warn is a deprecated alias (removed in Python 3.13)
                logging.warning(
                    "Invalid column name '%s', available: %s" % (
                        column, list(df.columns)))
            else:
                subset_columns.append(column)
        df = df[subset_columns]

    if args.rename_output_column:
        for (old_name, new_name) in args.rename_output_column:
            if old_name not in df.columns:
                logging.warning(
                    "Can't rename column '%s' since it doesn't exist, available: %s" % (
                        old_name, list(df.columns)))
            else:
                # rename on a copy rather than mutating the caller's
                # DataFrame (previous inplace=True leaked the rename
                # back to the caller when no subset was requested)
                df = df.rename(columns={old_name: new_name})

    if print_df_after_filtering:
        print(df)

    if args.print_columns:
        print("Columns:")
        for column in df.columns:
            print("-- %s" % column)

    if args.output_csv:
        print("Saving %s..." % args.output_csv)
        df.to_csv(
            args.output_csv,
            index=True,
            index_label="#",
            sep=args.output_csv_sep)

    if args.output_html:
        print("Saving %s..." % args.output_html)
        df.to_html(args.output_html, index=True)
def add_protein_change_args(arg_parser):
    """
    Register options for specifying protein changes directly
    (without an associated genomic variant).

    Parameters
    ----------
    arg_parser : argparse.ArgumentParser

    Returns the parser itself (unlike the other add_*_args helpers,
    which return the argument group).
    """
    group = arg_parser.add_argument_group(
        title="Protein Changes",
        description="Input protein changes without associated genomic variants")
    group.add_argument(
        "--protein-change",
        nargs=2,
        action="append",
        default=[],
        help="Protein modification without genomic variant (e.g. EGFR T790M)")
    return arg_parser

def genome_from_args(args):
    """
    Return the reference genome named by --genome, or GRCh38 when the
    genome is expected to be inferred from the input file(s).
    """
    if args.genome:
        return infer_genome(args.genome)
    # no genome specified, assume it can be inferred from the file(s)
    # we're loading
    return ensembl_grch38
def transcript_sort_key(transcript):
    """
    Sort key ordering transcripts by decreasing protein length, then
    decreasing transcript (nucleotide) sequence length, then name.

    The two lengths are negated so a single ascending sort puts the
    longest sequences first while names still sort alphabetically
    (TP53-001 before TP53-002) — `reverse=True` would flip the name
    ordering too.
    """
    protein_len = len(transcript.protein_sequence)
    nucleotide_len = len(transcript.sequence)
    return (-protein_len, -nucleotide_len, transcript.name)

def best_transcript(transcripts):
    """
    Given a non-empty set of coding transcripts, choose the one with the
    longest protein sequence; break ties by total transcript sequence
    length (including UTRs) and then by transcript name.
    """
    assert len(transcripts) > 0
    # min() with the same key returns the first minimal element,
    # identical to sorted(...)[0]
    return min(transcripts, key=transcript_sort_key)
def protein_change_effects_from_args(args):
    """
    Build a varcode EffectCollection of Substitution effects from the
    (gene name, protein change) pairs given via --protein-change.

    For each change (e.g. "T790M") every protein coding transcript of
    the gene is inspected; only transcripts whose protein sequence
    actually contains the reference amino acids at the given position
    are candidates, and the effect is attached to the longest such
    transcript (see best_transcript). Unparseable changes and unknown
    gene names are warned about and skipped.

    Parameters
    ----------
    args : argparse.Namespace

    Returns varcode.EffectCollection
    """
    genome = genome_from_args(args)
    valid_gene_names = set(genome.gene_names())
    # e.g. "T790M" -> ("T", "790", "M"); fullmatch (rather than match)
    # rejects inputs with trailing junk such as "T790M?" instead of
    # silently ignoring the extra characters
    substitution_regex = re.compile(r"([A-Z]+)([0-9]+)([A-Z]+)")
    effects = []
    for gene_name, protein_change_string in args.protein_change:
        match_obj = substitution_regex.fullmatch(protein_change_string)
        if match_obj is None:
            # logging.warn is a deprecated alias (removed in Python 3.13)
            logging.warning(
                "Unable to parse protein modification: '%s'" % protein_change_string)
            continue

        ref, base1_pos, alt = match_obj.groups()
        base1_pos = int(base1_pos)

        if gene_name not in valid_gene_names:
            logging.warning("Invalid gene name '%s' in protein modification: '%s'" % (
                gene_name, protein_change_string))
            continue

        candidate_transcripts = []
        for candidate_gene in genome.genes_by_name(gene_name):
            for candidate_transcript in candidate_gene.transcripts:
                if not candidate_transcript.is_protein_coding:
                    continue
                protein_sequence = candidate_transcript.protein_sequence
                if protein_sequence is None:
                    continue
                if len(protein_sequence) < (base1_pos + len(ref) - 1):
                    # protein sequence too short for this modification
                    # e.g. EGFR T790M can't happen in an EGFR transcript
                    # with only 789 amino acids
                    continue

                seq_at_pos = protein_sequence[base1_pos - 1: base1_pos + len(ref) - 1]
                if seq_at_pos != ref:
                    # if this transcript doesn't have the same reference amino
                    # acids as the change then skip it and use a different
                    # transcript
                    continue
                candidate_transcripts.append(candidate_transcript)
        if len(candidate_transcripts) > 0:
            transcript = best_transcript(candidate_transcripts)
            effects.append(Substitution(
                variant=None,
                transcript=transcript,
                aa_ref=ref,
                aa_alt=alt,
                aa_mutation_start_offset=base1_pos - 1))
    return EffectCollection(effects)
14 | 15 | """ 16 | Common commandline arguments for filtering by gene/transcript expression 17 | """ 18 | 19 | from __future__ import print_function, division, absolute_import 20 | 21 | from ..rna import ( 22 | load_cufflinks_fpkm_dict, 23 | load_transcript_fpkm_dict_from_gtf 24 | ) 25 | 26 | def add_rna_args(arg_parser): 27 | rna_group = arg_parser.add_argument_group( 28 | title="RNA-Seq", 29 | description="Transcript and gene abundance quantification") 30 | 31 | rna_group.add_argument( 32 | "--rna-transcript-fpkm-tracking-file", 33 | help="".join([ 34 | "Cufflinks tracking file (FPKM estimates for Ensembl transcripts). ", 35 | "Used both for expression filtering and selecting the most abundant ", 36 | "transcript to use for determining a mutant protein sequence."])) 37 | 38 | rna_group.add_argument( 39 | "--rna-transcript-fpkm-gtf-file", 40 | help="".join([ 41 | "GTF file containing FPKM estimates for Ensembl transcripts.", 42 | "Used both for expression filtering and selecting the most abundant ", 43 | "transcript to use for determining a mutant protein sequence."])) 44 | 45 | rna_group.add_argument( 46 | "--rna-min-transcript-expression", 47 | help="Minimum FPKM for transcript expression", 48 | default=0.0, 49 | type=float) 50 | 51 | rna_group.add_argument( 52 | "--rna-gene-fpkm-tracking-file", 53 | help="Cufflinks tracking file (FPKM estimates for Ensembl genes)", 54 | required=False) 55 | 56 | rna_group.add_argument( 57 | "--rna-min-gene-expression", 58 | help="Minimum FPKM for gene expression", 59 | default=0.0, 60 | type=float) 61 | 62 | return rna_group 63 | 64 | def rna_gene_expression_dict_from_args(args): 65 | """ 66 | Returns a dictionary mapping Ensembl gene IDs to FPKM expression values 67 | or None if neither Cufflinks tracking file nor StringTie GTF file specified 68 | in the commandline arguments. 
def rna_gene_expression_dict_from_args(args):
    """
    Returns a dictionary mapping Ensembl gene IDs to FPKM expression values
    or None if no Cufflinks gene tracking file was specified in the
    commandline arguments.

    Note: unlike transcript expression, gene expression can currently only
    be loaded from a Cufflinks tracking file (no StringTie GTF support).
    """
    if args.rna_gene_fpkm_tracking_file:
        return load_cufflinks_fpkm_dict(args.rna_gene_fpkm_tracking_file)
    return None

def rna_transcript_expression_dict_from_args(args):
    """
    Returns a dictionary mapping Ensembl transcript IDs to FPKM expression
    values or None if neither Cufflinks tracking file nor StringTie GTF file
    were specified.
    """
    if args.rna_transcript_fpkm_tracking_file:
        return load_cufflinks_fpkm_dict(args.rna_transcript_fpkm_tracking_file)
    if args.rna_transcript_fpkm_gtf_file:
        return load_transcript_fpkm_dict_from_gtf(
            args.rna_transcript_fpkm_gtf_file)
    return None
18 | 19 | Example usage: 20 | topiary \ 21 | --mhc-predictor netmhcpan 22 | --mhc-alleles-file HLA.txt 23 | --vcf somatic.vcf 24 | --rna-gene-fpkm-file genes.fpkm_tracking 25 | --rna-transcript-fpkm-file isoforms.fpkm_tracking 26 | --filter-ic50 500 27 | --filter-percentile 2 28 | --output results.csv 29 | """ 30 | 31 | from __future__ import print_function, division, absolute_import 32 | 33 | import sys 34 | 35 | from .args import arg_parser, predict_epitopes_from_args 36 | 37 | from .outputs import write_outputs 38 | 39 | 40 | def parse_args(args_list=None): 41 | if args_list is None: 42 | args_list = sys.argv[1:] 43 | return arg_parser.parse_args(args_list) 44 | 45 | def main(args_list=None): 46 | """ 47 | Script entry-point to predict neo-epitopes from genomic variants using 48 | Topiary. 49 | """ 50 | args = parse_args(args_list) 51 | print("Topiary commandline arguments:") 52 | print(args) 53 | df = predict_epitopes_from_args(args) 54 | write_outputs(df, args) 55 | print("Total count: %d" % len(df)) 56 | -------------------------------------------------------------------------------- /topiary/cli/sequence.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | """ 17 | Commandline arguments related to translated variant protein sequences. 
18 | """ 19 | 20 | from __future__ import print_function, division, absolute_import 21 | 22 | def add_sequence_args(arg_parser): 23 | sequence_group = arg_parser.add_argument_group( 24 | title="Protein Sequence Options", 25 | description="Parameters related to the mutant protein sequence") 26 | 27 | sequence_group.add_argument( 28 | "--padding-around-mutation", 29 | default=None, 30 | help="".join([ 31 | "How many extra amino acids to include on either side of a mutation.", 32 | "Default is determined by epitope lengths but can be overridden to ", 33 | "predict wildtype epitopes in a larger context around a mutant residue.", 34 | ]), 35 | type=int) 36 | 37 | return sequence_group 38 | -------------------------------------------------------------------------------- /topiary/filters.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """ 16 | Helper functions for filtering variants, effects, and epitope predictions 17 | """ 18 | 19 | from __future__ import print_function, division, absolute_import 20 | import logging 21 | 22 | from varcode import NonsilentCodingMutation 23 | 24 | def apply_filter( 25 | filter_fn, 26 | collection, 27 | result_fn=None, 28 | filter_name="", 29 | collection_name=""): 30 | """ 31 | Apply filter to effect collection and print number of dropped elements 32 | 33 | Parameters 34 | ---------- 35 | """ 36 | n_before = len(collection) 37 | filtered = [x for x in collection if filter_fn(x)] 38 | n_after = len(filtered) 39 | if not collection_name: 40 | collection_name = collection.__class__.__name__ 41 | logging.info( 42 | "%s filtering removed %d/%d entries of %s", 43 | filter_name, 44 | (n_before - n_after), 45 | n_before, 46 | collection_name) 47 | return result_fn(filtered) if result_fn else collection.__class__(filtered) 48 | 49 | def filter_silent_and_noncoding_effects(effects): 50 | """ 51 | Keep only variant effects which result in modified proteins. 
def filter_silent_and_noncoding_effects(effects):
    """
    Keep only variant effects which result in modified proteins.

    Parameters
    ----------
    effects : varcode.EffectCollection
        Predicted effects; anything that is not a NonsilentCodingMutation
        (silent, intronic, noncoding, &c) is dropped.
    """
    return apply_filter(
        filter_fn=lambda effect: isinstance(effect, NonsilentCodingMutation),
        collection=effects,
        result_fn=effects.clone_with_new_elements,
        filter_name="Silent mutation")


def apply_variant_expression_filters(
        variants,
        gene_expression_dict,
        gene_expression_threshold,
        transcript_expression_dict,
        transcript_expression_threshold):
    """
    Filter a collection of variants by gene and transcript expression thresholds

    A variant is kept if any of its overlapping genes (or transcripts)
    meets the corresponding threshold; IDs missing from a dictionary are
    treated as 0.0 expression. Either dictionary may be None/empty to
    skip that filter entirely.

    Parameters
    ----------
    variants : varcode.VariantCollection

    gene_expression_dict : dict
        Maps Ensembl gene IDs to FPKM expression values (or None).

    gene_expression_threshold : float
        Minimum FPKM for a gene to count as expressed.

    transcript_expression_dict : dict
        Maps Ensembl transcript IDs to FPKM expression values (or None).

    transcript_expression_threshold : float
        Minimum FPKM for a transcript to count as expressed.
    """
    if gene_expression_dict:
        variants = apply_filter(
            lambda variant: any(
                gene_expression_dict.get(gene_id, 0.0) >=
                gene_expression_threshold
                for gene_id in variant.gene_ids
            ),
            variants,
            result_fn=variants.clone_with_new_elements,
            filter_name="Variant gene expression (min=%0.4f)" % gene_expression_threshold)
    if transcript_expression_dict:
        variants = apply_filter(
            lambda variant: any(
                transcript_expression_dict.get(transcript_id, 0.0) >=
                transcript_expression_threshold
                for transcript_id in variant.transcript_ids
            ),
            variants,
            result_fn=variants.clone_with_new_elements,
            filter_name=(
                "Variant transcript expression (min=%0.4f)" % (
                    transcript_expression_threshold,)))
    return variants
def apply_effect_expression_filters(
        effects,
        gene_expression_dict,
        gene_expression_threshold,
        transcript_expression_dict,
        transcript_expression_threshold):
    """
    Filter collection of varcode effects by given gene
    and transcript expression thresholds.

    Unlike the variant filters, each effect is associated with a single
    gene and transcript, so the thresholds apply to those IDs directly
    (missing IDs count as 0.0 expression). Either dictionary may be
    None/empty to skip that filter.

    Parameters
    ----------
    effects : varcode.EffectCollection

    gene_expression_dict : dict
        Maps Ensembl gene IDs to FPKM expression values (or None).

    gene_expression_threshold : float
        Minimum FPKM for the effect's gene.

    transcript_expression_dict : dict
        Maps Ensembl transcript IDs to FPKM expression values (or None).

    transcript_expression_threshold : float
        Minimum FPKM for the effect's transcript.
    """
    if gene_expression_dict:
        effects = apply_filter(
            lambda effect: (
                gene_expression_dict.get(effect.gene_id, 0.0) >=
                gene_expression_threshold),
            effects,
            result_fn=effects.clone_with_new_elements,
            # label normalized to "min=" for consistency with the other
            # expression filters' log messages
            filter_name="Effect gene expression (min=%0.4f)" % gene_expression_threshold)

    if transcript_expression_dict:
        effects = apply_filter(
            lambda effect: (
                transcript_expression_dict.get(effect.transcript_id, 0.0) >=
                transcript_expression_threshold
            ),
            effects,
            result_fn=effects.clone_with_new_elements,
            filter_name=(
                "Effect transcript expression (min=%0.4f)" % (
                    transcript_expression_threshold,)))
    return effects
from __future__ import print_function, division, absolute_import

import logging

from collections import OrderedDict


from .filters import (
    apply_effect_expression_filters,
    apply_variant_expression_filters,
    filter_silent_and_noncoding_effects,
)
from .sequence_helpers import (
    protein_subsequences_around_mutations,
    check_padding_around_mutation,
    contains_mutant_residues,
    peptide_mutation_interval,
)

class TopiaryPredictor(object):
    # Predicts neo-epitopes by running an MHC binding model over mutant
    # protein subsequences and filtering the results.
    def __init__(
            self,
            mhc_model,
            padding_around_mutation=None,
            ic50_cutoff=None,
            percentile_cutoff=None,
            min_gene_expression=0.0,
            min_transcript_expression=0.0,
            only_novel_epitopes=False,
            raise_on_error=True):
        """
        Parameters
        ----------
        mhc_model : mhctools.BasePredictor
            Any instance of a peptide-MHC binding affinity predictor

        padding_around_mutation : int
            How many residues surrounding a mutation to consider including in a
            candidate epitope. Default is the minimum size necessary for epitope
            length of the mhc model.

        min_gene_expression : float, optional
            If gene expression values are provided, only keep effects on
            genes with expression above this threshold.

        min_transcript_expression : float, optional
            If transcript expression values are provided, only keep effects on
            transcripts with expression above this threshold.

        ic50_cutoff : float, optional
            Maximum predicted IC50 value for a peptide to be considered a binder.

        percentile_cutoff : float, optional
            Maximum percentile rank of IC50 values for a peptide to be considered
            a binder.

        only_novel_epitopes : bool, optional
            If True, then drop peptides which don't contain a mutation.
            TODO: make this also check that peptide doesn't occur elsewhere in
            the reference ligandome

        raise_on_error : bool
            Raise an exception if error is encountered or skip
            the variant or peptide which generated the error.
        """
        self.mhc_model = mhc_model
        # validated/derived padding; falls back to what the MHC model's
        # default peptide lengths require
        self.padding_around_mutation = check_padding_around_mutation(
            given_padding=padding_around_mutation,
            epitope_lengths=self.mhc_model.default_peptide_lengths)
        self.ic50_cutoff = ic50_cutoff
        self.percentile_cutoff = percentile_cutoff
        self.min_transcript_expression = min_transcript_expression
        self.min_gene_expression = min_gene_expression
        self.only_novel_epitopes = only_novel_epitopes
        self.raise_on_error = raise_on_error

    def predict_from_named_sequences(
            self, name_to_sequence_dict):
        """
        Run the MHC model over all subsequences of the named input
        sequences and normalize the resulting column names.

        Parameters
        ----------
        name_to_sequence_dict : (str->str) dict
            Dictionary mapping sequence names to amino acid sequences

        Returns pandas.DataFrame with the following columns:
            - source_sequence_name
            - peptide
            - peptide_offset
            - peptide_length
            - allele
            - affinity
            - percentile_rank
            - prediction_method_name
        """
        df = self.mhc_model.predict_subsequences_dataframe(name_to_sequence_dict)
        # rename mhctools' generic column names to Topiary's convention
        return df.rename(
            columns={
                "length": "peptide_length",
                "offset": "peptide_offset"})
118 | 119 | Parameters 120 | ---------- 121 | sequences : list of str 122 | Multiple amino acid sequences (without any names or IDs) 123 | 124 | Returns DataFrame with the following fields: 125 | - source_sequence 126 | - peptide 127 | - peptide_offset 128 | - peptide_length 129 | - allele 130 | - affinity 131 | - percentile_rank 132 | - prediction_method_name 133 | """ 134 | # make each sequence its own unique ID 135 | sequence_dict = { 136 | seq: seq 137 | for seq in sequences 138 | } 139 | df = self.predict_from_named_sequences(sequence_dict) 140 | return df.rename(columns={"source_sequence_name": "source_sequence"}) 141 | 142 | def predict_from_mutation_effects( 143 | self, 144 | effects, 145 | transcript_expression_dict=None, 146 | gene_expression_dict=None): 147 | """Given a Varcode.EffectCollection of predicted protein effects, 148 | return predicted epitopes around each mutation. 149 | 150 | Parameters 151 | ---------- 152 | effects : Varcode.EffectCollection 153 | 154 | transcript_expression_dict : dict 155 | Dictionary mapping transcript IDs to RNA expression estimates. Used 156 | both for transcript expression filtering and for selecting the 157 | most abundant transcript for a particular variant. If omitted then 158 | transcript selection is done using priority of variant effects and 159 | transcript length. 
160 | 161 | gene_expression_dict : dict, optional 162 | Dictionary mapping gene IDs to RNA expression estimates 163 | 164 | Returns DataFrame with the following columns: 165 | - variant 166 | - gene 167 | - gene_id 168 | - transcript_id 169 | - transcript_name 170 | - effect 171 | - effect_type 172 | - peptide 173 | - peptide_offset 174 | - peptide_length 175 | - allele 176 | - affinity 177 | - percentile_rank 178 | - prediction_method_name 179 | - contains_mutant_residues 180 | - mutation_start_in_peptide 181 | - mutation_end_in_peptide 182 | 183 | Optionall will also include the following columns if corresponding 184 | expression dictionary inputs are provided: 185 | - gene_expression 186 | - transcript_expression 187 | """ 188 | 189 | # we only care about effects which impact the coding sequence of a 190 | # protein 191 | effects = filter_silent_and_noncoding_effects(effects) 192 | 193 | effects = apply_effect_expression_filters( 194 | effects, 195 | transcript_expression_dict=transcript_expression_dict, 196 | transcript_expression_threshold=self.min_transcript_expression, 197 | gene_expression_dict=gene_expression_dict, 198 | gene_expression_threshold=self.min_gene_expression) 199 | 200 | # group by variants, so that we end up with only one mutant 201 | # sequence per mutation 202 | variant_effect_groups = effects.groupby_variant() 203 | 204 | if len(variant_effect_groups) == 0: 205 | logging.warn("No candidates for MHC binding prediction") 206 | return [] 207 | 208 | if transcript_expression_dict: 209 | # if expression data is available, then for each variant 210 | # keep the effect annotation for the most abundant transcript 211 | top_effects = [ 212 | variant_effects.top_expression_effect( 213 | transcript_expression_dict) 214 | for variant_effects in variant_effect_groups.values() 215 | ] 216 | else: 217 | # if no transcript abundance data is available, then 218 | # for each variant keep the effect with the most significant 219 | # predicted effect on the 
protein sequence, along with using 220 | # transcript/CDS length as a tie-breaker for effects with the same 221 | # priority. 222 | top_effects = [ 223 | variant_effects.top_priority_effect() 224 | for variant_effects in variant_effect_groups.values() 225 | ] 226 | 227 | # 1) dictionary mapping varcode effect objects to subsequences 228 | # around each mutation 229 | # 2) dictionary mapping varcode effect to start offset of subsequence 230 | # within the full mutant protein sequence 231 | effect_to_subsequence_dict, effect_to_offset_dict = \ 232 | protein_subsequences_around_mutations( 233 | effects=top_effects, 234 | padding_around_mutation=self.padding_around_mutation) 235 | 236 | # since we know that each set of variant effects has been 237 | # reduced to a single 'top priority' effect, we can uniquely 238 | # identify each variant sequence by its original genomic variant 239 | variant_string_to_effect_dict = { 240 | effect.variant.short_description: effect 241 | for effect in effect_to_subsequence_dict.keys() 242 | } 243 | variant_string_to_subsequence_dict = { 244 | effect.variant.short_description: subseq 245 | for (effect, subseq) in effect_to_subsequence_dict.items() 246 | } 247 | variant_string_to_offset_dict = { 248 | effect.variant.short_description: subseq_offset 249 | for (effect, subseq_offset) in effect_to_offset_dict.items() 250 | } 251 | df = self.predict_from_named_sequences(variant_string_to_subsequence_dict) 252 | logging.info("MHC predictor returned %d peptide binding predictions" % ( 253 | len(df))) 254 | 255 | # since we used variant descrptions as the name of each sequence 256 | # let's rename that column to be more informative 257 | df = df.rename(columns={"source_sequence_name": "variant"}) 258 | 259 | # adjust offset to be relative to start of protein, rather 260 | # than whatever subsequence we used for prediction 261 | def compute_peptide_offset_relative_to_protein(row): 262 | subsequence_offset = 
variant_string_to_offset_dict[row.variant] 263 | return row.peptide_offset + subsequence_offset 264 | 265 | df["peptide_offset"] = df.apply( 266 | compute_peptide_offset_relative_to_protein, 267 | axis=1) 268 | 269 | if self.ic50_cutoff: 270 | df = df[df.affinity <= self.ic50_cutoff] 271 | logging.info("Kept %d predictions after filtering affinity <= %f" % ( 272 | len(df), self.ic50_cutoff)) 273 | 274 | if self.percentile_cutoff: 275 | df = df[df.percentile_rank <= self.percentile_cutoff] 276 | logging.info("Kept %d predictions after filtering percentile <= %f" % ( 277 | len(df), self.percentile_cutoff)) 278 | 279 | extra_columns = OrderedDict([ 280 | ('gene', []), 281 | ('gene_id', []), 282 | ('transcript_id', []), 283 | ('transcript_name', []), 284 | ('effect', []), 285 | ('effect_type', []), 286 | ('contains_mutant_residues', []), 287 | ('mutation_start_in_peptide', []), 288 | ('mutation_end_in_peptide', []), 289 | ]) 290 | if gene_expression_dict is not None: 291 | extra_columns["gene_expression"] = [] 292 | if transcript_expression_dict is not None: 293 | extra_columns["transcript_expression"] = [] 294 | 295 | for _, row in df.iterrows(): 296 | effect = variant_string_to_effect_dict[row.variant] 297 | mutation_start_in_protein = effect.aa_mutation_start_offset 298 | mutation_end_in_protein = effect.aa_mutation_end_offset 299 | peptide_length = len(row.peptide) 300 | is_mutant = contains_mutant_residues( 301 | peptide_start_in_protein=row.peptide_offset, 302 | peptide_length=peptide_length, 303 | mutation_start_in_protein=mutation_start_in_protein, 304 | mutation_end_in_protein=mutation_end_in_protein) 305 | if is_mutant: 306 | mutation_start_in_peptide, mutation_end_in_peptide = peptide_mutation_interval( 307 | peptide_start_in_protein=row.peptide_offset, 308 | peptide_length=peptide_length, 309 | mutation_start_in_protein=mutation_start_in_protein, 310 | mutation_end_in_protein=mutation_end_in_protein) 311 | else: 312 | mutation_start_in_peptide = 
mutation_end_in_peptide = None 313 | 314 | extra_columns["gene"].append(effect.gene_name) 315 | gene_id = effect.gene_id 316 | extra_columns["gene_id"].append(gene_id) 317 | if gene_expression_dict is not None: 318 | extra_columns["gene_expression"].append( 319 | gene_expression_dict.get(gene_id, 0.0)) 320 | 321 | transcript_id = effect.transcript_id 322 | extra_columns["transcript_id"].append(transcript_id) 323 | extra_columns["transcript_name"].append(effect.transcript_name) 324 | if transcript_expression_dict is not None: 325 | extra_columns["transcript_expression"].append( 326 | transcript_expression_dict.get(transcript_id, 0.0)) 327 | 328 | extra_columns["effect"].append(effect.short_description) 329 | extra_columns["effect_type"].append(effect.__class__.__name__) 330 | 331 | extra_columns["contains_mutant_residues"].append(is_mutant) 332 | extra_columns["mutation_start_in_peptide"].append(mutation_start_in_peptide) 333 | extra_columns["mutation_end_in_peptide"].append(mutation_end_in_peptide) 334 | 335 | for col, values in extra_columns.items(): 336 | df[col] = values 337 | 338 | # TODO: add extra boolean field 339 | # novel = is_mutant | not_in_reference 340 | # Requires keeping a quick lookup structure for all peptides in 341 | # the reference proteome 342 | if self.only_novel_epitopes: 343 | df = df[df.contains_mutant_residues] 344 | 345 | return df 346 | 347 | def predict_from_variants( 348 | self, 349 | variants, 350 | transcript_expression_dict=None, 351 | gene_expression_dict=None): 352 | """ 353 | Predict epitopes from a Variant collection, filtering options, and 354 | optional gene and transcript expression data. 355 | 356 | Parameters 357 | ---------- 358 | variants : varcode.VariantCollection 359 | 360 | transcript_expression_dict : dict 361 | Maps from Ensembl transcript IDs to FPKM expression values. 362 | 363 | gene_expression_dict : dict, optional 364 | Maps from Ensembl gene IDs to FPKM expression values. 
365 | 366 | Returns DataFrame with the following columns: 367 | - variant 368 | - gene 369 | - gene_id 370 | - transcript_id 371 | - transcript_name 372 | - effect 373 | - effect_type 374 | - peptide 375 | - peptide_offset 376 | - peptide_length 377 | - allele 378 | - affinity 379 | - percentile_rank 380 | - prediction_method_name 381 | - contains_mutant_residues 382 | - mutation_start_in_peptide 383 | - mutation_end_in_peptide 384 | 385 | Optionall will also include the following columns if corresponding 386 | expression dictionary inputs are provided: 387 | - gene_expression 388 | - transcript_expression 389 | """ 390 | # pre-filter variants by checking if any of the genes or 391 | # transcripts they overlap have sufficient expression. 392 | # I'm tolerating the redundancy of this code since it's much cheaper 393 | # to filter a variant *before* trying to predict its impact/effect 394 | # on the protein sequence. 395 | variants = apply_variant_expression_filters( 396 | variants, 397 | transcript_expression_dict=transcript_expression_dict, 398 | transcript_expression_threshold=self.min_transcript_expression, 399 | gene_expression_dict=gene_expression_dict, 400 | gene_expression_threshold=self.min_gene_expression) 401 | 402 | effects = variants.effects(raise_on_error=self.raise_on_error) 403 | 404 | return self.predict_from_mutation_effects( 405 | effects=effects, 406 | transcript_expression_dict=transcript_expression_dict, 407 | gene_expression_dict=gene_expression_dict) 408 | -------------------------------------------------------------------------------- /topiary/rna/__init__.py: -------------------------------------------------------------------------------- 1 | from .cufflinks import ( 2 | load_cufflinks_dataframe, 3 | load_cufflinks_dict, 4 | load_cufflinks_fpkm_dict, 5 | ) 6 | from .gtf import load_transcript_fpkm_dict_from_gtf 7 | 8 | __all__ = [ 9 | "load_cufflinks_dataframe", 10 | "load_cufflinks_dict", 11 | "load_cufflinks_fpkm_dict", 12 | 
"load_transcript_fpkm_dict_from_gtf", 13 | ] 14 | -------------------------------------------------------------------------------- /topiary/rna/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function, division, absolute_import 16 | 17 | import re 18 | 19 | def infer_delimiter(filename, comment_char="#", n_lines=3): 20 | """ 21 | Given a file which contains data separated by one of the following: 22 | - commas 23 | - tabs 24 | - spaces 25 | Return the most likely separator by sniffing the first few lines 26 | of the file's contents. 
27 | """ 28 | lines = [] 29 | with open(filename, "r") as f: 30 | for line in f: 31 | if line.startswith(comment_char): 32 | continue 33 | if len(lines) < n_lines: 34 | lines.append(line) 35 | else: 36 | break 37 | if len(lines) < n_lines: 38 | raise ValueError( 39 | "Not enough lines in %s to infer delimiter" % filename) 40 | candidate_delimiters = ["\t", ",", "\s+"] 41 | for candidate_delimiter in candidate_delimiters: 42 | counts = [len(re.split(candidate_delimiter, line)) for line in lines] 43 | first_line_count = counts[0] 44 | if all(c == first_line_count for c in counts) and first_line_count > 1: 45 | return candidate_delimiter 46 | raise ValueError("Could not determine delimiter for %s" % filename) 47 | 48 | 49 | def check_required_columns(df, filename, required_columns): 50 | """ 51 | Ensure that all required columns are present in the given dataframe, 52 | otherwise raise an exception. 53 | """ 54 | available_columns = set(df.columns) 55 | for column_name in required_columns: 56 | if column_name not in available_columns: 57 | raise ValueError("FPKM tracking file %s missing column '%s'" % ( 58 | filename, 59 | column_name)) 60 | -------------------------------------------------------------------------------- /topiary/rna/cufflinks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
def parse_locus_column(loci):
    """
    Split a Series of locus strings like "chr1:132-394" (or "1:132-394")
    into three Series: chromosome names (without any "chr" prefix),
    integer start positions, and integer end positions.
    """
    # FIX: all patterns are now raw strings; "\d" in a plain string literal
    # is an invalid escape sequence (SyntaxWarning on Python >= 3.12)
    # capture all characters before ':' (drop 'chr' if present)
    chromosomes = loci.str.extract(r"(?:chr)?([^:]*):.*", expand=False)
    # capture all characters after e.g. 'chr1:', which look like '132-394'
    ranges = loci.str.extract(r"(?:chr)?[^:]*:(.*)", expand=False)
    # capture all numbers before the dash
    starts = ranges.str.extract(r"(\d*)-\d*", expand=False).astype(int)
    # capture all numbers after the dash
    ends = ranges.str.extract(r"\d*-(\d*)", expand=False).astype(int)
    return chromosomes, starts, ends
"genes.tracking_fpkm" 71 | 72 | id_column : str, optional 73 | 74 | fpkm_column : str, optional 75 | 76 | status_column : str, optional 77 | Name of column which indicates the FPKM estimate status. The column 78 | name is typically "FPKM_status". Possible contained within this column 79 | will be OK, FAIL, LOWDATA, HIDATA. 80 | 81 | locus_column : str, optional 82 | 83 | gene_names_column : str, optional 84 | 85 | drop_failed : bool, optional 86 | Drop rows whose FPKM status is "FAIL" (default=True) 87 | 88 | drop_lowdata : bool, optional 89 | Drop rows whose FPKM status is "LOWDATA", meaning that Cufflinks thought 90 | there were too few reads to accurately estimate the FPKM (default=False) 91 | 92 | drop_hidata : bool, optional 93 | Drop rows whose FPKM status is "HIDATA", meaning that too many 94 | fragments aligned to a feature for Cufflinks to process. Dropping 95 | the most expressed genes seems like a stupid idea so: default=False 96 | 97 | replace_hidata_fpkm_value : float, optional 98 | If drop_hidata=False, the HIDATA entries will still have an FPKM=0.0, 99 | this argument lets you replace the FPKM with some known constant. 100 | 101 | drop_nonchromosomal_loci : bool, optional 102 | Drop rows whose location isn't on a canonical chromosome 103 | i.e. 
doesn't start with "chr" (default=False) 104 | 105 | drop_novel : bool, optional 106 | Drop genes or isoforms that aren't found in Ensembl (default = False) 107 | 108 | sep : str, optional 109 | Separator between data fields in the FPKM tracking file 110 | (default is to infer whether the file uses comma or whitespace) 111 | 112 | Returns DataFrame with columns: 113 | id : str 114 | novel : bool 115 | fpkm : float 116 | chr : str 117 | start : int 118 | end : int 119 | gene_names : str list 120 | """ 121 | if sep is None: 122 | sep = infer_delimiter(filename) 123 | 124 | df = pd.read_csv(filename, sep=sep, engine="c") 125 | 126 | required_columns = { 127 | status_column, 128 | locus_column, 129 | id_column, 130 | gene_names_column, 131 | fpkm_column 132 | } 133 | check_required_columns(df, filename, required_columns) 134 | 135 | for flag, status_value in [ 136 | (drop_failed, "FAIL"), 137 | (drop_lowdata, "LOWDATA"), 138 | (drop_hidata, "HIDATA")]: 139 | mask = df[status_column] == status_value 140 | mask_count = mask.sum() 141 | total_count = len(df) 142 | if flag and mask_count > 0: 143 | verb_str = "Dropping" 144 | df = df[~mask] 145 | else: 146 | verb_str = "Keeping" 147 | logging.info( 148 | "%s %d/%d entries from %s with status=%s", 149 | verb_str, 150 | mask_count, 151 | total_count, 152 | filename, 153 | status_value) 154 | 155 | if drop_nonchromosomal_loci: 156 | loci = df[locus_column] 157 | chromosomal_loci = loci.str.startswith("chr") 158 | n_dropped = (~chromosomal_loci).sum() 159 | if n_dropped > 0: 160 | logging.info("Dropping %d/%d non-chromosomal loci from %s" % ( 161 | n_dropped, len(df), filename)) 162 | df = df[chromosomal_loci] 163 | 164 | if replace_hidata_fpkm_value: 165 | hidata_mask = df[status_column] == "HIDATA" 166 | n_hidata = hidata_mask.sum() 167 | logging.info( 168 | "Setting FPKM=%s for %d/%d entries with status=HIDATA", 169 | replace_hidata_fpkm_value, 170 | n_hidata, 171 | len(df)) 172 | df[fpkm_column][hidata_mask] = 
replace_hidata_fpkm_value 173 | 174 | if len(df) == 0: 175 | raise ValueError("Empty FPKM tracking file: %s" % filename) 176 | 177 | ids = df[id_column] 178 | known = ids.str.startswith("ENS") 179 | 180 | if known.sum() == 0: 181 | raise ValueError("No Ensembl IDs found in %s" % filename) 182 | 183 | if drop_novel: 184 | n_dropped = (~known).sum() 185 | if n_dropped > 0: 186 | logging.info( 187 | "Dropping %d/%d novel entries from %s", 188 | n_dropped, 189 | len(df), 190 | filename) 191 | df = df[known] 192 | known = np.ones(len(df), dtype='bool') 193 | 194 | loci = df[locus_column] 195 | chromosomes, starts, ends = parse_locus_column(df[locus_column]) 196 | 197 | # gene names are given either as "-" or a comma separated list 198 | # e.g. "BRAF1,PFAM2" 199 | gene_names_strings = df[gene_names_column].copy() 200 | gene_names_strings[gene_names_strings == "-"] = "" 201 | # split each entry into a list of zero or more strings 202 | gene_names_lists = gene_names_strings.str.split(",") 203 | 204 | return pd.DataFrame({ 205 | "id": df[id_column], 206 | "novel": ~known, 207 | "fpkm": df[fpkm_column], 208 | "chr": chromosomes, 209 | "start": starts, 210 | "end": ends, 211 | "gene_names": gene_names_lists 212 | }) 213 | 214 | 215 | def load_cufflinks_dict(*args, **kwargs): 216 | """ 217 | Returns dictionary mapping feature identifier (either transcript or gene ID) 218 | to a DataFrame row with fields: 219 | id : str 220 | novel : bool 221 | fpkm : float 222 | chr : str 223 | start : int 224 | end : int 225 | gene_names : str list 226 | """ 227 | return { 228 | row.id: row 229 | for (_, row) 230 | in load_cufflinks_dataframe(*args, **kwargs).iterrows() 231 | } 232 | 233 | 234 | def load_cufflinks_fpkm_dict(*args, **kwargs): 235 | """ 236 | Returns dictionary mapping feature identifier (either transcript or gene ID) 237 | to FPKM expression value. 
238 | """ 239 | return { 240 | row.id: row.fpkm 241 | for (_, row) 242 | in load_cufflinks_dataframe(*args, **kwargs).iterrows() 243 | } 244 | -------------------------------------------------------------------------------- /topiary/rna/gtf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2018. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function, division, absolute_import 16 | 17 | import logging 18 | 19 | import gtfparse 20 | 21 | 22 | def _get_gtf_column(column_name, gtf_path, df): 23 | """ 24 | Helper function which returns a dictionary column or raises an ValueError 25 | abou the absence of that column in a GTF file. 26 | """ 27 | if column_name in df.columns: 28 | return list(df[column_name]) 29 | 30 | else: 31 | raise ValueError( 32 | "Missing '%s' in columns of %s, available: %s" % ( 33 | column_name, 34 | gtf_path, 35 | list(df.columns))) 36 | 37 | def load_transcript_fpkm_dict_from_gtf( 38 | gtf_path, 39 | transcript_id_column_name="reference_id", 40 | fpkm_column_name="FPKM", 41 | feature_column_name="feature"): 42 | """ 43 | Load a GTF file generated by StringTie which contains transcript-level 44 | quantification of abundance. Returns a dictionary mapping Ensembl 45 | IDs of transcripts to FPKM values. 
46 | """ 47 | df = gtfparse.read_gtf( 48 | gtf_path, 49 | column_converters={fpkm_column_name: float}) 50 | transcript_ids = _get_gtf_column(transcript_id_column_name, gtf_path, df) 51 | fpkm_values = _get_gtf_column(fpkm_column_name, gtf_path, df) 52 | features = _get_gtf_column(feature_column_name, gtf_path, df) 53 | logging.info("Loaded %d rows from %s" % (len(transcript_ids), gtf_path)) 54 | logging.info("Found %s transcript entries" % sum( 55 | feature == "transcript" for feature in features)) 56 | result = { 57 | transcript_id: float(fpkm) 58 | for (transcript_id, fpkm, feature) 59 | in zip(transcript_ids, fpkm_values, features) 60 | if ( 61 | (transcript_id is not None) and 62 | (len(transcript_id) > 0) and 63 | (feature == "transcript") 64 | ) 65 | } 66 | logging.info("Keeping %d transcript rows with reference IDs" % ( 67 | len(result),)) 68 | return result 69 | -------------------------------------------------------------------------------- /topiary/sequence_helpers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
def protein_subsequences_around_mutations(effects, padding_around_mutation):
    """
    For each effect that has a mutant protein sequence, slice out the
    subsequence surrounding the mutation, extended on both sides by
    `padding_around_mutation` residues.

    Returns two dictionaries keyed by effect: one mapping each effect to
    its extracted subsequence, and one mapping each effect to that
    subsequence's start offset within the full mutant protein.
    """
    subsequences = {}
    start_offsets = {}
    for effect in effects:
        full_sequence = effect.mutant_protein_sequence
        # silent or unpredictable effects have no mutant protein sequence;
        # they are omitted from both result dictionaries
        if not full_sequence:
            continue
        mutation_start = effect.aa_mutation_start_offset
        mutation_end = effect.aa_mutation_end_offset
        subsequence_start = max(0, mutation_start - padding_around_mutation)
        # some pseudogenes have stop codons in the reference sequence; trim
        # the subsequence so it never includes the stop character '*'
        stop_index = full_sequence.find("*")
        if stop_index == -1:
            stop_index = len(full_sequence)
        subsequence_end = min(
            stop_index,
            mutation_end + padding_around_mutation)
        subsequences[effect] = full_sequence[subsequence_start:subsequence_end]
        start_offsets[effect] = subsequence_start
    return subsequences, start_offsets
57 | """ 58 | min_required_padding = max(epitope_lengths) - 1 59 | if not given_padding: 60 | return min_required_padding 61 | else: 62 | require_integer(given_padding, "Padding around mutation") 63 | if given_padding < min_required_padding: 64 | raise ValueError( 65 | "Padding around mutation %d cannot be less than %d " 66 | "for epitope lengths %s" % ( 67 | given_padding, 68 | min_required_padding, 69 | epitope_lengths)) 70 | return given_padding 71 | 72 | def contains_mutant_residues( 73 | peptide_start_in_protein, 74 | peptide_length, 75 | mutation_start_in_protein, 76 | mutation_end_in_protein): 77 | peptide_end_in_protein = peptide_start_in_protein + peptide_length - 1 78 | return ( 79 | peptide_start_in_protein < mutation_end_in_protein and 80 | peptide_end_in_protein >= mutation_start_in_protein 81 | ) 82 | 83 | def peptide_mutation_interval( 84 | peptide_start_in_protein, 85 | peptide_length, 86 | mutation_start_in_protein, 87 | mutation_end_in_protein): 88 | """ 89 | Half-open interval of mutated residues in the peptide, determined 90 | from the mutation interval in the original protein sequence. 91 | 92 | Parameters 93 | ---------- 94 | peptide_start_in_protein : int 95 | Position of the first peptide residue within the protein 96 | (starting from 0) 97 | 98 | peptide_length : int 99 | 100 | mutation_start_in_protein : int 101 | Position of the first mutated residue starting from 0. In the case of a 102 | deletion, the position where the first residue had been. 103 | 104 | mutation_end_in_protein : int 105 | Position of the last mutated residue in the mutant protein. In the case 106 | of a deletion, this is equal to the mutation_start_in_protein. 
def peptide_mutation_interval(
        peptide_start_in_protein,
        peptide_length,
        mutation_start_in_protein,
        mutation_end_in_protein):
    """
    Return the half-open interval [start, end) of mutated residue offsets
    within a peptide, given the mutation's interval within the original
    protein sequence.

    Parameters
    ----------
    peptide_start_in_protein : int
        Position of the first peptide residue within the protein
        (starting from 0)

    peptide_length : int

    mutation_start_in_protein : int
        Position of the first mutated residue starting from 0. In the case of a
        deletion, the position where the first residue had been.

    mutation_end_in_protein : int
        Position of the last mutated residue in the mutant protein. In the case
        of a deletion, this is equal to the mutation_start_in_protein.

    Raises ValueError if the peptide lies entirely before or entirely
    after the mutated interval.
    """
    if peptide_start_in_protein > mutation_end_in_protein:
        raise ValueError("Peptide starts after mutation")
    if peptide_start_in_protein + peptide_length < mutation_start_in_protein:
        raise ValueError("Peptide ends before mutation")

    def clamp_to_peptide(protein_position):
        # translate a protein coordinate into peptide coordinates,
        # clamped to the peptide's [0, peptide_length] range so the
        # result is a valid half-open boundary
        relative_offset = protein_position - peptide_start_in_protein
        return min(peptide_length, max(0, relative_offset))

    return (
        clamp_to_peptide(mutation_start_in_protein),
        clamp_to_peptide(mutation_end_in_protein))